Commit b6993ba3 authored by hjchen2

Merge conflicts caused by FPGA

@@ -4,7 +4,7 @@ option(USE_OPENMP "build with openmp support" ON)
 option(USE_EXCEPTION "build with exception" ON)
 option(WITH_LOGGING "print logging for debug" ON)
 option(WITH_SYMBOL "build with all symbols" ON)  # turn off if use jni or ios io
-option(WITH_PROFILE "print op profile for debug" ON)
+option(WITH_PROFILE "print op profile for debug" OFF)
 option(WITH_TEST "build with unit tests" ON)
 # select the platform to build
......
@@ -28,13 +28,22 @@ void format_image(framework::Tensor *image_tensor) {
   auto dims = image_tensor->dims();
   auto channel = dims[1], height = dims[2], width = dims[3];
   auto data_ptr = image_tensor->data<float>();
-  size_t memory_size = channel * height * width * sizeof(float);
-  auto new_data = (float *)fpga_malloc(memory_size);  // NOLINT
-  fpga_copy(new_data, data_ptr, memory_size);
-  image::format_image(&new_data, channel, height, width);
-  image_tensor->reset_data_ptr(new_data);
+  auto external_ptr = reinterpret_cast<float *>(image_tensor->external_data);
+  float *p_data = external_ptr == nullptr ? data_ptr : external_ptr;
+  float *old_p = p_data;
+  image::format_image(&p_data, channel, height, width);
+  if (old_p != p_data) {
+    image_tensor->reset_data_ptr(p_data);
+  }
 }
+
+void format_ofm(framework::Tensor *ofm_tensor) {
+  if (ofm_tensor->type() == typeid(float)) {
+    format_fp32_ofm(ofm_tensor);
+  } else {
+    format_fp16_ofm(ofm_tensor);
+  }
+}
+
 void format_fp16_ofm(framework::Tensor *ofm_tensor) {
   auto dims = ofm_tensor->dims();
   size_t memory_size = 0;
@@ -50,6 +59,7 @@ void format_fp16_ofm(framework::Tensor *ofm_tensor) {
   auto p = fpga_malloc(memory_size);
   memset(p, 0, memory_size);
   ofm_tensor->reset_data_ptr(p);
+  ofm_tensor->set_type(typeid(half));
 }
 
 void format_fp16_ofm(framework::Tensor *ofm_tensor, framework::DDim dims) {
@@ -67,6 +77,7 @@ void format_fp16_ofm(framework::Tensor *ofm_tensor, framework::DDim dims) {
   auto p = fpga_malloc(memory_size);
   memset(p, 0, memory_size);
   ofm_tensor->reset_data_ptr(p);
+  ofm_tensor->set_type(typeid(half));
 }
 
 void format_fp32_ofm(framework::Tensor *ofm_tensor) {
   auto dims = ofm_tensor->dims();
@@ -83,6 +94,7 @@ void format_fp32_ofm(framework::Tensor *ofm_tensor) {
   auto p = fpga_malloc(memory_size);
   memset(p, 0, memory_size);
   ofm_tensor->reset_data_ptr(p);
+  ofm_tensor->set_type(typeid(float));
 }
 
 float filter_find_max(framework::Tensor *filter_tensor) {
@@ -139,6 +151,7 @@ void format_filter(framework::Tensor *filter_tensor, float max_value,
   filter::format_filter(&new_data, num, channel, height, width, group_num,
                         max_value);
   filter_tensor->reset_data_ptr(new_data);
+  filter_tensor->set_type(typeid(int8_t));
 }
 
 void format_dwconv_filter(framework::Tensor *filter_tensor, float *scale_ptr) {
   auto dims = filter_tensor->dims();
@@ -149,6 +162,7 @@ void format_dwconv_filter(framework::Tensor *filter_tensor, float *scale_ptr) {
   fpga_copy(new_data, data_ptr, memory_size);
   filter::format_dwconv_filter(&new_data, num, height, width, scale_ptr);
   filter_tensor->reset_data_ptr(new_data);
+  filter_tensor->set_type(typeid(int8_t));
 }
 
 void format_DWDconv_filter(framework::Tensor *filter_tensor, float *scale_ptr,
@@ -173,6 +187,7 @@ void format_DWDconv_filter(framework::Tensor *filter_tensor, float *scale_ptr,
   //     framework::make_ddim({num, 1, height, width});
   // filter_tensor->Resize(dims_new);
   filter_tensor->reset_data_ptr(new_data);
+  filter_tensor->set_type(typeid(int8_t));
 }
 
 void format_fc_filter(framework::Tensor *filter_tensor, float max_value) {
@@ -187,6 +202,7 @@ void format_fc_filter(framework::Tensor *filter_tensor, float max_value) {
   filter::format_fc_filter(&new_data, num, channel, height, width, 1,
                            max_value);
   filter_tensor->reset_data_ptr(new_data);
+  filter_tensor->set_type(typeid(int8_t));
 }
 
 void format_deconv_filter(framework::Tensor *filter_tensor, float max_value,
                           int group_num, int stride) {
@@ -213,6 +229,7 @@ void format_deconv_filter(framework::Tensor *filter_tensor, float max_value,
       framework::make_ddim({num, channel, height, width});
   filter_tensor->Resize(dims_new);
   filter_tensor->reset_data_ptr(new_data);
+  filter_tensor->set_type(typeid(int8_t));
 }
 
 void format_bias_scale_array(float **bias_scale_array,
@@ -236,6 +253,7 @@ void format_concat_output(framework::Tensor *out, int height, int width,
   auto ddim = framework::make_ddim({1, sum_channel, height, width});
   out->Resize(ddim);
   out->reset_data_ptr(data_ptr);
+  out->set_type(typeid(half));
 }
 
 void format_conv_data(framework::Tensor *filter_tensor,
                       framework::Tensor *ofm_tensor, float **bs_ptr,
@@ -447,9 +465,9 @@ void fill_split_arg(struct SplitConvArgs *arg, framework::Tensor *input,
                     int16_t leaky_relu_negative_slope, int group_num,
                     int stride_h, int stride_w, int padding_h, int padding_w,
                     float *bs_ptr) {
-  auto input_ptr = input->data<float>();
-  auto filter_ptr = filter->data<float>();
-  auto out_ptr = out->data<float>();
+  auto input_ptr = input->data<half>();
+  auto filter_ptr = filter->data<int8_t>();
+  auto out_ptr = out->data<half>();
   auto deleter = [](void *p) { fpga_free(p); };
 
   arg->group_num = (uint32_t)group_num;
@@ -571,8 +589,8 @@ void fill_deconv_arg(struct DeconvArgs *arg, framework::Tensor *input,
                      int16_t leaky_relu_negative_slope, int group_num,
                      int stride_h, int stride_w, int padding_h, int padding_w,
                      float *bs_ptr) {
-  auto input_ptr = input->data<float>();
-  auto filter_ptr = filter->data<float>();
+  auto input_ptr = input->data<half>();
+  auto filter_ptr = filter->data<int8_t>();
   auto deleter = [](void *p) { fpga_free(p); };
 
   arg->group_num = (uint32_t)group_num;
@@ -603,9 +621,9 @@ void fill_deconv_arg(struct DeconvArgs *arg, framework::Tensor *input,
   framework::DDim dims_out_new = framework::make_ddim(
       {1, arg->filter_num, sub_output_height * sub_conv_num, real_out_width});
   fpga::format_fp16_ofm(out, dims_out_new);
-  auto out_ptr = out->data<float>();
+  auto out_ptr = out->data<half>();
   arg->output.address =
-      (half *)out_ptr +  // NOLINT
+      out_ptr +
       omit_size * sizeof(half) *
           (align_to_x(real_out_width * arg->filter_num, IMAGE_ALIGNMENT));
   arg->output.scale_address = out->scale;
@@ -695,7 +713,6 @@ void fill_deconv_arg(struct DeconvArgs *arg, framework::Tensor *input,
     }
 
     for (int j = 0; j < split_num; ++j) {
-      // arg->split_conv_args[i]->conv_arg[j].relu_enabled = relu_enabled;
       arg->split_conv_args[i]->conv_arg[j].output.activation.activation_type =
           activation_enable;
       arg->split_conv_args[i]
@@ -741,9 +758,9 @@ void fill_deconv_arg(struct DeconvArgs *arg, framework::Tensor *input,
           align_to_x(arg->split_conv_args[i]->conv_arg[j].filter_num,
                      FILTER_NUM_ALIGNMENT) *
           sizeof(int8_t);
-      auto filter_head = &((
-          int8_t *)filter_ptr)[j * element_num * filter_num_per_div +  // NOLINT
-                               i * filter_sub_conv_offset];
+      auto filter_head =
+          &filter_ptr[j * element_num * filter_num_per_div +  // NOLINT
+                      i * filter_sub_conv_offset];
       arg->split_conv_args[i]->conv_arg[j].filter_address =
           fpga_malloc(filter_size);
       arg->split_conv_args[i]->vector_conv_space.push_back(
@@ -793,7 +810,7 @@ void fill_deconv_arg(struct DeconvArgs *arg, framework::Tensor *input,
               arg->split_conv_args[i]->conv_arg[j].output.scale_address),
           deleter));
     }
-    arg->split_conv_args[i]->concat_arg.images_in[j] = static_cast<int16_t *>(
+    arg->split_conv_args[i]->concat_arg.images_in[j] = static_cast<half *>(
         arg->split_conv_args[i]->conv_arg[j].output.address);
     arg->split_conv_args[i]->concat_arg.scales_in[j] =
         arg->split_conv_args[i]->conv_arg[j].output.scale_address;
@@ -818,9 +835,13 @@ void fill_dwconv_arg(struct DWconvArgs *arg, framework::Tensor *input,
                      int16_t leaky_relu_negative_slope, int stride_h,
                      int stride_w, int padding_h, int padding_w,
                      float *bias_ptr) {
-  auto filter_ptr = filter->data<float>();
-  auto input_ptr = input->data<float>();
-  auto output_ptr = out->mutable_data<float>();
+  auto deleter = [](void *p) { fpga_free(p); };
+  arg->vector_dwconv_space.push_back(
+      std::shared_ptr<char>(reinterpret_cast<char *>(bias_ptr), deleter));
+
+  auto filter_ptr = filter->data<uint8_t>();
+  auto input_ptr = input->data<half>();
+  auto output_ptr = out->mutable_data<half>();
   arg->sub_conv_num = 1;
   // arg->relu_enabled = relu_enabled;
   arg->output.activation.activation_type = activation_enable;
@@ -848,9 +869,8 @@ void fill_DWDeconv_arg(struct DWDeconvArgs *arg, framework::Tensor *input,
                        int16_t leaky_relu_negative_slope, int stride_h,
                        int stride_w, int padding_h, int padding_w,
                        float *bias_ptr) {
-  auto filter_ptr = filter->data<float>();
-  auto input_ptr = input->data<float>();
-  auto output_ptr = out->mutable_data<float>();
+  auto filter_ptr = filter->data<int8_t>();
+  auto input_ptr = input->data<half>();
   auto deleter = [](void *p) { fpga_free(p); };
@@ -885,7 +905,7 @@ void fill_DWDeconv_arg(struct DWDeconvArgs *arg, framework::Tensor *input,
   framework::DDim dims_out_new = framework::make_ddim(
       {1, arg->filter_num, real_out_height, real_out_width});
   fpga::format_fp16_ofm(out, dims_out_new);
-  auto out_ptr = out->data<float>();
+  auto out_ptr = out->data<half>();
 
   /*====For Addition
   arg->output.address =
......
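The api.cpp changes above move the FPGA layout helpers from hard-coded float to tagged element types: every format_* routine now finishes with set_type(), so the type tag travels with the reinterpreted buffer, and the new format_ofm dispatches on that tag. A hedged consumer-side sketch (header paths and the helper name ofm_base are assumptions, not part of the commit):

  #include <typeinfo>
  #include "fpga/V1/api.h"      // assumed header location
  #include "framework/tensor.h"

  // After format_ofm, read the buffer with the element type matching the
  // tag: fp32 tensors stay float, everything else was tagged half.
  void *ofm_base(paddle_mobile::framework::Tensor *ofm) {
    paddle_mobile::fpga::format_ofm(ofm);
    return ofm->type() == typeid(float)
               ? static_cast<void *>(ofm->data<float>())
               : static_cast<void *>(ofm->data<half>());
  }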
@@ -23,6 +23,7 @@ namespace paddle_mobile {
 namespace fpga {
 
 void format_image(framework::Tensor* image_tensor);
+void format_ofm(framework::Tensor* ofm_tensor);
 void format_fp16_ofm(framework::Tensor* ofm_tensor);  // only allocate memory
 void format_fp16_ofm(framework::Tensor* ofm_tensor, framework::DDim dims);
 void format_fp32_ofm(framework::Tensor* ofm_tensor);
......
@@ -247,6 +247,7 @@ void deconv_format_filter(float** data_in, int num, int channel, int height,
     fpga_copy(ptr_space + i * align_offset, ptr_tmp, align_offset);
     fpga_free(ptr_tmp);
   }
+  fpga_free(ptr_ptr_data);
   *data_in = reinterpret_cast<float*>(ptr_space);
 
   /* {
......
@@ -22,7 +22,6 @@ namespace fpga {
 namespace image {
 
 void convert_to_hwc(float **data_in, int channel, int height, int width) {
-  float *tmp = *data_in;
   float *data_tmp =
       (float *)fpga_malloc(channel * height * width * sizeof(float));  // NOLINT
   int64_t amount_per_row = width * channel;
@@ -35,33 +34,35 @@ void convert_to_hwc(float **data_in, int channel, int height, int width) {
     }
   }
   *data_in = data_tmp;
-  fpga_free(tmp);
 }
 
 void align_element_conv(float **data_in, int height, int cw) {
   int h = 0;
   int align_cw = align_to_x(cw, IMAGE_ALIGNMENT);
-  if (align_cw != cw) {
-    float *tmp = *data_in;
-    float *data_tmp =
-        (float *)fpga_malloc(height * align_cw * sizeof(float));  // NOLINT
-    memset(data_tmp, 0, height * align_cw * sizeof(float));
-    for (h = 0; h < height; h++) {
-      memcpy((void *)(data_tmp + h * align_cw),  // NOLINT
-             (void *)(*data_in + h * cw),        // NOLINT
-             cw * sizeof(float));
-    }
-    *data_in = data_tmp;
-    fpga_free(tmp);
-  }
+
+  float *data_tmp =
+      (float *)fpga_malloc(height * align_cw * sizeof(float));  // NOLINT
+  memset(data_tmp, 0, height * align_cw * sizeof(float));
+
+  for (h = 0; h < height; h++) {
+    memcpy((void *)(data_tmp + h * align_cw),  // NOLINT
+           (void *)(*data_in + h * cw),        // NOLINT
+           cw * sizeof(float));
+  }
+  *data_in = data_tmp;
 }
 
 void format_image(float **data_in, int channel, int height, int width) {
   convert_to_hwc(data_in, channel, height, width);
-  align_element_conv(data_in, height, channel * width);
+  int cw = channel * width;
+  int align_cw = align_to_x(cw, IMAGE_ALIGNMENT);
+  if (align_cw != cw) {
+    float *hwc_temp = *data_in;
+    align_element_conv(data_in, height, channel * width);
+    fpga_free(hwc_temp);
+  }
   fpga_flush(*data_in, align_to_x(channel * width, IMAGE_ALIGNMENT) * height *
                            sizeof(float));
 }
......
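The image.cpp hunks change the ownership contract: convert_to_hwc and align_element_conv no longer free the incoming buffer, and format_image frees the HWC temporary only when re-alignment actually produced a new allocation, so the already-aligned path never frees the tensor's live buffer. A standalone sketch of that caller-frees pattern, with plain calloc/free standing in for fpga_malloc/fpga_free:

  #include <cstdlib>
  #include <cstring>

  static int align_to_x(int n, int x) { return (n + x - 1) / x * x; }

  // Callee: allocates the aligned copy and hands it back via *data_in.
  void align_rows(float **data_in, int height, int cw, int alignment) {
    int align_cw = align_to_x(cw, alignment);
    float *dst = static_cast<float *>(calloc(height * align_cw, sizeof(float)));
    for (int h = 0; h < height; ++h) {
      memcpy(dst + h * align_cw, *data_in + h * cw, cw * sizeof(float));
    }
    *data_in = dst;
  }

  // Caller: frees the old buffer exactly once, and only if it was replaced.
  void format_rows(float **data_in, int height, int cw, int alignment) {
    if (align_to_x(cw, alignment) != cw) {
      float *old = *data_in;
      align_rows(data_in, height, cw, alignment);
      free(old);
    }
  }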
@@ -290,14 +290,11 @@ int ComputeBasicConv(const struct ConvArgs &args) {
   reg_writeq(args.driver.deconv_param, 0xd18);
   reg_writeq(args.driver.fpga_bias_scale_len / 4, 0xd20);
   reg_writeq(args.driver.cmd, REG_CONV_CMD);
-  DLOG << "before reg poll";
   if (0 != fpga_regpoll(REG_INTERRUPT, INTERRUPT_CONV, PE_IRQ_TIMEOUT)) {
     g_fpgainfo.pe_data->pes[PE_IDX_CONV]->status = ERROR;
     ret = -EIO;
     DLOG << "Conv Wait Irq Timeout!";
   }
-  DLOG << "after reg poll";
 
   output_scale = reg_readq(REG_SCALE_PARAMETER);
   output_scale = (output_scale << 32) | (output_scale >> 32);
   fpga_copy(args.output.scale_address, &output_scale, sizeof(float) * 2);
......
@@ -164,7 +164,7 @@ void fpga_free(void *ptr) {
     // DLOG << "Address: " << ptr << ", " << size << " bytes freed. Total "
     //      << counter << " bytes";
   } else {
-    DLOG << "Invalid pointer";
+    DLOG << "Address: " << ptr << " Invalid pointer";
   }
 }
 
 void fpga_copy(void *dest, const void *src, size_t num) {
......
@@ -19,17 +19,16 @@ limitations under the License. */
 #include <memory>
 #include <vector>
 
-namespace paddle_mobile {
-namespace fpga {
-
 #ifdef PADDLE_MOBILE_FPGA_V1
-#define IMAGE_ALIGNMENT 16           // Aligned to 16
-#define FILTER_NUM_ALIGNMENT 32      // Filter number aligned to 32
-#define FILTER_ELEMENT_ALIGNMENT 16  // Filter element number aligned to 16
-#define BS_NUM_ALIGNMENT 8
-#define BIAS_NUM_ALIGNMENT 16
+#define IMAGE_ALIGNMENT (16)           // Aligned to 16
+#define FILTER_NUM_ALIGNMENT (32)      // Filter number aligned to 32
+#define FILTER_ELEMENT_ALIGNMENT (16)  // Filter element number aligned to 16
+#define BS_NUM_ALIGNMENT (8)
+#define BIAS_NUM_ALIGNMENT (16)
 #endif
 
+namespace paddle_mobile {
+namespace fpga {
+
 enum DataType {
   DATA_TYPE_FP32 = 1,
   DATA_TYPE_FP16 = 0,
@@ -49,7 +48,7 @@ enum ActivationType {
 };
 
 struct ActivationArgs {
-  enum ActivationType activation_type;
+  enum ActivationType activation_type = NONE;
   int16_t leaky_relu_negative_slope;
 };
 
@@ -188,6 +187,7 @@ struct SplitArgs {
   uint32_t* out_channel_nums;
   uint32_t height;
   uint32_t width;
+  std::vector<std::shared_ptr<char>> vector_split_space;
 };
 
 struct PoolingArgs {
@@ -237,6 +237,7 @@ struct DWconvArgs {
   struct KernelArgs kernel;
   struct ImageInputArgs image;
   struct ImageOutputArgs output;
+  std::vector<std::shared_ptr<char>> vector_dwconv_space;
 };
 
 struct DWDeconvArgs {
......
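The new vector_split_space and vector_dwconv_space members follow the ownership idiom the arg-filling code already uses: raw FPGA allocations are parked in shared_ptr<char> objects with an fpga_free deleter, so each buffer is released exactly once when the arg struct is destroyed. A minimal sketch of the idiom (the Args struct and park helper are illustrative; fpga_malloc/fpga_free are the runtime calls from this diff):

  #include <cstddef>
  #include <memory>
  #include <vector>

  extern void *fpga_malloc(size_t size);
  extern void fpga_free(void *ptr);

  struct Args {
    std::vector<std::shared_ptr<char>> space;  // keeps buffers alive
  };

  void park(Args *arg, size_t bytes) {
    auto deleter = [](void *p) { fpga_free(p); };
    arg->space.push_back(std::shared_ptr<char>(
        reinterpret_cast<char *>(fpga_malloc(bytes)), deleter));
  }  // every parked buffer is fpga_free'd when the Args instance dies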
@@ -83,6 +83,11 @@ Executor<Device, T>::Executor(const Program<Device> &program,
   // resize feed and fetch list
   InitFeedFetchList();
 
+#ifdef PADDLE_MOBILE_FPGA
+  program_.scope->EraseVars({"feed", "fetch"});
+  program_.scope->print_vars();
+#endif
+
   int count = 0;
   for (auto &op_handler : ops_of_block0_) {
     DLOG << "Initialize op[" << count++ << "]: " << op_handler->Type();
@@ -291,6 +296,7 @@ template <typename Device, typename T>
 bool Executor<Device, T>::varInputMemory(
     const std::shared_ptr<VarDesc> &var_desc, Variable *var) const {
 #ifdef PADDLE_MOBILE_FPGA
+  framework::LoDTensor *tensor = var->template GetMutable<LoDTensor>();
   tensor->init(typeid(float));
   return true;
 #endif
@@ -506,14 +512,41 @@ template <typename Device, typename T>
 void Executor<Device, T>::InjectVariable(const Tensor &t,
                                          std::string var_name) {
   Variable *g_feed_value = program_.scope->Var(var_name);
-  Tensor *feed_tensor = g_feed_value->GetMutable<LoDTensor>();
+  Tensor *feed_tensor = g_feed_value->template GetMutable<LoDTensor>();
   feed_tensor->Resize(t.dims());
   feed_tensor->ShareDataWith(t);
 }
 
 template <typename Device, typename T>
 void Executor<Device, T>::FeedData(const Tensor &t) {
-  InjectVariable(t, "feed");
+  InjectVariable(t, "feed0");
+}
+
+template <typename Device, typename T>
+void Executor<Device, T>::FeedData(const std::vector<void *> &v) {
+  auto input_size = v.size();
+  auto vars = program_.scope->VarContain("feed");
+  PADDLE_MOBILE_ENFORCE(input_size == vars.size(),
+                        "input data number not correct");
+  for (int i = 0; i < input_size; i++) {
+    auto var = program_.scope->Var("feed", i);
+    auto feed_tensor = var->template GetMutable<LoDTensor>();
+    feed_tensor->external_data = v[i];
+  }
+}
+
+template <typename Device, typename T>
+void Executor<Device, T>::GetResults(std::vector<void *> *v) {
+  auto output_size = v->size();
+  PADDLE_MOBILE_ENFORCE(output_size > 0, "Empty output");
+  auto vars = program_.scope->VarContain("fetch");
+  PADDLE_MOBILE_ENFORCE(output_size == vars.size(),
+                        "output data number not correct");
+  for (int i = 0; i < output_size; i++) {
+    auto var = program_.scope->Var("fetch", i);
+    auto fetch_tensor = var->template GetMutable<LoDTensor>();
+    (*v)[i] = fetch_tensor->template data<float>();
+  }
 }
 
 template <typename Device, typename T>
......
@@ -52,6 +52,8 @@ class Executor {
 #ifdef PADDLE_MOBILE_FPGA
   void InjectVariable(const Tensor &t, std::string var_name);
   void FeedData(const Tensor &t);
+  void FeedData(const std::vector<void *> &v);
+  void GetResults(std::vector<void *> *v);
   std::shared_ptr<Tensor> FetchResult(int id = -1);
   void Predict_From_To(int start = 0, int end = -1);
   void Predict_From(int start);
......
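Calling convention implied by the two new executor entry points: FeedData(vector<void*>) binds the i-th raw pointer to the tensor behind "feed<i>" via external_data, while GetResults expects a pre-sized vector that it fills with the raw fp32 fetch buffers. A hedged sketch (the executor handle and image buffers are illustrative names, not from the commit):

  #include <vector>

  using paddle_mobile::framework::Executor;

  void run_once(Executor<paddle_mobile::FPGA, float> *exec,
                void *image0, void *image1) {
    std::vector<void *> ins = {image0, image1};  // one per feed0, feed1, ...
    exec->FeedData(ins);           // sets external_data on each feed tensor
    exec->Predict_From_To(0, -1);  // run every op of block 0
    std::vector<void *> outs(1);   // must be pre-sized: one slot per fetch<i>
    exec->GetResults(&outs);       // filled with raw output pointers
  }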
@@ -50,6 +50,9 @@ OperatorBase<Dtype>::OperatorBase(const std::string &type,
       attrs_(attrs),
       scope_(scope) {
   CheckAllInputOutputSet();
+#ifdef PADDLE_MOBILE_FPGA
+  InsertTensors();
+#endif
 }
 
 template <typename Dtype>
@@ -133,6 +136,25 @@ void OperatorBase<GPU_CL>::Run() {
 }
 #endif
 
+#ifdef PADDLE_MOBILE_FPGA
+template <typename Dtype>
+void OperatorBase<Dtype>::InsertTensors() {
+  static int feed_num = 0;
+  static int fetch_num = 0;
+  if (type_ == "feed") {
+    auto new_name = string("feed") + std::to_string(feed_num++);
+    auto var = scope_->Var(new_name);
+    var->template GetMutable<framework::LoDTensor>();
+    inputs_.at("X") = {string(new_name)};
+  } else if (type_ == "fetch") {
+    auto new_name = string("fetch") + std::to_string(fetch_num++);
+    auto var = scope_->Var(new_name);
+    var->template GetMutable<framework::LoDTensor>();
+    outputs_.at("Out") = {string(new_name)};
+  }
+}
+#endif
+
 template class OperatorBase<CPU>;
 template class OperatorBase<FPGA>;
 template class OperatorBase<GPU_MALI>;
......
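InsertTensors is what makes the index-based lookups elsewhere in this commit line up: each feed op is rewired from the shared "feed" variable to its own "feed<i>" (and each fetch op to "fetch<j>"), numbered in op-construction order. A hypothetical helper spelling out the naming scheme (not part of the commit):

  #include <string>

  std::string feed_var_name(int i) { return "feed" + std::to_string(i); }
  std::string fetch_var_name(int j) { return "fetch" + std::to_string(j); }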
@@ -78,6 +78,9 @@ class OperatorBase {
       this->scope_->EraseVars(var_names);
     }
   }
+#ifdef PADDLE_MOBILE_FPGA
+  void InsertTensors();
+#endif
 
  protected:
   framework::Scope *scope_;
@@ -102,7 +105,6 @@ class OperatorWithKernel : public OperatorBase<Dtype> {
     kernel_.InitCLHelper(scope->GetCLScpoe());
 #endif
   }
-
   virtual void RunImpl() { this->kernel_.Compute(this->param_); }
 
   virtual void InferShape() const = 0;
......
@@ -72,7 +72,8 @@ void ProgramDesc::Description(std::string header) {
         }
       }
       for (auto &attr : op->GetAttrMap()) {
-        LOG(kLOG_DEBUG2) << "attr name:: " << attr.first;
+        if (attr.first == "op_callstack") continue;
+        LOG(kLOG_DEBUG2) << "attr name: " << attr.first;
         LOG(kLOG_DEBUG3) << "argument - " << attr.second;
       }
     }
......
@@ -111,5 +111,29 @@ Variable *Scope::FindVarLocally(const std::string &name) const {
   return nullptr;
 }
 
+#ifdef PADDLE_MOBILE_FPGA
+Variable *Scope::Var(const std::string &name, const int id) {
+  return Var(name + std::to_string(id));
+}
+
+std::vector<Variable *> Scope::VarContain(const std::string substring) {
+  std::vector<Variable *> v;
+  for (auto pair : vars_) {
+    if (pair.first.find(substring) == 0) {
+      v.push_back(pair.second);
+    }
+  }
+  return v;
+}
+
+void Scope::print_vars() {
+  DLOG << "====================start to print variables=================";
+  for (auto pair : vars_) {
+    DLOG << pair.first;
+  }
+  DLOG << "==================complete printing variables================";
+}
+#endif
+
 }  // namespace framework
 }  // namespace paddle_mobile
@@ -75,6 +75,12 @@ class Scope {
   Variable *FindVarLocally(const std::string &name) const;
 
+#ifdef PADDLE_MOBILE_FPGA
+  Variable *Var(const std::string &name, const int id);
+  std::vector<Variable *> VarContain(const std::string substring);
+  void print_vars();
+#endif
+
 #ifdef PADDLE_MOBILE_CL
   CLScope *GetCLScpoe() { return cl_scope_; }
 #endif
......
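Usage sketch for the new Scope helpers: Var(name, id) just appends the index to the name, and VarContain is a starts-with filter (find() == 0), so the prefix "feed" matches feed0, feed1, and so on. Names here are illustrative:

  #include "framework/scope.h"  // assumed header location

  void inspect(paddle_mobile::framework::Scope *scope) {
    scope->Var("feed", 0);                   // creates or finds "feed0"
    scope->Var("feed", 1);                   // creates or finds "feed1"
    auto feeds = scope->VarContain("feed");  // every var named feed*
    (void)feeds;
    scope->print_vars();                     // DLOGs every variable name
  }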
@@ -202,6 +202,11 @@ class Tensor : public TensorBase {
   inline void reset_data_ptr(void *p) {
     ((PlaceholderImpl *)(holder_.get()))->ptr_.reset((uint8_t *)p);  // NOLINT
   }
+  inline void set_type(std::type_index type) { holder_->set_type(type); }
+  inline void *get_data() {
+    return (
+        void *)(((PlaceholderImpl *)(holder_.get()))->ptr_.get());  // NOLINT
+  }
 
   inline void *init(std::type_index type) {
     if (holder_ != nullptr) {
@@ -217,7 +222,8 @@ class Tensor : public TensorBase {
         reinterpret_cast<uintptr_t>(holder_->ptr()) + offset_);
   }
 
   float scale[2];  // scale[0]= MAX/127.0, scale[1]= 127.0/MAX
+  void *external_data = nullptr;  // only used for Feed
 #endif
 };
......
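The three Tensor additions cover the FPGA data path end to end: set_type retags the holder after a buffer has been reinterpreted in place (float to half after format_fp16_ofm, float to int8_t after filter quantization), get_data exposes the raw pointer without a type check, and external_data lets FeedData attach a caller-owned input buffer that the feed kernel consumes and then resets. A hedged sketch (user_buffer is an assumed caller-owned fp32 image):

  #include <typeinfo>
  #include "framework/tensor.h"

  void bind_input(paddle_mobile::framework::Tensor *feed, void *user_buffer) {
    feed->external_data = user_buffer;  // read by format_image; the feed
                                        // kernel resets it to nullptr after
                                        // the bypass copy
  }

  void retag_as_half(paddle_mobile::framework::Tensor *t) {
    t->set_type(typeid(half));  // buffer is now read via t->data<half>()
    void *raw = t->get_data();  // raw pointer, no type check performed
    (void)raw;
  }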
@@ -110,6 +110,91 @@ bool PaddleMobilePredictor<Device, T>::Run(
   return true;
 }
 
+#ifdef PADDLE_MOBILE_FPGA
+template <typename Device, typename T>
+bool PaddleMobilePredictor<Device, T>::Run(
+    const std::vector<PaddleTensor> &inputs,
+    std::vector<PaddleTensor> *output_data, std::vector<int> *index_data,
+    int batch_size) {
+  if (inputs.empty()) {
+    LOG(kLOG_ERROR) << "At least one output should be set with tensors' names.";
+    return false;
+  }
+  auto input = inputs[0];
+
+  if (input.shape.size() != 4) {
+    LOG(kLOG_ERROR) << "input shape not equal to 4!";
+    return false;
+  }
+  std::vector<int64_t> dims;
+  for (auto d : input.shape) {
+    dims.push_back(static_cast<int64_t>(d));
+  }
+
+  // use tensor
+  framework::DDim ddim =
+      framework::make_ddim({dims[0], dims[1], dims[2], dims[3]});
+
+  framework::Tensor input_tensor;
+  input_tensor.Resize(ddim);
+  int input_length = framework::product(ddim);
+  auto input_ptr = input_tensor.mutable_data<T>();
+
+  memcpy(input_ptr, static_cast<T *>(input.data.data()),
+         input_length * sizeof(T));
+  paddle_mobile_->Predict(input_tensor);
+
+  auto num_result = index_data->size();
+  if (output_data->size() != num_result) {
+    LOG(kLOG_ERROR) << "index and output number don't match";
+    return false;
+  }
+
+  for (int i = 0; i < num_result; i++) {
+    auto output_tensor = paddle_mobile_->FetchResult((*index_data)[i]);
+
+    if (output_data->empty()) {
+      LOG(kLOG_ERROR)
+          << "At least one output should be set with tensors' names.";
+      return false;
+    }
+
+    auto &output = (*output_data)[i];
+    int output_length = output_tensor->numel();
+    std::vector<int64_t> tensor_shape =
+        framework::vectorize(output_tensor->dims());
+
+    for (auto d : tensor_shape) {
+      output.shape.push_back(static_cast<int>(d));
+    }
+
+    if (output.data.length() < output_length * sizeof(T)) {
+      output.data.Resize(output_length * sizeof(T));
+    }
+
+    memcpy(output.data.data(), output_tensor->template data<T>(),
+           output_length * sizeof(T));
+  }
+
+  return true;
+}
+
+template <typename Device, typename T>
+void PaddleMobilePredictor<Device, T>::FeedData(
+    const std::vector<void *> &inputs) {
+  paddle_mobile_->FeedData(inputs);
+}
+
+template <typename Device, typename T>
+void PaddleMobilePredictor<Device, T>::GetResults(
+    std::vector<void *> *outputs) {
+  paddle_mobile_->GetResults(outputs);
+}
+
+template <typename Device, typename T>
+void PaddleMobilePredictor<Device, T>::Predict_From_To(int start, int end) {
+  paddle_mobile_->Predict_From_To(start, end);
+}
+#endif
+
 template <typename Device, typename T>
 PaddleMobilePredictor<Device, T>::~PaddleMobilePredictor() {
   paddle_mobile_->Clear();
......
@@ -31,7 +31,14 @@ class PaddleMobilePredictor : public PaddlePredictor {
   bool Run(const std::vector<PaddleTensor>& inputs,
            std::vector<PaddleTensor>* output_data,
            int batch_size = -1) override;
-
+#ifdef PADDLE_MOBILE_FPGA
+  bool Run(const std::vector<PaddleTensor>& inputs,
+           std::vector<PaddleTensor>* output_data, std::vector<int>* index_data,
+           int batch_size = -1) override;
+  void FeedData(const std::vector<void*>& inputs) override;
+  void GetResults(std::vector<void*>* outputs) override;
+  void Predict_From_To(int start = 0, int end = -1) override;
+#endif
   ~PaddleMobilePredictor() override;
 
  private:
......
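A hedged usage sketch of the FPGA-only Run overload declared above: index_data picks which op outputs to pull back through FetchResult, and output_data must already hold one slot per requested index. The op indices below are model-specific assumptions, not values from the commit:

  #include <vector>
  #include "io/paddle_inference_api.h"  // assumed header location

  bool fetch_two(paddle_mobile::PaddlePredictor *pred,
                 const std::vector<paddle_mobile::PaddleTensor> &inputs) {
    std::vector<paddle_mobile::PaddleTensor> outputs(2);  // one per index
    std::vector<int> indices = {27, 43};  // ops to read via FetchResult()
    return pred->Run(inputs, &outputs, &indices);
  }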
@@ -26,8 +26,16 @@ limitations under the License. */
 #include <string>
 #include <vector>
 
+// #define PADDLE_MOBILE_FPGA
+
 namespace paddle_mobile {
 
+#ifdef PADDLE_MOBILE_FPGA
+namespace fpga {
+int open_device();
+}
+#endif
+
 enum PaddleDType {
   FLOAT32,
   INT64,
@@ -107,6 +115,14 @@ class PaddlePredictor {
     std::string prog_file;
     std::string param_file;
   };
+#ifdef PADDLE_MOBILE_FPGA
+  virtual bool Run(const std::vector<PaddleTensor>& inputs,
+                   std::vector<PaddleTensor>* output_data,
+                   std::vector<int>* index_data, int batch_size = -1) = 0;
+  virtual void FeedData(const std::vector<void*>& inputs) = 0;
+  virtual void GetResults(std::vector<void*>* outputs) = 0;
+  virtual void Predict_From_To(int start = 0, int end = -1) = 0;
+#endif
 
  protected:
   PaddlePredictor() = default;
......
@@ -228,6 +228,16 @@ void PaddleMobile<Device, T>::FeedData(const framework::Tensor &t) {
   executor_->FeedData(t);
 }
 
+template <typename Device, typename T>
+void PaddleMobile<Device, T>::FeedData(const std::vector<void *> &v) {
+  executor_->FeedData(v);
+}
+
+template <typename Device, typename T>
+void PaddleMobile<Device, T>::GetResults(std::vector<void *> *v) {
+  executor_->GetResults(v);
+}
+
 template <typename Device, typename T>
 std::shared_ptr<framework::Tensor> PaddleMobile<Device, T>::FetchResult(
     int id) {
......
@@ -90,6 +90,8 @@ class PaddleMobile {
 #ifdef PADDLE_MOBILE_FPGA
   void InjectVariable(const framework::Tensor &t, std::string var_name);
   void FeedData(const framework::Tensor &t);
+  void FeedData(const std::vector<void *> &v);
+  void GetResults(std::vector<void *> *v);
   std::shared_ptr<framework::Tensor> FetchResult(int id = -1);
   void Predict_From_To(int start = 0, int end = -1);
   void Predict_From(int start);
......
@@ -22,6 +22,7 @@ namespace operators {
 template <typename DeviceType, typename T>
 void AnchorGeneratorOp<DeviceType, T>::InferShape() const {
   const auto &input_dims = this->param_.input_->dims();
+  // DLOG << "AnchorGenerator input dim =" << input_dims.size();
   PADDLE_MOBILE_ENFORCE(input_dims.size() == 4, "The layout of input is NCHW.");
   const auto &anchor_sizes = this->param_.anchor_sizes_;
   const auto &aspect_ratios = this->param_.aspect_ratios_;
@@ -98,3 +99,15 @@ REGISTER_OPERATOR_CPU(psroi_pool, ops::PSRoiPoolOp);
 REGISTER_OPERATOR_CPU(roi_perspective_transform, ops::RoiPerspectiveOp);
 #endif
 #endif
+
+#ifdef PADDLE_MOBILE_FPGA
+#ifdef ANCHOR_GENERATOR_OP
+REGISTER_OPERATOR_FPGA(anchor_generator, ops::AnchorGeneratorOp);
+#endif
+#ifdef PROPOSAL_OP
+REGISTER_OPERATOR_FPGA(generate_proposals, ops::ProposalOp);
+#endif
+#ifdef PSROI_POOL_OP
+REGISTER_OPERATOR_FPGA(psroi_pool, ops::PSRoiPoolOp);
+#endif
+#endif
@@ -103,6 +103,10 @@ class ProposalParam : public OpParam {
   float nms_thresh_;
   float min_size_;
   float eta_;
+#ifdef PADDLE_MOBILE_FPGA
+  std::shared_ptr<Tensor> float_score, float_bbox;
+  fpga::BypassArgs score_arg, bbox_arg;
+#endif
 };
 
 DECLARE_KERNEL(Proposal, ProposalParam);
@@ -133,6 +137,10 @@ class PSRoiPoolParam : public OpParam {
   int pooled_height_;
   int pooled_width_;
   float spatial_scale_;
+#ifdef PADDLE_MOBILE_FPGA
+  std::shared_ptr<Tensor> float_input, float_output;
+  fpga::BypassArgs input_arg, output_arg;
+#endif
 };
 
 DECLARE_KERNEL(PSRoiPool, PSRoiPoolParam);
......
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */

#ifdef ANCHOR_GENERATOR_OP

#include <vector>
#include "operators/kernel/detection_kernel.h"

namespace paddle_mobile {
namespace operators {

template <>
bool AnchorGeneratorKernel<FPGA, float>::Init(
    AnchorGeneratorParam<FPGA> *param) {
  auto input = param->input_;
  auto anchors = param->output_anchors_;
  auto anchor_ptr = anchors->mutable_data<float>();
  auto stride = param->stride_;
  auto feature_width = input->dims()[3], feature_height = input->dims()[2];
  auto stride_width = stride[0], stride_height = stride[1];

  int anchors_offset[] = {-2,   -2,   18,   18,   -10,  -9,   26,   25,
                          -23,  -20,  39,   36,   -43,  -34,  59,   49,
                          -63,  -54,  79,   69,   -96,  -77,  112,  93,
                          -137, -118, 153,  134,  -204, -188, 220,  204,
                          -281, -395, 296,  441};
  int num_anchors = sizeof(anchors_offset) / (sizeof(int) * 4);

  // DLOG << "feature_height: " << feature_height;
  // DLOG << "feature_width: " << feature_width;
  // DLOG << "num_anchors: " << num_anchors;
  // DLOG << "stride_width: " << stride_width;
  // DLOG << "stride_height: " << stride_height;

  for (int h_idx = 0; h_idx < feature_height; ++h_idx) {
    for (int w_idx = 0; w_idx < feature_width; ++w_idx) {
      // each (h, w) cell owns num_anchors boxes of 4 coordinates laid out
      // contiguously in row-major (h, w, anchor, coord) order
      int offset = (h_idx * feature_width + w_idx) * num_anchors * 4;
      for (int idx = 0; idx < num_anchors; idx++) {
        anchor_ptr[offset + idx * 4 + 0] =
            anchors_offset[idx * 4 + 0] + w_idx * stride_width;
        anchor_ptr[offset + idx * 4 + 1] =
            anchors_offset[idx * 4 + 1] + h_idx * stride_height;
        anchor_ptr[offset + idx * 4 + 2] =
            anchors_offset[idx * 4 + 2] + w_idx * stride_width;
        anchor_ptr[offset + idx * 4 + 3] =
            anchors_offset[idx * 4 + 3] + h_idx * stride_height;
      }
    }
  }
  return true;
}

template <>
void AnchorGeneratorKernel<FPGA, float>::Compute(
    const AnchorGeneratorParam<FPGA> &param) {}

}  // namespace operators
}  // namespace paddle_mobile

#endif  // ANCHOR_GENERATOR_OP
@@ -38,7 +38,7 @@ bool ConcatKernel<FPGA, float>::Init(ConcatParam<FPGA> *param) {
     PADDLE_MOBILE_ENFORCE(
         input->dims()[2] == height && input->dims()[3] == width,
         "Image height & width should be unified");
-    images_in[i] = (half *)input->data<float>();  // NOLINT
+    images_in[i] = input->data<half>();
     channel_num[i] = (uint32_t)inputs[i]->dims()[1];  // NOLINT
     scales_in[i] = input->scale;
   }
@@ -48,7 +48,7 @@ bool ConcatKernel<FPGA, float>::Init(ConcatParam<FPGA> *param) {
   concatArgs.image_num = image_num;
   concatArgs.images_in = images_in;
   concatArgs.scales_in = scales_in;
-  concatArgs.image_out = (half *)out->data<float>();  // NOLINT
+  concatArgs.image_out = out->data<half>();
   concatArgs.scale_out = out->scale;
   concatArgs.channel_num = channel_num;
   concatArgs.height = height;
......
@@ -26,11 +26,11 @@ bool ConvAddBNKernel<FPGA, float>::Init(FusionConvAddBNParam<FPGA> *param) {
   paddle_mobile::fpga::ActivationType activation_enable =
       paddle_mobile::fpga::NONE;
   int16_t leaky_relu_negative_slope = 0;
-  auto input = const_cast<Tensor *>(param->Input());
+  auto input = const_cast<LoDTensor *>(param->Input());
   auto bias = param->Bias();
   auto bias_ptr = bias->data<float>();
-  auto filter = const_cast<Tensor *>(param->Filter());
+  auto filter = const_cast<LoDTensor *>(param->Filter());
   auto out = param->Output();
@@ -59,8 +59,6 @@ bool ConvAddBNKernel<FPGA, float>::Init(FusionConvAddBNParam<FPGA> *param) {
     bs_ptr[i + channel] = new_scale_ptr[i];
     bs_ptr[i] = new_bias_ptr[i];
   }
-  param->SetNewScale(new_scale);
-  param->SetNewBias(new_bias);
 
   fpga::format_conv_data(filter, out, &bs_ptr, param->Groups());
   fpga::SplitConvArgs conv_arg = {0};
@@ -70,6 +68,9 @@ bool ConvAddBNKernel<FPGA, float>::Init(FusionConvAddBNParam<FPGA> *param) {
                        param->Paddings()[0], param->Paddings()[1], bs_ptr);
   param->SetFpgaArgs(conv_arg);
 
+  delete new_scale;
+  delete new_bias;
+
   return true;
 }
......
@@ -27,10 +27,10 @@ bool ConvAddBNReluKernel<FPGA, float>::Init(
   paddle_mobile::fpga::ActivationType activation_enable =
       paddle_mobile::fpga::LEAKYRELU;
   int16_t leaky_relu_negative_slope = 0;
-  auto input = const_cast<Tensor *>(param->Input());
+  auto input = const_cast<LoDTensor *>(param->Input());
   auto bias = param->Bias();
   auto bias_ptr = bias->data<float>();
-  auto filter = const_cast<Tensor *>(param->Filter());
+  auto filter = const_cast<LoDTensor *>(param->Filter());
   auto out = param->Output();
   vector<int> paddings = param->Paddings();
@@ -60,8 +60,6 @@ bool ConvAddBNReluKernel<FPGA, float>::Init(
     bs_ptr[i + channel] = new_scale_ptr[i];
     bs_ptr[i] = new_bias_ptr[i];
   }
-  param->SetNewScale(new_scale);
-  param->SetNewBias(new_bias);
 
   const int groups = param->Groups();
   if (groups == channel) {
@@ -71,6 +69,8 @@ bool ConvAddBNReluKernel<FPGA, float>::Init(
                          leaky_relu_negative_slope, strides[0], strides[1],
                          paddings[0], paddings[1], new_bias_ptr);
     param->SetFpgaArgs(dwconv_arg);
+    fpga::fpga_free(new_scale_ptr);
+    fpga::fpga_free(bs_ptr);
   } else {
     fpga::format_conv_data(filter, out, &bs_ptr, param->Groups());
     fpga::SplitConvArgs conv_arg = {0};
@@ -78,6 +78,8 @@ bool ConvAddBNReluKernel<FPGA, float>::Init(
                          leaky_relu_negative_slope, param->Groups(), strides[0],
                          strides[1], paddings[0], paddings[1], bs_ptr);
     param->SetFpgaArgs(conv_arg);
+    delete new_scale;
+    delete new_bias;
   }
   return true;
 }
......
@@ -25,10 +25,10 @@ bool ConvAddKernel<FPGA, float>::Init(FusionConvAddParam<FPGA> *param) {
   paddle_mobile::fpga::ActivationType activation_enable =
       paddle_mobile::fpga::NONE;
   int16_t leaky_relu_negative_slope = 0;
-  auto input = const_cast<Tensor *>(param->Input());
+  auto input = const_cast<LoDTensor *>(param->Input());
   const Tensor *bias = param->Bias();
   auto bias_ptr = bias->data<float>();
-  auto filter = const_cast<Tensor *>(param->Filter());
+  auto filter = const_cast<LoDTensor *>(param->Filter());
   auto out = param->Output();
 
   PADDLE_MOBILE_ENFORCE(out->dims()[1] == bias->dims()[0],
......
@@ -25,10 +25,10 @@ bool ConvAddReluKernel<FPGA, float>::Init(FusionConvAddReluParam<FPGA> *param) {
   paddle_mobile::fpga::ActivationType activation_enable =
       paddle_mobile::fpga::LEAKYRELU;
   int16_t leaky_relu_negative_slope = 0;
-  auto input = const_cast<Tensor *>(param->Input());
+  auto input = const_cast<LoDTensor *>(param->Input());
   const Tensor *bias = param->Bias();
   auto bias_ptr = bias->data<float>();
-  auto filter = const_cast<Tensor *>(param->Filter());
+  auto filter = const_cast<LoDTensor *>(param->Filter());
   auto out = param->Output();
 
   PADDLE_MOBILE_ENFORCE(out->dims()[1] == bias->dims()[0],
......
@@ -26,8 +26,8 @@ bool ConvBNKernel<FPGA, float>::Init(FusionConvBNParam<FPGA> *param) {
   paddle_mobile::fpga::ActivationType activation_enable =
       paddle_mobile::fpga::NONE;
   int16_t leaky_relu_negative_slope = 0;
-  auto input = const_cast<Tensor *>(param->Input());
-  auto filter = const_cast<Tensor *>(param->Filter());
+  auto input = const_cast<LoDTensor *>(param->Input());
+  auto filter = const_cast<LoDTensor *>(param->Filter());
   auto out = param->Output();
   auto bn_mean_ptr = param->InputMean()->data<float>();
   auto bn_var_ptr = param->InputVariance()->data<float>();
@@ -51,8 +51,6 @@ bool ConvBNKernel<FPGA, float>::Init(FusionConvBNParam<FPGA> *param) {
     bs_ptr[i + channel] = new_scale_ptr[i];
     bs_ptr[i] = new_bias_ptr[i];
   }
-  param->SetNewScale(new_scale);
-  param->SetNewBias(new_bias);
 
   fpga::format_conv_data(filter, out, &bs_ptr, param->Groups());
   fpga::SplitConvArgs conv_arg = {0};
@@ -61,6 +59,8 @@ bool ConvBNKernel<FPGA, float>::Init(FusionConvBNParam<FPGA> *param) {
                        param->Strides()[0], param->Strides()[1],
                        param->Paddings()[0], param->Paddings()[1], bs_ptr);
   param->SetFpgaArgs(conv_arg);
+  delete new_scale;
+  delete new_bias;
   return true;
 }
......
@@ -26,8 +26,8 @@ bool ConvBNReluKernel<FPGA, float>::Init(FusionConvBNReluParam<FPGA> *param) {
   paddle_mobile::fpga::ActivationType activation_enable =
       paddle_mobile::fpga::LEAKYRELU;
   int16_t leaky_relu_negative_slope = 0;
-  auto input = const_cast<Tensor *>(param->Input());
-  auto filter = const_cast<Tensor *>(param->Filter());
+  auto input = const_cast<LoDTensor *>(param->Input());
+  auto filter = const_cast<LoDTensor *>(param->Filter());
   auto out = param->Output();
   auto bn_mean_ptr = param->InputMean()->data<float>();
   auto bn_var_ptr = param->InputVariance()->data<float>();
@@ -51,8 +51,6 @@ bool ConvBNReluKernel<FPGA, float>::Init(FusionConvBNReluParam<FPGA> *param) {
     bs_ptr[i + channel] = new_scale_ptr[i];
     bs_ptr[i] = new_bias_ptr[i];
   }
-  param->SetNewScale(new_scale);
-  param->SetNewBias(new_bias);
 
   fpga::format_conv_data(filter, out, &bs_ptr, param->Groups());
   fpga::SplitConvArgs conv_arg = {0};
@@ -61,6 +59,9 @@ bool ConvBNReluKernel<FPGA, float>::Init(FusionConvBNReluParam<FPGA> *param) {
                        param->Strides()[0], param->Strides()[1],
                        param->Paddings()[0], param->Paddings()[1], bs_ptr);
   param->SetFpgaArgs(conv_arg);
+
+  delete new_scale;
+  delete new_bias;
   return true;
 }
......
@@ -27,10 +27,10 @@ bool DeconvAddKernel<FPGA, float>::Init(FusionDeconvAddParam<FPGA> *param) {
   paddle_mobile::fpga::ActivationType activation_enable =
       paddle_mobile::fpga::NONE;
   int16_t leaky_relu_negative_slope = 0;
-  auto input = const_cast<Tensor *>(param->Input());
+  auto input = const_cast<LoDTensor *>(param->Input());
   const Tensor *bias = param->Bias();
   auto bias_ptr = bias->data<float>();
-  auto filter = const_cast<Tensor *>(param->Filter());
+  auto filter = const_cast<LoDTensor *>(param->Filter());
   auto out = param->Output();
 
   PADDLE_MOBILE_ENFORCE(out->dims()[1] == bias->dims()[0],
......
@@ -28,10 +28,10 @@ bool DeconvAddReluKernel<FPGA, float>::Init(
   paddle_mobile::fpga::ActivationType activation_enable =
       paddle_mobile::fpga::LEAKYRELU;
   int16_t leaky_relu_negative_slope = 0;
-  auto input = const_cast<Tensor *>(param->Input());
+  auto input = const_cast<LoDTensor *>(param->Input());
   const Tensor *bias = param->Bias();
   auto bias_ptr = bias->data<float>();
-  auto filter = const_cast<Tensor *>(param->Filter());
+  auto filter = const_cast<LoDTensor *>(param->Filter());
   auto out = param->Output();
 
   PADDLE_MOBILE_ENFORCE(out->dims()[1] == bias->dims()[0],
......
@@ -27,10 +27,10 @@ bool ElementwiseAddKernel<FPGA, float>::Init(ElementwiseAddParam<FPGA> *param) {
   auto *input_x = const_cast<LoDTensor *>(param->InputX());
   auto *input_y = const_cast<LoDTensor *>(param->InputY());
   auto *out = param->Out();
-  auto input_x_ptr = input_x->data<float>();
-  auto input_y_ptr = input_y->data<float>();
+  auto input_x_ptr = input_x->data<half>();
+  auto input_y_ptr = input_y->data<half>();
   fpga::format_fp16_ofm(out);
-  auto out_ptr = out->mutable_data<float>();
+  auto out_ptr = out->mutable_data<half>();
 
   fpga::EWAddArgs ewaddArgs = {0};
   // ewaddArgs.relu_enabled = relu_enabled;
......
@@ -28,10 +28,10 @@ bool ElementwiseAddReluKernel<FPGA, float>::Init(
   auto *input_x = const_cast<LoDTensor *>(param->InputX());
   auto *input_y = const_cast<LoDTensor *>(param->InputY());
   auto *out = param->Out();
-  auto input_x_ptr = input_x->data<float>();
-  auto input_y_ptr = input_y->data<float>();
+  auto input_x_ptr = input_x->data<half>();
+  auto input_y_ptr = input_y->data<half>();
   fpga::format_fp16_ofm(out);
-  auto out_ptr = out->mutable_data<float>();
+  auto out_ptr = out->mutable_data<half>();
 
   fpga::EWAddArgs ewaddArgs = {0};
   // ewaddArgs.relu_enabled = relu_enabled;
......
...@@ -19,19 +19,37 @@ namespace operators { ...@@ -19,19 +19,37 @@ namespace operators {
template <> template <>
bool FeedKernel<FPGA, float>::Init(FeedParam<FPGA> *param) { bool FeedKernel<FPGA, float>::Init(FeedParam<FPGA> *param) {
Tensor *output = param->Out(); auto output = param->Out();
int col = param->Col();
auto input = const_cast<LoDTensor *>(&param->InputX()->at(col));
input->init(typeid(float));
input->Resize(output->dims());
if (output->dims().size() != 4) {
auto input_ptr = input->mutable_data<float>();
size_t size = output->numel() * sizeof(float);
auto p = fpga::fpga_malloc(size);
memcpy(p, input_ptr, size);
output->reset_data_ptr(p);
return true;
}
fpga::format_fp16_ofm(output); fpga::format_fp16_ofm(output);
return true; return true;
} }
template <> template <>
void FeedKernel<FPGA, float>::Compute(const FeedParam<FPGA> &param) { void FeedKernel<FPGA, float>::Compute(const FeedParam<FPGA> &param) {
auto input = reinterpret_cast<Tensor *>(const_cast<LoDTensor *>(param.InputX())); auto output = param.Out();
int col = param.Col();
auto input = const_cast<LoDTensor *>(&param.InputX()->at(col));
if (input->dims().size() != 4) {
return;
}
fpga::format_image(input); fpga::format_image(input);
auto input_ptr = input->data<float>(); auto input_ptr = input->data<float>();
Tensor *output = param.Out();
auto output_ptr = output->data<float>(); auto output_ptr = output->data<half>();
fpga::BypassArgs args = {fpga::DATA_TYPE_FP32}; fpga::BypassArgs args = {fpga::DATA_TYPE_FP32};
...@@ -39,7 +57,7 @@ void FeedKernel<FPGA, float>::Compute(const FeedParam<FPGA> &param) { ...@@ -39,7 +57,7 @@ void FeedKernel<FPGA, float>::Compute(const FeedParam<FPGA> &param) {
args.output_data_type = fpga::DATA_TYPE_FP16; args.output_data_type = fpga::DATA_TYPE_FP16;
args.input_layout_type = fpga::LAYOUT_CHW; args.input_layout_type = fpga::LAYOUT_CHW;
args.output_layout_type = fpga::LAYOUT_HWC; args.output_layout_type = fpga::LAYOUT_HWC;
args.image.address = reinterpret_cast<void *>(input_ptr); args.image.address = input_ptr;
args.image.channels = (uint32_t)input->dims()[1]; args.image.channels = (uint32_t)input->dims()[1];
args.image.height = (uint32_t)input->dims()[2]; args.image.height = (uint32_t)input->dims()[2];
args.image.width = (uint32_t)input->dims()[3]; args.image.width = (uint32_t)input->dims()[3];
...@@ -48,6 +66,8 @@ void FeedKernel<FPGA, float>::Compute(const FeedParam<FPGA> &param) { ...@@ -48,6 +66,8 @@ void FeedKernel<FPGA, float>::Compute(const FeedParam<FPGA> &param) {
args.output.address = output_ptr; args.output.address = output_ptr;
args.output.scale_address = output->scale; args.output.scale_address = output->scale;
fpga::PerformBypass(args); fpga::PerformBypass(args);
input->external_data = nullptr;
} }
template class FeedKernel<FPGA, float>; template class FeedKernel<FPGA, float>;
......
...@@ -19,20 +19,15 @@ namespace operators { ...@@ -19,20 +19,15 @@ namespace operators {
template <> template <>
bool FetchKernel<FPGA, float>::Init(FetchParam<FPGA> *param) { bool FetchKernel<FPGA, float>::Init(FetchParam<FPGA> *param) {
Tensor *output = param->Out(); auto input = const_cast<LoDTensor *>(param->InputX());
// fpga::format_fp16_ofm(output); int col = param->Col();
return true; auto output = &(param->Out()->at(col));
} if (input->type() == typeid(float)) {
return true;
template <> }
void FetchKernel<FPGA, float>::Compute(const FetchParam<FPGA> &param) { output->init(typeid(float));
param.Out()->ShareDataWith(*(param.InputX())); output->Resize(input->dims());
/*auto input = fpga::format_fp32_ofm(output);
reinterpret_cast<Tensor *>(const_cast<Tensor *>(param.InputX()));
fpga::format_image(input);
auto input_ptr = input->data<float>();
Tensor *output = param.Out();
auto output_ptr = output->data<float>();
fpga::BypassArgs args = {fpga::DATA_TYPE_FP16}; fpga::BypassArgs args = {fpga::DATA_TYPE_FP16};
...@@ -40,13 +35,33 @@ void FetchKernel<FPGA, float>::Compute(const FetchParam<FPGA> &param) { ...@@ -40,13 +35,33 @@ void FetchKernel<FPGA, float>::Compute(const FetchParam<FPGA> &param) {
args.output_data_type = fpga::DATA_TYPE_FP32; args.output_data_type = fpga::DATA_TYPE_FP32;
args.input_layout_type = fpga::LAYOUT_CHW; args.input_layout_type = fpga::LAYOUT_CHW;
args.output_layout_type = fpga::LAYOUT_HWC; args.output_layout_type = fpga::LAYOUT_HWC;
args.image.address = reinterpret_cast<void *>(input_ptr); args.image.address = input->data<half>();
args.image.channels = (uint32_t)input->dims()[1]; args.image.channels = (uint32_t)product(input->dims());
args.image.height = (input->dims().size() == 4) ? (uint32_t)input->dims()[2] : 1; args.image.height = 1;
args.image.width = (input->dims().size() == 4) ? (uint32_t)input->dims()[3] : 1; args.image.width = 1;
args.image.pad_height = 0; args.image.pad_width = 0; args.image.pad_height = 0;
args.output.address = output_ptr; args.output.scale_address = output->scale; args.image.pad_width = 0;
fpga::PerformBypass(args);*/ args.output.address = output->data<float>();
args.output.scale_address = output->scale;
param->fpga_bypass_args = args;
return true;
}
template <>
void FetchKernel<FPGA, float>::Compute(const FetchParam<FPGA> &param) {
auto input = param.InputX();
if (input->type() == typeid(float)) {
int col = param.Col();
auto output = &(param.Out()->at(col));
output->ShareDataWith(*input);
return;
}
fpga::PerformBypass(param.fpga_bypass_args);
fpga::fpga_invalidate(param.fpga_bypass_args.output.address,
param.fpga_bypass_args.image.channels * sizeof(float));
// TODO: de-align: get rid of the extra zero padding
} }
template class FetchKernel<FPGA, float>; template class FetchKernel<FPGA, float>;
......
...@@ -25,7 +25,7 @@ bool FusionFcKernel<FPGA, float>::Init(FusionFcParam<FPGA> *param) { ...@@ -25,7 +25,7 @@ bool FusionFcKernel<FPGA, float>::Init(FusionFcParam<FPGA> *param) {
paddle_mobile::fpga::NONE; paddle_mobile::fpga::NONE;
int16_t leaky_relu_negative_slope = 0; int16_t leaky_relu_negative_slope = 0;
auto input_x = const_cast<LoDTensor *>(param->InputX()); auto input_x = const_cast<LoDTensor *>(param->InputX());
auto filter = const_cast<Tensor *>(param->InputY()); auto filter = const_cast<LoDTensor *>(param->InputY());
const Tensor *input_z = param->InputZ(); const Tensor *input_z = param->InputZ();
auto input_z_ptr = input_z->data<float>(); auto input_z_ptr = input_z->data<float>();
auto out = param->Out(); auto out = param->Out();
......
...@@ -21,11 +21,11 @@ namespace operators { ...@@ -21,11 +21,11 @@ namespace operators {
template <> template <>
bool PoolKernel<FPGA, float>::Init(PoolParam<FPGA> *param) { bool PoolKernel<FPGA, float>::Init(PoolParam<FPGA> *param) {
auto *input = const_cast<Tensor *>(param->Input()); auto *input = const_cast<LoDTensor *>(param->Input());
auto input_ptr = input->data<float>(); auto input_ptr = input->data<half>();
Tensor *output = param->Output(); Tensor *output = param->Output();
fpga::format_fp16_ofm(output); fpga::format_fp16_ofm(output);
auto output_ptr = output->mutable_data<float>(); auto output_ptr = output->mutable_data<half>();
vector<int> ksize = param->Ksize(); vector<int> ksize = param->Ksize();
vector<int> strides = param->Strides(); vector<int> strides = param->Strides();
vector<int> paddings = param->Paddings(); vector<int> paddings = param->Paddings();
......
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifdef PROPOSAL_OP
#include <algorithm>
#include <cmath>
#include <vector>
#include "operators/kernel/detection_kernel.h"
namespace paddle_mobile {
namespace operators {
static const double kBBoxClipDefault = std::log(1000.0 / 16.0);
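// exp(kBBoxClipDefault) = 1000 / 16 = 62.5, so BoxCoder below lets a decoded
// box grow at most 62.5x relative to its anchor, guarding std::exp() against
// unbounded regression outputs.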
template <>
bool ProposalKernel<FPGA, float>::Init(ProposalParam<FPGA> *param) {
int post_nms_top_n = param->post_nms_topn_;
int64_t batch = param->scores_->dims()[0];
auto total = post_nms_top_n * batch;
param->rpn_rois_->mutable_data<float>({total, 4});
param->rpn_probs_->mutable_data<float>({total, 1});
// DLOG << *param->rpn_rois_;
// DLOG << *param->rpn_probs_;
param->float_bbox = std::make_shared<Tensor>();
param->float_bbox->Resize(param->bbox_deltas_->dims());
param->float_bbox->init(typeid(float));
fpga::format_fp32_ofm(param->float_bbox.get());
param->float_score = std::make_shared<Tensor>();
param->float_score->Resize(param->scores_->dims());
param->float_score->init(typeid(float));
fpga::format_fp32_ofm(param->float_score.get());
auto input = param->bbox_deltas_;
fpga::BypassArgs args = {fpga::DATA_TYPE_FP16};
args.input_layout_type = fpga::LAYOUT_HWC;
args.output_layout_type = fpga::LAYOUT_HWC;
args.input_data_type = fpga::DATA_TYPE_FP16;
args.output_data_type = fpga::DATA_TYPE_FP32;
args.image.address = input->data<half>();
args.image.height = (uint32_t)input->dims()[2];
args.image.width = (uint32_t)input->dims()[3];
args.image.channels = (uint32_t)input->dims()[1];
args.output.address = param->float_bbox->mutable_data<float>();
args.output.scale_address = param->float_bbox->scale;
param->bbox_arg = args;
input = param->scores_;
args.image.address = input->data<half>();
args.image.height = (uint32_t)input->dims()[2];
args.image.width = (uint32_t)input->dims()[3];
args.image.channels = (uint32_t)input->dims()[1];
args.output.address = param->float_score->mutable_data<float>();
args.output.scale_address = param->float_score->scale;
param->score_arg = args;
return true;
}
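// Init pre-builds both BypassArgs so Compute only replays them: score_arg
// converts the FP16 score map and bbox_arg the FP16 bbox-delta map into the
// FP32 staging tensors (float_score / float_bbox) consumed by the CPU
// post-processing below.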
void AppendProposals(Tensor *dst, int64_t offset, const Tensor &src) {
auto *out_data = dst->data<void>();
auto *to_add_data = src.data<void>();
size_t size_of_t = framework::SizeOfType(src.type());
offset *= size_of_t;
std::memcpy(
reinterpret_cast<void *>(reinterpret_cast<uintptr_t>(out_data) + offset),
to_add_data, src.numel() * size_of_t);
}
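// Note that `offset` is an element count and is scaled to bytes via
// SizeOfType: appending a {3, 4} float tensor at row 5 of an {N, 4} tensor
// uses offset 5 * 4 = 20 elements, i.e. 80 bytes.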
template <class T>
static inline void BoxCoder(Tensor *all_anchors, Tensor *bbox_deltas,
Tensor *variances, Tensor *proposals) {
T *proposals_data = proposals->mutable_data<T>();
int64_t row = all_anchors->dims()[0];
int64_t len = all_anchors->dims()[1];
auto *bbox_deltas_data = bbox_deltas->data<T>();
auto *anchor_data = all_anchors->data<T>();
const T *variances_data = nullptr;
if (variances) {
variances_data = variances->data<T>();
}
for (int64_t i = 0; i < row; ++i) {
T anchor_width = anchor_data[i * len + 2] - anchor_data[i * len] + 1.0;
T anchor_height = anchor_data[i * len + 3] - anchor_data[i * len + 1] + 1.0;
T anchor_center_x = anchor_data[i * len] + 0.5 * anchor_width;
T anchor_center_y = anchor_data[i * len + 1] + 0.5 * anchor_height;
T bbox_center_x = 0, bbox_center_y = 0;
T bbox_width = 0, bbox_height = 0;
if (variances) {
bbox_center_x =
variances_data[i * len] * bbox_deltas_data[i * len] * anchor_width +
anchor_center_x;
bbox_center_y = variances_data[i * len + 1] *
bbox_deltas_data[i * len + 1] * anchor_height +
anchor_center_y;
bbox_width = std::exp(std::min<T>(variances_data[i * len + 2] *
bbox_deltas_data[i * len + 2],
kBBoxClipDefault)) *
anchor_width;
bbox_height = std::exp(std::min<T>(variances_data[i * len + 3] *
bbox_deltas_data[i * len + 3],
kBBoxClipDefault)) *
anchor_height;
} else {
bbox_center_x =
bbox_deltas_data[i * len] * anchor_width + anchor_center_x;
bbox_center_y =
bbox_deltas_data[i * len + 1] * anchor_height + anchor_center_y;
bbox_width = std::exp(std::min<T>(bbox_deltas_data[i * len + 2],
kBBoxClipDefault)) *
anchor_width;
bbox_height = std::exp(std::min<T>(bbox_deltas_data[i * len + 3],
kBBoxClipDefault)) *
anchor_height;
}
proposals_data[i * len] = bbox_center_x - bbox_width / 2;
proposals_data[i * len + 1] = bbox_center_y - bbox_height / 2;
proposals_data[i * len + 2] = bbox_center_x + bbox_width / 2 - 1;
proposals_data[i * len + 3] = bbox_center_y + bbox_height / 2 - 1;
}
// return proposals;
}
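// Worked example (no variances): anchor [0, 0, 9, 9] has w = h = 10 and
// center (4.5, 4.5); deltas [0.1, 0.2, log(2), 0] decode to center
// (5.5, 6.5) with w = 20, h = 10, i.e. the proposal [-4.5, 1.5, 14.5, 10.5],
// which ClipTiledBoxes then snaps into the image.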
template <class T>
static inline void ClipTiledBoxes(const Tensor &im_info, Tensor *boxes) {
T *boxes_data = boxes->mutable_data<T>();
const T *im_info_data = im_info.data<T>();
T zero(0);
for (int64_t i = 0; i < boxes->numel(); ++i) {
if (i % 4 == 0) {
boxes_data[i] =
std::max(std::min(boxes_data[i], im_info_data[1] - 1), zero);
} else if (i % 4 == 1) {
boxes_data[i] =
std::max(std::min(boxes_data[i], im_info_data[0] - 1), zero);
} else if (i % 4 == 2) {
boxes_data[i] =
std::max(std::min(boxes_data[i], im_info_data[1] - 1), zero);
} else {
boxes_data[i] =
std::max(std::min(boxes_data[i], im_info_data[0] - 1), zero);
}
}
}
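// Boxes are stored [xmin, ymin, xmax, ymax]: even offsets are x coordinates
// clipped to [0, im_width - 1] (im_info_data[1]), odd offsets are y
// coordinates clipped to [0, im_height - 1] (im_info_data[0]).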
template <class T>
static inline void FilterBoxes(Tensor *boxes, float min_size,
const Tensor &im_info, Tensor *keep) {
const T *im_info_data = im_info.data<T>();
T *boxes_data = boxes->mutable_data<T>();
T im_scale = im_info_data[2];
keep->Resize({boxes->dims()[0]});
min_size = std::max(min_size, 1.0f);
int *keep_data = keep->mutable_data<int>();
int keep_len = 0;
for (int i = 0; i < boxes->dims()[0]; ++i) {
T ws = boxes_data[4 * i + 2] - boxes_data[4 * i] + 1;
T hs = boxes_data[4 * i + 3] - boxes_data[4 * i + 1] + 1;
T ws_origin_scale =
(boxes_data[4 * i + 2] - boxes_data[4 * i]) / im_scale + 1;
T hs_origin_scale =
(boxes_data[4 * i + 3] - boxes_data[4 * i + 1]) / im_scale + 1;
T x_ctr = boxes_data[4 * i] + ws / 2;
T y_ctr = boxes_data[4 * i + 1] + hs / 2;
if (ws_origin_scale >= min_size && hs_origin_scale >= min_size &&
x_ctr <= im_info_data[1] && y_ctr <= im_info_data[0]) {
keep_data[keep_len++] = i;
}
}
keep->Resize({keep_len});
}
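// A box survives only if its width and height measured at the original image
// scale (divided by im_scale) reach min_size and its center lies inside the
// image; `keep` is resized to the number of surviving indices.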
template <class T>
static inline std::vector<std::pair<T, int>> GetSortedScoreIndex(
const std::vector<T> &scores) {
std::vector<std::pair<T, int>> sorted_indices;
sorted_indices.reserve(scores.size());
for (size_t i = 0; i < scores.size(); ++i) {
sorted_indices.emplace_back(scores[i], i);
}
// Sort the score pairs in ascending order; NMS pops the highest score
// from the back of the vector.
std::stable_sort(sorted_indices.begin(), sorted_indices.end(),
[](const std::pair<T, int> &a, const std::pair<T, int> &b) {
return a.first < b.first;
});
return sorted_indices;
}
template <class T>
static inline T BBoxArea(const T *box, bool normalized) {
if (box[2] < box[0] || box[3] < box[1]) {
// If coordinate values are invalid
// (e.g. xmax < xmin or ymax < ymin), return 0.
return static_cast<T>(0.);
} else {
const T w = box[2] - box[0];
const T h = box[3] - box[1];
if (normalized) {
return w * h;
} else {
// If coordinate values are not within range [0, 1].
return (w + 1) * (h + 1);
}
}
}
template <typename T>
static inline Tensor VectorToTensor(const std::vector<T> &selected_indices,
int selected_num) {
Tensor keep_nms;
keep_nms.Resize({selected_num});
auto *keep_data = keep_nms.mutable_data<T>();
for (int i = 0; i < selected_num; ++i) {
keep_data[i] = selected_indices[i];
}
return keep_nms;
}
template <class T>
static inline T JaccardOverlap(const T *box1, const T *box2, bool normalized) {
if (box2[0] > box1[2] || box2[2] < box1[0] || box2[1] > box1[3] ||
box2[3] < box1[1]) {
return static_cast<T>(0.);
} else {
const T inter_xmin = std::max(box1[0], box2[0]);
const T inter_ymin = std::max(box1[1], box2[1]);
const T inter_xmax = std::min(box1[2], box2[2]);
const T inter_ymax = std::min(box1[3], box2[3]);
const T inter_w = std::max(T(0), inter_xmax - inter_xmin + 1);
const T inter_h = std::max(T(0), inter_ymax - inter_ymin + 1);
const T inter_area = inter_w * inter_h;
const T bbox1_area = BBoxArea<T>(box1, normalized);
const T bbox2_area = BBoxArea<T>(box2, normalized);
return inter_area / (bbox1_area + bbox2_area - inter_area);
}
}
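// Worked example (normalized == false, +1 pixel convention): box1 =
// [0, 0, 9, 9] and box2 = [5, 5, 14, 14] intersect in a 5 x 5 patch, so
// IoU = 25 / (100 + 100 - 25) = 25 / 175 ~= 0.143.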
template <class T>
static inline Tensor NMS(Tensor *bbox, Tensor *scores, T nms_threshold,
float eta) {
int64_t num_boxes = bbox->dims()[0];
// 4: [xmin ymin xmax ymax]
int64_t box_size = bbox->dims()[1];
std::vector<T> scores_data(num_boxes);
std::copy_n(scores->data<T>(), num_boxes, scores_data.begin());
std::vector<std::pair<T, int>> sorted_indices =
GetSortedScoreIndex<T>(scores_data);
std::vector<int> selected_indices;
int selected_num = 0;
T adaptive_threshold = nms_threshold;
const T *bbox_data = bbox->data<T>();
while (sorted_indices.size() != 0) {
int idx = sorted_indices.back().second;
bool flag = true;
for (int kept_idx : selected_indices) {
if (flag) {
T overlap = JaccardOverlap<T>(bbox_data + idx * box_size,
bbox_data + kept_idx * box_size, false);
flag = (overlap <= adaptive_threshold);
} else {
break;
}
}
if (flag) {
selected_indices.push_back(idx);
++selected_num;
}
sorted_indices.erase(sorted_indices.end() - 1);
if (flag && eta < 1 && adaptive_threshold > 0.5) {
adaptive_threshold *= eta;
}
}
return VectorToTensor(selected_indices, selected_num);
}
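// Greedy NMS: scores are sorted ascending and popped from the back, so the
// highest-scoring box is always considered first. With eta < 1 the threshold
// adapts: each kept box decays it while it exceeds 0.5, e.g. nms_threshold =
// 0.7, eta = 0.9 gives 0.7 -> 0.63 -> 0.567 -> ... until it drops to 0.5
// or below.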
template <typename T>
std::pair<Tensor, Tensor> ProposalForOneImage(
const Tensor &im_info_slice, const Tensor &anchors, const Tensor &variances,
const Tensor &bbox_deltas_slice, // [M, 4]
const Tensor &scores_slice, // [N, 1]
int pre_nms_top_n, int post_nms_top_n, float nms_thresh, float min_size,
float eta) {
auto *scores_data = scores_slice.data<T>();
// Sort index
Tensor index_t;
index_t.Resize({scores_slice.numel()});
int *index = index_t.mutable_data<int>();
for (int i = 0; i < scores_slice.numel(); ++i) {
index[i] = i;
}
auto compare = [scores_data](const int64_t &i, const int64_t &j) {
return scores_data[i] > scores_data[j];
};
if (pre_nms_top_n <= 0 || pre_nms_top_n >= scores_slice.numel()) {
std::sort(index, index + scores_slice.numel(), compare);
} else {
std::nth_element(index, index + pre_nms_top_n, index + scores_slice.numel(),
compare);
index_t.Resize({pre_nms_top_n});
}
Tensor scores_sel, bbox_sel, anchor_sel, var_sel;
scores_sel.mutable_data<T>({index_t.numel(), 1});
bbox_sel.mutable_data<T>({index_t.numel(), 4});
anchor_sel.mutable_data<T>({index_t.numel(), 4});
var_sel.mutable_data<T>({index_t.numel(), 4});
Tensor proposals;
proposals.mutable_data<T>({index_t.numel(), 4});
BoxCoder<T>(&anchor_sel, &bbox_sel, &var_sel, &proposals);
ClipTiledBoxes<T>(im_info_slice, &proposals);
Tensor keep;
FilterBoxes<T>(&proposals, min_size, im_info_slice, &keep);
Tensor scores_filter;
bbox_sel.mutable_data<T>({keep.numel(), 4});
scores_filter.mutable_data<T>({keep.numel(), 1});
if (nms_thresh <= 0) {
return std::make_pair(bbox_sel, scores_filter);
}
Tensor keep_nms = NMS<T>(&bbox_sel, &scores_filter, nms_thresh, eta);
if (post_nms_top_n > 0 && post_nms_top_n < keep_nms.numel()) {
keep_nms.Resize({post_nms_top_n});
}
proposals.mutable_data<T>({keep_nms.numel(), 4});
scores_sel.mutable_data<T>({keep_nms.numel(), 1});
return std::make_pair(proposals, scores_sel);
}
template <>
void ProposalKernel<FPGA, float>::Compute(const ProposalParam<FPGA> &param) {
auto score_tensor = param.float_score.get();
fpga::PerformBypass(param.score_arg);
fpga::fpga_invalidate(score_tensor->data<float>(),
score_tensor->numel() * sizeof(float));
auto bbox_tensor = param.float_bbox.get();
fpga::PerformBypass(param.bbox_arg);
fpga::fpga_invalidate(bbox_tensor->data<float>(),
bbox_tensor->numel() * sizeof(float));
auto *scores = param.float_score.get();
auto *bbox_deltas = param.float_bbox.get();
auto *im_info = param.im_info_;
auto anchors = *param.anchors_;
auto variances = *param.variances_;
auto *rpn_rois = param.rpn_rois_;
auto *rpn_roi_probs = param.rpn_probs_;
int pre_nms_top_n = param.pre_nms_topn_;
int post_nms_top_n = param.post_nms_topn_;
float nms_thresh = param.nms_thresh_;
float min_size = param.min_size_;
float eta = param.eta_;
auto &scores_dim = scores->dims();
int64_t num = scores_dim[0];
int64_t c_score = scores_dim[1];
int64_t h_score = scores_dim[2];
int64_t w_score = scores_dim[3];
auto &bbox_dim = bbox_deltas->dims();
int64_t c_bbox = bbox_dim[1];
int64_t h_bbox = bbox_dim[2];
int64_t w_bbox = bbox_dim[3];
//
Tensor bbox_deltas_swap, scores_swap;
bbox_deltas_swap.mutable_data<float>({num, h_bbox, w_bbox, c_bbox});
scores_swap.mutable_data<float>({num, h_score, w_score, c_score});
framework::LoD lod;
lod.resize(1);
auto &lod0 = lod[0];
lod0.push_back(0);
anchors.Resize({anchors.numel() / 4, 4});
int64_t num_proposals = 0;
for (int64_t i = 0; i < num; ++i) {
Tensor im_info_slice = im_info->Slice(i, i + 1);
Tensor bbox_deltas_slice = bbox_deltas_swap.Slice(i, i + 1);
Tensor scores_slice = scores_swap.Slice(i, i + 1);
bbox_deltas_slice.Resize({h_bbox * w_bbox * c_bbox / 4, 4});
scores_slice.Resize({h_score * w_score * c_score, 1});
std::pair<Tensor, Tensor> tensor_pair = ProposalForOneImage<float>(
im_info_slice, anchors, variances, bbox_deltas_slice, scores_slice,
pre_nms_top_n, post_nms_top_n, nms_thresh, min_size, eta);
Tensor &proposals = tensor_pair.first;
Tensor &scores = tensor_pair.second;
AppendProposals(rpn_rois, 4 * num_proposals, proposals);
AppendProposals(rpn_roi_probs, num_proposals, scores);
num_proposals += proposals.dims()[0];
lod0.push_back(num_proposals);
}
rpn_rois->set_lod(lod);
rpn_roi_probs->set_lod(lod);
rpn_rois->Resize({num_proposals, 4});
rpn_roi_probs->Resize({num_proposals, 1});
}
} // namespace operators
} // namespace paddle_mobile
#endif // PROPOSAL_OP
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifdef PSROI_POOL_OP
#include <cmath>
#include <vector>
#include "operators/kernel/detection_kernel.h"
namespace paddle_mobile {
namespace operators {
template <>
bool PSRoiPoolKernel<FPGA, float>::Init(PSRoiPoolParam<FPGA>* param) {
auto dims = param->input_x_->dims();
PADDLE_MOBILE_ENFORCE(dims[1] * dims[3] % IMAGE_ALIGNMENT == 0,
"data not aligned");
param->float_input = std::make_shared<Tensor>();
param->float_input->mutable_data<float>(param->input_x_->dims());
param->float_output = std::make_shared<Tensor>();
param->float_output->mutable_data<float>(param->output_->dims());
auto input = param->input_x_;
fpga::BypassArgs args = {fpga::DATA_TYPE_FP16};
args.input_layout_type = fpga::LAYOUT_HWC;
args.output_layout_type = fpga::LAYOUT_HWC;
args.input_data_type = fpga::DATA_TYPE_FP16;
args.output_data_type = fpga::DATA_TYPE_FP32;
args.image.address = input->data<half>();
args.image.height = (uint32_t)input->dims()[2];
args.image.width = (uint32_t)input->dims()[3];
args.image.channels = (uint32_t)input->dims()[1];
args.output.address = param->float_input->mutable_data<float>();
args.output.scale_address = param->float_input->scale;
param->input_arg = args;
fpga::format_fp16_ofm(param->output_);
input = param->float_output.get();
args.input_data_type = fpga::DATA_TYPE_FP32;
args.output_data_type = fpga::DATA_TYPE_FP16;
args.image.address = input->data<float>();
args.image.height = (uint32_t)input->dims()[2];
args.image.width = (uint32_t)input->dims()[3];
args.image.channels = (uint32_t)input->dims()[1];
args.output.address = param->output_->mutable_data<half>();
args.output.scale_address = param->output_->scale;
param->output_arg = args;  // the FP32 -> FP16 conversion replayed at the end of Compute
return true;
}
template <>
void PSRoiPoolKernel<FPGA, float>::Compute(const PSRoiPoolParam<FPGA>& param) {
auto input_tensor = param.float_input.get();
fpga::PerformBypass(param.input_arg);
fpga::fpga_invalidate(input_tensor->data<float>(),
input_tensor->numel() * sizeof(float));
auto* in = input_tensor;
auto* rois = param.input_rois_;
auto* out = param.float_output.get();
auto pooled_height = param.pooled_height_;
auto pooled_width = param.pooled_width_;
auto spatial_scale = param.spatial_scale_;
auto output_channels = param.output_channels_;
auto in_dims = in->dims();
int batch_size = in_dims[0];
int input_channels = in_dims[1];
int height = in_dims[2];
int width = in_dims[3];
int rois_num = rois->dims()[0];
// TODO auto in_stride = framework::stride(in_dims);
// TODO auto out_stride = framework::stride(out->dims());
auto in_stride =
framework::stride({batch_size, height, width, input_channels});
auto out_stride = framework::stride(
{out->dims()[0], out->dims()[2], out->dims()[3], out->dims()[1]});
const float* input_data = in->data<float>();
framework::Tensor rois_batch_id_list;
rois_batch_id_list.Resize({rois_num});
auto rois_batch_id_data = rois_batch_id_list.mutable_data<int>();
PADDLE_MOBILE_ENFORCE(rois->NumLevels() > 0, "ROIS should not be empty");
auto rois_lod = rois->lod().back();
int rois_batch_size = rois_lod.size() - 1;
PADDLE_MOBILE_ENFORCE(
rois_batch_size == batch_size,
"the rois_batch_size and input(X) batch_size should be the same.");
int rois_num_with_lod = rois_lod[rois_batch_size];
PADDLE_MOBILE_ENFORCE(rois_num_with_lod == rois_num,
"the rois_num from input and lod must be the same");
PADDLE_MOBILE_ENFORCE(
input_channels == output_channels * pooled_height * pooled_width,
"the channels of input X should equal the product of "
"output_channels x pooled_height x pooled_width");
// calculate batch id index for each roi according to LoD
for (int n = 0; n < rois_batch_size; ++n) {
for (size_t i = rois_lod[n]; i < rois_lod[n + 1]; ++i) {
rois_batch_id_data[i] = n;
}
}
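// Position-sensitive pooling: output channel c at bin (ph, pw) reads its
// own input channel (c * pooled_height + ph) * pooled_width + pw and
// averages the input pixels falling into that bin, which is why
// input_channels must equal output_channels * pooled_height * pooled_width.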
auto output_data = out->mutable_data<float>();
auto input_rois = rois->data<float>();
// calculate psroipooling, parallel processing can be implemented per ROI
for (int n = 0; n < rois_num; ++n) {
// set roi batch id
int roi_batch_id = rois_batch_id_data[n];
// [start, end) interval for spatial sampling
auto offset_input_rois = input_rois + n * 4;
auto roi_start_w =
static_cast<float>(round(offset_input_rois[0])) * spatial_scale;
auto roi_start_h =
static_cast<float>(round(offset_input_rois[1])) * spatial_scale;
auto roi_end_w =
static_cast<float>(round(offset_input_rois[2]) + 1.) * spatial_scale;
auto roi_end_h =
static_cast<float>(round(offset_input_rois[3]) + 1.) * spatial_scale;
// Force too small rois to be 1 x 1
auto roi_height = std::max(roi_end_h - roi_start_h, 0.1f); // avoid 0
auto roi_width = std::max(roi_end_w - roi_start_w, 0.1f);
// Compute bin size w and h at input feature map
auto bin_size_h = roi_height / static_cast<float>(pooled_height);
auto bin_size_w = roi_width / static_cast<float>(pooled_width);
// calculate each pixel of the output feature map.
int out_roi_offset = n * out_stride[0];
for (int c = 0; c < output_channels; ++c) {
// per category
// int out_plane_offset = out_roi_offset + c * out_stride[1];
int out_plane_offset = out_roi_offset + c;
for (int ph = 0; ph < pooled_height; ++ph) {
// TODO int out_row_offset = out_plane_offset + ph *
// out_stride[2];
int out_row_offset = out_plane_offset + ph * out_stride[1];
for (int pw = 0; pw < pooled_width; ++pw) {
// calculate w and h at input feature map
int hstart = floor(static_cast<float>(ph) * bin_size_h + roi_start_h);
int wstart = floor(static_cast<float>(pw) * bin_size_w + roi_start_w);
int hend =
ceil(static_cast<float>(ph + 1) * bin_size_h + roi_start_h);
int wend =
ceil(static_cast<float>(pw + 1) * bin_size_w + roi_start_w);
// Add roi offsets and clip to input boundaries
hstart = std::min(std::max(hstart, 0), height);
wstart = std::min(std::max(wstart, 0), width);
hend = std::min(std::max(hend, 0), height);
wend = std::min(std::max(wend, 0), width);
// TODO int output_index = out_row_offset + pw;
int output_index = out_row_offset + pw * output_channels;
int input_channel = (c * pooled_height + ph) * pooled_width + pw;
// TODO int input_plane_offset =
// TODO roi_batch_id * in_stride[0] + input_channel *
// in_stride[1];
int input_plane_offset = roi_batch_id * in_stride[0] + input_channel;
auto offset_input_data = input_data + input_plane_offset;
float out_sum = 0.;
bool is_empty = (hend <= hstart) || (wend <= wstart);
for (int ih = hstart; ih < hend; ++ih) {
for (int iw = wstart; iw < wend; ++iw) {
// NHWC indexing: the channel offset is already folded into
// input_plane_offset, so step by the row stride and the per-pixel
// channel stride here.
int input_index = ih * in_stride[1] + iw * in_stride[2];
out_sum += offset_input_data[input_index];
}
}
float bin_area = (hend - hstart) * (wend - wstart);
output_data[output_index] = is_empty ? 0. : out_sum / bin_area;
}
}
}
}
fpga::format_image(out);
fpga::PerformBypass(param.output_arg);
}
} // namespace operators
} // namespace paddle_mobile
#endif // PSROI_POOL_OP
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifdef RESHAPE2_OP
#include "operators/kernel/reshape2_kernel.h"
#include "framework/ddim.h"
namespace paddle_mobile {
namespace operators {
template <>
bool Reshape2Kernel<FPGA, float>::Init(Reshape2Param<FPGA> *param) {
auto input = const_cast<LoDTensor *>(param->InputX());
auto output = param->Out();
auto shape = param->Shape();
auto num_in = framework::product(input->dims());
auto num_shape = framework::product(framework::make_ddim(shape));
PADDLE_MOBILE_ENFORCE(num_shape != 0, "0 index is not supported");
for (int i = 0; i < shape.size(); i++) {
if (shape[i] == -1) {
shape[i] = static_cast<int>(-num_in / num_shape);
break;
}
}
output->Resize(framework::make_ddim(shape));
output->set_type(input->type());
fpga::format_ofm(output);
DLOG << "input: " << input;
DLOG << "output: " << output;
return true;
}
void reshape(LoDTensor *input, LoDTensor *output) {
// Subscript r means after reshape
// TODO zhangyang verify this function
float *input_ptr_f, *output_ptr_f;
half *input_ptr_h, *output_ptr_h;
bool is_float = false;
if (input->type() == typeid(float)) {
input_ptr_f = input->data<float>();
output_ptr_f = output->data<float>();
is_float = true;
} else {
input_ptr_h = input->data<half>();
output_ptr_h = output->data<half>();
}
auto C = static_cast<int>(input->dims()[1]);
auto H = static_cast<int>(input->dims()[2]);
auto W = static_cast<int>(input->dims()[3]);
auto Cr = static_cast<int>(output->dims()[1]);
auto Hr = static_cast<int>(output->dims()[2]);
auto Wr = static_cast<int>(output->dims()[3]);
PADDLE_MOBILE_ENFORCE(C * H * W == Cr * Hr * Wr, "Dims don't match");
auto WC = W * C;
auto WC_align = fpga::align_to_x(WC, IMAGE_ALIGNMENT);
auto HW = H * W;
auto WCr = Wr * Cr;
auto WCr_align = fpga::align_to_x(WCr, IMAGE_ALIGNMENT);
auto HWr = Hr * Wr;
int offset_align = 0;
int offset_r = 0, offset_align_r = 0;
int cr = 0, hr = 0, wr = 0;
for (int h = 0; h < H; h++) {
int offset0 = h * WC_align;
for (int w = 0; w < W; w++) {
int offset1 = w * C + offset0;
for (int c = 0; c < C; c++) {
offset_align = offset1 + c;
offset_r = c * HW + h * W + w;
cr = offset_r / HWr;
hr = offset_r % HWr / Wr;
wr = offset_r % Wr;
offset_align_r = hr * WCr_align + wr * Cr + cr;
// DLOG << "hwc"<< h<< " " << w << " " << c;
// DLOG << "hrwrcr" << hr<< " " << wr << " " << cr;
if (is_float) {
output_ptr_f[offset_align_r] = input_ptr_f[offset_align];
} else {
output_ptr_h[offset_align_r] = input_ptr_h[offset_align];
}
}
}
}
}
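// Example with IMAGE_ALIGNMENT assumed to be 16: for C = 3, H = 2, W = 3,
// each input HWC row holds W * C = 9 valid values padded to WC_align = 16,
// so element (h, w, c) sits at h * 16 + w * 3 + c. The loop walks the valid
// slots, recovers the flat CHW offset, and re-derives (cr, hr, wr) in the
// output's aligned HWC layout.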
template <>
void Reshape2Kernel<FPGA, float>::Compute(const Reshape2Param<FPGA> &param) {
auto input = const_cast<LoDTensor *>(param.InputX());
auto output = param.Out();
auto shape = param.Shape();
auto num_in = framework::product(input->dims());
auto num_shape = framework::product(framework::make_ddim(shape));
PADDLE_MOBILE_ENFORCE(num_shape != 0, "0 index is not supported");
for (int i = 0; i < shape.size(); i++) {
if (shape[i] == -1) {
shape[i] = static_cast<int>(-num_in / num_shape);
break;
}
}
output->Resize(framework::make_ddim(shape));
if (output->dims() == input->dims()) {
DLOG << "No need to reshape";
return;
}
reshape(input, output);
//
}
} // namespace operators
} // namespace paddle_mobile
#endif
...@@ -25,7 +25,7 @@ bool SigmoidKernel<FPGA, float>::Init(SigmoidParam<FPGA> *param) { ...@@ -25,7 +25,7 @@ bool SigmoidKernel<FPGA, float>::Init(SigmoidParam<FPGA> *param) {
paddle_mobile::fpga::SIGMOID; paddle_mobile::fpga::SIGMOID;
int16_t leaky_relu_negative_slope = 0; int16_t leaky_relu_negative_slope = 0;
auto input = const_cast<Tensor *>(param->InputX()); auto input = const_cast<Tensor *>(param->InputX());
auto input_ptr = input->data<float>(); auto input_ptr = input->data<half>();
auto out = param->Out(); auto out = param->Out();
fpga::format_fp16_ofm(out); fpga::format_fp16_ofm(out);
...@@ -38,7 +38,7 @@ bool SigmoidKernel<FPGA, float>::Init(SigmoidParam<FPGA> *param) { ...@@ -38,7 +38,7 @@ bool SigmoidKernel<FPGA, float>::Init(SigmoidParam<FPGA> *param) {
args.image.width = args.image.width =
(input->dims().size() == 4) ? (uint32_t)input->dims()[3] : 1; (input->dims().size() == 4) ? (uint32_t)input->dims()[3] : 1;
args.image.channels = (uint32_t)input->dims()[1]; args.image.channels = (uint32_t)input->dims()[1];
args.output.address = out->data<float>(); args.output.address = out->data<half>();
args.output.scale_address = out->scale; args.output.scale_address = out->scale;
args.output.activation.activation_type = activation_enable; args.output.activation.activation_type = activation_enable;
args.output.activation.leaky_relu_negative_slope = leaky_relu_negative_slope; args.output.activation.leaky_relu_negative_slope = leaky_relu_negative_slope;
......
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifdef SLICE_OP
#include "operators/kernel/slice_kernel.h"
namespace paddle_mobile {
namespace operators {
template <>
bool SliceKernel<FPGA, float>::Init(SliceParam<FPGA>* param) {
auto output = param->output_;
fpga::format_fp16_ofm(output);
DLOG << "input: " << param->input_;
DLOG << "output: " << param->output_;
if (param->input_->type() != typeid(half)) {
DLOG << "wrong type";
}
return true;
}
template <>
void SliceKernel<FPGA, float>::Compute(const SliceParam<FPGA>& param) {
// Only support slicing in channel dimension
auto input = param.input_;
DLOG << input;
int HW = input->dims()[2] * input->dims()[3];
int channel = input->dims()[1];
auto input_ptr = input->data<half>();
auto output_ptr = param.output_->data<half>();
int start = param.starts_[0], end = param.ends_[0];
start = start < 0 ? start + channel : start;
end = end < 0 ? end + channel : end;
start = start > channel ? channel : start;
end = end > channel ? channel : end;
int len = end - start;
for (int i = 0; i < HW; i++) {
memcpy(output_ptr + len * i, input_ptr + i * channel + start,
len * sizeof(half));
}
}
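// Only the channel dimension is sliced: negative starts/ends wrap around
// (start += channel) and both are clamped to [0, channel]. For channel = 64,
// starts = {-32}, ends = {64}, each of the H * W pixels copies the 32
// trailing half values.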
} // namespace operators
} // namespace paddle_mobile
#endif
...@@ -23,49 +23,72 @@ namespace operators { ...@@ -23,49 +23,72 @@ namespace operators {
template <> template <>
bool SoftmaxKernel<FPGA, float>::Init(SoftmaxParam<FPGA> *param) { bool SoftmaxKernel<FPGA, float>::Init(SoftmaxParam<FPGA> *param) {
auto input = const_cast<LoDTensor *>(param->InputX()); auto input = const_cast<LoDTensor *>(param->InputX());
auto input_ptr = input->data<float>(); auto input_ptr = input->data<half>();
auto out = param->Out(); auto out = param->Out();
fpga::format_fp32_ofm(out);
auto float_input = new Tensor; auto float_input = new LoDTensor;
if (input->dims().size() == 2) {
float_input->mutable_data<float>({1, input->dims()[1]}); PADDLE_MOBILE_ENFORCE(input->dims().size() == 4,
} else if (input->dims().size() == 4) { "Softmax should have 4-order input");
float_input->mutable_data<float>({1, input->dims()[2], input->dims()[3], input->dims()[1]}); auto dims = framework::vectorize(input->dims());
auto channel = dims[3];
} else { if (channel == 1) { // This input is generated by FC op, dims = [N C 1 1]
DLOG << "wrong dimension of softmax input"; PADDLE_MOBILE_ENFORCE(dims[2] == 1, "Softmax input must come from FC op");
dims[3] = dims[1];
dims[1] = 1;
}
input->Resize(framework::make_ddim(dims));
float_input->Resize(framework::make_ddim(dims));
if (channel != 2) { // Use CPU
float_input->init(typeid(float));
fpga::format_fp32_ofm(float_input);
fpga::format_fp32_ofm(out);
fpga::BypassArgs args = {fpga::DATA_TYPE_FP16};
args.input_layout_type = fpga::LAYOUT_HWC;
args.output_layout_type = fpga::LAYOUT_CHW;
args.input_data_type = fpga::DATA_TYPE_FP16;
args.output_data_type = fpga::DATA_TYPE_FP32;
args.image.address = input_ptr;
args.image.height = (uint32_t)dims[1];
args.image.width = (uint32_t)dims[2];
args.image.channels = (uint32_t)dims[3];
args.output.address = float_input->data<float>();
args.output.scale_address = float_input->scale;
param->SetFloatInput(float_input);
param->SetFpgaArgs(args);
} else { // Use FPGA
fpga::format_fp16_ofm(out);
fpga::BypassArgs args = {fpga::DATA_TYPE_FP16};
args.input_layout_type = fpga::LAYOUT_HWC;
args.output_layout_type = fpga::LAYOUT_CHW;
args.input_data_type = fpga::DATA_TYPE_FP16;
args.output_data_type = fpga::DATA_TYPE_FP16;
args.image.address = input_ptr;
args.image.height = (uint32_t)input->dims()[1];
args.image.width = (uint32_t)input->dims()[2];
args.image.channels = (uint32_t)input->dims()[3];
args.output.address = out->data<half>();
args.output.scale_address = out->scale;
args.output.activation.activation_type = fpga::SOFTMAX;
param->SetFpgaArgs(args);
} }
fpga::format_fp32_ofm(float_input);
fpga::BypassArgs args = {fpga::DATA_TYPE_FP16};
args.input_layout_type = fpga::LAYOUT_HWC;
args.output_layout_type = fpga::LAYOUT_CHW;
args.input_data_type = fpga::DATA_TYPE_FP16;
args.output_data_type = fpga::DATA_TYPE_FP32;
args.image.address = input_ptr;
args.image.height =
(input->dims().size() == 4) ? (uint32_t)input->dims()[2] : 1;
args.image.width =
(input->dims().size() == 4) ? (uint32_t)input->dims()[3] : 1;
args.image.channels = (uint32_t)input->dims()[1];
args.output.address = float_input->data<float>();
args.output.scale_address = float_input->scale;
param->SetFloatInput(float_input);
param->SetFpgaArgs(args);
return true; return true;
} }
template <> template <>
void SoftmaxKernel<FPGA, float>::Compute(const SoftmaxParam<FPGA> &param) { void SoftmaxKernel<FPGA, float>::Compute(const SoftmaxParam<FPGA> &param) {
Tensor *in_x = param.FloatInput();
Tensor *out = param.Out();
fpga::PerformBypass(param.FpgaArgs()); fpga::PerformBypass(param.FpgaArgs());
fpga::fpga_invalidate((void *)in_x->data<float>(), in_x->numel() * sizeof(float)); // NOLINT
// TODO: In general case, 0 should be squeezed before softmax input // NOLINT if (param.FpgaArgs().output.activation.activation_type != fpga::SOFTMAX) {
math::SoftmaxFuntor<CPU, float>()(in_x, out); Tensor *out = param.Out();
fpga::fpga_flush(out->data<float>(), out->memory_size()); Tensor *in_x = param.FloatInput();
fpga::fpga_invalidate(in_x->data<float>(), in_x->numel() * sizeof(float));
math::SoftmaxFuntor<CPU, float>()(in_x, out);
fpga::fpga_flush(out->data<float>(), out->memory_size());
}
} }
} // namespace operators } // namespace operators
......
...@@ -20,7 +20,7 @@ namespace paddle_mobile { ...@@ -20,7 +20,7 @@ namespace paddle_mobile {
namespace operators { namespace operators {
template <> template <>
bool SplitKernel<FPGA, float>::Init(SplitParam<FPGA> *param) { bool SplitKernel<FPGA, float>::Init(SplitParam<FPGA> *param) {
auto *in = const_cast<Tensor *>(param->InputX()); auto *in = const_cast<LoDTensor *>(param->InputX());
auto outs = param->Outs(); auto outs = param->Outs();
auto sections = param->Sections(); auto sections = param->Sections();
int axis = param->Axis(); int axis = param->Axis();
...@@ -34,22 +34,32 @@ bool SplitKernel<FPGA, float>::Init(SplitParam<FPGA> *param) { ...@@ -34,22 +34,32 @@ bool SplitKernel<FPGA, float>::Init(SplitParam<FPGA> *param) {
fpga::fpga_malloc(image_num * sizeof(float *))); fpga::fpga_malloc(image_num * sizeof(float *)));
auto out_channels = reinterpret_cast<uint32_t *>( auto out_channels = reinterpret_cast<uint32_t *>(
fpga::fpga_malloc(image_num * sizeof(uint32_t))); fpga::fpga_malloc(image_num * sizeof(uint32_t)));
DLOG << "input: " << in;
for (int i = 0; i < image_num; i++) { for (int i = 0; i < image_num; i++) {
fpga::format_fp16_ofm(outs[i]); fpga::format_fp16_ofm(outs[i]);
images_out[i] = outs[i]->mutable_data<float>(); DLOG << "output: " << outs[i];
images_out[i] = outs[i]->mutable_data<half>();
scales_out[i] = outs[i]->scale; scales_out[i] = outs[i]->scale;
out_channels[i] = (uint32_t)sections[i]; out_channels[i] = (uint32_t)sections[i];
} }
auto deleter = [](void *p) { fpga::fpga_free(p); };
fpga::SplitArgs arg = {0}; fpga::SplitArgs arg = {0};
arg.image_num = image_num; arg.image_num = image_num;
arg.image_in = (half *)in->data<float>(); arg.image_in = in->data<half>();
arg.scale_in = in->scale; arg.scale_in = in->scale;
arg.images_out = images_out; arg.images_out = images_out;
arg.scales_out = scales_out; arg.scales_out = scales_out;
arg.out_channel_nums = out_channels; arg.out_channel_nums = out_channels;
arg.height = (uint32_t)in->dims()[2]; arg.height = (uint32_t)in->dims()[2];
arg.width = (uint32_t)in->dims()[3]; arg.width = (uint32_t)in->dims()[3];
arg.vector_split_space.push_back(
std::shared_ptr<char>(reinterpret_cast<char *>(images_out), deleter));
arg.vector_split_space.push_back(
std::shared_ptr<char>(reinterpret_cast<char *>(scales_out), deleter));
arg.vector_split_space.push_back(
std::shared_ptr<char>(reinterpret_cast<char *>(out_channels), deleter));
param->SetFpgaArgs(arg); param->SetFpgaArgs(arg);
return true; return true;
......
...@@ -21,9 +21,11 @@ namespace operators { ...@@ -21,9 +21,11 @@ namespace operators {
template <> template <>
bool TanhKernel<FPGA, float>::Init(TanhParam<FPGA> *param) { bool TanhKernel<FPGA, float>::Init(TanhParam<FPGA> *param) {
auto input = const_cast<Tensor *>(param->InputX()); auto input = const_cast<LoDTensor *>(param->InputX());
auto input_ptr = input->data<float>(); DLOG << "input: " << input;
auto float_input = new Tensor; auto input_ptr = input->data<half>();
auto float_input = new LoDTensor;
float_input->mutable_data<float>( float_input->mutable_data<float>(
{1, input->dims()[1], input->dims()[2], input->dims()[3]}); {1, input->dims()[1], input->dims()[2], input->dims()[3]});
fpga::format_fp32_ofm(float_input); fpga::format_fp32_ofm(float_input);
......
...@@ -20,7 +20,21 @@ namespace operators { ...@@ -20,7 +20,21 @@ namespace operators {
template <> template <>
bool Transpose2Kernel<FPGA, float>::Init(Transpose2Param<FPGA> *param) { bool Transpose2Kernel<FPGA, float>::Init(Transpose2Param<FPGA> *param) {
param->Out()->ShareDataWith(*param->InputX()); auto input = param->InputX();
auto output = param->Out();
auto axis = param->Axis();
auto dim = input->dims();
output->ShareDataWith(*input);
auto dim_v = vectorize(dim);
for (int i = 0; i < axis.size(); i++) {
dim_v[i] = dim[axis[i]];
}
output->Resize(framework::make_ddim(dim_v));
DLOG << "input: " << input;
DLOG << "output: " << output;
return true; return true;
} }
......
...@@ -1053,7 +1053,7 @@ class SoftmaxParam : public OpParam { ...@@ -1053,7 +1053,7 @@ class SoftmaxParam : public OpParam {
GType *FloatInput() const { GType *FloatInput() const {
return float_input_x_ == nullptr ? input_x_ : float_input_x_.get(); return float_input_x_ == nullptr ? input_x_ : float_input_x_.get();
} }
void SetFloatInput(Tensor *input) { float_input_x_.reset(input); } void SetFloatInput(LoDTensor *input) { float_input_x_.reset(input); }
const fpga::BypassArgs &FpgaArgs() const { return fpga_bypass_args; } const fpga::BypassArgs &FpgaArgs() const { return fpga_bypass_args; }
void SetFpgaArgs(const fpga::BypassArgs &args) { fpga_bypass_args = args; } void SetFpgaArgs(const fpga::BypassArgs &args) { fpga_bypass_args = args; }
#endif #endif
...@@ -1212,18 +1212,8 @@ class FetchParam : public OpParam { ...@@ -1212,18 +1212,8 @@ class FetchParam : public OpParam {
framework::LoDTensorArray *out_; framework::LoDTensorArray *out_;
int col_; int col_;
#ifdef PADDLE_MOBILE_FPGA #ifdef PADDLE_MOBILE_FPGA
private:
std::shared_ptr<GType> float_input_x_;
fpga::BypassArgs fpga_bypass_args;
public: public:
GType *FloatInput() const { fpga::BypassArgs fpga_bypass_args;
return float_input_x_ == nullptr ? input_x_ : float_input_x_.get();
}
void SetFloatInput(Tensor *input) { float_input_x_.reset(input); }
const fpga::BypassArgs &FpgaArgs() const { return fpga_bypass_args; }
void SetFpgaArgs(const fpga::BypassArgs &args) { fpga_bypass_args = args; }
#endif #endif
}; };
...@@ -1660,7 +1650,7 @@ class TanhParam : public OpParam { ...@@ -1660,7 +1650,7 @@ class TanhParam : public OpParam {
GType *FloatInput() const { GType *FloatInput() const {
return float_input_x_ == nullptr ? input_x_ : float_input_x_.get(); return float_input_x_ == nullptr ? input_x_ : float_input_x_.get();
} }
void SetFloatInput(Tensor *input) { float_input_x_.reset(input); } void SetFloatInput(LoDTensor *input) { float_input_x_.reset(input); }
const fpga::BypassArgs &FpgaArgs() const { return fpga_bypass_args; } const fpga::BypassArgs &FpgaArgs() const { return fpga_bypass_args; }
void SetFpgaArgs(const fpga::BypassArgs &args) { fpga_bypass_args = args; } void SetFpgaArgs(const fpga::BypassArgs &args) { fpga_bypass_args = args; }
#endif #endif
......
...@@ -43,5 +43,8 @@ REGISTER_OPERATOR_CPU(reshape2, ops::Reshape2Op); ...@@ -43,5 +43,8 @@ REGISTER_OPERATOR_CPU(reshape2, ops::Reshape2Op);
#ifdef PADDLE_MOBILE_MALI_GPU #ifdef PADDLE_MOBILE_MALI_GPU
REGISTER_OPERATOR_MALI_GPU(reshape2, ops::Reshape2Op); REGISTER_OPERATOR_MALI_GPU(reshape2, ops::Reshape2Op);
#endif #endif
#ifdef PADDLE_MOBILE_FPGA
REGISTER_OPERATOR_FPGA(reshape2, ops::Reshape2Op);
#endif
#endif #endif
...@@ -74,6 +74,9 @@ if (CON GREATER -1) ...@@ -74,6 +74,9 @@ if (CON GREATER -1)
ADD_EXECUTABLE(test-densebox fpga/test_densebox_combine.cpp test_helper.h test_include.h executor_for_test.h) ADD_EXECUTABLE(test-densebox fpga/test_densebox_combine.cpp test_helper.h test_include.h executor_for_test.h)
target_link_libraries(test-densebox paddle-mobile) target_link_libraries(test-densebox paddle-mobile)
ADD_EXECUTABLE(test-rfcn fpga/test_rfcn.cpp test_helper.h test_include.h executor_for_test.h)
target_link_libraries(test-rfcn paddle-mobile)
set(FOUND_MATCH ON) set(FOUND_MATCH ON)
endif () endif ()
......
...@@ -51,8 +51,8 @@ void convert_to_chw(int16_t **data_in, int channel, int height, int width, ...@@ -51,8 +51,8 @@ void convert_to_chw(int16_t **data_in, int channel, int height, int width,
} }
} }
void dump(std::string filename, const Tensor input_tensor) { void dump(std::string filename, Tensor input_tensor) {
auto dataptr = input_tensor.data<float>(); auto dataptr = reinterpret_cast<half *>(input_tensor.get_data());
std::ofstream out(filename.c_str()); std::ofstream out(filename.c_str());
float result = 0; float result = 0;
for (int i = 0; i < input_tensor.numel(); ++i) { for (int i = 0; i < input_tensor.numel(); ++i) {
...@@ -61,16 +61,16 @@ void dump(std::string filename, const Tensor input_tensor) { ...@@ -61,16 +61,16 @@ void dump(std::string filename, const Tensor input_tensor) {
} }
out.close(); out.close();
} }
void dump_stride(std::string filename, const Tensor input_tensor, void dump_stride_half(std::string filename, Tensor input_tensor,
const int dumpnum) { const int dumpnum) {
int c = (input_tensor.dims())[1]; int c = (input_tensor.dims())[1];
int h = (input_tensor.dims())[2]; int h = (input_tensor.dims())[2];
int w = (input_tensor.dims())[3]; int w = (input_tensor.dims())[3];
auto data_ptr = input_tensor.data<float>(); auto data_ptr = input_tensor.get_data();
int16_t *data_tmp = (int16_t *)malloc(c * h * w * sizeof(int16_t)); auto *data_tmp = reinterpret_cast<half *>(malloc(c * h * w * sizeof(int16_t)));
int16_t *data_ptr_16 = (int16_t *)data_ptr; auto *data_ptr_16 = reinterpret_cast<half *>(data_ptr);
convert_to_chw(&data_ptr_16, c, h, w, data_tmp); convert_to_chw(&data_ptr_16, c, h, w, data_tmp);
// const int16_t *dataptr = input_tensor.data<int16_t>();
std::ofstream out(filename.c_str()); std::ofstream out(filename.c_str());
float result = 0; float result = 0;
int stride = input_tensor.numel() / dumpnum; int stride = input_tensor.numel() / dumpnum;
...@@ -82,6 +82,20 @@ void dump_stride(std::string filename, const Tensor input_tensor, ...@@ -82,6 +82,20 @@ void dump_stride(std::string filename, const Tensor input_tensor,
out.close(); out.close();
free(data_tmp); free(data_tmp);
} }
void dump_stride_float(std::string filename, Tensor input_tensor,
const int dumpnum) {
auto data_ptr = reinterpret_cast<float *>(input_tensor.get_data());
std::ofstream out(filename.c_str());
float result = 0;
int stride = input_tensor.numel() / dumpnum;
stride = stride > 0 ? stride : 1;
for (int i = 0; i < input_tensor.numel(); i += stride) {
result = data_ptr[i];
out << result << std::endl;
}
out.close();
}
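// Both dump_stride_* helpers subsample the tensor at numel() / dumpnum evenly
// spaced offsets (stride clamped to at least 1), so dumpnum = 20 writes
// roughly 20 representative values per tensor.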
static const char *g_resnet50 = "../models/resnet50"; static const char *g_resnet50 = "../models/resnet50";
const std::string g_image_src_float = "../images/image_src_float"; const std::string g_image_src_float = "../images/image_src_float";
int main() { int main() {
...@@ -98,24 +112,21 @@ int main() { ...@@ -98,24 +112,21 @@ int main() {
for (int i = 0; i < 73; i++) { for (int i = 0; i < 73; i++) {
auto tensor_ptr = paddle_mobile.FetchResult(i); auto tensor_ptr = paddle_mobile.FetchResult(i);
std::string saveName = "resnet50_result_" + std::to_string(i); std::string saveName = "resnet50_result_" + std::to_string(i);
paddle_mobile::fpga::fpga_invalidate((*tensor_ptr).data<float>(), paddle_mobile::fpga::fpga_invalidate((*tensor_ptr).get_data(),
tensor_ptr->numel() * sizeof(half)); tensor_ptr->numel() * sizeof(half));
dump_stride(saveName, (*tensor_ptr), 20); dump_stride_half(saveName, (*tensor_ptr), 20);
// dump(saveName, (*tensor_ptr)); // dump(saveName, (*tensor_ptr));
} }
std::shared_ptr<Tensor> output_tensor = paddle_mobile.FetchResult(73); auto tensor_ptr = paddle_mobile.FetchResult(73);
//(*output_tensor).dump<float>("resnet50_result_73"); dump_stride_float("resnet50_result_73", (*tensor_ptr), 20);
output_tensor = paddle_mobile.FetchResult(74); tensor_ptr = paddle_mobile.FetchResult(74);
//(*output_tensor).dump<float>("resnet50_result_74"); dump_stride_float("resnet50_result_74", (*tensor_ptr), 9999);
// std::shared_ptr<Tensor> output_tensor = paddle_mobile.FetchResult(74);
// output_tensor = paddle_mobile.FetchResult(74);
float max = 0; float max = 0;
auto data_ptr = output_tensor->data<float>(); auto data_ptr = tensor_ptr->data<float>();
int maximumIdx = 0; int maximumIdx = 0;
for (int i = 0; i < (*output_tensor).numel(); i++) { for (int i = 0; i < (*tensor_ptr).numel(); i++) {
if (data_ptr[i] > max) { if (data_ptr[i] > max) {
maximumIdx = i; maximumIdx = i;
max = data_ptr[i]; max = data_ptr[i];
......
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <iostream>
#include "../test_helper.h"
#include "../test_include.h"
#ifdef PADDLE_MOBILE_FPGA_V1
#include "fpga/V1/api.h"
#endif
#ifdef PADDLE_MOBILE_FPGA_V2
#include "fpga/V2/api.h"
#endif
void readStream(std::string filename, uint8_t *buf) {
std::ifstream in;
in.open(filename, std::ios::in);
if (!in.is_open()) {
std::cout << "open File Failed." << std::endl;
return;
}
int i = 0;
while (in >> buf[i]) {
i++;
}
in.close();
}
static const char *g_rfcn_combine = "../models/rfcn";
static const char *g_image_src_float = "../models/rfcn/data.bin";
int main() {
paddle_mobile::fpga::open_device();
paddle_mobile::PaddleMobile<paddle_mobile::FPGA> paddle_mobile;
if (paddle_mobile.Load(std::string(g_rfcn_combine) + "/model",
std::string(g_rfcn_combine) + "/params", true, false,
1, true)) {
float img_info[3] = {768, 1536, 768.0f / 960.0f};
auto img = fpga::fpga_malloc(768 * 1536 * 3 * sizeof(float));
readStream(g_image_src_float, reinterpret_cast<uint8_t *>(img));
std::vector<void *> v(3, nullptr);
paddle_mobile.FeedData({img_info, img});
paddle_mobile.Predict_To(-1);
paddle_mobile.GetResults(&v);
DLOG << "Computation done";
fpga::fpga_free(img);
}
return 0;
}
...@@ -126,6 +126,11 @@ if (CON GREATER -1) ...@@ -126,6 +126,11 @@ if (CON GREATER -1)
set(RESHAPE_OP ON) set(RESHAPE_OP ON)
set(FUSION_CONVADDBNRELU_OP ON) set(FUSION_CONVADDBNRELU_OP ON)
set(FUSION_CONVADDBN_OP ON) set(FUSION_CONVADDBN_OP ON)
set(RESHAPE2_OP ON)
set(PSROI_POOL_OP ON)
set(PROPOSAL_OP ON)
set(ANCHOR_GENERATOR_OP ON)
set(SLICE_OP ON)
set(FOUND_MATCH ON) set(FOUND_MATCH ON)
endif() endif()
......