Unverified commit 4263a4dd, authored by qnqinan, committed by GitHub

Merge pull request #1437 from zhangyang0701/develop

Add support for RFCN on the FPGA track; closes #1432
......@@ -28,13 +28,22 @@ void format_image(framework::Tensor *image_tensor) {
auto dims = image_tensor->dims();
auto channel = dims[1], height = dims[2], width = dims[3];
auto data_ptr = image_tensor->data<float>();
size_t memory_size = channel * height * width * sizeof(float);
auto new_data = (float *)fpga_malloc(memory_size); // NOLINT
fpga_copy(new_data, data_ptr, memory_size);
image::format_image(&new_data, channel, height, width);
image_tensor->reset_data_ptr(new_data);
auto external_ptr = reinterpret_cast<float *>(image_tensor->external_data);
float *p_data = external_ptr == nullptr ? data_ptr : external_ptr;
float *old_p = p_data;
image::format_image(&p_data, channel, height, width);
if (old_p != p_data) {
image_tensor->reset_data_ptr(p_data);
}
}
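The reworked format_image prefers a caller-supplied buffer over the tensor's own allocation. A minimal sketch of the feed path this enables (caller names are assumptions; FeedData and external_data are introduced later in this PR):

// Hypothetical caller; executor and user_input_ptr are assumed names.
executor.FeedData({user_input_ptr});  // stores the raw pointer in external_data
// ... later, inside FeedKernel::Compute:
fpga::format_image(input_tensor);     // formats the external buffer, and only
                                      // swaps the tensor's data pointer when
                                      // image::format_image had to reallocate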
void format_ofm(framework::Tensor *ofm_tensor) {
if (ofm_tensor->type() == typeid(float)) {
format_fp32_ofm(ofm_tensor);
} else {
format_fp16_ofm(ofm_tensor);
}
}
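format_ofm dispatches on the tensor's recorded element type, which is why the new set_type calls below matter. A sketch of the intended contract, assuming Tensor::data<T>() validates the stored std::type_index the way framework/tensor.h does:

format_fp16_ofm(ofm_tensor);           // allocates and tags the buffer as half
auto *p16 = ofm_tensor->data<half>();  // OK: matches the recorded type
// ofm_tensor->data<float>();          // would now trip the type check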
void format_fp16_ofm(framework::Tensor *ofm_tensor) {
auto dims = ofm_tensor->dims();
size_t memory_size = 0;
......@@ -50,6 +59,7 @@ void format_fp16_ofm(framework::Tensor *ofm_tensor) {
auto p = fpga_malloc(memory_size);
memset(p, 0, memory_size);
ofm_tensor->reset_data_ptr(p);
ofm_tensor->set_type(typeid(half));
}
void format_fp16_ofm(framework::Tensor *ofm_tensor, framework::DDim dims) {
......@@ -67,6 +77,7 @@ void format_fp16_ofm(framework::Tensor *ofm_tensor, framework::DDim dims) {
auto p = fpga_malloc(memory_size);
memset(p, 0, memory_size);
ofm_tensor->reset_data_ptr(p);
ofm_tensor->set_type(typeid(half));
}
void format_fp32_ofm(framework::Tensor *ofm_tensor) {
auto dims = ofm_tensor->dims();
......@@ -83,6 +94,7 @@ void format_fp32_ofm(framework::Tensor *ofm_tensor) {
auto p = fpga_malloc(memory_size);
memset(p, 0, memory_size);
ofm_tensor->reset_data_ptr(p);
ofm_tensor->set_type(typeid(float));
}
float filter_find_max(framework::Tensor *filter_tensor) {
......@@ -139,6 +151,7 @@ void format_filter(framework::Tensor *filter_tensor, float max_value,
filter::format_filter(&new_data, num, channel, height, width, group_num,
max_value);
filter_tensor->reset_data_ptr(new_data);
filter_tensor->set_type(typeid(int8_t));
}
void format_dwconv_filter(framework::Tensor *filter_tensor, float *scale_ptr) {
auto dims = filter_tensor->dims();
......@@ -149,6 +162,7 @@ void format_dwconv_filter(framework::Tensor *filter_tensor, float *scale_ptr) {
fpga_copy(new_data, data_ptr, memory_size);
filter::format_dwconv_filter(&new_data, num, height, width, scale_ptr);
filter_tensor->reset_data_ptr(new_data);
filter_tensor->set_type(typeid(int8_t));
}
void format_DWDconv_filter(framework::Tensor *filter_tensor, float *scale_ptr,
......@@ -173,6 +187,7 @@ void format_DWDconv_filter(framework::Tensor *filter_tensor, float *scale_ptr,
// framework::make_ddim({num, 1, height, width});
// filter_tensor->Resize(dims_new);
filter_tensor->reset_data_ptr(new_data);
filter_tensor->set_type(typeid(int8_t));
}
void format_fc_filter(framework::Tensor *filter_tensor, float max_value) {
......@@ -187,6 +202,7 @@ void format_fc_filter(framework::Tensor *filter_tensor, float max_value) {
filter::format_fc_filter(&new_data, num, channel, height, width, 1,
max_value);
filter_tensor->reset_data_ptr(new_data);
filter_tensor->set_type(typeid(int8_t));
}
void format_deconv_filter(framework::Tensor *filter_tensor, float max_value,
int group_num, int stride) {
......@@ -213,6 +229,7 @@ void format_deconv_filter(framework::Tensor *filter_tensor, float max_value,
framework::make_ddim({num, channel, height, width});
filter_tensor->Resize(dims_new);
filter_tensor->reset_data_ptr(new_data);
filter_tensor->set_type(typeid(int8_t));
}
void format_bias_scale_array(float **bias_scale_array,
......@@ -236,6 +253,7 @@ void format_concat_output(framework::Tensor *out, int height, int width,
auto ddim = framework::make_ddim({1, sum_channel, height, width});
out->Resize(ddim);
out->reset_data_ptr(data_ptr);
out->set_type(typeid(half));
}
void format_conv_data(framework::Tensor *filter_tensor,
framework::Tensor *ofm_tensor, float **bs_ptr,
......@@ -447,9 +465,9 @@ void fill_split_arg(struct SplitConvArgs *arg, framework::Tensor *input,
int16_t leaky_relu_negative_slope, int group_num,
int stride_h, int stride_w, int padding_h, int padding_w,
float *bs_ptr) {
auto input_ptr = input->data<float>();
auto filter_ptr = filter->data<float>();
auto out_ptr = out->data<float>();
auto input_ptr = input->data<half>();
auto filter_ptr = filter->data<int8_t>();
auto out_ptr = out->data<half>();
auto deleter = [](void *p) { fpga_free(p); };
arg->group_num = (uint32_t)group_num;
......@@ -571,8 +589,8 @@ void fill_deconv_arg(struct DeconvArgs *arg, framework::Tensor *input,
int16_t leaky_relu_negative_slope, int group_num,
int stride_h, int stride_w, int padding_h, int padding_w,
float *bs_ptr) {
auto input_ptr = input->data<float>();
auto filter_ptr = filter->data<float>();
auto input_ptr = input->data<half>();
auto filter_ptr = filter->data<int8_t>();
auto deleter = [](void *p) { fpga_free(p); };
arg->group_num = (uint32_t)group_num;
......@@ -603,7 +621,7 @@ void fill_deconv_arg(struct DeconvArgs *arg, framework::Tensor *input,
framework::DDim dims_out_new = framework::make_ddim(
{1, arg->filter_num, sub_output_height * sub_conv_num, real_out_width});
fpga::format_fp16_ofm(out, dims_out_new);
auto out_ptr = out->data<float>();
auto out_ptr = out->data<half>();
arg->output.address =
(half *)out_ptr + // NOLINT
omit_size * sizeof(half) *
......@@ -793,7 +811,7 @@ void fill_deconv_arg(struct DeconvArgs *arg, framework::Tensor *input,
arg->split_conv_args[i]->conv_arg[j].output.scale_address),
deleter));
}
arg->split_conv_args[i]->concat_arg.images_in[j] = static_cast<int16_t *>(
arg->split_conv_args[i]->concat_arg.images_in[j] = static_cast<half *>(
arg->split_conv_args[i]->conv_arg[j].output.address);
arg->split_conv_args[i]->concat_arg.scales_in[j] =
arg->split_conv_args[i]->conv_arg[j].output.scale_address;
......@@ -818,9 +836,9 @@ void fill_dwconv_arg(struct DWconvArgs *arg, framework::Tensor *input,
int16_t leaky_relu_negative_slope, int stride_h,
int stride_w, int padding_h, int padding_w,
float *bias_ptr) {
auto filter_ptr = filter->data<float>();
auto input_ptr = input->data<float>();
auto output_ptr = out->mutable_data<float>();
auto filter_ptr = filter->data<uint8_t>();
auto input_ptr = input->data<half>();
auto output_ptr = out->mutable_data<half>();
arg->sub_conv_num = 1;
// arg->relu_enabled = relu_enabled;
arg->output.activation.activation_type = activation_enable;
......@@ -848,9 +866,8 @@ void fill_DWDeconv_arg(struct DWDeconvArgs *arg, framework::Tensor *input,
int16_t leaky_relu_negative_slope, int stride_h,
int stride_w, int padding_h, int padding_w,
float *bias_ptr) {
auto filter_ptr = filter->data<float>();
auto input_ptr = input->data<float>();
auto output_ptr = out->mutable_data<float>();
auto filter_ptr = filter->data<int8_t>();
auto input_ptr = input->data<half>();
auto deleter = [](void *p) { fpga_free(p); };
......@@ -885,7 +902,7 @@ void fill_DWDeconv_arg(struct DWDeconvArgs *arg, framework::Tensor *input,
framework::DDim dims_out_new = framework::make_ddim(
{1, arg->filter_num, real_out_height, real_out_width});
fpga::format_fp16_ofm(out, dims_out_new);
auto out_ptr = out->data<float>();
auto out_ptr = out->data<half>();
/*====For Addition
arg->output.address =
......
......@@ -23,6 +23,7 @@ namespace paddle_mobile {
namespace fpga {
void format_image(framework::Tensor* image_tensor);
void format_ofm(framework::Tensor* ofm_tensor);
void format_fp16_ofm(framework::Tensor* ofm_tensor); // only allocate memory
void format_fp16_ofm(framework::Tensor* ofm_tensor, framework::DDim dims);
void format_fp32_ofm(framework::Tensor* ofm_tensor);
......
......@@ -22,7 +22,6 @@ namespace fpga {
namespace image {
void convert_to_hwc(float **data_in, int channel, int height, int width) {
float *tmp = *data_in;
float *data_tmp =
(float *)fpga_malloc(channel * height * width * sizeof(float)); // NOLINT
int64_t amount_per_row = width * channel;
......@@ -35,33 +34,35 @@ void convert_to_hwc(float **data_in, int channel, int height, int width) {
}
}
*data_in = data_tmp;
fpga_free(tmp);
}
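For reference, convert_to_hwc moves element (c, h, w) from CHW offset c*H*W + h*W + w to HWC offset h*W*C + w*C + c. A hand-checked example:

// C = 2, H = 1, W = 3 (amount_per_row = 6):
// CHW input : [c0w0, c0w1, c0w2, c1w0, c1w1, c1w2]
// HWC output: [c0w0, c1w0, c0w1, c1w1, c0w2, c1w2]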
void align_element_conv(float **data_in, int height, int cw) {
int h = 0;
int align_cw = align_to_x(cw, IMAGE_ALIGNMENT);
if (align_cw != cw) {
float *tmp = *data_in;
float *data_tmp =
(float *)fpga_malloc(height * align_cw * sizeof(float)); // NOLINT
memset(data_tmp, 0, height * align_cw * sizeof(float));
float *data_tmp =
(float *)fpga_malloc(height * align_cw * sizeof(float)); // NOLINT
for (h = 0; h < height; h++) {
memcpy((void *)(data_tmp + h * align_cw), // NOLINT
(void *)(*data_in + h * cw), // NOLINT
cw * sizeof(float));
}
memset(data_tmp, 0, height * align_cw * sizeof(float));
*data_in = data_tmp;
fpga_free(tmp);
for (h = 0; h < height; h++) {
memcpy((void *)(data_tmp + h * align_cw), // NOLINT
(void *)(*data_in + h * cw), // NOLINT
cw * sizeof(float));
}
*data_in = data_tmp;
}
void format_image(float **data_in, int channel, int height, int width) {
convert_to_hwc(data_in, channel, height, width);
align_element_conv(data_in, height, channel * width);
int cw = channel * width;
int align_cw = align_to_x(cw, IMAGE_ALIGNMENT);
if (align_cw != cw) {
float *hwc_temp = *data_in;
align_element_conv(data_in, height, channel * width);
fpga_free(hwc_temp);
}
fpga_flush(*data_in, align_to_x(channel * width, IMAGE_ALIGNMENT) * height *
sizeof(float));
}
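align_element_conv pads each row of W*C floats up to the next multiple of IMAGE_ALIGNMENT. A minimal sketch of the alignment helper assumed throughout (the real one lives in the fpga common headers):

static inline int align_to_x(int n, int x) { return (n + x - 1) / x * x; }
// e.g. align_to_x(37, 16) == 48: a 37-float row occupies 48 slots, and the
// 11 trailing slots stay zero thanks to the memset before the row copies.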
......
......@@ -290,14 +290,11 @@ int ComputeBasicConv(const struct ConvArgs &args) {
reg_writeq(args.driver.deconv_param, 0xd18);
reg_writeq(args.driver.fpga_bias_scale_len / 4, 0xd20);
reg_writeq(args.driver.cmd, REG_CONV_CMD);
DLOG << "before reg poll";
if (0 != fpga_regpoll(REG_INTERRUPT, INTERRUPT_CONV, PE_IRQ_TIMEOUT)) {
g_fpgainfo.pe_data->pes[PE_IDX_CONV]->status = ERROR;
ret = -EIO;
DLOG << "Conv Wait Irq Timeout!";
}
DLOG << "after reg poll";
output_scale = reg_readq(REG_SCALE_PARAMETER);
output_scale = (output_scale << 32) | (output_scale >> 32);
fpga_copy(args.output.scale_address, &output_scale, sizeof(float) * 2);
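REG_SCALE_PARAMETER packs two fp32 words into one 64-bit read, and the rotate by 32 swaps them into {scale[0], scale[1]} order before the 8-byte copy (scale[0] = MAX/127.0, scale[1] = 127.0/MAX, per the Tensor::scale comment later in this diff). A worked bit pattern, assuming word A sits in the high half and word B in the low half:

// raw           = [A | B]   (64 bits)
// (raw << 32)   = [B | 0]
// (raw >> 32)   = [0 | A]
// or'd together = [B | A]  -> copied as two consecutive fp32 values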
......
......@@ -164,7 +164,7 @@ void fpga_free(void *ptr) {
// DLOG << "Address: " << ptr << ", " << size << " bytes freed. Total "
// << counter << " bytes";
} else {
DLOG << "Invalid pointer";
DLOG << "Address: " << ptr << " Invalid pointer";
}
}
void fpga_copy(void *dest, const void *src, size_t num) {
......
......@@ -19,17 +19,16 @@ limitations under the License. */
#include <memory>
#include <vector>
namespace paddle_mobile {
namespace fpga {
#ifdef PADDLE_MOBILE_FPGA_V1
#define IMAGE_ALIGNMENT 16 // Aligned to 16
#define FILTER_NUM_ALIGNMENT 32 // Filter number aligned to 32
#define FILTER_ELEMENT_ALIGNMENT 16 // Filter element number aligned to 16
#define BS_NUM_ALIGNMENT 8
#define BIAS_NUM_ALIGNMENT 16
#define IMAGE_ALIGNMENT (16) // Aligned to 16
#define FILTER_NUM_ALIGNMENT (32) // Filter number aligned to 32
#define FILTER_ELEMENT_ALIGNMENT (16) // Filter element number aligned to 16
#define BS_NUM_ALIGNMENT (8)
#define BIAS_NUM_ALIGNMENT (16)
#endif
namespace paddle_mobile {
namespace fpga {
enum DataType {
DATA_TYPE_FP32 = 1,
DATA_TYPE_FP16 = 0,
......@@ -49,7 +48,7 @@ enum ActivationType {
};
struct ActivationArgs {
enum ActivationType activation_type;
enum ActivationType activation_type = NONE;
int16_t leaky_relu_negative_slope;
};
......
......@@ -84,6 +84,11 @@ Executor<Device, T>::Executor(const Program<Device> &program,
InitMemory();
}
#ifdef PADDLE_MOBILE_FPGA
program_.scope->EraseVars({"feed", "fetch"});
program_.scope->print_vars();
#endif
int count = 0;
for (int block_id = 0; block_id < ops_of_block_.size(); ++block_id) {
for (auto &op_handler : ops_of_block_[block_id]) {
......@@ -447,14 +452,41 @@ template <typename Device, typename T>
void Executor<Device, T>::InjectVariable(const Tensor &t,
std::string var_name) {
Variable *g_feed_value = program_.scope->Var(var_name);
Tensor *feed_tensor = g_feed_value->GetMutable<LoDTensor>();
Tensor *feed_tensor = g_feed_value->template GetMutable<LoDTensor>();
feed_tensor->Resize(t.dims());
feed_tensor->ShareDataWith(t);
}
template <typename Device, typename T>
void Executor<Device, T>::FeedData(const Tensor &t) {
InjectVariable(t, "feed");
InjectVariable(t, "feed0");
}
template <typename Device, typename T>
void Executor<Device, T>::FeedData(const std::vector<void *> &v) {
auto input_size = v.size();
auto vars = program_.scope->VarContain("feed");
PADDLE_MOBILE_ENFORCE(input_size == vars.size(),
"input data number not correct");
for (int i = 0; i < input_size; i++) {
auto var = program_.scope->Var("feed", i);
auto feed_tensor = var->template GetMutable<LoDTensor>();
feed_tensor->external_data = v[i];
}
}
template <typename Device, typename T>
void Executor<Device, T>::GetResults(std::vector<void *> *v) {
auto output_size = v->size();
PADDLE_MOBILE_ENFORCE(output_size > 0, "Empty output");
auto vars = program_.scope->VarContain("fetch");
PADDLE_MOBILE_ENFORCE(output_size == vars.size(),
"output data number not correct");
for (int i = 0; i < output_size; i++) {
auto var = program_.scope->Var("fetch", i);
auto fetch_tensor = var->template GetMutable<LoDTensor>();
(*v)[i] = fetch_tensor->template data<float>();
}
}
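Taken together, the new FPGA entry points let a caller drive a multi-input, multi-output graph directly. A hypothetical usage sketch (buffer names are assumptions):

std::vector<void *> inputs = {img_a_ptr, img_b_ptr};  // one per feedN variable
executor.FeedData(inputs);         // stashes each pointer in external_data
executor.Predict_From_To(0, -1);   // run every op
std::vector<void *> outputs(2);    // one slot per fetchN variable
executor.GetResults(&outputs);     // filled with fp32 result pointers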
template <typename Device, typename T>
......
......@@ -52,6 +52,8 @@ class Executor {
#ifdef PADDLE_MOBILE_FPGA
void InjectVariable(const Tensor &t, std::string var_name);
void FeedData(const Tensor &t);
void FeedData(const std::vector<void *> &v);
void GetResults(std::vector<void *> *v);
std::shared_ptr<Tensor> FetchResult(int id = -1);
void Predict_From_To(int start = 0, int end = -1);
void Predict_From(int start);
......
......@@ -50,6 +50,9 @@ OperatorBase<Dtype>::OperatorBase(const std::string &type,
attrs_(attrs),
scope_(scope) {
CheckAllInputOutputSet();
#ifdef PADDLE_MOBILE_FPGA
InsertTensors();
#endif
}
template <typename Dtype>
......@@ -131,6 +134,25 @@ void OperatorBase<GPU_CL>::Run() {
}
#endif
#ifdef PADDLE_MOBILE_FPGA
template <typename Dtype>
void OperatorBase<Dtype>::InsertTensors() {
static int feed_num = 0;
static int fetch_num = 0;
if (type_ == "feed") {
auto new_name = string("feed") + std::to_string(feed_num++);
auto var = scope_->Var(new_name);
var->template GetMutable<framework::LoDTensor>();
inputs_.at("X") = {string(new_name)};
} else if (type_ == "fetch") {
auto new_name = string("fetch") + std::to_string(fetch_num++);
auto var = scope_->Var(new_name);
var->template GetMutable<framework::LoDTensor>();
outputs_.at("Out") = {string(new_name)};
}
}
#endif
template class OperatorBase<CPU>;
template class OperatorBase<FPGA>;
template class OperatorBase<GPU_MALI>;
......
......@@ -78,7 +78,9 @@ class OperatorBase {
this->scope_->EraseVars(var_names);
}
}
#ifdef PADDLE_MOBILE_FPGA
void InsertTensors();
#endif
protected:
std::shared_ptr<Scope> scope_;
std::string type_;
......@@ -102,7 +104,6 @@ class OperatorWithKernel : public OperatorBase<Dtype> {
kernel_.InitCLHelper(scope->GetCLScpoe());
#endif
}
virtual void RunImpl() { this->kernel_.Compute(this->param_); }
virtual void InferShape() const = 0;
......
......@@ -53,7 +53,8 @@ void ProgramDesc::Description(std::string header) {
}
}
for (auto &attr : op->GetAttrMap()) {
LOG(kLOG_DEBUG2) << "attr name:: " << attr.first;
if (attr.first == "op_callstack") continue;
LOG(kLOG_DEBUG2) << "attr name: " << attr.first;
LOG(kLOG_DEBUG3) << "argument - " << attr.second;
}
}
......
......@@ -111,5 +111,29 @@ Variable *Scope::FindVarLocally(const std::string &name) const {
return nullptr;
}
#ifdef PADDLE_MOBILE_FPGA
Variable *Scope::Var(const std::string &name, const int id) {
return Var(name + std::to_string(id));
}
std::vector<Variable *> Scope::VarContain(const std::string substring) {
std::vector<Variable *> v;
for (auto pair : vars_) {
if (pair.first.find(substring) == 0) {  // prefix match: "feed" matches feed0, feed1, ...
v.push_back(pair.second);
}
}
return v;
}
void Scope::print_vars() {
DLOG << "====================start to print variables=================";
for (auto pair : vars_) {
DLOG << pair.first;
}
DLOG << "==================complete printing variables================";
}
#endif
} // namespace framework
} // namespace paddle_mobile
......@@ -83,6 +83,12 @@ class Scope {
Variable *FindVarLocally(const std::string &name) const;
#ifdef PADDLE_MOBILE_FPGA
Variable *Var(const std::string &name, const int id);
std::vector<Variable *> VarContain(const std::string substring);
void print_vars();
#endif
#ifdef PADDLE_MOBILE_CL
CLScope *GetCLScpoe() { return cl_scope_; }
#endif
......
......@@ -202,6 +202,10 @@ class Tensor : public TensorBase {
inline void reset_data_ptr(void *p) {
((PlaceholderImpl *)(holder_.get()))->ptr_.reset((uint8_t *)p); // NOLINT
}
inline void set_type(std::type_index type) { holder_->set_type(type); }
inline void *get_data() {
  return (void *)(((PlaceholderImpl *)(holder_.get()))->ptr_.get());  // NOLINT
}
inline void *init(std::type_index type) {
if (holder_ != nullptr) {
......@@ -217,7 +221,8 @@ class Tensor : public TensorBase {
reinterpret_cast<uintptr_t>(holder_->ptr()) + offset_);
}
float scale[2]; // scale[0]= MAX/127.0, scale[1]= 127.0/MAX
float scale[2]; // scale[0]= MAX/127.0, scale[1]= 127.0/MAX
void *external_data = nullptr; // only used for Feed
#endif
};
......
......@@ -177,6 +177,23 @@ bool PaddleMobilePredictor<Device, T>::Run(
return true;
}
template <typename Device, typename T>
void PaddleMobilePredictor<Device, T>::FeedData(
const std::vector<void *> &inputs) {
paddle_mobile_->FeedData(inputs);
}
template <typename Device, typename T>
void PaddleMobilePredictor<Device, T>::GetResults(
std::vector<void *> *outputs) {
paddle_mobile_->GetResults(outputs);
}
template <typename Device, typename T>
void PaddleMobilePredictor<Device, T>::Predict_From_To(int start, int end) {
paddle_mobile_->Predict_From_To(start, end);
}
#endif
template <typename Device, typename T>
PaddleMobilePredictor<Device, T>::~PaddleMobilePredictor() {
......
......@@ -35,6 +35,9 @@ class PaddleMobilePredictor : public PaddlePredictor {
bool Run(const std::vector<PaddleTensor>& inputs,
std::vector<PaddleTensor>* output_data, std::vector<int>* index_data,
int batch_size = -1) override;
void FeedData(const std::vector<void*>& inputs) override;
void GetResults(std::vector<void*>* outputs) override;
void Predict_From_To(int start = 0, int end = -1) override;
#endif
~PaddleMobilePredictor() override;
......
......@@ -119,6 +119,9 @@ class PaddlePredictor {
virtual bool Run(const std::vector<PaddleTensor>& inputs,
std::vector<PaddleTensor>* output_data,
std::vector<int>* index_data, int batch_size = -1) = 0;
virtual void FeedData(const std::vector<void*>& inputs) = 0;
virtual void GetResults(std::vector<void*>* outputs) = 0;
virtual void Predict_From_To(int start = 0, int end = -1) = 0;
#endif
protected:
......
......@@ -227,6 +227,14 @@ template <typename Device, typename T>
void PaddleMobile<Device, T>::FeedData(const framework::Tensor &t) {
executor_->FeedData(t);
}
template <typename Device, typename T>
void PaddleMobile<Device, T>::FeedData(const std::vector<void *> &v) {
executor_->FeedData(v);
}
template <typename Device, typename T>
void PaddleMobile<Device, T>::GetResults(std::vector<void *> *v) {
executor_->GetResults(v);
}
template <typename Device, typename T>
std::shared_ptr<framework::Tensor> PaddleMobile<Device, T>::FetchResult(
......
......@@ -90,6 +90,8 @@ class PaddleMobile {
#ifdef PADDLE_MOBILE_FPGA
void InjectVariable(const framework::Tensor &t, std::string var_name);
void FeedData(const framework::Tensor &t);
void FeedData(const std::vector<void *> &v);
void GetResults(std::vector<void *> *v);
std::shared_ptr<framework::Tensor> FetchResult(int id = -1);
void Predict_From_To(int start = 0, int end = -1);
void Predict_From(int start);
......
......@@ -103,6 +103,10 @@ class ProposalParam : public OpParam {
float nms_thresh_;
float min_size_;
float eta_;
#ifdef PADDLE_MOBILE_FPGA
std::shared_ptr<Tensor> float_score, float_bbox;
fpga::BypassArgs score_arg, bbox_arg;
#endif
};
DECLARE_KERNEL(Proposal, ProposalParam);
......@@ -133,6 +137,10 @@ class PSRoiPoolParam : public OpParam {
int pooled_height_;
int pooled_width_;
float spatial_scale_;
#ifdef PADDLE_MOBILE_FPGA
std::shared_ptr<Tensor> float_input, float_output;
fpga::BypassArgs input_arg, output_arg;
#endif
};
DECLARE_KERNEL(PSRoiPool, PSRoiPoolParam);
......
......@@ -23,15 +23,46 @@ namespace operators {
template <>
bool AnchorGeneratorKernel<FPGA, float>::Init(
AnchorGeneratorParam<FPGA> *param) {
// TODO zhangyang
auto input = param->input_;
auto anchors = param->output_anchors_;
auto anchor_ptr = anchors->mutable_data<float>();
auto stride = param->stride_;
auto feature_width = input->dims()[3], feature_height = input->dims()[2];
auto stride_width = stride[0], stride_height = stride[1];
int anchors_offset[] = {-2, -2, 18, 18, -10, -9, 26, 25, -23,
-20, 39, 36, -43, -34, 59, 49, -63, -54,
79, 69, -96, -77, 112, 93, -137, -118, 153,
134, -204, -188, 220, 204, -281, -395, 296, 441};
int num_anchors = sizeof(anchors_offset) / (sizeof(int) * 4);
// DLOG << "feature_height: " << feature_height;
// DLOG << "feature_width: " << feature_width;
// DLOG << "num_anchors: " << num_anchors;
// DLOG << "stride_width: " << stride_width;
// DLOG << "stride_height: " << stride_height;
for (int h_idx = 0; h_idx < feature_height; ++h_idx) {
for (int w_idx = 0; w_idx < feature_width; ++w_idx) {
      // row-major cell index: each cell stores num_anchors boxes of 4 coords
      int offset = (h_idx * feature_width + w_idx) * num_anchors * 4;
      for (int idx = 0; idx < num_anchors; idx++) {
        anchor_ptr[offset + idx * 4 + 0] =
            anchors_offset[idx * 4 + 0] + w_idx * stride_width;
        anchor_ptr[offset + idx * 4 + 1] =
            anchors_offset[idx * 4 + 1] + h_idx * stride_height;
        anchor_ptr[offset + idx * 4 + 2] =
            anchors_offset[idx * 4 + 2] + w_idx * stride_width;
        anchor_ptr[offset + idx * 4 + 3] =
            anchors_offset[idx * 4 + 3] + h_idx * stride_height;
}
}
}
return true;
}
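With the row-major offset above, anchors are laid out cell by cell. A quick check:

// feature map 2 x 3, num_anchors = 9:
// cell (h_idx = 1, w_idx = 2) starts at (1 * 3 + 2) * 9 * 4 = 180 floats,
// i.e. right after the five preceding cells (5 * 36 = 180); anchor idx = 4
// within that cell occupies floats 196..199.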
template <>
void AnchorGeneratorKernel<FPGA, float>::Compute(
const AnchorGeneratorParam<FPGA> &param) {
// TODO(hjchen2)
}
const AnchorGeneratorParam<FPGA> &param) {}
} // namespace operators
} // namespace paddle_mobile
......
......@@ -38,7 +38,7 @@ bool ConcatKernel<FPGA, float>::Init(ConcatParam<FPGA> *param) {
PADDLE_MOBILE_ENFORCE(
input->dims()[2] == height && input->dims()[3] == width,
"Image height & width should be unified");
images_in[i] = (half *)input->data<float>(); // NOLINT
images_in[i] = input->data<half>();
channel_num[i] = (uint32_t)inputs[i]->dims()[1]; // NOLINT
scales_in[i] = input->scale;
}
......@@ -48,7 +48,7 @@ bool ConcatKernel<FPGA, float>::Init(ConcatParam<FPGA> *param) {
concatArgs.image_num = image_num;
concatArgs.images_in = images_in;
concatArgs.scales_in = scales_in;
concatArgs.image_out = (half *)out->data<float>(); // NOLINT
concatArgs.image_out = out->data<half>();
concatArgs.scale_out = out->scale;
concatArgs.channel_num = channel_num;
concatArgs.height = height;
......
......@@ -27,10 +27,10 @@ bool ElementwiseAddKernel<FPGA, float>::Init(ElementwiseAddParam<FPGA> *param) {
auto *input_x = const_cast<LoDTensor *>(param->InputX());
auto *input_y = const_cast<LoDTensor *>(param->InputY());
auto *out = param->Out();
auto input_x_ptr = input_x->data<float>();
auto input_y_ptr = input_y->data<float>();
auto input_x_ptr = input_x->data<half>();
auto input_y_ptr = input_y->data<half>();
fpga::format_fp16_ofm(out);
auto out_ptr = out->mutable_data<float>();
auto out_ptr = out->mutable_data<half>();
fpga::EWAddArgs ewaddArgs = {0};
// ewaddArgs.relu_enabled = relu_enabled;
......
......@@ -28,10 +28,10 @@ bool ElementwiseAddReluKernel<FPGA, float>::Init(
auto *input_x = const_cast<LoDTensor *>(param->InputX());
auto *input_y = const_cast<LoDTensor *>(param->InputY());
auto *out = param->Out();
auto input_x_ptr = input_x->data<float>();
auto input_y_ptr = input_y->data<float>();
auto input_x_ptr = input_x->data<half>();
auto input_y_ptr = input_y->data<half>();
fpga::format_fp16_ofm(out);
auto out_ptr = out->mutable_data<float>();
auto out_ptr = out->mutable_data<half>();
fpga::EWAddArgs ewaddArgs = {0};
// ewaddArgs.relu_enabled = relu_enabled;
......
......@@ -19,19 +19,35 @@ namespace operators {
template <>
bool FeedKernel<FPGA, float>::Init(FeedParam<FPGA> *param) {
Tensor *output = param->Out();
auto output = param->Out();
auto input = const_cast<LoDTensor *>(param->InputX());
input->init(typeid(float));
input->Resize(output->dims());
if (output->dims().size() != 4) {
auto input_ptr = input->mutable_data<float>();
size_t size = output->numel() * sizeof(float);
auto p = fpga::fpga_malloc(size);
memcpy(p, input_ptr, size);
output->reset_data_ptr(p);
return true;
}
fpga::format_fp16_ofm(output);
return true;
}
template <>
void FeedKernel<FPGA, float>::Compute(const FeedParam<FPGA> &param) {
auto input =
reinterpret_cast<Tensor *>(const_cast<LoDTensor *>(param.InputX()));
auto output = param.Out();
auto input = const_cast<LoDTensor *>(param.InputX());
if (input->dims().size() != 4) {
return;
}
fpga::format_image(input);
auto input_ptr = input->data<float>();
Tensor *output = param.Out();
auto output_ptr = output->data<float>();
auto output_ptr = output->data<half>();
fpga::BypassArgs args = {fpga::DATA_TYPE_FP32};
......@@ -39,7 +55,7 @@ void FeedKernel<FPGA, float>::Compute(const FeedParam<FPGA> &param) {
args.output_data_type = fpga::DATA_TYPE_FP16;
args.input_layout_type = fpga::LAYOUT_CHW;
args.output_layout_type = fpga::LAYOUT_HWC;
args.image.address = reinterpret_cast<void *>(input_ptr);
args.image.address = input_ptr;
args.image.channels = (uint32_t)input->dims()[1];
args.image.height = (uint32_t)input->dims()[2];
args.image.width = (uint32_t)input->dims()[3];
......@@ -48,6 +64,8 @@ void FeedKernel<FPGA, float>::Compute(const FeedParam<FPGA> &param) {
args.output.address = output_ptr;
args.output.scale_address = output->scale;
fpga::PerformBypass(args);
input->external_data = nullptr;
}
template class FeedKernel<FPGA, float>;
......
......@@ -19,20 +19,14 @@ namespace operators {
template <>
bool FetchKernel<FPGA, float>::Init(FetchParam<FPGA> *param) {
Tensor *output = param->Out();
// fpga::format_fp16_ofm(output);
return true;
}
template <>
void FetchKernel<FPGA, float>::Compute(const FetchParam<FPGA> &param) {
param.Out()->ShareDataWith(*(param.InputX()));
/*auto input =
reinterpret_cast<Tensor *>(const_cast<Tensor *>(param.InputX()));
fpga::format_image(input);
auto input_ptr = input->data<float>();
Tensor *output = param.Out();
auto output_ptr = output->data<float>();
auto input = const_cast<Tensor *>(param->InputX());
auto output = param->Out();
if (input->type() == typeid(float)) {
return true;
}
output->init(typeid(float));
output->Resize(input->dims());
fpga::format_fp32_ofm(output);
fpga::BypassArgs args = {fpga::DATA_TYPE_FP16};
......@@ -40,13 +34,32 @@ void FetchKernel<FPGA, float>::Compute(const FetchParam<FPGA> &param) {
args.output_data_type = fpga::DATA_TYPE_FP32;
args.input_layout_type = fpga::LAYOUT_CHW;
args.output_layout_type = fpga::LAYOUT_HWC;
args.image.address = reinterpret_cast<void *>(input_ptr);
args.image.channels = (uint32_t)input->dims()[1];
args.image.height = (input->dims().size() == 4) ? (uint32_t)input->dims()[2] : 1;
args.image.width = (input->dims().size() == 4) ? (uint32_t)input->dims()[3] : 1;
args.image.pad_height = 0;
args.image.pad_width = 0;
args.output.address = output_ptr;
args.output.scale_address = output->scale;
fpga::PerformBypass(args);*/
args.image.address = input->data<half>();
args.image.channels = (uint32_t)product(input->dims());
args.image.height = 1;
args.image.width = 1;
args.image.pad_height = 0;
args.image.pad_width = 0;
args.output.address = output->data<float>();
args.output.scale_address = output->scale;
param->fpga_bypass_args = args;
return true;
}
template <>
void FetchKernel<FPGA, float>::Compute(const FetchParam<FPGA> &param) {
auto input = param.InputX();
if (input->type() == typeid(float)) {
auto output = param.Out();
output->ShareDataWith(*input);
return;
}
fpga::PerformBypass(param.fpga_bypass_args);
fpga::fpga_invalidate(param.fpga_bypass_args.output.address,
param.fpga_bypass_args.image.channels * sizeof(float));
// TODO: de-align the output: strip the extra zero padding
}
template class FetchKernel<FPGA, float>;
......
......@@ -22,10 +22,10 @@ namespace operators {
template <>
bool PoolKernel<FPGA, float>::Init(PoolParam<FPGA> *param) {
auto *input = const_cast<Tensor *>(param->Input());
auto input_ptr = input->data<float>();
auto input_ptr = input->data<half>();
Tensor *output = param->Output();
fpga::format_fp16_ofm(output);
auto output_ptr = output->mutable_data<float>();
auto output_ptr = output->mutable_data<half>();
vector<int> ksize = param->Ksize();
vector<int> strides = param->Strides();
vector<int> paddings = param->Paddings();
......
......@@ -14,20 +14,422 @@ limitations under the License. */
#ifdef PROPOSAL_OP
#include <algorithm>
#include <vector>
#include "operators/kernel/detection_kernel.h"
namespace paddle_mobile {
namespace operators {
static const double kBBoxClipDefault = std::log(1000.0 / 16.0);
template <>
bool ProposalKernel<FPGA, float>::Init(ProposalParam<FPGA> *param) {
int post_nms_top_n = param->post_nms_topn_;
int64_t batch = param->scores_->dims()[0];
auto total = post_nms_top_n * batch;
param->rpn_rois_->mutable_data<float>({total, 4});
param->rpn_probs_->mutable_data<float>({total, 1});
// DLOG << *param->rpn_rois_;
// DLOG << *param->rpn_probs_;
param->float_bbox = std::make_shared<Tensor>();
param->float_bbox->Resize(param->bbox_deltas_->dims());
param->float_bbox->init(typeid(float));
fpga::format_fp32_ofm(param->float_bbox.get());
param->float_score = std::make_shared<Tensor>();
param->float_score->Resize(param->scores_->dims());
param->float_score->init(typeid(float));
fpga::format_fp32_ofm(param->float_score.get());
auto input = param->bbox_deltas_;
fpga::BypassArgs args = {fpga::DATA_TYPE_FP16};
args.input_layout_type = fpga::LAYOUT_HWC;
args.output_layout_type = fpga::LAYOUT_HWC;
args.input_data_type = fpga::DATA_TYPE_FP16;
args.output_data_type = fpga::DATA_TYPE_FP32;
args.image.address = input->data<half>();
args.image.height = (uint32_t)input->dims()[2];
args.image.width = (uint32_t)input->dims()[3];
args.image.channels = (uint32_t)input->dims()[1];
args.output.address = param->float_bbox->mutable_data<float>();
args.output.scale_address = param->float_bbox->scale;
param->bbox_arg = args;
input = param->scores_;
args.image.address = input->data<half>();
args.image.height = (uint32_t)input->dims()[2];
args.image.width = (uint32_t)input->dims()[3];
args.image.channels = (uint32_t)input->dims()[1];
args.output.address = param->float_score->mutable_data<float>();
args.output.scale_address = param->float_score->scale;
param->score_arg = args;
return true;
}
void AppendProposals(Tensor *dst, int64_t offset, const Tensor &src) {
auto *out_data = dst->data<void>();
auto *to_add_data = src.data<void>();
size_t size_of_t = framework::SizeOfType(src.type());
offset *= size_of_t;
std::memcpy(
reinterpret_cast<void *>(reinterpret_cast<uintptr_t>(out_data) + offset),
to_add_data, src.numel() * size_of_t);
}
template <class T>
static inline void BoxCoder(Tensor *all_anchors, Tensor *bbox_deltas,
Tensor *variances, Tensor *proposals) {
T *proposals_data = proposals->mutable_data<T>();
int64_t row = all_anchors->dims()[0];
int64_t len = all_anchors->dims()[1];
auto *bbox_deltas_data = bbox_deltas->data<T>();
auto *anchor_data = all_anchors->data<T>();
const T *variances_data = nullptr;
if (variances) {
variances_data = variances->data<T>();
}
for (int64_t i = 0; i < row; ++i) {
T anchor_width = anchor_data[i * len + 2] - anchor_data[i * len] + 1.0;
T anchor_height = anchor_data[i * len + 3] - anchor_data[i * len + 1] + 1.0;
T anchor_center_x = anchor_data[i * len] + 0.5 * anchor_width;
T anchor_center_y = anchor_data[i * len + 1] + 0.5 * anchor_height;
T bbox_center_x = 0, bbox_center_y = 0;
T bbox_width = 0, bbox_height = 0;
if (variances) {
bbox_center_x =
variances_data[i * len] * bbox_deltas_data[i * len] * anchor_width +
anchor_center_x;
bbox_center_y = variances_data[i * len + 1] *
bbox_deltas_data[i * len + 1] * anchor_height +
anchor_center_y;
bbox_width = std::exp(std::min<T>(variances_data[i * len + 2] *
bbox_deltas_data[i * len + 2],
kBBoxClipDefault)) *
anchor_width;
bbox_height = std::exp(std::min<T>(variances_data[i * len + 3] *
bbox_deltas_data[i * len + 3],
kBBoxClipDefault)) *
anchor_height;
} else {
bbox_center_x =
bbox_deltas_data[i * len] * anchor_width + anchor_center_x;
bbox_center_y =
bbox_deltas_data[i * len + 1] * anchor_height + anchor_center_y;
bbox_width = std::exp(std::min<T>(bbox_deltas_data[i * len + 2],
kBBoxClipDefault)) *
anchor_width;
bbox_height = std::exp(std::min<T>(bbox_deltas_data[i * len + 3],
kBBoxClipDefault)) *
anchor_height;
}
proposals_data[i * len] = bbox_center_x - bbox_width / 2;
proposals_data[i * len + 1] = bbox_center_y - bbox_height / 2;
proposals_data[i * len + 2] = bbox_center_x + bbox_width / 2 - 1;
proposals_data[i * len + 3] = bbox_center_y + bbox_height / 2 - 1;
}
// return proposals;
}
template <class T>
static inline void ClipTiledBoxes(const Tensor &im_info, Tensor *boxes) {
T *boxes_data = boxes->mutable_data<T>();
const T *im_info_data = im_info.data<T>();
T zero(0);
for (int64_t i = 0; i < boxes->numel(); ++i) {
if (i % 4 == 0) {
boxes_data[i] =
std::max(std::min(boxes_data[i], im_info_data[1] - 1), zero);
} else if (i % 4 == 1) {
boxes_data[i] =
std::max(std::min(boxes_data[i], im_info_data[0] - 1), zero);
} else if (i % 4 == 2) {
boxes_data[i] =
std::max(std::min(boxes_data[i], im_info_data[1] - 1), zero);
} else {
boxes_data[i] =
std::max(std::min(boxes_data[i], im_info_data[0] - 1), zero);
}
}
}
template <class T>
static inline void FilterBoxes(Tensor *boxes, float min_size,
const Tensor &im_info, Tensor *keep) {
const T *im_info_data = im_info.data<T>();
T *boxes_data = boxes->mutable_data<T>();
T im_scale = im_info_data[2];
keep->Resize({boxes->dims()[0]});
min_size = std::max(min_size, 1.0f);
int *keep_data = keep->mutable_data<int>();
int keep_len = 0;
for (int i = 0; i < boxes->dims()[0]; ++i) {
T ws = boxes_data[4 * i + 2] - boxes_data[4 * i] + 1;
T hs = boxes_data[4 * i + 3] - boxes_data[4 * i + 1] + 1;
T ws_origin_scale =
(boxes_data[4 * i + 2] - boxes_data[4 * i]) / im_scale + 1;
T hs_origin_scale =
(boxes_data[4 * i + 3] - boxes_data[4 * i + 1]) / im_scale + 1;
T x_ctr = boxes_data[4 * i] + ws / 2;
T y_ctr = boxes_data[4 * i + 1] + hs / 2;
if (ws_origin_scale >= min_size && hs_origin_scale >= min_size &&
x_ctr <= im_info_data[1] && y_ctr <= im_info_data[0]) {
keep_data[keep_len++] = i;
}
}
keep->Resize({keep_len});
}
template <class T>
static inline std::vector<std::pair<T, int>> GetSortedScoreIndex(
const std::vector<T> &scores) {
std::vector<std::pair<T, int>> sorted_indices;
sorted_indices.reserve(scores.size());
for (size_t i = 0; i < scores.size(); ++i) {
sorted_indices.emplace_back(scores[i], i);
}
// Sort the score pairs in ascending order of score, so the best score ends
// up at the back (NMS below pops from the back)
std::stable_sort(sorted_indices.begin(), sorted_indices.end(),
[](const std::pair<T, int> &a, const std::pair<T, int> &b) {
return a.first < b.first;
});
return sorted_indices;
}
template <class T>
static inline T BBoxArea(const T *box, bool normalized) {
if (box[2] < box[0] || box[3] < box[1]) {
// If coordinate values are invalid
// (e.g. xmax < xmin or ymax < ymin), return 0.
return static_cast<T>(0.);
} else {
const T w = box[2] - box[0];
const T h = box[3] - box[1];
if (normalized) {
return w * h;
} else {
// If coordinate values are not within range [0, 1].
return (w + 1) * (h + 1);
}
}
}
template <typename T>
static inline Tensor VectorToTensor(const std::vector<T> &selected_indices,
int selected_num) {
Tensor keep_nms;
keep_nms.Resize({selected_num});
auto *keep_data = keep_nms.mutable_data<T>();
for (int i = 0; i < selected_num; ++i) {
keep_data[i] = selected_indices[i];
}
return keep_nms;
}
template <class T>
static inline T JaccardOverlap(const T *box1, const T *box2, bool normalized) {
if (box2[0] > box1[2] || box2[2] < box1[0] || box2[1] > box1[3] ||
box2[3] < box1[1]) {
return static_cast<T>(0.);
} else {
const T inter_xmin = std::max(box1[0], box2[0]);
const T inter_ymin = std::max(box1[1], box2[1]);
const T inter_xmax = std::min(box1[2], box2[2]);
const T inter_ymax = std::min(box1[3], box2[3]);
const T inter_w = std::max(T(0), inter_xmax - inter_xmin + 1);
const T inter_h = std::max(T(0), inter_ymax - inter_ymin + 1);
const T inter_area = inter_w * inter_h;
const T bbox1_area = BBoxArea<T>(box1, normalized);
const T bbox2_area = BBoxArea<T>(box2, normalized);
return inter_area / (bbox1_area + bbox2_area - inter_area);
}
}
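A hand-checked case for the overlap math above (pixel convention, normalized = false):

// box1 = {0, 0, 9, 9},  box2 = {5, 5, 14, 14}
// inter_w = min(9, 14) - max(0, 5) + 1 = 5, inter_h = 5  ->  inter_area = 25
// each box area = (9 + 1) * (9 + 1) = 100
// IoU = 25 / (100 + 100 - 25) = 25 / 175 ≈ 0.143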
template <class T>
static inline Tensor NMS(Tensor *bbox, Tensor *scores, T nms_threshold,
float eta) {
int64_t num_boxes = bbox->dims()[0];
// 4: [xmin ymin xmax ymax]
int64_t box_size = bbox->dims()[1];
std::vector<T> scores_data(num_boxes);
std::copy_n(scores->data<T>(), num_boxes, scores_data.begin());
std::vector<std::pair<T, int>> sorted_indices =
GetSortedScoreIndex<T>(scores_data);
std::vector<int> selected_indices;
int selected_num = 0;
T adaptive_threshold = nms_threshold;
const T *bbox_data = bbox->data<T>();
while (sorted_indices.size() != 0) {
int idx = sorted_indices.back().second;
bool flag = true;
for (int kept_idx : selected_indices) {
if (flag) {
T overlap = JaccardOverlap<T>(bbox_data + idx * box_size,
bbox_data + kept_idx * box_size, false);
flag = (overlap <= adaptive_threshold);
} else {
break;
}
}
if (flag) {
selected_indices.push_back(idx);
++selected_num;
}
sorted_indices.erase(sorted_indices.end() - 1);
if (flag && eta < 1 && adaptive_threshold > 0.5) {
adaptive_threshold *= eta;
}
}
return VectorToTensor(selected_indices, selected_num);
}
template <typename T>
std::pair<Tensor, Tensor> ProposalForOneImage(
const Tensor &im_info_slice, const Tensor &anchors, const Tensor &variances,
const Tensor &bbox_deltas_slice, // [M, 4]
const Tensor &scores_slice, // [N, 1]
int pre_nms_top_n, int post_nms_top_n, float nms_thresh, float min_size,
float eta) {
auto *scores_data = scores_slice.data<T>();
// Sort index
Tensor index_t;
index_t.Resize({scores_slice.numel()});
int *index = index_t.mutable_data<int>();
for (int i = 0; i < scores_slice.numel(); ++i) {
index[i] = i;
}
auto compare = [scores_data](const int64_t &i, const int64_t &j) {
return scores_data[i] > scores_data[j];
};
if (pre_nms_top_n <= 0 || pre_nms_top_n >= scores_slice.numel()) {
std::sort(index, index + scores_slice.numel(), compare);
} else {
std::nth_element(index, index + pre_nms_top_n, index + scores_slice.numel(),
compare);
index_t.Resize({pre_nms_top_n});
}
Tensor scores_sel, bbox_sel, anchor_sel, var_sel;
scores_sel.mutable_data<T>({index_t.numel(), 1});
bbox_sel.mutable_data<T>({index_t.numel(), 4});
anchor_sel.mutable_data<T>({index_t.numel(), 4});
var_sel.mutable_data<T>({index_t.numel(), 4});
Tensor proposals;
proposals.mutable_data<T>({index_t.numel(), 4});
BoxCoder<T>(&anchor_sel, &bbox_sel, &var_sel, &proposals);
ClipTiledBoxes<T>(im_info_slice, &proposals);
Tensor keep;
FilterBoxes<T>(&proposals, min_size, im_info_slice, &keep);
Tensor scores_filter;
bbox_sel.mutable_data<T>({keep.numel(), 4});
scores_filter.mutable_data<T>({keep.numel(), 1});
if (nms_thresh <= 0) {
return std::make_pair(bbox_sel, scores_filter);
}
Tensor keep_nms = NMS<T>(&bbox_sel, &scores_filter, nms_thresh, eta);
if (post_nms_top_n > 0 && post_nms_top_n < keep_nms.numel()) {
keep_nms.Resize({post_nms_top_n});
}
proposals.mutable_data<T>({keep_nms.numel(), 4});
scores_sel.mutable_data<T>({keep_nms.numel(), 1});
return std::make_pair(proposals, scores_sel);
}
template <>
void ProposalKernel<FPGA, float>::Compute(const ProposalParam<FPGA> &param) {
// TODO(hjchen2)
auto score_tensor = param.float_score.get();
fpga::PerformBypass(param.score_arg);
fpga::fpga_invalidate(score_tensor->data<float>(),
score_tensor->numel() * sizeof(float));
auto bbox_tensor = param.float_bbox.get();
fpga::PerformBypass(param.bbox_arg);
fpga::fpga_invalidate(bbox_tensor->data<float>(),
bbox_tensor->numel() * sizeof(float));
auto *scores = param.float_score.get();
auto *bbox_deltas = param.float_bbox.get();
auto *im_info = param.im_info_;
auto anchors = *param.anchors_;
auto variances = *param.variances_;
auto *rpn_rois = param.rpn_rois_;
auto *rpn_roi_probs = param.rpn_probs_;
int pre_nms_top_n = param.pre_nms_topn_;
int post_nms_top_n = param.post_nms_topn_;
float nms_thresh = param.nms_thresh_;
float min_size = param.min_size_;
float eta = param.eta_;
auto &scores_dim = scores->dims();
int64_t num = scores_dim[0];
int64_t c_score = scores_dim[1];
int64_t h_score = scores_dim[2];
int64_t w_score = scores_dim[3];
auto &bbox_dim = bbox_deltas->dims();
int64_t c_bbox = bbox_dim[1];
int64_t h_bbox = bbox_dim[2];
int64_t w_bbox = bbox_dim[3];
//
Tensor bbox_deltas_swap, scores_swap;
bbox_deltas_swap.mutable_data<float>({num, h_bbox, w_bbox, c_bbox});
scores_swap.mutable_data<float>({num, h_score, w_score, c_score});
framework::LoD lod;
lod.resize(1);
auto &lod0 = lod[0];
lod0.push_back(0);
anchors.Resize({anchors.numel() / 4, 4});
int64_t num_proposals = 0;
for (int64_t i = 0; i < num; ++i) {
Tensor im_info_slice = im_info->Slice(i, i + 1);
Tensor bbox_deltas_slice = bbox_deltas_swap.Slice(i, i + 1);
Tensor scores_slice = scores_swap.Slice(i, i + 1);
bbox_deltas_slice.Resize({h_bbox * w_bbox * c_bbox / 4, 4});
scores_slice.Resize({h_score * w_score * c_score, 1});
std::pair<Tensor, Tensor> tensor_pair = ProposalForOneImage<float>(
im_info_slice, anchors, variances, bbox_deltas_slice, scores_slice,
pre_nms_top_n, post_nms_top_n, nms_thresh, min_size, eta);
Tensor &proposals = tensor_pair.first;
Tensor &scores = tensor_pair.second;
AppendProposals(rpn_rois, 4 * num_proposals, proposals);
AppendProposals(rpn_roi_probs, num_proposals, scores);
num_proposals += proposals.dims()[0];
lod0.push_back(num_proposals);
}
rpn_rois->set_lod(lod);
rpn_roi_probs->set_lod(lod);
rpn_rois->Resize({num_proposals, 4});
rpn_roi_probs->Resize({num_proposals, 1});
}
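The LoD built above records per-image proposal counts. For a batch of two images keeping 300 and 280 proposals:

// lod0 = {0, 300, 580}
// rpn_rois dims: {580, 4};  rpn_roi_probs dims: {580, 1}
// image i's proposals are rows [lod0[i], lod0[i + 1])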
} // namespace operators
......
......@@ -14,6 +14,7 @@ limitations under the License. */
#ifdef PSROI_POOL_OP
#include <cmath>
#include <vector>
#include "operators/kernel/detection_kernel.h"
......@@ -21,13 +22,180 @@ namespace paddle_mobile {
namespace operators {
template <>
bool PSRoiPoolKernel<FPGA, float>::Init(PSRoiPoolParam<FPGA> *param) {
bool PSRoiPoolKernel<FPGA, float>::Init(PSRoiPoolParam<FPGA>* param) {
auto dims = param->input_x_->dims();
PADDLE_MOBILE_ENFORCE(dims[1] * dims[3] % IMAGE_ALIGNMENT == 0,
"data not aligned");
param->float_input = std::make_shared<Tensor>();
param->float_input->mutable_data<float>(param->input_x_->dims());
param->float_output = std::make_shared<Tensor>();
param->float_output->mutable_data<float>(param->output_->dims());
auto input = param->input_x_;
fpga::BypassArgs args = {fpga::DATA_TYPE_FP16};
args.input_layout_type = fpga::LAYOUT_HWC;
args.output_layout_type = fpga::LAYOUT_HWC;
args.input_data_type = fpga::DATA_TYPE_FP16;
args.output_data_type = fpga::DATA_TYPE_FP32;
args.image.address = input->data<half>();
args.image.height = (uint32_t)input->dims()[2];
args.image.width = (uint32_t)input->dims()[3];
args.image.channels = (uint32_t)input->dims()[1];
args.output.address = param->float_input->mutable_data<float>();
args.output.scale_address = param->float_input->scale;
param->input_arg = args;
fpga::format_fp16_ofm(param->output_);
input = param->float_output.get();
args.input_data_type = fpga::DATA_TYPE_FP32;
args.output_data_type = fpga::DATA_TYPE_FP16;
args.image.address = input->data<float>();
args.image.height = (uint32_t)input->dims()[2];
args.image.width = (uint32_t)input->dims()[3];
args.image.channels = (uint32_t)input->dims()[1];
args.output.address = param->output_->mutable_data<half>();
args.output.scale_address = param->output_->scale;
param->output_arg = args;
return true;
}
template <>
void PSRoiPoolKernel<FPGA, float>::Compute(const PSRoiPoolParam<FPGA> &param) {
// TODO(hjchen2)
void PSRoiPoolKernel<FPGA, float>::Compute(const PSRoiPoolParam<FPGA>& param) {
auto input_tensor = param.float_input.get();
fpga::PerformBypass(param.input_arg);
fpga::fpga_invalidate(input_tensor->data<float>(),
input_tensor->numel() * sizeof(float));
auto* in = input_tensor;
auto* rois = param.input_rois_;
auto* out = param.float_output.get();
auto pooled_height = param.pooled_height_;
auto pooled_width = param.pooled_width_;
auto spatial_scale = param.spatial_scale_;
auto output_channels = param.output_channels_;
auto in_dims = in->dims();
int batch_size = in_dims[0];
int input_channels = in_dims[1];
int height = in_dims[2];
int width = in_dims[3];
int rois_num = rois->dims()[0];
// TODO auto in_stride = framework::stride(in_dims);
// TODO auto out_stride = framework::stride(out->dims());
auto in_stride =
framework::stride({batch_size, height, width, input_channels});
auto out_stride = framework::stride(
{out->dims()[0], out->dims()[2], out->dims()[3], out->dims()[1]});
const float* input_data = in->data<float>();
framework::Tensor rois_batch_id_list;
rois_batch_id_list.Resize({rois_num});
auto rois_batch_id_data = rois_batch_id_list.mutable_data<int>();
return;  // NOTE: early return; the CPU pooling path below is currently disabled
PADDLE_MOBILE_ENFORCE(rois->NumLevels() > 0, "ROIS should not be empty");
auto rois_lod = rois->lod().back();
int rois_batch_size = rois_lod.size() - 1;
PADDLE_MOBILE_ENFORCE(
rois_batch_size == batch_size,
"the rois_batch_size and input(X) batch_size should be the same.");
int rois_num_with_lod = rois_lod[rois_batch_size];
PADDLE_MOBILE_ENFORCE(rois_num_with_lod == rois_num,
"the rois_num from input and lod must be the same");
PADDLE_MOBILE_ENFORCE(
input_channels == output_channels * pooled_height * pooled_width,
"the channels of input X should equal the product of "
"output_channels x pooled_height x pooled_width");
// calculate batch id index for each roi according to LoD
for (int n = 0; n < rois_batch_size; ++n) {
for (size_t i = rois_lod[n]; i < rois_lod[n + 1]; ++i) {
rois_batch_id_data[i] = n;
}
}
auto output_data = out->mutable_data<float>();
auto input_rois = rois->data<float>();
// calculate psroipooling, parallel processing can be implemented per ROI
for (int n = 0; n < rois_num; ++n) {
// set roi batch id
int roi_batch_id = rois_batch_id_data[n];
// [start, end) interval for spatial sampling
auto offset_input_rois = input_rois + n * 4;
auto roi_start_w =
static_cast<float>(round(offset_input_rois[0])) * spatial_scale;
auto roi_start_h =
static_cast<float>(round(offset_input_rois[1])) * spatial_scale;
auto roi_end_w =
static_cast<float>(round(offset_input_rois[2]) + 1.) * spatial_scale;
auto roi_end_h =
static_cast<float>(round(offset_input_rois[3]) + 1.) * spatial_scale;
// Force too small rois to be 1 x 1
auto roi_height = std::max(roi_end_h - roi_start_h, 0.1f); // avoid 0
auto roi_width = std::max(roi_end_w - roi_start_w, 0.1f);
// Compute bin size w and h at input feature map
auto bin_size_h = roi_height / static_cast<float>(pooled_height);
auto bin_size_w = roi_width / static_cast<float>(pooled_width);
DLOG << 3;
// calculate each pixel of the output feature map.
int out_roi_offset = n * out_stride[0];
for (int c = 0; c < output_channels; ++c) {
// per category
// int out_plane_offset = out_roi_offset + c * out_stride[1];
int out_plane_offset = out_roi_offset + c;
for (int ph = 0; ph < pooled_height; ++ph) {
// TODO int out_row_offset = out_plane_offset + ph *
// out_stride[2];
int out_row_offset = out_plane_offset + ph * out_stride[1];
for (int pw = 0; pw < pooled_width; ++pw) {
// calculate w and h at input feature map
int hstart = floor(static_cast<float>(ph) * bin_size_h + roi_start_h);
int wstart = floor(static_cast<float>(pw) * bin_size_w + roi_start_w);
int hend =
ceil(static_cast<float>(ph + 1) * bin_size_h + roi_start_h);
int wend =
ceil(static_cast<float>(pw + 1) * bin_size_w + roi_start_w);
// Add roi offsets and clip to input boundaries
hstart = std::min(std::max(hstart, 0), height);
wstart = std::min(std::max(wstart, 0), width);
hend = std::min(std::max(hend, 0), height);
wend = std::min(std::max(wend, 0), width);
// TODO int output_index = out_row_offset + pw;
int output_index = out_row_offset + pw * output_channels;
int input_channel = (c * pooled_height + ph) * pooled_width + pw;
// TODO int input_plane_offset =
// TODO roi_batch_id * in_stride[0] + input_channel *
// in_stride[1];
int input_plane_offset = roi_batch_id * in_stride[0] + input_channel;
auto offset_input_data = input_data + input_plane_offset;
float out_sum = 0.;
bool is_empty = (hend <= hstart) || (wend <= wstart);
for (int ih = hstart; ih < hend; ++ih) {
for (int iw = wstart; iw < wend; ++iw) {
int input_index = ih * in_stride[1] + iw * in_stride[2];  // HWC: row stride, then channel count
out_sum += offset_input_data[input_index];
}
}
float bin_area = (hend - hstart) * (wend - wstart);
output_data[output_index] = is_empty ? 0. : out_sum / bin_area;
}
}
}
}
fpga::format_image(out);
fpga::PerformBypass(param.output_arg);
}
} // namespace operators
......
......@@ -15,18 +15,119 @@ limitations under the License. */
#ifdef RESHAPE2_OP
#include "operators/kernel/reshape2_kernel.h"
#include "framework/ddim.h"
namespace paddle_mobile {
namespace operators {
template <>
bool Reshape2Kernel<FPGA, float>::Init(Reshape2Param<FPGA> *param) {
auto input = const_cast<LoDTensor *>(param->InputX());
auto output = param->Out();
auto shape = param->Shape();
auto num_in = framework::product(input->dims());
auto num_shape = framework::product(framework::make_ddim(shape));
PADDLE_MOBILE_ENFORCE(num_shape != 0, "0 index is not supported");
for (int i = 0; i < shape.size(); i++) {
if (shape[i] == -1) {
shape[i] = static_cast<int>(-num_in / num_shape);
break;
}
}
output->Resize(framework::make_ddim(shape));
output->set_type(input->type());
fpga::format_ofm(output);
DLOG << "input: " << input;
DLOG << "output: " << output;
return true;
}
void reshape(LoDTensor *input, LoDTensor *output) {
// Subscript r means after reshape
// TODO(zhangyang): verify this function
float *input_ptr_f, *output_ptr_f;
half *input_ptr_h, *output_ptr_h;
bool is_float = false;
if (input->type() == typeid(float)) {
input_ptr_f = input->data<float>();
output_ptr_f = output->data<float>();
is_float = true;
} else {
input_ptr_h = input->data<half>();
output_ptr_h = output->data<half>();
}
auto C = static_cast<int>(input->dims()[1]);
auto H = static_cast<int>(input->dims()[2]);
auto W = static_cast<int>(input->dims()[3]);
auto Cr = static_cast<int>(output->dims()[1]);
auto Hr = static_cast<int>(output->dims()[2]);
auto Wr = static_cast<int>(output->dims()[3]);
PADDLE_MOBILE_ENFORCE(C * H * W == Cr * Hr * Wr, "Dims don't match");
auto WC = W * C;
auto WC_align = fpga::align_to_x(WC, IMAGE_ALIGNMENT);
auto HW = H * W;
auto WCr = Wr * Cr;
auto WCr_align = fpga::align_to_x(WCr, IMAGE_ALIGNMENT);
auto HWr = Hr * Wr;
int offset_align = 0;
int offset_r = 0, offset_align_r = 0;
int cr = 0, hr = 0, wr = 0;
for (int h = 0; h < H; h++) {
int offset0 = h * WC_align;
for (int w = 0; w < W; w++) {
int offset1 = w * C + offset0;
for (int c = 0; c < C; c++) {
offset_align = offset1 + c;
offset_r = c * HW + h * W + w;
cr = offset_r / HWr;
hr = offset_r % HWr / Wr;
wr = offset_r % Wr;
offset_align_r = hr * WCr_align + wr * Cr + cr;
// DLOG << "hwc"<< h<< " " << w << " " << c;
// DLOG << "hrwrcr" << hr<< " " << wr << " " << cr;
if (is_float) {
output_ptr_f[offset_align_r] = input_ptr_f[offset_align];
} else {
output_ptr_h[offset_align_r] = input_ptr_h[offset_align];
}
}
}
}
}
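A worked example of the aligned-HWC bookkeeping in reshape(), assuming IMAGE_ALIGNMENT = 16 and an identity reshape (Cr = C, Hr = H, Wr = W):

// C = 4, H = 2, W = 3  ->  WC = 12, WC_align = 16, HW = 6
// source element (h = 1, w = 2, c = 3):
//   offset_align   = 1 * 16 + 2 * 4 + 3 = 27   (aligned HWC read)
//   offset_r       = 3 * 6 + 1 * 3 + 2 = 23    (logical CHW index)
//   cr = 23 / 6 = 3, hr = (23 % 6) / 3 = 1, wr = 23 % 3 = 2
//   offset_align_r = 1 * 16 + 2 * 4 + 3 = 27   (aligned HWC write)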
template <>
void Reshape2Kernel<FPGA, float>::Compute(const Reshape2Param<FPGA> &param) {
return;
auto input = const_cast<LoDTensor *>(param.InputX());
auto output = param.Out();
auto shape = param.Shape();
auto num_in = framework::product(input->dims());
auto num_shape = framework::product(framework::make_ddim(shape));
PADDLE_MOBILE_ENFORCE(num_shape != 0, "0 index is not supported");
for (int i = 0; i < shape.size(); i++) {
if (shape[i] == -1) {
shape[i] = static_cast<int>(-num_in / num_shape);
break;
}
}
output->Resize(framework::make_ddim(shape));
if (output->dims() == input->dims()) {
DLOG << "No need to reshape";
return;
}
reshape(input, output);
//
}
} // namespace operators
......
......@@ -25,7 +25,7 @@ bool SigmoidKernel<FPGA, float>::Init(SigmoidParam<FPGA> *param) {
paddle_mobile::fpga::SIGMOID;
int16_t leaky_relu_negative_slope = 0;
auto input = const_cast<Tensor *>(param->InputX());
auto input_ptr = input->data<float>();
auto input_ptr = input->data<half>();
auto out = param->Out();
fpga::format_fp16_ofm(out);
......@@ -38,7 +38,7 @@ bool SigmoidKernel<FPGA, float>::Init(SigmoidParam<FPGA> *param) {
args.image.width =
(input->dims().size() == 4) ? (uint32_t)input->dims()[3] : 1;
args.image.channels = (uint32_t)input->dims()[1];
args.output.address = out->data<float>();
args.output.address = out->data<half>();
args.output.scale_address = out->scale;
args.output.activation.activation_type = activation_enable;
args.output.activation.leaky_relu_negative_slope = leaky_relu_negative_slope;
......
......@@ -21,10 +21,37 @@ namespace operators {
template <>
bool SliceKernel<FPGA, float>::Init(SliceParam<FPGA>* param) {
auto output = param->output_;
fpga::format_fp16_ofm(output);
DLOG << "input: " << param->input_;
DLOG << "output: " << param->output_;
if (param->input_->type() != typeid(half)) {
DLOG << "wrong type";
}
return true;
}
template <>
void SliceKernel<FPGA, float>::Compute(const SliceParam<FPGA>& param) {}
void SliceKernel<FPGA, float>::Compute(const SliceParam<FPGA>& param) {
// Only support slicing in channel dimension
auto input = param.input_;
DLOG << input;
int HW = input->dims()[2] * input->dims()[3];
int channel = input->dims()[1];
auto input_ptr = input->data<half>();
auto output_ptr = param.output_->data<half>();
int start = param.starts_[0], end = param.ends_[0];
start = start < 0 ? start + channel : start;
end = end < 0 ? end + channel : end;
start = start > channel ? channel : start;
end = end > channel ? channel : end;
int len = end - start;
  for (int i = 0; i < HW; i++) {
    // copy bytes, not elements: len half values per pixel
    memcpy(output_ptr + len * i, input_ptr + i * channel + start,
           len * sizeof(half));
  }
}
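An example of the channel-window arithmetic above, including negative indices:

// input dims {1, 5, H, W}  ->  channel = 5
// starts = {-2}, ends = {5}: start -> 3, end -> 5, len = 2
// every one of the H * W pixels copies 2 half values (4 bytes)
// from its channels [3, 5) into the packed output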
} // namespace operators
} // namespace paddle_mobile
#endif
......@@ -23,49 +23,72 @@ namespace operators {
template <>
bool SoftmaxKernel<FPGA, float>::Init(SoftmaxParam<FPGA> *param) {
auto input = const_cast<LoDTensor *>(param->InputX());
auto input_ptr = input->data<float>();
auto input_ptr = input->data<half>();
auto out = param->Out();
fpga::format_fp32_ofm(out);
auto float_input = new Tensor;
if (input->dims().size() == 2) {
float_input->mutable_data<float>({1, input->dims()[1]});
} else if (input->dims().size() == 4) {
float_input->mutable_data<float>(
{1, input->dims()[2], input->dims()[3], input->dims()[1]});
} else {
DLOG << "wrong dimension of softmax input";
PADDLE_MOBILE_ENFORCE(input->dims().size() == 4,
"Softmax should have 4-order input");
auto dims = framework::vectorize(input->dims());
auto channel = dims[3];
if (channel == 1) { // This input is generated by FC op, dims = [N C 1 1]
PADDLE_MOBILE_ENFORCE(dims[2] == 1, "Softmax input must come from FC op");
dims[3] = dims[1];
dims[1] = 1;
}
input->Resize(framework::make_ddim(dims));
float_input->Resize(framework::make_ddim(dims));
if (channel != 2) { // Use CPU
float_input->init(typeid(float));
fpga::format_fp32_ofm(float_input);
fpga::format_fp32_ofm(out);
fpga::BypassArgs args = {fpga::DATA_TYPE_FP16};
args.input_layout_type = fpga::LAYOUT_HWC;
args.output_layout_type = fpga::LAYOUT_CHW;
args.input_data_type = fpga::DATA_TYPE_FP16;
args.output_data_type = fpga::DATA_TYPE_FP32;
args.image.address = input_ptr;
args.image.height = (uint32_t)dims[1];
args.image.width = (uint32_t)dims[2];
args.image.channels = (uint32_t)dims[3];
args.output.address = float_input->data<float>();
args.output.scale_address = float_input->scale;
param->SetFloatInput(float_input);
param->SetFpgaArgs(args);
} else { // Use FPGA
fpga::format_fp16_ofm(out);
fpga::BypassArgs args = {fpga::DATA_TYPE_FP16};
args.input_layout_type = fpga::LAYOUT_HWC;
args.output_layout_type = fpga::LAYOUT_CHW;
args.input_data_type = fpga::DATA_TYPE_FP16;
args.output_data_type = fpga::DATA_TYPE_FP16;
args.image.address = input_ptr;
args.image.height = (uint32_t)input->dims()[1];
args.image.width = (uint32_t)input->dims()[2];
args.image.channels = (uint32_t)input->dims()[3];
args.output.address = out->data<half>();
args.output.scale_address = out->scale;
args.output.activation.activation_type = fpga::SOFTMAX;
param->SetFpgaArgs(args);
}
fpga::format_fp32_ofm(float_input);
fpga::BypassArgs args = {fpga::DATA_TYPE_FP16};
args.input_layout_type = fpga::LAYOUT_HWC;
args.output_layout_type = fpga::LAYOUT_CHW;
args.input_data_type = fpga::DATA_TYPE_FP16;
args.output_data_type = fpga::DATA_TYPE_FP32;
args.image.address = input_ptr;
args.image.height =
(input->dims().size() == 4) ? (uint32_t)input->dims()[2] : 1;
args.image.width =
(input->dims().size() == 4) ? (uint32_t)input->dims()[3] : 1;
args.image.channels = (uint32_t)input->dims()[1];
args.output.address = float_input->data<float>();
args.output.scale_address = float_input->scale;
param->SetFloatInput(float_input);
param->SetFpgaArgs(args);
return true;
}
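Editor's note: the Init above encodes a routing rule worth stating plainly: only the 2-channel case runs softmax on the FPGA engine itself; every other shape is bypassed from fp16 HWC to fp32 CHW and finished by the CPU functor. A hedged restatement of that decision, with hypothetical names:

// Hypothetical restatement of the dispatch in Init above; not part of
// the paddle-mobile API.
enum class SoftmaxRoute { kFpga, kCpuAfterBypass };

inline SoftmaxRoute pick_softmax_route(int channel) {
  // The FPGA handles the binary-classification case natively;
  // everything else goes through the fp32 bypass to the CPU.
  return channel == 2 ? SoftmaxRoute::kFpga : SoftmaxRoute::kCpuAfterBypass;
}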
template <>
void SoftmaxKernel<FPGA, float>::Compute(const SoftmaxParam<FPGA> &param) {
Tensor *in_x = param.FloatInput();
Tensor *out = param.Out();
fpga::PerformBypass(param.FpgaArgs());
fpga::fpga_invalidate((void *)in_x->data<float>(), // NOLINT
in_x->numel() * sizeof(float));
// TODO: in the general case, dimension 0 should be squeezed before the softmax input // NOLINT
math::SoftmaxFuntor<CPU, float>()(in_x, out);
fpga::fpga_flush(out->data<float>(), out->memory_size());
if (param.FpgaArgs().output.activation.activation_type != fpga::SOFTMAX) {
Tensor *out = param.Out();
Tensor *in_x = param.FloatInput();
fpga::fpga_invalidate(in_x->data<float>(), in_x->numel() * sizeof(float));
math::SoftmaxFuntor<CPU, float>()(in_x, out);
fpga::fpga_flush(out->data<float>(), out->memory_size());
}
}
} // namespace operators
......
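Editor's note: the Compute above follows the usual shared-memory handshake for the CPU fallback path: invalidate the CPU cache before reading data the FPGA produced, compute, then flush so later consumers see the result. A generic sketch with no-op stand-ins for fpga_invalidate/fpga_flush (the real calls manage the CPU cache over DMA-visible memory):

#include <cstddef>

void cache_invalidate(void *, size_t) {}  // stand-in for fpga::fpga_invalidate
void cache_flush(void *, size_t) {}       // stand-in for fpga::fpga_flush

template <typename Fn>
void cpu_fallback(void *in, size_t in_bytes, void *out, size_t out_bytes,
                  Fn compute) {
  cache_invalidate(in, in_bytes);  // see the FPGA's freshly written fp32 data
  compute();                       // e.g. the CPU softmax functor
  cache_flush(out, out_bytes);     // publish CPU results to main memory
}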
......@@ -34,16 +34,18 @@ bool SplitKernel<FPGA, float>::Init(SplitParam<FPGA> *param) {
fpga::fpga_malloc(image_num * sizeof(float *)));
auto out_channels = reinterpret_cast<uint32_t *>(
fpga::fpga_malloc(image_num * sizeof(uint32_t)));
DLOG << "input: " << in;
for (int i = 0; i < image_num; i++) {
fpga::format_fp16_ofm(outs[i]);
images_out[i] = outs[i]->mutable_data<float>();
DLOG << "output: " << outs[i];
images_out[i] = outs[i]->mutable_data<half>();
scales_out[i] = outs[i]->scale;
out_channels[i] = (uint32_t)sections[i];
}
fpga::SplitArgs arg = {0};
arg.image_num = image_num;
arg.image_in = (half *)in->data<float>();
arg.image_in = in->data<half>();
arg.scale_in = in->scale;
arg.images_out = images_out;
arg.scales_out = scales_out;
......
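Editor's note: each split output above takes a fixed slice of channels from every pixel. A small sketch of the offset arithmetic implied by `sections`; this is stand-in code, not the fpga::SplitArgs API.

#include <cstdint>
#include <numeric>
#include <vector>

// Output i reads channels [offsets[i], offsets[i] + sections[i]) of
// every HWC pixel in the input.
std::vector<uint32_t> section_offsets(const std::vector<uint32_t> &sections) {
  if (sections.empty()) return {};
  std::vector<uint32_t> offsets(sections.size(), 0);
  std::partial_sum(sections.begin(), sections.end() - 1, offsets.begin() + 1);
  return offsets;
}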
......@@ -22,8 +22,10 @@ namespace operators {
template <>
bool TanhKernel<FPGA, float>::Init(TanhParam<FPGA> *param) {
auto input = const_cast<Tensor *>(param->InputX());
auto input_ptr = input->data<float>();
DLOG << "input: " << input;
auto input_ptr = input->data<half>();
auto float_input = new Tensor;
float_input->mutable_data<float>(
{1, input->dims()[1], input->dims()[2], input->dims()[3]});
fpga::format_fp32_ofm(float_input);
......
......@@ -20,7 +20,21 @@ namespace operators {
template <>
bool Transpose2Kernel<FPGA, float>::Init(Transpose2Param<FPGA> *param) {
param->Out()->ShareDataWith(*param->InputX());
auto input = param->InputX();
auto output = param->Out();
auto axis = param->Axis();
auto dim = input->dims();
output->ShareDataWith(*input);
auto dim_v = vectorize(dim);
for (size_t i = 0; i < axis.size(); i++) {
dim_v[i] = dim[axis[i]];
}
output->Resize(framework::make_ddim(dim_v));
DLOG << "input: " << input;
DLOG << "output: " << output;
return true;
}
......
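Editor's note: Transpose2 above shares the data buffer and only permutes the shape metadata, presumably because the FPGA-side layout already matches what the consumer expects. The permutation itself, as a standalone helper:

#include <cstdint>
#include <vector>

// out_dims[i] = in_dims[axis[i]]; the data pointer is untouched.
std::vector<int64_t> permute_dims(const std::vector<int64_t> &in_dims,
                                  const std::vector<int> &axis) {
  std::vector<int64_t> out_dims(in_dims.size());
  for (size_t i = 0; i < axis.size(); ++i) {
    out_dims[i] = in_dims[axis[i]];
  }
  return out_dims;
}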
......@@ -1172,6 +1172,12 @@ class FeedParam : public OpParam {
public:
FeedParam(const VariableNameMap &inputs, const VariableNameMap &outputs,
const AttributeMap &attrs, const Scope &scope) {
#ifdef PADDLE_MOBILE_FPGA
static int feed_num = 0;
auto new_name = std::string("feed") + std::to_string(feed_num++);
const_cast<VariableNameMap &>(inputs).at("X") = {string(new_name)};
#endif
input_x_ = InputXFrom<LoDTensor>(inputs, scope);
out_ = OutFrom<GType>(outputs, scope);
auto var = scope.FindVar("batch_size");
......@@ -1195,6 +1201,11 @@ class FetchParam : public OpParam {
public:
FetchParam(const VariableNameMap &inputs, const VariableNameMap &outputs,
const AttributeMap &attrs, const Scope &scope) {
#ifdef PADDLE_MOBILE_FPGA
static int fetch_num = 0;
auto new_name = std::string("fetch") + std::to_string(fetch_num++);
const_cast<VariableNameMap &>(outputs).at("Out") = {string(new_name)};
#endif
input_x_ = InputXFrom<GType>(inputs, scope);
out_ = OutFrom(outputs, scope);
}
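Editor's note: the #ifdef blocks above give every Feed/Fetch instance a unique variable name via a static counter, so several inputs and outputs can coexist in one FPGA program. A sketch of the scheme (simplified; in the real code the counters live inside the constructors):

#include <string>

// Produces "feed0", "feed1", ... across a process. The counter is
// never reset and is not thread-safe, which matches the original's
// single-threaded setup path.
std::string next_feed_name() {
  static int feed_num = 0;
  return "feed" + std::to_string(feed_num++);
}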
......@@ -1210,18 +1221,9 @@ class FetchParam : public OpParam {
RType *input_x_;
Tensor *out_;
#ifdef PADDLE_MOBILE_FPGA
private:
std::shared_ptr<RType> float_input_x_;
public:
fpga::BypassArgs fpga_bypass_args;
public:
RType *FloatInput() const {
return float_input_x_ == nullptr ? input_x_ : float_input_x_.get();
}
void SetFloatInput(Tensor *input) { float_input_x_.reset(input); }
const fpga::BypassArgs &FpgaArgs() const { return fpga_bypass_args; }
void SetFpgaArgs(const fpga::BypassArgs &args) { fpga_bypass_args = args; }
#endif
};
......
......@@ -51,8 +51,8 @@ void convert_to_chw(int16_t **data_in, int channel, int height, int width,
}
}
void dump(std::string filename, const Tensor input_tensor) {
auto dataptr = input_tensor.data<float>();
void dump(std::string filename, Tensor input_tensor) {
auto dataptr = reinterpret_cast<half *>(input_tensor.get_data());
std::ofstream out(filename.c_str());
float result = 0;
for (int i = 0; i < input_tensor.numel(); ++i) {
......@@ -61,16 +61,16 @@ void dump(std::string filename, const Tensor input_tensor) {
}
out.close();
}
void dump_stride(std::string filename, const Tensor input_tensor,
const int dumpnum) {
void dump_stride_half(std::string filename, Tensor input_tensor,
const int dumpnum) {
int c = (input_tensor.dims())[1];
int h = (input_tensor.dims())[2];
int w = (input_tensor.dims())[3];
auto data_ptr = input_tensor.data<float>();
int16_t *data_tmp = (int16_t *)malloc(c * h * w * sizeof(int16_t));
int16_t *data_ptr_16 = (int16_t *)data_ptr;
auto data_ptr = input_tensor.get_data();
auto *data_tmp =
reinterpret_cast<half *>(malloc(c * h * w * sizeof(int16_t)));
auto *data_ptr_16 = reinterpret_cast<half *>(data_ptr);
convert_to_chw(&data_ptr_16, c, h, w, data_tmp);
// const int16_t *dataptr = input_tensor.data<int16_t>();
std::ofstream out(filename.c_str());
float result = 0;
int stride = input_tensor.numel() / dumpnum;
......@@ -82,6 +82,20 @@ void dump_stride(std::string filename, const Tensor input_tensor,
out.close();
free(data_tmp);
}
void dump_stride_float(std::string filename, Tensor input_tensor,
const int dumpnum) {
auto data_ptr = reinterpret_cast<float *>(input_tensor.get_data());
std::ofstream out(filename.c_str());
float result = 0;
int stride = input_tensor.numel() / dumpnum;
stride = stride > 0 ? stride : 1;
for (int i = 0; i < input_tensor.numel(); i += stride) {
result = data_ptr[i];
out << result << std::endl;
}
out.close();
}
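Editor's note: both dump_stride_* helpers above share the same sampling idea: emit roughly dumpnum evenly spaced values so a large tensor produces a short, diffable log. Factored out as a sketch (assumes dumpnum > 0 and that T converts meaningfully to float, as the fp16 wrapper in this tree does):

#include <fstream>
#include <string>

template <typename T>
void dump_sampled(const std::string &filename, const T *data, int numel,
                  int dumpnum) {
  std::ofstream out(filename.c_str());
  int stride = numel / dumpnum;
  stride = stride > 0 ? stride : 1;  // tiny tensors are dumped in full
  for (int i = 0; i < numel; i += stride) {
    out << static_cast<float>(data[i]) << std::endl;
  }
}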
static const char *g_resnet50 = "../models/resnet50";
const std::string g_image_src_float = "../images/image_src_float";
int main() {
......@@ -98,24 +112,21 @@ int main() {
for (int i = 0; i < 73; i++) {
auto tensor_ptr = paddle_mobile.FetchResult(i);
std::string saveName = "resnet50_result_" + std::to_string(i);
paddle_mobile::fpga::fpga_invalidate((*tensor_ptr).data<float>(),
paddle_mobile::fpga::fpga_invalidate((*tensor_ptr).get_data(),
tensor_ptr->numel() * sizeof(half));
dump_stride(saveName, (*tensor_ptr), 20);
dump_stride_half(saveName, (*tensor_ptr), 20);
// dump(saveName, (*tensor_ptr));
}
std::shared_ptr<Tensor> output_tensor = paddle_mobile.FetchResult(73);
//(*output_tensor).dump<float>("resnet50_result_73");
output_tensor = paddle_mobile.FetchResult(74);
//(*output_tensor).dump<float>("resnet50_result_74");
// std::shared_ptr<Tensor> output_tensor = paddle_mobile.FetchResult(74);
// output_tensor = paddle_mobile.FetchResult(74);
auto tensor_ptr = paddle_mobile.FetchResult(73);
dump_stride_float("resnet50_result_73", (*tensor_ptr), 20);
tensor_ptr = paddle_mobile.FetchResult(74);
dump_stride_float("resnet50_result_74", (*tensor_ptr), 9999);
float max = 0;
auto data_ptr = output_tensor->data<float>();
auto data_ptr = tensor_ptr->data<float>();
int maximumIdx = 0;
for (int i = 0; i < (*output_tensor).numel(); i++) {
for (int i = 0; i < (*tensor_ptr).numel(); i++) {
if (data_ptr[i] > max) {
maximumIdx = i;
max = data_ptr[i];
......
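Editor's note: the truncated loop above is a plain argmax over the fp32 result; it starts from max = 0, so it implicitly assumes the winning score is positive. An equivalent without that assumption (n > 0 assumed):

#include <algorithm>

int argmax(const float *data, int n) {
  return static_cast<int>(std::max_element(data, data + n) - data);
}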
......@@ -23,29 +23,38 @@ limitations under the License. */
#include "fpga/V2/api.h"
#endif
// static const char *g_densebox_combine = "../models/densebox";
static const char *g_densebox_combine = "../models/rfcn";
void readStream(std::string filename, uint8_t *buf) {
std::ifstream in;
in.open(filename, std::ios::in | std::ios::binary);
if (!in.is_open()) {
std::cout << "Failed to open file." << std::endl;
return;
}
int i = 0;
// Use unformatted reads: operator>> skips whitespace-valued bytes
// (0x09, 0x0a, 0x20, ...) and would corrupt a binary blob.
char byte;
while (in.get(byte)) {
buf[i++] = static_cast<uint8_t>(byte);
}
in.close();
}
static const char *g_rfcn_combine = "../models/rfcn";
const std::string g_image_src_float = "../models/rfcn/data.bin";
int main() {
paddle_mobile::fpga::open_device();
paddle_mobile::PaddleMobile<paddle_mobile::FPGA> paddle_mobile;
// paddle_mobile.SetThreadNum(4);
if (paddle_mobile.Load(std::string(g_densebox_combine) + "/model",
std::string(g_densebox_combine) + "/params", true,
false, 1, true)) {
// std::vector<float> input;
// std::vector<int64_t> dims{1, 3, 512, 1024};
// GetInput<float>(g_test_image_1x3x224x224_banana, &input, dims);
// auto vec_result = paddle_mobile.Predict(input, dims);
return 0;
Tensor input_tensor;
SetupTensor<float>(&input_tensor, {1, 3, 512, 1024}, static_cast<float>(0),
static_cast<float>(1));
// readStream(g_image_src_float,
// input_tensor.mutable_data<float>({1, 3, 224, 224}));
paddle_mobile.FeedData(input_tensor);
if (paddle_mobile.Load(std::string(g_rfcn_combine) + "/model",
std::string(g_rfcn_combine) + "/params", true, false,
1, true)) {
float img_info[3] = {768, 1536, 768.0f / 960.0f};
auto img = fpga::fpga_malloc(768 * 1536 * 3 * sizeof(float));
readStream(g_image_src_float, reinterpret_cast<uint8_t *>(img));
std::vector<void *> v(3, nullptr);
paddle_mobile.FeedData({img_info, img});
paddle_mobile.Predict_To(-1);
paddle_mobile.GetResults(&v);
DLOG << "Computation done";
}
return 0;
......
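Editor's note: as far as this test shows, the RFCN feed takes two buffers: a three-float image descriptor and the raw image itself. A hypothetical restatement of that convention, with field meanings inferred from the 768/1536/960 constants above rather than from any documented API:

struct RfcnInput {
  float img_info[3];  // inferred: {input_height, input_width, resize_scale}
  float *image;       // HWC float image, input_height * input_width * 3 values
};

The scale term 768.0f / 960.0f appears to record how much the source image was resized to reach the network's input height.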