提交 16927084 编写于 作者: Z zhangyang0701

reconstruct code to support RFCN for FPGA track

上级 458183af
......@@ -28,11 +28,13 @@ void format_image(framework::Tensor *image_tensor) {
auto dims = image_tensor->dims();
auto channel = dims[1], height = dims[2], width = dims[3];
auto data_ptr = image_tensor->data<float>();
size_t memory_size = channel * height * width * sizeof(float);
auto new_data = (float *)fpga_malloc(memory_size); // NOLINT
fpga_copy(new_data, data_ptr, memory_size);
image::format_image(&new_data, channel, height, width);
image_tensor->reset_data_ptr(new_data);
auto external_ptr = reinterpret_cast<float *>(image_tensor->external_data);
float *p_data = external_ptr == nullptr ? data_ptr : external_ptr;
float *old_p = p_data;
image::format_image(&p_data, channel, height, width);
if (old_p != p_data) {
image_tensor->reset_data_ptr(p_data);
}
}
void format_fp16_ofm(framework::Tensor *ofm_tensor) {
......@@ -50,6 +52,7 @@ void format_fp16_ofm(framework::Tensor *ofm_tensor) {
auto p = fpga_malloc(memory_size);
memset(p, 0, memory_size);
ofm_tensor->reset_data_ptr(p);
ofm_tensor->set_type(typeid(half));
}
void format_fp16_ofm(framework::Tensor *ofm_tensor, framework::DDim dims) {
......@@ -67,6 +70,7 @@ void format_fp16_ofm(framework::Tensor *ofm_tensor, framework::DDim dims) {
auto p = fpga_malloc(memory_size);
memset(p, 0, memory_size);
ofm_tensor->reset_data_ptr(p);
ofm_tensor->set_type(typeid(half));
}
void format_fp32_ofm(framework::Tensor *ofm_tensor) {
auto dims = ofm_tensor->dims();
......@@ -83,6 +87,7 @@ void format_fp32_ofm(framework::Tensor *ofm_tensor) {
auto p = fpga_malloc(memory_size);
memset(p, 0, memory_size);
ofm_tensor->reset_data_ptr(p);
ofm_tensor->set_type(typeid(float));
}
float filter_find_max(framework::Tensor *filter_tensor) {
......@@ -139,6 +144,7 @@ void format_filter(framework::Tensor *filter_tensor, float max_value,
filter::format_filter(&new_data, num, channel, height, width, group_num,
max_value);
filter_tensor->reset_data_ptr(new_data);
filter_tensor->set_type(typeid(int8_t));
}
void format_dwconv_filter(framework::Tensor *filter_tensor, float *scale_ptr) {
auto dims = filter_tensor->dims();
......@@ -149,6 +155,7 @@ void format_dwconv_filter(framework::Tensor *filter_tensor, float *scale_ptr) {
fpga_copy(new_data, data_ptr, memory_size);
filter::format_dwconv_filter(&new_data, num, height, width, scale_ptr);
filter_tensor->reset_data_ptr(new_data);
filter_tensor->set_type(typeid(int8_t));
}
void format_DWDconv_filter(framework::Tensor *filter_tensor, float *scale_ptr,
......@@ -173,6 +180,7 @@ void format_DWDconv_filter(framework::Tensor *filter_tensor, float *scale_ptr,
// framework::make_ddim({num, 1, height, width});
// filter_tensor->Resize(dims_new);
filter_tensor->reset_data_ptr(new_data);
filter_tensor->set_type(typeid(int8_t));
}
void format_fc_filter(framework::Tensor *filter_tensor, float max_value) {
......@@ -187,6 +195,7 @@ void format_fc_filter(framework::Tensor *filter_tensor, float max_value) {
filter::format_fc_filter(&new_data, num, channel, height, width, 1,
max_value);
filter_tensor->reset_data_ptr(new_data);
filter_tensor->set_type(typeid(int8_t));
}
void format_deconv_filter(framework::Tensor *filter_tensor, float max_value,
int group_num, int stride) {
......@@ -213,6 +222,7 @@ void format_deconv_filter(framework::Tensor *filter_tensor, float max_value,
framework::make_ddim({num, channel, height, width});
filter_tensor->Resize(dims_new);
filter_tensor->reset_data_ptr(new_data);
filter_tensor->set_type(typeid(int8_t));
}
void format_bias_scale_array(float **bias_scale_array,
......@@ -236,6 +246,7 @@ void format_concat_output(framework::Tensor *out, int height, int width,
auto ddim = framework::make_ddim({1, sum_channel, height, width});
out->Resize(ddim);
out->reset_data_ptr(data_ptr);
out->set_type(typeid(half));
}
void format_conv_data(framework::Tensor *filter_tensor,
framework::Tensor *ofm_tensor, float **bs_ptr,
......@@ -447,9 +458,9 @@ void fill_split_arg(struct SplitConvArgs *arg, framework::Tensor *input,
int16_t leaky_relu_negative_slope, int group_num,
int stride_h, int stride_w, int padding_h, int padding_w,
float *bs_ptr) {
auto input_ptr = input->data<float>();
auto filter_ptr = filter->data<float>();
auto out_ptr = out->data<float>();
auto input_ptr = input->data<half>();
auto filter_ptr = filter->data<int8_t>();
auto out_ptr = out->data<half>();
auto deleter = [](void *p) { fpga_free(p); };
arg->group_num = (uint32_t)group_num;
......@@ -571,8 +582,8 @@ void fill_deconv_arg(struct DeconvArgs *arg, framework::Tensor *input,
int16_t leaky_relu_negative_slope, int group_num,
int stride_h, int stride_w, int padding_h, int padding_w,
float *bs_ptr) {
auto input_ptr = input->data<float>();
auto filter_ptr = filter->data<float>();
auto input_ptr = input->data<half>();
auto filter_ptr = filter->data<int8_t>();
auto deleter = [](void *p) { fpga_free(p); };
arg->group_num = (uint32_t)group_num;
......@@ -603,7 +614,7 @@ void fill_deconv_arg(struct DeconvArgs *arg, framework::Tensor *input,
framework::DDim dims_out_new = framework::make_ddim(
{1, arg->filter_num, sub_output_height * sub_conv_num, real_out_width});
fpga::format_fp16_ofm(out, dims_out_new);
auto out_ptr = out->data<float>();
auto out_ptr = out->data<half>();
arg->output.address =
(half *)out_ptr + // NOLINT
omit_size * sizeof(half) *
......@@ -793,7 +804,7 @@ void fill_deconv_arg(struct DeconvArgs *arg, framework::Tensor *input,
arg->split_conv_args[i]->conv_arg[j].output.scale_address),
deleter));
}
arg->split_conv_args[i]->concat_arg.images_in[j] = static_cast<int16_t *>(
arg->split_conv_args[i]->concat_arg.images_in[j] = static_cast<half *>(
arg->split_conv_args[i]->conv_arg[j].output.address);
arg->split_conv_args[i]->concat_arg.scales_in[j] =
arg->split_conv_args[i]->conv_arg[j].output.scale_address;
......@@ -818,9 +829,9 @@ void fill_dwconv_arg(struct DWconvArgs *arg, framework::Tensor *input,
int16_t leaky_relu_negative_slope, int stride_h,
int stride_w, int padding_h, int padding_w,
float *bias_ptr) {
auto filter_ptr = filter->data<float>();
auto input_ptr = input->data<float>();
auto output_ptr = out->mutable_data<float>();
auto filter_ptr = filter->data<uint8_t>();
auto input_ptr = input->data<half>();
auto output_ptr = out->mutable_data<half>();
arg->sub_conv_num = 1;
// arg->relu_enabled = relu_enabled;
arg->output.activation.activation_type = activation_enable;
......@@ -848,9 +859,8 @@ void fill_DWDeconv_arg(struct DWDeconvArgs *arg, framework::Tensor *input,
int16_t leaky_relu_negative_slope, int stride_h,
int stride_w, int padding_h, int padding_w,
float *bias_ptr) {
auto filter_ptr = filter->data<float>();
auto input_ptr = input->data<float>();
auto output_ptr = out->mutable_data<float>();
auto filter_ptr = filter->data<int8_t>();
auto input_ptr = input->data<half>();
auto deleter = [](void *p) { fpga_free(p); };
......@@ -885,7 +895,7 @@ void fill_DWDeconv_arg(struct DWDeconvArgs *arg, framework::Tensor *input,
framework::DDim dims_out_new = framework::make_ddim(
{1, arg->filter_num, real_out_height, real_out_width});
fpga::format_fp16_ofm(out, dims_out_new);
auto out_ptr = out->data<float>();
auto out_ptr = out->data<half>();
/*====For Addition
arg->output.address =
......
......@@ -22,7 +22,6 @@ namespace fpga {
namespace image {
void convert_to_hwc(float **data_in, int channel, int height, int width) {
float *tmp = *data_in;
float *data_tmp =
(float *)fpga_malloc(channel * height * width * sizeof(float)); // NOLINT
int64_t amount_per_row = width * channel;
......@@ -35,33 +34,35 @@ void convert_to_hwc(float **data_in, int channel, int height, int width) {
}
}
*data_in = data_tmp;
fpga_free(tmp);
}
void align_element_conv(float **data_in, int height, int cw) {
int h = 0;
int align_cw = align_to_x(cw, IMAGE_ALIGNMENT);
if (align_cw != cw) {
float *tmp = *data_in;
float *data_tmp =
(float *)fpga_malloc(height * align_cw * sizeof(float)); // NOLINT
memset(data_tmp, 0, height * align_cw * sizeof(float));
float *data_tmp =
(float *)fpga_malloc(height * align_cw * sizeof(float)); // NOLINT
for (h = 0; h < height; h++) {
memcpy((void *)(data_tmp + h * align_cw), // NOLINT
(void *)(*data_in + h * cw), // NOLINT
cw * sizeof(float));
}
memset(data_tmp, 0, height * align_cw * sizeof(float));
*data_in = data_tmp;
fpga_free(tmp);
for (h = 0; h < height; h++) {
memcpy((void *)(data_tmp + h * align_cw), // NOLINT
(void *)(*data_in + h * cw), // NOLINT
cw * sizeof(float));
}
*data_in = data_tmp;
}
void format_image(float **data_in, int channel, int height, int width) {
convert_to_hwc(data_in, channel, height, width);
align_element_conv(data_in, height, channel * width);
int cw = channel * width;
int align_cw = align_to_x(cw, IMAGE_ALIGNMENT);
if (align_cw != cw) {
float *hwc_temp = *data_in;
align_element_conv(data_in, height, channel * width);
fpga_free(hwc_temp);
}
fpga_flush(*data_in, align_to_x(channel * width, IMAGE_ALIGNMENT) * height *
sizeof(float));
}
......
......@@ -164,7 +164,7 @@ void fpga_free(void *ptr) {
// DLOG << "Address: " << ptr << ", " << size << " bytes freed. Total "
// << counter << " bytes";
} else {
DLOG << "Invalid pointer";
DLOG << "Address: " << ptr << " Invalid pointer";
}
}
void fpga_copy(void *dest, const void *src, size_t num) {
......
......@@ -19,17 +19,16 @@ limitations under the License. */
#include <memory>
#include <vector>
namespace paddle_mobile {
namespace fpga {
#ifdef PADDLE_MOBILE_FPGA_V1
#define IMAGE_ALIGNMENT 16 // Aligned to 16
#define FILTER_NUM_ALIGNMENT 32 // Filter number aligned to 32
#define FILTER_ELEMENT_ALIGNMENT 16 // Filter element number aligned to 16
#define BS_NUM_ALIGNMENT 8
#define BIAS_NUM_ALIGNMENT 16
#define IMAGE_ALIGNMENT (16) // Aligned to 16
#define FILTER_NUM_ALIGNMENT (32) // Filter number aligned to 32
#define FILTER_ELEMENT_ALIGNMENT (16) // Filter element number aligned to 16
#define BS_NUM_ALIGNMENT (8)
#define BIAS_NUM_ALIGNMENT (16)
#endif
namespace paddle_mobile {
namespace fpga {
enum DataType {
DATA_TYPE_FP32 = 1,
DATA_TYPE_FP16 = 0,
......@@ -49,7 +48,7 @@ enum ActivationType {
};
struct ActivationArgs {
enum ActivationType activation_type;
enum ActivationType activation_type = NONE;
int16_t leaky_relu_negative_slope;
};
......
......@@ -84,6 +84,11 @@ Executor<Device, T>::Executor(const Program<Device> &program,
InitMemory();
}
#ifdef PADDLE_MOBILE_FPGA
program_.scope->EraseVars({"feed", "fetch"});
program_.scope->print_vars();
#endif
int count = 0;
for (int block_id = 0; block_id < ops_of_block_.size(); ++block_id) {
for (auto &op_handler : ops_of_block_[block_id]) {
......@@ -92,14 +97,6 @@ Executor<Device, T>::Executor(const Program<Device> &program,
ops_list_.push_back(op_handler);
}
}
#ifdef PADDLE_MOBILE_FPGA
TalorFeedOp();
DLOG << "TalorFeed finished";
TalorFetchdOp();
DLOG << "TalorFetch finished";
program_.scope->print_vars();
#endif
}
template <typename T>
......@@ -451,49 +448,6 @@ std::shared_ptr<LoDTensor> Executor<Device, T>::GetOutput(
}
#ifdef PADDLE_MOBILE_FPGA
template <typename Device, typename T>
void Executor<Device, T>::TalorFeedOp() {
auto &ops = ops_of_block_[0];
int num = 0;
program_.scope->EraseVars(std::vector<string>{string("feed")});
for (auto op : ops) {
if (op->Type() == "feed") {
auto new_name = string("feed") + std::to_string(num++);
auto var = program_.scope->Var(new_name);
auto tensor = var->template GetMutable<LoDTensor>();
auto output_map = op->Outputs();
std::vector<std::string> out_keys = op->GetOutKeys();
PADDLE_MOBILE_ENFORCE(!out_keys.empty(), "this op contains no output");
auto output_tensor =
GetVarValue<LoDTensor>(out_keys[0], output_map, *(program_.scope));
tensor->Resize(output_tensor->dims());
tensor->init(typeid(float));
op->ChangeNameMap("X", std::vector<string>{new_name});
}
}
}
template <typename Device, typename T>
void Executor<Device, T>::TalorFetchdOp() {
auto &ops = ops_of_block_[0];
int num = 0;
program_.scope->EraseVars(std::vector<string>{string("fetch")});
for (auto op : ops) {
if (op->Type() == "fetch") {
auto new_name = string("fetch") + std::to_string(num++);
auto var = program_.scope->Var(new_name);
auto tensor = var->template GetMutable<LoDTensor>();
auto input_map = op->Inputs();
std::vector<std::string> in_keys = op->GetInputKeys();
PADDLE_MOBILE_ENFORCE(!in_keys.empty(), "this op contains no input");
auto input_tensor =
GetVarValue<LoDTensor>(in_keys[0], input_map, *(program_.scope));
tensor->Resize(input_tensor->dims());
tensor->init(typeid(float));
op->ChangeNameMap("Out", std::vector<string>{new_name});
}
}
}
template <typename Device, typename T>
void Executor<Device, T>::InjectVariable(const Tensor &t,
std::string var_name) {
......@@ -509,18 +463,29 @@ void Executor<Device, T>::FeedData(const Tensor &t) {
}
template <typename Device, typename T>
void Executor<Device, T>::FeedData(const std::vector<Tensor> &v) {
void Executor<Device, T>::FeedData(const std::vector<void *> &v) {
auto input_size = v.size();
PADDLE_MOBILE_ENFORCE(input_size > 0, "Empty input");
int counter = 0;
auto vars = program_.scope->VarContain("feed");
for (auto var : vars) {
Tensor *feed_tensor = var->template GetMutable<LoDTensor>();
feed_tensor->Resize(v[counter].dims());
feed_tensor->ShareDataWith(v[counter]);
if (++counter > v.size()) {
return;
}
PADDLE_MOBILE_ENFORCE(input_size == vars.size(),
"input data number not correct");
for (int i = 0; i < input_size; i++) {
auto var = program_.scope->Var("feed", i);
auto feed_tensor = var->template GetMutable<LoDTensor>();
feed_tensor->external_data = v[i];
}
}
// Collects the raw output pointers of every "fetch" variable in the scope.
// The caller provides a pre-sized vector; slot i receives the data pointer of
// fetch variable i. Pointer count must match the number of fetch vars.
template <typename Device, typename T>
void Executor<Device, T>::GetResults(std::vector<void *> *v) {
  const auto num_outputs = v->size();
  PADDLE_MOBILE_ENFORCE(num_outputs > 0, "Empty output");
  auto fetch_vars = program_.scope->VarContain("fetch");
  PADDLE_MOBILE_ENFORCE(num_outputs == fetch_vars.size(),
                        "output data number not correct");
  for (int i = 0; i < static_cast<int>(num_outputs); ++i) {
    auto *var = program_.scope->Var("fetch", i);
    auto *fetch_tensor = var->template GetMutable<LoDTensor>();
    (*v)[i] = fetch_tensor->template data<float>();
  }
}
......
......@@ -50,11 +50,10 @@ class Executor {
std::shared_ptr<LoDTensor> GetOutput(const std::string &var_name);
#ifdef PADDLE_MOBILE_FPGA
void TalorFeedOp();
void TalorFetchdOp();
void InjectVariable(const Tensor &t, std::string var_name);
void FeedData(const Tensor &t);
void FeedData(const std::vector<Tensor> &v);
void FeedData(const std::vector<void *> &v);
void GetResults(std::vector<void *> *v);
std::shared_ptr<Tensor> FetchResult(int id = -1);
void Predict_From_To(int start = 0, int end = -1);
void Predict_From(int start);
......
......@@ -50,6 +50,9 @@ OperatorBase<Dtype>::OperatorBase(const std::string &type,
attrs_(attrs),
scope_(scope) {
CheckAllInputOutputSet();
#ifdef PADDLE_MOBILE_FPGA
InsertTensors();
#endif
}
template <typename Dtype>
......@@ -133,15 +136,19 @@ void OperatorBase<GPU_CL>::Run() {
#ifdef PADDLE_MOBILE_FPGA
template <typename Dtype>
void OperatorBase<Dtype>::ChangeNameMap(string key, std::vector<string> value) {
auto it = inputs_.find(key);
if (it != inputs_.end()) {
inputs_[key] = value;
return;
}
it = outputs_.find(key);
if (it != outputs_.end()) {
inputs_[key] = value;
void OperatorBase<Dtype>::InsertTensors() {
static int feed_num = 0;
static int fetch_num = 0;
if (type_ == "feed") {
auto new_name = string("feed") + std::to_string(feed_num++);
auto var = scope_->Var(new_name);
var->template GetMutable<framework::LoDTensor>();
inputs_.at("X") = {string(new_name)};
} else if (type_ == "fetch") {
auto new_name = string("fetch") + std::to_string(fetch_num++);
auto var = scope_->Var(new_name);
var->template GetMutable<framework::LoDTensor>();
outputs_.at("Out") = {string(new_name)};
}
}
#endif
......
......@@ -79,6 +79,7 @@ class OperatorBase {
}
}
#ifdef PADDLE_MOBILE_FPGA
void InsertTensors();
void ChangeNameMap(string key, std::vector<string> value);
#endif
protected:
......@@ -95,6 +96,7 @@ class OperatorBase {
template <typename Dtype, typename ParamType, typename KernelType>
class OperatorWithKernel : public OperatorBase<Dtype> {
public:
#ifndef PADDLE_MOBILE_FPGA1
OperatorWithKernel(const std::string &type, const VariableNameMap &inputs,
const VariableNameMap &outputs, const AttributeMap &attrs,
std::shared_ptr<Scope> scope)
......@@ -104,7 +106,25 @@ class OperatorWithKernel : public OperatorBase<Dtype> {
kernel_.InitCLHelper(scope->GetCLScpoe());
#endif
}
#else
OperatorWithKernel(const std::string &type, const VariableNameMap inputs,
const VariableNameMap &outputs, const AttributeMap &attrs,
std::shared_ptr<Scope> scope)
: OperatorBase<Dtype>(type, inputs, outputs, attrs, scope) {
static int feed_num = 0;
static int fetch_num = 0;
if (type == "feed") {
auto new_name = string("feed") + std::to_string(feed_num++);
auto var = scope->Var(new_name);
(const_cast<VariableNameMap &>(inputs)).at("X") = {string(new_name)};
} else if (type == "fetch") {
auto new_name = string("fetch") + std::to_string(fetch_num++);
auto var = scope->Var(new_name);
(const_cast<VariableNameMap &>(outputs)).at("Out") = {string(new_name)};
}
param_ = ParamType(inputs, outputs, attrs, *scope);
}
#endif
virtual void RunImpl() { this->kernel_.Compute(this->param_); }
virtual void InferShape() const = 0;
......
......@@ -202,6 +202,10 @@ class Tensor : public TensorBase {
inline void reset_data_ptr(void *p) {
((PlaceholderImpl *)(holder_.get()))->ptr_.reset((uint8_t *)p); // NOLINT
}
inline void set_type(std::type_index type) { holder_->set_type(type); }
inline void *get_data() {
return (void *)(((PlaceholderImpl *)(holder_.get()))->ptr_.get());
} // NOLINT
inline void *init(std::type_index type) {
if (holder_ != nullptr) {
......@@ -217,7 +221,8 @@ class Tensor : public TensorBase {
reinterpret_cast<uintptr_t>(holder_->ptr()) + offset_);
}
float scale[2]; // scale[0]= MAX/127.0, scale[1]= 127.0/MAX
float scale[2]; // scale[0]= MAX/127.0, scale[1]= 127.0/MAX
void *external_data = nullptr; // only used for Feed
#endif
};
......
......@@ -177,6 +177,23 @@ bool PaddleMobilePredictor<Device, T>::Run(
return true;
}
// Thin forwarding wrapper: hands the raw input pointers straight to the
// underlying PaddleMobile engine (FPGA feed path).
template <typename Device, typename T>
void PaddleMobilePredictor<Device, T>::FeedData(
    const std::vector<void *> &inputs) {
  paddle_mobile_->FeedData(inputs);
}
// Thin forwarding wrapper: fills |outputs| with the engine's fetch-tensor
// data pointers. See Executor::GetResults for the size contract.
template <typename Device, typename T>
void PaddleMobilePredictor<Device, T>::GetResults(
    std::vector<void *> *outputs) {
  paddle_mobile_->GetResults(outputs);
}
// Thin forwarding wrapper: runs the op list from index |start| up to |end|
// (end == -1 presumably means "through the last op" — confirm in Executor).
template <typename Device, typename T>
void PaddleMobilePredictor<Device, T>::Predict_From_To(int start, int end) {
  paddle_mobile_->Predict_From_To(start, end);
}
#endif
template <typename Device, typename T>
PaddleMobilePredictor<Device, T>::~PaddleMobilePredictor() {
......
......@@ -35,6 +35,9 @@ class PaddleMobilePredictor : public PaddlePredictor {
bool Run(const std::vector<PaddleTensor>& inputs,
std::vector<PaddleTensor>* output_data, std::vector<int>* index_data,
int batch_size = -1) override;
void FeedData(const std::vector<void*>& inputs) override;
void GetResults(std::vector<void*>* outputs) override;
void Predict_From_To(int start = 0, int end = -1) override;
#endif
~PaddleMobilePredictor() override;
......
......@@ -119,6 +119,9 @@ class PaddlePredictor {
virtual bool Run(const std::vector<PaddleTensor>& inputs,
std::vector<PaddleTensor>* output_data,
std::vector<int>* index_data, int batch_size = -1) = 0;
virtual void FeedData(const std::vector<void*>& inputs) = 0;
virtual void GetResults(std::vector<void*>* outputs) = 0;
virtual void Predict_From_To(int start = 0, int end = -1) = 0;
#endif
protected:
......
......@@ -228,10 +228,14 @@ void PaddleMobile<Device, T>::FeedData(const framework::Tensor &t) {
executor_->FeedData(t);
}
template <typename Device, typename T>
void PaddleMobile<Device, T>::FeedData(
const std::vector<framework::Tensor> &v) {
void PaddleMobile<Device, T>::FeedData(const std::vector<void *> &v) {
executor_->FeedData(v);
};
// Thin forwarding wrapper over Executor::GetResults (FPGA output retrieval).
template <typename Device, typename T>
void PaddleMobile<Device, T>::GetResults(std::vector<void *> *v) {
  executor_->GetResults(v);
}
template <typename Device, typename T>
std::shared_ptr<framework::Tensor> PaddleMobile<Device, T>::FetchResult(
int id) {
......
......@@ -90,7 +90,8 @@ class PaddleMobile {
#ifdef PADDLE_MOBILE_FPGA
void InjectVariable(const framework::Tensor &t, std::string var_name);
void FeedData(const framework::Tensor &t);
void FeedData(const std::vector<framework::Tensor> &v);
void FeedData(const std::vector<void *> &v);
void GetResults(std::vector<void *> *v);
std::shared_ptr<framework::Tensor> FetchResult(int id = -1);
void Predict_From_To(int start = 0, int end = -1);
void Predict_From(int start);
......
......@@ -103,6 +103,10 @@ class ProposalParam : public OpParam {
float nms_thresh_;
float min_size_;
float eta_;
#ifdef PADDLE_MOBILE_FPGA
std::shared_ptr<Tensor> float_score, float_bbox;
fpga::BypassArgs score_arg, bbox_arg;
#endif
};
DECLARE_KERNEL(Proposal, ProposalParam);
......@@ -133,6 +137,10 @@ class PSRoiPoolParam : public OpParam {
int pooled_height_;
int pooled_width_;
float spatial_scale_;
#ifdef PADDLE_MOBILE_FPGA
std::shared_ptr<Tensor> float_input, float_output;
fpga::BypassArgs input_arg, output_arg;
#endif
};
DECLARE_KERNEL(PSRoiPool, PSRoiPoolParam);
......
......@@ -23,15 +23,46 @@ namespace operators {
// Pre-computes all anchor boxes for the input feature map on the host.
// For each feature-map cell (h, w) and each of the baked-in anchor shapes,
// writes (x_min, y_min, x_max, y_max) shifted by the cell's stride offset
// into the output tensor, laid out as [H][W][num_anchors][4].
template <>
bool AnchorGeneratorKernel<FPGA, float>::Init(
    AnchorGeneratorParam<FPGA> *param) {
  auto input = param->input_;
  auto anchors = param->output_anchors_;
  auto anchor_ptr = anchors->mutable_data<float>();
  auto stride = param->stride_;
  auto feature_width = input->dims()[3], feature_height = input->dims()[2];
  auto stride_width = stride[0], stride_height = stride[1];
  // Baked-in anchor shapes, 4 ints per anchor: (x_min, y_min, x_max, y_max)
  // relative to the cell origin.
  int anchors_offset[] = {-2,   -2,   18,  18,  -10,  -9,   26,   25,   -23,
                          -20,  39,   36,  -43, -34,  59,   49,   -63,  -54,
                          79,   69,   -96, -77, 112,  93,   -137, -118, 153,
                          134,  -204, -188, 220, 204, -281, -395, 296,  441};
  int num_anchors = sizeof(anchors_offset) / (sizeof(int) * 4);
  for (int h_idx = 0; h_idx < feature_height; ++h_idx) {
    for (int w_idx = 0; w_idx < feature_width; ++w_idx) {
      // BUG FIX: was `h_idx * w_idx * num_anchors * 4`, which maps the whole
      // first row and first column to offset 0 and aliases distinct cells.
      // The correct row-major cell offset is (h_idx * W + w_idx).
      int offset = (h_idx * feature_width + w_idx) * num_anchors * 4;
      for (int idx = 0; idx < num_anchors; idx++) {
        // BUG FIX: was `anchor_ptr[offset + 0..3]` for every idx, so each
        // anchor overwrote the previous one; each anchor needs its own
        // 4-element slot at offset + idx * 4.
        anchor_ptr[offset + idx * 4 + 0] =
            anchors_offset[idx * 4 + 0] + w_idx * stride_width;
        anchor_ptr[offset + idx * 4 + 1] =
            anchors_offset[idx * 4 + 1] + h_idx * stride_height;
        anchor_ptr[offset + idx * 4 + 2] =
            anchors_offset[idx * 4 + 2] + w_idx * stride_width;
        anchor_ptr[offset + idx * 4 + 3] =
            anchors_offset[idx * 4 + 3] + h_idx * stride_height;
      }
    }
  }
  return true;
}
template <>
void AnchorGeneratorKernel<FPGA, float>::Compute(
const AnchorGeneratorParam<FPGA> &param) {
// TODO(hjchen2)
}
const AnchorGeneratorParam<FPGA> &param) {}
} // namespace operators
} // namespace paddle_mobile
......
......@@ -38,7 +38,7 @@ bool ConcatKernel<FPGA, float>::Init(ConcatParam<FPGA> *param) {
PADDLE_MOBILE_ENFORCE(
input->dims()[2] == height && input->dims()[3] == width,
"Image height & width should be unified");
images_in[i] = (half *)input->data<float>(); // NOLINT
images_in[i] = input->data<half>();
channel_num[i] = (uint32_t)inputs[i]->dims()[1]; // NOLINT
scales_in[i] = input->scale;
}
......@@ -48,7 +48,7 @@ bool ConcatKernel<FPGA, float>::Init(ConcatParam<FPGA> *param) {
concatArgs.image_num = image_num;
concatArgs.images_in = images_in;
concatArgs.scales_in = scales_in;
concatArgs.image_out = (half *)out->data<float>(); // NOLINT
concatArgs.image_out = out->data<half>();
concatArgs.scale_out = out->scale;
concatArgs.channel_num = channel_num;
concatArgs.height = height;
......
......@@ -27,10 +27,10 @@ bool ElementwiseAddKernel<FPGA, float>::Init(ElementwiseAddParam<FPGA> *param) {
auto *input_x = const_cast<LoDTensor *>(param->InputX());
auto *input_y = const_cast<LoDTensor *>(param->InputY());
auto *out = param->Out();
auto input_x_ptr = input_x->data<float>();
auto input_y_ptr = input_y->data<float>();
auto input_x_ptr = input_x->data<half>();
auto input_y_ptr = input_y->data<half>();
fpga::format_fp16_ofm(out);
auto out_ptr = out->mutable_data<float>();
auto out_ptr = out->mutable_data<half>();
fpga::EWAddArgs ewaddArgs = {0};
// ewaddArgs.relu_enabled = relu_enabled;
......
......@@ -28,10 +28,10 @@ bool ElementwiseAddReluKernel<FPGA, float>::Init(
auto *input_x = const_cast<LoDTensor *>(param->InputX());
auto *input_y = const_cast<LoDTensor *>(param->InputY());
auto *out = param->Out();
auto input_x_ptr = input_x->data<float>();
auto input_y_ptr = input_y->data<float>();
auto input_x_ptr = input_x->data<half>();
auto input_y_ptr = input_y->data<half>();
fpga::format_fp16_ofm(out);
auto out_ptr = out->mutable_data<float>();
auto out_ptr = out->mutable_data<half>();
fpga::EWAddArgs ewaddArgs = {0};
// ewaddArgs.relu_enabled = relu_enabled;
......
......@@ -19,19 +19,35 @@ namespace operators {
template <>
bool FeedKernel<FPGA, float>::Init(FeedParam<FPGA> *param) {
Tensor *output = param->Out();
auto output = param->Out();
auto input = const_cast<LoDTensor *>(param->InputX());
input->init(typeid(float));
input->Resize(output->dims());
if (output->dims().size() != 4) {
auto input_ptr = input->mutable_data<float>();
size_t size = output->numel() * sizeof(float);
auto p = fpga::fpga_malloc(size);
memcpy(p, input_ptr, size);
output->reset_data_ptr(p);
return true;
}
fpga::format_fp16_ofm(output);
return true;
}
template <>
void FeedKernel<FPGA, float>::Compute(const FeedParam<FPGA> &param) {
auto input =
reinterpret_cast<Tensor *>(const_cast<LoDTensor *>(param.InputX()));
auto output = param.Out();
auto input = const_cast<LoDTensor *>(param.InputX());
if (input->dims().size() != 4) {
return;
}
fpga::format_image(input);
auto input_ptr = input->data<float>();
Tensor *output = param.Out();
auto output_ptr = output->data<float>();
auto output_ptr = output->data<half>();
fpga::BypassArgs args = {fpga::DATA_TYPE_FP32};
......@@ -39,7 +55,7 @@ void FeedKernel<FPGA, float>::Compute(const FeedParam<FPGA> &param) {
args.output_data_type = fpga::DATA_TYPE_FP16;
args.input_layout_type = fpga::LAYOUT_CHW;
args.output_layout_type = fpga::LAYOUT_HWC;
args.image.address = reinterpret_cast<void *>(input_ptr);
args.image.address = input_ptr;
args.image.channels = (uint32_t)input->dims()[1];
args.image.height = (uint32_t)input->dims()[2];
args.image.width = (uint32_t)input->dims()[3];
......@@ -48,6 +64,8 @@ void FeedKernel<FPGA, float>::Compute(const FeedParam<FPGA> &param) {
args.output.address = output_ptr;
args.output.scale_address = output->scale;
fpga::PerformBypass(args);
input->external_data = nullptr;
}
template class FeedKernel<FPGA, float>;
......
......@@ -19,20 +19,15 @@ namespace operators {
template <>
bool FetchKernel<FPGA, float>::Init(FetchParam<FPGA> *param) {
Tensor *output = param->Out();
// fpga::format_fp16_ofm(output);
return true;
}
template <>
void FetchKernel<FPGA, float>::Compute(const FetchParam<FPGA> &param) {
param.Out()->ShareDataWith(*(param.InputX()));
/*auto input =
reinterpret_cast<Tensor *>(const_cast<Tensor *>(param.InputX()));
fpga::format_image(input);
auto input_ptr = input->data<float>();
Tensor *output = param.Out();
auto output_ptr = output->data<float>();
auto input = const_cast<Tensor *>(param->InputX());
auto output = param->Out();
if (input->type() == typeid(float)) {
output->ShareDataWith(*input);
return true;
}
output->init(typeid(float));
output->Resize(input->dims());
fpga::format_fp32_ofm(output);
fpga::BypassArgs args = {fpga::DATA_TYPE_FP16};
......@@ -40,13 +35,28 @@ void FetchKernel<FPGA, float>::Compute(const FetchParam<FPGA> &param) {
args.output_data_type = fpga::DATA_TYPE_FP32;
args.input_layout_type = fpga::LAYOUT_CHW;
args.output_layout_type = fpga::LAYOUT_HWC;
args.image.address = reinterpret_cast<void *>(input_ptr);
args.image.channels = (uint32_t)input->dims()[1];
args.image.height = (input->dims().size() == 4) ? (uint32_t)input->dims()[2] :
1; args.image.width = (input->dims().size() == 4) ? (uint32_t)input->dims()[3]
: 1; args.image.pad_height = 0; args.image.pad_width = 0; args.output.address
= output_ptr; args.output.scale_address = output->scale;
fpga::PerformBypass(args);*/
args.image.address = input->data<half>();
args.image.channels = (uint32_t)product(input->dims());
args.image.height = 1;
args.image.width = 1;
args.image.pad_height = 0;
args.image.pad_width = 0;
args.output.address = output->data<float>();
args.output.scale_address = output->scale;
param->fpga_bypass_args = args;
return true;
}
template <>
void FetchKernel<FPGA, float>::Compute(const FetchParam<FPGA> &param) {
auto input = param.InputX();
if (input->type() == typeid(float)) {
return;
}
fpga::PerformBypass(param.fpga_bypass_args);
// TODO: DEalign: get rid of extra 0
}
template class FetchKernel<FPGA, float>;
......
......@@ -22,10 +22,10 @@ namespace operators {
template <>
bool PoolKernel<FPGA, float>::Init(PoolParam<FPGA> *param) {
auto *input = const_cast<Tensor *>(param->Input());
auto input_ptr = input->data<float>();
auto input_ptr = input->data<half>();
Tensor *output = param->Output();
fpga::format_fp16_ofm(output);
auto output_ptr = output->mutable_data<float>();
auto output_ptr = output->mutable_data<half>();
vector<int> ksize = param->Ksize();
vector<int> strides = param->Strides();
vector<int> paddings = param->Paddings();
......
......@@ -14,20 +14,422 @@ limitations under the License. */
#ifdef PROPOSAL_OP
#include <algorithm>
#include <vector>
#include "operators/kernel/detection_kernel.h"
namespace paddle_mobile {
namespace operators {
static const double kBBoxClipDefault = std::log(1000.0 / 16.0);
// One-time setup for the FPGA Proposal kernel: allocates the RPN output
// tensors and prepares two FP16->FP32 bypass jobs that convert the device
// bbox-delta and score tensors into host-readable float tensors at Compute
// time.
template <>
bool ProposalKernel<FPGA, float>::Init(ProposalParam<FPGA> *param) {
  int post_nms_top_n = param->post_nms_topn_;
  int64_t batch = param->scores_->dims()[0];
  // Upper bound on kept proposals across the batch.
  auto total = post_nms_top_n * batch;
  param->rpn_rois_->mutable_data<float>({total, 4});
  param->rpn_probs_->mutable_data<float>({total, 1});
  // DLOG << *param->rpn_rois_;
  // DLOG << *param->rpn_probs_;
  // Staging tensors that will hold FP32 copies of the FP16 device outputs.
  param->float_bbox = std::make_shared<Tensor>();
  param->float_bbox->Resize(param->bbox_deltas_->dims());
  param->float_bbox->init(typeid(float));
  fpga::format_fp32_ofm(param->float_bbox.get());
  param->float_score = std::make_shared<Tensor>();
  param->float_score->Resize(param->scores_->dims());
  param->float_score->init(typeid(float));
  fpga::format_fp32_ofm(param->float_score.get());
  // Describe the FP16->FP32 bypass for the bbox deltas.
  auto input = param->bbox_deltas_;
  fpga::BypassArgs args = {fpga::DATA_TYPE_FP16};
  args.input_layout_type = fpga::LAYOUT_HWC;
  args.output_layout_type = fpga::LAYOUT_HWC;
  args.input_data_type = fpga::DATA_TYPE_FP16;
  args.output_data_type = fpga::DATA_TYPE_FP32;
  args.image.address = input->data<half>();
  args.image.height = (uint32_t)input->dims()[2];
  args.image.width = (uint32_t)input->dims()[3];
  args.image.channels = (uint32_t)input->dims()[1];
  args.output.address = param->float_bbox->mutable_data<float>();
  args.output.scale_address = param->float_bbox->scale;
  param->bbox_arg = args;
  // NOTE: `args` is deliberately reused — only the image/output fields are
  // overwritten below; the data-type/layout fields carry over unchanged.
  input = param->scores_;
  args.image.address = input->data<half>();
  args.image.height = (uint32_t)input->dims()[2];
  args.image.width = (uint32_t)input->dims()[3];
  args.image.channels = (uint32_t)input->dims()[1];
  args.output.address = param->float_score->mutable_data<float>();
  args.output.scale_address = param->float_score->scale;
  param->score_arg = args;
  return true;
}
// Appends the full contents of |src| into |dst| starting at element index
// |offset| (counted in elements of src's scalar type). The caller must
// guarantee that dst has capacity for offset + src.numel() elements.
void AppendProposals(Tensor *dst, int64_t offset, const Tensor &src) {
  const size_t elem_size = framework::SizeOfType(src.type());
  const size_t byte_offset = static_cast<size_t>(offset) * elem_size;
  auto *dst_bytes =
      reinterpret_cast<uint8_t *>(reinterpret_cast<uintptr_t>(dst->data<void>()));
  const auto *src_bytes = src.data<void>();
  std::memcpy(dst_bytes + byte_offset, src_bytes, src.numel() * elem_size);
}
// Decodes RPN bbox deltas against their anchors into absolute proposal boxes
// (corner form: x1, y1, x2, y2). Each row of |all_anchors| / |bbox_deltas| is
// one box of |len| values (len is presumably 4 — confirm with callers). If
// |variances| is non-null, each delta is scaled by the matching variance
// before decoding. Width/height deltas are exp()'d and clamped by
// kBBoxClipDefault to avoid overflow on large predicted logs.
template <class T>
static inline void BoxCoder(Tensor *all_anchors, Tensor *bbox_deltas,
                            Tensor *variances, Tensor *proposals) {
  T *proposals_data = proposals->mutable_data<T>();
  int64_t row = all_anchors->dims()[0];
  int64_t len = all_anchors->dims()[1];
  auto *bbox_deltas_data = bbox_deltas->data<T>();
  auto *anchor_data = all_anchors->data<T>();
  const T *variances_data = nullptr;
  if (variances) {
    variances_data = variances->data<T>();
  }
  for (int64_t i = 0; i < row; ++i) {
    // Convert the anchor from corner form to center/size form.
    // The +1.0 makes width/height inclusive of both edge pixels.
    T anchor_width = anchor_data[i * len + 2] - anchor_data[i * len] + 1.0;
    T anchor_height = anchor_data[i * len + 3] - anchor_data[i * len + 1] + 1.0;
    T anchor_center_x = anchor_data[i * len] + 0.5 * anchor_width;
    T anchor_center_y = anchor_data[i * len + 1] + 0.5 * anchor_height;
    T bbox_center_x = 0, bbox_center_y = 0;
    T bbox_width = 0, bbox_height = 0;
    if (variances) {
      // Variance-weighted decode: center shifts scale by anchor size,
      // width/height grow by exp(variance * delta), clipped for safety.
      bbox_center_x =
          variances_data[i * len] * bbox_deltas_data[i * len] * anchor_width +
          anchor_center_x;
      bbox_center_y = variances_data[i * len + 1] *
                          bbox_deltas_data[i * len + 1] * anchor_height +
                      anchor_center_y;
      bbox_width = std::exp(std::min<T>(variances_data[i * len + 2] *
                                            bbox_deltas_data[i * len + 2],
                                        kBBoxClipDefault)) *
                   anchor_width;
      bbox_height = std::exp(std::min<T>(variances_data[i * len + 3] *
                                             bbox_deltas_data[i * len + 3],
                                         kBBoxClipDefault)) *
                    anchor_height;
    } else {
      // Same decode without variance weighting.
      bbox_center_x =
          bbox_deltas_data[i * len] * anchor_width + anchor_center_x;
      bbox_center_y =
          bbox_deltas_data[i * len + 1] * anchor_height + anchor_center_y;
      bbox_width = std::exp(std::min<T>(bbox_deltas_data[i * len + 2],
                                        kBBoxClipDefault)) *
                   anchor_width;
      bbox_height = std::exp(std::min<T>(bbox_deltas_data[i * len + 3],
                                         kBBoxClipDefault)) *
                    anchor_height;
    }
    // Back to corner form; the -1 mirrors the +1 inclusive-size convention.
    proposals_data[i * len] = bbox_center_x - bbox_width / 2;
    proposals_data[i * len + 1] = bbox_center_y - bbox_height / 2;
    proposals_data[i * len + 2] = bbox_center_x + bbox_width / 2 - 1;
    proposals_data[i * len + 3] = bbox_center_y + bbox_height / 2 - 1;
  }
  // return proposals;
}
// Clips boxes (tiled as x1,y1,x2,y2,...) to the image rectangle given by
// im_info: im_info[0] is height, im_info[1] is width.
template <class T>
static inline void ClipTiledBoxes(const Tensor &im_info, Tensor *boxes) {
  T *box_ptr = boxes->mutable_data<T>();
  const T *info = im_info.data<T>();
  const T low(0);
  for (int64_t idx = 0; idx < boxes->numel(); ++idx) {
    // Even positions are x coords (clip to width-1); odd are y (height-1).
    const T high = (idx % 2 == 0) ? info[1] - 1 : info[0] - 1;
    box_ptr[idx] = std::max(std::min(box_ptr[idx], high), low);
  }
}
// Keeps only boxes that are at least `min_size` wide/high in the original
// image and whose center lies inside the image; writes kept indices into
// `keep` and resizes it to the number of survivors.
template <class T>
static inline void FilterBoxes(Tensor *boxes, float min_size,
                               const Tensor &im_info, Tensor *keep) {
  const T *info = im_info.data<T>();
  T *box_ptr = boxes->mutable_data<T>();
  const T scale = info[2];  // image scale factor (resized / original)
  keep->Resize({boxes->dims()[0]});
  // A proposal must span at least one pixel.
  min_size = std::max(min_size, 1.0f);
  int *keep_ptr = keep->mutable_data<int>();
  int kept = 0;
  for (int idx = 0; idx < boxes->dims()[0]; ++idx) {
    const T *b = box_ptr + 4 * idx;
    const T w = b[2] - b[0] + 1;
    const T h = b[3] - b[1] + 1;
    // Sizes mapped back to the original (pre-resize) image.
    const T w_orig = (b[2] - b[0]) / scale + 1;
    const T h_orig = (b[3] - b[1]) / scale + 1;
    const T cx = b[0] + w / 2;
    const T cy = b[1] + h / 2;
    if (w_orig >= min_size && h_orig >= min_size && cx <= info[1] &&
        cy <= info[0]) {
      keep_ptr[kept++] = idx;
    }
  }
  keep->Resize({kept});
}
// Pairs each score with its index and stable-sorts by score in ASCENDING
// order; callers (see NMS) pop from the back to consume scores highest-first.
template <class T>
static inline std::vector<std::pair<T, int>> GetSortedScoreIndex(
    const std::vector<T> &scores) {
  std::vector<std::pair<T, int>> indexed;
  indexed.reserve(scores.size());
  for (size_t pos = 0; pos < scores.size(); ++pos) {
    indexed.emplace_back(scores[pos], static_cast<int>(pos));
  }
  std::stable_sort(
      indexed.begin(), indexed.end(),
      [](const std::pair<T, int> &lhs, const std::pair<T, int> &rhs) {
        return lhs.first < rhs.first;
      });
  return indexed;
}
// Area of box = [xmin, ymin, xmax, ymax]. Degenerate boxes (xmax < xmin or
// ymax < ymin) have zero area. In pixel (non-normalized) coordinates the
// box includes both endpoints, hence the +1 on each side.
template <class T>
static inline T BBoxArea(const T *box, bool normalized) {
  if (box[2] < box[0] || box[3] < box[1]) {
    return static_cast<T>(0.);
  }
  const T w = box[2] - box[0];
  const T h = box[3] - box[1];
  return normalized ? w * h : (w + 1) * (h + 1);
}
// Packs the first `selected_num` entries of `selected_indices` into a
// 1-D Tensor of T.
template <typename T>
static inline Tensor VectorToTensor(const std::vector<T> &selected_indices,
                                    int selected_num) {
  Tensor keep_nms;
  keep_nms.Resize({selected_num});
  std::copy_n(selected_indices.begin(), selected_num,
              keep_nms.mutable_data<T>());
  return keep_nms;
}
template <class T>
static inline T JaccardOverlap(const T *box1, const T *box2, bool normalized) {
if (box2[0] > box1[2] || box2[2] < box1[0] || box2[1] > box1[3] ||
box2[3] < box1[1]) {
return static_cast<T>(0.);
} else {
const T inter_xmin = std::max(box1[0], box2[0]);
const T inter_ymin = std::max(box1[1], box2[1]);
const T inter_xmax = std::min(box1[2], box2[2]);
const T inter_ymax = std::min(box1[3], box2[3]);
const T inter_w = std::max(T(0), inter_xmax - inter_xmin + 1);
const T inter_h = std::max(T(0), inter_ymax - inter_ymin + 1);
const T inter_area = inter_w * inter_h;
const T bbox1_area = BBoxArea<T>(box1, normalized);
const T bbox2_area = BBoxArea<T>(box2, normalized);
return inter_area / (bbox1_area + bbox2_area - inter_area);
}
}
// Greedy non-maximum suppression: visits boxes from highest to lowest score
// and keeps a box only if its IoU with every previously kept box does not
// exceed the (possibly adaptive) threshold. Returns the kept indices.
template <class T>
static inline Tensor NMS(Tensor *bbox, Tensor *scores, T nms_threshold,
                         float eta) {
  const int64_t box_count = bbox->dims()[0];
  // 4: [xmin ymin xmax ymax]
  const int64_t box_size = bbox->dims()[1];
  std::vector<T> score_vec(box_count);
  std::copy_n(scores->data<T>(), box_count, score_vec.begin());
  // Ascending order: the best-scoring candidate sits at the back.
  auto ranked = GetSortedScoreIndex<T>(score_vec);
  std::vector<int> kept;
  int kept_count = 0;
  T threshold = nms_threshold;
  const T *box_data = bbox->data<T>();
  while (!ranked.empty()) {
    const int candidate = ranked.back().second;
    ranked.pop_back();
    bool keep_it = true;
    for (int prev : kept) {
      const T overlap = JaccardOverlap<T>(box_data + candidate * box_size,
                                          box_data + prev * box_size, false);
      if (overlap > threshold) {
        keep_it = false;
        break;
      }
    }
    if (keep_it) {
      kept.push_back(candidate);
      ++kept_count;
      // Adaptive NMS: shrink the threshold after each kept box when eta < 1.
      if (eta < 1 && threshold > 0.5) {
        threshold *= eta;
      }
    }
  }
  return VectorToTensor(kept, kept_count);
}
// Generates the proposal pair (boxes, scores) for one image: sort scores,
// keep the top pre_nms_top_n, decode anchors + deltas into boxes, clip to
// the image, drop tiny boxes, then NMS down to at most post_nms_top_n.
template <typename T>
std::pair<Tensor, Tensor> ProposalForOneImage(
    const Tensor &im_info_slice, const Tensor &anchors, const Tensor &variances,
    const Tensor &bbox_deltas_slice,  // [M, 4]
    const Tensor &scores_slice,       // [N, 1]
    int pre_nms_top_n, int post_nms_top_n, float nms_thresh, float min_size,
    float eta) {
  auto *scores_data = scores_slice.data<T>();
  // Sort index (descending by score via the comparator below).
  Tensor index_t;
  index_t.Resize({scores_slice.numel()});
  int *index = index_t.mutable_data<int>();
  for (int i = 0; i < scores_slice.numel(); ++i) {
    index[i] = i;
  }
  auto compare = [scores_data](const int64_t &i, const int64_t &j) {
    return scores_data[i] > scores_data[j];
  };
  if (pre_nms_top_n <= 0 || pre_nms_top_n >= scores_slice.numel()) {
    std::sort(index, index + scores_slice.numel(), compare);
  } else {
    // Only the top pre_nms_top_n entries matter; nth_element is O(n).
    std::nth_element(index, index + pre_nms_top_n, index + scores_slice.numel(),
                     compare);
    index_t.Resize({pre_nms_top_n});
  }
  Tensor scores_sel, bbox_sel, anchor_sel, var_sel;
  scores_sel.mutable_data<T>({index_t.numel(), 1});
  bbox_sel.mutable_data<T>({index_t.numel(), 4});
  anchor_sel.mutable_data<T>({index_t.numel(), 4});
  var_sel.mutable_data<T>({index_t.numel(), 4});
  // NOTE(review): bbox_sel/anchor_sel/var_sel are allocated but never
  // gathered from `index_t` before use — the gather step appears to be
  // missing (work in progress?); BoxCoder below reads unfilled buffers.
  Tensor proposals;
  proposals.mutable_data<T>({index_t.numel(), 4});
  BoxCoder<T>(&anchor_sel, &bbox_sel, &var_sel, &proposals);
  ClipTiledBoxes<T>(im_info_slice, &proposals);
  Tensor keep;
  FilterBoxes<T>(&proposals, min_size, im_info_slice, &keep);
  Tensor scores_filter;
  bbox_sel.mutable_data<T>({keep.numel(), 4});
  scores_filter.mutable_data<T>({keep.numel(), 1});
  if (nms_thresh <= 0) {
    // NMS disabled: return the filtered boxes/scores as-is.
    return std::make_pair(bbox_sel, scores_filter);
  }
  Tensor keep_nms = NMS<T>(&bbox_sel, &scores_filter, nms_thresh, eta);
  if (post_nms_top_n > 0 && post_nms_top_n < keep_nms.numel()) {
    keep_nms.Resize({post_nms_top_n});
  }
  proposals.mutable_data<T>({keep_nms.numel(), 4});
  scores_sel.mutable_data<T>({keep_nms.numel(), 1});
  return std::make_pair(proposals, scores_sel);
}
template <>
void ProposalKernel<FPGA, float>::Compute(const ProposalParam<FPGA> &param) {
  // TODO(hjchen2)
  // Convert the fp16 FPGA score/bbox outputs to fp32 via the bypass, then
  // invalidate the CPU cache so the host sees the DMA-written values.
  auto score_tensor = param.float_score.get();
  fpga::PerformBypass(param.score_arg);
  fpga::fpga_invalidate(score_tensor->data<float>(),
                        score_tensor->numel() * sizeof(float));
  auto bbox_tensor = param.float_bbox.get();
  fpga::PerformBypass(param.bbox_arg);
  fpga::fpga_invalidate(bbox_tensor->data<float>(),
                        bbox_tensor->numel() * sizeof(float));
  auto *scores = param.float_score.get();
  auto *bbox_deltas = param.float_bbox.get();
  auto *im_info = param.im_info_;
  auto anchors = *param.anchors_;
  auto variances = *param.variances_;
  auto *rpn_rois = param.rpn_rois_;
  auto *rpn_roi_probs = param.rpn_probs_;
  int pre_nms_top_n = param.pre_nms_topn_;
  int post_nms_top_n = param.post_nms_topn_;
  float nms_thresh = param.nms_thresh_;
  float min_size = param.min_size_;
  float eta = param.eta_;
  auto &scores_dim = scores->dims();
  int64_t num = scores_dim[0];
  int64_t c_score = scores_dim[1];
  int64_t h_score = scores_dim[2];
  int64_t w_score = scores_dim[3];
  auto &bbox_dim = bbox_deltas->dims();
  int64_t c_bbox = bbox_dim[1];
  int64_t h_bbox = bbox_dim[2];
  int64_t w_bbox = bbox_dim[3];
  //
  // NOTE(review): the *_swap tensors look like NCHW->NHWC staging buffers,
  // but nothing copies/transposes scores/bbox_deltas into them before they
  // are sliced below — the transpose step appears to be missing (WIP?).
  Tensor bbox_deltas_swap, scores_swap;
  bbox_deltas_swap.mutable_data<float>({num, h_bbox, w_bbox, c_bbox});
  scores_swap.mutable_data<float>({num, h_score, w_score, c_score});
  framework::LoD lod;
  lod.resize(1);
  auto &lod0 = lod[0];
  lod0.push_back(0);
  // Flatten anchors to [A, 4].
  anchors.Resize({anchors.numel() / 4, 4});
  int64_t num_proposals = 0;
  for (int64_t i = 0; i < num; ++i) {
    Tensor im_info_slice = im_info->Slice(i, i + 1);
    Tensor bbox_deltas_slice = bbox_deltas_swap.Slice(i, i + 1);
    Tensor scores_slice = scores_swap.Slice(i, i + 1);
    bbox_deltas_slice.Resize({h_bbox * w_bbox * c_bbox / 4, 4});
    scores_slice.Resize({h_score * w_score * c_score, 1});
    std::pair<Tensor, Tensor> tensor_pair = ProposalForOneImage<float>(
        im_info_slice, anchors, variances, bbox_deltas_slice, scores_slice,
        pre_nms_top_n, post_nms_top_n, nms_thresh, min_size, eta);
    Tensor &proposals = tensor_pair.first;
    Tensor &scores = tensor_pair.second;  // NOTE: shadows the outer `scores`
    AppendProposals(rpn_rois, 4 * num_proposals, proposals);
    AppendProposals(rpn_roi_probs, num_proposals, scores);
    num_proposals += proposals.dims()[0];
    lod0.push_back(num_proposals);
  }
  rpn_rois->set_lod(lod);
  rpn_roi_probs->set_lod(lod);
  rpn_rois->Resize({num_proposals, 4});
  rpn_roi_probs->Resize({num_proposals, 1});
}
} // namespace operators
......
......@@ -14,6 +14,7 @@ limitations under the License. */
#ifdef PSROI_POOL_OP
#include <cmath>
#include <vector>
#include "operators/kernel/detection_kernel.h"
......@@ -21,13 +22,180 @@ namespace paddle_mobile {
namespace operators {
template <>
bool PSRoiPoolKernel<FPGA, float>::Init(PSRoiPoolParam<FPGA> *param) {
bool PSRoiPoolKernel<FPGA, float>::Init(PSRoiPoolParam<FPGA>* param) {
  auto dims = param->input_x_->dims();
  PADDLE_MOBILE_ENFORCE(dims[1] * dims[3] % IMAGE_ALIGNMENT == 0,
                        "data not aligned");
  // fp32 staging tensors: the pooling itself runs on the CPU in fp32 while
  // the surrounding graph stays fp16 on the FPGA.
  param->float_input = std::make_shared<Tensor>();
  param->float_input->mutable_data<float>(param->input_x_->dims());
  param->float_output = std::make_shared<Tensor>();
  param->float_output->mutable_data<float>(param->output_->dims());
  // Bypass #1: convert the fp16 FPGA input into the fp32 staging tensor.
  auto input = param->input_x_;
  fpga::BypassArgs args = {fpga::DATA_TYPE_FP16};
  args.input_layout_type = fpga::LAYOUT_HWC;
  args.output_layout_type = fpga::LAYOUT_HWC;
  args.input_data_type = fpga::DATA_TYPE_FP16;
  args.output_data_type = fpga::DATA_TYPE_FP32;
  args.image.address = input->data<half>();
  args.image.height = (uint32_t)input->dims()[2];
  args.image.width = (uint32_t)input->dims()[3];
  args.image.channels = (uint32_t)input->dims()[1];
  args.output.address = param->float_input->mutable_data<float>();
  args.output.scale_address = param->float_input->scale;
  param->input_arg = args;
  fpga::format_fp16_ofm(param->output_);
  // Bypass #2: convert the fp32 pooling result back to the fp16 output.
  input = param->float_output.get();
  args.input_data_type = fpga::DATA_TYPE_FP32;
  args.output_data_type = fpga::DATA_TYPE_FP16;
  args.image.address = input->data<float>();
  args.image.height = (uint32_t)input->dims()[2];
  args.image.width = (uint32_t)input->dims()[3];
  args.image.channels = (uint32_t)input->dims()[1];
  args.output.address = param->output_->mutable_data<half>();
  args.output.scale_address = param->output_->scale;
  // Fix: this second bypass config is consumed as `output_arg` by Compute()
  // (fpga::PerformBypass(param.output_arg)). The previous assignment to
  // `input_arg` clobbered the input-conversion config and left output_arg
  // uninitialized.
  param->output_arg = args;
  return true;
}
template <>
void PSRoiPoolKernel<FPGA, float>::Compute(const PSRoiPoolParam<FPGA> &param) {
// TODO(hjchen2)
void PSRoiPoolKernel<FPGA, float>::Compute(const PSRoiPoolParam<FPGA>& param) {
  // Pull the fp16 FPGA input into the fp32 staging tensor, then invalidate
  // the CPU cache so the host sees the DMA-written values.
  auto input_tensor = param.float_input.get();
  fpga::PerformBypass(param.input_arg);
  fpga::fpga_invalidate(input_tensor->data<float>(),
                        input_tensor->numel() * sizeof(float));
  auto* in = input_tensor;
  auto* rois = param.input_rois_;
  auto* out = param.float_output.get();
  auto pooled_height = param.pooled_height_;
  auto pooled_width = param.pooled_width_;
  auto spatial_scale = param.spatial_scale_;
  auto output_channels = param.output_channels_;
  auto in_dims = in->dims();
  int batch_size = in_dims[0];
  int input_channels = in_dims[1];
  int height = in_dims[2];
  int width = in_dims[3];
  int rois_num = rois->dims()[0];
  // TODO auto in_stride = framework::stride(in_dims);
  // TODO auto out_stride = framework::stride(out->dims());
  // Strides are built for NHWC (FPGA layout) even though dims() is NCHW.
  auto in_stride =
      framework::stride({batch_size, height, width, input_channels});
  auto out_stride = framework::stride(
      {out->dims()[0], out->dims()[2], out->dims()[3], out->dims()[1]});
  const float* input_data = in->data<float>();
  framework::Tensor rois_batch_id_list;
  rois_batch_id_list.Resize({rois_num});
  auto rois_batch_id_data = rois_batch_id_list.mutable_data<int>();
  // NOTE(review): this early return makes everything below dead code —
  // apparently the pooling is disabled while the RFCN track is brought up;
  // confirm intent before removing.
  return;
  PADDLE_MOBILE_ENFORCE(rois->NumLevels() > 0, "ROIS should not be empty");
  auto rois_lod = rois->lod().back();
  int rois_batch_size = rois_lod.size() - 1;
  PADDLE_MOBILE_ENFORCE(
      rois_batch_size == batch_size,
      "the rois_batch_size and input(X) batch_size should be the same.");
  int rois_num_with_lod = rois_lod[rois_batch_size];
  PADDLE_MOBILE_ENFORCE(rois_num_with_lod == rois_num,
                        "the rois_num from input and lod must be the same");
  PADDLE_MOBILE_ENFORCE(
      input_channels == output_channels * pooled_height * pooled_width,
      "the channels of input X should equal the product of "
      "output_channels x pooled_height x pooled_width");
  // calculate batch id index for each roi according to LoD
  for (int n = 0; n < rois_batch_size; ++n) {
    for (size_t i = rois_lod[n]; i < rois_lod[n + 1]; ++i) {
      rois_batch_id_data[i] = n;
    }
  }
  auto output_data = out->mutable_data<float>();
  auto input_rois = rois->data<float>();
  // calculate psroipooling, parallel processing can be implemented per ROI
  for (int n = 0; n < rois_num; ++n) {
    // set roi batch id
    int roi_batch_id = rois_batch_id_data[n];
    // [start, end) interval for spatial sampling
    auto offset_input_rois = input_rois + n * 4;
    auto roi_start_w =
        static_cast<float>(round(offset_input_rois[0])) * spatial_scale;
    auto roi_start_h =
        static_cast<float>(round(offset_input_rois[1])) * spatial_scale;
    auto roi_end_w =
        static_cast<float>(round(offset_input_rois[2]) + 1.) * spatial_scale;
    auto roi_end_h =
        static_cast<float>(round(offset_input_rois[3]) + 1.) * spatial_scale;
    // Force too small rois to be 1 x 1
    auto roi_height = std::max(roi_end_h - roi_start_h, 0.1f);  // avoid 0
    auto roi_width = std::max(roi_end_w - roi_start_w, 0.1f);
    // Compute bin size w and h at input feature map
    auto bin_size_h = roi_height / static_cast<float>(pooled_height);
    auto bin_size_w = roi_width / static_cast<float>(pooled_width);
    DLOG << 3;
    // calculate each pixel of the output feature map.
    int out_roi_offset = n * out_stride[0];
    for (int c = 0; c < output_channels; ++c) {
      // per category
      // int out_plane_offset = out_roi_offset + c * out_stride[1];
      int out_plane_offset = out_roi_offset + c;
      for (int ph = 0; ph < pooled_height; ++ph) {
        // TODO int out_row_offset = out_plane_offset + ph *
        // out_stride[2];
        int out_row_offset = out_plane_offset + ph * out_stride[1];
        for (int pw = 0; pw < pooled_width; ++pw) {
          // calculate w and h at input feature map
          int hstart = floor(static_cast<float>(ph) * bin_size_h + roi_start_h);
          int wstart = floor(static_cast<float>(pw) * bin_size_w + roi_start_w);
          int hend =
              ceil(static_cast<float>(ph + 1) * bin_size_h + roi_start_h);
          int wend =
              ceil(static_cast<float>(pw + 1) * bin_size_w + roi_start_w);
          // Add roi offsets and clip to input boundaries
          hstart = std::min(std::max(hstart, 0), height);
          wstart = std::min(std::max(wstart, 0), width);
          hend = std::min(std::max(hend, 0), height);
          wend = std::min(std::max(wend, 0), width);
          // TODO int output_index = out_row_offset + pw;
          int output_index = out_row_offset + pw * output_channels;
          int input_channel = (c * pooled_height + ph) * pooled_width + pw;
          // TODO int input_plane_offset =
          // TODO roi_batch_id * in_stride[0] + input_channel *
          // in_stride[1];
          int input_plane_offset = roi_batch_id * in_stride[0] + input_channel;
          auto offset_input_data = input_data + input_plane_offset;
          float out_sum = 0.;
          bool is_empty = (hend <= hstart) || (wend <= wstart);
          for (int ih = hstart; ih < hend; ++ih) {
            for (int iw = wstart; iw < wend; ++iw) {
              // NOTE(review): `iw * input_channel` multiplies by the channel
              // *index*, not the per-pixel stride (in_stride[2]); this looks
              // wrong — confirm before re-enabling this code path.
              int input_index = ih * in_stride[1] + iw * input_channel;
              out_sum += offset_input_data[input_index];
            }
          }
          float bin_area = (hend - hstart) * (wend - wstart);
          output_data[output_index] = is_empty ? 0. : out_sum / bin_area;
        }
      }
    }
  }
  // Push the fp32 result back through the FPGA bypass to the fp16 output.
  fpga::format_image(out);
  fpga::PerformBypass(param.output_arg);
}
} // namespace operators
......
......@@ -15,18 +15,61 @@ limitations under the License. */
#ifdef RESHAPE2_OP
#include "operators/kernel/reshape2_kernel.h"
#include "framework/ddim.h"
namespace paddle_mobile {
namespace operators {
template <>
bool Reshape2Kernel<FPGA, float>::Init(Reshape2Param<FPGA> *param) {
  // Reshape is metadata-only on FPGA: the output aliases the input buffer
  // and only the dims change. A single -1 entry in `shape` is inferred
  // from the remaining element count.
  auto *in = const_cast<LoDTensor *>(param->InputX());
  auto *out = param->Out();
  auto shape = param->Shape();
  out->ShareDataWith(*in);
  const auto total_in = framework::product(in->dims());
  const auto known = framework::product(framework::make_ddim(shape));
  PADDLE_MOBILE_ENFORCE(known != 0, "0 index is not supported");
  for (size_t k = 0; k < shape.size(); ++k) {
    if (shape[k] == -1) {
      // `known` is negative (it contains the -1), so the quotient is the
      // positive inferred dimension.
      shape[k] = static_cast<int>(-total_in / known);
      break;
    }
  }
  out->Resize(framework::make_ddim(shape));
  DLOG << "input: " << in;
  DLOG << "output: " << out;
  return true;
}
template <>
void Reshape2Kernel<FPGA, float>::Compute(const Reshape2Param<FPGA> &param) {
  // NOTE(review): Compute is a no-op — the reshape is already applied in
  // Init() (output shares the input buffer, dims resolved there). Everything
  // below this return is dead code; confirm it can be deleted.
  return;
  auto input = const_cast<LoDTensor *>(param.InputX());
  auto output = param.Out();
  auto shape = param.Shape();
  if (output->type() != typeid(half)) {
    DLOG << "wrong type";
  }
  auto num_in = framework::product(input->dims());
  auto num_shape = framework::product(framework::make_ddim(shape));
  PADDLE_MOBILE_ENFORCE(num_shape != 0, "0 index is not supported");
  for (int i = 0; i < shape.size(); i++) {
    // A single -1 entry is inferred from the remaining element count.
    if (shape[i] == -1) {
      shape[i] = static_cast<int>(-num_in / num_shape);
      break;
    }
  }
  output->Resize(framework::make_ddim(shape));
  if (output->type() != typeid(half)) {
    DLOG << "wrong type";
    DLOG << output;
  }
  //
}
} // namespace operators
......
......@@ -25,7 +25,7 @@ bool SigmoidKernel<FPGA, float>::Init(SigmoidParam<FPGA> *param) {
paddle_mobile::fpga::SIGMOID;
int16_t leaky_relu_negative_slope = 0;
auto input = const_cast<Tensor *>(param->InputX());
auto input_ptr = input->data<float>();
auto input_ptr = input->data<half>();
auto out = param->Out();
fpga::format_fp16_ofm(out);
......@@ -38,7 +38,7 @@ bool SigmoidKernel<FPGA, float>::Init(SigmoidParam<FPGA> *param) {
args.image.width =
(input->dims().size() == 4) ? (uint32_t)input->dims()[3] : 1;
args.image.channels = (uint32_t)input->dims()[1];
args.output.address = out->data<float>();
args.output.address = out->data<half>();
args.output.scale_address = out->scale;
args.output.activation.activation_type = activation_enable;
args.output.activation.leaky_relu_negative_slope = leaky_relu_negative_slope;
......
......@@ -21,10 +21,37 @@ namespace operators {
template <>
bool SliceKernel<FPGA, float>::Init(SliceParam<FPGA>* param) {
  // Allocate/format the FP16 output feature map for the FPGA.
  fpga::format_fp16_ofm(param->output_);
  DLOG << "input: " << param->input_;
  DLOG << "output: " << param->output_;
  // Slice only consumes FP16 feature maps; flag anything else.
  if (param->input_->type() != typeid(half)) {
    DLOG << "wrong type";
  }
  return true;
}
template <>
void SliceKernel<FPGA, float>::Compute(const SliceParam<FPGA>& param) {}
void SliceKernel<FPGA, float>::Compute(const SliceParam<FPGA>& param) {
  // Only support slicing in channel dimension
  auto input = param.input_;
  DLOG << input;
  int HW = input->dims()[2] * input->dims()[3];
  int channel = input->dims()[1];
  auto input_ptr = input->data<half>();
  auto output_ptr = param.output_->data<half>();
  int start = param.starts_[0], end = param.ends_[0];
  // Negative indices count from the end of the channel axis; then clamp
  // both ends to [0, channel].
  start = start < 0 ? start + channel : start;
  end = end < 0 ? end + channel : end;
  start = start > channel ? channel : start;
  end = end > channel ? channel : end;
  int len = end - start;
  // Guard against empty/inverted ranges: a non-positive length would wrap
  // to a huge size_t in memcpy.
  if (len <= 0) {
    return;
  }
  for (int i = 0; i < HW; i++) {
    // Fix: memcpy length is in BYTES — the data is fp16, so the copy size
    // must be scaled by sizeof(half); the old code copied only half the
    // selected channels per pixel.
    memcpy(output_ptr + len * i, input_ptr + i * channel + start,
           len * sizeof(half));
  }
}
} // namespace operators
} // namespace paddle_mobile
#endif
......@@ -23,49 +23,72 @@ namespace operators {
template <>
bool SoftmaxKernel<FPGA, float>::Init(SoftmaxParam<FPGA> *param) {
  // NOTE(review): this block appears to contain overlapping old/new
  // revisions of the function (e.g. `input_ptr` is declared twice and the
  // dims()==2 branch below is never closed), so it cannot compile as-is.
  // The apparent new logic: 4-D input only; channel==2 runs softmax on the
  // FPGA, any other channel count converts to fp32 and runs on the CPU.
  auto input = const_cast<LoDTensor *>(param->InputX());
  auto input_ptr = input->data<float>();
  auto input_ptr = input->data<half>();
  auto out = param->Out();
  fpga::format_fp32_ofm(out);
  auto float_input = new Tensor;
  if (input->dims().size() == 2) {
    float_input->mutable_data<float>({1, input->dims()[1]});
  } else if (input->dims().size() == 4) {
    float_input->mutable_data<float>(
        {1, input->dims()[2], input->dims()[3], input->dims()[1]});
  } else {
    DLOG << "wrong dimension of softmax input";
  PADDLE_MOBILE_ENFORCE(input->dims().size() == 4,
                        "Softmax should have 4-order input");
  auto dims = framework::vectorize(input->dims());
  auto channel = dims[3];
  if (channel == 1) {  // This input is generated by FC op, dims = [N C 1 1]
    PADDLE_MOBILE_ENFORCE(dims[2] == 1, "Softmax input must come from FC op");
    // Move the channel count into the width slot: [N 1 1 C].
    dims[3] = dims[1];
    dims[1] = 1;
  }
  input->Resize(framework::make_ddim(dims));
  float_input->Resize(framework::make_ddim(dims));
  if (channel != 2) {  // Use CPU
    // CPU path: bypass converts fp16 -> fp32 into float_input; the actual
    // softmax runs in Compute() via math::SoftmaxFuntor.
    float_input->init(typeid(float));
    fpga::format_fp32_ofm(float_input);
    fpga::format_fp32_ofm(out);
    fpga::BypassArgs args = {fpga::DATA_TYPE_FP16};
    args.input_layout_type = fpga::LAYOUT_HWC;
    args.output_layout_type = fpga::LAYOUT_CHW;
    args.input_data_type = fpga::DATA_TYPE_FP16;
    args.output_data_type = fpga::DATA_TYPE_FP32;
    args.image.address = input_ptr;
    args.image.height = (uint32_t)dims[1];
    args.image.width = (uint32_t)dims[2];
    args.image.channels = (uint32_t)dims[3];
    args.output.address = float_input->data<float>();
    args.output.scale_address = float_input->scale;
    param->SetFloatInput(float_input);
    param->SetFpgaArgs(args);
  } else {  // Use FPGA
    // FPGA path: the bypass itself applies SOFTMAX activation in fp16.
    fpga::format_fp16_ofm(out);
    fpga::BypassArgs args = {fpga::DATA_TYPE_FP16};
    args.input_layout_type = fpga::LAYOUT_HWC;
    args.output_layout_type = fpga::LAYOUT_CHW;
    args.input_data_type = fpga::DATA_TYPE_FP16;
    args.output_data_type = fpga::DATA_TYPE_FP16;
    args.image.address = input_ptr;
    args.image.height = (uint32_t)input->dims()[1];
    args.image.width = (uint32_t)input->dims()[2];
    args.image.channels = (uint32_t)input->dims()[3];
    args.output.address = out->data<half>();
    args.output.scale_address = out->scale;
    args.output.activation.activation_type = fpga::SOFTMAX;
    param->SetFpgaArgs(args);
  }
  // NOTE(review): everything from here down duplicates the CPU-path setup
  // above — it looks like the pre-refactor body left in place by the merge.
  fpga::format_fp32_ofm(float_input);
  fpga::BypassArgs args = {fpga::DATA_TYPE_FP16};
  args.input_layout_type = fpga::LAYOUT_HWC;
  args.output_layout_type = fpga::LAYOUT_CHW;
  args.input_data_type = fpga::DATA_TYPE_FP16;
  args.output_data_type = fpga::DATA_TYPE_FP32;
  args.image.address = input_ptr;
  args.image.height =
      (input->dims().size() == 4) ? (uint32_t)input->dims()[2] : 1;
  args.image.width =
      (input->dims().size() == 4) ? (uint32_t)input->dims()[3] : 1;
  args.image.channels = (uint32_t)input->dims()[1];
  args.output.address = float_input->data<float>();
  args.output.scale_address = float_input->scale;
  param->SetFloatInput(float_input);
  param->SetFpgaArgs(args);
  return true;
}
template <>
void SoftmaxKernel<FPGA, float>::Compute(const SoftmaxParam<FPGA> &param) {
  // NOTE(review): the first statements duplicate the guarded block below —
  // this looks like the pre-refactor body left in place by a bad merge;
  // confirm which half is intended to survive.
  Tensor *in_x = param.FloatInput();
  Tensor *out = param.Out();
  fpga::PerformBypass(param.FpgaArgs());
  fpga::fpga_invalidate((void *)in_x->data<float>(),  // NOLINT
                        in_x->numel() * sizeof(float));
  // TODO: In general case, 0 should be squeezed before softmax input // NOLINT
  math::SoftmaxFuntor<CPU, float>()(in_x, out);
  fpga::fpga_flush(out->data<float>(), out->memory_size());
  // When the bypass did not already apply SOFTMAX on the FPGA, finish the
  // softmax on the CPU over the fp32 staging tensor.
  if (param.FpgaArgs().output.activation.activation_type != fpga::SOFTMAX) {
    Tensor *out = param.Out();
    Tensor *in_x = param.FloatInput();
    fpga::fpga_invalidate(in_x->data<float>(), in_x->numel() * sizeof(float));
    math::SoftmaxFuntor<CPU, float>()(in_x, out);
    fpga::fpga_flush(out->data<float>(), out->memory_size());
  }
}
} // namespace operators
......
......@@ -34,16 +34,18 @@ bool SplitKernel<FPGA, float>::Init(SplitParam<FPGA> *param) {
fpga::fpga_malloc(image_num * sizeof(float *)));
auto out_channels = reinterpret_cast<uint32_t *>(
fpga::fpga_malloc(image_num * sizeof(uint32_t)));
DLOG << "input: " << in;
for (int i = 0; i < image_num; i++) {
fpga::format_fp16_ofm(outs[i]);
images_out[i] = outs[i]->mutable_data<float>();
DLOG << "output: " << outs[i];
images_out[i] = outs[i]->mutable_data<half>();
scales_out[i] = outs[i]->scale;
out_channels[i] = (uint32_t)sections[i];
}
fpga::SplitArgs arg = {0};
arg.image_num = image_num;
arg.image_in = (half *)in->data<float>();
arg.image_in = in->data<half>();
arg.scale_in = in->scale;
arg.images_out = images_out;
arg.scales_out = scales_out;
......
......@@ -22,8 +22,10 @@ namespace operators {
template <>
bool TanhKernel<FPGA, float>::Init(TanhParam<FPGA> *param) {
auto input = const_cast<Tensor *>(param->InputX());
auto input_ptr = input->data<float>();
DLOG << "input: " << input;
auto input_ptr = input->data<half>();
auto float_input = new Tensor;
float_input->mutable_data<float>(
{1, input->dims()[1], input->dims()[2], input->dims()[3]});
fpga::format_fp32_ofm(float_input);
......
......@@ -20,7 +20,21 @@ namespace operators {
template <>
bool Transpose2Kernel<FPGA, float>::Init(Transpose2Param<FPGA> *param) {
  // NOTE(review): the next line duplicates the ShareDataWith call below —
  // it looks like a leftover from the previous revision of this function.
  param->Out()->ShareDataWith(*param->InputX());
  auto input = param->InputX();
  auto output = param->Out();
  auto axis = param->Axis();
  auto dim = input->dims();
  // Transpose is realized as a metadata change: the output aliases the
  // input buffer and only the dims are permuted according to `axis`.
  output->ShareDataWith(*input);
  auto dim_v = vectorize(dim);
  for (int i = 0; i < axis.size(); i++) {
    dim_v[i] = dim[axis[i]];
  }
  output->Resize(framework::make_ddim(dim_v));
  DLOG << "input: " << input;
  DLOG << "output: " << output;
  return true;
}
......
......@@ -1172,6 +1172,12 @@ class FeedParam : public OpParam {
public:
FeedParam(const VariableNameMap &inputs, const VariableNameMap &outputs,
const AttributeMap &attrs, const Scope &scope) {
#ifdef PADDLE_MOBILE_FPGA
static int feed_num = 0;
auto new_name = std::string("feed") + std::to_string(feed_num++);
const_cast<VariableNameMap &>(inputs).at("X") = {string(new_name)};
#endif
input_x_ = InputXFrom<LoDTensor>(inputs, scope);
out_ = OutFrom<GType>(outputs, scope);
auto var = scope.FindVar("batch_size");
......@@ -1195,6 +1201,11 @@ class FetchParam : public OpParam {
public:
FetchParam(const VariableNameMap &inputs, const VariableNameMap &outputs,
           const AttributeMap &attrs, const Scope &scope) {
#ifdef PADDLE_MOBILE_FPGA
  // On FPGA each fetch op gets a unique output var name ("fetch0",
  // "fetch1", ...) so multiple fetch results can coexist in the scope.
  // NOTE(review): const_cast mutates the caller-owned name map — confirm
  // the map is not shared/reused after construction.
  static int fetch_num = 0;
  auto new_name = std::string("fetch") + std::to_string(fetch_num++);
  const_cast<VariableNameMap &>(outputs).at("Out") = {string(new_name)};
#endif
  input_x_ = InputXFrom<GType>(inputs, scope);
  out_ = OutFrom(outputs, scope);
}
......@@ -1210,18 +1221,9 @@ class FetchParam : public OpParam {
RType *input_x_;
Tensor *out_;
#ifdef PADDLE_MOBILE_FPGA
private:
std::shared_ptr<RType> float_input_x_;
public:
fpga::BypassArgs fpga_bypass_args;
public:
RType *FloatInput() const {
return float_input_x_ == nullptr ? input_x_ : float_input_x_.get();
}
void SetFloatInput(Tensor *input) { float_input_x_.reset(input); }
const fpga::BypassArgs &FpgaArgs() const { return fpga_bypass_args; }
void SetFpgaArgs(const fpga::BypassArgs &args) { fpga_bypass_args = args; }
#endif
};
......
......@@ -51,8 +51,8 @@ void convert_to_chw(int16_t **data_in, int channel, int height, int width,
}
}
void dump(std::string filename, const Tensor input_tensor) {
auto dataptr = input_tensor.data<float>();
void dump(std::string filename, Tensor input_tensor) {
auto dataptr = reinterpret_cast<half *>(input_tensor.get_data());
std::ofstream out(filename.c_str());
float result = 0;
for (int i = 0; i < input_tensor.numel(); ++i) {
......@@ -61,12 +61,11 @@ void dump(std::string filename, const Tensor input_tensor) {
}
out.close();
}
void dump_stride(std::string filename, const Tensor input_tensor,
const int dumpnum) {
void dump_stride(std::string filename, Tensor input_tensor, const int dumpnum) {
int c = (input_tensor.dims())[1];
int h = (input_tensor.dims())[2];
int w = (input_tensor.dims())[3];
auto data_ptr = input_tensor.data<float>();
auto data_ptr = input_tensor.get_data();
int16_t *data_tmp = (int16_t *)malloc(c * h * w * sizeof(int16_t));
int16_t *data_ptr_16 = (int16_t *)data_ptr;
convert_to_chw(&data_ptr_16, c, h, w, data_tmp);
......@@ -98,9 +97,9 @@ int main() {
for (int i = 0; i < 73; i++) {
auto tensor_ptr = paddle_mobile.FetchResult(i);
std::string saveName = "resnet50_result_" + std::to_string(i);
paddle_mobile::fpga::fpga_invalidate((*tensor_ptr).data<float>(),
paddle_mobile::fpga::fpga_invalidate((*tensor_ptr).get_data(),
tensor_ptr->numel() * sizeof(half));
dump_stride(saveName, (*tensor_ptr), 20);
// dump_stride(saveName, (*tensor_ptr), 20);
// dump(saveName, (*tensor_ptr));
}
......
......@@ -23,29 +23,38 @@ limitations under the License. */
#include "fpga/V2/api.h"
#endif
// static const char *g_densebox_combine = "../models/densebox";
static const char *g_densebox_combine = "../models/rfcn";
// Reads the entire file at `filename` byte-by-byte into `buf`.
// The caller must ensure `buf` is large enough to hold the whole file.
void readStream(std::string filename, uint8_t *buf) {
  // Open in binary mode: the input (e.g. data.bin) is raw tensor data, and
  // the old formatted `>>` extraction silently skipped whitespace-valued
  // bytes (0x09/0x0a/0x20/...), corrupting binary payloads.
  std::ifstream in(filename, std::ios::in | std::ios::binary);
  if (!in.is_open()) {
    std::cout << "open File Failed." << std::endl;
    return;
  }
  int i = 0;
  int c;
  // get() returns eof only after the last byte, avoiding the classic
  // `while (!in.eof())` pattern that processes one spurious extra element.
  while ((c = in.get()) != std::char_traits<char>::eof()) {
    buf[i++] = static_cast<uint8_t>(c);
  }
  in.close();
}
static const char *g_rfcn_combine = "../models/rfcn";
const std::string g_image_src_float = "../models/rfcn/data.bin";
int main() {
paddle_mobile::fpga::open_device();
paddle_mobile::PaddleMobile<paddle_mobile::FPGA> paddle_mobile;
// paddle_mobile.SetThreadNum(4);
if (paddle_mobile.Load(std::string(g_densebox_combine) + "/model",
std::string(g_densebox_combine) + "/params", true,
false, 1, true)) {
// std::vector<float> input;
// std::vector<int64_t> dims{1, 3, 512, 1024};
// GetInput<float>(g_test_image_1x3x224x224_banana, &input, dims);
// auto vec_result = paddle_mobile.Predict(input, dims);
return 0;
Tensor input_tensor;
SetupTensor<float>(&input_tensor, {1, 3, 512, 1024}, static_cast<float>(0),
static_cast<float>(1));
// readStream(g_image_src_float,
// input_tensor.mutable_data<float>({1, 3, 224, 224}));
paddle_mobile.FeedData(input_tensor);
if (paddle_mobile.Load(std::string(g_rfcn_combine) + "/model",
std::string(g_rfcn_combine) + "/params", true, false,
1, true)) {
float img_info[3] = {768, 1536, 768.0f / 960.0f};
auto img = fpga::fpga_malloc(768 * 1536 * 3 * sizeof(float));
readStream(g_image_src_float, reinterpret_cast<uint8_t *>(img));
std::vector<void *> v(3, nullptr);
paddle_mobile.FeedData({img_info, img});
paddle_mobile.Predict_To(-1);
paddle_mobile.GetResults(&v);
DLOG << "Computation done";
}
return 0;
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册