提交 b6993ba3 编写于 作者: H hjchen2

Merge conflicts caused by FPGA

......@@ -4,7 +4,7 @@ option(USE_OPENMP "build with openmp support" ON)
option(USE_EXCEPTION "build with exception" ON)
option(WITH_LOGGING "print logging for debug" ON)
option(WITH_SYMBOL "build with all symbols" ON) # turn off if use jni or ios io
option(WITH_PROFILE "print op profile for debug" ON)
option(WITH_PROFILE "print op profile for debug" OFF)
option(WITH_TEST "build with unit tests" ON)
# select the platform to build
......
......@@ -28,13 +28,22 @@ void format_image(framework::Tensor *image_tensor) {
auto dims = image_tensor->dims();
auto channel = dims[1], height = dims[2], width = dims[3];
auto data_ptr = image_tensor->data<float>();
size_t memory_size = channel * height * width * sizeof(float);
auto new_data = (float *)fpga_malloc(memory_size); // NOLINT
fpga_copy(new_data, data_ptr, memory_size);
image::format_image(&new_data, channel, height, width);
image_tensor->reset_data_ptr(new_data);
auto external_ptr = reinterpret_cast<float *>(image_tensor->external_data);
float *p_data = external_ptr == nullptr ? data_ptr : external_ptr;
float *old_p = p_data;
image::format_image(&p_data, channel, height, width);
if (old_p != p_data) {
image_tensor->reset_data_ptr(p_data);
}
}
void format_ofm(framework::Tensor *ofm_tensor) {
if (ofm_tensor->type() == typeid(float)) {
format_fp32_ofm(ofm_tensor);
} else {
format_fp16_ofm(ofm_tensor);
}
}
void format_fp16_ofm(framework::Tensor *ofm_tensor) {
auto dims = ofm_tensor->dims();
size_t memory_size = 0;
......@@ -50,6 +59,7 @@ void format_fp16_ofm(framework::Tensor *ofm_tensor) {
auto p = fpga_malloc(memory_size);
memset(p, 0, memory_size);
ofm_tensor->reset_data_ptr(p);
ofm_tensor->set_type(typeid(half));
}
void format_fp16_ofm(framework::Tensor *ofm_tensor, framework::DDim dims) {
......@@ -67,6 +77,7 @@ void format_fp16_ofm(framework::Tensor *ofm_tensor, framework::DDim dims) {
auto p = fpga_malloc(memory_size);
memset(p, 0, memory_size);
ofm_tensor->reset_data_ptr(p);
ofm_tensor->set_type(typeid(half));
}
void format_fp32_ofm(framework::Tensor *ofm_tensor) {
auto dims = ofm_tensor->dims();
......@@ -83,6 +94,7 @@ void format_fp32_ofm(framework::Tensor *ofm_tensor) {
auto p = fpga_malloc(memory_size);
memset(p, 0, memory_size);
ofm_tensor->reset_data_ptr(p);
ofm_tensor->set_type(typeid(float));
}
float filter_find_max(framework::Tensor *filter_tensor) {
......@@ -139,6 +151,7 @@ void format_filter(framework::Tensor *filter_tensor, float max_value,
filter::format_filter(&new_data, num, channel, height, width, group_num,
max_value);
filter_tensor->reset_data_ptr(new_data);
filter_tensor->set_type(typeid(int8_t));
}
void format_dwconv_filter(framework::Tensor *filter_tensor, float *scale_ptr) {
auto dims = filter_tensor->dims();
......@@ -149,6 +162,7 @@ void format_dwconv_filter(framework::Tensor *filter_tensor, float *scale_ptr) {
fpga_copy(new_data, data_ptr, memory_size);
filter::format_dwconv_filter(&new_data, num, height, width, scale_ptr);
filter_tensor->reset_data_ptr(new_data);
filter_tensor->set_type(typeid(int8_t));
}
void format_DWDconv_filter(framework::Tensor *filter_tensor, float *scale_ptr,
......@@ -173,6 +187,7 @@ void format_DWDconv_filter(framework::Tensor *filter_tensor, float *scale_ptr,
// framework::make_ddim({num, 1, height, width});
// filter_tensor->Resize(dims_new);
filter_tensor->reset_data_ptr(new_data);
filter_tensor->set_type(typeid(int8_t));
}
void format_fc_filter(framework::Tensor *filter_tensor, float max_value) {
......@@ -187,6 +202,7 @@ void format_fc_filter(framework::Tensor *filter_tensor, float max_value) {
filter::format_fc_filter(&new_data, num, channel, height, width, 1,
max_value);
filter_tensor->reset_data_ptr(new_data);
filter_tensor->set_type(typeid(int8_t));
}
void format_deconv_filter(framework::Tensor *filter_tensor, float max_value,
int group_num, int stride) {
......@@ -213,6 +229,7 @@ void format_deconv_filter(framework::Tensor *filter_tensor, float max_value,
framework::make_ddim({num, channel, height, width});
filter_tensor->Resize(dims_new);
filter_tensor->reset_data_ptr(new_data);
filter_tensor->set_type(typeid(int8_t));
}
void format_bias_scale_array(float **bias_scale_array,
......@@ -236,6 +253,7 @@ void format_concat_output(framework::Tensor *out, int height, int width,
auto ddim = framework::make_ddim({1, sum_channel, height, width});
out->Resize(ddim);
out->reset_data_ptr(data_ptr);
out->set_type(typeid(half));
}
void format_conv_data(framework::Tensor *filter_tensor,
framework::Tensor *ofm_tensor, float **bs_ptr,
......@@ -447,9 +465,9 @@ void fill_split_arg(struct SplitConvArgs *arg, framework::Tensor *input,
int16_t leaky_relu_negative_slope, int group_num,
int stride_h, int stride_w, int padding_h, int padding_w,
float *bs_ptr) {
auto input_ptr = input->data<float>();
auto filter_ptr = filter->data<float>();
auto out_ptr = out->data<float>();
auto input_ptr = input->data<half>();
auto filter_ptr = filter->data<int8_t>();
auto out_ptr = out->data<half>();
auto deleter = [](void *p) { fpga_free(p); };
arg->group_num = (uint32_t)group_num;
......@@ -571,8 +589,8 @@ void fill_deconv_arg(struct DeconvArgs *arg, framework::Tensor *input,
int16_t leaky_relu_negative_slope, int group_num,
int stride_h, int stride_w, int padding_h, int padding_w,
float *bs_ptr) {
auto input_ptr = input->data<float>();
auto filter_ptr = filter->data<float>();
auto input_ptr = input->data<half>();
auto filter_ptr = filter->data<int8_t>();
auto deleter = [](void *p) { fpga_free(p); };
arg->group_num = (uint32_t)group_num;
......@@ -603,9 +621,9 @@ void fill_deconv_arg(struct DeconvArgs *arg, framework::Tensor *input,
framework::DDim dims_out_new = framework::make_ddim(
{1, arg->filter_num, sub_output_height * sub_conv_num, real_out_width});
fpga::format_fp16_ofm(out, dims_out_new);
auto out_ptr = out->data<float>();
auto out_ptr = out->data<half>();
arg->output.address =
(half *)out_ptr + // NOLINT
out_ptr +
omit_size * sizeof(half) *
(align_to_x(real_out_width * arg->filter_num, IMAGE_ALIGNMENT));
arg->output.scale_address = out->scale;
......@@ -695,7 +713,6 @@ void fill_deconv_arg(struct DeconvArgs *arg, framework::Tensor *input,
}
for (int j = 0; j < split_num; ++j) {
// arg->split_conv_args[i]->conv_arg[j].relu_enabled = relu_enabled;
arg->split_conv_args[i]->conv_arg[j].output.activation.activation_type =
activation_enable;
arg->split_conv_args[i]
......@@ -741,8 +758,8 @@ void fill_deconv_arg(struct DeconvArgs *arg, framework::Tensor *input,
align_to_x(arg->split_conv_args[i]->conv_arg[j].filter_num,
FILTER_NUM_ALIGNMENT) *
sizeof(int8_t);
auto filter_head = &((
int8_t *)filter_ptr)[j * element_num * filter_num_per_div + // NOLINT
auto filter_head =
&filter_ptr[j * element_num * filter_num_per_div + // NOLINT
i * filter_sub_conv_offset];
arg->split_conv_args[i]->conv_arg[j].filter_address =
fpga_malloc(filter_size);
......@@ -793,7 +810,7 @@ void fill_deconv_arg(struct DeconvArgs *arg, framework::Tensor *input,
arg->split_conv_args[i]->conv_arg[j].output.scale_address),
deleter));
}
arg->split_conv_args[i]->concat_arg.images_in[j] = static_cast<int16_t *>(
arg->split_conv_args[i]->concat_arg.images_in[j] = static_cast<half *>(
arg->split_conv_args[i]->conv_arg[j].output.address);
arg->split_conv_args[i]->concat_arg.scales_in[j] =
arg->split_conv_args[i]->conv_arg[j].output.scale_address;
......@@ -818,9 +835,13 @@ void fill_dwconv_arg(struct DWconvArgs *arg, framework::Tensor *input,
int16_t leaky_relu_negative_slope, int stride_h,
int stride_w, int padding_h, int padding_w,
float *bias_ptr) {
auto filter_ptr = filter->data<float>();
auto input_ptr = input->data<float>();
auto output_ptr = out->mutable_data<float>();
auto deleter = [](void *p) { fpga_free(p); };
arg->vector_dwconv_space.push_back(
std::shared_ptr<char>(reinterpret_cast<char *>(bias_ptr), deleter));
auto filter_ptr = filter->data<uint8_t>();
auto input_ptr = input->data<half>();
auto output_ptr = out->mutable_data<half>();
arg->sub_conv_num = 1;
// arg->relu_enabled = relu_enabled;
arg->output.activation.activation_type = activation_enable;
......@@ -848,9 +869,8 @@ void fill_DWDeconv_arg(struct DWDeconvArgs *arg, framework::Tensor *input,
int16_t leaky_relu_negative_slope, int stride_h,
int stride_w, int padding_h, int padding_w,
float *bias_ptr) {
auto filter_ptr = filter->data<float>();
auto input_ptr = input->data<float>();
auto output_ptr = out->mutable_data<float>();
auto filter_ptr = filter->data<int8_t>();
auto input_ptr = input->data<half>();
auto deleter = [](void *p) { fpga_free(p); };
......@@ -885,7 +905,7 @@ void fill_DWDeconv_arg(struct DWDeconvArgs *arg, framework::Tensor *input,
framework::DDim dims_out_new = framework::make_ddim(
{1, arg->filter_num, real_out_height, real_out_width});
fpga::format_fp16_ofm(out, dims_out_new);
auto out_ptr = out->data<float>();
auto out_ptr = out->data<half>();
/*====For Addition
arg->output.address =
......
......@@ -23,6 +23,7 @@ namespace paddle_mobile {
namespace fpga {
void format_image(framework::Tensor* image_tensor);
void format_ofm(framework::Tensor* ofm_tensor);
void format_fp16_ofm(framework::Tensor* ofm_tensor); // only allocate memory
void format_fp16_ofm(framework::Tensor* ofm_tensor, framework::DDim dims);
void format_fp32_ofm(framework::Tensor* ofm_tensor);
......
......@@ -247,6 +247,7 @@ void deconv_format_filter(float** data_in, int num, int channel, int height,
fpga_copy(ptr_space + i * align_offset, ptr_tmp, align_offset);
fpga_free(ptr_tmp);
}
fpga_free(ptr_ptr_data);
*data_in = reinterpret_cast<float*>(ptr_space);
/* {
......
......@@ -22,7 +22,6 @@ namespace fpga {
namespace image {
void convert_to_hwc(float **data_in, int channel, int height, int width) {
float *tmp = *data_in;
float *data_tmp =
(float *)fpga_malloc(channel * height * width * sizeof(float)); // NOLINT
int64_t amount_per_row = width * channel;
......@@ -35,14 +34,12 @@ void convert_to_hwc(float **data_in, int channel, int height, int width) {
}
}
*data_in = data_tmp;
fpga_free(tmp);
}
void align_element_conv(float **data_in, int height, int cw) {
int h = 0;
int align_cw = align_to_x(cw, IMAGE_ALIGNMENT);
if (align_cw != cw) {
float *tmp = *data_in;
float *data_tmp =
(float *)fpga_malloc(height * align_cw * sizeof(float)); // NOLINT
......@@ -55,13 +52,17 @@ void align_element_conv(float **data_in, int height, int cw) {
}
*data_in = data_tmp;
fpga_free(tmp);
}
}
void format_image(float **data_in, int channel, int height, int width) {
convert_to_hwc(data_in, channel, height, width);
int cw = channel * width;
int align_cw = align_to_x(cw, IMAGE_ALIGNMENT);
if (align_cw != cw) {
float *hwc_temp = *data_in;
align_element_conv(data_in, height, channel * width);
fpga_free(hwc_temp);
}
fpga_flush(*data_in, align_to_x(channel * width, IMAGE_ALIGNMENT) * height *
sizeof(float));
}
......
......@@ -290,14 +290,11 @@ int ComputeBasicConv(const struct ConvArgs &args) {
reg_writeq(args.driver.deconv_param, 0xd18);
reg_writeq(args.driver.fpga_bias_scale_len / 4, 0xd20);
reg_writeq(args.driver.cmd, REG_CONV_CMD);
DLOG << "before reg poll";
if (0 != fpga_regpoll(REG_INTERRUPT, INTERRUPT_CONV, PE_IRQ_TIMEOUT)) {
g_fpgainfo.pe_data->pes[PE_IDX_CONV]->status = ERROR;
ret = -EIO;
DLOG << "Conv Wait Irq Timeout!";
}
DLOG << "after reg poll";
output_scale = reg_readq(REG_SCALE_PARAMETER);
output_scale = (output_scale << 32) | (output_scale >> 32);
fpga_copy(args.output.scale_address, &output_scale, sizeof(float) * 2);
......
......@@ -164,7 +164,7 @@ void fpga_free(void *ptr) {
// DLOG << "Address: " << ptr << ", " << size << " bytes freed. Total "
// << counter << " bytes";
} else {
DLOG << "Invalid pointer";
DLOG << "Address: " << ptr << " Invalid pointer";
}
}
void fpga_copy(void *dest, const void *src, size_t num) {
......
......@@ -19,17 +19,16 @@ limitations under the License. */
#include <memory>
#include <vector>
namespace paddle_mobile {
namespace fpga {
#ifdef PADDLE_MOBILE_FPGA_V1
#define IMAGE_ALIGNMENT 16 // Aligned to 16
#define FILTER_NUM_ALIGNMENT 32 // Filter number aligned to 32
#define FILTER_ELEMENT_ALIGNMENT 16 // Filter element number aligned to 16
#define BS_NUM_ALIGNMENT 8
#define BIAS_NUM_ALIGNMENT 16
#define IMAGE_ALIGNMENT (16) // Aligned to 16
#define FILTER_NUM_ALIGNMENT (32) // Filter number aligned to 32
#define FILTER_ELEMENT_ALIGNMENT (16) // Filter element number aligned to 16
#define BS_NUM_ALIGNMENT (8)
#define BIAS_NUM_ALIGNMENT (16)
#endif
namespace paddle_mobile {
namespace fpga {
enum DataType {
DATA_TYPE_FP32 = 1,
DATA_TYPE_FP16 = 0,
......@@ -49,7 +48,7 @@ enum ActivationType {
};
struct ActivationArgs {
enum ActivationType activation_type;
enum ActivationType activation_type = NONE;
int16_t leaky_relu_negative_slope;
};
......@@ -188,6 +187,7 @@ struct SplitArgs {
uint32_t* out_channel_nums;
uint32_t height;
uint32_t width;
std::vector<std::shared_ptr<char>> vector_split_space;
};
struct PoolingArgs {
......@@ -237,6 +237,7 @@ struct DWconvArgs {
struct KernelArgs kernel;
struct ImageInputArgs image;
struct ImageOutputArgs output;
std::vector<std::shared_ptr<char>> vector_dwconv_space;
};
struct DWDeconvArgs {
......
......@@ -83,6 +83,11 @@ Executor<Device, T>::Executor(const Program<Device> &program,
// resize feed and fetch list
InitFeedFetchList();
#ifdef PADDLE_MOBILE_FPGA
program_.scope->EraseVars({"feed", "fetch"});
program_.scope->print_vars();
#endif
int count = 0;
for (auto &op_handler : ops_of_block0_) {
DLOG << "Initialize op[" << count++ << "]: " << op_handler->Type();
......@@ -291,6 +296,7 @@ template <typename Device, typename T>
bool Executor<Device, T>::varInputMemory(
const std::shared_ptr<VarDesc> &var_desc, Variable *var) const {
#ifdef PADDLE_MOBILE_FPGA
framework::LoDTensor *tensor = var->template GetMutable<LoDTensor>();
tensor->init(typeid(float));
return true;
#endif
......@@ -506,14 +512,41 @@ template <typename Device, typename T>
void Executor<Device, T>::InjectVariable(const Tensor &t,
std::string var_name) {
Variable *g_feed_value = program_.scope->Var(var_name);
Tensor *feed_tensor = g_feed_value->GetMutable<LoDTensor>();
Tensor *feed_tensor = g_feed_value->template GetMutable<LoDTensor>();
feed_tensor->Resize(t.dims());
feed_tensor->ShareDataWith(t);
}
template <typename Device, typename T>
void Executor<Device, T>::FeedData(const Tensor &t) {
InjectVariable(t, "feed");
InjectVariable(t, "feed0");
}
template <typename Device, typename T>
void Executor<Device, T>::FeedData(const std::vector<void *> &v) {
auto input_size = v.size();
auto vars = program_.scope->VarContain("feed");
PADDLE_MOBILE_ENFORCE(input_size == vars.size(),
"input data number not correct");
for (int i = 0; i < input_size; i++) {
auto var = program_.scope->Var("feed", i);
auto feed_tensor = var->template GetMutable<LoDTensor>();
feed_tensor->external_data = v[i];
}
}
template <typename Device, typename T>
void Executor<Device, T>::GetResults(std::vector<void *> *v) {
auto output_size = v->size();
PADDLE_MOBILE_ENFORCE(output_size > 0, "Empty output");
auto vars = program_.scope->VarContain("fetch");
PADDLE_MOBILE_ENFORCE(output_size == vars.size(),
"output data number not correct");
for (int i = 0; i < output_size; i++) {
auto var = program_.scope->Var("fetch", i);
auto fetch_tensor = var->template GetMutable<LoDTensor>();
(*v)[i] = fetch_tensor->template data<float>();
}
}
template <typename Device, typename T>
......
......@@ -52,6 +52,8 @@ class Executor {
#ifdef PADDLE_MOBILE_FPGA
void InjectVariable(const Tensor &t, std::string var_name);
void FeedData(const Tensor &t);
void FeedData(const std::vector<void *> &v);
void GetResults(std::vector<void *> *v);
std::shared_ptr<Tensor> FetchResult(int id = -1);
void Predict_From_To(int start = 0, int end = -1);
void Predict_From(int start);
......
......@@ -50,6 +50,9 @@ OperatorBase<Dtype>::OperatorBase(const std::string &type,
attrs_(attrs),
scope_(scope) {
CheckAllInputOutputSet();
#ifdef PADDLE_MOBILE_FPGA
InsertTensors();
#endif
}
template <typename Dtype>
......@@ -133,6 +136,25 @@ void OperatorBase<GPU_CL>::Run() {
}
#endif
#ifdef PADDLE_MOBILE_FPGA
template <typename Dtype>
void OperatorBase<Dtype>::InsertTensors() {
static int feed_num = 0;
static int fetch_num = 0;
if (type_ == "feed") {
auto new_name = string("feed") + std::to_string(feed_num++);
auto var = scope_->Var(new_name);
var->template GetMutable<framework::LoDTensor>();
inputs_.at("X") = {string(new_name)};
} else if (type_ == "fetch") {
auto new_name = string("fetch") + std::to_string(fetch_num++);
auto var = scope_->Var(new_name);
var->template GetMutable<framework::LoDTensor>();
outputs_.at("Out") = {string(new_name)};
}
}
#endif
template class OperatorBase<CPU>;
template class OperatorBase<FPGA>;
template class OperatorBase<GPU_MALI>;
......
......@@ -78,6 +78,9 @@ class OperatorBase {
this->scope_->EraseVars(var_names);
}
}
#ifdef PADDLE_MOBILE_FPGA
void InsertTensors();
#endif
protected:
framework::Scope *scope_;
......@@ -102,7 +105,6 @@ class OperatorWithKernel : public OperatorBase<Dtype> {
kernel_.InitCLHelper(scope->GetCLScpoe());
#endif
}
virtual void RunImpl() { this->kernel_.Compute(this->param_); }
virtual void InferShape() const = 0;
......
......@@ -72,7 +72,8 @@ void ProgramDesc::Description(std::string header) {
}
}
for (auto &attr : op->GetAttrMap()) {
LOG(kLOG_DEBUG2) << "attr name:: " << attr.first;
if (attr.first == "op_callstack") continue;
LOG(kLOG_DEBUG2) << "attr name: " << attr.first;
LOG(kLOG_DEBUG3) << "argument - " << attr.second;
}
}
......
......@@ -111,5 +111,29 @@ Variable *Scope::FindVarLocally(const std::string &name) const {
return nullptr;
}
#ifdef PADDLE_MOBILE_FPGA
Variable *Scope::Var(const std::string &name, const int id) {
return Var(name + std::to_string(id));
}
std::vector<Variable *> Scope::VarContain(const std::string substring) {
std::vector<Variable *> v;
for (auto pair : vars_) {
if (pair.first.find(substring) == 0) {
v.push_back(pair.second);
}
}
return v;
}
void Scope::print_vars() {
DLOG << "====================start to print variables=================";
for (auto pair : vars_) {
DLOG << pair.first;
}
DLOG << "==================complete printing variables================";
}
#endif
} // namespace framework
} // namespace paddle_mobile
......@@ -75,6 +75,12 @@ class Scope {
Variable *FindVarLocally(const std::string &name) const;
#ifdef PADDLE_MOBILE_FPGA
Variable *Var(const std::string &name, const int id);
std::vector<Variable *> VarContain(const std::string substring);
void print_vars();
#endif
#ifdef PADDLE_MOBILE_CL
CLScope *GetCLScpoe() { return cl_scope_; }
#endif
......
......@@ -202,6 +202,11 @@ class Tensor : public TensorBase {
inline void reset_data_ptr(void *p) {
((PlaceholderImpl *)(holder_.get()))->ptr_.reset((uint8_t *)p); // NOLINT
}
inline void set_type(std::type_index type) { holder_->set_type(type); }
inline void *get_data() {
return (
void *)(((PlaceholderImpl *)(holder_.get()))->ptr_.get()); // NOLINT
}
inline void *init(std::type_index type) {
if (holder_ != nullptr) {
......@@ -218,6 +223,7 @@ class Tensor : public TensorBase {
}
float scale[2]; // scale[0]= MAX/127.0, scale[1]= 127.0/MAX
void *external_data = nullptr; // only used for Feed
#endif
};
......
......@@ -110,6 +110,91 @@ bool PaddleMobilePredictor<Device, T>::Run(
return true;
}
#ifdef PADDLE_MOBILE_FPGA
template <typename Device, typename T>
bool PaddleMobilePredictor<Device, T>::Run(
const std::vector<PaddleTensor> &inputs,
std::vector<PaddleTensor> *output_data, std::vector<int> *index_data,
int batch_size) {
if (inputs.empty()) {
LOG(kLOG_ERROR) << "At least one output should be set with tensors' names.";
return false;
}
auto input = inputs[0];
if (input.shape.size() != 4) {
LOG(kLOG_ERROR) << "input shape not equal to 4!";
return false;
}
std::vector<int64_t> dims;
for (auto d : input.shape) {
dims.push_back(static_cast<int64_t>(d));
}
// use tensor
framework::DDim ddim =
framework::make_ddim({dims[0], dims[1], dims[2], dims[3]});
framework::Tensor input_tensor;
input_tensor.Resize(ddim);
int input_length = framework::product(ddim);
auto input_ptr = input_tensor.mutable_data<T>();
memcpy(input_ptr, static_cast<T *>(input.data.data()),
input_length * sizeof(T));
paddle_mobile_->Predict(input_tensor);
auto num_result = index_data->size();
if (output_data->size() != num_result) {
LOG(kLOG_ERROR) << "index and output number don't match";
return false;
}
for (int i = 0; i < num_result; i++) {
auto output_tensor = paddle_mobile_->FetchResult((*index_data)[i]);
if (output_data->empty()) {
LOG(kLOG_ERROR)
<< "At least one output should be set with tensors' names.";
return false;
}
auto &output = (*output_data)[i];
int output_length = output_tensor->numel();
std::vector<int64_t> tensor_shape =
framework::vectorize(output_tensor->dims());
for (auto d : tensor_shape) {
output.shape.push_back(static_cast<int>(d));
}
if (output.data.length() < output_length * sizeof(T)) {
output.data.Resize(output_length * sizeof(T));
}
memcpy(output.data.data(), output_tensor->template data<T>(),
output_length * sizeof(T));
}
return true;
}
template <typename Device, typename T>
void PaddleMobilePredictor<Device, T>::FeedData(
const std::vector<void *> &inputs) {
paddle_mobile_->FeedData(inputs);
}
template <typename Device, typename T>
void PaddleMobilePredictor<Device, T>::GetResults(
std::vector<void *> *outputs) {
paddle_mobile_->GetResults(outputs);
}
template <typename Device, typename T>
void PaddleMobilePredictor<Device, T>::Predict_From_To(int start, int end) {
paddle_mobile_->Predict_From_To(start, end);
}
#endif
template <typename Device, typename T>
PaddleMobilePredictor<Device, T>::~PaddleMobilePredictor() {
paddle_mobile_->Clear();
......
......@@ -31,7 +31,14 @@ class PaddleMobilePredictor : public PaddlePredictor {
bool Run(const std::vector<PaddleTensor>& inputs,
std::vector<PaddleTensor>* output_data,
int batch_size = -1) override;
#ifdef PADDLE_MOBILE_FPGA
bool Run(const std::vector<PaddleTensor>& inputs,
std::vector<PaddleTensor>* output_data, std::vector<int>* index_data,
int batch_size = -1) override;
void FeedData(const std::vector<void*>& inputs) override;
void GetResults(std::vector<void*>* outputs) override;
void Predict_From_To(int start = 0, int end = -1) override;
#endif
~PaddleMobilePredictor() override;
private:
......
......@@ -26,8 +26,16 @@ limitations under the License. */
#include <string>
#include <vector>
// #define PADDLE_MOBILE_FPGA
namespace paddle_mobile {
#ifdef PADDLE_MOBILE_FPGA
namespace fpga {
int open_device();
}
#endif
enum PaddleDType {
FLOAT32,
INT64,
......@@ -107,6 +115,14 @@ class PaddlePredictor {
std::string prog_file;
std::string param_file;
};
#ifdef PADDLE_MOBILE_FPGA
virtual bool Run(const std::vector<PaddleTensor>& inputs,
std::vector<PaddleTensor>* output_data,
std::vector<int>* index_data, int batch_size = -1) = 0;
virtual void FeedData(const std::vector<void*>& inputs) = 0;
virtual void GetResults(std::vector<void*>* outputs) = 0;
virtual void Predict_From_To(int start = 0, int end = -1) = 0;
#endif
protected:
PaddlePredictor() = default;
......
......@@ -228,6 +228,16 @@ void PaddleMobile<Device, T>::FeedData(const framework::Tensor &t) {
executor_->FeedData(t);
}
template <typename Device, typename T>
void PaddleMobile<Device, T>::FeedData(const std::vector<void *> &v) {
executor_->FeedData(v);
}
template <typename Device, typename T>
void PaddleMobile<Device, T>::GetResults(std::vector<void *> *v) {
executor_->GetResults(v);
}
template <typename Device, typename T>
std::shared_ptr<framework::Tensor> PaddleMobile<Device, T>::FetchResult(
int id) {
......
......@@ -90,6 +90,8 @@ class PaddleMobile {
#ifdef PADDLE_MOBILE_FPGA
void InjectVariable(const framework::Tensor &t, std::string var_name);
void FeedData(const framework::Tensor &t);
void FeedData(const std::vector<void *> &v);
void GetResults(std::vector<void *> *v);
std::shared_ptr<framework::Tensor> FetchResult(int id = -1);
void Predict_From_To(int start = 0, int end = -1);
void Predict_From(int start);
......
......@@ -22,6 +22,7 @@ namespace operators {
template <typename DeviceType, typename T>
void AnchorGeneratorOp<DeviceType, T>::InferShape() const {
const auto &input_dims = this->param_.input_->dims();
// DLOG << "AnchorGenerator input dim =" << input_dims.size();
PADDLE_MOBILE_ENFORCE(input_dims.size() == 4, "The layout of input is NCHW.");
const auto &anchor_sizes = this->param_.anchor_sizes_;
const auto &aspect_ratios = this->param_.aspect_ratios_;
......@@ -98,3 +99,15 @@ REGISTER_OPERATOR_CPU(psroi_pool, ops::PSRoiPoolOp);
REGISTER_OPERATOR_CPU(roi_perspective_transform, ops::RoiPerspectiveOp);
#endif
#endif
#ifdef PADDLE_MOBILE_FPGA
#ifdef ANCHOR_GENERATOR_OP
REGISTER_OPERATOR_FPGA(anchor_generator, ops::AnchorGeneratorOp);
#endif
#ifdef PROPOSAL_OP
REGISTER_OPERATOR_FPGA(generate_proposals, ops::ProposalOp);
#endif
#ifdef PSROI_POOL_OP
REGISTER_OPERATOR_FPGA(psroi_pool, ops::PSRoiPoolOp);
#endif
#endif
......@@ -103,6 +103,10 @@ class ProposalParam : public OpParam {
float nms_thresh_;
float min_size_;
float eta_;
#ifdef PADDLE_MOBILE_FPGA
std::shared_ptr<Tensor> float_score, float_bbox;
fpga::BypassArgs score_arg, bbox_arg;
#endif
};
DECLARE_KERNEL(Proposal, ProposalParam);
......@@ -133,6 +137,10 @@ class PSRoiPoolParam : public OpParam {
int pooled_height_;
int pooled_width_;
float spatial_scale_;
#ifdef PADDLE_MOBILE_FPGA
std::shared_ptr<Tensor> float_input, float_output;
fpga::BypassArgs input_arg, output_arg;
#endif
};
DECLARE_KERNEL(PSRoiPool, PSRoiPoolParam);
......
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifdef ANCHOR_GENERATOR_OP
#include <vector>
#include "operators/kernel/detection_kernel.h"
namespace paddle_mobile {
namespace operators {
template <>
bool AnchorGeneratorKernel<FPGA, float>::Init(
AnchorGeneratorParam<FPGA> *param) {
auto input = param->input_;
auto anchors = param->output_anchors_;
auto anchor_ptr = anchors->mutable_data<float>();
auto stride = param->stride_;
auto feature_width = input->dims()[3], feature_height = input->dims()[2];
auto stride_width = stride[0], stride_height = stride[1];
int anchors_offset[] = {-2, -2, 18, 18, -10, -9, 26, 25, -23,
-20, 39, 36, -43, -34, 59, 49, -63, -54,
79, 69, -96, -77, 112, 93, -137, -118, 153,
134, -204, -188, 220, 204, -281, -395, 296, 441};
int num_anchors = sizeof(anchors_offset) / (sizeof(int) * 4);
// DLOG << "feature_height: " << feature_height;
// DLOG << "feature_width: " << feature_width;
// DLOG << "num_anchors: " << num_anchors;
// DLOG << "stride_width: " << stride_width;
// DLOG << "stride_height: " << stride_height;
for (int h_idx = 0; h_idx < feature_height; ++h_idx) {
for (int w_idx = 0; w_idx < feature_width; ++w_idx) {
int offset = h_idx * w_idx * num_anchors * 4;
for (int idx = 0; idx < num_anchors; idx++) {
anchor_ptr[offset + 0] =
anchors_offset[idx * 4 + 0] + w_idx * stride_width;
anchor_ptr[offset + 1] =
anchors_offset[idx * 4 + 1] + h_idx * stride_height;
anchor_ptr[offset + 2] =
anchors_offset[idx * 4 + 2] + w_idx * stride_width;
anchor_ptr[offset + 3] =
anchors_offset[idx * 4 + 3] + h_idx * stride_height;
}
}
}
return true;
}
template <>
void AnchorGeneratorKernel<FPGA, float>::Compute(
const AnchorGeneratorParam<FPGA> &param) {}
} // namespace operators
} // namespace paddle_mobile
#endif // ANCHOR_GENERATOR_OP
......@@ -38,7 +38,7 @@ bool ConcatKernel<FPGA, float>::Init(ConcatParam<FPGA> *param) {
PADDLE_MOBILE_ENFORCE(
input->dims()[2] == height && input->dims()[3] == width,
"Image height & width should be unified");
images_in[i] = (half *)input->data<float>(); // NOLINT
images_in[i] = input->data<half>();
channel_num[i] = (uint32_t)inputs[i]->dims()[1]; // NOLINT
scales_in[i] = input->scale;
}
......@@ -48,7 +48,7 @@ bool ConcatKernel<FPGA, float>::Init(ConcatParam<FPGA> *param) {
concatArgs.image_num = image_num;
concatArgs.images_in = images_in;
concatArgs.scales_in = scales_in;
concatArgs.image_out = (half *)out->data<float>(); // NOLINT
concatArgs.image_out = out->data<half>();
concatArgs.scale_out = out->scale;
concatArgs.channel_num = channel_num;
concatArgs.height = height;
......
......@@ -26,11 +26,11 @@ bool ConvAddBNKernel<FPGA, float>::Init(FusionConvAddBNParam<FPGA> *param) {
paddle_mobile::fpga::ActivationType activation_enable =
paddle_mobile::fpga::NONE;
int16_t leaky_relu_negative_slope = 0;
auto input = const_cast<Tensor *>(param->Input());
auto input = const_cast<LoDTensor *>(param->Input());
auto bias = param->Bias();
auto bias_ptr = bias->data<float>();
auto filter = const_cast<Tensor *>(param->Filter());
auto filter = const_cast<LoDTensor *>(param->Filter());
auto out = param->Output();
......@@ -59,8 +59,6 @@ bool ConvAddBNKernel<FPGA, float>::Init(FusionConvAddBNParam<FPGA> *param) {
bs_ptr[i + channel] = new_scale_ptr[i];
bs_ptr[i] = new_bias_ptr[i];
}
param->SetNewScale(new_scale);
param->SetNewBias(new_bias);
fpga::format_conv_data(filter, out, &bs_ptr, param->Groups());
fpga::SplitConvArgs conv_arg = {0};
......@@ -70,6 +68,9 @@ bool ConvAddBNKernel<FPGA, float>::Init(FusionConvAddBNParam<FPGA> *param) {
param->Paddings()[0], param->Paddings()[1], bs_ptr);
param->SetFpgaArgs(conv_arg);
delete new_scale;
delete new_bias;
return true;
}
......
......@@ -27,10 +27,10 @@ bool ConvAddBNReluKernel<FPGA, float>::Init(
paddle_mobile::fpga::ActivationType activation_enable =
paddle_mobile::fpga::LEAKYRELU;
int16_t leaky_relu_negative_slope = 0;
auto input = const_cast<Tensor *>(param->Input());
auto input = const_cast<LoDTensor *>(param->Input());
auto bias = param->Bias();
auto bias_ptr = bias->data<float>();
auto filter = const_cast<Tensor *>(param->Filter());
auto filter = const_cast<LoDTensor *>(param->Filter());
auto out = param->Output();
vector<int> paddings = param->Paddings();
......@@ -60,8 +60,6 @@ bool ConvAddBNReluKernel<FPGA, float>::Init(
bs_ptr[i + channel] = new_scale_ptr[i];
bs_ptr[i] = new_bias_ptr[i];
}
param->SetNewScale(new_scale);
param->SetNewBias(new_bias);
const int groups = param->Groups();
if (groups == channel) {
......@@ -71,6 +69,8 @@ bool ConvAddBNReluKernel<FPGA, float>::Init(
leaky_relu_negative_slope, strides[0], strides[1],
paddings[0], paddings[1], new_bias_ptr);
param->SetFpgaArgs(dwconv_arg);
fpga::fpga_free(new_scale_ptr);
fpga::fpga_free(bs_ptr);
} else {
fpga::format_conv_data(filter, out, &bs_ptr, param->Groups());
fpga::SplitConvArgs conv_arg = {0};
......@@ -78,6 +78,8 @@ bool ConvAddBNReluKernel<FPGA, float>::Init(
leaky_relu_negative_slope, param->Groups(), strides[0],
strides[1], paddings[0], paddings[1], bs_ptr);
param->SetFpgaArgs(conv_arg);
delete new_scale;
delete new_bias;
}
return true;
}
......
......@@ -25,10 +25,10 @@ bool ConvAddKernel<FPGA, float>::Init(FusionConvAddParam<FPGA> *param) {
paddle_mobile::fpga::ActivationType activation_enable =
paddle_mobile::fpga::NONE;
int16_t leaky_relu_negative_slope = 0;
auto input = const_cast<Tensor *>(param->Input());
auto input = const_cast<LoDTensor *>(param->Input());
const Tensor *bias = param->Bias();
auto bias_ptr = bias->data<float>();
auto filter = const_cast<Tensor *>(param->Filter());
auto filter = const_cast<LoDTensor *>(param->Filter());
auto out = param->Output();
PADDLE_MOBILE_ENFORCE(out->dims()[1] == bias->dims()[0],
......
......@@ -25,10 +25,10 @@ bool ConvAddReluKernel<FPGA, float>::Init(FusionConvAddReluParam<FPGA> *param) {
paddle_mobile::fpga::ActivationType activation_enable =
paddle_mobile::fpga::LEAKYRELU;
int16_t leaky_relu_negative_slope = 0;
auto input = const_cast<Tensor *>(param->Input());
auto input = const_cast<LoDTensor *>(param->Input());
const Tensor *bias = param->Bias();
auto bias_ptr = bias->data<float>();
auto filter = const_cast<Tensor *>(param->Filter());
auto filter = const_cast<LoDTensor *>(param->Filter());
auto out = param->Output();
PADDLE_MOBILE_ENFORCE(out->dims()[1] == bias->dims()[0],
......
......@@ -26,8 +26,8 @@ bool ConvBNKernel<FPGA, float>::Init(FusionConvBNParam<FPGA> *param) {
paddle_mobile::fpga::ActivationType activation_enable =
paddle_mobile::fpga::NONE;
int16_t leaky_relu_negative_slope = 0;
auto input = const_cast<Tensor *>(param->Input());
auto filter = const_cast<Tensor *>(param->Filter());
auto input = const_cast<LoDTensor *>(param->Input());
auto filter = const_cast<LoDTensor *>(param->Filter());
auto out = param->Output();
auto bn_mean_ptr = param->InputMean()->data<float>();
auto bn_var_ptr = param->InputVariance()->data<float>();
......@@ -51,8 +51,6 @@ bool ConvBNKernel<FPGA, float>::Init(FusionConvBNParam<FPGA> *param) {
bs_ptr[i + channel] = new_scale_ptr[i];
bs_ptr[i] = new_bias_ptr[i];
}
param->SetNewScale(new_scale);
param->SetNewBias(new_bias);
fpga::format_conv_data(filter, out, &bs_ptr, param->Groups());
fpga::SplitConvArgs conv_arg = {0};
......@@ -61,6 +59,8 @@ bool ConvBNKernel<FPGA, float>::Init(FusionConvBNParam<FPGA> *param) {
param->Strides()[0], param->Strides()[1],
param->Paddings()[0], param->Paddings()[1], bs_ptr);
param->SetFpgaArgs(conv_arg);
delete new_scale;
delete new_bias;
return true;
}
......
......@@ -26,8 +26,8 @@ bool ConvBNReluKernel<FPGA, float>::Init(FusionConvBNReluParam<FPGA> *param) {
paddle_mobile::fpga::ActivationType activation_enable =
paddle_mobile::fpga::LEAKYRELU;
int16_t leaky_relu_negative_slope = 0;
auto input = const_cast<Tensor *>(param->Input());
auto filter = const_cast<Tensor *>(param->Filter());
auto input = const_cast<LoDTensor *>(param->Input());
auto filter = const_cast<LoDTensor *>(param->Filter());
auto out = param->Output();
auto bn_mean_ptr = param->InputMean()->data<float>();
auto bn_var_ptr = param->InputVariance()->data<float>();
......@@ -51,8 +51,6 @@ bool ConvBNReluKernel<FPGA, float>::Init(FusionConvBNReluParam<FPGA> *param) {
bs_ptr[i + channel] = new_scale_ptr[i];
bs_ptr[i] = new_bias_ptr[i];
}
param->SetNewScale(new_scale);
param->SetNewBias(new_bias);
fpga::format_conv_data(filter, out, &bs_ptr, param->Groups());
fpga::SplitConvArgs conv_arg = {0};
......@@ -61,6 +59,9 @@ bool ConvBNReluKernel<FPGA, float>::Init(FusionConvBNReluParam<FPGA> *param) {
param->Strides()[0], param->Strides()[1],
param->Paddings()[0], param->Paddings()[1], bs_ptr);
param->SetFpgaArgs(conv_arg);
delete new_scale;
delete new_bias;
return true;
}
......
......@@ -27,10 +27,10 @@ bool DeconvAddKernel<FPGA, float>::Init(FusionDeconvAddParam<FPGA> *param) {
paddle_mobile::fpga::ActivationType activation_enable =
paddle_mobile::fpga::NONE;
int16_t leaky_relu_negative_slope = 0;
auto input = const_cast<Tensor *>(param->Input());
auto input = const_cast<LoDTensor *>(param->Input());
const Tensor *bias = param->Bias();
auto bias_ptr = bias->data<float>();
auto filter = const_cast<Tensor *>(param->Filter());
auto filter = const_cast<LoDTensor *>(param->Filter());
auto out = param->Output();
PADDLE_MOBILE_ENFORCE(out->dims()[1] == bias->dims()[0],
......
......@@ -28,10 +28,10 @@ bool DeconvAddReluKernel<FPGA, float>::Init(
paddle_mobile::fpga::ActivationType activation_enable =
paddle_mobile::fpga::LEAKYRELU;
int16_t leaky_relu_negative_slope = 0;
auto input = const_cast<Tensor *>(param->Input());
auto input = const_cast<LoDTensor *>(param->Input());
const Tensor *bias = param->Bias();
auto bias_ptr = bias->data<float>();
auto filter = const_cast<Tensor *>(param->Filter());
auto filter = const_cast<LoDTensor *>(param->Filter());
auto out = param->Output();
PADDLE_MOBILE_ENFORCE(out->dims()[1] == bias->dims()[0],
......
......@@ -27,10 +27,10 @@ bool ElementwiseAddKernel<FPGA, float>::Init(ElementwiseAddParam<FPGA> *param) {
auto *input_x = const_cast<LoDTensor *>(param->InputX());
auto *input_y = const_cast<LoDTensor *>(param->InputY());
auto *out = param->Out();
auto input_x_ptr = input_x->data<float>();
auto input_y_ptr = input_y->data<float>();
auto input_x_ptr = input_x->data<half>();
auto input_y_ptr = input_y->data<half>();
fpga::format_fp16_ofm(out);
auto out_ptr = out->mutable_data<float>();
auto out_ptr = out->mutable_data<half>();
fpga::EWAddArgs ewaddArgs = {0};
// ewaddArgs.relu_enabled = relu_enabled;
......
......@@ -28,10 +28,10 @@ bool ElementwiseAddReluKernel<FPGA, float>::Init(
auto *input_x = const_cast<LoDTensor *>(param->InputX());
auto *input_y = const_cast<LoDTensor *>(param->InputY());
auto *out = param->Out();
auto input_x_ptr = input_x->data<float>();
auto input_y_ptr = input_y->data<float>();
auto input_x_ptr = input_x->data<half>();
auto input_y_ptr = input_y->data<half>();
fpga::format_fp16_ofm(out);
auto out_ptr = out->mutable_data<float>();
auto out_ptr = out->mutable_data<half>();
fpga::EWAddArgs ewaddArgs = {0};
// ewaddArgs.relu_enabled = relu_enabled;
......
......@@ -19,19 +19,37 @@ namespace operators {
template <>
bool FeedKernel<FPGA, float>::Init(FeedParam<FPGA> *param) {
Tensor *output = param->Out();
auto output = param->Out();
int col = param->Col();
auto input = const_cast<LoDTensor *>(&param->InputX()->at(col));
input->init(typeid(float));
input->Resize(output->dims());
if (output->dims().size() != 4) {
auto input_ptr = input->mutable_data<float>();
size_t size = output->numel() * sizeof(float);
auto p = fpga::fpga_malloc(size);
memcpy(p, input_ptr, size);
output->reset_data_ptr(p);
return true;
}
fpga::format_fp16_ofm(output);
return true;
}
template <>
void FeedKernel<FPGA, float>::Compute(const FeedParam<FPGA> &param) {
auto input =
reinterpret_cast<Tensor *>(const_cast<LoDTensor *>(param.InputX()));
auto output = param.Out();
int col = param.Col();
auto input = const_cast<LoDTensor *>(&param.InputX()->at(col));
if (input->dims().size() != 4) {
return;
}
fpga::format_image(input);
auto input_ptr = input->data<float>();
Tensor *output = param.Out();
auto output_ptr = output->data<float>();
auto output_ptr = output->data<half>();
fpga::BypassArgs args = {fpga::DATA_TYPE_FP32};
......@@ -39,7 +57,7 @@ void FeedKernel<FPGA, float>::Compute(const FeedParam<FPGA> &param) {
args.output_data_type = fpga::DATA_TYPE_FP16;
args.input_layout_type = fpga::LAYOUT_CHW;
args.output_layout_type = fpga::LAYOUT_HWC;
args.image.address = reinterpret_cast<void *>(input_ptr);
args.image.address = input_ptr;
args.image.channels = (uint32_t)input->dims()[1];
args.image.height = (uint32_t)input->dims()[2];
args.image.width = (uint32_t)input->dims()[3];
......@@ -48,6 +66,8 @@ void FeedKernel<FPGA, float>::Compute(const FeedParam<FPGA> &param) {
args.output.address = output_ptr;
args.output.scale_address = output->scale;
fpga::PerformBypass(args);
input->external_data = nullptr;
}
template class FeedKernel<FPGA, float>;
......
......@@ -19,20 +19,15 @@ namespace operators {
template <>
bool FetchKernel<FPGA, float>::Init(FetchParam<FPGA> *param) {
Tensor *output = param->Out();
// fpga::format_fp16_ofm(output);
auto input = const_cast<LoDTensor *>(param->InputX());
int col = param->Col();
auto output = &(param->Out()->at(col));
if (input->type() == typeid(float)) {
return true;
}
template <>
void FetchKernel<FPGA, float>::Compute(const FetchParam<FPGA> &param) {
param.Out()->ShareDataWith(*(param.InputX()));
/*auto input =
reinterpret_cast<Tensor *>(const_cast<Tensor *>(param.InputX()));
fpga::format_image(input);
auto input_ptr = input->data<float>();
Tensor *output = param.Out();
auto output_ptr = output->data<float>();
}
output->init(typeid(float));
output->Resize(input->dims());
fpga::format_fp32_ofm(output);
fpga::BypassArgs args = {fpga::DATA_TYPE_FP16};
......@@ -40,13 +35,33 @@ void FetchKernel<FPGA, float>::Compute(const FetchParam<FPGA> &param) {
args.output_data_type = fpga::DATA_TYPE_FP32;
args.input_layout_type = fpga::LAYOUT_CHW;
args.output_layout_type = fpga::LAYOUT_HWC;
args.image.address = reinterpret_cast<void *>(input_ptr);
args.image.channels = (uint32_t)input->dims()[1];
args.image.height = (input->dims().size() == 4) ? (uint32_t)input->dims()[2] :
1; args.image.width = (input->dims().size() == 4) ? (uint32_t)input->dims()[3]
: 1; args.image.pad_height = 0; args.image.pad_width = 0; args.output.address
= output_ptr; args.output.scale_address = output->scale;
fpga::PerformBypass(args);*/
args.image.address = input->data<half>();
args.image.channels = (uint32_t)product(input->dims());
args.image.height = 1;
args.image.width = 1;
args.image.pad_height = 0;
args.image.pad_width = 0;
args.output.address = output->data<float>();
args.output.scale_address = output->scale;
param->fpga_bypass_args = args;
return true;
}
template <>
void FetchKernel<FPGA, float>::Compute(const FetchParam<FPGA> &param) {
auto input = param.InputX();
if (input->type() == typeid(float)) {
int col = param.Col();
auto output = &(param.Out()->at(col));
output->ShareDataWith(*input);
return;
}
fpga::PerformBypass(param.fpga_bypass_args);
fpga::fpga_invalidate(param.fpga_bypass_args.output.address,
param.fpga_bypass_args.image.channels * sizeof(float));
// TODO: DEalign: get rid of extra 0
}
template class FetchKernel<FPGA, float>;
......
......@@ -25,7 +25,7 @@ bool FusionFcKernel<FPGA, float>::Init(FusionFcParam<FPGA> *param) {
paddle_mobile::fpga::NONE;
int16_t leaky_relu_negative_slope = 0;
auto input_x = const_cast<LoDTensor *>(param->InputX());
auto filter = const_cast<Tensor *>(param->InputY());
auto filter = const_cast<LoDTensor *>(param->InputY());
const Tensor *input_z = param->InputZ();
auto input_z_ptr = input_z->data<float>();
auto out = param->Out();
......
......@@ -21,11 +21,11 @@ namespace operators {
template <>
bool PoolKernel<FPGA, float>::Init(PoolParam<FPGA> *param) {
auto *input = const_cast<Tensor *>(param->Input());
auto input_ptr = input->data<float>();
auto *input = const_cast<LoDTensor *>(param->Input());
auto input_ptr = input->data<half>();
Tensor *output = param->Output();
fpga::format_fp16_ofm(output);
auto output_ptr = output->mutable_data<float>();
auto output_ptr = output->mutable_data<half>();
vector<int> ksize = param->Ksize();
vector<int> strides = param->Strides();
vector<int> paddings = param->Paddings();
......
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifdef PROPOSAL_OP
#include <algorithm>
#include <cmath>
#include <vector>
#include "operators/kernel/detection_kernel.h"
namespace paddle_mobile {
namespace operators {
static const double kBBoxClipDefault = std::log(1000.0 / 16.0);
template <>
bool ProposalKernel<FPGA, float>::Init(ProposalParam<FPGA> *param) {
int post_nms_top_n = param->post_nms_topn_;
int64_t batch = param->scores_->dims()[0];
auto total = post_nms_top_n * batch;
param->rpn_rois_->mutable_data<float>({total, 4});
param->rpn_probs_->mutable_data<float>({total, 1});
// DLOG << *param->rpn_rois_;
// DLOG << *param->rpn_probs_;
param->float_bbox = std::make_shared<Tensor>();
param->float_bbox->Resize(param->bbox_deltas_->dims());
param->float_bbox->init(typeid(float));
fpga::format_fp32_ofm(param->float_bbox.get());
param->float_score = std::make_shared<Tensor>();
param->float_score->Resize(param->scores_->dims());
param->float_score->init(typeid(float));
fpga::format_fp32_ofm(param->float_score.get());
auto input = param->bbox_deltas_;
fpga::BypassArgs args = {fpga::DATA_TYPE_FP16};
args.input_layout_type = fpga::LAYOUT_HWC;
args.output_layout_type = fpga::LAYOUT_HWC;
args.input_data_type = fpga::DATA_TYPE_FP16;
args.output_data_type = fpga::DATA_TYPE_FP32;
args.image.address = input->data<half>();
args.image.height = (uint32_t)input->dims()[2];
args.image.width = (uint32_t)input->dims()[3];
args.image.channels = (uint32_t)input->dims()[1];
args.output.address = param->float_bbox->mutable_data<float>();
args.output.scale_address = param->float_bbox->scale;
param->bbox_arg = args;
input = param->scores_;
args.image.address = input->data<half>();
args.image.height = (uint32_t)input->dims()[2];
args.image.width = (uint32_t)input->dims()[3];
args.image.channels = (uint32_t)input->dims()[1];
args.output.address = param->float_score->mutable_data<float>();
args.output.scale_address = param->float_score->scale;
param->score_arg = args;
return true;
}
void AppendProposals(Tensor *dst, int64_t offset, const Tensor &src) {
auto *out_data = dst->data<void>();
auto *to_add_data = src.data<void>();
size_t size_of_t = framework::SizeOfType(src.type());
offset *= size_of_t;
std::memcpy(
reinterpret_cast<void *>(reinterpret_cast<uintptr_t>(out_data) + offset),
to_add_data, src.numel() * size_of_t);
}
template <class T>
static inline void BoxCoder(Tensor *all_anchors, Tensor *bbox_deltas,
Tensor *variances, Tensor *proposals) {
T *proposals_data = proposals->mutable_data<T>();
int64_t row = all_anchors->dims()[0];
int64_t len = all_anchors->dims()[1];
auto *bbox_deltas_data = bbox_deltas->data<T>();
auto *anchor_data = all_anchors->data<T>();
const T *variances_data = nullptr;
if (variances) {
variances_data = variances->data<T>();
}
for (int64_t i = 0; i < row; ++i) {
T anchor_width = anchor_data[i * len + 2] - anchor_data[i * len] + 1.0;
T anchor_height = anchor_data[i * len + 3] - anchor_data[i * len + 1] + 1.0;
T anchor_center_x = anchor_data[i * len] + 0.5 * anchor_width;
T anchor_center_y = anchor_data[i * len + 1] + 0.5 * anchor_height;
T bbox_center_x = 0, bbox_center_y = 0;
T bbox_width = 0, bbox_height = 0;
if (variances) {
bbox_center_x =
variances_data[i * len] * bbox_deltas_data[i * len] * anchor_width +
anchor_center_x;
bbox_center_y = variances_data[i * len + 1] *
bbox_deltas_data[i * len + 1] * anchor_height +
anchor_center_y;
bbox_width = std::exp(std::min<T>(variances_data[i * len + 2] *
bbox_deltas_data[i * len + 2],
kBBoxClipDefault)) *
anchor_width;
bbox_height = std::exp(std::min<T>(variances_data[i * len + 3] *
bbox_deltas_data[i * len + 3],
kBBoxClipDefault)) *
anchor_height;
} else {
bbox_center_x =
bbox_deltas_data[i * len] * anchor_width + anchor_center_x;
bbox_center_y =
bbox_deltas_data[i * len + 1] * anchor_height + anchor_center_y;
bbox_width = std::exp(std::min<T>(bbox_deltas_data[i * len + 2],
kBBoxClipDefault)) *
anchor_width;
bbox_height = std::exp(std::min<T>(bbox_deltas_data[i * len + 3],
kBBoxClipDefault)) *
anchor_height;
}
proposals_data[i * len] = bbox_center_x - bbox_width / 2;
proposals_data[i * len + 1] = bbox_center_y - bbox_height / 2;
proposals_data[i * len + 2] = bbox_center_x + bbox_width / 2 - 1;
proposals_data[i * len + 3] = bbox_center_y + bbox_height / 2 - 1;
}
// return proposals;
}
template <class T>
static inline void ClipTiledBoxes(const Tensor &im_info, Tensor *boxes) {
T *boxes_data = boxes->mutable_data<T>();
const T *im_info_data = im_info.data<T>();
T zero(0);
for (int64_t i = 0; i < boxes->numel(); ++i) {
if (i % 4 == 0) {
boxes_data[i] =
std::max(std::min(boxes_data[i], im_info_data[1] - 1), zero);
} else if (i % 4 == 1) {
boxes_data[i] =
std::max(std::min(boxes_data[i], im_info_data[0] - 1), zero);
} else if (i % 4 == 2) {
boxes_data[i] =
std::max(std::min(boxes_data[i], im_info_data[1] - 1), zero);
} else {
boxes_data[i] =
std::max(std::min(boxes_data[i], im_info_data[0] - 1), zero);
}
}
}
template <class T>
static inline void FilterBoxes(Tensor *boxes, float min_size,
const Tensor &im_info, Tensor *keep) {
const T *im_info_data = im_info.data<T>();
T *boxes_data = boxes->mutable_data<T>();
T im_scale = im_info_data[2];
keep->Resize({boxes->dims()[0]});
min_size = std::max(min_size, 1.0f);
int *keep_data = keep->mutable_data<int>();
int keep_len = 0;
for (int i = 0; i < boxes->dims()[0]; ++i) {
T ws = boxes_data[4 * i + 2] - boxes_data[4 * i] + 1;
T hs = boxes_data[4 * i + 3] - boxes_data[4 * i + 1] + 1;
T ws_origin_scale =
(boxes_data[4 * i + 2] - boxes_data[4 * i]) / im_scale + 1;
T hs_origin_scale =
(boxes_data[4 * i + 3] - boxes_data[4 * i + 1]) / im_scale + 1;
T x_ctr = boxes_data[4 * i] + ws / 2;
T y_ctr = boxes_data[4 * i + 1] + hs / 2;
if (ws_origin_scale >= min_size && hs_origin_scale >= min_size &&
x_ctr <= im_info_data[1] && y_ctr <= im_info_data[0]) {
keep_data[keep_len++] = i;
}
}
keep->Resize({keep_len});
}
template <class T>
static inline std::vector<std::pair<T, int>> GetSortedScoreIndex(
const std::vector<T> &scores) {
std::vector<std::pair<T, int>> sorted_indices;
sorted_indices.reserve(scores.size());
for (size_t i = 0; i < scores.size(); ++i) {
sorted_indices.emplace_back(scores[i], i);
}
// Sort the score pair according to the scores in descending order
std::stable_sort(sorted_indices.begin(), sorted_indices.end(),
[](const std::pair<T, int> &a, const std::pair<T, int> &b) {
return a.first < b.first;
});
return sorted_indices;
}
template <class T>
static inline T BBoxArea(const T *box, bool normalized) {
if (box[2] < box[0] || box[3] < box[1]) {
// If coordinate values are is invalid
// (e.g. xmax < xmin or ymax < ymin), return 0.
return static_cast<T>(0.);
} else {
const T w = box[2] - box[0];
const T h = box[3] - box[1];
if (normalized) {
return w * h;
} else {
// If coordinate values are not within range [0, 1].
return (w + 1) * (h + 1);
}
}
}
template <typename T>
static inline Tensor VectorToTensor(const std::vector<T> &selected_indices,
int selected_num) {
Tensor keep_nms;
keep_nms.Resize({selected_num});
auto *keep_data = keep_nms.mutable_data<T>();
for (int i = 0; i < selected_num; ++i) {
keep_data[i] = selected_indices[i];
}
return keep_nms;
}
template <class T>
static inline T JaccardOverlap(const T *box1, const T *box2, bool normalized) {
if (box2[0] > box1[2] || box2[2] < box1[0] || box2[1] > box1[3] ||
box2[3] < box1[1]) {
return static_cast<T>(0.);
} else {
const T inter_xmin = std::max(box1[0], box2[0]);
const T inter_ymin = std::max(box1[1], box2[1]);
const T inter_xmax = std::min(box1[2], box2[2]);
const T inter_ymax = std::min(box1[3], box2[3]);
const T inter_w = std::max(T(0), inter_xmax - inter_xmin + 1);
const T inter_h = std::max(T(0), inter_ymax - inter_ymin + 1);
const T inter_area = inter_w * inter_h;
const T bbox1_area = BBoxArea<T>(box1, normalized);
const T bbox2_area = BBoxArea<T>(box2, normalized);
return inter_area / (bbox1_area + bbox2_area - inter_area);
}
}
template <class T>
static inline Tensor NMS(Tensor *bbox, Tensor *scores, T nms_threshold,
float eta) {
int64_t num_boxes = bbox->dims()[0];
// 4: [xmin ymin xmax ymax]
int64_t box_size = bbox->dims()[1];
std::vector<T> scores_data(num_boxes);
std::copy_n(scores->data<T>(), num_boxes, scores_data.begin());
std::vector<std::pair<T, int>> sorted_indices =
GetSortedScoreIndex<T>(scores_data);
std::vector<int> selected_indices;
int selected_num = 0;
T adaptive_threshold = nms_threshold;
const T *bbox_data = bbox->data<T>();
while (sorted_indices.size() != 0) {
int idx = sorted_indices.back().second;
bool flag = true;
for (int kept_idx : selected_indices) {
if (flag) {
T overlap = JaccardOverlap<T>(bbox_data + idx * box_size,
bbox_data + kept_idx * box_size, false);
flag = (overlap <= adaptive_threshold);
} else {
break;
}
}
if (flag) {
selected_indices.push_back(idx);
++selected_num;
}
sorted_indices.erase(sorted_indices.end() - 1);
if (flag && eta < 1 && adaptive_threshold > 0.5) {
adaptive_threshold *= eta;
}
}
return VectorToTensor(selected_indices, selected_num);
}
template <typename T>
std::pair<Tensor, Tensor> ProposalForOneImage(
const Tensor &im_info_slice, const Tensor &anchors, const Tensor &variances,
const Tensor &bbox_deltas_slice, // [M, 4]
const Tensor &scores_slice, // [N, 1]
int pre_nms_top_n, int post_nms_top_n, float nms_thresh, float min_size,
float eta) {
auto *scores_data = scores_slice.data<T>();
// Sort index
Tensor index_t;
index_t.Resize({scores_slice.numel()});
int *index = index_t.mutable_data<int>();
for (int i = 0; i < scores_slice.numel(); ++i) {
index[i] = i;
}
auto compare = [scores_data](const int64_t &i, const int64_t &j) {
return scores_data[i] > scores_data[j];
};
if (pre_nms_top_n <= 0 || pre_nms_top_n >= scores_slice.numel()) {
std::sort(index, index + scores_slice.numel(), compare);
} else {
std::nth_element(index, index + pre_nms_top_n, index + scores_slice.numel(),
compare);
index_t.Resize({pre_nms_top_n});
}
Tensor scores_sel, bbox_sel, anchor_sel, var_sel;
scores_sel.mutable_data<T>({index_t.numel(), 1});
bbox_sel.mutable_data<T>({index_t.numel(), 4});
anchor_sel.mutable_data<T>({index_t.numel(), 4});
var_sel.mutable_data<T>({index_t.numel(), 4});
Tensor proposals;
proposals.mutable_data<T>({index_t.numel(), 4});
BoxCoder<T>(&anchor_sel, &bbox_sel, &var_sel, &proposals);
ClipTiledBoxes<T>(im_info_slice, &proposals);
Tensor keep;
FilterBoxes<T>(&proposals, min_size, im_info_slice, &keep);
Tensor scores_filter;
bbox_sel.mutable_data<T>({keep.numel(), 4});
scores_filter.mutable_data<T>({keep.numel(), 1});
if (nms_thresh <= 0) {
return std::make_pair(bbox_sel, scores_filter);
}
Tensor keep_nms = NMS<T>(&bbox_sel, &scores_filter, nms_thresh, eta);
if (post_nms_top_n > 0 && post_nms_top_n < keep_nms.numel()) {
keep_nms.Resize({post_nms_top_n});
}
proposals.mutable_data<T>({keep_nms.numel(), 4});
scores_sel.mutable_data<T>({keep_nms.numel(), 1});
return std::make_pair(proposals, scores_sel);
}
template <>
void ProposalKernel<FPGA, float>::Compute(const ProposalParam<FPGA> &param) {
auto score_tensor = param.float_score.get();
fpga::PerformBypass(param.score_arg);
fpga::fpga_invalidate(score_tensor->data<float>(),
score_tensor->numel() * sizeof(float));
auto bbox_tensor = param.float_bbox.get();
fpga::PerformBypass(param.bbox_arg);
fpga::fpga_invalidate(bbox_tensor->data<float>(),
bbox_tensor->numel() * sizeof(float));
auto *scores = param.float_score.get();
auto *bbox_deltas = param.float_bbox.get();
auto *im_info = param.im_info_;
auto anchors = *param.anchors_;
auto variances = *param.variances_;
auto *rpn_rois = param.rpn_rois_;
auto *rpn_roi_probs = param.rpn_probs_;
int pre_nms_top_n = param.pre_nms_topn_;
int post_nms_top_n = param.post_nms_topn_;
float nms_thresh = param.nms_thresh_;
float min_size = param.min_size_;
float eta = param.eta_;
auto &scores_dim = scores->dims();
int64_t num = scores_dim[0];
int64_t c_score = scores_dim[1];
int64_t h_score = scores_dim[2];
int64_t w_score = scores_dim[3];
auto &bbox_dim = bbox_deltas->dims();
int64_t c_bbox = bbox_dim[1];
int64_t h_bbox = bbox_dim[2];
int64_t w_bbox = bbox_dim[3];
//
Tensor bbox_deltas_swap, scores_swap;
bbox_deltas_swap.mutable_data<float>({num, h_bbox, w_bbox, c_bbox});
scores_swap.mutable_data<float>({num, h_score, w_score, c_score});
framework::LoD lod;
lod.resize(1);
auto &lod0 = lod[0];
lod0.push_back(0);
anchors.Resize({anchors.numel() / 4, 4});
int64_t num_proposals = 0;
for (int64_t i = 0; i < num; ++i) {
Tensor im_info_slice = im_info->Slice(i, i + 1);
Tensor bbox_deltas_slice = bbox_deltas_swap.Slice(i, i + 1);
Tensor scores_slice = scores_swap.Slice(i, i + 1);
bbox_deltas_slice.Resize({h_bbox * w_bbox * c_bbox / 4, 4});
scores_slice.Resize({h_score * w_score * c_score, 1});
std::pair<Tensor, Tensor> tensor_pair = ProposalForOneImage<float>(
im_info_slice, anchors, variances, bbox_deltas_slice, scores_slice,
pre_nms_top_n, post_nms_top_n, nms_thresh, min_size, eta);
Tensor &proposals = tensor_pair.first;
Tensor &scores = tensor_pair.second;
AppendProposals(rpn_rois, 4 * num_proposals, proposals);
AppendProposals(rpn_roi_probs, num_proposals, scores);
num_proposals += proposals.dims()[0];
lod0.push_back(num_proposals);
}
rpn_rois->set_lod(lod);
rpn_roi_probs->set_lod(lod);
rpn_rois->Resize({num_proposals, 4});
rpn_roi_probs->Resize({num_proposals, 1});
}
} // namespace operators
} // namespace paddle_mobile
#endif // PROPOSAL_OP
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifdef PSROI_POOL_OP
#include <cmath>
#include <vector>
#include "operators/kernel/detection_kernel.h"
namespace paddle_mobile {
namespace operators {
template <>
bool PSRoiPoolKernel<FPGA, float>::Init(PSRoiPoolParam<FPGA>* param) {
auto dims = param->input_x_->dims();
PADDLE_MOBILE_ENFORCE(dims[1] * dims[3] % IMAGE_ALIGNMENT == 0,
"data not aligned");
param->float_input = std::make_shared<Tensor>();
param->float_input->mutable_data<float>(param->input_x_->dims());
param->float_output = std::make_shared<Tensor>();
param->float_output->mutable_data<float>(param->output_->dims());
auto input = param->input_x_;
fpga::BypassArgs args = {fpga::DATA_TYPE_FP16};
args.input_layout_type = fpga::LAYOUT_HWC;
args.output_layout_type = fpga::LAYOUT_HWC;
args.input_data_type = fpga::DATA_TYPE_FP16;
args.output_data_type = fpga::DATA_TYPE_FP32;
args.image.address = input->data<half>();
args.image.height = (uint32_t)input->dims()[2];
args.image.width = (uint32_t)input->dims()[3];
args.image.channels = (uint32_t)input->dims()[1];
args.output.address = param->float_input->mutable_data<float>();
args.output.scale_address = param->float_input->scale;
param->input_arg = args;
fpga::format_fp16_ofm(param->output_);
input = param->float_output.get();
args.input_data_type = fpga::DATA_TYPE_FP32;
args.output_data_type = fpga::DATA_TYPE_FP16;
args.image.address = input->data<float>();
args.image.height = (uint32_t)input->dims()[2];
args.image.width = (uint32_t)input->dims()[3];
args.image.channels = (uint32_t)input->dims()[1];
args.output.address = param->output_->mutable_data<half>();
args.output.scale_address = param->output_->scale;
param->input_arg = args;
return true;
}
template <>
void PSRoiPoolKernel<FPGA, float>::Compute(const PSRoiPoolParam<FPGA>& param) {
auto input_tensor = param.float_input.get();
fpga::PerformBypass(param.input_arg);
fpga::fpga_invalidate(input_tensor->data<float>(),
input_tensor->numel() * sizeof(float));
auto* in = input_tensor;
auto* rois = param.input_rois_;
auto* out = param.float_output.get();
auto pooled_height = param.pooled_height_;
auto pooled_width = param.pooled_width_;
auto spatial_scale = param.spatial_scale_;
auto output_channels = param.output_channels_;
auto in_dims = in->dims();
int batch_size = in_dims[0];
int input_channels = in_dims[1];
int height = in_dims[2];
int width = in_dims[3];
int rois_num = rois->dims()[0];
// TODO auto in_stride = framework::stride(in_dims);
// TODO auto out_stride = framework::stride(out->dims());
auto in_stride =
framework::stride({batch_size, height, width, input_channels});
auto out_stride = framework::stride(
{out->dims()[0], out->dims()[2], out->dims()[3], out->dims()[1]});
const float* input_data = in->data<float>();
framework::Tensor rois_batch_id_list;
rois_batch_id_list.Resize({rois_num});
auto rois_batch_id_data = rois_batch_id_list.mutable_data<int>();
return;
PADDLE_MOBILE_ENFORCE(rois->NumLevels() > 0, "ROIS should not be empty");
auto rois_lod = rois->lod().back();
int rois_batch_size = rois_lod.size() - 1;
PADDLE_MOBILE_ENFORCE(
rois_batch_size == batch_size,
"the rois_batch_size and input(X) batch_size should be the same.");
int rois_num_with_lod = rois_lod[rois_batch_size];
PADDLE_MOBILE_ENFORCE(rois_num_with_lod == rois_num,
"the rois_num from input and lod must be the same");
PADDLE_MOBILE_ENFORCE(
input_channels == output_channels * pooled_height * pooled_width,
"the channels of input X should equal the product of "
"output_channels x pooled_height x pooled_width");
// calculate batch id index for each roi according to LoD
for (int n = 0; n < rois_batch_size; ++n) {
for (size_t i = rois_lod[n]; i < rois_lod[n + 1]; ++i) {
rois_batch_id_data[i] = n;
}
}
auto output_data = out->mutable_data<float>();
auto input_rois = rois->data<float>();
// calculate psroipooling, parallel processing can be implemented per ROI
for (int n = 0; n < rois_num; ++n) {
// set roi batch id
int roi_batch_id = rois_batch_id_data[n];
// [start, end) interval for spatial sampling
auto offset_input_rois = input_rois + n * 4;
auto roi_start_w =
static_cast<float>(round(offset_input_rois[0])) * spatial_scale;
auto roi_start_h =
static_cast<float>(round(offset_input_rois[1])) * spatial_scale;
auto roi_end_w =
static_cast<float>(round(offset_input_rois[2]) + 1.) * spatial_scale;
auto roi_end_h =
static_cast<float>(round(offset_input_rois[3]) + 1.) * spatial_scale;
// Force too small rois to be 1 x 1
auto roi_height = std::max(roi_end_h - roi_start_h, 0.1f); // avoid 0
auto roi_width = std::max(roi_end_w - roi_start_w, 0.1f);
// Compute bin size w and h at input feature map
auto bin_size_h = roi_height / static_cast<float>(pooled_height);
auto bin_size_w = roi_width / static_cast<float>(pooled_width);
DLOG << 3;
// calculate each pixel of the output feature map.
int out_roi_offset = n * out_stride[0];
for (int c = 0; c < output_channels; ++c) {
// per category
// int out_plane_offset = out_roi_offset + c * out_stride[1];
int out_plane_offset = out_roi_offset + c;
for (int ph = 0; ph < pooled_height; ++ph) {
// TODO int out_row_offset = out_plane_offset + ph *
// out_stride[2];
int out_row_offset = out_plane_offset + ph * out_stride[1];
for (int pw = 0; pw < pooled_width; ++pw) {
// calculate w and h at input feature map
int hstart = floor(static_cast<float>(ph) * bin_size_h + roi_start_h);
int wstart = floor(static_cast<float>(pw) * bin_size_w + roi_start_w);
int hend =
ceil(static_cast<float>(ph + 1) * bin_size_h + roi_start_h);
int wend =
ceil(static_cast<float>(pw + 1) * bin_size_w + roi_start_w);
// Add roi offsets and clip to input boundaries
hstart = std::min(std::max(hstart, 0), height);
wstart = std::min(std::max(wstart, 0), width);
hend = std::min(std::max(hend, 0), height);
wend = std::min(std::max(wend, 0), width);
// TODO int output_index = out_row_offset + pw;
int output_index = out_row_offset + pw * output_channels;
int input_channel = (c * pooled_height + ph) * pooled_width + pw;
// TODO int input_plane_offset =
// TODO roi_batch_id * in_stride[0] + input_channel *
// in_stride[1];
int input_plane_offset = roi_batch_id * in_stride[0] + input_channel;
auto offset_input_data = input_data + input_plane_offset;
float out_sum = 0.;
bool is_empty = (hend <= hstart) || (wend <= wstart);
for (int ih = hstart; ih < hend; ++ih) {
for (int iw = wstart; iw < wend; ++iw) {
int input_index = ih * in_stride[1] + iw * input_channel;
out_sum += offset_input_data[input_index];
}
}
float bin_area = (hend - hstart) * (wend - wstart);
output_data[output_index] = is_empty ? 0. : out_sum / bin_area;
}
}
}
}
fpga::format_image(out);
fpga::PerformBypass(param.output_arg);
}
} // namespace operators
} // namespace paddle_mobile
#endif // PSROI_POOL_OP
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifdef RESHAPE2_OP
#include "operators/kernel/reshape2_kernel.h"
#include "framework/ddim.h"
namespace paddle_mobile {
namespace operators {
template <>
bool Reshape2Kernel<FPGA, float>::Init(Reshape2Param<FPGA> *param) {
auto input = const_cast<LoDTensor *>(param->InputX());
auto output = param->Out();
auto shape = param->Shape();
auto num_in = framework::product(input->dims());
auto num_shape = framework::product(framework::make_ddim(shape));
PADDLE_MOBILE_ENFORCE(num_shape != 0, "0 index is not supported");
for (int i = 0; i < shape.size(); i++) {
if (shape[i] == -1) {
shape[i] = static_cast<int>(-num_in / num_shape);
break;
}
}
output->Resize(framework::make_ddim(shape));
output->set_type(input->type());
fpga::format_ofm(output);
DLOG << "input: " << input;
DLOG << "output: " << output;
return true;
}
void reshape(LoDTensor *input, LoDTensor *output) {
// Subscript r means after reshape
// TODO zhangyang verify this function
float *input_ptr_f, *output_ptr_f;
half *input_ptr_h, *output_ptr_h;
bool is_float = false;
if (input->type() == typeid(float)) {
input_ptr_f = input->data<float>();
output_ptr_f = output->data<float>();
is_float = true;
} else {
input_ptr_h = input->data<half>();
output_ptr_h = output->data<half>();
}
auto C = static_cast<int>(input->dims()[1]);
auto H = static_cast<int>(input->dims()[2]);
auto W = static_cast<int>(input->dims()[3]);
auto Cr = static_cast<int>(output->dims()[1]);
auto Hr = static_cast<int>(output->dims()[2]);
auto Wr = static_cast<int>(output->dims()[3]);
PADDLE_MOBILE_ENFORCE(C * H * W == Cr * Hr * Wr, "Dims don't match");
auto WC = W * C;
auto WC_align = fpga::align_to_x(WC, IMAGE_ALIGNMENT);
auto HW = H * W;
auto WCr = Wr * Cr;
auto WCr_align = fpga::align_to_x(WCr, IMAGE_ALIGNMENT);
auto HWr = Hr * Wr;
int offset_align = 0;
int offset_r = 0, offset_align_r = 0;
int cr = 0, hr = 0, wr = 0;
for (int h = 0; h < H; h++) {
int offset0 = h * WC_align;
for (int w = 0; w < W; w++) {
int offset1 = w * C + offset0;
for (int c = 0; c < C; c++) {
offset_align = offset1 + c;
offset_r = c * HW + h * W + c;
cr = offset_r / HWr;
hr = offset_r % HWr / Wr;
wr = offset_r % Wr;
offset_align_r = hr * WCr_align + wr * Cr + cr;
// DLOG << "hwc"<< h<< " " << w << " " << c;
// DLOG << "hrwrcr" << hr<< " " << wr << " " << cr;
if (is_float) {
output_ptr_f[offset_align_r] = input_ptr_f[offset_align];
} else {
output_ptr_h[offset_align_r] = input_ptr_h[offset_align];
}
}
}
}
}
template <>
void Reshape2Kernel<FPGA, float>::Compute(const Reshape2Param<FPGA> &param) {
auto input = const_cast<LoDTensor *>(param.InputX());
auto output = param.Out();
auto shape = param.Shape();
auto num_in = framework::product(input->dims());
auto num_shape = framework::product(framework::make_ddim(shape));
PADDLE_MOBILE_ENFORCE(num_shape != 0, "0 index is not supported");
for (int i = 0; i < shape.size(); i++) {
if (shape[i] == -1) {
shape[i] = static_cast<int>(-num_in / num_shape);
break;
}
}
output->Resize(framework::make_ddim(shape));
if (output->dims() == input->dims()) {
DLOG << "No need to reshape";
return;
}
reshape(input, output);
//
}
} // namespace operators
} // namespace paddle_mobile
#endif
......@@ -25,7 +25,7 @@ bool SigmoidKernel<FPGA, float>::Init(SigmoidParam<FPGA> *param) {
paddle_mobile::fpga::SIGMOID;
int16_t leaky_relu_negative_slope = 0;
auto input = const_cast<Tensor *>(param->InputX());
auto input_ptr = input->data<float>();
auto input_ptr = input->data<half>();
auto out = param->Out();
fpga::format_fp16_ofm(out);
......@@ -38,7 +38,7 @@ bool SigmoidKernel<FPGA, float>::Init(SigmoidParam<FPGA> *param) {
args.image.width =
(input->dims().size() == 4) ? (uint32_t)input->dims()[3] : 1;
args.image.channels = (uint32_t)input->dims()[1];
args.output.address = out->data<float>();
args.output.address = out->data<half>();
args.output.scale_address = out->scale;
args.output.activation.activation_type = activation_enable;
args.output.activation.leaky_relu_negative_slope = leaky_relu_negative_slope;
......
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifdef SLICE_OP
#include "operators/kernel/slice_kernel.h"
namespace paddle_mobile {
namespace operators {
template <>
bool SliceKernel<FPGA, float>::Init(SliceParam<FPGA>* param) {
auto output = param->output_;
fpga::format_fp16_ofm(output);
DLOG << "input: " << param->input_;
DLOG << "output: " << param->output_;
if (param->input_->type() != typeid(half)) {
DLOG << "wrong type";
}
return true;
}
template <>
void SliceKernel<FPGA, float>::Compute(const SliceParam<FPGA>& param) {
// Only support slicing in channel dimension
auto input = param.input_;
DLOG << input;
int HW = input->dims()[2] * input->dims()[3];
int channel = input->dims()[1];
auto input_ptr = input->data<half>();
auto output_ptr = param.output_->data<half>();
int start = param.starts_[0], end = param.ends_[0];
start = start < 0 ? start + channel : start;
end = end < 0 ? end + channel : end;
start = start > channel ? channel : start;
end = end > channel ? channel : end;
int len = end - start;
for (int i = 0; i < HW; i++) {
memcpy(output_ptr + len * i, input_ptr + i * channel + start, len);
}
}
} // namespace operators
} // namespace paddle_mobile
#endif
......@@ -23,49 +23,72 @@ namespace operators {
template <>
bool SoftmaxKernel<FPGA, float>::Init(SoftmaxParam<FPGA> *param) {
auto input = const_cast<LoDTensor *>(param->InputX());
auto input_ptr = input->data<float>();
auto input_ptr = input->data<half>();
auto out = param->Out();
fpga::format_fp32_ofm(out);
auto float_input = new Tensor;
if (input->dims().size() == 2) {
float_input->mutable_data<float>({1, input->dims()[1]});
} else if (input->dims().size() == 4) {
float_input->mutable_data<float>(
{1, input->dims()[2], input->dims()[3], input->dims()[1]});
} else {
DLOG << "wrong dimension of softmax input";
auto float_input = new LoDTensor;
PADDLE_MOBILE_ENFORCE(input->dims().size() == 4,
"Softmax should have 4-order input");
auto dims = framework::vectorize(input->dims());
auto channel = dims[3];
if (channel == 1) { // This input is generated by FC op, dims = [N C 1 1]
PADDLE_MOBILE_ENFORCE(dims[2] == 1, "Softmax input must come from FC op");
dims[3] = dims[1];
dims[1] = 1;
}
input->Resize(framework::make_ddim(dims));
float_input->Resize(framework::make_ddim(dims));
if (channel != 2) { // Use CPU
float_input->init(typeid(float));
fpga::format_fp32_ofm(float_input);
fpga::format_fp32_ofm(out);
fpga::BypassArgs args = {fpga::DATA_TYPE_FP16};
args.input_layout_type = fpga::LAYOUT_HWC;
args.output_layout_type = fpga::LAYOUT_CHW;
args.input_data_type = fpga::DATA_TYPE_FP16;
args.output_data_type = fpga::DATA_TYPE_FP32;
args.image.address = input_ptr;
args.image.height =
(input->dims().size() == 4) ? (uint32_t)input->dims()[2] : 1;
args.image.width =
(input->dims().size() == 4) ? (uint32_t)input->dims()[3] : 1;
args.image.channels = (uint32_t)input->dims()[1];
args.image.height = (uint32_t)dims[1];
args.image.width = (uint32_t)dims[2];
args.image.channels = (uint32_t)dims[3];
args.output.address = float_input->data<float>();
args.output.scale_address = float_input->scale;
param->SetFloatInput(float_input);
param->SetFpgaArgs(args);
} else { // Use FPGA
fpga::format_fp16_ofm(out);
fpga::BypassArgs args = {fpga::DATA_TYPE_FP16};
args.input_layout_type = fpga::LAYOUT_HWC;
args.output_layout_type = fpga::LAYOUT_CHW;
args.input_data_type = fpga::DATA_TYPE_FP16;
args.output_data_type = fpga::DATA_TYPE_FP16;
args.image.address = input_ptr;
args.image.height = (uint32_t)input->dims()[1];
args.image.width = (uint32_t)input->dims()[2];
args.image.channels = (uint32_t)input->dims()[3];
args.output.address = out->data<half>();
args.output.scale_address = out->scale;
args.output.activation.activation_type = fpga::SOFTMAX;
param->SetFpgaArgs(args);
}
return true;
}
template <>
void SoftmaxKernel<FPGA, float>::Compute(const SoftmaxParam<FPGA> &param) {
Tensor *in_x = param.FloatInput();
Tensor *out = param.Out();
fpga::PerformBypass(param.FpgaArgs());
fpga::fpga_invalidate((void *)in_x->data<float>(), // NOLINT
in_x->numel() * sizeof(float));
// TODO: In general case, 0 should be squeezed before softmax input // NOLINT
if (param.FpgaArgs().output.activation.activation_type != fpga::SOFTMAX) {
Tensor *out = param.Out();
Tensor *in_x = param.FloatInput();
fpga::fpga_invalidate(in_x->data<float>(), in_x->numel() * sizeof(float));
math::SoftmaxFuntor<CPU, float>()(in_x, out);
fpga::fpga_flush(out->data<float>(), out->memory_size());
}
}
} // namespace operators
......
......@@ -20,7 +20,7 @@ namespace paddle_mobile {
namespace operators {
template <>
bool SplitKernel<FPGA, float>::Init(SplitParam<FPGA> *param) {
auto *in = const_cast<Tensor *>(param->InputX());
auto *in = const_cast<LoDTensor *>(param->InputX());
auto outs = param->Outs();
auto sections = param->Sections();
int axis = param->Axis();
......@@ -34,22 +34,32 @@ bool SplitKernel<FPGA, float>::Init(SplitParam<FPGA> *param) {
fpga::fpga_malloc(image_num * sizeof(float *)));
auto out_channels = reinterpret_cast<uint32_t *>(
fpga::fpga_malloc(image_num * sizeof(uint32_t)));
DLOG << "input: " << in;
for (int i = 0; i < image_num; i++) {
fpga::format_fp16_ofm(outs[i]);
images_out[i] = outs[i]->mutable_data<float>();
DLOG << "output: " << outs[i];
images_out[i] = outs[i]->mutable_data<half>();
scales_out[i] = outs[i]->scale;
out_channels[i] = (uint32_t)sections[i];
}
auto deleter = [](void *p) { fpga::fpga_free(p); };
fpga::SplitArgs arg = {0};
arg.image_num = image_num;
arg.image_in = (half *)in->data<float>();
arg.image_in = in->data<half>();
arg.scale_in = in->scale;
arg.images_out = images_out;
arg.scales_out = scales_out;
arg.out_channel_nums = out_channels;
arg.height = (uint32_t)in->dims()[2];
arg.width = (uint32_t)in->dims()[3];
arg.vector_split_space.push_back(
std::shared_ptr<char>(reinterpret_cast<char *>(images_out), deleter));
arg.vector_split_space.push_back(
std::shared_ptr<char>(reinterpret_cast<char *>(scales_out), deleter));
arg.vector_split_space.push_back(
std::shared_ptr<char>(reinterpret_cast<char *>(out_channels), deleter));
param->SetFpgaArgs(arg);
return true;
......
......@@ -21,9 +21,11 @@ namespace operators {
template <>
bool TanhKernel<FPGA, float>::Init(TanhParam<FPGA> *param) {
auto input = const_cast<Tensor *>(param->InputX());
auto input_ptr = input->data<float>();
auto float_input = new Tensor;
auto input = const_cast<LoDTensor *>(param->InputX());
DLOG << "input: " << input;
auto input_ptr = input->data<half>();
auto float_input = new LoDTensor;
float_input->mutable_data<float>(
{1, input->dims()[1], input->dims()[2], input->dims()[3]});
fpga::format_fp32_ofm(float_input);
......
......@@ -20,7 +20,21 @@ namespace operators {
template <>
bool Transpose2Kernel<FPGA, float>::Init(Transpose2Param<FPGA> *param) {
param->Out()->ShareDataWith(*param->InputX());
auto input = param->InputX();
auto output = param->Out();
auto axis = param->Axis();
auto dim = input->dims();
output->ShareDataWith(*input);
auto dim_v = vectorize(dim);
for (int i = 0; i < axis.size(); i++) {
dim_v[i] = dim[axis[i]];
}
output->Resize(framework::make_ddim(dim_v));
DLOG << "input: " << input;
DLOG << "output: " << output;
return true;
}
......
......@@ -1053,7 +1053,7 @@ class SoftmaxParam : public OpParam {
GType *FloatInput() const {
return float_input_x_ == nullptr ? input_x_ : float_input_x_.get();
}
void SetFloatInput(Tensor *input) { float_input_x_.reset(input); }
void SetFloatInput(LoDTensor *input) { float_input_x_.reset(input); }
const fpga::BypassArgs &FpgaArgs() const { return fpga_bypass_args; }
void SetFpgaArgs(const fpga::BypassArgs &args) { fpga_bypass_args = args; }
#endif
......@@ -1212,18 +1212,8 @@ class FetchParam : public OpParam {
framework::LoDTensorArray *out_;
int col_;
#ifdef PADDLE_MOBILE_FPGA
private:
std::shared_ptr<GType> float_input_x_;
fpga::BypassArgs fpga_bypass_args;
public:
GType *FloatInput() const {
return float_input_x_ == nullptr ? input_x_ : float_input_x_.get();
}
void SetFloatInput(Tensor *input) { float_input_x_.reset(input); }
const fpga::BypassArgs &FpgaArgs() const { return fpga_bypass_args; }
void SetFpgaArgs(const fpga::BypassArgs &args) { fpga_bypass_args = args; }
fpga::BypassArgs fpga_bypass_args;
#endif
};
......@@ -1660,7 +1650,7 @@ class TanhParam : public OpParam {
GType *FloatInput() const {
return float_input_x_ == nullptr ? input_x_ : float_input_x_.get();
}
void SetFloatInput(Tensor *input) { float_input_x_.reset(input); }
void SetFloatInput(LoDTensor *input) { float_input_x_.reset(input); }
const fpga::BypassArgs &FpgaArgs() const { return fpga_bypass_args; }
void SetFpgaArgs(const fpga::BypassArgs &args) { fpga_bypass_args = args; }
#endif
......
......@@ -43,5 +43,8 @@ REGISTER_OPERATOR_CPU(reshape2, ops::Reshape2Op);
#ifdef PADDLE_MOBILE_MALI_GPU
REGISTER_OPERATOR_MALI_GPU(reshape2, ops::Reshape2Op);
#endif
#ifdef PADDLE_MOBILE_FPGA
REGISTER_OPERATOR_FPGA(reshape2, ops::Reshape2Op);
#endif
#endif
......@@ -74,6 +74,9 @@ if (CON GREATER -1)
ADD_EXECUTABLE(test-densebox fpga/test_densebox_combine.cpp test_helper.h test_include.h executor_for_test.h)
target_link_libraries(test-densebox paddle-mobile)
ADD_EXECUTABLE(test-rfcn fpga/test_rfcn.cpp test_helper.h test_include.h executor_for_test.h)
target_link_libraries(test-rfcn paddle-mobile)
set(FOUND_MATCH ON)
endif ()
......
......@@ -51,8 +51,8 @@ void convert_to_chw(int16_t **data_in, int channel, int height, int width,
}
}
void dump(std::string filename, const Tensor input_tensor) {
auto dataptr = input_tensor.data<float>();
void dump(std::string filename, Tensor input_tensor) {
auto dataptr = reinterpret_cast<half *>(input_tensor.get_data());
std::ofstream out(filename.c_str());
float result = 0;
for (int i = 0; i < input_tensor.numel(); ++i) {
......@@ -61,16 +61,16 @@ void dump(std::string filename, const Tensor input_tensor) {
}
out.close();
}
void dump_stride(std::string filename, const Tensor input_tensor,
void dump_stride_half(std::string filename, Tensor input_tensor,
const int dumpnum) {
int c = (input_tensor.dims())[1];
int h = (input_tensor.dims())[2];
int w = (input_tensor.dims())[3];
auto data_ptr = input_tensor.data<float>();
int16_t *data_tmp = (int16_t *)malloc(c * h * w * sizeof(int16_t));
int16_t *data_ptr_16 = (int16_t *)data_ptr;
auto data_ptr = input_tensor.get_data();
auto *data_tmp =
reinterpret_cast<half *>(malloc(c * h * w * sizeof(int16_t)));
auto *data_ptr_16 = reinterpret_cast<half *>(data_ptr);
convert_to_chw(&data_ptr_16, c, h, w, data_tmp);
// const int16_t *dataptr = input_tensor.data<int16_t>();
std::ofstream out(filename.c_str());
float result = 0;
int stride = input_tensor.numel() / dumpnum;
......@@ -82,6 +82,20 @@ void dump_stride(std::string filename, const Tensor input_tensor,
out.close();
free(data_tmp);
}
void dump_stride_float(std::string filename, Tensor input_tensor,
const int dumpnum) {
auto data_ptr = reinterpret_cast<float *>(input_tensor.get_data());
std::ofstream out(filename.c_str());
float result = 0;
int stride = input_tensor.numel() / dumpnum;
stride = stride > 0 ? stride : 1;
for (int i = 0; i < input_tensor.numel(); i += stride) {
result = data_ptr[i];
out << result << std::endl;
}
out.close();
}
static const char *g_resnet50 = "../models/resnet50";
const std::string g_image_src_float = "../images/image_src_float";
int main() {
......@@ -98,24 +112,21 @@ int main() {
for (int i = 0; i < 73; i++) {
auto tensor_ptr = paddle_mobile.FetchResult(i);
std::string saveName = "resnet50_result_" + std::to_string(i);
paddle_mobile::fpga::fpga_invalidate((*tensor_ptr).data<float>(),
paddle_mobile::fpga::fpga_invalidate((*tensor_ptr).get_data(),
tensor_ptr->numel() * sizeof(half));
dump_stride(saveName, (*tensor_ptr), 20);
dump_stride_half(saveName, (*tensor_ptr), 20);
// dump(saveName, (*tensor_ptr));
}
std::shared_ptr<Tensor> output_tensor = paddle_mobile.FetchResult(73);
//(*output_tensor).dump<float>("resnet50_result_73");
output_tensor = paddle_mobile.FetchResult(74);
//(*output_tensor).dump<float>("resnet50_result_74");
// std::shared_ptr<Tensor> output_tensor = paddle_mobile.FetchResult(74);
// output_tensor = paddle_mobile.FetchResult(74);
auto tensor_ptr = paddle_mobile.FetchResult(73);
dump_stride_float("resnet50_result_73", (*tensor_ptr), 20);
tensor_ptr = paddle_mobile.FetchResult(74);
dump_stride_float("resnet50_result_74", (*tensor_ptr), 9999);
float max = 0;
auto data_ptr = output_tensor->data<float>();
auto data_ptr = tensor_ptr->data<float>();
int maximumIdx = 0;
for (int i = 0; i < (*output_tensor).numel(); i++) {
for (int i = 0; i < (*tensor_ptr).numel(); i++) {
if (data_ptr[i] > max) {
maximumIdx = i;
max = data_ptr[i];
......
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <iostream>
#include "../test_helper.h"
#include "../test_include.h"
#ifdef PADDLE_MOBILE_FPGA_V1
#include "fpga/V1/api.h"
#endif
#ifdef PADDLE_MOBILE_FPGA_V2
#include "fpga/V2/api.h"
#endif
void readStream(std::string filename, uint8_t *buf) {
std::ifstream in;
in.open(filename, std::ios::in);
if (!in.is_open()) {
std::cout << "open File Failed." << std::endl;
return;
}
int i = 0;
while (!in.eof()) {
in >> buf[i];
i++;
}
in.close();
}
static const char *g_rfcn_combine = "../models/rfcn";
static const char *g_image_src_float = "../models/rfcn/data.bin";
int main() {
paddle_mobile::fpga::open_device();
paddle_mobile::PaddleMobile<paddle_mobile::FPGA> paddle_mobile;
if (paddle_mobile.Load(std::string(g_rfcn_combine) + "/model",
std::string(g_rfcn_combine) + "/params", true, false,
1, true)) {
float img_info[3] = {768, 1536, 768.0f / 960.0f};
auto img = fpga::fpga_malloc(768 * 1536 * 3 * sizeof(float));
readStream(g_image_src_float, reinterpret_cast<uint8_t *>(img));
std::vector<void *> v(3, nullptr);
paddle_mobile.FeedData({img_info, img});
paddle_mobile.Predict_To(-1);
paddle_mobile.GetResults(&v);
DLOG << "Computation done";
fpga::fpga_free(img);
}
return 0;
}
......@@ -126,6 +126,11 @@ if (CON GREATER -1)
set(RESHAPE_OP ON)
set(FUSION_CONVADDBNRELU_OP ON)
set(FUSION_CONVADDBN_OP ON)
set(RESHAPE2_OP ON)
set(PSROI_POOL_OP ON)
set(PROPOSAL_OP ON)
set(ANCHOR_GENERATOR_OP ON)
set(SLICE_OP ON)
set(FOUND_MATCH ON)
endif()
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册