Commit 09d5c269 authored by xiebaiyuan

Merge remote-tracking branch 'refs/remotes/upstream/develop' into develop_s1p0

@@ -105,12 +105,14 @@ const char *G_OP_TYPE_FUSION_DECONV_ADD_RELU = "fusion_deconv_add_relu";
 const char *G_OP_TYPE_SEQUENCE_EXPAND = "sequence_expand";
 const char *G_OP_TYPE_SEQUENCE_POOL = "sequence_pool";
 const char *G_OP_TYPE_SEQUENCE_SOFTMAX = "sequence_softmax";
 const char *G_OP_TYPE_SLICE = "slice";
 const char *G_OP_TYPE_ANCHOR_GENERATOR = "anchor_generator";
 const char *G_OP_TYPE_GENERATE_PROPOSALS = "generate_proposals";
 const char *G_OP_TYPE_PSROI_POOL = "psroi_pool";
 const char *G_OP_TYPE_ROI_PERSPECTIVE = "roi_perspective_transform";
+const char *G_OP_TYPE_PAD2D = "pad2d";
+const char *G_OP_TYPE_FUSION_DECONV_ADD_BN_RELU = "fusion_deconv_add_bn_relu";
+const char *G_OP_TYPE_FUSION_DECONV_ADD_BN = "fusion_deconv_add_bn";
 std::unordered_map<
     std::string, std::pair<std::vector<std::string>, std::vector<std::string>>>
@@ -210,5 +212,8 @@ std::unordered_map<
         {{"Scores", "BboxDeltas", "ImInfo", "Anchors", "Variances"},
          {"RpnRois", "RpnRoiProbs"}}},
        {G_OP_TYPE_PSROI_POOL, {{"X", "ROIs"}, {"Out"}}},
-       {G_OP_TYPE_ROI_PERSPECTIVE, {{"X", "ROIs"}, {"Out"}}}};
+       {G_OP_TYPE_ROI_PERSPECTIVE, {{"X", "ROIs"}, {"Out"}}},
+       {G_OP_TYPE_FUSION_DECONV_ADD_BN_RELU, {{"Input"}, {"Out"}}},
+       {G_OP_TYPE_FUSION_DECONV_ADD_BN, {{"Input"}, {"Out"}}},
+       {G_OP_TYPE_PAD2D, {{"X"}, {"Out"}}}};
 }  // namespace paddle_mobile
@@ -199,6 +199,9 @@ extern const char *G_OP_TYPE_ANCHOR_GENERATOR;
 extern const char *G_OP_TYPE_GENERATE_PROPOSALS;
 extern const char *G_OP_TYPE_PSROI_POOL;
 extern const char *G_OP_TYPE_ROI_PERSPECTIVE;
+extern const char *G_OP_TYPE_PAD2D;
+extern const char *G_OP_TYPE_FUSION_DECONV_ADD_BN_RELU;
+extern const char *G_OP_TYPE_FUSION_DECONV_ADD_BN;
 extern std::unordered_map<
     std::string, std::pair<std::vector<std::string>, std::vector<std::string>>>
...
...@@ -162,7 +162,7 @@ void format_dwconv_filter(framework::Tensor *filter_tensor, float *scale_ptr) { ...@@ -162,7 +162,7 @@ void format_dwconv_filter(framework::Tensor *filter_tensor, float *scale_ptr) {
fpga_copy(new_data, data_ptr, memory_size); fpga_copy(new_data, data_ptr, memory_size);
filter::format_dwconv_filter(&new_data, num, height, width, scale_ptr); filter::format_dwconv_filter(&new_data, num, height, width, scale_ptr);
filter_tensor->reset_data_ptr(new_data); filter_tensor->reset_data_ptr(new_data);
filter_tensor->set_type(typeid(int8_t)); filter_tensor->set_type(typeid(int16_t));
} }
void format_DWDconv_filter(framework::Tensor *filter_tensor, float *scale_ptr, void format_DWDconv_filter(framework::Tensor *filter_tensor, float *scale_ptr,
...@@ -396,8 +396,8 @@ void expand_conv_arg(ConvArgs *arg) { ...@@ -396,8 +396,8 @@ void expand_conv_arg(ConvArgs *arg) {
// auto cmd = 0UL | (args.relu_enabled ? USE_RELU : 0) | USE_BIAS; // auto cmd = 0UL | (args.relu_enabled ? USE_RELU : 0) | USE_BIAS;
auto cmd = 0UL | USE_BIAS; auto cmd = 0UL | USE_BIAS;
auto deconv_param = ((args.deconv_tx_param.deconv_en) << 24) | auto deconv_param = ((args.deconv_tx_param.deconv_en) << 16) |
((args.deconv_tx_param.sub_conv_num) << 16) | ((args.deconv_tx_param.sub_conv_num) << 8) |
((args.deconv_tx_param.omit_size) << 0); ((args.deconv_tx_param.omit_size) << 0);
(*arg).driver.image_address_phy = vaddr_to_paddr(args.image.address); (*arg).driver.image_address_phy = vaddr_to_paddr(args.image.address);
(*arg).driver.sb_address_phy = vaddr_to_paddr(args.sb_address); (*arg).driver.sb_address_phy = vaddr_to_paddr(args.sb_address);
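Note: the hunk above repacks the deconv control word, moving deconv_en from bit 24 down to bit 16 and sub_conv_num from bit 16 down to bit 8, with omit_size staying at bit 0. A minimal standalone sketch of the new layout; the helper name and field widths are assumptions inferred from the shifts, not taken from a driver spec:

#include <cstdint>

// Hypothetical helper mirroring the new packing in expand_conv_arg:
// deconv_en at bit 16, sub_conv_num at bit 8, omit_size at bit 0.
uint32_t pack_deconv_param(uint32_t deconv_en, uint32_t sub_conv_num,
                           uint32_t omit_size) {
  return (deconv_en << 16) | (sub_conv_num << 8) | (omit_size << 0);
}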
@@ -623,7 +623,7 @@ void fill_deconv_arg(struct DeconvArgs *arg, framework::Tensor *input,
   fpga::format_fp16_ofm(out, dims_out_new);
   auto out_ptr = out->data<half>();
   arg->output.address =
-      out_ptr +
+      (half *)out_ptr +  // NOLINT
       omit_size * sizeof(half) *
           (align_to_x(real_out_width * arg->filter_num, IMAGE_ALIGNMENT));
   arg->output.scale_address = out->scale;
@@ -713,6 +713,7 @@ void fill_deconv_arg(struct DeconvArgs *arg, framework::Tensor *input,
   }
   for (int j = 0; j < split_num; ++j) {
+    // arg->split_conv_args[i]->conv_arg[j].relu_enabled = relu_enabled;
     arg->split_conv_args[i]->conv_arg[j].output.activation.activation_type =
         activation_enable;
     arg->split_conv_args[i]
@@ -758,8 +759,8 @@ void fill_deconv_arg(struct DeconvArgs *arg, framework::Tensor *input,
         align_to_x(arg->split_conv_args[i]->conv_arg[j].filter_num,
                    FILTER_NUM_ALIGNMENT) *
         sizeof(int8_t);
-    auto filter_head =
-        &filter_ptr[j * element_num * filter_num_per_div +  // NOLINT
+    auto filter_head = &((
+        int8_t *)filter_ptr)[j * element_num * filter_num_per_div +  // NOLINT
                              i * filter_sub_conv_offset];
     arg->split_conv_args[i]->conv_arg[j].filter_address =
         fpga_malloc(filter_size);
@@ -774,6 +775,19 @@ void fill_deconv_arg(struct DeconvArgs *arg, framework::Tensor *input,
     fpga_flush(arg->split_conv_args[i]->conv_arg[j].filter_address,
                filter_size);
+    /*{
+      static int cnt = 0;
+      std::string str = "deconv_filter";
+      if(cnt <= 1){
+        cnt++;
+        str += std::to_string(cnt);
+        int8_t result = 0;
+        fpga::savefile<int8_t>(str,
+          arg->split_conv_args[i]->conv_arg[j].filter_address, filter_size, result);
+      }
+    }*/
     size_t bs_align_num = align_to_x(
         arg->split_conv_args[i]->conv_arg[j].filter_num, BS_NUM_ALIGNMENT);
     size_t bs_size = 2 * bs_align_num * sizeof(float);
@@ -789,6 +803,20 @@ void fill_deconv_arg(struct DeconvArgs *arg, framework::Tensor *input,
     memcpy(arg->split_conv_args[i]->conv_arg[j].sb_address, bs_head, bs_size);
     fpga_flush(arg->split_conv_args[i]->conv_arg[j].sb_address, bs_size);
+    /* {
+      static int cnt = 0;
+      std::string str = "deconv_sb";
+      if(cnt <= 1){
+        cnt++;
+        str += std::to_string(cnt);
+        float result = 0;
+        fpga::savefile<float>(str,
+          arg->split_conv_args[i]->conv_arg[j].sb_address, 2 * bs_align_num,
+          result);
+      }
+    }*/
     if (split_num == 1) {
       arg->split_conv_args[i]->conv_arg[j].output.address =
           arg->split_conv_args[i]->output.address;
@@ -835,13 +863,10 @@ void fill_dwconv_arg(struct DWconvArgs *arg, framework::Tensor *input,
                      int16_t leaky_relu_negative_slope, int stride_h,
                      int stride_w, int padding_h, int padding_w,
                      float *bias_ptr) {
-  auto deleter = [](void *p) { fpga_free(p); };
-  arg->vector_dwconv_space.push_back(
-      std::shared_ptr<char>(reinterpret_cast<char *>(bias_ptr), deleter));
-  auto filter_ptr = filter->data<uint8_t>();
+  auto filter_ptr = filter->data<int16_t>();
   auto input_ptr = input->data<half>();
-  auto output_ptr = out->mutable_data<half>();
+  auto output_ptr = out->data<half>();
   arg->sub_conv_num = 1;
   // arg->relu_enabled = relu_enabled;
   arg->output.activation.activation_type = activation_enable;
...
@@ -21,15 +21,37 @@ namespace paddle_mobile {
 namespace fpga {
 namespace image {
-void convert_to_hwc(float **data_in, int channel, int height, int width) {
-  float *data_tmp =
-      (float *)fpga_malloc(channel * height * width * sizeof(float));  // NOLINT
+void convert_to_hwc(float **data_in, int channel, int height, int width,
+                    int num) {
+  float *data_tmp = reinterpret_cast<float *>(
+      fpga_malloc(num * channel * height * width * sizeof(float)));
   int64_t amount_per_row = width * channel;
-  for (int c = 0; c < channel; c++) {
-    for (int h = 0; h < height; h++) {
-      int64_t offset_height = h * amount_per_row;
-      for (int w = 0; w < width; w++) {
-        *(data_tmp + offset_height + w * channel + c) = *((*data_in)++);
+  for (int n = 0; n < num; n++) {
+    for (int c = 0; c < channel; c++) {
+      for (int h = 0; h < height; h++) {
+        int64_t offset_height = h * amount_per_row;
+        for (int w = 0; w < width; w++) {
+          *(data_tmp + n * channel * height * width + offset_height +
+            w * channel + c) = *((*data_in)++);
+        }
       }
     }
   }
   *data_in = data_tmp;
 }
+
+void convert_to_chw(float **data_in, int channel, int height, int width,
+                    int num) {
+  float *data_tmp =
+      (float *)fpga_malloc(channel * height * width * sizeof(float));  // NOLINT
+  int64_t amount_per_side = width * height;
+  for (int n = 0; n < num; n++) {
+    for (int h = 0; h < height; h++) {
+      for (int w = 0; w < width; w++) {
+        for (int c = 0; c < channel; c++) {
+          *(data_tmp + n * height * width * channel + c * amount_per_side +
+            width * h + w) = *((*data_in)++);
+        }
+      }
+    }
+  }
+  *data_in = data_tmp;
+}
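Note: convert_to_hwc now takes a batch dimension num; each NCHW element lands at offset ((n * height + h) * width + w) * channel + c. A self-contained sketch of the same index mapping over a plain float buffer (no fpga_malloc), for reference:

#include <vector>

// Hypothetical standalone equivalent of the batched CHW -> HWC copy above.
std::vector<float> chw_to_hwc(const std::vector<float> &in, int num,
                              int channel, int height, int width) {
  std::vector<float> out(in.size());
  int i = 0;  // linear read cursor over the NCHW input
  for (int n = 0; n < num; n++)
    for (int c = 0; c < channel; c++)
      for (int h = 0; h < height; h++)
        for (int w = 0; w < width; w++)
          out[((n * height + h) * width + w) * channel + c] = in[i++];
  return out;
}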
@@ -55,7 +77,7 @@ void align_element_conv(float **data_in, int height, int cw) {
 }
 void format_image(float **data_in, int channel, int height, int width) {
-  convert_to_hwc(data_in, channel, height, width);
+  // convert_to_hwc(data_in, channel, height, width);
   int cw = channel * width;
   int align_cw = align_to_x(cw, IMAGE_ALIGNMENT);
   if (align_cw != cw) {
@@ -132,8 +154,8 @@ void split_image(int16_t *image_in, const float *scale_in, void **images_out,
     for (int i = 0; i < image_num; i++) {
       des_offset = h * align_to_x(channel_nums[i] * width, IMAGE_ALIGNMENT) +
                    w * channel_nums[i];
-      memcpy((int16_t *)images_out[i] + des_offset, image_in + src_offset,
-             channel_nums[i] * sizeof(int16_t));
+      memcpy(reinterpret_cast<int16_t *>(images_out[i]) + des_offset,
+             image_in + src_offset, channel_nums[i] * sizeof(int16_t));
       src_offset += channel_nums[i];
     }
   }
...
@@ -20,7 +20,11 @@ namespace paddle_mobile {
 namespace fpga {
 namespace image {
-void convert_to_hwc(float** data_in, int channel, int height, int width);
+void convert_to_hwc(float** data_in, int channel, int height, int width,
+                    int num = 1);
+void convert_to_chw(float** data_in, int channel, int height, int width,
+                    int num = 1);
 void align_element_conv(float** data_in, int height, int cw);
 void format_image(float** data_in, int channel, int height, int width);
...
...@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and ...@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#include "framework/operator.h" #include "framework/operator.h"
#include <memory>
#include "operators/op_param.h" #include "operators/op_param.h"
namespace paddle_mobile { namespace paddle_mobile {
namespace framework { namespace framework {
...@@ -70,7 +70,12 @@ void OperatorBase<Dtype>::Run() { ...@@ -70,7 +70,12 @@ void OperatorBase<Dtype>::Run() {
auto vari = this->scope_->FindVar(var_vec_in[i]); auto vari = this->scope_->FindVar(var_vec_in[i]);
if (vari->IsInitialized()) { if (vari->IsInitialized()) {
const Tensor *tensor = vari->template Get<framework::LoDTensor>(); const Tensor *tensor = vari->template Get<framework::LoDTensor>();
if (tensor) DLOG << type_ << " input- " << key << "=" << *tensor; if (tensor) {
DLOG << type_ << " input- " << key << "=" << *tensor;
#ifdef PADDLE_MOBILE_FPGA
DLOG << var_vec_in[i];
#endif
}
} }
} }
} }
...@@ -80,7 +85,12 @@ void OperatorBase<Dtype>::Run() { ...@@ -80,7 +85,12 @@ void OperatorBase<Dtype>::Run() {
auto vari = scope_->FindVar(var_vec_out[i]); auto vari = scope_->FindVar(var_vec_out[i]);
if (vari->IsInitialized()) { if (vari->IsInitialized()) {
const Tensor *tensor = vari->template Get<framework::LoDTensor>(); const Tensor *tensor = vari->template Get<framework::LoDTensor>();
if (tensor) DLOG << type_ << " output- " << key << "=" << *tensor; if (tensor) {
DLOG << type_ << " output- " << key << "=" << *tensor;
#ifdef PADDLE_MOBILE_FPGA
DLOG << var_vec_out[i];
#endif
}
} }
} }
} }
......
@@ -15,6 +15,7 @@ limitations under the License. */
 #pragma once
 #include <map>
+#include <memory>
 #include <string>
 #include <utility>
 #include <vector>
@@ -80,7 +81,9 @@ class OperatorBase {
   }
 #ifdef PADDLE_MOBILE_FPGA
   void InsertTensors();
+  void ChangeNameMap(string key, std::vector<string> value);
 #endif
+
  protected:
   std::shared_ptr<Scope> scope_;
   std::string type_;
@@ -95,6 +98,7 @@ class OperatorBase {
 template <typename Dtype, typename ParamType, typename KernelType>
 class OperatorWithKernel : public OperatorBase<Dtype> {
  public:
+#ifndef PADDLE_MOBILE_FPGA1
   OperatorWithKernel(const std::string &type, const VariableNameMap &inputs,
                      const VariableNameMap &outputs, const AttributeMap &attrs,
                      std::shared_ptr<Scope> scope)
@@ -104,6 +108,25 @@ class OperatorWithKernel : public OperatorBase<Dtype> {
     kernel_.InitCLHelper(scope->GetCLScpoe());
 #endif
   }
+#else
+  OperatorWithKernel(const std::string &type, const VariableNameMap inputs,
+                     const VariableNameMap &outputs, const AttributeMap &attrs,
+                     std::shared_ptr<Scope> scope)
+      : OperatorBase<Dtype>(type, inputs, outputs, attrs, scope) {
+    static int feed_num = 0;
+    static int fetch_num = 0;
+    if (type == "feed") {
+      auto new_name = string("feed") + std::to_string(feed_num++);
+      auto var = scope->Var(new_name);
+      (const_cast<VariableNameMap &>(inputs)).at("X") = {string(new_name)};
+    } else if (type == "fetch") {
+      auto new_name = string("fetch") + std::to_string(fetch_num++);
+      auto var = scope->Var(new_name);
+      (const_cast<VariableNameMap &>(outputs)).at("Out") = {string(new_name)};
+    }
+    param_ = ParamType(inputs, outputs, attrs, *scope);
+  }
+#endif
   virtual void RunImpl() { this->kernel_.Compute(this->param_); }
   virtual void InferShape() const = 0;
...
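Note: under the FPGA build the constructor above rewrites the bound variable names of successive feed/fetch ops, so each op instance owns a distinct scope variable instead of all sharing one "feed"/"fetch" slot. A hedged usage sketch; the variable names are illustrative, only Scope::FindVar is taken from the surrounding code:

// After construction, each feed/fetch op binds to its own numbered variable:
auto *feed0 = scope->FindVar("feed0");    // first feed op's input
auto *fetch1 = scope->FindVar("fetch1");  // second fetch op's output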
@@ -126,6 +126,8 @@ std::vector<Variable *> Scope::VarContain(const std::string substring) {
   return v;
 }
+void Scope::InsertVar(const std::string str, Variable *var) {}
+
 void Scope::print_vars() {
   DLOG << "====================start to print variables=================";
   for (auto pair : vars_) {
...
@@ -86,6 +86,7 @@ class Scope {
 #ifdef PADDLE_MOBILE_FPGA
   Variable *Var(const std::string &name, const int id);
   std::vector<Variable *> VarContain(const std::string substring);
+  void InsertVar(const std::string str, Variable *var);
   void print_vars();
 #endif
...
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifdef FUSION_DECONVADDBN_OP
#include "operators/fusion_deconv_add_bn_op.h"
namespace paddle_mobile {
namespace operators {}
} // namespace paddle_mobile
namespace ops = paddle_mobile::operators;
REGISTER_FUSION_MATCHER(fusion_deconv_add_bn, ops::FusionDeconvAddBNMatcher);
#ifdef PADDLE_MOBILE_CPU
#endif
#ifdef PADDLE_MOBILE_MALI_GPU
#endif
#ifdef PADDLE_MOBILE_FPGA
REGISTER_OPERATOR_FPGA(fusion_deconv_add_bn, ops::FusionDeconvAddBNOp);
#endif
#endif
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifdef FUSION_DECONVADDBN_OP
#pragma once
#include <string>
#include <vector>
#include "framework/operator.h"
#include "framework/program/program-optimize/fusion_op_register.h"
#include "operators/kernel/deconv_add_bn_kernel.h"
namespace paddle_mobile {
namespace operators {
using std::string;
using std::vector;
class FusionDeconvAddBNMatcher : public framework::FusionOpMatcher {
public:
FusionDeconvAddBNMatcher() {
node_ = framework::Node(G_OP_TYPE_CONV_TRANSPOSE);
node_ > std::make_shared<framework::Node>(G_OP_TYPE_ELEMENTWISE_ADD) >
std::make_shared<framework::Node>(G_OP_TYPE_BATCHNORM);
}
void FolderNodes(
framework::Node *node,
std::vector<std::shared_ptr<framework::Node>> *removed_nodes) {
node->Folder(node_.Depth(), Type(),
{{G_OP_TYPE_ELEMENTWISE_ADD, {{"Y", "Y"}, {"X", "X"}}},
{G_OP_TYPE_BATCHNORM,
{{"Scale", "Scale"},
{"Mean", "Mean"},
{"Bias", "Bias"},
{"Variance", "Variance"},
{"Y", "BNY"}}}},
removed_nodes);
}
std::string Type() { return G_OP_TYPE_FUSION_DECONV_ADD_BN; }
};
template <typename DeviceType, typename T>
class FusionDeconvAddBNOp : public framework::OperatorWithKernel<
DeviceType, FusionDeconvAddBNParam<DeviceType>,
operators::DeconvAddBNKernel<DeviceType, T>> {
public:
FusionDeconvAddBNOp(const string &type, const VariableNameMap &inputs,
const VariableNameMap &outputs,
const framework::AttributeMap &attrs,
std::shared_ptr<framework::Scope> scope)
: framework::OperatorWithKernel<
DeviceType, FusionDeconvAddBNParam<DeviceType>,
operators::DeconvAddBNKernel<DeviceType, T>>(type, inputs, outputs,
attrs, scope) {}
void InferShape() const {
auto input = this->param_.Input();
auto in_dims = input->dims();
auto filter = this->param_.Filter();
auto filter_dims = filter->dims();
std::vector<int> strides = this->param_.Strides();
std::vector<int> paddings = this->param_.Paddings();
std::vector<int> dilations = this->param_.Dilations();
int groups = this->param_.Groups();
PADDLE_MOBILE_ENFORCE(
in_dims.size() == 4 || in_dims.size() == 5,
"ConvTransposeOp intput should be 4-D or 5-D tensor.");
PADDLE_MOBILE_ENFORCE(
in_dims.size() == filter_dims.size(),
"ConvTransposeOp input dimension and filter dimension "
"should be the same.");
PADDLE_MOBILE_ENFORCE(
in_dims.size() - strides.size() == 2U,
"ConvTransposeOp input dimension and strides dimension should "
"be consistent.");
PADDLE_MOBILE_ENFORCE(paddings.size() == strides.size(),
"ConvTransposeOp paddings dimension and strides "
"dimension should be the same.");
PADDLE_MOBILE_ENFORCE(paddings.size() == dilations.size(),
"ConvTransposeOp paddings dimension and dilations "
"dimension should be the same.");
PADDLE_MOBILE_ENFORCE(
in_dims[1] == filter_dims[0],
"In ConvTransposeOp, The number of input channels should "
"be equal to the number of filter's channels.");
std::vector<int64_t> output_shape({in_dims[0], filter_dims[1] * groups});
for (size_t i = 0; i < strides.size(); ++i) {
auto filter_extent = dilations[i] * (filter_dims[i + 2] - 1) + 1;
output_shape.push_back((in_dims[i + 2] - 1) * strides[i] -
2 * paddings[i] + filter_extent);
}
this->param_.Output()->Resize(framework::make_ddim(output_shape));
}
protected:
};
} // namespace operators
} // namespace paddle_mobile
#endif  // FUSION_DECONVADDBN_OP
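Note: InferShape above implements the standard transposed-convolution shape rule, out = (in - 1) * stride - 2 * pad + filter_extent, with filter_extent = dilation * (k - 1) + 1. A worked example with assumed values:

// Example (values assumed): in = 16, stride = 2, pad = 1, dilation = 1, k = 4
// filter_extent = 1 * (4 - 1) + 1 = 4
// out = (16 - 1) * 2 - 2 * 1 + 4 = 32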
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifdef FUSION_DECONVADDBNRELU_OP
#include "operators/fusion_deconv_add_bn_relu_op.h"
namespace paddle_mobile {
namespace operators {}
} // namespace paddle_mobile
namespace ops = paddle_mobile::operators;
REGISTER_FUSION_MATCHER(fusion_deconv_add_bn_relu,
ops::FusionDeconvAddBNReluMatcher);
#ifdef PADDLE_MOBILE_CPU
#endif
#ifdef PADDLE_MOBILE_MALI_GPU
#endif
#ifdef PADDLE_MOBILE_FPGA
REGISTER_OPERATOR_FPGA(fusion_deconv_add_bn_relu, ops::FusionDeconvAddBNReluOp);
#endif
#endif
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifdef FUSION_DECONVADDBNRELU_OP
#pragma once
#include <string>
#include <vector>
#include "framework/operator.h"
#include "framework/program/program-optimize/fusion_op_register.h"
#include "operators/kernel/deconv_add_bn_relu_kernel.h"
namespace paddle_mobile {
namespace operators {
using std::string;
using std::vector;
class FusionDeconvAddBNReluMatcher : public framework::FusionOpMatcher {
public:
FusionDeconvAddBNReluMatcher() {
node_ = framework::Node(G_OP_TYPE_CONV_TRANSPOSE);
node_ > std::make_shared<framework::Node>(G_OP_TYPE_ELEMENTWISE_ADD) >
std::make_shared<framework::Node>(G_OP_TYPE_BATCHNORM) >
std::make_shared<framework::Node>(G_OP_TYPE_RELU);
}
void FolderNodes(
framework::Node *node,
std::vector<std::shared_ptr<framework::Node>> *removed_nodes) {
node->Folder(node_.Depth(), Type(),
{{G_OP_TYPE_ELEMENTWISE_ADD, {{"Y", "Y"}, {"X", "X"}}},
{G_OP_TYPE_BATCHNORM,
{{"Scale", "Scale"},
{"Mean", "Mean"},
{"Bias", "Bias"},
{"Variance", "Variance"},
{"Y", "BNY"}}}},
removed_nodes);
}
std::string Type() { return G_OP_TYPE_FUSION_DECONV_ADD_BN_RELU; }
};
template <typename DeviceType, typename T>
class FusionDeconvAddBNReluOp
: public framework::OperatorWithKernel<
DeviceType, FusionDeconvAddBNReluParam<DeviceType>,
operators::DeconvAddBNReluKernel<DeviceType, T>> {
public:
FusionDeconvAddBNReluOp(const string &type, const VariableNameMap &inputs,
const VariableNameMap &outputs,
const framework::AttributeMap &attrs,
std::shared_ptr<framework::Scope> scope)
: framework::OperatorWithKernel<
DeviceType, FusionDeconvAddBNReluParam<DeviceType>,
operators::DeconvAddBNReluKernel<DeviceType, T>>(
type, inputs, outputs, attrs, scope) {}
void InferShape() const {
auto input = this->param_.Input();
auto in_dims = input->dims();
auto filter = this->param_.Filter();
auto filter_dims = filter->dims();
std::vector<int> strides = this->param_.Strides();
std::vector<int> paddings = this->param_.Paddings();
std::vector<int> dilations = this->param_.Dilations();
int groups = this->param_.Groups();
PADDLE_MOBILE_ENFORCE(
in_dims.size() == 4 || in_dims.size() == 5,
"ConvTransposeOp intput should be 4-D or 5-D tensor.");
PADDLE_MOBILE_ENFORCE(
in_dims.size() == filter_dims.size(),
"ConvTransposeOp input dimension and filter dimension "
"should be the same.");
PADDLE_MOBILE_ENFORCE(
in_dims.size() - strides.size() == 2U,
"ConvTransposeOp input dimension and strides dimension should "
"be consistent.");
PADDLE_MOBILE_ENFORCE(paddings.size() == strides.size(),
"ConvTransposeOp paddings dimension and strides "
"dimension should be the same.");
PADDLE_MOBILE_ENFORCE(paddings.size() == dilations.size(),
"ConvTransposeOp paddings dimension and dilations "
"dimension should be the same.");
PADDLE_MOBILE_ENFORCE(
in_dims[1] == filter_dims[0],
"In ConvTransposeOp, The number of input channels should "
"be equal to the number of filter's channels.");
std::vector<int64_t> output_shape({in_dims[0], filter_dims[1] * groups});
for (size_t i = 0; i < strides.size(); ++i) {
auto filter_extent = dilations[i] * (filter_dims[i + 2] - 1) + 1;
output_shape.push_back((in_dims[i + 2] - 1) * strides[i] -
2 * paddings[i] + filter_extent);
}
this->param_.Output()->Resize(framework::make_ddim(output_shape));
}
protected:
};
} // namespace operators
} // namespace paddle_mobile
#endif  // FUSION_DECONVADDBNRELU_OP
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifdef FUSION_DECONVADDBN_OP
#pragma once
#include "framework/operator.h"
#include "operators/op_param.h"
namespace paddle_mobile {
namespace operators {
using framework::OpKernelBase;
template <typename DeviceType, typename T>
class DeconvAddBNKernel
: public OpKernelBase<DeviceType, FusionDeconvAddBNParam<DeviceType>> {
public:
void Compute(const FusionDeconvAddBNParam<DeviceType> &param);
bool Init(FusionDeconvAddBNParam<DeviceType> *param);
};
} // namespace operators
} // namespace paddle_mobile
#endif
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifdef FUSION_DECONVADDBNRELU_OP
#pragma once
#include "framework/operator.h"
#include "operators/op_param.h"
namespace paddle_mobile {
namespace operators {
using framework::OpKernelBase;
template <typename DeviceType, typename T>
class DeconvAddBNReluKernel
: public OpKernelBase<DeviceType, FusionDeconvAddBNReluParam<DeviceType>> {
public:
void Compute(const FusionDeconvAddBNReluParam<DeviceType> &param);
bool Init(FusionDeconvAddBNReluParam<DeviceType> *param);
};
} // namespace operators
} // namespace paddle_mobile
#endif
@@ -43,9 +43,11 @@ bool AnchorGeneratorKernel<FPGA, float>::Init(
   // DLOG << "stride_height: " << stride_height;
   for (int h_idx = 0; h_idx < feature_height; ++h_idx) {
+    int offset0 = h_idx * feature_width * num_anchors * 4;
     for (int w_idx = 0; w_idx < feature_width; ++w_idx) {
-      int offset = h_idx * w_idx * num_anchors * 4;
+      int offset1 = w_idx * num_anchors * 4;
       for (int idx = 0; idx < num_anchors; idx++) {
+        int offset = offset0 + offset1 + idx * 4;
         anchor_ptr[offset + 0] =
             anchors_offset[idx * 4 + 0] + w_idx * stride_width;
         anchor_ptr[offset + 1] =
...
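Note: the fix above replaces the broken offset h_idx * w_idx * num_anchors * 4, which multiplies the two loop indices together (collapsing to zero for the whole first row and column and aliasing cells), with a proper row-major offset plus an anchor index term. Equivalent one-liner for the corrected addressing (hypothetical helper, derived directly from the new code):

// Row-major offset of anchor `idx` at feature-map cell (h_idx, w_idx),
// with 4 coordinates per anchor.
int anchor_offset(int h_idx, int w_idx, int idx, int feature_width,
                  int num_anchors) {
  return ((h_idx * feature_width + w_idx) * num_anchors + idx) * 4;
}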
File mode changed from 100644 to 100755
@@ -16,13 +16,10 @@ limitations under the License. */
 #include "operators/kernel/conv_bn_relu_kernel.h"
 #include <cmath>
 namespace paddle_mobile {
 namespace operators {
 template <>
 bool ConvBNReluKernel<FPGA, float>::Init(FusionConvBNReluParam<FPGA> *param) {
-  // bool relu_enabled = true;
   paddle_mobile::fpga::ActivationType activation_enable =
       paddle_mobile::fpga::LEAKYRELU;
   int16_t leaky_relu_negative_slope = 0;
@@ -43,7 +40,6 @@ bool ConvBNReluKernel<FPGA, float>::Init(FusionConvBNReluParam<FPGA> *param) {
   auto new_bias = new Tensor();
   auto new_scale_ptr = new_scale->mutable_data<float>({channel});
   auto new_bias_ptr = new_bias->mutable_data<float>({channel});
   for (int i = 0; i < channel; i++) {
     new_scale_ptr[i] = bn_scale_ptr[i] /
                        static_cast<float>(pow((bn_var_ptr[i] + epsilon), 0.5));
@@ -51,7 +47,16 @@ bool ConvBNReluKernel<FPGA, float>::Init(FusionConvBNReluParam<FPGA> *param) {
     bs_ptr[i + channel] = new_scale_ptr[i];
     bs_ptr[i] = new_bias_ptr[i];
   }
+  const int groups = param->Groups();
+  if (groups == channel) {
+    fpga::format_dwconv_data(filter, out, new_scale_ptr, &new_bias_ptr);
+    fpga::DWconvArgs dwconv_arg = {0};
+    fpga::fill_dwconv_arg(&dwconv_arg, input, out, filter, activation_enable,
+                          leaky_relu_negative_slope, param->Strides()[0],
+                          param->Strides()[1], param->Paddings()[0],
+                          param->Paddings()[1], new_bias_ptr);
+    param->SetFpgaArgs(dwconv_arg);
+  } else {
     fpga::format_conv_data(filter, out, &bs_ptr, param->Groups());
     fpga::SplitConvArgs conv_arg = {0};
     fpga::fill_split_arg(&conv_arg, input, out, filter, activation_enable,
@@ -59,16 +64,19 @@ bool ConvBNReluKernel<FPGA, float>::Init(FusionConvBNReluParam<FPGA> *param) {
                          param->Strides()[0], param->Strides()[1],
                          param->Paddings()[0], param->Paddings()[1], bs_ptr);
     param->SetFpgaArgs(conv_arg);
+  }
   delete new_scale;
   delete new_bias;
   return true;
 }
 template <>
 void ConvBNReluKernel<FPGA, float>::Compute(
     const FusionConvBNReluParam<FPGA> &param) {
+  if (param.Groups() == param.Output()->dims()[1]) {
+    fpga::ComputeDWConv(param.FpgaDwconvArgs());
+  } else {
     fpga::ComputeFpgaConv(param.FpgaArgs());
+  }
 }
 }  // namespace operators
...
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifdef CONV_TRANSPOSE_OP
#include "operators/kernel/conv_transpose_kernel.h"
#include "framework/operator.h"
#include "operators/op_param.h"
namespace paddle_mobile {
namespace operators {
template <>
bool ConvTransposeKernel<FPGA, float>::Init(ConvTransposeParam<FPGA> *param) {
// bool relu_enabled = false;
paddle_mobile::fpga::ActivationType activation_enable =
paddle_mobile::fpga::NONE;
int16_t leaky_relu_negative_slope = 0;
auto input = const_cast<Tensor *>(param->Input());
// const Tensor *bias = param->Bias();
// auto bias_ptr = bias->data<float>();
auto filter = const_cast<Tensor *>(param->Filter());
auto out = param->Output();
// PADDLE_MOBILE_ENFORCE(out->dims()[1] == bias->dims()[0],
// "Output channel should be equal to bias number");
int channel = out->dims()[1];
int sub_conv_n = param->Strides()[0];
auto bs_ptr = (float *)fpga::fpga_malloc(2 * channel * sub_conv_n * // NOLINT
sizeof(float)); // NOLINT
for (int i = 0; i < channel * sub_conv_n; i++) {
bs_ptr[i + sub_conv_n * channel] = 1;
bs_ptr[i] = 0; // bias_ptr[i % (channel)];
}
PADDLE_MOBILE_ENFORCE(param->Strides()[1] == param->Strides()[0],
"stride_width should be equal to stride_height ");
PADDLE_MOBILE_ENFORCE(filter->dims()[2] == filter->dims()[3],
"filter width should be equal to filter height ");
PADDLE_MOBILE_ENFORCE(((filter->dims()[2] % param->Strides()[0]) == 0),
"filter axis should be the multiple of stride axis ");
if (param->Groups() == channel) {
fpga::format_DWDeconv_data(filter, out, &bs_ptr, param->Groups(),
sub_conv_n);
fpga::DWDeconvArgs DWDeconv_arg = {0};
fpga::fill_DWDeconv_arg(&DWDeconv_arg, input, out, filter,
activation_enable, leaky_relu_negative_slope,
param->Strides()[0], param->Strides()[1],
param->Paddings()[0], param->Paddings()[1], bs_ptr);
param->SetFpgaArgs(DWDeconv_arg);
} else {
fpga::format_deconv_data(filter, out, &bs_ptr, param->Groups(), sub_conv_n);
fpga::DeconvArgs deconv_arg = {0};
fpga::fill_deconv_arg(&deconv_arg, input, out, filter, activation_enable,
leaky_relu_negative_slope, param->Groups(),
param->Strides()[0], param->Strides()[1],
param->Paddings()[0], param->Paddings()[1], bs_ptr);
param->SetFpgaArgs(deconv_arg);
}
return true;
}
template <>
void ConvTransposeKernel<FPGA, float>::Compute(
const ConvTransposeParam<FPGA> &param) {
if (param.Groups() == param.Output()->dims()[1]) {
fpga::ComputeDWDeconv(param.FpgaDWDconvArgs());
} else {
fpga::ComputeFpgaDeconv(param.FpgaArgs());
}
}
} // namespace operators
} // namespace paddle_mobile
#endif
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifdef FUSION_DECONVADDBN_OP
#include "operators/kernel/deconv_add_bn_kernel.h"
#include "framework/operator.h"
#include "operators/op_param.h"
namespace paddle_mobile {
namespace operators {
template <>
bool DeconvAddBNKernel<FPGA, float>::Init(FusionDeconvAddBNParam<FPGA> *param) {
// bool relu_enabled = true;
paddle_mobile::fpga::ActivationType activation_enable =
paddle_mobile::fpga::NONE;
int16_t leaky_relu_negative_slope = 0;
auto input = const_cast<Tensor *>(param->Input());
const Tensor *bias = param->InputBias();
auto bias_ptr = bias->data<float>();
auto filter = const_cast<Tensor *>(param->Filter());
auto out = param->Output();
PADDLE_MOBILE_ENFORCE(out->dims()[1] == bias->dims()[0],
"Output channel should be equal to bias number");
int channel = out->dims()[1];
int sub_conv_n = param->Strides()[0];
auto bs_ptr = (float *)fpga::fpga_malloc(2 * channel * sub_conv_n * // NOLINT
sizeof(float)); // NOLINT
for (int i = 0; i < channel * sub_conv_n; i++) {
bs_ptr[i + sub_conv_n * channel] = 1;
bs_ptr[i] = bias_ptr[i % (channel)];
}
PADDLE_MOBILE_ENFORCE(param->Strides()[1] == param->Strides()[0],
"stride_width should be equal to stride_height ");
PADDLE_MOBILE_ENFORCE(filter->dims()[2] == filter->dims()[3],
"filter width should be equal to filter height ");
PADDLE_MOBILE_ENFORCE(((filter->dims()[2] % param->Strides()[0]) == 0),
"filter axis should be the multiple of stride axis ");
if (param->Groups() == channel) {
fpga::format_DWDeconv_data(filter, out, &bs_ptr, param->Groups(),
sub_conv_n);
fpga::DWDeconvArgs DWDeconv_arg = {0};
fpga::fill_DWDeconv_arg(&DWDeconv_arg, input, out, filter,
activation_enable, leaky_relu_negative_slope,
param->Strides()[0], param->Strides()[1],
param->Paddings()[0], param->Paddings()[1], bs_ptr);
param->SetFpgaArgs(DWDeconv_arg);
} else {
fpga::format_deconv_data(filter, out, &bs_ptr, param->Groups(), sub_conv_n);
fpga::DeconvArgs deconv_arg = {0};
fpga::fill_deconv_arg(&deconv_arg, input, out, filter, activation_enable,
leaky_relu_negative_slope, param->Groups(),
param->Strides()[0], param->Strides()[1],
param->Paddings()[0], param->Paddings()[1], bs_ptr);
param->SetFpgaArgs(deconv_arg);
}
return true;
}
template <>
void DeconvAddBNKernel<FPGA, float>::Compute(
const FusionDeconvAddBNParam<FPGA> &param) {
// fpga::ComputeFpgaDeconv(param.FpgaArgs());
if (param.Groups() == param.Output()->dims()[1]) {
fpga::ComputeDWDeconv(param.FpgaDWDconvArgs());
} else {
fpga::ComputeFpgaDeconv(param.FpgaArgs());
}
}
} // namespace operators
} // namespace paddle_mobile
#endif
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifdef FUSION_DECONVADDBNRELU_OP
#include "operators/kernel/deconv_add_bn_relu_kernel.h"
#include "framework/operator.h"
#include "operators/op_param.h"
namespace paddle_mobile {
namespace operators {
template <>
bool DeconvAddBNReluKernel<FPGA, float>::Init(
FusionDeconvAddBNReluParam<FPGA> *param) {
// bool relu_enabled = true;
paddle_mobile::fpga::ActivationType activation_enable =
paddle_mobile::fpga::LEAKYRELU;
int16_t leaky_relu_negative_slope = 0;
auto input = const_cast<Tensor *>(param->Input());
const Tensor *bias = param->InputBias();
auto bias_ptr = bias->data<float>();
auto filter = const_cast<Tensor *>(param->Filter());
auto out = param->Output();
PADDLE_MOBILE_ENFORCE(out->dims()[1] == bias->dims()[0],
"Output channel should be equal to bias number");
int channel = out->dims()[1];
int sub_conv_n = param->Strides()[0];
auto bs_ptr = (float *)fpga::fpga_malloc(2 * channel * sub_conv_n * // NOLINT
sizeof(float)); // NOLINT
for (int i = 0; i < channel * sub_conv_n; i++) {
bs_ptr[i + sub_conv_n * channel] = 1;
bs_ptr[i] = bias_ptr[i % (channel)];
}
PADDLE_MOBILE_ENFORCE(param->Strides()[1] == param->Strides()[0],
"stride_width should be equal to stride_height ");
PADDLE_MOBILE_ENFORCE(filter->dims()[2] == filter->dims()[3],
"filter width should be equal to filter height ");
PADDLE_MOBILE_ENFORCE(((filter->dims()[2] % param->Strides()[0]) == 0),
"filter axis should be the multiple of stride axis ");
if (param->Groups() == channel) {
fpga::format_DWDeconv_data(filter, out, &bs_ptr, param->Groups(),
sub_conv_n);
fpga::DWDeconvArgs DWDeconv_arg = {0};
fpga::fill_DWDeconv_arg(&DWDeconv_arg, input, out, filter,
activation_enable, leaky_relu_negative_slope,
param->Strides()[0], param->Strides()[1],
param->Paddings()[0], param->Paddings()[1], bs_ptr);
param->SetFpgaArgs(DWDeconv_arg);
} else {
fpga::format_deconv_data(filter, out, &bs_ptr, param->Groups(), sub_conv_n);
fpga::DeconvArgs deconv_arg = {0};
fpga::fill_deconv_arg(&deconv_arg, input, out, filter, activation_enable,
leaky_relu_negative_slope, param->Groups(),
param->Strides()[0], param->Strides()[1],
param->Paddings()[0], param->Paddings()[1], bs_ptr);
param->SetFpgaArgs(deconv_arg);
}
return true;
}
template <>
void DeconvAddBNReluKernel<FPGA, float>::Compute(
const FusionDeconvAddBNReluParam<FPGA> &param) {
// fpga::ComputeFpgaDeconv(param.FpgaArgs());
if (param.Groups() == param.Output()->dims()[1]) {
fpga::ComputeDWDeconv(param.FpgaDWDconvArgs());
} else {
fpga::ComputeFpgaDeconv(param.FpgaArgs());
}
}
} // namespace operators
} // namespace paddle_mobile
#endif
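Note: the three deconv kernels above all build the same scale/bias buffer: the first channel * sub_conv_n floats hold the per-channel bias tiled across sub-convolutions, and the second half holds the scale, fixed at 1. A minimal sketch of that layout as a hypothetical helper (the framework passes a raw fpga_malloc'd pointer instead):

#include <vector>

// bs[0 .. n)  = bias per output channel, tiled sub_conv_n times
// bs[n .. 2n) = scale, fixed at 1.0f, where n = channel * sub_conv_n
std::vector<float> make_bs(const std::vector<float> &bias, int channel,
                           int sub_conv_n) {
  int n = channel * sub_conv_n;
  std::vector<float> bs(2 * n);
  for (int i = 0; i < n; i++) {
    bs[i] = bias[i % channel];
    bs[i + n] = 1.0f;
  }
  return bs;
}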
@@ -25,11 +25,6 @@ bool FeedKernel<FPGA, float>::Init(FeedParam<FPGA> *param) {
   input->Resize(output->dims());
   if (output->dims().size() != 4) {
-    auto input_ptr = input->mutable_data<float>();
-    size_t size = output->numel() * sizeof(float);
-    auto p = fpga::fpga_malloc(size);
-    memcpy(p, input_ptr, size);
-    output->reset_data_ptr(p);
     return true;
   }
   fpga::format_fp16_ofm(output);
@@ -41,7 +36,14 @@ void FeedKernel<FPGA, float>::Compute(const FeedParam<FPGA> &param) {
   auto output = param.Out();
   auto input = const_cast<LoDTensor *>(param.InputX());
-  if (input->dims().size() != 4) {
+  if (output->dims().size() != 4) {
+    size_t size = output->numel() * sizeof(float);
+    auto output_ptr = output->data<float>();
+    auto input_ptr = input->data<float>();
+    auto external_ptr = reinterpret_cast<float *>(input->external_data);
+    float *p_data = external_ptr == nullptr ? input_ptr : external_ptr;
+    memcpy(output_ptr, p_data, size);
+    input->external_data = nullptr;
     return;
   }
...
@@ -49,17 +49,20 @@ bool FetchKernel<FPGA, float>::Init(FetchParam<FPGA> *param) {
 template <>
 void FetchKernel<FPGA, float>::Compute(const FetchParam<FPGA> &param) {
-  auto input = param.InputX();
+  auto input = const_cast<Tensor *>(param.InputX());
   if (input->type() == typeid(float)) {
     auto output = param.Out();
     output->ShareDataWith(*input);
     return;
   }
-  fpga::PerformBypass(param.fpga_bypass_args);
+  fpga::BypassArgs args = param.fpga_bypass_args;
+  auto data = (input->mutable_data<half>());
+  args.image.address = static_cast<void *>(data);
+  fpga::PerformBypass(args);
   fpga::fpga_invalidate(param.fpga_bypass_args.output.address,
                         param.fpga_bypass_args.image.channels * sizeof(float));
-  // TODO: DEalign: get rid of extra 0
+  // TODO(zhangyang): DEalign: get rid of extra 0
 }
 template class FetchKernel<FPGA, float>;
...
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "operators/kernel/pad2d_kernel.h"
namespace paddle_mobile {
namespace operators {
template <>
bool Pad2dKernel<FPGA, float>::Init(Pad2dParam<FPGA> *param) {
Tensor *output = param->Out();
fpga::format_fp16_ofm(output);
return true;
}
void pad2dFunc(const framework::Tensor *input, framework::Tensor *output) {
auto input_data = (input->data<half>());
auto output_data = (output->data<half>());
auto input_c = input->dims()[1];
auto input_h = input->dims()[2];
auto input_w = input->dims()[3];
auto output_c = output->dims()[1];
auto output_w = output->dims()[3];
auto copysize = input_c * input_w;
for (int h = 0; h < input_h; ++h) {
auto input_offset = h * input_c * input_w;
auto output_offset = h * paddle_mobile::fpga::align_to_x(
output_c * output_w, IMAGE_ALIGNMENT);
memcpy((output_data + output_offset), (input_data + input_offset),
copysize * sizeof(half));
}
}
template <>
void Pad2dKernel<FPGA, float>::Compute(const Pad2dParam<FPGA> &param) {
auto in_x = param.InputX();
auto out = param.Out();
fpga::fpga_invalidate((void *)in_x->data<half>(), // NOLINT
in_x->numel() * sizeof(half));
pad2dFunc(in_x, out);
(out->scale)[0] = (in_x->scale)[0];
(out->scale)[1] = (in_x->scale)[1];
DLOG << (out->scale)[0];
DLOG << (out->scale)[1];
size_t outputSize =
out->dims()[2] *
paddle_mobile::fpga::align_to_x((out->dims()[1]) * (out->dims()[3]),
IMAGE_ALIGNMENT) *
sizeof(half);
fpga::fpga_flush(out->data<half>(), outputSize);
}
} // namespace operators
} // namespace paddle_mobile
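Note: pad2dFunc above copies each input row of input_c * input_w half values to the head of the corresponding output row, whose stride is rounded up to IMAGE_ALIGNMENT; the tail of each output row is the padding. A sketch of the rounding helper this and the surrounding FPGA code rely on; the definition shown is an assumption consistent with how align_to_x is used here:

// Round num up to the next multiple of x, e.g. align_to_x(50, 16) == 64.
inline int align_to_x(int num, int x) { return (num + x - 1) / x * x; }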
@@ -22,15 +22,29 @@ namespace operators {
 template <>
 bool PoolKernel<FPGA, float>::Init(PoolParam<FPGA> *param) {
   auto *input = const_cast<Tensor *>(param->Input());
-  auto input_ptr = input->data<half>();
-  Tensor *output = param->Output();
-  fpga::format_fp16_ofm(output);
-  auto output_ptr = output->mutable_data<half>();
+  auto *output = param->Output();
   vector<int> ksize = param->Ksize();
   vector<int> strides = param->Strides();
   vector<int> paddings = param->Paddings();
   std::string pooling_type = param->PoolingType();
+  if (input->type() == typeid(float)) {
+    int channels = input->dims()[1];
+    int height = input->dims()[2];
+    int width = input->dims()[3];
+    int num = input->dims()[0];
+    int out_width = (width + 2 * paddings[1] - ksize[1]) / strides[1] + 1;
+    int out_height = (height + 2 * paddings[0] - ksize[0]) / strides[0] + 1;
+    framework::DDim dim =
+        framework::make_ddim({num, channels, out_height, out_width});
+    output->mutable_data<float>(dim);
+    return true;
+  }
+  auto input_ptr = input->data<half>();
+  fpga::format_fp16_ofm(output);
+  auto output_ptr = output->mutable_data<half>();
   fpga::PoolingArgs poolArgs = {0};
   poolArgs.mode = pooling_type == "max" ? 0 : 1;  // max:0, avg:1
   poolArgs.kernel_reciprocal =
@@ -54,6 +68,31 @@ bool PoolKernel<FPGA, float>::Init(PoolParam<FPGA> *param) {
 template <>
 void PoolKernel<FPGA, float>::Compute(const PoolParam<FPGA> &param) {
+  auto *input = const_cast<Tensor *>(param.Input());
+  if (input->type() == typeid(float)) {
+    auto *output = param.Output();
+    auto in = input->data<float>();
+    auto len = output->numel();
+    auto out = output->mutable_data<float>();
+    int N = input->dims()[0], C = input->dims()[1], H = input->dims()[2],
+        W = input->dims()[3];
+    int HW = H * W, CHW = C * H * W, WC = W * C;
+    for (int n = 0; n < N; n++) {
+      for (int c = 0; c < C; c++) {
+        out[n * C + c] = 0;
+        for (int h = 0; h < H; h++) {
+          for (int w = 0; w < W; w++) {
+            out[n * C + c] += in[n * CHW + h * WC + w * C + c];
+            // for NCHW layout this would be in[n * CHW + c * HW + h * W + w]
+          }
+        }
+        out[n * C + c] /= HW;
+      }
+    }
+    return;
+  }
   fpga::ComputeFpgaPool(param.FpgaArgs());
 }
 }  // namespace operators
...
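Note: the CPU fallback above ignores ksize and strides in the accumulation, so it is effectively a global average pool over a float tensor stored in NHWC order (hence in[n * CHW + h * WC + w * C + c]), one output per (n, c). A standalone sketch with plain pointers, under that same full-feature-map assumption:

// Hypothetical global average pooling over an NHWC buffer.
void global_avg_pool_nhwc(const float *in, float *out, int N, int C, int H,
                          int W) {
  for (int n = 0; n < N; n++) {
    for (int c = 0; c < C; c++) {
      float sum = 0.f;
      for (int h = 0; h < H; h++)
        for (int w = 0; w < W; w++)
          sum += in[((n * H + h) * W + w) * C + c];  // NHWC indexing
      out[n * C + c] = sum / (H * W);
    }
  }
}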
@@ -67,6 +67,30 @@ bool ProposalKernel<FPGA, float>::Init(ProposalParam<FPGA> *param) {
   return true;
 }
+template <typename T>
+void CPUGather(const Tensor &src, const Tensor &index, Tensor *output) {
+  PADDLE_MOBILE_ENFORCE(index.dims().size() == 1 ||
+                            (index.dims().size() == 2 && index.dims()[1] == 1),
+                        "Dim not correct");
+  int64_t index_size = index.dims()[0];
+  auto src_dims = src.dims();
+  const T *p_src = src.data<T>();
+  const int *p_index = index.data<int>();
+  T *p_output = output->data<T>();
+  // slice size
+  int slice_size = 1;
+  for (int i = 1; i < src_dims.size(); ++i) slice_size *= src_dims[i];
+  const size_t slice_bytes = slice_size * sizeof(T);
+  for (int64_t i = 0; i < index_size; ++i) {
+    int index_ = p_index[i];
+    memcpy(p_output + i * slice_size, p_src + index_ * slice_size, slice_bytes);
+  }
+}
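Note: CPUGather copies whole rows of src selected by index into output, so output must be pre-sized to {index.numel(), row_width} before the call. A plain-array equivalent of the same semantics (hypothetical helper, no framework types):

#include <cstring>

// out row i = src row index[i]; slice_size is the row width in elements.
void gather_rows(const float *src, const int *index, int index_size,
                 int slice_size, float *out) {
  for (int i = 0; i < index_size; ++i)
    memcpy(out + i * slice_size, src + index[i] * slice_size,
           slice_size * sizeof(float));
}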
 void AppendProposals(Tensor *dst, int64_t offset, const Tensor &src) {
   auto *out_data = dst->data<void>();
@@ -103,11 +127,11 @@ static inline void BoxCoder(Tensor *all_anchors, Tensor *bbox_deltas,
     T bbox_center_x = 0, bbox_center_y = 0;
     T bbox_width = 0, bbox_height = 0;
+    /*
     if (variances) {
       bbox_center_x =
-          variances_data[i * len] * bbox_deltas_data[i * len] * anchor_width +
-          anchor_center_x;
-      bbox_center_y = variances_data[i * len + 1] *
+          variances_data[i * len] * bbox_deltas_data[i * len] * anchor_width
+          + anchor_center_x; bbox_center_y = variances_data[i * len + 1] *
                           bbox_deltas_data[i * len + 1] * anchor_height +
                       anchor_center_y;
       bbox_width = std::exp(std::min<T>(variances_data[i * len + 2] *
@@ -119,22 +143,33 @@ static inline void BoxCoder(Tensor *all_anchors, Tensor *bbox_deltas,
                                         kBBoxClipDefault)) *
                    anchor_height;
     } else {
-      bbox_center_x =
-          bbox_deltas_data[i * len] * anchor_width + anchor_center_x;
+    */
+    bbox_center_x = bbox_deltas_data[i * len] * anchor_width + anchor_center_x;
     bbox_center_y =
         bbox_deltas_data[i * len + 1] * anchor_height + anchor_center_y;
+    /*
     bbox_width = std::exp(std::min<T>(bbox_deltas_data[i * len + 2],
                                       kBBoxClipDefault)) *
                  anchor_width;
     bbox_height = std::exp(std::min<T>(bbox_deltas_data[i * len + 3],
                                        kBBoxClipDefault)) *
                   anchor_height;
-    }
+    */
+    bbox_width = std::exp(bbox_deltas_data[i * len + 2]) * anchor_width;
+    bbox_height = std::exp(bbox_deltas_data[i * len + 3]) * anchor_height;
+    // }
     proposals_data[i * len] = bbox_center_x - bbox_width / 2;
     proposals_data[i * len + 1] = bbox_center_y - bbox_height / 2;
+    /*
+    //wong
     proposals_data[i * len + 2] = bbox_center_x + bbox_width / 2 - 1;
     proposals_data[i * len + 3] = bbox_center_y + bbox_height / 2 - 1;
+    //wong
+    */
+    proposals_data[i * len + 2] = bbox_center_x + bbox_width / 2;
+    proposals_data[i * len + 3] = bbox_center_y + bbox_height / 2;
   }
   // return proposals;
 }
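Note: with the variance branch and the kBBoxClipDefault clipping commented out, the decode in BoxCoder reduces to the plain formulas below. A minimal float sketch (hypothetical helper; anchor center and size are taken as precomputed, as in the surrounding code):

#include <cmath>

// Decode one box from its anchor (center acx/acy, size aw/ah) and 4 deltas.
void decode_box(float acx, float acy, float aw, float ah, const float *delta,
                float *box) {
  float cx = delta[0] * aw + acx;     // center shift scaled by anchor size
  float cy = delta[1] * ah + acy;
  float w = std::exp(delta[2]) * aw;  // unclipped exponential size decode
  float h = std::exp(delta[3]) * ah;
  box[0] = cx - w / 2;
  box[1] = cy - h / 2;
  box[2] = cx + w / 2;  // the "- 1" end-point convention is now disabled
  box[3] = cy + h / 2;
}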
@@ -328,9 +363,12 @@ std::pair<Tensor, Tensor> ProposalForOneImage(
   anchor_sel.mutable_data<T>({index_t.numel(), 4});
   var_sel.mutable_data<T>({index_t.numel(), 4});
+  CPUGather<T>(scores_slice, index_t, &scores_sel);
+  CPUGather<T>(bbox_deltas_slice, index_t, &bbox_sel);
+  CPUGather<T>(anchors, index_t, &anchor_sel);
   Tensor proposals;
   proposals.mutable_data<T>({index_t.numel(), 4});
-  BoxCoder<T>(&anchor_sel, &bbox_sel, &var_sel, &proposals);
+  BoxCoder<T>(&anchor_sel, &bbox_sel, nullptr, &proposals);
   ClipTiledBoxes<T>(im_info_slice, &proposals);
@@ -341,6 +379,8 @@ std::pair<Tensor, Tensor> ProposalForOneImage(
   bbox_sel.mutable_data<T>({keep.numel(), 4});
   scores_filter.mutable_data<T>({keep.numel(), 1});
+  CPUGather<T>(proposals, keep, &bbox_sel);
+  CPUGather<T>(scores_sel, keep, &scores_filter);
   if (nms_thresh <= 0) {
     return std::make_pair(bbox_sel, scores_filter);
   }
@@ -351,14 +391,86 @@ std::pair<Tensor, Tensor> ProposalForOneImage(
     keep_nms.Resize({post_nms_top_n});
   }
-  proposals.mutable_data<T>({keep_nms.numel(), 4});
-  scores_sel.mutable_data<T>({keep_nms.numel(), 1});
+  // proposals.mutable_data<T>({keep_nms.numel(), 4});  // original
+  // scores_sel.mutable_data<T>({keep_nms.numel(), 1});  // original
+  proposals.mutable_data<T>({post_nms_top_n, 4});   // wong
+  scores_sel.mutable_data<T>({post_nms_top_n, 1});  // wong
+  CPUGather<T>(bbox_sel, keep_nms, &proposals);
+  CPUGather<T>(scores_filter, keep_nms, &scores_sel);
   return std::make_pair(proposals, scores_sel);
 }
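The added CPUGather<T> calls compact scores, deltas, and anchors down to the selected indices before decoding, and again after filtering and NMS. CPUGather itself is not shown in this diff; the sketch below states the contract it is assumed to satisfy (out row i = src row index[i]):

#include <cstring>
#include <vector>

// Row gather over a flat [rows x row_width] buffer; a sketch of the assumed
// CPUGather<T> contract, not the library code itself.
template <typename T>
void GatherRows(const std::vector<T> &src, int row_width,
                const std::vector<int> &index, std::vector<T> *out) {
  out->assign(index.size() * row_width, T());
  for (size_t i = 0; i < index.size(); ++i) {
    std::memcpy(out->data() + i * row_width,
                src.data() + static_cast<size_t>(index[i]) * row_width,
                row_width * sizeof(T));
  }
}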
template <>
void ProposalKernel<FPGA, float>::Compute(const ProposalParam<FPGA> &param) {
auto input_score = param.scores_;
auto input_score_data = input_score->data<half>();
auto input_score_data_tmp = input_score->data<half>();
uint32_t score_n, score_height, score_width, score_channels;
auto input_bbox = param.bbox_deltas_;
auto input_bbox_data = input_bbox->data<half>();
auto input_bbox_data_tmp = input_bbox->data<half>();
uint32_t bbox_n, bbox_height, bbox_width, bbox_channels;
score_n = (uint32_t)(input_score->dims()[0]);
score_channels = (uint32_t)(input_score->dims()[1]);
score_height = (uint32_t)(input_score->dims()[2]);
score_width = (uint32_t)(input_score->dims()[3]);
bbox_n = (uint32_t)(input_bbox->dims()[0]);
bbox_channels = (uint32_t)(input_bbox->dims()[1]);
bbox_height = (uint32_t)(input_bbox->dims()[2]);
bbox_width = (uint32_t)(input_bbox->dims()[3]);
// score_tmp->init(typeid(half));
std::shared_ptr<Tensor> score_tmp = std::make_shared<Tensor>();
score_tmp->Resize(param.scores_->dims());
score_tmp->mutable_data<half>();
std::shared_ptr<Tensor> bbox_tmp = std::make_shared<Tensor>();
bbox_tmp->Resize(param.bbox_deltas_->dims());
bbox_tmp->mutable_data<half>();
auto score_tmp_data = score_tmp->data<half>();
auto bbox_tmp_data = bbox_tmp->data<half>();
int64_t amount_per_side = score_width * score_height;
int idx = 0;
fpga::fpga_invalidate(
input_score_data_tmp,
score_height * score_width * score_channels * sizeof(half));
for (int h = 0; h < score_height; h++) {
for (int w = 0; w < score_width; w++) {
for (int c = 0; c < score_channels; c++) {
idx++;
// DLOG << "wong input_score: "<<
// paddle_mobile::fpga::fp16_2_fp32(input_score_data[idx]);
*(score_tmp_data + c * amount_per_side + score_width * h + w) =
(*(input_score_data_tmp++));
}
}
}
amount_per_side = bbox_width * bbox_height;
fpga::fpga_invalidate(input_bbox_data_tmp, bbox_height * bbox_width *
bbox_channels * sizeof(half));
for (int h = 0; h < bbox_height; h++) {
for (int w = 0; w < bbox_width; w++) {
for (int c = 0; c < bbox_channels; c++) {
idx++;
// DLOG << "wong input_score: "<<
// paddle_mobile::fpga::fp16_2_fp32(input_score_data[idx]);
*(bbox_tmp_data + c * amount_per_side + bbox_width * h + w) =
(*(input_bbox_data_tmp++));
}
}
}
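  // The two loop nests above de-interleave the FPGA's HWC layout into CHW:
  // the element read at source offset (h * W + w) * C + c is written to
  // destination offset c * (H * W) + h * W + w. `idx` is only a counter for
  // the commented-out debug logging and takes no part in the transposition.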
struct paddle_mobile::fpga::BypassArgs temp_score_arg;
struct paddle_mobile::fpga::BypassArgs temp_bbox_arg;
temp_score_arg = param.score_arg;
temp_score_arg.image.address = score_tmp->data<half>();
temp_bbox_arg = param.bbox_arg;
temp_bbox_arg.image.address = bbox_tmp->data<half>();
   auto score_tensor = param.float_score.get();
   fpga::PerformBypass(param.score_arg);
   fpga::fpga_invalidate(score_tensor->data<float>(),
@@ -396,23 +508,23 @@ void ProposalKernel<FPGA, float>::Compute(const ProposalParam<FPGA> &param) {
   int64_t w_bbox = bbox_dim[3];
   //
-  Tensor bbox_deltas_swap, scores_swap;
-  bbox_deltas_swap.mutable_data<float>({num, h_bbox, w_bbox, c_bbox});
-  scores_swap.mutable_data<float>({num, h_score, w_score, c_score});
+  rpn_rois->mutable_data<float>({bbox_deltas->numel(), 4});
+  rpn_roi_probs->mutable_data<float>({scores->numel(), 1});
   framework::LoD lod;
   lod.resize(1);
   auto &lod0 = lod[0];
   lod0.push_back(0);
-  anchors.Resize({anchors.numel() / 4, 4});
+  anchors.Resize({anchors.numel(), 4});
+  variances.Resize({variances.numel(), 4});
   int64_t num_proposals = 0;
   for (int64_t i = 0; i < num; ++i) {
     Tensor im_info_slice = im_info->Slice(i, i + 1);
-    Tensor bbox_deltas_slice = bbox_deltas_swap.Slice(i, i + 1);
-    Tensor scores_slice = scores_swap.Slice(i, i + 1);
+    Tensor bbox_deltas_slice = (*bbox_tensor).Slice(i, i + 1);
+    Tensor scores_slice = (*score_tensor).Slice(i, i + 1);
-    bbox_deltas_slice.Resize({h_bbox * w_bbox * c_bbox / 4, 4});
+    bbox_deltas_slice.Resize({h_bbox * w_bbox * c_bbox, 4});
     scores_slice.Resize({h_score * w_score * c_score, 1});
     std::pair<Tensor, Tensor> tensor_pair = ProposalForOneImage<float>(
......
@@ -18,6 +18,8 @@ limitations under the License. */
 #include <vector>
 #include "operators/kernel/detection_kernel.h"
+#include "fpga/V1/api.h"
+#include "fpga/V1/image.h"
 namespace paddle_mobile {
 namespace operators {
@@ -29,8 +31,7 @@ bool PSRoiPoolKernel<FPGA, float>::Init(PSRoiPoolParam<FPGA>* param) {
   param->float_input = std::make_shared<Tensor>();
   param->float_input->mutable_data<float>(param->input_x_->dims());
-  param->float_output = std::make_shared<Tensor>();
-  param->float_output->mutable_data<float>(param->output_->dims());
+  // param->float_output = std::make_shared<Tensor>();
   auto input = param->input_x_;
   fpga::BypassArgs args = {fpga::DATA_TYPE_FP16};
@@ -46,22 +47,90 @@ bool PSRoiPoolKernel<FPGA, float>::Init(PSRoiPoolParam<FPGA>* param) {
   args.output.scale_address = param->float_input->scale;
   param->input_arg = args;
-  fpga::format_fp16_ofm(param->output_);
-  input = param->float_output.get();
-  args.input_data_type = fpga::DATA_TYPE_FP32;
-  args.output_data_type = fpga::DATA_TYPE_FP16;
-  args.image.address = input->data<float>();
-  args.image.height = (uint32_t)input->dims()[2];
-  args.image.width = (uint32_t)input->dims()[3];
-  args.image.channels = (uint32_t)input->dims()[1];
-  args.output.address = param->output_->mutable_data<half>();
-  args.output.scale_address = param->output_->scale;
-  param->input_arg = args;
+  auto* rois = param->input_rois_;
+  int rois_num = rois->dims()[0];
+  framework::DDim dims_out_new = framework::make_ddim(
+      {rois_num, param->output_->dims()[1], param->output_->dims()[2],
+       param->output_->dims()[3]});
+  param->output_->Resize(dims_out_new);
+  // fpga::format_fp16_ofm(param->output_);
+  param->output_->mutable_data<float>(dims_out_new);
+  // auto output = param->float_output.get();
+  // param->output_ = output;
+  /* args.input_data_type = fpga::DATA_TYPE_FP32;
+  args.output_data_type = fpga::DATA_TYPE_FP16;
+  args.image.address = output->data<float>();
+  args.image.height = (uint32_t)output->dims()[2];
+  args.image.width = (uint32_t)output->dims()[3];
+  args.image.channels = (uint32_t)output->dims()[1];
+  args.output.address = param->output_->mutable_data<half>();
+  args.output.scale_address = param->output_->scale;
+  param->output_arg = args; */
   return true;
 }
template <typename Dtype>
void PSROIPooling(const Dtype* bottom_data, const Dtype spatial_scale,
const int channels, const int height, const int width,
const int pooled_height, const int pooled_width,
const Dtype* bottom_rois, const int output_dim,
const int group_size, Dtype* top_data,
// int* mapping_channel,
int index, int* rois_batch_id) {
// The output is in order (n, ctop, ph, pw)
// static int cnt = 0;
int pw = index % pooled_width;
int ph = (index / pooled_width) % pooled_height;
int ctop = (index / pooled_width / pooled_height) % output_dim;
int n = index / pooled_width / pooled_height / output_dim;
// [start, end) interval for spatial sampling
bottom_rois += n * 4;
int roi_batch_ind = rois_batch_id[n]; // bottom_rois[0];
Dtype roi_start_w = static_cast<Dtype>(round(bottom_rois[0])) * spatial_scale;
Dtype roi_start_h = static_cast<Dtype>(round(bottom_rois[1])) * spatial_scale;
Dtype roi_end_w =
static_cast<Dtype>(round(bottom_rois[2]) + 1.) * spatial_scale;
Dtype roi_end_h =
static_cast<Dtype>(round(bottom_rois[3]) + 1.) * spatial_scale;
// Force too small ROIs to be 1x1
Dtype roi_width = std::max(roi_end_w - roi_start_w, 0.1f); // avoid 0
Dtype roi_height = std::max(roi_end_h - roi_start_h, 0.1f);
// Compute w and h at bottom
Dtype bin_size_h = roi_height / static_cast<Dtype>(pooled_height);
Dtype bin_size_w = roi_width / static_cast<Dtype>(pooled_width);
int hstart = floor(static_cast<Dtype>(ph) * bin_size_h + roi_start_h);
int wstart = floor(static_cast<Dtype>(pw) * bin_size_w + roi_start_w);
int hend = ceil(static_cast<Dtype>(ph + 1) * bin_size_h + roi_start_h);
int wend = ceil(static_cast<Dtype>(pw + 1) * bin_size_w + roi_start_w);
// Add roi offsets and clip to input boundaries
hstart = std::min(std::max(hstart, 0), height);
hend = std::min(std::max(hend, 0), height);
wstart = std::min(std::max(wstart, 0), width);
wend = std::min(std::max(wend, 0), width);
bool is_empty = (hend <= hstart) || (wend <= wstart);
int gw = pw;
int gh = ph;
int c = (ctop * group_size + gh) * group_size + gw;
bottom_data += (roi_batch_ind * channels + c) * height * width;
Dtype out_sum = 0;
for (int h = hstart; h < hend; ++h) {
for (int w = wstart; w < wend; ++w) {
int bottom_index = h * width + w;
out_sum += bottom_data[bottom_index];
}
}
Dtype bin_area = (hend - hstart) * (wend - wstart);
top_data[index] = is_empty ? 0. : out_sum / bin_area;
}
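PSROIPooling fills exactly one output element per call, with `index` enumerating (n, ctop, ph, pw), so the caller drives it with a single flat loop over the whole output; the Compute method below does exactly that, passing output_channels as output_dim and pooled_height as group_size. A sketch of the calling pattern, factored out for clarity (the wrapper name is hypothetical):

// Hypothetical wrapper: one PSROIPooling call per flattened output element.
inline void RunPSROIPooling(const float* bottom_data, float spatial_scale,
                            int channels, int height, int width,
                            int pooled_height, int pooled_width,
                            const float* bottom_rois, int output_dim,
                            int group_size, float* top_data, int rois_num,
                            int* rois_batch_id) {
  const int total = rois_num * output_dim * pooled_height * pooled_width;
  for (int index = 0; index < total; ++index) {
    PSROIPooling<float>(bottom_data, spatial_scale, channels, height, width,
                        pooled_height, pooled_width, bottom_rois, output_dim,
                        group_size, top_data, index, rois_batch_id);
  }
}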
template <>
void PSRoiPoolKernel<FPGA, float>::Compute(const PSRoiPoolParam<FPGA>& param) {
   auto input_tensor = param.float_input.get();
@@ -71,7 +140,7 @@ void PSRoiPoolKernel<FPGA, float>::Compute(const PSRoiPoolParam<FPGA>& param) {
   auto* in = input_tensor;
   auto* rois = param.input_rois_;
-  auto* out = param.float_output.get();
+  auto* out = param.output_;  // param.float_output.get();
   auto pooled_height = param.pooled_height_;
   auto pooled_width = param.pooled_width_;
@@ -85,18 +154,17 @@ void PSRoiPoolKernel<FPGA, float>::Compute(const PSRoiPoolParam<FPGA>& param) {
   int width = in_dims[3];
   int rois_num = rois->dims()[0];
-  // TODO auto in_stride = framework::stride(in_dims);
-  // TODO auto out_stride = framework::stride(out->dims());
-  auto in_stride =
-      framework::stride({batch_size, height, width, input_channels});
-  auto out_stride = framework::stride(
-      {out->dims()[0], out->dims()[2], out->dims()[3], out->dims()[1]});
-  const float* input_data = in->data<float>();
+  auto data_nhwc = in->mutable_data<float>();
+  fpga::image::convert_to_chw(&data_nhwc, input_channels, height, width);
+  framework::DDim dims_out_new = framework::make_ddim(
+      {rois_num, (param.output_)->dims()[1], (((param.output_)->dims()[2])),
+       (param.output_)->dims()[3]});
+  (param.output_)->Resize(dims_out_new);
+  const float* input_data = data_nhwc;  // in->data<float>();
   framework::Tensor rois_batch_id_list;
   rois_batch_id_list.Resize({rois_num});
   auto rois_batch_id_data = rois_batch_id_list.mutable_data<int>();
-  return;
   PADDLE_MOBILE_ENFORCE(rois->NumLevels() > 0, "ROIS should not be empty");
@@ -124,78 +192,18 @@ void PSRoiPoolKernel<FPGA, float>::Compute(const PSRoiPoolParam<FPGA>& param) {
   auto input_rois = rois->data<float>();
   // calculate psroipooling, parallel processing can be implemented per ROI
-  for (int n = 0; n < rois_num; ++n) {
-    // set roi batch id
-    int roi_batch_id = rois_batch_id_data[n];
-    // [start, end) interval for spatial sampling
-    auto offset_input_rois = input_rois + n * 4;
-    auto roi_start_w =
-        static_cast<float>(round(offset_input_rois[0])) * spatial_scale;
-    auto roi_start_h =
-        static_cast<float>(round(offset_input_rois[1])) * spatial_scale;
-    auto roi_end_w =
-        static_cast<float>(round(offset_input_rois[2]) + 1.) * spatial_scale;
-    auto roi_end_h =
-        static_cast<float>(round(offset_input_rois[3]) + 1.) * spatial_scale;
-    // Force too small rois to be 1 x 1
-    auto roi_height = std::max(roi_end_h - roi_start_h, 0.1f);  // avoid 0
-    auto roi_width = std::max(roi_end_w - roi_start_w, 0.1f);
-    // Compute bin size w and h at input feature map
-    auto bin_size_h = roi_height / static_cast<float>(pooled_height);
-    auto bin_size_w = roi_width / static_cast<float>(pooled_width);
-    DLOG << 3;
-    // calculate each pixel of the output feature map.
-    int out_roi_offset = n * out_stride[0];
-    for (int c = 0; c < output_channels; ++c) {
-      // per category
-      // int out_plane_offset = out_roi_offset + c * out_stride[1];
-      int out_plane_offset = out_roi_offset + c;
-      for (int ph = 0; ph < pooled_height; ++ph) {
-        // TODO int out_row_offset = out_plane_offset + ph * out_stride[2];
-        int out_row_offset = out_plane_offset + ph * out_stride[1];
-        for (int pw = 0; pw < pooled_width; ++pw) {
-          // calculate w and h at input feature map
-          int hstart = floor(static_cast<float>(ph) * bin_size_h + roi_start_h);
-          int wstart = floor(static_cast<float>(pw) * bin_size_w + roi_start_w);
-          int hend =
-              ceil(static_cast<float>(ph + 1) * bin_size_h + roi_start_h);
-          int wend =
-              ceil(static_cast<float>(pw + 1) * bin_size_w + roi_start_w);
-          // Add roi offsets and clip to input boundaries
-          hstart = std::min(std::max(hstart, 0), height);
-          wstart = std::min(std::max(wstart, 0), width);
-          hend = std::min(std::max(hend, 0), height);
-          wend = std::min(std::max(wend, 0), width);
-          // TODO int output_index = out_row_offset + pw;
-          int output_index = out_row_offset + pw * output_channels;
-          int input_channel = (c * pooled_height + ph) * pooled_width + pw;
-          // TODO int input_plane_offset =
-          // TODO roi_batch_id * in_stride[0] + input_channel * in_stride[1];
-          int input_plane_offset = roi_batch_id * in_stride[0] + input_channel;
-          auto offset_input_data = input_data + input_plane_offset;
-          float out_sum = 0.;
-          bool is_empty = (hend <= hstart) || (wend <= wstart);
-          for (int ih = hstart; ih < hend; ++ih) {
-            for (int iw = wstart; iw < wend; ++iw) {
-              int input_index = ih * in_stride[1] + iw * input_channel;
-              out_sum += offset_input_data[input_index];
-            }
-          }
-          float bin_area = (hend - hstart) * (wend - wstart);
-          output_data[output_index] = is_empty ? 0. : out_sum / bin_area;
-        }
-      }
-    }
-  }
-  fpga::format_image(out);
-  fpga::PerformBypass(param.output_arg);
+  int index = pooled_height * pooled_width * output_channels * rois_num;
+  for (int idx = 0; idx < index; idx++) {
+    PSROIPooling<float>(input_data, spatial_scale, input_channels, height,
+                        width, pooled_height, pooled_width, input_rois,
+                        output_channels, pooled_height, output_data, idx,
+                        rois_batch_id_data);
+  }
+  //
+  fpga::image::convert_to_hwc(&output_data, output_channels, pooled_height,
+                              pooled_width, rois_num);
+  out->reset_data_ptr(output_data);
 }
}  // namespace operators
......
@@ -47,21 +47,11 @@ bool Reshape2Kernel<FPGA, float>::Init(Reshape2Param<FPGA> *param) {
 void reshape(LoDTensor *input, LoDTensor *output) {
   // Subscript r means after reshape
-  // TODO zhangyang verify this function
-  float *input_ptr_f, *output_ptr_f;
-  half *input_ptr_h, *output_ptr_h;
-  bool is_float = false;
-  if (input->type() == typeid(float)) {
-    input_ptr_f = input->data<float>();
-    output_ptr_f = output->data<float>();
-    is_float = true;
-  } else {
-    input_ptr_h = input->data<half>();
-    output_ptr_h = output->data<half>();
-  }
+  auto input_ptr = input->data<half>();
+  auto output_ptr = output->data<half>();
+  output->scale[0] = input->scale[0];
+  output->scale[1] = input->scale[1];
   auto C = static_cast<int>(input->dims()[1]);
   auto H = static_cast<int>(input->dims()[2]);
@@ -77,6 +67,8 @@ void reshape(LoDTensor *input, LoDTensor *output) {
   auto WCr_align = fpga::align_to_x(WCr, IMAGE_ALIGNMENT);
   auto HWr = Hr * Wr;
+  fpga::fpga_invalidate(input_ptr, H * WC_align * sizeof(half));
   int offset_align = 0;
   int offset_r = 0, offset_align_r = 0;
   int cr = 0, hr = 0, wr = 0;
@@ -87,21 +79,17 @@ void reshape(LoDTensor *input, LoDTensor *output) {
       int offset1 = w * C + offset0;
       for (int c = 0; c < C; c++) {
         offset_align = offset1 + c;
-        offset_r = c * HW + h * W + c;
+        offset_r = c * HW + h * W + w;
         cr = offset_r / HWr;
         hr = offset_r % HWr / Wr;
         wr = offset_r % Wr;
         offset_align_r = hr * WCr_align + wr * Cr + cr;
-        // DLOG << "hwc"<< h<< " " << w << " " << c;
-        // DLOG << "hrwrcr" << hr<< " " << wr << " " << cr;
-        if (is_float) {
-          output_ptr_f[offset_align_r] = input_ptr_f[offset_align];
-        } else {
-          output_ptr_h[offset_align_r] = input_ptr_h[offset_align];
-        }
+        output_ptr[offset_align_r] = input_ptr[offset_align];
       }
     }
   }
+  fpga::fpga_flush(output_ptr, Hr * WCr_align * sizeof(half));
 }
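Two fixes land in this hunk: the cache is now explicitly invalidated before reading and flushed after writing, and `offset_r` adds the column index `w` instead of repeating `c`, which previously scrambled every reshaped row. For reference, the index arithmetic assumes rows of W * C elements padded up to IMAGE_ALIGNMENT; a compact sketch of the two offsets involved (IMAGE_ALIGNMENT's value is not visible in this hunk, 16 is assumed here):

constexpr int kAlign = 16;  // assumed IMAGE_ALIGNMENT
inline int AlignTo(int x, int a) { return (x + a - 1) / a * a; }

// Aligned HWC offset of element (h, w, c) in an H x W x C image,
// matching `offset_align` above.
inline int HwcOffset(int h, int w, int c, int W, int C) {
  return h * AlignTo(W * C, kAlign) + w * C + c;
}

// Logical CHW offset of the same element, matching the fixed `offset_r`.
inline int ChwOffset(int h, int w, int c, int H, int W) {
  return c * H * W + h * W + w;
}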
template <>
@@ -123,6 +111,9 @@ void Reshape2Kernel<FPGA, float>::Compute(const Reshape2Param<FPGA> &param) {
   output->Resize(framework::make_ddim(shape));
   if (output->dims() == input->dims()) {
     DLOG << "No need to reshape";
+    output->ShareDataWith(*input);
+    framework::LoD lod = input->lod();
+    output->set_lod(lod);
     return;
   }
......
@@ -33,13 +33,18 @@ bool SliceKernel<FPGA, float>::Init(SliceParam<FPGA>* param) {
 template <>
 void SliceKernel<FPGA, float>::Compute(const SliceParam<FPGA>& param) {
   // Only support slicing in channel dimension
+  // Only support half data
+  // W must be aligned to 16
   auto input = param.input_;
-  DLOG << input;
+  auto output = param.output_;
   int HW = input->dims()[2] * input->dims()[3];
   int channel = input->dims()[1];
   auto input_ptr = input->data<half>();
-  auto output_ptr = param.output_->data<half>();
+  auto output_ptr = output->data<half>();
+  output->scale[0] = input->scale[0];
+  output->scale[1] = input->scale[1];
   int start = param.starts_[0], end = param.ends_[0];
   start = start < 0 ? start + channel : start;
@@ -47,9 +52,10 @@ void SliceKernel<FPGA, float>::Compute(const SliceParam<FPGA>& param) {
   start = start > channel ? channel : start;
   end = end > channel ? channel : end;
   int len = end - start;
+  size_t size = len * sizeof(half);
   for (int i = 0; i < HW; i++) {
-    memcpy(output_ptr + len * i, input_ptr + i * channel + start, len);
+    memcpy(output_ptr + len * i, input_ptr + i * channel + start, size);
   }
 }
 }  // namespace operators
......
@@ -23,14 +23,21 @@ namespace operators {
 template <>
 bool SoftmaxKernel<FPGA, float>::Init(SoftmaxParam<FPGA> *param) {
   auto input = const_cast<LoDTensor *>(param->InputX());
-  auto input_ptr = input->data<half>();
+  auto dims = framework::vectorize(input->dims());
+  half *input_ptr;
   auto out = param->Out();
+  if (input->type() == typeid(float)) {
+    out->Resize(framework::make_ddim(dims));
+    out->mutable_data<float>(framework::make_ddim(dims));
+  } else {
+    input_ptr = input->data<half>();
+  }
   auto float_input = new Tensor;
   PADDLE_MOBILE_ENFORCE(input->dims().size() == 4,
                         "Softmax should have 4-order input");
-  auto dims = framework::vectorize(input->dims());
   auto channel = dims[3];
   if (channel == 1) {  // This input is generated by FC op, dims = [N C 1 1]
     PADDLE_MOBILE_ENFORCE(dims[2] == 1, "Softmax input must come from FC op");
@@ -41,9 +48,12 @@ bool SoftmaxKernel<FPGA, float>::Init(SoftmaxParam<FPGA> *param) {
   float_input->Resize(framework::make_ddim(dims));
   if (channel != 2) {  // Use CPU
+    out->Resize(framework::make_ddim(dims));
+    out->mutable_data<float>(framework::make_ddim(dims));
     float_input->init(typeid(float));
-    fpga::format_fp32_ofm(float_input);
-    fpga::format_fp32_ofm(out);
+    float_input->mutable_data<float>(framework::make_ddim(dims));
+    // fpga::format_fp32_ofm(float_input);
+    // fpga::format_fp32_ofm(out);
     fpga::BypassArgs args = {fpga::DATA_TYPE_FP16};
     args.input_layout_type = fpga::LAYOUT_HWC;
@@ -51,7 +61,7 @@ bool SoftmaxKernel<FPGA, float>::Init(SoftmaxParam<FPGA> *param) {
     args.input_data_type = fpga::DATA_TYPE_FP16;
     args.output_data_type = fpga::DATA_TYPE_FP32;
     args.image.address = input_ptr;
-    args.image.height = (uint32_t)dims[1];
+    args.image.height = (uint32_t)dims[1] * dims[0];
     args.image.width = (uint32_t)dims[2];
     args.image.channels = (uint32_t)dims[3];
     args.output.address = float_input->data<float>();
@@ -80,14 +90,23 @@ bool SoftmaxKernel<FPGA, float>::Init(SoftmaxParam<FPGA> *param) {
 template <>
 void SoftmaxKernel<FPGA, float>::Compute(const SoftmaxParam<FPGA> &param) {
+  auto *in_x = (param.InputX());
+  if (in_x->type() == typeid(half)) {
     fpga::PerformBypass(param.FpgaArgs());
+    if (param.FpgaArgs().output.activation.activation_type != fpga::SOFTMAX) {
+      Tensor *out = param.Out();
+      Tensor *in_x2 = param.FloatInput();
+      fpga::fpga_invalidate(in_x2->data<float>(),
+                            in_x2->numel() * sizeof(float));
+      math::SoftmaxFuntor<CPU, float>()(in_x2, out);
+      fpga::fpga_flush(out->data<float>(), out->memory_size());
+    }
+  } else {
     if (param.FpgaArgs().output.activation.activation_type != fpga::SOFTMAX) {
       Tensor *out = param.Out();
-      Tensor *in_x = param.FloatInput();
-      fpga::fpga_invalidate(in_x->data<float>(), in_x->numel() * sizeof(float));
       math::SoftmaxFuntor<CPU, float>()(in_x, out);
-      fpga::fpga_flush(out->data<float>(), out->memory_size());
     }
   }
 }
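When the bypass cannot apply softmax on the device, the kernel falls back to math::SoftmaxFuntor on the CPU. Softmax itself is the usual row-wise exp-normalize; a minimal, numerically stable sketch of that computation (the textbook formulation, not the paddle_mobile implementation):

#include <algorithm>
#include <cmath>

// softmax(x)_i = exp(x_i - max(x)) / sum_j exp(x_j - max(x));
// subtracting the row max guards against overflow in exp().
inline void SoftmaxRow(const float* x, float* y, int n) {
  const float m = *std::max_element(x, x + n);
  float sum = 0.f;
  for (int i = 0; i < n; ++i) sum += (y[i] = std::exp(x[i] - m));
  for (int i = 0; i < n; ++i) y[i] /= sum;
}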
......
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include "framework/operator.h"
#include "operators/op_param.h"
namespace paddle_mobile {
namespace operators {
template <typename DeviceType, typename T>
class Pad2dKernel
: public framework::OpKernelBase<DeviceType, Pad2dParam<DeviceType>> {
public:
void Compute(const Pad2dParam<DeviceType> &param);
bool Init(Pad2dParam<DeviceType> *param);
};
} // namespace operators
} // namespace paddle_mobile
@@ -1221,6 +1221,7 @@ class FetchParam : public OpParam {
   RType *input_x_;
   Tensor *out_;
 #ifdef PADDLE_MOBILE_FPGA
+
  public:
   fpga::BypassArgs fpga_bypass_args;
@@ -2415,6 +2416,120 @@ class FusionDeconvAddParam : public ConvTransposeParam<Dtype> {
 template <typename Dtype>
 using FusionDeconvAddReluParam = FusionDeconvAddParam<Dtype>;
 #endif
#ifdef FUSION_DECONVADDBN_OP
template <typename Dtype>
class FusionDeconvAddBNParam : public ConvTransposeParam<Dtype> {
typedef typename DtypeTensorTrait<Dtype>::gtype GType;
typedef typename DtypeTensorTrait<Dtype>::rtype RType;
public:
FusionDeconvAddBNParam(const VariableNameMap &inputs,
const VariableNameMap &outputs,
const AttributeMap &attrs, const Scope &scope)
: ConvTransposeParam<Dtype>(inputs, outputs, attrs, scope) {
output_ = OpParam::OutFrom<GType>(outputs, scope);
input_bias_ = OpParam::InputBiasFrom<GType>(inputs, scope);
input_mean_ = OpParam::InputMeanFrom<GType>(inputs, scope);
input_scale_ = OpParam::InputScaleFrom<GType>(inputs, scope);
input_variance_ = OpParam::InputVarianceFrom<GType>(inputs, scope);
epsilon_ = OpParam::GetAttr<float>("epsilon", attrs);
momentum_ = OpParam::GetAttr<float>("momentum", attrs);
// is_test_ = OpParam::GetAttr<bool>("is_test", attrs);
}
RType *Output() const { return output_; }
const RType *InputBias() const { return input_bias_; }
const RType *InputMean() const { return input_mean_; }
const RType *InputScale() const { return input_scale_; }
const RType *InputVariance() const { return input_variance_; }
const float &Epsilon() const { return epsilon_; }
const float &Momentum() const { return momentum_; }
const bool &IsTest() const { return is_test_; }
void SetNewScale(RType *new_scale) { new_scale_ = new_scale; }
void SetNewBias(RType *new_bias) { new_bias_ = new_bias; }
const RType *NewScale() const { return new_scale_; }
const RType *NewBias() const { return new_bias_; }
protected:
RType *output_;
RType *input_bias_;
RType *input_mean_;
RType *input_scale_;
RType *input_variance_;
float epsilon_;
float momentum_;
bool is_test_;
RType *new_bias_;
RType *new_scale_;
};
#endif
#ifdef FUSION_DECONVADDBNRELU_OP
template <typename Dtype>
class FusionDeconvAddBNReluParam : public ConvTransposeParam<Dtype> {
typedef typename DtypeTensorTrait<Dtype>::gtype GType;
typedef typename DtypeTensorTrait<Dtype>::rtype RType;
public:
FusionDeconvAddBNReluParam(const VariableNameMap &inputs,
const VariableNameMap &outputs,
const AttributeMap &attrs, const Scope &scope)
: ConvTransposeParam<Dtype>(inputs, outputs, attrs, scope) {
output_ = OpParam::OutFrom<GType>(outputs, scope);
input_bias_ = OpParam::InputBiasFrom<GType>(inputs, scope);
input_mean_ = OpParam::InputMeanFrom<GType>(inputs, scope);
input_scale_ = OpParam::InputScaleFrom<GType>(inputs, scope);
input_variance_ = OpParam::InputVarianceFrom<GType>(inputs, scope);
epsilon_ = OpParam::GetAttr<float>("epsilon", attrs);
momentum_ = OpParam::GetAttr<float>("momentum", attrs);
// is_test_ = OpParam::GetAttr<bool>("is_test", attrs);
}
RType *Output() const { return output_; }
const RType *InputBias() const { return input_bias_; }
const RType *InputMean() const { return input_mean_; }
const RType *InputScale() const { return input_scale_; }
const RType *InputVariance() const { return input_variance_; }
const float &Epsilon() const { return epsilon_; }
const float &Momentum() const { return momentum_; }
const bool &IsTest() const { return is_test_; }
void SetNewScale(RType *new_scale) { new_scale_ = new_scale; }
void SetNewBias(RType *new_bias) { new_bias_ = new_bias; }
const RType *NewScale() const { return new_scale_; }
const RType *NewBias() const { return new_bias_; }
protected:
RType *output_;
RType *input_bias_;
RType *input_mean_;
RType *input_scale_;
RType *input_variance_;
float epsilon_;
float momentum_;
bool is_test_;
RType *new_bias_;
RType *new_scale_;
};
#endif
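Both new params expose SetNewScale/SetNewBias, but the diff does not show how they are filled. The standard way a deconv+add+BN fusion pass folds the batch-norm statistics into a per-channel affine (an assumption about the pass, stated here for reference) is new_scale = scale / sqrt(variance + epsilon) and new_bias = bias - mean * new_scale. A sketch of that folding:

#include <cmath>
#include <vector>

// Fold BN (scale, bias, mean, variance, epsilon) into per-channel
// new_scale / new_bias so that y = new_scale * x + new_bias.
// Sketch of the standard folding formula, not the fusion pass itself.
inline void FoldBatchNorm(const std::vector<float>& scale,
                          const std::vector<float>& bias,
                          const std::vector<float>& mean,
                          const std::vector<float>& variance, float epsilon,
                          std::vector<float>* new_scale,
                          std::vector<float>* new_bias) {
  const size_t c = scale.size();
  new_scale->resize(c);
  new_bias->resize(c);
  for (size_t i = 0; i < c; ++i) {
    (*new_scale)[i] = scale[i] / std::sqrt(variance[i] + epsilon);
    (*new_bias)[i] = bias[i] - mean[i] * (*new_scale)[i];
  }
}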
 #ifdef FUSION_DECONVRELU_OP
 template <typename Dtype>
@@ -3114,6 +3229,26 @@ class IncrementParam : public OpParam {
   int step_;
 };
 #endif  // INCREMENT_OP
#ifdef PAD2D_OP
template <typename Dtype>
class Pad2dParam : public OpParam {
typedef typename DtypeTensorTrait<Dtype>::gtype GType;
typedef typename DtypeTensorTrait<Dtype>::rtype RType;
public:
Pad2dParam(const VariableNameMap &inputs, const VariableNameMap &outputs,
const AttributeMap &attrs, const Scope &scope) {
input_x_ = InputXFrom<GType>(inputs, scope);
out_ = OutFrom<GType>(outputs, scope);
}
const RType *InputX() const { return input_x_; }
RType *Out() const { return out_; }
private:
RType *input_x_;
RType *out_;
};
#endif
}  // namespace operators
}  // namespace paddle_mobile
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifdef PAD2D_OP
#include "operators/pad2d_op.h"
namespace paddle_mobile {
namespace operators {
template <typename Dtype, typename T>
void Pad2dOp<Dtype, T>::InferShape() const {
auto input_dims = this->param_.InputX()->dims();
auto input_n = input_dims[0];
auto input_c = input_dims[1];
auto input_h = input_dims[2];
auto input_w = input_dims[3];
this->param_.Out()->Resize({input_n, input_c, input_h + 1, input_w + 1});
}
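// Shape-only note: InferShape grows both spatial dimensions by exactly one,
// e.g. an input of {1, 8, 32, 32} produces an output of {1, 8, 33, 33}.
// The pad values themselves are supplied by the Pad2d kernel, which this
// file does not define.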
} // namespace operators
} // namespace paddle_mobile
namespace ops = paddle_mobile::operators;
#ifdef PADDLE_MOBILE_CPU
REGISTER_OPERATOR_CPU(pad2d, ops::Pad2dOp);
#endif
#ifdef PADDLE_MOBILE_FPGA
REGISTER_OPERATOR_FPGA(pad2d, ops::Pad2dOp);
#endif
#endif
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifdef PAD2D_OP
#pragma once
#include <string>
#include "framework/operator.h"
#include "operators/kernel/pad2d_kernel.h"
#include "operators/op_param.h"
namespace paddle_mobile {
namespace operators {
using framework::AttributeMap;
using framework::OperatorWithKernel;
using framework::Scope;
using std::string;
template <typename DeviceType, typename T>
class Pad2dOp
: public OperatorWithKernel<DeviceType, Pad2dParam<DeviceType>,
operators::Pad2dKernel<DeviceType, T>> {
public:
Pad2dOp(const string &type, const VariableNameMap &inputs,
const VariableNameMap &outputs, const AttributeMap &attrs,
std::shared_ptr<Scope> scope)
: OperatorWithKernel<DeviceType, Pad2dParam<DeviceType>,
operators::Pad2dKernel<DeviceType, T>>(
type, inputs, outputs, attrs, scope) {}
void InferShape() const override;
private:
};
} // namespace operators
} // namespace paddle_mobile
#endif
@@ -97,7 +97,7 @@ void dump_stride_float(std::string filename, Tensor input_tensor,
   out.close();
 }
 static const char *g_resnet50 = "../models/resnet50";
-const std::string g_image_src_float = "../images/image_src_float";
+const std::string g_image_src_float = "../images/image_src_float";  // NOLINT
 int main() {
   paddle_mobile::fpga::open_device();
   paddle_mobile::PaddleMobile<paddle_mobile::FPGA> paddle_mobile;
@@ -114,14 +114,14 @@ int main() {
     std::string saveName = "resnet50_result_" + std::to_string(i);
     paddle_mobile::fpga::fpga_invalidate((*tensor_ptr).get_data(),
                                          tensor_ptr->numel() * sizeof(half));
-    dump_stride_half(saveName, (*tensor_ptr), 20);
+    // dump_stride_half(saveName, (*tensor_ptr), 20);
     // dump(saveName, (*tensor_ptr));
   }
   auto tensor_ptr = paddle_mobile.FetchResult(73);
-  dump_stride_float("resnet50_result_73", (*tensor_ptr), 20);
+  // dump_stride_float("resnet50_result_73", (*tensor_ptr), 20);
   tensor_ptr = paddle_mobile.FetchResult(74);
-  dump_stride_float("resnet50_result_74", (*tensor_ptr), 9999);
+  // dump_stride_float("resnet50_result_74", (*tensor_ptr), 9999);
   float max = 0;
   auto data_ptr = tensor_ptr->data<float>();
......
@@ -23,21 +23,101 @@ limitations under the License. */
 #include "fpga/V2/api.h"
 #endif
+#include <string>
-void readStream(std::string filename, uint8_t *buf) {
+void readStream(std::string filename, char *buf) {
   std::ifstream in;
-  in.open(filename, std::ios::in);
+  in.open(filename, std::ios::in | std::ios::binary);
   if (!in.is_open()) {
     std::cout << "open File Failed." << std::endl;
     return;
   }
-  int i = 0;
-  while (!in.eof()) {
-    in >> buf[i];
-    i++;
-  }
+  in.seekg(0, std::ios::end);  // go to the end
+  auto length = in.tellg();    // report location (this is the length)
+  in.seekg(0, std::ios::beg);  // go back to the beginning
+  in.read(buf, length);
+  DLOG << length;
   in.close();
 }
void convert_to_chw(int16_t **data_in, int channel, int height, int width,
int num, int16_t *data_tmp) {
int64_t amount_per_side = width * height;
for (int n = 0; n < num; n++) {
for (int h = 0; h < height; h++) {
for (int w = 0; w < width; w++) {
for (int c = 0; c < channel; c++) {
*(data_tmp + n * amount_per_side * channel + c * amount_per_side +
width * h + w) = *((*data_in)++);
}
}
}
}
}
void dump_stride_half(std::string filename, Tensor input_tensor,
const int dumpnum, bool use_chw) {
// bool use_chw = true;
if (input_tensor.dims().size() != 4) return;
int c = (input_tensor.dims())[1];
int h = (input_tensor.dims())[2];
int w = (input_tensor.dims())[3];
int n = (input_tensor.dims())[0];
auto data_ptr = input_tensor.get_data();
auto *data_ptr_16 = reinterpret_cast<half *>(data_ptr);
auto data_tmp = data_ptr_16;
if (use_chw) {
data_tmp =
reinterpret_cast<half *>(malloc(n * c * h * w * sizeof(int16_t)));
convert_to_chw(&data_ptr_16, c, h, w, n, data_tmp);
}
std::ofstream out(filename.c_str());
float result = 0;
int stride = input_tensor.numel() / dumpnum;
stride = stride > 0 ? stride : 1;
for (int i = 0; i < input_tensor.numel(); i += stride) {
result = paddle_mobile::fpga::fp16_2_fp32(data_tmp[i]);
out << result << std::endl;
}
out.close();
if (data_tmp != data_ptr_16) {
free(data_tmp);
}
}
void dump_stride_float(std::string filename, Tensor input_tensor,
const int dumpnum) {
auto data_ptr = reinterpret_cast<float *>(input_tensor.get_data());
std::ofstream out(filename.c_str());
float result = 0;
int stride = input_tensor.numel() / dumpnum;
stride = stride > 0 ? stride : 1;
for (int i = 0; i < input_tensor.numel(); i += stride) {
result = data_ptr[i];
out << result << std::endl;
}
out.close();
}
void dump_stride(std::string filename, Tensor input_tensor, const int dumpnum,
bool use_chw) {
static int i = 0;
if (input_tensor.numel() == 0) {
return;
}
if (input_tensor.type() == typeid(float)) {
DLOG << "op: " << i++ << ", float data " << input_tensor.numel();
dump_stride_float(filename, input_tensor, dumpnum);
} else {
DLOG << "op: " << i++ << ", half data " << input_tensor.numel();
dump_stride_half(filename, input_tensor, dumpnum, use_chw);
}
DLOG << "dump input address: " << input_tensor.get_data();
}
static const char *g_rfcn_combine = "../models/rfcn";
static const char *g_image_src_float = "../models/rfcn/data.bin";
int main() {
@@ -48,12 +128,45 @@ int main() {
                               std::string(g_rfcn_combine) + "/params", true,
                               false, 1, true)) {
     float img_info[3] = {768, 1536, 768.0f / 960.0f};
-    auto img = fpga::fpga_malloc(768 * 1536 * 3 * sizeof(float));
-    readStream(g_image_src_float, reinterpret_cast<uint8_t *>(img));
+    auto img = reinterpret_cast<float *>(
+        fpga::fpga_malloc(768 * 1536 * 3 * sizeof(float)));
+    readStream(g_image_src_float, reinterpret_cast<char *>(img));
     std::vector<void *> v(3, nullptr);
     paddle_mobile.FeedData({img_info, img});
     paddle_mobile.Predict_To(-1);
-    paddle_mobile.GetResults(&v);
for (int i = 55; i < 69; i++) {
auto tensor_ptr = paddle_mobile.FetchResult(i);
std::string saveName = "rfcn_" + std::to_string(i);
// if(i != 58)
paddle_mobile::fpga::fpga_invalidate((*tensor_ptr).get_data(),
tensor_ptr->numel() * sizeof(float));
// tensor_ptr->numel() * sizeof(float));
if ((i == 48) || (i == 47)) {
dump_stride(saveName, (*tensor_ptr), 20,
false); // 20);//tensor_ptr->numel());
} else if (i == 55) {
dump_stride(saveName, (*tensor_ptr), tensor_ptr->numel(),
true); // 20);//tensor_ptr->numel());
} else {
dump_stride(saveName, (*tensor_ptr), tensor_ptr->numel(),
true); // 20);//tensor_ptr->numel());
}
/* float result = 0;
std::string str = "softmax_input_data";
float* data =
static_cast<float*>(fpga::fpga_malloc(tensor_ptr->numel() *
sizeof(float))); str = "softmax_output_data"; auto output_ptr =
static_cast<half*>((*tensor_ptr).get_data()); for (int idx = 0; idx <
tensor_ptr->numel(); ++idx)
{
data[idx] = fpga::fp16_2_fp32(output_ptr[idx]);
}
fpga::savefile<float>(str,data, tensor_ptr->numel(), result ); */
}
// paddle_mobile.GetResults(&v);
DLOG << "Computation done"; DLOG << "Computation done";
fpga::fpga_free(img); fpga::fpga_free(img);
} }
......
@@ -131,7 +131,12 @@ if (CON GREATER -1)
     set(PROPOSAL_OP ON)
     set(ANCHOR_GENERATOR_OP ON)
     set(SLICE_OP ON)
+    set(SIGMOID_OP ON)
+    set(CONCAT_OP ON)
+    set(PAD2D_OP ON)
+    set(CONV_TRANSPOSE_OP ON)
+    set(FUSION_DECONVADDBNRELU_OP ON)
+    set(FUSION_DECONVADDBN_OP ON)
     set(FOUND_MATCH ON)
 endif()
@@ -573,7 +578,6 @@ endif()
 if (FUSION_DECONVADDRELU_OP)
     add_definitions(-DFUSION_DECONVADDRELU_OP)
 endif()
-
 if (WHILE_OP)
     add_definitions(-DWHILE_OP)
 endif()
@@ -602,3 +606,12 @@ endif()
 if (ROI_PERSPECTIVE_OP)
     add_definitions(-DROI_PERSPECTIVE_OP)
 endif()
if (FUSION_DECONVADDBNRELU_OP)
add_definitions(-DFUSION_DECONVADDBNRELU_OP)
endif()
if (FUSION_DECONVADDBN_OP)
add_definitions(-DFUSION_DECONVADDBN_OP)
endif()
if (PAD2D_OP)
add_definitions(-DPAD2D_OP)
endif()