提交 09d5c269 编写于 作者: xiebaiyuan's avatar xiebaiyuan

Merge remote-tracking branch 'refs/remotes/upstream/develop' into develop_s1p0

......@@ -105,12 +105,14 @@ const char *G_OP_TYPE_FUSION_DECONV_ADD_RELU = "fusion_deconv_add_relu";
const char *G_OP_TYPE_SEQUENCE_EXPAND = "sequence_expand";
const char *G_OP_TYPE_SEQUENCE_POOL = "sequence_pool";
const char *G_OP_TYPE_SEQUENCE_SOFTMAX = "sequence_softmax";
const char *G_OP_TYPE_SLICE = "slice";
const char *G_OP_TYPE_ANCHOR_GENERATOR = "anchor_generator";
const char *G_OP_TYPE_GENERATE_PROPOSALS = "generate_proposals";
const char *G_OP_TYPE_PSROI_POOL = "psroi_pool";
const char *G_OP_TYPE_ROI_PERSPECTIVE = "roi_perspective_transform";
const char *G_OP_TYPE_PAD2D = "pad2d";
const char *G_OP_TYPE_FUSION_DECONV_ADD_BN_RELU = "fusion_deconv_add_bn_relu";
const char *G_OP_TYPE_FUSION_DECONV_ADD_BN = "fusion_deconv_add_bn";
std::unordered_map<
std::string, std::pair<std::vector<std::string>, std::vector<std::string>>>
......@@ -210,5 +212,8 @@ std::unordered_map<
{{"Scores", "BboxDeltas", "ImInfo", "Anchors", "Variances"},
{"RpnRois", "RpnRoiProbs"}}},
{G_OP_TYPE_PSROI_POOL, {{"X", "ROIs"}, {"Out"}}},
{G_OP_TYPE_ROI_PERSPECTIVE, {{"X", "ROIs"}, {"Out"}}}};
{G_OP_TYPE_ROI_PERSPECTIVE, {{"X", "ROIs"}, {"Out"}}},
{G_OP_TYPE_FUSION_DECONV_ADD_BN_RELU, {{"Input"}, {"Out"}}},
{G_OP_TYPE_FUSION_DECONV_ADD_BN, {{"Input"}, {"Out"}}},
{G_OP_TYPE_PAD2D, {{"X"}, {"Out"}}}};
} // namespace paddle_mobile
......@@ -199,6 +199,9 @@ extern const char *G_OP_TYPE_ANCHOR_GENERATOR;
extern const char *G_OP_TYPE_GENERATE_PROPOSALS;
extern const char *G_OP_TYPE_PSROI_POOL;
extern const char *G_OP_TYPE_ROI_PERSPECTIVE;
extern const char *G_OP_TYPE_PAD2D;
extern const char *G_OP_TYPE_FUSION_DECONV_ADD_BN_RELU;
extern const char *G_OP_TYPE_FUSION_DECONV_ADD_BN;
extern std::unordered_map<
std::string, std::pair<std::vector<std::string>, std::vector<std::string>>>
......
......@@ -162,7 +162,7 @@ void format_dwconv_filter(framework::Tensor *filter_tensor, float *scale_ptr) {
fpga_copy(new_data, data_ptr, memory_size);
filter::format_dwconv_filter(&new_data, num, height, width, scale_ptr);
filter_tensor->reset_data_ptr(new_data);
filter_tensor->set_type(typeid(int8_t));
filter_tensor->set_type(typeid(int16_t));
}
void format_DWDconv_filter(framework::Tensor *filter_tensor, float *scale_ptr,
......@@ -396,8 +396,8 @@ void expand_conv_arg(ConvArgs *arg) {
// auto cmd = 0UL | (args.relu_enabled ? USE_RELU : 0) | USE_BIAS;
auto cmd = 0UL | USE_BIAS;
auto deconv_param = ((args.deconv_tx_param.deconv_en) << 24) |
((args.deconv_tx_param.sub_conv_num) << 16) |
auto deconv_param = ((args.deconv_tx_param.deconv_en) << 16) |
((args.deconv_tx_param.sub_conv_num) << 8) |
((args.deconv_tx_param.omit_size) << 0);
(*arg).driver.image_address_phy = vaddr_to_paddr(args.image.address);
(*arg).driver.sb_address_phy = vaddr_to_paddr(args.sb_address);
......@@ -623,7 +623,7 @@ void fill_deconv_arg(struct DeconvArgs *arg, framework::Tensor *input,
fpga::format_fp16_ofm(out, dims_out_new);
auto out_ptr = out->data<half>();
arg->output.address =
out_ptr +
(half *)out_ptr + // NOLINT
omit_size * sizeof(half) *
(align_to_x(real_out_width * arg->filter_num, IMAGE_ALIGNMENT));
arg->output.scale_address = out->scale;
......@@ -713,6 +713,7 @@ void fill_deconv_arg(struct DeconvArgs *arg, framework::Tensor *input,
}
for (int j = 0; j < split_num; ++j) {
// arg->split_conv_args[i]->conv_arg[j].relu_enabled = relu_enabled;
arg->split_conv_args[i]->conv_arg[j].output.activation.activation_type =
activation_enable;
arg->split_conv_args[i]
......@@ -758,8 +759,8 @@ void fill_deconv_arg(struct DeconvArgs *arg, framework::Tensor *input,
align_to_x(arg->split_conv_args[i]->conv_arg[j].filter_num,
FILTER_NUM_ALIGNMENT) *
sizeof(int8_t);
auto filter_head =
&filter_ptr[j * element_num * filter_num_per_div + // NOLINT
auto filter_head = &((
int8_t *)filter_ptr)[j * element_num * filter_num_per_div + // NOLINT
i * filter_sub_conv_offset];
arg->split_conv_args[i]->conv_arg[j].filter_address =
fpga_malloc(filter_size);
......@@ -774,6 +775,19 @@ void fill_deconv_arg(struct DeconvArgs *arg, framework::Tensor *input,
fpga_flush(arg->split_conv_args[i]->conv_arg[j].filter_address,
filter_size);
/*{
static int cnt = 0;
std::string str = "deconv_filter";
if(cnt <= 1){
cnt++;
str += std::to_string(cnt);
int8_t result = 0;
fpga::savefile<int8_t>(str,
arg->split_conv_args[i]->conv_arg[j].filter_address, filter_size, result);
}
}*/
size_t bs_align_num = align_to_x(
arg->split_conv_args[i]->conv_arg[j].filter_num, BS_NUM_ALIGNMENT);
size_t bs_size = 2 * bs_align_num * sizeof(float);
......@@ -789,6 +803,20 @@ void fill_deconv_arg(struct DeconvArgs *arg, framework::Tensor *input,
memcpy(arg->split_conv_args[i]->conv_arg[j].sb_address, bs_head, bs_size);
fpga_flush(arg->split_conv_args[i]->conv_arg[j].sb_address, bs_size);
/* {
static int cnt = 0;
std::string str = "deconv_sb";
if(cnt <= 1){
cnt++;
str += std::to_string(cnt);
float result = 0;
fpga::savefile<float>(str,
arg->split_conv_args[i]->conv_arg[j].sb_address, 2 * bs_align_num,
result);
}
}*/
if (split_num == 1) {
arg->split_conv_args[i]->conv_arg[j].output.address =
arg->split_conv_args[i]->output.address;
......@@ -835,13 +863,10 @@ void fill_dwconv_arg(struct DWconvArgs *arg, framework::Tensor *input,
int16_t leaky_relu_negative_slope, int stride_h,
int stride_w, int padding_h, int padding_w,
float *bias_ptr) {
auto deleter = [](void *p) { fpga_free(p); };
arg->vector_dwconv_space.push_back(
std::shared_ptr<char>(reinterpret_cast<char *>(bias_ptr), deleter));
auto filter_ptr = filter->data<uint8_t>();
auto filter_ptr = filter->data<int16_t>();
auto input_ptr = input->data<half>();
auto output_ptr = out->mutable_data<half>();
auto output_ptr = out->data<half>();
arg->sub_conv_num = 1;
// arg->relu_enabled = relu_enabled;
arg->output.activation.activation_type = activation_enable;
......
......@@ -21,15 +21,37 @@ namespace paddle_mobile {
namespace fpga {
namespace image {
void convert_to_hwc(float **data_in, int channel, int height, int width) {
float *data_tmp =
(float *)fpga_malloc(channel * height * width * sizeof(float)); // NOLINT
void convert_to_hwc(float **data_in, int channel, int height, int width,
int num) {
float *data_tmp = reinterpret_cast<float *>(
fpga_malloc(num * channel * height * width * sizeof(float)));
int64_t amount_per_row = width * channel;
for (int n = 0; n < num; n++) {
for (int c = 0; c < channel; c++) {
for (int h = 0; h < height; h++) {
int64_t offset_height = h * amount_per_row;
for (int w = 0; w < width; w++) {
*(data_tmp + offset_height + w * channel + c) = *((*data_in)++);
*(data_tmp + n * channel * height * width + offset_height +
w * channel + c) = *((*data_in)++);
}
}
}
}
*data_in = data_tmp;
}
void convert_to_chw(float **data_in, int channel, int height, int width,
int num) {
float *data_tmp =
(float *)fpga_malloc(channel * height * width * sizeof(float)); // NOLINT
int64_t amount_per_side = width * height;
for (int n = 0; n < num; n++) {
for (int h = 0; h < height; h++) {
for (int w = 0; w < width; w++) {
for (int c = 0; c < channel; c++) {
*(data_tmp + n * height * width * channel + c * amount_per_side +
width * h + w) = *((*data_in)++);
}
}
}
}
......@@ -55,7 +77,7 @@ void align_element_conv(float **data_in, int height, int cw) {
}
void format_image(float **data_in, int channel, int height, int width) {
convert_to_hwc(data_in, channel, height, width);
// convert_to_hwc(data_in, channel, height, width);
int cw = channel * width;
int align_cw = align_to_x(cw, IMAGE_ALIGNMENT);
if (align_cw != cw) {
......@@ -132,8 +154,8 @@ void split_image(int16_t *image_in, const float *scale_in, void **images_out,
for (int i = 0; i < image_num; i++) {
des_offset = h * align_to_x(channel_nums[i] * width, IMAGE_ALIGNMENT) +
w * channel_nums[i];
memcpy((int16_t *)images_out[i] + des_offset, image_in + src_offset,
channel_nums[i] * sizeof(int16_t));
memcpy(reinterpret_cast<int16_t *>(images_out[i]) + des_offset,
image_in + src_offset, channel_nums[i] * sizeof(int16_t));
src_offset += channel_nums[i];
}
}
......
......@@ -20,7 +20,11 @@ namespace paddle_mobile {
namespace fpga {
namespace image {
void convert_to_hwc(float** data_in, int channel, int height, int width);
void convert_to_hwc(float** data_in, int channel, int height, int width,
int num = 1);
void convert_to_chw(float** data_in, int channel, int height, int width,
int num = 1);
void align_element_conv(float** data_in, int height, int cw);
void format_image(float** data_in, int channel, int height, int width);
......
......@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
limitations under the License. */
#include "framework/operator.h"
#include <memory>
#include "operators/op_param.h"
namespace paddle_mobile {
namespace framework {
......@@ -70,7 +70,12 @@ void OperatorBase<Dtype>::Run() {
auto vari = this->scope_->FindVar(var_vec_in[i]);
if (vari->IsInitialized()) {
const Tensor *tensor = vari->template Get<framework::LoDTensor>();
if (tensor) DLOG << type_ << " input- " << key << "=" << *tensor;
if (tensor) {
DLOG << type_ << " input- " << key << "=" << *tensor;
#ifdef PADDLE_MOBILE_FPGA
DLOG << var_vec_in[i];
#endif
}
}
}
}
......@@ -80,7 +85,12 @@ void OperatorBase<Dtype>::Run() {
auto vari = scope_->FindVar(var_vec_out[i]);
if (vari->IsInitialized()) {
const Tensor *tensor = vari->template Get<framework::LoDTensor>();
if (tensor) DLOG << type_ << " output- " << key << "=" << *tensor;
if (tensor) {
DLOG << type_ << " output- " << key << "=" << *tensor;
#ifdef PADDLE_MOBILE_FPGA
DLOG << var_vec_out[i];
#endif
}
}
}
}
......
......@@ -15,6 +15,7 @@ limitations under the License. */
#pragma once
#include <map>
#include <memory>
#include <string>
#include <utility>
#include <vector>
......@@ -80,7 +81,9 @@ class OperatorBase {
}
#ifdef PADDLE_MOBILE_FPGA
void InsertTensors();
void ChangeNameMap(string key, std::vector<string> value);
#endif
protected:
std::shared_ptr<Scope> scope_;
std::string type_;
......@@ -95,6 +98,7 @@ class OperatorBase {
template <typename Dtype, typename ParamType, typename KernelType>
class OperatorWithKernel : public OperatorBase<Dtype> {
public:
#ifndef PADDLE_MOBILE_FPGA1
OperatorWithKernel(const std::string &type, const VariableNameMap &inputs,
const VariableNameMap &outputs, const AttributeMap &attrs,
std::shared_ptr<Scope> scope)
......@@ -104,6 +108,25 @@ class OperatorWithKernel : public OperatorBase<Dtype> {
kernel_.InitCLHelper(scope->GetCLScpoe());
#endif
}
#else
OperatorWithKernel(const std::string &type, const VariableNameMap inputs,
const VariableNameMap &outputs, const AttributeMap &attrs,
std::shared_ptr<Scope> scope)
: OperatorBase<Dtype>(type, inputs, outputs, attrs, scope) {
static int feed_num = 0;
static int fetch_num = 0;
if (type == "feed") {
auto new_name = string("feed") + std::to_string(feed_num++);
auto var = scope->Var(new_name);
(const_cast<VariableNameMap &>(inputs)).at("X") = {string(new_name)};
} else if (type == "fetch") {
auto new_name = string("fetch") + std::to_string(fetch_num++);
auto var = scope->Var(new_name);
(const_cast<VariableNameMap &>(outputs)).at("Out") = {string(new_name)};
}
param_ = ParamType(inputs, outputs, attrs, *scope);
}
#endif
virtual void RunImpl() { this->kernel_.Compute(this->param_); }
virtual void InferShape() const = 0;
......
......@@ -126,6 +126,8 @@ std::vector<Variable *> Scope::VarContain(const std::string substring) {
return v;
}
void Scope::InsertVar(const std::string str, Variable *var) {}
void Scope::print_vars() {
DLOG << "====================start to print variables=================";
for (auto pair : vars_) {
......
......@@ -86,6 +86,7 @@ class Scope {
#ifdef PADDLE_MOBILE_FPGA
Variable *Var(const std::string &name, const int id);
std::vector<Variable *> VarContain(const std::string substring);
void InsertVar(const std::string str, Variable *var);
void print_vars();
#endif
......
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifdef FUSION_DECONVADDBN_OP
#include "operators/fusion_deconv_add_bn_op.h"
namespace paddle_mobile {
namespace operators {}
} // namespace paddle_mobile
namespace ops = paddle_mobile::operators;
REGISTER_FUSION_MATCHER(fusion_deconv_add_bn, ops::FusionDeconvAddBNMatcher);
#ifdef PADDLE_MOBILE_CPU
#endif
#ifdef PADDLE_MOBILE_MALI_GPU
#endif
#ifdef PADDLE_MOBILE_FPGA
REGISTER_OPERATOR_FPGA(fusion_deconv_add_bn, ops::FusionDeconvAddBNOp);
#endif
#endif
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifdef FUSION_DECONVADDBN_OP
#pragma once
#include <string>
#include <vector>
#include "framework/operator.h"
#include "framework/program/program-optimize/fusion_op_register.h"
#include "operators/kernel/deconv_add_bn_kernel.h"
namespace paddle_mobile {
namespace operators {
using std::string;
using std::vector;
class FusionDeconvAddBNMatcher : public framework::FusionOpMatcher {
public:
FusionDeconvAddBNMatcher() {
node_ = framework::Node(G_OP_TYPE_CONV_TRANSPOSE);
node_ > std::make_shared<framework::Node>(G_OP_TYPE_ELEMENTWISE_ADD) >
std::make_shared<framework::Node>(G_OP_TYPE_BATCHNORM);
}
void FolderNodes(
framework::Node *node,
std::vector<std::shared_ptr<framework::Node>> *removed_nodes) {
node->Folder(node_.Depth(), Type(),
{{G_OP_TYPE_ELEMENTWISE_ADD, {{"Y", "Y"}, {"X", "X"}}},
{G_OP_TYPE_BATCHNORM,
{{"Scale", "Scale"},
{"Mean", "Mean"},
{"Bias", "Bias"},
{"Variance", "Variance"},
{"Y", "BNY"}}}},
removed_nodes);
}
std::string Type() { return G_OP_TYPE_FUSION_DECONV_ADD_BN; }
};
template <typename DeviceType, typename T>
class FusionDeconvAddBNOp : public framework::OperatorWithKernel<
DeviceType, FusionDeconvAddBNParam<DeviceType>,
operators::DeconvAddBNKernel<DeviceType, T>> {
public:
FusionDeconvAddBNOp(const string &type, const VariableNameMap &inputs,
const VariableNameMap &outputs,
const framework::AttributeMap &attrs,
std::shared_ptr<framework::Scope> scope)
: framework::OperatorWithKernel<
DeviceType, FusionDeconvAddBNParam<DeviceType>,
operators::DeconvAddBNKernel<DeviceType, T>>(type, inputs, outputs,
attrs, scope) {}
void InferShape() const {
auto input = this->param_.Input();
auto in_dims = input->dims();
auto filter = this->param_.Filter();
auto filter_dims = filter->dims();
std::vector<int> strides = this->param_.Strides();
std::vector<int> paddings = this->param_.Paddings();
std::vector<int> dilations = this->param_.Dilations();
int groups = this->param_.Groups();
PADDLE_MOBILE_ENFORCE(
in_dims.size() == 4 || in_dims.size() == 5,
"ConvTransposeOp intput should be 4-D or 5-D tensor.");
PADDLE_MOBILE_ENFORCE(
in_dims.size() == filter_dims.size(),
"ConvTransposeOp input dimension and filter dimension "
"should be the same.");
PADDLE_MOBILE_ENFORCE(
in_dims.size() - strides.size() == 2U,
"ConvTransposeOp input dimension and strides dimension should "
"be consistent.");
PADDLE_MOBILE_ENFORCE(paddings.size() == strides.size(),
"ConvTransposeOp paddings dimension and strides "
"dimension should be the same.");
PADDLE_MOBILE_ENFORCE(paddings.size() == dilations.size(),
"ConvTransposeOp paddings dimension and dilations "
"dimension should be the same.");
PADDLE_MOBILE_ENFORCE(
in_dims[1] == filter_dims[0],
"In ConvTransposeOp, The number of input channels should "
"be equal to the number of filter's channels.");
std::vector<int64_t> output_shape({in_dims[0], filter_dims[1] * groups});
for (size_t i = 0; i < strides.size(); ++i) {
auto filter_extent = dilations[i] * (filter_dims[i + 2] - 1) + 1;
output_shape.push_back((in_dims[i + 2] - 1) * strides[i] -
2 * paddings[i] + filter_extent);
}
this->param_.Output()->Resize(framework::make_ddim(output_shape));
}
protected:
};
} // namespace operators
} // namespace paddle_mobile
#endif // FUSION_DECONV_ADD_BN_OP
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifdef FUSION_DECONVADDBNRELU_OP
#include "operators/fusion_deconv_add_bn_relu_op.h"
namespace paddle_mobile {
namespace operators {}
} // namespace paddle_mobile
namespace ops = paddle_mobile::operators;
REGISTER_FUSION_MATCHER(fusion_deconv_add_bn_relu,
ops::FusionDeconvAddBNReluMatcher);
#ifdef PADDLE_MOBILE_CPU
#endif
#ifdef PADDLE_MOBILE_MALI_GPU
#endif
#ifdef PADDLE_MOBILE_FPGA
REGISTER_OPERATOR_FPGA(fusion_deconv_add_bn_relu, ops::FusionDeconvAddBNReluOp);
#endif
#endif
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifdef FUSION_DECONVADDBNRELU_OP
#pragma once
#include <string>
#include <vector>
#include "framework/operator.h"
#include "framework/program/program-optimize/fusion_op_register.h"
#include "operators/kernel/deconv_add_bn_relu_kernel.h"
namespace paddle_mobile {
namespace operators {
using std::string;
using std::vector;
class FusionDeconvAddBNReluMatcher : public framework::FusionOpMatcher {
public:
FusionDeconvAddBNReluMatcher() {
node_ = framework::Node(G_OP_TYPE_CONV_TRANSPOSE);
node_ > std::make_shared<framework::Node>(G_OP_TYPE_ELEMENTWISE_ADD) >
std::make_shared<framework::Node>(G_OP_TYPE_BATCHNORM) >
std::make_shared<framework::Node>(G_OP_TYPE_RELU);
}
void FolderNodes(
framework::Node *node,
std::vector<std::shared_ptr<framework::Node>> *removed_nodes) {
node->Folder(node_.Depth(), Type(),
{{G_OP_TYPE_ELEMENTWISE_ADD, {{"Y", "Y"}, {"X", "X"}}},
{G_OP_TYPE_BATCHNORM,
{{"Scale", "Scale"},
{"Mean", "Mean"},
{"Bias", "Bias"},
{"Variance", "Variance"},
{"Y", "BNY"}}}},
removed_nodes);
}
std::string Type() { return G_OP_TYPE_FUSION_DECONV_ADD_BN_RELU; }
};
template <typename DeviceType, typename T>
class FusionDeconvAddBNReluOp
: public framework::OperatorWithKernel<
DeviceType, FusionDeconvAddBNReluParam<DeviceType>,
operators::DeconvAddBNReluKernel<DeviceType, T>> {
public:
FusionDeconvAddBNReluOp(const string &type, const VariableNameMap &inputs,
const VariableNameMap &outputs,
const framework::AttributeMap &attrs,
std::shared_ptr<framework::Scope> scope)
: framework::OperatorWithKernel<
DeviceType, FusionDeconvAddBNReluParam<DeviceType>,
operators::DeconvAddBNReluKernel<DeviceType, T>>(
type, inputs, outputs, attrs, scope) {}
void InferShape() const {
auto input = this->param_.Input();
auto in_dims = input->dims();
auto filter = this->param_.Filter();
auto filter_dims = filter->dims();
std::vector<int> strides = this->param_.Strides();
std::vector<int> paddings = this->param_.Paddings();
std::vector<int> dilations = this->param_.Dilations();
int groups = this->param_.Groups();
PADDLE_MOBILE_ENFORCE(
in_dims.size() == 4 || in_dims.size() == 5,
"ConvTransposeOp intput should be 4-D or 5-D tensor.");
PADDLE_MOBILE_ENFORCE(
in_dims.size() == filter_dims.size(),
"ConvTransposeOp input dimension and filter dimension "
"should be the same.");
PADDLE_MOBILE_ENFORCE(
in_dims.size() - strides.size() == 2U,
"ConvTransposeOp input dimension and strides dimension should "
"be consistent.");
PADDLE_MOBILE_ENFORCE(paddings.size() == strides.size(),
"ConvTransposeOp paddings dimension and strides "
"dimension should be the same.");
PADDLE_MOBILE_ENFORCE(paddings.size() == dilations.size(),
"ConvTransposeOp paddings dimension and dilations "
"dimension should be the same.");
PADDLE_MOBILE_ENFORCE(
in_dims[1] == filter_dims[0],
"In ConvTransposeOp, The number of input channels should "
"be equal to the number of filter's channels.");
std::vector<int64_t> output_shape({in_dims[0], filter_dims[1] * groups});
for (size_t i = 0; i < strides.size(); ++i) {
auto filter_extent = dilations[i] * (filter_dims[i + 2] - 1) + 1;
output_shape.push_back((in_dims[i + 2] - 1) * strides[i] -
2 * paddings[i] + filter_extent);
}
this->param_.Output()->Resize(framework::make_ddim(output_shape));
}
protected:
};
} // namespace operators
} // namespace paddle_mobile
#endif // FUSION_DECONV_ADD_BN_RELU_OP
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifdef FUSION_DECONVADDBN_OP
#pragma once
#include "framework/operator.h"
#include "operators/op_param.h"
namespace paddle_mobile {
namespace operators {
using framework::OpKernelBase;
template <typename DeviceType, typename T>
class DeconvAddBNKernel
: public OpKernelBase<DeviceType, FusionDeconvAddBNParam<DeviceType>> {
public:
void Compute(const FusionDeconvAddBNParam<DeviceType> &param);
bool Init(FusionDeconvAddBNParam<DeviceType> *param);
};
} // namespace operators
} // namespace paddle_mobile
#endif
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifdef FUSION_DECONVADDBNRELU_OP
#pragma once
#include "framework/operator.h"
#include "operators/op_param.h"
namespace paddle_mobile {
namespace operators {
using framework::OpKernelBase;
template <typename DeviceType, typename T>
class DeconvAddBNReluKernel
: public OpKernelBase<DeviceType, FusionDeconvAddBNReluParam<DeviceType>> {
public:
void Compute(const FusionDeconvAddBNReluParam<DeviceType> &param);
bool Init(FusionDeconvAddBNReluParam<DeviceType> *param);
};
} // namespace operators
} // namespace paddle_mobile
#endif
......@@ -43,9 +43,11 @@ bool AnchorGeneratorKernel<FPGA, float>::Init(
// DLOG << "stride_height: " << stride_height;
for (int h_idx = 0; h_idx < feature_height; ++h_idx) {
int offset0 = h_idx * feature_width * num_anchors * 4;
for (int w_idx = 0; w_idx < feature_width; ++w_idx) {
int offset = h_idx * w_idx * num_anchors * 4;
int offset1 = w_idx * num_anchors * 4;
for (int idx = 0; idx < num_anchors; idx++) {
int offset = offset0 + offset1 + idx * 4;
anchor_ptr[offset + 0] =
anchors_offset[idx * 4 + 0] + w_idx * stride_width;
anchor_ptr[offset + 1] =
......
文件模式从 100644 更改为 100755
......@@ -16,13 +16,10 @@ limitations under the License. */
#include "operators/kernel/conv_bn_relu_kernel.h"
#include <cmath>
namespace paddle_mobile {
namespace operators {
template <>
bool ConvBNReluKernel<FPGA, float>::Init(FusionConvBNReluParam<FPGA> *param) {
// bool relu_enabled = true;
paddle_mobile::fpga::ActivationType activation_enable =
paddle_mobile::fpga::LEAKYRELU;
int16_t leaky_relu_negative_slope = 0;
......@@ -43,7 +40,6 @@ bool ConvBNReluKernel<FPGA, float>::Init(FusionConvBNReluParam<FPGA> *param) {
auto new_bias = new Tensor();
auto new_scale_ptr = new_scale->mutable_data<float>({channel});
auto new_bias_ptr = new_bias->mutable_data<float>({channel});
for (int i = 0; i < channel; i++) {
new_scale_ptr[i] = bn_scale_ptr[i] /
static_cast<float>(pow((bn_var_ptr[i] + epsilon), 0.5));
......@@ -51,7 +47,16 @@ bool ConvBNReluKernel<FPGA, float>::Init(FusionConvBNReluParam<FPGA> *param) {
bs_ptr[i + channel] = new_scale_ptr[i];
bs_ptr[i] = new_bias_ptr[i];
}
const int groups = param->Groups();
if (groups == channel) {
fpga::format_dwconv_data(filter, out, new_scale_ptr, &new_bias_ptr);
fpga::DWconvArgs dwconv_arg = {0};
fpga::fill_dwconv_arg(&dwconv_arg, input, out, filter, activation_enable,
leaky_relu_negative_slope, param->Strides()[0],
param->Strides()[1], param->Paddings()[0],
param->Paddings()[1], new_bias_ptr);
param->SetFpgaArgs(dwconv_arg);
} else {
fpga::format_conv_data(filter, out, &bs_ptr, param->Groups());
fpga::SplitConvArgs conv_arg = {0};
fpga::fill_split_arg(&conv_arg, input, out, filter, activation_enable,
......@@ -59,16 +64,19 @@ bool ConvBNReluKernel<FPGA, float>::Init(FusionConvBNReluParam<FPGA> *param) {
param->Strides()[0], param->Strides()[1],
param->Paddings()[0], param->Paddings()[1], bs_ptr);
param->SetFpgaArgs(conv_arg);
}
delete new_scale;
delete new_bias;
return true;
}
template <>
void ConvBNReluKernel<FPGA, float>::Compute(
const FusionConvBNReluParam<FPGA> &param) {
if (param.Groups() == param.Output()->dims()[1]) {
fpga::ComputeDWConv(param.FpgaDwconvArgs());
} else {
fpga::ComputeFpgaConv(param.FpgaArgs());
}
}
} // namespace operators
......
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifdef CONV_TRANSPOSE_OP
#include "operators/kernel/conv_transpose_kernel.h"
#include "framework/operator.h"
#include "operators/op_param.h"
namespace paddle_mobile {
namespace operators {
template <>
bool ConvTransposeKernel<FPGA, float>::Init(ConvTransposeParam<FPGA> *param) {
// bool relu_enabled = false;
paddle_mobile::fpga::ActivationType activation_enable =
paddle_mobile::fpga::NONE;
int16_t leaky_relu_negative_slope = 0;
auto input = const_cast<Tensor *>(param->Input());
// const Tensor *bias = param->Bias();
// auto bias_ptr = bias->data<float>();
auto filter = const_cast<Tensor *>(param->Filter());
auto out = param->Output();
// PADDLE_MOBILE_ENFORCE(out->dims()[1] == bias->dims()[0],
// "Output channel should be equal to bias number");
int channel = out->dims()[1];
int sub_conv_n = param->Strides()[0];
auto bs_ptr = (float *)fpga::fpga_malloc(2 * channel * sub_conv_n * // NOLINT
sizeof(float)); // NOLINT
for (int i = 0; i < channel * sub_conv_n; i++) {
bs_ptr[i + sub_conv_n * channel] = 1;
bs_ptr[i] = 0; // bias_ptr[i % (channel)];
}
PADDLE_MOBILE_ENFORCE(param->Strides()[1] == param->Strides()[0],
"stride_width should be equal to stride_height ");
PADDLE_MOBILE_ENFORCE(filter->dims()[2] == filter->dims()[3],
"filter width should be equal to filter height ");
PADDLE_MOBILE_ENFORCE(((filter->dims()[2] % param->Strides()[0]) == 0),
"filter axis should be the multiple of stride axis ");
if (param->Groups() == channel) {
fpga::format_DWDeconv_data(filter, out, &bs_ptr, param->Groups(),
sub_conv_n);
fpga::DWDeconvArgs DWDeconv_arg = {0};
fpga::fill_DWDeconv_arg(&DWDeconv_arg, input, out, filter,
activation_enable, leaky_relu_negative_slope,
param->Strides()[0], param->Strides()[1],
param->Paddings()[0], param->Paddings()[1], bs_ptr);
param->SetFpgaArgs(DWDeconv_arg);
} else {
fpga::format_deconv_data(filter, out, &bs_ptr, param->Groups(), sub_conv_n);
fpga::DeconvArgs deconv_arg = {0};
fpga::fill_deconv_arg(&deconv_arg, input, out, filter, activation_enable,
leaky_relu_negative_slope, param->Groups(),
param->Strides()[0], param->Strides()[1],
param->Paddings()[0], param->Paddings()[1], bs_ptr);
param->SetFpgaArgs(deconv_arg);
}
return true;
}
template <>
void ConvTransposeKernel<FPGA, float>::Compute(
const ConvTransposeParam<FPGA> &param) {
if (param.Groups() == param.Output()->dims()[1]) {
fpga::ComputeDWDeconv(param.FpgaDWDconvArgs());
} else {
fpga::ComputeFpgaDeconv(param.FpgaArgs());
}
}
} // namespace operators
} // namespace paddle_mobile
#endif
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifdef FUSION_DECONVADDBN_OP
#include "operators/kernel/deconv_add_bn_kernel.h"
#include "framework/operator.h"
#include "operators/op_param.h"
namespace paddle_mobile {
namespace operators {
template <>
bool DeconvAddBNKernel<FPGA, float>::Init(FusionDeconvAddBNParam<FPGA> *param) {
// bool relu_enabled = true;
paddle_mobile::fpga::ActivationType activation_enable =
paddle_mobile::fpga::NONE;
int16_t leaky_relu_negative_slope = 0;
auto input = const_cast<Tensor *>(param->Input());
const Tensor *bias = param->InputBias();
auto bias_ptr = bias->data<float>();
auto filter = const_cast<Tensor *>(param->Filter());
auto out = param->Output();
PADDLE_MOBILE_ENFORCE(out->dims()[1] == bias->dims()[0],
"Output channel should be equal to bias number");
int channel = out->dims()[1];
int sub_conv_n = param->Strides()[0];
auto bs_ptr = (float *)fpga::fpga_malloc(2 * channel * sub_conv_n * // NOLINT
sizeof(float)); // NOLINT
for (int i = 0; i < channel * sub_conv_n; i++) {
bs_ptr[i + sub_conv_n * channel] = 1;
bs_ptr[i] = bias_ptr[i % (channel)];
}
PADDLE_MOBILE_ENFORCE(param->Strides()[1] == param->Strides()[0],
"stride_width should be equal to stride_height ");
PADDLE_MOBILE_ENFORCE(filter->dims()[2] == filter->dims()[3],
"filter width should be equal to filter height ");
PADDLE_MOBILE_ENFORCE(((filter->dims()[2] % param->Strides()[0]) == 0),
"filter axis should be the multiple of stride axis ");
if (param->Groups() == channel) {
fpga::format_DWDeconv_data(filter, out, &bs_ptr, param->Groups(),
sub_conv_n);
fpga::DWDeconvArgs DWDeconv_arg = {0};
fpga::fill_DWDeconv_arg(&DWDeconv_arg, input, out, filter,
activation_enable, leaky_relu_negative_slope,
param->Strides()[0], param->Strides()[1],
param->Paddings()[0], param->Paddings()[1], bs_ptr);
param->SetFpgaArgs(DWDeconv_arg);
} else {
fpga::format_deconv_data(filter, out, &bs_ptr, param->Groups(), sub_conv_n);
fpga::DeconvArgs deconv_arg = {0};
fpga::fill_deconv_arg(&deconv_arg, input, out, filter, activation_enable,
leaky_relu_negative_slope, param->Groups(),
param->Strides()[0], param->Strides()[1],
param->Paddings()[0], param->Paddings()[1], bs_ptr);
param->SetFpgaArgs(deconv_arg);
}
return true;
}
template <>
void DeconvAddBNKernel<FPGA, float>::Compute(
const FusionDeconvAddBNParam<FPGA> &param) {
// fpga::ComputeFpgaDeconv(param.FpgaArgs());
if (param.Groups() == param.Output()->dims()[1]) {
fpga::ComputeDWDeconv(param.FpgaDWDconvArgs());
} else {
fpga::ComputeFpgaDeconv(param.FpgaArgs());
}
}
} // namespace operators
} // namespace paddle_mobile
#endif
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifdef FUSION_DECONVADDBNRELU_OP
#include "operators/kernel/deconv_add_bn_relu_kernel.h"
#include "framework/operator.h"
#include "operators/op_param.h"
namespace paddle_mobile {
namespace operators {
template <>
bool DeconvAddBNReluKernel<FPGA, float>::Init(
FusionDeconvAddBNReluParam<FPGA> *param) {
// bool relu_enabled = true;
paddle_mobile::fpga::ActivationType activation_enable =
paddle_mobile::fpga::LEAKYRELU;
int16_t leaky_relu_negative_slope = 0;
auto input = const_cast<Tensor *>(param->Input());
const Tensor *bias = param->InputBias();
auto bias_ptr = bias->data<float>();
auto filter = const_cast<Tensor *>(param->Filter());
auto out = param->Output();
PADDLE_MOBILE_ENFORCE(out->dims()[1] == bias->dims()[0],
"Output channel should be equal to bias number");
int channel = out->dims()[1];
int sub_conv_n = param->Strides()[0];
auto bs_ptr = (float *)fpga::fpga_malloc(2 * channel * sub_conv_n * // NOLINT
sizeof(float)); // NOLINT
for (int i = 0; i < channel * sub_conv_n; i++) {
bs_ptr[i + sub_conv_n * channel] = 1;
bs_ptr[i] = bias_ptr[i % (channel)];
}
PADDLE_MOBILE_ENFORCE(param->Strides()[1] == param->Strides()[0],
"stride_width should be equal to stride_height ");
PADDLE_MOBILE_ENFORCE(filter->dims()[2] == filter->dims()[3],
"filter width should be equal to filter height ");
PADDLE_MOBILE_ENFORCE(((filter->dims()[2] % param->Strides()[0]) == 0),
"filter axis should be the multiple of stride axis ");
if (param->Groups() == channel) {
fpga::format_DWDeconv_data(filter, out, &bs_ptr, param->Groups(),
sub_conv_n);
fpga::DWDeconvArgs DWDeconv_arg = {0};
fpga::fill_DWDeconv_arg(&DWDeconv_arg, input, out, filter,
activation_enable, leaky_relu_negative_slope,
param->Strides()[0], param->Strides()[1],
param->Paddings()[0], param->Paddings()[1], bs_ptr);
param->SetFpgaArgs(DWDeconv_arg);
} else {
fpga::format_deconv_data(filter, out, &bs_ptr, param->Groups(), sub_conv_n);
fpga::DeconvArgs deconv_arg = {0};
fpga::fill_deconv_arg(&deconv_arg, input, out, filter, activation_enable,
leaky_relu_negative_slope, param->Groups(),
param->Strides()[0], param->Strides()[1],
param->Paddings()[0], param->Paddings()[1], bs_ptr);
param->SetFpgaArgs(deconv_arg);
}
return true;
}
template <>
void DeconvAddBNReluKernel<FPGA, float>::Compute(
const FusionDeconvAddBNReluParam<FPGA> &param) {
// fpga::ComputeFpgaDeconv(param.FpgaArgs());
if (param.Groups() == param.Output()->dims()[1]) {
fpga::ComputeDWDeconv(param.FpgaDWDconvArgs());
} else {
fpga::ComputeFpgaDeconv(param.FpgaArgs());
}
}
} // namespace operators
} // namespace paddle_mobile
#endif
......@@ -25,11 +25,6 @@ bool FeedKernel<FPGA, float>::Init(FeedParam<FPGA> *param) {
input->Resize(output->dims());
if (output->dims().size() != 4) {
auto input_ptr = input->mutable_data<float>();
size_t size = output->numel() * sizeof(float);
auto p = fpga::fpga_malloc(size);
memcpy(p, input_ptr, size);
output->reset_data_ptr(p);
return true;
}
fpga::format_fp16_ofm(output);
......@@ -41,7 +36,14 @@ void FeedKernel<FPGA, float>::Compute(const FeedParam<FPGA> &param) {
auto output = param.Out();
auto input = const_cast<LoDTensor *>(param.InputX());
if (input->dims().size() != 4) {
if (output->dims().size() != 4) {
size_t size = output->numel() * sizeof(float);
auto output_ptr = output->data<float>();
auto input_ptr = input->data<float>();
auto external_ptr = reinterpret_cast<float *>(input->external_data);
float *p_data = external_ptr == nullptr ? input_ptr : external_ptr;
memcpy(output_ptr, p_data, size);
input->external_data = nullptr;
return;
}
......
......@@ -49,17 +49,20 @@ bool FetchKernel<FPGA, float>::Init(FetchParam<FPGA> *param) {
template <>
void FetchKernel<FPGA, float>::Compute(const FetchParam<FPGA> &param) {
auto input = param.InputX();
auto input = const_cast<Tensor *>(param.InputX());
if (input->type() == typeid(float)) {
auto output = param.Out();
output->ShareDataWith(*input);
return;
}
fpga::PerformBypass(param.fpga_bypass_args);
fpga::BypassArgs args = param.fpga_bypass_args;
auto data = (input->mutable_data<half>());
args.image.address = static_cast<void *>(data);
fpga::PerformBypass(args);
fpga::fpga_invalidate(param.fpga_bypass_args.output.address,
param.fpga_bypass_args.image.channels * sizeof(float));
// TODO: DEalign: get rid of extra 0
// TODO(zhangyang): DEalign: get rid of extra 0
}
template class FetchKernel<FPGA, float>;
......
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "operators/kernel/pad2d_kernel.h"
namespace paddle_mobile {
namespace operators {
template <>
bool Pad2dKernel<FPGA, float>::Init(Pad2dParam<FPGA> *param) {
Tensor *output = param->Out();
fpga::format_fp16_ofm(output);
return true;
}
void pad2dFunc(const framework::Tensor *input, framework::Tensor *output) {
auto input_data = (input->data<half>());
auto output_data = (output->data<half>());
auto input_c = input->dims()[1];
auto input_h = input->dims()[2];
auto input_w = input->dims()[3];
auto output_c = output->dims()[1];
auto output_w = output->dims()[3];
auto copysize = input_c * input_w;
for (int h = 0; h < input_h; ++h) {
auto input_offset = h * input_c * input_w;
auto output_offset = h * paddle_mobile::fpga::align_to_x(
output_c * output_w, IMAGE_ALIGNMENT);
memcpy((output_data + output_offset), (input_data + input_offset),
copysize * sizeof(half));
}
}
template <>
void Pad2dKernel<FPGA, float>::Compute(const Pad2dParam<FPGA> &param) {
auto in_x = param.InputX();
auto out = param.Out();
fpga::fpga_invalidate((void *)in_x->data<half>(), // NOLINT
in_x->numel() * sizeof(half));
pad2dFunc(in_x, out);
(out->scale)[0] = (in_x->scale)[0];
(out->scale)[1] = (in_x->scale)[1];
DLOG << (out->scale)[0];
DLOG << (out->scale)[1];
size_t outputSize =
out->dims()[2] *
paddle_mobile::fpga::align_to_x((out->dims()[1]) * (out->dims()[3]),
IMAGE_ALIGNMENT) *
sizeof(half);
fpga::fpga_flush(out->data<half>(), outputSize);
}
} // namespace operators
} // namespace paddle_mobile
......@@ -22,15 +22,29 @@ namespace operators {
template <>
bool PoolKernel<FPGA, float>::Init(PoolParam<FPGA> *param) {
auto *input = const_cast<Tensor *>(param->Input());
auto input_ptr = input->data<half>();
Tensor *output = param->Output();
fpga::format_fp16_ofm(output);
auto output_ptr = output->mutable_data<half>();
auto *output = param->Output();
vector<int> ksize = param->Ksize();
vector<int> strides = param->Strides();
vector<int> paddings = param->Paddings();
std::string pooling_type = param->PoolingType();
if (input->type() == typeid(float)) {
int channels = input->dims()[1];
int height = input->dims()[2];
int width = input->dims()[3];
int num = input->dims()[0];
int out_width = (width + 2 * paddings[1] - ksize[1]) / strides[1] + 1;
int out_height = (height + 2 * paddings[0] - ksize[0]) / strides[0] + 1;
framework::DDim dim =
framework::make_ddim({num, channels, out_height, out_width});
output->mutable_data<float>(dim);
return true;
}
auto input_ptr = input->data<half>();
fpga::format_fp16_ofm(output);
auto output_ptr = output->mutable_data<half>();
fpga::PoolingArgs poolArgs = {0};
poolArgs.mode = pooling_type == "max" ? 0 : 1; // max:0, avg:1
poolArgs.kernel_reciprocal =
......@@ -54,6 +68,31 @@ bool PoolKernel<FPGA, float>::Init(PoolParam<FPGA> *param) {
template <>
void PoolKernel<FPGA, float>::Compute(const PoolParam<FPGA> &param) {
auto *input = const_cast<Tensor *>(param.Input());
if (input->type() == typeid(float)) {
auto *output = param.Output();
auto in = input->data<float>();
auto len = output->numel();
auto out = output->mutable_data<float>();
int N = input->dims()[0], C = input->dims()[1], H = input->dims()[2],
W = input->dims()[3];
int HW = H * W, CHW = C * H * W, WC = W * C;
for (int n = 0; n < N; n++) {
for (int c = 0; c < C; c++) {
out[n * C + c] = 0;
for (int h = 0; h < H; h++) {
for (int w = 0; w < W; w++) {
out[n * C + c] += in[n * CHW + h * WC + w * C +
c]; // in[n * CHW + c * HW + h * W + w]; //
}
}
out[n * C + c] /= HW;
}
}
return;
}
fpga::ComputeFpgaPool(param.FpgaArgs());
}
} // namespace operators
......
......@@ -67,6 +67,30 @@ bool ProposalKernel<FPGA, float>::Init(ProposalParam<FPGA> *param) {
return true;
}
template <typename T>
void CPUGather(const Tensor &src, const Tensor &index, Tensor *output) {
PADDLE_MOBILE_ENFORCE(index.dims().size() == 1 ||
(index.dims().size() == 2 && index.dims()[1] == 1),
"Dim not correct");
int64_t index_size = index.dims()[0];
auto src_dims = src.dims();
const T *p_src = src.data<T>();
const int *p_index = index.data<int>();
T *p_output = output->data<T>();
// slice size
int slice_size = 1;
for (int i = 1; i < src_dims.size(); ++i) slice_size *= src_dims[i];
const size_t slice_bytes = slice_size * sizeof(T);
for (int64_t i = 0; i < index_size; ++i) {
int index_ = p_index[i];
memcpy(p_output + i * slice_size, p_src + index_ * slice_size, slice_bytes);
}
}
void AppendProposals(Tensor *dst, int64_t offset, const Tensor &src) {
auto *out_data = dst->data<void>();
......@@ -103,11 +127,11 @@ static inline void BoxCoder(Tensor *all_anchors, Tensor *bbox_deltas,
T bbox_center_x = 0, bbox_center_y = 0;
T bbox_width = 0, bbox_height = 0;
/*
if (variances) {
bbox_center_x =
variances_data[i * len] * bbox_deltas_data[i * len] * anchor_width +
anchor_center_x;
bbox_center_y = variances_data[i * len + 1] *
variances_data[i * len] * bbox_deltas_data[i * len] * anchor_width
+ anchor_center_x; bbox_center_y = variances_data[i * len + 1] *
bbox_deltas_data[i * len + 1] * anchor_height +
anchor_center_y;
bbox_width = std::exp(std::min<T>(variances_data[i * len + 2] *
......@@ -119,22 +143,33 @@ static inline void BoxCoder(Tensor *all_anchors, Tensor *bbox_deltas,
kBBoxClipDefault)) *
anchor_height;
} else {
bbox_center_x =
bbox_deltas_data[i * len] * anchor_width + anchor_center_x;
*/
bbox_center_x = bbox_deltas_data[i * len] * anchor_width + anchor_center_x;
bbox_center_y =
bbox_deltas_data[i * len + 1] * anchor_height + anchor_center_y;
/*
bbox_width = std::exp(std::min<T>(bbox_deltas_data[i * len + 2],
kBBoxClipDefault)) *
anchor_width;
bbox_height = std::exp(std::min<T>(bbox_deltas_data[i * len + 3],
kBBoxClipDefault)) *
anchor_height;
}
*/
bbox_width = std::exp(bbox_deltas_data[i * len + 2]) * anchor_width;
bbox_height = std::exp(bbox_deltas_data[i * len + 3]) * anchor_height;
// }
proposals_data[i * len] = bbox_center_x - bbox_width / 2;
proposals_data[i * len + 1] = bbox_center_y - bbox_height / 2;
/*
//wong
proposals_data[i * len + 2] = bbox_center_x + bbox_width / 2 - 1;
proposals_data[i * len + 3] = bbox_center_y + bbox_height / 2 - 1;
//wong
*/
proposals_data[i * len + 2] = bbox_center_x + bbox_width / 2;
proposals_data[i * len + 3] = bbox_center_y + bbox_height / 2;
}
// return proposals;
}
......@@ -328,9 +363,12 @@ std::pair<Tensor, Tensor> ProposalForOneImage(
anchor_sel.mutable_data<T>({index_t.numel(), 4});
var_sel.mutable_data<T>({index_t.numel(), 4});
CPUGather<T>(scores_slice, index_t, &scores_sel);
CPUGather<T>(bbox_deltas_slice, index_t, &bbox_sel);
CPUGather<T>(anchors, index_t, &anchor_sel);
Tensor proposals;
proposals.mutable_data<T>({index_t.numel(), 4});
BoxCoder<T>(&anchor_sel, &bbox_sel, &var_sel, &proposals);
BoxCoder<T>(&anchor_sel, &bbox_sel, nullptr, &proposals);
ClipTiledBoxes<T>(im_info_slice, &proposals);
......@@ -341,6 +379,8 @@ std::pair<Tensor, Tensor> ProposalForOneImage(
bbox_sel.mutable_data<T>({keep.numel(), 4});
scores_filter.mutable_data<T>({keep.numel(), 1});
CPUGather<T>(proposals, keep, &bbox_sel);
CPUGather<T>(scores_sel, keep, &scores_filter);
if (nms_thresh <= 0) {
return std::make_pair(bbox_sel, scores_filter);
}
......@@ -351,14 +391,86 @@ std::pair<Tensor, Tensor> ProposalForOneImage(
keep_nms.Resize({post_nms_top_n});
}
proposals.mutable_data<T>({keep_nms.numel(), 4});
scores_sel.mutable_data<T>({keep_nms.numel(), 1});
// proposals.mutable_data<T>({keep_nms.numel(), 4});//original
// scores_sel.mutable_data<T>({keep_nms.numel(), 1});//original
proposals.mutable_data<T>({post_nms_top_n, 4}); // wong
scores_sel.mutable_data<T>({post_nms_top_n, 1}); // wong
CPUGather<T>(bbox_sel, keep_nms, &proposals);
CPUGather<T>(scores_filter, keep_nms, &scores_sel);
return std::make_pair(proposals, scores_sel);
}
template <>
void ProposalKernel<FPGA, float>::Compute(const ProposalParam<FPGA> &param) {
auto input_score = param.scores_;
auto input_score_data = input_score->data<half>();
auto input_score_data_tmp = input_score->data<half>();
uint32_t score_n, score_height, score_width, score_channels;
auto input_bbox = param.bbox_deltas_;
auto input_bbox_data = input_bbox->data<half>();
auto input_bbox_data_tmp = input_bbox->data<half>();
uint32_t bbox_n, bbox_height, bbox_width, bbox_channels;
score_n = (uint32_t)(input_score->dims()[0]);
score_channels = (uint32_t)(input_score->dims()[1]);
score_height = (uint32_t)(input_score->dims()[2]);
score_width = (uint32_t)(input_score->dims()[3]);
bbox_n = (uint32_t)(input_bbox->dims()[0]);
bbox_channels = (uint32_t)(input_bbox->dims()[1]);
bbox_height = (uint32_t)(input_bbox->dims()[2]);
bbox_width = (uint32_t)(input_bbox->dims()[3]);
// score_tmp->init(typeid(half));
std::shared_ptr<Tensor> score_tmp = std::make_shared<Tensor>();
score_tmp->Resize(param.scores_->dims());
score_tmp->mutable_data<half>();
std::shared_ptr<Tensor> bbox_tmp = std::make_shared<Tensor>();
bbox_tmp->Resize(param.bbox_deltas_->dims());
bbox_tmp->mutable_data<half>();
auto score_tmp_data = score_tmp->data<half>();
auto bbox_tmp_data = bbox_tmp->data<half>();
int64_t amount_per_side = score_width * score_height;
int idx = 0;
fpga::fpga_invalidate(
input_score_data_tmp,
score_height * score_width * score_channels * sizeof(half));
for (int h = 0; h < score_height; h++) {
for (int w = 0; w < score_width; w++) {
for (int c = 0; c < score_channels; c++) {
idx++;
// DLOG << "wong input_score: "<<
// paddle_mobile::fpga::fp16_2_fp32(input_score_data[idx]);
*(score_tmp_data + c * amount_per_side + score_width * h + w) =
(*(input_score_data_tmp++));
}
}
}
amount_per_side = bbox_width * bbox_height;
fpga::fpga_invalidate(input_bbox_data_tmp, bbox_height * bbox_width *
bbox_channels * sizeof(half));
for (int h = 0; h < bbox_height; h++) {
for (int w = 0; w < bbox_width; w++) {
for (int c = 0; c < bbox_channels; c++) {
idx++;
// DLOG << "wong input_score: "<<
// paddle_mobile::fpga::fp16_2_fp32(input_score_data[idx]);
*(bbox_tmp_data + c * amount_per_side + bbox_width * h + w) =
(*(input_bbox_data_tmp++));
}
}
}
struct paddle_mobile::fpga::BypassArgs temp_score_arg;
struct paddle_mobile::fpga::BypassArgs temp_bbox_arg;
temp_score_arg = param.score_arg;
temp_score_arg.image.address = score_tmp->data<half>();
temp_bbox_arg = param.bbox_arg;
temp_bbox_arg.image.address = bbox_tmp->data<half>();
auto score_tensor = param.float_score.get();
fpga::PerformBypass(param.score_arg);
fpga::fpga_invalidate(score_tensor->data<float>(),
......@@ -396,23 +508,23 @@ void ProposalKernel<FPGA, float>::Compute(const ProposalParam<FPGA> &param) {
int64_t w_bbox = bbox_dim[3];
//
Tensor bbox_deltas_swap, scores_swap;
bbox_deltas_swap.mutable_data<float>({num, h_bbox, w_bbox, c_bbox});
scores_swap.mutable_data<float>({num, h_score, w_score, c_score});
rpn_rois->mutable_data<float>({bbox_deltas->numel(), 4});
rpn_roi_probs->mutable_data<float>({scores->numel(), 1});
framework::LoD lod;
lod.resize(1);
auto &lod0 = lod[0];
lod0.push_back(0);
anchors.Resize({anchors.numel() / 4, 4});
anchors.Resize({anchors.numel(), 4});
variances.Resize({variances.numel(), 4});
int64_t num_proposals = 0;
for (int64_t i = 0; i < num; ++i) {
Tensor im_info_slice = im_info->Slice(i, i + 1);
Tensor bbox_deltas_slice = bbox_deltas_swap.Slice(i, i + 1);
Tensor scores_slice = scores_swap.Slice(i, i + 1);
Tensor bbox_deltas_slice = (*bbox_tensor).Slice(i, i + 1);
Tensor scores_slice = (*score_tensor).Slice(i, i + 1);
bbox_deltas_slice.Resize({h_bbox * w_bbox * c_bbox / 4, 4});
bbox_deltas_slice.Resize({h_bbox * w_bbox * c_bbox, 4});
scores_slice.Resize({h_score * w_score * c_score, 1});
std::pair<Tensor, Tensor> tensor_pair = ProposalForOneImage<float>(
......
......@@ -18,6 +18,8 @@ limitations under the License. */
#include <vector>
#include "operators/kernel/detection_kernel.h"
#include "fpga/V1/api.h"
#include "fpga/V1/image.h"
namespace paddle_mobile {
namespace operators {
......@@ -29,8 +31,7 @@ bool PSRoiPoolKernel<FPGA, float>::Init(PSRoiPoolParam<FPGA>* param) {
param->float_input = std::make_shared<Tensor>();
param->float_input->mutable_data<float>(param->input_x_->dims());
param->float_output = std::make_shared<Tensor>();
param->float_output->mutable_data<float>(param->output_->dims());
// param->float_output = std::make_shared<Tensor>();
auto input = param->input_x_;
fpga::BypassArgs args = {fpga::DATA_TYPE_FP16};
......@@ -46,22 +47,90 @@ bool PSRoiPoolKernel<FPGA, float>::Init(PSRoiPoolParam<FPGA>* param) {
args.output.scale_address = param->float_input->scale;
param->input_arg = args;
fpga::format_fp16_ofm(param->output_);
input = param->float_output.get();
args.input_data_type = fpga::DATA_TYPE_FP32;
auto* rois = param->input_rois_;
int rois_num = rois->dims()[0];
framework::DDim dims_out_new = framework::make_ddim(
{rois_num, param->output_->dims()[1], param->output_->dims()[2],
param->output_->dims()[3]});
param->output_->Resize(dims_out_new);
// fpga::format_fp16_ofm(param->output_);
param->output_->mutable_data<float>(dims_out_new);
// auto output = param->float_output.get();
// param->output_ = output;
/* args.input_data_type = fpga::DATA_TYPE_FP32;
args.output_data_type = fpga::DATA_TYPE_FP16;
args.image.address = input->data<float>();
args.image.height = (uint32_t)input->dims()[2];
args.image.width = (uint32_t)input->dims()[3];
args.image.channels = (uint32_t)input->dims()[1];
args.image.address = output->data<float>();
args.image.height = (uint32_t)output->dims()[2];
args.image.width = (uint32_t)output->dims()[3];
args.image.channels = (uint32_t)output->dims()[1] ;
args.output.address = param->output_->mutable_data<half>();
args.output.scale_address = param->output_->scale;
param->input_arg = args;
param->output_arg = args;*/
return true;
}
template <typename Dtype>
void PSROIPooling(const Dtype* bottom_data, const Dtype spatial_scale,
const int channels, const int height, const int width,
const int pooled_height, const int pooled_width,
const Dtype* bottom_rois, const int output_dim,
const int group_size, Dtype* top_data,
// int* mapping_channel,
int index, int* rois_batch_id) {
// The output is in order (n, ctop, ph, pw)
// static int cnt = 0;
int pw = index % pooled_width;
int ph = (index / pooled_width) % pooled_height;
int ctop = (index / pooled_width / pooled_height) % output_dim;
int n = index / pooled_width / pooled_height / output_dim;
// [start, end) interval for spatial sampling
bottom_rois += n * 4;
int roi_batch_ind = rois_batch_id[n]; // bottom_rois[0];
Dtype roi_start_w = static_cast<Dtype>(round(bottom_rois[0])) * spatial_scale;
Dtype roi_start_h = static_cast<Dtype>(round(bottom_rois[1])) * spatial_scale;
Dtype roi_end_w =
static_cast<Dtype>(round(bottom_rois[2]) + 1.) * spatial_scale;
Dtype roi_end_h =
static_cast<Dtype>(round(bottom_rois[3]) + 1.) * spatial_scale;
// Force too small ROIs to be 1x1
Dtype roi_width = std::max(roi_end_w - roi_start_w, 0.1f); // avoid 0
Dtype roi_height = std::max(roi_end_h - roi_start_h, 0.1f);
// Compute w and h at bottom
Dtype bin_size_h = roi_height / static_cast<Dtype>(pooled_height);
Dtype bin_size_w = roi_width / static_cast<Dtype>(pooled_width);
int hstart = floor(static_cast<Dtype>(ph) * bin_size_h + roi_start_h);
int wstart = floor(static_cast<Dtype>(pw) * bin_size_w + roi_start_w);
int hend = ceil(static_cast<Dtype>(ph + 1) * bin_size_h + roi_start_h);
int wend = ceil(static_cast<Dtype>(pw + 1) * bin_size_w + roi_start_w);
// Add roi offsets and clip to input boundaries
hstart = std::min(std::max(hstart, 0), height);
hend = std::min(std::max(hend, 0), height);
wstart = std::min(std::max(wstart, 0), width);
wend = std::min(std::max(wend, 0), width);
bool is_empty = (hend <= hstart) || (wend <= wstart);
int gw = pw;
int gh = ph;
int c = (ctop * group_size + gh) * group_size + gw;
bottom_data += (roi_batch_ind * channels + c) * height * width;
Dtype out_sum = 0;
for (int h = hstart; h < hend; ++h) {
for (int w = wstart; w < wend; ++w) {
int bottom_index = h * width + w;
out_sum += bottom_data[bottom_index];
}
}
Dtype bin_area = (hend - hstart) * (wend - wstart);
top_data[index] = is_empty ? 0. : out_sum / bin_area;
}
template <>
void PSRoiPoolKernel<FPGA, float>::Compute(const PSRoiPoolParam<FPGA>& param) {
auto input_tensor = param.float_input.get();
......@@ -71,7 +140,7 @@ void PSRoiPoolKernel<FPGA, float>::Compute(const PSRoiPoolParam<FPGA>& param) {
auto* in = input_tensor;
auto* rois = param.input_rois_;
auto* out = param.float_output.get();
auto* out = param.output_; // param.float_output.get();
auto pooled_height = param.pooled_height_;
auto pooled_width = param.pooled_width_;
......@@ -85,18 +154,17 @@ void PSRoiPoolKernel<FPGA, float>::Compute(const PSRoiPoolParam<FPGA>& param) {
int width = in_dims[3];
int rois_num = rois->dims()[0];
// TODO auto in_stride = framework::stride(in_dims);
// TODO auto out_stride = framework::stride(out->dims());
auto in_stride =
framework::stride({batch_size, height, width, input_channels});
auto out_stride = framework::stride(
{out->dims()[0], out->dims()[2], out->dims()[3], out->dims()[1]});
auto data_nhwc = in->mutable_data<float>();
fpga::image::convert_to_chw(&data_nhwc, input_channels, height, width);
framework::DDim dims_out_new = framework::make_ddim(
{rois_num, (param.output_)->dims()[1], (((param.output_)->dims()[2])),
(param.output_)->dims()[3]});
(param.output_)->Resize(dims_out_new);
const float* input_data = in->data<float>();
const float* input_data = data_nhwc; // in->data<float>();
framework::Tensor rois_batch_id_list;
rois_batch_id_list.Resize({rois_num});
auto rois_batch_id_data = rois_batch_id_list.mutable_data<int>();
return;
PADDLE_MOBILE_ENFORCE(rois->NumLevels() > 0, "ROIS should not be empty");
......@@ -124,78 +192,18 @@ void PSRoiPoolKernel<FPGA, float>::Compute(const PSRoiPoolParam<FPGA>& param) {
auto input_rois = rois->data<float>();
// calculate psroipooling, parallel processing can be implemented per ROI
for (int n = 0; n < rois_num; ++n) {
// set roi batch id
int roi_batch_id = rois_batch_id_data[n];
// [start, end) interval for spatial sampling
auto offset_input_rois = input_rois + n * 4;
auto roi_start_w =
static_cast<float>(round(offset_input_rois[0])) * spatial_scale;
auto roi_start_h =
static_cast<float>(round(offset_input_rois[1])) * spatial_scale;
auto roi_end_w =
static_cast<float>(round(offset_input_rois[2]) + 1.) * spatial_scale;
auto roi_end_h =
static_cast<float>(round(offset_input_rois[3]) + 1.) * spatial_scale;
// Force too small rois to be 1 x 1
auto roi_height = std::max(roi_end_h - roi_start_h, 0.1f); // avoid 0
auto roi_width = std::max(roi_end_w - roi_start_w, 0.1f);
// Compute bin size w and h at input feature map
auto bin_size_h = roi_height / static_cast<float>(pooled_height);
auto bin_size_w = roi_width / static_cast<float>(pooled_width);
DLOG << 3;
// calculate each pixel of the output feature map.
int out_roi_offset = n * out_stride[0];
for (int c = 0; c < output_channels; ++c) {
// per category
// int out_plane_offset = out_roi_offset + c * out_stride[1];
int out_plane_offset = out_roi_offset + c;
for (int ph = 0; ph < pooled_height; ++ph) {
// TODO int out_row_offset = out_plane_offset + ph *
// out_stride[2];
int out_row_offset = out_plane_offset + ph * out_stride[1];
for (int pw = 0; pw < pooled_width; ++pw) {
// calculate w and h at input feature map
int hstart = floor(static_cast<float>(ph) * bin_size_h + roi_start_h);
int wstart = floor(static_cast<float>(pw) * bin_size_w + roi_start_w);
int hend =
ceil(static_cast<float>(ph + 1) * bin_size_h + roi_start_h);
int wend =
ceil(static_cast<float>(pw + 1) * bin_size_w + roi_start_w);
// Add roi offsets and clip to input boundaries
hstart = std::min(std::max(hstart, 0), height);
wstart = std::min(std::max(wstart, 0), width);
hend = std::min(std::max(hend, 0), height);
wend = std::min(std::max(wend, 0), width);
// TODO int output_index = out_row_offset + pw;
int output_index = out_row_offset + pw * output_channels;
int input_channel = (c * pooled_height + ph) * pooled_width + pw;
// TODO int input_plane_offset =
// TODO roi_batch_id * in_stride[0] + input_channel *
// in_stride[1];
int input_plane_offset = roi_batch_id * in_stride[0] + input_channel;
auto offset_input_data = input_data + input_plane_offset;
float out_sum = 0.;
bool is_empty = (hend <= hstart) || (wend <= wstart);
for (int ih = hstart; ih < hend; ++ih) {
for (int iw = wstart; iw < wend; ++iw) {
int input_index = ih * in_stride[1] + iw * input_channel;
out_sum += offset_input_data[input_index];
}
}
float bin_area = (hend - hstart) * (wend - wstart);
output_data[output_index] = is_empty ? 0. : out_sum / bin_area;
}
}
}
int index = pooled_height * pooled_width * output_channels * rois_num;
for (int idx = 0; idx < index; idx++) {
PSROIPooling<float>(input_data, spatial_scale, input_channels, height,
width, pooled_height, pooled_width, input_rois,
output_channels, pooled_height, output_data, idx,
rois_batch_id_data);
}
fpga::format_image(out);
fpga::PerformBypass(param.output_arg);
//
fpga::image::convert_to_hwc(&output_data, output_channels, pooled_height,
pooled_width, rois_num);
out->reset_data_ptr(output_data);
}
} // namespace operators
......
......@@ -47,21 +47,11 @@ bool Reshape2Kernel<FPGA, float>::Init(Reshape2Param<FPGA> *param) {
void reshape(LoDTensor *input, LoDTensor *output) {
// Subscript r means after reshape
// TODO zhangyang verify this function
float *input_ptr_f, *output_ptr_f;
half *input_ptr_h, *output_ptr_h;
bool is_float = false;
if (input->type() == typeid(float)) {
input_ptr_f = input->data<float>();
output_ptr_f = output->data<float>();
is_float = true;
} else {
input_ptr_h = input->data<half>();
output_ptr_h = output->data<half>();
}
auto input_ptr = input->data<half>();
auto output_ptr = output->data<half>();
output->scale[0] = input->scale[0];
output->scale[1] = input->scale[1];
auto C = static_cast<int>(input->dims()[1]);
auto H = static_cast<int>(input->dims()[2]);
......@@ -77,6 +67,8 @@ void reshape(LoDTensor *input, LoDTensor *output) {
auto WCr_align = fpga::align_to_x(WCr, IMAGE_ALIGNMENT);
auto HWr = Hr * Wr;
fpga::fpga_invalidate(input_ptr, H * WC_align * sizeof(half));
int offset_align = 0;
int offset_r = 0, offset_align_r = 0;
int cr = 0, hr = 0, wr = 0;
......@@ -87,21 +79,17 @@ void reshape(LoDTensor *input, LoDTensor *output) {
int offset1 = w * C + offset0;
for (int c = 0; c < C; c++) {
offset_align = offset1 + c;
offset_r = c * HW + h * W + c;
offset_r = c * HW + h * W + w;
cr = offset_r / HWr;
hr = offset_r % HWr / Wr;
wr = offset_r % Wr;
offset_align_r = hr * WCr_align + wr * Cr + cr;
// DLOG << "hwc"<< h<< " " << w << " " << c;
// DLOG << "hrwrcr" << hr<< " " << wr << " " << cr;
if (is_float) {
output_ptr_f[offset_align_r] = input_ptr_f[offset_align];
} else {
output_ptr_h[offset_align_r] = input_ptr_h[offset_align];
}
output_ptr[offset_align_r] = input_ptr[offset_align];
}
}
}
fpga::fpga_flush(output_ptr, Hr * WCr_align * sizeof(half));
}
template <>
......@@ -123,6 +111,9 @@ void Reshape2Kernel<FPGA, float>::Compute(const Reshape2Param<FPGA> &param) {
output->Resize(framework::make_ddim(shape));
if (output->dims() == input->dims()) {
DLOG << "No need to reshape";
output->ShareDataWith(*input);
framework::LoD lod = input->lod();
output->set_lod(lod);
return;
}
......
......@@ -33,13 +33,18 @@ bool SliceKernel<FPGA, float>::Init(SliceParam<FPGA>* param) {
template <>
void SliceKernel<FPGA, float>::Compute(const SliceParam<FPGA>& param) {
// Only support slicing in channel dimension
// Only support half data
// W must be aligned to 16
auto input = param.input_;
DLOG << input;
auto output = param.output_;
int HW = input->dims()[2] * input->dims()[3];
int channel = input->dims()[1];
auto input_ptr = input->data<half>();
auto output_ptr = param.output_->data<half>();
auto output_ptr = output->data<half>();
output->scale[0] = input->scale[0];
output->scale[1] = input->scale[1];
int start = param.starts_[0], end = param.ends_[0];
start = start < 0 ? start + channel : start;
......@@ -47,9 +52,10 @@ void SliceKernel<FPGA, float>::Compute(const SliceParam<FPGA>& param) {
start = start > channel ? channel : start;
end = end > channel ? channel : end;
int len = end - start;
size_t size = len * sizeof(half);
for (int i = 0; i < HW; i++) {
memcpy(output_ptr + len * i, input_ptr + i * channel + start, len);
memcpy(output_ptr + len * i, input_ptr + i * channel + start, size);
}
}
} // namespace operators
......
......@@ -23,14 +23,21 @@ namespace operators {
template <>
bool SoftmaxKernel<FPGA, float>::Init(SoftmaxParam<FPGA> *param) {
auto input = const_cast<LoDTensor *>(param->InputX());
auto input_ptr = input->data<half>();
auto dims = framework::vectorize(input->dims());
half *input_ptr;
auto out = param->Out();
if (input->type() == typeid(float)) {
out->Resize(framework::make_ddim(dims));
out->mutable_data<float>(framework::make_ddim(dims));
} else {
input_ptr = input->data<half>();
}
auto float_input = new Tensor;
PADDLE_MOBILE_ENFORCE(input->dims().size() == 4,
"Softmax should have 4-order input");
auto dims = framework::vectorize(input->dims());
auto channel = dims[3];
if (channel == 1) { // This input is generated by FC op, dims = [N C 1 1]
PADDLE_MOBILE_ENFORCE(dims[2] == 1, "Softmax input must come from FC op");
......@@ -41,9 +48,12 @@ bool SoftmaxKernel<FPGA, float>::Init(SoftmaxParam<FPGA> *param) {
float_input->Resize(framework::make_ddim(dims));
if (channel != 2) { // Use CPU
out->Resize(framework::make_ddim(dims));
out->mutable_data<float>(framework::make_ddim(dims));
float_input->init(typeid(float));
fpga::format_fp32_ofm(float_input);
fpga::format_fp32_ofm(out);
float_input->mutable_data<float>(framework::make_ddim(dims));
// fpga::format_fp32_ofm(float_input);
// fpga::format_fp32_ofm(out);
fpga::BypassArgs args = {fpga::DATA_TYPE_FP16};
args.input_layout_type = fpga::LAYOUT_HWC;
......@@ -51,7 +61,7 @@ bool SoftmaxKernel<FPGA, float>::Init(SoftmaxParam<FPGA> *param) {
args.input_data_type = fpga::DATA_TYPE_FP16;
args.output_data_type = fpga::DATA_TYPE_FP32;
args.image.address = input_ptr;
args.image.height = (uint32_t)dims[1];
args.image.height = (uint32_t)dims[1] * dims[0];
args.image.width = (uint32_t)dims[2];
args.image.channels = (uint32_t)dims[3];
args.output.address = float_input->data<float>();
......@@ -80,14 +90,23 @@ bool SoftmaxKernel<FPGA, float>::Init(SoftmaxParam<FPGA> *param) {
template <>
void SoftmaxKernel<FPGA, float>::Compute(const SoftmaxParam<FPGA> &param) {
auto *in_x = (param.InputX());
if (in_x->type() == typeid(half)) {
fpga::PerformBypass(param.FpgaArgs());
if (param.FpgaArgs().output.activation.activation_type != fpga::SOFTMAX) {
Tensor *out = param.Out();
Tensor *in_x2 = param.FloatInput();
fpga::fpga_invalidate(in_x2->data<float>(),
in_x2->numel() * sizeof(float));
math::SoftmaxFuntor<CPU, float>()(in_x2, out);
fpga::fpga_flush(out->data<float>(), out->memory_size());
}
} else {
if (param.FpgaArgs().output.activation.activation_type != fpga::SOFTMAX) {
Tensor *out = param.Out();
Tensor *in_x = param.FloatInput();
fpga::fpga_invalidate(in_x->data<float>(), in_x->numel() * sizeof(float));
math::SoftmaxFuntor<CPU, float>()(in_x, out);
fpga::fpga_flush(out->data<float>(), out->memory_size());
}
}
}
......
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include "framework/operator.h"
#include "operators/op_param.h"
namespace paddle_mobile {
namespace operators {
template <typename DeviceType, typename T>
class Pad2dKernel
: public framework::OpKernelBase<DeviceType, Pad2dParam<DeviceType>> {
public:
void Compute(const Pad2dParam<DeviceType> &param);
bool Init(Pad2dParam<DeviceType> *param);
};
} // namespace operators
} // namespace paddle_mobile
......@@ -1221,6 +1221,7 @@ class FetchParam : public OpParam {
RType *input_x_;
Tensor *out_;
#ifdef PADDLE_MOBILE_FPGA
public:
fpga::BypassArgs fpga_bypass_args;
......@@ -2415,6 +2416,120 @@ class FusionDeconvAddParam : public ConvTransposeParam<Dtype> {
template <typename Dtype>
using FusionDeconvAddReluParam = FusionDeconvAddParam<Dtype>;
#endif
#ifdef FUSION_DECONVADDBN_OP
template <typename Dtype>
class FusionDeconvAddBNParam : public ConvTransposeParam<Dtype> {
typedef typename DtypeTensorTrait<Dtype>::gtype GType;
typedef typename DtypeTensorTrait<Dtype>::rtype RType;
public:
FusionDeconvAddBNParam(const VariableNameMap &inputs,
const VariableNameMap &outputs,
const AttributeMap &attrs, const Scope &scope)
: ConvTransposeParam<Dtype>(inputs, outputs, attrs, scope) {
output_ = OpParam::OutFrom<GType>(outputs, scope);
input_bias_ = OpParam::InputBiasFrom<GType>(inputs, scope);
input_mean_ = OpParam::InputMeanFrom<GType>(inputs, scope);
input_scale_ = OpParam::InputScaleFrom<GType>(inputs, scope);
input_variance_ = OpParam::InputVarianceFrom<GType>(inputs, scope);
epsilon_ = OpParam::GetAttr<float>("epsilon", attrs);
momentum_ = OpParam::GetAttr<float>("momentum", attrs);
// is_test_ = OpParam::GetAttr<bool>("is_test", attrs);
}
RType *Output() const { return output_; }
const RType *InputBias() const { return input_bias_; }
const RType *InputMean() const { return input_mean_; }
const RType *InputScale() const { return input_scale_; }
const RType *InputVariance() const { return input_variance_; }
const float &Epsilon() const { return epsilon_; }
const float &Momentum() const { return momentum_; }
const bool &IsTest() const { return is_test_; }
void SetNewScale(RType *new_scale) { new_scale_ = new_scale; }
void SetNewBias(RType *new_bias) { new_bias_ = new_bias; }
const RType *NewScale() const { return new_scale_; }
const RType *NewBias() const { return new_bias_; }
protected:
RType *output_;
RType *input_bias_;
RType *input_mean_;
RType *input_scale_;
RType *input_variance_;
float epsilon_;
float momentum_;
bool is_test_;
RType *new_bias_;
RType *new_scale_;
};
#endif
#ifdef FUSION_DECONVADDBNRELU_OP
template <typename Dtype>
class FusionDeconvAddBNReluParam : public ConvTransposeParam<Dtype> {
typedef typename DtypeTensorTrait<Dtype>::gtype GType;
typedef typename DtypeTensorTrait<Dtype>::rtype RType;
public:
FusionDeconvAddBNReluParam(const VariableNameMap &inputs,
const VariableNameMap &outputs,
const AttributeMap &attrs, const Scope &scope)
: ConvTransposeParam<Dtype>(inputs, outputs, attrs, scope) {
output_ = OpParam::OutFrom<GType>(outputs, scope);
input_bias_ = OpParam::InputBiasFrom<GType>(inputs, scope);
input_mean_ = OpParam::InputMeanFrom<GType>(inputs, scope);
input_scale_ = OpParam::InputScaleFrom<GType>(inputs, scope);
input_variance_ = OpParam::InputVarianceFrom<GType>(inputs, scope);
epsilon_ = OpParam::GetAttr<float>("epsilon", attrs);
momentum_ = OpParam::GetAttr<float>("momentum", attrs);
// is_test_ = OpParam::GetAttr<bool>("is_test", attrs);
}
RType *Output() const { return output_; }
const RType *InputBias() const { return input_bias_; }
const RType *InputMean() const { return input_mean_; }
const RType *InputScale() const { return input_scale_; }
const RType *InputVariance() const { return input_variance_; }
const float &Epsilon() const { return epsilon_; }
const float &Momentum() const { return momentum_; }
const bool &IsTest() const { return is_test_; }
void SetNewScale(RType *new_scale) { new_scale_ = new_scale; }
void SetNewBias(RType *new_bias) { new_bias_ = new_bias; }
const RType *NewScale() const { return new_scale_; }
const RType *NewBias() const { return new_bias_; }
protected:
RType *output_;
RType *input_bias_;
RType *input_mean_;
RType *input_scale_;
RType *input_variance_;
float epsilon_;
float momentum_;
bool is_test_;
RType *new_bias_;
RType *new_scale_;
};
#endif
#ifdef FUSION_DECONVRELU_OP
template <typename Dtype>
......@@ -3114,6 +3229,26 @@ class IncrementParam : public OpParam {
int step_;
};
#endif // INCREMENT_OP
#ifdef PAD2D_OP
template <typename Dtype>
class Pad2dParam : public OpParam {
typedef typename DtypeTensorTrait<Dtype>::gtype GType;
typedef typename DtypeTensorTrait<Dtype>::rtype RType;
public:
Pad2dParam(const VariableNameMap &inputs, const VariableNameMap &outputs,
const AttributeMap &attrs, const Scope &scope) {
input_x_ = InputXFrom<GType>(inputs, scope);
out_ = OutFrom<GType>(outputs, scope);
}
const RType *InputX() const { return input_x_; }
RType *Out() const { return out_; }
private:
RType *input_x_;
RType *out_;
};
#endif
} // namespace operators
} // namespace paddle_mobile
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifdef PAD2D_OP
#include "operators/pad2d_op.h"
namespace paddle_mobile {
namespace operators {
template <typename Dtype, typename T>
void Pad2dOp<Dtype, T>::InferShape() const {
auto input_dims = this->param_.InputX()->dims();
auto input_n = input_dims[0];
auto input_c = input_dims[1];
auto input_h = input_dims[2];
auto input_w = input_dims[3];
this->param_.Out()->Resize({input_n, input_c, input_h + 1, input_w + 1});
}
} // namespace operators
} // namespace paddle_mobile
namespace ops = paddle_mobile::operators;
#ifdef PADDLE_MOBILE_CPU
REGISTER_OPERATOR_CPU(pad2d, ops::Pad2dOp);
#endif
#ifdef PADDLE_MOBILE_FPGA
REGISTER_OPERATOR_FPGA(pad2d, ops::Pad2dOp);
#endif
#endif
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifdef PAD2D_OP
#pragma once
#include <string>
#include "framework/operator.h"
#include "operators/kernel/pad2d_kernel.h"
#include "operators/op_param.h"
namespace paddle_mobile {
namespace operators {
using framework::AttributeMap;
using framework::OperatorWithKernel;
using framework::Scope;
using std::string;
template <typename DeviceType, typename T>
class Pad2dOp
: public OperatorWithKernel<DeviceType, Pad2dParam<DeviceType>,
operators::Pad2dKernel<DeviceType, T>> {
public:
Pad2dOp(const string &type, const VariableNameMap &inputs,
const VariableNameMap &outputs, const AttributeMap &attrs,
std::shared_ptr<Scope> scope)
: OperatorWithKernel<DeviceType, Pad2dParam<DeviceType>,
operators::Pad2dKernel<DeviceType, T>>(
type, inputs, outputs, attrs, scope) {}
void InferShape() const override;
private:
};
} // namespace operators
} // namespace paddle_mobile
#endif
......@@ -97,7 +97,7 @@ void dump_stride_float(std::string filename, Tensor input_tensor,
out.close();
}
static const char *g_resnet50 = "../models/resnet50";
const std::string g_image_src_float = "../images/image_src_float";
const std::string g_image_src_float = "../images/image_src_float"; // NOLINT
int main() {
paddle_mobile::fpga::open_device();
paddle_mobile::PaddleMobile<paddle_mobile::FPGA> paddle_mobile;
......@@ -114,14 +114,14 @@ int main() {
std::string saveName = "resnet50_result_" + std::to_string(i);
paddle_mobile::fpga::fpga_invalidate((*tensor_ptr).get_data(),
tensor_ptr->numel() * sizeof(half));
dump_stride_half(saveName, (*tensor_ptr), 20);
// dump_stride_half(saveName, (*tensor_ptr), 20);
// dump(saveName, (*tensor_ptr));
}
auto tensor_ptr = paddle_mobile.FetchResult(73);
dump_stride_float("resnet50_result_73", (*tensor_ptr), 20);
// dump_stride_float("resnet50_result_73", (*tensor_ptr), 20);
tensor_ptr = paddle_mobile.FetchResult(74);
dump_stride_float("resnet50_result_74", (*tensor_ptr), 9999);
// dump_stride_float("resnet50_result_74", (*tensor_ptr), 9999);
float max = 0;
auto data_ptr = tensor_ptr->data<float>();
......
......@@ -23,21 +23,101 @@ limitations under the License. */
#include "fpga/V2/api.h"
#endif
void readStream(std::string filename, uint8_t *buf) {
#include <string>
void readStream(std::string filename, char *buf) {
std::ifstream in;
in.open(filename, std::ios::in);
in.open(filename, std::ios::in | std::ios::binary);
if (!in.is_open()) {
std::cout << "open File Failed." << std::endl;
return;
}
int i = 0;
while (!in.eof()) {
in >> buf[i];
i++;
}
in.seekg(0, std::ios::end); // go to the end
auto length = in.tellg(); // report location (this is the length)
in.seekg(0, std::ios::beg); // go back to the beginning
in.read(buf, length);
DLOG << length;
in.close();
}
void convert_to_chw(int16_t **data_in, int channel, int height, int width,
int num, int16_t *data_tmp) {
int64_t amount_per_side = width * height;
for (int n = 0; n < num; n++) {
for (int h = 0; h < height; h++) {
for (int w = 0; w < width; w++) {
for (int c = 0; c < channel; c++) {
*(data_tmp + n * amount_per_side * channel + c * amount_per_side +
width * h + w) = *((*data_in)++);
}
}
}
}
}
void dump_stride_half(std::string filename, Tensor input_tensor,
const int dumpnum, bool use_chw) {
// bool use_chw = true;
if (input_tensor.dims().size() != 4) return;
int c = (input_tensor.dims())[1];
int h = (input_tensor.dims())[2];
int w = (input_tensor.dims())[3];
int n = (input_tensor.dims())[0];
auto data_ptr = input_tensor.get_data();
auto *data_ptr_16 = reinterpret_cast<half *>(data_ptr);
auto data_tmp = data_ptr_16;
if (use_chw) {
data_tmp =
reinterpret_cast<half *>(malloc(n * c * h * w * sizeof(int16_t)));
convert_to_chw(&data_ptr_16, c, h, w, n, data_tmp);
}
std::ofstream out(filename.c_str());
float result = 0;
int stride = input_tensor.numel() / dumpnum;
stride = stride > 0 ? stride : 1;
for (int i = 0; i < input_tensor.numel(); i += stride) {
result = paddle_mobile::fpga::fp16_2_fp32(data_tmp[i]);
out << result << std::endl;
}
out.close();
if (data_tmp != data_ptr_16) {
free(data_tmp);
}
}
void dump_stride_float(std::string filename, Tensor input_tensor,
const int dumpnum) {
auto data_ptr = reinterpret_cast<float *>(input_tensor.get_data());
std::ofstream out(filename.c_str());
float result = 0;
int stride = input_tensor.numel() / dumpnum;
stride = stride > 0 ? stride : 1;
for (int i = 0; i < input_tensor.numel(); i += stride) {
result = data_ptr[i];
out << result << std::endl;
}
out.close();
}
void dump_stride(std::string filename, Tensor input_tensor, const int dumpnum,
bool use_chw) {
static int i = 0;
if (input_tensor.numel() == 0) {
return;
}
if (input_tensor.type() == typeid(float)) {
DLOG << "op: " << i++ << ", float data " << input_tensor.numel();
dump_stride_float(filename, input_tensor, dumpnum);
} else {
DLOG << "op: " << i++ << ", half data " << input_tensor.numel();
dump_stride_half(filename, input_tensor, dumpnum, use_chw);
}
DLOG << "dump input address: " << input_tensor.get_data();
}
static const char *g_rfcn_combine = "../models/rfcn";
static const char *g_image_src_float = "../models/rfcn/data.bin";
int main() {
......@@ -48,12 +128,45 @@ int main() {
std::string(g_rfcn_combine) + "/params", true, false,
1, true)) {
float img_info[3] = {768, 1536, 768.0f / 960.0f};
auto img = fpga::fpga_malloc(768 * 1536 * 3 * sizeof(float));
readStream(g_image_src_float, reinterpret_cast<uint8_t *>(img));
auto img = reinterpret_cast<float *>(
fpga::fpga_malloc(768 * 1536 * 3 * sizeof(float)));
readStream(g_image_src_float, reinterpret_cast<char *>(img));
std::vector<void *> v(3, nullptr);
paddle_mobile.FeedData({img_info, img});
paddle_mobile.Predict_To(-1);
paddle_mobile.GetResults(&v);
for (int i = 55; i < 69; i++) {
auto tensor_ptr = paddle_mobile.FetchResult(i);
std::string saveName = "rfcn_" + std::to_string(i);
// if(i != 58)
paddle_mobile::fpga::fpga_invalidate((*tensor_ptr).get_data(),
tensor_ptr->numel() * sizeof(float));
// tensor_ptr->numel() * sizeof(float));
if ((i == 48) || (i == 47)) {
dump_stride(saveName, (*tensor_ptr), 20,
false); // 20);//tensor_ptr->numel());
} else if (i == 55) {
dump_stride(saveName, (*tensor_ptr), tensor_ptr->numel(),
true); // 20);//tensor_ptr->numel());
} else {
dump_stride(saveName, (*tensor_ptr), tensor_ptr->numel(),
true); // 20);//tensor_ptr->numel());
}
/* float result = 0;
std::string str = "softmax_input_data";
float* data =
static_cast<float*>(fpga::fpga_malloc(tensor_ptr->numel() *
sizeof(float))); str = "softmax_output_data"; auto output_ptr =
static_cast<half*>((*tensor_ptr).get_data()); for (int idx = 0; idx <
tensor_ptr->numel(); ++idx)
{
data[idx] = fpga::fp16_2_fp32(output_ptr[idx]);
}
fpga::savefile<float>(str,data, tensor_ptr->numel(), result ); */
}
// paddle_mobile.GetResults(&v);
DLOG << "Computation done";
fpga::fpga_free(img);
}
......
......@@ -131,7 +131,12 @@ if (CON GREATER -1)
set(PROPOSAL_OP ON)
set(ANCHOR_GENERATOR_OP ON)
set(SLICE_OP ON)
set(SIGMOID_OP ON)
set(CONCAT_OP ON)
set(PAD2D_OP ON)
set(CONV_TRANSPOSE_OP ON)
set(FUSION_DECONVADDBNRELU_OP ON)
set(FUSION_DECONVADDBN_OP ON)
set(FOUND_MATCH ON)
endif()
......@@ -573,7 +578,6 @@ endif()
if (FUSION_DECONVADDRELU_OP)
add_definitions(-DFUSION_DECONVADDRELU_OP)
endif()
if (WHILE_OP)
add_definitions(-DWHILE_OP)
endif()
......@@ -602,3 +606,12 @@ endif()
if (ROI_PERSPECTIVE_OP)
add_definitions(-DROI_PERSPECTIVE_OP)
endif()
if (FUSION_DECONVADDBNRELU_OP)
add_definitions(-DFUSION_DECONVADDBNRELU_OP)
endif()
if (FUSION_DECONVADDBN_OP)
add_definitions(-DFUSION_DECONVADDBN_OP)
endif()
if (PAD2D_OP)
add_definitions(-DPAD2D_OP)
endif()
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册