Unverified · Commit cf542116 · authored by qnqinan, committed by GitHub

update feed fetch and softmax kernel in FPGA v2 track, fixed #1615 (#1616)

* update concat and split kernel and related files in FPGA v2(v3) track

* update

* update

* update kernel and related files in FPGA v2 track

* update

* update

* update kernel and related files for static quantization in FPGA v2 track

* update

* update feed and fetch kernel in FPGA v2 track

* update io file

* update feed fetch and softmax kernel in FPGA v2 track
Parent 3a9016d7
@@ -50,11 +50,13 @@ void format_int8_ofm(framework::Tensor *ofm_tensor) {
   auto dims = ofm_tensor->dims();
   size_t memory_size = 0;
   if (dims.size() == 4) {
-    auto channel = dims[1], height = dims[2], width = dims[3], num = dims[0];
+    auto num = (dims[0] == 0) ? 1 : dims[0], channel = dims[1],
+         height = dims[2], width = dims[3];
     memory_size = num * height * align_to_x(channel * width, IMAGE_ALIGNMENT) *
                   sizeof(int8_t);
   } else if (dims.size() == 2) {
-    memory_size = align_to_x(dims[1], IMAGE_ALIGNMENT) * sizeof(int8_t);
+    auto num = (dims[0] == 0) ? 1 : dims[0], channel = dims[1];
+    memory_size = num * align_to_x(channel, IMAGE_ALIGNMENT) * sizeof(int8_t);
   } else {
     DLOG << "Wrong ofm dimension";
   }
@@ -68,11 +70,13 @@ void format_int8_ofm(framework::Tensor *ofm_tensor) {
 void format_int8_ofm(framework::Tensor *ofm_tensor, framework::DDim dims) {
   size_t memory_size = 0;
   if (dims.size() == 4) {
-    auto channel = dims[1], height = dims[2], width = dims[3];
-    memory_size =
-        height * align_to_x(channel * width, IMAGE_ALIGNMENT) * sizeof(int8_t);
+    auto num = (dims[0] == 0) ? 1 : dims[0], channel = dims[1],
+         height = dims[2], width = dims[3];
+    memory_size = num * height * align_to_x(channel * width, IMAGE_ALIGNMENT) *
+                  sizeof(int8_t);
   } else if (dims.size() == 2) {
-    memory_size = align_to_x(dims[1], IMAGE_ALIGNMENT) * sizeof(int8_t);
+    auto num = (dims[0] == 0) ? 1 : dims[0], channel = dims[1];
+    memory_size = num * align_to_x(channel, IMAGE_ALIGNMENT) * sizeof(int8_t);
   } else {
     DLOG << "Wrong ofm dimension";
   }
@@ -87,11 +91,13 @@ void format_fp32_ofm(framework::Tensor *ofm_tensor) {
   auto dims = ofm_tensor->dims();
   size_t memory_size = 0;
   if (dims.size() == 4) {
-    auto channel = dims[1], height = dims[2], width = dims[3];
-    memory_size =
-        height * align_to_x(channel * width, IMAGE_ALIGNMENT) * sizeof(float);
+    auto num = (dims[0] == 0) ? 1 : dims[0], channel = dims[1],
+         height = dims[2], width = dims[3];
+    memory_size = num * height * align_to_x(channel * width, IMAGE_ALIGNMENT) *
+                  sizeof(float);
   } else if (dims.size() == 2) {
-    memory_size = align_to_x(dims[1], IMAGE_ALIGNMENT) * sizeof(float);
+    auto num = (dims[0] == 0) ? 1 : dims[0], channel = dims[1];
+    memory_size = num * align_to_x(channel, IMAGE_ALIGNMENT) * sizeof(float);
   } else {
     DLOG << "Wrong ofm dimension";
   }
......
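The three hunks above make the same change to output-feature-map sizing: the batch dimension now enters the byte count, with dims[0] == 0 treated as a batch of 1, and the 2-D branch is padded the same way as the 4-D one. Assuming align_to_x(n, x) rounds n up to the next multiple of x and that IMAGE_ALIGNMENT is the FPGA row alignment (16 is used here as a stand-in), the arithmetic works out as in this small self-contained sketch:

    #include <cstddef>
    #include <cstdint>
    #include <cstdio>

    constexpr int kImageAlignment = 16;  // stand-in for IMAGE_ALIGNMENT

    // Assumed behaviour of align_to_x(): round n up to the next multiple of x.
    inline int align_to_x(int n, int x) { return (n + x - 1) / x * x; }

    int main() {
      int num = 1, channel = 3, height = 224, width = 224;
      // Mirrors the 4-D branch above: num * height * aligned(channel * width) bytes.
      std::size_t memory_size = static_cast<std::size_t>(num) * height *
                                align_to_x(channel * width, kImageAlignment) *
                                sizeof(int8_t);
      std::printf("int8 ofm bytes: %zu\n", memory_size);  // 224 * 672 = 150528
      return 0;
    }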
@@ -21,6 +21,7 @@ template <>
 bool FeedKernel<FPGA, float>::Init(FeedParam<FPGA> *param) {
   auto output = param->Out();
   if (output->dims().size() != 4) {
+    output->init(type_id<float>().hash_code());
     return true;
   }
   fpga::format_ofm(output);
@@ -44,6 +45,7 @@ void FeedKernel<FPGA, float>::Compute(const FeedParam<FPGA> &param) {
   }
   fpga::format_image(input);
   output->ShareDataWith(*input);
+  input->external_data = nullptr;
 }
 template class FeedKernel<FPGA, float>;
......
@@ -23,9 +23,7 @@ bool FetchKernel<FPGA, float>::Init(FetchParam<FPGA> *param) {
   auto output = &(param->Out()->at(col));
   output->init(type_id<float>().hash_code());
   output->mutable_data<float>(input->dims());
-  if (input->type() == type_id<float>()) {
-    return true;
-  }
   auto aligned_output = param->aligned_out;
   int outC = 1;
   int outW = 1;
@@ -61,13 +59,33 @@ void FetchKernel<FPGA, float>::Compute(const FetchParam<FPGA> &param) {
   auto input = const_cast<LoDTensor *>(param.InputX());
   int col = param.Col();
   auto output = &param.Out()->at(col);
+  auto outdata_ptr = const_cast<float *>(output->data<float>());
+  int outC = 1;
+  int outH = 1;
+  int outW = 1;
+  if (output->dims().size() == 4) {
+    outC = output->dims()[1];
+    outH = output->dims()[2];
+    outW = output->dims()[3];
+  } else {  // 2
+    outC = output->dims()[1];
+  }
+  int unalignedCW = outC * outW;
+  int alignedCW = fpga::align_to_x(unalignedCW, IMAGE_ALIGNMENT);
   if (input->type() == type_id<float>()) {
-    output->ShareDataWith(*input);
+    if (unalignedCW == alignedCW) {
+      output->ShareDataWith(*input);
+    } else {
+      auto input_address = input->data<float>();
+      dealign(input_address, outdata_ptr, outC, outH, outW);
+      fpga::fpga_flush(outdata_ptr, outC * outH * outW * sizeof(float));
+    }
     return;
   }
   auto input_address = input->data<int8_t>();
   float Si = input->scale[0];
-  auto outdata_ptr = const_cast<float *>(output->data<float>());
   const int num_th = 32;
   fpga::fpga_invalidate(input_address, (input->fpga_data_num) * sizeof(int8_t));
   if (input->fpga_data_num < num_th) {
@@ -77,21 +95,11 @@ void FetchKernel<FPGA, float>::Compute(const FetchParam<FPGA> &param) {
     fpga::fpga_flush(outdata_ptr, product(input->dims()) * sizeof(float));
     return;
   }
-  int outC = 1;
-  int outH = 1;
-  int outW = 1;
-  if (output->dims().size() == 4) {
-    outC = output->dims()[1];
-    outH = output->dims()[2];
-    outW = output->dims()[3];
-  } else {  // 2
-    outC = output->dims()[1];
-  }
-  int unalignedCW = outC * outW;
-  int alignedCW = fpga::align_to_x(unalignedCW, IMAGE_ALIGNMENT);
   auto aligned_out = param.aligned_out.get();
   if (unalignedCW != alignedCW) {
     auto aligned_ptr = aligned_out->data<float>();
+    fpga::fpga_invalidate(aligned_ptr, (input->fpga_data_num) * sizeof(float));
     for (int idx = 0; idx < input->fpga_data_num; ++idx) {
       aligned_ptr[idx] = input_address[idx] * Si;
     }
......
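In the new float path above, the output only shares the input buffer when outC * outW is already a multiple of IMAGE_ALIGNMENT; otherwise dealign() strips the per-row padding before the flush. dealign() itself is defined elsewhere in the fetch kernel source and is not part of this diff; a hedged sketch of what such a copy could look like, with stand-in names for the alignment helpers:

    #include <cstring>

    static const int kImageAlignment = 16;  // stand-in for IMAGE_ALIGNMENT
    static int align_to_x(int n, int x) { return (n + x - 1) / x * x; }

    // Hypothetical dealign(): copy each of the outH rows, keeping only the first
    // outC * outW floats and skipping the alignment padding at the end of each row.
    static void dealign(const float *in, float *out, int outC, int outH, int outW) {
      int unalignedCW = outC * outW;                             // useful floats per row
      int alignedCW = align_to_x(unalignedCW, kImageAlignment);  // padded row stride
      for (int h = 0; h < outH; ++h) {
        std::memcpy(out + h * unalignedCW, in + h * alignedCW,
                    unalignedCW * sizeof(float));
      }
    }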
@@ -24,16 +24,11 @@ template <>
 bool SoftmaxKernel<FPGA, float>::Init(SoftmaxParam<FPGA> *param) {
   auto input = const_cast<LoDTensor *>(param->InputX());
   auto dims = framework::vectorize(input->dims());
-  int8_t *input_ptr;
-  auto out = param->Out();
-  if (input->type() == type_id<float>()) {
-    out->Resize(framework::make_ddim(dims));
-    out->mutable_data<float>(framework::make_ddim(dims));
-  } else {
-    input_ptr = input->data<int8_t>();
-  }
-  auto float_input = new LoDTensor;
+  auto out = param->Out();
+  out->Resize(framework::make_ddim(dims));
+  out->mutable_data<int8_t>(framework::make_ddim(dims));
+  fpga::format_ofm(out);
   PADDLE_MOBILE_ENFORCE(input->dims().size() == 4,
                         "Softmax should have 4-order input");
@@ -45,28 +40,10 @@ bool SoftmaxKernel<FPGA, float>::Init(SoftmaxParam<FPGA> *param) {
     dims[1] = 1;
   }
   input->Resize(framework::make_ddim(dims));
-  float_input->Resize(framework::make_ddim(dims));
-  if (channel != 2) {  // Use CPU
-    out->Resize(framework::make_ddim(dims));
-    out->mutable_data<float>(framework::make_ddim(dims));
-    float_input->init(type_id<float>().hash_code());
-    float_input->mutable_data<float>(framework::make_ddim(dims));
-    fpga::BypassArgs args = {fpga::DATA_TYPE_FP16};
-    args.input_layout_type = fpga::LAYOUT_HWC;
-    args.output_layout_type = fpga::LAYOUT_CHW;
-    args.input_data_type = fpga::DATA_TYPE_FP16;
-    args.output_data_type = fpga::DATA_TYPE_FP32;
-    args.image.address = input_ptr;
-    args.image.height = (uint32_t)dims[1] * dims[0];
-    args.image.width = (uint32_t)dims[2];
-    args.image.channels = (uint32_t)dims[3];
-    args.output.address = float_input->data<float>();
-    args.output.scale_address = float_input->scale;
-    param->SetFloatInput(float_input);
-    param->SetFpgaArgs(args);
-  } else {  // Use FPGA
+  if ((channel == 2) && (input->type() == type_id<int8_t>())) {
+    auto input_ptr = input->data<int8_t>();
+    float Si = input->scale[0];
+    int16_t slope = fpga::fp32_2_fp16(Si / 127);
     fpga::format_ofm(out);
     fpga::BypassArgs args = {fpga::DATA_TYPE_FP16};
     args.input_layout_type = fpga::LAYOUT_HWC;
@@ -77,10 +54,28 @@ bool SoftmaxKernel<FPGA, float>::Init(SoftmaxParam<FPGA> *param) {
     args.image.height = (uint32_t)input->dims()[1];
     args.image.width = (uint32_t)input->dims()[2];
     args.image.channels = (uint32_t)input->dims()[3];
-    args.output.address = out->data<half>();
+    args.output.address = out->data<int8_t>();
     args.output.scale_address = out->scale;
     args.output.activation.activation_type = fpga::SOFTMAX;
+    args.output.activation.leaky_relu_negative_slope = slope;
     param->SetFpgaArgs(args);
+  } else if (input->type() == type_id<int8_t>()) {
+    auto float_input_x = param->float_input_x_;
+    float_input_x = std::make_shared<Tensor>();
+    float_input_x->Resize(input->dims());
+    float_input_x->init(type_id<float>().hash_code());
+    fpga::format_ofm(float_input_x.get());
+    auto float_out = param->float_out;
+    float_out = std::make_shared<Tensor>();
+    float_out->Resize(input->dims());
+    float_out->init(type_id<float>().hash_code());
+    fpga::format_ofm(float_out.get());
+  } else {
+    auto float_out = param->float_out;
+    float_out = std::make_shared<Tensor>();
+    float_out->Resize(input->dims());
+    float_out->init(type_id<float>().hash_code());
+    fpga::format_ofm(float_out.get());
   }
   return true;
@@ -89,24 +84,54 @@ bool SoftmaxKernel<FPGA, float>::Init(SoftmaxParam<FPGA> *param) {
 template <>
 void SoftmaxKernel<FPGA, float>::Compute(const SoftmaxParam<FPGA> &param) {
   auto *in_x = (param.InputX());
-  if (in_x->type() == type_id<int8_t>()) {
+  auto dims = in_x->dims();
+  auto n = dims[0];
+  auto h = dims[1];
+  auto w = dims[2];
+  auto c = dims[3];
+  if ((c == 2) && (in_x->type() == type_id<int8_t>())) {
     fpga::PerformBypass(param.FpgaArgs());
-    if (param.FpgaArgs().output.activation.activation_type != fpga::SOFTMAX) {
-      Tensor *out = param.Out();
-      Tensor *in_x2 = param.FloatInput();
-      fpga::fpga_invalidate(in_x2->data<float>(),
-                            in_x2->numel() * sizeof(float));
-      math::SoftmaxFuntor<CPU, float>()(in_x2, out);
-      fpga::fpga_flush(out->data<float>(), out->memory_size());
-    }
+  } else if (in_x->type() == type_id<int8_t>()) {
+    auto in_data = in_x->data<int8_t>();
+    float Si = in_x->scale[0];
+    Tensor *out = param.Out();
+    out->Resize(
+        {in_x->dims()[0], out->dims()[1], out->dims()[2], out->dims()[3]});
+    auto out_data = out->data<int8_t>();
+    auto float_input_x = param.float_input_x_;
+    auto float_input_x_data = float_input_x->data<float>();
+    int dataNum = n * h * fpga::align_to_x(w * c, IMAGE_ALIGNMENT);
+    for (int i = 0; i < dataNum; i++) {
+      float_input_x_data[i] = in_data[i] * Si / 127;
+    }
+    auto float_out = param.float_out;
+    auto float_out_data = float_out->data<float>();
+    math::SoftmaxFuntor<CPU, float>()(float_input_x.get(), float_out.get());
+    for (int i = 0; i < dataNum; i++) {
+      float tmp_out = float_out_data[i] * 127;
+      out_data[i] = tmp_out < 0 ? (signed char)(tmp_out - 0.5)
+                                : (signed char)(tmp_out + 0.5);
+    }
+    fpga::fpga_flush(out_data, dataNum * sizeof(int8_t));
   } else {
-    if (param.FpgaArgs().output.activation.activation_type != fpga::SOFTMAX) {
-      Tensor *out = param.Out();
-      out->Resize(
-          {in_x->dims()[0], out->dims()[1], out->dims()[2], out->dims()[3]});
-      math::SoftmaxFuntor<CPU, float>()(in_x, out);
-    }
+    Tensor *out = param.Out();
+    out->Resize(
+        {in_x->dims()[0], out->dims()[1], out->dims()[2], out->dims()[3]});
+    auto out_data = out->data<int8_t>();
+    auto float_out = param.float_out;
+    float_out = std::make_shared<Tensor>();
+    float_out->Resize(in_x->dims());
+    float_out->init(type_id<float>().hash_code());
+    fpga::format_ofm(float_out.get());
+    auto float_out_data = float_out->data<float>();
+    math::SoftmaxFuntor<CPU, float>()(in_x, float_out.get());
+    int dataNum = n * h * fpga::align_to_x(w * c, IMAGE_ALIGNMENT);
+    for (int i = 0; i < dataNum; i++) {
+      float tmp_out = float_out_data[i] * 127;
+      out_data[i] = tmp_out < 0 ? (signed char)(tmp_out - 0.5)
+                                : (signed char)(tmp_out + 0.5);
+    }
+    fpga::fpga_flush(out_data, dataNum * sizeof(int8_t));
   }
 }
......
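With the softmax output now kept in int8, the CPU fallback paths above dequantize with the input scale (in_data[i] * Si / 127), run the float softmax, then requantize by scaling by 127 and rounding to nearest. A small self-contained sketch of that round-trip under the same symmetric-scale assumption (helper names are illustrative, not from the source):

    #include <cstddef>
    #include <cstdint>
    #include <vector>

    // Dequantize int8 data with its symmetric scale, as the kernel does before
    // calling the CPU softmax: real_value = q * Si / 127.
    std::vector<float> dequantize(const std::vector<int8_t> &q, float Si) {
      std::vector<float> out(q.size());
      for (std::size_t i = 0; i < q.size(); ++i) out[i] = q[i] * Si / 127.0f;
      return out;
    }

    // Requantize the float softmax result back to int8 with round-to-nearest,
    // matching the (signed char)(tmp_out +/- 0.5) pattern in the kernel.
    std::vector<int8_t> requantize(const std::vector<float> &f) {
      std::vector<int8_t> out(f.size());
      for (std::size_t i = 0; i < f.size(); ++i) {
        float v = f[i] * 127.0f;  // softmax outputs lie in [0, 1]
        out[i] = static_cast<int8_t>(v < 0 ? v - 0.5f : v + 0.5f);
      }
      return out;
    }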
@@ -1101,6 +1101,8 @@ class SoftmaxParam : public OpParam {
 #ifdef PADDLE_MOBILE_FPGA
+#ifdef PADDLE_MOBILE_FPGA_V1
+
  private:
   std::shared_ptr<GType> float_input_x_;
   fpga::BypassArgs fpga_bypass_args;
@@ -1112,6 +1114,18 @@ class SoftmaxParam : public OpParam {
   void SetFloatInput(LoDTensor *input) { float_input_x_.reset(input); }
   const fpga::BypassArgs &FpgaArgs() const { return fpga_bypass_args; }
   void SetFpgaArgs(const fpga::BypassArgs &args) { fpga_bypass_args = args; }
+#else
+
+ private:
+  fpga::BypassArgs fpga_bypass_args;
+
+ public:
+  const fpga::BypassArgs &FpgaArgs() const { return fpga_bypass_args; }
+  void SetFpgaArgs(const fpga::BypassArgs &args) { fpga_bypass_args = args; }
+
+ public:
+  std::shared_ptr<Tensor> float_input_x_, float_out;
+#endif
 #endif
 };
 #endif
......