Unverified commit cf542116, authored by qnqinan, committed by GitHub

update feed, fetch and softmax kernels in FPGA v2 track, fixed #1615 (#1616)

* update concat and split kernel and related files in FPGA v2(v3) track

* update

* update

* update kernel and related files in FPGA v2 track

* update

* update

* update kernel and related files for static quantization in FPGA v2 track

* update

* update feed and fetch kernel in FPGA v2 track

* update io file

* update feed fetch and softmax kernel in FPGA v2 track
Parent 3a9016d7
@@ -50,11 +50,13 @@ void format_int8_ofm(framework::Tensor *ofm_tensor) {
auto dims = ofm_tensor->dims();
size_t memory_size = 0;
if (dims.size() == 4) {
auto channel = dims[1], height = dims[2], width = dims[3], num = dims[0];
auto num = (dims[0] == 0) ? 1 : dims[0], channel = dims[1],
height = dims[2], width = dims[3];
memory_size = num * height * align_to_x(channel * width, IMAGE_ALIGNMENT) *
sizeof(int8_t);
} else if (dims.size() == 2) {
memory_size = align_to_x(dims[1], IMAGE_ALIGNMENT) * sizeof(int8_t);
auto num = (dims[0] == 0) ? 1 : dims[0], channel = dims[1];
memory_size = num * align_to_x(channel, IMAGE_ALIGNMENT) * sizeof(int8_t);
} else {
DLOG << "Wrong ofm dimension";
}
@@ -68,11 +70,13 @@ void format_int8_ofm(framework::Tensor *ofm_tensor) {
void format_int8_ofm(framework::Tensor *ofm_tensor, framework::DDim dims) {
size_t memory_size = 0;
if (dims.size() == 4) {
auto channel = dims[1], height = dims[2], width = dims[3];
memory_size =
height * align_to_x(channel * width, IMAGE_ALIGNMENT) * sizeof(int8_t);
auto num = (dims[0] == 0) ? 1 : dims[0], channel = dims[1],
height = dims[2], width = dims[3];
memory_size = num * height * align_to_x(channel * width, IMAGE_ALIGNMENT) *
sizeof(int8_t);
} else if (dims.size() == 2) {
memory_size = align_to_x(dims[1], IMAGE_ALIGNMENT) * sizeof(int8_t);
auto num = (dims[0] == 0) ? 1 : dims[0], channel = dims[1];
memory_size = num * align_to_x(channel, IMAGE_ALIGNMENT) * sizeof(int8_t);
} else {
DLOG << "Wrong ofm dimension";
}
@@ -87,11 +91,13 @@ void format_fp32_ofm(framework::Tensor *ofm_tensor) {
auto dims = ofm_tensor->dims();
size_t memory_size = 0;
if (dims.size() == 4) {
auto channel = dims[1], height = dims[2], width = dims[3];
memory_size =
height * align_to_x(channel * width, IMAGE_ALIGNMENT) * sizeof(float);
auto num = (dims[0] == 0) ? 1 : dims[0], channel = dims[1],
height = dims[2], width = dims[3];
memory_size = num * height * align_to_x(channel * width, IMAGE_ALIGNMENT) *
sizeof(float);
} else if (dims.size() == 2) {
memory_size = align_to_x(dims[1], IMAGE_ALIGNMENT) * sizeof(float);
auto num = (dims[0] == 0) ? 1 : dims[0], channel = dims[1];
memory_size = num * align_to_x(channel, IMAGE_ALIGNMENT) * sizeof(float);
} else {
DLOG << "Wrong ofm dimension";
}
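As an aside for readers tracing the size arithmetic in the hunks above, here is a minimal standalone sketch of the rule the three format_*_ofm overloads now share. It assumes `align_to_x(n, x)` rounds `n` up to the next multiple of `x` and uses a placeholder `IMAGE_ALIGNMENT` value; neither definition appears in this diff.

```cpp
#include <cstddef>
#include <cstdint>
#include <vector>

// Placeholder alignment; the real IMAGE_ALIGNMENT comes from the FPGA headers.
constexpr size_t kImageAlignment = 16;

// Assumed behaviour of align_to_x: round n up to the next multiple of x.
inline size_t align_to_x(size_t n, size_t x) { return (n + x - 1) / x * x; }

// OFM buffer size after this change: each row of C*W (4-D) or C (2-D)
// elements is padded up to the alignment, and a batch dimension of 0 is
// treated as 1 so the buffer is never zero-sized.
size_t int8_ofm_bytes(const std::vector<int64_t> &dims) {
  size_t num = (dims[0] == 0) ? 1 : static_cast<size_t>(dims[0]);
  if (dims.size() == 4) {
    size_t channel = dims[1], height = dims[2], width = dims[3];
    return num * height * align_to_x(channel * width, kImageAlignment) *
           sizeof(int8_t);
  }
  // dims.size() == 2
  return num * align_to_x(static_cast<size_t>(dims[1]), kImageAlignment) *
         sizeof(int8_t);
}
```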
@@ -21,6 +21,7 @@ template <>
bool FeedKernel<FPGA, float>::Init(FeedParam<FPGA> *param) {
auto output = param->Out();
if (output->dims().size() != 4) {
output->init(type_id<float>().hash_code());
return true;
}
fpga::format_ofm(output);
@@ -44,6 +45,7 @@ void FeedKernel<FPGA, float>::Compute(const FeedParam<FPGA> &param) {
}
fpga::format_image(input);
output->ShareDataWith(*input);
input->external_data = nullptr;
}
template class FeedKernel<FPGA, float>;
@@ -23,9 +23,7 @@ bool FetchKernel<FPGA, float>::Init(FetchParam<FPGA> *param) {
auto output = &(param->Out()->at(col));
output->init(type_id<float>().hash_code());
output->mutable_data<float>(input->dims());
if (input->type() == type_id<float>()) {
return true;
}
auto aligned_output = param->aligned_out;
int outC = 1;
int outW = 1;
@@ -61,13 +59,33 @@ void FetchKernel<FPGA, float>::Compute(const FetchParam<FPGA> &param) {
auto input = const_cast<LoDTensor *>(param.InputX());
int col = param.Col();
auto output = &param.Out()->at(col);
auto outdata_ptr = const_cast<float *>(output->data<float>());
int outC = 1;
int outH = 1;
int outW = 1;
if (output->dims().size() == 4) {
outC = output->dims()[1];
outH = output->dims()[2];
outW = output->dims()[3];
} else { // 2
outC = output->dims()[1];
}
int unalignedCW = outC * outW;
int alignedCW = fpga::align_to_x(unalignedCW, IMAGE_ALIGNMENT);
if (input->type() == type_id<float>()) {
output->ShareDataWith(*input);
if (unalignedCW == alignedCW) {
output->ShareDataWith(*input);
} else {
auto input_address = input->data<float>();
dealign(input_address, outdata_ptr, outC, outH, outW);
fpga::fpga_flush(outdata_ptr, outC * outH * outW * sizeof(float));
}
return;
}
auto input_address = input->data<int8_t>();
float Si = input->scale[0];
auto outdata_ptr = const_cast<float *>(output->data<float>());
const int num_th = 32;
fpga::fpga_invalidate(input_address, (input->fpga_data_num) * sizeof(int8_t));
if (input->fpga_data_num < num_th) {
@@ -77,21 +95,11 @@ void FetchKernel<FPGA, float>::Compute(const FetchParam<FPGA> &param) {
fpga::fpga_flush(outdata_ptr, product(input->dims()) * sizeof(float));
return;
}
int outC = 1;
int outH = 1;
int outW = 1;
if (output->dims().size() == 4) {
outC = output->dims()[1];
outH = output->dims()[2];
outW = output->dims()[3];
} else { // 2
outC = output->dims()[1];
}
int unalignedCW = outC * outW;
int alignedCW = fpga::align_to_x(unalignedCW, IMAGE_ALIGNMENT);
auto aligned_out = param.aligned_out.get();
if (unalignedCW != alignedCW) {
auto aligned_ptr = aligned_out->data<float>();
fpga::fpga_invalidate(aligned_ptr, (input->fpga_data_num) * sizeof(float));
for (int idx = 0; idx < input->fpga_data_num; ++idx) {
aligned_ptr[idx] = input_address[idx] * Si;
}
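The unalignedCW/alignedCW handling above undoes the row padding applied on the FPGA side. The real dealign helper is not shown in this diff, so the following is only a sketch of the behaviour its call site suggests: each of the H rows keeps its first C*W valid elements out of an aligned row stride.

```cpp
#include <cstring>

// Hypothetical sketch of a dealign-style copy: the FPGA result stores each of
// the H rows with a stride of align_to_x(C * W, IMAGE_ALIGNMENT) elements,
// while the framework tensor expects densely packed C * W elements per row.
void dealign_sketch(const float *src, float *dst, int channel, int height,
                    int width, int aligned_cw) {
  const int unaligned_cw = channel * width;
  for (int h = 0; h < height; ++h) {
    std::memcpy(dst + h * unaligned_cw, src + h * aligned_cw,
                unaligned_cw * sizeof(float));
  }
}
```

Only the first unalignedCW values of each aligned row carry data, which is also why the fast path above can simply ShareDataWith the input when unalignedCW equals alignedCW.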
@@ -24,16 +24,11 @@ template <>
bool SoftmaxKernel<FPGA, float>::Init(SoftmaxParam<FPGA> *param) {
auto input = const_cast<LoDTensor *>(param->InputX());
auto dims = framework::vectorize(input->dims());
int8_t *input_ptr;
auto out = param->Out();
if (input->type() == type_id<float>()) {
out->Resize(framework::make_ddim(dims));
out->mutable_data<float>(framework::make_ddim(dims));
} else {
input_ptr = input->data<int8_t>();
}
auto float_input = new LoDTensor;
auto out = param->Out();
out->Resize(framework::make_ddim(dims));
out->mutable_data<int8_t>(framework::make_ddim(dims));
fpga::format_ofm(out);
PADDLE_MOBILE_ENFORCE(input->dims().size() == 4,
"Softmax should have 4-order input");
@@ -45,28 +40,10 @@ bool SoftmaxKernel<FPGA, float>::Init(SoftmaxParam<FPGA> *param) {
dims[1] = 1;
}
input->Resize(framework::make_ddim(dims));
float_input->Resize(framework::make_ddim(dims));
if (channel != 2) { // Use CPU
out->Resize(framework::make_ddim(dims));
out->mutable_data<float>(framework::make_ddim(dims));
float_input->init(type_id<float>().hash_code());
float_input->mutable_data<float>(framework::make_ddim(dims));
fpga::BypassArgs args = {fpga::DATA_TYPE_FP16};
args.input_layout_type = fpga::LAYOUT_HWC;
args.output_layout_type = fpga::LAYOUT_CHW;
args.input_data_type = fpga::DATA_TYPE_FP16;
args.output_data_type = fpga::DATA_TYPE_FP32;
args.image.address = input_ptr;
args.image.height = (uint32_t)dims[1] * dims[0];
args.image.width = (uint32_t)dims[2];
args.image.channels = (uint32_t)dims[3];
args.output.address = float_input->data<float>();
args.output.scale_address = float_input->scale;
param->SetFloatInput(float_input);
param->SetFpgaArgs(args);
} else { // Use FPGA
if ((channel == 2) && (input->type() == type_id<int8_t>())) {
auto input_ptr = input->data<int8_t>();
float Si = input->scale[0];
int16_t slope = fpga::fp32_2_fp16(Si / 127);
fpga::format_ofm(out);
fpga::BypassArgs args = {fpga::DATA_TYPE_FP16};
args.input_layout_type = fpga::LAYOUT_HWC;
@@ -77,10 +54,28 @@ bool SoftmaxKernel<FPGA, float>::Init(SoftmaxParam<FPGA> *param) {
args.image.height = (uint32_t)input->dims()[1];
args.image.width = (uint32_t)input->dims()[2];
args.image.channels = (uint32_t)input->dims()[3];
args.output.address = out->data<half>();
args.output.address = out->data<int8_t>();
args.output.scale_address = out->scale;
args.output.activation.activation_type = fpga::SOFTMAX;
args.output.activation.leaky_relu_negative_slope = slope;
param->SetFpgaArgs(args);
} else if (input->type() == type_id<int8_t>()) {
auto float_input_x = param->float_input_x_;
float_input_x = std::make_shared<Tensor>();
float_input_x->Resize(input->dims());
float_input_x->init(type_id<float>().hash_code());
fpga::format_ofm(float_input_x.get());
auto float_out = param->float_out;
float_out = std::make_shared<Tensor>();
float_out->Resize(input->dims());
float_out->init(type_id<float>().hash_code());
fpga::format_ofm(float_out.get());
} else {
auto float_out = param->float_out;
float_out = std::make_shared<Tensor>();
float_out->Resize(input->dims());
float_out->init(type_id<float>().hash_code());
fpga::format_ofm(float_out.get());
}
return true;
@@ -89,24 +84,54 @@ bool SoftmaxKernel<FPGA, float>::Init(SoftmaxParam<FPGA> *param) {
template <>
void SoftmaxKernel<FPGA, float>::Compute(const SoftmaxParam<FPGA> &param) {
auto *in_x = (param.InputX());
if (in_x->type() == type_id<int8_t>()) {
auto dims = in_x->dims();
auto n = dims[0];
auto h = dims[1];
auto w = dims[2];
auto c = dims[3];
if ((c == 2) && (in_x->type() == type_id<int8_t>())) {
fpga::PerformBypass(param.FpgaArgs());
if (param.FpgaArgs().output.activation.activation_type != fpga::SOFTMAX) {
Tensor *out = param.Out();
Tensor *in_x2 = param.FloatInput();
fpga::fpga_invalidate(in_x2->data<float>(),
in_x2->numel() * sizeof(float));
math::SoftmaxFuntor<CPU, float>()(in_x2, out);
fpga::fpga_flush(out->data<float>(), out->memory_size());
} else if (in_x->type() == type_id<int8_t>()) {
auto in_data = in_x->data<int8_t>();
float Si = in_x->scale[0];
Tensor *out = param.Out();
out->Resize(
{in_x->dims()[0], out->dims()[1], out->dims()[2], out->dims()[3]});
auto out_data = out->data<int8_t>();
auto float_input_x = param.float_input_x_;
auto float_input_x_data = float_input_x->data<float>();
int dataNum = n * h * fpga::align_to_x(w * c, IMAGE_ALIGNMENT);
for (int i = 0; i < dataNum; i++) {
float_input_x_data[i] = in_data[i] * Si / 127;
}
auto float_out = param.float_out;
auto float_out_data = float_out->data<float>();
math::SoftmaxFuntor<CPU, float>()(float_input_x.get(), float_out.get());
for (int i = 0; i < dataNum; i++) {
float tmp_out = float_out_data[i] * 127;
out_data[i] = tmp_out < 0 ? (signed char)(tmp_out - 0.5)
: (signed char)(tmp_out + 0.5);
}
fpga::fpga_flush(out_data, dataNum * sizeof(int8_t));
} else {
if (param.FpgaArgs().output.activation.activation_type != fpga::SOFTMAX) {
Tensor *out = param.Out();
out->Resize(
{in_x->dims()[0], out->dims()[1], out->dims()[2], out->dims()[3]});
math::SoftmaxFuntor<CPU, float>()(in_x, out);
Tensor *out = param.Out();
out->Resize(
{in_x->dims()[0], out->dims()[1], out->dims()[2], out->dims()[3]});
auto out_data = out->data<int8_t>();
auto float_out = param.float_out;
float_out = std::make_shared<Tensor>();
float_out->Resize(in_x->dims());
float_out->init(type_id<float>().hash_code());
fpga::format_ofm(float_out.get());
auto float_out_data = float_out->data<float>();
math::SoftmaxFuntor<CPU, float>()(in_x, float_out.get());
int dataNum = n * h * fpga::align_to_x(w * c, IMAGE_ALIGNMENT);
for (int i = 0; i < dataNum; i++) {
float tmp_out = float_out_data[i] * 127;
out_data[i] = tmp_out < 0 ? (signed char)(tmp_out - 0.5)
: (signed char)(tmp_out + 0.5);
}
fpga::fpga_flush(out_data, dataNum * sizeof(int8_t));
}
}
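The int8 branches in Compute all follow the same dequantize, CPU softmax, requantize pattern. A minimal sketch of that arithmetic, assuming the softmax output lies in [0, 1] and using the symmetric scale of 127 that the hunks above rely on:

```cpp
#include <cstdint>

// Requantize a softmax probability (expected in [0, 1]) to int8 with scale
// 127, rounding half away from zero; this mirrors the (tmp_out +/- 0.5) casts
// in the hunks above.
inline int8_t requantize_softmax(float prob) {
  const float scaled = prob * 127.0f;
  return scaled < 0 ? static_cast<int8_t>(scaled - 0.5f)
                    : static_cast<int8_t>(scaled + 0.5f);
}

// Dequantization on the way in mirrors this: float = int8_value * Si / 127,
// where Si is the input tensor's recorded scale.
inline float dequantize_input(int8_t v, float si) {
  return static_cast<float>(v) * si / 127.0f;
}
```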
@@ -1101,6 +1101,8 @@ class SoftmaxParam : public OpParam {
#ifdef PADDLE_MOBILE_FPGA
#ifdef PADDLE_MOBILE_FPGA_V1
private:
std::shared_ptr<GType> float_input_x_;
fpga::BypassArgs fpga_bypass_args;
@@ -1112,6 +1114,18 @@ class SoftmaxParam : public OpParam {
void SetFloatInput(LoDTensor *input) { float_input_x_.reset(input); }
const fpga::BypassArgs &FpgaArgs() const { return fpga_bypass_args; }
void SetFpgaArgs(const fpga::BypassArgs &args) { fpga_bypass_args = args; }
#else
private:
fpga::BypassArgs fpga_bypass_args;
public:
const fpga::BypassArgs &FpgaArgs() const { return fpga_bypass_args; }
void SetFpgaArgs(const fpga::BypassArgs &args) { fpga_bypass_args = args; }
public:
std::shared_ptr<Tensor> float_input_x_, float_out;
#endif
#endif
};
#endif