Commit ae204098, authored by qnqinan, committed by GitHub

update feed and fetch kernel in FPGA v2 track, fixes #1600 (#1601)

* update concat and split kernel and related files in FPGA v2(v3) track

* update

* update

* update kernel and related files in FPGA v2 track

* update

* update

* update kernel and related files for static quantization in FPGA v2 track

* update

* update feed and fetch kernel in FPGA v2 track
Parent c576d9d4
......@@ -836,10 +836,6 @@ void fill_dwconv_arg(struct DWconvArgs *arg, framework::Tensor *input,
int16_t leaky_relu_negative_slope, int stride_h,
int stride_w, int padding_h, int padding_w,
float *bias_ptr) {
auto deleter = [](void *p) { fpga_free(p); };
arg->vector_dwconv_space.push_back(
std::shared_ptr<char>(reinterpret_cast<char *>(bias_ptr), deleter));
auto filter_ptr = filter->data<int16_t>();
auto input_ptr = input->data<int8_t>();
auto output_ptr = out->mutable_data<int8_t>();
......
......@@ -678,10 +678,12 @@ void Executor<Device, T>::InitQuantMemory() {
auto inputs_vars = inputs[*key];
int count = inputs_vars.size();
for (int i = 0; i < count; i++) {
auto tensor = GetTensorByName(inputs_vars[i]);
tensor->scale[0] = quantValList[inputs_vars[i]];
DLOG << "input variance name : " << inputs_vars[i]
<< ", scale value : " << tensor->scale[0];
if (inputs_vars[i] != "feed") {
auto tensor = GetTensorByName(inputs_vars[i]);
tensor->scale[0] = quantValList[inputs_vars[i]];
DLOG << "input variance name : " << inputs_vars[i]
<< ", scale value : " << tensor->scale[0];
}
}
}
auto output_keys = op->GetOutKeys();
......@@ -690,10 +692,12 @@ void Executor<Device, T>::InitQuantMemory() {
auto outputs_vars = outputs[*key];
int count = outputs_vars.size();
for (int i = 0; i < count; i++) {
auto tensor = GetTensorByName(outputs_vars[i]);
tensor->scale[0] = quantValList[outputs_vars[i]];
DLOG << "output variance name : " << outputs_vars[i]
<< ", scale value : " << tensor->scale[0];
if (outputs_vars[i] != "fetch") {
auto tensor = GetTensorByName(outputs_vars[i]);
tensor->scale[0] = quantValList[outputs_vars[i]];
DLOG << "output variance name : " << outputs_vars[i]
<< ", scale value : " << tensor->scale[0];
}
}
}
}
......
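The two hunks above add the same guard on the input and output sides of InitQuantMemory: the graph's "feed" and "fetch" placeholder variables are skipped when per-tensor quantization scales are copied in, presumably because those placeholders have no entry of their own in the loaded scale table. A minimal self-contained sketch of that pattern (FakeTensor, the tensors map, and the literal scale values are illustrative stand-ins, not the executor's real types):

    #include <iostream>
    #include <map>
    #include <string>
    #include <vector>

    // Illustrative stand-in for the framework tensor; only the scale field matters here.
    struct FakeTensor {
      float scale[2] = {1.0f, 1.0f};
    };

    int main() {
      // Stand-in for quantValList: per-variable scale values loaded from the quant file.
      std::map<std::string, float> quantValList = {{"conv1_out", 0.12f},
                                                   {"fc1_out", 0.34f}};
      std::map<std::string, FakeTensor> tensors;  // stand-in for GetTensorByName()
      std::vector<std::string> inputs_vars = {"feed", "conv1_out", "fc1_out"};

      for (const auto &name : inputs_vars) {
        if (name == "feed" || name == "fetch") {
          continue;  // placeholder variables: nothing to look up for them
        }
        tensors[name].scale[0] = quantValList[name];
        std::cout << "input variable name : " << name
                  << ", scale value : " << tensors[name].scale[0] << std::endl;
      }
      return 0;
    }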
......@@ -74,7 +74,6 @@ bool ConvAddBNReluKernel<FPGA, float>::Init(
leaky_relu_negative_slope, strides[0], strides[1],
paddings[0], paddings[1], new_bias_ptr);
param->SetFpgaArgs(dwconv_arg);
fpga::fpga_free(new_scale_ptr);
fpga::fpga_free(bs_ptr);
} else {
fpga::format_conv_data(filter, out, &bs_ptr, param->Groups());
......@@ -83,9 +82,9 @@ bool ConvAddBNReluKernel<FPGA, float>::Init(
leaky_relu_negative_slope, param->Groups(), strides[0],
strides[1], paddings[0], paddings[1], bs_ptr);
param->SetFpgaArgs(conv_arg);
delete new_scale;
delete new_bias;
}
delete new_scale;
delete new_bias;
return true;
}
......
......@@ -63,6 +63,7 @@ bool ConvBNReluKernel<FPGA, float>::Init(FusionConvBNReluParam<FPGA> *param) {
param->Strides()[1], param->Paddings()[0],
param->Paddings()[1], new_bias_ptr);
param->SetFpgaArgs(dwconv_arg);
fpga::fpga_free(bs_ptr);
} else {
fpga::format_conv_data(filter, out, &bs_ptr, param->Groups());
fpga::SplitConvArgs conv_arg = {0};
......
......@@ -29,10 +29,10 @@ bool ElementwiseAddKernel<FPGA, float>::Init(ElementwiseAddParam<FPGA> *param) {
paddle_mobile::fpga::NONE;
int16_t leaky_relu_negative_slope = 0;
auto *input_x = const_cast<LoDTensor *>(param->InputX());
auto input_x_ptr = input_x->data<half>();
auto input_y_ptr = input_y->data<half>();
auto input_x_ptr = input_x->data<int8_t>();
auto input_y_ptr = input_y->data<int8_t>();
fpga::format_ofm(out);
auto out_ptr = out->mutable_data<half>();
auto out_ptr = out->mutable_data<int8_t>();
float Si_1 = input_x->scale[0];
float Si_2 = input_y->scale[0];
float So = out->scale[0];
......
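The hunk above switches the elementwise-add inputs and output from half to int8_t and reads the per-tensor scales Si_1, Si_2, and So. The fused arithmetic itself runs on the FPGA; as a hedged CPU reference for what those three scales imply (assuming real value = quantized value * scale and the sum is requantized to the output scale), something like the following could be used; elementwise_add_int8 is an illustrative name, not an API in the repository:

    #include <algorithm>
    #include <cmath>
    #include <cstdint>
    #include <vector>

    // Reference-only int8 elementwise add: dequantize both inputs with their
    // scales, add, then requantize to the output scale and saturate to int8.
    std::vector<int8_t> elementwise_add_int8(const std::vector<int8_t> &x1,
                                             const std::vector<int8_t> &x2,
                                             float Si_1, float Si_2, float So) {
      std::vector<int8_t> out(x1.size());
      for (size_t i = 0; i < x1.size(); ++i) {
        float sum = x1[i] * Si_1 + x2[i] * Si_2;         // back to real values
        int q = static_cast<int>(std::round(sum / So));  // requantize
        out[i] = static_cast<int8_t>(std::min(127, std::max(-128, q)));
      }
      return out;
    }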
......@@ -20,16 +20,9 @@ namespace operators {
template <>
bool FeedKernel<FPGA, float>::Init(FeedParam<FPGA> *param) {
auto output = param->Out();
int col = param->Col();
DLOG << "col = " << col;
auto input = const_cast<LoDTensor *>(&param->InputX()->at(col));
if (output->dims().size() != 4) {
input->init(type_id<float>().hash_code());
return true;
}
input->init(type_id<int8_t>().hash_code());
input->Resize(output->dims());
fpga::format_ofm(output);
return true;
}
......
......@@ -22,7 +22,7 @@ bool FetchKernel<FPGA, float>::Init(FetchParam<FPGA> *param) {
DLOG << "col = " << col;
auto output = &(param->Out()->at(col));
output->init(type_id<float>().hash_code());
output->Resize(input->dims());
output->mutable_data<float>(input->dims());
if (input->type() == type_id<float>()) {
return true;
}
......@@ -38,14 +38,16 @@ bool FetchKernel<FPGA, float>::Init(FetchParam<FPGA> *param) {
int unalignedCW = outC * outW;
int alignedCW = fpga::align_to_x(unalignedCW, IMAGE_ALIGNMENT);
if (alignedCW != unalignedCW) {
aligned_output.init(type_id<float>().hash_code());
aligned_output.Resize(input->dims());
fpga::format_fp32_ofm(&aligned_output);
param->aligned_out = std::make_shared<Tensor>();
param->aligned_out->Resize(input->dims());
param->aligned_out->init(type_id<float>().hash_code());
fpga::format_ofm(param->aligned_out.get());
}
return true;
}
void dealign(float *src, float *dst, int input_c, int input_h, int input_w) {
int alignCW = paddle_mobile::fpga::align_to_x(input_c * input_w, 16);
int alignCW =
paddle_mobile::fpga::align_to_x(input_c * input_w, IMAGE_ALIGNMENT);
int dealignCW = input_c * input_w;
for (int h = 0; h < input_h; ++h) {
auto input_offset = h * alignCW;
......@@ -63,11 +65,9 @@ void FetchKernel<FPGA, float>::Compute(const FetchParam<FPGA> &param) {
output->ShareDataWith(*input);
return;
}
auto input_address = input->data<int8_t>();
float Si = input->scale[0];
auto aligned_ptr = const_cast<float *>(param.aligned_out.data<float>());
auto outdata_ptr = output->data<float>();
auto outdata_ptr = const_cast<float *>(output->data<float>());
const int num_th = 32;
fpga::fpga_invalidate(input_address, (input->fpga_data_num) * sizeof(int8_t));
if (input->fpga_data_num < num_th) {
......@@ -77,9 +77,6 @@ void FetchKernel<FPGA, float>::Compute(const FetchParam<FPGA> &param) {
fpga::fpga_flush(outdata_ptr, product(input->dims()) * sizeof(float));
return;
}
for (int idx = 0; idx < input->fpga_data_num; ++idx) {
aligned_ptr[idx] = input_address[idx] * Si;
}
int outC = 1;
int outH = 1;
int outW = 1;
......@@ -90,15 +87,21 @@ void FetchKernel<FPGA, float>::Compute(const FetchParam<FPGA> &param) {
} else { // 2
outC = output->dims()[1];
}
int unalignedCW = outC * outW;
int alignedCW = fpga::align_to_x(unalignedCW, IMAGE_ALIGNMENT);
auto aligned_out = param.aligned_out.get();
if (unalignedCW != alignedCW) {
auto aligned_ptr = aligned_out->data<float>();
for (int idx = 0; idx < input->fpga_data_num; ++idx) {
aligned_ptr[idx] = input_address[idx] * Si;
}
dealign(aligned_ptr, outdata_ptr, outC, outH, outW);
fpga::fpga_flush(outdata_ptr, outC * outH * outW * sizeof(float));
return;
}
memcpy(outdata_ptr, aligned_ptr, outC * outH * outW * sizeof(float));
for (int idx = 0; idx < input->fpga_data_num; ++idx) {
outdata_ptr[idx] = input_address[idx] * Si;
}
fpga::fpga_flush(outdata_ptr, outC * outH * outW * sizeof(float));
}
template class FetchKernel<FPGA, float>;
......
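The fetch kernel above dequantizes the int8 FPGA output with its scale Si and, when C*W is not a multiple of IMAGE_ALIGNMENT, first fills aligned_out and then compacts the padded rows with dealign(). A self-contained sketch of that flow, assuming IMAGE_ALIGNMENT is 16 and the FPGA buffer holds h * aligned-row elements (fetch_to_host and dealign_rows are illustrative names, not functions from the repository):

    #include <cstdint>
    #include <cstring>
    #include <vector>

    constexpr int kImageAlignment = 16;  // assumed value of IMAGE_ALIGNMENT

    inline int align_to_x(int n, int x) { return (n + x - 1) / x * x; }

    // Copy each padded row of length align_to_x(c*w, kImageAlignment) into a
    // dense row of length c*w, mirroring what the dealign() helper does.
    void dealign_rows(const float *src, float *dst, int c, int h, int w) {
      const int alignCW = align_to_x(c * w, kImageAlignment);
      const int dealignCW = c * w;
      for (int row = 0; row < h; ++row) {
        std::memcpy(dst + row * dealignCW, src + row * alignCW,
                    dealignCW * sizeof(float));
      }
    }

    // Dequantize the int8 FPGA output with its scale Si, then drop the row padding.
    std::vector<float> fetch_to_host(const int8_t *fpga_data, float Si, int c,
                                     int h, int w) {
      const int alignCW = align_to_x(c * w, kImageAlignment);
      std::vector<float> aligned(static_cast<size_t>(h) * alignCW);
      for (size_t i = 0; i < aligned.size(); ++i) {
        aligned[i] = fpga_data[i] * Si;  // int8 -> float
      }
      std::vector<float> host(static_cast<size_t>(c) * h * w);
      dealign_rows(aligned.data(), host.data(), c, h, w);
      return host;
    }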
......@@ -1277,8 +1277,10 @@ class FetchParam : public OpParam {
public:
#ifdef PADDLE_MOBILE_FPGA_V1
fpga::BypassArgs fpga_bypass_args;
#endif
Tensor aligned_out;
#else
std::shared_ptr<Tensor> aligned_out;
#endif
#endif
};
......