diff --git a/lite/kernels/fpga/CMakeLists.txt b/lite/kernels/fpga/CMakeLists.txt old mode 100644 new mode 100755 index dc8860188043dde6538a303eef82617e46c2a6c9..7c47e72872ecae6216288c20fa1a6ae30fac65bd --- a/lite/kernels/fpga/CMakeLists.txt +++ b/lite/kernels/fpga/CMakeLists.txt @@ -4,29 +4,42 @@ endif() set(fpga_deps fpga_target_wrapper kernel_fpga) -add_kernel(activation_compute_fpga FPGA basic SRCS activation_compute.cc DEPS ${fpga_deps}) -lite_cc_test(test_acivation_fpga SRCS activation_compute_test.cc DEPS ${lite_kernel_deps} activation_compute_fpga ${fpga_deps}) +# add_kernel(activation_compute_fpga FPGA basic SRCS activation_compute.cc DEPS ${fpga_deps}) +# add_kernel(box_coder_compute_fpga FPGA basic SRCS box_coder_compute.cc DEPS ${fpga_deps}) +# add_kernel(concat_compute_fpga FPGA basic SRCS concat_compute.cc DEPS ${fpga_deps}) add_kernel(conv_compute_fpga FPGA basic SRCS conv_compute.cc DEPS ${fpga_deps}) -lite_cc_test(test_conv_fpga SRCS conv_compute_test.cc DEPS ${lite_kernel_deps} conv_compute_fpga ${fpga_deps}) - +# add_kernel(density_prior_box_compute_fpga FPGA basic SRCS density_prior_box_compute.cc DEPS ${fpga_deps}) +add_kernel(dropout_compute_fpga FPGA basic SRCS dropout_compute.cc DEPS ${fpga_deps}) add_kernel(elementwise_compute_fpga FPGA basic SRCS elementwise_compute.cc DEPS ${fpga_deps}) -lite_cc_test(test_elementwise_fpga SRCS elementwise_compute_test.cc DEPS ${lite_kernel_deps} elementwise_compute_fpga ${fpga_deps}) - +# add_kernel(feed_compute_fpga FPGA basic SRCS fc_compute.cc DEPS ${fpga_deps}) +add_kernel(fc_compute_fpga FPGA basic SRCS fc_compute.cc DEPS ${fpga_deps}) +add_kernel(gru_compute_fpga FPGA extra SRCS gru_compute.cc DEPS ${fpga_deps}) +# add_kernel(mul_compute_fpga FPGA basic SRCS mul_compute.cc DEPS ${fpga_deps}) +add_kernel(multiclass_nms_compute_fpga FPGA basic SRCS multiclass_nms_compute.cc DEPS ${fpga_deps}) +add_kernel(norm_compute_fpga FPGA basic SRCS norm_compute.cc DEPS ${fpga_deps}) +# add_kernel(im2sequence_compute_fpga FPGA basic SRCS im2sequence_compute.cc DEPS ${fpga_deps}) add_kernel(pooling_compute_fpga FPGA basic SRCS pooling_compute.cc DEPS ${fpga_deps}) -lite_cc_test(test_pooling_compute_fpga SRCS pooling_compute_test.cc DEPS ${lite_kernel_deps} pooling_compute_fpga ${fpga_deps}) - +add_kernel(prior_box_compute_fpga FPGA basic SRCS prior_box_compute.cc DEPS ${fpga_deps}) +# add_kernel(reshape_compute_fpga FPGA basic SRCS reshape_compute.cc DEPS ${fpga_deps} reshape_op) +# add_kernel(sequence_pool_compute_fpga FPGA basic SRCS sequence_pool_compute.cc DEPS ${fpga_deps}) add_kernel(scale_compute_fpga FPGA basic SRCS scale_compute.cc DEPS ${fpga_deps}) - -add_kernel(softmax_compute_fpga FPGA basic SRCS softmax_compute.cc DEPS ${fpga_deps}) -lite_cc_test(test_softmax_compute_fpga SRCS softmax_compute_test.cc DEPS ${lite_kernel_deps} softmax_compute_fpga ${fpga_deps}) - -add_kernel(fc_compute_fpga FPGA basic SRCS fc_compute.cc DEPS ${fpga_deps}) -lite_cc_test(test_fc_compute_fpga SRCS fc_compute_test.cc DEPS ${lite_kernel_deps} fc_compute_fpga ${fpga_deps}) +# add_kernel(softmax_compute_fpga FPGA basic SRCS softmax_compute.cc DEPS ${fpga_deps}) +# add_kernel(transpose_compute_fpga FPGA basic SRCS transpose_compute.cc DEPS ${fpga_deps}) add_kernel(io_copy_compute_fpga FPGA basic SRCS io_copy_compute.cc DEPS ${fpga_deps}) add_kernel(calib_compute_fpga FPGA basic SRCS calib_compute.cc DEPS ${fpga_deps}) add_kernel(layout_compute_fpga FPGA basic SRCS layout_compute.cc DEPS ${fpga_deps}) add_kernel(feed_compute_fpga FPGA basic SRCS feed_compute.cc DEPS ${fpga_deps}) add_kernel(fetch_compute_fpga FPGA basic SRCS fetch_compute.cc DEPS ${fpga_deps}) + +# add_kernel(while_compute_fpga FPGA extra SRCS while_compute.cc DEPS ${fpga_deps}) +# add_kernel(write_to_array_compute_fpga FPGA extra SRCS write_to_array_compute.cc DEPS ${fpga_deps}) + +# lite_cc_test(test_acivation_fpga SRCS activation_compute_test.cc DEPS ${lite_kernel_deps} activation_compute_fpga ${fpga_deps}) +lite_cc_test(test_conv_fpga SRCS conv_compute_test.cc DEPS ${lite_kernel_deps} conv_compute_fpga ${fpga_deps}) +lite_cc_test(test_elementwise_fpga SRCS elementwise_compute_test.cc DEPS ${lite_kernel_deps} elementwise_compute_fpga ${fpga_deps}) +lite_cc_test(test_fc_compute_fpga SRCS fc_compute_test.cc DEPS ${lite_kernel_deps} fc_compute_fpga ${fpga_deps}) +lite_cc_test(test_pooling_compute_fpga SRCS pooling_compute_test.cc DEPS ${lite_kernel_deps} pooling_compute_fpga ${fpga_deps}) +# lite_cc_test(test_softmax_compute_fpga SRCS softmax_compute_test.cc DEPS ${lite_kernel_deps} softmax_compute_fpga ${fpga_deps}) diff --git a/lite/kernels/fpga/calib_compute.cc b/lite/kernels/fpga/calib_compute.cc old mode 100644 new mode 100755 index 51902cb08ad15fa20e3b1853c44564982adc327f..26301be8dff80f0b039831f4411151c58fa50d19 --- a/lite/kernels/fpga/calib_compute.cc +++ b/lite/kernels/fpga/calib_compute.cc @@ -23,24 +23,24 @@ namespace lite { namespace kernels { namespace fpga { using float16 = zynqmp::float16; + void CalibComputeFp32ToFP16::Run() { auto& param = this->Param(); const auto* din = param.input->data(); - auto* dout = param.output->mutable_data(TARGET(kFPGA)); - - for (int i = 0; i < param.input->numel(); ++i) { - dout[i] = zynqmp::float_to_half(din[i]); - } + param.output->mutable_data(); + param.output->ZynqTensor()->copyFrom(param.input->ZynqTensor()); + auto out_lod = param.output->mutable_lod(); + *out_lod = param.input->lod(); return; } void CalibComputeFP16ToFp32::Run() { auto& param = this->Param(); const auto* din = param.input->data(); - auto* dout = param.output->mutable_data(TARGET(kFPGA)); - for (int i = 0; i < param.input->numel(); ++i) { - dout[i] = zynqmp::half_to_float(din[i]); - } + auto* dout = param.output->mutable_data(); + param.output->ZynqTensor()->copyFrom(param.input->ZynqTensor()); + auto out_lod = param.output->mutable_lod(); + *out_lod = param.input->lod(); return; } diff --git a/lite/kernels/fpga/concat_compute.cc b/lite/kernels/fpga/concat_compute.cc new file mode 100755 index 0000000000000000000000000000000000000000..ad66e3098187e6bc1c0b11afb15609bbe91dcc2f --- /dev/null +++ b/lite/kernels/fpga/concat_compute.cc @@ -0,0 +1,73 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "lite/kernels/fpga/concat_compute.h" +#include +#include +#include "lite/core/op_registry.h" +#include "lite/core/tensor.h" +#include "lite/core/type_system.h" + +#include "lite/backends/fpga/KD/debugger.hpp" + +namespace paddle { +namespace lite { +namespace kernels { +namespace fpga { + +using float16 = zynqmp::float16; + +void ConcatCompute::PrepareForRun() { + auto& param = this->Param(); + param.output->mutable_data(); + + // ==================================================== + zynqmp::ConcatParam& concat_param = pe_.param(); + for (auto t : param.x) { + concat_param.inputs.push_back(t->ZynqTensor()); + } + concat_param.output = param.output->ZynqTensor(); + concat_param.axis = param.axis; + pe_.init(); + pe_.apply(); +} + +void ConcatCompute::Run() { + pe_.dispatch(); +#ifdef FPGA_PRINT_TENSOR + zynqmp::ConcatParam& concat_param = pe_.param(); + Debugger::get_instance().registerOutput("concat", concat_param.output); +#endif +} + +} // namespace fpga +} // namespace kernels +} // namespace lite +} // namespace paddle + +REGISTER_LITE_KERNEL(concat, + kFPGA, + kFP16, + kNHWC, + paddle::lite::kernels::fpga::ConcatCompute, + def) + .BindInput("X", + {LiteType::GetTensorTy(TARGET(kFPGA), + PRECISION(kFP16), + DATALAYOUT(kNHWC))}) + .BindOutput("Out", + {LiteType::GetTensorTy(TARGET(kFPGA), + PRECISION(kFP16), + DATALAYOUT(kNHWC))}) + .Finalize(); diff --git a/lite/kernels/fpga/concat_compute.h b/lite/kernels/fpga/concat_compute.h new file mode 100755 index 0000000000000000000000000000000000000000..1b5426be33daf50aed3521a6c0dcae054ea7bac0 --- /dev/null +++ b/lite/kernels/fpga/concat_compute.h @@ -0,0 +1,45 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once +#include +#include "lite/core/kernel.h" +#include "lite/operators/concat_op.h" + +#include "lite/backends/fpga/KD/float16.hpp" +#include "lite/backends/fpga/KD/pes/concat_pe.hpp" + +namespace paddle { +namespace lite { +namespace kernels { +namespace fpga { + +class ConcatCompute + : public KernelLite { + public: + using param_t = operators::ConcatParam; + + void PrepareForRun() override; + void Run() override; + + virtual ~ConcatCompute() = default; + + private: + zynqmp::ConcatPE pe_; +}; + +} // namespace fpga +} // namespace kernels +} // namespace lite +} // namespace paddle diff --git a/lite/kernels/fpga/conv_compute.cc b/lite/kernels/fpga/conv_compute.cc old mode 100644 new mode 100755 index 8bc171dd67df08c17cdce61c6fa6882afd9ae8ae..a5a7c3b92f8ce01dacfc0518d0763cdb642bd838 --- a/lite/kernels/fpga/conv_compute.cc +++ b/lite/kernels/fpga/conv_compute.cc @@ -13,9 +13,12 @@ // limitations under the License. #include "lite/kernels/fpga/conv_compute.h" +#include #include "lite/core/op_registry.h" #include "lite/core/type_system.h" +#include "lite/backends/fpga/KD/debugger.hpp" + namespace paddle { namespace lite { namespace kernels { @@ -25,37 +28,61 @@ using float16 = zynqmp::float16; void ConvCompute::PrepareForRun() { auto& param = this->Param(); - - // ==================================================== - zynqmp::ConvParam& conv_param = pe_.param(); param.output->mutable_data(); + int pad_h = (*param.paddings)[0]; + int pad_w = (*param.paddings)[2]; + // ==================================================== + if (param.x->ZynqTensor()->shape().channel() != 1 && + param.groups == param.x->ZynqTensor()->shape().channel()) { + zynqmp::DepthwiseConvParam& conv_param = dw_conv_pe_.param(); - // filter_.setDataType(zynqmp::FP32); - conv_param.input = param.x->ZynqTensor(); - conv_param.output = param.output->ZynqTensor(); - conv_param.filter = param.filter->ZynqTensor(); - conv_param.groups = param.groups; - conv_param.strides = param.strides; - auto paddings = *param.paddings; - conv_param.paddings = param.paddings; - conv_param.dilations = param.dilations; - bool pad_equal = - ((paddings[0] == paddings[1]) && (paddings[2] == paddings[3])); - if (!pad_equal) { - LOG(FATA) << "This pad not support ! " << paddings[0] << ", " << paddings[1] - << ", " << paddings[2] << ", " << paddings[3]; + conv_param.input = param.x->ZynqTensor(); + conv_param.output = param.output->ZynqTensor(); + conv_param.filter = param.filter->ZynqTensor(); + conv_param.filter->setDataType(zynqmp::FP32); + conv_param.groups = param.groups; + conv_param.strides = param.strides; + conv_param.paddings = std::vector({pad_h, pad_w}); + conv_param.dilations = *param.dilations; + fill_scale_bias_const(&conv_param); + conv_param.bias()->copyFrom(param.bias->ZynqTensor()); + conv_param.relu.enabled = param.fuse_relu; + + dw_conv_pe_.init(); + dw_conv_pe_.apply(); + } else { + zynqmp::ConvParam& conv_param = conv_pe_.param(); + conv_param.input = param.x->ZynqTensor(); + conv_param.output = param.output->ZynqTensor(); + conv_param.filter = param.filter->ZynqTensor(); + conv_param.filter->setDataType(zynqmp::FP32); + conv_param.groups = param.groups; + conv_param.strides = param.strides; + conv_param.paddings = std::vector({pad_h, pad_w}); + conv_param.dilations = *param.dilations; + fill_scale_bias_const(&conv_param); + if (param.bias != nullptr) { + conv_param.bias()->copyFrom(param.bias->ZynqTensor()); + } + + conv_param.relu.enabled = param.fuse_relu; + conv_pe_.init(); + conv_pe_.apply(); } - fill_scale_bias_const(&conv_param); - conv_param.bias()->copyFrom(param.bias->ZynqTensor()); - conv_param.relu.enabled = param.fuse_relu; - pe_.init(); - pe_.apply(); } void ConvCompute::Run() { auto& param = this->Param(); - zynqmp::ConvParam& conv_param = pe_.param(); - pe_.dispatch(); + if (param.x->ZynqTensor()->shape().channel() != 1 && + param.groups == param.x->ZynqTensor()->shape().channel()) { + dw_conv_pe_.dispatch(); + } else { + conv_pe_.dispatch(); +#ifdef FPGA_PRINT_TENSOR + zynqmp::ConvParam& conv_param = conv_pe_.param(); + Debugger::get_instance().registerOutput("conv", conv_param.output); +#endif + } } } // namespace fpga diff --git a/lite/kernels/fpga/conv_compute.h b/lite/kernels/fpga/conv_compute.h old mode 100644 new mode 100755 index a023fb46fc8af0ad12d07929137f3eb058e92ef4..8c2b6c3704b8fe6620487bbfe94c50257b9fbcf9 --- a/lite/kernels/fpga/conv_compute.h +++ b/lite/kernels/fpga/conv_compute.h @@ -14,11 +14,13 @@ #pragma once -#include "lite/backends/fpga/KD/float16.hpp" -#include "lite/backends/fpga/KD/pes/conv_pe.hpp" #include "lite/core/kernel.h" #include "lite/operators/conv_op.h" +#include "lite/backends/fpga/KD/float16.hpp" +#include "lite/backends/fpga/KD/pes/conv_pe.hpp" +#include "lite/backends/fpga/KD/pes/depthwise_conv_pe.hpp" + namespace paddle { namespace lite { namespace kernels { @@ -36,7 +38,8 @@ class ConvCompute ~ConvCompute() {} private: - zynqmp::ConvPE pe_; + zynqmp::ConvPE conv_pe_; + zynqmp::DepthwiseConvPE dw_conv_pe_; }; } // namespace fpga diff --git a/lite/kernels/fpga/conv_compute_test.cc b/lite/kernels/fpga/conv_compute_test.cc index 1e05c1fa0c7e0f211b5eaed8f5e0385cbfe20cf2..7db855a0fe10d775cec07ad30e67f00c8230940a 100644 --- a/lite/kernels/fpga/conv_compute_test.cc +++ b/lite/kernels/fpga/conv_compute_test.cc @@ -141,15 +141,13 @@ void conv_compute_ref(const operators::ConvParam& param) { int group = param.groups; int kernel_w = param.filter->dims()[2]; int kernel_h = param.filter->dims()[3]; - - auto paddings = *param.paddings; - auto dilations = *para.dilations; int stride_w = param.strides[0]; int stride_h = param.strides[1]; - int dila_w = dilations[0]; - int dila_h = dilations[1]; - int pad_w = paddings[2]; - int pad_h = paddings[0]; + int dila_w = (*param.dilations)[0]; + int dila_h = (*param.dilations)[1]; + + int pad_w = (*param.paddings)[2]; + int pad_h = (*param.paddings)[0]; bool flag_bias = (param.bias != nullptr); bool flag_relu = param.fuse_relu; @@ -279,14 +277,11 @@ TEST(conv_fpga, compute) { param.bias = &bias; } param.fuse_relu = flag_relu; - std::vector paddings = { - padding, padding, padding, padding}; + *param.paddings = std::vector( + {padding, padding, padding, padding}); param.strides = std::vector({stride, stride}); - std::vector dilations = {dilation, dilation}; - param.paddings = - std::make_shared>(paddings); - param.dilations = - std::make_shared>(dilations); + *param.dilations = + std::vector({dilation, dilation}); param.groups = group; conv.SetParam(param); conv.Launch(); diff --git a/lite/kernels/fpga/dropout_compute.cc b/lite/kernels/fpga/dropout_compute.cc new file mode 100644 index 0000000000000000000000000000000000000000..b047a643f5cfda594a1bb56278c59f0371914e96 --- /dev/null +++ b/lite/kernels/fpga/dropout_compute.cc @@ -0,0 +1,84 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "lite/kernels/fpga/dropout_compute.h" +#include + +#include "lite/backends/fpga/KD/debugger.hpp" +#include "lite/backends/fpga/KD/float16.hpp" + +namespace paddle { +namespace lite { +namespace kernels { +namespace fpga { + +void DropoutCompute::PrepareForRun() { + auto& param = Param(); + param.output->mutable_data(); + + zynqmp::ScaleParam& scale_param = pe_.param(); + scale_param.input = param.x->ZynqTensor(); + scale_param.output = param.output->ZynqTensor(); + + int channel = scale_param.input->shape().channel(); + zynqmp::Tensor* scale = new zynqmp::Tensor(); + zynqmp::Tensor* bias = new zynqmp::Tensor(); + zynqmp::Shape shape(zynqmp::N, {channel}); + float* scale_data = scale->mutableData(zynqmp::FP32, shape); + float* bias_data = bias->mutableData(zynqmp::FP32, shape); + + float scale_value = 1 - param.dropout_prob; + for (int i = 0; i < channel; ++i) { + scale_data[i] = scale_value; + bias_data[i] = 0.0f; + } + scale->flush(); + bias->flush(); + + scale_param.bias = bias; + scale_param.scale = scale; + + pe_.init(); + pe_.apply(); +} + +void DropoutCompute::Run() { + pe_.dispatch(); +#ifdef FPGA_PRINT_TENSOR + zynqmp::ScaleParam& scale_param = pe_.param(); + Debugger::get_instance().registerOutput("dropout", scale_param.output); +#endif +} + +} // namespace fpga +} // namespace kernels +} // namespace lite +} // namespace paddle + +REGISTER_LITE_KERNEL(dropout, + kFPGA, + kFP16, + kNHWC, + paddle::lite::kernels::fpga::DropoutCompute, + def) + .BindInput("X", + {LiteType::GetTensorTy(TARGET(kFPGA), + PRECISION(kFP16), + DATALAYOUT(kNHWC))}) + .BindOutput("Out", + {LiteType::GetTensorTy(TARGET(kFPGA), + PRECISION(kFP16), + DATALAYOUT(kNHWC))}) + .BindOutput("Mask", {LiteType::GetTensorTy(TARGET(kARM))}) + .Finalize(); diff --git a/lite/kernels/fpga/dropout_compute.h b/lite/kernels/fpga/dropout_compute.h new file mode 100755 index 0000000000000000000000000000000000000000..76f640855c56660007cc1d0eba27b3971a08e9d9 --- /dev/null +++ b/lite/kernels/fpga/dropout_compute.h @@ -0,0 +1,45 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once +#include +#include "lite/core/kernel.h" +#include "lite/core/op_registry.h" + +#include "lite/backends/fpga/KD/pes/scale_pe.hpp" + +namespace paddle { +namespace lite { +namespace kernels { +namespace fpga { + +using float16 = zynqmp::float16; + +class DropoutCompute + : public KernelLite { + public: + void PrepareForRun() override; + + void Run() override; + + virtual ~DropoutCompute() = default; + + private: + zynqmp::ScalePE pe_; +}; + +} // namespace fpga +} // namespace kernels +} // namespace lite +} // namespace paddle diff --git a/lite/kernels/fpga/elementwise_compute.cc b/lite/kernels/fpga/elementwise_compute.cc old mode 100644 new mode 100755 index 2a12650ef11ca996e8cff96c4d8e54d42e2020f4..1010e5a4d6541b22c07d1d9efbc2465381081997 --- a/lite/kernels/fpga/elementwise_compute.cc +++ b/lite/kernels/fpga/elementwise_compute.cc @@ -15,6 +15,7 @@ #include "lite/kernels/fpga/elementwise_compute.h" #include #include "lite/backends/arm/math/funcs.h" +#include "lite/backends/fpga/KD/debugger.hpp" namespace paddle { namespace lite { @@ -37,7 +38,13 @@ void ElementwiseAddCompute::PrepareForRun() { pe_.init(); pe_.apply(); } -void ElementwiseAddCompute::Run() { pe_.dispatch(); } +void ElementwiseAddCompute::Run() { + pe_.dispatch(); +#ifdef FPGA_PRINT_TENSOR + zynqmp::ElementwiseAddParam& ew_param = pe_.param(); + Debugger::get_instance().registerOutput("ew_add", ew_param.output); +#endif +} void ElementwiseAddActivationCompute::PrepareForRun() { zynqmp::ElementwiseAddParam& ew_param = pe_.param(); @@ -53,7 +60,54 @@ void ElementwiseAddActivationCompute::PrepareForRun() { pe_.init(); pe_.apply(); } -void ElementwiseAddActivationCompute::Run() { pe_.dispatch(); } +void ElementwiseAddActivationCompute::Run() { + pe_.dispatch(); +#ifdef FPGA_PRINT_TENSOR + zynqmp::ElementwiseAddParam& ew_param = pe_.param(); + Debugger::get_instance().registerOutput("ew_add", ew_param.output); +#endif +} + +void ElementwiseMulCompute::PrepareForRun() { + zynqmp::ScaleParam& scale_param = pe_.param(); + auto& param = Param(); + param.Out->mutable_data(); + + scale_param.input = param.X->ZynqTensor(); + scale_param.output = param.Out->ZynqTensor(); + + scale_param.relu.enabled = false; + + int channel = scale_param.input->shape().channel(); + zynqmp::Tensor* scale = new zynqmp::Tensor(); + zynqmp::Tensor* bias = new zynqmp::Tensor(); + scale_param.scale = scale; + scale_param.bias = bias; + zynqmp::Shape shape(zynqmp::N, {channel}); + float* scale_data = scale->mutableData(zynqmp::FP32, shape); + float* bias_data = bias->mutableData(zynqmp::FP32, shape); + float scale_value = param.Y->data()[0]; + + for (int i = 0; i < channel; ++i) { + if (param.Y->dims().production() != 1) { + scale_value = param.Y->ZynqTensor()->data()[i]; + } + scale_data[i] = scale_value; + bias_data[i] = 0; + } + + pe_.init(); + pe_.apply(); +} + +void ElementwiseMulCompute::Run() { + pe_.dispatch(); +#ifdef FPGA_PRINT_TENSOR + zynqmp::ScaleParam& scale_param = pe_.param(); + Debugger::get_instance().registerOutput("ew_mul_in", scale_param.input); + Debugger::get_instance().registerOutput("ew_mul", scale_param.output); +#endif +} } // namespace fpga } // namespace kernels @@ -70,10 +124,7 @@ REGISTER_LITE_KERNEL(elementwise_add, {LiteType::GetTensorTy(TARGET(kFPGA), PRECISION(kFP16), DATALAYOUT(kNHWC))}) - .BindInput("Y", - {LiteType::GetTensorTy(TARGET(kFPGA), - PRECISION(kFP16), - DATALAYOUT(kNHWC))}) + .BindInput("Y", {LiteType::GetTensorTy(TARGET(kARM))}) .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kFPGA), PRECISION(kFP16), @@ -100,3 +151,20 @@ REGISTER_LITE_KERNEL( PRECISION(kFP16), DATALAYOUT(kNHWC))}) .Finalize(); + +REGISTER_LITE_KERNEL(elementwise_mul, + kFPGA, + kFP16, + kNHWC, + paddle::lite::kernels::fpga::ElementwiseMulCompute, + def) + .BindInput("X", + {LiteType::GetTensorTy(TARGET(kFPGA), + PRECISION(kFP16), + DATALAYOUT(kNHWC))}) + .BindInput("Y", {LiteType::GetTensorTy(TARGET(kARM))}) + .BindOutput("Out", + {LiteType::GetTensorTy(TARGET(kFPGA), + PRECISION(kFP16), + DATALAYOUT(kNHWC))}) + .Finalize(); diff --git a/lite/kernels/fpga/elementwise_compute.h b/lite/kernels/fpga/elementwise_compute.h index 7051dd7eeda02537be713ff042a0cf33ac1b618d..e3e9c52c4c660e9ae6852f2ec8cdd815829ad524 100644 --- a/lite/kernels/fpga/elementwise_compute.h +++ b/lite/kernels/fpga/elementwise_compute.h @@ -16,6 +16,7 @@ #include #include "lite/backends/fpga/KD/float16.hpp" #include "lite/backends/fpga/KD/pes/elementwise_add_pe.hpp" +#include "lite/backends/fpga/KD/pes/scale_pe.hpp" #include "lite/core/kernel.h" #include "lite/core/op_registry.h" @@ -50,6 +51,18 @@ class ElementwiseAddActivationCompute zynqmp::ElementwiseAddPE pe_; }; +class ElementwiseMulCompute + : public KernelLite { + public: + void PrepareForRun() override; + void Run() override; + + virtual ~ElementwiseMulCompute() = default; + + private: + zynqmp::ScalePE pe_; +}; + } // namespace fpga } // namespace kernels } // namespace lite diff --git a/lite/kernels/fpga/fc_compute.cc b/lite/kernels/fpga/fc_compute.cc index dca6dbce16c082fb14cefce6ec4da2e53c61e8e0..0c76bf0b41e45ad0bcaa10e97011e26449a3ad7d 100644 --- a/lite/kernels/fpga/fc_compute.cc +++ b/lite/kernels/fpga/fc_compute.cc @@ -13,6 +13,7 @@ // limitations under the License. #include "lite/kernels/fpga/fc_compute.h" +#include "lite/backends/fpga/KD/debugger.hpp" #include "lite/core/op_registry.h" #include "lite/core/type_system.h" @@ -30,7 +31,6 @@ void FcCompute::PrepareForRun() { zynqmp::FullyConnectedParam& fc_param = pe_.param(); param.output->mutable_data(); - fc_param.input = param.input->ZynqTensor(); fc_param.output = param.output->ZynqTensor(); fc_param.filter = param.w->ZynqTensor(); @@ -41,8 +41,11 @@ void FcCompute::PrepareForRun() { } void FcCompute::Run() { - auto& param = this->Param(); pe_.dispatch(); +#ifdef FPGA_PRINT_TENSOR + zynqmp::FullyConnectedParam& fc_param = pe_.param(); + Debugger::get_instance().registerOutput("fc", fc_param.output); +#endif } } // namespace fpga diff --git a/lite/kernels/fpga/fc_compute.h b/lite/kernels/fpga/fc_compute.h old mode 100644 new mode 100755 index f20419f02b91d02d8d7c6ec7a573a4ead23b3ba7..4cdd1a1abb54fb31962306fde6114a0c51fc10ed --- a/lite/kernels/fpga/fc_compute.h +++ b/lite/kernels/fpga/fc_compute.h @@ -37,10 +37,6 @@ class FcCompute private: zynqmp::FullyConnectedPE pe_; - zynqmp::Tensor input_; - zynqmp::Tensor output_; - zynqmp::Tensor filter_; - zynqmp::Tensor bias_; }; } // namespace fpga diff --git a/lite/kernels/fpga/feed_compute.cc b/lite/kernels/fpga/feed_compute.cc old mode 100644 new mode 100755 index 29c080888bb3cbba80e84278b0399b5eadbfa27f..7670bf0007def88c27c12ea54c569a7fcf263693 --- a/lite/kernels/fpga/feed_compute.cc +++ b/lite/kernels/fpga/feed_compute.cc @@ -13,6 +13,7 @@ // limitations under the License. #include "lite/kernels/fpga/feed_compute.h" +#include "lite/backends/fpga/KD/debugger.hpp" #include "lite/core/op_registry.h" #include "lite/core/type_system.h" @@ -25,21 +26,29 @@ using float16 = zynqmp::float16; void FeedCompute::PrepareForRun() { auto& param = this->Param(); - // ==================================================== - zynqmp::InputParam& conv_param = pe_.param(); Tensor& x = param.feed_list->at(param.col); - param.out->Resize(x.dims()); param.out->mutable_data(); - conv_param.input = x.ZynqTensor(); - conv_param.output = param.out->ZynqTensor(); + // ==================================================== + zynqmp::InputParam& feed_param = pe_.param(); + feed_param.input = x.ZynqTensor(); + feed_param.output = param.out->ZynqTensor(); pe_.init(); pe_.apply(); } void FeedCompute::Run() { auto& param = this->Param(); + Tensor& x = param.feed_list->at(param.col); pe_.dispatch(); + + auto out_lod = param.out->mutable_lod(); + *out_lod = x.lod(); + +#ifdef FPGA_PRINT_TENSOR + zynqmp::InputParam& feed_param = pe_.param(); + Debugger::get_instance().registerOutput("feed", feed_param.output); +#endif } } // namespace fpga @@ -50,7 +59,7 @@ void FeedCompute::Run() { REGISTER_LITE_KERNEL( feed, kFPGA, kFP16, kNHWC, paddle::lite::kernels::fpga::FeedCompute, def) .BindInput("X", - {LiteType::GetTensorTy(TARGET(kFPGA), + {LiteType::GetTensorTy(TARGET(kHost), PRECISION(kFloat), DATALAYOUT(kNHWC))}) .BindOutput("Out", diff --git a/lite/kernels/fpga/feed_compute.h b/lite/kernels/fpga/feed_compute.h old mode 100644 new mode 100755 index a15ba5636f1d767129ff42f79379670c840a1aa2..5d74b4a0660deafa5b0815ff5ac048c36301f615 --- a/lite/kernels/fpga/feed_compute.h +++ b/lite/kernels/fpga/feed_compute.h @@ -32,8 +32,6 @@ class FeedCompute private: zynqmp::InputPE pe_; - zynqmp::Tensor input_; - zynqmp::Tensor output_; }; } // namespace fpga diff --git a/lite/kernels/fpga/fetch_compute.cc b/lite/kernels/fpga/fetch_compute.cc index cf4cf2d3e6d50aaff5c4a77bc7f6d4bf2d539020..9b5f3f60232bb8527f823395693cf3b3851bc04e 100644 --- a/lite/kernels/fpga/fetch_compute.cc +++ b/lite/kernels/fpga/fetch_compute.cc @@ -12,6 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. #include "lite/kernels/fpga/fetch_compute.h" +#include "lite/backends/fpga/KD/debugger.hpp" #include "lite/core/op_registry.h" #include "lite/core/type_system.h" @@ -25,35 +26,65 @@ using float16 = zynqmp::float16; void FetchCompute::PrepareForRun() { auto& param = this->Param(); // ==================================================== - zynqmp::OutputParam& conv_param = pe_.param(); + zynqmp::OutputParam& fetch_param = pe_.param(); auto fetch_list = param.fetch_list; if (fetch_list->size() <= static_cast(param.col)) { fetch_list->resize(param.col + 1); } Tensor& out = param.fetch_list->at(param.col); out.Resize(param.input->dims()); - out.mutable_data(); + out.mutable_data(); - conv_param.input = param.input->ZynqTensor(); - conv_param.output = out.ZynqTensor(); + fetch_param.input = param.input->ZynqTensor(); + fetch_param.output = out.ZynqTensor(); pe_.init(); pe_.apply(); } -void FetchCompute::Run() { pe_.dispatch(); } +void FetchCompute::Run() { + pe_.dispatch(); + auto& param = this->Param(); + +#ifdef FPGA_PRINT_TENSOR + zynqmp::OutputParam& fetch_param = pe_.param(); + Debugger::get_instance().registerOutput("fetch", fetch_param.output); +#endif +} } // namespace fpga } // namespace kernels } // namespace lite } // namespace paddle -REGISTER_LITE_KERNEL( - fetch, kFPGA, kFP16, kNHWC, paddle::lite::kernels::fpga::FetchCompute, def) +REGISTER_LITE_KERNEL(fetch, + kFPGA, + kFP16, + kNHWC, + paddle::lite::kernels::fpga::FetchCompute, + fpga_host) + .BindInput("X", + {LiteType::GetTensorTy(TARGET(kFPGA), + PRECISION(kAny), + DATALAYOUT(kAny))}) + .BindOutput("Out", + {LiteType::GetTensorTy(TARGET(kHost), + PRECISION(kAny), + DATALAYOUT(kAny))}) + .Finalize(); + +REGISTER_LITE_KERNEL(fetch, + kFPGA, + kFP16, + kNHWC, + paddle::lite::kernels::fpga::FetchCompute, + host_host) .BindInput("X", - {LiteType::GetTensorTy( - TARGET(kHost), PRECISION(kAny), DATALAYOUT(kAny), -1)}) + {LiteType::GetTensorTy(TARGET(kHost), + PRECISION(kAny), + DATALAYOUT(kAny))}) .BindOutput("Out", - {LiteType::GetTensorTy( - TARGET(kHost), PRECISION(kAny), DATALAYOUT(kAny), -1)}) + {LiteType::GetTensorTy(TARGET(kHost), + PRECISION(kAny), + DATALAYOUT(kAny))}) .Finalize(); diff --git a/lite/kernels/fpga/fetch_compute.h b/lite/kernels/fpga/fetch_compute.h old mode 100644 new mode 100755 index d86fa905c422dc759b23b3b8603e3b4436c5bdf4..b6804f0a6a4b624a9b7968d5121e9d5440202adb --- a/lite/kernels/fpga/fetch_compute.h +++ b/lite/kernels/fpga/fetch_compute.h @@ -31,8 +31,6 @@ class FetchCompute private: zynqmp::OutputPE pe_; - zynqmp::Tensor input_; - zynqmp::Tensor output_; }; } // namespace fpga diff --git a/lite/kernels/fpga/gru_compute.cc b/lite/kernels/fpga/gru_compute.cc new file mode 100755 index 0000000000000000000000000000000000000000..25fdcb505bcc8221da74b8cc87dc4fbec86b6190 --- /dev/null +++ b/lite/kernels/fpga/gru_compute.cc @@ -0,0 +1,212 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +#include +#include +#include +#include + +#include "lite/api/paddle_place.h" +#include "lite/backends/arm/math/funcs.h" +#include "lite/backends/arm/math/gru_utils.h" +#include "lite/backends/arm/math/sequence2batch.h" +#include "lite/backends/arm/math/sgemm.h" +#include "lite/core/op_registry.h" +#include "lite/core/tensor.h" +#include "lite/core/type_system.h" +#include "lite/kernels/fpga/gru_compute.h" + +#include "lite/backends/fpga/KD/debugger.hpp" +#include "lite/backends/fpga/KD/pes/gru_util.hpp" + +namespace paddle { +namespace lite { +namespace kernels { +namespace fpga { + +using float16 = zynqmp::float16; + +inline lite_api::ActivationType get_gru_act_type(const std::string& type) { + if (type == "sigmoid") { + return lite_api::ActivationType::kSigmoid; + } else if (type == "tanh") { + return lite_api::ActivationType::kTanh; + } else if (type == "relu") { + return lite_api::ActivationType::kRelu; + } else if (type == "identity") { + return lite_api::ActivationType::kIndentity; + } else { + LOG(FATAL) << "unsupported activation type: " << type; + } +} + +void GRUCompute::PrepareForRun() { + auto& param = this->Param(); + param.hidden->mutable_data(); + + auto input = param.input; + auto h0 = param.h0; + auto weight = param.weight; + auto bias = param.bias; + + zynqmp::GRUParam& gru_param = pe_.param(); + gru_param.input = input->ZynqTensor(); + if (h0 != nullptr) { + gru_param.h0 = h0->ZynqTensor(); + } + gru_param.weight = weight->ZynqTensor(); + gru_param.bias = bias->ZynqTensor(); + + gru_param.batch_gate = param.batch_gate->ZynqTensor(); + gru_param.batch_reset_hidden_prev = + param.batch_reset_hidden_prev->ZynqTensor(); + gru_param.batch_hidden = param.batch_hidden->ZynqTensor(); + gru_param.hidden = param.hidden->ZynqTensor(); + + gru_param.gate_activation = param.gate_activation; + gru_param.activation = param.activation; + + pe_.init(); + pe_.apply(); +} + +void GRUCompute::Run() { + auto& param = this->Param(); + param.hidden->mutable_data(); + // inputs + auto input = param.input; + auto h0 = param.h0; + auto weight = param.weight; + auto bias = param.bias; + // outputs + auto batch_gate = param.batch_gate; + auto batch_reset_hidden_prev = param.batch_reset_hidden_prev; + auto batch_hidden = param.batch_hidden; + auto hidden = param.hidden; + + auto hidden_dims = hidden->dims(); + int frame_size = hidden_dims[1]; + auto batch_size = input->dims()[0]; + + const float* weight_data = weight->data(); + float* batch_gate_data = batch_gate->mutable_data(); + + lite::arm::math::LoDTensor2BatchFunctor to_batch; + to_batch(*input, batch_gate, true, param.is_reverse); // 1. + + if (bias) { + auto bias_data = bias->data(); // 2. + lite::arm::math::gru_add_with_bias(batch_gate_data, + bias_data, + batch_gate_data, + batch_size, + frame_size * 3); + } + + zynqmp::GRUTensors gru_tensors; + lite::arm::math::GRUMetaValue gru_value; + gru_value.gate_weight = const_cast(weight_data); + gru_value.state_weight = + const_cast(weight_data + 2 * frame_size * frame_size); + + Tensor ordered_h0; + std::vector order(batch_gate->lod()[2]); + + if (h0) { + // Since the batch computing for GRU reorders the input sequences + // according to their length. The initialized cell state also needs + // to reorder. + // lite::arm::math::ReorderInitState(*h0, order, &ordered_h0, true); + // //3. + gru_value.prev_out_value = ordered_h0.mutable_data(); + gru_tensors.pre_output = ordered_h0.ZynqTensor(); + } else { + gru_value.prev_out_value = nullptr; + gru_tensors.pre_output = nullptr; + } + auto batch_starts = batch_gate->lod()[0]; + size_t seq_len = batch_starts.size() - 1; + auto active_node = get_gru_act_type(param.activation); + auto active_gate = get_gru_act_type(param.gate_activation); + + save_float(gru_value.gate_weight, "_gate_weight.txt", weight->numel()); + batch_gate->ZynqTensor()->saveToFile("batch_gate.txt"); + + zynqmp::Tensor float_input; + zynqmp::Tensor hidden_out; + + for (size_t n = 0; n < seq_len; n++) { + int bstart = static_cast(batch_starts[n]); + int bend = static_cast(batch_starts[n + 1]); + int cur_batch_size = bend - bstart; + + gru_value.output_value = + batch_hidden->mutable_data() + bstart * batch_hidden->dims()[1]; + gru_value.gate_value = + batch_gate->mutable_data() + bstart * batch_gate->dims()[1]; + gru_value.reset_output_value = + batch_reset_hidden_prev->mutable_data() + + bstart * batch_reset_hidden_prev->dims()[1]; + + zynqmp::Shape float_input_shape(zynqmp::NC, + {cur_batch_size, batch_gate->dims()[1]}); + float* float_data = + float_input.mutableData(zynqmp::FP32, float_input_shape); + memcpy(float_data, + gru_value.gate_value, + batch_gate->dims()[1] * sizeof(float)); + float_input.flush(); + + float* hidden_data = + hidden_out.mutableData(zynqmp::FP32, float_input_shape); + gru_tensors.gate = &float_input; + gru_tensors.output = &hidden_out; + + pe_.GRUCOmpute(gru_tensors, + frame_size, + cur_batch_size, + active_node, + active_gate, + param.origin_mode); + + // TODO(chonwhite): copy data back to original tensor; + + gru_tensors.pre_output = gru_tensors.output; + } + lite::arm::math::Batch2LoDTensorFunctor to_seq; // 5. + *(batch_hidden->mutable_lod()) = batch_gate->lod(); + batch_hidden->mutable_data(); + to_seq(*batch_hidden, hidden); + + save_tensor(const_cast(input), "_input.txt"); + save_tensor(hidden, "_gru.txt"); + + exit(-1); +} + +} // namespace fpga +} // namespace kernels +} // namespace lite +} // namespace paddle + +REGISTER_LITE_KERNEL( + gru, kFPGA, kFP16, kNHWC, paddle::lite::kernels::fpga::GRUCompute, def) + .BindInput("Input", {LiteType::GetTensorTy(TARGET(kARM))}) + .BindInput("H0", {LiteType::GetTensorTy(TARGET(kARM))}) + .BindInput("Weight", {LiteType::GetTensorTy(TARGET(kARM))}) + .BindInput("Bias", {LiteType::GetTensorTy(TARGET(kARM))}) + .BindOutput("BatchGate", {LiteType::GetTensorTy(TARGET(kARM))}) + .BindOutput("BatchResetHiddenPrev", {LiteType::GetTensorTy(TARGET(kARM))}) + .BindOutput("BatchHidden", {LiteType::GetTensorTy(TARGET(kARM))}) + .BindOutput("Hidden", {LiteType::GetTensorTy(TARGET(kARM))}) + .Finalize(); diff --git a/lite/kernels/fpga/gru_compute.h b/lite/kernels/fpga/gru_compute.h new file mode 100644 index 0000000000000000000000000000000000000000..8e3a31da0ba15d07a574805eaf7235cebeccf533 --- /dev/null +++ b/lite/kernels/fpga/gru_compute.h @@ -0,0 +1,56 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once +#include +#include "lite/core/kernel.h" + +#include "lite/backends/fpga/KD/pes/elementwise_add_pe.hpp" +#include "lite/backends/fpga/KD/pes/fully_connected_pe.hpp" +#include "lite/backends/fpga/KD/pes/gru_pe.hpp" + +namespace paddle { +namespace lite { +namespace kernels { +namespace fpga { + +class GRUCompute + : public KernelLite { + public: + using param_t = operators::GRUParam; + + GRUCompute() = default; + + void PrepareForRun() override; + + void Run() override; + + virtual ~GRUCompute() = default; + + private: + zynqmp::Tensor pre_output_; + zynqmp::Tensor pre_bias_; + zynqmp::Tensor weight_; + + zynqmp::ElementwiseAddPE bias_ew_pe_; + zynqmp::FullyConnectedPE pre_out_pe_; + zynqmp::FullyConnectedPE reset_out_pe_; + + zynqmp::GRUPE pe_; +}; + +} // namespace fpga +} // namespace kernels +} // namespace lite +} // namespace paddle diff --git a/lite/kernels/fpga/im2sequence_compute.cc b/lite/kernels/fpga/im2sequence_compute.cc new file mode 100644 index 0000000000000000000000000000000000000000..27acd07f21bb187573df44ff8267c5279d720fed --- /dev/null +++ b/lite/kernels/fpga/im2sequence_compute.cc @@ -0,0 +1,238 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include + +#include "lite/api/paddle_place.h" +#include "lite/core/op_registry.h" +#include "lite/core/type_system.h" +#include "lite/kernels/fpga/im2sequence_compute.h" + +#include "lite/backends/fpga/KD/float16.hpp" + +namespace paddle { +namespace lite { +namespace kernels { +namespace fpga { + +using float16 = zynqmp::float16; + +void im2sequence(const float16* input, + const int input_c, + const int input_h, + const int input_w, + const int kernel_h, + const int kernel_w, + const int pad_top, + const int pad_bottom, + const int pad_left, + const int pad_right, + const int stride_h, + const int stride_w, + const int out_h, + const int out_w, + float16* out) { + int window_size = kernel_h * kernel_w; + int out_rows = out_h * out_w; + int out_cols = input_c * window_size; + int H_pad = input_h + pad_top + pad_bottom; + int W_pad = input_w + pad_left + pad_right; + + float16 zero = zynqmp::float_to_half(0.0f); + + for (int h_id = 0; h_id < out_h; h_id++) { + for (int w_id = 0; w_id < out_w; w_id++) { + // consider dilation. + int start_h = h_id * stride_h - pad_top; + int start_w = w_id * stride_w - pad_left; + for (int c_id = 0; c_id < input_c; c_id++) { + for (int k_h_id = 0; k_h_id < kernel_h; k_h_id++) { + int in_h_id = start_h + k_h_id; + bool exceed_flag = (in_h_id < 0) || (in_h_id >= H_pad); + int out_start_id = + (h_id * out_w + w_id) * out_cols + c_id * window_size; + for (int k_w_id = 0; k_w_id < kernel_w; k_w_id++) { + int in_w_id = start_w + k_w_id; + exceed_flag = exceed_flag || (in_w_id < 0) || (in_w_id >= W_pad); + int input_id = (c_id * input_h + in_h_id) * input_w + in_w_id; + int out_id = out_start_id + k_h_id * kernel_w + k_w_id; + out[out_id] = exceed_flag ? zero : input[input_id]; + } + } + } + } + } +} + +template +void hwc_to_chw(T* chw_data, + const T* hwc_data, + int num, + int channel, + int height, + int width) { + int chw = channel * height * width; + int wc = width * channel; + int wh = width * height; + int index = 0; + for (int n = 0; n < num; n++) { + for (int h = 0; h < height; h++) { + for (int w = 0; w < width; w++) { + for (int c = 0; c < channel; c++) { + chw_data[n * chw + c * wh + h * width + w] = hwc_data[index]; + index++; + } + } + } + } +} + +void Im2SequenceCompute::PrepareForRun() {} + +void Im2SequenceCompute::Run() { + auto& param = this->Param(); + auto kernels = param.kernels; + auto strides = param.strides; + auto paddings = param.paddings; + + const auto* x_data = param.X->data(); + float16* o_data = + reinterpret_cast(param.Out->mutable_data()); + + float16* o2 = o_data; + + auto input_dims = param.X->dims(); + int im_num = input_dims[0]; + int im_size = param.X->numel() / im_num; + + param.X->ZynqTensor()->syncToCPU(); + float16* chw_data = new float16[param.X->numel()]; + hwc_to_chw(chw_data, + x_data, + param.X->dims()[0], + param.X->dims()[1], + param.X->dims()[2], + param.X->dims()[3]); + + const float16* in = chw_data; + + int out_cols = input_dims[1] * kernels[0] * kernels[1]; + + int total_rows = 0; + std::vector im_offset; + im_offset.push_back(total_rows); + if (param.Y) { + const auto* y_data = param.Y->data(); + auto out_strides = param.out_strides; + std::vector im_real_h; + std::vector im_real_w; + std::vector out_h_vec; + std::vector out_w_vec; + for (int im_id = 0; im_id < im_num; im_id++) { + int real_h = y_data[im_id * 2 + 0]; + int real_w = y_data[im_id * 2 + 1]; + int tmp_real_h = (real_h + out_strides[0] - 1) / out_strides[0]; + int tmp_real_w = (real_w + out_strides[1] - 1) / out_strides[1]; + im_real_h.push_back(tmp_real_h); + im_real_w.push_back(tmp_real_w); + int out_h = + (tmp_real_h + paddings[0] + paddings[1] - kernels[0]) / strides[0] + + 1; + int out_w = + (tmp_real_w + paddings[2] + paddings[3] - kernels[1]) / strides[1] + + 1; + out_h_vec.push_back(out_h); + out_w_vec.push_back(out_w); + total_rows += out_h * out_w; + im_offset.push_back(total_rows); + } + auto out_dims = param.Out->dims(); + out_dims[0] = total_rows; + param.Out->Resize(out_dims); + + int out_offset = 0; + for (int im_id = 0; im_id < im_num; im_id++) { + im2sequence(in + im_id * im_size, + input_dims[1], + input_dims[2], + input_dims[3], + param.kernels[0], + param.kernels[1], + param.paddings[0], + param.paddings[1], + param.paddings[2], + param.paddings[3], + param.strides[0], + param.strides[1], + out_h_vec[im_id], + out_w_vec[im_id], + o2 + im_offset[im_id] * out_cols); + } + } else { + int out_h = + (input_dims[2] + paddings[0] + paddings[1] - kernels[0]) / strides[0] + + 1; + int out_w = + (input_dims[3] + paddings[2] + paddings[3] - kernels[1]) / strides[1] + + 1; + for (int im_id = 0; im_id < im_num; im_id++) { + int out_size_per_im = out_h * out_w * out_cols; + im2sequence(in + im_id * im_size, + input_dims[1], + input_dims[2], + input_dims[3], + param.kernels[0], + param.kernels[1], + param.paddings[0], + param.paddings[1], + param.paddings[2], + param.paddings[3], + param.strides[0], + param.strides[1], + out_h, + out_w, + o2 + im_id * out_size_per_im); + im_offset.push_back(uint64_t((im_id + 1) * out_h * out_w)); + } + auto lod = param.Out->mutable_lod(); + lod->resize(1); + (*lod)[0] = im_offset; + } + + delete[] chw_data; + param.Out->ZynqTensor()->flush(); + param.Out->ZynqTensor()->copyScaleFrom(param.X->ZynqTensor()); +} + +} // namespace fpga +} // namespace kernels +} // namespace lite +} // namespace paddle + +REGISTER_LITE_KERNEL(im2sequence, + kFPGA, + kFP16, + kNHWC, + paddle::lite::kernels::fpga::Im2SequenceCompute, + def) + .BindInput("X", + {LiteType::GetTensorTy(TARGET(kFPGA), + PRECISION(kFP16), + DATALAYOUT(kNHWC))}) + .BindInput("Y", {LiteType::GetTensorTy(TARGET(kARM))}) + .BindOutput("Out", + {LiteType::GetTensorTy(TARGET(kFPGA), + PRECISION(kFP16), + DATALAYOUT(kNHWC))}) + .Finalize(); diff --git a/lite/kernels/fpga/im2sequence_compute.h b/lite/kernels/fpga/im2sequence_compute.h new file mode 100644 index 0000000000000000000000000000000000000000..362f801b7d1477c9685d8d077aebc9c10e5000ea --- /dev/null +++ b/lite/kernels/fpga/im2sequence_compute.h @@ -0,0 +1,43 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once +#include +// #include "lite/backends/arm/math/type_trans.h" +#include "lite/core/kernel.h" +#include "lite/operators/im2sequence_op.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace fpga { + +class Im2SequenceCompute + : public KernelLite { + public: + using param_t = operators::Im2SequenceParam; + + void PrepareForRun() override; + + void Run() override; + + ~Im2SequenceCompute() {} + + private: +}; + +} // namespace fpga +} // namespace kernels +} // namespace lite +} // namespace paddle diff --git a/lite/kernels/fpga/io_copy_compute.cc b/lite/kernels/fpga/io_copy_compute.cc index df85a03894fd5d3bd2b2265a6bffa5a458aaffe5..10a0e3116b920a2f408606ef211f408ed2279f60 100644 --- a/lite/kernels/fpga/io_copy_compute.cc +++ b/lite/kernels/fpga/io_copy_compute.cc @@ -45,7 +45,23 @@ class IoCopyHostToFpgaCompute auto& param = Param(); CHECK(param.x->target() == TARGET(kHost) || param.x->target() == TARGET(kFPGA)); - param.y->CopyDataFrom(*param.x); + param.y->mutable_data(); + if (param.x->ZynqTensor()->aligned() && + param.x->ZynqTensor()->shape().shouldAlign()) { + zynqmp::Tensor tempTensor; + tempTensor.mutableData(zynqmp::FP16, + param.x->ZynqTensor()->shape()); + tempTensor.copyFrom(param.x->ZynqTensor()); + tempTensor.setAligned(true); + tempTensor.unalignImage(); + param.y->ZynqTensor()->copyFrom(&tempTensor); + } else { + param.y->ZynqTensor()->copyFrom(param.x->ZynqTensor()); + } + param.y->ZynqTensor()->invalidate(); + param.y->ZynqTensor()->copyScaleFrom(param.x->ZynqTensor()); + auto out_lod = param.y->mutable_lod(); + *out_lod = param.x->lod(); } std::unique_ptr GetTypeInferHandler() override { @@ -81,7 +97,27 @@ class IoCopyFpgaToHostCompute auto& param = Param(); CHECK(param.x->target() == TARGET(kHost) || param.x->target() == TARGET(kFPGA)); - param.y->CopyDataFrom(*param.x); + + param.y->mutable_data(); + param.y->ZynqTensor()->setDataType(zynqmp::FP32); + param.x->ZynqTensor()->syncToDevice(); + + if (param.x->ZynqTensor()->aligned() && + param.x->ZynqTensor()->shape().shouldAlign()) { + zynqmp::Tensor tempTensor; + tempTensor.mutableData(zynqmp::FP16, + param.x->ZynqTensor()->shape()); + tempTensor.copyFrom(param.x->ZynqTensor()); + tempTensor.setAligned(true); + tempTensor.unalignImage(); + param.y->ZynqTensor()->copyFrom(&tempTensor); + } else { + param.y->ZynqTensor()->copyFrom(param.x->ZynqTensor()); + } + param.y->ZynqTensor()->copyScaleFrom(param.x->ZynqTensor()); + param.y->ZynqTensor()->flush(); + auto out_lod = param.y->mutable_lod(); + *out_lod = param.x->lod(); } std::string doc() const override { return "Copy IO from FPGA to HOST"; } @@ -100,12 +136,27 @@ REGISTER_LITE_KERNEL(io_copy, host_to_device) .BindInput("Input", {LiteType::GetTensorTy(TARGET(kHost), - PRECISION(kFloat), - DATALAYOUT(kNCHW))}) + PRECISION(kAny), + DATALAYOUT(kAny))}) .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kFPGA), - PRECISION(kFloat), - DATALAYOUT(kNCHW))}) + PRECISION(kAny), + DATALAYOUT(kAny))}) + .Finalize(); + +REGISTER_LITE_KERNEL(io_copy, + kFPGA, + kAny, + kAny, + paddle::lite::kernels::fpga::IoCopyHostToFpgaCompute, + host_to_device_any_any) + .BindInput("Input", + {LiteType::GetTensorTy( + TARGET(kHost), PRECISION(kAny), DATALAYOUT(kAny), -1)}) + .BindOutput("Out", + {LiteType::GetTensorTy(TARGET(kFPGA), + PRECISION(kFP16), + DATALAYOUT(kNHWC))}) .Finalize(); REGISTER_LITE_KERNEL(io_copy, @@ -119,9 +170,25 @@ REGISTER_LITE_KERNEL(io_copy, PRECISION(kFP16), DATALAYOUT(kNHWC))}) .BindOutput("Out", - {LiteType::GetTensorTy(TARGET(kHost), - PRECISION(kAny), - DATALAYOUT(kAny))}) + {LiteType::GetTensorTy(TARGET(kARM), + PRECISION(kFloat), + DATALAYOUT(kNHWC))}) + .Finalize(); + +REGISTER_LITE_KERNEL(io_copy, + kFPGA, + kAny, + kAny, + paddle::lite::kernels::fpga::IoCopyFpgaToHostCompute, + device_to_host_22) + .BindInput("Input", + {LiteType::GetTensorTy(TARGET(kFPGA), + PRECISION(kFP16), + DATALAYOUT(kNHWC))}) + .BindOutput("Out", + {LiteType::GetTensorTy(TARGET(kARM), + PRECISION(kFloat), + DATALAYOUT(kNCHW))}) .Finalize(); REGISTER_LITE_KERNEL(io_copy_once, @@ -132,12 +199,12 @@ REGISTER_LITE_KERNEL(io_copy_once, host_to_device_once) .BindInput("Input", {LiteType::GetTensorTy(TARGET(kHost), - PRECISION(kFloat), - DATALAYOUT(kNCHW))}) + PRECISION(kAny), + DATALAYOUT(kAny))}) .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kFPGA), - PRECISION(kFloat), - DATALAYOUT(kNCHW))}) + PRECISION(kAny), + DATALAYOUT(kAny))}) .Finalize(); REGISTER_LITE_KERNEL(io_copy_once, @@ -148,8 +215,8 @@ REGISTER_LITE_KERNEL(io_copy_once, device_to_host_once) .BindInput("Input", {LiteType::GetTensorTy(TARGET(kFPGA), - PRECISION(kFP16), - DATALAYOUT(kNHWC))}) + PRECISION(kAny), + DATALAYOUT(kAny))}) .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kHost), PRECISION(kAny), diff --git a/lite/kernels/fpga/layout_compute.cc b/lite/kernels/fpga/layout_compute.cc index 823637e3f3c92dd39b6110f1ed2028e2d06a1d31..c636e1c7200b5cf3a6f1366c53e2b26e2eea1637 100644 --- a/lite/kernels/fpga/layout_compute.cc +++ b/lite/kernels/fpga/layout_compute.cc @@ -26,16 +26,95 @@ namespace fpga { using float16 = zynqmp::float16; -void TransHwcToChw(Tensor* dest, const Tensor* src) {} -void TransChwToHwc(Tensor* dest, const Tensor* src) {} +template +void convert_to_hwc( + T* chw_data, T* hwc_data, int num, int channel, int height, int width) { + int chw = channel * height * width; + int wc = width * channel; + int index = 0; + for (int n = 0; n < num; n++) { + for (int c = 0; c < channel; c++) { + for (int h = 0; h < height; h++) { + for (int w = 0; w < width; w++) { + hwc_data[n * chw + h * wc + w * channel + c] = chw_data[index]; + index++; + } + } + } + } +} + +template +void hwc_to_chw( + T* chw_data, T* hwc_data, int num, int channel, int height, int width) { + int chw = channel * height * width; + int wc = width * channel; + int wh = width * height; + int index = 0; + for (int n = 0; n < num; n++) { + for (int h = 0; h < height; h++) { + for (int w = 0; w < width; w++) { + for (int c = 0; c < channel; c++) { + chw_data[n * chw + c * wh + h * width + w] = hwc_data[index]; + index++; + } + } + } + } +} + +void TransHwcToChw(Tensor* dest, const Tensor* src) { + if (src->ZynqTensor()->dataType() == zynqmp::FP32) { + float* chw = dest->mutable_data(); + float* hwc = const_cast(src->data()); + int num = dest->dims()[0]; + int channel = dest->dims()[1]; + int height = 1; + if (dest->dims().size() > 2) { + height = dest->dims()[2]; + } + int width = 1; + if (dest->dims().size() > 3) { + width = dest->dims()[3]; + } + + hwc_to_chw(chw, hwc, num, channel, height, width); + } + + if (src->ZynqTensor()->dataType() == zynqmp::FP16) { + float16* chw = dest->mutable_data(); + float16* hwc = const_cast(src->data()); + int num = dest->dims()[0]; + int channel = dest->dims()[1]; + int height = 1; + if (dest->dims().size() > 2) { + height = dest->dims()[2]; + } + int width = 1; + if (dest->dims().size() > 3) { + width = dest->dims()[3]; + } + + hwc_to_chw(chw, hwc, num, channel, height, width); + } +} +void TransChwToHwc(Tensor* dest, const Tensor* src) { + std::cout << "chw to hwc \n"; + exit(-1); +} class TransHwcToChwCompute : public KernelLite { public: void Run() override { auto& param = Param(); - auto out_data = param.y->mutable_data(TARGET(kFPGA)); + param.x->ZynqTensor()->syncToCPU(); TransHwcToChw(param.y, param.x); + param.y->ZynqTensor()->flush(); + param.y->ZynqTensor()->copyScaleFrom(param.x->ZynqTensor()); + + auto out_lod = param.y->mutable_lod(); + *out_lod = param.x->lod(); } std::unique_ptr GetTypeInferHandler() override { @@ -97,6 +176,22 @@ REGISTER_LITE_KERNEL(layout, DATALAYOUT(kNCHW))}) .Finalize(); +REGISTER_LITE_KERNEL(layout, + kFPGA, + kAny, + kNHWC, + paddle::lite::kernels::fpga::TransHwcToChwCompute, + hwc_to_chw_arm_float) + .BindInput("Input", + {LiteType::GetTensorTy(TARGET(kARM), + PRECISION(kFloat), + DATALAYOUT(kNHWC))}) + .BindOutput("Out", + {LiteType::GetTensorTy(TARGET(kARM), + PRECISION(kFloat), + DATALAYOUT(kNCHW))}) + .Finalize(); + REGISTER_LITE_KERNEL(layout, kFPGA, kAny, diff --git a/lite/kernels/fpga/mul_compute.cc b/lite/kernels/fpga/mul_compute.cc new file mode 100755 index 0000000000000000000000000000000000000000..c27600d9f773ff0aae04a2ee519905bc0e58785c --- /dev/null +++ b/lite/kernels/fpga/mul_compute.cc @@ -0,0 +1,106 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "lite/kernels/fpga/mul_compute.h" +#include +#include "lite/core/op_registry.h" +#include "lite/core/type_system.h" + +#include "lite/backends/fpga/KD/debugger.hpp" + +namespace paddle { +namespace lite { +namespace kernels { +namespace fpga { + +using float16 = zynqmp::float16; + +void MulCompute::PrepareForRun() { + auto& param = this->Param(); + + // ==================================================== + zynqmp::FullyConnectedParam& fc_param = pe_.param(); + + param.output->mutable_data(); + + fc_param.input = param.x->ZynqTensor(); + fc_param.output = param.output->ZynqTensor(); + fc_param.filter = param.y->ZynqTensor(); + + fc_param.bias = &bias_; + + int channel = fc_param.filter->shape().channel(); + + zynqmp::Shape bias_shape(zynqmp::N, {channel}); + + float* bias_data = + fc_param.bias->mutableData(zynqmp::FP32, bias_shape); + memset(bias_data, 0, channel * sizeof(float)); + bias_.flush(); + + pe_.init(); + pe_.apply(); +} + +void mul(MulCompute* k) { + auto& param = k->Param(); + int num = param.x->dims()[0]; + int channel = param.x->dims()[1]; + + int fn = param.y->dims()[1]; + + float16* out_data = param.output->mutable_data(); + int g_index = 0; + for (int n = 0; n < 1; n++) { + for (int on = 0; on < fn; on++) { + float sum = 0; + int si = 0; + for (int c = 0; c < channel; c++) { + float value = zynqmp::half_to_float(param.x->data()[si]); + int index = c * fn + on; + float weight = param.y->data()[index]; + sum += value * weight; + si++; + } + out_data[g_index] = zynqmp::float_to_half(sum); + g_index++; + } + } +} + +void MulCompute::Run() { + pe_.dispatch(); +#ifdef FPGA_PRINT_TENSOR + zynqmp::FullyConnectedParam& fc_param = pe_.param(); + Debugger::get_instance().registerOutput("mul", fc_param.output); +#endif +} + +} // namespace fpga +} // namespace kernels +} // namespace lite +} // namespace paddle + +REGISTER_LITE_KERNEL( + mul, kFPGA, kFP16, kNHWC, paddle::lite::kernels::fpga::MulCompute, def) + .BindInput("X", + {LiteType::GetTensorTy(TARGET(kFPGA), + PRECISION(kFP16), + DATALAYOUT(kNHWC))}) + .BindInput("Y", {LiteType::GetTensorTy(TARGET(kARM))}) + .BindOutput("Out", + {LiteType::GetTensorTy(TARGET(kFPGA), + PRECISION(kFP16), + DATALAYOUT(kNHWC))}) + .Finalize(); diff --git a/lite/kernels/fpga/mul_compute.h b/lite/kernels/fpga/mul_compute.h new file mode 100644 index 0000000000000000000000000000000000000000..3e5ad7838595d921c90f19faa75b67de94b59c60 --- /dev/null +++ b/lite/kernels/fpga/mul_compute.h @@ -0,0 +1,47 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once +#include "lite/core/kernel.h" +#include "lite/core/op_registry.h" +#include "lite/core/types.h" + +#include "lite/backends/fpga/KD/float16.hpp" +#include "lite/backends/fpga/KD/pes/fully_connected_pe.hpp" + +namespace paddle { +namespace lite { +namespace kernels { +namespace fpga { + +class MulCompute + : public KernelLite { + public: + using param_t = operators::MulParam; + + void PrepareForRun() override; + + void Run() override; + + virtual ~MulCompute() = default; + + private: + zynqmp::FullyConnectedPE pe_; + zynqmp::Tensor bias_; +}; + +} // namespace fpga +} // namespace kernels +} // namespace lite +} // namespace paddle diff --git a/lite/kernels/fpga/multiclass_nms_compute.cc b/lite/kernels/fpga/multiclass_nms_compute.cc new file mode 100644 index 0000000000000000000000000000000000000000..cee5e16205370df7faabc6f37d57fe360e8a9e67 --- /dev/null +++ b/lite/kernels/fpga/multiclass_nms_compute.cc @@ -0,0 +1,430 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "lite/kernels/fpga/multiclass_nms_compute.h" +#include +#include +#include + +#include "lite/backends/fpga/KD/debugger.hpp" + +namespace paddle { +namespace lite { +namespace kernels { +namespace fpga { + +template +bool SortScorePairDescend(const std::pair& pair1, + const std::pair& pair2) { + return pair1.first > pair2.first; +} + +template +static void GetMaxScoreIndex(const std::vector& scores, + const T threshold, + int top_k, + std::vector>* sorted_indices) { + for (size_t i = 0; i < scores.size(); ++i) { + if (scores[i] > threshold) { + sorted_indices->push_back(std::make_pair(scores[i], i)); + } + } + // Sort the score pair according to the scores in descending order + std::stable_sort(sorted_indices->begin(), + sorted_indices->end(), + SortScorePairDescend); + // Keep top_k scores if needed. + if (top_k > -1 && top_k < static_cast(sorted_indices->size())) { + sorted_indices->resize(top_k); + } +} + +template +static T BBoxArea(const T* box, const bool normalized) { + if (box[2] < box[0] || box[3] < box[1]) { + // If coordinate values are is invalid + // (e.g. xmax < xmin or ymax < ymin), return 0. + return static_cast(0.); + } else { + const T w = box[2] - box[0]; + const T h = box[3] - box[1]; + if (normalized) { + return w * h; + } else { + // If coordinate values are not within range [0, 1]. + return (w + 1) * (h + 1); + } + } +} + +template +static T JaccardOverlap(const T* box1, const T* box2, const bool normalized) { + if (box2[0] > box1[2] || box2[2] < box1[0] || box2[1] > box1[3] || + box2[3] < box1[1]) { + return static_cast(0.); + } else { + const T inter_xmin = std::max(box1[0], box2[0]); + const T inter_ymin = std::max(box1[1], box2[1]); + const T inter_xmax = std::min(box1[2], box2[2]); + const T inter_ymax = std::min(box1[3], box2[3]); + T norm = normalized ? static_cast(0.) : static_cast(1.); + T inter_w = inter_xmax - inter_xmin + norm; + T inter_h = inter_ymax - inter_ymin + norm; + const T inter_area = inter_w * inter_h; + const T bbox1_area = BBoxArea(box1, normalized); + const T bbox2_area = BBoxArea(box2, normalized); + return inter_area / (bbox1_area + bbox2_area - inter_area); + } +} + +template +T PolyIoU(const T* box1, + const T* box2, + const size_t box_size, + const bool normalized) { + LOG(FATAL) << "PolyIoU not implement."; +} + +template +void SliceOneClass(const Tensor& items, + const int class_id, + Tensor* one_class_item) { + T* item_data = one_class_item->mutable_data(); + const T* items_data = items.data(); + const int64_t num_item = items.dims()[0]; + const int64_t class_num = items.dims()[1]; + if (items.dims().size() == 3) { + int64_t item_size = items.dims()[2]; + for (int i = 0; i < num_item; ++i) { + std::memcpy(item_data + i * item_size, + items_data + i * class_num * item_size + class_id * item_size, + sizeof(T) * item_size); + } + } else { + for (int i = 0; i < num_item; ++i) { + item_data[i] = items_data[i * class_num + class_id]; + } + } +} + +template +void NMSFast(const Tensor& bbox, + const Tensor& scores, + const T score_threshold, + const T nms_threshold, + const T eta, + const int64_t top_k, + std::vector* selected_indices, + const bool normalized) { + // The total boxes for each instance. + int64_t num_boxes = bbox.dims()[0]; + // 4: [xmin ymin xmax ymax] + // 8: [x1 y1 x2 y2 x3 y3 x4 y4] + // 16, 24, or 32: [x1 y1 x2 y2 ... xn yn], n = 8, 12 or 16 + int64_t box_size = bbox.dims()[1]; + + std::vector scores_data(num_boxes); + std::copy_n(scores.data(), num_boxes, scores_data.begin()); + std::vector> sorted_indices; + GetMaxScoreIndex(scores_data, score_threshold, top_k, &sorted_indices); + + selected_indices->clear(); + T adaptive_threshold = nms_threshold; + const T* bbox_data = bbox.data(); + + while (sorted_indices.size() != 0) { + const int idx = sorted_indices.front().second; + bool keep = true; + for (size_t k = 0; k < selected_indices->size(); ++k) { + if (keep) { + const int kept_idx = (*selected_indices)[k]; + T overlap = T(0.); + // 4: [xmin ymin xmax ymax] + if (box_size == 4) { + overlap = JaccardOverlap(bbox_data + idx * box_size, + bbox_data + kept_idx * box_size, + normalized); + } + // 8: [x1 y1 x2 y2 x3 y3 x4 y4] or 16, 24, 32 + if (box_size == 8 || box_size == 16 || box_size == 24 || + box_size == 32) { + overlap = PolyIoU(bbox_data + idx * box_size, + bbox_data + kept_idx * box_size, + box_size, + normalized); + } + keep = overlap <= adaptive_threshold; + } else { + break; + } + } + if (keep) { + selected_indices->push_back(idx); + } + sorted_indices.erase(sorted_indices.begin()); + if (keep && eta < 1 && adaptive_threshold > 0.5) { + adaptive_threshold *= eta; + } + } +} + +template +void MultiClassNMS(const operators::MulticlassNmsParam& param, + const Tensor& scores, + const Tensor& bboxes, + const int scores_size, + std::map>* indices, + int* num_nmsed_out) { + int64_t background_label = param.background_label; + int64_t nms_top_k = param.nms_top_k; + int64_t keep_top_k = param.keep_top_k; + bool normalized = param.normalized; + T nms_threshold = static_cast(param.nms_threshold); + T nms_eta = static_cast(param.nms_eta); + T score_threshold = static_cast(param.score_threshold); + + int num_det = 0; + int64_t class_num = scores_size == 3 ? scores.dims()[0] : scores.dims()[1]; + + for (int64_t c = 0; c < class_num; ++c) { + Tensor bbox_slice, score_slice; + if (c == background_label) continue; + if (scores_size == 3) { + scores.Slice(score_slice, c, c + 1); + bbox_slice = bboxes; + } else { + score_slice.Resize({scores.dims()[0], 1}); + bbox_slice.Resize({scores.dims()[0], 4}); + SliceOneClass(scores, c, &score_slice); + SliceOneClass(bboxes, c, &bbox_slice); + } + NMSFast(bboxes, + score_slice, + score_threshold, + nms_threshold, + nms_eta, + nms_top_k, + &((*indices)[c]), + normalized); + if (scores_size == 2) { + std::stable_sort((*indices)[c].begin(), (*indices)[c].end()); + } + num_det += (*indices)[c].size(); + } + + *num_nmsed_out = num_det; + const T* scores_data = scores.data(); + if (keep_top_k > -1 && num_det > keep_top_k) { + Tensor score_slice; + + const T* sdata; + std::vector>> score_index_pairs; + for (const auto& it : *indices) { + int label = it.first; + if (scores_size == 3) { + sdata = scores_data + label * scores.dims()[1]; + } else { + score_slice.Resize({scores.dims()[0], 1}); + SliceOneClass(scores, label, &score_slice); + sdata = score_slice.data(); + } + const std::vector& label_indices = it.second; + for (size_t j = 0; j < label_indices.size(); ++j) { + int idx = label_indices[j]; + score_index_pairs.push_back( + std::make_pair(sdata[idx], std::make_pair(label, idx))); + } + } + // Keep top k results per image. + std::stable_sort(score_index_pairs.begin(), + score_index_pairs.end(), + SortScorePairDescend>); + score_index_pairs.resize(keep_top_k); + + // Store the new indices. + std::map> new_indices; + for (size_t j = 0; j < score_index_pairs.size(); ++j) { + int label = score_index_pairs[j].second.first; + int idx = score_index_pairs[j].second.second; + new_indices[label].push_back(idx); + } + if (scores_size == 2) { + for (const auto& it : new_indices) { + int label = it.first; + std::stable_sort(new_indices[label].begin(), new_indices[label].end()); + } + } + new_indices.swap(*indices); + *num_nmsed_out = keep_top_k; + } +} + +template +void MultiClassOutput(const Tensor& scores, + const Tensor& bboxes, + const std::map>& selected_indices, + const int scores_size, + Tensor* outs) { + int64_t class_num = scores.dims()[1]; + int64_t predict_dim = scores.dims()[1]; + int64_t box_size = bboxes.dims()[1]; + if (scores_size == 2) { + box_size = bboxes.dims()[2]; + } + int64_t out_dim = box_size + 2; + auto* scores_data = scores.data(); + auto* bboxes_data = bboxes.data(); + auto* odata = outs->mutable_data(); + const T* sdata; + Tensor bbox; + bbox.Resize({scores.dims()[0], box_size}); + int count = 0; + for (const auto& it : selected_indices) { + int label = it.first; + const std::vector& indices = it.second; + if (scores_size == 2) { + SliceOneClass(bboxes, label, &bbox); + } else { + sdata = scores_data + label * predict_dim; + } + for (size_t j = 0; j < indices.size(); ++j) { + int idx = indices[j]; + odata[count * out_dim] = label; // label + const T* bdata; + if (scores_size == 3) { + bdata = bboxes_data + idx * box_size; + odata[count * out_dim + 1] = sdata[idx]; // score + } else { + bdata = bbox.data() + idx * box_size; + odata[count * out_dim + 1] = *(scores_data + idx * class_num + label); + } + // xmin, ymin, xmax, ymax or multi-points coordinates + std::memcpy(odata + count * out_dim + 2, bdata, box_size * sizeof(T)); + count++; + } + } +} + +void MulticlassNmsCompute::Run() { + auto& param = Param(); + auto* boxes = param.bboxes; + auto* scores = param.scores; + auto* outs = param.out; + outs->mutable_data(); + + auto score_dims = scores->dims(); + auto score_size = score_dims.size(); + + auto box_dims = boxes->dims(); + int64_t box_dim = boxes->dims()[2]; + + std::vector>> all_indices; + std::vector batch_starts = {0}; + int64_t batch_size = score_dims[0]; + + int64_t out_dim = box_dim + 2; + int num_nmsed_out = 0; + Tensor boxes_slice, scores_slice; + int n = score_size == 3 ? batch_size : boxes->lod().back().size() - 1; + for (int i = 0; i < n; ++i) { + if (score_size == 3) { + scores->Slice(scores_slice, i, i + 1); + scores_slice.Resize({score_dims[1], score_dims[2]}); + boxes->Slice(boxes_slice, i, i + 1); + boxes_slice.Resize({score_dims[2], box_dim}); + } else { + auto boxes_lod = boxes->lod().back(); + scores->Slice(scores_slice, boxes_lod[i], boxes_lod[i + 1]); + boxes->Slice(boxes_slice, boxes_lod[i], boxes_lod[i + 1]); + } + std::map> indices; + MultiClassNMS( + param, scores_slice, boxes_slice, score_size, &indices, &num_nmsed_out); + all_indices.push_back(indices); + batch_starts.push_back(batch_starts.back() + num_nmsed_out); + } + + uint64_t num_kept = batch_starts.back(); + if (num_kept == 0) { + outs->Resize({1, 1}); + float* od = outs->mutable_data(); + od[0] = -1; + batch_starts = {0, 1}; + } else { + outs->Resize({static_cast(num_kept), out_dim}); + for (int i = 0; i < n; ++i) { + if (score_size == 3) { + scores->Slice(scores_slice, i, i + 1); + boxes->Slice(boxes_slice, i, i + 1); + scores_slice.Resize({score_dims[1], score_dims[2]}); + boxes_slice.Resize({score_dims[2], box_dim}); + } else { + auto boxes_lod = boxes->lod().back(); + scores->Slice(scores_slice, boxes_lod[i], boxes_lod[i + 1]); + boxes->Slice(boxes_slice, boxes_lod[i], boxes_lod[i + 1]); + } + int64_t s = static_cast(batch_starts[i]); + int64_t e = static_cast(batch_starts[i + 1]); + + if (e > s) { + Tensor out; + outs->Slice(out, s, e); + MultiClassOutput( + scores_slice, boxes_slice, all_indices[i], score_dims.size(), &out); + outs->ZynqTensor()->copyFrom(out.ZynqTensor()); + } + } + } + LoD lod; + lod.emplace_back(batch_starts); + outs->set_lod(lod); + +#ifdef FPGA_PRINT_TENSOR + Debugger::get_instance().registerOutput("boxes", boxes->ZynqTensor()); + Debugger::get_instance().registerOutput("scores", scores->ZynqTensor()); + Debugger::get_instance().registerOutput("nms", outs->ZynqTensor()); +#endif +} +} // namespace fpga +} // namespace kernels +} // namespace lite +} // namespace paddle + +REGISTER_LITE_KERNEL(multiclass_nms, + kFPGA, + kFP16, + kNHWC, + paddle::lite::kernels::fpga::MulticlassNmsCompute, + def) + .BindInput("BBoxes", {LiteType::GetTensorTy(TARGET(kHost))}) + .BindInput("Scores", {LiteType::GetTensorTy(TARGET(kHost))}) + .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kHost))}) + .Finalize(); + +REGISTER_LITE_KERNEL(multiclass_nms, + kFPGA, + kFP16, + kNHWC, + paddle::lite::kernels::fpga::MulticlassNmsCompute, + def2) + .BindInput("BBoxes", + {LiteType::GetTensorTy(TARGET(kFPGA), + PRECISION(kFP16), + DATALAYOUT(kNHWC))}) + .BindInput("Scores", + {LiteType::GetTensorTy(TARGET(kFPGA), + PRECISION(kFP16), + DATALAYOUT(kNHWC))}) + .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kHost))}) + .Finalize(); diff --git a/lite/kernels/fpga/multiclass_nms_compute.h b/lite/kernels/fpga/multiclass_nms_compute.h new file mode 100755 index 0000000000000000000000000000000000000000..764e707eeeeed49b46a4cac1c613905d960f0cbb --- /dev/null +++ b/lite/kernels/fpga/multiclass_nms_compute.h @@ -0,0 +1,36 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once +#include +#include "lite/core/kernel.h" +#include "lite/core/op_registry.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace fpga { + +class MulticlassNmsCompute + : public KernelLite { + public: + void Run() override; + + virtual ~MulticlassNmsCompute() = default; +}; + +} // namespace fpga +} // namespace kernels +} // namespace lite +} // namespace paddle diff --git a/lite/kernels/fpga/norm_compute.cc b/lite/kernels/fpga/norm_compute.cc new file mode 100644 index 0000000000000000000000000000000000000000..752e2b2211f33fba4f1a0b7ae1ecd9367ccbe91b --- /dev/null +++ b/lite/kernels/fpga/norm_compute.cc @@ -0,0 +1,65 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "lite/kernels/fpga/norm_compute.h" +#include "lite/backends/fpga/KD/debugger.hpp" + +namespace paddle { +namespace lite { +namespace kernels { +namespace fpga { + +using float16 = zynqmp::float16; + +void NormCompute::PrepareForRun() { + auto& param = this->Param(); + param.Out->mutable_data(); + + zynqmp::NormParam& norm_param = pe_.param(); + norm_param.input = param.X->ZynqTensor(); + norm_param.output = param.Out->ZynqTensor(); + norm_param.epsilon = param.epsilon; + + pe_.init(); + pe_.apply(); +} + +void NormCompute::Run() { + pe_.dispatch(); +#ifdef FPGA_PRINT_TENSOR + zynqmp::NormParam& norm_param = pe_.param(); + Debugger::get_instance().registerOutput("norm", norm_param.output); +#endif +} + +} // namespace fpga +} // namespace kernels +} // namespace lite +} // namespace paddle + +REGISTER_LITE_KERNEL( + norm, kFPGA, kFP16, kNHWC, paddle::lite::kernels::fpga::NormCompute, def) + .BindInput("X", + {LiteType::GetTensorTy(TARGET(kFPGA), + PRECISION(kFP16), + DATALAYOUT(kNHWC))}) + .BindOutput("Norm", + {LiteType::GetTensorTy(TARGET(kFPGA), + PRECISION(kFP16), + DATALAYOUT(kNHWC))}) + .BindOutput("Out", + {LiteType::GetTensorTy(TARGET(kFPGA), + PRECISION(kFP16), + DATALAYOUT(kNHWC))}) + .Finalize(); diff --git a/lite/kernels/fpga/norm_compute.h b/lite/kernels/fpga/norm_compute.h new file mode 100644 index 0000000000000000000000000000000000000000..9fe135e9eb17928cfa44d60377f36a7355ed5d71 --- /dev/null +++ b/lite/kernels/fpga/norm_compute.h @@ -0,0 +1,47 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once +#include +// #include "lite/backends/arm/math/type_trans.h" +#include "lite/core/kernel.h" +#include "lite/core/op_registry.h" + +#include "lite/backends/fpga/KD/float16.hpp" +#include "lite/backends/fpga/KD/pes/norm_pe.hpp" + +namespace paddle { +namespace lite { +namespace kernels { +namespace fpga { + +class NormCompute + : public KernelLite { + public: + using param_t = operators::NormParam; + + void PrepareForRun() override; + + void Run() override; + + ~NormCompute() {} + + private: + zynqmp::NormPE pe_; +}; + +} // namespace fpga +} // namespace kernels +} // namespace lite +} // namespace paddle diff --git a/lite/kernels/fpga/pooling_compute.cc b/lite/kernels/fpga/pooling_compute.cc index e4979f8e5762400f453e323f98a6b18ba17a0998..5411d3370d4b39a7d587cbf1887e7d1372b30036 100644 --- a/lite/kernels/fpga/pooling_compute.cc +++ b/lite/kernels/fpga/pooling_compute.cc @@ -18,6 +18,8 @@ #include "lite/core/op_registry.h" #include "lite/core/type_system.h" +#include "lite/backends/fpga/KD/debugger.hpp" + namespace paddle { namespace lite { namespace kernels { @@ -26,26 +28,32 @@ namespace fpga { using float16 = zynqmp::float16; void PoolCompute::PrepareForRun() { - zynqmp::PoolingParam& pool_param = pe_.param(); auto& param = Param(); - param.output->mutable_data(); + zynqmp::PoolingParam& pool_param = pe_.param(); pool_param.input = param.x->ZynqTensor(); pool_param.output = param.output->ZynqTensor(); pool_param.relu.enabled = false; - pool_param.type = param.pooling_type == "max" ? zynqmp::PoolingType::MAX : zynqmp::PoolingType::AVERAGE; pool_param.globalPooling = param.global_pooling; pool_param.kernelSize = param.ksize; pool_param.strides = param.strides; - pool_param.paddings = param.paddings; + int pad_h = (*param.paddings)[0]; + int pad_w = (*param.paddings)[2]; + pool_param.paddings = std::vector({pad_h, pad_w}); pe_.init(); pe_.apply(); } -void PoolCompute::Run() { pe_.dispatch(); } +void PoolCompute::Run() { + pe_.dispatch(); +#ifdef FPGA_PRINT_TENSOR + zynqmp::PoolingParam& pool_param = pe_.param(); + Debugger::get_instance().registerOutput("pooling", pool_param.output); +#endif +} } // namespace fpga } // namespace kernels diff --git a/lite/kernels/fpga/pooling_compute_test.cc b/lite/kernels/fpga/pooling_compute_test.cc old mode 100644 new mode 100755 index 2309bf8fe4aa1c083e1662556ac49bf4357ab07a..9248289fe9353705e7a2d84831b9f3de5d8ee7d7 --- a/lite/kernels/fpga/pooling_compute_test.cc +++ b/lite/kernels/fpga/pooling_compute_test.cc @@ -46,7 +46,7 @@ std::vector compute_output_shape(operators::PoolParam* param_) { if (param_->global_pooling) { ksize.resize(static_cast(x_dims.size()) - 2); for (size_t i = 0; i < ksize.size(); ++i) { - param_->paddings[i] = 0; + (*param_->paddings)[i] = 0; ksize[i] = static_cast(x_dims[i + 2]); } } @@ -59,7 +59,7 @@ std::vector compute_output_shape(operators::PoolParam* param_) { for (size_t i = 0; i < param_->ksize.size(); ++i) { output_shape.push_back(PoolOutputSize(x_dims[i + 2], param_->ksize[i], - param_->paddings[i], + (*param_->paddings)[i], param_->strides[i], param_->ceil_mode)); } @@ -76,7 +76,7 @@ void pool_compute_ref(const operators::PoolParam& param) { std::vector ksize = param.ksize; std::vector strides = param.strides; - std::vector paddings = param.paddings; + std::vector paddings = *param.paddings; std::string pooling_type = param.pooling_type; bool global_pooling = param.global_pooling; @@ -103,7 +103,7 @@ void pool_compute_ref(const operators::PoolParam& param) { int stride_h = strides[0]; int stride_w = strides[1]; int pad_h = paddings[0]; - int pad_w = paddings[1]; + int pad_w = paddings[2]; if (global_pooling == true) { for (int n = 0; n < in_n; ++n) { @@ -230,7 +230,7 @@ TEST(pool_fpga, compute) { } param.global_pooling = global_pooling; param.strides = {stride, stride}; - param.paddings = {pad, pad}; + *param.paddings = {pad, pad, pad, pad}; param.exclusive = exclusive; param.ceil_mode = ceil_mode; param.adaptive = false; diff --git a/lite/kernels/fpga/prior_box_compute.cc b/lite/kernels/fpga/prior_box_compute.cc new file mode 100644 index 0000000000000000000000000000000000000000..afd14ccb4b4a9a4f1e93e1e38840035fb18186bb --- /dev/null +++ b/lite/kernels/fpga/prior_box_compute.cc @@ -0,0 +1,133 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include +#include + +#include "lite/backends/fpga/KD/debugger.hpp" +#include "lite/kernels/fpga/prior_box_compute.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace fpga { + +using float16 = zynqmp::float16; + +inline void ExpandAspectRatios(const std::vector& input_aspect_ratior, + bool flip, + std::vector* output_aspect_ratior) { + constexpr float epsilon = 1e-6; + output_aspect_ratior->clear(); + output_aspect_ratior->push_back(1.0f); + for (size_t i = 0; i < input_aspect_ratior.size(); ++i) { + float ar = input_aspect_ratior[i]; + bool already_exist = false; + for (size_t j = 0; j < output_aspect_ratior->size(); ++j) { + if (fabs(ar - output_aspect_ratior->at(j)) < epsilon) { + already_exist = true; + break; + } + } + if (!already_exist) { + output_aspect_ratior->push_back(ar); + if (flip) { + output_aspect_ratior->push_back(1.0f / ar); + } + } + } +} + +void PriorBoxCompute::PrepareForRun() { + auto& param = this->Param(); + bool is_flip = param.flip; + bool is_clip = param.clip; + std::vector min_size = param.min_sizes; + std::vector max_size = param.max_sizes; + std::vector aspect_ratio = param.aspect_ratios; + std::vector variance = param.variances_; + int img_w = param.img_w; + int img_h = param.img_h; + float step_w = param.step_w; + float step_h = param.step_h; + float offset = param.offset; + std::vector aspect_ratios_vec; + ExpandAspectRatios(aspect_ratio, is_flip, &aspect_ratios_vec); + size_t prior_num = aspect_ratios_vec.size() * min_size.size(); + prior_num += max_size.size(); + std::vector order = param.order; + bool min_max_aspect_ratios_order = param.min_max_aspect_ratios_order; + + int win1 = param.input->dims()[3]; + int hin1 = param.input->dims()[2]; + + DDim shape_out({hin1, win1, prior_num, 4}); + param.boxes->Resize(shape_out); + param.variances->Resize(shape_out); + + param.boxes->mutable_data(); + param.variances->mutable_data(); + zynqmp::PriorBoxParam& priobox_param = pe_.param(); + priobox_param.input = param.input->ZynqTensor(); + priobox_param.image = param.image->ZynqTensor(); + priobox_param.outputBoxes = param.boxes->ZynqTensor(); + priobox_param.outputVariances = param.variances->ZynqTensor(); + priobox_param.minSizes = param.min_sizes; + priobox_param.maxSizes = param.max_sizes; + priobox_param.aspectRatios = param.aspect_ratios; + priobox_param.variances = param.variances_; + priobox_param.minMaxAspectRatiosOrder = min_max_aspect_ratios_order; + priobox_param.flip = param.flip; + priobox_param.clip = param.clip; + priobox_param.stepW = param.step_w; + priobox_param.stepH = param.step_h; + priobox_param.offset = param.offset; + + pe_.init(); + pe_.apply(); +} + +void PriorBoxCompute::Run() { + pe_.dispatch(); +#ifdef FPGA_PRINT_TENSOR + zynqmp::PriorBoxParam& priobox_param = pe_.param(); + Debugger::get_instance().registerOutput("pb_boxes", + priobox_param.outputBoxes); + Debugger::get_instance().registerOutput("pb_variances", + priobox_param.outputVariances); +#endif +} + +} // namespace fpga +} // namespace kernels +} // namespace lite +} // namespace paddle + +REGISTER_LITE_KERNEL(prior_box, + kFPGA, + kFP16, + kNHWC, + paddle::lite::kernels::fpga::PriorBoxCompute, + def) + .BindInput("Input", + {LiteType::GetTensorTy(TARGET(kFPGA), + PRECISION(kFP16), + DATALAYOUT(kNHWC))}) + .BindInput("Image", + {LiteType::GetTensorTy(TARGET(kFPGA), + PRECISION(kFP16), + DATALAYOUT(kNHWC))}) + .BindOutput("Boxes", {LiteType::GetTensorTy(TARGET(kARM))}) + .BindOutput("Variances", {LiteType::GetTensorTy(TARGET(kARM))}) + .Finalize(); diff --git a/lite/kernels/fpga/prior_box_compute.h b/lite/kernels/fpga/prior_box_compute.h new file mode 100644 index 0000000000000000000000000000000000000000..74e4c536470ad7c9b41e7bd284ea13eff4910bb7 --- /dev/null +++ b/lite/kernels/fpga/prior_box_compute.h @@ -0,0 +1,43 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once +#include "lite/backends/fpga/KD/float16.hpp" +#include "lite/backends/fpga/KD/pes/prior_box_pe.hpp" +#include "lite/core/kernel.h" +#include "lite/core/op_registry.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace fpga { + +class PriorBoxCompute + : public KernelLite { + public: + using param_t = operators::PriorBoxParam; + + void PrepareForRun() override; + void Run() override; + + virtual ~PriorBoxCompute() = default; + + private: + zynqmp::PriorBoxPE pe_; +}; + +} // namespace fpga +} // namespace kernels +} // namespace lite +} // namespace paddle diff --git a/lite/kernels/fpga/reshape_compute.cc b/lite/kernels/fpga/reshape_compute.cc new file mode 100644 index 0000000000000000000000000000000000000000..f72f18892c987e48fb3467372352f6ded98444ff --- /dev/null +++ b/lite/kernels/fpga/reshape_compute.cc @@ -0,0 +1,147 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "lite/kernels/fpga/reshape_compute.h" +#include +#include "lite/operators/reshape_op.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace fpga { + +using float16 = zynqmp::float16; + +void ReshapeCompute::Run() { + auto& param = Param(); + param.output->mutable_data(); + auto x = param.x; + // auto actual_shape = param.actual_shape; + Tensor* actual_shape = nullptr; // TODO(chonwhite) change it. + auto output = param.output; + bool inplace = param.inplace; + auto x_dims = x->dims(); + auto output_dims = output->dims(); + if (actual_shape) { + auto actual_shape_dims = actual_shape->dims(); + auto* actual_shape_data = actual_shape->data(); + auto shape = std::vector( + actual_shape_data, actual_shape_data + actual_shape_dims.production()); + output_dims = lite::operators::ValidateShape(shape, x_dims); + output->Resize(output_dims); + } + if (inplace) { + output->ShareDataWith(*x); + } else { + output->CopyDataFrom(*x); + } + + param.x->ZynqTensor()->saveToFile("reshape_in", true); + output->ZynqTensor()->saveToFile("reshape_out", true); + + output->Resize(output_dims); +} + +} // namespace fpga +} // namespace kernels +} // namespace lite +} // namespace paddle + +REGISTER_LITE_KERNEL(reshape, + kFPGA, + kFP16, + kNHWC, + paddle::lite::kernels::fpga::ReshapeCompute, + def) + .BindInput("X", + {LiteType::GetTensorTy(TARGET(kFPGA), + PRECISION(kFP16), + DATALAYOUT(kNHWC))}) + .BindInput("Shape", + {LiteType::GetTensorTy(TARGET(kFPGA), + PRECISION(kFP16), + DATALAYOUT(kNHWC))}) + .BindOutput("Out", + {LiteType::GetTensorTy(TARGET(kFPGA), + PRECISION(kFP16), + DATALAYOUT(kNHWC))}) + .Finalize(); + +REGISTER_LITE_KERNEL(reshape2, + kFPGA, + kFP16, + kNHWC, + paddle::lite::kernels::fpga::ReshapeCompute, + def) + .BindInput("X", + {LiteType::GetTensorTy(TARGET(kFPGA), + PRECISION(kFP16), + DATALAYOUT(kNHWC))}) + .BindInput("Shape", + {LiteType::GetTensorTy(TARGET(kFPGA), + PRECISION(kFP16), + DATALAYOUT(kNHWC))}) + .BindOutput("Out", + {LiteType::GetTensorTy(TARGET(kFPGA), + PRECISION(kFP16), + DATALAYOUT(kNHWC))}) + .BindOutput("XShape", + {LiteType::GetTensorTy(TARGET(kFPGA), + PRECISION(kFP16), + DATALAYOUT(kNHWC))}) + .Finalize(); + +REGISTER_LITE_KERNEL(flatten, + kFPGA, + kFP16, + kNHWC, + paddle::lite::kernels::fpga::ReshapeCompute, + def) + .BindInput("X", + {LiteType::GetTensorTy(TARGET(kFPGA), + PRECISION(kFP16), + DATALAYOUT(kNHWC))}) + .BindInput("Shape", + {LiteType::GetTensorTy(TARGET(kFPGA), + PRECISION(kFP16), + DATALAYOUT(kNHWC))}) + .BindOutput("Out", + {LiteType::GetTensorTy(TARGET(kFPGA), + PRECISION(kFP16), + DATALAYOUT(kNHWC))}) + .Finalize(); + +REGISTER_LITE_KERNEL(flatten2, + kFPGA, + kFP16, + kNHWC, + paddle::lite::kernels::fpga::ReshapeCompute, + def) + .BindInput("X", + {LiteType::GetTensorTy(TARGET(kFPGA), + PRECISION(kFP16), + DATALAYOUT(kNHWC))}) + .BindInput("Shape", + {LiteType::GetTensorTy(TARGET(kFPGA), + PRECISION(kFP16), + DATALAYOUT(kNHWC))}) + .BindOutput("Out", + {LiteType::GetTensorTy(TARGET(kFPGA), + PRECISION(kFP16), + DATALAYOUT(kNHWC))}) + .BindOutput("XShape", + {LiteType::GetTensorTy(TARGET(kFPGA), + PRECISION(kFP16), + DATALAYOUT(kNHWC))}) + .Finalize(); diff --git a/lite/kernels/fpga/reshape_compute.h b/lite/kernels/fpga/reshape_compute.h new file mode 100755 index 0000000000000000000000000000000000000000..cc5ed0b565c800e7c03698f716cbe6a304ef97de --- /dev/null +++ b/lite/kernels/fpga/reshape_compute.h @@ -0,0 +1,44 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once +#include +#include "lite/core/kernel.h" +#include "lite/core/op_registry.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace fpga { + +class ReshapeCompute + : public KernelLite { + public: + void Run() override; + + virtual ~ReshapeCompute() = default; +}; + +class ReshapeComputeFpgaToHost + : public KernelLite { + public: + void Run() override; + + virtual ~ReshapeComputeFpgaToHost() = default; +}; + +} // namespace fpga +} // namespace kernels +} // namespace lite +} // namespace paddle diff --git a/lite/kernels/fpga/scale_compute.cc b/lite/kernels/fpga/scale_compute.cc old mode 100644 new mode 100755 index 01f9a63ad48f0f371ead0e19042df3841993c372..991c73f2954b00c50107e5ec45ff338e99be6d75 --- a/lite/kernels/fpga/scale_compute.cc +++ b/lite/kernels/fpga/scale_compute.cc @@ -19,7 +19,37 @@ namespace lite { namespace kernels { namespace fpga { -void ScaleCompute::Run() {} +void ScaleCompute::PrepareForRun() { + auto& param = this->Param(); + param.output->mutable_data(); + + zynqmp::ScaleParam& scale_param = pe_.param(); + + scale_param.input = param.x->ZynqTensor(); + scale_param.output = param.output->ZynqTensor(); + + int channel = scale_param.input->shape().channel(); + zynqmp::Tensor* scale = new zynqmp::Tensor(); + zynqmp::Tensor* bias = new zynqmp::Tensor(); + zynqmp::Shape shape(zynqmp::N, {channel}); + float* scale_data = scale->mutableData(zynqmp::FP32, shape); + float* bias_data = bias->mutableData(zynqmp::FP32, shape); + + float scale_value = param.scale; + float bias_value = param.bias_after_scale ? param.bias : 0; + + for (int i = 0; i < channel; ++i) { + scale_data[i] = scale_value; + bias_data[i] = bias_value; + } + scale_param.scale = scale; + scale_param.bias = bias; + + pe_.init(); + pe_.apply(); +} + +void ScaleCompute::Run() { pe_.dispatch(); } } // namespace fpga } // namespace kernels diff --git a/lite/kernels/fpga/scale_compute.h b/lite/kernels/fpga/scale_compute.h old mode 100644 new mode 100755 index 45cd3528f8a99636e1e22d3cce76e774a6488530..217399db7255d57c68cbf1b5e3f29462b9ab8bfb --- a/lite/kernels/fpga/scale_compute.h +++ b/lite/kernels/fpga/scale_compute.h @@ -13,6 +13,8 @@ // limitations under the License. #pragma once +#include "lite/backends/fpga/KD/float16.hpp" +#include "lite/backends/fpga/KD/pes/scale_pe.hpp" #include "lite/core/kernel.h" #include "lite/core/op_registry.h" @@ -21,12 +23,20 @@ namespace lite { namespace kernels { namespace fpga { +using float16 = zynqmp::float16; + class ScaleCompute : public KernelLite { public: + using param_t = operators::ScaleParam; + + void PrepareForRun() override; void Run() override; virtual ~ScaleCompute() = default; + + private: + zynqmp::ScalePE pe_; }; } // namespace fpga diff --git a/lite/kernels/fpga/softmax_compute.cc b/lite/kernels/fpga/softmax_compute.cc old mode 100644 new mode 100755 index 63abc76e68ebf15a458ed380d7eabeaf89d5dd2f..b13b5f0f46202dfcf59a5c9094a409d02f03e4a2 --- a/lite/kernels/fpga/softmax_compute.cc +++ b/lite/kernels/fpga/softmax_compute.cc @@ -33,7 +33,13 @@ void SoftmaxCompute::PrepareForRun() { pe_.apply(); } -void SoftmaxCompute::Run() { pe_.dispatch(); } +void SoftmaxCompute::Run() { + pe_.dispatch(); +#ifdef FPGA_PRINT_TENSOR + zynqmp::SoftmaxParam& softmax_param = pe_.param(); + Debugger::get_instance().registerOutput("softmax", softmax_param.output); +#endif +} } // namespace fpga } // namespace kernels diff --git a/lite/kernels/fpga/transpose_compute.cc b/lite/kernels/fpga/transpose_compute.cc new file mode 100644 index 0000000000000000000000000000000000000000..e3bb813873d69d8f9d9939f06869e2640f416915 --- /dev/null +++ b/lite/kernels/fpga/transpose_compute.cc @@ -0,0 +1,134 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include +#include + +#include "lite/core/op_registry.h" +#include "lite/core/tensor.h" +#include "lite/core/type_system.h" +#include "lite/kernels/fpga/transpose_compute.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace fpga { + +using float16 = zynqmp::float16; + +void transposeCompute(operators::TransposeParam param) { + // copy from; + const auto* input_x = param.x; + const auto input_x_dims = input_x->dims(); + input_x->ZynqTensor()->invalidate(); + input_x->ZynqTensor()->unalignImage(); + + Tensor float_input; + float_input.Resize(input_x_dims); + float_input.mutable_data(); + float_input.ZynqTensor()->copyFrom(input_x->ZynqTensor()); + + const auto* input_x_data = float_input.data(); + + auto* out = param.output; + const auto axis = param.axis; + + auto* out_data = out->mutable_data(); + + size_t ndim = axis.size(); + std::vector xdim(ndim); + std::vector xstride(ndim); + std::vector xout(ndim); + for (int i = 0; i < ndim; i++) { + int j = ndim - 1 - i; + xdim[j] = input_x_dims[axis[i]]; + xstride[j] = 1; + for (int k = axis[i] + 1; k < ndim; k++) { + xstride[j] *= input_x_dims[k]; + } + xout[j] = xstride[j] * xdim[j]; + } + + auto numel = input_x->numel(); + size_t pind = 0; + std::vector ind(ndim); + for (int i = 0; i < numel; i++) { + out_data[i] = input_x_data[pind]; + ind[0]++; + pind += xstride[0]; + for (int j = 0; j < ndim - 1; j++) { + if (ind[j] == xdim[j]) { + ind[j + 1]++; + ind[j] = 0; + pind += xstride[j + 1]; + pind -= xout[j]; + } else { + break; + } + } + } +} + +// Transpose +void TransposeCompute::Run() { auto& param = this->Param(); } + +// Transpose2 +void Transpose2Compute::Run() { + auto& param = this->Param(); + param.output->mutable_data(); + param.x->ZynqTensor()->invalidate(); + param.x->ZynqTensor()->unalignImage(); + if (param.x->dims().size() != 4) { + transposeCompute(param); + } else { + param.output->ZynqTensor()->copyFrom(param.x->ZynqTensor()); + } +} + +} // namespace fpga +} // namespace kernels +} // namespace lite +} // namespace paddle + +// Transpose +REGISTER_LITE_KERNEL(transpose, + kFPGA, + kFP16, + kNHWC, + paddle::lite::kernels::fpga::TransposeCompute, + def) + .BindInput("X", + {LiteType::GetTensorTy(TARGET(kFPGA), + PRECISION(kFP16), + DATALAYOUT(kNHWC))}) + .BindOutput("Out", + {LiteType::GetTensorTy(TARGET(kFPGA), + PRECISION(kFP16), + DATALAYOUT(kNHWC))}) + .Finalize(); + +// Transpose2 +REGISTER_LITE_KERNEL(transpose2, + kFPGA, + kFP16, + kNHWC, + paddle::lite::kernels::fpga::Transpose2Compute, + def) + .BindInput("X", + {LiteType::GetTensorTy(TARGET(kFPGA), + PRECISION(kFP16), + DATALAYOUT(kNHWC))}) + .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kARM))}) + .BindOutput("XShape", {LiteType::GetTensorTy(TARGET(kARM))}) + .Finalize(); diff --git a/lite/kernels/fpga/transpose_compute.h b/lite/kernels/fpga/transpose_compute.h new file mode 100755 index 0000000000000000000000000000000000000000..ba6a7e4aabbf27e545fc53ea9fd5d63ec2d1e6c0 --- /dev/null +++ b/lite/kernels/fpga/transpose_compute.h @@ -0,0 +1,51 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once +#include +#include "lite/backends/fpga/KD/float16.hpp" +#include "lite/core/kernel.h" +#include "lite/operators/transpose_op.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace fpga { + +// Transpose +class TransposeCompute + : public KernelLite { + public: + using param_t = operators::TransposeParam; + + void Run() override; + + virtual ~TransposeCompute() = default; +}; + +// Transpose2 +class Transpose2Compute + : public KernelLite { + public: + using param_t = operators::TransposeParam; + + void Run() override; + + virtual ~Transpose2Compute() = default; +}; + +} // namespace fpga +} // namespace kernels +} // namespace lite +} // namespace paddle