diff --git a/lite/backends/fpga/KD/debugger.hpp b/lite/backends/fpga/KD/debugger.hpp index a3271ae50ec894c7cad7d18ea8fed763999127fa..64d3f00a967cfb23b4d0876331306ac9671e5996 100755 --- a/lite/backends/fpga/KD/debugger.hpp +++ b/lite/backends/fpga/KD/debugger.hpp @@ -56,24 +56,25 @@ class Debugger { std::unordered_map op_config; std::unordered_map tick_tock_map; Debugger() { - // op_config["concat"] = true; - // op_config["pooling"] = true; - // op_config["conv"] = true; - // op_config["dropout"] = true; - // op_config["dwconv"] = true; - // op_config["ew_add"] = true; - // op_config["ew_mul"] = true; - // op_config["crop"] = true; - // op_config["feed"] = true; - // op_config["fc"] = true; - // op_config["mul"] = true; - // op_config["fetch"] = true; - // op_config["boxes"] = true; - // op_config["scores"] = true; - // op_config["nms"] = true; - // op_config["pb_boxes"] = true; - // op_config["pb_variances"] = true; - // op_config["softmax"] = true; + op_config["concat"] = true; + op_config["pooling"] = true; + op_config["conv"] = true; + op_config["dropout"] = true; + op_config["dwconv"] = true; + op_config["ew_add"] = true; + op_config["ew_mul"] = true; + op_config["crop"] = true; + op_config["feed"] = true; + op_config["fetch"] = true; + op_config["fc"] = true; + op_config["mul"] = true; + op_config["boxes"] = true; + op_config["scores"] = true; + op_config["nms"] = true; + op_config["pb_boxes"] = true; + op_config["pb_variances"] = true; + op_config["softmax"] = true; + op_config["split"] = true; } }; diff --git a/lite/backends/fpga/KD/llapi/zynqmp_api.cpp b/lite/backends/fpga/KD/llapi/zynqmp_api.cpp index 7a2c92335788364426b82d60b6a1ad85e633021c..f8dc1e69627dd039d130a19f224c14eb04e0be92 100755 --- a/lite/backends/fpga/KD/llapi/zynqmp_api.cpp +++ b/lite/backends/fpga/KD/llapi/zynqmp_api.cpp @@ -61,7 +61,6 @@ void reset_device() { // memory management; void *fpga_malloc(size_t size) { - #ifdef PADDLE_MOBILE_OS_LINUX void *ptr = reinterpret_cast( @@ -205,7 +204,7 @@ int get_device_info(const struct DeviceInfo &args) { int perform_bypass(const struct BypassArgs &args) { int ret = -1; int size = args.image.channels * args.image.width * args.image.height; - int max_size = 1 << 22; + int max_size = 1 << 20; float times = 1.0 * size / max_size; int count = static_cast(times); diff --git a/lite/backends/fpga/KD/pes/prior_box_pe.cpp b/lite/backends/fpga/KD/pes/prior_box_pe.cpp index 00dfe1830f6f44cbf6a30708fa5783563470c686..d7d58ee8b7e23de843143b643eda0272c4cfc34b 100644 --- a/lite/backends/fpga/KD/pes/prior_box_pe.cpp +++ b/lite/backends/fpga/KD/pes/prior_box_pe.cpp @@ -241,10 +241,13 @@ void PriorBoxPE::compute_prior_box() { } boxes.flush(); - boxes.syncToCPU(); + // boxes.syncToCPU(); variances.flush(); output_boxes->copyFrom(&boxes); output_variances->copyFrom(&variances); + + output_boxes->invalidate(); + output_variances->invalidate(); } void PriorBoxPE::apply() {} @@ -253,8 +256,9 @@ bool PriorBoxPE::dispatch() { if (cachedBoxes_ == nullptr) { cachedBoxes_ = new Tensor(); cachedVariances_ = new Tensor(); - cachedBoxes_->mutableData(FP32, param_.outputBoxes->shape()); - cachedVariances_->mutableData(FP32, param_.outputVariances->shape()); + cachedBoxes_->mutableData(FP16, param_.outputBoxes->shape()); + cachedVariances_->mutableData(FP16, + param_.outputVariances->shape()); cachedBoxes_->setDataLocation(CPU); cachedVariances_->setDataLocation(CPU); compute_prior_box(); diff --git a/lite/backends/fpga/KD/tensor.hpp b/lite/backends/fpga/KD/tensor.hpp index 065495fd8571691196700cd9da23af282b882240..e105d89847039855d91db8bb3f9cb901f0276c0d 100644 --- a/lite/backends/fpga/KD/tensor.hpp +++ b/lite/backends/fpga/KD/tensor.hpp @@ -389,11 +389,17 @@ class Tensor { float value = 0; if (dataType_ == FP32) { value = data()[i]; - } else if (dataType_ == FP16) { + } + if (dataType_ == FP16) { value = half_to_float(data()[i]); - } else { + } + + if (dataType_ == INT8) { value = data()[i]; } + if (dataType_ == INT32) { + value = data()[i]; + } ofs << value << std::endl; } ofs.close(); diff --git a/lite/backends/fpga/lite_tensor.h b/lite/backends/fpga/lite_tensor.h index 0feaef6dbe45c58e02fd71f72e17e50a89e549c8..8d65a912227a077124f371e20850a0e2ed992245 100644 --- a/lite/backends/fpga/lite_tensor.h +++ b/lite/backends/fpga/lite_tensor.h @@ -81,8 +81,7 @@ class DDimLite { return !(a == b); } - ~DDimLite() { - } + ~DDimLite() {} private: std::vector data_; @@ -112,9 +111,7 @@ class TensorLite { return zynq_tensor_->data() + offset_; } - void Resize(const DDimLite &ddim) { - dims_ = ddim; - } + void Resize(const DDimLite &ddim) { dims_ = ddim; } void Resize(const std::vector &x) { dims_ = DDimLite(x); } const DDimLite &dims() const { return dims_; } @@ -212,6 +209,28 @@ class TensorLite { void mutable_data_internal(); }; +template +zynqmp::DataType get_date_type() { + zynqmp::DataType data_type = zynqmp::FP32; + if (typeid(T) == typeid(float)) { + data_type = zynqmp::FP32; + } + if (typeid(T) == typeid(zynqmp::float16)) { + data_type = zynqmp::FP16; + } + if (typeid(T) == typeid(int)) { + data_type = zynqmp::INT32; + } + if (typeid(T) == typeid(int32_t)) { + data_type = zynqmp::INT32; + } + if (typeid(T) == typeid(int8_t)) { + data_type = zynqmp::INT8; + } + + return data_type; +} + template R *TensorLite::mutable_data() { std::vector v; @@ -237,13 +256,8 @@ R *TensorLite::mutable_data() { break; } zynqmp::Shape input_shape(layout_type, v); - zynqmp::DataType data_type = zynqmp::FP32; - if (typeid(T) == typeid(float)) { - data_type = zynqmp::FP32; - } - if (typeid(T) == typeid(zynqmp::float16)) { - data_type = zynqmp::FP16; - } + zynqmp::DataType data_type = get_date_type(); + if (zynq_tensor_.get() == nullptr) { zynq_tensor_.reset(new zynqmp::Tensor()); } diff --git a/lite/core/mir/fusion/conv_activation_fuse_pass.cc b/lite/core/mir/fusion/conv_activation_fuse_pass.cc index c5ce74e30e34b5878a534010b6cf8b86f91a1118..44494bb72228bbec1b25d415d21162024cd835a0 100644 --- a/lite/core/mir/fusion/conv_activation_fuse_pass.cc +++ b/lite/core/mir/fusion/conv_activation_fuse_pass.cc @@ -25,7 +25,7 @@ namespace mir { void ConvActivationFusePass::Apply(const std::unique_ptr& graph) { std::vector act_types{"relu"}; for (auto& place : graph->valid_places()) { - if (place.target == TARGET(kCUDA)) { + if (place.target == TARGET(kCUDA) || place.target == TARGET(kFPGA)) { act_types.push_back("leaky_relu"); break; } diff --git a/lite/kernels/fpga/CMakeLists.txt b/lite/kernels/fpga/CMakeLists.txt index 0af17ecbe76523b8dcff150863661da93b73d553..76dbdabc54f6fe6e500ba8d668bedf5c338dc2dd 100755 --- a/lite/kernels/fpga/CMakeLists.txt +++ b/lite/kernels/fpga/CMakeLists.txt @@ -8,7 +8,7 @@ set(fpga_deps fpga_target_wrapper kernel_fpga) add_kernel(activation_compute_fpga FPGA basic SRCS activation_compute.cc DEPS ${fpga_deps}) # add_kernel(box_coder_compute_fpga FPGA basic SRCS box_coder_compute.cc DEPS ${fpga_deps}) -# add_kernel(concat_compute_fpga FPGA basic SRCS concat_compute.cc DEPS ${fpga_deps}) +add_kernel(concat_compute_fpga FPGA basic SRCS concat_compute.cc DEPS ${fpga_deps}) add_kernel(conv_compute_fpga FPGA basic SRCS conv_compute.cc DEPS ${fpga_deps}) # add_kernel(density_prior_box_compute_fpga FPGA basic SRCS density_prior_box_compute.cc DEPS ${fpga_deps}) @@ -28,8 +28,9 @@ add_kernel(prior_box_compute_fpga FPGA basic SRCS prior_box_compute.cc DEPS ${fp add_kernel(reshape_compute_fpga FPGA basic SRCS reshape_compute.cc DEPS ${fpga_deps} reshape_op) # add_kernel(sequence_pool_compute_fpga FPGA basic SRCS sequence_pool_compute.cc DEPS ${fpga_deps}) add_kernel(scale_compute_fpga FPGA basic SRCS scale_compute.cc DEPS ${fpga_deps}) -# add_kernel(softmax_compute_fpga FPGA basic SRCS softmax_compute.cc DEPS ${fpga_deps}) -# add_kernel(transpose_compute_fpga FPGA basic SRCS transpose_compute.cc DEPS ${fpga_deps}) +add_kernel(softmax_compute_fpga FPGA basic SRCS softmax_compute.cc DEPS ${fpga_deps}) +add_kernel(split_compute_fpga FPGA basic SRCS split_compute.cc DEPS ${fpga_deps}) +add_kernel(transpose_compute_fpga FPGA basic SRCS transpose_compute.cc DEPS ${fpga_deps}) add_kernel(io_copy_compute_fpga FPGA basic SRCS io_copy_compute.cc DEPS ${fpga_deps}) add_kernel(calib_compute_fpga FPGA basic SRCS calib_compute.cc DEPS ${fpga_deps}) diff --git a/lite/kernels/fpga/io_copy_compute.cc b/lite/kernels/fpga/io_copy_compute.cc index 4554c24e07de656b948826c2fa6f9526f61daaa6..8b515532453d41eb504fabb228e491f0d5a3c00e 100755 --- a/lite/kernels/fpga/io_copy_compute.cc +++ b/lite/kernels/fpga/io_copy_compute.cc @@ -45,21 +45,32 @@ class IoCopyHostToFpgaCompute auto& param = Param(); CHECK(param.x->target() == TARGET(kHost) || param.x->target() == TARGET(kFPGA)); - param.y->mutable_data(); - if (param.x->ZynqTensor()->aligned() && - param.x->ZynqTensor()->shape().shouldAlign()) { - zynqmp::Tensor tempTensor; - tempTensor.mutableData(zynqmp::FP16, - param.x->ZynqTensor()->shape()); - tempTensor.copyFrom(param.x->ZynqTensor()); - tempTensor.setAligned(true); - tempTensor.unalignImage(); - param.y->ZynqTensor()->copyFrom(&tempTensor); - } else { + param.x->ZynqTensor()->flush(); + + if (param.x->ZynqTensor()->dataType() == zynqmp::INT32) { + param.y->mutable_data(); param.y->ZynqTensor()->copyFrom(param.x->ZynqTensor()); + return; } - param.y->ZynqTensor()->invalidate(); - param.y->ZynqTensor()->copyScaleFrom(param.x->ZynqTensor()); + + if (param.x->ZynqTensor()->dataType() == zynqmp::FP32) { + param.y->mutable_data(); + if (param.x->ZynqTensor()->aligned() && + param.x->ZynqTensor()->shape().shouldAlign()) { + zynqmp::Tensor tempTensor; + tempTensor.mutableData(zynqmp::FP16, + param.x->ZynqTensor()->shape()); + tempTensor.copyFrom(param.x->ZynqTensor()); + tempTensor.setAligned(true); + tempTensor.unalignImage(); + param.y->ZynqTensor()->copyFrom(&tempTensor); + } else { + param.y->ZynqTensor()->copyFrom(param.x->ZynqTensor()); + } + param.y->ZynqTensor()->invalidate(); + param.y->ZynqTensor()->copyScaleFrom(param.x->ZynqTensor()); + } + auto out_lod = param.y->mutable_lod(); *out_lod = param.x->lod(); } diff --git a/lite/kernels/fpga/multiclass_nms_compute.cc b/lite/kernels/fpga/multiclass_nms_compute.cc index 4834054df6371a9faaa17bd17b53a29b999ddf03..23a5aad8e694d33cc30adec114e520620685178e 100644 --- a/lite/kernels/fpga/multiclass_nms_compute.cc +++ b/lite/kernels/fpga/multiclass_nms_compute.cc @@ -318,14 +318,29 @@ void MultiClassOutput(const Tensor& scores, void MulticlassNmsCompute::Run() { auto& param = Param(); - auto* boxes = param.bboxes; - auto* scores = param.scores; + auto* boxes_in = param.bboxes; + auto* scores_in = param.scores; auto* outs = param.out; outs->mutable_data(); - auto score_dims = scores->dims(); + auto score_dims = boxes_in->dims(); auto score_size = score_dims.size(); + Tensor boxes_float; + Tensor scores_float; + + boxes_float.Resize(boxes_in->dims()); + scores_float.Resize(scores_in->dims()); + + boxes_float.mutable_data(); + scores_float.mutable_data(); + + boxes_float.ZynqTensor()->copyFrom(boxes_in->ZynqTensor()); + scores_float.ZynqTensor()->copyFrom(scores_in->ZynqTensor()); + + Tensor* boxes = &boxes_float; + Tensor* scores = &scores_float; + auto box_dims = boxes->dims(); int64_t box_dim = boxes->dims()[2]; @@ -383,6 +398,7 @@ void MulticlassNmsCompute::Run() { MultiClassOutput( scores_slice, boxes_slice, all_indices[i], score_dims.size(), &out); outs->ZynqTensor()->copyFrom(out.ZynqTensor()); + out.ZynqTensor()->saveToFile("nms_oo", true); } outs->Resize({static_cast(e - s), out_dim}); } @@ -402,16 +418,16 @@ void MulticlassNmsCompute::Run() { } // namespace lite } // namespace paddle -REGISTER_LITE_KERNEL(multiclass_nms, - kFPGA, - kFP16, - kNHWC, - paddle::lite::kernels::fpga::MulticlassNmsCompute, - def) - .BindInput("BBoxes", {LiteType::GetTensorTy(TARGET(kHost))}) - .BindInput("Scores", {LiteType::GetTensorTy(TARGET(kHost))}) - .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kHost))}) - .Finalize(); +// REGISTER_LITE_KERNEL(multiclass_nms, +// kFPGA, +// kFP16, +// kNHWC, +// paddle::lite::kernels::fpga::MulticlassNmsCompute, +// def) +// .BindInput("BBoxes", {LiteType::GetTensorTy(TARGET(kHost))}) +// .BindInput("Scores", {LiteType::GetTensorTy(TARGET(kHost))}) +// .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kHost))}) +// .Finalize(); REGISTER_LITE_KERNEL(multiclass_nms, kFPGA, @@ -427,5 +443,8 @@ REGISTER_LITE_KERNEL(multiclass_nms, {LiteType::GetTensorTy(TARGET(kFPGA), PRECISION(kFP16), DATALAYOUT(kNHWC))}) - .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kHost))}) + .BindOutput("Out", + {LiteType::GetTensorTy(TARGET(kFPGA), + PRECISION(kFloat), + DATALAYOUT(kNHWC))}) .Finalize(); diff --git a/lite/kernels/fpga/prior_box_compute.cc b/lite/kernels/fpga/prior_box_compute.cc index afd14ccb4b4a9a4f1e93e1e38840035fb18186bb..a11e67d837b81b03a8cca753bc409509ca5833b6 100644 --- a/lite/kernels/fpga/prior_box_compute.cc +++ b/lite/kernels/fpga/prior_box_compute.cc @@ -131,3 +131,27 @@ REGISTER_LITE_KERNEL(prior_box, .BindOutput("Boxes", {LiteType::GetTensorTy(TARGET(kARM))}) .BindOutput("Variances", {LiteType::GetTensorTy(TARGET(kARM))}) .Finalize(); + +// REGISTER_LITE_KERNEL(prior_box, +// kFPGA, +// kFP16, +// kNHWC, +// paddle::lite::kernels::fpga::PriorBoxCompute, +// def) +// .BindInput("Input", +// {LiteType::GetTensorTy(TARGET(kFPGA), +// PRECISION(kFP16), +// DATALAYOUT(kNHWC))}) +// .BindInput("Image", +// {LiteType::GetTensorTy(TARGET(kFPGA), +// PRECISION(kFP16), +// DATALAYOUT(kNHWC))}) +// .BindOutput("Boxes", +// {LiteType::GetTensorTy(TARGET(kFPGA), +// PRECISION(kFP16), +// DATALAYOUT(kNHWC))}) +// .BindOutput("Variances", +// {LiteType::GetTensorTy(TARGET(kFPGA), +// PRECISION(kFP16), +// DATALAYOUT(kNHWC))}) +// .Finalize(); diff --git a/lite/kernels/fpga/split_compute.cc b/lite/kernels/fpga/split_compute.cc new file mode 100644 index 0000000000000000000000000000000000000000..518503d67ff28b209ed9d7e76d441ef46b3bfd4d --- /dev/null +++ b/lite/kernels/fpga/split_compute.cc @@ -0,0 +1,70 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "lite/kernels/fpga/split_compute.h" +#include +#include "lite/backends/arm/math/funcs.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace fpga { + +void SplitCompute::PrepareForRun() { + auto& param = Param(); + zynqmp::SplitParam& split_param = pe_.param(); + split_param.input = param.x->ZynqTensor(); + auto& dout = param.output; + for (int i = 0; i < dout.size(); i++) { + dout[i]->mutable_data(); + split_param.outputs.push_back(dout[i]->ZynqTensor()); + } + + pe_.init(); + pe_.apply(); +} + +void SplitCompute::Run() { + zynqmp::SplitParam& split_param = pe_.param(); + pe_.dispatch(); + +#ifdef FPGA_PRINT_TENSOR + auto& dout = param.output; + for (int i = 0; i < dout.size(); i++) { + Debugger::get_instance().registerOutput("split", split_param.outputs[0]); + } + +#endif +} + +} // namespace fpga +} // namespace kernels +} // namespace lite +} // namespace paddle + +REGISTER_LITE_KERNEL( + split, kFPGA, kFP16, kNHWC, paddle::lite::kernels::fpga::SplitCompute, def) + .BindInput("X", + {LiteType::GetTensorTy(TARGET(kFPGA), + PRECISION(kFP16), + DATALAYOUT(kNHWC))}) + .BindInput("AxisTensor", + {LiteType::GetTensorTy(TARGET(kARM), PRECISION(kInt32))}) + .BindInput("SectionsTensorList", + {LiteType::GetTensorTy(TARGET(kARM), PRECISION(kInt32))}) + .BindOutput("Out", + {LiteType::GetTensorTy(TARGET(kFPGA), + PRECISION(kFP16), + DATALAYOUT(kNHWC))}) + .Finalize(); diff --git a/lite/kernels/fpga/split_compute.h b/lite/kernels/fpga/split_compute.h new file mode 100644 index 0000000000000000000000000000000000000000..d7680a66495c4e31591ecf6bdcdc73e3a71d802e --- /dev/null +++ b/lite/kernels/fpga/split_compute.h @@ -0,0 +1,43 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once +#include +#include "lite/core/kernel.h" +#include "lite/core/op_registry.h" + +#include "lite/backends/fpga/KD/float16.hpp" +#include "lite/backends/fpga/KD/pes/split_pe.hpp" + +namespace paddle { +namespace lite { +namespace kernels { +namespace fpga { + +class SplitCompute + : public KernelLite { + public: + void PrepareForRun() override; + void Run() override; + + virtual ~SplitCompute() = default; + + private: + zynqmp::SplitPE pe_; +}; + +} // namespace fpga +} // namespace kernels +} // namespace lite +} // namespace paddle diff --git a/lite/kernels/fpga/transpose_compute.cc b/lite/kernels/fpga/transpose_compute.cc index e3bb813873d69d8f9d9939f06869e2640f416915..4ffeb4c82b10cee4094fbee53c7f39014e7fab84 100644 --- a/lite/kernels/fpga/transpose_compute.cc +++ b/lite/kernels/fpga/transpose_compute.cc @@ -81,7 +81,17 @@ void transposeCompute(operators::TransposeParam param) { } // Transpose -void TransposeCompute::Run() { auto& param = this->Param(); } +void TransposeCompute::Run() { + auto& param = this->Param(); + param.output->mutable_data(); + param.x->ZynqTensor()->invalidate(); + param.x->ZynqTensor()->unalignImage(); + if (param.x->dims().size() != 4) { + transposeCompute(param); + } else { + param.output->ZynqTensor()->copyFrom(param.x->ZynqTensor()); + } +} // Transpose2 void Transpose2Compute::Run() {