From a59d6fabd4a287f3676d0ffffb92a63e2fcd7619 Mon Sep 17 00:00:00 2001 From: chonwhite Date: Fri, 19 Jun 2020 10:18:11 +0800 Subject: [PATCH] arm & fpga kernel works together --- lite/backends/fpga/KD/debugger.hpp | 1 + lite/backends/fpga/KD/dispatch/action.hpp | 36 +++ .../backends/fpga/KD/dispatch/transaction.hpp | 40 +++ .../fpga/KD/dispatch/transaction_manager.hpp | 47 +++ lite/backends/fpga/KD/llapi/filter.cpp | 4 +- lite/backends/fpga/KD/pes/conv_process.hpp | 14 +- lite/backends/fpga/KD/pes/input_pe.hpp | 4 +- lite/backends/fpga/KD/pes/norm_pe.hpp | 1 + lite/backends/fpga/KD/pes/output_pe.hpp | 4 +- lite/backends/fpga/KD/pes/prior_box_pe.cpp | 7 +- lite/backends/fpga/KD/pes/prior_box_pe.hpp | 7 + .../fpga/KD/pes/{resize.hpp => resize_pe.hpp} | 36 ++- lite/backends/fpga/KD/pes/scale_pe.hpp | 39 ++- lite/backends/fpga/KD/pes/softmax_pe.cpp | 1 + lite/backends/fpga/KD/pes/split_pe.hpp | 2 +- lite/backends/fpga/KD/tensor.hpp | 35 ++- lite/backends/fpga/lite_tensor.cc | 1 + lite/backends/fpga/lite_tensor.h | 27 +- lite/core/mir/kernel_place_correct_pass.h | 143 ++++++++- lite/core/mir/static_kernel_pick_pass.h | 17 ++ lite/core/mir/type_precision_cast_pass.cc | 10 + lite/core/mir/type_target_cast_pass.cc | 11 +- lite/kernels/arm/concat_compute.cc | 32 +- lite/kernels/fpga/CMakeLists.txt | 2 + lite/kernels/fpga/calib_compute.cc | 29 +- lite/kernels/fpga/calib_compute.h | 12 + lite/kernels/fpga/concat_compute.cc | 3 +- lite/kernels/fpga/conv_compute.cc | 11 + lite/kernels/fpga/elementwise_compute.cc | 52 +++- lite/kernels/fpga/fetch_compute.cc | 24 +- lite/kernels/fpga/interpolate_compute.cc | 282 ++++++++++++++++++ lite/kernels/fpga/interpolate_compute.h | 50 ++++ lite/kernels/fpga/io_copy_compute.cc | 196 ++++++------ lite/kernels/fpga/multiclass_nms_compute.cc | 171 ++++++----- lite/kernels/fpga/prior_box_compute.cc | 3 +- lite/kernels/fpga/reshape_compute.cc | 97 ++++-- lite/kernels/fpga/reshape_compute.h | 8 + lite/kernels/fpga/scale_compute.cc | 4 +- lite/kernels/fpga/scale_compute.h | 2 + lite/kernels/fpga/softmax_compute.cc | 25 +- lite/kernels/fpga/transpose_compute.cc | 34 ++- 41 files changed, 1235 insertions(+), 289 deletions(-) create mode 100644 lite/backends/fpga/KD/dispatch/action.hpp create mode 100644 lite/backends/fpga/KD/dispatch/transaction.hpp create mode 100644 lite/backends/fpga/KD/dispatch/transaction_manager.hpp rename lite/backends/fpga/KD/pes/{resize.hpp => resize_pe.hpp} (64%) create mode 100644 lite/kernels/fpga/interpolate_compute.cc create mode 100644 lite/kernels/fpga/interpolate_compute.h diff --git a/lite/backends/fpga/KD/debugger.hpp b/lite/backends/fpga/KD/debugger.hpp index 004536fc8d..454e5db8c6 100755 --- a/lite/backends/fpga/KD/debugger.hpp +++ b/lite/backends/fpga/KD/debugger.hpp @@ -73,6 +73,7 @@ class Debugger { op_config["nms"] = true; op_config["pb_boxes"] = true; op_config["pb_variances"] = true; + op_config["reshape"] = true; op_config["softmax"] = true; op_config["split"] = true; } diff --git a/lite/backends/fpga/KD/dispatch/action.hpp b/lite/backends/fpga/KD/dispatch/action.hpp new file mode 100644 index 0000000000..0235439a07 --- /dev/null +++ b/lite/backends/fpga/KD/dispatch/action.hpp @@ -0,0 +1,36 @@ +/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+namespace paddle {
+namespace zynqmp {
+
+class Action {
+ public:
+  void readScale(float* scale) {}
+
+  void writeScale(float* scale) {}
+
+ private:
+  int id_ = -1;
+  int scaleIndex_ = -1;
+};
+
+}  // namespace zynqmp
+}  // namespace paddle
\ No newline at end of file
diff --git a/lite/backends/fpga/KD/dispatch/transaction.hpp b/lite/backends/fpga/KD/dispatch/transaction.hpp
new file mode 100644
index 0000000000..c5f19e0e4e
--- /dev/null
+++ b/lite/backends/fpga/KD/dispatch/transaction.hpp
@@ -0,0 +1,40 @@
+/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include <vector>
+
+#include "lite/backends/fpga/KD/dispatch/action.hpp"
+
+namespace paddle {
+namespace zynqmp {
+
+class Transaction {
+ public:
+  void appendAction(Action* action) { actions_.push_back(action); }
+
+  void startTransaction() {}
+
+ private:
+  std::vector<Action*> actions_;
+  int id_ = -1;
+};
+
+}  // namespace zynqmp
+}  // namespace paddle
\ No newline at end of file
diff --git a/lite/backends/fpga/KD/dispatch/transaction_manager.hpp b/lite/backends/fpga/KD/dispatch/transaction_manager.hpp
new file mode 100644
index 0000000000..b24e154402
--- /dev/null
+++ b/lite/backends/fpga/KD/dispatch/transaction_manager.hpp
@@ -0,0 +1,47 @@
+/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
*/
+
+#pragma once
+
+#include <vector>
+
+#include "lite/backends/fpga/KD/dispatch/transaction.hpp"
+
+namespace paddle {
+namespace zynqmp {
+
+class TransactionManager {
+ public:
+  static TransactionManager& get_instance() {
+    static TransactionManager s_instance;
+    return s_instance;
+  }
+
+  Transaction* getTransaction() {
+    if (currentTransaction_ == nullptr) {
+      currentTransaction_ = new Transaction();
+      transactions_.push_back(currentTransaction_);
+    }
+    return currentTransaction_;
+  }
+
+  void endTransaction() { currentTransaction_ = nullptr; }
+
+ private:
+  Transaction* currentTransaction_ = nullptr;
+  std::vector<Transaction*> transactions_;
+};
+
+}  // namespace zynqmp
+}  // namespace paddle
\ No newline at end of file
diff --git a/lite/backends/fpga/KD/llapi/filter.cpp b/lite/backends/fpga/KD/llapi/filter.cpp
index e09b9d67d1..a6dd5e7241 100755
--- a/lite/backends/fpga/KD/llapi/filter.cpp
+++ b/lite/backends/fpga/KD/llapi/filter.cpp
@@ -240,8 +240,8 @@ int8_t* format_filter(float* data_in,
   for (int n = 0; n < num; n++) {
     float* filter_start = data_in + n * chw;
     int8_t* quantized_start = quantized_data + n * chw;
-    // float f_max = find_max(filter_start, chw);
-    float f_max = max;
+    float f_max = find_max(filter_start, chw);
+    // float f_max = max;
     quantize(filter_start, quantized_start, chw, f_max);
     filter_max.push_back(f_max);
   }
diff --git a/lite/backends/fpga/KD/pes/conv_process.hpp b/lite/backends/fpga/KD/pes/conv_process.hpp
index cea22e0edc..6eed7d6080 100755
--- a/lite/backends/fpga/KD/pes/conv_process.hpp
+++ b/lite/backends/fpga/KD/pes/conv_process.hpp
@@ -264,10 +264,10 @@ inline void format_filter(Tensor* filter,
   quantized_filter->flush();
   fpga_free(quantized_data);

-  // for (size_t i = 0; i < max_values.size(); i++) {
-  //   // scales.push_back(max_values[i] / max_value);
-  //   scales.push_back(1.0f);
-  // }
+  for (size_t i = 0; i < max_values.size(); i++) {
+    scales.push_back(max_values[i] / max_value);
+    // scales.push_back(1.0f);
+  }

   // filter->saveToFile("filter.txt");
   // std::ofstream ofs;
@@ -374,17 +374,15 @@ inline void split_filter_num(const ConvParam& c_param) {
   std::vector<float> v;  // TODO(chonwhite) change variable name;
   format_filter(&new_filter, &(conv_param->filter), param.groups, v, max);
   conv_param->filter.setDataType(INT8);
-
   Tensor scale;
   Tensor bias;

   int chnnnel_start = i * filter_num_per_div;
-
   Shape s_shape(NC, {1, filter_num});
   float* scale_data = scale.mutableData<float>(FP32, s_shape);
   float* bias_data = bias.mutableData<float>(FP32, s_shape);
   for (int n = 0; n < filter_num; n++) {
-    scale_data[n] = param.scale()->data<float>()[n + chnnnel_start];
+    scale_data[n] = param.scale()->data<float>()[n + chnnnel_start] * v[n];
   }
   for (int n = 0; n < filter_num; n++) {
     bias_data[n] = param.bias()->data<float>()[n + chnnnel_start];
   }
@@ -513,7 +511,7 @@ inline void pack_channel_filter(const ConvParam& c_param) {
   float* scale_data = scale.mutableData<float>(FP32, s_shape);
   float* bias_data = bias.mutableData<float>(FP32, s_shape);
   for (int n = 0; n < filter_current_pack; n++) {
-    scale_data[n] = param.scale()->data<float>()[n + chnnnel_start];
+    scale_data[n] = param.scale()->data<float>()[n + chnnnel_start] * v[n];
   }
   for (int n = 0; n < filter_current_pack; n++) {
     bias_data[n] = param.bias()->data<float>()[n + chnnnel_start];
diff --git a/lite/backends/fpga/KD/pes/input_pe.hpp b/lite/backends/fpga/KD/pes/input_pe.hpp
index 380c85e17e..ec32658de1 100755
--- a/lite/backends/fpga/KD/pes/input_pe.hpp
+++ b/lite/backends/fpga/KD/pes/input_pe.hpp
@@ -41,7 +41,9 @@ class InputPE : public PE {
       src = &half_tensor;
     }
     output->mutableData<void>();
-    src->alignImage(output, true);
+    src->alignImage();
+    output->copyFrom(src);
+    // src->alignImage(output, true);
    return
true; } diff --git a/lite/backends/fpga/KD/pes/norm_pe.hpp b/lite/backends/fpga/KD/pes/norm_pe.hpp index 0537df27e2..a3da530736 100644 --- a/lite/backends/fpga/KD/pes/norm_pe.hpp +++ b/lite/backends/fpga/KD/pes/norm_pe.hpp @@ -103,6 +103,7 @@ class NormPE : public PE { float_out.flush(); // float_out.saveToFile("normalize_", true); param_.output->copyFrom(&float_out); + param_.output->flush(); } bool dispatch() { diff --git a/lite/backends/fpga/KD/pes/output_pe.hpp b/lite/backends/fpga/KD/pes/output_pe.hpp index 2d02d30fba..015c934a5b 100755 --- a/lite/backends/fpga/KD/pes/output_pe.hpp +++ b/lite/backends/fpga/KD/pes/output_pe.hpp @@ -56,8 +56,8 @@ class OutputPE : public PE { fpga_reset(); - auto max = fpga_get_memory_size_max(); - std::cout << "PL ===== Max: ===== :: " << max << std::endl; + // auto max = fpga_get_memory_size_max(); + // std::cout << "PL ===== Max: ===== :: " << max << std::endl; return true; } diff --git a/lite/backends/fpga/KD/pes/prior_box_pe.cpp b/lite/backends/fpga/KD/pes/prior_box_pe.cpp index 00dfe1830f..6c2f99087d 100644 --- a/lite/backends/fpga/KD/pes/prior_box_pe.cpp +++ b/lite/backends/fpga/KD/pes/prior_box_pe.cpp @@ -241,7 +241,7 @@ void PriorBoxPE::compute_prior_box() { } boxes.flush(); - boxes.syncToCPU(); + // boxes.syncToCPU(); variances.flush(); output_boxes->copyFrom(&boxes); output_variances->copyFrom(&variances); @@ -261,11 +261,12 @@ bool PriorBoxPE::dispatch() { } param_.outputBoxes->copyFrom(this->cachedBoxes_); - param_.outputVariances->copyFrom(this->cachedVariances_); + param_.outputBoxes->flush(); - param_.outputBoxes->syncToCPU(); + // param_.outputBoxes->syncToCPU(); param_.outputVariances->flush(); + return true; } } // namespace zynqmp diff --git a/lite/backends/fpga/KD/pes/prior_box_pe.hpp b/lite/backends/fpga/KD/pes/prior_box_pe.hpp index 8afe40dd30..1fe789084b 100755 --- a/lite/backends/fpga/KD/pes/prior_box_pe.hpp +++ b/lite/backends/fpga/KD/pes/prior_box_pe.hpp @@ -35,6 +35,13 @@ class PriorBoxPE : public PE { PriorBoxParam& param() { return param_; } + ~PriorBoxPE() { + if (cachedBoxes_ != nullptr) { + delete cachedBoxes_; + delete cachedVariances_; + } + } + private: PriorBoxParam param_; Tensor* cachedBoxes_ = nullptr; diff --git a/lite/backends/fpga/KD/pes/resize.hpp b/lite/backends/fpga/KD/pes/resize_pe.hpp similarity index 64% rename from lite/backends/fpga/KD/pes/resize.hpp rename to lite/backends/fpga/KD/pes/resize_pe.hpp index f83896d2c7..98728202b6 100644 --- a/lite/backends/fpga/KD/pes/resize.hpp +++ b/lite/backends/fpga/KD/pes/resize_pe.hpp @@ -73,9 +73,43 @@ class ResizePE : public PE { scale[0] = max / 127.0; scale[1] = 127.0 / max; } + void cpu_compute() { + Shape& in_shape = param_.input->shape(); + Shape& out_shape = param_.output->shape(); + int channel = in_shape.channel(); + int in_height = in_shape.height(); + int in_width = in_shape.width(); + int out_width = out_shape.width(); + int factor = out_shape.width() / in_shape.width(); + + param_.input->syncToCPU(); + + for (int h = 0; h < in_height; h++) { + for (int w = 0; w < in_width; w++) { + int src_index = in_width * channel * h + w * channel; + float16* src = param_.input->data() + src_index; + // std::cout << "src_index:" << src_index << std::endl; + for (int v = 0; v < factor; v++) { + for (int i =0; i < factor; i++) { + int dst_index = out_width * channel * h * factor + + out_width * channel * v + + w * channel * factor + + channel * i; + float16* dst = param_.output->data() + dst_index; + memcpy(dst, src, channel * sizeof(float16)); + // std::cout << 
"dst_index:" << dst_index << std::endl; + } + } + } + } + param_.output->flush(); + param_.output->copyScaleFrom(param_.input); + } + bool dispatch() { - bool ret = compute_fpga_resize(args_) == 0; + cpu_compute(); + // bool ret = compute_fpga_resize(args_) == 0; return true; } diff --git a/lite/backends/fpga/KD/pes/scale_pe.hpp b/lite/backends/fpga/KD/pes/scale_pe.hpp index 09755c65a3..b6b2daa6a2 100755 --- a/lite/backends/fpga/KD/pes/scale_pe.hpp +++ b/lite/backends/fpga/KD/pes/scale_pe.hpp @@ -141,22 +141,26 @@ class ScalePE : public PE { Tensor* output = param_.output; Tensor float_input; float* image_addr = float_input.mutableData(FP32, input->shape()); - input->syncToCPU(); + // input->syncToCPU(); + // input->invalidate(); float_input.copyFrom(input); float16* data_out = output->data(); - float* scale_data = param_.scale->data(); + float16* scale_data = param_.scale->data(); int wh = input->shape().width() * input->shape().height(); float16* in_data = input->data(); - float max = 0; for (int i = 0; i < wh; i++) { for (int c = 0; c < input->shape().channel(); c++) { int index = i * input->shape().channel() + c; - float value = half_to_float(in_data[index]) * scale_data[c]; + float x = image_addr[index]; + float y = half_to_float(scale_data[c]); + float value = x * y; + // std::cout << " x = " << std::to_string(x) << " y = " << std::to_string(y) << " v = " << std::to_string(value) << std::endl; + // float value = half_to_float(in_data[index]) * 19.3598f; data_out[index] = float_to_half(value); if (value < 0) { @@ -167,24 +171,27 @@ class ScalePE : public PE { } } } + // exit(-1); output->flush(); output->scale()[0] = max / 127.0f; output->scale()[1] = 127.0f / max; } bool dispatch() { - if (param_.scale->dataType() == FP16) { - DepthwiseConvParam& dw_param = dw_pe_.param(); - memcpy(dw_param.quantizedFilter()->mutableData(), - param_.scale->data(), - param_.scale->shape().numel() * sizeof(float16)); - dw_param.quantizedFilter()->scale()[0] = param_.scale->scale()[0]; - dw_param.quantizedFilter()->scale()[1] = param_.scale->scale()[1]; - - dw_param.quantizedFilter()->flush(); - } - param_.input->syncToDevice(); - return dw_pe_.dispatch(); + // if (param_.scale->dataType() == FP16) { + // DepthwiseConvParam& dw_param = dw_pe_.param(); + // memcpy(dw_param.quantizedFilter()->mutableData(), + // param_.scale->data(), + // param_.scale->shape().numel() * sizeof(float16)); + // dw_param.quantizedFilter()->scale()[0] = param_.scale->scale()[0]; + // dw_param.quantizedFilter()->scale()[1] = param_.scale->scale()[1]; + // dw_param.quantizedFilter()->flush(); + // } + // param_.input->syncToDevice(); + // return dw_pe_.dispatch(); + + cpu_compute(); + return true; } ScaleParam& param() { return param_; } diff --git a/lite/backends/fpga/KD/pes/softmax_pe.cpp b/lite/backends/fpga/KD/pes/softmax_pe.cpp index 099ed20b8f..7a834169fb 100755 --- a/lite/backends/fpga/KD/pes/softmax_pe.cpp +++ b/lite/backends/fpga/KD/pes/softmax_pe.cpp @@ -154,6 +154,7 @@ bool SoftmaxPE::dispatch() { float_output.flush(); output->copyFrom(&float_output); + output->flush(); return true; } diff --git a/lite/backends/fpga/KD/pes/split_pe.hpp b/lite/backends/fpga/KD/pes/split_pe.hpp index 01a0367874..8c382bbf62 100644 --- a/lite/backends/fpga/KD/pes/split_pe.hpp +++ b/lite/backends/fpga/KD/pes/split_pe.hpp @@ -105,7 +105,7 @@ class SplitPE : public PE { in_stride, out_stride[axis]); input_offset += out_stride[axis]; - // out->flush(); + out->flush(); } return true; } diff --git a/lite/backends/fpga/KD/tensor.hpp 
b/lite/backends/fpga/KD/tensor.hpp index 19f8f3b250..2cee46fb55 100644 --- a/lite/backends/fpga/KD/tensor.hpp +++ b/lite/backends/fpga/KD/tensor.hpp @@ -266,22 +266,25 @@ class Tensor { return; } BypassArgs args; - args.input_data_type = - src->dataType_ == FP32 ? DATA_TYPE_FP32 : DATA_TYPE_FP16; + args.input_data_type = src->dataType_ == FP32 ? DATA_TYPE_FP32 : DATA_TYPE_FP16; args.output_data_type = dataType_ == FP32 ? DATA_TYPE_FP32 : DATA_TYPE_FP16; args.input_layout_type = LAYOUT_HWC; args.output_layout_type = LAYOUT_HWC; - args.image = {.address = src->data(), - .scale_address = src->scale(), - .channels = (uint32_t)src->shape().numel(), - .width = 1, - .height = 1, - .pad_width = 0u, - .pad_height = 0u}; + args.image = { + .address = src->data(), + .scale_address = src->scale(), + .channels = (uint32_t)src->shape().numel(), + .width = 1, + .height = 1, + .pad_width = 0U, + .pad_height = 0U + }; ImageOutputArgs output = { - .address = data(), .scale_address = scale(), + .address = data(), + .scale_address = scale(), }; + args.output = output; size_t aligned_remainder = src->shape().numel() % 16; if (aligned_remainder > 0) { @@ -380,6 +383,10 @@ class Tensor { } void save_file_with_name(std::string path) { + // std::cout << "saving file: " << path << std::endl; + void* add = (void*)this; + // printf("tensor @: %p data: %p \n", (void *)add, (void*)data()); + // return; std::ofstream ofs; ofs.open(path); ofs << scale()[0] << " / " << scale()[1] << std::endl; @@ -399,8 +406,15 @@ class Tensor { if (dataType_ == INT32) { value = data()[i]; } + + if (i < 10) { + std::cout << value << ","; + } + ofs << value << std::endl; + } + usleep(30000); ofs.close(); } @@ -451,6 +465,7 @@ class Tensor { value = half_to_float(tensor.data()[i]); } os << value << " "; + } os << "\n"; return os; diff --git a/lite/backends/fpga/lite_tensor.cc b/lite/backends/fpga/lite_tensor.cc index 5308640495..7b79ac8915 100755 --- a/lite/backends/fpga/lite_tensor.cc +++ b/lite/backends/fpga/lite_tensor.cc @@ -102,6 +102,7 @@ void TensorLite::CopyDataFrom(const TensorLite &other) { Resize(other.dims()); auto shape = other.zynq_tensor_->shape(); zynq_tensor_->mutableData(zynq_tensor_->dataType(), shape); + precision_ = other.precision_; // this->ZynqTensor()->copyFrom(other.ZynqTensor()); memcpy(this->ZynqTensor()->data(), diff --git a/lite/backends/fpga/lite_tensor.h b/lite/backends/fpga/lite_tensor.h index 3574d466e9..c6f837db75 100644 --- a/lite/backends/fpga/lite_tensor.h +++ b/lite/backends/fpga/lite_tensor.h @@ -109,6 +109,7 @@ class TensorLite { template const R *data() const { return zynq_tensor_->data() + offset_; + // return zynq_tensor_->data(); } void Resize(const DDimLite &ddim) { dims_ = ddim; } @@ -198,7 +199,8 @@ class TensorLite { // set values of precision_ and persistable_ after updating it. // If your tensor is just a temp tensor, such as activations, // you can ignore these two attributes. 
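  // Illustrative sketch (assumed example, not upstream code): with the
  // kFloat default below, a fresh TensorLite reports a concrete precision
  // before any data is attached, and mutable_data<T>() later refines it
  // through get_precistion_type<T>(), e.g.
  //
  //   TensorLite t;                        // precision() == kFloat
  //   t.Resize({1, 3, 16, 16});
  //   t.mutable_data<zynqmp::float16>();   // precision() becomes kFP16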
- PrecisionType precision_{PrecisionType::kUnk}; + // PrecisionType precision_{PrecisionType::kUnk}; + PrecisionType precision_{PrecisionType::kFloat}; bool persistable_{false}; DDimLite dims_; @@ -235,6 +237,28 @@ zynqmp::DataType get_date_type() { return data_type; } +template +PrecisionType get_precistion_type() { + PrecisionType data_type = PrecisionType::kUnk; + if (typeid(T) == typeid(float)) { + data_type = PrecisionType::kFloat; + } + if (typeid(T) == typeid(zynqmp::float16)) { + data_type = PrecisionType::kFP16; + } + if (typeid(T) == typeid(int)) { + data_type = PrecisionType::kInt32; + } + if (typeid(T) == typeid(int32_t)) { + data_type = PrecisionType::kInt32; + } + if (typeid(T) == typeid(int8_t)) { + data_type = PrecisionType::kInt8; + } + + return data_type; +} + template R *TensorLite::mutable_data() { std::vector v; @@ -261,6 +285,7 @@ R *TensorLite::mutable_data() { } zynqmp::Shape input_shape(layout_type, v); zynqmp::DataType data_type = get_date_type(); + precision_ = get_precistion_type(); if (zynq_tensor_.get() == nullptr) { zynq_tensor_.reset(new zynqmp::Tensor()); diff --git a/lite/core/mir/kernel_place_correct_pass.h b/lite/core/mir/kernel_place_correct_pass.h index 71c6ea9273..4f7d9d110c 100644 --- a/lite/core/mir/kernel_place_correct_pass.h +++ b/lite/core/mir/kernel_place_correct_pass.h @@ -50,6 +50,7 @@ class KernelPlaceCorrectPass : public DebugPass { VLOG(4) << "lite_with_targets['kFPGA']:" << lite_with_targets["kFPGA"]; VLOG(3) << "param-type-registry:\n" << ParamTypeRegistry::Global(); + // std::cout << "" for (auto& x : graph->StmtTopologicalOrder()) { auto& inst = x->AsStmt(); // The IoCopyOp is a tool operator, it won't support the type inference. @@ -77,6 +78,80 @@ class KernelPlaceCorrectPass : public DebugPass { bool need_correct_place = true; + auto in = x->inlinks.front(); + auto out = x->outlinks.front(); + auto p = in->AsArg().type->precision(); + + std::string node_name = out->AsArg().name; + std::string arg_name = get_argname(node_name, inst.op_info()->outputs()); + + auto op_type = inst.op_type(); + + if (op_type == "reshape" || op_type == "reshape2") { + for (auto* x_in : x->inlinks) { + + std::string in_name = get_argname(x_in->AsArg().name, inst.op_info()->inputs()); + // std::cout << "name: " << x_in->AsArg().name << std::endl; + // std::cout << "in_name: " << in_name << std::endl; + if (in_name == "X") { + in = x_in; + std::cout << "found input \n"; + // exit(-1); + } + } + + p = in->AsArg().type->precision(); + if ( p != PrecisionType::kFP16) { + // std::cout << "found an arm ............... 
: " << inst.kernels().size() << std::endl; + // std::cout << "tt:" << TargetRepr(inst.kernels()[0]->target()) << std::endl; + UpdateTarget(inst, TargetType::kHost); + UpdateTensor(inst, in, out, TargetType::kHost); + } + } + + if (inst.op_type() == "fetch") { + UpdateTarget(inst, TargetType::kFPGA); + } + + if (inst.op_type() == "split" || inst.op_type() == "transpose") { + if ( p != PrecisionType::kFP16) { + UpdateTarget(inst, TargetType::kARM); + for (auto* x_out : x->outlinks) { + UpdateTensor(inst, in, x_out, TargetType::kARM); + } + } + } + + if (inst.op_type() == "concat") { + std::cout << "concat target:" << TargetRepr(inst.kernels()[0]->target()) << std::endl; + std::cout << "concat p:" << PrecisionToStr(inst.kernels()[0]->precision()) << std::endl; + if ( p != PrecisionType::kFP16) { + UpdateTarget(inst, TargetType::kARM); + UpdateTensor(inst, in, out, TargetType::kARM); + } + } + + // if (inst.op_type() == "elementwise_mul") { + + // for (auto* x_in : x->inlinks) { + + // std::string in_name = get_argname(x_in->AsArg().name, inst.op_info()->inputs()); + // std::cout << "name: " << x_in->AsArg().name << std::endl; + // std::cout << "in_name: " << in_name << std::endl; + // if (in_name == "Y") { + // in = x_in; + // std::cout << "found y \n"; + // // exit(-1); + // } + // } + + // if ( p != PrecisionType::kFP16) { + // UpdateTarget(inst, TargetType::kARM); + // UpdateTensor(inst, in, out, TargetType::kARM); + // } + // } + + std::vector in_types; std::vector out_types; for (auto* x_in : x->inlinks) { @@ -88,6 +163,21 @@ class KernelPlaceCorrectPass : public DebugPass { << "-- node name:" << node_name; auto type = inst.picked_kernel().GetInputDeclType(arg_name); + + // std::cout << arg_name <<" is weight:: " << std::to_string(x_in->AsArg().is_weight) + // << " is persist: " << std::to_string(x_in->AsArg().is_persist) << std::endl; + + // std::cout << " type: "<< inst.op_type() << std::endl; + + if (!x_in->AsArg().is_weight) { + auto p = x_in->AsArg().type->precision(); + auto t = x_in->AsArg().type->target(); + auto l = x_in->AsArg().type->layout(); + // std::cout << "p:" << PrecisionToStr(p) << std::endl; + // std::cout << "t:" << TargetRepr(t) << std::endl; + // std::cout << "layout:" << DataLayoutToStr(l) << std::endl; + } + if (!x_in->AsArg().type) { need_correct_place &= false; } else { @@ -129,18 +219,69 @@ class KernelPlaceCorrectPass : public DebugPass { need_correct_place &= (io_target_same && (in_types[0] != this_type)); if (need_correct_place) { // update this kernel's valid place; - UpdateTarget(inst, in_types[0]); + // UpdateTarget(inst, in_types[0]); } } } + // Update me's kUnk fields by other's fields. 
void UpdateTarget(mir::Node::Stmt& inst, TargetType new_target) { // NOLINT + // std::cout << "1 kernels: " << std::to_string(inst.kernels().size()) << std::endl; auto new_place = inst.place(); + new_place.target = new_target; + if (new_target == TargetType::kARM) { + new_place.precision = PrecisionType::kFloat; + new_place.layout = DataLayoutType::kNCHW; + } + + if (new_target == TargetType::kHost) { + new_place.precision = PrecisionType::kFloat; + new_place.layout = DataLayoutType::kNCHW; + } + std::vector places; places.push_back(new_place); inst.ResetKernels(places); + // std::cout << "2 kernels: " << std::to_string(inst.kernels().size()) << std::endl; + } + + void UpdateTensor(mir::Node::Stmt& inst, Node* in, Node* out, TargetType new_target = TargetType::kUnk) { + + auto get_argname = [&]( + const std::string& node_name, + const std::map>& argname_map) + -> std::string { + for (auto& ele : argname_map) { + auto it = + std::find(ele.second.begin(), ele.second.end(), node_name); + if (it != ele.second.end()) return ele.first; + } + return ""; + }; + + std::string arg_name = get_argname(out->AsArg().name, inst.op_info()->outputs()); + std::string in_name = get_argname(in->AsArg().name, inst.op_info()->inputs()); + + auto type = inst.picked_kernel().GetInputDeclType(in_name); + auto tmp_ptype = in->AsArg().type->precision(); + auto tmp_target = type->target(); + auto tmp_layout = type->layout(); + + if (new_target == TargetType::kARM) { + tmp_target = TargetType::kARM; + tmp_ptype = PrecisionType::kFloat; + tmp_layout = DataLayoutType::kNCHW; + } + + if (new_target == TargetType::kHost) { + tmp_target = TargetType::kHost; + tmp_ptype = PrecisionType::kFloat; + tmp_layout = DataLayoutType::kNCHW; + } + + out->AsArg().type = LiteType::GetTensorTy(tmp_target, tmp_ptype, tmp_layout); } }; diff --git a/lite/core/mir/static_kernel_pick_pass.h b/lite/core/mir/static_kernel_pick_pass.h index 6d45be3b89..a5e057a11b 100644 --- a/lite/core/mir/static_kernel_pick_pass.h +++ b/lite/core/mir/static_kernel_pick_pass.h @@ -144,6 +144,23 @@ class StaticKernelPickPass : public mir::StmtPass { } } + if (kernel.target() == TARGET(kFPGA)) { + final_score = 4000; + bool in_match = true; + for (size_t i = 0; i < in_names.size(); ++i) { + std::string tmp; + CHECK(instruct.op_info()->GetInputArgname(in_names[i], &tmp)); + if (in_types.count(in_names[i]) && + in_types.at(in_names[i]) != + kernel.GetInputDeclType(tmp)->precision()) { + in_match = false; + } + } + if (in_match) { + final_score = 5000; + } + } + VLOG(4) << "[score(final)]:" << final_score; VLOG(2) << "-------- pick summary for " << instruct.op_type() << " --------"; diff --git a/lite/core/mir/type_precision_cast_pass.cc b/lite/core/mir/type_precision_cast_pass.cc index 121e64dc18..87ebaeeb4b 100644 --- a/lite/core/mir/type_precision_cast_pass.cc +++ b/lite/core/mir/type_precision_cast_pass.cc @@ -134,6 +134,12 @@ void PrecisionCastPass::Apply(const std::unique_ptr& graph) { // Start from inputs of the graph, those should have place set. std::list nodes; for (auto& node : graph->StmtTopologicalOrder()) { + + // if (node->IsStmt()) { + // auto& s = node->AsStmt(); + // std::cout << "type_precision type:" << s.op_type() << std::endl; + // } + // type_precision_cast_pass nodes.push_back(node); } @@ -231,6 +237,10 @@ void PrecisionCastPass::AddCastInst( // create Op and kernels. bool in_persist = in->AsArg().is_weight || in->AsArg().is_persist; std::string cast_type = in_persist ? 
"calib_once" : "calib"; + + // TODO + cast_type = "calib"; + cast_op_output_arg->AsArg().is_persist = in_persist; auto cast_op = LiteOpRegistry::Global().Create(cast_type); CHECK(cast_op) << "create op [" << cast_op << "] failed"; diff --git a/lite/core/mir/type_target_cast_pass.cc b/lite/core/mir/type_target_cast_pass.cc index ed16211de4..89dbb4a420 100644 --- a/lite/core/mir/type_target_cast_pass.cc +++ b/lite/core/mir/type_target_cast_pass.cc @@ -32,6 +32,12 @@ void TypeTargetTransformPass::Apply(const std::unique_ptr& graph) { // Start from inputs of the graph, those should have place set. std::list nodes; for (auto& node : graph->StmtTopologicalOrder()) { + // if (node->IsStmt()) { + // auto& s = node->AsStmt(); + // // std::cout << "type_target type:" << s.op_type() << std::endl; + // }else { + // // std::cout << "type_target not a statement \n"; + // } nodes.push_back(node); } @@ -47,6 +53,7 @@ void TypeTargetTransformPass::Apply(const std::unique_ptr& graph) { ComplementInputs(graph.get(), node, in, &copied_nodes); } } + } void TypeTargetTransformPass::ComplementInputs( @@ -127,7 +134,8 @@ void TypeTargetTransformPass::AddIoCopyInst( auto* io_copy_inst = graph->NewInstructNode(); bool in_persist = in->AsArg().is_weight || in->AsArg().is_persist; - std::string io_copy_type = in_persist ? "io_copy_once" : "io_copy"; + // std::string io_copy_type = in_persist ? "io_copy_once" : "io_copy"; + std::string io_copy_type = "io_copy"; io_copy_output_arg->AsArg().is_persist = in_persist; // create Op and kernels. auto io_copy_op = LiteOpRegistry::Global().Create(io_copy_type); @@ -147,6 +155,7 @@ void TypeTargetTransformPass::AddIoCopyInst( // fix(MyPandaShaoxiang): select kernel that input_dcl_type same as in.type bool is_found = false; std::vector> selected_kernels; + std::cout << "kernels:" << std::to_string(kernels.size()) << std::endl; for (auto& kernel : kernels) { const Type* in_arg_ty = kernel->GetInputDeclType("Input"); const Type* out_arg_ty = kernel->GetOutputDeclType("Out"); diff --git a/lite/kernels/arm/concat_compute.cc b/lite/kernels/arm/concat_compute.cc index dc78e1b955..f4d23548a9 100644 --- a/lite/kernels/arm/concat_compute.cc +++ b/lite/kernels/arm/concat_compute.cc @@ -64,6 +64,7 @@ void ConcatCompute::Run() { auto& param = Param(); std::vector inputs = param.x; CHECK_GE(inputs.size(), 1); + // std::cout << "concat size:" << std::to_string(inputs.size()) << std::endl; auto* out = param.output; int axis = param.axis; auto* axis_tensor = param.axis_tensor; @@ -72,21 +73,22 @@ void ConcatCompute::Run() { axis = axis_tensor_data[0]; } - switch (inputs.front()->precision()) { - case PRECISION(kFloat): - ConcatFunc(inputs, axis, out); - break; - case PRECISION(kInt32): - ConcatFunc(inputs, axis, out); - break; - case PRECISION(kInt64): - ConcatFunc(inputs, axis, out); - break; - default: - LOG(FATAL) << "Concat does not implement for the " - << "input type:" - << static_cast(inputs.front()->precision()); - } + ConcatFunc(inputs, axis, out); + // switch (inputs.front()->precision()) { + // case PRECISION(kFloat): + // ConcatFunc(inputs, axis, out); + // break; + // case PRECISION(kInt32): + // ConcatFunc(inputs, axis, out); + // break; + // case PRECISION(kInt64): + // ConcatFunc(inputs, axis, out); + // break; + // default: + // LOG(FATAL) << "Concat does not implement for the " + // << "input type:" + // << static_cast(inputs.front()->precision()); + // } } } // namespace arm diff --git a/lite/kernels/fpga/CMakeLists.txt b/lite/kernels/fpga/CMakeLists.txt index 
7a37d19dbc..fd1f3263c8 100755 --- a/lite/kernels/fpga/CMakeLists.txt +++ b/lite/kernels/fpga/CMakeLists.txt @@ -17,6 +17,8 @@ add_kernel(conv_compute_fpga FPGA basic SRCS conv_compute.cc DEPS ${fpga_deps}) add_kernel(dropout_compute_fpga FPGA basic SRCS dropout_compute.cc DEPS ${fpga_deps}) add_kernel(elementwise_compute_fpga FPGA basic SRCS elementwise_compute.cc DEPS ${fpga_deps}) +add_kernel(interpolate_compute_fpga FPGA basic SRCS interpolate_compute.cc DEPS ${fpga_deps}) + add_kernel(fc_compute_fpga FPGA basic SRCS fc_compute.cc DEPS ${fpga_deps}) add_kernel(gru_compute_fpga FPGA extra SRCS gru_compute.cc DEPS ${fpga_deps}) diff --git a/lite/kernels/fpga/calib_compute.cc b/lite/kernels/fpga/calib_compute.cc index 25614711e7..e4f13aedd8 100755 --- a/lite/kernels/fpga/calib_compute.cc +++ b/lite/kernels/fpga/calib_compute.cc @@ -44,6 +44,17 @@ void CalibComputeFP16ToFp32::Run() { return; } +void CalibComputeFloat2Int::Run() { + auto& param = this->Param(); + const auto* din = param.input->data(); + auto* dout = param.output->mutable_data(); + // param.output->ZynqTensor()->copyFrom(param.input->ZynqTensor()); + //TODO + auto out_lod = param.output->mutable_lod(); + *out_lod = param.input->lod(); + return; +} + } // namespace fpga } // namespace kernels } // namespace lite @@ -65,12 +76,28 @@ REGISTER_LITE_KERNEL(calib, DATALAYOUT(kNHWC))}) .Finalize(); +REGISTER_LITE_KERNEL(calib, + kFPGA, + kFP16, + kNHWC, + paddle::lite::kernels::fpga::CalibComputeFloat2Int, + float_2_int_fpga) + .BindInput("Input", + {LiteType::GetTensorTy(TARGET(kARM), + PRECISION(kFloat), + DATALAYOUT(kNCHW))}) + .BindOutput("Out", + {LiteType::GetTensorTy(TARGET(kARM), + PRECISION(kInt32), + DATALAYOUT(kNCHW))}) + .Finalize(); + REGISTER_LITE_KERNEL(calib, kFPGA, kFP16, kNHWC, paddle::lite::kernels::fpga::CalibComputeFP16ToFp32, - fp16_to_fp32_fpga) + float_to_int_fpga) .BindInput("Input", {LiteType::GetTensorTy(TARGET(kFPGA), PRECISION(kFP16), diff --git a/lite/kernels/fpga/calib_compute.h b/lite/kernels/fpga/calib_compute.h index 3f5c399b9a..9701b52cd9 100644 --- a/lite/kernels/fpga/calib_compute.h +++ b/lite/kernels/fpga/calib_compute.h @@ -45,6 +45,18 @@ class CalibComputeFP16ToFp32 private: }; +class CalibComputeFloat2Int + : public KernelLite { + public: + using param_t = operators::CalibParam; + + void Run() override; + + ~CalibComputeFloat2Int() override{}; + + private: +}; + } // namespace fpga } // namespace kernels } // namespace lite diff --git a/lite/kernels/fpga/concat_compute.cc b/lite/kernels/fpga/concat_compute.cc index ad66e30981..523d357709 100755 --- a/lite/kernels/fpga/concat_compute.cc +++ b/lite/kernels/fpga/concat_compute.cc @@ -47,7 +47,8 @@ void ConcatCompute::Run() { pe_.dispatch(); #ifdef FPGA_PRINT_TENSOR zynqmp::ConcatParam& concat_param = pe_.param(); - Debugger::get_instance().registerOutput("concat", concat_param.output); + concat_param.output->flush(); + // Debugger::get_instance().registerOutput("concat", concat_param.output); #endif } diff --git a/lite/kernels/fpga/conv_compute.cc b/lite/kernels/fpga/conv_compute.cc index bd6adf6093..14de934eb3 100644 --- a/lite/kernels/fpga/conv_compute.cc +++ b/lite/kernels/fpga/conv_compute.cc @@ -51,6 +51,11 @@ void ConvCompute::PrepareForRun() { conv_param.activeParam.type = zynqmp::TYPE_RELU; } + if (param.activation_param.Leaky_relu_alpha > 0.001) { + conv_param.activeParam.type = zynqmp::TYPE_LEAKY_RELU; + conv_param.activeParam.leaky_relu_factor = param.activation_param.Leaky_relu_alpha; + } + dw_conv_pe_.init(); dw_conv_pe_.apply(); } 
else { @@ -72,9 +77,15 @@ void ConvCompute::PrepareForRun() { conv_param.activeParam.type = zynqmp::TYPE_RELU; } + if (param.activation_param.Leaky_relu_alpha > 0.001) { + conv_param.activeParam.type = zynqmp::TYPE_LEAKY_RELU; + conv_param.activeParam.leaky_relu_factor = param.activation_param.Leaky_relu_alpha; + } + conv_pe_.init(); conv_pe_.apply(); } + // std::cout << "Leaky_relu_alpha:" << param.activation_param.Leaky_relu_alpha << std::endl; } void ConvCompute::Run() { diff --git a/lite/kernels/fpga/elementwise_compute.cc b/lite/kernels/fpga/elementwise_compute.cc index 0c9df75949..1bcb7f2ae7 100755 --- a/lite/kernels/fpga/elementwise_compute.cc +++ b/lite/kernels/fpga/elementwise_compute.cc @@ -88,13 +88,33 @@ void ElementwiseMulCompute::PrepareForRun() { scale_.mutableData(zynqmp::FP16, shape); zynqmp::float16* bias_data = bias_.mutableData(zynqmp::FP16, shape); - float scale_value = param.Y->data()[0]; + zynqmp::float16 scale_value = 0; + if (param.Y->ZynqTensor()->dataType() == zynqmp::FP32) { + scale_value = zynqmp::float_to_half(param.Y->data()[0]); + // std::cout << "FP32 \n"; + } else { + scale_value = param.Y->data()[0]; + // std::cout << "FP16 \n"; + } + + // std::cout << "channel:" << channel << std::endl; + // std::cout << "production:" << param.Y->dims().production() << std::endl; + + // std::cout << "scale_value:" << std::to_string(zynqmp::half_to_float(scale_value)) << std::endl; + // exit(-1); for (int i = 0; i < channel; i++) { if (param.Y->dims().production() != 1) { - scale_value = param.Y->ZynqTensor()->data()[i]; + // scale_value = param.Y->ZynqTensor()->data()[i]; + if (param.Y->ZynqTensor()->dataType() == zynqmp::FP32) { + scale_value = zynqmp::float_to_half(param.Y->data()[i]); + } else { + scale_value = param.Y->data()[i]; + } } - scale_data[i] = zynqmp::float_to_half(scale_value); + // std::cout << "scale_value:" << std::to_string(zynqmp::half_to_float(scale_value)) << std::endl; + // exit(-1); + scale_data[i] = scale_value; bias_data[i] = zero_; } @@ -104,15 +124,17 @@ void ElementwiseMulCompute::PrepareForRun() { void ElementwiseMulCompute::Run() { auto& param = Param(); + // std::cout << "param.Y :" << param.Y->persistable() << std::endl; if (!param.Y->persistable()) { + // TODO scale_.copyFrom(param.Y->ZynqTensor()); - scale_.invalidate(); + scale_.flush();//TODO } pe_.dispatch(); #ifdef FPGA_PRINT_TENSOR zynqmp::ScaleParam& scale_param = pe_.param(); - Debugger::get_instance().registerOutput("ew_mul_in", scale_param.input); - Debugger::get_instance().registerOutput("ew_mul", scale_param.output); + // Debugger::get_instance().registerOutput("ew_mul_in", scale_param.input); + // Debugger::get_instance().registerOutput("ew_mul", scale_param.output); #endif } @@ -181,3 +203,21 @@ REGISTER_LITE_KERNEL(elementwise_mul, PRECISION(kFP16), DATALAYOUT(kNHWC))}) .Finalize(); + +REGISTER_LITE_KERNEL(elementwise_mul, + kFPGA, + kFP16, + kNHWC, + paddle::lite::kernels::fpga::ElementwiseMulCompute, + ew_mul_y_arm) + .BindInput("X", + {LiteType::GetTensorTy(TARGET(kFPGA), + PRECISION(kFP16), + DATALAYOUT(kNHWC))}) + .BindInput("Y", + {LiteType::GetTensorTy(TARGET(kARM))}) + .BindOutput("Out", + {LiteType::GetTensorTy(TARGET(kFPGA), + PRECISION(kFP16), + DATALAYOUT(kNHWC))}) + .Finalize(); \ No newline at end of file diff --git a/lite/kernels/fpga/fetch_compute.cc b/lite/kernels/fpga/fetch_compute.cc index d5c8585aae..4ece2a780d 100755 --- a/lite/kernels/fpga/fetch_compute.cc +++ b/lite/kernels/fpga/fetch_compute.cc @@ -64,18 +64,18 @@ void FetchCompute::Run() { } // 
namespace lite
} // namespace paddle

-REGISTER_LITE_KERNEL(fetch,
-                     kFPGA,
-                     kFP16,
-                     kNHWC,
-                     paddle::lite::kernels::fpga::FetchCompute,
-                     fpga_host)
-    .BindInput("X",
-               {LiteType::GetTensorTy(TARGET(kFPGA),
-                                      PRECISION(kAny),
-                                      DATALAYOUT(kAny))})
-    .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kHost))})
-    .Finalize();
+// REGISTER_LITE_KERNEL(fetch,
+//                      kFPGA,
+//                      kFP16,
+//                      kNHWC,
+//                      paddle::lite::kernels::fpga::FetchCompute,
+//                      fpga_host)
+//     .BindInput("X",
+//                {LiteType::GetTensorTy(TARGET(kFPGA),
+//                                       PRECISION(kFP16),
+//                                       DATALAYOUT(kNHWC))})
+//     .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kHost))})
+//     .Finalize();

 REGISTER_LITE_KERNEL(fetch,
                      kFPGA,
diff --git a/lite/kernels/fpga/interpolate_compute.cc b/lite/kernels/fpga/interpolate_compute.cc
new file mode 100644
index 0000000000..7358ec1bf3
--- /dev/null
+++ b/lite/kernels/fpga/interpolate_compute.cc
@@ -0,0 +1,282 @@
+// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "lite/kernels/fpga/interpolate_compute.h"
+#include <string>
+#include <vector>
+#include "lite/core/op_registry.h"
+#include "lite/core/tensor.h"
+
+namespace paddle {
+namespace lite {
+namespace kernels {
+namespace fpga {
+
+using float16 = zynqmp::float16;
+
+void BilinearInterpCompute::Run() {
+  // auto& param = Param<operators::InterpolateParam>();
+  // lite::Tensor* X = param.X;
+  // lite::Tensor* OutSize = param.OutSize;
+  // auto SizeTensor = param.SizeTensor;
+  // auto Scale = param.Scale;
+  // lite::Tensor* Out = param.Out;
+  // float scale = param.scale;
+  // int out_w = param.out_w;
+  // int out_h = param.out_h;
+  // bool align_corners = param.align_corners;
+  // std::string interp_method = "Bilinear";
+  // lite::arm::math::interpolate(X,
+  //                              OutSize,
+  //                              SizeTensor,
+  //                              Scale,
+  //                              Out,
+  //                              out_h,
+  //                              out_w,
+  //                              scale,
+  //                              align_corners,
+  //                              interp_method);
+}
+
+void nearest_interp(const float16* src,
+                    int w_in,
+                    int h_in,
+                    int c,
+                    float16* dst,
+                    int w_out,
+                    int h_out,
+                    float scale_x,
+                    float scale_y,
+                    bool with_align) {
+  float scale_w_new = (with_align)
+                          ? (static_cast<float>(w_in - 1) / (w_out - 1))
+                          : (static_cast<float>(w_in) / (w_out));
+  float scale_h_new = (with_align)
+                          ? (static_cast<float>(h_in - 1) / (h_out - 1))
+                          : (static_cast<float>(h_in) / (h_out));
+  if (with_align) {
+    for (int h = 0; h < h_out; ++h) {
+      float16* dst_p = dst + h * w_out * c;
+      int near_y = static_cast<int>(scale_h_new * h + 0.5);
+      for (int w = 0; w < w_out; ++w) {
+        int near_x = static_cast<int>(scale_w_new * w + 0.5);
+        // *dst_p++ = src[near_y * w_in + near_x];
+        const float16* src_n = src + (near_y * w_in + near_x) * c;
+        memcpy(dst_p, src_n, c * sizeof(float16));
+        dst_p += c;
+      }
+    }
+  } else {
+    for (int h = 0; h < h_out; ++h) {
+      float16* dst_p = dst + h * w_out * c;
+      int near_y = static_cast<int>(scale_h_new * h);
+      for (int w = 0; w < w_out; ++w) {
+        int near_x = static_cast<int>(scale_w_new * w);
+        const float16* src_n = src + (near_y * w_in + near_x) * c;
+        memcpy(dst_p, src_n, c * sizeof(float16));
+        dst_p += c;
+      }
+    }
+  }
+}
+
+void NearestInterpCompute::PrepareForRun() {
+  auto& param = Param<operators::InterpolateParam>();
+  lite::Tensor* X = param.X;
+  lite::Tensor* OutSize = param.OutSize;
+  lite::Tensor* Out = param.Out;
+
+  Out->mutable_data<float16>();
+
+  zynqmp::ResizeParam& norm_param = pe_.param();
+  norm_param.input = X->ZynqTensor();
+  norm_param.output = Out->ZynqTensor();
+
+  pe_.init();
+  pe_.apply();
+}
+
+// TODO(chonwhite) move these helpers to a shared header;
+inline std::vector<int> get_new_shape(
+    std::vector<const lite::Tensor*> list_new_shape_tensor) {
+  // get tensor from
+  std::vector<int> vec_new_shape;
+  for (size_t i = 0; i < list_new_shape_tensor.size(); ++i) {
+    auto tensor = list_new_shape_tensor[i];
+    vec_new_shape.push_back(static_cast<int32_t>(*tensor->data<int32_t>()));
+  }
+
+  return vec_new_shape;
+}
+
+template <typename T>
+inline std::vector<T> get_new_data_from_tensor(
+    const Tensor* new_data_tensor) {
+  std::vector<T> vec_new_data;
+  auto* new_data = new_data_tensor->data<T>();
+  lite::Tensor cpu_starts_tensor;
+  vec_new_data = std::vector<T>(
+      new_data, new_data + new_data_tensor->dims().production());
+  return vec_new_data;
+}
+
+void interpolate(lite::Tensor* X,
+                 lite::Tensor* OutSize,
+                 std::vector<const lite::Tensor*> SizeTensor,
+                 lite::Tensor* Scale,
+                 lite::Tensor* Out,
+                 int out_height,
+                 int out_width,
+                 float scale,
+                 bool with_align,
+                 std::string interpolate_type) {
+  int in_h = X->dims()[2];
+  int in_w = X->dims()[3];
+  if (SizeTensor.size() > 0) {
+    auto new_size = get_new_shape(SizeTensor);
+    out_height = new_size[0];
+    out_width = new_size[1];
+  } else {
+    auto scale_tensor = Scale;
+    if (scale_tensor != nullptr) {
+      auto scale_data = get_new_data_from_tensor<float>(scale_tensor);
+      scale = scale_data[0];
+    }
+    if (scale > 0) {
+      out_height = static_cast<int>(in_h * scale);
+      out_width = static_cast<int>(in_w * scale);
+    }
+    auto out_size = OutSize;
+    if (out_size != nullptr) {
+      auto out_size_data = get_new_data_from_tensor<int>(out_size);
+      out_height = out_size_data[0];
+      out_width = out_size_data[1];
+    }
+  }
+  float height_scale = scale;
+  float width_scale = scale;
+  if (out_width > 0 && out_height > 0) {
+    height_scale = static_cast<float>(out_height) / X->dims()[2];
+    width_scale = static_cast<float>(out_width) / X->dims()[3];
+  }
+  int num_cout = X->dims()[0];
+  int c_cout = X->dims()[1];
+  Out->Resize({num_cout, c_cout, out_height, out_width});
+
+  float16* dout = Out->mutable_data<float16>();
+  const float16* din = X->data<float16>();
+  int out_num = Out->dims()[0];
+  int out_c = Out->dims()[1];
+  int count = out_num;
+  int out_h = Out->dims()[2];
+  int out_w = Out->dims()[3];
+  int spatial_in = in_h * in_w;
+  int spatial_out = out_h * out_w;
+
+  for (int i = 0; i < count; ++i) {
+    nearest_interp(din + spatial_in * out_c * i,
+                   in_w,
+                   in_h,
+                   out_c,
+                   dout + spatial_out * out_c * i,
+                   out_w,
+                   out_h,
+                   1.f / width_scale,
+                   1.f / height_scale,
+                   with_align);
+  }
+}
+
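+// A minimal usage sketch for nearest_interp() above; the 16x16 -> 32x32
+// sizes are illustrative assumptions, not taken from any model. The function
+// recomputes the pixel mapping from the in/out sizes, so the trailing scale
+// arguments are effectively nominal, and each output pixel copies its whole
+// c-channel vector with one memcpy (channel-last FPGA layout).
+//
+//   void nearest_upscale_2x(const float16* src, float16* dst, int c) {
+//     nearest_interp(src, 16, 16, c, dst, 32, 32, 0.5f, 0.5f, false);
+//   }
+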
+void NearestInterpCompute::Run() { + auto& param = Param(); + lite::Tensor* X = param.X; + lite::Tensor* OutSize = param.OutSize; + auto SizeTensor = param.SizeTensor; + auto Scale = param.Scale; + lite::Tensor* Out = param.Out; + float scale = param.scale; + int out_w = param.out_w; + int out_h = param.out_h; + bool align_corners = param.align_corners; + + + std::string interp_method = ""; + + X->ZynqTensor()->invalidate();//TODO + X->ZynqTensor()->saveToFile("n_in", true); + interpolate(X, + OutSize, + SizeTensor, + Scale, + Out, + out_h, + out_w, + scale, + align_corners, + interp_method); + + + Out->ZynqTensor()->flush(); + Out->ZynqTensor()->copyScaleFrom(X->ZynqTensor()); + Out->ZynqTensor()->saveToFile("n_out", true); + +} + +} /* namespace fpga */ +} /* namespace kernels */ +} /* namespace lite */ +} /* namespace paddle */ + +REGISTER_LITE_KERNEL(bilinear_interp, + kFPGA, + kFP16, + kNHWC, + paddle::lite::kernels::fpga::BilinearInterpCompute, + def) + .BindInput("X", {LiteType::GetTensorTy(TARGET(kFPGA), + PRECISION(kFP16), + DATALAYOUT(kNHWC))}) + .BindInput("OutSize", + {LiteType::GetTensorTy(TARGET(kARM), PRECISION(kInt32))}) + .BindInput("SizeTensor", + {LiteType::GetTensorTy(TARGET(kARM), PRECISION(kInt32))}) + .BindInput("Scale", {LiteType::GetTensorTy(TARGET(kARM))}) + .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kFPGA), + PRECISION(kFP16), + DATALAYOUT(kNHWC))}) + .Finalize(); + +REGISTER_LITE_KERNEL(nearest_interp, + kFPGA, + kFP16, + kNHWC, + paddle::lite::kernels::fpga::NearestInterpCompute, + def) + .BindInput("X", {LiteType::GetTensorTy(TARGET(kFPGA), + PRECISION(kFP16), + DATALAYOUT(kNHWC))}) + .BindInput("OutSize", + {LiteType::GetTensorTy(TARGET(kARM), PRECISION(kInt32))}) + .BindInput("SizeTensor", + {LiteType::GetTensorTy(TARGET(kARM), PRECISION(kInt32))}) + .BindInput("Scale", {LiteType::GetTensorTy(TARGET(kARM))}) + .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kFPGA), + PRECISION(kFP16), + DATALAYOUT(kNHWC))}) + .Finalize(); diff --git a/lite/kernels/fpga/interpolate_compute.h b/lite/kernels/fpga/interpolate_compute.h new file mode 100644 index 0000000000..cc904f9364 --- /dev/null +++ b/lite/kernels/fpga/interpolate_compute.h @@ -0,0 +1,50 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+
+#pragma once
+#include "lite/core/kernel.h"
+#include "lite/core/op_registry.h"
+#include "lite/backends/fpga/KD/pes/resize_pe.hpp"
+
+namespace paddle {
+namespace lite {
+namespace kernels {
+namespace fpga {
+
+class BilinearInterpCompute
+    : public KernelLite<TARGET(kFPGA), PRECISION(kFP16), DATALAYOUT(kNHWC)> {
+ public:
+  void Run() override;
+
+  virtual ~BilinearInterpCompute() = default;
+};
+
+class NearestInterpCompute
+    : public KernelLite<TARGET(kFPGA), PRECISION(kFP16), DATALAYOUT(kNHWC)> {
+ public:
+  void PrepareForRun() override;
+
+  void Run() override;
+
+  virtual ~NearestInterpCompute() = default;
+
+ private:
+  zynqmp::ResizePE pe_;
+};
+
+} /* namespace fpga */
+} /* namespace kernels */
+} /* namespace lite */
+} /* namespace paddle */
diff --git a/lite/kernels/fpga/io_copy_compute.cc b/lite/kernels/fpga/io_copy_compute.cc
index 2fd4b0afcf..a7dbf9359f 100755
--- a/lite/kernels/fpga/io_copy_compute.cc
+++ b/lite/kernels/fpga/io_copy_compute.cc
@@ -25,10 +25,17 @@ namespace fpga {

 using float16 = zynqmp::float16;

+void copy_properties(operators::IoCopyParam& param) {
+  param.y->set_persistable(param.x->persistable());
+  auto out_lod = param.y->mutable_lod();
+  *out_lod = param.x->lod();
+  param.y->ZynqTensor()->copyScaleFrom(param.x->ZynqTensor());
+}
+
 /*
  * This kernel copies a tensor from host to FPGA space.
  */
-class IoCopyHostToFpgaCompute
+class IoCopyHostCHWToFpgaHWCCompute
     : public KernelLite<TARGET(kFPGA), PRECISION(kAny), DATALAYOUT(kAny)> {
  public:
   void Run() override {
@@ -37,52 +44,33 @@ class IoCopyHostToFpgaCompute
           param.x->target() == TARGET(kFPGA));

     param.x->ZynqTensor()->flush();
+
+
     if (param.x->ZynqTensor()->dataType() == zynqmp::INT32) {
       param.y->mutable_data<int>();
       param.y->ZynqTensor()->copyFrom(param.x->ZynqTensor());
+      param.y->ZynqTensor()->flush();
+      copy_properties(param);
       return;
     }
-    if (param.x->ZynqTensor()->dataType() == zynqmp::FP32) {
-      param.y->mutable_data<float16>();
-      if (param.x->ZynqTensor()->aligned() &&
-          param.x->ZynqTensor()->shape().shouldAlign()) {
-        zynqmp::Tensor tempTensor;
-        tempTensor.mutableData<void>(zynqmp::FP16,
-                                     param.x->ZynqTensor()->shape());
-        tempTensor.copyFrom(param.x->ZynqTensor());
-        tempTensor.setAligned(true);
-        tempTensor.unalignImage();
-        param.y->ZynqTensor()->copyFrom(&tempTensor);
-      } else {
-        param.y->ZynqTensor()->copyFrom(param.x->ZynqTensor());
-      }
-      param.y->ZynqTensor()->invalidate();
-      param.y->ZynqTensor()->copyScaleFrom(param.x->ZynqTensor());
+    param.y->mutable_data<float16>();
+    param.y->ZynqTensor()->setDataLocation(zynqmp::Device);
+    if (param.x->ZynqTensor()->aligned() &&
+        param.x->ZynqTensor()->shape().shouldAlign()) {
+      zynqmp::Tensor tempTensor;
+      tempTensor.mutableData<void>(zynqmp::FP16,
+                                   param.x->ZynqTensor()->shape());
+      tempTensor.copyFrom(param.x->ZynqTensor());
+      tempTensor.setAligned(true);
+      tempTensor.unalignImage();
+      tempTensor.flush();
+      param.y->ZynqTensor()->copyFrom(&tempTensor);
+    } else {
+      param.y->ZynqTensor()->copyFrom(param.x->ZynqTensor());
     }
-
-    auto out_lod = param.y->mutable_lod();
-    *out_lod = param.x->lod();
-  }
-
-  std::unique_ptr<type_infer_handler_t> GetTypeInferHandler() override {
-    std::unique_ptr<type_infer_handler_t> res(new type_infer_handler_t);
-    *res = [](const std::map<std::string, const Type*>& inputs,
-              const std::string& out) -> const Type* {
-      CHECK(!inputs.empty());
-      auto* type = inputs.at("Input");
-      CHECK(type->target() == TARGET(kHost));
-
-      auto out_place = type->place();
-      out_place.target = TARGET(kFPGA);
-      auto* out_type = Type::Get(type->id(),
-                                 out_place.target,
-                                 out_place.precision,
-                                 out_place.layout,
-                                 out_place.device);
-      return out_type;
-    };
-    return res;
+    copy_properties(param);
+    param.y->ZynqTensor()->invalidate();
   }

   std::string doc() const override {
return "Copy IO from HOST to FPGA"; } @@ -98,10 +86,11 @@ class IoCopyFpgaToHostCompute auto& param = Param(); CHECK(param.x->target() == TARGET(kHost) || param.x->target() == TARGET(kFPGA)); - + + param.x->ZynqTensor()->syncToDevice(); param.y->mutable_data(); param.y->ZynqTensor()->setDataType(zynqmp::FP32); - param.x->ZynqTensor()->syncToDevice(); + param.y->ZynqTensor()->setDataLocation(zynqmp::CPU); if (param.x->ZynqTensor()->aligned() && param.x->ZynqTensor()->shape().shouldAlign()) { @@ -115,10 +104,9 @@ class IoCopyFpgaToHostCompute } else { param.y->ZynqTensor()->copyFrom(param.x->ZynqTensor()); } - param.y->ZynqTensor()->copyScaleFrom(param.x->ZynqTensor()); - param.y->ZynqTensor()->flush(); - auto out_lod = param.y->mutable_lod(); - *out_lod = param.x->lod(); + + param.y->ZynqTensor()->invalidate(); + copy_properties(param); } std::string doc() const override { return "Copy IO from FPGA to HOST"; } }; @@ -153,14 +141,16 @@ class IoCopyFpgaToHostCHWCompute CHECK(param.x->target() == TARGET(kHost) || param.x->target() == TARGET(kFPGA)); - Tensor hwc; + Tensor hwc; hwc.Resize(param.y->dims()); float* hwc_data = hwc.mutable_data(); - float* chw_data = param.y->mutable_data(); param.y->ZynqTensor()->setDataType(zynqmp::FP32); param.x->ZynqTensor()->syncToDevice(); + hwc.ZynqTensor()->setDataLocation(zynqmp::CPU); + param.y->ZynqTensor()->setDataLocation(zynqmp::CPU); + if (param.x->ZynqTensor()->aligned() && param.x->ZynqTensor()->shape().shouldAlign()) { zynqmp::Tensor tempTensor; @@ -168,10 +158,30 @@ class IoCopyFpgaToHostCHWCompute param.x->ZynqTensor()->shape()); tempTensor.copyFrom(param.x->ZynqTensor()); tempTensor.setAligned(true); + // tempTensor.saveToFile("temp_1", true); tempTensor.unalignImage(); + // tempTensor.saveToFile("temp_2", true); + hwc.ZynqTensor()->copyFrom(&tempTensor); } else { - hwc.ZynqTensor()->copyFrom(param.x->ZynqTensor()); + // hwc.ZynqTensor()->copyFrom(param.x->ZynqTensor()); + float16* in_data = param.x->ZynqTensor()->data(); + // float* f_data = + param.x->ZynqTensor()->flush(); + float max = 0; + + for (int i = 0; i < param.x->dims().production(); i++) { + float value = zynqmp::half_to_float(in_data[i]); + hwc_data[i] = value; + if (value < 0) { + value = -value; + } + if (value > max) { + max = value; + } + } + param.x->ZynqTensor()->scale()[0] = max / 127; + param.x->ZynqTensor()->scale()[1] = 127 / max; } int num = 1; @@ -188,10 +198,15 @@ class IoCopyFpgaToHostCHWCompute dims.height(), dims.width()); - param.y->ZynqTensor()->copyScaleFrom(param.x->ZynqTensor()); + // param.y->ZynqTensor()->copyScaleFrom(param.x->ZynqTensor()); param.y->ZynqTensor()->flush(); - auto out_lod = param.y->mutable_lod(); - *out_lod = param.x->lod(); + copy_properties(param); + + param.x->ZynqTensor()->invalidate(); + param.x->ZynqTensor()->flush(); + // hwc.ZynqTensor()->saveToFile("hwc", true); + // param.x->ZynqTensor()->saveToFile("io2_x", true); + // param.y->ZynqTensor()->saveToFile("io2_y", true); } std::string doc() const override { return "Copy IO from FPGA to HOST"; } }; @@ -201,52 +216,36 @@ class IoCopyFpgaToHostCHWCompute } // namespace lite } // namespace paddle -// REGISTER_LITE_KERNEL(io_copy, -// kFPGA, -// kAny, -// kAny, -// paddle::lite::kernels::fpga::IoCopyHostToFpgaCompute, -// host_to_device) -// .BindInput("Input", -// {LiteType::GetTensorTy(TARGET(kHost), -// PRECISION(kAny), -// DATALAYOUT(kAny))}) -// .BindOutput("Out", -// {LiteType::GetTensorTy(TARGET(kFPGA), -// PRECISION(kAny), -// DATALAYOUT(kAny))}) -// .Finalize(); - 
REGISTER_LITE_KERNEL(io_copy, kFPGA, kAny, kAny, - paddle::lite::kernels::fpga::IoCopyHostToFpgaCompute, - host_to_device_any_any) + paddle::lite::kernels::fpga::IoCopyHostCHWToFpgaHWCCompute, + host_to_device) .BindInput("Input", - {LiteType::GetTensorTy( - TARGET(kHost), PRECISION(kAny), DATALAYOUT(kAny), -1)}) + {LiteType::GetTensorTy(TARGET(kHost), + PRECISION(kInt32), + DATALAYOUT(kAny))}) + .BindOutput("Out", + {LiteType::GetTensorTy(TARGET(kFPGA), + PRECISION(kAny), + DATALAYOUT(kAny))}) + .Finalize(); + +REGISTER_LITE_KERNEL(io_copy, + kFPGA, + kAny, + kAny, + paddle::lite::kernels::fpga::IoCopyHostCHWToFpgaHWCCompute, + host_float_chw_to_device_fp16_hwc) + .BindInput("Input", {LiteType::GetTensorTy( + TARGET(kHost), PRECISION(kFloat), DATALAYOUT(kNCHW))}) .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kFPGA), PRECISION(kFP16), DATALAYOUT(kNHWC))}) .Finalize(); -// REGISTER_LITE_KERNEL(io_copy, -// kFPGA, -// kAny, -// kAny, -// paddle::lite::kernels::fpga::IoCopyFpgaToHostCompute, -// device_to_host) -// .BindInput("Input", -// {LiteType::GetTensorTy(TARGET(kFPGA), -// PRECISION(kFP16), -// DATALAYOUT(kNHWC))}) -// .BindOutput("Out", -// {LiteType::GetTensorTy(TARGET(kHost), -// PRECISION(kFloat), -// DATALAYOUT(kNHWC))}) -// .Finalize(); REGISTER_LITE_KERNEL(io_copy, kFPGA, @@ -311,3 +310,26 @@ REGISTER_LITE_KERNEL(io_copy, // PRECISION(kAny), // DATALAYOUT(kAny))}) // .Finalize(); + + +// ========================================================== + + // std::unique_ptr GetTypeInferHandler() override { + // std::unique_ptr res(new type_infer_handler_t); + // *res = [](const std::map& inputs, + // const std::string& out) -> const Type* { + // CHECK(!inputs.empty()); + // auto* type = inputs.at("Input"); + // CHECK(type->target() == TARGET(kHost)); + + // auto out_place = type->place(); + // out_place.target = TARGET(kFPGA); + // auto* out_type = Type::Get(type->id(), + // out_place.target, + // out_place.precision, + // out_place.layout, + // out_place.device); + // return out_type; + // }; + // return res; + // } \ No newline at end of file diff --git a/lite/kernels/fpga/multiclass_nms_compute.cc b/lite/kernels/fpga/multiclass_nms_compute.cc index 23a5aad8e6..9e1e106223 100644 --- a/lite/kernels/fpga/multiclass_nms_compute.cc +++ b/lite/kernels/fpga/multiclass_nms_compute.cc @@ -94,6 +94,7 @@ T PolyIoU(const T* box1, const size_t box_size, const bool normalized) { LOG(FATAL) << "PolyIoU not implement."; + return *box1; } template @@ -128,34 +129,44 @@ void NMSFast(const Tensor& bbox, std::vector* selected_indices, const bool normalized) { // The total boxes for each instance. + // std::cout << "1\n"; int64_t num_boxes = bbox.dims()[0]; + // std::cout << "1,1\n"; // 4: [xmin ymin xmax ymax] // 8: [x1 y1 x2 y2 x3 y3 x4 y4] // 16, 24, or 32: [x1 y1 x2 y2 ... 
xn yn], n = 8, 12 or 16 int64_t box_size = bbox.dims()[1]; + // std::cout << "1,2\n"; std::vector scores_data(num_boxes); std::copy_n(scores.data(), num_boxes, scores_data.begin()); + // std::cout << "1,3\n"; std::vector> sorted_indices; + // std::cout << "1,4\n"; GetMaxScoreIndex(scores_data, score_threshold, top_k, &sorted_indices); + // std::cout << "2\n"; selected_indices->clear(); T adaptive_threshold = nms_threshold; const T* bbox_data = bbox.data(); - + // std::cout << "3\n"; while (sorted_indices.size() != 0) { const int idx = sorted_indices.front().second; + // std::cout << "4\n"; bool keep = true; for (size_t k = 0; k < selected_indices->size(); ++k) { + // std::cout << "5\n"; if (keep) { const int kept_idx = (*selected_indices)[k]; T overlap = T(0.); + // std::cout << "6\n"; // 4: [xmin ymin xmax ymax] if (box_size == 4) { overlap = JaccardOverlap(bbox_data + idx * box_size, bbox_data + kept_idx * box_size, normalized); } + // std::cout << "7\n"; // 8: [x1 y1 x2 y2 x3 y3 x4 y4] or 16, 24, 32 if (box_size == 8 || box_size == 16 || box_size == 24 || box_size == 32) { @@ -168,10 +179,13 @@ void NMSFast(const Tensor& bbox, } else { break; } + // std::cout << "8\n"; } + // std::cout << "9\n"; if (keep) { selected_indices->push_back(idx); } + // std::cout << "10\n"; sorted_indices.erase(sorted_indices.begin()); if (keep && eta < 1 && adaptive_threshold > 0.5) { adaptive_threshold *= eta; @@ -195,21 +209,25 @@ void MultiClassNMS(const operators::MulticlassNmsParam& param, T score_threshold = static_cast(param.score_threshold); int num_det = 0; - int64_t class_num = scores_size == 3 ? scores.dims()[0] : scores.dims()[1]; + int64_t class_num = scores_size == 3 ? scores.dims()[0] : scores.dims()[1]; + Tensor bbox_slice, score_slice; for (int64_t c = 0; c < class_num; ++c) { - Tensor bbox_slice, score_slice; if (c == background_label) continue; + + // std::cout << "------ 1 \n"; if (scores_size == 3) { + // std::cout << "------ scores_size = 3 \n"; scores.Slice(score_slice, c, c + 1); - bbox_slice = bboxes; + // bbox_slice = bboxes; } else { + // std::cout << "------ scores_size != 3 \n"; score_slice.Resize({scores.dims()[0], 1}); bbox_slice.Resize({scores.dims()[0], 4}); SliceOneClass(scores, c, &score_slice); SliceOneClass(bboxes, c, &bbox_slice); } - NMSFast(bboxes, + NMSFast(bboxes,// TODO score_slice, score_threshold, nms_threshold, @@ -226,8 +244,6 @@ void MultiClassNMS(const operators::MulticlassNmsParam& param, *num_nmsed_out = num_det; const T* scores_data = scores.data(); if (keep_top_k > -1 && num_det > keep_top_k) { - Tensor score_slice; - const T* sdata; std::vector>> score_index_pairs; for (const auto& it : *indices) { @@ -275,7 +291,9 @@ void MultiClassOutput(const Tensor& scores, const Tensor& bboxes, const std::map>& selected_indices, const int scores_size, - Tensor* outs) { + Tensor* outs, + int* oindices = nullptr, + const int offset = 0) { int64_t class_num = scores.dims()[1]; int64_t predict_dim = scores.dims()[1]; int64_t box_size = bboxes.dims()[1]; @@ -305,9 +323,15 @@ void MultiClassOutput(const Tensor& scores, if (scores_size == 3) { bdata = bboxes_data + idx * box_size; odata[count * out_dim + 1] = sdata[idx]; // score + if (oindices != nullptr) { + oindices[count] = offset + idx; + } } else { bdata = bbox.data() + idx * box_size; odata[count * out_dim + 1] = *(scores_data + idx * class_num + label); + if (oindices != nullptr) { + oindices[count] = offset + idx * class_num + label; + } } // xmin, ymin, xmax, ymax or multi-points coordinates std::memcpy(odata + 
count * out_dim + 2, bdata, box_size * sizeof(T)); @@ -318,36 +342,18 @@ void MultiClassOutput(const Tensor& scores, void MulticlassNmsCompute::Run() { auto& param = Param(); - auto* boxes_in = param.bboxes; - auto* scores_in = param.scores; + auto* boxes = param.bboxes; + auto* scores = param.scores; auto* outs = param.out; - outs->mutable_data(); - - auto score_dims = boxes_in->dims(); + bool return_index = param.index ? true : false; + auto* index = param.index; + auto score_dims = scores->dims(); auto score_size = score_dims.size(); - Tensor boxes_float; - Tensor scores_float; - - boxes_float.Resize(boxes_in->dims()); - scores_float.Resize(scores_in->dims()); - - boxes_float.mutable_data(); - scores_float.mutable_data(); - - boxes_float.ZynqTensor()->copyFrom(boxes_in->ZynqTensor()); - scores_float.ZynqTensor()->copyFrom(scores_in->ZynqTensor()); - - Tensor* boxes = &boxes_float; - Tensor* scores = &scores_float; - - auto box_dims = boxes->dims(); - int64_t box_dim = boxes->dims()[2]; - std::vector>> all_indices; std::vector batch_starts = {0}; int64_t batch_size = score_dims[0]; - + int64_t box_dim = boxes->dims()[2]; int64_t out_dim = box_dim + 2; int num_nmsed_out = 0; Tensor boxes_slice, scores_slice; @@ -372,79 +378,104 @@ void MulticlassNmsCompute::Run() { uint64_t num_kept = batch_starts.back(); if (num_kept == 0) { - outs->Resize({1, 1}); - float* od = outs->mutable_data(); - od[0] = -1; - batch_starts = {0, 1}; + if (return_index) { + outs->Resize({0, out_dim}); + index->Resize({0, 1}); + } else { + outs->Resize({1, 1}); + float* od = outs->mutable_data(); + od[0] = -1; + batch_starts = {0, 1}; + } } else { outs->Resize({static_cast(num_kept), out_dim}); + outs->mutable_data(); + int offset = 0; + int* oindices = nullptr; for (int i = 0; i < n; ++i) { if (score_size == 3) { scores->Slice(scores_slice, i, i + 1); boxes->Slice(boxes_slice, i, i + 1); scores_slice.Resize({score_dims[1], score_dims[2]}); boxes_slice.Resize({score_dims[2], box_dim}); + if (return_index) { + offset = i * score_dims[2]; + } } else { auto boxes_lod = boxes->lod().back(); scores->Slice(scores_slice, boxes_lod[i], boxes_lod[i + 1]); boxes->Slice(boxes_slice, boxes_lod[i], boxes_lod[i + 1]); + if (return_index) { + offset = boxes_lod[i] * score_dims[1]; + } } int64_t s = static_cast(batch_starts[i]); int64_t e = static_cast(batch_starts[i + 1]); - if (e > s) { Tensor out; outs->Slice(out, s, e); - MultiClassOutput( - scores_slice, boxes_slice, all_indices[i], score_dims.size(), &out); + if (return_index) { + index->Resize({static_cast(num_kept), 1}); + int* output_idx = index->mutable_data(); + oindices = output_idx + s; + } + MultiClassOutput(scores_slice, + boxes_slice, + all_indices[i], + score_dims.size(), + &out, + oindices, + offset); + // out.ZynqTensor()->saveToFile("nms_o", true); outs->ZynqTensor()->copyFrom(out.ZynqTensor()); - out.ZynqTensor()->saveToFile("nms_oo", true); + outs->ZynqTensor()->flush(); } - outs->Resize({static_cast(e - s), out_dim}); } } + LoD lod; lod.emplace_back(batch_starts); + if (return_index) { + index->set_lod(lod); + } outs->set_lod(lod); -#ifdef FPGA_PRINT_TENSOR - Debugger::get_instance().registerOutput("boxes", boxes->ZynqTensor()); - Debugger::get_instance().registerOutput("scores", scores->ZynqTensor()); - Debugger::get_instance().registerOutput("nms", outs->ZynqTensor()); -#endif + // boxes->ZynqTensor()->saveToFile("boxes", true); + // scores->ZynqTensor()->saveToFile("scores", true); + // outs->ZynqTensor()->saveToFile("nms", true); } } // namespace fpga } // 
namespace kernels } // namespace lite } // namespace paddle -// REGISTER_LITE_KERNEL(multiclass_nms, -// kFPGA, -// kFP16, -// kNHWC, -// paddle::lite::kernels::fpga::MulticlassNmsCompute, -// def) -// .BindInput("BBoxes", {LiteType::GetTensorTy(TARGET(kHost))}) -// .BindInput("Scores", {LiteType::GetTensorTy(TARGET(kHost))}) -// .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kHost))}) -// .Finalize(); - REGISTER_LITE_KERNEL(multiclass_nms, kFPGA, kFP16, kNHWC, paddle::lite::kernels::fpga::MulticlassNmsCompute, - def2) - .BindInput("BBoxes", - {LiteType::GetTensorTy(TARGET(kFPGA), - PRECISION(kFP16), - DATALAYOUT(kNHWC))}) - .BindInput("Scores", - {LiteType::GetTensorTy(TARGET(kFPGA), - PRECISION(kFP16), - DATALAYOUT(kNHWC))}) - .BindOutput("Out", - {LiteType::GetTensorTy(TARGET(kFPGA), - PRECISION(kFloat), - DATALAYOUT(kNHWC))}) + def) + .BindInput("BBoxes", {LiteType::GetTensorTy(TARGET(kARM))}) + .BindInput("Scores", {LiteType::GetTensorTy(TARGET(kARM))}) + .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kARM))}) .Finalize(); + +// REGISTER_LITE_KERNEL(multiclass_nms, +// kFPGA, +// kFP16, +// kNHWC, +// paddle::lite::kernels::fpga::MulticlassNmsCompute, +// def2) +// .BindInput("BBoxes", +// {LiteType::GetTensorTy(TARGET(kFPGA), +// PRECISION(kFP16), +// DATALAYOUT(kNHWC))}) +// .BindInput("Scores", +// {LiteType::GetTensorTy(TARGET(kFPGA), +// PRECISION(kFP16), +// DATALAYOUT(kNHWC))}) +// .BindOutput("Out", +// {LiteType::GetTensorTy(TARGET(kFPGA), +// PRECISION(kFloat), +// DATALAYOUT(kNHWC))}) +// .Finalize(); diff --git a/lite/kernels/fpga/prior_box_compute.cc b/lite/kernels/fpga/prior_box_compute.cc index e1f361440c..c19744fa52 100644 --- a/lite/kernels/fpga/prior_box_compute.cc +++ b/lite/kernels/fpga/prior_box_compute.cc @@ -64,7 +64,7 @@ void PriorBoxCompute::PrepareForRun() { float offset = param.offset; std::vector aspect_ratios_vec; ExpandAspectRatios(aspect_ratio, is_flip, &aspect_ratios_vec); - size_t prior_num = aspect_ratios_vec.size() * min_size.size(); + int prior_num = aspect_ratios_vec.size() * min_size.size(); prior_num += max_size.size(); std::vector order = param.order; bool min_max_aspect_ratios_order = param.min_max_aspect_ratios_order; @@ -78,6 +78,7 @@ void PriorBoxCompute::PrepareForRun() { param.boxes->mutable_data(); param.variances->mutable_data(); + zynqmp::PriorBoxParam& priobox_param = pe_.param(); priobox_param.input = param.input->ZynqTensor(); priobox_param.image = param.image->ZynqTensor(); diff --git a/lite/kernels/fpga/reshape_compute.cc b/lite/kernels/fpga/reshape_compute.cc index b79051f5b1..24c60f54ef 100644 --- a/lite/kernels/fpga/reshape_compute.cc +++ b/lite/kernels/fpga/reshape_compute.cc @@ -23,31 +23,64 @@ namespace fpga { using float16 = zynqmp::float16; -void ReshapeCompute::Run() { + +void FlattenCompute::Run() { auto& param = Param(); - param.output->mutable_data(); auto x = param.x; - // auto actual_shape = param.actual_shape; - Tensor* actual_shape = nullptr; // TODO(chonwhite) change it. 
auto output = param.output; - bool inplace = param.inplace; - auto x_dims = x->dims(); + output->mutable_data(); auto output_dims = output->dims(); - if (actual_shape) { - auto actual_shape_dims = actual_shape->dims(); - auto* actual_shape_data = actual_shape->data(); - auto shape = std::vector( - actual_shape_data, actual_shape_data + actual_shape_dims.production()); - // output_dims = lite::operators::ValidateShape(shape, x_dims); //TODO - output->Resize(output_dims); + if (param.inplace) { + output->ShareDataWith(*x); + } else { + // output->CopyDataFrom(*x); } - // if (inplace) { - // output->ShareDataWith(*x); - // } else { - // output->CopyDataFrom(*x); - // } + x->ZynqTensor()->unalignImage(); + // x->ZynqTensor()->saveToFile("fi", true); + output->ZynqTensor()->copyFrom(x->ZynqTensor()); + // output->ZynqTensor()->saveToFile("fo", true); + output->ZynqTensor()->flush(); + output->ZynqTensor()->setAligned(x->ZynqTensor()->aligned()); output->Resize(output_dims); + +#ifdef FPGA_PRINT_TENSOR + Debugger::get_instance().registerOutput("flatten", + output->ZynqTensor()); +#endif +} + + +void ReshapeCompute::Run() { + auto& param = Param(); + auto x = param.x; + auto output = param.output; + auto output_dims = output->dims(); + + x->ZynqTensor()->unalignImage(); + + // x->ZynqTensor()->saveToFile("ri", true); + + output->Resize(output_dims); + output->mutable_data(); + + if (param.inplace) { + output->ShareDataWith(*x); + } else { + // output->CopyDataFrom(*x); + } + + + + output->ZynqTensor()->copyFrom(x->ZynqTensor()); + // output->ZynqTensor()->saveToFile("ro", true); + output->ZynqTensor()->flush(); + output->ZynqTensor()->setAligned(x->ZynqTensor()->aligned()); + +#ifdef FPGA_PRINT_TENSOR + Debugger::get_instance().registerOutput("reshape", + output->ZynqTensor()); +#endif } } // namespace fpga @@ -66,9 +99,9 @@ REGISTER_LITE_KERNEL(reshape, PRECISION(kFP16), DATALAYOUT(kNHWC))}) .BindInput("Shape", - {LiteType::GetTensorTy(TARGET(kFPGA), - PRECISION(kFP16), - DATALAYOUT(kNHWC))}) + {LiteType::GetTensorTy(TARGET(kHost), + PRECISION(kAny), + DATALAYOUT(kAny))}) .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kFPGA), PRECISION(kFP16), @@ -86,9 +119,9 @@ REGISTER_LITE_KERNEL(reshape2, PRECISION(kFP16), DATALAYOUT(kNHWC))}) .BindInput("Shape", - {LiteType::GetTensorTy(TARGET(kFPGA), - PRECISION(kFP16), - DATALAYOUT(kNHWC))}) + {LiteType::GetTensorTy(TARGET(kHost), + PRECISION(kAny), + DATALAYOUT(kAny))}) .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kFPGA), PRECISION(kFP16), @@ -103,16 +136,16 @@ REGISTER_LITE_KERNEL(flatten, kFPGA, kFP16, kNHWC, - paddle::lite::kernels::fpga::ReshapeCompute, + paddle::lite::kernels::fpga::FlattenCompute, def) .BindInput("X", {LiteType::GetTensorTy(TARGET(kFPGA), PRECISION(kFP16), DATALAYOUT(kNHWC))}) .BindInput("Shape", - {LiteType::GetTensorTy(TARGET(kFPGA), - PRECISION(kFP16), - DATALAYOUT(kNHWC))}) + {LiteType::GetTensorTy(TARGET(kHost), + PRECISION(kAny), + DATALAYOUT(kAny))}) .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kFPGA), PRECISION(kFP16), @@ -123,16 +156,16 @@ REGISTER_LITE_KERNEL(flatten2, kFPGA, kFP16, kNHWC, - paddle::lite::kernels::fpga::ReshapeCompute, + paddle::lite::kernels::fpga::FlattenCompute, def) .BindInput("X", {LiteType::GetTensorTy(TARGET(kFPGA), PRECISION(kFP16), DATALAYOUT(kNHWC))}) .BindInput("Shape", - {LiteType::GetTensorTy(TARGET(kFPGA), - PRECISION(kFP16), - DATALAYOUT(kNHWC))}) + {LiteType::GetTensorTy(TARGET(kHost), + PRECISION(kAny), + DATALAYOUT(kAny))}) .BindOutput("Out", 
{LiteType::GetTensorTy(TARGET(kFPGA), PRECISION(kFP16), diff --git a/lite/kernels/fpga/reshape_compute.h b/lite/kernels/fpga/reshape_compute.h index cc5ed0b565..8a3b3c266e 100755 --- a/lite/kernels/fpga/reshape_compute.h +++ b/lite/kernels/fpga/reshape_compute.h @@ -30,6 +30,14 @@ class ReshapeCompute virtual ~ReshapeCompute() = default; }; +class FlattenCompute + : public KernelLite { + public: + void Run() override; + + virtual ~FlattenCompute() = default; +}; + class ReshapeComputeFpgaToHost : public KernelLite { public: diff --git a/lite/kernels/fpga/scale_compute.cc b/lite/kernels/fpga/scale_compute.cc index 991c73f295..f28fbf736c 100755 --- a/lite/kernels/fpga/scale_compute.cc +++ b/lite/kernels/fpga/scale_compute.cc @@ -29,8 +29,8 @@ void ScaleCompute::PrepareForRun() { scale_param.output = param.output->ZynqTensor(); int channel = scale_param.input->shape().channel(); - zynqmp::Tensor* scale = new zynqmp::Tensor(); - zynqmp::Tensor* bias = new zynqmp::Tensor(); + zynqmp::Tensor* scale = &scale_; + zynqmp::Tensor* bias = &bias_; zynqmp::Shape shape(zynqmp::N, {channel}); float* scale_data = scale->mutableData(zynqmp::FP32, shape); float* bias_data = bias->mutableData(zynqmp::FP32, shape); diff --git a/lite/kernels/fpga/scale_compute.h b/lite/kernels/fpga/scale_compute.h index 217399db72..10ddf04ca7 100755 --- a/lite/kernels/fpga/scale_compute.h +++ b/lite/kernels/fpga/scale_compute.h @@ -37,6 +37,8 @@ class ScaleCompute private: zynqmp::ScalePE pe_; + zynqmp::Tensor scale_; + zynqmp::Tensor bias_; }; } // namespace fpga diff --git a/lite/kernels/fpga/softmax_compute.cc b/lite/kernels/fpga/softmax_compute.cc index b13b5f0f46..25fceda569 100755 --- a/lite/kernels/fpga/softmax_compute.cc +++ b/lite/kernels/fpga/softmax_compute.cc @@ -26,7 +26,8 @@ void SoftmaxCompute::PrepareForRun() { zynqmp::SoftmaxParam& softmax_param = pe_.param(); auto& param = Param(); - param.output->mutable_data(); + // param.output->mutable_data(); + param.output->mutable_data(); softmax_param.input = param.x->ZynqTensor(); softmax_param.output = param.output->ZynqTensor(); pe_.init(); @@ -34,9 +35,13 @@ void SoftmaxCompute::PrepareForRun() { } void SoftmaxCompute::Run() { + zynqmp::SoftmaxParam& softmax_param = pe_.param(); + // softmax_param.input->saveToFile("softmax_in", true); pe_.dispatch(); + + softmax_param.output->flush(); + // softmax_param.output->saveToFile("softmax", true); #ifdef FPGA_PRINT_TENSOR - zynqmp::SoftmaxParam& softmax_param = pe_.param(); Debugger::get_instance().registerOutput("softmax", softmax_param.output); #endif } @@ -57,7 +62,17 @@ REGISTER_LITE_KERNEL(softmax, PRECISION(kFP16), DATALAYOUT(kNHWC))}) .BindOutput("Out", - {LiteType::GetTensorTy(TARGET(kFPGA), - PRECISION(kFP16), - DATALAYOUT(kNHWC))}) + {LiteType::GetTensorTy(TARGET(kARM))}) .Finalize(); + + + + + + + + +// .BindOutput("Out", +// {LiteType::GetTensorTy(TARGET(kFPGA), +// PRECISION(kFP16), +// DATALAYOUT(kNHWC))}) \ No newline at end of file diff --git a/lite/kernels/fpga/transpose_compute.cc b/lite/kernels/fpga/transpose_compute.cc index 4ffeb4c82b..5f55ae3d9f 100644 --- a/lite/kernels/fpga/transpose_compute.cc +++ b/lite/kernels/fpga/transpose_compute.cc @@ -34,17 +34,17 @@ void transposeCompute(operators::TransposeParam param) { input_x->ZynqTensor()->invalidate(); input_x->ZynqTensor()->unalignImage(); - Tensor float_input; - float_input.Resize(input_x_dims); - float_input.mutable_data(); - float_input.ZynqTensor()->copyFrom(input_x->ZynqTensor()); + // Tensor float_input; + // float_input.Resize(input_x_dims); 
+ // float_input.mutable_data(); + // float_input.ZynqTensor()->copyFrom(input_x->ZynqTensor()); - const auto* input_x_data = float_input.data(); + const auto* input_x_data = input_x->data(); auto* out = param.output; const auto axis = param.axis; - auto* out_data = out->mutable_data(); + auto* out_data = out->mutable_data(); size_t ndim = axis.size(); std::vector xdim(ndim); @@ -84,10 +84,11 @@ void transposeCompute(operators::TransposeParam param) { void TransposeCompute::Run() { auto& param = this->Param(); param.output->mutable_data(); - param.x->ZynqTensor()->invalidate(); + // param.x->ZynqTensor()->invalidate(); param.x->ZynqTensor()->unalignImage(); if (param.x->dims().size() != 4) { transposeCompute(param); + param.output->ZynqTensor()->setAligned(param.x->ZynqTensor()->aligned()); } else { param.output->ZynqTensor()->copyFrom(param.x->ZynqTensor()); } @@ -96,14 +97,25 @@ void TransposeCompute::Run() { // Transpose2 void Transpose2Compute::Run() { auto& param = this->Param(); - param.output->mutable_data(); - param.x->ZynqTensor()->invalidate(); + param.output->mutable_data(); + // param.x->ZynqTensor()->syncToCPU(); + // param.x->ZynqTensor()->saveToFile("t_in", true); param.x->ZynqTensor()->unalignImage(); + // param.x->ZynqTensor()->saveToFile("t_unaligned", true); + param.x->ZynqTensor()->flush(); + param.x->ZynqTensor()->invalidate(); + if (param.x->dims().size() != 4) { transposeCompute(param); + param.output->ZynqTensor()->setAligned(param.x->ZynqTensor()->aligned()); } else { param.output->ZynqTensor()->copyFrom(param.x->ZynqTensor()); } + + // param.output->ZynqTensor()->copyFrom(param.x->ZynqTensor()); + + param.output->ZynqTensor()->flush(); + // param.output->ZynqTensor()->saveToFile("Transpose2", true); } } // namespace fpga @@ -139,6 +151,8 @@ REGISTER_LITE_KERNEL(transpose2, {LiteType::GetTensorTy(TARGET(kFPGA), PRECISION(kFP16), DATALAYOUT(kNHWC))}) - .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kARM))}) + .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kFPGA), + PRECISION(kFP16), + DATALAYOUT(kNHWC))}) .BindOutput("XShape", {LiteType::GetTensorTy(TARGET(kARM))}) .Finalize(); -- GitLab
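// ==========================================================================
// [editor's notes on the patch above]
//
// 1) NMSFast keeps the greedy suppression loop with an adaptive threshold:
// after each kept box, if eta < 1 and the threshold is still above 0.5, the
// threshold decays by eta, suppressing more aggressively as boxes
// accumulate. A self-contained sketch of that logic for the common 4-coord
// normalized-box case (function and parameter names are illustrative, not
// the patch's API):
#include <algorithm>
#include <utility>
#include <vector>

// IoU of two axis-aligned boxes stored as [xmin, ymin, xmax, ymax].
static float iou(const float* a, const float* b) {
  float ix0 = std::max(a[0], b[0]);
  float iy0 = std::max(a[1], b[1]);
  float ix1 = std::min(a[2], b[2]);
  float iy1 = std::min(a[3], b[3]);
  float inter = std::max(0.0f, ix1 - ix0) * std::max(0.0f, iy1 - iy0);
  float area_a = (a[2] - a[0]) * (a[3] - a[1]);
  float area_b = (b[2] - b[0]) * (b[3] - b[1]);
  float uni = area_a + area_b - inter;
  return uni > 0.0f ? inter / uni : 0.0f;
}

std::vector<int> greedy_nms(const std::vector<float>& boxes,   // n * 4
                            const std::vector<float>& scores,  // n
                            float score_threshold,
                            float nms_threshold,
                            float eta) {
  std::vector<std::pair<float, int>> order;
  for (int i = 0; i < static_cast<int>(scores.size()); ++i) {
    if (scores[i] > score_threshold) order.emplace_back(scores[i], i);
  }
  std::sort(order.begin(), order.end(),
            [](const std::pair<float, int>& a,
               const std::pair<float, int>& b) { return a.first > b.first; });
  std::vector<int> keep;
  float adaptive = nms_threshold;  // decays when eta < 1, as in NMSFast
  for (const auto& cand : order) {
    bool ok = true;
    for (int k : keep) {
      if (iou(&boxes[cand.second * 4], &boxes[k * 4]) > adaptive) {
        ok = false;
        break;
      }
    }
    if (ok) {
      keep.push_back(cand.second);
      if (eta < 1.0f && adaptive > 0.5f) adaptive *= eta;
    }
  }
  return keep;
}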
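// 2) For non-4D tensors, transposeCompute permutes axes element-by-element
// using source strides. A generic sketch of that index-remapping scheme
// (element type generalized to float here; the patched version now reads
// and writes float16 buffers directly):
#include <vector>

void transpose_nd(const float* in, float* out,
                  const std::vector<int>& in_dims,  // source shape
                  const std::vector<int>& axis) {   // out dim i = in dim axis[i]
  size_t ndim = axis.size();
  std::vector<int> out_dims(ndim);
  for (size_t i = 0; i < ndim; ++i) out_dims[i] = in_dims[axis[i]];

  // Row-major strides of the source shape.
  std::vector<int> in_stride(ndim, 1);
  for (int i = static_cast<int>(ndim) - 2; i >= 0; --i) {
    in_stride[i] = in_stride[i + 1] * in_dims[i + 1];
  }

  int total = 1;
  for (int d : out_dims) total *= d;

  std::vector<int> idx(ndim, 0);  // current N-D index into the output
  for (int o = 0; o < total; ++o) {
    int src = 0;
    for (size_t i = 0; i < ndim; ++i) src += idx[i] * in_stride[axis[i]];
    out[o] = in[src];
    // Advance the output index, last dimension fastest.
    for (int i = static_cast<int>(ndim) - 1; i >= 0; --i) {
      if (++idx[i] < out_dims[i]) break;
      idx[i] = 0;
    }
  }
}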
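// 3) The scale_compute change swaps the `new zynqmp::Tensor()` locals for
// the new scale_/bias_ members, so the buffers are no longer leaked on each
// PrepareForRun call. A compilable sketch of the ownership pattern (the
// Tensor stand-in and channel count below are hypothetical):
#include <vector>

struct Tensor {
  std::vector<float> data;  // stand-in for zynqmp::Tensor storage
};

class ScaleKernel {
 public:
  void PrepareForRun() {
    // Before: Tensor* scale = new Tensor();  // leaked; nothing deleted it.
    // After: point at members whose lifetime matches the kernel object.
    Tensor* scale = &scale_;
    Tensor* bias = &bias_;
    scale->data.assign(channel_, 1.0f);  // identity per-channel scale
    bias->data.assign(channel_, 0.0f);   // zero per-channel bias
  }

 private:
  int channel_ = 8;  // hypothetical channel count
  Tensor scale_;
  Tensor bias_;
};
// ==========================================================================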