diff --git a/lite/backends/fpga/KD/dispatch/action.hpp b/lite/backends/fpga/KD/dispatch/action.hpp index 0235439a0704c65cabcd397d97141f48dc254dec..9a3653be804b750a6eb487c02ee5a8a8b88c4e52 100644 --- a/lite/backends/fpga/KD/dispatch/action.hpp +++ b/lite/backends/fpga/KD/dispatch/action.hpp @@ -18,19 +18,14 @@ namespace paddle { namespace zynqmp { class Action { -public: - void readScale(float* scale) { + public: + void readScale(float* scale) {} - } + void writeScale(float* scale) {} - void writeScale(float* scale) { - - } - -private: + private: int id_ = -1; int scaleIndex_ = -1; } - } -} \ No newline at end of file +} diff --git a/lite/backends/fpga/KD/dispatch/transaction.hpp b/lite/backends/fpga/KD/dispatch/transaction.hpp index c5f19e0e4eb2457472c86de811dd08017bc835bd..6081c720ca62f08ceff8505c7ed410233bd29627 100644 --- a/lite/backends/fpga/KD/dispatch/transaction.hpp +++ b/lite/backends/fpga/KD/dispatch/transaction.hpp @@ -21,20 +21,16 @@ namespace paddle { namespace zynqmp { class Transaction { + public: + void appendAction(Action* action) { actions_.push_back(action); }; -public: - void appendAction(Action* action) { - actions_.push_back(action); - }; + void startTraction(){ - void startTraction() { - }; -private: + private: std::std::vector actions_; int id_ = -1; } - } -} \ No newline at end of file +} diff --git a/lite/backends/fpga/KD/dispatch/transaction_manager.hpp b/lite/backends/fpga/KD/dispatch/transaction_manager.hpp index b24e154402f335c8e95d195e9dd677f3fd3a6280..149f0e0073c28abed5d2f0cc8f253374c77f0555 100644 --- a/lite/backends/fpga/KD/dispatch/transaction_manager.hpp +++ b/lite/backends/fpga/KD/dispatch/transaction_manager.hpp @@ -20,7 +20,7 @@ namespace paddle { namespace zynqmp { class TransactionManager { -public: + public: static TransactionManager& get_instance() { static TransactionManager s_instance; return s_instance; @@ -34,14 +34,11 @@ public: return currentTransaction_; }; - void endTransaction() { - currentTransaction_ = nullptr; - } + void endTransaction() { currentTransaction_ = nullptr; } -private: + private: Transaction* currentTransaction_ = nullptr; std::vector transactions_; } - } -} \ No newline at end of file +} diff --git a/lite/backends/fpga/KD/io.cpp b/lite/backends/fpga/KD/io.cpp index 868287d937bf73a6e34af736b4f94a29f34036ca..2a36a8ec89627d9e07c516c31fb5ca04c2a8d0d9 100644 --- a/lite/backends/fpga/KD/io.cpp +++ b/lite/backends/fpga/KD/io.cpp @@ -15,8 +15,5 @@ limitations under the License. */ #include "io.hpp" namespace paddle { -namespace zynqmp { - - -} // namespace zynqmp +namespace zynqmp {} // namespace zynqmp } // namespace paddle diff --git a/lite/backends/fpga/KD/pes/conv_pe.hpp b/lite/backends/fpga/KD/pes/conv_pe.hpp index 4c5da08a4b1f5ae02965b03da8ff95c09a721f5c..210c02fac5ff3fa135cfc626dbd2d00cbeac12ba 100644 --- a/lite/backends/fpga/KD/pes/conv_pe.hpp +++ b/lite/backends/fpga/KD/pes/conv_pe.hpp @@ -70,6 +70,10 @@ class ConvPE : public PE { param_.input->shape().channel() >= 2048) { use_cpu_ = true; } + if (param_.filter->shape().width() == 1 && + param_.filter->shape().num() % 16 != 0) { + use_cpu_ = true; + } if (!use_cpu_) { // param_.filter->releaseData(); } @@ -93,34 +97,38 @@ class ConvPE : public PE { float* filter_data = param_.filter->data(); float* mi = new float[in_channel]; + + int wh = input->shape().width() * input->shape().height(); + float max = 0; for (int i = 0; i < out_channel; i++) { - float* image = image_addr; float* filter_ptr = filter_data + i * in_channel; - float* out_ptr = mi; -#pragma omp parallel for - for (int j = 0; j < in_channel; j++) { - // float32x4_t x0 = vld1q_f32(image); - // float32x4_t x1 = vld1q_f32(filter_ptr); - - // float32x4_t r = vmulq_f32(x0, x1); - - // vst1q_f32(out_ptr, r); - // image += 4; - // filter_ptr += 4; - // out_ptr += 4; - float value = image_addr[j] * filter_ptr[j]; - mi[j] = value; + // #pragma omp parallel for + + for (int k = 0; k < wh; k++) { + float* image = image_addr; + float* out_ptr = mi; + + for (int j = 0; j < in_channel; j++) { + float value = image_addr[k * in_channel + j] * filter_ptr[j]; + mi[j] = value; + } + + float sum = 0; + for (int j = 0; j < in_channel; j++) { + sum += mi[j]; + } + sum *= param_.scale()->data()[i]; + sum += param_.bias()->data()[i]; + out[i * wh + k] = sum; + max = std::max(max, std::abs(sum)); } - - float sum = 0; - for (int j = 0; j < in_channel; j++) { - sum += mi[j]; - } - out[i] = sum; } delete[] mi; float_output.flush(); output->copyFrom(&float_output); + output->scale()[0] = max / 127.0; + output->scale()[1] = 127.0 / max; + // output->saveToFile("cpu", true); } bool dispatch() { @@ -206,7 +214,6 @@ class ConvPE : public PE { // std::cout << "\n ================== EW ================== \n"; // } } - return ret == 0; } diff --git a/lite/backends/fpga/KD/pes/prior_box_pe.cpp b/lite/backends/fpga/KD/pes/prior_box_pe.cpp index 6c2f99087d779e529208f486b5223621abb6afa1..0ee29b3de885390f0adafc167da513521a2d2184 100644 --- a/lite/backends/fpga/KD/pes/prior_box_pe.cpp +++ b/lite/backends/fpga/KD/pes/prior_box_pe.cpp @@ -262,7 +262,7 @@ bool PriorBoxPE::dispatch() { param_.outputBoxes->copyFrom(this->cachedBoxes_); param_.outputVariances->copyFrom(this->cachedVariances_); - + param_.outputBoxes->flush(); // param_.outputBoxes->syncToCPU(); param_.outputVariances->flush(); diff --git a/lite/backends/fpga/KD/pes/resize_pe.hpp b/lite/backends/fpga/KD/pes/resize_pe.hpp index 98728202b62d2ba819848cf4ccb6658800bb8493..e27c60c7694b13ffd02e156e68abe9e29a7af43f 100644 --- a/lite/backends/fpga/KD/pes/resize_pe.hpp +++ b/lite/backends/fpga/KD/pes/resize_pe.hpp @@ -84,28 +84,26 @@ class ResizePE : public PE { param_.input->syncToCPU(); - for (int h = 0; h < in_height; h++) { - for (int w = 0; w < in_width; w++) { - int src_index = in_width * channel * h + w * channel; - float16* src = param_.input->data() + src_index; - // std::cout << "src_index:" << src_index << std::endl; - for (int v = 0; v < factor; v++) { - for (int i =0; i < factor; i++) { - int dst_index = out_width * channel * h * factor + - out_width * channel * v + - w * channel * factor + + for (int h = 0; h < in_height; h++) { + for (int w = 0; w < in_width; w++) { + int src_index = in_width * channel * h + w * channel; + float16* src = param_.input->data() + src_index; + // std::cout << "src_index:" << src_index << std::endl; + for (int v = 0; v < factor; v++) { + for (int i = 0; i < factor; i++) { + int dst_index = out_width * channel * h * factor + + out_width * channel * v + w * channel * factor + channel * i; - float16* dst = param_.output->data() + dst_index; - memcpy(dst, src, channel * sizeof(float16)); - // std::cout << "dst_index:" << dst_index << std::endl; - } - } - } + float16* dst = param_.output->data() + dst_index; + memcpy(dst, src, channel * sizeof(float16)); + // std::cout << "dst_index:" << dst_index << std::endl; + } } - param_.output->flush(); - param_.output->copyScaleFrom(param_.input); + } } - + param_.output->flush(); + param_.output->copyScaleFrom(param_.input); + } bool dispatch() { cpu_compute(); diff --git a/lite/backends/fpga/KD/pes/scale_pe.hpp b/lite/backends/fpga/KD/pes/scale_pe.hpp index b6b2daa6a2827bb38a2bee93c4e99797243edcf3..8a2c0c92ec5ee051021ba622cc191f074543cabb 100755 --- a/lite/backends/fpga/KD/pes/scale_pe.hpp +++ b/lite/backends/fpga/KD/pes/scale_pe.hpp @@ -158,8 +158,9 @@ class ScalePE : public PE { int index = i * input->shape().channel() + c; float x = image_addr[index]; float y = half_to_float(scale_data[c]); - float value = x * y; - // std::cout << " x = " << std::to_string(x) << " y = " << std::to_string(y) << " v = " << std::to_string(value) << std::endl; + float value = x * y; + // std::cout << " x = " << std::to_string(x) << " y = " << + // std::to_string(y) << " v = " << std::to_string(value) << std::endl; // float value = half_to_float(in_data[index]) * 19.3598f; data_out[index] = float_to_half(value); @@ -188,9 +189,9 @@ class ScalePE : public PE { // dw_param.quantizedFilter()->flush(); // } // param_.input->syncToDevice(); - // return dw_pe_.dispatch(); + return dw_pe_.dispatch(); - cpu_compute(); + // cpu_compute(); return true; } diff --git a/lite/backends/fpga/KD/pes/yolobox_pe.hpp b/lite/backends/fpga/KD/pes/yolobox_pe.hpp new file mode 100644 index 0000000000000000000000000000000000000000..d997fe07833f63bf6843c226477a402d6aeb0357 --- /dev/null +++ b/lite/backends/fpga/KD/pes/yolobox_pe.hpp @@ -0,0 +1,205 @@ +/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include "lite/backends/fpga/KD/pe.hpp" +#include "lite/backends/fpga/KD/pe_params.hpp" + +namespace paddle { +namespace zynqmp { + + +float sigmoid(float x) { + return 1.0 / (1.0 + std::exp(-x)); +} + +inline void GetYoloBox(float* box, const float* x, const int* anchors, int w, + int h, int an_idx, int grid_size, + int input_size, int index, + int img_height, int img_width) { + box[0] = (w + sigmoid(x[index])) * img_width * 1.0f/ grid_size; + box[1] = (h + sigmoid(x[index + 1])) * img_height * 1.0f / grid_size; + box[2] = std::exp(x[index + 2 ]) * anchors[2 * an_idx] * img_width * 1.0f/ + input_size; + box[3] = std::exp(x[index + 3]) * anchors[2 * an_idx + 1] * + img_height * 1.0f / input_size; +} + +inline int GetEntryIndex(int batch, int an_idx, int hw_idx, + int an_num, int an_stride, int stride, + int entry) { + return (batch * an_num + an_idx) * an_stride + entry * stride + hw_idx; +} + +inline void CalcDetectionBox(float* boxes, float* box, const int box_idx, + const int img_height, + const int img_width) { + boxes[box_idx] = box[0] - box[2] / 2; + boxes[box_idx + 1] = box[1] - box[3] / 2; + boxes[box_idx + 2] = box[0] + box[2] / 2; + boxes[box_idx + 3] = box[1] + box[3] / 2; + + boxes[box_idx] = boxes[box_idx] > 0 ? boxes[box_idx] : 0; + boxes[box_idx + 1] = + boxes[box_idx + 1] > 0 ? boxes[box_idx + 1] : 0; + boxes[box_idx + 2] = boxes[box_idx + 2] < img_width - 1 + ? boxes[box_idx + 2] + : (img_width - 1); + boxes[box_idx + 3] = boxes[box_idx + 3] < img_height - 1 + ? boxes[box_idx + 3] + : (img_height - 1); +} + +inline void CalcLabelScore(float* scores, const float* input, + const int label_idx, const int score_idx, + const int class_num, const float conf) { + for (int i = 0; i < class_num; i++) { + scores[score_idx + i] = conf * sigmoid(input[label_idx + i]); + // std::cout << scores[score_idx + i] << " "; + } + // std::cout << std::endl; +} + + +class YoloBoxPE : public PE { + public: + bool init() { + param_.outputBoxes->setAligned(false); + param_.outputScores->setAligned(false); + param_.outputBoxes->setDataLocation(CPU); + param_.outputScores->setDataLocation(CPU); + return true; + } + + bool dispatch() { + auto* input = param_.input; + auto* imgsize = param_.imgSize; + auto* boxes = param_.outputBoxes; + auto* scores = param_.outputScores; + auto anchors = param_.anchors; + int class_num = param_.classNum; + float conf_thresh = param_.confThresh; + int downsample_ratio = param_.downsampleRatio; + + + const int num = input->shape().num(); + const int height = input->shape().height(); + const int width = input->shape().width(); + const int box_num = boxes->shape().channel(); + const int an_num = anchors.size() / 2; + int input_size = downsample_ratio * height; + + const int stride = height * width; + const int an_stride = (class_num + 5) * stride; + + Tensor anchors_; + Shape anchors_shape(N, {an_num * 2}); + auto anchors_data = anchors_.mutableData(INT32, anchors_shape); + std::copy(anchors.begin(), anchors.end(), anchors_data); + + input->syncToCPU(); + input->unalignImage(); + // input->setAligned(false); + Tensor input_float; + input_float.setDataLocation(CPU); + float* input_data = input_float.mutableData(FP32, input->shape()); + input_float.copyFrom(input); + // input_float.saveToFile("input_yolobox_half", "true"); + + // input_float.setAligned(input->aligned()); + // input_float.unalignImage(); + // std::cout << "-------------unalignImage-----------------" << std::endl; + // for (int i = 0; i < input_float.shape().numel(); ++i) + // { + // std::cout << input_data[i] << " "; + // } + // std::cout << "-" << std::endl; + // std::cout << "-------------unalignImage-----------------" << std::endl; + // input_float.setAligned(false); + // input_float.saveToFile("input_yolobox_float", "true"); + // input_float.syncToCPU(); + // input_float.invalidate(); + + imgsize->saveToFile("img_size", true); + const int32_t* imgsize_data = imgsize->data(); + + Tensor boxes_float; + Tensor scores_float; + + boxes_float.setDataLocation(CPU); + float* boxes_float_data = boxes_float.mutableData(FP32, boxes->shape()); + memset(boxes_float_data, 0, boxes->shape().numel() * sizeof(float)); + + scores_float.setDataLocation(CPU); + float* scores_float_data = scores_float.mutableData(FP32, scores->shape()); + memset(scores_float_data, 0, scores->shape().numel() * sizeof(float)); + + // float* boxes_data = boxes->mutableData(); + // memset(boxes_data, 0, boxes->shape().numel() * sizeof(float)); + + // float* scores_data = scores->mutableData(); + // memset(scores_data, 0, scores->shape().numel() * sizeof(float)); + + float box[4]; + // for (int n = 0; n < num; n++) { + // int img_height = imgsize_data[2 * i]; + // int img_width = imgsize_data[2 * i + 1]; + int img_height = imgsize_data[0]; + int img_width = imgsize_data[1]; + std::cout << "YoloBoxPE imgsize:" << img_height << "," << img_width << std::endl; + + int channel = input_float.shape().channel(); + int count = 0; + for (int h = 0; h < height; h++) { + for (int w = 0; w < width ; w++) { + for (int n = 0; n < an_num; n++) { + + int obj_idx = channel * width * h + channel * w + n * (5 + class_num) + 4; + // std::cout << obj_idx << " "; + float conf = sigmoid(input_data[obj_idx]); + if (conf < conf_thresh) { + count++; + continue; + } + + int box_idx = channel * width * h + channel * w + n * (5 + class_num) + 0; + GetYoloBox(box, input_data, anchors_data, w, h, n, height, input_size, + box_idx, img_height, img_width); + + box_idx = h * an_num * 4 * width + an_num * 4 * w + n * 4; + CalcDetectionBox(boxes_float_data, box, box_idx, img_height,img_width); + + int label_idx = channel * width * h + channel * w + n * (5 + class_num) + 5; + int score_idx = h * an_num * class_num * width + an_num * class_num * w + n * class_num; + CalcLabelScore(scores_float_data, input_data, label_idx, score_idx, class_num, conf); + } + } + } + + boxes->copyFrom(&boxes_float); + scores->copyFrom(&scores_float); + input->setAligned(true); + } + + void apply(){}; + + YoloBoxParam& param() { return param_; } + + private: + YoloBoxParam param_; + +}; +} // namespace zynqmp +} // namespace paddle diff --git a/lite/backends/fpga/KD/tensor.hpp b/lite/backends/fpga/KD/tensor.hpp index 2cee46fb55a9dae9a2bd4020b01c4099ea2525c3..3a2996fed4965b019aeae386103e595ba4c6d3d9 100644 --- a/lite/backends/fpga/KD/tensor.hpp +++ b/lite/backends/fpga/KD/tensor.hpp @@ -266,23 +266,21 @@ class Tensor { return; } BypassArgs args; - args.input_data_type = src->dataType_ == FP32 ? DATA_TYPE_FP32 : DATA_TYPE_FP16; + args.input_data_type = + src->dataType_ == FP32 ? DATA_TYPE_FP32 : DATA_TYPE_FP16; args.output_data_type = dataType_ == FP32 ? DATA_TYPE_FP32 : DATA_TYPE_FP16; args.input_layout_type = LAYOUT_HWC; args.output_layout_type = LAYOUT_HWC; - args.image = { - .address = src->data(), - .scale_address = src->scale(), - .channels = (uint32_t)src->shape().numel(), - .width = 1, - .height = 1, - .pad_width = 0U, - .pad_height = 0U - }; + args.image = {.address = src->data(), + .scale_address = src->scale(), + .channels = (uint32_t)src->shape().numel(), + .width = 1, + .height = 1, + .pad_width = 0U, + .pad_height = 0U}; ImageOutputArgs output = { - .address = data(), - .scale_address = scale(), + .address = data(), .scale_address = scale(), }; args.output = output; @@ -385,10 +383,11 @@ class Tensor { void save_file_with_name(std::string path) { // std::cout << "saving file: " << path << std::endl; void* add = (void*)this; - // printf("tensor @: %p data: %p \n", (void *)add, (void*)data()); + // printf("tensor @: %p data: %p \n", (void *)add, (void*)data()); // return; std::ofstream ofs; ofs.open(path); + ofs << "data type: " << dataType() << std::endl; ofs << scale()[0] << " / " << scale()[1] << std::endl; for (int i = 0; i < shape_->numel(); i++) { @@ -406,13 +405,14 @@ class Tensor { if (dataType_ == INT32) { value = data()[i]; } - + if (i < 10) { std::cout << value << ","; } - + // if (i > 1000) { + // break; + // } ofs << value << std::endl; - } usleep(30000); ofs.close(); @@ -465,7 +465,6 @@ class Tensor { value = half_to_float(tensor.data()[i]); } os << value << " "; - } os << "\n"; return os; diff --git a/lite/backends/fpga/lite_tensor.h b/lite/backends/fpga/lite_tensor.h index c6f837db75c4e321ab17f417d218f9222971688d..f3d409edf756d53c72637504c630e6bad7f4393a 100644 --- a/lite/backends/fpga/lite_tensor.h +++ b/lite/backends/fpga/lite_tensor.h @@ -166,6 +166,9 @@ class TensorLite { void clear() { // zynq_tensor_->releaseData(); + if (zynq_tensor_) { + memset(zynq_tensor_->data(), 0, zynq_tensor_->memorySize()); + } } template diff --git a/lite/core/mir/fusion/quant_dequant_op_fuser.h b/lite/core/mir/fusion/quant_dequant_op_fuser.h index 88dbc1bc171eddb4c5666ba7f3a23e17e7383e9d..aa2ac583b2d7a53a4529ad6a4ef3e59293950f39 100644 --- a/lite/core/mir/fusion/quant_dequant_op_fuser.h +++ b/lite/core/mir/fusion/quant_dequant_op_fuser.h @@ -105,7 +105,6 @@ class ChannelWiseDequantOpFuser : public FuseBase { */ class DeleteQuantDequantOpFuser : public FuseBase { public: - void BuildPattern() override; void InsertNewNode(SSAGraph* graph, const key2nodes_t& matched) override; diff --git a/lite/core/mir/kernel_place_correct_pass.h b/lite/core/mir/kernel_place_correct_pass.h index 4f7d9d110caa0850ba1d3d30eeac85d34725a451..1c4439643fcd8aa16d3d81480679760e79fda611 100644 --- a/lite/core/mir/kernel_place_correct_pass.h +++ b/lite/core/mir/kernel_place_correct_pass.h @@ -84,13 +84,13 @@ class KernelPlaceCorrectPass : public DebugPass { std::string node_name = out->AsArg().name; std::string arg_name = get_argname(node_name, inst.op_info()->outputs()); - + auto op_type = inst.op_type(); if (op_type == "reshape" || op_type == "reshape2") { for (auto* x_in : x->inlinks) { - - std::string in_name = get_argname(x_in->AsArg().name, inst.op_info()->inputs()); + std::string in_name = + get_argname(x_in->AsArg().name, inst.op_info()->inputs()); // std::cout << "name: " << x_in->AsArg().name << std::endl; // std::cout << "in_name: " << in_name << std::endl; if (in_name == "X") { @@ -101,9 +101,11 @@ class KernelPlaceCorrectPass : public DebugPass { } p = in->AsArg().type->precision(); - if ( p != PrecisionType::kFP16) { - // std::cout << "found an arm ............... : " << inst.kernels().size() << std::endl; - // std::cout << "tt:" << TargetRepr(inst.kernels()[0]->target()) << std::endl; + if (p != PrecisionType::kFP16) { + // std::cout << "found an arm ............... : " << + // inst.kernels().size() << std::endl; + // std::cout << "tt:" << TargetRepr(inst.kernels()[0]->target()) << + // std::endl; UpdateTarget(inst, TargetType::kHost); UpdateTensor(inst, in, out, TargetType::kHost); } @@ -113,8 +115,9 @@ class KernelPlaceCorrectPass : public DebugPass { UpdateTarget(inst, TargetType::kFPGA); } - if (inst.op_type() == "split" || inst.op_type() == "transpose") { - if ( p != PrecisionType::kFP16) { + if (inst.op_type() == "split" || inst.op_type() == "transpose" || + inst.op_type() == "transpose2") { + if (p != PrecisionType::kFP16) { UpdateTarget(inst, TargetType::kARM); for (auto* x_out : x->outlinks) { UpdateTensor(inst, in, x_out, TargetType::kARM); @@ -123,9 +126,12 @@ class KernelPlaceCorrectPass : public DebugPass { } if (inst.op_type() == "concat") { - std::cout << "concat target:" << TargetRepr(inst.kernels()[0]->target()) << std::endl; - std::cout << "concat p:" << PrecisionToStr(inst.kernels()[0]->precision()) << std::endl; - if ( p != PrecisionType::kFP16) { + std::cout << "concat target:" << TargetRepr(inst.kernels()[0]->target()) + << std::endl; + std::cout << "concat p:" + << PrecisionToStr(inst.kernels()[0]->precision()) + << std::endl; + if (p != PrecisionType::kFP16) { UpdateTarget(inst, TargetType::kARM); UpdateTensor(inst, in, out, TargetType::kARM); } @@ -134,8 +140,9 @@ class KernelPlaceCorrectPass : public DebugPass { // if (inst.op_type() == "elementwise_mul") { // for (auto* x_in : x->inlinks) { - - // std::string in_name = get_argname(x_in->AsArg().name, inst.op_info()->inputs()); + + // std::string in_name = get_argname(x_in->AsArg().name, + // inst.op_info()->inputs()); // std::cout << "name: " << x_in->AsArg().name << std::endl; // std::cout << "in_name: " << in_name << std::endl; // if (in_name == "Y") { @@ -150,7 +157,6 @@ class KernelPlaceCorrectPass : public DebugPass { // UpdateTensor(inst, in, out, TargetType::kARM); // } // } - std::vector in_types; std::vector out_types; @@ -164,11 +170,13 @@ class KernelPlaceCorrectPass : public DebugPass { auto type = inst.picked_kernel().GetInputDeclType(arg_name); - // std::cout << arg_name <<" is weight:: " << std::to_string(x_in->AsArg().is_weight) - // << " is persist: " << std::to_string(x_in->AsArg().is_persist) << std::endl; + // std::cout << arg_name <<" is weight:: " << + // std::to_string(x_in->AsArg().is_weight) + // << " is persist: " << + // std::to_string(x_in->AsArg().is_persist) << std::endl; // std::cout << " type: "<< inst.op_type() << std::endl; - + if (!x_in->AsArg().is_weight) { auto p = x_in->AsArg().type->precision(); auto t = x_in->AsArg().type->target(); @@ -224,10 +232,10 @@ class KernelPlaceCorrectPass : public DebugPass { } } - // Update me's kUnk fields by other's fields. void UpdateTarget(mir::Node::Stmt& inst, TargetType new_target) { // NOLINT - // std::cout << "1 kernels: " << std::to_string(inst.kernels().size()) << std::endl; + // std::cout << "1 kernels: " << std::to_string(inst.kernels().size()) << + // std::endl; auto new_place = inst.place(); new_place.target = new_target; @@ -244,25 +252,30 @@ class KernelPlaceCorrectPass : public DebugPass { std::vector places; places.push_back(new_place); inst.ResetKernels(places); - // std::cout << "2 kernels: " << std::to_string(inst.kernels().size()) << std::endl; + // std::cout << "2 kernels: " << std::to_string(inst.kernels().size()) << + // std::endl; } - void UpdateTensor(mir::Node::Stmt& inst, Node* in, Node* out, TargetType new_target = TargetType::kUnk) { - + void UpdateTensor(mir::Node::Stmt& inst, + Node* in, + Node* out, + TargetType new_target = TargetType::kUnk) { auto get_argname = [&]( - const std::string& node_name, - const std::map>& argname_map) - -> std::string { - for (auto& ele : argname_map) { - auto it = - std::find(ele.second.begin(), ele.second.end(), node_name); - if (it != ele.second.end()) return ele.first; - } - return ""; - }; + const std::string& node_name, + const std::map>& argname_map) + -> std::string { + for (auto& ele : argname_map) { + auto it = + std::find(ele.second.begin(), ele.second.end(), node_name); + if (it != ele.second.end()) return ele.first; + } + return ""; + }; - std::string arg_name = get_argname(out->AsArg().name, inst.op_info()->outputs()); - std::string in_name = get_argname(in->AsArg().name, inst.op_info()->inputs()); + std::string arg_name = + get_argname(out->AsArg().name, inst.op_info()->outputs()); + std::string in_name = + get_argname(in->AsArg().name, inst.op_info()->inputs()); auto type = inst.picked_kernel().GetInputDeclType(in_name); auto tmp_ptype = in->AsArg().type->precision(); @@ -281,7 +294,8 @@ class KernelPlaceCorrectPass : public DebugPass { tmp_layout = DataLayoutType::kNCHW; } - out->AsArg().type = LiteType::GetTensorTy(tmp_target, tmp_ptype, tmp_layout); + out->AsArg().type = + LiteType::GetTensorTy(tmp_target, tmp_ptype, tmp_layout); } }; diff --git a/lite/core/mir/static_kernel_pick_pass.cc b/lite/core/mir/static_kernel_pick_pass.cc index 1cc8942d611db389a44cbf6a244775a5b666b587..8f18c3982d26e80fdfc7656a7b694d3a9b54d4a2 100755 --- a/lite/core/mir/static_kernel_pick_pass.cc +++ b/lite/core/mir/static_kernel_pick_pass.cc @@ -80,6 +80,8 @@ void StaticKernelPickPass::Apply(const std::unique_ptr& graph) { std::sort(scored.begin(), scored.end(), KernelScoreCmp); instruct.kernels().clear(); + VLOG(2) << "picking kernel " << scored.front().second->name() << "\n\n"; + if (!instruct.op_info()->HasAttr("enable_int8")) { // Move kernel back // Just keep a single best kernel. diff --git a/lite/core/mir/static_kernel_pick_pass.h b/lite/core/mir/static_kernel_pick_pass.h index a5e057a11be969ab5e963c3cee68fc7e80016af4..3ac8978ce9d5cab44b693f5d17cab529f4eb4376 100644 --- a/lite/core/mir/static_kernel_pick_pass.h +++ b/lite/core/mir/static_kernel_pick_pass.h @@ -157,7 +157,21 @@ class StaticKernelPickPass : public mir::StmtPass { } } if (in_match) { - final_score = 5000; + final_score += 1000; + } + bool out_match = true; + for (size_t i = 0; i < out_names.size(); ++i) { + std::string tmp; + CHECK(instruct.op_info()->GetOutputArgname(out_names[i], &tmp)); + if (out_types.count(out_names[i]) && + out_types.at(out_names[i]) != + kernel.GetOutputDeclType(tmp)->precision()) { + out_match = false; + } + } + + if (out_match) { + final_score += 1000; } } diff --git a/lite/core/mir/type_target_cast_pass.cc b/lite/core/mir/type_target_cast_pass.cc index 89dbb4a420c610981d5a15ef6e961e7668f21fe7..3d67e98de3654e753d48e5da4ba490be65aceb5a 100644 --- a/lite/core/mir/type_target_cast_pass.cc +++ b/lite/core/mir/type_target_cast_pass.cc @@ -53,7 +53,6 @@ void TypeTargetTransformPass::Apply(const std::unique_ptr& graph) { ComplementInputs(graph.get(), node, in, &copied_nodes); } } - } void TypeTargetTransformPass::ComplementInputs( @@ -74,6 +73,7 @@ void TypeTargetTransformPass::ComplementInputs( auto in_arg_name = in->AsArg().name; std::string tmp; CHECK(inst.op_info()->GetInputArgname(in_arg_name, &tmp)); + VLOG(4) << "in_arg_name: " << in_arg_name << " tmp:" << tmp; auto decl_arg_type = inst.picked_kernel().GetInputDeclType(tmp); CHECK(in->AsArg().type); if (!TargetCompatibleTo(*in->AsArg().type, *decl_arg_type)) { diff --git a/lite/core/mir/variable_place_inference_pass.h b/lite/core/mir/variable_place_inference_pass.h index 875bf23082a24cb6fcae878b46cc9dcdbb2b76f7..ae0a79d7c6254b85b7e65ebc7aae63bd81765d09 100644 --- a/lite/core/mir/variable_place_inference_pass.h +++ b/lite/core/mir/variable_place_inference_pass.h @@ -141,6 +141,7 @@ class VariablePlaceInferencePass : public DebugPass { x_in->AsArg().type = type; } else { PrecisionType tmp_ptype = x_in->AsArg().type->precision(); + VLOG(4) << "tmp_ptype:" << PrecisionToStr(tmp_ptype); x_in->AsArg().type = LiteType::GetTensorTy( type->target(), tmp_ptype, type->layout()); } @@ -172,6 +173,9 @@ class VariablePlaceInferencePass : public DebugPass { x_out->AsArg().type = type; } else { PrecisionType tmp_ptype = x_out->AsArg().type->precision(); + tmp_ptype = type->precision(); + // inst.picked_kernel().precision(); + VLOG(4) << "tmp_ptype:" << PrecisionToStr(tmp_ptype); x_out->AsArg().type = LiteType::GetTensorTy( type->target(), tmp_ptype, type->layout()); } diff --git a/lite/core/optimizer.h b/lite/core/optimizer.h index 83e4abb3dafb619c5d0dfb3a7d93a0a1f113705e..5015b633e7b028ffe98a5c0a156c471271e16b0f 100755 --- a/lite/core/optimizer.h +++ b/lite/core/optimizer.h @@ -134,7 +134,6 @@ class Optimizer { "mlu_postprocess_pass"}}; - if (passes.size() == 1) { // multi_stream_analysis_pass must be in the front of // runtime_context_assign_pass diff --git a/lite/kernels/fpga/CMakeLists.txt b/lite/kernels/fpga/CMakeLists.txt index fd1f3263c843b07c33e26ebc7a2b2b7ffdfcb423..d34ba83d0e78627e9419571960c1f9ef63b0275c 100755 --- a/lite/kernels/fpga/CMakeLists.txt +++ b/lite/kernels/fpga/CMakeLists.txt @@ -42,6 +42,8 @@ add_kernel(layout_compute_fpga FPGA basic SRCS layout_compute.cc DEPS ${fpga_dep add_kernel(feed_compute_fpga FPGA basic SRCS feed_compute.cc DEPS ${fpga_deps}) add_kernel(fetch_compute_fpga FPGA basic SRCS fetch_compute.cc DEPS ${fpga_deps}) +add_kernel(yolo_box_compute_fpga FPGA basic SRCS yolo_box_compute.cc DEPS ${fpga_deps}) + # add_kernel(while_compute_fpga FPGA extra SRCS while_compute.cc DEPS ${fpga_deps}) # add_kernel(write_to_array_compute_fpga FPGA extra SRCS write_to_array_compute.cc DEPS ${fpga_deps}) diff --git a/lite/kernels/fpga/activation_compute.cc b/lite/kernels/fpga/activation_compute.cc index f6704204d34c309835c1de0ef61afed97c0b29e3..9b5503a7e9946c77f62e9bbdafe4d59c457001e3 100644 --- a/lite/kernels/fpga/activation_compute.cc +++ b/lite/kernels/fpga/activation_compute.cc @@ -35,6 +35,28 @@ void ReluCompute::PrepareForRun() { void ReluCompute::Run() { pe_.dispatch(); } +void SigmoidCompute::Run() { + // TODO(chonwhite) use fpga and arm implementation; + auto& param = this->Param(); + auto output_data = param.Out->mutable_data(); + int numel = param.Out->numel(); + + float16* in_data = param.X->ZynqTensor()->data(); + float16* out_data = param.Out->ZynqTensor()->data(); + param.X->ZynqTensor()->syncToCPU(); + float max = 0.0f; + for (int i = 0; i < numel; i++) { + /* code */ + float value = zynqmp::half_to_float(in_data[i]); + value = 1 / (1 + exp(-value)); + out_data[i] = zynqmp::float_to_half(value); + max = std::max(std::abs(value), max); + } + param.Out->ZynqTensor()->scale()[0] = max / 127.0; + param.Out->ZynqTensor()->scale()[1] = 127.0 / max; + param.Out->ZynqTensor()->flush(); +} + } // namespace fpga } // namespace kernels } // namespace lite @@ -51,3 +73,19 @@ REGISTER_LITE_KERNEL( PRECISION(kFP16), DATALAYOUT(kNHWC))}) .Finalize(); + +REGISTER_LITE_KERNEL(sigmoid, + kFPGA, + kFP16, + kNHWC, + paddle::lite::kernels::fpga::SigmoidCompute, + def) + .BindInput("X", + {LiteType::GetTensorTy(TARGET(kFPGA), + PRECISION(kFP16), + DATALAYOUT(kNHWC))}) + .BindOutput("Out", + {LiteType::GetTensorTy(TARGET(kFPGA), + PRECISION(kFP16), + DATALAYOUT(kNHWC))}) + .Finalize(); diff --git a/lite/kernels/fpga/activation_compute.h b/lite/kernels/fpga/activation_compute.h index 5cc431e2d41e2de1d841ee386de3aae4434e3865..f39a324f569a48f3c0d582a7376b5128259a504e 100644 --- a/lite/kernels/fpga/activation_compute.h +++ b/lite/kernels/fpga/activation_compute.h @@ -49,6 +49,16 @@ class ReluCompute zynqmp::Tensor output_; }; +class SigmoidCompute + : public KernelLite { + public: + using param_t = operators::ActivationParam; + + void Run() override; + + virtual ~SigmoidCompute() = default; +}; + } // namespace fpga } // namespace kernels } // namespace lite diff --git a/lite/kernels/fpga/calib_compute.cc b/lite/kernels/fpga/calib_compute.cc index e4f13aedd8c36126e995a45d83d41c55c957b771..7605f8cd52366d6200cf647c1efd7259e70568fe 100755 --- a/lite/kernels/fpga/calib_compute.cc +++ b/lite/kernels/fpga/calib_compute.cc @@ -49,7 +49,7 @@ void CalibComputeFloat2Int::Run() { const auto* din = param.input->data(); auto* dout = param.output->mutable_data(); // param.output->ZynqTensor()->copyFrom(param.input->ZynqTensor()); - //TODO + // TODO auto out_lod = param.output->mutable_lod(); *out_lod = param.input->lod(); return; diff --git a/lite/kernels/fpga/concat_compute.cc b/lite/kernels/fpga/concat_compute.cc index 523d3577095a4c4cbb9b21e8f67a92a22295108f..6af4e7d69d2d3352d44bf67e09d63cc4b26ea50b 100755 --- a/lite/kernels/fpga/concat_compute.cc +++ b/lite/kernels/fpga/concat_compute.cc @@ -45,11 +45,11 @@ void ConcatCompute::PrepareForRun() { void ConcatCompute::Run() { pe_.dispatch(); -#ifdef FPGA_PRINT_TENSOR + // #ifdef FPGA_PRINT_TENSOR zynqmp::ConcatParam& concat_param = pe_.param(); concat_param.output->flush(); - // Debugger::get_instance().registerOutput("concat", concat_param.output); -#endif + Debugger::get_instance().registerOutput("concat", concat_param.output); + // #endif } } // namespace fpga diff --git a/lite/kernels/fpga/conv_compute.cc b/lite/kernels/fpga/conv_compute.cc index 14de934eb3c9a3a27d2395612ef97204aac4689a..7754d70d822ebc22e3c1bc4db50504ea2d71e14c 100644 --- a/lite/kernels/fpga/conv_compute.cc +++ b/lite/kernels/fpga/conv_compute.cc @@ -53,7 +53,8 @@ void ConvCompute::PrepareForRun() { if (param.activation_param.Leaky_relu_alpha > 0.001) { conv_param.activeParam.type = zynqmp::TYPE_LEAKY_RELU; - conv_param.activeParam.leaky_relu_factor = param.activation_param.Leaky_relu_alpha; + conv_param.activeParam.leaky_relu_factor = + param.activation_param.Leaky_relu_alpha; } dw_conv_pe_.init(); @@ -79,13 +80,15 @@ void ConvCompute::PrepareForRun() { if (param.activation_param.Leaky_relu_alpha > 0.001) { conv_param.activeParam.type = zynqmp::TYPE_LEAKY_RELU; - conv_param.activeParam.leaky_relu_factor = param.activation_param.Leaky_relu_alpha; + conv_param.activeParam.leaky_relu_factor = + param.activation_param.Leaky_relu_alpha; } conv_pe_.init(); conv_pe_.apply(); } - // std::cout << "Leaky_relu_alpha:" << param.activation_param.Leaky_relu_alpha << std::endl; + // std::cout << "Leaky_relu_alpha:" << param.activation_param.Leaky_relu_alpha + // << std::endl; } void ConvCompute::Run() { diff --git a/lite/kernels/fpga/elementwise_compute.cc b/lite/kernels/fpga/elementwise_compute.cc index 1bcb7f2ae78114055c1c740d3f434594d7a02b64..569a6e049e1ede1608a3d62e66b705ddec4c43b2 100755 --- a/lite/kernels/fpga/elementwise_compute.cc +++ b/lite/kernels/fpga/elementwise_compute.cc @@ -96,11 +96,12 @@ void ElementwiseMulCompute::PrepareForRun() { scale_value = param.Y->data()[0]; // std::cout << "FP16 \n"; } - + // std::cout << "channel:" << channel << std::endl; // std::cout << "production:" << param.Y->dims().production() << std::endl; - // std::cout << "scale_value:" << std::to_string(zynqmp::half_to_float(scale_value)) << std::endl; + // std::cout << "scale_value:" << + // std::to_string(zynqmp::half_to_float(scale_value)) << std::endl; // exit(-1); for (int i = 0; i < channel; i++) { @@ -112,7 +113,8 @@ void ElementwiseMulCompute::PrepareForRun() { scale_value = param.Y->data()[i]; } } - // std::cout << "scale_value:" << std::to_string(zynqmp::half_to_float(scale_value)) << std::endl; + // std::cout << "scale_value:" << + // std::to_string(zynqmp::half_to_float(scale_value)) << std::endl; // exit(-1); scale_data[i] = scale_value; bias_data[i] = zero_; @@ -128,13 +130,13 @@ void ElementwiseMulCompute::Run() { if (!param.Y->persistable()) { // TODO scale_.copyFrom(param.Y->ZynqTensor()); - scale_.flush();//TODO + scale_.flush(); // TODO } pe_.dispatch(); #ifdef FPGA_PRINT_TENSOR zynqmp::ScaleParam& scale_param = pe_.param(); - // Debugger::get_instance().registerOutput("ew_mul_in", scale_param.input); - // Debugger::get_instance().registerOutput("ew_mul", scale_param.output); +// Debugger::get_instance().registerOutput("ew_mul_in", scale_param.input); +// Debugger::get_instance().registerOutput("ew_mul", scale_param.output); #endif } @@ -214,8 +216,7 @@ REGISTER_LITE_KERNEL(elementwise_mul, {LiteType::GetTensorTy(TARGET(kFPGA), PRECISION(kFP16), DATALAYOUT(kNHWC))}) - .BindInput("Y", - {LiteType::GetTensorTy(TARGET(kARM))}) + .BindInput("Y", {LiteType::GetTensorTy(TARGET(kARM))}) .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kFPGA), PRECISION(kFP16), diff --git a/lite/kernels/fpga/feed_compute.cc b/lite/kernels/fpga/feed_compute.cc index f28b4c9b3c92ea19d90034e9381e533597765e0f..5a029cd5189b5bb8ddbf920b7b174aab5e49dc18 100755 --- a/lite/kernels/fpga/feed_compute.cc +++ b/lite/kernels/fpga/feed_compute.cc @@ -28,7 +28,14 @@ void FeedCompute::PrepareForRun() { auto& param = this->Param(); Tensor& x = param.feed_list->at(param.col); param.out->Resize(x.dims()); - param.out->mutable_data(); + + auto in_type = x.ZynqTensor()->dataType(); + if (in_type == zynqmp::FP32 || in_type == zynqmp::FP16) { + param.out->mutable_data(); + } + if (in_type == zynqmp::INT32) { + param.out->mutable_data(); + } // ==================================================== zynqmp::InputParam& feed_param = pe_.param(); feed_param.input = x.ZynqTensor(); @@ -68,12 +75,18 @@ REGISTER_LITE_KERNEL( DATALAYOUT(kNHWC))}) .Finalize(); -// REGISTER_LITE_KERNEL(feed, -// kFPGA, -// kFP16, -// kNHWC, -// paddle::lite::kernels::fpga::FeedCompute, -// def_host) -// .BindInput("X", {LiteType::GetTensorTy(TARGET(kHost))}) -// .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kHost))}) -// .Finalize(); +REGISTER_LITE_KERNEL(feed, + kFPGA, + kFP16, + kNHWC, + paddle::lite::kernels::fpga::FeedCompute, + feed_int32) + .BindInput("X", + {LiteType::GetTensorTy(TARGET(kHost), + PRECISION(kFloat), + DATALAYOUT(kAny))}) + .BindOutput("Out", + {LiteType::GetTensorTy(TARGET(kHost), + PRECISION(kInt32), + DATALAYOUT(kNCHW))}) + .Finalize(); \ No newline at end of file diff --git a/lite/kernels/fpga/interpolate_compute.cc b/lite/kernels/fpga/interpolate_compute.cc index 7358ec1bf3f1e146b3b1399b1a51428880484aca..b7eb36c91021f953ff9f17edef8248f7d4f44bdd 100644 --- a/lite/kernels/fpga/interpolate_compute.cc +++ b/lite/kernels/fpga/interpolate_compute.cc @@ -49,7 +49,6 @@ void BilinearInterpCompute::Run() { // interp_method); } - void nearest_interp(const float16* src, int w_in, int h_in, @@ -85,7 +84,7 @@ void nearest_interp(const float16* src, int near_y = static_cast(scale_h_new * h); for (int w = 0; w < w_out; ++w) { int near_x = static_cast(scale_w_new * w); - + const float16* src_n = src + (near_y * w_in + near_x) * c; memcpy(dst_p, src_n, c * sizeof(float16)); dst_p += c; @@ -133,8 +132,6 @@ inline std::vector get_new_data_from_tensor(const Tensor* new_data_tensor) { return vec_new_data; } - - void interpolate(lite::Tensor* X, lite::Tensor* OutSize, std::vector SizeTensor, @@ -188,19 +185,18 @@ void interpolate(lite::Tensor* X, int spatial_in = in_h * in_w; int spatial_out = out_h * out_w; - for (int i = 0; i < count; ++i) { - nearest_interp(din + spatial_in * i, - in_w, - in_h, - out_c, - dout + spatial_out * i, - out_w, - out_h, - 1.f / width_scale, - 1.f / height_scale, - with_align); - } + nearest_interp(din + spatial_in * i, + in_w, + in_h, + out_c, + dout + spatial_out * i, + out_w, + out_h, + 1.f / width_scale, + 1.f / height_scale, + with_align); + } } void NearestInterpCompute::Run() { @@ -215,27 +211,24 @@ void NearestInterpCompute::Run() { int out_h = param.out_h; bool align_corners = param.align_corners; - std::string interp_method = ""; - X->ZynqTensor()->invalidate();//TODO + X->ZynqTensor()->invalidate(); // TODO X->ZynqTensor()->saveToFile("n_in", true); interpolate(X, - OutSize, - SizeTensor, - Scale, - Out, - out_h, - out_w, - scale, - align_corners, - interp_method); + OutSize, + SizeTensor, + Scale, + Out, + out_h, + out_w, + scale, + align_corners, + interp_method); - Out->ZynqTensor()->flush(); Out->ZynqTensor()->copyScaleFrom(X->ZynqTensor()); Out->ZynqTensor()->saveToFile("n_out", true); - } } /* namespace fpga */ @@ -249,15 +242,17 @@ REGISTER_LITE_KERNEL(bilinear_interp, kNHWC, paddle::lite::kernels::fpga::BilinearInterpCompute, def) - .BindInput("X", {LiteType::GetTensorTy(TARGET(kFPGA), - PRECISION(kFP16), - DATALAYOUT(kNHWC))}) + .BindInput("X", + {LiteType::GetTensorTy(TARGET(kFPGA), + PRECISION(kFP16), + DATALAYOUT(kNHWC))}) .BindInput("OutSize", {LiteType::GetTensorTy(TARGET(kARM), PRECISION(kInt32))}) .BindInput("SizeTensor", {LiteType::GetTensorTy(TARGET(kARM), PRECISION(kInt32))}) .BindInput("Scale", {LiteType::GetTensorTy(TARGET(kARM))}) - .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kFPGA), + .BindOutput("Out", + {LiteType::GetTensorTy(TARGET(kFPGA), PRECISION(kFP16), DATALAYOUT(kNHWC))}) .Finalize(); @@ -268,15 +263,17 @@ REGISTER_LITE_KERNEL(nearest_interp, kNHWC, paddle::lite::kernels::fpga::NearestInterpCompute, def) - .BindInput("X", {LiteType::GetTensorTy(TARGET(kFPGA), - PRECISION(kFP16), - DATALAYOUT(kNHWC))}) + .BindInput("X", + {LiteType::GetTensorTy(TARGET(kFPGA), + PRECISION(kFP16), + DATALAYOUT(kNHWC))}) .BindInput("OutSize", {LiteType::GetTensorTy(TARGET(kARM), PRECISION(kInt32))}) .BindInput("SizeTensor", {LiteType::GetTensorTy(TARGET(kARM), PRECISION(kInt32))}) .BindInput("Scale", {LiteType::GetTensorTy(TARGET(kARM))}) - .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kFPGA), + .BindOutput("Out", + {LiteType::GetTensorTy(TARGET(kFPGA), PRECISION(kFP16), DATALAYOUT(kNHWC))}) .Finalize(); diff --git a/lite/kernels/fpga/interpolate_compute.h b/lite/kernels/fpga/interpolate_compute.h index cc904f936452debc8c1cfb2b8c30a867afcd5163..86f83c71b0b98848d715178ab65c2fb82ab570d3 100644 --- a/lite/kernels/fpga/interpolate_compute.h +++ b/lite/kernels/fpga/interpolate_compute.h @@ -14,9 +14,9 @@ #pragma once #include +#include "lite/backends/fpga/KD/pes/resize_pe.hpp" #include "lite/core/kernel.h" #include "lite/core/op_registry.h" -#include "lite/backends/fpga/KD/pes/resize_pe.hpp" namespace paddle { namespace lite { @@ -34,12 +34,12 @@ class BilinearInterpCompute class NearestInterpCompute : public KernelLite { public: - void PrepareForRun() override; void Run() override; virtual ~NearestInterpCompute() = default; + private: zynqmp::ResizePE pe_; }; diff --git a/lite/kernels/fpga/io_copy_compute.cc b/lite/kernels/fpga/io_copy_compute.cc index a7dbf9359f6adf4bb89fbcd78c0ff03637fae173..84f75c2e2a257510c16907bd655863c5b51307de 100755 --- a/lite/kernels/fpga/io_copy_compute.cc +++ b/lite/kernels/fpga/io_copy_compute.cc @@ -44,8 +44,6 @@ class IoCopyHostCHWToFpgaHWCCompute param.x->target() == TARGET(kFPGA)); param.x->ZynqTensor()->flush(); - - if (param.x->ZynqTensor()->dataType() == zynqmp::INT32) { param.y->mutable_data(); param.y->ZynqTensor()->copyFrom(param.x->ZynqTensor()); @@ -86,7 +84,7 @@ class IoCopyFpgaToHostCompute auto& param = Param(); CHECK(param.x->target() == TARGET(kHost) || param.x->target() == TARGET(kFPGA)); - + param.x->ZynqTensor()->syncToDevice(); param.y->mutable_data(); param.y->ZynqTensor()->setDataType(zynqmp::FP32); @@ -104,7 +102,7 @@ class IoCopyFpgaToHostCompute } else { param.y->ZynqTensor()->copyFrom(param.x->ZynqTensor()); } - + param.y->ZynqTensor()->invalidate(); copy_properties(param); } @@ -141,16 +139,22 @@ class IoCopyFpgaToHostCHWCompute CHECK(param.x->target() == TARGET(kHost) || param.x->target() == TARGET(kFPGA)); - Tensor hwc; + param.x->ZynqTensor()->syncToDevice(); + if (param.x->ZynqTensor()->dataType() == zynqmp::INT32) { + param.y->mutable_data(); + param.y->ZynqTensor()->copyFrom(param.x->ZynqTensor()); + return; + } + + Tensor hwc; hwc.Resize(param.y->dims()); float* hwc_data = hwc.mutable_data(); float* chw_data = param.y->mutable_data(); param.y->ZynqTensor()->setDataType(zynqmp::FP32); - param.x->ZynqTensor()->syncToDevice(); hwc.ZynqTensor()->setDataLocation(zynqmp::CPU); param.y->ZynqTensor()->setDataLocation(zynqmp::CPU); - + if (param.x->ZynqTensor()->aligned() && param.x->ZynqTensor()->shape().shouldAlign()) { zynqmp::Tensor tempTensor; @@ -158,15 +162,15 @@ class IoCopyFpgaToHostCHWCompute param.x->ZynqTensor()->shape()); tempTensor.copyFrom(param.x->ZynqTensor()); tempTensor.setAligned(true); - // tempTensor.saveToFile("temp_1", true); - tempTensor.unalignImage(); - // tempTensor.saveToFile("temp_2", true); - + tempTensor.saveToFile("temp_1", true); + // tempTensor.unalignImage(); + tempTensor.saveToFile("temp_2", true); + hwc.ZynqTensor()->copyFrom(&tempTensor); } else { // hwc.ZynqTensor()->copyFrom(param.x->ZynqTensor()); float16* in_data = param.x->ZynqTensor()->data(); - // float* f_data = + // float* f_data = param.x->ZynqTensor()->flush(); float max = 0; @@ -198,6 +202,7 @@ class IoCopyFpgaToHostCHWCompute dims.height(), dims.width()); + param.y->ZynqTensor()->copyFrom(hwc.ZynqTensor()); // param.y->ZynqTensor()->copyScaleFrom(param.x->ZynqTensor()); param.y->ZynqTensor()->flush(); copy_properties(param); @@ -205,8 +210,8 @@ class IoCopyFpgaToHostCHWCompute param.x->ZynqTensor()->invalidate(); param.x->ZynqTensor()->flush(); // hwc.ZynqTensor()->saveToFile("hwc", true); - // param.x->ZynqTensor()->saveToFile("io2_x", true); - // param.y->ZynqTensor()->saveToFile("io2_y", true); + param.x->ZynqTensor()->saveToFile("io2_x", true); + param.y->ZynqTensor()->saveToFile("io2_y", true); } std::string doc() const override { return "Copy IO from FPGA to HOST"; } }; @@ -238,15 +243,16 @@ REGISTER_LITE_KERNEL(io_copy, kAny, paddle::lite::kernels::fpga::IoCopyHostCHWToFpgaHWCCompute, host_float_chw_to_device_fp16_hwc) - .BindInput("Input", {LiteType::GetTensorTy( - TARGET(kHost), PRECISION(kFloat), DATALAYOUT(kNCHW))}) + .BindInput("Input", + {LiteType::GetTensorTy(TARGET(kHost), + PRECISION(kFloat), + DATALAYOUT(kNCHW))}) .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kFPGA), PRECISION(kFP16), DATALAYOUT(kNHWC))}) .Finalize(); - REGISTER_LITE_KERNEL(io_copy, kFPGA, kAny, @@ -311,25 +317,24 @@ REGISTER_LITE_KERNEL(io_copy, // DATALAYOUT(kAny))}) // .Finalize(); - // ========================================================== - // std::unique_ptr GetTypeInferHandler() override { - // std::unique_ptr res(new type_infer_handler_t); - // *res = [](const std::map& inputs, - // const std::string& out) -> const Type* { - // CHECK(!inputs.empty()); - // auto* type = inputs.at("Input"); - // CHECK(type->target() == TARGET(kHost)); - - // auto out_place = type->place(); - // out_place.target = TARGET(kFPGA); - // auto* out_type = Type::Get(type->id(), - // out_place.target, - // out_place.precision, - // out_place.layout, - // out_place.device); - // return out_type; - // }; - // return res; - // } \ No newline at end of file +// std::unique_ptr GetTypeInferHandler() override { +// std::unique_ptr res(new type_infer_handler_t); +// *res = [](const std::map& inputs, +// const std::string& out) -> const Type* { +// CHECK(!inputs.empty()); +// auto* type = inputs.at("Input"); +// CHECK(type->target() == TARGET(kHost)); + +// auto out_place = type->place(); +// out_place.target = TARGET(kFPGA); +// auto* out_type = Type::Get(type->id(), +// out_place.target, +// out_place.precision, +// out_place.layout, +// out_place.device); +// return out_type; +// }; +// return res; +// } \ No newline at end of file diff --git a/lite/kernels/fpga/prior_box_compute.cc b/lite/kernels/fpga/prior_box_compute.cc index c19744fa520e1333180d97a72d8f1f45f9f3a9bf..ac4a5d18a9c7dc9b45684a1ae8072542245fd645 100644 --- a/lite/kernels/fpga/prior_box_compute.cc +++ b/lite/kernels/fpga/prior_box_compute.cc @@ -132,4 +132,3 @@ REGISTER_LITE_KERNEL(prior_box, .BindOutput("Boxes", {LiteType::GetTensorTy(TARGET(kARM))}) .BindOutput("Variances", {LiteType::GetTensorTy(TARGET(kARM))}) .Finalize(); - diff --git a/lite/kernels/fpga/reshape_compute.cc b/lite/kernels/fpga/reshape_compute.cc index 24c60f54efde70f67aa12a308437ea55ff5f47b7..3f088700f1ee9787d0d19eff00a809cb3daf9ddd 100644 --- a/lite/kernels/fpga/reshape_compute.cc +++ b/lite/kernels/fpga/reshape_compute.cc @@ -23,7 +23,6 @@ namespace fpga { using float16 = zynqmp::float16; - void FlattenCompute::Run() { auto& param = Param(); auto x = param.x; @@ -45,12 +44,10 @@ void FlattenCompute::Run() { output->Resize(output_dims); #ifdef FPGA_PRINT_TENSOR - Debugger::get_instance().registerOutput("flatten", - output->ZynqTensor()); + Debugger::get_instance().registerOutput("flatten", output->ZynqTensor()); #endif } - void ReshapeCompute::Run() { auto& param = Param(); auto x = param.x; @@ -69,17 +66,14 @@ void ReshapeCompute::Run() { } else { // output->CopyDataFrom(*x); } - - output->ZynqTensor()->copyFrom(x->ZynqTensor()); // output->ZynqTensor()->saveToFile("ro", true); output->ZynqTensor()->flush(); output->ZynqTensor()->setAligned(x->ZynqTensor()->aligned()); - + #ifdef FPGA_PRINT_TENSOR - Debugger::get_instance().registerOutput("reshape", - output->ZynqTensor()); + Debugger::get_instance().registerOutput("reshape", output->ZynqTensor()); #endif } @@ -163,7 +157,7 @@ REGISTER_LITE_KERNEL(flatten2, PRECISION(kFP16), DATALAYOUT(kNHWC))}) .BindInput("Shape", - {LiteType::GetTensorTy(TARGET(kHost), + {LiteType::GetTensorTy(TARGET(kHost), PRECISION(kAny), DATALAYOUT(kAny))}) .BindOutput("Out", diff --git a/lite/kernels/fpga/softmax_compute.cc b/lite/kernels/fpga/softmax_compute.cc index 25fceda569a832e0f415fd878705b8a1b99ce34f..8e51a716b092489920f0cfbc729f0cb22e5c71c8 100755 --- a/lite/kernels/fpga/softmax_compute.cc +++ b/lite/kernels/fpga/softmax_compute.cc @@ -38,9 +38,9 @@ void SoftmaxCompute::Run() { zynqmp::SoftmaxParam& softmax_param = pe_.param(); // softmax_param.input->saveToFile("softmax_in", true); pe_.dispatch(); - + softmax_param.output->flush(); - // softmax_param.output->saveToFile("softmax", true); +// softmax_param.output->saveToFile("softmax", true); #ifdef FPGA_PRINT_TENSOR Debugger::get_instance().registerOutput("softmax", softmax_param.output); #endif @@ -61,17 +61,9 @@ REGISTER_LITE_KERNEL(softmax, {LiteType::GetTensorTy(TARGET(kFPGA), PRECISION(kFP16), DATALAYOUT(kNHWC))}) - .BindOutput("Out", - {LiteType::GetTensorTy(TARGET(kARM))}) + .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kARM))}) .Finalize(); - - - - - - - // .BindOutput("Out", // {LiteType::GetTensorTy(TARGET(kFPGA), // PRECISION(kFP16), diff --git a/lite/kernels/fpga/transpose_compute.cc b/lite/kernels/fpga/transpose_compute.cc index 5f55ae3d9f18f15f82ef15941610a2aecca5e474..d2e9d9c16662a5e8a91ee43af59544c4c1efcecc 100644 --- a/lite/kernels/fpga/transpose_compute.cc +++ b/lite/kernels/fpga/transpose_compute.cc @@ -104,7 +104,7 @@ void Transpose2Compute::Run() { // param.x->ZynqTensor()->saveToFile("t_unaligned", true); param.x->ZynqTensor()->flush(); param.x->ZynqTensor()->invalidate(); - + if (param.x->dims().size() != 4) { transposeCompute(param); param.output->ZynqTensor()->setAligned(param.x->ZynqTensor()->aligned()); @@ -115,7 +115,7 @@ void Transpose2Compute::Run() { // param.output->ZynqTensor()->copyFrom(param.x->ZynqTensor()); param.output->ZynqTensor()->flush(); - // param.output->ZynqTensor()->saveToFile("Transpose2", true); + param.output->ZynqTensor()->saveToFile("Transpose2", true); } } // namespace fpga @@ -151,8 +151,9 @@ REGISTER_LITE_KERNEL(transpose2, {LiteType::GetTensorTy(TARGET(kFPGA), PRECISION(kFP16), DATALAYOUT(kNHWC))}) - .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kFPGA), - PRECISION(kFP16), - DATALAYOUT(kNHWC))}) + .BindOutput("Out", + {LiteType::GetTensorTy(TARGET(kFPGA), + PRECISION(kFP16), + DATALAYOUT(kNHWC))}) .BindOutput("XShape", {LiteType::GetTensorTy(TARGET(kARM))}) .Finalize(); diff --git a/lite/kernels/fpga/yolo_box_compute.cc b/lite/kernels/fpga/yolo_box_compute.cc new file mode 100644 index 0000000000000000000000000000000000000000..1e90cf30c4b0d6b5bf93d85811ef8fbe7324ba34 --- /dev/null +++ b/lite/kernels/fpga/yolo_box_compute.cc @@ -0,0 +1,80 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "lite/kernels/fpga/yolo_box_compute.h" +#include +#include "lite/backends/arm/math/funcs.h" +#include "lite/core/tensor.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace fpga { + +void YoloBoxCompute::PrepareForRun() { + auto& param = Param(); + lite::Tensor* X = param.X; + lite::Tensor* ImgSize = param.ImgSize; + lite::Tensor* Boxes = param.Boxes; + lite::Tensor* Scores = param.Scores; + + + Boxes->mutable_data(); + Scores->mutable_data(); + + zynqmp::YoloBoxParam& yolobox_param = pe_.param(); + yolobox_param.input = X->ZynqTensor(); + yolobox_param.imgSize = ImgSize->ZynqTensor(); + yolobox_param.outputBoxes = Boxes->ZynqTensor(); + yolobox_param.outputScores = Scores->ZynqTensor(); + yolobox_param.downsampleRatio = param.downsample_ratio; + yolobox_param.anchors = param.anchors; + yolobox_param.classNum = param.class_num; + yolobox_param.confThresh = param.conf_thresh; + + pe_.init(); + pe_.apply(); + +} + +void YoloBoxCompute::Run() { + + pe_.dispatch(); + + zynqmp::YoloBoxParam& yolobox_param = pe_.param(); + yolobox_param.imgSize->saveToFile("img_size", true); +// exit(-1); + yolobox_param.outputBoxes->saveToFile("yolo_boxes", true); + yolobox_param.outputScores->saveToFile("yolo_scores", true); +} + +} // namespace fpga +} // namespace kernels +} // namespace lite +} // namespace paddle + +// REGISTER_LITE_KERNEL(yolo_box, +// kFPGA, +// kFP16, +// kNHWC, +// paddle::lite::kernels::fpga::YoloBoxCompute, +// def) +// .BindInput("X", {LiteType::GetTensorTy(TARGET(kFPGA), +// PRECISION(kFP16), +// DATALAYOUT(kNHWC))}) +// .BindInput("ImgSize", +// {LiteType::GetTensorTy(TARGET(kARM), PRECISION(kInt32))}) +// .BindOutput("Boxes", {LiteType::GetTensorTy(TARGET(kARM))}) +// .BindOutput("Scores", {LiteType::GetTensorTy(TARGET(kARM))}) +// .Finalize(); diff --git a/lite/kernels/fpga/yolo_box_compute.h b/lite/kernels/fpga/yolo_box_compute.h new file mode 100644 index 0000000000000000000000000000000000000000..e4c573cf6719ea1b49fb83b431182ff57c8f4796 --- /dev/null +++ b/lite/kernels/fpga/yolo_box_compute.h @@ -0,0 +1,47 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once +#include "lite/core/kernel.h" +#include "lite/core/op_registry.h" + +#include "lite/backends/fpga/KD/float16.hpp" +#include "lite/backends/fpga/KD/pes/elementwise_add_pe.hpp" +#include "lite/backends/fpga/KD/pes/yolobox_pe.hpp" + +namespace paddle { +namespace lite { +namespace kernels { +namespace fpga { + +using float16 = zynqmp::float16; + +class YoloBoxCompute + : public KernelLite { + public: + void PrepareForRun() override; + void Run() override; + + virtual ~YoloBoxCompute() { + + }; + + private: + zynqmp::YoloBoxPE pe_; +}; + +} // namespace fpga +} // namespace kernels +} // namespace lite +} // namespace paddle