From 8e699af15ea784afb2c8642fe1150534bb56e5b7 Mon Sep 17 00:00:00 2001 From: chonwhite Date: Fri, 15 May 2020 16:25:37 +0800 Subject: [PATCH] ReNext Pass --- lite/backends/fpga/KD/debugger.hpp | 37 +++++++++------- lite/backends/fpga/KD/llapi/filter.cpp | 2 + lite/backends/fpga/KD/pe_params.hpp | 54 ++++++++++++++---------- lite/backends/fpga/KD/pes/conv_pe.hpp | 12 +----- lite/backends/fpga/KD/pes/relu_pe.hpp | 51 ++++++++-------------- lite/backends/fpga/KD/pes/scale_pe.hpp | 1 + lite/backends/fpga/KD/tensor.hpp | 6 +-- lite/kernels/fpga/CMakeLists.txt | 4 +- lite/kernels/fpga/activation_compute.cc | 6 +-- lite/kernels/fpga/elementwise_compute.cc | 24 +++++++---- lite/kernels/fpga/elementwise_compute.h | 3 ++ lite/kernels/fpga/fetch_compute.cc | 1 + 12 files changed, 100 insertions(+), 101 deletions(-) diff --git a/lite/backends/fpga/KD/debugger.hpp b/lite/backends/fpga/KD/debugger.hpp index c0510e0f38..fa069688b7 100755 --- a/lite/backends/fpga/KD/debugger.hpp +++ b/lite/backends/fpga/KD/debugger.hpp @@ -48,26 +48,33 @@ class Debugger { void tock(std::string key) {} + void setEnable(bool en) { enabled_ = en; } + private: + bool enabled_ = false; + std::unordered_map op_config; std::unordered_map tick_tock_map; Debugger() { - op_config["concat"] = true; - op_config["pooling"] = true; - op_config["conv"] = true; - op_config["dwconv"] = true; - op_config["ew_add"] = true; - op_config["crop"] = true; - op_config["feed"] = true; - op_config["mul"] = true; - op_config["fetch"] = true; - op_config["boxes"] = true; - op_config["scores"] = true; - op_config["nms"] = true; - op_config["pb_boxes"] = true; - op_config["pb_variances"] = true; + // op_config["concat"] = true; + // op_config["pooling"] = true; + // op_config["conv"] = true; + // op_config["dropout"] = true; + // op_config["dwconv"] = true; + // op_config["ew_add"] = true; + // op_config["ew_mul"] = true; + // op_config["crop"] = true; + // op_config["feed"] = true; // op_config["fc"] = true; - op_config["softmax"] = true; + // op_config["mul"] = true; + // op_config["fetch"] = true; + // op_config["boxes"] = true; + // op_config["scores"] = true; + // op_config["nms"] = true; + // op_config["pb_boxes"] = true; + // op_config["pb_variances"] = true; + + // op_config["softmax"] = true; } }; diff --git a/lite/backends/fpga/KD/llapi/filter.cpp b/lite/backends/fpga/KD/llapi/filter.cpp index 7727345b1c..e09b9d67d1 100755 --- a/lite/backends/fpga/KD/llapi/filter.cpp +++ b/lite/backends/fpga/KD/llapi/filter.cpp @@ -240,6 +240,8 @@ int8_t* format_filter(float* data_in, for (int n = 0; n < num; n++) { float* filter_start = data_in + n * chw; int8_t* quantized_start = quantized_data + n * chw; + // float f_max = find_max(filter_start, chw); + float f_max = max; quantize(filter_start, quantized_start, chw, f_max); filter_max.push_back(f_max); } diff --git a/lite/backends/fpga/KD/pe_params.hpp b/lite/backends/fpga/KD/pe_params.hpp index 42ec32957e..222a788d35 100644 --- a/lite/backends/fpga/KD/pe_params.hpp +++ b/lite/backends/fpga/KD/pe_params.hpp @@ -83,26 +83,34 @@ struct ConvParam : PEParam { std::vector kernelSize; std::vector dilations; - Tensor* scale() { return scale_; } + Tensor* scale() { return &scale_; } - Tensor* bias() { return bias_; } + Tensor* bias() { return &bias_; } std::vector& splitParams() { return splitParams_; } + ~ConvParam() { + for (int i = 0; i < splitParams_.size(); i++) { + BasicConvParam* basic_param = splitParams_[i]; + delete basic_param; + } + splitParams_.clear(); + } + protected: std::vector splitParams_; - Tensor* scale_ = new Tensor(); - Tensor* bias_ = new Tensor(); + Tensor scale_; + Tensor bias_; }; struct DepthwiseConvParam : ConvParam { public: - Tensor* quantizedFilter() { return quantizedFilter_; } + Tensor* quantizedFilter() { return &quantizedFilter_; } DWconvArgs args; protected: - Tensor* quantizedFilter_ = new Tensor(); + Tensor quantizedFilter_; }; enum PoolingType : int { @@ -142,7 +150,7 @@ struct ElementwiseAddParam : PEParam { struct ElementwiseMulParam : PEParam { public: - Tensor* input_x; + Tensor* input_x = nullptr; Tensor* input_y = nullptr; Tensor* output = nullptr; }; @@ -154,13 +162,13 @@ struct FullyConnectedParam : PEParam { Tensor* bias = nullptr; Tensor* output = nullptr; - Tensor* quantizedFilter() { return quantizedFilter_; } + Tensor* quantizedFilter() { return &quantizedFilter_; } - Tensor* biasScale() { return biasScale_; } + Tensor* biasScale() { return &biasScale_; } protected: - Tensor* quantizedFilter_ = new Tensor(); - Tensor* biasScale_ = new Tensor(); + Tensor quantizedFilter_; + Tensor biasScale_; }; struct SoftmaxParam : PEParam { @@ -193,10 +201,10 @@ struct NormParam : PEParam { }; struct PriorBoxParam : PEParam { - Tensor* input; - Tensor* image; - Tensor* outputBoxes; - Tensor* outputVariances; + Tensor* input = nullptr; + Tensor* image = nullptr; + Tensor* outputBoxes = nullptr; + Tensor* outputVariances = nullptr; std::vector minSizes; std::vector maxSizes; @@ -212,10 +220,10 @@ struct PriorBoxParam : PEParam { }; struct YoloBoxParam : PEParam { - Tensor* input; - Tensor* imgSize; - Tensor* outputBoxes; - Tensor* outputScores; + Tensor* input = nullptr; + Tensor* imgSize = nullptr; + Tensor* outputBoxes = nullptr; + Tensor* outputScores = nullptr; int downsampleRatio; std::vector anchors; int classNum; @@ -229,15 +237,15 @@ struct ScaleParam : PEParam { Tensor* scale = nullptr; Tensor* bias = nullptr; - Tensor* alignedScale() { return alignedScale_; } + Tensor* alignedScale() { return &alignedScale_; } - Tensor* alignedBias() { return alignedBias_; } + Tensor* alignedBias() { return &alignedBias_; } ScaleArgs args = {0}; protected: - Tensor* alignedScale_ = new Tensor(); - Tensor* alignedBias_ = new Tensor(); + Tensor alignedScale_; + Tensor alignedBias_; }; struct ResizeParam : PEParam { diff --git a/lite/backends/fpga/KD/pes/conv_pe.hpp b/lite/backends/fpga/KD/pes/conv_pe.hpp index b4eac2c41e..48fb16a7ec 100644 --- a/lite/backends/fpga/KD/pes/conv_pe.hpp +++ b/lite/backends/fpga/KD/pes/conv_pe.hpp @@ -195,16 +195,6 @@ class ConvPE : public PE { addPE_.init(); addPE_.apply(); addPE_.dispatch(); - - // param_.output->printScale(); - - // params[0]->input.saveToFile("conv_1.txt"); - // params[1]->input.saveToFile("conv_2.txt"); - - // params[0]->output.saveToFile("ew_o1.txt"); - // params[1]->output.saveToFile("ew_o2.txt"); - // std::cout << "\n ================== EW ================== \n"; - // } } return ret == 0; @@ -212,6 +202,8 @@ class ConvPE : public PE { ConvParam& param() { return param_; } + ~ConvPE() {} + private: bool use_cpu_ = false; bool split_channel = false; diff --git a/lite/backends/fpga/KD/pes/relu_pe.hpp b/lite/backends/fpga/KD/pes/relu_pe.hpp index 5c125010c2..dfc7086773 100755 --- a/lite/backends/fpga/KD/pes/relu_pe.hpp +++ b/lite/backends/fpga/KD/pes/relu_pe.hpp @@ -23,43 +23,27 @@ class ReluPE : public PE { public: bool init() { Tensor* output = param_.output; - output->setAligned(true); - output->setDataLocation(Device); + output->setAligned(param_.input->aligned()); + output->setDataLocation(CPU); return true; } - void apply() { - Tensor* src = param_.input; - - args_.input_data_type = DATA_TYPE_FP16; - args_.output_data_type = DATA_TYPE_FP16; - args_.input_layout_type = LAYOUT_HWC; - args_.output_layout_type = LAYOUT_HWC; - args_.image = {.address = src->data(), - .scale_address = src->scale(), - .channels = (uint32_t)src->shape().channel(), - .width = (uint32_t)src->shape().width(), - .height = (uint32_t)src->shape().height(), - .pad_width = 0u, - .pad_height = 0u}; - args_.output = { - .address = param_.output->data(), - .scale_address = param_.output->scale(), - }; - - inplace_.relu_enable = false; - inplace_.power_enable = false; - inplace_.normalize_enable = false; - } + void apply() {} bool dispatch() { - inplace_.relu_enable = true; - config_inplace(inplace_); - param_.input->syncToDevice(); - param_.output->copyFrom(param_.input); - param_.output->invalidate(); - inplace_.relu_enable = false; - config_inplace(inplace_); + param_.input->invalidate(); + int16_t* input_data = param_.input->data(); + float16* out_data = param_.output->data(); + for (int i = 0; i < param_.input->shape().alignedElementCount(); i++) { + int16_t v = param_.input->data()[i]; + if (v > 0) { + out_data[i] = input_data[i]; + } else { + out_data[i] = zero; + } + } + param_.output->copyScaleFrom(param_.input); + param_.output->flush(); return true; } @@ -67,8 +51,7 @@ class ReluPE : public PE { private: InputParam param_; - BypassArgs args_; - InplaceArgs inplace_; + float16 zero = float_to_half(0.0f); }; } // namespace zynqmp diff --git a/lite/backends/fpga/KD/pes/scale_pe.hpp b/lite/backends/fpga/KD/pes/scale_pe.hpp index 09755c65a3..5ff94edd74 100755 --- a/lite/backends/fpga/KD/pes/scale_pe.hpp +++ b/lite/backends/fpga/KD/pes/scale_pe.hpp @@ -36,6 +36,7 @@ class ScalePE : public PE { } inline int lcm(int a, int b) { return a * b / gcd(a, b); } + bool init() { Tensor* output = param_.output; output->setAligned(true); diff --git a/lite/backends/fpga/KD/tensor.hpp b/lite/backends/fpga/KD/tensor.hpp index 1e1793faae..065495fd85 100644 --- a/lite/backends/fpga/KD/tensor.hpp +++ b/lite/backends/fpga/KD/tensor.hpp @@ -283,7 +283,6 @@ class Tensor { .address = data(), .scale_address = scale(), }; args.output = output; - src->syncToDevice(); size_t aligned_remainder = src->shape().numel() % 16; if (aligned_remainder > 0) { size_t dtype_size = @@ -293,7 +292,6 @@ class Tensor { fpga_flush(dst, aligned_remainder * dtype_size); } src->syncToDevice(); - this->invalidate(); perform_bypass(args); this->invalidate(); } @@ -303,8 +301,7 @@ class Tensor { return; } - size_t memorySize = - shape_->memorySize(CellSize(dataType_)) * mem_scale_factor_; + size_t memorySize = placeHolder_->memorySize(); fpga_flush(placeHolder_->data(), memorySize); } @@ -384,7 +381,6 @@ class Tensor { } void save_file_with_name(std::string path) { - invalidate(); std::ofstream ofs; ofs.open(path); ofs << scale()[0] << " / " << scale()[1] << std::endl; diff --git a/lite/kernels/fpga/CMakeLists.txt b/lite/kernels/fpga/CMakeLists.txt index 34125a0f47..0af17ecbe7 100755 --- a/lite/kernels/fpga/CMakeLists.txt +++ b/lite/kernels/fpga/CMakeLists.txt @@ -5,7 +5,7 @@ endif() set(fpga_deps fpga_target_wrapper kernel_fpga) -# add_kernel(activation_compute_fpga FPGA basic SRCS activation_compute.cc DEPS ${fpga_deps}) +add_kernel(activation_compute_fpga FPGA basic SRCS activation_compute.cc DEPS ${fpga_deps}) # add_kernel(box_coder_compute_fpga FPGA basic SRCS box_coder_compute.cc DEPS ${fpga_deps}) # add_kernel(concat_compute_fpga FPGA basic SRCS concat_compute.cc DEPS ${fpga_deps}) @@ -25,7 +25,7 @@ add_kernel(norm_compute_fpga FPGA basic SRCS norm_compute.cc DEPS ${fpga_deps}) # add_kernel(im2sequence_compute_fpga FPGA basic SRCS im2sequence_compute.cc DEPS ${fpga_deps}) add_kernel(pooling_compute_fpga FPGA basic SRCS pooling_compute.cc DEPS ${fpga_deps}) add_kernel(prior_box_compute_fpga FPGA basic SRCS prior_box_compute.cc DEPS ${fpga_deps}) -# add_kernel(reshape_compute_fpga FPGA basic SRCS reshape_compute.cc DEPS ${fpga_deps} reshape_op) +add_kernel(reshape_compute_fpga FPGA basic SRCS reshape_compute.cc DEPS ${fpga_deps} reshape_op) # add_kernel(sequence_pool_compute_fpga FPGA basic SRCS sequence_pool_compute.cc DEPS ${fpga_deps}) add_kernel(scale_compute_fpga FPGA basic SRCS scale_compute.cc DEPS ${fpga_deps}) # add_kernel(softmax_compute_fpga FPGA basic SRCS softmax_compute.cc DEPS ${fpga_deps}) diff --git a/lite/kernels/fpga/activation_compute.cc b/lite/kernels/fpga/activation_compute.cc index ecd9af0f8d..f6704204d3 100644 --- a/lite/kernels/fpga/activation_compute.cc +++ b/lite/kernels/fpga/activation_compute.cc @@ -25,10 +25,10 @@ using float16 = zynqmp::float16; void ReluCompute::PrepareForRun() { auto& param = this->Param(); auto output_data = param.Out->mutable_data(); - zynqmp::InputParam& input_param = pe_.param(); + zynqmp::InputParam& relu_param = pe_.param(); - input_param.input = param.X->ZynqTensor(); - input_param.output = param.Out->ZynqTensor(); + relu_param.input = param.X->ZynqTensor(); + relu_param.output = param.Out->ZynqTensor(); pe_.init(); pe_.apply(); } diff --git a/lite/kernels/fpga/elementwise_compute.cc b/lite/kernels/fpga/elementwise_compute.cc index d22cc7abac..1ef6d19e83 100755 --- a/lite/kernels/fpga/elementwise_compute.cc +++ b/lite/kernels/fpga/elementwise_compute.cc @@ -40,6 +40,7 @@ void ElementwiseAddCompute::PrepareForRun() { pe_.apply(); } void ElementwiseAddCompute::Run() { + usleep(50 * 100 * 1000); pe_.dispatch(); #ifdef FPGA_PRINT_TENSOR zynqmp::ElementwiseAddParam& ew_param = pe_.param(); @@ -62,6 +63,7 @@ void ElementwiseAddActivationCompute::PrepareForRun() { pe_.apply(); } void ElementwiseAddActivationCompute::Run() { + usleep(500 * 100 * 1000); pe_.dispatch(); #ifdef FPGA_PRINT_TENSOR zynqmp::ElementwiseAddParam& ew_param = pe_.param(); @@ -80,21 +82,21 @@ void ElementwiseMulCompute::PrepareForRun() { scale_param.activeParam.type = zynqmp::TYPE_NONE; int channel = scale_param.input->shape().channel(); - zynqmp::Tensor* scale = new zynqmp::Tensor(); - zynqmp::Tensor* bias = new zynqmp::Tensor(); - scale_param.scale = scale; - scale_param.bias = bias; + scale_param.scale = &scale_; + scale_param.bias = &bias_; zynqmp::Shape shape(zynqmp::N, {channel}); - float* scale_data = scale->mutableData(zynqmp::FP32, shape); - float* bias_data = bias->mutableData(zynqmp::FP32, shape); + zynqmp::float16* scale_data = + scale_.mutableData(zynqmp::FP16, shape); + zynqmp::float16* bias_data = + bias_.mutableData(zynqmp::FP16, shape); float scale_value = param.Y->data()[0]; - for (int i = 0; i < channel; ++i) { + for (int i = 0; i < channel; i++) { if (param.Y->dims().production() != 1) { scale_value = param.Y->ZynqTensor()->data()[i]; } - scale_data[i] = scale_value; - bias_data[i] = 0; + scale_data[i] = zynqmp::float_to_half(scale_value); + bias_data[i] = zero_; } pe_.init(); @@ -102,6 +104,10 @@ void ElementwiseMulCompute::PrepareForRun() { } void ElementwiseMulCompute::Run() { + auto& param = Param(); + param.Y->ZynqTensor()->flush(); + scale_.copyFrom(param.Y->ZynqTensor()); + scale_.invalidate(); pe_.dispatch(); #ifdef FPGA_PRINT_TENSOR zynqmp::ScaleParam& scale_param = pe_.param(); diff --git a/lite/kernels/fpga/elementwise_compute.h b/lite/kernels/fpga/elementwise_compute.h index e3e9c52c4c..9fa4991161 100644 --- a/lite/kernels/fpga/elementwise_compute.h +++ b/lite/kernels/fpga/elementwise_compute.h @@ -61,6 +61,9 @@ class ElementwiseMulCompute private: zynqmp::ScalePE pe_; + zynqmp::Tensor scale_; + zynqmp::Tensor bias_; + zynqmp::float16 zero_ = zynqmp::float_to_half(0.0f); }; } // namespace fpga diff --git a/lite/kernels/fpga/fetch_compute.cc b/lite/kernels/fpga/fetch_compute.cc index 71ec37a64d..18aea40c4e 100755 --- a/lite/kernels/fpga/fetch_compute.cc +++ b/lite/kernels/fpga/fetch_compute.cc @@ -55,6 +55,7 @@ void FetchCompute::Run() { #ifdef FPGA_PRINT_TENSOR zynqmp::OutputParam& fetch_param = pe_.param(); Debugger::get_instance().registerOutput("fetch", fetch_param.output); + Debugger::get_instance().setEnable(true); #endif } -- GitLab