diff --git a/lite/backends/fpga/KD/debugger.hpp b/lite/backends/fpga/KD/debugger.hpp index c0510e0f381a2ba6ae355870752dcb7dae1bd93f..fa069688b7cfa80adc299fa9668bd2b045897292 100755 --- a/lite/backends/fpga/KD/debugger.hpp +++ b/lite/backends/fpga/KD/debugger.hpp @@ -48,26 +48,33 @@ class Debugger { void tock(std::string key) {} + void setEnable(bool en) { enabled_ = en; } + private: + bool enabled_ = false; + std::unordered_map op_config; std::unordered_map tick_tock_map; Debugger() { - op_config["concat"] = true; - op_config["pooling"] = true; - op_config["conv"] = true; - op_config["dwconv"] = true; - op_config["ew_add"] = true; - op_config["crop"] = true; - op_config["feed"] = true; - op_config["mul"] = true; - op_config["fetch"] = true; - op_config["boxes"] = true; - op_config["scores"] = true; - op_config["nms"] = true; - op_config["pb_boxes"] = true; - op_config["pb_variances"] = true; + // op_config["concat"] = true; + // op_config["pooling"] = true; + // op_config["conv"] = true; + // op_config["dropout"] = true; + // op_config["dwconv"] = true; + // op_config["ew_add"] = true; + // op_config["ew_mul"] = true; + // op_config["crop"] = true; + // op_config["feed"] = true; // op_config["fc"] = true; - op_config["softmax"] = true; + // op_config["mul"] = true; + // op_config["fetch"] = true; + // op_config["boxes"] = true; + // op_config["scores"] = true; + // op_config["nms"] = true; + // op_config["pb_boxes"] = true; + // op_config["pb_variances"] = true; + + // op_config["softmax"] = true; } }; diff --git a/lite/backends/fpga/KD/llapi/filter.cpp b/lite/backends/fpga/KD/llapi/filter.cpp index 7727345b1c138ba7d84bcbcd078badb2e2fb98d5..e09b9d67d1263278abcd84d6ab9d7e392ee94b48 100755 --- a/lite/backends/fpga/KD/llapi/filter.cpp +++ b/lite/backends/fpga/KD/llapi/filter.cpp @@ -240,6 +240,8 @@ int8_t* format_filter(float* data_in, for (int n = 0; n < num; n++) { float* filter_start = data_in + n * chw; int8_t* quantized_start = quantized_data + n * chw; + // 
float f_max = find_max(filter_start, chw); + float f_max = max; quantize(filter_start, quantized_start, chw, f_max); filter_max.push_back(f_max); } diff --git a/lite/backends/fpga/KD/pe_params.hpp b/lite/backends/fpga/KD/pe_params.hpp index 42ec32957e5884aaae3cc96f46060de114b44ead..222a788d351d9b3dd2cde7c595af898602990ea3 100644 --- a/lite/backends/fpga/KD/pe_params.hpp +++ b/lite/backends/fpga/KD/pe_params.hpp @@ -83,26 +83,34 @@ struct ConvParam : PEParam { std::vector kernelSize; std::vector dilations; - Tensor* scale() { return scale_; } + Tensor* scale() { return &scale_; } - Tensor* bias() { return bias_; } + Tensor* bias() { return &bias_; } std::vector& splitParams() { return splitParams_; } + ~ConvParam() { + for (int i = 0; i < splitParams_.size(); i++) { + BasicConvParam* basic_param = splitParams_[i]; + delete basic_param; + } + splitParams_.clear(); + } + protected: std::vector splitParams_; - Tensor* scale_ = new Tensor(); - Tensor* bias_ = new Tensor(); + Tensor scale_; + Tensor bias_; }; struct DepthwiseConvParam : ConvParam { public: - Tensor* quantizedFilter() { return quantizedFilter_; } + Tensor* quantizedFilter() { return &quantizedFilter_; } DWconvArgs args; protected: - Tensor* quantizedFilter_ = new Tensor(); + Tensor quantizedFilter_; }; enum PoolingType : int { @@ -142,7 +150,7 @@ struct ElementwiseAddParam : PEParam { struct ElementwiseMulParam : PEParam { public: - Tensor* input_x; + Tensor* input_x = nullptr; Tensor* input_y = nullptr; Tensor* output = nullptr; }; @@ -154,13 +162,13 @@ struct FullyConnectedParam : PEParam { Tensor* bias = nullptr; Tensor* output = nullptr; - Tensor* quantizedFilter() { return quantizedFilter_; } + Tensor* quantizedFilter() { return &quantizedFilter_; } - Tensor* biasScale() { return biasScale_; } + Tensor* biasScale() { return &biasScale_; } protected: - Tensor* quantizedFilter_ = new Tensor(); - Tensor* biasScale_ = new Tensor(); + Tensor quantizedFilter_; + Tensor biasScale_; }; struct SoftmaxParam : 
PEParam { @@ -193,10 +201,10 @@ struct NormParam : PEParam { }; struct PriorBoxParam : PEParam { - Tensor* input; - Tensor* image; - Tensor* outputBoxes; - Tensor* outputVariances; + Tensor* input = nullptr; + Tensor* image = nullptr; + Tensor* outputBoxes = nullptr; + Tensor* outputVariances = nullptr; std::vector minSizes; std::vector maxSizes; @@ -212,10 +220,10 @@ struct PriorBoxParam : PEParam { }; struct YoloBoxParam : PEParam { - Tensor* input; - Tensor* imgSize; - Tensor* outputBoxes; - Tensor* outputScores; + Tensor* input = nullptr; + Tensor* imgSize = nullptr; + Tensor* outputBoxes = nullptr; + Tensor* outputScores = nullptr; int downsampleRatio; std::vector anchors; int classNum; @@ -229,15 +237,15 @@ struct ScaleParam : PEParam { Tensor* scale = nullptr; Tensor* bias = nullptr; - Tensor* alignedScale() { return alignedScale_; } + Tensor* alignedScale() { return &alignedScale_; } - Tensor* alignedBias() { return alignedBias_; } + Tensor* alignedBias() { return &alignedBias_; } ScaleArgs args = {0}; protected: - Tensor* alignedScale_ = new Tensor(); - Tensor* alignedBias_ = new Tensor(); + Tensor alignedScale_; + Tensor alignedBias_; }; struct ResizeParam : PEParam { diff --git a/lite/backends/fpga/KD/pes/conv_pe.hpp b/lite/backends/fpga/KD/pes/conv_pe.hpp index b4eac2c41e138cab19197ccb8ab89681a69ec6fe..48fb16a7ecde7416fb32ee228c9dd26e9c0f2d5b 100644 --- a/lite/backends/fpga/KD/pes/conv_pe.hpp +++ b/lite/backends/fpga/KD/pes/conv_pe.hpp @@ -195,16 +195,6 @@ class ConvPE : public PE { addPE_.init(); addPE_.apply(); addPE_.dispatch(); - - // param_.output->printScale(); - - // params[0]->input.saveToFile("conv_1.txt"); - // params[1]->input.saveToFile("conv_2.txt"); - - // params[0]->output.saveToFile("ew_o1.txt"); - // params[1]->output.saveToFile("ew_o2.txt"); - // std::cout << "\n ================== EW ================== \n"; - // } } return ret == 0; @@ -212,6 +202,8 @@ class ConvPE : public PE { ConvParam& param() { return param_; } + ~ConvPE() {} + 
private: bool use_cpu_ = false; bool split_channel = false; diff --git a/lite/backends/fpga/KD/pes/relu_pe.hpp b/lite/backends/fpga/KD/pes/relu_pe.hpp index 5c125010c27615c545ba274b259f18c775db3d55..dfc70867735b18f10970864888eca88c7f2dc56e 100755 --- a/lite/backends/fpga/KD/pes/relu_pe.hpp +++ b/lite/backends/fpga/KD/pes/relu_pe.hpp @@ -23,43 +23,27 @@ class ReluPE : public PE { public: bool init() { Tensor* output = param_.output; - output->setAligned(true); - output->setDataLocation(Device); + output->setAligned(param_.input->aligned()); + output->setDataLocation(CPU); return true; } - void apply() { - Tensor* src = param_.input; - - args_.input_data_type = DATA_TYPE_FP16; - args_.output_data_type = DATA_TYPE_FP16; - args_.input_layout_type = LAYOUT_HWC; - args_.output_layout_type = LAYOUT_HWC; - args_.image = {.address = src->data(), - .scale_address = src->scale(), - .channels = (uint32_t)src->shape().channel(), - .width = (uint32_t)src->shape().width(), - .height = (uint32_t)src->shape().height(), - .pad_width = 0u, - .pad_height = 0u}; - args_.output = { - .address = param_.output->data(), - .scale_address = param_.output->scale(), - }; - - inplace_.relu_enable = false; - inplace_.power_enable = false; - inplace_.normalize_enable = false; - } + void apply() {} bool dispatch() { - inplace_.relu_enable = true; - config_inplace(inplace_); - param_.input->syncToDevice(); - param_.output->copyFrom(param_.input); - param_.output->invalidate(); - inplace_.relu_enable = false; - config_inplace(inplace_); + param_.input->invalidate(); + int16_t* input_data = param_.input->data(); + float16* out_data = param_.output->data(); + for (int i = 0; i < param_.input->shape().alignedElementCount(); i++) { + int16_t v = param_.input->data()[i]; + if (v > 0) { + out_data[i] = input_data[i]; + } else { + out_data[i] = zero; + } + } + param_.output->copyScaleFrom(param_.input); + param_.output->flush(); return true; } @@ -67,8 +51,7 @@ class ReluPE : public PE { private: 
InputParam param_; - BypassArgs args_; - InplaceArgs inplace_; + float16 zero = float_to_half(0.0f); }; } // namespace zynqmp diff --git a/lite/backends/fpga/KD/pes/scale_pe.hpp b/lite/backends/fpga/KD/pes/scale_pe.hpp index 09755c65a322da8ccab0d57dd2e877712b112361..5ff94edd747fe9f01741baf1efaad288ed32b98d 100755 --- a/lite/backends/fpga/KD/pes/scale_pe.hpp +++ b/lite/backends/fpga/KD/pes/scale_pe.hpp @@ -36,6 +36,7 @@ class ScalePE : public PE { } inline int lcm(int a, int b) { return a * b / gcd(a, b); } + bool init() { Tensor* output = param_.output; output->setAligned(true); diff --git a/lite/backends/fpga/KD/tensor.hpp b/lite/backends/fpga/KD/tensor.hpp index 1e1793faae664ff7ea999b11d2d1cfb16e57390d..065495fd8571691196700cd9da23af282b882240 100644 --- a/lite/backends/fpga/KD/tensor.hpp +++ b/lite/backends/fpga/KD/tensor.hpp @@ -283,7 +283,6 @@ class Tensor { .address = data(), .scale_address = scale(), }; args.output = output; - src->syncToDevice(); size_t aligned_remainder = src->shape().numel() % 16; if (aligned_remainder > 0) { size_t dtype_size = @@ -293,7 +292,6 @@ class Tensor { fpga_flush(dst, aligned_remainder * dtype_size); } src->syncToDevice(); - this->invalidate(); perform_bypass(args); this->invalidate(); } @@ -303,8 +301,7 @@ class Tensor { return; } - size_t memorySize = - shape_->memorySize(CellSize(dataType_)) * mem_scale_factor_; + size_t memorySize = placeHolder_->memorySize(); fpga_flush(placeHolder_->data(), memorySize); } @@ -384,7 +381,6 @@ class Tensor { } void save_file_with_name(std::string path) { - invalidate(); std::ofstream ofs; ofs.open(path); ofs << scale()[0] << " / " << scale()[1] << std::endl; diff --git a/lite/kernels/fpga/CMakeLists.txt b/lite/kernels/fpga/CMakeLists.txt index 34125a0f47d373022f0e5e7828a2b11dc629cd48..0af17ecbe76523b8dcff150863661da93b73d553 100755 --- a/lite/kernels/fpga/CMakeLists.txt +++ b/lite/kernels/fpga/CMakeLists.txt @@ -5,7 +5,7 @@ endif() set(fpga_deps fpga_target_wrapper kernel_fpga) -# 
add_kernel(activation_compute_fpga FPGA basic SRCS activation_compute.cc DEPS ${fpga_deps}) +add_kernel(activation_compute_fpga FPGA basic SRCS activation_compute.cc DEPS ${fpga_deps}) # add_kernel(box_coder_compute_fpga FPGA basic SRCS box_coder_compute.cc DEPS ${fpga_deps}) # add_kernel(concat_compute_fpga FPGA basic SRCS concat_compute.cc DEPS ${fpga_deps}) @@ -25,7 +25,7 @@ add_kernel(norm_compute_fpga FPGA basic SRCS norm_compute.cc DEPS ${fpga_deps}) # add_kernel(im2sequence_compute_fpga FPGA basic SRCS im2sequence_compute.cc DEPS ${fpga_deps}) add_kernel(pooling_compute_fpga FPGA basic SRCS pooling_compute.cc DEPS ${fpga_deps}) add_kernel(prior_box_compute_fpga FPGA basic SRCS prior_box_compute.cc DEPS ${fpga_deps}) -# add_kernel(reshape_compute_fpga FPGA basic SRCS reshape_compute.cc DEPS ${fpga_deps} reshape_op) +add_kernel(reshape_compute_fpga FPGA basic SRCS reshape_compute.cc DEPS ${fpga_deps} reshape_op) # add_kernel(sequence_pool_compute_fpga FPGA basic SRCS sequence_pool_compute.cc DEPS ${fpga_deps}) add_kernel(scale_compute_fpga FPGA basic SRCS scale_compute.cc DEPS ${fpga_deps}) # add_kernel(softmax_compute_fpga FPGA basic SRCS softmax_compute.cc DEPS ${fpga_deps}) diff --git a/lite/kernels/fpga/activation_compute.cc b/lite/kernels/fpga/activation_compute.cc index ecd9af0f8da5df62a15637e88dc4564efb187f6c..f6704204d34c309835c1de0ef61afed97c0b29e3 100644 --- a/lite/kernels/fpga/activation_compute.cc +++ b/lite/kernels/fpga/activation_compute.cc @@ -25,10 +25,10 @@ using float16 = zynqmp::float16; void ReluCompute::PrepareForRun() { auto& param = this->Param(); auto output_data = param.Out->mutable_data(); - zynqmp::InputParam& input_param = pe_.param(); + zynqmp::InputParam& relu_param = pe_.param(); - input_param.input = param.X->ZynqTensor(); - input_param.output = param.Out->ZynqTensor(); + relu_param.input = param.X->ZynqTensor(); + relu_param.output = param.Out->ZynqTensor(); pe_.init(); pe_.apply(); } diff --git 
a/lite/kernels/fpga/elementwise_compute.cc b/lite/kernels/fpga/elementwise_compute.cc index d22cc7abacc2ecd80e54aa5c62a7e57671b920c9..1ef6d19e83ee71a8e81862c9a4837243f18675f7 100755 --- a/lite/kernels/fpga/elementwise_compute.cc +++ b/lite/kernels/fpga/elementwise_compute.cc @@ -80,21 +80,21 @@ void ElementwiseMulCompute::PrepareForRun() { scale_param.activeParam.type = zynqmp::TYPE_NONE; int channel = scale_param.input->shape().channel(); - zynqmp::Tensor* scale = new zynqmp::Tensor(); - zynqmp::Tensor* bias = new zynqmp::Tensor(); - scale_param.scale = scale; - scale_param.bias = bias; + scale_param.scale = &scale_; + scale_param.bias = &bias_; zynqmp::Shape shape(zynqmp::N, {channel}); - float* scale_data = scale->mutableData(zynqmp::FP32, shape); - float* bias_data = bias->mutableData(zynqmp::FP32, shape); + zynqmp::float16* scale_data = + scale_.mutableData(zynqmp::FP16, shape); + zynqmp::float16* bias_data = + bias_.mutableData(zynqmp::FP16, shape); float scale_value = param.Y->data()[0]; - for (int i = 0; i < channel; ++i) { + for (int i = 0; i < channel; i++) { if (param.Y->dims().production() != 1) { scale_value = param.Y->ZynqTensor()->data()[i]; } - scale_data[i] = scale_value; - bias_data[i] = 0; + scale_data[i] = zynqmp::float_to_half(scale_value); + bias_data[i] = zero_; } pe_.init(); @@ -102,6 +102,10 @@ } void ElementwiseMulCompute::Run() { + auto& param = Param(); + param.Y->ZynqTensor()->flush(); + 
scale_.copyFrom(param.Y->ZynqTensor()); + scale_.invalidate(); pe_.dispatch(); #ifdef FPGA_PRINT_TENSOR zynqmp::ScaleParam& scale_param = pe_.param(); diff --git a/lite/kernels/fpga/elementwise_compute.h b/lite/kernels/fpga/elementwise_compute.h index e3e9c52c4c660e9ae6852f2ec8cdd815829ad524..9fa4991161dff6bba6c860838863b1cb38393877 100644 --- a/lite/kernels/fpga/elementwise_compute.h +++ b/lite/kernels/fpga/elementwise_compute.h @@ -61,6 +61,9 @@ class ElementwiseMulCompute private: zynqmp::ScalePE pe_; + zynqmp::Tensor scale_; + zynqmp::Tensor bias_; + zynqmp::float16 zero_ = zynqmp::float_to_half(0.0f); }; } // namespace fpga diff --git a/lite/kernels/fpga/fetch_compute.cc b/lite/kernels/fpga/fetch_compute.cc index 71ec37a64d94bcbef00d7e3c2a187bdb28c47935..18aea40c4e27241e0113f326d9cc98bfccf30d2b 100755 --- a/lite/kernels/fpga/fetch_compute.cc +++ b/lite/kernels/fpga/fetch_compute.cc @@ -55,6 +55,7 @@ void FetchCompute::Run() { #ifdef FPGA_PRINT_TENSOR zynqmp::OutputParam& fetch_param = pe_.param(); Debugger::get_instance().registerOutput("fetch", fetch_param.output); + Debugger::get_instance().setEnable(true); #endif }