diff --git a/lite/backends/fpga/KD/debugger.hpp b/lite/backends/fpga/KD/debugger.hpp
old mode 100644
new mode 100755
index 2b9b23070616baf18f347c6b2af2d87a300d428f..9b1189c407d6d601bb3e5ba8172b1455f04710fd
--- a/lite/backends/fpga/KD/debugger.hpp
+++ b/lite/backends/fpga/KD/debugger.hpp
@@ -32,7 +32,8 @@ class Debugger {
   }
 
   void registerOutput(std::string op_type, zynqmp::Tensor* tensor) {
-    if (op_type != "conv") {  // NOLINT
+    if (op_config[op_type]) {
+      tensor->saveToFile(op_type, true);
     }
   }
 
@@ -40,8 +41,21 @@ class Debugger {
   std::unordered_map<std::string, bool> op_config;
   Debugger() {
     op_config["concat"] = true;
+    op_config["pooling"] = true;
     op_config["conv"] = true;
+    op_config["dwconv"] = true;
+    op_config["ew_add"] = true;
     op_config["crop"] = true;
+    op_config["feed"] = true;
+    op_config["mul"] = true;
+    op_config["fetch"] = true;
+    op_config["boxes"] = true;
+    op_config["scores"] = true;
+    op_config["nms"] = true;
+    op_config["pb_boxes"] = true;
+    op_config["pb_variances"] = true;
+    // op_config["fc"] = true;
+    op_config["softmax"] = true;
   }
 };
@@ -131,9 +145,7 @@ inline void save_tensor(const lite::Tensor* t,
     chw_to_hwc(const_cast<lite::Tensor*>(t), dst);
     data = dst;
   }
-
   save_float(data, name, t->numel());
-
   delete[] dst;
 }
 }  // namespace lite
diff --git a/lite/backends/fpga/KD/llapi/filter.cpp b/lite/backends/fpga/KD/llapi/filter.cpp
old mode 100644
new mode 100755
index 30250969b6fbe6e9e5ce7e9f96f963e8bee89224..da81565cf5ca152a54b6cc1514cb660589428439
--- a/lite/backends/fpga/KD/llapi/filter.cpp
+++ b/lite/backends/fpga/KD/llapi/filter.cpp
@@ -31,7 +31,7 @@ void saveToFile(std::string name, void* data_in, int size) {
   std::ofstream ofs;
   ofs.open(name);
 
-  int8_t* data = static_cast<int8_t*> data_in;
+  int8_t* data = static_cast<int8_t*>(data_in);
   for (int i = 0; i < size; i++) {
     float value = data[i];
     ofs << value << std::endl;
@@ -84,6 +84,14 @@ int calc_num_per_div(int num, int group_num, int division_capacity) {
   }
 }
 
+int calc_pack_num(int num_per_group, int group, int division_capacity) {
+  auto n = 1;
+  while ((num_per_group * (group + n - 1) / n) > division_capacity) {
+    n++;
+  }
+  return (n);
+}
+
 void convert_to_hwc(int8_t* chw_data,
                     int8_t* hwc_data,
                     int num,
@@ -231,10 +239,9 @@ int8_t* format_filter(float* data_in,
 
   for (int n = 0; n < num; n++) {
     float* filter_start = data_in + n * chw;
-    float f_max = find_max(filter_start, chw);
     int8_t* quantized_start = quantized_data + n * chw;
     quantize(filter_start, quantized_start, chw, max);
-    filter_max.push_back(max);
+    filter_max.push_back(1);
   }
 
   int8_t* hwc_data =
@@ -256,6 +263,7 @@ int8_t* format_filter(float* data_in,
   int filter_num_alignment = get_filter_num_alignment();
   int num_per_div_after_alignment =
       align_to_x(num_per_div_before_alignment, filter_num_alignment);
+
   int num_element = div_num * num_per_div_after_alignment * chw_aligned;
   int8_t* num_aligned_data =
       reinterpret_cast<int8_t*>(fpga_malloc(num_element * sizeof(int8_t)));
diff --git a/lite/backends/fpga/KD/llapi/filter.h b/lite/backends/fpga/KD/llapi/filter.h
index 6e056ce0da0d8e731abf7dc418800a8e3d94969a..42d98e74923e116240b145c87b3dc5cfa0210f8d 100644
--- a/lite/backends/fpga/KD/llapi/filter.h
+++ b/lite/backends/fpga/KD/llapi/filter.h
@@ -31,6 +31,7 @@ int calc_division_capacity(int chw);
 int calc_split_num(int num, int division_capacity);
 int calc_division_number(int num, int group_num, int division_capacity);
 int calc_num_per_div(int num, int group_num, int division_capacity);
+int calc_pack_num(int num_per_group, int group, int division_capacity);
 float find_max(float* data_in, int data_size);
 int8_t* format_filter(float* data_in,
@@ -40,11 +41,13 @@ int8_t* format_filter(float* data_in,
                       int height,
                       int width,
                       int group_num,
-                      float max,  // NOLINT
+                      float max,
                       std::vector<float>& filter_max);  // NOLINT
 
 void convert_to_hwn(int16_t** data_in, int num, int height, int width);
 size_t align_element_n(int16_t** data_in, int num, int height, int width);
+// void quantize_to_fp16(float** data_in, int num, int height, int width,
+//                       float* scale_ptr);
 size_t format_dwconv_filter(
     float** data_in, int num, int height, int width, float* scale_ptr);
diff --git a/lite/backends/fpga/KD/llapi/zynqmp_api.cpp b/lite/backends/fpga/KD/llapi/zynqmp_api.cpp
old mode 100755
new mode 100644
index 06488469d97c077a34b3cfdb8a049c8cd61dfc93..bcbf2b98f487aea3c6516fa6369e70d11be97ffc
--- a/lite/backends/fpga/KD/llapi/zynqmp_api.cpp
+++ b/lite/backends/fpga/KD/llapi/zynqmp_api.cpp
@@ -28,7 +28,7 @@ limitations under the License. */
 namespace paddle {
 namespace zynqmp {
 
-#define PADDLE_OS_LINUX
+#define PADDLE_MOBILE_OS_LINUX
 
 static int fd = -1;
 static const char *device_path = "/dev/fpgadrv0";
@@ -38,7 +38,7 @@ static size_t memory_size_max = 0;
 static size_t memory_size = 0;
 
 static inline int do_ioctl(uint64_t req, const void *arg) {
-#ifdef PADDLE_OS_LINUX
+#ifdef PADDLE_MOBILE_OS_LINUX
   return ioctl(fd, req, arg);
 #else
   return -1;
@@ -61,17 +61,33 @@ void reset_device() {
 
 // memory management;
 void *fpga_malloc(size_t size) {
-#ifdef ENABLE_DEBUG
-#endif
-#ifdef PADDLE_OS_LINUX
+#ifdef PADDLE_MOBILE_OS_LINUX
+
  void *ptr = reinterpret_cast<void *>(
      mmap64(NULL, size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0));
-  if (ptr == NULL) {
+  if (ptr == MAP_FAILED) {
    std::cout << "not enough memory !";
    exit(-1);
  }
+  if (errno == ENOMEM) {
+    std::cout << "mmap failed with not enough memory !";
+    exit(-1);
+  }
+  if (errno == EINVAL) {
+    std::cout << "mmap failed with invalid arguments ! (size=" << size << ")"
+              << std::endl;
+    exit(-1);
+  }
+  if (ptr == NULL) {
+    std::cout << "NULL returned, errno=" << errno
+              << ", mmap failed with other errors other than memory usage !"
+              << std::endl;
+    exit(-1);
+  }
+
   memory_map.insert(std::make_pair(ptr, size));
   memory_size += size;
+
   if (memory_size > memory_size_max) {
     memory_size_max = memory_size;
   }
@@ -87,7 +103,7 @@ size_t fpga_get_memory_size_max() { return memory_size_max; }
 
 size_t fpga_diagnose_memory(int detailed) {
   size_t total = 0;
-  auto iter = memory_map.begin();  // std::map<void *, size_t>::iterator
+  auto iter = memory_map.begin();
   while (iter != memory_map.end()) {
     total += iter->second;
     iter++;
@@ -97,13 +113,15 @@ size_t fpga_diagnose_memory(int detailed) {
 
 void fpga_free(void *ptr) {
   size_t size = 0;
-  auto iter = memory_map.find(ptr);  // std::map<void *, size_t>::iterator
+  auto iter = memory_map.find(ptr);
   if (iter != memory_map.end()) {
     size = iter->second;
     memory_map.erase(iter);
   }
+
   memory_size -= size;
-#ifdef PADDLE_OS_LINUX
+
+#ifdef PADDLE_MOBILE_OS_LINUX
   munmap(ptr, size);
 #else
   free(ptr);
@@ -230,6 +248,7 @@ int perform_bypass(const struct BypassArgs &args) {
     ret = do_ioctl(IOCTL_CONFIG_BYPASS, &bypassArgs);
     scale = std::max(scale, scales[0]);
   }
+
   args.output.scale_address[0] = scale;
   args.output.scale_address[1] = 1.0f / scale;
   return ret;
@@ -238,6 +257,26 @@ int perform_bypass(const struct BypassArgs &args) {
 int compute_fpga_concat(const struct ConcatArgs &args) { return -1; }
 
 int compute_fpga_scale(const struct ScaleArgs &args) {
+#ifdef ENABLE_DEBUG
+  std::cout << "======Compute Scale======";
+  std::cout << "scale_address:" << args.scale_address << std::endl;
+  std::cout << "bias_address:" << args.bias_address << std::endl;
+
+  std::cout << "wc_alignment:" << args.wc_alignment << std::endl;
+  std::cout << "channel_alignment:" << args.channel_alignment << std::endl;
+
+  std::cout << "   image_address:" << args.image.address
+            << "   image_scale_address:" << args.image.scale_address
+            << "   image_channels:" << args.image.channels
+            << "   image_height:" << args.image.height
+            << "   image_width:" << args.image.width
+            << "   pad_height:" << args.image.pad_height
+            << "   pad_width:" << args.image.pad_width;
+
+  std::cout << "   out_address:" << args.output.address
+            << "   out_scale_address:" << args.output.scale_address;
+
+#endif
   return do_ioctl(IOCTL_CONFIG_SCALE, &args);
 }
diff --git a/lite/backends/fpga/KD/llapi/zynqmp_api.h b/lite/backends/fpga/KD/llapi/zynqmp_api.h
old mode 100755
new mode 100644
index 9489c24730e52fb778ed341e0ce452b7ef86edf9..55c2fde079a1ca0ec368870e2bb8f727d870a8f3
--- a/lite/backends/fpga/KD/llapi/zynqmp_api.h
+++ b/lite/backends/fpga/KD/llapi/zynqmp_api.h
@@ -28,7 +28,6 @@ namespace zynqmp {
 typedef int16_t half;
 
 #define IMAGE_ALIGNMENT 16           // Aligned to 16
-#define FILTER_NUM_ALIGNMENT 32      // Filter number aligned to 32
 #define FILTER_ELEMENT_ALIGNMENT 16  // Filter element number aligned to 16
 #define BS_NUM_ALIGNMENT 8
 #define BIAS_NUM_ALIGNMENT 16
@@ -44,14 +43,11 @@ enum DLayoutType {
 };
 
 enum ActiveType {
-  TYPE_RELU = 0,
-  TYPE_RELU6 = 1,
-  TYPE_LEAK_RELU = 2,
-  TYPE_SIGMOID = 3,
-};
-
-struct VersionArgs {
-  void* buffer;
+  TYPE_NONE = 0,
+  TYPE_RELU = 1,
+  TYPE_RELU6 = 2,
+  TYPE_LEAKY_RELU = 3,
+  TYPE_SIGMOID = 4,
 };
 
 struct DeviceInfo {
@@ -67,6 +63,11 @@ struct DeviceInfo {
   uint32_t reserved6;
 };
 
+struct VersionArgs {
+  void* buffer;
+  size_t size;
+};
+
 struct MemoryCopyArgs {
   void* src;
   void* dest;
@@ -78,7 +79,9 @@ struct MemoryCacheArgs {
   size_t size;
 };
 
-struct MemoryBarrierArgs {};
+struct MemoryBarrierArgs {
+  uint16_t dummy;
+};
 
 struct BNArgs {
   bool enabled;
diff --git a/lite/backends/fpga/KD/pe_params.hpp b/lite/backends/fpga/KD/pe_params.hpp
index 9dc295a58d4bbfd50a0b9ecbdb06a22c8900cef7..42ec32957e5884aaae3cc96f46060de114b44ead 100644
--- a/lite/backends/fpga/KD/pe_params.hpp
+++ b/lite/backends/fpga/KD/pe_params.hpp
@@ -30,8 +30,13 @@ struct ReLUParam {
   float leaky_relu_factor = 0.0f;
 };
 
+struct ActiveParam {
+  enum ActiveType type = TYPE_NONE;
+  float leaky_relu_factor;
+};
+
 struct PEParam {
-  ReLUParam relu;
+  ActiveParam activeParam;
 };
 
 struct InputParam : PEParam {
@@ -100,24 +105,6 @@ struct DepthwiseConvParam : ConvParam {
   Tensor* quantizedFilter_ = new Tensor();
 };
 
-struct GRUParam : PEParam {
- public:
-  Tensor* input = nullptr;
-  Tensor* h0 = nullptr;
-  Tensor* weight = nullptr;
-  Tensor* bias = nullptr;
-
-  Tensor* batch_gate = nullptr;
-  Tensor* batch_reset_hidden_prev = nullptr;
-  Tensor* batch_hidden = nullptr;
-  Tensor* hidden = nullptr;
-
-  std::string gate_activation = "sigmoid";
-  std::string activation = "tanh";
-  bool is_reverse = false;
-  bool origin_mode = false;
-};
-
 enum PoolingType : int {
   MAX = 0,
   AVERAGE = 1,
@@ -155,7 +142,8 @@ struct ElementwiseAddParam : PEParam {
 
 struct ElementwiseMulParam : PEParam {
  public:
-  std::vector<Tensor*> inputs;
+  Tensor* input_x;
+  Tensor* input_y = nullptr;
   Tensor* output = nullptr;
 };
 
@@ -223,6 +211,17 @@ struct PriorBoxParam : PEParam {
   float offset;
 };
 
+struct YoloBoxParam : PEParam {
+  Tensor* input;
+  Tensor* imgSize;
+  Tensor* outputBoxes;
+  Tensor* outputScores;
+  int downsampleRatio;
+  std::vector<int> anchors;
+  int classNum;
+  float confThresh;
+};
+
 struct ScaleParam : PEParam {
  public:
   Tensor* input = nullptr;
@@ -255,5 +254,24 @@ struct CropParam : PEParam {
   std::vector<int> offsets;
   std::vector<int> shape;
 };
+
+struct GRUParam : PEParam {
+ public:
+  Tensor* input = nullptr;
+  Tensor* h0 = nullptr;
+  Tensor* weight = nullptr;
+  Tensor* bias = nullptr;
+
+  Tensor* batch_gate = nullptr;
+  Tensor* batch_reset_hidden_prev = nullptr;
+  Tensor* batch_hidden = nullptr;
+  Tensor* hidden = nullptr;
+
+  std::string gate_activation = "sigmoid";
+  std::string activation = "tanh";
+  bool is_reverse = false;
+  bool origin_mode = false;
+};
+
 }  // namespace zynqmp
 }  // namespace paddle
diff --git a/lite/backends/fpga/KD/pes/conv_pe.hpp b/lite/backends/fpga/KD/pes/conv_pe.hpp
index fb15eaf77822eed076ec2001bace6871e93587ff..b4eac2c41e138cab19197ccb8ab89681a69ec6fe 100644
--- a/lite/backends/fpga/KD/pes/conv_pe.hpp
+++ b/lite/backends/fpga/KD/pes/conv_pe.hpp
@@ -25,6 +25,7 @@ limitations under the License. */
 #include "lite/backends/fpga/KD/pes/conv_process.hpp"
 #include "lite/backends/fpga/KD/pes/elementwise_add_pe.hpp"
 #include "lite/backends/fpga/KD/pes/scale_pe.hpp"
+#include "lite/backends/fpga/KD/pes/split_pe.hpp"
 
 namespace paddle {
 namespace zynqmp {
@@ -41,6 +42,8 @@ class ConvPE : public PE {
   void apply() {
     split_axis = fill_split_arg(param_);
 
+    split_channel = param_.groups != 1 && param_.splitParams().size() > 1;
+
     if (split_axis == 0 && param_.splitParams().size() > 1) {
       ConcatParam& concat_param = concatPE_.param();
       for (auto conv_param : param_.splitParams()) {
@@ -51,107 +54,28 @@ class ConvPE : public PE {
       concatPE_.apply();
     }
 
+    if (split_channel) {
+      SplitParam& split_param = splitPE_.param();
+      split_param.input = param_.input;
+      for (auto conv_param : param_.splitParams()) {
+        split_param.outputs.push_back(&conv_param->input);
+      }
+      splitPE_.init();
+      splitPE_.apply();
+    }
+
     if (DLEngine::get_instance().isZU3() &&
         param_.input->shape().dimSize() == 4 &&
         param_.input->shape().width() == 1 &&
-        param_.input->shape().width() >= 2048) {
+        param_.input->shape().channel() >= 2048) {
       use_cpu_ = true;
     }
-
-    if (param_.filter->shape().width() == 1 &&
-        param_.filter->shape().height() == 1) {  // NOLINT
-    }
-    if (!use_cpu_) {  // NOLINT
+    if (!use_cpu_) {
+      // param_.filter->releaseData();
     }
-  }
-
-  void cpu_conv_hwc() {
-    Tensor* input = param_.input;
-    Tensor* output = param_.output;
-    input->syncToCPU();
-
-    Tensor float_input;
-    Tensor float_output;
-    float* image_addr = float_input.mutableData<float>(FP32, input->shape());
-    float_input.copyFrom(input);
-    float_input.syncToCPU();
-    float* out = float_output.mutableData<float>(FP32, output->shape());
-
-    int out_width = output->shape().width();
-    int out_channel = output->shape().channel();
-    int in_channel = input->shape().channel();
-
-    float* filter_data = param_.filter->data<float>();
-    int image_height = input->shape().height();
-    int image_width = input->shape().width();
-    int image_channels = input->shape().channel();
-    int image_pad_h = param_.paddings[0];
-    int image_pad_w = param_.paddings[1];
-    int kernel_height = param_.filter->shape().height();
-    int kernel_width = param_.filter->shape().width();
-    int kernel_step_h = param_.strides[0];
-    int kernel_step_w = param_.strides[1];
-    int pooled_height_ = output->shape().height();
-    int pooled_width_ = out_width;
-    int filter_chw = image_channels * kernel_height * kernel_width;
-
-    float max = 0;
-
-    for (int ph = 0; ph < pooled_height_; ph++) {
-      for (int pw = 0; pw < pooled_width_; pw++) {
-        int hstart = ph * kernel_step_h - image_pad_h;
-        int wstart = pw * kernel_step_w - image_pad_w;
-        int hend =
-            std::min(hstart + kernel_height, static_cast<int>(image_height));
-        int wend =
-            std::min(wstart + kernel_width, static_cast<int>(image_width));
-        hstart = std::max(hstart, static_cast<int>(0));
-        wstart = std::max(wstart, static_cast<int>(0));
-        for (int oc = 0; oc < out_channel; oc++) {
-          float sum = 0.0f;
-          const int pool_index = (ph * pooled_width_ + pw) * out_channel + oc;
-          for (int c = 0; c < image_channels; c++) {
-            for (int h = hstart; h < hend; h++) {
-              int hi = 0;
-              if (ph == 0) {
-                hi = h - hstart + image_pad_h;
-              } else {
-                hi = h - hstart;
-              }
-              for (int w = wstart; w < wend; w++) {
-                int wi = 0;
-                if (pw == 0) {
-                  wi = w - wstart + image_pad_w;
-                } else {
-                  wi = w - wstart;
-                }
-                const int index = (h * image_width + w) * image_channels + c;
-                int weight_index = oc * filter_chw +
-                                   kernel_width * kernel_height * c +
-                                   kernel_width * hi + wi;
-                float value = image_addr[index] * filter_data[weight_index];
-                sum += value;
-              }
-            }
-          }
-
-          if (param_.relu.enabled && sum < 0) {
-            sum = -sum;
-          }
-          if (sum > max) {
-            max = sum;
-          }
-          out[pool_index] = sum;
-        }
-      }
-    }
-    float_output.flush();
-    output->copyFrom(&float_output);
-    output->scale()[0] = max / 127;
-    output->scale()[1] = 127 / max;
+    // exit(-1);
   }
-
   void cpu_compute() {
     Tensor* input = param_.input;
     Tensor* output = param_.output;
@@ -161,93 +85,98 @@ class ConvPE : public PE {
     Tensor float_output;
     float* image_addr = float_input.mutableData<float>(FP32, input->shape());
     float_input.copyFrom(input);
-    float_input.syncToCPU();
-
+    // float16* data_out = output->data<float16>();
     float* out = float_output.mutableData<float>(FP32, output->shape());
-    float* bias_data = param_.bias()->data<float>();
-
-    int out_width = output->shape().width();
     int out_channel = output->shape().channel();
     int in_channel = input->shape().channel();
 
     float* filter_data = param_.filter->data<float>();
     float* mi = new float[in_channel];
-    float max = 0;
-
-    int out_index = 0;
     for (int i = 0; i < out_channel; i++) {
       float* image = image_addr;
       float* filter_ptr = filter_data + i * in_channel;
       float* out_ptr = mi;
+#pragma omp parallel for
+      for (int j = 0; j < in_channel; j++) {
+        // float32x4_t x0 = vld1q_f32(image);
+        // float32x4_t x1 = vld1q_f32(filter_ptr);
+
+        // float32x4_t r = vmulq_f32(x0, x1);
+
+        // vst1q_f32(out_ptr, r);
+        // image += 4;
+        // filter_ptr += 4;
+        // out_ptr += 4;
+        float value = image_addr[j] * filter_ptr[j];
+        mi[j] = value;
+      }
 
-      for (int h = 0; h < output->shape().height(); h++) {
-        for (int w = 0; w < output->shape().width(); w++) {
-          float sum = 0;
-
-          // #pragma omp parallel for
-          for (int j = 0; j < in_channel; j++) {
-            int image_index = h * out_width * in_channel + w * in_channel + j;
-            float value = image_addr[image_index] * filter_ptr[j];
-            sum += value;
-          }
-
-          sum += bias_data[i];
-
-          if (param_.relu.enabled && sum < 0) {
-            sum = 0;
-          }
-          if (sum > max) {
-            max = sum;
-          }
-          out_index = h * out_width * out_channel + w * out_channel + i;
-          out[out_index] = sum;
-        }
+      float sum = 0;
+      for (int j = 0; j < in_channel; j++) {
+        sum += mi[j];
       }
+      out[i] = sum;
     }
 
     delete[] mi;
     float_output.flush();
     output->copyFrom(&float_output);
-    output->scale()[0] = max / 127;
-    output->scale()[1] = 127 / max;
   }
 
   bool dispatch() {
+    fpga_reset();
     if (use_cpu_) {
      cpu_compute();
      return true;
    }
 
-    inplace_.leaky_relu_enable =
-        (param_.relu.leaky_relu_factor != 0) ? true : false;
-    inplace_.relu_enable =
-        inplace_.leaky_relu_enable ? false : param_.relu.enabled;
+    if (param_.activeParam.type == TYPE_RELU) {
+      inplace_.relu_enable = true;
+    } else if (param_.activeParam.type == TYPE_RELU6) {
+      inplace_.relu6_enable = true;
+    } else if (param_.activeParam.type == TYPE_SIGMOID) {
+      inplace_.sigmoid_enable = true;
+    } else if (param_.activeParam.type == TYPE_LEAKY_RELU) {
+      inplace_.leaky_relu_enable = true;
+    }
 
-    inplace_.power_enable = false;
-    inplace_.normalize_enable = false;
-    if (inplace_.relu_enable || inplace_.leaky_relu_enable) {
+    if (inplace_.relu_enable || inplace_.leaky_relu_enable ||
+        inplace_.relu6_enable || inplace_.sigmoid_enable) {
       config_inplace(inplace_);
       if (inplace_.leaky_relu_enable) {
-        activeParamterArgs.type = TYPE_LEAK_RELU;
+        activeParamterArgs.type = TYPE_LEAKY_RELU;
         activeParamterArgs.leaky_relu_factor =
-            fp32_2_fp16(param_.relu.leaky_relu_factor);
+            fp32_2_fp16(param_.activeParam.leaky_relu_factor);
         config_activation(activeParamterArgs);
       }
     }
 
     std::vector<BasicConvParam*>& params = param_.splitParams();
+
+    if (split_channel) {
+      // splitPE_.param().input->saveToFile("input_image",true);
+      splitPE_.dispatch();
+    }
+
     int ret = 0;
     for (auto conv_param : params) {
+      // conv_param->input.printScale();
+      // if (split_channel) {
+      //   conv_param->input.saveToFile("pack_image",true);
+      // }
       ret |= compute_fpga_conv_basic(conv_param->args);
     }
 
-    if (inplace_.relu_enable || inplace_.leaky_relu_enable) {
+    if (inplace_.relu_enable || inplace_.leaky_relu_enable ||
+        inplace_.relu6_enable || inplace_.sigmoid_enable) {
       inplace_.relu_enable = false;
       inplace_.leaky_relu_enable = false;
+      inplace_.relu6_enable = false;
+      inplace_.sigmoid_enable = false;
       config_inplace(inplace_);
       if (inplace_.leaky_relu_enable) {
-        activeParamterArgs.type = TYPE_LEAK_RELU;
+        activeParamterArgs.type = TYPE_LEAKY_RELU;
         activeParamterArgs.leaky_relu_factor = fp32_2_fp16(0);
         config_activation(activeParamterArgs);
       }
@@ -255,16 +184,29 @@ class ConvPE : public PE {
     size_t size = params.size();
     if (split_axis == 0 && ret == 0 && size > 1) {
+      // std::cout << "concat size:" << size << std::endl;
       concatPE_.dispatch();
     }
     if (split_axis == 1 && ret == 0 && size > 1) {
+      // for (int n = 0; n < size - 1; n++) {
       ElementwiseAddParam& add_param = addPE_.param();
       add_param.inputs = {&params[0]->output, &params[1]->output};
       add_param.output = param_.output;
       addPE_.init();
       addPE_.apply();
       addPE_.dispatch();
+
+      // param_.output->printScale();
+
+      // params[0]->input.saveToFile("conv_1.txt");
+      // params[1]->input.saveToFile("conv_2.txt");
+
+      // params[0]->output.saveToFile("ew_o1.txt");
+      // params[1]->output.saveToFile("ew_o2.txt");
+      // std::cout << "\n ================== EW ================== \n";
+      // }
     }
+
     return ret == 0;
   }
 
@@ -272,8 +214,10 @@ class ConvPE : public PE {
 
  private:
   bool use_cpu_ = false;
+  bool split_channel = false;
   ConvParam param_;
   ConcatPE concatPE_;
+  SplitPE splitPE_;
   ElementwiseAddPE addPE_;
   int split_axis = 0;
   InplaceArgs inplace_ = {0};
diff --git a/lite/backends/fpga/KD/pes/conv_process.hpp b/lite/backends/fpga/KD/pes/conv_process.hpp
index ecee45569c8df3d3e3926b2ca78cb49da8415aa4..cea22e0edc647b3bf4f0ac15e43121b5d8926154 100755
--- a/lite/backends/fpga/KD/pes/conv_process.hpp
+++ b/lite/backends/fpga/KD/pes/conv_process.hpp
@@ -53,6 +53,16 @@ inline int get_split_num(Tensor* filter) {
   return filter::calc_split_num(aligned_num, div_capacity);
 }
 
+inline int get_pack_num(Tensor* filter, int group_num) {
+  auto chw = filter->shape().channel() * filter->shape().height() *
+             filter->shape().width();
+  auto num = filter->shape().num();
+  int div_capacity = filter::calc_division_capacity(chw);
+  int filter_num_alignment = filter::get_filter_num_alignment();
+  int aligned_num_per_group = align_to_x(num / group_num, filter_num_alignment);
+  return filter::calc_pack_num(aligned_num_per_group, group_num, div_capacity);
+}
+
 inline void fill_scale_bias_const(ConvParam* param_) {
   int channel = param_->output->shape().channel();
   Shape sb_shape(N, {channel});
@@ -117,6 +127,50 @@ inline void combine_add_bn_params(BatchnormParam* bn,
   param_->bias()->setDataLocation(CPU);
 }
 
+inline int gcd_(int a, int b) {
+  while (b) {
+    int temp = a;
+    a = b;
+    b = temp % b;
+  }
+  return a;
+}
+
+inline int lcm_(int a, int b) { return a * b / gcd_(a, b); }
+
+inline void format_bias_scale_new(Tensor* bias,
+                                  Tensor* scale,
+                                  Tensor* scale_bias) {
+  Shape& bias_shape = bias->shape();
+  int channel = bias_shape.channel();
+  int repeat = 1;
+  int alignment = 16;
+  int length = channel;
+
+  if (channel % alignment != 0 || channel < alignment) {
+    int c_lcm = lcm_(channel, alignment);
+    repeat = c_lcm / (channel);
+  }
+  Shape shape(N, {2 * channel * repeat});
+  float16* scale_bias_data = scale_bias->mutableData<float16>(FP16, shape);
+
+  float* bias_data_float = bias->data<float>();
+  float* scale_data_float = scale->data<float>();
+
+  for (int i = 0; i < repeat; i++) {
+    for (int j = 0; j < length; j++) {
+      float16 value_bias = float_to_half(bias_data_float[j]);
+      scale_bias_data[i * length + j] = value_bias;
+    }
+  }
+  for (int i = 0; i < repeat; i++) {
+    for (int j = 0; j < length; j++) {
+      float16 value_scale = float_to_half(scale_data_float[j]);
+      scale_bias_data[i * length + j + length * repeat] = value_scale;
+    }
+  }
+}
+
 inline void format_scale_bias(Tensor* scale,
                               Tensor* bias,
                               Tensor* filter,
@@ -180,8 +234,10 @@ inline void format_scale_bias(Tensor* scale,
 inline void format_filter(Tensor* filter,
                           Tensor* quantized_filter,
                           int group,
-                          std::vector<float>& scales) {  // NOLINT
+                          std::vector<float>& scales,  // NOLINT
+                          float max) {
   float max_value = find_max(*filter);
+  // max_value = max; //TODO: global quantization for filter
 
   Shape& filter_shape = filter->shape();
   int mem_size;
@@ -206,10 +262,22 @@ inline void format_filter(Tensor* filter,
   memcpy(src, quantized_data, mem_size);
   quantized_filter->flush();
-
-  for (size_t i = 0; i < max_values.size(); i++) {
-    scales.push_back(max_values[i] / max_value);
-  }
+  fpga_free(quantized_data);
+
+  // for (size_t i = 0; i < max_values.size(); i++) {
+  //   // scales.push_back(max_values[i] / max_value);
+  //   scales.push_back(1.0f);
+  // }
+
+  // filter->saveToFile("filter.txt");
+  // std::ofstream ofs;
+  // ofs.open("quant.txt");
+  // for (int i = 0; i < mem_size; i++) {
+  //   float value = quantized_data[i];
+  //   ofs << value << std::endl;
+  // }
+  // ofs.close();
+  // exit(-1);
 }
 
 inline void format_dw_filter(Tensor* filter,
@@ -256,17 +324,10 @@ inline void split_filter_num(const ConvParam& c_param) {
   Tensor* out = param.output;
   Tensor* filter = param.filter;
   auto channel = out->shape().channel();
-  int split_num = param.groups == 1 ? get_split_num(param.filter) : 1;
+  int split_num = get_split_num(param.filter);
   int filter_num_per_div = get_filter_num_per_div(filter, param.groups);
-  auto chw = filter->shape().channel() * filter->shape().height() *
-             filter->shape().width();
-  auto num = filter->shape().num();
-  int div_capacity = filter::calc_division_capacity(chw);
-  int filter_num_alignment = filter::get_filter_num_alignment();
-  int aligned_num =
-      align_to_x(num / param.groups, filter_num_alignment) * param.groups;
-  split_num = filter::calc_split_num(aligned_num, div_capacity);
+  float max = find_max(*filter);
 
   Shape& out_shape = out->shape();
   for (int i = 0; i < split_num; i++) {
@@ -310,45 +371,29 @@ inline void split_filter_num(const ConvParam& c_param) {
     new_filter.flush();
 
     conv_param->filter.mutableData<float>(FP32, f_shape);
-    if (param.groups != 1) {
-      int mem_factor =
-          32 / filter_num_per_div;  // TODO(chonwhite): change 32 to param;
-      conv_param->filter.setMemScale(mem_factor);
-    }
-
-    std::vector<float> v;  // TODO(chonwhite): change local variable name
-    format_filter(&new_filter, &(conv_param->filter), param.groups, v);
+    std::vector<float> v;  // TODO(chonwhite) change variable name;
+    format_filter(&new_filter, &(conv_param->filter), param.groups, v, max);
     conv_param->filter.setDataType(INT8);
 
-    int sb_num = 2 * align_to_x(filter_num, BS_NUM_ALIGNMENT);
     Tensor scale;
     Tensor bias;
 
     int chnnnel_start = i * filter_num_per_div;
-    Shape s_shape(N, {filter_num});
+    Shape s_shape(NC, {1, filter_num});
     float* scale_data = scale.mutableData<float>(FP32, s_shape);
     float* bias_data = bias.mutableData<float>(FP32, s_shape);
     for (int n = 0; n < filter_num; n++) {
-      scale_data[n] = param.scale()->data<float>()[n + chnnnel_start] * v[n];
+      scale_data[n] = param.scale()->data<float>()[n + chnnnel_start];
     }
     for (int n = 0; n < filter_num; n++) {
       bias_data[n] = param.bias()->data<float>()[n + chnnnel_start];
     }
-    Shape sb_shape(N, {sb_num});
-    format_scale_bias(&scale,
-                      &bias,
-                      &conv_param->filter,
-                      &conv_param->scaleBias,
-                      param.groups);
-
+    format_bias_scale_new(&bias, &scale, &conv_param->scaleBias);
     conv_param->scaleBias.flush();
-    float* bs_data = conv_param->scaleBias.data<float>();
 
     args.group_num = param.groups;
-    args.relu_enabled = param.relu.enabled;
-    args.sb_address = conv_param->scaleBias.data<float>();
-    args.sb_address = bs_data;
+    args.sb_address = conv_param->scaleBias.data<float16>();
     args.kernel.stride_h = param.strides[1];
     args.kernel.stride_w = param.strides[0];
     args.kernel.height = new_filter.shape().height();
@@ -372,6 +417,135 @@ inline void split_filter_num(const ConvParam& c_param) {
   }
 }
 
+inline void pack_channel_filter(const ConvParam& c_param) {
+  ConvParam& param = const_cast<ConvParam&>(c_param);
+  Tensor* input = param.input;
+  Tensor* out = param.output;
+  Tensor* filter = param.filter;
+  int filter_num_alignment = filter::get_filter_num_alignment();
+  auto filter_num = filter->shape().num();
+  int pack_num = get_pack_num(param.filter, param.groups);
+  int group_per_pack = (param.groups + pack_num - 1) / pack_num;
+  int filter_per_group = filter_num / param.groups;
+  int filter_per_pack = filter_per_group * group_per_pack;
+  int channel_per_pack = filter->shape().channel() * group_per_pack;
+
+  float max = find_max(*filter);
+
+  Shape& out_shape = out->shape();
+
+  for (int i = 0; i < pack_num; i++) {
+    BasicConvParam* conv_param = new BasicConvParam();
+
+    conv_param->output.setDataLocation(Device);
+    conv_param->output.setAligned(true);
+
+    float16* out_address = nullptr;
+    float* out_scale_address = nullptr;
+
+    float16* input_address = nullptr;
+
+    ConvArgs& args = conv_param->args;
+
+    if (pack_num == 1) {
+      out_address = out->data<float16>();
+      out_scale_address = out->scale();
+    }
+
+    int new_group = param.groups;
+    int filter_current_pack = filter->shape().num();
+    int channel_current_pack = input->shape().channel();
+
+    new_group = i == pack_num - 1
+                    ? param.groups - (pack_num - 1) * group_per_pack
+                    : group_per_pack;
+    filter_current_pack = new_group * filter_per_group;
+    channel_current_pack = new_group * filter->shape().channel();
+
+    if (pack_num == 1) {
+      input_address = input->data<float16>();
+    } else {
+      Shape in_shape(NCHW,
+                     {1,
+                      channel_current_pack,
+                      input->shape().height(),
+                      input->shape().width()});
+      input_address = conv_param->input.mutableData<float16>(FP16, in_shape);
+    }
+
+    if (pack_num != 1) {
+      Shape shape(
+          NHWC,
+          {1, out_shape.height(), out_shape.width(), filter_current_pack});
+      out_address = conv_param->output.mutableData<float16>(FP16, shape);
+      out_scale_address = conv_param->output.scale();
+    }
+    Shape f_shape(NCHW,
+                  {filter_current_pack,
+                   filter->shape().channel(),
+                   filter->shape().height(),
+                   filter->shape().width()});
+
+    Tensor new_filter;
+    float* new_filter_data = new_filter.mutableData<float>(FP32, f_shape);
+    int filter_hwc = filter->shape().height() * filter->shape().width() *
+                     filter->shape().channel();
+
+    memcpy(new_filter_data,
+           filter->data<float>() + i * filter_per_pack * filter_hwc,
+           filter_current_pack * filter_hwc * sizeof(float));
+    new_filter.flush();
+    conv_param->filter.mutableData<float>(FP32, f_shape);
+
+    float mem_factor = filter_num_alignment / filter_per_pack;
+    conv_param->filter.setMemScale(mem_factor);
+
+    std::vector<float> v;  // TODO(chonwhite) change variable name
+    format_filter(&new_filter, &(conv_param->filter), new_group, v, max);
+    conv_param->filter.setDataType(INT8);
+
+    Tensor scale;
+    Tensor bias;
+
+    int chnnnel_start = i * filter_per_pack;
+
+    Shape s_shape(NC, {1, filter_current_pack});
+    float* scale_data = scale.mutableData<float>(FP32, s_shape);
+    float* bias_data = bias.mutableData<float>(FP32, s_shape);
+    for (int n = 0; n < filter_current_pack; n++) {
+      scale_data[n] = param.scale()->data<float>()[n + chnnnel_start];
+    }
+    for (int n = 0; n < filter_current_pack; n++) {
+      bias_data[n] = param.bias()->data<float>()[n + chnnnel_start];
+    }
+    format_bias_scale_new(&bias, &scale, &conv_param->scaleBias);
+    conv_param->scaleBias.flush();
+
+    args.group_num = new_group;
+    args.sb_address = conv_param->scaleBias.data<float16>();
+    args.kernel.stride_h = param.strides[1];
+    args.kernel.stride_w = param.strides[0];
+    args.kernel.height = new_filter.shape().height();
+    args.kernel.width = new_filter.shape().width();
+
+    args.filter_address = conv_param->filter.data<int8_t>();
+    args.filter_num = filter_current_pack;
+    args.filter_scale_address = conv_param->filter.scale();
+    args.image.address = input_address;
+    args.image.scale_address = input->scale();
+    args.image.channels = channel_current_pack;
+    args.image.width = input->shape().width();
+    args.image.height = input->shape().height();
+    args.image.pad_width = param.paddings[1];
+    args.image.pad_height = param.paddings[0];
+    args.dilation = param.dilations[0];
+
+    args.output.address = out_address;
+    args.output.scale_address = out_scale_address;
+    param.splitParams().push_back(conv_param);
+  }
+}
+
 inline void split_channel(const ConvParam& c_param) {
   ConvParam& param = const_cast<ConvParam&>(c_param);
   Tensor* input = param.input;
@@ -383,6 +557,8 @@ inline void split_channel(const ConvParam& c_param) {
 
   Shape bs_shape(N, {channel});
 
+  float max = 1.0f;
+
   for (int i = 0; i < num; i++) {
     BasicConvParam* conv_param = new BasicConvParam();
@@ -406,7 +582,8 @@ inline void split_channel(const ConvParam& c_param) {
     }
     new_filter.flush();
     std::vector<float> scales;
-    format_filter(&new_filter, &(conv_param->filter), param.groups, scales);
+    format_filter(
+        &new_filter, &(conv_param->filter), param.groups, scales, max);
 
     Tensor bias;
     Tensor scale;
@@ -428,7 +605,6 @@ inline void split_channel(const ConvParam& c_param) {
 
     ConvArgs& args = conv_param->args;
     args.group_num = param.groups;
-    args.relu_enabled = param.relu.enabled;
     args.sb_address = conv_param->scaleBias.data<float>();
     args.kernel.stride_h = param.strides[1];
     args.kernel.stride_w = param.strides[0];
@@ -457,13 +633,17 @@ inline int fill_split_arg(const ConvParam& c_param) {
   ConvParam& param = const_cast<ConvParam&>(c_param);
   Tensor* input = param.input;
   Tensor* output = param.output;
+
   if (output->shape().dimSize() == 4 && input->shape().channel() > 2047 &&
       input->shape().width() == 1) {
     split_channel(c_param);
     return 1;
-  } else {
+  } else if (param.groups == 1) {
     split_filter_num(c_param);
     return 0;
+  } else {
+    pack_channel_filter(c_param);
+    return 0;
   }
 }
diff --git a/lite/backends/fpga/KD/pes/depthwise_conv_pe.hpp b/lite/backends/fpga/KD/pes/depthwise_conv_pe.hpp
index 0efca2ec2e60e8973d92f41463b0444722f2a73b..9958990af6eb237d2122a63e1b7ed947ca329d31 100755
--- a/lite/backends/fpga/KD/pes/depthwise_conv_pe.hpp
+++ b/lite/backends/fpga/KD/pes/depthwise_conv_pe.hpp
@@ -24,6 +24,17 @@ namespace zynqmp {
 
 class DepthwiseConvPE : public PE {
  public:
+  inline int gcd_(int a, int b) {
+    while (b) {
+      int temp = a;
+      a = b;
+      b = temp % b;
+    }
+    return a;
+  }
+
+  inline int lcm_(int a, int b) { return a * b / gcd_(a, b); }
+
   bool init() {
     Tensor* output = param_.output;
     output->setAligned(true);
@@ -37,17 +48,40 @@ class DepthwiseConvPE : public PE {
     Tensor* output = param.output;
     int channel = output->shape().channel();
 
-    float16* b_data = bias_.mutableData<float16>(FP16, param_.bias()->shape());
+    int repeat = 1;
+    int alignment = 16;
+    int length = channel;
+
+    if (channel % alignment != 0 || channel < alignment) {
+      int c_lcm = lcm_(channel, alignment);
+      repeat = c_lcm / (channel);
+    }
+    Shape shape(N, {channel * repeat});
+
+    float16* b_data = bias_.mutableData<float16>(FP16, shape);
     if (param_.bias()->dataType() == FP32) {
       float* new_bias_data = param_.bias()->data<float>();
       // convert bias from float to float16
-      for (int i = 0; i < channel; i++) {
-        b_data[i] = float_to_half(new_bias_data[i]);
+      // for (int i = 0; i < channel; i++) {
+      //   b_data[i] = float_to_half(new_bias_data[i]);
+      // }
+      // pad bias hw, aligned to 16
+      for (int i = 0; i < repeat; i++) {
+        for (int j = 0; j < length; j++) {
+          float16 value = float_to_half(new_bias_data[j]);
+          b_data[i * length + j] = value;
+        }
       }
       bias_.flush();
     } else {
       float16* new_bias_data = param_.bias()->data<float16>();
-      memcpy(b_data, new_bias_data, channel * sizeof(float16));
+      // memcpy(b_data, new_bias_data, channel * sizeof(float16));
+      for (int i = 0; i < repeat; i++) {
+        for (int j = 0; j < length; j++) {
+          // float16 value = float_to_half(bias_data_float[j]);
+          b_data[i * length + j] = new_bias_data[j];
+        }
+      }
       bias_.flush();
     }
 
@@ -62,6 +96,8 @@ class DepthwiseConvPE : public PE {
       float16* scale_data = param_.scale()->data<float16>();
       float16* filter_data = param.quantizedFilter()->mutableData<float16>(
           FP16, param.filter->shape());
+
+      // memcpy(filter_data, scale_data, channel * sizeof(float16));
       memcpy(filter_data,
              scale_data,
              param.filter->shape().numel() * sizeof(float16));
@@ -89,20 +125,33 @@ class DepthwiseConvPE : public PE {
     args.sub_conv_num = 1;
     param.args = args;
 
-    inplace_.relu_enable = param_.relu.enabled;
     inplace_.power_enable = false;
     inplace_.normalize_enable = false;
   }
 
   bool dispatch() {
     param_.input->syncToDevice();
-    if (param_.relu.enabled) {
-      inplace_.relu_enable = param_.relu.enabled;
+    if (param_.activeParam.type == TYPE_RELU) {
+      inplace_.relu_enable = true;
+    } else if (param_.activeParam.type == TYPE_RELU6) {
+      inplace_.relu6_enable = true;
+    } else if (param_.activeParam.type == TYPE_SIGMOID) {
+      inplace_.sigmoid_enable = true;
+    } else if (param_.activeParam.type == TYPE_LEAKY_RELU) {
+      inplace_.leaky_relu_enable = true;
+    }
+
+    if (inplace_.relu_enable || inplace_.leaky_relu_enable ||
+        inplace_.relu6_enable || inplace_.sigmoid_enable) {
       config_inplace(inplace_);
     }
 
     bool ret = compute_fpga_dwconv(param_.args) == 0;
-    if (param_.relu.enabled) {
+    if (inplace_.relu_enable || inplace_.leaky_relu_enable ||
+        inplace_.relu6_enable || inplace_.sigmoid_enable) {
       inplace_.relu_enable = false;
+      inplace_.leaky_relu_enable = false;
+      inplace_.relu6_enable = false;
+      inplace_.sigmoid_enable = false;
       config_inplace(inplace_);
     }
     return ret;
diff --git a/lite/backends/fpga/KD/pes/elementwise_add_pe.hpp b/lite/backends/fpga/KD/pes/elementwise_add_pe.hpp
index a498a2bde9a3656cf8b7006b867eec088d87b425..6f76ae3d4a1d9d054339d929515f24989f1c15b0 100755
--- a/lite/backends/fpga/KD/pes/elementwise_add_pe.hpp
+++ b/lite/backends/fpga/KD/pes/elementwise_add_pe.hpp
@@ -58,15 +58,29 @@ class ElementwiseAddPE : public PE {
   bool dispatch() {
     param_.inputs[0]->syncToDevice();
     param_.inputs[1]->syncToDevice();
-    InplaceArgs inplace_args = {0};
-    if (param_.relu.enabled) {
-      inplace_args.relu_enable = true;
-      config_inplace(inplace_args);
+    // InplaceArgs inplace_ = {0};
+
+    if (param_.activeParam.type == TYPE_RELU) {
+      inplace_.relu_enable = true;
+    } else if (param_.activeParam.type == TYPE_RELU6) {
+      inplace_.relu6_enable = true;
+    } else if (param_.activeParam.type == TYPE_SIGMOID) {
+      inplace_.sigmoid_enable = true;
+    } else if (param_.activeParam.type == TYPE_LEAKY_RELU) {
+      inplace_.leaky_relu_enable = true;
+    }
+    if (inplace_.relu_enable || inplace_.leaky_relu_enable ||
+        inplace_.relu6_enable || inplace_.sigmoid_enable) {
+      config_inplace(inplace_);
     }
     compute_fpga_ewadd(param_.ewargs);
-    if (param_.relu.enabled) {
-      inplace_args.relu_enable = false;
-      config_inplace(inplace_args);
+    if (inplace_.relu_enable || inplace_.leaky_relu_enable ||
+        inplace_.relu6_enable || inplace_.sigmoid_enable) {
+      inplace_.relu_enable = false;
+      inplace_.relu6_enable = false;
+      inplace_.sigmoid_enable = false;
+      inplace_.leaky_relu_enable = false;
+      config_inplace(inplace_);
     }
     return true;
   }
@@ -75,6 +89,7 @@ class ElementwiseAddPE : public PE {
 
  private:
   ElementwiseAddParam param_;
+  InplaceArgs inplace_ = {0};
 };
 
 }  // namespace zynqmp
diff --git a/lite/backends/fpga/KD/pes/elementwise_mul_pe.hpp b/lite/backends/fpga/KD/pes/elementwise_mul_pe.hpp
index 0505e78b61e3b0130c876880894cec29c78406f2..7730c598b0e8745e47df7d5c456e2b5420fbe6c0 100644
--- a/lite/backends/fpga/KD/pes/elementwise_mul_pe.hpp
+++ b/lite/backends/fpga/KD/pes/elementwise_mul_pe.hpp
@@ -29,17 +29,17 @@ class ElementwiseMulPE : public PE {
   }
 
   void apply() {
-    Tensor* input = param_.inputs[0];
+    Tensor* input = param_.input_x;
     Tensor* output = param_.output;
-    int wc_aligned = align_to_x(param_.inputs[0]->shape().numel(), 32);
+    int wc_aligned = align_to_x(param_.input_x->shape().numel(), 32);
 
     Shape s(N, {wc_aligned});
     float16* bias_data = bias_tensor.mutableData<float16>(FP16, s);
     memset(bias_data, 0, wc_aligned * sizeof(float16));
 
     ScaleArgs& args = args_;
-    args.scale_address = param_.inputs[1]->data<void>();
+    args.scale_address = param_.input_y->data<void>();
     args.bias_address = bias_tensor.data<void>();
     args.wc_alignment = wc_aligned;
     args.channel_alignment = wc_aligned;
diff --git a/lite/backends/fpga/KD/pes/fully_connected_pe.hpp b/lite/backends/fpga/KD/pes/fully_connected_pe.hpp
old mode 100755
new mode 100644
index db3e05276171607da4cea421dd554846a00314a6..a2b184e383aa600b1279197a115c58309e204a95
--- a/lite/backends/fpga/KD/pes/fully_connected_pe.hpp
+++ b/lite/backends/fpga/KD/pes/fully_connected_pe.hpp
@@ -37,10 +37,9 @@ class FullyConnectedPE : public PE {
     ConvParam& convParam_ = convPE_.param();
     Tensor* input = param_.input;
     convParam_.input = param_.input;
-    num_ = param_.input->shape().num();
-
     convParam_.output = param_.output;
-
+    // convParam_.relu = param_.relu;
+    convParam_.activeParam.type = param_.activeParam.type;
     convParam_.groups = 1;
     convParam_.strides = {1, 1};
     convParam_.paddings = {0, 0};
@@ -49,6 +48,9 @@ class FullyConnectedPE : public PE {
     int num = param_.filter->shape().channel();
     int chw = param_.filter->shape().num();
+    // if (num == 2) {
+    //   return;
+    // }
 
     int height = param_.input->shape().height();
     int width = param_.input->shape().width();
@@ -66,6 +68,7 @@ class FullyConnectedPE : public PE {
         new_filter_data[i * chw + j] = scale;
       }
     }
+
     conv_filter->flush();
     convParam_.filter = conv_filter;
@@ -84,15 +87,51 @@ class FullyConnectedPE : public PE {
     convPE_.apply();
   }
 
-  bool dispatch() { return convPE_.dispatch(); }
+  void cpu_compute() {
+    int num = param_.filter->shape().channel();
+    int chw = param_.filter->shape().num();
+
+    float* filter_data = param_.filter->data<float>();
+    float max = 0.0f;
+    Tensor* input = param_.input;
+    Tensor* output = param_.output;
+    float16* input_data = input->data<float16>();
+    float16* output_data = output->data<float16>();
+
+    for (int i = 0; i < num; i++) {
+      float sum = 0;
+      float bias = param_.bias->data<float>()[i];
+      for (int j = 0; j < chw; j++) {
+        float scale = filter_data[j * num + i];
+        float data = half_to_float(input_data[j]);
+        sum += scale * data;
+      }
+      output_data[i] = float_to_half(sum + bias);
+      if (max < output_data[i]) {
+        max = output_data[i];
+      }
+    }
+
+    output->flush();
+    output->scale()[0] = max / 127.0f;
+    output->scale()[1] = 127.0f / max;
+  }
+
+  bool dispatch() {
+    // int num = param_.filter->shape().channel();
+    // if (num == 2) {
+    //   cpu_compute();
+    //   return 1;
+    // } else {
+    return convPE_.dispatch();
+    // }
+  }
 
   FullyConnectedParam& param() { return param_; }
 
  private:
   FullyConnectedParam param_;
   ConvPE convPE_;
-  Tensor tempOut_;
-  int num_ = 1;
 };
 }  // namespace zynqmp
 }  // namespace paddle
diff --git a/lite/backends/fpga/KD/pes/gru_pe.hpp b/lite/backends/fpga/KD/pes/gru_pe.hpp
old mode 100644
new mode 100755
index dcacab4eeef32b245d4126b72597b398a6627ba6..299ffb872b4620fc409eb8e66760a6308a814efb
--- a/lite/backends/fpga/KD/pes/gru_pe.hpp
+++ b/lite/backends/fpga/KD/pes/gru_pe.hpp
@@ -47,8 +47,10 @@ class GRUPE : public PE {
     zynqmp::Shape hidden_shape{zynqmp::NCHW, {1, frame_size, 1, 1}};
     float16* prev_hidden_data =
         prev_hidden_.mutableData<float16>(zynqmp::FP16, hidden_shape);
+    // set previous hidden data to 0;
     memset(prev_hidden_data, 0, hidden_shape.numel() * sizeof(float16));
 
+    // copy 2/3 weight from param.weight;
     zynqmp::Shape weight_shape{zynqmp::NC, {frame_size, frame_size * 2}};
     float* weight_data = weight_.mutableData<float>(zynqmp::FP32, weight_shape);
     memset(weight_data, 0, weight_shape.numel() * sizeof(float));
@@ -74,7 +76,9 @@ class GRUPE : public PE {
     reset_hidden_.mutableData<float16>(FP16, hidden_shape);
 
     ElementwiseMulParam& mul_param = mul_pe_.param();
-    mul_param.inputs = {&reset_gate_, &prev_hidden_};
+    // mul_param.inputs = {&reset_gate_, &prev_hidden_};
+    mul_param.input_x = &reset_gate_;
+    mul_param.input_y = &prev_hidden_;
     mul_param.output = &reset_hidden_;
     mul_pe_.init();
     mul_pe_.apply();
@@ -115,11 +119,8 @@ class GRUPE : public PE {
     if (hidden_prev) {
       // TODO(chonwhite): change to pre_out;
       prev_hidden_.copyFrom(value.pre_output);
-      prev_hidden_.saveToFile("prev_.txt");
     }
-
     mul_pe_.dispatch();
-    reset_hidden_.saveToFile("reset_hidden_.txt");
 
     update_gate_data += stride_update;
     reset_gate_data += stride_update;
diff --git a/lite/backends/fpga/KD/pes/norm_pe.hpp b/lite/backends/fpga/KD/pes/norm_pe.hpp
index 3e2fd8062766c84282233b91fcaecf5e0a26fd72..0537df27e212014ed309245b0e86b8d8f077489e 100644
--- a/lite/backends/fpga/KD/pes/norm_pe.hpp
+++ b/lite/backends/fpga/KD/pes/norm_pe.hpp
@@ -72,8 +72,10 @@ class NormPE : public PE {
     input_float.mutableData<float>(FP32, param_.input->shape());
     float_out.mutableData<float>(FP32, param_.output->shape());
 
+    // param_.input->syncToDevice();
     input_float.copyFrom(param_.input);
     input_float.syncToCPU();
+    // input_float.saveToFile("normalize_", true);
 
     int channel = input_float.shape().channel();
     int height = input_float.shape().height();
@@ -85,6 +87,7 @@ class NormPE : public PE {
     float* out_ptr = float_out.data<float>();
 
     int loop = height * width;
+#pragma omp parallel for
     for (int i = 0; i < loop; i++) {
       float sum = param_.epsilon;
       for (int c = 0; c < channel; c++) {
@@ -98,11 +101,26 @@ class NormPE : public PE {
       }
     }
     float_out.flush();
+    // float_out.saveToFile("normalize_", true);
     param_.output->copyFrom(&float_out);
   }
 
   bool dispatch() {
     cpuCompute();
+    // std::cout << "CPU normalize ---------------------" << std::endl;
+
+    // param_.input->syncToDevice();
+    // // param_.input->saveToFile("normalize_fpga_", true);
+    // config_norm_param(norm_param_args_);
+    // inplace_args_.normalize_enable = true;
+    // config_inplace(inplace_args_);
+
+    // perform_bypass(bypass_args_);
+    // inplace_args_.normalize_enable = false;
+    // config_inplace(inplace_args_);
+    // compute_norm(norm_args_);
+    // param_.output->saveToFile("normalize_fpga_", true);
+    // std::cout << "FPGA normalize ---------------------" << std::endl;
     return true;
   }
diff --git a/lite/backends/fpga/KD/pes/pooling_pe.hpp b/lite/backends/fpga/KD/pes/pooling_pe.hpp
old mode 100755
new mode 100644
index a8725b51a690e0e134785fcfdb3dd70edeffd441..60755ee1dbf81512bde618389cbf3a88cf93d1ce
--- a/lite/backends/fpga/KD/pes/pooling_pe.hpp
+++ b/lite/backends/fpga/KD/pes/pooling_pe.hpp
@@ -35,14 +35,17 @@ class PoolingPE : public PE {
     Tensor* input = param_.input;
     Tensor* output = param_.output;
 
-    uint32_t k_height = param_.kernelSize[0];
-    uint32_t k_width = param_.kernelSize[1];
+    uint32_t k_height = 1;
+    uint32_t k_width = 1;
 
     if (param_.globalPooling) {
       k_width = input->shape().width();
       k_height = input->shape().height();
       param_.kernelSize[0] = k_height;
       param_.kernelSize[1] = k_width;
+    } else {
+      k_height = param_.kernelSize[0];
+      k_width = param_.kernelSize[1];
     }
 
     PoolingArgs args = {0};
@@ -65,9 +68,12 @@ class PoolingPE : public PE {
     args.out_width = output->shape().width();
     param_.poolingArgs = args;
 
+    // use_cpu_ = output->shape().width() == 1 && output->shape().height() == 1
+    // &&
+    //            (k_width > 7 || k_height > 7);
     use_cpu_ = output->shape().width() == 1 && output->shape().height() == 1 &&
-               (k_width > 7 || k_height > 7);
-    use_cpu_ = param_.type == AVERAGE;
+               (k_width > 255 || k_height > 255);
+    // use_cpu_ = param_.type == AVERAGE;
   }
 
   void compute() {
@@ -76,6 +82,7 @@ class PoolingPE : public PE {
     input->syncToCPU();
 
     Tensor float_input;
+    // Tensor float_output;
     float* image_addr = float_input.mutableData<float>(FP32, input->shape());
     float_input.copyFrom(input);
     float16* data_out = output->data<float16>();
@@ -110,6 +117,8 @@ class PoolingPE : public PE {
         for (int c = 0; c < image_channels; ++c) {
           const int pool_index = (ph * pooled_width_ + pw) * image_channels + c;
           float sum = 0;
+          // const int index =
+          //     (hstart * image_width + wstart) * image_channels + c;
           for (int h = hstart; h < hend; ++h) {
             for (int w = wstart; w < wend; ++w) {
               const int index = (h * image_width + w) * image_channels + c;
@@ -138,7 +147,9 @@ class PoolingPE : public PE {
     Tensor float_input;
     float_input.mutableData<float>(FP32, input->shape());
     float_input.copyFrom(input);
+    // float_input.saveToFile("pool_float.txt");
     float16* data_out = output->data<float16>();
+
     int kernel_hw = param_.kernelSize[0] * param_.kernelSize[1];
 
     float scale_max = 0;
@@ -155,6 +166,7 @@ class PoolingPE : public PE {
     output->scale()[0] = scale_max / 127.0f;
     output->scale()[1] = 127.0f / scale_max;
     output->flush();
+    // exit(-1);
   }
 
   void cpu_compute() {
@@ -184,11 +196,14 @@ class PoolingPE : public PE {
     output->scale()[0] = scale_max / 127.0f;
     output->scale()[1] = 127.0f / scale_max;
     output->flush();
+    // exit(-1);
   }
 
   bool dispatch() {
     if (use_cpu_) {
+      // cpu_compute();
       compute();
+      // exit(-1);
       return true;
     }
     param_.input->syncToDevice();
diff --git a/lite/backends/fpga/KD/pes/scale_pe.hpp b/lite/backends/fpga/KD/pes/scale_pe.hpp
old mode 100644
new mode 100755
index cc89ac943f90cb20062a3d6ef9a46b705193ad04..09755c65a322da8ccab0d57dd2e877712b112361
--- a/lite/backends/fpga/KD/pes/scale_pe.hpp
+++ b/lite/backends/fpga/KD/pes/scale_pe.hpp
@@ -89,7 +89,6 @@ class ScalePE : public PE {
         }
       }
     }
-
     float* scale_data_float = param_.scale->data<float>();
     for (int i = 0; i < repeat; i++) {
       for (int j = 0; j < length; j++) {
diff --git a/lite/backends/fpga/KD/pes/split_pe.hpp b/lite/backends/fpga/KD/pes/split_pe.hpp
index 26598a4c87f0b88882b3fe76de64ddfa5c6cd6a8..01a036787441c596bf74858aa9bf6a6613864cc1 100644
--- a/lite/backends/fpga/KD/pes/split_pe.hpp
+++ b/lite/backends/fpga/KD/pes/split_pe.hpp
@@ -53,20 +53,37 @@ class SplitPE : public PE {
     int64_t src_after = src_stride_numel[axis];
     int64_t dst_after = dst_stride_numel[axis];
 
+    // PADDLE_MOBILE_ENFORCE(src_stride_numel.size() == dst_stride_numel.size(),
+    //                       "src and dst tensor should have the same dims
+    //                       size.");
+
     for (int64_t i = 0; i < axis; ++i) {
       if (i < axis) {
+        // PADDLE_MOBILE_ENFORCE(src_stride_numel[i] / src_stride_numel[axis] ==
+        //                           dst_stride_numel[i] /
+        //                           dst_stride_numel[axis],
+        //                       "src and dst should have the same elements "
+        //                       "except the specified axis.");
       } else if (i == axis) {
         continue;
      } else {
+        // PADDLE_MOBILE_ENFORCE(src_stride_numel[i] == dst_stride_numel[i],
+        //                       "src and dst should have the same elements "
+        //                       "except the specified axis.");
      }
    }
 
    for (int64_t i = 0; i < before; ++i) {
-      memory::Copy(dst + i * dst_after, src + i * src_after, sizeof(T) * size);
+      memcpy(dst + i * dst_after, src + i * src_after, sizeof(T) * size);
    }
  }
 
-  void split3D() { int axis = param_.axis; }
+  void split3D() {
+    int axis = param_.axis;
+    // float16* dst = param_.output->data<float16>();
+    // std::vector<int>& dst_dims = ;
+    // StridedNumelCopyWithAxis();
+  }
 
   bool dispatch() {
     Tensor* input = param_.input;
@@ -88,6 +105,7 @@ class SplitPE : public PE {
                                 in_stride,
                                 out_stride[axis]);
       input_offset += out_stride[axis];
+      // out->flush();
     }
     return true;
   }
 
     std::vector<Tensor*> outputs = param_.outputs;
     int in_channel = input->shape().channel();
-    int split_channel = input->shape().channel() / param_.num;
+    // int split_channel = input->shape().channel() / param_.num;
     int hw = input->shape().height() * input->shape().width();
     float16* in_data = input->data<float16>();
+
     for (int i = 0; i < hw; i++) {
+      int channel_stride = 0;
       for (int n = 0; n < outputs.size(); n++) {
         Tensor* out = outputs[n];
         float16* out_data = out->data<float16>();
-        memcpy(out_data + i * split_channel,
-               in_data + i * in_channel + n * split_channel,
-               split_channel * sizeof(float16));
+        memcpy(out_data + i * out->shape().channel(),
+               in_data + i * in_channel + channel_stride,
+               out->shape().channel() * sizeof(float16));
+        channel_stride += out->shape().channel();
       }
     }
+
     for (int n = 0; n < outputs.size(); n++) {
       Tensor* out = outputs[n];
+      out->flush();
       out->copyScaleFrom(input);
     }
     return true;
@@ -120,5 +143,6 @@ class SplitPE : public PE {
 
  private:
   SplitParam param_;
 };
+
 }  // namespace zynqmp
 }  // namespace paddle
diff --git a/lite/backends/fpga/KD/tensor.hpp b/lite/backends/fpga/KD/tensor.hpp
index f1b07d02622fad32e99205667424a4cb3c9fb46d..988bc1bb507036de8f13a6c6549c549718bd1256 100644
--- a/lite/backends/fpga/KD/tensor.hpp
+++ b/lite/backends/fpga/KD/tensor.hpp
@@ -300,14 +300,12 @@ class Tensor {
   }
 
   void flush() {
-    size_t memorySize =
-        shape_->memorySize(CellSize(dataType_)) * mem_scale_factor_;
+    size_t memorySize = placeHolder_->memorySize();
     fpga_flush(placeHolder_->data(), memorySize);
   }
 
   void invalidate() {
-    size_t memorySize =
-        shape_->memorySize(CellSize(dataType_)) * mem_scale_factor_;
+    size_t memorySize = placeHolder_->memorySize();
     fpga_invalidate(placeHolder_->data(), memorySize);
   }
 
@@ -385,6 +383,7 @@ class Tensor {
     invalidate();
     std::ofstream ofs;
     ofs.open(path);
+    ofs << scale()[0] << " / " << scale()[1] << std::endl;
 
     for (int i = 0; i < shape_->numel(); i++) {
       float value = 0;
diff --git a/lite/kernels/fpga/conv_compute.cc b/lite/kernels/fpga/conv_compute.cc
old mode 100755
new mode 100644
index a5a7c3b92f8ce01dacfc0518d0763cdb642bd838..69e600a043389fcc36bda2906f38432f2771aaf8
--- a/lite/kernels/fpga/conv_compute.cc
+++ b/lite/kernels/fpga/conv_compute.cc
@@ -46,7 +46,10 @@ void ConvCompute::PrepareForRun() {
     conv_param.dilations = *param.dilations;
     fill_scale_bias_const(&conv_param);
     conv_param.bias()->copyFrom(param.bias->ZynqTensor());
-    conv_param.relu.enabled = param.fuse_relu;
+
+    if (param.fuse_relu) {
+      conv_param.activeParam.type = zynqmp::TYPE_RELU;
+    }
 
     dw_conv_pe_.init();
     dw_conv_pe_.apply();
@@ -65,7 +68,16 @@ void ConvCompute::PrepareForRun() {
       conv_param.bias()->copyFrom(param.bias->ZynqTensor());
     }
 
-    conv_param.relu.enabled = param.fuse_relu;
+    if (param.fuse_relu) {
+      conv_param.activeParam.type = zynqmp::TYPE_RELU;
+    }
+
+    // conv_param.filter->saveToFile("conv_filter_", true);
+    // if (param.bias != nullptr) {
+    //   std::cout << "param.bias != nullptr" << std::endl;
+    //   conv_param.bias()->saveToFile("conv_bias_", true);
+    // }
+
     conv_pe_.init();
     conv_pe_.apply();
   }
@@ -76,8 +88,14 @@ void ConvCompute::Run() {
   if (param.x->ZynqTensor()->shape().channel() != 1 &&
       param.groups == param.x->ZynqTensor()->shape().channel()) {
     dw_conv_pe_.dispatch();
+#ifdef FPGA_PRINT_TENSOR
+    zynqmp::DepthwiseConvParam& dwconv_param = dw_conv_pe_.param();
+    Debugger::get_instance().registerOutput("dwconv", dwconv_param.output);
+#endif
   } else {
+    // zynqmp::ConvParam& conv_param = conv_pe_.param();
     conv_pe_.dispatch();
+
 #ifdef FPGA_PRINT_TENSOR
     zynqmp::ConvParam& conv_param = conv_pe_.param();
     Debugger::get_instance().registerOutput("conv", conv_param.output);
@@ -103,3 +121,21 @@ REGISTER_LITE_KERNEL(
                PRECISION(kFP16),
                DATALAYOUT(kNHWC))})
     .Finalize();
+
+REGISTER_LITE_KERNEL(depthwise_conv2d,
+                     kFPGA,
+                     kFP16,
+                     kNHWC,
+                     paddle::lite::kernels::fpga::ConvCompute,
+                     def)
+    .BindInput("Input",
+               {LiteType::GetTensorTy(TARGET(kFPGA),
+                                      PRECISION(kFP16),
+                                      DATALAYOUT(kNHWC))})
+    .BindInput("Bias", {LiteType::GetTensorTy(TARGET(kARM))})
+    .BindInput("Filter", {LiteType::GetTensorTy(TARGET(kARM))})
+    .BindOutput("Output",
+                {LiteType::GetTensorTy(TARGET(kFPGA),
+                                       PRECISION(kFP16),
+                                       DATALAYOUT(kNHWC))})
+    .Finalize();
diff --git a/lite/kernels/fpga/elementwise_compute.cc b/lite/kernels/fpga/elementwise_compute.cc
index 1010e5a4d6541b22c07d1d9efbc2465381081997..d22cc7abacc2ecd80e54aa5c62a7e57671b920c9 100755
--- a/lite/kernels/fpga/elementwise_compute.cc
+++ b/lite/kernels/fpga/elementwise_compute.cc
@@ -33,7 +33,8 @@ void ElementwiseAddCompute::PrepareForRun() {
   ew_param.inputs = {param.X->ZynqTensor(), param.Y->ZynqTensor()};
   ew_param.output = param.Out->ZynqTensor();
   ew_param.axis = param.axis;
-  ew_param.relu.enabled = false;
+
+  ew_param.activeParam.type = zynqmp::TYPE_NONE;
 
   pe_.init();
   pe_.apply();
@@ -56,7 +57,7 @@ void ElementwiseAddActivationCompute::PrepareForRun() {
   ew_param.inputs = {param.X->ZynqTensor(), param.Y->ZynqTensor()};
   ew_param.output = param.Out->ZynqTensor();
   ew_param.axis = param.axis;
-  ew_param.relu.enabled = true;
+  ew_param.activeParam.type = zynqmp::TYPE_RELU;
   pe_.init();
   pe_.apply();
 }
@@ -76,7 +77,7 @@ void ElementwiseMulCompute::PrepareForRun() {
 
   scale_param.input = param.X->ZynqTensor();
   scale_param.output = param.Out->ZynqTensor();
-  scale_param.relu.enabled = false;
+  scale_param.activeParam.type = zynqmp::TYPE_NONE;
 
   int channel = scale_param.input->shape().channel();
   zynqmp::Tensor* scale = new zynqmp::Tensor();
@@ -124,7 +125,10 @@ REGISTER_LITE_KERNEL(elementwise_add,
                {LiteType::GetTensorTy(TARGET(kFPGA),
                                       PRECISION(kFP16),
                                       DATALAYOUT(kNHWC))})
-    .BindInput("Y", {LiteType::GetTensorTy(TARGET(kARM))})
+    .BindInput("Y",
+               {LiteType::GetTensorTy(TARGET(kFPGA),
+                                      PRECISION(kFP16),
+                                      DATALAYOUT(kNHWC))})
     .BindOutput("Out",
                 {LiteType::GetTensorTy(TARGET(kFPGA),
                                        PRECISION(kFP16),
diff --git a/lite/kernels/fpga/elementwise_compute_test.cc b/lite/kernels/fpga/elementwise_compute_test.cc
index add60f64602105d317c3657985c0011aff246608..51c9d54cad9054e4767860096d42e9c991d4f936 100644
--- a/lite/kernels/fpga/elementwise_compute_test.cc
+++ b/lite/kernels/fpga/elementwise_compute_test.cc
@@ -93,18 +93,25 @@ void elementwise_compute_ref(const operators::ElementwiseParam& param,
   }
   // do elementwise add/sub/max...
   if (elt_type == "add") {
-    for (int i = 0; i < batch; ++i) {
-      for (int j = 0; j < channels; ++j) {
-        int offset = (i * channels + j) * num;
-        const dtype* din_ptr = x_data + offset;
-        const dtype diny_data = y_data[j];
-        dtype* dout_ptr = out_data + offset;
-        for (int k = 0; k < num; ++k) {
-          *dout_ptr = sum(*din_ptr, diny_data);
-          dout_ptr++;
-          din_ptr++;
-        }
-      }
+    // for (int i = 0; i < batch; ++i) {
+    //   for (int j = 0; j < channels; ++j) {
+    //     int offset = (i * channels + j) * num;
+    //     const dtype* din_ptr = x_data + offset;
+    //     const dtype diny_data = y_data[j];
+    //     dtype* dout_ptr = out_data + offset;
+    //     for (int k = 0; k < num; ++k) {
+    //       *dout_ptr =
+    //           zynqmp::float_to_half(sum(zynqmp::half_to_float(*din_ptr),
+    //                                     zynqmp::half_to_float(diny_data)));
+    //       dout_ptr++;
+    //       din_ptr++;
+    //     }
+    //   }
+    // }
+    int count = x_dims[0] * x_dims[1] * x_dims[2] * x_dims[3];
+    for (int i = 0; i < count; ++i) {
+      out_data[i] = zynqmp::float_to_half(sum(
+          zynqmp::half_to_float(x_data[i]), zynqmp::half_to_float(y_data[i])));
     }
   } else if (elt_type == "sub") {
     for (int i = 0; i < batch; ++i) {
@@ -148,9 +155,9 @@ TEST(elementwise_add, compute) {
   lite::Tensor x, y, output, output_ref;
 
   for (auto n : {1}) {
-    for (auto c : {8}) {
-      for (auto h : {8}) {
-        for (auto w : {8}) {
+    for (auto h : {72}) {
+      for (auto w : {192}) {
+        for (auto c : {24}) {
           for (auto axis : {0}) {
             for (auto yd : {std::vector<int64_t>({n, c, h, w})}) {
               auto x_dim = DDim(std::vector<int64_t>({n, c, h, w}));
@@ -174,10 +181,16 @@ TEST(elementwise_add, compute) {
               auto* output_ref_data =
                   output_ref.mutable_data<float16>(TARGET(kFPGA));
               for (int i = 0; i < x_dim.production(); i++) {
-                x_data[i] = zynqmp::float_to_half(i);
+                float sign = i % 3 == 0 ? -0.03 : 0.05f;
+                float x = sign * (i % 128);
+                std::cout << "x:" << x << std::endl;
+                x_data[i] = zynqmp::float_to_half(x);
               }
               for (int i = 0; i < y_dim.production(); i++) {
-                y_data[i] = zynqmp::float_to_half(i);
+                float sign = i % 3 == 0 ? -0.03 : 0.05f;
@@ -148,9 +155,9 @@ TEST(elementwise_add, compute) {
   lite::Tensor x, y, output, output_ref;
 
   for (auto n : {1}) {
-    for (auto c : {8}) {
-      for (auto h : {8}) {
-        for (auto w : {8}) {
+    for (auto h : {72}) {
+      for (auto w : {192}) {
+        for (auto c : {24}) {
           for (auto axis : {0}) {
             for (auto yd : {std::vector<int64_t>({n, c, h, w})}) {
               auto x_dim = DDim(std::vector<int64_t>({n, c, h, w}));
@@ -174,10 +181,16 @@ TEST(elementwise_add, compute) {
               auto* output_ref_data =
                   output_ref.mutable_data<float16>(TARGET(kFPGA));
               for (int i = 0; i < x_dim.production(); i++) {
-                x_data[i] = zynqmp::float_to_half(i);
+                float sign = i % 3 == 0 ? -0.03f : 0.05f;
+                float x = sign * (i % 128);
+                std::cout << "x:" << x << std::endl;
+                x_data[i] = zynqmp::float_to_half(x);
               }
               for (int i = 0; i < y_dim.production(); i++) {
-                y_data[i] = zynqmp::float_to_half(i);
+                float sign = i % 3 == 0 ? -0.03f : 0.05f;
+                float y = sign * (i % 128);
+                std::cout << "y:" << y << std::endl;
+                y_data[i] = zynqmp::float_to_half(y);
               }
               param.X = &x;
               param.Y = &y;
@@ -190,7 +203,14 @@ TEST(elementwise_add, compute) {
               elementwise_compute_ref(param, "add", "");
               for (int i = 0; i < output.dims().production(); i++) {
-                EXPECT_NEAR(output_data[i], output_ref_data[i], 1e-5);
+                std::cout << "output_data:"
+                          << zynqmp::half_to_float(output_data[i])
+                          << ",output_ref_data:"
+                          << zynqmp::half_to_float(output_ref_data[i])
+                          << std::endl;
+                EXPECT_NEAR(zynqmp::half_to_float(output_data[i]),
+                            zynqmp::half_to_float(output_ref_data[i]),
+                            1e-5);
               }
             }
           }
@@ -209,73 +229,74 @@ TEST(fusion_elementwise_add_activation_fpga, retrive_op) {
   ASSERT_TRUE(fusion_elementwise_add_activation.front());
 }
 
-TEST(fusion_elementwise_add_activation_fpga, init) {
-  ElementwiseAddActivationCompute fusion_elementwise_add_activation;
-  ASSERT_EQ(fusion_elementwise_add_activation.precision(), PRECISION(kFP16));
-  ASSERT_EQ(fusion_elementwise_add_activation.target(), TARGET(kFPGA));
-}
+// TEST(fusion_elementwise_add_activation_fpga, init) {
+//   ElementwiseAddActivationCompute fusion_elementwise_add_activation;
+//   ASSERT_EQ(fusion_elementwise_add_activation.precision(), PRECISION(kFP16));
+//   ASSERT_EQ(fusion_elementwise_add_activation.target(), TARGET(kFPGA));
+// }
 
-TEST(fusion_elementwise_add_activation_fpga, compute) {
-  ElementwiseAddActivationCompute fusion_elementwise_add_activation;
-  operators::FusionElementwiseActivationParam param;
-  lite::Tensor x, y, output, output_ref;
+// TEST(fusion_elementwise_add_activation_fpga, compute) {
+//   ElementwiseAddActivationCompute fusion_elementwise_add_activation;
+//   operators::FusionElementwiseActivationParam param;
+//   lite::Tensor x, y, output, output_ref;
 
-  for (auto act_type : {"relu"}) {
-    for (auto n : {1}) {
-      for (auto c : {8}) {
-        for (auto h : {8}) {
-          for (auto w : {8}) {
-            for (auto axis : {0}) {
-              for (auto yd : {std::vector<int64_t>({n, c, h, w})}) {
-                auto x_dim = DDim(std::vector<int64_t>({n, c, h, w}));
-                auto y_dim = DDim(yd);
-                int axis_t = axis < 0 ? x_dim.size() - y_dim.size() : axis;
+// for (auto act_type : {"relu"}) {
+//   for (auto n : {1}) {
+//     for (auto c : {8}) {
+//       for (auto h : {8}) {
+//         for (auto w : {8}) {
+//           for (auto axis : {0}) {
+//             for (auto yd : {std::vector<int64_t>({n, c, h, w})}) {
+//               auto x_dim = DDim(std::vector<int64_t>({n, c, h, w}));
+//               auto y_dim = DDim(yd);
+//               int axis_t = axis < 0 ? x_dim.size() - y_dim.size() : axis;
 
-                if (axis_t + y_dim.size() > 4) continue;
-                bool flag = false;
-                for (int i = 0; i < y_dim.size(); i++) {
-                  if (x_dim[i + axis_t] != y_dim[i]) flag = true;
-                }
-                if (flag) continue;
+//               if (axis_t + y_dim.size() > 4) continue;
+//               bool flag = false;
+//               for (int i = 0; i < y_dim.size(); i++) {
+//                 if (x_dim[i + axis_t] != y_dim[i]) flag = true;
+//               }
+//               if (flag) continue;
 
-                x.Resize(x_dim);
-                y.Resize(y_dim);
-                output.Resize(x_dim);
-                output_ref.Resize(x_dim);
-                auto* x_data = x.mutable_data<float16>(TARGET(kFPGA));
-                auto* y_data = y.mutable_data<float16>(TARGET(kFPGA));
-                auto* output_data = output.mutable_data<float16>(TARGET(kFPGA));
-                auto* output_ref_data =
-                    output_ref.mutable_data<float16>(TARGET(kFPGA));
-                for (int i = 0; i < x_dim.production(); i++) {
-                  float sign = i % 3 == 0 ? -1.0f : 1.0f;
-                  x_data[i] = zynqmp::float_to_half(i * sign);
-                }
-                for (int i = 0; i < y_dim.production(); i++) {
-                  float sign = i % 2 == 0 ? 0.5f : -0.5f;
-                  y_data[i] = zynqmp::float_to_half(i * sign);
-                }
-                param.X = &x;
-                param.Y = &y;
-                param.axis = axis;
-                param.Out = &output;
-                param.act_type = act_type;
-                fusion_elementwise_add_activation.SetParam(param);
-                fusion_elementwise_add_activation.PrepareForRun();
-                fusion_elementwise_add_activation.Run();
-                param.Out = &output_ref;
-                elementwise_compute_ref(param, "add", act_type);
-                for (int i = 0; i < output.dims().production(); i++) {
-                  EXPECT_NEAR(output_data[i], output_ref_data[i], 1e-5);
-                }
-              }
-            }
-          }
-        }
-      }
-    }
-  }
-}
+//               x.Resize(x_dim);
+//               y.Resize(y_dim);
+//               output.Resize(x_dim);
+//               output_ref.Resize(x_dim);
+//               auto* x_data = x.mutable_data<float16>(TARGET(kFPGA));
+//               auto* y_data = y.mutable_data<float16>(TARGET(kFPGA));
+//               auto* output_data =
+//                   output.mutable_data<float16>(TARGET(kFPGA));
+//               auto* output_ref_data =
+//                   output_ref.mutable_data<float16>(TARGET(kFPGA));
+//               for (int i = 0; i < x_dim.production(); i++) {
+//                 float sign = i % 3 == 0 ? -1.0f : 1.0f;
+//                 x_data[i] = zynqmp::float_to_half(i * sign);
+//               }
+//               for (int i = 0; i < y_dim.production(); i++) {
+//                 float sign = i % 2 == 0 ? 0.5f : -0.5f;
+//                 y_data[i] = zynqmp::float_to_half(i * sign);
+//               }
+//               param.X = &x;
+//               param.Y = &y;
+//               param.axis = axis;
+//               param.Out = &output;
+//               param.act_type = act_type;
+//               fusion_elementwise_add_activation.SetParam(param);
+//               fusion_elementwise_add_activation.PrepareForRun();
+//               fusion_elementwise_add_activation.Run();
+//               param.Out = &output_ref;
+//               elementwise_compute_ref(param, "add", act_type);
+//               for (int i = 0; i < output.dims().production(); i++) {
+//                 EXPECT_NEAR(output_data[i], output_ref_data[i], 1e-5);
+//               }
+//             }
+//           }
+//         }
+//       }
+//     }
+//   }
+// }
 }  // namespace fpga
 }  // namespace kernels
@@ -283,4 +304,4 @@ TEST(fusion_elementwise_add_activation_fpga, compute) {
 }  // namespace paddle
 
 USE_LITE_KERNEL(elementwise_add, kFPGA, kFP16, kNHWC, def);
-USE_LITE_KERNEL(fusion_elementwise_add_activation, kFPGA, kFP16, kNHWC, def);
+// USE_LITE_KERNEL(fusion_elementwise_add_activation, kFPGA, kFP16, kNHWC, def);
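On the 1e-5 tolerance kept in the test above: at the magnitudes it now feeds (up to 0.05f * 127, about 6.35), a binary16 ulp is roughly 4e-3, so comparing against exact float arithmetic could never pass; comparing two values that went through the same fp16 quantization can, provided the device path rounds the same way as the reference. A small check of that ulp, under the assumption the stored data really is IEEE binary16:

    #include <cmath>
    #include <cstdio>

    // binary16 ulp of a finite normal value v: 2^(floor(log2(|v|)) - 10).
    static float fp16_ulp(float v) {
      int e = static_cast<int>(std::floor(std::log2(std::fabs(v))));
      return std::ldexp(1.0f, e - 10);
    }

    int main() {
      float v = 0.05f * 127;  // largest |x| generated by the patched test
      std::printf("value=%f fp16_ulp=%f\n", v, fp16_ulp(v));  // ~6.35, ~0.0039
      return 0;
    }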
diff --git a/lite/kernels/fpga/feed_compute.cc b/lite/kernels/fpga/feed_compute.cc
index 7670bf0007def88c27c12ea54c569a7fcf263693..79329e99a3e5e812dca487c17452f3f5d1e96449 100755
--- a/lite/kernels/fpga/feed_compute.cc
+++ b/lite/kernels/fpga/feed_compute.cc
@@ -67,3 +67,13 @@ REGISTER_LITE_KERNEL(
                                        PRECISION(kFP16),
                                        DATALAYOUT(kNHWC))})
     .Finalize();
+
+REGISTER_LITE_KERNEL(feed,
+                     kFPGA,
+                     kFP16,
+                     kNHWC,
+                     paddle::lite::kernels::fpga::FeedCompute,
+                     def_host)
+    .BindInput("X", {LiteType::GetTensorTy(TARGET(kHost))})
+    .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kHost))})
+    .Finalize();
diff --git a/lite/kernels/fpga/fetch_compute.cc b/lite/kernels/fpga/fetch_compute.cc
old mode 100644
new mode 100755
index 9b5f3f60232bb8527f823395693cf3b3851bc04e..2d296f4d4a89b1fd86e5b2330d3caf44fbad0903
--- a/lite/kernels/fpga/fetch_compute.cc
+++ b/lite/kernels/fpga/fetch_compute.cc
@@ -43,8 +43,14 @@ void FetchCompute::PrepareForRun() {
 }
 
 void FetchCompute::Run() {
-  pe_.dispatch();
   auto& param = this->Param<param_t>();
+  auto fetch_list = param.fetch_list;
+  if (fetch_list->size() <= static_cast<size_t>(param.col)) {
+    fetch_list->resize(param.col + 1);
+  }
+  Tensor& out = param.fetch_list->at(param.col);
+  out.Resize(param.input->dims());
+  pe_.dispatch();
 
 #ifdef FPGA_PRINT_TENSOR
   zynqmp::OutputParam& fetch_param = pe_.param();
@@ -67,10 +73,7 @@ REGISTER_LITE_KERNEL(fetch,
                {LiteType::GetTensorTy(TARGET(kFPGA),
                                       PRECISION(kAny),
                                       DATALAYOUT(kAny))})
-    .BindOutput("Out",
-                {LiteType::GetTensorTy(TARGET(kHost),
-                                       PRECISION(kAny),
-                                       DATALAYOUT(kAny))})
+    .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kHost))})
     .Finalize();
 
 REGISTER_LITE_KERNEL(fetch,
@@ -79,12 +82,6 @@ REGISTER_LITE_KERNEL(fetch,
                      kNHWC,
                      paddle::lite::kernels::fpga::FetchCompute,
                      host_host)
-    .BindInput("X",
-               {LiteType::GetTensorTy(TARGET(kHost),
-                                      PRECISION(kAny),
-                                      DATALAYOUT(kAny))})
-    .BindOutput("Out",
-                {LiteType::GetTensorTy(TARGET(kHost),
-                                       PRECISION(kAny),
-                                       DATALAYOUT(kAny))})
+    .BindInput("X", {LiteType::GetTensorTy(TARGET(kHost))})
+    .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kHost))})
     .Finalize();
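FetchCompute::Run() now grows the fetch list before dispatching, so writing into column `col` is safe even when earlier columns were never filled. The guard pattern in isolation; std::vector and the placeholder Tensor stand in for the fetch-list types, which this diff does not show:

    #include <cassert>
    #include <vector>

    struct Tensor {};  // placeholder for lite::Tensor

    // Ensure slot `col` exists before writing to it; mirrors the new
    // resize-then-at() sequence in FetchCompute::Run().
    Tensor& fetch_slot(std::vector<Tensor>* fetch_list, int col) {
      assert(col >= 0);
      if (fetch_list->size() <= static_cast<size_t>(col)) {
        fetch_list->resize(col + 1);
      }
      return fetch_list->at(col);
    }

    int main() {
      std::vector<Tensor> fetch_list;
      Tensor& out = fetch_slot(&fetch_list, 3);  // grows the list to 4 slots
      (void)out;
      return fetch_list.size() == 4 ? 0 : 1;
    }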
diff --git a/lite/kernels/fpga/gru_compute.cc b/lite/kernels/fpga/gru_compute.cc
index 25fdcb505bcc8221da74b8cc87dc4fbec86b6190..a157382a6fb7ac39e4b102f5ac65dea337ed0f13 100755
--- a/lite/kernels/fpga/gru_compute.cc
+++ b/lite/kernels/fpga/gru_compute.cc
@@ -12,6 +12,7 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 #include "lite/kernels/fpga/gru_compute.h"
+
 #include <memory>
 #include <string>
 #include <vector>
@@ -83,6 +84,7 @@ void GRUCompute::PrepareForRun() {
 void GRUCompute::Run() {
   auto& param = this->Param<param_t>();
   param.hidden->mutable_data<float>();
+
   // inputs
   auto input = param.input;
   auto h0 = param.h0;
@@ -130,6 +132,7 @@ void GRUCompute::Run() {
     //     //3.
     gru_value.prev_out_value = ordered_h0.mutable_data<float>();
     gru_tensors.pre_output = ordered_h0.ZynqTensor();
+
   } else {
     gru_value.prev_out_value = nullptr;
     gru_tensors.pre_output = nullptr;
@@ -169,6 +172,7 @@ void GRUCompute::Run() {
 
   float* hidden_data =
       hidden_out.mutableData<float>(zynqmp::FP32, float_input_shape);
+
   gru_tensors.gate = &float_input;
   gru_tensors.output = &hidden_out;
 
@@ -187,11 +191,6 @@ void GRUCompute::Run() {
   *(batch_hidden->mutable_lod()) = batch_gate->lod();
   batch_hidden->mutable_data<float>();
   to_seq(*batch_hidden, hidden);
-
-  save_tensor(const_cast<lite::Tensor*>(input), "_input.txt");
-  save_tensor(hidden, "_gru.txt");
-
-  exit(-1);
 }
 
 }  // namespace fpga
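The io_copy change below adds a hwc_to_chw conversion: it walks the HWC source linearly and scatters each element (n, h, w, c) to offset n*CHW + c*HW + h*W + w in the CHW destination. A tiny standalone check of that index math; all names here are local to the sketch:

    #include <cstdio>

    // Verify the NHWC -> NCHW index mapping used by hwc_to_chw.
    int main() {
      const int N = 1, H = 2, W = 3, C = 2;
      float hwc[N * H * W * C], chw[N * H * W * C];
      for (int i = 0; i < N * H * W * C; ++i) hwc[i] = static_cast<float>(i);

      int index = 0;  // linear read position in the HWC source
      for (int n = 0; n < N; ++n)
        for (int h = 0; h < H; ++h)
          for (int w = 0; w < W; ++w)
            for (int c = 0; c < C; ++c)
              chw[n * C * H * W + c * H * W + h * W + w] = hwc[index++];

      // (n=0, h=0, w=1, c=1): HWC offset (0*3+1)*2+1 = 3,
      // CHW offset 1*(2*3) + 0*3 + 1 = 7, so chw[7] must equal hwc[3].
      std::printf("chw[7]=%f (expect hwc[3]=%f)\n", chw[7], hwc[3]);
      return 0;
    }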
diff --git a/lite/kernels/fpga/io_copy_compute.cc b/lite/kernels/fpga/io_copy_compute.cc
old mode 100644
new mode 100755
index 10a0e3116b920a2f408606ef211f408ed2279f60..4554c24e07de656b948826c2fa6f9526f61daaa6
--- a/lite/kernels/fpga/io_copy_compute.cc
+++ b/lite/kernels/fpga/io_copy_compute.cc
@@ -119,7 +119,79 @@ class IoCopyFpgaToHostCompute
     auto out_lod = param.y->mutable_lod();
     *out_lod = param.x->lod();
   }
+  std::string doc() const override { return "Copy IO from FPGA to HOST"; }
+};
+
+void hwc_to_chw(float* chw_data,
+                float* hwc_data,
+                int num,
+                int channel,
+                int height,
+                int width) {
+  int chw = channel * height * width;
+  int wh = width * height;
+  int index = 0;
+  for (int n = 0; n < num; n++) {
+    for (int h = 0; h < height; h++) {
+      for (int w = 0; w < width; w++) {
+        for (int c = 0; c < channel; c++) {
+          chw_data[n * chw + c * wh + h * width + w] = hwc_data[index];
+          index++;
+        }
+      }
+    }
+  }
+}
+
+class IoCopyFpgaToHostCHWCompute
+    : public KernelLite<TARGET(kFPGA), PRECISION(kAny), DATALAYOUT(kAny)> {
+ public:
+  void Run() override {
+    auto& param = Param<operators::IoCopyParam>();
+    CHECK(param.x->target() == TARGET(kHost) ||
+          param.x->target() == TARGET(kFPGA));
+
+    Tensor hwc;
+    hwc.Resize(param.y->dims());
+    float* hwc_data = hwc.mutable_data<float>();
+
+    float* chw_data = param.y->mutable_data<float>();
+    param.y->ZynqTensor()->setDataType(zynqmp::FP32);
+    param.x->ZynqTensor()->syncToDevice();
+
+    if (param.x->ZynqTensor()->aligned() &&
+        param.x->ZynqTensor()->shape().shouldAlign()) {
+      zynqmp::Tensor tempTensor;
+      tempTensor.mutableData<zynqmp::float16>(zynqmp::FP16,
+                                              param.x->ZynqTensor()->shape());
+      tempTensor.copyFrom(param.x->ZynqTensor());
+      tempTensor.setAligned(true);
+      tempTensor.unalignImage();
+      hwc.ZynqTensor()->copyFrom(&tempTensor);
+    } else {
+      hwc.ZynqTensor()->copyFrom(param.x->ZynqTensor());
+    }
+
+    auto dims = param.y->ZynqTensor()->shape();
+    hwc_to_chw(chw_data,
+               hwc_data,
+               dims.num(),
+               dims.channel(),
+               dims.height(),
+               dims.width());
+
+    param.y->ZynqTensor()->copyScaleFrom(param.x->ZynqTensor());
+    param.y->ZynqTensor()->flush();
+    auto out_lod = param.y->mutable_lod();
+    *out_lod = param.x->lod();
+  }
 
   std::string doc() const override { return "Copy IO from FPGA to HOST"; }
 };
@@ -170,7 +242,7 @@ REGISTER_LITE_KERNEL(io_copy,
                                       PRECISION(kFP16),
                                       DATALAYOUT(kNHWC))})
     .BindOutput("Out",
-                {LiteType::GetTensorTy(TARGET(kARM),
+                {LiteType::GetTensorTy(TARGET(kHost),
                                        PRECISION(kFloat),
                                        DATALAYOUT(kNHWC))})
     .Finalize();
@@ -179,8 +251,8 @@ REGISTER_LITE_KERNEL(io_copy,
                      kFPGA,
                      kAny,
                      kAny,
-                     paddle::lite::kernels::fpga::IoCopyFpgaToHostCompute,
-                     device_to_host_22)
+                     paddle::lite::kernels::fpga::IoCopyFpgaToHostCHWCompute,
+                     device_to_host_chw)
     .BindInput("Input",
                {LiteType::GetTensorTy(TARGET(kFPGA),
                                       PRECISION(kFP16),
diff --git a/lite/kernels/fpga/multiclass_nms_compute.cc b/lite/kernels/fpga/multiclass_nms_compute.cc
index cee5e16205370df7faabc6f37d57fe360e8a9e67..4834054df6371a9faaa17bd17b53a29b999ddf03 100644
--- a/lite/kernels/fpga/multiclass_nms_compute.cc
+++ b/lite/kernels/fpga/multiclass_nms_compute.cc
@@ -384,6 +384,7 @@ void MulticlassNmsCompute::Run() {
           scores_slice, boxes_slice, all_indices[i], score_dims.size(), &out);
         outs->ZynqTensor()->copyFrom(out.ZynqTensor());
       }
+      outs->Resize({static_cast<int64_t>(e - s), out_dim});
     }
   }
   LoD lod;
diff --git a/lite/kernels/fpga/pooling_compute.cc b/lite/kernels/fpga/pooling_compute.cc
old mode 100644
new mode 100755
index 5411d3370d4b39a7d587cbf1887e7d1372b30036..64e5d204336303ca0deca796f7145ac88c016330
--- a/lite/kernels/fpga/pooling_compute.cc
+++ b/lite/kernels/fpga/pooling_compute.cc
@@ -34,7 +34,9 @@ void PoolCompute::PrepareForRun() {
   zynqmp::PoolingParam& pool_param = pe_.param();
   pool_param.input = param.x->ZynqTensor();
   pool_param.output = param.output->ZynqTensor();
-  pool_param.relu.enabled = false;
+
+  pool_param.activeParam.type = zynqmp::TYPE_NONE;
+
   pool_param.type = param.pooling_type == "max" ? zynqmp::PoolingType::MAX
                                                 : zynqmp::PoolingType::AVERAGE;
   pool_param.globalPooling = param.global_pooling;
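The pooling hunk is the one spot where the relu.enabled migration is easy to get wrong: the deleted line was `relu.enabled = false`, so the faithful replacement is `TYPE_NONE`; writing `TYPE_RELU` there would fuse a ReLU into every pooling op and clamp negative average-pool outputs to zero. A compact summary of the mapping applied across this patch; the enum usage is taken from the diff, while the table itself is an illustration and exists nowhere in the tree:

    // Illustration only: the activation each touched kernel should carry
    // after the relu.enabled -> activeParam.type migration. Requires the
    // zynqmp parameter header that declares ActiveType.
    struct Migration {
      const char* kernel;
      bool old_relu_enabled;
      zynqmp::ActiveType new_type;
    };

    const Migration kMigrations[] = {
        {"elementwise_add", false, zynqmp::TYPE_NONE},
        {"fusion_elementwise_add_activation", true, zynqmp::TYPE_RELU},
        {"elementwise_mul (via scale)", false, zynqmp::TYPE_NONE},
        {"pooling", false, zynqmp::TYPE_NONE},
    };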