diff --git a/lite/backends/fpga/CMakeLists.txt b/lite/backends/fpga/CMakeLists.txt
index b12fd85caf7e0c79de830b45569e02ba916c34e6..a5207c01a4d5e7b8d05490bd7c9be0dcc01f365e 100644
--- a/lite/backends/fpga/CMakeLists.txt
+++ b/lite/backends/fpga/CMakeLists.txt
@@ -3,13 +3,35 @@ if (NOT LITE_WITH_FPGA)
 endif()
 
 set(LITE_FPGA_KD_PATH "${PADDLE_SOURCE_DIR}/lite/backends/fpga/KD")
+set(LITE_FPGA_KD_LLAPI_PATH "${PADDLE_SOURCE_DIR}/lite/backends/fpga/KD/llapi")
+set(LITE_FPGA_KD_PE_PATH "${PADDLE_SOURCE_DIR}/lite/backends/fpga/KD/pes")
 set(LITE_FPGA_PATH "${PADDLE_SOURCE_DIR}/lite/backends/fpga")
 message("fpga_kd_path ${LITE_FPGA_KD_PATH}")
 message("fpga_path ${LITE_FPGA_PATH}")
 
-file(GLOB_RECURSE KD_CPP *.cpp *.cc)
+file(GLOB KD_CPP "${LITE_FPGA_KD_PATH}/*.cpp")
+file(GLOB PE_CPP "${LITE_FPGA_KD_PE_PATH}/*.cpp")
+file(GLOB LLAPI_CPP "${LITE_FPGA_KD_LLAPI_PATH}/*.cpp")
 file(GLOB FPGA_CPP "${LITE_FPGA_PATH}/*.cc")
-
-cc_library(kernel_fpga SRCS ${KD_CPP} ${FPGA_CPP})
+set(FPGA_ALL_CPP "")
+FOREACH(FILE_PATH ${KD_CPP})
+  STRING(REGEX REPLACE ".+/(.+\\..*)" "\\1" FILE_NAME ${FILE_PATH})
+  list(APPEND FPGA_ALL_CPP KD/${FILE_NAME})
+ENDFOREACH(FILE_PATH)
+FOREACH(FILE_PATH ${PE_CPP})
+  STRING(REGEX REPLACE ".+/(.+\\..*)" "\\1" FILE_NAME ${FILE_PATH})
+  list(APPEND FPGA_ALL_CPP KD/pes/${FILE_NAME})
+ENDFOREACH(FILE_PATH)
+FOREACH(FILE_PATH ${LLAPI_CPP})
+  STRING(REGEX REPLACE ".+/(.+\\..*)" "\\1" FILE_NAME ${FILE_PATH})
+  list(APPEND FPGA_ALL_CPP KD/llapi/${FILE_NAME})
+ENDFOREACH(FILE_PATH)
+FOREACH(FILE_PATH ${FPGA_CPP})
+  STRING(REGEX REPLACE ".+/(.+\\..*)" "\\1" FILE_NAME ${FILE_PATH})
+  list(APPEND FPGA_ALL_CPP ${FILE_NAME})
+ENDFOREACH(FILE_PATH)
+message("fpga kd: ${FPGA_ALL_CPP}")
+cc_library(kernel_fpga SRCS ${FPGA_ALL_CPP})
+#cc_library(kernel_fpga SRCS ${KD_CPP} ${FPGA_CPP})
 
 cc_library(lite_tensor_fpga SRCS lite_tensor.cc DEPS memory)
-cc_library(fpga_target_wrapper SRCS ${LITE_FPGA_PATH}/target_wrapper.cc DEPS kernel_fpga)
+cc_library(fpga_target_wrapper SRCS target_wrapper.cc DEPS kernel_fpga)
diff --git a/lite/backends/fpga/KD/debugger.hpp b/lite/backends/fpga/KD/debugger.hpp
new file mode 100755
index 0000000000000000000000000000000000000000..33efaf20169dfad4035d40d3ca02ac7dc7047db3
--- /dev/null
+++ b/lite/backends/fpga/KD/debugger.hpp
@@ -0,0 +1,151 @@
+// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include <fstream>
+#include <iostream>
+#include <string>
+#include <unordered_map>
+
+// #include "lite/backends/fpga/lite_tensor.h"
+#include "lite/core/tensor.h"
+
+namespace paddle {
+namespace lite {
+
+#define FPGA_PRINT_TENSOR
+
+class Debugger {
+ public:
+  static Debugger& get_instance() {
+    static Debugger s_instance;
+    return s_instance;
+  }
+
+  void registerOutput(std::string op_type, zynqmp::Tensor* tensor) {
+    // tensor->printScale();
+    if (op_type != "conv") {
+      // tensor->saveToFile(op_type, true);
+    }
+  }
+
+ private:
+  std::unordered_map<std::string, bool> op_config;
+  Debugger() {
+    op_config["concat"] = true;
+    op_config["conv"] = true;
+    op_config["crop"] = true;
+  }
+};
+
+inline void chw_to_hwc(Tensor* t, float* dst) {
+  int num = t->dims()[0];
+  int channel = t->dims()[1];
+
+  int height = 1;
+  int width = 1;
+  if (t->dims().size() > 2) {
+    height = t->dims()[2];
+  }
+  if (t->dims().size() > 3) {
+    width = t->dims()[3];
+  }
+  // int width = t->dims()[3];
+  const float* chw_data = t->data<float>();
+  float* hwc_data = dst;
+
+  int chw = channel * height * width;
+  int wc = width * channel;
+  int index = 0;
+  for (int n = 0; n < num; n++) {
+    for (int c = 0; c < channel; c++) {
+      for (int h = 0; h < height; h++) {
+        for (int w = 0; w < width; w++) {
+          hwc_data[n * chw + h * wc + w * channel + c] = chw_data[index];
+          index++;
+        }
+      }
+    }
+  }
+}
+
+inline void read_from_file(lite::Tensor* t, const std::string& path) {
+  std::ifstream file_stream;
+  file_stream.open(path);
+  if (!file_stream) {
+    return;
+  }
+  float* data = t->mutable_data<float>();
+  int num = t->numel();
+  for (int i = 0; i < num; ++i) {
+    float value = 0;
+    file_stream >> value;
+    data[i] = value;
+  }
+  // flush();
+}
+
+inline void save_float(float* data, const std::string& name, int len) {
+  // return;
+  static int counter = 0;
+  std::string old_string = std::to_string(counter);
+  std::string new_string =
+      std::string(3 - old_string.length(), '0') + old_string;
+
+  std::string file = "arm_" + new_string + name;
+  counter++;
+
+  std::cout
+      << "-------------------------- saving file: --------------------------"
+      << file << std::endl;
+  std::ofstream ofs;
+  ofs.open(file);
+  // float* data = dst;
+  for (int i = 0; i < len; i++) {
+    float value = data[i];
+    ofs << value << std::endl;
+  }
+  ofs.close();
+}
+
+inline void save_tensor(lite::Tensor* t,
+                        const std::string& name,
+                        bool convert = true) {
+  float* data = const_cast<float*>(t->data<float>());
+  float* dst = new float[t->numel()];
+  if (convert) {
+    chw_to_hwc(t, dst);
+    data = dst;
+  }
+
+  save_float(data, name, t->numel());
+  delete[] dst;
+}
+
+inline void save_tensor(const lite::Tensor* t,
+                        const std::string& name,
+                        bool convert = true) {
+  // return;
+  float* data = const_cast<float*>(t->data<float>());
+  float* dst = new float[t->numel()];
+  if (convert) {
+    chw_to_hwc(const_cast<Tensor*>(t), dst);
+    data = dst;
+  }
+
+  save_float(data, name, t->numel());
+
+  delete[] dst;
+}
+}  // namespace lite
+}  // namespace paddle
diff --git a/lite/backends/fpga/KD/dl_engine.cpp b/lite/backends/fpga/KD/dl_engine.cpp
old mode 100644
new mode 100755
index 9849e4275b5d0f59346b9684530610853f1a560c..ea503518a0f39671e77157f14788a1cadb4579f3
--- a/lite/backends/fpga/KD/dl_engine.cpp
+++ b/lite/backends/fpga/KD/dl_engine.cpp
@@ -13,14 +13,15 @@ See the License for the specific language governing permissions and
 limitations under the License.
 */
 #include "lite/backends/fpga/KD/dl_engine.hpp"
+
 namespace paddle {
 namespace zynqmp {
 DLEngine::DLEngine() {
   open_device();
-  struct DeviceInfo info;
-  int ret = get_device_info(info);
-  filter::set_filter_capacity(info.filter_cap);
+  int ret = get_device_info(info_);
+  filter::set_filter_capacity(info_.filter_cap);
+  filter::set_colunm(info_.colunm);
 }
 
 }  // namespace zynqmp
diff --git a/lite/backends/fpga/KD/dl_engine.hpp b/lite/backends/fpga/KD/dl_engine.hpp
old mode 100644
new mode 100755
index 829f41dfebfabfe5642bd4cf107fc6c54f3ffd86..eddf5ca454cdc9e91f87d6e4f2c8dfc13f35fdc6
--- a/lite/backends/fpga/KD/dl_engine.hpp
+++ b/lite/backends/fpga/KD/dl_engine.hpp
@@ -15,7 +15,6 @@ limitations under the License. */
 #pragma once
 
 #include <stdio.h>
-
 #include "lite/backends/fpga/KD/llapi/filter.h"
 #include "lite/backends/fpga/KD/llapi/zynqmp_api.h"
 
@@ -29,8 +28,15 @@ class DLEngine {
     return s_instance;
   }
 
+  DeviceInfo& deviceInfo();
+
+  bool isZU3() { return info_.device_type / 100 == 3; }
+
+  float* out_data = nullptr;
+
  private:
   DLEngine();
+  DeviceInfo info_;
 };
 
 }  // namespace zynqmp
 }  // namespace paddle
diff --git a/lite/backends/fpga/KD/layout.hpp b/lite/backends/fpga/KD/layout.hpp
index 74819cd2120630def0114422b04efe076e1d6cb2..c6b5c911872b6b22633a4319ea708ed23c7e7e36 100644
--- a/lite/backends/fpga/KD/layout.hpp
+++ b/lite/backends/fpga/KD/layout.hpp
@@ -22,6 +22,7 @@ namespace paddle {
 namespace zynqmp {
 
 enum LayoutType {
+  None,
   N,
   NC,
   NCHW,
@@ -39,6 +40,15 @@ class Layout {
   virtual int elementCount(const std::vector<int>& dims) = 0;
 };
 
+struct None : Layout {
+  int numIndex() { return -1; }
+  int channelIndex() { return -1; }
+  int heightIndex() { return -1; }
+  int widthIndex() { return -1; }
+  int alignedElementCount(const std::vector<int>& dims) { return 16; }
+  virtual int elementCount(const std::vector<int>& dims) { return 1; }
+};
+
 struct NCHW : Layout {
   int numIndex() { return 0; }
   int channelIndex() { return 1; }
diff --git a/lite/backends/fpga/KD/llapi/bias_scale.cpp b/lite/backends/fpga/KD/llapi/bias_scale.cpp
index cd60f27f9896e857f8ad566d285a9b9aea1d4721..339a442207e811be31161ff25f60a080572efe8d 100644
--- a/lite/backends/fpga/KD/llapi/bias_scale.cpp
+++ b/lite/backends/fpga/KD/llapi/bias_scale.cpp
@@ -14,6 +14,7 @@ limitations under the License.
 */
 
 #include <string.h>
+#include "lite/backends/fpga/KD/float16.hpp"
 #include "lite/backends/fpga/KD/llapi/bias_scale.h"
 #include "lite/backends/fpga/KD/llapi/zynqmp_api.h"
 
@@ -54,7 +55,7 @@ void align_element(float **data_in, int num_per_div_before_alignment, int num) {
   *data_in = ptr_aligned;
 }
 
-void interleave(float **data_in, int num_after_alignment) {
+size_t interleave(float **data_in, int num_after_alignment) {
   float *ptr_uninterleaved = *data_in;
   float *ptr_interleaved =
       (float *)fpga_malloc(2 * num_after_alignment * sizeof(float));  // NOLINT
@@ -69,6 +70,7 @@ void interleave(float **data_in, int num_after_alignment) {
 
   fpga_free(ptr_uninterleaved);
   *data_in = ptr_interleaved;
+  return 2 * num_after_alignment * sizeof(float);
 }
 
 void format_bias_scale_array(float **bias_scale_array,
@@ -78,8 +80,9 @@ void format_bias_scale_array(float **bias_scale_array,
   int div_num = (num + element_num_per_division - 1) / element_num_per_division;
   int element_num_after_division =
       align_to_x(element_num_per_division, BS_NUM_ALIGNMENT);
-  interleave(bias_scale_array, div_num * element_num_after_division);
-  fpga_flush(*bias_scale_array, 2 * element_num_after_division * sizeof(float));
+  size_t mem =
+      interleave(bias_scale_array, div_num * element_num_after_division);
+  fpga_flush(*bias_scale_array, mem);
 }
 void format_bias_array(float **bias_array, int num) {
   float *ptr_unaligned = *bias_array;
diff --git a/lite/backends/fpga/KD/llapi/bias_scale.h b/lite/backends/fpga/KD/llapi/bias_scale.h
index 83f30df18fc7e5967d727ed8ce275d63e1cb29e0..d47d082ccdc6b41cf43860495e43076c17b13ac3 100644
--- a/lite/backends/fpga/KD/llapi/bias_scale.h
+++ b/lite/backends/fpga/KD/llapi/bias_scale.h
@@ -19,7 +19,7 @@ namespace zynqmp {
 namespace bias_scale {
 
 void align_element(float** data_in, int num_per_div_before_alignment, int num);
-void interleave(float** data_in, int num_after_alignment);
+size_t interleave(float** data_in, int num_after_alignment);
 void format_bias_scale_array(float** bias_scale_array,
                              int element_num_per_division,
                              int num);
diff --git a/lite/backends/fpga/KD/llapi/filter.cpp b/lite/backends/fpga/KD/llapi/filter.cpp
index 0e41a204a854b0b57e1a8c98fb3cc8d5224c807c..dcb7dbe8775ae66b909bfea04af8756c7f683d15 100644
--- a/lite/backends/fpga/KD/llapi/filter.cpp
+++ b/lite/backends/fpga/KD/llapi/filter.cpp
@@ -15,6 +15,8 @@ limitations under the License.
 */
 
 #include "lite/backends/fpga/KD/llapi/filter.h"
 #include <algorithm>
 #include <memory>
+#include <fstream>
+#include <string>
 #include "lite/backends/fpga/KD/float16.hpp"
 #include "lite/backends/fpga/KD/llapi/zynqmp_api.h"
 
@@ -23,11 +25,42 @@ namespace zynqmp {
 namespace filter {
 
 static int FILTER_SIZE = 2048;
+static int COLUMN = 4;
+
+void saveToFile(std::string name, void* data_in, int size) {
+  // std::ofstream ofs;
+  // ofs.open(name);
+
+  // int8_t* data = (int8_t*)data_in;
+  // for (int i = 0; i < size; i++) {
+  //   float value = data[i];
+  //   ofs << value << std::endl;
+  // }
+  // ofs.close();
+}
+
+void saveFloatToFile(std::string name, float* data_in, int size) {
+  // std::ofstream ofs;
+  // ofs.open(name);
+
+  // for (int i = 0; i < size; i++) {
+  //   float value = data_in[i];
+  //   ofs << value << std::endl;
+  // }
+  // ofs.close();
+}
 
 void set_filter_capacity(uint32_t cap) { FILTER_SIZE = cap; }
 
+void set_colunm(uint32_t column) { COLUMN = column; }
+
+// replaces the FILTER_NUM_ALIGNMENT macro from zynqmp_api.h
+int get_filter_num_alignment() { return COLUMN * 4; }
+
 int calc_division_capacity(int chw) {
-  int n = FILTER_SIZE / ((chw + 15) / 16) * 32;
+  // int n = FILTER_SIZE / ((chw + 15) / 16) * 32;
+  int filter_num_alignment = get_filter_num_alignment();
+  int n = FILTER_SIZE / ((chw + 15) / 16) * filter_num_alignment;
   return n < FILTER_SIZE ? n : FILTER_SIZE;
 }
 
@@ -52,28 +85,28 @@ int calc_num_per_div(int num, int group_num, int division_capacity) {
   }
 }
 
-void convert_to_hwc(
-    char **data_in, int num, int channel, int height, int width) {
-  char *tmp = *data_in;
+void convert_to_hwc(int8_t* chw_data,
+                    int8_t* hwc_data,
+                    int num,
+                    int channel,
+                    int height,
+                    int width) {
   int chw = channel * height * width;
-  char *data_tmp = (char *)fpga_malloc(chw * num * sizeof(char));  // NOLINT
+  int wc = width * channel;
+  int index = 0;
   for (int n = 0; n < num; n++) {
-    int64_t amount_per_row = width * channel;
     for (int c = 0; c < channel; c++) {
       for (int h = 0; h < height; h++) {
-        int64_t offset_height = h * amount_per_row;
         for (int w = 0; w < width; w++) {
-          *(data_tmp + n * chw + offset_height + w * channel + c) =
-              *((*data_in)++);
+          hwc_data[n * chw + h * wc + w * channel + c] = chw_data[index];
+          index++;
         }
       }
     }
  }
-  *data_in = data_tmp;
-  fpga_free(tmp);
 }
 
-float find_max(float *data_in, int data_size) {
+float find_max(float* data_in, int data_size) {
   float max = 0.0;
   for (int i = 0; i < data_size; ++i) {
     float value = data_in[i];
@@ -83,166 +116,194 @@ float find_max(float *data_in, int data_size) {
   return max;
 }
 
-signed char float_to_int8(float fdata) {
+int8_t float_to_int8(float fdata) {
   if (fdata < 0.0) {
     fdata -= 0.5;
   } else {
     fdata += 0.5;
   }
-  return (signed char)fdata;
+  return (int8_t)fdata;
 }
 
-void quantize(float **data_in, int data_size, float max) {
-  float *tmp = *data_in;
+void quantize(float* src, int8_t* dst, int len, float max) {
   float fix_range = 127;
   float scale = fix_range / max;
-
-  signed char *tmp_data = (signed char *)fpga_malloc(data_size * sizeof(char));
-  for (int i = 0; i < data_size; i++) {
-    tmp_data[i] = float_to_int8(
-        (*data_in)[i] * scale);  // (signed char)((*data_in)[i] * scale);
+  for (int i = 0; i < len; i++) {
+    dst[i] = float_to_int8(src[i] * scale);
   }
-  *data_in = (float *)tmp_data;  // NOLINT
-  fpga_free(tmp);
 }
 
-void align_element(char **data_in, int num, int chw) {
-  int j = 0;
+bool should_align_chw(int chw) {
   int align_chw = align_to_x(chw, FILTER_ELEMENT_ALIGNMENT);
-  if (align_chw != chw) {
-    char *tmp = *data_in;
-    char *data_tmp =
-        (char *)fpga_malloc(num * align_chw * sizeof(char));  // NOLINT
-
-    memset(data_tmp, 0, num * align_chw);
-    for (j = 0; j < num; j++) {
-      memcpy(data_tmp + j * align_chw, (*data_in) + j * chw, chw);
-    }
-    *data_in = data_tmp;
-    fpga_free(tmp);
+  return align_chw != chw;
+}
+
+void align_chw(int8_t* src, int8_t* dst, int num, int chw) {
+  int aligned_chw = align_to_x(chw, FILTER_ELEMENT_ALIGNMENT);
+  memset(dst, 0, num * aligned_chw);
+  for (int j = 0; j < num; j++) {
+    memcpy((dst + j * aligned_chw), (src + j * chw), chw);
   }
 }
 
-void align_num(char **data_in,
+void align_num(int8_t* src,
+               int8_t* dst,
                int num_per_div_before_alignment,
                int num,
-               int chw) {
-  int i = 0;
-  int align_chw = align_to_x(chw, FILTER_ELEMENT_ALIGNMENT);
+               int align_chw) {
+  int filter_num_alignment = get_filter_num_alignment();
   int num_per_div_after_alignment =
-      align_to_x(num_per_div_before_alignment, FILTER_NUM_ALIGNMENT);
+      align_to_x(num_per_div_before_alignment, filter_num_alignment);
 
-  char *tmp = *data_in;
   int div_num =
       (num + num_per_div_before_alignment - 1) / num_per_div_before_alignment;
   int num_element = div_num * num_per_div_after_alignment * align_chw;
-  char *data_tmp = (char *)fpga_malloc(num_element * sizeof(char));  // NOLINT
-
-  memset(data_tmp, 0, num_element * sizeof(char));
+  memset(dst, 0, num_element * sizeof(int8_t));
 
+  int i = 0;
   for (i = 0; i < div_num - 1; i++) {
-    memcpy(data_tmp + num_per_div_after_alignment * align_chw * i,
-           *data_in + num_per_div_before_alignment * align_chw * i,
+    memcpy(dst + num_per_div_after_alignment * align_chw * i,
+           src + num_per_div_before_alignment * align_chw * i,
            num_per_div_before_alignment * align_chw);
   }
-  memcpy(data_tmp + num_per_div_after_alignment * align_chw * i,
-         *data_in + num_per_div_before_alignment * align_chw * i,
+  memcpy(dst + num_per_div_after_alignment * align_chw * i,
+         src + num_per_div_before_alignment * align_chw * i,
         (num - (div_num - 1) * num_per_div_before_alignment) * align_chw);
-
-  *data_in = data_tmp;
-  fpga_free(tmp);
 }
 
-void reorder(char **data_in, int num_after_alignment, int chw) {
+void reorder(int8_t* src, int8_t* dst, int num_after_alignment, int chw) {
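+  // Gathers whole filters in a swizzled order (groups of four within each
+  // half of a filter_num_alignment block) ahead of the pairwise interleave
+  // performed below.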
   int index = 0;
   int new_index = 0;
-
+  int filter_num_alignment = get_filter_num_alignment();
   int chw_align = align_to_x(chw, FILTER_ELEMENT_ALIGNMENT);
-
-  char *data_tmp =
-      (char *)fpga_malloc(chw_align * num_after_alignment *  // NOLINT
-                          sizeof(char));
-  char *tmp = *data_in;
   for (index = 0; index < num_after_alignment; index++) {
-    new_index = index / 32 * 32 + (index % 16 / 4 * 8) + (index % 16 % 4) +
-                (index / 16 % 2 * 4);
-    memcpy(data_tmp + index * chw_align,
-           *data_in + new_index * chw_align,
-           chw_align);
+    new_index = index / filter_num_alignment * filter_num_alignment +
+                (index % (filter_num_alignment / 2) / 4 * 8) +
+                (index % (filter_num_alignment / 2) % 4) +
+                (index / (filter_num_alignment / 2) % 2 * 4);
+    memcpy((dst + index * chw_align), (src + new_index * chw_align), chw_align);
   }
-  *data_in = data_tmp;
-  fpga_free(tmp);
 }
 
-size_t interleave(char **data_in, int num_after_alignment, int chw) {
-  int i = 0;
-  int j = 0;
-  int k = 0;
+void interleave(int8_t* src, int8_t* dst, int num_after_alignment, int chw) {
   int interleave_per_num = 16;
-
   int chw_align = align_to_x(chw, FILTER_ELEMENT_ALIGNMENT);
-  char *data_tmp =
-      (char *)fpga_malloc(chw_align * num_after_alignment *  // NOLINT
-                          sizeof(char));
-  char *tmp = *data_in;
   int interleave_num = chw_align * 2 / interleave_per_num;
-  for (i = 0; i < num_after_alignment; i += 2) {
-    for (j = 0, k = 0; j < interleave_num; j += 2, k++) {
-      memcpy(data_tmp + i * chw_align + interleave_per_num * j,
-             *data_in + i * chw_align + interleave_per_num * k,
+  for (int i = 0; i < num_after_alignment; i += 2) {
+    for (int j = 0, k = 0; j < interleave_num; j += 2, k++) {
+      memcpy(dst + i * chw_align + interleave_per_num * j,
+             src + i * chw_align + interleave_per_num * k,
              interleave_per_num);
-      memcpy(data_tmp + i * chw_align + interleave_per_num * (j + 1),
-             *data_in + (i + 1) * chw_align + interleave_per_num * k,
+      memcpy(dst + i * chw_align + interleave_per_num * (j + 1),
+             src + (i + 1) * chw_align + interleave_per_num * k,
              interleave_per_num);
     }
   }
-  *data_in = data_tmp;
-  fpga_free(tmp);
-  return chw_align * num_after_alignment;
 }
 
-size_t format_filter(float **data_in,
-                     int num,
-                     int channel,
-                     int height,
-                     int width,
-                     int group_num,
-                     float max) {
+int8_t* format_filter(float* data_in,
+                      int& mem_size_a,  // NOLINT
+                      int num,
+                      int channel,
+                      int height,
+                      int width,
+                      int group_num,
+                      float max,
+                      std::vector<float>& filter_max) {  // NOLINT
   int data_size = channel * height * width * num;
   int chw = channel * height * width;
   int division_capacity = calc_division_capacity(chw);
+  int filter_num_alignment = get_filter_num_alignment();
   int num_per_div_before_alignment =
       calc_num_per_div(num, group_num, division_capacity);
   int num_per_div_after_alignment =
-      align_to_x(num_per_div_before_alignment, FILTER_NUM_ALIGNMENT);
+      align_to_x(num_per_div_before_alignment, filter_num_alignment);
   int div_num =
       (num + num_per_div_before_alignment - 1) / num_per_div_before_alignment;
+  // int num_after_alignment = num_per_div_after_alignment * div_num;
   int residual = num % num_per_div_before_alignment;
   int num_after_alignment = num_per_div_after_alignment *
                                 ((residual == 0) ? div_num : (div_num - 1)) +
-                            align_to_x(residual, FILTER_NUM_ALIGNMENT);
-  quantize(data_in, data_size, max);
-  char **quantize_data = (char **)data_in;  // NOLINT
-  convert_to_hwc(quantize_data, num, channel, height, width);
-  align_element(quantize_data, num, chw);
-  if (num_after_alignment != num) {
-    align_num(quantize_data, num_per_div_before_alignment, num, chw);
+                            align_to_x(residual, filter_num_alignment);
+
+  // saveFloatToFile("quantize_before", data_in, data_size);
+
+  int8_t* quantized_data =
+      reinterpret_cast<int8_t*>(fpga_malloc(data_size * sizeof(int8_t)));
+
+  for (int n = 0; n < num; n++) {
+    float* filter_start = data_in + n * chw;
+    float f_max = find_max(filter_start, chw);
+    int8_t* quantized_start = quantized_data + n * chw;
+    // quantize(filter_start, quantized_start, chw, f_max);
+    quantize(filter_start, quantized_start, chw, max);
+    // filter_max.push_back(f_max);
+    filter_max.push_back(max);
   }
-  reorder(quantize_data, num_after_alignment, chw);
-  size_t mem_size = interleave(quantize_data, num_after_alignment, chw);
-  fpga_flush(*quantize_data,
+  // saveToFile("chw.txt", quantized_data, data_size);
+
+  int8_t* hwc_data =
+      reinterpret_cast<int8_t*>(fpga_malloc(data_size * sizeof(int8_t)));
+  convert_to_hwc(quantized_data, hwc_data, num, channel, height, width);
+  fpga_free(quantized_data);
+
+  // saveToFile("hwc.txt", hwc_data, data_size);
+
+  int8_t* temp_data = hwc_data;  // NOLINT
+  int chw_aligned = align_to_x(chw, FILTER_ELEMENT_ALIGNMENT);
+  if (should_align_chw(chw)) {
+    int8_t* hwc_aligned_data = reinterpret_cast<int8_t*>(
+        fpga_malloc(num * chw_aligned * sizeof(int8_t)));
+    align_chw(hwc_data, hwc_aligned_data, num, chw);
+
+    // saveToFile("align_el.txt", hwc_aligned_data, data_size * 2);
+    temp_data = hwc_aligned_data;
+    fpga_free(hwc_data);
+  }
+  if (num_after_alignment != num) {
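+    // The filter count was padded up to the device alignment, so each
+    // division's filters are spread out to aligned offsets with the padded
+    // tail slots zero-filled.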
+    int filter_num_alignment = get_filter_num_alignment();
+    int num_per_div_after_alignment =
+        align_to_x(num_per_div_before_alignment, filter_num_alignment);
+    // int div_num =
+    //     (num + num_per_div_before_alignment - 1) /
+    //     num_per_div_before_alignment;
+    int num_element = div_num * num_per_div_after_alignment * chw_aligned;
+    int8_t* num_aligned_data =
+        reinterpret_cast<int8_t*>(fpga_malloc(num_element * sizeof(int8_t)));
+    align_num(temp_data,
+              num_aligned_data,
+              num_per_div_before_alignment,
+              num,
+              chw_aligned);
+
+    // saveToFile("align_num.txt", num_aligned_data, data_size * 8);
+    fpga_free(temp_data);
+    temp_data = num_aligned_data;
+  }
+  int8_t* aligned_data =
+      reinterpret_cast<int8_t*>(fpga_malloc(num_after_alignment * chw_aligned));
+  reorder(temp_data, aligned_data, num_after_alignment, chw);
+  // saveToFile("reorder.txt", aligned_data, data_size * 8);
+  fpga_free(temp_data);
+  int8_t* interleaved_data =
+      reinterpret_cast<int8_t*>(fpga_malloc(num_after_alignment * chw_aligned));
+  interleave(aligned_data, interleaved_data, num_after_alignment, chw);
+  // saveToFile("interleave.txt", interleaved_data, data_size * 8);
+  fpga_free(aligned_data);
+  fpga_flush(interleaved_data,
             align_to_x(chw, FILTER_ELEMENT_ALIGNMENT) * num_after_alignment *
                 sizeof(char));
-  return mem_size;
+  mem_size_a = num_after_alignment * chw_aligned;
+  return interleaved_data;
 }
 
-void convert_to_hwn(int16_t **data_in, int num, int height, int width) {
-  int16_t *tmp = *data_in;
-  int16_t *data_tmp =
-      (int16_t *)fpga_malloc(height * width * num * sizeof(int16_t));  // NOLINT
+void convert_to_hwn(int16_t** data_in, int num, int height, int width) {
+  int16_t* tmp = *data_in;
+  int16_t* data_tmp =
+      (int16_t*)fpga_malloc(height * width * num * sizeof(int16_t));  // NOLINT
   for (int n = 0; n < num; n++) {
     for (int h = 0; h < height; h++) {
       for (int w = 0; w < width; w++) {
@@ -254,16 +315,16 @@ void convert_to_hwn(int16_t **data_in, int num, int height, int width) {
   fpga_free(tmp);
 }
 
-size_t align_element_n(int16_t **data_in, int num, int height, int width) {
+size_t align_element_n(int16_t** data_in, int num, int height, int width) {
   int unalign_n = num;
   int align_n = align_to_x(num, FILTER_ELEMENT_ALIGNMENT);
   int num_element = height * width * align_n;
   if (unalign_n != align_n) {
-    int16_t *tmp = *data_in;
+    int16_t* tmp = *data_in;
 
     int num_element = height * width * align_n;
-    int16_t *data_tmp =
-        (int16_t *)fpga_malloc(num_element * sizeof(int16_t));  // NOLINT
+    int16_t* data_tmp =
+        (int16_t*)fpga_malloc(num_element * sizeof(int16_t));  // NOLINT
 
     memset(data_tmp, 0, num_element * sizeof(int16_t));
     for (int h = 0; h < height; h++) {
@@ -276,17 +337,37 @@ size_t align_element_n(int16_t **data_in, int num, int height, int width) {
       }
     }
     *data_in = data_tmp;
-    free(tmp);
+    fpga_free(tmp);
   }
   return num_element * sizeof(int16_t);
 }
 
+void to_fp16(float* src,
+             float16* dst,
+             int num,
+             int height,
+             int width,
+             float* scale_ptr) {
+  int size = num * height * width;
+  for (int n = 0; n < num; n++) {
+    float scale_val = scale_ptr[n];
+    for (int h = 0; h < height; h++) {
+      for (int w = 0; w < width; w++) {
+        int index = n * height * width + h * width + w;
+        float value = src[index] * scale_val;
+        dst[index] = float_to_half(value);
+      }
+    }
+  }
+  fpga_flush(dst, size * sizeof(int16_t));
+}
+
 void quantize_to_fp16(
-    float **data_in, int num, int height, int width, float *scale_ptr) {
-  float *tmp = *data_in;
+    float** data_in, int num, int height, int width, float* scale_ptr) {
+  float* tmp = *data_in;
   int size = num * height * width;
-  float16 *tmp_data = (float16 *)fpga_malloc(size * sizeof(float16));  // NOLINT
+  float16* tmp_data = (float16*)fpga_malloc(size * sizeof(float16));  // NOLINT
   for (int n = 0; n < num; n++) {
     float scale_val = scale_ptr[n];
     for (int h = 0; h < height; h++) {
@@ -298,13 +379,14 @@ void quantize_to_fp16(
     }
   }
   fpga_flush(tmp_data, size * sizeof(int16_t));
-  *data_in = (float *)tmp_data;  // NOLINT
+  *data_in = (float*)tmp_data;  // NOLINT
   fpga_free(tmp);
 }
 
 size_t format_dwconv_filter(
-    float **data_in, int num, int height, int width, float *scale_ptr) {
+    float** data_in, int num, int height, int width, float* scale_ptr) {
   quantize_to_fp16(data_in, num, height, width, scale_ptr);
-  int16_t **quantize_data = (int16_t **)data_in;  // NOLINT
+  int16_t** quantize_data = reinterpret_cast<int16_t**>(data_in);
+
   convert_to_hwn(quantize_data, num, height, width);
   size_t size = align_element_n(quantize_data, num, height, width);
   fpga_flush(*quantize_data,
diff --git a/lite/backends/fpga/KD/llapi/filter.h b/lite/backends/fpga/KD/llapi/filter.h
index 7d9c6c2e015250cbcba2d1dba71b7c1f3554d9f0..90093fe05b30150d6a8f7cc21e9bf7b4eb736ff9 100644
--- a/lite/backends/fpga/KD/llapi/filter.h
+++ b/lite/backends/fpga/KD/llapi/filter.h
@@ -18,38 +18,35 @@ limitations under the License. */
 
 #include <stdint.h>
 #include <cstddef>
+#include <vector>
+
 namespace paddle {
 namespace zynqmp {
 namespace filter {
 
 void set_filter_capacity(uint32_t cap);
+void set_colunm(uint32_t column);
+int get_filter_num_alignment();
 int calc_division_capacity(int chw);
 int calc_split_num(int num, int division_capacity);
 int calc_division_number(int num, int group_num, int division_capacity);
 int calc_num_per_div(int num, int group_num, int division_capacity);
-void convert_to_hwc(
-    char** data_in, int num, int channel, int height, int width);
+
 float find_max(float* data_in, int data_size);
-void quantize(float** data_in, int data_size, float max);
-void align_element(char** data_in, int num, int chw);
-void align_num(char** data_in,
-               int num_per_div_before_alignment,
-               int num,
-               int chw);
-void reorder(char** data_in, int num_after_alignment, int chw);
-size_t interleave(char** data_in, int num_after_alignment, int chw);
-size_t format_filter(float** data_in,
-                     int num,
-                     int channel,
-                     int height,
-                     int width,
-                     int group_num,
-                     float max);
+int8_t* format_filter(float* data_in,
+                      int& mem_size,  // NOLINT
+                      int num,
+                      int channel,
+                      int height,
+                      int width,
+                      int group_num,
+                      float max,  // NOLINT
+                      std::vector<float>& filter_max);  // NOLINT
 
 void convert_to_hwn(int16_t** data_in, int num, int height, int width);
 size_t align_element_n(int16_t** data_in, int num, int height, int width);
-void quantize_to_fp16(
-    float** data_in, int num, int height, int width, float* scale_ptr);
+// void quantize_to_fp16(float** data_in, int num, int height, int width,
+//                       float* scale_ptr);
 size_t format_dwconv_filter(
     float** data_in, int num, int height, int width, float* scale_ptr);
diff --git a/lite/backends/fpga/KD/llapi/zynqmp_api.cpp b/lite/backends/fpga/KD/llapi/zynqmp_api.cpp
old mode 100644
new mode 100755
index 1f1226ead3d4e9b50100f4de574104a5d6f777b2..2f29e5c1b539f47f5650928e14e8180c26414860
--- a/lite/backends/fpga/KD/llapi/zynqmp_api.cpp
+++ b/lite/backends/fpga/KD/llapi/zynqmp_api.cpp
@@ -23,13 +23,12 @@ limitations under the License.
 */
 
 #include <map>
 #include <utility>
-#include "lite/backends/fpga/KD/llapi/config.h"
 #include "lite/backends/fpga/KD/llapi/zynqmp_api.h"
 
 namespace paddle {
 namespace zynqmp {
 
-#define PADDLE_LITE_OS_LINUX
+#define PADDLE_OS_LINUX
 
 static int fd = -1;
 static const char *device_path = "/dev/fpgadrv0";
@@ -39,14 +38,10 @@ static size_t memory_size_max = 0;
 static size_t memory_size = 0;
 
 static inline int do_ioctl(uint64_t req, const void *arg) {
-  int ret = -1;
-#ifdef PADDLE_LITE_OS_LINUX
-  ret = ioctl(fd, req, arg);
-  if (ret != 0) {
-    throw -1;
-  }
+#ifdef PADDLE_OS_LINUX
+  return ioctl(fd, req, arg);
 #else
-  return ret;
+  return -1;
 #endif
 }
 
@@ -66,7 +61,10 @@ void reset_device() {
 
 // memory management;
 void *fpga_malloc(size_t size) {
-#ifdef PADDLE_LITE_OS_LINUX
+#ifdef ENABLE_DEBUG
+// std::cout << "fpga_malloc:" << size << std::endl;
+#endif
+#ifdef PADDLE_OS_LINUX
   void *ptr = reinterpret_cast<void *>(
       mmap64(NULL, size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0));
   if (ptr == NULL) {
@@ -105,11 +103,8 @@ void fpga_free(void *ptr) {
     size = iter->second;
     memory_map.erase(iter);
   }
-
   memory_size -= size;
-
-#ifdef PADDLE_LITE_OS_LINUX
-
+#ifdef PADDLE_OS_LINUX
   munmap(ptr, size);
 #else
   free(ptr);
@@ -150,6 +145,11 @@ void fpga_copy(void *dest, const void *src, size_t num) {
   memcpy(dest, src, num);
 }
 
+int fpga_reset() {
+  struct FpgaResetArgs args;
+  return do_ioctl(IOCTL_FPGA_RESET, &args);
+}
+
 int ioctl_conv(const struct ConvArgs &args) {
   return do_ioctl(IOCTL_CONFIG_CONV, &args);
 }
@@ -166,7 +166,6 @@ int compute_fpga_conv(const struct SplitConvArgs &args) {
   }
 
   if (split_num > 1) {
-    std::cout << "Split num > 1 !!!!!!!!!!!!!!!!!!" << std::endl;
     exit(-1);
   }
   return ret;
@@ -186,6 +185,7 @@ int get_device_info(const struct DeviceInfo &args) {
 }
 
 int perform_bypass(const struct BypassArgs &args) {
+  int ret = -1;
   int size = args.image.channels * args.image.width * args.image.height;
   int max_size = 1 << 21;
 
@@ -213,7 +213,7 @@ int perform_bypass(const struct BypassArgs &args) {
         reinterpret_cast<void *>(input_address + i * max_size * type_size);
     bypassArgs.output.address =
         reinterpret_cast<void *>(output_address + i * max_size * out_type_size);
-    int ret = do_ioctl(IOCTL_CONFIG_BYPASS, &bypassArgs);
+    ret = do_ioctl(IOCTL_CONFIG_BYPASS, &bypassArgs);
     scale = std::max(scale, scales[0]);
 
     if (ret != 0) {
@@ -222,13 +222,15 @@ int perform_bypass(const struct BypassArgs &args) {
   }
 
   int remainder = size - max_size * count;
-  bypassArgs.image.channels = remainder;
-  bypassArgs.image.address =
-      reinterpret_cast<void *>(input_address + count * max_size * type_size);
-  bypassArgs.output.address = reinterpret_cast<void *>(
-      output_address + count * max_size * out_type_size);
-  int ret = do_ioctl(IOCTL_CONFIG_BYPASS, &bypassArgs);
-  scale = std::max(scale, scales[0]);
+  if (remainder > 0) {
+    bypassArgs.image.channels = remainder;
+    bypassArgs.image.address =
+        reinterpret_cast<void *>(input_address + count * max_size * type_size);
+    bypassArgs.output.address = reinterpret_cast<void *>(
+        output_address + count * max_size * out_type_size);
+    ret = do_ioctl(IOCTL_CONFIG_BYPASS, &bypassArgs);
+    scale = std::max(scale, scales[0]);
+  }
   args.output.scale_address[0] = scale;
   args.output.scale_address[1] = 1.0f / scale;
   return ret;
@@ -237,52 +239,21 @@ int perform_bypass(const struct BypassArgs &args) {
 int compute_fpga_concat(const struct ConcatArgs &args) { return -1; }
 
 int compute_fpga_scale(const struct ScaleArgs &args) {
-#ifdef ENABLE_DEBUG
-  std::cout << "======Compute Scale======";
-  std::cout << "scale_address:" << args.scale_address << std::endl;
"bias_address:" << args.bias_address << std::endl; - - std::cout << "wc_alignment:" << args.wc_alignment << std::endl; - std::cout << "channel_alignment:" << args.channel_alignment << std::endl; - - std::cout << " image_address:" << args.image.address - << " image_scale_address:" << args.image.scale_address - << " image_channels:" << args.image.channels - << " image_height:" << args.image.height - << " image_width:" << args.image.width - << " pad_height:" << args.image.pad_height - << " pad_width:" << args.image.pad_width; - - std::cout << " out_address:" << args.output.address - << " out_scale_address:" << args.output.scale_address; - -#endif return do_ioctl(IOCTL_CONFIG_SCALE, &args); } int compute_fpga_dwconv(const struct DWconvArgs &args) { -#ifdef ENABLE_DEBUG - std::cout << "======Compute Basic Conv======"; - std::cout << " relu_enabled:" << args.relu_enabled - << " filter_address:" << args.filter_address; - std::cout << " image_address:" << args.image.address - << " image_scale_address:" << args.image.scale_address - << " image_channels:" << args.image.channels - << " image_height:" << args.image.height - << " image_width:" << args.image.width - << " pad_height:" << args.image.pad_height - << " pad_width:" << args.image.pad_width; - std::cout << " kernel_height:" << args.kernel.height - << " kernel_width:" << args.kernel.width - << " stride_h:" << args.kernel.stride_h - << " stride_w:" << args.kernel.stride_w; - std::cout << " out_address:" << args.output.address - << " out_scale_address:" << args.output.scale_address; - -#endif return do_ioctl(IOCTL_CONFIG_DWCONV, &args); } +int config_activation(const struct ActiveParamterArgs &args) { + return do_ioctl(IOCTL_CONFIG_ACTIVATION_PARAMETER, &args); +} + +// int config_power(const struct PowerArgs& args) { +// return do_ioctl(IOCTL_CONFIG_POWER, &args); +// } + int config_inplace(const struct InplaceArgs &args) { return do_ioctl(IOCTL_CONFIG_INPLACE, &args); } diff --git a/lite/backends/fpga/KD/llapi/zynqmp_api.h b/lite/backends/fpga/KD/llapi/zynqmp_api.h old mode 100644 new mode 100755 index 7d22de95a2272862c6fe781295bdaab7177a92fe..e00507e3247a70caf0dd57f5ed5b20d9ebbffd77 --- a/lite/backends/fpga/KD/llapi/zynqmp_api.h +++ b/lite/backends/fpga/KD/llapi/zynqmp_api.h @@ -14,6 +14,9 @@ limitations under the License. 
 */
 #pragma once
 
+#ifndef PADDLE_MOBILE_SRC_FPGA_KD_ZYNQMP_API_H
+#define PADDLE_MOBILE_SRC_FPGA_KD_ZYNQMP_API_H
+
 #include <stdint.h>
 #include <cstddef>
 #include <iostream>
@@ -40,6 +43,13 @@ enum DLayoutType {
   LAYOUT_HWC = 0,
 };
 
+enum ActiveType {
+  TYPE_RELU = 0,
+  TYPE_RELU6 = 1,
+  TYPE_LEAK_RELU = 2,
+  TYPE_SIGMOID = 3,
+};
+
 struct VersionArgs {
   void* buffer;
 };
@@ -48,7 +58,7 @@ struct DeviceInfo {
   uint32_t filter_cap;
   uint32_t version;
   uint16_t device_type;
-  uint32_t reserved0;
+  uint32_t colunm;
   uint32_t reserved1;
   uint32_t reserved2;
   uint32_t reserved3;
@@ -108,6 +118,7 @@ struct ConvArgs {
   void* filter_scale_address;
   uint32_t filter_num;
   uint32_t group_num;
+  uint32_t dilation;
 
   struct KernelArgs kernel;
   struct ImageInputArgs image;  // input image;
@@ -199,9 +210,16 @@ struct NormalizeParameterArgs {
   uint32_t hight_width;
 };
 
+struct ActiveParamterArgs {
+  ActiveType type;
+  uint16_t leaky_relu_factor;
+};
+
 struct InplaceArgs {
   bool leaky_relu_enable;
   bool relu_enable;
+  bool sigmoid_enable;
+  bool relu6_enable;
   bool power_enable;
   bool normalize_enable;
 };
@@ -216,7 +234,9 @@ struct FpgaRegReadArgs {
   uint64_t value;
 };
 
-struct FpgaResetArgs {};
+struct FpgaResetArgs {
+  uint32_t val;
+};
 
 #define IOCTL_FPGA_MAGIC (('F' + 'P' + 'G' + 'A') / 4)
 
@@ -248,6 +268,8 @@ struct FpgaResetArgs {};
   _IOW(IOCTL_FPGA_MAGIC, 41, struct PowerParameterArgs)
 #define IOCTL_CONFIG_NORMALIZE_PARAMETER \
   _IOW(IOCTL_FPGA_MAGIC, 42, struct NormalizeParameterArgs)
+#define IOCTL_CONFIG_ACTIVATION_PARAMETER \
+  _IOW(IOCTL_FPGA_MAGIC, 43, struct ActiveParamterArgs)
 #define IOCTL_FPGA_REG_READ _IOW(IOCTL_FPGA_MAGIC, 50, struct FpgaRegReadArgs)
 #define IOCTL_FPGA_REG_WRITE _IOW(IOCTL_FPGA_MAGIC, 51, struct FpgaRegWriteArgs)
 #define IOCTL_FPGA_RESET _IOW(IOCTL_FPGA_MAGIC, 52, struct FpgaResetArgs)
@@ -331,6 +353,7 @@ int compute_fpga_scale(const struct ScaleArgs& args);
 int compute_fpga_concat(const struct ConcatArgs& args);
 int compute_fpga_resize(const struct ResizeArgs& args);
 
+int config_activation(const struct ActiveParamterArgs& args);
 int config_power(const struct PowerArgs& args);
 int compute_fpga_dwconv(const struct DWconvArgs& args);
 int config_norm_param(const struct NormalizeParameterArgs& args);
@@ -341,7 +364,11 @@ int config_inplace(const struct InplaceArgs& args);
 int flush_cache(void* addr, int size);
 int invalidate_cache(void* addr, int size);
 
+int fpga_reset();
+
 int16_t fp32_2_fp16(float fp32_num);
 float fp16_2_fp32(int16_t fp16_num);
 }  // namespace zynqmp
 }  // namespace paddle
+
+#endif  // PADDLE_MOBILE_SRC_FPGA_KD_ZYNQMP_API_H
diff --git a/lite/backends/fpga/KD/pe.hpp b/lite/backends/fpga/KD/pe.hpp
index d1dc3c4caa18cbfeba74fac26cca9e19230e2c21..2796124341012574dc719ae9f30633d1d9524680 100644
--- a/lite/backends/fpga/KD/pe.hpp
+++ b/lite/backends/fpga/KD/pe.hpp
@@ -32,6 +32,5 @@ class PE {
 
   virtual ~PE() {}
 };
-
 }  // namespace zynqmp
 }  // namespace paddle
diff --git a/lite/backends/fpga/KD/pe_params.hpp b/lite/backends/fpga/KD/pe_params.hpp
index 709f04d399793c6f21c34fc1265f7ed8b5818314..9dc295a58d4bbfd50a0b9ecbdb06a22c8900cef7 100644
--- a/lite/backends/fpga/KD/pe_params.hpp
+++ b/lite/backends/fpga/KD/pe_params.hpp
@@ -15,6 +15,7 @@ limitations under the License.
 */
 #pragma once
 
 #include <iostream>
+#include <string>
 #include <vector>
 
 #include "lite/backends/fpga/KD/llapi/zynqmp_api.h"
@@ -26,6 +27,7 @@ namespace zynqmp {
 struct ReLUParam {
  public:
   bool enabled = false;
+  float leaky_relu_factor = 0.0f;
 };
 
 struct PEParam {
@@ -98,6 +100,24 @@ struct DepthwiseConvParam : ConvParam {
   Tensor* quantizedFilter_ = new Tensor();
 };
 
+struct GRUParam : PEParam {
+ public:
+  Tensor* input = nullptr;
+  Tensor* h0 = nullptr;
+  Tensor* weight = nullptr;
+  Tensor* bias = nullptr;
+
+  Tensor* batch_gate = nullptr;
+  Tensor* batch_reset_hidden_prev = nullptr;
+  Tensor* batch_hidden = nullptr;
+  Tensor* hidden = nullptr;
+
+  std::string gate_activation = "sigmoid";
+  std::string activation = "tanh";
+  bool is_reverse = false;
+  bool origin_mode = false;
+};
+
 enum PoolingType : int {
   MAX = 0,
   AVERAGE = 1,
@@ -133,6 +153,12 @@ struct ElementwiseAddParam : PEParam {
   EWAddArgs ewargs;
 };
 
+struct ElementwiseMulParam : PEParam {
+ public:
+  std::vector<Tensor*> inputs;
+  Tensor* output = nullptr;
+};
+
 struct FullyConnectedParam : PEParam {
  public:
   Tensor* input = nullptr;
diff --git a/lite/backends/fpga/KD/pes/conv_pe.hpp b/lite/backends/fpga/KD/pes/conv_pe.hpp
old mode 100644
new mode 100755
index e897f82280fa57f904bd7c749e371d8ec9219b51..ca894bdc242faf58760743a98b16a40e10a7fc82
--- a/lite/backends/fpga/KD/pes/conv_pe.hpp
+++ b/lite/backends/fpga/KD/pes/conv_pe.hpp
@@ -15,6 +15,7 @@ limitations under the License. */
 #pragma once
 
 #include <algorithm>
+#include <string>
 #include <vector>
 
 #include "lite/backends/fpga/KD/pe.hpp"
@@ -49,7 +50,111 @@ class ConvPE : public PE {
       concatPE_.init();
       concatPE_.apply();
     }
+
+    if (DLEngine::get_instance().isZU3() &&
+        param_.input->shape().dimSize() == 4 &&
+        param_.input->shape().width() == 1 &&
+        param_.input->shape().width() >= 2048) {
+      use_cpu_ = true;
+    }
+
+    if (param_.filter->shape().width() == 1 &&
+        param_.filter->shape().height() == 1) {
+      // use_cpu_ = true;
+    }
+    if (!use_cpu_) {
+      // param_.filter->releaseData();
+    }
   }
+
+  void cpu_conv_hwc() {
+    Tensor* input = param_.input;
+    Tensor* output = param_.output;
+    input->syncToCPU();
+
+    Tensor float_input;
+    Tensor float_output;
+    float* image_addr = float_input.mutableData<float>(FP32, input->shape());
+    float_input.copyFrom(input);
+    float_input.syncToCPU();
+    float* out = float_output.mutableData<float>(FP32, output->shape());
+
+    int out_width = output->shape().width();
+    int out_channel = output->shape().channel();
+    int in_channel = input->shape().channel();
+
+    float* filter_data = param_.filter->data<float>();
+
+    int image_height = input->shape().height();
+    int image_width = input->shape().width();
+    int image_channels = input->shape().channel();
+    int image_pad_h = param_.paddings[0];
+    int image_pad_w = param_.paddings[1];
+    int kernel_height = param_.filter->shape().height();
+    int kernel_width = param_.filter->shape().width();
+    int kernel_step_h = param_.strides[0];
+    int kernel_step_w = param_.strides[1];
+    // int out_channel = param_.strides[1];
+    int pooled_height_ = output->shape().height();
+    int pooled_width_ = out_width;
+    int filter_chw = image_channels * kernel_height * kernel_width;
+
+    float max = 0;
+
+    for (int ph = 0; ph < pooled_height_; ph++) {
+      for (int pw = 0; pw < pooled_width_; pw++) {
+        int hstart = ph * kernel_step_h - image_pad_h;
+        int wstart = pw * kernel_step_w - image_pad_w;
+        int hend =
+            std::min(hstart + kernel_height, static_cast<int>(image_height));
+        int wend =
+            std::min(wstart + kernel_width, static_cast<int>(image_width));
+        hstart = std::max(hstart, static_cast<int>(0));
+        wstart = std::max(wstart, static_cast<int>(0));
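+        // Accumulate input * filter over the clipped kernel window for each
+        // output channel at this (ph, pw) position; both buffers are HWC.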
+        for (int oc = 0; oc < out_channel; oc++) {
+          float sum = 0.0f;
+          const int pool_index = (ph * pooled_width_ + pw) * out_channel + oc;
+          for (int c = 0; c < image_channels; c++) {
+            for (int h = hstart; h < hend; h++) {
+              int hi = 0;
+              if (ph == 0) {
+                hi = h - hstart + image_pad_h;
+              } else {
+                hi = h - hstart;
+              }
+              for (int w = wstart; w < wend; w++) {
+                int wi = 0;
+                if (pw == 0) {
+                  wi = w - wstart + image_pad_w;
+                } else {
+                  wi = w - wstart;
+                }
+                const int index = (h * image_width + w) * image_channels + c;
+                int weight_index = oc * filter_chw +
+                                   kernel_width * kernel_height * c +
+                                   kernel_width * hi + wi;
+                float value = image_addr[index] * filter_data[weight_index];
+                sum += value;
+              }
+            }
+          }
+
+          if (param_.relu.enabled && sum < 0) {
+            sum = -sum;
+          }
+          if (sum > max) {
+            max = sum;
+          }
+          out[pool_index] = sum;
+        }
+      }
+    }
+    float_output.flush();
+    output->copyFrom(&float_output);
+    output->scale()[0] = max / 127;
+    output->scale()[1] = 127 / max;
+  }
+
   void cpu_compute() {
     Tensor* input = param_.input;
     Tensor* output = param_.output;
@@ -59,43 +164,79 @@ class ConvPE : public PE {
     Tensor float_output;
     float* image_addr = float_input.mutableData<float>(FP32, input->shape());
     float_input.copyFrom(input);
+    float_input.syncToCPU();
+
     float* out = float_output.mutableData<float>(FP32, output->shape());
+    float* bias_data = param_.bias()->data<float>();
+
+    int out_width = output->shape().width();
     int out_channel = output->shape().channel();
     int in_channel = input->shape().channel();
 
     float* filter_data = param_.filter->data<float>();
     float* mi = new float[in_channel];
+
+    float max = 0;
+    int out_index = 0;
     for (int i = 0; i < out_channel; i++) {
       float* image = image_addr;
       float* filter_ptr = filter_data + i * in_channel;
       float* out_ptr = mi;
-#pragma omp parallel for
-      for (int j = 0; j < in_channel; j++) {
-        float value = image_addr[j] * filter_ptr[j];
-        mi[j] = value;
-      }
 
-      float sum = 0;
-      for (int j = 0; j < in_channel; j++) {
-        sum += mi[j];
+      for (int h = 0; h < output->shape().height(); h++) {
+        for (int w = 0; w < output->shape().width(); w++) {
+          float sum = 0;
+
+          // #pragma omp parallel for
+          for (int j = 0; j < in_channel; j++) {
+            int image_index = h * out_width * in_channel + w * in_channel + j;
+            float value = image_addr[image_index] * filter_ptr[j];
+            sum += value;
+          }
+
+          sum += bias_data[i];
+
+          if (param_.relu.enabled && sum < 0) {
+            sum = 0;
+          }
+          if (sum > max) {
+            max = sum;
+          }
+          out_index = h * out_width * out_channel + w * out_channel + i;
+          out[out_index] = sum;
+          // out_index++;
+        }
       }
-      out[i] = sum;
     }
     delete[] mi;
     float_output.flush();
     output->copyFrom(&float_output);
+    output->scale()[0] = max / 127;
+    output->scale()[1] = 127 / max;
   }
 
   bool dispatch() {
-    inplace_.relu_enable = param_.relu.enabled;
+    if (use_cpu_) {
+      cpu_compute();
+      return true;
+    }
+
+    inplace_.leaky_relu_enable =
+        (param_.relu.leaky_relu_factor != 0) ? true : false;
+    inplace_.relu_enable =
+        inplace_.leaky_relu_enable ? false : param_.relu.enabled;
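+    // Leaky ReLU takes precedence over plain ReLU; it is programmed through
+    // the activation-parameter ioctl below rather than the inplace flags
+    // alone.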
+    inplace_.power_enable = false;
     inplace_.normalize_enable = false;
-
-    if (param_.relu.enabled) {
-      inplace_.relu_enable = param_.relu.enabled;
+    if (inplace_.relu_enable || inplace_.leaky_relu_enable) {
       config_inplace(inplace_);
+      if (inplace_.leaky_relu_enable) {
+        activeParamterArgs.type = TYPE_LEAK_RELU;
+        activeParamterArgs.leaky_relu_factor =
+            fp32_2_fp16(param_.relu.leaky_relu_factor);
+        config_activation(activeParamterArgs);
+      }
     }
 
     std::vector<BasicConvParam*>& params = param_.splitParams();
@@ -104,9 +245,16 @@ class ConvPE : public PE {
       ret |= compute_fpga_conv_basic(conv_param->args);
     }
 
-    if (param_.relu.enabled) {
+    if (inplace_.relu_enable || inplace_.leaky_relu_enable) {
       inplace_.relu_enable = false;
+      inplace_.leaky_relu_enable = false;
       config_inplace(inplace_);
+
+      if (inplace_.leaky_relu_enable) {
+        activeParamterArgs.type = TYPE_LEAK_RELU;
+        activeParamterArgs.leaky_relu_factor = fp32_2_fp16(0);
+        config_activation(activeParamterArgs);
+      }
     }
 
     size_t size = params.size();
@@ -127,11 +275,13 @@ class ConvPE : public PE {
   ConvParam& param() { return param_; }
 
  private:
+  bool use_cpu_ = false;
   ConvParam param_;
   ConcatPE concatPE_;
   ElementwiseAddPE addPE_;
   int split_axis = 0;
   InplaceArgs inplace_ = {0};
+  ActiveParamterArgs activeParamterArgs;
 };
 
 }  // namespace zynqmp
diff --git a/lite/backends/fpga/KD/pes/conv_process.hpp b/lite/backends/fpga/KD/pes/conv_process.hpp
old mode 100644
new mode 100755
index 23332b422df65250f8cadf07f5e0d95e970d316a..3db9662b62cce6ed33d059f60835dca25be5f60e
--- a/lite/backends/fpga/KD/pes/conv_process.hpp
+++ b/lite/backends/fpga/KD/pes/conv_process.hpp
@@ -14,6 +14,9 @@ limitations under the License. */
 
 #pragma once
 
+#ifndef conv_process_hpp
+#define conv_process_hpp
+
 #include <string.h>
 #include <cmath>
 #include <vector>
@@ -45,7 +48,9 @@ inline int get_split_num(Tensor* filter) {
              filter->shape().width();
   auto num = filter->shape().num();
   int div_capacity = filter::calc_division_capacity(chw);
-  return filter::calc_split_num(num, div_capacity);
+  int filter_num_alignment = filter::get_filter_num_alignment();
+  int aligned_num = align_to_x(num, filter_num_alignment);
+  return filter::calc_split_num(aligned_num, div_capacity);
 }
 
 inline void fill_scale_bias_const(ConvParam* param_) {
@@ -126,41 +131,87 @@ inline void format_scale_bias(Tensor* scale,
     bias_data = bias->data<float>();
   }
   int channel = filter->shape().num();
-  Shape bias_scale_shape(N, {2 * channel});
+  int scale_bias_len = align_to_x(channel / group, BS_NUM_ALIGNMENT) * group;
+
+  int c_per_group = channel / group;
+  int aligned_c_per_group = align_to_x(channel / group, BS_NUM_ALIGNMENT);
+
+  Shape bias_scale_shape(N, {2 * scale_bias_len});
   float* bs_data = scale_bias->mutableData<float>(FP32, bias_scale_shape);
-  for (int i = 0; i < channel; i++) {
-    float scale_value = scale_data == nullptr ? 1 : scale_data[i];
-    float bias_value = bias_data == nullptr ? 0 : bias_data[i];
-    bs_data[i + channel] = scale_value;
-    bs_data[i] = bias_value;
+  float* temp_data =
+      reinterpret_cast<float*>(fpga_malloc(2 * scale_bias_len * sizeof(float)));
+  memset(temp_data, 0, 2 * scale_bias_len * sizeof(float));
+
+  std::vector<float> scales;
+  if (scale_data != nullptr) {
+    for (int i = 0; i < channel; ++i) {
+      scales.push_back(scale_data[i]);
+    }
+    for (int i = 0; i < scale_bias_len - channel; i++) {
+      scales.push_back(1);
+    }
+  } else {
+    for (int i = 0; i < scale_bias_len; i++) {
+      scales.push_back(1);
+    }
   }
 
-  int element_num_per_div = get_filter_num_per_div(filter, group);
-  bias_scale::format_bias_scale_array(&bs_data, element_num_per_div, channel);
+  for (int i = 0; i < scale_bias_len; ++i) {
+    temp_data[i + scale_bias_len] = 1;
+    temp_data[i] = 0;
+  }
+
+  for (int g = 0; g < group; g++) {
+    for (int c = 0; c < c_per_group; c++) {
+      int src_index = g * c_per_group + c;
+      int dst_index = g * aligned_c_per_group + c;
+      float scale_value = scales[src_index];
+      float bias_value = bias_data == nullptr ? 0 : bias_data[src_index];
+      temp_data[dst_index + scale_bias_len] = scale_value;
+      temp_data[dst_index] = bias_value;
+    }
+  }
+
+  // int element_num_per_div = get_filter_num_per_div(filter, group);
+  // int scale_bias_len = align_to_x(channel / group, 8) * group;
+  bias_scale::format_bias_scale_array(
+      &temp_data, scale_bias_len / group, scale_bias_len);
+  memcpy(bs_data, temp_data, 2 * scale_bias_len * sizeof(float));
 }
 
-inline void format_filter(Tensor* filter, Tensor* quantized_filter, int group) {
+inline void format_filter(Tensor* filter,
+                          Tensor* quantized_filter,
+                          int group,
+                          std::vector<float>& scales) {  // NOLINT
   float max_value = find_max(*filter);
   Shape& filter_shape = filter->shape();
+
+  int mem_size;
+  std::vector<float> max_values;
+  int8_t* quantized_data = filter::format_filter(filter->data<float>(),
+                                                 mem_size,
+                                                 filter_shape.num(),
+                                                 filter_shape.channel(),
+                                                 filter_shape.height(),
+                                                 filter_shape.width(),
+                                                 group,
+                                                 max_value,
+                                                 max_values);
+
+  float mem_factor = mem_size * 1.0f / filter->shape().numel();
+  quantized_filter->setMemScale(mem_factor);
+
   quantized_filter->setAligned(true);
-  quantized_filter->mutableData<int8_t>(INT8, filter->shape());
+  int8_t* src = quantized_filter->mutableData<int8_t>(INT8, filter->shape());
   quantized_filter->scale()[0] = max_value / 127.0f;
   quantized_filter->scale()[1] = 127.0f / max_value;
 
-  auto memory_size = filter->shape().memorySize(sizeof(float));
-  auto new_data = reinterpret_cast<char*>(fpga_malloc(memory_size));
-  memcpy(new_data, filter->data<float>(), memory_size);
-  size_t mem_size = filter::format_filter(&new_data,
-                                          filter_shape.num(),
-                                          filter_shape.channel(),
-                                          filter_shape.height(),
-                                          filter_shape.width(),
-                                          group,
-                                          max_value);
-  int8_t* src = quantized_filter->mutableData<int8_t>(INT8, filter->shape());
-  memcpy(src, new_data, mem_size);
-  fpga_free(new_data);
+  memcpy(src, quantized_data, mem_size);
   quantized_filter->flush();
+
+  for (size_t i = 0; i < max_values.size(); i++) {
+    scales.push_back(max_values[i] / max_value);
+  }
 }
 
 inline void format_dw_filter(Tensor* filter,
@@ -207,10 +258,20 @@ inline void split_filter_num(const ConvParam& c_param) {
   Tensor* out = param.output;
   Tensor* filter = param.filter;
   auto channel = out->shape().channel();
-  int split_num = param.groups == 1 ? get_split_num(param.filter) : 1;
   int filter_num_per_div = get_filter_num_per_div(filter, param.groups);
 
+  auto chw = filter->shape().channel() * filter->shape().height() *
+             filter->shape().width();
+  auto num = filter->shape().num();
+  int div_capacity = filter::calc_division_capacity(chw);
+  int filter_num_alignment = filter::get_filter_num_alignment();
+  int aligned_num =
+      align_to_x(num / param.groups, filter_num_alignment) * param.groups;
+  // int aligned_num = align_to_x(num / param.groups, FILTER_NUM_ALIGNMENT) *
+  // param.groups;
+  int split_num = filter::calc_split_num(aligned_num, div_capacity);
+
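+  // The loop below emits one BasicConvParam per split; each split convolves
+  // an aligned slice of at most filter_num_per_div filters.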
   Shape& out_shape = out->shape();
   for (int i = 0; i < split_num; i++) {
     BasicConvParam* conv_param = new BasicConvParam();
@@ -251,9 +312,17 @@ inline void split_filter_num(const ConvParam& c_param) {
            filter->data<float>() + i * filter_num_per_div * filter_hwc,
            filter_num * filter_hwc * sizeof(float));
     new_filter.flush();
-    conv_param->filter.mutableData<float>(FP32, f_shape);
-    format_filter(&new_filter, &(conv_param->filter), param.groups);
+
+    if (param.groups != 1) {
+      int mem_factor =
+          32 / filter_num_per_div;  // TODO(chonwhite): change 32 to param;
+      conv_param->filter.setMemScale(mem_factor);
+    }
+
+    std::vector<float> v;  // TODO(chonwhite): change local variable name
+    format_filter(&new_filter, &(conv_param->filter), param.groups, v);
+    conv_param->filter.setDataType(INT8);
+
     int sb_num = 2 * align_to_x(filter_num, BS_NUM_ALIGNMENT);
     Tensor scale;
@@ -265,7 +334,7 @@ inline void split_filter_num(const ConvParam& c_param) {
     float* scale_data = scale.mutableData<float>(FP32, s_shape);
     float* bias_data = bias.mutableData<float>(FP32, s_shape);
     for (int n = 0; n < filter_num; n++) {
-      scale_data[n] = param.scale()->data<float>()[n + chnnnel_start];
+      scale_data[n] = param.scale()->data<float>()[n + chnnnel_start] * v[n];
     }
     for (int n = 0; n < filter_num; n++) {
       bias_data[n] = param.bias()->data<float>()[n + chnnnel_start];
@@ -276,11 +345,14 @@ inline void split_filter_num(const ConvParam& c_param) {
                       &conv_param->filter,
                       &conv_param->scaleBias,
                       param.groups);
+    conv_param->scaleBias.flush();
+    float* bs_data = conv_param->scaleBias.data<float>();
 
     args.group_num = param.groups;
     args.relu_enabled = param.relu.enabled;
-    args.sb_address = conv_param->scaleBias.data<float>();
+    args.sb_address = bs_data;
     args.kernel.stride_h = param.strides[1];
     args.kernel.stride_w = param.strides[0];
     args.kernel.height = new_filter.shape().height();
@@ -294,17 +366,13 @@ inline void split_filter_num(const ConvParam& c_param) {
     args.image.channels = input->shape().channel();
     args.image.width = input->shape().width();
     args.image.height = input->shape().height();
-    auto paddings = *param.padding;
-    args.image.pad_width = param.paddings[2];
+    args.image.pad_width = param.paddings[1];
     args.image.pad_height = param.paddings[0];
+    // dilations[0] = dilations[1];
+    args.dilation = param.dilations[0];
+
     args.output.address = out_address;
     args.output.scale_address = out_scale_address;
-    bool pad_equal =
-        ((paddings[0] == paddings[1]) && (paddings[2] == paddings[3]));
-    if (!pad_equal) {
-      LOG(FATAL) << "This pad not support ! " << paddings[0] << ", "
-                 << paddings[1] << ", " << paddings[2] << ", " << paddings[3];
-    }
" << paddings[0] << ", " - << paddings[1] << ", " << paddings[2] << ", " << paddings[3]; - } param.splitParams().push_back(conv_param); } } @@ -317,7 +385,7 @@ inline void split_channel(const ConvParam& c_param) { int num = ceil(input->shape().channel() * 1.0f / 2047); int channel = input->shape().channel() / num; - std::cout << "channel::" << channel << "num::" << num << std::endl; + Shape bs_shape(N, {channel}); for (int i = 0; i < num; i++) { @@ -331,6 +399,7 @@ inline void split_channel(const ConvParam& c_param) { // filter transformation; Shape f_shape(NCHW, {param.filter->shape().num(), channel, 1, 1}); + Tensor new_filter; float* dst = new_filter.mutableData(FP32, f_shape); @@ -341,7 +410,8 @@ inline void split_channel(const ConvParam& c_param) { src += param.filter->shape().channel(); } new_filter.flush(); - format_filter(&new_filter, &(conv_param->filter), param.groups); + std::vector scales; + format_filter(&new_filter, &(conv_param->filter), param.groups, scales); Tensor bias; Tensor scale; @@ -354,6 +424,7 @@ inline void split_channel(const ConvParam& c_param) { } scale.flush(); bias.flush(); + // Shape sb_shape(N, {2 * channel}); format_scale_bias(&scale, &bias, &conv_param->filter, @@ -379,18 +450,12 @@ inline void split_channel(const ConvParam& c_param) { args.image.channels = conv_param->input.shape().channel(); args.image.width = conv_param->input.shape().width(); args.image.height = conv_param->input.shape().height(); - auto paddings = *param.paddings; - args.image.pad_width = paddings[2]; - args.image.pad_height = paddings[0]; - + args.image.pad_width = param.paddings[1]; + args.image.pad_height = param.paddings[0]; + // dilations[0] = dilations[1] + args.dilation = param.dilations[0]; args.output.address = conv_param->output.mutableData(); args.output.scale_address = conv_param->output.scale(); - bool pad_equal = - ((paddings[0] == paddings[1]) && (paddings[2] == paddings[3])); - if (!pad_equal) { - LOG(FATA) << "This pad not support ! " << paddings[0] << ", " - << paddings[1] << ", " << paddings[2] << ", " << paddings[3]; - } param.splitParams().push_back(conv_param); } } @@ -418,11 +483,11 @@ inline bool compute_conv(const ConvParam& c_conv_params) { } size_t size = params.size(); if (ret == 0 && size > 1) { + // Tensor* output = conv_params.output; Tensor& img = params[0]->output; for (int i = 0; i < 1; i++) { for (int i = 0; i < img.shape().numel(); i++) { float value = half_to_float(img.data()[i]); - std::cout << "value:" << value << std::endl; } } } @@ -431,3 +496,5 @@ inline bool compute_conv(const ConvParam& c_conv_params) { } // namespace zynqmp } // namespace paddle + +#endif /* conv_process_hpp */ diff --git a/lite/backends/fpga/KD/pes/crop_pe.cpp b/lite/backends/fpga/KD/pes/crop_pe.cpp old mode 100644 new mode 100755 index c29df623aa610d329a46ee337cdcb1abd801881c..1438aaba6565cefa72f863d5fc3af0a389fc95e0 --- a/lite/backends/fpga/KD/pes/crop_pe.cpp +++ b/lite/backends/fpga/KD/pes/crop_pe.cpp @@ -14,8 +14,6 @@ limitations under the License. */ #include "lite/backends/fpga/KD/pes/crop_pe.hpp" -#include - namespace paddle { namespace zynqmp { diff --git a/lite/backends/fpga/KD/pes/crop_pe.hpp b/lite/backends/fpga/KD/pes/crop_pe.hpp index 6ebbcdb31f1afb7939c75a2ba9254c0b31f67d31..ccd1e0c98968375ebd840c7e8b15aedd6ad7ef77 100755 --- a/lite/backends/fpga/KD/pes/crop_pe.hpp +++ b/lite/backends/fpga/KD/pes/crop_pe.hpp @@ -14,6 +14,7 @@ limitations under the License. 
 */
 #pragma once
 
+#include <algorithm>
 #include <string>
 #include <vector>
 
diff --git a/lite/backends/fpga/KD/pes/depthwise_conv_pe.hpp b/lite/backends/fpga/KD/pes/depthwise_conv_pe.hpp
old mode 100644
new mode 100755
index f86806102d4a217ae4bb7355b36ca10d96ca4a05..8b88d24918bbbecae997817e72466798c1211a18
--- a/lite/backends/fpga/KD/pes/depthwise_conv_pe.hpp
+++ b/lite/backends/fpga/KD/pes/depthwise_conv_pe.hpp
@@ -37,18 +37,37 @@ class DepthwiseConvPE : public PE {
     Tensor* output = param.output;
     int channel = output->shape().channel();
 
-    float* new_scale_data = param_.scale()->data<float>();
-    float* new_bias_data = param_.bias()->data<float>();
-
     float16* b_data = bias_.mutableData<float16>(FP16, param_.bias()->shape());
-    for (int i = 0; i < channel; i++) {
-      b_data[i] = float_to_half(new_bias_data[i]);
+    if (param_.bias()->dataType() == FP32) {
+      float* new_bias_data = param_.bias()->data<float>();
+      // convert the bias from float to float16;
+      for (int i = 0; i < channel; i++) {
+        b_data[i] = float_to_half(new_bias_data[i]);
+      }
+      bias_.flush();
+    } else {
+      float16* new_bias_data = param_.bias()->data<float16>();
+      memcpy(b_data, new_bias_data, channel * sizeof(float16));
+      bias_.flush();
     }
-    bias_.flush();
 
-    Tensor* quantized_filter = param.quantizedFilter();
-    quantized_filter->mutableData<float16>(FP16, param.filter->shape());
-    format_dw_filter(param.filter, param.quantizedFilter(), new_scale_data);
+    if (param_.scale()->dataType() == FP32) {
+      float* new_scale_data = param_.scale()->data<float>();
+      Tensor* quantized_filter = param.quantizedFilter();
+      quantized_filter->mutableData<float16>(FP16, param.filter->shape());
+      format_dw_filter(param.filter, param.quantizedFilter(), new_scale_data);
+
+    } else {
+      // used when the filter is all ones and the channel count is aligned;
+      float16* scale_data = param_.scale()->data<float16>();
+      float16* filter_data = param.quantizedFilter()->mutableData<float16>(
+          FP16, param.filter->shape());
+      // memcpy(filter_data, scale_data, channel * sizeof(float16));
+      memcpy(filter_data,
+             scale_data,
+             param.filter->shape().numel() * sizeof(float16));
+      param.quantizedFilter()->flush();
+    }
 
     DWconvArgs args = {0};
     args.bias_address = b_data;
@@ -61,21 +80,14 @@ class DepthwiseConvPE : public PE {
     args.image.channels = input->shape().channel();
     args.image.height = input->shape().height();
     args.image.width = input->shape().width();
-    auto paddings = *param.paddings;
-    args.image.pad_width = param.paddings[2];
-    args.image.pad_height = param.paddings[0];
+    args.image.pad_width = param.paddings[0];
+    args.image.pad_height = param.paddings[1];
     args.image.scale_address = input->scale();
     args.output.address = output->data<void>();
     args.output.scale_address = output->scale();
     args.out_width = param.output->shape().width();
     args.out_height = param.output->shape().height();
     args.sub_conv_num = 1;
-    bool pad_equal =
-        ((paddings[0] == paddings[1]) && (paddings[2] == paddings[3]));
-    if (!pad_equal) {
-      LOG(FATAL) << "This pad not support ! " << paddings[0] << ", "
-                 << paddings[1] << ", " << paddings[2] << ", " << paddings[3];
-    }
     param.args = args;
 
     inplace_.relu_enable = param_.relu.enabled;
diff --git a/lite/backends/fpga/KD/pes/elementwise_mul_pe.hpp b/lite/backends/fpga/KD/pes/elementwise_mul_pe.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..15a3f5c98aed0d858bc40240286b42f4576a5069
--- /dev/null
+++ b/lite/backends/fpga/KD/pes/elementwise_mul_pe.hpp
@@ -0,0 +1,78 @@
+/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include "lite/backends/fpga/KD/pe.hpp"
+#include "lite/backends/fpga/KD/pe_params.hpp"
+namespace paddle {
+namespace zynqmp {
+
+class ElementwiseMulPE : public PE {
+ public:
+  bool init() {
+    Tensor* output = param_.output;
+    output->setAligned(true);
+    output->setDataLocation(Device);
+    return true;
+  }
+
+  void apply() {
+    Tensor* input = param_.inputs[0];
+    Tensor* output = param_.output;
+
+    int wc_aligned = align_to_x(param_.inputs[0]->shape().numel(), 32);
+    // int wc_aligned = / 32 * 32;
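+    // The elementwise multiply is mapped onto the scale unit: inputs[1]
+    // serves as the per-element scale and bias_tensor supplies zero biases.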
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include "lite/backends/fpga/KD/pe.hpp"
+#include "lite/backends/fpga/KD/pe_params.hpp"
+
+namespace paddle {
+namespace zynqmp {
+
+class ElementwiseMulPE : public PE {
+ public:
+  bool init() {
+    Tensor* output = param_.output;
+    output->setAligned(true);
+    output->setDataLocation(Device);
+    return true;
+  }
+
+  void apply() {
+    Tensor* input = param_.inputs[0];
+    Tensor* output = param_.output;
+
+    int wc_aligned = align_to_x(param_.inputs[0]->shape().numel(), 32);
+
+    Shape s(N, {wc_aligned});
+    float16* bias_data = bias_tensor.mutableData<float16>(FP16, s);
+    memset(bias_data, 0, wc_aligned * sizeof(float16));
+
+    ScaleArgs& args = args_;
+    args.scale_address = param_.inputs[1]->data();
+    args.bias_address = bias_tensor.data();
+    args.wc_alignment = wc_aligned;
+    args.channel_alignment = wc_aligned;
+    args.image.address = input->data();
+    args.image.scale_address = input->scale();
+    args.image.channels = wc_aligned;
+    args.image.height = 1;
+    args.image.width = 1;
+    args.image.pad_width = 0;
+    args.image.pad_height = 0;
+    args.output.address = output->data();
+    args.output.scale_address = output->scale();
+  }
+
+  void updateInput(Tensor* t, int index) {
+    if (index == 0) {
+      args_.scale_address = t->data();  // replace inputs?
+    }
+  }
+
+  bool dispatch() {
+    return compute_fpga_scale(args_) == 0;
+  }
+
+  ElementwiseMulParam& param() { return param_; }
+
+ private:
+  ElementwiseMulParam param_;
+  ScaleArgs args_ = {0};
+  Tensor bias_tensor;
+};
+
+}  // namespace zynqmp
+}  // namespace paddle
diff --git a/lite/backends/fpga/KD/pes/fully_connected_pe.hpp b/lite/backends/fpga/KD/pes/fully_connected_pe.hpp
old mode 100644
new mode 100755
index 2179a142ad3b3a990512b3ea1cd202bc5ce502f1..db3e05276171607da4cea421dd554846a00314a6
--- a/lite/backends/fpga/KD/pes/fully_connected_pe.hpp
+++ b/lite/backends/fpga/KD/pes/fully_connected_pe.hpp
@@ -37,7 +37,10 @@ class FullyConnectedPE : public PE {
     ConvParam& convParam_ = convPE_.param();
     Tensor* input = param_.input;
     convParam_.input = param_.input;
+    num_ = param_.input->shape().num();
+
     convParam_.output = param_.output;
+
     convParam_.groups = 1;
     convParam_.strides = {1, 1};
     convParam_.paddings = {0, 0};
@@ -63,7 +66,6 @@ class FullyConnectedPE : public PE {
         new_filter_data[i * chw + j] = scale;
       }
     }
-
     conv_filter->flush();
 
     convParam_.filter = conv_filter;
@@ -89,6 +91,8 @@ class FullyConnectedPE : public PE {
 private:
  FullyConnectedParam param_;
  ConvPE convPE_;
+  Tensor tempOut_;
+  int num_ = 1;
};
}  // namespace zynqmp
}  // namespace paddle
diff --git a/lite/backends/fpga/KD/pes/gru_pe.hpp b/lite/backends/fpga/KD/pes/gru_pe.hpp
new file mode 100755
index 0000000000000000000000000000000000000000..2f1812707356c53e2ed846d68711b0687648a744
--- /dev/null
+++ b/lite/backends/fpga/KD/pes/gru_pe.hpp
@@ -0,0 +1,325 @@
+/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include "lite/backends/arm/math/sgemm.h"
+#include "lite/backends/fpga/KD/pe.hpp"
+#include "lite/backends/fpga/KD/pe_params.hpp"
+#include "lite/backends/fpga/KD/pes/elementwise_add_pe.hpp"
+#include "lite/backends/fpga/KD/pes/elementwise_mul_pe.hpp"
+#include "lite/backends/fpga/KD/pes/fully_connected_pe.hpp"
+#include "lite/backends/fpga/KD/pes/relu_pe.hpp"
+
+#include "lite/api/paddle_place.h"
+#include "lite/backends/arm/math/funcs.h"
+#include "lite/core/type_system.h"
+
+namespace paddle {
+namespace zynqmp {
+
+struct GRUTensors {
+  Tensor* gate;
+  Tensor* pre_output;
+  Tensor* output;
+  Tensor* reset_output;
+};
+
+class GRUPE : public PE {
+ public:
+  bool init() {
+    // Tensor* output = param_.output;
+    // output->setAligned(true);
+    // output->setDataLocation(Device);
+    return true;
+  }
+
+  void apply() {
+    auto hidden = param_.hidden;
+    // auto hidden_dims = hidden->dims();
+    int frame_size = hidden->shape().channel();
+
+    zynqmp::Shape hidden_shape{zynqmp::NCHW, {1, frame_size, 1, 1}};
+    float16* prev_hidden_data =
+        prev_hidden_.mutableData<float16>(zynqmp::FP16, hidden_shape);
+    // set previous hidden data to 0;
+    memset(prev_hidden_data, 0, hidden_shape.numel() * sizeof(float16));
+
+    // copy 2/3 weight from param.weight;
+    zynqmp::Shape weight_shape{zynqmp::NC, {frame_size, frame_size * 2}};
+    float* weight_data = weight_.mutableData<float>(zynqmp::FP32, weight_shape);
+    memset(weight_data, 0, weight_shape.numel() * sizeof(float));
+    weight_data = weight_.mutableData<float>(zynqmp::FP32, weight_shape);
+    memcpy(weight_data,
+           param_.weight->data<float>(),
+           weight_shape.numel() * sizeof(float));
+
+    Shape gate_shape(zynqmp::NC, {1, frame_size * 2});
+    gate_ping_.mutableData<float>(FP32, gate_shape);
+    gate_pong_.mutableData<float16>(FP16, gate_shape);
+
+    zynqmp::FullyConnectedParam& pre_out_param = pre_out_pe_.param();
+    pre_out_param.input = &prev_hidden_;
+    pre_out_param.output = &gate_pong_;
+    pre_out_param.filter = &weight_;
+    pre_out_param.bias = &gate_ping_;
+    pre_out_pe_.init();
+    pre_out_pe_.apply();
+
+    // // ============= C
+    // ElementwiseAddParam& bias_add_param = bias_ew_pe_.param();
+    // bias_add_param.inputs = {&pre_output_, &pre_input_};
+    // bias_add_param.output = &pre_input_;
+    // bias_ew_pe_.init();
+    // bias_ew_pe_.apply();
+    // // ====================
+
+    // Shape state_weight_shape(NC,{frame_size, frame_size});
+    // float* state_weight_data = state_weight_.mutableData(FP32,
+    // state_weight_shape);
+    // memcpy(state_weight_data, weight_data + 2 * frame_size * frame_size,
+    //        state_weight_shape.numel() * sizeof(float));
+    // FullyConnectedParam& reset_out_param = reset_out_pe_.param();
+    // reset_out_param.input = &prev_hidden;
+    // reset_out_param.output = &gate_ping;
+    // reset_out_param.filter = &state_weight_;
+
+    // // ============== unit reset;
+    // update_gate_.mutableData(FP16, pre_input_shape);
+    // InputParam& relu_param = update_relu_pe_.param();
+    // relu_param.input = &tempTensor;
+    // relu_param.output = &update_gate_;
+    // update_relu_pe_.init();
+    // update_relu_pe_.apply();
+
+    reset_gate_.mutableData<float16>(FP16, hidden_shape);
+    prev_hidden_.mutableData<float16>(FP16, hidden_shape);
+    reset_hidden_.mutableData<float16>(FP16, hidden_shape);
+    // InputParam& reset_param = reset_relu_pe_.param();
+    // reset_param.input = &tempTensor;
+    // reset_param.output = &reset_gate_;
+    // reset_relu_pe_.init();
+    // reset_relu_pe_.apply();
+
+    // float16* prev_data = prev_.mutableData(FP16, pre_input_shape);
+    // memset(prev_data, 0, (pre_input_shape.numel() + 32) * sizeof(float16));
+    // // TODO
+    // reset_hidden_prev_.mutableData(FP16, pre_input_shape);
+
+    ElementwiseMulParam& mul_param = mul_pe_.param();
+    mul_param.inputs = {&reset_gate_, &prev_hidden_};
+    mul_param.output = &reset_hidden_;
+    mul_pe_.init();
+    mul_pe_.apply();
+    // ==============
+  }
+
+  bool dispatch() { return true; }
+
+  void gru_unit_reset_act(const lite_api::ActivationType active_gate,
+                          GRUTensors& value,  // NOLINT
+                          int frame_size,
+                          int batch_size) {
+    int stride_update = 3 * frame_size;
+    int stride_cell_state = 3 * frame_size;
+    int stride_hidden_prev = frame_size;
+    int stride_hidden = frame_size;
+
+    // Tensor* gate = value.gate;
+    // value.gate->saveToFile("value_input.txt");
+
+    float* update_gate_data = gate_ping_.data<float>();
+    float* reset_gate_data = update_gate_data + frame_size;
+
+    for (int b = 0; b < batch_size; b++) {
+      // memcpy(tempTensor.data(), reset_gate_data, gate->shape().numel()
+      // * sizeof(float));
+      // tempTensor.flush();
+
+      Tensor tmp;
+      Shape s(NC, {1, frame_size});
+      float* tmp_data = tmp.mutableData<float>(FP32, s);
+
+      for (int i = 0; i < frame_size; i++) {
+        // f(x) = x / (1 + abs(x))?
+        update_gate_data[i] =
+            lite::arm::math::active_f32(update_gate_data[i]);
+        reset_gate_data[i] =
+            lite::arm::math::active_f32(reset_gate_data[i]);
+      }
+      memcpy(tmp_data, reset_gate_data, frame_size * sizeof(float));
+      tmp.flush();
+      reset_gate_.copyFrom(&tmp);
+
+      // reset_gate_.copyFrom(&tempTensor);
+      Tensor* hidden_prev = value.pre_output;
+      if (hidden_prev) {
+        // memcpy(prev_data, )
+        // TODO(chonwhite): change to pre_out;
+        prev_hidden_.copyFrom(value.pre_output);
+        prev_hidden_.saveToFile("prev_.txt");
+      }
+
+      // // 4.0 reset_date * hidden_prev;
+      // // reset_hidden_prev[i] = reset_gate[i] * prev;
+      mul_pe_.dispatch();
+      reset_hidden_.saveToFile("reset_hidden_.txt");
+      update_gate_data += stride_update;
+      reset_gate_data += stride_update;
+
+      // reset_hidden_prev += stride_hidden;// TODO
+    }
+  }
+
+  void gru_unit_out_act(const lite_api::ActivationType active_node,
+                        bool origin_mode,
+                        GRUTensors& value,  // NOLINT
+                        int frame_size,
+                        int batch_size) {
+    // int stride_update = 3 * frame_size;
+    // int stride_cell_state = 3 * frame_size;
+    // int stride_hidden_prev = frame_size;
+    // int stride_hidden = frame_size;
+
+    // Tensor* hidden = value.output_value;
+    // float* hidden_prev = nullptr;
+    // if (hidden) {
+    //   hidden_prev = hidden->data();
+    // }
+
+    // float* cell_state = value.gate->data() + 2 * frame_size;
+
+    // float* updata_gate = value.gate->data();
+    // // float* reset_gate_data = update_gate_data + frame_size;
+
+    // float prev = 0.0f;
+    // for (int b = 0; b < batch_size; ++b) {
+    //   if (origin_mode) {
+    //     // for (int i = 0; i < frame_size; i++) {
+    //     //   float prev = 0;
+    //     //   if (hidden_prev) {
+    //     //     prev = hidden_prev[i];
+    //     //   }
+    //     //   cell_state[i] = lite::arm::math::active_f32(cell_state[i]);
+    //     //   hidden[i] =
+    //     //       cell_state[i] * (1.f - updata_gate[i]) + updata_gate[i] *
+    //     //       prev;
+    //     // }
+    //   } else {
+    //     for (int i = 0; i < frame_size; ++i) {
+    //       cell_state[i] =
+    //           lite::arm::math::active_f32(cell_state[i]);
+    //       if (hidden_prev) {
+    //         prev = hidden_prev[i];
+    //       }
+    //       float hidden_value =
+    //           prev * (1.f - updata_gate[i]) + updata_gate[i] * cell_state[i];
+    //       hidden_prev[i] = hidden_value;
+    //       std::cout << "hidden_value::" << hidden_value << std::endl;
+    //     }
+    //   }
+    //   updata_gate += stride_update;
+    //   cell_state += stride_cell_state;
+    //   hidden_prev += frame_size;
+    // }
+  }
+
+  void copy_input(GRUTensors& value) {  // NOLINT
+    float max = find_max(*(value.gate));
+    gate_ping_.mutableData<float>(FP32, value.gate->shape());
+    gate_ping_.copyFrom(value.gate);
+    // update input pointer?
+
+    // gate_.readFromFile("input/in.txt");
+    // // pre_input_.saveToFile("pppp_in.txt");
+    // gate_.scale()[0] = max / 127;
+    // gate_.scale()[1] = 127 / max;
+    // gate_.printScale("pre_input_");
+
+    // gate_.saveToFile("pre_input_.txt");
+
+    // pre_out_pe_.dispatch();
+
+    // pre_output_.saveToFile("pp_out.txt");
+  }
+
+  void GRUCOmpute(GRUTensors& value,  // NOLINT
+                  int frame_size,
+                  int batch_size,
+                  const lite_api::ActivationType active_node,
+                  const lite_api::ActivationType active_gate,
+                  bool origin_mode) {
+    copy_input(value);
+
+    if (value.pre_output) {
+      // copy by batch;
+      pre_out_pe_.dispatch();
+      gate_ping_.copyFrom(&gate_pong_);
+    }
+
+    gru_unit_reset_act(active_gate, value, frame_size, batch_size);
+
+    // if (value.pre_output) {
+    //   // state weight;
+    //   reset_out_pe_.dispatch();
+    // }
+    // gru_unit_out_act(active_node, origin_mode, value, frame_size,
+    // batch_size);
+  }
+
+  GRUParam& param() { return param_; }
+
+  // Tensor* preOutput() {
+  //   return &pre_output_;
+  // }
+
+  // Tensor* gate() {
+  //   return &gate_;
+  // }
+
+  Tensor* updateGate() { return &update_gate_; }
+
+  Tensor* resetGate() { return &reset_gate_; }
+
+ private:
+  GRUParam param_;
+  zynqmp::Tensor gate_ping_;
+  zynqmp::Tensor gate_pong_;
+  zynqmp::Tensor bias_;
+  zynqmp::Tensor weight_;
+  zynqmp::Tensor state_weight_;
+  // =================================
+  zynqmp::Tensor update_gate_;
+  zynqmp::Tensor reset_gate_;
+  zynqmp::Tensor cell_state_;
+  zynqmp::Tensor prev_hidden_;
+  zynqmp::Tensor reset_hidden_;
+
+  Tensor tempTensor;
+  // =================================
+
+  ReluPE update_relu_pe_;
+  ReluPE reset_relu_pe_;
+  zynqmp::ElementwiseMulPE mul_pe_;
+  zynqmp::FullyConnectedPE pre_out_pe_;
+  zynqmp::FullyConnectedPE reset_out_pe_;
+
+  zynqmp::ElementwiseAddPE bias_ew_pe_;
+};
+
+}  // namespace zynqmp
+}  // namespace paddle
diff --git a/lite/backends/fpga/KD/pes/gru_util.hpp b/lite/backends/fpga/KD/pes/gru_util.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..d49169846f4f18e4d8e30f3658c2173157678f81
--- /dev/null
+++ b/lite/backends/fpga/KD/pes/gru_util.hpp
@@ -0,0 +1,23 @@
+// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include "lite/backends/arm/math/gru_utils.h"
+
+namespace paddle {
+namespace lite {
+namespace fpga {}  // namespace fpga
+}  // namespace lite
+}  // namespace paddle
diff --git a/lite/backends/fpga/KD/pes/output_pe.hpp b/lite/backends/fpga/KD/pes/output_pe.hpp
old mode 100644
new mode 100755
index 1c99386ab19f485c07723c7fcc8501bdf5556f6c..2944691693b135a2d2df7b91ecbe0ef249b015d8
--- a/lite/backends/fpga/KD/pes/output_pe.hpp
+++ b/lite/backends/fpga/KD/pes/output_pe.hpp
@@ -25,6 +25,8 @@ class OutputPE : public PE {
   bool init() {
     Tensor* output = param_.output;
     output->setAligned(false);
+    DLEngine::get_instance().out_data = reinterpret_cast<float*>(
+        fpga_malloc(output->shape().numel() * sizeof(float)));
     return true;
   }
 
@@ -41,6 +43,15 @@ class OutputPE : public PE {
     } else {
       output->copyFrom(input);
     }
+    //
+    output->syncToCPU();
+    if (DLEngine::get_instance().out_data == nullptr) {
+      DLEngine::get_instance().out_data = reinterpret_cast<float*>(
+          fpga_malloc(output->shape().numel() * sizeof(float)));
+    }
+    memcpy(DLEngine::get_instance().out_data,
+           output->data<float>(),
+           output->shape().numel() * sizeof(float));
     return true;
   }
 
diff --git a/lite/backends/fpga/KD/pes/pooling_pe.hpp b/lite/backends/fpga/KD/pes/pooling_pe.hpp
old mode 100644
new mode 100755
index 5bb4f5285a48c7696b1f0f78a9b1c4fe6a9d76c5..386a470975261871137429f03d7c76b43aedb94b
--- a/lite/backends/fpga/KD/pes/pooling_pe.hpp
+++ b/lite/backends/fpga/KD/pes/pooling_pe.hpp
@@ -35,24 +35,25 @@ class PoolingPE : public PE {
     Tensor* input = param_.input;
     Tensor* output = param_.output;
 
-    uint32_t k_width = param_.kernelSize[0];
-    uint32_t k_height = param_.kernelSize[1];
+    uint32_t k_height = param_.kernelSize[0];
+    uint32_t k_width = param_.kernelSize[1];
 
     if (param_.globalPooling) {
       k_width = input->shape().width();
       k_height = input->shape().height();
+      param_.kernelSize[0] = k_height;
+      param_.kernelSize[1] = k_width;
     }
 
     PoolingArgs args = {0};
     args.mode = param_.type;
-    auto paddings = *param_.paddings;
     args.kernel_reciprocal = fp32_2_fp16(1.0f / (k_width * k_height));
     args.image.address = input->data();
     args.image.channels = input->shape().channel();
     args.image.height = input->shape().height();
     args.image.width = input->shape().width();
-    args.image.pad_height = paddings[0];
-    args.image.pad_width = paddings[2];
+    args.image.pad_height = param_.paddings[0];
+    args.image.pad_width = param_.paddings[1];
     args.image.scale_address = input->scale();
     args.output.address = output->mutableData();
     args.output.scale_address = output->scale();
@@ -66,6 +67,10 @@ class PoolingPE : public PE {
     use_cpu_ = output->shape().width() == 1 && output->shape().height() == 1 &&
                (k_width > 7 || k_height > 7);
+    // use_cpu_ = output->shape().width() == 1 && output->shape().height() == 1
+    // &&
+    //            (k_width > 255 || k_height > 255);
+    use_cpu_ = param_.type == AVERAGE;
   }
 
   void compute() {
     Tensor* input = param_.input;
     Tensor* output = param_.output;
@@ -74,16 +79,16 @@ class PoolingPE : public PE {
     input->syncToCPU();
 
     Tensor float_input;
+    // Tensor float_output;
     float* image_addr = float_input.mutableData<float>(FP32, input->shape());
     float_input.copyFrom(input);
     float16* data_out = output->data<float16>();
-    auto paddings = *param_.paddings;
     int image_height = input->shape().height();
     int image_width = input->shape().width();
     int image_channels = input->shape().channel();
-    int image_pad_h = paddings[0];
-    int image_pad_w = paddings[2];
+    int image_pad_h = param_.paddings[0];
+    int image_pad_w = param_.paddings[1];
     int kernel_height = param_.kernelSize[1];
     int kernel_width = param_.kernelSize[0];
     int kernel_step_h = param_.strides[0];
@@ -129,7 +134,7 @@ class PoolingPE : public PE {
     output->flush();
   }
 
-  void cpu_compute() {
+  void cpu_compute1() {
     Tensor* input = param_.input;
     Tensor* output = param_.output;
     input->syncToCPU();
@@ -138,7 +143,6 @@ class PoolingPE : public PE {
     float_input.mutableData<float>(FP32, input->shape());
     float_input.copyFrom(input);
     float16* data_out = output->data<float16>();
-
     int kernel_hw = param_.kernelSize[0] * param_.kernelSize[1];
 
     float scale_max = 0;
@@ -154,13 +158,43 @@ class PoolingPE : public PE {
     }
     output->scale()[0] = scale_max / 127.0f;
     output->scale()[1] = 127.0f / scale_max;
-    std::cout << "pool scale:" << scale_max / 127.0f << std::endl;
+    output->flush();
+  }
+
+  void cpu_compute() {
+    Tensor* input = param_.input;
+    Tensor* output = param_.output;
+    input->syncToCPU();
+
+    Tensor float_input;
+    float* float_input_data =
+        float_input.mutableData<float>(FP32, input->shape());
+    float_input.copyFrom(input);
+
+    float16* data_out = output->data<float16>();
+
+    int kernel_hw = param_.kernelSize[0] * param_.kernelSize[1];
+
+    float scale_max = 0;
+    for (int i = 0; i < output->shape().channel(); i++) {
+      float sum = 0;
+      for (int j = 0; j < kernel_hw; j++) {
+        sum += float_input_data[i * kernel_hw + j];
+      }
+      float value = sum / kernel_hw;
+      data_out[i] = float_to_half(value);
+      scale_max = std::max(scale_max, std::abs(value));
+    }
+    output->scale()[0] = scale_max / 127.0f;
+    output->scale()[1] = 127.0f / scale_max;
     output->flush();
   }
 
   bool dispatch() {
     if (use_cpu_) {
+      // cpu_compute();
       compute();
+      // exit(-1);
       return true;
     }
     param_.input->syncToDevice();
diff --git a/lite/backends/fpga/KD/pes/prior_box_pe.cpp b/lite/backends/fpga/KD/pes/prior_box_pe.cpp
index d6a503a31d4e0736724740ce1875c916969d93e0..00dfe1830f6f44cbf6a30708fa5783563470c686 100644
--- a/lite/backends/fpga/KD/pes/prior_box_pe.cpp
+++ b/lite/backends/fpga/KD/pes/prior_box_pe.cpp
@@ -253,9 +253,8 @@ bool PriorBoxPE::dispatch() {
   if (cachedBoxes_ == nullptr) {
     cachedBoxes_ = new Tensor();
     cachedVariances_ = new Tensor();
-    cachedBoxes_->mutableData<float16>(FP16, param_.outputBoxes->shape());
-    cachedVariances_->mutableData<float16>(FP16,
-                                           param_.outputVariances->shape());
+    cachedBoxes_->mutableData<float>(FP32, param_.outputBoxes->shape());
+    cachedVariances_->mutableData<float>(FP32, param_.outputVariances->shape());
     cachedBoxes_->setDataLocation(CPU);
     cachedVariances_->setDataLocation(CPU);
     compute_prior_box();
diff --git a/lite/backends/fpga/KD/pes/scale_pe.hpp b/lite/backends/fpga/KD/pes/scale_pe.hpp
index d5e16615d9943a1771dfabe916433768ecf16319..91f698ba514b949a4d22416791ed3993c1df737f 100755
--- a/lite/backends/fpga/KD/pes/scale_pe.hpp
+++ b/lite/backends/fpga/KD/pes/scale_pe.hpp
@@ -14,11 +14,16 @@ limitations under the License.
 */
 
 #pragma once
+#include
+
 #include "lite/backends/fpga/KD/pe.hpp"
 #include "lite/backends/fpga/KD/pe_params.hpp"
+#include "lite/backends/fpga/KD/pes/depthwise_conv_pe.hpp"
+#include "lite/backends/fpga/KD/tensor.hpp"
 
 namespace paddle {
 namespace zynqmp {
+
 class ScalePE : public PE {
  public:
   inline int gcd(int a, int b) {
@@ -42,6 +47,8 @@ class ScalePE : public PE {
     Tensor* input = param_.input;
     Tensor* output = param_.output;
     Shape& input_shape = input->shape();
+    DepthwiseConvParam& dw_param = dw_pe_.param();
+
     int channel = input_shape.channel();
     int repeat = 1;
     int alignment = 16;
@@ -51,70 +58,154 @@ class ScalePE : public PE {
       int c_lcm = lcm(channel, alignment);
       repeat = c_lcm / (channel);
     }
+
+    // FPGA limits: H > 2047, W > 1023, or WC > 65536 requires the CPU
+    // implementation.
     Shape shape(N, {channel * repeat});
-    param_.alignedBias()->mutableData<float16>(FP16, shape);
-    param_.alignedScale()->mutableData<float16>(FP16, shape);
-    float16* bias_data = param_.alignedBias()->data<float16>();
-    float16* scale_data = param_.alignedScale()->data<float16>();
+    float* filter_data = filter.mutableData<float>(FP32, shape);
+    std::fill_n(filter_data, input->shape().channel(), 1.0f);
+
+    Tensor* scale = dw_param.scale();
+    float16* scale_data = scale->mutableData<float16>(FP16, shape);
+    // memcpy(scale_data, param_.scale->data(), input->shape().channel()
+    // * sizeof(float));
+
+    Tensor* bias = dw_param.bias();
+    float16* bias_data = bias->mutableData<float16>(FP16, shape);
+    std::fill_n(bias_data, input->shape().channel(), 0);
+
+    if (param_.scale->dataType() == FP32) {
+      // std::cout << "scale dataType FP32:" << std::endl;
+      if (param_.bias != nullptr) {
+        float* bias_data_float = param_.bias->data<float>();
+        for (int i = 0; i < repeat; i++) {
+          for (int j = 0; j < length; j++) {
+            float16 value = float_to_half(bias_data_float[j]);
+            bias_data[i * length + j] = value;
+          }
+        }
+      } else {
+        float16 zero = float_to_half(0.0f);
+        for (int i = 0; i < repeat; i++) {
+          for (int j = 0; j < length; j++) {
+            bias_data[i * length + j] = zero;
+          }
+        }
+      }
 
-    if (param_.bias != nullptr) {
-      float* bias_data_float = param_.bias->data<float>();
+      float* scale_data_float = param_.scale->data<float>();
       for (int i = 0; i < repeat; i++) {
         for (int j = 0; j < length; j++) {
-          float16 value = float_to_half(bias_data_float[j]);
-          bias_data[i * length + j] = value;
+          float16 value = float_to_half(scale_data_float[j]);
+          scale_data[i * length + j] = value;
         }
       }
     } else {
-      float16 zero = float_to_half(0.0f);
+      if (param_.bias != nullptr) {
+        float16* bias_data_float = param_.bias->data<float16>();
+        for (int i = 0; i < repeat; i++) {
+          for (int j = 0; j < length; j++) {
+            float16 value = bias_data_float[j];
+            bias_data[i * length + j] = value;
+          }
+        }
+      } else {
+        float16 zero = float_to_half(0.0f);
+        for (int i = 0; i < repeat; i++) {
+          for (int j = 0; j < length; j++) {
+            bias_data[i * length + j] = zero;
+          }
+        }
+      }
+
+      float16* scale_data_float = param_.scale->data<float16>();
       for (int i = 0; i < repeat; i++) {
         for (int j = 0; j < length; j++) {
-          bias_data[i * length + j] = zero;
+          float16 value = scale_data_float[j];
+          scale_data[i * length + j] = value;
         }
       }
     }
 
-    float* scale_data_float = param_.scale->data<float>();
-    for (int i = 0; i < repeat; i++) {
-      for (int j = 0; j < length; j++) {
-        float16 value = float_to_half(scale_data_float[j]);
-        scale_data[i * length + j] = value;
+    // if (param_.bias != nullptr) {
+    //   memcpy(bias_data, param_.bias->data(), input->shape().channel()
+    //   * sizeof(float));
+    // }
+
+    dw_param.input = param_.input;
+    dw_param.output = param_.output;
+    dw_param.filter = &filter;
+
+    dw_param.strides = {1, 1};
+    dw_param.paddings = {0, 0};
+    dw_param.kernelSize = {1, 1};
+    dw_param.dilations = {1, 1};
+
+    dw_pe_.init();
+    dw_pe_.apply();
+  }
+
+  void cpu_compute() {
+    Tensor* input = param_.input;
+    Tensor* output = param_.output;
+    Tensor float_input;
+    float* image_addr = float_input.mutableData<float>(FP32, input->shape());
+    input->syncToCPU();
+    float_input.copyFrom(input);
+    float16* data_out = output->data<float16>();
+
+    float* scale_data = param_.scale->data<float>();
+
+    int wh = input->shape().width() * input->shape().height();
+
+    float16* in_data = input->data<float16>();
+
+    float max = 0;
+
+    for (int i = 0; i < wh; i++) {
+      for (int c = 0; c < input->shape().channel(); c++) {
+        int index = i * input->shape().channel() + c;
+        float value = half_to_float(in_data[index]) * scale_data[c];
+        data_out[index] = float_to_half(value);
+
+        if (value < 0) {
+          value = -value;
+        }
+        if (value > max) {
+          max = value;
+        }
       }
     }
-
-    param_.alignedScale()->flush();
-    param_.alignedBias()->flush();
-
-    int wc = input_shape.width() * input_shape.channel();
-    int wc_aligned = align_image(wc);
-
-    ScaleArgs& args = param_.args;
-    args.scale_address = param_.alignedScale()->data();
-    args.bias_address = param_.alignedBias()->data();
-    args.wc_alignment = wc_aligned;
-    args.channel_alignment = channel * repeat;
-
-    args.image.address = input->data();
-    args.image.scale_address = input->scale();
-    args.image.channels = channel;
-    args.image.height = input_shape.height();
-    args.image.width = input_shape.width();
-    args.image.pad_width = 0;
-    args.image.pad_height = 0;
-    args.output.address = output->data();
-    args.output.scale_address = output->scale();
+    output->flush();
+    output->scale()[0] = max / 127.0f;
+    output->scale()[1] = 127.0f / max;
   }
 
   bool dispatch() {
+    // cpu_compute();
+    // return true;
+
+    if (param_.scale->dataType() == FP16) {
+      DepthwiseConvParam& dw_param = dw_pe_.param();
+      memcpy(dw_param.quantizedFilter()->mutableData<float16>(),
+             param_.scale->data<float16>(),
+             param_.scale->shape().numel() * sizeof(float16));
+      dw_param.quantizedFilter()->scale()[0] = param_.scale->scale()[0];
+      dw_param.quantizedFilter()->scale()[1] = param_.scale->scale()[1];
+
+      dw_param.quantizedFilter()->flush();
+      // apply();
+    }
     param_.input->syncToDevice();
-    return compute_fpga_scale(param_.args) == 0;
+    return dw_pe_.dispatch();
   }
 
   ScaleParam& param() { return param_; }
 
 private:
  ScaleParam param_;
+  Tensor filter;
+  DepthwiseConvPE dw_pe_;
};
}  // namespace zynqmp
}  // namespace paddle
diff --git a/lite/backends/fpga/KD/shape.hpp b/lite/backends/fpga/KD/shape.hpp
index 566ad8e6ff2eff32301e83b6cdb5b1addd0117fe..c25c3315145137a147928a164fcabd2923b09e87 100755
--- a/lite/backends/fpga/KD/shape.hpp
+++ b/lite/backends/fpga/KD/shape.hpp
@@ -23,6 +23,7 @@ limitations under the License. */
 namespace paddle {
 namespace zynqmp {
 
+static struct None none_;
 static struct NCHW nchw_;
 static struct NHWC nhwc_;
 static struct NC nc_;
@@ -82,6 +83,9 @@ class Shape {
   void setLayoutType(LayoutType layout) {
     this->layoutType_ = layout;
     switch (layout) {
+      case None:
+        layout_ = &none_;
+        break;
       case NCHW:
        layout_ = &nchw_;
        break;
diff --git a/lite/backends/fpga/KD/tensor.hpp b/lite/backends/fpga/KD/tensor.hpp
old mode 100644
new mode 100755
index f003ded33eb51136ae0ae0a2c21988460232f89a..047498eed009dded5ce398ddabc2079b62d937df
--- a/lite/backends/fpga/KD/tensor.hpp
+++ b/lite/backends/fpga/KD/tensor.hpp
@@ -15,6 +15,7 @@ limitations under the License. */
 #pragma once
 
 #include
+#include
 #include
 #include
 #include
@@ -24,13 +25,10 @@ limitations under the License.
 */
 
 #include
 #include
-// #include "lite/core/tensor.h"
-
 #include "lite/backends/fpga/KD/dl_engine.hpp"
 #include "lite/backends/fpga/KD/float16.hpp"
 #include "lite/backends/fpga/KD/llapi/zynqmp_api.h"
 #include "lite/backends/fpga/KD/shape.hpp"
-// #include "lite/backends/fpga/KD/types.hpp"
 
 namespace paddle {
 namespace zynqmp {
@@ -117,7 +115,8 @@ class Tensor {
 
   template <typename Dtype>
   Dtype* mutableData() {
-    size_t memorySize = shape_->memorySize(CellSize(dataType_));
+    size_t memorySize =
+        shape_->memorySize(CellSize(dataType_)) * mem_scale_factor_;
     if (placeHolder_ != nullptr) {
       if (memorySize > placeHolder_->memorySize()) {
         placeHolder_.reset(new PlaceHolder(memorySize));
@@ -241,6 +240,10 @@ class Tensor {
     }
   }
 
+  void setMemScale(float scale_factor) {
+    this->mem_scale_factor_ = scale_factor;
+  }
+
   void shareDataWith(Tensor* src) { shareDataWith(src, src->shape()); }
 
   void shareDataWith(Tensor* src, const Shape& shape, int offset = 0) {
@@ -276,9 +279,11 @@ class Tensor {
                   .height = 1,
                   .pad_width = 0u,
                   .pad_height = 0u};
-    args.output = {
+
+    ImageOutputArgs output = {
         .address = data(),
         .scale_address = scale(),
     };
+    args.output = output;
     src->syncToDevice();
     size_t aligned_remainder = src->shape().numel() % 16;
     if (aligned_remainder > 0) {
@@ -294,10 +299,16 @@ class Tensor {
     this->invalidate();
   }
 
-  void flush() { fpga_flush(placeHolder_->data(), placeHolder_->memorySize()); }
+  void flush() {
+    size_t memorySize =
+        shape_->memorySize(CellSize(dataType_)) * mem_scale_factor_;
+    fpga_flush(placeHolder_->data(), memorySize);
+  }
 
   void invalidate() {
-    fpga_invalidate(placeHolder_->data(), placeHolder_->memorySize());
+    size_t memorySize =
+        shape_->memorySize(CellSize(dataType_)) * mem_scale_factor_;
+    fpga_invalidate(placeHolder_->data(), memorySize);
   }
 
   void sync() {
@@ -337,6 +348,18 @@ class Tensor {
     if (placeHolder_ == nullptr) {
       return;
     }
+    std::cout << scale()[0] << " , " << scale()[1] << std::endl;
+  }
+
+  void printScale(std::string type) {
+    std::cout << type << " : "
+              << std::to_string(shape_->num()) + "_" +
+                     std::to_string(shape_->channel()) + "_" +
+                     std::to_string(shape_->height()) + "_" +
+                     std::to_string(shape_->width())
+              << std::endl;
+    std::cout << type << " \n";
+    printScale();
   }
 
   std::string dimsFileName() {
@@ -358,33 +381,14 @@ class Tensor {
     saveToFile(path);
   }
 
-  friend std::ostream& operator<<(std::ostream& os, Tensor& tensor) {
-    os << "tensor:"
-       << "\n";
-    os << "dims: {";
-    for (int i = 0; i < tensor.shape().dimSize(); ++i) {
-      os << tensor.shape()[i] << " ";
-    }
-    os << "}\n";
-    for (int i = 0; i < tensor.shape().numel(); i++) {
-      float value = 0;
-      if (tensor.dataType() == FP32) {
-        value = tensor.data<float>()[i];
-      } else {
-        value = half_to_float(tensor.data<float16>()[i]);
-      }
-      os << value << " ";
-    }
-    os << "\n";
-    return os;
-  }
-
   void saveToFile(std::string path) {
     syncToCPU();
+    invalidate();
     std::ofstream ofs;
     static int counter = 0;
     std::string npath = std::to_string(counter) + "_" + path;
     counter++;
+    std::cout << "======== saving file:" << npath << " ============\n";
     save_file_with_name(npath);
   }
 
@@ -392,14 +396,16 @@ class Tensor {
     // return;
     invalidate();
     std::ofstream ofs;
-    ofs.open(path);
+
     for (int i = 0; i < shape_->numel(); i++) {
       float value = 0;
       if (dataType_ == FP32) {
         value = data<float>()[i];
-      } else {
+      } else if (dataType_ == FP16) {
         value = half_to_float(data<float16>()[i]);
+      } else {
+        value = data()[i];
       }
       ofs << value << std::endl;
     }
@@ -415,18 +421,49 @@ class Tensor {
     int num = shape_->numel();
     invalidate();
     float max = 0.0f;
-    float16* data = mutableData<float16>();
-    for (int i = 0; i < num; ++i) {
-      float value = 0;
-      file_stream >> value;
-      max = std::max(std::abs(value), max);
-      data[i] = float_to_half(value);
+    if (dataType_ == FP16) {
+      float16* data = mutableData<float16>();
+      for (int i = 0; i < num; ++i) {
+        float value = 0;
+        file_stream >> value;
+        max = std::max(std::abs(value), max);
+        data[i] = float_to_half(value);
+      }
+    } else {
+      float* data = mutableData<float>();
+      for (int i = 0; i < num; ++i) {
+        float value = 0;
+        file_stream >> value;
+        max = std::max(std::abs(value), max);
+        data[i] = value;
+      }
     }
     flush();
     placeHolder_->scale_[0] = max / 127.0f;
     placeHolder_->scale_[1] = 127.0f / max;
   }
 
+  friend std::ostream& operator<<(std::ostream& os, Tensor& tensor) {
+    os << "tensor:"
+       << "\n";
+    os << "dims: {";
+    for (int i = 0; i < tensor.shape().dimSize(); ++i) {
+      os << tensor.shape()[i] << " ";
+    }
+    os << "}\n";
+    for (int i = 0; i < tensor.shape().numel(); i++) {
+      float value = 0;
+      if (tensor.dataType() == FP32) {
+        value = tensor.data<float>()[i];
+      } else {
+        value = half_to_float(tensor.data<float16>()[i]);
+      }
+      os << value << " ";
+    }
+    os << "\n";
+    return os;
+  }
+
   ~Tensor() {
     if (shape_ != nullptr) {
       delete shape_;
@@ -436,6 +473,7 @@ class Tensor {
 
 private:
  int offset = 0;
+  float mem_scale_factor_ = 1.0f;
  std::shared_ptr<PlaceHolder> placeHolder_;
  Shape* shape_ = nullptr;
  DataType dataType_ = FP32;
diff --git a/lite/backends/fpga/lite_tensor.cc b/lite/backends/fpga/lite_tensor.cc
old mode 100644
new mode 100755
index 43218173fd05626fb46495bb254b250c14e5417a..7f1e8d3e17f97315e77532b77bbcfcc8331edd4f
--- a/lite/backends/fpga/lite_tensor.cc
+++ b/lite/backends/fpga/lite_tensor.cc
@@ -95,16 +95,14 @@ void TensorLite::CopyDataFrom(const TensorLite &other) {
   dims_ = other.dims_;
   target_ = other.target_;
   lod_ = other.lod_;
-  // memory_size_ = other.memory_size_;
-  // buffer_->CopyDataFrom(*other.buffer_, memory_size_);
-  zynq_tensor_->mutableData(other.zynq_tensor_->dataType(),
-                            other.zynq_tensor_->shape());
-}
+  auto dt = zynq_tensor_->dataType();
 
-// template
-// void TensorLite::mutable_data_internal() {
+  auto shape = other.zynq_tensor_->shape();
 
-// }
+  Resize(other.dims());
+  zynq_tensor_->mutableData(dt, shape);
+  this->ZynqTensor()->copyFrom(other.ZynqTensor());
+}
 
 }  // namespace lite
}  // namespace paddle
diff --git a/lite/backends/fpga/lite_tensor.h b/lite/backends/fpga/lite_tensor.h
index 2f9df3abb08dd15641323f4a3c59d6175f2e481b..ccf3628ecf16c91b722380ad6bfd11b8e89b1879 100644
--- a/lite/backends/fpga/lite_tensor.h
+++ b/lite/backends/fpga/lite_tensor.h
@@ -106,7 +106,7 @@ class TensorLite {
   // For other devices, T and R may be the same type.
   template <typename T, typename R = T>
   const R *data() const {
-    return zynq_tensor_->data<R>();
+    return zynq_tensor_->data<R>() + offset_;
   }
 
   void Resize(const DDimLite &ddim) { dims_ = ddim; }
@@ -125,6 +125,7 @@ class TensorLite {
   bool persistable() const { return persistable_; }
   void set_persistable(bool persistable) { persistable_ = persistable; }
+
   // T is the data type and R is the return type
   // For OpenCL, the return type can be cl::Buffer
   // and the data type can be float/int8_t.
@@ -147,6 +148,8 @@ class TensorLite {
   size_t memory_size() const { return zynq_tensor_->memorySize(); }
 
+  size_t offset() const { return offset_; }
+
   bool IsInitialized() const { return buffer_->data(); }
 
   // Other share data to this.
@@ -157,8 +160,14 @@ class TensorLite {
   template <typename T>
   TensorLite Slice(int64_t begin, int64_t end) const;
 
+  template <typename T>
+  void Slice(TensorLite &dst, int64_t begin, int64_t end) const;  // NOLINT
+
   TargetType target() const { return target_; }
 
+  // template <typename T>
+  // TensorLite Slice(int64_t begin, int64_t end) const;
+
   zynqmp::Tensor *ZynqTensor() const { return zynq_tensor_; }
 
   friend std::ostream &operator<<(std::ostream &os, const TensorLite &tensor) {
@@ -173,16 +182,21 @@ class TensorLite {
 private:
  TargetType target_{TargetType::kHost};
+
+  // precision_ and persistable_ are only used for persistable vars.
+  // If your tensor wants to be saved and loaded correctly, you must
+  // set values of precision_ and persistable_ after updating it.
+  // If your tensor is just a temp tensor, such as activations,
+  // you can ignore these two attributes.
+  PrecisionType precision_{PrecisionType::kUnk};
+  bool persistable_{false};
+
  DDimLite dims_;
  std::shared_ptr<Buffer> buffer_;
  LoD lod_;
  size_t memory_size_{};
-
  size_t offset_{0};
-  PrecisionType precision_{PrecisionType::kUnk};
-  bool persistable_{false};
-
  zynqmp::Tensor *zynq_tensor_ = new zynqmp::Tensor();
 
 template <typename T, typename R = T>
@@ -197,6 +211,9 @@ R *TensorLite::mutable_data() {
   }
   zynqmp::LayoutType layout_type = zynqmp::NCHW;
   switch (v.size()) {
+    case 0:
+      layout_type = zynqmp::None;
+      break;
     case 1:
       layout_type = zynqmp::N;
       break;
@@ -228,24 +245,62 @@ R *TensorLite::mutable_data(TargetType target) {
   return mutable_data<T, R>();
 }
 
-template <typename TensorT>
-bool TensorCompareWith(const TensorT &a, const TensorT &b) {
-  if (a.dims() != b.dims()) return false;
-  if (memcmp(a.raw_data(), b.raw_data(), a.data_size()) != 0) return false;
-  return true;
-}
 
 template <typename T>
 TensorLite TensorLite::Slice(int64_t begin, int64_t end) const {
-  int64_t base = numel() / dims_[0];
+  CHECK_GE(begin, 0);
+  CHECK_LE(end, dims_[0]);
+  CHECK_LT(begin, end);
+  if (dims_[0] == 1) {
+    return *this;
+  } else {
+    int64_t base = numel() / dims_[0];
+
+    TensorLite dst;
+    // dst.buffer_ = buffer_;
+    // dst.zynq_tensor_ = zynq_tensor_;
+    dst.target_ = target_;
+    auto dst_dims = dims_;
+    dst_dims[0] = end - begin;
+    dst.Resize(dst_dims);
+    void *dst_data = dst.mutable_data<T>();
+
+    T *src_data = const_cast<T *>(data<T>());
+    memcpy(dst_data,
+           src_data + static_cast<int64_t>(begin * base),
+           dst_dims.production() * sizeof(T));
+    // dst.ZynqTensor()->saveToFile("_slice", true);
+
+    // dst.offset_ = offset_ + static_cast<size_t>(begin * base) * sizeof(T);
+    return dst;
+  }
+}
+
+template <typename T>
+void TensorLite::Slice(TensorLite &dst, int64_t begin, int64_t end) const {
+  CHECK_GE(begin, 0);
+  CHECK_LE(end, dims_[0]);
+  CHECK_LT(begin, end);
 
-  TensorLite dst;
-  dst.buffer_ = buffer_;
   dst.target_ = target_;
   auto dst_dims = dims_;
   dst_dims[0] = end - begin;
   dst.Resize(dst_dims);
-  dst.offset_ = offset_ + static_cast<size_t>(begin * base) * sizeof(T);
-  return dst;
+  void *dst_data = dst.mutable_data<T>();
+
+  int64_t base = numel() / dims_[0];
+
+  T *src_data = const_cast<T *>(data<T>());
+  memcpy(dst_data,
+         src_data + static_cast<int64_t>(begin * base),
+         dst_dims.production() * sizeof(T));
}
+
+template <typename TensorT>
+bool TensorCompareWith(const TensorT &a, const TensorT &b) {
+  if (a.dims() != b.dims()) return false;
+  if (memcmp(a.raw_data(), b.raw_data(), a.data_size()) != 0) return false;
+  return true;
+}
+
 }  // namespace lite
}  // namespace paddle
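
Reviewer note on the Slice rework in lite_tensor.h: the previous implementation returned a view that shared buffer_ and tracked the window through a byte offset_, while the version above allocates fresh storage and copies the selected rows, so a slice no longer aliases its source tensor. The sketch below illustrates the intended call pattern under that reading; it is not part of the patch, and the DDimLite initializer, element count, and values are hypothetical.

    #include <cstdio>
    // Assumes the TensorLite API shown in the diff above.
    using paddle::lite::DDimLite;
    using paddle::lite::TensorLite;

    void slice_demo() {
      TensorLite src;
      src.Resize(DDimLite({4, 3}));  // 4 rows x 3 columns of FP32 data
      float* s = src.mutable_data<float>();
      for (int i = 0; i < 12; ++i) s[i] = static_cast<float>(i);

      // One-argument form: returns a new tensor holding a copy of rows [1, 3).
      TensorLite rows = src.Slice<float>(1, 3);

      // Two-argument form: fills a caller-owned tensor; Resize happens inside.
      TensorLite dst;
      src.Slice<float>(dst, 1, 3);

      // Copy semantics: this write cannot touch src.
      dst.mutable_data<float>()[0] = -1.0f;
      std::printf("src row 1 still starts at %f\n", src.data<float>()[3]);
    }

Because both overloads copy, any caller that previously relied on writing through a slice to mutate the parent tensor now has to copy results back explicitly.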