diff --git a/src/fpga/V1/api.cpp b/src/fpga/V1/api.cpp
index c8746bc1f7d405098ba84724ba253aae5b7522f1..5cef0ec1a64e7e696d6b5c797e39918d6f1ee915 100644
--- a/src/fpga/V1/api.cpp
+++ b/src/fpga/V1/api.cpp
@@ -28,13 +28,25 @@ namespace fpga {
 void format_image(framework::Tensor *image_tensor) {
   auto dims = image_tensor->dims();
   auto channel = dims[1], height = dims[2], width = dims[3];
-  auto data_ptr = image_tensor->data<float>();
-  auto external_ptr = reinterpret_cast<float *>(image_tensor->external_data);
-  float *p_data = external_ptr == nullptr ? data_ptr : external_ptr;
+  std::type_index input_type = image_tensor->type();
+  if (input_type == typeid(float)) {
+    auto data_ptr = image_tensor->data<float>();
+    auto external_ptr = reinterpret_cast<float *>(image_tensor->external_data);
+    float *p_data = external_ptr == nullptr ? data_ptr : external_ptr;
+
+    image::format_image(&p_data, channel, height, width);
+    if (p_data != data_ptr && external_ptr == nullptr) {
+      image_tensor->reset_data_ptr(p_data);
+    }
+  } else {
+    auto data_ptr = image_tensor->data<int8_t>();
+    auto external_ptr = reinterpret_cast<int8_t *>(image_tensor->external_data);
+    int8_t *p_data = external_ptr == nullptr ? data_ptr : external_ptr;
 
-  image::format_image(&p_data, channel, height, width);
-  if (p_data != data_ptr && external_ptr == nullptr) {
-    image_tensor->reset_data_ptr(p_data);
+    image::format_image(&p_data, channel, height, width);
+    if (p_data != data_ptr && external_ptr == nullptr) {
+      image_tensor->reset_data_ptr(p_data);
+    }
   }
 }
diff --git a/src/fpga/V1/image.cpp b/src/fpga/V1/image.cpp
index 833decef5808e3a1fe9f63a6d1008ea890247c73..4ba5af83ab26a8b21ea868c8a28bb94da5216c69 100644
--- a/src/fpga/V1/image.cpp
+++ b/src/fpga/V1/image.cpp
@@ -13,9 +13,6 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "fpga/V1/image.h"
-#include <memory.h>
-#include <algorithm>
-#include "fpga/common/fpga_common.h"
 
 namespace paddle_mobile {
 namespace fpga {
@@ -58,37 +55,6 @@ void convert_to_chw(float **data_in, int channel, int height, int width,
   *data_in = data_tmp;
 }
 
-void align_element_conv(float **data_in, int height, int cw) {
-  int h = 0;
-  int align_cw = align_to_x(cw, IMAGE_ALIGNMENT);
-
-  float *data_tmp =
-      (float *)fpga_malloc(height * align_cw * sizeof(float));  // NOLINT
-
-  memset(data_tmp, 0, height * align_cw * sizeof(float));
-
-  for (h = 0; h < height; h++) {
-    memcpy((void *)(data_tmp + h * align_cw),  // NOLINT
-           (void *)(*data_in + h * cw),        // NOLINT
-           cw * sizeof(float));
-  }
-
-  *data_in = data_tmp;
-}
-
-void format_image(float **data_in, int channel, int height, int width) {
-  // convert_to_hwc(data_in, channel, height, width);
-  int cw = channel * width;
-  int align_cw = align_to_x(cw, IMAGE_ALIGNMENT);
-  if (align_cw != cw) {
-    float *hwc_temp = *data_in;
-    align_element_conv(data_in, height, channel * width);
-    fpga_free(hwc_temp);
-  }
-  fpga_flush(*data_in, align_to_x(channel * width, IMAGE_ALIGNMENT) * height *
-                           sizeof(float));
-}
-
 void concat_images(int16_t **images_in, float **scales_in, void *image_out,
                    float *scale_out, int image_num, uint32_t *channel_num,
                    int height, int width) {
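The float-only helpers removed here reappear as templates in image.h (next file), so float and int8 feeds share one row-alignment path. A minimal, self-contained sketch of that behaviour, assuming IMAGE_ALIGNMENT is 16 and using plain new[]/delete[] in place of fpga_malloc/fpga_free/fpga_flush:

```cpp
#include <cstdint>
#include <cstring>
#include <iostream>

constexpr int kImageAlignment = 16;  // stand-in for IMAGE_ALIGNMENT (assumed)

int align_to_x(int num, int x) { return (num + x - 1) / x * x; }

template <typename Dtype>
void align_rows(Dtype **data_in, int height, int cw) {
  int align_cw = align_to_x(cw, kImageAlignment);
  Dtype *aligned = new Dtype[height * align_cw]();  // zero-filled, like memset
  for (int h = 0; h < height; h++) {
    std::memcpy(aligned + h * align_cw, *data_in + h * cw, cw * sizeof(Dtype));
  }
  delete[] *data_in;   // the real code releases the old buffer with fpga_free
  *data_in = aligned;  // caller now sees the padded buffer
}

int main() {
  const int channel = 3, height = 2, width = 5;  // cw = 15, padded to 16
  auto *fp32 = new float[channel * height * width]();
  auto *int8 = new int8_t[channel * height * width]();
  align_rows(&fp32, height, channel * width);  // float instantiation
  align_rows(&int8, height, channel * width);  // int8_t instantiation
  std::cout << "aligned row stride: "
            << align_to_x(channel * width, kImageAlignment) << " elements\n";
  delete[] fp32;
  delete[] int8;
  return 0;
}
```

Each row of channel * width elements is copied into a zero-padded row of align_to_x(cw, IMAGE_ALIGNMENT) elements, which is what the templated align_element_conv does for any element type.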
diff --git a/src/fpga/V1/image.h b/src/fpga/V1/image.h
index c81de8f4554d9d2a9396bf587ec7ab10806e856a..f5dc6ffe3e1d9747bf4c9cfd86f5a951e7b0ac24 100644
--- a/src/fpga/V1/image.h
+++ b/src/fpga/V1/image.h
@@ -14,8 +14,10 @@ limitations under the License. */
 
 #pragma once
+#include <memory.h>
+#include <algorithm>
 #include <cstdint>
-
+#include "fpga/common/fpga_common.h"
 namespace paddle_mobile {
 namespace fpga {
 namespace image {
@@ -24,10 +26,42 @@ void convert_to_hwc(float** data_in, int channel, int height, int width,
                     int num = 1);
 void convert_to_chw(float** data_in, int channel, int height, int width,
                     int num = 1);
+// template <typename Dtype>
+// void align_element_conv(Dtype** data_in, int height, int cw);
+// template <typename T>
+// void format_image(T** data_in, int channel, int height, int width);
+template <typename Dtype>
+void align_element_conv(Dtype** data_in, int height, int cw);
+template <typename Dtype>
+void align_element_conv(Dtype** data_in, int height, int cw) {
+  int h = 0;
+  int align_cw = align_to_x(cw, IMAGE_ALIGNMENT);
+
+  Dtype* data_tmp =
+      (Dtype*)fpga_malloc(height * align_cw * sizeof(Dtype));  // NOLINT
+
+  memset(data_tmp, 0, height * align_cw * sizeof(Dtype));
 
-void align_element_conv(float** data_in, int height, int cw);
-void format_image(float** data_in, int channel, int height, int width);
+  for (h = 0; h < height; h++) {
+    memcpy((void*)(data_tmp + h * align_cw),  // NOLINT
+           (void*)(*data_in + h * cw),        // NOLINT
+           cw * sizeof(Dtype));
+  }
+  *data_in = data_tmp;
+}
+template <typename T>
+void format_image(T** data_in, int channel, int height, int width) {
+  int cw = channel * width;
+  int align_cw = align_to_x(cw, IMAGE_ALIGNMENT);
+  if (align_cw != cw) {
+    T* hwc_temp = *data_in;
+    align_element_conv(data_in, height, channel * width);
+    fpga_free(hwc_temp);
+  }
+  fpga_flush(*data_in,
+             align_to_x(channel * width, IMAGE_ALIGNMENT) * height * sizeof(T));
+}
 // Concat featuremaps along channel direction
 void concat_images(int16_t** images_in, float** scales_in, void* image_out,
                    float* scale_out, int image_num, uint32_t* channel_num,
diff --git a/src/fpga/V1/pe.cpp b/src/fpga/V1/pe.cpp
index 37feeb9dfa1a0e9a8c4dc9f789c0ab673e0f4d65..24ef95e6fc25b32a2faf69c7e685b5c1f07d1cdd 100644
--- a/src/fpga/V1/pe.cpp
+++ b/src/fpga/V1/pe.cpp
@@ -38,10 +38,12 @@ using namespace std;  // NOLINT
 #define CMD_FP16_TO_FP32 1
 #define CMD_FP32_TO_FP16 2
 #define CMD_FP32_TO_FP32 3
+#define CMD_INT8_TO_FP16 4
 // bypass macro
 #define SIZE_FP16 2
 #define SIZE_FP32 4
+#define SIZE_INT8 1
 
 #define PE_IRQ_TIMEOUT 1000000
 
@@ -607,6 +609,16 @@ int PerformBypass(const struct BypassArgs &args) {
       }
     } break;
 
+    case DATA_TYPE_INT8: {
+      if (args.output_data_type != DATA_TYPE_FP16) {
+        DLOG << "error:Output Datatype error,not DATA_TYPE_FP16: "
+             << args.output_data_type;
+      }
+      data_cell_in = SIZE_INT8;
+      data_cell_out = SIZE_FP16;
+      cmd = CMD_INT8_TO_FP16;
+    } break;
+
     case DATA_TYPE_FP32: {
       switch (args.output_data_type) {
         case DATA_TYPE_FP16:
@@ -630,10 +642,13 @@ int PerformBypass(const struct BypassArgs &args) {
       break;
   }
   if (cmd != CMD_FP16_TO_FP16 && cmd != CMD_FP16_TO_FP32 &&
-      cmd != CMD_FP32_TO_FP16 && cmd != CMD_FP32_TO_FP32) {
+      cmd != CMD_FP32_TO_FP16 && cmd != CMD_FP32_TO_FP32 &&
+      cmd != CMD_INT8_TO_FP16) {
+    // std::cout << " err back Error1!" << std::endl;
     return -1;
   }
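For context on the new INT8 case, here is a standalone sketch of the command and data-cell-size selection that PerformBypass carries out. The DataType enum and plan_bypass helper are illustrative stand-ins; only the CMD_*/SIZE_* values visible in the hunk above come from the diff, and CMD_FP16_TO_FP16 is assumed to be 0.

```cpp
#include <iostream>

// Placeholder enum for illustration; the real DATA_TYPE_* constants live in
// fpga/common/fpga_common.h.
enum class DataType { FP16, FP32, INT8 };

// Mirror of the CMD_* / SIZE_* macros shown in the pe.cpp hunk
// (CMD_FP16_TO_FP16 assumed to be 0; it is not visible in the diff).
constexpr int CMD_FP16_TO_FP16 = 0, CMD_FP16_TO_FP32 = 1, CMD_FP32_TO_FP16 = 2,
              CMD_FP32_TO_FP32 = 3, CMD_INT8_TO_FP16 = 4;
constexpr int SIZE_INT8 = 1, SIZE_FP16 = 2, SIZE_FP32 = 4;

struct BypassPlan {
  int cmd;            // CMD_* value programmed into the bypass engine
  int data_cell_in;   // bytes per input element
  int data_cell_out;  // bytes per output element
};

// Same selection shape as the switch in PerformBypass: an INT8 input is only
// ever bypassed to an FP16 output.
BypassPlan plan_bypass(DataType in, DataType out) {
  if (in == DataType::INT8) {
    if (out != DataType::FP16) {
      std::cerr << "INT8 bypass only supports FP16 output\n";
    }
    return {CMD_INT8_TO_FP16, SIZE_INT8, SIZE_FP16};
  }
  if (in == DataType::FP16) {
    return out == DataType::FP32
               ? BypassPlan{CMD_FP16_TO_FP32, SIZE_FP16, SIZE_FP32}
               : BypassPlan{CMD_FP16_TO_FP16, SIZE_FP16, SIZE_FP16};
  }
  return out == DataType::FP16
             ? BypassPlan{CMD_FP32_TO_FP16, SIZE_FP32, SIZE_FP16}
             : BypassPlan{CMD_FP32_TO_FP32, SIZE_FP32, SIZE_FP32};
}

int main() {
  BypassPlan p = plan_bypass(DataType::INT8, DataType::FP16);
  std::cout << "cmd=" << p.cmd << " in=" << p.data_cell_in
            << "B out=" << p.data_cell_out << "B\n";
  return 0;
}
```

The real function then programs the hardware registers and waits on the PE interrupt; none of that is modelled here.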
diff --git a/src/operators/kernel/fpga/V1/feed_kernel.cpp b/src/operators/kernel/fpga/V1/feed_kernel.cpp
--- a/src/operators/kernel/fpga/V1/feed_kernel.cpp
+++ b/src/operators/kernel/fpga/V1/feed_kernel.cpp
@@ ... @@
 template <>
 bool FeedKernel<FPGA, float>::Init(FeedParam<FPGA> *param) {
   auto output = param->Out();
-  auto input = const_cast<LoDTensor *>(param->InputX());
-  input->init(typeid(float));
-  input->Resize(output->dims());
-
   if (output->dims().size() != 4) {
     return true;
   }
+  fpga::format_fp16_ofm(output);
   return true;
 }
 
@@ -35,6 +32,14 @@ template <>
 void FeedKernel<FPGA, float>::Compute(const FeedParam<FPGA> &param) {
   auto output = param.Out();
   auto input = const_cast<LoDTensor *>(param.InputX());
+  std::type_index input_type = input->type();
+
+  if (input_type == typeid(float)) {
+    input->init(typeid(float));
+  } else {  // input_type == typeid(int8_t)
+    input->init(typeid(int8_t));
+  }
+  input->Resize(output->dims());
 
   if (output->dims().size() != 4) {
     size_t size = output->numel() * sizeof(float);
@@ -48,29 +53,47 @@ void FeedKernel<FPGA, float>::Compute(const FeedParam<FPGA> &param) {
   }
 
   fpga::format_image(input);
-  auto input_ptr = input->data<float>();
-  auto external_ptr = reinterpret_cast<float *>(input->external_data);
-  float *p_data = external_ptr == nullptr ? input_ptr : external_ptr;
-
   auto output_ptr = output->data<half>();
-
   fpga::BypassArgs args = {fpga::DATA_TYPE_FP32};
+  if (input_type == typeid(float)) {
+    auto input_ptr = input->data<float>();
+    auto external_ptr = reinterpret_cast<float *>(input->external_data);
+    float *p_data = external_ptr == nullptr ? input_ptr : external_ptr;
 
-  args.input_data_type = fpga::DATA_TYPE_FP32;
-  args.output_data_type = fpga::DATA_TYPE_FP16;
-  args.input_layout_type = fpga::LAYOUT_CHW;
-  args.output_layout_type = fpga::LAYOUT_HWC;
-  args.image.address = p_data;
-  args.image.channels = (uint32_t)input->dims()[1];
-  args.image.height = (uint32_t)input->dims()[2];
-  args.image.width = (uint32_t)input->dims()[3];
-  args.image.pad_height = 0;
-  args.image.pad_width = 0;
-  args.output.address = output_ptr;
-  args.output.scale_address = output->scale;
-  fpga::PerformBypass(args);
+    args.input_data_type = fpga::DATA_TYPE_FP32;
+    args.output_data_type = fpga::DATA_TYPE_FP16;
+    args.input_layout_type = fpga::LAYOUT_CHW;
+    args.output_layout_type = fpga::LAYOUT_HWC;
+    args.image.address = p_data;
+    args.image.channels = (uint32_t)input->dims()[1];
+    args.image.height = (uint32_t)input->dims()[2];
+    args.image.width = (uint32_t)input->dims()[3];
+    args.image.pad_height = 0;
+    args.image.pad_width = 0;
+    args.output.address = output_ptr;
+    args.output.scale_address = output->scale;
+    fpga::PerformBypass(args);
+    input->external_data = nullptr;
+  } else {  // input_type == typeid(int8_t)
+    auto input_ptr = input->data<int8_t>();
+    auto external_ptr = reinterpret_cast<int8_t *>(input->external_data);
+    int8_t *p_data = external_ptr == nullptr ? input_ptr : external_ptr;
 
-  input->external_data = nullptr;
+    args.input_data_type = fpga::DATA_TYPE_INT8;
+    args.output_data_type = fpga::DATA_TYPE_FP16;
+    args.input_layout_type = fpga::LAYOUT_CHW;
+    args.output_layout_type = fpga::LAYOUT_HWC;
+    args.image.address = p_data;
+    args.image.channels = (uint32_t)input->dims()[1];
+    args.image.height = (uint32_t)input->dims()[2];
+    args.image.width = (uint32_t)input->dims()[3];
+    args.image.pad_height = 0;
+    args.image.pad_width = 0;
+    args.output.address = output_ptr;
+    args.output.scale_address = output->scale;
+    fpga::PerformBypass(args);
+    input->external_data = nullptr;
+  }
 }
 
 template class FeedKernel<FPGA, float>;
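The feed kernel now branches on the tensor's runtime element type. A reduced sketch of that std::type_index dispatch, with a hypothetical MiniTensor standing in for framework::Tensor:

```cpp
#include <cstdint>
#include <iostream>
#include <typeindex>
#include <typeinfo>

// Hypothetical stand-in for framework::Tensor; only the type bookkeeping the
// feed kernel relies on is modelled here.
struct MiniTensor {
  std::type_index type;
  void *data;
};

enum class BypassInput { FP32, INT8 };

// Same dispatch shape as FeedKernel::Compute: float takes the FP32->FP16
// bypass path, everything else is treated as int8 and takes INT8->FP16.
BypassInput pick_input_type(const MiniTensor &t) {
  return t.type == std::type_index(typeid(float)) ? BypassInput::FP32
                                                  : BypassInput::INT8;
}

int main() {
  float fbuf[4] = {0};
  int8_t ibuf[4] = {0};
  MiniTensor a{std::type_index(typeid(float)), fbuf};
  MiniTensor b{std::type_index(typeid(int8_t)), ibuf};
  std::cout << (pick_input_type(a) == BypassInput::FP32 ? "fp32" : "int8")
            << " / "
            << (pick_input_type(b) == BypassInput::INT8 ? "int8" : "fp32")
            << "\n";
  return 0;
}
```

As in the kernel above, anything that is not float is assumed to be int8_t and routed to the INT8 bypass path.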
diff --git a/src/operators/op_param.h b/src/operators/op_param.h
index 5b9a2ea8f2a4b1841d583e602e4746b87d71cea6..99822947671423d483d61294edcbc825021f1ad0 100644
--- a/src/operators/op_param.h
+++ b/src/operators/op_param.h
@@ -2554,13 +2554,13 @@ class FusionDeconvBNReluParam : public ConvTransposeParam<Dtype> {
  public:
   FusionDeconvBNReluParam(const VariableNameMap &inputs,
                           const VariableNameMap &outputs,
-                          const AttributeMap &attrs, const Scope &scope)
+                          const AttributeMap &attrs, Scope *scope)
       : ConvTransposeParam<Dtype>(inputs, outputs, attrs, scope) {
-    output_ = OpParam::OutFrom<GType>(outputs, scope);
-    input_bias_ = OpParam::InputBiasFrom<GType>(inputs, scope);
-    input_mean_ = OpParam::InputMeanFrom<GType>(inputs, scope);
-    input_scale_ = OpParam::InputScaleFrom<GType>(inputs, scope);
-    input_variance_ = OpParam::InputVarianceFrom<GType>(inputs, scope);
+    output_ = OpParam::OutFrom<GType>(outputs, *scope);
+    input_bias_ = OpParam::InputBiasFrom<GType>(inputs, *scope);
+    input_mean_ = OpParam::InputMeanFrom<GType>(inputs, *scope);
+    input_scale_ = OpParam::InputScaleFrom<GType>(inputs, *scope);
+    input_variance_ = OpParam::InputVarianceFrom<GType>(inputs, *scope);
     epsilon_ = OpParam::GetAttr<float>("epsilon", attrs);
     momentum_ = OpParam::GetAttr<float>("momentum", attrs);
   }
diff --git a/tools/op.cmake b/tools/op.cmake
index aed5ce3260a293c23d05c9b85e73174dc2860857..190bb142bc59a10efdeebeb8a382043440731e68 100755
--- a/tools/op.cmake
+++ b/tools/op.cmake
@@ -138,6 +138,8 @@ if (CON GREATER -1)
     set(CONV_TRANSPOSE_OP ON)
     set(FUSION_DECONVADDBNRELU_OP ON)
     set(FUSION_DECONVADDBN_OP ON)
+    set(FUSION_DECONVBNRELU_OP ON)
+    set(CONV_OP ON)
     set(ELEMENTWISEMUL_OP ON)
     set(FUSION_FCRELU_OP ON)
     set(RELU_OP ON)
@@ -616,6 +618,9 @@ endif()
 if (FUSION_DECONVADDBNRELU_OP)
     add_definitions(-DFUSION_DECONVADDBNRELU_OP)
 endif()
+if (FUSION_DECONVBNRELU_OP)
+    add_definitions(-DFUSION_DECONVBNRELU_OP)
+endif()
 if (FUSION_DECONVADDBN_OP)
     add_definitions(-DFUSION_DECONVADDBN_OP)
 endif()
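The op_param.h hunk follows the Scope-by-pointer convention: the constructor now receives Scope * and dereferences it when calling the existing const-reference helpers. A small, hypothetical sketch of that calling pattern (Scope, OutFrom, and DemoParam below are illustrative stand-ins, not the paddle-mobile types):

```cpp
#include <iostream>
#include <string>

// Hypothetical stand-in; not the paddle-mobile Scope class.
struct Scope {
  std::string name = "global";
};

// Existing helpers keep a const-reference signature, as OutFrom and the
// Input*From helpers do in OpParam.
std::string OutFrom(const Scope &scope) { return scope.name + "/out"; }

struct DemoParam {
  // New-style constructor: the scope arrives as a pointer and is dereferenced
  // at each helper call, mirroring the *scope changes in the hunk above.
  explicit DemoParam(Scope *scope) : output_(OutFrom(*scope)) {}
  std::string output_;
};

int main() {
  Scope scope;
  DemoParam param(&scope);             // callers now pass an address
  std::cout << param.output_ << "\n";  // prints "global/out"
  return 0;
}
```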