diff --git a/src/common/types.cpp b/src/common/types.cpp index 8f284b3fe1115bd8cec78430a405289aae98e898..ba00f639d76ae7c928f5b7484c08cec0b0926904 100644 --- a/src/common/types.cpp +++ b/src/common/types.cpp @@ -71,10 +71,10 @@ const char *G_OP_TYPE_SUM = "sum"; const char *G_OP_TYPE_QUANTIZE = "quantize"; const char *G_OP_TYPE_DEQUANTIZE = "dequantize"; -extern const char *G_OP_TYPE_TANH = "tanh"; -extern const char *G_OP_TYPE_FUSION_DECONV_RELU = "fusion_deconv_relu"; -extern const char *G_OP_TYPE_FUSION_DECONV_ADD = "fusion_deconv_add"; -extern const char *G_OP_TYPE_FUSION_DECONV_ADD_RELU = "fusion_deconv_add_relu"; +const char *G_OP_TYPE_TANH = "tanh"; +const char *G_OP_TYPE_FUSION_DECONV_RELU = "fusion_deconv_relu"; +const char *G_OP_TYPE_FUSION_DECONV_ADD = "fusion_deconv_add"; +const char *G_OP_TYPE_FUSION_DECONV_ADD_RELU = "fusion_deconv_add_relu"; std::unordered_map< std::string, std::pair, std::vector>> diff --git a/src/fpga/V1/api.cpp b/src/fpga/V1/api.cpp index 04e51ab9b09fabc41fcd1cd73864bc285d183821..7c1f15f7c90e0b1ebc15a9ec8f3f6333ff173978 100644 --- a/src/fpga/V1/api.cpp +++ b/src/fpga/V1/api.cpp @@ -13,251 +13,13 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "fpga/V1/api.h" -#include -#include -#include -#include -#include #include "fpga/V1/bias_scale.h" #include "fpga/V1/filter.h" #include "fpga/V1/image.h" -#define FPGA_TEST_MODE -#define PADDLE_MOBILE_OS_LINUX namespace paddle_mobile { namespace fpga { -static int fd = -1; -static const char *device_path = "/dev/fpgadrv0"; -static std::map memory_map; - -static inline int do_ioctl(int req, const void *arg) { -#ifdef PADDLE_MOBILE_OS_LINUX - int result = ioctl(fd, req, (uint64_t)arg); - PADDLE_MOBILE_ENFORCE(result == 0, "ioctl didn't return correctly"); - return result; -#else - return -1; -#endif -} - -int open_device() { - if (fd == -1) { - fd = open(device_path, O_RDWR); - } - return fd; -} - -// memory management; -void *fpga_malloc(size_t size) { - static uint64_t counter = 0; - -#ifdef PADDLE_MOBILE_OS_LINUX - auto ptr = mmap64(nullptr, size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0); -#else - auto ptr = malloc(size); -#endif - counter += size; - memory_map.insert(std::make_pair(ptr, size)); - // DLOG << "Address: " << ptr << ", " << size << " bytes allocated. Total " - // << counter << " bytes"; - return ptr; -} - -void fpga_free(void *ptr) { - static uint64_t counter = 0; - size_t size = 0; - - auto iter = memory_map.find(ptr); // std::map::iterator - if (iter != memory_map.end()) { - size = iter->second; - memory_map.erase(iter); -#ifdef PADDLE_MOBILE_OS_LINUX - munmap(ptr, size); -#else - free(ptr); -#endif - counter += size; - // DLOG << "Address: " << ptr << ", " << size << " bytes freed. Total " - // << counter << " bytes"; - } else { - DLOG << "Invalid pointer"; - } -} - -void fpga_copy(void *dest, const void *src, size_t num) { - memcpy(dest, src, num); -} - -int fpga_flush(void *address, size_t size) { - struct MemoryCacheArgs args = {nullptr}; - args.address = address; - args.size = size; - return do_ioctl(IOCTL_MEMCACHE_FLUSH, &args); -} - -int fpga_invalidate(void *address, size_t size) { - struct MemoryCacheArgs args = {nullptr}; - args.address = address; - args.size = size; - return do_ioctl(IOCTL_MEMCACHE_INVAL, &args); -} - -half fp32_2_fp16(float fp32_num) { - unsigned long tmp = *(unsigned long *)(&fp32_num); // NOLINT - half t = ((tmp & 0x007fffff) >> 13) | ((tmp & 0x80000000) >> 16) | - (((tmp & 0x7f800000) >> 13) - (112 << 10)); - if (tmp & 0x1000) { - t++; // roundoff - } - return t; -} - -float fp16_2_fp32(half fp16_num) { - int frac = (fp16_num & 0x3ff); - int exp = ((fp16_num & 0x7c00) >> 10) + 112; - int s = fp16_num & 0x8000; - int tmp = 0; - float fp32_num; - tmp = s << 16 | exp << 23 | frac << 13; - fp32_num = *(float *)&tmp; // NOLINT - return fp32_num; -} - -int ComputeBasicConv(const struct ConvArgs &args) { -#ifdef FPGA_TEST_MODE - DLOG << "======Compute Basic Conv======"; - DLOG << " relu_enabled:" << args.relu_enabled - << " sb_address:" << args.sb_address - << " filter_address:" << args.filter_address - << " filter_num:" << args.filter_num - << " group_num:" << args.group_num; - DLOG << " image_address:" << args.image.address - << " image_scale_address:" << args.image.scale_address - << " image_channels:" << args.image.channels - << " image_height:" << args.image.height - << " image_width:" << args.image.width - << " pad_height:" << args.image.pad_height - << " pad_width:" << args.image.pad_width; - DLOG << " kernel_height:" << args.kernel.height - << " kernel_width:" << args.kernel.width - << " stride_h:" << args.kernel.stride_h - << " stride_w:" << args.kernel.stride_w; - DLOG << " out_address:" << args.output.address - << " out_scale_address:" << args.output.scale_address; -#endif - return do_ioctl(IOCTL_CONFIG_CONV, &args); -} - -int ComputeFpgaConv(const struct SplitConvArgs &args) { -#ifdef FPGA_TEST_MODE - DLOG << "=============ComputeFPGAConv==========="; - DLOG << " filter_num:" << args.filter_num - << " group_num:" << args.group_num - << " split_num:" << args.split_num; -#endif - - int split_num = args.split_num; - for (int i = 0; i < split_num; i++) { - ComputeBasicConv(args.conv_args[i]); - } - - if (split_num > 1) { - ComputeFPGAConcat(args.concat_arg); - } -} - -int ComputeFpgaPool(const struct PoolingArgs &args) { -#ifdef FPGA_TEST_MODE - DLOG << "=============ComputeFpgaPool==========="; - DLOG << " mode:" << args.mode - << " kernel_reciprocal:" << fp16_2_fp32(args.kernel_reciprocal); - DLOG << " image_address:" << args.image.address - << " image_scale_address:" << args.image.scale_address - << " image_channels:" << args.image.channels - << " image_height:" << args.image.height - << " image_width:" << args.image.width - << " pad_height:" << args.image.pad_height - << " pad_width:" << args.image.pad_width; - DLOG << " kernel_height:" << args.kernel.height - << " kernel_width:" << args.kernel.width - << " stride_h:" << args.kernel.stride_h - << " stride_w:" << args.kernel.stride_w; - DLOG << " out_address:" << args.output.address - << " out_scale_address:" << args.output.scale_address; -#endif - - return do_ioctl(IOCTL_CONFIG_POOLING, &args); -} - -int ComputeFpgaEWAdd(const struct EWAddArgs &args) { -#ifdef FPGA_TEST_MODE - DLOG << "=============ComputeFpgaEWAdd==========="; - DLOG << " relu_enabled:" << args.relu_enabled - << " const0:" << fp16_2_fp32(int16_t(args.const0)) - << " const1:" << fp16_2_fp32(int16_t(args.const1)); - DLOG << " image0_address:" << args.image0.address - << " image0_scale_address:" << args.image0.scale_address - << " image0_channels:" << args.image0.channels - << " image0_height:" << args.image0.height - << " image0_width:" << args.image0.width - << " pad0_height:" << args.image0.pad_height - << " pad0_width:" << args.image0.pad_width; - DLOG << " image1_address:" << args.image1.address - << " image1_scale_address:" << args.image1.scale_address - << " image1_channels:" << args.image1.channels - << " image1_height:" << args.image1.height - << " image1_width:" << args.image1.width - << " pad1_height:" << args.image1.pad_height - << " pad_width:" << args.image1.pad_width; - DLOG << " out_address:" << args.output.address - << " out_scale_address:" << args.output.scale_address; -#endif - - return do_ioctl(IOCTL_CONFIG_EW, &args); -} -int PerformBypass(const struct BypassArgs &args) { -#ifdef FPGA_TEST_MODE - DLOG << "=============ComputeFpgaBypass==========="; - DLOG << " input_type:" << args.input_data_type - << " output_type:" << args.output_data_type - << " input_layout_type:" << args.input_layout_type - << " output_layout_type:" << args.output_layout_type; - DLOG << " image_address:" << args.image.address - << " image_scale_address:" << args.image.scale_address - << " image_channels:" << args.image.channels - << " image_height:" << args.image.height - << " image_width:" << args.image.width - << " pad_height:" << args.image.pad_height - << " pad_width:" << args.image.pad_width; - DLOG << " out_address:" << args.output.address - << " out_scale_address:" << args.output.scale_address; -#endif - - return do_ioctl(IOCTL_CONFIG_BYPASS, &args); -} - -int ComputeFPGAConcat(const struct ConcatArgs &args) { -#ifdef FPGA_TEST_MODE - DLOG << "=============ComputeFpgaConcat==========="; - DLOG << " Image_num: " << args.image_num - << " out_address:" << args.image_out - << " out_scale_address:" << args.scale_out; - DLOG << " image_height:" << args.height << " image_width:" << args.width; - for (int i = 0; i < args.image_num; i++) { - DLOG << " " << i << "th: "; - DLOG << " channel_num:" << args.channel_num[i] - << " image_address:" << args.images_in[i] - << " image_scale_address:" << args.scales_in[i]; - } -#endif - - image::concat_images(args.images_in, args.scales_in, args.image_out, - args.scale_out, args.image_num, args.channel_num, - args.height, args.width); - return 0; -} - int get_align_image_cw(int cw) { return align_to_x(cw, IMAGE_ALIGNMENT); } void format_image(framework::Tensor *image_tensor) { @@ -397,7 +159,7 @@ void fill_split_arg(struct SplitConvArgs *arg, framework::Tensor *input, arg->filter_num = (uint32_t)filter->dims()[0]; arg->output.address = out_ptr; arg->output.scale_address = out->scale; - arg->conv_args = + arg->conv_arg = (ConvArgs *)fpga_malloc(arg->split_num * sizeof(ConvArgs)); // NOLINT arg->concat_arg.image_num = arg->split_num; @@ -420,44 +182,44 @@ void fill_split_arg(struct SplitConvArgs *arg, framework::Tensor *input, filter->dims()[1] * filter->dims()[2] * filter->dims()[3]); for (int i = 0; i < n; i++) { - arg->conv_args[i].relu_enabled = relu_enabled; - arg->conv_args[i].group_num = (uint32_t)group_num; - arg->conv_args[i].kernel.stride_h = (uint32_t)stride_h; - arg->conv_args[i].kernel.stride_w = (uint32_t)stride_w; - arg->conv_args[i].kernel.height = (uint32_t)filter->dims()[2]; - arg->conv_args[i].kernel.width = (uint32_t)filter->dims()[3]; - arg->conv_args[i].image.address = input_ptr; - arg->conv_args[i].image.channels = (uint32_t)input->dims()[1]; - arg->conv_args[i].image.height = (uint32_t)input->dims()[2]; - arg->conv_args[i].image.width = (uint32_t)input->dims()[3]; - arg->conv_args[i].image.scale_address = input->scale; - arg->conv_args[i].image.pad_height = (uint32_t)padding_h; - arg->conv_args[i].image.pad_width = (uint32_t)padding_w; - arg->conv_args[i].filter_scale_address = filter->scale; - arg->conv_args[i].filter_address = &( + arg->conv_arg[i].relu_enabled = relu_enabled; + arg->conv_arg[i].group_num = (uint32_t)group_num; + arg->conv_arg[i].kernel.stride_h = (uint32_t)stride_h; + arg->conv_arg[i].kernel.stride_w = (uint32_t)stride_w; + arg->conv_arg[i].kernel.height = (uint32_t)filter->dims()[2]; + arg->conv_arg[i].kernel.width = (uint32_t)filter->dims()[3]; + arg->conv_arg[i].image.address = input_ptr; + arg->conv_arg[i].image.channels = (uint32_t)input->dims()[1]; + arg->conv_arg[i].image.height = (uint32_t)input->dims()[2]; + arg->conv_arg[i].image.width = (uint32_t)input->dims()[3]; + arg->conv_arg[i].image.scale_address = input->scale; + arg->conv_arg[i].image.pad_height = (uint32_t)padding_h; + arg->conv_arg[i].image.pad_width = (uint32_t)padding_w; + arg->conv_arg[i].filter_scale_address = filter->scale; + arg->conv_arg[i].filter_address = &( (int8_t *)filter_ptr)[i * element_num * filter_num_per_div]; // NOLINT - arg->conv_args[i].sb_address = &bs_ptr[i * filter_num_per_div * 2]; - arg->conv_args[i].filter_num = (uint32_t)( + arg->conv_arg[i].sb_address = &bs_ptr[i * filter_num_per_div * 2]; + arg->conv_arg[i].filter_num = (uint32_t)( i == n - 1 ? channel - (n - 1) * filter_num_per_div // NOLINT : filter_num_per_div); if (n > 1) { - arg->conv_args[i].output.scale_address = + arg->conv_arg[i].output.scale_address = (float *)fpga_malloc(2 * sizeof(float)); // NOLINT - arg->conv_args[i].output.address = fpga_malloc( - input->dims()[2] * - align_to_x(input->dims()[3] * arg->conv_args[i].filter_num, - IMAGE_ALIGNMENT) * - sizeof(half)); + arg->conv_arg[i].output.address = + fpga_malloc(input->dims()[2] * + align_to_x(input->dims()[3] * arg->conv_arg[i].filter_num, + IMAGE_ALIGNMENT) * + sizeof(half)); } else { - arg->conv_args[i].output.scale_address = out->scale; - arg->conv_args[i].output.address = out_ptr; + arg->conv_arg[i].output.scale_address = out->scale; + arg->conv_arg[i].output.address = out_ptr; } arg->concat_arg.images_in[i] = - (half *)arg->conv_args[i].output.address; // NOLINT - arg->concat_arg.scales_in[i] = arg->conv_args[i].output.scale_address; - arg->concat_arg.channel_num[i] = arg->conv_args[i].filter_num; + (half *)arg->conv_arg[i].output.address; // NOLINT + arg->concat_arg.scales_in[i] = arg->conv_arg[i].output.scale_address; + arg->concat_arg.channel_num[i] = arg->conv_arg[i].filter_num; } } diff --git a/src/fpga/V1/api.h b/src/fpga/V1/api.h index f535975a35ecc3c454bbac597b31d8c3670cbf91..daa7902ab4a6cb72a77bba31f8cfe84c897f30a4 100644 --- a/src/fpga/V1/api.h +++ b/src/fpga/V1/api.h @@ -14,178 +14,13 @@ limitations under the License. */ #pragma once -#include -#include -#include -#include +#include "fpga/common/fpga_common.h" +#include "fpga/common/pe.h" #include "framework/tensor.h" namespace paddle_mobile { namespace fpga { -enum DataType { - DATA_TYPE_FP32 = 1, - DATA_TYPE_FP16 = 0, -}; - -enum LayoutType { - LAYOUT_CHW = 1, - LAYOUT_HWC = 0, -}; - -struct VersionArgs { - void* buffer; -}; - -struct MemoryCopyArgs { - void* src; - void* dest; - size_t size; -}; - -struct KernelArgs { - uint32_t width; - uint32_t height; - uint32_t stride_w; - uint32_t stride_h; -}; - -struct ImageInputArgs { - void* address; // input featuremap virtual address - float* scale_address; // input scale address; - uint32_t channels; - uint32_t width; // featuremap width - uint32_t height; - uint32_t pad_width; // padding width; - uint32_t pad_height; -}; - -struct ImageOutputArgs { - void* address; // output result address; - float* scale_address; // output scale address; -}; - -struct ConvArgs { - bool relu_enabled; - void* sb_address; // scale and bias are interlaced; - void* filter_address; - float* filter_scale_address; - uint32_t filter_num; - uint32_t group_num; - - struct KernelArgs kernel; - struct ImageInputArgs image; // input image; - struct ImageOutputArgs output; -}; - -struct ConcatArgs { - uint32_t image_num; - half** images_in; - float** scales_in; - void* image_out; - float* scale_out; - uint32_t* channel_num; - uint32_t height; - uint32_t width; -}; - -struct SplitConvArgs { - uint32_t split_num; - uint32_t group_num; - uint32_t filter_num; - struct ImageOutputArgs output; - struct ConvArgs* conv_args; - struct ConcatArgs concat_arg; -}; - -struct GroupConvArgs { - uint32_t group_num; - uint32_t filter_num; - struct ImageOutputArgs output; - struct SplitConvArgs* conv_args; - struct ConcatArgs concat_arg; -}; - -struct PoolingArgs { - int16_t mode; // mode: 0:max, 1:avg - half kernel_reciprocal; - struct KernelArgs kernel; - struct ImageInputArgs image; // input image; - struct ImageOutputArgs output; -}; - -struct EWAddArgs { - bool relu_enabled; - - uint32_t const0; // output0 = const0 x input0 + const1 x input1; - uint32_t const1; - struct ImageInputArgs image0; - struct ImageInputArgs image1; - struct ImageOutputArgs output; -}; - -struct BypassArgs { - enum DataType input_data_type; - enum DataType output_data_type; - enum LayoutType input_layout_type; - enum LayoutType output_layout_type; - struct ImageInputArgs image; - struct ImageOutputArgs output; -}; - -struct FpgaRegWriteArgs { - uint64_t address; // - uint64_t value; -}; - -struct FpgaRegReadArgs { - uint64_t address; - uint64_t value; -}; - -struct MemoryCacheArgs { - void* address; - size_t size; -}; - -#define IOCTL_FPGA_MAGIC 'FPGA' - -#define IOCTL_VERSION _IOW(IOCTL_FPGA_MAGIC, 01, struct VersionArgs) - -#define IOCTL_SEPARATOR_0 10 - -#define IOCTL_MEM_COPY _IOW(IOCTL_FPGA_MAGIC, 11, struct MemoryCopyArgs) -#define IOCTL_MEMCACHE_INVAL _IOW(IOCTL_FPGA_MAGIC, 12, struct MemoryCacheArgs) -#define IOCTL_MEMCACHE_FLUSH _IOW(IOCTL_FPGA_MAGIC, 13, struct MemoryCacheArgs) - -#define IOCTL_SEPARATOR_1 20 - -#define IOCTL_CONFIG_CONV _IOW(IOCTL_FPGA_MAGIC, 21, struct ConvArgs) -#define IOCTL_CONFIG_POOLING _IOW(IOCTL_FPGA_MAGIC, 22, struct PoolingArgs) -#define IOCTL_CONFIG_EW _IOW(IOCTL_FPGA_MAGIC, 23, struct EWAddArgs) -#define IOCTL_CONFIG_BYPASS _IOW(IOCTL_FPGA_MAGIC, 24, struct BypassArgs) -#define IOCTL_FPGA_REG_READ _IOW(IOCTL_FPGA_MAGIC, 28, struct FpgaRegReadArgs) -#define IOCTL_FPGA_REG_WRITE _IOW(IOCTL_FPGA_MAGIC, 29, struct FpgaRegWriteArgs) - -//============================== API ============================= - -int open_device(); -int close_device(); - -void* fpga_malloc(size_t size); -void fpga_free(void* ptr); -void fpga_copy(void* dst, const void* src, size_t num); -int fpga_flush(void* address, size_t size); -int fpga_invalidate(void* address, size_t size); - -int PerformBypass(const struct BypassArgs& args); -int ComputeFpgaConv(const struct SplitConvArgs& args); -int ComputeFpgaPool(const struct PoolingArgs& args); -int ComputeFpgaEWAdd(const struct EWAddArgs& args); -int ComputeFPGAConcat(const struct ConcatArgs& args); - -static inline int align_to_x(int num, int x) { return (num + x - 1) / x * x; } - int get_align_image_cw(int cw); void format_image(framework::Tensor* image_tensor); void format_fp16_ofm(framework::Tensor* ofm_tensor); // only allocate memory @@ -209,8 +44,5 @@ void fill_split_arg(struct SplitConvArgs* arg, framework::Tensor* input, bool relu_enabled, int group_num, int stride_h, int stride_w, int padding_h, int padding_w, float* bs_ptr); -half fp32_2_fp16(float fp32_num); -float fp16_2_fp32(half fp16_num); - } // namespace fpga } // namespace paddle_mobile diff --git a/src/fpga/V1/bias_scale.cpp b/src/fpga/V1/bias_scale.cpp index 3c2c04dc1d7f76953b04a879fbcfa8377dd7ba8a..263a7494c5602c13208aa0d8899ce80d781aa11b 100644 --- a/src/fpga/V1/bias_scale.cpp +++ b/src/fpga/V1/bias_scale.cpp @@ -14,7 +14,7 @@ limitations under the License. */ #include "fpga/V1/bias_scale.h" #include -#include "fpga/V1/api.h" +#include "fpga/common/fpga_common.h" namespace paddle_mobile { namespace fpga { diff --git a/src/fpga/V1/filter.cpp b/src/fpga/V1/filter.cpp index 3f4a3e2c876f0b54546f0e385d4a5e8bbfacdf3c..157ac90a60262cadacb648173cbc5ba6c01e674e 100644 --- a/src/fpga/V1/filter.cpp +++ b/src/fpga/V1/filter.cpp @@ -15,7 +15,7 @@ limitations under the License. */ #include "fpga/V1/filter.h" #include #include -#include "fpga/V1/api.h" +#include "fpga/common/fpga_common.h" namespace paddle_mobile { namespace fpga { @@ -31,20 +31,22 @@ int calc_split_num(int num, int division_capacity) { } int calc_division_number(int num, int group_num, int division_capacity) { - PADDLE_MOBILE_ENFORCE(num % group_num == 0, - "Filter number should be divisible by group number"); + // PADDLE_MOBILE_ENFORCE(num % group_num == 0, + // "Filter number should be divisible by group + // number"); int split_num = calc_split_num(num, division_capacity); - PADDLE_MOBILE_ENFORCE(group_num == 1 || split_num == 1, - "Split number or group number should be 1"); + // PADDLE_MOBILE_ENFORCE(group_num == 1 || split_num == 1, + // "Split number or group number should be 1"); return group_num * split_num; } int calc_num_per_div(int num, int group_num, int division_capacity) { - PADDLE_MOBILE_ENFORCE(num % group_num == 0, - "Filter number should be divisible by group number"); + // PADDLE_MOBILE_ENFORCE(num % group_num == 0, + // "Filter number should be divisible by group + // number"); int split_num = calc_split_num(num, division_capacity); - PADDLE_MOBILE_ENFORCE(group_num == 1 || split_num == 1, - "Split number or group number should be 1"); + // PADDLE_MOBILE_ENFORCE(group_num == 1 || split_num == 1, + // "Split number or group number should be 1"); if (group_num == 1) { if (num > division_capacity) { return division_capacity; diff --git a/src/fpga/V1/image.cpp b/src/fpga/V1/image.cpp index 73be05c942d6a848db830148d25bc8b3e14b53e4..312af1d00b5f6dfa25f33ce93a25d55577b92818 100644 --- a/src/fpga/V1/image.cpp +++ b/src/fpga/V1/image.cpp @@ -15,7 +15,7 @@ limitations under the License. */ #include "fpga/V1/image.h" #include #include -#include "fpga/V1/api.h" +#include "fpga/common/fpga_common.h" namespace paddle_mobile { namespace fpga { diff --git a/src/fpga/V1/pe.cpp b/src/fpga/V1/pe.cpp new file mode 100644 index 0000000000000000000000000000000000000000..9adea7e0962243d46fa6060b4deae6df371567c8 --- /dev/null +++ b/src/fpga/V1/pe.cpp @@ -0,0 +1,160 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "fpga/common/pe.h" +#include "fpga/V1/filter.h" +#include "fpga/V1/image.h" +#include "fpga/common/config.h" +#include "fpga/common/driver.h" + +namespace paddle_mobile { +namespace fpga { + +int ComputeFpgaConv(const struct SplitConvArgs &args) { + ComputeBasicConv(args.conv_arg[0]); +} + +int ComputeBasicConv(const struct ConvArgs &args) { +#ifdef FPGA_PRINT_MODE + DLOG << "======Compute Basic Conv======"; + DLOG << " relu_enabled:" << args.relu_enabled + << " sb_address:" << args.sb_address + << " filter_address:" << args.filter_address + << " filter_num:" << args.filter_num + << " group_num:" << args.group_num; + DLOG << " image_address:" << args.image.address + << " image_scale_address:" << args.image.scale_address + << " image_channels:" << args.image.channels + << " image_height:" << args.image.height + << " image_width:" << args.image.width + << " pad_height:" << args.image.pad_height + << " pad_width:" << args.image.pad_width; + DLOG << " kernel_height:" << args.kernel.height + << " kernel_width:" << args.kernel.width + << " stride_h:" << args.kernel.stride_h + << " stride_w:" << args.kernel.stride_w; + DLOG << " out_address:" << args.output.address + << " out_scale_address:" << args.output.scale_address; +#endif + +#ifndef PADDLE_MOBILE_ZU5 + return 0; +#endif + + return 0; +} + +int ComputeFpgaPool(const struct PoolingArgs &args) { +#ifdef FPGA_PRINT_MODE + DLOG << "=============ComputeFpgaPool==========="; + DLOG << " mode:" << args.mode + << " kernel_reciprocal:" << fp16_2_fp32(args.kernel_reciprocal); + DLOG << " image_address:" << args.image.address + << " image_scale_address:" << args.image.scale_address + << " image_channels:" << args.image.channels + << " image_height:" << args.image.height + << " image_width:" << args.image.width + << " pad_height:" << args.image.pad_height + << " pad_width:" << args.image.pad_width; + DLOG << " kernel_height:" << args.kernel.height + << " kernel_width:" << args.kernel.width + << " stride_h:" << args.kernel.stride_h + << " stride_w:" << args.kernel.stride_w; + DLOG << " out_address:" << args.output.address + << " out_scale_address:" << args.output.scale_address; +#endif +#ifndef PADDLE_MOBILE_ZU5 + return 0; +#endif + return 0; +} + +int ComputeFpgaEWAdd(const struct EWAddArgs &args) { +#ifdef FPGA_PRINT_MODE + DLOG << "=============ComputeFpgaEWAdd==========="; + DLOG << " relu_enabled:" << args.relu_enabled + << " const0:" << fp16_2_fp32(int16_t(args.const0)) + << " const1:" << fp16_2_fp32(int16_t(args.const1)); + DLOG << " image0_address:" << args.image0.address + << " image0_scale_address:" << args.image0.scale_address + << " image0_channels:" << args.image0.channels + << " image0_height:" << args.image0.height + << " image0_width:" << args.image0.width + << " pad0_height:" << args.image0.pad_height + << " pad0_width:" << args.image0.pad_width; + DLOG << " image1_address:" << args.image1.address + << " image1_scale_address:" << args.image1.scale_address + << " image1_channels:" << args.image1.channels + << " image1_height:" << args.image1.height + << " image1_width:" << args.image1.width + << " pad1_height:" << args.image1.pad_height + << " pad_width:" << args.image1.pad_width; + DLOG << " out_address:" << args.output.address + << " out_scale_address:" << args.output.scale_address; +#endif +#ifndef PADDLE_MOBILE_ZU5 + return 0; +#endif + return 0; +} + +int PerformBypass(const struct BypassArgs &args) { +#ifdef FPGA_PRINT_MODE + DLOG << "=============ComputeFpgaBypass==========="; + DLOG << " input_type:" << args.input_data_type + << " output_type:" << args.output_data_type + << " input_layout_type:" << args.input_layout_type + << " output_layout_type:" << args.output_layout_type; + DLOG << " image_address:" << args.image.address + << " image_scale_address:" << args.image.scale_address + << " image_channels:" << args.image.channels + << " image_height:" << args.image.height + << " image_width:" << args.image.width + << " pad_height:" << args.image.pad_height + << " pad_width:" << args.image.pad_width; + DLOG << " out_address:" << args.output.address + << " out_scale_address:" << args.output.scale_address; +#endif +#ifndef PADDLE_MOBILE_ZU5 + return 0; +#endif + + return 0; +} + +int ComputeFPGAConcat(const struct ConcatArgs &args) { +#ifdef FPGA_PRINT_MODE + DLOG << "=============ComputeFpgaConcat==========="; + DLOG << " Image_num: " << args.image_num + << " out_address:" << args.image_out + << " out_scale_address:" << args.scale_out + << " out_channel:" << args.out_channel; + DLOG << " image_height:" << args.height << " image_width:" << args.width; + for (int i = 0; i < args.image_num; i++) { + DLOG << " " << i << "th: "; + DLOG << " channel_num:" << args.channel_num[i] + << " aligned_channel_num:" << args.aligned_channel_num[i] + << " image_address:" << args.images_in[i] + << " image_scale_address:" << args.scales_in[i]; + } +#endif + + image::concat_images(args.images_in, args.scales_in, args.image_out, + args.scale_out, args.image_num, args.channel_num, + args.height, args.width); + return 0; +} + +} // namespace fpga +} // namespace paddle_mobile diff --git a/src/fpga/V2/api.cpp b/src/fpga/V2/api.cpp index 2f8a9f119e643b3836ef2c541e098f39ab3cbd17..5bfd34104600668ce63a9c7d684d4482d5d804fb 100644 --- a/src/fpga/V2/api.cpp +++ b/src/fpga/V2/api.cpp @@ -13,84 +13,13 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "fpga/V2/api.h" -#include #include "fpga/V2/bias_scale.h" -#include "fpga/V2/config.h" -#include "fpga/V2/driver/driver.h" #include "fpga/V2/filter.h" #include "fpga/V2/image.h" namespace paddle_mobile { namespace fpga { -static std::map memory_map; - -int open_device() { - int ret = driver::open_device_driver(); - return ret; -} - -int close_device() { - int ret = driver::close_device_driver(); - return ret; -} - -void *fpga_malloc(size_t size) { - static uint64_t counter = 0; -#ifdef PADDLE_MOBILE_ZU5 - auto ptr = driver::fpga_malloc_driver(size); -#else - auto ptr = malloc(size); -#endif - counter += size; - memory_map.insert(std::make_pair(ptr, size)); - // DLOG << "Address: " << ptr << ", " << size << " bytes allocated. Total " - // << counter << " bytes"; - return ptr; -} - -void fpga_free(void *ptr) { - static uint64_t counter = 0; - size_t size = 0; - auto iter = memory_map.find(ptr); // std::map::iterator - if (iter != memory_map.end()) { - size = iter->second; - memory_map.erase(iter); -#ifdef PADDLE_MOBILE_ZU5 - driver::fpga_free_driver(ptr); -#else - free(ptr); -#endif - counter += size; - // DLOG << "Address: " << ptr << ", " << size << " bytes freed. Total " - // << counter << " bytes"; - } else { - DLOG << "Invalid pointer"; - } -} -void fpga_copy(void *dest, const void *src, size_t num) { -#ifdef PADDLE_MOBILE_ZU5 - driver::fpga_copy_driver(dest, src, num); -#else - memcpy(dest, src, num); -#endif -} - -int fpga_flush(void *address, size_t size) { -#ifdef PADDLE_MOBILE_ZU5 - return driver::fpga_flush_driver(address, size); -#else - return 0; -#endif -} -int fpga_invalidate(void *address, size_t size) { -#ifdef PADDLE_MOBILE_ZU5 - return driver::fpga_invalidate_driver(address, size); -#else - return 0; -#endif -} - void format_image(framework::Tensor *image_tensor) { auto dims = image_tensor->dims(); auto channel = dims[1], height = dims[2], width = dims[3]; @@ -284,8 +213,8 @@ void fill_split_arg(struct SplitConvArgs *arg, framework::Tensor *input, arg->conv_arg[i].output.address = out_ptr; arg->conv_arg[i].output.scale_address = out->scale; - int num_after_alignment = - filter::calc_aligned_num((int)input->dims()[1], arg->filter_num); + int num_after_alignment = filter::calc_aligned_num( + (int)input->dims()[1], arg->filter_num); // NOLINT arg->conv_arg[i].free_space = fpga_malloc(num_after_alignment * 2 * sizeof(half)); } diff --git a/src/fpga/V2/api.h b/src/fpga/V2/api.h index 1f4a203936b517d93e2d417b08a8b8456cc1fc93..1386810164d72ef849162b76a8b83fcf32082907 100644 --- a/src/fpga/V2/api.h +++ b/src/fpga/V2/api.h @@ -14,21 +14,13 @@ limitations under the License. */ #pragma once -#include "fpga/V2/driver/pe.h" -#include "fpga/V2/fpga_common.h" +#include "fpga/common/fpga_common.h" +#include "fpga/common/pe.h" #include "framework/tensor.h" namespace paddle_mobile { namespace fpga { -int open_device(); -int close_device(); -void* fpga_malloc(size_t size); -void fpga_free(void* ptr); -void fpga_copy(void* dest, const void* src, size_t num); -int fpga_flush(void* address, size_t size); -int fpga_invalidate(void* address, size_t size); - float filter_find_max(framework::Tensor* filter_tensor); int get_aligned_channel_num(int channel_num); int get_aligned_filter_num(framework::Tensor* filter_tensor); diff --git a/src/fpga/V2/bias_scale.cpp b/src/fpga/V2/bias_scale.cpp index 3afd3f51bbb10e3bb2d66195fcc54d25c56e2393..c8f587da330c6e6e9e35969d58ae27f4366830d2 100644 --- a/src/fpga/V2/bias_scale.cpp +++ b/src/fpga/V2/bias_scale.cpp @@ -14,7 +14,7 @@ limitations under the License. */ #include "fpga/V2/bias_scale.h" #include -#include "fpga/V2/api.h" +#include "fpga/common/fpga_common.h" namespace paddle_mobile { namespace fpga { diff --git a/src/fpga/V2/filter.cpp b/src/fpga/V2/filter.cpp index ce278edbeed64f2ca413c1f75ff620ee1f44c83d..b17ce4406bf1b6b4619d0e9e75d3f432dfa84fb1 100644 --- a/src/fpga/V2/filter.cpp +++ b/src/fpga/V2/filter.cpp @@ -15,7 +15,7 @@ limitations under the License. */ #include "fpga/V2/filter.h" #include #include -#include "fpga/V2/api.h" +#include "fpga/common/fpga_common.h" namespace paddle_mobile { namespace fpga { @@ -73,7 +73,7 @@ void convert_to_hwc(float **data_in, int num, int channel, int height, void align_filter(float **data_in, int num, int channel, int height, int width) { - int aligned_channel = calc_channel_parallelism(channel); + int aligned_channel = calc_aligned_channel(channel); int hw = height * width; int pixel_num = calc_aligned_total_pixel_num(num, channel, height, width); float *new_data = (float *)fpga_malloc(pixel_num * sizeof(float)); // NOLINT diff --git a/src/fpga/V2/fpga_common.cpp b/src/fpga/V2/fpga_common.cpp deleted file mode 100644 index 01bca30a9ccf79232e1f28bbf77b1c030632f5bc..0000000000000000000000000000000000000000 --- a/src/fpga/V2/fpga_common.cpp +++ /dev/null @@ -1,44 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include -namespace paddle_mobile { -namespace fpga { - -int16_t fp32_2_fp16(float fp32_num) { - unsigned long tmp = *(unsigned long *)(&fp32_num); // NOLINT - auto t = (int16_t)(((tmp & 0x007fffff) >> 13) | ((tmp & 0x80000000) >> 16) | - (((tmp & 0x7f800000) >> 13) - (112 << 10))); - if (tmp & 0x1000) { - t++; // roundoff - } - return t; -} - -float fp16_2_fp32(int16_t fp16_num) { - if (0 == fp16_num) { - return 0; - } - int frac = (fp16_num & 0x3ff); - int exp = ((fp16_num & 0x7c00) >> 10) + 112; - int s = fp16_num & 0x8000; - int tmp = 0; - float fp32_num; - tmp = s << 16 | exp << 23 | frac << 13; - fp32_num = *(float *)&tmp; // NOLINT - return fp32_num; -} - -} // namespace fpga -} // namespace paddle_mobile diff --git a/src/fpga/V2/image.cpp b/src/fpga/V2/image.cpp index 26829bfba65f2375b27251070b33b2bbe57d069b..3d1ed95df2a805c8c64f9184e0a720f5449d6181 100644 --- a/src/fpga/V2/image.cpp +++ b/src/fpga/V2/image.cpp @@ -15,7 +15,7 @@ limitations under the License. */ #include "fpga/V2/image.h" #include #include -#include "fpga/V2/api.h" +#include "fpga/common/fpga_common.h" namespace paddle_mobile { namespace fpga { diff --git a/src/fpga/V2/driver/pe.cpp b/src/fpga/V2/pe.cpp similarity index 79% rename from src/fpga/V2/driver/pe.cpp rename to src/fpga/V2/pe.cpp index 2e806bfb37c131fad1c011c960bc79aa1b121186..5a1114cd5e9917532a6bf086c868783518401007 100644 --- a/src/fpga/V2/driver/pe.cpp +++ b/src/fpga/V2/pe.cpp @@ -12,11 +12,11 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include "fpga/V2/driver/pe.h" -#include "fpga/V2/config.h" -#include "fpga/V2/driver/driver.h" +#include "fpga/common/pe.h" #include "fpga/V2/filter.h" #include "fpga/V2/image.h" +#include "fpga/common/config.h" +#include "fpga/common/driver.h" namespace paddle_mobile { namespace fpga { @@ -166,53 +166,53 @@ int PerformBypass(const struct BypassArgs &args) { return 0; #endif - uint64_t ifm_src_paddr = driver::vaddr_to_paddr(args.image.address); - uint64_t ifm_dst_paddr = driver::vaddr_to_paddr(args.output.address); - uint64_t bp_enable; - int64_t length; - uint64_t pixels; - - // fp32->fp16 - if ((args.input_data_type) && (!args.output_data_type)) { - pixels = (args.image.channels) * (args.image.width) * (args.image.height); - length = pixels * sizeof(float); - bp_enable = 0x8800000000000000 + length; - } - // fp16->fp32 - else if ((!args.input_data_type) && (args.output_data_type)) { - pixels = filter::calc_aligned_channel((args.image.channels)) * - (args.image.width) * (args.image.height); - length = pixels * sizeof(short); - length = align_to_x((int)length, 64); // NOLINT - bp_enable = 0x8a00000000000000 + length; - } - // fp16->fp16 findmax - else if ((!args.input_data_type) && (!args.output_data_type)) { - pixels = (args.image.channels) * (args.image.width) * (args.image.height); - length = pixels * sizeof(short); - bp_enable = 0x8900000000000000 + length; - } else { - return -1; - } - - // start bypass - driver::reg_writeq(ifm_src_paddr, MUL8(27)); - driver::reg_writeq(ifm_dst_paddr, MUL8(28)); - driver::reg_writeq(0, MUL8(0)); - driver::reg_writeq(bp_enable, MUL8(0)); - // poll - int ret = -1; - ret = driver::fpga_regpoll(MUL8(48), BYPASS_DONE, 0xffffffff); - if (ret != -1) { - // clear "irq" - driver::reg_readq(MUL8(63)); - } - // get max value - if ((!args.input_data_type) && (!args.output_data_type)) { - float scale = Findfp16Max(); - args.output.scale_address[0] = (float)(1.0 / scale); // NOLINT - args.output.scale_address[1] = scale; - } + // uint64_t ifm_src_paddr = driver::vaddr_to_paddr(args.image.address); + // uint64_t ifm_dst_paddr = driver::vaddr_to_paddr(args.output.address); + // uint64_t bp_enable; + // int64_t length; + // uint64_t pixels; + // + // // fp32->fp16 + // if ((args.input_data_type) && (!args.output_data_type)) { + // pixels = (args.image.channels) * (args.image.width) * + // (args.image.height); length = pixels * sizeof(float); bp_enable = + // 0x8800000000000000 + length; + // } + // // fp16->fp32 + // else if ((!args.input_data_type) && (args.output_data_type)) { + // pixels = filter::calc_aligned_channel((args.image.channels)) * + // (args.image.width) * (args.image.height); + // length = pixels * sizeof(short); + // length = align_to_x((int)length, 64); // NOLINT + // bp_enable = 0x8a00000000000000 + length; + // } + // // fp16->fp16 findmax + // else if ((!args.input_data_type) && (!args.output_data_type)) { + // pixels = (args.image.channels) * (args.image.width) * + // (args.image.height); length = pixels * sizeof(short); bp_enable = + // 0x8900000000000000 + length; + // } else { + // return -1; + // } + // + // // start bypass + // driver::reg_writeq(ifm_src_paddr, MUL8(27)); + // driver::reg_writeq(ifm_dst_paddr, MUL8(28)); + // driver::reg_writeq(0, MUL8(0)); + // driver::reg_writeq(bp_enable, MUL8(0)); + // // poll + // int ret = -1; + // ret = driver::fpga_regpoll(MUL8(48), BYPASS_DONE, 0xffffffff); + // if (ret != -1) { + // // clear "irq" + // driver::reg_readq(MUL8(63)); + // } + // // get max value + // if ((!args.input_data_type) && (!args.output_data_type)) { + // float scale = Findfp16Max(); + // args.output.scale_address[0] = (float)(1.0 / scale); // NOLINT + // args.output.scale_address[1] = scale; + // } return ret; } diff --git a/src/fpga/V2/driver/bitmap.cpp b/src/fpga/common/bitmap.cpp similarity index 99% rename from src/fpga/V2/driver/bitmap.cpp rename to src/fpga/common/bitmap.cpp index c612faa6aed11b683ff81fffdf6c57a6fed9536d..9742a4559927b0520b32eeabc757f5a0f4e3392a 100644 --- a/src/fpga/V2/driver/bitmap.cpp +++ b/src/fpga/common/bitmap.cpp @@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include "fpga/V2/driver/bitmap.h" +#include "fpga/common/bitmap.h" namespace fpga_bitmap { void bitmap_set(uint64_t *map, unsigned int start, int len) { diff --git a/src/fpga/V2/driver/bitmap.h b/src/fpga/common/bitmap.h similarity index 100% rename from src/fpga/V2/driver/bitmap.h rename to src/fpga/common/bitmap.h diff --git a/src/fpga/V2/config.h b/src/fpga/common/config.h similarity index 100% rename from src/fpga/V2/config.h rename to src/fpga/common/config.h diff --git a/src/fpga/V2/driver/driver.cpp b/src/fpga/common/driver.cpp similarity index 96% rename from src/fpga/V2/driver/driver.cpp rename to src/fpga/common/driver.cpp index d7e71782676fd350f938847c03e9736ff0adb64a..8c59ac14fb11282b29a837152194d873bd65d87d 100644 --- a/src/fpga/V2/driver/driver.cpp +++ b/src/fpga/common/driver.cpp @@ -28,8 +28,8 @@ limitations under the License. */ #include #include "common/enforce.h" -#include "fpga/V2/driver/bitmap.h" -#include "fpga/V2/driver/driver.h" +#include "fpga/common/bitmap.h" +#include "fpga/common/driver.h" namespace paddle_mobile { namespace fpga { @@ -353,7 +353,7 @@ void fpga_free_driver(void *ptr) { } } -static inline int do_ioctl(unsigned long req, const void *arg) { +static inline int do_ioctl(int64_t req, const void *arg) { return ioctl(g_fpgainfo.fd_mem, req, arg); } @@ -363,7 +363,7 @@ int fpga_flush_driver(void *address, size_t size) { p_addr = vaddr_to_paddr(address); - args.offset = (void *)(p_addr - FPGA_MEM_PHY_ADDR); + args.offset = (void *)(p_addr - FPGA_MEM_PHY_ADDR); // NOLINT args.size = size; return do_ioctl(IOCTL_MEMCACHE_FLUSH, &args); @@ -375,7 +375,7 @@ int fpga_invalidate_driver(void *address, size_t size) { p_addr = vaddr_to_paddr(address); - args.offset = (void *)(p_addr - FPGA_MEM_PHY_ADDR); + args.offset = (void *)(p_addr - FPGA_MEM_PHY_ADDR); // NOLINT args.size = size; return do_ioctl(IOCTL_MEMCACHE_INVAL, &args); @@ -389,7 +389,7 @@ void fpga_copy_driver(void *dest, const void *src, size_t num) { for (i = 0; i < num; i++) { // DLOG << "i:" << i << " val:" << *((int8_t *)src + i); // usleep(1); - *((int8_t *)dest + i) = *((int8_t *)src + i); + *((int8_t *)dest + i) = *((int8_t *)src + i); // NOLINT } return; diff --git a/src/fpga/V2/driver/driver.h b/src/fpga/common/driver.h similarity index 90% rename from src/fpga/V2/driver/driver.h rename to src/fpga/common/driver.h index 633e95ea8204ada2a330a6bb4fab4ce8fe23248b..2dad07ec5206a7ca64449aa38ebe0603d72b71e3 100644 --- a/src/fpga/V2/driver/driver.h +++ b/src/fpga/common/driver.h @@ -33,8 +33,6 @@ namespace driver { #define FPGA_MEM_PHY_ADDR 0x20000000 #define FPGA_MEM_SIZE 0x20000000 -#define CPU_FREQ 1000000000 - #define FPGA_PAGE_SIZE (16UL * 1024UL) // PE related macros @@ -53,7 +51,7 @@ struct MemoryCacheArgs { size_t size; }; -#define IOCTL_FPGA_MAGIC 'FPGA' +#define IOCTL_FPGA_MAGIC 'F' #define IOCTL_MEMCACHE_INVAL _IOW(IOCTL_FPGA_MAGIC, 12, struct MemoryCacheArgs) #define IOCTL_MEMCACHE_FLUSH _IOW(IOCTL_FPGA_MAGIC, 13, struct MemoryCacheArgs) @@ -105,17 +103,17 @@ extern struct FPGA_INFO g_fpgainfo; inline uint64_t reg_readq(uint32_t offset) { // DLOG << "offset : " << offset; - uint64_t value = *(volatile uint64_t *)((uint8_t *)g_fpgainfo.FpgaRegVirAddr + - offset); // NOLINT + uint64_t value = + *(volatile uint64_t *)((uint8_t *)g_fpgainfo.FpgaRegVirAddr + // NOLINT + offset); // NOLINT return value; } inline void reg_writeq(uint64_t value, uint32_t offset) { // DLOG << "offset : " << offset << ", value : " << value; - *(volatile uint64_t *)((uint8_t *)g_fpgainfo.FpgaRegVirAddr + - offset) = // NOLINT - value; + *(volatile uint64_t *)((uint8_t *)g_fpgainfo.FpgaRegVirAddr + // NOLINT + offset) = value; } int open_device_driver(); diff --git a/src/fpga/common/fpga_common.cpp b/src/fpga/common/fpga_common.cpp new file mode 100644 index 0000000000000000000000000000000000000000..9c7ae838fa4216d121cf38a11ef4897043b9a0dd --- /dev/null +++ b/src/fpga/common/fpga_common.cpp @@ -0,0 +1,117 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "fpga/common/fpga_common.h" +#include +#include +#include "fpga/common/config.h" +#include "fpga/common/driver.h" + +namespace paddle_mobile { +namespace fpga { + +int16_t fp32_2_fp16(float fp32_num) { + unsigned long tmp = *(unsigned long *)(&fp32_num); // NOLINT + auto t = (int16_t)(((tmp & 0x007fffff) >> 13) | ((tmp & 0x80000000) >> 16) | + (((tmp & 0x7f800000) >> 13) - (112 << 10))); + if (tmp & 0x1000) { + t++; // roundoff + } + return t; +} + +float fp16_2_fp32(int16_t fp16_num) { + if (0 == fp16_num) { + return 0; + } + int frac = (fp16_num & 0x3ff); + int exp = ((fp16_num & 0x7c00) >> 10) + 112; + int s = fp16_num & 0x8000; + int tmp = 0; + float fp32_num; + tmp = s << 16 | exp << 23 | frac << 13; + fp32_num = *(float *)&tmp; // NOLINT + return fp32_num; +} + +static std::map memory_map; + +int open_device() { + int ret = driver::open_device_driver(); + return ret; +} + +int close_device() { + int ret = driver::close_device_driver(); + return ret; +} + +void *fpga_malloc(size_t size) { + static uint64_t counter = 0; +#ifdef PADDLE_MOBILE_ZU5 + auto ptr = driver::fpga_malloc_driver(size); +#else + auto ptr = malloc(size); +#endif + counter += size; + memory_map.insert(std::make_pair(ptr, size)); + // DLOG << "Address: " << ptr << ", " << size << " bytes allocated. Total " + // << counter << " bytes"; + return ptr; +} + +void fpga_free(void *ptr) { + static uint64_t counter = 0; + size_t size = 0; + auto iter = memory_map.find(ptr); // std::map::iterator + if (iter != memory_map.end()) { + size = iter->second; + memory_map.erase(iter); +#ifdef PADDLE_MOBILE_ZU5 + driver::fpga_free_driver(ptr); +#else + free(ptr); +#endif + counter += size; + // DLOG << "Address: " << ptr << ", " << size << " bytes freed. Total " + // << counter << " bytes"; + } else { + DLOG << "Invalid pointer"; + } +} +void fpga_copy(void *dest, const void *src, size_t num) { +#ifdef PADDLE_MOBILE_ZU5 + driver::fpga_copy_driver(dest, src, num); +#else + memcpy(dest, src, num); +#endif +} + +int fpga_flush(void *address, size_t size) { +#ifdef PADDLE_MOBILE_ZU5 + return driver::fpga_flush_driver(address, size); +#else + return 0; +#endif +} +int fpga_invalidate(void *address, size_t size) { +#ifdef PADDLE_MOBILE_ZU5 + return driver::fpga_invalidate_driver(address, size); +#else + return 0; +#endif +} + +} // namespace fpga +} // namespace paddle_mobile diff --git a/src/fpga/V2/fpga_common.h b/src/fpga/common/fpga_common.h similarity index 91% rename from src/fpga/V2/fpga_common.h rename to src/fpga/common/fpga_common.h index 1862d843503ee8faf58caf038202e198ca079905..430014ef654ec2f00eeb2548012e4ae716f4aa8b 100644 --- a/src/fpga/V2/fpga_common.h +++ b/src/fpga/common/fpga_common.h @@ -14,6 +14,7 @@ limitations under the License. */ #pragma once +#include #include namespace paddle_mobile { @@ -117,9 +118,19 @@ struct BypassArgs { struct DeconvArgs { struct ConvArgs conv_arg; }; + static inline int align_to_x(int num, int x) { return (num + x - 1) / x * x; } + int16_t fp32_2_fp16(float fp32_num); float fp16_2_fp32(int16_t fp16_num); +int open_device(); +int close_device(); +void* fpga_malloc(size_t size); +void fpga_free(void* ptr); +void fpga_copy(void* dest, const void* src, size_t num); +int fpga_flush(void* address, size_t size); +int fpga_invalidate(void* address, size_t size); + } // namespace fpga } // namespace paddle_mobile diff --git a/src/fpga/V2/driver/pe.h b/src/fpga/common/pe.h similarity index 96% rename from src/fpga/V2/driver/pe.h rename to src/fpga/common/pe.h index 4903bf4c33f6b5d5899c56eeaada8c7a21d1a875..0da13b8396b7f6a7960dfbb36337f3b38c7ac865 100644 --- a/src/fpga/V2/driver/pe.h +++ b/src/fpga/common/pe.h @@ -13,7 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. */ #pragma once -#include "fpga/V2/fpga_common.h" +#include "fpga/common/fpga_common.h" namespace paddle_mobile { namespace fpga { diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt index c6fce2bbb6f3f4fa14501387f415836b0be88a3b..2a4a8e7e214d9d2b9651884e193e4a1576092882 100644 --- a/test/CMakeLists.txt +++ b/test/CMakeLists.txt @@ -67,9 +67,6 @@ if (CON GREATER -1) ADD_EXECUTABLE(test-resnet50 fpga/test_resnet50.cpp test_helper.h test_include.h executor_for_test.h) target_link_libraries(test-resnet50 paddle-mobile) - ADD_EXECUTABLE(test-densebox net/test_densebox_combine.cpp test_helper.h test_include.h executor_for_test.h) - target_link_libraries(test-densebox paddle-mobile) - set(FOUND_MATCH ON) endif () @@ -81,9 +78,6 @@ if (CON GREATER -1) ADD_EXECUTABLE(test-pe fpga/test_pe.cpp) target_link_libraries(test-pe paddle-mobile) - ADD_EXECUTABLE(test-densebox net/test_densebox_combine.cpp test_helper.h test_include.h executor_for_test.h) - target_link_libraries(test-densebox paddle-mobile) - set(FOUND_MATCH ON) endif () diff --git a/tools/op.cmake b/tools/op.cmake index 5901a23a1ff50c357d69cfff63cdfd543dbf8f9d..3c70f1754fbdddd9594cb25731979f17137f66d4 100644 --- a/tools/op.cmake +++ b/tools/op.cmake @@ -102,7 +102,6 @@ if (CON GREATER -1) set(MUL_OP ON) set(RESHAPE_OP ON) set(SOFTMAX_OP ON) - set(FOUND_MATCH ON) endif() @@ -120,14 +119,12 @@ if (CON GREATER -1) set(SOFTMAX_OP ON) set(FUSION_CONVBNRELU_OP ON) set(FUSION_CONVBN_OP ON) - set(FUSION_CONVADD_OP ON) set(FOUND_MATCH ON) endif() list(FIND NET "FPGA_NET_V2" CON) if (CON GREATER -1) message("FPGA_NET_V2 enabled") - set(FEED_OP ON) set(FUSION_CONVADDRELU_OP ON) set(FUSION_ELEMENTWISEADDRELU_OP ON) set(FUSION_FC_OP ON) @@ -136,8 +133,6 @@ if (CON GREATER -1) set(FUSION_CONVBNRELU_OP ON) set(FUSION_CONVBN_OP ON) set(CONV_TRANSPOSE_OP ON) - set(FUSION_DECONVRELU_OP ON) - #set(SLICE_OP ON) set(TANH_OP ON) set(ELEMENTWISEADD_OP ON) set(TRANSPOSE2_OP ON)