From c812a2513fd944cc719cb7210d8d265075838e16 Mon Sep 17 00:00:00 2001 From: zhangyang Date: Mon, 12 Nov 2018 22:45:44 +0800 Subject: [PATCH] update V2 for FPGA track --- src/common/types.cpp | 6 +- src/common/types.h | 3 + src/fpga/V2/api.cpp | 98 +++++++------- src/fpga/V2/api.h | 109 +-------------- src/fpga/V2/bias_scale.cpp | 3 +- src/fpga/V2/driver/bitmap.cpp | 4 +- src/fpga/V2/driver/bitmap.h | 2 +- src/fpga/V2/driver/driver.cpp | 90 +++++++++++-- src/fpga/V2/driver/driver.h | 29 +++- src/fpga/V2/driver/pe.cpp | 28 ++-- src/fpga/V2/driver/pe.h | 3 +- src/fpga/V2/filter.cpp | 3 + src/fpga/V2/fpga_common.cpp | 44 ++++++ src/fpga/V2/fpga_common.h | 125 ++++++++++++++++++ src/fpga/V2/image.cpp | 4 + src/framework/executor.cpp | 13 +- src/io/paddle_mobile.cpp | 6 +- src/memory/t_malloc.cpp | 2 +- src/operators/conv_transpose_op.cpp | 1 + src/operators/fusion_deconv_relu_op.cpp | 32 +++++ src/operators/fusion_deconv_relu_op.h | 107 +++++++++++++++ src/operators/kernel/deconv_relu_kernel.h | 39 ++++++ .../kernel/fpga/V2/conv_transpose_kernel.cpp | 34 +++++ .../kernel/fpga/V2/deconv_relu_kernel.cpp | 36 +++++ src/operators/kernel/fpga/V2/slice_kernel.cpp | 1 + .../kernel/fpga/V2/softmax_kernel.cpp | 5 + src/operators/kernel/fpga/V2/tanh_kernel.cpp | 33 +++++ src/operators/kernel/tanh_kernel.h | 37 ++++++ src/operators/op_param.h | 36 +++++ src/operators/tanh_op.cpp | 35 +++++ src/operators/tanh_op.h | 44 ++++++ test/CMakeLists.txt | 3 + test/fpga/test_pe.cpp | 111 ++++++++++++++++ tools/op.cmake | 14 +- 34 files changed, 947 insertions(+), 193 deletions(-) create mode 100644 src/fpga/V2/fpga_common.cpp create mode 100644 src/fpga/V2/fpga_common.h create mode 100644 src/operators/fusion_deconv_relu_op.cpp create mode 100644 src/operators/fusion_deconv_relu_op.h create mode 100644 src/operators/kernel/deconv_relu_kernel.h create mode 100644 src/operators/kernel/fpga/V2/conv_transpose_kernel.cpp create mode 100644 src/operators/kernel/fpga/V2/deconv_relu_kernel.cpp create mode 100644 src/operators/kernel/fpga/V2/tanh_kernel.cpp create mode 100644 src/operators/kernel/tanh_kernel.h create mode 100644 src/operators/tanh_op.cpp create mode 100644 src/operators/tanh_op.h create mode 100644 test/fpga/test_pe.cpp diff --git a/src/common/types.cpp b/src/common/types.cpp index ede49478ce..510313d9fe 100644 --- a/src/common/types.cpp +++ b/src/common/types.cpp @@ -71,6 +71,8 @@ const char *G_OP_TYPE_SUM = "sum"; const char *G_OP_TYPE_QUANTIZE = "quantize"; const char *G_OP_TYPE_DEQUANTIZE = "dequantize"; +extern const char *G_OP_TYPE_TANH = "tanh"; +extern const char *G_OP_TYPE_FUSION_DECONV_RELU = "fusion_deconv_relu"; std::unordered_map< std::string, std::pair, std::vector>> @@ -129,5 +131,7 @@ std::unordered_map< {G_OP_TYPE_SUM, {{"X"}, {"Out"}}}, {G_OP_TYPE_ELEMENTWISE_MUL, {{"X", "Y"}, {"Out"}}}, {G_OP_TYPE_QUANTIZE, {{"X"}, {"Out", "OutScale"}}}, - {G_OP_TYPE_DEQUANTIZE, {{"X", "Scale"}, {"Out"}}}}; + {G_OP_TYPE_DEQUANTIZE, {{"X", "Scale"}, {"Out"}}}, + {G_OP_TYPE_TANH, {{"X"}, {"Out"}}}, + {G_OP_TYPE_FUSION_DECONV_RELU, {{"Input"}, {"Out"}}}}; } // namespace paddle_mobile diff --git a/src/common/types.h b/src/common/types.h index 70f6debf87..4cd35ac910 100644 --- a/src/common/types.h +++ b/src/common/types.h @@ -139,6 +139,9 @@ extern const char *G_OP_TYPE_ELEMENTWISE_MUL; extern const char *G_OP_TYPE_QUANTIZE; extern const char *G_OP_TYPE_DEQUANTIZE; +extern const char *G_OP_TYPE_TANH; +extern const char *G_OP_TYPE_FUSION_DECONV_RELU; + extern std::unordered_map< std::string, std::pair, 
std::vector>> op_input_output_key; diff --git a/src/fpga/V2/api.cpp b/src/fpga/V2/api.cpp index 324ee4f538..2f8a9f119e 100644 --- a/src/fpga/V2/api.cpp +++ b/src/fpga/V2/api.cpp @@ -16,27 +16,29 @@ limitations under the License. */ #include #include "fpga/V2/bias_scale.h" #include "fpga/V2/config.h" +#include "fpga/V2/driver/driver.h" #include "fpga/V2/filter.h" #include "fpga/V2/image.h" namespace paddle_mobile { namespace fpga { + static std::map memory_map; int open_device() { - int ret = open_device_driver(); + int ret = driver::open_device_driver(); return ret; } int close_device() { - int ret = close_device_driver(); + int ret = driver::close_device_driver(); return ret; } void *fpga_malloc(size_t size) { static uint64_t counter = 0; #ifdef PADDLE_MOBILE_ZU5 - auto ptr = fpga_malloc_driver(size); + auto ptr = driver::fpga_malloc_driver(size); #else auto ptr = malloc(size); #endif @@ -55,7 +57,7 @@ void fpga_free(void *ptr) { size = iter->second; memory_map.erase(iter); #ifdef PADDLE_MOBILE_ZU5 - fpga_free_driver(ptr); + driver::fpga_free_driver(ptr); #else free(ptr); #endif @@ -66,26 +68,27 @@ void fpga_free(void *ptr) { DLOG << "Invalid pointer"; } } - -half fp32_2_fp16(float fp32_num) { - unsigned long tmp = *(unsigned long *)(&fp32_num); // NOLINT - auto t = (half)(((tmp & 0x007fffff) >> 13) | ((tmp & 0x80000000) >> 16) | - (((tmp & 0x7f800000) >> 13) - (112 << 10))); - if (tmp & 0x1000) { - t++; // roundoff - } - return t; +void fpga_copy(void *dest, const void *src, size_t num) { +#ifdef PADDLE_MOBILE_ZU5 + driver::fpga_copy_driver(dest, src, num); +#else + memcpy(dest, src, num); +#endif } -float fp16_2_fp32(half fp16_num) { - int frac = (fp16_num & 0x3ff); - int exp = ((fp16_num & 0x7c00) >> 10) + 112; - int s = fp16_num & 0x8000; - int tmp = 0; - float fp32_num; - tmp = s << 16 | exp << 23 | frac << 13; - fp32_num = *(float *)&tmp; // NOLINT - return fp32_num; +int fpga_flush(void *address, size_t size) { +#ifdef PADDLE_MOBILE_ZU5 + return driver::fpga_flush_driver(address, size); +#else + return 0; +#endif +} +int fpga_invalidate(void *address, size_t size) { +#ifdef PADDLE_MOBILE_ZU5 + return driver::fpga_invalidate_driver(address, size); +#else + return 0; +#endif } void format_image(framework::Tensor *image_tensor) { @@ -240,7 +243,7 @@ void fill_split_arg(struct SplitConvArgs *arg, framework::Tensor *input, arg->filter_num = (uint32_t)filter->dims()[0]; arg->output.address = out_ptr; arg->output.scale_address = out->scale; - arg->conv_args = + arg->conv_arg = (ConvArgs *)fpga_malloc(arg->split_num * sizeof(ConvArgs)); // NOLINT arg->concat_arg.image_num = arg->split_num; @@ -258,28 +261,33 @@ void fill_split_arg(struct SplitConvArgs *arg, framework::Tensor *input, (uint32_t *)fpga_malloc(n * sizeof(uint32_t)); // NOLINT for (int i = 0; i < n; i++) { - arg->conv_args[i].relu_enabled = relu_enabled; - arg->conv_args[i].sb_address = bs_ptr; - arg->conv_args[i].filter_address = (int8_t *)filter_ptr; // NOLINT - arg->conv_args[i].filter_scale_address = filter->scale; - arg->conv_args[i].filter_num = arg->filter_num; - arg->conv_args[i].group_num = (uint32_t)group_num; - - arg->conv_args[i].kernel.stride_h = (uint32_t)stride_h; - arg->conv_args[i].kernel.stride_w = (uint32_t)stride_w; - arg->conv_args[i].kernel.height = (uint32_t)filter->dims()[2]; - arg->conv_args[i].kernel.width = (uint32_t)filter->dims()[3]; - - arg->conv_args[i].image.address = input_ptr; - arg->conv_args[i].image.scale_address = input->scale; - arg->conv_args[i].image.channels = 
(uint32_t)input->dims()[1]; - arg->conv_args[i].image.height = (uint32_t)input->dims()[2]; - arg->conv_args[i].image.width = (uint32_t)input->dims()[3]; - arg->conv_args[i].image.pad_height = (uint32_t)padding_h; - arg->conv_args[i].image.pad_width = (uint32_t)padding_w; - - arg->conv_args[i].output.address = out_ptr; - arg->conv_args[i].output.scale_address = out->scale; + arg->conv_arg[i].relu_enabled = relu_enabled; + arg->conv_arg[i].sb_address = bs_ptr; + arg->conv_arg[i].filter_address = (int8_t *)filter_ptr; // NOLINT + arg->conv_arg[i].filter_scale_address = filter->scale; + arg->conv_arg[i].filter_num = arg->filter_num; + arg->conv_arg[i].group_num = (uint32_t)group_num; + + arg->conv_arg[i].kernel.stride_h = (uint32_t)stride_h; + arg->conv_arg[i].kernel.stride_w = (uint32_t)stride_w; + arg->conv_arg[i].kernel.height = (uint32_t)filter->dims()[2]; + arg->conv_arg[i].kernel.width = (uint32_t)filter->dims()[3]; + + arg->conv_arg[i].image.address = input_ptr; + arg->conv_arg[i].image.scale_address = input->scale; + arg->conv_arg[i].image.channels = (uint32_t)input->dims()[1]; + arg->conv_arg[i].image.height = (uint32_t)input->dims()[2]; + arg->conv_arg[i].image.width = (uint32_t)input->dims()[3]; + arg->conv_arg[i].image.pad_height = (uint32_t)padding_h; + arg->conv_arg[i].image.pad_width = (uint32_t)padding_w; + + arg->conv_arg[i].output.address = out_ptr; + arg->conv_arg[i].output.scale_address = out->scale; + + int num_after_alignment = + filter::calc_aligned_num((int)input->dims()[1], arg->filter_num); + arg->conv_arg[i].free_space = + fpga_malloc(num_after_alignment * 2 * sizeof(half)); } } diff --git a/src/fpga/V2/api.h b/src/fpga/V2/api.h index aac97bec22..1f4a203936 100644 --- a/src/fpga/V2/api.h +++ b/src/fpga/V2/api.h @@ -14,118 +14,20 @@ limitations under the License. 
*/ #pragma once -#include -#include -#include -#include -#include "fpga/V2/driver/driver.h" #include "fpga/V2/driver/pe.h" +#include "fpga/V2/fpga_common.h" #include "framework/tensor.h" namespace paddle_mobile { namespace fpga { -enum DataType { - DATA_TYPE_FP32 = 1, - DATA_TYPE_FP16 = 0, -}; - -enum LayoutType { - LAYOUT_CHW = 1, - LAYOUT_HWC = 0, -}; - -struct KernelArgs { - uint32_t width; - uint32_t height; - uint32_t stride_w; - uint32_t stride_h; -}; - -struct ImageInputArgs { - void* address; // input featuremap virtual address - float* scale_address; // input scale address; - uint32_t channels; - uint32_t width; // featuremap width - uint32_t height; - uint32_t pad_width; // padding width; - uint32_t pad_height; -}; - -struct ImageOutputArgs { - void* address; // output result address; - float* scale_address; // output scale address; - uint64_t timer_cnt; // time counter for FPGA computation -}; - -struct ConvArgs { - bool relu_enabled; - void* sb_address; // scale and bias are interlaced; - void* filter_address; - float* filter_scale_address; - uint32_t filter_num; - uint32_t group_num; - - struct KernelArgs kernel; - struct ImageInputArgs image; // input image; - struct ImageOutputArgs output; -}; - -struct ConcatArgs { - uint32_t image_num; - half** images_in; - float** scales_in; - void* image_out; - float* scale_out; - uint32_t* channel_num; - uint32_t* aligned_channel_num; - uint32_t out_channel; - uint32_t height; - uint32_t width; -}; - -struct SplitConvArgs { - uint32_t split_num; - uint32_t group_num; - uint32_t filter_num; - struct ImageOutputArgs output; - struct ConvArgs* conv_args; - struct ConcatArgs concat_arg; -}; - -struct PoolingArgs { - int16_t mode; // mode: 0:max, 1:avg - half kernel_reciprocal; - struct KernelArgs kernel; - struct ImageInputArgs image; // input image; - struct ImageOutputArgs output; -}; - -struct EWAddArgs { - bool relu_enabled; - - uint32_t const0; // output0 = const0 x input0 + const1 x input1; - uint32_t const1; - struct ImageInputArgs image0; - struct ImageInputArgs image1; - struct ImageOutputArgs output; -}; - -struct BypassArgs { - enum DataType input_data_type; - enum DataType output_data_type; - enum LayoutType input_layout_type; - enum LayoutType output_layout_type; - struct ImageInputArgs image; - struct ImageOutputArgs output; -}; - int open_device(); int close_device(); void* fpga_malloc(size_t size); void fpga_free(void* ptr); - -static inline int align_to_x(int num, int x) { return (num + x - 1) / x * x; } +void fpga_copy(void* dest, const void* src, size_t num); +int fpga_flush(void* address, size_t size); +int fpga_invalidate(void* address, size_t size); float filter_find_max(framework::Tensor* filter_tensor); int get_aligned_channel_num(int channel_num); @@ -153,8 +55,5 @@ void fill_split_arg(struct SplitConvArgs* arg, framework::Tensor* input, bool relu_enabled, int group_num, int stride_h, int stride_w, int padding_h, int padding_w, float* bs_ptr); -half fp32_2_fp16(float fp32_num); -float fp16_2_fp32(half fp16_num); - } // namespace fpga } // namespace paddle_mobile diff --git a/src/fpga/V2/bias_scale.cpp b/src/fpga/V2/bias_scale.cpp index 8a0fd42619..3afd3f51bb 100644 --- a/src/fpga/V2/bias_scale.cpp +++ b/src/fpga/V2/bias_scale.cpp @@ -27,7 +27,7 @@ void align_element(float **data_in, int num, int num_after_alignment) { (float *)fpga_malloc(total_element * sizeof(float)); // NOLINT memset(ptr_aligned, 0, total_element * sizeof(float)); - for (int i = 1; i < num; i++) { + for (int i = 0; i < num; i++) { ptr_aligned[i * 
2 + 0] = ptr_unaligned[i];
     ptr_aligned[i * 2 + 1] = ptr_unaligned[i + num];
   }
@@ -39,6 +39,7 @@ void align_element(float **data_in, int num, int num_after_alignment) {
 
 void format_bias_scale_array(float **data_in, int num,
                              int num_after_alignment) {
   align_element(data_in, num, num_after_alignment);
+  fpga_flush(*data_in, 2 * num_after_alignment * sizeof(float));
 }
 
 }  // namespace bias_scale
diff --git a/src/fpga/V2/driver/bitmap.cpp b/src/fpga/V2/driver/bitmap.cpp
index 9c99f6446c..c612faa6ae 100644
--- a/src/fpga/V2/driver/bitmap.cpp
+++ b/src/fpga/V2/driver/bitmap.cpp
@@ -57,8 +57,8 @@ static uint64_t ffs(uint64_t data) {
   uint64_t bit = 0;
   int i = 0;
 
-  for (i = 0; i < sizeof(data); i++) {
-    if (data & (1 << i)) {
+  for (i = 0; i < sizeof(data) * 8; i++) {
+    if (data & (1UL << i)) {
       bit = i;
       break;
     }
diff --git a/src/fpga/V2/driver/bitmap.h b/src/fpga/V2/driver/bitmap.h
index 272cddf233..4cb1673d91 100644
--- a/src/fpga/V2/driver/bitmap.h
+++ b/src/fpga/V2/driver/bitmap.h
@@ -25,7 +25,7 @@ limitations under the License. */
 #define __ALIGN_KERNEL_MASK(x, mask) (((x) + (mask)) & ~(mask))
 #define __ALIGN_MASK(x, mask) __ALIGN_KERNEL_MASK((x), (mask))
 
-#define round_down(x, y) ((x) & ((y)-1))
+#define round_down(x, y) ((x) & ~((y)-1))
 
 namespace fpga_bitmap {
 void bitmap_set(uint64_t *map, unsigned int start, int len);
diff --git a/src/fpga/V2/driver/driver.cpp b/src/fpga/V2/driver/driver.cpp
index ed78fa5ebc..d7e7178267 100644
--- a/src/fpga/V2/driver/driver.cpp
+++ b/src/fpga/V2/driver/driver.cpp
@@ -17,6 +17,7 @@ limitations under the License. */
 #include <stdio.h>
 #include <stdlib.h>
 #include <string.h>
+#include <sys/ioctl.h>
 #include <sys/mman.h>
 #include <sys/stat.h>
 #include <unistd.h>
@@ -32,6 +33,7 @@ limitations under the License. */
 
 namespace paddle_mobile {
 namespace fpga {
+namespace driver {
 struct FPGA_INFO g_fpgainfo;
 
 int open_drvdevice() {
@@ -43,7 +45,8 @@ int open_memdevice() {
   if (g_fpgainfo.fd_mem == -1) {
-    g_fpgainfo.fd_mem = open(g_fpgainfo.memdevice_path, O_RDWR | O_DSYNC);
+    // g_fpgainfo.fd_mem = open(g_fpgainfo.memdevice_path, O_RDWR | O_DSYNC);
+    g_fpgainfo.fd_mem = open(g_fpgainfo.memdevice_path, O_RDWR);
   }
   return g_fpgainfo.fd_mem;
 }
@@ -51,7 +54,6 @@ int open_memdevice() {
 
 void pl_reset() {
   // DLOG << "PL RESET";
-  // reg_writeq(0x5a, REG_FPGA_RESET);
   usleep(100 * 1000);
 }
 
@@ -131,7 +133,7 @@ int pl_get_status() { return 0; }
 
 int fpga_regpoll(uint64_t reg, uint64_t val, int time) {
   uint64_t i = 0;
   /* timeout accuracy to be confirmed */
-  int64_t timeout = time * CPU_FREQ / 1000000;
+  int64_t timeout = time * 6;
 
   for (i = 0; i < timeout; i++) {
     if (val == reg_readq(reg)) {
@@ -173,9 +175,14 @@ int memory_request(struct fpga_memory *memory, size_t size, uint64_t *addr) {
 }
 
 void memory_release(struct fpga_memory *memory) {
-  pthread_mutex_lock(&memory->mutex);
-  fpga_bitmap::bitmap_clear(memory->bitmap, 0, memory->page_num);
-  pthread_mutex_unlock(&memory->mutex);
+  /*unmap memory*/
+  std::map<void *, size_t> map = g_fpgainfo.fpga_addr2size_map;
+  std::map<void *, size_t>::iterator iter;
+  for (iter = map.begin(); iter != map.end(); iter++) {
+    fpga_free_driver(iter->first);
+  }
 }
 
 int create_fpga_memory_inner(struct fpga_memory *memory, size_t memory_size) {
@@ -238,7 +245,6 @@ int init_fpga_memory(struct fpga_memory *memory) {
     return rc;
   }
 
-  // spin_lock_init(&memory->spin);
   fpga_bitmap::bitmap_clear(memory->bitmap, 0, memory->page_num);
   fpga_bitmap::bitmap_set(memory->bitmap, 0, 1);  // NOTE reserve fpga page 0.
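fpga_free_driver (further down in this diff) erases the freed block from g_fpgainfo.fpga_addr2size_map, so the rewritten memory_release above deliberately walks a copy of that map rather than the live one; otherwise each free would invalidate the iterator mid-loop. A standalone sketch of the idiom, with simplified, hypothetical names standing in for the driver's bookkeeping:

    #include <map>

    static std::map<void *, size_t> live_allocations;  // stand-in for fpga_addr2size_map

    static void free_one(void *ptr) {
      // Like fpga_free_driver, this erases from the live map.
      live_allocations.erase(ptr);
    }

    static void release_all() {
      // Walk a snapshot so erasures in free_one() cannot invalidate the
      // iterator being advanced.
      std::map<void *, size_t> snapshot = live_allocations;
      for (auto iter = snapshot.begin(); iter != snapshot.end(); ++iter) {
        free_one(iter->first);
      }
    }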
@@ -293,9 +299,23 @@ void *fpga_reg_malloc(size_t size) {
   return ret;
 }
 
+void fpga_reg_free(void *ptr) {
+  size_t size = 0;
+
+  auto iter = g_fpgainfo.fpga_addr2size_map.find(ptr);
+  if (iter != g_fpgainfo.fpga_addr2size_map.end()) {
+    size = iter->second;
+    g_fpgainfo.fpga_addr2size_map.erase(iter);
+    munmap(ptr, size);
+  } else {
+    DLOG << "Invalid pointer";
+  }
+}
+
 void *fpga_malloc_driver(size_t size) {
   void *ret = nullptr;
   uint64_t phy_addr = 0;
+  int i = 0;
 
   memory_request(g_fpgainfo.memory_info, size, &phy_addr);
 
@@ -311,17 +331,70 @@ void *fpga_malloc_driver(size_t size) {
 
 void fpga_free_driver(void *ptr) {
   size_t size = 0;
+  uint32_t pos = 0;
+  uint64_t p_addr = 0;
 
   auto iter = g_fpgainfo.fpga_addr2size_map.find(ptr);
   if (iter != g_fpgainfo.fpga_addr2size_map.end()) {
     size = iter->second;
     g_fpgainfo.fpga_addr2size_map.erase(iter);
     munmap(ptr, size);
+
+    p_addr = vaddr_to_paddr(ptr);
+    pos = (p_addr - g_fpgainfo.memory_info->mem_start) / FPGA_PAGE_SIZE;
+
+    /*clear bitmap*/
+    pthread_mutex_lock(&g_fpgainfo.memory_info->mutex);
+    fpga_bitmap::bitmap_clear(g_fpgainfo.memory_info->bitmap, pos,
+                              g_fpgainfo.memory_info->nr[pos]);
+    pthread_mutex_unlock(&g_fpgainfo.memory_info->mutex);
   } else {
     DLOG << "Invalid pointer";
   }
 }
 
+static inline int do_ioctl(unsigned long req, const void *arg) {
+  return ioctl(g_fpgainfo.fd_mem, req, arg);
+}
+
+int fpga_flush_driver(void *address, size_t size) {
+  struct MemoryCacheArgs args;
+  uint64_t p_addr;
+
+  p_addr = vaddr_to_paddr(address);
+
+  args.offset = (void *)(p_addr - FPGA_MEM_PHY_ADDR);
+  args.size = size;
+
+  return do_ioctl(IOCTL_MEMCACHE_FLUSH, &args);
+}
+
+int fpga_invalidate_driver(void *address, size_t size) {
+  struct MemoryCacheArgs args;
+  uint64_t p_addr;
+
+  p_addr = vaddr_to_paddr(address);
+
+  args.offset = (void *)(p_addr - FPGA_MEM_PHY_ADDR);
+  args.size = size;
+
+  return do_ioctl(IOCTL_MEMCACHE_INVAL, &args);
+}
+
+void fpga_copy_driver(void *dest, const void *src, size_t num) {
+  uint64_t i;
+
+  DLOG << "dest:" << dest << " src:" << src << " size:" << num;
+
+  for (i = 0; i < num; i++) {
+    // DLOG << "i:" << i << " val:" << *((int8_t *)src + i);
+    // usleep(1);
+    *((int8_t *)dest + i) = *((int8_t *)src + i);
+  }
+
+  return;
+}
+
 int open_device_driver() {
   g_fpgainfo.FpgaRegPhyAddr = FPGA_REG_PHY_ADDR;
   g_fpgainfo.FpgaMemPhyAddr = FPGA_MEM_PHY_ADDR;
@@ -347,12 +420,13 @@ int open_device_driver() {
 
 int close_device_driver() {
   pl_destroy();
-  fpga_free_driver(g_fpgainfo.FpgaRegVirAddr);
+  fpga_reg_free(g_fpgainfo.FpgaRegVirAddr);
   memory_release(g_fpgainfo.memory_info);
   destroy_fpga_memory(g_fpgainfo.memory_info);
 
   return 0;
 }
+}  // namespace driver
 }  // namespace fpga
 }  // namespace paddle_mobile
diff --git a/src/fpga/V2/driver/driver.h b/src/fpga/V2/driver/driver.h
index ee01454ac5..633e95ea82 100644
--- a/src/fpga/V2/driver/driver.h
+++ b/src/fpga/V2/driver/driver.h
@@ -24,6 +24,7 @@ limitations under the License.
*/ namespace paddle_mobile { namespace fpga { +namespace driver { #define DIV_ROUND_UP(n, d) (((n) + (d)-1) / (d)) @@ -47,6 +48,15 @@ const int PE_IDX_BYPASS = 3; enum pe_status { IDLE = 0, BUSY = 1 }; +struct MemoryCacheArgs { + void *offset; + size_t size; +}; + +#define IOCTL_FPGA_MAGIC 'FPGA' +#define IOCTL_MEMCACHE_INVAL _IOW(IOCTL_FPGA_MAGIC, 12, struct MemoryCacheArgs) +#define IOCTL_MEMCACHE_FLUSH _IOW(IOCTL_FPGA_MAGIC, 13, struct MemoryCacheArgs) + struct fpga_pe { char type_name[MAX_TYPE_NAME_LENTH + 1]; struct pe_data_s *outer; @@ -95,26 +105,39 @@ extern struct FPGA_INFO g_fpgainfo; inline uint64_t reg_readq(uint32_t offset) { // DLOG << "offset : " << offset; - uint64_t value = - *(uint64_t *)((uint8_t *)g_fpgainfo.FpgaRegVirAddr + offset); // NOLINT + uint64_t value = *(volatile uint64_t *)((uint8_t *)g_fpgainfo.FpgaRegVirAddr + + offset); // NOLINT return value; } inline void reg_writeq(uint64_t value, uint32_t offset) { // DLOG << "offset : " << offset << ", value : " << value; - *(uint64_t *)((uint8_t *)g_fpgainfo.FpgaRegVirAddr + offset) = // NOLINT + *(volatile uint64_t *)((uint8_t *)g_fpgainfo.FpgaRegVirAddr + + offset) = // NOLINT value; } int open_device_driver(); + int close_device_driver(); + void *fpga_malloc_driver(size_t size); + void fpga_free_driver(void *ptr); + +void fpga_copy_driver(void *dest, const void *src, size_t num); + +int fpga_flush_driver(void *address, size_t size); + +int fpga_invalidate_driver(void *address, size_t size); + /*pe*/ uint64_t vaddr_to_paddr(void *address); + int fpga_regpoll(uint64_t reg, uint64_t val, int time); +} // namespace driver } // namespace fpga } // namespace paddle_mobile diff --git a/src/fpga/V2/driver/pe.cpp b/src/fpga/V2/driver/pe.cpp index 52cde04601..2e806bfb37 100644 --- a/src/fpga/V2/driver/pe.cpp +++ b/src/fpga/V2/driver/pe.cpp @@ -20,29 +20,29 @@ limitations under the License. 
*/
 
 namespace paddle_mobile {
 namespace fpga {
-#define MUL8(x) (x * 8)
+#define MUL8(x) ((x)*8)
 #define BYPASS_DONE 1
 
 float Findfp16Max() {
   uint16_t abs_vals[16];
   uint64_t max_fp16;
 
-  max_fp16 = reg_readq(MUL8(49));
+  max_fp16 = driver::reg_readq(MUL8(49));
   abs_vals[0] = (uint16_t)(0x0000007f & (max_fp16));        // NOLINT
   abs_vals[1] = (uint16_t)(0x0000007f & (max_fp16 >> 16));  // NOLINT
   abs_vals[2] = (uint16_t)(0x0000007f & (max_fp16 >> 32));  // NOLINT
   abs_vals[3] = (uint16_t)(0x0000007f & (max_fp16 >> 48));  // NOLINT
-  max_fp16 = reg_readq(MUL8(50));
+  max_fp16 = driver::reg_readq(MUL8(50));
   abs_vals[4] = (uint16_t)(0x0000007f & (max_fp16));        // NOLINT
   abs_vals[5] = (uint16_t)(0x0000007f & (max_fp16 >> 16));  // NOLINT
   abs_vals[6] = (uint16_t)(0x0000007f & (max_fp16 >> 32));  // NOLINT
   abs_vals[7] = (uint16_t)(0x0000007f & (max_fp16 >> 48));  // NOLINT
-  max_fp16 = reg_readq(MUL8(51));
+  max_fp16 = driver::reg_readq(MUL8(51));
   abs_vals[8] = (uint16_t)(0x0000007f & (max_fp16));         // NOLINT
   abs_vals[9] = (uint16_t)(0x0000007f & (max_fp16 >> 16));   // NOLINT
   abs_vals[10] = (uint16_t)(0x0000007f & (max_fp16 >> 32));  // NOLINT
   abs_vals[11] = (uint16_t)(0x0000007f & (max_fp16 >> 48));  // NOLINT
-  max_fp16 = reg_readq(MUL8(52));
+  max_fp16 = driver::reg_readq(MUL8(52));
   abs_vals[12] = (uint16_t)(0x0000007f & (max_fp16));
   abs_vals[13] = (uint16_t)(0x0000007f & (max_fp16 >> 16));  // NOLINT
   abs_vals[14] = (uint16_t)(0x0000007f & (max_fp16 >> 32));  // NOLINT
@@ -58,7 +58,7 @@ float Findfp16Max() {
 }
 
 int ComputeFpgaConv(const struct SplitConvArgs &args) {
-  ComputeBasicConv(args.conv_args[0]);
+  return ComputeBasicConv(args.conv_arg[0]);
 }
 
 int ComputeBasicConv(const struct ConvArgs &args) {
@@ -166,8 +166,8 @@ int PerformBypass(const struct BypassArgs &args) {
   return 0;
 #endif
 
-  uint64_t ifm_src_paddr = vaddr_to_paddr(args.image.address);
-  uint64_t ifm_dst_paddr = vaddr_to_paddr(args.output.address);
+  uint64_t ifm_src_paddr = driver::vaddr_to_paddr(args.image.address);
+  uint64_t ifm_dst_paddr = driver::vaddr_to_paddr(args.output.address);
   uint64_t bp_enable;
   int64_t length;
   uint64_t pixels;
@@ -196,16 +196,16 @@
   }
 
   // start bypass
-  reg_writeq(ifm_src_paddr, MUL8(27));
-  reg_writeq(ifm_dst_paddr, MUL8(28));
-  reg_writeq(0, MUL8(0));
-  reg_writeq(bp_enable, MUL8(0));
+  driver::reg_writeq(ifm_src_paddr, MUL8(27));
+  driver::reg_writeq(ifm_dst_paddr, MUL8(28));
+  driver::reg_writeq(0, MUL8(0));
+  driver::reg_writeq(bp_enable, MUL8(0));
   // poll
   int ret = -1;
-  ret = fpga_regpoll(MUL8(48), BYPASS_DONE, 0xffffffff);
+  ret = driver::fpga_regpoll(MUL8(48), BYPASS_DONE, 0xffffffff);
   if (ret != -1) {
     // clear "irq"
-    reg_readq(MUL8(63));
+    driver::reg_readq(MUL8(63));
   }
   // get max value
   if ((!args.input_data_type) && (!args.output_data_type)) {
diff --git a/src/fpga/V2/driver/pe.h b/src/fpga/V2/driver/pe.h
index 4ec3ccb01d..4903bf4c33 100644
--- a/src/fpga/V2/driver/pe.h
+++ b/src/fpga/V2/driver/pe.h
@@ -12,7 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 #pragma once
-#include "fpga/V2/api.h"
+
+#include "fpga/V2/fpga_common.h"
 
 namespace paddle_mobile {
 namespace fpga {
diff --git a/src/fpga/V2/filter.cpp b/src/fpga/V2/filter.cpp
index 39d67b2d2d..ce278edbee 100644
--- a/src/fpga/V2/filter.cpp
+++ b/src/fpga/V2/filter.cpp
@@ -94,6 +94,7 @@ void format_filter(float **data_in, int num, int channel, int height,
                    int width) {
   convert_to_hwc(data_in, num, channel, height, width);
   align_filter(data_in, num, channel, height, width);
   int pixel_num = calc_aligned_total_pixel_num(num, channel, height, width);
+  fpga_flush(*data_in, pixel_num * sizeof(float));
 }
 
 void convert_fc_filter(float **data_in, int num, int chw) {
@@ -113,6 +114,8 @@ void format_fc_filter(float **data_in, int num, int channel, int height,
   int chw = channel * height * width;
   convert_fc_filter(data_in, num, chw);
   align_filter(data_in, num, channel, height, width);
+  int pixel_num = calc_aligned_total_pixel_num(num, channel, height, width);
+  fpga_flush(*data_in, pixel_num * sizeof(float));
 }
 
 float find_max(float *data_in, int data_size) {
diff --git a/src/fpga/V2/fpga_common.cpp b/src/fpga/V2/fpga_common.cpp
new file mode 100644
index 0000000000..01bca30a9c
--- /dev/null
+++ b/src/fpga/V2/fpga_common.cpp
@@ -0,0 +1,44 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "fpga/V2/fpga_common.h"
+namespace paddle_mobile {
+namespace fpga {
+
+int16_t fp32_2_fp16(float fp32_num) {
+  unsigned long tmp = *(unsigned long *)(&fp32_num);  // NOLINT
+  auto t = (int16_t)(((tmp & 0x007fffff) >> 13) | ((tmp & 0x80000000) >> 16) |
+                     (((tmp & 0x7f800000) >> 13) - (112 << 10)));
+  if (tmp & 0x1000) {
+    t++;  // roundoff
+  }
+  return t;
+}
+
+float fp16_2_fp32(int16_t fp16_num) {
+  if (0 == fp16_num) {
+    return 0;
+  }
+  int frac = (fp16_num & 0x3ff);
+  int exp = ((fp16_num & 0x7c00) >> 10) + 112;
+  int s = fp16_num & 0x8000;
+  int tmp = 0;
+  float fp32_num;
+  tmp = s << 16 | exp << 23 | frac << 13;
+  fp32_num = *(float *)&tmp;  // NOLINT
+  return fp32_num;
+}
+
+}  // namespace fpga
+}  // namespace paddle_mobile
diff --git a/src/fpga/V2/fpga_common.h b/src/fpga/V2/fpga_common.h
new file mode 100644
index 0000000000..1862d84350
--- /dev/null
+++ b/src/fpga/V2/fpga_common.h
@@ -0,0 +1,125 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include <cstdint>
+
+namespace paddle_mobile {
+namespace fpga {
+
+enum DataType {
+  DATA_TYPE_FP32 = 1,
+  DATA_TYPE_FP16 = 0,
+};
+
+enum LayoutType {
+  LAYOUT_CHW = 1,
+  LAYOUT_HWC = 0,
+};
+
+struct KernelArgs {
+  uint32_t width;
+  uint32_t height;
+  uint32_t stride_w;
+  uint32_t stride_h;
+};
+
+struct ImageInputArgs {
+  void* address;         // input featuremap virtual address
+  float* scale_address;  // input scale address;
+  uint32_t channels;
+  uint32_t width;  // featuremap width
+  uint32_t height;
+  uint32_t pad_width;  // padding width;
+  uint32_t pad_height;
+};
+
+struct ImageOutputArgs {
+  void* address;         // output result address;
+  float* scale_address;  // output scale address;
+  uint64_t timer_cnt;    // time counter for FPGA computation
+};
+
+struct ConvArgs {
+  bool relu_enabled;
+  void* sb_address;  // scale and bias
+  void* filter_address;
+  float* filter_scale_address;
+  void* free_space;  // used by FPGA logic
+  uint32_t filter_num;
+  uint32_t group_num;
+
+  struct KernelArgs kernel;
+  struct ImageInputArgs image;  // input image;
+  struct ImageOutputArgs output;
+};
+
+struct ConcatArgs {
+  uint32_t image_num;
+  int16_t** images_in;
+  float** scales_in;
+  void* image_out;
+  float* scale_out;
+  uint32_t* channel_num;
+  uint32_t* aligned_channel_num;
+  uint32_t out_channel;
+  uint32_t height;
+  uint32_t width;
+};
+
+struct SplitConvArgs {
+  uint32_t split_num;
+  uint32_t group_num;
+  uint32_t filter_num;
+  struct ImageOutputArgs output;
+  struct ConvArgs* conv_arg;
+  struct ConcatArgs concat_arg;
+};
+
+struct PoolingArgs {
+  int16_t mode;  // mode: 0:max, 1:avg
+  int16_t kernel_reciprocal;
+  struct KernelArgs kernel;
+  struct ImageInputArgs image;  // input image;
+  struct ImageOutputArgs output;
+};
+
+struct EWAddArgs {
+  bool relu_enabled;
+  uint32_t const0;  // output0 = const0 x input0 + const1 x input1;
+  uint32_t const1;
+  struct ImageInputArgs image0;
+  struct ImageInputArgs image1;
+  struct ImageOutputArgs output;
+};
+
+struct BypassArgs {
+  enum DataType input_data_type;
+  enum DataType output_data_type;
+  enum LayoutType input_layout_type;
+  enum LayoutType output_layout_type;
+  struct ImageInputArgs image;
+  struct ImageOutputArgs output;
+};
+
+struct DeconvArgs {
+  struct ConvArgs conv_arg;
+};
+
+static inline int align_to_x(int num, int x) { return (num + x - 1) / x * x; }
+
+int16_t fp32_2_fp16(float fp32_num);
+float fp16_2_fp32(int16_t fp16_num);
+
+}  // namespace fpga
+}  // namespace paddle_mobile
diff --git a/src/fpga/V2/image.cpp b/src/fpga/V2/image.cpp
index 4ce76cd00f..26829bfba6 100644
--- a/src/fpga/V2/image.cpp
+++ b/src/fpga/V2/image.cpp
@@ -58,6 +58,7 @@ void format_image(float **data_in, int channel, int height, int width,
                   int aligned_channel) {
   convert_to_hwc(data_in, channel, height, width);
   align_image(data_in, channel, height, width, aligned_channel);
+  fpga_flush(*data_in, aligned_channel * height * width * sizeof(float));
 }
 
 void concat_images(int16_t **images_in, float **scales_in, void *image_out,
@@ -69,6 +70,8 @@ void concat_images(int16_t **images_in, float **scales_in, void *image_out,
   scale_out[1] = 0.0;
   for (int i = 0; i < image_num; i++) {
     scale_out[0] = std::max(*scale_out, scales_in[i][0]);
+    fpga_invalidate(images_in[i],
+                    height * width * aligned_channel_num[i] * sizeof(int16_t));
   }
   scale_out[1] = 1 / scale_out[0];
 
@@ -83,6 +86,7 @@ void concat_images(int16_t **images_in, float **scales_in, void *image_out,
       tmp_channel_sum += channel_num[i];
     }
   }
+  fpga_flush(image_out, hw * out_channel * sizeof(int16_t));
 }
 
 }  // namespace image
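Taken together, the V2 changes above establish a cache-maintenance discipline for the DDR region shared with the FPGA: flush the CPU cache after the host writes a buffer the FPGA will read, and invalidate before the host reads a buffer the FPGA wrote (format_filter, format_image and concat_images already follow it). A minimal sketch of that protocol using this patch's API, with made-up buffer sizes and the FPGA job itself elided:

    #include <cstring>
    #include "fpga/V2/api.h"

    namespace fpga = paddle_mobile::fpga;

    void cache_protocol_sketch() {
      size_t in_bytes = 1024 * sizeof(float);  // hypothetical sizes
      size_t out_bytes = 1024 * sizeof(int16_t);
      void *in = fpga::fpga_malloc(in_bytes);
      void *out = fpga::fpga_malloc(out_bytes);

      std::memset(in, 0, in_bytes);
      // The host wrote through its cache; push the dirty lines to DDR so
      // the FPGA reads current data.
      fpga::fpga_flush(in, in_bytes);

      // ... launch an FPGA job that reads `in` and writes `out` ...

      // The FPGA wrote DDR behind the CPU cache; drop the stale lines
      // before the host reads the result.
      fpga::fpga_invalidate(out, out_bytes);

      fpga::fpga_free(out);
      fpga::fpga_free(in);
    }

On non-ZU5 builds fpga_flush and fpga_invalidate compile to no-ops and fpga_copy falls back to memcpy, so code written against this discipline stays portable.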
diff --git a/src/framework/executor.cpp b/src/framework/executor.cpp
index 0ed3a5d323..c7ef09ed5a 100644
--- a/src/framework/executor.cpp
+++ b/src/framework/executor.cpp
@@ -26,6 +26,7 @@ limitations under the License. */
 #include "framework/program/var_desc.h"
 #include "framework/scope.h"
 #include "framework/tensor.h"
+#include "memory/t_malloc.h"
 
 #ifdef PADDLE_EXECUTOR_MULTITHREAD
 #include <queue>
@@ -86,8 +87,10 @@ Executor<Dtype, P>::Executor(const framework::Program<Dtype> p, int batch_size,
   }
   std::shared_ptr<framework::BlockDesc> to_predict_block =
       to_predict_program_->Block(0);
+  int i = 0;
   auto &ops = ops_of_block_[*to_predict_block.get()];
   for (const auto &op : ops) {
+    DLOG << "Initialize op[" << i++ << "]: " << op->Type();
     op->Init();
   }
 }
@@ -102,8 +105,8 @@ static void LoadMemInternal(void **data, framework::LoDTensor *tensor,
     // should be moved into operator init function
     float min_value;
     float max_value;
-    memcpy(&min_value, data_buf, sizeof(float));
-    memcpy(&max_value, data_buf + sizeof(float), sizeof(float));
+    memory::Copy(&min_value, data_buf, sizeof(float));
+    memory::Copy(&max_value, data_buf + sizeof(float), sizeof(float));
     data_buf += 2 * sizeof(float);
     const float factor = (max_value - min_value) / 255.0;
     const uint8_t *uint8_data = reinterpret_cast<const uint8_t *>(data_buf);
@@ -112,7 +115,7 @@ static void LoadMemInternal(void **data, framework::LoDTensor *tensor,
     }
     data_buf += size * sizeof(uint8_t);
   } else {
-    memcpy(tensor_data, *data_buf, size * sizeof(Dtype));
+    memory::Copy(tensor_data, *data_buf, size * sizeof(Dtype));
     *data_buf += size * sizeof(Dtype);
   }
 }
@@ -128,7 +131,7 @@ void Executor<Dtype, P>::LoadMemory(
   // lod information
   // uint64_t lod_level = *(reinterpret_cast<uint64_t *>(*data_buf));
   uint64_t lod_level = 0;
-  memcpy(&lod_level, *data_buf, sizeof(uint64_t));
+  memory::Copy(&lod_level, *data_buf, sizeof(uint64_t));
   *data_buf += sizeof(uint64_t);
 
   auto *lod = tensor->mutable_lod();
@@ -137,7 +140,7 @@ void Executor<Dtype, P>::LoadMemory(
     uint64_t size = *(reinterpret_cast<uint64_t *>(*data_buf));
     *data_buf += sizeof(uint64_t);
     std::vector<size_t> tmp_dim(size / sizeof(size_t));
-    memcpy(tmp_dim.data(), *data_buf, size);
+    memory::Copy(tmp_dim.data(), *data_buf, size);
     (*lod)[i] = std::move(tmp_dim);
     *data_buf += size;
   }
diff --git a/src/io/paddle_mobile.cpp b/src/io/paddle_mobile.cpp
index fca870860e..1a28373f6a 100644
--- a/src/io/paddle_mobile.cpp
+++ b/src/io/paddle_mobile.cpp
@@ -21,7 +21,6 @@ limitations under the License. */
*/ #include "operators/math/gemm.h" namespace paddle_mobile { -static std::mutex lc; template void PaddleMobile::SetThreadNum(int num) { #ifdef _OPENMP @@ -148,8 +147,8 @@ double PaddleMobile::GetPredictTime() { } paddle_mobile::operators::math::Gemm gemm; auto time1 = paddle_mobile::time(); - gemm.Sgemm(m, n, k, static_cast(1), a, lda, b, ldb, - static_cast(0), c, ldc, false, nullptr); +// gemm.Sgemm(m, n, k, static_cast(1), a, lda, b, ldb, +// static_cast(0), c, ldc, false, nullptr); auto time2 = paddle_mobile::time(); double cost = paddle_mobile::time_diff(time1, time2); paddle_mobile::memory::Free(a); @@ -199,6 +198,7 @@ void PaddleMobile::Predict_To(int end) { #endif #ifdef PADDLE_MOBILE_CL +static std::mutex lc; template void PaddleMobile::SetCLPath(std::string path) { std::lock_guard lock(lc); diff --git a/src/memory/t_malloc.cpp b/src/memory/t_malloc.cpp index 129f82a19d..2fb74d1880 100644 --- a/src/memory/t_malloc.cpp +++ b/src/memory/t_malloc.cpp @@ -32,7 +32,7 @@ const int MALLOC_ALIGN = 64; namespace fpga = paddle_mobile::fpga; void Copy(void *dst, const void *src, size_t num) { - std::memcpy(dst, src, num); + fpga::fpga_copy(dst, src, num); } void *Alloc(size_t size) { return fpga::fpga_malloc(size); } diff --git a/src/operators/conv_transpose_op.cpp b/src/operators/conv_transpose_op.cpp index 4d9eefaa85..d09a793745 100644 --- a/src/operators/conv_transpose_op.cpp +++ b/src/operators/conv_transpose_op.cpp @@ -27,6 +27,7 @@ REGISTER_OPERATOR_CPU(conv2d_transpose, ops::ConvOpTranspose); #ifdef PADDLE_MOBILE_MALI_GPU #endif #ifdef PADDLE_MOBILE_FPGA +REGISTER_OPERATOR_FPGA(conv2d_transpose, ops::ConvOpTranspose); #endif #endif diff --git a/src/operators/fusion_deconv_relu_op.cpp b/src/operators/fusion_deconv_relu_op.cpp new file mode 100644 index 0000000000..daae39c951 --- /dev/null +++ b/src/operators/fusion_deconv_relu_op.cpp @@ -0,0 +1,32 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#ifdef FUSION_DECONVRELU_OP + +#include "operators/fusion_deconv_relu_op.h" + +namespace paddle_mobile { +namespace operators {} +} // namespace paddle_mobile + +namespace ops = paddle_mobile::operators; +#ifdef PADDLE_MOBILE_CPU +#endif +#ifdef PADDLE_MOBILE_MALI_GPU +#endif +#ifdef PADDLE_MOBILE_FPGA +REGISTER_OPERATOR_FPGA(fusion_deconv_relu, ops::FusionDeconvReluOp); +#endif + +#endif diff --git a/src/operators/fusion_deconv_relu_op.h b/src/operators/fusion_deconv_relu_op.h new file mode 100644 index 0000000000..e87d5d3798 --- /dev/null +++ b/src/operators/fusion_deconv_relu_op.h @@ -0,0 +1,107 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#ifdef FUSION_DECONVRELU_OP
+#pragma once
+#include <string>
+#include <vector>
+
+#include "framework/operator.h"
+#include "framework/program/program-optimize/fusion_op_register.h"
+#include "operators/kernel/deconv_relu_kernel.h"
+
+namespace paddle_mobile {
+namespace operators {
+using std::string;
+using std::vector;
+class FusionDeconvReluMatcher : public framework::FusionOpMatcher {
+ public:
+  FusionDeconvReluMatcher() {
+    node_ = framework::Node(G_OP_TYPE_CONV_TRANSPOSE);
+    node_ > std::make_shared<framework::Node>(G_OP_TYPE_RELU);
+  }
+
+  void FolderNodes(
+      framework::Node *node,
+      std::vector<std::shared_ptr<framework::Node>> *removed_nodes) {
+    node->Folder(node_.Depth(), Type(), {}, removed_nodes);
+  }
+
+  std::string Type() { return G_OP_TYPE_FUSION_DECONV_RELU; }
+};
+
+template <typename DeviceType, typename T>
+class FusionDeconvReluOp : public framework::OperatorWithKernel<
+                               DeviceType, FusionDeconvReluParam<DeviceType>,
+                               operators::DeconvReluKernel<DeviceType, T>> {
+ public:
+  FusionDeconvReluOp(const string &type, const VariableNameMap &inputs,
+                     const VariableNameMap &outputs,
+                     const framework::AttributeMap &attrs,
+                     std::shared_ptr<framework::Scope> scope)
+      : framework::OperatorWithKernel<
+            DeviceType, FusionDeconvReluParam<DeviceType>,
+            operators::DeconvReluKernel<DeviceType, T>>(type, inputs, outputs,
+                                                        attrs, scope) {}
+
+  void InferShape() const {
+    auto input = this->param_.Input();
+    auto in_dims = input->dims();
+
+    auto filter = this->param_.Filter();
+    auto filter_dims = filter->dims();
+
+    std::vector<int> strides = this->param_.Strides();
+    std::vector<int> paddings = this->param_.Paddings();
+    std::vector<int> dilations = this->param_.Dilations();
+
+    int groups = this->param_.Groups();
+
+    PADDLE_MOBILE_ENFORCE(
+        in_dims.size() == 4 || in_dims.size() == 5,
+        "ConvTransposeOp input should be 4-D or 5-D tensor.");
+    PADDLE_MOBILE_ENFORCE(
+        in_dims.size() == filter_dims.size(),
+        "ConvTransposeOp input dimension and filter dimension "
+        "should be the same.");
+    PADDLE_MOBILE_ENFORCE(
+        in_dims.size() - strides.size() == 2U,
+        "ConvTransposeOp input dimension and strides dimension should "
+        "be consistent.");
+    PADDLE_MOBILE_ENFORCE(paddings.size() == strides.size(),
+                          "ConvTransposeOp paddings dimension and strides "
+                          "dimension should be the same.");
+    PADDLE_MOBILE_ENFORCE(paddings.size() == dilations.size(),
+                          "ConvTransposeOp paddings dimension and dilations "
+                          "dimension should be the same.");
+    PADDLE_MOBILE_ENFORCE(
+        in_dims[1] == filter_dims[0],
+        "In ConvTransposeOp, The number of input channels should "
+        "be equal to the number of filter's channels.");
+
+    std::vector<int64_t> output_shape({in_dims[0], filter_dims[1] * groups});
+    for (size_t i = 0; i < strides.size(); ++i) {
+      auto filter_extent = dilations[i] * (filter_dims[i + 2] - 1) + 1;
+      output_shape.push_back((in_dims[i + 2] - 1) * strides[i] -
+                             2 * paddings[i] + filter_extent);
+    }
+    this->param_.Output()->Resize(framework::make_ddim(output_shape));
+  }
+
+ protected:
+};
+
+}  // namespace operators
+}  // namespace paddle_mobile
+
+#endif  // FUSION_DECONVRELU_OP
diff --git a/src/operators/kernel/deconv_relu_kernel.h b/src/operators/kernel/deconv_relu_kernel.h
new file mode 100644
index 0000000000..bc85f1ffee
--- /dev/null
+++ b/src/operators/kernel/deconv_relu_kernel.h
@@ -0,0 +1,39 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#ifdef FUSION_DECONVRELU_OP
+
+#pragma once
+
+#include "framework/operator.h"
+#include "operators/op_param.h"
+
+namespace paddle_mobile {
+namespace operators {
+
+using framework::OpKernelBase;
+
+template <typename DeviceType, typename T>
+class DeconvReluKernel
+    : public OpKernelBase<DeviceType, FusionDeconvReluParam<DeviceType>> {
+ public:
+  void Compute(const FusionDeconvReluParam<DeviceType> &param);
+
+  bool Init(FusionDeconvReluParam<DeviceType> *param);
+};
+
+}  // namespace operators
+}  // namespace paddle_mobile
+
+#endif
diff --git a/src/operators/kernel/fpga/V2/conv_transpose_kernel.cpp b/src/operators/kernel/fpga/V2/conv_transpose_kernel.cpp
new file mode 100644
index 0000000000..3284ddcdec
--- /dev/null
+++ b/src/operators/kernel/fpga/V2/conv_transpose_kernel.cpp
@@ -0,0 +1,34 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#ifdef CONV_TRANSPOSE_OP
+
+#include "operators/kernel/conv_transpose_kernel.h"
+
+namespace paddle_mobile {
+namespace operators {
+
+template <>
+bool ConvTransposeKernel<FPGA, float>::Init(ConvTransposeParam<FPGA> *param) {
+  return true;
+}
+
+template <>
+void ConvTransposeKernel<FPGA, float>::Compute(
+    const ConvTransposeParam<FPGA> &param) {}
+
+}  // namespace operators
+}  // namespace paddle_mobile
+
+#endif
diff --git a/src/operators/kernel/fpga/V2/deconv_relu_kernel.cpp b/src/operators/kernel/fpga/V2/deconv_relu_kernel.cpp
new file mode 100644
index 0000000000..bf3556609a
--- /dev/null
+++ b/src/operators/kernel/fpga/V2/deconv_relu_kernel.cpp
@@ -0,0 +1,36 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#ifdef FUSION_DECONVRELU_OP
+
+#include "operators/kernel/deconv_relu_kernel.h"
+#include "framework/operator.h"
+#include "operators/op_param.h"
+
+namespace paddle_mobile {
+namespace operators {
+
+template <>
+bool DeconvReluKernel<FPGA, float>::Init(FusionDeconvReluParam<FPGA> *param) {
+  return true;
+}
+
+template <>
+void DeconvReluKernel<FPGA, float>::Compute(
+    const FusionDeconvReluParam<FPGA> &param) {}
+
+}  // namespace operators
+}  // namespace paddle_mobile
+
+#endif
diff --git a/src/operators/kernel/fpga/V2/slice_kernel.cpp b/src/operators/kernel/fpga/V2/slice_kernel.cpp
index b0df0cb65d..bc3fbfd796 100644
--- a/src/operators/kernel/fpga/V2/slice_kernel.cpp
+++ b/src/operators/kernel/fpga/V2/slice_kernel.cpp
@@ -24,6 +24,7 @@ bool SliceKernel<FPGA, float>::Init(SliceParam<FPGA>* param) {
 }
 template <>
 void SliceKernel<FPGA, float>::Compute(const SliceParam<FPGA>& param) {}
+
 }  // namespace operators
 }  // namespace paddle_mobile
 #endif
diff --git a/src/operators/kernel/fpga/V2/softmax_kernel.cpp b/src/operators/kernel/fpga/V2/softmax_kernel.cpp
index 5cfccf8779..bbdb35b715 100644
--- a/src/operators/kernel/fpga/V2/softmax_kernel.cpp
+++ b/src/operators/kernel/fpga/V2/softmax_kernel.cpp
@@ -49,7 +49,12 @@ void SoftmaxKernel<FPGA, float>::Compute(const SoftmaxParam<FPGA> &param) {
   Tensor *out = param.Out();
 
   fpga::PerformBypass(param.FpgaArgs());
+  fpga::fpga_invalidate(
+      (void *)in_x->data<float>(),                           // NOLINT
+      fpga::get_aligned_channel_num((int)in_x->dims()[1]) *  // NOLINT
+          sizeof(float));
 
   math::SoftmaxFuntor<CPU, float>()(in_x, out);
+  fpga::fpga_flush(out->data<float>(), out->memory_size());
 }
 
 }  // namespace operators
diff --git a/src/operators/kernel/fpga/V2/tanh_kernel.cpp b/src/operators/kernel/fpga/V2/tanh_kernel.cpp
new file mode 100644
index 0000000000..46dd3a0f6f
--- /dev/null
+++ b/src/operators/kernel/fpga/V2/tanh_kernel.cpp
@@ -0,0 +1,33 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#ifdef TANH_OP
+
+#include "operators/kernel/tanh_kernel.h"
+
+namespace paddle_mobile {
+namespace operators {
+
+template <>
+bool TanhKernel<FPGA, float>::Init(TanhParam<FPGA> *param) {
+  return true;
+}
+
+template <>
+void TanhKernel<FPGA, float>::Compute(const TanhParam<FPGA> &param) {}
+
+}  // namespace operators
+}  // namespace paddle_mobile
+
+#endif
diff --git a/src/operators/kernel/tanh_kernel.h b/src/operators/kernel/tanh_kernel.h
new file mode 100644
index 0000000000..035f64f840
--- /dev/null
+++ b/src/operators/kernel/tanh_kernel.h
@@ -0,0 +1,37 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#ifdef TANH_OP
+
+#include "framework/operator.h"
+#include "operators/op_param.h"
+
+namespace paddle_mobile {
+namespace operators {
+
+using framework::OpKernelBase;
+
+template <typename DeviceType, typename T>
+class TanhKernel : public OpKernelBase<DeviceType, TanhParam<DeviceType>> {
+ public:
+  void Compute(const TanhParam<DeviceType>& param);
+  bool Init(TanhParam<DeviceType>* param);
+};
+
+}  // namespace operators
+}  // namespace paddle_mobile
+
+#endif
diff --git a/src/operators/op_param.h b/src/operators/op_param.h
index 5666f8e9c9..d65ca66364 100644
--- a/src/operators/op_param.h
+++ b/src/operators/op_param.h
@@ -1534,6 +1534,27 @@ class ReluParam : public ReluParamBase {
 
 #endif
 
+#ifdef TANH_OP
+template <typename Dtype>
+class TanhParam : public OpParam {
+  typedef typename DtypeTensorTrait<Dtype>::gtype GType;
+  typedef typename DtypeTensorTrait<Dtype>::rtype RType;
+
+ public:
+  TanhParam(const VariableNameMap &inputs, const VariableNameMap &outputs,
+            const AttributeMap &attrs, const Scope &scope) {
+    input_x_ = InputXFrom<GType>(inputs, scope);
+    out_ = OutFrom<GType>(outputs, scope);
+  }
+  const RType *InputX() const { return input_x_; }
+  RType *Out() const { return out_; }
+
+ private:
+  RType *input_x_;
+  RType *out_;
+};
+#endif
+
 #ifdef PRELU_OP
 template <typename Dtype>
 class PReluParam : public OpParam {
@@ -2229,9 +2250,24 @@ class ConvTransposeParam : public OpParam {
   vector<int> paddings_;
   vector<int> dilations_;
   int groups;
+
+#ifdef PADDLE_MOBILE_FPGA
+
+ private:
+  fpga::DeconvArgs fpga_conv_args;
+
+ public:
+  const fpga::DeconvArgs &FpgaArgs() const { return fpga_conv_args; }
+  void SetFpgaArgs(const fpga::DeconvArgs &args) { fpga_conv_args = args; }
+#endif
 };
 #endif
 
+#ifdef FUSION_DECONVRELU_OP
+template <typename Dtype>
+using FusionDeconvReluParam = ConvTransposeParam<Dtype>;
+#endif
+
 #ifdef GRU_OP
 template <typename Dtype>
 class GruParam : public OpParam {
diff --git a/src/operators/tanh_op.cpp b/src/operators/tanh_op.cpp
new file mode 100644
index 0000000000..454cdfa269
--- /dev/null
+++ b/src/operators/tanh_op.cpp
@@ -0,0 +1,35 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#ifdef TANH_OP
+
+#include "operators/tanh_op.h"
+
+namespace paddle_mobile {
+namespace operators {
+
+template <typename DeviceType, typename T>
+void TanhOp<DeviceType, T>::InferShape() const {
+  this->param_.Out()->Resize(this->param_.InputX()->dims());
+}
+
+}  // namespace operators
+}  // namespace paddle_mobile
+
+namespace ops = paddle_mobile::operators;
+#ifdef PADDLE_MOBILE_FPGA
+REGISTER_OPERATOR_FPGA(tanh, ops::TanhOp);
+#endif
+
+#endif
diff --git a/src/operators/tanh_op.h b/src/operators/tanh_op.h
new file mode 100644
index 0000000000..82b0e4e9a0
--- /dev/null
+++ b/src/operators/tanh_op.h
@@ -0,0 +1,44 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#ifdef TANH_OP
+
+#pragma once
+
+#include <string>
+#include "framework/operator.h"
+#include "operators/kernel/tanh_kernel.h"
+#include "operators/op_param.h"
+
+namespace paddle_mobile {
+namespace operators {
+
+template <typename DeviceType, typename T>
+class TanhOp : public framework::OperatorWithKernel<
+                   DeviceType, TanhParam<DeviceType>,
+                   operators::TanhKernel<DeviceType, T>> {
+ public:
+  TanhOp(const std::string &type, const VariableNameMap &inputs,
+         const VariableNameMap &outputs, const framework::AttributeMap &attrs,
+         std::shared_ptr<framework::Scope> scope)
+      : framework::OperatorWithKernel<DeviceType, TanhParam<DeviceType>,
+                                      operators::TanhKernel<DeviceType, T>>(
+            type, inputs, outputs, attrs, scope) {}
+  void InferShape() const override;
+};
+
+}  // namespace operators
+}  // namespace paddle_mobile
+
+#endif
diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt
index 79bed19be3..52a1bf3070 100644
--- a/test/CMakeLists.txt
+++ b/test/CMakeLists.txt
@@ -73,6 +73,9 @@ list(FIND NET "FPGA_NET_V2" CON)
 if (CON GREATER -1)
     ADD_EXECUTABLE(test-resnet50 fpga/test_resnet50.cpp test_helper.h test_include.h executor_for_test.h)
     target_link_libraries(test-resnet50 paddle-mobile)
+
+    ADD_EXECUTABLE(test-pe fpga/test_pe.cpp)
+    target_link_libraries(test-pe paddle-mobile)
 
     set(FOUND_MATCH ON)
 endif ()
diff --git a/test/fpga/test_pe.cpp b/test/fpga/test_pe.cpp
new file mode 100644
index 0000000000..f5f2708b9e
--- /dev/null
+++ b/test/fpga/test_pe.cpp
@@ -0,0 +1,111 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#ifdef PADDLE_MOBILE_FPGA_V2
+#include "fpga/V2/api.h"
+#include "fpga/V2/filter.h"
+
+namespace fpga = paddle_mobile::fpga;
+
+static const uint32_t N = 64;
+static const uint32_t C = 3;
+static const uint32_t H = 224;
+static const uint32_t W = 224;
+static const uint32_t G = 1;
+
+fpga::DataType input_type = fpga::DATA_TYPE_FP32;
+fpga::DataType output_type = fpga::DATA_TYPE_FP16;
+
+void* ifm = nullptr;
+void* ofm = nullptr;
+void* filter = nullptr;
+void* ifm_scale = nullptr;
+void* ofm_scale = nullptr;
+void* filter_scale = nullptr;
+
+int ifm_size = 0, ofm_size = 0;
+
+void format_data() {
+  ifm_scale = fpga::fpga_malloc(8);
+  ofm_scale = fpga::fpga_malloc(8);
+  int ifm_channel = fpga::filter::calc_aligned_channel(C);
+  int ofm_channel = fpga::filter::calc_aligned_channel(N);
+  int num = fpga::filter::calc_aligned_num(N, C);
+  DLOG << "ifm_channel = " << ifm_channel;
+  DLOG << "ofm_channel = " << ofm_channel;
+  DLOG << "aligned_num = " << num;
+  ifm_size = ifm_channel * H * W;
+  ofm_size = ofm_channel * H * W;
+  ifm = fpga::fpga_malloc(ifm_size * sizeof(float));
+  ofm = fpga::fpga_malloc(ofm_size * sizeof(int16_t));
+  memset(ifm, 0, ifm_size * sizeof(float));
+  memset(ofm, 0, ofm_size * sizeof(int16_t));
+
+  for (int h = 0; h < H; h++) {
+    for (int w = 0; w < W; w++) {
+      for (int c = 0; c < C; c++) {
+        int index = h * W * ifm_channel + w * ifm_channel + c;
+        (reinterpret_cast<float*>(ifm))[index] = h + w + c * 0.1f;
+        // DLOG << index << ":" << ((float *) ifm)[index];
+      }
+    }
+  }
+  fpga::fpga_flush(ifm, ifm_size * sizeof(float));
+  fpga::fpga_flush(ofm, ofm_size * sizeof(int16_t));
+}
+
+void print_fp16(int16_t* ptr, int total_size, int num) {
+  fpga::fpga_invalidate(ptr, total_size * sizeof(int16_t));
+  int stride = total_size / num;
+  for (int i = 0; i < total_size; i += stride) {
+    DLOG << fpga::fp16_2_fp32(ptr[i]);
+  }
+}
+
+void print_fp32(float* ptr, int total_size, int num) {
+  fpga::fpga_invalidate(ptr, total_size * sizeof(float));
+  int stride = total_size / num;
+  for (int i = 0; i < total_size; i += stride) {
+    DLOG << ptr[i];
+  }
+}
+
+void test_bypass() {
+  fpga::BypassArgs args;
+  args.input_data_type = input_type;
+  args.output_data_type = output_type;
+  args.image.address = ifm;
+  args.image.height = H;
+  args.image.width = W;
+  args.image.channels = C;
+  args.image.scale_address = reinterpret_cast<float*>(ifm_scale);
+  args.output.address = ofm;
+  args.output.scale_address = reinterpret_cast<float*>(ofm_scale);
+  fpga::PerformBypass(args);
+}
+
+int main() {
+  paddle_mobile::fpga::open_device();
+  format_data();
+  DLOG << "format data done";
+  print_fp32(reinterpret_cast<float*>(ifm), ifm_size, 200);
+  DLOG << "print input done";
+  test_bypass();
+  DLOG << "test done";
+  print_fp16(reinterpret_cast<int16_t*>(ofm), ofm_size, 200);
+  std::cout << "Computation done" << std::endl;
+  return 0;
+}
+
+#endif
diff --git a/tools/op.cmake b/tools/op.cmake
index 7d19591efc..ae1ac1a4ff 100644
--- a/tools/op.cmake
+++ b/tools/op.cmake
@@ -133,9 +133,11 @@ if (CON GREATER -1)
     set(SOFTMAX_OP ON)
     set(FUSION_CONVBNRELU_OP ON)
     set(FUSION_CONVBN_OP ON)
-#    set(CONV_TRANSPOSE_OP ON)
-#    set(SLICE_OP ON)
-#    set(ELEMENTWISEADD_OP ON)
+    set(CONV_TRANSPOSE_OP ON)
+    set(FUSION_DECONVRELU_OP ON)
+    set(SLICE_OP ON)
+    set(TANH_OP ON)
+    set(ELEMENTWISEADD_OP ON)
     set(FOUND_MATCH ON)
 endif()
 
@@ -445,3 +447,9 @@ if (DEQUANT_OP)
     add_definitions(-DDEQUANT_OP)
 endif()
 
+if (TANH_OP)
+    add_definitions(-DTANH_OP)
+endif()
+if (FUSION_DECONVRELU_OP)
+    add_definitions(-DFUSION_DECONVRELU_OP)
+endif()
\ No newline at end of file
--
GitLab
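The fp32/fp16 converters moved into fpga_common.cpp handle normalized values only: fp16_2_fp32 special-cases zero on the way back, but fp32_2_fp16 does not treat zero, denormals or infinities specially. A small round-trip check within those limits (hypothetical test values, not part of the patch):

    #include <cstdint>
    #include <cstdio>
    #include "fpga/V2/fpga_common.h"

    int main() {
      // Normalized values only; zero, denormals and infinities are
      // outside the converters' supported range.
      float samples[] = {1.0f, -2.5f, 0.333f, 100.0f};
      for (float v : samples) {
        int16_t h = paddle_mobile::fpga::fp32_2_fp16(v);
        float back = paddle_mobile::fpga::fp16_2_fp32(h);
        std::printf("%f -> 0x%04x -> %f\n", v,
                    static_cast<unsigned>(static_cast<uint16_t>(h)), back);
      }
      return 0;
    }

For 1.0f this prints 0x3c00 and recovers 1.0 exactly; values with more than ten significant mantissa bits come back rounded, as expected for fp16.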