diff --git a/src/fpga/V2/api.cpp b/src/fpga/V2/api.cpp index ada463ca74ca95d3984ac05a45f9aacf513cc6ff..324ee4f5381a20a9a34000045b130d61f71ec116 100644 --- a/src/fpga/V2/api.cpp +++ b/src/fpga/V2/api.cpp @@ -13,46 +13,30 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "fpga/V2/api.h" -#include -#include #include -#include #include "fpga/V2/bias_scale.h" +#include "fpga/V2/config.h" #include "fpga/V2/filter.h" #include "fpga/V2/image.h" -#define FPGA_TEST_MODE -// #define PADDLE_MOBILE_OS_LINUX namespace paddle_mobile { namespace fpga { - -static int fd = -1; -static const char *device_path = "/dev/fpgadrv0"; static std::map memory_map; -static inline int do_ioctl(int req, const void *arg) { -#ifdef PADDLE_MOBILE_OS_LINUX - int result = ioctl(fd, req, (uint64_t)arg); - PADDLE_MOBILE_ENFORCE(result == 0, "ioctl didn't return correctly"); - return result; -#else - return -1; -#endif +int open_device() { + int ret = open_device_driver(); + return ret; } -int open_device() { - if (fd == -1) { - fd = open(device_path, O_RDWR); - } - return fd; +int close_device() { + int ret = close_device_driver(); + return ret; } -// memory management; void *fpga_malloc(size_t size) { static uint64_t counter = 0; - -#ifdef PADDLE_MOBILE_OS_LINUX - auto ptr = mmap64(nullptr, size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0); +#ifdef PADDLE_MOBILE_ZU5 + auto ptr = fpga_malloc_driver(size); #else auto ptr = malloc(size); #endif @@ -66,13 +50,12 @@ void *fpga_malloc(size_t size) { void fpga_free(void *ptr) { static uint64_t counter = 0; size_t size = 0; - auto iter = memory_map.find(ptr); // std::map::iterator if (iter != memory_map.end()) { size = iter->second; memory_map.erase(iter); -#ifdef PADDLE_MOBILE_OS_LINUX - munmap(ptr, size); +#ifdef PADDLE_MOBILE_ZU5 + fpga_free_driver(ptr); #else free(ptr); #endif @@ -84,24 +67,6 @@ void fpga_free(void *ptr) { } } -void fpga_copy(void *dest, const void *src, size_t num) { - memcpy(dest, src, num); -} - -int fpga_flush(void *address, size_t size) { - struct MemoryCacheArgs args = {nullptr}; - args.address = address; - args.size = size; - return do_ioctl(IOCTL_MEMCACHE_FLUSH, &args); -} - -int fpga_invalidate(void *address, size_t size) { - struct MemoryCacheArgs args = {nullptr}; - args.address = address; - args.size = size; - return do_ioctl(IOCTL_MEMCACHE_INVAL, &args); -} - half fp32_2_fp16(float fp32_num) { unsigned long tmp = *(unsigned long *)(&fp32_num); // NOLINT auto t = (half)(((tmp & 0x007fffff) >> 13) | ((tmp & 0x80000000) >> 16) | @@ -123,136 +88,13 @@ float fp16_2_fp32(half fp16_num) { return fp32_num; } -int ComputeBasicConv(const struct ConvArgs &args) { -#ifdef FPGA_TEST_MODE - DLOG << "======Compute Basic Conv======"; - DLOG << " relu_enabled:" << args.relu_enabled - << " sb_address:" << args.sb_address - << " filter_address:" << args.filter_address - << " filter_num:" << args.filter_num - << " group_num:" << args.group_num; - DLOG << " image_address:" << args.image.address - << " image_scale_address:" << args.image.scale_address - << " image_channels:" << args.image.channels - << " image_height:" << args.image.height - << " image_width:" << args.image.width - << " pad_height:" << args.image.pad_height - << " pad_width:" << args.image.pad_width; - DLOG << " kernel_height:" << args.kernel.height - << " kernel_width:" << args.kernel.width - << " stride_h:" << args.kernel.stride_h - << " stride_w:" << args.kernel.stride_w; - DLOG << " out_address:" << args.output.address - << " out_scale_address:" << args.output.scale_address; -#endif - return do_ioctl(IOCTL_CONFIG_CONV, &args); -} - -int ComputeFpgaConv(const struct SplitConvArgs &args) { - ComputeBasicConv(args.conv_args[0]); -} - -int ComputeFpgaPool(const struct PoolingArgs &args) { -#ifdef FPGA_TEST_MODE - DLOG << "=============ComputeFpgaPool==========="; - DLOG << " mode:" << args.mode - << " kernel_reciprocal:" << fp16_2_fp32(args.kernel_reciprocal); - DLOG << " image_address:" << args.image.address - << " image_scale_address:" << args.image.scale_address - << " image_channels:" << args.image.channels - << " image_height:" << args.image.height - << " image_width:" << args.image.width - << " pad_height:" << args.image.pad_height - << " pad_width:" << args.image.pad_width; - DLOG << " kernel_height:" << args.kernel.height - << " kernel_width:" << args.kernel.width - << " stride_h:" << args.kernel.stride_h - << " stride_w:" << args.kernel.stride_w; - DLOG << " out_address:" << args.output.address - << " out_scale_address:" << args.output.scale_address; -#endif - - return do_ioctl(IOCTL_CONFIG_POOLING, &args); -} - -int ComputeFpgaEWAdd(const struct EWAddArgs &args) { -#ifdef FPGA_TEST_MODE - DLOG << "=============ComputeFpgaEWAdd==========="; - DLOG << " relu_enabled:" << args.relu_enabled - << " const0:" << fp16_2_fp32(int16_t(args.const0)) - << " const1:" << fp16_2_fp32(int16_t(args.const1)); - DLOG << " image0_address:" << args.image0.address - << " image0_scale_address:" << args.image0.scale_address - << " image0_channels:" << args.image0.channels - << " image0_height:" << args.image0.height - << " image0_width:" << args.image0.width - << " pad0_height:" << args.image0.pad_height - << " pad0_width:" << args.image0.pad_width; - DLOG << " image1_address:" << args.image1.address - << " image1_scale_address:" << args.image1.scale_address - << " image1_channels:" << args.image1.channels - << " image1_height:" << args.image1.height - << " image1_width:" << args.image1.width - << " pad1_height:" << args.image1.pad_height - << " pad_width:" << args.image1.pad_width; - DLOG << " out_address:" << args.output.address - << " out_scale_address:" << args.output.scale_address; -#endif - - return do_ioctl(IOCTL_CONFIG_EW, &args); -} -int PerformBypass(const struct BypassArgs &args) { -#ifdef FPGA_TEST_MODE - DLOG << "=============ComputeFpgaBypass==========="; - DLOG << " input_type:" << args.input_data_type - << " output_type:" << args.output_data_type - << " input_layout_type:" << args.input_layout_type - << " output_layout_type:" << args.output_layout_type; - DLOG << " image_address:" << args.image.address - << " image_scale_address:" << args.image.scale_address - << " image_channels:" << args.image.channels - << " image_height:" << args.image.height - << " image_width:" << args.image.width - << " pad_height:" << args.image.pad_height - << " pad_width:" << args.image.pad_width; - DLOG << " out_address:" << args.output.address - << " out_scale_address:" << args.output.scale_address; -#endif - - return do_ioctl(IOCTL_CONFIG_BYPASS, &args); -} - -int ComputeFPGAConcat(const struct ConcatArgs &args) { -#ifdef FPGA_TEST_MODE - DLOG << "=============ComputeFpgaConcat==========="; - DLOG << " Image_num: " << args.image_num - << " out_address:" << args.image_out - << " out_scale_address:" << args.scale_out - << " out_channel:" << args.out_channel; - DLOG << " image_height:" << args.height << " image_width:" << args.width; - for (int i = 0; i < args.image_num; i++) { - DLOG << " " << i << "th: "; - DLOG << " channel_num:" << args.channel_num[i] - << " aligned_channel_num:" << args.aligned_channel_num[i] - << " image_address:" << args.images_in[i] - << " image_scale_address:" << args.scales_in[i]; - } -#endif - - image::concat_images(args.images_in, args.scales_in, args.image_out, - args.scale_out, args.image_num, args.channel_num, - args.height, args.width, args.aligned_channel_num, - args.out_channel); - return 0; -} - void format_image(framework::Tensor *image_tensor) { auto dims = image_tensor->dims(); auto channel = dims[1], height = dims[2], width = dims[3]; auto data_ptr = image_tensor->data(); size_t memory_size = channel * height * width * sizeof(float); auto new_data = (float *)fpga_malloc(memory_size); // NOLINT - fpga_copy(new_data, data_ptr, memory_size); + memcpy(new_data, data_ptr, memory_size); int aligned_channel = filter::calc_aligned_channel((int)channel); // NOLINT image::format_image(&new_data, (int)channel, (int)height, // NOLINT (int)width, // NOLINT @@ -265,7 +107,7 @@ void format_fp16_ofm(framework::Tensor *ofm_tensor, int aligned_channel) { size_t memory_size = 0; if (dims.size() == 4) { auto height = dims[2], width = dims[3]; - memory_size = height * width * aligned_channel * sizeof(half); + memory_size = (height + 1) / 2 * 2 * width * aligned_channel * sizeof(half); } else if (dims.size() == 2) { memory_size = aligned_channel * sizeof(half); } else { @@ -319,7 +161,7 @@ void format_filter(framework::Tensor *filter_tensor, float max_value, auto data_ptr = filter_tensor->data(); size_t memory_size = num * channel * height * width * sizeof(float); auto new_data = (float *)fpga_malloc(memory_size); // NOLINT - fpga_copy(new_data, data_ptr, memory_size); + memcpy(new_data, data_ptr, memory_size); filter::format_filter(&new_data, (int)num, (int)channel, // NOLINT (int)height, // NOLINT (int)width, group_num, max_value); // NOLINT @@ -334,7 +176,7 @@ void format_fc_filter(framework::Tensor *filter_tensor, float max_value) { auto data_ptr = filter_tensor->data(); size_t memory_size = num * channel * height * width * sizeof(float); auto new_data = (float *)fpga_malloc(memory_size); // NOLINT - fpga_copy(new_data, data_ptr, memory_size); + memcpy(new_data, data_ptr, memory_size); filter::format_fc_filter(&new_data, (int)num, (int)channel, // NOLINT (int)height, // NOLINT (int)width, 1, max_value); // NOLINT diff --git a/src/fpga/V2/api.h b/src/fpga/V2/api.h index 5c56b5b8c3a559dc4734dc8d5e7138ef6bccab20..aac97bec225a4940f710172c115e06452469d289 100644 --- a/src/fpga/V2/api.h +++ b/src/fpga/V2/api.h @@ -18,6 +18,8 @@ limitations under the License. */ #include #include #include +#include "fpga/V2/driver/driver.h" +#include "fpga/V2/driver/pe.h" #include "framework/tensor.h" namespace paddle_mobile { @@ -33,16 +35,6 @@ enum LayoutType { LAYOUT_HWC = 0, }; -struct VersionArgs { - void* buffer; -}; - -struct MemoryCopyArgs { - void* src; - void* dest; - size_t size; -}; - struct KernelArgs { uint32_t width; uint32_t height; @@ -128,56 +120,10 @@ struct BypassArgs { struct ImageOutputArgs output; }; -struct FpgaRegWriteArgs { - uint64_t address; // - uint64_t value; -}; - -struct FpgaRegReadArgs { - uint64_t address; - uint64_t value; -}; - -struct MemoryCacheArgs { - void* address; - size_t size; -}; - -#define IOCTL_FPGA_MAGIC 'FPGA' - -#define IOCTL_VERSION _IOW(IOCTL_FPGA_MAGIC, 01, struct VersionArgs) - -#define IOCTL_SEPARATOR_0 10 - -#define IOCTL_MEM_COPY _IOW(IOCTL_FPGA_MAGIC, 11, struct MemoryCopyArgs) -#define IOCTL_MEMCACHE_INVAL _IOW(IOCTL_FPGA_MAGIC, 12, struct MemoryCacheArgs) -#define IOCTL_MEMCACHE_FLUSH _IOW(IOCTL_FPGA_MAGIC, 13, struct MemoryCacheArgs) - -#define IOCTL_SEPARATOR_1 20 - -#define IOCTL_CONFIG_CONV _IOW(IOCTL_FPGA_MAGIC, 21, struct ConvArgs) -#define IOCTL_CONFIG_POOLING _IOW(IOCTL_FPGA_MAGIC, 22, struct PoolingArgs) -#define IOCTL_CONFIG_EW _IOW(IOCTL_FPGA_MAGIC, 23, struct EWAddArgs) -#define IOCTL_CONFIG_BYPASS _IOW(IOCTL_FPGA_MAGIC, 24, struct BypassArgs) -#define IOCTL_FPGA_REG_READ _IOW(IOCTL_FPGA_MAGIC, 28, struct FpgaRegReadArgs) -#define IOCTL_FPGA_REG_WRITE _IOW(IOCTL_FPGA_MAGIC, 29, struct FpgaRegWriteArgs) - -//============================== API ============================= - int open_device(); int close_device(); - void* fpga_malloc(size_t size); void fpga_free(void* ptr); -void fpga_copy(void* dst, const void* src, size_t num); -int fpga_flush(void* address, size_t size); -int fpga_invalidate(void* address, size_t size); - -int PerformBypass(const struct BypassArgs& args); -int ComputeFpgaConv(const struct SplitConvArgs& args); -int ComputeFpgaPool(const struct PoolingArgs& args); -int ComputeFpgaEWAdd(const struct EWAddArgs& args); -int ComputeFPGAConcat(const struct ConcatArgs& args); static inline int align_to_x(int num, int x) { return (num + x - 1) / x * x; } diff --git a/src/fpga/V2/bias_scale.cpp b/src/fpga/V2/bias_scale.cpp index d07d8a11b8bdd1a1442bd1df72ed586ccd958f7e..8a0fd426194f6ab5e699f084ff6277920d8c89b4 100644 --- a/src/fpga/V2/bias_scale.cpp +++ b/src/fpga/V2/bias_scale.cpp @@ -39,7 +39,6 @@ void align_element(float **data_in, int num, int num_after_alignment) { void format_bias_scale_array(float **data_in, int num, int num_after_alignment) { align_element(data_in, num, num_after_alignment); - fpga_flush(*data_in, 2 * num_after_alignment * sizeof(float)); } } // namespace bias_scale diff --git a/src/fpga/V2/config.h b/src/fpga/V2/config.h new file mode 100644 index 0000000000000000000000000000000000000000..27187c7b854c84d501949db41fe89f9dca1d2bf1 --- /dev/null +++ b/src/fpga/V2/config.h @@ -0,0 +1,18 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#define PADDLE_MOBILE_ZU5 +#define FPGA_PRINT_MODE diff --git a/src/fpga/V2/driver/bitmap.cpp b/src/fpga/V2/driver/bitmap.cpp new file mode 100644 index 0000000000000000000000000000000000000000..9c99f6446caf80f78a5c8737a41a4a80f93395d9 --- /dev/null +++ b/src/fpga/V2/driver/bitmap.cpp @@ -0,0 +1,131 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "fpga/V2/driver/bitmap.h" + +namespace fpga_bitmap { +void bitmap_set(uint64_t *map, unsigned int start, int len) { + uint64_t *p = map + BIT_WORD(start); + const unsigned int size = start + len; + int bits_to_set = BITS_PER_LONG - (start % BITS_PER_LONG); + uint64_t mask_to_set = BITMAP_FIRST_WORD_MASK(start); + + while (len - bits_to_set >= 0) { + *p |= mask_to_set; + len -= bits_to_set; + bits_to_set = BITS_PER_LONG; + mask_to_set = ~0UL; + p++; + } + if (len) { + mask_to_set &= BITMAP_LAST_WORD_MASK(size); + *p |= mask_to_set; + } +} + +void bitmap_clear(uint64_t *map, unsigned int start, int len) { + uint64_t *p = map + BIT_WORD(start); + const unsigned int size = start + len; + int bits_to_clear = BITS_PER_LONG - (start % BITS_PER_LONG); + uint64_t mask_to_clear = BITMAP_FIRST_WORD_MASK(start); + + while (len - bits_to_clear >= 0) { + *p &= ~mask_to_clear; + len -= bits_to_clear; + bits_to_clear = BITS_PER_LONG; + mask_to_clear = ~0UL; + p++; + } + if (len) { + mask_to_clear &= BITMAP_LAST_WORD_MASK(size); + *p &= ~mask_to_clear; + } +} + +static uint64_t ffs(uint64_t data) { + uint64_t bit = 0; + int i = 0; + + for (i = 0; i < sizeof(data); i++) { + if (data & (1 << i)) { + bit = i; + break; + } + } + + return bit; +} + +static uint64_t _find_next_bit(const uint64_t *addr, uint64_t nbits, + uint64_t start, uint64_t invert) { + uint64_t tmp = 0; + + if (!nbits || start >= nbits) return nbits; + + tmp = addr[start / BITS_PER_LONG] ^ invert; + + /* Handle 1st word. */ + tmp &= BITMAP_FIRST_WORD_MASK(start); + start = round_down(start, BITS_PER_LONG); + + while (!tmp) { + start += BITS_PER_LONG; + if (start >= nbits) return nbits; + + tmp = addr[start / BITS_PER_LONG] ^ invert; + } + + return (start + ffs(tmp)) < nbits ? (start + ffs(tmp)) : nbits; +} + +uint64_t find_next_zero_bit(const uint64_t *addr, uint64_t size, + uint64_t offset) { + return _find_next_bit(addr, size, offset, ~0UL); +} + +uint64_t find_next_bit(const uint64_t *addr, uint64_t size, uint64_t offset) { + return _find_next_bit(addr, size, offset, 0UL); +} + +uint64_t bitmap_find_next_zero_area_off(uint64_t *map, uint64_t size, + uint64_t start, unsigned int nr, + uint64_t align_mask, + uint64_t align_offset) { + uint64_t index = 0; + uint64_t end = 0; + uint64_t i = 0; + +again: + index = find_next_zero_bit(map, size, start); + + /* Align allocation */ + index = __ALIGN_MASK(index + align_offset, align_mask) - align_offset; + + end = index + nr; + if (end > size) return end; + i = find_next_bit(map, end, index); + if (i < end) { + start = i + 1; + goto again; + } + + return index; +} + +uint64_t bitmap_find_next_zero_area(uint64_t *map, uint64_t size, + uint64_t start, unsigned int nr, + uint64_t align_mask) { + return bitmap_find_next_zero_area_off(map, size, start, nr, align_mask, 0); +} +} // namespace fpga_bitmap diff --git a/src/fpga/V2/driver/bitmap.h b/src/fpga/V2/driver/bitmap.h new file mode 100644 index 0000000000000000000000000000000000000000..272cddf23367e17759a4493ace64119a9e351595 --- /dev/null +++ b/src/fpga/V2/driver/bitmap.h @@ -0,0 +1,37 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include +#include + +#define BITS_PER_LONG 64 +#define BIT_WORD(nr) ((nr) / BITS_PER_LONG) +#define BITMAP_FIRST_WORD_MASK(start) (~0UL << ((start) & (BITS_PER_LONG - 1))) +#define BITMAP_LAST_WORD_MASK(nbits) (~0UL >> (-(nbits) & (BITS_PER_LONG - 1))) + +#define __ALIGN_KERNEL_MASK(x, mask) (((x) + (mask)) & ~(mask)) +#define __ALIGN_MASK(x, mask) __ALIGN_KERNEL_MASK((x), (mask)) + +#define round_down(x, y) ((x) & ((y)-1)) + +namespace fpga_bitmap { +void bitmap_set(uint64_t *map, unsigned int start, int len); +void bitmap_clear(uint64_t *map, unsigned int start, int len); +uint64_t bitmap_find_next_zero_area(uint64_t *map, uint64_t size, + uint64_t start, unsigned int nr, + uint64_t align_mask); + +} // namespace fpga_bitmap diff --git a/src/fpga/V2/driver/driver.cpp b/src/fpga/V2/driver/driver.cpp new file mode 100644 index 0000000000000000000000000000000000000000..ed78fa5ebcc089e136ebc2a79d56874885735879 --- /dev/null +++ b/src/fpga/V2/driver/driver.cpp @@ -0,0 +1,358 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "common/enforce.h" +#include "fpga/V2/driver/bitmap.h" +#include "fpga/V2/driver/driver.h" + +namespace paddle_mobile { +namespace fpga { +struct FPGA_INFO g_fpgainfo; + +int open_drvdevice() { + if (g_fpgainfo.fd_drv == -1) { + g_fpgainfo.fd_drv = open(g_fpgainfo.drvdevice_path, O_RDWR); + } + return g_fpgainfo.fd_drv; +} + +int open_memdevice() { + if (g_fpgainfo.fd_mem == -1) { + g_fpgainfo.fd_mem = open(g_fpgainfo.memdevice_path, O_RDWR | O_DSYNC); + } + return g_fpgainfo.fd_mem; +} + +void pl_reset() { + // DLOG << "PL RESET"; + + // reg_writeq(0x5a, REG_FPGA_RESET); + usleep(100 * 1000); +} + +void setup_pe(struct pe_data_s *pe_data, struct fpga_pe *pe, + char const *type_name, int pe_idx) { + memset(pe, 0, sizeof(struct fpga_pe)); + + pe->outer = pe_data; + snprintf(pe->type_name, MAX_TYPE_NAME_LENTH, "%s", type_name); + + pe->status = IDLE; + pe->interrupt_cnt = 0; + pe_data->pes[pe_idx] = pe; + pe_data->pe_num++; +} + +void pl_init() { + struct pe_data_s *pe_data = nullptr; + + pl_reset(); + + pe_data = (struct pe_data_s *)malloc(sizeof(struct pe_data_s)); + if (pe_data == nullptr) { + DLOG << "pe_data malloc error!"; + return; + } + memset(pe_data, 0, sizeof(struct pe_data_s)); + pthread_mutex_init(&pe_data->mutex, 0); + + setup_pe(pe_data, &pe_data->pe_conv, "CONV", PE_IDX_CONV); + setup_pe(pe_data, &pe_data->pe_pooling, "POOLING", PE_IDX_POOLING); + setup_pe(pe_data, &pe_data->pe_ew, "EW", PE_IDX_EW); + setup_pe(pe_data, &pe_data->pe_bypass, "BYPASS", PE_IDX_BYPASS); + + g_fpgainfo.pe_data = pe_data; +} + +void pl_destroy() { + struct pe_data_s *pe_data = g_fpgainfo.pe_data; + pthread_mutex_destroy(&pe_data->mutex); + free(pe_data); +} + +void pl_start() { + struct pe_data_s *pe_data = g_fpgainfo.pe_data; + + pthread_mutex_unlock(&pe_data->mutex); +} + +void pl_stop() { + struct pe_data_s *pe_data = g_fpgainfo.pe_data; + + pthread_mutex_lock(&pe_data->mutex); +} + +void pl_reinit() { + struct pe_data_s *pe_data = g_fpgainfo.pe_data; + struct fpga_pe *pe = nullptr; + int i = 0; + + pl_stop(); + pl_reset(); + pl_start(); + + for (i = 0; i < pe_data->pe_num; i++) { + pe = pe_data->pes[i]; + pe->status = IDLE; + pe->interrupt_cnt = 0; + } + + pl_start(); +} + +int pl_get_status() { return 0; } + +/*tmie单位us*/ +int fpga_regpoll(uint64_t reg, uint64_t val, int time) { + uint64_t i = 0; + /*timeout精确性待确认*/ + int64_t timeout = time * CPU_FREQ / 1000000; + + for (i = 0; i < timeout; i++) { + if (val == reg_readq(reg)) { + break; + } + } + + if (i <= timeout) { + return 0; + } else { + return -1; + } +} + +/*内存管理*/ +int memory_request(struct fpga_memory *memory, size_t size, uint64_t *addr) { + uint64_t _nr = DIV_ROUND_UP(size, FPGA_PAGE_SIZE); + unsigned int nr = (unsigned int)_nr; + int ret = 0; + + pthread_mutex_lock(&memory->mutex); + + unsigned int pos = (unsigned int)fpga_bitmap::bitmap_find_next_zero_area( + memory->bitmap, memory->page_num, 0, nr, 0); + if (pos <= memory->page_num) { + uint64_t address_ofset = + memory->mem_start + ((uint64_t)pos) * FPGA_PAGE_SIZE; + fpga_bitmap::bitmap_set(memory->bitmap, pos, nr); + memory->nr[pos] = nr; + + *addr = address_ofset; + } else { + ret = -ENOMEM; + } + + pthread_mutex_unlock(&memory->mutex); + + return ret; +} + +void memory_release(struct fpga_memory *memory) { + pthread_mutex_lock(&memory->mutex); + fpga_bitmap::bitmap_clear(memory->bitmap, 0, memory->page_num); + pthread_mutex_unlock(&memory->mutex); +} + +int create_fpga_memory_inner(struct fpga_memory *memory, size_t memory_size) { + int rc = 0; + + uint64_t *bitmap = nullptr; + unsigned int *nr = nullptr; + + // 不允许多份memory创建,所以创建memory结构体不存在互斥 + // pthread_mutex_lock(&memory->mutex); + memory->page_num = (unsigned int)(memory_size / FPGA_PAGE_SIZE); + memory->page_num_long = DIV_ROUND_UP(memory->page_num, BITS_PER_LONG); + + bitmap = + (uint64_t *)malloc(sizeof(int64_t) * memory->page_num_long); // NOLINT + if (!bitmap) { + rc = -EFAULT; + return rc; + } + memory->bitmap = bitmap; + + nr = (unsigned int *)calloc(memory->page_num, sizeof(unsigned int)); + if (!nr) { + rc = -EFAULT; + free(bitmap); + return rc; + } + memory->nr = nr; + + memory->mem_start = FPGA_MEM_PHY_ADDR; + memory->mem_end = FPGA_MEM_SIZE; + // pthread_mutex_unlock(memory->mutex); + + return rc; +} + +int create_fpga_memory(struct fpga_memory **memory_info) { + int rc = 0; + + *memory_info = (struct fpga_memory *)malloc(sizeof(struct fpga_memory)); + if (*memory_info == NULL) { + rc = -EFAULT; + return rc; + } + pthread_mutex_init(&((*memory_info)->mutex), nullptr); + + rc = create_fpga_memory_inner(*memory_info, FPGA_MEM_SIZE); + if (rc) { + free(*memory_info); + } + + return rc; +} + +int init_fpga_memory(struct fpga_memory *memory) { + int rc = 0; + + if (!memory) { + rc = -EFAULT; + return rc; + } + + // spin_lock_init(&memory->spin); + fpga_bitmap::bitmap_clear(memory->bitmap, 0, memory->page_num); + fpga_bitmap::bitmap_set(memory->bitmap, 0, 1); // NOTE reserve fpga page 0. + + return 0; +} + +void destroy_fpga_memory(struct fpga_memory *memory) { + if (memory) { + free(memory->nr); + free(memory->bitmap); + free(memory); + } +} + +int fpga_memory_add() { + int rc = 0; + + rc = create_fpga_memory(&g_fpgainfo.memory_info); + if (rc) { + return rc; + } + + rc = init_fpga_memory(g_fpgainfo.memory_info); + if (rc) { + destroy_fpga_memory(g_fpgainfo.memory_info); + return rc; + } + + return 0; +} + +uint64_t vaddr_to_paddr(void *address) { + uint64_t paddr = 0; + auto iter = g_fpgainfo.fpga_vaddr2paddr_map.find(address); + if (iter != g_fpgainfo.fpga_vaddr2paddr_map.end()) { + paddr = iter->second; + } else { + DLOG << "Invalid pointer"; + } + + return paddr; +} + +void *fpga_reg_malloc(size_t size) { + void *ret = nullptr; + ret = mmap64(nullptr, size, PROT_READ | PROT_WRITE, MAP_SHARED, + g_fpgainfo.fd_drv, FPGA_REG_PHY_ADDR); + // PADDLE_MOBILE_ENFORCE(ret != (void *)-1, "Should not be -1"); + + g_fpgainfo.fpga_addr2size_map.insert(std::make_pair(ret, size)); + + return ret; +} + +void *fpga_malloc_driver(size_t size) { + void *ret = nullptr; + uint64_t phy_addr = 0; + + memory_request(g_fpgainfo.memory_info, size, &phy_addr); + + ret = mmap64(nullptr, size, PROT_READ | PROT_WRITE, MAP_SHARED, + g_fpgainfo.fd_mem, phy_addr); + PADDLE_MOBILE_ENFORCE(ret != (void *)-1, "Should not be -1"); + + g_fpgainfo.fpga_vaddr2paddr_map.insert(std::make_pair(ret, phy_addr)); + g_fpgainfo.fpga_addr2size_map.insert(std::make_pair(ret, size)); + + return ret; +} + +void fpga_free_driver(void *ptr) { + size_t size = 0; + + auto iter = g_fpgainfo.fpga_addr2size_map.find(ptr); + if (iter != g_fpgainfo.fpga_addr2size_map.end()) { + size = iter->second; + g_fpgainfo.fpga_addr2size_map.erase(iter); + munmap(ptr, size); + } else { + DLOG << "Invalid pointer"; + } +} + +int open_device_driver() { + g_fpgainfo.FpgaRegPhyAddr = FPGA_REG_PHY_ADDR; + g_fpgainfo.FpgaMemPhyAddr = FPGA_MEM_PHY_ADDR; + g_fpgainfo.FpgaRegVirAddr = nullptr; + g_fpgainfo.pe_data = nullptr; + g_fpgainfo.drvdevice_path = "/dev/fpgadrv0"; + g_fpgainfo.memdevice_path = "/dev/fpgamem0"; + g_fpgainfo.fd_drv = -1; + g_fpgainfo.fd_mem = -1; + + int ret = 0; + ret = open_drvdevice(); + ret |= open_memdevice(); + + g_fpgainfo.FpgaRegVirAddr = + (uint64_t *)fpga_reg_malloc(FPGA_REG_SIZE); // NOLINT + fpga_memory_add(); + + pl_init(); + + return ret; +} + +int close_device_driver() { + pl_destroy(); + fpga_free_driver(g_fpgainfo.FpgaRegVirAddr); + memory_release(g_fpgainfo.memory_info); + destroy_fpga_memory(g_fpgainfo.memory_info); + + return 0; +} + +} // namespace fpga +} // namespace paddle_mobile diff --git a/src/fpga/V2/driver/driver.h b/src/fpga/V2/driver/driver.h new file mode 100644 index 0000000000000000000000000000000000000000..ee01454ac593e7b5a146a8fac4f81a957c2b1e95 --- /dev/null +++ b/src/fpga/V2/driver/driver.h @@ -0,0 +1,120 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include +#include +#include +#include +#include + +#include "common/log.h" + +namespace paddle_mobile { +namespace fpga { + +#define DIV_ROUND_UP(n, d) (((n) + (d)-1) / (d)) + +#define FPGA_REG_PHY_ADDR 0xa0000000 +#define FPGA_REG_SIZE 0x1000 +#define FPGA_MEM_PHY_ADDR 0x20000000 +#define FPGA_MEM_SIZE 0x20000000 + +#define CPU_FREQ 1000000000 + +#define FPGA_PAGE_SIZE (16UL * 1024UL) + +// PE related macros +const int MAX_NUM_PES = 6; +const size_t MAX_TYPE_NAME_LENTH = 8; + +const int PE_IDX_CONV = 0; +const int PE_IDX_POOLING = 1; +const int PE_IDX_EW = 2; +const int PE_IDX_BYPASS = 3; + +enum pe_status { IDLE = 0, BUSY = 1 }; + +struct fpga_pe { + char type_name[MAX_TYPE_NAME_LENTH + 1]; + struct pe_data_s *outer; + pe_status status; // 0=idle 1=busy -1=fail + uint64_t interrupt_cnt; +}; + +struct pe_data_s { + pthread_mutex_t mutex; + struct fpga_pe pe_conv; + struct fpga_pe pe_pooling; + struct fpga_pe pe_ew; + struct fpga_pe pe_bypass; + + struct fpga_pe *pes[MAX_NUM_PES]; + int pe_num; +}; + +struct fpga_memory { + pthread_mutex_t mutex; + uint64_t *bitmap; + unsigned int *nr; + unsigned int page_num; + unsigned int page_num_long; + uint64_t mem_start; + uint64_t mem_end; +}; + +struct FPGA_INFO { + uint64_t FpgaRegPhyAddr; + uint64_t FpgaMemPhyAddr; + pthread_t poll_pid; + void *FpgaRegVirAddr; + struct pe_data_s *pe_data; + + std::map fpga_addr2size_map; + std::map fpga_vaddr2paddr_map; + const char *drvdevice_path; + const char *memdevice_path; + struct fpga_memory *memory_info; + int fd_drv; + int fd_mem; +}; + +extern struct FPGA_INFO g_fpgainfo; + +inline uint64_t reg_readq(uint32_t offset) { + // DLOG << "offset : " << offset; + uint64_t value = + *(uint64_t *)((uint8_t *)g_fpgainfo.FpgaRegVirAddr + offset); // NOLINT + + return value; +} + +inline void reg_writeq(uint64_t value, uint32_t offset) { + // DLOG << "offset : " << offset << ", value : " << value; + *(uint64_t *)((uint8_t *)g_fpgainfo.FpgaRegVirAddr + offset) = // NOLINT + value; +} + +int open_device_driver(); +int close_device_driver(); +void *fpga_malloc_driver(size_t size); +void fpga_free_driver(void *ptr); +/*pe*/ + +uint64_t vaddr_to_paddr(void *address); +int fpga_regpoll(uint64_t reg, uint64_t val, int time); + +} // namespace fpga +} // namespace paddle_mobile diff --git a/src/fpga/V2/driver/pe.cpp b/src/fpga/V2/driver/pe.cpp new file mode 100644 index 0000000000000000000000000000000000000000..52cde04601bc5e002ce2d8e15b3bdb1ce64b340a --- /dev/null +++ b/src/fpga/V2/driver/pe.cpp @@ -0,0 +1,244 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "fpga/V2/driver/pe.h" +#include "fpga/V2/config.h" +#include "fpga/V2/driver/driver.h" +#include "fpga/V2/filter.h" +#include "fpga/V2/image.h" + +namespace paddle_mobile { +namespace fpga { +#define MUL8(x) (x * 8) +#define BYPASS_DONE 1 + +float Findfp16Max() { + uint16_t abs_vals[16]; + uint64_t max_fp16; + + max_fp16 = reg_readq(MUL8(49)); + abs_vals[0] = (uint16_t)(0x0000007f & (max_fp16)); // NOLINT + abs_vals[1] = (uint16_t)(0x0000007f & (max_fp16 >> 16)); // NOLINT + abs_vals[2] = (uint16_t)(0x0000007f & (max_fp16 >> 32)); // NOLINT + abs_vals[3] = (uint16_t)(0x0000007f & (max_fp16 >> 48)); // NOLINT + max_fp16 = reg_readq(MUL8(50)); + abs_vals[4] = (uint16_t)(0x0000007f & (max_fp16)); // NOLINT + abs_vals[5] = (uint16_t)(0x0000007f & (max_fp16 >> 16)); // NOLINT + abs_vals[6] = (uint16_t)(0x0000007f & (max_fp16 >> 32)); // NOLINT + abs_vals[7] = (uint16_t)(0x0000007f & (max_fp16 >> 48)); // NOLINT + max_fp16 = reg_readq(MUL8(51)); + abs_vals[8] = (uint16_t)(0x0000007f & (max_fp16)); // NOLINT + abs_vals[9] = (uint16_t)(0x0000007f & (max_fp16 >> 16)); // NOLINT + abs_vals[10] = (uint16_t)(0x0000007f & (max_fp16 >> 32)); // NOLINT + abs_vals[11] = (uint16_t)(0x0000007f & (max_fp16 >> 48)); // NOLINT + max_fp16 = reg_readq(MUL8(52)); + abs_vals[12] = (uint16_t)(0x0000007f & (max_fp16)); + abs_vals[13] = (uint16_t)(0x0000007f & (max_fp16 >> 16)); // NOLINT + abs_vals[14] = (uint16_t)(0x0000007f & (max_fp16 >> 32)); // NOLINT + abs_vals[15] = (uint16_t)(0x0000007f & (max_fp16 >> 48)); // NOLINT + + uint16_t tmp = 0; + for (int i = 0; i < 16; i++) { + if (tmp < abs_vals[i]) { + tmp = abs_vals[i]; + } + } + return fp16_2_fp32(tmp) / 127.0f; +} + +int ComputeFpgaConv(const struct SplitConvArgs &args) { + ComputeBasicConv(args.conv_args[0]); +} + +int ComputeBasicConv(const struct ConvArgs &args) { +#ifdef FPGA_PRINT_MODE + DLOG << "======Compute Basic Conv======"; + DLOG << " relu_enabled:" << args.relu_enabled + << " sb_address:" << args.sb_address + << " filter_address:" << args.filter_address + << " filter_num:" << args.filter_num + << " group_num:" << args.group_num; + DLOG << " image_address:" << args.image.address + << " image_scale_address:" << args.image.scale_address + << " image_channels:" << args.image.channels + << " image_height:" << args.image.height + << " image_width:" << args.image.width + << " pad_height:" << args.image.pad_height + << " pad_width:" << args.image.pad_width; + DLOG << " kernel_height:" << args.kernel.height + << " kernel_width:" << args.kernel.width + << " stride_h:" << args.kernel.stride_h + << " stride_w:" << args.kernel.stride_w; + DLOG << " out_address:" << args.output.address + << " out_scale_address:" << args.output.scale_address; +#endif + +#ifndef PADDLE_MOBILE_ZU5 + return 0; +#endif + + return 0; +} + +int ComputeFpgaPool(const struct PoolingArgs &args) { +#ifdef FPGA_PRINT_MODE + DLOG << "=============ComputeFpgaPool==========="; + DLOG << " mode:" << args.mode + << " kernel_reciprocal:" << fp16_2_fp32(args.kernel_reciprocal); + DLOG << " image_address:" << args.image.address + << " image_scale_address:" << args.image.scale_address + << " image_channels:" << args.image.channels + << " image_height:" << args.image.height + << " image_width:" << args.image.width + << " pad_height:" << args.image.pad_height + << " pad_width:" << args.image.pad_width; + DLOG << " kernel_height:" << args.kernel.height + << " kernel_width:" << args.kernel.width + << " stride_h:" << args.kernel.stride_h + << " stride_w:" << args.kernel.stride_w; + DLOG << " out_address:" << args.output.address + << " out_scale_address:" << args.output.scale_address; +#endif +#ifndef PADDLE_MOBILE_ZU5 + return 0; +#endif + return 0; +} + +int ComputeFpgaEWAdd(const struct EWAddArgs &args) { +#ifdef FPGA_PRINT_MODE + DLOG << "=============ComputeFpgaEWAdd==========="; + DLOG << " relu_enabled:" << args.relu_enabled + << " const0:" << fp16_2_fp32(int16_t(args.const0)) + << " const1:" << fp16_2_fp32(int16_t(args.const1)); + DLOG << " image0_address:" << args.image0.address + << " image0_scale_address:" << args.image0.scale_address + << " image0_channels:" << args.image0.channels + << " image0_height:" << args.image0.height + << " image0_width:" << args.image0.width + << " pad0_height:" << args.image0.pad_height + << " pad0_width:" << args.image0.pad_width; + DLOG << " image1_address:" << args.image1.address + << " image1_scale_address:" << args.image1.scale_address + << " image1_channels:" << args.image1.channels + << " image1_height:" << args.image1.height + << " image1_width:" << args.image1.width + << " pad1_height:" << args.image1.pad_height + << " pad_width:" << args.image1.pad_width; + DLOG << " out_address:" << args.output.address + << " out_scale_address:" << args.output.scale_address; +#endif +#ifndef PADDLE_MOBILE_ZU5 + return 0; +#endif + return 0; +} + +int PerformBypass(const struct BypassArgs &args) { +#ifdef FPGA_PRINT_MODE + DLOG << "=============ComputeFpgaBypass==========="; + DLOG << " input_type:" << args.input_data_type + << " output_type:" << args.output_data_type + << " input_layout_type:" << args.input_layout_type + << " output_layout_type:" << args.output_layout_type; + DLOG << " image_address:" << args.image.address + << " image_scale_address:" << args.image.scale_address + << " image_channels:" << args.image.channels + << " image_height:" << args.image.height + << " image_width:" << args.image.width + << " pad_height:" << args.image.pad_height + << " pad_width:" << args.image.pad_width; + DLOG << " out_address:" << args.output.address + << " out_scale_address:" << args.output.scale_address; +#endif +#ifndef PADDLE_MOBILE_ZU5 + return 0; +#endif + + uint64_t ifm_src_paddr = vaddr_to_paddr(args.image.address); + uint64_t ifm_dst_paddr = vaddr_to_paddr(args.output.address); + uint64_t bp_enable; + int64_t length; + uint64_t pixels; + + // fp32->fp16 + if ((args.input_data_type) && (!args.output_data_type)) { + pixels = (args.image.channels) * (args.image.width) * (args.image.height); + length = pixels * sizeof(float); + bp_enable = 0x8800000000000000 + length; + } + // fp16->fp32 + else if ((!args.input_data_type) && (args.output_data_type)) { + pixels = filter::calc_aligned_channel((args.image.channels)) * + (args.image.width) * (args.image.height); + length = pixels * sizeof(short); + length = align_to_x((int)length, 64); // NOLINT + bp_enable = 0x8a00000000000000 + length; + } + // fp16->fp16 findmax + else if ((!args.input_data_type) && (!args.output_data_type)) { + pixels = (args.image.channels) * (args.image.width) * (args.image.height); + length = pixels * sizeof(short); + bp_enable = 0x8900000000000000 + length; + } else { + return -1; + } + + // start bypass + reg_writeq(ifm_src_paddr, MUL8(27)); + reg_writeq(ifm_dst_paddr, MUL8(28)); + reg_writeq(0, MUL8(0)); + reg_writeq(bp_enable, MUL8(0)); + // poll + int ret = -1; + ret = fpga_regpoll(MUL8(48), BYPASS_DONE, 0xffffffff); + if (ret != -1) { + // clear "irq" + reg_readq(MUL8(63)); + } + // get max value + if ((!args.input_data_type) && (!args.output_data_type)) { + float scale = Findfp16Max(); + args.output.scale_address[0] = (float)(1.0 / scale); // NOLINT + args.output.scale_address[1] = scale; + } + return ret; +} + +int ComputeFPGAConcat(const struct ConcatArgs &args) { +#ifdef FPGA_PRINT_MODE + DLOG << "=============ComputeFpgaConcat==========="; + DLOG << " Image_num: " << args.image_num + << " out_address:" << args.image_out + << " out_scale_address:" << args.scale_out + << " out_channel:" << args.out_channel; + DLOG << " image_height:" << args.height << " image_width:" << args.width; + for (int i = 0; i < args.image_num; i++) { + DLOG << " " << i << "th: "; + DLOG << " channel_num:" << args.channel_num[i] + << " aligned_channel_num:" << args.aligned_channel_num[i] + << " image_address:" << args.images_in[i] + << " image_scale_address:" << args.scales_in[i]; + } +#endif + + image::concat_images(args.images_in, args.scales_in, args.image_out, + args.scale_out, args.image_num, args.channel_num, + args.height, args.width, args.aligned_channel_num, + args.out_channel); + return 0; +} + +} // namespace fpga +} // namespace paddle_mobile diff --git a/src/fpga/V2/driver/pe.h b/src/fpga/V2/driver/pe.h new file mode 100644 index 0000000000000000000000000000000000000000..4ec3ccb01db1859d4265484644a1e1704cc836c7 --- /dev/null +++ b/src/fpga/V2/driver/pe.h @@ -0,0 +1,29 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ +#pragma once +#include "fpga/V2/api.h" + +namespace paddle_mobile { +namespace fpga { + +int PerformBypass(const struct BypassArgs& args); +int ComputeBasicConv(const struct ConvArgs& args); +int ComputeFpgaPool(const struct PoolingArgs& args); +int ComputeFpgaEWAdd(const struct EWAddArgs& args); + +int ComputeFpgaConv(const struct SplitConvArgs& args); +int ComputeFPGAConcat(const struct ConcatArgs& args); + +} // namespace fpga +} // namespace paddle_mobile diff --git a/src/fpga/V2/filter.cpp b/src/fpga/V2/filter.cpp index ce278edbeed64f2ca413c1f75ff620ee1f44c83d..39d67b2d2d6213baf674dc0bbc3e96f4f182e3c6 100644 --- a/src/fpga/V2/filter.cpp +++ b/src/fpga/V2/filter.cpp @@ -94,7 +94,6 @@ void format_filter(float **data_in, int num, int channel, int height, int width, convert_to_hwc(data_in, num, channel, height, width); align_filter(data_in, num, channel, height, width); int pixel_num = calc_aligned_total_pixel_num(num, channel, height, width); - fpga_flush(*data_in, pixel_num * sizeof(float)); } void convert_fc_filter(float **data_in, int num, int chw) { @@ -114,8 +113,6 @@ void format_fc_filter(float **data_in, int num, int channel, int height, int chw = channel * height * width; convert_fc_filter(data_in, num, chw); align_filter(data_in, num, channel, height, width); - int pixel_num = calc_aligned_total_pixel_num(num, channel, height, width); - fpga_flush(*data_in, pixel_num * sizeof(float)); } float find_max(float *data_in, int data_size) { diff --git a/src/fpga/V2/image.cpp b/src/fpga/V2/image.cpp index 76e8d7f285b158b49938998f5f3fa18b5308caa0..4ce76cd00fb72cc1292efa5be6cc0d0fe7d93107 100644 --- a/src/fpga/V2/image.cpp +++ b/src/fpga/V2/image.cpp @@ -58,7 +58,6 @@ void format_image(float **data_in, int channel, int height, int width, int aligned_channel) { convert_to_hwc(data_in, channel, height, width); align_image(data_in, channel, height, width, aligned_channel); - fpga_flush(*data_in, aligned_channel * height * width * sizeof(float)); } void concat_images(int16_t **images_in, float **scales_in, void *image_out, @@ -70,8 +69,6 @@ void concat_images(int16_t **images_in, float **scales_in, void *image_out, scale_out[1] = 0.0; for (int i = 0; i < image_num; i++) { scale_out[0] = std::max(*scale_out, scales_in[i][0]); - fpga_invalidate(images_in[i], - height * width * aligned_channel_num[i] * sizeof(int16_t)); } scale_out[1] = 1 / scale_out[0]; @@ -86,8 +83,6 @@ void concat_images(int16_t **images_in, float **scales_in, void *image_out, tmp_channel_sum += channel_num[i]; } } - - fpga_flush(image_out, hw * out_channel * sizeof(int16_t)); } } // namespace image diff --git a/src/framework/cl/cl_engine.h b/src/framework/cl/cl_engine.h index 76d08513aa4301b9aa22b159a70a17b7b0619b92..d7b1c912dac304660f39e0e294122d0d27eb9bb6 100644 --- a/src/framework/cl/cl_engine.h +++ b/src/framework/cl/cl_engine.h @@ -90,8 +90,10 @@ class CLEngine { bool BuildProgram(cl_program program) { cl_int status; - status = clBuildProgram(program, 0, 0, "-cl-fast-relaxed-math -I cl_kernel", - 0, 0); + std::string path = "-cl-fast-relaxed-math -I " + + CLEngine::Instance()->GetCLPath() + "/cl_kernel"; + + status = clBuildProgram(program, 0, 0, path.c_str(), 0, 0); CL_CHECK_ERRORS(status); diff --git a/src/framework/executor.cpp b/src/framework/executor.cpp index 81bfaf3a4d07f5a3ef82c19de57f1681dfc1f8c7..0ed3a5d32385963c67d898defc58ab019a09c156 100644 --- a/src/framework/executor.cpp +++ b/src/framework/executor.cpp @@ -704,7 +704,7 @@ void Executor::InitCombineMemory() { } } if (self_alloc) { - delete origin_data; + delete data; } LOG(kLOG_INFO) << " end init combine memory "; } diff --git a/src/operators/elementwise_add_op.cpp b/src/operators/elementwise_add_op.cpp index c956ee70b6b23cdf763cb01dd7c2798f4d6e9351..281cd3d5084a1a15502e1e06865e1024d3b2b639 100644 --- a/src/operators/elementwise_add_op.cpp +++ b/src/operators/elementwise_add_op.cpp @@ -40,4 +40,8 @@ REGISTER_OPERATOR_MALI_GPU(elementwise_add, ops::ElementwiseAddOp); REGISTER_OPERATOR_CL(elementwise_add, ops::ElementwiseAddOp); #endif +#ifdef PADDLE_MOBILE_FPGA +REGISTER_OPERATOR_FPGA(elementwise_add, ops::ElementwiseAddOp); +#endif + #endif diff --git a/src/operators/kernel/arm/slice_kernel.cpp b/src/operators/kernel/arm/slice_kernel.cpp index 62efec9d2fb01568a108df8f3516085d81865bf7..e373b569870c81587377ac02e578397518513a85 100644 --- a/src/operators/kernel/arm/slice_kernel.cpp +++ b/src/operators/kernel/arm/slice_kernel.cpp @@ -17,6 +17,14 @@ limitations under the License. */ #include "operators/kernel/slice_kernel.h" namespace paddle_mobile { -namespace operators {} +namespace operators { + +template <> +bool SliceKernel::Init(SliceParam* param) { + return true; +} +template <> +void SliceKernel::Compute(const SliceParam& param) {} +} // namespace operators } // namespace paddle_mobile #endif diff --git a/src/operators/kernel/fpga/V2/concat_kernel.cpp b/src/operators/kernel/fpga/V2/concat_kernel.cpp index 5fad995a9f981bbe1c1329c4cfa5083f374d9b61..7f9ab66d48489dbecae01f819bd607c582f6145b 100644 --- a/src/operators/kernel/fpga/V2/concat_kernel.cpp +++ b/src/operators/kernel/fpga/V2/concat_kernel.cpp @@ -15,6 +15,7 @@ limitations under the License. */ #ifdef CONCAT_OP #include "operators/kernel/concat_kernel.h" +#include "fpga/V2/api.h" namespace paddle_mobile { namespace operators { @@ -68,7 +69,7 @@ bool ConcatKernel::Init(ConcatParam *param) { template <> void ConcatKernel::Compute(const ConcatParam ¶m) { - ComputeFPGAConcat(param.FpgaArgs()); + fpga::ComputeFPGAConcat(param.FpgaArgs()); } template class ConcatKernel; diff --git a/src/operators/kernel/fpga/V2/elementwise_add_kernel.cpp b/src/operators/kernel/fpga/V2/elementwise_add_kernel.cpp new file mode 100644 index 0000000000000000000000000000000000000000..4b5085f26123994effa319826d84f2f249c80847 --- /dev/null +++ b/src/operators/kernel/fpga/V2/elementwise_add_kernel.cpp @@ -0,0 +1,65 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ +#ifdef ELEMENTWISEADD_OP + +#include "operators/kernel/elementwise_add_kernel.h" + +namespace paddle_mobile { +namespace operators { + +template <> +bool ElementwiseAddKernel::Init(ElementwiseAddParam *param) { + bool relu_enabled = false; + auto *input_x = const_cast(param->InputX()); + auto *input_y = const_cast(param->InputY()); + auto *out = param->Out(); + auto input_x_ptr = input_x->data(); + auto input_y_ptr = input_y->data(); + int aligned_channel_num = fpga::get_aligned_channel_num(input_x->dims()[1]); + fpga::format_fp16_ofm(out, aligned_channel_num); + auto out_ptr = out->mutable_data(); + + fpga::EWAddArgs ewaddArgs = {0}; + ewaddArgs.relu_enabled = relu_enabled; + ewaddArgs.const0 = 0x3c00; // =1 + ewaddArgs.const1 = 0x3c00; // =1 + ewaddArgs.image0.address = input_x_ptr; + ewaddArgs.image0.channels = (uint32_t)input_x->dims()[1]; + ewaddArgs.image0.scale_address = input_x->scale; + ewaddArgs.image0.height = (uint32_t)input_x->dims()[2]; + ewaddArgs.image0.width = (uint32_t)input_x->dims()[3]; + ewaddArgs.image0.pad_height = 0; + ewaddArgs.image0.pad_width = 0; + ewaddArgs.image1.address = input_y_ptr; + ewaddArgs.image1.channels = (uint32_t)input_y->dims()[1]; + ewaddArgs.image1.scale_address = input_y->scale; + ewaddArgs.image1.height = (uint32_t)input_y->dims()[2]; + ewaddArgs.image1.width = (uint32_t)input_y->dims()[3]; + ewaddArgs.image1.pad_height = 0; + ewaddArgs.image1.pad_width = 0; + ewaddArgs.output.scale_address = out->scale; + ewaddArgs.output.address = out_ptr; + param->SetFpgaArgs(ewaddArgs); + return true; +} + +template <> +void ElementwiseAddKernel::Compute( + const ElementwiseAddParam ¶m) { + fpga::ComputeFpgaEWAdd(param.FpgaArgs()); +} +} // namespace operators +} // namespace paddle_mobile + +#endif diff --git a/src/operators/kernel/fpga/V2/elementwise_add_relu_kernel.cpp b/src/operators/kernel/fpga/V2/elementwise_add_relu_kernel.cpp index f74b188b56b0fa1360f6a6a728c415f67b95b9a9..571987b3bf2a88c0d4ad648c7cb1966b538983a5 100644 --- a/src/operators/kernel/fpga/V2/elementwise_add_relu_kernel.cpp +++ b/src/operators/kernel/fpga/V2/elementwise_add_relu_kernel.cpp @@ -21,7 +21,7 @@ namespace operators { template <> bool ElementwiseAddReluKernel::Init( ElementwiseAddReluParam *param) { - bool relu_enabled = true; + bool relu_enabled = false; auto *input_x = const_cast(param->InputX()); auto *input_y = const_cast(param->InputY()); auto *out = param->Out(); diff --git a/src/operators/kernel/fpga/V2/slice_kernel.cpp b/src/operators/kernel/fpga/V2/slice_kernel.cpp new file mode 100644 index 0000000000000000000000000000000000000000..b0df0cb65d44fe864c0e135c582b418826b9e00d --- /dev/null +++ b/src/operators/kernel/fpga/V2/slice_kernel.cpp @@ -0,0 +1,29 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#ifdef SLICE_OP + +#include "operators/kernel/slice_kernel.h" + +namespace paddle_mobile { +namespace operators { +template <> +bool SliceKernel::Init(SliceParam* param) { + return true; +} +template <> +void SliceKernel::Compute(const SliceParam& param) {} +} // namespace operators +} // namespace paddle_mobile +#endif diff --git a/src/operators/kernel/fpga/V2/softmax_kernel.cpp b/src/operators/kernel/fpga/V2/softmax_kernel.cpp index bbdb35b715b60b25079c007a74b8b1e901cc9a59..5cfccf8779bfb1839f1bfe70dade765a975bf982 100644 --- a/src/operators/kernel/fpga/V2/softmax_kernel.cpp +++ b/src/operators/kernel/fpga/V2/softmax_kernel.cpp @@ -49,12 +49,7 @@ void SoftmaxKernel::Compute(const SoftmaxParam ¶m) { Tensor *out = param.Out(); fpga::PerformBypass(param.FpgaArgs()); - fpga::fpga_invalidate( - (void *)in_x->data(), // NOLINT - fpga::get_aligned_channel_num((int)in_x->dims()[1]) * // NOLINT - sizeof(float)); math::SoftmaxFuntor()(in_x, out); - fpga::fpga_flush(out->data(), out->memory_size()); } } // namespace operators diff --git a/src/operators/kernel/slice_kernel.h b/src/operators/kernel/slice_kernel.h index 6ae6528622b37c2f2694d70da3e74540e3404c99..89dba51d9e11570bd4228adb075ee104b2094fd8 100644 --- a/src/operators/kernel/slice_kernel.h +++ b/src/operators/kernel/slice_kernel.h @@ -24,7 +24,8 @@ template class SliceKernel : public framework::OpKernelBase> { public: - void Compute(const SliceParam& param) {} + void Compute(const SliceParam& param); + bool Init(SliceParam* param); }; } // namespace operators } // namespace paddle_mobile diff --git a/src/operators/op_param.h b/src/operators/op_param.h index 5747d187d5b413bc6bb2bfe575814531ed74d132..5666f8e9c97482c13414fc9c4d4b54e7f96bcbca 100644 --- a/src/operators/op_param.h +++ b/src/operators/op_param.h @@ -436,6 +436,16 @@ class ConvParam : public OpParam { #ifdef PADDLE_MOBILE_CL int offset_; #endif + +#ifdef PADDLE_MOBILE_FPGA + + private: + fpga::SplitConvArgs fpga_conv_args; + + public: + const fpga::SplitConvArgs &FpgaArgs() const { return fpga_conv_args; } + void SetFpgaArgs(const fpga::SplitConvArgs &args) { fpga_conv_args = args; } +#endif }; template Print &operator<<(Print &printer, const ConvParam &conv_param); @@ -580,15 +590,6 @@ class MulParam : OpParam { GType *out_; int x_num_col_dims_; int y_num_col_dims_; -#ifdef PADDLE_MOBILE_FPGA - - private: - fpga::SplitConvArgs fpga_conv_args; - - public: - const fpga::SplitConvArgs &FpgaArgs() const { return fpga_conv_args; } - void SetFpgaArgs(const fpga::SplitConvArgs &args) { fpga_conv_args = args; } -#endif }; #endif @@ -1641,15 +1642,6 @@ class FusionConvAddParam : public ConvParam { RType *bias_; int axis_; RType *output_; -#ifdef PADDLE_MOBILE_FPGA - - private: - fpga::SplitConvArgs fpga_conv_args; - - public: - const fpga::SplitConvArgs &FpgaArgs() const { return fpga_conv_args; } - void SetFpgaArgs(const fpga::SplitConvArgs &args) { fpga_conv_args = args; } -#endif }; template @@ -1696,15 +1688,6 @@ class FusionConvAddPReluParam : public ConvParam { RType *output_; RType *alpha_; std::string mode_; -#ifdef PADDLE_MOBILE_FPGA - - private: - fpga::SplitConvArgs fpga_conv_args; - - public: - const fpga::SplitConvArgs &FpgaArgs() const { return fpga_conv_args; } - void SetFpgaArgs(const fpga::SplitConvArgs &args) { fpga_conv_args = args; } -#endif }; #endif @@ -1754,15 +1737,6 @@ class FusionConvAddAddPReluParam : public ConvParam { std::string keyOutput_; std::string keyX1_; std::string keyY1_; -#ifdef PADDLE_MOBILE_FPGA - - private: - fpga::SplitConvArgs fpga_conv_args; - - public: - const fpga::SplitConvArgs &FpgaArgs() const { return fpga_conv_args; } - void SetFpgaArgs(const fpga::SplitConvArgs &args) { fpga_conv_args = args; } -#endif }; #endif @@ -1829,16 +1803,6 @@ class FusionConvAddBNReluParam : public ConvParam { bool is_test_; RType *new_bias_; RType *new_scale_; - -#ifdef PADDLE_MOBILE_FPGA - - private: - fpga::SplitConvArgs fpga_conv_args; - - public: - const fpga::SplitConvArgs &FpgaArgs() const { return fpga_conv_args; } - void SetFpgaArgs(const fpga::SplitConvArgs &args) { fpga_conv_args = args; } -#endif }; #endif @@ -1916,15 +1880,6 @@ class FusionConvBNAddReluParam : public ConvParam { std::string keyBNY_; std::string keyX_; std::string keyY_; -#ifdef PADDLE_MOBILE_FPGA - - private: - fpga::SplitConvArgs fpga_conv_args; - - public: - const fpga::SplitConvArgs &FpgaArgs() const { return fpga_conv_args; } - void SetFpgaArgs(const fpga::SplitConvArgs &args) { fpga_conv_args = args; } -#endif }; #endif @@ -1983,15 +1938,6 @@ class FusionConvBNParam : public ConvParam { bool is_test_; RType *new_bias_; RType *new_scale_; -#ifdef PADDLE_MOBILE_FPGA - - private: - fpga::SplitConvArgs fpga_conv_args; - - public: - const fpga::SplitConvArgs &FpgaArgs() const { return fpga_conv_args; } - void SetFpgaArgs(const fpga::SplitConvArgs &args) { fpga_conv_args = args; } -#endif }; #endif @@ -2058,15 +2004,6 @@ class FusionConvAddBNParam : public ConvParam { bool is_test_; RType *new_bias_; RType *new_scale_; -#ifdef PADDLE_MOBILE_FPGA - - private: - fpga::SplitConvArgs fpga_conv_args; - - public: - const fpga::SplitConvArgs &FpgaArgs() const { return fpga_conv_args; } - void SetFpgaArgs(const fpga::SplitConvArgs &args) { fpga_conv_args = args; } -#endif }; #endif @@ -2184,15 +2121,6 @@ class FusionConvBNReluParam : public ConvParam { bool is_test_; RType *new_bias_; RType *new_scale_; -#ifdef PADDLE_MOBILE_FPGA - - private: - fpga::SplitConvArgs fpga_conv_args; - - public: - const fpga::SplitConvArgs &FpgaArgs() const { return fpga_conv_args; } - void SetFpgaArgs(const fpga::SplitConvArgs &args) { fpga_conv_args = args; } -#endif }; #endif diff --git a/src/operators/slice_op.cpp b/src/operators/slice_op.cpp index ac6c434c9450905931abeb395b294bed64c036b0..5704737902c03c476907ab527495b46c52567ed5 100644 --- a/src/operators/slice_op.cpp +++ b/src/operators/slice_op.cpp @@ -34,5 +34,7 @@ REGISTER_OPERATOR_CPU(slice, ops::SliceOp); #ifdef PADDLE_MOBILE_MALI_GPU REGISTER_OPERATOR_MALI_GPU(slice, ops::SliceOp); #endif - +#ifdef PADDLE_MOBILE_FPGA +REGISTER_OPERATOR_FPGA(slice, ops::SliceOp); +#endif #endif diff --git a/tools/op.cmake b/tools/op.cmake index 1dc9ebe7708c072e579570842eec03a531b78d07..7d19591efc0e0a1bc36da914df0acd663aee811c 100644 --- a/tools/op.cmake +++ b/tools/op.cmake @@ -130,10 +130,12 @@ if (CON GREATER -1) set(FUSION_ELEMENTWISEADDRELU_OP ON) set(FUSION_FC_OP ON) set(POOL_OP ON) - set(CONCAT_OP ON) set(SOFTMAX_OP ON) set(FUSION_CONVBNRELU_OP ON) set(FUSION_CONVBN_OP ON) +# set(CONV_TRANSPOSE_OP ON) +# set(SLICE_OP ON) +# set(ELEMENTWISEADD_OP ON) set(FOUND_MATCH ON) endif()