提交 c812a251 编写于 作者: Z zhangyang

update V2 for FPGA track

上级 b328934d
......@@ -71,6 +71,8 @@ const char *G_OP_TYPE_SUM = "sum";
const char *G_OP_TYPE_QUANTIZE = "quantize";
const char *G_OP_TYPE_DEQUANTIZE = "dequantize";
extern const char *G_OP_TYPE_TANH = "tanh";
extern const char *G_OP_TYPE_FUSION_DECONV_RELU = "fusion_deconv_relu";
std::unordered_map<
std::string, std::pair<std::vector<std::string>, std::vector<std::string>>>
......@@ -129,5 +131,7 @@ std::unordered_map<
{G_OP_TYPE_SUM, {{"X"}, {"Out"}}},
{G_OP_TYPE_ELEMENTWISE_MUL, {{"X", "Y"}, {"Out"}}},
{G_OP_TYPE_QUANTIZE, {{"X"}, {"Out", "OutScale"}}},
{G_OP_TYPE_DEQUANTIZE, {{"X", "Scale"}, {"Out"}}}};
{G_OP_TYPE_DEQUANTIZE, {{"X", "Scale"}, {"Out"}}},
{G_OP_TYPE_TANH, {{"X"}, {"Out"}}},
{G_OP_TYPE_FUSION_DECONV_RELU, {{"Input"}, {"Out"}}}};
} // namespace paddle_mobile
......@@ -139,6 +139,9 @@ extern const char *G_OP_TYPE_ELEMENTWISE_MUL;
extern const char *G_OP_TYPE_QUANTIZE;
extern const char *G_OP_TYPE_DEQUANTIZE;
extern const char *G_OP_TYPE_TANH;
extern const char *G_OP_TYPE_FUSION_DECONV_RELU;
extern std::unordered_map<
std::string, std::pair<std::vector<std::string>, std::vector<std::string>>>
op_input_output_key;
......
......@@ -16,27 +16,29 @@ limitations under the License. */
#include <algorithm>
#include "fpga/V2/bias_scale.h"
#include "fpga/V2/config.h"
#include "fpga/V2/driver/driver.h"
#include "fpga/V2/filter.h"
#include "fpga/V2/image.h"
namespace paddle_mobile {
namespace fpga {
static std::map<void *, size_t> memory_map;
int open_device() {
int ret = open_device_driver();
int ret = driver::open_device_driver();
return ret;
}
int close_device() {
int ret = close_device_driver();
int ret = driver::close_device_driver();
return ret;
}
void *fpga_malloc(size_t size) {
static uint64_t counter = 0;
#ifdef PADDLE_MOBILE_ZU5
auto ptr = fpga_malloc_driver(size);
auto ptr = driver::fpga_malloc_driver(size);
#else
auto ptr = malloc(size);
#endif
......@@ -55,7 +57,7 @@ void fpga_free(void *ptr) {
size = iter->second;
memory_map.erase(iter);
#ifdef PADDLE_MOBILE_ZU5
fpga_free_driver(ptr);
driver::fpga_free_driver(ptr);
#else
free(ptr);
#endif
......@@ -66,26 +68,27 @@ void fpga_free(void *ptr) {
DLOG << "Invalid pointer";
}
}
half fp32_2_fp16(float fp32_num) {
unsigned long tmp = *(unsigned long *)(&fp32_num); // NOLINT
auto t = (half)(((tmp & 0x007fffff) >> 13) | ((tmp & 0x80000000) >> 16) |
(((tmp & 0x7f800000) >> 13) - (112 << 10)));
if (tmp & 0x1000) {
t++; // roundoff
}
return t;
// Copies `num` bytes from `src` to `dest`.
// When PADDLE_MOBILE_ZU5 is defined the copy is delegated to the driver
// (driver::fpga_copy_driver); otherwise a plain memcpy is used.
void fpga_copy(void *dest, const void *src, size_t num) {
#ifndef PADDLE_MOBILE_ZU5
  memcpy(dest, src, num);
#else
  driver::fpga_copy_driver(dest, src, num);
#endif
}
float fp16_2_fp32(half fp16_num) {
int frac = (fp16_num & 0x3ff);
int exp = ((fp16_num & 0x7c00) >> 10) + 112;
int s = fp16_num & 0x8000;
int tmp = 0;
float fp32_num;
tmp = s << 16 | exp << 23 | frac << 13;
fp32_num = *(float *)&tmp; // NOLINT
return fp32_num;
// Requests a cache flush for the `size` bytes starting at `address`.
// On PADDLE_MOBILE_ZU5 builds the request is forwarded to
// driver::fpga_flush_driver and its result returned; every other build is a
// no-op that reports 0.
int fpga_flush(void *address, size_t size) {
#ifndef PADDLE_MOBILE_ZU5
  return 0;
#else
  return driver::fpga_flush_driver(address, size);
#endif
}
// Requests a cache invalidate for the `size` bytes starting at `address`.
// On PADDLE_MOBILE_ZU5 builds the request is forwarded to
// driver::fpga_invalidate_driver and its result returned; every other build
// is a no-op that reports 0.
int fpga_invalidate(void *address, size_t size) {
#ifndef PADDLE_MOBILE_ZU5
  return 0;
#else
  return driver::fpga_invalidate_driver(address, size);
#endif
}
void format_image(framework::Tensor *image_tensor) {
......@@ -240,7 +243,7 @@ void fill_split_arg(struct SplitConvArgs *arg, framework::Tensor *input,
arg->filter_num = (uint32_t)filter->dims()[0];
arg->output.address = out_ptr;
arg->output.scale_address = out->scale;
arg->conv_args =
arg->conv_arg =
(ConvArgs *)fpga_malloc(arg->split_num * sizeof(ConvArgs)); // NOLINT
arg->concat_arg.image_num = arg->split_num;
......@@ -258,28 +261,33 @@ void fill_split_arg(struct SplitConvArgs *arg, framework::Tensor *input,
(uint32_t *)fpga_malloc(n * sizeof(uint32_t)); // NOLINT
for (int i = 0; i < n; i++) {
arg->conv_args[i].relu_enabled = relu_enabled;
arg->conv_args[i].sb_address = bs_ptr;
arg->conv_args[i].filter_address = (int8_t *)filter_ptr; // NOLINT
arg->conv_args[i].filter_scale_address = filter->scale;
arg->conv_args[i].filter_num = arg->filter_num;
arg->conv_args[i].group_num = (uint32_t)group_num;
arg->conv_args[i].kernel.stride_h = (uint32_t)stride_h;
arg->conv_args[i].kernel.stride_w = (uint32_t)stride_w;
arg->conv_args[i].kernel.height = (uint32_t)filter->dims()[2];
arg->conv_args[i].kernel.width = (uint32_t)filter->dims()[3];
arg->conv_args[i].image.address = input_ptr;
arg->conv_args[i].image.scale_address = input->scale;
arg->conv_args[i].image.channels = (uint32_t)input->dims()[1];
arg->conv_args[i].image.height = (uint32_t)input->dims()[2];
arg->conv_args[i].image.width = (uint32_t)input->dims()[3];
arg->conv_args[i].image.pad_height = (uint32_t)padding_h;
arg->conv_args[i].image.pad_width = (uint32_t)padding_w;
arg->conv_args[i].output.address = out_ptr;
arg->conv_args[i].output.scale_address = out->scale;
arg->conv_arg[i].relu_enabled = relu_enabled;
arg->conv_arg[i].sb_address = bs_ptr;
arg->conv_arg[i].filter_address = (int8_t *)filter_ptr; // NOLINT
arg->conv_arg[i].filter_scale_address = filter->scale;
arg->conv_arg[i].filter_num = arg->filter_num;
arg->conv_arg[i].group_num = (uint32_t)group_num;
arg->conv_arg[i].kernel.stride_h = (uint32_t)stride_h;
arg->conv_arg[i].kernel.stride_w = (uint32_t)stride_w;
arg->conv_arg[i].kernel.height = (uint32_t)filter->dims()[2];
arg->conv_arg[i].kernel.width = (uint32_t)filter->dims()[3];
arg->conv_arg[i].image.address = input_ptr;
arg->conv_arg[i].image.scale_address = input->scale;
arg->conv_arg[i].image.channels = (uint32_t)input->dims()[1];
arg->conv_arg[i].image.height = (uint32_t)input->dims()[2];
arg->conv_arg[i].image.width = (uint32_t)input->dims()[3];
arg->conv_arg[i].image.pad_height = (uint32_t)padding_h;
arg->conv_arg[i].image.pad_width = (uint32_t)padding_w;
arg->conv_arg[i].output.address = out_ptr;
arg->conv_arg[i].output.scale_address = out->scale;
int num_after_alignment =
filter::calc_aligned_num((int)input->dims()[1], arg->filter_num);
arg->conv_arg[i].free_space =
fpga_malloc(num_after_alignment * 2 * sizeof(half));
}
}
......
......@@ -14,118 +14,20 @@ limitations under the License. */
#pragma once
#include <stdint.h>
#include <cstddef>
#include <iostream>
#include <limits>
#include "fpga/V2/driver/driver.h"
#include "fpga/V2/driver/pe.h"
#include "fpga/V2/fpga_common.h"
#include "framework/tensor.h"
namespace paddle_mobile {
namespace fpga {
enum DataType {
DATA_TYPE_FP32 = 1,
DATA_TYPE_FP16 = 0,
};
enum LayoutType {
LAYOUT_CHW = 1,
LAYOUT_HWC = 0,
};
struct KernelArgs {
uint32_t width;
uint32_t height;
uint32_t stride_w;
uint32_t stride_h;
};
struct ImageInputArgs {
void* address; // input featuremap virtual address
float* scale_address; // input scale address;
uint32_t channels;
uint32_t width; // featuremap width
uint32_t height;
uint32_t pad_width; // padding width;
uint32_t pad_height;
};
struct ImageOutputArgs {
void* address; // output result address;
float* scale_address; // output scale address;
uint64_t timer_cnt; // time counter for FPGA computation
};
struct ConvArgs {
bool relu_enabled;
void* sb_address; // scale and bias are interlaced;
void* filter_address;
float* filter_scale_address;
uint32_t filter_num;
uint32_t group_num;
struct KernelArgs kernel;
struct ImageInputArgs image; // input image;
struct ImageOutputArgs output;
};
struct ConcatArgs {
uint32_t image_num;
half** images_in;
float** scales_in;
void* image_out;
float* scale_out;
uint32_t* channel_num;
uint32_t* aligned_channel_num;
uint32_t out_channel;
uint32_t height;
uint32_t width;
};
struct SplitConvArgs {
uint32_t split_num;
uint32_t group_num;
uint32_t filter_num;
struct ImageOutputArgs output;
struct ConvArgs* conv_args;
struct ConcatArgs concat_arg;
};
struct PoolingArgs {
int16_t mode; // mode: 0:max, 1:avg
half kernel_reciprocal;
struct KernelArgs kernel;
struct ImageInputArgs image; // input image;
struct ImageOutputArgs output;
};
struct EWAddArgs {
bool relu_enabled;
uint32_t const0; // output0 = const0 x input0 + const1 x input1;
uint32_t const1;
struct ImageInputArgs image0;
struct ImageInputArgs image1;
struct ImageOutputArgs output;
};
struct BypassArgs {
enum DataType input_data_type;
enum DataType output_data_type;
enum LayoutType input_layout_type;
enum LayoutType output_layout_type;
struct ImageInputArgs image;
struct ImageOutputArgs output;
};
int open_device();
int close_device();
void* fpga_malloc(size_t size);
void fpga_free(void* ptr);
static inline int align_to_x(int num, int x) { return (num + x - 1) / x * x; }
void fpga_copy(void* dest, const void* src, size_t num);
int fpga_flush(void* address, size_t size);
int fpga_invalidate(void* address, size_t size);
float filter_find_max(framework::Tensor* filter_tensor);
int get_aligned_channel_num(int channel_num);
......@@ -153,8 +55,5 @@ void fill_split_arg(struct SplitConvArgs* arg, framework::Tensor* input,
bool relu_enabled, int group_num, int stride_h,
int stride_w, int padding_h, int padding_w, float* bs_ptr);
half fp32_2_fp16(float fp32_num);
float fp16_2_fp32(half fp16_num);
} // namespace fpga
} // namespace paddle_mobile
......@@ -27,7 +27,7 @@ void align_element(float **data_in, int num, int num_after_alignment) {
(float *)fpga_malloc(total_element * sizeof(float)); // NOLINT
memset(ptr_aligned, 0, total_element * sizeof(float));
for (int i = 1; i < num; i++) {
for (int i = 0; i < num; i++) {
ptr_aligned[i * 2 + 0] = ptr_unaligned[i];
ptr_aligned[i * 2 + 1] = ptr_unaligned[i + num];
}
......@@ -39,6 +39,7 @@ void align_element(float **data_in, int num, int num_after_alignment) {
void format_bias_scale_array(float **data_in, int num,
int num_after_alignment) {
align_element(data_in, num, num_after_alignment);
fpga_flush(*data_in, 2 * num_after_alignment * sizeof(float));
}
} // namespace bias_scale
......
......@@ -57,8 +57,8 @@ static uint64_t ffs(uint64_t data) {
uint64_t bit = 0;
int i = 0;
for (i = 0; i < sizeof(data); i++) {
if (data & (1 << i)) {
for (i = 0; i < sizeof(data) * 8; i++) {
if (data & (1UL << i)) {
bit = i;
break;
}
......
......@@ -25,7 +25,7 @@ limitations under the License. */
#define __ALIGN_KERNEL_MASK(x, mask) (((x) + (mask)) & ~(mask))
#define __ALIGN_MASK(x, mask) __ALIGN_KERNEL_MASK((x), (mask))
#define round_down(x, y) ((x) & ((y)-1))
#define round_down(x, y) ((x) & ~((y)-1))
namespace fpga_bitmap {
void bitmap_set(uint64_t *map, unsigned int start, int len);
......
......@@ -17,6 +17,7 @@ limitations under the License. */
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <sys/ioctl.h>
#include <sys/mman.h>
#include <unistd.h>
#include <algorithm>
......@@ -32,6 +33,7 @@ limitations under the License. */
namespace paddle_mobile {
namespace fpga {
namespace driver {
struct FPGA_INFO g_fpgainfo;
int open_drvdevice() {
......@@ -43,7 +45,8 @@ int open_drvdevice() {
int open_memdevice() {
if (g_fpgainfo.fd_mem == -1) {
g_fpgainfo.fd_mem = open(g_fpgainfo.memdevice_path, O_RDWR | O_DSYNC);
// g_fpgainfo.fd_mem = open(g_fpgainfo.memdevice_path, O_RDWR | O_DSYNC);
g_fpgainfo.fd_mem = open(g_fpgainfo.memdevice_path, O_RDWR);
}
return g_fpgainfo.fd_mem;
}
......@@ -51,7 +54,6 @@ int open_memdevice() {
void pl_reset() {
// DLOG << "PL RESET";
// reg_writeq(0x5a, REG_FPGA_RESET);
usleep(100 * 1000);
}
......@@ -131,7 +133,7 @@ int pl_get_status() { return 0; }
int fpga_regpoll(uint64_t reg, uint64_t val, int time) {
uint64_t i = 0;
/*timeout精确性待确认*/
int64_t timeout = time * CPU_FREQ / 1000000;
int64_t timeout = time * 6;
for (i = 0; i < timeout; i++) {
if (val == reg_readq(reg)) {
......@@ -173,9 +175,14 @@ int memory_request(struct fpga_memory *memory, size_t size, uint64_t *addr) {
}
// Releases everything still tracked for the FPGA memory pool `memory`:
// clears the whole page bitmap under the pool mutex, then frees every buffer
// recorded in g_fpgainfo.fpga_addr2size_map.
//
// The previous loop passed a local pointer that was always nullptr to
// fpga_free_driver(), so each iteration only logged "Invalid pointer" and no
// buffer was ever released.
void memory_release(struct fpga_memory *memory) {
  pthread_mutex_lock(&memory->mutex);
  fpga_bitmap::bitmap_clear(memory->bitmap, 0, memory->page_num);
  pthread_mutex_unlock(&memory->mutex);

  /* unmap memory */
  // Walk a snapshot of the map: fpga_free_driver() erases the entry it frees
  // from g_fpgainfo.fpga_addr2size_map, which would invalidate an iterator
  // into the live container.
  // NOTE(review): this frees every address still in the map; confirm that
  // register mappings are always removed (fpga_reg_free) before this runs.
  const std::map<void *, size_t> tracked = g_fpgainfo.fpga_addr2size_map;
  for (const auto &entry : tracked) {
    fpga_free_driver(entry.first);
  }
}
int create_fpga_memory_inner(struct fpga_memory *memory, size_t memory_size) {
......@@ -238,7 +245,6 @@ int init_fpga_memory(struct fpga_memory *memory) {
return rc;
}
// spin_lock_init(&memory->spin);
fpga_bitmap::bitmap_clear(memory->bitmap, 0, memory->page_num);
fpga_bitmap::bitmap_set(memory->bitmap, 0, 1); // NOTE reserve fpga page 0.
......@@ -293,9 +299,23 @@ void *fpga_reg_malloc(size_t size) {
return ret;
}
// Unmaps a mapping previously recorded in g_fpgainfo.fpga_addr2size_map.
//
// ptr: address used as the key in fpga_addr2size_map.
// Returns nullptr in all cases. The function is declared to return void*, and
// the original body fell off the end without returning a value, which is
// undefined behaviour; nullptr is returned explicitly now.
// If `ptr` is not tracked, "Invalid pointer" is logged and nothing is
// unmapped.
void *fpga_reg_free(void *ptr) {
  auto iter = g_fpgainfo.fpga_addr2size_map.find(ptr);
  if (iter == g_fpgainfo.fpga_addr2size_map.end()) {
    DLOG << "Invalid pointer";
    return nullptr;
  }

  // Read the size before erasing: erase() invalidates `iter`.
  const size_t size = iter->second;
  g_fpgainfo.fpga_addr2size_map.erase(iter);
  munmap(ptr, size);
  return nullptr;
}
void *fpga_malloc_driver(size_t size) {
void *ret = nullptr;
uint64_t phy_addr = 0;
int i = 0;
memory_request(g_fpgainfo.memory_info, size, &phy_addr);
......@@ -311,17 +331,70 @@ void *fpga_malloc_driver(size_t size) {
// Frees a buffer tracked in g_fpgainfo.fpga_addr2size_map: drops the map
// entry, unmaps the region, then clears the corresponding run of pages in the
// pool bitmap while holding the pool mutex. An untracked `ptr` only produces
// an "Invalid pointer" log line.
void fpga_free_driver(void *ptr) {
  auto found = g_fpgainfo.fpga_addr2size_map.find(ptr);
  if (found == g_fpgainfo.fpga_addr2size_map.end()) {
    DLOG << "Invalid pointer";
    return;
  }

  const size_t mapped_size = found->second;
  g_fpgainfo.fpga_addr2size_map.erase(found);
  munmap(ptr, mapped_size);

  // Index of the first page of this buffer, relative to the pool start.
  const uint64_t phys_addr = vaddr_to_paddr(ptr);
  const uint32_t first_page =
      (phys_addr - g_fpgainfo.memory_info->mem_start) / FPGA_PAGE_SIZE;

  /* clear bitmap */
  pthread_mutex_lock(&g_fpgainfo.memory_info->mutex);
  fpga_bitmap::bitmap_clear(g_fpgainfo.memory_info->bitmap, first_page,
                            g_fpgainfo.memory_info->nr[first_page]);
  pthread_mutex_unlock(&g_fpgainfo.memory_info->mutex);
}
// Issues ioctl request `req` with argument `arg` on the descriptor stored in
// g_fpgainfo.fd_mem and returns ioctl's result unchanged.
static inline int do_ioctl(unsigned long req, const void *arg) {  // NOLINT
  const int mem_fd = g_fpgainfo.fd_mem;
  return ioctl(mem_fd, req, arg);
}
int fpga_flush_driver(void *address, size_t size) {
struct MemoryCacheArgs args;
uint64_t p_addr;
p_addr = vaddr_to_paddr(address);
args.offset = (void *)(p_addr - FPGA_MEM_PHY_ADDR);
args.size = size;
return do_ioctl(IOCTL_MEMCACHE_FLUSH, &args);
}
int fpga_invalidate_driver(void *address, size_t size) {
struct MemoryCacheArgs args;
uint64_t p_addr;
p_addr = vaddr_to_paddr(address);
args.offset = (void *)(p_addr - FPGA_MEM_PHY_ADDR);
args.size = size;
return do_ioctl(IOCTL_MEMCACHE_INVAL, &args);
}
// Copies `num` bytes from `src` to `dest` one byte at a time, after logging
// both addresses and the length.
void fpga_copy_driver(void *dest, const void *src, size_t num) {
  DLOG << "dest:" << dest << " src:" << src << " size:" << num;

  auto *to = (int8_t *)dest;   // NOLINT
  auto *from = (int8_t *)src;  // NOLINT
  for (uint64_t offset = 0; offset < num; ++offset) {
    to[offset] = from[offset];
  }
}
int open_device_driver() {
g_fpgainfo.FpgaRegPhyAddr = FPGA_REG_PHY_ADDR;
g_fpgainfo.FpgaMemPhyAddr = FPGA_MEM_PHY_ADDR;
......@@ -347,12 +420,13 @@ int open_device_driver() {
int close_device_driver() {
pl_destroy();
fpga_free_driver(g_fpgainfo.FpgaRegVirAddr);
fpga_reg_free(g_fpgainfo.FpgaRegVirAddr);
memory_release(g_fpgainfo.memory_info);
destroy_fpga_memory(g_fpgainfo.memory_info);
return 0;
}
} // namespace driver
} // namespace fpga
} // namespace paddle_mobile
......@@ -24,6 +24,7 @@ limitations under the License. */
namespace paddle_mobile {
namespace fpga {
namespace driver {
#define DIV_ROUND_UP(n, d) (((n) + (d)-1) / (d))
......@@ -47,6 +48,15 @@ const int PE_IDX_BYPASS = 3;
enum pe_status { IDLE = 0, BUSY = 1 };
// Argument block for the IOCTL_MEMCACHE_INVAL / IOCTL_MEMCACHE_FLUSH requests.
// The layout is part of the ioctl interface, so do not reorder the fields.
struct MemoryCacheArgs {
// Physical address minus FPGA_MEM_PHY_ADDR, stored as a pointer-sized value
// (this is how fpga_flush_driver / fpga_invalidate_driver fill it).
void *offset;
// Number of bytes covered by the request.
size_t size;
};
#define IOCTL_FPGA_MAGIC 'FPGA'
#define IOCTL_MEMCACHE_INVAL _IOW(IOCTL_FPGA_MAGIC, 12, struct MemoryCacheArgs)
#define IOCTL_MEMCACHE_FLUSH _IOW(IOCTL_FPGA_MAGIC, 13, struct MemoryCacheArgs)
struct fpga_pe {
char type_name[MAX_TYPE_NAME_LENTH + 1];
struct pe_data_s *outer;
......@@ -95,26 +105,39 @@ extern struct FPGA_INFO g_fpgainfo;
inline uint64_t reg_readq(uint32_t offset) {
// DLOG << "offset : " << offset;
uint64_t value =
*(uint64_t *)((uint8_t *)g_fpgainfo.FpgaRegVirAddr + offset); // NOLINT
uint64_t value = *(volatile uint64_t *)((uint8_t *)g_fpgainfo.FpgaRegVirAddr +
offset); // NOLINT
return value;
}
inline void reg_writeq(uint64_t value, uint32_t offset) {
// DLOG << "offset : " << offset << ", value : " << value;
*(uint64_t *)((uint8_t *)g_fpgainfo.FpgaRegVirAddr + offset) = // NOLINT
*(volatile uint64_t *)((uint8_t *)g_fpgainfo.FpgaRegVirAddr +
offset) = // NOLINT
value;
}
int open_device_driver();
int close_device_driver();
void *fpga_malloc_driver(size_t size);
void fpga_free_driver(void *ptr);
void fpga_copy_driver(void *dest, const void *src, size_t num);
int fpga_flush_driver(void *address, size_t size);
int fpga_invalidate_driver(void *address, size_t size);
/*pe*/
uint64_t vaddr_to_paddr(void *address);
int fpga_regpoll(uint64_t reg, uint64_t val, int time);
} // namespace driver
} // namespace fpga
} // namespace paddle_mobile
......@@ -20,29 +20,29 @@ limitations under the License. */
namespace paddle_mobile {
namespace fpga {
#define MUL8(x) (x * 8)
#define MUL8(x) ((x)*8)
#define BYPASS_DONE 1
float Findfp16Max() {
uint16_t abs_vals[16];
uint64_t max_fp16;
max_fp16 = reg_readq(MUL8(49));
max_fp16 = driver::reg_readq(MUL8(49));
abs_vals[0] = (uint16_t)(0x0000007f & (max_fp16)); // NOLINT
abs_vals[1] = (uint16_t)(0x0000007f & (max_fp16 >> 16)); // NOLINT
abs_vals[2] = (uint16_t)(0x0000007f & (max_fp16 >> 32)); // NOLINT
abs_vals[3] = (uint16_t)(0x0000007f & (max_fp16 >> 48)); // NOLINT
max_fp16 = reg_readq(MUL8(50));
max_fp16 = driver::reg_readq(MUL8(50));
abs_vals[4] = (uint16_t)(0x0000007f & (max_fp16)); // NOLINT
abs_vals[5] = (uint16_t)(0x0000007f & (max_fp16 >> 16)); // NOLINT
abs_vals[6] = (uint16_t)(0x0000007f & (max_fp16 >> 32)); // NOLINT
abs_vals[7] = (uint16_t)(0x0000007f & (max_fp16 >> 48)); // NOLINT
max_fp16 = reg_readq(MUL8(51));
max_fp16 = driver::reg_readq(MUL8(51));
abs_vals[8] = (uint16_t)(0x0000007f & (max_fp16)); // NOLINT
abs_vals[9] = (uint16_t)(0x0000007f & (max_fp16 >> 16)); // NOLINT
abs_vals[10] = (uint16_t)(0x0000007f & (max_fp16 >> 32)); // NOLINT
abs_vals[11] = (uint16_t)(0x0000007f & (max_fp16 >> 48)); // NOLINT
max_fp16 = reg_readq(MUL8(52));
max_fp16 = driver::reg_readq(MUL8(52));
abs_vals[12] = (uint16_t)(0x0000007f & (max_fp16));
abs_vals[13] = (uint16_t)(0x0000007f & (max_fp16 >> 16)); // NOLINT
abs_vals[14] = (uint16_t)(0x0000007f & (max_fp16 >> 32)); // NOLINT
......@@ -58,7 +58,7 @@ float Findfp16Max() {
}
int ComputeFpgaConv(const struct SplitConvArgs &args) {
ComputeBasicConv(args.conv_args[0]);
ComputeBasicConv(args.conv_arg[0]);
}
int ComputeBasicConv(const struct ConvArgs &args) {
......@@ -166,8 +166,8 @@ int PerformBypass(const struct BypassArgs &args) {
return 0;
#endif
uint64_t ifm_src_paddr = vaddr_to_paddr(args.image.address);
uint64_t ifm_dst_paddr = vaddr_to_paddr(args.output.address);
uint64_t ifm_src_paddr = driver::vaddr_to_paddr(args.image.address);
uint64_t ifm_dst_paddr = driver::vaddr_to_paddr(args.output.address);
uint64_t bp_enable;
int64_t length;
uint64_t pixels;
......@@ -196,16 +196,16 @@ int PerformBypass(const struct BypassArgs &args) {
}
// start bypass
reg_writeq(ifm_src_paddr, MUL8(27));
reg_writeq(ifm_dst_paddr, MUL8(28));
reg_writeq(0, MUL8(0));
reg_writeq(bp_enable, MUL8(0));
driver::reg_writeq(ifm_src_paddr, MUL8(27));
driver::reg_writeq(ifm_dst_paddr, MUL8(28));
driver::reg_writeq(0, MUL8(0));
driver::reg_writeq(bp_enable, MUL8(0));
// poll
int ret = -1;
ret = fpga_regpoll(MUL8(48), BYPASS_DONE, 0xffffffff);
ret = driver::fpga_regpoll(MUL8(48), BYPASS_DONE, 0xffffffff);
if (ret != -1) {
// clear "irq"
reg_readq(MUL8(63));
driver::reg_readq(MUL8(63));
}
// get max value
if ((!args.input_data_type) && (!args.output_data_type)) {
......
......@@ -12,7 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include "fpga/V2/api.h"
#include "fpga/V2/fpga_common.h"
namespace paddle_mobile {
namespace fpga {
......
......@@ -94,6 +94,7 @@ void format_filter(float **data_in, int num, int channel, int height, int width,
convert_to_hwc(data_in, num, channel, height, width);
align_filter(data_in, num, channel, height, width);
int pixel_num = calc_aligned_total_pixel_num(num, channel, height, width);
fpga_flush(*data_in, pixel_num * sizeof(float));
}
void convert_fc_filter(float **data_in, int num, int chw) {
......@@ -113,6 +114,8 @@ void format_fc_filter(float **data_in, int num, int channel, int height,
int chw = channel * height * width;
convert_fc_filter(data_in, num, chw);
align_filter(data_in, num, channel, height, width);
int pixel_num = calc_aligned_total_pixel_num(num, channel, height, width);
fpga_flush(*data_in, pixel_num * sizeof(float));
}
float find_max(float *data_in, int data_size) {
......
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <fpga/V2/fpga_common.h>
#include <cstring>
namespace paddle_mobile {
namespace fpga {
// Converts an IEEE-754 binary32 value to a 16-bit half-precision bit pattern
// (1 sign, 5 exponent, 10 fraction bits), returned as int16_t.
//
// fp32_num: value to convert.
// Returns the half bit pattern; the fraction is truncated to 10 bits and then
// incremented by one when the highest discarded fraction bit (bit 12) is set.
//
// Fixes relative to the previous version:
//  * The float was read through an `unsigned long *`, which reads 8 bytes
//    from a 4-byte object on LP64 targets. The bits are now copied into a
//    uint32_t with memcpy.
//  * Inputs whose exponent field is below 112 (including 0.0f) made the
//    exponent re-bias wrap around, e.g. 0.0f became 0x4000 (2.0). Such inputs
//    now flush to 0, the only pattern fp16_2_fp32() decodes as zero.
//
// NOTE(review): inputs too large for half precision, infinities and NaNs are
// still not handled specially (unchanged from the previous behaviour).
int16_t fp32_2_fp16(float fp32_num) {
  static_assert(sizeof(float) == sizeof(uint32_t),
                "fp32_2_fp16 requires a 32-bit float");
  uint32_t bits = 0;
  std::memcpy(&bits, &fp32_num, sizeof(bits));

  const uint32_t exponent = (bits & 0x7f800000u) >> 23;
  if (exponent < 112u) {
    return 0;  // too small for the re-biased exponent: flush to zero
  }

  const uint32_t packed = ((bits & 0x007fffffu) >> 13) |  // fraction
                          ((bits & 0x80000000u) >> 16) |  // sign
                          ((exponent - 112u) << 10);      // re-biased exponent
  auto t = static_cast<int16_t>(packed);
  if (bits & 0x1000u) {
    t++;  // roundoff
  }
  return t;
}
// Converts a 16-bit half-precision bit pattern (as produced by fp32_2_fp16)
// back to a float.
//
// fp16_num: half bit pattern (1 sign, 5 exponent, 10 fraction bits).
// Returns 0 for the all-zero pattern; otherwise the float built from the same
// sign, the exponent field plus 112, and the fraction shifted up by 13 bits.
//
// The bit assembly is done in unsigned 32-bit arithmetic and copied into the
// float with memcpy. The previous version shifted the sign bit into bit 31 of
// a signed int (signed overflow) and type-punned through a `float *`; the
// produced values are unchanged.
//
// NOTE(review): every non-zero pattern is treated as a normal number, so
// subnormals, infinities and NaNs are not decoded per IEEE-754 (unchanged).
float fp16_2_fp32(int16_t fp16_num) {
  if (0 == fp16_num) {
    return 0;
  }

  const uint32_t half_bits = static_cast<uint16_t>(fp16_num);
  const uint32_t frac = half_bits & 0x3ffu;
  const uint32_t exp = ((half_bits & 0x7c00u) >> 10) + 112u;
  const uint32_t sign = half_bits & 0x8000u;
  const uint32_t bits = (sign << 16) | (exp << 23) | (frac << 13);

  float fp32_num = 0.0f;
  std::memcpy(&fp32_num, &bits, sizeof(fp32_num));
  return fp32_num;
}
} // namespace fpga
} // namespace paddle_mobile
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <cstdint>
namespace paddle_mobile {
namespace fpga {
// Element type of a tensor buffer handed to the FPGA.
// DATA_TYPE_FP16 is deliberately 0: PerformBypass() tests for it with
// `!args.input_data_type`, so the numeric values must not be changed.
enum DataType {
DATA_TYPE_FP32 = 1,
DATA_TYPE_FP16 = 0,
};
// Memory layout selector used by BypassArgs (input_layout_type /
// output_layout_type). The numeric values are explicit; keep them stable.
enum LayoutType {
LAYOUT_CHW = 1,
LAYOUT_HWC = 0,
};
// Window geometry shared by ConvArgs and PoolingArgs.
// fill_split_arg() fills height/width from filter dims [2]/[3] and the
// strides from its stride_h / stride_w arguments.
struct KernelArgs {
uint32_t width;
uint32_t height;
uint32_t stride_w;
uint32_t stride_h;
};
// Description of an input feature map.
// fill_split_arg() fills channels/height/width from input dims [1]/[2]/[3]
// and the padding fields from its padding_h / padding_w arguments.
struct ImageInputArgs {
void* address;         // input featuremap virtual address
float* scale_address;  // input scale address;
uint32_t channels;
uint32_t width;  // featuremap width
uint32_t height;
uint32_t pad_width;  // padding width;
uint32_t pad_height;
};
// Description of where an operation writes its result.
struct ImageOutputArgs {
void* address;         // output result address;
float* scale_address;  // output scale address;
uint64_t timer_cnt;    // time counter for FPGA computation
};
// Parameters of one convolution job (one entry of SplitConvArgs::conv_arg).
// All pointer members are non-owning from this struct's point of view; no
// destructor releases them.
struct ConvArgs {
bool relu_enabled;
void* sb_address;  // scale and bias
void* filter_address;
float* filter_scale_address;
// Scratch buffer; fill_split_arg() allocates it with fpga_malloc.
void* free_space;  // used by FPGA logic
uint32_t filter_num;
uint32_t group_num;
struct KernelArgs kernel;
struct ImageInputArgs image;  // input image;
struct ImageOutputArgs output;
};
// Parameters for concatenating `image_num` images into one output image.
// fill_split_arg() sets image_num to the split count of the enclosing
// SplitConvArgs.
// NOTE(review): the pointer members presumably refer to arrays with image_num
// entries each (see image::concat_images) — confirm against the caller.
struct ConcatArgs {
uint32_t image_num;
int16_t** images_in;
float** scales_in;
void* image_out;
float* scale_out;
uint32_t* channel_num;
uint32_t* aligned_channel_num;
uint32_t out_channel;
uint32_t height;
uint32_t width;
};
// A convolution expressed as `split_num` ConvArgs jobs plus the concat step
// that joins their outputs.
struct SplitConvArgs {
uint32_t split_num;
uint32_t group_num;
uint32_t filter_num;
struct ImageOutputArgs output;
// Array of split_num entries; fill_split_arg() allocates it with fpga_malloc.
struct ConvArgs* conv_arg;
struct ConcatArgs concat_arg;
};
// Parameters of one pooling job.
struct PoolingArgs {
int16_t mode;  // mode: 0:max, 1:avg
// NOTE(review): presumably a half-precision bit pattern (see fp32_2_fp16) of
// 1 / (kernel width * kernel height) — confirm against the pooling kernel.
int16_t kernel_reciprocal;
struct KernelArgs kernel;
struct ImageInputArgs image;  // input image;
struct ImageOutputArgs output;
};
// Parameters of one element-wise add job over two input images.
struct EWAddArgs {
bool relu_enabled;
uint32_t const0;  // output0 = const0 x input0 + const1 x input1;
uint32_t const1;
struct ImageInputArgs image0;
struct ImageInputArgs image1;
struct ImageOutputArgs output;
};
// Parameters of one bypass job (see PerformBypass): moves `image` to `output`
// with the given element types and layouts on each side.
struct BypassArgs {
enum DataType input_data_type;
enum DataType output_data_type;
enum LayoutType input_layout_type;
enum LayoutType output_layout_type;
struct ImageInputArgs image;
struct ImageOutputArgs output;
};
// Parameters of one deconvolution job; currently just wraps a single ConvArgs.
struct DeconvArgs {
struct ConvArgs conv_arg;
};
// Rounds `num` up to the nearest multiple of `x` using integer arithmetic
// (a value that is already a multiple of `x` is returned unchanged).
static inline int align_to_x(int num, int x) {
  const int chunks = (num + x - 1) / x;
  return chunks * x;
}
int16_t fp32_2_fp16(float fp32_num);
float fp16_2_fp32(int16_t fp16_num);
} // namespace fpga
} // namespace paddle_mobile
......@@ -58,6 +58,7 @@ void format_image(float **data_in, int channel, int height, int width,
int aligned_channel) {
convert_to_hwc(data_in, channel, height, width);
align_image(data_in, channel, height, width, aligned_channel);
fpga_flush(*data_in, aligned_channel * height * width * sizeof(float));
}
void concat_images(int16_t **images_in, float **scales_in, void *image_out,
......@@ -69,6 +70,8 @@ void concat_images(int16_t **images_in, float **scales_in, void *image_out,
scale_out[1] = 0.0;
for (int i = 0; i < image_num; i++) {
scale_out[0] = std::max(*scale_out, scales_in[i][0]);
fpga_invalidate(images_in[i],
height * width * aligned_channel_num[i] * sizeof(int16_t));
}
scale_out[1] = 1 / scale_out[0];
......@@ -83,6 +86,7 @@ void concat_images(int16_t **images_in, float **scales_in, void *image_out,
tmp_channel_sum += channel_num[i];
}
}
fpga_flush(image_out, hw * out_channel * sizeof(int16_t));
}
} // namespace image
......
......@@ -26,6 +26,7 @@ limitations under the License. */
#include "framework/program/var_desc.h"
#include "framework/scope.h"
#include "framework/tensor.h"
#include "memory/t_malloc.h"
#ifdef PADDLE_EXECUTOR_MULTITHREAD
#include <queue>
......@@ -86,8 +87,10 @@ Executor<Dtype, P>::Executor(const framework::Program<Dtype> p, int batch_size,
}
std::shared_ptr<framework::BlockDesc> to_predict_block =
to_predict_program_->Block(0);
int i = 0;
auto &ops = ops_of_block_[*to_predict_block.get()];
for (const auto &op : ops) {
DLOG << "Initialize op[" << i++ << "]: " << op->Type();
op->Init();
}
}
......@@ -102,8 +105,8 @@ static void LoadMemInternal(void **data, framework::LoDTensor *tensor,
// should be moved into operator init function
float min_value;
float max_value;
memcpy(&min_value, data_buf, sizeof(float));
memcpy(&max_value, data_buf + sizeof(float), sizeof(float));
memory::Copy(&min_value, data_buf, sizeof(float));
memory::Copy(&max_value, data_buf + sizeof(float), sizeof(float));
data_buf += 2 * sizeof(float);
const float factor = (max_value - min_value) / 255.0;
const uint8_t *uint8_data = reinterpret_cast<uint8_t *>(data_buf);
......@@ -112,7 +115,7 @@ static void LoadMemInternal(void **data, framework::LoDTensor *tensor,
}
data_buf += size * sizeof(uint8_t);
} else {
memcpy(tensor_data, *data_buf, size * sizeof(Dtype));
memory::Copy(tensor_data, *data_buf, size * sizeof(Dtype));
*data_buf += size * sizeof(Dtype);
}
}
......@@ -128,7 +131,7 @@ void Executor<Dtype, P>::LoadMemory(
// lod information
// uint64_t lod_level = *(reinterpret_cast<uint64_t *>(*data_buf));
uint64_t lod_level = 0;
memcpy(&lod_level, *data_buf, sizeof(uint64_t));
memory::Copy(&lod_level, *data_buf, sizeof(uint64_t));
*data_buf += sizeof(uint64_t);
auto *lod = tensor->mutable_lod();
......@@ -137,7 +140,7 @@ void Executor<Dtype, P>::LoadMemory(
uint64_t size = *(reinterpret_cast<uint64_t *>(*data_buf));
*data_buf += sizeof(uint64_t);
std::vector<size_t> tmp_dim(size / sizeof(size_t));
memcpy(tmp_dim.data(), *data_buf, size);
memory::Copy(tmp_dim.data(), *data_buf, size);
(*lod)[i] = std::move(tmp_dim);
*data_buf += size;
}
......
......@@ -21,7 +21,6 @@ limitations under the License. */
#include "operators/math/gemm.h"
namespace paddle_mobile {
static std::mutex lc;
template <typename Dtype, Precision P>
void PaddleMobile<Dtype, P>::SetThreadNum(int num) {
#ifdef _OPENMP
......@@ -148,8 +147,8 @@ double PaddleMobile<Dtype, P>::GetPredictTime() {
}
paddle_mobile::operators::math::Gemm gemm;
auto time1 = paddle_mobile::time();
gemm.Sgemm(m, n, k, static_cast<float>(1), a, lda, b, ldb,
static_cast<float>(0), c, ldc, false, nullptr);
// gemm.Sgemm(m, n, k, static_cast<float>(1), a, lda, b, ldb,
// static_cast<float>(0), c, ldc, false, nullptr);
auto time2 = paddle_mobile::time();
double cost = paddle_mobile::time_diff(time1, time2);
paddle_mobile::memory::Free(a);
......@@ -199,6 +198,7 @@ void PaddleMobile<Dtype, P>::Predict_To(int end) {
#endif
#ifdef PADDLE_MOBILE_CL
static std::mutex lc;
template <typename Dtype, Precision P>
void PaddleMobile<Dtype, P>::SetCLPath(std::string path) {
std::lock_guard<std::mutex> lock(lc);
......
......@@ -32,7 +32,7 @@ const int MALLOC_ALIGN = 64;
namespace fpga = paddle_mobile::fpga;
void Copy(void *dst, const void *src, size_t num) {
std::memcpy(dst, src, num);
fpga::fpga_copy(dst, src, num);
}
void *Alloc(size_t size) { return fpga::fpga_malloc(size); }
......
......@@ -27,6 +27,7 @@ REGISTER_OPERATOR_CPU(conv2d_transpose, ops::ConvOpTranspose);
#ifdef PADDLE_MOBILE_MALI_GPU
#endif
#ifdef PADDLE_MOBILE_FPGA
REGISTER_OPERATOR_FPGA(conv2d_transpose, ops::ConvOpTranspose);
#endif
#endif
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifdef FUSION_DECONVRELU_OP
#include "operators/fusion_deconv_relu_op.h"
namespace paddle_mobile {
namespace operators {}
} // namespace paddle_mobile
namespace ops = paddle_mobile::operators;
#ifdef PADDLE_MOBILE_CPU
#endif
#ifdef PADDLE_MOBILE_MALI_GPU
#endif
#ifdef PADDLE_MOBILE_FPGA
REGISTER_OPERATOR_FPGA(fusion_deconv_relu, ops::FusionDeconvReluOp);
#endif
#endif
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifdef FUSION_DECONVRELU_OP
#pragma once
#include <string>
#include <vector>
#include "framework/operator.h"
#include "framework/program/program-optimize/fusion_op_register.h"
#include "operators/kernel/deconv_relu_kernel.h"
namespace paddle_mobile {
namespace operators {
using std::string;
using std::vector;
class FusionDeconvReluMatcher : public framework::FusionOpMatcher {
public:
FusionDeconvReluMatcher() {
node_ = framework::Node(G_OP_TYPE_CONV_TRANSPOSE);
node_ > std::make_shared<framework::Node>(G_OP_TYPE_RELU);
}
void FolderNodes(
framework::Node *node,
std::vector<std::shared_ptr<framework::Node>> *removed_nodes) {
node->Folder(node_.Depth(), Type(), {}, removed_nodes);
}
std::string Type() { return G_OP_TYPE_FUSION_FC_RELU; }
};
// Fused transposed-convolution + relu operator. Construction forwards all
// arguments to OperatorWithKernel; InferShape() validates the attribute ranks
// and resizes the output tensor.
template <typename DeviceType, typename T>
class FusionDeconvReluOp : public framework::OperatorWithKernel<
                               DeviceType, FusionDeconvReluParam<DeviceType>,
                               operators::DeconvReluKernel<DeviceType, T>> {
 public:
  FusionDeconvReluOp(const string &type, const VariableNameMap &inputs,
                     const VariableNameMap &outputs,
                     const framework::AttributeMap &attrs,
                     std::shared_ptr<framework::Scope> scope)
      : framework::OperatorWithKernel<
            DeviceType, FusionDeconvReluParam<DeviceType>,
            operators::DeconvReluKernel<DeviceType, T>>(type, inputs, outputs,
                                                        attrs, scope) {}

  // Output shape is {in[0], filter[1] * groups, spatial...}, where each
  // spatial extent is
  //   (in - 1) * stride - 2 * padding + dilation * (kernel - 1) + 1.
  void InferShape() const {
    auto input = this->param_.Input();
    auto in_dims = input->dims();

    auto filter = this->param_.Filter();
    auto filter_dims = filter->dims();

    std::vector<int> strides = this->param_.Strides();
    std::vector<int> paddings = this->param_.Paddings();
    std::vector<int> dilations = this->param_.Dilations();
    int groups = this->param_.Groups();

    // Rank checks on the input and filter tensors.
    PADDLE_MOBILE_ENFORCE(
        in_dims.size() == 4 || in_dims.size() == 5,
        "ConvTransposeOp intput should be 4-D or 5-D tensor.");
    PADDLE_MOBILE_ENFORCE(
        in_dims.size() == filter_dims.size(),
        "ConvTransposeOp input dimension and filter dimension "
        "should be the same.");

    // Every per-axis attribute must describe the same number of spatial axes.
    PADDLE_MOBILE_ENFORCE(
        in_dims.size() - strides.size() == 2U,
        "ConvTransposeOp input dimension and strides dimension should "
        "be consistent.");
    PADDLE_MOBILE_ENFORCE(paddings.size() == strides.size(),
                          "ConvTransposeOp paddings dimension and strides "
                          "dimension should be the same.");
    PADDLE_MOBILE_ENFORCE(paddings.size() == dilations.size(),
                          "ConvTransposeOp paddings dimension and dilations "
                          "dimension should be the same.");

    PADDLE_MOBILE_ENFORCE(
        in_dims[1] == filter_dims[0],
        "In ConvTransposeOp, The number of input channels should "
        "be equal to the number of filter's channels.");

    std::vector<int64_t> output_shape({in_dims[0], filter_dims[1] * groups});
    const size_t spatial_rank = strides.size();
    for (size_t axis = 0; axis < spatial_rank; ++axis) {
      auto kernel_span = dilations[axis] * (filter_dims[axis + 2] - 1) + 1;
      auto out_extent = (in_dims[axis + 2] - 1) * strides[axis] -
                        2 * paddings[axis] + kernel_span;
      output_shape.push_back(out_extent);
    }
    this->param_.Output()->Resize(framework::make_ddim(output_shape));
  }

 protected:
};
} // namespace operators
} // namespace paddle_mobile
#endif // FUSION_DECONVRELU_OP
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifdef FUSION_DECONVRELU_OP
#pragma once
#include "framework/operator.h"
#include "operators/op_param.h"
namespace paddle_mobile {
namespace operators {
using framework::OpKernelBase;
/**
 * Kernel interface of the fused conv_transpose + relu operator.
 *
 * Only the declarations live here; Init() and Compute() are defined per
 * device through explicit specializations.
 */
template <typename DeviceType, typename T>
class DeconvReluKernel
    : public OpKernelBase<DeviceType, FusionDeconvReluParam<DeviceType>> {
 public:
  // Prepares the kernel for `param`; the returned flag reports success.
  bool Init(FusionDeconvReluParam<DeviceType> *param);

  // Runs the kernel on the tensors referenced by `param`.
  void Compute(const FusionDeconvReluParam<DeviceType> &param);
};
} // namespace operators
} // namespace paddle_mobile
#endif
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifdef CONV_TRANSPOSE_OP
#include "operators/kernel/conv_transpose_kernel.h"
namespace paddle_mobile {
namespace operators {
// FPGA specialization of ConvTransposeKernel::Init.
// Performs no setup: `param` is not inspected and the function always
// returns true.
template <>
bool ConvTransposeKernel<FPGA, float>::Init(ConvTransposeParam<FPGA> *param) {
return true;
}
// FPGA specialization of ConvTransposeKernel::Compute.
// The body is empty: nothing is read from or written through `param`.
// NOTE(review): looks like a placeholder until the FPGA implementation is
// added -- confirm.
template <>
void ConvTransposeKernel<FPGA, float>::Compute(
const ConvTransposeParam<FPGA> &param) {}
} // namespace operators
} // namespace paddle_mobile
#endif
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifdef FUSION_DECONVRELU_OP
#include "operators/kernel/deconv_relu_kernel.h"
#include "framework/operator.h"
#include "operators/op_param.h"
namespace paddle_mobile {
namespace operators {
// FPGA specialization of DeconvReluKernel::Init.
// Performs no setup: `param` is not inspected and the function always
// returns true.
template <>
bool DeconvReluKernel<FPGA, float>::Init(FusionDeconvReluParam<FPGA> *param) {
return true;
}
// FPGA specialization of DeconvReluKernel::Compute.
// The body is empty: nothing is read from or written through `param`.
// NOTE(review): looks like a placeholder until the FPGA implementation is
// added -- confirm.
template <>
void DeconvReluKernel<FPGA, float>::Compute(
const FusionDeconvReluParam<FPGA> &param) {}
} // namespace operators
} // namespace paddle_mobile
#endif
......@@ -24,6 +24,7 @@ bool SliceKernel<FPGA, float>::Init(SliceParam<FPGA>* param) {
}
// FPGA specialization of SliceKernel::Compute.
// The body is empty: `param` is accepted but nothing is computed.
template <>
void SliceKernel<FPGA, float>::Compute(const SliceParam<FPGA>& param) {}
} // namespace operators
} // namespace paddle_mobile
#endif
......@@ -49,7 +49,12 @@ void SoftmaxKernel<FPGA, float>::Compute(const SoftmaxParam<FPGA> &param) {
Tensor *out = param.Out();
fpga::PerformBypass(param.FpgaArgs());
fpga::fpga_invalidate(
(void *)in_x->data<float>(), // NOLINT
fpga::get_aligned_channel_num((int)in_x->dims()[1]) * // NOLINT
sizeof(float));
math::SoftmaxFuntor<CPU, float>()(in_x, out);
fpga::fpga_flush(out->data<float>(), out->memory_size());
}
} // namespace operators
......
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifdef TANH_OP
#include "operators/kernel/tanh_kernel.h"
namespace paddle_mobile {
namespace operators {
// FPGA specialization of TanhKernel::Init.
// Performs no setup: `param` is not inspected and the function always
// returns true.
template <>
bool TanhKernel<FPGA, float>::Init(TanhParam<FPGA> *param) {
return true;
}
// FPGA specialization of TanhKernel::Compute.
// The body is empty: nothing is read from or written through `param`.
// NOTE(review): looks like a placeholder until the FPGA implementation is
// added -- confirm.
template <>
void TanhKernel<FPGA, float>::Compute(const TanhParam<FPGA> &param) {}
} // namespace operators
} // namespace paddle_mobile
#endif
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#ifdef TANH_OP
#include "framework/operator.h"
#include "operators/op_param.h"
namespace paddle_mobile {
namespace operators {
using framework::OpKernelBase;
/**
 * Kernel interface of the tanh operator.
 *
 * Only the declarations live here; Init() and Compute() are defined per
 * device through explicit specializations.
 */
template <typename DeviceType, typename T>
class TanhKernel : public OpKernelBase<DeviceType, TanhParam<DeviceType>> {
 public:
  // Prepares the kernel for `param`; the returned flag reports success.
  bool Init(TanhParam<DeviceType> *param);

  // Runs the kernel on the tensors referenced by `param`.
  void Compute(const TanhParam<DeviceType> &param);
};
} // namespace operators
} // namespace paddle_mobile
#endif
......@@ -1534,6 +1534,27 @@ class ReluParam<GPU_CL> : public ReluParamBase<GPU_CL> {
#endif
#ifdef TANH_OP
template <typename Dtype>
class TanhParam : public OpParam {
typedef typename DtypeTensorTrait<Dtype>::gtype GType;
typedef typename DtypeTensorTrait<Dtype>::rtype RType;
public:
TanhParam(const VariableNameMap &inputs, const VariableNameMap &outputs,
const AttributeMap &attrs, const Scope &scope) {
input_x_ = InputXFrom<GType>(inputs, scope);
out_ = OutFrom<GType>(outputs, scope);
}
const RType *InputX() const { return input_x_; }
RType *Out() const { return out_; }
private:
RType *input_x_;
RType *out_;
};
#endif
#ifdef PRELU_OP
template <typename Dtype>
class PReluParam : public OpParam {
......@@ -2229,9 +2250,24 @@ class ConvTransposeParam : public OpParam {
vector<int> paddings_;
vector<int> dilations_;
int groups;
#ifdef PADDLE_MOBILE_FPGA
private:
fpga::DeconvArgs fpga_conv_args;
public:
const fpga::DeconvArgs &FpgaArgs() const { return fpga_conv_args; }
void SetFpgaArgs(const fpga::DeconvArgs &args) { fpga_conv_args = args; }
#endif
};
#endif
#ifdef FUSION_DECONVRELU_OP
// The fused conv_transpose + relu operator uses the ConvTransposeParam type
// unchanged as its parameter class; this alias only gives it an op-specific
// name.
template <typename Dtype>
using FusionDeconvReluParam = ConvTransposeParam<Dtype>;
#endif
#ifdef GRU_OP
template <typename Dtype>
class GruParam : public OpParam {
......
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifdef TANH_OP
#include "operators/tanh_op.h"
namespace paddle_mobile {
namespace operators {
// Shape inference for tanh: the output tensor is resized to the dimensions
// of the input tensor.
template <typename DeviceType, typename T>
void TanhOp<DeviceType, T>::InferShape() const {
  const auto &input_dims = this->param_.InputX()->dims();
  this->param_.Out()->Resize(input_dims);
}
} // namespace operators
} // namespace paddle_mobile
namespace ops = paddle_mobile::operators;
#ifdef PADDLE_MOBILE_FPGA
// Registers TanhOp for FPGA builds. The op type must be spelled exactly like
// the value of G_OP_TYPE_TANH ("tanh"), in the same way fusion_deconv_relu is
// registered under its lowercase op type; the previous spelling `Tanh` does
// not match the op type name used in programs.
// NOTE(review): if any LOAD_OP/USE_OP style reference elsewhere still uses
// the old `Tanh` spelling, it has to be renamed as well -- confirm.
REGISTER_OPERATOR_FPGA(tanh, ops::TanhOp);
#endif
#endif
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifdef TANH_OP
#pragma once
#include <string>
#include "framework/operator.h"
#include "operators/kernel/tanh_kernel.h"
#include "operators/op_param.h"
namespace paddle_mobile {
namespace operators {
/**
 * Tanh operator: binds TanhParam to TanhKernel through OperatorWithKernel.
 * InferShape() is declared here and defined out of line.
 */
template <typename DeviceType, typename T>
class TanhOp : public framework::OperatorWithKernel<
                   DeviceType, TanhParam<DeviceType>,
                   operators::TanhKernel<DeviceType, T>> {
  // Shorthand for the (long) base class type; private to this class.
  using ParentOp =
      framework::OperatorWithKernel<DeviceType, TanhParam<DeviceType>,
                                    operators::TanhKernel<DeviceType, T>>;

 public:
  // Forwards every argument unchanged to the OperatorWithKernel base.
  TanhOp(const std::string &type, const VariableNameMap &inputs,
         const VariableNameMap &outputs, const framework::AttributeMap &attrs,
         std::shared_ptr<framework::Scope> scope)
      : ParentOp(type, inputs, outputs, attrs, scope) {}

  void InferShape() const override;
};
} // namespace operators
} // namespace paddle_mobile
#endif
......@@ -73,6 +73,9 @@ list(FIND NET "FPGA_NET_V2" CON)
# Test executables built when "FPGA_NET_V2" was found in NET
# (CON is the index produced by the preceding list(FIND ...) call).
if (CON GREATER -1)
# ResNet50 test.
ADD_EXECUTABLE(test-resnet50 fpga/test_resnet50.cpp test_helper.h test_include.h executor_for_test.h)
target_link_libraries(test-resnet50 paddle-mobile)
# Test built from fpga/test_pe.cpp.
ADD_EXECUTABLE(test-pe fpga/test_pe.cpp)
target_link_libraries(test-pe paddle-mobile)
set(FOUND_MATCH ON)
endif ()
......
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifdef PADDLE_MOBILE_FPGA_V2
#include "fpga/V2/api.h"
#include "fpga/V2/filter.h"
namespace fpga = paddle_mobile::fpga;
// Test configuration. N and C are passed to the channel/num alignment
// helpers in format_data(); H and W are the image height and width.
static const uint32_t N = 64;
static const uint32_t C = 3;
static const uint32_t H = 224;
static const uint32_t W = 224;
// G is not referenced by the code in this file.
static const uint32_t G = 1;
// Data types handed to the bypass operation in test_bypass().
fpga::DataType input_type = fpga::DATA_TYPE_FP32;
fpga::DataType output_type = fpga::DATA_TYPE_FP16;
// Input / output feature map buffers, allocated in format_data().
void* ifm = nullptr;
void* ofm = nullptr;
// Never assigned in this file.
void* filter = nullptr;
// 8-byte scale buffers, allocated in format_data().
void* ifm_scale = nullptr;
void* ofm_scale = nullptr;
// Never assigned in this file.
void* filter_scale = nullptr;
// Buffer lengths in elements (not bytes); set in format_data().
int ifm_size = 0, ofm_size = 0;
// Allocates the global scale and feature map buffers, zeroes both feature
// maps, writes a deterministic pattern into the input and flushes both
// buffers.
void format_data() {
  ifm_scale = fpga::fpga_malloc(8);
  ofm_scale = fpga::fpga_malloc(8);

  int ifm_channel = fpga::filter::calc_aligned_channel(C);
  int ofm_channel = fpga::filter::calc_aligned_channel(N);
  int num = fpga::filter::calc_aligned_num(N, C);
  DLOG << "ifm_channel = " << ifm_channel;
  DLOG << "ofm_channel = " << ofm_channel;
  DLOG << "aligned_num = " << num;

  // Element counts use the aligned channel numbers.
  ifm_size = ifm_channel * H * W;
  ofm_size = ofm_channel * H * W;
  const size_t ifm_bytes = ifm_size * sizeof(float);
  const size_t ofm_bytes = ofm_size * sizeof(int16_t);

  ifm = fpga::fpga_malloc(ifm_bytes);
  ofm = fpga::fpga_malloc(ofm_bytes);
  memset(ifm, 0, ifm_bytes);
  memset(ofm, 0, ofm_bytes);

  // Layout is height-major, then width, then (aligned) channel. Only the
  // first C channels of each pixel are written; the rest stays zero.
  float* input = reinterpret_cast<float*>(ifm);
  for (int row = 0; row < H; row++) {
    for (int col = 0; col < W; col++) {
      const int pixel_base = (row * W + col) * ifm_channel;
      for (int ch = 0; ch < C; ch++) {
        input[pixel_base + ch] = row + col + ch * 0.1f;
      }
    }
  }

  fpga::fpga_flush(ifm, ifm_bytes);
  fpga::fpga_flush(ofm, ofm_bytes);
}
// Logs roughly `num` evenly spaced samples of a half-precision buffer.
//
// ptr        - buffer of `total_size` 16-bit values.
// total_size - number of elements in `ptr`.
// num        - requested number of samples.
//
// The buffer is invalidated before it is read. Nothing is done for a null
// buffer or a non-positive size/sample count.
void print_fp16(int16_t* ptr, int total_size, int num) {
  if (ptr == nullptr || total_size <= 0 || num <= 0) {
    return;  // nothing to print; also avoids dividing by zero below
  }
  fpga::fpga_invalidate(ptr, total_size * sizeof(int16_t));

  // When fewer elements than samples are available the quotient is 0; a zero
  // stride would never advance the loop, so fall back to printing every
  // element.
  const int quotient = total_size / num;
  const int stride = quotient > 0 ? quotient : 1;
  for (int i = 0; i < total_size; i += stride) {
    DLOG << fpga::fp16_2_fp32(ptr[i]);
  }
}
// Logs roughly `num` evenly spaced samples of a float buffer.
//
// ptr        - buffer of `total_size` floats.
// total_size - number of elements in `ptr`.
// num        - requested number of samples.
//
// The buffer is invalidated before it is read. Nothing is done for a null
// buffer or a non-positive size/sample count.
void print_fp32(float* ptr, int total_size, int num) {
  if (ptr == nullptr || total_size <= 0 || num <= 0) {
    return;  // nothing to print; also avoids dividing by zero below
  }
  fpga::fpga_invalidate(ptr, total_size * sizeof(float));

  // When fewer elements than samples are available the quotient is 0; a zero
  // stride would never advance the loop, so fall back to printing every
  // element.
  const int quotient = total_size / num;
  const int stride = quotient > 0 ? quotient : 1;
  for (int i = 0; i < total_size; i += stride) {
    DLOG << ptr[i];
  }
}
// Runs one bypass operation from the global input feature map to the global
// output feature map, using the globally configured data types.
void test_bypass() {
  // Value-initialize so that every field that is not assigned below starts
  // zeroed instead of holding an indeterminate value when the struct is
  // handed to PerformBypass.
  fpga::BypassArgs args{};
  args.input_data_type = input_type;
  args.output_data_type = output_type;

  args.image.address = ifm;
  args.image.height = H;
  args.image.width = W;
  args.image.channels = C;
  args.image.scale_address = reinterpret_cast<float*>(ifm_scale);

  args.output.address = ofm;
  args.output.scale_address = reinterpret_cast<float*>(ofm_scale);

  fpga::PerformBypass(args);
}
int main() {
paddle_mobile::fpga::open_device();
format_data();
DLOG << "format data done";
print_fp32(reinterpret_cast<float*>(ifm), ifm_size, 200);
DLOG << "print input done";
test_bypass();
DLOG << "test done";
print_fp16(reinterpret_cast<int16_t*>(ofm), ifm_size, 200);
std::cout << "Computation done" << std::endl;
return 0;
}
#endif
......@@ -133,9 +133,11 @@ if (CON GREATER -1)
set(SOFTMAX_OP ON)
set(FUSION_CONVBNRELU_OP ON)
set(FUSION_CONVBN_OP ON)
# set(CONV_TRANSPOSE_OP ON)
# set(SLICE_OP ON)
# set(ELEMENTWISEADD_OP ON)
set(CONV_TRANSPOSE_OP ON)
set(FUSION_DECONVRELU_OP ON)
set(SLICE_OP ON)
set(TANH_OP ON)
set(ELEMENTWISEADD_OP ON)
set(FOUND_MATCH ON)
endif()
......@@ -445,3 +447,9 @@ if (DEQUANT_OP)
add_definitions(-DDEQUANT_OP)
endif()
# Turn the CMake switches into preprocessor definitions; the op sources are
# wrapped in "#ifdef TANH_OP" / "#ifdef FUSION_DECONVRELU_OP" guards.
if (TANH_OP)
add_definitions(-DTANH_OP)
endif()
if (FUSION_DECONVRELU_OP)
add_definitions(-DFUSION_DECONVRELU_OP)
endif()
\ No newline at end of file
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册