提交 6b14134f 编写于 作者: Z zhangyang

add deconv op for V1 for FPGA track

上级 467bbfe7
...@@ -196,19 +196,35 @@ void fill_split_arg(struct SplitConvArgs *arg, framework::Tensor *input, ...@@ -196,19 +196,35 @@ void fill_split_arg(struct SplitConvArgs *arg, framework::Tensor *input,
arg->conv_arg[i].image.pad_height = (uint32_t)padding_h; arg->conv_arg[i].image.pad_height = (uint32_t)padding_h;
arg->conv_arg[i].image.pad_width = (uint32_t)padding_w; arg->conv_arg[i].image.pad_width = (uint32_t)padding_w;
arg->conv_arg[i].filter_scale_address = filter->scale; arg->conv_arg[i].filter_scale_address = filter->scale;
arg->conv_arg[i].filter_address = &( // arg->conv_arg[i].filter_address = &(
(int8_t *)filter_ptr)[i * element_num * filter_num_per_div]; // NOLINT // (int8_t *)filter_ptr)[i * element_num * filter_num_per_div]; //
arg->conv_arg[i].sb_address = &bs_ptr[i * filter_num_per_div * 2]; // NOLINT
// arg->conv_arg[i].sb_address = &bs_ptr[i * filter_num_per_div * 2];
arg->conv_arg[i].filter_num = (uint32_t)( arg->conv_arg[i].filter_num = (uint32_t)(
i == n - 1 ? channel - (n - 1) * filter_num_per_div // NOLINT i == n - 1 ? channel - (n - 1) * filter_num_per_div // NOLINT
: filter_num_per_div); : filter_num_per_div);
size_t filter_size =
element_num * arg->conv_arg[i].filter_num * sizeof(int8_t);
auto filter_head =
&((int8_t *)filter_ptr)[i * element_num * filter_num_per_div];
arg->conv_arg[i].filter_address = fpga_malloc(filter_size);
memcpy(arg->conv_arg[i].filter_address, filter_head, filter_size);
fpga_flush(arg->conv_arg[i].filter_address, filter_size);
size_t bs_size = 2 * arg->conv_arg[i].filter_num * sizeof(float);
auto bs_head = &bs_ptr[i * filter_num_per_div * 2];
arg->conv_arg[i].sb_address = fpga_malloc(bs_size);
memcpy(arg->conv_arg[i].sb_address, bs_head, bs_size);
fpga_flush(arg->conv_arg[i].sb_address, bs_size);
if (n > 1) { if (n > 1) {
arg->conv_arg[i].output.scale_address = arg->conv_arg[i].output.scale_address =
(float *)fpga_malloc(2 * sizeof(float)); // NOLINT (float *)fpga_malloc(2 * sizeof(float)); // NOLINT
arg->conv_arg[i].output.address = arg->conv_arg[i].output.address =
fpga_malloc(input->dims()[2] * fpga_malloc(out->dims()[2] *
align_to_x(input->dims()[3] * arg->conv_arg[i].filter_num, align_to_x(out->dims()[3] * arg->conv_arg[i].filter_num,
IMAGE_ALIGNMENT) * IMAGE_ALIGNMENT) *
sizeof(half)); sizeof(half));
} else { } else {
...@@ -221,6 +237,8 @@ void fill_split_arg(struct SplitConvArgs *arg, framework::Tensor *input, ...@@ -221,6 +237,8 @@ void fill_split_arg(struct SplitConvArgs *arg, framework::Tensor *input,
arg->concat_arg.scales_in[i] = arg->conv_arg[i].output.scale_address; arg->concat_arg.scales_in[i] = arg->conv_arg[i].output.scale_address;
arg->concat_arg.channel_num[i] = arg->conv_arg[i].filter_num; arg->concat_arg.channel_num[i] = arg->conv_arg[i].filter_num;
} }
filter->reset_data_ptr(nullptr);
fpga_free(bs_ptr);
} }
} // namespace fpga } // namespace fpga
......
...@@ -137,24 +137,23 @@ void align_num(char **data_in, int num_per_div_before_alignment, int num, ...@@ -137,24 +137,23 @@ void align_num(char **data_in, int num_per_div_before_alignment, int num,
int align_chw = align_to_x(chw, FILTER_ELEMENT_ALIGNMENT); int align_chw = align_to_x(chw, FILTER_ELEMENT_ALIGNMENT);
int num_per_div_after_alignment = int num_per_div_after_alignment =
align_to_x(num_per_div_before_alignment, FILTER_NUM_ALIGNMENT); align_to_x(num_per_div_before_alignment, FILTER_NUM_ALIGNMENT);
if (num_per_div_after_alignment != num_per_div_before_alignment) {
char *tmp = *data_in;
int div_num =
(num + num_per_div_before_alignment - 1) / num_per_div_before_alignment;
int num_element = div_num * num_per_div_after_alignment * align_chw;
char *data_tmp = (char *)fpga_malloc(num_element * sizeof(char)); // NOLINT
memset(data_tmp, 0, num_element * sizeof(char)); char *tmp = *data_in;
int div_num =
(num + num_per_div_before_alignment - 1) / num_per_div_before_alignment;
int num_element = div_num * num_per_div_after_alignment * align_chw;
char *data_tmp = (char *)fpga_malloc(num_element * sizeof(char)); // NOLINT
for (i = 0; i < div_num; i++) { memset(data_tmp, 0, num_element * sizeof(char));
memcpy(data_tmp + num_per_div_after_alignment * align_chw * i,
*data_in + num_per_div_before_alignment * align_chw * i,
num_per_div_before_alignment * align_chw);
}
*data_in = data_tmp; for (i = 0; i < div_num; i++) {
fpga_free(tmp); memcpy(data_tmp + num_per_div_after_alignment * align_chw * i,
*data_in + num_per_div_before_alignment * align_chw * i,
num_per_div_before_alignment * align_chw);
} }
*data_in = data_tmp;
fpga_free(tmp);
} }
void reorder(char **data_in, int num_after_alignment, int chw) { void reorder(char **data_in, int num_after_alignment, int chw) {
...@@ -223,7 +222,10 @@ void format_filter(float **data_in, int num, int channel, int height, int width, ...@@ -223,7 +222,10 @@ void format_filter(float **data_in, int num, int channel, int height, int width,
char **quantize_data = (char **)data_in; // NOLINT char **quantize_data = (char **)data_in; // NOLINT
convert_to_hwc(quantize_data, num, channel, height, width); convert_to_hwc(quantize_data, num, channel, height, width);
align_element(quantize_data, num, chw); align_element(quantize_data, num, chw);
align_num(quantize_data, num_per_div_before_alignment, num, chw); if (num_after_alignment != num) {
align_num(quantize_data, num_per_div_before_alignment, num, chw);
}
reorder(quantize_data, num_after_alignment, chw); reorder(quantize_data, num_after_alignment, chw);
interleave(quantize_data, num_after_alignment, chw); interleave(quantize_data, num_after_alignment, chw);
fpga_flush(*quantize_data, align_to_x(chw, FILTER_ELEMENT_ALIGNMENT) * fpga_flush(*quantize_data, align_to_x(chw, FILTER_ELEMENT_ALIGNMENT) *
...@@ -254,15 +256,18 @@ void format_fc_filter(float **data_in, int num, int channel, int height, ...@@ -254,15 +256,18 @@ void format_fc_filter(float **data_in, int num, int channel, int height,
align_to_x(num_per_div_before_alignment, FILTER_NUM_ALIGNMENT); align_to_x(num_per_div_before_alignment, FILTER_NUM_ALIGNMENT);
int div_num = int div_num =
(num + num_per_div_before_alignment - 1) / num_per_div_before_alignment; (num + num_per_div_before_alignment - 1) / num_per_div_before_alignment;
int num_after_alignment = num_per_div_after_alignment * div_num; int residual = num % num_per_div_before_alignment;
int num_after_alignment = num_per_div_after_alignment *
((residual == 0) ? div_num : (div_num - 1)) +
align_to_x(residual, FILTER_NUM_ALIGNMENT);
quantize(data_in, data_size, max); quantize(data_in, data_size, max);
char **quantize_data = (char **)data_in; // NOLINT char **quantize_data = (char **)data_in; // NOLINT
convert_fc_filter(quantize_data, num, chw); convert_fc_filter(quantize_data, num, chw);
align_element(quantize_data, num, chw); align_element(quantize_data, num, chw);
align_num(quantize_data, num_per_div_before_alignment, num, chw); if (num_after_alignment != num) {
align_num(quantize_data, num_per_div_before_alignment, num, chw);
}
reorder(quantize_data, num_after_alignment, chw); reorder(quantize_data, num_after_alignment, chw);
interleave(quantize_data, num_after_alignment, chw); interleave(quantize_data, num_after_alignment, chw);
fpga_flush(*quantize_data, align_to_x(chw, FILTER_ELEMENT_ALIGNMENT) * fpga_flush(*quantize_data, align_to_x(chw, FILTER_ELEMENT_ALIGNMENT) *
......
此差异已折叠。
...@@ -137,11 +137,13 @@ int fpga_regpoll(uint64_t reg, uint64_t val, int time) { ...@@ -137,11 +137,13 @@ int fpga_regpoll(uint64_t reg, uint64_t val, int time) {
for (i = 0; i < timeout; i++) { for (i = 0; i < timeout; i++) {
if (val == reg_readq(reg)) { if (val == reg_readq(reg)) {
std::cout << "fpga_regpoll:" << i << "val:" << val << "reg:" << reg
<< std::endl;
break; break;
} }
} }
if (i <= timeout) { if (i < timeout) {
return 0; return 0;
} else { } else {
return -1; return -1;
...@@ -153,6 +155,12 @@ int memory_request(struct fpga_memory *memory, size_t size, uint64_t *addr) { ...@@ -153,6 +155,12 @@ int memory_request(struct fpga_memory *memory, size_t size, uint64_t *addr) {
uint64_t _nr = DIV_ROUND_UP(size, FPGA_PAGE_SIZE); uint64_t _nr = DIV_ROUND_UP(size, FPGA_PAGE_SIZE);
unsigned int nr = (unsigned int)_nr; unsigned int nr = (unsigned int)_nr;
int ret = 0; int ret = 0;
DLOG << size;
DLOG << _nr;
DLOG << nr;
uint64_t a_size = FPGA_PAGE_SIZE * nr;
DLOG << a_size;
pthread_mutex_lock(&memory->mutex); pthread_mutex_lock(&memory->mutex);
...@@ -166,6 +174,7 @@ int memory_request(struct fpga_memory *memory, size_t size, uint64_t *addr) { ...@@ -166,6 +174,7 @@ int memory_request(struct fpga_memory *memory, size_t size, uint64_t *addr) {
*addr = address_ofset; *addr = address_ofset;
} else { } else {
DLOG << "memory request failed!";
ret = -ENOMEM; ret = -ENOMEM;
} }
...@@ -282,7 +291,7 @@ uint64_t vaddr_to_paddr(void *address) { ...@@ -282,7 +291,7 @@ uint64_t vaddr_to_paddr(void *address) {
if (iter != g_fpgainfo.fpga_vaddr2paddr_map.end()) { if (iter != g_fpgainfo.fpga_vaddr2paddr_map.end()) {
paddr = iter->second; paddr = iter->second;
} else { } else {
DLOG << "Invalid pointer"; DLOG << "Invalid pointer: " << address;
} }
return paddr; return paddr;
...@@ -348,6 +357,11 @@ void fpga_free_driver(void *ptr) { ...@@ -348,6 +357,11 @@ void fpga_free_driver(void *ptr) {
fpga_bitmap::bitmap_clear(g_fpgainfo.memory_info->bitmap, pos, fpga_bitmap::bitmap_clear(g_fpgainfo.memory_info->bitmap, pos,
g_fpgainfo.memory_info->nr[pos]); g_fpgainfo.memory_info->nr[pos]);
pthread_mutex_unlock(&g_fpgainfo.memory_info->mutex); pthread_mutex_unlock(&g_fpgainfo.memory_info->mutex);
auto iter = g_fpgainfo.fpga_vaddr2paddr_map.find(ptr);
if (iter != g_fpgainfo.fpga_vaddr2paddr_map.end()) {
g_fpgainfo.fpga_vaddr2paddr_map.erase(iter);
}
} else { } else {
DLOG << "Invalid pointer"; DLOG << "Invalid pointer";
} }
......
...@@ -17,6 +17,7 @@ limitations under the License. */ ...@@ -17,6 +17,7 @@ limitations under the License. */
#include <ctype.h> #include <ctype.h>
#include <stdio.h> #include <stdio.h>
#include <stdlib.h> #include <stdlib.h>
#include <unistd.h>
#include <cstring> #include <cstring>
#include <map> #include <map>
...@@ -44,7 +45,7 @@ const int PE_IDX_POOLING = 1; ...@@ -44,7 +45,7 @@ const int PE_IDX_POOLING = 1;
const int PE_IDX_EW = 2; const int PE_IDX_EW = 2;
const int PE_IDX_BYPASS = 3; const int PE_IDX_BYPASS = 3;
enum pe_status { IDLE = 0, BUSY = 1 }; enum pe_status { IDLE = 0, BUSY = 1, ERROR = 2 };
struct MemoryCacheArgs { struct MemoryCacheArgs {
void *offset; void *offset;
...@@ -58,7 +59,7 @@ struct MemoryCacheArgs { ...@@ -58,7 +59,7 @@ struct MemoryCacheArgs {
struct fpga_pe { struct fpga_pe {
char type_name[MAX_TYPE_NAME_LENTH + 1]; char type_name[MAX_TYPE_NAME_LENTH + 1];
struct pe_data_s *outer; struct pe_data_s *outer;
pe_status status; // 0=idle 1=busy -1=fail pe_status status;
uint64_t interrupt_cnt; uint64_t interrupt_cnt;
}; };
...@@ -106,6 +107,8 @@ inline uint64_t reg_readq(uint32_t offset) { ...@@ -106,6 +107,8 @@ inline uint64_t reg_readq(uint32_t offset) {
uint64_t value = uint64_t value =
*(volatile uint64_t *)((uint8_t *)g_fpgainfo.FpgaRegVirAddr + // NOLINT *(volatile uint64_t *)((uint8_t *)g_fpgainfo.FpgaRegVirAddr + // NOLINT
offset); // NOLINT offset); // NOLINT
// DLOG << "read end";
usleep(10);
return value; return value;
} }
...@@ -114,6 +117,8 @@ inline void reg_writeq(uint64_t value, uint32_t offset) { ...@@ -114,6 +117,8 @@ inline void reg_writeq(uint64_t value, uint32_t offset) {
// DLOG << "offset : " << offset << ", value : " << value; // DLOG << "offset : " << offset << ", value : " << value;
*(volatile uint64_t *)((uint8_t *)g_fpgainfo.FpgaRegVirAddr + // NOLINT *(volatile uint64_t *)((uint8_t *)g_fpgainfo.FpgaRegVirAddr + // NOLINT
offset) = value; offset) = value;
// DLOG << "write end";
usleep(10);
} }
int open_device_driver(); int open_device_driver();
......
...@@ -74,12 +74,21 @@ struct ConcatArgs { ...@@ -74,12 +74,21 @@ struct ConcatArgs {
void* image_out; void* image_out;
float* scale_out; float* scale_out;
uint32_t* channel_num; uint32_t* channel_num;
// uint32_t* aligned_channel_num; uint32_t* aligned_channel_num;
// uint32_t out_channel; uint32_t out_channel;
uint32_t height; uint32_t height;
uint32_t width; uint32_t width;
}; };
// Argument bundle consumed by fill_split_arg(), which fills one conv_arg[i]
// entry per split and then records each entry's output scale address and
// filter count in concat_arg.
// NOTE(review): presumably describes one convolution executed as several
// sub-convolutions whose outputs are concatenated — confirm against callers.
struct SplitConvArgs {
// presumably the number of entries in conv_arg — TODO confirm
uint32_t split_num;
// NOTE(review): meaning not visible in this change; verify against the op code
uint32_t group_num;
// NOTE(review): looks like the total filter count across all splits — confirm
uint32_t filter_num;
// Output image descriptor (struct ImageOutputArgs is declared elsewhere).
struct ImageOutputArgs output;
// Indexed as an array (conv_arg[i]) by fill_split_arg().
struct ConvArgs* conv_arg;
// fill_split_arg() writes scales_in[i] and channel_num[i] of this member
// from the matching conv_arg[i].
struct ConcatArgs concat_arg;
};
struct SplitArgs { struct SplitArgs {
uint32_t image_num; uint32_t image_num;
int16_t* image_in; int16_t* image_in;
...@@ -91,15 +100,6 @@ struct SplitArgs { ...@@ -91,15 +100,6 @@ struct SplitArgs {
uint32_t width; uint32_t width;
}; };
// NOTE(review): in this diff view this is the removed copy of the struct (its
// old position after SplitArgs); the declaration itself is unchanged.
// Argument bundle consumed by fill_split_arg(), which fills one conv_arg[i]
// entry per split and then records each entry's output scale address and
// filter count in concat_arg.
struct SplitConvArgs {
// presumably the number of entries in conv_arg — TODO confirm
uint32_t split_num;
// NOTE(review): meaning not visible in this change; verify against the op code
uint32_t group_num;
// NOTE(review): looks like the total filter count across all splits — confirm
uint32_t filter_num;
// Output image descriptor (struct ImageOutputArgs is declared elsewhere).
struct ImageOutputArgs output;
// Indexed as an array (conv_arg[i]) by fill_split_arg().
struct ConvArgs* conv_arg;
// fill_split_arg() writes scales_in[i] and channel_num[i] of this member
// from the matching conv_arg[i].
struct ConcatArgs concat_arg;
};
struct PoolingArgs { struct PoolingArgs {
int16_t mode; // mode: 0:max, 1:avg int16_t mode; // mode: 0:max, 1:avg
int16_t kernel_reciprocal; int16_t kernel_reciprocal;
...@@ -127,7 +127,14 @@ struct BypassArgs { ...@@ -127,7 +127,14 @@ struct BypassArgs {
}; };
// Deconvolution argument bundle.
// NOTE(review): this span is side-by-side diff residue. On the first two lines
// and the closing line, the left-hand text is the pre-change struct (a single
// embedded `struct ConvArgs conv_arg;`) and the right-hand text is the
// post-change layout; the unpaired lines in between exist only post-change.
// NOTE(review): presumably conv_args points to sub_conv_num ConvArgs entries —
// TODO confirm against the deconv op implementation.
struct DeconvArgs { struct DeconvArgs {
struct ConvArgs conv_arg; uint32_t sub_conv_num;
uint32_t group_num;
uint32_t filter_num;
uint32_t omit_size;
uint32_t sub_output_width;
uint32_t sub_output_height;
struct ImageOutputArgs output;
struct ConvArgs* conv_args;
}; };
// Rounds `num` up to the nearest multiple of `x` using integer arithmetic:
// (num + x - 1) / x is the ceiling of num / x for num >= 0 and x > 0, and
// multiplying by x turns that quotient back into an aligned size.
// `x` must be non-zero (it is used as a divisor). Values of `num` that are
// already a multiple of `x` are returned unchanged.
static inline int align_to_x(int num, int x) { return (num + x - 1) / x * x; }
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册