提交 32f138a6 编写于 作者: H Houjiang Chen 提交者: GitHub

Merge branch 'develop' into dev-latest

...@@ -196,19 +196,35 @@ void fill_split_arg(struct SplitConvArgs *arg, framework::Tensor *input, ...@@ -196,19 +196,35 @@ void fill_split_arg(struct SplitConvArgs *arg, framework::Tensor *input,
arg->conv_arg[i].image.pad_height = (uint32_t)padding_h; arg->conv_arg[i].image.pad_height = (uint32_t)padding_h;
arg->conv_arg[i].image.pad_width = (uint32_t)padding_w; arg->conv_arg[i].image.pad_width = (uint32_t)padding_w;
arg->conv_arg[i].filter_scale_address = filter->scale; arg->conv_arg[i].filter_scale_address = filter->scale;
arg->conv_arg[i].filter_address = &( // arg->conv_arg[i].filter_address = &(
(int8_t *)filter_ptr)[i * element_num * filter_num_per_div]; // NOLINT // (int8_t *)filter_ptr)[i * element_num * filter_num_per_div]; //
arg->conv_arg[i].sb_address = &bs_ptr[i * filter_num_per_div * 2]; // NOLINT
// arg->conv_arg[i].sb_address = &bs_ptr[i * filter_num_per_div * 2];
arg->conv_arg[i].filter_num = (uint32_t)( arg->conv_arg[i].filter_num = (uint32_t)(
i == n - 1 ? channel - (n - 1) * filter_num_per_div // NOLINT i == n - 1 ? channel - (n - 1) * filter_num_per_div // NOLINT
: filter_num_per_div); : filter_num_per_div);
size_t filter_size =
element_num * arg->conv_arg[i].filter_num * sizeof(int8_t);
auto filter_head =
&((int8_t *)filter_ptr)[i * element_num * filter_num_per_div];
arg->conv_arg[i].filter_address = fpga_malloc(filter_size);
memcpy(arg->conv_arg[i].filter_address, filter_head, filter_size);
fpga_flush(arg->conv_arg[i].filter_address, filter_size);
size_t bs_size = 2 * arg->conv_arg[i].filter_num * sizeof(float);
auto bs_head = &bs_ptr[i * filter_num_per_div * 2];
arg->conv_arg[i].sb_address = fpga_malloc(bs_size);
memcpy(arg->conv_arg[i].sb_address, bs_head, bs_size);
fpga_flush(arg->conv_arg[i].sb_address, bs_size);
if (n > 1) { if (n > 1) {
arg->conv_arg[i].output.scale_address = arg->conv_arg[i].output.scale_address =
(float *)fpga_malloc(2 * sizeof(float)); // NOLINT (float *)fpga_malloc(2 * sizeof(float)); // NOLINT
arg->conv_arg[i].output.address = arg->conv_arg[i].output.address =
fpga_malloc(input->dims()[2] * fpga_malloc(out->dims()[2] *
align_to_x(input->dims()[3] * arg->conv_arg[i].filter_num, align_to_x(out->dims()[3] * arg->conv_arg[i].filter_num,
IMAGE_ALIGNMENT) * IMAGE_ALIGNMENT) *
sizeof(half)); sizeof(half));
} else { } else {
...@@ -221,6 +237,8 @@ void fill_split_arg(struct SplitConvArgs *arg, framework::Tensor *input, ...@@ -221,6 +237,8 @@ void fill_split_arg(struct SplitConvArgs *arg, framework::Tensor *input,
arg->concat_arg.scales_in[i] = arg->conv_arg[i].output.scale_address; arg->concat_arg.scales_in[i] = arg->conv_arg[i].output.scale_address;
arg->concat_arg.channel_num[i] = arg->conv_arg[i].filter_num; arg->concat_arg.channel_num[i] = arg->conv_arg[i].filter_num;
} }
filter->reset_data_ptr(nullptr);
fpga_free(bs_ptr);
} }
} // namespace fpga } // namespace fpga
......
...@@ -137,7 +137,7 @@ void align_num(char **data_in, int num_per_div_before_alignment, int num, ...@@ -137,7 +137,7 @@ void align_num(char **data_in, int num_per_div_before_alignment, int num,
int align_chw = align_to_x(chw, FILTER_ELEMENT_ALIGNMENT); int align_chw = align_to_x(chw, FILTER_ELEMENT_ALIGNMENT);
int num_per_div_after_alignment = int num_per_div_after_alignment =
align_to_x(num_per_div_before_alignment, FILTER_NUM_ALIGNMENT); align_to_x(num_per_div_before_alignment, FILTER_NUM_ALIGNMENT);
if (num_per_div_after_alignment != num_per_div_before_alignment) {
char *tmp = *data_in; char *tmp = *data_in;
int div_num = int div_num =
(num + num_per_div_before_alignment - 1) / num_per_div_before_alignment; (num + num_per_div_before_alignment - 1) / num_per_div_before_alignment;
...@@ -154,7 +154,6 @@ void align_num(char **data_in, int num_per_div_before_alignment, int num, ...@@ -154,7 +154,6 @@ void align_num(char **data_in, int num_per_div_before_alignment, int num,
*data_in = data_tmp; *data_in = data_tmp;
fpga_free(tmp); fpga_free(tmp);
}
} }
void reorder(char **data_in, int num_after_alignment, int chw) { void reorder(char **data_in, int num_after_alignment, int chw) {
...@@ -223,7 +222,10 @@ void format_filter(float **data_in, int num, int channel, int height, int width, ...@@ -223,7 +222,10 @@ void format_filter(float **data_in, int num, int channel, int height, int width,
char **quantize_data = (char **)data_in; // NOLINT char **quantize_data = (char **)data_in; // NOLINT
convert_to_hwc(quantize_data, num, channel, height, width); convert_to_hwc(quantize_data, num, channel, height, width);
align_element(quantize_data, num, chw); align_element(quantize_data, num, chw);
if (num_after_alignment != num) {
align_num(quantize_data, num_per_div_before_alignment, num, chw); align_num(quantize_data, num_per_div_before_alignment, num, chw);
}
reorder(quantize_data, num_after_alignment, chw); reorder(quantize_data, num_after_alignment, chw);
interleave(quantize_data, num_after_alignment, chw); interleave(quantize_data, num_after_alignment, chw);
fpga_flush(*quantize_data, align_to_x(chw, FILTER_ELEMENT_ALIGNMENT) * fpga_flush(*quantize_data, align_to_x(chw, FILTER_ELEMENT_ALIGNMENT) *
...@@ -254,15 +256,18 @@ void format_fc_filter(float **data_in, int num, int channel, int height, ...@@ -254,15 +256,18 @@ void format_fc_filter(float **data_in, int num, int channel, int height,
align_to_x(num_per_div_before_alignment, FILTER_NUM_ALIGNMENT); align_to_x(num_per_div_before_alignment, FILTER_NUM_ALIGNMENT);
int div_num = int div_num =
(num + num_per_div_before_alignment - 1) / num_per_div_before_alignment; (num + num_per_div_before_alignment - 1) / num_per_div_before_alignment;
int num_after_alignment = num_per_div_after_alignment * div_num; int residual = num % num_per_div_before_alignment;
int num_after_alignment = num_per_div_after_alignment *
((residual == 0) ? div_num : (div_num - 1)) +
align_to_x(residual, FILTER_NUM_ALIGNMENT);
quantize(data_in, data_size, max); quantize(data_in, data_size, max);
char **quantize_data = (char **)data_in; // NOLINT char **quantize_data = (char **)data_in; // NOLINT
convert_fc_filter(quantize_data, num, chw); convert_fc_filter(quantize_data, num, chw);
align_element(quantize_data, num, chw); align_element(quantize_data, num, chw);
if (num_after_alignment != num) {
align_num(quantize_data, num_per_div_before_alignment, num, chw); align_num(quantize_data, num_per_div_before_alignment, num, chw);
}
reorder(quantize_data, num_after_alignment, chw); reorder(quantize_data, num_after_alignment, chw);
interleave(quantize_data, num_after_alignment, chw); interleave(quantize_data, num_after_alignment, chw);
fpga_flush(*quantize_data, align_to_x(chw, FILTER_ELEMENT_ALIGNMENT) * fpga_flush(*quantize_data, align_to_x(chw, FILTER_ELEMENT_ALIGNMENT) *
......
此差异已折叠。
...@@ -137,11 +137,13 @@ int fpga_regpoll(uint64_t reg, uint64_t val, int time) { ...@@ -137,11 +137,13 @@ int fpga_regpoll(uint64_t reg, uint64_t val, int time) {
for (i = 0; i < timeout; i++) { for (i = 0; i < timeout; i++) {
if (val == reg_readq(reg)) { if (val == reg_readq(reg)) {
std::cout << "fpga_regpoll:" << i << "val:" << val << "reg:" << reg
<< std::endl;
break; break;
} }
} }
if (i <= timeout) { if (i < timeout) {
return 0; return 0;
} else { } else {
return -1; return -1;
...@@ -153,6 +155,12 @@ int memory_request(struct fpga_memory *memory, size_t size, uint64_t *addr) { ...@@ -153,6 +155,12 @@ int memory_request(struct fpga_memory *memory, size_t size, uint64_t *addr) {
uint64_t _nr = DIV_ROUND_UP(size, FPGA_PAGE_SIZE); uint64_t _nr = DIV_ROUND_UP(size, FPGA_PAGE_SIZE);
unsigned int nr = (unsigned int)_nr; unsigned int nr = (unsigned int)_nr;
int ret = 0; int ret = 0;
DLOG << size;
DLOG << _nr;
DLOG << nr;
uint64_t a_size = FPGA_PAGE_SIZE * nr;
DLOG << a_size;
pthread_mutex_lock(&memory->mutex); pthread_mutex_lock(&memory->mutex);
...@@ -166,6 +174,7 @@ int memory_request(struct fpga_memory *memory, size_t size, uint64_t *addr) { ...@@ -166,6 +174,7 @@ int memory_request(struct fpga_memory *memory, size_t size, uint64_t *addr) {
*addr = address_ofset; *addr = address_ofset;
} else { } else {
DLOG << "memory request failed!";
ret = -ENOMEM; ret = -ENOMEM;
} }
...@@ -282,7 +291,7 @@ uint64_t vaddr_to_paddr(void *address) { ...@@ -282,7 +291,7 @@ uint64_t vaddr_to_paddr(void *address) {
if (iter != g_fpgainfo.fpga_vaddr2paddr_map.end()) { if (iter != g_fpgainfo.fpga_vaddr2paddr_map.end()) {
paddr = iter->second; paddr = iter->second;
} else { } else {
DLOG << "Invalid pointer"; DLOG << "Invalid pointer: " << address;
} }
return paddr; return paddr;
...@@ -348,6 +357,11 @@ void fpga_free_driver(void *ptr) { ...@@ -348,6 +357,11 @@ void fpga_free_driver(void *ptr) {
fpga_bitmap::bitmap_clear(g_fpgainfo.memory_info->bitmap, pos, fpga_bitmap::bitmap_clear(g_fpgainfo.memory_info->bitmap, pos,
g_fpgainfo.memory_info->nr[pos]); g_fpgainfo.memory_info->nr[pos]);
pthread_mutex_unlock(&g_fpgainfo.memory_info->mutex); pthread_mutex_unlock(&g_fpgainfo.memory_info->mutex);
auto iter = g_fpgainfo.fpga_vaddr2paddr_map.find(ptr);
if (iter != g_fpgainfo.fpga_vaddr2paddr_map.end()) {
g_fpgainfo.fpga_vaddr2paddr_map.erase(iter);
}
} else { } else {
DLOG << "Invalid pointer"; DLOG << "Invalid pointer";
} }
......
...@@ -17,6 +17,7 @@ limitations under the License. */ ...@@ -17,6 +17,7 @@ limitations under the License. */
#include <ctype.h> #include <ctype.h>
#include <stdio.h> #include <stdio.h>
#include <stdlib.h> #include <stdlib.h>
#include <unistd.h>
#include <cstring> #include <cstring>
#include <map> #include <map>
...@@ -44,7 +45,7 @@ const int PE_IDX_POOLING = 1; ...@@ -44,7 +45,7 @@ const int PE_IDX_POOLING = 1;
const int PE_IDX_EW = 2; const int PE_IDX_EW = 2;
const int PE_IDX_BYPASS = 3; const int PE_IDX_BYPASS = 3;
enum pe_status { IDLE = 0, BUSY = 1 }; enum pe_status { IDLE = 0, BUSY = 1, ERROR = 2 };
struct MemoryCacheArgs { struct MemoryCacheArgs {
void *offset; void *offset;
...@@ -58,7 +59,7 @@ struct MemoryCacheArgs { ...@@ -58,7 +59,7 @@ struct MemoryCacheArgs {
struct fpga_pe { struct fpga_pe {
char type_name[MAX_TYPE_NAME_LENTH + 1]; char type_name[MAX_TYPE_NAME_LENTH + 1];
struct pe_data_s *outer; struct pe_data_s *outer;
pe_status status; // 0=idle 1=busy -1=fail pe_status status;
uint64_t interrupt_cnt; uint64_t interrupt_cnt;
}; };
...@@ -106,6 +107,8 @@ inline uint64_t reg_readq(uint32_t offset) { ...@@ -106,6 +107,8 @@ inline uint64_t reg_readq(uint32_t offset) {
uint64_t value = uint64_t value =
*(volatile uint64_t *)((uint8_t *)g_fpgainfo.FpgaRegVirAddr + // NOLINT *(volatile uint64_t *)((uint8_t *)g_fpgainfo.FpgaRegVirAddr + // NOLINT
offset); // NOLINT offset); // NOLINT
// DLOG << "read end";
usleep(10);
return value; return value;
} }
...@@ -114,6 +117,8 @@ inline void reg_writeq(uint64_t value, uint32_t offset) { ...@@ -114,6 +117,8 @@ inline void reg_writeq(uint64_t value, uint32_t offset) {
// DLOG << "offset : " << offset << ", value : " << value; // DLOG << "offset : " << offset << ", value : " << value;
*(volatile uint64_t *)((uint8_t *)g_fpgainfo.FpgaRegVirAddr + // NOLINT *(volatile uint64_t *)((uint8_t *)g_fpgainfo.FpgaRegVirAddr + // NOLINT
offset) = value; offset) = value;
// DLOG << "write end";
usleep(10);
} }
int open_device_driver(); int open_device_driver();
......
...@@ -74,12 +74,21 @@ struct ConcatArgs { ...@@ -74,12 +74,21 @@ struct ConcatArgs {
void* image_out; void* image_out;
float* scale_out; float* scale_out;
uint32_t* channel_num; uint32_t* channel_num;
// uint32_t* aligned_channel_num; uint32_t* aligned_channel_num;
// uint32_t out_channel; uint32_t out_channel;
uint32_t height; uint32_t height;
uint32_t width; uint32_t width;
}; };
struct SplitConvArgs {
uint32_t split_num;
uint32_t group_num;
uint32_t filter_num;
struct ImageOutputArgs output;
struct ConvArgs* conv_arg;
struct ConcatArgs concat_arg;
};
struct SplitArgs { struct SplitArgs {
uint32_t image_num; uint32_t image_num;
int16_t* image_in; int16_t* image_in;
...@@ -91,15 +100,6 @@ struct SplitArgs { ...@@ -91,15 +100,6 @@ struct SplitArgs {
uint32_t width; uint32_t width;
}; };
struct SplitConvArgs {
uint32_t split_num;
uint32_t group_num;
uint32_t filter_num;
struct ImageOutputArgs output;
struct ConvArgs* conv_arg;
struct ConcatArgs concat_arg;
};
struct PoolingArgs { struct PoolingArgs {
int16_t mode; // mode: 0:max, 1:avg int16_t mode; // mode: 0:max, 1:avg
int16_t kernel_reciprocal; int16_t kernel_reciprocal;
...@@ -127,7 +127,14 @@ struct BypassArgs { ...@@ -127,7 +127,14 @@ struct BypassArgs {
}; };
struct DeconvArgs { struct DeconvArgs {
struct ConvArgs conv_arg; uint32_t sub_conv_num;
uint32_t group_num;
uint32_t filter_num;
uint32_t omit_size;
uint32_t sub_output_width;
uint32_t sub_output_height;
struct ImageOutputArgs output;
struct ConvArgs* conv_args;
}; };
static inline int align_to_x(int num, int x) { return (num + x - 1) / x * x; } static inline int align_to_x(int num, int x) { return (num + x - 1) / x * x; }
......
...@@ -12,6 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ...@@ -12,6 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#include <fstream> #include <fstream>
#include <iomanip>
#include <iostream>
#include "../test_include.h" #include "../test_include.h"
#ifdef PADDLE_MOBILE_FPGA_V1 #ifdef PADDLE_MOBILE_FPGA_V1
...@@ -87,26 +89,29 @@ int main() { ...@@ -87,26 +89,29 @@ int main() {
paddle_mobile::PaddleMobile<paddle_mobile::FPGA> paddle_mobile; paddle_mobile::PaddleMobile<paddle_mobile::FPGA> paddle_mobile;
if (paddle_mobile.Load(std::string(g_resnet50), true)) { if (paddle_mobile.Load(std::string(g_resnet50), true)) {
Tensor input_tensor; Tensor input_tensor;
SetupTensor<float>(&input_tensor, {1, 3, 224, 224}, static_cast<float>(0), SetupTensor<float>(&input_tensor, {1, 3, 224, 224}, static_cast<float>(2),
static_cast<float>(1)); static_cast<float>(2));
readStream(g_image_src_float, readStream(g_image_src_float,
input_tensor.mutable_data<float>({1, 3, 224, 224})); input_tensor.mutable_data<float>({1, 3, 224, 224}));
paddle_mobile.FeedData(input_tensor); paddle_mobile.FeedData(input_tensor);
paddle_mobile.Predict_To(-1); paddle_mobile.Predict_To(-1);
/*for(int i = 0; i < 73; i++) for (int i = 0; i < 73; i++) {
{
auto tensor_ptr = paddle_mobile.FetchResult(i); auto tensor_ptr = paddle_mobile.FetchResult(i);
std::string saveName = "resnet50_result_" + std::to_string (i); std::string saveName = "resnet50_result_" + std::to_string(i);
paddle_mobile::fpga::fpga_invalidate((*tensor_ptr).data<float>(), paddle_mobile::fpga::fpga_invalidate((*tensor_ptr).data<float>(),
tensor_ptr->numel()); dump_stride(saveName, (*tensor_ptr), 20); tensor_ptr->numel() * sizeof(half));
//dump(saveName, (*tensor_ptr)); dump_stride(saveName, (*tensor_ptr), 20);
}*/ // dump(saveName, (*tensor_ptr));
}
/*std::shared_ptr<Tensor> output_tensor = paddle_mobile.FetchResult(73); std::shared_ptr<Tensor> output_tensor = paddle_mobile.FetchResult(73);
(*output_tensor).dump<float>("resnet50_result_73"); //(*output_tensor).dump<float>("resnet50_result_73");
output_tensor = paddle_mobile.FetchResult(74); output_tensor = paddle_mobile.FetchResult(74);
(*output_tensor).dump<float>("resnet50_result_74");*/ //(*output_tensor).dump<float>("resnet50_result_74");
std::shared_ptr<Tensor> output_tensor = paddle_mobile.FetchResult(74); // std::shared_ptr<Tensor> output_tensor = paddle_mobile.FetchResult(74);
// output_tensor = paddle_mobile.FetchResult(74);
float max = 0; float max = 0;
auto data_ptr = output_tensor->data<float>(); auto data_ptr = output_tensor->data<float>();
int maximumIdx = 0; int maximumIdx = 0;
...@@ -116,7 +121,7 @@ int main() { ...@@ -116,7 +121,7 @@ int main() {
max = data_ptr[i]; max = data_ptr[i];
} }
} }
std::cout << "index : " << maximumIdx << ", value : " << max std::cout << "index : " << std::dec << maximumIdx << ", value : " << max
<< std::endl; << std::endl;
std::cout << "Computation done" << std::endl; std::cout << "Computation done" << std::endl;
return 0; return 0;
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册