提交 84ebc523 编写于 作者: L liuruilong

Merge remote-tracking branch 'upstream/develop' into develop

# coding=utf-8
import cv2
from array import array
def resize_take_rgbs(path, shape_h_w):
    """Load an image, resize it and return its blue, green and red planes.

    The image is shown in two OpenCV windows ("before" and "after" the
    resize) and some debug information is printed along the way.

    Args:
        path: Path of the image file handed to ``cv2.imread``.
        shape_h_w: Sequence whose first two items are the target height and
            width; any further items (e.g. a channel count) are ignored.

    Returns:
        A ``(bs, gs, rs)`` tuple of flat lists holding the blue, green and
        red samples in row-major order (all of row 0, then row 1, ...).

    Raises:
        IOError: If ``cv2.imread`` could not load ``path``.
    """
    print('--------------resize_take_rgbs-----------------begin')
    image = cv2.imread(path)
    if image is None:
        # cv2.imread reports failure by returning None instead of raising.
        raise IOError("cannot read image: %s" % path)
    cv2.imshow("before", image)
    print_rgb(image[0, 0])

    height = shape_h_w[0]
    width = shape_h_w[1]
    # cv2.resize takes dsize as (width, height); passing (height, width)
    # transposes the target size for every non-square shape.
    image = cv2.resize(image, (width, height))
    cv2.imshow("after", image)
    print(image.shape)

    # Channel 0 is treated as blue, 1 as green and 2 as red. ravel() walks
    # each plane row by row, matching a nested "for h / for w" loop.
    bs_ = list(image[:, :, 0].ravel())
    gs_ = list(image[:, :, 1].ravel())
    rs_ = list(image[:, :, 2].ravel())

    print(len(bs_))
    print(len(gs_))
    print(len(rs_))
    print('--------------resize_take_rgbs-----------------end')
    return bs_, gs_, rs_
def print_rgb(bgr):
    """Print the colour components of a single pixel.

    Args:
        bgr: Three-item sequence holding the blue, green and red values,
            in that order.
    """
    # Unpack in the body: tuple parameters in the signature are not valid
    # syntax on Python 3.
    b, g, r = bgr
    print("像素 - R:%d,G:%d,B:%d" % (r, g, b))  # show the pixel value
#
# image[0, 0] = (100, 150, 200) # 更改位置(0,0)处的像素
#
# (b, g, r) = image[0, 0] # 再次读取(0,0)像素
# print "位置(0,0)处的像素 - 红:%d,绿:%d,蓝:%d" % (r, g, b) # 显示更改后的像素值
#
# corner = image[0:100, 0:100] # 读取像素块
# cv2.imshow("Corner", corner) # 显示读取的像素块
#
# image[0:100, 0:100] = (0, 255, 0); # 更改读取的像素块
#
# cv2.imshow("Updated", image) # 显示图像
#
# cv2.waitKey(0) # 程序暂停
def save_to_file(to_file_name, array):
    """Write the raw contents of ``array`` to a binary file.

    Args:
        to_file_name: Path of the file to create or overwrite.
        array: Object exposing ``tofile(file_object)``, e.g. an
            ``array.array``.
    """
    # The context manager closes the file even when tofile() raises.
    with open(to_file_name, "wb") as to_file:
        array.tofile(to_file)
# coding=utf-8
import cv2
from array import array
import imagetools as tools
from enum import Enum
class ChannelType(Enum):
    """Channel order selected through ``channel_type`` in combine_bgrs_nchw."""

    # No trailing comma: "RGB = 0," would make the member value the tuple (0,).
    RGB = 0
    BGR = 1
def combine_bgrs_nchw(bgrs, means_b_g_r, scale, channel_type=ChannelType.BGR):
    """Pack three colour planes into one planar float array.

    Every sample is converted with ``(value - mean) * scale`` and the three
    planes are written one after another (all samples of the first channel,
    then the second, then the third).

    Args:
        bgrs: Sequence holding the blue, green and red planes, in that
            order, as equally long flat sequences.
        means_b_g_r: Per-channel means in blue, green, red order.
        scale: Factor applied after the mean has been subtracted.
        channel_type: ``ChannelType.BGR`` writes the planes as b, g, r;
            ``ChannelType.RGB`` writes them as r, g, b.

    Returns:
        An ``array('f')`` with three times as many items as one plane.

    Raises:
        AssertionError: If the planes differ in length.
        ValueError: If ``channel_type`` is not a known ``ChannelType``.
    """
    print('--------------combine_bgrs_nchw-----------------begin')
    print("scale: %f" % scale)
    print(means_b_g_r)
    bs, gs, rs = bgrs[0], bgrs[1], bgrs[2]
    assert len(bs) == len(gs) == len(rs)
    print(len(bs))

    blue = (bs, means_b_g_r[0])
    green = (gs, means_b_g_r[1])
    red = (rs, means_b_g_r[2])
    if channel_type == ChannelType.BGR:
        print('bgr')
        ordered_planes = (blue, green, red)
    elif channel_type == ChannelType.RGB:
        print('rgb')
        ordered_planes = (red, green, blue)
    else:
        # Previously an unknown value silently produced an empty array.
        raise ValueError("unsupported channel_type: %r" % (channel_type,))

    bgrs_float_array = array('f')
    for plane, mean in ordered_planes:
        bgrs_float_array.extend((value - mean) * scale for value in plane)

    print(len(bgrs_float_array))
    print('------------------')
    # Debug probes: these positions only exist for large (416x416) inputs,
    # so each one is skipped when the data is too short.
    if len(bgrs_float_array) > 0:
        print(bgrs_float_array[0])
    probe = 416 * 416 * 2 + 416 * 2 + 2
    if probe < len(bgrs_float_array):
        print(bgrs_float_array[probe])
    if 416 * 2 + 2 < len(bs):
        print(bs[416 * 2 + 2] / 255.)
    print('--------------combine_bgrs_nchw-----------------end')
    return bgrs_float_array
# Earlier experiment, kept for reference:
# bgrs = tools.resize_take_rgbs('banana.jpeg', (224, 224, 3))
# array = combine_bgrs_nchw(bgrs, (103.94, 116.78, 123.68), 0.017, array,ChannelType.BGR)
# tools.save_to_file('banana_1_3_224_224_nchw_float')
# cv2.waitKey(0)

# Convert the sample image to planar RGB floats scaled by 1/255 and save
# them as raw binary.
bgrs = tools.resize_take_rgbs('datas/newyolo.jpg', (416, 416, 3))
# Use a distinct name: rebinding ``array`` would hide the imported
# constructor and break any later call to combine_bgrs_nchw.
nchw_floats = combine_bgrs_nchw(bgrs, (0, 0, 0), 1. / 255, ChannelType.RGB)
tools.save_to_file('datas/desktop_1_3_416_416_nchw_float', nchw_floats)
# coding=utf-8
import cv2
from array import array
import imagetools as tools
def combine_bgrs_nhwc(bgrs, means_b_g_r, scale):
    """Pack three colour planes into one interleaved float array.

    Every sample is converted with ``(value - mean) * scale``. The output
    stores the red, green and blue value of the first pixel, then those of
    the second pixel, and so on.

    Args:
        bgrs: Sequence holding the blue, green and red planes, in that
            order, as equally long flat sequences.
        means_b_g_r: Per-channel means in blue, green, red order.
        scale: Factor applied after the mean has been subtracted.

    Returns:
        An ``array('f')`` with three items per input pixel.

    Raises:
        AssertionError: If the planes differ in length.
    """
    print("scale: %f" % scale)
    print(means_b_g_r)
    bs, gs, rs = bgrs[0], bgrs[1], bgrs[2]
    assert len(bs) == len(gs) == len(rs)
    mean_b, mean_g, mean_r = means_b_g_r[0], means_b_g_r[1], means_b_g_r[2]

    bgrs_float_array = array('f')
    for b, g, r in zip(bs, gs, rs):
        bgrs_float_array.append((r - mean_r) * scale)  # r
        bgrs_float_array.append((g - mean_g) * scale)  # g
        bgrs_float_array.append((b - mean_b) * scale)  # b

    print(len(bgrs_float_array))
    print('------------------')
    # Debug probes: skip positions that do not exist for short inputs
    # instead of raising IndexError.
    for probe in (0, 999):
        if probe < len(bgrs_float_array):
            print(bgrs_float_array[probe])
    return bgrs_float_array
# Convert the sample image to interleaved RGB floats scaled by 1/255 and
# save them as raw binary.
bgrs = tools.resize_take_rgbs('newyolo_1.jpg', (416, 416, 3))
# Use a distinct name: rebinding ``array`` would hide the imported
# constructor and break any later call to combine_bgrs_nhwc.
nhwc_floats = combine_bgrs_nhwc(bgrs, (0, 0, 0), 1.0 / 255)
tools.save_to_file('desktop_1_3_416_416_nhwc_float', nhwc_floats)
# Wait for a key press before the script ends.
cv2.waitKey(0)
# coding=utf-8
# 这个脚本是可以将numpy合并到二进制
import cv2
import numpy as np
import imagetools as tools
from array import array
#
# image = cv2.imread(path)
# print image.shape
#
# print_rgb(image[0, 0])
# # image len may be for .just check it
# image.resize(shape_h_w)
# NOTE(review): np.fromfile is called without a dtype, so numpy's default
# element type is used - confirm 'datas/img.res' was written that way.
data = np.fromfile('datas/img.res')
print(data.size)
print(data[0])
# ndarray.reshape returns a new array and leaves ``data`` untouched, so the
# old bare ``data.reshape(1, 3, 416, 416)`` only worked as an implicit size
# check. Make that check explicit; the flat indexing below needs ``data``
# to stay one-dimensional.
expected_size = 1 * 3 * 416 * 416
if data.size != expected_size:
    raise ValueError(
        "expected %d values for shape (1, 3, 416, 416), got %d"
        % (expected_size, data.size))
out_array = array('f')
print('--------------------')
print(data.size)
print(data[0])
print('如果是nhwc --------')
# rgb rgb rgb rgb rgb
print(data[416 * 3 * 2 + 3 * 2 + 2])
# print data[2]
print('如果是nchw --------')
# rgb rgb rgb rgb rgb
print(data[416 * 416 * 2 + 416 * 2 + 2])
# print data[2]
# the data is clearly nchw
# Copy every value into a single-precision array.
out_array.extend(data)
print(len(out_array))
print(out_array[416 * 416 * 2 + 416 * 2 + 2])
tools.save_to_file('datas/in_put_1_3_416_416_2', out_array)
# coding=utf-8
import os
# Concatenate every regular file of the source directory, in
# case-insensitive name order, into one combined parameter file.
path = "yolo_v2_tofile_source/"  # source directory
to_file_path = "yolo_v2_tofile_combined/params"
files = os.listdir(path)  # names of all entries in the source directory
files.sort(key=str.lower)
with open(to_file_path, "wb") as to_file:
    for file_name in files:  # walk the directory entries
        name = path + "/" + file_name
        # Test the entry inside ``path``; the bare name would be resolved
        # against the current working directory instead.
        if os.path.isdir(name):
            continue
        print('name: ' + name)
        with open(name, "rb") as from_file:
            to_file.write(from_file.read())
...@@ -66,7 +66,7 @@ class Swichter: ...@@ -66,7 +66,7 @@ class Swichter:
def read_head(self, head_file): def read_head(self, head_file):
from_file = open(head_file, "rb") from_file = open(head_file, "rb")
read = from_file.read(20) read = from_file.read(24)
# print read # print read
from_file.close() from_file.close()
# print read # print read
...@@ -84,9 +84,32 @@ class Swichter: ...@@ -84,9 +84,32 @@ class Swichter:
to_file.close() to_file.close()
pass pass
def copy_padding_add_head(self, from_file_name, to_file_name, tmp_file_name, padding,
                          head_file_name='/Users/xiebaiyuan/PaddleProject/paddle-mobile/python/tools/mdl2fluid/yolo/conv1_biases'):
    """Copy a file minus its leading bytes, prefixed with a header.

    Args:
        from_file_name: Source file, opened in binary mode.
        to_file_name: Destination file; created or overwritten.
        tmp_file_name: Unused; kept so existing callers keep working.
        padding: Number of leading bytes of the source file to skip.
        head_file_name: File passed to ``self.read_head`` to obtain the
            header bytes; defaults to the path that used to be hard-coded.
    """
    print('padding = %d' % padding)
    with open(from_file_name, "rb") as from_file:
        from_file.seek(padding, 0)
        read = from_file.read()
    print(len(read))
    # Fetch the header before opening the destination, so a failing
    # read_head() does not leave a truncated output file behind.
    head = self.read_head(head_file_name)
    with open(to_file_name, "wb") as to_file:
        to_file.write(head)
        to_file.write(read)
# Swichter().nhwc2nchw_one_slice_add_head(
# '/Users/xiebaiyuan/PaddleProject/paddle-mobile/python/tools/mdl2fluid/multiobjects/float32s_nhwc/conv1_0.bin',
# '/Users/xiebaiyuan/PaddleProject/paddle-mobile/python/tools/mdl2fluid/multiobjects/float32s_nchw_with_head/conv1_0',
# '/Users/xiebaiyuan/PaddleProject/paddle-mobile/python/tools/mdl2fluid/multiobjects/float32s_nchw/.tmp',
# 32,
# 3, 3, 3)
# Swichter().read_head('/Users/xiebaiyuan/PaddleProject/paddle-mobile/python/tools/mdl2fluid/yolo/conv1_biases')
# Swichter().nhwc2nchw_one_slice( # Swichter().copy_add_head('datas/model.0.0.weight', 'datas/conv1_0', '')
# '/Users/xiebaiyuan/PaddleProject/paddle-mobile/python/tools/mdl2fluid/multiobjects/float32s_nhwc/conv5_6_dw_0.bin',
# '/Users/xiebaiyuan/PaddleProject/paddle-mobile/python/tools/mdl2fluid/multiobjects/float32s_nchw/conv5_6_dw_0', 1,
# 512, 3, 3)
Swichter().read_head('/Users/xiebaiyuan/PaddleProject/paddle-mobile/python/tools/mdl2fluid/yolo/conv1_biases')
...@@ -29,9 +29,7 @@ namespace fpga { ...@@ -29,9 +29,7 @@ namespace fpga {
static int fd = -1; static int fd = -1;
static const char *device_path = "/dev/fpgadrv0"; static const char *device_path = "/dev/fpgadrv0";
#ifdef PADDLE_MOBILE_OS_LINUX
static std::map<void *, size_t> memory_map; static std::map<void *, size_t> memory_map;
#endif
static inline int do_ioctl(int req, const void *arg) { static inline int do_ioctl(int req, const void *arg) {
#ifdef PADDLE_MOBILE_OS_LINUX #ifdef PADDLE_MOBILE_OS_LINUX
...@@ -53,32 +51,38 @@ int open_device() { ...@@ -53,32 +51,38 @@ int open_device() {
// memory management; // memory management;
void *fpga_malloc(size_t size) { void *fpga_malloc(size_t size) {
static uint64_t counter = 0; static uint64_t counter = 0;
counter += size;
DLOG << size << " bytes allocated. Total " << counter << " bytes";
#ifdef PADDLE_MOBILE_OS_LINUX #ifdef PADDLE_MOBILE_OS_LINUX
auto ptr = mmap64(nullptr, size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0); auto ptr = mmap64(nullptr, size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
memory_map.insert(std::make_pair(ptr, size));
return ptr;
#else #else
return malloc(size); auto ptr = malloc(size);
#endif #endif
counter += size;
memory_map.insert(std::make_pair(ptr, size));
DLOG << "Address: " << ptr << ", " << size << " bytes allocated. Total "
<< counter << " bytes";
return ptr;
} }
void fpga_free(void *ptr) { void fpga_free(void *ptr) {
#ifdef PADDLE_MOBILE_OS_LINUX
static uint64_t counter = 0; static uint64_t counter = 0;
size_t size = 0; size_t size = 0;
auto iter = memory_map.find(ptr); // std::map<void *, size_t>::iterator auto iter = memory_map.find(ptr); // std::map<void *, size_t>::iterator
if (iter != memory_map.end()) { if (iter != memory_map.end()) {
size = iter->second; size = iter->second;
munmap(ptr, size);
memory_map.erase(iter); memory_map.erase(iter);
} #ifdef PADDLE_MOBILE_OS_LINUX
counter += size; munmap(ptr, size);
DLOG << size << " bytes freed. Total " << counter << " bytes";
#else #else
free(ptr); free(ptr);
#endif #endif
counter += size;
DLOG << "Address: " << ptr << ", " << size << " bytes freed. Total "
<< counter << " bytes";
} else {
DLOG << "Invalid pointer";
}
} }
void fpga_copy(void *dest, const void *src, size_t num) { void fpga_copy(void *dest, const void *src, size_t num) {
...@@ -211,7 +215,8 @@ int PerformBypass(const struct BypassArgs &args) { ...@@ -211,7 +215,8 @@ int PerformBypass(const struct BypassArgs &args) {
int ComputeFPGAConcat(const struct ConcatArgs &args) { int ComputeFPGAConcat(const struct ConcatArgs &args) {
#ifdef FPGA_TEST_MODE #ifdef FPGA_TEST_MODE
DLOG << "=============ComputeFpgaConcat==========="; DLOG << "=============ComputeFpgaConcat===========";
DLOG << " out_address:" << args.image_out DLOG << " Image_num: " << args.image_num
<< " out_address:" << args.image_out
<< " out_scale_address:" << args.scale_out; << " out_scale_address:" << args.scale_out;
DLOG << " image_height:" << args.height << " image_width:" << args.width; DLOG << " image_height:" << args.height << " image_width:" << args.width;
for (int i = 0; i < args.image_num; i++) { for (int i = 0; i < args.image_num; i++) {
...@@ -235,7 +240,7 @@ void format_image(framework::Tensor *image_tensor) { ...@@ -235,7 +240,7 @@ void format_image(framework::Tensor *image_tensor) {
auto channel = dims[1], height = dims[2], width = dims[3]; auto channel = dims[1], height = dims[2], width = dims[3];
auto data_ptr = image_tensor->data<float>(); auto data_ptr = image_tensor->data<float>();
size_t memory_size = channel * height * width * sizeof(float); size_t memory_size = channel * height * width * sizeof(float);
float *new_data = (float *)fpga_malloc(memory_size); auto new_data = (float *)fpga_malloc(memory_size);
fpga_copy(new_data, data_ptr, memory_size); fpga_copy(new_data, data_ptr, memory_size);
image::format_image(&new_data, channel, height, width); image::format_image(&new_data, channel, height, width);
image_tensor->reset_data_ptr(new_data); image_tensor->reset_data_ptr(new_data);
...@@ -346,12 +351,12 @@ void fill_conv_arg(struct WrapperConvArgs *arg, framework::Tensor *input, ...@@ -346,12 +351,12 @@ void fill_conv_arg(struct WrapperConvArgs *arg, framework::Tensor *input,
auto out_ptr = out->data<float>(); auto out_ptr = out->data<float>();
arg->group_num = (uint32_t)group_num; arg->group_num = (uint32_t)group_num;
arg->split_num = (uint32_t)fpga::get_plit_num(filter); // Either group_num or split_num = 1;
arg->split_num = group_num == 1 ? (uint32_t)get_plit_num(filter) : 1;
arg->filter_num = (uint32_t)filter->dims()[0]; arg->filter_num = (uint32_t)filter->dims()[0];
arg->output.address = out_ptr; arg->output.address = out_ptr;
arg->output.scale_address = out->scale; arg->output.scale_address = out->scale;
arg->conv_args = (fpga::ConvArgs *)fpga::fpga_malloc(arg->split_num * arg->conv_args = (ConvArgs *)fpga_malloc(arg->split_num * sizeof(ConvArgs));
sizeof(fpga::ConvArgs));
arg->concat_arg.image_num = arg->split_num; arg->concat_arg.image_num = arg->split_num;
arg->concat_arg.image_out = out_ptr; arg->concat_arg.image_out = out_ptr;
...@@ -360,15 +365,14 @@ void fill_conv_arg(struct WrapperConvArgs *arg, framework::Tensor *input, ...@@ -360,15 +365,14 @@ void fill_conv_arg(struct WrapperConvArgs *arg, framework::Tensor *input,
arg->concat_arg.width = (uint32_t)filter->dims()[3]; arg->concat_arg.width = (uint32_t)filter->dims()[3];
int n = arg->split_num; int n = arg->split_num;
arg->concat_arg.images_in = (half **)fpga::fpga_malloc(n * sizeof(int *)); arg->concat_arg.images_in = (half **)fpga_malloc(n * sizeof(int *));
arg->concat_arg.scales_in = (float **)fpga::fpga_malloc(n * sizeof(float *)); arg->concat_arg.scales_in = (float **)fpga_malloc(n * sizeof(float *));
arg->concat_arg.channel_num = arg->concat_arg.channel_num = (uint32_t *)fpga_malloc(n * sizeof(uint32_t));
(uint32_t *)fpga::fpga_malloc(n * sizeof(uint32_t));
arg->concat_arg.image_out = out_ptr; arg->concat_arg.image_out = out_ptr;
auto channel = (int)out->dims()[1]; auto channel = (int)out->dims()[1];
int filter_num_per_div = fpga::get_filter_num_per_div(filter, group_num); int filter_num_per_div = get_filter_num_per_div(filter, group_num);
int element_num = fpga::get_aligned_filter_element_num( int element_num = get_aligned_filter_element_num(
filter->dims()[1] * filter->dims()[2] * filter->dims()[3]); filter->dims()[1] * filter->dims()[2] * filter->dims()[3]);
for (int i = 0; i < n; i++) { for (int i = 0; i < n; i++) {
...@@ -390,16 +394,17 @@ void fill_conv_arg(struct WrapperConvArgs *arg, framework::Tensor *input, ...@@ -390,16 +394,17 @@ void fill_conv_arg(struct WrapperConvArgs *arg, framework::Tensor *input,
&((int8_t *)filter_ptr)[i * element_num * filter_num_per_div]; &((int8_t *)filter_ptr)[i * element_num * filter_num_per_div];
arg->conv_args[i].sb_address = &bs_ptr[i * filter_num_per_div * 2]; arg->conv_args[i].sb_address = &bs_ptr[i * filter_num_per_div * 2];
arg->conv_args[i].filter_num = arg->conv_args[i].filter_num =
(uint32_t)(i == n - 1 ? fpga::get_aligned_filter_num( (uint32_t)(i == n - 1 ? channel - (n - 1) * filter_num_per_div
channel - (n - 1) * filter_num_per_div)
: filter_num_per_div); : filter_num_per_div);
if (n > 1) { if (n > 1) {
arg->conv_args[i].output.scale_address = arg->conv_args[i].output.scale_address =
(float *)fpga::fpga_malloc(2 * sizeof(float)); (float *)fpga_malloc(2 * sizeof(float));
arg->conv_args[i].output.address = arg->conv_args[i].output.address = fpga_malloc(
fpga::fpga_malloc(input->dims()[2] * input->dims()[3] * input->dims()[2] *
arg->conv_args[i].filter_num * sizeof(half)); align_to_x(input->dims()[3] * arg->conv_args[i].filter_num,
IMAGE_ALIGNMENT) *
sizeof(half));
} }
else { else {
...@@ -408,7 +413,7 @@ void fill_conv_arg(struct WrapperConvArgs *arg, framework::Tensor *input, ...@@ -408,7 +413,7 @@ void fill_conv_arg(struct WrapperConvArgs *arg, framework::Tensor *input,
} }
arg->concat_arg.images_in[i] = (half *)arg->conv_args[i].output.address; arg->concat_arg.images_in[i] = (half *)arg->conv_args[i].output.address;
arg->concat_arg.scales_in[i] = (float *)arg->conv_args[i].sb_address; arg->concat_arg.scales_in[i] = arg->conv_args[i].output.scale_address;
arg->concat_arg.channel_num[i] = arg->conv_args[i].filter_num; arg->concat_arg.channel_num[i] = arg->conv_args[i].filter_num;
} }
} }
......
...@@ -79,7 +79,7 @@ Executor<Dtype, P>::Executor(const framework::Program<Dtype> p, int batch_size, ...@@ -79,7 +79,7 @@ Executor<Dtype, P>::Executor(const framework::Program<Dtype> p, int batch_size,
std::vector<std::shared_ptr<framework::OpDesc>> ops = block_desc->Ops(); std::vector<std::shared_ptr<framework::OpDesc>> ops = block_desc->Ops();
for (int j = 0; j < ops.size(); ++j) { for (int j = 0; j < ops.size(); ++j) {
std::shared_ptr<framework::OpDesc> op = ops[j]; std::shared_ptr<framework::OpDesc> op = ops[j];
DLOG << "create op: " << op->Type(); DLOG << "create op: " << j << " " << op->Type();
auto op_base = framework::OpRegistry<Dtype>::CreateOp( auto op_base = framework::OpRegistry<Dtype>::CreateOp(
op->Type(), op->GetInputs(), op->GetOutputs(), op->GetAttrMap(), op->Type(), op->GetInputs(), op->GetOutputs(), op->GetAttrMap(),
program_.scope); program_.scope);
...@@ -103,7 +103,9 @@ Executor<Dtype, P>::Executor(const framework::Program<Dtype> p, int batch_size, ...@@ -103,7 +103,9 @@ Executor<Dtype, P>::Executor(const framework::Program<Dtype> p, int batch_size,
std::shared_ptr<framework::BlockDesc> to_predict_block = std::shared_ptr<framework::BlockDesc> to_predict_block =
to_predict_program_->Block(0); to_predict_program_->Block(0);
auto &ops = ops_of_block_[*to_predict_block.get()]; auto &ops = ops_of_block_[*to_predict_block.get()];
int i = 0;
for (const auto &op : ops) { for (const auto &op : ops) {
DLOG << "Init op: " << i++ << " " << op->Type();
op->Init(); op->Init();
} }
} }
...@@ -231,6 +233,13 @@ void Executor<Dtype, P>::InitMemory() { ...@@ -231,6 +233,13 @@ void Executor<Dtype, P>::InitMemory() {
Get_binary_data(program_.model_path + "/" + var_desc->Name()); Get_binary_data(program_.model_path + "/" + var_desc->Name());
char *data = origin_data; char *data = origin_data;
LoadMemory(*var_desc, tensor, &data); LoadMemory(*var_desc, tensor, &data);
// DLOG << "----- " << var_desc->Name();
// DLOG << "----- " << tensor->dims();
// float *pDouble = tensor->template data<float>();
// for (int i = 0; i < tensor->numel() && i < 30; ++i) {
// std::cout << pDouble[i] << std::endl;
// }
delete origin_data; delete origin_data;
} else { } else {
if (var_desc->Type() == framework::VARTYPE_TYPE_LOD_TENSOR) { if (var_desc->Type() == framework::VARTYPE_TYPE_LOD_TENSOR) {
...@@ -695,6 +704,7 @@ void Executor<Dtype, P>::Predict_From_To(int start, int end) { ...@@ -695,6 +704,7 @@ void Executor<Dtype, P>::Predict_From_To(int start, int end) {
clock_gettime(CLOCK_MONOTONIC, &ts); clock_gettime(CLOCK_MONOTONIC, &ts);
profile[i].runBegin = (uint64_t)ts.tv_sec * 1e9 + ts.tv_nsec; profile[i].runBegin = (uint64_t)ts.tv_sec * 1e9 + ts.tv_nsec;
#endif #endif
DLOG << "Running op: " << i << " " << ops[i]->Type();
ops[i]->Run(); ops[i]->Run();
#ifdef PADDLE_MOBILE_PROFILE #ifdef PADDLE_MOBILE_PROFILE
......
...@@ -53,7 +53,7 @@ class FeedOp : public framework::OperatorBase<DeviceType> { ...@@ -53,7 +53,7 @@ class FeedOp : public framework::OperatorBase<DeviceType> {
auto input_ptr = input->data<float>(); auto input_ptr = input->data<float>();
fpga::format_image(input); fpga::format_image(input);
Tensor *output = param_.Out(); Tensor *output = param_.Out();
auto output_ptr = output->data<half>(); auto output_ptr = output->data<float>();
fpga::BypassArgs args = {fpga::DATA_TYPE_FP32}; fpga::BypassArgs args = {fpga::DATA_TYPE_FP32};
......
...@@ -129,10 +129,13 @@ void ConvAddCompute(const FusionConvAddParam<CPU> &param) { ...@@ -129,10 +129,13 @@ void ConvAddCompute(const FusionConvAddParam<CPU> &param) {
// param.Paddings(), // param.Paddings(),
// param.Filter(), param.Bias(), // param.Filter(), param.Bias(),
// param.Output(), false); // param.Output(), false);
if (param.Paddings()[0] == 0) {
math::DepthwiseConv3x3s2p1v2(param.Input(), param.Filter(), param.Output(), math::DepthwiseConv3x3s2p0(param.Input(), param.Filter(), param.Output(),
*param.Bias(), true); *param.Bias(), true);
} else {
math::DepthwiseConv3x3s2p1v2(param.Input(), param.Filter(),
param.Output(), *param.Bias(), true);
}
} else { } else {
ConvAddBasic(param); ConvAddBasic(param);
} }
......
...@@ -26,7 +26,8 @@ template <> ...@@ -26,7 +26,8 @@ template <>
bool SoftmaxKernel<FPGA, float>::Init(SoftmaxParam<FPGA> *param) { bool SoftmaxKernel<FPGA, float>::Init(SoftmaxParam<FPGA> *param) {
auto input = const_cast<Tensor *>(param->InputX()); auto input = const_cast<Tensor *>(param->InputX());
auto input_ptr = input->data<float>(); auto input_ptr = input->data<float>();
auto float_input = new Tensor(*input); auto float_input = new Tensor;
float_input->mutable_data<float>(input->dims());
fpga::format_fp32_ofm(float_input); fpga::format_fp32_ofm(float_input);
fpga::BypassArgs args = {fpga::DATA_TYPE_FP16}; fpga::BypassArgs args = {fpga::DATA_TYPE_FP16};
......
...@@ -1881,6 +1881,103 @@ void DepthwiseConvAddBNRelu3x3s2p1v2(const Tensor *input, const Tensor *filter, ...@@ -1881,6 +1881,103 @@ void DepthwiseConvAddBNRelu3x3s2p1v2(const Tensor *input, const Tensor *filter,
#endif #endif
} }
// Depthwise 3x3 convolution with stride 2 and no padding.
//
// input   : tensor indexed as [batch, channel, height, width].
// filter  : nine weights per channel, read as filter->data<float>() + c * 9.
// output  : destination tensor; its dims()[2] and dims()[3] give the output
//           height and width.
// bias    : one value per channel, added to every output when if_bias is set.
// if_bias : whether the bias is added at all.
//
// The body is compiled only when __ARM_NEON is defined; otherwise the
// function does nothing.
void DepthwiseConv3x3s2p0(const Tensor *input, const Tensor *filter,
                          Tensor *output, Tensor bias, bool if_bias) {
#if __ARM_NEON
  const int batch_size = static_cast<int>(input->dims()[0]);
  const int input_channel = static_cast<int>(input->dims()[1]);

  const int input_height = static_cast<int>(input->dims()[2]);
  const int input_width = static_cast<int>(input->dims()[3]);
  const int output_height = static_cast<int>(output->dims()[2]);
  const int output_width = static_cast<int>(output->dims()[3]);
  const int inhxw = input_height * input_width;
  const int outhxw = output_height * output_width;
  // First output column that the 3-wide vector loop does not cover.
  const int vec_end = (output_width / 3) * 3;
  float32x4_t zero = vdupq_n_f32(0.0);
  for (int b = 0; b < batch_size; b++) {
    // Advance to the current batch; without this every iteration would
    // process batch 0 again. Uses the same contiguous layout that the
    // per-channel offsets below already rely on.
    const float *batch_input =
        input->data<float>() + b * input_channel * inhxw;
    float *batch_output = output->data<float>() + b * input_channel * outhxw;
#pragma omp parallel for
    for (int c = 0; c < input_channel; c++) {
      const float *filter_data = filter->data<float>() + c * 9;
      const float *input_data = batch_input + c * inhxw;
      float *output_data = batch_output + c * outhxw;
      // Only touch the bias tensor when a bias was requested.
      const float bias_value = if_bias ? bias.data<float>()[c] : 0.0f;
      float w00 = filter_data[0];
      float w01 = filter_data[1];
      float w02 = filter_data[2];
      float w10 = filter_data[3];
      float w11 = filter_data[4];
      float w12 = filter_data[5];
      float w20 = filter_data[6];
      float w21 = filter_data[7];
      float w22 = filter_data[8];

      float32x4_t biasv = vdupq_n_f32(bias_value);

      for (int i = 0; i < output_height; i += 1) {
        // Vector path: three outputs per iteration.
        // NOTE(review): vld2q_f32 loads 8 floats per row while only 7 are
        // used; confirm the input buffer tolerates the extra read at the
        // end of the last row.
        for (int m = 0; m < output_width - 2; m += 3) {
          float *output_ptr = output_data + i * output_width + m;
          // De-interleave each row: val[0] holds the even columns and
          // val[1] the odd columns, starting at column 2 * m.
          float32x4x2_t input_buff_top =
              vld2q_f32(input_data + (2 * i) * input_width + (2 * m));
          float32x4x2_t input_buff_mid =
              vld2q_f32(input_data + (2 * i + 1) * input_width + (2 * m));
          float32x4x2_t input_buff_bottom =
              vld2q_f32(input_data + (2 * i + 2) * input_width + (2 * m));

          float32x4_t in0 = input_buff_top.val[0];
          float32x4_t tmp0 = input_buff_top.val[1];
          // Even columns shifted by one lane: the third tap of each window.
          float32x4_t tmp1 = vextq_f32(in0, zero, 1);

          float32x4_t in2 = input_buff_mid.val[0];
          float32x4_t tmp2 = input_buff_mid.val[1];
          float32x4_t tmp3 = vextq_f32(in2, zero, 1);

          float32x4_t in4 = input_buff_bottom.val[0];
          float32x4_t tmp4 = input_buff_bottom.val[1];
          float32x4_t tmp5 = vextq_f32(in4, zero, 1);

          float32x4_t out0 = vmulq_n_f32(in0, w00);
          out0 = vmlaq_n_f32(out0, tmp0, w01);
          out0 = vmlaq_n_f32(out0, tmp1, w02);
          out0 = vmlaq_n_f32(out0, in2, w10);
          out0 = vmlaq_n_f32(out0, tmp2, w11);
          out0 = vmlaq_n_f32(out0, tmp3, w12);
          out0 = vmlaq_n_f32(out0, in4, w20);
          out0 = vmlaq_n_f32(out0, tmp4, w21);
          out0 = vmlaq_n_f32(out0, tmp5, w22);
          out0 = vaddq_f32(out0, biasv);

          // Only lanes 0..2 are valid results; lane 3 is discarded.
          vst1q_lane_f32(output_ptr, out0, 0);
          vst1q_lane_f32(output_ptr + 1, out0, 1);
          vst1q_lane_f32(output_ptr + 2, out0, 2);
        }
        // Scalar path for the remaining columns. With no padding the window
        // of output (i, j) starts at input (2 * i, 2 * j); the former
        // "2 * i - 1" / "2 * j - 1" offsets belonged to a padding-1 kernel
        // and read before the start of the row.
        for (int j = vec_end; j < output_width; j++) {
          const float *row0 = input_data + (2 * i) * input_width + 2 * j;
          const float *row1 = row0 + input_width;
          const float *row2 = row1 + input_width;
          output_data[i * output_width + j] =
              row0[0] * w00 + row0[1] * w01 + row0[2] * w02 +
              row1[0] * w10 + row1[1] * w11 + row1[2] * w12 +
              row2[0] * w20 + row2[1] * w21 + row2[2] * w22;
          output_data[i * output_width + j] += bias_value;
        }
      }
    }
  }
#endif
}
} // namespace math } // namespace math
} // namespace operators } // namespace operators
} // namespace paddle_mobile } // namespace paddle_mobile
...@@ -43,6 +43,9 @@ void DepthwiseConv3x3s2p1v2(const Tensor *input, const Tensor *filter, ...@@ -43,6 +43,9 @@ void DepthwiseConv3x3s2p1v2(const Tensor *input, const Tensor *filter,
void DepthwiseConvAddBNRelu3x3s2p1v2(const Tensor *input, const Tensor *filter, void DepthwiseConvAddBNRelu3x3s2p1v2(const Tensor *input, const Tensor *filter,
Tensor *output, const Tensor *new_scale, Tensor *output, const Tensor *new_scale,
const Tensor *new_bias, bool if_relu); const Tensor *new_bias, bool if_relu);
void DepthwiseConv3x3s2p0(const Tensor *input, const Tensor *filter,
Tensor *output, Tensor bias, bool if_bias);
} // namespace math } // namespace math
} // namespace operators } // namespace operators
} // namespace paddle_mobile } // namespace paddle_mobile
此差异已折叠。
...@@ -18,6 +18,9 @@ elseif ("yolo" IN_LIST NET) ...@@ -18,6 +18,9 @@ elseif ("yolo" IN_LIST NET)
# gen test # gen test
ADD_EXECUTABLE(test-yolo net/test_yolo.cpp test_helper.h test_include.h executor_for_test.h) ADD_EXECUTABLE(test-yolo net/test_yolo.cpp test_helper.h test_include.h executor_for_test.h)
target_link_libraries(test-yolo paddle-mobile) target_link_libraries(test-yolo paddle-mobile)
# gen test
ADD_EXECUTABLE(test_yolo_combined net/test_yolo_combined.cpp test_helper.h test_include.h executor_for_test.h)
target_link_libraries(test_yolo_combined paddle-mobile)
elseif ("squeezenet" IN_LIST NET) elseif ("squeezenet" IN_LIST NET)
# gen test # gen test
ADD_EXECUTABLE(test-squeezenet net/test_squeezenet.cpp test_helper.h test_include.h executor_for_test.h) ADD_EXECUTABLE(test-squeezenet net/test_squeezenet.cpp test_helper.h test_include.h executor_for_test.h)
...@@ -30,6 +33,27 @@ elseif("FPGAnets" IN_LIST NET) ...@@ -30,6 +33,27 @@ elseif("FPGAnets" IN_LIST NET)
ADD_EXECUTABLE(test-resnet net/test_resnet.cpp test_helper.h test_include.h executor_for_test.h) ADD_EXECUTABLE(test-resnet net/test_resnet.cpp test_helper.h test_include.h executor_for_test.h)
target_link_libraries(test-resnet paddle-mobile) target_link_libraries(test-resnet paddle-mobile)
ADD_EXECUTABLE(test-resnet50 fpga/test_resnet50.cpp test_helper.h test_include.h executor_for_test.h)
target_link_libraries(test-resnet50 paddle-mobile)
ADD_EXECUTABLE(test-fpga-EW fpga/test_fpga_EW.cpp test_helper.h test_include.h executor_for_test.h)
target_link_libraries(test-fpga-EW paddle-mobile)
ADD_EXECUTABLE(test-fpga-conv fpga/test_fpga_conv.cpp test_helper.h test_include.h executor_for_test.h)
target_link_libraries(test-fpga-conv paddle-mobile)
ADD_EXECUTABLE(test-fpga-pooling fpga/test_fpga_pooling.cpp test_helper.h test_include.h executor_for_test.h)
target_link_libraries(test-fpga-pooling paddle-mobile)
ADD_EXECUTABLE(test-fpga-bypass fpga/test_fpga_bypass.cpp test_helper.h test_include.h executor_for_test.h)
target_link_libraries(test-fpga-bypass paddle-mobile)
ADD_EXECUTABLE(test-fpga-softmax fpga/test_fpga_softmax.cpp test_helper.h test_include.h executor_for_test.h)
target_link_libraries(test-fpga-softmax paddle-mobile)
ADD_EXECUTABLE(test-fpga-concat fpga/test_fpga_concat.cpp test_helper.h test_include.h executor_for_test.h)
target_link_libraries(test-fpga-concat paddle-mobile)
ADD_EXECUTABLE(test-tensor-quant fpga/test_tensor_quant.cpp test_helper.h test_include.h executor_for_test.h) ADD_EXECUTABLE(test-tensor-quant fpga/test_tensor_quant.cpp test_helper.h test_include.h executor_for_test.h)
target_link_libraries(test-tensor-quant paddle-mobile) target_link_libraries(test-tensor-quant paddle-mobile)
...@@ -74,6 +98,10 @@ else () ...@@ -74,6 +98,10 @@ else ()
ADD_EXECUTABLE(test-yolo net/test_yolo.cpp test_helper.h test_include.h executor_for_test.h) ADD_EXECUTABLE(test-yolo net/test_yolo.cpp test_helper.h test_include.h executor_for_test.h)
target_link_libraries(test-yolo paddle-mobile) target_link_libraries(test-yolo paddle-mobile)
# gen test
ADD_EXECUTABLE(test_yolo_combined net/test_yolo_combined.cpp test_helper.h test_include.h executor_for_test.h)
target_link_libraries(test_yolo_combined paddle-mobile)
# gen test # gen test
ADD_EXECUTABLE(test-googlenet net/test_googlenet.cpp test_helper.h test_include.h executor_for_test.h) ADD_EXECUTABLE(test-googlenet net/test_googlenet.cpp test_helper.h test_include.h executor_for_test.h)
target_link_libraries(test-googlenet paddle-mobile) target_link_libraries(test-googlenet paddle-mobile)
...@@ -235,13 +263,4 @@ else () ...@@ -235,13 +263,4 @@ else ()
#add_library(test-lib-size SHARED common/test_lib_size.h common/test_lib_size.cpp) #add_library(test-lib-size SHARED common/test_lib_size.h common/test_lib_size.cpp)
endif() endif()
# if(FPGA)
# ADD_EXECUTABLE(test-tensor-quant fpga/test_tensor_quant.cpp test_helper.h test_include.h executor_for_test.h)
# target_link_libraries(test-tensor-quant paddle-mobile)
# endif()
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "../test_include.h"
static const char *g_resnet_combine = "../models/resnet50";
// Loads the combined resnet50 model on the FPGA backend, feeds one tensor
// produced by SetupTensor and runs the prediction.
// Returns 0 on success and 1 when the model cannot be loaded.
int main() {
  DLOG << paddle_mobile::fpga::open_device();
  paddle_mobile::PaddleMobile<paddle_mobile::FPGA> paddle_mobile;
  if (!paddle_mobile.Load(std::string(g_resnet_combine) + "/model",
                          std::string(g_resnet_combine) + "/params", true)) {
    // Report the failure instead of exiting silently with success.
    DLOG << "Failed to load model from " << g_resnet_combine;
    return 1;
  }

  Tensor input_tensor;
  SetupTensor<float>(&input_tensor, {1, 3, 224, 224}, static_cast<float>(0),
                     static_cast<float>(1));

  paddle_mobile.FeedData(input_tensor);
  paddle_mobile.Predict_To(-1);
  //    paddle_mobile.Predict_From(73);
  //    paddle_mobile.Predict_From_To(72, 73);

  DLOG << "Computation done";
  return 0;
}
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <iostream>
#include "../test_helper.h"
#include "../test_include.h"
// Loads the combined yolo model on the CPU backend, runs one prediction on
// the 1x3x416x416 test input and prints the input head, the outputs and the
// measured load and predict times.
int main() {
  paddle_mobile::PaddleMobile<paddle_mobile::CPU> paddle_mobile;
  paddle_mobile.SetThreadNum(4);
  //  ../../../test/models/googlenet
  //  ../../../test/models/mobilenet
  auto time1 = time();

  if (paddle_mobile.Load(std::string(g_yolo_combined) + "/model",
                         std::string(g_yolo_combined) + "/params", true)) {
    auto time2 = time();
    // Measure from before Load to after Load; time_diff(time1, time1)
    // always reported zero.
    std::cout << "load cost :" << time_diff(time1, time2) << "ms" << std::endl;

    std::vector<int64_t> dims{1, 3, 416, 416};
    std::vector<float> input;

    GetInput<float>(g_test_image_desktop_1_3_416_416_nchw_float, &input, dims);
    std::cout << "input.size(): " << input.size() << std::endl;
    // Print at most the first 100 values; never read past the end.
    for (size_t j = 0; j < input.size() && j < 100; ++j) {
      std::cout << j << " :  " << input[j] << std::endl;
    }
    //    // warm up with ten runs
    //    for (int i = 0; i < 10; ++i) {
    //      paddle_mobile.Predict(input, dims);
    //    }
    auto time3 = time();
    const auto vector_out = paddle_mobile.Predict(input, dims);
    // Stop the clock right after Predict so printing is not timed.
    auto time4 = time();
    std::cout << "--------------------------------------------" << std::endl;

    for (float i : vector_out) {
      std::cout << i << std::endl;
    }

    std::cout << "--------------------------------------------" << std::endl;

    std::cout << "load cost :" << time_diff(time1, time2) << "ms" << std::endl;

    // Exactly one prediction ran, so the elapsed time is not averaged.
    std::cout << "predict cost :" << time_diff(time3, time4) << "ms"
              << std::endl;
  }
  return 0;
}
...@@ -41,12 +41,15 @@ static const char *g_resnet_50 = "../models/resnet_50"; ...@@ -41,12 +41,15 @@ static const char *g_resnet_50 = "../models/resnet_50";
static const char *g_resnet = "../models/resnet"; static const char *g_resnet = "../models/resnet";
static const char *g_googlenet_combine = "../models/googlenet_combine"; static const char *g_googlenet_combine = "../models/googlenet_combine";
static const char *g_yolo = "../models/yolo"; static const char *g_yolo = "../models/yolo";
static const char *g_yolo_combined = "../models/yolo_combined";
static const char *g_fluid_fssd_new = "../models/fluid_fssd_new"; static const char *g_fluid_fssd_new = "../models/fluid_fssd_new";
static const char *g_test_image_1x3x224x224 = static const char *g_test_image_1x3x224x224 =
"../images/test_image_1x3x224x224_float"; "../images/test_image_1x3x224x224_float";
static const char *g_test_image_1x3x224x224_banana = static const char *g_test_image_1x3x224x224_banana =
"../images/input_3x224x224_banana"; "../images/input_3x224x224_banana";
static const char *g_test_image_desktop_1_3_416_416_nchw_float =
"../images/in_put_1_3_416_416_2";
static const char *g_hand = "../images/hand_image"; static const char *g_hand = "../images/hand_image";
static const char *g_imgfssd_ar = "../images/test_image_ssd_ar"; static const char *g_imgfssd_ar = "../images/test_image_ssd_ar";
static const char *g_imgfssd_ar1 = "../images/003_0001.txt"; static const char *g_imgfssd_ar1 = "../images/003_0001.txt";
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册