Commit dee0175f authored by xiebaiyuan

Merge remote-tracking branch 'upstream/develop' into develop

@@ -14,11 +14,9 @@ limitations under the License. */
 #include "api.h"
 #include <fcntl.h>
-#include <stdio.h>
-#include <stdlib.h>
 #include <sys/ioctl.h>
 #include <algorithm>
-#include <cstring>
+#include <memory>
 #include "bias_scale.h"
 #include "filter.h"
 #include "image.h"
@@ -48,6 +46,7 @@ int open_device() {
 // memory management;
 void *fpga_malloc(size_t size) {
+  DLOG << size << " bytes allocated";
 #ifdef PADDLE_MOBILE_OS_LINUX
   return reinterpret_cast<void *>(
       mmap64(NULL, size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0));
@@ -68,6 +67,20 @@ void fpga_copy(void *dest, const void *src, size_t num) {
   memcpy(dest, src, num);
 }
 
+int fpga_flush(void *address, size_t size) {
+  struct MemoryCacheArgs args;
+  args.address = address;
+  args.size = size;
+  return do_ioctl(IOCTL_MEMCACHE_FLUSH, &args);
+}
+
+int fpga_invalidate(void *address, size_t size) {
+  struct MemoryCacheArgs args;
+  args.address = address;
+  args.size = size;
+  return do_ioctl(IOCTL_MEMCACHE_INVAL, &args);
+}
+
 int ComputeFpgaConv(const struct WrapperConvArgs &args) {
 #ifdef FPGA_TEST_MODE
   /*DLOG << " relu_enabled:" << args.relu_enabled
@@ -145,8 +158,8 @@ int ComputeFpgaEWAdd(const struct EWAddArgs &args) {
 }
 
 int PerformBypass(const struct BypassArgs &args) {
 #ifdef FPGA_TEST_MODE
-  DLOG << " layout_type:" << args.layout_type
-       << " convert_type:" << args.convert_type;
+  DLOG << " input_type:" << args.input_data_type
+       << " input_layout_type:" << args.input_layout_type;
   DLOG << " image_address:" << args.image.address
        << " image_scale_address:" << args.image.scale_address
        << " image_channels:" << args.image.channels
@@ -181,10 +194,19 @@ void format_image(framework::Tensor *image_tensor) {
 void format_ofm(framework::Tensor *ofm_tensor) {
   auto dims = ofm_tensor->dims();
-  auto channel = dims[1], height = dims[2], width = dims[3];
-  size_t memory_size =
-      height * align_to_x(channel * width, IMAGE_ALIGNMENT) * sizeof(half);
-  ofm_tensor->reset_data_ptr(fpga_malloc(memory_size));
+  size_t memory_size = 0;
+  if (dims.size() == 4) {
+    auto channel = dims[1], height = dims[2], width = dims[3];
+    memory_size =
+        height * align_to_x(channel * width, IMAGE_ALIGNMENT) * sizeof(half);
+  } else if (dims.size() == 2) {
+    memory_size = align_to_x(dims[1], IMAGE_ALIGNMENT) * sizeof(half);
+  } else {
+    DLOG << "Wrong ofm dimension";
+  }
+  auto p = fpga_malloc(memory_size);
+  memset(p, 0, memory_size);
+  ofm_tensor->reset_data_ptr(p);
 }
 
 float filter_find_max(framework::Tensor *filter_tensor) {
@@ -200,7 +222,7 @@ int get_plit_num(framework::Tensor *filter_tensor) {
   return filter::calc_split_num(num, div_capacity);
 }
 
-int get_element_num_per_div(framework::Tensor *filter_tensor, int group_num) {
+int get_filter_num_per_div(framework::Tensor *filter_tensor, int group_num) {
   auto dims = filter_tensor->dims();
   auto chw = dims[1] * dims[2] * dims[3];
   auto num = dims[0];
@@ -279,7 +301,7 @@ void fill_conv_arg(struct WrapperConvArgs *arg, framework::Tensor *input,
   arg->concat_arg.image_out = out_ptr;
 
   const int channel = (int)out->dims()[1];
-  int element_num_per_div = fpga::get_element_num_per_div(filter, group_num);
+  int filter_num_per_div = fpga::get_filter_num_per_div(filter, group_num);
   int element_num = fpga::get_aligned_filter_element_num(
       filter->dims()[1] * filter->dims()[2] * filter->dims()[3]);
@@ -297,12 +319,14 @@ void fill_conv_arg(struct WrapperConvArgs *arg, framework::Tensor *input,
     arg->conv_args[i].image.scale_address = input->scale;
     arg->conv_args[i].image.pad_height = (uint32_t)padding_h;
     arg->conv_args[i].image.pad_width = (uint32_t)padding_w;
-    arg->conv_args[i].filter_address = &((int8_t *)filter_ptr)[i * element_num];
-    arg->conv_args[i].sb_address = &((int8_t *)bs_ptr)[i * element_num];
+    arg->conv_args[i].filter_scale_address = filter->scale;
+    arg->conv_args[i].filter_address =
+        &((int8_t *)filter_ptr)[i * element_num * filter_num_per_div];
+    arg->conv_args[i].sb_address = &bs_ptr[i * filter_num_per_div * 2];
     arg->conv_args[i].filter_num =
         (uint32_t)(i == n - 1 ? fpga::get_aligned_filter_num(
-                                    channel - (n - 1) * element_num_per_div)
-                              : element_num_per_div);
+                                    channel - (n - 1) * filter_num_per_div)
+                              : filter_num_per_div);
 
     if (n > 1) {
       arg->conv_args[i].output.scale_address =
......
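The new fpga_flush()/fpga_invalidate() pair wraps the IOCTL_MEMCACHE_FLUSH/IOCTL_MEMCACHE_INVAL driver calls declared in api.h below: flush pushes CPU writes out of the cache before the device reads a shared buffer, and invalidate discards stale cache lines before the CPU reads device output. A minimal usage sketch; the buffer sizes and the op launch are placeholders, not code from this patch:

#include <cstddef>
#include "api.h"  // fpga_malloc, fpga_free, fpga_flush, fpga_invalidate

// Sketch: typical ordering around one FPGA op on cached, shared memory.
void run_op_sketch(size_t in_bytes, size_t out_bytes) {
  void *in = paddle_mobile::fpga::fpga_malloc(in_bytes);
  void *out = paddle_mobile::fpga::fpga_malloc(out_bytes);
  // ... CPU fills the input buffer here ...
  paddle_mobile::fpga::fpga_flush(in, in_bytes);  // make CPU writes visible
  // ... launch the device op here, e.g. ComputeFpgaConv(...) ...
  paddle_mobile::fpga::fpga_invalidate(out, out_bytes);  // drop stale lines
  // ... CPU reads the output buffer here ...
  paddle_mobile::fpga::fpga_free(in);
  paddle_mobile::fpga::fpga_free(out);
}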
@@ -25,23 +25,14 @@ limitations under the License. */
 namespace paddle_mobile {
 namespace fpga {
 
-int open_device();
-int close_device();
-
-void* fpga_malloc(size_t size);
-void fpga_free(void* ptr);
-void fpga_copy(void* dst, const void* src, size_t num);
-
-enum DataConvertType {
-  DATA_NO_CONVERT = 0,
-  DATA_FP32_TO_FP16 = 1,
-  DATA_FP16_TO_FP32 = 2,
+enum DataType {
+  DATA_TYPE_FP32 = 1,
+  DATA_TYPE_FP16 = 0,
 };
 
-enum LayoutConvertType {
-  LAYOUT_NO_CONVERT = 0,
-  LAYOUT_CHW_TO_HWC = 1,
-  LAYOUT_HWC_TO_CHW = 2,
+enum LayoutType {
+  LAYOUT_CHW = 1,
+  LAYOUT_HWC = 0,
 };
 
 struct VersionArgs {
@@ -122,16 +113,18 @@ struct PoolingArgs {
 struct EWAddArgs {
   bool relu_enabled;
-  float const0;  // output0 = const0 x input0 + const1 x input1;
-  float const1;
+  uint32_t const0;  // output0 = const0 x input0 + const1 x input1;
+  uint32_t const1;
   struct ImageInputArgs image0;
   struct ImageInputArgs image1;
   struct ImageOutputArgs output;
 };
 
 struct BypassArgs {
-  enum DataConvertType convert_type;
-  enum LayoutConvertType layout_type;
+  enum DataType input_data_type;
+  enum DataType output_data_type;
+  enum LayoutType input_layout_type;
+  enum LayoutType output_layout_type;
   struct ImageInputArgs image;
   struct ImageOutputArgs output;
 };
@@ -141,6 +134,16 @@ struct FpgaRegWriteArgs {
   uint64_t value;
 };
 
+struct FpgaRegReadArgs {
+  uint64_t address;
+  uint64_t value;
+};
+
+struct MemoryCacheArgs {
+  void* address;
+  size_t size;
+};
+
 #define IOCTL_FPGA_MAGIC 'FPGA'
 
 #define IOCTL_VERSION _IOW(IOCTL_FPGA_MAGIC, 01, struct VersionArgs)
@@ -148,6 +151,8 @@ struct FpgaRegWriteArgs {
 #define IOCTL_SEPARATOR_0 10
 
 #define IOCTL_MEM_COPY _IOW(IOCTL_FPGA_MAGIC, 11, struct MemoryCopyArgs)
+#define IOCTL_MEMCACHE_INVAL _IOW(IOCTL_FPGA_MAGIC, 12, struct MemoryCacheArgs)
+#define IOCTL_MEMCACHE_FLUSH _IOW(IOCTL_FPGA_MAGIC, 13, struct MemoryCacheArgs)
 
 #define IOCTL_SEPARATOR_1 20
@@ -184,6 +189,15 @@ enum FPGA_ERR_TYPE {
 //============================== API =============================
 
+int open_device();
+int close_device();
+
+void* fpga_malloc(size_t size);
+void fpga_free(void* ptr);
+void fpga_copy(void* dst, const void* src, size_t num);
+int fpga_flush(void* address, size_t size);
+int fpga_invalidate(void* address, size_t size);
+
 int PerformBypass(const struct BypassArgs& args);
 int ComputeFpgaConv(const struct WrapperConvArgs& args);
 int ComputeFpgaPool(const struct PoolingArgs& args);
@@ -196,7 +210,7 @@ void format_image(framework::Tensor* image_tensor);
 void format_ofm(framework::Tensor* ofm_tensor);  // only allocate memory
 
 float filter_find_max(framework::Tensor* filter_tensor);
-int get_element_num_per_div(framework::Tensor* filter_tensor, int group_num);
+int get_filter_num_per_div(framework::Tensor* filter_tensor, int group_num);
 int get_plit_num(framework::Tensor* filter_tensor);
 int get_aligned_filter_element_num(int chw);
 int get_aligned_filter_num(int num);
......
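With the reworked header, a bypass no longer names a combined conversion (DATA_FP32_TO_FP16, LAYOUT_CHW_TO_HWC); it states the input and output data type and layout independently, and the driver derives the conversion from the pair. A hedged sketch of an FP16-to-FP32 readback, mirroring how the softmax kernel later in this diff fills the struct; the shape numbers are made up:

#include "api.h"

namespace fpga = paddle_mobile::fpga;

// Sketch: an FP16/HWC device image that should come back as FP32/CHW.
fpga::BypassArgs make_readback_args(void *src, void *dst) {
  fpga::BypassArgs args;
  args.input_data_type = fpga::DATA_TYPE_FP16;
  args.output_data_type = fpga::DATA_TYPE_FP32;
  args.input_layout_type = fpga::LAYOUT_HWC;
  args.output_layout_type = fpga::LAYOUT_CHW;
  args.image.address = src;
  args.image.height = 32;  // made-up shape
  args.image.width = 32;
  args.image.channels = 8;
  args.output.address = dst;
  return args;  // hand to PerformBypass(args)
}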
@@ -79,6 +79,7 @@ void format_bias_scale_array(float **bias_scale_array,
   int element_num_after_division =
       align_to_x(element_num_per_division, BS_NUM_ALIGNMENT);
   interleave(bias_scale_array, div_num * element_num_after_division);
+  fpga_flush(*bias_scale_array, 2 * element_num_after_division * sizeof(float));
 }
 
 }  // namespace bias_scale
......
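The flush length follows from the interleaved layout: after interleave(), each division holds its bias and scale values together, so element_num_after_division entries of each occupy 2 * element_num_after_division floats. A toy illustration of that layout assumption, not the library's actual interleave code:

#include <cstddef>

// Toy sketch: assume bias[i] and scale[i] end up adjacent after the
// interleave, which is why the flush above covers 2 * n * sizeof(float).
void interleave_sketch(const float *bias, const float *scale, float *out,
                       size_t n) {
  for (size_t i = 0; i < n; ++i) {
    out[2 * i] = bias[i];
    out[2 * i + 1] = scale[i];
  }
}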
@@ -101,7 +101,6 @@ void align_element(char **data_in, int num, int chw) {
   int j = 0;
   int align_chw = align_to_x(chw, FILTER_ELEMENT_ALIGNMENT);
   if (align_chw != chw) {
-    printf("align %d \n", align_chw);
     char *tmp = *data_in;
     char *data_tmp = (char *)fpga_malloc(num * align_chw * sizeof(char));
@@ -207,6 +206,8 @@ void format_filter(float **data_in, int num, int channel, int height, int width,
   align_num(quantize_data, num_per_div_before_alignment, num, chw);
   reorder(quantize_data, num_after_alignment, chw);
   interleave(quantize_data, num_after_alignment, chw);
+  fpga_flush(*quantize_data, align_to_x(chw, FILTER_ELEMENT_ALIGNMENT) *
+                                 num_after_alignment * sizeof(char));
 }
 
 }  // namespace filter
......
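Both flush lengths above are computed with align_to_x, so the cache operation covers the padded buffer rather than just the logical payload. The helper's assumed contract is plain round-up-to-multiple; a sketch consistent with every call site in this diff:

#include <cassert>

// Assumed contract of align_to_x: round num up to a multiple of alignment.
inline int align_to_x_sketch(int num, int alignment) {
  return (num + alignment - 1) / alignment * alignment;
}

int main() {
  assert(align_to_x_sketch(9, 16) == 16);   // pad 9 elements up to 16
  assert(align_to_x_sketch(32, 16) == 32);  // already aligned
  assert(align_to_x_sketch(33, 16) == 48);
  return 0;
}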
@@ -38,7 +38,6 @@ void convert_to_hwc(float **data_in, int channel, int height, int width) {
 }
 
 void align_element_conv(float **data_in, int height, int cw) {
-  int i = 0;
   int h = 0;
   int align_cw = align_to_x(cw, IMAGE_ALIGNMENT);
   if (align_cw != cw) {
@@ -60,6 +59,8 @@ void align_element_conv(float **data_in, int height, int cw) {
 void format_image(float **data_in, int channel, int height, int width) {
   convert_to_hwc(data_in, channel, height, width);
   align_element_conv(data_in, height, channel * width);
+  fpga_flush(*data_in, align_to_x(channel * width, IMAGE_ALIGNMENT) * height *
+                           sizeof(float));
 }
 
 void concat_images(int16_t **images_in, float **scales_in, void *image_out,
@@ -77,6 +78,10 @@ void concat_images(int16_t **images_in, float **scales_in, void *image_out,
   for (i = 0; i < image_num; i++) {
     each_out_line_channel += channel_num[i];
     *scale_out = std::max(*scale_out, scales_in[i][0]);
+    fpga_invalidate(images_in[i],
+                    height *
+                        align_to_x(channel_num[i] * width, IMAGE_ALIGNMENT) *
+                        sizeof(int16_t));
   }
   align_each_out_area_cw =
       align_to_x(each_out_line_channel * width, IMAGE_ALIGNMENT);
@@ -97,6 +102,8 @@ void concat_images(int16_t **images_in, float **scales_in, void *image_out,
       }
     }
   }
+
+  fpga_flush(image_out, height * align_each_out_area_cw * sizeof(int16_t));
 }
 
 }  // namespace image
......
@@ -56,8 +56,11 @@ class FeedOp : public framework::OperatorBase<DeviceType> {
     auto output_ptr = output->mutable_data<half>();
     fpga::BypassArgs args;
 
-    args.convert_type = fpga::DATA_FP32_TO_FP16;
-    args.layout_type = fpga::LAYOUT_NO_CONVERT;
+    args.input_data_type = fpga::DATA_TYPE_FP32;
+    args.output_data_type = fpga::DATA_TYPE_FP16;
+    args.input_layout_type = fpga::LAYOUT_CHW;
+    args.output_layout_type = fpga::LAYOUT_HWC;
     args.image.address = (void *)input_ptr;
     args.image.channels = input->dims()[1];
     args.image.height = input->dims()[2];
......
@@ -23,7 +23,7 @@ template <>
 bool ConvAddBNKernel<FPGA, float>::Init(FusionConvAddBNParam<FPGA> *param) {
   bool relu_enabled = false;
   auto input = const_cast<Tensor *>(param->Input());
-  auto input_ptr = input->data<float>();
   auto bias = param->Bias();
   auto bias_ptr = bias->data<float>();
   auto filter = const_cast<Tensor *>(param->Filter());
@@ -62,7 +62,7 @@ bool ConvAddBNKernel<FPGA, float>::Init(FusionConvAddBNParam<FPGA> *param) {
   fpga::format_filter(filter, max_value, param->Groups());
 
   int element_num_per_div =
-      fpga::get_element_num_per_div(filter, param->Groups());
+      fpga::get_filter_num_per_div(filter, param->Groups());
   fpga::format_bias_scale_array(&bs_ptr, element_num_per_div, channel);
 
   fpga::format_ofm(out);
@@ -80,7 +80,6 @@ void ConvAddBNKernel<FPGA, float>::Compute(
     const FusionConvAddBNParam<FPGA> &param) const {
   fpga::ComputeFpgaConv(param.FpgaArgs());
 }
-template class ConvAddBNKernel<FPGA, float>;
 
 }  // namespace operators
 }  // namespace paddle_mobile
......
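All five fused conv kernels in this diff now share the same Init choreography and delegate the per-split bookkeeping to fill_conv_arg. A condensed sketch of the shared sequence, with the argument order taken from the updated test at the bottom of this diff; the bias/scale preparation that differs per kernel is elided, and the helper below is illustrative, not part of the patch:

// Sketch of the Init steps the FPGA conv kernels now have in common.
bool conv_init_sketch(Tensor *input, Tensor *filter, Tensor *out,
                      float *bs_ptr, bool relu_enabled, int groups,
                      int stride_h, int stride_w, int pad_h, int pad_w) {
  float max_value = fpga::filter_find_max(filter);  // quantization range
  fpga::format_filter(filter, max_value, groups);   // quantize + reorder
  int element_num_per_div = fpga::get_filter_num_per_div(filter, groups);
  fpga::format_bias_scale_array(&bs_ptr, element_num_per_div,
                                (int)out->dims()[1]);
  fpga::format_ofm(out);  // allocate the FPGA-side output buffer
  fpga::WrapperConvArgs conv_arg;
  fpga::fill_conv_arg(&conv_arg, input, out, filter, relu_enabled, groups,
                      stride_h, stride_w, pad_h, pad_w, bs_ptr);
  // the real kernels finish with param->SetFpgaArgs(conv_arg)
  return true;
}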
@@ -24,7 +24,6 @@ bool ConvAddBNReluKernel<FPGA, float>::Init(
     FusionConvAddBNReluParam<FPGA> *param) {
   bool relu_enabled = true;
   auto input = const_cast<Tensor *>(param->Input());
-  auto input_ptr = input->data<float>();
   const Tensor *bias = param->Bias();
   auto bias_ptr = bias->data<float>();
   auto filter = const_cast<Tensor *>(param->Filter());
@@ -58,14 +57,12 @@ bool ConvAddBNReluKernel<FPGA, float>::Init(
   float max_value = fpga::filter_find_max(filter);
   fpga::format_filter(filter, max_value, param->Groups());
-  auto filter_ptr = filter->data<float>();
 
   int element_num_per_div =
-      fpga::get_element_num_per_div(filter, param->Groups());
+      fpga::get_filter_num_per_div(filter, param->Groups());
   fpga::format_bias_scale_array(&bs_ptr, element_num_per_div, channel);
 
   fpga::format_ofm(out);
-  auto out_ptr = out->mutable_data<float>();
 
   fpga::WrapperConvArgs conv_arg;
   fpga::fill_conv_arg(&conv_arg, input, out, filter, relu_enabled,
@@ -80,7 +77,6 @@ void ConvAddBNReluKernel<FPGA, float>::Compute(
     const FusionConvAddBNReluParam<FPGA> &param) const {
   fpga::ComputeFpgaConv(param.FpgaArgs());
 }
-template class ConvAddBNReluKernel<FPGA, float>;
 
 }  // namespace operators
 }  // namespace paddle_mobile
......
@@ -23,7 +23,6 @@ template <>
 bool ConvAddReluKernel<FPGA, float>::Init(FusionConvAddReluParam<FPGA> *param) {
   bool relu_enabled = true;
   auto input = const_cast<Tensor *>(param->Input());
-  auto input_ptr = input->data<float>();
   const Tensor *bias = param->Bias();
   auto bias_ptr = bias->data<float>();
   auto filter = const_cast<Tensor *>(param->Filter());
@@ -40,14 +39,12 @@ bool ConvAddReluKernel<FPGA, float>::Init(FusionConvAddReluParam<FPGA> *param) {
   float max_value = fpga::filter_find_max(filter);
   fpga::format_filter(filter, max_value, param->Groups());
-  auto filter_ptr = filter->data<float>();
 
   int element_num_per_div =
-      fpga::get_element_num_per_div(filter, param->Groups());
+      fpga::get_filter_num_per_div(filter, param->Groups());
   fpga::format_bias_scale_array(&bs_ptr, element_num_per_div, channel);
 
   fpga::format_ofm(out);
-  auto out_ptr = out->mutable_data<float>();
 
   fpga::WrapperConvArgs conv_arg;
   fpga::fill_conv_arg(&conv_arg, input, out, filter, relu_enabled,
@@ -62,7 +59,6 @@ void ConvAddReluKernel<FPGA, float>::Compute(
     const FusionConvAddReluParam<FPGA> &param) const {
   fpga::ComputeFpgaConv(param.FpgaArgs());
 }
-template class ConvAddReluKernel<FPGA, float>;
 
 }  // namespace operators
 }  // namespace paddle_mobile
......
@@ -24,7 +24,6 @@ template <>
 bool ConvBNKernel<FPGA, float>::Init(FusionConvBNParam<FPGA> *param) {
   bool relu_enabled = false;
   auto input = const_cast<Tensor *>(param->Input());
-  auto input_ptr = input->data<float>();
   auto filter = const_cast<Tensor *>(param->Filter());
   auto out = param->Output();
   auto bn_mean_ptr = param->InputMean()->data<float>();
@@ -55,14 +54,12 @@ bool ConvBNKernel<FPGA, float>::Init(FusionConvBNParam<FPGA> *param) {
   float max_value = fpga::filter_find_max(filter);
   fpga::format_filter(filter, max_value, param->Groups());
-  auto filter_ptr = filter->data<float>();
 
   int element_num_per_div =
-      fpga::get_element_num_per_div(filter, param->Groups());
+      fpga::get_filter_num_per_div(filter, param->Groups());
   fpga::format_bias_scale_array(&bs_ptr, element_num_per_div, channel);
 
   fpga::format_ofm(out);
-  auto out_ptr = out->mutable_data<float>();
 
   fpga::WrapperConvArgs conv_arg;
   fpga::fill_conv_arg(&conv_arg, input, out, filter, relu_enabled,
@@ -77,7 +74,6 @@ void ConvBNKernel<FPGA, float>::Compute(
     const FusionConvBNParam<FPGA> &param) const {
   fpga::ComputeFpgaConv(param.FpgaArgs());
 }
-template class ConvBNKernel<FPGA, float>;
 
 }  // namespace operators
 }  // namespace paddle_mobile
......
@@ -23,7 +23,6 @@ template <>
 bool ConvBNReluKernel<FPGA, float>::Init(FusionConvBNReluParam<FPGA> *param) {
   bool relu_enabled = true;
   auto input = const_cast<Tensor *>(param->Input());
-  auto input_ptr = input->data<float>();
   auto filter = const_cast<Tensor *>(param->Filter());
   auto out = param->Output();
   auto bn_mean_ptr = param->InputMean()->data<float>();
@@ -52,27 +51,12 @@ bool ConvBNReluKernel<FPGA, float>::Init(FusionConvBNReluParam<FPGA> *param) {
   float max_value = fpga::filter_find_max(filter);
   fpga::format_filter(filter, max_value, param->Groups());
-  auto filter_ptr = filter->data<float>();
 
   int element_num_per_div =
-      fpga::get_element_num_per_div(filter, param->Groups());
+      fpga::get_filter_num_per_div(filter, param->Groups());
   fpga::format_bias_scale_array(&bs_ptr, element_num_per_div, channel);
 
   fpga::format_ofm(out);
-  auto out_ptr = out->mutable_data<float>();
-
-  fpga::WrapperConvArgs convArgs;
-  convArgs.group_num = (uint32_t)param->Groups();
-  convArgs.split_num = (uint32_t)fpga::get_plit_num(filter);
-  convArgs.filter_num = (uint32_t)filter->dims()[0];
-  convArgs.output.address = out_ptr;
-  convArgs.output.scale_address = out->scale;
-  convArgs.conv_args = (fpga::ConvArgs *)fpga::fpga_malloc(
-      convArgs.split_num * sizeof(fpga::ConvArgs));
-  param->SetFpgaArgs(convArgs);
-
-  int element_num = fpga::get_aligned_filter_element_num(
-      filter->dims()[1] * filter->dims()[2] * filter->dims()[3]);
 
   fpga::WrapperConvArgs conv_arg;
   fpga::fill_conv_arg(&conv_arg, input, out, filter, relu_enabled,
@@ -87,7 +71,6 @@ void ConvBNReluKernel<FPGA, float>::Compute(
     const FusionConvBNReluParam<FPGA> &param) const {
   fpga::ComputeFpgaConv(param.FpgaArgs());
 }
-template class ConvBNReluKernel<FPGA, float>;
 
 }  // namespace operators
 }  // namespace paddle_mobile
......
@@ -27,13 +27,7 @@ bool DropoutKernel<FPGA, float>::Init(DropoutParam<FPGA> *param) {
 template <>
 void DropoutKernel<FPGA, float>::Compute(
-    const DropoutParam<FPGA> &param) const {
-  // auto *input_x = param.InputX();
-  // auto *out = param.Out();
-  // auto input_x_ptr = input_x->data<float>();
-  // auto out_ptr = out->mutable_data<float>();
-  // out_ptr = const_cast<float *>(input_x_ptr);
-}
+    const DropoutParam<FPGA> &param) const {}
 
 }  // namespace operators
 }  // namespace paddle_mobile
......
@@ -21,7 +21,6 @@ template <>
 bool FusionFcReluKernel<FPGA, float>::Init(FusionFcReluParam<FPGA> *param) {
   bool relu_enabled = true;
   auto input_x = const_cast<LoDTensor *>(param->InputX());
-  auto input_x_ptr = input_x->data<float>();
   auto filter = const_cast<Tensor *>(param->InputY());
   auto input_z = param->InputZ();
   auto input_z_ptr = input_z->data<float>();
@@ -47,12 +46,10 @@ bool FusionFcReluKernel<FPGA, float>::Init(FusionFcReluParam<FPGA> *param) {
   filter->Resize(framework::make_ddim({num, filter_channel, height, width}));
   float max_value = fpga::filter_find_max(filter);
   fpga::format_filter(filter, max_value, 1);
-  auto filter_ptr = filter->data<float>();
 
-  int element_num_per_div = fpga::get_element_num_per_div(filter, 1);
+  int element_num_per_div = fpga::get_filter_num_per_div(filter, 1);
   fpga::format_bias_scale_array(&bs_ptr, element_num_per_div, channel);
-  auto out_ptr = out->mutable_data<float>();
+  fpga::format_ofm(out);
 
   fpga::WrapperConvArgs conv_arg;
   fpga::fill_conv_arg(&conv_arg, input_x, out, filter, relu_enabled, 1, 1, 1, 0,
......
@@ -22,7 +22,6 @@ template <>
 bool FusionFcKernel<FPGA, float>::Init(FusionFcParam<FPGA> *param) {
   bool relu_enabled = false;
   auto input_x = const_cast<LoDTensor *>(param->InputX());
-  auto input_x_ptr = input_x->data<float>();
   auto filter = const_cast<Tensor *>(param->InputY());
   const Tensor *input_z = param->InputZ();
   auto input_z_ptr = input_z->data<float>();
@@ -48,12 +47,10 @@ bool FusionFcKernel<FPGA, float>::Init(FusionFcParam<FPGA> *param) {
   filter->Resize(framework::make_ddim({num, filter_channel, height, width}));
   float max_value = fpga::filter_find_max(filter);
   fpga::format_filter(filter, max_value, 1);
-  auto filter_ptr = filter->data<float>();
 
-  int element_num_per_div = fpga::get_element_num_per_div(filter, 1);
+  int element_num_per_div = fpga::get_filter_num_per_div(filter, 1);
   fpga::format_bias_scale_array(&bs_ptr, element_num_per_div, channel);
-  auto out_ptr = out->mutable_data<float>();
+  fpga::format_ofm(out);
 
   fpga::WrapperConvArgs conv_arg;
   fpga::fill_conv_arg(&conv_arg, input_x, out, filter, relu_enabled, 1, 1, 1, 0,
......
@@ -50,9 +50,7 @@ bool PoolKernel<FPGA, float>::Init(PoolParam<FPGA> *param) {
 template <>
 void PoolKernel<FPGA, float>::Compute(const PoolParam<FPGA> &param) const {
-#ifdef PADDLE_MOBILE_FPGA
   fpga::ComputeFpgaPool(param.FpgaArgs());
-#endif
 }
 
 }  // namespace operators
 }  // namespace paddle_mobile
......
@@ -25,30 +25,41 @@ namespace operators {
 template <>
 bool SoftmaxKernel<FPGA, float>::Init(SoftmaxParam<FPGA> *param) {
   const Tensor *input = param->InputX();
   auto input_ptr = input->data<float>();
-  auto output = param->Out();
-  auto output_ptr = output->mutable_data<float>();
+  auto output_ptr = param->Out();
+  Tensor *floatInput = new Tensor(*input);
 
   fpga::BypassArgs args;
-  args.convert_type = fpga::DATA_FP16_TO_FP32;
-  args.layout_type = fpga::LAYOUT_NO_CONVERT;
+  args.input_layout_type = fpga::LAYOUT_HWC;
+  args.output_layout_type = fpga::LAYOUT_CHW;
+  args.input_data_type = fpga::DATA_TYPE_FP16;
+  args.output_data_type = fpga::DATA_TYPE_FP32;
   args.image.address = (void *)(input_ptr);
   args.image.height = (uint32_t)input->dims()[0];
   args.image.width = (uint32_t)input->dims()[1];
   args.image.channels = 1;
-  args.output.address = output_ptr;
-  param->SetFpgaArgs(args);
+  args.output.address = (void *)floatInput->mutable_data<float>();
+
+  param->SetFloatInput(floatInput);
+  param->SetFpgaArgs(args);
   return true;
 }
 
 template <>
 void SoftmaxKernel<FPGA, float>::Compute(
     const SoftmaxParam<FPGA> &param) const {
-  //  SoftmaxCompute<float>(param);
+  DLOG << "======================================= FPGA SoftMAX "
+          "===============================================";
+  const Tensor *in_x = param.FloatInput();
+  Tensor *out = param.Out();
+  fpga::fpga_flush((void *)in_x->data<float>(), in_x->memory_size());
+  fpga::PerformBypass(param.FpgaArgs());
+  fpga::fpga_invalidate(out->data<float>(), out->memory_size());
+
+  auto x_dims = in_x->dims();
+  out->Resize(x_dims);
+  math::SoftmaxFuntor<CPU, float>()(in_x, out);
 }
-template class SoftmaxKernel<FPGA, float>;
 
 }  // namespace operators
 }  // namespace paddle_mobile
......
@@ -74,7 +74,7 @@ class Im2ColFunctor<ColFormat::kCFO, CPU, T> {
   const int isize = im_height;
   bool pad1 = padding[0] > 0;
   bool pad2 =
-      (pad1 &&
+      (pad1 && padding[1] &&
        (((isize - 2 * padding[0] + filter_height) % stride[0] == 0) ? 1 : 0));
   int fill = isize % 2;
   if (stride[0] == 1 && filter_height == 3 && pad1 && pad2 &&
......
@@ -36,13 +36,35 @@ void matmul<float>(const framework::Tensor &matrix_a, bool trans_a,
   int N = dim_out[1];
   int K = (!trans_a) ? dim_a[1] : dim_a[0];
 
+  if (trans_a) {
+    int numel = matrix_a.numel();
+    int m = matrix_a.dims()[0];
+    int n = matrix_a.dims()[1];
+    float *tmp = (float *)(matrix_a.data<float>());
+    float *a = static_cast<float *>(
+        paddle_mobile::memory::Alloc(sizeof(float) * numel));
+    int index = 0;
+    for (int j = 0; j < n; j++) {
+      for (int i = 0; i < m; i++) {
+        a[index++] = tmp[i * n + j];
+      }
+    }
+#ifdef _OPENMP
+    Sgemm_omp(M, N, K, alpha, a, K, matrix_b.data<float>(), N, beta,
+              matrix_out->data<float>(), N, relu, bias);
+#else
+    Sgemm(M, N, K, alpha, a, K, matrix_b.data<float>(), N, beta,
+          matrix_out->data<float>(), N, relu, bias);
+#endif
+  } else {
 #ifdef _OPENMP
   Sgemm_omp(M, N, K, alpha, matrix_a.data<float>(), K, matrix_b.data<float>(),
             N, beta, matrix_out->data<float>(), N, relu, bias);
 #else
   Sgemm(M, N, K, alpha, matrix_a.data<float>(), K, matrix_b.data<float>(), N,
         beta, matrix_out->data<float>(), N, relu, bias);
 #endif
+  }
 }
 
 template <>
......
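The new trans_a branch materializes the transpose of A by gathering it column by column into a temporary buffer, then runs the same row-major Sgemm. A tiny worked example of the index transform in that loop, with a hypothetical 2x3 input:

#include <cstdio>

// For m = 2, n = 3, tmp laid out row-major as
//   1 2 3
//   4 5 6
// the gather a[index++] = tmp[i * n + j], iterating j then i, yields
// {1, 4, 2, 5, 3, 6}: the 3x2 transpose in row-major order.
int main() {
  const int m = 2, n = 3;
  float tmp[m * n] = {1, 2, 3, 4, 5, 6};
  float a[m * n];
  int index = 0;
  for (int j = 0; j < n; j++) {
    for (int i = 0; i < m; i++) {
      a[index++] = tmp[i * n + j];
    }
  }
  for (int k = 0; k < m * n; k++) printf("%.0f ", a[k]);  // 1 4 2 5 3 6
  return 0;
}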
@@ -31,186 +31,43 @@ using std::min;
using std::vector;

void Pool3x3Avgs1p1(const Tensor *input, Tensor *output) {
#if __ARM_NEON
  const int batch_size = static_cast<int>(input->dims()[0]);
  const int input_channel = static_cast<int>(input->dims()[1]);
  const int input_height = static_cast<int>(input->dims()[2]);
  const int input_width = static_cast<int>(input->dims()[3]);
  const int output_height = static_cast<int>(output->dims()[2]);
  const int output_width = static_cast<int>(output->dims()[3]);
  const int hxw = input_height * input_width;
  const int l = input_height;

  const float coef = 1.0 / 9.0;
  const float coef1 = 1.0 / 6.0;
  const float coef2 = 1.0 / 4.0;
  float32x4_t v_coef = vdupq_n_f32(coef);
  float32x4_t v_coef1 = vdupq_n_f32(coef1);

  for (int b = 0; b < batch_size; b++) {
#pragma omp parallel for
    for (int c = 0; c < input_channel; c++) {
      const float *input_data = input->data<float>() + c * hxw;
      float *output_data = output->data<float>() + c * hxw;
      for (int i = 1; i < output_height - 1; i++) {
        float *output_ptr;
        float32x4_t in0, in1, in2, in3, in4, in5, tmp0, tmp1, tmp2, tmp3, tmp4,
            tmp5, out0;
        for (int m = 1; m < output_width - 4; m += 4) {
          output_ptr = output_data + i * output_width + m;
          in0 = vld1q_f32(input_data + (i - 1) * input_width + m - 1);
          in1 = vld1q_f32(input_data + (i - 1) * input_width + m + 3);
          in2 = vld1q_f32(input_data + i * input_width + m - 1);
          in3 = vld1q_f32(input_data + i * input_width + m + 3);
          in4 = vld1q_f32(input_data + (i + 1) * input_width + m - 1);
          in5 = vld1q_f32(input_data + (i + 1) * input_width + m + 3);
          tmp0 = vextq_f32(in0, in1, 1);
          tmp1 = vextq_f32(in0, in1, 2);
@@ -219,63 +76,383 @@ void Pool3x3Avgs1p1(const Tensor *input, Tensor *output) {
          tmp4 = vextq_f32(in4, in5, 1);
          tmp5 = vextq_f32(in4, in5, 2);

          out0 = in0;
          out0 = vaddq_f32(out0, tmp0);
          out0 = vaddq_f32(out0, tmp1);
          out0 = vaddq_f32(out0, in2);
          out0 = vaddq_f32(out0, tmp2);
          out0 = vaddq_f32(out0, tmp3);
          out0 = vaddq_f32(out0, in4);
          out0 = vaddq_f32(out0, tmp4);
          out0 = vaddq_f32(out0, tmp5);

          vst1q_f32(output_ptr, vmulq_f32(out0, v_coef));
        }
        int m;
        for (m = 1; (m + 3) < output_width - 1; m = m + 4) {
        }
        for (int j = m; j < output_width - 1; j++) {
          output_data[i * output_width + j] =
              input_data[(i - 1) * input_width + j - 1] +
              input_data[(i - 1) * input_width + j] +
              input_data[(i - 1) * input_width + j + 1] +
              input_data[(i)*input_width + j - 1] +
              input_data[(i)*input_width + j] +
              input_data[(i)*input_width + j + 1] +
              input_data[(i + 1) * input_width + j - 1] +
              input_data[(i + 1) * input_width + j] +
              input_data[(i + 1) * input_width + j + 1];
          output_data[i * output_width + j] =
              output_data[i * output_width + j] * coef;
        }
      }

      output_data[0] =
          input_data[0] + input_data[1] + input_data[l] + input_data[l + 1];
      output_data[l - 1] = input_data[l - 2] + input_data[l - 1] +
                           input_data[2 * l - 2] + input_data[2 * l - 1];
      output_data[(l - 1) * l] =
          input_data[(l - 2) * l] + input_data[(l - 2) * l + 1] +
          input_data[(l - 1) * l] + input_data[(l - 1) * l + 1];
      output_data[l * l - 1] = input_data[(l - 2) * (l + 1)] +
                               input_data[(l - 2) * (l + 1) + 1] +
                               input_data[l * l - 2] + input_data[l * l - 1];
      output_data[0] = output_data[0] * coef2;
      output_data[l - 1] = output_data[l - 1] * coef2;
      output_data[(l - 1) * l] = output_data[(l - 1) * l] * coef2;
      output_data[l * l - 1] = output_data[l * l - 1] * coef2;

      for (int i = 1; i < l - 1; ++i) {
        output_data[i * l] = input_data[i * l - l] + input_data[i * l - l + 1] +
                             input_data[i * l] + input_data[i * l + 1] +
                             input_data[i * l + l] + input_data[i * l + l + 1];
        output_data[i * l + l - 1] =
            input_data[i * l + l - 1 - l - 1] + input_data[i * l + l - 1 - l] +
            input_data[i * l + l - 1 - 1] + input_data[i * l + l - 1] +
            input_data[i * l + l - 1 + l - 1] + input_data[i * l + l - 1 + l];
        output_data[i * l] = output_data[i * l] * coef1;
        output_data[i * l + l - 1] = output_data[i * l + l - 1] * coef1;
      }

      int m;
      for (m = 1; m < output_width - 4; m += 4) {
        float *output_ptr = output_data + m;
        float32x4_t in0, in1, in2, in3, tmp0, tmp1, tmp2, tmp3, out0;
        in0 = vld1q_f32(input_data + m - 1);
        in1 = vld1q_f32(input_data + m + 3);
        in2 = vld1q_f32(input_data + input_width + m - 1);
        in3 = vld1q_f32(input_data + input_width + m + 3);
        tmp0 = vextq_f32(in0, in1, 1);
        tmp1 = vextq_f32(in0, in1, 2);
        tmp2 = vextq_f32(in2, in3, 1);
        tmp3 = vextq_f32(in2, in3, 2);
        out0 = in0;
        out0 = vaddq_f32(out0, tmp0);
        out0 = vaddq_f32(out0, tmp1);
        out0 = vaddq_f32(out0, in2);
        out0 = vaddq_f32(out0, tmp2);
        out0 = vaddq_f32(out0, tmp3);
        vst1q_f32(output_ptr, vmulq_f32(out0, v_coef1));
      }
      for (m = 1; (m + 3) < output_width - 1; m += 4) {
      }
      for (int j = m; j < output_width - 1; j++) {
        output_data[j] = input_data[j - 1] + input_data[j] + input_data[j + 1] +
                         input_data[input_width + j - 1] +
                         input_data[input_width + j] +
                         input_data[input_width + j + 1];
        output_data[j] = output_data[j] * coef1;
      }
      for (m = 1; m < output_width - 4; m += 4) {
        float *output_ptr =
            output_data + (output_height - 1) * output_width + m;
        float32x4_t in0, in1, in2, in3, tmp0, tmp1, tmp2, tmp3, out0;
        in0 = vld1q_f32(input_data + (output_height - 2) * input_width + m - 1);
        in1 = vld1q_f32(input_data + (output_height - 2) * input_width + m + 3);
        in2 = vld1q_f32(input_data + (output_height - 1) * input_width + m - 1);
        in3 = vld1q_f32(input_data + (output_height - 1) * input_width + m + 3);
        tmp0 = vextq_f32(in0, in1, 1);
        tmp1 = vextq_f32(in0, in1, 2);
        tmp2 = vextq_f32(in2, in3, 1);
        tmp3 = vextq_f32(in2, in3, 2);
        out0 = in0;
        out0 = vaddq_f32(out0, tmp0);
        out0 = vaddq_f32(out0, tmp1);
        out0 = vaddq_f32(out0, in2);
        out0 = vaddq_f32(out0, tmp2);
        out0 = vaddq_f32(out0, tmp3);
        vst1q_f32(output_ptr, vmulq_f32(out0, v_coef1));
      }
      for (m = 1; (m + 3) < output_width - 1; m = m + 4) {
      }
      for (int j = m; j < output_width - 1; j++) {
        output_data[(output_height - 1) * input_width + j] =
            input_data[(output_height - 2) * input_width + j - 1] +
            input_data[(output_height - 2) * input_width + j] +
            input_data[(output_height - 2) * input_width + j + 1] +
            input_data[(output_height - 1) * input_width + j - 1] +
            input_data[(output_height - 1) * input_width + j] +
            input_data[(output_height - 1) * input_width + j + 1];
        output_data[(output_height - 1) * output_width + j] =
            output_data[(output_height - 1) * output_width + j] * coef1;
      }
    }
  }
// const int batch_size = input->dims()[0];
//
// const int h_in = input->dims()[2];
//
// const int w_in = input->dims()[3];
//
// const int output_channels = output->dims()[1];
//
// const int h_out = output->dims()[2];
// const int w_out = output->dims()[3];
// const int outputdata_channel_stride = h_out * w_out;
// const int inputdata_channel_stride = h_in * w_in;
// const int input_batch_stride = output_channels * inputdata_channel_stride;
  //  const int output_batch_stride = output_channels * outputdata_channel_stride;
  //  float *out_data = output->data<float>();
  //  const float *input_data = input->data<float>();
//
// const float coef = 1.0 / 9.0;
// for (int k = 0; k < batch_size; ++k) {
//#pragma omp parallel for
// for (int c = 0; c < output_channels; ++c) {
// const float *input_seg = input_data + c * inputdata_channel_stride;
// float *output_seg = out_data + c * outputdata_channel_stride;
// // four corner point
// output_seg[0] = (input_seg[0] + input_seg[1] + input_seg[w_in] +
// input_seg[w_in + 1]) *
// coef;
// output_seg[w_out - 1] =
  //        (input_seg[w_in - 2] + input_seg[w_in - 1] +
  //         input_seg[w_in * 2 - 2] + input_seg[2 * w_in - 1]) *
  //        coef;
// output_seg[(h_out - 1) * w_out] =
// (input_seg[(h_in - 2) * w_in] + input_seg[(h_in - 2) * w_in + 1] +
  //         input_seg[(h_in - 1) * w_in] + input_seg[(h_in - 1) * w_in + 1]) *
  //        coef;
// output_seg[h_out * w_out - 1] =
// (input_seg[h_in * w_in - 1] + input_seg[h_in * w_in - 2] +
// input_seg[(h_in - 1) * w_in - 1] +
// input_seg[(h_in - 1) * w_in - 2]) *
// coef;
// // left side & right side
// for (int i = 1; i < h_in - 1; ++i) {
// output_seg[i * w_out] =
// (input_seg[i * w_in - w_in] + input_seg[i * w_in - w_in + 1] +
// input_seg[i * w_in] + input_seg[i * w_in + 1] +
// input_seg[i * w_in + w_in] + input_seg[i * w_in + w_in + 1]) *
// coef;
// output_seg[i * w_out + w_out - 1] =
// (input_seg[i * w_in - w_in + w_in - 2] +
// input_seg[i * w_in - w_in + 1 + w_in - 2] +
// input_seg[i * w_in + w_in - 2] +
// input_seg[i * w_in + 1 + w_in - 2] +
// input_seg[i * w_in + w_in + w_in - 2] +
// input_seg[i * w_in + w_in + 1 + w_in - 2]) *
// coef;
// }
// // top 1 row & bottom 1 row
// const float *input_tmp = input_seg;
//
// float32x4_t in0, in1, in2, in3, in4, in5, in6, in7, tmp0, tmp1, tmp2,
// tmp3, tmp4, tmp5, sum, out0;
// float32x4_t v_coef = vdupq_n_f32(coef);
// in0 = vld1q_f32(input_tmp);
// in2 = vld1q_f32(input_tmp + w_in);
// const float *input_tmp_end = input_tmp + (h_in - 2) * w_in;
// in4 = vld1q_f32(input_tmp_end);
// in6 = vld1q_f32(input_tmp_end + w_in);
// int c_mid = w_out - 2;
// auto output_ptr = output_seg + 1;
// for (; c_mid > 3; c_mid -= 4) {
// in1 = vld1q_f32(input_tmp + 4);
// in3 = vld1q_f32(input_tmp + w_in + 4);
//
// tmp0 = vextq_f32(in0, in1, 1);
// tmp1 = vextq_f32(in0, in1, 2);
//
// tmp2 = vextq_f32(in2, in3, 1);
// tmp3 = vextq_f32(in2, in3, 2);
//
// sum = vaddq_f32(in0, tmp0);
// sum = vaddq_f32(sum, tmp1);
// sum = vaddq_f32(sum, in2);
// sum = vaddq_f32(sum, tmp2);
// sum = vaddq_f32(sum, tmp3);
//
// vst1q_f32(output_ptr, vmulq_f32(sum, v_coef));
//
// in5 = vld1q_f32(input_tmp_end + 4);
// in7 = vld1q_f32(input_tmp_end + w_in + 4);
//
// tmp0 = vextq_f32(in4, in5, 1);
// tmp1 = vextq_f32(in4, in5, 2);
// tmp2 = vextq_f32(in6, in7, 1);
// tmp3 = vextq_f32(in6, in7, 2);
//
// sum = vaddq_f32(in0, tmp0);
// sum = vaddq_f32(sum, tmp1);
// sum = vaddq_f32(sum, in2);
// sum = vaddq_f32(sum, tmp2);
// sum = vaddq_f32(sum, tmp3);
//
// vst1q_f32(output_ptr + (h_out - 1) * w_out, vmulq_f32(sum, v_coef));
//
// // can optimize to each 8 stride.
// input_tmp += 4;
// input_tmp_end += 4;
// output_ptr += 4;
// in0 = in1;
// in2 = in3;
// in4 = in5;
// in6 = in7;
// }
// // top right remain
// float32x4_t pad0 = vdupq_n_f32(input_seg[w_in - 1]);
// float32x4_t pad1 = vdupq_n_f32(input_seg[2 * w_in - 1]);
//
// tmp0 = vextq_f32(in0, pad0, 1);
// tmp1 = vextq_f32(in0, pad0, 2);
// tmp2 = vextq_f32(in2, pad1, 2);
// tmp3 = vextq_f32(in2, pad1, 2);
//
// sum = vaddq_f32(in0, tmp0);
// sum = vaddq_f32(sum, tmp1);
// sum = vaddq_f32(sum, in2);
// sum = vaddq_f32(sum, tmp2);
// sum = vaddq_f32(sum, tmp3);
// out0 = vmulq_f32(sum, v_coef);
//
// for (int i = 0; i < c_mid; ++i) {
// if (i == 0) {
// vst1q_lane_f32(output_ptr + i, out0, 0);
// }
// if (i == 1) {
// vst1q_lane_f32(output_ptr + i, out0, 1);
// }
// if (i == 2) {
// vst1q_lane_f32(output_ptr + i, out0, 2);
// }
// }
//
// // bottom_right remain
// float32x4_t pad2 = vdupq_n_f32(input_seg[(h_in - 1) * w_in - 1]);
// float32x4_t pad3 = vdupq_n_f32(input_seg[h_in * w_in - 1]);
//
// tmp0 = vextq_f32(in4, pad2, 1);
// tmp1 = vextq_f32(in4, pad2, 2);
// tmp2 = vextq_f32(in6, pad3, 2);
// tmp3 = vextq_f32(in6, pad3, 2);
//
// sum = vaddq_f32(in4, tmp0);
// sum = vaddq_f32(sum, tmp1);
// sum = vaddq_f32(sum, in6);
// sum = vaddq_f32(sum, tmp2);
// sum = vaddq_f32(sum, tmp3);
// out0 = vmulq_f32(sum, v_coef);
//
// for (int i = 0; i < c_mid; ++i) {
// if (i == 0) {
// vst1q_lane_f32(output_ptr + (h_out - 1) * w_out + i, out0, 0);
// }
// if (i == 1) {
// vst1q_lane_f32(output_ptr + (h_out - 1) * w_out + i, out0, 1);
// }
// if (i == 2) {
// vst1q_lane_f32(output_ptr + (h_out - 1) * w_out + i, out0, 2);
// }
// }
// // mid
// for (int j = 0; j < h_out - 2; ++j) {
// output_ptr = output_seg + w_out * (j + 1) + 1;
// input_tmp = input_seg + j * w_in;
//
// in0 = vld1q_f32(input_tmp);
// in2 = vld1q_f32(input_tmp + w_in);
// in4 = vld1q_f32(input_tmp + 2 * w_in);
// c_mid = w_out - 2;
// for (; c_mid > 3; c_mid -= 4) {
// in1 = vld1q_f32(input_tmp + 4);
// in3 = vld1q_f32(input_tmp + w_in + 4);
// in5 = vld1q_f32(input_tmp + 2 * w_in + 4);
//
// tmp0 = vextq_f32(in0, in1, 1);
// tmp1 = vextq_f32(in0, in1, 2);
// tmp2 = vextq_f32(in2, in3, 1);
// tmp3 = vextq_f32(in2, in3, 2);
// tmp4 = vextq_f32(in4, in5, 1);
// tmp5 = vextq_f32(in4, in5, 2);
//
// sum = vaddq_f32(in0, tmp0);
// sum = vaddq_f32(sum, tmp1);
// sum = vaddq_f32(sum, in2);
// sum = vaddq_f32(sum, tmp2);
// sum = vaddq_f32(sum, tmp3);
// sum = vaddq_f32(sum, in4);
// sum = vaddq_f32(sum, tmp4);
// sum = vaddq_f32(sum, tmp5);
//
// out0 = vmulq_f32(sum, v_coef);
// vst1q_f32(output_ptr, out0);
// output_ptr += 4;
// input_tmp += 4;
// in0 = in1;
// in2 = in3;
// in4 = in5;
// }
// // mid remain
// float32x4_t pad0 = vdupq_n_f32(input_seg[(j + 1) * w_in - 1]);
// float32x4_t pad1 = vdupq_n_f32(input_seg[(j + 2) * w_in - 1]);
// float32x4_t pad2 = vdupq_n_f32(input_seg[(j + 2) * w_in - 1]);
//
// tmp0 = vextq_f32(in0, pad0, 1);
// tmp1 = vextq_f32(in0, pad0, 2);
// tmp2 = vextq_f32(in2, pad1, 1);
// tmp3 = vextq_f32(in2, pad1, 2);
// tmp4 = vextq_f32(in4, pad2, 1);
// tmp5 = vextq_f32(in4, pad2, 2);
//
// sum = vaddq_f32(in0, tmp0);
// sum = vaddq_f32(sum, tmp1);
// sum = vaddq_f32(sum, in2);
// sum = vaddq_f32(sum, tmp2);
// sum = vaddq_f32(sum, tmp3);
// sum = vaddq_f32(sum, in4);
// sum = vaddq_f32(sum, tmp4);
// sum = vaddq_f32(sum, tmp5);
// out0 = vmulq_f32(sum, v_coef);
//
// for (int i = 0; i < c_mid; ++i) {
// if (i == 0) {
// vst1q_lane_f32(output_ptr + i, out0, 0);
// }
// if (i == 1) {
// vst1q_lane_f32(output_ptr + i, out0, 1);
// }
// if (i == 2) {
// vst1q_lane_f32(output_ptr + i, out0, 2);
// }
// }
// }
// // input_data += inputdata_channel_stride;
// // out_data += outputdata_channel_stride;
// }
// input_data += input_batch_stride;
// out_data += output_batch_stride;
// }
#endif
}
@@ -662,6 +839,7 @@ void Pool3x3Avg(vector<int> strides, vector<int> paddings, const Tensor *input,
           wstart = max(wstart, 0);
           hend = min(hend, input_height);
           wend = min(wend, input_width);
           const float *pos1 = input_seg + hstart * input_width + wstart;
           const float *pos2 = input_seg + (hstart + 1) * input_width + wstart;
           const float *pos3 = input_seg + (hstart + 2) * input_width + wstart;
@@ -674,7 +852,8 @@ void Pool3x3Avg(vector<int> strides, vector<int> paddings, const Tensor *input,
                 sum += input_seg[h * input_width + w];
               }
             }
-            output_seg[ph * output_width + pw] = sum / 9.0;
+            output_seg[ph * output_width + pw] =
+                sum / ((hend - hstart) * (wend - wstart) * 1.0);
           } else {
 #if __aarch64__
 #else
......
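The Pool3x3Avgs1p1 rewrite and the Pool3x3Avg fix share one idea: an average over a 3x3 window clipped at the image border should divide by the taps actually inside the image, not always by 9. Hence coef2 = 1/4 at the four corners, coef1 = 1/6 along the edges, coef = 1/9 in the interior, and the generic path now divides by (hend - hstart) * (wend - wstart). A scalar reference sketch of the same rule, no NEON, illustrative only:

#include <algorithm>

// Reference semantics for 3x3 average pooling, stride 1, pad 1: each
// output divides by the clipped window size (4, 6, or 9).
void pool3x3_avg_ref(const float *in, float *out, int h, int w) {
  for (int i = 0; i < h; ++i) {
    for (int j = 0; j < w; ++j) {
      int h0 = std::max(i - 1, 0), h1 = std::min(i + 2, h);
      int w0 = std::max(j - 1, 0), w1 = std::min(j + 2, w);
      float sum = 0.f;
      for (int y = h0; y < h1; ++y) {
        for (int x = w0; x < w1; ++x) {
          sum += in[y * w + x];
        }
      }
      out[i * w + j] = sum / ((h1 - h0) * (w1 - w0));
    }
  }
}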
@@ -795,7 +795,7 @@ class SoftmaxParam : public OpParam {
   fpga::BypassArgs fpga_bypass_args;
 
  public:
-  RType *FloatInput() {
+  RType *FloatInput() const {
     return float_input_x_ == nullptr ? input_x_ : float_input_x_.get();
   }
   void SetFloatInput(Tensor *input) { float_input_x_.reset(input); }
......
@@ -22,7 +22,7 @@ namespace fpga = paddle_mobile::fpga;
 using std::cout;
 using std::endl;
 
-int main() {
+void test_format_image() {
   std::vector<int> dims{1, 1, 3, 3};
   std::vector<float> elements{1, 2, 3, 4, 5, 6, 7, 8, 9};
   frame::DDim ddim = frame::make_ddim(dims);
@@ -44,6 +44,50 @@ int main() {
   cout << endl;
   auto dd = image.dims();
   cout << dims[0] << dims[1] << dims[2] << dims[3] << endl;
+}
+
+void test_fill_conv_arg() {
+  Tensor input, out, filter;
+
+  DLOG << "Setup input";
+  SetupTensor<int16_t>(&input, {1, 250, 32, 30}, static_cast<int16_t>(0),
+                       static_cast<int16_t>(1));
+
+  DLOG << "Setup filter";
+  SetupTensor<float>(&filter, {1001, 250, 3, 3}, static_cast<float>(0),
+                     static_cast<float>(1));
+
+  DLOG << "Setup output";
+  SetupTensor<int16_t>(&out, {1, 1001, 32, 30}, static_cast<int16_t>(0),
+                       static_cast<int16_t>(1));
+
+  auto bs_ptr = (float *)fpga::fpga_malloc(2 * 1001 * sizeof(float));
+
+  DLOG << "find max";
+  float max_value = fpga::filter_find_max(&filter);
+
+  DLOG << "format filter";
+  fpga::format_filter(&filter, max_value, 1);
+
+  DLOG << "format bs_ptr";
+  int element_num_per_div = fpga::get_filter_num_per_div(&filter, 1);
+  fpga::format_bias_scale_array(&bs_ptr, element_num_per_div, 1001);
+
+  DLOG << "format ofm";
+  fpga::format_ofm(&out);
+
+  DLOG << "Build arg";
+  fpga::WrapperConvArgs arg;
+  fpga::fill_conv_arg(&arg, &input, &out, &filter, true, 1, 1, 1, 1, 1, bs_ptr);
+
+  DLOG << "splitNum: " << arg.split_num << " group_num:" << arg.group_num
+       << " filter_num:" << arg.filter_num;
+
+  for (int i = 0; i < arg.split_num; i++) {
+    DLOG << arg.conv_args[i].filter_num << " " << arg.conv_args[i].sb_address
+         << " " << arg.conv_args[i].filter_address << " "
+         << arg.conv_args[i].filter_scale_address;
+  }
+}
+
+int main() {
+  test_format_image();
+  test_fill_conv_arg();
   return 0;
 }