Commit 71f88533 authored by xiebaiyuan

Merge remote-tracking branch 'upstream/develop' into develop

cmake_minimum_required(VERSION 3.6)
option(USE_OPENMP "openmp support" ON)
option(USE_OPENMP "openmp support" OFF)
project(paddle-mobile)
option(DEBUGING "enable debug mode" OFF)
option(DEBUGING "enable debug mode" ON)
option(USE_EXCEPTION "use std exception" OFF)
option(LOG_PROFILE "log profile" OFF)
# select the platform to build
......@@ -94,6 +94,8 @@ else()
endif()
if(FPGA)
set(DEBUGING ON)
add_definitions(-DPADDLE_MOBILE_DEBUG)
add_definitions(-DPADDLE_MOBILE_FPGA)
else()
file(GLOB_RECURSE _tmp_list src/operators/kernel/fpga/*.cpp src/operators/kernel/fpga/*.cc)
......@@ -140,7 +142,12 @@ set(CMAKE_LIBRARY_OUTPUT_DIRECTORY build)
set(CMAKE_RUNTIME_OUTPUT_DIRECTORY build)
# NET default
set(NET "default" CACHE STRING "select net type")
if (FPGA)
set(NET "FPGAnets" CACHE STRING "select net type")
else()
set(NET "default" CACHE STRING "select net type")
endif()
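# Note: set(NET ... CACHE STRING ...) does not overwrite a NET value that is
# already in the CMake cache, so on a previously configured build tree the
# FPGA branch may still require passing -DNET=FPGAnets explicitly (or FORCE).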
set_property(CACHE NET PROPERTY STRINGS "default" "googlenet" "mobilenet" "yolo" "squeezenet" "FPGAnets" "NLP")
include("${CMAKE_CURRENT_LIST_DIR}/tools/op.cmake")
......
......@@ -68,29 +68,35 @@ void fpga_copy(void *dest, const void *src, size_t num) {
memcpy(dest, src, num);
}
int ComputeFpgaConv(const struct ConvArgs &args) {
int ComputeFpgaConv(const struct WrapperConvArgs &args) {
#ifdef FPGA_TEST_MODE
DLOG << " relu_enabled:" << args.relu_enabled
<< " sb_address:" << args.sb_address
<< " filter_address:" << args.filter_address
<< " filter_num:" << args.filter_num
<< " group_num:" << args.group_num;
DLOG << " image_address:" << args.image.address
<< " image_scale_address:" << args.image.scale_address
<< " image_channels:" << args.image.channels
<< " image_height:" << args.image.height
<< " image_width:" << args.image.width
<< " pad_height:" << args.image.pad_height
<< " pad_width:" << args.image.pad_width;
DLOG << " kernel_height:" << args.kernel.height
<< " kernel_width:" << args.kernel.width
<< " stride_h:" << args.kernel.stride_h
<< " stride_w:" << args.kernel.stride_w;
DLOG << " out_address:" << args.output.address
<< " out_scale_address:" << args.output.scale_address;
/*DLOG << " relu_enabled:" << args.relu_enabled
<< " sb_address:" << args.sb_address
<< " filter_address:" << args.filter_address
<< " filter_num:" << args.filter_num
<< " group_num:" << args.group_num;
DLOG << " image_address:" << args.image.address
<< " image_scale_address:" << args.image.scale_address
<< " image_channels:" << args.image.channels
<< " image_height:" << args.image.height
<< " image_width:" << args.image.width
<< " pad_height:" << args.image.pad_height
<< " pad_width:" << args.image.pad_width;
DLOG << " kernel_height:" << args.kernel.height
<< " kernel_width:" << args.kernel.width
<< " stride_h:" << args.kernel.stride_h
<< " stride_w:" << args.kernel.stride_w;
DLOG << " out_address:" << args.output.address
<< " out_scale_address:" << args.output.scale_address;*/
#endif
int split_num = args.split_num;
for (int i = 0; i < split_num; i++) {
do_ioctl(IOCTL_CONFIG_CONV, &args.conv_args[i]);
}
return do_ioctl(IOCTL_CONFIG_CONV, &args);
if (split_num > 1) {
ComputeFPGAConcat(args.concat_arg);
}
}
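// Each entry of args.conv_args describes one slice of the filter set; the
// loop above issues one IOCTL_CONFIG_CONV per slice, and when split_num > 1
// the partial outputs are stitched back along the channel dimension through
// args.concat_arg.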
int ComputeFpgaPool(const struct PoolingArgs &args) {
......@@ -155,9 +161,16 @@ int PerformBypass(const struct BypassArgs &args) {
return do_ioctl(IOCTL_CONFIG_BYPASS, &args);
}
int ComputeFPGAConcat(const struct ConcatArgs &args) {
image::concat_images(args.images_in, args.scales_in, args.image_out,
args.scale_out, args.image_num, args.channel_num,
args.height, args.width);
return 0;
}
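// ComputeFPGAConcat forwards to image::concat_images; in this commit that
// function is still an empty stub (see image.cpp below), so the host-side
// concat result is not yet produced.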
void format_image(framework::Tensor *image_tensor) {
auto dims = image_tensor->dims();
int channel = dims[1], height = dims[2], width = dims[3];
auto channel = dims[1], height = dims[2], width = dims[3];
auto data_ptr = image_tensor->mutable_data<float>();
size_t memory_size = channel * height * width * sizeof(float);
float *new_data = (float *)fpga_malloc(memory_size);
......@@ -168,7 +181,7 @@ void format_image(framework::Tensor *image_tensor) {
void format_ofm(framework::Tensor *ofm_tensor) {
auto dims = ofm_tensor->dims();
int channel = dims[1], height = dims[2], width = dims[3];
auto channel = dims[1], height = dims[2], width = dims[3];
size_t memory_size =
height * align_to_x(channel * width, IMAGE_ALIGNMENT) * sizeof(half);
ofm_tensor->reset_data_ptr(fpga_malloc(memory_size));
......@@ -178,38 +191,38 @@ float filter_find_max(framework::Tensor *filter_tensor) {
auto filter_ptr = filter_tensor->data<float>();
return filter::find_max(filter_ptr, filter_tensor->numel());
}
int get_plit_num(framework::Tensor *filter_tensor) {
auto dims = filter_tensor->dims();
auto chw = dims[1] * dims[2] * dims[3];
auto num = dims[0];
int div_capacity = filter::calc_division_capacity(chw);
return filter::calc_split_num(num, div_capacity);
}
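// Sketch of the split computation: calc_division_capacity(chw) estimates how
// many filters of size chw fit in one hardware pass, and calc_split_num(num,
// div_capacity) is the number of passes needed to cover all `num` filters.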
int get_element_num_per_div(framework::Tensor *filter_tensor, int group_num) {
auto dims = filter_tensor->dims();
PADDLE_MOBILE_ENFORCE(dims.size() == 4 || dims.size() == 2,
"Filter order should be 4 or 2");
int chw = dims.size() == 4 ? dims[1] * dims[2] * dims[3] : dims[1];
int num = dims.size() == 4 ? dims[0] : dims[1];
auto chw = dims[1] * dims[2] * dims[3];
auto num = dims[0];
int div_capacity = filter::calc_division_capacity(chw);
return filter::calc_num_per_div(num, group_num, div_capacity);
}
void format_filter(framework::Tensor *filter_tensor, float max_value,
int group_num) {
auto dims = filter_tensor->dims();
int num = dims[0], channel = dims[1], height = dims[2], width = dims[3];
auto data_ptr = filter_tensor->mutable_data<float>();
size_t memory_size = num * channel * height * width * sizeof(float);
float *new_data = (float *)fpga_malloc(memory_size);
fpga_copy(new_data, data_ptr, memory_size);
filter::format_filter(&new_data, num, channel, height, width, group_num,
max_value);
filter_tensor->reset_data_ptr(new_data);
int get_aligned_filter_element_num(int chw) {
return align_to_x(chw, FILTER_ELEMENT_ALIGNMENT);
}
int get_aligned_filter_num(int num) {
return align_to_x(num, FILTER_NUM_ALIGNMENT);
}
void format_fc_matrix(framework::Tensor *filter_tensor, float max_value,
int group_num, int height, int width) {
void format_filter(framework::Tensor *filter_tensor, float max_value,
int group_num) {
auto dims = filter_tensor->dims();
PADDLE_MOBILE_ENFORCE(height == 1 && width == 1,
"IFM should be flattened for FC");
int num = dims[1], channel = dims[0] / height / width;
auto num = dims[0], channel = dims[1], height = dims[2], width = dims[3];
auto data_ptr = filter_tensor->mutable_data<float>();
size_t memory_size = num * channel * height * width * sizeof(float);
float *new_data = (float *)fpga_malloc(memory_size);
auto new_data = (float *)fpga_malloc(memory_size);
fpga_copy(new_data, data_ptr, memory_size);
filter::format_filter(&new_data, num, channel, height, width, group_num,
max_value);
......@@ -222,5 +235,19 @@ void format_bias_scale_array(float **bias_scale_array,
element_num_per_division, num);
}
void format_concat_output(framework::Tensor *out, int height, int width,
int image_num, uint32_t *channel_num) {
int sum_channel = 0, sum_cw = 0;
for (int i = 0; i < image_num; i++) {
sum_channel += channel_num[i];
}
sum_cw = align_to_x(width * sum_channel, IMAGE_ALIGNMENT);
auto data_ptr = fpga_malloc(height * sum_cw * sizeof(half));
auto ddim = framework::make_ddim({-1, sum_channel, height, width});
out->Resize(ddim);
out->reset_data_ptr(data_ptr);
}
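// The concat output buffer holds `height` rows of
// align_to_x(width * sum_channel, IMAGE_ALIGNMENT) half elements; the leading
// -1 in the ddim leaves the batch dimension unspecified.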
} // namespace fpga
} // namespace paddle_mobile
......@@ -92,6 +92,26 @@ struct ConvArgs {
struct ImageOutputArgs output;
};
struct ConcatArgs {
uint32_t image_num;
half** images_in;
float** scales_in;
void* image_out;
float* scale_out;
uint32_t* channel_num;
uint32_t height;
uint32_t width;
};
struct WrapperConvArgs {
uint32_t split_num;
uint32_t group_num;
uint32_t filter_num;
struct ImageOutputArgs output;
struct ConvArgs* conv_args;
struct ConcatArgs concat_arg;
};
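// A WrapperConvArgs bundles one ConvArgs per split (conv_args, split_num
// entries) plus the ConcatArgs used to merge the per-split outputs back into
// a single feature map.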
struct PoolingArgs {
struct KernelArgs kernel;
struct ImageInputArgs image; // input image;
......@@ -165,21 +185,26 @@ enum FPGA_ERR_TYPE {
//============================== API =============================
int PerformBypass(const struct BypassArgs& args);
int ComputeFpgaConv(const struct ConvArgs& args);
int ComputeFpgaConv(const struct WrapperConvArgs& args);
int ComputeFpgaPool(const struct PoolingArgs& args);
int ComputeFpgaEWAdd(const struct EWAddArgs& args);
int ComputeFPGAConcat(const struct ConcatArgs& args);
static inline int align_to_x(int num, int x) { return (num + x - 1) / x * x; }
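// e.g. align_to_x(50, 16) == (50 + 15) / 16 * 16 == 64: round num up to the
// next multiple of x (IMAGE_ALIGNMENT is 16, see image.h).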
void format_image(framework::Tensor* image_tensor);
void format_ofm(framework::Tensor* ofm_tensor); // only allocate memory
float filter_find_max(framework::Tensor* filter_tensor);
int get_element_num_per_div(framework::Tensor* filter_tensor, int group_num);
int get_plit_num(framework::Tensor* filter_tensor);
int get_aligned_filter_element_num(int chw);
int get_aligned_filter_num(int num);
void format_filter(framework::Tensor* filter_tensor, float max_value,
int group_num);
void format_fc_matrix(framework::Tensor* filter_tensor, float max_value,
int group_num, int height = 1, int width = 1);
void format_bias_scale_array(float** bias_scale_array,
int element_num_per_division, int num);
void format_concat_output(framework::Tensor* out, int height, int width,
int image_num, uint32_t* channel_num);
} // namespace fpga
} // namespace paddle_mobile
......@@ -62,6 +62,10 @@ void format_image(float **data_in, int channel, int height, int width) {
align_element_conv(data_in, height, channel * width);
}
void concat_images(int16_t **images_in, float **scales_in, void *image_out,
float *scale_out, int image_num, uint32_t *channel_num,
int height, int width) {}
} // namespace image
} // namespace fpga
} // namespace paddle_mobile
......@@ -13,6 +13,9 @@ See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <stdint.h>
#define IMAGE_ALIGNMENT 16 // Aligned to 16
namespace paddle_mobile {
namespace fpga {
......@@ -21,6 +24,10 @@ namespace image {
void convert_to_hwc(float** data_in, int channel, int height, int width);
void align_element_conv(float** data_in, int height, int cw);
void format_image(float** data_in, int channel, int height, int width);
void concat_images(int16_t** images_in, float** scales_in, void* image_out,
float* scale_out, int image_num, uint32_t* channel_num,
int height,
int width); // Concat featuremaps along channel direction
} // namespace image
} // namespace fpga
} // namespace paddle_mobile
......@@ -15,6 +15,7 @@ limitations under the License. */
#pragma once
#include <cstdlib>
#include <string>
#include "common/enforce.h"
namespace paddle_mobile {
namespace framework {
......
......@@ -49,7 +49,7 @@ class FeedOp : public framework::OperatorBase<DeviceType> {
}
void RunImpl() const {
Tensor *input = const_cast<Tensor *>(param_.InputX());
auto input = (Tensor *)const_cast<LoDTensor *>(param_.InputX());
auto input_ptr = input->data<float>();
fpga::format_image(input);
Tensor *output = param_.Out();
......
......@@ -27,7 +27,11 @@ bool DropoutKernel<CPU, float>::Init(DropoutParam<CPU> *para) {
template <typename T>
struct DropoutFunctor {
inline T operator()(T in) const { return in; }
DropoutFunctor(T drop_pro) : dropout_pro_(drop_pro) {}
inline T operator()(T in) const { return (1 - dropout_pro_) * in; }
private:
T dropout_pro_;
};
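// Inference-time dropout rescales instead of dropping: with
// dropout_pro_ == 0.25f, operator()(2.0f) returns (1 - 0.25f) * 2.0f == 1.5f.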
template <>
......@@ -36,8 +40,8 @@ void DropoutKernel<CPU, float>::Compute(const DropoutParam<CPU> &param) const {
auto *input_x_ptr = input_x->data<float>();
auto *out = param.Out();
auto *out_ptr = out->mutable_data<float>();
DropoutFunctor<float> func_;
const float dropoutProb = param.DropoutProb();
DropoutFunctor<float> func_(dropoutProb);
math::Transform trans;
trans(input_x_ptr, input_x_ptr + input_x->numel(), out_ptr, func_);
}
......
......@@ -21,31 +21,44 @@ namespace operators {
template <>
bool ConcatKernel<FPGA, float>::Init(ConcatParam<FPGA> *param) {
auto inputs = param->Inputs();
auto out = param->Out();
auto image_num = inputs.size();
auto images_in = (half **)fpga::fpga_malloc(image_num * sizeof(int *));
auto scales_in = (float **)fpga::fpga_malloc(image_num * sizeof(float *));
auto channel_num =
(uint32_t *)fpga::fpga_malloc(image_num * sizeof(uint32_t));
auto height = inputs[0]->dims()[2];
auto width = inputs[0]->dims()[3];
for (int i = 0; i < image_num; i++) {
auto input = inputs[i];
PADDLE_MOBILE_ENFORCE(
input->dims()[2] == height && input->dims()[3] == width,
"Image height & width should be unified");
images_in[i] = (half *)input->data<float>();
channel_num[i] = (uint32_t)inputs[i]->dims()[1];
scales_in[i] = input->scale;
}
fpga::format_concat_output(out, (int)height, (int)width, (int)image_num,
channel_num);
fpga::ConcatArgs concatArgs;
concatArgs.image_num = (uint32_t)image_num;
concatArgs.images_in = images_in;
concatArgs.scales_in = scales_in;
concatArgs.image_out = (half *)out->mutable_data<float>();
concatArgs.scale_out = out->scale;
concatArgs.channel_num = channel_num;
concatArgs.height = (uint32_t)height;
concatArgs.width = (uint32_t)width;
param->SetFpgaArgs(concatArgs);
return true;
}
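// Init gathers the per-input FP16 pointers, scales and channel counts once,
// so Compute below reduces to a single ComputeFPGAConcat call on the
// prepared args instead of the per-pixel host copies it replaces.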
template <>
void ConcatKernel<FPGA, float>::Compute(const ConcatParam<FPGA> &param) const {
auto inputs = param.Inputs();
auto *out = param.Out();
int64_t axis = param.Axis();
out->mutable_data<half>();
DDim out_dim = out->dims();
int pixels = out_dim[1] * out_dim[2];
auto out_channel = out_dim[3];
auto out_offset = 0;
for (int i = 0; i < inputs.size(); ++i) {
auto input = inputs[i];
auto channels = input->dims()[3];
out_offset += channels;
auto src = input->data<half>();
for (int j = 0; j < pixels; ++j) {
auto dst = out->mutable_data<half>() + out_offset;
memory::Copy(dst, src, sizeof(half));
}
}
ComputeFPGAConcat(param.FpgaArgs());
}
template class ConcatKernel<FPGA, float>;
......
......@@ -15,7 +15,6 @@ limitations under the License. */
#ifdef FUSION_CONVADDBN_OP
#include "operators/kernel/conv_add_bn_kernel.h"
#include "fpga/api.h"
namespace paddle_mobile {
namespace operators {
......@@ -23,13 +22,13 @@ namespace operators {
template <>
bool ConvAddBNKernel<FPGA, float>::Init(FusionConvAddBNParam<FPGA> *param) {
bool relu_enabled = false;
Tensor *input = const_cast<Tensor *>(param->Input());
auto input = const_cast<Tensor *>(param->Input());
auto input_ptr = input->data<float>();
const Tensor *bias = param->Bias();
auto bias = param->Bias();
auto bias_ptr = bias->data<float>();
Tensor *filter = param->Filter();
auto filter = const_cast<Tensor *>(param->Filter());
Tensor *out = param->Output();
auto out = param->Output();
auto bn_mean_ptr = param->InputMean()->data<float>();
auto bn_var_ptr = param->InputVariance()->data<float>();
......@@ -41,10 +40,10 @@ bool ConvAddBNKernel<FPGA, float>::Init(FusionConvAddBNParam<FPGA> *param) {
"Output channel should be equal to bias number");
const int channel = out->dims()[1];
float *bs_ptr =
auto bs_ptr =
reinterpret_cast<float *>(fpga::fpga_malloc(2 * channel * sizeof(float)));
Tensor *new_scale = new Tensor();
Tensor *new_bias = new Tensor();
auto new_scale = new Tensor();
auto new_bias = new Tensor();
auto new_scale_ptr = new_scale->mutable_data<float>({channel});
auto new_bias_ptr = new_bias->mutable_data<float>({channel});
......@@ -70,27 +69,75 @@ bool ConvAddBNKernel<FPGA, float>::Init(FusionConvAddBNParam<FPGA> *param) {
fpga::format_ofm(out);
auto out_ptr = out->mutable_data<float>();
fpga::ConvArgs convArgs;
convArgs.relu_enabled = relu_enabled;
convArgs.filter_address = (void *)filter_ptr;
convArgs.filter_num = filter->dims()[0];
convArgs.group_num = param->Groups();
convArgs.sb_address = (void *)bs_ptr;
convArgs.kernel.stride_h = param->Strides()[0];
convArgs.kernel.stride_w = param->Strides()[1];
convArgs.kernel.height = filter->dims()[2];
convArgs.kernel.width = filter->dims()[3];
convArgs.image.address = (void *)input_ptr;
convArgs.image.channels = input->dims()[1];
convArgs.image.height = input->dims()[2];
convArgs.image.width = input->dims()[3];
convArgs.image.pad_height = param->Paddings()[0];
convArgs.image.pad_width = param->Paddings()[1];
convArgs.image.scale_address = input->scale;
convArgs.output.address = (void *)out_ptr;
fpga::WrapperConvArgs convArgs;
convArgs.group_num = (uint32_t)param->Groups();
convArgs.split_num = (uint32_t)fpga::get_plit_num(filter);
convArgs.filter_num = (uint32_t)filter->dims()[0];
convArgs.output.address = out_ptr;
convArgs.output.scale_address = out->scale;
convArgs.conv_args = (fpga::ConvArgs *)fpga::fpga_malloc(
convArgs.split_num * sizeof(fpga::ConvArgs));
convArgs.concat_arg.image_num = convArgs.split_num;
convArgs.concat_arg.image_out = out_ptr;
convArgs.concat_arg.scale_out = out->scale;
convArgs.concat_arg.height = (uint32_t)filter->dims()[2];
convArgs.concat_arg.width = (uint32_t)filter->dims()[3];
int n = convArgs.split_num;
convArgs.concat_arg.images_in = (half **)fpga::fpga_malloc(n * sizeof(int *));
convArgs.concat_arg.scales_in =
(float **)fpga::fpga_malloc(n * sizeof(float *));
convArgs.concat_arg.channel_num =
(uint32_t *)fpga::fpga_malloc(n * sizeof(uint32_t));
convArgs.concat_arg.image_out = out_ptr;
param->SetFpgaArgs(convArgs);
int element_num = fpga::get_aligned_filter_element_num(
filter->dims()[1] * filter->dims()[2] * filter->dims()[3]);
for (int i = 0; i < n; i++) {
convArgs.conv_args[i].relu_enabled = relu_enabled;
convArgs.conv_args[i].group_num = (uint32_t)param->Groups();
convArgs.conv_args[i].kernel.stride_h = (uint32_t)param->Strides()[0];
convArgs.conv_args[i].kernel.stride_w = (uint32_t)param->Strides()[1];
convArgs.conv_args[i].kernel.height = (uint32_t)filter->dims()[2];
convArgs.conv_args[i].kernel.width = (uint32_t)filter->dims()[3];
convArgs.conv_args[i].image.address = input_ptr;
convArgs.conv_args[i].image.channels = (uint32_t)input->dims()[1];
convArgs.conv_args[i].image.height = (uint32_t)input->dims()[2];
convArgs.conv_args[i].image.width = (uint32_t)input->dims()[3];
convArgs.conv_args[i].image.scale_address = input->scale;
convArgs.conv_args[i].image.pad_height = (uint32_t)param->Paddings()[0];
convArgs.conv_args[i].image.pad_width = (uint32_t)param->Paddings()[1];
convArgs.conv_args[i].filter_address =
&((int8_t *)filter_ptr)[i * element_num];
convArgs.conv_args[i].sb_address = &((int8_t *)bs_ptr)[i * element_num];
convArgs.conv_args[i].filter_num =
(uint32_t)(i == n - 1 ? fpga::get_aligned_filter_num(
channel - (n - 1) * element_num_per_div)
: element_num_per_div);
if (n > 1) {
convArgs.conv_args[i].output.scale_address =
(float *)fpga::fpga_malloc(2 * sizeof(float));
convArgs.conv_args[i].output.address =
fpga::fpga_malloc(input->dims()[2] * input->dims()[3] *
convArgs.conv_args[i].filter_num * sizeof(half));
    } else {
convArgs.conv_args[i].output.scale_address = out->scale;
convArgs.conv_args[i].output.address = out_ptr;
}
convArgs.concat_arg.images_in[i] =
(half *)convArgs.conv_args[i].output.address;
convArgs.concat_arg.scales_in[i] =
(float *)convArgs.conv_args[i].sb_address;
convArgs.concat_arg.channel_num[i] = convArgs.conv_args[i].filter_num;
}
return true;
}
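// Per-split bookkeeping above: every split shares the same input image and
// kernel geometry, filter_address/sb_address advance by element_num per
// split, and the last split takes the remaining filters via
// get_aligned_filter_num(channel - (n - 1) * element_num_per_div).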
......
......@@ -23,12 +23,12 @@ template <>
bool ConvAddBNReluKernel<FPGA, float>::Init(
FusionConvAddBNReluParam<FPGA> *param) {
bool relu_enabled = true;
Tensor *input = const_cast<Tensor *>(param->Input());
auto input = const_cast<Tensor *>(param->Input());
auto input_ptr = input->data<float>();
const Tensor *bias = param->Bias();
auto bias_ptr = bias->data<float>();
Tensor *filter = param->Filter();
Tensor *out = param->Output();
auto filter = const_cast<Tensor *>(param->Filter());
auto out = param->Output();
auto bn_mean_ptr = param->InputMean()->data<float>();
auto bn_var_ptr = param->InputVariance()->data<float>();
auto bn_scale_ptr = param->InputScale()->data<float>();
......@@ -39,9 +39,9 @@ bool ConvAddBNReluKernel<FPGA, float>::Init(
"Output channel should be equal to bias number");
const int channel = out->dims()[1];
float *bs_ptr = (float *)fpga::fpga_malloc(2 * channel * sizeof(float));
Tensor *new_scale = new Tensor();
Tensor *new_bias = new Tensor();
auto bs_ptr = (float *)fpga::fpga_malloc(2 * channel * sizeof(float));
auto new_scale = new Tensor();
auto new_bias = new Tensor();
auto new_scale_ptr = new_scale->mutable_data<float>({channel});
auto new_bias_ptr = new_bias->mutable_data<float>({channel});
......@@ -67,26 +67,45 @@ bool ConvAddBNReluKernel<FPGA, float>::Init(
fpga::format_ofm(out);
auto out_ptr = out->mutable_data<float>();
fpga::ConvArgs convArgs;
convArgs.relu_enabled = relu_enabled;
convArgs.filter_address = (void *)filter_ptr;
convArgs.filter_num = filter->dims()[0];
convArgs.group_num = param->Groups();
convArgs.sb_address = (void *)bs_ptr;
convArgs.kernel.stride_h = param->Strides()[0];
convArgs.kernel.stride_w = param->Strides()[1];
convArgs.kernel.height = filter->dims()[2];
convArgs.kernel.width = filter->dims()[3];
convArgs.image.address = (void *)input_ptr;
convArgs.image.channels = input->dims()[1];
convArgs.image.height = input->dims()[2];
convArgs.image.width = input->dims()[3];
convArgs.image.pad_height = param->Paddings()[0];
convArgs.image.pad_width = param->Paddings()[1];
convArgs.image.scale_address = input->scale;
convArgs.output.address = (void *)out_ptr;
fpga::WrapperConvArgs convArgs;
convArgs.group_num = (uint32_t)param->Groups();
convArgs.split_num = (uint32_t)fpga::get_plit_num(filter);
convArgs.filter_num = (uint32_t)filter->dims()[0];
convArgs.output.address = out_ptr;
convArgs.output.scale_address = out->scale;
convArgs.conv_args = (fpga::ConvArgs *)fpga::fpga_malloc(
convArgs.split_num * sizeof(fpga::ConvArgs));
param->SetFpgaArgs(convArgs);
int element_num = fpga::get_aligned_filter_element_num(
filter->dims()[1] * filter->dims()[2] * filter->dims()[3]);
int n = convArgs.split_num;
for (int i = 0; i < n; i++) {
convArgs.conv_args[i].relu_enabled = relu_enabled;
convArgs.conv_args[i].group_num = (uint32_t)param->Groups();
convArgs.conv_args[i].kernel.stride_h = (uint32_t)param->Strides()[0];
convArgs.conv_args[i].kernel.stride_w = (uint32_t)param->Strides()[1];
convArgs.conv_args[i].kernel.height = (uint32_t)filter->dims()[2];
convArgs.conv_args[i].kernel.width = (uint32_t)filter->dims()[3];
convArgs.conv_args[i].image.address = input_ptr;
convArgs.conv_args[i].image.channels = (uint32_t)input->dims()[1];
convArgs.conv_args[i].image.height = (uint32_t)input->dims()[2];
convArgs.conv_args[i].image.width = (uint32_t)input->dims()[3];
convArgs.conv_args[i].image.pad_height = (uint32_t)param->Paddings()[0];
convArgs.conv_args[i].image.pad_width = (uint32_t)param->Paddings()[1];
convArgs.conv_args[i].filter_address =
&((int8_t *)filter_ptr)[i * element_num];
convArgs.conv_args[i].sb_address = &((int8_t *)bs_ptr)[i * element_num];
convArgs.conv_args[i].filter_num =
(uint32_t)(i == n - 1 ? fpga::get_aligned_filter_num(
channel - (n - 1) * element_num_per_div)
: element_num_per_div);
convArgs.conv_args[i].output.scale_address =
(float *)fpga::fpga_malloc(2 * sizeof(float));
convArgs.conv_args[i].image.scale_address = input->scale;
}
  return true;
}
......
......@@ -22,17 +22,17 @@ namespace operators {
template <>
bool ConvAddReluKernel<FPGA, float>::Init(FusionConvAddReluParam<FPGA> *param) {
bool relu_enabled = true;
Tensor *input = const_cast<Tensor *>(param->Input());
auto input = const_cast<Tensor *>(param->Input());
auto input_ptr = input->data<float>();
const Tensor *bias = param->Bias();
auto bias_ptr = bias->data<float>();
Tensor *filter = param->Filter();
Tensor *out = param->Output();
auto filter = const_cast<Tensor *>(param->Filter());
auto out = param->Output();
PADDLE_MOBILE_ENFORCE(out->dims()[1] == bias->dims()[0],
"Output channel should be equal to bias number");
int channel = out->dims()[1];
float *bs_ptr = (float *)fpga::fpga_malloc(2 * channel * sizeof(float));
auto bs_ptr = (float *)fpga::fpga_malloc(2 * channel * sizeof(float));
for (int i = 0; i < channel; i++) {
bs_ptr[i + channel] = 1;
bs_ptr[i] = bias_ptr[i];
......@@ -49,27 +49,44 @@ bool ConvAddReluKernel<FPGA, float>::Init(FusionConvAddReluParam<FPGA> *param) {
fpga::format_ofm(out);
auto out_ptr = out->mutable_data<float>();
fpga::ConvArgs convArgs;
convArgs.relu_enabled = relu_enabled;
convArgs.filter_address = (void *)filter_ptr;
convArgs.filter_num = filter->dims()[0];
convArgs.group_num = param->Groups();
convArgs.sb_address = (void *)bs_ptr;
convArgs.kernel.stride_h = param->Strides()[0];
convArgs.kernel.stride_w = param->Strides()[1];
convArgs.kernel.height = filter->dims()[2];
convArgs.kernel.width = filter->dims()[3];
convArgs.image.address = (void *)input_ptr;
convArgs.image.channels = input->dims()[1];
convArgs.image.height = input->dims()[2];
convArgs.image.width = input->dims()[3];
convArgs.image.pad_height = param->Paddings()[0];
convArgs.image.pad_width = param->Paddings()[1];
convArgs.image.scale_address = input->scale;
convArgs.output.address = (void *)out_ptr;
fpga::WrapperConvArgs convArgs;
convArgs.group_num = (uint32_t)param->Groups();
convArgs.split_num = (uint32_t)fpga::get_plit_num(filter);
convArgs.filter_num = (uint32_t)filter->dims()[0];
convArgs.output.address = out_ptr;
convArgs.output.scale_address = out->scale;
convArgs.conv_args = (fpga::ConvArgs *)fpga::fpga_malloc(
convArgs.split_num * sizeof(fpga::ConvArgs));
param->SetFpgaArgs(convArgs);
int element_num = fpga::get_aligned_filter_element_num(
filter->dims()[1] * filter->dims()[2] * filter->dims()[3]);
int n = convArgs.split_num;
for (int i = 0; i < n; i++) {
convArgs.conv_args[i].relu_enabled = relu_enabled;
convArgs.conv_args[i].group_num = (uint32_t)param->Groups();
convArgs.conv_args[i].kernel.stride_h = (uint32_t)param->Strides()[0];
convArgs.conv_args[i].kernel.stride_w = (uint32_t)param->Strides()[1];
convArgs.conv_args[i].kernel.height = (uint32_t)filter->dims()[2];
convArgs.conv_args[i].kernel.width = (uint32_t)filter->dims()[3];
convArgs.conv_args[i].image.address = input_ptr;
convArgs.conv_args[i].image.channels = (uint32_t)input->dims()[1];
convArgs.conv_args[i].image.height = (uint32_t)input->dims()[2];
convArgs.conv_args[i].image.width = (uint32_t)input->dims()[3];
convArgs.conv_args[i].image.pad_height = (uint32_t)param->Paddings()[0];
convArgs.conv_args[i].image.pad_width = (uint32_t)param->Paddings()[1];
convArgs.conv_args[i].filter_address =
&((int8_t *)filter_ptr)[i * element_num];
convArgs.conv_args[i].sb_address = &((int8_t *)bs_ptr)[i * element_num];
convArgs.conv_args[i].filter_num =
(uint32_t)(i == n - 1 ? fpga::get_aligned_filter_num(
channel - (n - 1) * element_num_per_div)
: element_num_per_div);
convArgs.conv_args[i].output.scale_address =
(float *)fpga::fpga_malloc(2 * sizeof(float));
convArgs.conv_args[i].image.scale_address = input->scale;
}
return true;
}
......
......@@ -23,11 +23,10 @@ namespace operators {
template <>
bool ConvBNKernel<FPGA, float>::Init(FusionConvBNParam<FPGA> *param) {
bool relu_enabled = false;
Tensor *input = const_cast<Tensor *>(param->Input());
auto input = const_cast<Tensor *>(param->Input());
auto input_ptr = input->data<float>();
Tensor *filter = param->Filter();
Tensor *out = param->Output();
auto filter = const_cast<Tensor *>(param->Filter());
auto out = param->Output();
auto bn_mean_ptr = param->InputMean()->data<float>();
auto bn_var_ptr = param->InputVariance()->data<float>();
auto bn_scale_ptr = param->InputScale()->data<float>();
......@@ -37,10 +36,10 @@ bool ConvBNKernel<FPGA, float>::Init(FusionConvBNParam<FPGA> *param) {
"Output channel should be equal to bias number");
const int channel = out->dims()[1];
float *bs_ptr =
auto bs_ptr =
reinterpret_cast<float *>(fpga::fpga_malloc(2 * channel * sizeof(float)));
Tensor *new_scale = new Tensor();
Tensor *new_bias = new Tensor();
auto new_scale = new Tensor();
auto new_bias = new Tensor();
auto new_scale_ptr = new_scale->mutable_data<float>({channel});
auto new_bias_ptr = new_bias->mutable_data<float>({channel});
......@@ -65,27 +64,44 @@ bool ConvBNKernel<FPGA, float>::Init(FusionConvBNParam<FPGA> *param) {
fpga::format_ofm(out);
auto out_ptr = out->mutable_data<float>();
fpga::ConvArgs convArgs;
convArgs.relu_enabled = relu_enabled;
convArgs.filter_address = (void *)filter_ptr;
convArgs.filter_num = filter->dims()[0];
convArgs.group_num = param->Groups();
convArgs.sb_address = (void *)bs_ptr;
convArgs.kernel.stride_h = param->Strides()[0];
convArgs.kernel.stride_w = param->Strides()[1];
convArgs.kernel.height = filter->dims()[2];
convArgs.kernel.width = filter->dims()[3];
convArgs.image.address = (void *)input_ptr;
convArgs.image.channels = input->dims()[1];
convArgs.image.height = input->dims()[2];
convArgs.image.width = input->dims()[3];
convArgs.image.pad_height = param->Paddings()[0];
convArgs.image.pad_width = param->Paddings()[1];
convArgs.image.scale_address = input->scale;
convArgs.output.address = (void *)out_ptr;
fpga::WrapperConvArgs convArgs;
convArgs.group_num = (uint32_t)param->Groups();
convArgs.split_num = (uint32_t)fpga::get_plit_num(filter);
convArgs.filter_num = (uint32_t)filter->dims()[0];
convArgs.output.address = out_ptr;
convArgs.output.scale_address = out->scale;
convArgs.conv_args = (fpga::ConvArgs *)fpga::fpga_malloc(
convArgs.split_num * sizeof(fpga::ConvArgs));
param->SetFpgaArgs(convArgs);
int element_num = fpga::get_aligned_filter_element_num(
filter->dims()[1] * filter->dims()[2] * filter->dims()[3]);
int n = convArgs.split_num;
for (int i = 0; i < n; i++) {
convArgs.conv_args[i].relu_enabled = relu_enabled;
convArgs.conv_args[i].group_num = (uint32_t)param->Groups();
convArgs.conv_args[i].kernel.stride_h = (uint32_t)param->Strides()[0];
convArgs.conv_args[i].kernel.stride_w = (uint32_t)param->Strides()[1];
convArgs.conv_args[i].kernel.height = (uint32_t)filter->dims()[2];
convArgs.conv_args[i].kernel.width = (uint32_t)filter->dims()[3];
convArgs.conv_args[i].image.address = input_ptr;
convArgs.conv_args[i].image.channels = (uint32_t)input->dims()[1];
convArgs.conv_args[i].image.height = (uint32_t)input->dims()[2];
convArgs.conv_args[i].image.width = (uint32_t)input->dims()[3];
convArgs.conv_args[i].image.pad_height = (uint32_t)param->Paddings()[0];
convArgs.conv_args[i].image.pad_width = (uint32_t)param->Paddings()[1];
convArgs.conv_args[i].filter_address =
&((int8_t *)filter_ptr)[i * element_num];
convArgs.conv_args[i].sb_address = &((int8_t *)bs_ptr)[i * element_num];
convArgs.conv_args[i].filter_num =
(uint32_t)(i == n - 1 ? fpga::get_aligned_filter_num(
channel - (n - 1) * element_num_per_div)
: element_num_per_div);
convArgs.conv_args[i].output.scale_address =
(float *)fpga::fpga_malloc(2 * sizeof(float));
convArgs.conv_args[i].image.scale_address = input->scale;
}
return true;
}
......
......@@ -22,10 +22,10 @@ namespace operators {
template <>
bool ConvBNReluKernel<FPGA, float>::Init(FusionConvBNReluParam<FPGA> *param) {
bool relu_enabled = true;
Tensor *input = const_cast<Tensor *>(param->Input());
auto input = const_cast<Tensor *>(param->Input());
auto input_ptr = input->data<float>();
Tensor *filter = param->Filter();
Tensor *out = param->Output();
auto filter = const_cast<Tensor *>(param->Filter());
auto out = param->Output();
auto bn_mean_ptr = param->InputMean()->data<float>();
auto bn_var_ptr = param->InputVariance()->data<float>();
auto bn_scale_ptr = param->InputScale()->data<float>();
......@@ -34,9 +34,9 @@ bool ConvBNReluKernel<FPGA, float>::Init(FusionConvBNReluParam<FPGA> *param) {
PADDLE_MOBILE_ENFORCE(out->dims()[1] == param->InputBias()->dims()[0],
"Output channel should be equal to bias number");
const int channel = out->dims()[1];
float *bs_ptr = (float *)fpga::fpga_malloc(2 * channel * sizeof(float));
Tensor *new_scale = new Tensor();
Tensor *new_bias = new Tensor();
auto bs_ptr = (float *)fpga::fpga_malloc(2 * channel * sizeof(float));
auto new_scale = new Tensor();
auto new_bias = new Tensor();
auto new_scale_ptr = new_scale->mutable_data<float>({channel});
auto new_bias_ptr = new_bias->mutable_data<float>({channel});
......@@ -61,26 +61,44 @@ bool ConvBNReluKernel<FPGA, float>::Init(FusionConvBNReluParam<FPGA> *param) {
fpga::format_ofm(out);
auto out_ptr = out->mutable_data<float>();
fpga::ConvArgs convArgs;
convArgs.relu_enabled = relu_enabled;
convArgs.filter_address = (void *)filter_ptr;
convArgs.filter_num = filter->dims()[0];
convArgs.group_num = param->Groups();
convArgs.sb_address = (void *)bs_ptr;
convArgs.kernel.stride_h = param->Strides()[0];
convArgs.kernel.stride_w = param->Strides()[1];
convArgs.kernel.height = filter->dims()[2];
convArgs.kernel.width = filter->dims()[3];
convArgs.image.address = (void *)input_ptr;
convArgs.image.channels = input->dims()[1];
convArgs.image.height = input->dims()[2];
convArgs.image.width = input->dims()[3];
convArgs.image.pad_height = param->Paddings()[0];
convArgs.image.pad_width = param->Paddings()[1];
convArgs.image.scale_address = input->scale;
convArgs.output.address = (void *)out_ptr;
fpga::WrapperConvArgs convArgs;
convArgs.group_num = (uint32_t)param->Groups();
convArgs.split_num = (uint32_t)fpga::get_plit_num(filter);
convArgs.filter_num = (uint32_t)filter->dims()[0];
convArgs.output.address = out_ptr;
convArgs.output.scale_address = out->scale;
convArgs.conv_args = (fpga::ConvArgs *)fpga::fpga_malloc(
convArgs.split_num * sizeof(fpga::ConvArgs));
param->SetFpgaArgs(convArgs);
int element_num = fpga::get_aligned_filter_element_num(
filter->dims()[1] * filter->dims()[2] * filter->dims()[3]);
int n = convArgs.split_num;
for (int i = 0; i < n; i++) {
convArgs.conv_args[i].relu_enabled = relu_enabled;
convArgs.conv_args[i].group_num = (uint32_t)param->Groups();
convArgs.conv_args[i].kernel.stride_h = (uint32_t)param->Strides()[0];
convArgs.conv_args[i].kernel.stride_w = (uint32_t)param->Strides()[1];
convArgs.conv_args[i].kernel.height = (uint32_t)filter->dims()[2];
convArgs.conv_args[i].kernel.width = (uint32_t)filter->dims()[3];
convArgs.conv_args[i].image.address = input_ptr;
convArgs.conv_args[i].image.channels = (uint32_t)input->dims()[1];
convArgs.conv_args[i].image.height = (uint32_t)input->dims()[2];
convArgs.conv_args[i].image.width = (uint32_t)input->dims()[3];
convArgs.conv_args[i].image.pad_height = (uint32_t)param->Paddings()[0];
convArgs.conv_args[i].image.pad_width = (uint32_t)param->Paddings()[1];
convArgs.conv_args[i].filter_address =
&((int8_t *)filter_ptr)[i * element_num];
convArgs.conv_args[i].sb_address = &((int8_t *)bs_ptr)[i * element_num];
convArgs.conv_args[i].filter_num =
(uint32_t)(i == n - 1 ? fpga::get_aligned_filter_num(
channel - (n - 1) * element_num_per_div)
: element_num_per_div);
convArgs.conv_args[i].output.scale_address =
(float *)fpga::fpga_malloc(2 * sizeof(float));
convArgs.conv_args[i].image.scale_address = input->scale;
}
return true;
}
......
......@@ -22,9 +22,9 @@ template <>
bool ElementwiseAddReluKernel<FPGA, float>::Init(
ElementwiseAddReluParam<FPGA> *param) {
bool relu_enabled = true;
Tensor *input_x = const_cast<Tensor *>(param->InputX());
Tensor *input_y = const_cast<Tensor *>(param->InputY());
Tensor *out = param->Out();
auto *input_x = const_cast<LoDTensor *>(param->InputX());
auto *input_y = const_cast<LoDTensor *>(param->InputY());
auto *out = param->Out();
auto input_x_ptr = input_x->data<float>();
auto input_y_ptr = input_y->data<float>();
fpga::format_ofm(out);
......@@ -34,22 +34,22 @@ bool ElementwiseAddReluKernel<FPGA, float>::Init(
ewaddArgs.relu_enabled = relu_enabled;
ewaddArgs.const0 = 1;
ewaddArgs.const1 = 1;
ewaddArgs.image0.address = (void *)input_x_ptr;
ewaddArgs.image0.channels = input_x->dims()[1];
ewaddArgs.image0.address = input_x_ptr;
ewaddArgs.image0.channels = (uint32_t)input_x->dims()[1];
ewaddArgs.image0.scale_address = input_x->scale;
ewaddArgs.image0.height = input_x->dims()[2];
ewaddArgs.image0.width = input_x->dims()[3];
ewaddArgs.image0.height = (uint32_t)input_x->dims()[2];
ewaddArgs.image0.width = (uint32_t)input_x->dims()[3];
ewaddArgs.image0.pad_height = 0;
ewaddArgs.image0.pad_width = 0;
ewaddArgs.image1.address = (void *)input_y_ptr;
ewaddArgs.image1.channels = input_y->dims()[1];
ewaddArgs.image1.address = input_y_ptr;
ewaddArgs.image1.channels = (uint32_t)input_y->dims()[1];
ewaddArgs.image1.scale_address = input_y->scale;
ewaddArgs.image1.height = input_y->dims()[2];
ewaddArgs.image1.width = input_y->dims()[3];
ewaddArgs.image1.height = (uint32_t)input_y->dims()[2];
ewaddArgs.image1.width = (uint32_t)input_y->dims()[3];
ewaddArgs.image1.pad_height = 0;
ewaddArgs.image1.pad_width = 0;
ewaddArgs.output.scale_address = out->scale;
ewaddArgs.output.address = (void *)out_ptr;
ewaddArgs.output.address = out_ptr;
param->SetFpgaArgs(ewaddArgs);
return true;
}
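// EWAddArgs presumably computes const0 * image0 + const1 * image1 on the
// FPGA; with both constants fixed to 1 this is a plain elementwise add with
// fused ReLU.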
......
......@@ -14,71 +14,84 @@ limitations under the License. */
#ifdef FUSION_FCRELU_OP
#include "operators/kernel/fc_relu_kernel.h"
#include "fpga/api.h"
namespace paddle_mobile {
namespace operators {
template <>
bool FusionFcReluKernel<FPGA, float>::Init(FusionFcReluParam<FPGA> *param) {
bool relu_enabled = true;
Tensor *input_x = const_cast<Tensor *>(param->InputX());
auto input_x = const_cast<LoDTensor *>(param->InputX());
auto input_x_ptr = input_x->data<float>();
Tensor *input_y = param->InputY();
const Tensor *input_z = param->InputZ();
auto filter = const_cast<Tensor *>(param->InputY());
auto input_z = param->InputZ();
auto input_z_ptr = input_z->data<float>();
Tensor *out = param->Out();
PADDLE_MOBILE_ENFORCE(input_x->dims()[1] == input_y->dims()[0],
auto out = param->Out();
PADDLE_MOBILE_ENFORCE(input_x->dims()[1] == filter->dims()[0],
"Image channel should be equal to weight number");
int channel = out->dims()[1];
float *bs_ptr = (float *)fpga::fpga_malloc(2 * channel * sizeof(float));
int channel = (uint32_t)out->dims()[1];
auto bs_ptr = (float *)fpga::fpga_malloc(2 * channel * sizeof(float));
for (int i = 0; i < channel; i++) {
bs_ptr[i + channel] = 1;
bs_ptr[i] = input_z_ptr[i];
}
int num = input_y->dims()[1];
int chw = input_y->dims()[0];
int num = (uint32_t)filter->dims()[1];
int chw = (uint32_t)filter->dims()[0];
PADDLE_MOBILE_ENFORCE(
chw == input_x->numel(),
"Filter element num should be equal to IFM element num");
int height = input_x->dims()[2];
int width = input_x->dims()[3];
int height = (uint32_t)input_x->dims()[2];
int width = (uint32_t)input_x->dims()[3];
int filter_channel = chw / height / width;
input_y->Resize(framework::make_ddim({num, filter_channel, height, width}));
float max_value = fpga::filter_find_max(input_y);
fpga::format_filter(input_y, max_value, 1);
auto input_y_ptr = input_y->data<float>();
filter->Resize(framework::make_ddim({num, filter_channel, height, width}));
float max_value = fpga::filter_find_max(filter);
fpga::format_filter(filter, max_value, 1);
auto filter_ptr = filter->data<float>();
int element_num_per_div = fpga::get_element_num_per_div(input_y, 1);
int element_num_per_div = fpga::get_element_num_per_div(filter, 1);
fpga::format_bias_scale_array(&bs_ptr, element_num_per_div, channel);
fpga::format_ofm(out);
auto out_ptr = out->mutable_data<float>();
fpga::ConvArgs convArgs;
convArgs.relu_enabled = relu_enabled;
convArgs.filter_address = (void *)input_y_ptr;
convArgs.filter_num = out->dims()[1];
fpga::WrapperConvArgs convArgs;
convArgs.group_num = 1;
convArgs.sb_address = (void *)bs_ptr;
convArgs.kernel.stride_w = 1;
convArgs.kernel.stride_h = 1;
convArgs.kernel.height = input_x->dims()[2];
convArgs.kernel.width = input_x->dims()[3];
convArgs.image.address = (void *)input_x_ptr;
convArgs.image.channels = input_x->dims()[1];
convArgs.image.height = input_x->dims()[2];
convArgs.image.width = input_x->dims()[3];
convArgs.image.pad_height = 0;
convArgs.image.pad_width = 0;
convArgs.image.scale_address = input_x->scale;
convArgs.output.address = (void *)out_ptr;
convArgs.split_num = (uint32_t)fpga::get_plit_num(filter);
convArgs.filter_num = (uint32_t)filter->dims()[0];
convArgs.output.address = out_ptr;
convArgs.output.scale_address = out->scale;
convArgs.conv_args = (fpga::ConvArgs *)fpga::fpga_malloc(
convArgs.split_num * sizeof(fpga::ConvArgs));
param->SetFpgaArgs(convArgs);
int element_num = fpga::get_aligned_filter_element_num(
filter->dims()[1] * filter->dims()[2] * filter->dims()[3]);
int n = convArgs.split_num;
for (int i = 0; i < n; i++) {
convArgs.conv_args[i].relu_enabled = relu_enabled;
convArgs.conv_args[i].group_num = 1;
convArgs.conv_args[i].kernel.stride_h = 1;
convArgs.conv_args[i].kernel.stride_w = 1;
convArgs.conv_args[i].kernel.height = (uint32_t)filter->dims()[2];
convArgs.conv_args[i].kernel.width = (uint32_t)filter->dims()[3];
convArgs.conv_args[i].image.address = input_x_ptr;
convArgs.conv_args[i].image.channels = (uint32_t)input_x->dims()[1];
convArgs.conv_args[i].image.height = (uint32_t)input_x->dims()[2];
convArgs.conv_args[i].image.width = (uint32_t)input_x->dims()[3];
convArgs.conv_args[i].image.pad_height = 0;
convArgs.conv_args[i].image.pad_width = 0;
convArgs.conv_args[i].filter_address =
&((int8_t *)filter_ptr)[i * element_num];
convArgs.conv_args[i].sb_address = &((int8_t *)bs_ptr)[i * element_num];
convArgs.conv_args[i].filter_num =
(uint32_t)(i == n - 1 ? fpga::get_aligned_filter_num(
channel - (n - 1) * element_num_per_div)
: element_num_per_div);
convArgs.conv_args[i].output.scale_address =
(float *)fpga::fpga_malloc(2 * sizeof(float));
convArgs.conv_args[i].image.scale_address = input_x->scale;
}
return true;
}
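// The FC weight matrix was reshaped to {num, chw / (h * w), h, w} above so
// the fully connected layer runs on the FPGA conv path: a kernel spanning
// the whole input feature map yields one output value per filter, i.e. a
// matrix-vector product.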
template <>
......
......@@ -21,58 +21,78 @@ namespace operators {
template <>
bool FusionFcKernel<FPGA, float>::Init(FusionFcParam<FPGA> *param) {
bool relu_enabled = false;
Tensor *input_x = const_cast<Tensor *>(param->InputX());
auto input_x = const_cast<LoDTensor *>(param->InputX());
auto input_x_ptr = input_x->data<float>();
Tensor *input_y = param->InputY();
auto filter = const_cast<Tensor *>(param->InputY());
const Tensor *input_z = param->InputZ();
auto input_z_ptr = input_z->data<float>();
Tensor *out = param->Out();
auto out = param->Out();
PADDLE_MOBILE_ENFORCE(input_x->dims()[1] == input_y->dims()[0],
PADDLE_MOBILE_ENFORCE(input_x->dims()[1] == filter->dims()[0],
"Image channel should be equal to weight number");
int channel = out->dims()[1];
float *bs_ptr = (float *)fpga::fpga_malloc(2 * channel * sizeof(float));
int channel = (uint32_t)out->dims()[1];
auto bs_ptr = (float *)fpga::fpga_malloc(2 * channel * sizeof(float));
for (int i = 0; i < channel; i++) {
bs_ptr[i + channel] = 1;
bs_ptr[i] = input_z_ptr[i];
}
int num = input_y->dims()[1];
int chw = input_y->dims()[0];
int num = (uint32_t)filter->dims()[1];
int chw = (uint32_t)filter->dims()[0];
PADDLE_MOBILE_ENFORCE(
chw == input_x->numel(),
"Filter element num should be equal to IFM element num");
int height = input_x->dims()[2];
int width = input_x->dims()[3];
int height = (uint32_t)input_x->dims()[2];
int width = (uint32_t)input_x->dims()[3];
int filter_channel = chw / height / width;
input_y->Resize(framework::make_ddim({num, filter_channel, height, width}));
float max_value = fpga::filter_find_max(input_y);
fpga::format_filter(input_y, max_value, 1);
auto input_y_ptr = input_y->data<float>();
int element_num_per_div = fpga::get_element_num_per_div(input_y, 1);
filter->Resize(framework::make_ddim({num, filter_channel, height, width}));
float max_value = fpga::filter_find_max(filter);
fpga::format_filter(filter, max_value, 1);
auto filter_ptr = filter->data<float>();
int element_num_per_div = fpga::get_element_num_per_div(filter, 1);
fpga::format_bias_scale_array(&bs_ptr, element_num_per_div, channel);
auto out_ptr = out->mutable_data<float>();
fpga::ConvArgs convArgs;
convArgs.relu_enabled = relu_enabled;
convArgs.filter_address = (void *)input_y_ptr;
convArgs.filter_num = out->dims()[1];
fpga::WrapperConvArgs convArgs;
convArgs.group_num = 1;
convArgs.sb_address = (void *)bs_ptr;
convArgs.kernel.stride_w = 1;
convArgs.kernel.stride_h = 1;
convArgs.kernel.height = input_x->dims()[2];
convArgs.kernel.width = input_x->dims()[3];
convArgs.image.address = (void *)input_x_ptr;
convArgs.image.channels = input_x->dims()[1];
convArgs.image.height = input_x->dims()[2];
convArgs.image.width = input_x->dims()[3];
convArgs.image.pad_height = 0;
convArgs.image.pad_width = 0;
convArgs.image.scale_address = input_x->scale;
convArgs.output.address = (void *)out_ptr;
convArgs.split_num = (uint32_t)fpga::get_plit_num(filter);
convArgs.filter_num = (uint32_t)filter->dims()[0];
convArgs.output.address = out_ptr;
convArgs.output.scale_address = out->scale;
convArgs.conv_args = (fpga::ConvArgs *)fpga::fpga_malloc(
convArgs.split_num * sizeof(fpga::ConvArgs));
param->SetFpgaArgs(convArgs);
int element_num = fpga::get_aligned_filter_element_num(
filter->dims()[1] * filter->dims()[2] * filter->dims()[3]);
int n = convArgs.split_num;
for (int i = 0; i < n; i++) {
convArgs.conv_args[i].relu_enabled = relu_enabled;
convArgs.conv_args[i].group_num = 1;
convArgs.conv_args[i].kernel.stride_h = 1;
convArgs.conv_args[i].kernel.stride_w = 1;
convArgs.conv_args[i].kernel.height = (uint32_t)filter->dims()[2];
convArgs.conv_args[i].kernel.width = (uint32_t)filter->dims()[3];
convArgs.conv_args[i].image.address = input_x_ptr;
convArgs.conv_args[i].image.channels = (uint32_t)input_x->dims()[1];
convArgs.conv_args[i].image.height = (uint32_t)input_x->dims()[2];
convArgs.conv_args[i].image.width = (uint32_t)input_x->dims()[3];
convArgs.conv_args[i].image.pad_height = 0;
convArgs.conv_args[i].image.pad_width = 0;
convArgs.conv_args[i].filter_address =
&((int8_t *)filter_ptr)[i * element_num];
convArgs.conv_args[i].sb_address = &((int8_t *)bs_ptr)[i * element_num];
convArgs.conv_args[i].filter_num =
(uint32_t)(i == n - 1 ? fpga::get_aligned_filter_num(
channel - (n - 1) * element_num_per_div)
: element_num_per_div);
convArgs.conv_args[i].output.scale_address =
(float *)fpga::fpga_malloc(2 * sizeof(float));
convArgs.conv_args[i].image.scale_address = input_x->scale;
}
return true;
}
......
......@@ -21,7 +21,7 @@ namespace operators {
template <>
bool PoolKernel<FPGA, float>::Init(PoolParam<FPGA> *param) {
Tensor *input = const_cast<Tensor *>(param->Input());
auto *input = const_cast<Tensor *>(param->Input());
auto input_ptr = input->data<float>();
Tensor *output = param->Output();
fpga::format_ofm(output);
......@@ -31,19 +31,19 @@ bool PoolKernel<FPGA, float>::Init(PoolParam<FPGA> *param) {
vector<int> paddings = param->Paddings();
fpga::PoolingArgs poolArgs;
poolArgs.image.address = (void *)input_ptr;
poolArgs.image.channels = input->dims()[1];
poolArgs.image.height = input->dims()[2];
poolArgs.image.width = input->dims()[3];
poolArgs.image.pad_height = paddings[0];
poolArgs.image.pad_width = paddings[1];
poolArgs.image.address = input_ptr;
poolArgs.image.channels = (uint32_t)input->dims()[1];
poolArgs.image.height = (uint32_t)input->dims()[2];
poolArgs.image.width = (uint32_t)input->dims()[3];
poolArgs.image.pad_height = (uint32_t)paddings[0];
poolArgs.image.pad_width = (uint32_t)paddings[1];
poolArgs.image.scale_address = input->scale;
poolArgs.output.address = output_ptr;
poolArgs.output.scale_address = input->scale;
poolArgs.kernel.height = ksize[0];
poolArgs.kernel.width = ksize[1];
poolArgs.kernel.stride_h = strides[0];
poolArgs.kernel.stride_w = strides[1];
poolArgs.kernel.height = (uint32_t)ksize[0];
poolArgs.kernel.width = (uint32_t)ksize[1];
poolArgs.kernel.stride_h = (uint32_t)strides[0];
poolArgs.kernel.stride_w = (uint32_t)strides[1];
param->SetFpgaArgs(poolArgs);
return true;
}
......
......@@ -33,8 +33,8 @@ bool SoftmaxKernel<FPGA, float>::Init(SoftmaxParam<FPGA> *param) {
args.convert_type = fpga::DATA_FP16_TO_FP32;
args.layout_type = fpga::LAYOUT_NO_CONVERT;
args.image.address = (void *)(input_ptr);
args.image.height = input->dims()[0];
args.image.width = input->dims()[1];
args.image.height = (uint32_t)input->dims()[0];
args.image.width = (uint32_t)input->dims()[1];
args.image.channels = 1;
args.output.address = output_ptr;
param->SetFpgaArgs(args);
......
......@@ -13,7 +13,7 @@ See the License for the specific language governing permissions and
limitations under the License. */
#include "operators/math/gemm.h"
#include <string>
#include <string.h>
#include "common/log.h"
#include "memory/t_malloc.h"
#if __ARM_NEON
......@@ -2985,6 +2985,8 @@ void WriteWithBn(int mc, int nc, float *c, float *C, int ldc, float *new_scale,
void WriteWithBnRelu(int mc, int nc, float *c, float *C, int ldc,
float *new_scale, float *new_bias) {}
void WriteWithBnAddRelu(int mc, int nc, float *c, float *C, int ldc,
float *new_scale, float *new_bias, float *bias1) {}
#endif // __ARM_NEON
......
......@@ -489,6 +489,15 @@ class ConcatParam : public OpParam {
vector<GType *> inputs_;
GType *out_;
int axis_;
#ifdef PADDLE_MOBILE_FPGA
private:
fpga::ConcatArgs fpga_concat_args;
public:
const fpga::ConcatArgs &FpgaArgs() const { return fpga_concat_args; }
void SetFpgaArgs(const fpga::ConcatArgs &args) { fpga_concat_args = args; }
#endif
};
#endif
......@@ -1238,11 +1247,7 @@ class FusionFcParam : public OpParam {
}
const GType *InputX() const { return input_x_; }
#ifdef PADDLE_MOBILE_FPGA
RType *InputY() const { return input_y_; }
#else
const RType *InputY() const { return input_y_; }
#endif
const RType *InputZ() const { return input_z_; }
......@@ -1265,11 +1270,11 @@ class FusionFcParam : public OpParam {
#ifdef PADDLE_MOBILE_FPGA
private:
fpga::ConvArgs fpga_conv_args;
fpga::WrapperConvArgs fpga_conv_args;
public:
const fpga::ConvArgs &FpgaArgs() const { return fpga_conv_args; }
void SetFpgaArgs(const fpga::ConvArgs &args) { fpga_conv_args = args; }
const fpga::WrapperConvArgs &FpgaArgs() const { return fpga_conv_args; }
void SetFpgaArgs(const fpga::WrapperConvArgs &args) { fpga_conv_args = args; }
#endif
};
......@@ -1303,11 +1308,7 @@ class FusionConvAddParam : public OpParam {
const RType *Input() const { return input_; }
#ifdef PADDLE_MOBILE_FPGA
RType *Filter() const { return filter_; }
#else
const RType *Filter() const { return filter_; }
#endif
RType *Output() const { return output_; }
......@@ -1332,11 +1333,11 @@ class FusionConvAddParam : public OpParam {
#ifdef PADDLE_MOBILE_FPGA
private:
fpga::ConvArgs fpga_conv_args;
fpga::WrapperConvArgs fpga_conv_args;
public:
const fpga::ConvArgs &FpgaArgs() const { return fpga_conv_args; }
void SetFpgaArgs(const fpga::ConvArgs &args) { fpga_conv_args = args; }
const fpga::WrapperConvArgs &FpgaArgs() const { return fpga_conv_args; }
void SetFpgaArgs(const fpga::WrapperConvArgs &args) { fpga_conv_args = args; }
#endif
};
......@@ -1385,11 +1386,7 @@ class FusionConvAddPReluParam : public OpParam {
const RType *Input() const { return input_; }
#ifdef PADDLE_MOBILE_FPGA
RType *Filter() const { return filter_; }
#else
const RType *Filter() const { return filter_; }
#endif
RType *Output() const { return output_; }
......@@ -1416,11 +1413,11 @@ class FusionConvAddPReluParam : public OpParam {
#ifdef PADDLE_MOBILE_FPGA
private:
fpga::ConvArgs fpga_conv_args;
fpga::WrapperConvArgs fpga_conv_args;
public:
const fpga::ConvArgs &FpgaArgs() const { return fpga_conv_args; }
void SetFpgaArgs(const fpga::ConvArgs &args) { fpga_conv_args = args; }
const fpga::WrapperConvArgs &FpgaArgs() const { return fpga_conv_args; }
void SetFpgaArgs(const fpga::WrapperConvArgs &args) { fpga_conv_args = args; }
#endif
};
#endif
......@@ -1467,11 +1464,7 @@ class FusionConvAddAddPReluParam : public OpParam {
const RType *Input() const { return input_; }
#ifdef PADDLE_MOBILE_FPGA
RType *Filter() const { return filter_; }
#else
const RType *Filter() const { return filter_; }
#endif
RType *Output() const { return output_; }
......@@ -1502,11 +1495,11 @@ class FusionConvAddAddPReluParam : public OpParam {
#ifdef PADDLE_MOBILE_FPGA
private:
fpga::ConvArgs fpga_conv_args;
fpga::WrapperConvArgs fpga_conv_args;
public:
const fpga::ConvArgs &FpgaArgs() const { return fpga_conv_args; }
void SetFpgaArgs(const fpga::ConvArgs &args) { fpga_conv_args = args; }
const fpga::WrapperConvArgs &FpgaArgs() const { return fpga_conv_args; }
void SetFpgaArgs(const fpga::WrapperConvArgs &args) { fpga_conv_args = args; }
#endif
};
#endif
......@@ -1544,11 +1537,7 @@ class FusionConvAddBNReluParam : public OpParam {
const RType *Input() const { return input_; }
#ifdef PADDLE_MOBILE_FPGA
RType *Filter() const { return filter_; }
#else
const RType *Filter() const { return filter_; }
#endif
RType *Output() const { return output_; }
......@@ -1604,11 +1593,11 @@ class FusionConvAddBNReluParam : public OpParam {
#ifdef PADDLE_MOBILE_FPGA
private:
fpga::ConvArgs fpga_conv_args;
fpga::WrapperConvArgs fpga_conv_args;
public:
const fpga::ConvArgs &FpgaArgs() const { return fpga_conv_args; }
void SetFpgaArgs(const fpga::ConvArgs &args) { fpga_conv_args = args; }
const fpga::WrapperConvArgs &FpgaArgs() const { return fpga_conv_args; }
void SetFpgaArgs(const fpga::WrapperConvArgs &args) { fpga_conv_args = args; }
#endif
};
#endif
......@@ -1654,11 +1643,7 @@ class FusionConvBNAddReluParam : public OpParam {
const RType *Input() const { return input_; }
#ifdef PADDLE_MOBILE_FPGA
RType *Filter() const { return filter_; }
#else
const RType *Filter() const { return filter_; }
#endif
RType *Output() const { return output_; }
......@@ -1717,11 +1702,11 @@ class FusionConvBNAddReluParam : public OpParam {
#ifdef PADDLE_MOBILE_FPGA
private:
fpga::ConvArgs fpga_conv_args;
fpga::WrapperConvArgs fpga_conv_args;
public:
const fpga::ConvArgs &FpgaArgs() const { return fpga_conv_args; }
void SetFpgaArgs(const fpga::ConvArgs &args) { fpga_conv_args = args; }
const fpga::WrapperConvArgs &FpgaArgs() const { return fpga_conv_args; }
void SetFpgaArgs(const fpga::WrapperConvArgs &args) { fpga_conv_args = args; }
#endif
};
#endif
......@@ -1754,11 +1739,8 @@ class FusionConvBNParam : public OpParam {
const RType *Input() const { return input_; }
#ifdef PADDLE_MOBILE_FPGA
RType *Filter() const { return filter_; }
#else
const RType *Filter() const { return filter_; }
#endif
RType *Output() const { return output_y_; }
const vector<int> &Strides() const { return strides_; }
......@@ -1811,11 +1793,11 @@ class FusionConvBNParam : public OpParam {
#ifdef PADDLE_MOBILE_FPGA
private:
fpga::ConvArgs fpga_conv_args;
fpga::WrapperConvArgs fpga_conv_args;
public:
const fpga::ConvArgs &FpgaArgs() const { return fpga_conv_args; }
void SetFpgaArgs(const fpga::ConvArgs &args) { fpga_conv_args = args; }
const fpga::WrapperConvArgs &FpgaArgs() const { return fpga_conv_args; }
void SetFpgaArgs(const fpga::WrapperConvArgs &args) { fpga_conv_args = args; }
#endif
};
#endif
......@@ -1853,11 +1835,8 @@ class FusionConvAddBNParam : public OpParam {
const RType *Input() const { return input_; }
#ifdef PADDLE_MOBILE_FPGA
RType *Filter() const { return filter_; }
#else
const RType *Filter() const { return filter_; }
#endif
RType *Output() const { return output_y_; }
const vector<int> &Strides() const { return strides_; }
......@@ -1912,11 +1891,11 @@ class FusionConvAddBNParam : public OpParam {
#ifdef PADDLE_MOBILE_FPGA
private:
fpga::ConvArgs fpga_conv_args;
fpga::WrapperConvArgs fpga_conv_args;
public:
const fpga::ConvArgs &FpgaArgs() const { return fpga_conv_args; }
void SetFpgaArgs(const fpga::ConvArgs &args) { fpga_conv_args = args; }
const fpga::WrapperConvArgs &FpgaArgs() const { return fpga_conv_args; }
void SetFpgaArgs(const fpga::WrapperConvArgs &args) { fpga_conv_args = args; }
#endif
};
#endif
......@@ -2033,11 +2012,7 @@ class FusionConvBNReluParam : public OpParam {
const RType *Input() const { return input_; }
#ifdef PADDLE_MOBILE_FPGA
RType *Filter() const { return filter_; }
#else
const RType *Filter() const { return filter_; }
#endif
RType *Output() const { return output_; }
......@@ -2091,11 +2066,11 @@ class FusionConvBNReluParam : public OpParam {
#ifdef PADDLE_MOBILE_FPGA
private:
fpga::ConvArgs fpga_conv_args;
fpga::WrapperConvArgs fpga_conv_args;
public:
const fpga::ConvArgs &FpgaArgs() const { return fpga_conv_args; }
void SetFpgaArgs(const fpga::ConvArgs &args) { fpga_conv_args = args; }
const fpga::WrapperConvArgs &FpgaArgs() const { return fpga_conv_args; }
void SetFpgaArgs(const fpga::WrapperConvArgs &args) { fpga_conv_args = args; }
#endif
};
#endif
......@@ -2147,15 +2122,20 @@ class DropoutParam : public OpParam {
const AttributeMap &attrs, const Scope &scope) {
input_x_ = InputXFrom<GType>(inputs, scope);
out_ = OutFrom<GType>(outputs, scope);
dropout_prob_ = GetAttr<float>("dropout_prob", attrs);
}
const RType *InputX() const { return input_x_; }
RType *Out() const { return out_; }
float DropoutProb() const { return dropout_prob_; }
private:
RType *input_x_;
RType *out_;
float dropout_prob_;
};
#endif
......
......@@ -208,6 +208,14 @@ else ()
target_link_libraries(test-gru-op paddle-mobile)
# gen test
ADD_EXECUTABLE(test-inceptionv4 net/test_inceptionv4.cpp test_helper.h test_include.h executor_for_test.h)
target_link_libraries(test-inceptionv4 paddle-mobile)
# gen test
ADD_EXECUTABLE(test-alexnet net/test_alexnet.cpp test_helper.h test_include.h executor_for_test.h)
target_link_libraries(test-alexnet paddle-mobile)
ADD_EXECUTABLE(test-googlenetv1 net/test_googlenetv1_combine.cpp test_helper.h test_include.h)
target_link_libraries(test-googlenetv1 paddle-mobile)
......@@ -215,10 +223,13 @@ else ()
ADD_EXECUTABLE(test-fssd net/test_mobilenet_025_fssd.cpp test_helper.h test_include.h)
target_link_libraries(test-fssd paddle-mobile)
#add_library(test-lib-size SHARED common/test_lib_size.h common/test_lib_size.cpp)
endif()
# if(FPGA)
......
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <iostream>
#include "../test_helper.h"
#include "../test_include.h"
int main() {
paddle_mobile::PaddleMobile<paddle_mobile::CPU> paddle_mobile;
paddle_mobile.SetThreadNum(4);
auto time1 = time();
// auto isok = paddle_mobile.Load(std::string(g_mobilenet_detect) + "/model",
// std::string(g_mobilenet_detect) + "/params", true);
auto isok = paddle_mobile.Load(g_alexnet, true);
if (isok) {
auto time2 = time();
std::cout << "load cost :" << time_diff(time1, time1) << "ms" << std::endl;
std::vector<float> input;
std::vector<int64_t> dims{1, 3, 224, 224};
GetInput<float>(g_test_image_1x3x224x224_banana, &input, dims);
auto vec_result = paddle_mobile.Predict(input, dims);
std::vector<float>::iterator biggest =
std::max_element(std::begin(vec_result), std::end(vec_result));
std::cout << " Max element is " << *biggest << " at position "
<< std::distance(std::begin(vec_result), biggest) << std::endl;
// Warm up: run prediction ten times first
for (int i = 0; i < 10; ++i) {
auto vec_result = paddle_mobile.Predict(input, dims);
}
auto time3 = time();
for (int i = 0; i < 10; ++i) {
auto vec_result = paddle_mobile.Predict(input, dims);
}
DLOG << vec_result;
auto time4 = time();
std::cout << "predict cost :" << time_diff(time3, time4) / 10 << "ms"
<< std::endl;
}
std::cout << "如果结果Nan请查看: test/images/g_test_image_1x3x224x224_banana "
"是否存在?"
<< std::endl;
return 0;
}
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <iostream>
#include "../test_helper.h"
#include "../test_include.h"
int main() {
paddle_mobile::PaddleMobile<paddle_mobile::CPU> paddle_mobile;
paddle_mobile.SetThreadNum(4);
auto time1 = time();
// auto isok = paddle_mobile.Load(std::string(g_mobilenet_detect) + "/model",
// std::string(g_mobilenet_detect) + "/params", true);
auto isok = paddle_mobile.Load(g_inceptionv4, true);
if (isok) {
auto time2 = time();
std::cout << "load cost :" << time_diff(time1, time1) << "ms" << std::endl;
std::vector<float> input;
std::vector<int64_t> dims{1, 3, 224, 224};
GetInput<float>(g_test_image_1x3x224x224_banana, &input, dims);
auto vec_result = paddle_mobile.Predict(input, dims);
std::vector<float>::iterator biggest =
std::max_element(std::begin(vec_result), std::end(vec_result));
std::cout << " Max element is " << *biggest << " at position "
<< std::distance(std::begin(vec_result), biggest) << std::endl;
// Warm up: run prediction ten times first
for (int i = 0; i < 10; ++i) {
auto vec_result = paddle_mobile.Predict(input, dims);
}
auto time3 = time();
for (int i = 0; i < 10; ++i) {
auto vec_result = paddle_mobile.Predict(input, dims);
}
// DLOG << vec_result;
auto time4 = time();
std::cout << "predict cost :" << time_diff(time3, time4) / 10 << "ms"
<< std::endl;
}
std::cout << "如果结果Nan请查看: test/images/g_test_image_1x3x224x224_banana "
"是否存在?"
<< std::endl;
return 0;
}
......@@ -34,6 +34,8 @@ static const char *g_mobilenet_detect = "../models/mobilenet-detect";
static const char *g_squeezenet = "../models/squeezenet";
static const char *g_googlenet = "../models/googlenet";
static const char *g_mobilenet = "../models/mobilenet";
static const char *g_alexnet = "../models/alexnet";
static const char *g_inceptionv4 = "../models/inceptionv4";
static const char *g_nlp = "../models/nlp";
static const char *g_resnet_50 = "../models/resnet_50";
static const char *g_resnet = "../models/resnet";
......