Commit 71f88533 authored by xiebaiyuan

Merge remote-tracking branch 'upstream/develop' into develop

 cmake_minimum_required(VERSION 3.6)
-option(USE_OPENMP "openmp support" ON)
+option(USE_OPENMP "openmp support" OFF)
 project(paddle-mobile)
-option(DEBUGING "enable debug mode" OFF)
+option(DEBUGING "enable debug mode" ON)
 option(USE_EXCEPTION "use std exception" OFF)
 option(LOG_PROFILE "log profile" OFF)
 # select the platform to build
@@ -94,6 +94,8 @@ else()
 endif()
 if(FPGA)
+  set(DEBUGING ON)
+  add_definitions(-DPADDLE_MOBILE_DEBUG)
   add_definitions(-DPADDLE_MOBILE_FPGA)
 else()
   file(GLOB_RECURSE _tmp_list src/operators/kernel/fpga/*.cpp src/operators/kernel/fpga/*.cc)
@@ -140,7 +142,12 @@ set(CMAKE_LIBRARY_OUTPUT_DIRECTORY build)
 set(CMAKE_RUNTIME_OUTPUT_DIRECTORY build)
 # NET default
-set(NET "default" CACHE STRING "select net type")
+if (FPGA)
+  set(NET "FPGAnets" CACHE STRING "select net type")
+else()
+  set(NET "default" CACHE STRING "select net type")
+endif()
 set_property(CACHE NET PROPERTY STRINGS "default" "googlenet" "mobilenet" "yolo" "squeezenet" "FPGAnets" "NLP")
 include("${CMAKE_CURRENT_LIST_DIR}/tools/op.cmake")
......
@@ -68,29 +68,35 @@ void fpga_copy(void *dest, const void *src, size_t num) {
   memcpy(dest, src, num);
 }

-int ComputeFpgaConv(const struct ConvArgs &args) {
+int ComputeFpgaConv(const struct WrapperConvArgs &args) {
 #ifdef FPGA_TEST_MODE
-  DLOG << " relu_enabled:" << args.relu_enabled
-       << " sb_address:" << args.sb_address
-       << " filter_address:" << args.filter_address
-       << " filter_num:" << args.filter_num
-       << " group_num:" << args.group_num;
-  DLOG << " image_address:" << args.image.address
-       << " image_scale_address:" << args.image.scale_address
-       << " image_channels:" << args.image.channels
-       << " image_height:" << args.image.height
-       << " image_width:" << args.image.width
-       << " pad_height:" << args.image.pad_height
-       << " pad_width:" << args.image.pad_width;
-  DLOG << " kernel_height:" << args.kernel.height
-       << " kernel_width:" << args.kernel.width
-       << " stride_h:" << args.kernel.stride_h
-       << " stride_w:" << args.kernel.stride_w;
-  DLOG << " out_address:" << args.output.address
-       << " out_scale_address:" << args.output.scale_address;
+  /*DLOG << " relu_enabled:" << args.relu_enabled
+       << " sb_address:" << args.sb_address
+       << " filter_address:" << args.filter_address
+       << " filter_num:" << args.filter_num
+       << " group_num:" << args.group_num;
+  DLOG << " image_address:" << args.image.address
+       << " image_scale_address:" << args.image.scale_address
+       << " image_channels:" << args.image.channels
+       << " image_height:" << args.image.height
+       << " image_width:" << args.image.width
+       << " pad_height:" << args.image.pad_height
+       << " pad_width:" << args.image.pad_width;
+  DLOG << " kernel_height:" << args.kernel.height
+       << " kernel_width:" << args.kernel.width
+       << " stride_h:" << args.kernel.stride_h
+       << " stride_w:" << args.kernel.stride_w;
+  DLOG << " out_address:" << args.output.address
+       << " out_scale_address:" << args.output.scale_address;*/
 #endif
-  return do_ioctl(IOCTL_CONFIG_CONV, &args);
+  int split_num = args.split_num;
+  for (int i = 0; i < split_num; i++) {
+    do_ioctl(IOCTL_CONFIG_CONV, &args.conv_args[i]);
+  }
+  if (split_num > 1) {
+    ComputeFPGAConcat(args.concat_arg);
+  }
 }
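Note: ComputeFpgaConv now issues one IOCTL_CONFIG_CONV per split and, when the filter set was split across several passes (split_num > 1), merges the partial outputs with ComputeFPGAConcat. A minimal sketch of that dispatch pattern, with hypothetical error propagation added on top (the committed code discards the ioctl results and falls off the end of the non-void function):

    // Sketch only; assumes do_ioctl returns 0 on success.
    int compute_conv_checked(const struct WrapperConvArgs &args) {
      for (uint32_t i = 0; i < args.split_num; i++) {
        int ret = do_ioctl(IOCTL_CONFIG_CONV, &args.conv_args[i]);
        if (ret != 0) return ret;  // stop at the first failing split
      }
      // Partial outputs only need merging when the conv was split.
      return args.split_num > 1 ? ComputeFPGAConcat(args.concat_arg) : 0;
    }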
 int ComputeFpgaPool(const struct PoolingArgs &args) {
@@ -155,9 +161,16 @@ int PerformBypass(const struct BypassArgs &args) {
   return do_ioctl(IOCTL_CONFIG_BYPASS, &args);
 }

+int ComputeFPGAConcat(const struct ConcatArgs &args) {
+  image::concat_images(args.images_in, args.scales_in, args.image_out,
+                       args.scale_out, args.image_num, args.channel_num,
+                       args.height, args.width);
+  return 0;
+}
+
 void format_image(framework::Tensor *image_tensor) {
   auto dims = image_tensor->dims();
-  int channel = dims[1], height = dims[2], width = dims[3];
+  auto channel = dims[1], height = dims[2], width = dims[3];
   auto data_ptr = image_tensor->mutable_data<float>();
   size_t memory_size = channel * height * width * sizeof(float);
   float *new_data = (float *)fpga_malloc(memory_size);
@@ -168,7 +181,7 @@ void format_image(framework::Tensor *image_tensor) {
 void format_ofm(framework::Tensor *ofm_tensor) {
   auto dims = ofm_tensor->dims();
-  int channel = dims[1], height = dims[2], width = dims[3];
+  auto channel = dims[1], height = dims[2], width = dims[3];
   size_t memory_size =
       height * align_to_x(channel * width, IMAGE_ALIGNMENT) * sizeof(half);
   ofm_tensor->reset_data_ptr(fpga_malloc(memory_size));
@@ -178,38 +191,38 @@ float filter_find_max(framework::Tensor *filter_tensor) {
   auto filter_ptr = filter_tensor->data<float>();
   return filter::find_max(filter_ptr, filter_tensor->numel());
 }

+int get_plit_num(framework::Tensor *filter_tensor) {
+  auto dims = filter_tensor->dims();
+  auto chw = dims[1] * dims[2] * dims[3];
+  auto num = dims[0];
+  int div_capacity = filter::calc_division_capacity(chw);
+  return filter::calc_split_num(num, div_capacity);
+}
+
 int get_element_num_per_div(framework::Tensor *filter_tensor, int group_num) {
   auto dims = filter_tensor->dims();
-  PADDLE_MOBILE_ENFORCE(dims.size() == 4 || dims.size() == 2,
-                        "Filter order should be 4 or 2");
-  int chw = dims.size() == 4 ? dims[1] * dims[2] * dims[3] : dims[1];
-  int num = dims.size() == 4 ? dims[0] : dims[1];
+  auto chw = dims[1] * dims[2] * dims[3];
+  auto num = dims[0];
   int div_capacity = filter::calc_division_capacity(chw);
   return filter::calc_num_per_div(num, group_num, div_capacity);
 }
-void format_filter(framework::Tensor *filter_tensor, float max_value,
-                   int group_num) {
-  auto dims = filter_tensor->dims();
-  int num = dims[0], channel = dims[1], height = dims[2], width = dims[3];
-  auto data_ptr = filter_tensor->mutable_data<float>();
-  size_t memory_size = num * channel * height * width * sizeof(float);
-  float *new_data = (float *)fpga_malloc(memory_size);
-  fpga_copy(new_data, data_ptr, memory_size);
-  filter::format_filter(&new_data, num, channel, height, width, group_num,
-                        max_value);
-  filter_tensor->reset_data_ptr(new_data);
-}
+int get_aligned_filter_element_num(int chw) {
+  return align_to_x(chw, FILTER_ELEMENT_ALIGNMENT);
+}
+
+int get_aligned_filter_num(int num) {
+  return align_to_x(num, FILTER_NUM_ALIGNMENT);
+}

-void format_fc_matrix(framework::Tensor *filter_tensor, float max_value,
-                      int group_num, int height, int width) {
+void format_filter(framework::Tensor *filter_tensor, float max_value,
+                   int group_num) {
   auto dims = filter_tensor->dims();
-  PADDLE_MOBILE_ENFORCE(height == 1 && width == 1,
-                        "IFM should be flattened for FC");
-  int num = dims[1], channel = dims[0] / height / width;
+  auto num = dims[0], channel = dims[1], height = dims[2], width = dims[3];
   auto data_ptr = filter_tensor->mutable_data<float>();
   size_t memory_size = num * channel * height * width * sizeof(float);
-  float *new_data = (float *)fpga_malloc(memory_size);
+  auto new_data = (float *)fpga_malloc(memory_size);
   fpga_copy(new_data, data_ptr, memory_size);
   filter::format_filter(&new_data, num, channel, height, width, group_num,
                         max_value);
@@ -222,5 +235,19 @@ void format_bias_scale_array(float **bias_scale_array,
       element_num_per_division, num);
 }

+void format_concat_output(framework::Tensor *out, int height, int width,
+                          int image_num, uint32_t *channel_num) {
+  int sum_channel = 0, sum_cw = 0;
+  for (int i = 0; i < image_num; i++) {
+    sum_channel += channel_num[i];
+  }
+  sum_cw = align_to_x(width * sum_channel, IMAGE_ALIGNMENT);
+  auto data_ptr = fpga_malloc(height * sum_cw * sizeof(half));
+  auto ddim = framework::make_ddim({-1, sum_channel, height, width});
+  out->Resize(ddim);
+  out->reset_data_ptr(data_ptr);
+}
+
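Note: format_concat_output sizes the concatenated image by summing the per-input channel counts and aligning each output row (width * channels) to IMAGE_ALIGNMENT. A worked example of the arithmetic, assuming three inputs with 24, 20 and 8 channels at width 7:

    // sum_channel = 24 + 20 + 8            = 52
    // sum_cw      = align_to_x(7 * 52, 16) = align_to_x(364, 16) = 368
    // allocation  = height * 368 * sizeof(half) bytes
    // (the extra 4 half elements per row come from the alignment)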
 }  // namespace fpga
 }  // namespace paddle_mobile
@@ -92,6 +92,26 @@ struct ConvArgs {
   struct ImageOutputArgs output;
 };

+struct ConcatArgs {
+  uint32_t image_num;
+  half** images_in;
+  float** scales_in;
+  void* image_out;
+  float* scale_out;
+  uint32_t* channel_num;
+  uint32_t height;
+  uint32_t width;
+};
+
+struct WrapperConvArgs {
+  uint32_t split_num;
+  uint32_t group_num;
+  uint32_t filter_num;
+  struct ImageOutputArgs output;
+  struct ConvArgs* conv_args;
+  struct ConcatArgs concat_arg;
+};
+
 struct PoolingArgs {
   struct KernelArgs kernel;
   struct ImageInputArgs image;  // input image;
@@ -165,21 +185,26 @@ enum FPGA_ERR_TYPE {
 //============================== API =============================

 int PerformBypass(const struct BypassArgs& args);
-int ComputeFpgaConv(const struct ConvArgs& args);
+int ComputeFpgaConv(const struct WrapperConvArgs& args);
 int ComputeFpgaPool(const struct PoolingArgs& args);
 int ComputeFpgaEWAdd(const struct EWAddArgs& args);
+int ComputeFPGAConcat(const struct ConcatArgs& args);

 static inline int align_to_x(int num, int x) { return (num + x - 1) / x * x; }

 void format_image(framework::Tensor* image_tensor);
 void format_ofm(framework::Tensor* ofm_tensor);  // only allocate memory

 float filter_find_max(framework::Tensor* filter_tensor);
 int get_element_num_per_div(framework::Tensor* filter_tensor, int group_num);
+int get_plit_num(framework::Tensor* filter_tensor);
+int get_aligned_filter_element_num(int chw);
+int get_aligned_filter_num(int num);
 void format_filter(framework::Tensor* filter_tensor, float max_value,
                    int group_num);
-void format_fc_matrix(framework::Tensor* filter_tensor, float max_value,
-                      int group_num, int height = 1, int width = 1);
 void format_bias_scale_array(float** bias_scale_array,
                              int element_num_per_division, int num);
+void format_concat_output(framework::Tensor* out, int height, int width,
+                          int image_num, uint32_t* channel_num);

 }  // namespace fpga
 }  // namespace paddle_mobile
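Note: align_to_x is the rounding helper behind all of the new alignment functions: it rounds num up to the next multiple of x, i.e. ceil(num / x) * x. A self-contained check of the formula:

    #include <cassert>

    // Same definition as the inline helper declared above.
    static inline int align_to_x(int num, int x) { return (num + x - 1) / x * x; }

    int main() {
      assert(align_to_x(1, 16) == 16);     // pad a single element up to 16
      assert(align_to_x(16, 16) == 16);    // already aligned, unchanged
      assert(align_to_x(17, 16) == 32);    // next multiple of 16
      assert(align_to_x(364, 16) == 368);  // the concat row example above
      return 0;
    }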
@@ -62,6 +62,10 @@ void format_image(float **data_in, int channel, int height, int width) {
   align_element_conv(data_in, height, channel * width);
 }

+void concat_images(int16_t **images_in, float **scales_in, void *image_out,
+                   float *scale_out, int image_num, uint32_t *channel_num,
+                   int height, int width) {}
+
 }  // namespace image
 }  // namespace fpga
 }  // namespace paddle_mobile
@@ -13,6 +13,9 @@ See the License for the specific language governing permissions and
 limitations under the License. */

 #pragma once

+#include <stdint.h>
+
 #define IMAGE_ALIGNMENT 16  // Aligned to 16

 namespace paddle_mobile {
 namespace fpga {
@@ -21,6 +24,10 @@ namespace image {
 void convert_to_hwc(float** data_in, int channel, int height, int width);
 void align_element_conv(float** data_in, int height, int cw);
 void format_image(float** data_in, int channel, int height, int width);
+void concat_images(int16_t** images_in, float** scales_in, void* image_out,
+                   float* scale_out, int image_num, uint32_t* channel_num,
+                   int height,
+                   int width);  // Concat featuremaps along channel direction

 }  // namespace image
 }  // namespace fpga
 }  // namespace paddle_mobile
@@ -15,6 +15,7 @@ limitations under the License. */
 #pragma once

 #include <cstdlib>
+#include <string>

 #include "common/enforce.h"

 namespace paddle_mobile {
 namespace framework {
......
@@ -49,7 +49,7 @@ class FeedOp : public framework::OperatorBase<DeviceType> {
   }

   void RunImpl() const {
-    Tensor *input = const_cast<Tensor *>(param_.InputX());
+    auto input = (Tensor *)const_cast<LoDTensor *>(param_.InputX());
     auto input_ptr = input->data<float>();
     fpga::format_image(input);
     Tensor *output = param_.Out();
......
@@ -27,7 +27,11 @@ bool DropoutKernel<CPU, float>::Init(DropoutParam<CPU> *para) {
 template <typename T>
 struct DropoutFunctor {
-  inline T operator()(T in) const { return in; }
+  DropoutFunctor(T drop_pro) : dropout_pro_(drop_pro) {}
+  inline T operator()(T in) const { return (1 - dropout_pro_) * in; }
+
+ private:
+  T dropout_pro_;
 };

 template <>
@@ -36,8 +40,8 @@ void DropoutKernel<CPU, float>::Compute(const DropoutParam<CPU> &param) const {
   auto *input_x_ptr = input_x->data<float>();
   auto *out = param.Out();
   auto *out_ptr = out->mutable_data<float>();
-  DropoutFunctor<float> func_;
+  const float dropoutProb = param.DropoutProb();
+  DropoutFunctor<float> func_(dropoutProb);
   math::Transform trans;
   trans(input_x_ptr, input_x_ptr + input_x->numel(), out_ptr, func_);
 }
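Note: the rewritten functor gives the dropout kernel inference-time scaling semantics: instead of passing activations through unchanged, each value is multiplied by (1 - dropout_prob) so its expected magnitude matches training. A self-contained illustration of the new behavior:

    #include <cassert>

    // Stand-alone copy of the functor added above, for illustration.
    template <typename T>
    struct DropoutFunctor {
      explicit DropoutFunctor(T drop_pro) : dropout_pro_(drop_pro) {}
      inline T operator()(T in) const { return (1 - dropout_pro_) * in; }

     private:
      T dropout_pro_;
    };

    int main() {
      DropoutFunctor<float> func(0.25f);  // dropout probability p = 0.25
      assert(func(2.0f) == 1.5f);         // every activation scaled by 1 - p
      return 0;
    }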
......
@@ -21,31 +21,44 @@ namespace operators {
 template <>
 bool ConcatKernel<FPGA, float>::Init(ConcatParam<FPGA> *param) {
+  auto inputs = param->Inputs();
+  auto out = param->Out();
+  auto image_num = inputs.size();
+  auto images_in = (half **)fpga::fpga_malloc(image_num * sizeof(int *));
+  auto scales_in = (float **)fpga::fpga_malloc(image_num * sizeof(float *));
+  auto channel_num =
+      (uint32_t *)fpga::fpga_malloc(image_num * sizeof(uint32_t));
+  auto height = inputs[0]->dims()[2];
+  auto width = inputs[0]->dims()[3];
+  for (int i = 0; i < image_num; i++) {
+    auto input = inputs[i];
+    PADDLE_MOBILE_ENFORCE(
+        input->dims()[2] == height && input->dims()[3] == width,
+        "Image height & width should be unified");
+    images_in[i] = (half *)input->data<float>();
+    channel_num[i] = (uint32_t)inputs[i]->dims()[1];
+    scales_in[i] = input->scale;
+  }
+  fpga::format_concat_output(out, (int)height, (int)width, (int)image_num,
+                             channel_num);
+
+  fpga::ConcatArgs concatArgs;
+  concatArgs.image_num = (uint32_t)image_num;
+  concatArgs.images_in = images_in;
+  concatArgs.scales_in = scales_in;
+  concatArgs.image_out = (half *)out->mutable_data<float>();
+  concatArgs.scale_out = out->scale;
+  concatArgs.channel_num = channel_num;
+  concatArgs.height = (uint32_t)height;
+  concatArgs.width = (uint32_t)width;
+  param->SetFpgaArgs(concatArgs);
   return true;
 }

 template <>
 void ConcatKernel<FPGA, float>::Compute(const ConcatParam<FPGA> &param) const {
-  auto inputs = param.Inputs();
-  auto *out = param.Out();
-  int64_t axis = param.Axis();
-  out->mutable_data<half>();
-  DDim out_dim = out->dims();
-  int pixels = out_dim[1] * out_dim[2];
-  auto out_channel = out_dim[3];
-  auto out_offset = 0;
-  for (int i = 0; i < inputs.size(); ++i) {
-    auto input = inputs[i];
-    auto channels = input->dims()[3];
-    out_offset += channels;
-    auto src = input->data<half>();
-    for (int j = 0; j < pixels; ++j) {
-      auto dst = out->mutable_data<half>() + out_offset;
-      memory::Copy(dst, src, sizeof(half));
-    }
-  }
+  ComputeFPGAConcat(param.FpgaArgs());
 }
 template class ConcatKernel<FPGA, float>;
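Note: the concat kernel now follows the Init/Compute split used by the other FPGA kernels: Init() marshals all per-input pointers, scales and channel counts into a cached fpga::ConcatArgs once, and Compute() is a single dispatch. Roughly (illustrative shape only, not the actual class):

    // Init():    gather images_in / scales_in / channel_num, allocate the
    //            aligned output via format_concat_output, cache ConcatArgs.
    // Compute(): ComputeFPGAConcat(cached_args);  // no per-run setup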
......
@@ -15,7 +15,6 @@ limitations under the License. */
 #ifdef FUSION_CONVADDBN_OP

 #include "operators/kernel/conv_add_bn_kernel.h"
-#include "fpga/api.h"

 namespace paddle_mobile {
 namespace operators {
@@ -23,13 +22,13 @@ namespace operators {
 template <>
 bool ConvAddBNKernel<FPGA, float>::Init(FusionConvAddBNParam<FPGA> *param) {
   bool relu_enabled = false;
-  Tensor *input = const_cast<Tensor *>(param->Input());
+  auto input = const_cast<Tensor *>(param->Input());
   auto input_ptr = input->data<float>();
-  const Tensor *bias = param->Bias();
+  auto bias = param->Bias();
   auto bias_ptr = bias->data<float>();
-  Tensor *filter = param->Filter();
+  auto filter = const_cast<Tensor *>(param->Filter());
-  Tensor *out = param->Output();
+  auto out = param->Output();

   auto bn_mean_ptr = param->InputMean()->data<float>();
   auto bn_var_ptr = param->InputVariance()->data<float>();
@@ -41,10 +40,10 @@ bool ConvAddBNKernel<FPGA, float>::Init(FusionConvAddBNParam<FPGA> *param) {
                         "Output channel should be equal to bias number");

   const int channel = out->dims()[1];
-  float *bs_ptr =
+  auto bs_ptr =
       reinterpret_cast<float *>(fpga::fpga_malloc(2 * channel * sizeof(float)));
-  Tensor *new_scale = new Tensor();
-  Tensor *new_bias = new Tensor();
+  auto new_scale = new Tensor();
+  auto new_bias = new Tensor();
   auto new_scale_ptr = new_scale->mutable_data<float>({channel});
   auto new_bias_ptr = new_bias->mutable_data<float>({channel});
@@ -70,27 +69,75 @@ bool ConvAddBNKernel<FPGA, float>::Init(FusionConvAddBNParam<FPGA> *param) {
   fpga::format_ofm(out);
   auto out_ptr = out->mutable_data<float>();

-  fpga::ConvArgs convArgs;
-  convArgs.relu_enabled = relu_enabled;
-  convArgs.filter_address = (void *)filter_ptr;
-  convArgs.filter_num = filter->dims()[0];
-  convArgs.group_num = param->Groups();
-  convArgs.sb_address = (void *)bs_ptr;
-  convArgs.kernel.stride_h = param->Strides()[0];
-  convArgs.kernel.stride_w = param->Strides()[1];
-  convArgs.kernel.height = filter->dims()[2];
-  convArgs.kernel.width = filter->dims()[3];
-  convArgs.image.address = (void *)input_ptr;
-  convArgs.image.channels = input->dims()[1];
-  convArgs.image.height = input->dims()[2];
-  convArgs.image.width = input->dims()[3];
-  convArgs.image.pad_height = param->Paddings()[0];
-  convArgs.image.pad_width = param->Paddings()[1];
-  convArgs.image.scale_address = input->scale;
-  convArgs.output.address = (void *)out_ptr;
+  fpga::WrapperConvArgs convArgs;
+  convArgs.group_num = (uint32_t)param->Groups();
+  convArgs.split_num = (uint32_t)fpga::get_plit_num(filter);
+  convArgs.filter_num = (uint32_t)filter->dims()[0];
+  convArgs.output.address = out_ptr;
   convArgs.output.scale_address = out->scale;
+  convArgs.conv_args = (fpga::ConvArgs *)fpga::fpga_malloc(
+      convArgs.split_num * sizeof(fpga::ConvArgs));
+
+  convArgs.concat_arg.image_num = convArgs.split_num;
+  convArgs.concat_arg.image_out = out_ptr;
+  convArgs.concat_arg.scale_out = out->scale;
+  convArgs.concat_arg.height = (uint32_t)filter->dims()[2];
+  convArgs.concat_arg.width = (uint32_t)filter->dims()[3];
+
+  int n = convArgs.split_num;
+  convArgs.concat_arg.images_in = (half **)fpga::fpga_malloc(n * sizeof(int *));
+  convArgs.concat_arg.scales_in =
+      (float **)fpga::fpga_malloc(n * sizeof(float *));
+  convArgs.concat_arg.channel_num =
+      (uint32_t *)fpga::fpga_malloc(n * sizeof(uint32_t));
+  convArgs.concat_arg.image_out = out_ptr;
   param->SetFpgaArgs(convArgs);
+
+  int element_num = fpga::get_aligned_filter_element_num(
+      filter->dims()[1] * filter->dims()[2] * filter->dims()[3]);
+
+  for (int i = 0; i < n; i++) {
+    convArgs.conv_args[i].relu_enabled = relu_enabled;
+    convArgs.conv_args[i].group_num = (uint32_t)param->Groups();
+    convArgs.conv_args[i].kernel.stride_h = (uint32_t)param->Strides()[0];
+    convArgs.conv_args[i].kernel.stride_w = (uint32_t)param->Strides()[1];
+    convArgs.conv_args[i].kernel.height = (uint32_t)filter->dims()[2];
+    convArgs.conv_args[i].kernel.width = (uint32_t)filter->dims()[3];
+    convArgs.conv_args[i].image.address = input_ptr;
+    convArgs.conv_args[i].image.channels = (uint32_t)input->dims()[1];
+    convArgs.conv_args[i].image.height = (uint32_t)input->dims()[2];
+    convArgs.conv_args[i].image.width = (uint32_t)input->dims()[3];
+    convArgs.conv_args[i].image.scale_address = input->scale;
+    convArgs.conv_args[i].image.pad_height = (uint32_t)param->Paddings()[0];
+    convArgs.conv_args[i].image.pad_width = (uint32_t)param->Paddings()[1];
+    convArgs.conv_args[i].filter_address =
+        &((int8_t *)filter_ptr)[i * element_num];
+    convArgs.conv_args[i].sb_address = &((int8_t *)bs_ptr)[i * element_num];
+    convArgs.conv_args[i].filter_num =
+        (uint32_t)(i == n - 1 ? fpga::get_aligned_filter_num(
+                                    channel - (n - 1) * element_num_per_div)
+                              : element_num_per_div);
+    if (n > 1) {
+      convArgs.conv_args[i].output.scale_address =
+          (float *)fpga::fpga_malloc(2 * sizeof(float));
+      convArgs.conv_args[i].output.address =
+          fpga::fpga_malloc(input->dims()[2] * input->dims()[3] *
+                            convArgs.conv_args[i].filter_num * sizeof(half));
+    } else {
+      convArgs.conv_args[i].output.scale_address = out->scale;
+      convArgs.conv_args[i].output.address = out_ptr;
+    }
+    convArgs.concat_arg.images_in[i] =
+        (half *)convArgs.conv_args[i].output.address;
+    convArgs.concat_arg.scales_in[i] =
+        (float *)convArgs.conv_args[i].sb_address;
+    convArgs.concat_arg.channel_num[i] = convArgs.conv_args[i].filter_num;
+  }

   return true;
 }
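Note: each split receives element_num_per_div filters except the last, which takes the remainder rounded up to the hardware filter alignment. A worked example of the filter_num expression above, assuming element_num_per_div = 512 and channel = 1200 output channels, so n = 3 splits:

    // split 0: filter_num = 512
    // split 1: filter_num = 512
    // split 2: filter_num = get_aligned_filter_num(1200 - 2 * 512)
    //                     = align_to_x(176, FILTER_NUM_ALIGNMENT)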
......
@@ -23,12 +23,12 @@ template <>
 bool ConvAddBNReluKernel<FPGA, float>::Init(
     FusionConvAddBNReluParam<FPGA> *param) {
   bool relu_enabled = true;
-  Tensor *input = const_cast<Tensor *>(param->Input());
+  auto input = const_cast<Tensor *>(param->Input());
   auto input_ptr = input->data<float>();
   const Tensor *bias = param->Bias();
   auto bias_ptr = bias->data<float>();
-  Tensor *filter = param->Filter();
+  auto filter = const_cast<Tensor *>(param->Filter());
-  Tensor *out = param->Output();
+  auto out = param->Output();
   auto bn_mean_ptr = param->InputMean()->data<float>();
   auto bn_var_ptr = param->InputVariance()->data<float>();
   auto bn_scale_ptr = param->InputScale()->data<float>();
@@ -39,9 +39,9 @@ bool ConvAddBNReluKernel<FPGA, float>::Init(
                         "Output channel should be equal to bias number");

   const int channel = out->dims()[1];
-  float *bs_ptr = (float *)fpga::fpga_malloc(2 * channel * sizeof(float));
+  auto bs_ptr = (float *)fpga::fpga_malloc(2 * channel * sizeof(float));
-  Tensor *new_scale = new Tensor();
-  Tensor *new_bias = new Tensor();
+  auto new_scale = new Tensor();
+  auto new_bias = new Tensor();
   auto new_scale_ptr = new_scale->mutable_data<float>({channel});
   auto new_bias_ptr = new_bias->mutable_data<float>({channel});
@@ -67,26 +67,45 @@ bool ConvAddBNReluKernel<FPGA, float>::Init(
   fpga::format_ofm(out);
   auto out_ptr = out->mutable_data<float>();

-  fpga::ConvArgs convArgs;
-  convArgs.relu_enabled = relu_enabled;
-  convArgs.filter_address = (void *)filter_ptr;
-  convArgs.filter_num = filter->dims()[0];
-  convArgs.group_num = param->Groups();
-  convArgs.sb_address = (void *)bs_ptr;
-  convArgs.kernel.stride_h = param->Strides()[0];
-  convArgs.kernel.stride_w = param->Strides()[1];
-  convArgs.kernel.height = filter->dims()[2];
-  convArgs.kernel.width = filter->dims()[3];
-  convArgs.image.address = (void *)input_ptr;
-  convArgs.image.channels = input->dims()[1];
-  convArgs.image.height = input->dims()[2];
-  convArgs.image.width = input->dims()[3];
-  convArgs.image.pad_height = param->Paddings()[0];
-  convArgs.image.pad_width = param->Paddings()[1];
-  convArgs.image.scale_address = input->scale;
-  convArgs.output.address = (void *)out_ptr;
+  fpga::WrapperConvArgs convArgs;
+  convArgs.group_num = (uint32_t)param->Groups();
+  convArgs.split_num = (uint32_t)fpga::get_plit_num(filter);
+  convArgs.filter_num = (uint32_t)filter->dims()[0];
+  convArgs.output.address = out_ptr;
   convArgs.output.scale_address = out->scale;
+  convArgs.conv_args = (fpga::ConvArgs *)fpga::fpga_malloc(
+      convArgs.split_num * sizeof(fpga::ConvArgs));
   param->SetFpgaArgs(convArgs);
+
+  int element_num = fpga::get_aligned_filter_element_num(
+      filter->dims()[1] * filter->dims()[2] * filter->dims()[3]);
+  int n = convArgs.split_num;
+  for (int i = 0; i < n; i++) {
+    convArgs.conv_args[i].relu_enabled = relu_enabled;
+    convArgs.conv_args[i].group_num = (uint32_t)param->Groups();
+    convArgs.conv_args[i].kernel.stride_h = (uint32_t)param->Strides()[0];
+    convArgs.conv_args[i].kernel.stride_w = (uint32_t)param->Strides()[1];
+    convArgs.conv_args[i].kernel.height = (uint32_t)filter->dims()[2];
+    convArgs.conv_args[i].kernel.width = (uint32_t)filter->dims()[3];
+    convArgs.conv_args[i].image.address = input_ptr;
+    convArgs.conv_args[i].image.channels = (uint32_t)input->dims()[1];
+    convArgs.conv_args[i].image.height = (uint32_t)input->dims()[2];
+    convArgs.conv_args[i].image.width = (uint32_t)input->dims()[3];
+    convArgs.conv_args[i].image.pad_height = (uint32_t)param->Paddings()[0];
+    convArgs.conv_args[i].image.pad_width = (uint32_t)param->Paddings()[1];
+    convArgs.conv_args[i].filter_address =
+        &((int8_t *)filter_ptr)[i * element_num];
+    convArgs.conv_args[i].sb_address = &((int8_t *)bs_ptr)[i * element_num];
+    convArgs.conv_args[i].filter_num =
+        (uint32_t)(i == n - 1 ? fpga::get_aligned_filter_num(
+                                    channel - (n - 1) * element_num_per_div)
+                              : element_num_per_div);
+    convArgs.conv_args[i].output.scale_address =
+        (float *)fpga::fpga_malloc(2 * sizeof(float));
+    convArgs.conv_args[i].image.scale_address = input->scale;
+  }
+  return true;
   return true;
 }
......
@@ -22,17 +22,17 @@ namespace operators {
 template <>
 bool ConvAddReluKernel<FPGA, float>::Init(FusionConvAddReluParam<FPGA> *param) {
   bool relu_enabled = true;
-  Tensor *input = const_cast<Tensor *>(param->Input());
+  auto input = const_cast<Tensor *>(param->Input());
   auto input_ptr = input->data<float>();
   const Tensor *bias = param->Bias();
   auto bias_ptr = bias->data<float>();
-  Tensor *filter = param->Filter();
+  auto filter = const_cast<Tensor *>(param->Filter());
-  Tensor *out = param->Output();
+  auto out = param->Output();

   PADDLE_MOBILE_ENFORCE(out->dims()[1] == bias->dims()[0],
                         "Output channel should be equal to bias number");
   int channel = out->dims()[1];
-  float *bs_ptr = (float *)fpga::fpga_malloc(2 * channel * sizeof(float));
+  auto bs_ptr = (float *)fpga::fpga_malloc(2 * channel * sizeof(float));
   for (int i = 0; i < channel; i++) {
     bs_ptr[i + channel] = 1;
     bs_ptr[i] = bias_ptr[i];
@@ -49,27 +49,44 @@ bool ConvAddReluKernel<FPGA, float>::Init(FusionConvAddReluParam<FPGA> *param) {
   fpga::format_ofm(out);
   auto out_ptr = out->mutable_data<float>();

-  fpga::ConvArgs convArgs;
-  convArgs.relu_enabled = relu_enabled;
-  convArgs.filter_address = (void *)filter_ptr;
-  convArgs.filter_num = filter->dims()[0];
-  convArgs.group_num = param->Groups();
-  convArgs.sb_address = (void *)bs_ptr;
-  convArgs.kernel.stride_h = param->Strides()[0];
-  convArgs.kernel.stride_w = param->Strides()[1];
-  convArgs.kernel.height = filter->dims()[2];
-  convArgs.kernel.width = filter->dims()[3];
-  convArgs.image.address = (void *)input_ptr;
-  convArgs.image.channels = input->dims()[1];
-  convArgs.image.height = input->dims()[2];
-  convArgs.image.width = input->dims()[3];
-  convArgs.image.pad_height = param->Paddings()[0];
-  convArgs.image.pad_width = param->Paddings()[1];
-  convArgs.image.scale_address = input->scale;
-  convArgs.output.address = (void *)out_ptr;
+  fpga::WrapperConvArgs convArgs;
+  convArgs.group_num = (uint32_t)param->Groups();
+  convArgs.split_num = (uint32_t)fpga::get_plit_num(filter);
+  convArgs.filter_num = (uint32_t)filter->dims()[0];
+  convArgs.output.address = out_ptr;
   convArgs.output.scale_address = out->scale;
+  convArgs.conv_args = (fpga::ConvArgs *)fpga::fpga_malloc(
+      convArgs.split_num * sizeof(fpga::ConvArgs));
   param->SetFpgaArgs(convArgs);
+
+  int element_num = fpga::get_aligned_filter_element_num(
+      filter->dims()[1] * filter->dims()[2] * filter->dims()[3]);
+  int n = convArgs.split_num;
+  for (int i = 0; i < n; i++) {
+    convArgs.conv_args[i].relu_enabled = relu_enabled;
+    convArgs.conv_args[i].group_num = (uint32_t)param->Groups();
+    convArgs.conv_args[i].kernel.stride_h = (uint32_t)param->Strides()[0];
+    convArgs.conv_args[i].kernel.stride_w = (uint32_t)param->Strides()[1];
+    convArgs.conv_args[i].kernel.height = (uint32_t)filter->dims()[2];
+    convArgs.conv_args[i].kernel.width = (uint32_t)filter->dims()[3];
+    convArgs.conv_args[i].image.address = input_ptr;
+    convArgs.conv_args[i].image.channels = (uint32_t)input->dims()[1];
+    convArgs.conv_args[i].image.height = (uint32_t)input->dims()[2];
+    convArgs.conv_args[i].image.width = (uint32_t)input->dims()[3];
+    convArgs.conv_args[i].image.pad_height = (uint32_t)param->Paddings()[0];
+    convArgs.conv_args[i].image.pad_width = (uint32_t)param->Paddings()[1];
+    convArgs.conv_args[i].filter_address =
+        &((int8_t *)filter_ptr)[i * element_num];
+    convArgs.conv_args[i].sb_address = &((int8_t *)bs_ptr)[i * element_num];
+    convArgs.conv_args[i].filter_num =
+        (uint32_t)(i == n - 1 ? fpga::get_aligned_filter_num(
+                                    channel - (n - 1) * element_num_per_div)
+                              : element_num_per_div);
+    convArgs.conv_args[i].output.scale_address =
+        (float *)fpga::fpga_malloc(2 * sizeof(float));
+    convArgs.conv_args[i].image.scale_address = input->scale;
+  }
   return true;
 }
......
@@ -23,11 +23,10 @@ namespace operators {
 template <>
 bool ConvBNKernel<FPGA, float>::Init(FusionConvBNParam<FPGA> *param) {
   bool relu_enabled = false;
-  Tensor *input = const_cast<Tensor *>(param->Input());
+  auto input = const_cast<Tensor *>(param->Input());
   auto input_ptr = input->data<float>();
-  Tensor *filter = param->Filter();
-  Tensor *out = param->Output();
+  auto filter = const_cast<Tensor *>(param->Filter());
+  auto out = param->Output();
   auto bn_mean_ptr = param->InputMean()->data<float>();
   auto bn_var_ptr = param->InputVariance()->data<float>();
   auto bn_scale_ptr = param->InputScale()->data<float>();
@@ -37,10 +36,10 @@ bool ConvBNKernel<FPGA, float>::Init(FusionConvBNParam<FPGA> *param) {
                         "Output channel should be equal to bias number");

   const int channel = out->dims()[1];
-  float *bs_ptr =
+  auto bs_ptr =
       reinterpret_cast<float *>(fpga::fpga_malloc(2 * channel * sizeof(float)));
-  Tensor *new_scale = new Tensor();
-  Tensor *new_bias = new Tensor();
+  auto new_scale = new Tensor();
+  auto new_bias = new Tensor();
   auto new_scale_ptr = new_scale->mutable_data<float>({channel});
   auto new_bias_ptr = new_bias->mutable_data<float>({channel});
@@ -65,27 +64,44 @@ bool ConvBNKernel<FPGA, float>::Init(FusionConvBNParam<FPGA> *param) {
   fpga::format_ofm(out);
   auto out_ptr = out->mutable_data<float>();

-  fpga::ConvArgs convArgs;
-  convArgs.relu_enabled = relu_enabled;
-  convArgs.filter_address = (void *)filter_ptr;
-  convArgs.filter_num = filter->dims()[0];
-  convArgs.group_num = param->Groups();
-  convArgs.sb_address = (void *)bs_ptr;
-  convArgs.kernel.stride_h = param->Strides()[0];
-  convArgs.kernel.stride_w = param->Strides()[1];
-  convArgs.kernel.height = filter->dims()[2];
-  convArgs.kernel.width = filter->dims()[3];
-  convArgs.image.address = (void *)input_ptr;
-  convArgs.image.channels = input->dims()[1];
-  convArgs.image.height = input->dims()[2];
-  convArgs.image.width = input->dims()[3];
-  convArgs.image.pad_height = param->Paddings()[0];
-  convArgs.image.pad_width = param->Paddings()[1];
-  convArgs.image.scale_address = input->scale;
-  convArgs.output.address = (void *)out_ptr;
+  fpga::WrapperConvArgs convArgs;
+  convArgs.group_num = (uint32_t)param->Groups();
+  convArgs.split_num = (uint32_t)fpga::get_plit_num(filter);
+  convArgs.filter_num = (uint32_t)filter->dims()[0];
+  convArgs.output.address = out_ptr;
   convArgs.output.scale_address = out->scale;
+  convArgs.conv_args = (fpga::ConvArgs *)fpga::fpga_malloc(
+      convArgs.split_num * sizeof(fpga::ConvArgs));
   param->SetFpgaArgs(convArgs);
+
+  int element_num = fpga::get_aligned_filter_element_num(
+      filter->dims()[1] * filter->dims()[2] * filter->dims()[3]);
+  int n = convArgs.split_num;
+  for (int i = 0; i < n; i++) {
+    convArgs.conv_args[i].relu_enabled = relu_enabled;
+    convArgs.conv_args[i].group_num = (uint32_t)param->Groups();
+    convArgs.conv_args[i].kernel.stride_h = (uint32_t)param->Strides()[0];
+    convArgs.conv_args[i].kernel.stride_w = (uint32_t)param->Strides()[1];
+    convArgs.conv_args[i].kernel.height = (uint32_t)filter->dims()[2];
+    convArgs.conv_args[i].kernel.width = (uint32_t)filter->dims()[3];
+    convArgs.conv_args[i].image.address = input_ptr;
+    convArgs.conv_args[i].image.channels = (uint32_t)input->dims()[1];
+    convArgs.conv_args[i].image.height = (uint32_t)input->dims()[2];
+    convArgs.conv_args[i].image.width = (uint32_t)input->dims()[3];
+    convArgs.conv_args[i].image.pad_height = (uint32_t)param->Paddings()[0];
+    convArgs.conv_args[i].image.pad_width = (uint32_t)param->Paddings()[1];
+    convArgs.conv_args[i].filter_address =
+        &((int8_t *)filter_ptr)[i * element_num];
+    convArgs.conv_args[i].sb_address = &((int8_t *)bs_ptr)[i * element_num];
+    convArgs.conv_args[i].filter_num =
+        (uint32_t)(i == n - 1 ? fpga::get_aligned_filter_num(
+                                    channel - (n - 1) * element_num_per_div)
+                              : element_num_per_div);
+    convArgs.conv_args[i].output.scale_address =
+        (float *)fpga::fpga_malloc(2 * sizeof(float));
+    convArgs.conv_args[i].image.scale_address = input->scale;
+  }
   return true;
 }
......
@@ -22,10 +22,10 @@ namespace operators {
 template <>
 bool ConvBNReluKernel<FPGA, float>::Init(FusionConvBNReluParam<FPGA> *param) {
   bool relu_enabled = true;
-  Tensor *input = const_cast<Tensor *>(param->Input());
+  auto input = const_cast<Tensor *>(param->Input());
   auto input_ptr = input->data<float>();
-  Tensor *filter = param->Filter();
+  auto filter = const_cast<Tensor *>(param->Filter());
-  Tensor *out = param->Output();
+  auto out = param->Output();
   auto bn_mean_ptr = param->InputMean()->data<float>();
   auto bn_var_ptr = param->InputVariance()->data<float>();
   auto bn_scale_ptr = param->InputScale()->data<float>();
@@ -34,9 +34,9 @@ bool ConvBNReluKernel<FPGA, float>::Init(FusionConvBNReluParam<FPGA> *param) {
   PADDLE_MOBILE_ENFORCE(out->dims()[1] == param->InputBias()->dims()[0],
                         "Output channel should be equal to bias number");
   const int channel = out->dims()[1];
-  float *bs_ptr = (float *)fpga::fpga_malloc(2 * channel * sizeof(float));
+  auto bs_ptr = (float *)fpga::fpga_malloc(2 * channel * sizeof(float));
-  Tensor *new_scale = new Tensor();
-  Tensor *new_bias = new Tensor();
+  auto new_scale = new Tensor();
+  auto new_bias = new Tensor();
   auto new_scale_ptr = new_scale->mutable_data<float>({channel});
   auto new_bias_ptr = new_bias->mutable_data<float>({channel});
@@ -61,26 +61,44 @@ bool ConvBNReluKernel<FPGA, float>::Init(FusionConvBNReluParam<FPGA> *param) {
   fpga::format_ofm(out);
   auto out_ptr = out->mutable_data<float>();

-  fpga::ConvArgs convArgs;
-  convArgs.relu_enabled = relu_enabled;
-  convArgs.filter_address = (void *)filter_ptr;
-  convArgs.filter_num = filter->dims()[0];
-  convArgs.group_num = param->Groups();
-  convArgs.sb_address = (void *)bs_ptr;
-  convArgs.kernel.stride_h = param->Strides()[0];
-  convArgs.kernel.stride_w = param->Strides()[1];
-  convArgs.kernel.height = filter->dims()[2];
-  convArgs.kernel.width = filter->dims()[3];
-  convArgs.image.address = (void *)input_ptr;
-  convArgs.image.channels = input->dims()[1];
-  convArgs.image.height = input->dims()[2];
-  convArgs.image.width = input->dims()[3];
-  convArgs.image.pad_height = param->Paddings()[0];
-  convArgs.image.pad_width = param->Paddings()[1];
-  convArgs.image.scale_address = input->scale;
-  convArgs.output.address = (void *)out_ptr;
+  fpga::WrapperConvArgs convArgs;
+  convArgs.group_num = (uint32_t)param->Groups();
+  convArgs.split_num = (uint32_t)fpga::get_plit_num(filter);
+  convArgs.filter_num = (uint32_t)filter->dims()[0];
+  convArgs.output.address = out_ptr;
   convArgs.output.scale_address = out->scale;
+  convArgs.conv_args = (fpga::ConvArgs *)fpga::fpga_malloc(
+      convArgs.split_num * sizeof(fpga::ConvArgs));
   param->SetFpgaArgs(convArgs);
+
+  int element_num = fpga::get_aligned_filter_element_num(
+      filter->dims()[1] * filter->dims()[2] * filter->dims()[3]);
+  int n = convArgs.split_num;
+  for (int i = 0; i < n; i++) {
+    convArgs.conv_args[i].relu_enabled = relu_enabled;
+    convArgs.conv_args[i].group_num = (uint32_t)param->Groups();
+    convArgs.conv_args[i].kernel.stride_h = (uint32_t)param->Strides()[0];
+    convArgs.conv_args[i].kernel.stride_w = (uint32_t)param->Strides()[1];
+    convArgs.conv_args[i].kernel.height = (uint32_t)filter->dims()[2];
+    convArgs.conv_args[i].kernel.width = (uint32_t)filter->dims()[3];
+    convArgs.conv_args[i].image.address = input_ptr;
+    convArgs.conv_args[i].image.channels = (uint32_t)input->dims()[1];
+    convArgs.conv_args[i].image.height = (uint32_t)input->dims()[2];
+    convArgs.conv_args[i].image.width = (uint32_t)input->dims()[3];
+    convArgs.conv_args[i].image.pad_height = (uint32_t)param->Paddings()[0];
+    convArgs.conv_args[i].image.pad_width = (uint32_t)param->Paddings()[1];
+    convArgs.conv_args[i].filter_address =
+        &((int8_t *)filter_ptr)[i * element_num];
+    convArgs.conv_args[i].sb_address = &((int8_t *)bs_ptr)[i * element_num];
+    convArgs.conv_args[i].filter_num =
+        (uint32_t)(i == n - 1 ? fpga::get_aligned_filter_num(
+                                    channel - (n - 1) * element_num_per_div)
+                              : element_num_per_div);
+    convArgs.conv_args[i].output.scale_address =
+        (float *)fpga::fpga_malloc(2 * sizeof(float));
+    convArgs.conv_args[i].image.scale_address = input->scale;
+  }
   return true;
 }
......
@@ -22,9 +22,9 @@ template <>
 bool ElementwiseAddReluKernel<FPGA, float>::Init(
     ElementwiseAddReluParam<FPGA> *param) {
   bool relu_enabled = true;
-  Tensor *input_x = const_cast<Tensor *>(param->InputX());
+  auto *input_x = const_cast<LoDTensor *>(param->InputX());
-  Tensor *input_y = const_cast<Tensor *>(param->InputY());
+  auto *input_y = const_cast<LoDTensor *>(param->InputY());
-  Tensor *out = param->Out();
+  auto *out = param->Out();
   auto input_x_ptr = input_x->data<float>();
   auto input_y_ptr = input_y->data<float>();
   fpga::format_ofm(out);
@@ -34,22 +34,22 @@ bool ElementwiseAddReluKernel<FPGA, float>::Init(
   ewaddArgs.relu_enabled = relu_enabled;
   ewaddArgs.const0 = 1;
   ewaddArgs.const1 = 1;
-  ewaddArgs.image0.address = (void *)input_x_ptr;
+  ewaddArgs.image0.address = input_x_ptr;
-  ewaddArgs.image0.channels = input_x->dims()[1];
+  ewaddArgs.image0.channels = (uint32_t)input_x->dims()[1];
   ewaddArgs.image0.scale_address = input_x->scale;
-  ewaddArgs.image0.height = input_x->dims()[2];
+  ewaddArgs.image0.height = (uint32_t)input_x->dims()[2];
-  ewaddArgs.image0.width = input_x->dims()[3];
+  ewaddArgs.image0.width = (uint32_t)input_x->dims()[3];
   ewaddArgs.image0.pad_height = 0;
   ewaddArgs.image0.pad_width = 0;
-  ewaddArgs.image1.address = (void *)input_y_ptr;
+  ewaddArgs.image1.address = input_y_ptr;
-  ewaddArgs.image1.channels = input_y->dims()[1];
+  ewaddArgs.image1.channels = (uint32_t)input_y->dims()[1];
   ewaddArgs.image1.scale_address = input_y->scale;
-  ewaddArgs.image1.height = input_y->dims()[2];
+  ewaddArgs.image1.height = (uint32_t)input_y->dims()[2];
-  ewaddArgs.image1.width = input_y->dims()[3];
+  ewaddArgs.image1.width = (uint32_t)input_y->dims()[3];
   ewaddArgs.image1.pad_height = 0;
   ewaddArgs.image1.pad_width = 0;
   ewaddArgs.output.scale_address = out->scale;
-  ewaddArgs.output.address = (void *)out_ptr;
+  ewaddArgs.output.address = out_ptr;
   param->SetFpgaArgs(ewaddArgs);
   return true;
 }
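Note: const0 and const1 appear to be the scalar coefficients applied to the two input images; both are fixed at 1 here, making the op a plain element-wise sum followed by ReLU. A hypothetical reference model of what the configured operation computes (an assumption, not the driver's documented contract):

    #include <algorithm>
    #include <cstddef>

    // Assumed semantics: out[i] = relu(const0 * x[i] + const1 * y[i]).
    void ewadd_relu_ref(const float *x, const float *y, float *out,
                        size_t numel, float const0 = 1.f, float const1 = 1.f) {
      for (size_t i = 0; i < numel; i++) {
        out[i] = std::max(const0 * x[i] + const1 * y[i], 0.f);
      }
    }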
......
@@ -14,71 +14,84 @@ limitations under the License. */
 #ifdef FUSION_FCRELU_OP

 #include "operators/kernel/fc_relu_kernel.h"
-#include "fpga/api.h"

 namespace paddle_mobile {
 namespace operators {

 template <>
 bool FusionFcReluKernel<FPGA, float>::Init(FusionFcReluParam<FPGA> *param) {
   bool relu_enabled = true;
-  Tensor *input_x = const_cast<Tensor *>(param->InputX());
+  auto input_x = const_cast<LoDTensor *>(param->InputX());
   auto input_x_ptr = input_x->data<float>();
-  Tensor *input_y = param->InputY();
+  auto filter = const_cast<Tensor *>(param->InputY());
-  const Tensor *input_z = param->InputZ();
+  auto input_z = param->InputZ();
   auto input_z_ptr = input_z->data<float>();
-  Tensor *out = param->Out();
+  auto out = param->Out();
-  PADDLE_MOBILE_ENFORCE(input_x->dims()[1] == input_y->dims()[0],
+  PADDLE_MOBILE_ENFORCE(input_x->dims()[1] == filter->dims()[0],
                         "Image channel should be equal to weight number");
-  int channel = out->dims()[1];
+  int channel = (uint32_t)out->dims()[1];
-  float *bs_ptr = (float *)fpga::fpga_malloc(2 * channel * sizeof(float));
+  auto bs_ptr = (float *)fpga::fpga_malloc(2 * channel * sizeof(float));
   for (int i = 0; i < channel; i++) {
     bs_ptr[i + channel] = 1;
     bs_ptr[i] = input_z_ptr[i];
   }
-  int num = input_y->dims()[1];
-  int chw = input_y->dims()[0];
+  int num = (uint32_t)filter->dims()[1];
+  int chw = (uint32_t)filter->dims()[0];
   PADDLE_MOBILE_ENFORCE(
       chw == input_x->numel(),
       "Filter element num should be equal to IFM element num");
-  int height = input_x->dims()[2];
+  int height = (uint32_t)input_x->dims()[2];
-  int width = input_x->dims()[3];
+  int width = (uint32_t)input_x->dims()[3];
   int filter_channel = chw / height / width;
-  input_y->Resize(framework::make_ddim({num, filter_channel, height, width}));
+  filter->Resize(framework::make_ddim({num, filter_channel, height, width}));
-  float max_value = fpga::filter_find_max(input_y);
+  float max_value = fpga::filter_find_max(filter);
-  fpga::format_filter(input_y, max_value, 1);
+  fpga::format_filter(filter, max_value, 1);
-  auto input_y_ptr = input_y->data<float>();
+  auto filter_ptr = filter->data<float>();
-  int element_num_per_div = fpga::get_element_num_per_div(input_y, 1);
+  int element_num_per_div = fpga::get_element_num_per_div(filter, 1);
   fpga::format_bias_scale_array(&bs_ptr, element_num_per_div, channel);
+  fpga::format_ofm(out);
   auto out_ptr = out->mutable_data<float>();

-  fpga::ConvArgs convArgs;
-  convArgs.relu_enabled = relu_enabled;
-  convArgs.filter_address = (void *)input_y_ptr;
-  convArgs.filter_num = out->dims()[1];
+  fpga::WrapperConvArgs convArgs;
   convArgs.group_num = 1;
-  convArgs.sb_address = (void *)bs_ptr;
-  convArgs.kernel.stride_w = 1;
-  convArgs.kernel.stride_h = 1;
-  convArgs.kernel.height = input_x->dims()[2];
-  convArgs.kernel.width = input_x->dims()[3];
-  convArgs.image.address = (void *)input_x_ptr;
-  convArgs.image.channels = input_x->dims()[1];
-  convArgs.image.height = input_x->dims()[2];
-  convArgs.image.width = input_x->dims()[3];
-  convArgs.image.pad_height = 0;
-  convArgs.image.pad_width = 0;
-  convArgs.image.scale_address = input_x->scale;
-  convArgs.output.address = (void *)out_ptr;
+  convArgs.split_num = (uint32_t)fpga::get_plit_num(filter);
+  convArgs.filter_num = (uint32_t)filter->dims()[0];
+  convArgs.output.address = out_ptr;
   convArgs.output.scale_address = out->scale;
+  convArgs.conv_args = (fpga::ConvArgs *)fpga::fpga_malloc(
+      convArgs.split_num * sizeof(fpga::ConvArgs));
   param->SetFpgaArgs(convArgs);
+
+  int element_num = fpga::get_aligned_filter_element_num(
+      filter->dims()[1] * filter->dims()[2] * filter->dims()[3]);
+  int n = convArgs.split_num;
+  for (int i = 0; i < n; i++) {
+    convArgs.conv_args[i].relu_enabled = relu_enabled;
+    convArgs.conv_args[i].group_num = 1;
+    convArgs.conv_args[i].kernel.stride_h = 1;
+    convArgs.conv_args[i].kernel.stride_w = 1;
+    convArgs.conv_args[i].kernel.height = (uint32_t)filter->dims()[2];
+    convArgs.conv_args[i].kernel.width = (uint32_t)filter->dims()[3];
+    convArgs.conv_args[i].image.address = input_x_ptr;
+    convArgs.conv_args[i].image.channels = (uint32_t)input_x->dims()[1];
+    convArgs.conv_args[i].image.height = (uint32_t)input_x->dims()[2];
+    convArgs.conv_args[i].image.width = (uint32_t)input_x->dims()[3];
+    convArgs.conv_args[i].image.pad_height = 0;
+    convArgs.conv_args[i].image.pad_width = 0;
+    convArgs.conv_args[i].filter_address =
+        &((int8_t *)filter_ptr)[i * element_num];
+    convArgs.conv_args[i].sb_address = &((int8_t *)bs_ptr)[i * element_num];
+    convArgs.conv_args[i].filter_num =
+        (uint32_t)(i == n - 1 ? fpga::get_aligned_filter_num(
+                                    channel - (n - 1) * element_num_per_div)
+                              : element_num_per_div);
+    convArgs.conv_args[i].output.scale_address =
+        (float *)fpga::fpga_malloc(2 * sizeof(float));
+    convArgs.conv_args[i].image.scale_address = input_x->scale;
+  }
   return true;
 }
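Note: the fully connected layer is mapped onto the conv hardware by reshaping its [chw, num] weight matrix into a conv filter whose kernel spans the entire input feature map. A worked example of the reshape above for a 32 x 7 x 7 input (chw = 1568) and 10 output neurons:

    // num            = filter->dims()[1] = 10
    // chw            = filter->dims()[0] = 1568   (== input_x->numel())
    // filter_channel = chw / (7 * 7)     = 32
    // reshaped       = {10, 32, 7, 7}; kernel 7 x 7, stride 1, pad 0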
 template <>
......
...@@ -21,58 +21,78 @@ namespace operators { ...@@ -21,58 +21,78 @@ namespace operators {
template <> template <>
bool FusionFcKernel<FPGA, float>::Init(FusionFcParam<FPGA> *param) { bool FusionFcKernel<FPGA, float>::Init(FusionFcParam<FPGA> *param) {
bool relu_enabled = false; bool relu_enabled = false;
-  Tensor *input_x = const_cast<Tensor *>(param->InputX());
+  auto input_x = const_cast<LoDTensor *>(param->InputX());
   auto input_x_ptr = input_x->data<float>();
-  Tensor *input_y = param->InputY();
+  auto filter = const_cast<Tensor *>(param->InputY());
   const Tensor *input_z = param->InputZ();
   auto input_z_ptr = input_z->data<float>();
-  Tensor *out = param->Out();
-  PADDLE_MOBILE_ENFORCE(input_x->dims()[1] == input_y->dims()[0],
+  auto out = param->Out();
+  PADDLE_MOBILE_ENFORCE(input_x->dims()[1] == filter->dims()[0],
                         "Image channel should be equal to weight number");
-  int channel = out->dims()[1];
-  float *bs_ptr = (float *)fpga::fpga_malloc(2 * channel * sizeof(float));
+  int channel = (uint32_t)out->dims()[1];
+  auto bs_ptr = (float *)fpga::fpga_malloc(2 * channel * sizeof(float));
   for (int i = 0; i < channel; i++) {
     bs_ptr[i + channel] = 1;
     bs_ptr[i] = input_z_ptr[i];
   }
-  int num = input_y->dims()[1];
-  int chw = input_y->dims()[0];
+  int num = (uint32_t)filter->dims()[1];
+  int chw = (uint32_t)filter->dims()[0];
   PADDLE_MOBILE_ENFORCE(
       chw == input_x->numel(),
       "Filter element num should be equal to IFM element num");
-  int height = input_x->dims()[2];
-  int width = input_x->dims()[3];
+  int height = (uint32_t)input_x->dims()[2];
+  int width = (uint32_t)input_x->dims()[3];
   int filter_channel = chw / height / width;
-  input_y->Resize(framework::make_ddim({num, filter_channel, height, width}));
-  float max_value = fpga::filter_find_max(input_y);
-  fpga::format_filter(input_y, max_value, 1);
-  auto input_y_ptr = input_y->data<float>();
-  int element_num_per_div = fpga::get_element_num_per_div(input_y, 1);
+  filter->Resize(framework::make_ddim({num, filter_channel, height, width}));
+  float max_value = fpga::filter_find_max(filter);
+  fpga::format_filter(filter, max_value, 1);
+  auto filter_ptr = filter->data<float>();
+  int element_num_per_div = fpga::get_element_num_per_div(filter, 1);
   fpga::format_bias_scale_array(&bs_ptr, element_num_per_div, channel);
   auto out_ptr = out->mutable_data<float>();
-  fpga::ConvArgs convArgs;
-  convArgs.relu_enabled = relu_enabled;
-  convArgs.filter_address = (void *)input_y_ptr;
-  convArgs.filter_num = out->dims()[1];
+  fpga::WrapperConvArgs convArgs;
   convArgs.group_num = 1;
-  convArgs.sb_address = (void *)bs_ptr;
-  convArgs.kernel.stride_w = 1;
-  convArgs.kernel.stride_h = 1;
-  convArgs.kernel.height = input_x->dims()[2];
-  convArgs.kernel.width = input_x->dims()[3];
-  convArgs.image.address = (void *)input_x_ptr;
-  convArgs.image.channels = input_x->dims()[1];
-  convArgs.image.height = input_x->dims()[2];
-  convArgs.image.width = input_x->dims()[3];
-  convArgs.image.pad_height = 0;
-  convArgs.image.pad_width = 0;
-  convArgs.image.scale_address = input_x->scale;
-  convArgs.output.address = (void *)out_ptr;
+  convArgs.split_num = (uint32_t)fpga::get_plit_num(filter);
+  convArgs.filter_num = (uint32_t)filter->dims()[0];
+  convArgs.output.address = out_ptr;
   convArgs.output.scale_address = out->scale;
+  convArgs.conv_args = (fpga::ConvArgs *)fpga::fpga_malloc(
+      convArgs.split_num * sizeof(fpga::ConvArgs));
   param->SetFpgaArgs(convArgs);
+  int element_num = fpga::get_aligned_filter_element_num(
+      filter->dims()[1] * filter->dims()[2] * filter->dims()[3]);
+  int n = convArgs.split_num;
+  for (int i = 0; i < n; i++) {
+    convArgs.conv_args[i].relu_enabled = relu_enabled;
+    convArgs.conv_args[i].group_num = 1;
+    convArgs.conv_args[i].kernel.stride_h = 1;
+    convArgs.conv_args[i].kernel.stride_w = 1;
+    convArgs.conv_args[i].kernel.height = (uint32_t)filter->dims()[2];
+    convArgs.conv_args[i].kernel.width = (uint32_t)filter->dims()[3];
+    convArgs.conv_args[i].image.address = input_x_ptr;
+    convArgs.conv_args[i].image.channels = (uint32_t)input_x->dims()[1];
+    convArgs.conv_args[i].image.height = (uint32_t)input_x->dims()[2];
+    convArgs.conv_args[i].image.width = (uint32_t)input_x->dims()[3];
+    convArgs.conv_args[i].image.pad_height = 0;
+    convArgs.conv_args[i].image.pad_width = 0;
+    convArgs.conv_args[i].filter_address =
+        &((int8_t *)filter_ptr)[i * element_num];
+    convArgs.conv_args[i].sb_address = &((int8_t *)bs_ptr)[i * element_num];
+    convArgs.conv_args[i].filter_num =
+        (uint32_t)(i == n - 1 ? fpga::get_aligned_filter_num(
+                                    channel - (n - 1) * element_num_per_div)
+                              : element_num_per_div);
+    convArgs.conv_args[i].output.scale_address =
+        (float *)fpga::fpga_malloc(2 * sizeof(float));
+    convArgs.conv_args[i].image.scale_address = input_x->scale;
+  }
   return true;
 }
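The rewritten Init above no longer fills a single ConvArgs: it allocates split_num per-chunk fpga::ConvArgs records and points each one at its slice of the formatted filter and bias/scale buffers. The per-chunk filter count follows a simple remainder rule, sketched standalone below with hypothetical sizes (a sketch, not the library code; the real kernel additionally rounds the last chunk up via fpga::get_aligned_filter_num):

#include <cstdio>
#include <vector>

// Minimal sketch of the split bookkeeping: the first n - 1 chunks each
// carry element_num_per_div filters, the last chunk carries the remainder
// (left unaligned here, unlike the real kernel).
std::vector<int> SplitFilterNums(int channel, int element_num_per_div,
                                 int split_num) {
  std::vector<int> nums(split_num, element_num_per_div);
  nums[split_num - 1] = channel - (split_num - 1) * element_num_per_div;
  return nums;
}

int main() {
  // Hypothetical: 1000 output channels split into chunks of 384.
  for (int n : SplitFilterNums(1000, 384, 3)) std::printf("%d ", n);  // 384 384 232
  return 0;
}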
...
@@ -21,7 +21,7 @@ namespace operators {
 template <>
 bool PoolKernel<FPGA, float>::Init(PoolParam<FPGA> *param) {
-  Tensor *input = const_cast<Tensor *>(param->Input());
+  auto *input = const_cast<Tensor *>(param->Input());
   auto input_ptr = input->data<float>();
   Tensor *output = param->Output();
   fpga::format_ofm(output);
@@ -31,19 +31,19 @@ bool PoolKernel<FPGA, float>::Init(PoolParam<FPGA> *param) {
   vector<int> paddings = param->Paddings();
   fpga::PoolingArgs poolArgs;
-  poolArgs.image.address = (void *)input_ptr;
-  poolArgs.image.channels = input->dims()[1];
-  poolArgs.image.height = input->dims()[2];
-  poolArgs.image.width = input->dims()[3];
-  poolArgs.image.pad_height = paddings[0];
-  poolArgs.image.pad_width = paddings[1];
+  poolArgs.image.address = input_ptr;
+  poolArgs.image.channels = (uint32_t)input->dims()[1];
+  poolArgs.image.height = (uint32_t)input->dims()[2];
+  poolArgs.image.width = (uint32_t)input->dims()[3];
+  poolArgs.image.pad_height = (uint32_t)paddings[0];
+  poolArgs.image.pad_width = (uint32_t)paddings[1];
   poolArgs.image.scale_address = input->scale;
   poolArgs.output.address = output_ptr;
   poolArgs.output.scale_address = input->scale;
-  poolArgs.kernel.height = ksize[0];
-  poolArgs.kernel.width = ksize[1];
-  poolArgs.kernel.stride_h = strides[0];
-  poolArgs.kernel.stride_w = strides[1];
+  poolArgs.kernel.height = (uint32_t)ksize[0];
+  poolArgs.kernel.width = (uint32_t)ksize[1];
+  poolArgs.kernel.stride_h = (uint32_t)strides[0];
+  poolArgs.kernel.stride_w = (uint32_t)strides[1];
   param->SetFpgaArgs(poolArgs);
   return true;
 }
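PoolingArgs hands the FPGA only the raw geometry: image dims, padding, window size, and strides. The output tensor presumably ends up with the usual pooling arithmetic, which for reference is (the standard formula, not code from this diff):

#include <cstdio>

// Standard pooled-output-size formula, applied per spatial dimension:
// out = (in + 2 * pad - ksize) / stride + 1
int PooledDim(int in, int pad, int ksize, int stride) {
  return (in + 2 * pad - ksize) / stride + 1;
}

int main() {
  // Hypothetical 112x112 input, 3x3 window, stride 2, no padding -> 55x55.
  std::printf("%d\n", PooledDim(112, 0, 3, 2));
  return 0;
}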
...
@@ -33,8 +33,8 @@ bool SoftmaxKernel<FPGA, float>::Init(SoftmaxParam<FPGA> *param) {
   args.convert_type = fpga::DATA_FP16_TO_FP32;
   args.layout_type = fpga::LAYOUT_NO_CONVERT;
   args.image.address = (void *)(input_ptr);
-  args.image.height = input->dims()[0];
-  args.image.width = input->dims()[1];
+  args.image.height = (uint32_t)input->dims()[0];
+  args.image.width = (uint32_t)input->dims()[1];
   args.image.channels = 1;
   args.output.address = output_ptr;
   param->SetFpgaArgs(args);
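Here args.convert_type = fpga::DATA_FP16_TO_FP32 asks the bypass engine to widen the FPGA's half-precision output before the CPU-side softmax runs. Functionally that is an IEEE binary16-to-binary32 conversion; a generic software equivalent (an illustration, not the library's implementation) looks like:

#include <cstdint>
#include <cstring>

// Generic IEEE-754 binary16 -> binary32 conversion.
float HalfToFloat(uint16_t h) {
  uint32_t sign = (uint32_t)(h & 0x8000u) << 16;
  uint32_t exp = (h >> 10) & 0x1Fu;
  uint32_t frac = h & 0x3FFu;
  uint32_t bits;
  if (exp == 0x1Fu) {                      // inf / NaN
    bits = sign | 0x7F800000u | (frac << 13);
  } else if (exp != 0) {                   // normal number: rebias exponent
    bits = sign | ((exp - 15 + 127) << 23) | (frac << 13);
  } else if (frac == 0) {                  // signed zero
    bits = sign;
  } else {                                 // subnormal: renormalize
    exp = 127 - 15 + 1;
    while ((frac & 0x400u) == 0) { frac <<= 1; --exp; }
    bits = sign | (exp << 23) | ((frac & 0x3FFu) << 13);
  }
  float f;
  std::memcpy(&f, &bits, sizeof(f));
  return f;
}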
...
@@ -13,7 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 #include "operators/math/gemm.h"
-#include <string>
+#include <string.h>
 #include "common/log.h"
 #include "memory/t_malloc.h"
 #if __ARM_NEON
@@ -2985,6 +2985,8 @@ void WriteWithBn(int mc, int nc, float *c, float *C, int ldc, float *new_scale,
 void WriteWithBnRelu(int mc, int nc, float *c, float *C, int ldc,
                      float *new_scale, float *new_bias) {}
+void WriteWithBnAddRelu(int mc, int nc, float *c, float *C, int ldc,
+                        float *new_scale, float *new_bias, float *bias1) {}
 #endif  // __ARM_NEON
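The new WriteWithBnAddRelu stub completes the set of GEMM write-back epilogues on the non-NEON fallback path. Judging from its WriteWithBn/WriteWithBnRelu siblings (an inference from the naming, not something this diff states), the epilogue it stands in for scales by the folded batch-norm coefficients, adds a second bias matrix, and clamps at zero. A scalar reference might read:

// Scalar reference sketch; the per-row scale/bias and the ldc-strided
// bias1 matrix are assumptions about the intended layout.
void WriteWithBnAddReluRef(int mc, int nc, const float *c, float *C, int ldc,
                           const float *new_scale, const float *new_bias,
                           const float *bias1) {
  for (int i = 0; i < mc; ++i) {
    for (int j = 0; j < nc; ++j) {
      float v = c[i * nc + j] * new_scale[i] + new_bias[i] + bias1[i * ldc + j];
      C[i * ldc + j] = v > 0.f ? v : 0.f;  // fused ReLU
    }
  }
}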
...
@@ -489,6 +489,15 @@ class ConcatParam : public OpParam {
   vector<GType *> inputs_;
   GType *out_;
   int axis_;
+#ifdef PADDLE_MOBILE_FPGA
+ private:
+  fpga::ConcatArgs fpga_concat_args;
+ public:
+  const fpga::ConcatArgs &FpgaArgs() const { return fpga_concat_args; }
+  void SetFpgaArgs(const fpga::ConcatArgs &args) { fpga_concat_args = args; }
+#endif
 };
 #endif
@@ -1238,11 +1247,7 @@ class FusionFcParam : public OpParam {
   }
   const GType *InputX() const { return input_x_; }
-#ifdef PADDLE_MOBILE_FPGA
-  RType *InputY() const { return input_y_; }
-#else
   const RType *InputY() const { return input_y_; }
-#endif
   const RType *InputZ() const { return input_z_; }
@@ -1265,11 +1270,11 @@ class FusionFcParam : public OpParam {
 #ifdef PADDLE_MOBILE_FPGA
  private:
-  fpga::ConvArgs fpga_conv_args;
+  fpga::WrapperConvArgs fpga_conv_args;
  public:
-  const fpga::ConvArgs &FpgaArgs() const { return fpga_conv_args; }
-  void SetFpgaArgs(const fpga::ConvArgs &args) { fpga_conv_args = args; }
+  const fpga::WrapperConvArgs &FpgaArgs() const { return fpga_conv_args; }
+  void SetFpgaArgs(const fpga::WrapperConvArgs &args) { fpga_conv_args = args; }
 #endif
 };
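Each FPGA-enabled param class caches its fpga::WrapperConvArgs behind the same FpgaArgs/SetFpgaArgs pair, a pattern repeated verbatim in the classes below. The point is that Init() pays the argument-building cost once at model-load time, leaving Compute() a plain dispatch; in miniature (hypothetical names apart from the getter/setter pair):

struct DemoArgs { int group_num = 1; };  // stand-in for fpga::WrapperConvArgs

class DemoParam {
 public:
  const DemoArgs &FpgaArgs() const { return fpga_args_; }
  void SetFpgaArgs(const DemoArgs &args) { fpga_args_ = args; }

 private:
  DemoArgs fpga_args_;
};

bool Init(DemoParam *param) {
  DemoArgs args;             // built once, at model-load time
  param->SetFpgaArgs(args);
  return true;
}

void Compute(const DemoParam &param) {
  const DemoArgs &args = param.FpgaArgs();  // a real kernel would hand this
  (void)args;                               // straight to the FPGA driver
}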
@@ -1303,11 +1308,7 @@ class FusionConvAddParam : public OpParam {
   const RType *Input() const { return input_; }
-#ifdef PADDLE_MOBILE_FPGA
-  RType *Filter() const { return filter_; }
-#else
   const RType *Filter() const { return filter_; }
-#endif
   RType *Output() const { return output_; }
@@ -1332,11 +1333,11 @@ class FusionConvAddParam : public OpParam {
 #ifdef PADDLE_MOBILE_FPGA
  private:
-  fpga::ConvArgs fpga_conv_args;
+  fpga::WrapperConvArgs fpga_conv_args;
  public:
-  const fpga::ConvArgs &FpgaArgs() const { return fpga_conv_args; }
-  void SetFpgaArgs(const fpga::ConvArgs &args) { fpga_conv_args = args; }
+  const fpga::WrapperConvArgs &FpgaArgs() const { return fpga_conv_args; }
+  void SetFpgaArgs(const fpga::WrapperConvArgs &args) { fpga_conv_args = args; }
 #endif
 };
@@ -1385,11 +1386,7 @@ class FusionConvAddPReluParam : public OpParam {
   const RType *Input() const { return input_; }
-#ifdef PADDLE_MOBILE_FPGA
-  RType *Filter() const { return filter_; }
-#else
   const RType *Filter() const { return filter_; }
-#endif
   RType *Output() const { return output_; }
@@ -1416,11 +1413,11 @@ class FusionConvAddPReluParam : public OpParam {
 #ifdef PADDLE_MOBILE_FPGA
  private:
-  fpga::ConvArgs fpga_conv_args;
+  fpga::WrapperConvArgs fpga_conv_args;
  public:
-  const fpga::ConvArgs &FpgaArgs() const { return fpga_conv_args; }
-  void SetFpgaArgs(const fpga::ConvArgs &args) { fpga_conv_args = args; }
+  const fpga::WrapperConvArgs &FpgaArgs() const { return fpga_conv_args; }
+  void SetFpgaArgs(const fpga::WrapperConvArgs &args) { fpga_conv_args = args; }
 #endif
 };
 #endif
@@ -1467,11 +1464,7 @@ class FusionConvAddAddPReluParam : public OpParam {
   const RType *Input() const { return input_; }
-#ifdef PADDLE_MOBILE_FPGA
-  RType *Filter() const { return filter_; }
-#else
   const RType *Filter() const { return filter_; }
-#endif
   RType *Output() const { return output_; }
@@ -1502,11 +1495,11 @@ class FusionConvAddAddPReluParam : public OpParam {
 #ifdef PADDLE_MOBILE_FPGA
  private:
-  fpga::ConvArgs fpga_conv_args;
+  fpga::WrapperConvArgs fpga_conv_args;
  public:
-  const fpga::ConvArgs &FpgaArgs() const { return fpga_conv_args; }
-  void SetFpgaArgs(const fpga::ConvArgs &args) { fpga_conv_args = args; }
+  const fpga::WrapperConvArgs &FpgaArgs() const { return fpga_conv_args; }
+  void SetFpgaArgs(const fpga::WrapperConvArgs &args) { fpga_conv_args = args; }
 #endif
 };
 #endif
@@ -1544,11 +1537,7 @@ class FusionConvAddBNReluParam : public OpParam {
   const RType *Input() const { return input_; }
-#ifdef PADDLE_MOBILE_FPGA
-  RType *Filter() const { return filter_; }
-#else
   const RType *Filter() const { return filter_; }
-#endif
   RType *Output() const { return output_; }
@@ -1604,11 +1593,11 @@ class FusionConvAddBNReluParam : public OpParam {
 #ifdef PADDLE_MOBILE_FPGA
  private:
-  fpga::ConvArgs fpga_conv_args;
+  fpga::WrapperConvArgs fpga_conv_args;
  public:
-  const fpga::ConvArgs &FpgaArgs() const { return fpga_conv_args; }
-  void SetFpgaArgs(const fpga::ConvArgs &args) { fpga_conv_args = args; }
+  const fpga::WrapperConvArgs &FpgaArgs() const { return fpga_conv_args; }
+  void SetFpgaArgs(const fpga::WrapperConvArgs &args) { fpga_conv_args = args; }
 #endif
 };
 #endif
@@ -1654,11 +1643,7 @@ class FusionConvBNAddReluParam : public OpParam {
   const RType *Input() const { return input_; }
-#ifdef PADDLE_MOBILE_FPGA
-  RType *Filter() const { return filter_; }
-#else
   const RType *Filter() const { return filter_; }
-#endif
   RType *Output() const { return output_; }
@@ -1717,11 +1702,11 @@ class FusionConvBNAddReluParam : public OpParam {
 #ifdef PADDLE_MOBILE_FPGA
  private:
-  fpga::ConvArgs fpga_conv_args;
+  fpga::WrapperConvArgs fpga_conv_args;
  public:
-  const fpga::ConvArgs &FpgaArgs() const { return fpga_conv_args; }
-  void SetFpgaArgs(const fpga::ConvArgs &args) { fpga_conv_args = args; }
+  const fpga::WrapperConvArgs &FpgaArgs() const { return fpga_conv_args; }
+  void SetFpgaArgs(const fpga::WrapperConvArgs &args) { fpga_conv_args = args; }
 #endif
 };
 #endif
@@ -1754,11 +1739,8 @@ class FusionConvBNParam : public OpParam {
   const RType *Input() const { return input_; }
-#ifdef PADDLE_MOBILE_FPGA
-  RType *Filter() const { return filter_; }
-#else
   const RType *Filter() const { return filter_; }
-#endif
   RType *Output() const { return output_y_; }
   const vector<int> &Strides() const { return strides_; }
@@ -1811,11 +1793,11 @@ class FusionConvBNParam : public OpParam {
 #ifdef PADDLE_MOBILE_FPGA
  private:
-  fpga::ConvArgs fpga_conv_args;
+  fpga::WrapperConvArgs fpga_conv_args;
  public:
-  const fpga::ConvArgs &FpgaArgs() const { return fpga_conv_args; }
-  void SetFpgaArgs(const fpga::ConvArgs &args) { fpga_conv_args = args; }
+  const fpga::WrapperConvArgs &FpgaArgs() const { return fpga_conv_args; }
+  void SetFpgaArgs(const fpga::WrapperConvArgs &args) { fpga_conv_args = args; }
 #endif
 };
 #endif
@@ -1853,11 +1835,8 @@ class FusionConvAddBNParam : public OpParam {
   const RType *Input() const { return input_; }
-#ifdef PADDLE_MOBILE_FPGA
-  RType *Filter() const { return filter_; }
-#else
   const RType *Filter() const { return filter_; }
-#endif
   RType *Output() const { return output_y_; }
   const vector<int> &Strides() const { return strides_; }
@@ -1912,11 +1891,11 @@ class FusionConvAddBNParam : public OpParam {
 #ifdef PADDLE_MOBILE_FPGA
  private:
-  fpga::ConvArgs fpga_conv_args;
+  fpga::WrapperConvArgs fpga_conv_args;
  public:
-  const fpga::ConvArgs &FpgaArgs() const { return fpga_conv_args; }
-  void SetFpgaArgs(const fpga::ConvArgs &args) { fpga_conv_args = args; }
+  const fpga::WrapperConvArgs &FpgaArgs() const { return fpga_conv_args; }
+  void SetFpgaArgs(const fpga::WrapperConvArgs &args) { fpga_conv_args = args; }
 #endif
 };
 #endif
@@ -2033,11 +2012,7 @@ class FusionConvBNReluParam : public OpParam {
   const RType *Input() const { return input_; }
-#ifdef PADDLE_MOBILE_FPGA
-  RType *Filter() const { return filter_; }
-#else
   const RType *Filter() const { return filter_; }
-#endif
   RType *Output() const { return output_; }
@@ -2091,11 +2066,11 @@ class FusionConvBNReluParam : public OpParam {
 #ifdef PADDLE_MOBILE_FPGA
  private:
-  fpga::ConvArgs fpga_conv_args;
+  fpga::WrapperConvArgs fpga_conv_args;
  public:
-  const fpga::ConvArgs &FpgaArgs() const { return fpga_conv_args; }
-  void SetFpgaArgs(const fpga::ConvArgs &args) { fpga_conv_args = args; }
+  const fpga::WrapperConvArgs &FpgaArgs() const { return fpga_conv_args; }
+  void SetFpgaArgs(const fpga::WrapperConvArgs &args) { fpga_conv_args = args; }
 #endif
 };
 #endif
@@ -2147,15 +2122,20 @@ class DropoutParam : public OpParam {
               const AttributeMap &attrs, const Scope &scope) {
     input_x_ = InputXFrom<GType>(inputs, scope);
     out_ = OutFrom<GType>(outputs, scope);
+    dropout_prob_ = GetAttr<float>("dropout_prob", attrs);
   }
   const RType *InputX() const { return input_x_; }
   RType *Out() const { return out_; }
+  float DropoutProb() const { return dropout_prob_; }
  private:
   RType *input_x_;
   RType *out_;
+  float dropout_prob_;
 };
 #endif
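DropoutParam now also captures the dropout_prob attribute. At inference time, dropout typically degenerates to a uniform (1 - p) rescale with no random masking; a sketch of how a kernel might consume the new DropoutProb() accessor (an assumption about the eventual kernel, not code in this diff):

#include <vector>

// Inference-time dropout: scale every activation by (1 - p), no masking.
void DropoutInfer(const std::vector<float> &in, float dropout_prob,
                  std::vector<float> *out) {
  out->resize(in.size());
  for (size_t i = 0; i < in.size(); ++i) {
    (*out)[i] = in[i] * (1.0f - dropout_prob);
  }
}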
...
@@ -208,6 +208,14 @@ else ()
     target_link_libraries(test-gru-op paddle-mobile)
     # gen test
+    ADD_EXECUTABLE(test-inceptionv4 net/test_inceptionv4.cpp test_helper.h test_include.h executor_for_test.h)
+    target_link_libraries(test-inceptionv4 paddle-mobile)
+
+    # gen test
+    ADD_EXECUTABLE(test-alexnet net/test_alexnet.cpp test_helper.h test_include.h executor_for_test.h)
+    target_link_libraries(test-alexnet paddle-mobile)
+
     ADD_EXECUTABLE(test-googlenetv1 net/test_googlenetv1_combine.cpp test_helper.h test_include.h)
     target_link_libraries(test-googlenetv1 paddle-mobile)
@@ -215,10 +223,13 @@ else ()
     ADD_EXECUTABLE(test-fssd net/test_mobilenet_025_fssd.cpp test_helper.h test_include.h)
     target_link_libraries(test-fssd paddle-mobile)
     #add_library(test-lib-size SHARED common/test_lib_size.h common/test_lib_size.cpp)
 endif()
 # if(FPGA)
...
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <iostream>
#include "../test_helper.h"
#include "../test_include.h"
int main() {
paddle_mobile::PaddleMobile<paddle_mobile::CPU> paddle_mobile;
paddle_mobile.SetThreadNum(4);
auto time1 = time();
// auto isok = paddle_mobile.Load(std::string(g_mobilenet_detect) + "/model",
// std::string(g_mobilenet_detect) + "/params", true);
auto isok = paddle_mobile.Load(g_alexnet, true);
if (isok) {
auto time2 = time();
std::cout << "load cost :" << time_diff(time1, time1) << "ms" << std::endl;
std::vector<float> input;
std::vector<int64_t> dims{1, 3, 224, 224};
GetInput<float>(g_test_image_1x3x224x224_banana, &input, dims);
auto vec_result = paddle_mobile.Predict(input, dims);
std::vector<float>::iterator biggest =
std::max_element(std::begin(vec_result), std::end(vec_result));
std::cout << " Max element is " << *biggest << " at position "
<< std::distance(std::begin(vec_result), biggest) << std::endl;
    // Warm up with ten runs.
for (int i = 0; i < 10; ++i) {
auto vec_result = paddle_mobile.Predict(input, dims);
}
auto time3 = time();
for (int i = 0; i < 10; ++i) {
auto vec_result = paddle_mobile.Predict(input, dims);
}
    auto time4 = time();
    DLOG << vec_result;
    std::cout << "predict cost :" << time_diff(time3, time4) / 10 << "ms"
              << std::endl;
}
std::cout << "如果结果Nan请查看: test/images/g_test_image_1x3x224x224_banana "
"是否存在?"
<< std::endl;
return 0;
}
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <iostream>
#include "../test_helper.h"
#include "../test_include.h"
int main() {
paddle_mobile::PaddleMobile<paddle_mobile::CPU> paddle_mobile;
paddle_mobile.SetThreadNum(4);
auto time1 = time();
// auto isok = paddle_mobile.Load(std::string(g_mobilenet_detect) + "/model",
// std::string(g_mobilenet_detect) + "/params", true);
auto isok = paddle_mobile.Load(g_inceptionv4, true);
if (isok) {
auto time2 = time();
std::cout << "load cost :" << time_diff(time1, time1) << "ms" << std::endl;
std::vector<float> input;
std::vector<int64_t> dims{1, 3, 224, 224};
GetInput<float>(g_test_image_1x3x224x224_banana, &input, dims);
auto vec_result = paddle_mobile.Predict(input, dims);
std::vector<float>::iterator biggest =
std::max_element(std::begin(vec_result), std::end(vec_result));
std::cout << " Max element is " << *biggest << " at position "
<< std::distance(std::begin(vec_result), biggest) << std::endl;
    // Warm up with ten runs.
for (int i = 0; i < 10; ++i) {
auto vec_result = paddle_mobile.Predict(input, dims);
}
auto time3 = time();
for (int i = 0; i < 10; ++i) {
auto vec_result = paddle_mobile.Predict(input, dims);
}
// DLOG << vec_result;
auto time4 = time();
std::cout << "predict cost :" << time_diff(time3, time4) / 10 << "ms"
<< std::endl;
}
std::cout << "如果结果Nan请查看: test/images/g_test_image_1x3x224x224_banana "
"是否存在?"
<< std::endl;
return 0;
}
...
@@ -34,6 +34,8 @@ static const char *g_mobilenet_detect = "../models/mobilenet-detect";
 static const char *g_squeezenet = "../models/squeezenet";
 static const char *g_googlenet = "../models/googlenet";
 static const char *g_mobilenet = "../models/mobilenet";
+static const char *g_alexnet = "../models/alexnet";
+static const char *g_inceptionv4 = "../models/inceptionv4";
 static const char *g_nlp = "../models/nlp";
 static const char *g_resnet_50 = "../models/resnet_50";
 static const char *g_resnet = "../models/resnet";
...