提交 c2a649d3 编写于 作者: qnqinan's avatar qnqinan

add dw deconv with group in FPGA track

上级 c5ad3169
...@@ -151,6 +151,30 @@ void format_dwconv_filter(framework::Tensor *filter_tensor, float *scale_ptr) { ...@@ -151,6 +151,30 @@ void format_dwconv_filter(framework::Tensor *filter_tensor, float *scale_ptr) {
filter_tensor->reset_data_ptr(new_data); filter_tensor->reset_data_ptr(new_data);
} }
// Converts the filter of a depthwise deconvolution into the layout produced
// by deconv_filter::DWDconv_format_filter and installs the converted buffer
// in `filter_tensor`.
//
// filter_tensor: filter whose dims are read as [0]=num, [2]=height, [3]=width.
//                The byte size below is num * height * width floats, i.e. it
//                assumes dims[1] == 1 -- TODO confirm with callers.
// scale_ptr:     forwarded to DWDconv_format_filter, which passes it to the
//                fp16 quantisation step.
// stride:        forwarded as the sub-convolution count.
void format_DWDconv_filter(framework::Tensor *filter_tensor, float *scale_ptr,
                           int stride) {
  auto dims = filter_tensor->dims();
  auto kernel_count = dims[0];
  auto height = dims[2];
  auto width = dims[3];
  auto source = filter_tensor->data<float>();

  // Operate on a private copy; the helpers below replace the buffer they are
  // handed through the pointer-to-pointer argument.
  size_t byte_count = kernel_count * height * width * sizeof(float);
  float *buffer = static_cast<float *>(fpga_malloc(byte_count));
  fpga_copy(buffer, source, byte_count);

  int plane = height * width;
  deconv_filter::deconv_NC_convert(&buffer, kernel_count, 1, plane);

  // For the formatting step the roles are swapped: dims[1] is passed as
  // `num` and dims[0] as `channel`.
  auto format_num = dims[1];
  int channel = dims[0];
  deconv_filter::DWDconv_format_filter(&buffer, format_num, channel, height,
                                       width, scale_ptr, stride);

  // The tensor's dims are intentionally left unchanged; only the data
  // pointer is swapped.
  filter_tensor->reset_data_ptr(buffer);
}
void format_fc_filter(framework::Tensor *filter_tensor, float max_value) { void format_fc_filter(framework::Tensor *filter_tensor, float max_value) {
filter_tensor->scale[0] = float(max_value / 127.0); // NOLINT filter_tensor->scale[0] = float(max_value / 127.0); // NOLINT
filter_tensor->scale[1] = float(127.0 / max_value); // NOLINT filter_tensor->scale[1] = float(127.0 / max_value); // NOLINT
...@@ -243,6 +267,17 @@ void format_dwconv_data(framework::Tensor *filter_tensor, ...@@ -243,6 +267,17 @@ void format_dwconv_data(framework::Tensor *filter_tensor,
format_bias_array(bias_ptr, channel); format_bias_array(bias_ptr, channel);
format_fp16_ofm(ofm_tensor); format_fp16_ofm(ofm_tensor);
} }
// Prepares filter, bias/scale array and output tensor for a depthwise
// deconvolution.
//
// filter_tensor: filter to be re-formatted by format_DWDconv_filter.
// ofm_tensor:    output feature map; dims()[1] is used as the channel count.
// bs_ptr:        address of the bias/scale array pointer. The filter scale
//                slot is taken at offset sub_conv_n * channel from *bs_ptr.
// group:         currently unused by this function.
// sub_conv_n:    number of sub-convolutions (passed on as the stride).
void format_DWDeconv_data(framework::Tensor *filter_tensor,
                          framework::Tensor *ofm_tensor, float **bs_ptr,
                          int group, int sub_conv_n) {
  int channel = ofm_tensor->dims()[1];

  // The scale pointer is derived from the current *bs_ptr before
  // format_bias_array is given the chance to touch bs_ptr, so the call order
  // below must be kept.
  float *filter_scale = *bs_ptr + sub_conv_n * channel;
  format_DWDconv_filter(filter_tensor, filter_scale, sub_conv_n);

  format_bias_array(bs_ptr, channel);
  format_fp16_ofm(ofm_tensor);
}
void expand_conv_arg(ConvArgs *arg) { void expand_conv_arg(ConvArgs *arg) {
ConvArgs args = *arg; ConvArgs args = *arg;
...@@ -788,5 +823,109 @@ void fill_dwconv_arg(struct DWconvArgs *arg, framework::Tensor *input, ...@@ -788,5 +823,109 @@ void fill_dwconv_arg(struct DWconvArgs *arg, framework::Tensor *input,
arg->output.scale_address = out->scale; arg->output.scale_address = out->scale;
} // end dwconv arg fill } // end dwconv arg fill
// Fills `arg` for a depthwise deconvolution that is executed as
// `stride_w` depthwise sub-convolutions.
//
// arg:          structure to populate; one DWconvArgs is appended to
//               arg->dw_conv_args per sub-convolution, and every buffer
//               allocated here is registered in arg->vector_dw_conv_space so
//               that it is released with fpga_free.
// input / out:  input and output feature maps. `out` is re-formatted to
//               {1, filter_num, real_out_height, real_out_width}.
// filter:       already formatted filter; dims()[0] is used as both
//               group_num and filter_num, dims()[3] as the filter width.
// relu_enabled: copied into every sub-convolution.
// stride_h, padding_h: not read; only stride_w / padding_w are used, and the
//               same sub-filter width and sub-pad are applied to both axes.
// bias_ptr:     shared bias address for all sub-convolutions.
void fill_DWDeconv_arg(struct DWDeconvArgs *arg, framework::Tensor *input,
                       framework::Tensor *out, framework::Tensor *filter,
                       bool relu_enabled, int stride_h, int stride_w,
                       int padding_h, int padding_w, float *bias_ptr) {
  auto filter_ptr = filter->data<float>();
  auto input_ptr = input->data<float>();
  // Kept for its effect on `out`; the returned pointer itself is not needed.
  out->mutable_data<float>();
  auto deleter = [](void *p) { fpga_free(p); };

  arg->group_num = static_cast<uint32_t>(filter->dims()[0]);
  arg->sub_conv_num = static_cast<uint32_t>(stride_w);
  arg->filter_num = static_cast<uint32_t>(filter->dims()[0]);
  int sub_conv_num = stride_w;

  int sub_pad = deconv_filter::deconv_calc_sub_pad(
      static_cast<int>(filter->dims()[3]), padding_w, stride_w);
  auto sub_filter_width =
      static_cast<uint32_t>(deconv_filter::deconv_get_sub_filter_axis(
          static_cast<int>(filter->dims()[3]), stride_w));
  auto sub_output_width =
      static_cast<uint32_t>(deconv_filter::deconv_get_sub_out_axis(
          static_cast<int>(input->dims()[3]), sub_pad, sub_filter_width));
  auto sub_output_height =
      static_cast<uint32_t>(deconv_filter::deconv_get_sub_out_axis(
          static_cast<int>(input->dims()[2]), sub_pad, sub_filter_width));
  arg->sub_output_width = sub_output_width;
  arg->sub_output_height = sub_output_height;
  arg->omit_size = static_cast<uint32_t>(deconv_filter::deconv_get_omit(
      stride_w, static_cast<int>(filter->dims()[3]), padding_w));

  auto sub_channels = static_cast<int>(input->dims()[1]);
  uint32_t omit_size = arg->omit_size;
  // Final output size once `omit_size` rows/columns are cut from each border.
  int real_out_width = sub_output_width * sub_conv_num - 2 * omit_size;
  int real_out_height = sub_output_height * sub_conv_num - 2 * omit_size;

  framework::DDim dims_out_new = framework::make_ddim(
      {1, arg->filter_num, real_out_height, real_out_width});
  fpga::format_fp16_ofm(out, dims_out_new);
  auto out_ptr = out->data<float>();

  arg->output.address = out_ptr;
  arg->output.scale_address = out->scale;

  // Number of 16-bit filter elements consumed by one sub-convolution.
  int filter_offset = sub_filter_width * sub_filter_width *
                      align_to_x(sub_channels, FILTER_ELEMENT_ALIGNMENT) *
                      arg->sub_conv_num;

  for (int i = 0; i < sub_conv_num; ++i) {
    arg->dw_conv_args.push_back(std::make_shared<DWconvArgs>());
    // Only dw_conv_args is indexed here and it is not grown again inside
    // this iteration, so the reference stays valid.
    auto &conv = arg->dw_conv_args[i];

    conv->sub_conv_num = sub_conv_num;
    conv->relu_enabled = relu_enabled;
    conv->bias_address = bias_ptr;

    // Each sub-convolution gets its own copy of its slice of the filter.
    conv->filter_address = fpga_malloc(filter_offset * sizeof(int16_t));
    memcpy(conv->filter_address,
           (reinterpret_cast<half *>(filter_ptr) + i * filter_offset),
           filter_offset * sizeof(int16_t));
    arg->vector_dw_conv_space.push_back(std::shared_ptr<char>(
        reinterpret_cast<char *>(conv->filter_address), deleter));

    conv->kernel.height = sub_filter_width;
    conv->kernel.width = sub_filter_width;
    conv->kernel.stride_h = static_cast<uint32_t>(1);
    conv->kernel.stride_w = static_cast<uint32_t>(1);

    conv->image.address = input_ptr;
    conv->image.channels = static_cast<uint32_t>(input->dims()[1]);
    conv->image.height = static_cast<uint32_t>(input->dims()[2]);
    conv->image.width = static_cast<uint32_t>(input->dims()[3]);
    conv->image.pad_height = sub_pad;
    conv->image.pad_width = sub_pad;
    conv->image.scale_address = input->scale;

    // Intermediate result: sub_output_height rows, each row aligned to
    // IMAGE_ALIGNMENT and wide enough for sub_conv_num interleaved outputs.
    conv->output.address = fpga_malloc(
        sub_output_height *
        align_to_x(sub_output_width * sub_channels * sub_conv_num,
                   IMAGE_ALIGNMENT) *
        sizeof(int16_t));
    conv->output.scale_address =
        static_cast<float *>(fpga_malloc(2 * sizeof(float)));
    arg->vector_dw_conv_space.push_back(std::shared_ptr<char>(
        reinterpret_cast<char *>(conv->output.address), deleter));
    arg->vector_dw_conv_space.push_back(std::shared_ptr<char>(
        reinterpret_cast<char *>(conv->output.scale_address), deleter));
  }
}  // end DWDeconv arg fill
} // namespace fpga } // namespace fpga
} // namespace paddle_mobile } // namespace paddle_mobile
...@@ -57,6 +57,10 @@ void fill_dwconv_arg(struct DWconvArgs* arg, framework::Tensor* input, ...@@ -57,6 +57,10 @@ void fill_dwconv_arg(struct DWconvArgs* arg, framework::Tensor* input,
framework::Tensor* out, framework::Tensor* filter, framework::Tensor* out, framework::Tensor* filter,
bool relu_enabled, int stride_h, int stride_w, bool relu_enabled, int stride_h, int stride_w,
int padding_h, int padding_w, float* bias_ptr); int padding_h, int padding_w, float* bias_ptr);
// Populates `arg` for a depthwise deconvolution: one DWconvArgs is created
// per sub-convolution (their count is taken from stride_w). The definition
// reads only stride_w / padding_w; stride_h / padding_h are not used there.
void fill_DWDeconv_arg(struct DWDeconvArgs* arg, framework::Tensor* input,
framework::Tensor* out, framework::Tensor* filter,
bool relu_enabled, int stride_h, int stride_w,
int padding_h, int padding_w, float* bs_ptr);
void format_deconv_filter(framework::Tensor* filter_tensor, float max_value, void format_deconv_filter(framework::Tensor* filter_tensor, float max_value,
int group_num, int stride); int group_num, int stride);
...@@ -69,6 +73,10 @@ void format_deconv_data(framework::Tensor* filter_tensor, ...@@ -69,6 +73,10 @@ void format_deconv_data(framework::Tensor* filter_tensor,
void format_dwconv_data(framework::Tensor* filter_tensor, void format_dwconv_data(framework::Tensor* filter_tensor,
framework::Tensor* ofm_tensor, float* scale_ptr, framework::Tensor* ofm_tensor, float* scale_ptr,
float** bias_ptr); float** bias_ptr);
// Formats the filter, the bias/scale array behind `bs_ptr` and the output
// tensor for a depthwise deconvolution. `group` is accepted but not read by
// the definition; `sub_conv_n` is the number of sub-convolutions.
void format_DWDeconv_data(framework::Tensor* filter_tensor,
framework::Tensor* ofm_tensor, float** bs_ptr,
int group, int sub_conv_n);
template <typename Dtype> template <typename Dtype>
void savefile(std::string filename, void* buffer, int dataSize, Dtype tmp) { void savefile(std::string filename, void* buffer, int dataSize, Dtype tmp) {
float data; float data;
......
...@@ -21,15 +21,6 @@ limitations under the License. */ ...@@ -21,15 +21,6 @@ limitations under the License. */
#include "fpga/V1/api.h" #include "fpga/V1/api.h"
// #include "fpga_api.h" // #include "fpga_api.h"
// just for test
//#include <string>
//#include "deconv.h"
//#include "deconv_api.h"
// using namespace std;
// using namespace paddle_mobile::fpga;
// using namespace baidu::fpga::deconv::api;
// namespace api = baidu::fpga::deconv::api;
namespace paddle_mobile { namespace paddle_mobile {
namespace fpga { namespace fpga {
namespace deconv_filter { namespace deconv_filter {
...@@ -42,7 +33,8 @@ void deconv_inverse_filter(float** data_in, int num, int channel, int width, ...@@ -42,7 +33,8 @@ void deconv_inverse_filter(float** data_in, int num, int channel, int width,
float* tmp = *data_in; float* tmp = *data_in;
int data_size = num * channel * width * height; int data_size = num * channel * width * height;
int hw_len = height * width; int hw_len = height * width;
auto tmp_data = (float*)fpga_malloc(data_size * sizeof(float)); auto tmp_data =
reinterpret_cast<float*>(fpga_malloc(data_size * sizeof(float)));
for (int i = 0; i < num; ++i) { for (int i = 0; i < num; ++i) {
for (int j = 0; j < channel; ++j) { for (int j = 0; j < channel; ++j) {
for (int k = 0; k < hw_len; ++k) { for (int k = 0; k < hw_len; ++k) {
...@@ -97,9 +89,10 @@ int deconv_get_omit(int stride, int filter_width, int pad) { ...@@ -97,9 +89,10 @@ int deconv_get_omit(int stride, int filter_width, int pad) {
return (stride - idx); return (stride - idx);
} }
void deconv_get_sub_filter(char** data_in, int height, int width, template <typename T>
int sub_conv_n, int kernel_num, int channel) { void deconv_get_sub_filter(T** data_in, int height, int width, int sub_conv_n,
char* ptr_tmp = *data_in; int kernel_num, int channel) {
T* ptr_tmp = *data_in;
int sub_num = kernel_num * sub_conv_n; int sub_num = kernel_num * sub_conv_n;
int sub_h = height / sub_conv_n; int sub_h = height / sub_conv_n;
int sub_w = width / sub_conv_n; int sub_w = width / sub_conv_n;
...@@ -107,7 +100,8 @@ void deconv_get_sub_filter(char** data_in, int height, int width, ...@@ -107,7 +100,8 @@ void deconv_get_sub_filter(char** data_in, int height, int width,
int sub_filter_size = int sub_filter_size =
kernel_num * sub_h * sub_w * channel * sub_conv_n * sub_conv_n; kernel_num * sub_h * sub_w * channel * sub_conv_n * sub_conv_n;
char* ptr_sub_filter = (char*)fpga_malloc(sub_filter_size * sizeof(char)); T* ptr_sub_filter =
reinterpret_cast<T*>(fpga_malloc(sub_filter_size * sizeof(T)));
for (int idx = 0; idx < sub_conv_n; ++idx) { for (int idx = 0; idx < sub_conv_n; ++idx) {
for (int nn = 0; nn < sub_num; ++nn) { for (int nn = 0; nn < sub_num; ++nn) {
int ni = nn % kernel_num; int ni = nn % kernel_num;
...@@ -124,7 +118,7 @@ void deconv_get_sub_filter(char** data_in, int height, int width, ...@@ -124,7 +118,7 @@ void deconv_get_sub_filter(char** data_in, int height, int width,
fpga_copy( fpga_copy(
ptr_sub_filter + idx * sub_h * sub_w * channel * sub_num + sidx, ptr_sub_filter + idx * sub_h * sub_w * channel * sub_num + sidx,
(*data_in) + kidx, channel * sizeof(char)); (*data_in) + kidx, channel * sizeof(T));
// for (int cc =0; cc < channel; ++cc) { // for (int cc =0; cc < channel; ++cc) {
// ptr_sub_filter[idx*sub_h*sub_w*channel*sub_num + sidx + cc] = // ptr_sub_filter[idx*sub_h*sub_w*channel*sub_num + sidx + cc] =
// (*data_in)[kidx + cc]; // (*data_in)[kidx + cc];
...@@ -140,7 +134,7 @@ void deconv_get_sub_filter(char** data_in, int height, int width, ...@@ -140,7 +134,7 @@ void deconv_get_sub_filter(char** data_in, int height, int width,
void deconv_NC_convert(float** filter_in, int kernel_num, int channels, void deconv_NC_convert(float** filter_in, int kernel_num, int channels,
int hw) { int hw) {
float* tmp = *filter_in; float* tmp = *filter_in;
float* ptr_filter = (float*)(paddle_mobile::fpga::fpga_malloc( float* ptr_filter = reinterpret_cast<float*>(paddle_mobile::fpga::fpga_malloc(
hw * kernel_num * channels * sizeof(float))); hw * kernel_num * channels * sizeof(float)));
for (int c = 0; c < channels; ++c) { for (int c = 0; c < channels; ++c) {
...@@ -188,7 +182,8 @@ void deconv_format_filter(float** data_in, int num, int channel, int height, ...@@ -188,7 +182,8 @@ void deconv_format_filter(float** data_in, int num, int channel, int height,
result2); result2);
}*/ }*/
deconv_get_sub_filter(quantize_data, height, width, stride, num, channel); deconv_get_sub_filter<char>(quantize_data, height, width, stride, num,
channel);
/*{ /*{
char result2 = (char)0; char result2 = (char)0;
string filename = "sub_filter_filter_data"; string filename = "sub_filter_filter_data";
...@@ -212,10 +207,12 @@ void deconv_format_filter(float** data_in, int num, int channel, int height, ...@@ -212,10 +207,12 @@ void deconv_format_filter(float** data_in, int num, int channel, int height,
((residual == 0) ? div_num : (div_num - 1)) + ((residual == 0) ? div_num : (div_num - 1)) +
align_to_x(residual, FILTER_NUM_ALIGNMENT); align_to_x(residual, FILTER_NUM_ALIGNMENT);
char** ptr_ptr_data = (char**)fpga_malloc(sub_conv_n * sizeof(char*)); char** ptr_ptr_data =
reinterpret_cast<char**>(fpga_malloc(sub_conv_n * sizeof(char*)));
int origin_offset = sub_chw * sub_num; int origin_offset = sub_chw * sub_num;
for (int i = 0; i < sub_conv_n; ++i) { for (int i = 0; i < sub_conv_n; ++i) {
(ptr_ptr_data)[i] = (char*)fpga_malloc(origin_offset * sizeof(char)); (ptr_ptr_data)[i] =
reinterpret_cast<char*>(fpga_malloc(origin_offset * sizeof(char)));
fpga_copy((ptr_ptr_data)[i], (*quantize_data) + origin_offset * i, fpga_copy((ptr_ptr_data)[i], (*quantize_data) + origin_offset * i,
origin_offset * sizeof(char)); origin_offset * sizeof(char));
...@@ -233,8 +230,8 @@ void deconv_format_filter(float** data_in, int num, int channel, int height, ...@@ -233,8 +230,8 @@ void deconv_format_filter(float** data_in, int num, int channel, int height,
int align_offset = int align_offset =
align_to_x(sub_chw, FILTER_ELEMENT_ALIGNMENT) * num_after_alignment; align_to_x(sub_chw, FILTER_ELEMENT_ALIGNMENT) * num_after_alignment;
char* ptr_space = (char*)fpga_malloc(sub_conv_n * align_offset * char* ptr_space = reinterpret_cast<char*>(fpga_malloc(
sizeof(char)); // continuous space sub_conv_n * align_offset * sizeof(char))); // continuous space
for (int i = 0; i < sub_conv_n; ++i) { for (int i = 0; i < sub_conv_n; ++i) {
char* ptr_tmp = (ptr_ptr_data)[i]; char* ptr_tmp = (ptr_ptr_data)[i];
...@@ -251,7 +248,7 @@ void deconv_format_filter(float** data_in, int num, int channel, int height, ...@@ -251,7 +248,7 @@ void deconv_format_filter(float** data_in, int num, int channel, int height,
fpga_copy(ptr_space + i * align_offset, ptr_tmp, align_offset); fpga_copy(ptr_space + i * align_offset, ptr_tmp, align_offset);
fpga_free(ptr_tmp); fpga_free(ptr_tmp);
} }
*data_in = (float*)ptr_space; *data_in = reinterpret_cast<float*>(ptr_space);
/* { /* {
char result2 = (char)0; char result2 = (char)0;
...@@ -262,6 +259,22 @@ void deconv_format_filter(float** data_in, int num, int channel, int height, ...@@ -262,6 +259,22 @@ void deconv_format_filter(float** data_in, int num, int channel, int height,
fpga_flush(ptr_space, sub_conv_n * align_offset * sizeof(char)); fpga_flush(ptr_space, sub_conv_n * align_offset * sizeof(char));
} }
// Runs the filter formatting pipeline for a depthwise deconvolution on the
// buffer behind `data_in`. The steps run strictly in this order, each one
// operating on the buffer left by the previous one:
//   1. deconv_inverse_filter
//   2. filter::quantize_to_fp16 (receives scale_ptr)
//   3. filter::convert_to_hwn
//   4. deconv_get_sub_filter<int16_t> (split into `stride` sub-filters)
//   5. filter::align_element_n
//   6. fpga_flush of the resulting buffer
void DWDconv_format_filter(float** data_in, int num, int channel, int height,
                           int width, float* scale_ptr, int stride) {
  deconv_inverse_filter(data_in, num, channel, width, height);
  filter::quantize_to_fp16(data_in, channel, height, width, scale_ptr);

  // After quantisation the same buffer is handled as 16-bit elements.
  auto fp16_data = reinterpret_cast<int16_t**>(data_in);
  filter::convert_to_hwn(fp16_data, channel, height, width);
  deconv_get_sub_filter<int16_t>(fp16_data, height, width, stride, num,
                                 channel);
  filter::align_element_n(fp16_data, channel, height, width);

  const auto flush_bytes = align_to_x(channel, FILTER_ELEMENT_ALIGNMENT) *
                           height * width * sizeof(int16_t);
  fpga_flush(*fp16_data, flush_bytes);
}
} // namespace deconv_filter } // namespace deconv_filter
} // namespace fpga } // namespace fpga
} // namespace paddle_mobile } // namespace paddle_mobile
...@@ -24,11 +24,15 @@ int deconv_calc_sub_pad(int filter_axis, int pad, int stride); ...@@ -24,11 +24,15 @@ int deconv_calc_sub_pad(int filter_axis, int pad, int stride);
int deconv_get_sub_filter_axis(int filter_axis, int stride); int deconv_get_sub_filter_axis(int filter_axis, int stride);
int deconv_get_sub_out_axis(int image_axis, int sub_pad, int sub_filter_axis); int deconv_get_sub_out_axis(int image_axis, int sub_pad, int sub_filter_axis);
int deconv_get_omit(int stride, int filter_width, int pad); int deconv_get_omit(int stride, int filter_width, int pad);
void deconv_get_sub_filter(char** data_in, int height, int width,
int sub_conv_n, int kernel_num, int channel); template <typename T>
void deconv_get_sub_filter(T** data_in, int height, int width, int sub_conv_n,
int kernel_num, int channel);
void deconv_format_filter(float** data_in, int num, int channel, int height, void deconv_format_filter(float** data_in, int num, int channel, int height,
int width, int group_num, float max, int stride); int width, int group_num, float max, int stride);
void deconv_NC_convert(float** filter_in, int kernel_num, int channels, int hw); void deconv_NC_convert(float** filter_in, int kernel_num, int channels, int hw);
// Formats a depthwise-deconvolution filter in place behind `data_in`:
// inverse, fp16 quantisation (using scale_ptr), HWN conversion, split into
// `stride` sub-filters, element alignment, then cache flush.
void DWDconv_format_filter(float** data_in, int num, int channel, int height,
int width, float* scale_ptr, int stride);
} // namespace deconv_filter } // namespace deconv_filter
} // namespace fpga } // namespace fpga
......
...@@ -346,6 +346,16 @@ void format_dwconv_filter(float **data_in, int num, int height, int width, ...@@ -346,6 +346,16 @@ void format_dwconv_filter(float **data_in, int num, int height, int width,
fpga_flush(*quantize_data, align_to_x(num, FILTER_ELEMENT_ALIGNMENT) * fpga_flush(*quantize_data, align_to_x(num, FILTER_ELEMENT_ALIGNMENT) *
height * width * sizeof(int16_t)); height * width * sizeof(int16_t));
} }
// Formats a filter buffer behind `data_in`: fp16 quantisation (using
// scale_ptr), conversion to HWN order, element alignment, and finally a
// flush of align_to_x(num, FILTER_ELEMENT_ALIGNMENT) * height * width
// 16-bit elements.
void format_DWDeconv_filter(float **data_in, int num, int height, int width,
                            float *scale_ptr) {
  quantize_to_fp16(data_in, num, height, width, scale_ptr);

  // From here on the buffer is accessed as 16-bit elements.
  auto fp16_data = reinterpret_cast<int16_t **>(data_in);
  convert_to_hwn(fp16_data, num, height, width);
  align_element_n(fp16_data, num, height, width);

  const auto flush_bytes = align_to_x(num, FILTER_ELEMENT_ALIGNMENT) * height *
                           width * sizeof(int16_t);
  fpga_flush(*fp16_data, flush_bytes);
}
} // namespace filter } // namespace filter
} // namespace fpga } // namespace fpga
} // namespace paddle_mobile } // namespace paddle_mobile
...@@ -19,6 +19,7 @@ limitations under the License. */ ...@@ -19,6 +19,7 @@ limitations under the License. */
#include "fpga/common/config.h" #include "fpga/common/config.h"
#include "fpga/common/driver.h" #include "fpga/common/driver.h"
#ifdef COST_TIME_PRINT #ifdef COST_TIME_PRINT
#include <sys/time.h> #include <sys/time.h>
#include <time.h> #include <time.h>
...@@ -163,6 +164,7 @@ using namespace std; // NOLINT ...@@ -163,6 +164,7 @@ using namespace std; // NOLINT
#define REG_DWCONV_FILTER_BASE_ADDR 0xe08 #define REG_DWCONV_FILTER_BASE_ADDR 0xe08
#define REG_DWCONV_FILTER_SHAPE 0xe10 #define REG_DWCONV_FILTER_SHAPE 0xe10
#define REG_DWCONV_FILTER_N_ALIGN 0xe18 #define REG_DWCONV_FILTER_N_ALIGN 0xe18
#define REG_DWCONV_FILTER_SUBNUMBER 0xe20
#define REG_DWCONV_CMD 0xe00 #define REG_DWCONV_CMD 0xe00
int ComputeFpgaConv(const struct SplitConvArgs &args) { int ComputeFpgaConv(const struct SplitConvArgs &args) {
...@@ -591,6 +593,20 @@ int PerformBypass(const struct BypassArgs &args) { ...@@ -591,6 +593,20 @@ int PerformBypass(const struct BypassArgs &args) {
return 0; return 0;
} // PerformBypass } // PerformBypass
// Returns the value of REG_HARDWARE_STATUS, read while holding the PE mutex,
// when built with PADDLE_MOBILE_ZU5; returns 0 in every other build.
uint64_t FPGAVersion() {
#ifdef FPGA_PRINT_MODE
  // The banner previously carried the name of a different function.
  DLOG << "=============FPGAVersion===========";
#endif
#ifdef PADDLE_MOBILE_ZU5
  uint64_t fpga_ver = 0;
  pthread_mutex_lock(&g_fpgainfo.pe_data->mutex);
  fpga_ver = reg_readq(REG_HARDWARE_STATUS);
  pthread_mutex_unlock(&g_fpgainfo.pe_data->mutex);
  return fpga_ver;
#else
  return 0;
#endif
}  // FPGAVersion
int ComputeFPGAConcat(const struct ConcatArgs &args) { int ComputeFPGAConcat(const struct ConcatArgs &args) {
#ifdef FPGA_PRINT_MODE #ifdef FPGA_PRINT_MODE
DLOG << "=============ComputeFpgaConcat==========="; DLOG << "=============ComputeFpgaConcat===========";
...@@ -655,6 +671,45 @@ void deconv_post_process(const struct DeconvArgs &args) { ...@@ -655,6 +671,45 @@ void deconv_post_process(const struct DeconvArgs &args) {
fpga_flush(args.output.address, fpga_flush(args.output.address,
num * align_deconv_row_len * deconv_h * sizeof(int16_t)); num * align_deconv_row_len * deconv_h * sizeof(int16_t));
} }
// Assembles the final deconvolution output from the intermediate buffers of
// the sub-convolutions.
//
// Output row hh is taken from sub-convolution (sub_conv_n - hh % sub_conv_n
// - 1), row hh / sub_conv_n. The first and last `omit_size` rows are skipped
// and `omit_size * channel` elements are skipped at the start of every copied
// row. Rows are written to args.output.address with a pitch of
// align_deconv_row_len 16-bit elements, and the result is flushed.
void DWDeconv_post_process(const struct DWDeconvArgs &args) {
  const int sub_conv_n = args.sub_conv_num;
  const int sub_height = args.sub_output_height;
  const int sub_width = args.sub_output_width;
  const int omit_size = args.omit_size;
  const int channel = args.filter_num;
  const int num = 1;  // batch size is fixed to 1 here

  const int origin_h = sub_height * sub_conv_n;
  const int origin_w = sub_width * sub_conv_n;
  const int align_origin_w = align_to_x(origin_w * channel, IMAGE_ALIGNMENT);
  const int deconv_h = origin_h - 2 * omit_size;
  const int deconv_w = origin_w - 2 * omit_size;
  const int deconv_row_len = deconv_w * channel;
  const int align_deconv_row_len = align_to_x(deconv_row_len, IMAGE_ALIGNMENT);

  // Each intermediate buffer holds sub_height rows of align_origin_w
  // elements (only row indices hh / sub_conv_n < sub_height are read below),
  // so only that many rows are invalidated. Using origin_h rows here would
  // cover sub_conv_n times the range that is actually read.
  for (int idx = 0; idx < sub_conv_n; ++idx) {
    paddle_mobile::fpga::fpga_invalidate(
        args.dw_conv_args[idx]->output.address,
        align_origin_w * sub_height * sizeof(int16_t));
  }

  auto *dst = (int16_t *)(args.output.address);  // NOLINT
  int deconv_idx = 0;
  for (int nn = 0; nn < num; ++nn) {
    for (int hh = 0; hh < origin_h; ++hh) {
      // Rows inside the omitted border never reach the output.
      if ((hh < omit_size) || (hh >= (origin_h - omit_size))) continue;

      const int hx = hh % sub_conv_n;
      const int hi = hh / sub_conv_n;
      auto *sub_t = (int16_t *)(args.dw_conv_args[sub_conv_n - hx - 1]  // NOLINT
                                    ->output.address);
      const int sidx = (nn * origin_h * align_origin_w + hi * align_origin_w +
                        omit_size * channel);
      fpga_copy(dst + deconv_idx, sub_t + sidx,
                sizeof(int16_t) * deconv_row_len);
      deconv_idx += align_deconv_row_len;
    }
  }
  fpga_flush(args.output.address,
             num * align_deconv_row_len * deconv_h * sizeof(int16_t));
}
int ComputeFpgaDeconv(const struct DeconvArgs &args) { int ComputeFpgaDeconv(const struct DeconvArgs &args) {
#ifdef FPGA_PRINT_MODE #ifdef FPGA_PRINT_MODE
...@@ -792,17 +847,21 @@ int ComputeDWConv(const struct DWconvArgs &args) { ...@@ -792,17 +847,21 @@ int ComputeDWConv(const struct DWconvArgs &args) {
align_to_x((uint64_t)args.image.channels, IMAGE_ALIGNMENT); align_to_x((uint64_t)args.image.channels, IMAGE_ALIGNMENT);
uint64_t filter_amount_per_row_align = uint64_t filter_amount_per_row_align =
filter_N_align * (uint64_t)args.kernel.width; filter_N_align * (uint64_t)args.kernel.width;
uint64_t filter_amount_align = filter_N_align * (uint64_t)args.kernel.width * uint64_t sub_filter_amount_align = filter_N_align *
(uint64_t)args.kernel.height; (uint64_t)args.kernel.width *
(uint64_t)args.kernel.height;
uint64_t filter_amount_align =
sub_filter_amount_align * (uint64_t)args.sub_conv_num;
uint32_t output_height = (uint32_t)( uint32_t output_height = (uint32_t)(
(args.image.height + args.image.pad_height * 2 - args.kernel.height) / (args.image.height + args.image.pad_height * 2 - args.kernel.height) /
args.kernel.stride_h + args.kernel.stride_h +
1); 1);
uint32_t output_width = (uint32_t)( uint32_t output_width = (uint32_t)(
(args.image.width + args.image.pad_width * 2 - args.kernel.width) / ((args.image.width + args.image.pad_width * 2 - args.kernel.width) /
args.kernel.stride_w + args.kernel.stride_w +
1); 1) *
args.sub_conv_num);
uint64_t image_amount_per_row = uint64_t image_amount_per_row =
align_to_x((uint64_t)args.image.width * (uint64_t)args.image.channels, align_to_x((uint64_t)args.image.width * (uint64_t)args.image.channels,
...@@ -845,12 +904,15 @@ int ComputeDWConv(const struct DWconvArgs &args) { ...@@ -845,12 +904,15 @@ int ComputeDWConv(const struct DWconvArgs &args) {
/*restart scale*/ /*restart scale*/
reg_writeq(output_scale, REG_SCALE_PARAMETER); reg_writeq(output_scale, REG_SCALE_PARAMETER);
reg_writeq(image_physical_address, REG_POOLING_IMAGE_BASE_ADDR); reg_writeq(image_physical_address, REG_POOLING_IMAGE_BASE_ADDR);
reg_writeq(output_physical_address, REG_POOLING_RESULT_BASE_ADDR); reg_writeq(output_physical_address, REG_POOLING_RESULT_BASE_ADDR);
reg_writeq((bias_physical_address << 32 | filter_physical_address), reg_writeq((bias_physical_address << 32 | filter_physical_address),
REG_DWCONV_FILTER_BASE_ADDR); REG_DWCONV_FILTER_BASE_ADDR);
reg_writeq(filter_amount_per_row_align | (filter_amount_align << 32), reg_writeq(filter_amount_per_row_align | (filter_amount_align << 32),
REG_DWCONV_FILTER_SHAPE); REG_DWCONV_FILTER_SHAPE);
reg_writeq(sub_filter_amount_align | (((uint64_t)args.sub_conv_num) << 32),
REG_DWCONV_FILTER_SUBNUMBER);
reg_writeq(filter_N_align, REG_DWCONV_FILTER_N_ALIGN); reg_writeq(filter_N_align, REG_DWCONV_FILTER_N_ALIGN);
reg_writeq( reg_writeq(
...@@ -904,10 +966,89 @@ int ComputeDWConv(const struct DWconvArgs &args) { ...@@ -904,10 +966,89 @@ int ComputeDWConv(const struct DWconvArgs &args) {
output_scale = reg_readq(REG_SCALE_PARAMETER); output_scale = reg_readq(REG_SCALE_PARAMETER);
output_scale = (output_scale << 32) | (output_scale >> 32); output_scale = (output_scale << 32) | (output_scale >> 32);
fpga_copy(args.output.scale_address, &output_scale, sizeof(float) * 2); fpga_copy(args.output.scale_address, &output_scale, sizeof(float) * 2);
DLOG << "output_scale:" << output_scale;
pthread_mutex_unlock(&g_fpgainfo.pe_data->mutex); pthread_mutex_unlock(&g_fpgainfo.pe_data->mutex);
return ret; return ret;
#endif #endif
return 0; return 0;
} }
// Executes a depthwise deconvolution described by `args`:
//   1. runs ComputeDWConv for every entry of args.dw_conv_args,
//   2. copies the scale pair of the sub-convolution with the largest
//      scale[0] into args.output.scale_address,
//   3. calls DWDeconv_post_process to assemble the final output.
// Always returns 0; the return values of ComputeDWConv are not inspected.
// With COST_TIME_PRINT defined, the duration of each stage is printed.
int ComputeDWDeconv(const struct DWDeconvArgs &args) {
#ifdef FPGA_PRINT_MODE
  DLOG << "=============ComputeDWDeconv===========";
  DLOG << " filter_num:" << args.filter_num
       << " group_num:" << args.group_num << "omit_size:" << args.omit_size
       << "sub_output_width: " << args.sub_output_width
       << "sub_output_height: " << args.sub_output_height
       << " sub_conv_num:" << args.sub_conv_num;
  DLOG << "args.output.address: " << args.output.address
       << "args.output.scale_address: " << args.output.scale_address;
#endif
  int sub_conv_num = args.sub_conv_num;

#ifdef COST_TIME_PRINT
  timeval start, end;
  long dif_sec, dif_usec;  // NOLINT
#endif
  for (int i = 0; i < sub_conv_num; i++) {
#ifdef COST_TIME_PRINT
    gettimeofday(&start, NULL);
#endif
    ComputeDWConv(*args.dw_conv_args[i]);
#ifdef COST_TIME_PRINT
    gettimeofday(&end, NULL);
    dif_sec = end.tv_sec - start.tv_sec;
    dif_usec = end.tv_usec - start.tv_usec;
    std::cout << "deconv basic_conv: " << i << " times:  "
              << "    cost time: " << (dif_sec * 1000000 + dif_usec) << "us"
              << std::endl;
#endif
  }

  // Every sub-convolution reports into its own scale buffer, so the output
  // scale has to be filled in here even when there is only one of them.
  if (sub_conv_num >= 1) {
    float max_scale = -1.0f;
#ifdef COST_TIME_PRINT
    gettimeofday(&start, NULL);
#endif
    for (int i = 0; i < sub_conv_num; i++) {
      paddle_mobile::fpga::fpga_invalidate(
          args.dw_conv_args[i]->output.scale_address, 2 * sizeof(float));
      float ptr_scale = (args.dw_conv_args[i]->output.scale_address)[0];
      if (ptr_scale > max_scale) {
        // Remember the running maximum; without this update every positive
        // scale would overwrite the output and the last one would win.
        max_scale = ptr_scale;
        args.output.scale_address[0] = ptr_scale;
        args.output.scale_address[1] =
            (args.dw_conv_args[i]->output.scale_address)[1];
      }
    }
#ifdef COST_TIME_PRINT
    gettimeofday(&end, NULL);
    dif_sec = end.tv_sec - start.tv_sec;
    dif_usec = end.tv_usec - start.tv_usec;
    std::cout << "deconv scale "
              << "    cost time: " << (dif_sec * 1000000 + dif_usec) << "us"
              << std::endl;
#endif
  }

#ifdef COST_TIME_PRINT
  gettimeofday(&start, NULL);
#endif
  DWDeconv_post_process(args);
#ifdef COST_TIME_PRINT
  gettimeofday(&end, NULL);
  dif_sec = end.tv_sec - start.tv_sec;
  dif_usec = end.tv_usec - start.tv_usec;
  std::cout << "deconv_post_process "
            << "    cost time: " << (dif_sec * 1000000 + dif_usec) << "us"
            << std::endl;
#endif
// NOTE(review): the #endif below has no matching #if/#ifdef inside this
// function. Other functions in this file wrap their hardware code in
// `#ifdef PADDLE_MOBILE_ZU5 ... #endif` before the fallback `return 0;` --
// confirm whether that opening guard is missing here.
#endif
  return 0;
}  // ComputeDWDeconv
} // namespace fpga } // namespace fpga
} // namespace paddle_mobile } // namespace paddle_mobile
...@@ -229,6 +229,7 @@ struct DeconvArgs { ...@@ -229,6 +229,7 @@ struct DeconvArgs {
std::vector<std::shared_ptr<SplitConvArgs>> split_conv_args; std::vector<std::shared_ptr<SplitConvArgs>> split_conv_args;
}; };
struct DWconvArgs { struct DWconvArgs {
uint32_t sub_conv_num;
bool relu_enabled; bool relu_enabled;
void* bias_address; void* bias_address;
void* filter_address; void* filter_address;
...@@ -236,6 +237,19 @@ struct DWconvArgs { ...@@ -236,6 +237,19 @@ struct DWconvArgs {
struct ImageInputArgs image; struct ImageInputArgs image;
struct ImageOutputArgs output; struct ImageOutputArgs output;
}; };
// Arguments for a depthwise deconvolution that is executed as several
// depthwise sub-convolutions (filled by fill_DWDeconv_arg, consumed by
// ComputeDWDeconv). Callers aggregate-initialise this struct with `{0}`, so
// member order and the absence of default initialisers matter.
struct DWDeconvArgs {
// Number of sub-convolutions; fill_DWDeconv_arg sets it from stride_w.
uint32_t sub_conv_num;
// Set from filter->dims()[0] by fill_DWDeconv_arg.
uint32_t group_num;
// Set from filter->dims()[0] by fill_DWDeconv_arg; used as the channel
// count during post-processing.
uint32_t filter_num;
// Rows/columns (and omit_size * channel elements per row) dropped at each
// border of the assembled output.
uint32_t omit_size;
// Output width / height of a single sub-convolution.
uint32_t sub_output_width;
uint32_t sub_output_height;
// Final output buffer and its scale pair.
struct ImageOutputArgs output;
// One entry per sub-convolution.
std::vector<std::shared_ptr<DWconvArgs>> dw_conv_args;
// Owns the buffers allocated for the sub-convolutions; fill_DWDeconv_arg
// registers each with a deleter that calls fpga_free.
std::vector<std::shared_ptr<char>> vector_dw_conv_space;
};
// static inline int align_to_x(int num, int x) { return (num + x - 1) / x * x; // static inline int align_to_x(int num, int x) { return (num + x - 1) / x * x;
// } // }
static inline uint32_t align_to_x(int64_t num, int64_t x) { static inline uint32_t align_to_x(int64_t num, int64_t x) {
......
...@@ -18,6 +18,7 @@ limitations under the License. */ ...@@ -18,6 +18,7 @@ limitations under the License. */
namespace paddle_mobile { namespace paddle_mobile {
namespace fpga { namespace fpga {
// Returns the value read from REG_HARDWARE_STATUS when built with
// PADDLE_MOBILE_ZU5, and 0 otherwise.
uint64_t FPGAVersion();
int PerformBypass(const struct BypassArgs& args); int PerformBypass(const struct BypassArgs& args);
int ComputeBasicConv(const struct ConvArgs& args); int ComputeBasicConv(const struct ConvArgs& args);
int ComputeFpgaPool(const struct PoolingArgs& args); int ComputeFpgaPool(const struct PoolingArgs& args);
...@@ -28,5 +29,7 @@ int ComputeFPGAConcat(const struct ConcatArgs& args); ...@@ -28,5 +29,7 @@ int ComputeFPGAConcat(const struct ConcatArgs& args);
int ComputeFPGASplit(const struct SplitArgs& args); int ComputeFPGASplit(const struct SplitArgs& args);
int ComputeFpgaDeconv(const struct DeconvArgs& args); int ComputeFpgaDeconv(const struct DeconvArgs& args);
int ComputeDWConv(const struct DWconvArgs& args); int ComputeDWConv(const struct DWconvArgs& args);
// Runs every sub-convolution in args.dw_conv_args, updates the output scale
// from the sub-convolution scales and assembles the final output via
// DWDeconv_post_process. Returns 0.
int ComputeDWDeconv(const struct DWDeconvArgs& args);
} // namespace fpga } // namespace fpga
} // namespace paddle_mobile } // namespace paddle_mobile
...@@ -49,13 +49,23 @@ bool DeconvAddKernel<FPGA, float>::Init(FusionDeconvAddParam<FPGA> *param) { ...@@ -49,13 +49,23 @@ bool DeconvAddKernel<FPGA, float>::Init(FusionDeconvAddParam<FPGA> *param) {
"filter width should be equal to filter height "); "filter width should be equal to filter height ");
PADDLE_MOBILE_ENFORCE(((filter->dims()[2] % param->Strides()[0]) == 0), PADDLE_MOBILE_ENFORCE(((filter->dims()[2] % param->Strides()[0]) == 0),
"filter axis should be the multiple of stride axis "); "filter axis should be the multiple of stride axis ");
fpga::format_deconv_data(filter, out, &bs_ptr, param->Groups(), sub_conv_n); if (param->Groups() == channel) {
fpga::DeconvArgs deconv_arg = {0}; fpga::format_DWDeconv_data(filter, out, &bs_ptr, param->Groups(),
fpga::fill_deconv_arg(&deconv_arg, input, out, filter, relu_enabled, sub_conv_n);
param->Groups(), param->Strides()[0], fpga::DWDeconvArgs DWDeconv_arg = {0};
param->Strides()[1], param->Paddings()[0], fpga::fill_DWDeconv_arg(&DWDeconv_arg, input, out, filter, relu_enabled,
param->Paddings()[1], bs_ptr); param->Strides()[0], param->Strides()[1],
param->SetFpgaArgs(deconv_arg); param->Paddings()[0], param->Paddings()[1], bs_ptr);
param->SetFpgaArgs(DWDeconv_arg);
} else {
fpga::format_deconv_data(filter, out, &bs_ptr, param->Groups(), sub_conv_n);
fpga::DeconvArgs deconv_arg = {0};
fpga::fill_deconv_arg(&deconv_arg, input, out, filter, relu_enabled,
param->Groups(), param->Strides()[0],
param->Strides()[1], param->Paddings()[0],
param->Paddings()[1], bs_ptr);
param->SetFpgaArgs(deconv_arg);
}
return true; return true;
} }
...@@ -63,7 +73,11 @@ bool DeconvAddKernel<FPGA, float>::Init(FusionDeconvAddParam<FPGA> *param) { ...@@ -63,7 +73,11 @@ bool DeconvAddKernel<FPGA, float>::Init(FusionDeconvAddParam<FPGA> *param) {
template <> template <>
void DeconvAddKernel<FPGA, float>::Compute( void DeconvAddKernel<FPGA, float>::Compute(
const FusionDeconvAddParam<FPGA> &param) { const FusionDeconvAddParam<FPGA> &param) {
fpga::ComputeFpgaDeconv(param.FpgaArgs()); if (param.Groups() == param.Output()->dims()[1]) {
fpga::ComputeDWDeconv(param.FpgaDWDconvArgs());
} else {
fpga::ComputeFpgaDeconv(param.FpgaArgs());
}
} }
} // namespace operators } // namespace operators
......
...@@ -50,20 +50,35 @@ bool DeconvAddReluKernel<FPGA, float>::Init( ...@@ -50,20 +50,35 @@ bool DeconvAddReluKernel<FPGA, float>::Init(
"filter width should be equal to filter height "); "filter width should be equal to filter height ");
PADDLE_MOBILE_ENFORCE(((filter->dims()[2] % param->Strides()[0]) == 0), PADDLE_MOBILE_ENFORCE(((filter->dims()[2] % param->Strides()[0]) == 0),
"filter axis should be the multiple of stride axis "); "filter axis should be the multiple of stride axis ");
fpga::format_deconv_data(filter, out, &bs_ptr, param->Groups(), sub_conv_n); if (param->Groups() == channel) {
fpga::DeconvArgs deconv_arg = {0}; fpga::format_DWDeconv_data(filter, out, &bs_ptr, param->Groups(),
fpga::fill_deconv_arg(&deconv_arg, input, out, filter, relu_enabled, sub_conv_n);
param->Groups(), param->Strides()[0], fpga::DWDeconvArgs DWDeconv_arg = {0};
param->Strides()[1], param->Paddings()[0], fpga::fill_DWDeconv_arg(&DWDeconv_arg, input, out, filter, relu_enabled,
param->Paddings()[1], bs_ptr); param->Strides()[0], param->Strides()[1],
param->SetFpgaArgs(deconv_arg); param->Paddings()[0], param->Paddings()[1], bs_ptr);
param->SetFpgaArgs(DWDeconv_arg);
} else {
fpga::format_deconv_data(filter, out, &bs_ptr, param->Groups(), sub_conv_n);
fpga::DeconvArgs deconv_arg = {0};
fpga::fill_deconv_arg(&deconv_arg, input, out, filter, relu_enabled,
param->Groups(), param->Strides()[0],
param->Strides()[1], param->Paddings()[0],
param->Paddings()[1], bs_ptr);
param->SetFpgaArgs(deconv_arg);
}
return true; return true;
} }
template <> template <>
void DeconvAddReluKernel<FPGA, float>::Compute( void DeconvAddReluKernel<FPGA, float>::Compute(
const FusionDeconvAddReluParam<FPGA> &param) { const FusionDeconvAddReluParam<FPGA> &param) {
fpga::ComputeFpgaDeconv(param.FpgaArgs()); // fpga::ComputeFpgaDeconv(param.FpgaArgs());
if (param.Groups() == param.Output()->dims()[1]) {
fpga::ComputeDWDeconv(param.FpgaDWDconvArgs());
} else {
fpga::ComputeFpgaDeconv(param.FpgaArgs());
}
} }
} // namespace operators } // namespace operators
......
...@@ -22,7 +22,7 @@ namespace operators { ...@@ -22,7 +22,7 @@ namespace operators {
template <> template <>
bool SoftmaxKernel<FPGA, float>::Init(SoftmaxParam<FPGA> *param) { bool SoftmaxKernel<FPGA, float>::Init(SoftmaxParam<FPGA> *param) {
auto input = const_cast<Tensor *>(param->InputX()); auto input = const_cast<LoDTensor *>(param->InputX());
auto input_ptr = input->data<float>(); auto input_ptr = input->data<float>();
auto out = param->Out(); auto out = param->Out();
fpga::format_fp32_ofm(out); fpga::format_fp32_ofm(out);
......
...@@ -2357,10 +2357,17 @@ class ConvTransposeParam : public OpParam { ...@@ -2357,10 +2357,17 @@ class ConvTransposeParam : public OpParam {
private: private:
fpga::DeconvArgs fpga_conv_args; fpga::DeconvArgs fpga_conv_args;
fpga::DWDeconvArgs fpga_DWDeconv_args;
public: public:
const fpga::DeconvArgs &FpgaArgs() const { return fpga_conv_args; } const fpga::DeconvArgs &FpgaArgs() const { return fpga_conv_args; }
const fpga::DWDeconvArgs &FpgaDWDconvArgs() const {
return fpga_DWDeconv_args;
}
void SetFpgaArgs(const fpga::DeconvArgs &args) { fpga_conv_args = args; } void SetFpgaArgs(const fpga::DeconvArgs &args) { fpga_conv_args = args; }
void SetFpgaArgs(const fpga::DWDeconvArgs &args) {
fpga_DWDeconv_args = args;
}
#endif #endif
}; };
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册