diff --git a/src/fpga/V1/api.cpp b/src/fpga/V1/api.cpp index 137ac73512b9d88716ab585ba315f26aa3b14ea8..a8540e72f3ad077ac4aa49e34b535675f04dcd16 100644 --- a/src/fpga/V1/api.cpp +++ b/src/fpga/V1/api.cpp @@ -151,6 +151,30 @@ void format_dwconv_filter(framework::Tensor *filter_tensor, float *scale_ptr) { filter_tensor->reset_data_ptr(new_data); } +void format_DWDconv_filter(framework::Tensor *filter_tensor, float *scale_ptr, + int stride) { + auto dims = filter_tensor->dims(); + auto num = dims[0], height = dims[2], width = dims[3]; + auto data_ptr = filter_tensor->data(); + size_t memory_size = num * height * width * sizeof(float); + auto new_data = (float *)fpga_malloc(memory_size); // NOLINT + fpga_copy(new_data, data_ptr, memory_size); + + int hw = height * width; + deconv_filter::deconv_NC_convert(&new_data, num, 1, hw); + + num = dims[1]; + int channel = dims[0]; + + deconv_filter::DWDconv_format_filter(&new_data, num, channel, height, width, + scale_ptr, stride); + + // framework::DDim dims_new = + // framework::make_ddim({num, 1, height, width}); + // filter_tensor->Resize(dims_new); + filter_tensor->reset_data_ptr(new_data); +} + void format_fc_filter(framework::Tensor *filter_tensor, float max_value) { filter_tensor->scale[0] = float(max_value / 127.0); // NOLINT filter_tensor->scale[1] = float(127.0 / max_value); // NOLINT @@ -243,6 +267,17 @@ void format_dwconv_data(framework::Tensor *filter_tensor, format_bias_array(bias_ptr, channel); format_fp16_ofm(ofm_tensor); } +void format_DWDeconv_data(framework::Tensor *filter_tensor, + framework::Tensor *ofm_tensor, float **bs_ptr, + int group, int sub_conv_n) { + int channel = ofm_tensor->dims()[1]; + // dw-deconv + format_DWDconv_filter( + filter_tensor, + (reinterpret_cast(*bs_ptr) + sub_conv_n * channel), sub_conv_n); + format_bias_array(bs_ptr, channel); + format_fp16_ofm(ofm_tensor); +} void expand_conv_arg(ConvArgs *arg) { ConvArgs args = *arg; @@ -770,6 +805,7 @@ void fill_dwconv_arg(struct DWconvArgs *arg, framework::Tensor *input, auto filter_ptr = filter->data(); auto input_ptr = input->data(); auto output_ptr = out->mutable_data(); + arg->sub_conv_num = 1; arg->relu_enabled = relu_enabled; arg->bias_address = bias_ptr; arg->filter_address = filter_ptr; @@ -788,5 +824,109 @@ void fill_dwconv_arg(struct DWconvArgs *arg, framework::Tensor *input, arg->output.scale_address = out->scale; } // end dwconv arg fill +void fill_DWDeconv_arg(struct DWDeconvArgs *arg, framework::Tensor *input, + framework::Tensor *out, framework::Tensor *filter, + bool relu_enabled, int stride_h, int stride_w, + int padding_h, int padding_w, float *bias_ptr) { + auto filter_ptr = filter->data(); + auto input_ptr = input->data(); + auto output_ptr = out->mutable_data(); + + auto deleter = [](void *p) { fpga_free(p); }; + + arg->group_num = (uint32_t)filter->dims()[0]; + arg->sub_conv_num = (uint32_t)stride_w; + arg->filter_num = (uint32_t)filter->dims()[0]; + + int sub_conv_num = stride_w; + + int sub_pad = + deconv_filter::deconv_calc_sub_pad((int)filter->dims()[3], // NOLINT + padding_w, stride_w); + auto sub_filter_width = (uint32_t)deconv_filter::deconv_get_sub_filter_axis( + (int)filter->dims()[3], stride_w); // NOLINT + + auto sub_output_width = (uint32_t)deconv_filter::deconv_get_sub_out_axis( + (int)input->dims()[3], sub_pad, sub_filter_width); // NOLINT + auto sub_output_height = (uint32_t)deconv_filter::deconv_get_sub_out_axis( + (int)input->dims()[2], sub_pad, sub_filter_width); // NOLINT + + arg->sub_output_width = (uint32_t)sub_output_width; + arg->sub_output_height = (uint32_t)sub_output_height; + arg->omit_size = (uint32_t)deconv_filter::deconv_get_omit( + stride_w, (int)filter->dims()[3], padding_w); // NOLINT + + auto sub_channels = (int)input->dims()[1]; // NOLINT + uint32_t omit_size = arg->omit_size; + int real_out_width = sub_output_width * sub_conv_num - 2 * omit_size; + int real_out_height = sub_output_height * sub_conv_num - 2 * omit_size; + int sub_filter_num = sub_conv_num * (arg->filter_num); + + framework::DDim dims_out_new = framework::make_ddim( + {1, arg->filter_num, real_out_height, real_out_width}); + fpga::format_fp16_ofm(out, dims_out_new); + auto out_ptr = out->data(); + + /*====For Addition + arg->output.address = + (half *)out_ptr + // NOLINT + omit_size * sizeof(half) * + (align_to_x(real_out_width * arg->filter_num, IMAGE_ALIGNMENT)); + */ + arg->output.address = out_ptr; + arg->output.scale_address = out->scale; + + int filter_offset = sub_filter_width * sub_filter_width * + align_to_x(sub_channels, FILTER_ELEMENT_ALIGNMENT) * + arg->sub_conv_num; + + for (int i = 0; i < sub_conv_num; ++i) { + arg->dw_conv_args.push_back(std::make_shared()); + + arg->dw_conv_args[i]->sub_conv_num = sub_conv_num; + arg->dw_conv_args[i]->relu_enabled = relu_enabled; + arg->dw_conv_args[i]->bias_address = bias_ptr; + + arg->dw_conv_args[i]->filter_address = + fpga_malloc(filter_offset * sizeof(int16_t)); + memcpy(arg->dw_conv_args[i]->filter_address, + (reinterpret_cast(filter_ptr) + i * filter_offset), + filter_offset * sizeof(int16_t)); + arg->vector_dw_conv_space.push_back(std::shared_ptr( + reinterpret_cast(arg->dw_conv_args[i]->filter_address), + deleter)); + + arg->dw_conv_args[i]->kernel.height = (uint32_t)sub_filter_width; + arg->dw_conv_args[i]->kernel.width = (uint32_t)sub_filter_width; + + arg->dw_conv_args[i]->kernel.stride_h = (uint32_t)1; + arg->dw_conv_args[i]->kernel.stride_w = (uint32_t)1; + arg->dw_conv_args[i]->image.address = input_ptr; + arg->dw_conv_args[i]->image.channels = (uint32_t)input->dims()[1]; + arg->dw_conv_args[i]->image.height = (uint32_t)input->dims()[2]; + arg->dw_conv_args[i]->image.width = (uint32_t)input->dims()[3]; + + arg->dw_conv_args[i]->image.pad_height = sub_pad; + arg->dw_conv_args[i]->image.pad_width = sub_pad; + arg->dw_conv_args[i]->image.scale_address = input->scale; + + arg->dw_conv_args[i]->output.address = + fpga_malloc(sub_output_height * + align_to_x(sub_output_width * sub_channels * sub_conv_num, + IMAGE_ALIGNMENT) * + sizeof(int16_t)); + arg->dw_conv_args[i]->output.scale_address = + static_cast(fpga_malloc(2 * sizeof(float))); + arg->vector_dw_conv_space.push_back(std::shared_ptr( + reinterpret_cast(arg->dw_conv_args[i]->output.address), + deleter)); + arg->vector_dw_conv_space.push_back(std::shared_ptr( + reinterpret_cast(arg->dw_conv_args[i]->output.scale_address), + deleter)); + } + + // arg->output.scale_address = out->scale; +} // end dwconv arg fill + } // namespace fpga } // namespace paddle_mobile diff --git a/src/fpga/V1/api.h b/src/fpga/V1/api.h index b5c586e92aca2cc8a540ba54479ae7941f42e02c..05d6a938c85f14770b97cd477580d0e6103fa777 100644 --- a/src/fpga/V1/api.h +++ b/src/fpga/V1/api.h @@ -57,6 +57,10 @@ void fill_dwconv_arg(struct DWconvArgs* arg, framework::Tensor* input, framework::Tensor* out, framework::Tensor* filter, bool relu_enabled, int stride_h, int stride_w, int padding_h, int padding_w, float* bias_ptr); +void fill_DWDeconv_arg(struct DWDeconvArgs* arg, framework::Tensor* input, + framework::Tensor* out, framework::Tensor* filter, + bool relu_enabled, int stride_h, int stride_w, + int padding_h, int padding_w, float* bs_ptr); void format_deconv_filter(framework::Tensor* filter_tensor, float max_value, int group_num, int stride); @@ -69,6 +73,10 @@ void format_deconv_data(framework::Tensor* filter_tensor, void format_dwconv_data(framework::Tensor* filter_tensor, framework::Tensor* ofm_tensor, float* scale_ptr, float** bias_ptr); +void format_DWDeconv_data(framework::Tensor* filter_tensor, + framework::Tensor* ofm_tensor, float** bs_ptr, + int group, int sub_conv_n); + template void savefile(std::string filename, void* buffer, int dataSize, Dtype tmp) { float data; diff --git a/src/fpga/V1/deconv_filter.cpp b/src/fpga/V1/deconv_filter.cpp index 8fb3cd69fdfb10effb5769b656e19858e481f5f4..4c484a45d0a36db4aac677377ae11b5235603ac6 100644 --- a/src/fpga/V1/deconv_filter.cpp +++ b/src/fpga/V1/deconv_filter.cpp @@ -21,15 +21,6 @@ limitations under the License. */ #include "fpga/V1/api.h" // #include "fpga_api.h" -// just for test -//#include -//#include "deconv.h" -//#include "deconv_api.h" -// using namespace std; -// using namespace paddle_mobile::fpga; -// using namespace baidu::fpga::deconv::api; -// namespace api = baidu::fpga::deconv::api; - namespace paddle_mobile { namespace fpga { namespace deconv_filter { @@ -42,7 +33,8 @@ void deconv_inverse_filter(float** data_in, int num, int channel, int width, float* tmp = *data_in; int data_size = num * channel * width * height; int hw_len = height * width; - auto tmp_data = (float*)fpga_malloc(data_size * sizeof(float)); + auto tmp_data = + reinterpret_cast(fpga_malloc(data_size * sizeof(float))); for (int i = 0; i < num; ++i) { for (int j = 0; j < channel; ++j) { for (int k = 0; k < hw_len; ++k) { @@ -97,9 +89,10 @@ int deconv_get_omit(int stride, int filter_width, int pad) { return (stride - idx); } -void deconv_get_sub_filter(char** data_in, int height, int width, - int sub_conv_n, int kernel_num, int channel) { - char* ptr_tmp = *data_in; +template +void deconv_get_sub_filter(T** data_in, int height, int width, int sub_conv_n, + int kernel_num, int channel) { + T* ptr_tmp = *data_in; int sub_num = kernel_num * sub_conv_n; int sub_h = height / sub_conv_n; int sub_w = width / sub_conv_n; @@ -107,7 +100,8 @@ void deconv_get_sub_filter(char** data_in, int height, int width, int sub_filter_size = kernel_num * sub_h * sub_w * channel * sub_conv_n * sub_conv_n; - char* ptr_sub_filter = (char*)fpga_malloc(sub_filter_size * sizeof(char)); + T* ptr_sub_filter = + reinterpret_cast(fpga_malloc(sub_filter_size * sizeof(T))); for (int idx = 0; idx < sub_conv_n; ++idx) { for (int nn = 0; nn < sub_num; ++nn) { int ni = nn % kernel_num; @@ -124,7 +118,7 @@ void deconv_get_sub_filter(char** data_in, int height, int width, fpga_copy( ptr_sub_filter + idx * sub_h * sub_w * channel * sub_num + sidx, - (*data_in) + kidx, channel * sizeof(char)); + (*data_in) + kidx, channel * sizeof(T)); // for (int cc =0; cc < channel; ++cc) { // ptr_sub_filter[idx*sub_h*sub_w*channel*sub_num + sidx + cc] = // (*data_in)[kidx + cc]; @@ -140,7 +134,7 @@ void deconv_get_sub_filter(char** data_in, int height, int width, void deconv_NC_convert(float** filter_in, int kernel_num, int channels, int hw) { float* tmp = *filter_in; - float* ptr_filter = (float*)(paddle_mobile::fpga::fpga_malloc( + float* ptr_filter = reinterpret_cast(paddle_mobile::fpga::fpga_malloc( hw * kernel_num * channels * sizeof(float))); for (int c = 0; c < channels; ++c) { @@ -188,7 +182,8 @@ void deconv_format_filter(float** data_in, int num, int channel, int height, result2); }*/ - deconv_get_sub_filter(quantize_data, height, width, stride, num, channel); + deconv_get_sub_filter(quantize_data, height, width, stride, num, + channel); /*{ char result2 = (char)0; string filename = "sub_filter_filter_data"; @@ -212,10 +207,12 @@ void deconv_format_filter(float** data_in, int num, int channel, int height, ((residual == 0) ? div_num : (div_num - 1)) + align_to_x(residual, FILTER_NUM_ALIGNMENT); - char** ptr_ptr_data = (char**)fpga_malloc(sub_conv_n * sizeof(char*)); + char** ptr_ptr_data = + reinterpret_cast(fpga_malloc(sub_conv_n * sizeof(char*))); int origin_offset = sub_chw * sub_num; for (int i = 0; i < sub_conv_n; ++i) { - (ptr_ptr_data)[i] = (char*)fpga_malloc(origin_offset * sizeof(char)); + (ptr_ptr_data)[i] = + reinterpret_cast(fpga_malloc(origin_offset * sizeof(char))); fpga_copy((ptr_ptr_data)[i], (*quantize_data) + origin_offset * i, origin_offset * sizeof(char)); @@ -233,8 +230,8 @@ void deconv_format_filter(float** data_in, int num, int channel, int height, int align_offset = align_to_x(sub_chw, FILTER_ELEMENT_ALIGNMENT) * num_after_alignment; - char* ptr_space = (char*)fpga_malloc(sub_conv_n * align_offset * - sizeof(char)); // continuous space + char* ptr_space = reinterpret_cast(fpga_malloc( + sub_conv_n * align_offset * sizeof(char))); // continuous space for (int i = 0; i < sub_conv_n; ++i) { char* ptr_tmp = (ptr_ptr_data)[i]; @@ -251,7 +248,7 @@ void deconv_format_filter(float** data_in, int num, int channel, int height, fpga_copy(ptr_space + i * align_offset, ptr_tmp, align_offset); fpga_free(ptr_tmp); } - *data_in = (float*)ptr_space; + *data_in = reinterpret_cast(ptr_space); /* { char result2 = (char)0; @@ -262,6 +259,22 @@ void deconv_format_filter(float** data_in, int num, int channel, int height, fpga_flush(ptr_space, sub_conv_n * align_offset * sizeof(char)); } +void DWDconv_format_filter(float** data_in, int num, int channel, int height, + int width, float* scale_ptr, int stride) { + deconv_inverse_filter(data_in, num, channel, width, height); + + filter::quantize_to_fp16(data_in, channel, height, width, scale_ptr); + int16_t** quantize_data = (int16_t**)data_in; // NOLINT + filter::convert_to_hwn(quantize_data, channel, height, width); + + deconv_get_sub_filter(quantize_data, height, width, stride, num, + channel); + + filter::align_element_n(quantize_data, channel, height, width); + fpga_flush(*quantize_data, align_to_x(channel, FILTER_ELEMENT_ALIGNMENT) * + height * width * sizeof(int16_t)); +} + } // namespace deconv_filter } // namespace fpga } // namespace paddle_mobile diff --git a/src/fpga/V1/deconv_filter.h b/src/fpga/V1/deconv_filter.h index 5fa9781933712a8506c052258dbf2f7f7e05fe37..f1a50b95c52dadc49f4dd333791a22f63bf6d0a3 100644 --- a/src/fpga/V1/deconv_filter.h +++ b/src/fpga/V1/deconv_filter.h @@ -24,11 +24,15 @@ int deconv_calc_sub_pad(int filter_axis, int pad, int stride); int deconv_get_sub_filter_axis(int filter_axis, int stride); int deconv_get_sub_out_axis(int image_axis, int sub_pad, int sub_filter_axis); int deconv_get_omit(int stride, int filter_width, int pad); -void deconv_get_sub_filter(char** data_in, int height, int width, - int sub_conv_n, int kernel_num, int channel); + +template +void deconv_get_sub_filter(T** data_in, int height, int width, int sub_conv_n, + int kernel_num, int channel); void deconv_format_filter(float** data_in, int num, int channel, int height, int width, int group_num, float max, int stride); void deconv_NC_convert(float** filter_in, int kernel_num, int channels, int hw); +void DWDconv_format_filter(float** data_in, int num, int channel, int height, + int width, float* scale_ptr, int stride); } // namespace deconv_filter } // namespace fpga diff --git a/src/fpga/V1/filter.cpp b/src/fpga/V1/filter.cpp old mode 100755 new mode 100644 index 197448d515d67459b280bf33a14b8f8419970fc2..50341b75e129479e7f8d8ab4d9c200df574996cb --- a/src/fpga/V1/filter.cpp +++ b/src/fpga/V1/filter.cpp @@ -346,6 +346,16 @@ void format_dwconv_filter(float **data_in, int num, int height, int width, fpga_flush(*quantize_data, align_to_x(num, FILTER_ELEMENT_ALIGNMENT) * height * width * sizeof(int16_t)); } + +void format_DWDeconv_filter(float **data_in, int num, int height, int width, + float *scale_ptr) { + quantize_to_fp16(data_in, num, height, width, scale_ptr); + int16_t **quantize_data = (int16_t **)data_in; // NOLINT + convert_to_hwn(quantize_data, num, height, width); + align_element_n(quantize_data, num, height, width); + fpga_flush(*quantize_data, align_to_x(num, FILTER_ELEMENT_ALIGNMENT) * + height * width * sizeof(int16_t)); +} } // namespace filter } // namespace fpga } // namespace paddle_mobile diff --git a/src/fpga/V1/pe.cpp b/src/fpga/V1/pe.cpp index aeb5cdd65385b87a5da1e15e98b9914ca6be189c..16d3bc793389f49ad0b6e3bf3b064a880e4a927a 100644 --- a/src/fpga/V1/pe.cpp +++ b/src/fpga/V1/pe.cpp @@ -18,7 +18,6 @@ limitations under the License. */ #include "fpga/V1/image.h" #include "fpga/common/config.h" #include "fpga/common/driver.h" - #ifdef COST_TIME_PRINT #include #include @@ -163,6 +162,7 @@ using namespace std; // NOLINT #define REG_DWCONV_FILTER_BASE_ADDR 0xe08 #define REG_DWCONV_FILTER_SHAPE 0xe10 #define REG_DWCONV_FILTER_N_ALIGN 0xe18 +#define REG_DWCONV_FILTER_SUBNUMBER 0xe20 #define REG_DWCONV_CMD 0xe00 int ComputeFpgaConv(const struct SplitConvArgs &args) { @@ -591,6 +591,20 @@ int PerformBypass(const struct BypassArgs &args) { return 0; } // PerformBypass +uint64_t FPGAVersion() { +#ifdef FPGA_PRINT_MODE + DLOG << "=============ComputeFpgaBypass==========="; +#endif +#ifdef PADDLE_MOBILE_ZU5 + uint64_t fpga_ver = 0; + pthread_mutex_lock(&g_fpgainfo.pe_data->mutex); + fpga_ver = reg_readq(REG_HARDWARE_STATUS); + pthread_mutex_unlock(&g_fpgainfo.pe_data->mutex); + return fpga_ver; +#endif + return 0; +} // FPGAVersion + int ComputeFPGAConcat(const struct ConcatArgs &args) { #ifdef FPGA_PRINT_MODE DLOG << "=============ComputeFpgaConcat==========="; @@ -655,6 +669,45 @@ void deconv_post_process(const struct DeconvArgs &args) { fpga_flush(args.output.address, num * align_deconv_row_len * deconv_h * sizeof(int16_t)); } +void DWDeconv_post_process(const struct DWDeconvArgs &args) { + int sub_conv_n = args.sub_conv_num; + int sub_height = args.sub_output_height; + int sub_width = args.sub_output_width; + int omit_size = args.omit_size; + int channel = args.filter_num; + int num = 1; + int origin_h = sub_height * sub_conv_n; + int origin_w = sub_width * sub_conv_n; + int align_origin_w = align_to_x(origin_w * channel, IMAGE_ALIGNMENT); + int deconv_h = origin_h - 2 * omit_size; + int deconv_w = origin_w - 2 * omit_size; + int deconv_row_len = deconv_w * channel; + int align_deconv_row_len = align_to_x(deconv_row_len, IMAGE_ALIGNMENT); + + for (int idx = 0; idx < sub_conv_n; ++idx) { + paddle_mobile::fpga::fpga_invalidate( + args.dw_conv_args[idx]->output.address, + align_origin_w * origin_h * sizeof(int16_t)); + } + + int deconv_idx = 0; + for (int nn = 0; nn < num; ++nn) { + for (int hh = 0; hh < origin_h; ++hh) { + int hx = (hh % sub_conv_n); + auto sub_t = (int16_t *)(args.dw_conv_args[sub_conv_n - hx - 1] // NOLINT + ->output.address); + int hi = (hh / sub_conv_n); + if ((hh < omit_size) || (hh >= (origin_h - omit_size))) continue; + int sidx = (nn * origin_h * align_origin_w + hi * align_origin_w + + omit_size * channel); + fpga_copy((int16_t *)(args.output.address) + deconv_idx, // NOLINT + sub_t + sidx, sizeof(int16_t) * deconv_row_len); // NOLINT + deconv_idx += align_deconv_row_len; + } + } + fpga_flush(args.output.address, + num * align_deconv_row_len * deconv_h * sizeof(int16_t)); +} int ComputeFpgaDeconv(const struct DeconvArgs &args) { #ifdef FPGA_PRINT_MODE @@ -792,17 +845,21 @@ int ComputeDWConv(const struct DWconvArgs &args) { align_to_x((uint64_t)args.image.channels, IMAGE_ALIGNMENT); uint64_t filter_amount_per_row_align = filter_N_align * (uint64_t)args.kernel.width; - uint64_t filter_amount_align = filter_N_align * (uint64_t)args.kernel.width * - (uint64_t)args.kernel.height; + uint64_t sub_filter_amount_align = filter_N_align * + (uint64_t)args.kernel.width * + (uint64_t)args.kernel.height; + uint64_t filter_amount_align = + sub_filter_amount_align * (uint64_t)args.sub_conv_num; uint32_t output_height = (uint32_t)( (args.image.height + args.image.pad_height * 2 - args.kernel.height) / args.kernel.stride_h + 1); uint32_t output_width = (uint32_t)( - (args.image.width + args.image.pad_width * 2 - args.kernel.width) / - args.kernel.stride_w + - 1); + ((args.image.width + args.image.pad_width * 2 - args.kernel.width) / + args.kernel.stride_w + + 1) * + args.sub_conv_num); uint64_t image_amount_per_row = align_to_x((uint64_t)args.image.width * (uint64_t)args.image.channels, @@ -845,12 +902,15 @@ int ComputeDWConv(const struct DWconvArgs &args) { /*restart scale*/ reg_writeq(output_scale, REG_SCALE_PARAMETER); + reg_writeq(image_physical_address, REG_POOLING_IMAGE_BASE_ADDR); reg_writeq(output_physical_address, REG_POOLING_RESULT_BASE_ADDR); reg_writeq((bias_physical_address << 32 | filter_physical_address), REG_DWCONV_FILTER_BASE_ADDR); reg_writeq(filter_amount_per_row_align | (filter_amount_align << 32), REG_DWCONV_FILTER_SHAPE); + reg_writeq(sub_filter_amount_align | (((uint64_t)args.sub_conv_num) << 32), + REG_DWCONV_FILTER_SUBNUMBER); reg_writeq(filter_N_align, REG_DWCONV_FILTER_N_ALIGN); reg_writeq( @@ -904,10 +964,88 @@ int ComputeDWConv(const struct DWconvArgs &args) { output_scale = reg_readq(REG_SCALE_PARAMETER); output_scale = (output_scale << 32) | (output_scale >> 32); fpga_copy(args.output.scale_address, &output_scale, sizeof(float) * 2); + DLOG << "output_scale:" << output_scale; pthread_mutex_unlock(&g_fpgainfo.pe_data->mutex); return ret; #endif return 0; } +int ComputeDWDeconv(const struct DWDeconvArgs &args) { +#ifdef FPGA_PRINT_MODE + DLOG << "=============ComputeFPGADeConv==========="; + DLOG << " filter_num:" << args.filter_num + << " group_num:" << args.group_num << "omit_size:" << args.omit_size + << "sub_output_width: " << args.sub_output_width + << "sub_output_height: " << args.sub_output_height + << " sub_conv_num:" << args.sub_conv_num; + DLOG << "args.output.address: " << args.output.address + << "args.output.scale_address: " << args.output.scale_address; + +#endif + + int sub_conv_num = args.sub_conv_num; + +#ifdef COST_TIME_PRINT + timeval start, end; + long dif_sec, dif_usec; // NOLINT +#endif + + for (int i = 0; i < sub_conv_num; i++) { +#ifdef COST_TIME_PRINT + gettimeofday(&start, NULL); +#endif + + ComputeDWConv(*args.dw_conv_args[i]); +#ifdef COST_TIME_PRINT + gettimeofday(&end, NULL); + dif_sec = end.tv_sec - start.tv_sec; + dif_usec = end.tv_usec - start.tv_usec; + std::cout << "deconv basic_conv: " << i << " times: " + << " cost time: " << (dif_sec * 1000000 + dif_usec) << "us" + << std::endl; +#endif + } + + if (sub_conv_num > 1) { + float max_scale = -1.0f; +#ifdef COST_TIME_PRINT + gettimeofday(&start, NULL); +#endif + for (int i = 0; i < sub_conv_num; i++) { + paddle_mobile::fpga::fpga_invalidate( + args.dw_conv_args[i]->output.scale_address, 2 * sizeof(float)); + float ptr_scale = (args.dw_conv_args[i]->output.scale_address)[0]; + if (ptr_scale > max_scale) { + args.output.scale_address[0] = ptr_scale; + args.output.scale_address[1] = + (args.dw_conv_args[i]->output.scale_address)[1]; + } + } + +#ifdef COST_TIME_PRINT + gettimeofday(&end, NULL); + dif_sec = end.tv_sec - start.tv_sec; + dif_usec = end.tv_usec - start.tv_usec; + std::cout << "deconv scale " + << " cost time: " << (dif_sec * 1000000 + dif_usec) << "us" + << std::endl; +#endif + } + +#ifdef COST_TIME_PRINT + gettimeofday(&start, NULL); +#endif + DWDeconv_post_process(args); +#ifdef COST_TIME_PRINT + gettimeofday(&end, NULL); + dif_sec = end.tv_sec - start.tv_sec; + dif_usec = end.tv_usec - start.tv_usec; + std::cout << "deconv_post_process " + << " cost time: " << (dif_sec * 1000000 + dif_usec) << "us" + << std::endl; +#endif + return 0; +} // ComputeFpgaDeconv + } // namespace fpga } // namespace paddle_mobile diff --git a/src/fpga/common/fpga_common.cpp b/src/fpga/common/fpga_common.cpp index 0a1787aa3f211a247d95cd7124879ce14af980a9..bf90a3a11926b1f90ed8a659db908a061f79b0e9 100644 --- a/src/fpga/common/fpga_common.cpp +++ b/src/fpga/common/fpga_common.cpp @@ -76,7 +76,7 @@ int32_t convertmantissa(int32_t i) { } float fp16_2_fp32(int16_t fp16_num) { - int16_t se_fp16 = fp16_num >> 10; + int16_t se_fp16 = (fp16_num >> 10) & 0x3f; int16_t m_fp16 = fp16_num & 0x3ff; int32_t e_fp32 = 0; int16_t offset = 0; @@ -94,7 +94,7 @@ float fp16_2_fp32(int16_t fp16_num) { e_fp32 = 0x80000000; offset = 0; } else if (se_fp16 < 63) { - e_fp32 = 0x80000000 + (se_fp16 - 32) << 23; + e_fp32 = 0x80000000 + ((se_fp16 - 32) << 23); offset = 1024; } else { // se_fp16 == 63 e_fp32 = 0xC7800000; diff --git a/src/fpga/common/fpga_common.h b/src/fpga/common/fpga_common.h old mode 100755 new mode 100644 index c9519071fba94ad1e2b526d9e4d5cd96a1bcdbac..25ca99613e91dcbab4ffedf3802f2025afdc040f --- a/src/fpga/common/fpga_common.h +++ b/src/fpga/common/fpga_common.h @@ -229,6 +229,7 @@ struct DeconvArgs { std::vector> split_conv_args; }; struct DWconvArgs { + uint32_t sub_conv_num; bool relu_enabled; void* bias_address; void* filter_address; @@ -236,6 +237,19 @@ struct DWconvArgs { struct ImageInputArgs image; struct ImageOutputArgs output; }; + +struct DWDeconvArgs { + uint32_t sub_conv_num; + uint32_t group_num; + uint32_t filter_num; + uint32_t omit_size; + uint32_t sub_output_width; + uint32_t sub_output_height; + struct ImageOutputArgs output; + std::vector> dw_conv_args; + std::vector> vector_dw_conv_space; +}; + // static inline int align_to_x(int num, int x) { return (num + x - 1) / x * x; // } static inline uint32_t align_to_x(int64_t num, int64_t x) { diff --git a/src/fpga/common/pe.h b/src/fpga/common/pe.h index 9f2800428e431ea302d6cd33685e8ff1dcdc2751..cf0574bc04b05d538766ecba895e97944e1233f8 100644 --- a/src/fpga/common/pe.h +++ b/src/fpga/common/pe.h @@ -18,6 +18,7 @@ limitations under the License. */ namespace paddle_mobile { namespace fpga { +uint64_t FPGAVersion(); int PerformBypass(const struct BypassArgs& args); int ComputeBasicConv(const struct ConvArgs& args); int ComputeFpgaPool(const struct PoolingArgs& args); @@ -28,5 +29,7 @@ int ComputeFPGAConcat(const struct ConcatArgs& args); int ComputeFPGASplit(const struct SplitArgs& args); int ComputeFpgaDeconv(const struct DeconvArgs& args); int ComputeDWConv(const struct DWconvArgs& args); +int ComputeDWDeconv(const struct DWDeconvArgs& args); + } // namespace fpga } // namespace paddle_mobile diff --git a/src/operators/kernel/fpga/V1/deconv_add_kernel.cpp b/src/operators/kernel/fpga/V1/deconv_add_kernel.cpp index 83adddabf0213a441779815d312161d1737d1296..1e21d374cb3651e582f43b2875a9c302ae86cdfb 100644 --- a/src/operators/kernel/fpga/V1/deconv_add_kernel.cpp +++ b/src/operators/kernel/fpga/V1/deconv_add_kernel.cpp @@ -49,13 +49,23 @@ bool DeconvAddKernel::Init(FusionDeconvAddParam *param) { "filter width should be equal to filter height "); PADDLE_MOBILE_ENFORCE(((filter->dims()[2] % param->Strides()[0]) == 0), "filter axis should be the multiple of stride axis "); - fpga::format_deconv_data(filter, out, &bs_ptr, param->Groups(), sub_conv_n); - fpga::DeconvArgs deconv_arg = {0}; - fpga::fill_deconv_arg(&deconv_arg, input, out, filter, relu_enabled, - param->Groups(), param->Strides()[0], - param->Strides()[1], param->Paddings()[0], - param->Paddings()[1], bs_ptr); - param->SetFpgaArgs(deconv_arg); + if (param->Groups() == channel) { + fpga::format_DWDeconv_data(filter, out, &bs_ptr, param->Groups(), + sub_conv_n); + fpga::DWDeconvArgs DWDeconv_arg = {0}; + fpga::fill_DWDeconv_arg(&DWDeconv_arg, input, out, filter, relu_enabled, + param->Strides()[0], param->Strides()[1], + param->Paddings()[0], param->Paddings()[1], bs_ptr); + param->SetFpgaArgs(DWDeconv_arg); + } else { + fpga::format_deconv_data(filter, out, &bs_ptr, param->Groups(), sub_conv_n); + fpga::DeconvArgs deconv_arg = {0}; + fpga::fill_deconv_arg(&deconv_arg, input, out, filter, relu_enabled, + param->Groups(), param->Strides()[0], + param->Strides()[1], param->Paddings()[0], + param->Paddings()[1], bs_ptr); + param->SetFpgaArgs(deconv_arg); + } return true; } @@ -63,7 +73,11 @@ bool DeconvAddKernel::Init(FusionDeconvAddParam *param) { template <> void DeconvAddKernel::Compute( const FusionDeconvAddParam ¶m) { - fpga::ComputeFpgaDeconv(param.FpgaArgs()); + if (param.Groups() == param.Output()->dims()[1]) { + fpga::ComputeDWDeconv(param.FpgaDWDconvArgs()); + } else { + fpga::ComputeFpgaDeconv(param.FpgaArgs()); + } } } // namespace operators diff --git a/src/operators/kernel/fpga/V1/deconv_add_relu_kernel.cpp b/src/operators/kernel/fpga/V1/deconv_add_relu_kernel.cpp index 9a96ca6e53644e6b5a8a99a8eed2f5e92449e681..ca77b2fd440fdfcfb61498205739b3ded6346ebc 100644 --- a/src/operators/kernel/fpga/V1/deconv_add_relu_kernel.cpp +++ b/src/operators/kernel/fpga/V1/deconv_add_relu_kernel.cpp @@ -50,20 +50,35 @@ bool DeconvAddReluKernel::Init( "filter width should be equal to filter height "); PADDLE_MOBILE_ENFORCE(((filter->dims()[2] % param->Strides()[0]) == 0), "filter axis should be the multiple of stride axis "); - fpga::format_deconv_data(filter, out, &bs_ptr, param->Groups(), sub_conv_n); - fpga::DeconvArgs deconv_arg = {0}; - fpga::fill_deconv_arg(&deconv_arg, input, out, filter, relu_enabled, - param->Groups(), param->Strides()[0], - param->Strides()[1], param->Paddings()[0], - param->Paddings()[1], bs_ptr); - param->SetFpgaArgs(deconv_arg); + if (param->Groups() == channel) { + fpga::format_DWDeconv_data(filter, out, &bs_ptr, param->Groups(), + sub_conv_n); + fpga::DWDeconvArgs DWDeconv_arg = {0}; + fpga::fill_DWDeconv_arg(&DWDeconv_arg, input, out, filter, relu_enabled, + param->Strides()[0], param->Strides()[1], + param->Paddings()[0], param->Paddings()[1], bs_ptr); + param->SetFpgaArgs(DWDeconv_arg); + } else { + fpga::format_deconv_data(filter, out, &bs_ptr, param->Groups(), sub_conv_n); + fpga::DeconvArgs deconv_arg = {0}; + fpga::fill_deconv_arg(&deconv_arg, input, out, filter, relu_enabled, + param->Groups(), param->Strides()[0], + param->Strides()[1], param->Paddings()[0], + param->Paddings()[1], bs_ptr); + param->SetFpgaArgs(deconv_arg); + } return true; } template <> void DeconvAddReluKernel::Compute( const FusionDeconvAddReluParam ¶m) { - fpga::ComputeFpgaDeconv(param.FpgaArgs()); + // fpga::ComputeFpgaDeconv(param.FpgaArgs()); + if (param.Groups() == param.Output()->dims()[1]) { + fpga::ComputeDWDeconv(param.FpgaDWDconvArgs()); + } else { + fpga::ComputeFpgaDeconv(param.FpgaArgs()); + } } } // namespace operators diff --git a/src/operators/kernel/fpga/V1/sigmoid_kernel.cpp b/src/operators/kernel/fpga/V1/sigmoid_kernel.cpp new file mode 100644 index 0000000000000000000000000000000000000000..276a8fef62edfabfabb116fada145eedbf23ffa3 --- /dev/null +++ b/src/operators/kernel/fpga/V1/sigmoid_kernel.cpp @@ -0,0 +1,88 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#ifdef SIGMOID_OP + +#include "operators/kernel/activation_kernel.h" +namespace paddle_mobile { +namespace operators { + +using framework::DDim; +using framework::Tensor; + +template <> +bool SigmoidKernel::Init(SigmoidParam *param) { + auto input = const_cast(param->InputX()); + auto input_ptr = input->data(); + auto out = param->Out(); + fpga::format_fp32_ofm(out); + + auto float_input = new Tensor; + if (input->dims().size() == 2) { + float_input->mutable_data({1, input->dims()[1]}); + } else if (input->dims().size() == 4) { + float_input->mutable_data( + {1, input->dims()[2], input->dims()[3], input->dims()[1]}); + } else { + DLOG << "wrong dimension of softmax input"; + } + + fpga::format_fp32_ofm(float_input); + fpga::BypassArgs args = {fpga::DATA_TYPE_FP16}; + args.input_layout_type = fpga::LAYOUT_HWC; + args.output_layout_type = fpga::LAYOUT_CHW; + args.input_data_type = fpga::DATA_TYPE_FP16; + args.output_data_type = fpga::DATA_TYPE_FP32; + args.image.address = input_ptr; + args.image.height = + (input->dims().size() == 4) ? (uint32_t)input->dims()[2] : 1; + args.image.width = + (input->dims().size() == 4) ? (uint32_t)input->dims()[3] : 1; + args.image.channels = (uint32_t)input->dims()[1]; + args.output.address = float_input->data(); + args.output.scale_address = float_input->scale; + param->SetFloatInput(float_input); + param->SetFpgaArgs(args); + + return true; +} +template +T Sigmoid(const T a) { + T tmp = -1.0f * a; + return (1.0 / (1.0 + exp(tmp))); +} +template +void sigmoidFuntor(Tensor *input, Tensor *output) { + auto *input_ptr = input->data(); + auto *output_ptr = output->mutable_data(); + for (int i = 0; i < input->numel(); i++) { + *(output_ptr + i) = Sigmoid(*(input_ptr + i)); + } +} +template <> +void SigmoidKernel::Compute(const SigmoidParam ¶m) { + Tensor *in_x = param.FloatInput(); + Tensor *out = param.Out(); + + fpga::PerformBypass(param.FpgaArgs()); + fpga::fpga_invalidate((void *)in_x->data(), // NOLINT + in_x->numel() * sizeof(float)); + // TODO: In general case, 0 should be squeezed before softmax input // NOLINT + sigmoidFuntor(in_x, out); + fpga::fpga_flush(out->data(), out->memory_size()); +} +} // namespace operators +} // namespace paddle_mobile + +#endif diff --git a/src/operators/op_param.h b/src/operators/op_param.h index 959bfd7f743401a453ab0169ca773285e2904d4e..d90dff2d7e919f736b5cfd0531074944938f2a8a 100644 --- a/src/operators/op_param.h +++ b/src/operators/op_param.h @@ -1078,6 +1078,20 @@ class SigmoidParam : public OpParam { private: RType *input_x_; RType *out_; +#ifdef PADDLE_MOBILE_FPGA + + private: + std::shared_ptr float_input_x_; + fpga::BypassArgs fpga_bypass_args; + + public: + RType *FloatInput() const { + return float_input_x_ == nullptr ? input_x_ : float_input_x_.get(); + } + void SetFloatInput(Tensor *input) { float_input_x_.reset(input); } + const fpga::BypassArgs &FpgaArgs() const { return fpga_bypass_args; } + void SetFpgaArgs(const fpga::BypassArgs &args) { fpga_bypass_args = args; } +#endif }; #endif @@ -2357,10 +2371,17 @@ class ConvTransposeParam : public OpParam { private: fpga::DeconvArgs fpga_conv_args; + fpga::DWDeconvArgs fpga_DWDeconv_args; public: const fpga::DeconvArgs &FpgaArgs() const { return fpga_conv_args; } + const fpga::DWDeconvArgs &FpgaDWDconvArgs() const { + return fpga_DWDeconv_args; + } void SetFpgaArgs(const fpga::DeconvArgs &args) { fpga_conv_args = args; } + void SetFpgaArgs(const fpga::DWDeconvArgs &args) { + fpga_DWDeconv_args = args; + } #endif };