diff --git a/src/fpga/V1/api.cpp b/src/fpga/V1/api.cpp index 4caf588a7fede0fb1f2798cce3e10af34449f6c5..1ab433d0243d416c7d0d91a5344e84ca074e4746 100644 --- a/src/fpga/V1/api.cpp +++ b/src/fpga/V1/api.cpp @@ -140,6 +140,16 @@ void format_filter(framework::Tensor *filter_tensor, float max_value, max_value); filter_tensor->reset_data_ptr(new_data); } +void format_dwconv_filter(framework::Tensor *filter_tensor, float *scale_ptr) { + auto dims = filter_tensor->dims(); + auto num = dims[0], height = dims[2], width = dims[3]; + auto data_ptr = filter_tensor->data(); + size_t memory_size = num * height * width * sizeof(float); + auto new_data = (float *)fpga_malloc(memory_size); // NOLINT + fpga_copy(new_data, data_ptr, memory_size); + filter::format_dwconv_filter(&new_data, num, height, width, scale_ptr); + filter_tensor->reset_data_ptr(new_data); +} void format_fc_filter(framework::Tensor *filter_tensor, float max_value) { filter_tensor->scale[0] = float(max_value / 127.0); // NOLINT @@ -186,6 +196,9 @@ void format_bias_scale_array(float **bias_scale_array, bias_scale::format_bias_scale_array(bias_scale_array, element_num_per_division, num); } +void format_bias_array(float **bias_array, int num) { + bias_scale::format_bias_array(bias_array, num); +} void format_concat_output(framework::Tensor *out, int height, int width, int image_num, uint32_t *channel_num) { @@ -200,7 +213,36 @@ void format_concat_output(framework::Tensor *out, int height, int width, out->Resize(ddim); out->reset_data_ptr(data_ptr); } +void format_conv_data(framework::Tensor *filter_tensor, + framework::Tensor *ofm_tensor, float **bs_ptr, + int group) { + float max_value = fpga::filter_find_max(filter_tensor); + fpga::format_filter(filter_tensor, max_value, group); + int element_num_per_div = fpga::get_filter_num_per_div(filter_tensor, group); + fpga::format_bias_scale_array(bs_ptr, element_num_per_div, + ofm_tensor->dims()[1]); + fpga::format_fp16_ofm(ofm_tensor); +} +void 
format_deconv_data(framework::Tensor *filter_tensor, + framework::Tensor *ofm_tensor, float **bs_ptr, + int group, int sub_conv_n) { + int channel = ofm_tensor->dims()[1]; + float max_value = filter_find_max(filter_tensor); + format_deconv_filter(filter_tensor, max_value, group, sub_conv_n); + int element_num_per_div = + get_deconv_filter_num_per_div(filter_tensor, group, sub_conv_n); + format_bias_scale_array(bs_ptr, element_num_per_div, channel * sub_conv_n); + format_fp16_ofm(ofm_tensor); +} +void format_dwconv_data(framework::Tensor *filter_tensor, + framework::Tensor *ofm_tensor, float *scale_ptr, + float **bias_ptr) { + auto channel = ofm_tensor->dims()[1]; + format_dwconv_filter(filter_tensor, scale_ptr); + format_bias_array(bias_ptr, channel); + format_fp16_ofm(ofm_tensor); +} void expand_conv_arg(ConvArgs *arg) { ConvArgs args = *arg; @@ -360,7 +402,6 @@ void expand_EW_arg(EWAddArgs *arg) { (*arg).driver.output_address_phy = output_address_phy; (*arg).driver.coefficient = coefficient; (*arg).driver.cmd = cmd; - } // expand_EW_arg void fill_split_arg(struct SplitConvArgs *arg, framework::Tensor *input, @@ -399,7 +440,8 @@ void fill_split_arg(struct SplitConvArgs *arg, framework::Tensor *input, auto channel = (int)out->dims()[1]; // NOLINT int filter_num_per_div = get_filter_num_per_div(filter, group_num); int element_num = get_aligned_filter_element_num( - (int)(filter->dims()[1] * filter->dims()[2] * filter->dims()[3])); + (int)(filter->dims()[1] * filter->dims()[2] * // NOLINT + filter->dims()[3])); for (int i = 0; i < n; i++) { arg->conv_arg[i].relu_enabled = relu_enabled; @@ -424,8 +466,8 @@ void fill_split_arg(struct SplitConvArgs *arg, framework::Tensor *input, element_num * align_to_x(arg->conv_arg[i].filter_num, FILTER_NUM_ALIGNMENT) * sizeof(int8_t); - auto filter_head = - &((int8_t *)filter_ptr)[i * element_num * filter_num_per_div]; + auto filter_head = &( + (int8_t *)filter_ptr)[i * element_num * filter_num_per_div]; // NOLINT 
arg->conv_arg[i].filter_address = fpga_malloc(filter_size); memcpy(arg->conv_arg[i].filter_address, filter_head, filter_size); fpga_flush(arg->conv_arg[i].filter_address, filter_size); @@ -441,11 +483,12 @@ void fill_split_arg(struct SplitConvArgs *arg, framework::Tensor *input, if (n > 1) { arg->conv_arg[i].output.scale_address = (float *)fpga_malloc(2 * sizeof(float)); // NOLINT - arg->conv_arg[i].output.address = fpga_malloc( - out->dims()[2] * - align_to_x((int)(out->dims()[3] * arg->conv_arg[i].filter_num), - IMAGE_ALIGNMENT) * - sizeof(half)); + arg->conv_arg[i].output.address = + fpga_malloc(out->dims()[2] * + align_to_x((int)(out->dims()[3] * // NOLINT + arg->conv_arg[i].filter_num), + IMAGE_ALIGNMENT) * + sizeof(half)); } else { arg->conv_arg[i].output.scale_address = out->scale; arg->conv_arg[i].output.address = out_ptr; @@ -474,22 +517,23 @@ void fill_deconv_arg(struct DeconvArgs *arg, framework::Tensor *input, arg->sub_conv_num = (uint32_t)stride_h; arg->filter_num = (uint32_t)filter->dims()[0]; uint32_t sub_conv_num = arg->sub_conv_num; - int sub_pad = deconv_filter::deconv_calc_sub_pad((int)filter->dims()[3], - padding_w, stride_w); + int sub_pad = + deconv_filter::deconv_calc_sub_pad((int)filter->dims()[3], // NOLINT + padding_w, stride_w); auto sub_filter_width = (uint32_t)deconv_filter::deconv_get_sub_filter_axis( - (int)filter->dims()[3], stride_w); + (int)filter->dims()[3], stride_w); // NOLINT auto sub_output_width = (uint32_t)deconv_filter::deconv_get_sub_out_axis( - (int)input->dims()[3], sub_pad, sub_filter_width); + (int)input->dims()[3], sub_pad, sub_filter_width); // NOLINT auto sub_output_height = (uint32_t)deconv_filter::deconv_get_sub_out_axis( - (int)input->dims()[2], sub_pad, sub_filter_width); + (int)input->dims()[2], sub_pad, sub_filter_width); // NOLINT arg->sub_output_width = (uint32_t)sub_output_width; arg->sub_output_height = (uint32_t)sub_output_height; arg->omit_size = (uint32_t)deconv_filter::deconv_get_omit( - stride_w, 
(int)filter->dims()[3], padding_w); + stride_w, (int)filter->dims()[3], padding_w); // NOLINT - auto sub_channels = (int)input->dims()[1]; + auto sub_channels = (int)input->dims()[1]; // NOLINT uint32_t omit_size = arg->omit_size; int real_out_width = sub_output_width * sub_conv_num - 2 * omit_size; int sub_filter_num = sub_conv_num * (arg->filter_num); @@ -499,7 +543,7 @@ void fill_deconv_arg(struct DeconvArgs *arg, framework::Tensor *input, fpga::format_fp16_ofm(out, dims_out_new); auto out_ptr = out->data(); arg->output.address = - (half *)out_ptr + + (half *)out_ptr + // NOLINT omit_size * sizeof(half) * (align_to_x(real_out_width * arg->filter_num, IMAGE_ALIGNMENT)); arg->output.scale_address = out->scale; @@ -510,31 +554,31 @@ void fill_deconv_arg(struct DeconvArgs *arg, framework::Tensor *input, uint32_t split_num = group_num == 1 ? (uint32_t)get_deconv_plit_num(filter, sub_conv_num) : 1; - arg->split_conv_args = - (SplitConvArgs *)fpga_malloc(sub_conv_num * sizeof(SplitConvArgs)); + arg->split_conv_args = (SplitConvArgs *)fpga_malloc( // NOLINT + sub_conv_num * sizeof(SplitConvArgs)); // NOLINT for (int i = 0; i < sub_conv_num; ++i) { arg->split_conv_args[i].filter_num = (arg->sub_conv_num) * (arg->filter_num); arg->split_conv_args[i].group_num = (uint32_t)group_num; arg->split_conv_args[i].split_num = split_num; arg->split_conv_args[i].conv_arg = - (ConvArgs *)fpga_malloc(split_num * sizeof(ConvArgs)); + (ConvArgs *)fpga_malloc(split_num * sizeof(ConvArgs)); // NOLINT arg->split_conv_args[i].concat_arg.height = sub_output_height; arg->split_conv_args[i].concat_arg.width = sub_output_width; arg->split_conv_args[i].concat_arg.image_num = split_num; arg->split_conv_args[i].concat_arg.images_in = - (half **)fpga_malloc(split_num * sizeof(half *)); + (half **)fpga_malloc(split_num * sizeof(half *)); // NOLINT arg->split_conv_args[i].concat_arg.scales_in = - (float **)fpga_malloc(split_num * sizeof(float *)); + (float **)fpga_malloc(split_num * sizeof(float *)); 
// NOLINT arg->split_conv_args[i].concat_arg.channel_num = - (uint32_t *)fpga_malloc(split_num * sizeof(uint32_t)); + (uint32_t *)fpga_malloc(split_num * sizeof(uint32_t)); // NOLINT } auto filter_num_per_div = (uint32_t)get_deconv_filter_num_per_div(filter, group_num, stride_w); int element_num = get_aligned_filter_element_num( - (int)(sub_channels * sub_filter_width * sub_filter_width)); + (int)(sub_channels * sub_filter_width * sub_filter_width)); // NOLINT int chw = sub_channels * sub_filter_width * sub_filter_width; int division_capacity = filter::calc_division_capacity(chw); @@ -558,14 +602,15 @@ void fill_deconv_arg(struct DeconvArgs *arg, framework::Tensor *input, out_addr_offset = 0; } else { - auto ptr_output = (half *)out_ptr; + auto ptr_output = (half *)out_ptr; // NOLINT out_addr_offset = sizeof(half) * (sub_conv_num - 1 - i) * (align_to_x(real_out_width * arg->filter_num, IMAGE_ALIGNMENT)); - arg->split_conv_args[i].output.address = (void *)(ptr_output); + arg->split_conv_args[i].output.address = (void *)(ptr_output); // NOLINT - auto ptr_output_scale = (float *)fpga_malloc(2 * sizeof(float)); + auto ptr_output_scale = + (float *)fpga_malloc(2 * sizeof(float)); // NOLINT arg->split_conv_args[i].output.scale_address = ptr_output_scale; } @@ -609,9 +654,9 @@ void fill_deconv_arg(struct DeconvArgs *arg, framework::Tensor *input, align_to_x(arg->split_conv_args[i].conv_arg[j].filter_num, FILTER_NUM_ALIGNMENT) * sizeof(int8_t); - auto filter_head = - &((int8_t *)filter_ptr)[j * element_num * filter_num_per_div + - i * filter_sub_conv_offset]; + auto filter_head = &(( + int8_t *)filter_ptr)[j * element_num * filter_num_per_div + // NOLINT + i * filter_sub_conv_offset]; arg->split_conv_args[i].conv_arg[j].filter_address = fpga_malloc(filter_size); memcpy(arg->split_conv_args[i].conv_arg[j].filter_address, filter_head, @@ -634,10 +679,12 @@ void fill_deconv_arg(struct DeconvArgs *arg, framework::Tensor *input, 
arg->split_conv_args[i].conv_arg[j].output.scale_address = arg->split_conv_args[i].output.scale_address; } else { - auto ptr_output = (half *)fpga_malloc(conv_output_size * sizeof(half)); + auto ptr_output = + (half *)fpga_malloc(conv_output_size * sizeof(half)); // NOLINT arg->split_conv_args[i].conv_arg[j].output.address = - (void *)((half *)ptr_output); - auto ptr_output_scale = (float *)fpga_malloc(2 * sizeof(float)); + (void *)((half *)ptr_output); // NOLINT + auto ptr_output_scale = + (float *)fpga_malloc(2 * sizeof(float)); // NOLINT arg->split_conv_args[i].conv_arg[j].output.scale_address = ptr_output_scale; } @@ -660,5 +707,30 @@ void fill_deconv_arg(struct DeconvArgs *arg, framework::Tensor *input, fpga_free(bs_ptr); } // fill_deconv_arg +void fill_dwconv_arg(struct DWconvArgs *arg, framework::Tensor *input, + framework::Tensor *out, framework::Tensor *filter, + bool relu_enabled, int stride_h, int stride_w, + int padding_h, int padding_w, float *bias_ptr) { + auto filter_ptr = filter->data(); + auto input_ptr = input->data(); + auto output_ptr = out->mutable_data(); + arg->relu_enabled = relu_enabled; + arg->bias_address = bias_ptr; + arg->filter_address = filter_ptr; + arg->kernel.height = filter->dims()[2]; + arg->kernel.width = filter->dims()[3]; + arg->kernel.stride_h = stride_h; + arg->kernel.stride_w = stride_w; + arg->image.address = input_ptr; + arg->image.channels = (uint32_t)input->dims()[1]; + arg->image.height = (uint32_t)input->dims()[2]; + arg->image.width = (uint32_t)input->dims()[3]; + arg->image.pad_height = padding_h; + arg->image.pad_width = padding_w; + arg->image.scale_address = input->scale; + arg->output.address = output_ptr; + arg->output.scale_address = out->scale; +} // end dwconv arg fill + } // namespace fpga } // namespace paddle_mobile diff --git a/src/fpga/V1/api.h b/src/fpga/V1/api.h index 4cbba4c59a3bb8e4b02ad69c6331b30ad2f22a50..b5c586e92aca2cc8a540ba54479ae7941f42e02c 100644 --- a/src/fpga/V1/api.h +++ 
b/src/fpga/V1/api.h @@ -14,6 +14,7 @@ limitations under the License. */ #pragma once +#include #include "fpga/common/fpga_common.h" #include "fpga/common/pe.h" #include "framework/tensor.h" @@ -40,6 +41,7 @@ void format_filter(framework::Tensor* filter_tensor, float max_value, void format_fc_filter(framework::Tensor* filter_tensor, float max_value); void format_bias_scale_array(float** bias_scale_array, int element_num_per_division, int num); +void format_bias_array(float** bias_array, int num); void format_concat_output(framework::Tensor* out, int height, int width, int image_num, uint32_t* channel_num); @@ -51,16 +53,28 @@ void fill_deconv_arg(struct DeconvArgs* arg, framework::Tensor* input, framework::Tensor* out, framework::Tensor* filter, bool relu_enabled, int group_num, int stride_h, int stride_w, int padding_h, int padding_w, float* bs_ptr); +void fill_dwconv_arg(struct DWconvArgs* arg, framework::Tensor* input, + framework::Tensor* out, framework::Tensor* filter, + bool relu_enabled, int stride_h, int stride_w, + int padding_h, int padding_w, float* bias_ptr); void format_deconv_filter(framework::Tensor* filter_tensor, float max_value, int group_num, int stride); - +void format_dwconv_filter(framework::Tensor* filter_tensor, float* scale_ptr); +void format_conv_data(framework::Tensor* filter_tensor, + framework::Tensor* ofm_tensor, float** bs_ptr, int group); +void format_deconv_data(framework::Tensor* filter_tensor, + framework::Tensor* ofm_tensor, float** bs_ptr, + int group, int sub_conv_n); +void format_dwconv_data(framework::Tensor* filter_tensor, + framework::Tensor* ofm_tensor, float* scale_ptr, + float** bias_ptr); template void savefile(std::string filename, void* buffer, int dataSize, Dtype tmp) { float data; std::ofstream out(filename.c_str()); for (int i = 0; i < dataSize; ++i) { - data = (((Dtype*)buffer)[i]); + data = (((Dtype*)buffer)[i]); // NOLINT out << data << std::endl; } out.close(); diff --git a/src/fpga/V1/bias_scale.cpp 
b/src/fpga/V1/bias_scale.cpp index 263a7494c5602c13208aa0d8899ce80d781aa11b..215ba8c04f361d2aedc645fc202d90231e91b6c7 100644 --- a/src/fpga/V1/bias_scale.cpp +++ b/src/fpga/V1/bias_scale.cpp @@ -82,6 +82,25 @@ void format_bias_scale_array(float **bias_scale_array, interleave(bias_scale_array, div_num * element_num_after_division); fpga_flush(*bias_scale_array, 2 * element_num_after_division * sizeof(float)); } +void format_bias_array(float **bias_array, int num) { + float *ptr_unaligned = *bias_array; + int num_before_align = num; + int num_after_align = align_to_x(num_before_align, BIAS_NUM_ALIGNMENT); + float *ptr_aligned = + (float *)fpga_malloc(num_after_align * sizeof(float)); // NOLINT + + memset(ptr_aligned, 0, num_after_align * sizeof(float)); + if (num < 16) { + memcpy(ptr_aligned, ptr_unaligned, num * sizeof(float)); + for (int i = num; i < num_after_align; i++) { + ptr_aligned[i] = ptr_unaligned[i % num]; + } + } else { + memcpy(ptr_aligned, ptr_unaligned, num * sizeof(float)); + } + fpga_free(ptr_unaligned); + *bias_array = ptr_aligned; +} } // namespace bias_scale } // namespace fpga diff --git a/src/fpga/V1/bias_scale.h b/src/fpga/V1/bias_scale.h old mode 100644 new mode 100755 index 9ae572a556a01719d918809ab9becea2d9fa5a20..9ebdc71bce1df1bd15b4be395de18c57f5ed3c09 --- a/src/fpga/V1/bias_scale.h +++ b/src/fpga/V1/bias_scale.h @@ -22,6 +22,7 @@ void align_element(float** data_in, int num_per_div_before_alignment, int num); void interleave(float** data_in, int num_after_alignment); void format_bias_scale_array(float** bias_scale_array, int element_num_per_division, int num); +void format_bias_array(float** bias_array, int num); } // namespace bias_scale } // namespace fpga diff --git a/src/fpga/V1/filter.cpp b/src/fpga/V1/filter.cpp index 1f204ca30289136fc3ff83e2712ff32842a88100..28fcbc3a201b9f17c3888404cd949dd5b35817ef 100644 --- a/src/fpga/V1/filter.cpp +++ b/src/fpga/V1/filter.cpp @@ -277,7 +277,84 @@ void format_fc_filter(float **data_in, int num, 
int channel, int height, fpga_flush(*quantize_data, align_to_x(chw, FILTER_ELEMENT_ALIGNMENT) * num_after_alignment * sizeof(char)); } +void convert_to_hwn(int16_t **data_in, int num, int height, int width) { + int16_t *tmp = *data_in; + int16_t *data_tmp = + (int16_t *)fpga_malloc(height * width * num * sizeof(int16_t)); // NOLINT + for (int n = 0; n < num; n++) { + for (int h = 0; h < height; h++) { + for (int w = 0; w < width; w++) { + *(data_tmp + h * width * num + w * num + n) = *((*data_in)++); + } + } + } + *data_in = data_tmp; + fpga_free(tmp); +} + +void align_element_nw(int16_t **data_in, int num, int height, int width) { + int unalign_nw = num * width; + int align_nw = align_to_x(num * width, FILTER_ELEMENT_ALIGNMENT); + if (unalign_nw == align_nw) { + return; + } else { + int16_t *tmp = *data_in; + + int num_element = height * align_nw; + int16_t *data_tmp = + (int16_t *)fpga_malloc(num_element * sizeof(int16_t)); // NOLINT + + memset(data_tmp, 0, num_element * sizeof(int16_t)); + if (unalign_nw >= FILTER_ELEMENT_ALIGNMENT) { + for (int h = 0; h < height; h++) { + int offset_unalign = h * unalign_nw; + int offset_align = h * align_nw; + for (int nw = 0; nw < unalign_nw; nw++) { + data_tmp[offset_align + nw] = *((*data_in) + offset_unalign + nw); + } + } + } else { + for (int h = 0; h < height; h++) { + int offset_unalign = h * unalign_nw; + int offset_align = h * align_nw; + for (int nw = 0; nw < align_nw; nw++) { + data_tmp[offset_align + nw] = + *((*data_in) + offset_unalign + nw % unalign_nw); + } + } + } + + *data_in = data_tmp; + free(tmp); + } +} +void quantize_to_fp16(float **data_in, int num, int height, int width, + float *scale_ptr) { + float *tmp = *data_in; + int size = num * height * width; + int16_t *tmp_data = (int16_t *)fpga_malloc(size * sizeof(int16_t)); // NOLINT + for (int n = 0; n < num; n++) { + float scale_val = scale_ptr[n]; + for (int h = 0; h < height; h++) { + for (int w = 0; w < width; w++) { + int index = n * height * width 
+ h * width + w; + tmp_data[index] = fp32_2_fp16((*data_in)[index] * scale_val); + } + } + } + *data_in = (float *)tmp_data; // NOLINT + fpga_free(tmp); +} +void format_dwconv_filter(float **data_in, int num, int height, int width, + float *scale_ptr) { + quantize_to_fp16(data_in, num, height, width, scale_ptr); + int16_t **quantize_data = (int16_t **)data_in; // NOLINT + convert_to_hwn(quantize_data, num, height, width); + align_element_nw(quantize_data, num, height, width); + fpga_flush(*quantize_data, align_to_x(num * width, FILTER_ELEMENT_ALIGNMENT) * + height * sizeof(char)); +} } // namespace filter } // namespace fpga } // namespace paddle_mobile diff --git a/src/fpga/V1/filter.h b/src/fpga/V1/filter.h index 6cb35d380733b0bf64a3a782d44fd321e3f00cfa..c4f44fc72c5010c5a338498f39bda7e9cd594e64 100644 --- a/src/fpga/V1/filter.h +++ b/src/fpga/V1/filter.h @@ -13,7 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. */ #pragma once - +#include namespace paddle_mobile { namespace fpga { namespace filter { @@ -38,6 +38,13 @@ void convert_fc_filter(char** data_in, int num, int chw); void format_fc_filter(float** data_in, int num, int channel, int height, int width, int group_num, float max); +void convert_to_hwn(int16_t** data_in, int num, int height, int width); +void align_element_nw(int16_t** data_in, int num, int height, int width); +void quantize_to_fp16(float** data_in, int num, int height, int width, + float* scale_ptr); +void format_dwconv_filter(float** data_in, int num, int height, int width, + float* scale_ptr); + } // namespace filter } // namespace fpga } // namespace paddle_mobile diff --git a/src/fpga/V1/pe.cpp b/src/fpga/V1/pe.cpp index d70d52d053aea05649e340a768736077311f0f5c..2791cc71a0c7f1323bb5ae603d9c766f777269ca 100644 --- a/src/fpga/V1/pe.cpp +++ b/src/fpga/V1/pe.cpp @@ -24,14 +24,13 @@ limitations under the License. 
*/ #include #include #include -//#include #endif namespace paddle_mobile { namespace fpga { using namespace driver; // NOLINT -using namespace std; +using namespace std; // NOLINT #define USE_RELU 1 #define USE_BIAS 2 @@ -53,7 +52,6 @@ using namespace std; #define INTERRUPT_CONV 0x0004 #define INTERRUPT_POOLING 0x0008 #define INTERRUPT_EW 0x0010 -//#define INTERRUPT_RESIZE 0x0020 /* Register offset */ #define REG_INTERRUPT 0x000 @@ -73,9 +71,6 @@ using namespace std; #define REG_FLASH_STATUS 0x218 #define REG_SN 0x220 -//#define REG_READ_SCALE -//#define REG_WRITE_SCALE - /*bypass*/ #define REG_CONVERT_CMD 0x400 #define REG_CONVERT_SRC_ADDR 0x408 @@ -236,8 +231,10 @@ int ComputeBasicConv(const struct ConvArgs &args) { reg_writeq((uint64_t)args.group_num, REG_CONV_GROUP_NUMBER); reg_writeq((uint64_t)args.filter_num, REG_CONV_FILTER_NUMBER); reg_writeq((uint64_t)args.image.channels, REG_CONV_CHANNEL_NUMBER); - reg_writeq(*(uint64_t *)args.image.scale_address, REG_CONV_IMAGE_SCALE); - reg_writeq(*(uint64_t *)args.filter_scale_address, REG_CONV_FILTER_SCALE); + reg_writeq(*(uint64_t *)args.image.scale_address, // NOLINT + REG_CONV_IMAGE_SCALE); + reg_writeq(*(uint64_t *)args.filter_scale_address, // NOLINT + REG_CONV_FILTER_SCALE); reg_writeq(args.driver.image_address_phy, REG_CONV_IMAGE_BASE_ADDR); reg_writeq(args.driver.filter_address_phy, REG_CONV_FILTER_BASE_ADDR); reg_writeq(args.driver.sb_address_phy, REG_CONV_SB_BASE_ADDR); @@ -280,7 +277,6 @@ int ComputeBasicConv(const struct ConvArgs &args) { return ret; #endif return 0; - } // ComputeBasicConv int ComputeFpgaPool(const struct PoolingArgs &args) { @@ -406,13 +402,11 @@ int ComputeFpgaPool(const struct PoolingArgs &args) { output_scale = reg_readq(REG_SCALE_PARAMETER); output_scale = (output_scale << 32) | (output_scale >> 32); fpga_copy(args.output.scale_address, &output_scale, sizeof(float) * 2); - //*(args.output.timer_cnt) = reg_readq(REG_TIMER_COUNTER); pthread_mutex_unlock(&g_fpgainfo.pe_data->mutex); 
return ret; #endif return 0; - } // ComputeFpgaPool int ComputeFpgaEWAdd(const struct EWAddArgs &args) { @@ -468,13 +462,10 @@ int ComputeFpgaEWAdd(const struct EWAddArgs &args) { output_scale = reg_readq(REG_SCALE_PARAMETER); output_scale = (output_scale << 32) | (output_scale >> 32); fpga_copy(args.output.scale_address, &output_scale, sizeof(float) * 2); - //*(args.output.scale_address) = reg_readq(REG_SCALE_PARAMETER); - //*(args.output.timer_cnt) = reg_readq(REG_TIMER_COUNTER); pthread_mutex_unlock(&g_fpgainfo.pe_data->mutex); return ret; #endif return 0; - } // ComputeFpgaEWAdd int PerformBypass(const struct BypassArgs &args) { @@ -588,13 +579,10 @@ int PerformBypass(const struct BypassArgs &args) { output_scale = reg_readq(REG_SCALE_PARAMETER); output_scale = (output_scale << 32) | (output_scale >> 32); fpga_copy(args.output.scale_address, &output_scale, sizeof(float) * 2); - //*(args.output.scale_address) = reg_readq(REG_SCALE_PARAMETER); - //*(args.output.timer_cnt) = reg_readq(REG_TIMER_COUNTER); pthread_mutex_unlock(&g_fpgainfo.pe_data->mutex); return ret; #endif return 0; - } // PerformBypass int ComputeFPGAConcat(const struct ConcatArgs &args) { @@ -647,13 +635,14 @@ void deconv_post_process(const struct DeconvArgs &args) { for (int hh = 0; hh < origin_h; ++hh) { int hx = (hh % sub_conv_n); auto sub_t = - (int16_t *)(args.split_conv_args[sub_conv_n - hx - 1].output.address); + (int16_t *)(args.split_conv_args[sub_conv_n - hx - 1] // NOLINT + .output.address); int hi = (hh / sub_conv_n); if ((hh < omit_size) || (hh >= (origin_h - omit_size))) continue; int sidx = (nn * origin_h * align_origin_w + hi * align_origin_w + omit_size * channel); - fpga_copy((int16_t *)(args.output.address) + deconv_idx, sub_t + sidx, - sizeof(int16_t) * deconv_row_len); + fpga_copy((int16_t *)(args.output.address) + deconv_idx, // NOLINT + sub_t + sidx, sizeof(int16_t) * deconv_row_len); // NOLINT deconv_idx += align_deconv_row_len; } } @@ -678,7 +667,7 @@ int 
ComputeFpgaDeconv(const struct DeconvArgs &args) { #ifdef COST_TIME_PRINT timeval start, end; - long dif_sec, dif_usec; + long dif_sec, dif_usec; // NOLINT #endif for (int i = 0; i < sub_conv_num; i++) { @@ -723,18 +712,16 @@ int ComputeFpgaDeconv(const struct DeconvArgs &args) { #endif // fpga_flush(args.output.scale_address, 2 * sizeof(float)); -#ifdef COST_TIME_PRINT - gettimeofday(&start, NULL); -#endif - deconv_post_process(args); -#ifdef COST_TIME_PRINT - gettimeofday(&end, NULL); - dif_sec = end.tv_sec - start.tv_sec; - dif_usec = end.tv_usec - start.tv_usec; - std::cout << "deconv_post_process " - << " cost time: " << (dif_sec * 1000000 + dif_usec) << "us" - << std::endl; -#endif + /*#ifdef COST_TIME_PRINT + gettimeofday(&start,NULL); + #endif + //deconv_post_process(args); + #ifdef COST_TIME_PRINT + gettimeofday(&end,NULL); + dif_sec = end.tv_sec - start.tv_sec; + dif_usec = end.tv_usec - start.tv_usec; + std::cout << "deconv_post_process " << " cost time: " << + (dif_sec*1000000+dif_usec) << "us" << std::endl; #endif*/ } return 0; diff --git a/src/fpga/common/fpga_common.h b/src/fpga/common/fpga_common.h old mode 100644 new mode 100755 index bb446dda1a99cee44f48d45f02e25b1ef8b6d6a3..8c48e52a1784789dfbd77d9508c8703aac40b15e --- a/src/fpga/common/fpga_common.h +++ b/src/fpga/common/fpga_common.h @@ -25,6 +25,7 @@ namespace fpga { #define FILTER_NUM_ALIGNMENT 32 // Filter number aligned to 32 #define FILTER_ELEMENT_ALIGNMENT 16 // Filter element number aligned to 16 #define BS_NUM_ALIGNMENT 8 +#define BIAS_NUM_ALIGNMENT 16 #endif enum DataType { @@ -222,7 +223,14 @@ struct DeconvArgs { struct ImageOutputArgs output; struct SplitConvArgs* split_conv_args; }; - +struct DWconvArgs { + bool relu_enabled; + void* bias_address; + void* filter_address; + struct KernelArgs kernel; + struct ImageInputArgs image; + struct ImageOutputArgs output; +}; // static inline int align_to_x(int num, int x) { return (num + x - 1) / x * x; // } static inline uint32_t 
align_to_x(int64_t num, int64_t x) { diff --git a/src/operators/kernel/fpga/V1/conv_add_bn_kernel.cpp b/src/operators/kernel/fpga/V1/conv_add_bn_kernel.cpp old mode 100644 new mode 100755 index 679a95ff54168da821ed0debb80b6bce8eca407b..5d7e180f01aea4c6357d4638f73fbe51b8c18c17 --- a/src/operators/kernel/fpga/V1/conv_add_bn_kernel.cpp +++ b/src/operators/kernel/fpga/V1/conv_add_bn_kernel.cpp @@ -15,7 +15,7 @@ limitations under the License. */ #ifdef FUSION_CONVADDBN_OP #include "operators/kernel/conv_add_bn_kernel.h" - +#include namespace paddle_mobile { namespace operators { @@ -58,14 +58,7 @@ bool ConvAddBNKernel::Init(FusionConvAddBNParam *param) { param->SetNewScale(new_scale); param->SetNewBias(new_bias); - float max_value = fpga::filter_find_max(filter); - fpga::format_filter(filter, max_value, param->Groups()); - - int element_num_per_div = - fpga::get_filter_num_per_div(filter, param->Groups()); - fpga::format_bias_scale_array(&bs_ptr, element_num_per_div, channel); - fpga::format_fp16_ofm(out); - + fpga::format_conv_data(filter, out, &bs_ptr, param->Groups()); fpga::SplitConvArgs conv_arg = {0}; fpga::fill_split_arg(&conv_arg, input, out, filter, relu_enabled, param->Groups(), param->Strides()[0], diff --git a/src/operators/kernel/fpga/V1/conv_add_bn_relu_kernel.cpp b/src/operators/kernel/fpga/V1/conv_add_bn_relu_kernel.cpp new file mode 100644 index 0000000000000000000000000000000000000000..b19db3cffc25282d3414a4da26bbd951494a8f86 --- /dev/null +++ b/src/operators/kernel/fpga/V1/conv_add_bn_relu_kernel.cpp @@ -0,0 +1,95 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#ifdef FUSION_CONVADDBNRELU_OP + +#include "operators/kernel/conv_add_bn_relu_kernel.h" +#include + +namespace paddle_mobile { +namespace operators { + +template <> +bool ConvAddBNReluKernel::Init( + FusionConvAddBNReluParam *param) { + bool relu_enabled = true; + auto input = const_cast(param->Input()); + auto bias = param->Bias(); + auto bias_ptr = bias->data(); + auto filter = const_cast(param->Filter()); + auto out = param->Output(); + + vector paddings = param->Paddings(); + vector strides = param->Strides(); + auto bn_mean_ptr = param->InputMean()->data(); + auto bn_var_ptr = param->InputVariance()->data(); + auto bn_scale_ptr = param->InputScale()->data(); + auto bn_bias_ptr = param->InputBias()->data(); + const float epsilon = param->Epsilon(); + PADDLE_MOBILE_ENFORCE(out->dims()[1] == bias->dims()[0] && + bias->dims()[0] == param->InputBias()->dims()[0], + "Output channel should be equal to bias number"); + + const int channel = out->dims()[1]; + auto bs_ptr = + reinterpret_cast(fpga::fpga_malloc(2 * channel * sizeof(float))); + auto new_scale = new Tensor(); + auto new_bias = new Tensor(); + auto new_scale_ptr = new_scale->mutable_data({channel}); + auto new_bias_ptr = new_bias->mutable_data({channel}); + + for (int i = 0; i < channel; i++) { + new_scale_ptr[i] = bn_scale_ptr[i] / + static_cast(pow((bn_var_ptr[i] + epsilon), 0.5)); + new_bias_ptr[i] = + bn_bias_ptr[i] + (bias_ptr[i] - bn_mean_ptr[i]) * new_scale_ptr[i]; + bs_ptr[i + channel] = new_scale_ptr[i]; + bs_ptr[i] = new_bias_ptr[i]; + } + param->SetNewScale(new_scale); + 
param->SetNewBias(new_bias); + + const int groups = param->Groups(); + if (groups == channel) { + fpga::format_dwconv_data(filter, out, new_scale_ptr, &new_bias_ptr); + fpga::DWconvArgs dwconv_arg = {0}; + fpga::fill_dwconv_arg(&dwconv_arg, input, out, filter, relu_enabled, + strides[0], strides[1], paddings[0], paddings[1], + new_bias_ptr); + param->SetFpgaArgs(dwconv_arg); + } else { + fpga::format_conv_data(filter, out, &bs_ptr, param->Groups()); + fpga::SplitConvArgs conv_arg = {0}; + fpga::fill_split_arg(&conv_arg, input, out, filter, relu_enabled, + param->Groups(), strides[0], strides[1], paddings[0], + paddings[1], bs_ptr); + param->SetFpgaArgs(conv_arg); + } + return true; +} + +template <> +void ConvAddBNReluKernel::Compute( + const FusionConvAddBNReluParam ¶m) { + if (param.Groups() == param.Output()->dims()[1]) { + // fpga::ComputeFpgaConv(param.FpgaDwconvArgs()); + } else { + fpga::ComputeFpgaConv(param.FpgaArgs()); + } +} + +} // namespace operators +} // namespace paddle_mobile + +#endif diff --git a/src/operators/kernel/fpga/V1/conv_add_kernel.cpp b/src/operators/kernel/fpga/V1/conv_add_kernel.cpp old mode 100644 new mode 100755 index 5ad4c86441f7870b00e6639e7cda22083d3c10d5..e566dc9b165811a3e8a9f78d040cc8c571fd93a9 --- a/src/operators/kernel/fpga/V1/conv_add_kernel.cpp +++ b/src/operators/kernel/fpga/V1/conv_add_kernel.cpp @@ -38,15 +38,7 @@ bool ConvAddKernel::Init(FusionConvAddParam *param) { bs_ptr[i] = bias_ptr[i]; } - float max_value = fpga::filter_find_max(filter); - fpga::format_filter(filter, max_value, param->Groups()); - - int element_num_per_div = - fpga::get_filter_num_per_div(filter, param->Groups()); - fpga::format_bias_scale_array(&bs_ptr, element_num_per_div, channel); - - fpga::format_fp16_ofm(out); - + fpga::format_conv_data(filter, out, &bs_ptr, param->Groups()); fpga::SplitConvArgs conv_arg = {0}; fpga::fill_split_arg(&conv_arg, input, out, filter, relu_enabled, param->Groups(), param->Strides()[0], diff --git 
a/src/operators/kernel/fpga/V1/conv_add_relu_kernel.cpp b/src/operators/kernel/fpga/V1/conv_add_relu_kernel.cpp old mode 100644 new mode 100755 index ce2fbbda0ee4c7e0a1e97b45674ef269df3be3be..6b2a2d77c0df29b4c319061776491b0583157d6f --- a/src/operators/kernel/fpga/V1/conv_add_relu_kernel.cpp +++ b/src/operators/kernel/fpga/V1/conv_add_relu_kernel.cpp @@ -38,15 +38,7 @@ bool ConvAddReluKernel::Init(FusionConvAddReluParam *param) { bs_ptr[i] = bias_ptr[i]; } - float max_value = fpga::filter_find_max(filter); - fpga::format_filter(filter, max_value, param->Groups()); - - int element_num_per_div = - fpga::get_filter_num_per_div(filter, param->Groups()); - fpga::format_bias_scale_array(&bs_ptr, element_num_per_div, channel); - - fpga::format_fp16_ofm(out); - + fpga::format_conv_data(filter, out, &bs_ptr, param->Groups()); fpga::SplitConvArgs conv_arg = {0}; fpga::fill_split_arg(&conv_arg, input, out, filter, relu_enabled, param->Groups(), param->Strides()[0], diff --git a/src/operators/kernel/fpga/V1/conv_bn_kernel.cpp b/src/operators/kernel/fpga/V1/conv_bn_kernel.cpp old mode 100644 new mode 100755 index da10350553ddef7a15414b1583580219491e2306..492d418b9023a3c4c802da099a5da5ebf5568649 --- a/src/operators/kernel/fpga/V1/conv_bn_kernel.cpp +++ b/src/operators/kernel/fpga/V1/conv_bn_kernel.cpp @@ -51,15 +51,7 @@ bool ConvBNKernel::Init(FusionConvBNParam *param) { param->SetNewScale(new_scale); param->SetNewBias(new_bias); - float max_value = fpga::filter_find_max(filter); - fpga::format_filter(filter, max_value, param->Groups()); - - int element_num_per_div = - fpga::get_filter_num_per_div(filter, param->Groups()); - fpga::format_bias_scale_array(&bs_ptr, element_num_per_div, channel); - - fpga::format_fp16_ofm(out); - + fpga::format_conv_data(filter, out, &bs_ptr, param->Groups()); fpga::SplitConvArgs conv_arg = {0}; fpga::fill_split_arg(&conv_arg, input, out, filter, relu_enabled, param->Groups(), param->Strides()[0], diff --git 
a/src/operators/kernel/fpga/V1/conv_bn_relu_kernel.cpp b/src/operators/kernel/fpga/V1/conv_bn_relu_kernel.cpp old mode 100644 new mode 100755 index 1382b20cdfc201cc5e048871bf3ebac22ef7ba7e..337b25ffa5d3ba00cd60935f8643213cb5ea70d3 --- a/src/operators/kernel/fpga/V1/conv_bn_relu_kernel.cpp +++ b/src/operators/kernel/fpga/V1/conv_bn_relu_kernel.cpp @@ -51,15 +51,7 @@ bool ConvBNReluKernel::Init(FusionConvBNReluParam *param) { param->SetNewScale(new_scale); param->SetNewBias(new_bias); - float max_value = fpga::filter_find_max(filter); - fpga::format_filter(filter, max_value, param->Groups()); - - int element_num_per_div = - fpga::get_filter_num_per_div(filter, param->Groups()); - fpga::format_bias_scale_array(&bs_ptr, element_num_per_div, channel); - - fpga::format_fp16_ofm(out); - + fpga::format_conv_data(filter, out, &bs_ptr, param->Groups()); fpga::SplitConvArgs conv_arg = {0}; fpga::fill_split_arg(&conv_arg, input, out, filter, relu_enabled, param->Groups(), param->Strides()[0], diff --git a/src/operators/kernel/fpga/V1/deconv_add_kernel.cpp b/src/operators/kernel/fpga/V1/deconv_add_kernel.cpp index 7523ef59d14d7096360d06411685c54564faf29a..83adddabf0213a441779815d312161d1737d1296 100644 --- a/src/operators/kernel/fpga/V1/deconv_add_kernel.cpp +++ b/src/operators/kernel/fpga/V1/deconv_add_kernel.cpp @@ -35,8 +35,8 @@ bool DeconvAddKernel::Init(FusionDeconvAddParam *param) { int channel = out->dims()[1]; int sub_conv_n = param->Strides()[0]; - auto bs_ptr = (float *)fpga::fpga_malloc(2 * channel * sub_conv_n * - sizeof(float)); // NOLINT + auto bs_ptr = (float *)fpga::fpga_malloc(2 * channel * sub_conv_n * // NOLINT + sizeof(float)); // NOLINT for (int i = 0; i < channel * sub_conv_n; i++) { bs_ptr[i + sub_conv_n * channel] = 1; @@ -49,17 +49,7 @@ bool DeconvAddKernel::Init(FusionDeconvAddParam *param) { "filter width should be equal to filter height "); PADDLE_MOBILE_ENFORCE(((filter->dims()[2] % param->Strides()[0]) == 0), "filter axis should be the multiple of 
stride axis "); - - float max_value = fpga::filter_find_max(filter); - fpga::format_deconv_filter(filter, max_value, param->Groups(), - param->Strides()[0]); - - int element_num_per_div = - fpga::get_deconv_filter_num_per_div(filter, param->Groups(), sub_conv_n); - - fpga::format_bias_scale_array(&bs_ptr, element_num_per_div, - channel * sub_conv_n); - + fpga::format_deconv_data(filter, out, &bs_ptr, param->Groups(), sub_conv_n); fpga::DeconvArgs deconv_arg = {0}; fpga::fill_deconv_arg(&deconv_arg, input, out, filter, relu_enabled, param->Groups(), param->Strides()[0], diff --git a/src/operators/kernel/fpga/V1/deconv_add_relu_kernel.cpp b/src/operators/kernel/fpga/V1/deconv_add_relu_kernel.cpp index 4cc0bed75dc8ab0e968bd97f214a035ac22aac7e..9a96ca6e53644e6b5a8a99a8eed2f5e92449e681 100644 --- a/src/operators/kernel/fpga/V1/deconv_add_relu_kernel.cpp +++ b/src/operators/kernel/fpga/V1/deconv_add_relu_kernel.cpp @@ -36,8 +36,8 @@ bool DeconvAddReluKernel::Init( int channel = out->dims()[1]; int sub_conv_n = param->Strides()[0]; - auto bs_ptr = (float *)fpga::fpga_malloc(2 * channel * sub_conv_n * - sizeof(float)); // NOLINT + auto bs_ptr = (float *)fpga::fpga_malloc(2 * channel * sub_conv_n * // NOLINT + sizeof(float)); // NOLINT for (int i = 0; i < channel * sub_conv_n; i++) { bs_ptr[i + sub_conv_n * channel] = 1; @@ -50,17 +50,7 @@ bool DeconvAddReluKernel::Init( "filter width should be equal to filter height "); PADDLE_MOBILE_ENFORCE(((filter->dims()[2] % param->Strides()[0]) == 0), "filter axis should be the multiple of stride axis "); - - float max_value = fpga::filter_find_max(filter); - fpga::format_deconv_filter(filter, max_value, param->Groups(), - param->Strides()[0]); - - int element_num_per_div = - fpga::get_deconv_filter_num_per_div(filter, param->Groups(), sub_conv_n); - - fpga::format_bias_scale_array(&bs_ptr, element_num_per_div, - channel * sub_conv_n); - + fpga::format_deconv_data(filter, out, &bs_ptr, param->Groups(), sub_conv_n); fpga::DeconvArgs 
deconv_arg = {0}; fpga::fill_deconv_arg(&deconv_arg, input, out, filter, relu_enabled, param->Groups(), param->Strides()[0], diff --git a/src/operators/kernel/fpga/V1/reshape_kernel.cpp b/src/operators/kernel/fpga/V1/reshape_kernel.cpp new file mode 100644 index 0000000000000000000000000000000000000000..bef69df54a8a2a26c9eea1491bb08d13201ccd1a --- /dev/null +++ b/src/operators/kernel/fpga/V1/reshape_kernel.cpp @@ -0,0 +1,33 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#ifdef RESHAPE_OP + +#include "operators/kernel/reshape_kernel.h" + +namespace paddle_mobile { +namespace operators { + +template <> +bool ReshapeKernel::Init(ReshapeParam *param) { + return true; +} + +template <> +void ReshapeKernel::Compute(const ReshapeParam &param) {} + +} // namespace operators +} // namespace paddle_mobile + +#endif diff --git a/src/operators/kernel/fpga/V1/softmax_kernel.cpp b/src/operators/kernel/fpga/V1/softmax_kernel.cpp index 918760bcfab0ea6c940fa35b3aebd0351f4d88cf..0cfc3896051095be3f72bfc6008c7a9c27456efd 100644 --- a/src/operators/kernel/fpga/V1/softmax_kernel.cpp +++ b/src/operators/kernel/fpga/V1/softmax_kernel.cpp @@ -28,18 +28,26 @@ bool SoftmaxKernel::Init(SoftmaxParam *param) { fpga::format_fp32_ofm(out); auto float_input = new Tensor; - float_input->mutable_data( - {1, input->dims()[2], input->dims()[3], input->dims()[1]}); - fpga::format_fp32_ofm(float_input); + if (input->dims().size() == 2) { + float_input->mutable_data({1, input->dims()[1]}); + } else if (input->dims().size() == 4) { + float_input->mutable_data( + {1, input->dims()[2], input->dims()[3], input->dims()[1]}); + } else { + DLOG << "wrong dimension of softmax input"; + } + fpga::format_fp32_ofm(float_input); fpga::BypassArgs args = {fpga::DATA_TYPE_FP16}; args.input_layout_type = fpga::LAYOUT_HWC; args.output_layout_type = fpga::LAYOUT_CHW; args.input_data_type = fpga::DATA_TYPE_FP16; args.output_data_type = fpga::DATA_TYPE_FP32; args.image.address = input_ptr; - args.image.height = (uint32_t)input->dims()[2]; - args.image.width = (uint32_t)input->dims()[3]; + args.image.height = + (input->dims().size() == 4) ? (uint32_t)input->dims()[2] : 1; + args.image.width = + (input->dims().size() == 4) ?
(uint32_t)input->dims()[3] : 1; args.image.channels = (uint32_t)input->dims()[1]; args.output.address = float_input->data(); args.output.scale_address = float_input->scale; @@ -56,7 +64,7 @@ void SoftmaxKernel::Compute(const SoftmaxParam &param) { fpga::PerformBypass(param.FpgaArgs()); fpga::fpga_invalidate((void *)in_x->data(), // NOLINT in_x->numel() * sizeof(float)); - // TODO: In general case, 0 should be squeezed before softmax input + // TODO: In general case, 0 should be squeezed before softmax input // NOLINT math::SoftmaxFuntor()(in_x, out); fpga::fpga_flush(out->data(), out->memory_size()); } diff --git a/src/operators/op_param.h b/src/operators/op_param.h index 643bd65ee0c039c177b4cb8d3008fb29e315d371..0362ee44454569980f017c2b527acf8b77e2ff10 100644 --- a/src/operators/op_param.h +++ b/src/operators/op_param.h @@ -462,6 +462,13 @@ class ConvParam : public OpParam { public: const fpga::SplitConvArgs &FpgaArgs() const { return fpga_conv_args; } void SetFpgaArgs(const fpga::SplitConvArgs &args) { fpga_conv_args = args; } + + public: + fpga::DWconvArgs fpga_dwconv_args; + + public: + const fpga::DWconvArgs &FpgaDwconvArgs() const { return fpga_dwconv_args; } + void SetFpgaArgs(const fpga::DWconvArgs &args) { fpga_dwconv_args = args; } #endif }; template diff --git a/src/operators/reshape_op.cpp b/src/operators/reshape_op.cpp old mode 100644 new mode 100755 index 8ceb157d28764de469e5de5108ad483387ba8ca9..28351051098a57a59a11f53a268bf4b8ceac018e --- a/src/operators/reshape_op.cpp +++ b/src/operators/reshape_op.cpp @@ -38,6 +38,9 @@ REGISTER_OPERATOR_CPU(reshape, ops::ReshapeOp); #ifdef PADDLE_MOBILE_MALI_GPU REGISTER_OPERATOR_MALI_GPU(reshape, ops::ReshapeOp); #endif +#ifdef PADDLE_MOBILE_FPGA +REGISTER_OPERATOR_FPGA(reshape, ops::ReshapeOp); +#endif #ifdef PADDLE_MOBILE_CL REGISTER_OPERATOR_CL(reshape, ops::ReshapeOp); #endif diff --git a/tools/op.cmake b/tools/op.cmake index 4fe9988e55629feb65ecd49b5c3a72f6faf12997..3254a83ab2bdb506c4d4b74d5f4d63f157b3db81
100644 --- a/tools/op.cmake +++ b/tools/op.cmake @@ -122,6 +122,11 @@ if (CON GREATER -1) set(SPLIT_OP ON) set(FUSION_DECONVADD_OP ON) set(FUSION_DECONVADDRELU_OP ON) + + set(RESHAPE_OP ON) + set(FUSION_CONVADDBNRELU_OP ON) + set(FUSION_CONVADDBN_OP ON) + set(FOUND_MATCH ON) endif()