diff --git a/src/fpga/V1/api.cpp b/src/fpga/V1/api.cpp index ef7d5c13dce50706d69b28830d8c23748266b216..efcc9f742e88e9a96394a2f6a15118c918624515 100644 --- a/src/fpga/V1/api.cpp +++ b/src/fpga/V1/api.cpp @@ -14,6 +14,7 @@ limitations under the License. */ #include "fpga/V1/api.h" #include "fpga/V1/bias_scale.h" +#include "fpga/V1/deconv_filter.h" #include "fpga/V1/filter.h" #include "fpga/V1/image.h" @@ -124,6 +125,32 @@ void format_fc_filter(framework::Tensor *filter_tensor, float max_value) { max_value); filter_tensor->reset_data_ptr(new_data); } +void format_deconv_filter(framework::Tensor *filter_tensor, float max_value, + int group_num, int stride) { + filter_tensor->scale[0] = float(max_value / 127.0); // NOLINT + filter_tensor->scale[1] = float(127.0 / max_value); // NOLINT + auto dims = filter_tensor->dims(); + auto num = dims[0], channel = dims[1], height = dims[2], width = dims[3]; + auto data_ptr = filter_tensor->data(); + size_t memory_size = num * channel * height * width * sizeof(float); + auto new_data = (float *)fpga_malloc(memory_size); // NOLINT + memcpy(new_data, data_ptr, memory_size); + + int hw = height * width; + deconv_filter::deconv_NC_convert(&new_data, num, channel, hw); + + num = dims[1]; + channel = dims[0]; + deconv_filter::deconv_format_filter( + &new_data, (int)num, (int)channel, // NOLINT + (int)height, // NOLINT + (int)width, group_num, max_value, stride); // NOLINT + + framework::DDim dims_new = + framework::make_ddim({num, channel, height, width}); + filter_tensor->Resize(dims_new); + filter_tensor->reset_data_ptr(new_data); +} void format_bias_scale_array(float **bias_scale_array, int element_num_per_division, int num) { @@ -240,6 +267,100 @@ void fill_split_arg(struct SplitConvArgs *arg, framework::Tensor *input, filter->reset_data_ptr(nullptr); fpga_free(bs_ptr); } +void fill_deconv_arg(struct DeconvArgs *arg, framework::Tensor *input, + framework::Tensor *out, framework::Tensor *filter, + bool relu_enabled, int group_num, int stride_h, + int stride_w, int padding_h, int padding_w, + float *bs_ptr) { + auto input_ptr = input->data(); + auto filter_ptr = filter->data(); + auto out_ptr = out->data(); + + arg->group_num = (uint32_t)group_num; + arg->sub_conv_num = stride_h; + arg->filter_num = (uint32_t)filter->dims()[0]; + + int sub_conv_num = arg->sub_conv_num; + int sub_stride = 1; + int sub_pad = deconv_filter::deconv_calc_sub_pad(filter->dims()[3], padding_w, + stride_w); + int sub_filter_width = + deconv_filter::deconv_get_sub_filter_axis(filter->dims()[3], stride_w); + int sub_output_width = deconv_filter::deconv_get_sub_out_axis( + input->dims()[3], sub_pad, sub_filter_width); + int sub_output_height = deconv_filter::deconv_get_sub_out_axis( + input->dims()[2], sub_pad, sub_filter_width); + + arg->sub_output_width = sub_output_width; + arg->sub_output_height = sub_output_height; + arg->omit_size = + deconv_filter::deconv_get_omit(stride_w, filter->dims()[3], padding_w); + arg->conv_args = (ConvArgs *)fpga_malloc(sub_conv_num * sizeof(ConvArgs)); + + int sub_channels = (int32_t)input->dims()[1]; + int omit_size = arg->omit_size; + int real_out_width = sub_output_width * sub_conv_num - 2 * omit_size; + int real_out_height = sub_output_height * sub_conv_num - 2 * omit_size; + int sub_filter_num = sub_conv_num * (arg->filter_num); + + int conv_output_size = + (align_to_x(sub_output_width * sub_filter_num, IMAGE_ALIGNMENT)) * + sub_output_height; + int ouput_size = conv_output_size * sub_conv_num; + + int align_sub_filter_num = 
align_to_x(sub_filter_num, FILTER_NUM_ALIGNMENT); + int align_sub_filter_count = + align_to_x(sub_filter_width * sub_filter_width * sub_channels, + FILTER_ELEMENT_ALIGNMENT); + int align_conv_sub_filter_count = + align_sub_filter_count * align_sub_filter_num; + + for (int i = 0; i < sub_conv_num; ++i) { + arg->conv_args[i].filter_num = (arg->sub_conv_num) * (arg->filter_num); + arg->conv_args[i].group_num = group_num; + + arg->conv_args[i].filter_scale_address = filter->scale; + arg->conv_args[i].relu_enabled = relu_enabled; + + arg->conv_args[i].kernel.width = sub_filter_width; + arg->conv_args[i].kernel.height = sub_filter_width; + arg->conv_args[i].kernel.stride_w = 1; + arg->conv_args[i].kernel.stride_h = 1; + + // DeconvParam.conv_args[i].image.address = (void*)ptr_image; + arg->conv_args[i].image.scale_address = input->scale; + arg->conv_args[i].image.channels = sub_channels; + arg->conv_args[i].image.width = (uint32_t)input->dims()[3]; + arg->conv_args[i].image.height = (uint32_t)input->dims()[2]; + arg->conv_args[i].image.pad_width = sub_pad; + arg->conv_args[i].image.pad_height = sub_pad; + arg->conv_args[i].image.address = input_ptr; + + arg->conv_args[i].sb_address = (void *)bs_ptr; + + char *filter_sub_space = + (char *)fpga_malloc(align_conv_sub_filter_count * sizeof(char)); + fpga_copy(filter_sub_space, + (char *)filter_ptr + i * align_conv_sub_filter_count, + align_conv_sub_filter_count); + arg->conv_args[i].filter_address = (void *)(filter_sub_space); + fpga_flush(filter_sub_space, align_conv_sub_filter_count); + + if (sub_conv_num == 1) { + arg->conv_args[i].output.address = out_ptr; + arg->conv_args[i].output.scale_address = out->scale; + } else { + half *ptr_output = (half *)fpga_malloc(conv_output_size * sizeof(half)); + arg->conv_args[i].output.address = (void *)((half *)ptr_output); + float *ptr_output_scale = (float *)fpga_malloc(2 * sizeof(float)); + arg->conv_args[i].output.scale_address = ptr_output_scale; + } + } + + arg->output.address = out_ptr; + arg->output.scale_address = out->scale; + // fpga_free(filter_ptr); +} } // namespace fpga } // namespace paddle_mobile diff --git a/src/fpga/V1/api.h b/src/fpga/V1/api.h index daa7902ab4a6cb72a77bba31f8cfe84c897f30a4..dbc3051b5784d3ecdf3cd08afb15242b7208a543 100644 --- a/src/fpga/V1/api.h +++ b/src/fpga/V1/api.h @@ -43,6 +43,25 @@ void fill_split_arg(struct SplitConvArgs* arg, framework::Tensor* input, framework::Tensor* out, framework::Tensor* filter, bool relu_enabled, int group_num, int stride_h, int stride_w, int padding_h, int padding_w, float* bs_ptr); +void fill_deconv_arg(struct DeconvArgs* arg, framework::Tensor* input, + framework::Tensor* out, framework::Tensor* filter, + bool relu_enabled, int group_num, int stride_h, + int stride_w, int padding_h, int padding_w, float* bs_ptr); + +void format_deconv_filter(framework::Tensor* filter_tensor, float max_value, + int group_num, int stride); + +template +void savefile(std::string filename, void* buffer, int dataSize, Dtype tmp) { + float data; + std::ofstream out(filename.c_str()); + for (int i = 0; i < dataSize; ++i) { + data = (((Dtype*)buffer)[i]); + out << data << std::endl; + } + out.close(); + return; +} } // namespace fpga } // namespace paddle_mobile diff --git a/src/fpga/V1/deconv_bias_scale.cpp b/src/fpga/V1/deconv_bias_scale.cpp new file mode 100644 index 0000000000000000000000000000000000000000..0bcc91ddd295603d248639329773e74ef2e33a6a --- /dev/null +++ b/src/fpga/V1/deconv_bias_scale.cpp @@ -0,0 +1,48 @@ +/* Copyright (c) 2018 PaddlePaddle 
Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "fpga/V1/deconv_bias_scale.h" +// #include "deconv_bias_scale.h" +#include "fpga/V1/bias_scale.h" +// #include "bias_scale.h" +#include + +#include "fpga/V1/api.h" +// #include "fpga_api.h" +namespace paddle_mobile { +namespace fpga { +namespace deconv_bias_scale { + +void deconv_bias_scale_expand(float** bias_scale_array, int num, + int sub_conv_n) { + int sub_num = num * sub_conv_n; + float* ptr_tmp = *bias_scale_array; + float* ptr_bias_scale_expand = + (float*)fpga_malloc(sizeof(float) * sub_num * 2); + int scale_base_offset = sub_num; + for (int i = 0; i < sub_conv_n; ++i) { + int offset = num * i; + // copy bias + fpga_copy(ptr_bias_scale_expand + offset, ptr_tmp, num * sizeof(float)); + // copy scale + fpga_copy(ptr_bias_scale_expand + scale_base_offset + offset, ptr_tmp + num, + num * sizeof(float)); + } + *bias_scale_array = ptr_bias_scale_expand; + fpga_free(ptr_tmp); +} + +} // namespace deconv_bias_scale +} // namespace fpga +} // namespace paddle_mobile diff --git a/src/fpga/V1/deconv_bias_scale.h b/src/fpga/V1/deconv_bias_scale.h new file mode 100644 index 0000000000000000000000000000000000000000..7b9aaff756809b43884883b3333fcbc1dd2b6adf --- /dev/null +++ b/src/fpga/V1/deconv_bias_scale.h @@ -0,0 +1,28 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#define BS_NUM_ALIGNMENT 8 + +namespace paddle_mobile { +namespace fpga { +namespace deconv_bias_scale { + +void deconv_bias_scale_expand(float** bias_scale_array, int num, + int sub_conv_n); + +} // namespace deconv_bias_scale +} // namespace fpga +} // namespace paddle_mobile diff --git a/src/fpga/V1/deconv_filter.cpp b/src/fpga/V1/deconv_filter.cpp new file mode 100644 index 0000000000000000000000000000000000000000..96634e520fd8638dd0ded1da7ca5b5ad33bd7aec --- /dev/null +++ b/src/fpga/V1/deconv_filter.cpp @@ -0,0 +1,277 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+See the License for the specific language governing permissions and +limitations under the License. */ + +#include "fpga/V1/deconv_filter.h" +#include +#include +// #include "deconv_filter.h" +#include "fpga/V1/filter.h" +// #include "filter.h" +#include "fpga/V1/api.h" +// #include "fpga_api.h" + +// just for test +//#include +//#include "deconv.h" +//#include "deconv_api.h" +// using namespace std; +// using namespace paddle_mobile::fpga; +// using namespace baidu::fpga::deconv::api; +// namespace api = baidu::fpga::deconv::api; + +namespace paddle_mobile { +namespace fpga { +namespace deconv_filter { + +/* +inverse kernel weights of each channel for every filter +*/ +void deconv_inverse_filter(float** data_in, int num, int channel, int width, + int height) { + float* tmp = *data_in; + // float fix_range = 127;// float scale = fix_range / max; + int data_size = num * channel * width * height; + int hw_len = height * width; + float* tmp_data = (float*)fpga_malloc(data_size * sizeof(float)); + for (int i = 0; i < num; ++i) { + for (int j = 0; j < channel; ++j) { + for (int k = 0; k < hw_len; ++k) { + tmp_data[i * channel * hw_len + j * hw_len + k] = + (*data_in)[i * channel * hw_len + j * hw_len + hw_len - k - 1]; + } + } + } + *data_in = (float*)tmp_data; // + fpga_free(tmp); +} + +/* + calculate sub padding number +*/ +int deconv_calc_sub_pad(int filter_axis, int pad, int stride) { + if (stride == 0 || ((filter_axis - pad - 1) < 0)) { + // error + return 0; + } + return (filter_axis - pad - 1) / stride; +} +int deconv_get_sub_filter_axis(int filter_axis, int stride) { + return (filter_axis / stride); +} + +int deconv_get_sub_out_axis(int image_axis, int sub_pad, int sub_filter_axis) { + return ((image_axis + 2 * sub_pad - sub_filter_axis) + 1); +} + +/* + (filter_width-pad,filter_width-pad) is the first pixel of sub-pixel image + position. 
so the omit rows or columns is (stride - ) +*/ +int deconv_get_omit(int stride, int filter_width, int pad) { + if (((filter_width - pad) <= 0)) { // ((filter_width-pad) > stride) || + // error + return 0; + } + int idx = 1; + bool flag = false; + for (idx = 1; idx <= stride; ++idx) { + int j = idx; + for (; j <= filter_width;) { + if (j == filter_width - pad) { + flag = true; + break; + } + j = j + stride; + } + if (flag) { + break; + } + } + + return (stride - idx); +} + +int deconv_get_sub_filter_num(int filter_num, int stride) { + return filter_num * stride; +} + +void deconv_get_sub_filter(char** data_in, int height, int width, + int sub_conv_n, int kernel_num, int channel) { + char* ptr_tmp = *data_in; + int sub_num = kernel_num * sub_conv_n; + int sub_h = height / sub_conv_n; + int sub_w = width / sub_conv_n; + + int sub_filter_size = + kernel_num * sub_h * sub_w * channel * sub_conv_n * sub_conv_n; + + char* ptr_sub_filter = (char*)fpga_malloc(sub_filter_size * sizeof(char)); + for (int idx = 0; idx < sub_conv_n; ++idx) { + for (int nn = 0; nn < sub_num; ++nn) { + int ni = nn % kernel_num; + + int woff = sub_conv_n - 1 - (nn / kernel_num); // + + for (int hh = 0; hh < sub_h; ++hh) { + int hi = hh * sub_conv_n + idx % sub_conv_n; + for (int ww = 0; ww < sub_w; ++ww) { + int wi = ww * sub_conv_n + woff; // 1 0 + + int sidx = ((nn * sub_h + hh) * sub_w + ww) * channel; // + int kidx = ((ni * height + hi) * width + wi) * channel; // + + fpga_copy( + ptr_sub_filter + idx * sub_h * sub_w * channel * sub_num + sidx, + (*data_in) + kidx, channel * sizeof(char)); + // for (int cc =0; cc < channel; ++cc) { + // ptr_sub_filter[idx*sub_h*sub_w*channel*sub_num + sidx + cc] = + // (*data_in)[kidx + cc]; + // } + } + } + } + } + *data_in = ptr_sub_filter; + fpga_free(ptr_tmp); +} + +void deconv_NC_convert(float** filter_in, int kernel_num, int channels, + int hw) { + float* tmp = *filter_in; + float* ptr_filter = (float*)(paddle_mobile::fpga::fpga_malloc( + hw * kernel_num * channels * sizeof(float))); + + for (int c = 0; c < channels; ++c) { + for (int n = 0; n < kernel_num; ++n) { + paddle_mobile::fpga::fpga_copy(ptr_filter + n * hw + kernel_num * hw * c, + tmp + n * channels * hw + c * hw, + hw * sizeof(float)); + } + } + *filter_in = ptr_filter; + paddle_mobile::fpga::fpga_free(tmp); +} + +void deconv_format_filter(float** data_in, int num, int channel, int height, + int width, int group_num, float max, int stride) { + int data_size = channel * height * width * num; + + /*{ + float result2 = (float)0; + string filename = "origin_filter_data"; + api::savefile(filename, (void *)*data_in, data_size, result2); + }*/ + + deconv_inverse_filter(data_in, num, channel, width, height); + + /* { + float result2 = (float)0; + string filename = "inverse_filter_data"; + api::savefile(filename, (void *)*data_in, data_size, result2); + }*/ + + filter::quantize(data_in, data_size, max); + /* { + char result2 = (char)0; + string filename = "quantize_filter_data"; + api::savefile(filename, (void *)*data_in, data_size, result2); + }*/ + char** quantize_data = (char**)data_in; // NOLINT + + filter::convert_to_hwc(quantize_data, num, channel, height, width); + /*{ + char result2 = (char)0; + string filename = "convert_to_hwc_filter_data"; + api::savefile(filename, (void *)*quantize_data, data_size, + result2); + }*/ + + deconv_get_sub_filter(quantize_data, height, width, stride, num, channel); + /*{ + char result2 = (char)0; + string filename = "sub_filter_filter_data"; + api::savefile(filename, (void 
*)*quantize_data, data_size, result2); +}*/ + + int sub_conv_n = stride; + int sub_h = height / sub_conv_n; + int sub_w = width / sub_conv_n; + int sub_chw = sub_h * sub_w * channel; + int sub_num = sub_conv_n * num; + int division_capacity = filter::calc_division_capacity(sub_chw); + int num_per_div_before_alignment = + filter::calc_num_per_div(sub_num, group_num, division_capacity); + int num_per_div_after_alignment = + align_to_x(num_per_div_before_alignment, FILTER_NUM_ALIGNMENT); + int div_num = (sub_num + num_per_div_before_alignment - 1) / + num_per_div_before_alignment; + int residual = (sub_num) % num_per_div_before_alignment; + int num_after_alignment = num_per_div_after_alignment * + ((residual == 0) ? div_num : (div_num - 1)) + + align_to_x(residual, FILTER_NUM_ALIGNMENT); + + char** ptr_ptr_data = (char**)fpga_malloc(sub_conv_n * sizeof(char*)); + int origin_offset = sub_chw * sub_num; + for (int i = 0; i < sub_conv_n; ++i) { + (ptr_ptr_data)[i] = (char*)fpga_malloc(origin_offset * sizeof(char)); + fpga_copy((ptr_ptr_data)[i], (*quantize_data) + origin_offset * i, + origin_offset * sizeof(char)); + + /* char result2 = (char)0; + string filename = "ptr_ptr_data" + to_string(i); + api::savefile(filename, (void *)(ptr_ptr_data[i]), origin_offset, + result2); + */ + } + // char result2 = (char)0; + // string filename = "interleave"; + // api::savefile(filename, (void *)*ptr_ptr_data, origin_offset, + // result2); + fpga_free(*quantize_data); + + int align_offset = + align_to_x(sub_chw, FILTER_ELEMENT_ALIGNMENT) * num_after_alignment; + char* ptr_space = (char*)fpga_malloc(sub_conv_n * align_offset * + sizeof(char)); // continuous space + for (int i = 0; i < sub_conv_n; ++i) { + int offset = i * origin_offset; + char* ptr_tmp = (ptr_ptr_data)[i]; + + filter::align_element(&ptr_tmp, sub_num, sub_chw); + filter::align_num(&ptr_tmp, num_per_div_before_alignment, sub_num, sub_chw); + + filter::reorder(&ptr_tmp, num_after_alignment, sub_chw); + filter::interleave(&ptr_tmp, num_after_alignment, sub_chw); + + /* char result2 = (char)0; + string filename = "interleave" + to_string(i); + api::savefile(filename, (void *)ptr_tmp, align_offset, result2); +*/ + fpga_copy(ptr_space + i * align_offset, ptr_tmp, align_offset); + fpga_free(ptr_tmp); + } + *data_in = (float*)ptr_space; + + /* { + char result2 = (char)0; + string filename = "ptr_space"; + api::savefile(filename, (void *)ptr_space, sub_conv_n * + align_offset, result2); + }*/ + fpga_flush(ptr_space, sub_conv_n * align_offset * sizeof(char)); +} + +} // namespace deconv_filter +} // namespace fpga +} // namespace paddle_mobile diff --git a/src/fpga/V1/deconv_filter.h b/src/fpga/V1/deconv_filter.h new file mode 100644 index 0000000000000000000000000000000000000000..e89ebe5087a3ab8ee7ef007696e5b1fd50d4173e --- /dev/null +++ b/src/fpga/V1/deconv_filter.h @@ -0,0 +1,36 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#pragma once + +namespace paddle_mobile { +namespace fpga { +namespace deconv_filter { + +void deconv_inverse_filter(float** data_in, int num, int channel, int width, + int height); +int deconv_calc_sub_pad(int filter_axis, int pad, int stride); +int deconv_get_sub_filter_num(int filter_num, int stride); +int deconv_get_sub_filter_axis(int filter_axis, int stride); +int deconv_get_sub_out_axis(int image_axis, int sub_pad, int sub_filter_axis); +int deconv_get_omit(int stride, int filter_width, int pad); +void deconv_get_sub_filter(char** data_in, int height, int width, + int sub_conv_n, int kernel_num, int channel); +void deconv_format_filter(float** data_in, int num, int channel, int height, + int width, int group_num, float max, int stride); +void deconv_NC_convert(float** filter_in, int kernel_num, int channels, int hw); + +} // namespace deconv_filter +} // namespace fpga +} // namespace paddle_mobile diff --git a/src/fpga/V1/filter.cpp b/src/fpga/V1/filter.cpp index d67c9fdc18ac8f715a5389625b6d76b71281e349..1f204ca30289136fc3ff83e2712ff32842a88100 100644 --- a/src/fpga/V1/filter.cpp +++ b/src/fpga/V1/filter.cpp @@ -146,12 +146,16 @@ void align_num(char **data_in, int num_per_div_before_alignment, int num, memset(data_tmp, 0, num_element * sizeof(char)); - for (i = 0; i < div_num; i++) { + for (i = 0; i < div_num - 1; i++) { memcpy(data_tmp + num_per_div_after_alignment * align_chw * i, *data_in + num_per_div_before_alignment * align_chw * i, num_per_div_before_alignment * align_chw); } + memcpy(data_tmp + num_per_div_after_alignment * align_chw * i, + *data_in + num_per_div_before_alignment * align_chw * i, + (num - (div_num - 1) * num_per_div_before_alignment) * align_chw); + *data_in = data_tmp; fpga_free(tmp); } diff --git a/src/fpga/V1/filter.h b/src/fpga/V1/filter.h index 5d03ee9b4a0b1455b27f7c978678bd1dfaa5a698..7bcc334738be98a49fc7eb25cf7090c0f8b6b3b3 100644 --- a/src/fpga/V1/filter.h +++ b/src/fpga/V1/filter.h @@ -29,11 +29,11 @@ void convert_to_hwc(char** data_in, int num, int channel, int height, int width); float find_max(float* data_in, int data_size); void quantize(float** data_in, int data_size, float max); -void align_element(float** data_in, int num, int chw); +void align_element(char** data_in, int num, int chw); void align_num(char** data_in, int num_per_div_before_alignment, int num, int chw); -void reorder(float** data_in, int num_after_alignment, int chw); -void interleave(float** data_in, int num_after_alignment, int chw); +void reorder(char** data_in, int num_after_alignment, int chw); +void interleave(char** data_in, int num_after_alignment, int chw); void format_filter(float** data_in, int num, int channel, int height, int width, int group_num, float max); diff --git a/src/framework/cl/cl_image.h b/src/framework/cl/cl_image.h index 1a906ba4a4f43e1e1b57bbb3652fdc19fa052a78..2002e9e88ebd7a33a85811b0ff41e080c6b57ab9 100644 --- a/src/framework/cl/cl_image.h +++ b/src/framework/cl/cl_image.h @@ -56,6 +56,7 @@ class CLImage { tensor_dims_ = dim; } + bool isInit() { return initialized_; } /* * need call SetTensorData first * diff --git a/src/operators/fusion_conv_bn_add_relu_op.cpp b/src/operators/fusion_conv_bn_add_relu_op.cpp index 1c03e29ea07729efb24f7c86c674ecc72aaceed5..9a3926353319aa267814097d93a6d9b1fa20bd2d 100644 --- a/src/operators/fusion_conv_bn_add_relu_op.cpp +++ b/src/operators/fusion_conv_bn_add_relu_op.cpp @@ -55,6 +55,9 @@ REGISTER_FUSION_MATCHER(fusion_conv_bn_add_relu, #ifdef PADDLE_MOBILE_CPU REGISTER_OPERATOR_CPU(fusion_conv_bn_add_relu, 
ops::FusionConvBNAddReluOp); #endif +#ifdef PADDLE_MOBILE_CL +REGISTER_OPERATOR_CL(fusion_conv_bn_add_relu, ops::FusionConvBNAddReluOp); +#endif #ifdef PADDLE_MOBILE_FPGA REGISTER_OPERATOR_FPGA(fusion_conv_bn_add_relu, ops::FusionConvBNAddReluOp); #endif diff --git a/src/operators/kernel/cl/batchnorm_kernel.cpp b/src/operators/kernel/cl/batchnorm_kernel.cpp index 0d5695cb80736dcc126ce5f726c0a2566884fe45..590e8c5cb028ab4acd3a94df28c5c10dd477d51f 100644 --- a/src/operators/kernel/cl/batchnorm_kernel.cpp +++ b/src/operators/kernel/cl/batchnorm_kernel.cpp @@ -77,15 +77,25 @@ void BatchNormKernel::Compute( auto new_scale = param.NewScale()->GetCLImage(); auto new_bias = param.NewBias()->GetCLImage(); const int out_width = default_work_size[1]; - - clSetKernelArg(kernel, 1, sizeof(int), &out_width); - clSetKernelArg(kernel, 2, sizeof(cl_mem), &input); - clSetKernelArg(kernel, 3, sizeof(cl_mem), &new_scale); - clSetKernelArg(kernel, 4, sizeof(cl_mem), &new_bias); - clSetKernelArg(kernel, 5, sizeof(cl_mem), &out); - - // cl_event out_event = param.OutputY()->GetClEvent(); - // cl_event wait_event = param.InputX()->GetClEvent(); + DLOG << *param.InputX(); + DLOG << *param.NewBias(); + DLOG << *param.NewScale(); + DLOG << default_work_size[0]; + DLOG << default_work_size[1]; + DLOG << default_work_size[2]; + DLOG << out_width; + DLOG << *param.OutputY(); + cl_int status; + clSetKernelArg(kernel, 0, sizeof(cl_int), &out_width); + CL_CHECK_ERRORS(status); + clSetKernelArg(kernel, 1, sizeof(cl_mem), &input); + CL_CHECK_ERRORS(status); + clSetKernelArg(kernel, 2, sizeof(cl_mem), &new_scale); + CL_CHECK_ERRORS(status); + clSetKernelArg(kernel, 3, sizeof(cl_mem), &new_bias); + CL_CHECK_ERRORS(status); + clSetKernelArg(kernel, 4, sizeof(cl_mem), &out); + CL_CHECK_ERRORS(status); clEnqueueNDRangeKernel(this->cl_helper_.CLCommandQueue(), kernel, 3, NULL, default_work_size.data(), NULL, 0, NULL, NULL); } diff --git a/src/operators/kernel/cl/cl_kernel/conv_bn_add_relu_kernel.cl b/src/operators/kernel/cl/cl_kernel/conv_bn_add_relu_kernel.cl new file mode 100644 index 0000000000000000000000000000000000000000..140cbd93d6d2dae35154b8297efd81ecf5c00431 --- /dev/null +++ b/src/operators/kernel/cl/cl_kernel/conv_bn_add_relu_kernel.cl @@ -0,0 +1,20 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + + +#define BATCH_NORM +#define BIASE +#define RELU + +#include "conv_kernel.inc.cl" diff --git a/src/operators/kernel/cl/cl_kernel/conv_kernel.inc.cl b/src/operators/kernel/cl/cl_kernel/conv_kernel.inc.cl index 1085e97c10d27aa99583a86a2e2d70ae11d2d68d..f9bbd4da74d8f017a13df2e9cd0ed344ed27c09b 100644 --- a/src/operators/kernel/cl/cl_kernel/conv_kernel.inc.cl +++ b/src/operators/kernel/cl/cl_kernel/conv_kernel.inc.cl @@ -924,6 +924,387 @@ __kernel void conv_5x5(__private const int global_size_dim0, write_imageh(output_image, (int2)(out_c * global_size_dim1 + out_w, out_nh), output); } +__kernel void convBNAdd_3x3(__private const int global_size_dim0, + __private const int global_size_dim1, + __private const int global_size_dim2, + __read_only image2d_t input_image, + __read_only image2d_t filter, + +#ifdef BIASE + __read_only image2d_t bias, +#endif + +#ifdef BATCH_NORM + __read_only image2d_t new_scale, + __read_only image2d_t new_biase, +#endif + + __write_only image2d_t output_image, + __private const int stride, + __private const int offset, + __private const int input_c, + __private const int dilation, + __private const int input_width,/* of one block */ + __private const int input_height,/* of one block */ + __private const int output_width, + __private const int output_height) { + + const int out_c = get_global_id(0); + const int out_w = get_global_id(1); + const int out_nh = get_global_id(2); + + if (out_c >= global_size_dim0 || + out_w >= global_size_dim1 || + out_nh >= global_size_dim2) { + return; + } + + + int2 stride_xy; + stride_xy.x = stride; + stride_xy.y = stride; + + int2 ouput_pos_in_one_block; + ouput_pos_in_one_block.x = out_w; + ouput_pos_in_one_block.y = out_nh; + + + const sampler_t sampler = CLK_NORMALIZED_COORDS_TRUE | + CLK_ADDRESS_CLAMP | + CLK_FILTER_NEAREST; + + int2 in_pos_in_one_block; + in_pos_in_one_block.x = ouput_pos_in_one_block.x * stride + offset; + in_pos_in_one_block.y = ouput_pos_in_one_block.y * stride + offset; + + + half4 output = (half4)0.0f; + + half4 input[9]; + + for (int i = 0; i < input_c; ++i) { + int2 pos_in = (int2)(i * input_width + in_pos_in_one_block.x, in_pos_in_one_block.y); + input[0] = select(read_imageh(input_image, sampler, + (int2)(pos_in.x - dilation, pos_in.y - dilation)), + (half4)(0.0f), + (ushort4)((in_pos_in_one_block.x - dilation < 0 || in_pos_in_one_block.y - dilation < 0 || in_pos_in_one_block.x - dilation >= input_width || in_pos_in_one_block.y - dilation >= input_height) << 15)); + + input[1] = select(read_imageh(input_image, sampler, + (int2)(pos_in.x, pos_in.y - dilation)), + (half4)(0.0f), + (ushort4)((in_pos_in_one_block.x < 0 || in_pos_in_one_block.y - dilation < 0 || in_pos_in_one_block.x >= input_width || in_pos_in_one_block.y - dilation >= input_height) << 15)); + + input[2] = select(read_imageh(input_image, sampler, + (int2)(pos_in.x + dilation, pos_in.y - dilation)), + (half4)(0.0f), + (ushort4)((in_pos_in_one_block.x + dilation < 0 || in_pos_in_one_block.y - dilation < 0 || in_pos_in_one_block.x + dilation >= input_width || in_pos_in_one_block.y - dilation >= input_height) << 15)); + + input[3] = select(read_imageh(input_image, sampler, + (int2)(pos_in.x - dilation, pos_in.y)), + (half4)(0.0f), + (ushort4)((in_pos_in_one_block.x - dilation < 0 || in_pos_in_one_block.y < 0 || in_pos_in_one_block.x - dilation >= input_width || in_pos_in_one_block.y >= input_height) << 15)); + + input[4] = select(read_imageh(input_image, sampler, + (int2)(pos_in.x, pos_in.y)), + (half4)(0.0f), + 
(ushort4)((in_pos_in_one_block.x < 0 || in_pos_in_one_block.y < 0 || in_pos_in_one_block.x >= input_width || in_pos_in_one_block.y >= input_height) << 15)); + + input[5] = select(read_imageh(input_image, sampler, + (int2)(pos_in.x + dilation, pos_in.y)), + (half4)(0.0f), + (ushort4)((in_pos_in_one_block.x + dilation < 0 || in_pos_in_one_block.y < 0 || in_pos_in_one_block.x + dilation >= input_width || in_pos_in_one_block.y >= input_height) << 15)); + + input[6] = select(read_imageh(input_image, sampler, + (int2)(pos_in.x - dilation, pos_in.y + dilation)), + (half4)(0.0f), + (ushort4)((in_pos_in_one_block.x - dilation < 0 || in_pos_in_one_block.y + dilation < 0 || in_pos_in_one_block.x - dilation >= input_width || in_pos_in_one_block.y + dilation >= input_height) << 15)); + + input[7] = select(read_imageh(input_image, sampler, + (int2)(pos_in.x, pos_in.y + dilation)), + (half4)(0.0f), + (ushort4)((in_pos_in_one_block.x < 0 || in_pos_in_one_block.y + dilation < 0 || in_pos_in_one_block.x >= input_width || in_pos_in_one_block.y + dilation >= input_height) << 15)); + + input[8] = select(read_imageh(input_image, sampler, + (int2)(pos_in.x + dilation, pos_in.y + dilation)), + (half4)(0.0f), + (ushort4)((in_pos_in_one_block.x + dilation < 0 || in_pos_in_one_block.y + dilation < 0 || in_pos_in_one_block.x + dilation >= input_width || in_pos_in_one_block.y + dilation >= input_height) << 15)); + + +/* + for (int j = 0; j < 9; ++j) { + int2 pos_of_weight; + pos_of_weight.x = i * 3 + j % 3; + pos_of_weight.y = out_c * 4 * 3 + 0 * 3 + j / 3; + half4 weight_x = read_imageh(filter, sampler, pos_of_weight); + output.x += dot(input[j], weight_x); + + pos_of_weight.y = out_c * 4 * 3 + 1 * 3 + j / 3; + half4 weight_y = read_imageh(filter, sampler, pos_of_weight); + output.y += dot(input[j], weight_y); + + pos_of_weight.y = out_c * 4 * 3 + 2 * 3 + j / 3; + half4 weight_z = read_imageh(filter, sampler, pos_of_weight); + output.z += dot(input[j], weight_z); + + pos_of_weight.y = out_c * 4 * 3 + 3 * 3 + j / 3; + half4 weight_w = read_imageh(filter, sampler, pos_of_weight); + output.w += dot(input[j], weight_w); + } +*/ + int j = 0; + int2 pos_of_weight; + pos_of_weight.x = i * 3 + j % 3; + pos_of_weight.y = out_c * 4 * 3 + 0 * 3 + j / 3; + half4 weight_x = read_imageh(filter, sampler, pos_of_weight); + output.x += dot(input[j], weight_x); + + pos_of_weight.y = out_c * 4 * 3 + 1 * 3 + j / 3; + half4 weight_y = read_imageh(filter, sampler, pos_of_weight); + output.y += dot(input[j], weight_y); + + pos_of_weight.y = out_c * 4 * 3 + 2 * 3 + j / 3; + half4 weight_z = read_imageh(filter, sampler, pos_of_weight); + output.z += dot(input[j], weight_z); + + pos_of_weight.y = out_c * 4 * 3 + 3 * 3 + j / 3; + half4 weight_w = read_imageh(filter, sampler, pos_of_weight); + output.w += dot(input[j], weight_w); + + j = 1; + pos_of_weight.x = i * 3 + j % 3; + pos_of_weight.y = out_c * 4 * 3 + 0 * 3 + j / 3; + weight_x = read_imageh(filter, sampler, pos_of_weight); + output.x += dot(input[j], weight_x); + + pos_of_weight.y = out_c * 4 * 3 + 1 * 3 + j / 3; + weight_y = read_imageh(filter, sampler, pos_of_weight); + output.y += dot(input[j], weight_y); + + pos_of_weight.y = out_c * 4 * 3 + 2 * 3 + j / 3; + weight_z = read_imageh(filter, sampler, pos_of_weight); + output.z += dot(input[j], weight_z); + + pos_of_weight.y = out_c * 4 * 3 + 3 * 3 + j / 3; + weight_w = read_imageh(filter, sampler, pos_of_weight); + output.w += dot(input[j], weight_w); + + j = 2; + pos_of_weight.x = i * 3 + j % 3; + pos_of_weight.y = out_c * 4 * 3 + 0 
* 3 + j / 3; + weight_x = read_imageh(filter, sampler, pos_of_weight); + output.x += dot(input[j], weight_x); + + pos_of_weight.y = out_c * 4 * 3 + 1 * 3 + j / 3; + weight_y = read_imageh(filter, sampler, pos_of_weight); + output.y += dot(input[j], weight_y); + + pos_of_weight.y = out_c * 4 * 3 + 2 * 3 + j / 3; + weight_z = read_imageh(filter, sampler, pos_of_weight); + output.z += dot(input[j], weight_z); + + pos_of_weight.y = out_c * 4 * 3 + 3 * 3 + j / 3; + weight_w = read_imageh(filter, sampler, pos_of_weight); + output.w += dot(input[j], weight_w); + + j = 3; + pos_of_weight.x = i * 3 + j % 3; + pos_of_weight.y = out_c * 4 * 3 + 0 * 3 + j / 3; + weight_x = read_imageh(filter, sampler, pos_of_weight); + output.x += dot(input[j], weight_x); + + pos_of_weight.y = out_c * 4 * 3 + 1 * 3 + j / 3; + weight_y = read_imageh(filter, sampler, pos_of_weight); + output.y += dot(input[j], weight_y); + + pos_of_weight.y = out_c * 4 * 3 + 2 * 3 + j / 3; + weight_z = read_imageh(filter, sampler, pos_of_weight); + output.z += dot(input[j], weight_z); + + pos_of_weight.y = out_c * 4 * 3 + 3 * 3 + j / 3; + weight_w = read_imageh(filter, sampler, pos_of_weight); + output.w += dot(input[j], weight_w); + + j = 4; + pos_of_weight.x = i * 3 + j % 3; + pos_of_weight.y = out_c * 4 * 3 + 0 * 3 + j / 3; + weight_x = read_imageh(filter, sampler, pos_of_weight); + output.x += dot(input[j], weight_x); + + pos_of_weight.y = out_c * 4 * 3 + 1 * 3 + j / 3; + weight_y = read_imageh(filter, sampler, pos_of_weight); + output.y += dot(input[j], weight_y); + + pos_of_weight.y = out_c * 4 * 3 + 2 * 3 + j / 3; + weight_z = read_imageh(filter, sampler, pos_of_weight); + output.z += dot(input[j], weight_z); + + pos_of_weight.y = out_c * 4 * 3 + 3 * 3 + j / 3; + weight_w = read_imageh(filter, sampler, pos_of_weight); + output.w += dot(input[j], weight_w); + + j = 5; + pos_of_weight.x = i * 3 + j % 3; + pos_of_weight.y = out_c * 4 * 3 + 0 * 3 + j / 3; + weight_x = read_imageh(filter, sampler, pos_of_weight); + output.x += dot(input[j], weight_x); + + pos_of_weight.y = out_c * 4 * 3 + 1 * 3 + j / 3; + weight_y = read_imageh(filter, sampler, pos_of_weight); + output.y += dot(input[j], weight_y); + + pos_of_weight.y = out_c * 4 * 3 + 2 * 3 + j / 3; + weight_z = read_imageh(filter, sampler, pos_of_weight); + output.z += dot(input[j], weight_z); + + pos_of_weight.y = out_c * 4 * 3 + 3 * 3 + j / 3; + weight_w = read_imageh(filter, sampler, pos_of_weight); + output.w += dot(input[j], weight_w); + + j = 6; + pos_of_weight.x = i * 3 + j % 3; + pos_of_weight.y = out_c * 4 * 3 + 0 * 3 + j / 3; + weight_x = read_imageh(filter, sampler, pos_of_weight); + output.x += dot(input[j], weight_x); + + pos_of_weight.y = out_c * 4 * 3 + 1 * 3 + j / 3; + weight_y = read_imageh(filter, sampler, pos_of_weight); + output.y += dot(input[j], weight_y); + + pos_of_weight.y = out_c * 4 * 3 + 2 * 3 + j / 3; + weight_z = read_imageh(filter, sampler, pos_of_weight); + output.z += dot(input[j], weight_z); + + pos_of_weight.y = out_c * 4 * 3 + 3 * 3 + j / 3; + weight_w = read_imageh(filter, sampler, pos_of_weight); + output.w += dot(input[j], weight_w); + + j = 7; + pos_of_weight.x = i * 3 + j % 3; + pos_of_weight.y = out_c * 4 * 3 + 0 * 3 + j / 3; + weight_x = read_imageh(filter, sampler, pos_of_weight); + output.x += dot(input[j], weight_x); + + pos_of_weight.y = out_c * 4 * 3 + 1 * 3 + j / 3; + weight_y = read_imageh(filter, sampler, pos_of_weight); + output.y += dot(input[j], weight_y); + + pos_of_weight.y = out_c * 4 * 3 + 2 * 3 + j / 3; + weight_z = 
read_imageh(filter, sampler, pos_of_weight); + output.z += dot(input[j], weight_z); + + pos_of_weight.y = out_c * 4 * 3 + 3 * 3 + j / 3; + weight_w = read_imageh(filter, sampler, pos_of_weight); + output.w += dot(input[j], weight_w); + + j = 8; + pos_of_weight.x = i * 3 + j % 3; + pos_of_weight.y = out_c * 4 * 3 + 0 * 3 + j / 3; + weight_x = read_imageh(filter, sampler, pos_of_weight); + output.x += dot(input[j], weight_x); + + pos_of_weight.y = out_c * 4 * 3 + 1 * 3 + j / 3; + weight_y = read_imageh(filter, sampler, pos_of_weight); + output.y += dot(input[j], weight_y); + + pos_of_weight.y = out_c * 4 * 3 + 2 * 3 + j / 3; + weight_z = read_imageh(filter, sampler, pos_of_weight); + output.z += dot(input[j], weight_z); + + pos_of_weight.y = out_c * 4 * 3 + 3 * 3 + j / 3; + weight_w = read_imageh(filter, sampler, pos_of_weight); + output.w += dot(input[j], weight_w); + + } + +#ifdef BATCH_NORM + output = output * read_imageh(new_scale, sampler, (int2)(out_c, 0)) + read_imageh(new_biase, sampler, (int2)(out_c, 0)); +#endif + +#ifdef BIASE + output += read_imageh(bias, sampler, (int2)(out_c * global_size_dim1 + out_w, out_nh)); +#endif + +#ifdef RELU + output = activation(output); +#endif + + write_imageh(output_image, (int2)(out_c * global_size_dim1 + out_w, out_nh), output); +} + +__kernel void convBNAdd_1x1(__private const int global_size_dim0, + __private const int global_size_dim1, + __private const int global_size_dim2, + __read_only image2d_t input_image, + __read_only image2d_t filter, +#ifdef BIASE + __read_only image2d_t bias, +#endif +#ifdef BATCH_NORM + __read_only image2d_t new_scale, + __read_only image2d_t new_biase, +#endif + __write_only image2d_t output_image, + __private const int stride, + __private const int offset, + __private const int input_c, + __private const int dilation, + __private const int input_width,/* of one block */ + __private const int input_height,/* of one block */ + __private const int output_width, + __private const int output_height) { + const int out_c = get_global_id(0); + const int out_w = get_global_id(1); + const int out_nh = get_global_id(2); + + const sampler_t sampler = CLK_NORMALIZED_COORDS_TRUE | + CLK_ADDRESS_CLAMP | + CLK_FILTER_NEAREST; + + const uint kernelHXW = 1; + int2 stride_xy = (int2)(stride, stride); + int2 ouput_pos_in_one_block = (int2)(out_w, out_nh); + int2 in_pos_in_one_block = ouput_pos_in_one_block * stride_xy + (int2)(offset, offset); + + + half4 output = 0.0f; + + for (int i = 0; i < input_c; ++i) { + int2 pos_in = (int2)(i * input_width + in_pos_in_one_block.x, in_pos_in_one_block.y); + half4 input = read_imageh(input_image, sampler, pos_in); + + half4 weight0 = read_imageh(filter, sampler, (int2)(out_c, i * 4 + 0)); + half4 weight1 = read_imageh(filter, sampler, (int2)(out_c, i * 4 + 1)); + half4 weight2 = read_imageh(filter, sampler, (int2)(out_c, i * 4 + 2)); + half4 weight3 = read_imageh(filter, sampler, (int2)(out_c, i * 4 + 3)); +/* + output.x = dot(input, weight0); + output.y = dot(input, weight1); + output.z = dot(input, weight2); + output.w = dot(input, weight3); +*/ + + output = mad(input.x, weight0, output); + output = mad(input.y, weight1, output); + output = mad(input.z, weight2, output); + output = mad(input.w, weight3, output); + + } + +#ifdef BATCH_NORM + output = output * read_imageh(new_scale, sampler, (int2)(out_c, 0)) + read_imageh(new_biase, sampler, (int2)(out_c, 0)); +#endif + +#ifdef BIASE + output += read_imageh(bias, sampler, (int2)(out_c * global_size_dim1 + out_w, out_nh)); +#endif + +#ifdef 
RELU + output = activation(output); +#endif + + int2 output_pos = (int2)(out_c * global_size_dim1 + out_w, out_nh); + write_imageh(output_image, output_pos, output); +} diff --git a/src/operators/kernel/cl/conv_bn_add_relu_kernel.cpp b/src/operators/kernel/cl/conv_bn_add_relu_kernel.cpp new file mode 100644 index 0000000000000000000000000000000000000000..4b7352a679afcb1137421040117e9d85d9589e61 --- /dev/null +++ b/src/operators/kernel/cl/conv_bn_add_relu_kernel.cpp @@ -0,0 +1,233 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#ifdef FUSION_CONVBNADDRELU_OP + +#include "operators/kernel/conv_bn_add_relu_kernel.h" + +namespace paddle_mobile { +namespace operators { + +template <> +bool ConvBNAddReluKernel::Init( + FusionConvBNAddReluParam *param) { + PADDLE_MOBILE_ENFORCE( + param->Filter()->dims()[2] == param->Filter()->dims()[3] && + param->Paddings()[0] == param->Paddings()[1], + "need equal"); + + const framework::CLImage *mean = param->InputMean(); + const framework::CLImage *variance = param->InputVariance(); + const framework::CLImage *scale = param->InputScale(); + const framework::CLImage *bias = param->InputBias(); + + const float epsilon = param->Epsilon(); + + const int C = mean->numel(); + + auto mean_ptr = mean->data(); + auto variance_ptr = variance->data(); + auto scale_ptr = scale->data(); + auto bias_ptr = bias->data(); + + float inv_std_ptr[C]; + for (int i = 0; i < C; i++) { + inv_std_ptr[i] = + 1 / static_cast(pow((variance_ptr[i] + epsilon), 0.5)); + } + float *new_scale_ptr = new float[C]; + float *new_bias_ptr = new float[C]; + + for (int i = 0; i < C; i++) { + new_scale_ptr[i] = inv_std_ptr[i] * scale_ptr[i]; + new_bias_ptr[i] = bias_ptr[i] - mean_ptr[i] * inv_std_ptr[i] * scale_ptr[i]; + } + + framework::CLImage *new_scale = new framework::CLImage(); + + // for (int j = 0; j < C; ++j) { + // DLOG << " new scale - " << j << new_scale_ptr[j]; + // } + // + // for (int j = 0; j < C; ++j) { + // DLOG << " new bias - " << j << new_bias_ptr[j]; + // } + + new_scale->SetTensorData(new_scale_ptr, variance->dims()); + new_scale->InitCLImage(this->cl_helper_.CLContext(), + cl_helper_.CLCommandQueue()); + + // DLOG << " climage - y bias: " << *(param->Bias()); + // + // DLOG << " climage - new scale: " << *new_scale; + + framework::CLImage *new_bias = new framework::CLImage(); + + new_bias->SetTensorData(new_bias_ptr, variance->dims()); + new_bias->InitCLImage(this->cl_helper_.CLContext(), + cl_helper_.CLCommandQueue()); + + // DLOG << " climage - new bias: " << *new_bias; + // + // DLOG << " climage - filter: " << *(param->Filter()); + + param->SetNewScale(new_scale); + param->SetNewBias(new_bias); + + delete[](new_scale_ptr); + delete[](new_bias_ptr); + + PADDLE_MOBILE_ENFORCE( + param->Filter()->dims()[2] == param->Filter()->dims()[3] && + param->Paddings()[0] == param->Paddings()[1], + "need equal"); + + int offset = static_cast(param->Filter()->dims()[2]) / 2 - + static_cast(param->Paddings()[1]); + + 
param->SetOffset(offset); + + if (param->Filter()->dims()[2] == 1 && param->Filter()->dims()[3] == 1) { + param->Filter()->InitNImage(cl_helper_.CLContext(), + cl_helper_.CLCommandQueue()); + this->cl_helper_.AddKernel("convBNAdd_1x1", "conv_bn_add_relu_kernel.cl"); + DLOG << " conv bn add relu conv 1x1"; + } else if (param->Filter()->dims()[1] == 1 && + param->Input()->dims()[1] == param->Output()->dims()[1] && + param->Filter()->dims()[2] == 3) { + param->Filter()->InitDWImage(cl_helper_.CLContext(), + cl_helper_.CLCommandQueue()); + this->cl_helper_.AddKernel("depth_convBNAdd_3x3", + "conv_bn_add_relu_kernel.cl"); + DLOG << " conv bn add relu depth_conv_3x3"; + + } else if (param->Filter()->dims()[2] == 3 && + param->Filter()->dims()[3] == 3) { + param->Filter()->InitCLImage(cl_helper_.CLContext(), + cl_helper_.CLCommandQueue()); + + this->cl_helper_.AddKernel("convBNAdd_3x3", "conv_bn_add_relu_kernel.cl"); + DLOG << " conv bn add relu conv_3x3"; + } else { + PADDLE_MOBILE_THROW_EXCEPTION(" not support "); + } + + return true; +} + +template <> +void ConvBNAddReluKernel::Compute( + const FusionConvBNAddReluParam ¶m) { + auto kernel = this->cl_helper_.KernelAt(0); + auto default_work_size = this->cl_helper_.DefaultWorkSize(*param.Output()); + int c_block = default_work_size[0]; + int w = default_work_size[1]; + int nh = default_work_size[2]; + auto input = param.Input()->GetCLImage(); + auto filter = param.Filter()->GetCLImage(); + auto biase = param.Bias()->GetCLImage(); + auto new_scale = param.NewScale()->GetCLImage(); + auto new_bias = param.NewBias()->GetCLImage(); + auto output = param.Output()->GetCLImage(); + int stride = param.Strides()[0]; + int offset = param.Offset(); + int input_c = reinterpret_cast( + param.Input()->Converter()) + ->GetCBlock(); + int dilation = param.Dilations()[0]; + int input_width = param.Input()->dims()[3]; + int input_height = param.Input()->dims()[2]; + int output_width = param.Output()->dims()[3]; + int output_height = param.Output()->dims()[2]; + + // DLOG << " c block " << c_block; + // DLOG << " w " << w; + // DLOG << " nh " << nh; + // DLOG << " stride " << stride; + // DLOG << " offset " << offset; + // DLOG << " input_c " << input_c; + // DLOG << " dilation " << dilation; + // DLOG << " input width " << input_width; + // DLOG << " input height " << input_height; + // DLOG << " output width " << output_width; + // DLOG << " output height " << output_height; + // DLOG << " input dim " << *param.Input(); + // DLOG << " output dim " <<* param.Output(); + // DLOG << " filter dim " << *param.Filter(); + // DLOG<<*param.Bias(); + + cl_int status; + + status = clSetKernelArg(kernel, 0, sizeof(int), &c_block); + CL_CHECK_ERRORS(status); + + status = clSetKernelArg(kernel, 1, sizeof(int), &w); + CL_CHECK_ERRORS(status); + + status = clSetKernelArg(kernel, 2, sizeof(int), &nh); + CL_CHECK_ERRORS(status); + + status = clSetKernelArg(kernel, 3, sizeof(cl_mem), &input); + CL_CHECK_ERRORS(status); + + status = clSetKernelArg(kernel, 4, sizeof(cl_mem), &filter); + CL_CHECK_ERRORS(status); + + status = clSetKernelArg(kernel, 5, sizeof(cl_mem), &biase); + CL_CHECK_ERRORS(status); + + status = clSetKernelArg(kernel, 6, sizeof(cl_mem), &new_scale); + CL_CHECK_ERRORS(status); + + status = clSetKernelArg(kernel, 7, sizeof(cl_mem), &new_bias); + CL_CHECK_ERRORS(status); + + status = clSetKernelArg(kernel, 8, sizeof(cl_mem), &output); + CL_CHECK_ERRORS(status); + + status = clSetKernelArg(kernel, 9, sizeof(int), &stride); + CL_CHECK_ERRORS(status); + + status = 
clSetKernelArg(kernel, 10, sizeof(int), &offset); + CL_CHECK_ERRORS(status); + + status = clSetKernelArg(kernel, 11, sizeof(int), &input_c); + CL_CHECK_ERRORS(status); + + status = clSetKernelArg(kernel, 12, sizeof(int), &dilation); + CL_CHECK_ERRORS(status); + + status = clSetKernelArg(kernel, 13, sizeof(int), &input_width); + CL_CHECK_ERRORS(status); + + status = clSetKernelArg(kernel, 14, sizeof(int), &input_height); + CL_CHECK_ERRORS(status); + + status = clSetKernelArg(kernel, 15, sizeof(int), &output_width); + CL_CHECK_ERRORS(status); + + status = clSetKernelArg(kernel, 16, sizeof(int), &output_height); + CL_CHECK_ERRORS(status); + + status = clEnqueueNDRangeKernel( + this->cl_helper_.CLCommandQueue(), kernel, default_work_size.size(), NULL, + default_work_size.data(), NULL, 0, NULL, NULL); + CL_CHECK_ERRORS(status); +} +template class ConvBNAddReluKernel; + +} // namespace operators +} // namespace paddle_mobile + +#endif diff --git a/src/operators/kernel/cl/depthwise_conv_kernel.cpp b/src/operators/kernel/cl/depthwise_conv_kernel.cpp index 35813a31f570c8daf956e4c90d0f3e3de1675eb4..372c25b596206812a664e49c57e3108c607d513f 100644 --- a/src/operators/kernel/cl/depthwise_conv_kernel.cpp +++ b/src/operators/kernel/cl/depthwise_conv_kernel.cpp @@ -1,97 +1,96 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#ifdef DEPTHWISECONV_OP - -#include "operators/kernel/depthwise_conv_kernel.h" -#include "operators/kernel/central-arm-func/depthwise_conv_arm_func.h" - -namespace paddle_mobile { -namespace operators { - -template <> -bool DepthwiseConvKernel::Init(ConvParam *param) { - DLOG << " depthwise conv kernel init begin "; - PADDLE_MOBILE_ENFORCE( - param->Filter()->dims()[2] == param->Filter()->dims()[3] && - param->Paddings()[0] == param->Paddings()[1], - "need equal"); - param->Filter()->InitCLImage(cl_helper_.CLContext(), - this->cl_helper_.CLCommandQueue()); - int offset = static_cast(param->Filter()->dims()[2]) / 2 - - static_cast(param->Paddings()[1]); - param->SetOffset(offset); - this->cl_helper_.AddKernel("depth_conv_3x3", "conv_add_bn_relu_kernel.cl"); - DLOG << " depthwise conv kernel init end "; - return true; -} - -template <> -void DepthwiseConvKernel::Compute( - const ConvParam ¶m) { - auto kernel = this->cl_helper_.KernelAt(0); - auto default_work_size = this->cl_helper_.DefaultWorkSize(*param.Output()); - int c_block = default_work_size[0]; - int w = default_work_size[1]; - int nh = default_work_size[2]; - auto input = param.Input()->GetCLImage(); - auto filter = param.Filter()->GetCLImage(); - auto output = param.Output()->GetCLImage(); - int stride = param.Strides()[0]; - int offset = param.Offset(); - int input_c = reinterpret_cast( - param.Input()->Converter()) - ->GetCBlock(); - int dilation = param.Dilations()[0]; - - int input_width = param.Input()->dims()[3]; - int input_height = param.Input()->dims()[2]; - int output_width = param.Output()->dims()[3]; - int output_height = param.Output()->dims()[2]; - - cl_int status; - - status = clSetKernelArg(kernel, 0, sizeof(int), &c_block); - status = clSetKernelArg(kernel, 1, sizeof(int), &w); - status = clSetKernelArg(kernel, 2, sizeof(int), &nh); - status = clSetKernelArg(kernel, 3, sizeof(cl_mem), &input); - status = clSetKernelArg(kernel, 4, sizeof(cl_mem), &filter); - status = clSetKernelArg(kernel, 5, sizeof(cl_mem), &output); - status = clSetKernelArg(kernel, 6, sizeof(int), &stride); - status = clSetKernelArg(kernel, 7, sizeof(int), &offset); - status = clSetKernelArg(kernel, 8, sizeof(int), &input_c); - status = clSetKernelArg(kernel, 9, sizeof(int), &dilation); - status = clSetKernelArg(kernel, 10, sizeof(int), &input_width); - status = clSetKernelArg(kernel, 11, sizeof(int), &input_height); - status = clSetKernelArg(kernel, 12, sizeof(int), &output_width); - status = clSetKernelArg(kernel, 13, sizeof(int), &output_height); - - CL_CHECK_ERRORS(status); - - // cl_event out_event = param.Output()->GetClEvent(); - // cl_event wait_event = param.Input()->GetClEvent(); - - status = clEnqueueNDRangeKernel( - this->cl_helper_.CLCommandQueue(), kernel, default_work_size.size(), NULL, - default_work_size.data(), NULL, 0, NULL, NULL); - - CL_CHECK_ERRORS(status); -} - -template class DepthwiseConvKernel; - -} // namespace operators -} // namespace paddle_mobile - -#endif +///* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and +// limitations under the License. */ +// +//#ifdef DEQUANT_OP +// +//#include "operators/kernel/dequantize_kernel.h" +// +// namespace paddle_mobile { +// namespace operators { +// +// template <> +// bool DequantizeKernel::Init(DequantizeParam *param) { +// DLOG << " depthwise conv kernel init begin "; +// PADDLE_MOBILE_ENFORCE( +// param->Filter()->dims()[2] == param->Filter()->dims()[3] && +// param->Paddings()[0] == param->Paddings()[1], +// "need equal"); +// param->Filter()->InitCLImage(cl_helper_.CLContext(), +// this->cl_helper_.CLCommandQueue()); +// int offset = static_cast(param->Filter()->dims()[2]) / 2 - +// static_cast(param->Paddings()[1]); +// param->SetOffset(offset); +// this->cl_helper_.AddKernel("depth_conv_3x3", "conv_add_bn_relu_kernel.cl"); +// DLOG << " depthwise conv kernel init end "; +// return true; +//} +// +// template <> +// void DequantizeKernel::Compute( +// const DequantizeParam ¶m) { +// auto kernel = this->cl_helper_.KernelAt(0); +// auto default_work_size = this->cl_helper_.DefaultWorkSize(*param.Output()); +// int c_block = default_work_size[0]; +// int w = default_work_size[1]; +// int nh = default_work_size[2]; +// auto input = param.Input()->GetCLImage(); +// auto filter = param.Filter()->GetCLImage(); +// auto output = param.Output()->GetCLImage(); +// int stride = param.Strides()[0]; +// int offset = param.Offset(); +// int input_c = reinterpret_cast( +// param.Input()->Converter()) +// ->GetCBlock(); +// int dilation = param.Dilations()[0]; +// +// int input_width = param.Input()->dims()[3]; +// int input_height = param.Input()->dims()[2]; +// int output_width = param.Output()->dims()[3]; +// int output_height = param.Output()->dims()[2]; +// +// cl_int status; +// +// status = clSetKernelArg(kernel, 0, sizeof(int), &c_block); +// status = clSetKernelArg(kernel, 1, sizeof(int), &w); +// status = clSetKernelArg(kernel, 2, sizeof(int), &nh); +// status = clSetKernelArg(kernel, 3, sizeof(cl_mem), &input); +// status = clSetKernelArg(kernel, 4, sizeof(cl_mem), &filter); +// status = clSetKernelArg(kernel, 5, sizeof(cl_mem), &output); +// status = clSetKernelArg(kernel, 6, sizeof(int), &stride); +// status = clSetKernelArg(kernel, 7, sizeof(int), &offset); +// status = clSetKernelArg(kernel, 8, sizeof(int), &input_c); +// status = clSetKernelArg(kernel, 9, sizeof(int), &dilation); +// status = clSetKernelArg(kernel, 10, sizeof(int), &input_width); +// status = clSetKernelArg(kernel, 11, sizeof(int), &input_height); +// status = clSetKernelArg(kernel, 12, sizeof(int), &output_width); +// status = clSetKernelArg(kernel, 13, sizeof(int), &output_height); +// +// CL_CHECK_ERRORS(status); +// +// // cl_event out_event = param.Output()->GetClEvent(); +// // cl_event wait_event = param.Input()->GetClEvent(); +// +// status = clEnqueueNDRangeKernel( +// this->cl_helper_.CLCommandQueue(), kernel, default_work_size.size(), +// NULL, default_work_size.data(), NULL, 0, NULL, NULL); +// +// CL_CHECK_ERRORS(status); +//} +// +// template class DepthwiseConvKernel; +// +//} // namespace operators +//} // namespace paddle_mobile +// +//#endif diff --git a/src/operators/kernel/cl/elementwise_add_kernel.cpp b/src/operators/kernel/cl/elementwise_add_kernel.cpp index e62714b3fa3182706270627e7fd1a13b06f3b66a..4a12bde4f38f7d7c256ff61b0c9a4f0af8c54da6 100644 --- a/src/operators/kernel/cl/elementwise_add_kernel.cpp +++ b/src/operators/kernel/cl/elementwise_add_kernel.cpp @@ -24,7 +24,11 @@ bool 
ElementwiseAddKernel::Init( ElementwiseAddParam *param) { DLOG << "-----init add-----"; CLImage *bias = (CLImage *)(param->InputY()); - bias->InitCLImage(cl_helper_.CLContext(), this->cl_helper_.CLCommandQueue()); + if (!bias->isInit()) { + bias->InitCLImage(cl_helper_.CLContext(), + this->cl_helper_.CLCommandQueue()); + } + DLOG << " bias: " << *bias; if (bias->dims().size() == 4) { this->cl_helper_.AddKernel("elementwise_add", "elementwise_add_kernel.cl"); diff --git a/src/operators/kernel/fpga/V1/deconv_add_kernel.cpp b/src/operators/kernel/fpga/V1/deconv_add_kernel.cpp index 39d7e818976b56eaea8649392784e7b5dc8b7e1f..7ab25193c86228ba290bc4a79fd1fa203a8e3617 100644 --- a/src/operators/kernel/fpga/V1/deconv_add_kernel.cpp +++ b/src/operators/kernel/fpga/V1/deconv_add_kernel.cpp @@ -23,12 +23,61 @@ namespace operators { template <> bool DeconvAddKernel::Init(FusionDeconvAddParam *param) { + bool relu_enabled = false; + auto input = const_cast(param->Input()); + const Tensor *bias = param->Bias(); + auto bias_ptr = bias->data(); + auto filter = const_cast(param->Filter()); + auto out = param->Output(); + + PADDLE_MOBILE_ENFORCE(out->dims()[1] == bias->dims()[0], + "Output channel should be equal to bias number"); + int channel = out->dims()[1]; + + int sub_conv_n = param->Strides()[0]; + auto bs_ptr = (float *)fpga::fpga_malloc(2 * channel * sub_conv_n * + sizeof(float)); // NOLINT + + for (int i = 0; i < channel * sub_conv_n; i++) { + bs_ptr[i + sub_conv_n * channel] = 1; + bs_ptr[i] = bias_ptr[i % (channel)]; + } + + PADDLE_MOBILE_ENFORCE(param->Strides()[1] == param->Strides()[0], + "stride_width should be equal to stride_height "); + PADDLE_MOBILE_ENFORCE(filter->dims()[2] == filter->dims()[3], + "filter width should be equal to filter height "); + PADDLE_MOBILE_ENFORCE(((filter->dims()[2] % param->Strides()[0]) == 0), + "filter axis should be the multiple of stride axis "); + + float max_value = fpga::filter_find_max(filter); + fpga::format_deconv_filter(filter, max_value, param->Groups(), + param->Strides()[0]); + + // int element_num_per_div = + // fpga::get_filter_num_per_div(filter, param->Groups()); + + // deconv only support group=1 && no spilt + fpga::format_bias_scale_array(&bs_ptr, channel * sub_conv_n, + channel * sub_conv_n); + + fpga::format_fp16_ofm(out); + + fpga::DeconvArgs deconv_arg = {0}; + fpga::fill_deconv_arg(&deconv_arg, input, out, filter, relu_enabled, + param->Groups(), param->Strides()[0], + param->Strides()[1], param->Paddings()[0], + param->Paddings()[1], bs_ptr); + param->SetFpgaArgs(deconv_arg); + return true; } template <> void DeconvAddKernel::Compute( - const FusionDeconvAddParam ¶m) {} + const FusionDeconvAddParam ¶m) { + fpga::ComputeFpgaDeconv(param.FpgaArgs()); +} } // namespace operators } // namespace paddle_mobile diff --git a/src/operators/kernel/fpga/V1/deconv_add_relu_kernel.cpp b/src/operators/kernel/fpga/V1/deconv_add_relu_kernel.cpp index ef2556208a8650a86522264f40f42cb596ec4190..a9a6909b00b7608d5edb26aa943c1c9cae663b3e 100644 --- a/src/operators/kernel/fpga/V1/deconv_add_relu_kernel.cpp +++ b/src/operators/kernel/fpga/V1/deconv_add_relu_kernel.cpp @@ -24,12 +24,60 @@ namespace operators { template <> bool DeconvAddReluKernel::Init( FusionDeconvAddReluParam *param) { + bool relu_enabled = true; + auto input = const_cast(param->Input()); + const Tensor *bias = param->Bias(); + auto bias_ptr = bias->data(); + auto filter = const_cast(param->Filter()); + auto out = param->Output(); + + PADDLE_MOBILE_ENFORCE(out->dims()[1] == bias->dims()[0], + 
"Output channel should be equal to bias number"); + int channel = out->dims()[1]; + + int sub_conv_n = param->Strides()[0]; + auto bs_ptr = (float *)fpga::fpga_malloc(2 * channel * sub_conv_n * + sizeof(float)); // NOLINT + + for (int i = 0; i < channel * sub_conv_n; i++) { + bs_ptr[i + sub_conv_n * channel] = 1; + bs_ptr[i] = bias_ptr[i % (channel)]; + } + + PADDLE_MOBILE_ENFORCE(param->Strides()[1] == param->Strides()[0], + "stride_width should be equal to stride_height "); + PADDLE_MOBILE_ENFORCE(filter->dims()[2] == filter->dims()[3], + "filter width should be equal to filter height "); + PADDLE_MOBILE_ENFORCE(((filter->dims()[2] % param->Strides()[0]) == 0), + "filter axis should be the multiple of stride axis "); + + float max_value = fpga::filter_find_max(filter); + fpga::format_deconv_filter(filter, max_value, param->Groups(), + param->Strides()[0]); + + // int element_num_per_div = + // fpga::get_filter_num_per_div(filter, param->Groups()); + + // deconv only support group=1 && no spilt + fpga::format_bias_scale_array(&bs_ptr, channel * sub_conv_n, + channel * sub_conv_n); + + fpga::format_fp16_ofm(out); + + fpga::DeconvArgs deconv_arg = {0}; + fpga::fill_deconv_arg(&deconv_arg, input, out, filter, relu_enabled, + param->Groups(), param->Strides()[0], + param->Strides()[1], param->Paddings()[0], + param->Paddings()[1], bs_ptr); + param->SetFpgaArgs(deconv_arg); return true; } template <> void DeconvAddReluKernel::Compute( - const FusionDeconvAddReluParam ¶m) {} + const FusionDeconvAddReluParam ¶m) { + fpga::ComputeFpgaDeconv(param.FpgaArgs()); +} } // namespace operators } // namespace paddle_mobile