提交 7d999238 编写于 作者: J jameswu2014 提交者: GitHub

Merge pull request #1564 from qnqinan/develop

add static quantization code and update FPGA V2(V3) related files fixed #1563
此差异已折叠。
...@@ -14,6 +14,7 @@ limitations under the License. */ ...@@ -14,6 +14,7 @@ limitations under the License. */
#pragma once #pragma once
#include <string>
#include "fpga/common/fpga_common.h" #include "fpga/common/fpga_common.h"
#include "fpga/common/pe.h" #include "fpga/common/pe.h"
#include "framework/tensor.h" #include "framework/tensor.h"
...@@ -21,31 +22,81 @@ limitations under the License. */ ...@@ -21,31 +22,81 @@ limitations under the License. */
namespace paddle_mobile { namespace paddle_mobile {
namespace fpga { namespace fpga {
void format_image(framework::Tensor* image_tensor);
void format_ofm(framework::Tensor* ofm_tensor);
void format_fp16_ofm(framework::Tensor* ofm_tensor); // only allocate memory
void format_fp16_ofm(framework::Tensor* ofm_tensor, framework::DDim dims);
void format_fp32_ofm(framework::Tensor* ofm_tensor);
float filter_find_max(framework::Tensor* filter_tensor); float filter_find_max(framework::Tensor* filter_tensor);
int get_aligned_channel_num(int channel_num); int get_filter_num_per_div(framework::Tensor* filter_tensor, int group_num);
int get_aligned_filter_num(framework::Tensor* filter_tensor); int get_deconv_filter_num_per_div(framework::Tensor* filter_tensor,
int get_conv_output_channel(framework::Tensor* filter_tensor); int group_num, int stride);
void format_image(framework::Tensor* image_tensor); int get_plit_num(framework::Tensor* filter_tensor);
void format_fp16_ofm(framework::Tensor* ofm_tensor, int get_deconv_plit_num(framework::Tensor* filter_tensor, int stride);
int aligned_channel); // only allocate memory
void format_fp32_ofm(framework::Tensor* ofm_tensor, int aligned_channel);
int get_aligned_filter_element_num(int chw);
void format_filter(framework::Tensor* filter_tensor, float max_value, void format_filter(framework::Tensor* filter_tensor, float max_value,
int group_num); int group_num);
void format_fc_filter(framework::Tensor* filter_tensor, float max_value); void format_fc_filter(framework::Tensor* filter_tensor, float max_value);
void format_bias_scale_array(float** bias_scale_array, int filter_num, void format_bias_scale_array(float** bias_scale_array,
int filter_channel); int element_num_per_division, int num);
void format_bias_array(float** bias_array, int num);
void format_concat_output(framework::Tensor* out, int height, int width, void format_concat_output(framework::Tensor* out, int height, int width,
uint32_t out_channel); int image_num, uint32_t* channel_num);
int format_conv_data(framework::Tensor* filter_tensor,
framework::Tensor* ofm_tensor, float** bs_ptr, int group);
int format_fc_data(framework::Tensor* filter_tensor,
framework::Tensor* ofm_tensor, float** bs_ptr);
void fill_split_arg(struct SplitConvArgs* arg, framework::Tensor* input, void fill_split_arg(struct SplitConvArgs* arg, framework::Tensor* input,
framework::Tensor* out, framework::Tensor* filter, framework::Tensor* out, framework::Tensor* filter,
bool relu_enabled, int group_num, int stride_h, ActivationType activation_enable,
int stride_w, int padding_h, int padding_w, float* bs_ptr); int16_t leaky_relu_negative_slope, int group_num,
int stride_h, int stride_w, int padding_h, int padding_w,
float* bs_ptr);
void fill_deconv_arg(struct DeconvArgs* arg, framework::Tensor* input,
framework::Tensor* out, framework::Tensor* filter,
ActivationType activation_enable,
int16_t leaky_relu_negative_slope, int group_num,
int stride_h, int stride_w, int padding_h, int padding_w,
float* bs_ptr);
void fill_dwconv_arg(struct DWconvArgs* arg, framework::Tensor* input,
framework::Tensor* out, framework::Tensor* filter,
ActivationType activation_enable,
int16_t leaky_relu_negative_slope, int stride_h,
int stride_w, int padding_h, int padding_w,
float* bias_ptr);
void fill_DWDeconv_arg(struct DWDeconvArgs* arg, framework::Tensor* input,
framework::Tensor* out, framework::Tensor* filter,
ActivationType activation_enable,
int16_t leaky_relu_negative_slope, int stride_h,
int stride_w, int padding_h, int padding_w,
float* bs_ptr);
void format_deconv_filter(framework::Tensor* filter_tensor, float max_value,
int group_num, int stride);
void format_dwconv_filter(framework::Tensor* filter_tensor, float* scale_ptr);
void format_conv_data(framework::Tensor* filter_tensor,
framework::Tensor* ofm_tensor, float** bs_ptr, int group);
void format_deconv_data(framework::Tensor* filter_tensor,
framework::Tensor* ofm_tensor, float** bs_ptr,
int group, int sub_conv_n);
void format_dwconv_data(framework::Tensor* filter_tensor,
framework::Tensor* ofm_tensor, float* scale_ptr,
float** bias_ptr);
void format_DWDeconv_data(framework::Tensor* filter_tensor,
framework::Tensor* ofm_tensor, float** bs_ptr,
int group, int sub_conv_n);
template <typename Dtype>
void savefile(std::string filename, void* buffer, int dataSize, Dtype tmp) {
float data;
std::ofstream out(filename.c_str());
for (int i = 0; i < dataSize; ++i) {
data = (((Dtype*)buffer)[i]); // NOLINT
out << data << std::endl;
}
out.close();
return;
}
} // namespace fpga } // namespace fpga
} // namespace paddle_mobile } // namespace paddle_mobile
...@@ -20,26 +20,81 @@ namespace paddle_mobile { ...@@ -20,26 +20,81 @@ namespace paddle_mobile {
namespace fpga { namespace fpga {
namespace bias_scale { namespace bias_scale {
void align_element(float **data_in, int num, int num_after_alignment) { void align_element(float **data_in, int num_per_div_before_alignment, int num) {
int copynum = 0;
float *ptr_unaligned = *data_in; float *ptr_unaligned = *data_in;
int total_element = 2 * num_after_alignment; // including bias & scale int div_num =
(num + num_per_div_before_alignment - 1) / num_per_div_before_alignment;
int num_per_div_after_alignment =
align_to_x(num_per_div_before_alignment, BS_NUM_ALIGNMENT);
int num_element =
2 * div_num * num_per_div_after_alignment; // including bias & scale
float *ptr_aligned = float *ptr_aligned =
(float *)fpga_malloc(total_element * sizeof(float)); // NOLINT (float *)fpga_malloc(num_element * sizeof(float)); // NOLINT
memset(ptr_aligned, 0, total_element * sizeof(float));
for (int i = 0; i < num; i++) { memset(ptr_aligned, 0, num_element * sizeof(float));
ptr_aligned[i * 2 + 0] = ptr_unaligned[i];
ptr_aligned[i * 2 + 1] = ptr_unaligned[i + num]; for (int i = 0; i < div_num; i++) {
if (i == div_num - 1) {
copynum = (num_per_div_after_alignment * div_num > num)
? (num % num_per_div_after_alignment)
: (num_per_div_before_alignment);
} else {
copynum = num_per_div_before_alignment;
}
memcpy(ptr_aligned + i * num_per_div_after_alignment,
ptr_unaligned + num_per_div_before_alignment * i,
copynum * sizeof(float));
memcpy(ptr_aligned + (div_num + i) * num_per_div_after_alignment,
ptr_unaligned + num_per_div_before_alignment * i + num,
copynum * sizeof(float));
} }
fpga_free(ptr_unaligned); fpga_free(ptr_unaligned);
*data_in = ptr_aligned; *data_in = ptr_aligned;
} }
void format_bias_scale_array(float **data_in, int num, void interleave(float **data_in, int num_after_alignment) {
int num_after_alignment) { // num_after_alignment: number of bias after alignment
align_element(data_in, num, num_after_alignment);
fpga_flush(*data_in, 2 * num_after_alignment * sizeof(float)); float *ptr_uninterleaved = *data_in;
float *ptr_interleaved =
(float *)fpga_malloc(2 * num_after_alignment * sizeof(float)); // NOLINT
int num = num_after_alignment / 4;
for (int i = 0; i < num; i++) {
memcpy(ptr_interleaved + 8 * i, ptr_uninterleaved + 4 * i,
4 * sizeof(float));
memcpy(ptr_interleaved + 8 * i + 4,
ptr_uninterleaved + num_after_alignment + 4 * i, 4 * sizeof(float));
}
fpga_free(ptr_uninterleaved);
*data_in = ptr_interleaved;
}
void format_bias_scale_array(float **bias_scale_array,
int element_num_per_division, int num) {
align_element(bias_scale_array, element_num_per_division, num);
int div_num = (num + element_num_per_division - 1) / element_num_per_division;
int element_num_after_division =
align_to_x(element_num_per_division, BS_NUM_ALIGNMENT);
interleave(bias_scale_array, div_num * element_num_after_division);
fpga_flush(*bias_scale_array, 2 * element_num_after_division * sizeof(float));
}
void format_bias_array(float **bias_array, int num) {
float *ptr_unaligned = *bias_array;
int num_before_align = num;
int num_after_align = align_to_x(num_before_align, BIAS_NUM_ALIGNMENT);
int16_t *ptr_aligned =
(int16_t *)fpga_malloc(num_after_align * sizeof(int16_t)); // NOLINT
memset(ptr_aligned, 0, num_after_align * sizeof(int16_t));
for (int i = 0; i < num_before_align; i++) {
ptr_aligned[i] = fp32_2_fp16(ptr_unaligned[i]);
}
*bias_array = (float *)ptr_aligned; // NOLINT
fpga_free(ptr_unaligned);
} }
} // namespace bias_scale } // namespace bias_scale
......
...@@ -18,8 +18,11 @@ namespace paddle_mobile { ...@@ -18,8 +18,11 @@ namespace paddle_mobile {
namespace fpga { namespace fpga {
namespace bias_scale { namespace bias_scale {
void align_element(float **data_in, int num, int num_after_alignment); void align_element(float** data_in, int num_per_div_before_alignment, int num);
void format_bias_scale_array(float **data_in, int num, int num_after_alignment); void interleave(float** data_in, int num_after_alignment);
void format_bias_scale_array(float** bias_scale_array,
int element_num_per_division, int num);
void format_bias_array(float** bias_array, int num);
} // namespace bias_scale } // namespace bias_scale
} // namespace fpga } // namespace fpga
......
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "fpga/V2/deconv_bias_scale.h"
// #include "deconv_bias_scale.h"
#include "fpga/V2/bias_scale.h"
// #include "bias_scale.h"
// #include <memory.h>
#include "fpga/V2/api.h"
// #include "fpga_api.h"
namespace paddle_mobile {
namespace fpga {
namespace deconv_bias_scale {
void deconv_bias_scale_expand(float** bias_scale_array, int num,
int sub_conv_n) {
int sub_num = num * sub_conv_n;
float* ptr_tmp = *bias_scale_array;
float* ptr_bias_scale_expand =
reinterpret_cast<float*>(fpga_malloc(sizeof(float) * sub_num * 2));
int scale_base_offset = sub_num;
for (int i = 0; i < sub_conv_n; ++i) {
int offset = num * i;
// copy bias
fpga_copy(ptr_bias_scale_expand + offset, ptr_tmp, num * sizeof(float));
// copy scale
fpga_copy(ptr_bias_scale_expand + scale_base_offset + offset, ptr_tmp + num,
num * sizeof(float));
}
*bias_scale_array = ptr_bias_scale_expand;
fpga_free(ptr_tmp);
}
} // namespace deconv_bias_scale
} // namespace fpga
} // namespace paddle_mobile
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
namespace paddle_mobile {
namespace fpga {
namespace deconv_bias_scale {
void deconv_bias_scale_expand(float** bias_scale_array, int num,
int sub_conv_n);
} // namespace deconv_bias_scale
} // namespace fpga
} // namespace paddle_mobile
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "fpga/V2/deconv_filter.h"
#include <memory.h>
#include <algorithm>
// #include "deconv_filter.h"
#include "fpga/V2/filter.h"
// #include "filter.h"
#include "fpga/V2/api.h"
namespace paddle_mobile {
namespace fpga {
namespace deconv_filter {
/*
inverse kernel weights of each channel for every filter
*/
void deconv_inverse_filter(float** data_in, int num, int channel, int width,
int height) {
float* tmp = *data_in;
int data_size = num * channel * width * height;
int hw_len = height * width;
auto tmp_data =
reinterpret_cast<float*>(fpga_malloc(data_size * sizeof(float)));
for (int i = 0; i < num; ++i) {
for (int j = 0; j < channel; ++j) {
for (int k = 0; k < hw_len; ++k) {
tmp_data[i * channel * hw_len + j * hw_len + k] =
(*data_in)[i * channel * hw_len + j * hw_len + hw_len - k - 1];
}
}
}
*data_in = tmp_data;
fpga_free(tmp);
}
/*
calculate sub padding number
*/
int deconv_calc_sub_pad(int filter_axis, int pad, int stride) {
if (stride == 0 || ((filter_axis - pad - 1) < 0)) {
PADDLE_MOBILE_ENFORCE(false, "Wrong deconv parameters");
}
return (filter_axis - pad - 1) / stride;
}
int deconv_get_sub_filter_axis(int filter_axis, int stride) {
return (filter_axis / stride);
}
int deconv_get_sub_out_axis(int image_axis, int sub_pad, int sub_filter_axis) {
return ((image_axis + 2 * sub_pad - sub_filter_axis) + 1);
}
/*
(filter_width-pad,filter_width-pad) is the first pixel of sub-pixel image
position. so the omit rows or columns is (stride - )
*/
int deconv_get_omit(int stride, int filter_width, int pad) {
PADDLE_MOBILE_ENFORCE(filter_width > pad, "Wrong deconv parameters");
int idx;
bool flag = false;
for (idx = 1; idx <= stride; ++idx) {
int j = idx;
for (; j <= filter_width;) {
if (j == filter_width - pad) {
flag = true;
break;
}
j = j + stride;
}
if (flag) {
break;
}
}
return (stride - idx);
}
template <typename T>
void deconv_get_sub_filter(T** data_in, int height, int width, int sub_conv_n,
int kernel_num, int channel) {
T* ptr_tmp = *data_in;
int sub_num = kernel_num * sub_conv_n;
int sub_h = height / sub_conv_n;
int sub_w = width / sub_conv_n;
int sub_filter_size =
kernel_num * sub_h * sub_w * channel * sub_conv_n * sub_conv_n;
T* ptr_sub_filter =
reinterpret_cast<T*>(fpga_malloc(sub_filter_size * sizeof(T)));
for (int idx = 0; idx < sub_conv_n; ++idx) {
for (int nn = 0; nn < sub_num; ++nn) {
int ni = nn % kernel_num;
int woff = sub_conv_n - 1 - (nn / kernel_num); //
for (int hh = 0; hh < sub_h; ++hh) {
int hi = hh * sub_conv_n + idx % sub_conv_n;
for (int ww = 0; ww < sub_w; ++ww) {
int wi = ww * sub_conv_n + woff; // 1 0
int sidx = ((nn * sub_h + hh) * sub_w + ww) * channel; //
int kidx = ((ni * height + hi) * width + wi) * channel; //
fpga_copy(
ptr_sub_filter + idx * sub_h * sub_w * channel * sub_num + sidx,
(*data_in) + kidx, channel * sizeof(T));
// for (int cc =0; cc < channel; ++cc) {
// ptr_sub_filter[idx*sub_h*sub_w*channel*sub_num + sidx + cc] =
// (*data_in)[kidx + cc];
// }
}
}
}
}
*data_in = ptr_sub_filter;
fpga_free(ptr_tmp);
}
void deconv_NC_convert(float** filter_in, int kernel_num, int channels,
int hw) {
float* tmp = *filter_in;
float* ptr_filter = reinterpret_cast<float*>(paddle_mobile::fpga::fpga_malloc(
hw * kernel_num * channels * sizeof(float)));
for (int c = 0; c < channels; ++c) {
for (int n = 0; n < kernel_num; ++n) {
paddle_mobile::fpga::fpga_copy(ptr_filter + n * hw + kernel_num * hw * c,
tmp + n * channels * hw + c * hw,
hw * sizeof(float));
}
}
*filter_in = ptr_filter;
paddle_mobile::fpga::fpga_free(tmp);
}
void deconv_format_filter(float** data_in, int num, int channel, int height,
int width, int group_num, float max, int stride) {
int data_size = channel * height * width * num;
/*{
float result2 = (float)0;
string filename = "origin_filter_data";
api::savefile<float>(filename, (void *)*data_in, data_size, result2);
}*/
deconv_inverse_filter(data_in, num, channel, width, height);
/* {
float result2 = (float)0;
string filename = "inverse_filter_data";
api::savefile<float>(filename, (void *)*data_in, data_size, result2);
}*/
filter::quantize(data_in, data_size, max);
/* {
char result2 = (char)0;
string filename = "quantize_filter_data";
api::savefile<char>(filename, (void *)*data_in, data_size, result2);
}*/
char** quantize_data = (char**)data_in; // NOLINT
filter::convert_to_hwc(quantize_data, num, channel, height, width);
/*{
char result2 = (char)0;
string filename = "convert_to_hwc_filter_data";
api::savefile<char>(filename, (void *)*quantize_data, data_size,
result2);
}*/
deconv_get_sub_filter<char>(quantize_data, height, width, stride, num,
channel);
/*{
char result2 = (char)0;
string filename = "sub_filter_filter_data";
api::savefile<char>(filename, (void *)*quantize_data, data_size, result2);
}*/
int sub_conv_n = stride;
int sub_h = height / sub_conv_n;
int sub_w = width / sub_conv_n;
int sub_chw = sub_h * sub_w * channel;
int sub_num = sub_conv_n * num;
int division_capacity = filter::calc_division_capacity(sub_chw);
int num_per_div_before_alignment =
filter::calc_num_per_div(sub_num, group_num, division_capacity);
int num_per_div_after_alignment =
align_to_x(num_per_div_before_alignment, FILTER_NUM_ALIGNMENT);
int div_num = (sub_num + num_per_div_before_alignment - 1) /
num_per_div_before_alignment;
int residual = (sub_num) % num_per_div_before_alignment;
int num_after_alignment = num_per_div_after_alignment *
((residual == 0) ? div_num : (div_num - 1)) +
align_to_x(residual, FILTER_NUM_ALIGNMENT);
char** ptr_ptr_data =
reinterpret_cast<char**>(fpga_malloc(sub_conv_n * sizeof(char*)));
int origin_offset = sub_chw * sub_num;
for (int i = 0; i < sub_conv_n; ++i) {
(ptr_ptr_data)[i] =
reinterpret_cast<char*>(fpga_malloc(origin_offset * sizeof(char)));
fpga_copy((ptr_ptr_data)[i], (*quantize_data) + origin_offset * i,
origin_offset * sizeof(char));
/* char result2 = (char)0;
string filename = "ptr_ptr_data" + to_string(i);
api::savefile<char>(filename, (void *)(ptr_ptr_data[i]), origin_offset,
result2);
*/
}
// char result2 = (char)0;
// string filename = "interleave";
// api::savefile<char>(filename, (void *)*ptr_ptr_data, origin_offset,
// result2);
fpga_free(*quantize_data);
int align_offset =
align_to_x(sub_chw, FILTER_ELEMENT_ALIGNMENT) * num_after_alignment;
char* ptr_space = reinterpret_cast<char*>(fpga_malloc(
sub_conv_n * align_offset * sizeof(char))); // continuous space
for (int i = 0; i < sub_conv_n; ++i) {
char* ptr_tmp = (ptr_ptr_data)[i];
filter::align_element(&ptr_tmp, sub_num, sub_chw);
filter::align_num(&ptr_tmp, num_per_div_before_alignment, sub_num, sub_chw);
filter::reorder(&ptr_tmp, num_after_alignment, sub_chw);
filter::interleave(&ptr_tmp, num_after_alignment, sub_chw);
/* char result2 = (char)0;
string filename = "interleave" + to_string(i);
api::savefile<char>(filename, (void *)ptr_tmp, align_offset, result2);
*/
fpga_copy(ptr_space + i * align_offset, ptr_tmp, align_offset);
fpga_free(ptr_tmp);
}
fpga_free(ptr_ptr_data);
*data_in = reinterpret_cast<float*>(ptr_space);
/* {
char result2 = (char)0;
string filename = "ptr_space";
api::savefile<char>(filename, (void *)ptr_space, sub_conv_n *
align_offset, result2);
}*/
fpga_flush(ptr_space, sub_conv_n * align_offset * sizeof(char));
}
void DWDconv_format_filter(float** data_in, int num, int channel, int height,
int width, float* scale_ptr, int stride) {
deconv_inverse_filter(data_in, num, channel, width, height);
filter::quantize_to_fp16(data_in, channel, height, width, scale_ptr);
int16_t** quantize_data = (int16_t**)data_in; // NOLINT
filter::convert_to_hwn(quantize_data, channel, height, width);
deconv_get_sub_filter<int16_t>(quantize_data, height, width, stride, num,
channel);
filter::align_element_n(quantize_data, channel, height, width);
fpga_flush(*quantize_data, align_to_x(channel, FILTER_ELEMENT_ALIGNMENT) *
height * width * sizeof(int16_t));
}
} // namespace deconv_filter
} // namespace fpga
} // namespace paddle_mobile
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
namespace paddle_mobile {
namespace fpga {
namespace deconv_filter {
void deconv_inverse_filter(float** data_in, int num, int channel, int width,
int height);
int deconv_calc_sub_pad(int filter_axis, int pad, int stride);
int deconv_get_sub_filter_axis(int filter_axis, int stride);
int deconv_get_sub_out_axis(int image_axis, int sub_pad, int sub_filter_axis);
int deconv_get_omit(int stride, int filter_width, int pad);
template <typename T>
void deconv_get_sub_filter(T** data_in, int height, int width, int sub_conv_n,
int kernel_num, int channel);
void deconv_format_filter(float** data_in, int num, int channel, int height,
int width, int group_num, float max, int stride);
void deconv_NC_convert(float** filter_in, int kernel_num, int channels, int hw);
void DWDconv_format_filter(float** data_in, int num, int channel, int height,
int width, float* scale_ptr, int stride);
} // namespace deconv_filter
} // namespace fpga
} // namespace paddle_mobile
...@@ -16,44 +16,53 @@ limitations under the License. */ ...@@ -16,44 +16,53 @@ limitations under the License. */
#include <memory.h> #include <memory.h>
#include <algorithm> #include <algorithm>
#include "fpga/common/fpga_common.h" #include "fpga/common/fpga_common.h"
namespace paddle_mobile { namespace paddle_mobile {
namespace fpga { namespace fpga {
namespace filter { namespace filter {
int calc_channel_parallelism(int channel) { int calc_division_capacity(int chw) {
if (channel <= 16) { int n = 2048 / ((chw + 15) / 16) * 32;
return 16; return n < 2048 ? n : 2048;
} else if (channel <= 32) {
return 32;
} else if (channel <= 64) {
return 64;
} else {
return 128;
}
}
int calc_aligned_channel(int channel) {
return align_to_x(channel, calc_channel_parallelism(channel));
} }
int calc_num_parallelism(int channel) { int calc_split_num(int num, int division_capacity) {
return FILTER_PARALLELISM / calc_channel_parallelism(channel); return (num + division_capacity - 1) / division_capacity;
} }
int calc_aligned_num(int num, int channel) { int calc_division_number(int num, int group_num, int division_capacity) {
return align_to_x(num, calc_num_parallelism(channel)); // PADDLE_MOBILE_ENFORCE(num % group_num == 0,
// "Filter number should be divisible by group
// number");
int split_num = calc_split_num(num, division_capacity);
// PADDLE_MOBILE_ENFORCE(group_num == 1 || split_num == 1,
// "Split number or group number should be 1");
return group_num * split_num;
} }
int calc_aligned_total_pixel_num(int num, int channel, int height, int width) { int calc_num_per_div(int num, int group_num, int division_capacity) {
int aligned_channel = calc_aligned_channel(channel); // PADDLE_MOBILE_ENFORCE(num % group_num == 0,
int aligned_filter_num = calc_aligned_num(num, channel); // "Filter number should be divisible by group
return aligned_filter_num * aligned_channel * height * width; // number");
int split_num = calc_split_num(num, division_capacity);
// PADDLE_MOBILE_ENFORCE(group_num == 1 || split_num == 1,
// "Split number or group number should be 1");
if (group_num == 1) {
if (num > division_capacity) {
return division_capacity;
} else {
return num;
}
} else {
return (num + group_num - 1) / group_num;
}
} }
void convert_to_hwc(float **data_in, int num, int channel, int height, void convert_to_hwc(char **data_in, int num, int channel, int height,
int width) { int width) {
float *tmp = *data_in; char *tmp = *data_in;
int chw = channel * height * width; int chw = channel * height * width;
float *data_tmp = (float *)fpga_malloc(chw * num * sizeof(float)); // NOLINT char *data_tmp = (char *)fpga_malloc(chw * num * sizeof(char)); // NOLINT
for (int n = 0; n < num; n++) { for (int n = 0; n < num; n++) {
int64_t amount_per_row = width * channel; int64_t amount_per_row = width * channel;
for (int c = 0; c < channel; c++) { for (int c = 0; c < channel; c++) {
...@@ -66,52 +75,170 @@ void convert_to_hwc(float **data_in, int num, int channel, int height, ...@@ -66,52 +75,170 @@ void convert_to_hwc(float **data_in, int num, int channel, int height,
} }
} }
} }
*data_in = data_tmp; *data_in = data_tmp;
fpga_free(tmp); fpga_free(tmp);
} }
void align_filter(float **data_in, int num, int channel, int height, float find_max(float *data_in, int data_size) {
int width) { float max = 0.0;
int aligned_channel = calc_aligned_channel(channel); for (int i = 0; i < data_size; ++i) {
int hw = height * width; float value = data_in[i];
int pixel_num = calc_aligned_total_pixel_num(num, channel, height, width); float abs = value > 0 ? value : -value;
float *new_data = (float *)fpga_malloc(pixel_num * sizeof(float)); // NOLINT max = std::max(max, abs);
float *temp = *data_in; }
memset(new_data, 0, pixel_num * sizeof(float)); return max;
for (int i = 0; i < num; i++) { }
for (int j = 0; j < hw; j++) {
memcpy(new_data + i * aligned_channel * hw + j * aligned_channel, signed char float_to_int8(float fdata) {
temp + i * channel * hw + j * channel, channel * sizeof(float)); if (fdata < 0.0) {
} fdata -= 0.5;
} } else {
*data_in = new_data; fdata += 0.5;
fpga_free(temp); }
} return (signed char)fdata;
void convert_to_fp16(float **data_in, int data_size) { }
void quantize(float **data_in, int data_size, float max) {
float *tmp = *data_in; float *tmp = *data_in;
// half_float::half *tmp_data = (half_float::half *)fpga_malloc(data_size * float fix_range = 127;
// sizeof(half_float::half)); float scale = fix_range / max;
int16_t *tmp_data =
(int16_t *)fpga_malloc(data_size * sizeof(int16_t)); // NOLINT signed char *tmp_data = (signed char *)fpga_malloc(data_size * sizeof(char));
for (int i = 0; i < data_size; i++) { for (int i = 0; i < data_size; i++) {
// tmp_data[i] = (half_float::half)((*data_in)[i]); tmp_data[i] = float_to_int8(
tmp_data[i] = fp32_2_fp16((*data_in)[i]); (*data_in)[i] * scale); // (signed char)((*data_in)[i] * scale);
} }
*data_in = (float *)tmp_data; // NOLINT *data_in = (float *)tmp_data; // NOLINT
fpga_free(tmp); fpga_free(tmp);
} }
void align_element(char **data_in, int num, int chw) {
int i = 0;
int j = 0;
int align_chw = align_to_x(chw, FILTER_ELEMENT_ALIGNMENT);
if (align_chw != chw) {
char *tmp = *data_in;
char *data_tmp =
(char *)fpga_malloc(num * align_chw * sizeof(char)); // NOLINT
memset(data_tmp, 0, num * align_chw);
for (j = 0; j < num; j++) {
memcpy(data_tmp + j * align_chw, (*data_in) + j * chw, chw);
}
*data_in = data_tmp;
fpga_free(tmp);
}
}
void align_num(char **data_in, int num_per_div_before_alignment, int num,
int chw) {
int i = 0;
int align_chw = align_to_x(chw, FILTER_ELEMENT_ALIGNMENT);
int num_per_div_after_alignment =
align_to_x(num_per_div_before_alignment, FILTER_NUM_ALIGNMENT);
char *tmp = *data_in;
int div_num =
(num + num_per_div_before_alignment - 1) / num_per_div_before_alignment;
int num_element = div_num * num_per_div_after_alignment * align_chw;
char *data_tmp = (char *)fpga_malloc(num_element * sizeof(char)); // NOLINT
memset(data_tmp, 0, num_element * sizeof(char));
for (i = 0; i < div_num - 1; i++) {
memcpy(data_tmp + num_per_div_after_alignment * align_chw * i,
*data_in + num_per_div_before_alignment * align_chw * i,
num_per_div_before_alignment * align_chw);
}
memcpy(data_tmp + num_per_div_after_alignment * align_chw * i,
*data_in + num_per_div_before_alignment * align_chw * i,
(num - (div_num - 1) * num_per_div_before_alignment) * align_chw);
*data_in = data_tmp;
fpga_free(tmp);
}
void reorder(char **data_in, int num_after_alignment, int chw) {
int index = 0;
int new_index;
int chw_align = align_to_x(chw, FILTER_ELEMENT_ALIGNMENT);
char *data_tmp =
(char *)fpga_malloc(chw_align * num_after_alignment * // NOLINT
sizeof(char));
char *tmp = *data_in;
for (index = 0; index < num_after_alignment; index++) {
new_index = index / 32 * 32 + (index % 16 / 4 * 8) + (index % 16 % 4) +
(index / 16 % 2 * 4);
memcpy(data_tmp + index * chw_align, *data_in + new_index * chw_align,
chw_align);
}
*data_in = data_tmp;
fpga_free(tmp);
}
void interleave(char **data_in, int num_after_alignment, int chw) {
int i = 0;
int j = 0;
int k = 0;
int interleave_per_num = 16;
int chw_align = align_to_x(chw, FILTER_ELEMENT_ALIGNMENT);
char *data_tmp =
(char *)fpga_malloc(chw_align * num_after_alignment * // NOLINT
sizeof(char));
char *tmp = *data_in;
int interleave_num = chw_align * 2 / interleave_per_num;
for (i = 0; i < num_after_alignment; i += 2) {
for (j = 0, k = 0; j < interleave_num; j += 2, k++) {
memcpy(data_tmp + i * chw_align + interleave_per_num * j,
*data_in + i * chw_align + interleave_per_num * k,
interleave_per_num);
memcpy(data_tmp + i * chw_align + interleave_per_num * (j + 1),
*data_in + (i + 1) * chw_align + interleave_per_num * k,
interleave_per_num);
}
}
*data_in = data_tmp;
fpga_free(tmp);
}
void format_filter(float **data_in, int num, int channel, int height, int width, void format_filter(float **data_in, int num, int channel, int height, int width,
int group_num, float max) { int group_num, float max) {
convert_to_hwc(data_in, num, channel, height, width); int data_size = channel * height * width * num;
align_filter(data_in, num, channel, height, width); int chw = channel * height * width;
int pixel_num = calc_aligned_total_pixel_num(num, channel, height, width);
convert_to_fp16(data_in, pixel_num); int division_capacity = calc_division_capacity(chw);
fpga_flush(*data_in, pixel_num * sizeof(float)); int num_per_div_before_alignment =
calc_num_per_div(num, group_num, division_capacity);
int num_per_div_after_alignment =
align_to_x(num_per_div_before_alignment, FILTER_NUM_ALIGNMENT);
int div_num =
(num + num_per_div_before_alignment - 1) / num_per_div_before_alignment;
int residual = num % num_per_div_before_alignment;
int num_after_alignment = num_per_div_after_alignment *
((residual == 0) ? div_num : (div_num - 1)) +
align_to_x(residual, FILTER_NUM_ALIGNMENT);
quantize(data_in, data_size, max);
char **quantize_data = (char **)data_in; // NOLINT
convert_to_hwc(quantize_data, num, channel, height, width);
align_element(quantize_data, num, chw);
if (num_after_alignment != num) {
align_num(quantize_data, num_per_div_before_alignment, num, chw);
}
reorder(quantize_data, num_after_alignment, chw);
interleave(quantize_data, num_after_alignment, chw);
fpga_flush(*quantize_data, align_to_x(chw, FILTER_ELEMENT_ALIGNMENT) *
num_after_alignment * sizeof(char));
} }
void convert_fc_filter(float **data_in, int num, int chw) { void convert_fc_filter(char **data_in, int num, int chw) {
float *tmp = *data_in; char *tmp = *data_in;
float *data_tmp = (float *)fpga_malloc(chw * num * sizeof(float)); // NOLINT char *data_tmp = (char *)fpga_malloc(chw * num * sizeof(char)); // NOLINT
for (int n = 0; n < num; n++) { for (int n = 0; n < num; n++) {
for (int c = 0; c < chw; c++) { for (int c = 0; c < chw; c++) {
data_tmp[n * chw + c] = (*data_in)[num * c + n]; data_tmp[n * chw + c] = (*data_in)[num * c + n];
...@@ -123,47 +250,113 @@ void convert_fc_filter(float **data_in, int num, int chw) { ...@@ -123,47 +250,113 @@ void convert_fc_filter(float **data_in, int num, int chw) {
void format_fc_filter(float **data_in, int num, int channel, int height, void format_fc_filter(float **data_in, int num, int channel, int height,
int width, int group_num, float max) { int width, int group_num, float max) {
int data_size = channel * height * width * num;
int chw = channel * height * width; int chw = channel * height * width;
convert_fc_filter(data_in, num, chw);
align_filter(data_in, num, channel, height, width);
int pixel_num = calc_aligned_total_pixel_num(num, channel, height, width);
convert_to_fp16(data_in, pixel_num);
fpga_flush(*data_in, pixel_num * sizeof(float));
}
float find_max(float *data_in, int data_size) { int division_capacity = calc_division_capacity(chw);
float max = 0.0; int num_per_div_before_alignment =
for (int i = 0; i < data_size; ++i) { calc_num_per_div(num, group_num, division_capacity);
float value = data_in[i]; int num_per_div_after_alignment =
float abs = value > 0 ? value : -value; align_to_x(num_per_div_before_alignment, FILTER_NUM_ALIGNMENT);
max = std::max(max, abs); int div_num =
(num + num_per_div_before_alignment - 1) / num_per_div_before_alignment;
int residual = num % num_per_div_before_alignment;
int num_after_alignment = num_per_div_after_alignment *
((residual == 0) ? div_num : (div_num - 1)) +
align_to_x(residual, FILTER_NUM_ALIGNMENT);
quantize(data_in, data_size, max);
char **quantize_data = (char **)data_in; // NOLINT
convert_fc_filter(quantize_data, num, chw);
convert_to_hwc(quantize_data, num, channel, height, width);
align_element(quantize_data, num, chw);
if (num_after_alignment != num) {
align_num(quantize_data, num_per_div_before_alignment, num, chw);
} }
return max; reorder(quantize_data, num_after_alignment, chw);
interleave(quantize_data, num_after_alignment, chw);
fpga_flush(*quantize_data, align_to_x(chw, FILTER_ELEMENT_ALIGNMENT) *
num_after_alignment * sizeof(char));
}
void convert_to_hwn(int16_t **data_in, int num, int height, int width) {
int16_t *tmp = *data_in;
int16_t *data_tmp =
(int16_t *)fpga_malloc(height * width * num * sizeof(int16_t)); // NOLINT
for (int n = 0; n < num; n++) {
for (int h = 0; h < height; h++) {
for (int w = 0; w < width; w++) {
*(data_tmp + h * width * num + w * num + n) = *((*data_in)++);
}
}
}
*data_in = data_tmp;
fpga_free(tmp);
} }
signed char float_to_int8(float fdata) { void align_element_n(int16_t **data_in, int num, int height, int width) {
if (fdata < 0.0) { int unalign_n = num;
fdata -= 0.5; int align_n = align_to_x(num, FILTER_ELEMENT_ALIGNMENT);
if (unalign_n == align_n) {
return;
} else { } else {
fdata += 0.5; int16_t *tmp = *data_in;
int num_element = height * width * align_n;
int16_t *data_tmp =
(int16_t *)fpga_malloc(num_element * sizeof(int16_t)); // NOLINT
memset(data_tmp, 0, num_element * sizeof(int16_t));
for (int h = 0; h < height; h++) {
for (int w = 0; w < width; w++) {
int offset_unalign = h * width * unalign_n + w * unalign_n;
int offset_align = h * width * align_n + w * align_n;
for (int n = 0; n < unalign_n; n++) {
data_tmp[offset_align + n] = *((*data_in) + offset_unalign + n);
}
}
} }
return (signed char)fdata;
}
void quantize(float **data_in, int data_size, float max) { *data_in = data_tmp;
fpga_free(tmp);
}
}
void quantize_to_fp16(float **data_in, int num, int height, int width,
float *scale_ptr) {
float *tmp = *data_in; float *tmp = *data_in;
float fix_range = 127; int size = num * height * width;
float scale = fix_range / max;
signed char *tmp_data = (signed char *)fpga_malloc(data_size * sizeof(char)); int16_t *tmp_data = (int16_t *)fpga_malloc(size * sizeof(int16_t)); // NOLINT
for (int i = 0; i < data_size; i++) { for (int n = 0; n < num; n++) {
tmp_data[i] = float_to_int8( float scale_val = scale_ptr[n];
(*data_in)[i] * scale); // (signed char)((*data_in)[i] * scale); for (int h = 0; h < height; h++) {
for (int w = 0; w < width; w++) {
int index = n * height * width + h * width + w;
tmp_data[index] = fp32_2_fp16((*data_in)[index] * scale_val);
}
}
} }
*data_in = (float *)tmp_data; // NOLINT *data_in = (float *)tmp_data; // NOLINT
fpga_free(tmp); fpga_free(tmp);
} }
void format_dwconv_filter(float **data_in, int num, int height, int width,
float *scale_ptr) {
quantize_to_fp16(data_in, num, height, width, scale_ptr);
int16_t **quantize_data = (int16_t **)data_in; // NOLINT
convert_to_hwn(quantize_data, num, height, width);
align_element_n(quantize_data, num, height, width);
fpga_flush(*quantize_data, align_to_x(num, FILTER_ELEMENT_ALIGNMENT) *
height * width * sizeof(int16_t));
}
void format_DWDeconv_filter(float **data_in, int num, int height, int width,
float *scale_ptr) {
quantize_to_fp16(data_in, num, height, width, scale_ptr);
int16_t **quantize_data = (int16_t **)data_in; // NOLINT
convert_to_hwn(quantize_data, num, height, width);
align_element_n(quantize_data, num, height, width);
fpga_flush(*quantize_data, align_to_x(num, FILTER_ELEMENT_ALIGNMENT) *
height * width * sizeof(int16_t));
}
} // namespace filter } // namespace filter
} // namespace fpga } // namespace fpga
} // namespace paddle_mobile } // namespace paddle_mobile
...@@ -13,25 +13,38 @@ See the License for the specific language governing permissions and ...@@ -13,25 +13,38 @@ See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#pragma once #pragma once
#include <cstdint>
#define FILTER_PARALLELISM 1024
namespace paddle_mobile { namespace paddle_mobile {
namespace fpga { namespace fpga {
namespace filter { namespace filter {
int calc_channel_parallelism(int channel); int calc_division_capacity(int chw);
int calc_aligned_channel(int channel); int calc_split_num(int num, int division_capacity);
int calc_num_parallelism(int channel); int calc_division_number(int num, int group_num, int division_capacity);
int calc_aligned_num(int num, int channel); int calc_num_per_div(int num, int group_num, int division_capacity);
int calc_aligned_total_pixel_num(int num, int channel, int height, int width); void convert_to_hwc(char** data_in, int num, int channel, int height,
void convert_to_hwc(float** data_in, int num, int channel, int height,
int width); int width);
float find_max(float* data_in, int data_size);
void quantize(float** data_in, int data_size, float max);
void align_element(char** data_in, int num, int chw);
void align_num(char** data_in, int num_per_div_before_alignment, int num,
int chw);
void reorder(char** data_in, int num_after_alignment, int chw);
void interleave(char** data_in, int num_after_alignment, int chw);
void format_filter(float** data_in, int num, int channel, int height, int width, void format_filter(float** data_in, int num, int channel, int height, int width,
int group_num, float max); int group_num, float max);
void convert_fc_filter(float** data_in, int num, int chw);
void convert_fc_filter(char** data_in, int num, int chw);
void format_fc_filter(float** data_in, int num, int channel, int height, void format_fc_filter(float** data_in, int num, int channel, int height,
int width, int group_num, float max); int width, int group_num, float max);
float find_max(float* data_in, int data_size);
void convert_to_hwn(int16_t** data_in, int num, int height, int width);
void align_element_n(int16_t** data_in, int num, int height, int width);
void quantize_to_fp16(float** data_in, int num, int height, int width,
float* scale_ptr);
void format_dwconv_filter(float** data_in, int num, int height, int width,
float* scale_ptr);
} // namespace filter } // namespace filter
} // namespace fpga } // namespace fpga
} // namespace paddle_mobile } // namespace paddle_mobile
...@@ -13,80 +13,124 @@ See the License for the specific language governing permissions and ...@@ -13,80 +13,124 @@ See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#include "fpga/V2/image.h" #include "fpga/V2/image.h"
#include <memory.h>
#include <algorithm>
#include "fpga/common/fpga_common.h"
namespace paddle_mobile { namespace paddle_mobile {
namespace fpga { namespace fpga {
namespace image { namespace image {
void convert_to_hwc(float **data_in, int channel, int height, int width) { void convert_to_hwc(float **data_in, int channel, int height, int width,
float *tmp = *data_in; int num) {
float *data_tmp = float *data_tmp = reinterpret_cast<float *>(
(float *)fpga_malloc(channel * height * width * sizeof(float)); // NOLINT fpga_malloc(num * channel * height * width * sizeof(float)));
int64_t amount_per_row = width * channel; int64_t amount_per_row = width * channel;
for (int n = 0; n < num; n++) {
for (int c = 0; c < channel; c++) { for (int c = 0; c < channel; c++) {
for (int h = 0; h < height; h++) { for (int h = 0; h < height; h++) {
int64_t offset_height = h * amount_per_row; int64_t offset_height = h * amount_per_row;
for (int w = 0; w < width; w++) { for (int w = 0; w < width; w++) {
*(data_tmp + offset_height + w * channel + c) = *((*data_in)++); *(data_tmp + n * channel * height * width + offset_height +
w * channel + c) = *((*data_in)++);
}
} }
} }
} }
*data_in = data_tmp; *data_in = data_tmp;
fpga_free(tmp);
} }
void align_image(float **data_in, int channel, int height, int width,
int aligned_channel) {
if (channel == aligned_channel) return;
float *tmp = *data_in;
float *new_data =
(float *)fpga_malloc(aligned_channel * height * width * // NOLINT
sizeof(float)); // NOLINT
memset(new_data, 0, aligned_channel * height * width * sizeof(float));
for (int i = 0; i < height * width; i++) { void convert_to_chw(float **data_in, int channel, int height, int width,
memcpy(new_data + i * aligned_channel, tmp + i * channel, int num) {
channel * sizeof(float)); float *data_tmp =
(float *)fpga_malloc(channel * height * width * sizeof(float)); // NOLINT
int64_t amount_per_side = width * height;
for (int n = 0; n < num; n++) {
for (int h = 0; h < height; h++) {
for (int w = 0; w < width; w++) {
for (int c = 0; c < channel; c++) {
*(data_tmp + n * height * width * channel + c * amount_per_side +
width * h + w) = *((*data_in)++);
} }
*data_in = new_data; }
fpga_free(tmp); }
} }
*data_in = data_tmp;
void format_image(float **data_in, int channel, int height, int width,
int aligned_channel) {
convert_to_hwc(data_in, channel, height, width);
align_image(data_in, channel, height, width, aligned_channel);
fpga_flush(*data_in, aligned_channel * height * width * sizeof(float));
} }
void concat_images(int16_t **images_in, float **scales_in, void *image_out, void concat_images(int16_t **images_in, float **scales_in, void *image_out,
float *scale_out, int image_num, const uint32_t *channel_num, float *scale_out, int image_num, uint32_t *channel_num,
int height, int width, const uint32_t *aligned_channel_num, int height, int width) {
int out_channel) { int i = 0;
int hw = height * width; int j = 0;
int k = 0;
int each_out_line_channel = 0;
int align_each_out_area_cw = 0;
int align_each_in_area_cw = 0;
int align_each_out_area_cw_differ = 0;
int tmp_channel = 0;
scale_out[0] = 0.0; scale_out[0] = 0.0;
scale_out[1] = 0.0; scale_out[1] = 0.0;
for (int i = 0; i < image_num; i++) { for (i = 0; i < image_num; i++) {
each_out_line_channel += channel_num[i];
scale_out[0] = std::max(*scale_out, scales_in[i][0]); scale_out[0] = std::max(*scale_out, scales_in[i][0]);
fpga_invalidate(images_in[i], fpga_invalidate(images_in[i],
height * width * aligned_channel_num[i] * sizeof(int16_t)); height *
align_to_x(channel_num[i] * width, IMAGE_ALIGNMENT) *
sizeof(int16_t));
} }
scale_out[1] = 1 / scale_out[0]; scale_out[1] = 1 / scale_out[0];
align_each_out_area_cw =
align_to_x(each_out_line_channel * width, IMAGE_ALIGNMENT);
align_each_out_area_cw_differ =
align_each_out_area_cw - each_out_line_channel * width;
for (int j = 0; j < hw; j++) { for (k = 0; k < height; k++) {
int tmp_channel_sum = 0; for (j = 0; j < width; j++) {
for (int i = 0; i < image_num; i++) { for (i = 0; i < image_num; i++) {
memcpy( align_each_in_area_cw =
(int16_t *)image_out + j * out_channel + tmp_channel_sum, // NOLINT align_to_x(channel_num[i] * width, IMAGE_ALIGNMENT);
images_in[i] + j * aligned_channel_num[i], memcpy((int16_t *)image_out + tmp_channel + // NOLINT
k * align_each_out_area_cw_differ,
images_in[i] + j * channel_num[i] + k * align_each_in_area_cw,
channel_num[i] * sizeof(int16_t)); channel_num[i] * sizeof(int16_t));
tmp_channel_sum += channel_num[i]; tmp_channel += channel_num[i];
} }
} }
fpga_flush(image_out, hw * out_channel * sizeof(int16_t)); }
fpga_flush(image_out, height * align_each_out_area_cw * sizeof(int16_t));
}
void split_image(int16_t *image_in, const float *scale_in, void **images_out,
float **scales_out, int image_num,
const uint32_t *channel_nums, int height, int width) {
int total_channel = 0;
for (int i = 0; i < image_num; i++) {
scales_out[i][0] = scale_in[0];
scales_out[i][1] = scale_in[1];
total_channel += channel_nums[i];
}
int element_num = height * align_to_x(width * total_channel, IMAGE_ALIGNMENT);
fpga_invalidate(image_in, element_num * sizeof(int16_t));
int src_offset = 0, des_offset = 0;
for (int h = 0; h < height; h++) {
for (int w = 0; w < width; w++) {
src_offset = h * align_to_x(total_channel * width, IMAGE_ALIGNMENT) +
w * total_channel;
for (int i = 0; i < image_num; i++) {
des_offset = h * align_to_x(channel_nums[i] * width, IMAGE_ALIGNMENT) +
w * channel_nums[i];
memcpy(reinterpret_cast<int16_t *>(images_out[i]) + des_offset,
image_in + src_offset, channel_nums[i] * sizeof(int16_t));
src_offset += channel_nums[i];
}
}
}
for (int i = 0; i < image_num; i++) {
element_num = height * align_to_x(width * channel_nums[i], IMAGE_ALIGNMENT);
fpga_flush(images_out[i], element_num * sizeof(int16_t));
}
} }
} // namespace image } // namespace image
......
...@@ -14,23 +14,63 @@ limitations under the License. */ ...@@ -14,23 +14,63 @@ limitations under the License. */
#pragma once #pragma once
#include <stdint.h> #include <memory.h>
#include <algorithm>
#include <cstdint>
#include "fpga/common/fpga_common.h"
namespace paddle_mobile { namespace paddle_mobile {
namespace fpga { namespace fpga {
namespace image { namespace image {
void convert_to_hwc(float **data_in, int channel, int height, int width); void convert_to_hwc(float** data_in, int channel, int height, int width,
void align_image(float **data_in, int channel, int height, int width, int num = 1);
int aligned_channel); void convert_to_chw(float** data_in, int channel, int height, int width,
void format_image(float **data_in, int channel, int height, int width, int num = 1);
int aligned_channel); // template <typename Dtype>
void concat_images( // void align_element_conv(Dtype** data_in, int height, int cw);
int16_t **images_in, float **scales_in, void *image_out, float *scale_out, // template <typename T>
int image_num, const uint32_t *channel_num, int height, int width, // void format_image(T** data_in, int channel, int height, int width);
const uint32_t *aligned_channel_num, template <typename Dtype>
int out_channel); // Concat featuremaps along channel direction void align_element_conv(Dtype** data_in, int height, int cw);
template <typename Dtype>
void align_element_conv(Dtype** data_in, int height, int cw) {
int h = 0;
int align_cw = align_to_x(cw, IMAGE_ALIGNMENT);
Dtype* data_tmp =
(Dtype*)fpga_malloc(height * align_cw * sizeof(Dtype)); // NOLINT
memset(data_tmp, 0, height * align_cw * sizeof(Dtype));
for (h = 0; h < height; h++) {
memcpy((void*)(data_tmp + h * align_cw), // NOLINT
(void*)(*data_in + h * cw), // NOLINT
cw * sizeof(Dtype));
}
*data_in = data_tmp;
}
template <typename T>
void format_image(T** data_in, int channel, int height, int width) {
int cw = channel * width;
int align_cw = align_to_x(cw, IMAGE_ALIGNMENT);
if (align_cw != cw) {
T* hwc_temp = *data_in;
align_element_conv(data_in, height, channel * width);
fpga_free(hwc_temp);
}
fpga_flush(*data_in,
align_to_x(channel * width, IMAGE_ALIGNMENT) * height * sizeof(T));
}
// Concat featuremaps along channel direction
void concat_images(int16_t** images_in, float** scales_in, void* image_out,
float* scale_out, int image_num, uint32_t* channel_num,
int height, int width);
// Split featuremap along channel direction
void split_image(int16_t* image_in, const float* scale_in, void** images_out,
float** scales_out, int image_num,
const uint32_t* channel_nums, int height, int width);
} // namespace image } // namespace image
} // namespace fpga } // namespace fpga
} // namespace paddle_mobile } // namespace paddle_mobile
此差异已折叠。
...@@ -27,6 +27,14 @@ limitations under the License. */ ...@@ -27,6 +27,14 @@ limitations under the License. */
#define BIAS_NUM_ALIGNMENT (16) #define BIAS_NUM_ALIGNMENT (16)
#define ROW_PARALLEL_NUM (3) #define ROW_PARALLEL_NUM (3)
#endif #endif
#ifdef PADDLE_MOBILE_FPGA_V2
#define IMAGE_ALIGNMENT (32) // Aligned to 32
#define FILTER_NUM_ALIGNMENT (32) // Filter number aligned to 32
#define FILTER_ELEMENT_ALIGNMENT (16) // Filter element number aligned to 16
#define BS_NUM_ALIGNMENT (8)
#define BIAS_NUM_ALIGNMENT (16)
#define ROW_PARALLEL_NUM (3)
#endif
namespace paddle_mobile { namespace paddle_mobile {
namespace fpga { namespace fpga {
...@@ -80,7 +88,8 @@ struct ImageOutputArgs { ...@@ -80,7 +88,8 @@ struct ImageOutputArgs {
activation; // To select activation and specify (Leaky)Relu parameter. activation; // To select activation and specify (Leaky)Relu parameter.
}; };
#ifdef PADDLE_MOBILE_FPGA_V1 // #ifdef PADDLE_MOBILE_FPGA_V1
#if ((defined PADDLE_MOBILE_FPGA_V1) || (defined PADDLE_MOBILE_FPGA_V2))
struct ConvDriverParam { struct ConvDriverParam {
uint64_t image_address_phy; uint64_t image_address_phy;
uint64_t filter_address_phy; uint64_t filter_address_phy;
...@@ -146,11 +155,8 @@ struct ConvArgs { ...@@ -146,11 +155,8 @@ struct ConvArgs {
struct ImageInputArgs image; // input image; struct ImageInputArgs image; // input image;
struct ImageOutputArgs output; struct ImageOutputArgs output;
#ifdef PADDLE_MOBILE_FPGA_V2 // #ifdef PADDLE_MOBILE_FPGA_V1
void* free_space; // used by FPGA logic #if ((defined PADDLE_MOBILE_FPGA_V1) || (defined PADDLE_MOBILE_FPGA_V2))
#endif
#ifdef PADDLE_MOBILE_FPGA_V1
struct DeconvTxParm deconv_tx_param; struct DeconvTxParm deconv_tx_param;
struct ConvDriverParam driver; struct ConvDriverParam driver;
#endif #endif
...@@ -208,7 +214,10 @@ struct EWAddArgs { ...@@ -208,7 +214,10 @@ struct EWAddArgs {
struct ImageInputArgs image0; struct ImageInputArgs image0;
struct ImageInputArgs image1; struct ImageInputArgs image1;
struct ImageOutputArgs output; struct ImageOutputArgs output;
#ifdef PADDLE_MOBILE_FPGA_V1 std::vector<float> image_in_quantVal;
std::vector<float> image_out_quantVal;
// #ifdef PADDLE_MOBILE_FPGA_V1
#if ((defined PADDLE_MOBILE_FPGA_V1) || (defined PADDLE_MOBILE_FPGA_V2))
struct EWAddDriverParam driver; struct EWAddDriverParam driver;
#endif #endif
}; };
......
...@@ -68,7 +68,6 @@ Executor<Device, T>::Executor(const Program<Device> &program, ...@@ -68,7 +68,6 @@ Executor<Device, T>::Executor(const Program<Device> &program,
// resize feed and fetch list // resize feed and fetch list
// should init feed and fetch variables before infer shape // should init feed and fetch variables before infer shape
InitFeedFetchList(); InitFeedFetchList();
const auto &blocks = program_desc_->Blocks(); const auto &blocks = program_desc_->Blocks();
std::shared_ptr<BlockDesc> block_desc = blocks[0]; std::shared_ptr<BlockDesc> block_desc = blocks[0];
std::vector<std::shared_ptr<OpDesc>> ops = block_desc->Ops(); std::vector<std::shared_ptr<OpDesc>> ops = block_desc->Ops();
...@@ -86,6 +85,9 @@ Executor<Device, T>::Executor(const Program<Device> &program, ...@@ -86,6 +85,9 @@ Executor<Device, T>::Executor(const Program<Device> &program,
} }
ops_of_block0_.push_back(op_handler); ops_of_block0_.push_back(op_handler);
} }
#ifdef PADDLE_MOBILE_FPGA_V2
InitQuantMemory();
#endif
if (program_.combined) { if (program_.combined) {
InitCombineMemory(); InitCombineMemory();
} else { } else {
...@@ -626,8 +628,74 @@ template <typename Device, typename T> ...@@ -626,8 +628,74 @@ template <typename Device, typename T>
void Executor<Device, T>::Predict_To(int end) { void Executor<Device, T>::Predict_To(int end) {
Predict_From_To(0, end); Predict_From_To(0, end);
} }
#endif #ifdef PADDLE_MOBILE_FPGA_V2
std::map<std::string, float> LoadQuantValFromFile(std::string filename) {
std::map<std::string, float> quantValList;
std::ifstream in;
in.open(filename, std::ios::in);
if (!in.is_open()) {
std::cout << "open File Failed." << std::endl;
exit(-1);
}
std::string line;
while (getline(in, line)) {
std::string splitStr = " : ";
std::string::size_type pos;
pos = line.find(splitStr);
std::string subStr[2];
subStr[0] = line.substr(0, pos);
subStr[1] = line.substr(pos + splitStr.size(), line.size());
quantValList.insert(std::make_pair(subStr[0], atof(subStr[1].c_str())));
}
in.close();
return quantValList;
}
template <typename Device, typename T>
void Executor<Device, T>::InitQuantMemory() {
std::string quantValFilePath;
if (program_.combined) {
quantValFilePath = program_.para_path;
quantValFilePath =
quantValFilePath.substr(0, (quantValFilePath.length() - 6));
quantValFilePath = quantValFilePath + "scale";
} else {
quantValFilePath = program_.model_path + "/scale";
}
std::map<std::string, float> quantValList =
LoadQuantValFromFile(quantValFilePath);
auto ops = ops_of_block0_;
for (int id = 0; id < ops.size(); id++) {
auto op = ops[id];
auto input_keys = op->GetInputKeys();
auto inputs = op->Inputs();
for (auto key = input_keys.begin(); key != input_keys.end(); key++) {
auto inputs_vars = inputs[*key];
int count = inputs_vars.size();
for (int i = 0; i < count; i++) {
auto tensor = GetTensorByName(inputs_vars[i]);
tensor->scale[0] = quantValList[inputs_vars[i]];
std::cout << "input variance name : " << inputs_vars[i]
<< ", scale value : " << tensor->scale[0] << std::endl;
}
}
auto output_keys = op->GetOutKeys();
auto outputs = op->Outputs();
for (auto key = output_keys.begin(); key != output_keys.end(); key++) {
auto outputs_vars = outputs[*key];
int count = outputs_vars.size();
for (int i = 0; i < count; i++) {
auto tensor = GetTensorByName(outputs_vars[i]);
tensor->scale[0] = quantValList[outputs_vars[i]];
std::cout << "output variance name : " << outputs_vars[i]
<< ", scale value : " << tensor->scale[0] << std::endl;
}
}
}
}
#endif
#endif
#ifdef PADDLE_MOBILE_CL #ifdef PADDLE_MOBILE_CL
template <> template <>
void Executor<GPU_CL, float>::InitNoPersistableMemory( void Executor<GPU_CL, float>::InitNoPersistableMemory(
......
...@@ -64,6 +64,9 @@ class Executor { ...@@ -64,6 +64,9 @@ class Executor {
void Predict_From_To(int start = 0, int end = -1); void Predict_From_To(int start = 0, int end = -1);
void Predict_From(int start); void Predict_From(int start);
void Predict_To(int end); void Predict_To(int end);
#ifdef PADDLE_MOBILE_FPGA_V2
void InitQuantMemory();
#endif
#endif #endif
protected: protected:
......
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifdef ANCHOR_GENERATOR_OP
#include <string.h>
#include <iostream>
#include <utility>
#include <vector>
#include "operators/kernel/detection_kernel.h"
namespace paddle_mobile {
namespace operators {
template <>
bool AnchorGeneratorKernel<FPGA, float>::Init(
AnchorGeneratorParam<FPGA> *param) {
auto input = param->input_;
auto anchors = param->output_anchors_;
auto anchor_ptr = anchors->mutable_data<float>();
auto stride = param->stride_;
auto feature_width = input->dims()[3], feature_height = input->dims()[2];
auto stride_width = stride[0], stride_height = stride[1];
auto offset = param->offset_;
int anchors_offset[] = {-2, -2, 18, 18, -10, -9, 26, 25, -23,
-20, 39, 36, -43, -34, 59, 49, -63, -54,
79, 69, -96, -77, 112, 93, -137, -118, 153,
134, -204, -188, 220, 204, -281, -395, 296, 441};
int anchors_offset2[] = {0, 0, 51, 77, 0, 0, 30, 35, 0, 0, 81, 103,
0, 0, 20, 21, 0, 0, 36, 44, 0, 0, 43, 58,
0, 0, 34, 68, 0, 0, 24, 28, 0, 0, 19, 46};
if (offset > 0.6) {
memcpy(anchors_offset, anchors_offset2, sizeof(anchors_offset));
std::cout << "anchor generator marker" << std::endl;
} else {
std::cout << "anchor generator rfcn" << std::endl;
}
int num_anchors = sizeof(anchors_offset) / (sizeof(int) * 4);
// DLOG << "feature_height: " << feature_height;
// DLOG << "feature_width: " << feature_width;
// DLOG << "num_anchors: " << num_anchors;
// DLOG << "stride_width: " << stride_width;
// DLOG << "stride_height: " << stride_height;
for (int h_idx = 0; h_idx < feature_height; ++h_idx) {
int offset0 = h_idx * feature_width * num_anchors * 4;
for (int w_idx = 0; w_idx < feature_width; ++w_idx) {
int offset1 = w_idx * num_anchors * 4;
for (int idx = 0; idx < num_anchors; idx++) {
int offset = offset0 + offset1 + idx * 4;
anchor_ptr[offset + 0] =
anchors_offset[idx * 4 + 0] + w_idx * stride_width;
anchor_ptr[offset + 1] =
anchors_offset[idx * 4 + 1] + h_idx * stride_height;
anchor_ptr[offset + 2] =
anchors_offset[idx * 4 + 2] + w_idx * stride_width;
anchor_ptr[offset + 3] =
anchors_offset[idx * 4 + 3] + h_idx * stride_height;
}
}
}
return true;
}
template <>
void AnchorGeneratorKernel<FPGA, float>::Compute(
const AnchorGeneratorParam<FPGA> &param) {}
} // namespace operators
} // namespace paddle_mobile
#endif // ANCHOR_GENERATOR_OP
...@@ -15,7 +15,6 @@ limitations under the License. */ ...@@ -15,7 +15,6 @@ limitations under the License. */
#ifdef CONCAT_OP #ifdef CONCAT_OP
#include "operators/kernel/concat_kernel.h" #include "operators/kernel/concat_kernel.h"
#include "fpga/V2/api.h"
namespace paddle_mobile { namespace paddle_mobile {
namespace operators { namespace operators {
...@@ -31,45 +30,36 @@ bool ConcatKernel<FPGA, float>::Init(ConcatParam<FPGA> *param) { ...@@ -31,45 +30,36 @@ bool ConcatKernel<FPGA, float>::Init(ConcatParam<FPGA> *param) {
(float **)fpga::fpga_malloc(image_num * sizeof(float *)); // NOLINT (float **)fpga::fpga_malloc(image_num * sizeof(float *)); // NOLINT
auto channel_num = auto channel_num =
(uint32_t *)fpga::fpga_malloc(image_num * sizeof(uint32_t)); // NOLINT (uint32_t *)fpga::fpga_malloc(image_num * sizeof(uint32_t)); // NOLINT
auto aligned_channel_num =
(uint32_t *)fpga::fpga_malloc(image_num * sizeof(uint32_t)); // NOLINT
auto height = inputs[0]->dims()[2]; auto height = inputs[0]->dims()[2];
auto width = inputs[0]->dims()[3]; auto width = inputs[0]->dims()[3];
auto out_channel =
(uint32_t)fpga::get_aligned_channel_num((int)out->dims()[1]); // NOLINT
for (int i = 0; i < image_num; i++) { for (int i = 0; i < image_num; i++) {
auto input = inputs[i]; auto input = inputs[i];
PADDLE_MOBILE_ENFORCE( PADDLE_MOBILE_ENFORCE(
input->dims()[2] == height && input->dims()[3] == width, input->dims()[2] == height && input->dims()[3] == width,
"Image height & width should be unified"); "Image height & width should be unified");
images_in[i] = (half *)input->data<float>(); // NOLINT images_in[i] = input->data<half>();
channel_num[i] = (uint32_t)inputs[i]->dims()[1]; channel_num[i] = (uint32_t)inputs[i]->dims()[1]; // NOLINT
aligned_channel_num[i] =
(uint32_t)fpga::get_aligned_channel_num(channel_num[i]);
scales_in[i] = input->scale; scales_in[i] = input->scale;
} }
fpga::format_concat_output(out, (int)height, (int)width, // NOLINT fpga::format_concat_output(out, height, width, image_num, channel_num);
out_channel);
fpga::ConcatArgs concatArgs = {0}; fpga::ConcatArgs concatArgs = {0};
concatArgs.image_num = (uint32_t)image_num; concatArgs.image_num = image_num;
concatArgs.images_in = images_in; concatArgs.images_in = images_in;
concatArgs.scales_in = scales_in; concatArgs.scales_in = scales_in;
concatArgs.image_out = (half *)out->data<float>(); // NOLINT concatArgs.image_out = out->data<half>();
concatArgs.scale_out = out->scale; concatArgs.scale_out = out->scale;
concatArgs.channel_num = channel_num; concatArgs.channel_num = channel_num;
concatArgs.aligned_channel_num = aligned_channel_num; concatArgs.height = height;
concatArgs.out_channel = out_channel; concatArgs.width = width;
concatArgs.height = (uint32_t)height;
concatArgs.width = (uint32_t)width;
param->SetFpgaArgs(concatArgs); param->SetFpgaArgs(concatArgs);
return true; return true;
} }
template <> template <>
void ConcatKernel<FPGA, float>::Compute(const ConcatParam<FPGA> &param) { void ConcatKernel<FPGA, float>::Compute(const ConcatParam<FPGA> &param) {
fpga::ComputeFPGAConcat(param.FpgaArgs()); ComputeFPGAConcat(param.FpgaArgs());
} }
template class ConcatKernel<FPGA, float>; template class ConcatKernel<FPGA, float>;
......
...@@ -22,12 +22,15 @@ namespace operators { ...@@ -22,12 +22,15 @@ namespace operators {
template <> template <>
bool ConvAddBNKernel<FPGA, float>::Init(FusionConvAddBNParam<FPGA> *param) { bool ConvAddBNKernel<FPGA, float>::Init(FusionConvAddBNParam<FPGA> *param) {
bool relu_enabled = false; // bool relu_enabled = false;
auto input = const_cast<Tensor *>(param->Input()); paddle_mobile::fpga::ActivationType activation_enable =
paddle_mobile::fpga::NONE;
int16_t leaky_relu_negative_slope = 0;
auto input = const_cast<LoDTensor *>(param->Input());
auto bias = param->Bias(); auto bias = param->Bias();
auto bias_ptr = bias->data<float>(); auto bias_ptr = bias->data<float>();
auto filter = const_cast<Tensor *>(param->Filter()); auto filter = const_cast<LoDTensor *>(param->Filter());
auto out = param->Output(); auto out = param->Output();
...@@ -56,18 +59,18 @@ bool ConvAddBNKernel<FPGA, float>::Init(FusionConvAddBNParam<FPGA> *param) { ...@@ -56,18 +59,18 @@ bool ConvAddBNKernel<FPGA, float>::Init(FusionConvAddBNParam<FPGA> *param) {
bs_ptr[i + channel] = new_scale_ptr[i]; bs_ptr[i + channel] = new_scale_ptr[i];
bs_ptr[i] = new_bias_ptr[i]; bs_ptr[i] = new_bias_ptr[i];
} }
param->SetNewScale(new_scale);
param->SetNewBias(new_bias);
fpga::format_conv_data(filter, out, &bs_ptr, param->Groups()); fpga::format_conv_data(filter, out, &bs_ptr, param->Groups());
fpga::SplitConvArgs conv_arg = {0}; fpga::SplitConvArgs conv_arg = {0};
fpga::fill_split_arg(&conv_arg, input, out, filter, relu_enabled, fpga::fill_split_arg(&conv_arg, input, out, filter, activation_enable,
param->Groups(), param->Strides()[0], leaky_relu_negative_slope, param->Groups(),
param->Strides()[1], param->Paddings()[0], param->Strides()[0], param->Strides()[1],
param->Paddings()[1], bs_ptr); param->Paddings()[0], param->Paddings()[1], bs_ptr);
param->SetFpgaArgs(conv_arg); param->SetFpgaArgs(conv_arg);
delete new_scale;
delete new_bias;
return true; return true;
} }
......
...@@ -23,12 +23,18 @@ namespace operators { ...@@ -23,12 +23,18 @@ namespace operators {
template <> template <>
bool ConvAddBNReluKernel<FPGA, float>::Init( bool ConvAddBNReluKernel<FPGA, float>::Init(
FusionConvAddBNReluParam<FPGA> *param) { FusionConvAddBNReluParam<FPGA> *param) {
bool relu_enabled = true; // bool relu_enabled = true;
auto input = const_cast<Tensor *>(param->Input()); paddle_mobile::fpga::ActivationType activation_enable =
const Tensor *bias = param->Bias(); paddle_mobile::fpga::LEAKYRELU;
int16_t leaky_relu_negative_slope = 0;
auto input = const_cast<LoDTensor *>(param->Input());
auto bias = param->Bias();
auto bias_ptr = bias->data<float>(); auto bias_ptr = bias->data<float>();
auto filter = const_cast<Tensor *>(param->Filter()); auto filter = const_cast<LoDTensor *>(param->Filter());
auto out = param->Output(); auto out = param->Output();
vector<int> paddings = param->Paddings();
vector<int> strides = param->Strides();
auto bn_mean_ptr = param->InputMean()->data<float>(); auto bn_mean_ptr = param->InputMean()->data<float>();
auto bn_var_ptr = param->InputVariance()->data<float>(); auto bn_var_ptr = param->InputVariance()->data<float>();
auto bn_scale_ptr = param->InputScale()->data<float>(); auto bn_scale_ptr = param->InputScale()->data<float>();
...@@ -40,7 +46,7 @@ bool ConvAddBNReluKernel<FPGA, float>::Init( ...@@ -40,7 +46,7 @@ bool ConvAddBNReluKernel<FPGA, float>::Init(
const int channel = out->dims()[1]; const int channel = out->dims()[1];
auto bs_ptr = auto bs_ptr =
(float *)fpga::fpga_malloc(2 * channel * sizeof(float)); // NOLINT reinterpret_cast<float *>(fpga::fpga_malloc(2 * channel * sizeof(float)));
auto new_scale = new Tensor(); auto new_scale = new Tensor();
auto new_bias = new Tensor(); auto new_bias = new Tensor();
auto new_scale_ptr = new_scale->mutable_data<float>({channel}); auto new_scale_ptr = new_scale->mutable_data<float>({channel});
...@@ -51,27 +57,41 @@ bool ConvAddBNReluKernel<FPGA, float>::Init( ...@@ -51,27 +57,41 @@ bool ConvAddBNReluKernel<FPGA, float>::Init(
static_cast<float>(pow((bn_var_ptr[i] + epsilon), 0.5)); static_cast<float>(pow((bn_var_ptr[i] + epsilon), 0.5));
new_bias_ptr[i] = new_bias_ptr[i] =
bn_bias_ptr[i] + (bias_ptr[i] - bn_mean_ptr[i]) * new_scale_ptr[i]; bn_bias_ptr[i] + (bias_ptr[i] - bn_mean_ptr[i]) * new_scale_ptr[i];
bs_ptr[i + 2] = new_scale_ptr[i]; bs_ptr[i + channel] = new_scale_ptr[i];
bs_ptr[i] = new_bias_ptr[i]; bs_ptr[i] = new_bias_ptr[i];
} }
param->SetNewScale(new_scale);
param->SetNewBias(new_bias);
const int groups = param->Groups();
if (groups == channel) {
fpga::format_dwconv_data(filter, out, new_scale_ptr, &new_bias_ptr);
fpga::DWconvArgs dwconv_arg = {0};
fpga::fill_dwconv_arg(&dwconv_arg, input, out, filter, activation_enable,
leaky_relu_negative_slope, strides[0], strides[1],
paddings[0], paddings[1], new_bias_ptr);
param->SetFpgaArgs(dwconv_arg);
fpga::fpga_free(new_scale_ptr);
fpga::fpga_free(bs_ptr);
} else {
fpga::format_conv_data(filter, out, &bs_ptr, param->Groups()); fpga::format_conv_data(filter, out, &bs_ptr, param->Groups());
fpga::SplitConvArgs conv_arg = {0}; fpga::SplitConvArgs conv_arg = {0};
fpga::fill_split_arg(&conv_arg, input, out, filter, relu_enabled, fpga::fill_split_arg(&conv_arg, input, out, filter, activation_enable,
param->Groups(), param->Strides()[0], leaky_relu_negative_slope, param->Groups(), strides[0],
param->Strides()[1], param->Paddings()[0], strides[1], paddings[0], paddings[1], bs_ptr);
param->Paddings()[1], bs_ptr);
param->SetFpgaArgs(conv_arg); param->SetFpgaArgs(conv_arg);
delete new_scale;
delete new_bias;
}
return true; return true;
} }
template <> template <>
void ConvAddBNReluKernel<FPGA, float>::Compute( void ConvAddBNReluKernel<FPGA, float>::Compute(
const FusionConvAddBNReluParam<FPGA> &param) { const FusionConvAddBNReluParam<FPGA> &param) {
if (param.Groups() == param.Output()->dims()[1]) {
fpga::ComputeDWConv(param.FpgaDwconvArgs());
} else {
fpga::ComputeFpgaConv(param.FpgaArgs()); fpga::ComputeFpgaConv(param.FpgaArgs());
}
} }
} // namespace operators } // namespace operators
......
...@@ -21,11 +21,14 @@ namespace operators { ...@@ -21,11 +21,14 @@ namespace operators {
template <> template <>
bool ConvAddKernel<FPGA, float>::Init(FusionConvAddParam<FPGA> *param) { bool ConvAddKernel<FPGA, float>::Init(FusionConvAddParam<FPGA> *param) {
bool relu_enabled = false; // bool relu_enabled = false;
auto input = const_cast<Tensor *>(param->Input()); paddle_mobile::fpga::ActivationType activation_enable =
paddle_mobile::fpga::NONE;
int16_t leaky_relu_negative_slope = 0;
auto input = const_cast<LoDTensor *>(param->Input());
const Tensor *bias = param->Bias(); const Tensor *bias = param->Bias();
auto bias_ptr = bias->data<float>(); auto bias_ptr = bias->data<float>();
auto filter = const_cast<Tensor *>(param->Filter()); auto filter = const_cast<LoDTensor *>(param->Filter());
auto out = param->Output(); auto out = param->Output();
PADDLE_MOBILE_ENFORCE(out->dims()[1] == bias->dims()[0], PADDLE_MOBILE_ENFORCE(out->dims()[1] == bias->dims()[0],
...@@ -39,12 +42,11 @@ bool ConvAddKernel<FPGA, float>::Init(FusionConvAddParam<FPGA> *param) { ...@@ -39,12 +42,11 @@ bool ConvAddKernel<FPGA, float>::Init(FusionConvAddParam<FPGA> *param) {
} }
fpga::format_conv_data(filter, out, &bs_ptr, param->Groups()); fpga::format_conv_data(filter, out, &bs_ptr, param->Groups());
fpga::SplitConvArgs conv_arg = {0}; fpga::SplitConvArgs conv_arg = {0};
fpga::fill_split_arg(&conv_arg, input, out, filter, relu_enabled, fpga::fill_split_arg(&conv_arg, input, out, filter, activation_enable,
param->Groups(), param->Strides()[0], leaky_relu_negative_slope, param->Groups(),
param->Strides()[1], param->Paddings()[0], param->Strides()[0], param->Strides()[1],
param->Paddings()[1], bs_ptr); param->Paddings()[0], param->Paddings()[1], bs_ptr);
param->SetFpgaArgs(conv_arg); param->SetFpgaArgs(conv_arg);
return true; return true;
} }
......
...@@ -21,11 +21,14 @@ namespace operators { ...@@ -21,11 +21,14 @@ namespace operators {
template <> template <>
bool ConvAddReluKernel<FPGA, float>::Init(FusionConvAddReluParam<FPGA> *param) { bool ConvAddReluKernel<FPGA, float>::Init(FusionConvAddReluParam<FPGA> *param) {
bool relu_enabled = true; // bool relu_enabled = true;
auto input = const_cast<Tensor *>(param->Input()); paddle_mobile::fpga::ActivationType activation_enable =
paddle_mobile::fpga::LEAKYRELU;
int16_t leaky_relu_negative_slope = 0;
auto input = const_cast<LoDTensor *>(param->Input());
const Tensor *bias = param->Bias(); const Tensor *bias = param->Bias();
auto bias_ptr = bias->data<float>(); auto bias_ptr = bias->data<float>();
auto filter = const_cast<Tensor *>(param->Filter()); auto filter = const_cast<LoDTensor *>(param->Filter());
auto out = param->Output(); auto out = param->Output();
PADDLE_MOBILE_ENFORCE(out->dims()[1] == bias->dims()[0], PADDLE_MOBILE_ENFORCE(out->dims()[1] == bias->dims()[0],
...@@ -39,12 +42,11 @@ bool ConvAddReluKernel<FPGA, float>::Init(FusionConvAddReluParam<FPGA> *param) { ...@@ -39,12 +42,11 @@ bool ConvAddReluKernel<FPGA, float>::Init(FusionConvAddReluParam<FPGA> *param) {
} }
fpga::format_conv_data(filter, out, &bs_ptr, param->Groups()); fpga::format_conv_data(filter, out, &bs_ptr, param->Groups());
fpga::SplitConvArgs conv_arg = {0}; fpga::SplitConvArgs conv_arg = {0};
fpga::fill_split_arg(&conv_arg, input, out, filter, relu_enabled, fpga::fill_split_arg(&conv_arg, input, out, filter, activation_enable,
param->Groups(), param->Strides()[0], leaky_relu_negative_slope, param->Groups(),
param->Strides()[1], param->Paddings()[0], param->Strides()[0], param->Strides()[1],
param->Paddings()[1], bs_ptr); param->Paddings()[0], param->Paddings()[1], bs_ptr);
param->SetFpgaArgs(conv_arg); param->SetFpgaArgs(conv_arg);
return true; return true;
} }
......
...@@ -22,10 +22,16 @@ namespace operators { ...@@ -22,10 +22,16 @@ namespace operators {
template <> template <>
bool ConvBNKernel<FPGA, float>::Init(FusionConvBNParam<FPGA> *param) { bool ConvBNKernel<FPGA, float>::Init(FusionConvBNParam<FPGA> *param) {
bool relu_enabled = false; // bool relu_enabled = false;
auto input = const_cast<Tensor *>(param->Input()); paddle_mobile::fpga::ActivationType activation_enable =
auto filter = const_cast<Tensor *>(param->Filter()); paddle_mobile::fpga::NONE;
int16_t leaky_relu_negative_slope = 0;
auto input = const_cast<LoDTensor *>(param->Input());
auto filter = const_cast<LoDTensor *>(param->Filter());
auto out = param->Output(); auto out = param->Output();
float Si = input->scale[0];
float So = out->scale[0];
float Sf = fpga::filter_find_max(filter);
auto bn_mean_ptr = param->InputMean()->data<float>(); auto bn_mean_ptr = param->InputMean()->data<float>();
auto bn_var_ptr = param->InputVariance()->data<float>(); auto bn_var_ptr = param->InputVariance()->data<float>();
auto bn_scale_ptr = param->InputScale()->data<float>(); auto bn_scale_ptr = param->InputScale()->data<float>();
...@@ -45,20 +51,21 @@ bool ConvBNKernel<FPGA, float>::Init(FusionConvBNParam<FPGA> *param) { ...@@ -45,20 +51,21 @@ bool ConvBNKernel<FPGA, float>::Init(FusionConvBNParam<FPGA> *param) {
new_scale_ptr[i] = bn_scale_ptr[i] / new_scale_ptr[i] = bn_scale_ptr[i] /
static_cast<float>(pow((bn_var_ptr[i] + epsilon), 0.5)); static_cast<float>(pow((bn_var_ptr[i] + epsilon), 0.5));
new_bias_ptr[i] = bn_bias_ptr[i] + (0 - bn_mean_ptr[i]) * new_scale_ptr[i]; new_bias_ptr[i] = bn_bias_ptr[i] + (0 - bn_mean_ptr[i]) * new_scale_ptr[i];
bs_ptr[i + channel] = new_scale_ptr[i]; // bs_ptr[i + channel] = new_scale_ptr[i];
bs_ptr[i] = new_bias_ptr[i]; // bs_ptr[i] = new_bias_ptr[i];
bs_ptr[i + channel] = new_scale_ptr[i] * Si / So * Sf / 127.0;
bs_ptr[i] = new_bias_ptr[i] * 127.0 / So;
} }
param->SetNewScale(new_scale);
param->SetNewBias(new_bias);
fpga::format_conv_data(filter, out, &bs_ptr, param->Groups()); fpga::format_conv_data(filter, out, &bs_ptr, param->Groups());
fpga::SplitConvArgs conv_arg = {0}; fpga::SplitConvArgs conv_arg = {0};
fpga::fill_split_arg(&conv_arg, input, out, filter, relu_enabled, fpga::fill_split_arg(&conv_arg, input, out, filter, activation_enable,
param->Groups(), param->Strides()[0], leaky_relu_negative_slope, param->Groups(),
param->Strides()[1], param->Paddings()[0], param->Strides()[0], param->Strides()[1],
param->Paddings()[1], bs_ptr); param->Paddings()[0], param->Paddings()[1], bs_ptr);
param->SetFpgaArgs(conv_arg); param->SetFpgaArgs(conv_arg);
delete new_scale;
delete new_bias;
return true; return true;
} }
......
...@@ -16,17 +16,20 @@ limitations under the License. */ ...@@ -16,17 +16,20 @@ limitations under the License. */
#include "operators/kernel/conv_bn_relu_kernel.h" #include "operators/kernel/conv_bn_relu_kernel.h"
#include <cmath> #include <cmath>
#include "fpga/V2/filter.h"
namespace paddle_mobile { namespace paddle_mobile {
namespace operators { namespace operators {
template <> template <>
bool ConvBNReluKernel<FPGA, float>::Init(FusionConvBNReluParam<FPGA> *param) { bool ConvBNReluKernel<FPGA, float>::Init(FusionConvBNReluParam<FPGA> *param) {
bool relu_enabled = true; paddle_mobile::fpga::ActivationType activation_enable =
auto input = const_cast<Tensor *>(param->Input()); paddle_mobile::fpga::LEAKYRELU;
auto filter = const_cast<Tensor *>(param->Filter()); int16_t leaky_relu_negative_slope = 0;
const int groups = param->Groups();
auto input = const_cast<LoDTensor *>(param->Input());
auto filter = const_cast<LoDTensor *>(param->Filter());
auto out = param->Output(); auto out = param->Output();
float Si = input->scale[0];
float So = out->scale[0];
float Sf = fpga::filter_find_max(filter);
auto bn_mean_ptr = param->InputMean()->data<float>(); auto bn_mean_ptr = param->InputMean()->data<float>();
auto bn_var_ptr = param->InputVariance()->data<float>(); auto bn_var_ptr = param->InputVariance()->data<float>();
auto bn_scale_ptr = param->InputScale()->data<float>(); auto bn_scale_ptr = param->InputScale()->data<float>();
...@@ -41,32 +44,49 @@ bool ConvBNReluKernel<FPGA, float>::Init(FusionConvBNReluParam<FPGA> *param) { ...@@ -41,32 +44,49 @@ bool ConvBNReluKernel<FPGA, float>::Init(FusionConvBNReluParam<FPGA> *param) {
auto new_bias = new Tensor(); auto new_bias = new Tensor();
auto new_scale_ptr = new_scale->mutable_data<float>({channel}); auto new_scale_ptr = new_scale->mutable_data<float>({channel});
auto new_bias_ptr = new_bias->mutable_data<float>({channel}); auto new_bias_ptr = new_bias->mutable_data<float>({channel});
for (int i = 0; i < channel; i++) { for (int i = 0; i < channel; i++) {
new_scale_ptr[i] = bn_scale_ptr[i] / new_scale_ptr[i] = bn_scale_ptr[i] /
static_cast<float>(pow((bn_var_ptr[i] + epsilon), 0.5)); static_cast<float>(pow((bn_var_ptr[i] + epsilon), 0.5));
new_bias_ptr[i] = bn_bias_ptr[i] + (0 - bn_mean_ptr[i]) * new_scale_ptr[i]; new_bias_ptr[i] = bn_bias_ptr[i] + (0 - bn_mean_ptr[i]) * new_scale_ptr[i];
bs_ptr[i + channel] = new_scale_ptr[i]; // bs_ptr[i + channel] = new_scale_ptr[i];
bs_ptr[i] = new_bias_ptr[i]; // bs_ptr[i] = new_bias_ptr[i];
bs_ptr[i + channel] = new_scale_ptr[i] * Si / So * Sf / 127.0;
bs_ptr[i] = new_bias_ptr[i] * 127.0 / So;
if (groups == channel) {
new_scale_ptr[i] = new_scale_ptr[i] * Si / So;
new_bias_ptr[i] = new_bias_ptr[i] * 127.0 / So;
} }
param->SetNewScale(new_scale); }
param->SetNewBias(new_bias); if (groups == channel) {
fpga::format_dwconv_data(filter, out, new_scale_ptr, &new_bias_ptr);
fpga::DWconvArgs dwconv_arg = {0};
fpga::fill_dwconv_arg(&dwconv_arg, input, out, filter, activation_enable,
leaky_relu_negative_slope, param->Strides()[0],
param->Strides()[1], param->Paddings()[0],
param->Paddings()[1], new_bias_ptr);
param->SetFpgaArgs(dwconv_arg);
} else {
fpga::format_conv_data(filter, out, &bs_ptr, param->Groups()); fpga::format_conv_data(filter, out, &bs_ptr, param->Groups());
fpga::SplitConvArgs conv_arg = {0}; fpga::SplitConvArgs conv_arg = {0};
fpga::fill_split_arg(&conv_arg, input, out, filter, relu_enabled, fpga::fill_split_arg(&conv_arg, input, out, filter, activation_enable,
param->Groups(), param->Strides()[0], leaky_relu_negative_slope, param->Groups(),
param->Strides()[1], param->Paddings()[0], param->Strides()[0], param->Strides()[1],
param->Paddings()[1], bs_ptr); param->Paddings()[0], param->Paddings()[1], bs_ptr);
param->SetFpgaArgs(conv_arg); param->SetFpgaArgs(conv_arg);
}
delete new_scale;
delete new_bias;
return true; return true;
} }
template <> template <>
void ConvBNReluKernel<FPGA, float>::Compute( void ConvBNReluKernel<FPGA, float>::Compute(
const FusionConvBNReluParam<FPGA> &param) { const FusionConvBNReluParam<FPGA> &param) {
if (param.Groups() == param.Output()->dims()[1]) {
fpga::ComputeDWConv(param.FpgaDwconvArgs());
} else {
fpga::ComputeFpgaConv(param.FpgaArgs()); fpga::ComputeFpgaConv(param.FpgaArgs());
}
} }
} // namespace operators } // namespace operators
......
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifdef CONV_OP
#include "operators/kernel/conv_kernel.h"
namespace paddle_mobile {
namespace operators {
template <>
bool ConvKernel<FPGA, float>::Init(ConvParam<FPGA> *param) {
paddle_mobile::fpga::ActivationType activation_enable =
paddle_mobile::fpga::NONE;
int16_t leaky_relu_negative_slope = 0;
auto input = const_cast<LoDTensor *>(param->Input());
auto filter = const_cast<LoDTensor *>(param->Filter());
auto out = param->Output();
float Si = input->scale[0];
float So = out->scale[0];
float Sf = fpga::filter_find_max(filter);
int channel = out->dims()[1];
auto bs_ptr =
(float *)fpga::fpga_malloc(2 * channel * sizeof(float)); // NOLINT
for (int i = 0; i < channel; i++) {
// bs_ptr[i + channel] = 1;
// bs_ptr[i] = 0;
bs_ptr[i + channel] = Si / So * Sf / 127.0;
bs_ptr[i] = 0;
}
fpga::format_conv_data(filter, out, &bs_ptr, param->Groups());
fpga::SplitConvArgs conv_arg = {0};
fpga::fill_split_arg(&conv_arg, input, out, filter, activation_enable,
leaky_relu_negative_slope, param->Groups(),
param->Strides()[0], param->Strides()[1],
param->Paddings()[0], param->Paddings()[1], bs_ptr);
param->SetFpgaArgs(conv_arg);
return true;
}
template <>
void ConvKernel<FPGA, float>::Compute(const ConvParam<FPGA> &param) {
fpga::ComputeFpgaConv(param.FpgaArgs());
}
} // namespace operators
} // namespace paddle_mobile
#endif
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifdef CONV_TRANSPOSE_OP
#include "operators/kernel/conv_transpose_kernel.h"
#include "framework/operator.h"
#include "operators/op_param.h"
namespace paddle_mobile {
namespace operators {
template <>
bool ConvTransposeKernel<FPGA, float>::Init(ConvTransposeParam<FPGA> *param) {
// bool relu_enabled = false;
paddle_mobile::fpga::ActivationType activation_enable =
paddle_mobile::fpga::NONE;
int16_t leaky_relu_negative_slope = 0;
auto input = const_cast<LoDTensor *>(param->Input());
// const Tensor *bias = param->Bias();
// auto bias_ptr = bias->data<float>();
auto filter = const_cast<LoDTensor *>(param->Filter());
auto out = param->Output();
// PADDLE_MOBILE_ENFORCE(out->dims()[1] == bias->dims()[0],
// "Output channel should be equal to bias number");
int channel = out->dims()[1];
int sub_conv_n = param->Strides()[0];
auto bs_ptr = (float *)fpga::fpga_malloc(2 * channel * sub_conv_n * // NOLINT
sizeof(float)); // NOLINT
for (int i = 0; i < channel * sub_conv_n; i++) {
bs_ptr[i + sub_conv_n * channel] = 1;
bs_ptr[i] = 0; // bias_ptr[i % (channel)];
}
PADDLE_MOBILE_ENFORCE(param->Strides()[1] == param->Strides()[0],
"stride_width should be equal to stride_height ");
PADDLE_MOBILE_ENFORCE(filter->dims()[2] == filter->dims()[3],
"filter width should be equal to filter height ");
PADDLE_MOBILE_ENFORCE(((filter->dims()[2] % param->Strides()[0]) == 0),
"filter axis should be the multiple of stride axis ");
if (param->Groups() == channel) {
fpga::format_DWDeconv_data(filter, out, &bs_ptr, param->Groups(),
sub_conv_n);
fpga::DWDeconvArgs DWDeconv_arg = {0};
fpga::fill_DWDeconv_arg(&DWDeconv_arg, input, out, filter,
activation_enable, leaky_relu_negative_slope,
param->Strides()[0], param->Strides()[1],
param->Paddings()[0], param->Paddings()[1], bs_ptr);
param->SetFpgaArgs(DWDeconv_arg);
} else {
fpga::format_deconv_data(filter, out, &bs_ptr, param->Groups(), sub_conv_n);
fpga::DeconvArgs deconv_arg = {0};
fpga::fill_deconv_arg(&deconv_arg, input, out, filter, activation_enable,
leaky_relu_negative_slope, param->Groups(),
param->Strides()[0], param->Strides()[1],
param->Paddings()[0], param->Paddings()[1], bs_ptr);
param->SetFpgaArgs(deconv_arg);
}
return true;
}
template <>
void ConvTransposeKernel<FPGA, float>::Compute(
const ConvTransposeParam<FPGA> &param) {
if (param.Groups() == param.Output()->dims()[1]) {
fpga::ComputeDWDeconv(param.FpgaDWDconvArgs());
} else {
fpga::ComputeFpgaDeconv(param.FpgaArgs());
}
}
} // namespace operators
} // namespace paddle_mobile
#endif
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifdef FUSION_DECONVADDBN_OP
#include "operators/kernel/deconv_add_bn_kernel.h"
#include "framework/operator.h"
#include "operators/op_param.h"
namespace paddle_mobile {
namespace operators {
template <>
bool DeconvAddBNKernel<FPGA, float>::Init(FusionDeconvAddBNParam<FPGA> *param) {
// bool relu_enabled = true;
paddle_mobile::fpga::ActivationType activation_enable =
paddle_mobile::fpga::NONE;
int16_t leaky_relu_negative_slope = 0;
auto input = const_cast<LoDTensor *>(param->Input());
const Tensor *bias = param->InputBias();
auto bias_ptr = bias->data<float>();
auto filter = const_cast<LoDTensor *>(param->Filter());
auto out = param->Output();
PADDLE_MOBILE_ENFORCE(out->dims()[1] == bias->dims()[0],
"Output channel should be equal to bias number");
int channel = out->dims()[1];
int sub_conv_n = param->Strides()[0];
auto bs_ptr = (float *)fpga::fpga_malloc(2 * channel * sub_conv_n * // NOLINT
sizeof(float)); // NOLINT
for (int i = 0; i < channel * sub_conv_n; i++) {
bs_ptr[i + sub_conv_n * channel] = 1;
bs_ptr[i] = bias_ptr[i % (channel)];
}
PADDLE_MOBILE_ENFORCE(param->Strides()[1] == param->Strides()[0],
"stride_width should be equal to stride_height ");
PADDLE_MOBILE_ENFORCE(filter->dims()[2] == filter->dims()[3],
"filter width should be equal to filter height ");
PADDLE_MOBILE_ENFORCE(((filter->dims()[2] % param->Strides()[0]) == 0),
"filter axis should be the multiple of stride axis ");
if (param->Groups() == channel) {
fpga::format_DWDeconv_data(filter, out, &bs_ptr, param->Groups(),
sub_conv_n);
fpga::DWDeconvArgs DWDeconv_arg = {0};
fpga::fill_DWDeconv_arg(&DWDeconv_arg, input, out, filter,
activation_enable, leaky_relu_negative_slope,
param->Strides()[0], param->Strides()[1],
param->Paddings()[0], param->Paddings()[1], bs_ptr);
param->SetFpgaArgs(DWDeconv_arg);
} else {
fpga::format_deconv_data(filter, out, &bs_ptr, param->Groups(), sub_conv_n);
fpga::DeconvArgs deconv_arg = {0};
fpga::fill_deconv_arg(&deconv_arg, input, out, filter, activation_enable,
leaky_relu_negative_slope, param->Groups(),
param->Strides()[0], param->Strides()[1],
param->Paddings()[0], param->Paddings()[1], bs_ptr);
param->SetFpgaArgs(deconv_arg);
}
return true;
}
template <>
void DeconvAddBNKernel<FPGA, float>::Compute(
const FusionDeconvAddBNParam<FPGA> &param) {
// fpga::ComputeFpgaDeconv(param.FpgaArgs());
if (param.Groups() == param.Output()->dims()[1]) {
fpga::ComputeDWDeconv(param.FpgaDWDconvArgs());
} else {
fpga::ComputeFpgaDeconv(param.FpgaArgs());
}
}
} // namespace operators
} // namespace paddle_mobile
#endif
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifdef FUSION_DECONVADDBNRELU_OP
#include "operators/kernel/deconv_add_bn_relu_kernel.h"
#include "framework/operator.h"
#include "operators/op_param.h"
namespace paddle_mobile {
namespace operators {
template <>
bool DeconvAddBNReluKernel<FPGA, float>::Init(
FusionDeconvAddBNReluParam<FPGA> *param) {
// bool relu_enabled = true;
paddle_mobile::fpga::ActivationType activation_enable =
paddle_mobile::fpga::LEAKYRELU;
int16_t leaky_relu_negative_slope = 0;
auto input = const_cast<LoDTensor *>(param->Input());
const Tensor *bias = param->InputBias();
auto bias_ptr = bias->data<float>();
auto filter = const_cast<LoDTensor *>(param->Filter());
auto out = param->Output();
PADDLE_MOBILE_ENFORCE(out->dims()[1] == bias->dims()[0],
"Output channel should be equal to bias number");
int channel = out->dims()[1];
int sub_conv_n = param->Strides()[0];
auto bs_ptr = (float *)fpga::fpga_malloc(2 * channel * sub_conv_n * // NOLINT
sizeof(float)); // NOLINT
for (int i = 0; i < channel * sub_conv_n; i++) {
bs_ptr[i + sub_conv_n * channel] = 1;
bs_ptr[i] = bias_ptr[i % (channel)];
}
PADDLE_MOBILE_ENFORCE(param->Strides()[1] == param->Strides()[0],
"stride_width should be equal to stride_height ");
PADDLE_MOBILE_ENFORCE(filter->dims()[2] == filter->dims()[3],
"filter width should be equal to filter height ");
PADDLE_MOBILE_ENFORCE(((filter->dims()[2] % param->Strides()[0]) == 0),
"filter axis should be the multiple of stride axis ");
if (param->Groups() == channel) {
fpga::format_DWDeconv_data(filter, out, &bs_ptr, param->Groups(),
sub_conv_n);
fpga::DWDeconvArgs DWDeconv_arg = {0};
fpga::fill_DWDeconv_arg(&DWDeconv_arg, input, out, filter,
activation_enable, leaky_relu_negative_slope,
param->Strides()[0], param->Strides()[1],
param->Paddings()[0], param->Paddings()[1], bs_ptr);
param->SetFpgaArgs(DWDeconv_arg);
} else {
fpga::format_deconv_data(filter, out, &bs_ptr, param->Groups(), sub_conv_n);
fpga::DeconvArgs deconv_arg = {0};
fpga::fill_deconv_arg(&deconv_arg, input, out, filter, activation_enable,
leaky_relu_negative_slope, param->Groups(),
param->Strides()[0], param->Strides()[1],
param->Paddings()[0], param->Paddings()[1], bs_ptr);
param->SetFpgaArgs(deconv_arg);
}
return true;
}
template <>
void DeconvAddBNReluKernel<FPGA, float>::Compute(
const FusionDeconvAddBNReluParam<FPGA> &param) {
// fpga::ComputeFpgaDeconv(param.FpgaArgs());
if (param.Groups() == param.Output()->dims()[1]) {
fpga::ComputeDWDeconv(param.FpgaDWDconvArgs());
} else {
fpga::ComputeFpgaDeconv(param.FpgaArgs());
}
}
} // namespace operators
} // namespace paddle_mobile
#endif
...@@ -23,12 +23,66 @@ namespace operators { ...@@ -23,12 +23,66 @@ namespace operators {
template <> template <>
bool DeconvAddKernel<FPGA, float>::Init(FusionDeconvAddParam<FPGA> *param) { bool DeconvAddKernel<FPGA, float>::Init(FusionDeconvAddParam<FPGA> *param) {
// bool relu_enabled = false;
paddle_mobile::fpga::ActivationType activation_enable =
paddle_mobile::fpga::NONE;
int16_t leaky_relu_negative_slope = 0;
auto input = const_cast<LoDTensor *>(param->Input());
const Tensor *bias = param->Bias();
auto bias_ptr = bias->data<float>();
auto filter = const_cast<LoDTensor *>(param->Filter());
auto out = param->Output();
PADDLE_MOBILE_ENFORCE(out->dims()[1] == bias->dims()[0],
"Output channel should be equal to bias number");
int channel = out->dims()[1];
int sub_conv_n = param->Strides()[0];
auto bs_ptr = (float *)fpga::fpga_malloc(2 * channel * sub_conv_n * // NOLINT
sizeof(float)); // NOLINT
for (int i = 0; i < channel * sub_conv_n; i++) {
bs_ptr[i + sub_conv_n * channel] = 1;
bs_ptr[i] = bias_ptr[i % (channel)];
}
PADDLE_MOBILE_ENFORCE(param->Strides()[1] == param->Strides()[0],
"stride_width should be equal to stride_height ");
PADDLE_MOBILE_ENFORCE(filter->dims()[2] == filter->dims()[3],
"filter width should be equal to filter height ");
PADDLE_MOBILE_ENFORCE(((filter->dims()[2] % param->Strides()[0]) == 0),
"filter axis should be the multiple of stride axis ");
if (param->Groups() == channel) {
fpga::format_DWDeconv_data(filter, out, &bs_ptr, param->Groups(),
sub_conv_n);
fpga::DWDeconvArgs DWDeconv_arg = {0};
fpga::fill_DWDeconv_arg(&DWDeconv_arg, input, out, filter,
activation_enable, leaky_relu_negative_slope,
param->Strides()[0], param->Strides()[1],
param->Paddings()[0], param->Paddings()[1], bs_ptr);
param->SetFpgaArgs(DWDeconv_arg);
} else {
fpga::format_deconv_data(filter, out, &bs_ptr, param->Groups(), sub_conv_n);
fpga::DeconvArgs deconv_arg = {0};
fpga::fill_deconv_arg(&deconv_arg, input, out, filter, activation_enable,
leaky_relu_negative_slope, param->Groups(),
param->Strides()[0], param->Strides()[1],
param->Paddings()[0], param->Paddings()[1], bs_ptr);
param->SetFpgaArgs(deconv_arg);
}
return true; return true;
} }
template <> template <>
void DeconvAddKernel<FPGA, float>::Compute( void DeconvAddKernel<FPGA, float>::Compute(
const FusionDeconvAddParam<FPGA> &param) {} const FusionDeconvAddParam<FPGA> &param) {
if (param.Groups() == param.Output()->dims()[1]) {
fpga::ComputeDWDeconv(param.FpgaDWDconvArgs());
} else {
fpga::ComputeFpgaDeconv(param.FpgaArgs());
}
}
} // namespace operators } // namespace operators
} // namespace paddle_mobile } // namespace paddle_mobile
......
...@@ -24,12 +24,66 @@ namespace operators { ...@@ -24,12 +24,66 @@ namespace operators {
template <> template <>
bool DeconvAddReluKernel<FPGA, float>::Init( bool DeconvAddReluKernel<FPGA, float>::Init(
FusionDeconvAddReluParam<FPGA> *param) { FusionDeconvAddReluParam<FPGA> *param) {
// bool relu_enabled = true;
paddle_mobile::fpga::ActivationType activation_enable =
paddle_mobile::fpga::LEAKYRELU;
int16_t leaky_relu_negative_slope = 0;
auto input = const_cast<LoDTensor *>(param->Input());
const Tensor *bias = param->Bias();
auto bias_ptr = bias->data<float>();
auto filter = const_cast<LoDTensor *>(param->Filter());
auto out = param->Output();
PADDLE_MOBILE_ENFORCE(out->dims()[1] == bias->dims()[0],
"Output channel should be equal to bias number");
int channel = out->dims()[1];
int sub_conv_n = param->Strides()[0];
auto bs_ptr = (float *)fpga::fpga_malloc(2 * channel * sub_conv_n * // NOLINT
sizeof(float)); // NOLINT
for (int i = 0; i < channel * sub_conv_n; i++) {
bs_ptr[i + sub_conv_n * channel] = 1;
bs_ptr[i] = bias_ptr[i % (channel)];
}
PADDLE_MOBILE_ENFORCE(param->Strides()[1] == param->Strides()[0],
"stride_width should be equal to stride_height ");
PADDLE_MOBILE_ENFORCE(filter->dims()[2] == filter->dims()[3],
"filter width should be equal to filter height ");
PADDLE_MOBILE_ENFORCE(((filter->dims()[2] % param->Strides()[0]) == 0),
"filter axis should be the multiple of stride axis ");
if (param->Groups() == channel) {
fpga::format_DWDeconv_data(filter, out, &bs_ptr, param->Groups(),
sub_conv_n);
fpga::DWDeconvArgs DWDeconv_arg = {0};
fpga::fill_DWDeconv_arg(&DWDeconv_arg, input, out, filter,
activation_enable, leaky_relu_negative_slope,
param->Strides()[0], param->Strides()[1],
param->Paddings()[0], param->Paddings()[1], bs_ptr);
param->SetFpgaArgs(DWDeconv_arg);
} else {
fpga::format_deconv_data(filter, out, &bs_ptr, param->Groups(), sub_conv_n);
fpga::DeconvArgs deconv_arg = {0};
fpga::fill_deconv_arg(&deconv_arg, input, out, filter, activation_enable,
leaky_relu_negative_slope, param->Groups(),
param->Strides()[0], param->Strides()[1],
param->Paddings()[0], param->Paddings()[1], bs_ptr);
param->SetFpgaArgs(deconv_arg);
}
return true; return true;
} }
template <> template <>
void DeconvAddReluKernel<FPGA, float>::Compute( void DeconvAddReluKernel<FPGA, float>::Compute(
const FusionDeconvAddReluParam<FPGA> &param) {} const FusionDeconvAddReluParam<FPGA> &param) {
// fpga::ComputeFpgaDeconv(param.FpgaArgs());
if (param.Groups() == param.Output()->dims()[1]) {
fpga::ComputeDWDeconv(param.FpgaDWDconvArgs());
} else {
fpga::ComputeFpgaDeconv(param.FpgaArgs());
}
}
} // namespace operators } // namespace operators
} // namespace paddle_mobile } // namespace paddle_mobile
......
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifdef FUSION_DECONVBNRELU_OP
#include "operators/kernel/deconv_bn_relu_kernel.h"
#include <cmath>
#include "framework/operator.h"
#include "operators/op_param.h"
namespace paddle_mobile {
namespace operators {
template <>
bool DeconvBNReluKernel<FPGA, float>::Init(
FusionDeconvBNReluParam<FPGA> *param) {
// bool relu_enabled = true;
paddle_mobile::fpga::ActivationType activation_enable =
paddle_mobile::fpga::LEAKYRELU;
int16_t leaky_relu_negative_slope = 0;
auto input = const_cast<LoDTensor *>(param->Input());
const Tensor *bias = param->InputBias();
auto bias_ptr = bias->data<float>();
auto filter = const_cast<LoDTensor *>(param->Filter());
auto out = param->Output();
auto bn_mean_ptr = param->InputMean()->data<float>();
auto bn_var_ptr = param->InputVariance()->data<float>();
auto bn_scale_ptr = param->InputScale()->data<float>();
auto bn_bias_ptr = param->InputBias()->data<float>();
const float epsilon = param->Epsilon();
PADDLE_MOBILE_ENFORCE(out->dims()[1] == bias->dims()[0],
"Output channel should be equal to bias number");
int channel = out->dims()[1];
auto new_scale = new Tensor();
auto new_bias = new Tensor();
auto new_scale_ptr = new_scale->mutable_data<float>({channel});
auto new_bias_ptr = new_bias->mutable_data<float>({channel});
for (int i = 0; i < channel; i++) {
new_scale_ptr[i] = bn_scale_ptr[i] /
static_cast<float>(pow((bn_var_ptr[i] + epsilon), 0.5));
new_bias_ptr[i] = bn_bias_ptr[i] + (0 - bn_mean_ptr[i]) * new_scale_ptr[i];
}
int sub_conv_n = param->Strides()[0];
auto bs_ptr = (float *)fpga::fpga_malloc(2 * channel * sub_conv_n * // NOLINT
sizeof(float)); // NOLINT
for (int i = 0; i < channel * sub_conv_n; i++) {
bs_ptr[i + sub_conv_n * channel] = new_scale_ptr[i % channel];
bs_ptr[i] = new_bias_ptr[i % (channel)];
}
PADDLE_MOBILE_ENFORCE(param->Strides()[1] == param->Strides()[0],
"stride_width should be equal to stride_height ");
PADDLE_MOBILE_ENFORCE(filter->dims()[2] == filter->dims()[3],
"filter width should be equal to filter height ");
PADDLE_MOBILE_ENFORCE(((filter->dims()[2] % param->Strides()[0]) == 0),
"filter axis should be the multiple of stride axis ");
if (param->Groups() == channel) {
fpga::format_DWDeconv_data(filter, out, &bs_ptr, param->Groups(),
sub_conv_n);
fpga::DWDeconvArgs DWDeconv_arg = {0};
fpga::fill_DWDeconv_arg(&DWDeconv_arg, input, out, filter,
activation_enable, leaky_relu_negative_slope,
param->Strides()[0], param->Strides()[1],
param->Paddings()[0], param->Paddings()[1], bs_ptr);
param->SetFpgaArgs(DWDeconv_arg);
} else {
fpga::format_deconv_data(filter, out, &bs_ptr, param->Groups(), sub_conv_n);
fpga::DeconvArgs deconv_arg = {0};
fpga::fill_deconv_arg(&deconv_arg, input, out, filter, activation_enable,
leaky_relu_negative_slope, param->Groups(),
param->Strides()[0], param->Strides()[1],
param->Paddings()[0], param->Paddings()[1], bs_ptr);
param->SetFpgaArgs(deconv_arg);
}
delete new_scale;
delete new_bias;
return true;
}
template <>
void DeconvBNReluKernel<FPGA, float>::Compute(
const FusionDeconvBNReluParam<FPGA> &param) {
// fpga::ComputeFpgaDeconv(param.FpgaArgs());
if (param.Groups() == param.Output()->dims()[1]) {
fpga::ComputeDWDeconv(param.FpgaDWDconvArgs());
} else {
fpga::ComputeFpgaDeconv(param.FpgaArgs());
}
}
} // namespace operators
} // namespace paddle_mobile
#endif
...@@ -15,23 +15,31 @@ limitations under the License. */ ...@@ -15,23 +15,31 @@ limitations under the License. */
#include "operators/kernel/elementwise_add_kernel.h" #include "operators/kernel/elementwise_add_kernel.h"
#include <string>
#include "fpga/V1/api.h"
namespace paddle_mobile { namespace paddle_mobile {
namespace operators { namespace operators {
template <> template <>
bool ElementwiseAddKernel<FPGA, float>::Init(ElementwiseAddParam<FPGA> *param) { bool ElementwiseAddKernel<FPGA, float>::Init(ElementwiseAddParam<FPGA> *param) {
bool relu_enabled = false;
auto *input_x = const_cast<LoDTensor *>(param->InputX());
auto *input_y = const_cast<LoDTensor *>(param->InputY()); auto *input_y = const_cast<LoDTensor *>(param->InputY());
auto *out = param->Out(); auto *out = param->Out();
auto input_x_ptr = input_x->data<float>(); if (input_y->type() != type_id<float>()) {
auto input_y_ptr = input_y->data<float>(); paddle_mobile::fpga::ActivationType activation_enable =
int aligned_channel_num = fpga::get_aligned_channel_num(input_x->dims()[1]); paddle_mobile::fpga::NONE;
fpga::format_fp16_ofm(out, aligned_channel_num); int16_t leaky_relu_negative_slope = 0;
auto out_ptr = out->mutable_data<float>(); auto *input_x = const_cast<LoDTensor *>(param->InputX());
auto input_x_ptr = input_x->data<half>();
auto input_y_ptr = input_y->data<half>();
fpga::format_fp16_ofm(out);
auto out_ptr = out->mutable_data<half>();
fpga::EWAddArgs ewaddArgs = {0}; fpga::EWAddArgs ewaddArgs = {0};
ewaddArgs.relu_enabled = relu_enabled; // ewaddArgs.relu_enabled = relu_enabled;
ewaddArgs.output.activation.activation_type = activation_enable;
ewaddArgs.output.activation.leaky_relu_negative_slope =
leaky_relu_negative_slope;
ewaddArgs.const0 = 0x3c00; // =1 ewaddArgs.const0 = 0x3c00; // =1
ewaddArgs.const1 = 0x3c00; // =1 ewaddArgs.const1 = 0x3c00; // =1
ewaddArgs.image0.address = input_x_ptr; ewaddArgs.image0.address = input_x_ptr;
...@@ -50,14 +58,133 @@ bool ElementwiseAddKernel<FPGA, float>::Init(ElementwiseAddParam<FPGA> *param) { ...@@ -50,14 +58,133 @@ bool ElementwiseAddKernel<FPGA, float>::Init(ElementwiseAddParam<FPGA> *param) {
ewaddArgs.image1.pad_width = 0; ewaddArgs.image1.pad_width = 0;
ewaddArgs.output.scale_address = out->scale; ewaddArgs.output.scale_address = out->scale;
ewaddArgs.output.address = out_ptr; ewaddArgs.output.address = out_ptr;
fpga::expand_EW_arg(&ewaddArgs);
param->SetFpgaArgs(ewaddArgs); param->SetFpgaArgs(ewaddArgs);
} else {
param->float_input_x.Resize(param->InputX()->dims());
param->float_input_x.init(type_id<float>().hash_code());
fpga::format_fp32_ofm(&(param->float_input_x));
param->float_out.Resize(param->InputX()->dims());
param->float_out.mutable_data<float>(param->InputX()->dims());
fpga::format_fp32_ofm(&(param->float_out));
fpga::format_fp16_ofm(out);
}
return true; return true;
} }
inline void ElementwiseAddCompute(const ElementwiseAddParam<FPGA> &param) {
auto input_x = param.float_input_x;
auto input_y = param.InputY();
auto Out = param.float_out;
int axis = param.Axis();
const auto &x_dims = input_x.dims();
const auto &y_dims = input_y->dims();
/// axis = -1 represent the last dimensions.
axis = (axis == -1 ? x_dims.size() - y_dims.size() : axis);
size_t batch = 1;
size_t channels = 1;
size_t elementwise_num = 1;
for (int i = 0; i < axis; ++i) {
batch *= x_dims[i];
}
for (int i = 0; i < y_dims.size(); ++i) {
channels *= y_dims[i];
}
for (int i = y_dims.size() + axis; i < x_dims.size(); ++i) {
elementwise_num *= x_dims[i];
}
const float *bias_data = input_y->data<float>();
const float *input_data = input_x.data<float>();
float *output_data = Out.mutable_data<float>();
for (int i = 0; i < batch; ++i) {
for (int j = 0; j < channels; ++j) {
size_t offset = (i * channels + j) * elementwise_num;
const float *input = input_data + offset;
const float bias = bias_data[j];
float *output = output_data + offset;
// DLOG << "output address: "<< output;
for (int k = 0; k < elementwise_num; ++k) {
output[k] = input[k] + bias;
// DLOG << "output[" << k << "]= " << output[k] ;
}
}
}
}
template <> template <>
void ElementwiseAddKernel<FPGA, float>::Compute( void ElementwiseAddKernel<FPGA, float>::Compute(
const ElementwiseAddParam<FPGA> &param) { const ElementwiseAddParam<FPGA> &param) {
auto input_y = const_cast<LoDTensor *>(param.InputY());
if (input_y->type() != type_id<float>()) {
fpga::ComputeFpgaEWAdd(param.FpgaArgs()); fpga::ComputeFpgaEWAdd(param.FpgaArgs());
} else {
auto input_x = const_cast<LoDTensor *>(param.InputX());
auto intput_x_float = const_cast<Tensor *>(&(param.float_input_x));
fpga::BypassArgs args = {fpga::DATA_TYPE_FP16};
args.input_data_type = fpga::DATA_TYPE_FP16;
args.output_data_type = fpga::DATA_TYPE_FP32;
args.input_layout_type = fpga::LAYOUT_CHW;
args.output_layout_type = fpga::LAYOUT_HWC;
args.image.address = input_x->data<half>();
args.image.channels = (uint32_t)(input_x->fpga_data_num);
args.image.height = 1;
args.image.width = 1;
args.image.pad_height = 0;
args.image.pad_width = 0;
args.output.address = intput_x_float->data<float>();
args.output.scale_address = intput_x_float->scale;
// fpga::fpga_flush(input_x->data<half>(),input_x->fpga_data_num *
// sizeof(half));
fpga::PerformBypass(args);
fpga::fpga_invalidate(args.output.address,
input_x->fpga_data_num * sizeof(float));
// just for test
/* {
static int cnt = 0;
if(cnt == 0){
std::string str= "first_bypass_data";
float rslt = 0.0f;
fpga::savefile(str, args.output.address, input_x->fpga_data_num,
rslt); cnt++;
}
}*/
ElementwiseAddCompute(param);
auto out_float = const_cast<Tensor *>(&(param.float_out));
DLOG << "out float: " << out_float->data<float>();
fpga::fpga_flush(out_float->data<float>(),
input_x->fpga_data_num * sizeof(float));
// just for test
/*{
static int cnt = 0;
if(cnt == 0){
std::string str= "ew_output_data";
float rslt = 0.0f;
fpga::savefile(str, out_float->data<float>(), input_x->fpga_data_num,
rslt); cnt++;
}
}*/
auto Out = param.Out();
args.input_data_type = fpga::DATA_TYPE_FP32;
args.output_data_type = fpga::DATA_TYPE_FP16;
args.input_layout_type = fpga::LAYOUT_CHW;
args.output_layout_type = fpga::LAYOUT_HWC;
args.image.address = out_float->data<float>();
args.image.channels = (uint32_t)(input_x->fpga_data_num);
args.image.height = 1;
args.image.width = 1;
args.image.pad_height = 0;
args.image.pad_width = 0;
args.output.address = Out->data<half>();
args.output.scale_address = Out->scale;
fpga::PerformBypass(args);
}
} }
} // namespace operators } // namespace operators
} // namespace paddle_mobile } // namespace paddle_mobile
......
...@@ -21,18 +21,23 @@ namespace operators { ...@@ -21,18 +21,23 @@ namespace operators {
template <> template <>
bool ElementwiseAddReluKernel<FPGA, float>::Init( bool ElementwiseAddReluKernel<FPGA, float>::Init(
ElementwiseAddReluParam<FPGA> *param) { ElementwiseAddReluParam<FPGA> *param) {
bool relu_enabled = true; // bool relu_enabled = true;
paddle_mobile::fpga::ActivationType activation_enable =
paddle_mobile::fpga::LEAKYRELU;
int16_t leaky_relu_negative_slope = 0;
auto *input_x = const_cast<LoDTensor *>(param->InputX()); auto *input_x = const_cast<LoDTensor *>(param->InputX());
auto *input_y = const_cast<LoDTensor *>(param->InputY()); auto *input_y = const_cast<LoDTensor *>(param->InputY());
auto *out = param->Out(); auto *out = param->Out();
auto input_x_ptr = input_x->data<float>(); auto input_x_ptr = input_x->data<half>();
auto input_y_ptr = input_y->data<float>(); auto input_y_ptr = input_y->data<half>();
int aligned_channel_num = fpga::get_aligned_channel_num(input_x->dims()[1]); fpga::format_fp16_ofm(out);
fpga::format_fp16_ofm(out, aligned_channel_num); auto out_ptr = out->mutable_data<half>();
auto out_ptr = out->mutable_data<float>();
fpga::EWAddArgs ewaddArgs = {0}; fpga::EWAddArgs ewaddArgs = {0};
ewaddArgs.relu_enabled = relu_enabled; // ewaddArgs.relu_enabled = relu_enabled;
ewaddArgs.output.activation.activation_type = activation_enable;
ewaddArgs.output.activation.leaky_relu_negative_slope =
leaky_relu_negative_slope;
ewaddArgs.const0 = 0x3c00; // =1 ewaddArgs.const0 = 0x3c00; // =1
ewaddArgs.const1 = 0x3c00; // =1 ewaddArgs.const1 = 0x3c00; // =1
ewaddArgs.image0.address = input_x_ptr; ewaddArgs.image0.address = input_x_ptr;
...@@ -51,6 +56,7 @@ bool ElementwiseAddReluKernel<FPGA, float>::Init( ...@@ -51,6 +56,7 @@ bool ElementwiseAddReluKernel<FPGA, float>::Init(
ewaddArgs.image1.pad_width = 0; ewaddArgs.image1.pad_width = 0;
ewaddArgs.output.scale_address = out->scale; ewaddArgs.output.scale_address = out->scale;
ewaddArgs.output.address = out_ptr; ewaddArgs.output.address = out_ptr;
fpga::expand_EW_arg(&ewaddArgs);
param->SetFpgaArgs(ewaddArgs); param->SetFpgaArgs(ewaddArgs);
return true; return true;
} }
......
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifdef ELEMENTWISEMUL_OP
#include "operators/kernel/elementwise_mul_kernel.h"
#include "operators/math/elementwise_op_function.h"
namespace paddle_mobile {
namespace operators {
template <typename T>
struct MulFunctor {
inline T operator()(T a, T b) const { return a * b; }
};
template <>
bool ElementwiseMulKernel<FPGA, float>::Init(ElementwiseMulParam<FPGA> *param) {
param->float_input_x.Resize(param->InputX()->dims());
param->float_input_x.init(type_id<float>().hash_code());
fpga::format_fp32_ofm(&(param->float_input_x));
param->float_out.Resize(param->InputX()->dims());
param->float_out.init(type_id<float>().hash_code());
fpga::format_fp32_ofm(&(param->float_out));
auto *out = param->Out();
fpga::format_fp16_ofm(out);
return true;
}
template <>
void ElementwiseMulKernel<FPGA, float>::Compute(
const ElementwiseMulParam<FPGA> &param) {
auto input_x = const_cast<LoDTensor *>(param.InputX());
auto intput_x_float = const_cast<Tensor *>(&(param.float_input_x));
// auto intput_x_32_ptr =
// const_cast<float*>(param.float_input_x.data<float>());
fpga::BypassArgs args = {fpga::DATA_TYPE_FP16};
args.input_data_type = fpga::DATA_TYPE_FP16;
args.output_data_type = fpga::DATA_TYPE_FP32;
args.input_layout_type = fpga::LAYOUT_CHW;
args.output_layout_type = fpga::LAYOUT_HWC;
args.image.address = input_x->data<half>();
args.image.channels = (uint32_t)(input_x->fpga_data_num);
args.image.height = 1;
args.image.width = 1;
args.image.pad_height = 0;
args.image.pad_width = 0;
args.output.address = intput_x_float->data<float>();
args.output.scale_address = intput_x_float->scale;
fpga::PerformBypass(args);
fpga::fpga_invalidate(args.output.address,
input_x->fpga_data_num * sizeof(float));
auto input_y = param.InputY();
int axis = param.Axis();
auto out_float = const_cast<Tensor *>(&(param.float_out));
ElementwiseComputeEx<MulFunctor<float>, float>(
intput_x_float, input_y, axis, MulFunctor<float>(), out_float);
fpga::fpga_flush(out_float->data<float>(),
input_x->fpga_data_num * sizeof(float));
Tensor *Out = param.Out();
args.input_data_type = fpga::DATA_TYPE_FP32;
args.output_data_type = fpga::DATA_TYPE_FP16;
args.input_layout_type = fpga::LAYOUT_CHW;
args.output_layout_type = fpga::LAYOUT_HWC;
args.image.address = out_float->data<float>();
args.image.channels = (uint32_t)(Out->fpga_data_num);
args.image.height = 1;
args.image.width = 1;
args.image.pad_height = 0;
args.image.pad_width = 0;
args.output.address = Out->data<half>();
args.output.scale_address = Out->scale;
fpga::PerformBypass(args);
}
} // namespace operators
} // namespace paddle_mobile
#endif
...@@ -13,37 +13,85 @@ See the License for the specific language governing permissions and ...@@ -13,37 +13,85 @@ See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#include "operators/kernel/feed_kernel.h" #include "operators/kernel/feed_kernel.h"
#include "fpga/V2/filter.h"
namespace paddle_mobile { namespace paddle_mobile {
namespace operators { namespace operators {
template <> template <>
bool FeedKernel<FPGA, float>::Init(FeedParam<FPGA> *param) { bool FeedKernel<FPGA, float>::Init(FeedParam<FPGA> *param) {
Tensor *output = param->Out(); auto output = param->Out();
int aligned_channel = fpga::get_aligned_channel_num(output->dims()[1]); int col = param->Col();
fpga::format_fp16_ofm(output, aligned_channel); DLOG << "col = " << col;
auto input = const_cast<LoDTensor *>(&param->InputX()->at(col));
input->init(type_id<float>().hash_code());
input->Resize(output->dims());
if (output->dims().size() != 4) {
return true;
}
fpga::format_fp16_ofm(output);
return true; return true;
} }
template <> template <>
void FeedKernel<FPGA, float>::Compute(const FeedParam<FPGA> &param) { void FeedKernel<FPGA, float>::Compute(const FeedParam<FPGA> &param) {
auto input = auto output = param.Out();
reinterpret_cast<Tensor *>(const_cast<LoDTensor *>(param.InputX())); int col = param.Col();
fpga::format_image(input); auto input = const_cast<LoDTensor *>(&param.InputX()->at(col));
auto input_ptr = input->data<float>(); kTypeId_t input_type = input->type();
Tensor *output = param.Out();
if (input_type == type_id<float>()) {
input->init(type_id<float>().hash_code());
} else {
input->init(type_id<int8_t>().hash_code());
}
input->Resize(output->dims());
if (output->dims().size() != 4) {
size_t size = output->numel() * sizeof(float);
auto output_ptr = output->data<float>(); auto output_ptr = output->data<float>();
auto channel = input->dims()[1]; auto input_ptr = input->data<float>();
uint32_t aligned_channels = auto external_ptr = reinterpret_cast<float *>(input->external_data);
fpga::filter::calc_aligned_channel((int)channel); // NOLINT float *p_data = external_ptr == nullptr ? input_ptr : external_ptr;
memcpy(output_ptr, p_data, size);
input->external_data = nullptr;
return;
}
fpga::format_image(input);
auto output_ptr = output->data<half>();
fpga::BypassArgs args = {fpga::DATA_TYPE_FP32}; fpga::BypassArgs args = {fpga::DATA_TYPE_FP32};
if (input_type == type_id<float>()) {
auto input_ptr = input->data<float>();
auto external_ptr = reinterpret_cast<float *>(input->external_data);
float *p_data = external_ptr == nullptr ? input_ptr : external_ptr;
args.input_data_type = fpga::DATA_TYPE_FP32; args.input_data_type = fpga::DATA_TYPE_FP32;
args.output_data_type = fpga::DATA_TYPE_FP16; args.output_data_type = fpga::DATA_TYPE_FP16;
args.input_layout_type = fpga::LAYOUT_CHW; args.input_layout_type = fpga::LAYOUT_CHW;
args.output_layout_type = fpga::LAYOUT_HWC; args.output_layout_type = fpga::LAYOUT_HWC;
args.image.address = reinterpret_cast<void *>(input_ptr); args.image.address = p_data;
args.image.channels = aligned_channels; args.image.channels = (uint32_t)input->dims()[1];
args.image.height = (uint32_t)input->dims()[2];
args.image.width = (uint32_t)input->dims()[3];
args.image.pad_height = 0;
args.image.pad_width = 0;
args.output.address = output_ptr;
args.output.scale_address = output->scale;
fpga::PerformBypass(args);
input->external_data = nullptr;
} else {
auto input_ptr = input->data<int8_t>();
auto external_ptr = reinterpret_cast<int8_t *>(input->external_data);
int8_t *p_data = external_ptr == nullptr ? input_ptr : external_ptr;
args.input_data_type = fpga::DATA_TYPE_INT8;
args.output_data_type = fpga::DATA_TYPE_FP16;
args.input_layout_type = fpga::LAYOUT_CHW;
args.output_layout_type = fpga::LAYOUT_HWC;
args.image.address = p_data;
args.image.channels = (uint32_t)input->dims()[1];
args.image.height = (uint32_t)input->dims()[2]; args.image.height = (uint32_t)input->dims()[2];
args.image.width = (uint32_t)input->dims()[3]; args.image.width = (uint32_t)input->dims()[3];
args.image.pad_height = 0; args.image.pad_height = 0;
...@@ -51,6 +99,8 @@ void FeedKernel<FPGA, float>::Compute(const FeedParam<FPGA> &param) { ...@@ -51,6 +99,8 @@ void FeedKernel<FPGA, float>::Compute(const FeedParam<FPGA> &param) {
args.output.address = output_ptr; args.output.address = output_ptr;
args.output.scale_address = output->scale; args.output.scale_address = output->scale;
fpga::PerformBypass(args); fpga::PerformBypass(args);
input->external_data = nullptr;
}
} }
template class FeedKernel<FPGA, float>; template class FeedKernel<FPGA, float>;
......
...@@ -11,22 +11,116 @@ distributed under the License is distributed on an "AS IS" BASIS, ...@@ -11,22 +11,116 @@ distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#include "operators/kernel/fetch_kernel.h" #include "operators/kernel/fetch_kernel.h"
namespace paddle_mobile { namespace paddle_mobile {
namespace operators { namespace operators {
template <> template <>
bool FetchKernel<FPGA, float>::Init(FetchParam<FPGA> *param) { bool FetchKernel<FPGA, float>::Init(FetchParam<FPGA> *param) {
auto input = const_cast<LoDTensor *>(param->InputX());
int col = param->Col();
DLOG << "col = " << col;
auto output = &(param->Out()->at(col));
if (input->type() == type_id<float>()) {
return true; return true;
} }
output->init(type_id<float>().hash_code());
output->Resize(input->dims());
fpga::format_fp32_ofm(output);
int outC = 1;
int outH = 1;
int outW = 1;
if (output->dims().size() == 4) {
outC = output->dims()[1];
outH = output->dims()[2];
outW = output->dims()[3];
} else { // 2
outC = output->dims()[1];
}
int unalignedCW = outC * outW;
int alignedCW = fpga::align_to_x(unalignedCW, IMAGE_ALIGNMENT);
if (alignedCW != unalignedCW) {
param->aligned_out.Resize(input->dims());
param->aligned_out.mutable_data<float>(input->dims());
fpga::fpga_flush(param->aligned_out.data<float>(),
outH * unalignedCW * sizeof(float));
}
fpga::BypassArgs args = {fpga::DATA_TYPE_FP16};
args.input_data_type = fpga::DATA_TYPE_FP16;
args.output_data_type = fpga::DATA_TYPE_FP32;
args.input_layout_type = fpga::LAYOUT_CHW;
args.output_layout_type = fpga::LAYOUT_HWC;
args.image.address = input->data<half>();
args.image.channels = (uint32_t)(input->fpga_data_num);
args.image.height = 1;
args.image.width = 1;
args.image.pad_height = 0;
args.image.pad_width = 0;
args.output.address = output->data<float>();
args.output.scale_address = output->scale;
param->fpga_bypass_args = args;
return true;
}
void dealign(float *src, float *dst, int input_c, int input_h, int input_w) {
int alignCW = paddle_mobile::fpga::align_to_x(input_c * input_w, 16);
int dealignCW = input_c * input_w;
for (int h = 0; h < input_h; ++h) {
auto input_offset = h * alignCW;
auto output_offset = h * dealignCW;
memcpy((dst + output_offset), (src + input_offset),
dealignCW * sizeof(float));
}
}
template <> template <>
void FetchKernel<FPGA, float>::Compute(const FetchParam<FPGA> &param) { void FetchKernel<FPGA, float>::Compute(const FetchParam<FPGA> &param) {
param.Out()->ShareDataWith(*(param.InputX())); auto input = const_cast<LoDTensor *>(param.InputX());
} int col = param.Col();
auto output = &param.Out()->at(col);
if (input->type() == type_id<float>()) {
output->ShareDataWith(*input);
return;
}
fpga::BypassArgs args = param.fpga_bypass_args;
auto input_address = (input->data<half>());
args.image.address = static_cast<void *>(input_address);
float *outdata_ptr =
reinterpret_cast<float *>(param.fpga_bypass_args.output.address);
const int num_th = 32;
if (output->fpga_data_num < num_th) {
fpga::fpga_invalidate(input_address, (input->fpga_data_num) * sizeof(half));
for (int idx = 0; idx < product(input->dims()); ++idx) {
outdata_ptr[idx] = fpga::fp16_2_fp32(input_address[idx]);
}
return;
}
fpga::PerformBypass(args);
int outC = 1;
int outH = 1;
int outW = 1;
if (output->dims().size() == 4) {
outC = output->dims()[1];
outH = output->dims()[2];
outW = output->dims()[3];
} else { // 2
outC = output->dims()[1];
}
fpga::fpga_invalidate(param.fpga_bypass_args.output.address,
output->fpga_data_num * sizeof(float));
int unalignedCW = outC * outW;
int alignedCW = fpga::align_to_x(unalignedCW, IMAGE_ALIGNMENT);
if (unalignedCW != alignedCW) {
auto aligned_ptr = const_cast<float *>(param.aligned_out.data<float>());
dealign(outdata_ptr, aligned_ptr, outC, outH, outW);
memcpy(outdata_ptr, aligned_ptr, outC * outH * outW * sizeof(float));
fpga::fpga_flush(outdata_ptr, outC * outH * outW * sizeof(float));
}
}
template class FetchKernel<FPGA, float>; template class FetchKernel<FPGA, float>;
} // namespace operators } // namespace operators
......
...@@ -20,15 +20,18 @@ namespace operators { ...@@ -20,15 +20,18 @@ namespace operators {
template <> template <>
bool FusionFcKernel<FPGA, float>::Init(FusionFcParam<FPGA> *param) { bool FusionFcKernel<FPGA, float>::Init(FusionFcParam<FPGA> *param) {
bool relu_enabled = false; // bool relu_enabled = false;
paddle_mobile::fpga::ActivationType activation_enable =
paddle_mobile::fpga::NONE;
int16_t leaky_relu_negative_slope = 0;
auto input_x = const_cast<LoDTensor *>(param->InputX()); auto input_x = const_cast<LoDTensor *>(param->InputX());
auto filter = const_cast<Tensor *>(param->InputY()); auto filter = const_cast<LoDTensor *>(param->InputY());
const Tensor *input_z = param->InputZ(); const Tensor *input_z = param->InputZ();
auto input_z_ptr = input_z->data<float>(); auto input_z_ptr = input_z->data<float>();
auto out = param->Out(); auto out = param->Out();
PADDLE_MOBILE_ENFORCE(input_x->dims()[1] == filter->dims()[0], // PADDLE_MOBILE_ENFORCE(input_x->dims()[1] == filter->dims()[0],
"Image channel should be equal to weight number"); // "Image channel should be equal to weight number");
int channel = (uint32_t)out->dims()[1]; int channel = (uint32_t)out->dims()[1];
auto bs_ptr = auto bs_ptr =
(float *)fpga::fpga_malloc(2 * channel * sizeof(float)); // NOLINT (float *)fpga::fpga_malloc(2 * channel * sizeof(float)); // NOLINT
...@@ -47,11 +50,16 @@ bool FusionFcKernel<FPGA, float>::Init(FusionFcParam<FPGA> *param) { ...@@ -47,11 +50,16 @@ bool FusionFcKernel<FPGA, float>::Init(FusionFcParam<FPGA> *param) {
out->Resize(framework::make_ddim({1, channel, 1, 1})); out->Resize(framework::make_ddim({1, channel, 1, 1}));
filter->Resize(framework::make_ddim({num, filter_channel, height, width})); filter->Resize(framework::make_ddim({num, filter_channel, height, width}));
fpga::format_fc_data(filter, out, &bs_ptr); float max_value = fpga::filter_find_max(filter);
fpga::format_fc_filter(filter, max_value);
int element_num_per_div = fpga::get_filter_num_per_div(filter, 1);
fpga::format_bias_scale_array(&bs_ptr, element_num_per_div, channel);
fpga::format_fp16_ofm(out);
fpga::SplitConvArgs conv_arg = {0}; fpga::SplitConvArgs conv_arg = {0};
fpga::fill_split_arg(&conv_arg, input_x, out, filter, relu_enabled, 1, 1, 1, fpga::fill_split_arg(&conv_arg, input_x, out, filter, activation_enable,
0, 0, bs_ptr); leaky_relu_negative_slope, 1, 1, 1, 0, 0, bs_ptr);
param->SetFpgaArgs(conv_arg); param->SetFpgaArgs(conv_arg);
return true; return true;
} }
......
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifdef FUSION_FCRELU_OP
#include "operators/kernel/fc_relu_kernel.h"
namespace paddle_mobile {
namespace operators {
template <>
bool FusionFcReluKernel<FPGA, float>::Init(FusionFcReluParam<FPGA> *param) {
// bool relu_enabled = false;
paddle_mobile::fpga::ActivationType activation_enable =
paddle_mobile::fpga::LEAKYRELU;
int16_t leaky_relu_negative_slope = 0;
auto input_x = const_cast<LoDTensor *>(param->InputX());
auto filter = const_cast<LoDTensor *>(param->InputY());
const Tensor *input_z = param->InputZ();
auto input_z_ptr = input_z->data<float>();
auto out = param->Out();
// PADDLE_MOBILE_ENFORCE(input_x->dims()[1] == filter->dims()[0],
// "Image channel should be equal to weight number");
int channel = (uint32_t)out->dims()[1];
auto bs_ptr =
(float *)fpga::fpga_malloc(2 * channel * sizeof(float)); // NOLINT
for (int i = 0; i < channel; i++) {
bs_ptr[i + channel] = 1;
bs_ptr[i] = input_z_ptr[i];
}
int num = (uint32_t)filter->dims()[1];
int chw = (uint32_t)filter->dims()[0];
PADDLE_MOBILE_ENFORCE(
chw == input_x->numel(),
"Filter element num should be equal to IFM element num");
int height = (uint32_t)input_x->dims()[2];
int width = (uint32_t)input_x->dims()[3];
int filter_channel = chw / height / width;
out->Resize(framework::make_ddim({1, channel, 1, 1}));
filter->Resize(framework::make_ddim({num, filter_channel, height, width}));
float max_value = fpga::filter_find_max(filter);
fpga::format_fc_filter(filter, max_value);
int element_num_per_div = fpga::get_filter_num_per_div(filter, 1);
fpga::format_bias_scale_array(&bs_ptr, element_num_per_div, channel);
fpga::format_fp16_ofm(out);
fpga::SplitConvArgs conv_arg = {0};
fpga::fill_split_arg(&conv_arg, input_x, out, filter, activation_enable,
leaky_relu_negative_slope, 1, 1, 1, 0, 0, bs_ptr);
param->SetFpgaArgs(conv_arg);
return true;
}
template <>
void FusionFcReluKernel<FPGA, float>::Compute(
const FusionFcReluParam<FPGA> &param) {
fpga::ComputeFpgaConv(param.FpgaArgs());
}
} // namespace operators
} // namespace paddle_mobile
#endif
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifdef PAD2D_OP
#include "operators/kernel/pad2d_kernel.h"
namespace paddle_mobile {
namespace operators {
template <>
bool Pad2DKernel<FPGA, float>::Init(Pad2DParam<FPGA> *param) {
Tensor *output = param->output_;
fpga::format_fp16_ofm(output);
return true;
}
void pad2dFunc(const framework::Tensor *input, framework::Tensor *output) {
auto input_data = (input->data<half>());
auto output_data = (output->data<half>());
auto input_c = input->dims()[1];
auto input_h = input->dims()[2];
auto input_w = input->dims()[3];
auto output_c = output->dims()[1];
auto output_w = output->dims()[3];
auto copysize = input_c * input_w;
for (int h = 0; h < input_h; ++h) {
auto input_offset = h * input_c * input_w;
auto output_offset = h * paddle_mobile::fpga::align_to_x(
output_c * output_w, IMAGE_ALIGNMENT);
memcpy((output_data + output_offset), (input_data + input_offset),
copysize * sizeof(half));
}
}
template <>
void Pad2DKernel<FPGA, float>::Compute(const Pad2DParam<FPGA> &param) {
auto in_x = param.input_;
auto out = param.output_;
fpga::fpga_invalidate((void *)in_x->data<half>(), // NOLINT
in_x->numel() * sizeof(half));
pad2dFunc(in_x, out);
(out->scale)[0] = (in_x->scale)[0];
(out->scale)[1] = (in_x->scale)[1];
DLOG << (out->scale)[0];
DLOG << (out->scale)[1];
size_t outputSize =
out->dims()[2] *
paddle_mobile::fpga::align_to_x((out->dims()[1]) * (out->dims()[3]),
IMAGE_ALIGNMENT) *
sizeof(half);
fpga::fpga_flush(out->data<half>(), outputSize);
}
} // namespace operators
} // namespace paddle_mobile
#endif // PAD2D_OP
...@@ -21,18 +21,30 @@ namespace operators { ...@@ -21,18 +21,30 @@ namespace operators {
template <> template <>
bool PoolKernel<FPGA, float>::Init(PoolParam<FPGA> *param) { bool PoolKernel<FPGA, float>::Init(PoolParam<FPGA> *param) {
auto *input = const_cast<Tensor *>(param->Input()); auto *input = const_cast<LoDTensor *>(param->Input());
auto input_ptr = input->data<float>(); auto *output = param->Output();
Tensor *output = param->Output();
int aligned_channel_num =
fpga::get_aligned_channel_num((int)output->dims()[1]); // NOLINT
fpga::format_fp16_ofm(output, aligned_channel_num);
auto output_ptr = output->mutable_data<float>();
vector<int> ksize = param->Ksize(); vector<int> ksize = param->Ksize();
vector<int> strides = param->Strides(); vector<int> strides = param->Strides();
vector<int> paddings = param->Paddings(); vector<int> paddings = param->Paddings();
std::string pooling_type = param->PoolingType(); std::string pooling_type = param->PoolingType();
if (input->type() == type_id<float>()) {
int channels = input->dims()[1];
int height = input->dims()[2];
int width = input->dims()[3];
int num = input->dims()[0];
int out_width = (width + 2 * paddings[1] - ksize[1]) / strides[1] + 1;
int out_height = (height + 2 * paddings[0] - ksize[0]) / strides[0] + 1;
framework::DDim dim =
framework::make_ddim({num, channels, out_height, out_width});
output->mutable_data<float>(dim);
return true;
}
auto input_ptr = input->data<half>();
fpga::format_fp16_ofm(output);
auto output_ptr = output->mutable_data<half>();
fpga::PoolingArgs poolArgs = {0}; fpga::PoolingArgs poolArgs = {0};
poolArgs.mode = pooling_type == "max" ? 0 : 1; // max:0, avg:1 poolArgs.mode = pooling_type == "max" ? 0 : 1; // max:0, avg:1
poolArgs.kernel_reciprocal = poolArgs.kernel_reciprocal =
...@@ -56,6 +68,34 @@ bool PoolKernel<FPGA, float>::Init(PoolParam<FPGA> *param) { ...@@ -56,6 +68,34 @@ bool PoolKernel<FPGA, float>::Init(PoolParam<FPGA> *param) {
template <> template <>
void PoolKernel<FPGA, float>::Compute(const PoolParam<FPGA> &param) { void PoolKernel<FPGA, float>::Compute(const PoolParam<FPGA> &param) {
auto *input = const_cast<LoDTensor *>(param.Input());
if (input->type() == type_id<float>()) {
auto *output = param.Output();
auto in = input->data<float>();
auto N = input->dims()[0];
output->Resize(
{N, output->dims()[1], output->dims()[2], output->dims()[3]});
auto len = output->numel();
auto out = output->mutable_data<float>();
int C = input->dims()[1], H = input->dims()[2], // N = input->dims()[0],
W = input->dims()[3];
int HW = H * W, CHW = C * H * W, WC = W * C;
for (int n = 0; n < N; n++) {
for (int c = 0; c < C; c++) {
out[n * C + c] = 0;
for (int h = 0; h < H; h++) {
for (int w = 0; w < W; w++) {
out[n * C + c] += in[n * CHW + h * WC + w * C +
c]; // in[n * CHW + c * HW + h * W + w]; //
}
}
out[n * C + c] /= HW;
}
}
return;
}
fpga::ComputeFpgaPool(param.FpgaArgs()); fpga::ComputeFpgaPool(param.FpgaArgs());
} }
} // namespace operators } // namespace operators
......
此差异已折叠。
此差异已折叠。
...@@ -12,24 +12,21 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ...@@ -12,24 +12,21 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#ifdef FUSION_DECONVRELU_OP #ifdef RELU_OP
#include "operators/kernel/deconv_relu_kernel.h" #include "operators/kernel/activation_kernel.h"
#include "framework/operator.h"
#include "operators/op_param.h"
namespace paddle_mobile { namespace paddle_mobile {
namespace operators { namespace operators {
template <> template <>
bool DeconvReluKernel<FPGA, float>::Init(FusionDeconvReluParam<FPGA> *param) { bool ReluKernel<FPGA, float>::Init(ReluParam<FPGA> *param) {
param->Out()->ShareDataWith(*param->InputX());
return true; return true;
} }
template <> template <>
void DeconvReluKernel<FPGA, float>::Compute( void ReluKernel<FPGA, float>::Compute(const ReluParam<FPGA> &param) {}
const FusionDeconvReluParam<FPGA> &param) {}
} // namespace operators } // namespace operators
} // namespace paddle_mobile } // namespace paddle_mobile
......
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册