提交 96242b6e 编写于 作者: qnqinan's avatar qnqinan

add fpga dwconv and reshape op and refactor code of fpga conv and deconv kernel

上级 c4531b38
......@@ -140,6 +140,16 @@ void format_filter(framework::Tensor *filter_tensor, float max_value,
max_value);
filter_tensor->reset_data_ptr(new_data);
}
void format_dwconv_filter(framework::Tensor *filter_tensor, float *scale_ptr) {
auto dims = filter_tensor->dims();
auto num = dims[0], height = dims[2], width = dims[3];
auto data_ptr = filter_tensor->data<float>();
size_t memory_size = num * height * width * sizeof(float);
auto new_data = (float *)fpga_malloc(memory_size); // NOLINT
fpga_copy(new_data, data_ptr, memory_size);
filter::format_dwconv_filter(&new_data, num, height, width, scale_ptr);
filter_tensor->reset_data_ptr(new_data);
}
void format_fc_filter(framework::Tensor *filter_tensor, float max_value) {
filter_tensor->scale[0] = float(max_value / 127.0); // NOLINT
......@@ -186,6 +196,9 @@ void format_bias_scale_array(float **bias_scale_array,
bias_scale::format_bias_scale_array(bias_scale_array,
element_num_per_division, num);
}
void format_bias_array(float **bias_array, int num) {
bias_scale::format_bias_array(bias_array, num);
}
void format_concat_output(framework::Tensor *out, int height, int width,
int image_num, uint32_t *channel_num) {
......@@ -200,7 +213,36 @@ void format_concat_output(framework::Tensor *out, int height, int width,
out->Resize(ddim);
out->reset_data_ptr(data_ptr);
}
void format_conv_data(framework::Tensor *filter_tensor,
framework::Tensor *ofm_tensor, float **bs_ptr,
int group) {
float max_value = fpga::filter_find_max(filter_tensor);
fpga::format_filter(filter_tensor, max_value, group);
int element_num_per_div = fpga::get_filter_num_per_div(filter_tensor, group);
fpga::format_bias_scale_array(bs_ptr, element_num_per_div,
ofm_tensor->dims()[1]);
fpga::format_fp16_ofm(ofm_tensor);
}
void format_deconv_data(framework::Tensor *filter_tensor,
framework::Tensor *ofm_tensor, float **bs_ptr,
int group, int sub_conv_n) {
int channel = ofm_tensor->dims()[1];
float max_value = filter_find_max(filter_tensor);
format_deconv_filter(filter_tensor, max_value, group, sub_conv_n);
int element_num_per_div =
get_deconv_filter_num_per_div(filter_tensor, group, sub_conv_n);
format_bias_scale_array(bs_ptr, element_num_per_div, channel * sub_conv_n);
format_fp16_ofm(ofm_tensor);
}
void format_dwconv_data(framework::Tensor *filter_tensor,
framework::Tensor *ofm_tensor, float *scale_ptr,
float **bias_ptr) {
auto channel = ofm_tensor->dims()[1];
format_dwconv_filter(filter_tensor, scale_ptr);
format_bias_array(bias_ptr, channel);
format_fp16_ofm(ofm_tensor);
}
void expand_conv_arg(ConvArgs *arg) {
ConvArgs args = *arg;
......@@ -360,7 +402,6 @@ void expand_EW_arg(EWAddArgs *arg) {
(*arg).driver.output_address_phy = output_address_phy;
(*arg).driver.coefficient = coefficient;
(*arg).driver.cmd = cmd;
} // expand_EW_arg
void fill_split_arg(struct SplitConvArgs *arg, framework::Tensor *input,
......@@ -399,7 +440,8 @@ void fill_split_arg(struct SplitConvArgs *arg, framework::Tensor *input,
auto channel = (int)out->dims()[1]; // NOLINT
int filter_num_per_div = get_filter_num_per_div(filter, group_num);
int element_num = get_aligned_filter_element_num(
(int)(filter->dims()[1] * filter->dims()[2] * filter->dims()[3]));
(int)(filter->dims()[1] * filter->dims()[2] * // NOLINT
filter->dims()[3]));
for (int i = 0; i < n; i++) {
arg->conv_arg[i].relu_enabled = relu_enabled;
......@@ -424,8 +466,8 @@ void fill_split_arg(struct SplitConvArgs *arg, framework::Tensor *input,
element_num *
align_to_x(arg->conv_arg[i].filter_num, FILTER_NUM_ALIGNMENT) *
sizeof(int8_t);
auto filter_head =
&((int8_t *)filter_ptr)[i * element_num * filter_num_per_div];
auto filter_head = &(
(int8_t *)filter_ptr)[i * element_num * filter_num_per_div]; // NOLINT
arg->conv_arg[i].filter_address = fpga_malloc(filter_size);
memcpy(arg->conv_arg[i].filter_address, filter_head, filter_size);
fpga_flush(arg->conv_arg[i].filter_address, filter_size);
......@@ -441,11 +483,12 @@ void fill_split_arg(struct SplitConvArgs *arg, framework::Tensor *input,
if (n > 1) {
arg->conv_arg[i].output.scale_address =
(float *)fpga_malloc(2 * sizeof(float)); // NOLINT
arg->conv_arg[i].output.address = fpga_malloc(
out->dims()[2] *
align_to_x((int)(out->dims()[3] * arg->conv_arg[i].filter_num),
IMAGE_ALIGNMENT) *
sizeof(half));
arg->conv_arg[i].output.address =
fpga_malloc(out->dims()[2] *
align_to_x((int)(out->dims()[3] * // NOLINT
arg->conv_arg[i].filter_num),
IMAGE_ALIGNMENT) *
sizeof(half));
} else {
arg->conv_arg[i].output.scale_address = out->scale;
arg->conv_arg[i].output.address = out_ptr;
......@@ -474,22 +517,23 @@ void fill_deconv_arg(struct DeconvArgs *arg, framework::Tensor *input,
arg->sub_conv_num = (uint32_t)stride_h;
arg->filter_num = (uint32_t)filter->dims()[0];
uint32_t sub_conv_num = arg->sub_conv_num;
int sub_pad = deconv_filter::deconv_calc_sub_pad((int)filter->dims()[3],
padding_w, stride_w);
int sub_pad =
deconv_filter::deconv_calc_sub_pad((int)filter->dims()[3], // NOLINT
padding_w, stride_w);
auto sub_filter_width = (uint32_t)deconv_filter::deconv_get_sub_filter_axis(
(int)filter->dims()[3], stride_w);
(int)filter->dims()[3], stride_w); // NOLINT
auto sub_output_width = (uint32_t)deconv_filter::deconv_get_sub_out_axis(
(int)input->dims()[3], sub_pad, sub_filter_width);
(int)input->dims()[3], sub_pad, sub_filter_width); // NOLINT
auto sub_output_height = (uint32_t)deconv_filter::deconv_get_sub_out_axis(
(int)input->dims()[2], sub_pad, sub_filter_width);
(int)input->dims()[2], sub_pad, sub_filter_width); // NOLINT
arg->sub_output_width = (uint32_t)sub_output_width;
arg->sub_output_height = (uint32_t)sub_output_height;
arg->omit_size = (uint32_t)deconv_filter::deconv_get_omit(
stride_w, (int)filter->dims()[3], padding_w);
stride_w, (int)filter->dims()[3], padding_w); // NOLINT
auto sub_channels = (int)input->dims()[1];
auto sub_channels = (int)input->dims()[1]; // NOLINT
uint32_t omit_size = arg->omit_size;
int real_out_width = sub_output_width * sub_conv_num - 2 * omit_size;
int sub_filter_num = sub_conv_num * (arg->filter_num);
......@@ -499,7 +543,7 @@ void fill_deconv_arg(struct DeconvArgs *arg, framework::Tensor *input,
fpga::format_fp16_ofm(out, dims_out_new);
auto out_ptr = out->data<float>();
arg->output.address =
(half *)out_ptr +
(half *)out_ptr + // NOLINT
omit_size * sizeof(half) *
(align_to_x(real_out_width * arg->filter_num, IMAGE_ALIGNMENT));
arg->output.scale_address = out->scale;
......@@ -510,31 +554,31 @@ void fill_deconv_arg(struct DeconvArgs *arg, framework::Tensor *input,
uint32_t split_num =
group_num == 1 ? (uint32_t)get_deconv_plit_num(filter, sub_conv_num) : 1;
arg->split_conv_args =
(SplitConvArgs *)fpga_malloc(sub_conv_num * sizeof(SplitConvArgs));
arg->split_conv_args = (SplitConvArgs *)fpga_malloc( // NOLINT
sub_conv_num * sizeof(SplitConvArgs)); // NOLINT
for (int i = 0; i < sub_conv_num; ++i) {
arg->split_conv_args[i].filter_num =
(arg->sub_conv_num) * (arg->filter_num);
arg->split_conv_args[i].group_num = (uint32_t)group_num;
arg->split_conv_args[i].split_num = split_num;
arg->split_conv_args[i].conv_arg =
(ConvArgs *)fpga_malloc(split_num * sizeof(ConvArgs));
(ConvArgs *)fpga_malloc(split_num * sizeof(ConvArgs)); // NOLINT
arg->split_conv_args[i].concat_arg.height = sub_output_height;
arg->split_conv_args[i].concat_arg.width = sub_output_width;
arg->split_conv_args[i].concat_arg.image_num = split_num;
arg->split_conv_args[i].concat_arg.images_in =
(half **)fpga_malloc(split_num * sizeof(half *));
(half **)fpga_malloc(split_num * sizeof(half *)); // NOLINT
arg->split_conv_args[i].concat_arg.scales_in =
(float **)fpga_malloc(split_num * sizeof(float *));
(float **)fpga_malloc(split_num * sizeof(float *)); // NOLINT
arg->split_conv_args[i].concat_arg.channel_num =
(uint32_t *)fpga_malloc(split_num * sizeof(uint32_t));
(uint32_t *)fpga_malloc(split_num * sizeof(uint32_t)); // NOLINT
}
auto filter_num_per_div =
(uint32_t)get_deconv_filter_num_per_div(filter, group_num, stride_w);
int element_num = get_aligned_filter_element_num(
(int)(sub_channels * sub_filter_width * sub_filter_width));
(int)(sub_channels * sub_filter_width * sub_filter_width)); // NOLINT
int chw = sub_channels * sub_filter_width * sub_filter_width;
int division_capacity = filter::calc_division_capacity(chw);
......@@ -558,14 +602,15 @@ void fill_deconv_arg(struct DeconvArgs *arg, framework::Tensor *input,
out_addr_offset = 0;
} else {
auto ptr_output = (half *)out_ptr;
auto ptr_output = (half *)out_ptr; // NOLINT
out_addr_offset =
sizeof(half) * (sub_conv_num - 1 - i) *
(align_to_x(real_out_width * arg->filter_num, IMAGE_ALIGNMENT));
arg->split_conv_args[i].output.address = (void *)(ptr_output);
arg->split_conv_args[i].output.address = (void *)(ptr_output); // NOLINT
auto ptr_output_scale = (float *)fpga_malloc(2 * sizeof(float));
auto ptr_output_scale =
(float *)fpga_malloc(2 * sizeof(float)); // NOLINT
arg->split_conv_args[i].output.scale_address = ptr_output_scale;
}
......@@ -609,9 +654,9 @@ void fill_deconv_arg(struct DeconvArgs *arg, framework::Tensor *input,
align_to_x(arg->split_conv_args[i].conv_arg[j].filter_num,
FILTER_NUM_ALIGNMENT) *
sizeof(int8_t);
auto filter_head =
&((int8_t *)filter_ptr)[j * element_num * filter_num_per_div +
i * filter_sub_conv_offset];
auto filter_head = &((
int8_t *)filter_ptr)[j * element_num * filter_num_per_div + // NOLINT
i * filter_sub_conv_offset];
arg->split_conv_args[i].conv_arg[j].filter_address =
fpga_malloc(filter_size);
memcpy(arg->split_conv_args[i].conv_arg[j].filter_address, filter_head,
......@@ -634,10 +679,12 @@ void fill_deconv_arg(struct DeconvArgs *arg, framework::Tensor *input,
arg->split_conv_args[i].conv_arg[j].output.scale_address =
arg->split_conv_args[i].output.scale_address;
} else {
auto ptr_output = (half *)fpga_malloc(conv_output_size * sizeof(half));
auto ptr_output =
(half *)fpga_malloc(conv_output_size * sizeof(half)); // NOLINT
arg->split_conv_args[i].conv_arg[j].output.address =
(void *)((half *)ptr_output);
auto ptr_output_scale = (float *)fpga_malloc(2 * sizeof(float));
(void *)((half *)ptr_output); // NOLINT
auto ptr_output_scale =
(float *)fpga_malloc(2 * sizeof(float)); // NOLINT
arg->split_conv_args[i].conv_arg[j].output.scale_address =
ptr_output_scale;
}
......@@ -660,5 +707,30 @@ void fill_deconv_arg(struct DeconvArgs *arg, framework::Tensor *input,
fpga_free(bs_ptr);
} // fill_deconv_arg
void fill_dwconv_arg(struct DWconvArgs *arg, framework::Tensor *input,
framework::Tensor *out, framework::Tensor *filter,
bool relu_enabled, int stride_h, int stride_w,
int padding_h, int padding_w, float *bias_ptr) {
auto filter_ptr = filter->data<float>();
auto input_ptr = input->data<float>();
auto output_ptr = out->mutable_data<float>();
arg->relu_enabled = relu_enabled;
arg->bias_address = bias_ptr;
arg->filter_address = filter_ptr;
arg->kernel.height = filter->dims()[2];
arg->kernel.width = filter->dims()[3];
arg->kernel.stride_h = stride_h;
arg->kernel.stride_w = stride_w;
arg->image.address = input_ptr;
arg->image.channels = (uint32_t)input->dims()[1];
arg->image.height = (uint32_t)input->dims()[2];
arg->image.width = (uint32_t)input->dims()[3];
arg->image.pad_height = padding_h;
arg->image.pad_width = padding_w;
arg->image.scale_address = input->scale;
arg->output.address = output_ptr;
arg->output.scale_address = out->scale;
} // end dwconv arg fill
} // namespace fpga
} // namespace paddle_mobile
......@@ -14,6 +14,7 @@ limitations under the License. */
#pragma once
#include <string>
#include "fpga/common/fpga_common.h"
#include "fpga/common/pe.h"
#include "framework/tensor.h"
......@@ -40,6 +41,7 @@ void format_filter(framework::Tensor* filter_tensor, float max_value,
void format_fc_filter(framework::Tensor* filter_tensor, float max_value);
void format_bias_scale_array(float** bias_scale_array,
int element_num_per_division, int num);
void format_bias_array(float** bias_array, int num);
void format_concat_output(framework::Tensor* out, int height, int width,
int image_num, uint32_t* channel_num);
......@@ -51,16 +53,28 @@ void fill_deconv_arg(struct DeconvArgs* arg, framework::Tensor* input,
framework::Tensor* out, framework::Tensor* filter,
bool relu_enabled, int group_num, int stride_h,
int stride_w, int padding_h, int padding_w, float* bs_ptr);
void fill_dwconv_arg(struct DWconvArgs* arg, framework::Tensor* input,
framework::Tensor* out, framework::Tensor* filter,
bool relu_enabled, int stride_h, int stride_w,
int padding_h, int padding_w, float* bias_ptr);
void format_deconv_filter(framework::Tensor* filter_tensor, float max_value,
int group_num, int stride);
void format_dwconv_filter(framework::Tensor* filter_tensor, float* scale_ptr);
void format_conv_data(framework::Tensor* filter_tensor,
framework::Tensor* ofm_tensor, float** bs_ptr, int group);
void format_deconv_data(framework::Tensor* filter_tensor,
framework::Tensor* ofm_tensor, float** bs_ptr,
int group, int sub_conv_n);
void format_dwconv_data(framework::Tensor* filter_tensor,
framework::Tensor* ofm_tensor, float* scale_ptr,
float** bias_ptr);
template <typename Dtype>
void savefile(std::string filename, void* buffer, int dataSize, Dtype tmp) {
float data;
std::ofstream out(filename.c_str());
for (int i = 0; i < dataSize; ++i) {
data = (((Dtype*)buffer)[i]);
data = (((Dtype*)buffer)[i]); // NOLINT
out << data << std::endl;
}
out.close();
......
......@@ -82,6 +82,25 @@ void format_bias_scale_array(float **bias_scale_array,
interleave(bias_scale_array, div_num * element_num_after_division);
fpga_flush(*bias_scale_array, 2 * element_num_after_division * sizeof(float));
}
void format_bias_array(float **bias_array, int num) {
float *ptr_unaligned = *bias_array;
int num_before_align = num;
int num_after_align = align_to_x(num_before_align, BIAS_NUM_ALIGNMENT);
float *ptr_aligned =
(float *)fpga_malloc(num_after_align * sizeof(float)); // NOLINT
memset(ptr_aligned, 0, num_after_align * sizeof(float));
if (num < 16) {
memcpy(ptr_aligned, ptr_unaligned, num * sizeof(float));
for (int i = num; i < num_after_align; i++) {
ptr_aligned[i] = ptr_unaligned[i % num];
}
} else {
memcpy(ptr_aligned, ptr_unaligned, num * sizeof(float));
}
fpga_free(ptr_unaligned);
*bias_array = ptr_aligned;
}
} // namespace bias_scale
} // namespace fpga
......
......@@ -22,6 +22,7 @@ void align_element(float** data_in, int num_per_div_before_alignment, int num);
void interleave(float** data_in, int num_after_alignment);
void format_bias_scale_array(float** bias_scale_array,
int element_num_per_division, int num);
void format_bias_array(float** bias_array, int num);
} // namespace bias_scale
} // namespace fpga
......
......@@ -277,7 +277,84 @@ void format_fc_filter(float **data_in, int num, int channel, int height,
fpga_flush(*quantize_data, align_to_x(chw, FILTER_ELEMENT_ALIGNMENT) *
num_after_alignment * sizeof(char));
}
void convert_to_hwn(int16_t **data_in, int num, int height, int width) {
int16_t *tmp = *data_in;
int16_t *data_tmp =
(int16_t *)fpga_malloc(height * width * num * sizeof(int16_t)); // NOLINT
for (int n = 0; n < num; n++) {
for (int h = 0; h < height; h++) {
for (int w = 0; w < width; w++) {
*(data_tmp + h * width * num + w * num + n) = *((*data_in)++);
}
}
}
*data_in = data_tmp;
fpga_free(tmp);
}
void align_element_nw(int16_t **data_in, int num, int height, int width) {
int unalign_nw = num * width;
int align_nw = align_to_x(num * width, FILTER_ELEMENT_ALIGNMENT);
if (unalign_nw == align_nw) {
return;
} else {
int16_t *tmp = *data_in;
int num_element = height * align_nw;
int16_t *data_tmp =
(int16_t *)fpga_malloc(num_element * sizeof(int16_t)); // NOLINT
memset(data_tmp, 0, num_element * sizeof(int16_t));
if (unalign_nw >= FILTER_ELEMENT_ALIGNMENT) {
for (int h = 0; h < height; h++) {
int offset_unalign = h * unalign_nw;
int offset_align = h * align_nw;
for (int nw = 0; nw < unalign_nw; nw++) {
data_tmp[offset_align + nw] = *((*data_in) + offset_unalign + nw);
}
}
} else {
for (int h = 0; h < height; h++) {
int offset_unalign = h * unalign_nw;
int offset_align = h * align_nw;
for (int nw = 0; nw < align_nw; nw++) {
data_tmp[offset_align + nw] =
*((*data_in) + offset_unalign + nw % unalign_nw);
}
}
}
*data_in = data_tmp;
free(tmp);
}
}
void quantize_to_fp16(float **data_in, int num, int height, int width,
float *scale_ptr) {
float *tmp = *data_in;
int size = num * height * width;
int16_t *tmp_data = (int16_t *)fpga_malloc(size * sizeof(int16_t)); // NOLINT
for (int n = 0; n < num; n++) {
float scale_val = scale_ptr[n];
for (int h = 0; h < height; h++) {
for (int w = 0; w < width; w++) {
int index = n * height * width + h * width + w;
tmp_data[index] = fp32_2_fp16((*data_in)[index] * scale_val);
}
}
}
*data_in = (float *)tmp_data; // NOLINT
fpga_free(tmp);
}
void format_dwconv_filter(float **data_in, int num, int height, int width,
float *scale_ptr) {
quantize_to_fp16(data_in, num, height, width, scale_ptr);
int16_t **quantize_data = (int16_t **)data_in; // NOLINT
convert_to_hwn(quantize_data, num, height, width);
align_element_nw(quantize_data, num, height, width);
fpga_flush(*quantize_data, align_to_x(num * width, FILTER_ELEMENT_ALIGNMENT) *
height * sizeof(char));
}
} // namespace filter
} // namespace fpga
} // namespace paddle_mobile
......@@ -13,7 +13,7 @@ See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <cstdint>
namespace paddle_mobile {
namespace fpga {
namespace filter {
......@@ -38,6 +38,13 @@ void convert_fc_filter(char** data_in, int num, int chw);
void format_fc_filter(float** data_in, int num, int channel, int height,
int width, int group_num, float max);
void convert_to_hwn(int16_t** data_in, int num, int height, int width);
void align_element_nw(int16_t** data_in, int num, int height, int width);
void quantize_to_fp16(float** data_in, int num, int height, int width,
float* scale_ptr);
void format_dwconv_filter(float** data_in, int num, int height, int width,
float* scale_ptr);
} // namespace filter
} // namespace fpga
} // namespace paddle_mobile
......@@ -24,14 +24,13 @@ limitations under the License. */
#include <time.h>
#include <iomanip>
#include <iostream>
//#include <iostream>
#endif
namespace paddle_mobile {
namespace fpga {
using namespace driver; // NOLINT
using namespace std;
using namespace std; // NOLINT
#define USE_RELU 1
#define USE_BIAS 2
......@@ -53,7 +52,6 @@ using namespace std;
#define INTERRUPT_CONV 0x0004
#define INTERRUPT_POOLING 0x0008
#define INTERRUPT_EW 0x0010
//#define INTERRUPT_RESIZE 0x0020
/* Register offset */
#define REG_INTERRUPT 0x000
......@@ -73,9 +71,6 @@ using namespace std;
#define REG_FLASH_STATUS 0x218
#define REG_SN 0x220
//#define REG_READ_SCALE
//#define REG_WRITE_SCALE
/*bypass*/
#define REG_CONVERT_CMD 0x400
#define REG_CONVERT_SRC_ADDR 0x408
......@@ -236,8 +231,10 @@ int ComputeBasicConv(const struct ConvArgs &args) {
reg_writeq((uint64_t)args.group_num, REG_CONV_GROUP_NUMBER);
reg_writeq((uint64_t)args.filter_num, REG_CONV_FILTER_NUMBER);
reg_writeq((uint64_t)args.image.channels, REG_CONV_CHANNEL_NUMBER);
reg_writeq(*(uint64_t *)args.image.scale_address, REG_CONV_IMAGE_SCALE);
reg_writeq(*(uint64_t *)args.filter_scale_address, REG_CONV_FILTER_SCALE);
reg_writeq(*(uint64_t *)args.image.scale_address, // NOLINT
REG_CONV_IMAGE_SCALE);
reg_writeq(*(uint64_t *)args.filter_scale_address, // NOLINT
REG_CONV_FILTER_SCALE);
reg_writeq(args.driver.image_address_phy, REG_CONV_IMAGE_BASE_ADDR);
reg_writeq(args.driver.filter_address_phy, REG_CONV_FILTER_BASE_ADDR);
reg_writeq(args.driver.sb_address_phy, REG_CONV_SB_BASE_ADDR);
......@@ -280,7 +277,6 @@ int ComputeBasicConv(const struct ConvArgs &args) {
return ret;
#endif
return 0;
} // ComputeBasicConv
int ComputeFpgaPool(const struct PoolingArgs &args) {
......@@ -406,13 +402,11 @@ int ComputeFpgaPool(const struct PoolingArgs &args) {
output_scale = reg_readq(REG_SCALE_PARAMETER);
output_scale = (output_scale << 32) | (output_scale >> 32);
fpga_copy(args.output.scale_address, &output_scale, sizeof(float) * 2);
//*(args.output.timer_cnt) = reg_readq(REG_TIMER_COUNTER);
pthread_mutex_unlock(&g_fpgainfo.pe_data->mutex);
return ret;
#endif
return 0;
} // ComputeFpgaPool
int ComputeFpgaEWAdd(const struct EWAddArgs &args) {
......@@ -468,13 +462,10 @@ int ComputeFpgaEWAdd(const struct EWAddArgs &args) {
output_scale = reg_readq(REG_SCALE_PARAMETER);
output_scale = (output_scale << 32) | (output_scale >> 32);
fpga_copy(args.output.scale_address, &output_scale, sizeof(float) * 2);
//*(args.output.scale_address) = reg_readq(REG_SCALE_PARAMETER);
//*(args.output.timer_cnt) = reg_readq(REG_TIMER_COUNTER);
pthread_mutex_unlock(&g_fpgainfo.pe_data->mutex);
return ret;
#endif
return 0;
} // ComputeFpgaEWAdd
int PerformBypass(const struct BypassArgs &args) {
......@@ -588,13 +579,10 @@ int PerformBypass(const struct BypassArgs &args) {
output_scale = reg_readq(REG_SCALE_PARAMETER);
output_scale = (output_scale << 32) | (output_scale >> 32);
fpga_copy(args.output.scale_address, &output_scale, sizeof(float) * 2);
//*(args.output.scale_address) = reg_readq(REG_SCALE_PARAMETER);
//*(args.output.timer_cnt) = reg_readq(REG_TIMER_COUNTER);
pthread_mutex_unlock(&g_fpgainfo.pe_data->mutex);
return ret;
#endif
return 0;
} // PerformBypass
int ComputeFPGAConcat(const struct ConcatArgs &args) {
......@@ -647,13 +635,14 @@ void deconv_post_process(const struct DeconvArgs &args) {
for (int hh = 0; hh < origin_h; ++hh) {
int hx = (hh % sub_conv_n);
auto sub_t =
(int16_t *)(args.split_conv_args[sub_conv_n - hx - 1].output.address);
(int16_t *)(args.split_conv_args[sub_conv_n - hx - 1] // NOLINT
.output.address);
int hi = (hh / sub_conv_n);
if ((hh < omit_size) || (hh >= (origin_h - omit_size))) continue;
int sidx = (nn * origin_h * align_origin_w + hi * align_origin_w +
omit_size * channel);
fpga_copy((int16_t *)(args.output.address) + deconv_idx, sub_t + sidx,
sizeof(int16_t) * deconv_row_len);
fpga_copy((int16_t *)(args.output.address) + deconv_idx, // NOLINT
sub_t + sidx, sizeof(int16_t) * deconv_row_len); // NOLINT
deconv_idx += align_deconv_row_len;
}
}
......@@ -678,7 +667,7 @@ int ComputeFpgaDeconv(const struct DeconvArgs &args) {
#ifdef COST_TIME_PRINT
timeval start, end;
long dif_sec, dif_usec;
long dif_sec, dif_usec; // NOLINT
#endif
for (int i = 0; i < sub_conv_num; i++) {
......@@ -723,18 +712,16 @@ int ComputeFpgaDeconv(const struct DeconvArgs &args) {
#endif
// fpga_flush(args.output.scale_address, 2 * sizeof(float));
#ifdef COST_TIME_PRINT
gettimeofday(&start, NULL);
#endif
deconv_post_process(args);
#ifdef COST_TIME_PRINT
gettimeofday(&end, NULL);
dif_sec = end.tv_sec - start.tv_sec;
dif_usec = end.tv_usec - start.tv_usec;
std::cout << "deconv_post_process "
<< " cost time: " << (dif_sec * 1000000 + dif_usec) << "us"
<< std::endl;
#endif
/*#ifdef COST_TIME_PRINT
gettimeofday(&start,NULL);
#endif
//deconv_post_process(args);
#ifdef COST_TIME_PRINT
gettimeofday(&end,NULL);
dif_sec = end.tv_sec - start.tv_sec;
dif_usec = end.tv_usec - start.tv_usec;
std::cout << "deconv_post_process " << " cost time: " <<
(dif_sec*1000000+dif_usec) << "us" << std::endl; #endif*/
}
return 0;
......
......@@ -25,6 +25,7 @@ namespace fpga {
#define FILTER_NUM_ALIGNMENT 32 // Filter number aligned to 32
#define FILTER_ELEMENT_ALIGNMENT 16 // Filter element number aligned to 16
#define BS_NUM_ALIGNMENT 8
#define BIAS_NUM_ALIGNMENT 16
#endif
enum DataType {
......@@ -222,7 +223,14 @@ struct DeconvArgs {
struct ImageOutputArgs output;
struct SplitConvArgs* split_conv_args;
};
struct DWconvArgs {
bool relu_enabled;
void* bias_address;
void* filter_address;
struct KernelArgs kernel;
struct ImageInputArgs image;
struct ImageOutputArgs output;
};
// static inline int align_to_x(int num, int x) { return (num + x - 1) / x * x;
// }
static inline uint32_t align_to_x(int64_t num, int64_t x) {
......
......@@ -15,7 +15,7 @@ limitations under the License. */
#ifdef FUSION_CONVADDBN_OP
#include "operators/kernel/conv_add_bn_kernel.h"
#include <math.h>
namespace paddle_mobile {
namespace operators {
......@@ -58,14 +58,7 @@ bool ConvAddBNKernel<FPGA, float>::Init(FusionConvAddBNParam<FPGA> *param) {
param->SetNewScale(new_scale);
param->SetNewBias(new_bias);
float max_value = fpga::filter_find_max(filter);
fpga::format_filter(filter, max_value, param->Groups());
int element_num_per_div =
fpga::get_filter_num_per_div(filter, param->Groups());
fpga::format_bias_scale_array(&bs_ptr, element_num_per_div, channel);
fpga::format_fp16_ofm(out);
fpga::format_conv_data(filter, out, &bs_ptr, param->Groups());
fpga::SplitConvArgs conv_arg = {0};
fpga::fill_split_arg(&conv_arg, input, out, filter, relu_enabled,
param->Groups(), param->Strides()[0],
......
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifdef FUSION_CONVADDBNRELU_OP
#include "operators/kernel/conv_add_bn_relu_kernel.h"
#include <math.h>
namespace paddle_mobile {
namespace operators {
template <>
bool ConvAddBNReluKernel<FPGA, float>::Init(
FusionConvAddBNReluParam<FPGA> *param) {
bool relu_enabled = true;
auto input = const_cast<Tensor *>(param->Input());
auto bias = param->Bias();
auto bias_ptr = bias->data<float>();
auto filter = const_cast<Tensor *>(param->Filter());
auto out = param->Output();
vector<int> paddings = param->Paddings();
vector<int> strides = param->Strides();
auto bn_mean_ptr = param->InputMean()->data<float>();
auto bn_var_ptr = param->InputVariance()->data<float>();
auto bn_scale_ptr = param->InputScale()->data<float>();
auto bn_bias_ptr = param->InputBias()->data<float>();
const float epsilon = param->Epsilon();
PADDLE_MOBILE_ENFORCE(out->dims()[1] == bias->dims()[0] &&
bias->dims()[0] == param->InputBias()->dims()[0],
"Output channel should be equal to bias number");
const int channel = out->dims()[1];
auto bs_ptr =
reinterpret_cast<float *>(fpga::fpga_malloc(2 * channel * sizeof(float)));
auto new_scale = new Tensor();
auto new_bias = new Tensor();
auto new_scale_ptr = new_scale->mutable_data<float>({channel});
auto new_bias_ptr = new_bias->mutable_data<float>({channel});
for (int i = 0; i < channel; i++) {
new_scale_ptr[i] = bn_scale_ptr[i] /
static_cast<float>(pow((bn_var_ptr[i] + epsilon), 0.5));
new_bias_ptr[i] =
bn_bias_ptr[i] + (bias_ptr[i] - bn_mean_ptr[i]) * new_scale_ptr[i];
bs_ptr[i + channel] = new_scale_ptr[i];
bs_ptr[i] = new_bias_ptr[i];
}
param->SetNewScale(new_scale);
param->SetNewBias(new_bias);
const int groups = param->Groups();
if (groups == channel) {
fpga::format_dwconv_data(filter, out, new_scale_ptr, &new_bias_ptr);
fpga::DWconvArgs dwconv_arg = {0};
fpga::fill_dwconv_arg(&dwconv_arg, input, out, filter, relu_enabled,
strides[0], strides[1], paddings[0], paddings[1],
new_bias_ptr);
param->SetFpgaArgs(dwconv_arg);
} else {
fpga::format_conv_data(filter, out, &bs_ptr, param->Groups());
fpga::SplitConvArgs conv_arg = {0};
fpga::fill_split_arg(&conv_arg, input, out, filter, relu_enabled,
param->Groups(), strides[0], strides[1], paddings[0],
paddings[1], bs_ptr);
param->SetFpgaArgs(conv_arg);
}
return true;
}
template <>
void ConvAddBNReluKernel<FPGA, float>::Compute(
const FusionConvAddBNReluParam<FPGA> &param) {
if (param.Groups() == param.Output()->dims()[1]) {
// fpga::ComputeFpgaConv(param.FpgaDwconvArgs());
} else {
fpga::ComputeFpgaConv(param.FpgaArgs());
}
}
} // namespace operators
} // namespace paddle_mobile
#endif
......@@ -38,15 +38,7 @@ bool ConvAddKernel<FPGA, float>::Init(FusionConvAddParam<FPGA> *param) {
bs_ptr[i] = bias_ptr[i];
}
float max_value = fpga::filter_find_max(filter);
fpga::format_filter(filter, max_value, param->Groups());
int element_num_per_div =
fpga::get_filter_num_per_div(filter, param->Groups());
fpga::format_bias_scale_array(&bs_ptr, element_num_per_div, channel);
fpga::format_fp16_ofm(out);
fpga::format_conv_data(filter, out, &bs_ptr, param->Groups());
fpga::SplitConvArgs conv_arg = {0};
fpga::fill_split_arg(&conv_arg, input, out, filter, relu_enabled,
param->Groups(), param->Strides()[0],
......
......@@ -38,15 +38,7 @@ bool ConvAddReluKernel<FPGA, float>::Init(FusionConvAddReluParam<FPGA> *param) {
bs_ptr[i] = bias_ptr[i];
}
float max_value = fpga::filter_find_max(filter);
fpga::format_filter(filter, max_value, param->Groups());
int element_num_per_div =
fpga::get_filter_num_per_div(filter, param->Groups());
fpga::format_bias_scale_array(&bs_ptr, element_num_per_div, channel);
fpga::format_fp16_ofm(out);
fpga::format_conv_data(filter, out, &bs_ptr, param->Groups());
fpga::SplitConvArgs conv_arg = {0};
fpga::fill_split_arg(&conv_arg, input, out, filter, relu_enabled,
param->Groups(), param->Strides()[0],
......
......@@ -51,15 +51,7 @@ bool ConvBNKernel<FPGA, float>::Init(FusionConvBNParam<FPGA> *param) {
param->SetNewScale(new_scale);
param->SetNewBias(new_bias);
float max_value = fpga::filter_find_max(filter);
fpga::format_filter(filter, max_value, param->Groups());
int element_num_per_div =
fpga::get_filter_num_per_div(filter, param->Groups());
fpga::format_bias_scale_array(&bs_ptr, element_num_per_div, channel);
fpga::format_fp16_ofm(out);
fpga::format_conv_data(filter, out, &bs_ptr, param->Groups());
fpga::SplitConvArgs conv_arg = {0};
fpga::fill_split_arg(&conv_arg, input, out, filter, relu_enabled,
param->Groups(), param->Strides()[0],
......
......@@ -51,15 +51,7 @@ bool ConvBNReluKernel<FPGA, float>::Init(FusionConvBNReluParam<FPGA> *param) {
param->SetNewScale(new_scale);
param->SetNewBias(new_bias);
float max_value = fpga::filter_find_max(filter);
fpga::format_filter(filter, max_value, param->Groups());
int element_num_per_div =
fpga::get_filter_num_per_div(filter, param->Groups());
fpga::format_bias_scale_array(&bs_ptr, element_num_per_div, channel);
fpga::format_fp16_ofm(out);
fpga::format_conv_data(filter, out, &bs_ptr, param->Groups());
fpga::SplitConvArgs conv_arg = {0};
fpga::fill_split_arg(&conv_arg, input, out, filter, relu_enabled,
param->Groups(), param->Strides()[0],
......
......@@ -35,8 +35,8 @@ bool DeconvAddKernel<FPGA, float>::Init(FusionDeconvAddParam<FPGA> *param) {
int channel = out->dims()[1];
int sub_conv_n = param->Strides()[0];
auto bs_ptr = (float *)fpga::fpga_malloc(2 * channel * sub_conv_n *
sizeof(float)); // NOLINT
auto bs_ptr = (float *)fpga::fpga_malloc(2 * channel * sub_conv_n * // NOLINT
sizeof(float)); // NOLINT
for (int i = 0; i < channel * sub_conv_n; i++) {
bs_ptr[i + sub_conv_n * channel] = 1;
......@@ -49,17 +49,7 @@ bool DeconvAddKernel<FPGA, float>::Init(FusionDeconvAddParam<FPGA> *param) {
"filter width should be equal to filter height ");
PADDLE_MOBILE_ENFORCE(((filter->dims()[2] % param->Strides()[0]) == 0),
"filter axis should be the multiple of stride axis ");
float max_value = fpga::filter_find_max(filter);
fpga::format_deconv_filter(filter, max_value, param->Groups(),
param->Strides()[0]);
int element_num_per_div =
fpga::get_deconv_filter_num_per_div(filter, param->Groups(), sub_conv_n);
fpga::format_bias_scale_array(&bs_ptr, element_num_per_div,
channel * sub_conv_n);
fpga::format_deconv_data(filter, out, &bs_ptr, param->Groups(), sub_conv_n);
fpga::DeconvArgs deconv_arg = {0};
fpga::fill_deconv_arg(&deconv_arg, input, out, filter, relu_enabled,
param->Groups(), param->Strides()[0],
......
......@@ -36,8 +36,8 @@ bool DeconvAddReluKernel<FPGA, float>::Init(
int channel = out->dims()[1];
int sub_conv_n = param->Strides()[0];
auto bs_ptr = (float *)fpga::fpga_malloc(2 * channel * sub_conv_n *
sizeof(float)); // NOLINT
auto bs_ptr = (float *)fpga::fpga_malloc(2 * channel * sub_conv_n * // NOLINT
sizeof(float)); // NOLINT
for (int i = 0; i < channel * sub_conv_n; i++) {
bs_ptr[i + sub_conv_n * channel] = 1;
......@@ -50,17 +50,7 @@ bool DeconvAddReluKernel<FPGA, float>::Init(
"filter width should be equal to filter height ");
PADDLE_MOBILE_ENFORCE(((filter->dims()[2] % param->Strides()[0]) == 0),
"filter axis should be the multiple of stride axis ");
float max_value = fpga::filter_find_max(filter);
fpga::format_deconv_filter(filter, max_value, param->Groups(),
param->Strides()[0]);
int element_num_per_div =
fpga::get_deconv_filter_num_per_div(filter, param->Groups(), sub_conv_n);
fpga::format_bias_scale_array(&bs_ptr, element_num_per_div,
channel * sub_conv_n);
fpga::format_deconv_data(filter, out, &bs_ptr, param->Groups(), sub_conv_n);
fpga::DeconvArgs deconv_arg = {0};
fpga::fill_deconv_arg(&deconv_arg, input, out, filter, relu_enabled,
param->Groups(), param->Strides()[0],
......
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifdef RESHAPE_OP
#include "operators/kernel/reshape_kernel.h"
namespace paddle_mobile {
namespace operators {
template <>
bool ReshapeKernel<FPGA, float>::Init(ReshapeParam<FPGA> *param) {
return true;
}
template <>
void ReshapeKernel<FPGA, float>::Compute(const ReshapeParam<FPGA> &param) {}
} // namespace operators
} // namespace paddle_mobile
#endif
......@@ -28,18 +28,26 @@ bool SoftmaxKernel<FPGA, float>::Init(SoftmaxParam<FPGA> *param) {
fpga::format_fp32_ofm(out);
auto float_input = new Tensor;
float_input->mutable_data<float>(
{1, input->dims()[2], input->dims()[3], input->dims()[1]});
fpga::format_fp32_ofm(float_input);
if (input->dims().size() == 2) {
float_input->mutable_data<float>({1, input->dims()[1]});
} else if (input->dims().size() == 4) {
float_input->mutable_data<float>(
{1, input->dims()[2], input->dims()[3], input->dims()[1]});
} else {
DLOG << "wrong dimension of softmax input";
}
fpga::format_fp32_ofm(float_input);
fpga::BypassArgs args = {fpga::DATA_TYPE_FP16};
args.input_layout_type = fpga::LAYOUT_HWC;
args.output_layout_type = fpga::LAYOUT_CHW;
args.input_data_type = fpga::DATA_TYPE_FP16;
args.output_data_type = fpga::DATA_TYPE_FP32;
args.image.address = input_ptr;
args.image.height = (uint32_t)input->dims()[2];
args.image.width = (uint32_t)input->dims()[3];
args.image.height =
(input->dims().size() == 4) ? (uint32_t)input->dims()[2] : 1;
args.image.width =
(input->dims().size() == 4) ? (uint32_t)input->dims()[3] : 1;
args.image.channels = (uint32_t)input->dims()[1];
args.output.address = float_input->data<float>();
args.output.scale_address = float_input->scale;
......@@ -56,7 +64,7 @@ void SoftmaxKernel<FPGA, float>::Compute(const SoftmaxParam<FPGA> &param) {
fpga::PerformBypass(param.FpgaArgs());
fpga::fpga_invalidate((void *)in_x->data<float>(), // NOLINT
in_x->numel() * sizeof(float));
// TODO: In general case, 0 should be squeezed before softmax input
// TODO: In general case, 0 should be squeezed before softmax input // NOLINT
math::SoftmaxFuntor<CPU, float>()(in_x, out);
fpga::fpga_flush(out->data<float>(), out->memory_size());
}
......
......@@ -462,6 +462,13 @@ class ConvParam : public OpParam {
public:
const fpga::SplitConvArgs &FpgaArgs() const { return fpga_conv_args; }
void SetFpgaArgs(const fpga::SplitConvArgs &args) { fpga_conv_args = args; }
public:
fpga::DWconvArgs fpga_dwconv_args;
public:
const fpga::DWconvArgs &FpgaDwconvArgs() const { return fpga_dwconv_args; }
void SetFpgaArgs(const fpga::DWconvArgs &args) { fpga_dwconv_args = args; }
#endif
};
template <typename Dtype>
......
......@@ -38,6 +38,9 @@ REGISTER_OPERATOR_CPU(reshape, ops::ReshapeOp);
#ifdef PADDLE_MOBILE_MALI_GPU
REGISTER_OPERATOR_MALI_GPU(reshape, ops::ReshapeOp);
#endif
#ifdef PADDLE_MOBILE_FPGA
REGISTER_OPERATOR_FPGA(reshape, ops::ReshapeOp);
#endif
#ifdef PADDLE_MOBILE_CL
REGISTER_OPERATOR_CL(reshape, ops::ReshapeOp);
#endif
......
......@@ -122,6 +122,11 @@ if (CON GREATER -1)
set(SPLIT_OP ON)
set(FUSION_DECONVADD_OP ON)
set(FUSION_DECONVADDRELU_OP ON)
set(RESHAPE_OP ON)
set(FUSION_CONVADDBNRELU_OP ON)
set(FUSION_CONVADDBN_OP ON)
set(FOUND_MATCH ON)
endif()
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册