未验证 提交 c1372bf9 编写于 作者: J Jiaying Zhao 提交者: GitHub

[Mobile][FPGA]Fix code style. test=mobile (#2638)

上级 a13c592d
...@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and ...@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#include "fpga/V2/bias_scale.h" #include "fpga/V2/bias_scale.h"
#include <memory.h>
#include <math.h> #include <math.h>
#include <memory.h>
#include "fpga/common/fpga_common.h" #include "fpga/common/fpga_common.h"
namespace paddle_mobile { namespace paddle_mobile {
...@@ -56,15 +56,16 @@ void align_element(float **data_in, int num_per_div_before_alignment, int num) { ...@@ -56,15 +56,16 @@ void align_element(float **data_in, int num_per_div_before_alignment, int num) {
*data_in = ptr_aligned; *data_in = ptr_aligned;
} }
void fixed_scale_bias_new(void*data_in, int data_len) { void fixed_scale_bias_new(void *data_in, int data_len) {
int* data_tmp = static_cast<int*>(data_in); int *data_tmp = static_cast<int *>(data_in);
for (int idx = 0; idx < data_len/2; ++idx) { for (int idx = 0; idx < data_len / 2; ++idx) {
float tmp = (static_cast<float*>(data_in))[idx]; float tmp = (static_cast<float *>(data_in))[idx];
data_tmp[idx] = static_cast<int>(round(tmp*pow(2.0, 23.0))); data_tmp[idx] = static_cast<int>(round(tmp * pow(2.0, 23.0)));
tmp = (static_cast<float*>(data_in))[idx+data_len/2]; tmp = (static_cast<float *>(data_in))[idx + data_len / 2];
data_tmp[idx+data_len/2] = static_cast<int>(round(tmp*pow(2.0, 30.0))); data_tmp[idx + data_len / 2] =
} static_cast<int>(round(tmp * pow(2.0, 30.0)));
return; }
return;
} }
void interleave(float **data_in, int num_after_alignment) { void interleave(float **data_in, int num_after_alignment) {
......
...@@ -94,11 +94,10 @@ void concat_images(int8_t **images_in, float **scales_in, void *image_out, ...@@ -94,11 +94,10 @@ void concat_images(int8_t **images_in, float **scales_in, void *image_out,
for (i = 0; i < image_num; i++) { for (i = 0; i < image_num; i++) {
align_each_in_area_cw = align_each_in_area_cw =
align_to_x(channel_num[i] * width, IMAGE_ALIGNMENT); align_to_x(channel_num[i] * width, IMAGE_ALIGNMENT);
memcpy( memcpy((int8_t *)image_out + tmp_channel + // NOLINT
(int8_t *)image_out + tmp_channel + // NOLINT k * align_each_out_area_cw_differ,
k * align_each_out_area_cw_differ, images_in[i] + j * channel_num[i] + k * align_each_in_area_cw,
images_in[i] + j * channel_num[i] + k * align_each_in_area_cw, channel_num[i] * sizeof(int8_t));
channel_num[i] * sizeof(int8_t));
tmp_channel += channel_num[i]; tmp_channel += channel_num[i];
} }
......
...@@ -257,8 +257,8 @@ int ComputeBasicConv(const struct ConvArgs &args) { ...@@ -257,8 +257,8 @@ int ComputeBasicConv(const struct ConvArgs &args) {
pthread_mutex_unlock(&g_fpgainfo.pe_data->mutex); pthread_mutex_unlock(&g_fpgainfo.pe_data->mutex);
return ret; return ret;
} }
// reg_writeq(reg_ActivationArgs, // reg_writeq(reg_ActivationArgs,
// REG_ACTIVATION_MODE_AND_LEAKY_RELU_FACTOR); // activation function // REG_ACTIVATION_MODE_AND_LEAKY_RELU_FACTOR); // activation function
reg_writeq(output_scale, REG_SCALE_PARAMETER); reg_writeq(output_scale, REG_SCALE_PARAMETER);
// new // new
...@@ -274,10 +274,10 @@ int ComputeBasicConv(const struct ConvArgs &args) { ...@@ -274,10 +274,10 @@ int ComputeBasicConv(const struct ConvArgs &args) {
args.driver.filter_pad_width_mul_channel, args.driver.filter_pad_width_mul_channel,
REG_CONV_REG1); REG_CONV_REG1);
reg_writeq((args.driver.stride_h << 50) | (args.driver.skip_window << 30) | reg_writeq((args.driver.stride_h << 50) | (args.driver.skip_window << 30) |
(args.driver.filter_row << 10) | (args.driver.filter_row << 10) |
(args.driver.filter_height << 5) | args.driver.filter_width, (args.driver.filter_height << 5) | args.driver.filter_width,
REG_CONV_REG2); REG_CONV_REG2);
reg_writeq((args.driver.filter_num << 42) | (args.driver.filter_align << 26) | reg_writeq((args.driver.filter_num << 42) | (args.driver.filter_align << 26) |
(args.driver.prog_full_cnt << 16) | (args.driver.prog_full_cnt << 16) |
...@@ -369,74 +369,77 @@ int ComputeFpgaPool(const struct PoolingArgs &args) { ...@@ -369,74 +369,77 @@ int ComputeFpgaPool(const struct PoolingArgs &args) {
uint64_t cmd = 0; uint64_t cmd = 0;
uint64_t image_physical_address = 0; uint64_t image_physical_address = 0;
uint64_t output_physical_address = 0; uint64_t output_physical_address = 0;
uint64_t bypass_interrupt = reg_readq(REG_INTERRUPT); uint64_t bypass_interrupt = reg_readq(REG_INTERRUPT);
image_physical_address = vaddr_to_paddr(args.image.address); image_physical_address = vaddr_to_paddr(args.image.address);
output_physical_address = vaddr_to_paddr(args.output.address); output_physical_address = vaddr_to_paddr(args.output.address);
uint64_t C_paral_64 = align_to_x((uint64_t)args.image.channels, 64); uint64_t C_paral_64 = align_to_x((uint64_t)args.image.channels, 64);
uint64_t C_align_32 = align_to_x((uint64_t)args.image.channels, 32); uint64_t C_align_32 = align_to_x((uint64_t)args.image.channels, 32);
uint64_t output_height = (uint64_t)( uint64_t output_height = (uint64_t)(
(args.image.height + args.image.pad_height * 2 - args.kernel.height) / (args.image.height + args.image.pad_height * 2 - args.kernel.height) /
args.kernel.stride_h + 1); args.kernel.stride_h +
1);
uint64_t output_width = (uint64_t)( uint64_t output_width = (uint64_t)(
(args.image.width + args.image.pad_width * 2 - args.kernel.width) / (args.image.width + args.image.pad_width * 2 - args.kernel.width) /
args.kernel.stride_w + 1); args.kernel.stride_w +
1);
uint64_t image_amount_per_row = uint64_t image_amount_per_row =
align_to_x((uint64_t)args.image.width * (uint64_t)args.image.channels, align_to_x((uint64_t)args.image.width * (uint64_t)args.image.channels,
IMAGE_ALIGNMENT); IMAGE_ALIGNMENT);
uint64_t image_one_pad_per_row = (uint64_t)args.image.width * uint64_t image_one_pad_per_row =
(uint64_t)args.image.channels +(uint64_t)args.image.pad_width * (uint64_t)args.image.width * (uint64_t)args.image.channels +
(uint64_t)args.image.channels; (uint64_t)args.image.pad_width * (uint64_t)args.image.channels;
uint64_t result_amount_align_32 = align_to_x((uint64_t)output_width * uint64_t result_amount_align_32 =
(uint64_t)args.image.channels, 32); align_to_x((uint64_t)output_width * (uint64_t)args.image.channels, 32);
uint64_t result_addr_row = uint64_t result_addr_row =
(result_amount_align_32 << 32) | output_physical_address; (result_amount_align_32 << 32) | output_physical_address;
uint64_t row_padding_down = uint64_t row_padding_down =
(uint64_t)args.image.height + (uint64_t)args.image.pad_height; (uint64_t)args.image.height + (uint64_t)args.image.pad_height;
uint64_t kernel_width_sub1 = uint64_t kernel_width_sub1 = (uint64_t)args.kernel.width - 1;
(uint64_t)args.kernel.width - 1;
uint64_t kernel_padding_step = row_padding_down | uint64_t kernel_padding_step = row_padding_down |
((uint64_t)args.image.pad_height << 16) | ((uint64_t)args.image.pad_height << 16) |
((uint64_t)args.kernel.stride_h << 24) | ((uint64_t)args.kernel.stride_h << 24) |
((uint64_t)kernel_width_sub1<<32) | ((uint64_t)kernel_width_sub1 << 32) |
((uint64_t)args.kernel.height << 40) | ((uint64_t)args.kernel.height << 40) |
((uint64_t)(args.kernel.height-1) << 48); ((uint64_t)(args.kernel.height - 1) << 48);
uint64_t image_calcu_height = (uint64_t)args.kernel.height + uint64_t image_calcu_height =
(output_height - 1) * (uint64_t)args.kernel.stride_h; (uint64_t)args.kernel.height +
(output_height - 1) * (uint64_t)args.kernel.stride_h;
uint64_t result_size_calcu_height = (output_height - 1) | uint64_t result_size_calcu_height = (output_height - 1) |
((output_width - 1) << 16) | (image_calcu_height << 32); ((output_width - 1) << 16) |
uint64_t col_padding_down = ((uint64_t)args.image.width + (image_calcu_height << 32);
(uint64_t)args.image.pad_width) * (uint64_t)args.image.channels; uint64_t col_padding_down =
((uint64_t)args.image.width + (uint64_t)args.image.pad_width) *
(uint64_t)args.image.channels;
uint64_t image_row_col_padding_down = uint64_t image_row_col_padding_down =
image_amount_per_row | (col_padding_down << 32); image_amount_per_row | (col_padding_down << 32);
uint64_t image_rowXpadding_h = uint64_t image_rowXpadding_h =
image_amount_per_row * (uint64_t)args.image.pad_height; image_amount_per_row * (uint64_t)args.image.pad_height;
uint64_t image_rowXstep_h = uint64_t image_rowXstep_h =
image_amount_per_row * (uint64_t)args.kernel.stride_h; image_amount_per_row * (uint64_t)args.kernel.stride_h;
uint64_t image_rowXpad_h_rowXstep_h = uint64_t image_rowXpad_h_rowXstep_h =
image_rowXpadding_h | (image_rowXstep_h << 32); image_rowXpadding_h | (image_rowXstep_h << 32);
uint64_t channelXpad_w = uint64_t channelXpad_w =
(uint64_t)args.image.channels * (uint64_t)args.image.pad_width; (uint64_t)args.image.channels * (uint64_t)args.image.pad_width;
uint64_t channelXstep_w = uint64_t channelXstep_w =
(uint64_t)args.image.channels * (uint64_t)args.kernel.stride_w; (uint64_t)args.image.channels * (uint64_t)args.kernel.stride_w;
uint64_t channelXpad_w_channelXstep_w = uint64_t channelXpad_w_channelXstep_w =
channelXpad_w | (channelXstep_w << 32); channelXpad_w | (channelXstep_w << 32);
uint64_t filter_row_align = uint64_t filter_row_align = C_align_32 * (uint64_t)args.kernel.width;
C_align_32 * (uint64_t)args.kernel.width; uint64_t sub_filter_amount_align =
uint64_t sub_filter_amount_align = C_align_32 * C_align_32 * (uint64_t)args.kernel.width * (uint64_t)args.kernel.height;
(uint64_t)args.kernel.width * (uint64_t)args.kernel.height;
uint64_t mult_factor = 0; uint64_t mult_factor = 0;
float average_reciprocal = args.kernel_reciprocal; float average_reciprocal = args.kernel_reciprocal;
uint32_t* kernel_reciprocal; uint32_t *kernel_reciprocal;
kernel_reciprocal =(reinterpret_cast<uint32_t*>(&average_reciprocal)); kernel_reciprocal = (reinterpret_cast<uint32_t *>(&average_reciprocal));
if (args.mode == 1) if (args.mode == 1)
mult_factor = (uint64_t)(*kernel_reciprocal) | mult_factor = (uint64_t)(*kernel_reciprocal) | ((uint64_t)1 << 32) |
((uint64_t)1 << 32) | ((uint64_t)1 << 40); ((uint64_t)1 << 40);
else else
mult_factor = mult_factor =
(uint64_t)0x3f800000 | ((uint64_t)1 << 32) | ((uint64_t)1 << 40); (uint64_t)0x3f800000 | ((uint64_t)1 << 32) | ((uint64_t)1 << 40);
pthread_mutex_lock(&g_fpgainfo.pe_data->mutex); pthread_mutex_lock(&g_fpgainfo.pe_data->mutex);
if (ERROR == g_fpgainfo.pe_data->pes[PE_IDX_POOLING]->status) { if (ERROR == g_fpgainfo.pe_data->pes[PE_IDX_POOLING]->status) {
ret = -EIO; ret = -EIO;
...@@ -501,7 +504,7 @@ int ComputeFpgaEWAdd(const struct EWAddArgs &args) { ...@@ -501,7 +504,7 @@ int ComputeFpgaEWAdd(const struct EWAddArgs &args) {
#endif #endif
#ifdef PADDLE_MOBILE_ZU5 #ifdef PADDLE_MOBILE_ZU5
int ret = 0; int ret = 0;
uint64_t bypass_interrupt = reg_readq(REG_INTERRUPT); uint64_t bypass_interrupt = reg_readq(REG_INTERRUPT);
pthread_mutex_lock(&g_fpgainfo.pe_data->mutex); pthread_mutex_lock(&g_fpgainfo.pe_data->mutex);
if (ERROR == g_fpgainfo.pe_data->pes[PE_IDX_EW]->status) { if (ERROR == g_fpgainfo.pe_data->pes[PE_IDX_EW]->status) {
...@@ -511,7 +514,6 @@ uint64_t bypass_interrupt = reg_readq(REG_INTERRUPT); ...@@ -511,7 +514,6 @@ uint64_t bypass_interrupt = reg_readq(REG_INTERRUPT);
return ret; return ret;
} }
uint64_t image0_physical_address = 0; uint64_t image0_physical_address = 0;
uint64_t image1_physical_address = 0; uint64_t image1_physical_address = 0;
uint64_t image_physical_address = 0; uint64_t image_physical_address = 0;
...@@ -519,26 +521,28 @@ uint64_t bypass_interrupt = reg_readq(REG_INTERRUPT); ...@@ -519,26 +521,28 @@ uint64_t bypass_interrupt = reg_readq(REG_INTERRUPT);
image0_physical_address = vaddr_to_paddr(args.image0.address); image0_physical_address = vaddr_to_paddr(args.image0.address);
image1_physical_address = vaddr_to_paddr(args.image1.address); image1_physical_address = vaddr_to_paddr(args.image1.address);
image_physical_address = image_physical_address =
image0_physical_address | (image1_physical_address << 32); image0_physical_address | (image1_physical_address << 32);
output_physical_address = vaddr_to_paddr(args.output.address); output_physical_address = vaddr_to_paddr(args.output.address);
uint64_t image_amount_per_row = uint64_t image_amount_per_row =
align_to_x((uint64_t)args.image0.width * align_to_x((uint64_t)args.image0.width * (uint64_t)args.image0.channels,
(uint64_t)args.image0.channels, IMAGE_ALIGNMENT); IMAGE_ALIGNMENT);
uint64_t result_addr_row = uint64_t result_addr_row =
output_physical_address | (image_amount_per_row << 32); output_physical_address | (image_amount_per_row << 32);
uint64_t kernel_padding_step = 0; uint64_t kernel_padding_step = 0;
kernel_padding_step = ((uint64_t)args.image0.height * 2) | kernel_padding_step = ((uint64_t)args.image0.height * 2) |
((uint64_t)2 << 24) | ((uint64_t)2 << 40) | ((uint64_t)1 << 48); ((uint64_t)2 << 24) | ((uint64_t)2 << 40) |
uint64_t result_size_calcu_height = ((uint64_t)args.image0.height - 1) | ((uint64_t)1 << 48);
((image_amount_per_row / 32 - 1) << 16) | uint64_t result_size_calcu_height =
(((uint64_t)args.image0.height * 2) << 32); ((uint64_t)args.image0.height - 1) |
uint64_t image_row_col_padding_down = image_amount_per_row | ((image_amount_per_row / 32 - 1) << 16) |
(image_amount_per_row << 32); (((uint64_t)args.image0.height * 2) << 32);
float quantParam = uint64_t image_row_col_padding_down =
((args.image0.scale_address)[0]) / ((args.output.scale_address)[0]); image_amount_per_row | (image_amount_per_row << 32);
uint32_t* ew_scale = reinterpret_cast<uint32_t*>(&quantParam); float quantParam =
uint64_t ew_scale_mult_factor = (*ew_scale) | ((args.image0.scale_address)[0]) / ((args.output.scale_address)[0]);
((uint64_t)args.const0 << 32) | ((uint64_t)args.const1 << 40); uint32_t *ew_scale = reinterpret_cast<uint32_t *>(&quantParam);
uint64_t ew_scale_mult_factor = (*ew_scale) | ((uint64_t)args.const0 << 32) |
((uint64_t)args.const1 << 40);
reg_writeq(0ul, REG_SCALE_PARAMETER); reg_writeq(0ul, REG_SCALE_PARAMETER);
reg_writeq(image_physical_address, 0x808); reg_writeq(image_physical_address, 0x808);
reg_writeq(result_addr_row, 0x810); reg_writeq(result_addr_row, 0x810);
...@@ -546,7 +550,7 @@ uint64_t bypass_interrupt = reg_readq(REG_INTERRUPT); ...@@ -546,7 +550,7 @@ uint64_t bypass_interrupt = reg_readq(REG_INTERRUPT);
reg_writeq(result_size_calcu_height, 0x820); reg_writeq(result_size_calcu_height, 0x820);
reg_writeq(32, 0x828); reg_writeq(32, 0x828);
reg_writeq(image_row_col_padding_down, 0x830); reg_writeq(image_row_col_padding_down, 0x830);
reg_writeq(((image_amount_per_row*2) << 32), 0x838); reg_writeq(((image_amount_per_row * 2) << 32), 0x838);
reg_writeq(ew_scale_mult_factor, 0x840); // dw does not care reg_writeq(ew_scale_mult_factor, 0x840); // dw does not care
reg_writeq(((uint64_t)32 << 32), 0x848); reg_writeq(((uint64_t)32 << 32), 0x848);
reg_writeq(0, 0x858); reg_writeq(0, 0x858);
...@@ -924,7 +928,7 @@ int ComputeDWConv(const struct DWconvArgs &args) { ...@@ -924,7 +928,7 @@ int ComputeDWConv(const struct DWconvArgs &args) {
<< " pad_height:" << args.image.pad_height << " pad_height:" << args.image.pad_height
<< " pad_width:" << args.image.pad_width; << " pad_width:" << args.image.pad_width;
DLOG << " filter_address:" << args.filter_address; DLOG << " filter_address:" << args.filter_address;
//<< " bias_address:" << args.bias_address; //<< " bias_address:" << args.bias_address;
DLOG << " kernel_height:" << args.kernel.height DLOG << " kernel_height:" << args.kernel.height
<< " kernel_width:" << args.kernel.width << " kernel_width:" << args.kernel.width
<< " stride_h:" << args.kernel.stride_h << " stride_h:" << args.kernel.stride_h
...@@ -950,67 +954,71 @@ int ComputeDWConv(const struct DWconvArgs &args) { ...@@ -950,67 +954,71 @@ int ComputeDWConv(const struct DWconvArgs &args) {
bias_physical_address = vaddr_to_paddr(args.bias_address); bias_physical_address = vaddr_to_paddr(args.bias_address);
uint64_t C_align_64 = align_to_x((uint64_t)args.image.channels, 64); uint64_t C_align_64 = align_to_x((uint64_t)args.image.channels, 64);
uint64_t C_align_32 = align_to_x((uint64_t)args.image.channels, 32); uint64_t C_align_32 = align_to_x((uint64_t)args.image.channels, 32);
uint64_t output_height = (uint64_t) uint64_t output_height = (uint64_t)(
((args.image.height + args.image.pad_height * 2 - (args.image.height + args.image.pad_height * 2 - args.kernel.height) /
args.kernel.height) / args.kernel.stride_h +1); args.kernel.stride_h +
uint64_t output_width = (uint64_t) 1);
(((args.image.width + args.image.pad_width * 2 - args.kernel.width) / uint64_t output_width = (uint64_t)(
args.kernel.stride_w + 1) * args.sub_conv_num); ((args.image.width + args.image.pad_width * 2 - args.kernel.width) /
args.kernel.stride_w +
1) *
args.sub_conv_num);
uint64_t image_amount_per_row = uint64_t image_amount_per_row =
align_to_x((uint64_t)args.image.width * align_to_x((uint64_t)args.image.width * (uint64_t)args.image.channels,
(uint64_t)args.image.channels, IMAGE_ALIGNMENT); IMAGE_ALIGNMENT);
uint64_t image_one_pad_per_row = uint64_t image_one_pad_per_row =
(uint64_t)args.image.width * (uint64_t)args.image.channels + (uint64_t)args.image.width * (uint64_t)args.image.channels +
(uint64_t)args.image.pad_width * (uint64_t)args.image.channels; (uint64_t)args.image.pad_width * (uint64_t)args.image.channels;
uint64_t result_amount_align_32 = align_to_x( uint64_t result_amount_align_32 =
(uint64_t)output_width * (uint64_t)args.image.channels, 32); align_to_x((uint64_t)output_width * (uint64_t)args.image.channels, 32);
uint64_t result_addr_row = uint64_t result_addr_row =
(result_amount_align_32 << 32) | output_physical_address; (result_amount_align_32 << 32) | output_physical_address;
uint64_t row_padding_down = uint64_t row_padding_down =
(uint64_t)args.image.height + (uint64_t)args.image.pad_height; (uint64_t)args.image.height + (uint64_t)args.image.pad_height;
uint64_t kernel_width_sub1 = (uint64_t)args.kernel.width - 1; uint64_t kernel_width_sub1 = (uint64_t)args.kernel.width - 1;
uint64_t kernel_padding_step = row_padding_down | uint64_t kernel_padding_step = row_padding_down |
((uint64_t)args.image.pad_height << 16) | ((uint64_t)args.image.pad_height << 16) |
((uint64_t)args.kernel.stride_h << 24) | ((uint64_t)args.kernel.stride_h << 24) |
((uint64_t)kernel_width_sub1<<32) | ((uint64_t)kernel_width_sub1 << 32) |
((uint64_t)args.kernel.height << 40) | ((uint64_t)args.kernel.height << 40) |
((uint64_t)(args.kernel.height-1) << 48); ((uint64_t)(args.kernel.height - 1) << 48);
uint64_t image_calcu_height = (uint64_t)args.kernel.height + uint64_t image_calcu_height =
(output_height - 1) * (uint64_t)args.kernel.stride_h; (uint64_t)args.kernel.height +
(output_height - 1) * (uint64_t)args.kernel.stride_h;
uint64_t result_size_calcu_height = (output_height - 1) | uint64_t result_size_calcu_height = (output_height - 1) |
((output_width - 1) << 16) | (image_calcu_height << 32); ((output_width - 1) << 16) |
uint64_t col_padding_down = ((uint64_t)args.image.width + (image_calcu_height << 32);
(uint64_t)args.image.pad_width) * (uint64_t)args.image.channels; uint64_t col_padding_down =
((uint64_t)args.image.width + (uint64_t)args.image.pad_width) *
(uint64_t)args.image.channels;
uint64_t image_row_col_padding_down = uint64_t image_row_col_padding_down =
image_amount_per_row | (col_padding_down << 32); image_amount_per_row | (col_padding_down << 32);
uint64_t image_rowXpadding_h = uint64_t image_rowXpadding_h =
image_amount_per_row * (uint64_t)args.image.pad_height; image_amount_per_row * (uint64_t)args.image.pad_height;
uint64_t image_rowXstep_h = uint64_t image_rowXstep_h =
image_amount_per_row * (uint64_t)args.kernel.stride_h; image_amount_per_row * (uint64_t)args.kernel.stride_h;
uint64_t image_rowXpad_h_rowXstep_h = uint64_t image_rowXpad_h_rowXstep_h =
image_rowXpadding_h | (image_rowXstep_h << 32); image_rowXpadding_h | (image_rowXstep_h << 32);
uint64_t channelXpad_w = uint64_t channelXpad_w =
(uint64_t)args.image.channels * (uint64_t)args.image.pad_width; (uint64_t)args.image.channels * (uint64_t)args.image.pad_width;
uint64_t channelXstep_w = uint64_t channelXstep_w =
(uint64_t)args.image.channels * (uint64_t)args.kernel.stride_w; (uint64_t)args.image.channels * (uint64_t)args.kernel.stride_w;
uint64_t channelXpad_w_channelXstep_w = uint64_t channelXpad_w_channelXstep_w =
channelXpad_w | (channelXstep_w << 32); channelXpad_w | (channelXstep_w << 32);
uint64_t filter_row_align = uint64_t filter_row_align = C_align_64 * (uint64_t)args.kernel.width;
C_align_64 * (uint64_t)args.kernel.width; uint64_t sub_filter_amount_align =
uint64_t sub_filter_amount_align = C_align_64 * C_align_64 * (uint64_t)args.kernel.width * (uint64_t)args.kernel.height;
(uint64_t)args.kernel.width *
(uint64_t)args.kernel.height;
uint64_t filter_amount_align = uint64_t filter_amount_align =
sub_filter_amount_align * (uint64_t)args.sub_conv_num; sub_filter_amount_align * (uint64_t)args.sub_conv_num;
uint64_t filter_param = filter_row_align | (filter_amount_align << 16) | uint64_t filter_param = filter_row_align | (filter_amount_align << 16) |
(sub_filter_amount_align << 32) | (sub_filter_amount_align << 32) |
(((uint64_t)args.sub_conv_num -1) << 48); (((uint64_t)args.sub_conv_num - 1) << 48);
uint64_t channel_parameter = uint64_t channel_parameter =
(uint64_t)args.image.channels | (C_align_64 << 16); (uint64_t)args.image.channels | (C_align_64 << 16);
pthread_mutex_lock(&g_fpgainfo.pe_data->mutex); pthread_mutex_lock(&g_fpgainfo.pe_data->mutex);
if (ERROR == g_fpgainfo.pe_data->pes[PE_IDX_POOLING]->status) { if (ERROR == g_fpgainfo.pe_data->pes[PE_IDX_POOLING]->status) {
ret = -EIO; ret = -EIO;
...@@ -1030,8 +1038,9 @@ int ComputeDWConv(const struct DWconvArgs &args) { ...@@ -1030,8 +1038,9 @@ int ComputeDWConv(const struct DWconvArgs &args) {
reg_writeq(channelXpad_w_channelXstep_w, 0x848); reg_writeq(channelXpad_w_channelXstep_w, 0x848);
reg_writeq(filter_physical_address, 0x850); reg_writeq(filter_physical_address, 0x850);
reg_writeq(filter_param, 0x858); reg_writeq(filter_param, 0x858);
reg_writeq(((bias_physical_address+C_align_64*4) | reg_writeq(((bias_physical_address + C_align_64 * 4) |
(bias_physical_address << 32)), 0x860); (bias_physical_address << 32)),
0x860);
cmd = (uint64_t)1 | (((uint64_t)args.relu_enabled) << 8); cmd = (uint64_t)1 | (((uint64_t)args.relu_enabled) << 8);
reg_writeq(cmd, 0x800); reg_writeq(cmd, 0x800);
......
...@@ -12,8 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ...@@ -12,8 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#ifdef ELEMENTWISEADD_OP #ifdef ELEMENTWISEADD_OP
#include <math.h>
#include "operators/kernel/elementwise_add_kernel.h" #include "operators/kernel/elementwise_add_kernel.h"
#include <math.h>
namespace paddle_mobile { namespace paddle_mobile {
namespace operators { namespace operators {
...@@ -62,14 +62,14 @@ void ComputeCPUEWAdd(fpga::EWAddArgs ewaddArgs) { ...@@ -62,14 +62,14 @@ void ComputeCPUEWAdd(fpga::EWAddArgs ewaddArgs) {
int inputh = ewaddArgs.image0.height; int inputh = ewaddArgs.image0.height;
int inputw = ewaddArgs.image0.width; int inputw = ewaddArgs.image0.width;
float inScale0 = float inScale0 =
(reinterpret_cast<float*>(ewaddArgs.image0.scale_address))[0]; (reinterpret_cast<float *>(ewaddArgs.image0.scale_address))[0];
float inScale1 = float inScale1 =
(reinterpret_cast<float*>(ewaddArgs.image1.scale_address))[0]; (reinterpret_cast<float *>(ewaddArgs.image1.scale_address))[0];
float outScale = float outScale =
(reinterpret_cast<float*>(ewaddArgs.output.scale_address))[0]; (reinterpret_cast<float *>(ewaddArgs.output.scale_address))[0];
int8_t* inPtr0 = reinterpret_cast<int8_t*>(ewaddArgs.image0.address); int8_t *inPtr0 = reinterpret_cast<int8_t *>(ewaddArgs.image0.address);
int8_t* inPtr1 = reinterpret_cast<int8_t*>(ewaddArgs.image1.address); int8_t *inPtr1 = reinterpret_cast<int8_t *>(ewaddArgs.image1.address);
int8_t* outPtr = reinterpret_cast<int8_t*>(ewaddArgs.output.address); int8_t *outPtr = reinterpret_cast<int8_t *>(ewaddArgs.output.address);
int datasize = inputc * inputh * inputw; int datasize = inputc * inputh * inputw;
float const0 = inScale0 / outScale; float const0 = inScale0 / outScale;
float const1 = inScale1 / outScale; float const1 = inScale1 / outScale;
......
...@@ -12,8 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ...@@ -12,8 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#ifdef FUSION_ELEMENTWISEADDRELU_OP #ifdef FUSION_ELEMENTWISEADDRELU_OP
#include <math.h>
#include "operators/kernel/elementwise_add_relu_kernel.h" #include "operators/kernel/elementwise_add_relu_kernel.h"
#include <math.h>
namespace paddle_mobile { namespace paddle_mobile {
namespace operators { namespace operators {
...@@ -63,14 +63,14 @@ void ComputeCPUEWAddRelu(fpga::EWAddArgs ewaddArgs) { ...@@ -63,14 +63,14 @@ void ComputeCPUEWAddRelu(fpga::EWAddArgs ewaddArgs) {
int inputh = ewaddArgs.image0.height; int inputh = ewaddArgs.image0.height;
int inputw = ewaddArgs.image0.width; int inputw = ewaddArgs.image0.width;
float inScale0 = float inScale0 =
(reinterpret_cast<float*>(ewaddArgs.image0.scale_address))[0]; (reinterpret_cast<float *>(ewaddArgs.image0.scale_address))[0];
float inScale1 = float inScale1 =
(reinterpret_cast<float*>(ewaddArgs.image1.scale_address))[0]; (reinterpret_cast<float *>(ewaddArgs.image1.scale_address))[0];
float outScale = float outScale =
(reinterpret_cast<float*>(ewaddArgs.output.scale_address))[0]; (reinterpret_cast<float *>(ewaddArgs.output.scale_address))[0];
int8_t* inPtr0 = reinterpret_cast<int8_t*>(ewaddArgs.image0.address); int8_t *inPtr0 = reinterpret_cast<int8_t *>(ewaddArgs.image0.address);
int8_t* inPtr1 = reinterpret_cast<int8_t*>(ewaddArgs.image1.address); int8_t *inPtr1 = reinterpret_cast<int8_t *>(ewaddArgs.image1.address);
int8_t* outPtr = reinterpret_cast<int8_t*>(ewaddArgs.output.address); int8_t *outPtr = reinterpret_cast<int8_t *>(ewaddArgs.output.address);
int datasize = inputc * inputh * inputw; int datasize = inputc * inputh * inputw;
float const0 = inScale0 / outScale; float const0 = inScale0 / outScale;
float const1 = inScale1 / outScale; float const1 = inScale1 / outScale;
......
...@@ -331,7 +331,7 @@ std::pair<Tensor, Tensor> ProposalForOneImage( ...@@ -331,7 +331,7 @@ std::pair<Tensor, Tensor> ProposalForOneImage(
keep_nms.Resize({post_nms_top_n}); keep_nms.Resize({post_nms_top_n});
} }
proposals.mutable_data<T>({keep_nms.numel(), 4}); // original proposals.mutable_data<T>({keep_nms.numel(), 4}); // original
scores_sel.mutable_data<int8_t>({keep_nms.numel(), 1}); // original scores_sel.mutable_data<int8_t>({keep_nms.numel(), 1}); // original
CPUGather<T>(bbox_sel, keep_nms, &proposals); CPUGather<T>(bbox_sel, keep_nms, &proposals);
...@@ -371,8 +371,8 @@ void ProposalKernel<FPGA, float>::Compute(const ProposalParam<FPGA> &param) { ...@@ -371,8 +371,8 @@ void ProposalKernel<FPGA, float>::Compute(const ProposalParam<FPGA> &param) {
for (int h = 0; h < score_height; h++) { for (int h = 0; h < score_height; h++) {
for (int w = 0; w < score_width; w++) { for (int w = 0; w < score_width; w++) {
for (int c = 0; c < score_channels; ++c) { for (int c = 0; c < score_channels; ++c) {
int dstidx = h*unalignedCW + w*score_channels + c; int dstidx = h * unalignedCW + w * score_channels + c;
int srcidx = h*alignedCW + w*score_channels + c; int srcidx = h * alignedCW + w * score_channels + c;
score_tensor.data<int8_t>()[dstidx] = input_score_data[srcidx]; score_tensor.data<int8_t>()[dstidx] = input_score_data[srcidx];
} }
} }
...@@ -388,11 +388,11 @@ void ProposalKernel<FPGA, float>::Compute(const ProposalParam<FPGA> &param) { ...@@ -388,11 +388,11 @@ void ProposalKernel<FPGA, float>::Compute(const ProposalParam<FPGA> &param) {
for (int h = 0; h < bbox_height; h++) { for (int h = 0; h < bbox_height; h++) {
for (int w = 0; w < bbox_width; w++) { for (int w = 0; w < bbox_width; w++) {
for (int c = 0; c < bbox_channels; ++c) { for (int c = 0; c < bbox_channels; ++c) {
int dstidx = h*unalignedCW + w*bbox_channels + c; int dstidx = h * unalignedCW + w * bbox_channels + c;
int srcidx = h*alignedCW + w*bbox_channels + c; int srcidx = h * alignedCW + w * bbox_channels + c;
bbox_tensor->data<float>()[dstidx] = bbox_tensor->data<float>()[dstidx] =
(static_cast<int>(input_bbox_data[srcidx]))/127.0* (static_cast<int>(input_bbox_data[srcidx])) / 127.0 *
input_bbox->scale[0]; input_bbox->scale[0];
} }
} }
} }
...@@ -412,14 +412,14 @@ void ProposalKernel<FPGA, float>::Compute(const ProposalParam<FPGA> &param) { ...@@ -412,14 +412,14 @@ void ProposalKernel<FPGA, float>::Compute(const ProposalParam<FPGA> &param) {
float min_size = param.min_size_; float min_size = param.min_size_;
float eta = param.eta_; float eta = param.eta_;
rpn_rois->mutable_data<float>({bbox_tensor->numel()/4, 4}); rpn_rois->mutable_data<float>({bbox_tensor->numel() / 4, 4});
rpn_roi_probs->mutable_data<int8_t>({input_score->numel()/4, 1}); rpn_roi_probs->mutable_data<int8_t>({input_score->numel() / 4, 1});
framework::LoD lod; framework::LoD lod;
lod.resize(1); lod.resize(1);
auto &lod0 = lod[0]; auto &lod0 = lod[0];
lod0.push_back(0); lod0.push_back(0);
anchors.Resize({anchors.numel()/4, 4}); anchors.Resize({anchors.numel() / 4, 4});
variances.Resize({variances.numel()/4, 4}); variances.Resize({variances.numel() / 4, 4});
int64_t num_proposals = 0; int64_t num_proposals = 0;
for (int64_t i = 0; i < score_n; ++i) { for (int64_t i = 0; i < score_n; ++i) {
......
...@@ -143,7 +143,6 @@ void PSRoiPoolKernel<FPGA, float>::Compute(const PSRoiPoolParam<FPGA>& param) { ...@@ -143,7 +143,6 @@ void PSRoiPoolKernel<FPGA, float>::Compute(const PSRoiPoolParam<FPGA>& param) {
"the channels of input X should equal the product of " "the channels of input X should equal the product of "
"output_channels x pooled_height x pooled_width"); "output_channels x pooled_height x pooled_width");
auto output_data = out->mutable_data<float>(); auto output_data = out->mutable_data<float>();
auto input_rois = rois->data<float>(); auto input_rois = rois->data<float>();
...@@ -173,11 +172,11 @@ void PSRoiPoolKernel<FPGA, float>::Compute(const PSRoiPoolParam<FPGA>& param) { ...@@ -173,11 +172,11 @@ void PSRoiPoolKernel<FPGA, float>::Compute(const PSRoiPoolParam<FPGA>& param) {
for (int ph = 0; ph < pooled_height; ph++) { for (int ph = 0; ph < pooled_height; ph++) {
for (int pw = 0; pw < pooled_width; pw++) { for (int pw = 0; pw < pooled_width; pw++) {
PSROIPoolingForward<float>( PSROIPoolingForward<float>(input_data, height, width, input_channels,
input_data, height, width, input_channels, offset_output_data, offset_output_data, pooled_height,
pooled_height, pooled_width, output_channels, input_rois, pooled_width, output_channels, input_rois,
bin_size_h, bin_size_w, roi_start_h, roi_start_w, pw, ph, bin_size_h, bin_size_w, roi_start_h,
scale, roi_batch_ind); roi_start_w, pw, ph, scale, roi_batch_ind);
} }
} }
} }
......
...@@ -118,11 +118,10 @@ void Reshape2Kernel<FPGA, float>::Compute(const Reshape2Param<FPGA> &param) { ...@@ -118,11 +118,10 @@ void Reshape2Kernel<FPGA, float>::Compute(const Reshape2Param<FPGA> &param) {
auto inputdimsize = input->dims().size(); auto inputdimsize = input->dims().size();
auto outputdimsize = output->dims().size(); auto outputdimsize = output->dims().size();
int smallersize = int smallersize =
inputdimsize > outputdimsize ? outputdimsize : inputdimsize; inputdimsize > outputdimsize ? outputdimsize : inputdimsize;
int i = 0; int i = 0;
for (i = 0; i < smallersize; i++) { for (i = 0; i < smallersize; i++) {
if ((input->dims())[i] != (output->dims())[i]) if ((input->dims())[i] != (output->dims())[i]) break;
break;
} }
if (i == smallersize) { if (i == smallersize) {
reshapeNeedFlg = 0; reshapeNeedFlg = 0;
......
...@@ -57,31 +57,30 @@ void SliceKernel<FPGA, float>::Compute(const SliceParam<FPGA>& param) { ...@@ -57,31 +57,30 @@ void SliceKernel<FPGA, float>::Compute(const SliceParam<FPGA>& param) {
int len = end - start; int len = end - start;
size_t size = len * sizeof(int8_t); size_t size = len * sizeof(int8_t);
DLOG << input->fpga_data_num; DLOG << input->fpga_data_num;
fpga::fpga_invalidate(input_ptr, input->fpga_data_num*sizeof(int8_t)); fpga::fpga_invalidate(input_ptr, input->fpga_data_num * sizeof(int8_t));
DLOG << output->fpga_data_num; DLOG << output->fpga_data_num;
fpga::fpga_invalidate(output_ptr, output->fpga_data_num*sizeof(int8_t)); fpga::fpga_invalidate(output_ptr, output->fpga_data_num * sizeof(int8_t));
int unalignedWC = len * W; int unalignedWC = len * W;
int alignedWC = fpga::align_to_x(W * len, IMAGE_ALIGNMENT); int alignedWC = fpga::align_to_x(W * len, IMAGE_ALIGNMENT);
if (unalignedWC != alignedWC) { if (unalignedWC != alignedWC) {
auto tmpOutput = reinterpret_cast<int8_t*> auto tmpOutput =
(fpga::fpga_malloc(len*HW * sizeof(int8_t))); reinterpret_cast<int8_t*>(fpga::fpga_malloc(len * HW * sizeof(int8_t)));
for (int i = 0; i < HW; i++) { for (int i = 0; i < HW; i++) {
memcpy(tmpOutput + len * i, input_ptr + i * channel + start, size); memcpy(tmpOutput + len * i, input_ptr + i * channel + start, size);
}
for (int i = 0; i < H; i++) {
for (int j = 0; j < unalignedWC; j++) {
*(output_ptr + alignedWC * i + j) = *(tmpOutput + unalignedWC * i + j);
} }
for (int i = 0; i < H; i++) { }
for (int j = 0; j < unalignedWC; j++) { fpga::fpga_free(tmpOutput);
*(output_ptr + alignedWC * i + j) =
*(tmpOutput + unalignedWC * i + j);
}
}
fpga::fpga_free(tmpOutput);
} else { } else {
for (int i = 0; i < HW; i++) { for (int i = 0; i < HW; i++) {
memcpy(output_ptr + len * i, input_ptr + i * channel + start, size); memcpy(output_ptr + len * i, input_ptr + i * channel + start, size);
} }
} }
fpga::fpga_flush(output_ptr, output->fpga_data_num*sizeof(int8_t)); fpga::fpga_flush(output_ptr, output->fpga_data_num * sizeof(int8_t));
} }
} // namespace operators } // namespace operators
} // namespace paddle_mobile } // namespace paddle_mobile
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册