未验证 提交 c1372bf9 编写于 作者: J Jiaying Zhao 提交者: GitHub

[Mobile][FPGA]Fix code style. test=mobile (#2638)

上级 a13c592d
......@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
limitations under the License. */
#include "fpga/V2/bias_scale.h"
#include <memory.h>
#include <math.h>
#include <memory.h>
#include "fpga/common/fpga_common.h"
namespace paddle_mobile {
......@@ -56,15 +56,16 @@ void align_element(float **data_in, int num_per_div_before_alignment, int num) {
*data_in = ptr_aligned;
}
// Convert an interleaved float scale/bias buffer to fixed-point, in place.
//
// The buffer holds data_len floats: the first half are scales, the second
// half are biases. Each scale is quantized to a Q23 fixed-point integer
// (value * 2^23) and each bias to Q30 (value * 2^30), rounded to nearest,
// and written back over the same storage as 32-bit ints.
//
// data_in  : pointer to data_len contiguous floats; overwritten with ints.
// data_len : total element count (scales + biases); assumed even.
void fixed_scale_bias_new(void *data_in, int data_len) {
  // Loop-invariant quantization factors: exact powers of two, hoisted out
  // of the loop (the original recomputed pow() every iteration).
  const double kScaleFactor = 8388608.0;     // 2^23, Q23 for scales
  const double kBiasFactor = 1073741824.0;   // 2^30, Q30 for biases
  int *data_tmp = static_cast<int *>(data_in);
  const int half_len = data_len / 2;
  for (int idx = 0; idx < half_len; ++idx) {
    // Read the float before overwriting the slot with its fixed-point form.
    float tmp = (static_cast<float *>(data_in))[idx];
    data_tmp[idx] = static_cast<int>(round(tmp * kScaleFactor));
    tmp = (static_cast<float *>(data_in))[idx + half_len];
    data_tmp[idx + half_len] = static_cast<int>(round(tmp * kBiasFactor));
  }
}
void interleave(float **data_in, int num_after_alignment) {
......
......@@ -94,11 +94,10 @@ void concat_images(int8_t **images_in, float **scales_in, void *image_out,
for (i = 0; i < image_num; i++) {
align_each_in_area_cw =
align_to_x(channel_num[i] * width, IMAGE_ALIGNMENT);
memcpy(
(int8_t *)image_out + tmp_channel + // NOLINT
k * align_each_out_area_cw_differ,
images_in[i] + j * channel_num[i] + k * align_each_in_area_cw,
channel_num[i] * sizeof(int8_t));
memcpy((int8_t *)image_out + tmp_channel + // NOLINT
k * align_each_out_area_cw_differ,
images_in[i] + j * channel_num[i] + k * align_each_in_area_cw,
channel_num[i] * sizeof(int8_t));
tmp_channel += channel_num[i];
}
......
......@@ -257,8 +257,8 @@ int ComputeBasicConv(const struct ConvArgs &args) {
pthread_mutex_unlock(&g_fpgainfo.pe_data->mutex);
return ret;
}
// reg_writeq(reg_ActivationArgs,
// REG_ACTIVATION_MODE_AND_LEAKY_RELU_FACTOR); // active function
// reg_writeq(reg_ActivationArgs,
// REG_ACTIVATION_MODE_AND_LEAKY_RELU_FACTOR); // active function
reg_writeq(output_scale, REG_SCALE_PARAMETER);
// new
......@@ -274,10 +274,10 @@ int ComputeBasicConv(const struct ConvArgs &args) {
args.driver.filter_pad_width_mul_channel,
REG_CONV_REG1);
reg_writeq((args.driver.stride_h << 50) | (args.driver.skip_window << 30) |
(args.driver.filter_row << 10) |
(args.driver.filter_height << 5) | args.driver.filter_width,
REG_CONV_REG2);
reg_writeq((args.driver.stride_h << 50) | (args.driver.skip_window << 30) |
(args.driver.filter_row << 10) |
(args.driver.filter_height << 5) | args.driver.filter_width,
REG_CONV_REG2);
reg_writeq((args.driver.filter_num << 42) | (args.driver.filter_align << 26) |
(args.driver.prog_full_cnt << 16) |
......@@ -369,74 +369,77 @@ int ComputeFpgaPool(const struct PoolingArgs &args) {
uint64_t cmd = 0;
uint64_t image_physical_address = 0;
uint64_t output_physical_address = 0;
uint64_t bypass_interrupt = reg_readq(REG_INTERRUPT);
uint64_t bypass_interrupt = reg_readq(REG_INTERRUPT);
image_physical_address = vaddr_to_paddr(args.image.address);
output_physical_address = vaddr_to_paddr(args.output.address);
uint64_t C_paral_64 = align_to_x((uint64_t)args.image.channels, 64);
uint64_t C_align_32 = align_to_x((uint64_t)args.image.channels, 32);
uint64_t output_height = (uint64_t)(
(args.image.height + args.image.pad_height * 2 - args.kernel.height) /
args.kernel.stride_h + 1);
args.kernel.stride_h +
1);
uint64_t output_width = (uint64_t)(
(args.image.width + args.image.pad_width * 2 - args.kernel.width) /
args.kernel.stride_w + 1);
args.kernel.stride_w +
1);
uint64_t image_amount_per_row =
align_to_x((uint64_t)args.image.width * (uint64_t)args.image.channels,
IMAGE_ALIGNMENT);
uint64_t image_one_pad_per_row = (uint64_t)args.image.width *
(uint64_t)args.image.channels +(uint64_t)args.image.pad_width *
(uint64_t)args.image.channels;
uint64_t image_one_pad_per_row =
(uint64_t)args.image.width * (uint64_t)args.image.channels +
(uint64_t)args.image.pad_width * (uint64_t)args.image.channels;
uint64_t result_amount_align_32 = align_to_x((uint64_t)output_width *
(uint64_t)args.image.channels, 32);
uint64_t result_amount_align_32 =
align_to_x((uint64_t)output_width * (uint64_t)args.image.channels, 32);
uint64_t result_addr_row =
(result_amount_align_32 << 32) | output_physical_address;
(result_amount_align_32 << 32) | output_physical_address;
uint64_t row_padding_down =
(uint64_t)args.image.height + (uint64_t)args.image.pad_height;
uint64_t kernel_width_sub1 =
(uint64_t)args.kernel.width - 1;
(uint64_t)args.image.height + (uint64_t)args.image.pad_height;
uint64_t kernel_width_sub1 = (uint64_t)args.kernel.width - 1;
uint64_t kernel_padding_step = row_padding_down |
((uint64_t)args.image.pad_height << 16) |
((uint64_t)args.kernel.stride_h << 24) |
((uint64_t)kernel_width_sub1<<32) |
((uint64_t)args.kernel.height << 40) |
((uint64_t)(args.kernel.height-1) << 48);
uint64_t image_calcu_height = (uint64_t)args.kernel.height +
(output_height - 1) * (uint64_t)args.kernel.stride_h;
((uint64_t)args.image.pad_height << 16) |
((uint64_t)args.kernel.stride_h << 24) |
((uint64_t)kernel_width_sub1 << 32) |
((uint64_t)args.kernel.height << 40) |
((uint64_t)(args.kernel.height - 1) << 48);
uint64_t image_calcu_height =
(uint64_t)args.kernel.height +
(output_height - 1) * (uint64_t)args.kernel.stride_h;
uint64_t result_size_calcu_height = (output_height - 1) |
((output_width - 1) << 16) | (image_calcu_height << 32);
uint64_t col_padding_down = ((uint64_t)args.image.width +
(uint64_t)args.image.pad_width) * (uint64_t)args.image.channels;
((output_width - 1) << 16) |
(image_calcu_height << 32);
uint64_t col_padding_down =
((uint64_t)args.image.width + (uint64_t)args.image.pad_width) *
(uint64_t)args.image.channels;
uint64_t image_row_col_padding_down =
image_amount_per_row | (col_padding_down << 32);
image_amount_per_row | (col_padding_down << 32);
uint64_t image_rowXpadding_h =
image_amount_per_row * (uint64_t)args.image.pad_height;
image_amount_per_row * (uint64_t)args.image.pad_height;
uint64_t image_rowXstep_h =
image_amount_per_row * (uint64_t)args.kernel.stride_h;
image_amount_per_row * (uint64_t)args.kernel.stride_h;
uint64_t image_rowXpad_h_rowXstep_h =
image_rowXpadding_h | (image_rowXstep_h << 32);
image_rowXpadding_h | (image_rowXstep_h << 32);
uint64_t channelXpad_w =
(uint64_t)args.image.channels * (uint64_t)args.image.pad_width;
(uint64_t)args.image.channels * (uint64_t)args.image.pad_width;
uint64_t channelXstep_w =
(uint64_t)args.image.channels * (uint64_t)args.kernel.stride_w;
(uint64_t)args.image.channels * (uint64_t)args.kernel.stride_w;
uint64_t channelXpad_w_channelXstep_w =
channelXpad_w | (channelXstep_w << 32);
uint64_t filter_row_align =
C_align_32 * (uint64_t)args.kernel.width;
uint64_t sub_filter_amount_align = C_align_32 *
(uint64_t)args.kernel.width * (uint64_t)args.kernel.height;
channelXpad_w | (channelXstep_w << 32);
uint64_t filter_row_align = C_align_32 * (uint64_t)args.kernel.width;
uint64_t sub_filter_amount_align =
C_align_32 * (uint64_t)args.kernel.width * (uint64_t)args.kernel.height;
uint64_t mult_factor = 0;
float average_reciprocal = args.kernel_reciprocal;
uint32_t* kernel_reciprocal;
kernel_reciprocal =(reinterpret_cast<uint32_t*>(&average_reciprocal));
uint32_t *kernel_reciprocal;
kernel_reciprocal = (reinterpret_cast<uint32_t *>(&average_reciprocal));
if (args.mode == 1)
mult_factor = (uint64_t)(*kernel_reciprocal) |
((uint64_t)1 << 32) | ((uint64_t)1 << 40);
mult_factor = (uint64_t)(*kernel_reciprocal) | ((uint64_t)1 << 32) |
((uint64_t)1 << 40);
else
mult_factor =
(uint64_t)0x3f800000 | ((uint64_t)1 << 32) | ((uint64_t)1 << 40);
(uint64_t)0x3f800000 | ((uint64_t)1 << 32) | ((uint64_t)1 << 40);
pthread_mutex_lock(&g_fpgainfo.pe_data->mutex);
if (ERROR == g_fpgainfo.pe_data->pes[PE_IDX_POOLING]->status) {
ret = -EIO;
......@@ -501,7 +504,7 @@ int ComputeFpgaEWAdd(const struct EWAddArgs &args) {
#endif
#ifdef PADDLE_MOBILE_ZU5
int ret = 0;
uint64_t bypass_interrupt = reg_readq(REG_INTERRUPT);
uint64_t bypass_interrupt = reg_readq(REG_INTERRUPT);
pthread_mutex_lock(&g_fpgainfo.pe_data->mutex);
if (ERROR == g_fpgainfo.pe_data->pes[PE_IDX_EW]->status) {
......@@ -511,7 +514,6 @@ uint64_t bypass_interrupt = reg_readq(REG_INTERRUPT);
return ret;
}
uint64_t image0_physical_address = 0;
uint64_t image1_physical_address = 0;
uint64_t image_physical_address = 0;
......@@ -519,26 +521,28 @@ uint64_t bypass_interrupt = reg_readq(REG_INTERRUPT);
image0_physical_address = vaddr_to_paddr(args.image0.address);
image1_physical_address = vaddr_to_paddr(args.image1.address);
image_physical_address =
image0_physical_address | (image1_physical_address << 32);
image0_physical_address | (image1_physical_address << 32);
output_physical_address = vaddr_to_paddr(args.output.address);
uint64_t image_amount_per_row =
align_to_x((uint64_t)args.image0.width *
(uint64_t)args.image0.channels, IMAGE_ALIGNMENT);
align_to_x((uint64_t)args.image0.width * (uint64_t)args.image0.channels,
IMAGE_ALIGNMENT);
uint64_t result_addr_row =
output_physical_address | (image_amount_per_row << 32);
output_physical_address | (image_amount_per_row << 32);
uint64_t kernel_padding_step = 0;
kernel_padding_step = ((uint64_t)args.image0.height * 2) |
((uint64_t)2 << 24) | ((uint64_t)2 << 40) | ((uint64_t)1 << 48);
uint64_t result_size_calcu_height = ((uint64_t)args.image0.height - 1) |
((image_amount_per_row / 32 - 1) << 16) |
(((uint64_t)args.image0.height * 2) << 32);
uint64_t image_row_col_padding_down = image_amount_per_row |
(image_amount_per_row << 32);
float quantParam =
((args.image0.scale_address)[0]) / ((args.output.scale_address)[0]);
uint32_t* ew_scale = reinterpret_cast<uint32_t*>(&quantParam);
uint64_t ew_scale_mult_factor = (*ew_scale) |
((uint64_t)args.const0 << 32) | ((uint64_t)args.const1 << 40);
((uint64_t)2 << 24) | ((uint64_t)2 << 40) |
((uint64_t)1 << 48);
uint64_t result_size_calcu_height =
((uint64_t)args.image0.height - 1) |
((image_amount_per_row / 32 - 1) << 16) |
(((uint64_t)args.image0.height * 2) << 32);
uint64_t image_row_col_padding_down =
image_amount_per_row | (image_amount_per_row << 32);
float quantParam =
((args.image0.scale_address)[0]) / ((args.output.scale_address)[0]);
uint32_t *ew_scale = reinterpret_cast<uint32_t *>(&quantParam);
uint64_t ew_scale_mult_factor = (*ew_scale) | ((uint64_t)args.const0 << 32) |
((uint64_t)args.const1 << 40);
reg_writeq(0ul, REG_SCALE_PARAMETER);
reg_writeq(image_physical_address, 0x808);
reg_writeq(result_addr_row, 0x810);
......@@ -546,7 +550,7 @@ uint64_t bypass_interrupt = reg_readq(REG_INTERRUPT);
reg_writeq(result_size_calcu_height, 0x820);
reg_writeq(32, 0x828);
reg_writeq(image_row_col_padding_down, 0x830);
reg_writeq(((image_amount_per_row*2) << 32), 0x838);
reg_writeq(((image_amount_per_row * 2) << 32), 0x838);
reg_writeq(ew_scale_mult_factor, 0x840); // dw donot care
reg_writeq(((uint64_t)32 << 32), 0x848);
reg_writeq(0, 0x858);
......@@ -924,7 +928,7 @@ int ComputeDWConv(const struct DWconvArgs &args) {
<< " pad_height:" << args.image.pad_height
<< " pad_width:" << args.image.pad_width;
DLOG << " filter_address:" << args.filter_address;
//<< " bias_address:" << args.bias_address;
//<< " bias_address:" << args.bias_address;
DLOG << " kernel_height:" << args.kernel.height
<< " kernel_width:" << args.kernel.width
<< " stride_h:" << args.kernel.stride_h
......@@ -950,67 +954,71 @@ int ComputeDWConv(const struct DWconvArgs &args) {
bias_physical_address = vaddr_to_paddr(args.bias_address);
uint64_t C_align_64 = align_to_x((uint64_t)args.image.channels, 64);
uint64_t C_align_32 = align_to_x((uint64_t)args.image.channels, 32);
uint64_t output_height = (uint64_t)
((args.image.height + args.image.pad_height * 2 -
args.kernel.height) / args.kernel.stride_h +1);
uint64_t output_width = (uint64_t)
(((args.image.width + args.image.pad_width * 2 - args.kernel.width) /
args.kernel.stride_w + 1) * args.sub_conv_num);
uint64_t output_height = (uint64_t)(
(args.image.height + args.image.pad_height * 2 - args.kernel.height) /
args.kernel.stride_h +
1);
uint64_t output_width = (uint64_t)(
((args.image.width + args.image.pad_width * 2 - args.kernel.width) /
args.kernel.stride_w +
1) *
args.sub_conv_num);
uint64_t image_amount_per_row =
align_to_x((uint64_t)args.image.width *
(uint64_t)args.image.channels, IMAGE_ALIGNMENT);
align_to_x((uint64_t)args.image.width * (uint64_t)args.image.channels,
IMAGE_ALIGNMENT);
uint64_t image_one_pad_per_row =
(uint64_t)args.image.width * (uint64_t)args.image.channels +
(uint64_t)args.image.pad_width * (uint64_t)args.image.channels;
(uint64_t)args.image.width * (uint64_t)args.image.channels +
(uint64_t)args.image.pad_width * (uint64_t)args.image.channels;
uint64_t result_amount_align_32 = align_to_x(
(uint64_t)output_width * (uint64_t)args.image.channels, 32);
uint64_t result_amount_align_32 =
align_to_x((uint64_t)output_width * (uint64_t)args.image.channels, 32);
uint64_t result_addr_row =
(result_amount_align_32 << 32) | output_physical_address;
(result_amount_align_32 << 32) | output_physical_address;
uint64_t row_padding_down =
(uint64_t)args.image.height + (uint64_t)args.image.pad_height;
(uint64_t)args.image.height + (uint64_t)args.image.pad_height;
uint64_t kernel_width_sub1 = (uint64_t)args.kernel.width - 1;
uint64_t kernel_padding_step = row_padding_down |
((uint64_t)args.image.pad_height << 16) |
((uint64_t)args.kernel.stride_h << 24) |
((uint64_t)kernel_width_sub1<<32) |
((uint64_t)args.kernel.height << 40) |
((uint64_t)(args.kernel.height-1) << 48);
uint64_t image_calcu_height = (uint64_t)args.kernel.height +
(output_height - 1) * (uint64_t)args.kernel.stride_h;
((uint64_t)args.image.pad_height << 16) |
((uint64_t)args.kernel.stride_h << 24) |
((uint64_t)kernel_width_sub1 << 32) |
((uint64_t)args.kernel.height << 40) |
((uint64_t)(args.kernel.height - 1) << 48);
uint64_t image_calcu_height =
(uint64_t)args.kernel.height +
(output_height - 1) * (uint64_t)args.kernel.stride_h;
uint64_t result_size_calcu_height = (output_height - 1) |
((output_width - 1) << 16) | (image_calcu_height << 32);
uint64_t col_padding_down = ((uint64_t)args.image.width +
(uint64_t)args.image.pad_width) * (uint64_t)args.image.channels;
((output_width - 1) << 16) |
(image_calcu_height << 32);
uint64_t col_padding_down =
((uint64_t)args.image.width + (uint64_t)args.image.pad_width) *
(uint64_t)args.image.channels;
uint64_t image_row_col_padding_down =
image_amount_per_row | (col_padding_down << 32);
image_amount_per_row | (col_padding_down << 32);
uint64_t image_rowXpadding_h =
image_amount_per_row * (uint64_t)args.image.pad_height;
image_amount_per_row * (uint64_t)args.image.pad_height;
uint64_t image_rowXstep_h =
image_amount_per_row * (uint64_t)args.kernel.stride_h;
image_amount_per_row * (uint64_t)args.kernel.stride_h;
uint64_t image_rowXpad_h_rowXstep_h =
image_rowXpadding_h | (image_rowXstep_h << 32);
image_rowXpadding_h | (image_rowXstep_h << 32);
uint64_t channelXpad_w =
(uint64_t)args.image.channels * (uint64_t)args.image.pad_width;
(uint64_t)args.image.channels * (uint64_t)args.image.pad_width;
uint64_t channelXstep_w =
(uint64_t)args.image.channels * (uint64_t)args.kernel.stride_w;
(uint64_t)args.image.channels * (uint64_t)args.kernel.stride_w;
uint64_t channelXpad_w_channelXstep_w =
channelXpad_w | (channelXstep_w << 32);
channelXpad_w | (channelXstep_w << 32);
uint64_t filter_row_align =
C_align_64 * (uint64_t)args.kernel.width;
uint64_t sub_filter_amount_align = C_align_64 *
(uint64_t)args.kernel.width *
(uint64_t)args.kernel.height;
uint64_t filter_row_align = C_align_64 * (uint64_t)args.kernel.width;
uint64_t sub_filter_amount_align =
C_align_64 * (uint64_t)args.kernel.width * (uint64_t)args.kernel.height;
uint64_t filter_amount_align =
sub_filter_amount_align * (uint64_t)args.sub_conv_num;
sub_filter_amount_align * (uint64_t)args.sub_conv_num;
uint64_t filter_param = filter_row_align | (filter_amount_align << 16) |
(sub_filter_amount_align << 32) |
(((uint64_t)args.sub_conv_num -1) << 48);
(sub_filter_amount_align << 32) |
(((uint64_t)args.sub_conv_num - 1) << 48);
uint64_t channel_parameter =
(uint64_t)args.image.channels | (C_align_64 << 16);
(uint64_t)args.image.channels | (C_align_64 << 16);
pthread_mutex_lock(&g_fpgainfo.pe_data->mutex);
if (ERROR == g_fpgainfo.pe_data->pes[PE_IDX_POOLING]->status) {
ret = -EIO;
......@@ -1030,8 +1038,9 @@ int ComputeDWConv(const struct DWconvArgs &args) {
reg_writeq(channelXpad_w_channelXstep_w, 0x848);
reg_writeq(filter_physical_address, 0x850);
reg_writeq(filter_param, 0x858);
reg_writeq(((bias_physical_address+C_align_64*4) |
(bias_physical_address << 32)), 0x860);
reg_writeq(((bias_physical_address + C_align_64 * 4) |
(bias_physical_address << 32)),
0x860);
cmd = (uint64_t)1 | (((uint64_t)args.relu_enabled) << 8);
reg_writeq(cmd, 0x800);
......
......@@ -12,8 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifdef ELEMENTWISEADD_OP
#include <math.h>
#include "operators/kernel/elementwise_add_kernel.h"
#include <math.h>
namespace paddle_mobile {
namespace operators {
......@@ -62,14 +62,14 @@ void ComputeCPUEWAdd(fpga::EWAddArgs ewaddArgs) {
int inputh = ewaddArgs.image0.height;
int inputw = ewaddArgs.image0.width;
float inScale0 =
(reinterpret_cast<float*>(ewaddArgs.image0.scale_address))[0];
(reinterpret_cast<float *>(ewaddArgs.image0.scale_address))[0];
float inScale1 =
(reinterpret_cast<float*>(ewaddArgs.image1.scale_address))[0];
(reinterpret_cast<float *>(ewaddArgs.image1.scale_address))[0];
float outScale =
(reinterpret_cast<float*>(ewaddArgs.output.scale_address))[0];
int8_t* inPtr0 = reinterpret_cast<int8_t*>(ewaddArgs.image0.address);
int8_t* inPtr1 = reinterpret_cast<int8_t*>(ewaddArgs.image1.address);
int8_t* outPtr = reinterpret_cast<int8_t*>(ewaddArgs.output.address);
(reinterpret_cast<float *>(ewaddArgs.output.scale_address))[0];
int8_t *inPtr0 = reinterpret_cast<int8_t *>(ewaddArgs.image0.address);
int8_t *inPtr1 = reinterpret_cast<int8_t *>(ewaddArgs.image1.address);
int8_t *outPtr = reinterpret_cast<int8_t *>(ewaddArgs.output.address);
int datasize = inputc * inputh * inputw;
float const0 = inScale0 / outScale;
float const1 = inScale1 / outScale;
......
......@@ -12,8 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifdef FUSION_ELEMENTWISEADDRELU_OP
#include <math.h>
#include "operators/kernel/elementwise_add_relu_kernel.h"
#include <math.h>
namespace paddle_mobile {
namespace operators {
......@@ -63,14 +63,14 @@ void ComputeCPUEWAddRelu(fpga::EWAddArgs ewaddArgs) {
int inputh = ewaddArgs.image0.height;
int inputw = ewaddArgs.image0.width;
float inScale0 =
(reinterpret_cast<float*>(ewaddArgs.image0.scale_address))[0];
(reinterpret_cast<float *>(ewaddArgs.image0.scale_address))[0];
float inScale1 =
(reinterpret_cast<float*>(ewaddArgs.image1.scale_address))[0];
(reinterpret_cast<float *>(ewaddArgs.image1.scale_address))[0];
float outScale =
(reinterpret_cast<float*>(ewaddArgs.output.scale_address))[0];
int8_t* inPtr0 = reinterpret_cast<int8_t*>(ewaddArgs.image0.address);
int8_t* inPtr1 = reinterpret_cast<int8_t*>(ewaddArgs.image1.address);
int8_t* outPtr = reinterpret_cast<int8_t*>(ewaddArgs.output.address);
(reinterpret_cast<float *>(ewaddArgs.output.scale_address))[0];
int8_t *inPtr0 = reinterpret_cast<int8_t *>(ewaddArgs.image0.address);
int8_t *inPtr1 = reinterpret_cast<int8_t *>(ewaddArgs.image1.address);
int8_t *outPtr = reinterpret_cast<int8_t *>(ewaddArgs.output.address);
int datasize = inputc * inputh * inputw;
float const0 = inScale0 / outScale;
float const1 = inScale1 / outScale;
......
......@@ -331,7 +331,7 @@ std::pair<Tensor, Tensor> ProposalForOneImage(
keep_nms.Resize({post_nms_top_n});
}
proposals.mutable_data<T>({keep_nms.numel(), 4}); // original
proposals.mutable_data<T>({keep_nms.numel(), 4}); // original
scores_sel.mutable_data<int8_t>({keep_nms.numel(), 1}); // original
CPUGather<T>(bbox_sel, keep_nms, &proposals);
......@@ -371,8 +371,8 @@ void ProposalKernel<FPGA, float>::Compute(const ProposalParam<FPGA> &param) {
for (int h = 0; h < score_height; h++) {
for (int w = 0; w < score_width; w++) {
for (int c = 0; c < score_channels; ++c) {
int dstidx = h*unalignedCW + w*score_channels + c;
int srcidx = h*alignedCW + w*score_channels + c;
int dstidx = h * unalignedCW + w * score_channels + c;
int srcidx = h * alignedCW + w * score_channels + c;
score_tensor.data<int8_t>()[dstidx] = input_score_data[srcidx];
}
}
......@@ -388,11 +388,11 @@ void ProposalKernel<FPGA, float>::Compute(const ProposalParam<FPGA> &param) {
for (int h = 0; h < bbox_height; h++) {
for (int w = 0; w < bbox_width; w++) {
for (int c = 0; c < bbox_channels; ++c) {
int dstidx = h*unalignedCW + w*bbox_channels + c;
int srcidx = h*alignedCW + w*bbox_channels + c;
int dstidx = h * unalignedCW + w * bbox_channels + c;
int srcidx = h * alignedCW + w * bbox_channels + c;
bbox_tensor->data<float>()[dstidx] =
(static_cast<int>(input_bbox_data[srcidx]))/127.0*
input_bbox->scale[0];
(static_cast<int>(input_bbox_data[srcidx])) / 127.0 *
input_bbox->scale[0];
}
}
}
......@@ -412,14 +412,14 @@ void ProposalKernel<FPGA, float>::Compute(const ProposalParam<FPGA> &param) {
float min_size = param.min_size_;
float eta = param.eta_;
rpn_rois->mutable_data<float>({bbox_tensor->numel()/4, 4});
rpn_roi_probs->mutable_data<int8_t>({input_score->numel()/4, 1});
rpn_rois->mutable_data<float>({bbox_tensor->numel() / 4, 4});
rpn_roi_probs->mutable_data<int8_t>({input_score->numel() / 4, 1});
framework::LoD lod;
lod.resize(1);
auto &lod0 = lod[0];
lod0.push_back(0);
anchors.Resize({anchors.numel()/4, 4});
variances.Resize({variances.numel()/4, 4});
anchors.Resize({anchors.numel() / 4, 4});
variances.Resize({variances.numel() / 4, 4});
int64_t num_proposals = 0;
for (int64_t i = 0; i < score_n; ++i) {
......
......@@ -143,7 +143,6 @@ void PSRoiPoolKernel<FPGA, float>::Compute(const PSRoiPoolParam<FPGA>& param) {
"the channels of input X should equal the product of "
"output_channels x pooled_height x pooled_width");
auto output_data = out->mutable_data<float>();
auto input_rois = rois->data<float>();
......@@ -173,11 +172,11 @@ void PSRoiPoolKernel<FPGA, float>::Compute(const PSRoiPoolParam<FPGA>& param) {
for (int ph = 0; ph < pooled_height; ph++) {
for (int pw = 0; pw < pooled_width; pw++) {
PSROIPoolingForward<float>(
input_data, height, width, input_channels, offset_output_data,
pooled_height, pooled_width, output_channels, input_rois,
bin_size_h, bin_size_w, roi_start_h, roi_start_w, pw, ph,
scale, roi_batch_ind);
PSROIPoolingForward<float>(input_data, height, width, input_channels,
offset_output_data, pooled_height,
pooled_width, output_channels, input_rois,
bin_size_h, bin_size_w, roi_start_h,
roi_start_w, pw, ph, scale, roi_batch_ind);
}
}
}
......
......@@ -118,11 +118,10 @@ void Reshape2Kernel<FPGA, float>::Compute(const Reshape2Param<FPGA> &param) {
auto inputdimsize = input->dims().size();
auto outputdimsize = output->dims().size();
int smallersize =
inputdimsize > outputdimsize ? outputdimsize : inputdimsize;
inputdimsize > outputdimsize ? outputdimsize : inputdimsize;
int i = 0;
for (i = 0; i < smallersize; i++) {
if ((input->dims())[i] != (output->dims())[i])
break;
if ((input->dims())[i] != (output->dims())[i]) break;
}
if (i == smallersize) {
reshapeNeedFlg = 0;
......
......@@ -57,31 +57,30 @@ void SliceKernel<FPGA, float>::Compute(const SliceParam<FPGA>& param) {
int len = end - start;
size_t size = len * sizeof(int8_t);
DLOG << input->fpga_data_num;
fpga::fpga_invalidate(input_ptr, input->fpga_data_num*sizeof(int8_t));
fpga::fpga_invalidate(input_ptr, input->fpga_data_num * sizeof(int8_t));
DLOG << output->fpga_data_num;
fpga::fpga_invalidate(output_ptr, output->fpga_data_num*sizeof(int8_t));
fpga::fpga_invalidate(output_ptr, output->fpga_data_num * sizeof(int8_t));
int unalignedWC = len * W;
int alignedWC = fpga::align_to_x(W * len, IMAGE_ALIGNMENT);
if (unalignedWC != alignedWC) {
auto tmpOutput = reinterpret_cast<int8_t*>
(fpga::fpga_malloc(len*HW * sizeof(int8_t)));
for (int i = 0; i < HW; i++) {
memcpy(tmpOutput + len * i, input_ptr + i * channel + start, size);
auto tmpOutput =
reinterpret_cast<int8_t*>(fpga::fpga_malloc(len * HW * sizeof(int8_t)));
for (int i = 0; i < HW; i++) {
memcpy(tmpOutput + len * i, input_ptr + i * channel + start, size);
}
for (int i = 0; i < H; i++) {
for (int j = 0; j < unalignedWC; j++) {
*(output_ptr + alignedWC * i + j) = *(tmpOutput + unalignedWC * i + j);
}
for (int i = 0; i < H; i++) {
for (int j = 0; j < unalignedWC; j++) {
*(output_ptr + alignedWC * i + j) =
*(tmpOutput + unalignedWC * i + j);
}
}
fpga::fpga_free(tmpOutput);
}
fpga::fpga_free(tmpOutput);
} else {
for (int i = 0; i < HW; i++) {
memcpy(output_ptr + len * i, input_ptr + i * channel + start, size);
}
for (int i = 0; i < HW; i++) {
memcpy(output_ptr + len * i, input_ptr + i * channel + start, size);
}
}
fpga::fpga_flush(output_ptr, output->fpga_data_num*sizeof(int8_t));
fpga::fpga_flush(output_ptr, output->fpga_data_num * sizeof(int8_t));
}
} // namespace operators
} // namespace paddle_mobile
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册