提交 bb42168c 编写于 作者: H hjchen2

Merge branch 'develop' of https://github.com/PaddlePaddle/paddle-mobile into ocr_ctc

......@@ -175,19 +175,17 @@ class ViewController: UIViewController {
override func viewDidLoad() {
super.viewDidLoad()
// if runner.load() {
// print(" load success ! ")
// } else {
// print(" load error ! ")
// }
//
modelPickerView.delegate = self
modelPickerView.dataSource = self
threadPickerView.delegate = self
threadPickerView.dataSource = self
if let image = UIImage.init(named: "test.jpg") {
selectImage = image
selectImageView.image = image
} else {
print("请添加测试图片")
}
selectImage = UIImage.init(named: "hand.jpg")
selectImageView.image = selectImage
// if platform == .CPU {
// inputPointer = runner.preproccess(image: selectImage!.cgImage!)
......
......@@ -902,8 +902,8 @@
baseConfigurationReference = CDF58151D902A1CBAE56A0C2 /* Pods-paddle-mobile.debug.xcconfig */;
buildSettings = {
CLANG_ENABLE_MODULES = YES;
CODE_SIGN_IDENTITY = "";
CODE_SIGN_STYLE = Manual;
CODE_SIGN_IDENTITY = "iPhone Developer";
CODE_SIGN_STYLE = Automatic;
DEFINES_MODULE = YES;
DEVELOPMENT_TEAM = "";
DYLIB_COMPATIBILITY_VERSION = 1;
......@@ -922,7 +922,7 @@
"$(inherited)",
"$(PROJECT_DIR)/paddle-mobile/CPU",
);
MACH_O_TYPE = staticlib;
MACH_O_TYPE = mh_dylib;
MTL_LANGUAGE_REVISION = UseDeploymentTarget;
PRODUCT_BUNDLE_IDENTIFIER = "orange.paddle-mobile";
PRODUCT_NAME = "$(TARGET_NAME:c99extidentifier)";
......@@ -939,8 +939,8 @@
baseConfigurationReference = E2A7957C92EDA5C3BEC0FFC2 /* Pods-paddle-mobile.release.xcconfig */;
buildSettings = {
CLANG_ENABLE_MODULES = YES;
CODE_SIGN_IDENTITY = "";
CODE_SIGN_STYLE = Manual;
CODE_SIGN_IDENTITY = "iPhone Developer";
CODE_SIGN_STYLE = Automatic;
DEFINES_MODULE = YES;
DEVELOPMENT_TEAM = "";
DYLIB_COMPATIBILITY_VERSION = 1;
......@@ -959,7 +959,7 @@
"$(inherited)",
"$(PROJECT_DIR)/paddle-mobile/CPU",
);
MACH_O_TYPE = staticlib;
MACH_O_TYPE = mh_dylib;
MTL_LANGUAGE_REVISION = UseDeploymentTarget;
PRODUCT_BUNDLE_IDENTIFIER = "orange.paddle-mobile";
PRODUCT_NAME = "$(TARGET_NAME:c99extidentifier)";
......
......@@ -31,8 +31,8 @@ kernel void fetch(texture2d_array<float, access::read> inTexture [[texture(0)]],
int output_to = 4 * input_width * input_height;
output[gid.z * output_to + 0 * input_width * input_height + gid.y * input_width + gid.x] = input.x;
output[gid.z * output_to + 1 * input_width * input_height + gid.y * input_width + gid.x] = input.y;
// output[gid.z * output_to + 2 * input_width * input_height + gid.y * input_width + gid.x] = input.z;
// output[gid.z * output_to + 3 * input_width * input_height + gid.y * input_width + gid.x] = input.w;
output[gid.z * output_to + 2 * input_width * input_height + gid.y * input_width + gid.x] = input.z;
output[gid.z * output_to + 3 * input_width * input_height + gid.y * input_width + gid.x] = input.w;
}
......@@ -52,8 +52,8 @@ kernel void fetch_half(texture2d_array<half, access::read> inTexture [[texture(0
int output_to = 4 * input_width * input_height;
output[gid.z * output_to + 0 * input_width * input_height + gid.y * input_width + gid.x] = input.x;
output[gid.z * output_to + 1 * input_width * input_height + gid.y * input_width + gid.x] = input.y;
// output[gid.z * output_to + 2 * input_width * input_height + gid.y * input_width + gid.x] = input.z;
// output[gid.z * output_to + 3 * input_width * input_height + gid.y * input_width + gid.x] = input.w;
output[gid.z * output_to + 2 * input_width * input_height + gid.y * input_width + gid.x] = input.z;
output[gid.z * output_to + 3 * input_width * input_height + gid.y * input_width + gid.x] = input.w;
}
......
此差异已折叠。
......@@ -14,6 +14,7 @@ limitations under the License. */
#pragma once
#include <string>
#include "fpga/common/fpga_common.h"
#include "fpga/common/pe.h"
#include "framework/tensor.h"
......@@ -23,6 +24,7 @@ namespace fpga {
void format_image(framework::Tensor* image_tensor);
void format_fp16_ofm(framework::Tensor* ofm_tensor); // only allocate memory
void format_fp16_ofm(framework::Tensor* ofm_tensor, framework::DDim dims);
void format_fp32_ofm(framework::Tensor* ofm_tensor);
float filter_find_max(framework::Tensor* filter_tensor);
......@@ -39,6 +41,7 @@ void format_filter(framework::Tensor* filter_tensor, float max_value,
void format_fc_filter(framework::Tensor* filter_tensor, float max_value);
void format_bias_scale_array(float** bias_scale_array,
int element_num_per_division, int num);
void format_bias_array(float** bias_array, int num);
void format_concat_output(framework::Tensor* out, int height, int width,
int image_num, uint32_t* channel_num);
......@@ -50,16 +53,28 @@ void fill_deconv_arg(struct DeconvArgs* arg, framework::Tensor* input,
framework::Tensor* out, framework::Tensor* filter,
bool relu_enabled, int group_num, int stride_h,
int stride_w, int padding_h, int padding_w, float* bs_ptr);
void fill_dwconv_arg(struct DWconvArgs* arg, framework::Tensor* input,
framework::Tensor* out, framework::Tensor* filter,
bool relu_enabled, int stride_h, int stride_w,
int padding_h, int padding_w, float* bias_ptr);
void format_deconv_filter(framework::Tensor* filter_tensor, float max_value,
int group_num, int stride);
void format_dwconv_filter(framework::Tensor* filter_tensor, float* scale_ptr);
void format_conv_data(framework::Tensor* filter_tensor,
framework::Tensor* ofm_tensor, float** bs_ptr, int group);
void format_deconv_data(framework::Tensor* filter_tensor,
framework::Tensor* ofm_tensor, float** bs_ptr,
int group, int sub_conv_n);
void format_dwconv_data(framework::Tensor* filter_tensor,
framework::Tensor* ofm_tensor, float* scale_ptr,
float** bias_ptr);
template <typename Dtype>
void savefile(std::string filename, void* buffer, int dataSize, Dtype tmp) {
float data;
std::ofstream out(filename.c_str());
for (int i = 0; i < dataSize; ++i) {
data = (((Dtype*)buffer)[i]);
data = (((Dtype*)buffer)[i]); // NOLINT
out << data << std::endl;
}
out.close();
......
......@@ -82,6 +82,20 @@ void format_bias_scale_array(float **bias_scale_array,
interleave(bias_scale_array, div_num * element_num_after_division);
fpga_flush(*bias_scale_array, 2 * element_num_after_division * sizeof(float));
}
void format_bias_array(float **bias_array, int num) {
float *ptr_unaligned = *bias_array;
int num_before_align = num;
int num_after_align = align_to_x(num_before_align, BIAS_NUM_ALIGNMENT);
int16_t *ptr_aligned =
(int16_t *)fpga_malloc(num_after_align * sizeof(int16_t)); // NOLINT
memset(ptr_aligned, 0, num_after_align * sizeof(int16_t));
for (int i = 0; i < num_before_align; i++) {
ptr_aligned[i] = fp32_2_fp16(ptr_unaligned[i]);
}
*bias_array = (float *)ptr_aligned; // NOLINT
fpga_free(ptr_unaligned);
}
} // namespace bias_scale
} // namespace fpga
......
......@@ -22,6 +22,7 @@ void align_element(float** data_in, int num_per_div_before_alignment, int num);
void interleave(float** data_in, int num_after_alignment);
void format_bias_scale_array(float** bias_scale_array,
int element_num_per_division, int num);
void format_bias_array(float** bias_array, int num);
} // namespace bias_scale
} // namespace fpga
......
......@@ -277,7 +277,75 @@ void format_fc_filter(float **data_in, int num, int channel, int height,
fpga_flush(*quantize_data, align_to_x(chw, FILTER_ELEMENT_ALIGNMENT) *
num_after_alignment * sizeof(char));
}
void convert_to_hwn(int16_t **data_in, int num, int height, int width) {
int16_t *tmp = *data_in;
int16_t *data_tmp =
(int16_t *)fpga_malloc(height * width * num * sizeof(int16_t)); // NOLINT
for (int n = 0; n < num; n++) {
for (int h = 0; h < height; h++) {
for (int w = 0; w < width; w++) {
*(data_tmp + h * width * num + w * num + n) = *((*data_in)++);
}
}
}
*data_in = data_tmp;
fpga_free(tmp);
}
void align_element_n(int16_t **data_in, int num, int height, int width) {
int unalign_n = num;
int align_n = align_to_x(num, FILTER_ELEMENT_ALIGNMENT);
if (unalign_n == align_n) {
return;
} else {
int16_t *tmp = *data_in;
int num_element = height * width * align_n;
int16_t *data_tmp =
(int16_t *)fpga_malloc(num_element * sizeof(int16_t)); // NOLINT
memset(data_tmp, 0, num_element * sizeof(int16_t));
for (int h = 0; h < height; h++) {
for (int w = 0; w < width; w++) {
int offset_unalign = h * width * unalign_n + w * unalign_n;
int offset_align = h * width * align_n + w * align_n;
for (int n = 0; n < unalign_n; n++) {
data_tmp[offset_align + n] = *((*data_in) + offset_unalign + n);
}
}
}
*data_in = data_tmp;
free(tmp);
}
}
void quantize_to_fp16(float **data_in, int num, int height, int width,
float *scale_ptr) {
float *tmp = *data_in;
int size = num * height * width;
int16_t *tmp_data = (int16_t *)fpga_malloc(size * sizeof(int16_t)); // NOLINT
for (int n = 0; n < num; n++) {
float scale_val = scale_ptr[n];
for (int h = 0; h < height; h++) {
for (int w = 0; w < width; w++) {
int index = n * height * width + h * width + w;
tmp_data[index] = fp32_2_fp16((*data_in)[index] * scale_val);
}
}
}
*data_in = (float *)tmp_data; // NOLINT
fpga_free(tmp);
}
void format_dwconv_filter(float **data_in, int num, int height, int width,
float *scale_ptr) {
quantize_to_fp16(data_in, num, height, width, scale_ptr);
int16_t **quantize_data = (int16_t **)data_in; // NOLINT
convert_to_hwn(quantize_data, num, height, width);
align_element_n(quantize_data, num, height, width);
fpga_flush(*quantize_data, align_to_x(num, FILTER_ELEMENT_ALIGNMENT) *
height * width * sizeof(int16_t));
}
} // namespace filter
} // namespace fpga
} // namespace paddle_mobile
......@@ -13,7 +13,7 @@ See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <cstdint>
namespace paddle_mobile {
namespace fpga {
namespace filter {
......@@ -38,6 +38,13 @@ void convert_fc_filter(char** data_in, int num, int chw);
void format_fc_filter(float** data_in, int num, int channel, int height,
int width, int group_num, float max);
void convert_to_hwn(int16_t** data_in, int num, int height, int width);
void align_element_n(int16_t** data_in, int num, int height, int width);
void quantize_to_fp16(float** data_in, int num, int height, int width,
float* scale_ptr);
void format_dwconv_filter(float** data_in, int num, int height, int width,
float* scale_ptr);
} // namespace filter
} // namespace fpga
} // namespace paddle_mobile
......@@ -24,14 +24,13 @@ limitations under the License. */
#include <time.h>
#include <iomanip>
#include <iostream>
//#include <iostream>
#endif
namespace paddle_mobile {
namespace fpga {
using namespace driver; // NOLINT
using namespace std;
using namespace std; // NOLINT
#define USE_RELU 1
#define USE_BIAS 2
......@@ -53,7 +52,6 @@ using namespace std;
#define INTERRUPT_CONV 0x0004
#define INTERRUPT_POOLING 0x0008
#define INTERRUPT_EW 0x0010
//#define INTERRUPT_RESIZE 0x0020
/* Register offset */
#define REG_INTERRUPT 0x000
......@@ -73,9 +71,6 @@ using namespace std;
#define REG_FLASH_STATUS 0x218
#define REG_SN 0x220
//#define REG_READ_SCALE
//#define REG_WRITE_SCALE
/*bypass*/
#define REG_CONVERT_CMD 0x400
#define REG_CONVERT_SRC_ADDR 0x408
......@@ -164,6 +159,12 @@ using namespace std;
#define REG_EW_IMAGE_PIXEL 0x0F30
#define REG_EW_IMAGE_AMOUNT_PER_ROW 0x0F38
/*dwconv*/
#define REG_DWCONV_FILTER_BASE_ADDR 0xe08
#define REG_DWCONV_FILTER_SHAPE 0xe10
#define REG_DWCONV_FILTER_N_ALIGN 0xe18
#define REG_DWCONV_CMD 0xe00
int ComputeFpgaConv(const struct SplitConvArgs &args) {
// ComputeBasicConv(args.conv_arg[0]);
#ifdef FPGA_PRINT_MODE
......@@ -236,8 +237,10 @@ int ComputeBasicConv(const struct ConvArgs &args) {
reg_writeq((uint64_t)args.group_num, REG_CONV_GROUP_NUMBER);
reg_writeq((uint64_t)args.filter_num, REG_CONV_FILTER_NUMBER);
reg_writeq((uint64_t)args.image.channels, REG_CONV_CHANNEL_NUMBER);
reg_writeq(*(uint64_t *)args.image.scale_address, REG_CONV_IMAGE_SCALE);
reg_writeq(*(uint64_t *)args.filter_scale_address, REG_CONV_FILTER_SCALE);
reg_writeq(*(uint64_t *)args.image.scale_address, // NOLINT
REG_CONV_IMAGE_SCALE);
reg_writeq(*(uint64_t *)args.filter_scale_address, // NOLINT
REG_CONV_FILTER_SCALE);
reg_writeq(args.driver.image_address_phy, REG_CONV_IMAGE_BASE_ADDR);
reg_writeq(args.driver.filter_address_phy, REG_CONV_FILTER_BASE_ADDR);
reg_writeq(args.driver.sb_address_phy, REG_CONV_SB_BASE_ADDR);
......@@ -260,6 +263,7 @@ int ComputeBasicConv(const struct ConvArgs &args) {
reg_writeq(args.driver.res_row_data_align4_pad, 0xcf8);
reg_writeq(args.driver.prog_full_cnt, 0xd08);
reg_writeq(args.driver.post_prog_full_cnt, 0xd10);
reg_writeq(args.driver.deconv_param, 0xd18);
reg_writeq(args.driver.fpga_bias_scale_len / 4, 0xd20);
reg_writeq(args.driver.cmd, REG_CONV_CMD);
DLOG << "before reg poll";
......@@ -279,7 +283,6 @@ int ComputeBasicConv(const struct ConvArgs &args) {
return ret;
#endif
return 0;
} // ComputeBasicConv
int ComputeFpgaPool(const struct PoolingArgs &args) {
......@@ -405,13 +408,11 @@ int ComputeFpgaPool(const struct PoolingArgs &args) {
output_scale = reg_readq(REG_SCALE_PARAMETER);
output_scale = (output_scale << 32) | (output_scale >> 32);
fpga_copy(args.output.scale_address, &output_scale, sizeof(float) * 2);
//*(args.output.timer_cnt) = reg_readq(REG_TIMER_COUNTER);
pthread_mutex_unlock(&g_fpgainfo.pe_data->mutex);
return ret;
#endif
return 0;
} // ComputeFpgaPool
int ComputeFpgaEWAdd(const struct EWAddArgs &args) {
......@@ -467,13 +468,10 @@ int ComputeFpgaEWAdd(const struct EWAddArgs &args) {
output_scale = reg_readq(REG_SCALE_PARAMETER);
output_scale = (output_scale << 32) | (output_scale >> 32);
fpga_copy(args.output.scale_address, &output_scale, sizeof(float) * 2);
//*(args.output.scale_address) = reg_readq(REG_SCALE_PARAMETER);
//*(args.output.timer_cnt) = reg_readq(REG_TIMER_COUNTER);
pthread_mutex_unlock(&g_fpgainfo.pe_data->mutex);
return ret;
#endif
return 0;
} // ComputeFpgaEWAdd
int PerformBypass(const struct BypassArgs &args) {
......@@ -587,13 +585,10 @@ int PerformBypass(const struct BypassArgs &args) {
output_scale = reg_readq(REG_SCALE_PARAMETER);
output_scale = (output_scale << 32) | (output_scale >> 32);
fpga_copy(args.output.scale_address, &output_scale, sizeof(float) * 2);
//*(args.output.scale_address) = reg_readq(REG_SCALE_PARAMETER);
//*(args.output.timer_cnt) = reg_readq(REG_TIMER_COUNTER);
pthread_mutex_unlock(&g_fpgainfo.pe_data->mutex);
return ret;
#endif
return 0;
} // PerformBypass
int ComputeFPGAConcat(const struct ConcatArgs &args) {
......@@ -637,7 +632,7 @@ void deconv_post_process(const struct DeconvArgs &args) {
for (int idx = 0; idx < sub_conv_n; ++idx) {
paddle_mobile::fpga::fpga_invalidate(
args.split_conv_args[idx].output.address,
args.split_conv_args[idx]->output.address,
align_origin_w * origin_h * sizeof(int16_t));
}
......@@ -646,13 +641,14 @@ void deconv_post_process(const struct DeconvArgs &args) {
for (int hh = 0; hh < origin_h; ++hh) {
int hx = (hh % sub_conv_n);
auto sub_t =
(int16_t *)(args.split_conv_args[sub_conv_n - hx - 1].output.address);
(int16_t *)(args.split_conv_args[sub_conv_n - hx - 1] // NOLINT
->output.address);
int hi = (hh / sub_conv_n);
if ((hh < omit_size) || (hh >= (origin_h - omit_size))) continue;
int sidx = (nn * origin_h * align_origin_w + hi * align_origin_w +
omit_size * channel);
fpga_copy((int16_t *)(args.output.address) + deconv_idx, sub_t + sidx,
sizeof(int16_t) * deconv_row_len);
fpga_copy((int16_t *)(args.output.address) + deconv_idx, // NOLINT
sub_t + sidx, sizeof(int16_t) * deconv_row_len); // NOLINT
deconv_idx += align_deconv_row_len;
}
}
......@@ -677,7 +673,7 @@ int ComputeFpgaDeconv(const struct DeconvArgs &args) {
#ifdef COST_TIME_PRINT
timeval start, end;
long dif_sec, dif_usec;
long dif_sec, dif_usec; // NOLINT
#endif
for (int i = 0; i < sub_conv_num; i++) {
......@@ -685,7 +681,7 @@ int ComputeFpgaDeconv(const struct DeconvArgs &args) {
gettimeofday(&start, NULL);
#endif
ComputeFpgaConv(args.split_conv_args[i]);
ComputeFpgaConv(*args.split_conv_args[i]);
#ifdef COST_TIME_PRINT
gettimeofday(&end, NULL);
dif_sec = end.tv_sec - start.tv_sec;
......@@ -703,12 +699,12 @@ int ComputeFpgaDeconv(const struct DeconvArgs &args) {
#endif
for (int i = 0; i < sub_conv_num; i++) {
paddle_mobile::fpga::fpga_invalidate(
args.split_conv_args[i].output.scale_address, 2 * sizeof(float));
float ptr_scale = (args.split_conv_args[i].output.scale_address)[0];
args.split_conv_args[i]->output.scale_address, 2 * sizeof(float));
float ptr_scale = (args.split_conv_args[i]->output.scale_address)[0];
if (ptr_scale > max_scale) {
args.output.scale_address[0] = ptr_scale;
args.output.scale_address[1] =
(args.split_conv_args[i].output.scale_address)[1];
(args.split_conv_args[i]->output.scale_address)[1];
}
}
......@@ -722,18 +718,16 @@ int ComputeFpgaDeconv(const struct DeconvArgs &args) {
#endif
// fpga_flush(args.output.scale_address, 2 * sizeof(float));
#ifdef COST_TIME_PRINT
gettimeofday(&start, NULL);
#endif
deconv_post_process(args);
#ifdef COST_TIME_PRINT
gettimeofday(&end, NULL);
dif_sec = end.tv_sec - start.tv_sec;
dif_usec = end.tv_usec - start.tv_usec;
std::cout << "deconv_post_process "
<< " cost time: " << (dif_sec * 1000000 + dif_usec) << "us"
<< std::endl;
#endif
/*#ifdef COST_TIME_PRINT
gettimeofday(&start,NULL);
#endif
//deconv_post_process(args);
#ifdef COST_TIME_PRINT
gettimeofday(&end,NULL);
dif_sec = end.tv_sec - start.tv_sec;
dif_usec = end.tv_usec - start.tv_usec;
std::cout << "deconv_post_process " << " cost time: " <<
(dif_sec*1000000+dif_usec) << "us" << std::endl; #endif*/
}
return 0;
......@@ -758,6 +752,162 @@ int ComputeFPGASplit(const struct SplitArgs &args) {
args.height, args.width);
return 0;
} // ComputeFPGASplit
int ComputeDWConv(const struct DWconvArgs &args) {
#ifdef FPGA_PRINT_MODE
DLOG << "=============ComputeDWConv===========";
DLOG << " mode:" << args.relu_enabled;
DLOG << " image_address:" << args.image.address
<< " image_scale_address:" << args.image.scale_address
<< " image_channels:" << args.image.channels
<< " image_height:" << args.image.height
<< " image_width:" << args.image.width
<< " pad_height:" << args.image.pad_height
<< " pad_width:" << args.image.pad_width;
DLOG << " filter_address:" << args.filter_address
<< " bias_address:" << args.bias_address;
DLOG << " kernel_height:" << args.kernel.height
<< " kernel_width:" << args.kernel.width
<< " stride_h:" << args.kernel.stride_h
<< " stride_w:" << args.kernel.stride_w;
DLOG << " out_address:" << args.output.address
<< " out_scale_address:" << args.output.scale_address;
#endif
#ifdef PADDLE_MOBILE_ZU5
DLOG << "DWConv";
// return 0;
uint64_t output_scale = 0;
uint64_t timer_cnt = 0;
int ret = 0;
uint64_t cmd = args.relu_enabled;
uint64_t image_physical_address = 0;
uint64_t output_physical_address = 0;
uint64_t filter_physical_address = 0;
uint64_t bias_physical_address = 0;
image_physical_address = vaddr_to_paddr(args.image.address);
output_physical_address = vaddr_to_paddr(args.output.address);
filter_physical_address = vaddr_to_paddr(args.filter_address);
bias_physical_address = vaddr_to_paddr(args.bias_address);
uint64_t filter_N_align =
align_to_x((uint64_t)args.image.channels, IMAGE_ALIGNMENT);
uint64_t filter_amount_per_row_align =
filter_N_align * (uint64_t)args.kernel.width;
uint64_t filter_amount_align = filter_N_align * (uint64_t)args.kernel.width *
(uint64_t)args.kernel.height;
uint32_t output_height = (uint32_t)(
(args.image.height + args.image.pad_height * 2 - args.kernel.height) /
args.kernel.stride_h +
1);
uint32_t output_width = (uint32_t)(
(args.image.width + args.image.pad_width * 2 - args.kernel.width) /
args.kernel.stride_w +
1);
uint64_t image_amount_per_row =
align_to_x((uint64_t)args.image.width * (uint64_t)args.image.channels,
IMAGE_ALIGNMENT);
uint64_t image_one_pad_per_row =
align_to_x((uint64_t)args.image.width * (uint64_t)args.image.channels,
FILTER_ELEMENT_ALIGNMENT) +
(uint64_t)args.image.pad_width * (uint64_t)args.image.channels;
uint64_t image_two_pad_per_row = align_to_x(
((uint64_t)args.image.width + (uint64_t)args.image.pad_width * 2) *
(uint64_t)args.image.channels,
IMAGE_ALIGNMENT);
uint64_t image_row_mul_pooling_hight =
image_amount_per_row * (uint64_t)args.kernel.height;
uint64_t image_row_mul_pad_hight =
image_amount_per_row * (uint64_t)args.image.pad_height;
uint64_t image_row_mul_step_hight =
image_amount_per_row * (uint64_t)args.kernel.stride_h;
uint64_t result_amount_align_32 =
align_to_x((uint64_t)output_width * (uint64_t)args.image.channels,
FILTER_ELEMENT_ALIGNMENT);
uint64_t result_amount_align_64 = align_to_x(
(uint64_t)output_width * (uint64_t)args.image.channels, IMAGE_ALIGNMENT);
uint64_t image_calcu_height =
(uint64_t)args.kernel.height +
((uint64_t)output_height - 1) * (uint64_t)args.kernel.stride_h;
uint64_t image_pad_left = args.image.channels * args.image.pad_width;
uint64_t image_skip_window = args.image.channels * args.kernel.stride_w;
uint64_t image_padleft_skipwindow =
(image_skip_window << 32) | image_pad_left;
pthread_mutex_lock(&g_fpgainfo.pe_data->mutex);
if (ERROR == g_fpgainfo.pe_data->pes[PE_IDX_POOLING]->status) {
ret = -EIO;
DLOG << "Conv Status Error!";
pthread_mutex_unlock(&g_fpgainfo.pe_data->mutex);
return ret;
}
/*restart scale*/
reg_writeq(output_scale, REG_SCALE_PARAMETER);
reg_writeq(image_physical_address, REG_POOLING_IMAGE_BASE_ADDR);
reg_writeq(output_physical_address, REG_POOLING_RESULT_BASE_ADDR);
reg_writeq((bias_physical_address << 32 | filter_physical_address),
REG_DWCONV_FILTER_BASE_ADDR);
reg_writeq(filter_amount_per_row_align | (filter_amount_align << 32),
REG_DWCONV_FILTER_SHAPE);
reg_writeq(filter_N_align, REG_DWCONV_FILTER_N_ALIGN);
reg_writeq(
((uint64_t)args.image.height) | (((uint64_t)args.image.width) << 32),
REG_POOLING_IMAGE_PIXEL);
reg_writeq(
((uint64_t)args.kernel.height) | (((uint64_t)args.kernel.width) << 32),
REG_POOLING_WINDOW_SIZE);
reg_writeq(((uint64_t)output_height) | (((uint64_t)output_width) << 32),
REG_POOLING_RESULT_PIXEL);
reg_writeq(((uint64_t)args.image.pad_height) |
(((uint64_t)args.image.pad_width) << 32),
REG_POOLING_PAD_PIXEL);
reg_writeq(((uint64_t)args.kernel.stride_h) |
(((uint64_t)args.kernel.stride_w) << 32),
REG_POOLING_STEP_PIXEL);
reg_writeq((uint64_t)args.image.channels, REG_POOLING_CHANNEL_NUMBER);
reg_writeq(image_amount_per_row, REG_POOLING_IMAGE_AMOUNT_PER_ROW);
reg_writeq(image_one_pad_per_row, REG_POOLING_IMAGE_ONE_PAD_PER_ROW);
reg_writeq(image_two_pad_per_row, REG_POOLING_IMAGE_TWO_PAD_PER_ROW);
reg_writeq(image_row_mul_pooling_hight,
REG_POOLING_IMAGE_ROW_MUL_WINDOW_HEIGHT);
reg_writeq(image_row_mul_pad_hight, REG_POOLING_IMAGE_ROW_MUL_PAD_HEIGHT);
reg_writeq(image_row_mul_step_hight, REG_POOLING_IMAGE_ROW_MUL_STEP_HEIGHT);
reg_writeq(result_amount_align_32, REG_POOLING_RESULT_AMOUNT_ALIGN_32);
reg_writeq(result_amount_align_64, REG_POOLING_RESULT_AMOUNT_ALIGN_64);
reg_writeq(image_calcu_height, REG_POOLING_IMAGE_CALCU_HEIGHT);
reg_writeq(image_padleft_skipwindow, REG_POOLING_IMAGE_PADLEFT_SKIPWINDOW);
/*SDK刷Cache保证数据一致性*/
reg_writeq(cmd, REG_DWCONV_CMD);
DLOG << "before reg poll";
if (0 != fpga_regpoll(REG_INTERRUPT, INTERRUPT_POOLING, PE_IRQ_TIMEOUT)) {
g_fpgainfo.pe_data->pes[PE_IDX_POOLING]->status = ERROR;
ret = -EIO;
DLOG << "Pooling Wait Irq Timeout!";
}
DLOG << "after reg poll";
// *(args.output.scale_address) = reg_readq(REG_SCALE_PARAMETER);
output_scale = reg_readq(REG_SCALE_PARAMETER);
output_scale = (output_scale << 32) | (output_scale >> 32);
fpga_copy(args.output.scale_address, &output_scale, sizeof(float) * 2);
pthread_mutex_unlock(&g_fpgainfo.pe_data->mutex);
return ret;
#endif
return 0;
}
} // namespace fpga
} // namespace paddle_mobile
......@@ -75,6 +75,9 @@ void *fpga_malloc(size_t size) {
}
void fpga_free(void *ptr) {
if (ptr == nullptr) {
return;
}
static uint64_t counter = 0;
size_t size = 0;
auto iter = memory_map.find(ptr); // std::map<void *, size_t>::iterator
......@@ -123,5 +126,6 @@ uint64_t vaddr_to_paddr(void *address) {
return 0;
#endif
}
} // namespace fpga
} // namespace paddle_mobile
......@@ -16,6 +16,8 @@ limitations under the License. */
#include <cstddef>
#include <cstdint>
#include <memory>
#include <vector>
namespace paddle_mobile {
namespace fpga {
......@@ -25,6 +27,7 @@ namespace fpga {
#define FILTER_NUM_ALIGNMENT 32 // Filter number aligned to 32
#define FILTER_ELEMENT_ALIGNMENT 16 // Filter element number aligned to 16
#define BS_NUM_ALIGNMENT 8
#define BIAS_NUM_ALIGNMENT 16
#endif
enum DataType {
......@@ -105,6 +108,8 @@ struct ConvDriverParam {
uint64_t post_prog_full_cnt;
uint64_t fpga_bias_scale_len;
uint64_t cmd;
uint64_t deconv_param;
};
struct EWAddDriverParam {
......@@ -117,6 +122,13 @@ struct EWAddDriverParam {
uint64_t coefficient;
uint64_t cmd;
};
struct DeconvTxParm {
uint32_t omit_size;
uint32_t sub_conv_num;
uint32_t deconv_en;
uint32_t out_addr_offset;
};
#endif
struct ConvArgs {
......@@ -136,6 +148,7 @@ struct ConvArgs {
#endif
#ifdef PADDLE_MOBILE_FPGA_V1
struct DeconvTxParm deconv_tx_param;
struct ConvDriverParam driver;
#endif
};
......@@ -147,7 +160,7 @@ struct ConcatArgs {
void* image_out;
float* scale_out;
uint32_t* channel_num;
uint32_t* aligned_channel_num;
uint32_t* aligned_channel_num; // Not used so far. Reserved for V2.
uint32_t out_channel;
uint32_t height;
uint32_t width;
......@@ -160,6 +173,9 @@ struct SplitConvArgs {
struct ImageOutputArgs output;
struct ConvArgs* conv_arg;
struct ConcatArgs concat_arg;
std::shared_ptr<ConvArgs> shared_conv_arg;
std::vector<std::shared_ptr<char>> vector_concat_space;
std::vector<std::shared_ptr<char>> vector_conv_space;
};
struct SplitArgs {
......@@ -210,9 +226,16 @@ struct DeconvArgs {
uint32_t sub_output_width;
uint32_t sub_output_height;
struct ImageOutputArgs output;
struct SplitConvArgs* split_conv_args;
std::vector<std::shared_ptr<SplitConvArgs>> split_conv_args;
};
struct DWconvArgs {
bool relu_enabled;
void* bias_address;
void* filter_address;
struct KernelArgs kernel;
struct ImageInputArgs image;
struct ImageOutputArgs output;
};
// static inline int align_to_x(int num, int x) { return (num + x - 1) / x * x;
// }
static inline uint32_t align_to_x(int64_t num, int64_t x) {
......
......@@ -27,6 +27,6 @@ int ComputeFpgaConv(const struct SplitConvArgs& args);
int ComputeFPGAConcat(const struct ConcatArgs& args);
int ComputeFPGASplit(const struct SplitArgs& args);
int ComputeFpgaDeconv(const struct DeconvArgs& args);
int ComputeDWConv(const struct DWconvArgs& args);
} // namespace fpga
} // namespace paddle_mobile
......@@ -230,6 +230,10 @@ template <typename Device, typename T>
bool Executor<Device, T>::varInputMemory(
const std::shared_ptr<VarDesc> &var_desc, Variable *var,
LoDTensor *tensor) const {
#ifdef PADDLE_MOBILE_FPGA
tensor->init(typeid(float));
return true;
#endif
auto type = var_desc->Tensor_desc().DataType();
switch (type) {
case VARTYPE_TYPE_FP32:
......
......@@ -202,6 +202,21 @@ class Tensor : public TensorBase {
inline void reset_data_ptr(void *p) {
((PlaceholderImpl *)(holder_.get()))->ptr_.reset((uint8_t *)p); // NOLINT
}
inline void *init(std::type_index type) {
if (holder_ != nullptr) {
holder_->set_type(type);
}
PADDLE_MOBILE_ENFORCE(numel() >= 0, "the Tensor's numel must >=0.")
int64_t size = 1 * SizeOfType(type);
if (holder_ == nullptr || holder_->size() < size + offset_) {
holder_.reset(new PlaceholderImpl(size, type));
offset_ = 0;
}
return reinterpret_cast<void *>(
reinterpret_cast<uintptr_t>(holder_->ptr()) + offset_);
}
float scale[2]; // scale[0]= MAX/127.0, scale[1]= 127.0/MAX
#endif
};
......
......@@ -91,6 +91,9 @@ class TensorBase {
}
inline void check_memory_size() const {
#ifdef PADDLE_MOBILE_FPGA
return;
#endif
PADDLE_MOBILE_ENFORCE(
holder_ != nullptr,
"Tensor holds no memory. Call Tensor::mutable_data first.");
......
......@@ -27,59 +27,119 @@
@end
@interface PaddleMobileCPUConfig: NSObject
/**
@b 默认为 1, 多线程时, 建议设置为 2
*/
@property (assign, nonatomic) int threadNum;
/**
@b 是否开启运行时 infershape
*/
@property (assign, nonatomic) BOOL loddable;
/**
@b 是否开启模型 op 融合优化
*/
@property (assign, nonatomic) BOOL optimize;
@end
@interface PaddleMobileCPU : NSObject
/*
创建对象
*/
- (instancetype)init;
/**
@b 创建对象
/*
load 模型, 开辟内存
*/
- (BOOL)load:(NSString *)modelPath andWeightsPath:(NSString *)weighsPath;
@param config 配置
@return paddlemobile CPU 对象
*/
- (instancetype)initWithConfig:(PaddleMobileCPUConfig *)config;
/*
加载散开形式的模型, 需传入模型的目录
*/
/**
@b 加载模型
@param modelPath 模型路径
@param weighsPath 权重路径
@return 是否加载成功
*/
- (BOOL)loadModel:(NSString *)modelPath andWeightsPath:(NSString *)weighsPath;
/**
@b 加载散开形式的模型, 需传入模型的目录
@param modelAndWeightPath 模型和权重的路径
@return 是否加载成功
*/
- (BOOL)load:(NSString *)modelAndWeightPath;
/*
* 从内存中加载模型
* */
/**
@b 从内存中加载模型
@param modelLen 模型大小(字节数)
@param modelBuf 模型在内存中的位置
@param combinedParamsLen 权重大小(字节数)
@param combinedParamsBuf 权重在内存中的位置
@return 是否加载成功
*/
- (BOOL)LoadCombinedMemory:(size_t)modelLen
andModelBuf:(const uint8_t *)modelBuf
andModelParamsLen:(size_t)combinedParamsLen
andCombinedParamsBuf:(const uint8_t *)combinedParamsBuf;
/*
* 对图像进行预处理, 需要外部开辟 output 内存, 外部释放 output 内存
*
* */
/**
@b 对图像进行预处理, 需要外部开辟 output 内存, 外部释放 output 内存, 每一个像素经过这样的预处理 (x + means) * scale, 其中 x 为像素值
@param image 输入的图像
@param output 预处理后的输出
@param means 预处理中 means
@param scale 预处理中的 scale
@param dim 预处理后的维度
*/
-(void)preprocess:(CGImageRef)image
output:(float *)output
means:(NSArray<NSNumber *> *)means
scale:(float)scale
dim:(NSArray<NSNumber *> *)dim;
/*
* 预测预处理后的数据, 返回结果使用结束需要调用其 realseOutput 函数进行释放
* */
/**
进行预测
@param input 输入
@param dim 输入维度
@return 输出结果
*/
- (PaddleMobileCPUResult *)predictInput:(float *)input
dim:(NSArray<NSNumber *> *)dim;
/*
进行预测, means 和 scale 为训练模型时的预处理参数, 如训练时没有做这些预处理则直接使用 predict
*/
- (NSArray *)predict:(CGImageRef)image dim:(NSArray<NSNumber *> *)dim means:(NSArray<NSNumber *> *)means scale:(float)scale;
/*
进行预测, 默认 means 为 0, scale 为 1.0
*/
- (NSArray *)predict:(CGImageRef)image dim:(NSArray<NSNumber *> *)dim;
/*
清理内存
*/
/**
@b 进行预测, means 和 scale 为训练模型时的预处理参数, 如训练时没有做这些预处理则直接使用 predict, 每一个像素经过这样的预处理 (x + means) * scale, 其中 x 为像素值
@param image 输入图像
@param dim 输入维度
@param means 预处理中 means
@param scale 预处理中 scale
@return 预测结果
*/
- (PaddleMobileCPUResult *)predict:(CGImageRef)image dim:(NSArray<NSNumber *> *)dim means:(NSArray<NSNumber *> *)means scale:(float)scale;
/**
进行预测, 预处理 means 值为 0, scale 值为 1
@param image 输入图像
@param dim 输入维度
@return 预测结果
*/
- (PaddleMobileCPUResult *)predict:(CGImageRef)image dim:(NSArray<NSNumber *> *)dim;
/**
@b 清理内存
*/
- (void)clear;
@end
......@@ -45,21 +45,44 @@
@end
@implementation PaddleMobileCPUConfig
-(instancetype)init {
if (self = [super init]) {
self.threadNum = 1;
self.optimize = YES;
}
return self;
}
@end
@interface PaddleMobileCPU()
{
paddle_mobile::PaddleMobile<paddle_mobile::CPU, float> *pam_;
BOOL loaded_;
}
@property (strong, nonatomic) PaddleMobileCPUConfig *config;
@end
@implementation PaddleMobileCPU
static std::mutex shared_mutex;
- (instancetype)init {
- (instancetype)initWithConfig:(PaddleMobileCPUConfig *)config {
if (self = [super init]) {
pam_ = new paddle_mobile::PaddleMobile<paddle_mobile::CPU, float>();
_config = config;
}
return self;
}
-(instancetype)init {
if (self = [super init]) {
_config = [[PaddleMobileCPUConfig alloc] init];
pam_ = new paddle_mobile::PaddleMobile<paddle_mobile::CPU, float>();
}
return self;
}
......@@ -79,11 +102,11 @@ static std::mutex shared_mutex;
return sharedManager;
}
- (BOOL)load:(NSString *)modelPath andWeightsPath:(NSString *)weighsPath{
- (BOOL)loadModel:(NSString *)modelPath andWeightsPath:(NSString *)weighsPath {
std::string model_path_str = std::string([modelPath UTF8String]);
std::string weights_path_str = std::string([weighsPath UTF8String]);
pam_->SetThreadNum(2);
if (loaded_ = pam_->Load(model_path_str, weights_path_str, true)) {
pam_->SetThreadNum(self.config.threadNum);
if (loaded_ = pam_->Load(model_path_str, weights_path_str, self.config.optimize, false, 1, self.config.loddable)) {
return YES;
} else {
return NO;
......@@ -94,14 +117,14 @@ static std::mutex shared_mutex;
andModelBuf:(const uint8_t *)modelBuf
andModelParamsLen:(size_t)combinedParamsLen
andCombinedParamsBuf:(const uint8_t *)combinedParamsBuf {
pam_->SetThreadNum(2);
pam_->SetThreadNum(self.config.threadNum);
return loaded_ = pam_->LoadCombinedMemory(modelLen, modelBuf, combinedParamsLen,
const_cast<uint8_t*>(combinedParamsBuf));
const_cast<uint8_t*>(combinedParamsBuf), self.config.optimize, false, 1, self.config.loddable);
}
- (BOOL)load:(NSString *)modelAndWeightPath{
std::string model_path_str = std::string([modelAndWeightPath UTF8String]);
if (loaded_ = pam_->Load(model_path_str)) {
if (loaded_ = pam_->Load(model_path_str, self.config.optimize, false, 1, self.config.loddable)) {
return YES;
} else {
return NO;
......@@ -116,6 +139,10 @@ static std::mutex shared_mutex;
dim:(NSArray<NSNumber *> *)dim {
std::lock_guard<std::mutex> lock(shared_mutex);
if (means == nil) {
means = @[@0, @0, @0];
}
// dim to c++ vector, get numel
std::vector<int64_t > dim_vec;
int numel = 1;
......@@ -235,7 +262,7 @@ static std::mutex shared_mutex;
return cpuResult;
}
- (NSArray *)predict:(CGImageRef)image dim:(NSArray<NSNumber *> *)dim means:(NSArray<NSNumber *> *)means scale:(float)scale{
- (PaddleMobileCPUResult *)predict:(CGImageRef)image dim:(NSArray<NSNumber *> *)dim means:(NSArray<NSNumber *> *)means scale:(float)scale{
// printf(" predict one ");
std::lock_guard<std::mutex> lock(shared_mutex);
if (!loaded_) {
......@@ -284,28 +311,22 @@ static std::mutex shared_mutex;
// predict
std::vector<float> cpp_result = pam_->Predict(predict_input, dim_vec);
// result
long count = 0;
count = cpp_result.size();
NSMutableArray *result = [[NSMutableArray alloc] init];
for (int i = 0; i < count; i++) {
[result addObject:[NSNumber numberWithFloat:cpp_result[i]]];
}
float *output_pointer = new float[cpp_result.size()];
memcpy(output_pointer, cpp_result.data(),
cpp_result.size() * sizeof(float));
PaddleMobileCPUResult *cpuResult = [[PaddleMobileCPUResult alloc] init];
[cpuResult toSetOutput: output_pointer];
[cpuResult toSetOutputSize: cpp_result.size()];
free(output);
// 待验证
// if ([UIDevice currentDevice].systemVersion.doubleValue < 11.0) {
CFRelease(cfData);
cfData = NULL;
// }
return result;
return cpuResult;
}
- (NSArray *)predict:(CGImageRef)image dim:(NSArray<NSNumber *> *)dim {
[self predict:image dim:dim means:nil scale:1];
- (PaddleMobileCPUResult *)predict:(CGImageRef)image dim:(NSArray<NSNumber *> *)dim {
return [self predict:image dim:dim means:nil scale:1];
}
- (void)clear{
......
......@@ -91,14 +91,10 @@ PMStatus PaddleMobile<Device, T>::Load(const PaddleMobileConfig &config) {
}
}
template <typename Device, typename T>
bool PaddleMobile<Device, T>::LoadCombinedMemory(size_t model_len,
const uint8_t *model_buf,
size_t combined_params_len,
uint8_t *combined_params_buf) {
int batch_size = 1;
bool optimise = true;
bool quantification = false;
bool PaddleMobile<Device, T>::LoadCombinedMemory(
size_t model_len, const uint8_t *model_buf, size_t combined_params_len,
uint8_t *combined_params_buf, bool optimize, bool quantification,
int batch_size, bool lod_mode) {
if (loader_.get() == nullptr) {
loader_ = std::make_shared<framework::Loader<Device, T>>();
} else {
......@@ -107,9 +103,9 @@ bool PaddleMobile<Device, T>::LoadCombinedMemory(size_t model_len,
if (executor_.get() == nullptr) {
executor_ = std::make_shared<framework::Executor<Device, T>>(
loader_->LoadCombinedMemory(model_len, model_buf, combined_params_len,
combined_params_buf, optimise,
combined_params_buf, optimize,
quantification),
batch_size, optimise);
batch_size, optimize, lod_mode);
} else {
LOG(kLOG_INFO) << "executor inited";
}
......
......@@ -72,7 +72,9 @@ class PaddleMobile {
bool LoadCombinedMemory(size_t model_len, const uint8_t *model_buf,
size_t combined_params_len,
uint8_t *combined_params_buf);
uint8_t *combined_params_buf, bool optimize = false,
bool quantification = false, int batch_size = 1,
bool lod_mode = false);
void SetThreadNum(int count);
void Clear();
......
......@@ -15,6 +15,7 @@ limitations under the License. */
#ifdef FUSION_CONVADDBNRELU_OP
#include "operators/kernel/conv_add_bn_relu_kernel.h"
#include <math.h>
#include "operators/kernel/central-arm-func/conv_add_bn_relu_arm_func.h"
namespace paddle_mobile {
......
......@@ -96,15 +96,15 @@ void ConvKernel<CPU, float>::Compute(const ConvParam<CPU> &param) {
#endif // __aarch64__
case ConvParam<CPU>::EXEC_DEPTHWISE3x3S1P1_FLOAT:
math::DepthwiseConv3x3s1p1(param.Input(), param.Filter(), param.Output(),
nullptr, false);
nullptr, false, false);
break;
case ConvParam<CPU>::EXEC_DEPTHWISE3x3S2P1_FLOAT:
math::DepthwiseConv3x3s2p1v2(param.Input(), param.Filter(),
param.Output(), nullptr, false);
param.Output(), nullptr, false, false);
break;
case ConvParam<CPU>::EXEC_DEPTHWISE3x3S2P0_FLOAT:
math::DepthwiseConv3x3s2p0(param.Input(), param.Filter(), param.Output(),
nullptr, false);
nullptr, false, false);
break;
#ifndef __aarch64__
case ConvParam<CPU>::EXEC_DEPTHWISE5x5_FLOAT:
......
......@@ -122,7 +122,7 @@ void ConvAddCompute(const FusionConvAddParam<CPU> &param) {
param.Filter()->dims()[2] == param.Filter()->dims()[3] &&
param.Filter()->dims()[2] == 3 && param.Strides()[0] == 1) {
math::DepthwiseConv3x3s1p1(param.Input(), param.Filter(), param.Output(),
param.Bias(), true);
param.Bias(), true, false);
} else if (param.Groups() == param.Input()->dims()[1] &&
param.Input()->dims()[1] == param.Output()->dims()[1] &&
param.Filter()->dims()[2] == param.Filter()->dims()[3] &&
......@@ -133,10 +133,10 @@ void ConvAddCompute(const FusionConvAddParam<CPU> &param) {
// param.Output(), false);
if (param.Paddings()[0] == 0) {
math::DepthwiseConv3x3s2p0(param.Input(), param.Filter(), param.Output(),
param.Bias(), true);
param.Bias(), true, false);
} else {
math::DepthwiseConv3x3s2p1v2(param.Input(), param.Filter(),
param.Output(), param.Bias(), true);
param.Output(), param.Bias(), true, false);
}
} else {
ConvAddBasic(param);
......
......@@ -15,6 +15,7 @@ limitations under the License. */
#ifdef FUSION_CONVADDRELU_OP
#pragma once
#include <operators/math/depthwise_conv3x3.h>
#include <vector>
#include "operators/math/conv_func.h"
#include "operators/math/im2col.h"
......@@ -26,7 +27,7 @@ namespace paddle_mobile {
namespace operators {
template <typename Itype, typename Otype>
void ConvAddReluCompute(const FusionConvAddReluParam<CPU> &param) {
void ConvAddReluBasic(const FusionConvAddReluParam<CPU> &param) {
const Tensor *input = param.Input();
Tensor filter = *param.Filter();
Tensor bias = *param.Bias();
......@@ -118,6 +119,34 @@ void ConvAddReluCompute(const FusionConvAddReluParam<CPU> &param) {
}
}
template <typename Itype, typename Otype>
void ConvAddReluCompute(const FusionConvAddReluParam<CPU> &param) {
param.Output()->mutable_data<float>();
if (param.Groups() == param.Input()->dims()[1] &&
param.Input()->dims()[1] == param.Output()->dims()[1] &&
param.Filter()->dims()[2] == param.Filter()->dims()[3] &&
param.Filter()->dims()[2] == 3 && param.Strides()[0] == 1) {
math::DepthwiseConv3x3s1p1(param.Input(), param.Filter(), param.Output(),
param.Bias(), true, true);
} else if (param.Groups() == param.Input()->dims()[1] &&
param.Input()->dims()[1] == param.Output()->dims()[1] &&
param.Filter()->dims()[2] == param.Filter()->dims()[3] &&
param.Filter()->dims()[2] == 3 && param.Strides()[0] == 2) {
// math::DepthwiseConv3x3(param.Input(), param.Strides(),
// param.Paddings(),
// param.Filter(), param.Bias(),
// param.Output(), false);
if (param.Paddings()[0] == 0) {
math::DepthwiseConv3x3s2p0(param.Input(), param.Filter(), param.Output(),
param.Bias(), true, true);
} else {
math::DepthwiseConv3x3s2p1v2(param.Input(), param.Filter(),
param.Output(), param.Bias(), true, true);
}
} else {
ConvAddReluBasic<Itype, Otype>(param);
}
}
} // namespace operators
} // namespace paddle_mobile
......
......@@ -15,7 +15,7 @@ limitations under the License. */
#ifdef FUSION_CONVADDBN_OP
#include "operators/kernel/conv_add_bn_kernel.h"
#include <math.h>
namespace paddle_mobile {
namespace operators {
......@@ -58,14 +58,7 @@ bool ConvAddBNKernel<FPGA, float>::Init(FusionConvAddBNParam<FPGA> *param) {
param->SetNewScale(new_scale);
param->SetNewBias(new_bias);
float max_value = fpga::filter_find_max(filter);
fpga::format_filter(filter, max_value, param->Groups());
int element_num_per_div =
fpga::get_filter_num_per_div(filter, param->Groups());
fpga::format_bias_scale_array(&bs_ptr, element_num_per_div, channel);
fpga::format_fp16_ofm(out);
fpga::format_conv_data(filter, out, &bs_ptr, param->Groups());
fpga::SplitConvArgs conv_arg = {0};
fpga::fill_split_arg(&conv_arg, input, out, filter, relu_enabled,
param->Groups(), param->Strides()[0],
......
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifdef FUSION_CONVADDBNRELU_OP
#include "operators/kernel/conv_add_bn_relu_kernel.h"
#include <math.h>
namespace paddle_mobile {
namespace operators {
template <>
bool ConvAddBNReluKernel<FPGA, float>::Init(
FusionConvAddBNReluParam<FPGA> *param) {
bool relu_enabled = true;
auto input = const_cast<Tensor *>(param->Input());
auto bias = param->Bias();
auto bias_ptr = bias->data<float>();
auto filter = const_cast<Tensor *>(param->Filter());
auto out = param->Output();
vector<int> paddings = param->Paddings();
vector<int> strides = param->Strides();
auto bn_mean_ptr = param->InputMean()->data<float>();
auto bn_var_ptr = param->InputVariance()->data<float>();
auto bn_scale_ptr = param->InputScale()->data<float>();
auto bn_bias_ptr = param->InputBias()->data<float>();
const float epsilon = param->Epsilon();
PADDLE_MOBILE_ENFORCE(out->dims()[1] == bias->dims()[0] &&
bias->dims()[0] == param->InputBias()->dims()[0],
"Output channel should be equal to bias number");
const int channel = out->dims()[1];
auto bs_ptr =
reinterpret_cast<float *>(fpga::fpga_malloc(2 * channel * sizeof(float)));
auto new_scale = new Tensor();
auto new_bias = new Tensor();
auto new_scale_ptr = new_scale->mutable_data<float>({channel});
auto new_bias_ptr = new_bias->mutable_data<float>({channel});
for (int i = 0; i < channel; i++) {
new_scale_ptr[i] = bn_scale_ptr[i] /
static_cast<float>(pow((bn_var_ptr[i] + epsilon), 0.5));
new_bias_ptr[i] =
bn_bias_ptr[i] + (bias_ptr[i] - bn_mean_ptr[i]) * new_scale_ptr[i];
bs_ptr[i + channel] = new_scale_ptr[i];
bs_ptr[i] = new_bias_ptr[i];
}
param->SetNewScale(new_scale);
param->SetNewBias(new_bias);
const int groups = param->Groups();
if (groups == channel) {
fpga::format_dwconv_data(filter, out, new_scale_ptr, &new_bias_ptr);
fpga::DWconvArgs dwconv_arg = {0};
fpga::fill_dwconv_arg(&dwconv_arg, input, out, filter, relu_enabled,
strides[0], strides[1], paddings[0], paddings[1],
new_bias_ptr);
param->SetFpgaArgs(dwconv_arg);
} else {
fpga::format_conv_data(filter, out, &bs_ptr, param->Groups());
fpga::SplitConvArgs conv_arg = {0};
fpga::fill_split_arg(&conv_arg, input, out, filter, relu_enabled,
param->Groups(), strides[0], strides[1], paddings[0],
paddings[1], bs_ptr);
param->SetFpgaArgs(conv_arg);
}
return true;
}
template <>
void ConvAddBNReluKernel<FPGA, float>::Compute(
const FusionConvAddBNReluParam<FPGA> &param) {
if (param.Groups() == param.Output()->dims()[1]) {
fpga::ComputeDWConv(param.FpgaDwconvArgs());
} else {
fpga::ComputeFpgaConv(param.FpgaArgs());
}
}
} // namespace operators
} // namespace paddle_mobile
#endif
......@@ -38,15 +38,7 @@ bool ConvAddKernel<FPGA, float>::Init(FusionConvAddParam<FPGA> *param) {
bs_ptr[i] = bias_ptr[i];
}
float max_value = fpga::filter_find_max(filter);
fpga::format_filter(filter, max_value, param->Groups());
int element_num_per_div =
fpga::get_filter_num_per_div(filter, param->Groups());
fpga::format_bias_scale_array(&bs_ptr, element_num_per_div, channel);
fpga::format_fp16_ofm(out);
fpga::format_conv_data(filter, out, &bs_ptr, param->Groups());
fpga::SplitConvArgs conv_arg = {0};
fpga::fill_split_arg(&conv_arg, input, out, filter, relu_enabled,
param->Groups(), param->Strides()[0],
......
......@@ -38,15 +38,7 @@ bool ConvAddReluKernel<FPGA, float>::Init(FusionConvAddReluParam<FPGA> *param) {
bs_ptr[i] = bias_ptr[i];
}
float max_value = fpga::filter_find_max(filter);
fpga::format_filter(filter, max_value, param->Groups());
int element_num_per_div =
fpga::get_filter_num_per_div(filter, param->Groups());
fpga::format_bias_scale_array(&bs_ptr, element_num_per_div, channel);
fpga::format_fp16_ofm(out);
fpga::format_conv_data(filter, out, &bs_ptr, param->Groups());
fpga::SplitConvArgs conv_arg = {0};
fpga::fill_split_arg(&conv_arg, input, out, filter, relu_enabled,
param->Groups(), param->Strides()[0],
......
......@@ -51,15 +51,7 @@ bool ConvBNKernel<FPGA, float>::Init(FusionConvBNParam<FPGA> *param) {
param->SetNewScale(new_scale);
param->SetNewBias(new_bias);
float max_value = fpga::filter_find_max(filter);
fpga::format_filter(filter, max_value, param->Groups());
int element_num_per_div =
fpga::get_filter_num_per_div(filter, param->Groups());
fpga::format_bias_scale_array(&bs_ptr, element_num_per_div, channel);
fpga::format_fp16_ofm(out);
fpga::format_conv_data(filter, out, &bs_ptr, param->Groups());
fpga::SplitConvArgs conv_arg = {0};
fpga::fill_split_arg(&conv_arg, input, out, filter, relu_enabled,
param->Groups(), param->Strides()[0],
......
......@@ -51,15 +51,7 @@ bool ConvBNReluKernel<FPGA, float>::Init(FusionConvBNReluParam<FPGA> *param) {
param->SetNewScale(new_scale);
param->SetNewBias(new_bias);
float max_value = fpga::filter_find_max(filter);
fpga::format_filter(filter, max_value, param->Groups());
int element_num_per_div =
fpga::get_filter_num_per_div(filter, param->Groups());
fpga::format_bias_scale_array(&bs_ptr, element_num_per_div, channel);
fpga::format_fp16_ofm(out);
fpga::format_conv_data(filter, out, &bs_ptr, param->Groups());
fpga::SplitConvArgs conv_arg = {0};
fpga::fill_split_arg(&conv_arg, input, out, filter, relu_enabled,
param->Groups(), param->Strides()[0],
......
......@@ -35,8 +35,8 @@ bool DeconvAddKernel<FPGA, float>::Init(FusionDeconvAddParam<FPGA> *param) {
int channel = out->dims()[1];
int sub_conv_n = param->Strides()[0];
auto bs_ptr = (float *)fpga::fpga_malloc(2 * channel * sub_conv_n *
sizeof(float)); // NOLINT
auto bs_ptr = (float *)fpga::fpga_malloc(2 * channel * sub_conv_n * // NOLINT
sizeof(float)); // NOLINT
for (int i = 0; i < channel * sub_conv_n; i++) {
bs_ptr[i + sub_conv_n * channel] = 1;
......@@ -49,20 +49,7 @@ bool DeconvAddKernel<FPGA, float>::Init(FusionDeconvAddParam<FPGA> *param) {
"filter width should be equal to filter height ");
PADDLE_MOBILE_ENFORCE(((filter->dims()[2] % param->Strides()[0]) == 0),
"filter axis should be the multiple of stride axis ");
float max_value = fpga::filter_find_max(filter);
fpga::format_deconv_filter(filter, max_value, param->Groups(),
param->Strides()[0]);
int element_num_per_div =
fpga::get_deconv_filter_num_per_div(filter, param->Groups(), sub_conv_n);
//
fpga::format_bias_scale_array(&bs_ptr, element_num_per_div,
channel * sub_conv_n);
fpga::format_fp16_ofm(out);
fpga::format_deconv_data(filter, out, &bs_ptr, param->Groups(), sub_conv_n);
fpga::DeconvArgs deconv_arg = {0};
fpga::fill_deconv_arg(&deconv_arg, input, out, filter, relu_enabled,
param->Groups(), param->Strides()[0],
......
......@@ -36,8 +36,8 @@ bool DeconvAddReluKernel<FPGA, float>::Init(
int channel = out->dims()[1];
int sub_conv_n = param->Strides()[0];
auto bs_ptr = (float *)fpga::fpga_malloc(2 * channel * sub_conv_n *
sizeof(float)); // NOLINT
auto bs_ptr = (float *)fpga::fpga_malloc(2 * channel * sub_conv_n * // NOLINT
sizeof(float)); // NOLINT
for (int i = 0; i < channel * sub_conv_n; i++) {
bs_ptr[i + sub_conv_n * channel] = 1;
......@@ -50,19 +50,7 @@ bool DeconvAddReluKernel<FPGA, float>::Init(
"filter width should be equal to filter height ");
PADDLE_MOBILE_ENFORCE(((filter->dims()[2] % param->Strides()[0]) == 0),
"filter axis should be the multiple of stride axis ");
float max_value = fpga::filter_find_max(filter);
fpga::format_deconv_filter(filter, max_value, param->Groups(),
param->Strides()[0]);
int element_num_per_div =
fpga::get_deconv_filter_num_per_div(filter, param->Groups(), sub_conv_n);
fpga::format_bias_scale_array(&bs_ptr, element_num_per_div,
channel * sub_conv_n);
fpga::format_fp16_ofm(out);
fpga::format_deconv_data(filter, out, &bs_ptr, param->Groups(), sub_conv_n);
fpga::DeconvArgs deconv_arg = {0};
fpga::fill_deconv_arg(&deconv_arg, input, out, filter, relu_enabled,
param->Groups(), param->Strides()[0],
......
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifdef RESHAPE_OP
#include "operators/kernel/reshape_kernel.h"
namespace paddle_mobile {
namespace operators {
template <>
bool ReshapeKernel<FPGA, float>::Init(ReshapeParam<FPGA> *param) {
param->Out()->ShareDataWith(*param->InputX());
return true;
}
template <>
void ReshapeKernel<FPGA, float>::Compute(const ReshapeParam<FPGA> &param) {}
} // namespace operators
} // namespace paddle_mobile
#endif
......@@ -28,18 +28,26 @@ bool SoftmaxKernel<FPGA, float>::Init(SoftmaxParam<FPGA> *param) {
fpga::format_fp32_ofm(out);
auto float_input = new Tensor;
float_input->mutable_data<float>(
{1, input->dims()[2], input->dims()[3], input->dims()[1]});
fpga::format_fp32_ofm(float_input);
if (input->dims().size() == 2) {
float_input->mutable_data<float>({1, input->dims()[1]});
} else if (input->dims().size() == 4) {
float_input->mutable_data<float>(
{1, input->dims()[2], input->dims()[3], input->dims()[1]});
} else {
DLOG << "wrong dimension of softmax input";
}
fpga::format_fp32_ofm(float_input);
fpga::BypassArgs args = {fpga::DATA_TYPE_FP16};
args.input_layout_type = fpga::LAYOUT_HWC;
args.output_layout_type = fpga::LAYOUT_CHW;
args.input_data_type = fpga::DATA_TYPE_FP16;
args.output_data_type = fpga::DATA_TYPE_FP32;
args.image.address = input_ptr;
args.image.height = (uint32_t)input->dims()[2];
args.image.width = (uint32_t)input->dims()[3];
args.image.height =
(input->dims().size() == 4) ? (uint32_t)input->dims()[2] : 1;
args.image.width =
(input->dims().size() == 4) ? (uint32_t)input->dims()[3] : 1;
args.image.channels = (uint32_t)input->dims()[1];
args.output.address = float_input->data<float>();
args.output.scale_address = float_input->scale;
......@@ -56,7 +64,7 @@ void SoftmaxKernel<FPGA, float>::Compute(const SoftmaxParam<FPGA> &param) {
fpga::PerformBypass(param.FpgaArgs());
fpga::fpga_invalidate((void *)in_x->data<float>(), // NOLINT
in_x->numel() * sizeof(float));
// TODO: In general case, 0 should be squeezed before softmax input
// TODO: In general case, 0 should be squeezed before softmax input // NOLINT
math::SoftmaxFuntor<CPU, float>()(in_x, out);
fpga::fpga_flush(out->data<float>(), out->memory_size());
}
......
......@@ -251,27 +251,31 @@ void DepthwiseConv3x3(const framework::Tensor *input,
void DepthwiseConv3x3s1p1(const framework::Tensor *input,
const framework::Tensor *filter,
framework::Tensor *output, framework::Tensor *bias,
bool if_bias) {
bool if_bias, bool if_relu) {
#if __ARM_NEON
const float *input_data = input->data<float>();
const float *filter_data = filter->data<float>();
float *output_data = output->mutable_data<float>();
const float *bias_data;
if (if_bias) {
bias_data = bias->data<float>();
}
const int h = static_cast<int>(input->dims()[2]);
const int w = static_cast<int>(input->dims()[3]);
// const int l = h;
const float *bias_data = bias->data<float>();
const int batch_size = static_cast<int>(input->dims()[0]);
const int c = static_cast<int>(input->dims()[1]);
const int h = static_cast<int>(input->dims()[2]);
const int w = static_cast<int>(input->dims()[3]);
const int hxw = h * w;
float32x4_t vbias = vdupq_n_f32(0.0);
for (int b = 0; b < batch_size; ++b) {
const float *filter_data_tmp = filter_data;
// const int l = h;
// leftTop, rightTop, leftBottom, rightBottom
const int lt = 0;
const int rt = w - 1;
const int lb = (h - 1) * w;
const int rb = h * w - 1;
float32x4_t zero = vdupq_n_f32(0.0);
for (int b = 0; b < batch_size; ++b) {
#pragma omp parallel for
for (int j = 0; j < c; ++j) {
const float *filter_data_tmp = filter->data<float>() + j * 9;
const float *input_data = input->data<float>() + j * hxw;
float *output_data = output->mutable_data<float>() + j * hxw;
float32x4_t vbias;
if (if_bias) {
vbias = vdupq_n_f32(bias_data[j]);
}
......@@ -287,39 +291,51 @@ void DepthwiseConv3x3s1p1(const framework::Tensor *input,
float w21 = filter_data_tmp[7];
float w22 = filter_data_tmp[8];
output_data[0] = w11 * input_data[0] + w12 * input_data[1] +
w21 * input_data[w] + w22 * input_data[w + 1];
output_data[w - 1] = w10 * input_data[w - 2] + w11 * input_data[w - 1] +
w20 * input_data[2 * w - 2] +
w21 * input_data[2 * w - 1];
output_data[(h - 1) * w] =
output_data[lt] = w11 * input_data[0] + w12 * input_data[1] +
w21 * input_data[w] + w22 * input_data[w + 1];
output_data[rt] = w10 * input_data[w - 2] + w11 * input_data[w - 1] +
w20 * input_data[2 * w - 2] +
w21 * input_data[2 * w - 1];
output_data[lb] =
w01 * input_data[(h - 2) * w] + w02 * input_data[(h - 2) * w + 1] +
w11 * input_data[(h - 1) * w] + w12 * input_data[(h - 1) * w + 1];
output_data[h * w - 1] =
output_data[rb] =
w00 * input_data[h * w - w - 2] + w01 * input_data[h * w - w - 1] +
w10 * input_data[h * w - 2] + w11 * input_data[h * w - 1];
if (if_bias) {
output_data[0] += bias_data[j];
output_data[w - 1] += bias_data[j];
output_data[(h - 1) * w] += bias_data[j];
output_data[h * w - 1] += bias_data[j];
output_data[lt] += bias_data[j];
output_data[rt] += bias_data[j];
output_data[lb] += bias_data[j];
output_data[rb] += bias_data[j];
}
if (if_relu) {
output_data[lt] = output_data[lt] < 0 ? 0 : output_data[lt];
output_data[rt] = output_data[rt] < 0 ? 0 : output_data[rt];
output_data[lb] = output_data[lb] < 0 ? 0 : output_data[lb];
output_data[rb] = output_data[rb] < 0 ? 0 : output_data[rb];
}
for (int i = 1; i < h - 1; ++i) {
output_data[i * w] =
int left = i * w;
int right = i * w + w - 1;
output_data[left] =
w01 * input_data[i * w - w] + w02 * input_data[i * w - w + 1] +
w11 * input_data[i * w] + w12 * input_data[i * w + 1] +
w21 * input_data[i * w + w] + w22 * input_data[i * w + w + 1];
output_data[i * w + w - 1] = w00 * input_data[i * w + w - 1 - w - 1] +
w01 * input_data[i * w + w - 1 - w] +
w10 * input_data[i * w + w - 1 - 1] +
w11 * input_data[i * w + w - 1] +
w20 * input_data[i * w + w - 1 + w - 1] +
w21 * input_data[i * w + w - 1 + w];
output_data[right] = w00 * input_data[i * w + w - 1 - w - 1] +
w01 * input_data[i * w + w - 1 - w] +
w10 * input_data[i * w + w - 1 - 1] +
w11 * input_data[i * w + w - 1] +
w20 * input_data[i * w + w - 1 + w - 1] +
w21 * input_data[i * w + w - 1 + w];
if (if_bias) {
output_data[i * w] += bias_data[j];
output_data[i * w + w - 1] += bias_data[j];
output_data[left] += bias_data[j];
output_data[right] += bias_data[j];
}
if (if_relu) {
output_data[left] = output_data[left] < 0 ? 0 : output_data[left];
output_data[right] = output_data[right] < 0 ? 0 : output_data[right];
}
}
......@@ -352,7 +368,9 @@ void DepthwiseConv3x3s1p1(const framework::Tensor *input,
out0 = vmlaq_n_f32(out0, tmp2, w21);
out0 = vmlaq_n_f32(out0, tmp3, w22);
out0 = vaddq_f32(out0, vbias);
if (if_relu) {
out0 = vmaxq_f32(out0, zero);
}
vst1q_f32(output_ptr, out0);
in5 = vld1q_f32(input_tmp_end + 4);
......@@ -370,7 +388,9 @@ void DepthwiseConv3x3s1p1(const framework::Tensor *input,
out0 = vmlaq_n_f32(out0, tmp2, w11);
out0 = vmlaq_n_f32(out0, tmp3, w12);
out0 = vaddq_f32(out0, vbias);
if (if_relu) {
out0 = vmaxq_f32(out0, zero);
}
vst1q_f32(output_ptr + (h - 1) * w, out0);
// can optimize to each 8 stride.
......@@ -399,6 +419,9 @@ void DepthwiseConv3x3s1p1(const framework::Tensor *input,
out0 = vmlaq_n_f32(out0, tmp2, w21);
out0 = vmlaq_n_f32(out0, tmp3, w22);
out0 = vaddq_f32(out0, vbias);
if (if_relu) {
out0 = vmaxq_f32(out0, zero);
}
for (int i = 0; i < c_mid; ++i) {
if (i == 0) {
......@@ -428,6 +451,9 @@ void DepthwiseConv3x3s1p1(const framework::Tensor *input,
out0 = vmlaq_n_f32(out0, tmp2, w11);
out0 = vmlaq_n_f32(out0, tmp3, w12);
out0 = vaddq_f32(out0, vbias);
if (if_relu) {
out0 = vmaxq_f32(out0, zero);
}
for (int i = 0; i < c_mid; ++i) {
if (i == 0) {
......@@ -471,6 +497,9 @@ void DepthwiseConv3x3s1p1(const framework::Tensor *input,
out0 = vmlaq_n_f32(out0, tmp4, w21);
out0 = vmlaq_n_f32(out0, tmp5, w22);
out0 = vaddq_f32(out0, vbias);
if (if_relu) {
out0 = vmaxq_f32(out0, zero);
}
vst1q_f32(output_ptr, out0);
......@@ -502,6 +531,9 @@ void DepthwiseConv3x3s1p1(const framework::Tensor *input,
out0 = vmlaq_n_f32(out0, tmp4, w21);
out0 = vmlaq_n_f32(out0, tmp5, w22);
out0 = vaddq_f32(out0, vbias);
if (if_relu) {
out0 = vmaxq_f32(out0, zero);
}
for (int i = 0; i < c_mid; ++i) {
if (i == 0) {
......@@ -515,9 +547,6 @@ void DepthwiseConv3x3s1p1(const framework::Tensor *input,
}
}
}
output_data += hxw;
input_data += hxw;
filter_data_tmp += 9;
}
}
#endif
......@@ -1273,7 +1302,7 @@ void DepthwiseConvAddBNRelu3x3s2p1(const framework::Tensor *input,
void DepthwiseConv3x3s2p1v2(const framework::Tensor *input,
const framework::Tensor *filter,
framework::Tensor *output, framework::Tensor *bias,
bool if_bias) {
bool if_bias, bool if_relu) {
#if __ARM_NEON
const float *input_data = input->data<float>();
const float *filter_data = filter->data<float>();
......@@ -1361,6 +1390,9 @@ void DepthwiseConv3x3s2p1v2(const framework::Tensor *input,
res3 = vaddq_f32(vextq_f32(elewise_res2, zero, 1),
vaddq_f32(elewise_res0, elewise_res1));
res3 = vaddq_f32(res3, vbias);
if (if_relu) {
res3 = vmaxq_f32(res3, zero);
}
vst1q_f32(output_row_ptr, res3);
input_row_ptr += 6;
......@@ -1395,6 +1427,9 @@ void DepthwiseConv3x3s2p1v2(const framework::Tensor *input,
res3 = vaddq_f32(vextq_f32(elewise_res2, zero, 1),
vaddq_f32(elewise_res0, elewise_res1));
res3 = vaddq_f32(res3, vbias);
if (if_relu) {
res3 = vmaxq_f32(res3, zero);
}
if ((w4 != w_times)) {
vst1q_f32(output_row_ptr, res3);
......@@ -1410,12 +1445,18 @@ void DepthwiseConv3x3s2p1v2(const framework::Tensor *input,
output_row_ptr += 3;
}
output_data_tmp[0] = input_const[0] * w11 + input_const[1] * w12 +
input_const[in_w] * w21 +
input_const[in_w + 1] * w22;
// leftTop, rightTop, leftBottom, rightBottom
int lt = 0;
int rt = out_w - 1;
int lb = out_w * (out_h - 1);
int rb = out_h * out_w - 1;
output_data_tmp[lt] = input_const[0] * w11 + input_const[1] * w12 +
input_const[in_w] * w21 +
input_const[in_w + 1] * w22;
out2in_mid = (out_w - 1) * 2;
output_data_tmp[out_w - 1] =
output_data_tmp[rt] =
w10 * input_const[out2in_mid - 1] + w11 * input_const[out2in_mid] +
w20 * input_const[out2in_mid + in_w - 1] +
w21 * input_const[out2in_mid + in_w] +
......@@ -1424,7 +1465,7 @@ void DepthwiseConv3x3s2p1v2(const framework::Tensor *input,
out2in_mid = (out_h - 1) * 2 * in_w;
output_data_tmp[out_w * (out_h - 1)] =
output_data_tmp[lb] =
w01 * input_const[out2in_mid - in_w] +
w02 * input_const[out2in_mid - in_w + 1] +
w11 * input_const[out2in_mid] + w12 * input_const[out2in_mid + 1] +
......@@ -1432,7 +1473,7 @@ void DepthwiseConv3x3s2p1v2(const framework::Tensor *input,
w22 * input_const[out2in_mid + in_w + 1]);
out2in_mid = (out_h - 1) * 2 * in_w + (out_w - 1) * 2;
output_data_tmp[out_h * out_w - 1] =
output_data_tmp[rb] =
w00 * input_const[out2in_mid - in_w - 1] +
w01 * input_const[out2in_mid - in_w] +
w10 * input_const[out2in_mid - 1] + w11 * input_const[out2in_mid] +
......@@ -1443,22 +1484,30 @@ void DepthwiseConv3x3s2p1v2(const framework::Tensor *input,
(1 - if_pad_r) * (1 - if_pad_b) * w22 *
input_const[out2in_mid + in_w + 1];
if (if_bias) {
output_data_tmp[0] += bias_data[j];
output_data_tmp[out_w - 1] += bias_data[j];
output_data_tmp[out_w * (out_h - 1)] += bias_data[j];
output_data_tmp[out_h * out_w - 1] += bias_data[j];
output_data_tmp[lt] += bias_data[j];
output_data_tmp[rt] += bias_data[j];
output_data_tmp[lb] += bias_data[j];
output_data_tmp[rb] += bias_data[j];
}
if (if_relu) {
output_data_tmp[lt] = output_data_tmp[lt] < 0 ? 0 : output_data_tmp[lt];
output_data_tmp[rt] = output_data_tmp[rt] < 0 ? 0 : output_data_tmp[rt];
output_data_tmp[lb] = output_data_tmp[lb] < 0 ? 0 : output_data_tmp[lb];
output_data_tmp[rb] = output_data_tmp[rb] < 0 ? 0 : output_data_tmp[rb];
}
for (int i = 1; i < out_h - 1; i++) {
out2in_mid = i * 2 * in_w;
output_data_tmp[i * out_w] = w01 * input_const[out2in_mid - in_w] +
w02 * input_const[out2in_mid - in_w + 1] +
w11 * input_const[out2in_mid] +
w12 * input_const[out2in_mid + 1] +
w21 * input_const[out2in_mid + in_w] +
w22 * input_const[out2in_mid + in_w + 1];
int left = i * out_w;
output_data_tmp[left] = w01 * input_const[out2in_mid - in_w] +
w02 * input_const[out2in_mid - in_w + 1] +
w11 * input_const[out2in_mid] +
w12 * input_const[out2in_mid + 1] +
w21 * input_const[out2in_mid + in_w] +
w22 * input_const[out2in_mid + in_w + 1];
out2in_mid = i * 2 * in_w + (out_w - 1) * 2;
output_data_tmp[i * out_w + out_w - 1] =
int right = i * out_w + out_w - 1;
output_data_tmp[right] =
w00 * input_const[out2in_mid - in_w - 1] +
w01 * input_const[out2in_mid - in_w] +
w10 * input_const[out2in_mid - 1] + w11 * input_const[out2in_mid] +
......@@ -1468,8 +1517,14 @@ void DepthwiseConv3x3s2p1v2(const framework::Tensor *input,
w12 * input_const[out2in_mid + 1] +
w22 * input_const[out2in_mid + in_w + 1]);
if (if_bias) {
output_data_tmp[i * out_w] += bias_data[j];
output_data_tmp[i * out_w + out_w - 1] += bias_data[j];
output_data_tmp[left] += bias_data[j];
output_data_tmp[right] += bias_data[j];
}
if (if_relu) {
output_data_tmp[left] =
output_data_tmp[left] < 0 ? 0 : output_data_tmp[left];
output_data_tmp[right] =
output_data_tmp[right] < 0 ? 0 : output_data_tmp[right];
}
}
filter_data_tmp += 9;
......@@ -1909,7 +1964,7 @@ void DepthwiseConvAddBNRelu3x3s2p1v2(const framework::Tensor *input,
void DepthwiseConv3x3s2p0(const framework::Tensor *input,
const framework::Tensor *filter,
framework::Tensor *output, framework::Tensor *bias,
bool if_bias) {
bool if_bias, bool if_relu) {
#if __ARM_NEON
const int batch_size = static_cast<int>(input->dims()[0]);
......@@ -1977,6 +2032,9 @@ void DepthwiseConv3x3s2p0(const framework::Tensor *input,
if (if_bias) {
out0 = vaddq_f32(out0, biasv);
}
if (if_relu) {
out0 = vmaxq_f32(out0, zero);
}
vst1q_lane_f32(output_ptr, out0, 0);
vst1q_lane_f32(output_ptr + 1, out0, 1);
vst1q_lane_f32(output_ptr + 2, out0, 2);
......@@ -1985,7 +2043,8 @@ void DepthwiseConv3x3s2p0(const framework::Tensor *input,
for (m = 0; m < output_width - 2; m += 3) {
}
for (int j = m; j < output_width; j++) {
output_data[i * output_width + j] =
int index = i * output_width + j;
output_data[index] =
input_data[(2 * i) * input_width + 2 * j] * w00 +
input_data[(2 * i) * input_width + 2 * j + 1] * w01 +
input_data[(2 * i) * input_width + 2 * j + 2] * w02 +
......@@ -1996,7 +2055,11 @@ void DepthwiseConv3x3s2p0(const framework::Tensor *input,
input_data[(2 * i + 2) * input_width + 2 * j + 1] * w21 +
input_data[(2 * i + 2) * input_width + 2 * j + 2] * w22;
if (if_bias) {
output_data[i * output_width + j] += *bias_data;
output_data[index] += *bias_data;
}
if (if_relu) {
output_data[index] =
output_data[index] < 0 ? 0 : output_data[index];
}
}
}
......
......@@ -32,7 +32,7 @@ void DepthwiseConv3x3(const framework::Tensor *input,
void DepthwiseConv3x3s1p1(const framework::Tensor *input,
const framework::Tensor *filter,
framework::Tensor *output, framework::Tensor *bias,
bool if_bias);
bool if_bias, bool if_relu);
void DepthwiseConvAddBNRelu3x3s1p1(const framework::Tensor *input,
const framework::Tensor *filter,
......@@ -51,7 +51,7 @@ void DepthwiseConvAddBNRelu3x3s2p1(const framework::Tensor *input,
void DepthwiseConv3x3s2p1v2(const framework::Tensor *input,
const framework::Tensor *filter,
framework::Tensor *output, framework::Tensor *bias,
bool if_bias);
bool if_bias, bool if_relu);
void DepthwiseConvAddBNRelu3x3s2p1v2(const framework::Tensor *input,
const framework::Tensor *filter,
......@@ -63,7 +63,7 @@ void DepthwiseConvAddBNRelu3x3s2p1v2(const framework::Tensor *input,
void DepthwiseConv3x3s2p0(const framework::Tensor *input,
const framework::Tensor *filter,
framework::Tensor *output, framework::Tensor *bias,
bool if_bias);
bool if_bias, bool if_relu);
// TODO(hjchen2) need to be implemented
// template<typename Itype, typename Otype>
......
......@@ -464,6 +464,13 @@ class ConvParam : public OpParam {
public:
const fpga::SplitConvArgs &FpgaArgs() const { return fpga_conv_args; }
void SetFpgaArgs(const fpga::SplitConvArgs &args) { fpga_conv_args = args; }
public:
fpga::DWconvArgs fpga_dwconv_args;
public:
const fpga::DWconvArgs &FpgaDwconvArgs() const { return fpga_dwconv_args; }
void SetFpgaArgs(const fpga::DWconvArgs &args) { fpga_dwconv_args = args; }
#endif
};
template <typename Dtype>
......
......@@ -38,6 +38,9 @@ REGISTER_OPERATOR_CPU(reshape, ops::ReshapeOp);
#ifdef PADDLE_MOBILE_MALI_GPU
REGISTER_OPERATOR_MALI_GPU(reshape, ops::ReshapeOp);
#endif
#ifdef PADDLE_MOBILE_FPGA
REGISTER_OPERATOR_FPGA(reshape, ops::ReshapeOp);
#endif
#ifdef PADDLE_MOBILE_CL
REGISTER_OPERATOR_CL(reshape, ops::ReshapeOp);
#endif
......
......@@ -129,6 +129,15 @@ if (CON GREATER -1)
endif ()
list(FIND NET "super" CON)
if (CON GREATER -1)
# gen test
ADD_EXECUTABLE(test-super net/test_super.cpp test_helper.h test_include.h executor_for_test.h)
target_link_libraries(test-super paddle-mobile)
set(FOUND_MATCH ON)
endif ()
if (NOT FOUND_MATCH)
# gen test
ADD_EXECUTABLE(test-resnet net/test_resnet.cpp test_helper.h test_include.h executor_for_test.h)
......
#!/usr/bin/env bash
NETS=""
declare -a supportedNets=("googlenet" "mobilenet" "yolo" "squeezenet" "resnet" "mobilenetssd" "nlp" "mobilenetfssd" "genet")
declare -a supportedNets=("googlenet" "mobilenet" "yolo" "squeezenet" "resnet" "mobilenetssd" "nlp" "mobilenetfssd" "genet" "super")
build_for_mac() {
if [ ! `which brew` ]; then
......@@ -162,7 +162,7 @@ build_for_ios() {
fi
cd "${BUILD_DIR}"
make -j 8
cp ../../../src/ios_io/PaddleMobileCPU.h ./build/PaddleMobileCPU.h
cp ../../../src/io/ios_io/PaddleMobileCPU.h ./build/PaddleMobileCPU.h
cd ./build
# 生成符号表
ranlib *.a
......
......@@ -122,6 +122,11 @@ if (CON GREATER -1)
set(SPLIT_OP ON)
set(FUSION_DECONVADD_OP ON)
set(FUSION_DECONVADDRELU_OP ON)
set(RESHAPE_OP ON)
set(FUSION_CONVADDBNRELU_OP ON)
set(FUSION_CONVADDBN_OP ON)
set(FOUND_MATCH ON)
endif()
......@@ -197,6 +202,16 @@ if (CON GREATER -1)
set(FOUND_MATCH ON)
endif()
list(FIND NET "super" CON)
if (CON GREATER -1)
message("super enabled")
set(FUSION_CONVADD_OP ON)
set(FUSION_CONVADDRELU_OP ON)
set(ELEMENTWISEADD_OP ON)
set(FOUND_MATCH ON)
endif()
if(NOT FOUND_MATCH)
message("--default--")
......
......@@ -5,7 +5,7 @@ TOTAL_ERRORS=0
# The trick to remove deleted files: https://stackoverflow.com/a/2413151
for file in $(git diff --cached --name-status | awk '$1 != "D" {print $2}' | \
grep -v ".pb.cpp" | grep -v ".pb.h" | grep -v ".pb-c.h" | grep -v ".pb-c.c" | \
grep -v "protobuf-c.h" | grep -v "protobuf-c.c"); do
grep -v "protobuf-c.h" | grep -v "protobuf-c.c" | grep -v "PaddleMobileCPU.h"); do
cpplint $file;
TOTAL_ERRORS=$(expr $TOTAL_ERRORS + $?);
done
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册