Commit 19308114 authored by Ray Liu and committed by GitHub

Merge pull request #1363 from codeWorm2015/develop

Modify iOS interface
...@@ -175,19 +175,17 @@ class ViewController: UIViewController { ...@@ -175,19 +175,17 @@ class ViewController: UIViewController {
override func viewDidLoad() { override func viewDidLoad() {
super.viewDidLoad() super.viewDidLoad()
// if runner.load() {
// print(" load success ! ")
// } else {
// print(" load error ! ")
// }
//
modelPickerView.delegate = self modelPickerView.delegate = self
modelPickerView.dataSource = self modelPickerView.dataSource = self
threadPickerView.delegate = self threadPickerView.delegate = self
threadPickerView.dataSource = self threadPickerView.dataSource = self
if let image = UIImage.init(named: "test.jpg") {
selectImage = image
selectImageView.image = image
} else {
print("请添加测试图片")
}
selectImage = UIImage.init(named: "hand.jpg")
selectImageView.image = selectImage
// if platform == .CPU { // if platform == .CPU {
// inputPointer = runner.preproccess(image: selectImage!.cgImage!) // inputPointer = runner.preproccess(image: selectImage!.cgImage!)
......
...@@ -902,8 +902,8 @@ ...@@ -902,8 +902,8 @@
baseConfigurationReference = CDF58151D902A1CBAE56A0C2 /* Pods-paddle-mobile.debug.xcconfig */; baseConfigurationReference = CDF58151D902A1CBAE56A0C2 /* Pods-paddle-mobile.debug.xcconfig */;
buildSettings = { buildSettings = {
CLANG_ENABLE_MODULES = YES; CLANG_ENABLE_MODULES = YES;
CODE_SIGN_IDENTITY = ""; CODE_SIGN_IDENTITY = "iPhone Developer";
CODE_SIGN_STYLE = Manual; CODE_SIGN_STYLE = Automatic;
DEFINES_MODULE = YES; DEFINES_MODULE = YES;
DEVELOPMENT_TEAM = ""; DEVELOPMENT_TEAM = "";
DYLIB_COMPATIBILITY_VERSION = 1; DYLIB_COMPATIBILITY_VERSION = 1;
...@@ -922,7 +922,7 @@ ...@@ -922,7 +922,7 @@
"$(inherited)", "$(inherited)",
"$(PROJECT_DIR)/paddle-mobile/CPU", "$(PROJECT_DIR)/paddle-mobile/CPU",
); );
MACH_O_TYPE = staticlib; MACH_O_TYPE = mh_dylib;
MTL_LANGUAGE_REVISION = UseDeploymentTarget; MTL_LANGUAGE_REVISION = UseDeploymentTarget;
PRODUCT_BUNDLE_IDENTIFIER = "orange.paddle-mobile"; PRODUCT_BUNDLE_IDENTIFIER = "orange.paddle-mobile";
PRODUCT_NAME = "$(TARGET_NAME:c99extidentifier)"; PRODUCT_NAME = "$(TARGET_NAME:c99extidentifier)";
...@@ -939,8 +939,8 @@ ...@@ -939,8 +939,8 @@
baseConfigurationReference = E2A7957C92EDA5C3BEC0FFC2 /* Pods-paddle-mobile.release.xcconfig */; baseConfigurationReference = E2A7957C92EDA5C3BEC0FFC2 /* Pods-paddle-mobile.release.xcconfig */;
buildSettings = { buildSettings = {
CLANG_ENABLE_MODULES = YES; CLANG_ENABLE_MODULES = YES;
CODE_SIGN_IDENTITY = ""; CODE_SIGN_IDENTITY = "iPhone Developer";
CODE_SIGN_STYLE = Manual; CODE_SIGN_STYLE = Automatic;
DEFINES_MODULE = YES; DEFINES_MODULE = YES;
DEVELOPMENT_TEAM = ""; DEVELOPMENT_TEAM = "";
DYLIB_COMPATIBILITY_VERSION = 1; DYLIB_COMPATIBILITY_VERSION = 1;
...@@ -959,7 +959,7 @@ ...@@ -959,7 +959,7 @@
"$(inherited)", "$(inherited)",
"$(PROJECT_DIR)/paddle-mobile/CPU", "$(PROJECT_DIR)/paddle-mobile/CPU",
); );
MACH_O_TYPE = staticlib; MACH_O_TYPE = mh_dylib;
MTL_LANGUAGE_REVISION = UseDeploymentTarget; MTL_LANGUAGE_REVISION = UseDeploymentTarget;
PRODUCT_BUNDLE_IDENTIFIER = "orange.paddle-mobile"; PRODUCT_BUNDLE_IDENTIFIER = "orange.paddle-mobile";
PRODUCT_NAME = "$(TARGET_NAME:c99extidentifier)"; PRODUCT_NAME = "$(TARGET_NAME:c99extidentifier)";
......
...@@ -31,8 +31,8 @@ kernel void fetch(texture2d_array<float, access::read> inTexture [[texture(0)]], ...@@ -31,8 +31,8 @@ kernel void fetch(texture2d_array<float, access::read> inTexture [[texture(0)]],
int output_to = 4 * input_width * input_height; int output_to = 4 * input_width * input_height;
output[gid.z * output_to + 0 * input_width * input_height + gid.y * input_width + gid.x] = input.x; output[gid.z * output_to + 0 * input_width * input_height + gid.y * input_width + gid.x] = input.x;
output[gid.z * output_to + 1 * input_width * input_height + gid.y * input_width + gid.x] = input.y; output[gid.z * output_to + 1 * input_width * input_height + gid.y * input_width + gid.x] = input.y;
// output[gid.z * output_to + 2 * input_width * input_height + gid.y * input_width + gid.x] = input.z; output[gid.z * output_to + 2 * input_width * input_height + gid.y * input_width + gid.x] = input.z;
// output[gid.z * output_to + 3 * input_width * input_height + gid.y * input_width + gid.x] = input.w; output[gid.z * output_to + 3 * input_width * input_height + gid.y * input_width + gid.x] = input.w;
} }
...@@ -52,8 +52,8 @@ kernel void fetch_half(texture2d_array<half, access::read> inTexture [[texture(0 ...@@ -52,8 +52,8 @@ kernel void fetch_half(texture2d_array<half, access::read> inTexture [[texture(0
int output_to = 4 * input_width * input_height; int output_to = 4 * input_width * input_height;
output[gid.z * output_to + 0 * input_width * input_height + gid.y * input_width + gid.x] = input.x; output[gid.z * output_to + 0 * input_width * input_height + gid.y * input_width + gid.x] = input.x;
output[gid.z * output_to + 1 * input_width * input_height + gid.y * input_width + gid.x] = input.y; output[gid.z * output_to + 1 * input_width * input_height + gid.y * input_width + gid.x] = input.y;
// output[gid.z * output_to + 2 * input_width * input_height + gid.y * input_width + gid.x] = input.z; output[gid.z * output_to + 2 * input_width * input_height + gid.y * input_width + gid.x] = input.z;
// output[gid.z * output_to + 3 * input_width * input_height + gid.y * input_width + gid.x] = input.w; output[gid.z * output_to + 3 * input_width * input_height + gid.y * input_width + gid.x] = input.w;
} }
......
...@@ -27,59 +27,119 @@ ...@@ -27,59 +27,119 @@
@end @end
@interface PaddleMobileCPUConfig: NSObject
/**
@b Defaults to 1; when using multiple threads, 2 is recommended
*/
@property (assign, nonatomic) int threadNum;
/**
@b Whether to enable runtime infershape
*/
@property (assign, nonatomic) BOOL loddable;
/**
@b Whether to enable op fusion optimization for the model
*/
@property (assign, nonatomic) BOOL optimize;
@end
@interface PaddleMobileCPU : NSObject @interface PaddleMobileCPU : NSObject
/* /**
Create the instance @b Create the instance
*/
- (instancetype)init;
/* @param config the configuration
Load the model and allocate memory @return a paddlemobile CPU object
*/ */
- (BOOL)load:(NSString *)modelPath andWeightsPath:(NSString *)weighsPath; - (instancetype)initWithConfig:(PaddleMobileCPUConfig *)config;
/* /**
Load a model stored in separated form; the model directory must be passed in @b Load a model
*/
@param modelPath path to the model
@param weighsPath path to the weights
@return whether loading succeeded
*/
- (BOOL)loadModel:(NSString *)modelPath andWeightsPath:(NSString *)weighsPath;
/**
@b Load a model stored in separated form; the model directory must be passed in
@param modelAndWeightPath path to the model and weights
@return whether loading succeeded
*/
- (BOOL)load:(NSString *)modelAndWeightPath; - (BOOL)load:(NSString *)modelAndWeightPath;
/* /**
* Load the model from memory @b Load the model from memory
* */
@param modelLen size of the model in bytes
@param modelBuf location of the model in memory
@param combinedParamsLen size of the weights in bytes
@param combinedParamsBuf location of the weights in memory
@return whether loading succeeded
*/
- (BOOL)LoadCombinedMemory:(size_t)modelLen - (BOOL)LoadCombinedMemory:(size_t)modelLen
andModelBuf:(const uint8_t *)modelBuf andModelBuf:(const uint8_t *)modelBuf
andModelParamsLen:(size_t)combinedParamsLen andModelParamsLen:(size_t)combinedParamsLen
andCombinedParamsBuf:(const uint8_t *)combinedParamsBuf; andCombinedParamsBuf:(const uint8_t *)combinedParamsBuf;
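As an illustration of the in-memory loading path, here is a minimal Objective-C sketch; the NSData-based file reading, the placeholder paths, and the buffer-lifetime note are assumptions, not part of this change.

#import <Foundation/Foundation.h>
#import "PaddleMobileCPU.h"

// Hedged sketch: read a combined model and its parameters into memory and hand
// the raw buffers to LoadCombinedMemory. Paths are placeholders; depending on
// the framework's ownership rules, the buffers may need to outlive the runner.
static BOOL LoadModelFromMemory(PaddleMobileCPU *runner) {
  NSData *modelData = [NSData dataWithContentsOfFile:@"/path/to/model"];
  NSData *paramsData = [NSData dataWithContentsOfFile:@"/path/to/params"];
  if (modelData == nil || paramsData == nil) {
    return NO;
  }
  return [runner LoadCombinedMemory:modelData.length
                        andModelBuf:(const uint8_t *)modelData.bytes
                  andModelParamsLen:paramsData.length
               andCombinedParamsBuf:(const uint8_t *)paramsData.bytes];
}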
/* /*
* Preprocess the image; the caller must allocate and free the output memory *
* */ * */
/**
@b Preprocess the image; the caller must allocate and free the output memory. Each pixel is transformed as (x + means) * scale, where x is the pixel value
@param image the input image
@param output the preprocessed output
@param means the means used in preprocessing
@param scale the scale used in preprocessing
@param dim the dimensions after preprocessing
*/
-(void)preprocess:(CGImageRef)image -(void)preprocess:(CGImageRef)image
output:(float *)output output:(float *)output
means:(NSArray<NSNumber *> *)means means:(NSArray<NSNumber *> *)means
scale:(float)scale scale:(float)scale
dim:(NSArray<NSNumber *> *)dim; dim:(NSArray<NSNumber *> *)dim;
/* /**
* Run prediction on preprocessed data; when finished with the returned result, call its realseOutput method to release it Run prediction
* */
@param input the input
@param dim the input dimensions
@return the output result
*/
- (PaddleMobileCPUResult *)predictInput:(float *)input - (PaddleMobileCPUResult *)predictInput:(float *)input
dim:(NSArray<NSNumber *> *)dim; dim:(NSArray<NSNumber *> *)dim;
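preprocess: and predictInput:dim: together form the lower-level pipeline. A minimal sketch follows, assuming an NCHW input of 1x3x224x224 and neutral means/scale; all literal values are placeholders.

#import <UIKit/UIKit.h>
#import "PaddleMobileCPU.h"

// Hedged sketch: caller-managed preprocessing followed by prediction.
// The output buffer must hold the product of dim and is freed by the caller,
// as the comments above require.
static PaddleMobileCPUResult *RunOnImage(PaddleMobileCPU *runner, CGImageRef image) {
  NSArray<NSNumber *> *dim = @[ @1, @3, @224, @224 ];  // assumed NCHW shape
  float *input = (float *)malloc(1 * 3 * 224 * 224 * sizeof(float));
  [runner preprocess:image
              output:input
               means:@[ @0, @0, @0 ]
               scale:1.0f
                 dim:dim];
  PaddleMobileCPUResult *result = [runner predictInput:input dim:dim];
  free(input);
  return result;
}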
/* /**
Run prediction; means and scale are the preprocessing parameters used when training the model; if no such preprocessing was done at training time, call predict directly @b Run prediction; means and scale are the preprocessing parameters used when training the model; if no such preprocessing was done at training time, call predict directly. Each pixel is transformed as (x + means) * scale, where x is the pixel value
*/
- (NSArray *)predict:(CGImageRef)image dim:(NSArray<NSNumber *> *)dim means:(NSArray<NSNumber *> *)means scale:(float)scale; @param image the input image
@param dim the input dimensions
/* @param means the means used in preprocessing
Run prediction with default means of 0 and scale of 1.0 @param scale the scale used in preprocessing
*/ @return the prediction result
- (NSArray *)predict:(CGImageRef)image dim:(NSArray<NSNumber *> *)dim; */
- (PaddleMobileCPUResult *)predict:(CGImageRef)image dim:(NSArray<NSNumber *> *)dim means:(NSArray<NSNumber *> *)means scale:(float)scale;
/*
Release memory /**
*/ Run prediction; preprocessing means is 0 and scale is 1
@param image the input image
@param dim the input dimensions
@return the prediction result
*/
- (PaddleMobileCPUResult *)predict:(CGImageRef)image dim:(NSArray<NSNumber *> *)dim;
/**
@b Release memory
*/
- (void)clear; - (void)clear;
@end @end
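For reference, a minimal end-to-end sketch of the revised interface as it might be driven from the demo app; the model paths, input shape, means, and scale below are placeholder assumptions.

#import <UIKit/UIKit.h>
#import "PaddleMobileCPU.h"

// Hedged sketch: configure, load, predict, clean up. All literal values are placeholders.
static void RunDemo(UIImage *image) {
  PaddleMobileCPUConfig *config = [[PaddleMobileCPUConfig alloc] init];
  config.threadNum = 2;    // defaults to 1
  config.optimize = YES;   // op fusion optimization
  config.loddable = NO;    // runtime infershape

  PaddleMobileCPU *runner = [[PaddleMobileCPU alloc] initWithConfig:config];
  if (![runner loadModel:@"/path/to/model" andWeightsPath:@"/path/to/params"]) {
    NSLog(@"load failed");
    return;
  }
  PaddleMobileCPUResult *result = [runner predict:image.CGImage
                                              dim:@[ @1, @3, @224, @224 ]
                                            means:@[ @0, @0, @0 ]
                                            scale:1.0f];
  // Consume result here; see PaddleMobileCPUResult for how its buffer is released.
  [runner clear];
}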
...@@ -45,21 +45,44 @@ ...@@ -45,21 +45,44 @@
@end @end
@implementation PaddleMobileCPUConfig
-(instancetype)init {
if (self = [super init]) {
self.threadNum = 1;
self.optimize = YES;
}
return self;
}
@end
@interface PaddleMobileCPU() @interface PaddleMobileCPU()
{ {
paddle_mobile::PaddleMobile<paddle_mobile::CPU, float> *pam_; paddle_mobile::PaddleMobile<paddle_mobile::CPU, float> *pam_;
BOOL loaded_; BOOL loaded_;
} }
@property (strong, nonatomic) PaddleMobileCPUConfig *config;
@end @end
@implementation PaddleMobileCPU @implementation PaddleMobileCPU
static std::mutex shared_mutex; static std::mutex shared_mutex;
- (instancetype)init { - (instancetype)initWithConfig:(PaddleMobileCPUConfig *)config {
if (self = [super init]) { if (self = [super init]) {
pam_ = new paddle_mobile::PaddleMobile<paddle_mobile::CPU, float>(); pam_ = new paddle_mobile::PaddleMobile<paddle_mobile::CPU, float>();
_config = config;
}
return self;
}
-(instancetype)init {
if (self = [super init]) {
_config = [[PaddleMobileCPUConfig alloc] init];
pam_ = new paddle_mobile::PaddleMobile<paddle_mobile::CPU, float>();
} }
return self; return self;
} }
...@@ -79,11 +102,11 @@ static std::mutex shared_mutex; ...@@ -79,11 +102,11 @@ static std::mutex shared_mutex;
return sharedManager; return sharedManager;
} }
- (BOOL)load:(NSString *)modelPath andWeightsPath:(NSString *)weighsPath{ - (BOOL)loadModel:(NSString *)modelPath andWeightsPath:(NSString *)weighsPath {
std::string model_path_str = std::string([modelPath UTF8String]); std::string model_path_str = std::string([modelPath UTF8String]);
std::string weights_path_str = std::string([weighsPath UTF8String]); std::string weights_path_str = std::string([weighsPath UTF8String]);
pam_->SetThreadNum(2); pam_->SetThreadNum(self.config.threadNum);
if (loaded_ = pam_->Load(model_path_str, weights_path_str, true)) { if (loaded_ = pam_->Load(model_path_str, weights_path_str, self.config.optimize, false, 1, self.config.loddable)) {
return YES; return YES;
} else { } else {
return NO; return NO;
...@@ -94,14 +117,14 @@ static std::mutex shared_mutex; ...@@ -94,14 +117,14 @@ static std::mutex shared_mutex;
andModelBuf:(const uint8_t *)modelBuf andModelBuf:(const uint8_t *)modelBuf
andModelParamsLen:(size_t)combinedParamsLen andModelParamsLen:(size_t)combinedParamsLen
andCombinedParamsBuf:(const uint8_t *)combinedParamsBuf { andCombinedParamsBuf:(const uint8_t *)combinedParamsBuf {
pam_->SetThreadNum(2); pam_->SetThreadNum(self.config.threadNum);
return loaded_ = pam_->LoadCombinedMemory(modelLen, modelBuf, combinedParamsLen, return loaded_ = pam_->LoadCombinedMemory(modelLen, modelBuf, combinedParamsLen,
const_cast<uint8_t*>(combinedParamsBuf)); const_cast<uint8_t*>(combinedParamsBuf), self.config.optimize, false, 1, self.config.loddable);
} }
- (BOOL)load:(NSString *)modelAndWeightPath{ - (BOOL)load:(NSString *)modelAndWeightPath{
std::string model_path_str = std::string([modelAndWeightPath UTF8String]); std::string model_path_str = std::string([modelAndWeightPath UTF8String]);
if (loaded_ = pam_->Load(model_path_str)) { if (loaded_ = pam_->Load(model_path_str, self.config.optimize, false, 1, self.config.loddable)) {
return YES; return YES;
} else { } else {
return NO; return NO;
...@@ -116,6 +139,10 @@ static std::mutex shared_mutex; ...@@ -116,6 +139,10 @@ static std::mutex shared_mutex;
dim:(NSArray<NSNumber *> *)dim { dim:(NSArray<NSNumber *> *)dim {
std::lock_guard<std::mutex> lock(shared_mutex); std::lock_guard<std::mutex> lock(shared_mutex);
if (means == nil) {
means = @[@0, @0, @0];
}
// dim to c++ vector, get numel // dim to c++ vector, get numel
std::vector<int64_t > dim_vec; std::vector<int64_t > dim_vec;
int numel = 1; int numel = 1;
...@@ -235,7 +262,7 @@ static std::mutex shared_mutex; ...@@ -235,7 +262,7 @@ static std::mutex shared_mutex;
return cpuResult; return cpuResult;
} }
- (NSArray *)predict:(CGImageRef)image dim:(NSArray<NSNumber *> *)dim means:(NSArray<NSNumber *> *)means scale:(float)scale{ - (PaddleMobileCPUResult *)predict:(CGImageRef)image dim:(NSArray<NSNumber *> *)dim means:(NSArray<NSNumber *> *)means scale:(float)scale{
// printf(" predict one "); // printf(" predict one ");
std::lock_guard<std::mutex> lock(shared_mutex); std::lock_guard<std::mutex> lock(shared_mutex);
if (!loaded_) { if (!loaded_) {
...@@ -284,28 +311,22 @@ static std::mutex shared_mutex; ...@@ -284,28 +311,22 @@ static std::mutex shared_mutex;
// predict // predict
std::vector<float> cpp_result = pam_->Predict(predict_input, dim_vec); std::vector<float> cpp_result = pam_->Predict(predict_input, dim_vec);
// result float *output_pointer = new float[cpp_result.size()];
long count = 0; memcpy(output_pointer, cpp_result.data(),
count = cpp_result.size(); cpp_result.size() * sizeof(float));
NSMutableArray *result = [[NSMutableArray alloc] init]; PaddleMobileCPUResult *cpuResult = [[PaddleMobileCPUResult alloc] init];
for (int i = 0; i < count; i++) { [cpuResult toSetOutput: output_pointer];
[result addObject:[NSNumber numberWithFloat:cpp_result[i]]]; [cpuResult toSetOutputSize: cpp_result.size()];
}
free(output); free(output);
// pending verification
// if ([UIDevice currentDevice].systemVersion.doubleValue < 11.0) {
CFRelease(cfData); CFRelease(cfData);
cfData = NULL; cfData = NULL;
// }
return result; return cpuResult;
} }
- (NSArray *)predict:(CGImageRef)image dim:(NSArray<NSNumber *> *)dim { - (PaddleMobileCPUResult *)predict:(CGImageRef)image dim:(NSArray<NSNumber *> *)dim {
[self predict:image dim:dim means:nil scale:1]; return [self predict:image dim:dim means:nil scale:1];
} }
- (void)clear{ - (void)clear{
......
...@@ -74,13 +74,10 @@ PMStatus PaddleMobile<Device, T>::Load(const std::string &model_path, ...@@ -74,13 +74,10 @@ PMStatus PaddleMobile<Device, T>::Load(const std::string &model_path,
} }
template <typename Device, typename T> template <typename Device, typename T>
bool PaddleMobile<Device, T>::LoadCombinedMemory(size_t model_len, bool PaddleMobile<Device, T>::LoadCombinedMemory(
const uint8_t *model_buf, size_t model_len, const uint8_t *model_buf, size_t combined_params_len,
size_t combined_params_len, uint8_t *combined_params_buf, bool optimize, bool quantification,
uint8_t *combined_params_buf) { int batch_size, bool loddable) {
int batch_size = 1;
bool optimise = true;
bool quantification = false;
if (loader_.get() == nullptr) { if (loader_.get() == nullptr) {
loader_ = std::make_shared<framework::Loader<Device, T>>(); loader_ = std::make_shared<framework::Loader<Device, T>>();
} else { } else {
...@@ -89,9 +86,9 @@ bool PaddleMobile<Device, T>::LoadCombinedMemory(size_t model_len, ...@@ -89,9 +86,9 @@ bool PaddleMobile<Device, T>::LoadCombinedMemory(size_t model_len,
if (executor_.get() == nullptr) { if (executor_.get() == nullptr) {
executor_ = std::make_shared<framework::Executor<Device, T>>( executor_ = std::make_shared<framework::Executor<Device, T>>(
loader_->LoadCombinedMemory(model_len, model_buf, combined_params_len, loader_->LoadCombinedMemory(model_len, model_buf, combined_params_len,
combined_params_buf, optimise, combined_params_buf, optimize,
quantification), quantification),
batch_size, optimise); batch_size, optimize, loddable);
} else { } else {
LOG(kLOG_INFO) << "executor inited"; LOG(kLOG_INFO) << "executor inited";
} }
......
...@@ -73,7 +73,9 @@ class PaddleMobile { ...@@ -73,7 +73,9 @@ class PaddleMobile {
bool LoadCombinedMemory(size_t model_len, const uint8_t *model_buf, bool LoadCombinedMemory(size_t model_len, const uint8_t *model_buf,
size_t combined_params_len, size_t combined_params_len,
uint8_t *combined_params_buf); uint8_t *combined_params_buf, bool optimize = false,
bool quantification = false, int batch_size = 1,
bool loddable = false);
void SetThreadNum(int count); void SetThreadNum(int count);
void Clear(); void Clear();
......
...@@ -77,15 +77,15 @@ void ConvKernel<CPU, float>::Compute(const ConvParam<CPU> &param) { ...@@ -77,15 +77,15 @@ void ConvKernel<CPU, float>::Compute(const ConvParam<CPU> &param) {
break; break;
case ConvParam<CPU>::EXEC_DEPTHWISE3x3S1P1_FLOAT: case ConvParam<CPU>::EXEC_DEPTHWISE3x3S1P1_FLOAT:
math::DepthwiseConv3x3s1p1(param.Input(), param.Filter(), param.Output(), math::DepthwiseConv3x3s1p1(param.Input(), param.Filter(), param.Output(),
nullptr, false); nullptr, false, false);
break; break;
case ConvParam<CPU>::EXEC_DEPTHWISE3x3S2P1_FLOAT: case ConvParam<CPU>::EXEC_DEPTHWISE3x3S2P1_FLOAT:
math::DepthwiseConv3x3s2p1v2(param.Input(), param.Filter(), math::DepthwiseConv3x3s2p1v2(param.Input(), param.Filter(),
param.Output(), nullptr, false); param.Output(), nullptr, false, false);
break; break;
case ConvParam<CPU>::EXEC_DEPTHWISE3x3S2P0_FLOAT: case ConvParam<CPU>::EXEC_DEPTHWISE3x3S2P0_FLOAT:
math::DepthwiseConv3x3s2p0(param.Input(), param.Filter(), param.Output(), math::DepthwiseConv3x3s2p0(param.Input(), param.Filter(), param.Output(),
nullptr, false); nullptr, false, false);
break; break;
case ConvParam<CPU>::EXEC_WINOGRAD3X3_FLOAT: case ConvParam<CPU>::EXEC_WINOGRAD3X3_FLOAT:
WinogradConv3x3<8, 3>(param); WinogradConv3x3<8, 3>(param);
......
...@@ -122,7 +122,7 @@ void ConvAddCompute(const FusionConvAddParam<CPU> &param) { ...@@ -122,7 +122,7 @@ void ConvAddCompute(const FusionConvAddParam<CPU> &param) {
param.Filter()->dims()[2] == param.Filter()->dims()[3] && param.Filter()->dims()[2] == param.Filter()->dims()[3] &&
param.Filter()->dims()[2] == 3 && param.Strides()[0] == 1) { param.Filter()->dims()[2] == 3 && param.Strides()[0] == 1) {
math::DepthwiseConv3x3s1p1(param.Input(), param.Filter(), param.Output(), math::DepthwiseConv3x3s1p1(param.Input(), param.Filter(), param.Output(),
param.Bias(), true); param.Bias(), true, false);
} else if (param.Groups() == param.Input()->dims()[1] && } else if (param.Groups() == param.Input()->dims()[1] &&
param.Input()->dims()[1] == param.Output()->dims()[1] && param.Input()->dims()[1] == param.Output()->dims()[1] &&
param.Filter()->dims()[2] == param.Filter()->dims()[3] && param.Filter()->dims()[2] == param.Filter()->dims()[3] &&
...@@ -133,10 +133,10 @@ void ConvAddCompute(const FusionConvAddParam<CPU> &param) { ...@@ -133,10 +133,10 @@ void ConvAddCompute(const FusionConvAddParam<CPU> &param) {
// param.Output(), false); // param.Output(), false);
if (param.Paddings()[0] == 0) { if (param.Paddings()[0] == 0) {
math::DepthwiseConv3x3s2p0(param.Input(), param.Filter(), param.Output(), math::DepthwiseConv3x3s2p0(param.Input(), param.Filter(), param.Output(),
param.Bias(), true); param.Bias(), true, false);
} else { } else {
math::DepthwiseConv3x3s2p1v2(param.Input(), param.Filter(), math::DepthwiseConv3x3s2p1v2(param.Input(), param.Filter(),
param.Output(), param.Bias(), true); param.Output(), param.Bias(), true, false);
} }
} else { } else {
ConvAddBasic(param); ConvAddBasic(param);
......
...@@ -15,6 +15,7 @@ limitations under the License. */ ...@@ -15,6 +15,7 @@ limitations under the License. */
#ifdef FUSION_CONVADDRELU_OP #ifdef FUSION_CONVADDRELU_OP
#pragma once #pragma once
#include <operators/math/depthwise_conv3x3.h>
#include <vector> #include <vector>
#include "operators/math/conv_func.h" #include "operators/math/conv_func.h"
#include "operators/math/im2col.h" #include "operators/math/im2col.h"
...@@ -26,7 +27,7 @@ namespace paddle_mobile { ...@@ -26,7 +27,7 @@ namespace paddle_mobile {
namespace operators { namespace operators {
template <typename Itype, typename Otype> template <typename Itype, typename Otype>
void ConvAddReluCompute(const FusionConvAddReluParam<CPU> &param) { void ConvAddReluBasic(const FusionConvAddReluParam<CPU> &param) {
const Tensor *input = param.Input(); const Tensor *input = param.Input();
Tensor filter = *param.Filter(); Tensor filter = *param.Filter();
Tensor bias = *param.Bias(); Tensor bias = *param.Bias();
...@@ -118,6 +119,34 @@ void ConvAddReluCompute(const FusionConvAddReluParam<CPU> &param) { ...@@ -118,6 +119,34 @@ void ConvAddReluCompute(const FusionConvAddReluParam<CPU> &param) {
} }
} }
template <typename Itype, typename Otype>
void ConvAddReluCompute(const FusionConvAddReluParam<CPU> &param) {
param.Output()->mutable_data<float>();
if (param.Groups() == param.Input()->dims()[1] &&
param.Input()->dims()[1] == param.Output()->dims()[1] &&
param.Filter()->dims()[2] == param.Filter()->dims()[3] &&
param.Filter()->dims()[2] == 3 && param.Strides()[0] == 1) {
math::DepthwiseConv3x3s1p1(param.Input(), param.Filter(), param.Output(),
param.Bias(), true, true);
} else if (param.Groups() == param.Input()->dims()[1] &&
param.Input()->dims()[1] == param.Output()->dims()[1] &&
param.Filter()->dims()[2] == param.Filter()->dims()[3] &&
param.Filter()->dims()[2] == 3 && param.Strides()[0] == 2) {
// math::DepthwiseConv3x3(param.Input(), param.Strides(),
// param.Paddings(),
// param.Filter(), param.Bias(),
// param.Output(), false);
if (param.Paddings()[0] == 0) {
math::DepthwiseConv3x3s2p0(param.Input(), param.Filter(), param.Output(),
param.Bias(), true, true);
} else {
math::DepthwiseConv3x3s2p1v2(param.Input(), param.Filter(),
param.Output(), param.Bias(), true, true);
}
} else {
ConvAddReluBasic<Itype, Otype>(param);
}
}
} // namespace operators } // namespace operators
} // namespace paddle_mobile } // namespace paddle_mobile
......
...@@ -251,27 +251,31 @@ void DepthwiseConv3x3(const framework::Tensor *input, ...@@ -251,27 +251,31 @@ void DepthwiseConv3x3(const framework::Tensor *input,
void DepthwiseConv3x3s1p1(const framework::Tensor *input, void DepthwiseConv3x3s1p1(const framework::Tensor *input,
const framework::Tensor *filter, const framework::Tensor *filter,
framework::Tensor *output, framework::Tensor *bias, framework::Tensor *output, framework::Tensor *bias,
bool if_bias) { bool if_bias, bool if_relu) {
#if __ARM_NEON #if __ARM_NEON
const float *input_data = input->data<float>(); const float *bias_data = bias->data<float>();
const float *filter_data = filter->data<float>();
float *output_data = output->mutable_data<float>();
const float *bias_data;
if (if_bias) {
bias_data = bias->data<float>();
}
const int h = static_cast<int>(input->dims()[2]);
const int w = static_cast<int>(input->dims()[3]);
// const int l = h;
const int batch_size = static_cast<int>(input->dims()[0]); const int batch_size = static_cast<int>(input->dims()[0]);
const int c = static_cast<int>(input->dims()[1]); const int c = static_cast<int>(input->dims()[1]);
const int h = static_cast<int>(input->dims()[2]);
const int w = static_cast<int>(input->dims()[3]);
const int hxw = h * w; const int hxw = h * w;
float32x4_t vbias = vdupq_n_f32(0.0); // const int l = h;
for (int b = 0; b < batch_size; ++b) {
const float *filter_data_tmp = filter_data; // leftTop, rightTop, leftBottom, rightBottom
const int lt = 0;
const int rt = w - 1;
const int lb = (h - 1) * w;
const int rb = h * w - 1;
float32x4_t zero = vdupq_n_f32(0.0);
for (int b = 0; b < batch_size; ++b) {
#pragma omp parallel for
for (int j = 0; j < c; ++j) { for (int j = 0; j < c; ++j) {
const float *filter_data_tmp = filter->data<float>() + j * 9;
const float *input_data = input->data<float>() + j * hxw;
float *output_data = output->mutable_data<float>() + j * hxw;
float32x4_t vbias;
if (if_bias) { if (if_bias) {
vbias = vdupq_n_f32(bias_data[j]); vbias = vdupq_n_f32(bias_data[j]);
} }
...@@ -287,39 +291,51 @@ void DepthwiseConv3x3s1p1(const framework::Tensor *input, ...@@ -287,39 +291,51 @@ void DepthwiseConv3x3s1p1(const framework::Tensor *input,
float w21 = filter_data_tmp[7]; float w21 = filter_data_tmp[7];
float w22 = filter_data_tmp[8]; float w22 = filter_data_tmp[8];
output_data[0] = w11 * input_data[0] + w12 * input_data[1] + output_data[lt] = w11 * input_data[0] + w12 * input_data[1] +
w21 * input_data[w] + w22 * input_data[w + 1]; w21 * input_data[w] + w22 * input_data[w + 1];
output_data[w - 1] = w10 * input_data[w - 2] + w11 * input_data[w - 1] + output_data[rt] = w10 * input_data[w - 2] + w11 * input_data[w - 1] +
w20 * input_data[2 * w - 2] + w20 * input_data[2 * w - 2] +
w21 * input_data[2 * w - 1]; w21 * input_data[2 * w - 1];
output_data[(h - 1) * w] = output_data[lb] =
w01 * input_data[(h - 2) * w] + w02 * input_data[(h - 2) * w + 1] + w01 * input_data[(h - 2) * w] + w02 * input_data[(h - 2) * w + 1] +
w11 * input_data[(h - 1) * w] + w12 * input_data[(h - 1) * w + 1]; w11 * input_data[(h - 1) * w] + w12 * input_data[(h - 1) * w + 1];
output_data[h * w - 1] = output_data[rb] =
w00 * input_data[h * w - w - 2] + w01 * input_data[h * w - w - 1] + w00 * input_data[h * w - w - 2] + w01 * input_data[h * w - w - 1] +
w10 * input_data[h * w - 2] + w11 * input_data[h * w - 1]; w10 * input_data[h * w - 2] + w11 * input_data[h * w - 1];
if (if_bias) { if (if_bias) {
output_data[0] += bias_data[j]; output_data[lt] += bias_data[j];
output_data[w - 1] += bias_data[j]; output_data[rt] += bias_data[j];
output_data[(h - 1) * w] += bias_data[j]; output_data[lb] += bias_data[j];
output_data[h * w - 1] += bias_data[j]; output_data[rb] += bias_data[j];
}
if (if_relu) {
output_data[lt] = output_data[lt] < 0 ? 0 : output_data[lt];
output_data[rt] = output_data[rt] < 0 ? 0 : output_data[rt];
output_data[lb] = output_data[lb] < 0 ? 0 : output_data[lb];
output_data[rb] = output_data[rb] < 0 ? 0 : output_data[rb];
} }
for (int i = 1; i < h - 1; ++i) { for (int i = 1; i < h - 1; ++i) {
output_data[i * w] = int left = i * w;
int right = i * w + w - 1;
output_data[left] =
w01 * input_data[i * w - w] + w02 * input_data[i * w - w + 1] + w01 * input_data[i * w - w] + w02 * input_data[i * w - w + 1] +
w11 * input_data[i * w] + w12 * input_data[i * w + 1] + w11 * input_data[i * w] + w12 * input_data[i * w + 1] +
w21 * input_data[i * w + w] + w22 * input_data[i * w + w + 1]; w21 * input_data[i * w + w] + w22 * input_data[i * w + w + 1];
output_data[i * w + w - 1] = w00 * input_data[i * w + w - 1 - w - 1] + output_data[right] = w00 * input_data[i * w + w - 1 - w - 1] +
w01 * input_data[i * w + w - 1 - w] + w01 * input_data[i * w + w - 1 - w] +
w10 * input_data[i * w + w - 1 - 1] + w10 * input_data[i * w + w - 1 - 1] +
w11 * input_data[i * w + w - 1] + w11 * input_data[i * w + w - 1] +
w20 * input_data[i * w + w - 1 + w - 1] + w20 * input_data[i * w + w - 1 + w - 1] +
w21 * input_data[i * w + w - 1 + w]; w21 * input_data[i * w + w - 1 + w];
if (if_bias) { if (if_bias) {
output_data[i * w] += bias_data[j]; output_data[left] += bias_data[j];
output_data[i * w + w - 1] += bias_data[j]; output_data[right] += bias_data[j];
}
if (if_relu) {
output_data[left] = output_data[left] < 0 ? 0 : output_data[left];
output_data[right] = output_data[right] < 0 ? 0 : output_data[right];
} }
} }
...@@ -352,7 +368,9 @@ void DepthwiseConv3x3s1p1(const framework::Tensor *input, ...@@ -352,7 +368,9 @@ void DepthwiseConv3x3s1p1(const framework::Tensor *input,
out0 = vmlaq_n_f32(out0, tmp2, w21); out0 = vmlaq_n_f32(out0, tmp2, w21);
out0 = vmlaq_n_f32(out0, tmp3, w22); out0 = vmlaq_n_f32(out0, tmp3, w22);
out0 = vaddq_f32(out0, vbias); out0 = vaddq_f32(out0, vbias);
if (if_relu) {
out0 = vmaxq_f32(out0, zero);
}
vst1q_f32(output_ptr, out0); vst1q_f32(output_ptr, out0);
in5 = vld1q_f32(input_tmp_end + 4); in5 = vld1q_f32(input_tmp_end + 4);
...@@ -370,7 +388,9 @@ void DepthwiseConv3x3s1p1(const framework::Tensor *input, ...@@ -370,7 +388,9 @@ void DepthwiseConv3x3s1p1(const framework::Tensor *input,
out0 = vmlaq_n_f32(out0, tmp2, w11); out0 = vmlaq_n_f32(out0, tmp2, w11);
out0 = vmlaq_n_f32(out0, tmp3, w12); out0 = vmlaq_n_f32(out0, tmp3, w12);
out0 = vaddq_f32(out0, vbias); out0 = vaddq_f32(out0, vbias);
if (if_relu) {
out0 = vmaxq_f32(out0, zero);
}
vst1q_f32(output_ptr + (h - 1) * w, out0); vst1q_f32(output_ptr + (h - 1) * w, out0);
// can optimize to each 8 stride. // can optimize to each 8 stride.
...@@ -399,6 +419,9 @@ void DepthwiseConv3x3s1p1(const framework::Tensor *input, ...@@ -399,6 +419,9 @@ void DepthwiseConv3x3s1p1(const framework::Tensor *input,
out0 = vmlaq_n_f32(out0, tmp2, w21); out0 = vmlaq_n_f32(out0, tmp2, w21);
out0 = vmlaq_n_f32(out0, tmp3, w22); out0 = vmlaq_n_f32(out0, tmp3, w22);
out0 = vaddq_f32(out0, vbias); out0 = vaddq_f32(out0, vbias);
if (if_relu) {
out0 = vmaxq_f32(out0, zero);
}
for (int i = 0; i < c_mid; ++i) { for (int i = 0; i < c_mid; ++i) {
if (i == 0) { if (i == 0) {
...@@ -428,6 +451,9 @@ void DepthwiseConv3x3s1p1(const framework::Tensor *input, ...@@ -428,6 +451,9 @@ void DepthwiseConv3x3s1p1(const framework::Tensor *input,
out0 = vmlaq_n_f32(out0, tmp2, w11); out0 = vmlaq_n_f32(out0, tmp2, w11);
out0 = vmlaq_n_f32(out0, tmp3, w12); out0 = vmlaq_n_f32(out0, tmp3, w12);
out0 = vaddq_f32(out0, vbias); out0 = vaddq_f32(out0, vbias);
if (if_relu) {
out0 = vmaxq_f32(out0, zero);
}
for (int i = 0; i < c_mid; ++i) { for (int i = 0; i < c_mid; ++i) {
if (i == 0) { if (i == 0) {
...@@ -471,6 +497,9 @@ void DepthwiseConv3x3s1p1(const framework::Tensor *input, ...@@ -471,6 +497,9 @@ void DepthwiseConv3x3s1p1(const framework::Tensor *input,
out0 = vmlaq_n_f32(out0, tmp4, w21); out0 = vmlaq_n_f32(out0, tmp4, w21);
out0 = vmlaq_n_f32(out0, tmp5, w22); out0 = vmlaq_n_f32(out0, tmp5, w22);
out0 = vaddq_f32(out0, vbias); out0 = vaddq_f32(out0, vbias);
if (if_relu) {
out0 = vmaxq_f32(out0, zero);
}
vst1q_f32(output_ptr, out0); vst1q_f32(output_ptr, out0);
...@@ -502,6 +531,9 @@ void DepthwiseConv3x3s1p1(const framework::Tensor *input, ...@@ -502,6 +531,9 @@ void DepthwiseConv3x3s1p1(const framework::Tensor *input,
out0 = vmlaq_n_f32(out0, tmp4, w21); out0 = vmlaq_n_f32(out0, tmp4, w21);
out0 = vmlaq_n_f32(out0, tmp5, w22); out0 = vmlaq_n_f32(out0, tmp5, w22);
out0 = vaddq_f32(out0, vbias); out0 = vaddq_f32(out0, vbias);
if (if_relu) {
out0 = vmaxq_f32(out0, zero);
}
for (int i = 0; i < c_mid; ++i) { for (int i = 0; i < c_mid; ++i) {
if (i == 0) { if (i == 0) {
...@@ -515,9 +547,6 @@ void DepthwiseConv3x3s1p1(const framework::Tensor *input, ...@@ -515,9 +547,6 @@ void DepthwiseConv3x3s1p1(const framework::Tensor *input,
} }
} }
} }
output_data += hxw;
input_data += hxw;
filter_data_tmp += 9;
} }
} }
#endif #endif
...@@ -1273,7 +1302,7 @@ void DepthwiseConvAddBNRelu3x3s2p1(const framework::Tensor *input, ...@@ -1273,7 +1302,7 @@ void DepthwiseConvAddBNRelu3x3s2p1(const framework::Tensor *input,
void DepthwiseConv3x3s2p1v2(const framework::Tensor *input, void DepthwiseConv3x3s2p1v2(const framework::Tensor *input,
const framework::Tensor *filter, const framework::Tensor *filter,
framework::Tensor *output, framework::Tensor *bias, framework::Tensor *output, framework::Tensor *bias,
bool if_bias) { bool if_bias, bool if_relu) {
#if __ARM_NEON #if __ARM_NEON
const float *input_data = input->data<float>(); const float *input_data = input->data<float>();
const float *filter_data = filter->data<float>(); const float *filter_data = filter->data<float>();
...@@ -1361,6 +1390,9 @@ void DepthwiseConv3x3s2p1v2(const framework::Tensor *input, ...@@ -1361,6 +1390,9 @@ void DepthwiseConv3x3s2p1v2(const framework::Tensor *input,
res3 = vaddq_f32(vextq_f32(elewise_res2, zero, 1), res3 = vaddq_f32(vextq_f32(elewise_res2, zero, 1),
vaddq_f32(elewise_res0, elewise_res1)); vaddq_f32(elewise_res0, elewise_res1));
res3 = vaddq_f32(res3, vbias); res3 = vaddq_f32(res3, vbias);
if (if_relu) {
res3 = vmaxq_f32(res3, zero);
}
vst1q_f32(output_row_ptr, res3); vst1q_f32(output_row_ptr, res3);
input_row_ptr += 6; input_row_ptr += 6;
...@@ -1395,6 +1427,9 @@ void DepthwiseConv3x3s2p1v2(const framework::Tensor *input, ...@@ -1395,6 +1427,9 @@ void DepthwiseConv3x3s2p1v2(const framework::Tensor *input,
res3 = vaddq_f32(vextq_f32(elewise_res2, zero, 1), res3 = vaddq_f32(vextq_f32(elewise_res2, zero, 1),
vaddq_f32(elewise_res0, elewise_res1)); vaddq_f32(elewise_res0, elewise_res1));
res3 = vaddq_f32(res3, vbias); res3 = vaddq_f32(res3, vbias);
if (if_relu) {
res3 = vmaxq_f32(res3, zero);
}
if ((w4 != w_times)) { if ((w4 != w_times)) {
vst1q_f32(output_row_ptr, res3); vst1q_f32(output_row_ptr, res3);
...@@ -1410,12 +1445,18 @@ void DepthwiseConv3x3s2p1v2(const framework::Tensor *input, ...@@ -1410,12 +1445,18 @@ void DepthwiseConv3x3s2p1v2(const framework::Tensor *input,
output_row_ptr += 3; output_row_ptr += 3;
} }
output_data_tmp[0] = input_const[0] * w11 + input_const[1] * w12 + // leftTop, rightTop, leftBottom, rightBottom
input_const[in_w] * w21 + int lt = 0;
input_const[in_w + 1] * w22; int rt = out_w - 1;
int lb = out_w * (out_h - 1);
int rb = out_h * out_w - 1;
output_data_tmp[lt] = input_const[0] * w11 + input_const[1] * w12 +
input_const[in_w] * w21 +
input_const[in_w + 1] * w22;
out2in_mid = (out_w - 1) * 2; out2in_mid = (out_w - 1) * 2;
output_data_tmp[out_w - 1] = output_data_tmp[rt] =
w10 * input_const[out2in_mid - 1] + w11 * input_const[out2in_mid] + w10 * input_const[out2in_mid - 1] + w11 * input_const[out2in_mid] +
w20 * input_const[out2in_mid + in_w - 1] + w20 * input_const[out2in_mid + in_w - 1] +
w21 * input_const[out2in_mid + in_w] + w21 * input_const[out2in_mid + in_w] +
...@@ -1424,7 +1465,7 @@ void DepthwiseConv3x3s2p1v2(const framework::Tensor *input, ...@@ -1424,7 +1465,7 @@ void DepthwiseConv3x3s2p1v2(const framework::Tensor *input,
out2in_mid = (out_h - 1) * 2 * in_w; out2in_mid = (out_h - 1) * 2 * in_w;
output_data_tmp[out_w * (out_h - 1)] = output_data_tmp[lb] =
w01 * input_const[out2in_mid - in_w] + w01 * input_const[out2in_mid - in_w] +
w02 * input_const[out2in_mid - in_w + 1] + w02 * input_const[out2in_mid - in_w + 1] +
w11 * input_const[out2in_mid] + w12 * input_const[out2in_mid + 1] + w11 * input_const[out2in_mid] + w12 * input_const[out2in_mid + 1] +
...@@ -1432,7 +1473,7 @@ void DepthwiseConv3x3s2p1v2(const framework::Tensor *input, ...@@ -1432,7 +1473,7 @@ void DepthwiseConv3x3s2p1v2(const framework::Tensor *input,
w22 * input_const[out2in_mid + in_w + 1]); w22 * input_const[out2in_mid + in_w + 1]);
out2in_mid = (out_h - 1) * 2 * in_w + (out_w - 1) * 2; out2in_mid = (out_h - 1) * 2 * in_w + (out_w - 1) * 2;
output_data_tmp[out_h * out_w - 1] = output_data_tmp[rb] =
w00 * input_const[out2in_mid - in_w - 1] + w00 * input_const[out2in_mid - in_w - 1] +
w01 * input_const[out2in_mid - in_w] + w01 * input_const[out2in_mid - in_w] +
w10 * input_const[out2in_mid - 1] + w11 * input_const[out2in_mid] + w10 * input_const[out2in_mid - 1] + w11 * input_const[out2in_mid] +
...@@ -1443,22 +1484,30 @@ void DepthwiseConv3x3s2p1v2(const framework::Tensor *input, ...@@ -1443,22 +1484,30 @@ void DepthwiseConv3x3s2p1v2(const framework::Tensor *input,
(1 - if_pad_r) * (1 - if_pad_b) * w22 * (1 - if_pad_r) * (1 - if_pad_b) * w22 *
input_const[out2in_mid + in_w + 1]; input_const[out2in_mid + in_w + 1];
if (if_bias) { if (if_bias) {
output_data_tmp[0] += bias_data[j]; output_data_tmp[lt] += bias_data[j];
output_data_tmp[out_w - 1] += bias_data[j]; output_data_tmp[rt] += bias_data[j];
output_data_tmp[out_w * (out_h - 1)] += bias_data[j]; output_data_tmp[lb] += bias_data[j];
output_data_tmp[out_h * out_w - 1] += bias_data[j]; output_data_tmp[rb] += bias_data[j];
}
if (if_relu) {
output_data_tmp[lt] = output_data_tmp[lt] < 0 ? 0 : output_data_tmp[lt];
output_data_tmp[rt] = output_data_tmp[rt] < 0 ? 0 : output_data_tmp[rt];
output_data_tmp[lb] = output_data_tmp[lb] < 0 ? 0 : output_data_tmp[lb];
output_data_tmp[rb] = output_data_tmp[rb] < 0 ? 0 : output_data_tmp[rb];
} }
for (int i = 1; i < out_h - 1; i++) { for (int i = 1; i < out_h - 1; i++) {
out2in_mid = i * 2 * in_w; out2in_mid = i * 2 * in_w;
output_data_tmp[i * out_w] = w01 * input_const[out2in_mid - in_w] + int left = i * out_w;
w02 * input_const[out2in_mid - in_w + 1] + output_data_tmp[left] = w01 * input_const[out2in_mid - in_w] +
w11 * input_const[out2in_mid] + w02 * input_const[out2in_mid - in_w + 1] +
w12 * input_const[out2in_mid + 1] + w11 * input_const[out2in_mid] +
w21 * input_const[out2in_mid + in_w] + w12 * input_const[out2in_mid + 1] +
w22 * input_const[out2in_mid + in_w + 1]; w21 * input_const[out2in_mid + in_w] +
w22 * input_const[out2in_mid + in_w + 1];
out2in_mid = i * 2 * in_w + (out_w - 1) * 2; out2in_mid = i * 2 * in_w + (out_w - 1) * 2;
output_data_tmp[i * out_w + out_w - 1] = int right = i * out_w + out_w - 1;
output_data_tmp[right] =
w00 * input_const[out2in_mid - in_w - 1] + w00 * input_const[out2in_mid - in_w - 1] +
w01 * input_const[out2in_mid - in_w] + w01 * input_const[out2in_mid - in_w] +
w10 * input_const[out2in_mid - 1] + w11 * input_const[out2in_mid] + w10 * input_const[out2in_mid - 1] + w11 * input_const[out2in_mid] +
...@@ -1468,8 +1517,14 @@ void DepthwiseConv3x3s2p1v2(const framework::Tensor *input, ...@@ -1468,8 +1517,14 @@ void DepthwiseConv3x3s2p1v2(const framework::Tensor *input,
w12 * input_const[out2in_mid + 1] + w12 * input_const[out2in_mid + 1] +
w22 * input_const[out2in_mid + in_w + 1]); w22 * input_const[out2in_mid + in_w + 1]);
if (if_bias) { if (if_bias) {
output_data_tmp[i * out_w] += bias_data[j]; output_data_tmp[left] += bias_data[j];
output_data_tmp[i * out_w + out_w - 1] += bias_data[j]; output_data_tmp[right] += bias_data[j];
}
if (if_relu) {
output_data_tmp[left] =
output_data_tmp[left] < 0 ? 0 : output_data_tmp[left];
output_data_tmp[right] =
output_data_tmp[right] < 0 ? 0 : output_data_tmp[right];
} }
} }
filter_data_tmp += 9; filter_data_tmp += 9;
...@@ -1909,7 +1964,7 @@ void DepthwiseConvAddBNRelu3x3s2p1v2(const framework::Tensor *input, ...@@ -1909,7 +1964,7 @@ void DepthwiseConvAddBNRelu3x3s2p1v2(const framework::Tensor *input,
void DepthwiseConv3x3s2p0(const framework::Tensor *input, void DepthwiseConv3x3s2p0(const framework::Tensor *input,
const framework::Tensor *filter, const framework::Tensor *filter,
framework::Tensor *output, framework::Tensor *bias, framework::Tensor *output, framework::Tensor *bias,
bool if_bias) { bool if_bias, bool if_relu) {
#if __ARM_NEON #if __ARM_NEON
const int batch_size = static_cast<int>(input->dims()[0]); const int batch_size = static_cast<int>(input->dims()[0]);
...@@ -1977,6 +2032,9 @@ void DepthwiseConv3x3s2p0(const framework::Tensor *input, ...@@ -1977,6 +2032,9 @@ void DepthwiseConv3x3s2p0(const framework::Tensor *input,
if (if_bias) { if (if_bias) {
out0 = vaddq_f32(out0, biasv); out0 = vaddq_f32(out0, biasv);
} }
if (if_relu) {
out0 = vmaxq_f32(out0, zero);
}
vst1q_lane_f32(output_ptr, out0, 0); vst1q_lane_f32(output_ptr, out0, 0);
vst1q_lane_f32(output_ptr + 1, out0, 1); vst1q_lane_f32(output_ptr + 1, out0, 1);
vst1q_lane_f32(output_ptr + 2, out0, 2); vst1q_lane_f32(output_ptr + 2, out0, 2);
...@@ -1985,7 +2043,8 @@ void DepthwiseConv3x3s2p0(const framework::Tensor *input, ...@@ -1985,7 +2043,8 @@ void DepthwiseConv3x3s2p0(const framework::Tensor *input,
for (m = 0; m < output_width - 2; m += 3) { for (m = 0; m < output_width - 2; m += 3) {
} }
for (int j = m; j < output_width; j++) { for (int j = m; j < output_width; j++) {
output_data[i * output_width + j] = int index = i * output_width + j;
output_data[index] =
input_data[(2 * i) * input_width + 2 * j] * w00 + input_data[(2 * i) * input_width + 2 * j] * w00 +
input_data[(2 * i) * input_width + 2 * j + 1] * w01 + input_data[(2 * i) * input_width + 2 * j + 1] * w01 +
input_data[(2 * i) * input_width + 2 * j + 2] * w02 + input_data[(2 * i) * input_width + 2 * j + 2] * w02 +
...@@ -1996,7 +2055,11 @@ void DepthwiseConv3x3s2p0(const framework::Tensor *input, ...@@ -1996,7 +2055,11 @@ void DepthwiseConv3x3s2p0(const framework::Tensor *input,
input_data[(2 * i + 2) * input_width + 2 * j + 1] * w21 + input_data[(2 * i + 2) * input_width + 2 * j + 1] * w21 +
input_data[(2 * i + 2) * input_width + 2 * j + 2] * w22; input_data[(2 * i + 2) * input_width + 2 * j + 2] * w22;
if (if_bias) { if (if_bias) {
output_data[i * output_width + j] += *bias_data; output_data[index] += *bias_data;
}
if (if_relu) {
output_data[index] =
output_data[index] < 0 ? 0 : output_data[index];
} }
} }
} }
......
...@@ -32,7 +32,7 @@ void DepthwiseConv3x3(const framework::Tensor *input, ...@@ -32,7 +32,7 @@ void DepthwiseConv3x3(const framework::Tensor *input,
void DepthwiseConv3x3s1p1(const framework::Tensor *input, void DepthwiseConv3x3s1p1(const framework::Tensor *input,
const framework::Tensor *filter, const framework::Tensor *filter,
framework::Tensor *output, framework::Tensor *bias, framework::Tensor *output, framework::Tensor *bias,
bool if_bias); bool if_bias, bool if_relu);
void DepthwiseConvAddBNRelu3x3s1p1(const framework::Tensor *input, void DepthwiseConvAddBNRelu3x3s1p1(const framework::Tensor *input,
const framework::Tensor *filter, const framework::Tensor *filter,
...@@ -51,7 +51,7 @@ void DepthwiseConvAddBNRelu3x3s2p1(const framework::Tensor *input, ...@@ -51,7 +51,7 @@ void DepthwiseConvAddBNRelu3x3s2p1(const framework::Tensor *input,
void DepthwiseConv3x3s2p1v2(const framework::Tensor *input, void DepthwiseConv3x3s2p1v2(const framework::Tensor *input,
const framework::Tensor *filter, const framework::Tensor *filter,
framework::Tensor *output, framework::Tensor *bias, framework::Tensor *output, framework::Tensor *bias,
bool if_bias); bool if_bias, bool if_relu);
void DepthwiseConvAddBNRelu3x3s2p1v2(const framework::Tensor *input, void DepthwiseConvAddBNRelu3x3s2p1v2(const framework::Tensor *input,
const framework::Tensor *filter, const framework::Tensor *filter,
...@@ -63,7 +63,7 @@ void DepthwiseConvAddBNRelu3x3s2p1v2(const framework::Tensor *input, ...@@ -63,7 +63,7 @@ void DepthwiseConvAddBNRelu3x3s2p1v2(const framework::Tensor *input,
void DepthwiseConv3x3s2p0(const framework::Tensor *input, void DepthwiseConv3x3s2p0(const framework::Tensor *input,
const framework::Tensor *filter, const framework::Tensor *filter,
framework::Tensor *output, framework::Tensor *bias, framework::Tensor *output, framework::Tensor *bias,
bool if_bias); bool if_bias, bool if_relu);
// TODO(hjchen2) need to be implemented // TODO(hjchen2) need to be implemented
// template<typename Itype, typename Otype> // template<typename Itype, typename Otype>
......
...@@ -129,6 +129,15 @@ if (CON GREATER -1) ...@@ -129,6 +129,15 @@ if (CON GREATER -1)
endif () endif ()
list(FIND NET "super" CON)
if (CON GREATER -1)
# gen test
ADD_EXECUTABLE(test-super net/test_super.cpp test_helper.h test_include.h executor_for_test.h)
target_link_libraries(test-super paddle-mobile)
set(FOUND_MATCH ON)
endif ()
if (NOT FOUND_MATCH) if (NOT FOUND_MATCH)
# gen test # gen test
ADD_EXECUTABLE(test-resnet net/test_resnet.cpp test_helper.h test_include.h executor_for_test.h) ADD_EXECUTABLE(test-resnet net/test_resnet.cpp test_helper.h test_include.h executor_for_test.h)
......
#!/usr/bin/env bash #!/usr/bin/env bash
NETS="" NETS=""
declare -a supportedNets=("googlenet" "mobilenet" "yolo" "squeezenet" "resnet" "mobilenetssd" "nlp" "mobilenetfssd" "genet") declare -a supportedNets=("googlenet" "mobilenet" "yolo" "squeezenet" "resnet" "mobilenetssd" "nlp" "mobilenetfssd" "genet" "super")
build_for_mac() { build_for_mac() {
if [ ! `which brew` ]; then if [ ! `which brew` ]; then
...@@ -162,7 +162,7 @@ build_for_ios() { ...@@ -162,7 +162,7 @@ build_for_ios() {
fi fi
cd "${BUILD_DIR}" cd "${BUILD_DIR}"
make -j 8 make -j 8
cp ../../../src/ios_io/PaddleMobileCPU.h ./build/PaddleMobileCPU.h cp ../../../src/io/ios_io/PaddleMobileCPU.h ./build/PaddleMobileCPU.h
cd ./build cd ./build
# generate the symbol table # generate the symbol table
ranlib *.a ranlib *.a
......
...@@ -202,6 +202,16 @@ if (CON GREATER -1) ...@@ -202,6 +202,16 @@ if (CON GREATER -1)
set(FOUND_MATCH ON) set(FOUND_MATCH ON)
endif() endif()
list(FIND NET "super" CON)
if (CON GREATER -1)
message("super enabled")
set(FUSION_CONVADD_OP ON)
set(FUSION_CONVADDRELU_OP ON)
set(ELEMENTWISEADD_OP ON)
set(FOUND_MATCH ON)
endif()
if(NOT FOUND_MATCH) if(NOT FOUND_MATCH)
message("--default--") message("--default--")
......