Unverified  Commit fdd03a4c authored by kk12333, committed by GitHub

Merge pull request #4 from PaddlePaddle/develop

merge dev
...@@ -92,3 +92,4 @@ metal/images/
metal/paddle-mobile/paddle-mobile/CPU/libpaddle-mobile.a
*.xcuserdatad/
*/xcuserdata/
/venv/
cmake_minimum_required(VERSION 3.0)
option(USE_OPENMP "openmp support" OFF)
project(paddle-mobile)
# select the platform to build
option(CPU "armv7 with neon support" ON)
option(MALI_GPU "mali gpu support" OFF)
option(FPGA "fpga support" OFF)
option(USE_OPENMP "openmp support" OFF)
option(DEBUGING "enable debug mode" ON)
option(USE_EXCEPTION "use std exception" OFF)
option(LOG_PROFILE "log profile" OFF)
# select the platform to build
option(CPU "armv7 with neon" ON)
option(MALI_GPU "mali gpu" OFF)
option(FPGA "fpga" OFF)
file(GLOB_RECURSE PADDLE_MOBILE_CC src/*.cc src/*.cpp src/*.c src/*.mm)
file(GLOB_RECURSE PADDLE_MOBILE_H src/*.h)
include_directories(src/)
set(CMAKE_BUILD_TYPE Release)
set(CMAKE_CXX_FLAGS "-O3 -s -DNDEBUG ${CMAKE_CXX_FLAGS}")
if(IS_IOS)
set(CMAKE_CXX_FLAGS "-mfpu=neon -marm -fobjc-abi-version=2 -fobjc-arc \
-std=gnu++11 -stdlib=libc++ -isysroot ${CMAKE_OSX_SYSROOT} ${CMAKE_CXX_FLAGS}")
add_compile_options(-fembed-bitcode)
else()
set(CMAKE_CXX_FLAGS "-std=c++11 ${CMAKE_CXX_FLAGS}")
endif()
if(DEBUGING)
message(STATUS "debugging mode")
set(CMAKE_BUILD_TYPE Release)
set(CMAKE_CXX_FLAGS_RELEASE "-DNDEBUG")
add_definitions(-DPADDLE_MOBILE_DEBUG)
else()
set(CMAKE_BUILD_TYPE Release)
set(CMAKE_CXX_FLAGS_RELEASE "-DNDEBUG")
add_definitions(-fvisibility=hidden -fvisibility-inlines-hidden)
endif()
if(USE_EXCEPTION)
message(STATUS "use exception")
add_definitions(-DENABLE_EXCEPTION -fexceptions)
else()
add_definitions(-fno-exceptions)
endif()
if(LOG_PROFILE)
add_definitions(-DPADDLE_MOBILE_PROFILE)
endif()
...@@ -50,12 +49,12 @@ if(USE_OPENMP)
endif()
# platform control
if(ARM_LINUX)
include("${CMAKE_CURRENT_LIST_DIR}/tools/arm-platform.cmake")
endif()
if(CPU)
add_definitions(-DPADDLE_MOBILE_CPU)
else()
file(GLOB_RECURSE _tmp_list src/operators/kernel/arm/*.cpp src/operators/kernel/arm/*.cc)
foreach(f ${_tmp_list})
...@@ -68,7 +67,7 @@ else()
endforeach()
endif()
if(MALI_GPU)
add_definitions(-DPADDLE_MOBILE_MALI_GPU)
add_definitions(-DUSE_ACL=1)
add_definitions(-DUSE_OPENCL)
...@@ -120,20 +119,20 @@ else()
endforeach()
endif()
if(ANDROID_NDK_TOOLCHAIN_INCLUDED)
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -llog")
else()
list(REMOVE_ITEM PADDLE_MOBILE_H ${CMAKE_CURRENT_SOURCE_DIR}/src/jni/paddle_mobile_jni.h)
list(REMOVE_ITEM PADDLE_MOBILE_CC ${CMAKE_CURRENT_SOURCE_DIR}/src/jni/paddle_mobile_jni.cpp)
list(REMOVE_ITEM PADDLE_MOBILE_H ${CMAKE_CURRENT_SOURCE_DIR}/src/operators/math/math_func_neon.h)
endif()
if(IS_IOS)
else()
list(REMOVE_ITEM PADDLE_MOBILE_H ${CMAKE_CURRENT_SOURCE_DIR}/src/ios_io/PaddleMobileCPU.h)
list(REMOVE_ITEM PADDLE_MOBILE_CC ${CMAKE_CURRENT_SOURCE_DIR}/src/ios_io/PaddleMobileCPU.mm)
list(REMOVE_ITEM PADDLE_MOBILE_H ${CMAKE_CURRENT_SOURCE_DIR}/src/ios_io/op_symbols.h)
endif()
set(CMAKE_VERBOSE_MAKEFILE ON)
set(CMAKE_EXPORT_COMPILE_COMMANDS ON)
...@@ -142,7 +141,7 @@ set(CMAKE_LIBRARY_OUTPUT_DIRECTORY build)
set(CMAKE_RUNTIME_OUTPUT_DIRECTORY build)
# NET default
if(FPGA)
set(NET "FPGAnets" CACHE STRING "select net type")
else()
set(NET "default" CACHE STRING "select net type")
...@@ -153,7 +152,7 @@ include("${CMAKE_CURRENT_LIST_DIR}/tools/op.cmake")
# build library
if(ANDROID_NDK_TOOLCHAIN_INCLUDED)
list(REMOVE_DUPLICATES CMAKE_CXX_FLAGS)
add_library(paddle-mobile SHARED ${PADDLE_MOBILE_CC} ${PADDLE_MOBILE_H})
elseif(IS_IOS)
...@@ -168,9 +167,9 @@ elseif(IS_IOS)
else()
add_library(paddle-mobile STATIC ${PADDLE_MOBILE_CC} ${PADDLE_MOBILE_H})
endif()
else()
add_library(paddle-mobile SHARED ${PADDLE_MOBILE_CC} ${PADDLE_MOBILE_H})
endif()
# unit test
if(DEBUGING)
......
...@@ -110,7 +110,8 @@ Paddle-Mobile is a project under the PaddlePaddle organization, dedicated to embedded pla…
### Development documentation
The development documentation mainly covers building, running, and related topics. As a developer, you can use it together with the contribution documentation.
* [iOS](https://github.com/PaddlePaddle/paddle-mobile/blob/develop/doc/development_ios.md)
* [Android](https://github.com/PaddlePaddle/paddle-mobile/blob/develop/doc/development_android.md)
### Contribution documentation
- [Contribution documentation](https://github.com/PaddlePaddle/paddle-mobile/blob/develop/CONTRIBUTING.md)
......
|mobilenet arm v7|1 thread|2 threads|4 threads|
|------------|----|-----|-----|
|Kirin 970 (ms)|108.180|63.935|37.545|
|Kirin 960 (ms)|108.588|63.073|36.822|
|Snapdragon 845 (ms)|85.952|48.890|28.641|
|Snapdragon 835 (ms)|105.434|62.752|37.131|

|mobilenetssd arm v7|1 thread|2 threads|4 threads|
|------------|----|-----|-----|
|Kirin 970 (ms)|212.686|127.205|77.485|
|Kirin 960 (ms)|212.641|125.338|75.250|
|Snapdragon 845 (ms)|182.863|95.671|56.857|
|Snapdragon 835 (ms)|213.849|127.717|77.006|

|googlenet(v1) arm v7|1 thread|2 threads|4 threads|
|------------|----|-----|-----|
|Kirin 970 (ms)|335.288|234.559|161.295|
|Kirin 960 (ms)|354.443|232.642|157.815|
|Snapdragon 845 (ms)|282.007|173.146|122.148|
|Snapdragon 835 (ms)|341.250|233.354|158.554|

|squeezenet arm v7|1 thread|2 threads|4 threads|
|------------|----|-----|-----|
|Kirin 970 (ms)|83.726|57.944|36.923|
|Kirin 960 (ms)|85.835|55.762|36.496|
|Snapdragon 845 (ms)|71.301|41.618|28.785|
|Snapdragon 835 (ms)|82.407|56.176|36.455|

|yolo arm v7|1 thread|2 threads|4 threads|
|------------|----|-----|-----|
|Kirin 970 (ms)|129.658|79.993|49.969|
|Kirin 960 (ms)|130.208|78.791|48.390|
|Snapdragon 845 (ms)|109.244|61.736|40.600|
|Snapdragon 835 (ms)|130.402|80.863|50.359|

Test device information:
Kirin 970: Honor V10 (2.36GHz * 4 + 1.8GHz * 4)
Kirin 960: Huawei Mate 9 (2.36GHz * 4 + 1.8GHz * 4)
Snapdragon 835: Xiaomi Mi 6 (2.45GHz * 4 + 1.9GHz * 4)
Snapdragon 845: OPPO Find X (2.80GHz * 4 + 1.8GHz * 4)
\ No newline at end of file
|mobilenetfssd|Speed|
|------------|-----|
|A9 (ms)|33.78|
|A10 (ms)|24.05|
|A11 (ms)|17.15|

|genet|Speed|
|------------|-----|
|A9 (ms)|3.49|
|A10 (ms)|2.54|
|A11 (ms)|1.43|
\ No newline at end of file
...@@ -3,7 +3,7 @@
#### The following is the execution flow chart of the paddle-mobile code:
![Execution flow chart](http://mms-graph.bj.bcebos.com/paddle-mobile/git_images/flow_chart.png)
#### It is mainly divided into the Loader module, Program module, Executor module, op module, kernel module, and scope/variable/Tensor module
...@@ -14,12 +14,12 @@
Let's first look at the model. A model comes in one of two layouts:
In one layout the parameter files are separate. As shown below, the red box is the protobuf file describing the model structure, and the remaining files are parameter files.
![Model description](http://mms-graph.bj.bcebos.com/paddle-mobile/git_images/model_desc.png)
In the other layout the parameters are combined into a single file. As shown below, the red box is the protobuf file describing the model structure, and the other file holds all the parameters combined.
![Model description (combined)](http://mms-graph.bj.bcebos.com/paddle-mobile/git_images/model_desc_combined.png)
The loader module loads the model structure information (the protobuf file in the red box) into memory and optimizes the model structure, for example by fusing several fine-grained ops into a coarse-grained op, such as fusing conv, add, batchnorm, and relu into conv\_add\_batchnorm\_relu.
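To make the fusion step concrete, here is a minimal, self-contained sketch of the idea (not the actual loader code; `FuseOps` and the op names in `main` are only illustrative), replacing a fine-grained op sequence with one fused op:

```cpp
#include <iostream>
#include <string>
#include <vector>

// Toy illustration of operator fusion: scan the op list and replace the
// sequence conv -> add -> batchnorm -> relu with a single fused op.
std::vector<std::string> FuseOps(std::vector<std::string> ops) {
  const std::vector<std::string> pattern = {"conv", "add", "batchnorm", "relu"};
  std::vector<std::string> fused;
  for (size_t i = 0; i < ops.size();) {
    bool match = i + pattern.size() <= ops.size();
    for (size_t j = 0; match && j < pattern.size(); ++j) {
      match = (ops[i + j] == pattern[j]);
    }
    if (match) {
      fused.push_back("conv_add_batchnorm_relu");  // coarse-grained fused op
      i += pattern.size();
    } else {
      fused.push_back(ops[i]);
      ++i;
    }
  }
  return fused;
}

int main() {
  std::vector<std::string> program = {"feed", "conv",   "add",  "batchnorm",
                                      "relu", "pool2d", "fetch"};
  for (const auto &op : FuseOps(program)) std::cout << op << "\n";
  return 0;
}
```

The real optimizer works on a graph of op descriptors rather than a flat name list, but the pattern-match-and-replace idea is the same.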
...@@ -161,7 +161,7 @@ sh build.sh android yolo
### 5. kernel
A kernel is the low-level computation implementation of an op. It has two main functions, Init and Compute, which handle initialization/pre-processing and the actual computation respectively. It is worth noting that a kernel is specialized, via its template parameter, to different platforms, as shown below:
![Device specialization](http://mms-graph.bj.bcebos.com/paddle-mobile/git_images/devices.png)
The kernel implementations for different platforms are template specializations of the same kernel class. There are currently three platforms: arm, mali, and fpga. The central-arm-func\ directory in the figure holds the arm implementations of the op kernels; it provides the low-level implementation for the kernels under the arm\ directory. Since the arm processor also acts as the central processor, central-arm-func\ can serve as the fallback implementation for the other co-processors: for example, if an op kernel has no fpga implementation yet, it can directly call the arm implementation here.
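As a rough, hypothetical sketch of that specialization idea (simplified types, not the real framework classes), a kernel can be a class template specialized per device tag, with the fpga specialization falling back to the shared arm implementation:

```cpp
#include <iostream>
#include <vector>

// Device tags standing in for the real platform types.
struct ARM {};
struct FPGA {};

// Shared arm implementation, analogous to the central-arm-func/ directory.
namespace central_arm_func {
void ReluCompute(std::vector<float> *data) {
  for (float &v : *data) v = v > 0.f ? v : 0.f;
}
}  // namespace central_arm_func

// One kernel class, specialized per device.
template <typename Device>
struct ReluKernel;

template <>
struct ReluKernel<ARM> {
  bool Init() { return true; }              // initialization / pre-processing
  void Compute(std::vector<float> *data) {  // actual computation
    central_arm_func::ReluCompute(data);
  }
};

// An FPGA specialization with no native implementation yet can simply
// reuse the arm code, as described above.
template <>
struct ReluKernel<FPGA> {
  bool Init() { return true; }
  void Compute(std::vector<float> *data) { central_arm_func::ReluCompute(data); }
};

int main() {
  std::vector<float> x = {-1.f, 2.f, -3.f};
  ReluKernel<ARM> kernel;
  if (kernel.Init()) kernel.Compute(&x);
  for (float v : x) std::cout << v << " ";
  std::cout << "\n";
  return 0;
}
```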
......
### iOS & Android development documentation
# iOS development documentation
## Building
```sh
# from the paddle-mobile directory:
cd tools
sh build.sh ios
# to build only the ops needed by a specific model, run:
sh build.sh ios googlenet
# the generated .a library can be found in this folder:
cd ../build/release/ios/build
```
#### Common problems:
1. No iOS SDK's found in default search path ...
This happens because tools/ios-cmake/ios.toolchain.cmake cannot find the path of the iOS SDK you are using, so you need to specify it yourself.
Taking my current environment as an example, add your local iOS SDK path before line 143 of tools/ios-cmake/ios.toolchain.cmake: set(CMAKE_IOS_SDK_ROOT "/Applications/Xcode.app/Contents/Developer/Platforms/iPhoneOS.platform/Developer/SDKs/iPhoneOS.sdk")
## Integration
```
Generated in the previous step:
libpaddle-mobile.a
Under /src/ios_io/:
PaddleMobile.h
```
Drag these into your Xcode project.
#### Objective-C interface
The interface is as follows:
```
/*
 Create the object
*/
- (instancetype)init;
/*
 Load the model and allocate memory
*/
- (BOOL)load:(NSString *)modelPath andWeightsPath:(NSString *)weighsPath;
/*
 Run prediction; means and scale are the pre-processing parameters used when the model was trained. If no such pre-processing was done during training, use the plain predict method below.
*/
- (NSArray *)predict:(CGImageRef)image dim:(NSArray<NSNumber *> *)dim means:(NSArray<NSNumber *> *)means scale:(float)scale;
/*
 Run prediction
*/
- (NSArray *)predict:(CGImageRef)image dim:(NSArray<NSNumber *> *)dim;
/*
 Release memory
*/
- (void)clear;
```
# Android development documentation
Users can cross-compile the paddle-mobile library for the Android platform in either of the following two ways:
......
# iOS development documentation
## CPU
Requires: Xcode
### Building
```sh
# from the paddle-mobile directory:
cd tools
sh build.sh ios
# to build only the ops needed by a specific model, run:
sh build.sh ios googlenet
# the generated .a library can be found in this folder:
cd ../build/release/ios/build
```
#### Common problems:
1. No iOS SDK's found in default search path ...
This happens because tools/ios-cmake/ios.toolchain.cmake cannot find the path of the iOS SDK you are using, so you need to specify it yourself.
Taking my current environment as an example, add your local iOS SDK path before line 143 of tools/ios-cmake/ios.toolchain.cmake: set(CMAKE_IOS_SDK_ROOT "/Applications/Xcode.app/Contents/Developer/Platforms/iPhoneOS.platform/Developer/SDKs/iPhoneOS.sdk")
### Integration
```
Generated in the previous step:
libpaddle-mobile.a
Under /src/ios_io/:
PaddleMobile.h
```
Drag these into your Xcode project.
#### Objective-C interface
The interface is as follows:
```
/*
 Create the object
*/
- (instancetype)init;
/*
 Load the model and allocate memory
*/
- (BOOL)load:(NSString *)modelPath andWeightsPath:(NSString *)weighsPath;
/*
 Run prediction; means and scale are the pre-processing parameters used when the model was trained. If no such pre-processing was done during training, use the plain predict method below.
*/
- (NSArray *)predict:(CGImageRef)image dim:(NSArray<NSNumber *> *)dim means:(NSArray<NSNumber *> *)means scale:(float)scale;
/*
 Run prediction
*/
- (NSArray *)predict:(CGImageRef)image dim:(NSArray<NSNumber *> *)dim;
/*
 Release memory
*/
- (void)clear;
```
## GPU
Requires: Xcode and CocoaPods
```
# from the paddle-mobile directory:
cd metal
pod install
open paddle-mobile.xcworkspace
```
## Paddle-Mobile
This folder is used to develop the Metal version for the iOS GPU.
Requires: Xcode and CocoaPods
```
pod install
open paddle-mobile.xcworkspace
```
The models the demo depends on can be downloaded [here](http://mms-graph.bj.bcebos.com/paddle-mobile%2FmodelsAndImages.zip).
...@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#ifdef PADDLE_EXECUTOR_MULTITHREAD
#include <string>
#include <unordered_map>
...@@ -60,6 +61,7 @@ class depCore {
std::vector<std::vector<int>> deps;
std::vector<std::vector<int>> next;
};
} // namespace paddle_mobile
#endif
...@@ -63,6 +63,9 @@ const char *G_OP_TYPE_BILINEAR_INTERP = "bilinear_interp";
const char *G_OP_TYPE_FLATTEN = "flatten";
const char *G_OP_TYPE_SHAPE = "shape";
const char *G_OP_TYPE_QUANTIZE = "quantize";
const char *G_OP_TYPE_DEQUANTIZE = "dequantize";
std::unordered_map<
std::string, std::pair<std::vector<std::string>, std::vector<std::string>>>
op_input_output_key = {
...@@ -111,6 +114,8 @@ std::unordered_map<
{G_OP_TYPE_BILINEAR_INTERP, {{"OutSize", "X"}, {"Out"}}},
{G_OP_TYPE_FLATTEN, {{"X"}, {"Out"}}},
{G_OP_TYPE_SHAPE, {{"Input"}, {"Out"}}},
{G_OP_TYPE_CONV_TRANSPOSE, {{"Input"}, {"Output"}}},
{G_OP_TYPE_QUANTIZE, {{"X"}, {"Out", "OutScale"}}},
{G_OP_TYPE_DEQUANTIZE, {{"X", "Scale"}, {"Out"}}}};
} // namespace paddle_mobile
...@@ -79,6 +79,13 @@ enum PMStatus {
PMWrongDevice = 0x08 /*!< un-correct device. */
};
enum RoundType {
ROUND_UNK = 0,
ROUND_NEAREST_AWAY_ZERO = 1,
ROUND_NEAREST_TOWARDS_ZERO = 2,
ROUND_NEAREST_TO_EVEN = 3
};
extern const char *G_OP_TYPE_CONV;
extern const char *G_OP_TYPE_BATCHNORM;
extern const char *G_OP_TYPE_BOX_CODER;
...@@ -120,6 +127,9 @@ extern const char *G_OP_TYPE_FUSION_CONV_BN;
extern const char *G_OP_TYPE_CONV_TRANSPOSE;
extern const char *G_OP_TYPE_PRELU;
extern const char *G_OP_TYPE_QUANTIZE;
extern const char *G_OP_TYPE_DEQUANTIZE;
extern std::unordered_map<
std::string, std::pair<std::vector<std::string>, std::vector<std::string>>>
op_input_output_key;
......
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "common/util.h"
namespace paddle_mobile {
char *ReadFileToBuff(std::string filename) {
FILE *file = fopen(filename.c_str(), "rb");
PADDLE_MOBILE_ENFORCE(file != nullptr, "can't open file: %s ",
filename.c_str());
fseek(file, 0, SEEK_END);
int64_t size = ftell(file);
PADDLE_MOBILE_ENFORCE(size > 0, "file should not be empty");
rewind(file);
char *data = new char[size];
size_t bytes_read = fread(data, 1, size, file);
PADDLE_MOBILE_ENFORCE(bytes_read == size,
"read binary file bytes do not match with fseek");
fclose(file);
return data;
}
} // namespace paddle_mobile
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <string>
#include "common/enforce.h"
namespace paddle_mobile {
char *ReadFileToBuff(std::string filename);
} // namespace paddle_mobile
...@@ -57,7 +57,12 @@ class RawData {
public:
char data[size];
RawData() {}
RawData(const RawData &raw_data) { memcpy(data, raw_data.data, size); }
RawData &operator=(const RawData &raw_data) {
memcpy(data, raw_data.data, size);
return *this;
}
};
template <typename... Ts>
...@@ -74,15 +79,37 @@ struct Variant {
template <typename T, typename... Args>
void Set(Args &&... args) {
helper::Destroy(type_id, data.data);
new (data.data) T(std::forward<Args>(args)...);
type_id = typeid(T).hash_code();
}
void SetString(std::string &string) {
helper::Destroy(type_id, data.data);
type_id = typeid(std::string).hash_code();
strcpy(data.data, string.c_str());
}
std::string GetString() const {
if (type_id == typeid(std::string).hash_code()) {
return std::string(data.data);
} else {
PADDLE_MOBILE_THROW_EXCEPTION(
" bad cast in variant data type not a string ");
exit(0);
}
}
template <typename T>
T &Get() const {
if (type_id == typeid(std::string).hash_code()) {
PADDLE_MOBILE_THROW_EXCEPTION(
"Please use getString to get an string (to avoid of an issue with "
"gcc "
"stl lib with string copy)");
exit(0);
} else if (type_id == typeid(T).hash_code()) {
return *const_cast<T *>(reinterpret_cast<const T *>(data.data));
} else {
PADDLE_MOBILE_THROW_EXCEPTION(" bad cast in variant");
exit(0);
...@@ -95,7 +122,8 @@ struct Variant {
static inline size_t invalid_type() { return typeid(void).hash_code(); }
typedef VariantHelper<Ts...> helper;
size_t type_id;
// todo use an anto size to suite this.
RawData<64> data;
};
template <typename T>
......
...@@ -12,15 +12,15 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "fpga/api.h"
#include <fcntl.h>
#include <sys/ioctl.h>
#include <sys/mman.h>
#include <algorithm>
#include <map>
#include "fpga/bias_scale.h"
#include "fpga/filter.h"
#include "fpga/image.h"
#define FPGA_TEST_MODE
#define PADDLE_MOBILE_OS_LINUX
...@@ -59,8 +59,8 @@ void *fpga_malloc(size_t size) {
#endif
counter += size;
memory_map.insert(std::make_pair(ptr, size));
// DLOG << "Address: " << ptr << ", " << size << " bytes allocated. Total "
// << counter << " bytes";
return ptr;
}
...@@ -78,8 +78,8 @@ void fpga_free(void *ptr) {
free(ptr);
#endif
counter += size;
// DLOG << "Address: " << ptr << ", " << size << " bytes freed. Total "
// << counter << " bytes";
} else {
DLOG << "Invalid pointer";
}
...@@ -103,6 +103,27 @@ int fpga_invalidate(void *address, size_t size) {
return do_ioctl(IOCTL_MEMCACHE_INVAL, &args);
}
half fp32_2_fp16(float fp32_num) {
unsigned long tmp = *(unsigned long *)(&fp32_num); // NOLINT
half t = ((tmp & 0x007fffff) >> 13) | ((tmp & 0x80000000) >> 16) |
(((tmp & 0x7f800000) >> 13) - (112 << 10));
if (tmp & 0x1000) {
t++; // roundoff
}
return t;
}
float fp16_2_fp32(half fp16_num) {
int frac = (fp16_num & 0x3ff);
int exp = ((fp16_num & 0x7c00) >> 10) + 112;
int s = fp16_num & 0x8000;
int tmp = 0;
float fp32_num;
tmp = s << 16 | exp << 23 | frac << 13;
fp32_num = *(float *)&tmp; // NOLINT
return fp32_num;
}
int ComputeBasicConv(const struct ConvArgs &args) {
DLOG << "======Compute Basic Conv======";
DLOG << " relu_enabled:" << args.relu_enabled
...@@ -148,6 +169,8 @@ int ComputeFpgaConv(const struct WrapperConvArgs &args) {
int ComputeFpgaPool(const struct PoolingArgs &args) {
#ifdef FPGA_TEST_MODE
DLOG << "=============ComputeFpgaPool===========";
DLOG << " mode:" << args.mode
<< " kernel_reciprocal:" << fp16_2_fp32(args.kernel_reciprocal);
DLOG << " image_address:" << args.image.address
<< " image_scale_address:" << args.image.scale_address
<< " image_channels:" << args.image.channels
...@@ -240,7 +263,7 @@ void format_image(framework::Tensor *image_tensor) {
auto channel = dims[1], height = dims[2], width = dims[3];
auto data_ptr = image_tensor->data<float>();
size_t memory_size = channel * height * width * sizeof(float);
auto new_data = (float *)fpga_malloc(memory_size); // NOLINT
fpga_copy(new_data, data_ptr, memory_size);
image::format_image(&new_data, channel, height, width);
image_tensor->reset_data_ptr(new_data);
...@@ -311,19 +334,33 @@ int get_aligned_filter_num(int num) {
void format_filter(framework::Tensor *filter_tensor, float max_value,
int group_num) {
filter_tensor->scale[0] = float(max_value / 127.0); // NOLINT
filter_tensor->scale[1] = float(127.0 / max_value); // NOLINT
auto dims = filter_tensor->dims();
auto num = dims[0], channel = dims[1], height = dims[2], width = dims[3];
auto data_ptr = filter_tensor->data<float>();
size_t memory_size = num * channel * height * width * sizeof(float);
auto new_data = (float *)fpga_malloc(memory_size); // NOLINT
fpga_copy(new_data, data_ptr, memory_size);
filter::format_filter(&new_data, num, channel, height, width, group_num,
max_value);
filter_tensor->reset_data_ptr(new_data);
}
void format_fc_filter(framework::Tensor *filter_tensor, float max_value) {
filter_tensor->scale[0] = float(max_value / 127.0); // NOLINT
filter_tensor->scale[1] = float(127.0 / max_value); // NOLINT
auto dims = filter_tensor->dims();
auto num = dims[0], channel = dims[1], height = dims[2], width = dims[3];
auto data_ptr = filter_tensor->data<float>();
size_t memory_size = num * channel * height * width * sizeof(float);
auto new_data = (float *)fpga_malloc(memory_size); // NOLINT
fpga_copy(new_data, data_ptr, memory_size);
filter::format_fc_filter(&new_data, num, channel, height, width, 1,
max_value);
filter_tensor->reset_data_ptr(new_data);
}
void format_bias_scale_array(float **bias_scale_array,
int element_num_per_division, int num) {
bias_scale::format_bias_scale_array(bias_scale_array,
...@@ -358,7 +395,8 @@ void fill_conv_arg(struct WrapperConvArgs *arg, framework::Tensor *input,
arg->filter_num = (uint32_t)filter->dims()[0];
arg->output.address = out_ptr;
arg->output.scale_address = out->scale;
arg->conv_args =
(ConvArgs *)fpga_malloc(arg->split_num * sizeof(ConvArgs)); // NOLINT
arg->concat_arg.image_num = arg->split_num;
arg->concat_arg.image_out = out_ptr;
...@@ -367,12 +405,15 @@ void fill_conv_arg(struct WrapperConvArgs *arg, framework::Tensor *input,
arg->concat_arg.width = (uint32_t)filter->dims()[3];
int n = arg->split_num;
arg->concat_arg.images_in =
(half **)fpga_malloc(n * sizeof(int *)); // NOLINT
arg->concat_arg.scales_in =
(float **)fpga_malloc(n * sizeof(float *)); // NOLINT
arg->concat_arg.channel_num =
(uint32_t *)fpga_malloc(n * sizeof(uint32_t)); // NOLINT
arg->concat_arg.image_out = out_ptr;
auto channel = (int)out->dims()[1]; // NOLINT
int filter_num_per_div = get_filter_num_per_div(filter, group_num);
int element_num = get_aligned_filter_element_num(
filter->dims()[1] * filter->dims()[2] * filter->dims()[3]);
...@@ -392,29 +433,28 @@ void fill_conv_arg(struct WrapperConvArgs *arg, framework::Tensor *input,
arg->conv_args[i].image.pad_height = (uint32_t)padding_h;
arg->conv_args[i].image.pad_width = (uint32_t)padding_w;
arg->conv_args[i].filter_scale_address = filter->scale;
arg->conv_args[i].filter_address = &(
(int8_t *)filter_ptr)[i * element_num * filter_num_per_div]; // NOLINT
arg->conv_args[i].sb_address = &bs_ptr[i * filter_num_per_div * 2];
arg->conv_args[i].filter_num = (uint32_t)(
i == n - 1 ? channel - (n - 1) * filter_num_per_div // NOLINT
: filter_num_per_div);
if (n > 1) {
arg->conv_args[i].output.scale_address =
(float *)fpga_malloc(2 * sizeof(float)); // NOLINT
arg->conv_args[i].output.address = fpga_malloc(
input->dims()[2] *
align_to_x(input->dims()[3] * arg->conv_args[i].filter_num,
IMAGE_ALIGNMENT) *
sizeof(half));
} else {
arg->conv_args[i].output.scale_address = out->scale;
arg->conv_args[i].output.address = out_ptr;
}
arg->concat_arg.images_in[i] =
(half *)arg->conv_args[i].output.address; // NOLINT
arg->concat_arg.scales_in[i] = arg->conv_args[i].output.scale_address;
arg->concat_arg.channel_num[i] = arg->conv_args[i].filter_num;
}
......
...@@ -99,6 +99,8 @@ struct WrapperConvArgs {
};
struct PoolingArgs {
int16_t mode; // mode: 0:max, 1:avg
half kernel_reciprocal;
struct KernelArgs kernel;
struct ImageInputArgs image; // input image;
struct ImageOutputArgs output;
...@@ -212,6 +214,7 @@ int get_aligned_filter_element_num(int chw);
int get_aligned_filter_num(int num);
void format_filter(framework::Tensor* filter_tensor, float max_value,
int group_num);
void format_fc_filter(framework::Tensor* filter_tensor, float max_value);
void format_bias_scale_array(float** bias_scale_array,
int element_num_per_division, int num);
void format_concat_output(framework::Tensor* out, int height, int width,
...@@ -222,5 +225,8 @@ void fill_conv_arg(struct WrapperConvArgs* arg, framework::Tensor* input,
bool relu_enabled, int group_num, int stride_h, int stride_w,
int padding_h, int padding_w, float* bs_ptr);
half fp32_2_fp16(float fp32_num);
float fp16_2_fp32(half fp16_num);
} // namespace fpga
} // namespace paddle_mobile
...@@ -12,9 +12,9 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "fpga/bias_scale.h"
#include <memory.h>
#include "fpga/api.h"
namespace paddle_mobile {
namespace fpga {
...@@ -29,7 +29,8 @@ void align_element(float **data_in, int num_per_div_before_alignment, int num) {
align_to_x(num_per_div_before_alignment, BS_NUM_ALIGNMENT);
int num_element =
2 * div_num * num_per_div_after_alignment; // including bias & scale
float *ptr_aligned =
(float *)fpga_malloc(num_element * sizeof(float)); // NOLINT
memset(ptr_aligned, 0, num_element * sizeof(float));
...@@ -59,7 +60,7 @@ void interleave(float **data_in, int num_after_alignment) {
float *ptr_uninterleaved = *data_in;
float *ptr_interleaved =
(float *)fpga_malloc(2 * num_after_alignment * sizeof(float)); // NOLINT
int num = num_after_alignment / 4;
for (int i = 0; i < num; i++) {
memcpy(ptr_interleaved + 8 * i, ptr_uninterleaved + 4 * i,
......
...@@ -11,9 +11,11 @@ distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "fpga/filter.h"
#include <memory.h>
#include <algorithm>
#include "fpga/api.h"
namespace paddle_mobile {
namespace fpga {
...@@ -55,7 +57,7 @@ void convert_to_hwc(char **data_in, int num, int channel, int height,
int width) {
char *tmp = *data_in;
int chw = channel * height * width;
char *data_tmp = (char *)fpga_malloc(chw * num * sizeof(char)); // NOLINT
for (int n = 0; n < num; n++) {
int64_t amount_per_row = width * channel;
for (int c = 0; c < channel; c++) {
...@@ -83,16 +85,26 @@ float find_max(float *data_in, int data_size) {
return max;
}
signed char float_to_int8(float fdata) {
if (fdata < 0.0) {
fdata -= 0.5;
} else {
fdata += 0.5;
}
return (signed char)fdata;
}
void quantize(float **data_in, int data_size, float max) {
float *tmp = *data_in;
float fix_range = 127;
float scale = fix_range / max;
signed char *tmp_data = (signed char *)fpga_malloc(data_size * sizeof(char));
for (int i = 0; i < data_size; i++) {
tmp_data[i] = float_to_int8(
(*data_in)[i] * scale); // (signed char)((*data_in)[i] * scale);
}
*data_in = (float *)tmp_data; // NOLINT
fpga_free(tmp);
}
...@@ -102,7 +114,8 @@ void align_element(char **data_in, int num, int chw) {
int align_chw = align_to_x(chw, FILTER_ELEMENT_ALIGNMENT);
if (align_chw != chw) {
char *tmp = *data_in;
char *data_tmp =
(char *)fpga_malloc(num * align_chw * sizeof(char)); // NOLINT
memset(data_tmp, 0, num * align_chw);
for (j = 0; j < num; j++) {
...@@ -124,7 +137,7 @@ void align_num(char **data_in, int num_per_div_before_alignment, int num,
int div_num =
(num + num_per_div_before_alignment - 1) / num_per_div_before_alignment;
int num_element = div_num * num_per_div_after_alignment * align_chw;
char *data_tmp = (char *)fpga_malloc(num_element * sizeof(char)); // NOLINT
memset(data_tmp, 0, num_element * sizeof(char));
...@@ -146,7 +159,8 @@ void reorder(char **data_in, int num_after_alignment, int chw) {
int chw_align = align_to_x(chw, FILTER_ELEMENT_ALIGNMENT);
char *data_tmp =
(char *)fpga_malloc(chw_align * num_after_alignment * // NOLINT
sizeof(char));
char *tmp = *data_in;
for (index = 0; index < num_after_alignment; index++) {
new_index = index / 32 * 32 + (index % 16 / 4 * 8) + (index % 16 % 4) +
...@@ -163,10 +177,11 @@ void interleave(char **data_in, int num_after_alignment, int chw) {
int j = 0;
int k = 0;
int interleave_per_num = 16;
;
int chw_align = align_to_x(chw, FILTER_ELEMENT_ALIGNMENT);
char *data_tmp =
(char *)fpga_malloc(chw_align * num_after_alignment * // NOLINT
sizeof(char));
char *tmp = *data_in;
int interleave_num = chw_align * 2 / interleave_per_num;
for (i = 0; i < num_after_alignment; i += 2) {
...@@ -199,7 +214,7 @@ void format_filter(float **data_in, int num, int channel, int height, int width,
quantize(data_in, data_size, max);
char **quantize_data = (char **)data_in; // NOLINT
convert_to_hwc(quantize_data, num, channel, height, width);
align_element(quantize_data, num, chw);
...@@ -210,6 +225,45 @@ void format_filter(float **data_in, int num, int channel, int height, int width,
num_after_alignment * sizeof(char));
}
void convert_fc_filter(char **data_in, int num, int chw) {
char *tmp = *data_in;
char *data_tmp = (char *)fpga_malloc(chw * num * sizeof(char)); // NOLINT
for (int n = 0; n < num; n++) {
for (int c = 0; c < chw; c++) {
data_tmp[n * chw + c] = (*data_in)[num * c + n];
}
}
*data_in = data_tmp;
fpga_free(tmp);
}
void format_fc_filter(float **data_in, int num, int channel, int height,
int width, int group_num, float max) {
int data_size = channel * height * width * num;
int chw = channel * height * width;
int division_capacity = calc_division_capacity(chw);
int num_per_div_before_alignment =
calc_num_per_div(num, group_num, division_capacity);
int num_per_div_after_alignment =
align_to_x(num_per_div_before_alignment, FILTER_NUM_ALIGNMENT);
int div_num =
(num + num_per_div_before_alignment - 1) / num_per_div_before_alignment;
int num_after_alignment = num_per_div_after_alignment * div_num;
quantize(data_in, data_size, max);
char **quantize_data = (char **)data_in; // NOLINT
convert_fc_filter(quantize_data, num, chw);
align_element(quantize_data, num, chw);
align_num(quantize_data, num_per_div_before_alignment, num, chw);
reorder(quantize_data, num_after_alignment, chw);
interleave(quantize_data, num_after_alignment, chw);
fpga_flush(*quantize_data, align_to_x(chw, FILTER_ELEMENT_ALIGNMENT) *
num_after_alignment * sizeof(char));
}
} // namespace filter
} // namespace fpga
} // namespace paddle_mobile
...@@ -25,7 +25,7 @@ int calc_division_capacity(int chw);
int calc_split_num(int num, int division_capacity);
int calc_division_number(int num, int group_num, int division_capacity);
int calc_num_per_div(int num, int group_num, int division_capacity);
void convert_to_hwc(char** data_in, int num, int channel, int height,
int width);
float find_max(float* data_in, int data_size);
void quantize(float** data_in, int data_size, float max);
...@@ -36,6 +36,11 @@ void reorder(float** data_in, int num_after_alignment, int chw);
void interleave(float** data_in, int num_after_alignment, int chw);
void format_filter(float** data_in, int num, int channel, int height, int width,
int group_num, float max);
void convert_fc_filter(char** data_in, int num, int chw);
void format_fc_filter(float** data_in, int num, int channel, int height,
int width, int group_num, float max);
} // namespace filter
} // namespace fpga
} // namespace paddle_mobile
...@@ -12,9 +12,10 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "fpga/image.h"
#include <memory.h>
#include <algorithm>
#include "fpga/api.h"
namespace paddle_mobile {
namespace fpga {
...@@ -23,7 +24,7 @@ namespace image {
void convert_to_hwc(float **data_in, int channel, int height, int width) {
float *tmp = *data_in;
float *data_tmp =
(float *)fpga_malloc(channel * height * width * sizeof(float)); // NOLINT
int64_t amount_per_row = width * channel;
for (int c = 0; c < channel; c++) {
for (int h = 0; h < height; h++) {
...@@ -42,12 +43,14 @@ void align_element_conv(float **data_in, int height, int cw) {
int align_cw = align_to_x(cw, IMAGE_ALIGNMENT);
if (align_cw != cw) {
float *tmp = *data_in;
float *data_tmp =
(float *)fpga_malloc(height * align_cw * sizeof(float)); // NOLINT
memset(data_tmp, 0, height * align_cw * sizeof(float));
for (h = 0; h < height; h++) {
memcpy((void *)(data_tmp + h * align_cw), // NOLINT
(void *)(*data_in + h * cw), // NOLINT
cw * sizeof(float));
}
...@@ -95,7 +98,7 @@ void concat_images(int16_t **images_in, float **scales_in, void *image_out,
for (i = 0; i < image_num; i++) {
align_each_in_area_cw =
align_to_x(channel_num[i] * width, IMAGE_ALIGNMENT);
memcpy((int16_t *)image_out + tmp_channel + // NOLINT
k * align_each_out_area_cw_differ,
images_in[i] + j * channel_num[i] + k * align_each_in_area_cw,
channel_num[i] * sizeof(int16_t));
......
...@@ -51,7 +51,7 @@ class Attribute {
break;
}
case PADDLE_MOBILE__FRAMEWORK__PROTO__ATTR_TYPE__STRING: {
attr.SetString(std::string(attr_desc->s));
break;
}
case PADDLE_MOBILE__FRAMEWORK__PROTO__ATTR_TYPE__BOOLEANS: {
...@@ -108,6 +108,13 @@ class Attribute {
return variant_.Get<T>();
}
Attribute &SetString(std::string string) {
variant_.SetString(string);
return *this;
}
std::string GetString() const { return variant_.GetString(); }
template <typename Vistor>
static typename Vistor::type_t ApplyVistor(Vistor vistor, Attribute attr) {
if (attr.variant_.TypeId() == typeid(int).hash_code()) {
...@@ -115,7 +122,7 @@ class Attribute {
} else if (attr.variant_.TypeId() == typeid(float).hash_code()) {
return vistor(attr.variant_.Get<float>());
} else if (attr.variant_.TypeId() == typeid(string).hash_code()) {
return vistor(attr.variant_.GetString());
} else if (attr.variant_.TypeId() == typeid(vector<int>).hash_code()) {
return vistor(attr.variant_.Get<vector<int>>());
} else if (attr.variant_.TypeId() == typeid(vector<float>).hash_code()) {
......
...@@ -16,6 +16,7 @@ limitations under the License. */
#include <cstdlib>
#include <initializer_list>
#include <string>
#include <typeinfo>
#include <vector>
......
...@@ -7,6 +7,35 @@
#endif
#include "framework.pb-c.h"
void paddle_mobile__framework__proto__version__init(
PaddleMobile__Framework__Proto__Version *message) {
static const PaddleMobile__Framework__Proto__Version init_value =
PADDLE_MOBILE__FRAMEWORK__PROTO__VERSION__INIT;
*message = init_value;
}
size_t paddle_mobile__framework__proto__version__get_packed_size(
const PaddleMobile__Framework__Proto__Version *message) {
assert(message->base.descriptor ==
&paddle_mobile__framework__proto__version__descriptor);
return protobuf_c_message_get_packed_size(
(const ProtobufCMessage *)(message));
}
PaddleMobile__Framework__Proto__Version *
paddle_mobile__framework__proto__version__unpack(ProtobufCAllocator *allocator,
size_t len,
const uint8_t *data) {
return (PaddleMobile__Framework__Proto__Version *)protobuf_c_message_unpack(
&paddle_mobile__framework__proto__version__descriptor, allocator, len,
data);
}
void paddle_mobile__framework__proto__version__free_unpacked(
PaddleMobile__Framework__Proto__Version *message,
ProtobufCAllocator *allocator) {
if (!message) return;
assert(message->base.descriptor ==
&paddle_mobile__framework__proto__version__descriptor);
protobuf_c_message_free_unpacked((ProtobufCMessage *)message, allocator);
}
void paddle_mobile__framework__proto__op_desc__attr__init(
PaddleMobile__Framework__Proto__OpDesc__Attr *message) {
static const PaddleMobile__Framework__Proto__OpDesc__Attr init_value =
...@@ -32,7 +61,6 @@ size_t paddle_mobile__framework__proto__op_desc__get_packed_size(
return protobuf_c_message_get_packed_size(
(const ProtobufCMessage *)(message));
}
PaddleMobile__Framework__Proto__OpDesc *
paddle_mobile__framework__proto__op_desc__unpack(ProtobufCAllocator *allocator,
size_t len,
...@@ -74,7 +102,6 @@ size_t paddle_mobile__framework__proto__op_proto__get_packed_size(
return protobuf_c_message_get_packed_size(
(const ProtobufCMessage *)(message));
}
PaddleMobile__Framework__Proto__OpProto *
paddle_mobile__framework__proto__op_proto__unpack(ProtobufCAllocator *allocator,
size_t len,
...@@ -171,7 +198,6 @@ size_t paddle_mobile__framework__proto__var_desc__get_packed_size(
return protobuf_c_message_get_packed_size(
(const ProtobufCMessage *)(message));
}
PaddleMobile__Framework__Proto__VarDesc *
paddle_mobile__framework__proto__var_desc__unpack(ProtobufCAllocator *allocator,
size_t len,
...@@ -201,7 +227,6 @@ size_t paddle_mobile__framework__proto__block_desc__get_packed_size(
return protobuf_c_message_get_packed_size(
(const ProtobufCMessage *)(message));
}
PaddleMobile__Framework__Proto__BlockDesc *
paddle_mobile__framework__proto__block_desc__unpack(
ProtobufCAllocator *allocator, size_t len, const uint8_t *data) {
...@@ -230,7 +255,6 @@ size_t paddle_mobile__framework__proto__program_desc__get_packed_size(
return protobuf_c_message_get_packed_size(
(const ProtobufCMessage *)(message));
}
PaddleMobile__Framework__Proto__ProgramDesc *
paddle_mobile__framework__proto__program_desc__unpack(
ProtobufCAllocator *allocator, size_t len, const uint8_t *data) {
...@@ -247,8 +271,46 @@ void paddle_mobile__framework__proto__program_desc__free_unpacked(
&paddle_mobile__framework__proto__program_desc__descriptor);
protobuf_c_message_free_unpacked((ProtobufCMessage *)message, allocator);
}
static const int64_t
paddle_mobile__framework__proto__version__version__default_value = 0ll;
static const ProtobufCFieldDescriptor
paddle_mobile__framework__proto__version__field_descriptors[1] = {
{
"version", 1, PROTOBUF_C_LABEL_OPTIONAL, PROTOBUF_C_TYPE_INT64,
offsetof(PaddleMobile__Framework__Proto__Version, has_version),
offsetof(PaddleMobile__Framework__Proto__Version, version), NULL,
&paddle_mobile__framework__proto__version__version__default_value,
0, /* flags */
0, NULL, NULL /* reserved1,reserved2, etc */
},
};
static const unsigned
paddle_mobile__framework__proto__version__field_indices_by_name[] = {
0, /* field[0] = version */
};
static const ProtobufCIntRange
paddle_mobile__framework__proto__version__number_ranges[1 + 1] = {{1, 0},
{0, 1}};
const ProtobufCMessageDescriptor
paddle_mobile__framework__proto__version__descriptor = {
PROTOBUF_C__MESSAGE_DESCRIPTOR_MAGIC,
"paddle_mobile.framework.proto.Version",
"Version",
"PaddleMobile__Framework__Proto__Version",
"paddle_mobile.framework.proto",
sizeof(PaddleMobile__Framework__Proto__Version),
1,
paddle_mobile__framework__proto__version__field_descriptors,
paddle_mobile__framework__proto__version__field_indices_by_name,
1,
paddle_mobile__framework__proto__version__number_ranges,
(ProtobufCMessageInit)paddle_mobile__framework__proto__version__init,
NULL,
NULL,
NULL /* reserved[123] */
};
static const ProtobufCFieldDescriptor
paddle_mobile__framework__proto__op_desc__attr__field_descriptors[13] = {
{ {
"name", 1, PROTOBUF_C_LABEL_REQUIRED, PROTOBUF_C_TYPE_STRING, "name", 1, PROTOBUF_C_LABEL_REQUIRED, PROTOBUF_C_TYPE_STRING,
0, /* quantifier_offset */ 0, /* quantifier_offset */
...@@ -335,11 +397,20 @@ static const ProtobufCFieldDescriptor ...@@ -335,11 +397,20 @@ static const ProtobufCFieldDescriptor
NULL, 0, /* flags */ NULL, 0, /* flags */
0, NULL, NULL /* reserved1,reserved2, etc */ 0, NULL, NULL /* reserved1,reserved2, etc */
}, },
{
"blocks_idx", 14, PROTOBUF_C_LABEL_REPEATED, PROTOBUF_C_TYPE_INT32,
offsetof(PaddleMobile__Framework__Proto__OpDesc__Attr,
n_blocks_idx),
offsetof(PaddleMobile__Framework__Proto__OpDesc__Attr, blocks_idx),
NULL, NULL, 0, /* flags */
0, NULL, NULL /* reserved1,reserved2, etc */
},
}; };
static const unsigned static const unsigned
paddle_mobile__framework__proto__op_desc__attr__field_indices_by_name[] = { paddle_mobile__framework__proto__op_desc__attr__field_indices_by_name[] = {
8, /* field[8] = b */ 8, /* field[8] = b */
10, /* field[10] = block_idx */ 10, /* field[10] = block_idx */
12, /* field[12] = blocks_idx */
9, /* field[9] = bools */ 9, /* field[9] = bools */
3, /* field[3] = f */ 3, /* field[3] = f */
6, /* field[6] = floats */ 6, /* field[6] = floats */
...@@ -353,7 +424,7 @@ static const unsigned ...@@ -353,7 +424,7 @@ static const unsigned
}; };
static const ProtobufCIntRange static const ProtobufCIntRange
paddle_mobile__framework__proto__op_desc__attr__number_ranges[2 + 1] = { paddle_mobile__framework__proto__op_desc__attr__number_ranges[2 + 1] = {
{1, 0}, {10, 8}, {0, 12}}; {1, 0}, {10, 8}, {0, 13}};
const ProtobufCMessageDescriptor const ProtobufCMessageDescriptor
paddle_mobile__framework__proto__op_desc__attr__descriptor = { paddle_mobile__framework__proto__op_desc__attr__descriptor = {
PROTOBUF_C__MESSAGE_DESCRIPTOR_MAGIC, PROTOBUF_C__MESSAGE_DESCRIPTOR_MAGIC,
...@@ -362,7 +433,7 @@ const ProtobufCMessageDescriptor ...@@ -362,7 +433,7 @@ const ProtobufCMessageDescriptor
"PaddleMobile__Framework__Proto__OpDesc__Attr", "PaddleMobile__Framework__Proto__OpDesc__Attr",
"paddle_mobile.framework.proto", "paddle_mobile.framework.proto",
sizeof(PaddleMobile__Framework__Proto__OpDesc__Attr), sizeof(PaddleMobile__Framework__Proto__OpDesc__Attr),
12, 13,
paddle_mobile__framework__proto__op_desc__attr__field_descriptors, paddle_mobile__framework__proto__op_desc__attr__field_descriptors,
paddle_mobile__framework__proto__op_desc__attr__field_indices_by_name, paddle_mobile__framework__proto__op_desc__attr__field_indices_by_name,
2, 2,
...@@ -500,7 +571,7 @@ static const protobuf_c_boolean ...@@ -500,7 +571,7 @@ static const protobuf_c_boolean
paddle_mobile__framework__proto__op_proto__var__dispensable__default_value = paddle_mobile__framework__proto__op_proto__var__dispensable__default_value =
0; 0;
static const ProtobufCFieldDescriptor static const ProtobufCFieldDescriptor
paddle_mobile__framework__proto__op_proto__var__field_descriptors[5] = { paddle_mobile__framework__proto__op_proto__var__field_descriptors[6] = {
{ {
"name", 1, PROTOBUF_C_LABEL_REQUIRED, PROTOBUF_C_TYPE_STRING, "name", 1, PROTOBUF_C_LABEL_REQUIRED, PROTOBUF_C_TYPE_STRING,
0, /* quantifier_offset */ 0, /* quantifier_offset */
...@@ -546,6 +617,13 @@ static const ProtobufCFieldDescriptor ...@@ -546,6 +617,13 @@ static const ProtobufCFieldDescriptor
0, /* flags */ 0, /* flags */
0, NULL, NULL /* reserved1,reserved2, etc */ 0, NULL, NULL /* reserved1,reserved2, etc */
}, },
{
"reuse", 6, PROTOBUF_C_LABEL_OPTIONAL, PROTOBUF_C_TYPE_STRING,
0, /* quantifier_offset */
offsetof(PaddleMobile__Framework__Proto__OpProto__Var, reuse), NULL,
NULL, 0, /* flags */
0, NULL, NULL /* reserved1,reserved2, etc */
},
}; };
static const unsigned static const unsigned
paddle_mobile__framework__proto__op_proto__var__field_indices_by_name[] = { paddle_mobile__framework__proto__op_proto__var__field_indices_by_name[] = {
...@@ -554,10 +632,11 @@ static const unsigned ...@@ -554,10 +632,11 @@ static const unsigned
2, /* field[2] = duplicable */ 2, /* field[2] = duplicable */
3, /* field[3] = intermediate */ 3, /* field[3] = intermediate */
0, /* field[0] = name */ 0, /* field[0] = name */
5, /* field[5] = reuse */
}; };
static const ProtobufCIntRange static const ProtobufCIntRange
paddle_mobile__framework__proto__op_proto__var__number_ranges[1 + 1] = { paddle_mobile__framework__proto__op_proto__var__number_ranges[1 + 1] = {
{1, 0}, {0, 5}}; {1, 0}, {0, 6}};
const ProtobufCMessageDescriptor const ProtobufCMessageDescriptor
paddle_mobile__framework__proto__op_proto__var__descriptor = { paddle_mobile__framework__proto__op_proto__var__descriptor = {
PROTOBUF_C__MESSAGE_DESCRIPTOR_MAGIC, PROTOBUF_C__MESSAGE_DESCRIPTOR_MAGIC,
...@@ -566,7 +645,7 @@ const ProtobufCMessageDescriptor ...@@ -566,7 +645,7 @@ const ProtobufCMessageDescriptor
"PaddleMobile__Framework__Proto__OpProto__Var", "PaddleMobile__Framework__Proto__OpProto__Var",
"paddle_mobile.framework.proto", "paddle_mobile.framework.proto",
sizeof(PaddleMobile__Framework__Proto__OpProto__Var), sizeof(PaddleMobile__Framework__Proto__OpProto__Var),
5, 6,
paddle_mobile__framework__proto__op_proto__var__field_descriptors, paddle_mobile__framework__proto__op_proto__var__field_descriptors,
paddle_mobile__framework__proto__op_proto__var__field_indices_by_name, paddle_mobile__framework__proto__op_proto__var__field_indices_by_name,
1, 1,
...@@ -1012,7 +1091,7 @@ const ProtobufCMessageDescriptor ...@@ -1012,7 +1091,7 @@ const ProtobufCMessageDescriptor
NULL /* reserved[123] */ NULL /* reserved[123] */
}; };
static const ProtobufCEnumValue static const ProtobufCEnumValue
paddle_mobile__framework__proto__var_type__type__enum_values_by_number[19] = paddle_mobile__framework__proto__var_type__type__enum_values_by_number[22] =
{ {
{"BOOL", "PADDLE_MOBILE__FRAMEWORK__PROTO__VAR_TYPE__TYPE__BOOL", {"BOOL", "PADDLE_MOBILE__FRAMEWORK__PROTO__VAR_TYPE__TYPE__BOOL",
0}, 0},
...@@ -1057,31 +1136,29 @@ static const ProtobufCEnumValue ...@@ -1057,31 +1136,29 @@ static const ProtobufCEnumValue
{"RAW", "PADDLE_MOBILE__FRAMEWORK__PROTO__VAR_TYPE__TYPE__RAW", 17}, {"RAW", "PADDLE_MOBILE__FRAMEWORK__PROTO__VAR_TYPE__TYPE__RAW", 17},
{"TUPLE", "PADDLE_MOBILE__FRAMEWORK__PROTO__VAR_TYPE__TYPE__TUPLE", {"TUPLE", "PADDLE_MOBILE__FRAMEWORK__PROTO__VAR_TYPE__TYPE__TUPLE",
18}, 18},
{"SIZE_T",
"PADDLE_MOBILE__FRAMEWORK__PROTO__VAR_TYPE__TYPE__SIZE_T", 19},
{"UINT8", "PADDLE_MOBILE__FRAMEWORK__PROTO__VAR_TYPE__TYPE__UINT8",
20},
{"INT8", "PADDLE_MOBILE__FRAMEWORK__PROTO__VAR_TYPE__TYPE__INT8",
21},
}; };
static const ProtobufCIntRange static const ProtobufCIntRange
paddle_mobile__framework__proto__var_type__type__value_ranges[] = {{0, 0}, paddle_mobile__framework__proto__var_type__type__value_ranges[] = {{0, 0},
{0, 19}}; {0, 22}};
static const ProtobufCEnumValueIndex static const ProtobufCEnumValueIndex
paddle_mobile__framework__proto__var_type__type__enum_values_by_name[19] = { paddle_mobile__framework__proto__var_type__type__enum_values_by_name[22] = {
{"BOOL", 0}, {"BOOL", 0}, {"CHANNEL", 16},
{"CHANNEL", 16}, {"FEED_MINIBATCH", 9}, {"FETCH_LIST", 10},
{"FEED_MINIBATCH", 9}, {"FP16", 4}, {"FP32", 5},
{"FETCH_LIST", 10}, {"FP64", 6}, {"INT16", 1},
{"FP16", 4}, {"INT32", 2}, {"INT64", 3},
{"FP32", 5}, {"INT8", 21}, {"LOD_RANK_TABLE", 12},
{"FP64", 6}, {"LOD_TENSOR", 7}, {"LOD_TENSOR_ARRAY", 13},
{"INT16", 1}, {"PLACE_LIST", 14}, {"RAW", 17},
{"INT32", 2}, {"READER", 15}, {"SELECTED_ROWS", 8},
{"INT64", 3}, {"SIZE_T", 19}, {"STEP_SCOPES", 11},
{"LOD_RANK_TABLE", 12}, {"TUPLE", 18}, {"UINT8", 20},
{"LOD_TENSOR", 7},
{"LOD_TENSOR_ARRAY", 13},
{"PLACE_LIST", 14},
{"RAW", 17},
{"READER", 15},
{"SELECTED_ROWS", 8},
{"STEP_SCOPES", 11},
{"TUPLE", 18},
}; };
const ProtobufCEnumDescriptor const ProtobufCEnumDescriptor
paddle_mobile__framework__proto__var_type__type__descriptor = { paddle_mobile__framework__proto__var_type__type__descriptor = {
...@@ -1090,9 +1167,9 @@ const ProtobufCEnumDescriptor ...@@ -1090,9 +1167,9 @@ const ProtobufCEnumDescriptor
"Type", "Type",
"PaddleMobile__Framework__Proto__VarType__Type", "PaddleMobile__Framework__Proto__VarType__Type",
"paddle_mobile.framework.proto", "paddle_mobile.framework.proto",
19, 22,
paddle_mobile__framework__proto__var_type__type__enum_values_by_number, paddle_mobile__framework__proto__var_type__type__enum_values_by_number,
19, 22,
paddle_mobile__framework__proto__var_type__type__enum_values_by_name, paddle_mobile__framework__proto__var_type__type__enum_values_by_name,
1, 1,
paddle_mobile__framework__proto__var_type__type__value_ranges, paddle_mobile__framework__proto__var_type__type__value_ranges,
...@@ -1325,7 +1402,7 @@ const ProtobufCMessageDescriptor ...@@ -1325,7 +1402,7 @@ const ProtobufCMessageDescriptor
NULL /* reserved[123] */ NULL /* reserved[123] */
}; };
static const ProtobufCFieldDescriptor static const ProtobufCFieldDescriptor
paddle_mobile__framework__proto__program_desc__field_descriptors[1] = { paddle_mobile__framework__proto__program_desc__field_descriptors[2] = {
{ {
"blocks", 1, PROTOBUF_C_LABEL_REPEATED, PROTOBUF_C_TYPE_MESSAGE, "blocks", 1, PROTOBUF_C_LABEL_REPEATED, PROTOBUF_C_TYPE_MESSAGE,
offsetof(PaddleMobile__Framework__Proto__ProgramDesc, n_blocks), offsetof(PaddleMobile__Framework__Proto__ProgramDesc, n_blocks),
...@@ -1334,14 +1411,23 @@ static const ProtobufCFieldDescriptor ...@@ -1334,14 +1411,23 @@ static const ProtobufCFieldDescriptor
0, /* flags */ 0, /* flags */
0, NULL, NULL /* reserved1,reserved2, etc */ 0, NULL, NULL /* reserved1,reserved2, etc */
}, },
{
"version", 2, PROTOBUF_C_LABEL_OPTIONAL, PROTOBUF_C_TYPE_MESSAGE,
0, /* quantifier_offset */
offsetof(PaddleMobile__Framework__Proto__ProgramDesc, version),
&paddle_mobile__framework__proto__version__descriptor, NULL,
0, /* flags */
0, NULL, NULL /* reserved1,reserved2, etc */
},
}; };
static const unsigned static const unsigned
paddle_mobile__framework__proto__program_desc__field_indices_by_name[] = { paddle_mobile__framework__proto__program_desc__field_indices_by_name[] = {
0, /* field[0] = blocks */ 0, /* field[0] = blocks */
1, /* field[1] = version */
}; };
static const ProtobufCIntRange static const ProtobufCIntRange
paddle_mobile__framework__proto__program_desc__number_ranges[1 + 1] = { paddle_mobile__framework__proto__program_desc__number_ranges[1 + 1] = {
{1, 0}, {0, 1}}; {1, 0}, {0, 2}};
const ProtobufCMessageDescriptor const ProtobufCMessageDescriptor
paddle_mobile__framework__proto__program_desc__descriptor = { paddle_mobile__framework__proto__program_desc__descriptor = {
PROTOBUF_C__MESSAGE_DESCRIPTOR_MAGIC, PROTOBUF_C__MESSAGE_DESCRIPTOR_MAGIC,
...@@ -1350,7 +1436,7 @@ const ProtobufCMessageDescriptor ...@@ -1350,7 +1436,7 @@ const ProtobufCMessageDescriptor
"PaddleMobile__Framework__Proto__ProgramDesc", "PaddleMobile__Framework__Proto__ProgramDesc",
"paddle_mobile.framework.proto", "paddle_mobile.framework.proto",
sizeof(PaddleMobile__Framework__Proto__ProgramDesc), sizeof(PaddleMobile__Framework__Proto__ProgramDesc),
1, 2,
paddle_mobile__framework__proto__program_desc__field_descriptors, paddle_mobile__framework__proto__program_desc__field_descriptors,
paddle_mobile__framework__proto__program_desc__field_indices_by_name, paddle_mobile__framework__proto__program_desc__field_indices_by_name,
1, 1,
...@@ -1362,7 +1448,7 @@ const ProtobufCMessageDescriptor ...@@ -1362,7 +1448,7 @@ const ProtobufCMessageDescriptor
NULL /* reserved[123] */ NULL /* reserved[123] */
}; };
static const ProtobufCEnumValue static const ProtobufCEnumValue
paddle_mobile__framework__proto__attr_type__enum_values_by_number[10] = { paddle_mobile__framework__proto__attr_type__enum_values_by_number[11] = {
{"INT", "PADDLE_MOBILE__FRAMEWORK__PROTO__ATTR_TYPE__INT", 0}, {"INT", "PADDLE_MOBILE__FRAMEWORK__PROTO__ATTR_TYPE__INT", 0},
{"FLOAT", "PADDLE_MOBILE__FRAMEWORK__PROTO__ATTR_TYPE__FLOAT", 1}, {"FLOAT", "PADDLE_MOBILE__FRAMEWORK__PROTO__ATTR_TYPE__FLOAT", 1},
{"STRING", "PADDLE_MOBILE__FRAMEWORK__PROTO__ATTR_TYPE__STRING", 2}, {"STRING", "PADDLE_MOBILE__FRAMEWORK__PROTO__ATTR_TYPE__STRING", 2},
...@@ -1373,15 +1459,16 @@ static const ProtobufCEnumValue ...@@ -1373,15 +1459,16 @@ static const ProtobufCEnumValue
{"BOOLEANS", "PADDLE_MOBILE__FRAMEWORK__PROTO__ATTR_TYPE__BOOLEANS", 7}, {"BOOLEANS", "PADDLE_MOBILE__FRAMEWORK__PROTO__ATTR_TYPE__BOOLEANS", 7},
{"BLOCK", "PADDLE_MOBILE__FRAMEWORK__PROTO__ATTR_TYPE__BLOCK", 8}, {"BLOCK", "PADDLE_MOBILE__FRAMEWORK__PROTO__ATTR_TYPE__BLOCK", 8},
{"LONG", "PADDLE_MOBILE__FRAMEWORK__PROTO__ATTR_TYPE__LONG", 9}, {"LONG", "PADDLE_MOBILE__FRAMEWORK__PROTO__ATTR_TYPE__LONG", 9},
{"BLOCKS", "PADDLE_MOBILE__FRAMEWORK__PROTO__ATTR_TYPE__BLOCKS", 10},
}; };
static const ProtobufCIntRange static const ProtobufCIntRange
paddle_mobile__framework__proto__attr_type__value_ranges[] = {{0, 0}, paddle_mobile__framework__proto__attr_type__value_ranges[] = {{0, 0},
{0, 10}}; {0, 11}};
static const ProtobufCEnumValueIndex static const ProtobufCEnumValueIndex
paddle_mobile__framework__proto__attr_type__enum_values_by_name[10] = { paddle_mobile__framework__proto__attr_type__enum_values_by_name[11] = {
{"BLOCK", 8}, {"BOOLEAN", 6}, {"BOOLEANS", 7}, {"FLOAT", 1}, {"BLOCK", 8}, {"BLOCKS", 10}, {"BOOLEAN", 6}, {"BOOLEANS", 7},
{"FLOATS", 4}, {"INT", 0}, {"INTS", 3}, {"LONG", 9}, {"FLOAT", 1}, {"FLOATS", 4}, {"INT", 0}, {"INTS", 3},
{"STRING", 2}, {"STRINGS", 5}, {"LONG", 9}, {"STRING", 2}, {"STRINGS", 5},
}; };
const ProtobufCEnumDescriptor const ProtobufCEnumDescriptor
paddle_mobile__framework__proto__attr_type__descriptor = { paddle_mobile__framework__proto__attr_type__descriptor = {
...@@ -1390,9 +1477,9 @@ const ProtobufCEnumDescriptor ...@@ -1390,9 +1477,9 @@ const ProtobufCEnumDescriptor
"AttrType", "AttrType",
"PaddleMobile__Framework__Proto__AttrType", "PaddleMobile__Framework__Proto__AttrType",
"paddle_mobile.framework.proto", "paddle_mobile.framework.proto",
10, 11,
paddle_mobile__framework__proto__attr_type__enum_values_by_number, paddle_mobile__framework__proto__attr_type__enum_values_by_number,
10, 11,
paddle_mobile__framework__proto__attr_type__enum_values_by_name, paddle_mobile__framework__proto__attr_type__enum_values_by_name,
1, 1,
paddle_mobile__framework__proto__attr_type__value_ranges, paddle_mobile__framework__proto__attr_type__value_ranges,
......
...@@ -4,16 +4,18 @@ ...@@ -4,16 +4,18 @@
#ifndef PROTOBUF_C_framework_2eproto__INCLUDED #ifndef PROTOBUF_C_framework_2eproto__INCLUDED
#define PROTOBUF_C_framework_2eproto__INCLUDED #define PROTOBUF_C_framework_2eproto__INCLUDED
#include "common/protobuf-c.h" #include <protobuf-c/protobuf-c.h>
PROTOBUF_C__BEGIN_DECLS PROTOBUF_C__BEGIN_DECLS
#if PROTOBUF_C_VERSION_NUMBER < 1000000 #if PROTOBUF_C_VERSION_NUMBER < 1000000
# error This file was generated by a newer version of protoc-c which is incompatible with your libprotobuf-c headers. Please update your headers. # error This file was generated by a newer version of protoc-c which is incompatible with your libprotobuf-c headers. Please update your headers.
#elif 1003000 < PROTOBUF_C_MIN_COMPILER_VERSION #elif 1003001 < PROTOBUF_C_MIN_COMPILER_VERSION
# error This file was generated by an older version of protoc-c which is incompatible with your libprotobuf-c headers. Please regenerate this file with a newer version of protoc-c. # error This file was generated by an older version of protoc-c which is incompatible with your libprotobuf-c headers. Please regenerate this file with a newer version of protoc-c.
#endif #endif
typedef struct _PaddleMobile__Framework__Proto__Version
PaddleMobile__Framework__Proto__Version;
typedef struct _PaddleMobile__Framework__Proto__OpDesc typedef struct _PaddleMobile__Framework__Proto__OpDesc
PaddleMobile__Framework__Proto__OpDesc; PaddleMobile__Framework__Proto__OpDesc;
typedef struct _PaddleMobile__Framework__Proto__OpDesc__Attr typedef struct _PaddleMobile__Framework__Proto__OpDesc__Attr
...@@ -60,6 +62,12 @@ typedef enum _PaddleMobile__Framework__Proto__VarType__Type { ...@@ -60,6 +62,12 @@ typedef enum _PaddleMobile__Framework__Proto__VarType__Type {
PADDLE_MOBILE__FRAMEWORK__PROTO__VAR_TYPE__TYPE__FP16 = 4, PADDLE_MOBILE__FRAMEWORK__PROTO__VAR_TYPE__TYPE__FP16 = 4,
PADDLE_MOBILE__FRAMEWORK__PROTO__VAR_TYPE__TYPE__FP32 = 5, PADDLE_MOBILE__FRAMEWORK__PROTO__VAR_TYPE__TYPE__FP32 = 5,
PADDLE_MOBILE__FRAMEWORK__PROTO__VAR_TYPE__TYPE__FP64 = 6, PADDLE_MOBILE__FRAMEWORK__PROTO__VAR_TYPE__TYPE__FP64 = 6,
/*
* Tensor<size_t> is used in C++.
*/
PADDLE_MOBILE__FRAMEWORK__PROTO__VAR_TYPE__TYPE__SIZE_T = 19,
PADDLE_MOBILE__FRAMEWORK__PROTO__VAR_TYPE__TYPE__UINT8 = 20,
PADDLE_MOBILE__FRAMEWORK__PROTO__VAR_TYPE__TYPE__INT8 = 21,
/* /*
* Other types that may need additional descriptions * Other types that may need additional descriptions
*/ */
...@@ -93,13 +101,32 @@ typedef enum _PaddleMobile__Framework__Proto__AttrType { ...@@ -93,13 +101,32 @@ typedef enum _PaddleMobile__Framework__Proto__AttrType {
PADDLE_MOBILE__FRAMEWORK__PROTO__ATTR_TYPE__BOOLEAN = 6, PADDLE_MOBILE__FRAMEWORK__PROTO__ATTR_TYPE__BOOLEAN = 6,
PADDLE_MOBILE__FRAMEWORK__PROTO__ATTR_TYPE__BOOLEANS = 7, PADDLE_MOBILE__FRAMEWORK__PROTO__ATTR_TYPE__BOOLEANS = 7,
PADDLE_MOBILE__FRAMEWORK__PROTO__ATTR_TYPE__BLOCK = 8, PADDLE_MOBILE__FRAMEWORK__PROTO__ATTR_TYPE__BLOCK = 8,
PADDLE_MOBILE__FRAMEWORK__PROTO__ATTR_TYPE__LONG = PADDLE_MOBILE__FRAMEWORK__PROTO__ATTR_TYPE__LONG = 9,
9 PROTOBUF_C__FORCE_ENUM_TO_BE_INT_SIZE( PADDLE_MOBILE__FRAMEWORK__PROTO__ATTR_TYPE__BLOCKS =
10 PROTOBUF_C__FORCE_ENUM_TO_BE_INT_SIZE(
PADDLE_MOBILE__FRAMEWORK__PROTO__ATTR_TYPE) PADDLE_MOBILE__FRAMEWORK__PROTO__ATTR_TYPE)
} PaddleMobile__Framework__Proto__AttrType; } PaddleMobile__Framework__Proto__AttrType;
/* --- messages --- */ /* --- messages --- */
/*
* Any incompatible changes to ProgramDesc and its dependencies should
 * raise the version defined in version.h.
 * Serialization and deserialization code should be modified in a way
 * that supports old versions, following the version and compatibility policy.
*/
struct _PaddleMobile__Framework__Proto__Version {
ProtobufCMessage base;
protobuf_c_boolean has_version;
int64_t version;
};
#define PADDLE_MOBILE__FRAMEWORK__PROTO__VERSION__INIT \
{ \
PROTOBUF_C_MESSAGE_INIT( \
&paddle_mobile__framework__proto__version__descriptor) \
, 0, 0ll \
}
struct _PaddleMobile__Framework__Proto__OpDesc__Attr { struct _PaddleMobile__Framework__Proto__OpDesc__Attr {
ProtobufCMessage base; ProtobufCMessage base;
char *name; char *name;
...@@ -123,13 +150,15 @@ struct _PaddleMobile__Framework__Proto__OpDesc__Attr { ...@@ -123,13 +150,15 @@ struct _PaddleMobile__Framework__Proto__OpDesc__Attr {
int32_t block_idx; int32_t block_idx;
protobuf_c_boolean has_l; protobuf_c_boolean has_l;
int64_t l; int64_t l;
size_t n_blocks_idx;
int32_t *blocks_idx;
}; };
#define PADDLE_MOBILE__FRAMEWORK__PROTO__OP_DESC__ATTR__INIT \ #define PADDLE_MOBILE__FRAMEWORK__PROTO__OP_DESC__ATTR__INIT \
{ \ { \
PROTOBUF_C_MESSAGE_INIT( \ PROTOBUF_C_MESSAGE_INIT( \
&paddle_mobile__framework__proto__op_desc__attr__descriptor) \ &paddle_mobile__framework__proto__op_desc__attr__descriptor) \
, NULL, PADDLE_MOBILE__FRAMEWORK__PROTO__ATTR_TYPE__INT, 0, 0, 0, 0, NULL, \ , NULL, PADDLE_MOBILE__FRAMEWORK__PROTO__ATTR_TYPE__INT, 0, 0, 0, 0, NULL, \
0, NULL, 0, NULL, 0, NULL, 0, 0, 0, NULL, 0, 0, 0, 0 \ 0, NULL, 0, NULL, 0, NULL, 0, 0, 0, NULL, 0, 0, 0, 0, 0, NULL \
} }
struct _PaddleMobile__Framework__Proto__OpDesc__Var { struct _PaddleMobile__Framework__Proto__OpDesc__Var {
...@@ -181,12 +210,13 @@ struct _PaddleMobile__Framework__Proto__OpProto__Var { ...@@ -181,12 +210,13 @@ struct _PaddleMobile__Framework__Proto__OpProto__Var {
protobuf_c_boolean intermediate; protobuf_c_boolean intermediate;
protobuf_c_boolean has_dispensable; protobuf_c_boolean has_dispensable;
protobuf_c_boolean dispensable; protobuf_c_boolean dispensable;
char *reuse;
}; };
#define PADDLE_MOBILE__FRAMEWORK__PROTO__OP_PROTO__VAR__INIT \ #define PADDLE_MOBILE__FRAMEWORK__PROTO__OP_PROTO__VAR__INIT \
{ \ { \
PROTOBUF_C_MESSAGE_INIT( \ PROTOBUF_C_MESSAGE_INIT( \
&paddle_mobile__framework__proto__op_proto__var__descriptor) \ &paddle_mobile__framework__proto__op_proto__var__descriptor) \
, NULL, NULL, 0, 0, 0, 0, 0, 0 \ , NULL, NULL, 0, 0, 0, 0, 0, 0, NULL \
} }
/* /*
...@@ -375,14 +405,27 @@ struct _PaddleMobile__Framework__Proto__ProgramDesc { ...@@ -375,14 +405,27 @@ struct _PaddleMobile__Framework__Proto__ProgramDesc {
ProtobufCMessage base; ProtobufCMessage base;
size_t n_blocks; size_t n_blocks;
PaddleMobile__Framework__Proto__BlockDesc **blocks; PaddleMobile__Framework__Proto__BlockDesc **blocks;
PaddleMobile__Framework__Proto__Version *version;
}; };
#define PADDLE_MOBILE__FRAMEWORK__PROTO__PROGRAM_DESC__INIT \ #define PADDLE_MOBILE__FRAMEWORK__PROTO__PROGRAM_DESC__INIT \
{ \ { \
PROTOBUF_C_MESSAGE_INIT( \ PROTOBUF_C_MESSAGE_INIT( \
&paddle_mobile__framework__proto__program_desc__descriptor) \ &paddle_mobile__framework__proto__program_desc__descriptor) \
, 0, NULL \ , 0, NULL, NULL \
} }
/* PaddleMobile__Framework__Proto__Version methods */
void paddle_mobile__framework__proto__version__init(
PaddleMobile__Framework__Proto__Version *message);
size_t paddle_mobile__framework__proto__version__get_packed_size(
const PaddleMobile__Framework__Proto__Version *message);
PaddleMobile__Framework__Proto__Version *
paddle_mobile__framework__proto__version__unpack(ProtobufCAllocator *allocator,
size_t len,
const uint8_t *data);
void paddle_mobile__framework__proto__version__free_unpacked(
PaddleMobile__Framework__Proto__Version *message,
ProtobufCAllocator *allocator);
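Taken together, these declarations are enough to read the new version field back out of a serialized ProgramDesc. A minimal sketch, assuming this generated header is available as "framework.pb-c.h" (the include guard above suggests it was generated from framework.proto) and that data/len hold a serialized program:

#include <cstddef>
#include <cstdint>
#include "framework.pb-c.h"  // assumed name for this generated header

// Returns the model version carried by ProgramDesc, or 0 when it is absent.
int64_t ReadModelVersion(const uint8_t *data, size_t len) {
  PaddleMobile__Framework__Proto__ProgramDesc *program =
      paddle_mobile__framework__proto__program_desc__unpack(nullptr, len, data);
  int64_t version = 0;  // proto default for Version.version
  if (program != nullptr && program->version != nullptr &&
      program->version->has_version) {
    version = program->version->version;
  }
  paddle_mobile__framework__proto__program_desc__free_unpacked(program, nullptr);
  return version;
}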
/* PaddleMobile__Framework__Proto__OpDesc__Attr methods */ /* PaddleMobile__Framework__Proto__OpDesc__Attr methods */
void paddle_mobile__framework__proto__op_desc__attr__init( void paddle_mobile__framework__proto__op_desc__attr__init(
PaddleMobile__Framework__Proto__OpDesc__Attr *message); PaddleMobile__Framework__Proto__OpDesc__Attr *message);
...@@ -392,10 +435,8 @@ void paddle_mobile__framework__proto__op_desc__var__init( ...@@ -392,10 +435,8 @@ void paddle_mobile__framework__proto__op_desc__var__init(
/* PaddleMobile__Framework__Proto__OpDesc methods */ /* PaddleMobile__Framework__Proto__OpDesc methods */
void paddle_mobile__framework__proto__op_desc__init( void paddle_mobile__framework__proto__op_desc__init(
PaddleMobile__Framework__Proto__OpDesc *message); PaddleMobile__Framework__Proto__OpDesc *message);
size_t paddle_mobile__framework__proto__op_desc__get_packed_size( size_t paddle_mobile__framework__proto__op_desc__get_packed_size(
const PaddleMobile__Framework__Proto__OpDesc *message); const PaddleMobile__Framework__Proto__OpDesc *message);
PaddleMobile__Framework__Proto__OpDesc * PaddleMobile__Framework__Proto__OpDesc *
paddle_mobile__framework__proto__op_desc__unpack(ProtobufCAllocator *allocator, paddle_mobile__framework__proto__op_desc__unpack(ProtobufCAllocator *allocator,
size_t len, size_t len,
...@@ -487,6 +528,8 @@ void paddle_mobile__framework__proto__program_desc__free_unpacked( ...@@ -487,6 +528,8 @@ void paddle_mobile__framework__proto__program_desc__free_unpacked(
ProtobufCAllocator *allocator); ProtobufCAllocator *allocator);
/* --- per-message closures --- */ /* --- per-message closures --- */
typedef void (*PaddleMobile__Framework__Proto__Version_Closure)(
const PaddleMobile__Framework__Proto__Version *message, void *closure_data);
typedef void (*PaddleMobile__Framework__Proto__OpDesc__Attr_Closure)( typedef void (*PaddleMobile__Framework__Proto__OpDesc__Attr_Closure)(
const PaddleMobile__Framework__Proto__OpDesc__Attr *message, const PaddleMobile__Framework__Proto__OpDesc__Attr *message,
void *closure_data); void *closure_data);
...@@ -539,6 +582,8 @@ typedef void (*PaddleMobile__Framework__Proto__ProgramDesc_Closure)( ...@@ -539,6 +582,8 @@ typedef void (*PaddleMobile__Framework__Proto__ProgramDesc_Closure)(
extern const ProtobufCEnumDescriptor extern const ProtobufCEnumDescriptor
paddle_mobile__framework__proto__attr_type__descriptor; paddle_mobile__framework__proto__attr_type__descriptor;
extern const ProtobufCMessageDescriptor
paddle_mobile__framework__proto__version__descriptor;
extern const ProtobufCMessageDescriptor extern const ProtobufCMessageDescriptor
paddle_mobile__framework__proto__op_desc__descriptor; paddle_mobile__framework__proto__op_desc__descriptor;
extern const ProtobufCMessageDescriptor extern const ProtobufCMessageDescriptor
......
...@@ -16,6 +16,13 @@ syntax = "proto2"; ...@@ -16,6 +16,13 @@ syntax = "proto2";
option optimize_for = LITE_RUNTIME; option optimize_for = LITE_RUNTIME;
package paddle_mobile.framework.proto; package paddle_mobile.framework.proto;
// Any incompatible changes to ProgramDesc and its dependencies should
// raise the version defined in version.h.
//
// Serialization and deserialization code should be modified in a way
// that supports old versions, following the version and compatibility policy.
message Version { optional int64 version = 1 [ default = 0 ]; }
enum AttrType { enum AttrType {
INT = 0; INT = 0;
FLOAT = 1; FLOAT = 1;
...@@ -27,6 +34,7 @@ enum AttrType { ...@@ -27,6 +34,7 @@ enum AttrType {
BOOLEANS = 7; BOOLEANS = 7;
BLOCK = 8; BLOCK = 8;
LONG = 9; LONG = 9;
BLOCKS = 10;
} }
// OpDesc describes an instance of a C++ framework::OperatorBase // OpDesc describes an instance of a C++ framework::OperatorBase
...@@ -46,6 +54,7 @@ message OpDesc { ...@@ -46,6 +54,7 @@ message OpDesc {
repeated bool bools = 11; repeated bool bools = 11;
optional int32 block_idx = 12; optional int32 block_idx = 12;
optional int64 l = 13; optional int64 l = 13;
repeated int32 blocks_idx = 14;
}; };
message Var { message Var {
...@@ -71,6 +80,7 @@ message OpProto { ...@@ -71,6 +80,7 @@ message OpProto {
optional bool duplicable = 3 [ default = false ]; optional bool duplicable = 3 [ default = false ];
optional bool intermediate = 4 [ default = false ]; optional bool intermediate = 4 [ default = false ];
optional bool dispensable = 5 [ default = false ]; optional bool dispensable = 5 [ default = false ];
optional string reuse = 6;
} }
// AttrProto describes the C++ type Attribute. // AttrProto describes the C++ type Attribute.
...@@ -101,6 +111,10 @@ message VarType { ...@@ -101,6 +111,10 @@ message VarType {
FP16 = 4; FP16 = 4;
FP32 = 5; FP32 = 5;
FP64 = 6; FP64 = 6;
// Tensor<size_t> is used in C++.
SIZE_T = 19;
UINT8 = 20;
INT8 = 21;
// Other types that may need additional descriptions // Other types that may need additional descriptions
LOD_TENSOR = 7; LOD_TENSOR = 7;
...@@ -173,4 +187,8 @@ message BlockDesc { ...@@ -173,4 +187,8 @@ message BlockDesc {
// for more details. // for more details.
// TODO(panyx0718): A model can have multiple programs. Need a // TODO(panyx0718): A model can have multiple programs. Need a
// way to distinguish them. Maybe ID or name? // way to distinguish them. Maybe ID or name?
message ProgramDesc { repeated BlockDesc blocks = 1; } message ProgramDesc {
repeated BlockDesc blocks = 1;
optional Version version = 2;
}
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#ifdef PADDLE_MOBILE_CPU
#define LOAD_CPU_OP(op_type) \
extern int TouchOpRegistrar_##op_type##_##cpu(); \
static int use_op_itself_##op_type##_##cpu __attribute__((unused)) = \
TouchOpRegistrar_##op_type##_##cpu()
#else
#define LOAD_CPU_OP(op_type)
#endif
#ifdef PADDLE_MOBILE_MALI_GPU
#define LOAD_MALI_GPU_OP(op_type) \
extern int TouchOpRegistrar_##op_type##_##mali_gpu(); \
static int use_op_itself_##op_type##_##mali_gpu __attribute__((unused)) = \
TouchOpRegistrar_##op_type##_##mali_gpu()
#else
#define LOAD_MALI_GPU_OP(op_type)
#endif
#ifdef PADDLE_MOBILE_FPGA
#define LOAD_FPGA_OP(op_type) \
extern int TouchOpRegistrar_##op_type##_##fpga(); \
static int use_op_itself_##op_type##_##fpga __attribute__((unused)) = \
TouchOpRegistrar_##op_type##_##fpga()
#else
#define LOAD_FPGA_OP(op_type)
#endif
#define LOAD_FUSION_MATCHER(op_type) \
extern int TouchFusionMatcherRegistrar_##op_type(); \
static int use_fusion_matcher_itself_##op_type __attribute__((unused)) = \
TouchFusionMatcherRegistrar_##op_type();
#define LOAD_OP(op_type) \
LOAD_CPU_OP(op_type); \
LOAD_MALI_GPU_OP(op_type); \
LOAD_FPGA_OP(op_type);
#define LOAD_OP1(op_type, device_type) LOAD_##device_type##_OP(op_type);
#define LOAD_OP2(op_type, device_type1, device_type2) \
LOAD_OP1(op_type, device_type1) \
LOAD_OP1(op_type, device_type2)
#define LOAD_OP3(op_type, device_type1, device_type2, device_type3) \
LOAD_OP2(op_type, device_type1, device_type2) \
LOAD_OP1(op_type, device_type3)
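As a reading aid, here is roughly what one of the registration lines below expands to when PADDLE_MOBILE_CPU is defined: LOAD_OP1(sigmoid, CPU) selects LOAD_CPU_OP(sigmoid), which produces

// Expansion sketch of LOAD_OP1(sigmoid, CPU) under PADDLE_MOBILE_CPU:
extern int TouchOpRegistrar_sigmoid_cpu();
static int use_op_itself_sigmoid_cpu __attribute__((unused)) =
    TouchOpRegistrar_sigmoid_cpu();

The static initializer references a symbol defined next to the sigmoid CPU kernel, so the linker keeps that translation unit and the operator registers itself during static initialization; when the macro for a platform is disabled, the line compiles away to nothing.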
// load required ops
LOAD_OP(feed)
LOAD_OP(fetch)
#ifdef BATCHNORM_OP
LOAD_OP2(batch_norm, CPU, MALI_GPU);
#endif
#ifdef BILINEAR_INTERP_OP
LOAD_OP1(bilinear_interp, CPU);
#endif
#ifdef BOXCODER_OP
LOAD_OP1(box_coder, CPU);
#endif
#ifdef CONCAT_OP
LOAD_OP3(concat, CPU, MALI_GPU, FPGA);
#endif
#ifdef CONV_OP
LOAD_OP3(conv2d, CPU, MALI_GPU, FPGA);
#endif
#ifdef LRN_OP
LOAD_OP2(lrn, CPU, MALI_GPU);
#endif
#ifdef SIGMOID_OP
LOAD_OP1(sigmoid, CPU);
#endif
#ifdef FUSION_FC_RELU_OP
LOAD_OP3(fusion_fc_relu, CPU, MALI_GPU, FPGA);
LOAD_FUSION_MATCHER(fusion_fc_relu);
#endif
#ifdef FUSION_ELEMENTWISEADDRELU_OP
LOAD_OP3(fusion_elementwise_add_relu, CPU, MALI_GPU, FPGA);
LOAD_FUSION_MATCHER(fusion_elementwise_add_relu);
#endif
#ifdef SPLIT_OP
LOAD_OP1(split, CPU);
#endif
#ifdef RESIZE_OP
LOAD_OP2(resize, CPU, MALI_GPU);
#endif
#ifdef FUSION_CONVADDBNRELU_OP
LOAD_OP2(fusion_conv_add_bn_relu, CPU, FPGA);
LOAD_FUSION_MATCHER(fusion_conv_add_bn_relu);
#endif
#ifdef RESHAPE_OP
LOAD_OP2(reshape, CPU, MALI_GPU);
#endif
#ifdef TRANSPOSE_OP
LOAD_OP1(transpose, CPU);
#endif
#ifdef PRIORBOX_OP
LOAD_OP1(prior_box, CPU);
#endif
#ifdef FUSION_CONVADDRELU_OP
LOAD_OP2(fusion_conv_add_relu, CPU, FPGA);
LOAD_FUSION_MATCHER(fusion_conv_add_relu);
#endif
#ifdef FUSION_CONVADDADDPRELU_OP
LOAD_OP2(fusion_conv_add_add_prelu, CPU, FPGA);
LOAD_FUSION_MATCHER(fusion_conv_add_add_prelu);
#endif
#ifdef FUSION_CONVADD_OP
LOAD_OP2(fusion_conv_add, CPU, MALI_GPU);
LOAD_FUSION_MATCHER(fusion_conv_add);
#endif
#ifdef SOFTMAX_OP
LOAD_OP2(softmax, CPU, MALI_GPU);
#endif
#ifdef SHAPE_OP
LOAD_OP1(shape, CPU);
#endif
#ifdef DEPTHWISECONV_OP
LOAD_OP1(depthwise_conv2d, CPU);
#endif
#ifdef CONV_TRANSPOSE_OP
LOAD_OP1(conv2d_transpose, CPU);
#endif
#ifdef SCALE_OP
LOAD_OP2(scale, CPU, MALI_GPU);
#endif
#ifdef ELEMENTWISEADD_OP
LOAD_OP2(elementwise_add, CPU, MALI_GPU);
#endif
#ifdef PRELU_OP
LOAD_OP2(prelu, CPU, MALI_GPU);
#endif
#ifdef FLATTEN_OP
LOAD_OP1(flatten, CPU);
#endif
#ifdef FUSION_CONVBNADDRELU_OP
LOAD_OP2(fusion_conv_bn_add_relu, CPU, FPGA);
LOAD_FUSION_MATCHER(fusion_conv_bn_add_relu);
#endif
#ifdef FUSION_CONVBNRELU_OP
LOAD_OP2(fusion_conv_bn_relu, CPU, FPGA);
LOAD_FUSION_MATCHER(fusion_conv_bn_relu);
#endif
#ifdef GRU_OP
LOAD_OP1(gru, CPU);
#endif
#ifdef FUSION_CONVADDBN_OP
LOAD_OP2(fusion_conv_add_bn, CPU, FPGA);
LOAD_FUSION_MATCHER(fusion_conv_add_bn);
#endif
#ifdef DROPOUT_OP
LOAD_OP2(dropout, CPU, FPGA);
#endif
#ifdef FUSION_CONVADDPRELU_OP
LOAD_OP2(fusion_conv_add_prelu, CPU, FPGA);
LOAD_FUSION_MATCHER(fusion_conv_add_prelu);
#endif
#ifdef FUSION_DWCONVBNRELU_OP
LOAD_OP1(fusion_dwconv_bn_relu, CPU);
LOAD_FUSION_MATCHER(fusion_dwconv_bn_relu);
#endif
#ifdef CRF_OP
LOAD_OP1(crf_decoding, CPU);
#endif
#ifdef MUL_OP
LOAD_OP2(mul, CPU, MALI_GPU);
#endif
#ifdef RELU_OP
LOAD_OP2(relu, CPU, MALI_GPU);
#endif
#ifdef IM2SEQUENCE_OP
LOAD_OP1(im2sequence, CPU);
#endif
#ifdef LOOKUP_OP
LOAD_OP1(lookup_table, CPU);
#endif
#ifdef FUSION_FC_OP
LOAD_OP3(fusion_fc, CPU, MALI_GPU, FPGA);
LOAD_FUSION_MATCHER(fusion_fc);
#endif
#ifdef POOL_OP
LOAD_OP3(pool2d, CPU, MALI_GPU, FPGA);
#endif
#ifdef MULTICLASSNMS_OP
LOAD_OP1(multiclass_nms, CPU);
#endif
#ifdef SLICE_OP
LOAD_OP2(slice, CPU, MALI_GPU);
#endif
#ifdef FUSION_CONVBN_OP
LOAD_OP2(fusion_conv_bn, CPU, FPGA);
LOAD_FUSION_MATCHER(fusion_conv_bn);
#endif
LOAD_OP1(quantize, CPU);
LOAD_OP1(dequantize, CPU);
...@@ -97,6 +97,7 @@ class OpRegistry { ...@@ -97,6 +97,7 @@ class OpRegistry {
}; };
#define REGISTER_OPERATOR(op_type, op_class, device_name, device_type) \ #define REGISTER_OPERATOR(op_type, op_class, device_name, device_type) \
template class op_class<device_type, float>; \
template <typename Dtype, typename T> \ template <typename Dtype, typename T> \
class _OpClass_##op_type##_##device_name : public op_class<Dtype, T> { \ class _OpClass_##op_type##_##device_name : public op_class<Dtype, T> { \
public: \ public: \
...@@ -119,16 +120,5 @@ class OpRegistry { ...@@ -119,16 +120,5 @@ class OpRegistry {
#define REGISTER_OPERATOR_FPGA(op_type, op_class) \ #define REGISTER_OPERATOR_FPGA(op_type, op_class) \
REGISTER_OPERATOR(op_type, op_class, fpga, paddle_mobile::FPGA); REGISTER_OPERATOR(op_type, op_class, fpga, paddle_mobile::FPGA);
#define USE_OP(op_type, device_name) \
extern int TouchOpRegistrar_##op_type##_##device_name(); \
static int use_op_itself_##op_type##_##device_name __attribute__((unused)) = \
TouchOpRegistrar_##op_type##_##device_name()
#define USE_OP_CPU(op_type) USE_OP(op_type, cpu);
#define USE_OP_MALI_GPU(op_type) USE_OP(op_type, mali_gpu);
#define USE_OP_FPGA(op_type) USE_OP(op_type, fpga);
} // namespace framework } // namespace framework
} // namespace paddle_mobile } // namespace paddle_mobile
...@@ -67,7 +67,16 @@ class FusionOpRegistrar { ...@@ -67,7 +67,16 @@ class FusionOpRegistrar {
explicit FusionOpRegistrar(FusionOpMatcher* matcher) { explicit FusionOpRegistrar(FusionOpMatcher* matcher) {
FusionOpRegister::Instance()->regist(matcher); FusionOpRegister::Instance()->regist(matcher);
} }
void Touch() {}
}; };
} // namespace framework } // namespace framework
} // namespace paddle_mobile } // namespace paddle_mobile
#define REGISTER_FUSION_MATCHER(op_type, matcher) \
static paddle_mobile::framework::FusionOpRegistrar \
__fusion_matcher_registrar_##op_type(new matcher()); \
int TouchFusionMatcherRegistrar_##op_type() { \
__fusion_matcher_registrar_##op_type.Touch(); \
return 0; \
}
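This is the counterpart of the LOAD_FUSION_MATCHER macro earlier in the diff. As a sketch, REGISTER_FUSION_MATCHER(fusion_conv_add, ConvAddMatcher) -- where ConvAddMatcher stands for whatever FusionOpMatcher subclass the operator defines (hypothetical name) -- expands to roughly:

// Expansion sketch of REGISTER_FUSION_MATCHER(fusion_conv_add, ConvAddMatcher):
static paddle_mobile::framework::FusionOpRegistrar
    __fusion_matcher_registrar_fusion_conv_add(new ConvAddMatcher());
int TouchFusionMatcherRegistrar_fusion_conv_add() {
  __fusion_matcher_registrar_fusion_conv_add.Touch();
  return 0;
}

LOAD_FUSION_MATCHER(fusion_conv_add) then declares and calls this Touch function from another translation unit, which is what keeps the registrar (and with it the matcher registration in FusionOpRegister) from being dropped by the linker.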
...@@ -33,8 +33,6 @@ class Program { ...@@ -33,8 +33,6 @@ class Program {
bool quantification = false; bool quantification = false;
size_t combined_params_len; size_t combined_params_len;
const uint8_t *combined_params_buf; const uint8_t *combined_params_buf;
private:
}; };
} // namespace framework } // namespace framework
......
...@@ -40,7 +40,10 @@ enum VarType_Type { ...@@ -40,7 +40,10 @@ enum VarType_Type {
VARTYPE_TYPE_READER = 15, VARTYPE_TYPE_READER = 15,
VARTYPE_TYPE_CHANNEL = 16, VARTYPE_TYPE_CHANNEL = 16,
VARTYPE_TYPE_RAW = 17, VARTYPE_TYPE_RAW = 17,
VARTYPE_TYPE_TUPLE = 18 VARTYPE_TYPE_TUPLE = 18,
VARTYPE_TYPE_SIZE_T = 19,
VARTYPE_TYPE_UINT8 = 20,
VARTYPE_TYPE_INT8 = 21,
}; };
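The loaders below mostly need just the element width of these types. A small helper along these lines (a sketch, not part of the original code, placed in the same namespace as the enum) captures the mapping for the plain-data variants, including the newly added UINT8 and INT8:

#include <cstddef>
#include <cstdint>

inline size_t SizeOfVarType(VarType_Type type) {
  switch (type) {
    case VARTYPE_TYPE_BOOL:  return sizeof(bool);     // 1 byte
    case VARTYPE_TYPE_INT8:  return sizeof(int8_t);   // 1 byte
    case VARTYPE_TYPE_UINT8: return sizeof(uint8_t);  // 1 byte
    case VARTYPE_TYPE_FP16:  return 2;                // half precision
    case VARTYPE_TYPE_INT32: return sizeof(int32_t);  // 4 bytes
    case VARTYPE_TYPE_FP32:  return sizeof(float);    // 4 bytes
    case VARTYPE_TYPE_INT64: return sizeof(int64_t);  // 8 bytes
    case VARTYPE_TYPE_FP64:  return sizeof(double);   // 8 bytes
    default:                 return 0;                // composite or unknown
  }
}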
class TensorDesc { class TensorDesc {
...@@ -58,8 +61,9 @@ class TensorDesc { ...@@ -58,8 +61,9 @@ class TensorDesc {
} }
data_type_ = (VarType_Type)desc->data_type; data_type_ = (VarType_Type)desc->data_type;
} }
// return tensor dim as a vector
std::vector<int64_t> Dims() const { return dims_; }; std::vector<int64_t> Dims() const { return dims_; };
// return tensor data type
VarType_Type DataType() const { return data_type_; } VarType_Type DataType() const { return data_type_; }
private: private:
......
...@@ -31,6 +31,7 @@ class VarDesc { ...@@ -31,6 +31,7 @@ class VarDesc {
this->tensor_desc_ = var_desc.tensor_desc_; this->tensor_desc_ = var_desc.tensor_desc_;
this->type_ = var_desc.type_; this->type_ = var_desc.type_;
} }
VarDesc(PaddleMobile__Framework__Proto__VarDesc *desc) { VarDesc(PaddleMobile__Framework__Proto__VarDesc *desc) {
type_ = (VarType_Type)desc->type->type; type_ = (VarType_Type)desc->type->type;
name_ = std::string(desc->name); name_ = std::string(desc->name);
...@@ -44,9 +45,7 @@ class VarDesc { ...@@ -44,9 +45,7 @@ class VarDesc {
tensor_desc_ = TensorDesc(desc->type->lod_tensor->tensor); tensor_desc_ = TensorDesc(desc->type->lod_tensor->tensor);
break; break;
case VARTYPE_TYPE_STEP_LOD_TENSOR_ARRAY: case VARTYPE_TYPE_STEP_LOD_TENSOR_ARRAY:
desc->type->tensor_array->tensor->data_type;
tensor_desc_ = TensorDesc(desc->type->tensor_array->tensor); tensor_desc_ = TensorDesc(desc->type->tensor_array->tensor);
break; break;
default: default:
break; break;
...@@ -60,6 +59,7 @@ class VarDesc { ...@@ -60,6 +59,7 @@ class VarDesc {
break; break;
} }
} }
std::string Name() const { return name_; } std::string Name() const { return name_; }
VarType_Type Type() const { return type_; } VarType_Type Type() const { return type_; }
......
...@@ -319,10 +319,11 @@ class Tensor { ...@@ -319,10 +319,11 @@ class Tensor {
* begins. * begins.
*/ */
size_t offset_; size_t offset_;
#ifdef PADDLE_MOBILE_FPGA #ifdef PADDLE_MOBILE_FPGA
public: public: // NOLINT
inline void reset_data_ptr(void *p) { inline void reset_data_ptr(void *p) {
((PlaceholderImpl *)(holder_.get()))->ptr_.reset((uint8_t *)p); ((PlaceholderImpl *)(holder_.get()))->ptr_.reset((uint8_t *)p); // NOLINT
} }
float scale[2]; // scale[0]= MAX/127.0, scale[1]= 127.0/MAX float scale[2]; // scale[0]= MAX/127.0, scale[1]= 127.0/MAX
#endif #endif
...@@ -335,11 +336,12 @@ inline Print &operator<<(Print &printer, const Tensor &tensor) { ...@@ -335,11 +336,12 @@ inline Print &operator<<(Print &printer, const Tensor &tensor) {
stride = stride > 0 ? stride : 1; stride = stride > 0 ? stride : 1;
#ifndef PADDLE_MOBILE_FPGA #ifndef PADDLE_MOBILE_FPGA
for (int i = 0; i < tensor.numel(); i += stride) { for (int i = 0; i < tensor.numel(); i += stride) {
// this is not necessarily float
if (tensor.type() == typeid(float)) { if (tensor.type() == typeid(float)) {
printer << tensor.data<float>()[i] << " "; printer << tensor.data<float>()[i] << " ";
} else if (tensor.type() == typeid(int64_t)) { } else if (tensor.type() == typeid(int64_t)) {
printer << tensor.data<int64_t>()[i] << " "; printer << tensor.data<int64_t>()[i] << " ";
} else if (tensor.type() == typeid(int8_t)) {
printer << tensor.data<int8_t>()[i] << " ";
} }
} }
#endif #endif
......
...@@ -33,6 +33,13 @@ class Variable { ...@@ -33,6 +33,13 @@ class Variable {
template <typename T> template <typename T>
const T GetValue() const { const T GetValue() const {
if (typeid(T) == typeid(std::string)) {
PADDLE_MOBILE_THROW_EXCEPTION(
"Please use getString to get an string (to avoid of an issue with "
"gcc "
"stl lib with string copy)");
exit(0);
}
return variant.Get<T>(); return variant.Get<T>();
} }
......
...@@ -101,6 +101,11 @@ bool PaddleMobilePredictor<Dtype, P>::Run( ...@@ -101,6 +101,11 @@ bool PaddleMobilePredictor<Dtype, P>::Run(
return true; return true;
} }
template <typename Dtype, Precision P>
PaddleMobilePredictor<Dtype, P>::~PaddleMobilePredictor() {
paddle_mobile_->Clear();
}
// A factory to help create difference predictor. // A factory to help create difference predictor.
template <> template <>
std::unique_ptr<PaddlePredictor> std::unique_ptr<PaddlePredictor>
......
...@@ -32,7 +32,7 @@ namespace paddle_mobile { ...@@ -32,7 +32,7 @@ namespace paddle_mobile {
template <typename Dtype = CPU, Precision P = Precision::FP32> template <typename Dtype = CPU, Precision P = Precision::FP32>
class PaddleMobilePredictor : public PaddlePredictor { class PaddleMobilePredictor : public PaddlePredictor {
public: public:
PaddleMobilePredictor() {} PaddleMobilePredictor() = delete;
explicit PaddleMobilePredictor(const PaddleMobileConfig& config); explicit PaddleMobilePredictor(const PaddleMobileConfig& config);
...@@ -40,7 +40,7 @@ class PaddleMobilePredictor : public PaddlePredictor { ...@@ -40,7 +40,7 @@ class PaddleMobilePredictor : public PaddlePredictor {
std::vector<PaddleTensor>* output_data, std::vector<PaddleTensor>* output_data,
int batch_size = -1) override; int batch_size = -1) override;
~PaddleMobilePredictor() override{}; ~PaddleMobilePredictor() override;
private: private:
std::unique_ptr<PaddleMobile<Dtype, P>> paddle_mobile_; std::unique_ptr<PaddleMobile<Dtype, P>> paddle_mobile_;
......
...@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and ...@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#include "io/executor.h" #include "io/executor.h"
#include <operators/math/gemm.h>
#include <algorithm> #include <algorithm>
#include <utility>
#include <vector> #include <vector>
#include "common/enforce.h" #include "common/enforce.h"
#include "common/log.h" #include "common/log.h"
...@@ -26,74 +26,45 @@ limitations under the License. */ ...@@ -26,74 +26,45 @@ limitations under the License. */
#include "framework/program/var_desc.h" #include "framework/program/var_desc.h"
#include "framework/scope.h" #include "framework/scope.h"
#include "framework/tensor.h" #include "framework/tensor.h"
#ifdef PADDLE_EXECUTOR_MULTITHREAD #include "operators/math/gemm.h"
#include <queue>
#include <utility>
#include "common/threadpool.h"
#endif
namespace paddle_mobile { namespace paddle_mobile {
using framework::Variable;
char *Get_binary_data(std::string filename) { using framework::Variable;
FILE *file = fopen(filename.c_str(), "rb");
PADDLE_MOBILE_ENFORCE(file != nullptr, "can't open file: %s ",
filename.c_str());
fseek(file, 0, SEEK_END);
int64_t size = ftell(file);
PADDLE_MOBILE_ENFORCE(size > 0, "size is too small");
rewind(file);
char *data = new char[size];
size_t bytes_read = fread(data, 1, size, file);
PADDLE_MOBILE_ENFORCE(bytes_read == size,
"read binary file bytes do not match with fseek");
fclose(file);
return data;
}
#pragma mark - executor
template <typename Dtype, Precision P> template <typename Dtype, Precision P>
Executor<Dtype, P>::Executor(const framework::Program<Dtype> p, int batch_size, Executor<Dtype, P>::Executor(const framework::Program<Dtype> p, int batch_size,
bool use_optimize, bool loddable) const bool use_optimize, const bool loddable)
: program_(p), : program_(p),
batch_size_(batch_size), batch_size_(batch_size),
use_optimize_(use_optimize), use_optimize_(use_optimize),
loddable_(loddable) { loddable_(loddable) {
if (use_optimize_) {
to_predict_program_ = program_.optimizeProgram;
} else {
to_predict_program_ = program_.originProgram;
}
Variable *variable_ptr = program_.scope->Var("batch_size"); Variable *variable_ptr = program_.scope->Var("batch_size");
variable_ptr[0].SetValue<int>(batch_size); variable_ptr->SetValue<int>(batch_size);
to_predict_program_ =
use_optimize_ ? program_.optimizeProgram : program_.originProgram;
PADDLE_MOBILE_ENFORCE(to_predict_program_ != nullptr, PADDLE_MOBILE_ENFORCE(to_predict_program_ != nullptr,
"to_predict_program_ == NULL!"); "to_predict_program_ == NULL!");
const std::vector<std::shared_ptr<framework::BlockDesc>> blocks = const std::vector<std::shared_ptr<framework::BlockDesc>> &blocks =
to_predict_program_->Blocks(); to_predict_program_->Blocks();
#ifdef PADDLE_EXECUTOR_MULTITHREAD
depManager.resize(blocks.size()); DLOG << "executor in loadable mode: " << loddable_;
#endif
DLOG << "executer in loaddable mode: " << loddable_;
for (int i = 0; i < blocks.size(); ++i) { for (int i = 0; i < blocks.size(); ++i) {
std::shared_ptr<framework::BlockDesc> block_desc = blocks[i]; std::shared_ptr<framework::BlockDesc> block_desc = blocks[i];
std::vector<std::shared_ptr<framework::OpDesc>> ops = block_desc->Ops(); std::vector<std::shared_ptr<framework::OpDesc>> ops = block_desc->Ops();
for (int j = 0; j < ops.size(); ++j) { for (int j = 0; j < ops.size(); ++j) {
std::shared_ptr<framework::OpDesc> op = ops[j]; std::shared_ptr<framework::OpDesc> op = ops[j];
DLOG << "create op: " << j << " " << op->Type(); DLOG << "create op: " << op->Type();
auto op_base = framework::OpRegistry<Dtype>::CreateOp( auto op_base = framework::OpRegistry<Dtype>::CreateOp(
op->Type(), op->GetInputs(), op->GetOutputs(), op->GetAttrMap(), op->Type(), op->GetInputs(), op->GetOutputs(), op->GetAttrMap(),
program_.scope); program_.scope);
// use pre_infershape to pre resize , but if u use an lod mode tensor u // infer shape to reshape the tensor before prediction,
// need to resize in runtime // but a LoD tensor will need to be reshaped at runtime
if (!loddable_) { if (!loddable_) {
op_base->InferShape(); op_base->InferShape();
} }
ops_of_block_[*block_desc.get()].push_back(op_base); ops_of_block_[*block_desc.get()].push_back(op_base);
#ifdef PADDLE_EXECUTOR_MULTITHREAD
depManager[i].analysisDep(ops_of_block_[*block_desc.get()]);
#endif
} }
DLOG << "Total " << ops.size() << " ops have been created ";
} }
if (program_.combined) { if (program_.combined) {
InitCombineMemory(); InitCombineMemory();
...@@ -103,118 +74,83 @@ Executor<Dtype, P>::Executor(const framework::Program<Dtype> p, int batch_size, ...@@ -103,118 +74,83 @@ Executor<Dtype, P>::Executor(const framework::Program<Dtype> p, int batch_size,
std::shared_ptr<framework::BlockDesc> to_predict_block = std::shared_ptr<framework::BlockDesc> to_predict_block =
to_predict_program_->Block(0); to_predict_program_->Block(0);
auto &ops = ops_of_block_[*to_predict_block.get()]; auto &ops = ops_of_block_[*to_predict_block.get()];
int i = 0;
for (const auto &op : ops) { for (const auto &op : ops) {
DLOG << "Init op: " << i++ << " " << op->Type();
op->Init(); op->Init();
} }
} }
template <typename Dtype, Precision P> template <typename Dtype>
void Executor<Dtype, P>::LoadMemory(const framework::VarDesc var_desc, void LoadMemInternal(void **data, framework::LoDTensor *tensor) {
framework::LoDTensor *tensor, char **data) { char **data_buf = reinterpret_cast<char **>(data);
// 1. version int64_t size = tensor->numel();
uint32_t version = *reinterpret_cast<uint32_t *>(*data); Dtype *tensor_data = tensor->mutable_data<Dtype>();
if (0) {
(*data) += sizeof(uint32_t); // TODO(hjchen2) should be moved into operator init function
float min_value;
// 2 Lod information float max_value;
uint64_t *lod_level_ptr = new uint64_t(); memcpy(&min_value, data_buf, sizeof(float));
memcpy(lod_level_ptr, (*data), sizeof(uint64_t)); memcpy(&max_value, data_buf + sizeof(float), sizeof(float));
uint64_t lod_level = *lod_level_ptr; data_buf += 2 * sizeof(float);
delete lod_level_ptr; const float factor = (max_value - min_value) / 255.0;
(*data) += sizeof(uint64_t); const uint8_t *uint8_data = reinterpret_cast<uint8_t *>(data_buf);
for (int k = 0; k < size; ++k) {
auto &lod = *tensor->mutable_lod(); tensor_data[k] = uint8_data[k] * factor + min_value;
lod.resize(lod_level);
for (uint64_t i = 0; i < lod_level; ++i) {
uint64_t size = *reinterpret_cast<uint64_t *>(*data);
(*data) += sizeof(uint64_t);
std::vector<size_t> tmp(size / sizeof(size_t));
for (int k = 0; k < tmp.size(); ++k) {
tmp[k] = *reinterpret_cast<size_t *>(*data);
(*data) += sizeof(size_t);
}
for (auto j : tmp) {
LOG(kLOG_DEBUG1) << " lod - " << j;
} }
lod[i] = tmp; data_buf += size * sizeof(uint8_t);
} } else {
memcpy(tensor_data, *data_buf, size * sizeof(Dtype));
// 3. tensor version *data_buf += size * sizeof(Dtype);
uint32_t tensor_version = *reinterpret_cast<uint32_t *>(*data);
(*data) += sizeof(uint32_t);
// 4. tensor desc
int32_t size = *reinterpret_cast<int32_t *>(*data);
(*data) += sizeof(int32_t);
std::unique_ptr<char[]> buf(new char[size]);
for (int m = 0; m < size; ++m) {
buf.get()[m] = (*data)[m];
} }
(*data) += (sizeof(char) * size); }
const framework::TensorDesc &desc = var_desc.Tensor_desc(); template <typename Dtype, Precision P>
int memory_size = 1; void Executor<Dtype, P>::LoadMemory(
for (auto l : desc.Dims()) { void **data, const std::shared_ptr<framework::VarDesc> var_desc,
memory_size *= l; framework::LoDTensor *tensor) {
char **data_buf = reinterpret_cast<char **>(data);
// version
uint32_t version = *(reinterpret_cast<uint32_t *>(*data_buf));
*data_buf += sizeof(uint32_t);
// lod information
// uint64_t lod_level = *(reinterpret_cast<uint64_t *>(*data_buf));
uint64_t lod_level = 0;
memcpy(&lod_level, *data_buf, sizeof(uint64_t));
*data_buf += sizeof(uint64_t);
auto *lod = tensor->mutable_lod();
lod->resize(lod_level);
for (uint64_t i = 0; i < lod_level; ++i) {
uint64_t size = *(reinterpret_cast<uint64_t *>(*data_buf));
*data_buf += sizeof(uint64_t);
std::vector<size_t> tmp_dim(size / sizeof(size_t));
memcpy(tmp_dim.data(), *data_buf, size);
(*lod)[i] = std::move(tmp_dim);
*data_buf += size;
} }
// tensor version
tensor->Resize(framework::make_ddim(desc.Dims())); uint32_t tensor_version = *(reinterpret_cast<uint32_t *>(*data_buf));
*data_buf += sizeof(uint32_t);
void *memory = nullptr; // tensor desc size
int type_size = 0; int32_t tensor_desc_size = *(reinterpret_cast<int32_t *>(*data_buf));
switch (desc.DataType()) { *data_buf += sizeof(int32_t);
case framework::VARTYPE_TYPE_FP16: // skip tensor desc
type_size = 2; *data_buf += tensor_desc_size;
break;
const framework::TensorDesc &tensor_desc = var_desc->Tensor_desc();
tensor->Resize(framework::make_ddim(tensor_desc.Dims()));
// parse tensor from stream
switch (tensor_desc.DataType()) {
case framework::VARTYPE_TYPE_FP32: case framework::VARTYPE_TYPE_FP32:
type_size = 4; LoadMemInternal<float>(reinterpret_cast<void **>(data_buf), tensor);
memory = tensor->mutable_data<float>();
break; break;
case framework::VARTYPE_TYPE_FP64: case framework::VARTYPE_TYPE_INT8:
type_size = 8; LoadMemInternal<int8_t>(reinterpret_cast<void **>(data_buf), tensor);
break; break;
case framework::VARTYPE_TYPE_INT32: case framework::VARTYPE_TYPE_INT32:
memory = tensor->mutable_data<int32_t>(); LoadMemInternal<int>(reinterpret_cast<void **>(data_buf), tensor);
type_size = 4;
break;
case framework::VARTYPE_TYPE_INT64:
type_size = 8;
break;
case framework::VARTYPE_TYPE_BOOL:
type_size = 1;
break; break;
default: default:
break; LOG(kLOG_ERROR) << "data type is not supported";
}
if (program_.quantification) {
float min_value;
float max_value;
memcpy(&min_value, *data, sizeof(float));
memcpy(&max_value, *data + sizeof(float), sizeof(float));
*data += 2 * sizeof(float);
const float factor = (max_value - min_value) / 255.0;
uint8_t *uint8_data = reinterpret_cast<uint8_t *>(*data);
for (int k = 0; k < memory_size; ++k) {
static_cast<float *>(memory)[k] = uint8_data[k] * factor + min_value;
}
*data += (memory_size * sizeof(uint8_t));
} else {
for (int n = 0; n < memory_size; n++) {
float value;
memcpy(&value, *data + n * type_size, type_size);
if (value < 1e-30 && value > -1e-30) {
static_cast<float *>(memory)[n] = 0.0;
} else {
static_cast<float *>(memory)[n] = value;
}
}
(*data) += (sizeof(char) * memory_size * type_size);
} }
} }
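For quantized models, the branch above maps each stored uint8 back to float with factor = (max_value - min_value) / 255. The same arithmetic as a standalone sketch:

#include <cstdint>

// Dequantize one stored byte, mirroring the loop in LoadMemInternal above.
inline float DequantizeUint8(uint8_t q, float min_value, float max_value) {
  const float factor = (max_value - min_value) / 255.0f;
  return q * factor + min_value;
}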
...@@ -223,35 +159,19 @@ void Executor<Dtype, P>::InitMemory() { ...@@ -223,35 +159,19 @@ void Executor<Dtype, P>::InitMemory() {
for (const auto &block : to_predict_program_->Blocks()) { for (const auto &block : to_predict_program_->Blocks()) {
for (const auto &var_desc : block->Vars()) { for (const auto &var_desc : block->Vars()) {
auto var = program_.scope->Var(var_desc->Name()); auto var = program_.scope->Var(var_desc->Name());
auto tensor = var->template GetMutable<framework::LoDTensor>();
if (var_desc->Persistable()) { if (var_desc->Persistable()) {
auto tensor = var->template GetMutable<framework::LoDTensor>();
if (var_desc->Name() == "feed" || var_desc->Name() == "fetch") { if (var_desc->Name() == "feed" || var_desc->Name() == "fetch") {
continue; continue;
} }
char *origin_data = char *origin_data =
Get_binary_data(program_.model_path + "/" + var_desc->Name()); ReadFileToBuff(program_.model_path + "/" + var_desc->Name());
char *data = origin_data; char *data = origin_data;
LoadMemory(*var_desc, tensor, &data); LoadMemory(reinterpret_cast<void **>(&data), var_desc, tensor);
delete[] origin_data;
// DLOG << "----- " << var_desc->Name();
// DLOG << "----- " << tensor->dims();
// float *pDouble = tensor->template data<float>();
// for (int i = 0; i < tensor->numel() && i < 30; ++i) {
// std::cout << pDouble[i] << std::endl;
// }
delete origin_data;
} else { } else {
if (var_desc->Type() == framework::VARTYPE_TYPE_LOD_TENSOR) { if (var_desc->Type() == framework::VARTYPE_TYPE_LOD_TENSOR) {
bool is_mute_match; varInputMemory(var_desc, var, tensor);
framework::LoDTensor *tensor = nullptr;
is_mute_match = varInputMemory(var_desc, var, tensor);
PADDLE_MOBILE_ENFORCE(
is_mute_match,
"got unhandled var_desc->Tensor_desc().DataType(): %d",
var_desc->Tensor_desc().DataType());
} }
} }
} }
...@@ -260,84 +180,65 @@ void Executor<Dtype, P>::InitMemory() { ...@@ -260,84 +180,65 @@ void Executor<Dtype, P>::InitMemory() {
template <typename Dtype, Precision P> template <typename Dtype, Precision P>
void Executor<Dtype, P>::InitCombineMemory() { void Executor<Dtype, P>::InitCombineMemory() {
char *origin_data; char *origin_data = nullptr;
bool self_alloc = false;
if (program_.combined_params_buf && program_.combined_params_len) { if (program_.combined_params_buf && program_.combined_params_len) {
LOG(kLOG_INFO) << "use outter memory"; origin_data = reinterpret_cast<char *>(
origin_data = (char *)program_.combined_params_buf; const_cast<uint8_t *>(program_.combined_params_buf));
} else { } else {
LOG(kLOG_INFO) << " begin init combine memory"; self_alloc = true;
origin_data = Get_binary_data(program_.para_path); origin_data = ReadFileToBuff(program_.para_path);
} }
PADDLE_MOBILE_ENFORCE(origin_data != nullptr, "origin_data==nullptr!!!"); PADDLE_MOBILE_ENFORCE(origin_data != nullptr, "data == nullptr");
char *data = origin_data; char *data = origin_data;
for (const auto &block : to_predict_program_->Blocks()) { for (const auto &block : to_predict_program_->Blocks()) {
for (const auto &var_desc : block->Vars()) { for (const auto &var_desc : block->Vars()) {
auto var = program_.scope->Var(var_desc->Name()); auto var = program_.scope->Var(var_desc->Name());
auto tensor = var->template GetMutable<framework::LoDTensor>();
if (var_desc->Persistable()) { if (var_desc->Persistable()) {
auto tensor = var->template GetMutable<framework::LoDTensor>();
if (var_desc->Name() == "feed" || var_desc->Name() == "fetch") { if (var_desc->Name() == "feed" || var_desc->Name() == "fetch") {
continue; continue;
} }
LoadMemory(*var_desc, tensor, &data); LoadMemory(reinterpret_cast<void **>(&data), var_desc, tensor);
} else { } else {
if (var_desc->Type() == framework::VARTYPE_TYPE_LOD_TENSOR) { if (var_desc->Type() == framework::VARTYPE_TYPE_LOD_TENSOR) {
bool is_mute_match = false; varInputMemory(var_desc, var, tensor);
framework::LoDTensor *tensor;
is_mute_match = varInputMemory(var_desc, var, tensor);
PADDLE_MOBILE_ENFORCE(
is_mute_match,
"got unhandled var_desc->Tensor_desc().DataType(): %d",
var_desc->Tensor_desc().DataType());
} }
} }
} }
} }
delete origin_data; if (self_alloc) {
LOG(kLOG_INFO) << " end init combine memory "; delete[] origin_data;
}
LOG(kLOG_INFO) << "init combine memory finish";
} }
template <typename Dtype, Precision P> template <typename Dtype, Precision P>
bool Executor<Dtype, P>::varInputMemory( bool Executor<Dtype, P>::varInputMemory(
const std::shared_ptr<framework::VarDesc> &var_desc, Variable *var, const std::shared_ptr<framework::VarDesc> &var_desc, Variable *var,
framework::LoDTensor *tensor) const { framework::LoDTensor *tensor) const {
bool is_mute_match = false; auto type = var_desc->Tensor_desc().DataType();
switch (var_desc->Tensor_desc().DataType()) { switch (type) {
case framework::VARTYPE_TYPE_FP16: { case framework::VARTYPE_TYPE_FP32:
tensor->mutable_data<float>();
break; break;
} case framework::VARTYPE_TYPE_INT8:
tensor->mutable_data<int8_t>();
case framework::VARTYPE_TYPE_FP32: {
tensor = var->template GetMutable<framework::LoDTensor>();
tensor->template mutable_data<Ptype>();
is_mute_match = true;
break; break;
} case framework::VARTYPE_TYPE_INT32:
tensor->mutable_data<int32_t>();
case framework::VARTYPE_TYPE_FP64: {
break;
}
case framework::VARTYPE_TYPE_INT32: {
tensor = var->template GetMutable<framework::LoDTensor>();
tensor->template mutable_data<int32_t>();
is_mute_match = true;
break; break;
} case framework::VARTYPE_TYPE_INT64:
tensor->mutable_data<int64_t>();
case framework::VARTYPE_TYPE_INT64: {
tensor = var->template GetMutable<framework::LoDTensor>();
tensor->template mutable_data<int64_t>();
is_mute_match = true;
break; break;
} default:
case framework::VARTYPE_TYPE_BOOL: {
break; break;
}
default: { break; }
} }
bool is_mute_match = (type == framework::VARTYPE_TYPE_FP32) ||
(type == framework::VARTYPE_TYPE_INT8) ||
(type == framework::VARTYPE_TYPE_INT32) ||
(type == framework::VARTYPE_TYPE_INT64);
PADDLE_MOBILE_ENFORCE(is_mute_match, "got unhandled data type : %d", type);
return is_mute_match; return is_mute_match;
} }
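The simplified varInputMemory now just switches on the declared tensor data type, allocates with the matching mutable_data<T>() call, and reports an unhandled type through the enforce macro. A compact standalone sketch of the same dispatch idea, using an illustrative enum rather than the framework's real VARTYPE_TYPE_* codes:

#include <cstddef>
#include <cstdint>
#include <cstdio>

enum class VarType { FP32, INT8, INT32, INT64, BOOL };  // illustrative enum only

// Mirrors the structure of varInputMemory: dispatch on the declared type and
// report whether it is one of the handled element types; the element size
// stands in for the mutable_data<T>() allocation.
bool ElementSizeFor(VarType type, std::size_t *size) {
  switch (type) {
    case VarType::FP32:  *size = sizeof(float);   return true;
    case VarType::INT8:  *size = sizeof(int8_t);  return true;
    case VarType::INT32: *size = sizeof(int32_t); return true;
    case VarType::INT64: *size = sizeof(int64_t); return true;
    default:             *size = 0;               return false;  // unhandled, like BOOL
  }
}

int main() {
  std::size_t size = 0;
  std::printf("fp32 handled: %d (elem %zu bytes)\n",
              ElementSizeFor(VarType::FP32, &size), size);
  std::printf("bool handled: %d\n", ElementSizeFor(VarType::BOOL, &size));
}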
...@@ -356,61 +257,6 @@ std::shared_ptr<framework::Tensor> Executor<Dtype, P>::Predict( ...@@ -356,61 +257,6 @@ std::shared_ptr<framework::Tensor> Executor<Dtype, P>::Predict(
#ifdef PADDLE_MOBILE_PROFILE #ifdef PADDLE_MOBILE_PROFILE
std::vector<ProfInfo> profile(ops.size()); std::vector<ProfInfo> profile(ops.size());
#endif #endif
#ifdef PADDLE_EXECUTOR_MULTITHREAD
std::mutex m;
std::condition_variable cv;
std::queue<int> next;
next.push(0);
int rsize = ops.size();
std::vector<int> status(rsize, 0);
auto &threadPool = ThreadPool::getThreadPool();
auto &dep = depManager[0];
auto finishF = [&ops, &m, &cv, &next, &status, &rsize, &dep](int opi) {
std::lock_guard<std::mutex> lk(m);
rsize--;
status[opi] = 2;
for (int i : dep.getNext(opi)) {
bool ok = true;
for (int j : dep.getDeps(i)) {
if (status[j] != 2) {
ok = false;
break;
}
}
if (ok && (status[i] == 0)) {
next.push(i);
}
}
cv.notify_one();
};
for (;;) {
std::unique_lock<std::mutex> lk(m);
cv.wait(lk, [&next, &rsize] { return rsize == 0 || !next.empty(); });
if (rsize == 0) {
break;
}
while (next.size() > 0) {
int opi = next.front();
next.pop();
status[opi] = 1;
threadPool.enqueue([opi, &ops, &finishF, &profile] {
auto &op = ops[opi];
#ifdef PADDLE_MOBILE_PROFILE
struct timespec ts;
clock_gettime(CLOCK_MONOTONIC, &ts);
profile[opi].runBegin = (uint64_t)ts.tv_sec * 1e9 + ts.tv_nsec;
profile[opi].tid = ThreadPool::getThreadPoolThreadId();
#endif
ops[opi]->Run();
#ifdef PADDLE_MOBILE_PROFILE
clock_gettime(CLOCK_MONOTONIC, &ts);
profile[opi].runEnd = (uint64_t)ts.tv_sec * 1e9 + ts.tv_nsec;
#endif
finishF(opi);
});
}
}
#else
for (int i = 0; i < ops.size(); i++) { for (int i = 0; i < ops.size(); i++) {
#ifdef PADDLE_MOBILE_PROFILE #ifdef PADDLE_MOBILE_PROFILE
struct timespec ts; struct timespec ts;
...@@ -424,7 +270,6 @@ std::shared_ptr<framework::Tensor> Executor<Dtype, P>::Predict( ...@@ -424,7 +270,6 @@ std::shared_ptr<framework::Tensor> Executor<Dtype, P>::Predict(
profile[i].runEnd = (uint64_t)ts.tv_sec * 1e9 + ts.tv_nsec; profile[i].runEnd = (uint64_t)ts.tv_sec * 1e9 + ts.tv_nsec;
#endif #endif
} }
#endif
auto last_op = ops.rbegin(); auto last_op = ops.rbegin();
auto output_map = (*last_op)->Outputs(); auto output_map = (*last_op)->Outputs();
std::vector<std::string> out_keys = (*last_op)->GetOutKeys(); std::vector<std::string> out_keys = (*last_op)->GetOutKeys();
...@@ -433,34 +278,12 @@ std::shared_ptr<framework::Tensor> Executor<Dtype, P>::Predict( ...@@ -433,34 +278,12 @@ std::shared_ptr<framework::Tensor> Executor<Dtype, P>::Predict(
framework::GetVarValue<framework::LoDTensor>(out_keys[0], output_map, framework::GetVarValue<framework::LoDTensor>(out_keys[0], output_map,
*(program_.scope)); *(program_.scope));
#ifdef PADDLE_MOBILE_PROFILE #ifdef PADDLE_MOBILE_PROFILE
#ifdef PADDLE_EXECUTOR_MULTITHREAD
// TODO(haipeng): expose profile info as an interface so users can analyze
// the performance of their deep nets.
FILE *df = fopen("net.dot", "w");
fprintf(df, "digraph {\n");
for (int i = 0; i < ops.size(); i++) {
for (int j : dep.getNext(i)) {
fprintf(df, "op_%d -> op_%d\n", i, j);
}
}
for (int i = 0; i < ops.size(); i++) {
fprintf(df, "op_%d[label=\"%s (%d)\"]\n", i, ops[i]->Type().c_str(), i);
}
fprintf(df, "}\n");
fclose(df);
#endif
// FILE *pf = fopen("profile.out", "w");
std::unordered_map<std::string, uint64_t> _tp; std::unordered_map<std::string, uint64_t> _tp;
for (int i = 0; i < profile.size(); i++) { for (int i = 0; i < profile.size(); i++) {
const auto &pInfo = profile[i]; const auto &pInfo = profile[i];
uint64_t timeCost = pInfo.runEnd - pInfo.runBegin; uint64_t timeCost = pInfo.runEnd - pInfo.runBegin;
_tp[ops[i]->Type()] += timeCost; _tp[ops[i]->Type()] += timeCost;
// fprintf(pf, "%d\t%s\t%d\t%llu\t%llu\t%llu\n", i,
// ops[i]->Type().c_str(),
// pInfo.tid, pInfo.runBegin, pInfo.runEnd, timeCost);
} }
// fclose(pf);
printf("====================[ profile ]======================\n"); printf("====================[ profile ]======================\n");
using prof_t = std::pair<std::string, uint64_t>; using prof_t = std::pair<std::string, uint64_t>;
std::vector<prof_t> _tv(_tp.begin(), _tp.end()); std::vector<prof_t> _tv(_tp.begin(), _tp.end());
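The profiling block sums per-op time into `_tp`, keyed by op type, and then copies it into `_tv` for ordering; presumably the vector is sorted by accumulated cost before the table is printed (the sort itself is outside the shown hunk). A self-contained sketch of that aggregate-then-rank step:

#include <algorithm>
#include <cstdint>
#include <cstdio>
#include <string>
#include <unordered_map>
#include <utility>
#include <vector>

int main() {
  // Accumulated nanoseconds per op type, as _tp is built above.
  std::unordered_map<std::string, uint64_t> tp = {
      {"conv2d", 5200000}, {"pool2d", 800000}, {"softmax", 150000}};
  using prof_t = std::pair<std::string, uint64_t>;
  std::vector<prof_t> tv(tp.begin(), tp.end());
  // Rank the most expensive op types first before printing the table.
  std::sort(tv.begin(), tv.end(),
            [](const prof_t &a, const prof_t &b) { return a.second > b.second; });
  for (const auto &item : tv) {
    std::printf("%-10s %10llu ns\n", item.first.c_str(),
                static_cast<unsigned long long>(item.second));
  }
}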
...@@ -501,61 +324,6 @@ std::shared_ptr<framework::LoDTensor> Executor<Dtype, P>::PredictLod( ...@@ -501,61 +324,6 @@ std::shared_ptr<framework::LoDTensor> Executor<Dtype, P>::PredictLod(
#ifdef PADDLE_MOBILE_PROFILE #ifdef PADDLE_MOBILE_PROFILE
std::vector<ProfInfo> profile(ops.size()); std::vector<ProfInfo> profile(ops.size());
#endif #endif
#ifdef PADDLE_EXECUTOR_MULTITHREAD
std::mutex m;
std::condition_variable cv;
std::queue<int> next;
next.push(0);
int rsize = ops.size();
std::vector<int> status(rsize, 0);
auto &threadPool = ThreadPool::getThreadPool();
auto &dep = depManager[0];
auto finishF = [&ops, &m, &cv, &next, &status, &rsize, &dep](int opi) {
std::lock_guard<std::mutex> lk(m);
rsize--;
status[opi] = 2;
for (int i : dep.getNext(opi)) {
bool ok = true;
for (int j : dep.getDeps(i)) {
if (status[j] != 2) {
ok = false;
break;
}
}
if (ok && (status[i] == 0)) {
next.push(i);
}
}
cv.notify_one();
};
for (;;) {
std::unique_lock<std::mutex> lk(m);
cv.wait(lk, [&next, &rsize] { return rsize == 0 || !next.empty(); });
if (rsize == 0) {
break;
}
while (next.size() > 0) {
int opi = next.front();
next.pop();
status[opi] = 1;
threadPool.enqueue([opi, &ops, &finishF, &profile] {
auto &op = ops[opi];
#ifdef PADDLE_MOBILE_PROFILE
struct timespec ts;
clock_gettime(CLOCK_MONOTONIC, &ts);
profile[opi].runBegin = (uint64_t)ts.tv_sec * 1e9 + ts.tv_nsec;
profile[opi].tid = ThreadPool::getThreadPoolThreadId();
#endif
ops[opi]->Run();
#ifdef PADDLE_MOBILE_PROFILE
clock_gettime(CLOCK_MONOTONIC, &ts);
profile[opi].runEnd = (uint64_t)ts.tv_sec * 1e9 + ts.tv_nsec;
#endif
finishF(opi);
});
}
}
#else
for (int i = 0; i < ops.size(); i++) { for (int i = 0; i < ops.size(); i++) {
#ifdef PADDLE_MOBILE_PROFILE #ifdef PADDLE_MOBILE_PROFILE
struct timespec ts; struct timespec ts;
...@@ -565,14 +333,12 @@ std::shared_ptr<framework::LoDTensor> Executor<Dtype, P>::PredictLod( ...@@ -565,14 +333,12 @@ std::shared_ptr<framework::LoDTensor> Executor<Dtype, P>::PredictLod(
if (loddable_) { if (loddable_) {
ops[i]->InferShape(); ops[i]->InferShape();
} }
// to Run
ops[i]->Run(); ops[i]->Run();
#ifdef PADDLE_MOBILE_PROFILE #ifdef PADDLE_MOBILE_PROFILE
clock_gettime(CLOCK_MONOTONIC, &ts); clock_gettime(CLOCK_MONOTONIC, &ts);
profile[i].runEnd = (uint64_t)ts.tv_sec * 1e9 + ts.tv_nsec; profile[i].runEnd = (uint64_t)ts.tv_sec * 1e9 + ts.tv_nsec;
#endif #endif
} }
#endif
auto last_op = ops.rbegin(); auto last_op = ops.rbegin();
auto output_map = (*last_op)->Outputs(); auto output_map = (*last_op)->Outputs();
...@@ -582,34 +348,12 @@ std::shared_ptr<framework::LoDTensor> Executor<Dtype, P>::PredictLod( ...@@ -582,34 +348,12 @@ std::shared_ptr<framework::LoDTensor> Executor<Dtype, P>::PredictLod(
framework::GetVarValue<framework::LoDTensor>(out_keys[0], output_map, framework::GetVarValue<framework::LoDTensor>(out_keys[0], output_map,
*(program_.scope)); *(program_.scope));
#ifdef PADDLE_MOBILE_PROFILE #ifdef PADDLE_MOBILE_PROFILE
#ifdef PADDLE_EXECUTOR_MULTITHREAD
// TODO(haipeng): expose profile info as an interface so users can analyze
// the performance of their deep nets.
FILE *df = fopen("net.dot", "w");
fprintf(df, "digraph {\n");
for (int i = 0; i < ops.size(); i++) {
for (int j : dep.getNext(i)) {
fprintf(df, "op_%d -> op_%d\n", i, j);
}
}
for (int i = 0; i < ops.size(); i++) {
fprintf(df, "op_%d[label=\"%s (%d)\"]\n", i, ops[i]->Type().c_str(), i);
}
fprintf(df, "}\n");
fclose(df);
#endif
// FILE *pf = fopen("profile.out", "w");
std::unordered_map<std::string, uint64_t> _tp; std::unordered_map<std::string, uint64_t> _tp;
for (int i = 0; i < profile.size(); i++) { for (int i = 0; i < profile.size(); i++) {
const auto &pInfo = profile[i]; const auto &pInfo = profile[i];
uint64_t timeCost = pInfo.runEnd - pInfo.runBegin; uint64_t timeCost = pInfo.runEnd - pInfo.runBegin;
_tp[ops[i]->Type()] += timeCost; _tp[ops[i]->Type()] += timeCost;
// fprintf(pf, "%d\t%s\t%d\t%llu\t%llu\t%llu\n", i,
// ops[i]->Type().c_str(),
// pInfo.tid, pInfo.runBegin, pInfo.runEnd, timeCost);
} }
// fclose(pf);
printf("====================[ profile ]======================\n"); printf("====================[ profile ]======================\n");
using prof_t = std::pair<std::string, uint64_t>; using prof_t = std::pair<std::string, uint64_t>;
std::vector<prof_t> _tv(_tp.begin(), _tp.end()); std::vector<prof_t> _tv(_tp.begin(), _tp.end());
...@@ -654,21 +398,20 @@ std::vector<typename Executor<Dtype, P>::Ptype> Executor<Dtype, P>::Predict( ...@@ -654,21 +398,20 @@ std::vector<typename Executor<Dtype, P>::Ptype> Executor<Dtype, P>::Predict(
} }
#ifdef PADDLE_MOBILE_FPGA #ifdef PADDLE_MOBILE_FPGA
template <typename Dtype, Precision P> template <typename Dtype, Precision P>
void Executor<Dtype, P>::InjectVariable(const framework::Tensor &t, void Executor<Dtype, P>::InjectVariable(const framework::Tensor &t,
string var_name) { std::string var_name) {
framework::Variable *g_feed_value = program_.scope->Var(var_name); framework::Variable *g_feed_value = program_.scope->Var(var_name);
framework::Tensor *feed_tensor = framework::Tensor *feed_tensor =
g_feed_value->GetMutable<framework::LoDTensor>(); g_feed_value->GetMutable<framework::LoDTensor>();
feed_tensor->Resize(t.dims()); feed_tensor->Resize(t.dims());
feed_tensor->ShareDataWith(t); feed_tensor->ShareDataWith(t);
}; }
template <typename Dtype, Precision P> template <typename Dtype, Precision P>
void Executor<Dtype, P>::FeedData(const framework::Tensor &t) { void Executor<Dtype, P>::FeedData(const framework::Tensor &t) {
InjectVariable(t, "feed"); InjectVariable(t, "feed");
}; }
template <typename Dtype, Precision P> template <typename Dtype, Precision P>
std::shared_ptr<framework::Tensor> Executor<Dtype, P>::FetchResult(int id) { std::shared_ptr<framework::Tensor> Executor<Dtype, P>::FetchResult(int id) {
...@@ -684,14 +427,14 @@ std::shared_ptr<framework::Tensor> Executor<Dtype, P>::FetchResult(int id) { ...@@ -684,14 +427,14 @@ std::shared_ptr<framework::Tensor> Executor<Dtype, P>::FetchResult(int id) {
auto *output_tensor = framework::GetVarValue<framework::LoDTensor>( auto *output_tensor = framework::GetVarValue<framework::LoDTensor>(
out_keys[0], output_map, *(program_.scope)); out_keys[0], output_map, *(program_.scope));
return std::make_shared<framework::Tensor>(framework::Tensor(*output_tensor)); return std::make_shared<framework::Tensor>(framework::Tensor(*output_tensor));
}; }
template <typename Dtype, Precision P> template <typename Dtype, Precision P>
void Executor<Dtype, P>::Predict_From_To(int start, int end) { void Executor<Dtype, P>::Predict_From_To(int start, int end) {
std::shared_ptr<framework::BlockDesc> to_predict_block = std::shared_ptr<framework::BlockDesc> to_predict_block =
to_predict_program_->Block(0); to_predict_program_->Block(0);
auto &ops = ops_of_block_[*to_predict_block.get()]; auto &ops = ops_of_block_[*to_predict_block.get()];
end = end < 0 ? (int)ops.size() : end; end = end < 0 ? static_cast<int>(ops.size()) : end;
PADDLE_MOBILE_ENFORCE(start >= 0 && start < end && end <= ops.size(), PADDLE_MOBILE_ENFORCE(start >= 0 && start < end && end <= ops.size(),
"start or end parameter is wrong"); "start or end parameter is wrong");
...@@ -712,17 +455,17 @@ void Executor<Dtype, P>::Predict_From_To(int start, int end) { ...@@ -712,17 +455,17 @@ void Executor<Dtype, P>::Predict_From_To(int start, int end) {
profile[i].runEnd = (uint64_t)ts.tv_sec * 1e9 + ts.tv_nsec; profile[i].runEnd = (uint64_t)ts.tv_sec * 1e9 + ts.tv_nsec;
#endif #endif
} }
}; }
template <typename Dtype, Precision P> template <typename Dtype, Precision P>
void Executor<Dtype, P>::Predict_From(int start) { void Executor<Dtype, P>::Predict_From(int start) {
Predict_From_To(start); Predict_From_To(start);
}; }
template <typename Dtype, Precision P> template <typename Dtype, Precision P>
void Executor<Dtype, P>::Predict_To(int end) { void Executor<Dtype, P>::Predict_To(int end) {
Predict_From_To(0, end); Predict_From_To(0, end);
}; }
#endif #endif
template class Executor<CPU, Precision::FP32>; template class Executor<CPU, Precision::FP32>;
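The FPGA-only hooks above make it possible to inject an input, run just a slice of the op list, and look at intermediate tensors. A usage sketch for a PADDLE_MOBILE_FPGA build, assuming an already constructed executor and a populated input tensor; treating FetchResult's id as the index of the op whose output is fetched is an assumption based on the declarations, not something this hunk confirms.

#include "io/executor.h"

// Sketch only: run ops [0, cut), inspect the intermediate result, then
// finish the remaining ops of the graph.
void PartialRun(paddle_mobile::Executor<paddle_mobile::FPGA> &executor,
                const paddle_mobile::framework::Tensor &input, int cut) {
  executor.FeedData(input);                 // inject input into the "feed" variable
  executor.Predict_From_To(0, cut);         // run the first `cut` ops
  auto middle = executor.FetchResult(cut);  // assumed: output of op index `cut`
  executor.Predict_From(cut);               // run the remaining ops
  auto output = executor.FetchResult();     // default id = -1: final output
}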
......
...@@ -18,19 +18,12 @@ limitations under the License. */ ...@@ -18,19 +18,12 @@ limitations under the License. */
#include <memory> #include <memory>
#include <string> #include <string>
#include <vector> #include <vector>
#include "common/types.h" #include "common/types.h"
#include "common/util.h"
#include "framework/lod_tensor.h" #include "framework/lod_tensor.h"
#include "framework/operator.h" #include "framework/operator.h"
#include "framework/program/program.h" #include "framework/program/program.h"
#include "framework/tensor.h" #include "framework/tensor.h"
#ifdef PADDLE_EXECUTOR_MULTITHREAD
#include <condition_variable>
#include <mutex>
#include <thread>
#include "common/dep_core.h"
#endif
using std::string;
namespace paddle_mobile { namespace paddle_mobile {
...@@ -38,50 +31,61 @@ template <typename Dtype = CPU, Precision P = Precision::FP32> ...@@ -38,50 +31,61 @@ template <typename Dtype = CPU, Precision P = Precision::FP32>
class Executor { class Executor {
public: public:
typedef typename PrecisionTrait<P>::ptype Ptype; typedef typename PrecisionTrait<P>::ptype Ptype;
// executor constructor
// @param program program converted from proto program in PaddlePaddle
// @param use_optimize bool whether use operator fusion to speed up or not
// @param loddable bool
Executor(const framework::Program<Dtype> program, int batch_size = 1,
const bool use_optimize = true, const bool loddable = false);
/* // predict with tensor input
* @b init executor with the program loaded by the Loader class // @param t input tensor to do prediction
* @b instantiate the executor with the program produced by Loader // @return predicted tensor
* */
Executor(const framework::Program<Dtype> p, int batch_size = 1,
bool use_optimize = true, bool loddable = false);
/*
* @b to predict
* */
std::shared_ptr<framework::Tensor> Predict(const framework::Tensor &t); std::shared_ptr<framework::Tensor> Predict(const framework::Tensor &t);
/*
* @b to predict // predict with lod tensor input
* */ // @param t input lod tensor to do prediction
// @return predicted lod tensor
std::shared_ptr<framework::LoDTensor> PredictLod( std::shared_ptr<framework::LoDTensor> PredictLod(
const framework::LoDTensor &t); const framework::LoDTensor &t);
/*
* @b to predict with vector and dim // predict with vector input and dims
* // @param input vector whose elements will be formed
* @b predict using the input data and its dimension info // @param input lod tensor to do prediction
* */ // @param dims vector whose elements will be formed
// @param input tensor shape
// @return vector which is flatted from predicted tensor
std::vector<Ptype> Predict(const std::vector<Ptype> &input, std::vector<Ptype> Predict(const std::vector<Ptype> &input,
const std::vector<int64_t> &dims); const std::vector<int64_t> &dims);
#ifdef PADDLE_MOBILE_FPGA
void InjectVariable(const framework::Tensor &t, std::string var_name);
void FeedData(const framework::Tensor &t);
std::shared_ptr<framework::Tensor> FetchResult(int id = -1);
void Predict_From_To(int start = 0, int end = -1);
void Predict_From(int start);
void Predict_To(int end);
#endif
protected: protected:
Executor() = default; Executor() = default;
std::shared_ptr<framework::Tensor> Predict(const framework::Tensor &t,
int block_id);
bool varInputMemory(const std::shared_ptr<framework::VarDesc> &var_desc,
framework::Variable *var,
framework::LoDTensor *tensor) const;
void InitMemory(); void InitMemory();
void LoadMemory(const framework::VarDesc var_desc,
framework::LoDTensor *tensor, char **data);
void InitCombineMemory(); void InitCombineMemory();
void LoadMemory(void **data,
const std::shared_ptr<framework::VarDesc> var_desc,
framework::LoDTensor *tensor);
framework::Program<Dtype> program_; framework::Program<Dtype> program_;
int batch_size_ = 1; int batch_size_ = 1;
std::shared_ptr<framework::ProgramDesc> to_predict_program_; std::shared_ptr<framework::ProgramDesc> to_predict_program_;
std::shared_ptr<framework::Tensor> Predict(const framework::Tensor &t,
int block_id);
std::map<framework::BlockDesc, std::map<framework::BlockDesc,
std::vector<std::shared_ptr<framework::OperatorBase<Dtype>>>> std::vector<std::shared_ptr<framework::OperatorBase<Dtype>>>>
ops_of_block_; ops_of_block_;
bool use_optimize_ = false;
bool loddable_ = false;
#ifdef PADDLE_EXECUTOR_MULTITHREAD
std::vector<depCore> depManager;
#endif
#ifdef PADDLE_MOBILE_PROFILE #ifdef PADDLE_MOBILE_PROFILE
struct ProfInfo { struct ProfInfo {
int tid = 0; int tid = 0;
...@@ -89,21 +93,8 @@ class Executor { ...@@ -89,21 +93,8 @@ class Executor {
uint64_t runEnd = 0UL; uint64_t runEnd = 0UL;
}; };
#endif #endif
bool use_optimize_ = false;
bool varInputMemory(const std::shared_ptr<framework::VarDesc> &var_desc, bool loddable_ = false;
framework::Variable *var,
framework::LoDTensor *tensor) const;
#ifdef PADDLE_MOBILE_FPGA
public:
void InjectVariable(const framework::Tensor &t, string var_name);
void FeedData(const framework::Tensor &t);
std::shared_ptr<framework::Tensor> FetchResult(int id = -1);
void Predict_From_To(int start = 0, int end = -1);
void Predict_From(int start);
void Predict_To(int end);
#endif
}; };
} // namespace paddle_mobile } // namespace paddle_mobile
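For orientation, the Executor declared above is fed by the Loader declared further down; a minimal wiring sketch under the default CPU / FP32 template arguments, with "./mobilenet" as a placeholder model directory:

#include "io/executor.h"
#include "io/loader.h"

// Wiring sketch: the Loader produces the Program that the Executor consumes.
void BuildExecutor() {
  paddle_mobile::Loader<paddle_mobile::CPU> loader;
  auto program = loader.Load("./mobilenet", /*optimize=*/true);
  paddle_mobile::Executor<paddle_mobile::CPU> executor(program, /*batch_size=*/1,
                                                       /*use_optimize=*/true);
}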
...@@ -27,8 +27,8 @@ using framework::Variable; ...@@ -27,8 +27,8 @@ using framework::Variable;
* @param scope * @param scope
*/ */
void InitMemoryFromProgram( void InitMemoryFromProgram(
std::shared_ptr<framework::ProgramDesc> &originProgramDesc, std::shared_ptr<framework::ProgramDesc> &originProgramDesc, // NOLINT
std::shared_ptr<framework::Scope> &scope) { std::shared_ptr<framework::Scope> &scope) { // NOLINT
for (const auto &block : originProgramDesc.get()->Blocks()) { for (const auto &block : originProgramDesc.get()->Blocks()) {
for (const auto &var_desc : block->Vars()) { for (const auto &var_desc : block->Vars()) {
auto var = scope.get()->Var(var_desc->Name()); auto var = scope.get()->Var(var_desc->Name());
...@@ -61,12 +61,16 @@ void InitMemoryFromProgram( ...@@ -61,12 +61,16 @@ void InitMemoryFromProgram(
*/ */
template <typename Dtype, Precision P> template <typename Dtype, Precision P>
void FusionAndPrintInfos( void FusionAndPrintInfos(
bool &optimize, bool &can_add_split, framework::Program<Dtype, P> &program, bool optimize, bool can_add_split,
framework::Program<Dtype, P> &program, // NOLINT
const std::shared_ptr<framework::ProgramDesc> &originProgramDesc) { const std::shared_ptr<framework::ProgramDesc> &originProgramDesc) {
if (optimize) { if (optimize) {
framework::ProgramOptimize program_optimize; framework::ProgramOptimize program_optimize;
program.optimizeProgram = program.optimizeProgram =
program_optimize.FusionOptimize(originProgramDesc, can_add_split); program_optimize.FusionOptimize(originProgramDesc, can_add_split);
if (!program.optimizeProgram) {
program.optimizeProgram = originProgramDesc;
}
} }
if (optimize) { if (optimize) {
program.optimizeProgram->Description("optimize: "); program.optimizeProgram->Description("optimize: ");
...@@ -74,6 +78,7 @@ void FusionAndPrintInfos( ...@@ -74,6 +78,7 @@ void FusionAndPrintInfos(
originProgramDesc->Description("program: "); originProgramDesc->Description("program: ");
} }
} }
static size_t ReadBuffer(const char *file_name, uint8_t **out) { static size_t ReadBuffer(const char *file_name, uint8_t **out) {
FILE *fp; FILE *fp;
fp = fopen(file_name, "rb"); fp = fopen(file_name, "rb");
......
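ReadBuffer's body is collapsed in this hunk; for orientation, a typical file-into-buffer routine with the same shape of signature looks roughly like the sketch below (illustrative only, not the collapsed implementation).

#include <cstdint>
#include <cstdio>

// Illustration only: read a whole binary file into a heap buffer and return
// its size; the caller owns *out and must delete[] it.
static size_t ReadWholeFile(const char *file_name, uint8_t **out) {
  FILE *fp = fopen(file_name, "rb");
  if (fp == nullptr) return 0;
  fseek(fp, 0, SEEK_END);
  long size = ftell(fp);
  rewind(fp);
  *out = new uint8_t[size];
  size_t read = fread(*out, 1, static_cast<size_t>(size), fp);
  fclose(fp);
  return read;
}

int main(int argc, char **argv) {
  if (argc < 2) return 0;
  uint8_t *buf = nullptr;
  size_t n = ReadWholeFile(argv[1], &buf);
  std::printf("read %zu bytes\n", n);
  delete[] buf;
}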
...@@ -24,19 +24,11 @@ namespace paddle_mobile { ...@@ -24,19 +24,11 @@ namespace paddle_mobile {
template <typename Dtype = CPU, Precision P = Precision::FP32> template <typename Dtype = CPU, Precision P = Precision::FP32>
class Loader { class Loader {
public: public:
/*
* @b load separate format fluid model
* @b load a fluid model saved as separate program and param files
* */
const framework::Program<Dtype, P> Load(const std::string &dirname, const framework::Program<Dtype, P> Load(const std::string &dirname,
bool optimize = false, bool optimize = false,
bool quantification = false, bool quantification = false,
bool can_add_split = false); bool can_add_split = false);
/*
* @b load combined format fluid model
* @b load a fluid model whose program and params are saved in combined form
* */
const framework::Program<Dtype, P> Load(const std::string &model_path, const framework::Program<Dtype, P> Load(const std::string &model_path,
const std::string &para_path, const std::string &para_path,
bool optimize = false, bool optimize = false,
......
...@@ -87,7 +87,6 @@ enum class PaddleEngineKind { ...@@ -87,7 +87,6 @@ enum class PaddleEngineKind {
class PaddlePredictor { class PaddlePredictor {
public: public:
struct Config; struct Config;
PaddlePredictor() = default;
PaddlePredictor(const PaddlePredictor&) = delete; PaddlePredictor(const PaddlePredictor&) = delete;
PaddlePredictor& operator=(const PaddlePredictor&) = delete; PaddlePredictor& operator=(const PaddlePredictor&) = delete;
...@@ -107,6 +106,9 @@ class PaddlePredictor { ...@@ -107,6 +106,9 @@ class PaddlePredictor {
struct Config { struct Config {
std::string model_dir; // path to the model directory. std::string model_dir; // path to the model directory.
}; };
protected:
PaddlePredictor() = default;
}; };
struct PaddleMobileConfig : public PaddlePredictor::Config { struct PaddleMobileConfig : public PaddlePredictor::Config {
......
...@@ -19,10 +19,9 @@ namespace paddle_mobile { ...@@ -19,10 +19,9 @@ namespace paddle_mobile {
template <typename Dtype, Precision P> template <typename Dtype, Precision P>
void PaddleMobile<Dtype, P>::SetThreadNum(int num) { void PaddleMobile<Dtype, P>::SetThreadNum(int num) {
#ifdef _OPENMP #ifdef _OPENMP
// omp_set_dynamic(0);
omp_set_num_threads(num); omp_set_num_threads(num);
#endif #endif
}; }
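SetThreadNum is a thin pass-through: it forwards to omp_set_num_threads only when the build defines _OPENMP (USE_OPENMP=ON), and is otherwise a no-op. A usage sketch assuming the library headers, with "./mobilenet" as a placeholder model directory:

#include "io/paddle_mobile.h"

int main() {
  paddle_mobile::PaddleMobile<paddle_mobile::CPU> engine;
  engine.SetThreadNum(4);  // effective only in an OpenMP-enabled build
  engine.Load("./mobilenet", /*optimize=*/true);
  return 0;
}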
template <typename Dtype, Precision P> template <typename Dtype, Precision P>
bool PaddleMobile<Dtype, P>::Load(const std::string &dirname, bool optimize, bool PaddleMobile<Dtype, P>::Load(const std::string &dirname, bool optimize,
...@@ -128,40 +127,38 @@ PaddleMobile<Dtype, P>::~PaddleMobile() { ...@@ -128,40 +127,38 @@ PaddleMobile<Dtype, P>::~PaddleMobile() {
template <typename Dtype, Precision P> template <typename Dtype, Precision P>
void PaddleMobile<Dtype, P>::InjectVariable(const framework::Tensor &t, void PaddleMobile<Dtype, P>::InjectVariable(const framework::Tensor &t,
string var_name) { std::string var_name) {
executor_->InjectVariable(t, var_name); executor_->InjectVariable(t, var_name);
} }
template <typename Dtype, Precision P> template <typename Dtype, Precision P>
void PaddleMobile<Dtype, P>::FeedData(const framework::Tensor &t) { void PaddleMobile<Dtype, P>::FeedData(const framework::Tensor &t) {
executor_->FeedData(t); executor_->FeedData(t);
}; }
template <typename Dtype, Precision P> template <typename Dtype, Precision P>
std::shared_ptr<framework::Tensor> PaddleMobile<Dtype, P>::FetchResult(int id) { std::shared_ptr<framework::Tensor> PaddleMobile<Dtype, P>::FetchResult(int id) {
return executor_->FetchResult(id); return executor_->FetchResult(id);
}; }
template <typename Dtype, Precision P> template <typename Dtype, Precision P>
void PaddleMobile<Dtype, P>::Predict_From_To(int start, int end) { void PaddleMobile<Dtype, P>::Predict_From_To(int start, int end) {
executor_->Predict_From_To(start, end); executor_->Predict_From_To(start, end);
}; }
template <typename Dtype, Precision P> template <typename Dtype, Precision P>
void PaddleMobile<Dtype, P>::Predict_From(int start) { void PaddleMobile<Dtype, P>::Predict_From(int start) {
executor_->Predict_From(start); executor_->Predict_From(start);
}; }
template <typename Dtype, Precision P> template <typename Dtype, Precision P>
void PaddleMobile<Dtype, P>::Predict_To(int end) { void PaddleMobile<Dtype, P>::Predict_To(int end) {
executor_->Predict_To(end); executor_->Predict_To(end);
}; }
#endif #endif
template class PaddleMobile<CPU, Precision::FP32>; template class PaddleMobile<CPU, Precision::FP32>;
template class PaddleMobile<FPGA, Precision::FP32>; template class PaddleMobile<FPGA, Precision::FP32>;
template class PaddleMobile<GPU_MALI, Precision::FP32>; template class PaddleMobile<GPU_MALI, Precision::FP32>;
} // namespace paddle_mobile } // namespace paddle_mobile
...@@ -22,6 +22,7 @@ limitations under the License. */ ...@@ -22,6 +22,7 @@ limitations under the License. */
#endif // _OPENMP #endif // _OPENMP
#include "common/types.h" #include "common/types.h"
#include "framework/load_ops.h"
#include "framework/tensor.h" #include "framework/tensor.h"
#include "io/executor.h" #include "io/executor.h"
#include "io/loader.h" #include "io/loader.h"
...@@ -34,74 +35,42 @@ class PaddleMobile { ...@@ -34,74 +35,42 @@ class PaddleMobile {
public: public:
PaddleMobile() {} PaddleMobile() {}
/*
* @b load separate format fluid model
* @b load a fluid model saved as separate program and param files
* */
bool Load(const std::string &dirname, bool optimize = false, bool Load(const std::string &dirname, bool optimize = false,
bool quantification = false, int batch_size = 1, bool quantification = false, int batch_size = 1,
bool loddable = false); bool loddable = false);
/*
* @b load combined format fluid model
* @b load a fluid model whose program and params are saved in combined form
* */
bool Load(const std::string &model_path, const std::string &para_path, bool Load(const std::string &model_path, const std::string &para_path,
bool optimize = false, bool quantification = false, bool optimize = false, bool quantification = false,
int batch_size = 1, bool loddable = false); int batch_size = 1, bool loddable = false);
/*
* @b set the number of threads; takes effect only when openmp is enabled in cmake
* */
void SetThreadNum(int num);
/*
* @b to predict
* */
std::shared_ptr<framework::Tensor> Predict(const framework::Tensor &t); std::shared_ptr<framework::Tensor> Predict(const framework::Tensor &t);
/*
* @b to predict
* */
std::shared_ptr<framework::Tensor> PredictLod(const framework::LoDTensor &t); std::shared_ptr<framework::Tensor> PredictLod(const framework::LoDTensor &t);
/*
* @b to predict with vector and dim
*
* @b predict using the input data and its dimension info
* */
std::vector<Ptype> Predict(const std::vector<Ptype> &input, std::vector<Ptype> Predict(const std::vector<Ptype> &input,
const std::vector<int64_t> &dims); const std::vector<int64_t> &dims);
/**
* interface for loading the model and combined params from memory
*
* @param model_len size in bytes of the in-memory model file
* @param model_buf buffer holding the model file
* @param combined_params_len size in bytes of the in-memory params file
* @param combined_params_buf buffer holding the params file
* @return
*/
bool LoadCombinedMemory(size_t model_len, const uint8_t *model_buf, bool LoadCombinedMemory(size_t model_len, const uint8_t *model_buf,
size_t combined_params_len, size_t combined_params_len,
const uint8_t *combined_params_buf); const uint8_t *combined_params_buf);
void SetThreadNum(int num);
void Clear(); void Clear();
~PaddleMobile(); ~PaddleMobile();
private:
std::shared_ptr<Loader<Dtype, P>> loader_;
std::shared_ptr<Executor<Dtype, P>> executor_;
#ifdef PADDLE_MOBILE_FPGA #ifdef PADDLE_MOBILE_FPGA
public: void InjectVariable(const framework::Tensor &t, std::string var_name);
void InjectVariable(const framework::Tensor &t, string var_name);
void FeedData(const framework::Tensor &t); void FeedData(const framework::Tensor &t);
std::shared_ptr<framework::Tensor> FetchResult(int id = -1); std::shared_ptr<framework::Tensor> FetchResult(int id = -1);
void Predict_From_To(int start = 0, int end = -1); void Predict_From_To(int start = 0, int end = -1);
void Predict_From(int start); void Predict_From(int start);
void Predict_To(int end); void Predict_To(int end);
#endif #endif
private:
std::shared_ptr<Loader<Dtype, P>> loader_;
std::shared_ptr<Executor<Dtype, P>> executor_;
}; };
} // namespace paddle_mobile } // namespace paddle_mobile
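LoadCombinedMemory, declared in the class above, takes the model and combined-params files as raw byte buffers instead of paths. A usage sketch; keeping the buffers alive for the engine's lifetime is the safe assumption, since the combined-params path can borrow the buffer rather than copy it.

#include <cstdint>
#include <vector>
#include "io/paddle_mobile.h"

int main() {
  // Byte buffers assumed to be filled elsewhere (embedded in the app,
  // downloaded, decrypted, ...).
  std::vector<uint8_t> model_buf;   // __model__ file contents
  std::vector<uint8_t> params_buf;  // combined params file contents
  paddle_mobile::PaddleMobile<paddle_mobile::CPU> engine;
  bool ok = engine.LoadCombinedMemory(model_buf.size(), model_buf.data(),
                                      params_buf.size(), params_buf.data());
  return ok ? 0 : 1;
}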
...@@ -13,15 +13,12 @@ ...@@ -13,15 +13,12 @@
limitations under the License. */ limitations under the License. */
#import "PaddleMobileCPU.h" #import "PaddleMobileCPU.h"
#import "framework/load_ops.h"
#import "op_symbols.h" #import "framework/tensor.h"
#include "framework/tensor.h"
#import "io/paddle_mobile.h" #import "io/paddle_mobile.h"
#import <memory> #import <memory>
#import <vector> #import <vector>
@interface PaddleMobileCPUResult() @interface PaddleMobileCPUResult()
-(void)toSetOutput:(float *)output; -(void)toSetOutput:(float *)output;
......
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include "operators/batchnorm_op.h"
#include "operators/bilinear_interp_op.h"
#include "operators/box_coder_op.h"
#include "operators/concat_op.h"
#include "operators/conv_op.h"
#include "operators/conv_transpose_op.h"
#include "operators/crf_op.h"
#include "operators/depthwise_conv_op.h"
#include "operators/dropout_op.h"
#include "operators/elementwise_add_op.h"
#include "operators/feed_op.h"
#include "operators/fetch_op.h"
#include "operators/flatten_op.h"
#include "operators/fusion_conv_add.h"
#include "operators/fusion_conv_add_add_prelu_op.h"
#include "operators/fusion_conv_add_bn_op.h"
#include "operators/fusion_conv_add_bn_relu_op.h"
#include "operators/fusion_conv_add_prelu_op.h"
#include "operators/fusion_conv_add_relu_op.h"
#include "operators/fusion_conv_bn_add_relu_op.h"
#include "operators/fusion_conv_bn_relu_op.h"
#include "operators/fusion_dwconv_bn_relu_op.h"
#include "operators/fusion_elementwise_add_relu_op.h"
#include "operators/fusion_fc_op.h"
#include "operators/fusion_fc_relu_op.h"
#include "operators/gru_op.h"
#include "operators/im2sequence_op.h"
#include "operators/lookup_op.h"
#include "operators/lrn_op.h"
#include "operators/mul_op.h"
#include "operators/multiclass_nms_op.h"
#include "operators/pool_op.h"
#include "operators/prelu_op.h"
#include "operators/prior_box_op.h"
#include "operators/relu_op.h"
#include "operators/reshape_op.h"
#include "operators/resize_op.h"
#include "operators/scale_op.h"
#include "operators/shape_op.h"
#include "operators/sigmoid_op.h"
#include "operators/slice_op.h"
#include "operators/softmax_op.h"
#include "operators/split_op.h"
#include "operators/transpose_op.h"
...@@ -46,13 +46,4 @@ class BatchNormOp ...@@ -46,13 +46,4 @@ class BatchNormOp
} // namespace operators } // namespace operators
} // namespace paddle_mobile } // namespace paddle_mobile
#ifdef PADDLE_MOBILE_CPU
USE_OP_CPU(batch_norm);
#endif
#ifdef PADDLE_MOBILE_MALI_GPU
USE_OP_MALI_GPU(batch_norm);
#endif
#ifdef PADDLE_MOBILE_FPGA
#endif
#endif #endif
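This and the following op headers drop their per-platform USE_OP_* blocks; the declarations are centralized instead (see the framework/load_ops.h include added to the public header earlier). The general idiom such registration macros rely on is static-initialization-time insertion into a registry. A generic sketch of that idiom only; the framework's actual REGISTER_OPERATOR_* / USE_OP_* macros differ in their details.

#include <cstdio>
#include <functional>
#include <map>
#include <string>

struct OpRegistry {
  static std::map<std::string, std::function<void()>> &Table() {
    static std::map<std::string, std::function<void()>> table;
    return table;
  }
};

struct Registrar {
  Registrar(const std::string &name, std::function<void()> factory) {
    OpRegistry::Table()[name] = factory;  // runs during static initialization
  }
};

// A REGISTER-style macro expands to a uniquely named static Registrar, so
// simply linking the op's translation unit makes it visible in the table.
#define REGISTER_OP_SKETCH(op_name) \
  static Registrar registrar_##op_name(#op_name, [] { /* construct the op */ })

REGISTER_OP_SKETCH(batch_norm);

int main() {
  std::printf("registered ops: %zu\n", OpRegistry::Table().size());  // prints 1
}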
...@@ -50,12 +50,4 @@ class BilinearOp : public framework::OperatorWithKernel< ...@@ -50,12 +50,4 @@ class BilinearOp : public framework::OperatorWithKernel<
} // namespace operators } // namespace operators
} // namespace paddle_mobile } // namespace paddle_mobile
#ifdef PADDLE_MOBILE_CPU
USE_OP_CPU(bilinear_interp);
#endif
#ifdef PADDLE_MOBILE_MALI_GPU
#endif
#ifdef PADDLE_MOBILE_FPGA
#endif
#endif #endif
...@@ -51,12 +51,4 @@ class BoxCoderOp : public framework::OperatorWithKernel< ...@@ -51,12 +51,4 @@ class BoxCoderOp : public framework::OperatorWithKernel<
} // namespace operators } // namespace operators
} // namespace paddle_mobile } // namespace paddle_mobile
#ifdef PADDLE_MOBILE_CPU
USE_OP_CPU(box_coder);
#endif
#ifdef PADDLE_MOBILE_MALI_GPU
#endif
#ifdef PADDLE_MOBILE_FPGA
#endif
#endif #endif
...@@ -46,14 +46,4 @@ class ConcatOp : public framework::OperatorWithKernel< ...@@ -46,14 +46,4 @@ class ConcatOp : public framework::OperatorWithKernel<
} // namespace operators } // namespace operators
} // namespace paddle_mobile } // namespace paddle_mobile
#ifdef PADDLE_MOBILE_CPU
USE_OP_CPU(concat);
#endif
#ifdef PADDLE_MOBILE_MALI_GPU
USE_OP_MALI_GPU(concat);
#endif
#ifdef PADDLE_MOBILE_FPGA
USE_OP_FPGA(concat);
#endif
#endif #endif
...@@ -46,14 +46,4 @@ class ConvOp : public framework::OperatorWithKernel< ...@@ -46,14 +46,4 @@ class ConvOp : public framework::OperatorWithKernel<
} // namespace operators } // namespace operators
} // namespace paddle_mobile } // namespace paddle_mobile
#ifdef PADDLE_MOBILE_CPU
USE_OP_CPU(conv2d);
#endif
#ifdef PADDLE_MOBILE_MALI_GPU
USE_OP_MALI_GPU(conv2d);
#endif
#ifdef PADDLE_MOBILE_FPGA
USE_OP_FPGA(conv2d);
#endif
#endif #endif
...@@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ...@@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#ifdef CONV_TRANSPOSE #ifdef CONV_TRANSPOSE_OP
#include "operators/conv_transpose_op.h" #include "operators/conv_transpose_op.h"
......
...@@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ...@@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#ifdef CONV_TRANSPOSE #ifdef CONV_TRANSPOSE_OP
#pragma once #pragma once
...@@ -88,14 +88,4 @@ class ConvOpTranspose : public framework::OperatorWithKernel< ...@@ -88,14 +88,4 @@ class ConvOpTranspose : public framework::OperatorWithKernel<
} // namespace operators } // namespace operators
} // namespace paddle_mobile } // namespace paddle_mobile
#ifdef PADDLE_MOBILE_CPU
USE_OP_CPU(conv2d_transpose);
#endif
#ifdef PADDLE_MOBILE_MALI_GPU
USE_OP_MALI_GPU(conv2d_transpose);
#endif
#ifdef PADDLE_MOBILE_FPGA
USE_OP_FPGA(conv2d_transpose);
#endif
#endif #endif
...@@ -47,12 +47,4 @@ class CrfOp : public framework::OperatorWithKernel< ...@@ -47,12 +47,4 @@ class CrfOp : public framework::OperatorWithKernel<
} // namespace operators } // namespace operators
} // namespace paddle_mobile } // namespace paddle_mobile
#ifdef PADDLE_MOBILE_CPU
USE_OP_CPU(crf_decoding);
#endif
#ifdef PADDLE_MOBILE_MALI_GPU
#endif
#ifdef PADDLE_MOBILE_FPGA
#endif
#endif #endif
...@@ -56,9 +56,5 @@ namespace ops = paddle_mobile::operators; ...@@ -56,9 +56,5 @@ namespace ops = paddle_mobile::operators;
#ifdef PADDLE_MOBILE_CPU #ifdef PADDLE_MOBILE_CPU
REGISTER_OPERATOR_CPU(depthwise_conv2d, ops::DepthwiseConvOp); REGISTER_OPERATOR_CPU(depthwise_conv2d, ops::DepthwiseConvOp);
#endif #endif
#ifdef PADDLE_MOBILE_MALI_GPU
#endif
#ifdef PADDLE_MOBILE_FPGA
#endif
#endif #endif
...@@ -48,12 +48,4 @@ class DepthwiseConvOp : public framework::OperatorWithKernel< ...@@ -48,12 +48,4 @@ class DepthwiseConvOp : public framework::OperatorWithKernel<
} // namespace operators } // namespace operators
} // namespace paddle_mobile } // namespace paddle_mobile
#ifdef PADDLE_MOBILE_CPU
USE_OP_CPU(depthwise_conv2d);
#endif
#ifdef PADDLE_MOBILE_MALI_GPU
#endif
#ifdef PADDLE_MOBILE_FPGA
#endif
#endif #endif
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "operators/dequantize_op.h"
namespace paddle_mobile {
namespace operators {
template <typename DeviceType, typename T>
void DequantizeOp<DeviceType, T>::InferShape() const {
const auto& input_dims = this->param_.input_->dims();
this->param_.out_->Resize(input_dims);
}
} // namespace operators
} // namespace paddle_mobile
namespace ops = paddle_mobile::operators;
#ifdef PADDLE_MOBILE_CPU
REGISTER_OPERATOR_CPU(dequantize, ops::DequantizeOp);
#endif
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <string>
#include "framework/operator.h"
#include "operators/kernel/dequantize_kernel.h"
#include "operators/op_param.h"
namespace paddle_mobile {
namespace operators {
template <typename DeviceType, typename T>
class DequantizeOp
: public framework::OperatorWithKernel<DeviceType,
DequantizeParam<DeviceType>,
DequantizeKernel<DeviceType, T>> {
public:
DequantizeOp(const std::string &type, const VariableNameMap &inputs,
const VariableNameMap &outputs,
const framework::AttributeMap &attrs,
std::shared_ptr<framework::Scope> scope)
: framework::OperatorWithKernel<DeviceType, DequantizeParam<DeviceType>,
DequantizeKernel<DeviceType, T>>(
type, inputs, outputs, attrs, scope) {}
// inference output shape
void InferShape() const override;
};
} // namespace operators
} // namespace paddle_mobile
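DequantizeOp's InferShape only propagates the input dims; the interesting work happens in the kernel. A sketch of the elementwise idea behind dequantization (out = scale * quantized); how the framework's kernel derives and combines its scale factors is not shown in this hunk, so the single `scale` parameter here is an assumption.

#include <cstddef>
#include <cstdint>
#include <vector>

std::vector<float> Dequantize(const std::vector<int32_t> &quantized, float scale) {
  std::vector<float> out(quantized.size());
  for (std::size_t i = 0; i < quantized.size(); ++i) {
    out[i] = scale * static_cast<float>(quantized[i]);
  }
  return out;
}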
...@@ -30,8 +30,6 @@ namespace ops = paddle_mobile::operators; ...@@ -30,8 +30,6 @@ namespace ops = paddle_mobile::operators;
#ifdef PADDLE_MOBILE_CPU #ifdef PADDLE_MOBILE_CPU
REGISTER_OPERATOR_CPU(dropout, ops::DropoutOp); REGISTER_OPERATOR_CPU(dropout, ops::DropoutOp);
#endif #endif
#ifdef PADDLE_MOBILE_MALI_GPU
#endif
#ifdef PADDLE_MOBILE_FPGA #ifdef PADDLE_MOBILE_FPGA
REGISTER_OPERATOR_FPGA(dropout, ops::DropoutOp); REGISTER_OPERATOR_FPGA(dropout, ops::DropoutOp);
#endif #endif
......
...@@ -50,13 +50,4 @@ class DropoutOp : public framework::OperatorWithKernel< ...@@ -50,13 +50,4 @@ class DropoutOp : public framework::OperatorWithKernel<
} // namespace operators } // namespace operators
} // namespace paddle_mobile } // namespace paddle_mobile
#ifdef PADDLE_MOBILE_CPU
USE_OP_CPU(dropout);
#endif
#ifdef PADDLE_MOBILE_MALI_GPU
#endif
#ifdef PADDLE_MOBILE_FPGA
USE_OP_FPGA(dropout);
#endif
#endif #endif
...@@ -35,7 +35,5 @@ REGISTER_OPERATOR_CPU(elementwise_add, ops::ElementwiseAddOp); ...@@ -35,7 +35,5 @@ REGISTER_OPERATOR_CPU(elementwise_add, ops::ElementwiseAddOp);
#ifdef PADDLE_MOBILE_MALI_GPU #ifdef PADDLE_MOBILE_MALI_GPU
REGISTER_OPERATOR_MALI_GPU(elementwise_add, ops::ElementwiseAddOp); REGISTER_OPERATOR_MALI_GPU(elementwise_add, ops::ElementwiseAddOp);
#endif #endif
#ifdef PADDLE_MOBILE_FPGA
#endif
#endif #endif
...@@ -48,13 +48,4 @@ class ElementwiseAddOp : public framework::OperatorWithKernel< ...@@ -48,13 +48,4 @@ class ElementwiseAddOp : public framework::OperatorWithKernel<
} // namespace operators } // namespace operators
} // namespace paddle_mobile } // namespace paddle_mobile
#ifdef PADDLE_MOBILE_CPU
USE_OP_CPU(elementwise_add);
#endif
#ifdef PADDLE_MOBILE_MALI_GPU
USE_OP_MALI_GPU(elementwise_add);
#endif
#ifdef PADDLE_MOBILE_FPGA
#endif
#endif #endif
...@@ -12,12 +12,10 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ...@@ -12,12 +12,10 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#include "feed_op.h" #include "operators/feed_op.h"
namespace paddle_mobile {
namespace operators {}
} // namespace paddle_mobile
namespace ops = paddle_mobile::operators; namespace ops = paddle_mobile::operators;
#ifdef PADDLE_MOBILE_CPU #ifdef PADDLE_MOBILE_CPU
REGISTER_OPERATOR_CPU(feed, ops::FeedOp); REGISTER_OPERATOR_CPU(feed, ops::FeedOp);
#endif #endif
......
...@@ -20,11 +20,11 @@ limitations under the License. */ ...@@ -20,11 +20,11 @@ limitations under the License. */
namespace paddle_mobile { namespace paddle_mobile {
namespace operators { namespace operators {
using std::string;
template <typename DeviceType, typename T> template <typename DeviceType, typename T>
class FeedOp : public framework::OperatorBase<DeviceType> { class FeedOp : public framework::OperatorBase<DeviceType> {
public: public:
FeedOp(const string &type, const VariableNameMap &inputs, FeedOp(const std::string &type, const VariableNameMap &inputs,
const VariableNameMap &outputs, const framework::AttributeMap attrs, const VariableNameMap &outputs, const framework::AttributeMap attrs,
std::shared_ptr<framework::Scope> scope) std::shared_ptr<framework::Scope> scope)
: framework::OperatorBase<DeviceType>(type, inputs, outputs, attrs, : framework::OperatorBase<DeviceType>(type, inputs, outputs, attrs,
...@@ -35,10 +35,6 @@ class FeedOp : public framework::OperatorBase<DeviceType> { ...@@ -35,10 +35,6 @@ class FeedOp : public framework::OperatorBase<DeviceType> {
auto out_dims = param_.Out()->dims(); auto out_dims = param_.Out()->dims();
out_dims[0] = param_.BatchSize(); out_dims[0] = param_.BatchSize();
param_.Out()->Resize(out_dims); param_.Out()->Resize(out_dims);
// note: on mobile, InferShape is called when the executor is created,
// so do not pass lod here; it is empty.
} }
#ifdef PADDLE_MOBILE_FPGA #ifdef PADDLE_MOBILE_FPGA
...@@ -49,7 +45,7 @@ class FeedOp : public framework::OperatorBase<DeviceType> { ...@@ -49,7 +45,7 @@ class FeedOp : public framework::OperatorBase<DeviceType> {
} }
void RunImpl() const { void RunImpl() const {
auto input = (Tensor *)const_cast<LoDTensor *>(param_.InputX()); auto input = (Tensor *)const_cast<LoDTensor *>(param_.InputX()); // NOLINT
fpga::format_image(input); fpga::format_image(input);
auto input_ptr = input->data<float>(); auto input_ptr = input->data<float>();
Tensor *output = param_.Out(); Tensor *output = param_.Out();
...@@ -61,7 +57,7 @@ class FeedOp : public framework::OperatorBase<DeviceType> { ...@@ -61,7 +57,7 @@ class FeedOp : public framework::OperatorBase<DeviceType> {
args.output_data_type = fpga::DATA_TYPE_FP16; args.output_data_type = fpga::DATA_TYPE_FP16;
args.input_layout_type = fpga::LAYOUT_CHW; args.input_layout_type = fpga::LAYOUT_CHW;
args.output_layout_type = fpga::LAYOUT_HWC; args.output_layout_type = fpga::LAYOUT_HWC;
args.image.address = (void *)input_ptr; args.image.address = (void *)input_ptr; // NOLINT
args.image.channels = (uint32_t)input->dims()[1]; args.image.channels = (uint32_t)input->dims()[1];
args.image.height = (uint32_t)input->dims()[2]; args.image.height = (uint32_t)input->dims()[2];
args.image.width = (uint32_t)input->dims()[3]; args.image.width = (uint32_t)input->dims()[3];
...@@ -86,13 +82,3 @@ class FeedOp : public framework::OperatorBase<DeviceType> { ...@@ -86,13 +82,3 @@ class FeedOp : public framework::OperatorBase<DeviceType> {
} // namespace operators } // namespace operators
} // namespace paddle_mobile } // namespace paddle_mobile
#ifdef PADDLE_MOBILE_CPU
USE_OP_CPU(feed);
#endif
#ifdef PADDLE_MOBILE_MALI_GPU
USE_OP_MALI_GPU(feed);
#endif
#ifdef PADDLE_MOBILE_FPGA
USE_OP_FPGA(feed);
#endif
...@@ -12,10 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ...@@ -12,10 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#include "fetch_op.h" #include "operators/fetch_op.h"
namespace paddle_mobile {
namespace operators {}
} // namespace paddle_mobile
namespace ops = paddle_mobile::operators; namespace ops = paddle_mobile::operators;
#ifdef PADDLE_MOBILE_CPU #ifdef PADDLE_MOBILE_CPU
......
...@@ -46,13 +46,3 @@ class FetchOp : public framework::OperatorBase<DeviceType> { ...@@ -46,13 +46,3 @@ class FetchOp : public framework::OperatorBase<DeviceType> {
} // namespace operators } // namespace operators
} // namespace paddle_mobile } // namespace paddle_mobile
#ifdef PADDLE_MOBILE_CPU
USE_OP_CPU(fetch);
#endif
#ifdef PADDLE_MOBILE_MALI_GPU
USE_OP_MALI_GPU(fetch);
#endif
#ifdef PADDLE_MOBILE_FPGA
USE_OP_FPGA(fetch);
#endif
...@@ -53,8 +53,6 @@ namespace ops = paddle_mobile::operators; ...@@ -53,8 +53,6 @@ namespace ops = paddle_mobile::operators;
#ifdef PADDLE_MOBILE_CPU #ifdef PADDLE_MOBILE_CPU
REGISTER_OPERATOR_CPU(flatten, ops::FlattenOp); REGISTER_OPERATOR_CPU(flatten, ops::FlattenOp);
#endif #endif
#ifdef PADDLE_MOBILE_MALI_GPU
#endif
#ifdef PADDLE_MOBILE_FPGA #ifdef PADDLE_MOBILE_FPGA
#endif #endif
......
...@@ -63,12 +63,4 @@ class FlattenOp : public framework::OperatorWithKernel< ...@@ -63,12 +63,4 @@ class FlattenOp : public framework::OperatorWithKernel<
} // namespace operators } // namespace operators
} // namespace paddle_mobile } // namespace paddle_mobile
#ifdef PADDLE_MOBILE_CPU
USE_OP_CPU(flatten);
#endif
#ifdef PADDLE_MOBILE_MALI_GPU
#endif
#ifdef PADDLE_MOBILE_FPGA
#endif
#endif #endif
...@@ -14,7 +14,7 @@ limitations under the License. */ ...@@ -14,7 +14,7 @@ limitations under the License. */
#ifdef FUSION_CONVADDADDPRELU_OP #ifdef FUSION_CONVADDADDPRELU_OP
#include "fusion_conv_add_add_prelu_op.h" #include "operators/fusion_conv_add_add_prelu_op.h"
#include "operators/math/conv_func.h" #include "operators/math/conv_func.h"
namespace paddle_mobile { namespace paddle_mobile {
...@@ -48,13 +48,14 @@ void FusionConvAddAddPReluOp<Dtype, T>::InferShape() const { ...@@ -48,13 +48,14 @@ void FusionConvAddAddPReluOp<Dtype, T>::InferShape() const {
} // namespace paddle_mobile } // namespace paddle_mobile
namespace ops = paddle_mobile::operators; namespace ops = paddle_mobile::operators;
REGISTER_FUSION_MATCHER(fusion_conv_add_add_prelu,
ops::FusionConvAddAddPReluOpMatcher);
#ifdef PADDLE_MOBILE_CPU #ifdef PADDLE_MOBILE_CPU
REGISTER_OPERATOR_CPU(fusion_conv_add_add_prelu, ops::FusionConvAddAddPReluOp); REGISTER_OPERATOR_CPU(fusion_conv_add_add_prelu, ops::FusionConvAddAddPReluOp);
#endif #endif
#ifdef PADDLE_MOBILE_MALI_GPU
#endif
#ifdef PADDLE_MOBILE_FPGA #ifdef PADDLE_MOBILE_FPGA
REGISTER_OPERATOR_FPGA(fusion_conv_add_add_prelu, ops::FusionConvAddAddPReluOp); REGISTER_OPERATOR_FPGA(fusion_conv_add_add_prelu, ops::FusionConvAddAddPReluOp);
#endif #endif
#endif #endif // FUSION_CONVADDADDPRELU_OP
...@@ -76,37 +76,7 @@ class FusionConvAddAddPReluOp ...@@ -76,37 +76,7 @@ class FusionConvAddAddPReluOp
protected: protected:
}; };
#ifdef PADDLE_MOBILE_CPU
#ifndef CONV_ADD_ADD_PRELU_REGISTER
#define CONV_ADD_ADD_PRELU_REGISTER
static framework::FusionOpRegistrar fusion_conv_add_add_prelu_registrar(
new FusionConvAddAddPReluOpMatcher());
#endif
#endif
#ifdef PADDLE_MOBILE_MALI_GPU
#endif
#ifdef PADDLE_MOBILE_FPGA
#ifndef CONV_ADD_ADD_PRELU_REGISTER
#define CONV_ADD_ADD_PRELU_REGISTER
static framework::FusionOpRegistrar fusion_conv_add_add_prelu_registrar(
new FusionConvAddAddPReluOpMatcher());
#endif
#endif
} // namespace operators } // namespace operators
} // namespace paddle_mobile } // namespace paddle_mobile
#ifdef PADDLE_MOBILE_CPU
USE_OP_CPU(fusion_conv_add_add_prelu);
#endif
#ifdef PADDLE_MOBILE_MALI_GPU
#endif
#ifdef PADDLE_MOBILE_FPGA
USE_OP_FPGA(fusion_conv_add_add_prelu);
#endif
#endif #endif
...@@ -49,11 +49,11 @@ void FusionConvAddBNOp<Dtype, T>::InferShape() const { ...@@ -49,11 +49,11 @@ void FusionConvAddBNOp<Dtype, T>::InferShape() const {
} // namespace paddle_mobile } // namespace paddle_mobile
namespace ops = paddle_mobile::operators; namespace ops = paddle_mobile::operators;
REGISTER_FUSION_MATCHER(fusion_conv_add_bn, ops::FusionConvAddBNMatcher);
#ifdef PADDLE_MOBILE_CPU #ifdef PADDLE_MOBILE_CPU
REGISTER_OPERATOR_CPU(fusion_conv_add_bn, ops::FusionConvAddBNOp); REGISTER_OPERATOR_CPU(fusion_conv_add_bn, ops::FusionConvAddBNOp);
#endif #endif
#ifdef PADDLE_MOBILE_MALI_GPU
#endif
#ifdef PADDLE_MOBILE_FPGA #ifdef PADDLE_MOBILE_FPGA
REGISTER_OPERATOR_FPGA(fusion_conv_add_bn, ops::FusionConvAddBNOp); REGISTER_OPERATOR_FPGA(fusion_conv_add_bn, ops::FusionConvAddBNOp);
#endif #endif
......
...@@ -70,46 +70,7 @@ class FusionConvAddBNOp : public framework::OperatorWithKernel< ...@@ -70,46 +70,7 @@ class FusionConvAddBNOp : public framework::OperatorWithKernel<
protected: protected:
}; };
#ifdef PADDLE_MOBILE_CPU
#ifndef FUSION_CONV_ADD_BN_REGISTER
static framework::FusionOpRegistrar fusion_conv_add_bn_registrar(
new FusionConvAddBNMatcher());
#define FUSION_CONV_ADD_BN_REGISTER
#endif
#endif
#ifdef PADDLE_MOBILE_MALI_GPU
#ifndef FUSION_CONV_ADD_BN_REGISTER
static framework::FusionOpRegistrar fusion_conv_add_bn_registrar(
new FusionConvAddBNMatcher());
#define FUSION_CONV_ADD_BN_REGISTER
#endif
#endif
#ifdef PADDLE_MOBILE_FPGA
#ifndef FUSION_CONV_ADD_BN_REGISTER
static framework::FusionOpRegistrar fusion_conv_add_bn_registrar(
new FusionConvAddBNMatcher());
#define FUSION_CONV_ADD_BN_REGISTER
#endif
#endif
} // namespace operators } // namespace operators
} // namespace paddle_mobile } // namespace paddle_mobile
#ifdef PADDLE_MOBILE_CPU
USE_OP_CPU(fusion_conv_add_bn);
#endif
#ifdef PADDLE_MOBILE_MALI_GPU
#endif
#ifdef PADDLE_MOBILE_FPGA
USE_OP_FPGA(fusion_conv_add_bn);
#endif
#endif #endif
...@@ -49,11 +49,12 @@ void FusionConvAddBNReluOp<Dtype, T>::InferShape() const { ...@@ -49,11 +49,12 @@ void FusionConvAddBNReluOp<Dtype, T>::InferShape() const {
} // namespace paddle_mobile } // namespace paddle_mobile
namespace ops = paddle_mobile::operators; namespace ops = paddle_mobile::operators;
REGISTER_FUSION_MATCHER(fusion_conv_add_bn_relu,
ops::FusionConvAddBNReluMatcher);
#ifdef PADDLE_MOBILE_CPU #ifdef PADDLE_MOBILE_CPU
REGISTER_OPERATOR_CPU(fusion_conv_add_bn_relu, ops::FusionConvAddBNReluOp); REGISTER_OPERATOR_CPU(fusion_conv_add_bn_relu, ops::FusionConvAddBNReluOp);
#endif #endif
#ifdef PADDLE_MOBILE_MALI_GPU
#endif
#ifdef PADDLE_MOBILE_FPGA #ifdef PADDLE_MOBILE_FPGA
REGISTER_OPERATOR_FPGA(fusion_conv_add_bn_relu, ops::FusionConvAddBNReluOp); REGISTER_OPERATOR_FPGA(fusion_conv_add_bn_relu, ops::FusionConvAddBNReluOp);
#endif #endif
......
...@@ -75,46 +75,7 @@ class FusionConvAddBNReluOp ...@@ -75,46 +75,7 @@ class FusionConvAddBNReluOp
protected: protected:
}; };
#ifdef PADDLE_MOBILE_CPU
#ifndef FUSION_CONV_ADD_BN_RELU_REGISTER
static framework::FusionOpRegistrar fusion_conv_add_bn_relu_registrar(
new FusionConvAddBNReluMatcher());
#define FUSION_CONV_ADD_BN_RELU_REGISTER
#endif
#endif
#ifdef PADDLE_MOBILE_MALI_GPU
#ifndef FUSION_CONV_ADD_BN_RELU_REGISTER
static framework::FusionOpRegistrar fusion_conv_add_bn_relu_registrar(
new FusionConvAddBNReluMatcher());
#define FUSION_CONV_ADD_BN_RELU_REGISTER
#endif
#endif
#ifdef PADDLE_MOBILE_FPGA
#ifndef FUSION_CONV_ADD_BN_RELU_REGISTER
static framework::FusionOpRegistrar fusion_conv_add_bn_relu_registrar(
new FusionConvAddBNReluMatcher());
#define FUSION_CONV_ADD_BN_RELU_REGISTER
#endif
#endif
} // namespace operators } // namespace operators
} // namespace paddle_mobile } // namespace paddle_mobile
#ifdef PADDLE_MOBILE_CPU
USE_OP_CPU(fusion_conv_add_bn_relu);
#endif
#ifdef PADDLE_MOBILE_MALI_GPU
#endif
#ifdef PADDLE_MOBILE_FPGA
USE_OP_FPGA(fusion_conv_add_bn_relu);
#endif
#endif #endif
...@@ -14,7 +14,7 @@ limitations under the License. */ ...@@ -14,7 +14,7 @@ limitations under the License. */
#ifdef FUSION_CONVADD_OP #ifdef FUSION_CONVADD_OP
#include "operators/fusion_conv_add.h" #include "operators/fusion_conv_add_op.h"
#include "operators/math/conv_func.h" #include "operators/math/conv_func.h"
namespace paddle_mobile { namespace paddle_mobile {
...@@ -49,13 +49,13 @@ void FusionConvAddOp<Dtype, T>::InferShape() const { ...@@ -49,13 +49,13 @@ void FusionConvAddOp<Dtype, T>::InferShape() const {
} // namespace paddle_mobile } // namespace paddle_mobile
namespace ops = paddle_mobile::operators; namespace ops = paddle_mobile::operators;
REGISTER_FUSION_MATCHER(fusion_conv_add, ops::FusionConvAddMatcher);
#ifdef PADDLE_MOBILE_CPU #ifdef PADDLE_MOBILE_CPU
REGISTER_OPERATOR_CPU(fusion_conv_add, ops::FusionConvAddOp); REGISTER_OPERATOR_CPU(fusion_conv_add, ops::FusionConvAddOp);
#endif #endif
#ifdef PADDLE_MOBILE_MALI_GPU #ifdef PADDLE_MOBILE_MALI_GPU
REGISTER_OPERATOR_MALI_GPU(fusion_conv_add, ops::FusionConvAddOp); REGISTER_OPERATOR_MALI_GPU(fusion_conv_add, ops::FusionConvAddOp);
#endif #endif
#ifdef PADDLE_MOBILE_FPGA
#endif
#endif #endif
...@@ -65,40 +65,7 @@ class FusionConvAddOp : public framework::OperatorWithKernel< ...@@ -65,40 +65,7 @@ class FusionConvAddOp : public framework::OperatorWithKernel<
protected: protected:
}; };
#ifdef PADDLE_MOBILE_CPU
#ifndef CONV_ADD_REGISTER
static framework::FusionOpRegistrar convadd_registrar(
new FusionConvAddMatcher());
#define CONV_ADD_REGISTER
#endif
#endif
#ifdef PADDLE_MOBILE_MALI_GPU
#ifndef CONV_ADD_REGISTER
static framework::FusionOpRegistrar convadd_registrar(
new FusionConvAddMatcher());
#define CONV_ADD_REGISTER
#endif
#endif
#ifdef PADDLE_MOBILE_FPGA
#endif
} // namespace operators } // namespace operators
} // namespace paddle_mobile } // namespace paddle_mobile
#ifdef PADDLE_MOBILE_CPU
USE_OP_CPU(fusion_conv_add);
#endif
#ifdef PADDLE_MOBILE_MALI_GPU
USE_OP_MALI_GPU(fusion_conv_add);
#endif
#ifdef PADDLE_MOBILE_FPGA
#endif
#endif #endif
...@@ -14,7 +14,7 @@ limitations under the License. */ ...@@ -14,7 +14,7 @@ limitations under the License. */
#ifdef FUSION_CONVADDPRELU_OP #ifdef FUSION_CONVADDPRELU_OP
#include "fusion_conv_add_prelu_op.h" #include "operators/fusion_conv_add_prelu_op.h"
#include "operators/math/conv_func.h" #include "operators/math/conv_func.h"
namespace paddle_mobile { namespace paddle_mobile {
...@@ -48,11 +48,12 @@ void FusionConvAddPReluOp<Dtype, T>::InferShape() const { ...@@ -48,11 +48,12 @@ void FusionConvAddPReluOp<Dtype, T>::InferShape() const {
} // namespace paddle_mobile } // namespace paddle_mobile
namespace ops = paddle_mobile::operators; namespace ops = paddle_mobile::operators;
REGISTER_FUSION_MATCHER(fusion_conv_add_prelu,
ops::FusionConvAddPReluOpMatcher);
#ifdef PADDLE_MOBILE_CPU #ifdef PADDLE_MOBILE_CPU
REGISTER_OPERATOR_CPU(fusion_conv_add_prelu, ops::FusionConvAddPReluOp); REGISTER_OPERATOR_CPU(fusion_conv_add_prelu, ops::FusionConvAddPReluOp);
#endif #endif
#ifdef PADDLE_MOBILE_MALI_GPU
#endif
#ifdef PADDLE_MOBILE_FPGA #ifdef PADDLE_MOBILE_FPGA
REGISTER_OPERATOR_FPGA(fusion_conv_add_prelu, ops::FusionConvAddPReluOp); REGISTER_OPERATOR_FPGA(fusion_conv_add_prelu, ops::FusionConvAddPReluOp);
#endif #endif
......
...@@ -71,37 +71,7 @@ class FusionConvAddPReluOp ...@@ -71,37 +71,7 @@ class FusionConvAddPReluOp
protected: protected:
}; };
#ifdef PADDLE_MOBILE_CPU
#ifndef CONV_ADD_PRELU_REGISTER
#define CONV_ADD_PRELU_REGISTER
static framework::FusionOpRegistrar fusion_conv_add_prelu_registrar(
new FusionConvAddPReluOpMatcher());
#endif
#endif
#ifdef PADDLE_MOBILE_MALI_GPU
#endif
#ifdef PADDLE_MOBILE_FPGA
#ifndef CONV_ADD_PRELU_REGISTER
#define CONV_ADD_PRELU_REGISTER
static framework::FusionOpRegistrar fusion_conv_add_prelu_registrar(
new FusionConvAddPReluOpMatcher());
#endif
#endif
} // namespace operators } // namespace operators
} // namespace paddle_mobile } // namespace paddle_mobile
#ifdef PADDLE_MOBILE_CPU
USE_OP_CPU(fusion_conv_add_prelu);
#endif
#ifdef PADDLE_MOBILE_MALI_GPU
#endif
#ifdef PADDLE_MOBILE_FPGA
USE_OP_FPGA(fusion_conv_add_prelu);
#endif
#endif #endif
...@@ -14,7 +14,7 @@ limitations under the License. */ ...@@ -14,7 +14,7 @@ limitations under the License. */
#ifdef FUSION_CONVADDRELU_OP #ifdef FUSION_CONVADDRELU_OP
#include "fusion_conv_add_relu_op.h" #include "operators/fusion_conv_add_relu_op.h"
#include "operators/math/conv_func.h" #include "operators/math/conv_func.h"
namespace paddle_mobile { namespace paddle_mobile {
...@@ -48,11 +48,11 @@ void FusionConvAddReluOp<Dtype, T>::InferShape() const { ...@@ -48,11 +48,11 @@ void FusionConvAddReluOp<Dtype, T>::InferShape() const {
} // namespace paddle_mobile } // namespace paddle_mobile
namespace ops = paddle_mobile::operators; namespace ops = paddle_mobile::operators;
REGISTER_FUSION_MATCHER(fusion_conv_add_relu, ops::FusionConvAddReluOpMatcher);
#ifdef PADDLE_MOBILE_CPU #ifdef PADDLE_MOBILE_CPU
REGISTER_OPERATOR_CPU(fusion_conv_add_relu, ops::FusionConvAddReluOp); REGISTER_OPERATOR_CPU(fusion_conv_add_relu, ops::FusionConvAddReluOp);
#endif #endif
#ifdef PADDLE_MOBILE_MALI_GPU
#endif
#ifdef PADDLE_MOBILE_FPGA #ifdef PADDLE_MOBILE_FPGA
REGISTER_OPERATOR_FPGA(fusion_conv_add_relu, ops::FusionConvAddReluOp); REGISTER_OPERATOR_FPGA(fusion_conv_add_relu, ops::FusionConvAddReluOp);
#endif #endif
......
...@@ -65,37 +65,7 @@ class FusionConvAddReluOp : public framework::OperatorWithKernel< ...@@ -65,37 +65,7 @@ class FusionConvAddReluOp : public framework::OperatorWithKernel<
protected: protected:
}; };
#ifdef PADDLE_MOBILE_CPU
#ifndef CONV_ADD_RELU_REGISTER
#define CONV_ADD_RELU_REGISTER
static framework::FusionOpRegistrar fusion_conv_add_relu_registrar(
new FusionConvAddReluOpMatcher());
#endif
#endif
#ifdef PADDLE_MOBILE_MALI_GPU
#endif
#ifdef PADDLE_MOBILE_FPGA
#ifndef CONV_ADD_RELU_REGISTER
#define CONV_ADD_RELU_REGISTER
static framework::FusionOpRegistrar fusion_conv_add_relu_registrar(
new FusionConvAddReluOpMatcher());
#endif
#endif
} // namespace operators } // namespace operators
} // namespace paddle_mobile } // namespace paddle_mobile
#ifdef PADDLE_MOBILE_CPU
USE_OP_CPU(fusion_conv_add_relu);
#endif
#ifdef PADDLE_MOBILE_MALI_GPU
#endif
#ifdef PADDLE_MOBILE_FPGA
USE_OP_FPGA(fusion_conv_add_relu);
#endif
#endif #endif
...@@ -49,11 +49,12 @@ void FusionConvBNAddReluOp<Dtype, T>::InferShape() const { ...@@ -49,11 +49,12 @@ void FusionConvBNAddReluOp<Dtype, T>::InferShape() const {
} // namespace paddle_mobile } // namespace paddle_mobile
namespace ops = paddle_mobile::operators; namespace ops = paddle_mobile::operators;
REGISTER_FUSION_MATCHER(fusion_conv_bn_add_relu,
ops::FusionConvBNAddReluMatcher);
#ifdef PADDLE_MOBILE_CPU #ifdef PADDLE_MOBILE_CPU
REGISTER_OPERATOR_CPU(fusion_conv_bn_add_relu, ops::FusionConvBNAddReluOp); REGISTER_OPERATOR_CPU(fusion_conv_bn_add_relu, ops::FusionConvBNAddReluOp);
#endif #endif
#ifdef PADDLE_MOBILE_MALI_GPU
#endif
#ifdef PADDLE_MOBILE_FPGA #ifdef PADDLE_MOBILE_FPGA
REGISTER_OPERATOR_FPGA(fusion_conv_bn_add_relu, ops::FusionConvBNAddReluOp); REGISTER_OPERATOR_FPGA(fusion_conv_bn_add_relu, ops::FusionConvBNAddReluOp);
#endif #endif
......
...@@ -80,46 +80,7 @@ class FusionConvBNAddReluOp ...@@ -80,46 +80,7 @@ class FusionConvBNAddReluOp
protected: protected:
}; };
#ifdef PADDLE_MOBILE_CPU
#ifndef FUSION_CONV_BN_ADD_RELU_REGISTER
static framework::FusionOpRegistrar fusion_conv_bn_add_relu_registrar(
new FusionConvBNAddReluMatcher());
#define FUSION_CONV_BN_ADD_RELU_REGISTER
#endif
#endif
#ifdef PADDLE_MOBILE_MALI_GPU
#ifndef FUSION_CONV_BN_ADD_RELU_REGISTER
static framework::FusionOpRegistrar fusion_conv_bn_add_relu_registrar(
new FusionConvBNAddReluMatcher());
#define FUSION_CONV_BN_ADD_RELU_REGISTER
#endif
#endif
#ifdef PADDLE_MOBILE_FPGA
#ifndef FUSION_CONV_BN_ADD_RELU_REGISTER
static framework::FusionOpRegistrar fusion_conv_bn_add_relu_registrar(
new FusionConvBNAddReluMatcher());
#define FUSION_CONV_BN_ADD_RELU_REGISTER
#endif
#endif
} // namespace operators } // namespace operators
} // namespace paddle_mobile } // namespace paddle_mobile
#ifdef PADDLE_MOBILE_CPU
USE_OP_CPU(fusion_conv_bn_add_relu);
#endif
#ifdef PADDLE_MOBILE_MALI_GPU
#endif
#ifdef PADDLE_MOBILE_FPGA
USE_OP_FPGA(fusion_conv_bn_add_relu);
#endif
#endif #endif
...@@ -48,11 +48,11 @@ void FusionConvBNOp<Dtype, T>::InferShape() const { ...@@ -48,11 +48,11 @@ void FusionConvBNOp<Dtype, T>::InferShape() const {
} // namespace paddle_mobile } // namespace paddle_mobile
namespace ops = paddle_mobile::operators; namespace ops = paddle_mobile::operators;
REGISTER_FUSION_MATCHER(fusion_conv_bn, ops::FusionConvBNMatcher);
#ifdef PADDLE_MOBILE_CPU #ifdef PADDLE_MOBILE_CPU
REGISTER_OPERATOR_CPU(fusion_conv_bn, ops::FusionConvBNOp); REGISTER_OPERATOR_CPU(fusion_conv_bn, ops::FusionConvBNOp);
#endif #endif
#ifdef PADDLE_MOBILE_MALI_GPU
#endif
#ifdef PADDLE_MOBILE_FPGA #ifdef PADDLE_MOBILE_FPGA
REGISTER_OPERATOR_FPGA(fusion_conv_bn, ops::FusionConvBNOp); REGISTER_OPERATOR_FPGA(fusion_conv_bn, ops::FusionConvBNOp);
#endif #endif
......
...@@ -67,39 +67,7 @@ class FusionConvBNOp : public framework::OperatorWithKernel< ...@@ -67,39 +67,7 @@ class FusionConvBNOp : public framework::OperatorWithKernel<
protected: protected:
}; };
#ifdef PADDLE_MOBILE_CPU
#ifndef FUSION_CONV_BN_REGISTER
static framework::FusionOpRegistrar fusion_conv_bn_registrar(
new FusionConvBNMatcher());
#define FUSION_CONV_BN_REGISTER
#endif
#endif
#ifdef PADDLE_MOBILE_MALI_GPU
#endif
#ifdef PADDLE_MOBILE_FPGA
#ifndef FUSION_CONV_BN_REGISTER
static framework::FusionOpRegistrar fusion_conv_bn_registrar(
new FusionConvBNMatcher());
#define FUSION_CONV_BN_REGISTER
#endif
#endif
} // namespace operators } // namespace operators
} // namespace paddle_mobile } // namespace paddle_mobile
#ifdef PADDLE_MOBILE_CPU
USE_OP_CPU(fusion_conv_bn);
#endif
#ifdef PADDLE_MOBILE_MALI_GPU
#endif
#ifdef PADDLE_MOBILE_FPGA
USE_OP_FPGA(fusion_conv_bn);
#endif
#endif #endif
...@@ -49,11 +49,11 @@ void FusionConvBNReluOp<Dtype, T>::InferShape() const { ...@@ -49,11 +49,11 @@ void FusionConvBNReluOp<Dtype, T>::InferShape() const {
} // namespace paddle_mobile } // namespace paddle_mobile
namespace ops = paddle_mobile::operators; namespace ops = paddle_mobile::operators;
REGISTER_FUSION_MATCHER(fusion_conv_bn_relu, ops::FusionConvBNReluMatcher);
#ifdef PADDLE_MOBILE_CPU #ifdef PADDLE_MOBILE_CPU
REGISTER_OPERATOR_CPU(fusion_conv_bn_relu, ops::FusionConvBNReluOp); REGISTER_OPERATOR_CPU(fusion_conv_bn_relu, ops::FusionConvBNReluOp);
#endif #endif
#ifdef PADDLE_MOBILE_MALI_GPU
#endif
#ifdef PADDLE_MOBILE_FPGA #ifdef PADDLE_MOBILE_FPGA
REGISTER_OPERATOR_FPGA(fusion_conv_bn_relu, ops::FusionConvBNReluOp); REGISTER_OPERATOR_FPGA(fusion_conv_bn_relu, ops::FusionConvBNReluOp);
#endif #endif
......
...@@ -72,39 +72,7 @@ class FusionConvBNReluOp : public framework::OperatorWithKernel< ...@@ -72,39 +72,7 @@ class FusionConvBNReluOp : public framework::OperatorWithKernel<
protected: protected:
}; };
#ifdef PADDLE_MOBILE_CPU
#ifndef FUSION_CONV_BN_RELU_REGISTER
static framework::FusionOpRegistrar fusion_conv_bn_relu_registrar(
new FusionConvBNReluMatcher());
#define FUSION_CONV_BN_RELU_REGISTER
#endif
#endif
#ifdef PADDLE_MOBILE_MALI_GPU
#endif
#ifdef PADDLE_MOBILE_FPGA
#ifndef FUSION_CONV_BN_RELU_REGISTER
static framework::FusionOpRegistrar fusion_conv_bn_relu_registrar(
new FusionConvBNReluMatcher());
#define FUSION_CONV_BN_RELU_REGISTER
#endif
#endif
} // namespace operators } // namespace operators
} // namespace paddle_mobile } // namespace paddle_mobile
#ifdef PADDLE_MOBILE_CPU
USE_OP_CPU(fusion_conv_bn_relu);
#endif
#ifdef PADDLE_MOBILE_MALI_GPU
#endif
#ifdef PADDLE_MOBILE_FPGA
USE_OP_FPGA(fusion_conv_bn_relu);
#endif
#endif #endif
...@@ -49,11 +49,11 @@ void FusionDWConvBNReluOp<Dtype, T>::InferShape() const { ...@@ -49,11 +49,11 @@ void FusionDWConvBNReluOp<Dtype, T>::InferShape() const {
} // namespace paddle_mobile } // namespace paddle_mobile
namespace ops = paddle_mobile::operators; namespace ops = paddle_mobile::operators;
REGISTER_FUSION_MATCHER(fusion_dwconv_bn_relu, ops::FusionDWConvBNReluMatcher);
#ifdef PADDLE_MOBILE_CPU #ifdef PADDLE_MOBILE_CPU
REGISTER_OPERATOR_CPU(fusion_dwconv_bn_relu, ops::FusionDWConvBNReluOp); REGISTER_OPERATOR_CPU(fusion_dwconv_bn_relu, ops::FusionDWConvBNReluOp);
#endif #endif
#ifdef PADDLE_MOBILE_MALI_GPU
#endif
#ifdef PADDLE_MOBILE_FPGA #ifdef PADDLE_MOBILE_FPGA
#endif #endif
......
...@@ -73,38 +73,7 @@ class FusionDWConvBNReluOp ...@@ -73,38 +73,7 @@ class FusionDWConvBNReluOp
protected: protected:
}; };
#ifdef PADDLE_MOBILE_CPU
#ifndef FUSION_DWCONV_BN_RELU_REGISTER
static framework::FusionOpRegistrar fusion_dwconv_bn_relu_registrar(
new FusionDWConvBNReluMatcher());
#define FUSION_DWCONV_BN_RELU_REGISTER
#endif
#endif
#ifdef PADDLE_MOBILE_MALI_GPU
#ifndef FUSION_DWCONV_BN_RELU_REGISTER
static framework::FusionOpRegistrar fusion_dwconv_bn_relu_registrar(
new FusionDWConvBNReluMatcher());
#define FUSION_DWCONV_BN_RELU_REGISTER
#endif
#endif
#ifdef PADDLE_MOBILE_FPGA
#endif
} // namespace operators } // namespace operators
} // namespace paddle_mobile } // namespace paddle_mobile
#ifdef PADDLE_MOBILE_CPU
USE_OP_CPU(fusion_dwconv_bn_relu);
#endif
#ifdef PADDLE_MOBILE_MALI_GPU
#endif
#ifdef PADDLE_MOBILE_FPGA
#endif
#endif #endif
...@@ -14,7 +14,7 @@ limitations under the License. */ ...@@ -14,7 +14,7 @@ limitations under the License. */
#ifdef FUSION_ELEMENTWISEADDRELU_OP #ifdef FUSION_ELEMENTWISEADDRELU_OP
#include "fusion_elementwise_add_relu_op.h" #include "operators/fusion_elementwise_add_relu_op.h"
namespace paddle_mobile { namespace paddle_mobile {
namespace operators { namespace operators {
...@@ -29,6 +29,9 @@ void FusionElementwiseAddReluOp<Dtype, T>::InferShape() const { ...@@ -29,6 +29,9 @@ void FusionElementwiseAddReluOp<Dtype, T>::InferShape() const {
} // namespace paddle_mobile } // namespace paddle_mobile
namespace ops = paddle_mobile::operators; namespace ops = paddle_mobile::operators;
REGISTER_FUSION_MATCHER(fusion_elementwise_add_relu,
ops::FusioneElementwiseAddReluMatcher);
#ifdef PADDLE_MOBILE_CPU #ifdef PADDLE_MOBILE_CPU
// REGISTER_OPERATOR_CPU(fusion_elementwise_add_relu, // REGISTER_OPERATOR_CPU(fusion_elementwise_add_relu,
// ops::FusionElementwiseAddReluOp); // ops::FusionElementwiseAddReluOp);
......
...@@ -61,39 +61,7 @@ class FusionElementwiseAddReluOp ...@@ -61,39 +61,7 @@ class FusionElementwiseAddReluOp
protected: protected:
}; };
#ifdef PADDLE_MOBILE_CPU
#ifndef FUSION_ELEMENTWISE_ADD_RELU_REGISTER
static framework::FusionOpRegistrar fusion_elementwise_relu_registrar(
new FusioneElementwiseAddReluMatcher());
#define FUSION_ELEMENTWISE_ADD_RELU_REGISTER
#endif
#endif
#ifdef PADDLE_MOBILE_MALI_GPU
#ifndef FUSION_ELEMENTWISE_ADD_RELU_REGISTER
static framework::FusionOpRegistrar fusion_elementwise_relu_registrar(
new FusioneElementwiseAddReluMatcher());
#define FUSION_ELEMENTWISE_ADD_RELU_REGISTER
#endif
#endif
#ifdef PADDLE_MOBILE_FPGA
#ifndef FUSION_ELEMENTWISE_ADD_RELU_REGISTER
static framework::FusionOpRegistrar fusion_elementwise_relu_registrar(
new FusioneElementwiseAddReluMatcher());
#define FUSION_ELEMENTWISE_ADD_RELU_REGISTER
#endif
#endif
} // namespace operators } // namespace operators
} // namespace paddle_mobile } // namespace paddle_mobile
#ifdef PADDLE_MOBILE_CPU
USE_OP_CPU(fusion_elementwise_add_relu);
#endif
#ifdef PADDLE_MOBILE_MALI_GPU
#endif
#ifdef PADDLE_MOBILE_FPGA
USE_OP_FPGA(fusion_elementwise_add_relu);
#endif
#endif #endif
...@@ -15,6 +15,7 @@ limitations under the License. */ ...@@ -15,6 +15,7 @@ limitations under the License. */
#ifdef FUSION_FC_OP #ifdef FUSION_FC_OP
#include "operators/fusion_fc_op.h" #include "operators/fusion_fc_op.h"
namespace paddle_mobile { namespace paddle_mobile {
namespace operators { namespace operators {
...@@ -54,6 +55,8 @@ void FusionFcOp<Dtype, T>::InferShape() const { ...@@ -54,6 +55,8 @@ void FusionFcOp<Dtype, T>::InferShape() const {
} // namespace paddle_mobile } // namespace paddle_mobile
namespace ops = paddle_mobile::operators; namespace ops = paddle_mobile::operators;
REGISTER_FUSION_MATCHER(fusion_fc, ops::FusionFcMatcher);
#ifdef PADDLE_MOBILE_CPU #ifdef PADDLE_MOBILE_CPU
REGISTER_OPERATOR_CPU(fusion_fc, ops::FusionFcOp); REGISTER_OPERATOR_CPU(fusion_fc, ops::FusionFcOp);
#endif #endif
...@@ -64,4 +67,4 @@ REGISTER_OPERATOR_MALI_GPU(fusion_fc, ops::FusionFcOp); ...@@ -64,4 +67,4 @@ REGISTER_OPERATOR_MALI_GPU(fusion_fc, ops::FusionFcOp);
REGISTER_OPERATOR_FPGA(fusion_fc, ops::FusionFcOp); REGISTER_OPERATOR_FPGA(fusion_fc, ops::FusionFcOp);
#endif #endif
#endif #endif // FUSION_FC_OP
...@@ -25,8 +25,7 @@ limitations under the License. */ ...@@ -25,8 +25,7 @@ limitations under the License. */
namespace paddle_mobile { namespace paddle_mobile {
namespace operators { namespace operators {
using std::string;
using std::vector;
class FusionFcMatcher : public framework::FusionOpMatcher { class FusionFcMatcher : public framework::FusionOpMatcher {
public: public:
FusionFcMatcher() { FusionFcMatcher() {
...@@ -49,7 +48,7 @@ class FusionFcOp : public framework::OperatorWithKernel< ...@@ -49,7 +48,7 @@ class FusionFcOp : public framework::OperatorWithKernel<
DeviceType, FusionFcParam<DeviceType>, DeviceType, FusionFcParam<DeviceType>,
operators::FusionFcKernel<DeviceType, T>> { operators::FusionFcKernel<DeviceType, T>> {
public: public:
FusionFcOp(const string &type, const VariableNameMap &inputs, FusionFcOp(const std::string &type, const VariableNameMap &inputs,
const VariableNameMap &outputs, const VariableNameMap &outputs,
const framework::AttributeMap &attrs, const framework::AttributeMap &attrs,
std::shared_ptr<framework::Scope> scope) std::shared_ptr<framework::Scope> scope)
...@@ -60,42 +59,11 @@ class FusionFcOp : public framework::OperatorWithKernel< ...@@ -60,42 +59,11 @@ class FusionFcOp : public framework::OperatorWithKernel<
using framework::OperatorWithKernel< using framework::OperatorWithKernel<
DeviceType, FusionFcParam<DeviceType>, DeviceType, FusionFcParam<DeviceType>,
operators::FusionFcKernel<DeviceType, T>>::OperatorWithKernel; operators::FusionFcKernel<DeviceType, T>>::OperatorWithKernel;
void InferShape() const override;
protected: void InferShape() const override;
}; };
#ifdef PADDLE_MOBILE_CPU
#ifndef FUSION_FC_REGISTER
static framework::FusionOpRegistrar fc_registrar(new FusionFcMatcher());
#define FUSION_FC_REGISTER
#endif
#endif
#ifdef PADDLE_MOBILE_MALI_GPU
#ifndef FUSION_FC_REGISTER
static framework::FusionOpRegistrar fc_registrar(new FusionFcMatcher());
#define FUSION_FC_REGISTER
#endif
#endif
#ifdef PADDLE_MOBILE_FPGA
#ifndef FUSION_FC_REGISTER
static framework::FusionOpRegistrar fc_registrar(new FusionFcMatcher());
#define FUSION_FC_REGISTER
#endif
#endif
} // namespace operators } // namespace operators
} // namespace paddle_mobile } // namespace paddle_mobile
#ifdef PADDLE_MOBILE_CPU #endif // FUSION_FC_OP
USE_OP_CPU(fusion_fc);
#endif
#ifdef PADDLE_MOBILE_MALI_GPU
USE_OP_MALI_GPU(fusion_fc);
#endif
#ifdef PADDLE_MOBILE_FPGA
USE_OP_FPGA(fusion_fc);
#endif
#endif
...@@ -54,6 +54,9 @@ void FusionFcReluOp<Dtype, T>::InferShape() const { ...@@ -54,6 +54,9 @@ void FusionFcReluOp<Dtype, T>::InferShape() const {
} // namespace paddle_mobile } // namespace paddle_mobile
namespace ops = paddle_mobile::operators; namespace ops = paddle_mobile::operators;
REGISTER_FUSION_MATCHER(fusion_fc_relu, ops::FusionFcReluMatcher);
#ifdef PADDLE_MOBILE_CPU #ifdef PADDLE_MOBILE_CPU
REGISTER_OPERATOR_CPU(fusion_fc_relu, ops::FusionFcReluOp); REGISTER_OPERATOR_CPU(fusion_fc_relu, ops::FusionFcReluOp);
#endif #endif
......
...@@ -64,39 +64,7 @@ class FusionFcReluOp : public framework::OperatorWithKernel< ...@@ -64,39 +64,7 @@ class FusionFcReluOp : public framework::OperatorWithKernel<
protected: protected:
}; };
#ifdef PADDLE_MOBILE_CPU
#ifndef FUSION_FC_RELU_REGISTER
static framework::FusionOpRegistrar fc_relu_registrar(
new FusionFcReluMatcher());
#define FUSION_FC_RELU_REGISTER
#endif
#endif
#ifdef PADDLE_MOBILE_MALI_GPU
#ifndef FUSION_FC_RELU_REGISTER
static framework::FusionOpRegistrar fc_relu_registrar(
new FusionFcReluMatcher());
#define FUSION_FC_RELU_REGISTER
#endif
#endif
#ifdef PADDLE_MOBILE_FPGA
#ifndef FUSION_FC_RELU_REGISTER
static framework::FusionOpRegistrar fc_relu_registrar(
new FusionFcReluMatcher());
#define FUSION_FC_RELU_REGISTER
#endif
#endif
} // namespace operators } // namespace operators
} // namespace paddle_mobile } // namespace paddle_mobile
#ifdef PADDLE_MOBILE_CPU
USE_OP_CPU(fusion_fc_relu);
#endif
#ifdef PADDLE_MOBILE_MALI_GPU
USE_OP_MALI_GPU(fusion_fc_relu);
#endif
#ifdef PADDLE_MOBILE_FPGA
USE_OP_FPGA(fusion_fc_relu);
#endif
#endif // FUSION_FC_RELU_OP #endif // FUSION_FC_RELU_OP
...@@ -64,8 +64,6 @@ namespace ops = paddle_mobile::operators; ...@@ -64,8 +64,6 @@ namespace ops = paddle_mobile::operators;
#ifdef PADDLE_MOBILE_CPU #ifdef PADDLE_MOBILE_CPU
REGISTER_OPERATOR_CPU(gru, ops::GruOp); REGISTER_OPERATOR_CPU(gru, ops::GruOp);
#endif #endif
#ifdef PADDLE_MOBILE_MALI_GPU
#endif
#ifdef PADDLE_MOBILE_FPGA #ifdef PADDLE_MOBILE_FPGA
#endif #endif
......
...@@ -47,12 +47,4 @@ class GruOp : public framework::OperatorWithKernel< ...@@ -47,12 +47,4 @@ class GruOp : public framework::OperatorWithKernel<
} // namespace operators } // namespace operators
} // namespace paddle_mobile } // namespace paddle_mobile
#ifdef PADDLE_MOBILE_CPU
USE_OP_CPU(gru);
#endif
#ifdef PADDLE_MOBILE_MALI_GPU
#endif
#ifdef PADDLE_MOBILE_FPGA
#endif
#endif #endif
...@@ -50,12 +50,4 @@ class Im2SequenceOp : public framework::OperatorWithKernel< ...@@ -50,12 +50,4 @@ class Im2SequenceOp : public framework::OperatorWithKernel<
} // namespace operators } // namespace operators
} // namespace paddle_mobile } // namespace paddle_mobile
#ifdef PADDLE_MOBILE_CPU
USE_OP_CPU(im2sequence);
#endif
#ifdef PADDLE_MOBILE_MALI_GPU
#endif
#ifdef PADDLE_MOBILE_FPGA
#endif
#endif #endif
...@@ -11,7 +11,6 @@ distributed under the License is distributed on an "AS IS" BASIS, ...@@ -11,7 +11,6 @@ distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#pragma once
#ifdef BILINEAR_INTERP_OP #ifdef BILINEAR_INTERP_OP
......
...@@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ...@@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#ifdef CONV_TRANSPOSE #ifdef CONV_TRANSPOSE_OP
#include "operators/kernel/conv_transpose_kernel.h" #include "operators/kernel/conv_transpose_kernel.h"
#include "operators/kernel/central-arm-func/conv_transpose_arm_func.h" #include "operators/kernel/central-arm-func/conv_transpose_arm_func.h"
......
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifdef PADDLE_MOBILE_CPU
#include "operators/kernel/dequantize_kernel.h"
#if defined(__ARM_NEON__) || defined(__ARM_NEON)
#include <arm_neon.h>
#endif
namespace paddle_mobile {
namespace operators {
template <>
bool DequantizeKernel<CPU, float>::Init(DequantizeParam<CPU> *param) {
return true;
}
template <>
void DequantizeKernel<CPU, float>::Compute(
const DequantizeParam<CPU> &param) const {
const Tensor *input = param.input_;
Tensor *output = param.out_;
float activation_scale = param.activation_scale_->data<float>()[0];
float weight_scale = param.weight_scale_;
const int32_t *x = input->data<const int32_t>();
float *y = output->mutable_data<float>();
size_t size = output->numel();
float scale = 1.f / (activation_scale * weight_scale);
#if defined(__ARM_NEON__) || defined(__ARM_NEON)
size_t loop = size >> 4;
size_t remain = size & 0xF;
float32x4_t s = vdupq_n_f32(scale);
for (size_t i = 0; i < loop; ++i) {
int32x4_t r0 = vld1q_s32(x);
int32x4_t r1 = vld1q_s32(x + 4);
int32x4_t r2 = vld1q_s32(x + 8);
int32x4_t r3 = vld1q_s32(x + 12);
float32x4_t f0 = vcvtq_f32_s32(r0);
float32x4_t f1 = vcvtq_f32_s32(r1);
float32x4_t f2 = vcvtq_f32_s32(r2);
float32x4_t f3 = vcvtq_f32_s32(r3);
f0 = vmulq_f32(f0, s);
f1 = vmulq_f32(f1, s);
f2 = vmulq_f32(f2, s);
f3 = vmulq_f32(f3, s);
vst1q_f32(y, f0);
vst1q_f32(y + 4, f1);
vst1q_f32(y + 8, f2);
vst1q_f32(y + 12, f3);
x += 16;
y += 16;
}
size = remain;
#endif
for (size_t i = 0; i < size; ++i) {
y[i] = x[i] * scale;
}
}
} // namespace operators
} // namespace paddle_mobile
#endif
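Note: the scalar tail above also states the reference semantics of the dequantize kernel: every int32 accumulator is scaled by 1 / (activation_scale * weight_scale). A standalone sketch, with made-up scales, that the NEON path can be checked against:
// Reference dequantization; the scales below are illustrative values only.
#include <cstdint>
#include <vector>
std::vector<float> dequantize_ref(const std::vector<int32_t> &x,
                                  float activation_scale, float weight_scale) {
  const float scale = 1.f / (activation_scale * weight_scale);
  std::vector<float> y(x.size());
  for (size_t i = 0; i < x.size(); ++i) {
    // e.g. x[i] = 3 with scales 0.5 and 0.25 gives 3 * 8 = 24.f
    y[i] = x[i] * scale;
  }
  return y;
}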
...@@ -11,7 +11,6 @@ distributed under the License is distributed on an "AS IS" BASIS, ...@@ -11,7 +11,6 @@ distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#pragma once
#ifdef FLATTEN_OP #ifdef FLATTEN_OP
......
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifdef PADDLE_MOBILE_CPU
#include "operators/kernel/quantize_kernel.h"
#include <cmath>
#if defined(__ARM_NEON__) || defined(__ARM_NEON)
#include <arm_neon.h>
#ifndef __aarch64__
float32_t vmaxvq_f32(float32x4_t r) {
float32x2_t v = vmax_f32(vget_high_f32(r), vget_low_f32(r));
return vget_lane_f32(vpmax_f32(v, v), 0);
}
#endif
int32x4_t vrnd_towards_zero(float32x4_t r) { return vcvtq_s32_f32(r); }
int32x4_t vrnd_away_zero(float32x4_t r) {
float32x4_t plus = vdupq_n_f32(0.5);
float32x4_t minus = vdupq_n_f32(-0.5);
float32x4_t zero = vdupq_n_f32(0);
uint32x4_t more_than_zero = vcgtq_f32(r, zero);
float32x4_t temp = vbslq_f32(more_than_zero, plus, minus);
temp = vaddq_f32(r, temp);
int32x4_t ret = vcvtq_s32_f32(temp);
return ret;
}
int32x4_t vrnd_to_even(float32x4_t r) {
#if 0
int32x4_t ret;
float value[4];
vst1q_f32(value, r);
for (int i = 0; i < 4; ++i) {
float v = round(value[i]);
int32_t q = (int32_t)v;
if (std::abs(std::abs(v - value[i]) - 0.5) > 0) {
ret[i] = q;
} else {
if (abs(q) % 2 == 0) {
ret[i] = q;
} else {
ret[i] = q + ((q > 0) ? -1 : 1);
}
}
}
return ret;
#else
float32x4_t point5 = vdupq_n_f32(0.5);
int32x4_t one = vdupq_n_s32(1);
int32x4_t zero = vdupq_n_s32(0);
int32x4_t rnd = vrnd_away_zero(r);
float32x4_t frnd = vcvtq_f32_s32(rnd);
frnd = vsubq_f32(frnd, r);
frnd = vabsq_f32(frnd);
uint32x4_t equal_point5 = vceqq_f32(frnd, point5);
int32x4_t abs_rnd = vabsq_s32(rnd);
abs_rnd = vandq_s32(abs_rnd, one);
uint32x4_t not_mod2 = vreinterpretq_u32_s32(abs_rnd);
uint32x4_t mask = vandq_u32(equal_point5, not_mod2);
uint32x4_t more_than_zero = vcgtq_s32(rnd, zero);
more_than_zero = vandq_u32(more_than_zero, vreinterpretq_u32_s32(one));
mask = veorq_u32(more_than_zero, mask);
more_than_zero = veorq_u32(more_than_zero, vreinterpretq_u32_s32(one));
mask = vaddq_u32(more_than_zero, mask);
int32x4_t smask = vreinterpretq_s32_u32(mask);
smask = vsubq_s32(smask, one);
rnd = vaddq_s32(rnd, smask);
return rnd;
#endif
}
#endif
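// A scalar round-half-to-even reference, included only as an illustrative
// sketch for comparison with the NEON vrnd_to_even() above; the kernel does
// not call it. Ties (fractional part exactly 0.5) resolve to the even integer.
static int32_t round_to_even_ref(float v) {
  float r = std::round(v);  // std::round sends halves away from zero
  if (std::fabs(std::fabs(v - std::trunc(v)) - 0.5f) > 1e-6f) {
    return static_cast<int32_t>(r);  // not a tie: ordinary rounding
  }
  int32_t q = static_cast<int32_t>(r);
  return (q % 2 == 0) ? q : q + ((q > 0) ? -1 : 1);  // tie: pick the even value
}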
namespace paddle_mobile {
namespace operators {
static float find_abs_max(const Tensor *input) {
float max_abs = 0.f;
const float *x = input->data<const float>();
size_t size = input->numel();
#if defined(__ARM_NEON__) || defined(__ARM_NEON)
size_t loop = size >> 4;
size_t remain = size & 0xF;
for (size_t i = 0; i < loop; ++i) {
float32x4_t max;
float32x4_t r0 = vld1q_f32(x);
float32x4_t r1 = vld1q_f32(x + 4);
float32x4_t r2 = vld1q_f32(x + 8);
float32x4_t r3 = vld1q_f32(x + 12);
r0 = vabsq_f32(r0);
r1 = vabsq_f32(r1);
r2 = vabsq_f32(r2);
r3 = vabsq_f32(r3);
max[0] = vmaxvq_f32(r0);
max[1] = vmaxvq_f32(r1);
max[2] = vmaxvq_f32(r2);
max[3] = vmaxvq_f32(r3);
max[0] = vmaxvq_f32(max);
if (max[0] > max_abs) {
max_abs = max[0];
}
x += 16;
}
size = remain;
#endif
for (size_t i = 0; i < size; ++i) {
float value = std::abs(x[i]);
if (value > max_abs) {
max_abs = value;
}
}
return max_abs;
}
static void quantize_round_to_even(const Tensor *input, const float scale,
Tensor *output) {
const float *x = input->data<const float>();
int8_t *y = output->mutable_data<int8_t>();
size_t size = input->numel();
#if defined(__ARM_NEON__) || defined(__ARM_NEON)
size_t loop = size >> 4;
size_t remain = size & 0xF;
for (size_t i = 0; i < loop; ++i) {
float32x4_t r0 = vld1q_f32(x);
float32x4_t r1 = vld1q_f32(x + 4);
float32x4_t r2 = vld1q_f32(x + 8);
float32x4_t r3 = vld1q_f32(x + 12);
r0 = vmulq_n_f32(r0, scale);
r1 = vmulq_n_f32(r1, scale);
r2 = vmulq_n_f32(r2, scale);
r3 = vmulq_n_f32(r3, scale);
int32x4_t q0 = vrnd_to_even(r0);
int32x4_t q1 = vrnd_to_even(r1);
int32x4_t q2 = vrnd_to_even(r2);
int32x4_t q3 = vrnd_to_even(r3);
int16x4_t d0 = vmovn_s32(q0);
int16x4_t d1 = vmovn_s32(q1);
int16x4_t d2 = vmovn_s32(q2);
int16x4_t d3 = vmovn_s32(q3);
int16x8_t q5 = vcombine_s16(d0, d1);
int16x8_t q6 = vcombine_s16(d2, d3);
int8x8_t d5 = vmovn_s16(q5);
int8x8_t d6 = vmovn_s16(q6);
vst1_s8(y, d5);
vst1_s8(y + 8, d6);
x += 16;
y += 16;
}
size = remain;
#endif
for (size_t i = 0; i < size; ++i) {
float value = x[i] * scale;
float v = round(value);
int32_t q = (int32_t)v;
if (std::abs(std::abs(q - value) - 0.5) > 0) {
y[i] = q;
} else {
if (abs(q) % 2 == 0) {
y[i] = q;
} else {
y[i] = q + ((q > 0) ? -1 : 1);
}
}
}
}
static void quantize_round_to_zero(const Tensor *input, const float scale,
Tensor *output) {
const float *x = input->data<const float>();
int8_t *y = output->mutable_data<int8_t>();
size_t size = input->numel();
#if defined(__ARM_NEON__) || defined(__ARM_NEON)
size_t loop = size >> 4;
size_t remain = size & 0xF;
for (size_t i = 0; i < loop; ++i) {
float32x4_t r0 = vld1q_f32(x);
float32x4_t r1 = vld1q_f32(x + 4);
float32x4_t r2 = vld1q_f32(x + 8);
float32x4_t r3 = vld1q_f32(x + 12);
r0 = vmulq_n_f32(r0, scale);
r1 = vmulq_n_f32(r1, scale);
r2 = vmulq_n_f32(r2, scale);
r3 = vmulq_n_f32(r3, scale);
int32x4_t q0 = vrnd_towards_zero(r0);
int32x4_t q1 = vrnd_towards_zero(r1);
int32x4_t q2 = vrnd_towards_zero(r2);
int32x4_t q3 = vrnd_towards_zero(r3);
int16x4_t d0 = vmovn_s32(q0);
int16x4_t d1 = vmovn_s32(q1);
int16x4_t d2 = vmovn_s32(q2);
int16x4_t d3 = vmovn_s32(q3);
int16x8_t q5 = vcombine_s16(d0, d1);
int16x8_t q6 = vcombine_s16(d2, d3);
int8x8_t d5 = vmovn_s16(q5);
int8x8_t d6 = vmovn_s16(q6);
vst1_s8(y, d5);
vst1_s8(y + 8, d6);
x += 16;
y += 16;
}
size = remain;
#endif
for (size_t i = 0; i < size; ++i) {
y[i] = trunc(x[i] * scale);
}
}
static void quantize_round_to_nearest(const Tensor *input, const float scale,
Tensor *output) {
const float *x = input->data<const float>();
int8_t *y = output->mutable_data<int8_t>();
size_t size = input->numel();
#if defined(__ARM_NEON__) || defined(__ARM_NEON)
size_t loop = size >> 4;
size_t remain = size & 0xF;
for (size_t i = 0; i < loop; ++i) {
float32x4_t r0 = vld1q_f32(x);
float32x4_t r1 = vld1q_f32(x + 4);
float32x4_t r2 = vld1q_f32(x + 8);
float32x4_t r3 = vld1q_f32(x + 12);
r0 = vmulq_n_f32(r0, scale);
r1 = vmulq_n_f32(r1, scale);
r2 = vmulq_n_f32(r2, scale);
r3 = vmulq_n_f32(r3, scale);
int32x4_t q0 = vrnd_away_zero(r0);
int32x4_t q1 = vrnd_away_zero(r1);
int32x4_t q2 = vrnd_away_zero(r2);
int32x4_t q3 = vrnd_away_zero(r3);
int16x4_t d0 = vmovn_s32(q0);
int16x4_t d1 = vmovn_s32(q1);
int16x4_t d2 = vmovn_s32(q2);
int16x4_t d3 = vmovn_s32(q3);
int16x8_t q5 = vcombine_s16(d0, d1);
int16x8_t q6 = vcombine_s16(d2, d3);
int8x8_t d5 = vmovn_s16(q5);
int8x8_t d6 = vmovn_s16(q6);
vst1_s8(y, d5);
vst1_s8(y + 8, d6);
x += 16;
y += 16;
}
size = remain;
#endif
for (size_t i = 0; i < size; ++i) {
y[i] = round(x[i] * scale);
}
}
template <>
bool QuantizeKernel<CPU, float>::Init(QuantizeParam<CPU> *param) {
return true;
}
template <>
void QuantizeKernel<CPU, float>::Compute(
const QuantizeParam<CPU> &param) const {
float max_abs = 0.f;
const Tensor *input = param.input_;
Tensor *output = param.out_;
Tensor *output_scale = param.online_scale_;
if (param.is_static_) {
max_abs = param.static_scale_;
} else {
max_abs = find_abs_max(input);
}
max_abs = std::max(max_abs, 1e-6f);
// only support int8 currently
float online_scale = 127 / max_abs;
output_scale->mutable_data<float>()[0] = online_scale;
switch (param.round_type_) {
case ROUND_NEAREST_TO_EVEN:
quantize_round_to_even(input, online_scale, output);
break;
case ROUND_NEAREST_TOWARDS_ZERO:
quantize_round_to_zero(input, online_scale, output);
break;
case ROUND_NEAREST_AWAY_ZERO:
quantize_round_to_nearest(input, online_scale, output);
break;
default:
LOG(kLOG_ERROR) << "round type is not supported.";
break;
}
}
} // namespace operators
} // namespace paddle_mobile
#endif
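Note: the online scale computed in Compute() maps the largest observed magnitude onto the int8 range; the worked numbers below are assumed, not taken from the diff.
// Worked example (assumed values):
//   max_abs        = 6.35f
//   online_scale   = 127 / 6.35f          = 20.0f
//   quantize 0.5f  : round(0.5f * 20.0f)  = 10   (int8)
//   quantize 6.35f : round(6.35f * 20.0f) = 127  (top of the int8 range)
// The stored scale lets the dequantize kernel earlier in this diff undo the mapping.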
...@@ -11,7 +11,6 @@ distributed under the License is distributed on an "AS IS" BASIS, ...@@ -11,7 +11,6 @@ distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#pragma once
#ifdef SHAPE_OP #ifdef SHAPE_OP
......
...@@ -11,7 +11,6 @@ distributed under the License is distributed on an "AS IS" BASIS, ...@@ -11,7 +11,6 @@ distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#pragma once
#ifdef SPLIT_OP #ifdef SPLIT_OP
......
...@@ -12,18 +12,17 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ...@@ -12,18 +12,17 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#ifdef CONV_TRANSPOSE #pragma once
#include <vector> #ifdef CONV_TRANSPOSE_OP
#include <vector>
#include "framework/ddim.h" #include "framework/ddim.h"
#include "operators/math/im2col.h" #include "operators/math/im2col.h"
#include "operators/math/math_function.h" #include "operators/math/math_function.h"
#include "operators/math/vol2col.h" #include "operators/math/vol2col.h"
#include "operators/op_param.h" #include "operators/op_param.h"
#pragma once
namespace paddle_mobile { namespace paddle_mobile {
namespace operators { namespace operators {
......
...@@ -24,7 +24,9 @@ limitations under the License. */ ...@@ -24,7 +24,9 @@ limitations under the License. */
namespace paddle_mobile { namespace paddle_mobile {
namespace operators { namespace operators {
using framework::DDim; using framework::DDim;
void sigmoid(const Tensor *X, Tensor *Y) { void sigmoid(const Tensor *X, Tensor *Y) {
#ifdef __ARM_NEON #ifdef __ARM_NEON
const float *input = X->data<float>(); const float *input = X->data<float>();
......
...@@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ...@@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#ifdef CONV_TRANSPOSE #ifdef CONV_TRANSPOSE_OP
#pragma once #pragma once
......
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include "framework/operator.h"
#include "operators/op_param.h"
namespace paddle_mobile {
namespace operators {
template <typename DeviceType, typename T>
class DequantizeKernel
: public framework::OpKernelBase<DeviceType, DequantizeParam<DeviceType>> {
public:
void Compute(const DequantizeParam<DeviceType> &param) const;
bool Init(DequantizeParam<DeviceType> *param);
};
} // namespace operators
} // namespace paddle_mobile
...@@ -24,10 +24,12 @@ bool ConcatKernel<FPGA, float>::Init(ConcatParam<FPGA> *param) { ...@@ -24,10 +24,12 @@ bool ConcatKernel<FPGA, float>::Init(ConcatParam<FPGA> *param) {
auto inputs = param->Inputs(); auto inputs = param->Inputs();
auto out = param->Out(); auto out = param->Out();
auto image_num = inputs.size(); auto image_num = inputs.size();
auto images_in = (half **)fpga::fpga_malloc(image_num * sizeof(int *)); auto images_in =
auto scales_in = (float **)fpga::fpga_malloc(image_num * sizeof(float *)); (half **)fpga::fpga_malloc(image_num * sizeof(int *)); // NOLINT
auto scales_in =
(float **)fpga::fpga_malloc(image_num * sizeof(float *)); // NOLINT
auto channel_num = auto channel_num =
(uint32_t *)fpga::fpga_malloc(image_num * sizeof(uint32_t)); (uint32_t *)fpga::fpga_malloc(image_num * sizeof(uint32_t)); // NOLINT
auto height = inputs[0]->dims()[2]; auto height = inputs[0]->dims()[2];
auto width = inputs[0]->dims()[3]; auto width = inputs[0]->dims()[3];
...@@ -36,22 +38,21 @@ bool ConcatKernel<FPGA, float>::Init(ConcatParam<FPGA> *param) { ...@@ -36,22 +38,21 @@ bool ConcatKernel<FPGA, float>::Init(ConcatParam<FPGA> *param) {
PADDLE_MOBILE_ENFORCE( PADDLE_MOBILE_ENFORCE(
input->dims()[2] == height && input->dims()[3] == width, input->dims()[2] == height && input->dims()[3] == width,
"Image height & width should be unified"); "Image height & width should be unified");
images_in[i] = (half *)input->data<float>(); images_in[i] = (half *)input->data<float>(); // NOLINT
channel_num[i] = (uint32_t)inputs[i]->dims()[1]; channel_num[i] = (uint32_t)inputs[i]->dims()[1]; // NOLINT
scales_in[i] = input->scale; scales_in[i] = input->scale;
} }
fpga::format_concat_output(out, (int)height, (int)width, (int)image_num, fpga::format_concat_output(out, height, width, image_num, channel_num);
channel_num);
fpga::ConcatArgs concatArgs = {0}; fpga::ConcatArgs concatArgs = {0};
concatArgs.image_num = (uint32_t)image_num; concatArgs.image_num = image_num;
concatArgs.images_in = images_in; concatArgs.images_in = images_in;
concatArgs.scales_in = scales_in; concatArgs.scales_in = scales_in;
concatArgs.image_out = (half *)out->data<float>(); concatArgs.image_out = (half *)out->data<float>(); // NOLINT
concatArgs.scale_out = out->scale; concatArgs.scale_out = out->scale;
concatArgs.channel_num = channel_num; concatArgs.channel_num = channel_num;
concatArgs.height = (uint32_t)height; concatArgs.height = height;
concatArgs.width = (uint32_t)width; concatArgs.width = width;
param->SetFpgaArgs(concatArgs); param->SetFpgaArgs(concatArgs);
return true; return true;
} }
......
...@@ -38,7 +38,8 @@ bool ConvAddBNReluKernel<FPGA, float>::Init( ...@@ -38,7 +38,8 @@ bool ConvAddBNReluKernel<FPGA, float>::Init(
"Output channel should be equal to bias number"); "Output channel should be equal to bias number");
const int channel = out->dims()[1]; const int channel = out->dims()[1];
auto bs_ptr = (float *)fpga::fpga_malloc(2 * channel * sizeof(float)); auto bs_ptr =
(float *)fpga::fpga_malloc(2 * channel * sizeof(float)); // NOLINT
auto new_scale = new Tensor(); auto new_scale = new Tensor();
auto new_bias = new Tensor(); auto new_bias = new Tensor();
auto new_scale_ptr = new_scale->mutable_data<float>({channel}); auto new_scale_ptr = new_scale->mutable_data<float>({channel});
......
...@@ -31,7 +31,8 @@ bool ConvAddReluKernel<FPGA, float>::Init(FusionConvAddReluParam<FPGA> *param) { ...@@ -31,7 +31,8 @@ bool ConvAddReluKernel<FPGA, float>::Init(FusionConvAddReluParam<FPGA> *param) {
PADDLE_MOBILE_ENFORCE(out->dims()[1] == bias->dims()[0], PADDLE_MOBILE_ENFORCE(out->dims()[1] == bias->dims()[0],
"Output channel should be equal to bias number"); "Output channel should be equal to bias number");
int channel = out->dims()[1]; int channel = out->dims()[1];
auto bs_ptr = (float *)fpga::fpga_malloc(2 * channel * sizeof(float)); auto bs_ptr =
(float *)fpga::fpga_malloc(2 * channel * sizeof(float)); // NOLINT
for (int i = 0; i < channel; i++) { for (int i = 0; i < channel; i++) {
bs_ptr[i + channel] = 1; bs_ptr[i + channel] = 1;
bs_ptr[i] = bias_ptr[i]; bs_ptr[i] = bias_ptr[i];
......
...@@ -33,7 +33,8 @@ bool ConvBNKernel<FPGA, float>::Init(FusionConvBNParam<FPGA> *param) { ...@@ -33,7 +33,8 @@ bool ConvBNKernel<FPGA, float>::Init(FusionConvBNParam<FPGA> *param) {
PADDLE_MOBILE_ENFORCE(out->dims()[1] == param->InputBias()->dims()[0], PADDLE_MOBILE_ENFORCE(out->dims()[1] == param->InputBias()->dims()[0],
"Output channel should be equal to bias number"); "Output channel should be equal to bias number");
const int channel = out->dims()[1]; const int channel = out->dims()[1];
auto bs_ptr = (float *)fpga::fpga_malloc(2 * channel * sizeof(float)); auto bs_ptr =
(float *)fpga::fpga_malloc(2 * channel * sizeof(float)); // NOLINT
auto new_scale = new Tensor(); auto new_scale = new Tensor();
auto new_bias = new Tensor(); auto new_bias = new Tensor();
auto new_scale_ptr = new_scale->mutable_data<float>({channel}); auto new_scale_ptr = new_scale->mutable_data<float>({channel});
......
...@@ -33,7 +33,8 @@ bool ConvBNReluKernel<FPGA, float>::Init(FusionConvBNReluParam<FPGA> *param) { ...@@ -33,7 +33,8 @@ bool ConvBNReluKernel<FPGA, float>::Init(FusionConvBNReluParam<FPGA> *param) {
PADDLE_MOBILE_ENFORCE(out->dims()[1] == param->InputBias()->dims()[0], PADDLE_MOBILE_ENFORCE(out->dims()[1] == param->InputBias()->dims()[0],
"Output channel should be equal to bias number"); "Output channel should be equal to bias number");
const int channel = out->dims()[1]; const int channel = out->dims()[1];
auto bs_ptr = (float *)fpga::fpga_malloc(2 * channel * sizeof(float)); auto bs_ptr =
(float *)fpga::fpga_malloc(2 * channel * sizeof(float)); // NOLINT
auto new_scale = new Tensor(); auto new_scale = new Tensor();
auto new_bias = new Tensor(); auto new_bias = new Tensor();
auto new_scale_ptr = new_scale->mutable_data<float>({channel}); auto new_scale_ptr = new_scale->mutable_data<float>({channel});
......
...@@ -32,8 +32,8 @@ bool ElementwiseAddReluKernel<FPGA, float>::Init( ...@@ -32,8 +32,8 @@ bool ElementwiseAddReluKernel<FPGA, float>::Init(
fpga::EWAddArgs ewaddArgs = {0}; fpga::EWAddArgs ewaddArgs = {0};
ewaddArgs.relu_enabled = relu_enabled; ewaddArgs.relu_enabled = relu_enabled;
ewaddArgs.const0 = 1; ewaddArgs.const0 = 0x3c00; // =1
ewaddArgs.const1 = 1; ewaddArgs.const1 = 0x3c00; // =1
ewaddArgs.image0.address = input_x_ptr; ewaddArgs.image0.address = input_x_ptr;
ewaddArgs.image0.channels = (uint32_t)input_x->dims()[1]; ewaddArgs.image0.channels = (uint32_t)input_x->dims()[1];
ewaddArgs.image0.scale_address = input_x->scale; ewaddArgs.image0.scale_address = input_x->scale;
......
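Note: the const0 / const1 change in this hunk swaps the integer literal 1 for 0x3c00 because the FPGA elementwise-add engine takes these coefficients as IEEE fp16 bit patterns, and 1.0 in half precision encodes as sign 0, exponent 01111 (bias 15), mantissa 0:
// 0 01111 0000000000  ->  0x3C00  ==  1.0 in IEEE fp16
// Assuming the usual semantics of the helper used elsewhere in this diff,
// fpga::fp32_2_fp16(1.0f) would return 0x3c00.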
...@@ -28,7 +28,8 @@ bool FusionFcReluKernel<FPGA, float>::Init(FusionFcReluParam<FPGA> *param) { ...@@ -28,7 +28,8 @@ bool FusionFcReluKernel<FPGA, float>::Init(FusionFcReluParam<FPGA> *param) {
PADDLE_MOBILE_ENFORCE(input_x->dims()[1] == filter->dims()[0], PADDLE_MOBILE_ENFORCE(input_x->dims()[1] == filter->dims()[0],
"Image channel should be equal to weight number"); "Image channel should be equal to weight number");
int channel = (uint32_t)out->dims()[1]; int channel = (uint32_t)out->dims()[1];
auto bs_ptr = (float *)fpga::fpga_malloc(2 * channel * sizeof(float)); auto bs_ptr =
(float *)fpga::fpga_malloc(2 * channel * sizeof(float)); // NOLINT
for (int i = 0; i < channel; i++) { for (int i = 0; i < channel; i++) {
bs_ptr[i + channel] = 1; bs_ptr[i + channel] = 1;
bs_ptr[i] = input_z_ptr[i]; bs_ptr[i] = input_z_ptr[i];
...@@ -45,7 +46,7 @@ bool FusionFcReluKernel<FPGA, float>::Init(FusionFcReluParam<FPGA> *param) { ...@@ -45,7 +46,7 @@ bool FusionFcReluKernel<FPGA, float>::Init(FusionFcReluParam<FPGA> *param) {
filter->Resize(framework::make_ddim({num, filter_channel, height, width})); filter->Resize(framework::make_ddim({num, filter_channel, height, width}));
float max_value = fpga::filter_find_max(filter); float max_value = fpga::filter_find_max(filter);
fpga::format_filter(filter, max_value, 1); fpga::format_fc_filter(filter, max_value);
int element_num_per_div = fpga::get_filter_num_per_div(filter, 1); int element_num_per_div = fpga::get_filter_num_per_div(filter, 1);
fpga::format_bias_scale_array(&bs_ptr, element_num_per_div, channel); fpga::format_bias_scale_array(&bs_ptr, element_num_per_div, channel);
...@@ -61,7 +62,7 @@ template <> ...@@ -61,7 +62,7 @@ template <>
void FusionFcReluKernel<FPGA, float>::Compute( void FusionFcReluKernel<FPGA, float>::Compute(
const FusionFcReluParam<FPGA> &param) const { const FusionFcReluParam<FPGA> &param) const {
fpga::ComputeFpgaConv(param.FpgaArgs()); fpga::ComputeFpgaConv(param.FpgaArgs());
}; }
} // namespace operators } // namespace operators
} // namespace paddle_mobile } // namespace paddle_mobile
......
...@@ -30,7 +30,8 @@ bool FusionFcKernel<FPGA, float>::Init(FusionFcParam<FPGA> *param) { ...@@ -30,7 +30,8 @@ bool FusionFcKernel<FPGA, float>::Init(FusionFcParam<FPGA> *param) {
PADDLE_MOBILE_ENFORCE(input_x->dims()[1] == filter->dims()[0], PADDLE_MOBILE_ENFORCE(input_x->dims()[1] == filter->dims()[0],
"Image channel should be equal to weight number"); "Image channel should be equal to weight number");
int channel = (uint32_t)out->dims()[1]; int channel = (uint32_t)out->dims()[1];
auto bs_ptr = (float *)fpga::fpga_malloc(2 * channel * sizeof(float)); auto bs_ptr =
(float *)fpga::fpga_malloc(2 * channel * sizeof(float)); // NOLINT
for (int i = 0; i < channel; i++) { for (int i = 0; i < channel; i++) {
bs_ptr[i + channel] = 1; bs_ptr[i + channel] = 1;
bs_ptr[i] = input_z_ptr[i]; bs_ptr[i] = input_z_ptr[i];
...@@ -46,7 +47,7 @@ bool FusionFcKernel<FPGA, float>::Init(FusionFcParam<FPGA> *param) { ...@@ -46,7 +47,7 @@ bool FusionFcKernel<FPGA, float>::Init(FusionFcParam<FPGA> *param) {
filter->Resize(framework::make_ddim({num, filter_channel, height, width})); filter->Resize(framework::make_ddim({num, filter_channel, height, width}));
float max_value = fpga::filter_find_max(filter); float max_value = fpga::filter_find_max(filter);
fpga::format_filter(filter, max_value, 1); fpga::format_fc_filter(filter, max_value);
int element_num_per_div = fpga::get_filter_num_per_div(filter, 1); int element_num_per_div = fpga::get_filter_num_per_div(filter, 1);
fpga::format_bias_scale_array(&bs_ptr, element_num_per_div, channel); fpga::format_bias_scale_array(&bs_ptr, element_num_per_div, channel);
......
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifdef MUL_OP
#include "operators/kernel/mul_kernel.h"
namespace paddle_mobile {
namespace operators {
template <>
bool MulKernel<FPGA, float>::Init(MulParam<FPGA> *param) {
bool relu_enabled = false;
auto input_x = const_cast<LoDTensor *>(param->InputX());
auto filter = const_cast<LoDTensor *>(param->InputY());
auto out = param->Out();
PADDLE_MOBILE_ENFORCE(input_x->dims()[1] == filter->dims()[0],
"Image channel should be equal to weight number");
int channel = (uint32_t)out->dims()[1];
auto bs_ptr =
(float *)fpga::fpga_malloc(2 * channel * sizeof(float)); // NOLINT
for (int i = 0; i < channel; i++) {
bs_ptr[i + channel] = 1;
bs_ptr[i] = 0;
}
int num = (uint32_t)filter->dims()[1];
int chw = (uint32_t)filter->dims()[0];
PADDLE_MOBILE_ENFORCE(
chw == input_x->numel(),
"Filter element num should be equal to IFM element num");
int height = (uint32_t)input_x->dims()[2];
int width = (uint32_t)input_x->dims()[3];
int filter_channel = chw / height / width;
filter->Resize(framework::make_ddim({num, filter_channel, height, width}));
float max_value = fpga::filter_find_max(filter);
fpga::format_fc_filter(filter, max_value);
int element_num_per_div = fpga::get_filter_num_per_div(filter, 1);
fpga::format_bias_scale_array(&bs_ptr, element_num_per_div, channel);
fpga::format_fp16_ofm(out);
fpga::WrapperConvArgs conv_arg = {0};
fpga::fill_conv_arg(&conv_arg, input_x, out, filter, relu_enabled, 1, 1, 1, 0,
0, bs_ptr);
param->SetFpgaArgs(conv_arg);
return true;
}
template <>
void MulKernel<FPGA, float>::Compute(const MulParam<FPGA> &param) const {
fpga::ComputeFpgaConv(param.FpgaArgs());
}
} // namespace operators
} // namespace paddle_mobile
#endif
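Note: the new FPGA MulKernel maps a fully-connected style matmul onto the convolution engine. The two-dimensional weight of shape (chw, num) is reshaped into a four-dimensional filter (num, chw / (h * w), h, w), the bias half of bs_ptr is zeroed, and one convolution over the input feature map then reproduces the matrix product. The reshape arithmetic, with assumed sizes for illustration:
// Assumed example: input_x dims = {1, 64, 7, 7}, filter dims = {3136, 10}
//   chw = 3136, num = 10, height = 7, width = 7
//   filter_channel = chw / height / width = 3136 / 49 = 64
//   filter is resized to {10, 64, 7, 7} before format_fc_filter() runs.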
...@@ -29,8 +29,12 @@ bool PoolKernel<FPGA, float>::Init(PoolParam<FPGA> *param) { ...@@ -29,8 +29,12 @@ bool PoolKernel<FPGA, float>::Init(PoolParam<FPGA> *param) {
vector<int> ksize = param->Ksize(); vector<int> ksize = param->Ksize();
vector<int> strides = param->Strides(); vector<int> strides = param->Strides();
vector<int> paddings = param->Paddings(); vector<int> paddings = param->Paddings();
std::string pooling_type = param->PoolingType();
fpga::PoolingArgs poolArgs = {0}; fpga::PoolingArgs poolArgs = {0};
poolArgs.mode = pooling_type == "max" ? 0 : 1; // max:0, avg:1
poolArgs.kernel_reciprocal =
fpga::fp32_2_fp16(float(1.0 / (ksize[0] * ksize[1])));
poolArgs.image.address = input_ptr; poolArgs.image.address = input_ptr;
poolArgs.image.channels = (uint32_t)input->dims()[1]; poolArgs.image.channels = (uint32_t)input->dims()[1];
poolArgs.image.height = (uint32_t)input->dims()[2]; poolArgs.image.height = (uint32_t)input->dims()[2];
......
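Note: the two new PoolingArgs fields are what enable average pooling on the FPGA: mode selects max (0) or average (1), and kernel_reciprocal carries 1 / (ksize[0] * ksize[1]) pre-converted to fp16 so the accelerator multiplies by a constant instead of dividing. For an assumed 2x2 window:
// kernel_reciprocal = fp16(1.0 / (2 * 2)) = fp16(0.25)
// poolArgs.kernel_reciprocal = fpga::fp32_2_fp16(0.25f);  // illustrative only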
...@@ -54,8 +54,8 @@ void SoftmaxKernel<FPGA, float>::Compute( ...@@ -54,8 +54,8 @@ void SoftmaxKernel<FPGA, float>::Compute(
fpga::PerformBypass(param.FpgaArgs()); fpga::PerformBypass(param.FpgaArgs());
fpga::fpga_invalidate( fpga::fpga_invalidate(
(void *)in_x->data<float>(), (void *)in_x->data<float>(), // NOLINT
(size_t)fpga::get_align_image_cw((int)in_x->dims()[1]) * sizeof(float)); fpga::get_align_image_cw(in_x->dims()[1]) * sizeof(float));
math::SoftmaxFuntor<CPU, float>()(in_x, out); math::SoftmaxFuntor<CPU, float>()(in_x, out);
fpga::fpga_flush(out->data<float>(), out->memory_size()); fpga::fpga_flush(out->data<float>(), out->memory_size());
......
...@@ -30,6 +30,7 @@ class FusionFcKernel ...@@ -30,6 +30,7 @@ class FusionFcKernel
void Compute(const FusionFcParam<DeviceType>& param) const; void Compute(const FusionFcParam<DeviceType>& param) const;
bool Init(FusionFcParam<DeviceType>* param); bool Init(FusionFcParam<DeviceType>* param);
}; };
} // namespace operators } // namespace operators
} // namespace paddle_mobile } // namespace paddle_mobile
......
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include "framework/operator.h"
#include "operators/op_param.h"
namespace paddle_mobile {
namespace operators {
template <typename DeviceType, typename T>
class QuantizeKernel
: public framework::OpKernelBase<DeviceType, QuantizeParam<DeviceType>> {
public:
void Compute(const QuantizeParam<DeviceType> &param) const;
bool Init(QuantizeParam<DeviceType> *param);
};
} // namespace operators
} // namespace paddle_mobile
...@@ -23,6 +23,7 @@ limitations under the License. */ ...@@ -23,6 +23,7 @@ limitations under the License. */
namespace paddle_mobile { namespace paddle_mobile {
namespace operators { namespace operators {
template <typename DeviceType> template <typename DeviceType>
inline framework::DDim CalOutputShape(const ResizeParam<DeviceType> &param) { inline framework::DDim CalOutputShape(const ResizeParam<DeviceType> &param) {
const auto *input_x = param.InputX(); const auto *input_x = param.InputX();
......
...@@ -12,16 +12,18 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ...@@ -12,16 +12,18 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#ifdef SIGMOID_OP
#pragma once #pragma once
#ifdef SIGMOID_OP
#include "framework/operator.h" #include "framework/operator.h"
#include "operators/op_param.h" #include "operators/op_param.h"
namespace paddle_mobile { namespace paddle_mobile {
namespace operators { namespace operators {
using framework::OpKernelBase; using framework::OpKernelBase;
void sigmoid(const Tensor* X, Tensor* Y);
template <typename DeviceType, typename T> template <typename DeviceType, typename T>
class SigmoidKernel class SigmoidKernel
: public OpKernelBase<DeviceType, SigmoidParam<DeviceType>> { : public OpKernelBase<DeviceType, SigmoidParam<DeviceType>> {
...@@ -29,6 +31,7 @@ class SigmoidKernel ...@@ -29,6 +31,7 @@ class SigmoidKernel
void Compute(const SigmoidParam<DeviceType>& param) const override; void Compute(const SigmoidParam<DeviceType>& param) const override;
bool Init(SigmoidParam<DeviceType>* param); bool Init(SigmoidParam<DeviceType>* param);
}; };
} // namespace operators } // namespace operators
} // namespace paddle_mobile } // namespace paddle_mobile
......
...@@ -47,12 +47,4 @@ class LookupOp : public framework::OperatorWithKernel< ...@@ -47,12 +47,4 @@ class LookupOp : public framework::OperatorWithKernel<
} // namespace operators } // namespace operators
} // namespace paddle_mobile } // namespace paddle_mobile
#ifdef PADDLE_MOBILE_CPU
USE_OP_CPU(lookup_table);
#endif
#ifdef PADDLE_MOBILE_MALI_GPU
#endif
#ifdef PADDLE_MOBILE_FPGA
#endif
#endif #endif
...@@ -35,7 +35,5 @@ REGISTER_OPERATOR_CPU(lrn, ops::LrnOp); ...@@ -35,7 +35,5 @@ REGISTER_OPERATOR_CPU(lrn, ops::LrnOp);
#ifdef PADDLE_MOBILE_MALI_GPU #ifdef PADDLE_MOBILE_MALI_GPU
REGISTER_OPERATOR_MALI_GPU(lrn, ops::LrnOp); REGISTER_OPERATOR_MALI_GPU(lrn, ops::LrnOp);
#endif #endif
#ifdef PADDLE_MOBILE_FPGA
#endif
#endif #endif
...@@ -47,13 +47,4 @@ class LrnOp : public framework::OperatorWithKernel< ...@@ -47,13 +47,4 @@ class LrnOp : public framework::OperatorWithKernel<
} // namespace operators } // namespace operators
} // namespace paddle_mobile } // namespace paddle_mobile
#ifdef PADDLE_MOBILE_CPU
USE_OP_CPU(lrn);
#endif
#ifdef PADDLE_MOBILE_MALI_GPU
USE_OP_MALI_GPU(lrn);
#endif
#ifdef PADDLE_MOBILE_FPGA
#endif
#endif #endif
...@@ -1465,7 +1465,7 @@ void DepthwiseConvAddBNRelu3x3s2p1v2(const Tensor *input, const Tensor *filter, ...@@ -1465,7 +1465,7 @@ void DepthwiseConvAddBNRelu3x3s2p1v2(const Tensor *input, const Tensor *filter,
Tensor *output, const Tensor *new_scale, Tensor *output, const Tensor *new_scale,
const Tensor *new_bias, bool if_relu) { const Tensor *new_bias, bool if_relu) {
#if __ARM_NEON #if __ARM_NEON
//#ifdef _OPENMP // #ifdef _OPENMP
// const float *newscale_data = new_scale->data<float>(); // const float *newscale_data = new_scale->data<float>();
// const float *newbias_data = new_bias->data<float>(); // const float *newbias_data = new_bias->data<float>();
// //
...@@ -1645,7 +1645,7 @@ void DepthwiseConvAddBNRelu3x3s2p1v2(const Tensor *input, const Tensor *filter, ...@@ -1645,7 +1645,7 @@ void DepthwiseConvAddBNRelu3x3s2p1v2(const Tensor *input, const Tensor *filter,
// } // }
// } // }
// //
//#else // #else
const float *input_data = input->data<float>(); const float *input_data = input->data<float>();
const float *filter_data = filter->data<float>(); const float *filter_data = filter->data<float>();
...@@ -1877,7 +1877,7 @@ void DepthwiseConvAddBNRelu3x3s2p1v2(const Tensor *input, const Tensor *filter, ...@@ -1877,7 +1877,7 @@ void DepthwiseConvAddBNRelu3x3s2p1v2(const Tensor *input, const Tensor *filter,
input_data += inhxw * c; input_data += inhxw * c;
output_data += outhxw * c; output_data += outhxw * c;
} }
//#endif // #endif
#endif #endif
} }
......
...@@ -26,7 +26,7 @@ limitations under the License. */ ...@@ -26,7 +26,7 @@ limitations under the License. */
namespace paddle_mobile { namespace paddle_mobile {
namespace operators { namespace operators {
namespace math { namespace math {
int MC = 0; /*int MC = 0;
int KC = 0; int KC = 0;
int NC = 0; int NC = 0;
...@@ -40,7 +40,7 @@ typedef void (*FnAddDot)(int, const float *, const float *, float *, int); ...@@ -40,7 +40,7 @@ typedef void (*FnAddDot)(int, const float *, const float *, float *, int);
FnPack procPackA; FnPack procPackA;
FnPack procPackB; FnPack procPackB;
FnAddDot procAddDot; FnAddDot procAddDot;*/
/* /*
// Copy matrix A block by block into contiguous memory (ColMajor) // Copy matrix A block by block into contiguous memory (ColMajor)
...@@ -101,8 +101,8 @@ void PackMatrixB(int k, int n, int n_tail, const float *B, int ldb, ...@@ -101,8 +101,8 @@ void PackMatrixB(int k, int n, int n_tail, const float *B, int ldb,
*/ */
// Copy matrix A block by block into contiguous memory (RowMajor) // Copy matrix A block by block into contiguous memory (RowMajor)
void PackMatrixA_4r(int m, int k, int m_tail, const float *A, int lda, void Gemm::PackMatrixA_4r(int m, int k, int m_tail, const float *A, int lda,
float *buffer) { float *buffer) {
const float *a0, *a1, *a2, *a3; const float *a0, *a1, *a2, *a3;
for (int i = 0; i < m - m_tail; i += MR) { for (int i = 0; i < m - m_tail; i += MR) {
a0 = A + i * lda; a0 = A + i * lda;
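Note: the remaining gemm hunks all make the same mechanical change: the file-scope blocking globals (MC, KC, NC) are commented out and the free packing functions become members of a Gemm class, so each Gemm instance carries its own state instead of sharing process-wide globals. A hedged sketch of the class shape this implies (the real declaration lives in the gemm header, which is not part of this excerpt):
// Illustrative only; member list abridged.
class Gemm {
 public:
  void PackMatrixA_4r(int m, int k, int m_tail, const float *A, int lda,
                      float *buffer);
  void PackMatrixB_8c(int k, int n, int n_tail, const float *B, int ldb,
                      float *buffer);
  // ... further Pack* and AddDot* members elided ...
 private:
  int MC = 0;
  int KC = 0;
  int NC = 0;  // blocking sizes, formerly globals in this .cpp
};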
...@@ -142,8 +142,8 @@ void PackMatrixA_4r(int m, int k, int m_tail, const float *A, int lda, ...@@ -142,8 +142,8 @@ void PackMatrixA_4r(int m, int k, int m_tail, const float *A, int lda,
} }
} }
void PackMatrixA_6r(int m, int k, int m_tail, const float *A, int lda, void Gemm::PackMatrixA_6r(int m, int k, int m_tail, const float *A, int lda,
float *buffer) { float *buffer) {
const int i_length = m - m_tail; const int i_length = m - m_tail;
for (int i = 0; i < i_length; i += MR) { for (int i = 0; i < i_length; i += MR) {
const float *a0 = A + i * lda; const float *a0 = A + i * lda;
...@@ -196,8 +196,8 @@ void PackMatrixA_6r(int m, int k, int m_tail, const float *A, int lda, ...@@ -196,8 +196,8 @@ void PackMatrixA_6r(int m, int k, int m_tail, const float *A, int lda,
} }
} }
void PackMatrixA_omp_6r(int m, int k, int m_tail, const float *A, int lda, void Gemm::PackMatrixA_omp_6r(int m, int k, int m_tail, const float *A, int lda,
float *buffer) { float *buffer) {
const int i_length = m - m_tail; const int i_length = m - m_tail;
#pragma omp parallel for #pragma omp parallel for
for (int i = 0; i < i_length; i += MR) { for (int i = 0; i < i_length; i += MR) {
...@@ -251,8 +251,8 @@ void PackMatrixA_omp_6r(int m, int k, int m_tail, const float *A, int lda, ...@@ -251,8 +251,8 @@ void PackMatrixA_omp_6r(int m, int k, int m_tail, const float *A, int lda,
} }
} }
void PackMatrixA_8r(int m, int k, int m_tail, const float *A, int lda, void Gemm::PackMatrixA_8r(int m, int k, int m_tail, const float *A, int lda,
float *buffer) { float *buffer) {
const int i_length = m - m_tail; const int i_length = m - m_tail;
for (int i = 0; i < i_length; i += MR) { for (int i = 0; i < i_length; i += MR) {
const float *a0 = A + i * lda; const float *a0 = A + i * lda;
...@@ -317,8 +317,8 @@ void PackMatrixA_8r(int m, int k, int m_tail, const float *A, int lda, ...@@ -317,8 +317,8 @@ void PackMatrixA_8r(int m, int k, int m_tail, const float *A, int lda,
} }
} }
void PackMatrixA_omp_8r(int m, int k, int m_tail, const float *A, int lda, void Gemm::PackMatrixA_omp_8r(int m, int k, int m_tail, const float *A, int lda,
float *buffer) { float *buffer) {
const int i_length = m - m_tail; const int i_length = m - m_tail;
#pragma omp parallel for #pragma omp parallel for
for (int i = 0; i < i_length; i += MR) { for (int i = 0; i < i_length; i += MR) {
...@@ -385,8 +385,8 @@ void PackMatrixA_omp_8r(int m, int k, int m_tail, const float *A, int lda, ...@@ -385,8 +385,8 @@ void PackMatrixA_omp_8r(int m, int k, int m_tail, const float *A, int lda,
} }
// Copy blocks of matrix B into contiguous memory (RowMajor) // Copy blocks of matrix B into contiguous memory (RowMajor)
void PackMatrixB_8c(int k, int n, int n_tail, const float *B, int ldb, void Gemm::PackMatrixB_8c(int k, int n, int n_tail, const float *B, int ldb,
float *buffer) { float *buffer) {
const int j_length = n - n_tail; const int j_length = n - n_tail;
for (int j = 0; j < j_length; j += NR) { for (int j = 0; j < j_length; j += NR) {
float *local_buffer = buffer + j * k; float *local_buffer = buffer + j * k;
...@@ -436,8 +436,8 @@ void PackMatrixB_8c(int k, int n, int n_tail, const float *B, int ldb, ...@@ -436,8 +436,8 @@ void PackMatrixB_8c(int k, int n, int n_tail, const float *B, int ldb,
} }
} }
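The B side is packed the same way, but in NR-wide column panels: PackMatrixB_8c lays each 8-column panel out contiguously, row by row, which is what lets the micro-kernel stream B with unit stride. A simplified sketch, assuming NR = 8 and ignoring the n_tail handling:

void PackB8cSketch(int k, int n, const float *B, int ldb, float *buffer) {
  const int NR = 8;
  for (int j = 0; j < n; j += NR) {        // one 8-column panel at a time
    float *local_buffer = buffer + j * k;  // each panel occupies k * NR floats
    for (int i = 0; i < k; ++i) {
      const float *b_row = B + i * ldb + j;
      for (int c = 0; c < NR; ++c) {
        *local_buffer++ = b_row[c];        // depth-major, 8 columns at a time
      }
    }
  }
}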
void PackMatrixB_omp_8c(int k, int n, int n_tail, const float *B, int ldb, void Gemm::PackMatrixB_omp_8c(int k, int n, int n_tail, const float *B, int ldb,
float *buffer) { float *buffer) {
const int j_length = n - n_tail; const int j_length = n - n_tail;
#pragma omp parallel for #pragma omp parallel for
for (int j = 0; j < j_length; j += NR) { for (int j = 0; j < j_length; j += NR) {
...@@ -489,8 +489,8 @@ void PackMatrixB_omp_8c(int k, int n, int n_tail, const float *B, int ldb, ...@@ -489,8 +489,8 @@ void PackMatrixB_omp_8c(int k, int n, int n_tail, const float *B, int ldb,
} }
#if __aarch64__ #if __aarch64__
void PackMatrixB_12c(int k, int n, int n_tail, const float *B, int ldb, void Gemm::PackMatrixB_12c(int k, int n, int n_tail, const float *B, int ldb,
float *buffer) { float *buffer) {
const int j_length = n - n_tail; const int j_length = n - n_tail;
for (int j = 0; j < j_length; j += NR) { for (int j = 0; j < j_length; j += NR) {
float *local_buffer = buffer + j * k; float *local_buffer = buffer + j * k;
...@@ -519,8 +519,8 @@ void PackMatrixB_12c(int k, int n, int n_tail, const float *B, int ldb, ...@@ -519,8 +519,8 @@ void PackMatrixB_12c(int k, int n, int n_tail, const float *B, int ldb,
} }
} }
void PackMatrixB_omp_12c(int k, int n, int n_tail, const float *B, int ldb, void Gemm::PackMatrixB_omp_12c(int k, int n, int n_tail, const float *B,
float *buffer) { int ldb, float *buffer) {
const int j_length = n - n_tail; const int j_length = n - n_tail;
#pragma omp parallel for #pragma omp parallel for
for (int j = 0; j < j_length; j += NR) { for (int j = 0; j < j_length; j += NR) {
...@@ -550,8 +550,8 @@ void PackMatrixB_omp_12c(int k, int n, int n_tail, const float *B, int ldb, ...@@ -550,8 +550,8 @@ void PackMatrixB_omp_12c(int k, int n, int n_tail, const float *B, int ldb,
} }
} }
void PackMatrixB_16c(int k, int n, int n_tail, const float *B, int ldb, void Gemm::PackMatrixB_16c(int k, int n, int n_tail, const float *B, int ldb,
float *buffer) { float *buffer) {
const int j_length = n - n_tail; const int j_length = n - n_tail;
for (int j = 0; j < n - n_tail; j += NR) { for (int j = 0; j < n - n_tail; j += NR) {
float *local_buffer = buffer + j * k; float *local_buffer = buffer + j * k;
...@@ -580,8 +580,8 @@ void PackMatrixB_16c(int k, int n, int n_tail, const float *B, int ldb, ...@@ -580,8 +580,8 @@ void PackMatrixB_16c(int k, int n, int n_tail, const float *B, int ldb,
} }
} }
void PackMatrixB_omp_16c(int k, int n, int n_tail, const float *B, int ldb, void Gemm::PackMatrixB_omp_16c(int k, int n, int n_tail, const float *B,
float *buffer) { int ldb, float *buffer) {
const int j_length = n - n_tail; const int j_length = n - n_tail;
#pragma omp parallel for #pragma omp parallel for
for (int j = 0; j < n - n_tail; j += NR) { for (int j = 0; j < n - n_tail; j += NR) {
...@@ -613,8 +613,9 @@ void PackMatrixB_omp_16c(int k, int n, int n_tail, const float *B, int ldb, ...@@ -613,8 +613,9 @@ void PackMatrixB_omp_16c(int k, int n, int n_tail, const float *B, int ldb,
#endif // __aarch64__ #endif // __aarch64__
// Blocked matrix multiplication // Blocked matrix multiplication
void InnerKernel(int mc, int nc, float alpha, const float *a, const float *b, void Gemm::InnerKernel(int mc, int nc, float alpha, const float *a,
float beta, float *c, float *C, int ldc, bool relu) { const float *b, float beta, float *c, float *C, int ldc,
bool relu) {
#pragma omp parallel for #pragma omp parallel for
for (int j = 0; j < nc; j += NR) { for (int j = 0; j < nc; j += NR) {
for (int i = 0; i < mc; i += MR) { for (int i = 0; i < mc; i += MR) {
...@@ -648,9 +649,9 @@ void InnerKernel(int mc, int nc, float alpha, const float *a, const float *b, ...@@ -648,9 +649,9 @@ void InnerKernel(int mc, int nc, float alpha, const float *a, const float *b,
} }
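Taken together, InnerKernel and its siblings implement the classic blocked GEMM loop: walk the packed panels in MR x NR tiles, let a micro-kernel (one of the AddDot* routines) produce each tile, then hand the mc x nc scratch buffer to one of the Write* helpers. A self-contained scalar sketch of that control flow, with MR = 4, NR = 8 and the scratch layout chosen as assumptions for illustration (the real code selects 4x4, 6x8, 8x12 or 6x16 kernels depending on the target):

const int MR = 4, NR = 8;

// scalar equivalent of an AddDot4x8 call: a rank-kc update of one MR x NR tile
void MicroKernelSketch(int kc, const float *a, const float *b, float *c,
                       int ldc) {
  for (int p = 0; p < kc; ++p)
    for (int i = 0; i < MR; ++i)
      for (int j = 0; j < NR; ++j)
        c[i * ldc + j] += a[p * MR + i] * b[p * NR + j];
}

// c is an mc x nc scratch buffer, assumed zeroed beforehand
void InnerKernelSketch(int mc, int nc, int kc, const float *packedA,
                       const float *packedB, float *c) {
  for (int j = 0; j < nc; j += NR)
    for (int i = 0; i < mc; i += MR)
      MicroKernelSketch(kc, packedA + i * kc, packedB + j * kc,
                        c + i * nc + j, nc);
  // afterwards WriteBasic / WriteWithAdd / WriteWithBn... merges c into C
}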
// Blocked matrix multiplication // Blocked matrix multiplication
void InnerKernelWithBias(int mc, int nc, float alpha, const float *a, void Gemm::InnerKernelWithBias(int mc, int nc, float alpha, const float *a,
const float *b, float beta, float *c, float *C, const float *b, float beta, float *c, float *C,
int ldc, bool relu, float *bias) { int ldc, bool relu, float *bias) {
#pragma omp parallel for #pragma omp parallel for
for (int j = 0; j < nc; j += NR) { for (int j = 0; j < nc; j += NR) {
for (int i = 0; i < mc; i += MR) { for (int i = 0; i < mc; i += MR) {
...@@ -692,9 +693,10 @@ void InnerKernelWithBias(int mc, int nc, float alpha, const float *a, ...@@ -692,9 +693,10 @@ void InnerKernelWithBias(int mc, int nc, float alpha, const float *a,
} }
// Blocked matrix multiplication // Blocked matrix multiplication
void InnerKernelWithBn(int mc, int nc, float alpha, const float *a, void Gemm::InnerKernelWithBn(int mc, int nc, float alpha, const float *a,
const float *b, float beta, float *c, float *C, int ldc, const float *b, float beta, float *c, float *C,
bool relu, float *new_scale, float *new_bias) { int ldc, bool relu, float *new_scale,
float *new_bias) {
#pragma omp parallel for #pragma omp parallel for
for (int j = 0; j < nc; j += NR) { for (int j = 0; j < nc; j += NR) {
for (int i = 0; i < mc; i += MR) { for (int i = 0; i < mc; i += MR) {
...@@ -717,10 +719,10 @@ void InnerKernelWithBn(int mc, int nc, float alpha, const float *a, ...@@ -717,10 +719,10 @@ void InnerKernelWithBn(int mc, int nc, float alpha, const float *a,
} }
// Blocked matrix multiplication // Blocked matrix multiplication
void InnerKernelWithBnAdd(int mc, int nc, float alpha, const float *a, void Gemm::InnerKernelWithBnAdd(int mc, int nc, float alpha, const float *a,
const float *b, float beta, float *c, float *C, const float *b, float beta, float *c, float *C,
int ldc, bool relu, float *new_scale, float *new_bias, int ldc, bool relu, float *new_scale,
float *bias) { float *new_bias, float *bias) {
#pragma omp parallel for #pragma omp parallel for
for (int j = 0; j < nc; j += NR) { for (int j = 0; j < nc; j += NR) {
for (int i = 0; i < mc; i += MR) { for (int i = 0; i < mc; i += MR) {
...@@ -737,9 +739,9 @@ void InnerKernelWithBnAdd(int mc, int nc, float alpha, const float *a, ...@@ -737,9 +739,9 @@ void InnerKernelWithBnAdd(int mc, int nc, float alpha, const float *a,
WriteWithBnAddRelu(mc, nc, c, C, ldc, new_scale, new_bias, bias); WriteWithBnAddRelu(mc, nc, c, C, ldc, new_scale, new_bias, bias);
} }
void InnerKernelWithPRelu(int mc, int nc, const float *a, const float *b, void Gemm::InnerKernelWithPRelu(int mc, int nc, const float *a, const float *b,
float *c, float *C, int ldc, float *p, float *c, float *C, int ldc, float *p,
std::string mode, float *bias, float *bias1) { std::string mode, float *bias, float *bias1) {
#pragma omp parallel for #pragma omp parallel for
for (int j = 0; j < nc; j += NR) { for (int j = 0; j < nc; j += NR) {
for (int i = 0; i < mc; i += MR) { for (int i = 0; i < mc; i += MR) {
...@@ -759,7 +761,7 @@ void InnerKernelWithPRelu(int mc, int nc, const float *a, const float *b, ...@@ -759,7 +761,7 @@ void InnerKernelWithPRelu(int mc, int nc, const float *a, const float *b,
#if __ARM_NEON #if __ARM_NEON
#if __aarch64__ #if __aarch64__
void AddDot4x4(int k, const float *a, const float *b, float *c, int ldc) { void Gemm::AddDot4x4(int k, const float *a, const float *b, float *c, int ldc) {
// init C // init C
float32x4_t cv0 = vdupq_n_f32(0.0); float32x4_t cv0 = vdupq_n_f32(0.0);
float32x4_t cv1 = vdupq_n_f32(0.0); float32x4_t cv1 = vdupq_n_f32(0.0);
...@@ -794,7 +796,7 @@ void AddDot4x4(int k, const float *a, const float *b, float *c, int ldc) { ...@@ -794,7 +796,7 @@ void AddDot4x4(int k, const float *a, const float *b, float *c, int ldc) {
// float32x4x4_t cv = {cv0, cv1, cv2, cv3}; // float32x4x4_t cv = {cv0, cv1, cv2, cv3};
} }
void AddDot4x8(int k, const float *a, const float *b, float *c, int ldc) { void Gemm::AddDot4x8(int k, const float *a, const float *b, float *c, int ldc) {
// init C // init C
float32x4_t cv0 = vdupq_n_f32(0.0); float32x4_t cv0 = vdupq_n_f32(0.0);
float32x4_t cv1 = vdupq_n_f32(0.0); float32x4_t cv1 = vdupq_n_f32(0.0);
...@@ -844,7 +846,7 @@ void AddDot4x8(int k, const float *a, const float *b, float *c, int ldc) { ...@@ -844,7 +846,7 @@ void AddDot4x8(int k, const float *a, const float *b, float *c, int ldc) {
// Write back the blocked matrix multiplication results // Write back the blocked matrix multiplication results
// C = A * B // C = A * B
void WriteBasic(int mc, int nc, float *c, float *C, int ldc) { void Gemm::WriteBasic(int mc, int nc, float *c, float *C, int ldc) {
int nc1 = nc / 4; int nc1 = nc / 4;
int _nc1 = nc % 4; int _nc1 = nc % 4;
...@@ -877,10 +879,10 @@ void WriteBasic(int mc, int nc, float *c, float *C, int ldc) { ...@@ -877,10 +879,10 @@ void WriteBasic(int mc, int nc, float *c, float *C, int ldc) {
} }
// C = alpha * A * B + beta * C // C = alpha * A * B + beta * C
void WriteWithAlphaBeta(int mc, int nc, float *c, float *C, int ldc) {} void Gemm::WriteWithAlphaBeta(int mc, int nc, float *c, float *C, int ldc) {}
// C = A * B + C // C = A * B + C
void WriteWithAdd(int mc, int nc, float *c, float *C, int ldc) { void Gemm::WriteWithAdd(int mc, int nc, float *c, float *C, int ldc) {
int nc1 = nc / 4; int nc1 = nc / 4;
int _nc1 = nc % 4; int _nc1 = nc % 4;
...@@ -917,7 +919,8 @@ void WriteWithAdd(int mc, int nc, float *c, float *C, int ldc) { ...@@ -917,7 +919,8 @@ void WriteWithAdd(int mc, int nc, float *c, float *C, int ldc) {
} }
} }
// C = A * B + bias // C = A * B + bias
void WriteWithAddV1(int mc, int nc, float *c, float *C, int ldc, float *bias) { void Gemm::WriteWithAddV1(int mc, int nc, float *c, float *C, int ldc,
float *bias) {
int nc1 = nc / 4; int nc1 = nc / 4;
int _nc1 = nc % 4; int _nc1 = nc % 4;
...@@ -955,7 +958,7 @@ void WriteWithAddV1(int mc, int nc, float *c, float *C, int ldc, float *bias) { ...@@ -955,7 +958,7 @@ void WriteWithAddV1(int mc, int nc, float *c, float *C, int ldc, float *bias) {
} }
// C = A * B + C, relu(C) // C = A * B + C, relu(C)
void WriteWithAddRelu(int mc, int nc, float *c, float *C, int ldc) { void Gemm::WriteWithAddRelu(int mc, int nc, float *c, float *C, int ldc) {
int nc1 = nc / 4; int nc1 = nc / 4;
int _nc1 = nc % 4; int _nc1 = nc % 4;
...@@ -996,8 +999,8 @@ void WriteWithAddRelu(int mc, int nc, float *c, float *C, int ldc) { ...@@ -996,8 +999,8 @@ void WriteWithAddRelu(int mc, int nc, float *c, float *C, int ldc) {
} }
// C = A * B + bias, relu(C) // C = A * B + bias, relu(C)
void WriteWithAddReluV1(int mc, int nc, float *c, float *C, int ldc, void Gemm::WriteWithAddReluV1(int mc, int nc, float *c, float *C, int ldc,
float *bias) { float *bias) {
int nc1 = nc / 4; int nc1 = nc / 4;
int _nc1 = nc % 4; int _nc1 = nc % 4;
...@@ -1038,8 +1041,9 @@ void WriteWithAddReluV1(int mc, int nc, float *c, float *C, int ldc, ...@@ -1038,8 +1041,9 @@ void WriteWithAddReluV1(int mc, int nc, float *c, float *C, int ldc,
} }
// C = A * B + C,prelu(C) // C = A * B + C,prelu(C)
void WriteWithAddPRelu(int mc, int nc, float *c, float *C, int ldc, float *p, void Gemm::WriteWithAddPRelu(int mc, int nc, float *c, float *C, int ldc,
std::string mode, float *bias, float *bias1) { float *p, std::string mode, float *bias,
float *bias1) {
int nc1 = nc / 4; int nc1 = nc / 4;
int _nc1 = nc % 4; int _nc1 = nc % 4;
...@@ -1114,8 +1118,8 @@ void WriteWithAddPRelu(int mc, int nc, float *c, float *C, int ldc, float *p, ...@@ -1114,8 +1118,8 @@ void WriteWithAddPRelu(int mc, int nc, float *c, float *C, int ldc, float *p,
} }
// C = A * B, batchnorm(C) // C = A * B, batchnorm(C)
void WriteWithBn(int mc, int nc, float *c, float *C, int ldc, float *new_scale, void Gemm::WriteWithBn(int mc, int nc, float *c, float *C, int ldc,
float *new_bias) { float *new_scale, float *new_bias) {
int nc1 = nc / 4; int nc1 = nc / 4;
int _nc1 = nc % 4; int _nc1 = nc % 4;
...@@ -1159,8 +1163,8 @@ void WriteWithBn(int mc, int nc, float *c, float *C, int ldc, float *new_scale, ...@@ -1159,8 +1163,8 @@ void WriteWithBn(int mc, int nc, float *c, float *C, int ldc, float *new_scale,
} }
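The Write* family above differs only in the epilogue applied while the scratch buffer is copied into C; for the batchnorm variants, new_scale and new_bias are assumed to be the batchnorm parameters already folded into one per-row multiply and add. A scalar reference for what the NEON code in WriteWithBn computes (the scratch stride nc is an assumption for illustration; the relu variants below only add a clamp at zero):

void WriteWithBnScalar(int mc, int nc, const float *c, float *C, int ldc,
                       const float *new_scale, const float *new_bias) {
  for (int i = 0; i < mc; ++i) {
    for (int j = 0; j < nc; ++j) {
      // batchnorm folded to scale * x + bias, applied per output row/channel
      C[i * ldc + j] = c[i * nc + j] * new_scale[i] + new_bias[i];
    }
  }
}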
// C = A * B, batchnorm(C), relu(C) // C = A * B, batchnorm(C), relu(C)
void WriteWithBnRelu(int mc, int nc, float *c, float *C, int ldc, void Gemm::WriteWithBnRelu(int mc, int nc, float *c, float *C, int ldc,
float *new_scale, float *new_bias) { float *new_scale, float *new_bias) {
int nc1 = nc / 4; int nc1 = nc / 4;
int _nc1 = nc % 4; int _nc1 = nc % 4;
...@@ -1205,8 +1209,8 @@ void WriteWithBnRelu(int mc, int nc, float *c, float *C, int ldc, ...@@ -1205,8 +1209,8 @@ void WriteWithBnRelu(int mc, int nc, float *c, float *C, int ldc,
} }
// C = A * B, batchnorm(C),C = C + bias; relu(C) // C = A * B, batchnorm(C),C = C + bias; relu(C)
void WriteWithBnAddRelu(int mc, int nc, float *c, float *C, int ldc, void Gemm::WriteWithBnAddRelu(int mc, int nc, float *c, float *C, int ldc,
float *new_scale, float *new_bias, float *bias) { float *new_scale, float *new_bias, float *bias) {
int nc1 = nc / 4; int nc1 = nc / 4;
int _nc1 = nc % 4; int _nc1 = nc % 4;
...@@ -1259,7 +1263,7 @@ void WriteWithBnAddRelu(int mc, int nc, float *c, float *C, int ldc, ...@@ -1259,7 +1263,7 @@ void WriteWithBnAddRelu(int mc, int nc, float *c, float *C, int ldc,
#else #else
void AddDot4x4(int k, const float *a, const float *b, float *c, int ldc) { void Gemm::AddDot4x4(int k, const float *a, const float *b, float *c, int ldc) {
const float *a_ptr, *b_ptr; const float *a_ptr, *b_ptr;
a_ptr = a; a_ptr = a;
b_ptr = b; b_ptr = b;
...@@ -1330,10 +1334,9 @@ void AddDot4x4(int k, const float *a, const float *b, float *c, int ldc) { ...@@ -1330,10 +1334,9 @@ void AddDot4x4(int k, const float *a, const float *b, float *c, int ldc) {
} }
/* /*
void VectorKernel(int m, int n, int k, float alpha, const float *A, int lda, void Gemm::VectorKernel(int m, int n, int k, float alpha, const float *A, int
const float *B, int ldb, float beta, float *C, int ldc, lda, const float *B, int ldb, float beta, float *C, int ldc, bool relu) { float
bool relu) { *bufferC = static_cast<float *>(memory::Alloc(sizeof(float) * n));
float *bufferC = static_cast<float *>(memory::Alloc(sizeof(float) * n));
const float *a0, *b0, *b1, *b2, *b3; const float *a0, *b0, *b1, *b2, *b3;
float *c0, *C0; float *c0, *C0;
...@@ -1552,7 +1555,7 @@ void VectorKernel(int m, int n, int k, float alpha, const float *A, int lda, ...@@ -1552,7 +1555,7 @@ void VectorKernel(int m, int n, int k, float alpha, const float *A, int lda,
} }
} }
void VectorKernelWithBn(int m, int n, int k, float alpha, const float *A, void Gemm::VectorKernelWithBn(int m, int n, int k, float alpha, const float *A,
int lda, const float *B, int ldb, float beta, float *C, int lda, const float *B, int ldb, float beta, float *C,
int ldc, bool relu, float *new_scale, float *new_bias) { int ldc, bool relu, float *new_scale, float *new_bias) {
float *bufferC = static_cast<float *>(memory::Alloc(sizeof(float) * n)); float *bufferC = static_cast<float *>(memory::Alloc(sizeof(float) * n));
...@@ -1764,7 +1767,7 @@ void VectorKernelWithBn(int m, int n, int k, float alpha, const float *A, ...@@ -1764,7 +1767,7 @@ void VectorKernelWithBn(int m, int n, int k, float alpha, const float *A,
} }
*/ */
void AddDot4x8(int k, const float *a, const float *b, float *c, int ldc) { void Gemm::AddDot4x8(int k, const float *a, const float *b, float *c, int ldc) {
const float *a_ptr, *b_ptr; const float *a_ptr, *b_ptr;
a_ptr = a; a_ptr = a;
b_ptr = b; b_ptr = b;
...@@ -1872,7 +1875,7 @@ void AddDot4x8(int k, const float *a, const float *b, float *c, int ldc) { ...@@ -1872,7 +1875,7 @@ void AddDot4x8(int k, const float *a, const float *b, float *c, int ldc) {
} }
// C = A * B // C = A * B
void WriteBasic(int mc, int nc, float *c, float *C, int ldc) { void Gemm::WriteBasic(int mc, int nc, float *c, float *C, int ldc) {
int nc1 = nc / 16; int nc1 = nc / 16;
int _nc1 = nc % 16; int _nc1 = nc % 16;
int step = 4 * ldc; int step = 4 * ldc;
...@@ -1929,10 +1932,10 @@ void WriteBasic(int mc, int nc, float *c, float *C, int ldc) { ...@@ -1929,10 +1932,10 @@ void WriteBasic(int mc, int nc, float *c, float *C, int ldc) {
} }
// C = alpha * A * B + beta * C // C = alpha * A * B + beta * C
void WriteWithAlphaBeta(int mc, int nc, float *c, float *C, int ldc) {} void Gemm::WriteWithAlphaBeta(int mc, int nc, float *c, float *C, int ldc) {}
// C = A * B + C // C = A * B + C
void WriteWithAdd(int mc, int nc, float *c, float *C, int ldc) { void Gemm::WriteWithAdd(int mc, int nc, float *c, float *C, int ldc) {
int nc1 = nc / 16; int nc1 = nc / 16;
int _nc1 = nc % 16; int _nc1 = nc % 16;
int step = 4 * ldc; int step = 4 * ldc;
...@@ -1996,7 +1999,8 @@ void WriteWithAdd(int mc, int nc, float *c, float *C, int ldc) { ...@@ -1996,7 +1999,8 @@ void WriteWithAdd(int mc, int nc, float *c, float *C, int ldc) {
} }
// C = A * B + bias // C = A * B + bias
void WriteWithAddV1(int mc, int nc, float *c, float *C, int ldc, float *bias) { void Gemm::WriteWithAddV1(int mc, int nc, float *c, float *C, int ldc,
float *bias) {
int nc1 = nc / 4; int nc1 = nc / 4;
int _nc1 = nc % 4; int _nc1 = nc % 4;
...@@ -2034,7 +2038,7 @@ void WriteWithAddV1(int mc, int nc, float *c, float *C, int ldc, float *bias) { ...@@ -2034,7 +2038,7 @@ void WriteWithAddV1(int mc, int nc, float *c, float *C, int ldc, float *bias) {
} }
// C = A * B + C, relu(C) // C = A * B + C, relu(C)
void WriteWithAddRelu(int mc, int nc, float *c, float *C, int ldc) { void Gemm::WriteWithAddRelu(int mc, int nc, float *c, float *C, int ldc) {
int nc1 = nc / 16; int nc1 = nc / 16;
int _nc1 = nc % 16; int _nc1 = nc % 16;
int step = 4 * ldc; int step = 4 * ldc;
...@@ -2108,8 +2112,8 @@ void WriteWithAddRelu(int mc, int nc, float *c, float *C, int ldc) { ...@@ -2108,8 +2112,8 @@ void WriteWithAddRelu(int mc, int nc, float *c, float *C, int ldc) {
} }
// C = A * B + bias, relu(C) // C = A * B + bias, relu(C)
void WriteWithAddReluV1(int mc, int nc, float *c, float *C, int ldc, void Gemm::WriteWithAddReluV1(int mc, int nc, float *c, float *C, int ldc,
float *bias) { float *bias) {
int nc1 = nc / 4; int nc1 = nc / 4;
int _nc1 = nc % 4; int _nc1 = nc % 4;
...@@ -2149,8 +2153,9 @@ void WriteWithAddReluV1(int mc, int nc, float *c, float *C, int ldc, ...@@ -2149,8 +2153,9 @@ void WriteWithAddReluV1(int mc, int nc, float *c, float *C, int ldc,
} }
} }
void WriteWithAddPRelu(int mc, int nc, float *c, float *C, int ldc, float *p, void Gemm::WriteWithAddPRelu(int mc, int nc, float *c, float *C, int ldc,
std::string mode, float *bias, float *bias1) { float *p, std::string mode, float *bias,
float *bias1) {
if (nc < 4) { if (nc < 4) {
if (bias1 == nullptr) { if (bias1 == nullptr) {
for (int i = 0; i < mc; ++i) { for (int i = 0; i < mc; ++i) {
...@@ -2383,8 +2388,8 @@ void WriteWithAddPRelu(int mc, int nc, float *c, float *C, int ldc, float *p, ...@@ -2383,8 +2388,8 @@ void WriteWithAddPRelu(int mc, int nc, float *c, float *C, int ldc, float *p,
} }
// C = A * B, batchnorm(C) // C = A * B, batchnorm(C)
void WriteWithBn(int mc, int nc, float *c, float *C, int ldc, float *scale, void Gemm::WriteWithBn(int mc, int nc, float *c, float *C, int ldc,
float *bias) { float *scale, float *bias) {
if (nc < 4) { if (nc < 4) {
for (int i = 0; i < mc; ++i) { for (int i = 0; i < mc; ++i) {
for (int j = 0; j < nc; ++j) { for (int j = 0; j < nc; ++j) {
...@@ -2484,8 +2489,8 @@ void WriteWithBn(int mc, int nc, float *c, float *C, int ldc, float *scale, ...@@ -2484,8 +2489,8 @@ void WriteWithBn(int mc, int nc, float *c, float *C, int ldc, float *scale,
} }
// C = A * B, batchnorm(C), relu(C) // C = A * B, batchnorm(C), relu(C)
void WriteWithBnRelu(int mc, int nc, float *c, float *C, int ldc, float *scale, void Gemm::WriteWithBnRelu(int mc, int nc, float *c, float *C, int ldc,
float *bias) { float *scale, float *bias) {
if (nc < 4) { if (nc < 4) {
for (int i = 0; i < mc; ++i) { for (int i = 0; i < mc; ++i) {
for (int j = 0; j < nc; ++j) { for (int j = 0; j < nc; ++j) {
...@@ -2595,8 +2600,8 @@ void WriteWithBnRelu(int mc, int nc, float *c, float *C, int ldc, float *scale, ...@@ -2595,8 +2600,8 @@ void WriteWithBnRelu(int mc, int nc, float *c, float *C, int ldc, float *scale,
} }
// C = A * B, batchnorm(C),C = C + bias; relu(C) // C = A * B, batchnorm(C),C = C + bias; relu(C)
void WriteWithBnAddRelu(int mc, int nc, float *c, float *C, int ldc, void Gemm::WriteWithBnAddRelu(int mc, int nc, float *c, float *C, int ldc,
float *new_scale, float *new_bias, float *bias) { float *new_scale, float *new_bias, float *bias) {
int nc1 = nc / 4; int nc1 = nc / 4;
int _nc1 = nc % 4; int _nc1 = nc % 4;
...@@ -2649,7 +2654,7 @@ void WriteWithBnAddRelu(int mc, int nc, float *c, float *C, int ldc, ...@@ -2649,7 +2654,7 @@ void WriteWithBnAddRelu(int mc, int nc, float *c, float *C, int ldc,
/* /*
// C = A * B // C = A * B
void VecWriteBasic(int n, float *c, float *C, int ldc) { void Gemm::VecWriteBasic(int n, float *c, float *C, int ldc) {
int nc1 = n / 16; int nc1 = n / 16;
int _nc1 = n % 16; int _nc1 = n % 16;
int nc2 = _nc1 / 4; int nc2 = _nc1 / 4;
...@@ -2695,10 +2700,10 @@ void WriteWithBnAddRelu(int mc, int nc, float *c, float *C, int ldc, ...@@ -2695,10 +2700,10 @@ void WriteWithBnAddRelu(int mc, int nc, float *c, float *C, int ldc,
} }
// C = alpha * A * B + beta * C // C = alpha * A * B + beta * C
void VecWriteWithAlphaBeta(int n, float *c, float *C, int ldc) {} void Gemm::VecWriteWithAlphaBeta(int n, float *c, float *C, int ldc) {}
// C = A * B + C // C = A * B + C
void VecWriteWithAdd(int n, float *c, float *C, int ldc) { void Gemm::VecWriteWithAdd(int n, float *c, float *C, int ldc) {
int nc1 = n / 16; int nc1 = n / 16;
int _nc1 = n % 16; int _nc1 = n % 16;
...@@ -2736,7 +2741,7 @@ void WriteWithBnAddRelu(int mc, int nc, float *c, float *C, int ldc, ...@@ -2736,7 +2741,7 @@ void WriteWithBnAddRelu(int mc, int nc, float *c, float *C, int ldc,
} }
// C = A * B + C, relu(C) // C = A * B + C, relu(C)
void VecWriteWithAddRelu(int n, float *c, float *C, int ldc) { void Gemm::VecWriteWithAddRelu(int n, float *c, float *C, int ldc) {
int nc1 = n / 16; int nc1 = n / 16;
int _nc1 = n % 16; int _nc1 = n % 16;
...@@ -2784,7 +2789,7 @@ void WriteWithBnAddRelu(int mc, int nc, float *c, float *C, int ldc, ...@@ -2784,7 +2789,7 @@ void WriteWithBnAddRelu(int mc, int nc, float *c, float *C, int ldc,
} }
// C = A * B, batchnorm(C) // C = A * B, batchnorm(C)
void VecWriteWithBn(int n, float *c, float *C, int ldc, float *scale, void Gemm::VecWriteWithBn(int n, float *c, float *C, int ldc, float *scale,
float *bias) { float *bias) {
int nc1 = n / 16; int nc1 = n / 16;
int _nc1 = n % 16; int _nc1 = n % 16;
...@@ -2850,12 +2855,9 @@ void WriteWithBnAddRelu(int mc, int nc, float *c, float *C, int ldc, ...@@ -2850,12 +2855,9 @@ void WriteWithBnAddRelu(int mc, int nc, float *c, float *C, int ldc,
} }
// C = A * B, batchnorm(C), relu(C) // C = A * B, batchnorm(C), relu(C)
void VecWriteWithBnRelu(int n, float *c, float *C, int ldc, float *scale, void Gemm::VecWriteWithBnRelu(int n, float *c, float *C, int ldc, float
float *bias) { *scale, float *bias) { int nc1 = n / 16; int _nc1 = n % 16; int nc2 = _nc1 /
int nc1 = n / 16; 4; int nc3 = 16 - 4 * (_nc1 % 4);
int _nc1 = n % 16;
int nc2 = _nc1 / 4;
int nc3 = 16 - 4 * (_nc1 % 4);
asm volatile( asm volatile(
"vmov.f32 q14, #0.0 \n\t" "vmov.f32 q14, #0.0 \n\t"
...@@ -2926,7 +2928,7 @@ void WriteWithBnAddRelu(int mc, int nc, float *c, float *C, int ldc, ...@@ -2926,7 +2928,7 @@ void WriteWithBnAddRelu(int mc, int nc, float *c, float *C, int ldc,
#endif // __aarch64__ #endif // __aarch64__
#else #else
void AddDot4x4(int k, const float *a, const float *b, float *c, int ldc) { void Gemm::AddDot4x4(int k, const float *a, const float *b, float *c, int ldc) {
float *c0, *c1, *c2, *c3; float *c0, *c1, *c2, *c3;
c0 = c; c0 = c;
c1 = c + ldc; c1 = c + ldc;
...@@ -2962,38 +2964,42 @@ void AddDot4x4(int k, const float *a, const float *b, float *c, int ldc) { ...@@ -2962,38 +2964,42 @@ void AddDot4x4(int k, const float *a, const float *b, float *c, int ldc) {
} }
} }
void AddDot4x8(int k, const float *a, const float *b, float *c, int ldc) {} void Gemm::AddDot4x8(int k, const float *a, const float *b, float *c, int ldc) {
}
void WriteBasic(int mc, int nc, float *c, float *C, int ldc) {} void Gemm::WriteBasic(int mc, int nc, float *c, float *C, int ldc) {}
void WriteWithAlphaBeta(int mc, int nc, float *c, float *C, int ldc) {} void Gemm::WriteWithAlphaBeta(int mc, int nc, float *c, float *C, int ldc) {}
void WriteWithAdd(int mc, int nc, float *c, float *C, int ldc) {} void Gemm::WriteWithAdd(int mc, int nc, float *c, float *C, int ldc) {}
void WriteWithAddV1(int mc, int nc, float *c, float *C, int ldc, float *bias) {} void Gemm::WriteWithAddV1(int mc, int nc, float *c, float *C, int ldc,
float *bias) {}
void WriteWithAddRelu(int mc, int nc, float *c, float *C, int ldc) {} void Gemm::WriteWithAddRelu(int mc, int nc, float *c, float *C, int ldc) {}
void WriteWithAddReluV1(int mc, int nc, float *c, float *C, int ldc, void Gemm::WriteWithAddReluV1(int mc, int nc, float *c, float *C, int ldc,
float *bias) {} float *bias) {}
void WriteWithAddPRelu(int mc, int nc, float *c, float *C, int ldc, float *p, void Gemm::WriteWithAddPRelu(int mc, int nc, float *c, float *C, int ldc,
std::string mode, float *bias, float *bias1) {} float *p, std::string mode, float *bias,
float *bias1) {}
void WriteWithBn(int mc, int nc, float *c, float *C, int ldc, float *new_scale, void Gemm::WriteWithBn(int mc, int nc, float *c, float *C, int ldc,
float *new_bias) {} float *new_scale, float *new_bias) {}
void WriteWithBnRelu(int mc, int nc, float *c, float *C, int ldc, void Gemm::WriteWithBnRelu(int mc, int nc, float *c, float *C, int ldc,
float *new_scale, float *new_bias) {} float *new_scale, float *new_bias) {}
void WriteWithBnAddRelu(int mc, int nc, float *c, float *C, int ldc, void Gemm::WriteWithBnAddRelu(int mc, int nc, float *c, float *C, int ldc,
float *new_scale, float *new_bias, float *bias1) {} float *new_scale, float *new_bias, float *bias1) {
}
#endif // __ARM_NEON #endif // __ARM_NEON
// 32-bit float matrix multiplication // 32-bit float matrix multiplication
void Sgemm(int m, int n, int k, float alpha, const float *A, int lda, void Gemm::Sgemm(int m, int n, int k, float alpha, const float *A, int lda,
const float *B, int ldb, float beta, float *C, int ldc, bool relu, const float *B, int ldb, float beta, float *C, int ldc,
float *bias) { bool relu, float *bias) {
// L1 data cache is 32 KiB (per Cortex-A57, Cortex-A72, Cortex-A73) // L1 data cache is 32 KiB (per Cortex-A57, Cortex-A72, Cortex-A73)
// L2 cache is 0.5~4 MiB (Cortex-A72 cluster) // L2 cache is 0.5~4 MiB (Cortex-A72 cluster)
int L1 = 32 * 1024; int L1 = 32 * 1024;
...@@ -3063,9 +3069,10 @@ void Sgemm(int m, int n, int k, float alpha, const float *A, int lda, ...@@ -3063,9 +3069,10 @@ void Sgemm(int m, int n, int k, float alpha, const float *A, int lda,
paddle_mobile::memory::Free(zero); paddle_mobile::memory::Free(zero);
} }
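The L1/L2 comment at the top of Sgemm is what drives the choice of the MC/KC/NC block sizes; the actual derivation is elided from this hunk. As a worked example of the usual heuristic only (an assumption, not the library's exact formula), the blocks can be sized so that one A panel plus one B panel fit in L1 and the packed B block fits in part of L2:

#include <cstdio>

int main() {
  const int L1 = 32 * 1024;   // L1 data cache per core, in bytes
  const int L2 = 512 * 1024;  // a conservative L2 size, in bytes
  const int MR = 6, NR = 8;   // micro-kernel tile, as in AddDot6x8
  // one MR x KC panel of A plus one KC x NR panel of B should stay in L1
  const int KC = L1 / static_cast<int>((MR + NR) * sizeof(float));  // ~585
  // the packed KC x NC block of B should stay within about half of L2
  const int NC = (L2 / 2) / static_cast<int>(KC * sizeof(float)) / NR * NR;
  std::printf("KC = %d, NC = %d\n", KC, NC);
  return 0;
}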
void SgemmWithBn(int m, int n, int k, float alpha, const float *A, int lda, void Gemm::SgemmWithBn(int m, int n, int k, float alpha, const float *A,
const float *B, int ldb, float beta, float *C, int ldc, int lda, const float *B, int ldb, float beta, float *C,
bool relu, float *new_scale, float *new_bias, float *bias) { int ldc, bool relu, float *new_scale, float *new_bias,
float *bias) {
// L1 data cache is 32 KiB (per Cortex-A57, Cortex-A72, Cortex-A73) // L1 data cache is 32 KiB (per Cortex-A57, Cortex-A72, Cortex-A73)
// L2 cache is 0.5~4 MiB (Cortex-A72 cluster) // L2 cache is 0.5~4 MiB (Cortex-A72 cluster)
int L1 = 32 * 1024; int L1 = 32 * 1024;
...@@ -3136,9 +3143,9 @@ void SgemmWithBn(int m, int n, int k, float alpha, const float *A, int lda, ...@@ -3136,9 +3143,9 @@ void SgemmWithBn(int m, int n, int k, float alpha, const float *A, int lda,
paddle_mobile::memory::Free(zero); paddle_mobile::memory::Free(zero);
} }
void SgemmWithPRelu(int m, int n, int k, const float *A, int lda, void Gemm::SgemmWithPRelu(int m, int n, int k, const float *A, int lda,
const float *B, int ldb, float *C, int ldc, float *p, const float *B, int ldb, float *C, int ldc, float *p,
std::string mode, float *bias, float *bias1) { std::string mode, float *bias, float *bias1) {
// L1 data cache is 32 KiB (per Cortex-A57, Cortex-A72, Cortex-A73) // L1 data cache is 32 KiB (per Cortex-A57, Cortex-A72, Cortex-A73)
// L2 cache is 0.5~4 MiB (Cortex-A72 cluster) // L2 cache is 0.5~4 MiB (Cortex-A72 cluster)
int L1 = 32 * 1024; int L1 = 32 * 1024;
...@@ -3212,9 +3219,9 @@ void SgemmWithPRelu(int m, int n, int k, const float *A, int lda, ...@@ -3212,9 +3219,9 @@ void SgemmWithPRelu(int m, int n, int k, const float *A, int lda,
} }
// 32-bit float matrix multiplication // 32-bit float matrix multiplication
void Sgemm_omp(int m, int n, int k, float alpha, const float *A, int lda, void Gemm::Sgemm_omp(int m, int n, int k, float alpha, const float *A, int lda,
const float *B, int ldb, float beta, float *C, int ldc, const float *B, int ldb, float beta, float *C, int ldc,
bool relu, float *bias) { bool relu, float *bias) {
#ifdef _OPENMP #ifdef _OPENMP
int max_threads = omp_get_max_threads(); int max_threads = omp_get_max_threads();
#else #else
...@@ -3237,18 +3244,18 @@ void Sgemm_omp(int m, int n, int k, float alpha, const float *A, int lda, ...@@ -3237,18 +3244,18 @@ void Sgemm_omp(int m, int n, int k, float alpha, const float *A, int lda,
NC = (n + NR - 1) / NR * NR; NC = (n + NR - 1) / NR * NR;
#if __aarch64__ #if __aarch64__
procPackA = PackMatrixA_6r; procPackA = &Gemm::PackMatrixA_6r;
procPackB = PackMatrixB_omp_16c; procPackB = &Gemm::PackMatrixB_omp_16c;
procAddDot = AddDot6x16; procAddDot = &Gemm::AddDot6x16;
#else #else
procPackA = PackMatrixA_6r; procPackA = &Gemm::PackMatrixA_6r;
procPackB = PackMatrixB_omp_8c; procPackB = &Gemm::PackMatrixB_omp_8c;
procAddDot = AddDot6x8; procAddDot = &Gemm::AddDot6x8;
#endif #endif
packedB = static_cast<float *>( packedB = static_cast<float *>(
paddle_mobile::memory::Alloc(sizeof(float) * KC * NC)); paddle_mobile::memory::Alloc(sizeof(float) * KC * NC));
procPackB(KC, NC, NC % NR, B, ldb, packedB); (*this.*procPackB)(KC, NC, NC % NR, B, ldb, packedB);
packedA = static_cast<float *>( packedA = static_cast<float *>(
paddle_mobile::memory::Alloc(sizeof(float) * MC * KC * max_threads)); paddle_mobile::memory::Alloc(sizeof(float) * MC * KC * max_threads));
} else { } else {
...@@ -3265,18 +3272,19 @@ void Sgemm_omp(int m, int n, int k, float alpha, const float *A, int lda, ...@@ -3265,18 +3272,19 @@ void Sgemm_omp(int m, int n, int k, float alpha, const float *A, int lda,
MC = (m + MR - 1) / MR * MR; MC = (m + MR - 1) / MR * MR;
#if __aarch64__ #if __aarch64__
procPackA = PackMatrixA_omp_6r; procPackA = &Gemm::PackMatrixA_omp_6r;
procPackB = PackMatrixB_16c; procPackB = &Gemm::PackMatrixB_16c;
procAddDot = AddDot6x16; procAddDot = &Gemm::AddDot6x16;
#else #else
procPackA = PackMatrixA_omp_6r;
procPackB = PackMatrixB_8c; procPackA = &Gemm::PackMatrixA_omp_6r;
procAddDot = AddDot6x8; procPackB = &Gemm::PackMatrixB_8c;
procAddDot = &Gemm::AddDot6x8;
#endif #endif
packedA = static_cast<float *>( packedA = static_cast<float *>(
paddle_mobile::memory::Alloc(sizeof(float) * MC * KC)); paddle_mobile::memory::Alloc(sizeof(float) * MC * KC));
procPackA(MC, KC, MC % MR, A, lda, packedA); (*this.*procPackA)(MC, KC, MC % MR, A, lda, packedA);
packedB = static_cast<float *>( packedB = static_cast<float *>(
paddle_mobile::memory::Alloc(sizeof(float) * KC * NC * max_threads)); paddle_mobile::memory::Alloc(sizeof(float) * KC * NC * max_threads));
} }
...@@ -3298,7 +3306,7 @@ void Sgemm_omp(int m, int n, int k, float alpha, const float *A, int lda, ...@@ -3298,7 +3306,7 @@ void Sgemm_omp(int m, int n, int k, float alpha, const float *A, int lda,
mc = s_min(m - i, MC); mc = s_min(m - i, MC);
float *local_A = packedA + MC * KC * local_threads; float *local_A = packedA + MC * KC * local_threads;
float *local_C = packedC + MC * NC * local_threads; float *local_C = packedC + MC * NC * local_threads;
procPackA(mc, KC, mc % MR, &A(i, 0), lda, local_A); (*this.*procPackA)(mc, KC, mc % MR, &A(i, 0), lda, local_A);
InnerKernelWithBias(mc, n, alpha, local_A, packedB, beta, local_C, InnerKernelWithBias(mc, n, alpha, local_A, packedB, beta, local_C,
&C(i, 0), ldc, relu, bias + i); &C(i, 0), ldc, relu, bias + i);
} }
...@@ -3315,7 +3323,7 @@ void Sgemm_omp(int m, int n, int k, float alpha, const float *A, int lda, ...@@ -3315,7 +3323,7 @@ void Sgemm_omp(int m, int n, int k, float alpha, const float *A, int lda,
nc = s_min(n - j, NC); nc = s_min(n - j, NC);
float *local_B = packedB + KC * NC * local_threads; float *local_B = packedB + KC * NC * local_threads;
float *local_C = packedC + MC * NC * local_threads; float *local_C = packedC + MC * NC * local_threads;
procPackB(KC, nc, nc % NR, &B(0, j), ldb, local_B); (*this.*procPackB)(KC, nc, nc % NR, &B(0, j), ldb, local_B);
InnerKernelWithBias(m, nc, alpha, packedA, local_B, beta, local_C, InnerKernelWithBias(m, nc, alpha, packedA, local_B, beta, local_C,
&C(0, j), ldc, relu, bias); &C(0, j), ldc, relu, bias);
} }
...@@ -3327,10 +3335,10 @@ void Sgemm_omp(int m, int n, int k, float alpha, const float *A, int lda, ...@@ -3327,10 +3335,10 @@ void Sgemm_omp(int m, int n, int k, float alpha, const float *A, int lda,
paddle_mobile::memory::Free(zero); paddle_mobile::memory::Free(zero);
} }
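Because the packing and micro-kernel routines are now non-static members of Gemm, the old plain function pointers become pointer-to-member-function types, which is why the assignments change to &Gemm::PackMatrixA_6r and the calls to (*this.*procPackB)(...). A minimal standalone demonstration of that syntax (Widget and hello are made-up names, not library API; (this->*proc)(x) is the equivalent spelling used below):

#include <cstdio>

struct Widget {
  typedef void (Widget::*Fn)(int);       // same shape as Gemm::FnPack
  Fn proc = nullptr;
  void hello(int x) { std::printf("hello %d\n", x); }
  void run(int x) { (this->*proc)(x); }  // like (*this.*procPackB)(...)
};

int main() {
  Widget w;
  w.proc = &Widget::hello;  // like procPackA = &Gemm::PackMatrixA_6r;
  w.run(42);
  return 0;
}

Keeping these pointers and the packing buffers inside the object, rather than at file scope, also means two Gemm instances can run concurrently without clobbering each other's state, which appears to be the point of the refactor.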
void SgemmWithBn_omp(int m, int n, int k, float alpha, const float *A, int lda, void Gemm::SgemmWithBn_omp(int m, int n, int k, float alpha, const float *A,
const float *B, int ldb, float beta, float *C, int ldc, int lda, const float *B, int ldb, float beta,
bool relu, float *new_scale, float *new_bias, float *C, int ldc, bool relu, float *new_scale,
float *bias) { float *new_bias, float *bias) {
#ifdef _OPENMP #ifdef _OPENMP
int max_threads = omp_get_max_threads(); int max_threads = omp_get_max_threads();
#else #else
...@@ -3353,18 +3361,18 @@ void SgemmWithBn_omp(int m, int n, int k, float alpha, const float *A, int lda, ...@@ -3353,18 +3361,18 @@ void SgemmWithBn_omp(int m, int n, int k, float alpha, const float *A, int lda,
NC = (n + NR - 1) / NR * NR; NC = (n + NR - 1) / NR * NR;
#if __aarch64__ #if __aarch64__
procPackA = PackMatrixA_6r; procPackA = &Gemm::PackMatrixA_6r;
procPackB = PackMatrixB_omp_16c; procPackB = &Gemm::PackMatrixB_omp_16c;
procAddDot = AddDot6x16; procAddDot = &Gemm::AddDot6x16;
#else #else
procPackA = PackMatrixA_6r; procPackA = &Gemm::PackMatrixA_6r;
procPackB = PackMatrixB_omp_8c; procPackB = &Gemm::PackMatrixB_omp_8c;
procAddDot = AddDot6x8; procAddDot = &Gemm::AddDot6x8;
#endif #endif
packedB = static_cast<float *>( packedB = static_cast<float *>(
paddle_mobile::memory::Alloc(sizeof(float) * KC * NC)); paddle_mobile::memory::Alloc(sizeof(float) * KC * NC));
procPackB(KC, NC, NC % NR, B, ldb, packedB); (*this.*procPackB)(KC, NC, NC % NR, B, ldb, packedB);
packedA = static_cast<float *>( packedA = static_cast<float *>(
paddle_mobile::memory::Alloc(sizeof(float) * MC * KC * max_threads)); paddle_mobile::memory::Alloc(sizeof(float) * MC * KC * max_threads));
} else { } else {
...@@ -3381,18 +3389,18 @@ void SgemmWithBn_omp(int m, int n, int k, float alpha, const float *A, int lda, ...@@ -3381,18 +3389,18 @@ void SgemmWithBn_omp(int m, int n, int k, float alpha, const float *A, int lda,
MC = (m + MR - 1) / MR * MR; MC = (m + MR - 1) / MR * MR;
#if __aarch64__ #if __aarch64__
procPackA = PackMatrixA_omp_6r; procPackA = &Gemm::PackMatrixA_omp_6r;
procPackB = PackMatrixB_16c; procPackB = &Gemm::PackMatrixB_16c;
procAddDot = AddDot6x16; procAddDot = &Gemm::AddDot6x16;
#else #else
procPackA = PackMatrixA_omp_6r; procPackA = &Gemm::PackMatrixA_omp_6r;
procPackB = PackMatrixB_8c; procPackB = &Gemm::PackMatrixB_8c;
procAddDot = AddDot6x8; procAddDot = &Gemm::AddDot6x8;
#endif #endif
packedA = static_cast<float *>( packedA = static_cast<float *>(
paddle_mobile::memory::Alloc(sizeof(float) * MC * KC)); paddle_mobile::memory::Alloc(sizeof(float) * MC * KC));
procPackA(MC, KC, MC % MR, A, lda, packedA); (*this.*procPackA)(MC, KC, MC % MR, A, lda, packedA);
packedB = static_cast<float *>( packedB = static_cast<float *>(
paddle_mobile::memory::Alloc(sizeof(float) * KC * NC * max_threads)); paddle_mobile::memory::Alloc(sizeof(float) * KC * NC * max_threads));
} }
...@@ -3414,7 +3422,7 @@ void SgemmWithBn_omp(int m, int n, int k, float alpha, const float *A, int lda, ...@@ -3414,7 +3422,7 @@ void SgemmWithBn_omp(int m, int n, int k, float alpha, const float *A, int lda,
mc = s_min(m - i, MC); mc = s_min(m - i, MC);
float *local_A = packedA + MC * KC * local_threads; float *local_A = packedA + MC * KC * local_threads;
float *local_C = packedC + MC * NC * local_threads; float *local_C = packedC + MC * NC * local_threads;
procPackA(mc, KC, mc % MR, &A(i, 0), lda, local_A); (*this.*procPackA)(mc, KC, mc % MR, &A(i, 0), lda, local_A);
if (bias == nullptr) { if (bias == nullptr) {
InnerKernelWithBn(mc, n, alpha, local_A, packedB, beta, local_C, InnerKernelWithBn(mc, n, alpha, local_A, packedB, beta, local_C,
&C(i, 0), ldc, relu, new_scale + i, new_bias + i); &C(i, 0), ldc, relu, new_scale + i, new_bias + i);
...@@ -3437,7 +3445,7 @@ void SgemmWithBn_omp(int m, int n, int k, float alpha, const float *A, int lda, ...@@ -3437,7 +3445,7 @@ void SgemmWithBn_omp(int m, int n, int k, float alpha, const float *A, int lda,
nc = s_min(n - j, NC); nc = s_min(n - j, NC);
float *local_B = packedB + KC * NC * local_threads; float *local_B = packedB + KC * NC * local_threads;
float *local_C = packedC + MC * NC * local_threads; float *local_C = packedC + MC * NC * local_threads;
procPackB(KC, nc, nc % NR, &B(0, j), ldb, local_B); (*this.*procPackB)(KC, nc, nc % NR, &B(0, j), ldb, local_B);
if (bias == nullptr) { if (bias == nullptr) {
InnerKernelWithBn(m, nc, alpha, packedA, local_B, beta, local_C, InnerKernelWithBn(m, nc, alpha, packedA, local_B, beta, local_C,
&C(0, j), ldc, relu, new_scale, new_bias); &C(0, j), ldc, relu, new_scale, new_bias);
...@@ -3455,9 +3463,10 @@ void SgemmWithBn_omp(int m, int n, int k, float alpha, const float *A, int lda, ...@@ -3455,9 +3463,10 @@ void SgemmWithBn_omp(int m, int n, int k, float alpha, const float *A, int lda,
paddle_mobile::memory::Free(zero); paddle_mobile::memory::Free(zero);
} }
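Both *_omp variants split the work the same way: depending on the branch taken, either B is packed once up front and each thread packs its own block of A inside the row loop, or the roles are swapped, and in both cases each thread indexes its own slice of the shared scratch allocations by its thread id (the local_threads variable above). A compact sketch of that per-thread layout, with names and sizes chosen for illustration only (compile with -fopenmp; the real code allocates with memory::Alloc):

#include <omp.h>
#include <vector>

void OmpScratchSketch(int m, int MC, int KC, int NC) {
  int max_threads = omp_get_max_threads();
  std::vector<float> packedA(static_cast<size_t>(MC) * KC * max_threads);
  std::vector<float> packedC(static_cast<size_t>(MC) * NC * max_threads);
#pragma omp parallel for
  for (int i = 0; i < m; i += MC) {
    int tid = omp_get_thread_num();
    // each thread owns one MC x KC block of packedA and one MC x NC block of
    // packedC; the fully packed B block is shared read-only
    float *local_A = packedA.data() + static_cast<size_t>(MC) * KC * tid;
    float *local_C = packedC.data() + static_cast<size_t>(MC) * NC * tid;
    (void)local_A;  // pack rows [i, i + MC) of A into local_A here
    (void)local_C;  // run the inner kernel into local_C here
  }
}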
void SgemmWithPRelu_omp(int m, int n, int k, const float *A, int lda, void Gemm::SgemmWithPRelu_omp(int m, int n, int k, const float *A, int lda,
const float *B, int ldb, float *C, int ldc, float *p, const float *B, int ldb, float *C, int ldc,
std::string mode, float *bias, float *bias1) { float *p, std::string mode, float *bias,
float *bias1) {
#ifdef _OPENMP #ifdef _OPENMP
int max_threads = omp_get_max_threads(); int max_threads = omp_get_max_threads();
#else #else
...@@ -3480,18 +3489,18 @@ void SgemmWithPRelu_omp(int m, int n, int k, const float *A, int lda, ...@@ -3480,18 +3489,18 @@ void SgemmWithPRelu_omp(int m, int n, int k, const float *A, int lda,
NC = (n + NR - 1) / NR * NR; NC = (n + NR - 1) / NR * NR;
#if __aarch64__ #if __aarch64__
procPackA = PackMatrixA_6r; procPackA = &Gemm::PackMatrixA_6r;
procPackB = PackMatrixB_omp_16c; procPackB = &Gemm::PackMatrixB_omp_16c;
procAddDot = AddDot6x16; procAddDot = &Gemm::AddDot6x16;
#else #else
procPackA = PackMatrixA_6r; procPackA = &Gemm::PackMatrixA_6r;
procPackB = PackMatrixB_omp_8c; procPackB = &Gemm::PackMatrixB_omp_8c;
procAddDot = AddDot6x8; procAddDot = &Gemm::AddDot6x8;
#endif #endif
packedB = static_cast<float *>( packedB = static_cast<float *>(
paddle_mobile::memory::Alloc(sizeof(float) * KC * NC)); paddle_mobile::memory::Alloc(sizeof(float) * KC * NC));
procPackB(KC, NC, NC % NR, B, ldb, packedB); (*this.*procPackB)(KC, NC, NC % NR, B, ldb, packedB);
packedA = static_cast<float *>( packedA = static_cast<float *>(
paddle_mobile::memory::Alloc(sizeof(float) * MC * KC * max_threads)); paddle_mobile::memory::Alloc(sizeof(float) * MC * KC * max_threads));
} else { } else {
...@@ -3508,18 +3517,18 @@ void SgemmWithPRelu_omp(int m, int n, int k, const float *A, int lda, ...@@ -3508,18 +3517,18 @@ void SgemmWithPRelu_omp(int m, int n, int k, const float *A, int lda,
MC = (m + MR - 1) / MR * MR; MC = (m + MR - 1) / MR * MR;
#if __aarch64__ #if __aarch64__
procPackA = PackMatrixA_omp_6r; procPackA = &Gemm::PackMatrixA_omp_6r;
procPackB = PackMatrixB_16c; procPackB = &Gemm::PackMatrixB_16c;
procAddDot = AddDot6x16; procAddDot = &Gemm::AddDot6x16;
#else #else
procPackA = PackMatrixA_omp_6r; procPackA = &Gemm::PackMatrixA_omp_6r;
procPackB = PackMatrixB_8c; procPackB = &Gemm::PackMatrixB_8c;
procAddDot = AddDot6x8; procAddDot = &Gemm::AddDot6x8;
#endif #endif
packedA = static_cast<float *>( packedA = static_cast<float *>(
paddle_mobile::memory::Alloc(sizeof(float) * MC * KC)); paddle_mobile::memory::Alloc(sizeof(float) * MC * KC));
procPackA(MC, KC, MC % MR, A, lda, packedA); (*this.*procPackA)(MC, KC, MC % MR, A, lda, packedA);
packedB = static_cast<float *>( packedB = static_cast<float *>(
paddle_mobile::memory::Alloc(sizeof(float) * KC * NC * max_threads)); paddle_mobile::memory::Alloc(sizeof(float) * KC * NC * max_threads));
} }
...@@ -3541,7 +3550,7 @@ void SgemmWithPRelu_omp(int m, int n, int k, const float *A, int lda, ...@@ -3541,7 +3550,7 @@ void SgemmWithPRelu_omp(int m, int n, int k, const float *A, int lda,
mc = s_min(m - i, MC); mc = s_min(m - i, MC);
float *local_A = packedA + MC * KC * local_threads; float *local_A = packedA + MC * KC * local_threads;
float *local_C = packedC + MC * NC * local_threads; float *local_C = packedC + MC * NC * local_threads;
procPackA(mc, KC, mc % MR, &A(i, 0), lda, local_A); (*this.*procPackA)(mc, KC, mc % MR, &A(i, 0), lda, local_A);
if (bias1 == nullptr) { if (bias1 == nullptr) {
InnerKernelWithPRelu(mc, n, local_A, packedB, local_C, &C(i, 0), ldc, InnerKernelWithPRelu(mc, n, local_A, packedB, local_C, &C(i, 0), ldc,
p + i, mode, bias + i, nullptr); p + i, mode, bias + i, nullptr);
...@@ -3563,7 +3572,7 @@ void SgemmWithPRelu_omp(int m, int n, int k, const float *A, int lda, ...@@ -3563,7 +3572,7 @@ void SgemmWithPRelu_omp(int m, int n, int k, const float *A, int lda,
nc = s_min(n - j, NC); nc = s_min(n - j, NC);
float *local_B = packedB + KC * NC * local_threads; float *local_B = packedB + KC * NC * local_threads;
float *local_C = packedC + MC * NC * local_threads; float *local_C = packedC + MC * NC * local_threads;
procPackB(KC, nc, nc % NR, &B(0, j), ldb, local_B); (*this.*procPackB)(KC, nc, nc % NR, &B(0, j), ldb, local_B);
if (bias1 == nullptr) { if (bias1 == nullptr) {
InnerKernelWithPRelu(m, nc, packedA, local_B, local_C, &C(0, j), ldc, p, InnerKernelWithPRelu(m, nc, packedA, local_B, local_C, &C(0, j), ldc, p,
mode, bias, nullptr); mode, bias, nullptr);
...@@ -3580,7 +3589,7 @@ void SgemmWithPRelu_omp(int m, int n, int k, const float *A, int lda, ...@@ -3580,7 +3589,7 @@ void SgemmWithPRelu_omp(int m, int n, int k, const float *A, int lda,
paddle_mobile::memory::Free(zero); paddle_mobile::memory::Free(zero);
} }
void AddDot6x8(int k, const float *a, const float *b, float *c, int ldc) { void Gemm::AddDot6x8(int k, const float *a, const float *b, float *c, int ldc) {
#if __ARM_NEON #if __ARM_NEON
#if __aarch64__ #if __aarch64__
...@@ -3867,7 +3876,8 @@ void AddDot6x8(int k, const float *a, const float *b, float *c, int ldc) { ...@@ -3867,7 +3876,8 @@ void AddDot6x8(int k, const float *a, const float *b, float *c, int ldc) {
} }
#if __aarch64__ #if __aarch64__
void AddDot8x12(int k, const float *a, const float *b, float *c, int ldc) { void Gemm::AddDot8x12(int k, const float *a, const float *b, float *c,
int ldc) {
const float *a_ptr, *b_ptr; const float *a_ptr, *b_ptr;
a_ptr = a; a_ptr = a;
b_ptr = b; b_ptr = b;
...@@ -3956,7 +3966,8 @@ void AddDot8x12(int k, const float *a, const float *b, float *c, int ldc) { ...@@ -3956,7 +3966,8 @@ void AddDot8x12(int k, const float *a, const float *b, float *c, int ldc) {
"v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28"); "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28");
} }
void AddDot6x16(int k, const float *a, const float *b, float *c, int ldc) { void Gemm::AddDot6x16(int k, const float *a, const float *b, float *c,
int ldc) {
const float *a_ptr, *b_ptr; const float *a_ptr, *b_ptr;
a_ptr = a; a_ptr = a;
b_ptr = b; b_ptr = b;
......
...@@ -35,146 +35,166 @@ namespace paddle_mobile { ...@@ -35,146 +35,166 @@ namespace paddle_mobile {
namespace operators { namespace operators {
namespace math { namespace math {
/* class Gemm {
public:
/*
// Copy blocks of matrix A into contiguous memory (ColMajor) // Copy blocks of matrix A into contiguous memory (ColMajor)
void PackMatrixA(int m, int k, int m_tail, const float *A, int lda, void PackMatrixA(int m, int k, int m_tail, const float *A, int lda,
float *buffer); float *buffer);
// Copy blocks of matrix B into contiguous memory (ColMajor) // Copy blocks of matrix B into contiguous memory (ColMajor)
void PackMatrixB(int k, int n, int n_tail, const float *B, int ldb, void PackMatrixB(int k, int n, int n_tail, const float *B, int ldb,
float *buffer); float *buffer);
*/ */
typedef void (Gemm::*FnPack)(int, int, int, const float *, int, float *);
// 将 A 矩阵分块复制到连续内存(RowMajor) typedef void (Gemm::*FnAddDot)(int, const float *, const float *, float *,
void PackMatrixA_4r(int m, int k, int m_tail, const float *A, int lda, int);
float *buffer); FnPack procPackA;
void PackMatrixA_6r(int m, int k, int m_tail, const float *A, int lda, FnPack procPackB;
float *buffer); FnAddDot procAddDot;
void PackMatrixA_8r(int m, int k, int m_tail, const float *A, int lda,
float *buffer); // 将 A 矩阵分块复制到连续内存(RowMajor)
void PackMatrixA_omp_6r(int m, int k, int m_tail, const float *A, int lda, void PackMatrixA_4r(int m, int k, int m_tail, const float *A, int lda,
float *buffer); float *buffer);
void PackMatrixA_omp_8r(int m, int k, int m_tail, const float *A, int lda, void PackMatrixA_6r(int m, int k, int m_tail, const float *A, int lda,
float *buffer); float *buffer);
void PackMatrixA_8r(int m, int k, int m_tail, const float *A, int lda,
// 将 B 矩阵分块复制到连续内存(RowMajor) float *buffer);
void PackMatrixB_8c(int k, int n, int n_tail, const float *B, int ldb, void PackMatrixA_omp_6r(int m, int k, int m_tail, const float *A, int lda,
float *buffer); float *buffer);
void PackMatrixB_12c(int k, int n, int n_tail, const float *B, int ldb, void PackMatrixA_omp_8r(int m, int k, int m_tail, const float *A, int lda,
float *buffer); float *buffer);
void PackMatrixB_16c(int k, int n, int n_tail, const float *B, int ldb,
float *buffer); // 将 B 矩阵分块复制到连续内存(RowMajor)
void PackMatrixB_omp_8c(int k, int n, int n_tail, const float *B, int ldb, void PackMatrixB_8c(int k, int n, int n_tail, const float *B, int ldb,
float *buffer); float *buffer);
void PackMatrixB_omp_12c(int k, int n, int n_tail, const float *B, int ldb, void PackMatrixB_12c(int k, int n, int n_tail, const float *B, int ldb,
float *buffer); float *buffer);
void PackMatrixB_omp_16c(int k, int n, int n_tail, const float *B, int ldb, void PackMatrixB_16c(int k, int n, int n_tail, const float *B, int ldb,
float *buffer); float *buffer);
void PackMatrixB_omp_8c(int k, int n, int n_tail, const float *B, int ldb,
// 分块矩阵乘法 float *buffer);
void InnerKernel(int mc, int nc, float alpha, const float *a, const float *b, void PackMatrixB_omp_12c(int k, int n, int n_tail, const float *B, int ldb,
float beta, float *c, float *C, int ldc, bool relu); float *buffer);
void InnerKernelWithBias(int mc, int nc, float alpha, const float *a, void PackMatrixB_omp_16c(int k, int n, int n_tail, const float *B, int ldb,
float *buffer);
// Blocked matrix multiplication
void InnerKernel(int mc, int nc, float alpha, const float *a, const float *b,
float beta, float *c, float *C, int ldc, bool relu);
void InnerKernelWithBias(int mc, int nc, float alpha, const float *a,
const float *b, float beta, float *c, float *C,
int ldc, bool relu, float *bias);
void InnerKernelWithBn(int mc, int nc, float alpha, const float *a,
const float *b, float beta, float *c, float *C, const float *b, float beta, float *c, float *C,
int ldc, bool relu, float *bias); int ldc, bool relu, float *new_scale, float *new_bias);
void InnerKernelWithBnAdd(int mc, int nc, float alpha, const float *a,
void InnerKernelWithBn(int mc, int nc, float alpha, const float *a, const float *b, float beta, float *c, float *C,
const float *b, float beta, float *c, float *C, int ldc, int ldc, bool relu, float *new_scale,
bool relu, float *new_scale, float *new_bias); float *new_bias, float *bias);
void InnerKernelWithBnAdd(int mc, int nc, float alpha, const float *a, void InnerKernelWithPRelu(int mc, int nc, const float *a, const float *b,
const float *b, float beta, float *c, float *C, float *c, float *C, int ldc, float *p,
int ldc, bool relu, float *new_scale, float *new_bias, std::string mode, float *bias, float *bias1);
/*
// Vector-matrix multiplication (M = 1)
void VectorKernel(int m, int n, int k, float alpha, const float *A, int lda,
const float *B, int ldb, float beta, float *C, int ldc,
bool relu);
void VectorKernelWithBn(int m, int n, int k, float alpha, const float *A,
int lda, const float *B, int ldb, float beta, float
*C, int ldc, bool relu, float *new_scale, float *new_bias);
*/
// Compute a smaller block of the C matrix
void AddDot4x4(int k, const float *a, const float *b, float *c, int ldc);
void AddDot4x8(int k, const float *a, const float *b, float *c, int ldc);
void AddDot6x8(int k, const float *a, const float *b, float *c, int ldc);
void AddDot8x12(int k, const float *a, const float *b, float *c, int ldc);
void AddDot6x16(int k, const float *a, const float *b, float *c, int ldc);
// Write back the blocked matrix multiplication results
// C = A * B
void WriteBasic(int mc, int nc, float *c, float *C, int ldc);
// C = alpha * A * B + beta * C
void WriteWithAlphaBeta(int mc, int nc, float *c, float *C, int ldc);
// C = A * B + C
void WriteWithAdd(int mc, int nc, float *c, float *C, int ldc);
// C = A * B + bias
void WriteWithAddV1(int mc, int nc, float *c, float *C, int ldc, float *bias);
// C = A * B + C, relu(C)
void WriteWithAddRelu(int mc, int nc, float *c, float *C, int ldc);
// C = A * B + C,prelu(C)
void WriteWithAddPRelu(int mc, int nc, float *c, float *C, int ldc, float *p,
std::string mode, float *bias, float *bias1);
// C = A * B + bias ,relu(C)
void WriteWithAddReluV1(int mc, int nc, float *c, float *C, int ldc,
float *bias); float *bias);
void InnerKernelWithPRelu(int mc, int nc, const float *a, const float *b, // C = A * B, batchnorm(C)
float *c, float *C, int ldc, float *p, void WriteWithBn(int mc, int nc, float *c, float *C, int ldc,
std::string mode, float *bias, float *bias1); float *new_scale, float *new_bias);
/* // C = A * B, batchnorm(C), relu(C)
// 向量矩阵乘法 (M = 1) void WriteWithBnRelu(int mc, int nc, float *c, float *C, int ldc,
void VectorKernel(int m, int n, int k, float alpha, const float *A, int lda, float *new_scale, float *new_bias);
const float *B, int ldb, float beta, float *C, int ldc, void WriteWithBnAddRelu(int mc, int nc, float *c, float *C, int ldc,
bool relu); float *new_scale, float *new_bias, float *bias1);
/*
void VectorKernelWithBn(int m, int n, int k, float alpha, const float *A, // 向量矩阵乘法结果回写
int lda, const float *B, int ldb, float beta, float *C, // C = A * B
int ldc, bool relu, float *new_scale, float *new_bias); void VecWriteBasic(int n, float *c, float *C, int ldc);
*/ // C = alpha * A * B + beta * C
void VecWriteWithAlphaBeta(int n, float *c, float *C, int ldc);
// C = A * B + C
void VecWriteWithAdd(int n, float *c, float *C, int ldc);
// C = A * B + C, relu(C)
void VecWriteWithAddRelu(int n, float *c, float *C, int ldc);
// C = A * B, batchnorm(C)
void VecWriteWithBn(int n, float *c, float *C, int ldc, float *new_scale,
float *new_bias);
// C = A * B, batchnorm(C), relu(C)
void VecWriteWithBnRelu(int n, float *c, float *C, int ldc, float *new_scale,
float *new_bias);
*/
// 32-bit float matrix multiplication
void Sgemm(int m, int n, int k, float alpha, const float *A, int lda,
const float *B, int ldb, float beta, float *C, int ldc, bool relu,
float *bias);
// 32-bit float matrix multiplication, then batchnorm on the result
void SgemmWithBn(int m, int n, int k, float alpha, const float *A, int lda,
const float *B, int ldb, float beta, float *C, int ldc,
bool relu, float *new_scale, float *new_bias, float *bias);
void SgemmWithPRelu(int m, int n, int k, const float *A, int lda,
const float *B, int ldb, float *C, int ldc, float *p,
std::string mode, float *bias, float *bias1);
// 32-bit float matrix multiplication (OpenMP multithreaded version)
void Sgemm_omp(int m, int n, int k, float alpha, const float *A, int lda,
const float *B, int ldb, float beta, float *C, int ldc,
bool relu, float *bias);
// Compute a smaller block of the C matrix // 32-bit float matrix multiplication, then batchnorm on the result (OpenMP multithreaded version)
void AddDot4x4(int k, const float *a, const float *b, float *c, int ldc); void SgemmWithBn_omp(int m, int n, int k, float alpha, const float *A,
void AddDot4x8(int k, const float *a, const float *b, float *c, int ldc); int lda, const float *B, int ldb, float beta, float *C,
void AddDot6x8(int k, const float *a, const float *b, float *c, int ldc); int ldc, bool relu, float *new_scale, float *new_bias,
void AddDot8x12(int k, const float *a, const float *b, float *c, int ldc); float *bias);
void AddDot6x16(int k, const float *a, const float *b, float *c, int ldc);
void SgemmWithPRelu_omp(int m, int n, int k, const float *A, int lda,
// 分块矩阵乘法结果回写 const float *B, int ldb, float *C, int ldc, float *p,
// C = A * B std::string mode, float *bias, float *bias1);
void WriteBasic(int mc, int nc, float *c, float *C, int ldc);
// C = alpha * A * B + beta * C
void WriteWithAlphaBeta(int mc, int nc, float *c, float *C, int ldc);
// C = A * B + C
void WriteWithAdd(int mc, int nc, float *c, float *C, int ldc);
// C = A * B + bias
void WriteWithAddV1(int mc, int nc, float *c, float *C, int ldc, float *bias);
// C = A * B + C, relu(C)
void WriteWithAddRelu(int mc, int nc, float *c, float *C, int ldc);
// C = A * B + C,prelu(C)
void WriteWithAddPRelu(int mc, int nc, float *c, float *C, int ldc, float *p,
std::string mode, float *bias, float *bias1);
// C = A * B + bias ,relu(C)
void WriteWithAddReluV1(int mc, int nc, float *c, float *C, int ldc,
float *bias);
// C = A * B, batchnorm(C)
void WriteWithBn(int mc, int nc, float *c, float *C, int ldc, float *new_scale,
float *new_bias);
// C = A * B, batchnorm(C), relu(C)
void WriteWithBnRelu(int mc, int nc, float *c, float *C, int ldc,
float *new_scale, float *new_bias);
void WriteWithBnAddRelu(int mc, int nc, float *c, float *C, int ldc,
float *new_scale, float *new_bias, float *bias1);
/*
// Write back the vector-matrix multiplication results
// C = A * B
void VecWriteBasic(int n, float *c, float *C, int ldc);
// C = alpha * A * B + beta * C
void VecWriteWithAlphaBeta(int n, float *c, float *C, int ldc);
// C = A * B + C
void VecWriteWithAdd(int n, float *c, float *C, int ldc);
// C = A * B + C, relu(C)
void VecWriteWithAddRelu(int n, float *c, float *C, int ldc);
// C = A * B, batchnorm(C)
void VecWriteWithBn(int n, float *c, float *C, int ldc, float *new_scale,
float *new_bias);
// C = A * B, batchnorm(C), relu(C)
void VecWriteWithBnRelu(int n, float *c, float *C, int ldc, float *new_scale,
float *new_bias);
*/
// 32-bit float matrix multiplication private:
void Sgemm(int m, int n, int k, float alpha, const float *A, int lda, int MC = 0;
const float *B, int ldb, float beta, float *C, int ldc, bool relu, int KC = 0;
float *bias); int NC = 0;
// 32-bit float matrix multiplication, then batchnorm on the result float *packedA;
void SgemmWithBn(int m, int n, int k, float alpha, const float *A, int lda, float *packedB;
const float *B, int ldb, float beta, float *C, int ldc, float *packedC;
bool relu, float *new_scale, float *new_bias, float *bias); float *zero;
void SgemmWithPRelu(int m, int n, int k, const float *A, int lda, };
const float *B, int ldb, float *C, int ldc, float *p,
std::string mode, float *bias, float *bias1);
// 32位 float 矩阵乘法(openmp 多线程版本)
void Sgemm_omp(int m, int n, int k, float alpha, const float *A, int lda,
const float *B, int ldb, float beta, float *C, int ldc,
bool relu, float *bias);
// 32位 float 矩阵乘法, 并对结果进行 batchnrom(openmp 多线程版本)
void SgemmWithBn_omp(int m, int n, int k, float alpha, const float *A, int lda,
const float *B, int ldb, float beta, float *C, int ldc,
bool relu, float *new_scale, float *new_bias, float *bias);
void SgemmWithPRelu_omp(int m, int n, int k, const float *A, int lda,
const float *B, int ldb, float *C, int ldc, float *p,
std::string mode, float *bias, float *bias1);
} // namespace math } // namespace math
} // namespace operators } // namespace operators
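The Write* declarations above differ only in what they apply to a freshly computed block of C before storing it. As an illustration (not the NEON implementation; the function name is invented for this sketch), a scalar version of the batchnorm + relu variant, assuming new_scale/new_bias hold one folded batchnorm value per row of C and that the block buffer c is densely packed with stride nc:

void WriteWithBnReluScalar(int mc, int nc, const float *c, float *C, int ldc,
                           const float *new_scale, const float *new_bias) {
  for (int i = 0; i < mc; ++i) {
    for (int j = 0; j < nc; ++j) {
      // batchnorm folded into a per-row scale and bias, then relu
      float v = c[i * nc + j] * new_scale[i] + new_bias[i];
      C[i * ldc + j] = v > 0.f ? v : 0.f;
    }
  }
}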
......
...@@ -28,19 +28,22 @@ struct GRUUnitFunctor<CPU, T> { ...@@ -28,19 +28,22 @@ struct GRUUnitFunctor<CPU, T> {
static void compute(GRUMetaValue<T> value, int frame_size, int batch_size, static void compute(GRUMetaValue<T> value, int frame_size, int batch_size,
const ActivationType active_node, const ActivationType active_node,
const ActivationType active_gate) { const ActivationType active_gate) {
Gemm gemm;
if (value.prev_out_value) { if (value.prev_out_value) {
Sgemm(batch_size, frame_size * 2, frame_size, 1, value.prev_out_value, gemm.Sgemm(batch_size, frame_size * 2, frame_size, 1,
frame_size, value.gate_weight, frame_size * 2, 1, value.gate_value, value.prev_out_value, frame_size, value.gate_weight,
frame_size * 3, false, nullptr); frame_size * 2, 1, value.gate_value, frame_size * 3, false,
nullptr);
} }
forward_reset_output(forward::gru_resetOutput<T>(), value, frame_size, forward_reset_output(forward::gru_resetOutput<T>(), value, frame_size,
batch_size, active_gate); batch_size, active_gate);
if (value.prev_out_value) { if (value.prev_out_value) {
Sgemm(batch_size, frame_size, frame_size, 1, value.reset_output_value, gemm.Sgemm(batch_size, frame_size, frame_size, 1,
frame_size, value.state_weight, frame_size, 1, value.reset_output_value, frame_size, value.state_weight,
value.gate_value + frame_size * 2, frame_size * 3, false, nullptr); frame_size, 1, value.gate_value + frame_size * 2,
frame_size * 3, false, nullptr);
} }
forward_final_output(forward::gru_finalOutput<T>(), value, frame_size, forward_final_output(forward::gru_finalOutput<T>(), value, frame_size,
......
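For reference, the first Sgemm call in the GRUUnitFunctor::compute hunk above accumulates the update/reset gate pre-activations. A plain scalar equivalent (illustrative only; the helper name is invented here and it assumes the row-major layouts implied by the leading dimensions in the call):

void GruGateGemmReference(const float *prev_out, const float *gate_weight,
                          float *gate_value, int batch_size, int frame_size) {
  // gate_value[b, j] += sum_k prev_out[b, k] * gate_weight[k, j]
  // for j in [0, 2 * frame_size), matching
  // Sgemm(batch, 2*frame_size, frame_size, 1, A, frame_size,
  //       B, 2*frame_size, 1, C, 3*frame_size, false, nullptr).
  for (int b = 0; b < batch_size; ++b) {
    for (int j = 0; j < 2 * frame_size; ++j) {
      float acc = 0.f;
      for (int k = 0; k < frame_size; ++k) {
        acc += prev_out[b * frame_size + k] * gate_weight[k * 2 * frame_size + j];
      }
      gate_value[b * 3 * frame_size + j] += acc;
    }
  }
}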
...@@ -14,6 +14,7 @@ limitations under the License. */ ...@@ -14,6 +14,7 @@ limitations under the License. */
#include "operators/math/math_function.h" #include "operators/math/math_function.h"
#include <cstring> #include <cstring>
#include <string>
#include "operators/math/gemm.h" #include "operators/math/gemm.h"
namespace paddle_mobile { namespace paddle_mobile {
...@@ -35,12 +36,13 @@ void matmul<float>(const framework::Tensor &matrix_a, bool trans_a, ...@@ -35,12 +36,13 @@ void matmul<float>(const framework::Tensor &matrix_a, bool trans_a,
int M = dim_out[0]; int M = dim_out[0];
int N = dim_out[1]; int N = dim_out[1];
int K = (!trans_a) ? dim_a[1] : dim_a[0]; int K = (!trans_a) ? dim_a[1] : dim_a[0];
Gemm gemm;
if (trans_a) { if (trans_a) {
int numel = matrix_a.numel(); int numel = matrix_a.numel();
int m = matrix_a.dims()[0]; int m = matrix_a.dims()[0];
int n = matrix_a.dims()[1]; int n = matrix_a.dims()[1];
float *tmp = (float *)(matrix_a.data<float>()); float *tmp = (float *)(matrix_a.data<float>()); // NOLINT
float *a = static_cast<float *>( float *a = static_cast<float *>(
paddle_mobile::memory::Alloc(sizeof(float) * numel)); paddle_mobile::memory::Alloc(sizeof(float) * numel));
int index = 0; int index = 0;
...@@ -49,20 +51,24 @@ void matmul<float>(const framework::Tensor &matrix_a, bool trans_a, ...@@ -49,20 +51,24 @@ void matmul<float>(const framework::Tensor &matrix_a, bool trans_a,
a[index++] = tmp[i * n + j]; a[index++] = tmp[i * n + j];
} }
} }
#ifdef _OPENMP #ifdef _OPENMP
Sgemm_omp(M, N, K, alpha, a, K, matrix_b.data<float>(), N, beta,
matrix_out->data<float>(), N, relu, bias); gemm.Sgemm_omp(M, N, K, alpha, a, K, matrix_b.data<float>(), N, beta,
matrix_out->data<float>(), N, relu, bias);
#else #else
Sgemm(M, N, K, alpha, a, K, matrix_b.data<float>(), N, beta, gemm.Sgemm(M, N, K, alpha, a, K, matrix_b.data<float>(), N, beta,
matrix_out->data<float>(), N, relu, bias); matrix_out->data<float>(), N, relu, bias);
#endif #endif
} else { } else {
#ifdef _OPENMP #ifdef _OPENMP
Sgemm_omp(M, N, K, alpha, matrix_a.data<float>(), K, matrix_b.data<float>(), gemm.Sgemm_omp(M, N, K, alpha, matrix_a.data<float>(), K,
N, beta, matrix_out->data<float>(), N, relu, bias); matrix_b.data<float>(), N, beta, matrix_out->data<float>(),
N, relu, bias);
#else #else
Sgemm(M, N, K, alpha, matrix_a.data<float>(), K, matrix_b.data<float>(), N, gemm.Sgemm(M, N, K, alpha, matrix_a.data<float>(), K,
beta, matrix_out->data<float>(), N, relu, bias); matrix_b.data<float>(), N, beta, matrix_out->data<float>(), N,
relu, bias);
#endif #endif
} }
} }
...@@ -73,6 +79,7 @@ void matmulWithBn<float>(const framework::Tensor &matrix_a, bool trans_a, ...@@ -73,6 +79,7 @@ void matmulWithBn<float>(const framework::Tensor &matrix_a, bool trans_a,
float alpha, framework::Tensor *matrix_out, float beta, float alpha, framework::Tensor *matrix_out, float beta,
bool relu, framework::Tensor *new_scale, bool relu, framework::Tensor *new_scale,
framework::Tensor *new_bias, int group, float *bias) { framework::Tensor *new_bias, int group, float *bias) {
Gemm gemm;
auto dim_a = matrix_a.dims(); auto dim_a = matrix_a.dims();
auto dim_b = matrix_b.dims(); auto dim_b = matrix_b.dims();
auto dim_out = matrix_out->dims(); auto dim_out = matrix_out->dims();
...@@ -85,21 +92,22 @@ void matmulWithBn<float>(const framework::Tensor &matrix_a, bool trans_a, ...@@ -85,21 +92,22 @@ void matmulWithBn<float>(const framework::Tensor &matrix_a, bool trans_a,
int K = (!trans_a) ? dim_a[1] : dim_a[0]; int K = (!trans_a) ? dim_a[1] : dim_a[0];
#ifdef _OPENMP #ifdef _OPENMP
SgemmWithBn_omp(M, N, K, alpha, matrix_a.data<float>(), K, gemm.SgemmWithBn_omp(
matrix_b.data<float>(), N, beta, matrix_out->data<float>(), N, M, N, K, alpha, matrix_a.data<float>(), K, matrix_b.data<float>(), N,
relu, new_scale->data<float>() + group, beta, matrix_out->data<float>(), N, relu,
new_bias->data<float>() + group, bias); new_scale->data<float>() + group, new_bias->data<float>() + group, bias);
#else #else
SgemmWithBn(M, N, K, alpha, matrix_a.data<float>(), K, matrix_b.data<float>(), gemm.SgemmWithBn(M, N, K, alpha, matrix_a.data<float>(), K,
N, beta, matrix_out->data<float>(), N, relu, matrix_b.data<float>(), N, beta, matrix_out->data<float>(),
new_scale->data<float>() + group, new_bias->data<float>() + group, N, relu, new_scale->data<float>() + group,
bias); new_bias->data<float>() + group, bias);
#endif #endif
} }
void matmulWithPRelu(const framework::Tensor &matrix_a, bool trans_a, void matmulWithPRelu(const framework::Tensor &matrix_a, bool trans_a,
const framework::Tensor &matrix_b, bool trans_b, const framework::Tensor &matrix_b, bool trans_b,
framework::Tensor *matrix_out, float *p, std::string mode, framework::Tensor *matrix_out, float *p, std::string mode,
float *bias, float *bias1) { float *bias, float *bias1) {
Gemm gemm;
auto dim_a = matrix_a.dims(); auto dim_a = matrix_a.dims();
auto dim_b = matrix_b.dims(); auto dim_b = matrix_b.dims();
auto dim_out = matrix_out->dims(); auto dim_out = matrix_out->dims();
...@@ -112,11 +120,13 @@ void matmulWithPRelu(const framework::Tensor &matrix_a, bool trans_a, ...@@ -112,11 +120,13 @@ void matmulWithPRelu(const framework::Tensor &matrix_a, bool trans_a,
int K = (!trans_a) ? dim_a[1] : dim_a[0]; int K = (!trans_a) ? dim_a[1] : dim_a[0];
#ifdef _OPENMP #ifdef _OPENMP
SgemmWithPRelu_omp(M, N, K, matrix_a.data<float>(), K, matrix_b.data<float>(), gemm.SgemmWithPRelu_omp(M, N, K, matrix_a.data<float>(), K,
N, matrix_out->data<float>(), N, p, mode, bias, bias1); matrix_b.data<float>(), N, matrix_out->data<float>(),
N, p, mode, bias, bias1);
#else #else
SgemmWithPRelu(M, N, K, matrix_a.data<float>(), K, matrix_b.data<float>(), N, gemm.SgemmWithPRelu(M, N, K, matrix_a.data<float>(), K,
matrix_out->data<float>(), N, p, mode, bias, bias1); matrix_b.data<float>(), N, matrix_out->data<float>(), N,
p, mode, bias, bias1);
#endif #endif
} }
...@@ -126,7 +136,7 @@ struct ClearTensor<CPU, T> { ...@@ -126,7 +136,7 @@ struct ClearTensor<CPU, T> {
void operator()(framework::Tensor *tensor) { void operator()(framework::Tensor *tensor) {
auto size = tensor->numel(); auto size = tensor->numel();
auto *tensor_data = tensor->data<float>(); auto *tensor_data = tensor->data<float>();
memset((void *)tensor_data, 0, sizeof(T) * size); memset((void *)tensor_data, 0, sizeof(T) * size); // NOLINT
} }
}; };
......
...@@ -225,7 +225,7 @@ void Pool3x3Avgs1p1(const Tensor *input, Tensor *output) { ...@@ -225,7 +225,7 @@ void Pool3x3Avgs1p1(const Tensor *input, Tensor *output) {
// //
// const float coef = 1.0 / 9.0; // const float coef = 1.0 / 9.0;
// for (int k = 0; k < batch_size; ++k) { // for (int k = 0; k < batch_size; ++k) {
//#pragma omp parallel for // #pragma omp parallel for
// for (int c = 0; c < output_channels; ++c) { // for (int c = 0; c < output_channels; ++c) {
// const float *input_seg = input_data + c * inputdata_channel_stride; // const float *input_seg = input_data + c * inputdata_channel_stride;
// float *output_seg = out_data + c * outputdata_channel_stride; // float *output_seg = out_data + c * outputdata_channel_stride;
......
...@@ -62,6 +62,6 @@ REGISTER_OPERATOR_CPU(mul, ops::MulOp); ...@@ -62,6 +62,6 @@ REGISTER_OPERATOR_CPU(mul, ops::MulOp);
REGISTER_OPERATOR_MALI_GPU(mul, ops::MulOp); REGISTER_OPERATOR_MALI_GPU(mul, ops::MulOp);
#endif #endif
#ifdef PADDLE_MOBILE_FPGA #ifdef PADDLE_MOBILE_FPGA
REGISTER_OPERATOR_FPGA(mul, ops::MulOp);
#endif #endif
#endif #endif
...@@ -47,13 +47,4 @@ class MulOp : public framework::OperatorWithKernel< ...@@ -47,13 +47,4 @@ class MulOp : public framework::OperatorWithKernel<
} // namespace operators } // namespace operators
} // namespace paddle_mobile } // namespace paddle_mobile
#ifdef PADDLE_MOBILE_CPU
USE_OP_CPU(mul);
#endif
#ifdef PADDLE_MOBILE_MALI_GPU
USE_OP_MALI_GPU(mul);
#endif
#ifdef PADDLE_MOBILE_FPGA
#endif
#endif #endif
...@@ -42,9 +42,5 @@ namespace ops = paddle_mobile::operators; ...@@ -42,9 +42,5 @@ namespace ops = paddle_mobile::operators;
#ifdef PADDLE_MOBILE_CPU #ifdef PADDLE_MOBILE_CPU
REGISTER_OPERATOR_CPU(multiclass_nms, ops::MultiClassNMSOp); REGISTER_OPERATOR_CPU(multiclass_nms, ops::MultiClassNMSOp);
#endif #endif
#ifdef PADDLE_MOBILE_MALI_GPU
#endif
#ifdef PADDLE_MOBILE_FPGA
#endif
#endif #endif
...@@ -52,12 +52,4 @@ class MultiClassNMSOp : public framework::OperatorWithKernel< ...@@ -52,12 +52,4 @@ class MultiClassNMSOp : public framework::OperatorWithKernel<
} // namespace operators } // namespace operators
} // namespace paddle_mobile } // namespace paddle_mobile
#ifdef PADDLE_MOBILE_CPU
USE_OP_CPU(multiclass_nms);
#endif
#ifdef PADDLE_MOBILE_MALI_GPU
#endif
#ifdef PADDLE_MOBILE_FPGA
#endif
#endif #endif
...@@ -40,30 +40,6 @@ using std::vector; ...@@ -40,30 +40,6 @@ using std::vector;
template <typename Dtype> template <typename Dtype>
struct DtypeTensorTrait { struct DtypeTensorTrait {
typedef void ptype;
typedef void rtype;
};
template <>
struct DtypeTensorTrait<CPU> {
// This is the type we obtained in variable.
typedef framework::LoDTensor gtype;
// This type will be the parent class type
// or the same type.
typedef framework::Tensor rtype;
};
template <>
struct DtypeTensorTrait<FPGA> {
// This is the type we obtained in variable.
typedef framework::LoDTensor gtype;
// This type will be the parent class type
// or the same type.
typedef framework::Tensor rtype;
};
template <>
struct DtypeTensorTrait<GPU_MALI> {
// This is the type we obtained in variable. // This is the type we obtained in variable.
typedef framework::LoDTensor gtype; typedef framework::LoDTensor gtype;
// This type will be the parent class type // This type will be the parent class type
...@@ -287,6 +263,10 @@ class OpParam { ...@@ -287,6 +263,10 @@ class OpParam {
static const T GetAttr(const string &key, const AttributeMap &map) { static const T GetAttr(const string &key, const AttributeMap &map) {
return ((Attribute)map.at(key)).Get<T>(); return ((Attribute)map.at(key)).Get<T>();
} }
static const std::string GetStringAttr(const string &key,
const AttributeMap &map) {
return ((Attribute)map.at(key)).GetString();
}
static const bool HasAttr(const string &key, const AttributeMap &map) { static const bool HasAttr(const string &key, const AttributeMap &map) {
return map.count(key) > 0; return map.count(key) > 0;
...@@ -462,6 +442,15 @@ class MulParam : OpParam { ...@@ -462,6 +442,15 @@ class MulParam : OpParam {
GType *out_; GType *out_;
int x_num_col_dims_; int x_num_col_dims_;
int y_num_col_dims_; int y_num_col_dims_;
#ifdef PADDLE_MOBILE_FPGA
private:
fpga::WrapperConvArgs fpga_conv_args;
public:
const fpga::WrapperConvArgs &FpgaArgs() const { return fpga_conv_args; }
void SetFpgaArgs(const fpga::WrapperConvArgs &args) { fpga_conv_args = args; }
#endif
}; };
#endif #endif
...@@ -517,7 +506,7 @@ class LrnParam : public OpParam { ...@@ -517,7 +506,7 @@ class LrnParam : public OpParam {
alpha_ = GetAttr<float>("alpha", attrs); alpha_ = GetAttr<float>("alpha", attrs);
beta_ = GetAttr<float>("beta", attrs); beta_ = GetAttr<float>("beta", attrs);
k_ = GetAttr<float>("k", attrs); k_ = GetAttr<float>("k", attrs);
data_format_ = GetAttr<string>("data_format", attrs); data_format_ = GetStringAttr("data_format", attrs);
} }
const RType *InputX() const { return input_x_; } const RType *InputX() const { return input_x_; }
...@@ -614,7 +603,7 @@ class PoolParam : public OpParam { ...@@ -614,7 +603,7 @@ class PoolParam : public OpParam {
input_ = InputXFrom<GType>(inputs, scope); input_ = InputXFrom<GType>(inputs, scope);
output_ = OutFrom<GType>(outputs, scope); output_ = OutFrom<GType>(outputs, scope);
pooling_type_ = GetAttr<string>("pooling_type", attrs); pooling_type_ = GetStringAttr("pooling_type", attrs);
ksize_ = GetAttr<vector<int>>("ksize", attrs); ksize_ = GetAttr<vector<int>>("ksize", attrs);
strides_ = GetAttr<vector<int>>("strides", attrs); strides_ = GetAttr<vector<int>>("strides", attrs);
paddings_ = GetAttr<vector<int>>("paddings", attrs); paddings_ = GetAttr<vector<int>>("paddings", attrs);
...@@ -748,7 +737,7 @@ class BoxCoderParam : public OpParam { ...@@ -748,7 +737,7 @@ class BoxCoderParam : public OpParam {
input_priorboxvar_ = InputPriorBoxVarFrom<GType>(inputs, scope); input_priorboxvar_ = InputPriorBoxVarFrom<GType>(inputs, scope);
input_targetbox_ = InputTargetBoxFrom<GType>(inputs, scope); input_targetbox_ = InputTargetBoxFrom<GType>(inputs, scope);
output_box_ = OutputBoxFrom<GType>(outputs, scope); output_box_ = OutputBoxFrom<GType>(outputs, scope);
code_type_ = GetAttr<std::string>("code_type", attrs); code_type_ = GetStringAttr("code_type", attrs);
} }
const RType *InputPriorBox() const { return input_priorbox_; } const RType *InputPriorBox() const { return input_priorbox_; }
...@@ -1223,7 +1212,7 @@ class PReluParam : public OpParam { ...@@ -1223,7 +1212,7 @@ class PReluParam : public OpParam {
alpha_ = InputAlphaFrom<GType>(inputs, scope); alpha_ = InputAlphaFrom<GType>(inputs, scope);
framework::DDim dims = alpha_->dims(); framework::DDim dims = alpha_->dims();
out_ = OutFrom<GType>(outputs, scope); out_ = OutFrom<GType>(outputs, scope);
mode_ = GetAttr<std::string>("mode", attrs); mode_ = GetStringAttr("mode", attrs);
DLOG << "PReluParam mode after" << mode_; DLOG << "PReluParam mode after" << mode_;
} }
const RType *InputX() const { return input_x_; } const RType *InputX() const { return input_x_; }
...@@ -1354,7 +1343,7 @@ class FusionConvAddPReluParam : public ConvParam<Dtype> { ...@@ -1354,7 +1343,7 @@ class FusionConvAddPReluParam : public ConvParam<Dtype> {
const AttributeMap &attrs, const Scope &scope) const AttributeMap &attrs, const Scope &scope)
: ConvParam<Dtype>(inputs, outputs, attrs, scope) { : ConvParam<Dtype>(inputs, outputs, attrs, scope) {
alpha_ = OpParam::InputAlphaFrom<GType>(inputs, scope); alpha_ = OpParam::InputAlphaFrom<GType>(inputs, scope);
mode_ = OpParam::GetAttr<std::string>("mode", attrs); mode_ = OpParam::GetStringAttr("mode", attrs);
framework::DDim dims = alpha_->dims(); framework::DDim dims = alpha_->dims();
bias_ = OpParam::InputYFrom<GType>(inputs, scope); bias_ = OpParam::InputYFrom<GType>(inputs, scope);
axis_ = OpParam::GetAttr<int>("axis", attrs); axis_ = OpParam::GetAttr<int>("axis", attrs);
...@@ -1397,7 +1386,7 @@ class FusionConvAddAddPReluParam : public ConvParam<Dtype> { ...@@ -1397,7 +1386,7 @@ class FusionConvAddAddPReluParam : public ConvParam<Dtype> {
: ConvParam<Dtype>(inputs, outputs, attrs, scope) { : ConvParam<Dtype>(inputs, outputs, attrs, scope) {
bias1_ = OpParam::InputYFrom1<GType>(inputs, scope); bias1_ = OpParam::InputYFrom1<GType>(inputs, scope);
alpha_ = OpParam::InputAlphaFrom<GType>(inputs, scope); alpha_ = OpParam::InputAlphaFrom<GType>(inputs, scope);
mode_ = OpParam::GetAttr<std::string>("mode", attrs); mode_ = OpParam::GetStringAttr("mode", attrs);
framework::DDim dims = alpha_->dims(); framework::DDim dims = alpha_->dims();
bias_ = OpParam::InputYFrom<GType>(inputs, scope); bias_ = OpParam::InputYFrom<GType>(inputs, scope);
output_ = OpParam::OutFrom<GType>(outputs, scope); output_ = OpParam::OutFrom<GType>(outputs, scope);
...@@ -1935,7 +1924,7 @@ class DropoutParam : public OpParam { ...@@ -1935,7 +1924,7 @@ class DropoutParam : public OpParam {
}; };
#endif #endif
#ifdef CONV_TRANSPOSE #ifdef CONV_TRANSPOSE_OP
template <typename Dtype> template <typename Dtype>
class ConvTransposeParam : public OpParam { class ConvTransposeParam : public OpParam {
typedef typename DtypeTensorTrait<Dtype>::gtype GType; typedef typename DtypeTensorTrait<Dtype>::gtype GType;
...@@ -2004,8 +1993,8 @@ class GruParam : public OpParam { ...@@ -2004,8 +1993,8 @@ class GruParam : public OpParam {
OutputBatchResetHiddenPrevFrom<GType>(outputs, scope); OutputBatchResetHiddenPrevFrom<GType>(outputs, scope);
output_batch_hidden_ = OutputBatchHiddenFrom<GType>(outputs, scope); output_batch_hidden_ = OutputBatchHiddenFrom<GType>(outputs, scope);
output_hidden_ = OutputHiddenFrom<GType>(outputs, scope); output_hidden_ = OutputHiddenFrom<GType>(outputs, scope);
activation_ = GetAttr<std::string>("activation", attrs); activation_ = GetStringAttr("activation", attrs);
gate_activation_ = GetAttr<std::string>("gate_activation", attrs); gate_activation_ = GetStringAttr("gate_activation", attrs);
is_reverse_ = GetAttr<bool>("is_reverse", attrs); is_reverse_ = GetAttr<bool>("is_reverse", attrs);
} }
const GType *InputInput() const { return input_input_; } const GType *InputInput() const { return input_input_; }
...@@ -2151,5 +2140,75 @@ class ShapeParam : public OpParam { ...@@ -2151,5 +2140,75 @@ class ShapeParam : public OpParam {
}; };
#endif #endif
template <typename Dtype>
class QuantizeParam : public OpParam {
typedef typename DtypeTensorTrait<Dtype>::gtype GType;
typedef typename DtypeTensorTrait<Dtype>::rtype RType;
public:
QuantizeParam(const VariableNameMap &inputs, const VariableNameMap &outputs,
const AttributeMap &attrs, const Scope &scope) {
input_ = InputXFrom<GType>(inputs, scope);
out_ = OutFrom<GType>(outputs, scope);
if (HasAttr("is_static", attrs)) {
is_static_ = GetAttr<bool>("is_static", attrs);
}
// online
// scale = max(abs(x))
online_scale_ = GetVarValue<GType>("OutScale", outputs, scope);
// offline
if (HasAttr("static_scale", attrs)) {
static_scale_ = GetAttr<float>("static_scale", attrs);
}
// x = round(scale * x)
if (HasAttr("round_type", attrs)) {
round_type_ = GetAttr<RoundType>("round_type", attrs);
}
}
public:
// op input
RType *input_;
// op output
RType *out_;
  // online scale, computed at runtime as max(abs(x))
  RType *online_scale_;
  // whether the static (offline) scale is used
bool is_static_ = false;
// quantize scale
float static_scale_ = 1.0f;
// round method type
  // only nearest_zero and nearest_even are currently valid
RoundType round_type_ = ROUND_NEAREST_TO_EVEN;
};
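A rough sketch of the quantization these fields describe, consistent with the unit test added later in this diff (the helper name is invented here): the online scale is 127 / max(|x|), and x * scale is rounded to int8 with ties to even by default.

#include <algorithm>
#include <cmath>
#include <cstddef>
#include <cstdint>

void QuantizeToInt8Sketch(const float *x, size_t size, int8_t *y,
                          float *out_scale) {
  float max_abs = 1e-6f;
  for (size_t i = 0; i < size; ++i) max_abs = std::max(max_abs, std::abs(x[i]));
  const float scale = 127.f / max_abs;  // online scale, written to OutScale
  for (size_t i = 0; i < size; ++i) {
    // std::nearbyint with the default rounding mode rounds ties to even,
    // matching ROUND_NEAREST_TO_EVEN.
    y[i] = static_cast<int8_t>(std::nearbyint(x[i] * scale));
  }
  *out_scale = scale;
}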
template <typename Dtype>
class DequantizeParam : public OpParam {
typedef typename DtypeTensorTrait<Dtype>::gtype GType;
typedef typename DtypeTensorTrait<Dtype>::rtype RType;
public:
DequantizeParam(const VariableNameMap &inputs, const VariableNameMap &outputs,
const AttributeMap &attrs, const Scope &scope) {
input_ = InputXFrom<GType>(inputs, scope);
out_ = OutFrom<GType>(outputs, scope);
activation_scale_ = GetVarValue<GType>("Scale", inputs, scope);
// dequantization is performed as x = x / static_scale / online_scale
if (HasAttr("weight_scale", attrs)) {
weight_scale_ = GetAttr<float>("weight_scale", attrs);
} else {
weight_scale_ = GetAttr<float>("max_range", attrs);
}
}
public:
// op input
RType *input_;
// op output
RType *out_;
RType *activation_scale_;
float weight_scale_;
};
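And the matching dequantization, per the comment in the constructor above: the int32 accumulators are divided by both the online activation scale and the static weight scale (again only a sketch with an invented helper name).

#include <cstddef>
#include <cstdint>

void DequantizeSketch(const int32_t *x, size_t size, float activation_scale,
                      float weight_scale, float *y) {
  const float scale = 1.f / (activation_scale * weight_scale);
  for (size_t i = 0; i < size; ++i) {
    y[i] = x[i] * scale;  // recover float activations from int32 accumulators
  }
}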
} // namespace operators } // namespace operators
} // namespace paddle_mobile } // namespace paddle_mobile
...@@ -48,14 +48,4 @@ class PoolOp : public OperatorWithKernel<DeviceType, PoolParam<DeviceType>, ...@@ -48,14 +48,4 @@ class PoolOp : public OperatorWithKernel<DeviceType, PoolParam<DeviceType>,
} // namespace operators } // namespace operators
} // namespace paddle_mobile } // namespace paddle_mobile
#ifdef PADDLE_MOBILE_CPU
USE_OP_CPU(pool2d);
#endif
#ifdef PADDLE_MOBILE_MALI_GPU
USE_OP_MALI_GPU(pool2d);
#endif
#ifdef PADDLE_MOBILE_FPGA
USE_OP_FPGA(pool2d);
#endif
#endif #endif
...@@ -39,7 +39,5 @@ REGISTER_OPERATOR_CPU(prelu, ops::PReluOp); ...@@ -39,7 +39,5 @@ REGISTER_OPERATOR_CPU(prelu, ops::PReluOp);
#ifdef PADDLE_MOBILE_MALI_GPU #ifdef PADDLE_MOBILE_MALI_GPU
REGISTER_OPERATOR_MALI_GPU(prelu, ops::PReluOp); REGISTER_OPERATOR_MALI_GPU(prelu, ops::PReluOp);
#endif #endif
#ifdef PADDLE_MOBILE_FPGA
#endif
#endif #endif
...@@ -50,14 +50,4 @@ class PReluOp : public framework::OperatorWithKernel< ...@@ -50,14 +50,4 @@ class PReluOp : public framework::OperatorWithKernel<
} // namespace operators } // namespace operators
} // namespace paddle_mobile } // namespace paddle_mobile
#ifdef PADDLE_MOBILE_CPU
USE_OP_CPU(prelu);
#endif
#ifdef PADDLE_MOBILE_MALI_GPU
USE_OP_MALI_GPU(prelu);
#endif
#ifdef PADDLE_MOBILE_FPGA
USE_OP_FPGA(prelu);
#endif
#endif #endif
...@@ -54,7 +54,5 @@ REGISTER_OPERATOR_CPU(prior_box, ops::PriorBoxOp); ...@@ -54,7 +54,5 @@ REGISTER_OPERATOR_CPU(prior_box, ops::PriorBoxOp);
#endif #endif
#ifdef PADDLE_MOBILE_MALI_GPU #ifdef PADDLE_MOBILE_MALI_GPU
#endif #endif
#ifdef PADDLE_MOBILE_FPGA
#endif
#endif #endif
...@@ -51,12 +51,4 @@ class PriorBoxOp : public framework::OperatorWithKernel< ...@@ -51,12 +51,4 @@ class PriorBoxOp : public framework::OperatorWithKernel<
} // namespace operators } // namespace operators
} // namespace paddle_mobile } // namespace paddle_mobile
#ifdef PADDLE_MOBILE_CPU
USE_OP_CPU(prior_box);
#endif
#ifdef PADDLE_MOBILE_MALI_GPU
#endif
#ifdef PADDLE_MOBILE_FPGA
#endif
#endif #endif
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "operators/quantize_op.h"
#include <vector>
namespace paddle_mobile {
namespace operators {
template <typename DeviceType, typename T>
void QuantizeOp<DeviceType, T>::InferShape() const {
const auto& input_dims = this->param_.input_->dims();
this->param_.out_->Resize(input_dims);
auto scale_dims = framework::make_ddim(std::vector<int>{1});
this->param_.online_scale_->Resize(scale_dims);
}
} // namespace operators
} // namespace paddle_mobile
namespace ops = paddle_mobile::operators;
#ifdef PADDLE_MOBILE_CPU
REGISTER_OPERATOR_CPU(quantize, ops::QuantizeOp);
#endif
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <string>
#include "framework/operator.h"
#include "operators/kernel/quantize_kernel.h"
#include "operators/op_param.h"
namespace paddle_mobile {
namespace operators {
template <typename DeviceType, typename T>
class QuantizeOp : public framework::OperatorWithKernel<
DeviceType, QuantizeParam<DeviceType>,
operators::QuantizeKernel<DeviceType, T>> {
public:
QuantizeOp(const std::string &type, const VariableNameMap &inputs,
const VariableNameMap &outputs,
const framework::AttributeMap &attrs,
std::shared_ptr<framework::Scope> scope)
: framework::OperatorWithKernel<DeviceType, QuantizeParam<DeviceType>,
operators::QuantizeKernel<DeviceType, T>>(
type, inputs, outputs, attrs, scope) {}
// inference output shape
void InferShape() const override;
};
} // namespace operators
} // namespace paddle_mobile
...@@ -39,7 +39,5 @@ REGISTER_OPERATOR_CPU(relu, ops::ReluOp); ...@@ -39,7 +39,5 @@ REGISTER_OPERATOR_CPU(relu, ops::ReluOp);
#ifdef PADDLE_MOBILE_MALI_GPU #ifdef PADDLE_MOBILE_MALI_GPU
REGISTER_OPERATOR_MALI_GPU(relu, ops::ReluOp); REGISTER_OPERATOR_MALI_GPU(relu, ops::ReluOp);
#endif #endif
#ifdef PADDLE_MOBILE_FPGA
#endif
#endif #endif
...@@ -53,13 +53,4 @@ class ReluOp : public framework::OperatorWithKernel< ...@@ -53,13 +53,4 @@ class ReluOp : public framework::OperatorWithKernel<
} // namespace operators } // namespace operators
} // namespace paddle_mobile } // namespace paddle_mobile
#ifdef PADDLE_MOBILE_CPU
USE_OP_CPU(relu);
#endif
#ifdef PADDLE_MOBILE_MALI_GPU
USE_OP_MALI_GPU(relu);
#endif
#ifdef PADDLE_MOBILE_FPGA
#endif
#endif #endif
...@@ -38,7 +38,5 @@ REGISTER_OPERATOR_CPU(reshape, ops::ReshapeOp); ...@@ -38,7 +38,5 @@ REGISTER_OPERATOR_CPU(reshape, ops::ReshapeOp);
#ifdef PADDLE_MOBILE_MALI_GPU #ifdef PADDLE_MOBILE_MALI_GPU
REGISTER_OPERATOR_MALI_GPU(reshape, ops::ReshapeOp); REGISTER_OPERATOR_MALI_GPU(reshape, ops::ReshapeOp);
#endif #endif
#ifdef PADDLE_MOBILE_FPGA
#endif
#endif #endif
...@@ -51,14 +51,4 @@ class ReshapeOp : public framework::OperatorWithKernel< ...@@ -51,14 +51,4 @@ class ReshapeOp : public framework::OperatorWithKernel<
} // namespace operators } // namespace operators
} // namespace paddle_mobile } // namespace paddle_mobile
namespace ops = paddle_mobile::operators;
#ifdef PADDLE_MOBILE_CPU
USE_OP_CPU(reshape);
#endif
#ifdef PADDLE_MOBILE_MALI_GPU
USE_OP_MALI_GPU(reshape);
#endif
#ifdef PADDLE_MOBILE_FPGA
#endif
#endif #endif
...@@ -30,14 +30,10 @@ void ResizeOp<Dtype, T>::InferShape() const { ...@@ -30,14 +30,10 @@ void ResizeOp<Dtype, T>::InferShape() const {
namespace ops = paddle_mobile::operators; namespace ops = paddle_mobile::operators;
#ifdef PADDLE_MOBILE_CPU #ifdef PADDLE_MOBILE_CPU
USE_OP_CPU(resize);
REGISTER_OPERATOR_CPU(resize, ops::ResizeOp); REGISTER_OPERATOR_CPU(resize, ops::ResizeOp);
#endif #endif
#ifdef PADDLE_MOBILE_MALI_GPU #ifdef PADDLE_MOBILE_MALI_GPU
USE_OP_MALI_GPU(resize);
REGISTER_OPERATOR_MALI_GPU(resize, ops::ResizeOp); REGISTER_OPERATOR_MALI_GPU(resize, ops::ResizeOp);
#endif #endif
#ifdef PADDLE_MOBILE_FPGA
#endif
#endif #endif
...@@ -30,14 +30,10 @@ void ScaleOp<Dtype, T>::InferShape() const { ...@@ -30,14 +30,10 @@ void ScaleOp<Dtype, T>::InferShape() const {
namespace ops = paddle_mobile::operators; namespace ops = paddle_mobile::operators;
#ifdef PADDLE_MOBILE_CPU #ifdef PADDLE_MOBILE_CPU
USE_OP_CPU(scale);
REGISTER_OPERATOR_CPU(scale, ops::ScaleOp); REGISTER_OPERATOR_CPU(scale, ops::ScaleOp);
#endif #endif
#ifdef PADDLE_MOBILE_MALI_GPU #ifdef PADDLE_MOBILE_MALI_GPU
USE_OP_MALI_GPU(scale);
REGISTER_OPERATOR_MALI_GPU(scale, ops::ScaleOp); REGISTER_OPERATOR_MALI_GPU(scale, ops::ScaleOp);
#endif #endif
#ifdef PADDLE_MOBILE_FPGA
#endif
#endif #endif
...@@ -36,7 +36,5 @@ REGISTER_OPERATOR_CPU(shape, ops::ShapeOp); ...@@ -36,7 +36,5 @@ REGISTER_OPERATOR_CPU(shape, ops::ShapeOp);
#endif #endif
#ifdef PADDLE_MOBILE_MALI_GPU #ifdef PADDLE_MOBILE_MALI_GPU
#endif #endif
#ifdef PADDLE_MOBILE_FPGA
#endif
#endif #endif
...@@ -48,12 +48,4 @@ class ShapeOp : public framework::OperatorWithKernel< ...@@ -48,12 +48,4 @@ class ShapeOp : public framework::OperatorWithKernel<
} // namespace operators } // namespace operators
} // namespace paddle_mobile } // namespace paddle_mobile
#ifdef PADDLE_MOBILE_CPU
USE_OP_CPU(shape);
#endif
#ifdef PADDLE_MOBILE_MALI_GPU
#endif
#ifdef PADDLE_MOBILE_FPGA
#endif
#endif #endif
...@@ -18,6 +18,7 @@ limitations under the License. */ ...@@ -18,6 +18,7 @@ limitations under the License. */
namespace paddle_mobile { namespace paddle_mobile {
namespace operators { namespace operators {
template <typename DeviceType, typename T> template <typename DeviceType, typename T>
void SigmoidOp<DeviceType, T>::InferShape() const { void SigmoidOp<DeviceType, T>::InferShape() const {
this->param_.Out()->Resize(this->param_.InputX()->dims()); this->param_.Out()->Resize(this->param_.InputX()->dims());
...@@ -30,9 +31,5 @@ namespace ops = paddle_mobile::operators; ...@@ -30,9 +31,5 @@ namespace ops = paddle_mobile::operators;
#ifdef PADDLE_MOBILE_CPU #ifdef PADDLE_MOBILE_CPU
REGISTER_OPERATOR_CPU(sigmoid, ops::SigmoidOp); REGISTER_OPERATOR_CPU(sigmoid, ops::SigmoidOp);
#endif #endif
#ifdef PADDLE_MOBILE_MALI_GPU
#endif
#ifdef PADDLE_MOBILE_FPGA
#endif
#endif #endif
...@@ -17,13 +17,13 @@ limitations under the License. */ ...@@ -17,13 +17,13 @@ limitations under the License. */
#pragma once #pragma once
#include <string> #include <string>
#include "framework/operator.h" #include "framework/operator.h"
#include "operators/kernel/sigmoid_kernel.h" #include "operators/kernel/sigmoid_kernel.h"
#include "operators/op_param.h" #include "operators/op_param.h"
namespace paddle_mobile { namespace paddle_mobile {
namespace operators { namespace operators {
template <typename DeviceType, typename T> template <typename DeviceType, typename T>
class SigmoidOp : public framework::OperatorWithKernel< class SigmoidOp : public framework::OperatorWithKernel<
DeviceType, SigmoidParam<DeviceType>, DeviceType, SigmoidParam<DeviceType>,
...@@ -43,15 +43,8 @@ class SigmoidOp : public framework::OperatorWithKernel< ...@@ -43,15 +43,8 @@ class SigmoidOp : public framework::OperatorWithKernel<
void InferShape() const override; void InferShape() const override;
}; };
} // namespace operators } // namespace operators
} // namespace paddle_mobile } // namespace paddle_mobile
#ifdef PADDLE_MOBILE_CPU
USE_OP_CPU(sigmoid);
#endif
#ifdef PADDLE_MOBILE_MALI_GPU
#endif
#ifdef PADDLE_MOBILE_FPGA
#endif
#endif #endif
...@@ -29,14 +29,10 @@ void SliceOp<Dtype, T>::InferShape() const { ...@@ -29,14 +29,10 @@ void SliceOp<Dtype, T>::InferShape() const {
namespace ops = paddle_mobile::operators; namespace ops = paddle_mobile::operators;
#ifdef PADDLE_MOBILE_CPU #ifdef PADDLE_MOBILE_CPU
USE_OP_CPU(slice);
REGISTER_OPERATOR_CPU(slice, ops::SliceOp); REGISTER_OPERATOR_CPU(slice, ops::SliceOp);
#endif #endif
#ifdef PADDLE_MOBILE_MALI_GPU #ifdef PADDLE_MOBILE_MALI_GPU
USE_OP_MALI_GPU(slice);
REGISTER_OPERATOR_MALI_GPU(slice, ops::SliceOp); REGISTER_OPERATOR_MALI_GPU(slice, ops::SliceOp);
#endif #endif
#ifdef PADDLE_MOBILE_FPGA
#endif
#endif #endif
...@@ -48,14 +48,4 @@ class SoftmaxOp : public framework::OperatorWithKernel< ...@@ -48,14 +48,4 @@ class SoftmaxOp : public framework::OperatorWithKernel<
} // namespace operators } // namespace operators
} // namespace paddle_mobile } // namespace paddle_mobile
#ifdef PADDLE_MOBILE_CPU
USE_OP_CPU(softmax);
#endif
#ifdef PADDLE_MOBILE_MALI_GPU
USE_OP_MALI_GPU(softmax);
#endif
#ifdef PADDLE_MOBILE_FPGA
USE_OP_FPGA(softmax);
#endif
#endif #endif
...@@ -83,9 +83,5 @@ namespace ops = paddle_mobile::operators; ...@@ -83,9 +83,5 @@ namespace ops = paddle_mobile::operators;
#ifdef PADDLE_MOBILE_CPU #ifdef PADDLE_MOBILE_CPU
REGISTER_OPERATOR_CPU(split, ops::SplitOp); REGISTER_OPERATOR_CPU(split, ops::SplitOp);
#endif #endif
#ifdef PADDLE_MOBILE_MALI_GPU
#endif
#ifdef PADDLE_MOBILE_FPGA
#endif
#endif #endif // SPLIT_OP
...@@ -47,12 +47,4 @@ class SplitOp : public framework::OperatorWithKernel< ...@@ -47,12 +47,4 @@ class SplitOp : public framework::OperatorWithKernel<
} // namespace operators } // namespace operators
} // namespace paddle_mobile } // namespace paddle_mobile
#ifdef PADDLE_MOBILE_CPU
USE_OP_CPU(split);
#endif
#ifdef PADDLE_MOBILE_MALI_GPU
#endif
#ifdef PADDLE_MOBILE_FPGA
#endif
#endif #endif
...@@ -55,9 +55,5 @@ namespace ops = paddle_mobile::operators; ...@@ -55,9 +55,5 @@ namespace ops = paddle_mobile::operators;
#ifdef PADDLE_MOBILE_CPU #ifdef PADDLE_MOBILE_CPU
REGISTER_OPERATOR_CPU(transpose, ops::TransposeOp); REGISTER_OPERATOR_CPU(transpose, ops::TransposeOp);
#endif #endif
#ifdef PADDLE_MOBILE_MALI_GPU
#endif
#ifdef PADDLE_MOBILE_FPGA
#endif
#endif #endif // TRANSPOSE_OP
...@@ -50,12 +50,4 @@ class TransposeOp : public framework::OperatorWithKernel< ...@@ -50,12 +50,4 @@ class TransposeOp : public framework::OperatorWithKernel<
} // namespace operators } // namespace operators
} // namespace paddle_mobile } // namespace paddle_mobile
#ifdef PADDLE_MOBILE_CPU
USE_OP_CPU(transpose);
#endif
#ifdef PADDLE_MOBILE_MALI_GPU
#endif
#ifdef PADDLE_MOBILE_FPGA
#endif
#endif #endif
...@@ -35,8 +35,8 @@ if (CON GREATER -1) ...@@ -35,8 +35,8 @@ if (CON GREATER -1)
ADD_EXECUTABLE(test-yolo net/test_yolo.cpp test_helper.h test_include.h executor_for_test.h) ADD_EXECUTABLE(test-yolo net/test_yolo.cpp test_helper.h test_include.h executor_for_test.h)
target_link_libraries(test-yolo paddle-mobile) target_link_libraries(test-yolo paddle-mobile)
# gen test # gen test
ADD_EXECUTABLE(test_yolo_combined net/test_yolo_combined.cpp test_helper.h test_include.h executor_for_test.h) ADD_EXECUTABLE(test-yolo-combined net/test_yolo_combined.cpp test_helper.h test_include.h executor_for_test.h)
target_link_libraries(test_yolo_combined paddle-mobile) target_link_libraries(test-yolo-combined paddle-mobile)
set(FOUND_MATCH ON) set(FOUND_MATCH ON)
endif () endif ()
...@@ -212,6 +212,14 @@ if (NOT FOUND_MATCH) ...@@ -212,6 +212,14 @@ if (NOT FOUND_MATCH)
ADD_EXECUTABLE(test-fc-op operators/test_fusion_fc_op.cpp test_helper.h test_include.h) ADD_EXECUTABLE(test-fc-op operators/test_fusion_fc_op.cpp test_helper.h test_include.h)
target_link_libraries(test-fc-op paddle-mobile) target_link_libraries(test-fc-op paddle-mobile)
# test quantize op
ADD_EXECUTABLE(test-quantize-op operators/test_quantize_op.cpp test_helper.h test_include.h)
target_link_libraries(test-quantize-op paddle-mobile)
# test dequantize op
ADD_EXECUTABLE(test-dequantize-op operators/test_dequantize_op.cpp test_helper.h test_include.h)
target_link_libraries(test-dequantize-op paddle-mobile)
# gen test log # gen test log
ADD_EXECUTABLE(test-log common/test_log.cpp) ADD_EXECUTABLE(test-log common/test_log.cpp)
target_link_libraries(test-log paddle-mobile) target_link_libraries(test-log paddle-mobile)
...@@ -315,7 +323,10 @@ if (NOT FOUND_MATCH) ...@@ -315,7 +323,10 @@ if (NOT FOUND_MATCH)
target_link_libraries(test-fssd paddle-mobile) target_link_libraries(test-fssd paddle-mobile)
#add_library(test-lib-size SHARED common/test_lib_size.h common/test_lib_size.cpp) # gen test
ADD_EXECUTABLE(test-multi-process net/test_multi_inference_predict.cpp test_helper.h test_include.h)
target_link_libraries(test-multi-process paddle-mobile)
#add_library(test-lib-size SHARED common/test_lib_size.h common/test_lib_size.cpp)
endif () endif ()
...@@ -83,8 +83,9 @@ int do_sgemm(int m, int n, int k, bool relu, int t1, int t2, int pr) { ...@@ -83,8 +83,9 @@ int do_sgemm(int m, int n, int k, bool relu, int t1, int t2, int pr) {
} }
} }
paddle_mobile::operators::math::SgemmWithBn( paddle_mobile::operators::math::Gemm gemm;
m, n, k, 0.9, a, lda, b, ldb, 0.3, c, ldc, relu, scale, bias, nullptr); gemm.SgemmWithBn(m, n, k, 0.9, a, lda, b, ldb, 0.3, c, ldc, relu, scale, bias,
nullptr);
int eq = 0; int eq = 0;
int neq = 0; int neq = 0;
for (int i = 0; i < m * n; ++i) { for (int i = 0; i < m * n; ++i) {
......
...@@ -18,8 +18,9 @@ static const char *g_resnet_combine = "../models/resnet50"; ...@@ -18,8 +18,9 @@ static const char *g_resnet_combine = "../models/resnet50";
int main() { int main() {
DLOG << paddle_mobile::fpga::open_device(); DLOG << paddle_mobile::fpga::open_device();
paddle_mobile::PaddleMobile<paddle_mobile::FPGA> paddle_mobile; paddle_mobile::PaddleMobile<paddle_mobile::FPGA> paddle_mobile;
if (paddle_mobile.Load(std::string(g_resnet_combine) + "/model", // if (paddle_mobile.Load(std::string(g_resnet_combine) + "/model",
std::string(g_resnet_combine) + "/params", true)) { // std::string(g_resnet_combine) + "/params", true)) {
if (paddle_mobile.Load(std::string(g_resnet_combine), true)) {
std::vector<int64_t> dims{1, 3, 224, 224}; std::vector<int64_t> dims{1, 3, 224, 224};
Tensor input_tensor; Tensor input_tensor;
SetupTensor<float>(&input_tensor, {1, 3, 224, 224}, static_cast<float>(0), SetupTensor<float>(&input_tensor, {1, 3, 224, 224}, static_cast<float>(0),
......
...@@ -46,7 +46,12 @@ int main() { ...@@ -46,7 +46,12 @@ int main() {
tensor_out.dtype = PaddleDType::FLOAT32; tensor_out.dtype = PaddleDType::FLOAT32;
std::vector<PaddleTensor> outputs(1, tensor_out); std::vector<PaddleTensor> outputs(1, tensor_out);
assert(predictor->Run(paddle_tensor_feeds, &outputs)); std::cout << " before predict " << std::endl;
predictor->Run(paddle_tensor_feeds, &outputs);
std::cout << " after predict " << std::endl;
// assert();
float* data_o = static_cast<float*>(outputs[0].data.data()); float* data_o = static_cast<float*>(outputs[0].data.data());
for (size_t j = 0; j < outputs[0].data.length() / sizeof(float); ++j) { for (size_t j = 0; j < outputs[0].data.length() / sizeof(float); ++j) {
......
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <iostream>
#include <thread> // NOLINT
#include "../test_helper.h"
#include "../test_include.h"
void fun_yolo();
int fun_mobilenet();
int main() {
paddle_mobile::PaddleMobile<paddle_mobile::CPU> paddle_mobile2;
// fun_yolo();
// fun_mobilenet();
std::thread t1(fun_yolo);
std::thread t2(fun_mobilenet);
t1.join();
t2.join();
return 0;
}
void fun_yolo() {
paddle_mobile::PaddleMobile<paddle_mobile::CPU> paddle_mobile;
paddle_mobile.SetThreadNum(4);
// ../../../test/models/googlenet
// ../../../test/models/mobilenet
auto time1 = time();
if (paddle_mobile.Load(g_yolo, true)) {
auto time2 = time();
std::cout << "load cost :" << time_diff(time1, time1) << "ms" << std::endl;
vector<int64_t> dims{1, 3, 227, 227};
Tensor input_tensor;
SetupTensor<float>(&input_tensor, {1, 3, 227, 227}, static_cast<float>(0),
static_cast<float>(1));
vector<float> input(input_tensor.data<float>(),
input_tensor.data<float>() + input_tensor.numel());
auto time3 = time();
for (int i = 0; i < 10; ++i) {
paddle_mobile.Predict(input, dims);
}
auto time4 = time();
std::cout << "thread 1: predict cost :" << time_diff(time3, time4) / 10
<< "ms" << std::endl;
}
}
int fun_mobilenet() {
paddle_mobile::PaddleMobile<paddle_mobile::CPU> paddle_mobile;
paddle_mobile.SetThreadNum(4);
auto time1 = time();
// auto isok = paddle_mobile.Load(std::string(g_mobilenet_detect) + "/model",
// std::string(g_mobilenet_detect) + "/params", true);
auto isok = paddle_mobile.Load(g_mobilenet, true);
if (isok) {
auto time2 = time();
std::cout << "load cost :" << time_diff(time1, time1) << "ms" << std::endl;
vector<float> input;
vector<int64_t> dims{1, 3, 224, 224};
GetInput<float>(g_test_image_1x3x224x224_banana, &input, dims);
auto vec_result = paddle_mobile.Predict(input, dims);
auto biggest = max_element(begin(vec_result), end(vec_result));
std::cout << " Max element is " << *biggest << " at position "
<< distance(begin(vec_result), biggest) << std::endl;
    // warm up for ten iterations
for (int i = 0; i < 10; ++i) {
auto vec_result = paddle_mobile.Predict(input, dims);
}
auto time3 = time();
for (int i = 0; i < 10; ++i) {
auto vec_result = paddle_mobile.Predict(input, dims);
}
DLOG << vec_result;
auto time4 = time();
std::cout << "thread 2: predict cost :" << time_diff(time3, time4) / 10
<< "ms" << std::endl;
}
std::cout << "如果结果Nan请查看: test/images/g_test_image_1x3x224x224_banana "
"是否存在?"
<< std::endl;
return 0;
}
...@@ -60,7 +60,15 @@ int main() { ...@@ -60,7 +60,15 @@ int main() {
std::cout << "load cost :" << time_diff(time1, time1) << "ms" << std::endl; std::cout << "load cost :" << time_diff(time1, time1) << "ms" << std::endl;
// 1064 1603 644 699 2878 1219 867 1352 8 1 13 312 479 // 1064 1603 644 699 2878 1219 867 1352 8 1 13 312 479
std::vector<int64_t> ids{1791, 656, 1549, 281, 96}; std::vector<int64_t> ids{
2084, 635, 1035, 197, 990, 150, 1132, 2403, 546, 770, 4060, 3352,
1798, 1589, 1352, 98, 136, 3461, 3186, 1159, 515, 764, 278, 1178,
5044, 4060, 943, 932, 463, 1198, 3352, 374, 1198, 3352, 374, 2047,
1069, 1589, 3672, 1178, 1178, 2165, 1178, 2084, 635, 3087, 2236, 546,
2047, 1549, 546, 2047, 302, 2202, 398, 804, 397, 657, 804, 866,
932, 2084, 515, 2165, 397, 302, 2202, 526, 992, 906, 1215, 1589,
4493, 2403, 723, 932, 2084, 635, 1352, 932, 444, 2047, 1159, 1893,
1579, 59, 330, 98, 1296, 1159, 3430, 738, 3186, 1071, 2174, 3933};
paddle_mobile::framework::LoDTensor words; paddle_mobile::framework::LoDTensor words;
auto size = static_cast<int>(ids.size()); auto size = static_cast<int>(ids.size());
......
...@@ -52,8 +52,8 @@ int main() { ...@@ -52,8 +52,8 @@ int main() {
#else #else
auto time3 = time(); auto time3 = time();
paddle_mobile.FeedData(input_tensor); paddle_mobile.FeedData(input_tensor);
paddle_mobile.Predict_To(10); paddle_mobile.Predict_To(-1);
paddle_mobile.Predict_From(10); /*paddle_mobile.Predict_From(10);
auto tensor_ptr = paddle_mobile.FetchResult(9); auto tensor_ptr = paddle_mobile.FetchResult(9);
std::cout << "Tensor element number for op[9]: " << tensor_ptr->numel() std::cout << "Tensor element number for op[9]: " << tensor_ptr->numel()
<< std::endl; << std::endl;
...@@ -63,7 +63,7 @@ int main() { ...@@ -63,7 +63,7 @@ int main() {
auto time4 = time(); auto time4 = time();
std::cout << "predict cost :" << time_diff(time3, time4) << "ms" std::cout << "predict cost :" << time_diff(time3, time4) << "ms"
<< std::endl; << std::endl;*/
#endif #endif
} }
return 0; return 0;
......
...@@ -46,7 +46,7 @@ class TestBoxCoderOp { ...@@ -46,7 +46,7 @@ class TestBoxCoderOp {
DLOG << " Input TargetBox is : " << op->Input("TargetBox")[0]; DLOG << " Input TargetBox is : " << op->Input("TargetBox")[0];
DLOG << " OutputBox is : " << op->Output("OutputBox")[0]; DLOG << " OutputBox is : " << op->Output("OutputBox")[0];
DLOG << " code_type : " DLOG << " code_type : "
<< op->GetAttrMap().at("code_type").Get<std::string>(); << op->GetAttrMap().at("code_type").GetString();
std::shared_ptr<operators::BoxCoderOp<Dtype, float>> boxcoder = std::shared_ptr<operators::BoxCoderOp<Dtype, float>> boxcoder =
std::make_shared<operators::BoxCoderOp<Dtype, float>>( std::make_shared<operators::BoxCoderOp<Dtype, float>>(
op->Type(), op->GetInputs(), op->GetOutputs(), op->Type(), op->GetInputs(), op->GetOutputs(),
......
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "../test_helper.h"
#include "../test_include.h"
#include "operators/dequantize_op.h"
namespace paddle_mobile {
void dequantize(const Tensor* input, const float scale, Tensor* output) {
const int32_t* x = input->data<const int32_t>();
float* y = output->mutable_data<float>();
size_t size = output->numel();
for (size_t i = 0; i < size; ++i) {
y[i] = x[i] * scale;
}
}
int TestDequantizeOp() {
framework::DDim dim = framework::make_ddim({1, 3, 224, 224});
VariableNameMap inputs;
VariableNameMap outputs;
auto scope = std::make_shared<framework::Scope>();
inputs["X"] = std::vector<std::string>({"input"});
inputs["Scale"] = std::vector<std::string>({"scale"});
outputs["Out"] = std::vector<std::string>({"output"});
auto input_var = scope.get()->Var("input");
auto input = input_var->template GetMutable<framework::LoDTensor>();
SetupTensor<int32_t>(input, dim, -1000, 1000);
auto scale_var = scope.get()->Var("scale");
auto scale = scale_var->template GetMutable<framework::LoDTensor>();
scale->Resize(framework::make_ddim({1}));
scale->mutable_data<float>()[0] = 1.27;
auto output_var = scope.get()->Var("output");
framework::AttributeMap attrs;
attrs["weight_scale"].Set<float>(1.74);
auto* op = new operators::DequantizeOp<CPU, float>("dequantize", inputs,
outputs, attrs, scope);
op->InferShape();
op->Run();
auto output = output_var->template Get<framework::LoDTensor>();
const float* output_data = output->data<float>();
framework::Tensor output_cmp;
output_cmp.Resize(dim);
float dequant_scale = 1.f / (1.27 * 1.74);
dequantize(input, dequant_scale, &output_cmp);
const float* output_cmp_data = output_cmp.data<float>();
for (int i = 0; i < output->numel(); ++i) {
PADDLE_MOBILE_ENFORCE(output_data[i] == output_cmp_data[i],
"output[%d] = %.6f, output_cmp[%d] = %.6f", i,
output_data[i], i, output_cmp_data[i]);
}
delete op;
return 0;
}
} // namespace paddle_mobile
int main() { return paddle_mobile::TestDequantizeOp(); }
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "../test_helper.h"
#include "../test_include.h"
#include "operators/quantize_op.h"
namespace paddle_mobile {
// static float g_test_data[50] = {
// -5.55, -5.5, -5.45, -5.0, -4.55, -4.5, -4.45, -4.0, -3.55, -3.5,
// -3.45, -3.01, -2.75, -2.5, -2.501, -2.49, -2.01, -1.75, -1.5, -1.25,
// -1.0, -0.75, -0.5, -0.25, 0.0, 0.25, 0.5, 0.75, 1.0, 1.25,
// 1.5, 1.75, 2.01, 2.49, 2.501, 2.5, 2.75, 3.01, 3.45, 3.5,
// 3.55, 4.0, 4.45, 4.5, 4.55, 5.0, 5.45, 5.5, 5.55, 6.0,
// };
static float find_abs_max(const Tensor *input) {
float max_abs = 0.f;
const float *x = input->data<const float>();
size_t size = input->numel();
for (size_t i = 0; i < size; ++i) {
float value = std::abs(x[i]);
if (value > max_abs) {
max_abs = value;
}
}
return max_abs;
}
static void quantize_round_to_even(const Tensor *input, const float scale,
Tensor *output) {
const float *x = input->data<const float>();
int8_t *y = output->mutable_data<int8_t>();
size_t size = input->numel();
for (size_t i = 0; i < size; ++i) {
float value = x[i] * scale;
float v = round(value);
int32_t q = (int32_t)v;
if (abs(abs(q - value) - 0.5) > 0) {
y[i] = q;
} else {
if (abs(q) % 2 == 0) {
y[i] = q;
} else {
y[i] = q + ((q > 0) ? -1 : 1);
}
}
}
}
int TestQuantizeOp() {
framework::DDim dim = framework::make_ddim({1, 3, 224, 224});
VariableNameMap inputs;
VariableNameMap outputs;
auto scope = std::make_shared<framework::Scope>();
inputs["X"] = std::vector<std::string>({"input"});
outputs["Out"] = std::vector<std::string>({"output"});
outputs["OutScale"] = std::vector<std::string>({"output_scale"});
auto input_var = scope.get()->Var("input");
auto input = input_var->template GetMutable<framework::LoDTensor>();
SetupTensor<float>(input, dim, -100.f, 100.f);
auto output_var = scope.get()->Var("output");
auto output_scale_var = scope.get()->Var("output_scale");
framework::AttributeMap attrs;
auto *op = new operators::QuantizeOp<CPU, float>("quantize", inputs, outputs,
attrs, scope);
op->InferShape();
op->Run();
auto output = output_var->template Get<framework::LoDTensor>();
const int8_t *output_data = output->data<int8_t>();
auto output_scale = output_scale_var->template Get<framework::LoDTensor>();
const float *output_scale_data = output_scale->data<float>();
float max_abs = find_abs_max(input);
float output_scale_cmp = 127 / max_abs;
PADDLE_MOBILE_ENFORCE(output_scale_cmp == output_scale_data[0],
"output_scale = %.6f, output_scale_cmp = %.6f",
output_scale_cmp, output_scale_data[0]);
framework::Tensor output_cmp;
output_cmp.Resize(dim);
quantize_round_to_even(input, output_scale_cmp, &output_cmp);
int8_t *output_cmp_data = output_cmp.data<int8_t>();
for (int i = 0; i < output->numel(); ++i) {
PADDLE_MOBILE_ENFORCE(output_data[i] == output_cmp_data[i],
"output[%d] = %d, output_cmp[%d] = %d", i,
static_cast<int>(output_data[i]), i,
static_cast<int>(output_cmp_data[i]));
}
delete op;
return 0;
}
} // namespace paddle_mobile
int main() { return paddle_mobile::TestQuantizeOp(); }
...@@ -12,6 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ...@@ -12,6 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#include "../../src/operators/kernel/central-arm-func/sigmoid_arm_func.h"
#include "../../src/operators/kernel/sigmoid_kernel.h" #include "../../src/operators/kernel/sigmoid_kernel.h"
#include "../test_helper.h" #include "../test_helper.h"
#include "io/executor.h" #include "io/executor.h"
......
...@@ -121,6 +121,7 @@ if (CON GREATER -1) ...@@ -121,6 +121,7 @@ if (CON GREATER -1)
set(FUSION_CONVBNRELU_OP ON) set(FUSION_CONVBNRELU_OP ON)
set(FUSION_CONVBN_OP ON) set(FUSION_CONVBN_OP ON)
set(FUSION_CONVADD_OP ON) set(FUSION_CONVADD_OP ON)
set(MUL_OP ON)
set(FOUND_MATCH ON) set(FOUND_MATCH ON)
endif() endif()
...@@ -356,7 +357,7 @@ if (FUSION_CONVBN_OP) ...@@ -356,7 +357,7 @@ if (FUSION_CONVBN_OP)
endif() endif()
if (CONV_TRANSPOSE_OP) if (CONV_TRANSPOSE_OP)
add_definitions(-DCONV_TRANSPOSE) add_definitions(-DCONV_TRANSPOSE_OP)
endif() endif()
if (LOOKUP_OP) if (LOOKUP_OP)
...@@ -386,4 +387,4 @@ endif() ...@@ -386,4 +387,4 @@ endif()
if (SHAPE_OP) if (SHAPE_OP)
add_definitions(-DSHAPE_OP) add_definitions(-DSHAPE_OP)
endif() endif()
\ No newline at end of file
...@@ -3,7 +3,9 @@ ...@@ -3,7 +3,9 @@
TOTAL_ERRORS=0 TOTAL_ERRORS=0
# The trick to remove deleted files: https://stackoverflow.com/a/2413151 # The trick to remove deleted files: https://stackoverflow.com/a/2413151
for file in $(git diff --cached --name-status | awk '$1 != "D" {print $2}' | grep -v ".pb.cpp" | grep -v ".pb.h"); do for file in $(git diff --cached --name-status | awk '$1 != "D" {print $2}' | \
grep -v ".pb.cpp" | grep -v ".pb.h" | grep -v ".pb-c.h" | grep -v ".pb-c.c" | \
grep -v "protobuf-c.h" | grep -v "protobuf-c.c"); do
cpplint $file; cpplint $file;
TOTAL_ERRORS=$(expr $TOTAL_ERRORS + $?); TOTAL_ERRORS=$(expr $TOTAL_ERRORS + $?);
done done
......
# Created by .ignore support plugin (hsz.mobi)
### Python template
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class
# C extensions
*.so
# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST
# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec
# Installer logs
pip-log.txt
pip-delete-this-directory.txt
# Unit test / coverage reports
htmlcov/
.tox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
.hypothesis/
.pytest_cache/
# Translations
*.mo
*.pot
# Django stuff:
*.log
local_settings.py
db.sqlite3
# Flask stuff:
instance/
.webassets-cache
# Scrapy stuff:
.scrapy
# Sphinx documentation
docs/_build/
# PyBuilder
target/
# Jupyter Notebook
.ipynb_checkpoints
# pyenv
.python-version
# celery beat schedule file
celerybeat-schedule
# SageMath parsed files
*.sage.py
# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/
# Spyder project settings
.spyderproject
.spyproject
# Rope project settings
.ropeproject
# mkdocs documentation
/site
# mypy
.mypy_cache/
/yolo/datas/
/mobilenet/datas/
...@@ -5,22 +5,28 @@ layer_mdl_conv = 'ConvolutionLayer' ...@@ -5,22 +5,28 @@ layer_mdl_conv = 'ConvolutionLayer'
layer_mdl_deepwise_conv = 'DepthwiseConvolutionLayer' layer_mdl_deepwise_conv = 'DepthwiseConvolutionLayer'
layer_mdl_relu = 'ReluLayer' layer_mdl_relu = 'ReluLayer'
layer_mdl_pointwise_add = 'PointwiseConvolutionLayer' layer_mdl_pointwise_add = 'PointwiseConvolutionLayer'
layer_mdl_pooling = 'PoolingLayer'
layer_mdl_softmax = 'SoftmaxLayer'
# fluid ops # fluid ops
op_fluid_fusion_conv_add = 'fusion_conv_add' op_fluid_fusion_conv_add = 'fusion_conv_add'
op_fluid_relu = 'relu' op_fluid_relu = 'relu'
op_fluid_pooling = 'pool2d'
op_fluid_softmax = 'softmax'
# dict mdk layer --- fluid op # dict mdk layer --- fluid op
mdl2fluid_op_layer_dict = { mdl2fluid_op_layer_dict = {
layer_mdl_conv: op_fluid_fusion_conv_add, layer_mdl_conv: op_fluid_fusion_conv_add,
layer_mdl_deepwise_conv: op_fluid_fusion_conv_add, layer_mdl_deepwise_conv: op_fluid_fusion_conv_add,
layer_mdl_relu: op_fluid_relu, layer_mdl_relu: op_fluid_relu,
layer_mdl_pointwise_add: op_fluid_fusion_conv_add layer_mdl_pointwise_add: op_fluid_fusion_conv_add,
layer_mdl_pooling: op_fluid_pooling,
layer_mdl_softmax: op_fluid_softmax
} }
mdl_outputs_key = "outputs" mdl_outputs_key = "outputs"
mdl_inputs_key = "inputs" mdl_inputs_key = "inputs"
mdl_weight_key = "weights" mdl_weight_key = "weight"
mdl_attrs_key = "params" mdl_attrs_key = "params"
# dict of mdl-input _out param to fluid input out attrs # dict of mdl-input _out param to fluid input out attrs
...@@ -39,13 +45,30 @@ fusion_conv_add_dict = { ...@@ -39,13 +45,30 @@ fusion_conv_add_dict = {
relu_dict = { relu_dict = {
mdl_inputs_key: 'X', mdl_inputs_key: 'X',
mdl_outputs_key: 'Out', mdl_outputs_key: 'Out',
mdl_weight_key: () # mdl_weight_key: ()
} }
pool2d_dict = {
mdl_inputs_key: 'X',
mdl_outputs_key: 'Out',
# mdl_weight_key: (),
mdl_attrs_key: ('pooling_type', 'global_pooling')
}
softmax_dict = {
mdl_inputs_key: 'X',
mdl_outputs_key: 'Out',
mdl_weight_key: (),
mdl_attrs_key: ()
}
# mdl layers --- fluid ops # mdl layers --- fluid ops
op_io_dict = { op_io_dict = {
'fusion_conv_add': fusion_conv_add_dict, 'fusion_conv_add': fusion_conv_add_dict,
'relu': relu_dict 'relu': relu_dict,
'pool2d': pool2d_dict,
'softmax': softmax_dict
} }
# fluid attr key --- mdl params key # fluid attr key --- mdl params key
...@@ -60,64 +83,3 @@ fluid_attrs_type_dict = { ...@@ -60,64 +83,3 @@ fluid_attrs_type_dict = {
'strides': 6, 'strides': 6,
'groups': 6 'groups': 6
} }
# '': "bias_term", 是不是要add 目前 yolo的模型都是 bias_term = 1
# attrs {
# name: "axis"
# type: INT
# i: 1
# }
# attrs_name = {
# 'name': "workspace_size_MB",
# 'type': 'INT',
# 'i': '4096'
# }
# attrs
# {
# name: "data_format"
# type: STRING
# s: "AnyLayout"
# }
# attrs
# {
# name: "use_mkldnn"
# type: BOOLEAN
# b: false
# }
# attrs
# {
# name: "use_cudnn"
# type: BOOLEAN
# b: true
# }
# attrs
# {
# name: "dilations"
# type: INTS
# ints: 1
# ints: 1
# }
# attrs
# {
# name: "groups"
# type: INT
# i: 1
# }
# attrs
# {
# name: "paddings"
# type: INTS
# ints: 0
# ints: 0
# }
# attrs
# {
# name: "strides"
# type: INTS
# ints: 1
# ints: 1
# }
import json
import os
from core import framework_pb2 as framework_pb2, op_types as types
from mobilenet.swicher import Swichter
import shutil
def load_mdl(mdl_json_path):
# print('mdl json path : ' + mdl_json_path)
with open(mdl_json_path, 'r') as f:
return json.load(f)
class Converter:
'convert mdlmodel to fluidmodel'
def __init__(self, base_dir, mdl_json_path):
self.mdl_json_path = base_dir + mdl_json_path
self.base_dir = base_dir
print mdl_json_path
self.mdl_json = load_mdl(self.mdl_json_path)
self.program_desc = framework_pb2.ProgramDesc()
self.weight_list_ = []
self.deepwise_weight_list_ = []
# print(json_dick)
# layers = (json_dick['layer'])
# for layer in layers:
# print(layer)
def convert(self):
print 'convert begin.....'
# add block_desc
block_desc = self.program_desc.blocks.add()
block_desc.idx = 0
block_desc.parent_idx = -1
self.package_ops(block_desc)
self.package_vars(block_desc)
print 'blocks: '
print self.program_desc.blocks
print 'convert end.....'
desc_serialize_to_string = self.program_desc.SerializeToString()
outputmodel_ = self.base_dir + 'datas/target/outputmodel/'
if os.path.exists(outputmodel_):
shutil.rmtree(outputmodel_)
os.makedirs(outputmodel_, 0777)
# todo copy weight files
# if os.path.exists(outputmodel_):
# shutil.rmtree(outputmodel_)
# shutil.copytree('yolo/datas/multiobjects/float32s_nchw_with_head/', 'mobilenet/datas/target/outputmodel/')
f = open(outputmodel_ + "__model__", "wb")
f.write(desc_serialize_to_string)
f.close()
def package_ops(self, block_desc):
self.add_op_feed(block_desc)
# add ops with layer
if 'layer' in self.mdl_json:
layers_ = self.mdl_json['layer']
for layer in layers_:
desc_ops_add = block_desc.ops.add()
# print layer
# for i in layer:
# print i
if 'name' in layer:
l_name = layer['name']
if 'type' in layer:
self.package_ops_type(desc_ops_add, layer)
if 'weight' in layer:
self.package_ops_weight2inputs(desc_ops_add, layer)
if 'output' in layer:
self.package_ops_outputs(desc_ops_add, layer)
if 'input' in layer:
self.package_ops_inputs(desc_ops_add, layer)
self.package_ops_attrs(desc_ops_add, layer)
self.add_op_fetch(block_desc)
def add_op_feed(self, block_desc):
desc_ops_add = block_desc.ops.add()
inputs_add = desc_ops_add.inputs.add()
inputs_add.parameter = 'X'
inputs_add.arguments.append('feed')
desc_ops_add.type = 'feed'
outputs_add = desc_ops_add.outputs.add()
outputs_add.parameter = 'Out'
outputs_add.arguments.append('data')
attrs_add = desc_ops_add.attrs.add()
attrs_add.name = 'col'
# int
attrs_add.type = 0
attrs_add.i = 0
def add_op_fetch(self, block_desc):
desc_ops_add = block_desc.ops.add()
inputs_add = desc_ops_add.inputs.add()
inputs_add.parameter = 'X'
inputs_add.arguments.append('conv_pred_87')
desc_ops_add.type = 'fetch'
outputs_add = desc_ops_add.outputs.add()
outputs_add.parameter = 'Out'
outputs_add.arguments.append('fetch')
attrs_add = desc_ops_add.attrs.add()
attrs_add.name = 'col'
# int
attrs_add.type = 0
attrs_add.i = 0
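# note: the feed and fetch ops above each carry an integer 'col' attr, which in
# fluid presumably indexes into the feed / fetch variable list; this converter
# only ever wires one input ('data') and one output ('conv_pred_87'), so it
# stays at 0. The fetch target name is hard-coded for this particular model.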
@staticmethod
def package_ops_attrs(desc_ops_add, layer):
# print l_params
# print desc_ops_add.type
if desc_ops_add.type == types.op_fluid_fusion_conv_add:
Converter.pack_fusion_conv_add_attr(desc_ops_add, layer)
elif desc_ops_add.type == types.op_fluid_relu:
# relu : attrs
attrs_add = desc_ops_add.attrs.add()
attrs_add.name = 'use_mkldnn'
# boolean
attrs_add.type = 6
attrs_add.b = 0
@staticmethod
def pack_fusion_conv_add_attr(desc_ops_add, layer):
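# the numeric attr type codes used below appear to follow framework.proto's
# AttrType enum (0 = INT, 2 = STRING, 3 = INTS, 6 = BOOLEAN), matching the
# inline comments next to each assignment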
# fusion_conv_add : attrs
attrs_add = desc_ops_add.attrs.add()
attrs_add.name = 'workspace_size_MB'
# 0-->INT
attrs_add.type = 0
attrs_add.i = 4096
attrs_add = desc_ops_add.attrs.add()
attrs_add.name = 'data_format'
# 2-->STRING
attrs_add.type = 2
attrs_add.s = 'AnyLayout'
attrs_add = desc_ops_add.attrs.add()
attrs_add.name = 'use_mkldnn'
# boolean
attrs_add.type = 6
attrs_add.b = 0
attrs_add = desc_ops_add.attrs.add()
attrs_add.name = 'use_cudnn'
# boolean
attrs_add.type = 6
attrs_add.b = 1
attrs_add = desc_ops_add.attrs.add()
attrs_add.name = 'dilations'
# ints
attrs_add.type = 3
attrs_add.ints.append(1)
attrs_add.ints.append(1)
attrs_add = desc_ops_add.attrs.add()
attrs_add.name = 'axis'
# int
attrs_add.type = 0
attrs_add.i = 1
if 'param' in layer:
l_params = layer['param']
attrs_add = desc_ops_add.attrs.add()
attrs_add.name = 'paddings'
# ints
attrs_add.type = 3
attrs_add.ints.append(l_params[types.fusion_conv_add_attrs_dict.get('paddings')])
attrs_add.ints.append(l_params[types.fusion_conv_add_attrs_dict.get('paddings')])
attrs_add = desc_ops_add.attrs.add()
attrs_add.name = 'strides'
# ints
attrs_add.type = 3
attrs_add.ints.append(l_params[types.fusion_conv_add_attrs_dict.get('strides')])
attrs_add.ints.append(l_params[types.fusion_conv_add_attrs_dict.get('strides')])
attrs_add = desc_ops_add.attrs.add()
attrs_add.name = 'groups'
# int
attrs_add.type = 0
attrs_add.i = l_params[types.fusion_conv_add_attrs_dict.get('groups')]
# attrs_add.i = 1
#
# op_attrs_tupl = types.op_io_dict.get(desc_ops_add.type) \
# .get(types.mdl_attrs_key)
#
#
#
#
# # group stride padding
# print '----------------------'
# for i, val in enumerate(op_attrs_tupl):
# attrs_add = desc_ops_add.attrs.add()
# attr_name = op_attrs_tupl[i]
# print attr_name
# attrs_add.name = attr_name
# attrs_add.type = types.fluid_attrs_type_dict.get(attr_name)
# attrs_add.
# print l_params[types.fusion_conv_add_attrs_dict.get(attr_name)]
# for p in l_params:
# attrs_add = desc_ops_add.attrs.add()
@staticmethod
def package_ops_inputs(desc_ops_add, layer):
l_inputs = layer['input']
for i in l_inputs:
inputs_add = desc_ops_add.inputs.add()
# print i
inputs_add.parameter = types.op_io_dict.get(desc_ops_add.type).get(types.mdl_inputs_key)
inputs_add.arguments.append(i)
@staticmethod
def package_ops_outputs(desc_ops_add, layer):
l_outputs = layer['output']
for o in l_outputs:
# print o
outputs_add = desc_ops_add.outputs.add()
dict = types.op_io_dict.get(desc_ops_add.type)
print 'desc_ops_add.type: ' + desc_ops_add.type
print dict
outputs_add.parameter = dict.get(types.mdl_outputs_key)
outputs_add.arguments.append(o)
def package_ops_weight2inputs(self, desc_ops_add, layer):
l_weights = layer['weight']
for w in l_weights:
self.weight_list_.append(w)
if layer['type'] == types.layer_mdl_deepwise_conv:
# print l_weights[0]
self.deepwise_weight_list_.append(l_weights[0])
op_weight_tup = types.op_io_dict.get(desc_ops_add.type).get(types.mdl_weight_key)
if op_weight_tup is not None:
# print len(op_weight_tup)
for i, val in enumerate(op_weight_tup):
# print i
# print val
inputs_add = desc_ops_add.inputs.add()
inputs_add.parameter = op_weight_tup[i]
inputs_add.arguments.append(l_weights[i])
# for w in l_weights:
# inputs_add = desc_ops_add.inputs.add()
# # print w
# inputs_add.parameter = op_weight_tup[0]
# inputs_add.arguments.append(w)
@staticmethod
def package_ops_type(desc_ops_add, layer):
l_type = layer['type']
# print l_type
# print mdl2fluid_op_layer_dict.get(l_type)
desc_ops_add.type = types.mdl2fluid_op_layer_dict.get(l_type)
def package_vars(self, block_desc):
vars_add = block_desc.vars.add()
vars_add.name = 'feed'
vars_add.type.type = 9 # 9 is FEED_MINIBATCH
vars_add.persistable = 1
# fetch
vars_add = block_desc.vars.add()
vars_add.name = 'fetch'
vars_add.type.type = 10 # 10 is fetch list
vars_add.persistable = 1
json_matrix_ = self.mdl_json['matrix']
# print json_matrix_
for j in json_matrix_:
vars_add = block_desc.vars.add()
vars_add.name = j
vars_add.type.type = 7 # 7 is lodtensor
# print j
tensor = vars_add.type.lod_tensor.tensor
tensor.data_type = 5 # 5 is FP32
# print json_matrix_
dims_of_matrix = json_matrix_.get(j)
# dims_size = len(dims_of_matrix)
# print dims_size
# if dims_size == 4:
# tensor.dims.append(dims_of_matrix[0]) # N
# tensor.dims.append(dims_of_matrix[3]) # C
# tensor.dims.append(dims_of_matrix[1]) # H
# tensor.dims.append(dims_of_matrix[2]) # W
# else:
# workaround for an mdl model issue: depthwise filter dims arrive with N and C swapped
if j in self.deepwise_weight_list_ and len(dims_of_matrix) == 4:
print j
tensor.dims.append(dims_of_matrix[1])
tensor.dims.append(dims_of_matrix[0])
tensor.dims.append(dims_of_matrix[2])
tensor.dims.append(dims_of_matrix[3])
print tensor.dims
else:
for dims in dims_of_matrix:
# print dims
tensor.dims.append(dims)
if j in self.weight_list_:
vars_add.persistable = 1
dims_size = len(dims_of_matrix)
# print dims_size
# if dims_size == 4:
# # convert weight from nhwc to nchw
# Swichter().nhwc2nchw_one_slice_add_head(
# 'yolo/datas/multiobjects/float32s_nhwc/' + j + '.bin',
# 'yolo/datas/multiobjects/float32s_nchw_with_head/' + j,
# 'yolo/datas/multiobjects/float32s_nchw/' + j + '.tmp',
# dims_of_matrix[0],
# dims_of_matrix[1],
# dims_of_matrix[2],
# dims_of_matrix[3]
# )
# else:
# Swichter().copy_add_head(
# 'yolo/datas/multiobjects/float32s_nhwc/' + j + '.bin',
# 'yolo/datas/multiobjects/float32s_nchw_with_head/' + j,
# 'yolo/datas/multiobjects/float32s_nchw/' + j + '.tmp'
# )
else:
vars_add.persistable = 0
mdl_path = "datas/sourcemodels/cls231_0802/mobileNetModel.json"
base_dir = "/Users/xiebaiyuan/PaddleProject/paddle-mobile/tools/python/modeltools/mobilenet/"
converter = Converter(base_dir, mdl_path)
converter.convert()
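A quick way to sanity-check the emitted __model__ is to parse it back with the same protobuf definition and count what was packaged. A minimal sketch under the same assumptions as the script above (the core.framework_pb2 module and the base_dir/output layout used by convert()):

from core import framework_pb2

def check_model(model_path):
    desc = framework_pb2.ProgramDesc()
    with open(model_path, "rb") as f:
        desc.ParseFromString(f.read())
    # a single block (idx 0, parent -1) holding the feed -> layers -> fetch chain is expected
    for block in desc.blocks:
        print 'block %d: %d ops, %d vars' % (block.idx, len(block.ops), len(block.vars))

check_model(base_dir + 'datas/target/outputmodel/__model__')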
...@@ -58,7 +58,7 @@ class Swichter:
to_file = open(to_file_name, "wb")
tmp = tmp_file.read()
head = self.read_head('yolo/datas/yolo/conv1_biases')
to_file.write(head)
to_file.write(tmp)
tmp_file.close()
...@@ -77,7 +77,7 @@ class Swichter:
to_file = open(to_file_name, "wb")
# tmp_file = open(tmp_file_name, "wb")
head = self.read_head('yolo/datas/yolo/conv1_biases')
to_file.write(head)
to_file.write(from_file.read())
from_file.close()
...@@ -96,7 +96,7 @@ class Swichter:
to_file = open(to_file_name, "wb")
# tmp_file = open(tmp_file_name, "wb")
head = self.read_head('yolo/datas/yolo/conv1_biases')
to_file.write(head)
to_file.write(read)
from_file.close()
...@@ -104,12 +104,12 @@ class Swichter:
pass
# Swichter().nhwc2nchw_one_slice_add_head(
# '/Users/xiebaiyuan/PaddleProject/paddle-mobile/python/tools/modeltools/multiobjects/float32s_nhwc/conv1_0.bin',
# '/Users/xiebaiyuan/PaddleProject/paddle-mobile/python/tools/modeltools/multiobjects/float32s_nchw_with_head/conv1_0',
# '/Users/xiebaiyuan/PaddleProject/paddle-mobile/python/tools/modeltools/multiobjects/float32s_nchw/.tmp',
# 32,
# 3, 3, 3)
# Swichter().read_head('/Users/xiebaiyuan/PaddleProject/paddle-mobile/python/tools/modeltools/yolo/conv1_biases')
# Swichter().copy_add_head('datas/model.0.0.weight', 'datas/conv1_0', '')
import datetime
import json
import os
import google.protobuf as pbg
import framework_pb2 as framework_pb2
def loadmdl(json_path):
...
import os
from core import framework_pb2 as framework_pb2
def read_model(model_path):
...@@ -16,7 +16,7 @@ def read_model(model_path):
# print desc.blocks
except IOError:
print ": File not found."
def get_file_size(file_path):
...@@ -26,5 +26,5 @@ def get_file_size(file_path):
return round(fsize, 2)
path = '/Users/xiebaiyuan/PaddleProject/paddle-mobile/tools/python/modeltools/mobilenet/datas/sourcemodels/mobilenet_example/mobilenet/__model__'
read_model(path)
import json
import os
from core import framework_pb2 as framework_pb2, op_types as types
from yolo.swicher import Swichter
import shutil
...@@ -40,10 +38,10 @@ class Converter:
print self.program_desc.blocks
print 'convert end.....'
desc_serialize_to_string = self.program_desc.SerializeToString()
shutil.rmtree('yolo/datas/newyolo/')
shutil.copytree('yolo/datas/multiobjects/float32s_nchw_with_head/', 'yolo/datas/newyolo/')
f = open("yolo/datas/newyolo/__model__", "wb")
f.write(desc_serialize_to_string)
f.close()
...@@ -312,9 +310,9 @@ class Converter:
if dims_size == 4:
# convert weight from nhwc to nchw
Swichter().nhwc2nchw_one_slice_add_head(
'yolo/datas/multiobjects/float32s_nhwc/' + j + '.bin',
'yolo/datas/multiobjects/float32s_nchw_with_head/' + j,
'yolo/datas/multiobjects/float32s_nchw/' + j + '.tmp',
dims_of_matrix[0],
dims_of_matrix[1],
dims_of_matrix[2],
...@@ -322,14 +320,14 @@ class Converter:
)
else:
Swichter().copy_add_head(
'yolo/datas/multiobjects/float32s_nhwc/' + j + '.bin',
'yolo/datas/multiobjects/float32s_nchw_with_head/' + j,
'yolo/datas/multiobjects/float32s_nchw/' + j + '.tmp'
)
else:
vars_add.persistable = 0
mdl_path = "yolo/datas/multiobjects/YOLO_Universal.json"
converter = Converter(mdl_path)
converter.convert()
from array import array
class Swichter:
def __init__(self):
pass
def nhwc2nchw_one_slice(self, from_file_name, to_file_name, batch, channel, height, width):
from_file = open(from_file_name, "rb")
to_file = open(to_file_name, "wb")
float_array = array("f")
float_array.fromfile(from_file, width * height * batch * channel)
float_write_array = array("f")
for b in range(batch):
for c in range(channel):
for h in range(height):
for w in range(width):
float_value = float_array[b * channel * width * height
+ channel * (h * width + w) + c]
float_write_array.append(float_value)
float_write_array.tofile(to_file)
from_file.close()
to_file.close()
def copy(self, from_file_name, to_file_name):
from_file = open(from_file_name, "rb")
to_file = open(to_file_name, "wb")
to_file.write(from_file.read())
from_file.close()
to_file.close()
def nhwc2nchw_one_slice_add_head(self, from_file_name, to_file_name, tmp_file_name, batch, channel, height, width):
from_file = open(from_file_name, "rb")
tmp_file = open(tmp_file_name, "wb+")
float_array = array("f")
float_array.fromfile(from_file, width * height * batch * channel)
float_write_array = array("f")
for b in range(batch):
for c in range(channel):
for h in range(height):
for w in range(width):
float_value = float_array[b * channel * width * height
+ channel * (h * width + w) + c]
float_write_array.append(float_value)
float_write_array.tofile(tmp_file)
tmp_file.close()
from_file.close()
tmp_file = open(tmp_file_name, "rb")
to_file = open(to_file_name, "wb")
tmp = tmp_file.read()
head = self.read_head('yolo/datas/yolo/conv1_biases')
to_file.write(head)
to_file.write(tmp)
tmp_file.close()
to_file.close()
def read_head(self, head_file):
from_file = open(head_file, "rb")
read = from_file.read(24)
# print read
from_file.close()
# print read
return read
def copy_add_head(self, from_file_name, to_file_name, tmp_file_name):
from_file = open(from_file_name, "rb")
to_file = open(to_file_name, "wb")
# tmp_file = open(tmp_file_name, "wb")
head = self.read_head('yolo/datas/yolo/conv1_biases')
to_file.write(head)
to_file.write(from_file.read())
from_file.close()
to_file.close()
pass
def copy_padding_add_head(self, from_file_name, to_file_name, tmp_file_name, padding):
print 'padding = %d' % padding
from_file = open(from_file_name, "rb")
# print len(from_file.read())
from_file.seek(padding, 0)
read = from_file.read()
print len(read)
to_file = open(to_file_name, "wb")
# tmp_file = open(tmp_file_name, "wb")
head = self.read_head('yolo/datas/yolo/conv1_biases')
to_file.write(head)
to_file.write(read)
from_file.close()
to_file.close()
pass
# Swichter().nhwc2nchw_one_slice_add_head(
# '/Users/xiebaiyuan/PaddleProject/paddle-mobile/python/tools/modeltools/multiobjects/float32s_nhwc/conv1_0.bin',
# '/Users/xiebaiyuan/PaddleProject/paddle-mobile/python/tools/modeltools/multiobjects/float32s_nchw_with_head/conv1_0',
# '/Users/xiebaiyuan/PaddleProject/paddle-mobile/python/tools/modeltools/multiobjects/float32s_nchw/.tmp',
# 32,
# 3, 3, 3)
# Swichter().read_head('/Users/xiebaiyuan/PaddleProject/paddle-mobile/python/tools/modeltools/yolo/conv1_biases')
# Swichter().copy_add_head('datas/model.0.0.weight', 'datas/conv1_0', '')
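The NHWC-to-NCHW reordering above relies on the flat-index identity index_nhwc(b, h, w, c) = b*C*H*W + (h*W + w)*C + c. A tiny self-contained check of that mapping (illustrative only; shapes and values are made up, no files involved):

# enumerate a 1x2x2x2 tensor in NHWC order, tagging each element with its (h, w, c)
batch, channel, height, width = 1, 2, 2, 2
nhwc = [h * 100 + w * 10 + c
        for b in range(batch)
        for h in range(height)
        for w in range(width)
        for c in range(channel)]
# reorder with the same index expression Swichter uses
nchw = []
for b in range(batch):
    for c in range(channel):
        for h in range(height):
            for w in range(width):
                nchw.append(nhwc[b * channel * width * height
                                 + channel * (h * width + w) + c])
print nchw  # [0, 10, 100, 110, 1, 11, 101, 111]: all c == 0 values first, then c == 1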