提交 529f254e 编写于 作者: L liuruilong

Merge branch 'metal' of https://github.com/codeWorm2015/paddle-mobile into metal

......@@ -70,6 +70,12 @@ build
cmake-build-debug
cmake-build-release
#ios demo
demo/ios/PaddleMobileDemo/PaddleMobileDemo/googlenet_combine/
demo/ios/PaddleMobileDemo/PaddleMobileDemo/*.jpg
demo/ios/PaddleMobileDemo/PaddleMobileDemo/PaddleMobile/*.a
*.xcuserstate
/tools/quantification/quantify
# metal
Podfile.lock
......@@ -78,12 +84,3 @@ SwiftProtobuf.framework
paddle-mobile.xcworkspace
metal/models/
metal/images/
[submodule "src/operators/kernel/mali/ACL_Android"]
path = src/operators/kernel/mali/ACL_Android
url = https://github.com/halsay/ACL_Android.git
cmake_minimum_required(VERSION 3.0)
cmake_minimum_required(VERSION 3.6)
project(paddle-mobile)
option(DEBUGING "enable debug mode" ON)
......@@ -6,41 +6,30 @@ option(USE_OPENMP "openmp support" OFF)
option(USE_EXCEPTION "use std exception" ON)
option(LOG_PROFILE "log profile" ON)
# select the platform to build
option(CPU "cpu" ON)
option(MALI_GPU "mali gpu" ON)
option(CPU "armv7 with neon" ON)
option(MALI_GPU "mali gpu" OFF)
option(FPGA "fpga" OFF)
if (CPU)
add_definitions(-DPADDLE_MOBILE_CPU)
endif()
if (MALI_GPU)
add_definitions(-DPADDLE_MOBILE_MALI_GPU)
endif()
file(GLOB_RECURSE PADDLE_MOBILE_CC src/*.cc src/*.cpp src/*.c src/*.mm)
file(GLOB_RECURSE PADDLE_MOBILE_H src/*.h)
include_directories(src/)
if(FPGA)
add_definitions(-DPADDLE_MOBILE_FPGA)
if(IS_IOS)
set(CMAKE_CXX_FLAGS "-mfpu=neon -marm -fobjc-abi-version=2 -fobjc-arc -std=gnu++11 -stdlib=libc++ -O3 -s -isysroot ${CMAKE_OSX_SYSROOT} ${CMAKE_CXX_FLAGS}")
else()
set(CMAKE_CXX_FLAGS "-std=c++14 -O3 -s ${CMAKE_CXX_FLAGS}")
endif()
set(CMAKE_CXX_FLAGS "-std=c++14 -O3 -s ${CMAKE_CXX_FLAGS}")
if (DEBUGING)
set(CMAKE_BUILD_TYPE Debug)
set(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS}")
else()
message(STATUS "debug")
set(CMAKE_BUILD_TYPE Release)
endif ()
if(DEBUGING)
message(STATUS "debuging")
set(CMAKE_CXX_FLAGS_RELEASE "-DNDEBUG")
add_definitions(-DPADDLE_MOBILE_DEBUG)
if(ANDROID)
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -llog")
endif()
else()
message(STATUS "releasing")
else ()
set(CMAKE_BUILD_TYPE Release)
set(CMAKE_CXX_FLAGS_RELEASE "-DNDEBUG")
add_definitions(-fvisibility=hidden -fvisibility-inlines-hidden)
endif()
endif ()
if (USE_EXCEPTION)
message(STATUS "use exception")
......@@ -54,115 +43,123 @@ if (LOG_PROFILE)
add_definitions(-DPADDLE_MOBILE_PROFILE)
endif()
if(IS_MAC)
add_definitions(-DX86)
elseif(IS_IOS)
add_definitions(-DIOS)
elseif(V7)
add_definitions(-DARMV7)
elseif(V8)
add_definitions(-DARMV8)
else ()
add_definitions(-DX86)
if(USE_OPENMP AND NOT IS_IOS)
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fopenmp")
add_definitions(-DPADDLE_MOBILE_USE_OPENMP)
endif()
set(CMAKE_VERBOSE_MAKEFILE ON)
set(CMAKE_EXPORT_COMPILE_COMMANDS ON)
set(CMAKE_ARCHIVE_OUTPUT_DIRECTORY build)
set(CMAKE_LIBRARY_OUTPUT_DIRECTORY build)
set(CMAKE_RUNTIME_OUTPUT_DIRECTORY build)
file(GLOB_RECURSE PADDLE_MOBILE_CC src/*.cc src/*.cpp src/*.c)
file(GLOB_RECURSE PADDLE_MOBILE_H src/*.h)
if (NOT ANDROID)
list(REMOVE_ITEM PADDLE_MOBILE_CC ${CMAKE_CURRENT_SOURCE_DIR}/src/jni/*.cpp)
list(REMOVE_ITEM PADDLE_MOBILE_CC ${CMAKE_CURRENT_SOURCE_DIR}/src/jni/*.h)
list(REMOVE_ITEM PADDLE_MOBILE_CC ${CMAKE_CURRENT_SOURCE_DIR}/src/operators/math/math_func_neon.h)
# platform control
if (ARM_LINUX)
include("${CMAKE_CURRENT_LIST_DIR}/tools/arm-platform.cmake")
endif ()
include_directories(src/)
if (CPU)
add_definitions(-DPADDLE_MOBILE_CPU)
else()
file(GLOB_RECURSE _tmp_list src/operators/kernel/arm/*.cpp src/operators/kernel/arm/*.cc)
foreach(f ${_tmp_list})
list(REMOVE_ITEM PADDLE_MOBILE_CC ${f})
endforeach()
file(GLOB_RECURSE _tmp_list_h src/operators/kernel/arm/*.h)
foreach(f ${_tmp_list_h})
list(REMOVE_ITEM PADDLE_MOBILE_H ${f})
endforeach()
endif()
if(USE_OPENMP)
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fopenmp")
add_definitions(-DPADDLE_MOBILE_USE_OPENMP)
if (MALI_GPU)
add_definitions(-DPADDLE_MOBILE_MALI_GPU)
add_definitions(-DUSE_ACL=1)
add_definitions(-DUSE_OPENCL)
set(ACL_ROOT ${CMAKE_CURRENT_SOURCE_DIR}/src/operators/kernel/mali/ACL_Android)
include_directories(${ACL_ROOT} ${ACL_ROOT}/include)
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -L${ACL_ROOT}/build")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -larm_compute")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -larm_compute_core")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -larm_compute_graph")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -L${ACL_ROOT}/build/opencl-1.2-stubs")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -lOpenCL")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DUSE_ACL=1")
else()
file(GLOB_RECURSE _tmp_list src/operators/kernel/mali/*.cpp src/operators/kernel/mali/*.cc)
foreach(f ${_tmp_list})
list(REMOVE_ITEM PADDLE_MOBILE_CC ${f})
endforeach()
file(GLOB_RECURSE _tmp_list_h src/operators/kernel/mali/*.h)
foreach(f ${_tmp_list_h})
list(REMOVE_ITEM PADDLE_MOBILE_H ${f})
endforeach()
endif()
if (googlenet)
add_definitions(-DCONCAT_OP)
add_definitions(-DCONV_OP)
add_definitions(-DLRN_OP)
add_definitions(-DMUL_OP)
add_definitions(-DELEMENTWISEADD_OP)
add_definitions(-DFUSION_FC_OP)
add_definitions(-DPOOL_OP)
add_definitions(-DRELU_OP)
add_definitions(-DFUSION_CONVADD_OP)
add_definitions(-DFUSION_CONVADD_RELU_OP)
elseif (mobilenet)
add_definitions(-DCONV_OP)
add_definitions(-DELEMENTWISEADD_OP)
add_definitions(-DRELU_OP)
add_definitions(-DSOFTMAX_OP)
add_definitions(-DSOFTMAX_OP)
add_definitions(-DDEPTHWISECONV_OP)
add_definitions(-DBATCHNORM_OP)
add_definitions(-DPOOL_OP)
add_definitions(-DRESHAPE_OP)
elseif (yolo)
add_definitions(-DBATCHNORM_OP)
add_definitions(-DCONV_OP)
add_definitions(-DRELU_OP)
add_definitions(-DELEMENTWISEADD_OP)
elseif (squeezenet)
add_definitions(-DCONCAT_OP)
add_definitions(-DCONV_OP)
add_definitions(-DRELU_OP)
add_definitions(-DELEMENTWISEADD_OP)
add_definitions(-DPOOL_OP)
add_definitions(-DRESHAPE_OP)
add_definitions(-DSOFTMAX_OP)
elseif(resnet)
add_definitions(-DCONV_OP)
add_definitions(-DBATCHNORM_OP)
add_definitions(-DELEMENTWISEADD_OP)
add_definitions(-DSOFTMAX_OP)
add_definitions(-DMUL_OP)
add_definitions(-DPOOL_OP)
add_definitions(-DRELU_OP)
else ()
add_definitions(-DBATCHNORM_OP)
add_definitions(-DBOXCODER_OP)
add_definitions(-DCONCAT_OP)
add_definitions(-DCONV_OP)
add_definitions(-DDEPTHWISECONV_OP)
add_definitions(-DELEMENTWISEADD_OP)
add_definitions(-DFUSION_CONVADD_OP)
add_definitions(-DCONVADDRELU_OP)
add_definitions(-DFUSION_FC_OP)
add_definitions(-DLRN_OP)
add_definitions(-DMUL_OP)
add_definitions(-DMULTICLASSNMS_OP)
add_definitions(-DPOOL_OP)
add_definitions(-DPRIORBOX_OP)
add_definitions(-DRELU_OP)
add_definitions(-DRESHAPE_OP)
add_definitions(-DSIGMOID_OP)
add_definitions(-DSOFTMAX_OP)
add_definitions(-DTRANSPOSE_OP)
add_definitions(-DFUSION_CONVADD_RELU_OP)
if(FPGA)
add_definitions(-DPADDLE_MOBILE_FPGA)
else()
file(GLOB_RECURSE _tmp_list src/operators/kernel/fpga/*.cpp src/operators/kernel/fpga/*.cc)
foreach(f ${_tmp_list})
list(REMOVE_ITEM PADDLE_MOBILE_CC ${f})
endforeach()
file(GLOB_RECURSE _tmp_list_h src/operators/kernel/fpga/*.h)
foreach(f ${_tmp_list_h})
list(REMOVE_ITEM PADDLE_MOBILE_H ${f})
endforeach()
file(GLOB_RECURSE _tmp_list src/fpga/*.cpp src/fpga/*.cc)
foreach(f ${_tmp_list})
list(REMOVE_ITEM PADDLE_MOBILE_CC ${f})
endforeach()
file(GLOB_RECURSE _tmp_list_h src/fpga/*.h)
foreach(f ${_tmp_list_h})
list(REMOVE_ITEM PADDLE_MOBILE_H ${f})
endforeach()
endif()
if (ANDROID_NDK_TOOLCHAIN_INCLUDED)
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -llog")
else()
list(REMOVE_ITEM PADDLE_MOBILE_H ${CMAKE_CURRENT_SOURCE_DIR}/src/jni/paddle_mobile_jni.h)
list(REMOVE_ITEM PADDLE_MOBILE_CC ${CMAKE_CURRENT_SOURCE_DIR}/src/jni/paddle_mobile_jni.cpp)
list(REMOVE_ITEM PADDLE_MOBILE_H ${CMAKE_CURRENT_SOURCE_DIR}/src/operators/math/math_func_neon.h)
endif ()
if (IS_IOS)
add_library(paddle-mobile STATIC ${PADDLE_MOBILE_CC} ${PADDLE_MOBILE_H})
elseif(ANDROID)
add_library(paddle-mobile SHARED ${PADDLE_MOBILE_CC} ${PADDLE_MOBILE_H})
else()
list(REMOVE_ITEM PADDLE_MOBILE_H ${CMAKE_CURRENT_SOURCE_DIR}/src/ios_io/PaddleMobile.h)
list(REMOVE_ITEM PADDLE_MOBILE_CC ${CMAKE_CURRENT_SOURCE_DIR}/src/ios_io/PaddleMobile.mm)
list(REMOVE_ITEM PADDLE_MOBILE_H ${CMAKE_CURRENT_SOURCE_DIR}/src/ios_io/op_symbols.h)
endif ()
set(CMAKE_VERBOSE_MAKEFILE ON)
set(CMAKE_EXPORT_COMPILE_COMMANDS ON)
set(CMAKE_ARCHIVE_OUTPUT_DIRECTORY build)
set(CMAKE_LIBRARY_OUTPUT_DIRECTORY build)
set(CMAKE_RUNTIME_OUTPUT_DIRECTORY build)
# NET default
set(NET "default" CACHE STRING "select net type")
set_property(CACHE NET PROPERTY STRINGS "default" "googlenet" "mobilenet" "yolo" "squeezenet" "FPGAnets")
include("${CMAKE_CURRENT_LIST_DIR}/tools/op.cmake")
# build library
if (ANDROID_NDK_TOOLCHAIN_INCLUDED)
list(REMOVE_DUPLICATES CMAKE_CXX_FLAGS)
add_library(paddle-mobile SHARED ${PADDLE_MOBILE_CC} ${PADDLE_MOBILE_H})
elseif(IS_IOS)
add_library(paddle-mobile STATIC ${PADDLE_MOBILE_CC} ${PADDLE_MOBILE_H})
else ()
add_library(paddle-mobile SHARED ${PADDLE_MOBILE_CC} ${PADDLE_MOBILE_H})
endif ()
# unit test
if(DEBUGING)
add_subdirectory(test)
if(IS_IOS)
else()
add_subdirectory(test)
endif()
endif()
......@@ -183,6 +183,9 @@ upstream
接下来等待 review,如果有需要修改的地方,参照上述步骤更新 origin 中的对应分支即可。
![](http://otkwwi4x8.bkt.clouddn.com/2018-06-20-15294877166787.jpg)
之后就可以提交代码了
## 删除远程分支
在 PR 被 merge 进主仓库后,我们可以在 PR 的页面删除远程仓库的分支。
......@@ -219,7 +222,7 @@ upstream
- 原因:如果仅仅修改一个文件但提交了十几个commit,每个commit只做了少量的修改,这会给评审人带来很大困扰。评审人需要逐一查看每个commit才能知道做了哪些修改,且不排除commit之间的修改存在相互覆盖的情况。
- 建议:每次提交时,保持尽量少的commit,可以通过`git commit --amend`补充上次的commit。对已经Push到远程仓库的多个commit,可以参考[squash commits after push](http://stackoverflow.com/questions/5667884/how-to-squash-commits-in-git-after-they-have-been-pushed)
- 请注意每个commit的名称:应能反映当前commit的内容,不能太随意。
3. 如果解决了某个Issue的问题,请在该Pull Request的**第一个**评论框中加上:`fix #issue_number`,这样当该PUll Request被合并后,会自动关闭对应的Issue。关键词包括:close, closes, closed, fix, fixes, fixed, resolve, resolves, resolved,请选择合适的词汇。详细可参考[Closing issues via commit messages](https://help.github.com/articles/closing-issues-via-commit-messages)
3. 如果解决了某个Issue的问题,请在该Pull Request的**第一个**评论框中加上:`fix #issue_number`,这样当该Pull Request被合并后,会自动关闭对应的Issue。关键词包括:close, closes, closed, fix, fixes, fixed, resolve, resolves, resolved,请选择合适的词汇。详细可参考[Closing issues via commit messages](https://help.github.com/articles/closing-issues-via-commit-messages)
此外,在回复评审人意见时,请您遵守以下约定:
......
FROM ubuntu:16.04
RUN echo '\
deb <mirror> <version> main restricted universe multiverse\n\
deb <mirror> <version>-updates main restricted universe multiverse\n\
deb <mirror> <version>-backports main restricted universe multiverse\n\
deb <mirror> <version>-security main restricted universe multiverse\n'\
> /etc/apt/sources.list
RUN sed -ie 's|<mirror>|http://mirrors.tuna.tsinghua.edu.cn/ubuntu/|' /etc/apt/sources.list
RUN sed -ie 's|<version>|xenial|' /etc/apt/sources.list
RUN apt-get update && apt-get upgrade -y
RUN apt-get install -y --no-install-recommends \
curl \
unzip \
git \
make \
cmake-curses-gui \
python \
python-pip \
python-setuptools \
clang-format-5.0 \
graphviz \
g++-arm-linux-gnueabi \
gcc-arm-linux-gnueabi
RUN apt-get autoremove -y && apt-get clean
RUN ln -s clang-format-5.0 /usr/bin/clang-format
RUN pip install -i https://pypi.tuna.tsinghua.edu.cn/simple --upgrade pip
RUN pip install -i https://pypi.tuna.tsinghua.edu.cn/simple wheel
RUN pip install -i https://pypi.tuna.tsinghua.edu.cn/simple pre-commit
RUN cd /tmp && curl -O http://mirrors.neusoft.edu.cn/android/repository/android-ndk-r17b-linux-x86_64.zip
RUN curl -O https://mms-res.cdn.bcebos.com/cmake-3.10.3-Linux-x86_64.tar.gz && \
tar xzf cmake-3.10.3-Linux-x86_64.tar.gz && \
mv cmake-3.10.3-Linux-x86_64 /opt/cmake-3.10 && \
mv /usr/bin/cmake /usr/bin/cmake.bak && ln -s /opt/cmake-3.10/bin/cmake /usr/bin/cmake && \
mv /usr/bin/ccmake /usr/bin/ccmake.bak && ln -s /opt/cmake-3.10/bin/ccmake /usr/bin/ccmake
RUN cd /opt && unzip /tmp/android-ndk-r17b-linux-x86_64.zip
ENV NDK_ROOT /opt/android-ndk-r17b
# Paddle-Mobile
# Paddle-Mobile
[![Build Status](https://travis-ci.org/PaddlePaddle/paddle-mobile.svg?branch=develop&longCache=true&style=flat-square)](https://travis-ci.org/PaddlePaddle/paddle-mobile)
[![License](https://img.shields.io/badge/license-Apache%202-brightgreen.svg)](LICENSE)
[![Documentation Status](https://img.shields.io/badge/中文文档-最新-brightgreen.svg)](https://github.com/PaddlePaddle/paddle-mobile/tree/develop/doc)
[![License](https://img.shields.io/badge/license-Apache%202-blue.svg)](LICENSE)
<!--[![Release](https://img.shields.io/github/release/PaddlePaddle/Paddle-Mobile.svg)](https://github.com/PaddlePaddle/Paddle-Mobile/releases)
[![License](https://img.shields.io/badge/license-Apache%202-blue.svg)](LICENSE)-->
欢迎来到 Paddle-Mobile GitHub 项目。
Paddle-Moible是PaddlePaddle组织下的项目,是一个致力于嵌入式平台的深度学习的框架。Paddle-Moible设计思想和PaddlePaddle的最新版fluid版本保持了高度一致,同时针对嵌入式做了大量优化。设计之初就对嵌入式的性能、体积、能耗、硬件平台覆盖等方面做了考虑。
## 简单搜索线上效果
如下gif是简单搜索app的线上主体检测应用效果
![ezgif-1-050a733dfb](http://otkwwi4x8.bkt.clouddn.com/2018-07-05-ezgif-1-050a733dfb.gif)
## Demo目录
[点我](https://github.com/PaddlePaddle/paddle-mobile/tree/develop/demo)
## Features
- **ARM CPU**
|mobilenet arm v7|1线程|2线程|4线程|
|------------|----|-----|-----|
|麒麟960(ms)|110.586|70.897|47.474|
|||||
|mobilenetssd arm v7|1线程|2线程|4线程|
|麒麟960(ms)|222.124|138.952|90.856|
|||||
|googlenet(v1) arm v7|1线程|2线程|4线程|
|麒麟960(ms)|348.018|240.304|169.998|
|||||
|squeezenet arm v7|1线程|2线程|4线程|
|麒麟960(ms)|84.685|56.544|38.833|
|||||
|yolo arm v7|1线程|2线程|4线程|
|麒麟960(ms)|131.831|88.990|60.905|
arm cpu是paddle-mobile的主要支持方向,cpu的通用性一直是其优势。嵌入式深度学习,需要大量的cpu汇编实现。我们正在紧锣密鼓的编码,为的是能充分硬件的每一点加速能力。
arm cpu的优化工作还在进行中,现在使用了常规的cpu优化。在arm a73上paddle-mobile arm-v7现在单核运行一次mobilenet1.0是110+ms,显然这不是我们的最终目标,我们正在用大量的汇编改写,后续性能仍会有巨大提升空间, 目前只支持armv7, 未来我们也会支持armv8。
- **Mali GPU**
Mali GPU是百度和ARM合作开发的,双方团队近期都在致力于将paddle的op能无缝运行在ACL(arm compute library)。目前已经支持squeezenet,googlenet,resnet等几个网络模型,后续会继续加大力度。使全部移动端paddle op能高效运行在mali gpu上。
- **苹果设备的GPU Metal实现**
基于Metal实现的苹果设备的GPU预测库,也已经在实现中,近期也会有相应可运行版本。
- **FPGA**
FPGA实现正在进行中,是基于Xilinx的ZU5目标开发板。
- **灵活性**
* paddle-mobile cpu版不依赖任何第三库, 可进行快速集成。
* 使用泛型特化进行平台切换, 可灵活切换 cpu、gpu 和其他协处理器。
* 可根据特定的常见网络, 进行编译特定的 op, 降低编译时间, 减小包大小。
* 使用 docker 编译, 提供统一的编译环境。
* 高可拓展性, 方便拓展其他协处理器, 提供高性能 arm 算子实现, 方便其他协处理器开发者集成开发。
* 直接兼容 paddle-fluid 模型, 不需要额外的转换操作。
- **体积**
paddle-mobile从设计之初就深入考虑到移动端的包体积的问题,cpu实现中没有外部依赖。在编译过程中,如果该网络不需要的op是完全不会被打入的。同时编译选项优化也为体积压缩提供了帮助。
除了二进制体积,我们对代码体积极力避免过大。整个仓库的代码体积也非常小。
## 文档
### 设计文档
关于paddle-mobile设计文档在下面链接中,如果想了解更多内容。[issue](https://github.com/PaddlePaddle/paddle-mobile/issues)中会有很多早期的设计和讨论过程。
[设计文档链接](https://github.com/PaddlePaddle/paddle-mobile/blob/develop/doc/design_doc.md)
### 开发文档
开发文档主要是关于编译、运行等问题。做为开发者,它可以和贡献文档共同结合使用。
[开发文档链接](https://github.com/PaddlePaddle/paddle-mobile/blob/develop/doc/development_doc.md)
### 贡献文档
- [贡献文档链接](https://github.com/PaddlePaddle/paddle-mobile/blob/develop/CONTRIBUTING.md)
- 上面文档中涵盖了主要的贡献代码流程,如果在实践中您还遇到了其他问题,可以发[issue](https://github.com/PaddlePaddle/paddle-mobile/issues)。我们看到后会尽快处理。
This project is used to develop the next version deep learning freamwork for mobile device.
# Development
## 模型获得
目前Paddle-Mobile仅支持Paddle fluid训练的模型。如果你手中的模型是不同种类的模型,需要进行模型转换才可以运行。
### 1. 直接使用Paddle Fluid训练
该方式最为可靠,推荐方式
### 2. caffe转为Paddle Fluid模型
[链接](https://github.com/PaddlePaddle/models/tree/develop/fluid/image_classification/caffe2fluid)
### 3. ONNX
ONNX全称为“Open Neural Network Exchange”,即“开放的神经网络切换”。该项目的目的是让不同的神经网络开发框架做到互通互用。
[Used model in development](https://mms-mis.cdn.bcebos.com/paddle-mobile/models.zip)
除直接使用PaddlePaddle训练fluid版本的模型外,还可以通过onnx转换得到个别Paddle fluid模型。
## cross-compilation to android
目前,百度也在做onnx支持工作。相关转换项目在这里:[paddle-onnx](https://github.com/PaddlePaddle/paddle-onnx)
* NDK is required
* ANDROID_NDK environment variable is required
![](http://7xop3k.com1.z0.glb.clouddn.com/15311951836000.jpg)
```bash
sh build.sh android
```
### 4. 部分测试模型和测试图片下载
[下载链接](http://mms-graph.bj.bcebos.com/paddle-mobile%2FmodelsAndImages.zip)
## build for x86
paddle-mobile is to run on arm platform. x86 only used to test not arm assembly code. So do not recommend compiling x86.
## 问题解决
Now only support osx.
欢迎提出或解决我们的问题,有疑问可以发issue. [Github Issues](https://github.com/PaddlePaddle/paddle-mobile/issues).
```
sh build.sh mac
```
## Copyright and License
Paddle-Mobile 提供相对宽松的Apache-2.0开源协议 [Apache-2.0 license](LICENSE).
## Old Version of Mobile-Deep-Learning
The old version of MDL was I moved to here [Mobile-Deep-Learning](https://github.com/allonli/mobile-deep-learning)
## 旧版 Mobile-Deep-Learning
原MDL(Mobile-Deep-Learning)工程被迁移到了这里 [Mobile-Deep-Learning](https://github.com/allonli/mobile-deep-learning)
## 如何运行demo
- Android demo下载路径
http://mms-graph.bj.bcebos.com/paddle-mobile%2FPaddleMobile_Android.zip
- iOS demo下载路径:
http://mms-graph.bj.bcebos.com/paddle-mobile%2FPaddleMobileDemo_iOS.zip
在demo目录下执行下载demo的脚本
```
sh getDemo.sh
```
demo工程就下载解压到当前目录中了。
\ No newline at end of file
#!/usr/bin/env bash
wget http://mms-graph.bj.bcebos.com/paddle-mobile%2FPaddleMobile_Android.zip
wget http://mms-graph.bj.bcebos.com/paddle-mobile%2FPaddleMobileDemo_iOS.zip
unzip paddle-mobile%2FPaddleMobile_Android.zip
unzip paddle-mobile%2FPaddleMobileDemo_iOS.zip
rm -rf paddle-mobile%2FPaddleMobile_Android.zip
rm -rf paddle-mobile%2FPaddleMobileDemo_iOS.zip
rm -rf __MACOSX
\ No newline at end of file
# 环境搭建
## 使用 docker
### 1. 安装 docker
安装 docker 的方式,参考官方文档 [https://docs.docker.com/install/](https://docs.docker.com/install/)
### 2. 使用 docker 搭建构建环境
首先进入 paddle-mobile 的目录下,执行 `docker build`
以 Linux/Mac 为例 (windows 建议在 'Docker Quickstart Terminal' 中执行)
```
$ docker build -t paddle-mobile:dev - < Dockerfile
```
使用 `docker images` 可以看到我们新建的 image
```
$ docker images
REPOSITORY TAG IMAGE ID CREATED SIZE
paddle-mobile dev 33b146787711 45 hours ago 372MB
```
### 3. 使用 docker 构建
进入 paddle-mobile 目录,执行 docker run
```
$ docker run -it --mount type=bind,source=$PWD,target=/paddle-mobile paddle-mobile:dev
root@5affd29d4fc5:/ # cd /paddle-mobile
###
### paddle-mobile 支持 arm 架构下的各种平台,包括 android 以及 linux 等,可以使用不同的
### toolchain 文件生成满足需要的 makefile
###
# 生成构建 android 产出的 Makefile
root@5affd29d4fc5:/ # rm CMakeCache.txt
root@5affd29d4fc5:/ # cmake -DCMAKE_TOOLCHAIN_FILE=tools/toolchains/arm-android-neon.cmake
# 生成构建 linux 产出的 Makefile
root@5affd29d4fc5:/ # rm CMakeCache.txt
root@5affd29d4fc5:/ # cmake -DCMAKE_TOOLCHAIN_FILE=tools/toolchains/arm-linux-gnueabi.cmake
```
### 4. 设置编译选项
可以通过 ccmake 设置编译选项
```
root@5affd29d4fc5:/ # ccmake .
Page 1 of 1
CMAKE_ASM_FLAGS
CMAKE_ASM_FLAGS_DEBUG
CMAKE_ASM_FLAGS_RELEASE
CMAKE_BUILD_TYPE
CMAKE_INSTALL_PREFIX /usr/local
CMAKE_TOOLCHAIN_FILE /paddle-mobile/tools/toolchains/arm-android-neon.cmake
CPU ON
DEBUGING ON
FPGA OFF
LOG_PROFILE ON
MALI_GPU OFF
NET googlenet
USE_EXCEPTION ON
USE_OPENMP OFF
```
修改选项后,按 `c`, `g` 更新 Makefile
### 5. 构建
使用 make 命令进行构建
```
root@5affd29d4fc5:/ # make
```
### 6. 查看构建产出
构架产出可以在 host 机器上查看,在 paddle-mobile 的目录下,build 以及 test/build 下,可以使用 adb 指令或者 scp 传输到 device 上执行
## 不使用 docker
不使用 docker 的方法,可以直接用 cmake 生成 makefile 后构建。使用 ndk 构建 android 应用需要正确设置 NDK_ROOT。构建 linux 应用需要安装 arm-linux-gnueabi-gcc 或者类似的交叉编译工具,可能需要设置 CC,CXX 环境变量,或者在 tools/toolchains/ 中修改 arm-linux-gnueabi.cmake,或者增加自己需要的 toolchain file。
# paddle-mobile 设计文档
#### 以下是 paddle-mobile 代码的执行流程图:
![执行流程图](http://otkwwi4x8.bkt.clouddn.com/2018-07-02-15305189473720.png)
#### 主要分为: Loader 模块、 Program 模块、 Executor 模块、 op 模块、 kernel 模块、scope variable Tensor 模块
#### 下面展开说一下各个模块的作用以及设计思路
### 一. Loader
先来看一下模型, 模型分为两种结构:
一种为参数文件是散开的, 如下图, 红框为模型结构的 protobuf 文件, 其余为参数文件
![模型描述](http://otkwwi4x8.bkt.clouddn.com/2018-07-02-15305190629577.png)
另一种为参数文件结合在一起的, 如下图, 红框内为模型结构描述的 protobuf 文件, 另一个文件为结合在一起的参数文件
![模型描述combined](http://otkwwi4x8.bkt.clouddn.com/2018-07-02-15305191057130.png)
loader 模块的作用是将模型结构信息 load 进内存, 将红框内的 protobuf 文件 load 进内存, 并对模型结构进行优化(如将几个细粒度的 op 融合成 粗粒度的 op, 如将 conv、 add、 batchnorm、 relu 融合为 conv\_add\_batchnorm\_relu).
方便进行算法优化.
__那么为什么融合在一起能够做算法优化 ?__
如果未融合的 conv add batchnorm relu 运算是这样的
```
[n]
[conv_res] = conv([n])
for &res in conv_res {
res = add_biase(res)
}
for &res in conv_res {
res = batchnorm(res)
}
for &res in conv_res {
res = relu(res)
}
```
融合后的 conv\_add\_batchnorm\_relu 运算是这样的:
```
[n]
[conv_res] = conv([n])
for &res in conv_res {
res = relu(batchnorm(add_biase(res)))
}
```
由于 conv 可以转换为两个大矩阵相乘, 更进一步可以分为若干个一行一列的小矩阵相乘, 那最终的运算是这样的:
```
[n]
for &res in [res] {
res = relu(batchnorm(add_biase(A * B)))
}
其中 A 和 B 为 1 * k 和 k * 1 矩阵
```
### 二. Program
program 为 loader 模块的结果, 包含了优化前的模型结构对象, 以及优化后的模型结构对象, 此模块基本对应着 paddle 模型的结构, 关于paddle 模型的一些概念的定义, 详细设计可以参考 [program.md](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/fluid/design/concepts/program.md), 以下是一个简单的概况:
* programDesc 中包含着若干个(googlenet mobilenet yolo squeezenet resnet 常见的模型只有一个)可以嵌套的 block, blocks中的第一个block中的某个 op 可能会执行 blocks 中后边 block 中的一系列 op 运算(只有多个block才会有此概念)
* block 包含着 ops 和 vars
* ops 为一系列 op 的描述, 描述着每个 op 的类型, 输入输出, 所需参数
* vars 里包含的为所有 op 运算所需的参数描述
### 三. Executor
executor 主要是用于 op 运算的上层调度操作, 主要有两个操作, executor 实例化 和 暴露给上层的 predict 方法
* executor 实例化过程中, 主要进行了这几个操作
1. 根据 loader 产出的 program 初始化 operator 对象
2. 分配所有需要用到的内存, 包括每个op 的输入输出, 权重参数, 目前模型的权重参数文件的内存格式为 NCHW, op 的输入输出中间矩阵参数也是 NCHW 格式
3. 调用每个 op 的 init 方法, init 方法是每个 op 实现者进行参数预处理的地方, 有助于减少 predict 的耗时
* predict, 主要用于拿到外部的输入, 顺序调用 op 的 run 方法进行运算, 并返回最终的结果.
### 四. op
关于 op 模块代码的详细设计可以参考 [operator部分代码设计](https://github.com/PaddlePaddle/paddle-mobile/issues/300), operator主要包含一个kernel用于运算、一个 param 用于存储属性, operator 主要有三个操作, Init、RunImp、InferShape
* Init: Init 函数主要用于参数预处理, 如对 batchNorm 参数进行预处理, 可以将 batchNorm 运算转化为 a * x + b 形式的运算, 这个函数也会调用, kernel 的 Init 函数对 kernel 进行初始化
* RunImp: RunImp 函数会调用自己的kernel 的 compute 方法进行运算
* InferShape: InferShape 函数会根据输入和参数得出输出的形状, 这个函数会在 executor 实例化时, 内存初始化前调用
每个 operator 都需要进行注册才可以被使用, 以 conv 为例, 需在 conv_op.cpp 底部这样写:
```c++
// 三个平台都注册了 conv op
namespace ops = paddle_mobile::operators;
#ifdef PADDLE_MOBILE_CPU
USE_OP_CPU(conv2d);
REGISTER_OPERATOR_CPU(conv2d, ops::ConvOp);
#endif
#ifdef PADDLE_MOBILE_MALI_GPU
USE_OP_MALI_GPU(conv2d);
REGISTER_OPERATOR_MALI_GPU(conv2d, ops::ConvOp);
#endif
#ifdef PADDLE_MOBILE_FPGA
USE_OP_FPGA(conv2d);
REGISTER_OPERATOR_FPGA(conv2d, ops::ConvOp);
#endif
```
__一个关于包大小的优化__:
每个 operator 都由一个宏控制编译, 如 conv_op.h(除了 conv_op.h , conv_op.cpp、conv_kernle.h、conv_kernle.cpp 也都需要加此宏控制)
```c++
#ifdef CONV_OP //这个宏控制着 conv_op 是否被编译, 除了 conv_op.h , conv_op.cpp、conv_kernle.h conv_kernle.cpp 也都需要加此宏控制
#pragma once
#include <string>
#include "framework/operator.h"
#include "operators/kernel/conv_kernel.h"
namespace paddle_mobile {
namespace operators {
using std::string;
template <typename DeviceType, typename T>
class ConvOp
//impl
};
} // namespace operators
} // namespace paddle_mobile
#endif
```
这样做的目的是为了根据不同类型的网络编译特定的op, 在 cmake 中已经配置好不同网络编译的宏, 如果你要进行编译支持 yolo 的模型, 仅需执行:
```sh
cd toools
sh build.sh android yolo
```
这样只会编译 yolo 所包含的四种 op, 极大的减小了包体积和编译时间
### 五. kernel
kernel 为 op 的底层运算实现, 主要有两个函数, Init 和 Compute, 分别用来初始化、预处理 和 运算操作, 值得提出的是, kernel 会根据泛型特化到不同的平台, 如图所示:
![设备特化]![](http://otkwwi4x8.bkt.clouddn.com/2018-07-02-15305191401976.png)
不同平台的 kernel 实现, 为同一个 kernel 类不同泛型的特化实现, 目前有三个平台, arm、mali、fpga, 图中的 central-arm-func\ 目录为 op kernel 的 arm 实现, 它承担了 arm\ 目录下 kernel 的底层实现, 同时 arm 处理器作为中央处理器, central-arm-func\ 也可以作为其他协处理器的底层实现, 如: fpga 的某一个 op kernel 还没有 fpga 协处理器的实现, 就可以直接调用使用这里的 arm 实现.
__如果你有兴趣新增一个协处理器实现, 就可以在次添加一个 kernel 目录, 提供协处理器实现, 如果某个 kernel 你没有实现完, 你也可以直接使用 arm 实现__
### 六. scope variable Tensor
* scope 用来存储管理所需用到的所有 variable(用来存储不同类型的对象, 主要是矩阵Tensor, 也就是说 scpoe 管理着 op 运算过程中所有参数矩阵, 输入输出矩阵), 可以将 scope 理解为一个 map, 这里在 map 上封了一层 scope 的概念是为了方便内存管理
* variable 可以用来存储不同类型的对象, paddle-mobile 里主要用它来存储矩阵 Tensor
* tensor 代表着矩阵, 通过泛型可以用来存储不同类型的矩阵, 但需要注意的是, 存入和取出时的类型必须保持一致, 如果类型不一致, 使用 inline const T \*data() const 获取指针会不能通过类型检查, 通过 inline T \*mutable_data() 获取指针会重新分配内存, 以下是关于 Tensor 的一些小概念:
1. DDim: 用来存储矩阵的维度信息.
2. Slice(): 这个函数用来获取 N 维 (NCHW中的 N) 上切片
3. 当实例化未分配内存时, 调用 inline T *mutable_data() 会分配内存
### iOS&Android开发文档
# iOS开发文档
## 编译
```sh
# 在 paddle-mobile 目录下:
cd tools
sh build.sh ios
# 如果只想编译某个特定模型的 op, 则需执行以下命令
sh build.sh ios googlenet
# 在这个文件夹下, 你可以拿到生成的 .a 库
cd ../build/release/ios/build
```
#### 常见问题:
1. No iOS SDK's found in default search path ...
这个问题是因为 tools/ios-cmake/ios.toolchain.cmake 找不到你最近使用的 iOS SDK 路径, 所以需要自己进行指定,
以我当前的环境为例: 在 tools/ios-cmake/ios.toolchain.cmake 143行前添加我本地的 iOS SDK 路径: set(CMAKE_IOS_SDK_ROOT "/Applications/Xcode.app/Contents/Developer/Platforms/iPhoneOS.platform/Developer/SDKs/iPhoneOS.sdk")
## 集成
```
将上一步生成的:
libpaddle-mobile.a
/src/ios_io/ 下的
PaddleMobile.h
```
拖入工程
#### oc 接口
接口如下:
```
/*
创建对象
*/
- (instancetype)init;
/*
load 模型, 开辟内存
*/
- (BOOL)load:(NSString *)modelPath andWeightsPath:(NSString *)weighsPath;
/*
进行预测, means 和 scale 为训练模型时的预处理参数, 如训练时没有做这些预处理则直接使用 predict
*/
- (NSArray *)predict:(CGImageRef)image dim:(NSArray<NSNumber *> *)dim means:(NSArray<NSNumber *> *)means scale:(float)scale;
/*
进行预测
*/
- (NSArray *)predict:(CGImageRef)image dim:(NSArray<NSNumber *> *)dim;
/*
清理内存
*/
- (void)clear;
```
# Android开发文档
用户可通过如下两种方式,交叉编译Android平台上适用的paddle-mobile库:
- 基于Docker容器编译
- 基于Linux交叉编译
## 基于Docker容器编译
### 1. 安装 docker
安装 docker 的方式,参考官方文档 [https://docs.docker.com/install/](https://docs.docker.com/install/)
### 2. 使用 docker 搭建构建环境
首先进入 paddle-mobile 的目录下,执行 `docker build`
以 Linux/Mac 为例 (windows 建议在 'Docker Quickstart Terminal' 中执行)
```
$ docker build -t paddle-mobile:dev - < Dockerfile
```
使用 `docker images` 可以看到我们新建的 image
```
$ docker images
REPOSITORY TAG IMAGE ID CREATED SIZE
paddle-mobile dev 33b146787711 45 hours ago 372MB
```
### 3. 使用 docker 构建
进入 paddle-mobile 目录,执行 docker run
```
$ docker run -it --mount type=bind,source=$PWD,target=/paddle-mobile paddle-mobile:dev
root@5affd29d4fc5:/ # cd /paddle-mobile
# 生成构建 android 产出的 Makefile
root@5affd29d4fc5:/ # rm CMakeCache.txt
root@5affd29d4fc5:/ # cmake -DCMAKE_TOOLCHAIN_FILE=tools/toolchains/arm-android-neon.cmake
# 生成构建 linux 产出的 Makefile
root@5affd29d4fc5:/ # rm CMakeCache.txt
root@5affd29d4fc5:/ # cmake -DCMAKE_TOOLCHAIN_FILE=tools/toolchains/arm-linux-gnueabi.cmake
```
### 4. 设置编译选项
可以通过 ccmake 设置编译选项
```
root@5affd29d4fc5:/ # ccmake .
Page 1 of 1
CMAKE_ASM_FLAGS
CMAKE_ASM_FLAGS_DEBUG
CMAKE_ASM_FLAGS_RELEASE
CMAKE_BUILD_TYPE
CMAKE_INSTALL_PREFIX /usr/local
CMAKE_TOOLCHAIN_FILE /paddle-mobile/tools/toolchains/arm-android-neon.cmake
CPU ON
DEBUGING ON
FPGA OFF
LOG_PROFILE ON
MALI_GPU OFF
NET googlenet
USE_EXCEPTION ON
USE_OPENMP OFF
```
修改选项后,按 `c`, `g` 更新 Makefile
### 5. 构建
使用 make 命令进行构建
```
root@5affd29d4fc5:/ # make
```
### 6. 查看构建产出
构架产出可以在 host 机器上查看,在 paddle-mobile 的目录下,build 以及 test/build 下,可以使用 adb 指令或者 scp 传输到 device 上执行
## 基于Linux交叉编译
### 交叉编译环境准备
##### 下载Android NDK
从源码交叉编译paddle-mobile,用户需要提前准备好交叉编译环境。Android平台使用的C/C++交叉编译工具链是[Android NDK](https://developer.android.com/ndk/),用户可以自行前往下载,也可以通过以下命令获取:
- Mac平台
```
wget https://dl.google.com/android/repository/android-ndk-r17b-darwin-x86_64.zip
unzip android-ndk-r17b-darwin-x86_64.zip
```
- Linux平台
```
wget https://dl.google.com/android/repository/android-ndk-r17b-linux-x86_64.zip
unzip android-ndk-r17b-linux-x86_64.zip
```
##### 设置环境变量
工程中自带的独立工具链会根据环境变量NDK_ROOT查找NDK,因此需要配置环境变量:
```
export NDK_ROOT = "path to ndk"
```
### 执行编译
在paddle-mobile根目录中,执行以下命令:
```
cd tools
sh build.sh android
```
执行完毕后,生成的so位于build目录中,单测可执行文件位于test/build目录中。
##### Tips:
如果想要获得体积更小的库,可选择编译支持指定模型结构的库。
如执行如下命令:
```
sh build.sh android googlenet
```
会得到一个支持googlnet的体积更小的库。
##测试
在编译完成后,我们提供了自动化的测试脚本,帮助用户将运行单测文件所需要的模型及库文件push到Android设备中,执行以下命令:
```
cd tools/android-debug-script
sh run_on_android.sh (npm) 可选参数npm,用于选择是否传输模型文件到手机上
```
出现如下提示:
```
**** choose OP or NET to test ****
which to test :
```
输入名称即可运行对应的测试文件。
##部署
Android应用可通过JNI接口调用底层C/C++,paddle-mobile对外提供的JNI接口如下:
##### 1 load接口 加载模型参数
- 用于加载参数文件分散的模型
```
/**
* Load seperated parameters
* @param modelDir
* @return
*/
public static native boolean load(String modelDir);
```
- 用于加载参数文件合并的模型文件
```
/**
* Load combined parameters
* @param modelPath
* @param paramPath
* @return
*/
public static native boolean loadCombined(String modelPath,String paramPath);
```
##### 2 predict接口 执行预测
- 接受预处理过的RGB数组的predict接口
```
/**
*@param buf 输入数据
*@return 输出数据
JNIEXPORT jfloatArray JNICALL Java_com_baidu_paddle_PML_predictImage(
JNIEnv *env, jclass thiz, jfloatArray buf);
```
- 接受原始yuv数据的predict接口
```
/**
*
* @param buf yuv420格式的字节数组
* @param imgWidth yuv数据的宽
* @param imgHeight yuv数据的高
* @param ddims 输入数据的形状
* @param meanValues 模型训练时各通道的均值
* @return
*/
public static native float[] predictYuv(byte[] buf, int imgWidth, int imgHeight, int[] ddims, float[]meanValues);
```
##### 3 clear接口 销毁实例、清理内存操作
```
JNIEXPORT void JNICALL Java_com_baidu_paddle_PMLL_clear(JNIEnv *env,
jclass thiz);
```
# Quantification 模型量化、反量化
## 背景故事
部分网络如AlexNet训练出的模型体积较大,不适宜在移动设备上使用。
## 解决模型过大办法
1. 选用适合移动端的模型结构如:mobilenet、googlenet、 yolo、squeezenet 等;
2. 使用我们提供的量化工具,可以在几乎不影响精度的情况下将float32模型减小至原模型的 1/4;
- - - - -
## 量化工具介绍
### 模型转化工具目录:
- [量化工具目录](https://github.com/PaddlePaddle/paddle-mobile/tree/develop/tools/quantification)
- [模型转化工具](https://github.com/PaddlePaddle/paddle-mobile/blob/develop/tools/quantification/convert.cpp)
#### 使用说明
- [工具使用](https://github.com/PaddlePaddle/paddle-mobile/blob/develop/tools/quantification/README.md)
## 如何读取量化后的模型
load方法中添加了 quantification 参数,默认为false。 如果需要load量化后的模型,按需传参即可。
[我是源代码](https://github.com/PaddlePaddle/paddle-mobile/blob/55302b33ea3bd68c9797d8f65e527544792b8095/src/io/paddle_mobile.h)
```c++
bool Load(const std::string &dirname, bool optimize = false,
bool quantification = false, int batch_size = 1);
```
- - - - -
......@@ -42,4 +42,3 @@ class AppDelegate: UIResponder, UIApplicationDelegate {
}
......@@ -17,4 +17,3 @@ class ViewController: UIViewController {
}
}
......@@ -12,6 +12,8 @@
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#import <UIKit/UIKit.h>
//! Project version number for paddle_mobile.
......@@ -20,6 +22,4 @@ FOUNDATION_EXPORT double paddle_mobileVersionNumber;
//! Project version string for paddle_mobile.
FOUNDATION_EXPORT const unsigned char paddle_mobileVersionString[];
// In this header, you should import all the public headers of your framework using statements like #import <paddle_mobile/PublicHeader.h>
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <chrono>
using Time = decltype(std::chrono::high_resolution_clock::now());
inline Time time() { return std::chrono::high_resolution_clock::now(); }
inline double time_diff(Time t1, Time t2) {
typedef std::chrono::microseconds ms;
auto diff = t2 - t1;
ms counter = std::chrono::duration_cast<ms>(diff);
return counter.count() / 1000.0;
}
......@@ -61,7 +61,14 @@ struct PaddleMobileException : public std::exception {
}
#else
#define PADDLE_MOBILE_THROW_EXCEPTION(...)
#define PADDLE_MOBILE_ENFORCE(stat, ...)
#define PADDLE_MOBILE_ENFORCE(stat, ...) \
{ \
if (stat) { \
} else { \
} \
}
#endif
} // namespace paddle_mobile
......@@ -16,6 +16,7 @@ limitations under the License. */
#include <vector>
#ifdef PADDLE_MOBILE_DEBUG
#include <cstring>
#include <iostream>
#include <sstream>
#include <string>
......@@ -115,26 +116,29 @@ struct ToLog {
Print printer_;
};
#define LOG(level) \
if (level > paddle_mobile::log_level) { \
} else \
paddle_mobile::ToLog( \
level, \
(std::stringstream() \
<< "[file: " \
<< (strrchr(__FILE__, '/') ? (strrchr(__FILE__, '/') + 1) : __FILE__) \
<< "] [line: " << __LINE__ << "] ") \
.str())
#define DLOG \
if (paddle_mobile::kLOG_DEBUG > paddle_mobile::log_level) { \
} else \
paddle_mobile::ToLog( \
paddle_mobile::kLOG_DEBUG, \
(std::stringstream() \
<< "[file: " \
<< (strrchr(__FILE__, '/') ? (strrchr(__FILE__, '/') + 1) : __FILE__) \
<< "] [line: " << __LINE__ << "] ") \
#define LOG(level) \
if (level > paddle_mobile::log_level) { \
} else \
paddle_mobile::ToLog( \
level, static_cast<const std::stringstream &>( \
std::stringstream() \
<< "[file: " \
<< (strrchr(__FILE__, '/') ? (strrchr(__FILE__, '/') + 1) \
: __FILE__) \
<< "] [line: " << __LINE__ << "] ") \
.str())
#define DLOG \
if (paddle_mobile::kLOG_DEBUG > paddle_mobile::log_level) { \
} else \
paddle_mobile::ToLog( \
paddle_mobile::kLOG_DEBUG, \
static_cast<const std::stringstream &>( \
std::stringstream() \
<< "[file: " \
<< (strrchr(__FILE__, '/') ? (strrchr(__FILE__, '/') + 1) \
: __FILE__) \
<< "] [line: " << __LINE__ << "] ") \
.str())
#define LOGF(level, format, ...) \
......@@ -170,7 +174,10 @@ struct ToLog;
struct Print {
friend struct ToLog;
template <typename T>
Print &operator<<(T const &value) {}
Print &operator<<(T const &value) {
Print p = Print();
return p;
}
private:
};
......
......@@ -12,8 +12,9 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once;
#pragma once
#include <functional>
#include <map>
#include <string>
#include <vector>
......
......@@ -17,34 +17,47 @@ limitations under the License. */
namespace paddle_mobile {
const std::string G_OP_TYPE_CONV = "conv2d";
const std::string G_OP_TYPE_BATCHNORM = "batch_norm";
const std::string G_OP_TYPE_BOX_CODER = "box_coder";
const std::string G_OP_TYPE_CONCAT = "concat";
const std::string G_OP_TYPE_ELEMENTWISE_ADD = "elementwise_add";
const std::string G_OP_TYPE_FUSION_CONV_ADD_RELU = "fusion_conv_add_relu";
const std::string G_OP_TYPE_FC = "fc";
const std::string G_OP_TYPE_CONV_ADD = "conv_add";
const std::string G_OP_TYPE_LRN = "lrn";
const std::string G_OP_TYPE_MUL = "mul";
const std::string G_OP_TYPE_MULTICLASS_NMS = "multiclass_nms";
const std::string G_OP_TYPE_POOL2D = "pool2d";
const std::string G_OP_TYPE_PRIOR_BOX = "prior_box";
const std::string G_OP_TYPE_RELU = "relu";
const std::string G_OP_TYPE_RESHAPE = "reshape";
const std::string G_OP_TYPE_SIGMOID = "sigmoid";
const std::string G_OP_TYPE_SOFTMAX = "softmax";
const std::string G_OP_TYPE_TRANSPOSE = "transpose";
const std::string G_OP_TYPE_SPLIT = "split";
const std::string G_OP_TYPE_FEED = "feed";
const std::string G_OP_TYPE_FETCH = "fetch";
const std::string G_OP_TYPE_DEPTHWISE_CONV = "depthwise_conv2d";
const char *G_OP_TYPE_CONV = "conv2d";
const char *G_OP_TYPE_BATCHNORM = "batch_norm";
const char *G_OP_TYPE_BOX_CODER = "box_coder";
const char *G_OP_TYPE_CONCAT = "concat";
const char *G_OP_TYPE_ELEMENTWISE_ADD = "elementwise_add";
const char *G_OP_TYPE_FUSION_CONV_ADD_RELU = "fusion_conv_add_relu";
const char *G_OP_TYPE_FUSION_CONV_ADD_BN_RELU = "fusion_conv_add_bn_relu";
const char *G_OP_TYPE_FUSION_DWCONV_BN_RELU = "fusion_dwconv_bn_relu";
const char *G_OP_TYPE_FUSION_CONV_BN_RELU = "fusion_conv_bn_relu";
const char *G_OP_TYPE_FC = "fusion_fc";
const char *G_OP_TYPE_FUSION_CONV_ADD = "fusion_conv_add";
const char *G_OP_TYPE_LRN = "lrn";
const char *G_OP_TYPE_MUL = "mul";
const char *G_OP_TYPE_MULTICLASS_NMS = "multiclass_nms";
const char *G_OP_TYPE_POOL2D = "pool2d";
const char *G_OP_TYPE_PRIOR_BOX = "prior_box";
const char *G_OP_TYPE_RELU = "relu";
const char *G_OP_TYPE_RESHAPE = "reshape";
const char *G_OP_TYPE_SIGMOID = "sigmoid";
const char *G_OP_TYPE_SOFTMAX = "softmax";
const char *G_OP_TYPE_TRANSPOSE = "transpose";
const char *G_OP_TYPE_SPLIT = "split";
const char *G_OP_TYPE_FEED = "feed";
const char *G_OP_TYPE_FETCH = "fetch";
const char *G_OP_TYPE_DEPTHWISE_CONV = "depthwise_conv2d";
const char *G_OP_TYPE_IM2SEQUENCE = "im2sequence";
const char *G_OP_TYPE_DROPOUT = "dropout";
const char *G_OP_TYPE_FUSION_CONV_ADD_BN = "fusion_conv_add_bn";
const char *G_OP_TYPE_FUSION_POOL_BN = "fusion_pool_bn";
const char *G_OP_TYPE_FUSION_ELEMENTWISE_ADD_RELU =
"fusion_elementwise_add_relu";
const char *G_OP_TYPE_FUSION_FC_RELU = "fusion_fc_relu";
const char *G_OP_TYPE_REGION = "region";
std::unordered_map<
std::string, std::pair<std::vector<std::string>, std::vector<std::string>>>
op_input_output_key = {
{G_OP_TYPE_CONV, {{"Input"}, {"Output"}}},
{G_OP_TYPE_CONV_ADD, {{"Input"}, {"Out"}}},
{G_OP_TYPE_FUSION_DWCONV_BN_RELU, {{"Input"}, {"Out"}}},
{G_OP_TYPE_FUSION_CONV_BN_RELU, {{"Input"}, {"Out"}}},
{G_OP_TYPE_FUSION_CONV_ADD, {{"Input"}, {"Out"}}},
{G_OP_TYPE_RELU, {{"X"}, {"Out"}}},
{G_OP_TYPE_SOFTMAX, {{"X"}, {"Out"}}},
{G_OP_TYPE_MUL, {{"X"}, {"Out"}}},
......@@ -59,11 +72,19 @@ std::unordered_map<
{G_OP_TYPE_TRANSPOSE, {{"X"}, {"Out"}}},
{G_OP_TYPE_BOX_CODER,
{{"PriorBox", "PriorBoxVar", "TargetBox"}, {"OutputBox"}}},
{G_OP_TYPE_FUSION_CONV_ADD_BN_RELU, {{"Input"}, {"Out"}}},
{G_OP_TYPE_PRIOR_BOX, {{"Image", "Input"}, {"Boxes", "Variances"}}},
{G_OP_TYPE_MULTICLASS_NMS, {{"BBoxes", "Scores"}, {"Out"}}},
{G_OP_TYPE_FC, {{"X", "Y", "Z"}, {"Out"}}},
{G_OP_TYPE_RESHAPE, {{"X"}, {"Out"}}},
{G_OP_TYPE_DEPTHWISE_CONV, {{"Input"}, {"Output"}}},
{G_OP_TYPE_FUSION_CONV_ADD_RELU, {{"Input"}, {"Out"}}}};
{G_OP_TYPE_FUSION_CONV_ADD_RELU, {{"Input"}, {"Out"}}},
{G_OP_TYPE_IM2SEQUENCE, {{"X"}, {"Out"}}},
{G_OP_TYPE_DROPOUT, {{"X"}, {"Out"}}},
{G_OP_TYPE_FUSION_CONV_ADD_BN, {{"Input"}, {"Y"}}},
{G_OP_TYPE_FUSION_POOL_BN, {{"X"}, {"Y"}}},
{G_OP_TYPE_FUSION_ELEMENTWISE_ADD_RELU, {{"X", "Y"}, {"Out"}}},
{G_OP_TYPE_FUSION_FC_RELU, {{"X", "Y", "Z"}, {"Out"}}},
{G_OP_TYPE_REGION, {{"X"}, {"Out"}}}};
} // namespace paddle_mobile
......@@ -12,13 +12,17 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once;
#pragma once
#include <string>
#include <unordered_map>
#include <utility>
#include <vector>
namespace paddle_mobile {
enum class Precision : int { FP32 = 0 };
enum class Precision : int { FP32 = 0, FP16 = 1 };
typedef int16_t half;
template <Precision p>
struct PrecisionTrait {
......@@ -29,6 +33,10 @@ template <>
struct PrecisionTrait<Precision::FP32> {
typedef float ptype;
};
template <>
struct PrecisionTrait<Precision::FP16> {
typedef half ptype;
};
//! device type
enum DeviceTypeEnum { kINVALID = -1, kCPU = 0, kFPGA = 1, kGPU_MALI = 2 };
......@@ -71,28 +79,40 @@ enum PMStatus {
PMWrongDevice = 0x08 /*!< un-correct device. */
};
extern const std::string G_OP_TYPE_CONV;
extern const std::string G_OP_TYPE_BATCHNORM;
extern const std::string G_OP_TYPE_BOX_CODER;
extern const std::string G_OP_TYPE_CONCAT;
extern const std::string G_OP_TYPE_ELEMENTWISE_ADD;
extern const std::string G_OP_TYPE_FUSION_CONV_ADD_RELU;
extern const std::string G_OP_TYPE_FC;
extern const std::string G_OP_TYPE_CONV_ADD;
extern const std::string G_OP_TYPE_LRN;
extern const std::string G_OP_TYPE_MUL;
extern const std::string G_OP_TYPE_MULTICLASS_NMS;
extern const std::string G_OP_TYPE_POOL2D;
extern const std::string G_OP_TYPE_PRIOR_BOX;
extern const std::string G_OP_TYPE_RELU;
extern const std::string G_OP_TYPE_RESHAPE;
extern const std::string G_OP_TYPE_SIGMOID;
extern const std::string G_OP_TYPE_SOFTMAX;
extern const std::string G_OP_TYPE_TRANSPOSE;
extern const std::string G_OP_TYPE_SPLIT;
extern const std::string G_OP_TYPE_FEED;
extern const std::string G_OP_TYPE_FETCH;
extern const std::string G_OP_TYPE_DEPTHWISE_CONV;
extern const char *G_OP_TYPE_CONV;
extern const char *G_OP_TYPE_BATCHNORM;
extern const char *G_OP_TYPE_BOX_CODER;
extern const char *G_OP_TYPE_CONCAT;
extern const char *G_OP_TYPE_ELEMENTWISE_ADD;
extern const char *G_OP_TYPE_FUSION_CONV_ADD_RELU;
extern const char *G_OP_TYPE_FC;
extern const char *G_OP_TYPE_FUSION_CONV_ADD;
extern const char *G_OP_TYPE_FUSION_CONV_ADD_BN_RELU;
extern const char *G_OP_TYPE_FUSION_DWCONV_BN_RELU;
extern const char *G_OP_TYPE_FUSION_CONV_BN_RELU;
extern const char *G_OP_TYPE_LRN;
extern const char *G_OP_TYPE_MUL;
extern const char *G_OP_TYPE_MULTICLASS_NMS;
extern const char *G_OP_TYPE_POOL2D;
extern const char *G_OP_TYPE_PRIOR_BOX;
extern const char *G_OP_TYPE_RELU;
extern const char *G_OP_TYPE_RESHAPE;
extern const char *G_OP_TYPE_SIGMOID;
extern const char *G_OP_TYPE_SOFTMAX;
extern const char *G_OP_TYPE_TRANSPOSE;
extern const char *G_OP_TYPE_SPLIT;
extern const char *G_OP_TYPE_FEED;
extern const char *G_OP_TYPE_FETCH;
extern const char *G_OP_TYPE_DEPTHWISE_CONV;
extern const char *G_OP_TYPE_IM2SEQUENCE;
extern const char *G_OP_TYPE_DROPOUT;
extern const char *G_OP_TYPE_FUSION_CONV_ADD_BN;
extern const char *G_OP_TYPE_FUSION_POOL_BN;
extern const char *G_OP_TYPE_FUSION_ELEMENTWISE_ADD_RELU;
extern const char *G_OP_TYPE_FUSION_FC_RELU;
extern const char *G_OP_TYPE_REGION;
extern std::unordered_map<
std::string, std::pair<std::vector<std::string>, std::vector<std::string>>>
......
......@@ -12,6 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <cstdlib>
#include "common/enforce.h"
#include "common/log.h"
......@@ -82,7 +84,8 @@ struct Variant {
if (type_id == typeid(T).hash_code()) {
return *const_cast<T *>(reinterpret_cast<const T *>(&data));
} else {
PADDLE_MOBILE_THROW_EXCEPTION(" bad cast in variant ");
PADDLE_MOBILE_THROW_EXCEPTION(" bad cast in variant");
exit(0);
}
}
......
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <errno.h>
#include <fcntl.h>
#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>
#include <sys/ioctl.h>
#include <sys/mman.h>
#include <sys/stat.h>
#include <sys/time.h>
#include <sys/types.h>
#include <algorithm>
#include <cmath>
#include <cstdio>
#include <cstring>
#include "fpga/api/fpga_api.h"
namespace paddle_mobile {
namespace fpga {
static int fd = -1;
static const char *device_path = "/dev/fpgadrv0";
static inline int do_ioctl(int req, const void *arg) {
return ioctl(req, (unsigned int64_t)arg);
}
int open_device() {
if (fd == -1) {
fd = open(device_path, O_RDWR);
}
return fd;
}
// memory management;
void *fpga_malloc(size_t size) {
return reinterpret_cast<void *>(
mmap64(NULL, size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0));
}
void fpga_free(void *ptr) { munmap(ptr, 0); }
void fpga_copy(void *dest, const void *src, size_t num) {
memcpy(dest, src, num);
}
int ComputeFpgaConv(const struct ConvArgs &args) {
return do_ioctl(IOCTL_CONFIG_CONV, &args);
}
int ComputeFpgaPool(const struct PoolingArgs &args) {
return do_ioctl(IOCTL_CONFIG_POOLING, &args);
}
int ComputeFpgaEWAdd(const struct EWAddArgs &args) {
return do_ioctl(IOCTL_CONFIG_EW, &args);
}
int PerformBypass(const struct BypassArgs &args) {
return do_ioctl(IOCTL_CONFIG_BYPASS, &args);
}
} // namespace fpga
} // namespace paddle_mobile
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <stdint.h>
#include <cstddef>
#include <iostream>
#include <limits>
// memory management;
namespace paddle_mobile {
namespace fpga {
int open_device();
int close_device();
void* fpga_malloc(size_t size);
void fpga_free(void* ptr);
void fpga_copy(void* dst, const void* src, size_t num);
enum DataConvertType {
DATA_NO_CONVERT = 0,
DATA_FP32_TO_FP16 = 1,
DATA_FP16_TO_FP32 = 2,
};
enum LayoutConvertType {
LAYOUT_NO_CONVERT = 0,
LAYOUT_CHW_TO_HWC = 1,
LAYOUT_HWC_TO_CHW = 2,
};
struct VersionArgs {
void* buffer;
};
struct MemoryCopyArgs {
void* src;
void* dest;
size_t size;
};
struct BNArgs {
bool enabled;
void* bias_address;
void* scale_address;
};
/**
Conv and Pooling kernel
*/
struct KernelArgs {
uint32_t width;
uint32_t height;
uint32_t stride_w;
uint32_t stride_h;
};
struct ImageInputArgs {
void* address; // input featuremap virtual address
float* scale_address; // input scale address;
uint32_t channels;
uint32_t width; // featuremap width
uint32_t height;
uint32_t pad_width; // padding width;
uint32_t pad_height;
};
struct ImageOutputArgs {
void* address; // output result address;
float* scale_address; // output scale address;
};
struct ConvArgs {
bool relu_enabled;
void* sb_address; // scale and bias are interlaced;
void* filter_address;
float* filter_scale_address;
uint32_t filter_num;
uint32_t group_num;
struct KernelArgs kernel;
struct ImageInputArgs image; // input image;
struct ImageOutputArgs output;
};
struct PoolingArgs {
struct KernelArgs kernel;
struct ImageInputArgs image; // input image;
struct ImageOutputArgs output;
};
// elementwise add arguments
struct EWAddArgs {
bool relu_enabled;
float const0; // output0 = const0 x input0 + const1 x input1;
float const1;
struct ImageInputArgs image0;
struct ImageInputArgs image1;
struct ImageOutputArgs output;
};
struct BypassArgs {
enum DataConvertType convert_type;
enum LayoutConvertType layout_type;
struct ImageInputArgs image;
struct ImageOutputArgs output;
};
struct FpgaRegWriteArgs {
uint64_t address; //
uint64_t value;
};
#define IOCTL_FPGA_MAGIC 'FPGA'
#define IOCTL_VERSION _IOW(IOCTL_FPGA_MAGIC, 01, struct VersionArgs)
#define IOCTL_SEPARATOR_0 10
#define IOCTL_MEM_COPY _IOW(IOCTL_FPGA_MAGIC, 11, struct MemoryCopyArgs)
#define IOCTL_SEPARATOR_1 20
#define IOCTL_CONFIG_CONV _IOW(IOCTL_FPGA_MAGIC, 21, struct ConvArgs)
#define IOCTL_CONFIG_POOLING _IOW(IOCTL_FPGA_MAGIC, 22, struct PoolingArgs)
#define IOCTL_CONFIG_EW _IOW(IOCTL_FPGA_MAGIC, 23, struct EWAddArgs)
#define IOCTL_CONFIG_BYPASS _IOW(IOCTL_FPGA_MAGIC, 24, struct BypassArgs)
#define IOCTL_FPGA_REG_READ _IOW(IOCTL_FPGA_MAGIC, 28, struct FpgaRegReadArgs)
#define IOCTL_FPGA_REG_WRITE _IOW(IOCTL_FPGA_MAGIC, 29, struct FpgaRegWriteArgs)
enum FPGA_ERR_TYPE {
ERR_IOCTL_CMD = -1,
ERR_TIMEOUT = -2,
ERR_COMPLETION_TIMEOUT = -3,
ERR_INVALID_FPGA_ADDR = -4,
ERR_NOMEM = -5,
ERR_NO_RESERVE_MEM = -6,
ERR_COPY_FROM_USER = -7,
ERR_COPY_TO_USER = -8,
ERR_DEL_TIMER = -9,
ERR_ENABLE_MSI = -10,
ERR_REGISTER_IRQ = -11,
ERR_PCIE_REGISTER = -12,
ERR_PCIE_PROBE = -13,
ERR_REGISTER_BLOCK = -14,
ERR_ALLOC_GENDISK = -15,
ERR_INIT_QUEUE = -16,
ERR_WAIT = -17,
ERR_ECC_ERROR = -31,
ERR_FPGA_FAIL_STOP = -64,
ERR_FPGA_DEBUG_STOP = -113,
DEV_TMP_UNAVAILABLE = -128
};
//============================== API =============================
int PerformBypass(const struct BypassArgs& args);
int ComputeFpgaConv(const struct ConvArgs& args);
int ComputeFpgaPool(const struct PoolingArgs& args);
int ComputeFpgaEWAdd(const struct EWAddArgs& args);
} // namespace fpga
} // namespace paddle_mobile
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "fpga/fpga_quantilization.h"
#include <algorithm>
namespace paddle_mobile {
namespace fpga {
template <typename Dtype>
static void chw_to_hwc(Dtype* data_in, Dtype* data_out, int num, int channel,
int height, int width) {
int offset_height = 0;
for (int n = 0; n < num; n++) {
int amount_per_row = width * channel;
for (int c = 0; c < channel; c++) {
for (int h = 0; h < height; h++) {
int offset_height = h * amount_per_row;
for (int w = 0; w < width; w++) {
*(data_out + offset_height + w * channel + c) = *(data_in++);
}
}
}
data_out += num;
}
}
template <typename Dtype>
static Dtype find_max(Dtype* data, int num) {
Dtype max = 0;
for (int i = 0; i < num; ++i) {
max = std::max(max, data[i]);
}
return max;
}
// template <typename Dtype>
framework::Tensor* quantify_filter(framework::Tensor* filter) {
float scale = 0;
float fix_range = static_cast<float>((1 << (8 - 1)) - 1);
const int batch_size = filter->dims()[0];
const int channel = filter->dims()[1];
const int height = filter->dims()[2];
const int width = filter->dims()[3];
int8_t* int_data = nullptr;
int8_t* tmp_data = new int8_t[filter->numel()];
// 32bit filter -> 8bit filter;
if (filter->type() == typeid(float)) {
float* float_data = filter->data<float>();
float max = find_max(float_data, filter->numel());
scale = (max / fix_range);
framework::Tensor* filter = filter;
framework::Tensor* quant_filter = new framework::Tensor();
int_data = quant_filter->mutable_data<int8_t>();
for (int i = 0; i < filter->numel(); ++i) {
tmp_data[i] = (int8_t)float_data[i] * scale;
}
filter = quant_filter;
} else {
int8_t max = find_max(filter->data<int8_t>(), filter->numel());
scale = (max / fix_range);
int_data = filter->data<int8_t>();
for (int i = 0; i < filter->numel(); ++i) {
tmp_data[i] = int_data[i];
}
int_data = filter->mutable_data<int8_t>();
}
// NCHW -> NHWC;
chw_to_hwc<int8_t>(tmp_data, int_data, batch_size, channel, height, width);
delete tmp_data;
*(filter->fpga_args().scale_pointer()) = scale;
return filter;
}
} // namespace fpga
} // namespace paddle_mobile
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include "common/types.h"
#include "framework/lod_tensor.h"
#include "framework/tensor.h"
namespace paddle_mobile {
namespace fpga {
template <typename Dtype>
static void chw_to_hwc(Dtype* data_in, Dtype* data_out, int num, int channel,
int height, int width);
// template <typename Dtype>
framework::Tensor* quantify_filter(framework::Tensor* filter);
} // namespace fpga
} // namespace paddle_mobile
......@@ -14,7 +14,9 @@ limitations under the License. */
#pragma once
#include <cstdlib>
#include <string>
#include <typeinfo>
#include <unordered_map>
#include <vector>
......@@ -128,6 +130,7 @@ class Attribute {
return vistor(attr.variant_.Get<int64_t>());
} else {
PADDLE_MOBILE_THROW_EXCEPTION("type not support");
exit(0);
}
}
......
......@@ -15,6 +15,7 @@ limitations under the License. */
#pragma once
#include <cctype>
#include <cstdlib>
#include <string>
namespace paddle_mobile {
......@@ -40,6 +41,7 @@ inline DataLayout StringToDataLayout(const std::string &str) {
return DataLayout::kAnyLayout;
} else {
PADDLE_MOBILE_THROW_EXCEPTION("Unknown storage order string: %s", s.c_str())
exit(0);
}
}
......@@ -52,6 +54,8 @@ inline std::string DataLayoutToString(const DataLayout &data_layout) {
case DataLayout::kAnyLayout:
return "ANY_LAYOUT";
default:
PADDLE_MOBILE_THROW_EXCEPTION("Unknown storage order string ")
exit(0);
break;
}
}
......
......@@ -14,8 +14,11 @@ limitations under the License. */
#pragma once
#include <cstdlib>
#include <initializer_list>
#include <typeinfo>
#include <vector>
#include "common/enforce.h"
#include "common/variant.h"
#include "dim.h"
......@@ -57,7 +60,8 @@ struct DDim {
} else if (d.var.TypeId() == typeid(Dim<9>).hash_code()) {
return vistor(d.var.Get<Dim<9>>());
} else {
DLOG << " dim not support";
PADDLE_MOBILE_ENFORCE(false, " dim not support");
exit(0);
}
}
......
......@@ -14,6 +14,7 @@ limitations under the License. */
#pragma once
#include <cstdlib>
#include "common/enforce.h"
namespace paddle_mobile {
namespace framework {
......@@ -129,6 +130,7 @@ int64_t &indexer(Dim<D> &dim, int idx) {
template <>
int64_t &indexer<0>(Dim<0> &dim, int idx) {
PADDLE_MOBILE_THROW_EXCEPTION("Invalid index")
exit(0);
}
template <int D>
......@@ -145,6 +147,7 @@ int64_t indexer(const Dim<D> &dim, int idx) {
template <>
int64_t indexer<0>(const Dim<0> &dim, int idx) {
PADDLE_MOBILE_THROW_EXCEPTION("Invalid index")
exit(0);
}
} // namespace
......
......@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
limitations under the License. */
#include "lod_tensor.h"
#include <algorithm>
namespace paddle_mobile {
namespace framework {
......
......@@ -28,6 +28,16 @@ vector<string> OperatorBase<Dtype>::GetOutKeys() const {
return it->second.second;
}
template <typename Dtype>
vector<string> OperatorBase<Dtype>::GetInputKeys() const {
auto it = op_input_output_key.find(type_);
if (it == op_input_output_key.end()) {
DLOG << type_ << " has no outputs";
return {};
}
return it->second.first;
}
template <typename Dtype>
OperatorBase<Dtype>::OperatorBase(const std::string &type,
const VariableNameMap &inputs,
......@@ -49,6 +59,11 @@ template <typename Dtype>
void OperatorBase<Dtype>::Run() const {
RunImpl();
#ifdef PADDLE_MOBILE_DEBUG
vector<string> input_keys = GetInputKeys();
for (const auto key : input_keys) {
Tensor *input = GetVarValue<framework::LoDTensor>(key, inputs_, *scope_);
DLOG << type_ << " input- " << key << "=" << *input;
}
vector<string> output_keys = GetOutKeys();
for (const auto key : output_keys) {
Tensor *out_ = GetVarValue<framework::LoDTensor>(key, outputs_, *scope_);
......
......@@ -61,8 +61,10 @@ class OperatorBase {
virtual ~OperatorBase() {}
void Run() const;
std::vector<string> GetOutKeys() const;
std::vector<string> GetInputKeys() const;
virtual void RunImpl() const = 0;
virtual void Init() = 0;
/*
* @b op 运算所需的输入, 如上一层的输出结果、卷积核
* */
......@@ -110,15 +112,21 @@ class OperatorWithKernel : public OperatorBase<Dtype> {
const VariableNameMap &outputs, const AttributeMap &attrs,
std::shared_ptr<Scope> scope)
: OperatorBase<Dtype>(type, inputs, outputs, attrs, scope),
param_(inputs, outputs, attrs, *scope) {
PADDLE_MOBILE_ENFORCE(kernel_.Init(param_), " %s kernel init failed",
this->type_.c_str());
}
param_(inputs, outputs, attrs, *scope) {}
virtual void RunImpl() const { this->kernel_.Compute(this->param_); }
virtual void InferShape() const = 0;
void Init() {
// for (auto i : this->inputs_) {
// DLOG << i.first;
// DLOG << i.second;
// }
PADDLE_MOBILE_ENFORCE(kernel_.Init(&param_), " %s kernel init failed",
this->type_.c_str());
}
protected:
KernelType kernel_;
ParamType param_;
......@@ -135,9 +143,21 @@ class OpKernelBase {
* @p para 这个参数为 kernel 运算时所需要用到参数组成的一个结构体,
* 所有结构体存在与: paddle-mobile/src/operators/op_param.h
* */
#ifdef PADDLE_MOBILE_MALI_GPU
OpKernelBase() { acl_op_ = nullptr; }
void *GetAclOp() const { return acl_op_; }
void SetAclOp(void *op, void *ob) const {
reinterpret_cast<OpKernelBase<Dtype, P> *>(ob)->acl_op_ = op;
}
#endif
virtual void Compute(const P &para) const = 0;
virtual bool Init(const P &para) const { return true; };
virtual bool Init(P *para) { return true; }
virtual ~OpKernelBase() = default;
private:
#ifdef PADDLE_MOBILE_MALI_GPU
void *acl_op_;
#endif
};
#define DEFINE_OP_CONSTRUCTOR(cls, parent_cls) \
......
......@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
limitations under the License. */
#include "block_desc.h"
#include <algorithm>
namespace paddle_mobile {
namespace framework {
......
......@@ -14,11 +14,13 @@ limitations under the License. */
#pragma once
#include <algorithm>
#include <map>
#include <string>
#include <vector>
#include "framework/operator.h"
#include "node.h"
#include "framework/program/program-optimize/node.h"
namespace paddle_mobile {
namespace framework {
......@@ -34,12 +36,25 @@ class FusionOpRegister {
}
void regist(FusionOpMatcher* matcher) {
if (matchers_.find(matcher->Type()) != matchers_.end()) {
return;
}
std::shared_ptr<FusionOpMatcher> shared_matcher(matcher);
matchers_[matcher->Type()] = shared_matcher;
}
const std::map<std::string, std::shared_ptr<FusionOpMatcher>> Matchers() {
return matchers_;
const std::vector<std::shared_ptr<FusionOpMatcher>> Matchers() {
std::vector<std::shared_ptr<FusionOpMatcher>> matchers;
for (const auto& match : matchers_) {
matchers.push_back(match.second);
}
std::sort(matchers.begin(), matchers.end(),
[](std::shared_ptr<FusionOpMatcher> first,
std::shared_ptr<FusionOpMatcher> second) {
return first->BeginNode().Depth() > second->BeginNode().Depth();
});
return matchers;
}
private:
......
......@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
limitations under the License. */
#include "framework/program/program-optimize/node.h"
#include <algorithm>
#include "framework/operator.h"
namespace paddle_mobile {
......@@ -43,23 +44,6 @@ bool Node::operator==(const Node &in) {
return true;
}
std::vector<std::shared_ptr<framework::OpDesc>> Node::OpDescs(int size) {
std::vector<std::shared_ptr<framework::OpDesc>> op_descs;
OpDescs(size - 1, &op_descs);
return op_descs;
}
void Node::OpDescs(int index,
std::vector<std::shared_ptr<framework::OpDesc>> *op_desc) {
if (index == 0) {
return;
}
op_desc->push_back(this->op_desc_);
for (auto &output : outputs_) {
output->OpDescs(index, op_desc);
}
}
std::shared_ptr<Node> Node::To(int size) {
std::shared_ptr<Node> node = std::make_shared<Node>();
this->To(size - 1, node);
......@@ -92,7 +76,8 @@ int Node::Depth(int begin) {
Node &Node::Folder(
int size, std::string type,
std::map<std::string, std::pair<std::string, std::string>> change,
std::map<std::string, std::vector<std::pair<std::string, std::string>>>
change,
std::vector<std::shared_ptr<Node>> *removed_nodes) {
std::shared_ptr<framework::OpDesc> op_desc =
std::make_shared<framework::OpDesc>();
......@@ -109,12 +94,15 @@ Node &Node::Folder(
void Node::Folder(
std::shared_ptr<framework::OpDesc> op_desc,
std::vector<std::shared_ptr<Node>> *outputs, int index,
std::map<std::string, std::pair<std::string, std::string>> *change,
std::map<std::string, std::vector<std::pair<std::string, std::string>>>
*change,
Node *begin_node, std::vector<std::shared_ptr<Node>> *removed_nodes) {
if (change->find(this->type_) != change->end()) {
auto change_pair = (*change)[this->type_];
op_desc->GetInputs()[change_pair.second] =
this->op_desc_->GetInputs()[change_pair.first];
auto change_pairs = (*change)[this->type_];
for (const auto &change_pair : change_pairs) {
op_desc->GetInputs()[change_pair.second] =
this->op_desc_->GetInputs()[change_pair.first];
}
}
for (auto &attr_pair : this->op_desc_->attrs_) {
......
......@@ -17,6 +17,7 @@ limitations under the License. */
#include <cinttypes>
#include <map>
#include <string>
#include <utility>
#include <vector>
#include "common/log.h"
#include "framework/program/op_desc.h"
......@@ -43,20 +44,19 @@ class Node {
int Depth(int begin = 0);
Node &Folder(
int size, std::string type,
std::map<std::string, std::pair<std::string, std::string>> change_map,
std::map<std::string, std::vector<std::pair<std::string, std::string>>>
change,
std::vector<std::shared_ptr<Node>> *removed_nodes);
std::vector<std::shared_ptr<framework::OpDesc>> OpDescs(int size);
std::shared_ptr<framework::OpDesc> OpDescOfNode() { return op_desc_; }
std::string Type() { return type_; }
private:
void OpDescs(int size,
std::vector<std::shared_ptr<framework::OpDesc>> *op_desc);
void To(int index, std::shared_ptr<Node>);
void Folder(
std::shared_ptr<framework::OpDesc> op_desc,
std::vector<std::shared_ptr<Node>> *outputs, int index,
std::map<std::string, std::pair<std::string, std::string>> *change,
std::map<std::string, std::vector<std::pair<std::string, std::string>>>
*change,
Node *begin_node, std::vector<std::shared_ptr<Node>> *removed_nodes);
std::shared_ptr<framework::OpDesc> op_desc_;
#ifdef PADDLE_MOBILE_DEBUG
......
......@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
limitations under the License. */
#include "framework/program/program-optimize/program_optimize.h"
#include <algorithm>
#include "framework/program/program-optimize/fusion_op_register.h"
namespace paddle_mobile {
......@@ -77,9 +78,8 @@ std::shared_ptr<ProgramDesc> ProgramOptimize::FusionOptimize(
}
for (auto &registed : FusionOpRegister::Instance()->Matchers()) {
std::string fusion_type = registed.first;
std::shared_ptr<FusionOpMatcher> matcher = registed.second;
// DLOG << " registed node \n " << matcher->BeginNode();
std::string fusion_type = registed->Type();
std::shared_ptr<FusionOpMatcher> matcher = registed;
auto match_vector = type_map[matcher->BeginType()];
......
......@@ -29,7 +29,8 @@ class Program {
std::shared_ptr<Scope> scope;
std::string model_path;
std::string para_path;
bool is_commbine = false;
bool combined = false;
bool quantification = false;
private:
};
......
......@@ -14,6 +14,7 @@ limitations under the License. */
#include "framework/scope.h"
#include <algorithm>
#include <set>
#include <string>
#include <vector>
......
......@@ -23,7 +23,17 @@ namespace framework {
class Scope {
public:
Scope() = default;
~Scope() = default;
~Scope() {
for (auto &var : vars_) {
delete var.second;
}
vars_.clear();
for (auto kid : kids_) {
delete kid;
}
kids_.clear();
}
Scope &NewScope() const;
......
......@@ -16,13 +16,15 @@ limitations under the License. */
#include <cstdint>
#include <cstring>
#include <fstream>
#include <memory>
#include <string>
#include <type_traits>
#include <typeindex>
#include <vector>
#include "common/enforce.h"
#include "common/enforce.h"
#include "common/types.h"
#include "framework/data_layout.h"
#include "framework/ddim.h"
#include "memory/t_malloc.h"
......@@ -62,7 +64,8 @@ struct SizeOfTypeFunctor<HEAD, TAIL...> {
};
static inline size_t SizeOfType(std::type_index type) {
SizeOfTypeFunctor<int, float, double, int16_t, int64_t, bool, size_t> functor;
SizeOfTypeFunctor<int, half, float, double, int16_t, int64_t, bool, size_t>
functor;
size_t size = functor(type);
PADDLE_MOBILE_ENFORCE(size != 0UL, "Cannot get size of type %s", type.name());
......@@ -131,11 +134,27 @@ class Tensor {
return reinterpret_cast<T *>(mutable_data(typeid(T)));
}
#ifdef PADDLE_MOBILE_DEBUG
template <typename T>
inline void dump(std::string filename) const {
const T *dataptr = data<T>();
std::ofstream out(filename.c_str());
for (int i = 0; i < numel(); ++i) {
out << dataptr[i] << " ";
}
out << "形状:";
for (int j = 0; j < dims_.size(); ++j) {
out << dims_[j] << " ";
}
out.close();
}
#endif
inline void *mutable_data(std::type_index type) {
if (holder_ != nullptr) {
holder_->set_type(type);
}
PADDLE_MOBILE_ENFORCE(numel() >= 0, "the Tensor'snumel must >=0.")
PADDLE_MOBILE_ENFORCE(numel() >= 0, "the Tensor's numel must >=0.")
int64_t size = numel() * SizeOfType(type);
if (holder_ == nullptr || holder_->size() < size + offset_) {
holder_.reset(new PlaceholderImpl(size, type));
......@@ -234,6 +253,18 @@ class Tensor {
"Tensor's dims_ is out of bound. ");
}
#ifdef PADDLE_MOBILE_FPGA
struct FPGAArgs {
float scale;
inline float *scale_pointer() { return &scale; }
};
struct FPGAArgs fpga_args() const {
return fpgaArgs_;
}
#endif
private:
/**
* @note Placeholder hides type T, so it doesn't appear as a
......@@ -300,6 +331,10 @@ class Tensor {
* begins.
*/
size_t offset_;
#ifdef PADDLE_MOBILE_FPGA
FPGAArgs fpgaArgs_;
#endif
};
#ifdef PADDLE_MOBILE_DEBUG
......
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "cstring"
#include "io/paddle_inference_api.h"
namespace paddle_mobile {
int PaddleDtypeSize(PaddleDType dtype) {
switch (dtype) {
case PaddleDType::FLOAT32:
return sizeof(float);
case PaddleDType::INT64:
return sizeof(int64_t);
default:
assert(false);
return -1;
}
}
PaddleBuf::PaddleBuf(PaddleBuf&& other)
: data_(other.data_),
length_(other.length_),
memory_owned_(other.memory_owned_) {
other.memory_owned_ = false;
other.data_ = nullptr;
other.length_ = 0;
}
PaddleBuf::PaddleBuf(const PaddleBuf& other) { *this = other; }
PaddleBuf& PaddleBuf::operator=(const PaddleBuf& other) {
// only the buffer with external memory can be copied
if (!other.memory_owned_) {
data_ = other.data_;
length_ = other.length_;
memory_owned_ = other.memory_owned_;
} else {
Resize(other.length());
memcpy(data_, other.data(), other.length());
length_ = other.length();
memory_owned_ = true;
}
return *this;
}
void PaddleBuf::Resize(size_t length) {
// Only the owned memory can be reset, the external memory can't be changed.
if (length_ == length) return;
if (memory_owned_) {
Free();
}
data_ = new char[length];
length_ = length;
memory_owned_ = true;
}
void PaddleBuf::Reset(void* data, size_t length) {
Free();
memory_owned_ = false;
data_ = data;
length_ = length;
}
void PaddleBuf::Free() {
if (memory_owned_ && data_) {
assert(length_ > 0);
delete[] static_cast<char*>(data_);
data_ = nullptr;
length_ = 0;
}
}
} // namespace paddle_mobile
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "io/api_paddle_mobile.h"
#include <vector>
#include "framework/tensor.h"
namespace paddle_mobile {
template <typename Dtype, Precision P>
PaddleMobilePredictor<Dtype, P>::PaddleMobilePredictor(
const PaddleMobileConfig &config) {
PADDLE_MOBILE_ENFORCE(Init(config) == true,
"paddle mobile predictor init failed!");
config_ = config;
}
template <typename Dtype, Precision P>
bool PaddleMobilePredictor<Dtype, P>::Init(const PaddleMobileConfig &config) {
paddle_mobile_.reset(new PaddleMobile<Dtype, P>());
if (!config.model_dir.empty()) {
paddle_mobile_->Load(config.model_dir, config.optimize,
config.quantification, config.batch_size);
} else if (!config.prog_file.empty() && !config.param_file.empty()) {
paddle_mobile_->Load(config.prog_file, config.param_file, config.optimize,
config.quantification, config.batch_size);
} else {
LOG(kLOG_ERROR) << "fail to load inference model!";
return false;
}
// If the openmp is open, set the thread num
paddle_mobile_->SetThreadNum(config.thread_num);
return true;
}
template <typename Dtype, Precision P>
bool PaddleMobilePredictor<Dtype, P>::Run(
const std::vector<PaddleTensor> &inputs,
std::vector<PaddleTensor> *output_data, int batch_size) {
if (inputs.empty()) {
LOG(kLOG_ERROR) << "At least one output should be set with tensors' names.";
return false;
}
auto input = inputs[0];
if (input.shape.size() != 4) {
LOG(kLOG_ERROR) << "input shape not equal to 4!";
return false;
}
std::vector<int64_t> dims;
for (auto d : input.shape) {
dims.push_back(static_cast<int64_t>(d));
}
// use tensor
framework::DDim ddim =
framework::make_ddim({dims[0], dims[1], dims[2], dims[3]});
framework::Tensor input_tensor;
input_tensor.Resize(ddim);
int input_length = framework::product(ddim);
typedef typename PrecisionTrait<P>::ptype PType;
auto input_ptr = input_tensor.mutable_data<PType>();
memcpy(input_ptr, static_cast<PType *>(input.data.data()),
input_length * sizeof(PType));
auto output_tensor = paddle_mobile_->Predict(input_tensor);
if (output_data->empty()) {
LOG(kLOG_ERROR) << "At least one output should be set with tensors' names.";
return false;
}
auto &output = (*output_data)[0];
int output_length = output_tensor->numel();
std::vector<int64_t> tensor_shape =
framework::vectorize(output_tensor->dims());
for (auto d : tensor_shape) {
output.shape.push_back(static_cast<int>(d));
}
if (output.data.length() < output_length * sizeof(PType)) {
output.data.Resize(output_length * sizeof(PType));
}
memcpy(output.data.data(), output_tensor->template data<PType>(),
output_length * sizeof(PType));
return true;
}
// A factory to help create difference predictor.
template <>
std::unique_ptr<PaddlePredictor>
CreatePaddlePredictor<PaddleMobileConfig, PaddleEngineKind::kPaddleMobile>(
const PaddleMobileConfig &config) {
std::unique_ptr<PaddlePredictor> x;
if (config.precision == PaddleMobileConfig::FP32) {
if (config.device == PaddleMobileConfig::kCPU) {
x.reset(new PaddleMobilePredictor<CPU, Precision::FP32>(config));
} else if (config.device == PaddleMobileConfig::kFPGA) {
x.reset(new PaddleMobilePredictor<FPGA, Precision::FP32>(config));
} else if (config.device == PaddleMobileConfig::kGPU_MALI) {
x.reset(new PaddleMobilePredictor<GPU_MALI, Precision::FP32>(config));
} else {
LOG(kLOG_ERROR) << "unsupport device type!";
return nullptr;
}
} else {
LOG(kLOG_ERROR) << "unsupport precision type!";
return nullptr;
}
return std::move(x);
}
} // namespace paddle_mobile
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
/*
* This file contains the implementation of inference API with Anakin engine
* embeded, this API can only support Anakin models.
*/
#pragma once
#include <vector>
#include "io/paddle_inference_api.h"
// from paddle_mobile
#include "common/enforce.h"
#include "common/types.h"
#include "io/paddle_mobile.h"
namespace paddle_mobile {
template <typename Dtype = CPU, Precision P = Precision::FP32>
class PaddleMobilePredictor : public PaddlePredictor {
public:
PaddleMobilePredictor() {}
explicit PaddleMobilePredictor(const PaddleMobileConfig& config);
bool Run(const std::vector<PaddleTensor>& inputs,
std::vector<PaddleTensor>* output_data,
int batch_size = -1) override;
~PaddleMobilePredictor() override{};
private:
std::unique_ptr<PaddleMobile<Dtype, P>> paddle_mobile_;
bool Init(const PaddleMobileConfig& config);
PaddleMobileConfig config_;
};
} // namespace paddle_mobile
......@@ -12,7 +12,9 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "io/io.h"
#include "io/executor.h"
#include <operators/math/gemm.h>
#include <algorithm>
#include <vector>
#include "common/enforce.h"
#include "common/log.h"
......@@ -25,7 +27,6 @@ limitations under the License. */
#include "framework/scope.h"
#include "framework/tensor.h"
#ifdef PADDLE_EXECUTOR_MULTITHREAD
#include <algorithm>
#include <queue>
#include <utility>
#include "common/threadpool.h"
......@@ -39,7 +40,7 @@ char *Get_binary_data(std::string filename) {
PADDLE_MOBILE_ENFORCE(file != nullptr, "can't open file: %s ",
filename.c_str());
fseek(file, 0, SEEK_END);
long size = ftell(file);
int64_t size = ftell(file);
PADDLE_MOBILE_ENFORCE(size > 0, "size is too small");
rewind(file);
char *data = new char[size];
......@@ -50,116 +51,6 @@ char *Get_binary_data(std::string filename) {
return data;
}
static size_t ReadBuffer(const char *file_name, uint8_t **out) {
printf("%s \n", file_name);
FILE *fp;
fp = fopen(file_name, "rb");
PADDLE_MOBILE_ENFORCE(fp != NULL, " %s open failed !", file_name);
fseek(fp, 0, SEEK_END);
size_t size = ftell(fp);
rewind(fp);
DLOG << "model size: " << size;
*out = reinterpret_cast<uint8_t *>(malloc(size));
size_t cur_len = 0;
size_t nread;
while ((nread = fread(*out + cur_len, 1, size - cur_len, fp)) != 0) {
cur_len += nread;
}
fclose(fp);
return cur_len;
}
template <typename Dtype, Precision P>
const framework::Program<Dtype, P> Loader<Dtype, P>::Load(
const std::string &dirname, bool optimize, bool can_add_split) {
auto program =
this->LoadProgram(dirname + "/__model__", optimize, can_add_split);
program.model_path = dirname;
return program;
}
template <typename Dtype, Precision P>
const framework::Program<Dtype, P> Loader<Dtype, P>::Load(
const std::string &model_path, const std::string &para_path,
bool optimize) {
auto program = this->LoadProgram(model_path, optimize);
program.para_path = para_path;
program.is_commbine = true;
return program;
}
template <typename Dtype, Precision P>
const framework::Program<Dtype, P> Loader<Dtype, P>::LoadProgram(
const std::string &model_path, bool optimize, bool can_add_split) {
std::string model_filename = model_path;
PaddleMobile__Framework__Proto__ProgramDesc *c_program;
uint8_t *buf = NULL;
size_t read_size = ReadBuffer(model_filename.c_str(), &buf);
PADDLE_MOBILE_ENFORCE(buf != NULL, "read from __model__ is null");
c_program = paddle_mobile__framework__proto__program_desc__unpack(
NULL, read_size, buf);
//
PADDLE_MOBILE_ENFORCE(c_program != NULL, "program is null");
//
DLOG << "n_ops: " << (*c_program->blocks)->n_ops;
//
auto originProgramDesc = std::make_shared<framework::ProgramDesc>(c_program);
framework::Program<Dtype, P> program;
program.originProgram = originProgramDesc;
auto scope = std::make_shared<framework::Scope>();
program.scope = scope;
for (const auto &block : originProgramDesc->Blocks()) {
for (auto var_desc : block->Vars()) {
auto var = scope->Var(var_desc->Name());
if (var_desc->Type() == framework::VARTYPE_TYPE_LOD_TENSOR) {
if (var_desc->Persistable() &&
var_desc->Type() != framework::VARTYPE_TYPE_FEED_MINIBATCH &&
var_desc->Type() != framework::VARTYPE_TYPE_FETCH_LIST) {
auto dim = var_desc->Tensor_desc().Dims();
auto tensor = var->GetMutable<framework::LoDTensor>();
tensor->Resize(framework::make_ddim(dim));
} else {
auto dim = var_desc->Tensor_desc().Dims();
PADDLE_MOBILE_ENFORCE(dim.size() > 0, "dim size is 0");
dim[0] = 1;
auto tensor = var->GetMutable<framework::LoDTensor>();
tensor->Resize(framework::make_ddim(dim));
}
} else {
// TODO(codeWorm): some.
}
}
}
if (optimize) {
framework::ProgramOptimize program_optimize;
program.optimizeProgram =
program_optimize.FusionOptimize(originProgramDesc, can_add_split);
}
if (optimize) {
program.optimizeProgram->Description("optimize: ");
} else {
originProgramDesc->Description("program: ");
}
paddle_mobile__framework__proto__program_desc__free_unpacked(c_program, NULL);
return program;
}
template class Loader<CPU, Precision::FP32>;
template class Loader<FPGA, Precision::FP32>;
template class Loader<GPU_MALI, Precision::FP32>;
#pragma mark - executor
template <typename Dtype, Precision P>
Executor<Dtype, P>::Executor(const framework::Program<Dtype> p, int batch_size,
......@@ -193,39 +84,46 @@ Executor<Dtype, P>::Executor(const framework::Program<Dtype> p, int batch_size,
#endif
}
}
if (program_.is_commbine) {
if (program_.combined) {
InitCombineMemory();
} else {
InitMemory();
}
std::shared_ptr<framework::BlockDesc> to_predict_block =
to_predict_program_->Block(0);
auto &ops = ops_of_block_[*to_predict_block.get()];
for (const auto &op : ops) {
op->Init();
}
}
template <typename Dtype, Precision P>
void Executor<Dtype, P>::LoadMemory(const framework::VarDesc var_desc,
framework::LoDTensor *tensor, char *&data) {
framework::LoDTensor *tensor, char **data) {
// 1. version
uint32_t version = *(uint32_t *)data;
data += sizeof(uint32_t);
uint32_t version = *reinterpret_cast<uint32_t *>(*data);
(*data) += sizeof(uint32_t);
// 2 Lod information
uint64_t *lod_level_ptr = new uint64_t();
memcpy(lod_level_ptr, data, sizeof(uint64_t));
memcpy(lod_level_ptr, (*data), sizeof(uint64_t));
uint64_t lod_level = *lod_level_ptr;
delete lod_level_ptr;
data += sizeof(uint64_t);
(*data) += sizeof(uint64_t);
auto &lod = *tensor->mutable_lod();
lod.resize(lod_level);
for (uint64_t i = 0; i < lod_level; ++i) {
uint64_t size = *(uint64_t *)data;
data += sizeof(uint64_t);
uint64_t size = *reinterpret_cast<uint64_t *>(*data);
(*data) += sizeof(uint64_t);
DLOG << "lod size: " << i << size;
std::vector<size_t> tmp(size / sizeof(size_t));
for (int k = 0; k < tmp.size(); ++k) {
tmp[k] = *(size_t *)data;
DLOG << "tmp[k]: " << k << *(size_t *)data;
data += sizeof(size_t);
tmp[k] = *reinterpret_cast<size_t *>(*data);
(*data) += sizeof(size_t);
}
for (auto j : tmp) {
......@@ -235,18 +133,18 @@ void Executor<Dtype, P>::LoadMemory(const framework::VarDesc var_desc,
}
// 3. tensor version
uint32_t tensor_version = *(uint32_t *)data;
data += sizeof(uint32_t);
uint32_t tensor_version = *reinterpret_cast<uint32_t *>(*data);
(*data) += sizeof(uint32_t);
// 4. tensor desc
int32_t size = *(int32_t *)data;
data += sizeof(int32_t);
int32_t size = *reinterpret_cast<int32_t *>(*data);
(*data) += sizeof(int32_t);
std::unique_ptr<char[]> buf(new char[size]);
for (int m = 0; m < size; ++m) {
buf.get()[m] = data[m];
buf.get()[m] = (*data)[m];
}
data += (sizeof(char) * size);
(*data) += (sizeof(char) * size);
const framework::TensorDesc &desc = var_desc.Tensor_desc();
int memory_size = 1;
......@@ -256,7 +154,7 @@ void Executor<Dtype, P>::LoadMemory(const framework::VarDesc var_desc,
tensor->Resize(framework::make_ddim(desc.Dims()));
void *memory = tensor;
void *memory = nullptr;
int type_size = 0;
switch (desc.DataType()) {
case framework::VARTYPE_TYPE_FP16:
......@@ -281,11 +179,25 @@ void Executor<Dtype, P>::LoadMemory(const framework::VarDesc var_desc,
default:
break;
}
for (int n = 0; n < memory_size * type_size; ++n) {
static_cast<char *>(memory)[n] = data[n];
if (program_.quantification) {
float min_value;
float max_value;
memcpy(&min_value, *data, sizeof(float));
memcpy(&max_value, *data + sizeof(float), sizeof(float));
*data += 2 * sizeof(float);
const float factor = (max_value - min_value) / 255.0;
uint8_t *uint8_data = reinterpret_cast<uint8_t *>(*data);
for (int k = 0; k < memory_size; ++k) {
static_cast<float *>(memory)[k] = uint8_data[k] * factor + min_value;
}
*data += (memory_size * sizeof(uint8_t));
} else {
for (int n = 0; n < memory_size * type_size; ++n) {
static_cast<char *>(memory)[n] = (*data)[n];
}
(*data) += (sizeof(char) * memory_size * type_size);
}
data += (sizeof(char) * memory_size * type_size);
}
template <typename Dtype, Precision P>
......@@ -302,7 +214,7 @@ void Executor<Dtype, P>::InitMemory() {
char *origin_data =
Get_binary_data(program_.model_path + "/" + var_desc->Name());
char *data = origin_data;
LoadMemory(*var_desc, tensor, data);
LoadMemory(*var_desc, tensor, &data);
delete origin_data;
} else {
if (var_desc->Type() == framework::VARTYPE_TYPE_LOD_TENSOR) {
......@@ -328,7 +240,7 @@ void Executor<Dtype, P>::InitCombineMemory() {
if (var_desc->Name() == "feed" || var_desc->Name() == "fetch") {
continue;
}
LoadMemory(*var_desc, tensor, data);
LoadMemory(*var_desc, tensor, &data);
} else {
if (var_desc->Type() == framework::VARTYPE_TYPE_LOD_TENSOR) {
auto tensor = var->template GetMutable<framework::LoDTensor>();
......@@ -416,6 +328,8 @@ std::shared_ptr<framework::Tensor> Executor<Dtype, P>::Predict(
clock_gettime(CLOCK_MONOTONIC, &ts);
profile[i].runBegin = (uint64_t)ts.tv_sec * 1e9 + ts.tv_nsec;
#endif
// to Run
ops[i]->Run();
#ifdef PADDLE_MOBILE_PROFILE
clock_gettime(CLOCK_MONOTONIC, &ts);
......@@ -433,7 +347,8 @@ std::shared_ptr<framework::Tensor> Executor<Dtype, P>::Predict(
*(program_.scope));
#ifdef PADDLE_MOBILE_PROFILE
#ifdef PADDLE_EXECUTOR_MULTITHREAD
// TODO expose profile info as an interface, user can get them to analysis
// TODO(haipeng): expose profile info as an interface, user can get them to
// analysis
// the performance of their deepnet.
FILE *df = fopen("net.dot", "w");
fprintf(df, "digraph {\n");
......@@ -448,16 +363,19 @@ std::shared_ptr<framework::Tensor> Executor<Dtype, P>::Predict(
fprintf(df, "}\n");
fclose(df);
#endif
FILE *pf = fopen("profile.out", "w");
// FILE *pf = fopen("profile.out", "w");
std::unordered_map<std::string, uint64_t> _tp;
for (int i = 0; i < profile.size(); i++) {
const auto &pInfo = profile[i];
uint64_t timeCost = pInfo.runEnd - pInfo.runBegin;
_tp[ops[i]->Type()] += timeCost;
fprintf(pf, "%d\t%s\t%d\t%llu\t%llu\t%llu\n", i, ops[i]->Type().c_str(),
pInfo.tid, pInfo.runBegin, pInfo.runEnd, timeCost);
// fprintf(pf, "%d\t%s\t%d\t%llu\t%llu\t%llu\n", i,
// ops[i]->Type().c_str(),
// pInfo.tid, pInfo.runBegin, pInfo.runEnd, timeCost);
}
fclose(pf);
// fclose(pf);
printf("====================[ profile ]======================\n");
using prof_t = std::pair<std::string, uint64_t>;
std::vector<prof_t> _tv(_tp.begin(), _tp.end());
......@@ -471,8 +389,9 @@ std::shared_ptr<framework::Tensor> Executor<Dtype, P>::Predict(
std::sort(_tv.begin(), _tv.end(), compf);
_tv.push_back(std::make_pair("total", _ptotal));
for (auto const &p : _tv) {
printf("%-16s\t%-10.0f\t%-2.4f\n", p.first.c_str(), (float)p.second,
(float)p.second / _ptotal * 100.0);
printf("%-16s\t%-10.0f\t%-2.4f\n", p.first.c_str(),
static_cast<float>(p.second),
static_cast<float>(p.second) / _ptotal * 100.0);
}
printf("====================[---------]======================\n");
#endif
......@@ -500,7 +419,7 @@ std::vector<typename Executor<Dtype, P>::Ptype> Executor<Dtype, P>::Predict(
}
template class Executor<CPU, Precision::FP32>;
template class Executor<FPGA, Precision::FP32>;
template class Executor<GPU_MALI, Precision::FP32>;
template class Executor<FPGA, Precision::FP32>;
} // namespace paddle_mobile
......@@ -18,6 +18,7 @@ limitations under the License. */
#include <memory>
#include <string>
#include <vector>
#include "common/types.h"
#include "framework/lod_tensor.h"
#include "framework/operator.h"
......@@ -27,36 +28,11 @@ limitations under the License. */
#include <condition_variable>
#include <mutex>
#include <thread>
#include "common/depCore.h"
#include "common/dep_core.h"
#endif
namespace paddle_mobile {
template <typename Dtype = CPU, Precision P = Precision::FP32>
class Loader {
public:
/*
* @b load separate format fluid model
* @b 加载分开形式的 fluid 模型
* */
const framework::Program<Dtype, P> Load(const std::string &dirname,
bool optimize = false,
bool can_add_split = false);
/*
* @b load combine format fluid mode
* @b 加载结合在一起格式的模型
* */
const framework::Program<Dtype, P> Load(const std::string &model_path,
const std::string &para_path,
bool optimize = false);
private:
const framework::Program<Dtype, P> LoadProgram(const std::string &model_path,
bool optimize = false,
bool can_add_split = false);
};
template <typename Dtype = CPU, Precision P = Precision::FP32>
class Executor {
public:
......@@ -86,7 +62,7 @@ class Executor {
Executor() = default;
void InitMemory();
void LoadMemory(const framework::VarDesc var_desc,
framework::LoDTensor *tensor, char *&data);
framework::LoDTensor *tensor, char **data);
void InitCombineMemory();
framework::Program<Dtype> program_;
int batch_size_ = 1;
......
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "io/loader.h"
#include "framework/lod_tensor.h"
#include "framework/program/program-optimize/program_optimize.h"
namespace paddle_mobile {
using framework::Variable;
static size_t ReadBuffer(const char *file_name, uint8_t **out) {
FILE *fp;
fp = fopen(file_name, "rb");
PADDLE_MOBILE_ENFORCE(fp != NULL, " %s open failed !", file_name);
fseek(fp, 0, SEEK_END);
size_t size = ftell(fp);
rewind(fp);
DLOG << "model size: " << size;
*out = reinterpret_cast<uint8_t *>(malloc(size));
size_t cur_len = 0;
size_t nread;
while ((nread = fread(*out + cur_len, 1, size - cur_len, fp)) != 0) {
cur_len += nread;
}
fclose(fp);
return cur_len;
}
template <typename Dtype, Precision P>
const framework::Program<Dtype, P> Loader<Dtype, P>::Load(
const std::string &dirname, bool optimize, bool quantification,
bool can_add_split) {
auto program = this->LoadProgram(dirname + "/__model__", optimize,
quantification, can_add_split);
program.model_path = dirname;
return program;
}
template <typename Dtype, Precision P>
const framework::Program<Dtype, P> Loader<Dtype, P>::Load(
const std::string &model_path, const std::string &para_path, bool optimize,
bool quantification) {
auto program = this->LoadProgram(model_path, optimize, quantification);
program.para_path = para_path;
program.combined = true;
program.quantification = quantification;
return program;
}
template <typename Dtype, Precision P>
const framework::Program<Dtype, P> Loader<Dtype, P>::LoadProgram(
const std::string &model_path, bool optimize, bool quantification,
bool can_add_split) {
std::string model_filename = model_path;
PaddleMobile__Framework__Proto__ProgramDesc *c_program;
uint8_t *buf = NULL;
size_t read_size = ReadBuffer(model_filename.c_str(), &buf);
PADDLE_MOBILE_ENFORCE(buf != NULL, "read from __model__ is null");
c_program = paddle_mobile__framework__proto__program_desc__unpack(
NULL, read_size, buf);
//
PADDLE_MOBILE_ENFORCE(c_program != NULL, "program is null");
//
DLOG << "n_ops: " << (*c_program->blocks)->n_ops;
//
auto originProgramDesc = std::make_shared<framework::ProgramDesc>(c_program);
framework::Program<Dtype, P> program;
program.originProgram = originProgramDesc;
program.quantification = quantification;
auto scope = std::make_shared<framework::Scope>();
program.scope = scope;
for (const auto &block : originProgramDesc->Blocks()) {
for (auto var_desc : block->Vars()) {
auto var = scope->Var(var_desc->Name());
if (var_desc->Type() == framework::VARTYPE_TYPE_LOD_TENSOR) {
if (var_desc->Persistable() &&
var_desc->Type() != framework::VARTYPE_TYPE_FEED_MINIBATCH &&
var_desc->Type() != framework::VARTYPE_TYPE_FETCH_LIST) {
auto dim = var_desc->Tensor_desc().Dims();
auto tensor = var->GetMutable<framework::LoDTensor>();
tensor->Resize(framework::make_ddim(dim));
} else {
auto dim = var_desc->Tensor_desc().Dims();
PADDLE_MOBILE_ENFORCE(dim.size() > 0, "dim size is 0");
dim[0] = 1;
auto tensor = var->GetMutable<framework::LoDTensor>();
tensor->Resize(framework::make_ddim(dim));
}
} else {
// TODO(codeWorm): some.
}
}
}
if (optimize) {
framework::ProgramOptimize program_optimize;
program.optimizeProgram =
program_optimize.FusionOptimize(originProgramDesc, can_add_split);
}
if (optimize) {
program.optimizeProgram->Description("optimize: ");
} else {
originProgramDesc->Description("program: ");
}
paddle_mobile__framework__proto__program_desc__free_unpacked(c_program, NULL);
return program;
}
template class Loader<CPU, Precision::FP32>;
template class Loader<FPGA, Precision::FP32>;
template class Loader<GPU_MALI, Precision::FP32>;
} // namespace paddle_mobile
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <string>
#include "common/types.h"
#include "framework/program/program.h"
namespace paddle_mobile {
template <typename Dtype = CPU, Precision P = Precision::FP32>
class Loader {
public:
/*
* @b load separate format fluid model
* @b 加载分开形式的 fluid 模型
* */
const framework::Program<Dtype, P> Load(const std::string &dirname,
bool optimize = false,
bool quantification = false,
bool can_add_split = false);
/*
* @b load combine format fluid mode
* @b 加载结合在一起格式的模型
* */
const framework::Program<Dtype, P> Load(const std::string &model_path,
const std::string &para_path,
bool optimize = false,
bool quantification = false);
private:
const framework::Program<Dtype, P> LoadProgram(const std::string &model_path,
bool optimize = false,
bool quantification = false,
bool can_add_split = false);
};
} // namespace paddle_mobile
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
/*
* This file contains the definition of a simple Inference API for Paddle.
*
* ATTENTION: It requires some C++ features, for lower version C++ or C, we
* might release another API.
*/
#pragma once
#include <cassert>
#include <memory>
#include <string>
#include <vector>
namespace paddle_mobile {
enum PaddleDType {
FLOAT32,
INT64,
};
class PaddleBuf {
public:
PaddleBuf() = default;
PaddleBuf(PaddleBuf&& other);
// Copy only available when memory is managed externally.
explicit PaddleBuf(const PaddleBuf&);
PaddleBuf& operator=(const PaddleBuf&);
// Do not own the memory.
PaddleBuf(void* data, size_t length)
: data_(data), length_(length), memory_owned_{false} {}
// Own memory.
PaddleBuf(size_t length)
: data_(new char[length]), length_(length), memory_owned_(true) {}
// Resize to `length` bytes.
void Resize(size_t length);
// Reset to external memory.
void Reset(void* data, size_t length);
bool empty() const { return length_ == 0; }
void* data() const { return data_; }
size_t length() const { return length_; }
~PaddleBuf() { Free(); }
private:
void Free();
void* data_{nullptr}; // pointer to the data memory.
size_t length_{0}; // number of memory bytes.
bool memory_owned_{true};
};
struct PaddleTensor {
PaddleTensor() = default;
std::string name; // variable name.
std::vector<int> shape;
// TODO(Superjomn) for LoD support, add a vector<vector<int>> field if needed.
PaddleBuf data; // blob of data.
PaddleDType dtype;
};
enum class PaddleEngineKind {
kPaddleMobile,
// TODO(Superjomn) support following engines latter.
// kTensorRT, // Use TensorRT for inference.
// kAutoMixedAnakin, // Automatically mix Fluid with Anakin.
// kAutoMixedTensorRT, // Automatically mix Fluid with TensorRT.
};
/*
* A simple Inference API for Paddle. Currently this API can be used by
* non-sequence scenerios.
*/
class PaddlePredictor {
public:
struct Config;
PaddlePredictor() = default;
PaddlePredictor(const PaddlePredictor&) = delete;
PaddlePredictor& operator=(const PaddlePredictor&) = delete;
// Predict an record.
// The caller should be responsible for allocating and releasing the memory of
// `inputs`. `inputs` should be available until Run returns. Caller should be
// responsible for the output tensor's buffer, either allocated or passed from
// outside.
virtual bool Run(const std::vector<PaddleTensor>& inputs,
std::vector<PaddleTensor>* output_data,
int batch_size = -1) = 0;
// Destroy the Predictor.
virtual ~PaddlePredictor() = default;
// The common configs for all the predictors.
struct Config {
std::string model_dir; // path to the model directory.
};
};
struct PaddleMobileConfig : public PaddlePredictor::Config {
enum Precision { FP32 = 0 };
enum Device { kCPU = 0, kFPGA = 1, kGPU_MALI = 2 };
enum Precision precision;
enum Device device;
int batch_size = 1;
bool optimize = true;
bool quantification = false;
int thread_num = 1;
std::string prog_file;
std::string param_file;
};
// A factory to help create different predictors.
template <typename ConfigT,
PaddleEngineKind engine = PaddleEngineKind::kPaddleMobile>
std::unique_ptr<PaddlePredictor> CreatePaddlePredictor(const ConfigT& config);
} // namespace paddle_mobile
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "io/paddle_mobile.h"
namespace paddle_mobile {
template <typename Dtype, Precision P>
void PaddleMobile<Dtype, P>::SetThreadNum(int num) {
#ifdef _OPENMP
// omp_set_dynamic(0);
omp_set_num_threads(num);
#endif
};
template <typename Dtype, Precision P>
bool PaddleMobile<Dtype, P>::Load(const std::string &dirname, bool optimize,
bool quantification, int batch_size) {
if (loader_.get() == nullptr) {
loader_ = std::make_shared<Loader<Dtype, P>>();
} else {
LOG(kLOG_INFO) << "loader inited";
}
if (executor_.get() == nullptr) {
executor_ = std::make_shared<Executor<Dtype, P>>(
loader_->Load(dirname, optimize, quantification), batch_size, optimize);
} else {
LOG(kLOG_INFO) << "executor inited";
}
return true;
}
template <typename Dtype, Precision P>
bool PaddleMobile<Dtype, P>::Load(const std::string &model_path,
const std::string &para_path, bool optimize,
bool quantification, int batch_size) {
if (loader_.get() == nullptr) {
loader_ = std::make_shared<Loader<Dtype, P>>();
} else {
LOG(kLOG_INFO) << "loader inited";
}
if (executor_.get() == nullptr) {
executor_ = std::make_shared<Executor<Dtype, P>>(
loader_->Load(model_path, para_path, optimize, quantification),
batch_size, optimize);
} else {
LOG(kLOG_INFO) << "executor inited";
}
return true;
}
template <typename Dtype, Precision P>
std::shared_ptr<framework::Tensor> PaddleMobile<Dtype, P>::Predict(
const framework::Tensor &t) {
return executor_->Predict(t);
}
template <typename Dtype, Precision P>
std::vector<typename PaddleMobile<Dtype, P>::Ptype>
PaddleMobile<Dtype, P>::Predict(const std::vector<Ptype> &input,
const std::vector<int64_t> &dims) {
return executor_->Predict(input, dims);
}
template <typename Dtype, Precision P>
void PaddleMobile<Dtype, P>::Clear() {
executor_ = nullptr;
loader_ = nullptr;
}
template <typename Dtype, Precision P>
PaddleMobile<Dtype, P>::~PaddleMobile() {
executor_ = nullptr;
loader_ = nullptr;
}
template class PaddleMobile<CPU, Precision::FP32>;
template class PaddleMobile<FPGA, Precision::FP32>;
template class PaddleMobile<GPU_MALI, Precision::FP32>;
} // namespace paddle_mobile
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <memory>
#include <string>
#include <vector>
#ifdef _OPENMP
#include <omp.h>
#endif // _OPENMP
#include "common/types.h"
#include "framework/tensor.h"
#include "io/executor.h"
#include "io/loader.h"
namespace paddle_mobile {
template <typename Dtype = CPU, Precision P = Precision::FP32>
class PaddleMobile {
typedef typename PrecisionTrait<P>::ptype Ptype;
public:
PaddleMobile() {}
/*
* @b load separate format fluid model
* @b 加载分开形式的 fluid 模型
* */
bool Load(const std::string &dirname, bool optimize = false,
bool quantification = false, int batch_size = 1);
/*
* @b load combine format fluid mode
* @b 加载结合在一起格式的模型
* */
bool Load(const std::string &model_path, const std::string &para_path,
bool optimize = false, bool quantification = false,
int batch_size = 1);
/*
* @b 设置线程数, 当 cmake 中开启 openmp 时生效
* */
void SetThreadNum(int num);
/*
* @b to predict
* */
std::shared_ptr<framework::Tensor> Predict(const framework::Tensor &t);
/*
* @b to predict with vector and dim
*
* @b 使用 输入 和 输入的维度信息 进行预测
* */
std::vector<Ptype> Predict(const std::vector<Ptype> &input,
const std::vector<int64_t> &dims);
void Clear();
~PaddleMobile();
private:
std::shared_ptr<Loader<Dtype, P>> loader_;
std::shared_ptr<Executor<Dtype, P>> executor_;
};
} // namespace paddle_mobile
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#import <CoreImage/CoreImage.h>
#import <Foundation/Foundation.h>
@interface PaddleMobile : NSObject
/*
创建对象
*/
- (instancetype)init;
/*
load 模型, 开辟内存
*/
- (BOOL)load:(NSString *)modelPath andWeightsPath:(NSString *)weighsPath;
/*
加载散开形式的模型, 需传入模型的目录
*/
- (BOOL)load:(NSString *)modelAndWeightPath;
/*
进行预测, means 和 scale 为训练模型时的预处理参数, 如训练时没有做这些预处理则直接使用 predict
*/
- (NSArray *)predict:(CGImageRef)image dim:(NSArray<NSNumber *> *)dim means:(NSArray<NSNumber *> *)means scale:(float)scale;
/*
进行预测
*/
- (NSArray *)predict:(CGImageRef)image dim:(NSArray<NSNumber *> *)dim;
/*
清理内存
*/
- (void)clear;
@end
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#import "PaddleMobile.h"
#import "op_symbols.h"
#import "io/paddle_mobile.h"
#import <memory>
#import <vector>
@interface PaddleMobile()
{
paddle_mobile::PaddleMobile<paddle_mobile::CPU, paddle_mobile::Precision::FP32> *pam_;
BOOL loaded_;
}
@end
@implementation PaddleMobile
static std::mutex shared_mutex;
- (instancetype)init {
if (self = [super init]) {
pam_ = new paddle_mobile::PaddleMobile<paddle_mobile::CPU, paddle_mobile::Precision::FP32>();
}
return self;
}
- (void)dealloc {
if (pam_) {
delete pam_;
}
}
+ (instancetype)sharedInstance{
static dispatch_once_t onceToken;
static id sharedManager = nil;
dispatch_once(&onceToken, ^{
sharedManager = [[[self class] alloc] init];
});
return sharedManager;
}
- (BOOL)load:(NSString *)modelPath andWeightsPath:(NSString *)weighsPath{
std::string model_path_str = std::string([modelPath UTF8String]);
std::string weights_path_str = std::string([weighsPath UTF8String]);
if (loaded_ = pam_->Load(model_path_str, weights_path_str, false)) {
return YES;
} else {
return NO;
}
}
- (BOOL)load:(NSString *)modelAndWeightPath{
std::string model_path_str = std::string([modelAndWeightPath UTF8String]);
if (loaded_ = pam_->Load(model_path_str)) {
return YES;
} else {
return NO;
}
}
-(void)preprocess:(const UInt8 *)input output:(float *)output imageWidth:(int)imageWidth imageHeight:(int)imageHeight imageChannels:(int)imageChannels means:(NSArray<NSNumber *> *)means scale:(float)scale dim:(std::vector<int64_t>)dim{
if (means == nil) {
means = @[@0, @0, @0];
}
int wanted_input_width = dim[3];
int wanted_input_height = dim[2];
int wanted_input_channels = dim[1];
for (int c = 0; c < wanted_input_channels; ++c) {
float *out_channel = output + c * wanted_input_height * wanted_input_width;
for (int y = 0; y < wanted_input_height; ++y) {
float *out_row = out_channel + y * wanted_input_width;
for (int x = 0; x < wanted_input_width; ++x) {
int in_row = (y * imageHeight) / wanted_input_height;
int in_col = (x * imageWidth) / wanted_input_width;
const UInt8 *in_pixel = input + (in_row * imageWidth * imageChannels) + (in_col * imageChannels);
float *out_pos = out_row + x;
if (c == 0) {
*out_pos = (in_pixel[c] - means[c].floatValue) * scale;
}else if (c == 1){
*out_pos = (in_pixel[c] - means[c].floatValue) * scale;
}else if (c == 2){
*out_pos = (in_pixel[c] - means[c].floatValue) * scale;
}
}
}
}
}
- (NSArray *)predict:(CGImageRef)image dim:(NSArray<NSNumber *> *)dim means:(NSArray<NSNumber *> *)means scale:(float)scale{
std::lock_guard<std::mutex> lock(shared_mutex);
if (!loaded_) {
printf("PaddleMobile doesn't be loaded yet");
return nil;
}
if (dim.count != 4) {
printf("dim must have 4 elements");
return nil;
}
// dim to c++ vector, get numel
std::vector<int64_t > dim_vec;
int numel = 1;
for (int k = 0; k < dim.count; ++k) {
int d = dim[k].intValue;
numel *= d;
dim_vec.push_back(d);
}
const int sourceRowBytes = CGImageGetBytesPerRow(image);
const int image_width = CGImageGetWidth(image);
const int image_height = CGImageGetHeight(image);
const int image_channels = 4;
CGDataProviderRef provider = CGImageGetDataProvider(image);
CFDataRef cfData = CGDataProviderCopyData(provider);
const UInt8 *input = CFDataGetBytePtr(cfData);
// sample image
float *output = (float *)malloc(numel*sizeof(float));
[self preprocess:input output:output imageWidth:image_width imageHeight:image_height imageChannels:image_channels means:means scale:scale dim:dim_vec];
float *dataPointer = nullptr;
if (nullptr != output) {
dataPointer = output;
} else {
return nil;
}
// input
std::vector<float> predict_input;
for (int j = 0; j < numel; ++j) {
predict_input.push_back(dataPointer[j]);
}
// predict
std::vector<float> cpp_result = pam_->Predict(predict_input, dim_vec);
// result
long count = 0;
count = cpp_result.size();
NSMutableArray *result = [[NSMutableArray alloc] init];
for (int i = 0; i < count; i++) {
[result addObject:[NSNumber numberWithFloat:cpp_result[i]]];
}
free(output);
// 待验证
// if ([UIDevice currentDevice].systemVersion.doubleValue < 11.0) {
CFRelease(cfData);
cfData = NULL;
// }
return result;
}
- (NSArray *)predict:(CGImageRef)image dim:(NSArray<NSNumber *> *)dim {
[self predict:image dim:dim means:nil scale:1];
}
- (void)clear{
pam_->Clear();
}
@end
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include "operators/batchnorm_op.h"
#include "operators/box_coder_op.h"
#include "operators/concat_op.h"
#include "operators/conv_op.h"
#include "operators/depthwise_conv_op.h"
#include "operators/dropout_op.h"
#include "operators/elementwise_add_op.h"
#include "operators/feed_op.h"
#include "operators/fetch_op.h"
#include "operators/fusion_conv_add.h"
#include "operators/fusion_conv_add_bn_relu_op.h"
#include "operators/fusion_fc_op.h"
#include "operators/im2sequence_op.h"
#include "operators/lrn_op.h"
#include "operators/mul_op.h"
#include "operators/multiclass_nms_op.h"
#include "operators/pool_op.h"
#include "operators/prior_box_op.h"
#include "operators/relu_op.h"
#include "operators/reshape_op.h"
#include "operators/sigmoid_op.h"
#include "operators/softmax_op.h"
#include "operators/transpose_op.h"
......@@ -15,6 +15,11 @@ limitations under the License. */
#ifdef ANDROID
#include "paddle_mobile_jni.h"
#include <cmath>
#include "common/log.h"
#include "framework/tensor.h"
#include "io/paddle_mobile.h"
#ifdef __cplusplus
extern "C" {
#endif
......@@ -28,17 +33,16 @@ using std::string;
extern const char *ANDROID_LOG_TAG =
"paddle_mobile LOG built on " __DATE__ " " __TIME__;
static Executor<CPU> *shared_executor_instance = nullptr;
static PaddleMobile<CPU> *shared_paddle_mobile_instance = nullptr;
// toDo mutex lock
// static std::mutex shared_mutex;
Executor<CPU> *getExecutorInstance(const Program<CPU> p, int batch_size,
bool use_optimize) {
if (nullptr == shared_executor_instance) {
shared_executor_instance = new Executor<CPU>(p, batch_size, use_optimize);
PaddleMobile<CPU> *getPaddleMobileInstance() {
if (nullptr == shared_paddle_mobile_instance) {
shared_paddle_mobile_instance = new PaddleMobile<CPU>();
}
return shared_executor_instance;
return shared_paddle_mobile_instance;
}
string jstring2cppstring(JNIEnv *env, jstring jstr) {
......@@ -51,15 +55,51 @@ string jstring2cppstring(JNIEnv *env, jstring jstr) {
JNIEXPORT jboolean JNICALL Java_com_baidu_paddle_PML_load(JNIEnv *env,
jclass thiz,
jstring modelPath) {
paddle_mobile::Loader<paddle_mobile::CPU> loader;
ANDROIDLOGI("load invoked");
bool optimize = true;
return getPaddleMobileInstance()->Load(jstring2cppstring(env, modelPath),
optimize);
}
JNIEXPORT jboolean JNICALL Java_com_baidu_paddle_PML_loadQualified(
JNIEnv *env, jclass thiz, jstring modelPath) {
ANDROIDLOGI("loadQualified invoked");
bool optimize = true;
bool qualified = true;
return getPaddleMobileInstance()->Load(jstring2cppstring(env, modelPath),
optimize, qualified);
}
JNIEXPORT jboolean JNICALL Java_com_baidu_paddle_PML_loadCombined(
JNIEnv *env, jclass thiz, jstring modelPath, jstring paramPath) {
ANDROIDLOGI("loadCombined invoked");
bool optimize = true;
return getPaddleMobileInstance()->Load(jstring2cppstring(env, modelPath),
jstring2cppstring(env, paramPath),
optimize);
}
JNIEXPORT jboolean JNICALL Java_com_baidu_paddle_PML_loadCombinedQualified(
JNIEnv *env, jclass thiz, jstring modelPath, jstring paramPath) {
ANDROIDLOGI("loadCombinedQualified invoked");
bool optimize = true;
auto program = loader.Load(jstring2cppstring(env, modelPath), optimize);
shared_executor_instance = getExecutorInstance(program, 1, optimize);
return shared_executor_instance != nullptr ? JNI_TRUE : JNI_FALSE;
bool qualified = true;
return getPaddleMobileInstance()->Load(jstring2cppstring(env, modelPath),
jstring2cppstring(env, paramPath),
optimize, qualified);
}
JNIEXPORT jfloatArray JNICALL Java_com_baidu_paddle_PML_predictImage(
JNIEnv *env, jclass thiz, jfloatArray buf) {
JNIEnv *env, jclass thiz, jfloatArray buf, jintArray ddims) {
ANDROIDLOGI("predictImage invoked");
jsize ddim_size = env->GetArrayLength(ddims);
if (ddim_size != 4) {
ANDROIDLOGE("ddims size not equal to 4");
}
jint *ddim_ptr = env->GetIntArrayElements(ddims, NULL);
framework::DDim ddim = framework::make_ddim(
{ddim_ptr[0], ddim_ptr[1], ddim_ptr[2], ddim_ptr[3]});
int length = framework::product(ddim);
jfloatArray result = NULL;
int count = 0;
float *dataPointer = nullptr;
......@@ -67,21 +107,116 @@ JNIEXPORT jfloatArray JNICALL Java_com_baidu_paddle_PML_predictImage(
dataPointer = env->GetFloatArrayElements(buf, NULL);
}
framework::Tensor input;
framework::DDim ddim = framework::make_ddim({1, 3, 224, 224});
input.Resize(ddim);
auto input_ptr = input.mutable_data<float>();
for (int i = 0; i < framework::product(ddim); i++) {
for (int i = 0; i < length; i++) {
input_ptr[i] = dataPointer[i];
}
auto output = shared_executor_instance->Predict(input);
auto output = shared_paddle_mobile_instance->Predict(input);
count = output->numel();
result = env->NewFloatArray(count);
env->SetFloatArrayRegion(result, 0, count, output->data<float>());
env->ReleaseIntArrayElements(ddims, ddim_ptr, 0);
ANDROIDLOGI("predictImage finished");
return result;
}
inline int yuv_to_rgb(int y, int u, int v, float *r, float *g, float *b) {
int r1 = (int)(y + 1.370705 * (v - 128));
int g1 = (int)(y - 0.698001 * (u - 128) - 0.703125 * (v - 128));
int b1 = (int)(y + 1.732446 * (u - 128));
r1 = (int)fminf(255, fmaxf(0, r1));
g1 = (int)fminf(255, fmaxf(0, g1));
b1 = (int)fminf(255, fmaxf(0, b1));
*r = r1;
*g = g1;
*b = b1;
return 0;
}
void convert_nv21_to_matrix(uint8_t *nv21, float *matrix, int width, int height,
int targetWidth, int targetHeight, float *means) {
const uint8_t *yData = nv21;
const uint8_t *vuData = nv21 + width * height;
const int yRowStride = width;
const int vuRowStride = width;
float scale_x = width * 1.0 / targetWidth;
float scale_y = height * 1.0 / targetHeight;
for (int j = 0; j < targetHeight; ++j) {
int y = j * scale_y;
const uint8_t *pY = yData + y * yRowStride;
const uint8_t *pVU = vuData + (y >> 1) * vuRowStride;
for (int i = 0; i < targetWidth; ++i) {
int x = i * scale_x;
const int offset = ((x >> 1) << 1);
float r = 0;
float g = 0;
float b = 0;
yuv_to_rgb(pY[x], pVU[offset + 1], pVU[offset], &r, &g, &b);
int r_index = j * targetWidth + i;
int g_index = r_index + targetWidth * targetHeight;
int b_index = g_index + targetWidth * targetHeight;
matrix[r_index] = r - means[0];
matrix[g_index] = g - means[1];
matrix[b_index] = b - means[2];
}
}
}
JNIEXPORT jfloatArray JNICALL Java_com_baidu_paddle_PML_predictYuv(
JNIEnv *env, jclass thiz, jbyteArray yuv_, jint imgwidth, jint imgHeight,
jintArray ddims, jfloatArray meanValues) {
ANDROIDLOGI("predictYuv invoked");
jsize ddim_size = env->GetArrayLength(ddims);
if (ddim_size != 4) {
ANDROIDLOGE("ddims size not equal to 4");
}
jint *ddim_ptr = env->GetIntArrayElements(ddims, NULL);
framework::DDim ddim = framework::make_ddim(
{ddim_ptr[0], ddim_ptr[1], ddim_ptr[2], ddim_ptr[3]});
int length = framework::product(ddim);
float matrix[length];
jbyte *yuv = env->GetByteArrayElements(yuv_, NULL);
float *meansPointer = nullptr;
if (nullptr != meanValues) {
meansPointer = env->GetFloatArrayElements(meanValues, NULL);
}
convert_nv21_to_matrix((uint8_t *)yuv, matrix, imgwidth, imgHeight, ddim[3],
ddim[2], meansPointer);
jfloatArray result = NULL;
int count = 0;
framework::Tensor input;
input.Resize(ddim);
auto input_ptr = input.mutable_data<float>();
for (int i = 0; i < length; i++) {
input_ptr[i] = matrix[i];
}
auto output = shared_paddle_mobile_instance->Predict(input);
count = output->numel();
result = env->NewFloatArray(count);
env->SetFloatArrayRegion(result, 0, count, output->data<float>());
env->ReleaseByteArrayElements(yuv_, yuv, 0);
env->ReleaseIntArrayElements(ddims, ddim_ptr, 0);
env->ReleaseFloatArrayElements(meanValues, meansPointer, 0);
ANDROIDLOGI("predictYuv finished");
return result;
}
JNIEXPORT void JNICALL Java_com_baidu_paddle_PML_setThread(JNIEnv *env,
jclass thiz,
jint threadCount) {
ANDROIDLOGI("setThreadCount %d", threadCount);
getPaddleMobileInstance()->SetThreadNum((int)threadCount);
}
JNIEXPORT void JNICALL Java_com_baidu_paddle_PML_clear(JNIEnv *env,
jclass thiz) {}
jclass thiz) {
getPaddleMobileInstance()->Clear();
}
} // namespace jni
} // namespace paddle_mobile
......
......@@ -15,9 +15,6 @@ limitations under the License. */
#pragma once
#ifdef ANDROID
#include <jni.h>
#include "common/log.h"
#include "framework/tensor.h"
#include "io/io.h"
#ifdef __cplusplus
extern "C" {
......@@ -25,18 +22,54 @@ extern "C" {
namespace paddle_mobile {
namespace jni {
/**
* load model & params of the net for android
* load separated model for android
*/
JNIEXPORT jboolean JNICALL Java_com_baidu_paddle_PML_load(JNIEnv *env,
jclass thiz,
jstring modelPath);
/**
* load separated qualified model for android
*/
JNIEXPORT jboolean JNICALL Java_com_baidu_paddle_PML_loadQualified(
JNIEnv *env, jclass thiz, jstring modelPath);
/**
* load combined model for android
*/
JNIEXPORT jboolean JNICALL Java_com_baidu_paddle_PML_loadCombined(
JNIEnv *env, jclass thiz, jstring modelPath, jstring paramPath);
/**
* load combined qualified model for android
*/
JNIEXPORT jboolean JNICALL Java_com_baidu_paddle_PML_loadCombinedQualified(
JNIEnv *env, jclass thiz, jstring modelPath, jstring paramPath);
/**
* object detection for anroid
*/
JNIEXPORT jfloatArray JNICALL Java_com_baidu_paddle_PML_predictImage(
JNIEnv *env, jclass thiz, jfloatArray buf);
JNIEnv *env, jclass thiz, jfloatArray buf, jintArray ddims);
/**
* object detection for anroid
*/
JNIEXPORT jfloatArray JNICALL Java_com_baidu_paddle_PML_predictYuv(
JNIEnv *env, jclass thiz, jbyteArray yuv, jint imgwidth, jint imgHeight,
jintArray ddims, jfloatArray meanValues);
/**
* object detection for anroid
*/
JNIEXPORT jfloatArray JNICALL
Java_com_baidu_paddle_PML_predict(JNIEnv *env, jclass thiz, jfloatArray buf);
/**
* setThreadCount for multithread
*/
JNIEXPORT void JNICALL Java_com_baidu_paddle_PML_setThread(JNIEnv *env,
jclass thiz,
jint threadCount);
/**
* clear data of the net when destroy for android
*/
......
......@@ -12,16 +12,36 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include "memory/t_malloc.h"
#include <cstdlib>
#include <cstring>
#ifdef PADDLE_MOBILE_FPGA
#include "fpga/api/fpga_api.h"
#endif
namespace paddle_mobile {
namespace memory {
const int MALLOC_ALIGN = 64;
#ifdef PADDLE_MOBILE_FPGA
namespace fpga = paddle_mobile::fpga;
void Copy(void *dst, const void *src, size_t num) {
std::memcpy(dst, src, num);
}
void *Alloc(size_t size) { return fpga::fpga_malloc(size); }
void Free(void *ptr) {
if (ptr) {
fpga::fpga_free(ptr);
}
}
#else
void Copy(void *dst, const void *src, size_t num) {
std::memcpy(dst, src, num);
}
......@@ -44,5 +64,7 @@ void Free(void *ptr) {
}
}
#endif
} // namespace memory
} // namespace paddle_mobile
......@@ -26,16 +26,16 @@ void BatchNormOp<Dtype, T>::InferShape() const {
auto x_dims = this->param_.InputX()->dims();
this->param_.OutputY()->Resize(x_dims);
}
template class BatchNormOp<CPU, float>;
} // namespace operators
} // namespace paddle_mobile
namespace ops = paddle_mobile::operators;
#ifdef PADDLE_MOBILE_CPU
USE_OP_CPU(batch_norm);
REGISTER_OPERATOR_CPU(batch_norm, ops::BatchNormOp);
#endif
#ifdef PADDLE_MOBILE_MALI_GPU
REGISTER_OPERATOR_MALI_GPU(batch_norm, ops::BatchNormOp);
#endif
#ifdef PADDLE_MOBILE_FPGA
#endif
......
......@@ -45,4 +45,13 @@ class BatchNormOp
} // namespace operators
} // namespace paddle_mobile
#ifdef PADDLE_MOBILE_CPU
USE_OP_CPU(batch_norm);
#endif
#ifdef PADDLE_MOBILE_MALI_GPU
USE_OP_MALI_GPU(batch_norm);
#endif
#ifdef PADDLE_MOBILE_FPGA
#endif
#endif
......@@ -47,13 +47,12 @@ void BoxCoderOp<Dtype, T>::InferShape() const {
this->param_.OutputBox()->Resize(framework::make_ddim(
{input_targetbox_dims[0], input_priorbox_dims[0], 4}));
}
template class BoxCoderOp<CPU, float>;
} // namespace operators
} // namespace paddle_mobile
namespace ops = paddle_mobile::operators;
#ifdef PADDLE_MOBILE_CPU
USE_OP_CPU(box_coder);
REGISTER_OPERATOR_CPU(box_coder, ops::BoxCoderOp);
#endif
#ifdef PADDLE_MOBILE_MALI_GPU
......
......@@ -51,4 +51,12 @@ class BoxCoderOp
} // namespace operators
} // namespace paddle_mobile
#ifdef PADDLE_MOBILE_CPU
USE_OP_CPU(box_coder);
#endif
#ifdef PADDLE_MOBILE_MALI_GPU
#endif
#ifdef PADDLE_MOBILE_FPGA
#endif
#endif
......@@ -14,7 +14,9 @@ limitations under the License. */
#ifdef CONCAT_OP
#include "concat_op.h"
#include <vector>
#include "operators/concat_op.h"
namespace paddle_mobile {
namespace operators {
......@@ -56,19 +58,19 @@ void ConcatOp<Dtype, T>::InferShape() const {
this->param_.Out()->Resize(out_dims);
}
template class ConcatOp<CPU, float>;
} // namespace operators
} // namespace paddle_mobile
namespace ops = paddle_mobile::operators;
#ifdef PADDLE_MOBILE_CPU
USE_OP_CPU(concat);
REGISTER_OPERATOR_CPU(concat, ops::ConcatOp);
#endif
#ifdef PADDLE_MOBILE_MALI_GPU
REGISTER_OPERATOR_MALI_GPU(concat, ops::ConcatOp);
#endif
#ifdef PADDLE_MOBILE_FPGA
REGISTER_OPERATOR_FPGA(concat, ops::ConcatOp);
#endif
#endif
......@@ -46,4 +46,14 @@ class ConcatOp
} // namespace operators
} // namespace paddle_mobile
#ifdef PADDLE_MOBILE_CPU
USE_OP_CPU(concat);
#endif
#ifdef PADDLE_MOBILE_MALI_GPU
USE_OP_MALI_GPU(concat);
#endif
#ifdef PADDLE_MOBILE_FPGA
USE_OP_FPGA(concat);
#endif
#endif
......@@ -48,22 +48,17 @@ void ConvOp<Dtype, T>::InferShape() const {
this->param_.Output()->Resize(ddim);
}
template class ConvOp<CPU, float>;
} // namespace operators
} // namespace paddle_mobile
namespace ops = paddle_mobile::operators;
#ifdef PADDLE_MOBILE_CPU
USE_OP_CPU(conv2d);
REGISTER_OPERATOR_CPU(conv2d, ops::ConvOp);
#endif
#ifdef PADDLE_MOBILE_MALI_GPU
USE_OP_MALI_GPU(conv2d);
REGISTER_OPERATOR_MALI_GPU(conv2d, ops::ConvOp);
#endif
#ifdef PADDLE_MOBILE_FPGA
USE_OP_FPGA(conv2d);
REGISTER_OPERATOR_FPGA(conv2d, ops::ConvOp);
#endif
......
......@@ -46,4 +46,14 @@ class ConvOp
} // namespace operators
} // namespace paddle_mobile
#ifdef PADDLE_MOBILE_CPU
USE_OP_CPU(conv2d);
#endif
#ifdef PADDLE_MOBILE_MALI_GPU
USE_OP_MALI_GPU(conv2d);
#endif
#ifdef PADDLE_MOBILE_FPGA
USE_OP_FPGA(conv2d);
#endif
#endif
......@@ -12,16 +12,12 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifdef PADDLE_MOBILE_USE_OPENMP
/**
* android-ndk-r17 has a problem when linking with openmp.
* if paddle-mobile enables -fopenmp, but didn't use those omp_* functions,
* after linking another binary with libpaddle-mobile.so, the omp_get_thread_num
* will not work. see test/common/test_openmp.cc the detailed reason is still
* unclear, but this trick will work. a better solution is hacking the linker,
* try some flags to make it link omp_* functions, but I didn't find out how to
* make it work.
*/
#include <omp.h>
static int _ = omp_get_num_procs();
#ifdef CONV_TRANSPOSE
#include "operators/conv_transpose_op.h"
namespace paddle_mobile {
namespace operators {}
} // namespace paddle_mobile
#endif
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifdef CONV_TRANSPOSE
#pragma once
#include <string>
#include <vector>
#include "framework/operator.h"
#include "operators/kernel/conv_transpose_kernel.h"
namespace paddle_mobile {
namespace operators {
template <typename DeviceType, typename T>
class ConvOpTranspose : public framework::OperatorWithKernel<
DeviceType, ConvTransposeParam,
operators::ConvTransposeKernel<DeviceType, T>> {
public:
ConvOpTranspose(const std::string &type, const VariableNameMap &inputs,
const VariableNameMap &outputs,
const framework::AttributeMap &attrs,
std::shared_ptr<framework::Scope> scope)
: framework::OperatorWithKernel<
DeviceType, ConvTransposeParam,
operators::ConvTransposeKernel<DeviceType, T>>(
type, inputs, outputs, attrs, scope) {}
void InferShape() const {
auto input = this->param_.Input();
auto in_dims = input->dims();
auto filter = this->param_.Filter();
auto filter_dims = filter->dims();
std::vector<int> strides = this->param_.Strides();
std::vector<int> paddings = this->param_.Paddings();
std::vector<int> dilations = this->param_.Dilations();
int groups = this->param_.Groups();
PADDLE_MOBILE_ENFORCE(
in_dims.size() == 4 || in_dims.size() == 5,
"ConvTransposeOp intput should be 4-D or 5-D tensor.");
PADDLE_MOBILE_ENFORCE(
in_dims.size() == filter_dims.size(),
"ConvTransposeOp input dimension and filter dimension "
"should be the same.");
PADDLE_MOBILE_ENFORCE(
in_dims.size() - strides.size() == 2U,
"ConvTransposeOp input dimension and strides dimension should "
"be consistent.");
PADDLE_MOBILE_ENFORCE(paddings.size() == strides.size(),
"ConvTransposeOp paddings dimension and strides "
"dimension should be the same.");
PADDLE_MOBILE_ENFORCE(paddings.size() == dilations.size(),
"ConvTransposeOp paddings dimension and dilations "
"dimension should be the same.");
PADDLE_MOBILE_ENFORCE(
in_dims[1] == filter_dims[0],
"In ConvTransposeOp, The number of input channels should "
"be equal to the number of filter's channels.");
std::vector<int64_t> output_shape({in_dims[0], filter_dims[1] * groups});
for (size_t i = 0; i < strides.size(); ++i) {
auto filter_extent = dilations[i] * (filter_dims[i + 2] - 1) + 1;
output_shape.push_back((in_dims[i + 2] - 1) * strides[i] -
2 * paddings[i] + filter_extent);
}
this->param_.Output()->Resize(framework::make_ddim(output_shape));
}
private:
};
} // namespace operators
} // namespace paddle_mobile
#endif
......@@ -49,14 +49,11 @@ void DepthwiseConvOp<Dtype, T>::InferShape() const {
this->param_.Output()->Resize(ddim);
}
template class DepthwiseConvOp<CPU, float>;
} // namespace operators
} // namespace paddle_mobile
namespace ops = paddle_mobile::operators;
#ifdef PADDLE_MOBILE_CPU
USE_OP_CPU(depthwise_conv2d);
REGISTER_OPERATOR_CPU(depthwise_conv2d, ops::DepthwiseConvOp);
#endif
#ifdef PADDLE_MOBILE_MALI_GPU
......
......@@ -48,4 +48,12 @@ class DepthwiseConvOp : public framework::OperatorWithKernel<
} // namespace operators
} // namespace paddle_mobile
#ifdef PADDLE_MOBILE_CPU
USE_OP_CPU(depthwise_conv2d);
#endif
#ifdef PADDLE_MOBILE_MALI_GPU
#endif
#ifdef PADDLE_MOBILE_FPGA
#endif
#endif
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifdef DROPOUT_OP
#include "operators/dropout_op.h"
namespace paddle_mobile {
namespace operators {
template <typename Dtype, typename T>
void DropoutOp<Dtype, T>::InferShape() const {
auto input_dims = this->param_.InputX()->dims();
this->param_.Out()->Resize(input_dims);
}
} // namespace operators
} // namespace paddle_mobile
namespace ops = paddle_mobile::operators;
#ifdef PADDLE_MOBILE_CPU
REGISTER_OPERATOR_CPU(dropout, ops::DropoutOp);
#endif
#ifdef PADDLE_MOBILE_MALI_GPU
#endif
#ifdef PADDLE_MOBILE_FPGA
REGISTER_OPERATOR_FPGA(dropout, ops::DropoutOp);
#endif
#endif
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifdef DROPOUT_OP
#pragma once
#include <string>
#include "framework/operator.h"
#include "operators/kernel/dropout_kernel.h"
#include "operators/op_param.h"
namespace paddle_mobile {
namespace operators {
using paddle_mobile::framework::Tensor;
template <typename DeviceType, typename T>
class DropoutOp
: public framework::OperatorWithKernel<
DeviceType, DropoutParam, operators::DropoutKernel<DeviceType, T>> {
public:
DropoutOp(const std::string &type, const VariableNameMap &inputs,
const VariableNameMap &outputs, const framework::AttributeMap attrs,
std::shared_ptr<framework::Scope> scope)
: framework::OperatorWithKernel<DeviceType, DropoutParam,
operators::DropoutKernel<DeviceType, T>>(
type, inputs, outputs, attrs, scope) {}
// using framework::OperatorWithKernel<DeviceType, DropoutParam,
// operators::DropoutKernel<DeviceType,
// T>>;
void InferShape() const override;
protected:
};
} // namespace operators
} // namespace paddle_mobile
#ifdef PADDLE_MOBILE_CPU
USE_OP_CPU(dropout);
#endif
#ifdef PADDLE_MOBILE_MALI_GPU
#endif
#ifdef PADDLE_MOBILE_FPGA
USE_OP_FPGA(dropout);
#endif
#endif
......@@ -24,16 +24,16 @@ void ElementwiseAddOp<Dtype, T>::InferShape() const {
auto x_dim = this->param_.InputX()->dims();
this->param_.Out()->Resize(x_dim);
}
template class ElementwiseAddOp<CPU, float>;
} // namespace operators
} // namespace paddle_mobile
namespace ops = paddle_mobile::operators;
#ifdef PADDLE_MOBILE_CPU
USE_OP_CPU(elementwise_add);
REGISTER_OPERATOR_CPU(elementwise_add, ops::ElementwiseAddOp);
#endif
#ifdef PADDLE_MOBILE_MALI_GPU
REGISTER_OPERATOR_MALI_GPU(elementwise_add, ops::ElementwiseAddOp);
#endif
#ifdef PADDLE_MOBILE_FPGA
#endif
......
......@@ -48,4 +48,13 @@ class ElementwiseAddOp : public framework::OperatorWithKernel<
} // namespace operators
} // namespace paddle_mobile
#ifdef PADDLE_MOBILE_CPU
USE_OP_CPU(elementwise_add);
#endif
#ifdef PADDLE_MOBILE_MALI_GPU
USE_OP_MALI_GPU(elementwise_add);
#endif
#ifdef PADDLE_MOBILE_FPGA
#endif
#endif
......@@ -14,8 +14,16 @@ limitations under the License. */
#include "feed_op.h"
namespace paddle_mobile {
namespace operators {
template class FeedOp<CPU, float>;
}
namespace operators {}
} // namespace paddle_mobile
namespace ops = paddle_mobile::operators;
#ifdef PADDLE_MOBILE_CPU
REGISTER_OPERATOR_CPU(feed, ops::FeedOp);
#endif
#ifdef PADDLE_MOBILE_MALI_GPU
REGISTER_OPERATOR_MALI_GPU(feed, ops::FeedOp);
#endif
#ifdef PADDLE_MOBILE_FPGA
REGISTER_OPERATOR_FPGA(feed, ops::FeedOp);
#endif
......@@ -29,8 +29,7 @@ class FeedOp : public framework::OperatorBase<DeviceType> {
std::shared_ptr<framework::Scope> scope)
: framework::OperatorBase<DeviceType>(type, inputs, outputs, attrs,
scope),
param_(inputs, outputs, attrs, *scope) {}
void RunImpl() const { param_.Out()->ShareDataWith(*param_.InputX()); }
param_(inputs, outputs, attrs, scope.get()) {}
void InferShape() const {
auto out_dims = param_.Out()->dims();
......@@ -38,19 +37,42 @@ class FeedOp : public framework::OperatorBase<DeviceType> {
param_.Out()->Resize(out_dims);
}
#ifdef PADDLE_MOBILE_FPGA
void RunImpl() const { fpga::PerformBypass(param_.FpgaArgs()); }
void Init() {
const Tensor *input = param_.InputX();
auto input_ptr = input->data<float>();
Tensor *output = param_.Out();
auto output_ptr = output->mutable_data<half>();
fpga::BypassArgs args;
args.convert_type = fpga::DATA_FP32_TO_FP16;
args.layout_type = fpga::LAYOUT_CHW_TO_HWC;
args.image.address = (void *)input_ptr;
args.image.channels = input->dims()[1];
args.image.height = input->dims()[2];
args.image.width = input->dims()[3];
args.output.address = output_ptr;
param_.SetFpgaArgs(args);
}
#else
void RunImpl() const { param_.Out()->ShareDataWith(*param_.InputX()); }
void Init() {}
#endif
protected:
FeedParam param_;
};
namespace ops = paddle_mobile::operators;
} // namespace operators
} // namespace paddle_mobile
#ifdef PADDLE_MOBILE_CPU
USE_OP_CPU(feed);
REGISTER_OPERATOR_CPU(feed, ops::FeedOp);
#endif
#ifdef PADDLE_MOBILE_MALI_GPU
USE_OP_MALI_GPU(feed);
#endif
#ifdef PADDLE_MOBILE_FPGA
USE_OP_FPGA(feed);
#endif
} // namespace operators
} // namespace paddle_mobile
......@@ -14,8 +14,16 @@ limitations under the License. */
#include "fetch_op.h"
namespace paddle_mobile {
namespace operators {
template class FetchOp<CPU, float>;
}
namespace operators {}
} // namespace paddle_mobile
namespace ops = paddle_mobile::operators;
#ifdef PADDLE_MOBILE_CPU
REGISTER_OPERATOR_CPU(fetch, ops::FetchOp);
#endif
#ifdef PADDLE_MOBILE_MALI_GPU
REGISTER_OPERATOR_MALI_GPU(fetch, ops::FetchOp);
#endif
#ifdef PADDLE_MOBILE_FPGA
REGISTER_OPERATOR_FPGA(fetch, ops::FetchOp);
#endif
......@@ -33,6 +33,8 @@ class FetchOp : public framework::OperatorBase<DeviceType> {
param_(inputs, outputs, attrs, *scope) {}
void RunImpl() const { param_.Out()->ShareDataWith(*param_.InputX()); }
void Init() {}
void InferShape() const {
auto x_dims = param_.InputX()->dims();
param_.Out()->Resize(x_dims);
......@@ -42,15 +44,15 @@ class FetchOp : public framework::OperatorBase<DeviceType> {
FetchParam param_;
};
namespace ops = paddle_mobile::operators;
} // namespace operators
} // namespace paddle_mobile
#ifdef PADDLE_MOBILE_CPU
USE_OP_CPU(fetch);
REGISTER_OPERATOR_CPU(fetch, ops::FetchOp);
#endif
#ifdef PADDLE_MOBILE_MALI_GPU
USE_OP_MALI_GPU(fetch);
#endif
#ifdef PADDLE_MOBILE_FPGA
USE_OP_FPGA(fetch);
#endif
} // namespace operators
} // namespace paddle_mobile
......@@ -44,16 +44,16 @@ void FusionConvAddOp<Dtype, T>::InferShape() const {
framework::DDim ddim = framework::make_ddim(output_shape);
this->param_.Output()->Resize(ddim);
}
template class FusionConvAddOp<CPU, float>;
} // namespace operators
} // namespace paddle_mobile
namespace ops = paddle_mobile::operators;
#ifdef PADDLE_MOBILE_CPU
USE_OP_CPU(conv_add);
REGISTER_OPERATOR_CPU(conv_add, ops::FusionConvAddOp);
REGISTER_OPERATOR_CPU(fusion_conv_add, ops::FusionConvAddOp);
#endif
#ifdef PADDLE_MOBILE_MALI_GPU
REGISTER_OPERATOR_MALI_GPU(fusion_conv_add, ops::FusionConvAddOp);
#endif
#ifdef PADDLE_MOBILE_FPGA
#endif
......
......@@ -11,9 +11,8 @@ distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#define FUSION_CONVADD_OP
#ifdef FUSION_CONVADD_OP
#ifdef FUSION_CONVADD_OP
#pragma once
#include <string>
......@@ -37,13 +36,11 @@ class FusionConvAddMatcher : public framework::FusionOpMatcher {
void FolderNodes(
framework::Node *node,
std::vector<std::shared_ptr<framework::Node>> *removed_nodes) {
vector<std::shared_ptr<framework::OpDesc>> origin_descs =
node->OpDescs(node_.Depth());
node->Folder(node_.Depth(), Type(),
{{G_OP_TYPE_ELEMENTWISE_ADD, {"Y", "Y"}}}, removed_nodes);
{{G_OP_TYPE_ELEMENTWISE_ADD, {{"Y", "Y"}}}}, removed_nodes);
}
std::string Type() { return G_OP_TYPE_CONV_ADD; }
std::string Type() { return G_OP_TYPE_FUSION_CONV_ADD; }
};
template <typename DeviceType, typename T>
......@@ -68,15 +65,39 @@ class FusionConvAddOp : public framework::OperatorWithKernel<
};
#ifdef PADDLE_MOBILE_CPU
#ifndef CONV_ADD_REGISTER
static framework::FusionOpRegistrar convadd_registrar(
new FusionConvAddMatcher());
#define CONV_ADD_REGISTER
#endif
#endif
#ifdef PADDLE_MOBILE_MALI_GPU
#ifndef CONV_ADD_REGISTER
static framework::FusionOpRegistrar convadd_registrar(
new FusionConvAddMatcher());
#define CONV_ADD_REGISTER
#endif
#endif
#ifdef PADDLE_MOBILE_FPGA
#endif
} // namespace operators
} // namespace paddle_mobile
#ifdef PADDLE_MOBILE_CPU
USE_OP_CPU(fusion_conv_add);
#endif
#ifdef PADDLE_MOBILE_MALI_GPU
USE_OP_MALI_GPU(fusion_conv_add);
#endif
#ifdef PADDLE_MOBILE_FPGA
#endif
#endif
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifdef FUSION_CONVADDBN_OP
#include "operators/fusion_conv_add_bn_op.h"
#include "operators/math/conv_func.h"
namespace paddle_mobile {
namespace operators {
template <typename Dtype, typename T>
void FusionConvAddBNOp<Dtype, T>::InferShape() const {
auto in_dims = this->param_.Input()->dims();
auto filter_dims = this->param_.Filter()->dims();
const std::vector<int> &strides = this->param_.Strides();
std::vector<int> paddings = this->param_.Paddings();
int groups = this->param_.Groups();
std::vector<int> dilations = this->param_.Dilations();
PADDLE_MOBILE_ENFORCE((in_dims.size() == filter_dims.size() &&
dilations.size() == paddings.size() &&
paddings.size() == strides.size()),
"ConvParam is not suitable");
std::vector<int64_t> output_shape({in_dims[0], filter_dims[0]});
for (size_t i = 0; i < strides.size(); ++i) {
output_shape.push_back(
math::ConvOutputSize(in_dims[i + 2], filter_dims[i + 2], dilations[i],
paddings[i], strides[i]));
}
framework::DDim ddim = framework::make_ddim(output_shape);
this->param_.Output()->Resize(ddim);
}
} // namespace operators
} // namespace paddle_mobile
namespace ops = paddle_mobile::operators;
#ifdef PADDLE_MOBILE_CPU
REGISTER_OPERATOR_CPU(fusion_conv_add_bn, ops::FusionConvAddBNOp);
#endif
#ifdef PADDLE_MOBILE_MALI_GPU
#endif
#ifdef PADDLE_MOBILE_FPGA
REGISTER_OPERATOR_FPGA(fusion_conv_add_bn, ops::FusionConvAddBNOp);
#endif
#endif
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifdef FUSION_CONVADDBN_OP
#pragma once
#include <string>
#include <vector>
#include "framework/operator.h"
#include "framework/program/program-optimize/fusion_op_register.h"
#include "op_param.h"
#include "operators/kernel/conv_add_bn_kernel.h"
namespace paddle_mobile {
namespace operators {
using std::string;
using std::vector;
class FusionConvAddBNMatcher : public framework::FusionOpMatcher {
public:
FusionConvAddBNMatcher() {
node_ = framework::Node(G_OP_TYPE_CONV);
node_ > std::make_shared<framework::Node>(G_OP_TYPE_ELEMENTWISE_ADD) >
std::make_shared<framework::Node>(G_OP_TYPE_BATCHNORM);
}
void FolderNodes(
framework::Node *node,
std::vector<std::shared_ptr<framework::Node>> *removed_nodes) {
node->Folder(node_.Depth(), Type(),
{{G_OP_TYPE_ELEMENTWISE_ADD, {{"Y", "Y"}}},
{G_OP_TYPE_BATCHNORM,
{{"Scale", "Scale"},
{"Mean", "Mean"},
{"Bias", "Bias"},
{"Variance", "Variance"}}}},
removed_nodes);
}
std::string Type() { return G_OP_TYPE_FUSION_CONV_ADD_BN; }
};
template <typename DeviceType, typename T>
class FusionConvAddBNOp : public framework::OperatorWithKernel<
DeviceType, FusionConvAddBNParam,
operators::ConvAddBNKernel<DeviceType, T>> {
public:
FusionConvAddBNOp(const string &type, const VariableNameMap &inputs,
const VariableNameMap &outputs,
const framework::AttributeMap &attrs,
std::shared_ptr<framework::Scope> scope)
: framework::OperatorWithKernel<
DeviceType, FusionConvAddBNParam,
operators::ConvAddBNKernel<DeviceType, T>>(type, inputs, outputs,
attrs, scope) {}
void InferShape() const override;
protected:
};
#ifdef PADDLE_MOBILE_CPU
#ifndef FUSION_CONV_ADD_BN_REGISTER
static framework::FusionOpRegistrar fusion_conv_add_bn_registrar(
new FusionConvAddBNMatcher());
#define FUSION_CONV_ADD_BN_REGISTER
#endif
#endif
#ifdef PADDLE_MOBILE_MALI_GPU
#ifndef FUSION_CONV_ADD_BN_REGISTER
static framework::FusionOpRegistrar fusion_conv_add_bn_registrar(
new FusionConvAddBNMatcher());
#define FUSION_CONV_ADD_BN_REGISTER
#endif
#endif
#ifdef PADDLE_MOBILE_FPGA
#ifndef FUSION_CONV_ADD_BN_REGISTER
static framework::FusionOpRegistrar fusion_conv_add_bn_registrar(
new FusionConvAddBNMatcher());
#define FUSION_CONV_ADD_BN_REGISTER
#endif
#endif
} // namespace operators
} // namespace paddle_mobile
#ifdef PADDLE_MOBILE_CPU
USE_OP_CPU(fusion_conv_add_bn);
#endif
#ifdef PADDLE_MOBILE_MALI_GPU
#endif
#ifdef PADDLE_MOBILE_FPGA
USE_OP_FPGA(fusion_conv_add_bn);
#endif
#endif
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifdef FUSION_CONVADDBNRELU_OP
#include "operators/fusion_conv_add_bn_relu_op.h"
#include "operators/math/conv_func.h"
namespace paddle_mobile {
namespace operators {
template <typename Dtype, typename T>
void FusionConvAddBNReluOp<Dtype, T>::InferShape() const {
auto in_dims = this->param_.Input()->dims();
auto filter_dims = this->param_.Filter()->dims();
const std::vector<int> &strides = this->param_.Strides();
std::vector<int> paddings = this->param_.Paddings();
int groups = this->param_.Groups();
std::vector<int> dilations = this->param_.Dilations();
PADDLE_MOBILE_ENFORCE((in_dims.size() == filter_dims.size() &&
dilations.size() == paddings.size() &&
paddings.size() == strides.size()),
"ConvParam is not suitable");
std::vector<int64_t> output_shape({in_dims[0], filter_dims[0]});
for (size_t i = 0; i < strides.size(); ++i) {
output_shape.push_back(
math::ConvOutputSize(in_dims[i + 2], filter_dims[i + 2], dilations[i],
paddings[i], strides[i]));
}
framework::DDim ddim = framework::make_ddim(output_shape);
this->param_.Output()->Resize(ddim);
}
} // namespace operators
} // namespace paddle_mobile
namespace ops = paddle_mobile::operators;
#ifdef PADDLE_MOBILE_CPU
REGISTER_OPERATOR_CPU(fusion_conv_add_bn_relu, ops::FusionConvAddBNReluOp);
#endif
#ifdef PADDLE_MOBILE_MALI_GPU
#endif
#ifdef PADDLE_MOBILE_FPGA
REGISTER_OPERATOR_FPGA(fusion_conv_add_bn_relu, ops::FusionConvAddBNReluOp);
#endif
#endif
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifdef FUSION_CONVADDBNRELU_OP
#pragma once
#include <string>
#include <vector>
#include "framework/operator.h"
#include "framework/program/program-optimize/fusion_op_register.h"
#include "op_param.h"
#include "operators/kernel/conv_add_bn_relu_kernel.h"
namespace paddle_mobile {
namespace operators {
using std::string;
using std::vector;
class FusionConvAddBNReluMatcher : public framework::FusionOpMatcher {
public:
FusionConvAddBNReluMatcher() {
node_ = framework::Node(G_OP_TYPE_CONV);
node_ > std::make_shared<framework::Node>(G_OP_TYPE_ELEMENTWISE_ADD) >
std::make_shared<framework::Node>(G_OP_TYPE_BATCHNORM) >
std::make_shared<framework::Node>(G_OP_TYPE_RELU);
}
void FolderNodes(
framework::Node *node,
std::vector<std::shared_ptr<framework::Node>> *removed_nodes) {
node->Folder(node_.Depth(), Type(),
{{G_OP_TYPE_ELEMENTWISE_ADD, {{"Y", "Y"}}},
{G_OP_TYPE_BATCHNORM,
{{"Scale", "Scale"},
{"Mean", "Mean"},
{"Bias", "Bias"},
{"Variance", "Variance"}}}},
removed_nodes);
}
std::string Type() { return G_OP_TYPE_FUSION_CONV_ADD_BN_RELU; }
};
template <typename DeviceType, typename T>
class FusionConvAddBNReluOp
: public framework::OperatorWithKernel<
DeviceType, FusionConvAddBNReluParam,
operators::ConvAddBNReluKernel<DeviceType, T>> {
public:
FusionConvAddBNReluOp(const string &type, const VariableNameMap &inputs,
const VariableNameMap &outputs,
const framework::AttributeMap &attrs,
std::shared_ptr<framework::Scope> scope)
: framework::OperatorWithKernel<
DeviceType, FusionConvAddBNReluParam,
operators::ConvAddBNReluKernel<DeviceType, T>>(
type, inputs, outputs, attrs, scope) {}
using framework::OperatorWithKernel<
DeviceType, FusionConvAddBNReluParam,
operators::ConvAddBNReluKernel<DeviceType, T>>::OperatorWithKernel;
void InferShape() const override;
protected:
};
#ifdef PADDLE_MOBILE_CPU
#ifndef FUSION_CONV_ADD_BN_RELU_REGISTER
static framework::FusionOpRegistrar fusion_conv_add_bn_relu_registrar(
new FusionConvAddBNReluMatcher());
#define FUSION_CONV_ADD_BN_RELU_REGISTER
#endif
#endif
#ifdef PADDLE_MOBILE_MALI_GPU
#ifndef FUSION_CONV_ADD_BN_RELU_REGISTER
static framework::FusionOpRegistrar fusion_conv_add_bn_relu_registrar(
new FusionConvAddBNReluMatcher());
#define FUSION_CONV_ADD_BN_RELU_REGISTER
#endif
#endif
#ifdef PADDLE_MOBILE_FPGA
#ifndef FUSION_CONV_ADD_BN_RELU_REGISTER
static framework::FusionOpRegistrar fusion_conv_add_bn_relu_registrar(
new FusionConvAddBNReluMatcher());
#define FUSION_CONV_ADD_BN_RELU_REGISTER
#endif
#endif
} // namespace operators
} // namespace paddle_mobile
#ifdef PADDLE_MOBILE_CPU
USE_OP_CPU(fusion_conv_add_bn_relu);
#endif
#ifdef PADDLE_MOBILE_MALI_GPU
#endif
#ifdef PADDLE_MOBILE_FPGA
USE_OP_FPGA(fusion_conv_add_bn_relu);
#endif
#endif
......@@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifdef CONVADDRELU_OP
#ifdef FUSION_CONVADDRELU_OP
#include "fusion_conv_add_relu_op.h"
#include "operators/math/conv_func.h"
......@@ -49,12 +49,12 @@ void FusionConvAddReluOp<Dtype, T>::InferShape() const {
namespace ops = paddle_mobile::operators;
#ifdef PADDLE_MOBILE_CPU
USE_OP_CPU(fusion_conv_add_relu);
REGISTER_OPERATOR_CPU(fusion_conv_add_relu, ops::FusionConvAddReluOp);
#endif
#ifdef PADDLE_MOBILE_MALI_GPU
#endif
#ifdef PADDLE_MOBILE_FPGA
REGISTER_OPERATOR_FPGA(fusion_conv_add_relu, ops::FusionConvAddReluOp);
#endif
#endif
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifdef FUSION_ELEMENTWISEADDRELU_OP
#include "fusion_elementwise_add_relu_op.h"
namespace paddle_mobile {
namespace operators {
template <typename Dtype, typename T>
void FusionElementwiseAddReluOp<Dtype, T>::InferShape() const {
auto x_dim = this->param_.InputX()->dims();
this->param_.Out()->Resize(x_dim);
}
} // namespace operators
} // namespace paddle_mobile
namespace ops = paddle_mobile::operators;
#ifdef PADDLE_MOBILE_CPU
// REGISTER_OPERATOR_CPU(fusion_elementwise_add_relu,
// ops::FusionElementwiseAddReluOp);
#endif
#ifdef PADDLE_MOBILE_MALI_GPU
// REGISTER_OPERATOR_MALI_GPU(fusion_elementwise_add_relu,
// ops::FusionElementwiseAddReluOp);
#endif
#ifdef PADDLE_MOBILE_FPGA
REGISTER_OPERATOR_FPGA(fusion_elementwise_add_relu,
ops::FusionElementwiseAddReluOp);
#endif
#endif
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册