Commit abb4bb07 authored by Yao,kun

Merge remote-tracking branch 'upstream/develop' into develop

# Conflicts:
#	src/common/types.h
[submodule "src/operators/kernel/mali/ACL_Android"]
path = src/operators/kernel/mali/ACL_Android
url = https://github.com/halsay/ACL_Android.git
cmake_minimum_required(VERSION 3.0)
project(paddle-mobile)

option(DEBUGING "enable debug mode" ON)
option(USE_OPENMP "openmp support" OFF)
option(USE_EXCEPTION "use std exception" ON)
option(LOG_PROFILE "log profile" ON)
# select the platform to build
option(CPU "armv7 with neon" ON)
option(MALI_GPU "mali gpu" ON)
option(FPGA "fpga" OFF)
if (CPU)
add_definitions(-DPADDLE_MOBILE_CPU)
endif()
if (MALI_GPU)
add_definitions(-DPADDLE_MOBILE_MALI_GPU)
add_definitions(-DUSE_ACL=1)
add_definitions(-DUSE_OPENCL)
set(ACL_ROOT ${CMAKE_CURRENT_SOURCE_DIR}/src/operators/kernel/mali/ACL_Android)
include_directories(${ACL_ROOT} ${ACL_ROOT}/include)
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -L${ACL_ROOT}/build")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -larm_compute")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -larm_compute_core")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -larm_compute_graph")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -L${ACL_ROOT}/build/opencl-1.2-stubs")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -lOpenCL")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DUSE_ACL=1")
endif()
if(FPGA)
add_definitions(-DPADDLE_MOBILE_FPGA)
endif()
set(CMAKE_CXX_FLAGS "-std=c++14 -O3 -s ${CMAKE_CXX_FLAGS}")
if (DEBUGING)
message(STATUS "debug")
set(CMAKE_BUILD_TYPE Debug)
set(CMAKE_CXX_FLAGS_DEBUG "-g -DNDEBUG")
add_definitions(-DPADDLE_MOBILE_DEBUG)
if (ANDROID_NDK_TOOLCHAIN_INCLUDED)
add_definitions(-DARMV7)
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -llog")
endif ()
else ()
set(CMAKE_BUILD_TYPE Release)
set(CMAKE_CXX_FLAGS_RELEASE "-DNDEBUG")
add_definitions(-fvisibility=hidden -fvisibility-inlines-hidden)
endif ()
if (USE_EXCEPTION)
message(STATUS "use exception")
add_definitions(-DENABLE_EXCEPTION)
add_definitions(-fexceptions)
else()
add_definitions(-fno-exceptions)
endif ()
if (LOG_PROFILE)
add_definitions(-DPADDLE_MOBILE_PROFILE)
endif()
if(USE_OPENMP)
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fopenmp")
add_definitions(-DPADDLE_MOBILE_USE_OPENMP)
endif()
file(GLOB_RECURSE PADDLE_MOBILE_CC src/*.cc src/*.cpp src/*.c)
file(GLOB_RECURSE PADDLE_MOBILE_H src/*.h)

if (NOT ANDROID_NDK_TOOLCHAIN_INCLUDED)
    list(REMOVE_ITEM PADDLE_MOBILE_CC ${CMAKE_CURRENT_SOURCE_DIR}/src/jni/*.cpp)
    list(REMOVE_ITEM PADDLE_MOBILE_CC ${CMAKE_CURRENT_SOURCE_DIR}/src/jni/*.h)
    list(REMOVE_ITEM PADDLE_MOBILE_CC ${CMAKE_CURRENT_SOURCE_DIR}/src/operators/math/math_func_neon.h)
endif ()

include_directories(src/)

set(CMAKE_VERBOSE_MAKEFILE ON)
set(CMAKE_EXPORT_COMPILE_COMMANDS ON)
set(CMAKE_ARCHIVE_OUTPUT_DIRECTORY build)
set(CMAKE_LIBRARY_OUTPUT_DIRECTORY build)
set(CMAKE_RUNTIME_OUTPUT_DIRECTORY build)

include("${CMAKE_CURRENT_LIST_DIR}/tools/op.cmake")

# if (IS_IOS)
#     add_library(paddle-mobile STATIC ${PADDLE_MOBILE_CC} ${PADDLE_MOBILE_H})

if (ANDROID_NDK_TOOLCHAIN_INCLUDED)
    list(REMOVE_DUPLICATES CMAKE_CXX_FLAGS)
    add_library(paddle-mobile SHARED ${PADDLE_MOBILE_CC} ${PADDLE_MOBILE_H})
else ()
    add_library(paddle-mobile SHARED ${PADDLE_MOBILE_CC} ${PADDLE_MOBILE_H})
endif ()

if(DEBUGING)
    add_subdirectory(test)
endif()
# Contributing code
We welcome contributions to the Paddle-Mobile project and sincerely thank you for them. This document describes how we work and what the workflow looks like. Paddle-Mobile lives under the PaddlePaddle org and follows essentially the same code conventions as the server-side Paddle project, so contributors can also consult the related Paddle documentation.
## Workflow
The models used during Paddle-Mobile development can be downloaded from this link: [click me](https://mms-mis.cdn.bcebos.com/paddle-mobile/models.zip).
The main steps for contributing code follow.
### Fork
* Paddle-Mobile accepts code through Pull Requests; pushing directly is not allowed, and all code must be reviewed by a person. First fork the Paddle-Mobile repository (see ["Fork" button](https://help.github.com/articles/fork-a-repo/)).
* Go to the [Paddle-Mobile](https://github.com/PaddlePaddle/paddle-mobile) GitHub home page and click the `Fork` button to create a repository under your own account, e.g. <https://github.com/your-username/paddle-mobile>
### Clone
Clone the remote repository to your machine:
```bash
➜ git clone https://github.com/your-username/paddle-mobile
➜ cd paddle-mobile
```
### Create a local branch
Like Paddle, Paddle-Mobile currently uses the [Git branching model](http://nvie.com/posts/a-successful-git-branching-model/) for development, testing, releasing, and maintenance; see the [Paddle branching conventions](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/releasing_process.md#paddle-分支规范) for details.
All feature and bug-fix work should be done on a new branch, normally created from the `develop` branch.
Use `git checkout -b` to create and switch to the new branch.
```bash
➜ git checkout -b my-cool-stuff
```
Note that the working tree must be clean before you check out, otherwise untracked files will be carried over to the new branch; check with `git status`.
### Use the `pre-commit` hook
Paddle developers use the [pre-commit](http://pre-commit.com/) tool to manage Git pre-commit hooks. It helps us format source code (C++, Python) and automatically checks some basics before each commit (for example, a single EOL per file, and no large files added to Git).
The `pre-commit` checks are part of the unit tests on Travis-CI; a PR that does not satisfy the hooks cannot be merged into Paddle. Install it first, then run it in the current directory:
```bash
pip install pre-commit
pre-commit run -a -v
```
Paddle-Mobile uses `clang-format` to format its C/C++ code, and different `clang-format` versions format code differently. Unlike Paddle, Paddle-Mobile developers use the newer 5.0 release of the LLVM toolset, so to avoid CI failures make sure your `clang-format` is version 5.0.
> Note: the `yapf` installed by `pip install pre-commit` differs slightly from the one installed by `conda install -c conda-forge pre-commit`; Paddle developers use `pip install pre-commit`.
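You can verify the version with the standard `--version` flag (a quick sanity check; the binary on your distribution may instead be named `clang-format-5.0`):
```bash
➜ clang-format --version   # should report a 5.0.x release
```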
## Start developing
In this example, I delete a line from README.md and create a new file.
Check the current state with `git status`, which lists the changes in the working tree, and inspect the exact modifications with `git diff`.
```bash
➜ git status
On branch test
Changes not staged for commit:
(use "git add <file>..." to update what will be committed)
(use "git checkout -- <file>..." to discard changes in working directory)
modified: README.md
Untracked files:
(use "git add <file>..." to include in what will be committed)
test
no changes added to commit (use "git add" and/or "git commit -a")
```
## Build
paddle-mobile is built for mobile, and mobile devices are mostly ARM based, so we cross-compile for the ARM platform. Taking the CPU build as an example:
1. Install the latest NDK
2. Set the ANDROID_NDK and NDK_ROOT environment variables
3. Develop, and write unit tests
4. Run `sh build.sh` (see the sketch after this list)
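For example, a minimal sketch of steps 2 and 4; the NDK install path below is an assumption, so substitute your own location:
```bash
# Hypothetical NDK location -- adjust to where you unpacked the NDK.
export ANDROID_NDK=/opt/android-ndk-r17b
export NDK_ROOT=$ANDROID_NDK
sh build.sh
```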
## Commit
Next we discard the change to README.md, then stage the newly added test file.
```bash
➜ git checkout -- README.md
➜ git status
On branch test
Untracked files:
(use "git add <file>..." to include in what will be committed)
test
nothing added to commit but untracked files present (use "git add" to track)
➜ git add test
```
Every Git commit needs a commit message so that others can tell what the commit changes; write it with `git commit`.
```bash
▶ pre-commit run -a -v
[remove-crlf] CRLF end-lines remover........................................Passed
[remove-tabs] Tabs remover..................................................Passed
[check-added-large-files] Check for added large files.......................Passed
[check-merge-conflict] Check for merge conflicts............................Passed
[check-symlinks] Check for broken symlinks..................................Passed
[detect-private-key] Detect Private Key.....................................Passed
[end-of-file-fixer] Fix End of Files........................................Passed
[trailing-whitespace] Trim Trailing Whitespace..............................Passed
[copyright] copyright.......................................................Passed
[clang-format] clang-format.................................................Passed
[cpplint] cpplint...........................................................Passed
hookid: cpplint
Ignoring build_bak.sh; not a valid file name (c, cc, h, hpp, c++, h++, cu, cpp, hxx, cxx, cuh)
Done processing build_bak.sh
Ignoring build_bak.sh; not a valid file name (c, cc, h, hpp, c++, h++, cu, cpp, hxx, cxx, cuh)
Done processing build_bak.sh
```
## Keep your local repository up to date
Before opening a Pull Request, sync with the latest code in the original repository (<https://github.com/PaddlePaddle/paddle-mobile>).
First check the names of the current remotes with `git remote`.
```bash
➜ git remote
origin
➜ git remote -v
origin https://github.com/USERNAME/paddle-mobile (fetch)
origin https://github.com/USERNAME/paddle-mobile (push)
```
Here origin is the name of the remote we cloned, i.e. the paddle-mobile under your own account. Next, add the original paddle-mobile repository as a remote named upstream.
```bash
➜ git remote add upstream https://github.com/PaddlePaddle/paddle-mobile
➜ git remote
origin
upstream
```
Fetch the latest code from upstream and update the current branch.
```bash
➜ git fetch upstream
➜ git pull upstream develop
```
## Push to the remote repository
Push your local changes to GitHub, i.e. https://github.com/USERNAME/paddle-mobile.
```bash
# push to the my-cool-stuff branch of the remote repository origin
➜ git push origin my-cool-stuff
```
## Open an Issue and complete the Pull Request
Open an Issue describing the problem and note its number.
Switch to the branch you created, then click `New pull request`.
In the PR description, write `resolve #issue-number`; the corresponding Issue is then closed automatically when the PR is merged.
> See <https://help.github.com/articles/closing-issues-via-commit-messages/> for details.
## Review
Once the PR is open, you can see CI running on the PR page. If a run fails, click Details to inspect it on the Travis platform.
![](http://otkwwi4x8.bkt.clouddn.com/2018-06-20-15294833030073.jpg)
Travis shows more detailed information.
![](http://otkwwi4x8.bkt.clouddn.com/2018-06-20-15294833651326.jpg)
Then wait for review; if changes are requested, update the corresponding branch in origin by following the steps above.
## Delete the remote branch
After the PR is merged into the main repository, you can delete the remote branch from the PR page.
<img width="775" alt="screen shot 2017-04-26 at 9 18 24 pm" src="https://cloud.githubusercontent.com/assets/11692045/25436457/e4cdd472-2ac5-11e7-9272-badc76c4a23e.png">
You can also delete the remote branch with `git push origin :branch-name`, e.g.:
```bash
➜ git push origin :my-cool-stuff
```
## Delete the local branch
Finally, delete the local branch.
```bash
# switch to the develop branch
➜ git checkout develop
# delete the my-cool-stuff branch
➜ git branch -D my-cool-stuff
```
That completes one full contribution cycle.
## Conventions for submitting code
To help reviewers focus on the code itself, please follow these conventions every time you submit code:
1. Make sure the unit tests on Travis-CI pass. If they fail, the submitted code has a problem and reviewers will generally not review it.
2. Before opening a Pull Request:
   - Watch the number of commits:
     - Reason: if you modify only one file but submit a dozen commits, each with a tiny change, it is a burden on reviewers, who must inspect every commit to see what changed, and the commits may even overwrite one another.
     - Suggestion: keep as few commits as possible; fold follow-up fixes into the previous commit with `git commit --amend`. For commits already pushed to the remote, see [squash commits after push](http://stackoverflow.com/questions/5667884/how-to-squash-commits-in-git-after-they-have-been-pushed) (a sketch follows this list).
   - Watch each commit message: it should reflect the content of the commit, not be arbitrary.
3. If the Pull Request fixes an Issue, add `fix #issue_number` to the **first** comment of the PR, and the Issue will be closed automatically when the PR is merged. Valid keywords include: close, closes, closed, fix, fixes, fixed, resolve, resolves, resolved; pick the one that fits. See [Closing issues via commit messages](https://help.github.com/articles/closing-issues-via-commit-messages) for details.
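For example, a minimal sketch of the two approaches mentioned in the suggestion above; branch and remote names follow the earlier examples:
```bash
git commit --amend                   # fold a follow-up fix into the previous commit
git rebase -i upstream/develop       # mark extra commits as "squash" to combine them
git push --force-with-lease origin my-cool-stuff   # rewritten history needs a forced push
```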
In addition, when replying to reviewers, please follow these conventions:
1. Reply to every review comment (this is basic open-source etiquette; someone helped you, so say thanks):
   - If you agree with a comment and have addressed it, a simple `Done` is enough;
   - If you disagree, explain your reasoning.
2. If there are many review comments:
   - Summarize the overall changes you made.
   - Reply via [start a review](https://help.github.com/articles/reviewing-proposed-changes-in-a-pull-request/) rather than one-off replies; each individual reply sends an email and floods inboxes.
FROM ubuntu:16.04
RUN echo '\
deb <mirror> <version> main restricted universe multiverse\n\
deb <mirror> <version>-updates main restricted universe multiverse\n\
deb <mirror> <version>-backports main restricted universe multiverse\n\
deb <mirror> <version>-security main restricted universe multiverse\n'\
> /etc/apt/sources.list
RUN sed -ie 's|<mirror>|http://mirrors.tuna.tsinghua.edu.cn/ubuntu/|' /etc/apt/sources.list
RUN sed -ie 's|<version>|xenial|' /etc/apt/sources.list
RUN apt-get update && apt-get upgrade -y
RUN apt-get install -y --no-install-recommends \
curl \
unzip \
git \
make \
cmake \
cmake-curses-gui \
python \
python-pip \
python-setuptools \
clang-format-5.0 \
graphviz \
g++-arm-linux-gnueabi \
gcc-arm-linux-gnueabi
RUN apt-get autoremove -y && apt-get clean
RUN pip install --upgrade pip
RUN pip install wheel && pip install pre-commit
RUN ln -s clang-format-5.0 /usr/bin/clang-format
# RUN cd /tmp && curl -O http://mirrors.neusoft.edu.cn/android/repository/android-ndk-r17b-linux-x86_64.zip
# RUN cd /opt && unzip /tmp/android-ndk-r17b-linux-x86_64.zip
# ENV NDK_ROOT /opt/android-ndk-r17b
# Setting up the environment
## Using docker
### 1. Install docker
To install docker, follow the official documentation: [https://docs.docker.com/install/](https://docs.docker.com/install/)
### 2. Set up the build environment with docker
First cd into the paddle-mobile directory and run `docker build`.
Using Linux/Mac as an example (on Windows it is recommended to run this inside the 'Docker Quickstart Terminal'):
```
$ docker build -t paddle-mobile:dev - < Dockerfile
```
`docker images` now lists the image we just built:
```
$ docker images
REPOSITORY TAG IMAGE ID CREATED SIZE
paddle-mobile dev 33b146787711 45 hours ago 372MB
```
### 3. Build with docker
cd into the paddle-mobile directory and run docker run:
```
$ docker run -it --mount type=bind,source=$PWD,target=/paddle-mobile paddle-mobile:dev
root@5affd29d4fc5:/ # cd /paddle-mobile
# generate the Makefile for the android build
root@5affd29d4fc5:/ # rm CMakeCache.txt
root@5affd29d4fc5:/ # cmake . -DCMAKE_TOOLCHAIN_FILE=tools/toolchains/arm-android-neon.cmake
# generate the Makefile for the linux build
root@5affd29d4fc5:/ # rm CMakeCache.txt
root@5affd29d4fc5:/ # cmake . -DCMAKE_TOOLCHAIN_FILE=tools/toolchains/arm-linux-gnueabi.cmake
```
### 4. Set build options
Build options can be set with ccmake:
```
root@5affd29d4fc5:/ # ccmake .
Page 1 of 1
CMAKE_ASM_FLAGS
CMAKE_ASM_FLAGS_DEBUG
CMAKE_ASM_FLAGS_RELEASE
CMAKE_BUILD_TYPE
CMAKE_INSTALL_PREFIX /usr/local
CMAKE_TOOLCHAIN_FILE /paddle-mobile/tools/toolchains/arm-android-neon.cmake
CPU ON
DEBUGING ON
FPGA OFF
LOG_PROFILE ON
MALI_GPU OFF
NET googlenet
USE_EXCEPTION ON
USE_OPENMP OFF
```
After changing options, press `c`, then `g` to regenerate the Makefile.
### 5. Build
Build with the make command:
```
root@5affd29d4fc5:/ # make
```
### 6. Inspect the build output
The build output can be inspected from the host, under build and test/build in the paddle-mobile directory; use adb or scp to copy it to the device for execution.
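For example, a hedged sketch of pushing one test binary to a device over adb; the binary name is a placeholder for whatever you actually built (see also the push script below):
```
adb push test/build/test-googlenet /data/local/tmp/bin/   # hypothetical test binary
adb shell /data/local/tmp/bin/test-googlenet
```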
## Without docker
Without docker, you can simply generate a makefile with cmake and then build. Building the android library with the NDK requires NDK_ROOT to be set correctly. Building for linux requires arm-linux-gnueabi-gcc or a similar cross-compiler; you may need to set the CC and CXX environment variables, modify arm-linux-gnueabi.cmake under tools/toolchains/, or add your own toolchain file.
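A minimal sketch of such a build under the assumptions above; the paths are examples, not prescriptions:
```
export NDK_ROOT=/opt/android-ndk-r17b   # hypothetical install path (android build only)
rm -f CMakeCache.txt
cmake . -DCMAKE_TOOLCHAIN_FILE=tools/toolchains/arm-android-neon.cmake
make
```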
#!/usr/bin/env sh
push_fn () {
MODELS_PATH="../test/models/*"
EXE_FILE="../test/build/*"
EXE_DIR="data/local/tmp/bin"
MODELS_DIR="data/local/tmp/models"
LIB_PATH="../build/release/arm-v7a/build/*"
adb push ${EXE_FILE} ${EXE_DIR}
adb push ${LIB_PATH} ${EXE_DIR}
adb push ${MODELS_PATH} ${MODELS_DIR}
echo "test files sync completed"
}
push_fn
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#ifdef PADDLE_EXECUTOR_MULTITHREAD
#include <algorithm>
#include <memory>
#include <string>
#include <unordered_map>
#include <vector>
#include "framework/operator.h"
namespace paddle_mobile {
class depCore {
public:
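  // Builds the op dependency graph: deps[i] lists the ops whose outputs op i
  // consumes, and next[i] lists the ops that consume op i's outputs; the local
  // map "vars" records, for each variable name, the last op that produced it.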
template <typename Dtype>
void analysisDep(
const std::vector<std::shared_ptr<framework::OperatorBase<Dtype>>>& ops) {
std::unordered_map<std::string, int> vars;
size_t nop = ops.size();
deps.resize(nop);
next.resize(nop);
for (size_t i = 0; i < nop; i++) {
const auto& op = ops[i];
for (const auto& kv : op->Inputs()) {
for (const auto& v : kv.second) {
if (vars.find(v) == vars.end()) {
continue;
}
int di = vars[v];
if (di == i) {
continue;
}
if (std::find(deps[i].begin(), deps[i].end(), di) != deps[i].end()) {
continue;
}
deps[i].push_back(di);
next[di].push_back(i);
}
}
for (const auto& kv : op->Outputs()) {
for (const auto& v : kv.second) {
vars[v] = i;
}
}
}
}
const std::vector<int>& getNext(int i) { return next[i]; }
const std::vector<int>& getDeps(int i) { return deps[i]; }
std::vector<std::vector<int>> deps;
std::vector<std::vector<int>> next;
};
} // namespace paddle_mobile
#endif
@@ -17,8 +17,6 @@ limitations under the License. */
 #ifdef ENABLE_EXCEPTION
 #include <stdio.h>
 #include <exception>
-#include <sstream>
-#include <stdexcept>
 #include <string>
 #endif
@@ -32,12 +30,11 @@ struct PaddleMobileException : public std::exception {
   PaddleMobileException(const char *header, const char *detail,
                         const char *file, const int line) {
-    std::stringstream ss;
-    ss << exception_prefix << "| " << header << "\n";
-    ss << "| [in file] : " << file << " \n";
-    ss << "| [on line] : " << line << " \n";
-    ss << "| [detail] : " << detail;
-    message = ss.str();
+    char buffer[1500];
+    snprintf(buffer, sizeof(buffer),
+             "%s| %s \n| [in file] : %s\n| [on line] : %d\n| [detail] : %s\n",
+             exception_prefix.c_str(), header, file, line, detail);
+    message = std::string(buffer);
   }
   const char *what() const noexcept { return message.c_str(); }
 };
...
@@ -16,15 +16,43 @@ limitations under the License. */
 #include <vector>

 #ifdef PADDLE_MOBILE_DEBUG
+#include <cstring>
 #include <iostream>
 #include <sstream>
 #include <string>
 #endif

+#ifdef ANDROID
+#include <android/log.h>
+#endif
+
 namespace paddle_mobile {

 #ifdef PADDLE_MOBILE_DEBUG
+
+#ifdef ANDROID
+extern const char *ANDROID_LOG_TAG;
+
+#define ANDROIDLOGI(...)                                                  \
+  __android_log_print(ANDROID_LOG_INFO, ANDROID_LOG_TAG, __VA_ARGS__);    \
+  printf(__VA_ARGS__)
+#define ANDROIDLOGW(...)                                                  \
+  __android_log_print(ANDROID_LOG_WARNING, ANDROID_LOG_TAG, __VA_ARGS__); \
+  printf(__VA_ARGS__)
+#define ANDROIDLOGD(...)                                                  \
+  __android_log_print(ANDROID_LOG_DEBUG, ANDROID_LOG_TAG, __VA_ARGS__);   \
+  printf(__VA_ARGS__)
+#define ANDROIDLOGE(...)                                                  \
+  __android_log_print(ANDROID_LOG_ERROR, ANDROID_LOG_TAG, __VA_ARGS__);   \
+  printf(__VA_ARGS__)
+#else
+#define ANDROIDLOGI(...)
+#define ANDROIDLOGW(...)
+#define ANDROIDLOGD(...)
+#define ANDROIDLOGE(...)
+#endif
+
 enum LogLevel {
   kNO_LOG,
   kLOG_ERROR,
@@ -88,26 +116,29 @@ struct ToLog {
   Print printer_;
 };

-#define LOG(level)                                                             \
-  if (level > paddle_mobile::log_level) {                                      \
-  } else                                                                       \
-    paddle_mobile::ToLog(                                                      \
-        level,                                                                 \
-        (std::stringstream()                                                   \
-         << "[file: "                                                          \
-         << (strrchr(__FILE__, '/') ? (strrchr(__FILE__, '/') + 1) : __FILE__) \
-         << "] [line: " << __LINE__ << "] ")                                   \
-            .str())
+#define LOG(level)                                                           \
+  if (level > paddle_mobile::log_level) {                                    \
+  } else                                                                     \
+    paddle_mobile::ToLog(                                                    \
+        level, static_cast<std::stringstream &>(                             \
+                   std::stringstream()                                       \
+                   << "[file: "                                              \
+                   << (strrchr(__FILE__, '/') ? (strrchr(__FILE__, '/') + 1) \
+                                              : __FILE__)                    \
+                   << "] [line: " << __LINE__ << "] ")                       \
+            .str())

-#define DLOG                                                                   \
-  if (paddle_mobile::kLOG_DEBUG > paddle_mobile::log_level) {                  \
-  } else                                                                       \
-    paddle_mobile::ToLog(                                                      \
-        paddle_mobile::kLOG_DEBUG,                                             \
-        (std::stringstream()                                                   \
-         << "[file: "                                                          \
-         << (strrchr(__FILE__, '/') ? (strrchr(__FILE__, '/') + 1) : __FILE__) \
-         << "] [line: " << __LINE__ << "] ")                                   \
-            .str())
+#define DLOG                                                           \
+  if (paddle_mobile::kLOG_DEBUG > paddle_mobile::log_level) {          \
+  } else                                                               \
+    paddle_mobile::ToLog(                                              \
+        paddle_mobile::kLOG_DEBUG,                                     \
+        static_cast<std::stringstream &>(                              \
+            std::stringstream()                                        \
+            << "[file: "                                               \
+            << (strrchr(__FILE__, '/') ? (strrchr(__FILE__, '/') + 1)  \
+                                       : __FILE__)                     \
+            << "] [line: " << __LINE__ << "] ")                        \
+            .str())

 #define LOGF(level, format, ...) \
@@ -122,6 +153,11 @@ struct ToLog {

 #else

+#define ANDROIDLOGI(...)
+#define ANDROIDLOGW(...)
+#define ANDROIDLOGD(...)
+#define ANDROIDLOGE(...)
+
 enum LogLevel {
   kNO_LOG,
   kLOG_ERROR,
...
@@ -12,6 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */

-#include "log.h"
-namespace paddle_mobile {}
+#pragma once
+#define EXPORT __attribute__((visibility("default")))
@@ -12,14 +12,16 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */

-#pragma once
-
-// Disable the copy and assignment operator for a class.
-#ifndef DISABLE_COPY_AND_ASSIGN
-#define DISABLE_COPY_AND_ASSIGN(classname)          \
- private:                                           \
-  classname(const classname &) = delete;            \
-  classname(classname &&) = delete;                 \
-  classname &operator=(const classname &) = delete; \
-  classname &operator=(classname &&) = delete
+#ifdef PADDLE_MOBILE_USE_OPENMP
+/**
+ * android-ndk-r17 has a problem when linking with openmp.
+ * if paddle-mobile enables -fopenmp, but didn't use those omp_* functions,
+ * after linking another binary with libpaddle-mobile.so, the omp_get_thread_num
+ * will not work. see test/common/test_openmp.cc the detailed reason is still
+ * unclear, but this trick will work. a better solution is hacking the linker,
+ * try some flags to make it link omp_* functions, but I didn't find out how to
+ * make it work.
+ */
+#include <omp.h>
+static int _ = omp_get_num_procs();
 #endif
@@ -711,47 +711,6 @@ static inline size_t uint32_pack(uint32_t value, uint8_t *out) {
   return rv;
 }
/**
* Pack a signed 32-bit integer and return the number of bytes written.
* Negative numbers are encoded as two's complement 64-bit integers.
*
* \param value
* Value to encode.
* \param[out] out
* Packed value.
* \return
* Number of bytes written to `out`.
*/
static inline size_t int32_pack(int32_t value, uint8_t *out) {
if (value < 0) {
out[0] = value | 0x80;
out[1] = (value >> 7) | 0x80;
out[2] = (value >> 14) | 0x80;
out[3] = (value >> 21) | 0x80;
out[4] = (value >> 28) | 0x80;
out[5] = out[6] = out[7] = out[8] = 0xff;
out[9] = 0x01;
return 10;
} else {
return uint32_pack(value, out);
}
}
/**
* Pack a signed 32-bit integer using ZigZag encoding and return the number of
* bytes written.
*
* \param value
* Value to encode.
* \param[out] out
* Packed value.
* \return
* Number of bytes written to `out`.
*/
static inline size_t sint32_pack(int32_t value, uint8_t *out) {
return uint32_pack(zigzag32(value), out);
}
/**
 * Pack a 64-bit unsigned integer using base-128 varint encoding and return the
 * number of bytes written.
@@ -789,116 +748,6 @@ static size_t uint64_pack(uint64_t value, uint8_t *out) {
   return rv;
 }
/**
* Pack a 64-bit signed integer in ZigZag encoding and return the number of
* bytes written.
*
* \param value
* Value to encode.
* \param[out] out
* Packed value.
* \return
* Number of bytes written to `out`.
*/
static inline size_t sint64_pack(int64_t value, uint8_t *out) {
return uint64_pack(zigzag64(value), out);
}
/**
* Pack a 32-bit quantity in little-endian byte order. Used for protobuf wire
* types fixed32, sfixed32, float. Similar to "htole32".
*
* \param value
* Value to encode.
* \param[out] out
* Packed value.
* \return
* Number of bytes written to `out`.
*/
static inline size_t fixed32_pack(uint32_t value, void *out) {
#if !defined(WORDS_BIGENDIAN)
memcpy(out, &value, 4);
#else
uint8_t *buf = out;
buf[0] = value;
buf[1] = value >> 8;
buf[2] = value >> 16;
buf[3] = value >> 24;
#endif
return 4;
}
/**
* Pack a 64-bit quantity in little-endian byte order. Used for protobuf wire
* types fixed64, sfixed64, double. Similar to "htole64".
*
* \todo The big-endian impl is really only good for 32-bit machines, a 64-bit
* version would be appreciated, plus a way to decide to use 64-bit math where
* convenient.
*
* \param value
* Value to encode.
* \param[out] out
* Packed value.
* \return
* Number of bytes written to `out`.
*/
static inline size_t fixed64_pack(uint64_t value, void *out) {
#if !defined(WORDS_BIGENDIAN)
memcpy(out, &value, 8);
#else
fixed32_pack(value, out);
fixed32_pack(value >> 32, ((char *)out) + 4);
#endif
return 8;
}
/**
* Pack a boolean value as an integer and return the number of bytes written.
*
* \todo Perhaps on some platforms *out = !!value would be a better impl, b/c
* that is idiomatic C++ in some STL implementations.
*
* \param value
* Value to encode.
* \param[out] out
* Packed value.
* \return
* Number of bytes written to `out`.
*/
static inline size_t boolean_pack(protobuf_c_boolean value, uint8_t *out) {
*out = value ? TRUE : FALSE;
return 1;
}
/**
* Pack a NUL-terminated C string and return the number of bytes written. The
* output includes a length delimiter.
*
* The NULL pointer is treated as an empty string. This isn't really necessary,
* but it allows people to leave required strings blank. (See Issue #13 in the
* bug tracker for a little more explanation).
*
* \param str
* String to encode.
* \param[out] out
* Packed value.
* \return
* Number of bytes written to `out`.
*/
static inline size_t string_pack(const char *str, uint8_t *out) {
if (str == NULL) {
out[0] = 0;
return 1;
} else {
size_t len = strlen(str);
size_t rv = uint32_pack(len, out);
memcpy(out + rv, str, len);
return rv + len;
}
}
/**
 * Pack a ProtobufCBinaryData and return the number of bytes written. The output
 * includes a length delimiter.
@@ -918,30 +767,6 @@ static inline size_t binary_data_pack(const ProtobufCBinaryData *bd,
   return rv + len;
 }
/**
* Pack a ProtobufCMessage and return the number of bytes written. The output
* includes a length delimiter.
*
* \param message
* ProtobufCMessage object to pack.
* \param[out] out
* Packed message.
* \return
* Number of bytes written to `out`.
*/
static inline size_t prefixed_message_pack(const ProtobufCMessage *message,
uint8_t *out) {
if (message == NULL) {
out[0] = 0;
return 1;
} else {
size_t rv = protobuf_c_message_pack(message, out + 1);
uint32_t rv_packed_size = uint32_size(rv);
if (rv_packed_size != 1) memmove(out + rv_packed_size, out + 1, rv);
return uint32_pack(rv, out) + rv;
}
}
/**
 * Pack a field tag.
 *
@@ -963,143 +788,6 @@ static size_t tag_pack(uint32_t id, uint8_t *out) {
   return uint64_pack(((uint64_t)id) << 3, out);
 }
/**
* Pack a required field and return the number of bytes written.
*
* \param field
* Field descriptor.
* \param member
* The field member.
* \param[out] out
* Packed value.
* \return
* Number of bytes written to `out`.
*/
static size_t required_field_pack(const ProtobufCFieldDescriptor *field,
const void *member, uint8_t *out) {
size_t rv = tag_pack(field->id, out);
switch (field->type) {
case PROTOBUF_C_TYPE_SINT32:
out[0] |= PROTOBUF_C_WIRE_TYPE_VARINT;
return rv + sint32_pack(*(const int32_t *)member, out + rv);
case PROTOBUF_C_TYPE_ENUM:
case PROTOBUF_C_TYPE_INT32:
out[0] |= PROTOBUF_C_WIRE_TYPE_VARINT;
return rv + int32_pack(*(const int32_t *)member, out + rv);
case PROTOBUF_C_TYPE_UINT32:
out[0] |= PROTOBUF_C_WIRE_TYPE_VARINT;
return rv + uint32_pack(*(const uint32_t *)member, out + rv);
case PROTOBUF_C_TYPE_SINT64:
out[0] |= PROTOBUF_C_WIRE_TYPE_VARINT;
return rv + sint64_pack(*(const int64_t *)member, out + rv);
case PROTOBUF_C_TYPE_INT64:
case PROTOBUF_C_TYPE_UINT64:
out[0] |= PROTOBUF_C_WIRE_TYPE_VARINT;
return rv + uint64_pack(*(const uint64_t *)member, out + rv);
case PROTOBUF_C_TYPE_SFIXED32:
case PROTOBUF_C_TYPE_FIXED32:
case PROTOBUF_C_TYPE_FLOAT:
out[0] |= PROTOBUF_C_WIRE_TYPE_32BIT;
return rv + fixed32_pack(*(const uint32_t *)member, out + rv);
case PROTOBUF_C_TYPE_SFIXED64:
case PROTOBUF_C_TYPE_FIXED64:
case PROTOBUF_C_TYPE_DOUBLE:
out[0] |= PROTOBUF_C_WIRE_TYPE_64BIT;
return rv + fixed64_pack(*(const uint64_t *)member, out + rv);
case PROTOBUF_C_TYPE_BOOL:
out[0] |= PROTOBUF_C_WIRE_TYPE_VARINT;
return rv + boolean_pack(*(const protobuf_c_boolean *)member, out + rv);
case PROTOBUF_C_TYPE_STRING:
out[0] |= PROTOBUF_C_WIRE_TYPE_LENGTH_PREFIXED;
return rv + string_pack(*(char *const *)member, out + rv);
case PROTOBUF_C_TYPE_BYTES:
out[0] |= PROTOBUF_C_WIRE_TYPE_LENGTH_PREFIXED;
return rv +
binary_data_pack((const ProtobufCBinaryData *)member, out + rv);
case PROTOBUF_C_TYPE_MESSAGE:
out[0] |= PROTOBUF_C_WIRE_TYPE_LENGTH_PREFIXED;
return rv + prefixed_message_pack(*(ProtobufCMessage *const *)member,
out + rv);
}
PROTOBUF_C__ASSERT_NOT_REACHED();
return 0;
}
/**
* Pack a oneof field and return the number of bytes written. Only packs the
* field that is selected by the case enum.
*
* \param field
* Field descriptor.
* \param oneof_case
* Enum value that selects the field in the oneof.
* \param member
* The field member.
* \param[out] out
* Packed value.
* \return
* Number of bytes written to `out`.
*/
static size_t oneof_field_pack(const ProtobufCFieldDescriptor *field,
uint32_t oneof_case, const void *member,
uint8_t *out) {
if (oneof_case != field->id) {
return 0;
}
if (field->type == PROTOBUF_C_TYPE_MESSAGE ||
field->type == PROTOBUF_C_TYPE_STRING) {
const void *ptr = *(const void *const *)member;
if (ptr == NULL || ptr == field->default_value) return 0;
}
return required_field_pack(field, member, out);
}
/**
* Pack an optional field and return the number of bytes written.
*
* \param field
* Field descriptor.
* \param has
* Whether the field is set.
* \param member
* The field member.
* \param[out] out
* Packed value.
* \return
* Number of bytes written to `out`.
*/
static size_t optional_field_pack(const ProtobufCFieldDescriptor *field,
const protobuf_c_boolean has,
const void *member, uint8_t *out) {
if (field->type == PROTOBUF_C_TYPE_MESSAGE ||
field->type == PROTOBUF_C_TYPE_STRING) {
const void *ptr = *(const void *const *)member;
if (ptr == NULL || ptr == field->default_value) return 0;
} else {
if (!has) return 0;
}
return required_field_pack(field, member, out);
}
/**
* Pack an unlabeled field and return the number of bytes written.
*
* \param field
* Field descriptor.
* \param member
* The field member.
* \param[out] out
* Packed value.
* \return
* Number of bytes written to `out`.
*/
static size_t unlabeled_field_pack(const ProtobufCFieldDescriptor *field,
const void *member, uint8_t *out) {
if (field_is_zeroish(field, member)) return 0;
return required_field_pack(field, member, out);
}
/**
 * Given a field type, return the in-memory size.
 *
@@ -1139,236 +827,6 @@ static inline size_t sizeof_elt_in_repeated_array(ProtobufCType type) {
   return 0;
 }
/**
* Pack an array of 32-bit quantities.
*
* \param[out] out
* Destination.
* \param[in] in
* Source.
* \param[in] n
* Number of elements in the source array.
*/
static void copy_to_little_endian_32(void *out, const void *in,
const unsigned n) {
#if !defined(WORDS_BIGENDIAN)
memcpy(out, in, n * 4);
#else
unsigned i;
const uint32_t *ini = in;
for (i = 0; i < n; i++) fixed32_pack(ini[i], (uint32_t *)out + i);
#endif
}
/**
* Pack an array of 64-bit quantities.
*
* \param[out] out
* Destination.
* \param[in] in
* Source.
* \param[in] n
* Number of elements in the source array.
*/
static void copy_to_little_endian_64(void *out, const void *in,
const unsigned n) {
#if !defined(WORDS_BIGENDIAN)
memcpy(out, in, n * 8);
#else
unsigned i;
const uint64_t *ini = in;
for (i = 0; i < n; i++) fixed64_pack(ini[i], (uint64_t *)out + i);
#endif
}
/**
* Get the minimum number of bytes required to pack a field value of a
* particular type.
*
* \param type
* Field type.
* \return
* Number of bytes.
*/
static unsigned get_type_min_size(ProtobufCType type) {
if (type == PROTOBUF_C_TYPE_SFIXED32 || type == PROTOBUF_C_TYPE_FIXED32 ||
type == PROTOBUF_C_TYPE_FLOAT) {
return 4;
}
if (type == PROTOBUF_C_TYPE_SFIXED64 || type == PROTOBUF_C_TYPE_FIXED64 ||
type == PROTOBUF_C_TYPE_DOUBLE) {
return 8;
}
return 1;
}
/**
* Get the packed size of an array of same field type.
*
* \param field
* Field descriptor.
* \param count
* Number of elements of this type.
* \param array
* The elements to get the size of.
* \return
* Number of bytes required.
*/
static size_t get_packed_payload_length(const ProtobufCFieldDescriptor *field,
unsigned count, const void *array) {
unsigned rv = 0;
unsigned i;
switch (field->type) {
case PROTOBUF_C_TYPE_SFIXED32:
case PROTOBUF_C_TYPE_FIXED32:
case PROTOBUF_C_TYPE_FLOAT:
return count * 4;
case PROTOBUF_C_TYPE_SFIXED64:
case PROTOBUF_C_TYPE_FIXED64:
case PROTOBUF_C_TYPE_DOUBLE:
return count * 8;
case PROTOBUF_C_TYPE_ENUM:
case PROTOBUF_C_TYPE_INT32: {
const int32_t *arr = (const int32_t *)array;
for (i = 0; i < count; i++) rv += int32_size(arr[i]);
break;
}
case PROTOBUF_C_TYPE_SINT32: {
const int32_t *arr = (const int32_t *)array;
for (i = 0; i < count; i++) rv += sint32_size(arr[i]);
break;
}
case PROTOBUF_C_TYPE_UINT32: {
const uint32_t *arr = (const uint32_t *)array;
for (i = 0; i < count; i++) rv += uint32_size(arr[i]);
break;
}
case PROTOBUF_C_TYPE_SINT64: {
const int64_t *arr = (const int64_t *)array;
for (i = 0; i < count; i++) rv += sint64_size(arr[i]);
break;
}
case PROTOBUF_C_TYPE_INT64:
case PROTOBUF_C_TYPE_UINT64: {
const uint64_t *arr = (const uint64_t *)array;
for (i = 0; i < count; i++) rv += uint64_size(arr[i]);
break;
}
case PROTOBUF_C_TYPE_BOOL:
return count;
default:
PROTOBUF_C__ASSERT_NOT_REACHED();
}
return rv;
}
/**
* Pack an array of same field type to a virtual buffer.
*
* \param field
* Field descriptor.
* \param count
* Number of elements of this type.
* \param array
* The elements to get the size of.
* \param[out] buffer
* Virtual buffer to append data to.
* \return
* Number of bytes packed.
*/
static size_t pack_buffer_packed_payload(const ProtobufCFieldDescriptor *field,
unsigned count, const void *array,
ProtobufCBuffer *buffer) {
uint8_t scratch[16];
size_t rv = 0;
unsigned i;
switch (field->type) {
case PROTOBUF_C_TYPE_SFIXED32:
case PROTOBUF_C_TYPE_FIXED32:
case PROTOBUF_C_TYPE_FLOAT:
#if !defined(WORDS_BIGENDIAN)
rv = count * 4;
goto no_packing_needed;
#else
for (i = 0; i < count; i++) {
unsigned len = fixed32_pack(((uint32_t *)array)[i], scratch);
buffer->append(buffer, len, scratch);
rv += len;
}
break;
#endif
case PROTOBUF_C_TYPE_SFIXED64:
case PROTOBUF_C_TYPE_FIXED64:
case PROTOBUF_C_TYPE_DOUBLE:
#if !defined(WORDS_BIGENDIAN)
rv = count * 8;
goto no_packing_needed;
#else
for (i = 0; i < count; i++) {
unsigned len = fixed64_pack(((uint64_t *)array)[i], scratch);
buffer->append(buffer, len, scratch);
rv += len;
}
break;
#endif
case PROTOBUF_C_TYPE_ENUM:
case PROTOBUF_C_TYPE_INT32:
for (i = 0; i < count; i++) {
unsigned len = int32_pack(((int32_t *)array)[i], scratch);
buffer->append(buffer, len, scratch);
rv += len;
}
break;
case PROTOBUF_C_TYPE_SINT32:
for (i = 0; i < count; i++) {
unsigned len = sint32_pack(((int32_t *)array)[i], scratch);
buffer->append(buffer, len, scratch);
rv += len;
}
break;
case PROTOBUF_C_TYPE_UINT32:
for (i = 0; i < count; i++) {
unsigned len = uint32_pack(((uint32_t *)array)[i], scratch);
buffer->append(buffer, len, scratch);
rv += len;
}
break;
case PROTOBUF_C_TYPE_SINT64:
for (i = 0; i < count; i++) {
unsigned len = sint64_pack(((int64_t *)array)[i], scratch);
buffer->append(buffer, len, scratch);
rv += len;
}
break;
case PROTOBUF_C_TYPE_INT64:
case PROTOBUF_C_TYPE_UINT64:
for (i = 0; i < count; i++) {
unsigned len = uint64_pack(((uint64_t *)array)[i], scratch);
buffer->append(buffer, len, scratch);
rv += len;
}
break;
case PROTOBUF_C_TYPE_BOOL:
for (i = 0; i < count; i++) {
unsigned len = boolean_pack(((protobuf_c_boolean *)array)[i], scratch);
buffer->append(buffer, len, scratch);
rv += len;
}
return count;
default:
PROTOBUF_C__ASSERT_NOT_REACHED();
}
return rv;
#if !defined(WORDS_BIGENDIAN)
no_packing_needed:
buffer->append(buffer, rv, array);
return rv;
#endif
}
static inline int int_range_lookup(unsigned n_ranges,
                                   const ProtobufCIntRange *ranges, int value) {
  unsigned n;
@@ -2638,147 +2096,3 @@ protobuf_c_boolean protobuf_c_message_check(const ProtobufCMessage *message) {
 typedef void (*GenericHandler)(void *service, const ProtobufCMessage *input,
                                ProtobufCClosure closure, void *closure_data);
void protobuf_c_service_invoke_internal(ProtobufCService *service,
unsigned method_index,
const ProtobufCMessage *input,
ProtobufCClosure closure,
void *closure_data) {
GenericHandler *handlers;
GenericHandler handler;
/*
* Verify that method_index is within range. If this fails, you are
* likely invoking a newly added method on an old service. (Although
* other memory corruption bugs can cause this assertion too.)
*/
assert(method_index < service->descriptor->n_methods);
/*
* Get the array of virtual methods (which are enumerated by the
* generated code).
*/
handlers = (GenericHandler *)(service + 1);
/*
* Get our method and invoke it.
* \todo Seems like handler == NULL is a situation that needs handling.
*/
handler = handlers[method_index];
(*handler)(service, input, closure, closure_data);
}
void protobuf_c_service_generated_init(
ProtobufCService *service, const ProtobufCServiceDescriptor *descriptor,
ProtobufCServiceDestroy destroy) {
ASSERT_IS_SERVICE_DESCRIPTOR(descriptor);
service->descriptor = descriptor;
service->destroy = destroy;
service->invoke = protobuf_c_service_invoke_internal;
memset(service + 1, 0, descriptor->n_methods * sizeof(GenericHandler));
}
void protobuf_c_service_destroy(ProtobufCService *service) {
service->destroy(service);
}
/* --- querying the descriptors --- */
const ProtobufCEnumValue *protobuf_c_enum_descriptor_get_value_by_name(
const ProtobufCEnumDescriptor *desc, const char *name) {
unsigned start = 0;
unsigned count;
if (desc == NULL || desc->values_by_name == NULL) return NULL;
count = desc->n_value_names;
while (count > 1) {
unsigned mid = start + count / 2;
int rv = strcmp(desc->values_by_name[mid].name, name);
if (rv == 0)
return desc->values + desc->values_by_name[mid].index;
else if (rv < 0) {
count = start + count - (mid + 1);
start = mid + 1;
} else
count = mid - start;
}
if (count == 0) return NULL;
if (strcmp(desc->values_by_name[start].name, name) == 0)
return desc->values + desc->values_by_name[start].index;
return NULL;
}
const ProtobufCEnumValue *protobuf_c_enum_descriptor_get_value(
const ProtobufCEnumDescriptor *desc, int value) {
int rv = int_range_lookup(desc->n_value_ranges, desc->value_ranges, value);
if (rv < 0) return NULL;
return desc->values + rv;
}
const ProtobufCFieldDescriptor *protobuf_c_message_descriptor_get_field_by_name(
const ProtobufCMessageDescriptor *desc, const char *name) {
unsigned start = 0;
unsigned count;
const ProtobufCFieldDescriptor *field;
if (desc == NULL || desc->fields_sorted_by_name == NULL) return NULL;
count = desc->n_fields;
while (count > 1) {
unsigned mid = start + count / 2;
int rv;
field = desc->fields + desc->fields_sorted_by_name[mid];
rv = strcmp(field->name, name);
if (rv == 0)
return field;
else if (rv < 0) {
count = start + count - (mid + 1);
start = mid + 1;
} else
count = mid - start;
}
if (count == 0) return NULL;
field = desc->fields + desc->fields_sorted_by_name[start];
if (strcmp(field->name, name) == 0) return field;
return NULL;
}
const ProtobufCFieldDescriptor *protobuf_c_message_descriptor_get_field(
const ProtobufCMessageDescriptor *desc, unsigned value) {
int rv = int_range_lookup(desc->n_field_ranges, desc->field_ranges, value);
if (rv < 0) return NULL;
return desc->fields + rv;
}
const ProtobufCMethodDescriptor *
protobuf_c_service_descriptor_get_method_by_name(
const ProtobufCServiceDescriptor *desc, const char *name) {
unsigned start = 0;
unsigned count;
if (desc == NULL || desc->method_indices_by_name == NULL) return NULL;
count = desc->n_methods;
while (count > 1) {
unsigned mid = start + count / 2;
unsigned mid_index = desc->method_indices_by_name[mid];
const char *mid_name = desc->methods[mid_index].name;
int rv = strcmp(mid_name, name);
if (rv == 0) return desc->methods + desc->method_indices_by_name[mid];
if (rv < 0) {
count = start + count - (mid + 1);
start = mid + 1;
} else {
count = mid - start;
}
}
if (count == 0) return NULL;
if (strcmp(desc->methods[desc->method_indices_by_name[start]].name, name) ==
0)
return desc->methods + desc->method_indices_by_name[start];
return NULL;
}
@@ -798,76 +798,6 @@ uint32_t protobuf_c_version_number(void);
 */
 #define PROTOBUF_C_MIN_COMPILER_VERSION 1000000
/**
* Look up a `ProtobufCEnumValue` from a `ProtobufCEnumDescriptor` by name.
*
* \param desc
* The `ProtobufCEnumDescriptor` object.
* \param name
* The `name` field from the corresponding `ProtobufCEnumValue` object to
* match.
* \return
* A `ProtobufCEnumValue` object.
* \retval NULL
* If not found or if the optimize_for = CODE_SIZE option was set.
*/
PROTOBUF_C__API
const ProtobufCEnumValue *protobuf_c_enum_descriptor_get_value_by_name(
const ProtobufCEnumDescriptor *desc, const char *name);
/**
* Look up a `ProtobufCEnumValue` from a `ProtobufCEnumDescriptor` by numeric
* value.
*
* \param desc
* The `ProtobufCEnumDescriptor` object.
* \param value
* The `value` field from the corresponding `ProtobufCEnumValue` object to
* match.
*
* \return
* A `ProtobufCEnumValue` object.
* \retval NULL
* If not found.
*/
PROTOBUF_C__API
const ProtobufCEnumValue *protobuf_c_enum_descriptor_get_value(
const ProtobufCEnumDescriptor *desc, int value);
/**
* Look up a `ProtobufCFieldDescriptor` from a `ProtobufCMessageDescriptor` by
* the name of the field.
*
* \param desc
* The `ProtobufCMessageDescriptor` object.
* \param name
* The name of the field.
* \return
* A `ProtobufCFieldDescriptor` object.
* \retval NULL
* If not found or if the optimize_for = CODE_SIZE option was set.
*/
PROTOBUF_C__API
const ProtobufCFieldDescriptor *protobuf_c_message_descriptor_get_field_by_name(
const ProtobufCMessageDescriptor *desc, const char *name);
/**
* Look up a `ProtobufCFieldDescriptor` from a `ProtobufCMessageDescriptor` by
* the tag value of the field.
*
* \param desc
* The `ProtobufCMessageDescriptor` object.
* \param value
* The tag value of the field.
* \return
* A `ProtobufCFieldDescriptor` object.
* \retval NULL
* If not found.
*/
PROTOBUF_C__API
const ProtobufCFieldDescriptor *protobuf_c_message_descriptor_get_field(
const ProtobufCMessageDescriptor *desc, unsigned value);
/**
 * Determine the number of bytes required to store the serialised message.
 *
@@ -947,33 +877,6 @@ PROTOBUF_C__API
void protobuf_c_message_init(const ProtobufCMessageDescriptor *descriptor,
                             void *message);
/**
* Free a service.
*
* \param service
* The service object to free.
*/
PROTOBUF_C__API
void protobuf_c_service_destroy(ProtobufCService *service);
/**
* Look up a `ProtobufCMethodDescriptor` by name.
*
* \param desc
* Service descriptor.
* \param name
* Name of the method.
*
* \return
* A `ProtobufCMethodDescriptor` object.
* \retval NULL
* If not found or if the optimize_for = CODE_SIZE option was set.
*/
PROTOBUF_C__API
const ProtobufCMethodDescriptor *
protobuf_c_service_descriptor_get_method_by_name(
const ProtobufCServiceDescriptor *desc, const char *name);
/**
 * Initialise a `ProtobufCBufferSimple` object.
 */
@@ -1011,18 +914,6 @@ PROTOBUF_C__API
void protobuf_c_buffer_simple_append(ProtobufCBuffer *buffer, size_t len,
                                     const unsigned char *data);
PROTOBUF_C__API
void protobuf_c_service_generated_init(
ProtobufCService *service, const ProtobufCServiceDescriptor *descriptor,
ProtobufCServiceDestroy destroy);
PROTOBUF_C__API
void protobuf_c_service_invoke_internal(ProtobufCService *service,
unsigned method_index,
const ProtobufCMessage *input,
ProtobufCClosure closure,
void *closure_data);
/**@}*/

PROTOBUF_C__END_DECLS
...
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <condition_variable>
#include <functional>
#include <future>
#include <memory>
#include <mutex>
#include <queue>
#include <stdexcept>
#include <thread>
#include <vector>
namespace paddle_mobile {
class ThreadPool {
public:
static ThreadPool& getThreadPool();
static int getThreadPoolThreadId();
explicit ThreadPool(size_t);
template <class F, class... Args>
auto enqueue(F&& f, Args&&... args)
-> std::future<typename std::result_of<F(Args...)>::type>;
~ThreadPool();
int getTid(const std::thread::id& id) {
for (int i = 0; i < workers.size(); i++) {
if (workers[i].get_id() == id) {
return i;
}
}
return -1;
}
private:
// need to keep track of threads so we can join them
std::vector<std::thread> workers;
// the task queue
std::queue<std::function<void()>> tasks;
// synchronization
std::mutex queue_mutex;
std::condition_variable condition;
bool stop;
};
// the constructor just launches some amount of workers
inline ThreadPool::ThreadPool(size_t threads) : stop(false) {
for (size_t i = 0; i < threads; ++i)
workers.emplace_back([this] {
for (;;) {
std::function<void()> task;
{
std::unique_lock<std::mutex> lock(this->queue_mutex);
this->condition.wait(
lock, [this] { return this->stop || !this->tasks.empty(); });
// for (;;) {
// if (this->stop || !this->tasks.empty()) {
// break;
// }
// lock.unlock();
// lock.lock();
// }
if (this->stop && this->tasks.empty()) return;
task = std::move(this->tasks.front());
this->tasks.pop();
}
task();
}
});
}
// add new work item to the pool
template <class F, class... Args>
auto ThreadPool::enqueue(F&& f, Args&&... args)
-> std::future<typename std::result_of<F(Args...)>::type> {
using return_type = typename std::result_of<F(Args...)>::type;
auto task = std::make_shared<std::packaged_task<return_type()>>(
std::bind(std::forward<F>(f), std::forward<Args>(args)...));
std::future<return_type> res = task->get_future();
{
std::unique_lock<std::mutex> lock(queue_mutex);
// don't allow enqueueing after stopping the pool
// if(stop)
// throw std::runtime_error("enqueue on stopped ThreadPool");
tasks.emplace([task]() { (*task)(); });
}
condition.notify_one();
return res;
}
// the destructor joins all threads
inline ThreadPool::~ThreadPool() {
{
std::unique_lock<std::mutex> lock(queue_mutex);
stop = true;
}
condition.notify_all();
for (std::thread& worker : workers) worker.join();
}
inline ThreadPool& ThreadPool::getThreadPool() {
static ThreadPool threadPool(3);
return threadPool;
}
inline int ThreadPool::getThreadPoolThreadId() {
return getThreadPool().getTid(std::this_thread::get_id());
}
} // namespace paddle_mobile
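A minimal usage sketch, not part of the repository: submit a task to the singleton pool above and block on the returned future.
// Hypothetical example of driving the pool above (assumes this header is included).
#include <iostream>
int main() {
  auto &pool = paddle_mobile::ThreadPool::getThreadPool();
  // enqueue() binds the arguments and returns a std::future for the result.
  auto sum = pool.enqueue([](int a, int b) { return a + b; }, 2, 3);
  std::cout << sum.get() << std::endl;  // prints 5 once a worker runs the task
  return 0;
}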
@@ -12,11 +12,11 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */

-#pragma once;
+#pragma once

+#include <functional>
 #include <map>
 #include <string>
-#include <unordered_set>
 #include <vector>

 #include "framework/attribute.h"
 #include "framework/scope.h"
@@ -40,13 +40,6 @@ using OpCreator = std::function<framework::OperatorBase<Dtype> *(
     const framework::AttributeMap & /*attrs*/,
     std::shared_ptr<framework::Scope> /*scope*/)>;

-using GradOpMakerFN =
-    std::function<std::vector<std::unique_ptr<framework::OpDesc>>(
-        const framework::OpDesc &,
-        const std::unordered_set<std::string> & /*no_grad_set*/,
-        std::unordered_map<std::string, std::string> * /*grad_to_var*/,
-        const std::vector<framework::BlockDesc *> &grad_block)>;
-
 using InferVarTypeFN = std::function<void(const framework::OpDesc & /*op_desc*/,
                                           framework::BlockDesc * /*block*/)>;
...
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "common/types.h"
#include <vector>
namespace paddle_mobile {
const std::string G_OP_TYPE_CONV = "conv2d";
const std::string G_OP_TYPE_BATCHNORM = "batch_norm";
const std::string G_OP_TYPE_BOX_CODER = "box_coder";
const std::string G_OP_TYPE_CONCAT = "concat";
const std::string G_OP_TYPE_ELEMENTWISE_ADD = "elementwise_add";
const std::string G_OP_TYPE_FUSION_CONV_ADD_RELU = "fusion_conv_add_relu";
const std::string G_OP_TYPE_FC = "fc";
const std::string G_OP_TYPE_CONV_ADD = "conv_add";
const std::string G_OP_TYPE_LRN = "lrn";
const std::string G_OP_TYPE_MUL = "mul";
const std::string G_OP_TYPE_MULTICLASS_NMS = "multiclass_nms";
const std::string G_OP_TYPE_POOL2D = "pool2d";
const std::string G_OP_TYPE_PRIOR_BOX = "prior_box";
const std::string G_OP_TYPE_RELU = "relu";
const std::string G_OP_TYPE_RESHAPE = "reshape";
const std::string G_OP_TYPE_SIGMOID = "sigmoid";
const std::string G_OP_TYPE_SOFTMAX = "softmax";
const std::string G_OP_TYPE_TRANSPOSE = "transpose";
const std::string G_OP_TYPE_SPLIT = "split";
const std::string G_OP_TYPE_FEED = "feed";
const std::string G_OP_TYPE_FETCH = "fetch";
const std::string G_OP_TYPE_DEPTHWISE_CONV = "depthwise_conv2d";
const std::string G_OP_TYPE_IM2SEQUENCE = "im2sequence";
std::unordered_map<
std::string, std::pair<std::vector<std::string>, std::vector<std::string>>>
op_input_output_key = {
{G_OP_TYPE_CONV, {{"Input"}, {"Output"}}},
{G_OP_TYPE_CONV_ADD, {{"Input"}, {"Out"}}},
{G_OP_TYPE_RELU, {{"X"}, {"Out"}}},
{G_OP_TYPE_SOFTMAX, {{"X"}, {"Out"}}},
{G_OP_TYPE_MUL, {{"X"}, {"Out"}}},
{G_OP_TYPE_ELEMENTWISE_ADD, {{"X", "Y"}, {"Out"}}},
{G_OP_TYPE_POOL2D, {{"X"}, {"Out"}}},
{G_OP_TYPE_BATCHNORM, {{"X"}, {"Y"}}},
{G_OP_TYPE_LRN, {{"X"}, {"Out"}}},
{G_OP_TYPE_CONCAT, {{"X"}, {"Out"}}},
{G_OP_TYPE_SPLIT, {{"X"}, {"Out"}}},
{G_OP_TYPE_FEED, {{"X"}, {"Out"}}},
{G_OP_TYPE_FETCH, {{"X"}, {"Out"}}},
{G_OP_TYPE_TRANSPOSE, {{"X"}, {"Out"}}},
{G_OP_TYPE_BOX_CODER,
{{"PriorBox", "PriorBoxVar", "TargetBox"}, {"OutputBox"}}},
{G_OP_TYPE_PRIOR_BOX, {{"Image", "Input"}, {"Boxes", "Variances"}}},
{G_OP_TYPE_MULTICLASS_NMS, {{"BBoxes", "Scores"}, {"Out"}}},
{G_OP_TYPE_FC, {{"X", "Y", "Z"}, {"Out"}}},
{G_OP_TYPE_RESHAPE, {{"X"}, {"Out"}}},
{G_OP_TYPE_DEPTHWISE_CONV, {{"Input"}, {"Output"}}},
{G_OP_TYPE_FUSION_CONV_ADD_RELU, {{"Input"}, {"Out"}}},
{G_OP_TYPE_IM2SEQUENCE, {{"X"}, {"Out"}}}};
} // namespace paddle_mobile
@@ -12,11 +12,11 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */

#pragma once

#include <string>
#include <unordered_map>
#include <vector>

namespace paddle_mobile {
enum class Precision : int { FP32 = 0 };

@@ -72,50 +72,32 @@ enum PMStatus {
  PMWrongDevice = 0x08 /*!< un-correct device. */
};

extern const std::string G_OP_TYPE_CONV;
extern const std::string G_OP_TYPE_BATCHNORM;
extern const std::string G_OP_TYPE_BOX_CODER;
extern const std::string G_OP_TYPE_CONCAT;
extern const std::string G_OP_TYPE_ELEMENTWISE_ADD;
extern const std::string G_OP_TYPE_FUSION_CONV_ADD_RELU;
extern const std::string G_OP_TYPE_FC;
extern const std::string G_OP_TYPE_CONV_ADD;
extern const std::string G_OP_TYPE_LRN;
extern const std::string G_OP_TYPE_MUL;
extern const std::string G_OP_TYPE_MULTICLASS_NMS;
extern const std::string G_OP_TYPE_POOL2D;
extern const std::string G_OP_TYPE_PRIOR_BOX;
extern const std::string G_OP_TYPE_RELU;
extern const std::string G_OP_TYPE_RESHAPE;
extern const std::string G_OP_TYPE_SIGMOID;
extern const std::string G_OP_TYPE_SOFTMAX;
extern const std::string G_OP_TYPE_TRANSPOSE;
extern const std::string G_OP_TYPE_SPLIT;
extern const std::string G_OP_TYPE_FEED;
extern const std::string G_OP_TYPE_FETCH;
extern const std::string G_OP_TYPE_DEPTHWISE_CONV;
extern const std::string G_OP_TYPE_IM2SEQUENCE;

extern std::unordered_map<
    std::string, std::pair<std::vector<std::string>, std::vector<std::string>>>
    op_input_output_key;

} // namespace paddle_mobile
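// Side note on the static -> extern change above, as a minimal self-contained
// sketch (not part of the commit): `static const` strings in a header give
// every translation unit its own copy, while an extern declaration in the
// header plus a single definition in one .cpp yields one shared object.
// The _DEMO name below is hypothetical.
#include <iostream>
#include <string>

// What the header now carries: a declaration only.
extern const std::string G_OP_TYPE_CONV_DEMO;
// What the single .cpp now carries: the one program-wide definition.
const std::string G_OP_TYPE_CONV_DEMO = "conv2d";

int main() {
  std::cout << G_OP_TYPE_CONV_DEMO << std::endl;  // prints: conv2d
  return 0;
}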
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
@@ -12,8 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */

#include "common/enforce.h"
#include "common/log.h"

#pragma once

@@ -57,15 +56,11 @@ class RawData {
  char data[size];

  RawData() {}
  RawData(const RawData &raw_data) { strcpy(data, raw_data.data); }
};

template <typename... Ts>
struct Variant {
  Variant(const Variant &variant) {
    type_id = variant.type_id;
    data = variant.data;
  }

@@ -87,8 +82,7 @@ struct Variant {
    if (type_id == typeid(T).hash_code()) {
      return *const_cast<T *>(reinterpret_cast<const T *>(&data));
    } else {
      PADDLE_MOBILE_THROW_EXCEPTION(" bad cast in variant ");
    }
  }

...
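// A standalone sketch (simplified, not the real paddle_mobile::Variant) of the
// pattern above: raw storage tagged with typeid().hash_code(), and a Get<T>()
// that refuses mismatched types. Restricted to trivially copyable types and a
// fixed 64-byte buffer to keep the demo short; the real code throws through
// PADDLE_MOBILE_THROW_EXCEPTION instead of std::bad_cast.
#include <cstddef>
#include <cstring>
#include <typeinfo>

template <typename... Ts>
struct MiniVariant {
  template <typename T>
  void Set(T value) {
    type_id = typeid(T).hash_code();
    std::memcpy(data, &value, sizeof(T));  // assumes T is trivially copyable
  }
  template <typename T>
  T Get() const {
    if (type_id != typeid(T).hash_code()) {
      throw std::bad_cast();  // mismatched type tag
    }
    T value;
    std::memcpy(&value, data, sizeof(T));
    return value;
  }
  std::size_t type_id = 0;
  alignas(8) char data[64];  // assumes every T fits in 64 bytes
};

int main() {
  MiniVariant<int, float> v;
  v.Set(3);
  return v.Get<int>() == 3 ? 0 : 1;
}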
@@ -17,14 +17,8 @@ limitations under the License. */

namespace paddle_mobile {
namespace framework {

struct PrintVistor : Vistor<Print &> {
  explicit PrintVistor(Print &printer) : printer_(printer) {}
  template <typename T>
  Print &operator()(const T &value) {
    printer_ << value;

...
@@ -14,7 +14,11 @@ limitations under the License. */

#pragma once

#include <string>
#include <typeinfo>
#include <unordered_map>
#include <vector>

#include "common/enforce.h"
#include "common/log.h"
#include "common/variant.h"
@@ -22,28 +26,15 @@ limitations under the License. */

namespace paddle_mobile {
namespace framework {

using std::string;
using std::vector;

class BlockDesc;

class Attribute {
 public:
  static Attribute GetAttrValue(
      PaddleMobile__Framework__Proto__OpDesc__Attr *attr_desc) {
    Attribute attr;
    switch (attr_desc->type) {
      case PADDLE_MOBILE__FRAMEWORK__PROTO__ATTR_TYPE__BOOLEAN: {

@@ -63,35 +54,35 @@ class Attribute {
        break;
      }
      case PADDLE_MOBILE__FRAMEWORK__PROTO__ATTR_TYPE__BOOLEANS: {
        vector<bool> val(attr_desc->n_bools);
        for (int i = 0; i < attr_desc->n_bools; ++i) {
          val[i] = attr_desc->bools[i];
        }
        attr.Set<vector<bool>>(val);
        break;
      }
      case PADDLE_MOBILE__FRAMEWORK__PROTO__ATTR_TYPE__INTS: {
        vector<int> val(attr_desc->n_ints);
        for (int i = 0; i < attr_desc->n_ints; ++i) {
          val[i] = attr_desc->ints[i];
        }
        attr.Set<vector<int>>(val);
        break;
      }
      case PADDLE_MOBILE__FRAMEWORK__PROTO__ATTR_TYPE__FLOATS: {
        vector<float> val(attr_desc->n_floats);
        for (int i = 0; i < attr_desc->n_floats; ++i) {
          val[i] = attr_desc->floats[i];
        }
        attr.Set<vector<float>>(val);
        break;
      }
      case PADDLE_MOBILE__FRAMEWORK__PROTO__ATTR_TYPE__STRINGS: {
        vector<string> val(attr_desc->n_strings);
        for (int i = 0; i < attr_desc->n_strings; ++i) {
          val[i] = attr_desc->strings[i];
        }
        attr.Set<vector<string>>(val);
        break;
      }
      case PADDLE_MOBILE__FRAMEWORK__PROTO__ATTR_TYPE__LONG: {

@@ -122,47 +113,41 @@ class Attribute {
      return vistor(attr.variant_.Get<int>());
    } else if (attr.variant_.TypeId() == typeid(float).hash_code()) {
      return vistor(attr.variant_.Get<float>());
    } else if (attr.variant_.TypeId() == typeid(string).hash_code()) {
      return vistor(attr.variant_.Get<string>());
    } else if (attr.variant_.TypeId() == typeid(vector<int>).hash_code()) {
      return vistor(attr.variant_.Get<vector<int>>());
    } else if (attr.variant_.TypeId() == typeid(vector<float>).hash_code()) {
      return vistor(attr.variant_.Get<vector<float>>());
    } else if (attr.variant_.TypeId() == typeid(vector<string>).hash_code()) {
      return vistor(attr.variant_.Get<vector<string>>());
    } else if (attr.variant_.TypeId() == typeid(bool).hash_code()) {
      return vistor(attr.variant_.Get<bool>());
    } else if (attr.variant_.TypeId() == typeid(vector<bool>).hash_code()) {
      return vistor(attr.variant_.Get<vector<bool>>());
    } else if (attr.variant_.TypeId() == typeid(int64_t).hash_code()) {
      return vistor(attr.variant_.Get<int64_t>());
    } else {
      PADDLE_MOBILE_THROW_EXCEPTION("type not support");
    }
  }

 private:
  Variant<int, float, string, vector<int>, vector<float>, vector<string>, bool,
          vector<bool>, BlockDesc *, int64_t>
      variant_;
};

using AttributeMap = std::unordered_map<string, Attribute>;

class AttrReader {
 public:
  explicit AttrReader(const AttributeMap &attrs) : attrs_(attrs) {}

  template <typename T>
  inline T Get(const string &name) const {
    PADDLE_MOBILE_ENFORCE(attrs_.count(name) != 0,
                          "%s should be in AttributeMap", name);
    return ((Attribute)attrs_.at(name)).Get<T>();
  }

...
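// The point of the new PADDLE_MOBILE_ENFORCE in AttrReader::Get is that a
// missing key now fails loudly instead of relying on a commented-out check.
// A self-contained analogue with hypothetical Demo* names (the real
// Attribute/AttributeMap come from this header):
#include <stdexcept>
#include <string>
#include <unordered_map>

using DemoAttributeMap = std::unordered_map<std::string, int>;

class DemoAttrReader {
 public:
  explicit DemoAttrReader(const DemoAttributeMap &attrs) : attrs_(attrs) {}
  int Get(const std::string &name) const {
    if (attrs_.count(name) == 0) {  // mirrors PADDLE_MOBILE_ENFORCE(...)
      throw std::runtime_error(name + " should be in AttributeMap");
    }
    return attrs_.at(name);
  }

 private:
  const DemoAttributeMap &attrs_;
};

int main() {
  DemoAttributeMap attrs{{"axis", 1}};
  DemoAttrReader reader(attrs);
  return reader.Get("axis") == 1 ? 0 : 1;
}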
@@ -15,7 +15,6 @@ limitations under the License. */

#pragma once

#include <cctype>
#include <string>

namespace paddle_mobile {

@@ -40,7 +39,7 @@ inline DataLayout StringToDataLayout(const std::string &str) {
  } else if (s == "ANYLAYOUT") {
    return DataLayout::kAnyLayout;
  } else {
    PADDLE_MOBILE_THROW_EXCEPTION("Unknown storage order string: %s", s.c_str())
  }
}

@@ -54,14 +53,8 @@ inline std::string DataLayoutToString(const DataLayout &data_layout) {
      return "ANY_LAYOUT";
    default:
      break;
  }
}

} // namespace framework
} // namespace paddle_mobile
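// Standalone sketch of the StringToDataLayout dispatch above, including the
// new throw-on-unknown-string behavior. The Demo* names are illustrative and
// std::runtime_error stands in for PADDLE_MOBILE_THROW_EXCEPTION.
#include <algorithm>
#include <cctype>
#include <stdexcept>
#include <string>

enum class DemoDataLayout { kNHWC, kNCHW, kAnyLayout };

DemoDataLayout DemoStringToDataLayout(const std::string &str) {
  std::string s(str);
  std::transform(s.begin(), s.end(), s.begin(), ::toupper);
  if (s == "NHWC") return DemoDataLayout::kNHWC;
  if (s == "NCHW") return DemoDataLayout::kNCHW;
  if (s == "ANYLAYOUT") return DemoDataLayout::kAnyLayout;
  throw std::runtime_error("Unknown storage order string: " + s);
}

int main() {
  return DemoStringToDataLayout("nchw") == DemoDataLayout::kNCHW ? 0 : 1;
}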
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include "framework/data_transform.h"
namespace paddle_mobile {
namespace framework {
static void PassTensorData(Tensor *from, Tensor *to) {
to->ShareDataWith(*from);
*from = Tensor();
}
void DataTransform(const OpKernelType &expected_kernel_type,
const OpKernelType &kernel_type_for_var,
const Tensor &input_tensor, Tensor *output_tensor) {
bool transformed = false;
Tensor in;
in.ShareDataWith(input_tensor);
Tensor out;
// // do layout transform
// if (NeedTransformLayout(expected_kernel_type.data_layout_,
// kernel_type_for_var.data_layout_)) {
// TransDataLayout(kernel_type_for_var, expected_kernel_type, in,
// &out);
// transformed = true;
// PassTensorData(&out, &in);
// }
//
// // do data type transform
// if (expected_kernel_type.data_type_ !=
// kernel_type_for_var.data_type_) {
// TransDataType(kernel_type_for_var, expected_kernel_type, in,
// &out);
// transformed = true;
// PassTensorData(&out, &in);
// }
//
// // do device transform
// if (!platform::is_same_place(kernel_type_for_var.place_,
// expected_kernel_type.place_)) {
// TransDataDevice(in, expected_kernel_type.place_, &out);
// transformed = true;
// PassTensorData(&out, &in);
// }
//
// PADDLE_ENFORCE(transformed, "No transform is applied, please
// check!");
// get output data
output_tensor->ShareDataWith(in);
}
void CopyVariableWithTensor(const Variable &in_var, const Tensor &tensor,
Variable *out_var) {
// if (in_var.IsType<LoDTensor>()) {
// auto& in_lod_tensor = in_var.Get<LoDTensor>();
// auto* tran_lod_tensor = out_var.GetMutable<LoDTensor>();
// tran_lod_tensor->set_lod(in_lod_tensor.lod());
// tran_lod_tensor->set_layout(in_lod_tensor.layout());
// tran_lod_tensor->ShareDataWith(tensor);
// } else if (in_var.IsType<SelectedRows>()) {
// auto& in_selected_rows = in_var.Get<SelectedRows>();
// auto* trans_selected_rows =
// out_var.GetMutable<SelectedRows>();
// trans_selected_rows->set_height(in_selected_rows.height());
// trans_selected_rows->set_rows(in_selected_rows.rows());
// trans_selected_rows->mutable_value()->ShareDataWith(tensor);
// } else {
// PADDLE_THROW("unknown var type");
// }
}
} // namespace framework
} // namespace paddle_mobile
@@ -63,9 +63,6 @@ void make_ddim(DDim &ddim, const int64_t *dims, int n) {
      ddim = make_dim<9>(dims);
      break;
    default:
      break;
  }
}

@@ -133,9 +130,6 @@ int64_t DDim::operator[](int idx) const {
int DDim::size() const { return arity(*this); }

bool DDim::operator==(DDim d) const {
  std::vector<int64_t> v1 = vectorize(*this);
  std::vector<int64_t> v2 = vectorize(d);

@@ -157,7 +151,7 @@ DDim DDim::operator+(DDim d) const {
  std::vector<int64_t> v3;

  PADDLE_MOBILE_ENFORCE(v1.size() == v2.size(), "v1.size() != v2.size()");

  for (unsigned int i = 0; i < v1.size(); i++) {
    v3.push_back(v1[i] + v2[i]);

@@ -172,7 +166,7 @@ DDim DDim::operator*(DDim d) const {
  std::vector<int64_t> v3;

  PADDLE_MOBILE_ENFORCE(v1.size() == v2.size(), "v1.size() == v2.size()");

  for (unsigned int i = 0; i < v1.size(); i++) {
    v3.push_back(v1[i] * v2[i]);

@@ -183,7 +177,7 @@ DDim DDim::operator*(DDim d) const {
int64_t get(const DDim &ddim, int idx) { return ddim[idx]; }

void set(DDim *ddim, int idx, int value) { (*ddim)[idx] = value; }

/// @cond HIDDEN
struct VectorizeVisitor : Vistor<void> {

@@ -235,13 +229,10 @@ struct SliceVectorizeVisitor : Vistor<void> {
  SliceVectorizeVisitor(std::vector<int64_t> &v, int b, int e)
      : vector(v), begin(b), end(e) {
    PADDLE_MOBILE_ENFORCE(
        begin < end, "Begin index must be less than end index in ddim slice.");
    PADDLE_MOBILE_ENFORCE(begin >= 0,
                          "Begin index can't be less than zero in ddim slice.");
  }
  template <int S>

@@ -267,9 +258,7 @@ DDim slice_ddim(const DDim &ddim, int begin, int end) {
  std::vector<int64_t> vec;
  vec.reserve(end - begin);
  SliceVectorizeVisitor visitor(vec, begin, end);
  DDim::ApplyVistor(visitor, ddim);
  return make_ddim(vec);
}

@@ -287,31 +276,19 @@ struct ArityVisitor : Vistor<int> {
int arity(const DDim &d) {
  ArityVisitor arityVisitor = ArityVisitor();
  return DDim::ApplyVistor(arityVisitor, d);
}

#ifdef PADDLE_MOBILE_DEBUG
Print &operator<<(Print &printer, const DDim &ddim) {
  for (int j = 0; j < ddim.size(); ++j) {
    printer << ddim[j] << " ";
  }
  return printer;
}
#endif

DDim::DDim(std::initializer_list<int64_t> init_list) {
  *this = make_ddim(init_list);
}

...
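// Sketch of the elementwise DDim::operator+ just shown, with std::vector as a
// stand-in for vectorize(DDim) (assumed simplification). The commit's change
// is the guard: a real enforce with a message instead of a bare assert().
#include <cstddef>
#include <cstdint>
#include <stdexcept>
#include <vector>

std::vector<int64_t> demo_ddim_plus(const std::vector<int64_t> &v1,
                                    const std::vector<int64_t> &v2) {
  if (v1.size() != v2.size()) {  // mirrors PADDLE_MOBILE_ENFORCE(...)
    throw std::runtime_error("v1.size() != v2.size()");
  }
  std::vector<int64_t> v3;
  v3.reserve(v1.size());
  for (std::size_t i = 0; i < v1.size(); ++i) v3.push_back(v1[i] + v2[i]);
  return v3;
}

int main() {
  auto d = demo_ddim_plus({1, 3, 224, 224}, {0, 0, 32, 32});
  return d[2] == 256 ? 0 : 1;
}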
@@ -14,10 +14,10 @@ limitations under the License. */

#pragma once

#include <initializer_list>
#include <typeinfo>
#include <vector>

#include "common/enforce.h"
#include "common/variant.h"
#include "dim.h"

@@ -58,9 +58,7 @@ struct DDim {
    } else if (d.var.TypeId() == typeid(Dim<9>).hash_code()) {
      return vistor(d.var.Get<Dim<9>>());
    } else {
      DLOG << " dim not support";
    }
  }

@@ -83,17 +81,6 @@ struct DDim {
  int64_t operator[](int idx) const;

  DDimVar getVar() { return var; }

  bool operator==(DDim d) const;

@@ -126,7 +113,7 @@ DDim make_ddim(std::initializer_list<int64_t> dims);
int64_t get(const DDim &dim, int idx);

void set(DDim *dim, int idx, int val);

std::vector<int64_t> vectorize(const DDim &ddim);

@@ -151,8 +138,6 @@ DDim slice_ddim(const DDim &dim, int begin, int end);
int arity(const DDim &ddim);

// Reshape a tensor to a matrix. The matrix's first dimension(column
// length)
// will be the product of tensor's first `num_col_dims` dimensions.

@@ -163,5 +148,9 @@ DDim flatten_to_1d(const DDim &src);
DDim stride(const DDim &ddim);

DDim stride_numel(const DDim &ddim);

#ifdef PADDLE_MOBILE_DEBUG
Print &operator<<(Print &printer, const DDim &ddim);
#endif

} // namespace framework
} // namespace paddle_mobile
@@ -14,13 +14,7 @@ limitations under the License. */

#pragma once

#include "common/enforce.h"

namespace paddle_mobile {
namespace framework {

@@ -30,42 +24,35 @@ struct Dim {
  static constexpr int dimensions = i;

  template <typename... Args>
  Dim(int64_t _head, Args... _tail) : head(_head), tail(_tail...) {
    static_assert(sizeof...(_tail) == i - 1,
                  "Dim initialized with the wrong number of parameters");
  }

  Dim(int64_t _head, const Dim<i - 1> &_tail) : head(_head), tail(_tail) {}

  Dim() : head(0), tail() {}

  /** Construct a Dim from a linear index and size. Uses Fortran
   * order
   * indexing. */
  Dim(int64_t idx, const Dim<i> &size)
      : head(idx % size.head), tail(idx / size.head, size.tail) {}

  /** Construct a Dim with each dimension set to the given index */
  Dim(int64_t idx) : head(idx), tail(idx) {}

  bool operator==(const Dim<i> &o) const {
    return (head == o.head) && (tail == o.tail);
  }

  bool operator!=(const Dim<i> &o) const { return !(*this == o); }

  int64_t &operator[](int idx);
  int64_t operator[](int idx) const;

  std::string to_string() const;

  int64_t head;
  Dim<i - 1> tail;

@@ -76,32 +63,22 @@ template <>
struct Dim<0> {
  static constexpr int dimensions = 0;

  Dim(int64_t _head) {}

  Dim() {}

  Dim(int idx, const Dim<0> &size) {
    if (idx > 0) {
      PADDLE_MOBILE_THROW_EXCEPTION("Index out of range.")
    }
  }

  bool operator==(const Dim<0> &o) const { return true; }

  bool operator!=(const Dim<0> &o) const { return false; }

  int64_t &operator[](int idx);
  int64_t operator[](int idx) const;
};

@@ -112,12 +89,12 @@ template <int i>
struct DimGetter {
  // Return a copy if Dim is const
  template <typename D>
  static int64_t impl(const D &d) {
    return DimGetter<i - 1>::impl(d.tail);
  }
  // Return a reference if Dim is mutable
  template <typename D>
  static int64_t &impl(D &d) {
    return DimGetter<i - 1>::impl(d.tail);
  }
};

@@ -127,25 +104,22 @@ template <>
struct DimGetter<0> {
  // Return a copy if Dim is const
  template <typename D>
  static int64_t impl(const D &d) {
    return d.head;
  }
  // Return a reference if Dim is mutable
  template <typename D>
  static int64_t &impl(D &d) {
    return d.head;
  }
};

template <int D>
int64_t &indexer(Dim<D> &dim, int idx) {
  if (idx < 0) {
    PADDLE_MOBILE_THROW_EXCEPTION("Tried to access a negative dimension")
  }
  if (idx == 0) {
    return dim.head;
  }

@@ -153,31 +127,15 @@ HOSTDEVICE int64_t &indexer(Dim<D> &dim, int idx) {
}

template <>
int64_t &indexer<0>(Dim<0> &dim, int idx) {
  PADDLE_MOBILE_THROW_EXCEPTION("Invalid index")
}

template <int D>
int64_t indexer(const Dim<D> &dim, int idx) {
  if (idx < 0) {
    PADDLE_MOBILE_THROW_EXCEPTION("Tried to access a negative dimension")
  }
  if (idx == 0) {
    return dim.head;
  }

@@ -185,102 +143,84 @@ HOSTDEVICE int64_t indexer(const Dim<D> &dim, int idx) {
}

template <>
int64_t indexer<0>(const Dim<0> &dim, int idx) {
  PADDLE_MOBILE_THROW_EXCEPTION("Invalid index")
}

} // namespace

// Static access to constant Dim
template <int i, int l>
int64_t get(const Dim<l> &d) {
  return DimGetter<i>::impl(d);
}

// Static access to mutable Dim
template <int i, int l>
int64_t &get(Dim<l> &d) {
  return DimGetter<i>::impl(d);
}

// Dynamic access to constant Dim
template <int l>
int64_t Dim<l>::operator[](int i) const {
  // std::cout << "l: " << l << std::endl;
  return indexer(*this, i);
}

// Dynamic access to mutable Dim
template <int l>
int64_t &Dim<l>::operator[](int i) {
  return indexer(*this, i);
}

// Dynamic access to constant Dim
inline int64_t Dim<0>::operator[](int i) const { return indexer(*this, i); }

// Dynamic access to mutable Dim
inline int64_t &Dim<0>::operator[](int i) { return indexer(*this, i); }

// Dynamic access to constant Dim
// without std::enable_if will try to instantiate this on get<0>(d)
template <int l>
typename std::enable_if<(l > 0), int64_t>::type get(const Dim<l> &d, int i) {
  return d[i];
}

// Dynamic access to mutable Dim
template <int l>
typename std::enable_if<(l > 0), int64_t &>::type get(Dim<l> &d, int i) {
  return d[i];
}

// Dot product of two dims
template <int i>
int64_t linearize(const Dim<i> &a, const Dim<i> &b) {
  return a.head * b.head + linearize(a.tail, b.tail);
}

// Base case dot product of two Dims
// Notice it is inline because it is no longer a template
template <>
inline int64_t linearize(const Dim<0> &a, const Dim<0> &b) {
  return 0;
}

// Product of a Dim
template <int i>
int64_t product(const Dim<i> &a, int prod = 1) {
  return prod * a.head * product(a.tail);
}

// Base case product of a Dim
// Notice it is inline because it is no longer a template
template <>
inline int64_t product(const Dim<0> &a, int prod) {
  return prod;
}

// Is 0 <= idx_i < size_i for all i?
template <int i>
bool contained(const Dim<i> &idx, const Dim<i> &size) {
  return ((0 <= idx.head) && (idx.head < size.head) &&
          contained(idx.tail, size.tail));
}

@@ -288,7 +228,7 @@ HOSTDEVICE bool contained(const Dim<i> &idx, const Dim<i> &size) {
// Base case of is 0 <= idx_i < size_i ?
// Notice it is inline because it is no longer a template
template <>
inline bool contained(const Dim<0> &idx, const Dim<0> &size) {
  return true;
}

@@ -296,7 +236,7 @@ HOSTDEVICE inline bool contained(const Dim<0> &idx, const Dim<0> &size) {
 * \brief Compute exclusive prefix-multiply of a Dim.
 */
template <int i>
Dim<i> ex_prefix_mul(const Dim<i> &src, int mul = 1) {
  return Dim<i>(mul, ex_prefix_mul(src.tail, mul * src.head));
}

@@ -304,7 +244,7 @@ HOSTDEVICE Dim<i> ex_prefix_mul(const Dim<i> &src, int mul = 1) {
// Base case of ex_prefix_mul
// Notice it is inline because it is no longer a template
template <>
inline Dim<0> ex_prefix_mul(const Dim<0> &src, int mul) {
  return Dim<0>();
}
///\endcond

@@ -313,18 +253,18 @@ HOSTDEVICE inline Dim<0> ex_prefix_mul(const Dim<0> &src, int mul) {
 * Add two dimensions together
 */
template <int i>
Dim<i> dim_plus(const Dim<i> &a, const Dim<i> &b) {
  return Dim<i>(a.head + b.head, dim_plus(a.tail, b.tail));
}

// Base case
template <>
inline Dim<0> dim_plus(const Dim<0> &a, const Dim<0> &b) {
  return Dim<0>();
}

template <int i>
Dim<i> operator+(const Dim<i> &lhs, const Dim<i> &rhs) {
  return dim_plus(lhs, rhs);
}

@@ -332,18 +272,18 @@ HOSTDEVICE Dim<i> operator+(const Dim<i> &lhs, const Dim<i> &rhs) {
 * Multiply two dimensions together
 */
template <int i>
Dim<i> dim_mult(const Dim<i> &a, const Dim<i> &b) {
  return Dim<i>(a.head * b.head, dim_mult(a.tail, b.tail));
}

// Base case
template <>
inline Dim<0> dim_mult(const Dim<0> &a, const Dim<0> &b) {
  return Dim<0>();
}

template <int i>
Dim<i> operator*(const Dim<i> &lhs, const Dim<i> &rhs) {
  return dim_mult(lhs, rhs);
}

@@ -358,7 +298,7 @@ HOSTDEVICE Dim<i> operator*(const Dim<i> &lhs, const Dim<i> &rhs) {
 */
template <int i>
Dim<i> normalize_strides(const Dim<i> &size, const Dim<i> &stride) {
  int norm_stride = size.head == 1 ? 0 : stride.head;
  return Dim<i>(norm_stride, normalize_strides(size.tail, stride.tail));
}

@@ -366,8 +306,7 @@ HOSTDEVICE inline Dim<0> normalize_strides(const Dim<0> &size,
///\cond HIDDEN
template <>
inline Dim<0> normalize_strides(const Dim<0> &size, const Dim<0> &stride) {
  return Dim<0>();
}

@@ -382,54 +321,9 @@ HOSTDEVICE inline Dim<0> normalize_strides(const Dim<0> &size,
 */
template <typename... Args>
Dim<sizeof...(Args)> make_dim(Args... idxes) {
  return Dim<sizeof...(Args)>(idxes...);
}

} // namespace framework
} // namespace paddle_mobile
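// The Dim<i> template above is a compile-time head/tail list; this standalone
// sketch (simplified, no bounds checks, hypothetical Demo* names) shows how a
// function like product() recurses down to the Dim<0> base case.
#include <cstdint>

template <int i>
struct DemoDim {
  int64_t head;
  DemoDim<i - 1> tail;
};
template <>
struct DemoDim<0> {};

inline int64_t demo_product(const DemoDim<0> &) { return 1; }  // base case
template <int i>
int64_t demo_product(const DemoDim<i> &d) {
  return d.head * demo_product(d.tail);  // recurse into the tail
}

int main() {
  DemoDim<3> d{2, {3, {4, {}}}};  // a 2 x 3 x 4 shape
  return demo_product(d) == 24 ? 0 : 1;
}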
@@ -13,72 +13,56 @@ See the License for the specific language governing permissions and
limitations under the License. */

#include "lod_tensor.h"
#include <algorithm>

namespace paddle_mobile {
namespace framework {

// std::ostream &operator<<(std::ostream &os, const LoD &lod) {
//  os << "{";
//  for (auto &v : lod) {
//    os << "{";
//    bool is_first = true;
//    for (auto &i : v) {
//      if (is_first) {
//        os << i;
//        is_first = false;
//      } else {
//        os << ", " << i;
//      }
//    }
//    os << "}";
//  }
//  os << "}";
//
//  return os;
//}
//
// std::ostream &operator<<(std::ostream &os, const LoDTensor &t) {
//  PADDLE_MOBILE_ENFORCE(t.type().hash_code() == typeid(float).hash_code(),
//                        "t.type() is not float");
//  os << "dim: " << t.dims() << "\n";
//  os << "lod: " << t.lod() << "\n";
//  // only print first ten elements
//  int64_t size = t.numel() < 10 ? t.numel() : 10;
//  for (int64_t i = 0; i < size; ++i) {
//    os << t.data<float>()[i] << " ";
//  }
//
//  return os;
//}

// std::string LoDToString(const LoD &lod) {
//  std::ostringstream stream;
//  stream << lod;
//  return stream.str();
//}

LoD SliceInLevel(const LoD &in, size_t level, size_t elem_begin,
                 size_t elem_end) {
  PADDLE_MOBILE_ENFORCE(level < in.size(), "level should >= in.size()");
  PADDLE_MOBILE_ENFORCE(elem_end < in[level].size(),
                        "elem_end >= in[level].size()");

  LoD res;
  res.resize(in.size() - level);
  // copy the first level

@@ -152,7 +136,7 @@ bool CheckLoD(const LoD &in, int tensor_height) {
        if (a < b) return true;
        return false;
      })) {
    PADDLE_MOBILE_THROW_EXCEPTION("ascending error")
    return false;
  }
}

@@ -211,8 +195,9 @@ LoDAndOffset GetSubLoDAndAbsoluteOffset(const LoD &lod, size_t start_idx,
  LoD sub_lod;

  for (size_t level_idx = start_level; level_idx < lod.size(); ++level_idx) {
    PADDLE_MOBILE_ENFORCE(start_idx <= end_idx, "start_idx > end_idx");
    PADDLE_MOBILE_ENFORCE(end_idx < lod[level_idx].size(),
                          "end_idx >= lod[level_idx].size()");
    std::vector<size_t> level_lens;
    for (size_t i = start_idx; i < end_idx; ++i) {
      level_lens.push_back(lod[level_idx][i + 1] - lod[level_idx][i]);

@@ -226,10 +211,9 @@ LoDAndOffset GetSubLoDAndAbsoluteOffset(const LoD &lod, size_t start_idx,
}

void AppendLoD(LoD *lod, const LoD &lod_length) {
  PADDLE_MOBILE_ENFORCE(
      lod->empty() || lod->size() == lod_length.size(),
      "The lod_length should has the same size with the appended lod.");
  if (lod->empty()) {
    for (size_t i = 0; i < lod_length.size(); ++i) {
      lod->emplace_back(1, 0);  // size = 1, value = 0;

...
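// Context for the LoD checks above: a LoD ("level of details") is a vector of
// monotonically increasing offset vectors. A standalone sketch of the
// ascending property that CheckLoD enforces (simplified, hypothetical Demo*
// names, not the real API):
#include <cstddef>
#include <vector>

using DemoLoD = std::vector<std::vector<std::size_t>>;

bool DemoIsAscending(const DemoLoD &lod) {
  for (const auto &level : lod) {
    for (std::size_t i = 1; i < level.size(); ++i) {
      if (level[i] < level[i - 1]) return false;  // the "ascending error" case
    }
  }
  return true;
}

int main() {
  DemoLoD lod{{0, 2, 5}, {0, 1, 3, 4, 5}};  // two nesting levels of offsets
  return DemoIsAscending(lod) ? 0 : 1;
}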
@@ -16,7 +16,6 @@ limitations under the License. */

#include <memory>
#include <string>
#include <vector>

#include "tensor.h"
#include "tensor_util.h"

...
@@ -25,9 +25,8 @@ template <typename Dtype>
struct OpInfo {
  OpCreator<Dtype> creator_;
  const OpCreator<Dtype> &Creator() const {
    PADDLE_MOBILE_ENFORCE(creator_ != nullptr,
                          "Operator Creator has not been registered");
    return creator_;
  }
};

@@ -48,17 +47,15 @@ class OpInfoMap {
  }

  void Insert(const std::string &type, const OpInfo<Dtype> &info) {
    PADDLE_MOBILE_ENFORCE(!Has(type), "Operator %s has been registered",
                          type.c_str());
    map_.insert({type, info});
  }

  const OpInfo<Dtype> &Get(const std::string &type) const {
    auto op_info_ptr = GetNullable(type);
    PADDLE_MOBILE_ENFORCE(op_info_ptr != nullptr,
                          "Operator %s has not been registered", type.c_str());
    return *op_info_ptr;
  }

...
@@ -96,24 +96,39 @@ class OpRegistry {
  }
};

#define REGISTER_OPERATOR(op_type, op_class, device_name, device_type)     \
  template <typename Dtype, typename T>                                    \
  class _OpClass_##op_type##_##device_name : public op_class<Dtype, T> {   \
   public:                                                                 \
    DEFINE_OP_CONSTRUCTOR(_OpClass_##op_type##_##device_name, op_class);   \
  };                                                                       \
  static paddle_mobile::framework::OperatorRegistrar<                      \
      device_type, _OpClass_##op_type##_##device_name<device_type, float>> \
      __op_registrar_##op_type##_##device_name(#op_type);                  \
  int TouchOpRegistrar_##op_type##_##device_name() {                       \
    __op_registrar_##op_type##_##device_name.Touch();                      \
    return 0;                                                              \
  }

#define REGISTER_OPERATOR_CPU(op_type, op_class) \
  REGISTER_OPERATOR(op_type, op_class, cpu, paddle_mobile::CPU);

#define REGISTER_OPERATOR_MALI_GPU(op_type, op_class) \
  REGISTER_OPERATOR(op_type, op_class, mali_gpu, paddle_mobile::GPU_MALI);

#define REGISTER_OPERATOR_FPGA(op_type, op_class) \
  REGISTER_OPERATOR(op_type, op_class, fpga, paddle_mobile::FPGA);

#define USE_OP(op_type, device_name)                                          \
  extern int TouchOpRegistrar_##op_type##_##device_name();                    \
  static int use_op_itself_##op_type##_##device_name __attribute__((unused)) = \
      TouchOpRegistrar_##op_type##_##device_name()

#define USE_OP_CPU(op_type) USE_OP(op_type, cpu);
#define USE_OP_MALI_GPU(op_type) USE_OP(op_type, mali_gpu);
#define USE_OP_FPGA(op_type) USE_OP(op_type, fpga);

} // namespace framework
} // namespace paddle_mobile
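// Standalone sketch of the registration trick the macros above rely on:
// REGISTER_OPERATOR defines a Touch function next to a static registrar, and
// USE_OP references that function from another translation unit so the linker
// cannot drop the op's object file. The DEMO_* macros and the conv2d/cpu
// invocation below are illustrative, not the real framework API.
#include <cstdio>

#define DEMO_REGISTER_OPERATOR(op_type, device_name)              \
  int TouchOpRegistrar_##op_type##_##device_name() {              \
    std::printf("registered %s on %s\n", #op_type, #device_name); \
    return 0;                                                     \
  }

#define DEMO_USE_OP(op_type, device_name)                  \
  extern int TouchOpRegistrar_##op_type##_##device_name(); \
  static int use_op_itself_##op_type##_##device_name       \
      __attribute__((unused)) = TouchOpRegistrar_##op_type##_##device_name()

DEMO_REGISTER_OPERATOR(conv2d, cpu)
DEMO_USE_OP(conv2d, cpu);  // runs the Touch function during static init

int main() { return 0; }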
@@ -58,7 +58,8 @@ void OperatorBase<Dtype>::Run() const {
}

template class OperatorBase<CPU>;
template class OperatorBase<FPGA>;
template class OperatorBase<GPU_MALI>;

} // namespace framework
} // namespace paddle_mobile
@@ -16,7 +16,6 @@ limitations under the License. */

#include <map>
#include <string>
#include <vector>

#include "common/enforce.h"

@@ -27,7 +26,6 @@ limitations under the License. */
#include "framework/op_info.h"
#include "framework/op_kernel_type.h"
#include "framework/op_registry.h"
#include "framework/program/block_desc.h"
#include "framework/program/program-optimize/node.h"
#include "framework/scope.h"

@@ -52,7 +50,7 @@ static T *GetVarValue(const string &key, const VariableNameMap &var_map,
}

template <typename Dtype>
class OperatorBase {
 public:
  /*
   * @b Constructor of the op base class: the op receives its inputs,
   * attributes, and the pre-allocated output tensors.

@@ -65,6 +63,7 @@ class OperatorBase : PaddleMobileObject {
  std::vector<string> GetOutKeys() const;
  virtual void RunImpl() const = 0;
  virtual void Init() const = 0;
  /*
   * @b The inputs the op needs at run time, e.g. the previous layer's
   * outputs or the convolution kernels.
   * */

@@ -105,31 +104,55 @@ class OperatorBase : PaddleMobileObject {
/*
 * @b Parent class of every op that carries computation; it derives from
 * OperatorBase.
 * */
template <typename Dtype, typename ParamType, typename KernelType>
class OperatorWithKernel : public OperatorBase<Dtype> {
 public:
  OperatorWithKernel(const std::string &type, const VariableNameMap &inputs,
                     const VariableNameMap &outputs, const AttributeMap &attrs,
                     std::shared_ptr<Scope> scope)
      : OperatorBase<Dtype>(type, inputs, outputs, attrs, scope),
        param_(inputs, outputs, attrs, *scope) {}

  virtual void RunImpl() const { this->kernel_.Compute(this->param_); }

  virtual void InferShape() const = 0;

  void Init() const {
    PADDLE_MOBILE_ENFORCE(kernel_.Init(param_), " %s kernel init failed",
                          this->type_.c_str());
  }

 protected:
  KernelType kernel_;
  ParamType param_;
};
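// Standalone sketch of the new OperatorWithKernel shape above: the operator
// now owns param_ and kernel_, Init() forwards to kernel_.Init(param_), and
// RunImpl() forwards to kernel_.Compute(param_). The Demo* types are
// illustrative stand-ins, not the real framework classes.
#include <cstdlib>

struct DemoParam {
  int value = 0;
};

struct DemoKernel {
  bool Init(const DemoParam &) const { return true; }
  void Compute(const DemoParam &p) const {
    if (p.value != 42) std::abort();  // the "work" the kernel performs
  }
};

template <typename ParamType, typename KernelType>
class DemoOperatorWithKernel {
 public:
  explicit DemoOperatorWithKernel(ParamType param) : param_(param) {}
  void Init() const {
    if (!kernel_.Init(param_)) std::abort();  // mirrors PADDLE_MOBILE_ENFORCE
  }
  void RunImpl() const { kernel_.Compute(param_); }

 protected:
  KernelType kernel_;
  ParamType param_;
};

int main() {
  DemoOperatorWithKernel<DemoParam, DemoKernel> op(DemoParam{42});
  op.Init();
  op.RunImpl();
  return 0;
}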
/*
 * @b Parent class of all kernels.
 * */
template <typename Dtype, typename P>
class OpKernelBase {
 public:
  /*
   * @b Every kernel must implement the Compute method. @p para is a struct
   * bundling the parameters the kernel needs at run time; all of these
   * structs live in: paddle-mobile/src/operators/op_param.h
   * */
#ifdef PADDLE_MOBILE_MALI_GPU
  OpKernelBase() { acl_op_ = nullptr; }
  void *GetAclOp() const { return acl_op_; }
  void SetAclOp(void *op, void *ob) const {
    reinterpret_cast<OpKernelBase<Dtype, P> *>(ob)->acl_op_ = op;
  }
#endif
  virtual void Compute(const P &para) const = 0;
  virtual bool Init(const P &para) const { return true; };
  virtual ~OpKernelBase() = default;

 private:
#ifdef PADDLE_MOBILE_MALI_GPU
  void *acl_op_;
#endif
};

#define DEFINE_OP_CONSTRUCTOR(cls, parent_cls) \

@@ -139,20 +162,23 @@ class OpKernelBase : PaddleMobileObject {
                     std::shared_ptr<::paddle_mobile::framework::Scope> scope) \
      : parent_cls<Dtype, T>(type, inputs, outputs, attrs, scope) {}

class FusionOpMatcher {
 public:
  FusionOpMatcher() {}

  virtual std::string Type() = 0;

  virtual void FolderNodes(
      Node *node,
      std::vector<std::shared_ptr<framework::Node>> *removed_nodes) {
    node->Folder(node_.Depth(), Type(), {}, removed_nodes);
  }

  virtual Node &BeginNode() { return node_; }

  std::string BeginType() { return node_.Type(); }

 protected:
  Node node_;
  std::string type_;

...
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle_mobile_object.h"
@@ -13,17 +13,12 @@ See the License for the specific language governing permissions and
limitations under the License. */

#include "block_desc.h"

#include <algorithm>

namespace paddle_mobile {
namespace framework {

std::vector<std::shared_ptr<VarDesc>> BlockDesc::Vars() const { return vars_; }

std::vector<std::shared_ptr<OpDesc>> BlockDesc::Ops() const { return ops_; }

@@ -31,10 +26,14 @@ BlockDesc::BlockDesc(PaddleMobile__Framework__Proto__BlockDesc *desc)
    : index_(desc->idx), parent_index_(desc->idx) {
  for (int i = 0; i < desc->n_vars; ++i) {
    PaddleMobile__Framework__Proto__VarDesc *var_desc = desc->vars[i];
    vars_.emplace_back(std::shared_ptr<VarDesc>(new VarDesc(var_desc)));
  }
  std::sort(vars_.begin(), vars_.end(),
            [](std::shared_ptr<VarDesc> left, std::shared_ptr<VarDesc> right) {
              return left->Name() < right->Name();
            });

  for (int j = 0; j < desc->n_ops; ++j) {
    PaddleMobile__Framework__Proto__OpDesc *op_desc = desc->ops[j];
    ops_.emplace_back(new framework::OpDesc(op_desc));

...
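// Why vars_ changed from an unordered_map to a sorted vector (see the
// BlockDesc constructor above): map iteration order is unspecified, so var
// traversal could differ between runs; sorting by name makes it
// deterministic. A standalone sketch with a hypothetical stand-in VarDesc:
#include <algorithm>
#include <memory>
#include <string>
#include <vector>

struct DemoVarDesc {
  explicit DemoVarDesc(std::string name) : name_(std::move(name)) {}
  const std::string &Name() const { return name_; }
  std::string name_;
};

int main() {
  std::vector<std::shared_ptr<DemoVarDesc>> vars;
  vars.push_back(std::make_shared<DemoVarDesc>("fc_0.w_0"));
  vars.push_back(std::make_shared<DemoVarDesc>("batch_norm_0.scale"));
  std::sort(vars.begin(), vars.end(),
            [](const std::shared_ptr<DemoVarDesc> &l,
               const std::shared_ptr<DemoVarDesc> &r) {
              return l->Name() < r->Name();  // same comparator as above
            });
  return vars.front()->Name() == "batch_norm_0.scale" ? 0 : 1;
}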
...@@ -15,14 +15,13 @@ limitations under the License. */
#pragma once

#include "framework/framework.pb-c.h"
#include "framework/paddle_mobile_object.h"
#include "framework/program/op_desc.h"
#include "framework/program/var_desc.h"

namespace paddle_mobile {
namespace framework {

class BlockDesc {
 public:
  friend class Node;
  friend class ProgramOptimize;
...@@ -35,10 +34,9 @@ class BlockDesc : PaddleMobileObject {
      ops_.push_back(copy_op_desc);
    }

    for (int i = 0; i < block_desc.vars_.size(); ++i) {
      auto &var_desc = block_desc.vars_[i];
      vars_.emplace_back(std::make_shared<VarDesc>(*var_desc));
    }
  }
...@@ -64,7 +62,7 @@ class BlockDesc : PaddleMobileObject {
  bool multi_thread_;
  int parent_index_;
  std::vector<std::shared_ptr<OpDesc>> ops_;
  std::vector<std::shared_ptr<VarDesc>> vars_;
};

}  // namespace framework
......
...@@ -20,12 +20,11 @@ limitations under the License. */
#include "common/log.h"
#include "common/type_define.h"
#include "framework/framework.pb-c.h"
#include "framework/paddle_mobile_object.h"

namespace paddle_mobile {
namespace framework {

class OpDesc {
 public:
  friend class ProgramOptimize;
  friend class FusionOpMatcher;
......
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "fusion_op_register.h"
...@@ -12,10 +12,9 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <sstream>
#include "framework/operator.h"
#include "framework/program/program-optimize/node.h" #include "framework/program/program-optimize/node.h"
#include <algorithm>
#include "framework/operator.h"
namespace paddle_mobile {
...@@ -45,54 +44,13 @@ bool Node::operator==(const Node &in) {
  return true;
}

bool Node::CanSplit(std::unordered_set<std::string> complex_compute_set) {
bool split = false;
CanSplit(&split, false, 0, &complex_compute_set, this);
return split;
}
void Node::CanSplit(bool *split, bool spliting, int complex_count,
std::unordered_set<std::string> *complex_compute_set,
Node *pre_node) {
if (spliting) {
if (complex_compute_set->find(this->type_) != complex_compute_set->end()) {
complex_count++;
}
}
if (inputs_.size() > 1 && pre_node != inputs_.back()) {
return;
}
if (inputs_.size() > 1 && pre_node == inputs_.back()) {
if (complex_count > 1) {
*split = true;
return;
}
}
// multi output, to check
if (outputs_.size() > 1) {
spliting = true;
complex_compute_set = 0;
} else {
if (spliting == true && inputs_.size() > 0) {
spliting = false;
} else {
}
}
for (auto &output : outputs_) {
output->CanSplit(split, spliting, complex_count, complex_compute_set, this);
}
}
std::vector<std::shared_ptr<framework::OpDesc>> Node::OpDescs(int size) {
  std::vector<std::shared_ptr<framework::OpDesc>> op_descs;
  OpDescs(size - 1, &op_descs);
  return op_descs;
}

void Node::OpDescs(int index,
                   std::vector<std::shared_ptr<framework::OpDesc>> *op_desc) {
  if (index == 0) {
    return;
...@@ -103,107 +61,6 @@ void Node::OpDescs(uint index,
  }
}
void Node::OpDescs(std::vector<std::shared_ptr<framework::OpDesc>> *op_desc,
Node *node, bool adding_thread, int thread_num) {
if (outputs_.size() > 1) {
adding_thread = false;
}
bool can_add_split = false;
  // Supported only when the current node has multiple outputs and its op_desc_ has exactly one output key.
if (outputs_.size() > 1 &&
op_input_output_key[op_desc_->type_].second.size() == 1) {
can_add_split = true;
    // Walk the current node's output nodes.
for (const auto &output : outputs_) {
      // Not supported when an output has outputs of its own.
if (output->outputs_.size() > 0) {
can_add_split = false;
break;
}
      // The OpDesc associated with this node.
      std::shared_ptr<framework::OpDesc> &op_desc = output->op_desc_;
      // Fetch this op's input and output keys.
      auto inputs_and_outputs = op_input_output_key[op_desc->type_];
      // Check that this op type is registered
      // and that its input key list and output key list each have size 1.
if (op_input_output_key.find(op_desc->type_) !=
op_input_output_key.end() &&
inputs_and_outputs.first.size() == 1 &&
inputs_and_outputs.second.size() == 1) {
auto inputs_of_output = op_desc->Input(inputs_and_outputs.first[0]);
auto outputs_of_output = op_desc->Output(inputs_and_outputs.second[0]);
        // Supported as long as no input name collides with an output name.
for (int i = 0; i < inputs_of_output.size(); ++i) {
std::string input_of_output = inputs_of_output[i];
for (int j = 0; j < outputs_of_output.size(); ++j) {
std::string output_of_output = outputs_of_output[j];
if (input_of_output == output_of_output) {
DLOG << "output的 output 包含 input" << input_of_output;
can_add_split = false;
break;
}
}
}
      } else {  // If the model contains an unregistered op, adding split is not supported.
        DLOG << "cannot find this op type: " << output->op_desc_->type_;
can_add_split = false;
}
}
}
if (inputs_.size() > 1 && node != inputs_.back()) {
return;
} else if (inputs_.size() > 1 && node == inputs_.back()) {
adding_thread = false;
op_desc->push_back(this->op_desc_);
} else {
op_desc->push_back(this->op_desc_);
}
if (adding_thread) {
Attribute attr;
attr.Set<int>(thread_num);
this->op_desc_->attrs_["thread"] = attr;
}
if (can_add_split) {
adding_thread = true;
std::shared_ptr<OpDesc> split_op_desc = std::make_shared<OpDesc>();
split_op_desc->type_ = G_OP_TYPE_SPLIT;
auto outputs = this->op_desc_->Output(
op_input_output_key[this->op_desc_->Type()].second[0]);
split_op_desc->inputs_ = {
{op_input_output_key[G_OP_TYPE_SPLIT].first[0], outputs}};
auto &split_outputs =
split_op_desc->outputs_[op_input_output_key[G_OP_TYPE_SPLIT].second[0]];
for (const auto &output : outputs_) {
split_outputs.push_back(outputs[0]);
}
DLOG << "add split";
op_desc->push_back(split_op_desc);
}
for (int i = 0; i < outputs_.size(); ++i) {
auto &output = outputs_[i];
if (can_add_split) {
output->OpDescs(op_desc, this, adding_thread, i);
} else {
output->OpDescs(op_desc, this, adding_thread, thread_num);
}
}
}
std::vector<std::shared_ptr<framework::OpDesc>> Node::OpDescs() {
std::vector<std::shared_ptr<framework::OpDesc>> op_descs;
OpDescs(&op_descs, this, false, 0);
return op_descs;
}
std::shared_ptr<Node> Node::To(int size) {
  std::shared_ptr<Node> node = std::make_shared<Node>();
  this->To(size - 1, node);
...@@ -224,24 +81,25 @@ void Node::To(int index, std::shared_ptr<Node> node) {
  }
}

int Node::Depth(int begin) {
  int depth = 0;
  begin++;
  for (int i = 0; i < outputs_.size(); ++i) {
    int output_depth = outputs_[i]->Depth(begin);
    depth = output_depth > depth ? output_depth : depth;
  }
  return begin > depth ? begin : depth;
}

Node &Node::Folder(
    int size, std::string type,
    std::map<std::string, std::pair<std::string, std::string>> change,
    std::vector<std::shared_ptr<Node>> *removed_nodes) {
  std::shared_ptr<framework::OpDesc> op_desc =
      std::make_shared<framework::OpDesc>();
  op_desc->inputs_ = this->op_desc_->inputs_;
  std::vector<std::shared_ptr<Node>> outputs;
  this->Folder(op_desc, &outputs, size - 1, &change, this, removed_nodes);
  this->outputs_ = outputs;
  this->type_ = type;
  this->op_desc_ = op_desc;
...@@ -251,9 +109,9 @@ Node &Node::Folder(
void Node::Folder(
    std::shared_ptr<framework::OpDesc> op_desc,
    std::vector<std::shared_ptr<Node>> *outputs, int index,
    std::map<std::string, std::pair<std::string, std::string>> *change,
    Node *begin_node, std::vector<std::shared_ptr<Node>> *removed_nodes) {
  if (change->find(this->type_) != change->end()) {
    auto change_pair = (*change)[this->type_];
    op_desc->GetInputs()[change_pair.second] =
...@@ -266,7 +124,9 @@ void Node::Folder(
  if (index > 0) {
    --index;
    for (auto output : outputs_) {
      removed_nodes->push_back(output);
output->Folder(op_desc, outputs, index, change, begin_node,
removed_nodes);
    }
  } else {
    for (auto &op_output : this->op_desc_->outputs_) {
...@@ -285,7 +145,7 @@ void Node::Folder(
    }
  }
}
#ifdef PADDLE_MOBILE_DEBUG
std::string Node::ToString(std::string blank, const Node *node) const {
  std::stringstream ss;
  ss << type_ << "-> \n";
...@@ -316,6 +176,7 @@ Print &operator<<(Print &printer, const Node &node) {
  printer << node.ToString();
  return printer;
}
#endif
}  // namespace framework
}  // namespace paddle_mobile
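Depth() determines how many ops Folder collapses: it returns the length of the longest output chain, counting the node itself. A self-contained toy mirror of the recursion (ToyNode is illustrative, not the real Node):

#include <memory>
#include <vector>

struct ToyNode {
  std::vector<std::shared_ptr<ToyNode>> outputs;
  int Depth(int begin = 0) const {
    int depth = 0;
    begin++;
    for (const auto &out : outputs) {
      int d = out->Depth(begin);
      depth = d > depth ? d : depth;
    }
    return begin > depth ? begin : depth;
  }
};

int main() {
  auto conv = std::make_shared<ToyNode>();
  auto add = std::make_shared<ToyNode>();
  auto relu = std::make_shared<ToyNode>();
  conv->outputs = {add};
  add->outputs = {relu};
  return conv->Depth();  // 3: the chain conv -> add -> relu
}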
...@@ -14,20 +14,17 @@ limitations under the License. */
#pragma once

#include <cinttypes>
#include <map>
#include <string>
#include <unordered_set>
#include <utility>
#include <vector>
#include "common/log.h"
#include "framework/paddle_mobile_object.h"
#include "framework/program/op_desc.h"

namespace paddle_mobile {
namespace framework {

class Node {
  friend class ProgramOptimize;

 public:
...@@ -37,35 +34,34 @@ class Node : PaddleMobileObject {
      : op_desc_(op_desc), type_(op_desc->Type()) {}
  Node &operator>(std::shared_ptr<Node> node);
  bool operator==(const Node &in);
bool CanSplit(std::unordered_set<std::string> complex_compute_set);
#ifdef PADDLE_MOBILE_DEBUG
  std::string ToString() const;
void Description();
#endif
  std::shared_ptr<Node> To(int size);
  int Depth(int begin = 0);
  Node &Folder(
      int size, std::string type,
      std::map<std::string, std::pair<std::string, std::string>> change_map,
      std::vector<std::shared_ptr<Node>> *removed_nodes);
  std::vector<std::shared_ptr<framework::OpDesc>> OpDescs(int size);
  std::shared_ptr<framework::OpDesc> OpDescOfNode() { return op_desc_; }
  std::string Type() { return type_; }
void Description();
 private:
  void CanSplit(bool *split, bool spliting, int complex_count,
                std::unordered_set<std::string> *complex_compute_set,
                Node *pre_node);
  void OpDescs(std::vector<std::shared_ptr<framework::OpDesc>> *op_desc,
               Node *node, bool adding_thread, int thread_num);
  void OpDescs(int size,
               std::vector<std::shared_ptr<framework::OpDesc>> *op_desc);
  void To(int index, std::shared_ptr<Node>);
  void Folder(
      std::shared_ptr<framework::OpDesc> op_desc,
      std::vector<std::shared_ptr<Node>> *outputs, int index,
      std::map<std::string, std::pair<std::string, std::string>> *change,
      Node *begin_node, std::vector<std::shared_ptr<Node>> *removed_nodes);
  std::shared_ptr<framework::OpDesc> op_desc_;
#ifdef PADDLE_MOBILE_DEBUG
  std::string ToString(std::string blank, const Node *node) const;
#endif
  std::vector<std::shared_ptr<Node>> outputs_;
  std::vector<Node *> inputs_;
  std::string type_;
......
...@@ -13,13 +13,14 @@ See the License for the specific language governing permissions and
limitations under the License. */

#include "framework/program/program-optimize/program_optimize.h"
#include <algorithm>
#include "framework/program/program-optimize/fusion_op_register.h" #include "framework/program/program-optimize/fusion_op_register.h"
namespace paddle_mobile { namespace paddle_mobile {
namespace framework { namespace framework {
std::shared_ptr<ProgramDesc> ProgramOptimize::FushionOptimize( std::shared_ptr<ProgramDesc> ProgramOptimize::FusionOptimize(
std::shared_ptr<ProgramDesc> ori_des, bool add_split) { std::shared_ptr<ProgramDesc> ori_des, bool add_split) {
// ProgramDesc *optimize_program = new ProgramDesc(*ori_des); // ProgramDesc *optimize_program = new ProgramDesc(*ori_des);
std::shared_ptr<ProgramDesc> optimize_program = std::shared_ptr<ProgramDesc> optimize_program =
...@@ -31,6 +32,8 @@ std::shared_ptr<ProgramDesc> ProgramOptimize::FushionOptimize(
    std::unordered_map<std::string, std::vector<std::shared_ptr<Node>>>
        type_map;
std::vector<std::shared_ptr<Node>> nodes;
    std::shared_ptr<Node> begin_node;
    auto block = optimize_program->Block(i);
    // DLOG << " ops size: " << block->Ops().size();
...@@ -38,11 +41,13 @@ std::shared_ptr<ProgramDesc> ProgramOptimize::FushionOptimize(
      auto op = block->Ops()[j];
      auto op_type = op->Type();
      if (op_input_output_key.find(op->Type()) == op_input_output_key.end()) {
LOG(kLOG_ERROR) << "return null "; LOG(kLOG_ERROR) << "has not support op return null "
<< " op type: " << op->Type();
        return nullptr;
      }
      std::shared_ptr<Node> node = std::make_shared<Node>(op);
nodes.push_back(node);
      //
      type_map[op->Type()].push_back(node);
...@@ -87,21 +92,29 @@ std::shared_ptr<ProgramDesc> ProgramOptimize::FushionOptimize(
        // DLOG << " match success " << " fusion node: \n" <<
        // matcher->BeginNode() << "\nsub node: \n" << *sub_node;
        // DLOG << "match node\n"<< *match_node;
matcher->FolderNodes(match_node.get());
// DLOG << " after match node\n"<< *match_node;
// match_node->Description();
// DLOG << "begin node: \n" << *begin_node; std::vector<std::shared_ptr<Node>> removed_nodes;
matcher->FolderNodes(match_node.get(), &removed_nodes);
for (int j = 0; j < removed_nodes.size(); ++j) {
auto removed_node = removed_nodes[j];
auto removed_ite =
std::find(nodes.begin(), nodes.end(), removed_node);
nodes.erase(removed_ite);
}
        }
      }
    }
// DLOG << "node: \n" << *begin_node;
    std::vector<std::shared_ptr<framework::OpDesc>> op_descs;
    if (add_split) {
      GenerateOps(&op_descs, begin_node.get(), add_split);
    } else {
      for (int m = 0; m < nodes.size(); ++m) {
        auto &node = nodes[m];
        op_descs.push_back(node->op_desc_);
      }
    }
    block->ops_ = op_descs;
  }
...@@ -118,6 +131,14 @@ void ProgramOptimize::GenerateOps(
    Node *current_node) {
  if (current_node->inputs_.size() > 1 &&
      input_node != current_node->inputs_.back()) {
DLOG << " current type " << current_node->type_;
DLOG << " inputs size of current node > 0 ";
for (int i = 0; i < current_node->inputs_.size(); ++i) {
DLOG << " input i: " << current_node->inputs_[i]->type_;
}
    return;
  } else if (current_node->inputs_.size() > 1 &&
             input_node == current_node->inputs_.back()) {
...@@ -250,12 +271,12 @@ void ProgramOptimize::GenerateOps(
}

void ProgramOptimize::GenerateOps(
    std::vector<std::shared_ptr<framework::OpDesc>> *op_descs, Node *begin_node,
    bool can_add_split) {
  // std::vector<std::shared_ptr<framework::OpDesc>> *op_desc,
  // Node *input_node, Node *current_node, bool adding_thread, int
  // thread_num
  if (can_add_split) {
    this->GenerateOps(op_descs, begin_node, begin_node, false, -1, nullptr);
  } else {
    this->GenerateOps(op_descs, begin_node, begin_node);
......
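A hedged sketch of the call site for the renamed entry point; the variable names are placeholders, and the nullptr branch mirrors the unsupported-op check above:

framework::ProgramOptimize optimizer;
std::shared_ptr<framework::ProgramDesc> optimized =
    optimizer.FusionOptimize(origin_program_desc, /*add_split=*/false);
if (optimized == nullptr) {
  // Any op missing from op_input_output_key aborts the optimization.
}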
...@@ -27,14 +27,14 @@ namespace framework {
class ProgramOptimize {
 public:
  ProgramOptimize() {}
  std::shared_ptr<ProgramDesc> FusionOptimize(
      std::shared_ptr<ProgramDesc> ori_des, bool add_split = false);

 private:
  int current_block_;
  std::vector<std::shared_ptr<BlockDesc>> new_blocks_;

  void GenerateOps(std::vector<std::shared_ptr<framework::OpDesc>> *op_descs,
                   Node *begin_node, bool can_add_split);
  void GenerateOps(std::vector<std::shared_ptr<framework::OpDesc>> *op_desc,
                   Node *input_node, Node *current_node);
  void GenerateOps(std::vector<std::shared_ptr<framework::OpDesc>> *op_desc,
......
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
namespace paddle_mobile {
namespace framework {}
} // namespace paddle_mobile
...@@ -15,7 +15,6 @@ limitations under the License. */
#pragma once

#include "common/types.h"
#include "framework/paddle_mobile_object.h"
#include "framework/program/program_desc.h"
#include "framework/scope.h"
...@@ -23,12 +22,14 @@ namespace paddle_mobile {
namespace framework {

template <typename Dtype, Precision P = Precision::FP32>
class Program {
 public:
  std::shared_ptr<ProgramDesc> originProgram;
  std::shared_ptr<ProgramDesc> optimizeProgram;
  std::shared_ptr<Scope> scope;
  std::string model_path;
std::string para_path;
  bool is_combine = false;
 private:
};
......
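The two new fields back the combined model format, where all parameters sit in one file beside the model. A hedged usage sketch; the paths are placeholders and the Loader overloads are the ones defined in io.cpp below:

paddle_mobile::Loader<paddle_mobile::CPU> loader;
// Separate-files format: a __model__ file plus one file per parameter.
auto program = loader.Load("/sdcard/mobilenet", /*optimize=*/true);
// Combined format: one model file and one parameter file; Load fills in
// program.para_path and sets program.is_combine = true.
auto combined =
    loader.Load("/sdcard/model", "/sdcard/params", /*optimize=*/true);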
...@@ -18,13 +18,12 @@ limitations under the License. */
#include "common/types.h"
#include "framework/framework.pb-c.h"
#include "framework/paddle_mobile_object.h"
#include "framework/program/block_desc.h" #include "framework/program/block_desc.h"
namespace paddle_mobile { namespace paddle_mobile {
namespace framework { namespace framework {
class ProgramDesc : PaddleMobileObject { class ProgramDesc {
public: public:
friend class Node; friend class Node;
friend class ProgramOptimize; friend class ProgramOptimize;
......
...@@ -14,40 +14,14 @@ limitations under the License. */
#pragma once

#include <string>
#include "framework/framework.pb-c.h"
#include "framework/paddle_mobile_object.h"
#include "framework/program/tensor_desc.h"

namespace paddle_mobile {
namespace framework {
/*
PADDLE_MOBILE__FRAMEWORK__PROTO__VAR_TYPE__TYPE__BOOL = 0,
PADDLE_MOBILE__FRAMEWORK__PROTO__VAR_TYPE__TYPE__INT16 = 1,
PADDLE_MOBILE__FRAMEWORK__PROTO__VAR_TYPE__TYPE__INT32 = 2,
PADDLE_MOBILE__FRAMEWORK__PROTO__VAR_TYPE__TYPE__INT64 = 3,
PADDLE_MOBILE__FRAMEWORK__PROTO__VAR_TYPE__TYPE__FP16 = 4,
PADDLE_MOBILE__FRAMEWORK__PROTO__VAR_TYPE__TYPE__FP32 = 5,
PADDLE_MOBILE__FRAMEWORK__PROTO__VAR_TYPE__TYPE__FP64 = 6,
PADDLE_MOBILE__FRAMEWORK__PROTO__VAR_TYPE__TYPE__LOD_TENSOR = 7,
PADDLE_MOBILE__FRAMEWORK__PROTO__VAR_TYPE__TYPE__SELECTED_ROWS = 8,
PADDLE_MOBILE__FRAMEWORK__PROTO__VAR_TYPE__TYPE__FEED_MINIBATCH = 9,
PADDLE_MOBILE__FRAMEWORK__PROTO__VAR_TYPE__TYPE__FETCH_LIST = 10,
PADDLE_MOBILE__FRAMEWORK__PROTO__VAR_TYPE__TYPE__STEP_SCOPES = 11,
PADDLE_MOBILE__FRAMEWORK__PROTO__VAR_TYPE__TYPE__LOD_RANK_TABLE = 12,
PADDLE_MOBILE__FRAMEWORK__PROTO__VAR_TYPE__TYPE__LOD_TENSOR_ARRAY = 13,
PADDLE_MOBILE__FRAMEWORK__PROTO__VAR_TYPE__TYPE__PLACE_LIST = 14,
PADDLE_MOBILE__FRAMEWORK__PROTO__VAR_TYPE__TYPE__READER = 15,
PADDLE_MOBILE__FRAMEWORK__PROTO__VAR_TYPE__TYPE__CHANNEL = 16,
PADDLE_MOBILE__FRAMEWORK__PROTO__VAR_TYPE__TYPE__RAW = 17,
PADDLE_MOBILE__FRAMEWORK__PROTO__VAR_TYPE__TYPE__TUPLE = 18
*/
class VarDesc {
 public:
  VarDesc(const VarDesc &var_desc) {
...@@ -56,14 +30,6 @@ class VarDesc {
    this->persistable_ = var_desc.persistable_;
    this->tensor_desc_ = var_desc.tensor_desc_;
    this->type_ = var_desc.type_;
/*
*
* std::string name_;
bool persistable_;
TensorDesc tensor_desc_;
VarType_Type type_;
VarType_Type data_type_;
* */
  }
  VarDesc(PaddleMobile__Framework__Proto__VarDesc *desc) {
    type_ = (VarType_Type)desc->type->type;
...@@ -102,39 +68,6 @@ class VarDesc {
  const TensorDesc &Tensor_desc() const { return tensor_desc_; }
// const proto::VarType::ChannelDesc &channel_desc() const {
// switch (desc_.type().type()) {
// case proto::VarType::CHANNEL:
// return desc_.type().channel();
// default:
// break;
// }
// }
// proto::VarType::Type GetDataType() const {
// switch (desc_.type().type()) {
// case proto::VarType::CHANNEL:
// return channel_desc().data_type();
// break;
// default:
// return tensor_desc().data_type();
// }
// }
// template <typename T>
// std::vector<T> RepeatedToVector(
// const google::protobuf::RepeatedField<T> &repeated_field) const {
// std::vector<T> ret;
// ret.reserve(repeated_field.size());
// std::copy(repeated_field.begin(), repeated_field.end(),
// std::back_inserter(ret));
// return ret;
// }
// std::vector<int64_t> GetShape() const {
// return this->RepeatedToVector(tensor_desc().dims());
// }
 private:
  std::string name_;
  bool persistable_;
......
...@@ -14,6 +14,7 @@ limitations under the License. */
#include "framework/scope.h"
#include <algorithm>
#include <set>
#include <string>
#include <vector>
...@@ -22,7 +23,6 @@ namespace paddle_mobile {
namespace framework {

Scope &Scope::NewScope() const {
std::unique_lock<std::mutex> lock(mutex_);
  kids_.push_back(new Scope(this));
  return *kids_.back();
}
...@@ -72,11 +72,9 @@ std::vector<std::string> Scope::LocalVarNames() const {
}

void Scope::DeleteScope(Scope *scope) const {
std::unique_lock<std::mutex> lock(mutex_);
  auto it = std::find(kids_.begin(), kids_.end(), scope);
  kids_.erase(it);
  delete scope;
// deferent
}

void Scope::EraseVars(const std::vector<std::string> &var_names) {
...@@ -104,14 +102,6 @@ void Scope::Rename(const std::string &origin_name,
  vars_[new_name] = origin_it->second;
  vars_.erase(origin_it);
}
//
// std::string Scope::Rename(const std::string& origin_name)
// const {
// auto var_name = string::Sprintf("%p.%d", this,
// vars_.size());
// Rename(origin_name, var_name);
// return var_name;
// }
Variable *Scope::FindVarLocally(const std::string &name) const {
  auto it = vars_.find(name);
......
...@@ -14,17 +14,16 @@ limitations under the License. */
#pragma once

#include <list>
#include <unordered_map>
#include "variable.h" #include "variable.h"
namespace paddle_mobile { namespace paddle_mobile {
namespace framework { namespace framework {
class Scope { class Scope {
public: public:
Scope() {} Scope() = default;
~Scope() {} ~Scope() = default;
Scope &NewScope() const; Scope &NewScope() const;
...@@ -70,8 +69,6 @@ class Scope {
  mutable std::unordered_map<std::string, Variable *> vars_;
  mutable std::list<Scope *> kids_;
  Scope const *parent_{nullptr};
mutable std::mutex mutex_;
};
}  // namespace framework
}  // namespace paddle_mobile
...@@ -14,14 +14,15 @@ limitations under the License. */
#pragma once
#include <common/enforce.h>
#include <cstdint>
#include <cstring>
#include <memory>
#include <type_traits>
#include <typeindex>
#include <vector>
#include "common/enforce.h"
#include "common/enforce.h"
#include "framework/data_layout.h" #include "framework/data_layout.h"
#include "framework/ddim.h" #include "framework/ddim.h"
#include "memory/t_malloc.h" #include "memory/t_malloc.h"
...@@ -84,6 +85,12 @@ class Tensor {
    }
  }
Tensor(const Tensor &inTensor) {
this->dims_ = inTensor.dims_;
this->holder_ = inTensor.holder_;
this->offset_ = inTensor.offset_;
}
  /*! Return a pointer to mutable memory block. */
  template <typename T>
  inline T *data() {
...@@ -130,7 +137,6 @@ class Tensor {
    }
    PADDLE_MOBILE_ENFORCE(numel() >= 0, "the Tensor's numel must be >= 0.")
    int64_t size = numel() * SizeOfType(type);
/* some versions of boost::variant don't have operator!= */
    if (holder_ == nullptr || holder_->size() < size + offset_) {
      holder_.reset(new PlaceholderImpl(size, type));
      offset_ = 0;
...@@ -169,7 +175,9 @@ class Tensor {
  /*! The internal of two tensors share the same memory block. */
  inline Tensor &ShareDataWith(const Tensor &src) {
    src.check_memory_size();
    if (holder_.get() != src.holder_.get()) {
*this = src;
}
    return *this;
  }
...@@ -198,7 +206,6 @@ class Tensor {
    size_t base = numel() / dims_[0];
    Tensor dst;
    dst.holder_ = holder_;
dst.set_layout(layout_);
    DDim dst_dims = dims_;
    dst_dims[0] = end_idx - begin_idx;
    dst.Resize(dst_dims);
...@@ -227,10 +234,6 @@ class Tensor {
                          "Tensor's dims_ is out of bound. ");
  }
inline DataLayout layout() const { return layout_; }
inline void set_layout(const DataLayout layout) { layout_ = layout; }
 private:
  /**
   * @note Placeholder hides type T, so it doesn't appear as a
...@@ -288,21 +291,6 @@ class Tensor {
  DDim dims_;
/**
* @brief the layout of memory block, default is NHWC.
*
* @note the memory allocation order, describe how weight/data is
* stored
* For example, in 4-D Tensor(rank=4), there are three
* commonly
* used layout. They are
* NCHW, NHWC, CHWN.
* N,C,H,W for respectively the batch size, the number of
* feature maps, the height, the width.
*/
DataLayout layout_ = DataLayout::kNHWC;
  /**
   * @brief A PlaceHolder may be shared by more than one tensor.
   *
......
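The copy constructor and the self-share guard in ShareDataWith both preserve the invariant that aliasing tensors hold one shared, reference-counted holder_. A hedged sketch of the aliasing behavior, assuming the Tensor API above:

paddle_mobile::framework::Tensor a;
a.Resize(paddle_mobile::framework::make_ddim({1, 3, 224, 224}));
a.mutable_data<float>();                  // allocate a's holder_

paddle_mobile::framework::Tensor b;
b.ShareDataWith(a);                       // b aliases a's holder_; no copy
b.ShareDataWith(b);                       // self-share: the guard makes it a no-op

paddle_mobile::framework::Tensor c(a);    // copy ctor shares dims_, holder_, offset_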
...@@ -13,137 +13,18 @@ See the License for the specific language governing permissions and
limitations under the License. */

#include "tensor_util.h"
#include <algorithm>
#include <limits>
#include <vector>
namespace paddle_mobile {
namespace framework {

void TensorCopy(const Tensor &src, Tensor *dst) {
// VLOG(3) << "TensorCopy " << src.dims() << " from " <<
// src.place() << " to
// "
// << dst_place;
src.check_memory_size();
dst->Resize(src.dims());
dst->set_layout(src.layout());
auto src_ptr = src.data<void>();
auto dst_ptr = dst->mutable_data(src.type());
auto size = src.numel() * SizeOfType(src.type());
memory::Copy(dst_ptr, src_ptr, size);
}
void TensorCopySync(const Tensor &src, Tensor *dst) {
// VLOG(3) << "TensorCopySync " << src.dims() << " from " <<
// src.place()
// << " to " << dst_place;
  src.check_memory_size();
  dst->Resize(src.dims());
dst->set_layout(src.layout());
  auto src_ptr = src.data<void>();
  auto dst_ptr = dst->mutable_data(src.type());
  auto size = src.numel() * SizeOfType(src.type());
  memory::Copy(dst_ptr, src_ptr, size);
}
template <typename Predicate>
struct AnyDTypeVisitor {
Predicate predicate_;
const Tensor &tensor_;
Tensor *out_;
AnyDTypeVisitor(Predicate predicate, const Tensor &tensor, Tensor *out)
: predicate_(predicate), tensor_(tensor), out_(out) {}
template <typename T>
void operator()() const {
// auto t = EigenVector<T>::Flatten(tensor_);
// auto o = EigenScalar<bool>::From(*out_);
// return any of predicate_(t) is true.
// o.device(*ctx_.eigen_device()) = predicate_(t).any();
}
};
template <typename Predicate>
inline void AnyImpl(Predicate predicate, const Tensor &tensor,
framework::Tensor *out) {
VisitDataType(ToDataType(tensor.type()),
AnyDTypeVisitor<Predicate>(predicate, tensor, out));
}
template <typename Predicate>
struct AnyVisitor {
const framework::Tensor &tensor_;
Predicate predicate_;
AnyVisitor(const framework::Tensor &tensor, Predicate predicate)
: tensor_(tensor), predicate_(std::move(predicate)) {}
bool operator()(void) const {
framework::Tensor out;
out.Resize({1});
out.mutable_data<bool>();
AnyImpl(predicate_, tensor_, &out);
return this->GetResult(out);
}
bool GetResult(const framework::Tensor &out) const {
return *out.data<bool>();
}
};
template <typename Predicate>
inline bool Any(const framework::Tensor &tensor, Predicate predicate) {
AnyVisitor<Predicate> visitor(tensor, predicate);
// return platform::VisitPlace(visitor);
return visitor();
}
struct ContainsNANPredicate {
template <typename T>
auto operator()(const T &eigen_vec) const
-> decltype(std::declval<T>().isnan()) {
// Cast eigen_vector to vector of bool. true if is inf.
return eigen_vec.isnan();
}
};
bool TensorContainsNAN(const framework::Tensor &tensor) {
ContainsNANPredicate predicate;
return Any(tensor, predicate);
}
struct ContainsInfPredicate {
template <typename T>
auto operator()(const T &eigen_vec) const
-> decltype(std::declval<T>().isinf()) {
// Cast eigen_vector to vector of bool. true if is inf.
return eigen_vec.isinf();
}
};
bool TensorContainsInf(const framework::Tensor &tensor) {
ContainsInfPredicate predicate;
return Any(tensor, predicate);
}
struct DeserializedDataFunctor {
DeserializedDataFunctor(void **buf, Tensor *tensor)
: buf_(buf), tensor_(tensor) {}
template <typename T>
void operator()() {
*buf_ = tensor_->mutable_data<T>();
}
void **buf_;
Tensor *tensor_;
};
}  // namespace framework
}  // namespace paddle_mobile
...@@ -15,51 +15,12 @@ limitations under the License. */
#pragma once

#include <vector>
#include "memory/t_malloc.h"
#include "platform/data_type.h"
#include "tensor.h" #include "tensor.h"
namespace paddle_mobile { namespace paddle_mobile {
namespace framework { namespace framework {
void TensorCopy(const Tensor &src, Tensor *dst); void TensorCopy(const Tensor &src, Tensor *dst);
void TensorCopySync(const Tensor &src, Tensor *dst);
template <typename T>
void TensorFromVector(const std::vector<T> &src, Tensor *dst);
template <typename T>
void TesnorToVector(const Tensor &src, std::vector<T> *dst);
bool TensorContainsNAN(const framework::Tensor &tensor);
bool TensorContainsInf(const framework::Tensor &tensor);
void TensorToStream(std::ostream &os, const Tensor &tensor);
void TensorFromStream(std::istream &is, Tensor *tensor);
//
// The implementation of template functions.
//
template <typename T>
void TensorFromVector(const std::vector<T> &src, Tensor *dst) {
auto src_ptr = static_cast<const void *>(src.data());
dst->Resize({static_cast<int64_t>(src.size())});
auto dst_ptr = static_cast<void *>(dst->mutable_data<T>());
auto size = src.size() * sizeof(T);
memory::Copy(dst_ptr, src_ptr, size);
}
template <typename T>
void TensorToVector(const Tensor &src, std::vector<T> *dst) {
auto src_ptr = static_cast<const void *>(src.data<T>());
auto size = src.numel() * sizeof(T);
dst->resize(src.numel());
auto dst_ptr = static_cast<void *>(dst->data());
memory::Copy(dst_ptr, src_ptr, size);
}
}  // namespace framework
}  // namespace paddle_mobile
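With the NaN/Inf and vector helpers gone, the header exposes only the synchronous deep copy. A hedged usage sketch:

paddle_mobile::framework::Tensor src, dst;
src.Resize(paddle_mobile::framework::make_ddim({2, 3}));
float *p = src.mutable_data<float>();
for (int i = 0; i < src.numel(); ++i) p[i] = static_cast<float>(i);

// Resizes dst and copies numel() * SizeOfType(type) bytes into dst's own holder_.
paddle_mobile::framework::TensorCopy(src, &dst);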
...@@ -14,19 +14,17 @@ limitations under the License. */
#pragma once
#include <iostream>
#include <memory>
#include <string>
#include <typeindex>
#include <typeinfo>
#include "../common/variant.h"
#include "paddle_mobile_object.h"
namespace paddle_mobile {
namespace framework {
using std::string;

class Variable {
 public:
  template <typename T>
  const T *Get() const {
......
...@@ -12,12 +12,11 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "io.h" #include "io/io.h"
#include <fstream> #include <algorithm>
#include <vector> #include <vector>
#include "common/log.h"
#include "common/enforce.h" #include "common/enforce.h"
#include "common/log.h"
#include "framework/framework.pb-c.h" #include "framework/framework.pb-c.h"
#include "framework/lod_tensor.h" #include "framework/lod_tensor.h"
#include "framework/operator.h" #include "framework/operator.h"
...@@ -26,20 +25,29 @@ limitations under the License. */
#include "framework/program/var_desc.h"
#include "framework/scope.h"
#include "framework/tensor.h"
#ifdef PADDLE_EXECUTOR_MULTITHREAD
#include <queue>
#include <utility>
#include "common/threadpool.h"
#endif
namespace paddle_mobile {
using framework::Variable;
char *Get_binary_data(std::string filename) {
  FILE *file = fopen(filename.c_str(), "rb");
  PADDLE_MOBILE_ENFORCE(file != nullptr, "can't open file: %s ",
                        filename.c_str());
  fseek(file, 0, SEEK_END);
  long size = ftell(file);
  PADDLE_MOBILE_ENFORCE(size > 0, "size is too small");
  rewind(file);
  char *data = new char[size];
  size_t bytes_read = fread(data, 1, size, file);
  PADDLE_MOBILE_ENFORCE(bytes_read == size,
                        "read binary file bytes do not match with fseek");
  fclose(file);
  return data;
}
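Get_binary_data hands the caller ownership of a buffer allocated with new[], so it must be released with delete[] (the call sites below are adjusted accordingly). A hedged sketch of the intended usage; the path is a placeholder:

char *origin_data = paddle_mobile::Get_binary_data("/sdcard/params");
char *data = origin_data;   // parsing advances this cursor, not the owner pointer
// ... e.g. LoadMemory(var_desc, tensor, data) consumes bytes and moves data ...
delete[] origin_data;       // pair new[] with delete[]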
static size_t ReadBuffer(const char *file_name, uint8_t **out) {
...@@ -66,110 +74,28 @@ static size_t ReadBuffer(const char *file_name, uint8_t **out) {
}
template <typename Dtype, Precision P>
void Loader<Dtype, P>::LoadVar(framework::Variable *variable,
                               const framework::VarDesc &var_desc,
                               const std::string &file_path) {
  auto tensor = variable->GetMutable<framework::LoDTensor>();
  std::ifstream is(file_path);
  PADDLE_MOBILE_ENFORCE(is.is_open(), "open file: %s failed",
                        file_path.c_str());
std::fpos<mbstate_t> pos;
pos = is.tellg(); // save current position
is.seekg(0, std::ios::end);
is.seekg(pos); // restore saved position
// 1. version
uint32_t version;
is.read(reinterpret_cast<char *>(&version), sizeof(version));
// 2 Lod information
uint64_t lod_level;
is.read(reinterpret_cast<char *>(&lod_level), sizeof(lod_level));
auto &lod = *tensor->mutable_lod();
lod.resize(lod_level);
for (uint64_t i = 0; i < lod_level; ++i) {
uint64_t size;
is.read(reinterpret_cast<char *>(&size), sizeof(size));
std::vector<size_t> tmp(size / sizeof(size_t));
is.read(reinterpret_cast<char *>(tmp.data()),
static_cast<std::streamsize>(size));
for (auto j : tmp) {
LOG(kLOG_DEBUG1) << " lod - " << j;
}
lod[i] = tmp;
}
// 3. tensor version
uint32_t tensor_version;
is.read(reinterpret_cast<char *>(&tensor_version), sizeof(tensor_version));
// 4. tensor desc
int32_t size;
is.read(reinterpret_cast<char *>(&size), sizeof(size));
std::unique_ptr<char[]> buf(new char[size]);
is.read(reinterpret_cast<char *>(buf.get()), size);
const framework::TensorDesc &desc = var_desc.Tensor_desc();
PaddleMobile__Framework__Proto__VarType__TensorDesc *tensor_desc = NULL;
// void *v;
// PaddleMobile__Framework__Proto__VarType__TensorDesc_Closure()(tensor_desc,
// buf.get());
// DLOG << "PaddleMobile__Framework__Proto__VarType__TensorDesc_Closure- " <<
// tensor_desc;
// framework::TensorDesc &tensor_desc = variable->
// PaddleMobile__Framework__Proto__ProgramDesc *c_program;
// uint8_t *proto_buf = NULL;
// size_t read_size = ReadBuffer(file_path.c_str(), &proto_buf);
// c_program = paddle_mobile__framework__proto__program_desc__unpack(NULL,
// read_size, buf);
// paddle_mobile__framework__proto__var_type__tensor_desc__init()
int memory_size = 1;
for (auto l : desc.Dims()) {
memory_size *= l;
}
tensor->Resize(framework::make_ddim(desc.Dims()));
void *memory = tensor;
int type_size = 0;
switch (desc.DataType()) {
case framework::VARTYPE_TYPE_FP16:
type_size = 2;
break;
case framework::VARTYPE_TYPE_FP32:
type_size = 4;
memory = tensor->mutable_data<float>();
break;
case framework::VARTYPE_TYPE_FP64:
type_size = 8;
break;
case framework::VARTYPE_TYPE_INT32:
type_size = 4;
break;
case framework::VARTYPE_TYPE_INT64:
type_size = 8;
break;
case framework::VARTYPE_TYPE_BOOL:
type_size = 1;
break;
default:
break;
}
is.read(static_cast<char *>(memory), memory_size * type_size);
is.close();
}

template <typename Dtype, Precision P>
const framework::Program<Dtype, P> Loader<Dtype, P>::Load(
    const std::string &dirname, bool optimize, bool can_add_split) {
  auto program =
      this->LoadProgram(dirname + "/__model__", optimize, can_add_split);
  program.model_path = dirname;
  return program;
}
template <typename Dtype, Precision P>
const framework::Program<Dtype, P> Loader<Dtype, P>::Load(
    const std::string &model_path, const std::string &para_path,
    bool optimize) {
  auto program = this->LoadProgram(model_path, optimize);
  program.para_path = para_path;
  program.is_combine = true;
  return program;
}
template <typename Dtype, Precision P>
const framework::Program<Dtype, P> Loader<Dtype, P>::LoadProgram(
const std::string &model_path, bool optimize, bool can_add_split) {
std::string model_filename = model_path;
  PaddleMobile__Framework__Proto__ProgramDesc *c_program;
  uint8_t *buf = NULL;
  size_t read_size = ReadBuffer(model_filename.c_str(), &buf);
...@@ -183,22 +109,16 @@ const framework::Program<Dtype, P> Loader<Dtype, P>::Load(
  //
  DLOG << "n_ops: " << (*c_program->blocks)->n_ops;
  //
  auto originProgramDesc = std::make_shared<framework::ProgramDesc>(c_program);

  framework::Program<Dtype, P> program;
  program.originProgram = originProgramDesc;

  auto scope = std::make_shared<framework::Scope>();
  program.scope = scope;

  for (const auto &block : originProgramDesc->Blocks()) {
    for (auto var_desc : block->Vars()) {
      auto var = scope->Var(var_desc->Name());
      if (var_desc->Type() == framework::VARTYPE_TYPE_LOD_TENSOR) {
...@@ -224,7 +144,7 @@ const framework::Program<Dtype, P> Loader<Dtype, P>::Load(
  if (optimize) {
    framework::ProgramOptimize program_optimize;
    program.optimizeProgram =
        program_optimize.FusionOptimize(originProgramDesc, can_add_split);
  }
  if (optimize) {
    program.optimizeProgram->Description("optimize: ");
...@@ -237,9 +157,10 @@ const framework::Program<Dtype, P> Loader<Dtype, P>::Load(
}

template class Loader<CPU, Precision::FP32>;
template class Loader<FPGA, Precision::FP32>;
template class Loader<GPU_MALI, Precision::FP32>;
#pragma mark - executor

template <typename Dtype, Precision P>
Executor<Dtype, P>::Executor(const framework::Program<Dtype> p, int batch_size,
                             bool use_optimize)
...@@ -253,6 +174,9 @@ Executor<Dtype, P>::Executor(const framework::Program<Dtype> p, int batch_size,
  variable_ptr[0].SetValue<int>(batch_size);
  const std::vector<std::shared_ptr<framework::BlockDesc>> blocks =
      to_predict_program_->Blocks();
#ifdef PADDLE_EXECUTOR_MULTITHREAD
depManager.resize(blocks.size());
#endif
  for (int i = 0; i < blocks.size(); ++i) {
    std::shared_ptr<framework::BlockDesc> block_desc = blocks[i];
    std::vector<std::shared_ptr<framework::OpDesc>> ops = block_desc->Ops();
...@@ -263,40 +187,54 @@ Executor<Dtype, P>::Executor(const framework::Program<Dtype> p, int batch_size,
          op->Type(), op->GetInputs(), op->GetOutputs(), op->GetAttrMap(),
          program_.scope);
      op_base->InferShape();
      ops_of_block_[*block_desc.get()].push_back(op_base);
#ifdef PADDLE_EXECUTOR_MULTITHREAD
depManager[i].analysisDep(ops_of_block_[*block_desc.get()]);
#endif
    }
  }
  if (program_.is_combine) {
InitCombineMemory();
} else {
InitMemory();
}
std::shared_ptr<framework::BlockDesc> to_predict_block =
to_predict_program_->Block(0);
auto &ops = ops_of_block_[*to_predict_block.get()];
for (const auto &op : ops) {
op->Init();
}
}

template <typename Dtype, Precision P>
void Executor<Dtype, P>::LoadMemory(const framework::VarDesc var_desc,
                                    framework::LoDTensor *tensor, char *&data) {
  // 1. version
  uint32_t version = *(uint32_t *)data;
  data += sizeof(uint32_t);

  // 2. LoD information
  uint64_t *lod_level_ptr = new uint64_t();
  memcpy(lod_level_ptr, data, sizeof(uint64_t));
  uint64_t lod_level = *lod_level_ptr;
  delete lod_level_ptr;
  data += sizeof(uint64_t);

  auto &lod = *tensor->mutable_lod();
  lod.resize(lod_level);
  for (uint64_t i = 0; i < lod_level; ++i) {
    uint64_t size = *(uint64_t *)data;
    data += sizeof(uint64_t);
    DLOG << "lod size: " << i << size;
    std::vector<size_t> tmp(size / sizeof(size_t));

    for (int k = 0; k < tmp.size(); ++k) {
      tmp[k] = *(size_t *)data;
      DLOG << "tmp[k]: " << k << *(size_t *)data;
      data += sizeof(size_t);
    }
    for (auto j : tmp) {
      LOG(kLOG_DEBUG1) << " lod - " << j;
    }
...@@ -304,17 +242,20 @@ void Executor<Dtype, P>::LoadMemory(const framework::VarDesc var_desc,
  }
  // 3. tensor version
  uint32_t tensor_version = *(uint32_t *)data;
  data += sizeof(uint32_t);

  // 4. tensor desc
  int32_t size = *(int32_t *)data;
  data += sizeof(int32_t);

  std::unique_ptr<char[]> buf(new char[size]);
  for (int m = 0; m < size; ++m) {
buf.get()[m] = data[m];
}
data += (sizeof(char) * size);
  const framework::TensorDesc &desc = var_desc.Tensor_desc();
  int memory_size = 1;
  for (auto l : desc.Dims()) {
    memory_size *= l;
...@@ -348,8 +289,10 @@ void Executor<Dtype, P>::LoadMemory(const framework::VarDesc var_desc,
      break;
  }
  for (int n = 0; n < memory_size * type_size; ++n) {
    static_cast<char *>(memory)[n] = data[n];
}
data += (sizeof(char) * memory_size * type_size);
}
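For reference, the byte layout that LoadMemory walks, reconstructed from the parsing code above; the field names are descriptive only:

// Serialized LoDTensor record, consumed sequentially via the data cursor:
//   uint32_t version;                  // 1. variable version
//   uint64_t lod_level;                // 2. number of LoD levels
//   per level { uint64_t byte_size;    //    byte size of this level,
//               size_t offsets[...]; } //    then byte_size / sizeof(size_t) entries
//   uint32_t tensor_version;           // 3. tensor version
//   int32_t  desc_size;                // 4. serialized TensorDesc size
//   char     tensor_desc[desc_size];   //    proto-encoded tensor description
//   char     data[numel * type_size];  //    raw tensor contents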
template <typename Dtype, Precision P>
...@@ -362,8 +305,12 @@ void Executor<Dtype, P>::InitMemory() {
      if (var_desc->Name() == "feed" || var_desc->Name() == "fetch") {
        continue;
      }
      char *origin_data =
Get_binary_data(program_.model_path + "/" + var_desc->Name());
char *data = origin_data;
LoadMemory(*var_desc, tensor, data);
      delete[] origin_data;
    } else {
      if (var_desc->Type() == framework::VARTYPE_TYPE_LOD_TENSOR) {
        auto tensor = var->template GetMutable<framework::LoDTensor>();
...@@ -375,6 +322,32 @@ void Executor<Dtype, P>::InitMemory() {
  }
}
template <typename Dtype, Precision P>
void Executor<Dtype, P>::InitCombineMemory() {
LOG(kLOG_INFO) << " begin init combine memory";
char *origin_data = Get_binary_data(program_.para_path);
char *data = origin_data;
for (const auto &block : to_predict_program_->Blocks()) {
for (const auto &var_desc : block->Vars()) {
auto var = program_.scope->Var(var_desc->Name());
if (var_desc->Persistable()) {
auto tensor = var->template GetMutable<framework::LoDTensor>();
if (var_desc->Name() == "feed" || var_desc->Name() == "fetch") {
continue;
}
LoadMemory(*var_desc, tensor, data);
} else {
if (var_desc->Type() == framework::VARTYPE_TYPE_LOD_TENSOR) {
auto tensor = var->template GetMutable<framework::LoDTensor>();
tensor->template mutable_data<Ptype>();
}
}
}
}
  delete[] origin_data;
LOG(kLOG_INFO) << " end init combine memory ";
}
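InitCombineMemory leans on Get_binary_data to slurp the whole parameter file into a single heap buffer, then threads one cursor through every persistable variable. The helper itself is outside this hunk; a plausible minimal version under the usual read-whole-file idiom (illustrative only, error handling elided):

#include <cstdio>

char *ReadWholeFile(const char *path, long *out_size) {
  FILE *fp = fopen(path, "rb");
  if (fp == nullptr) return nullptr;
  fseek(fp, 0, SEEK_END);
  long size = ftell(fp);
  rewind(fp);
  char *buffer = new char[size];
  fread(buffer, 1, size, fp);
  fclose(fp);
  if (out_size != nullptr) *out_size = size;
  return buffer;  // allocated with new[], so release with delete[]
}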
template <typename Dtype, Precision P>
std::shared_ptr<framework::Tensor> Executor<Dtype, P>::Predict(
const framework::Tensor &t) {
@@ -385,19 +358,135 @@ std::shared_ptr<framework::Tensor> Executor<Dtype, P>::Predict(
feed_tensor->ShareDataWith(t);
std::shared_ptr<framework::BlockDesc> to_predict_block =
to_predict_program_->Block(0);
auto &ops = ops_of_block_[*to_predict_block.get()];
#ifdef PADDLE_MOBILE_PROFILE
std::vector<ProfInfo> profile(ops.size());
#endif
#ifdef PADDLE_EXECUTOR_MULTITHREAD
std::mutex m;
std::condition_variable cv;
std::queue<int> next;
next.push(0);
int rsize = ops.size();
std::vector<int> status(rsize, 0);
auto &threadPool = ThreadPool::getThreadPool();
auto &dep = depManager[0];
auto finishF = [&ops, &m, &cv, &next, &status, &rsize, &dep](int opi) {
std::lock_guard<std::mutex> lk(m);
rsize--;
status[opi] = 2;
for (int i : dep.getNext(opi)) {
bool ok = true;
for (int j : dep.getDeps(i)) {
if (status[j] != 2) {
ok = false;
break;
}
}
if (ok && (status[i] == 0)) {
next.push(i);
}
}
cv.notify_one();
};
for (;;) {
std::unique_lock<std::mutex> lk(m);
cv.wait(lk, [&next, &rsize] { return rsize == 0 || !next.empty(); });
if (rsize == 0) {
break;
}
while (next.size() > 0) {
int opi = next.front();
next.pop();
status[opi] = 1;
threadPool.enqueue([opi, &ops, &finishF, &profile] {
auto &op = ops[opi];
#ifdef PADDLE_MOBILE_PROFILE
struct timespec ts;
clock_gettime(CLOCK_MONOTONIC, &ts);
profile[opi].runBegin = (uint64_t)ts.tv_sec * 1e9 + ts.tv_nsec;
profile[opi].tid = ThreadPool::getThreadPoolThreadId();
#endif
ops[opi]->Run();
#ifdef PADDLE_MOBILE_PROFILE
clock_gettime(CLOCK_MONOTONIC, &ts);
profile[opi].runEnd = (uint64_t)ts.tv_sec * 1e9 + ts.tv_nsec;
#endif
finishF(opi);
});
}
}
#else
for (int i = 0; i < ops.size(); i++) {
#ifdef PADDLE_MOBILE_PROFILE
struct timespec ts;
clock_gettime(CLOCK_MONOTONIC, &ts);
profile[i].runBegin = (uint64_t)ts.tv_sec * 1e9 + ts.tv_nsec;
#endif
// to Run
ops[i]->Run();
#ifdef PADDLE_MOBILE_PROFILE
clock_gettime(CLOCK_MONOTONIC, &ts);
profile[i].runEnd = (uint64_t)ts.tv_sec * 1e9 + ts.tv_nsec;
#endif
}
#endif
auto last_op = ops.rbegin();
auto output_map = (*last_op)->Outputs();
std::vector<std::string> out_keys = (*last_op)->GetOutKeys();
PADDLE_MOBILE_ENFORCE(out_keys.size() > 0, "the last op contains no output");
framework::LoDTensor *output_tensor =
framework::GetVarValue<framework::LoDTensor>(out_keys[0], output_map,
*(program_.scope));
#ifdef PADDLE_MOBILE_PROFILE
#ifdef PADDLE_EXECUTOR_MULTITHREAD
// TODO: expose profile info through an interface so users can retrieve it
// to analyze the performance of their network.
FILE *df = fopen("net.dot", "w");
fprintf(df, "digraph {\n");
for (int i = 0; i < ops.size(); i++) {
for (int j : dep.getNext(i)) {
fprintf(df, "op_%d -> op_%d\n", i, j);
}
}
for (int i = 0; i < ops.size(); i++) {
fprintf(df, "op_%d[label=\"%s (%d)\"]\n", i, ops[i]->Type().c_str(), i);
}
fprintf(df, "}\n");
fclose(df);
#endif
FILE *pf = fopen("profile.out", "w");
std::unordered_map<std::string, uint64_t> _tp;
for (int i = 0; i < profile.size(); i++) {
const auto &pInfo = profile[i];
uint64_t timeCost = pInfo.runEnd - pInfo.runBegin;
_tp[ops[i]->Type()] += timeCost;
fprintf(pf, "%d\t%s\t%d\t%llu\t%llu\t%llu\n", i, ops[i]->Type().c_str(),
pInfo.tid, pInfo.runBegin, pInfo.runEnd, timeCost);
}
fclose(pf);
printf("====================[ profile ]======================\n");
using prof_t = std::pair<std::string, uint64_t>;
std::vector<prof_t> _tv(_tp.begin(), _tp.end());
uint64_t _ptotal = 0;
for (auto const &p : _tv) {
_ptotal += p.second;
}
auto compf = [](const prof_t &a, const prof_t &b) {
return a.second > b.second;
};
std::sort(_tv.begin(), _tv.end(), compf);
_tv.push_back(std::make_pair("total", _ptotal));
for (auto const &p : _tv) {
printf("%-16s\t%-10.0f\t%-2.4f\n", p.first.c_str(), (float)p.second,
(float)p.second / _ptotal * 100.0);
}
printf("====================[---------]======================\n");
#endif
return std::make_shared<framework::Tensor>(framework::Tensor(*output_tensor));
}
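Under PADDLE_EXECUTOR_MULTITHREAD, Predict becomes a small dataflow scheduler: each op is marked running (1) or finished (2) in status, and finishF pushes a successor onto next only once all of its dependencies have reached state 2. The same counting idea, condensed into a standalone sketch with explicit in-degrees (illustrative, not the depCore API; the dispatch callback must hand the task to another thread, as Predict does via its thread pool):

#include <condition_variable>
#include <functional>
#include <mutex>
#include <queue>
#include <vector>

// Execute a DAG of tasks: next[i] lists the successors of task i and
// indegree[i] its unmet dependencies. dispatch(i, done) must run task i
// on a worker thread and call done(i) when it finishes.
void RunDag(const std::vector<std::vector<int>> &next,
            std::vector<int> indegree,
            const std::function<void(int, const std::function<void(int)> &)>
                &dispatch) {
  std::mutex m;
  std::condition_variable cv;
  std::queue<int> ready;
  int remaining = static_cast<int>(indegree.size());
  for (int i = 0; i < static_cast<int>(indegree.size()); ++i)
    if (indegree[i] == 0) ready.push(i);

  std::function<void(int)> done = [&](int i) {
    std::lock_guard<std::mutex> lk(m);
    --remaining;
    for (int j : next[i])
      if (--indegree[j] == 0) ready.push(j);  // successor became runnable
    cv.notify_one();
  };

  for (;;) {
    std::unique_lock<std::mutex> lk(m);
    cv.wait(lk, [&] { return remaining == 0 || !ready.empty(); });
    if (remaining == 0) break;
    while (!ready.empty()) {
      int i = ready.front();
      ready.pop();
      dispatch(i, done);  // must not run inline while the lock is held
    }
  }
}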
template <typename Dtype, Precision P>
std::shared_ptr<framework::Tensor> Executor<Dtype, P>::Predict(
@@ -420,5 +509,7 @@ std::vector<typename Executor<Dtype, P>::Ptype> Executor<Dtype, P>::Predict(
}
template class Executor<CPU, Precision::FP32>;
template class Executor<FPGA, Precision::FP32>;
template class Executor<GPU_MALI, Precision::FP32>;
} // namespace paddle_mobile
@@ -14,51 +14,80 @@ limitations under the License. */
#pragma once
#include <memory.h>
#include <map>
#include <memory>
#include <string>
#include <vector>
#include "common/types.h"
#include "framework/lod_tensor.h"
#include "framework/operator.h"
#include "framework/paddle_mobile_object.h"
#include "framework/program/program.h" #include "framework/program/program.h"
#include "framework/tensor.h" #include "framework/tensor.h"
#ifdef PADDLE_EXECUTOR_MULTITHREAD
#include <condition_variable>
#include <mutex>
#include <thread>
#include "common/dep_core.h"
#endif
namespace paddle_mobile {
template <typename Dtype = CPU, Precision P = Precision::FP32>
class Loader {
public:
/*
 * @b load a fluid model stored as separate parameter files
 * */
const framework::Program<Dtype, P> Load(const std::string &dirname,
bool optimize = false,
bool can_add_split = false);
/*
 * @b load a fluid model stored in combined format (one model file plus one params file)
 * */
const framework::Program<Dtype, P> Load(const std::string &model_path,
const std::string &para_path,
bool optimize = false);
private:
const framework::Program<Dtype, P> LoadProgram(const std::string &model_path,
bool optimize = false,
bool can_add_split = false);
};
template <typename Dtype = CPU, Precision P = Precision::FP32>
class Executor {
public:
typedef typename PrecisionTrait<P>::ptype Ptype;
/*
 * @b init the executor with a program loaded by the Loader class
 * */
Executor(const framework::Program<Dtype> p, int batch_size = 1,
bool use_optimize = true);
/*
 * @b run prediction on an input tensor
 * */
std::shared_ptr<framework::Tensor> Predict(const framework::Tensor &t);
/*
 * @b run prediction given an input vector and its dimension info
 * */
std::vector<Ptype> Predict(const std::vector<Ptype> &input,
const std::vector<int64_t> &dims);
protected:
Executor() = default;
void InitMemory();
void LoadMemory(const framework::VarDesc var_desc,
framework::LoDTensor *tensor, char *&data);
void InitCombineMemory();
framework::Program<Dtype> program_;
int batch_size_ = 1;
std::shared_ptr<framework::ProgramDesc> to_predict_program_;
@@ -68,6 +97,16 @@ class Executor {
std::vector<std::shared_ptr<framework::OperatorBase<Dtype>>>>
ops_of_block_;
bool use_optimize_ = false;
#ifdef PADDLE_EXECUTOR_MULTITHREAD
std::vector<depCore> depManager;
#endif
#ifdef PADDLE_MOBILE_PROFILE
struct ProfInfo {
int tid = 0;
uint64_t runBegin = 0UL;
uint64_t runEnd = 0UL;
};
#endif
};
} // namespace paddle_mobile
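Put together, the reworked io.h gives two entry points into the same Executor. A hedged usage sketch (all paths are placeholders):

#include "io/io.h"

int main() {
  paddle_mobile::Loader<paddle_mobile::CPU> loader;

  // Separate format: a directory with one file per parameter.
  auto program = loader.Load("/sdcard/mobilenet", /*optimize=*/true);

  // Combined format: one model file plus one params file; the Executor
  // would then populate weights through InitCombineMemory().
  // auto program = loader.Load("/sdcard/model", "/sdcard/params");

  paddle_mobile::Executor<paddle_mobile::CPU> executor(program,
                                                       /*batch_size=*/1,
                                                       /*use_optimize=*/true);
  // paddle_mobile::framework::Tensor input;  // e.g. shaped {1, 3, 224, 224}
  // auto output = executor.Predict(input);
  return 0;
}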
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifdef ANDROID
#include "paddle_mobile_jni.h"
#ifdef __cplusplus
extern "C" {
#endif
namespace paddle_mobile {
namespace jni {
using framework::DDim;
using framework::Program;
using framework::Tensor;
using paddle_mobile::CPU;
using std::string;
extern const char *ANDROID_LOG_TAG =
"paddle_mobile LOG built on " __DATE__ " " __TIME__;
static Executor<CPU> *shared_executor_instance = nullptr;
// TODO: guard instance creation with a mutex
// static std::mutex shared_mutex;
Executor<CPU> *getExecutorInstance(const Program<CPU> p, int batch_size,
bool use_optimize) {
if (nullptr == shared_executor_instance) {
shared_executor_instance = new Executor<CPU>(p, batch_size, use_optimize);
}
return shared_executor_instance;
}
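The TODO above is real: two threads calling load concurrently can both pass the null check and leak an Executor. One conventional fix is std::call_once; this is a sketch of an alternative, not what the commit does (the Safe suffix marks it as hypothetical):

#include <mutex>

static std::once_flag executor_once;

Executor<CPU> *getExecutorInstanceSafe(const Program<CPU> p, int batch_size,
                                       bool use_optimize) {
  // call_once guarantees the constructor runs exactly once even under races.
  std::call_once(executor_once, [&] {
    shared_executor_instance = new Executor<CPU>(p, batch_size, use_optimize);
  });
  return shared_executor_instance;
}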
string jstring2cppstring(JNIEnv *env, jstring jstr) {
const char *cstr = env->GetStringUTFChars(jstr, 0);
string cppstr(cstr);
env->ReleaseStringUTFChars(jstr, cstr);
return cppstr;
}
JNIEXPORT jboolean JNICALL Java_com_baidu_paddle_PML_load(JNIEnv *env,
jclass thiz,
jstring modelPath) {
paddle_mobile::Loader<paddle_mobile::CPU> loader;
bool optimize = true;
auto program = loader.Load(jstring2cppstring(env, modelPath), optimize);
shared_executor_instance = getExecutorInstance(program, 1, optimize);
return shared_executor_instance != nullptr ? JNI_TRUE : JNI_FALSE;
}
JNIEXPORT jfloatArray JNICALL Java_com_baidu_paddle_PML_predictImage(
JNIEnv *env, jclass thiz, jfloatArray buf) {
jfloatArray result = NULL;
int count = 0;
float *dataPointer = nullptr;
if (nullptr != buf) {
dataPointer = env->GetFloatArrayElements(buf, NULL);
}
framework::Tensor input;
framework::DDim ddim = framework::make_ddim({1, 3, 224, 224});
input.Resize(ddim);
auto input_ptr = input.mutable_data<float>();
for (int i = 0; i < framework::product(ddim); i++) {
input_ptr[i] = dataPointer[i];
}
auto output = shared_executor_instance->Predict(input);
count = output->numel();
result = env->NewFloatArray(count);
env->SetFloatArrayRegion(result, 0, count, output->data<float>());
return result;
}
JNIEXPORT void JNICALL Java_com_baidu_paddle_PML_clear(JNIEnv *env,
jclass thiz) {}
} // namespace jni
} // namespace paddle_mobile
#ifdef __cplusplus
}
#endif
#endif
@@ -13,25 +13,39 @@ See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#ifdef ANDROID
#include <jni.h>
#include "common/log.h"
#include "framework/tensor.h"
#include "io/io.h"
#ifdef __cplusplus
extern "C" {
#endif
namespace paddle_mobile {
namespace jni {

/**
 * load model & params of the net for android
 */
JNIEXPORT jboolean JNICALL Java_com_baidu_paddle_PML_load(JNIEnv *env,
jclass thiz,
jstring modelPath);

/**
 * object detection for android
 */
JNIEXPORT jfloatArray JNICALL Java_com_baidu_paddle_PML_predictImage(
JNIEnv *env, jclass thiz, jfloatArray buf);

/**
 * clear data of the net when it is destroyed, for android
 */
JNIEXPORT void JNICALL Java_com_baidu_paddle_PML_clear(JNIEnv *env,
jclass thiz);
} // namespace jni
} // namespace paddle_mobile
#ifdef __cplusplus
}
#endif
#endif
@@ -12,19 +12,17 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "memory/t_malloc.h"
#include <cstdlib>
#include <cstring>
namespace paddle_mobile {
namespace memory {
const int MALLOC_ALIGN = 64;
void Copy(void *dst, const void *src, size_t num) {
std::memcpy(dst, src, num);
}
void *Alloc(size_t size) {
size_t offset = sizeof(void *) + MALLOC_ALIGN - 1;
...
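The Alloc body is elided above, but the visible offset computation is the standard over-allocate-and-stash trick: grab sizeof(void *) + MALLOC_ALIGN - 1 extra bytes, round up to the (now cache-line-sized, 64-byte) boundary, and keep the raw malloc pointer just below the returned block so the matching free can recover it. A sketch of the full pair under that assumption (function names are illustrative):

#include <cstdint>
#include <cstdlib>

void *AlignedAlloc(size_t size) {
  const size_t offset = sizeof(void *) + MALLOC_ALIGN - 1;
  char *p = static_cast<char *>(malloc(offset + size));
  if (p == nullptr) return nullptr;
  // Round p + offset down to a MALLOC_ALIGN boundary; this still leaves
  // at least sizeof(void *) bytes below r for the stashed pointer.
  void *r = reinterpret_cast<void *>(reinterpret_cast<uintptr_t>(p + offset) &
                                     ~static_cast<uintptr_t>(MALLOC_ALIGN - 1));
  static_cast<void **>(r)[-1] = p;  // remember the original allocation
  return r;
}

void AlignedFree(void *ptr) {
  if (ptr != nullptr) free(static_cast<void **>(ptr)[-1]);
}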
@@ -12,20 +12,34 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifdef BATCHNORM_OP
#include "batchnorm_op.h" #include "batchnorm_op.h"
#include "framework/op_proto_maker.h"
#include "framework/op_registry.h"
namespace paddle_mobile {
namespace operators {
template <typename Dtype, typename T>
void BatchNormOp<Dtype, T>::InferShape() const {
auto x_dims = this->param_.InputX()->dims();
this->param_.OutputY()->Resize(x_dims);
}
template class BatchNormOp<CPU, float>;
} // namespace operators
} // namespace paddle_mobile
namespace ops = paddle_mobile::operators;
#ifdef PADDLE_MOBILE_CPU
USE_OP_CPU(batch_norm);
REGISTER_OPERATOR_CPU(batch_norm, ops::BatchNormOp);
#endif
#ifdef PADDLE_MOBILE_MALI_GPU
USE_OP_MALI_GPU(batch_norm);
REGISTER_OPERATOR_MALI_GPU(batch_norm, ops::BatchNormOp);
#endif
#ifdef PADDLE_MOBILE_FPGA
#endif
#endif
@@ -12,6 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifdef BATCHNORM_OP
#pragma once
#include <string>
@@ -23,27 +25,24 @@ namespace paddle_mobile {
namespace operators {
using std::string;
template <typename DeviceType, typename T>
class BatchNormOp
: public framework::OperatorWithKernel<DeviceType, BatchNormParam,
BatchNormKernel<DeviceType, T>> {
public:
BatchNormOp(const string &type, const VariableNameMap &inputs,
const VariableNameMap &outputs,
const framework::AttributeMap &attrs,
std::shared_ptr<framework::Scope> scope)
: framework::OperatorWithKernel<DeviceType, BatchNormParam,
BatchNormKernel<DeviceType, T>>(
type, inputs, outputs, attrs, scope) {}
void InferShape() const override;

protected:
};
} // namespace operators
} // namespace paddle_mobile
#endif
@@ -12,6 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifdef BOXCODER_OP
#include "operators/box_coder_op.h" #include "operators/box_coder_op.h"
#include <vector> #include <vector>
namespace paddle_mobile { namespace paddle_mobile {
...@@ -19,11 +21,11 @@ namespace operators { ...@@ -19,11 +21,11 @@ namespace operators {
template <typename Dtype, typename T> template <typename Dtype, typename T>
void BoxCoderOp<Dtype, T>::InferShape() const { void BoxCoderOp<Dtype, T>::InferShape() const {
auto input_priorbox_dims = param_.InputPriorBox()->dims(); auto input_priorbox_dims = this->param_.InputPriorBox()->dims();
auto input_priorboxvar_dims = param_.InputPriorBoxVar()->dims(); auto input_priorboxvar_dims = this->param_.InputPriorBoxVar()->dims();
auto input_targetbox_dims = param_.InputTargetBox()->dims(); auto input_targetbox_dims = this->param_.InputTargetBox()->dims();
auto code_type = param_.CodeType(); auto code_type = this->param_.CodeType();
if (code_type == "encode_center_size") { if (code_type == "encode_center_size") {
if (input_targetbox_dims.size() != 2) { if (input_targetbox_dims.size() != 2) {
@@ -42,7 +44,7 @@ void BoxCoderOp<Dtype, T>::InferShape() const {
LOG(kLOG_ERROR) << " dimension not match";
}
}
this->param_.OutputBox()->Resize(framework::make_ddim(
{input_targetbox_dims[0], input_priorbox_dims[0], 4}));
}
template class BoxCoderOp<CPU, float>;
@@ -50,5 +52,13 @@ template class BoxCoderOp<CPU, float>;
} // namespace paddle_mobile
namespace ops = paddle_mobile::operators;
#ifdef PADDLE_MOBILE_CPU
USE_OP_CPU(box_coder);
REGISTER_OPERATOR_CPU(box_coder, ops::BoxCoderOp);
#endif
#ifdef PADDLE_MOBILE_MALI_GPU
#endif
#ifdef PADDLE_MOBILE_FPGA
#endif
#endif
@@ -12,6 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifdef BOXCODER_OP
#pragma once
#include <string>
@@ -26,27 +28,27 @@ namespace operators {
using paddle_mobile::framework::Tensor;
template <typename DeviceType, typename T>
class BoxCoderOp
: public framework::OperatorWithKernel<
DeviceType, BoxCoderParam, operators::BoxCoderKernel<DeviceType, T>> {
public:
BoxCoderOp(const std::string &type, const VariableNameMap &inputs,
const VariableNameMap &outputs,
const framework::AttributeMap &attrs,
std::shared_ptr<framework::Scope> scope)
: framework::OperatorWithKernel<DeviceType, BoxCoderParam,
operators::BoxCoderKernel<DeviceType, T>>(
type, inputs, outputs, attrs, scope) {}

using framework::OperatorWithKernel<
DeviceType, BoxCoderParam,
operators::BoxCoderKernel<DeviceType, T>>::OperatorWithKernel;
void InferShape() const override;

protected:
};
} // namespace operators
} // namespace paddle_mobile
#endif
@@ -12,6 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifdef CONCAT_OP
#include "concat_op.h" #include "concat_op.h"
namespace paddle_mobile { namespace paddle_mobile {
...@@ -19,7 +21,7 @@ namespace operators { ...@@ -19,7 +21,7 @@ namespace operators {
template <typename Dtype, typename T> template <typename Dtype, typename T>
void ConcatOp<Dtype, T>::InferShape() const { void ConcatOp<Dtype, T>::InferShape() const {
auto inputs = param_.Inputs(); auto inputs = this->param_.Inputs();
const size_t n = inputs.size(); const size_t n = inputs.size();
std::vector<DDim> inputs_dims; std::vector<DDim> inputs_dims;
...@@ -28,7 +30,7 @@ void ConcatOp<Dtype, T>::InferShape() const { ...@@ -28,7 +30,7 @@ void ConcatOp<Dtype, T>::InferShape() const {
inputs_dims.push_back(inputs[i]->dims()); inputs_dims.push_back(inputs[i]->dims());
} }
auto axis = static_cast<size_t>(param_.Axis()); auto axis = static_cast<size_t>(this->param_.Axis());
if (n == 1) { if (n == 1) {
DLOG << "Warning: concat op have only one input, " DLOG << "Warning: concat op have only one input, "
...@@ -52,7 +54,7 @@ void ConcatOp<Dtype, T>::InferShape() const { ...@@ -52,7 +54,7 @@ void ConcatOp<Dtype, T>::InferShape() const {
out_dims[axis] = -1; out_dims[axis] = -1;
} }
param_.Out()->Resize(out_dims); this->param_.Out()->Resize(out_dims);
} }
template class ConcatOp<CPU, float>; template class ConcatOp<CPU, float>;
...@@ -60,5 +62,15 @@ template class ConcatOp<CPU, float>; ...@@ -60,5 +62,15 @@ template class ConcatOp<CPU, float>;
} // namespace paddle_mobile } // namespace paddle_mobile
namespace ops = paddle_mobile::operators; namespace ops = paddle_mobile::operators;
USE_OP(concat); #ifdef PADDLE_MOBILE_CPU
REGISTER_OPERATOR(concat, ops::ConcatOp); USE_OP_CPU(concat);
REGISTER_OPERATOR_CPU(concat, ops::ConcatOp);
#endif
#ifdef PADDLE_MOBILE_MALI_GPU
USE_OP_MALI_GPU(concat);
REGISTER_OPERATOR_MALI_GPU(concat, ops::ConcatOp);
#endif
#ifdef PADDLE_MOBILE_FPGA
#endif
#endif
@@ -12,6 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifdef CONCAT_OP
#pragma once
#include <string>
@@ -22,26 +24,26 @@ namespace paddle_mobile {
namespace operators {
using std::string;
template <typename DeviceType, typename T>
class ConcatOp
: public framework::OperatorWithKernel<
DeviceType, ConcatParam, operators::ConcatKernel<DeviceType, T>> {
public:
ConcatOp(const string &type, const VariableNameMap &inputs,
const VariableNameMap &outputs, const framework::AttributeMap &attrs,
std::shared_ptr<framework::Scope> scope)
: framework::OperatorWithKernel<DeviceType, ConcatParam,
operators::ConcatKernel<DeviceType, T>>(
type, inputs, outputs, attrs, scope) {}

using framework::OperatorWithKernel<
DeviceType, ConcatParam,
operators::ConcatKernel<DeviceType, T>>::OperatorWithKernel;
void InferShape() const override;

protected:
};
} // namespace operators
} // namespace paddle_mobile
#endif
@@ -12,42 +12,25 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifdef CONV_OP
#include "operators/conv_op.h" #include "operators/conv_op.h"
#include <vector> #include <vector>
#include "framework/data_type.h"
#include "framework/op_proto_maker.h" #include "framework/op_proto_maker.h"
#include "framework/op_registry.h" #include "framework/op_registry.h"
#include "operators/math/conv_func.h"
namespace paddle_mobile {
namespace operators {
template <typename Dtype, typename T>
void ConvOp<Dtype, T>::InferShape() const {
auto in_dims = this->param_.Input()->dims();
auto filter_dims = this->param_.Filter()->dims();
const std::vector<int> &strides = this->param_.Strides();
std::vector<int> paddings = this->param_.Paddings();
int groups = this->param_.Groups();
std::vector<int> dilations = this->param_.Dilations();
PADDLE_MOBILE_ENFORCE((in_dims.size() == filter_dims.size() &&
dilations.size() == paddings.size() &&
@@ -56,13 +39,13 @@ void ConvOp<Dtype, T>::InferShape() const {
std::vector<int64_t> output_shape({in_dims[0], filter_dims[0]});
for (size_t i = 0; i < strides.size(); ++i) {
output_shape.push_back(
math::ConvOutputSize(in_dims[i + 2], filter_dims[i + 2], dilations[i],
paddings[i], strides[i]));
}
framework::DDim ddim = framework::make_ddim(output_shape);
this->param_.Output()->Resize(ddim);
}
template class ConvOp<CPU, float>;
@@ -71,5 +54,17 @@ template class ConvOp<CPU, float>;
} // namespace paddle_mobile
namespace ops = paddle_mobile::operators;
#ifdef PADDLE_MOBILE_CPU
USE_OP_CPU(conv2d);
REGISTER_OPERATOR_CPU(conv2d, ops::ConvOp);
#endif
#ifdef PADDLE_MOBILE_MALI_GPU
USE_OP_MALI_GPU(conv2d);
REGISTER_OPERATOR_MALI_GPU(conv2d, ops::ConvOp);
#endif
#ifdef PADDLE_MOBILE_FPGA
USE_OP_FPGA(conv2d);
REGISTER_OPERATOR_FPGA(conv2d, ops::ConvOp);
#endif
#endif
@@ -12,6 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifdef CONV_OP
#pragma once
#include <string>
@@ -22,34 +24,26 @@ namespace paddle_mobile {
namespace operators {
using std::string;
template <typename DeviceType, typename T>
class ConvOp
: public framework::OperatorWithKernel<
DeviceType, ConvParam, operators::ConvKernel<DeviceType, T>> {
public:
ConvOp(const std::string &type, const VariableNameMap &inputs,
const VariableNameMap &outputs, const framework::AttributeMap &attrs,
std::shared_ptr<framework::Scope> scope)
: framework::OperatorWithKernel<DeviceType, ConvParam,
operators::ConvKernel<DeviceType, T>>(
type, inputs, outputs, attrs, scope) {}

using framework::OperatorWithKernel<
DeviceType, ConvParam,
operators::ConvKernel<DeviceType, T>>::OperatorWithKernel;
void InferShape() const override;

private:
};
inline int ConvOutputSize(int input_size, int filter_size, int dilation,
int padding, int stride) {
const int dkernel = dilation * (filter_size - 1) + 1;
int output_size = (input_size + 2 * padding - dkernel) / stride + 1;
return output_size;
}
} // namespace operators
} // namespace paddle_mobile
#endif
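To make the moved formula concrete: a 224×224 input with a 3×3 filter, padding 1, stride 2, and dilation 1 gives dkernel = 1 · (3 − 1) + 1 = 3 and output_size = (224 + 2·1 − 3) / 2 + 1 = 112, the familiar halving step at the first convolution of MobileNet-style backbones.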
@@ -12,24 +12,26 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifdef DEPTHWISECONV_OP
#include "operators/depthwise_conv_op.h" #include "operators/depthwise_conv_op.h"
#include <vector> #include <vector>
#include "framework/data_type.h"
#include "framework/op_proto_maker.h" #include "framework/op_proto_maker.h"
#include "framework/op_registry.h" #include "framework/op_registry.h"
#include "operators/conv_op.h" #include "operators/conv_op.h"
#include "operators/math/conv_func.h"
namespace paddle_mobile {
namespace operators {
template <typename Dtype, typename T>
void DepthwiseConvOp<Dtype, T>::InferShape() const {
auto in_dims = this->param_.Input()->dims();
auto filter_dims = this->param_.Filter()->dims();
const std::vector<int> &strides = this->param_.Strides();
std::vector<int> paddings = this->param_.Paddings();
int groups = this->param_.Groups();
std::vector<int> dilations = this->param_.Dilations();
PADDLE_MOBILE_ENFORCE((in_dims.size() == filter_dims.size() &&
dilations.size() == paddings.size() &&
@@ -38,13 +40,13 @@ void DepthwiseConvOp<Dtype, T>::InferShape() const {
std::vector<int64_t> output_shape({in_dims[0], filter_dims[0]});
for (size_t i = 0; i < strides.size(); ++i) {
output_shape.push_back(
math::ConvOutputSize(in_dims[i + 2], filter_dims[i + 2], dilations[i],
paddings[i], strides[i]));
}
framework::DDim ddim = framework::make_ddim(output_shape);
this->param_.Output()->Resize(ddim);
}
template class DepthwiseConvOp<CPU, float>;
@@ -53,5 +55,13 @@ template class DepthwiseConvOp<CPU, float>;
} // namespace paddle_mobile
namespace ops = paddle_mobile::operators;
#ifdef PADDLE_MOBILE_CPU
USE_OP_CPU(depthwise_conv2d);
REGISTER_OPERATOR_CPU(depthwise_conv2d, ops::DepthwiseConvOp);
#endif
#ifdef PADDLE_MOBILE_MALI_GPU
#endif
#ifdef PADDLE_MOBILE_FPGA
#endif
#endif
@@ -12,6 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifdef DEPTHWISECONV_OP
#pragma once
#include <string>
@@ -22,28 +24,28 @@ namespace paddle_mobile {
namespace operators {
template <typename DeviceType, typename T>
class DepthwiseConvOp : public framework::OperatorWithKernel<
DeviceType, ConvParam,
operators::DepthwiseConvKernel<DeviceType, T>> {
public:
DepthwiseConvOp(const std::string &type, const VariableNameMap &inputs,
const VariableNameMap &outputs,
const framework::AttributeMap &attrs,
std::shared_ptr<framework::Scope> scope)
: framework::OperatorWithKernel<
DeviceType, ConvParam,
operators::DepthwiseConvKernel<DeviceType, T>>(
type, inputs, outputs, attrs, scope) {}
using framework::OperatorWithKernel<
DeviceType, ConvParam,
operators::DepthwiseConvKernel<DeviceType, T>>::OperatorWithKernel;
void InferShape() const override;

private:
};
} // namespace operators
} // namespace paddle_mobile
#endif
@@ -12,6 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifdef ELEMENTWISEADD_OP
#include "elementwise_add_op.h" #include "elementwise_add_op.h"
namespace paddle_mobile { namespace paddle_mobile {
...@@ -19,13 +21,23 @@ namespace operators { ...@@ -19,13 +21,23 @@ namespace operators {
template <typename Dtype, typename T> template <typename Dtype, typename T>
void ElementwiseAddOp<Dtype, T>::InferShape() const { void ElementwiseAddOp<Dtype, T>::InferShape() const {
auto x_dim = param_.InputX()->dims(); auto x_dim = this->param_.InputX()->dims();
param_.Out()->Resize(x_dim); this->param_.Out()->Resize(x_dim);
} }
template class ElementwiseAddOp<CPU, float>; template class ElementwiseAddOp<CPU, float>;
} // namespace operators } // namespace operators
} // namespace paddle_mobile } // namespace paddle_mobile
namespace ops = paddle_mobile::operators; namespace ops = paddle_mobile::operators;
USE_OP(elementwise_add); #ifdef PADDLE_MOBILE_CPU
REGISTER_OPERATOR(elementwise_add, ops::ElementwiseAddOp); USE_OP_CPU(elementwise_add);
REGISTER_OPERATOR_CPU(elementwise_add, ops::ElementwiseAddOp);
#endif
#ifdef PADDLE_MOBILE_MALI_GPU
USE_OP_MALI_GPU(elementwise_add);
REGISTER_OPERATOR_MALI_GPU(elementwise_add, ops::ElementwiseAddOp);
#endif
#ifdef PADDLE_MOBILE_FPGA
#endif
#endif
@@ -12,6 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifdef ELEMENTWISEADD_OP
#pragma once
#include <string>
@@ -23,26 +25,27 @@ namespace paddle_mobile {
namespace operators {
using std::string;
template <typename DeviceType, typename T>
class ElementwiseAddOp : public framework::OperatorWithKernel<
DeviceType, ElementwiseAddParam,
operators::ElementwiseAddKernel<DeviceType, T>> {
public:
ElementwiseAddOp(const string &type, const VariableNameMap &inputs,
const VariableNameMap &outputs,
const framework::AttributeMap &attrs,
std::shared_ptr<framework::Scope> scope)
: framework::OperatorWithKernel<
DeviceType, ElementwiseAddParam,
operators::ElementwiseAddKernel<DeviceType, T>>(
type, inputs, outputs, attrs, scope) {}

using framework::OperatorWithKernel<
DeviceType, ElementwiseAddParam,
operators::ElementwiseAddKernel<DeviceType, T>>::OperatorWithKernel;

void InferShape() const override;

protected:
};
} // namespace operators
} // namespace paddle_mobile
#endif
@@ -32,6 +32,8 @@ class FeedOp : public framework::OperatorBase<DeviceType> {
param_(inputs, outputs, attrs, *scope) {}
void RunImpl() const { param_.Out()->ShareDataWith(*param_.InputX()); }
void Init() const {}
void InferShape() const {
auto out_dims = param_.Out()->dims();
out_dims[0] = param_.BatchSize();
@@ -43,8 +45,16 @@ class FeedOp : public framework::OperatorBase<DeviceType> {
};
namespace ops = paddle_mobile::operators;
#ifdef PADDLE_MOBILE_CPU
USE_OP_CPU(feed);
REGISTER_OPERATOR_CPU(feed, ops::FeedOp);
#endif
#ifdef PADDLE_MOBILE_MALI_GPU
USE_OP_MALI_GPU(feed);
REGISTER_OPERATOR_MALI_GPU(feed, ops::FeedOp);
#endif
#ifdef PADDLE_MOBILE_FPGA
#endif
} // namespace operators
} // namespace paddle_mobile
@@ -33,6 +33,8 @@ class FetchOp : public framework::OperatorBase<DeviceType> {
param_(inputs, outputs, attrs, *scope) {}
void RunImpl() const { param_.Out()->ShareDataWith(*param_.InputX()); }
void Init() const {}
void InferShape() const {
auto x_dims = param_.InputX()->dims();
param_.Out()->Resize(x_dims);
@@ -43,8 +45,16 @@ class FetchOp : public framework::OperatorBase<DeviceType> {
};
namespace ops = paddle_mobile::operators;
#ifdef PADDLE_MOBILE_CPU
USE_OP_CPU(fetch);
REGISTER_OPERATOR_CPU(fetch, ops::FetchOp);
#endif
#ifdef PADDLE_MOBILE_MALI_GPU
USE_OP_MALI_GPU(fetch);
REGISTER_OPERATOR_MALI_GPU(fetch, ops::FetchOp);
#endif
#ifdef PADDLE_MOBILE_FPGA
#endif
} // namespace operators
} // namespace paddle_mobile
@@ -12,66 +12,52 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifdef FUSION_CONVADD_OP
#include "operators/fusion_conv_add.h"
#include "operators/math/conv_func.h"
namespace paddle_mobile {
namespace operators {
template <typename Dtype, typename T>
void FusionConvAddOp<Dtype, T>::InferShape() const {
auto in_dims = this->param_.Input()->dims();
auto filter_dims = this->param_.Filter()->dims();
const std::vector<int> &strides = this->param_.Strides();
std::vector<int> paddings = this->param_.Paddings();
int groups = this->param_.Groups();
std::vector<int> dilations = this->param_.Dilations();
PADDLE_MOBILE_ENFORCE((in_dims.size() == filter_dims.size() &&
dilations.size() == paddings.size() &&
paddings.size() == strides.size()),
"ConvParam is not suitable");
std::vector<int64_t> output_shape({in_dims[0], filter_dims[0]});
for (size_t i = 0; i < strides.size(); ++i) {
output_shape.push_back(
math::ConvOutputSize(in_dims[i + 2], filter_dims[i + 2], dilations[i],
paddings[i], strides[i]));
}
framework::DDim ddim = framework::make_ddim(output_shape);
this->param_.Output()->Resize(ddim);
}
template class FusionConvAddOp<CPU, float>;
} // namespace operators
} // namespace paddle_mobile
namespace ops = paddle_mobile::operators;
#ifdef PADDLE_MOBILE_CPU
USE_OP_CPU(conv_add);
REGISTER_OPERATOR_CPU(conv_add, ops::FusionConvAddOp);
#endif
#ifdef PADDLE_MOBILE_MALI_GPU
USE_OP_MALI_GPU(conv_add);
REGISTER_OPERATOR_MALI_GPU(conv_add, ops::FusionConvAddOp);
#endif
#ifdef PADDLE_MOBILE_FPGA
#endif
#endif
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#define FUSION_CONVADD_OP
#ifdef FUSION_CONVADD_OP
#pragma once
#include <string>
#include <vector>
#include "framework/operator.h"
#include "framework/program/program-optimize/fusion_op_register.h"
#include "op_param.h"
#include "operators/kernel/conv_add_kernel.h"
namespace paddle_mobile {
namespace operators {
using std::string;
using std::vector;
class FusionConvAddMatcher : public framework::FusionOpMatcher {
public:
FusionConvAddMatcher() {
node_ = framework::Node(G_OP_TYPE_CONV);
node_ > std::make_shared<framework::Node>(G_OP_TYPE_ELEMENTWISE_ADD);
}
void FolderNodes(
framework::Node *node,
std::vector<std::shared_ptr<framework::Node>> *removed_nodes) {
vector<std::shared_ptr<framework::OpDesc>> origin_descs =
node->OpDescs(node_.Depth());
node->Folder(node_.Depth(), Type(),
{{G_OP_TYPE_ELEMENTWISE_ADD, {"Y", "Y"}}}, removed_nodes);
}
std::string Type() { return G_OP_TYPE_CONV_ADD; }
};
template <typename DeviceType, typename T>
class FusionConvAddOp : public framework::OperatorWithKernel<
DeviceType, FusionConvAddParam,
operators::ConvAddKernel<DeviceType, T>> {
public:
FusionConvAddOp(const string &type, const VariableNameMap &inputs,
const VariableNameMap &outputs,
const framework::AttributeMap &attrs,
std::shared_ptr<framework::Scope> scope)
: framework::OperatorWithKernel<DeviceType, FusionConvAddParam,
operators::ConvAddKernel<DeviceType, T>>(
type, inputs, outputs, attrs, scope) {}
using framework::OperatorWithKernel<
DeviceType, FusionConvAddParam,
operators::ConvAddKernel<DeviceType, T>>::OperatorWithKernel;
void InferShape() const override;
protected:
};
#ifdef PADDLE_MOBILE_CPU
#ifndef CONV_ADD_REGISTER
static framework::FusionOpRegistrar convadd_registrar(
new FusionConvAddMatcher());
#define CONV_ADD_REGISTER
#endif
#endif
#ifdef PADDLE_MOBILE_MALI_GPU
#ifndef CONV_ADD_REGISTER
static framework::FusionOpRegistrar convadd_registrar(
new FusionConvAddMatcher());
#define CONV_ADD_REGISTER
#endif
#endif
#ifdef PADDLE_MOBILE_FPGA
#endif
} // namespace operators
} // namespace paddle_mobile
#endif
@@ -12,4 +12,49 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifdef CONVADDRELU_OP
#include "fusion_conv_add_relu_op.h" #include "fusion_conv_add_relu_op.h"
#include "operators/math/conv_func.h"
namespace paddle_mobile {
namespace operators {
template <typename Dtype, typename T>
void FusionConvAddReluOp<Dtype, T>::InferShape() const {
auto in_dims = this->param_.Input()->dims();
auto filter_dims = this->param_.Filter()->dims();
const std::vector<int> &strides = this->param_.Strides();
std::vector<int> paddings = this->param_.Paddings();
int groups = this->param_.Groups();
std::vector<int> dilations = this->param_.Dilations();
PADDLE_MOBILE_ENFORCE((in_dims.size() == filter_dims.size() &&
dilations.size() == paddings.size() &&
paddings.size() == strides.size()),
"ConvParam is not suitable");
std::vector<int64_t> output_shape({in_dims[0], filter_dims[0]});
for (size_t i = 0; i < strides.size(); ++i) {
output_shape.push_back(
math::ConvOutputSize(in_dims[i + 2], filter_dims[i + 2], dilations[i],
paddings[i], strides[i]));
}
framework::DDim ddim = framework::make_ddim(output_shape);
this->param_.Output()->Resize(ddim);
}
} // namespace operators
} // namespace paddle_mobile
namespace ops = paddle_mobile::operators;
#ifdef PADDLE_MOBILE_CPU
USE_OP_CPU(fusion_conv_add_relu);
REGISTER_OPERATOR_CPU(fusion_conv_add_relu, ops::FusionConvAddReluOp);
#endif
#ifdef PADDLE_MOBILE_MALI_GPU
#endif
#ifdef PADDLE_MOBILE_FPGA
#endif
#endif
@@ -12,38 +12,72 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifdef CONVADDRELU_OP
#pragma once
#include "framework/operator.h"
#include "framework/program/program-optimize/fusion_op_register.h"
#include "operators/kernel/conv_add_relu_kernel.h"
#include "operators/op_param.h"
namespace paddle_mobile {
namespace operators {
class FusionConvAddReluOpMatcher : public framework::FusionOpMatcher {
public:
FusionConvAddReluOpMatcher() {
node_ = framework::Node(G_OP_TYPE_CONV);
node_ > std::make_shared<framework::Node>(G_OP_TYPE_ELEMENTWISE_ADD) >
std::make_shared<framework::Node>(G_OP_TYPE_RELU);
}
void FolderNodes(
framework::Node *node,
std::vector<std::shared_ptr<framework::Node>> *removed_nodes) {
node->Folder(node_.Depth(), Type(),
{{G_OP_TYPE_ELEMENTWISE_ADD, {"Y", "Y"}}}, removed_nodes);
}
std::string Type() { return G_OP_TYPE_FUSION_CONV_ADD_RELU; }
};
template <typename DeviceType, typename T>
class FusionConvAddReluOp : public framework::OperatorWithKernel<
DeviceType, FusionConvAddReluParam,
operators::ConvAddReluKernel<DeviceType, T>> {
public: public:
private: FusionConvAddReluOp(const string &type, const VariableNameMap &inputs,
const VariableNameMap &outputs,
const framework::AttributeMap &attrs,
std::shared_ptr<framework::Scope> scope)
: framework::OperatorWithKernel<
DeviceType, FusionConvAddReluParam,
operators::ConvAddReluKernel<DeviceType, T>>(type, inputs, outputs,
attrs, scope) {}
using framework::OperatorWithKernel<
DeviceType, FusionConvAddReluParam,
operators::ConvAddReluKernel<DeviceType, T>>::OperatorWithKernel;
void InferShape() const override;
protected:
}; };
#ifdef PADDLE_MOBILE_CPU
#ifndef CONV_ADD_RELU_REGISTER
#define CONV_ADD_RELU_REGISTER
// static framework::FusionOpRegistrar fusion_conv_add_relu_registrar(new
// FusionConvAddReluOpMatcher());
#endif
#endif
#ifdef PADDLE_MOBILE_MALI_GPU
#endif
#ifdef PADDLE_MOBILE_FPGA
#endif
} // namespace operators
} // namespace paddle_mobile
#endif
@@ -12,16 +12,18 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifdef FUSION_FC_OP
#include "operators/fusion_fc_op.h" #include "operators/fusion_fc_op.h"
namespace paddle_mobile { namespace paddle_mobile {
namespace operators { namespace operators {
template <typename Dtype, typename T> template <typename Dtype, typename T>
void FushionFcOp<Dtype, T>::InferShape() const { void FusionFcOp<Dtype, T>::InferShape() const {
auto x_dims = param_.InputX()->dims(); auto x_dims = this->param_.InputX()->dims();
auto y_dims = param_.InputY()->dims(); auto y_dims = this->param_.InputY()->dims();
int x_num_col_dims = param_.XNumColDims(); int x_num_col_dims = this->param_.XNumColDims();
int y_num_col_dims = param_.YNumColDims(); int y_num_col_dims = this->param_.YNumColDims();
assert(x_dims.size() > x_num_col_dims); assert(x_dims.size() > x_num_col_dims);
assert(y_dims.size() > y_num_col_dims); assert(y_dims.size() > y_num_col_dims);
@@ -45,12 +47,22 @@ void FushionFcOp<Dtype, T>::InferShape() const {
} }
framework::DDim ddim = framework::make_ddim(output_dims); framework::DDim ddim = framework::make_ddim(output_dims);
param_.Out()->Resize(ddim); this->param_.Out()->Resize(ddim);
} }
template class FushionFcOp<CPU, float>; template class FusionFcOp<CPU, float>;
} // namespace operators } // namespace operators
} // namespace paddle_mobile } // namespace paddle_mobile
namespace ops = paddle_mobile::operators; namespace ops = paddle_mobile::operators;
USE_OP(fc); #ifdef PADDLE_MOBILE_CPU
REGISTER_OPERATOR(fc, ops::FushionFcOp); USE_OP_CPU(fc);
REGISTER_OPERATOR_CPU(fc, ops::FusionFcOp);
#endif
#ifdef PADDLE_MOBILE_MALI_GPU
USE_OP_MALI_GPU(fc);
REGISTER_OPERATOR_MALI_GPU(fc, ops::FusionFcOp);
#endif
#ifdef PADDLE_MOBILE_FPGA
#endif
#endif
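For reference, the shape rule InferShape applies here: X is flattened to a 2-D matrix at x_num_col_dims, Y at y_num_col_dims, and the output keeps X's leading dims followed by Y's trailing dims (standard mul/fc semantics). A hedged, self-contained sketch of that rule; FcOutDims and the example shapes are illustrative, not repo code:
// Output dims for fc/mul: x_dims[0 : x_num_col_dims] ++ y_dims[y_num_col_dims :].
#include <cassert>
#include <cstdint>
#include <iostream>
#include <vector>
std::vector<int64_t> FcOutDims(const std::vector<int64_t> &x_dims, int x_num_col_dims,
                               const std::vector<int64_t> &y_dims, int y_num_col_dims) {
  assert(static_cast<int>(x_dims.size()) > x_num_col_dims);
  assert(static_cast<int>(y_dims.size()) > y_num_col_dims);
  std::vector<int64_t> out(x_dims.begin(), x_dims.begin() + x_num_col_dims);
  out.insert(out.end(), y_dims.begin() + y_num_col_dims, y_dims.end());
  return out;
}
int main() {
  // X: [8, 3, 4, 4] flattened at 1 -> [8, 48]; Y: [48, 10] -> output [8, 10]
  for (int64_t d : FcOutDims({8, 3, 4, 4}, 1, {48, 10}, 1)) std::cout << d << " ";
}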
@@ -12,6 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#ifdef FUSION_FC_OP
#pragma once #pragma once
#include <string> #include <string>
@@ -19,7 +21,7 @@ limitations under the License. */
#include "framework/operator.h" #include "framework/operator.h"
#include "framework/program/program-optimize/fusion_op_register.h" #include "framework/program/program-optimize/fusion_op_register.h"
#include "operators/kernel/fushion_fc_kernel.h" #include "operators/kernel/fusion_fc_kernel.h"
namespace paddle_mobile { namespace paddle_mobile {
namespace operators { namespace operators {
@@ -32,40 +34,55 @@ class FusionFcMatcher : public framework::FusionOpMatcher {
node_ > std::make_shared<framework::Node>(G_OP_TYPE_ELEMENTWISE_ADD); node_ > std::make_shared<framework::Node>(G_OP_TYPE_ELEMENTWISE_ADD);
} }
void FolderNodes(framework::Node *node) { void FolderNodes(
vector<std::shared_ptr<framework::OpDesc>> origin_descs = framework::Node *node,
node->OpDescs(node_.Depth()); std::vector<std::shared_ptr<framework::Node>> *removed_nodes) {
node->Folder(node_.Depth(), Type(), node->Folder(node_.Depth(), Type(),
{{G_OP_TYPE_ELEMENTWISE_ADD, {"Y", "Z"}}}); {{G_OP_TYPE_ELEMENTWISE_ADD, {"Y", "Z"}}}, removed_nodes);
} }
std::string Type() { return G_OP_TYPE_FC; } std::string Type() { return G_OP_TYPE_FC; }
}; };
template <typename DeviceType, typename T> template <typename DeviceType, typename T>
class FushionFcOp : public framework::OperatorWithKernel<DeviceType> { class FusionFcOp
: public framework::OperatorWithKernel<
DeviceType, FusionFcParam, operators::FusionFcKernel<DeviceType, T>> {
public: public:
FushionFcOp(const string &type, const VariableNameMap &inputs, FusionFcOp(const string &type, const VariableNameMap &inputs,
const VariableNameMap &outputs, const VariableNameMap &outputs,
const framework::AttributeMap attrs, const framework::AttributeMap &attrs,
std::shared_ptr<framework::Scope> scope) std::shared_ptr<framework::Scope> scope)
: framework::OperatorWithKernel<DeviceType>(type, inputs, outputs, attrs, : framework::OperatorWithKernel<DeviceType, FusionFcParam,
scope), operators::FusionFcKernel<DeviceType, T>>(
param_(inputs, outputs, attrs, *scope) {} type, inputs, outputs, attrs, scope) {}
void RunImpl() const {
operators::FushionFcKernel<DeviceType, T> kernel;
kernel.Compute(param_);
}
using framework::OperatorWithKernel<DeviceType>::OperatorWithKernel; using framework::OperatorWithKernel<
DeviceType, FusionFcParam,
operators::FusionFcKernel<DeviceType, T>>::OperatorWithKernel;
void InferShape() const override; void InferShape() const override;
protected: protected:
FushionFcParam param_;
}; };
#ifdef PADDLE_MOBILE_CPU
#ifndef CONV_CPU_REGISTER
#define CONV_CPU_REGISTER
static framework::FusionOpRegistrar fc_registrar(new FusionFcMatcher());
#endif
#endif
#ifdef PADDLE_MOBILE_MALI_GPU
#ifndef CONV_CPU_REGISTER
#define CONV_CPU_REGISTER
static framework::FusionOpRegistrar fc_registrar(new FusionFcMatcher()); static framework::FusionOpRegistrar fc_registrar(new FusionFcMatcher());
#endif
#endif
#ifdef PADDLE_MOBILE_FPGA
#endif
} // namespace operators } // namespace operators
} // namespace paddle_mobile } // namespace paddle_mobile
#endif
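The #ifndef CONV_CPU_REGISTER guards make sure the static fc_registrar is emitted only once even when several backend blocks are active (note the MALI_GPU block reuses the CPU guard name, which reads like a copied macro rather than a deliberate choice). The registrar itself is the usual static self-registration idiom: a global object whose constructor populates a registry before main runs. A minimal sketch, assuming nothing about the real FusionOpRegistrar; all names below are illustrative:
// Static self-registration: the Registrar constructor runs at program start
// and adds an entry to a construct-on-first-use registry.
#include <functional>
#include <iostream>
#include <map>
#include <string>
std::map<std::string, std::function<void()>> &Registry() {
  static std::map<std::string, std::function<void()>> r;  // avoids init-order issues
  return r;
}
struct Registrar {
  Registrar(const std::string &name, std::function<void()> fn) {
    Registry()[name] = std::move(fn);
  }
};
static Registrar fc_registrar("fc", [] { std::cout << "run fc matcher\n"; });
int main() { Registry()["fc"](); }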
@@ -12,82 +12,25 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#pragma once #ifdef BATCHNORM_OP
#include "operators/kernel/batchnorm_kernel.h" #include "operators/kernel/batchnorm_kernel.h"
#include "operators/kernel/central-arm-func/batchnorm_arm_func.h"
namespace paddle_mobile { namespace paddle_mobile {
namespace operators { namespace operators {
template <> template <>
void BatchNormKernel<CPU, float>::Compute(const BatchNormParam &param) const { bool BatchNormKernel<CPU, float>::Init(const BatchNormParam &para) const {
/// todo: test. return true;
const Tensor *input_x = param.InputX(); }
auto input_x_ptr = input_x->data<float>();
const auto &x_dims = input_x->dims();
const int N = x_dims[0];
const int C = x_dims[1];
const int H = x_dims[2];
const int W = x_dims[3];
const int stride0 = C * H * W;
const int stride1 = H * W;
const int stride2 = W;
Tensor *out = param.OutputY();
auto out_ptr = out->mutable_data<float>();
const float epsilon = param.Epsilon();
const Tensor *mean = param.InputMean();
const Tensor *variance = param.InputVariance();
const Tensor *scale = param.InputScale();
const Tensor *bias = param.InputBias();
auto mean_ptr = mean->data<float>();
auto variance_ptr = variance->data<float>();
auto scale_ptr = scale->data<float>();
auto bias_ptr = bias->data<float>();
Tensor inv_std;
auto inv_std_ptr = inv_std.mutable_data<float>(make_ddim({C}));
if (C != variance->numel()) {
std::cout << "C must equal to variance.numel()" << std::endl;
}
assert(C == variance->numel());
/// std = (var + epsilon).sqrt();
/// inv_std = 1 / std;
for (int i = 0; i < C; i++) {
inv_std_ptr[i] =
1 / static_cast<float>(pow((variance_ptr[i] + epsilon), 0.5));
}
Tensor new_scale;
auto new_scale_ptr = new_scale.mutable_data<float>(make_ddim({C}));
Tensor new_bias;
auto new_bias_ptr = new_bias.mutable_data<float>(make_ddim({C}));
/// ((x - est_mean) * (inv_var) * scale + bias equal to template <>
/// (x * inv_var * scale) + (bias - est_mean * inv_var * scale) void BatchNormKernel<CPU, float>::Compute(const BatchNormParam &param) const {
for (int i = 0; i < C; i++) { BatchnormCompute<float>(param);
new_scale_ptr[i] = inv_std_ptr[i] * scale_ptr[i];
new_bias_ptr[i] = bias_ptr[i] - mean_ptr[i] * inv_std_ptr[i] * scale_ptr[i];
{
for (int n = 0; n < N; n++) {
for (int h = 0; h < H; h++) {
int tmp_index = n * stride0 + i * stride1 + h * stride2;
for (int w = 0; w < W; w++) {
int index = tmp_index + w;
out_ptr[index] =
input_x_ptr[index] * new_scale_ptr[i] + new_bias_ptr[i];
}
}
}
}
}
DLOG << "input[2,5,1,0](input[102]) ,channel 5 :";
DLOG << "input_x_ptr : " << input_x_ptr[102];
DLOG << "variance : " << variance_ptr[5];
DLOG << "inv_std_ptr : " << inv_std_ptr[5];
DLOG << "new_scale_ptr : " << new_scale_ptr[5];
DLOG << "new_bias_ptr : " << new_bias_ptr[5];
DLOG << "out_ptr : " << out_ptr[102];
} }
} // namespace operators } // namespace operators
} // namespace paddle_mobile } // namespace paddle_mobile
#endif
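This change adds a bool Init(const Param &) const hook beside Compute on every CPU kernel; the bodies only return true for now, but the split gives each kernel a one-time setup point (weight repacking, algorithm selection) separate from the per-run Compute. A hedged sketch of the shape of that interface, not the actual OpKernelBase:
// Init runs once when the op is prepared; Compute runs per inference.
#include <iostream>
template <typename Param>
class KernelBase {
 public:
  virtual bool Init(const Param &param) const = 0;     // one-time setup
  virtual void Compute(const Param &param) const = 0;  // per-run work
  virtual ~KernelBase() = default;
};
struct DemoParam { float x; };
class DemoKernel : public KernelBase<DemoParam> {
 public:
  bool Init(const DemoParam &) const override { return true; }  // nothing to set up yet
  void Compute(const DemoParam &p) const override { std::cout << p.x * 2 << "\n"; }
};
int main() {
  DemoKernel k;
  DemoParam p{3.0f};
  if (k.Init(p)) k.Compute(p);  // prints 6
}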
@@ -12,9 +12,10 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#pragma once #ifdef BOXCODER_OP
#include "operators/kernel/box_coder_kernel.h" #include "operators/kernel/box_coder_kernel.h"
#include <cmath>
namespace paddle_mobile { namespace paddle_mobile {
namespace operators { namespace operators {
@@ -109,6 +110,11 @@ void DecodeCenterSize(const framework::Tensor& target_box,
} }
} }
template <>
bool BoxCoderKernel<CPU, float>::Init(const BoxCoderParam& para) const {
return true;
}
template <> template <>
void BoxCoderKernel<CPU, float>::Compute(const BoxCoderParam& param) const { void BoxCoderKernel<CPU, float>::Compute(const BoxCoderParam& param) const {
const auto* input_priorbox = param.InputPriorBox(); const auto* input_priorbox = param.InputPriorBox();
@@ -135,3 +141,5 @@ void BoxCoderKernel<CPU, float>::Compute(const BoxCoderParam& param) const {
} }
} // namespace operators } // namespace operators
} // namespace paddle_mobile } // namespace paddle_mobile
#endif
@@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#pragma once #ifdef CONCAT_OP
#include "operators/kernel/concat_kernel.h" #include "operators/kernel/concat_kernel.h"
@@ -52,6 +52,11 @@ class ConcatFunctor {
} }
}; };
template <>
bool ConcatKernel<CPU, float>::Init(const ConcatParam &para) const {
return true;
}
template <> template <>
void ConcatKernel<CPU, float>::Compute(const ConcatParam &param) const { void ConcatKernel<CPU, float>::Compute(const ConcatParam &param) const {
auto inputs = param.Inputs(); auto inputs = param.Inputs();
@@ -85,3 +90,5 @@ void ConcatKernel<CPU, float>::Compute(const ConcatParam &param) const {
} // namespace operators } // namespace operators
} // namespace paddle_mobile } // namespace paddle_mobile
#endif
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifdef FUSION_CONVADD_OP
#include "operators/kernel/conv_add_kernel.h"
namespace paddle_mobile {
namespace operators {
template <>
bool ConvAddKernel<CPU, float>::Init(const FusionConvAddParam &para) const {
return true;
}
template <>
void ConvAddKernel<CPU, float>::Compute(const FusionConvAddParam &param) const {
const Tensor *input = param.Input();
Tensor filter = *param.Filter();
Tensor bias = *param.Bias();
int axis = param.Axis();
Tensor *output = param.Output();
math::expand_bias(bias, axis, output->dims());
output->ShareDataWith(bias);
int groups = param.Groups();
std::vector<int> strides = param.Strides();
std::vector<int> paddings = param.Paddings();
std::vector<int> dilations = param.Dilations();
const int batch_size = static_cast<int>(input->dims()[0]);
std::vector<int64_t> filter_shape_vec(framework::vectorize(filter.dims()));
std::vector<int64_t> output_shape_vec(framework::vectorize(output->dims()));
size_t data_dim = filter_shape_vec.size() - 2;
std::vector<int64_t> col_shape_vec(1 + 2 * data_dim);
col_shape_vec[0] = input->dims()[1] / groups;
for (size_t j = 0; j < data_dim; ++j) {
col_shape_vec[j + 1] = filter_shape_vec[j + 2];
col_shape_vec[j + 1 + data_dim] = output_shape_vec[j + 2];
}
framework::DDim col_shape(framework::make_ddim(col_shape_vec));
framework::DDim col_matrix_shape =
framework::flatten_to_2d(col_shape, data_dim + 1);
bool is_expand =
math::IsExpand(filter_shape_vec, strides, paddings, dilations);
Tensor col;
Tensor col_matrix;
if (is_expand) {
col.mutable_data<float>(col_shape);
col_matrix.ShareDataWith(col);
col_matrix.Resize(col_matrix_shape);
}
framework::DDim input_shape = framework::slice_ddim(
input->dims(), 1, static_cast<int>(input->dims().size()));
framework::DDim filter_matrix_shape = {filter.dims()[0],
filter.numel() / filter.dims()[0]};
filter.Resize(filter_matrix_shape);
framework::DDim output_matrix_shape = {
output->dims()[1],
output->numel() / (output->dims()[0] * output->dims()[1])};
// convolution operator: im2col(or vol2col) + gemm
int in_step = static_cast<int>(input->dims()[1]) / groups;
int out_step = static_cast<int>(output->dims()[1]) / groups;
math::Vol2ColFunctor<CPU, float> vol2col;
math::Im2ColFunctor<math::ColFormat::kCFO, CPU, float> im2col;
for (int i = 0; i < batch_size; i++) {
Tensor in_batch = input->Slice(i, i + 1).Resize(input_shape);
Tensor out_batch = output->Slice(i, i + 1).Resize(output_matrix_shape);
for (int g = 0; g < groups; g++) {
Tensor in_slice = in_batch.Slice(g * in_step, (g + 1) * in_step);
if (!is_expand) {
col.ShareDataWith(in_slice);
col_matrix.ShareDataWith(col);
col_matrix.Resize(col_matrix_shape);
} else if (data_dim == 2U) {
// im2col
im2col(in_slice, dilations, strides,
std::vector<int>{paddings[0], paddings[1], paddings[0],
paddings[1]},
&col);
} else if (data_dim == 3U) {
// vol2col
vol2col(in_slice, dilations, strides, paddings, &col);
}
// gemm
Tensor out_slice = out_batch.Slice(g * out_step, (g + 1) * out_step);
Tensor filter_slice = filter.Slice(g * out_step, (g + 1) * out_step);
math::matmul<float>(filter_slice, false, col_matrix, false,
static_cast<float>(1), &out_slice,
static_cast<float>(1));
}
}
}
template class ConvAddKernel<CPU, float>;
} // namespace operators
} // namespace paddle_mobile
#endif
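The kernel lowers convolution to im2col (vol2col for 3-D data) followed by GEMM: the col buffer has shape [C_in/groups, k_h, k_w, out_h, out_w], which is flattened to 2-D and multiplied by the filter reshaped to [C_out, C_in/groups * k_h * k_w]. A small worked sketch of that shape bookkeeping; the concrete dims are assumed for illustration:
// Mirrors col_shape_vec above for an assumed example:
// input NCHW [1, 8, 32, 32], filter [16, 8, 3, 3], stride 1, pad 1, groups 1
// -> output [1, 16, 32, 32].
#include <cstdint>
#include <iostream>
#include <vector>
int main() {
  std::vector<int64_t> in = {1, 8, 32, 32}, filt = {16, 8, 3, 3}, out = {1, 16, 32, 32};
  int groups = 1;
  size_t data_dim = filt.size() - 2;  // 2 for a 2-D conv
  std::vector<int64_t> col(1 + 2 * data_dim);
  col[0] = in[1] / groups;  // input channels per group
  for (size_t j = 0; j < data_dim; ++j) {
    col[j + 1] = filt[j + 2];            // kernel h, w
    col[j + 1 + data_dim] = out[j + 2];  // output h, w
  }
  // col = [8, 3, 3, 32, 32]; as a matrix [8*3*3, 32*32] = [72, 1024].
  // filter matrix [16, 72]; GEMM yields [16, 1024], one batch slice of output.
  for (int64_t d : col) std::cout << d << " ";
}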
@@ -12,9 +12,28 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#include "var_desc.h" #ifdef FUSION_CONVADD_RELU_OP
#include "operators/kernel/conv_add_relu_kernel.h"
#include "operators/kernel/central-arm-func/conv_add_relu_arm_func.h"
namespace paddle_mobile { namespace paddle_mobile {
namespace operators {
template <>
bool ConvAddReluKernel<CPU, float>::Init(
const FusionConvAddReluParam &para) const {
return true;
}
namespace framework {} // namespace framework template <>
void ConvAddReluKernel<CPU, float>::Compute(
const FusionConvAddReluParam &param) const {
ConvAddReluCompute<float>(param);
}
template class ConvAddReluKernel<CPU, float>;
} // namespace operators
} // namespace paddle_mobile } // namespace paddle_mobile
#endif
@@ -12,103 +12,27 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#ifdef CONV_OP
#include "operators/kernel/conv_kernel.h" #include "operators/kernel/conv_kernel.h"
#include "operators/kernel/central-arm-func/conv_arm_func.h"
namespace paddle_mobile { namespace paddle_mobile {
namespace operators { namespace operators {
template <> template <>
void ConvKernel<CPU, float>::Compute(const ConvParam &param) const { bool ConvKernel<CPU, float>::Init(const ConvParam &para) const {
LOG(kLOG_DEBUG) << param; return true;
}
const Tensor *input = param.Input();
Tensor filter = *param.Filter();
Tensor *output = param.Output();
output->mutable_data<float>();
int groups = param.Groups();
std::vector<int> strides = param.Strides();
std::vector<int> paddings = param.Paddings();
std::vector<int> dilations = param.Dilations();
// DLOG << " compute end get Attrs " << strides[0];
const int batch_size = static_cast<int>(input->dims()[0]);
std::vector<int64_t> filter_shape_vec(framework::vectorize(filter.dims()));
std::vector<int64_t> output_shape_vec(framework::vectorize(output->dims()));
size_t data_dim = filter_shape_vec.size() - 2;
std::vector<int64_t> col_shape_vec(1 + 2 * data_dim);
col_shape_vec[0] = input->dims()[1] / groups;
for (size_t j = 0; j < data_dim; ++j) {
col_shape_vec[j + 1] = filter_shape_vec[j + 2];
col_shape_vec[j + 1 + data_dim] = output_shape_vec[j + 2];
}
framework::DDim col_shape(framework::make_ddim(col_shape_vec));
framework::DDim col_matrix_shape =
framework::flatten_to_2d(col_shape, data_dim + 1);
bool is_expand = IsExpand(filter_shape_vec, strides, paddings, dilations);
Tensor col;
Tensor col_matrix;
if (is_expand) {
col.mutable_data<float>(col_shape);
col_matrix.ShareDataWith(col);
col_matrix.Resize(col_matrix_shape);
}
framework::DDim input_shape = framework::slice_ddim(
input->dims(), 1, static_cast<int>(input->dims().size()));
framework::DDim filter_matrix_shape = {filter.dims()[0],
filter.numel() / filter.dims()[0]};
filter.Resize(filter_matrix_shape);
DLOG << " filter.dims() = " << filter.dims();
framework::DDim output_matrix_shape = {
output->dims()[1],
output->numel() / (output->dims()[0] * output->dims()[1])};
// convolution operator: im2col(or vol2col) + gemm
int in_step = static_cast<int>(input->dims()[1]) / groups;
int out_step = static_cast<int>(output->dims()[1]) / groups;
math::Vol2ColFunctor<CPU, float> vol2col;
math::Im2ColFunctor<math::ColFormat::kCFO, CPU, float> im2col;
for (int i = 0; i < batch_size; i++) {
Tensor in_batch = input->Slice(i, i + 1).Resize(input_shape);
Tensor out_batch = output->Slice(i, i + 1).Resize(output_matrix_shape);
for (int g = 0; g < groups; g++) {
Tensor in_slice = in_batch.Slice(g * in_step, (g + 1) * in_step);
if (!is_expand) {
col.ShareDataWith(in_slice);
col_matrix.ShareDataWith(col);
col_matrix.Resize(col_matrix_shape);
} else if (data_dim == 2U) {
// im2col
im2col(in_slice, dilations, strides,
std::vector<int>{paddings[0], paddings[1], paddings[0],
paddings[1]},
&col);
} else if (data_dim == 3U) {
// vol2col
vol2col(in_slice, dilations, strides, paddings, &col);
}
// gemm template <>
Tensor out_slice = out_batch.Slice(g * out_step, (g + 1) * out_step); void ConvKernel<CPU, float>::Compute(const ConvParam &param) const {
Tensor filter_slice = filter.Slice(g * out_step, (g + 1) * out_step); ConvCompute<float>(param);
math::matmul<float>(filter_slice, false, col_matrix, false,
static_cast<float>(1), &out_slice,
static_cast<float>(0));
}
}
} }
template class ConvKernel<CPU, float>; template class ConvKernel<CPU, float>;
} // namespace operators } // namespace operators
} // namespace paddle_mobile } // namespace paddle_mobile
#endif
@@ -12,115 +12,27 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#ifdef DEPTHWISECONV_OP
#include "operators/kernel/depthwise_conv_kernel.h" #include "operators/kernel/depthwise_conv_kernel.h"
#include "operators/kernel/conv_kernel.h" #include "operators/kernel/central-arm-func/depthwise_conv_arm_func.h"
namespace paddle_mobile { namespace paddle_mobile {
namespace operators { namespace operators {
template <> template <>
void DepthwiseConvKernel<CPU, float>::Compute(const ConvParam &param) const { bool DepthwiseConvKernel<CPU, float>::Init(const ConvParam &para) const {
LOG(kLOG_DEBUG) << param; return true;
}
const Tensor *input = param.Input();
Tensor filter = *param.Filter();
Tensor *output = param.Output();
output->mutable_data<float>();
int groups = param.Groups();
std::vector<int> strides = param.Strides();
std::vector<int> paddings = param.Paddings();
std::vector<int> dilations = param.Dilations();
// DLOG << " compute end get Attrs " << strides[0];
const int batch_size = static_cast<int>(input->dims()[0]);
std::vector<int64_t> filter_shape_vec(framework::vectorize(filter.dims()));
std::vector<int64_t> output_shape_vec(framework::vectorize(output->dims()));
size_t data_dim = filter_shape_vec.size() - 2;
std::vector<int64_t> col_shape_vec(1 + 2 * data_dim);
col_shape_vec[0] = input->dims()[1] / groups;
for (size_t j = 0; j < data_dim; ++j) {
col_shape_vec[j + 1] = filter_shape_vec[j + 2];
col_shape_vec[j + 1 + data_dim] = output_shape_vec[j + 2];
}
framework::DDim col_shape(framework::make_ddim(col_shape_vec));
framework::DDim col_matrix_shape =
framework::flatten_to_2d(col_shape, data_dim + 1);
bool is_expand = IsExpand(filter_shape_vec, strides, paddings, dilations);
Tensor col;
Tensor col_matrix;
if (is_expand) {
col.mutable_data<float>(col_shape);
col_matrix.ShareDataWith(col);
col_matrix.Resize(col_matrix_shape);
}
// DLOG << " col_shape = " << col_shape;
// DLOG << " col_matrix_shape = " << col_matrix_shape;
framework::DDim input_shape = framework::slice_ddim(
input->dims(), 1, static_cast<int>(input->dims().size()));
// DLOG << " input_shape = " << input_shape;
framework::DDim filter_matrix_shape = {filter.dims()[0],
filter.numel() / filter.dims()[0]};
filter.Resize(filter_matrix_shape);
// DLOG << " filter.dims() = " << filter.dims();
framework::DDim output_matrix_shape = {
output->dims()[1],
output->numel() / (output->dims()[0] * output->dims()[1])};
// convolution operator: im2col(or vol2col) + gemm
int in_step = static_cast<int>(input->dims()[1]) / groups;
int out_step = static_cast<int>(output->dims()[1]) / groups;
math::Vol2ColFunctor<CPU, float> vol2col;
math::Im2ColFunctor<math::ColFormat::kCFO, CPU, float> im2col;
for (int i = 0; i < batch_size; i++) {
Tensor in_batch = input->Slice(i, i + 1).Resize(input_shape);
Tensor out_batch = output->Slice(i, i + 1).Resize(output_matrix_shape);
// DLOG << " in_batch.dims() = " << in_batch.dims();
// DLOG << " out_batch.dims() = " << out_batch.dims();
for (int g = 0; g < groups; g++) {
Tensor in_slice = in_batch.Slice(g * in_step, (g + 1) * in_step);
if (!is_expand) {
col.ShareDataWith(in_slice);
col_matrix.ShareDataWith(col);
col_matrix.Resize(col_matrix_shape);
} else if (data_dim == 2U) {
// im2col
im2col(in_slice, dilations, strides,
std::vector<int>{paddings[0], paddings[1], paddings[0],
paddings[1]},
&col);
} else if (data_dim == 3U) {
// vol2col
vol2col(in_slice, dilations, strides, paddings, &col);
}
// gemm template <>
Tensor out_slice = out_batch.Slice(g * out_step, (g + 1) * out_step); void DepthwiseConvKernel<CPU, float>::Compute(const ConvParam &param) const {
Tensor filter_slice = filter.Slice(g * out_step, (g + 1) * out_step); DepthwiseConvCompute<float>(param);
// DLOG << " out_slice " << out_slice.dims();
// DLOG << " filter_slice " << filter_slice.dims();
// DLOG << " col_matrix " << col_matrix.dims();
math::matmul<float>(filter_slice, false, col_matrix, false,
static_cast<float>(1), &out_slice,
static_cast<float>(0));
auto filter_ptr = filter_slice.data<float>();
}
}
} }
template class DepthwiseConvKernel<CPU, float>; template class DepthwiseConvKernel<CPU, float>;
} // namespace operators } // namespace operators
} // namespace paddle_mobile } // namespace paddle_mobile
#endif
@@ -12,6 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#ifdef ELEMENTWISEADD_OP
#pragma once #pragma once
#include "operators/kernel/elementwise_add_kernel.h" #include "operators/kernel/elementwise_add_kernel.h"
@@ -24,6 +26,12 @@ struct AddFunctor {
inline T operator()(T a, T b) const { return a + b; } inline T operator()(T a, T b) const { return a + b; }
}; };
template <>
bool ElementwiseAddKernel<CPU, float>::Init(
const ElementwiseAddParam &para) const {
return true;
}
template <> template <>
void ElementwiseAddKernel<CPU, float>::Compute( void ElementwiseAddKernel<CPU, float>::Compute(
const ElementwiseAddParam &param) const { const ElementwiseAddParam &param) const {
@@ -40,3 +48,5 @@ template class ElementwiseAddKernel<CPU, float>;
} // namespace operators } // namespace operators
} // namespace paddle_mobile } // namespace paddle_mobile
#endif
@@ -12,15 +12,22 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#ifdef FUSION_FC_OP
#pragma once #pragma once
#include "operators/kernel/fushion_fc_kernel.h" #include "operators/kernel/fusion_fc_kernel.h"
namespace paddle_mobile { namespace paddle_mobile {
namespace operators { namespace operators {
template <> template <>
void FushionFcKernel<CPU, float>::Compute(const FushionFcParam &param) const { bool FusionFcKernel<CPU, float>::Init(const FusionFcParam &para) const {
return true;
}
template <>
void FusionFcKernel<CPU, float>::Compute(const FusionFcParam &param) const {
const Tensor *input_x = param.InputX(); const Tensor *input_x = param.InputX();
const Tensor *input_y = param.InputY(); const Tensor *input_y = param.InputY();
const Tensor *input_z = param.InputZ(); const Tensor *input_z = param.InputZ();
@@ -65,3 +72,5 @@ void FushionFcKernel<CPU, float>::Compute(const FushionFcParam &param) const {
} // namespace operators } // namespace operators
} // namespace paddle_mobile } // namespace paddle_mobile
#endif
@@ -12,6 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#ifdef LRN_OP
#pragma once #pragma once
#include "operators/kernel/lrn_kernel.h" #include "operators/kernel/lrn_kernel.h"
@@ -19,17 +21,23 @@ limitations under the License. */
namespace paddle_mobile { namespace paddle_mobile {
namespace operators { namespace operators {
template <>
bool LrnKernel<CPU, float>::Init(const LrnParam &para) const {
return true;
}
template <> template <>
void LrnKernel<CPU, float>::Compute(const LrnParam &param) const { void LrnKernel<CPU, float>::Compute(const LrnParam &param) const {
const Tensor *input_x = param.InputX(); const Tensor *input_x = param.InputX();
auto x_dims = input_x->dims(); auto x_dims = input_x->dims();
Tensor *out = param.Out();
out->mutable_data<float>();
/// data_format = NCHW /// data_format = NCHW
const int N = x_dims[0]; const int N = x_dims[0];
const int C = x_dims[1]; const int C = x_dims[1];
const int H = x_dims[2]; const int H = x_dims[2];
const int W = x_dims[3]; const int W = x_dims[3];
Tensor *out = param.Out();
out->mutable_data<float>();
const int n = param.N(); const int n = param.N();
const float alpha = param.Alpha(); const float alpha = param.Alpha();
const float beta = param.Beta(); const float beta = param.Beta();
@@ -42,3 +50,5 @@ template class LrnKernel<CPU, float>;
} // namespace operators } // namespace operators
} // namespace paddle_mobile } // namespace paddle_mobile
#endif
@@ -12,6 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#ifdef MUL_OP
#pragma once #pragma once
#include "operators/kernel/mul_kernel.h" #include "operators/kernel/mul_kernel.h"
@@ -19,6 +21,11 @@ limitations under the License. */
namespace paddle_mobile { namespace paddle_mobile {
namespace operators { namespace operators {
template <>
bool MulKernel<CPU, float>::Init(const MulParam &para) const {
return true;
}
template <> template <>
void MulKernel<CPU, float>::Compute(const MulParam &param) const { void MulKernel<CPU, float>::Compute(const MulParam &param) const {
const Tensor *input_x = param.InputX(); const Tensor *input_x = param.InputX();
@@ -48,3 +55,5 @@ template class MulKernel<CPU, float>;
} // namespace operators } // namespace operators
} // namespace paddle_mobile } // namespace paddle_mobile
#endif
@@ -12,10 +12,10 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#pragma once #ifdef MULTICLASSNMS_OP
#include "operators/kernel/multiclass_nms_kernel.h" #include "operators/kernel/multiclass_nms_kernel.h"
#include <algorithm>
namespace paddle_mobile { namespace paddle_mobile {
namespace operators { namespace operators {
@@ -203,6 +203,12 @@ void MultiClassOutput(const Tensor& scores, const Tensor& bboxes,
} }
} }
template <>
bool MultiClassNMSKernel<CPU, float>::Init(
const MultiClassNMSParam& para) const {
return true;
}
template <> template <>
void MultiClassNMSKernel<CPU, float>::Compute( void MultiClassNMSKernel<CPU, float>::Compute(
const MultiClassNMSParam& param) const { const MultiClassNMSParam& param) const {
@@ -273,3 +279,5 @@ void MultiClassNMSKernel<CPU, float>::Compute(
} // namespace operators } // namespace operators
} // namespace paddle_mobile } // namespace paddle_mobile
#endif
@@ -12,6 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#ifdef POOL_OP
#include <operators/kernel/pool_kernel.h> #include <operators/kernel/pool_kernel.h>
#include "common/log.h" #include "common/log.h"
@@ -33,6 +35,11 @@ inline void PoolBasic(std::string pooling_type, std::vector<int> ksize,
} }
} }
template <>
bool PoolKernel<CPU, float>::Init(const PoolParam &para) const {
return true;
}
template <> template <>
void PoolKernel<CPU, float>::Compute(const PoolParam &param) const { void PoolKernel<CPU, float>::Compute(const PoolParam &param) const {
const Tensor *in_x = param.Input(); const Tensor *in_x = param.Input();
@@ -54,22 +61,25 @@ void PoolKernel<CPU, float>::Compute(const PoolParam &param) const {
paddings[i] = 0; paddings[i] = 0;
ksize[i] = static_cast<int>(in_x->dims()[i + 2]); ksize[i] = static_cast<int>(in_x->dims()[i + 2]);
} }
} } else if (ksize[0] == 3 && ksize[0] == ksize[1]) {
if (pooling_type == "max") {
math::Pool3x3Max(strides, paddings, in_x, out);
} else if (pooling_type == "avg") {
math::Pool3x3Avg(strides, paddings, in_x, out);
}
} else if (ksize[0] == 2 && ksize[0] == ksize[1]) {
if (pooling_type == "max") {
math::Pool2x2Max(strides, paddings, in_x, out);
} else if (pooling_type == "avg") {
math::Pool2x2Avg(strides, paddings, in_x, out);
}
PoolBasic(pooling_type, ksize, strides, paddings, in_x, out); } else {
PoolBasic(pooling_type, ksize, strides, paddings, in_x, out);
// if (param.isGlobalPooling() || ksize[0] != ksize[1] || }
// strides[0] != strides[1] || strides[1] != 2 ||
// paddings[0] != paddings[1] || paddings[1] > 1) {
// PoolBasic(pooling_type, ksize, strides, paddings, in_x, out);
//
// } else if (ksize[0] == 2) {
//
// } else if (ksize[0] == 3) {
//
// } else {
// PoolBasic(pooling_type, ksize, strides, paddings, in_x, out);
// }
} }
} // namespace operators } // namespace operators
} // namespace paddle_mobile } // namespace paddle_mobile
#endif
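The rewritten Compute now dispatches on the window size: square 3x3 and 2x2 kernels go to the hand-tuned paths (math::Pool3x3Max, math::Pool2x2Avg, and so on) and everything else falls back to the generic PoolBasic. A condensed sketch of that dispatch, with the math:: calls stubbed out as plain functions:
// Dispatch mirroring the new PoolKernel::Compute; the three Pool* functions
// below are stand-ins for the specialized and generic implementations.
#include <iostream>
#include <string>
#include <vector>
void Pool3x3(const std::string &t) { std::cout << "fast 3x3 " << t << "\n"; }
void Pool2x2(const std::string &t) { std::cout << "fast 2x2 " << t << "\n"; }
void PoolBasic(const std::string &t) { std::cout << "generic " << t << "\n"; }
void Dispatch(const std::vector<int> &ksize, const std::string &pooling_type) {
  if (ksize[0] == 3 && ksize[0] == ksize[1]) {
    Pool3x3(pooling_type);
  } else if (ksize[0] == 2 && ksize[0] == ksize[1]) {
    Pool2x2(pooling_type);
  } else {
    PoolBasic(pooling_type);
  }
}
int main() {
  Dispatch({3, 3}, "max");  // fast 3x3 max
  Dispatch({5, 5}, "avg");  // generic avg
}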
@@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#pragma once #ifdef PRIORBOX_OP
#include "operators/kernel/prior_box_kernel.h" #include "operators/kernel/prior_box_kernel.h"
@@ -26,6 +26,11 @@ struct ClipFunctor {
} }
}; };
template <>
bool PriorBoxKernel<CPU, float>::Init(const PriorBoxParam &para) const {
return true;
}
template <> template <>
void PriorBoxKernel<CPU, float>::Compute(const PriorBoxParam &param) const { void PriorBoxKernel<CPU, float>::Compute(const PriorBoxParam &param) const {
const auto *input_ = param.Input(); const auto *input_ = param.Input();
@@ -143,3 +148,5 @@ void PriorBoxKernel<CPU, float>::Compute(const PriorBoxParam &param) const {
} // namespace operators } // namespace operators
} // namespace paddle_mobile } // namespace paddle_mobile
#endif
@@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#pragma once #ifdef RELU_OP
#include "operators/kernel/relu_kernel.h" #include "operators/kernel/relu_kernel.h"
#include <operators/math/transform.h> #include <operators/math/transform.h>
@@ -25,6 +25,11 @@ struct ReluFunctor {
inline T operator()(T in) const { return in > 0 ? in : 0; } inline T operator()(T in) const { return in > 0 ? in : 0; }
}; };
template <>
bool ReluKernel<CPU, float>::Init(const ReluParam &para) const {
return true;
}
/* /*
* @b Platform-specific implementation; the param is passed in from the op layer * @b Platform-specific implementation; the param is passed in from the op layer
* */ * */
@@ -35,13 +40,74 @@ void ReluKernel<CPU, float>::Compute(const ReluParam &param) const {
auto *out = param.Out(); auto *out = param.Out();
auto *out_ptr = out->mutable_data<float>(); auto *out_ptr = out->mutable_data<float>();
int numel = input_x->numel();
// if (numel > 64) {
// asm volatile(
// "pld [%[input_x_ptr], #0] \n\t"
// "vmov.f32 q8, #0.0 \n\t"
// "subs %[num], %[num], #32 \n\t"
// "blt end_num_%= \n\t"
// "loop_num_%=: \n\t"
// "pld [%[input_x_ptr], #1024] \n\t"
//
// "vld1.32 {q0, q1}, [%[input_x_ptr]]! \n\t"
// "vld1.32 {q2, q3}, [%[input_x_ptr]]! \n\t"
// "vld1.32 {q4, q5}, [%[input_x_ptr]]! \n\t"
// "vld1.32 {q6, q7}, [%[input_x_ptr]]! \n\t"
//
// "vmax.f32 q0, q0, q8 \n\t"
// "vmax.f32 q1, q1, q8 \n\t"
// "vmax.f32 q2, q2, q8 \n\t"
// "vmax.f32 q3, q3, q8 \n\t"
// "vmax.f32 q4, q4, q8 \n\t"
// "vmax.f32 q5, q5, q8 \n\t"
// "vmax.f32 q6, q6, q8 \n\t"
// "vmax.f32 q7, q7, q8 \n\t"
//
// "vst1.32 {q0, q1}, [%[out_ptr]]! \n\t"
// "vst1.32 {q2, q3}, [%[out_ptr]]! \n\t"
// "vst1.32 {q4, q5}, [%[out_ptr]]! \n\t"
// "vst1.32 {q6, q7}, [%[out_ptr]]! \n\t"
//
// "subs %[num], %[num], #32 \n\t"
// "bge loop_num_%= \n\t"
// "end_num_%=: \n\t"
// "cmp %[num], #0 \n\t"
// "bge end_%= \n\t"
// "mov r6, #4 \n\t"
// "mul r5, %[num], r6 \n\t"
// "add %[input_x_ptr], %[input_x_ptr], r5 \n\t"
// "vld1.32 {q0, q1}, [%[input_x_ptr]]! \n\t"
// "vld1.32 {q2, q3}, [%[input_x_ptr]]! \n\t"
// "vld1.32 {q4, q5}, [%[input_x_ptr]]! \n\t"
// "vld1.32 {q6, q7}, [%[input_x_ptr]]! \n\t"
// "vmax.f32 q0, q0, q8 \n\t"
// "vmax.f32 q1, q1, q8 \n\t"
// "vmax.f32 q2, q2, q8 \n\t"
// "vmax.f32 q3, q3, q8 \n\t"
// "vmax.f32 q4, q4, q8 \n\t"
// "vmax.f32 q5, q5, q8 \n\t"
// "vmax.f32 q6, q6, q8 \n\t"
// "vmax.f32 q7, q7, q8 \n\t"
// "add %[out_ptr], %[out_ptr], r5 \n\t"
// "vst1.32 {q0, q1}, [%[out_ptr]]! \n\t"
// "vst1.32 {q2, q3}, [%[out_ptr]]! \n\t"
// "vst1.32 {q4, q5}, [%[out_ptr]]! \n\t"
// "vst1.32 {q6, q7}, [%[out_ptr]]! \n\t"
// "end_%=: \n\t"
// :
// :
// [out_ptr] "r"(out_ptr), [input_x_ptr] "r"(input_x_ptr), [num]
// "r"(numel) : "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6",
// "q7", "q8", "r5",
// "r6");
// } else {
ReluFunctor<float> func_; ReluFunctor<float> func_;
math::Transform trans; math::Transform trans;
trans(input_x_ptr, input_x_ptr + input_x->numel(), out_ptr, func_); trans(input_x_ptr, input_x_ptr + numel, out_ptr, func_);
// for (int i = 0; i < input_x->numel(); i++) {
// out_ptr[i] = input_x_ptr[i] > 0 ? input_x_ptr[i] : 0;
// } // }
} }
} // namespace operators } // namespace operators
} // namespace paddle_mobile } // namespace paddle_mobile
#endif
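The hand-written NEON assembly above stays commented out and the kernel falls back to math::Transform over the scalar ReluFunctor. The same vectorization is easier to keep correct with NEON intrinsics; a hedged sketch of what the disabled assembly computes (an illustration, not repo code):
// ReLU over a float buffer: 4 lanes at a time under NEON, scalar tail for the
// remainder and for non-NEON builds. Equivalent in effect to the assembly.
#include <algorithm>
#include <cstddef>
#if defined(__ARM_NEON)
#include <arm_neon.h>
#endif
void ReluBuffer(const float *in, float *out, size_t n) {
  size_t i = 0;
#if defined(__ARM_NEON)
  const float32x4_t zero = vdupq_n_f32(0.0f);
  for (; i + 4 <= n; i += 4) {
    vst1q_f32(out + i, vmaxq_f32(vld1q_f32(in + i), zero));
  }
#endif
  for (; i < n; ++i) {
    out[i] = std::max(in[i], 0.0f);
  }
}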
@@ -12,13 +12,18 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#pragma once #ifdef RESHAPE_OP
#include "operators/kernel/reshape_kernel.h" #include "operators/kernel/reshape_kernel.h"
namespace paddle_mobile { namespace paddle_mobile {
namespace operators { namespace operators {
template <>
bool ReshapeKernel<CPU, float>::Init(const ReshapeParam &para) const {
return true;
}
template <> template <>
void ReshapeKernel<CPU, float>::Compute(const ReshapeParam &param) const { void ReshapeKernel<CPU, float>::Compute(const ReshapeParam &param) const {
const auto *input_x = param.InputX(); const auto *input_x = param.InputX();
...@@ -49,3 +54,5 @@ void ReshapeKernel<CPU, float>::Compute(const ReshapeParam &param) const { ...@@ -49,3 +54,5 @@ void ReshapeKernel<CPU, float>::Compute(const ReshapeParam &param) const {
} // namespace operators } // namespace operators
} // namespace paddle_mobile } // namespace paddle_mobile
#endif
@@ -12,11 +12,13 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#ifdef SIGMOID_OP
#include "../sigmoid_kernel.h" #include "../sigmoid_kernel.h"
#if __ARM_NEON #if __ARM_NEON
#include "../../math/math_func_neon.h" #include "../../math/math_func_neon.h"
#endif #endif
#include <cmath>
namespace paddle_mobile { namespace paddle_mobile {
namespace operators { namespace operators {
@@ -25,35 +27,23 @@ using framework::Tensor;
void sigmoid(const Tensor *X, Tensor *Y) { void sigmoid(const Tensor *X, Tensor *Y) {
#if __ARM_NEON #if __ARM_NEON
DLOG << "step1";
const float *input = X->data<float>(); const float *input = X->data<float>();
DLOG << "step11";
float *output = Y->mutable_data<float>(); float *output = Y->mutable_data<float>();
DLOG << "step2";
const DDim &dDim = X->dims(); const DDim &dDim = X->dims();
DLOG << "step3";
int axis_index = 1; int axis_index = 1;
if (dDim.size() < 4) { if (dDim.size() < 4) {
axis_index = 0; axis_index = 0;
} }
DLOG << "step4";
DDim outer_ddim = DDim outer_ddim =
paddle_mobile::framework::slice_ddim(dDim, 0, axis_index + 1); paddle_mobile::framework::slice_ddim(dDim, 0, axis_index + 1);
DDim inner_ddim = DDim inner_ddim =
paddle_mobile::framework::slice_ddim(dDim, axis_index + 1, dDim.size()); paddle_mobile::framework::slice_ddim(dDim, axis_index + 1, dDim.size());
DLOG << "step5";
int out_size = paddle_mobile::framework::product(outer_ddim); int out_size = paddle_mobile::framework::product(outer_ddim);
int inner_size = paddle_mobile::framework::product(inner_ddim); int inner_size = paddle_mobile::framework::product(inner_ddim);
DLOG << "step6";
#pragma omp parallel for
DLOG << "outsize=" << out_size; DLOG << "outsize=" << out_size;
DLOG << "innersize=" << inner_size; DLOG << "innersize=" << inner_size;
#pragma omp parallel for
for (int i = 0; i < out_size; ++i) { for (int i = 0; i < out_size; ++i) {
const float *input_outer_ptr = input + i * inner_size; const float *input_outer_ptr = input + i * inner_size;
float *output_outer_ptr = output + i * inner_size; float *output_outer_ptr = output + i * inner_size;
@@ -81,6 +71,11 @@ void sigmoid(const Tensor *X, Tensor *Y) {
#endif #endif
} }
template <>
bool SigmoidKernel<CPU, float>::Init(const SigmoidParam &para) const {
return true;
}
template <> template <>
void SigmoidKernel<CPU, float>::Compute(const SigmoidParam &param) const { void SigmoidKernel<CPU, float>::Compute(const SigmoidParam &param) const {
const Tensor *in_x = param.InputX(); const Tensor *in_x = param.InputX();
...@@ -93,3 +88,5 @@ void SigmoidKernel<CPU, float>::Compute(const SigmoidParam &param) const { ...@@ -93,3 +88,5 @@ void SigmoidKernel<CPU, float>::Compute(const SigmoidParam &param) const {
template class SigmoidKernel<CPU, float>; template class SigmoidKernel<CPU, float>;
} // namespace operators } // namespace operators
} // namespace paddle_mobile } // namespace paddle_mobile
#endif
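The scalar tail of sigmoid() evaluates 1 / (1 + exp(-x)); for large-magnitude negative x the naive form overflows exp() in float (IEEE arithmetic still rounds the result to 0, but branching on the sign avoids the overflow entirely). A minimal, hedged reference, independent of the NEON path above and not repo code:
// Numerically safe scalar sigmoid: both branches call exp() on a value <= 0.
#include <cmath>
#include <iostream>
float Sigmoid(float x) {
  if (x >= 0.0f) {
    return 1.0f / (1.0f + std::exp(-x));
  }
  const float e = std::exp(x);  // x < 0, so e is in (0, 1)
  return e / (1.0f + e);
}
int main() {
  std::cout << Sigmoid(0.0f) << " " << Sigmoid(-100.0f) << "\n";  // 0.5, ~0
}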
@@ -12,11 +12,18 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#ifdef SOFTMAX_OP
#include "../softmax_kernel.h" #include "../softmax_kernel.h"
#include "../../math/softmax.h" #include "../../math/softmax.h"
namespace paddle_mobile { namespace paddle_mobile {
namespace operators { namespace operators {
template <>
bool SoftmaxKernel<CPU, float>::Init(const SoftmaxParam &para) const {
return true;
}
template <> template <>
void SoftmaxKernel<CPU, float>::Compute(const SoftmaxParam &param) const { void SoftmaxKernel<CPU, float>::Compute(const SoftmaxParam &param) const {
const Tensor *in_x = param.InputX(); const Tensor *in_x = param.InputX();
@@ -29,3 +36,5 @@ void SoftmaxKernel<CPU, float>::Compute(const SoftmaxParam &param) const {
template class SoftmaxKernel<CPU, float>; template class SoftmaxKernel<CPU, float>;
} // namespace operators } // namespace operators
} // namespace paddle_mobile } // namespace paddle_mobile
#endif
@@ -11,28 +11,32 @@ distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#ifdef TRANSPOSE_OP
#pragma once
#include "operators/kernel/transpose_kernel.h" #include "operators/kernel/transpose_kernel.h"
namespace paddle_mobile { namespace paddle_mobile {
namespace operators { namespace operators {
template <typename T> // vector<int> pos;
void TransposeFunc(const int numel, const T* input, const vector<int> axis, // template <typename T>
const vector<int> old_strides, const vector<int> new_strides, // void TransposeFunc(const int numel, const T* input, const vector<int> axis,
T* output) { // const vector<int> old_strides, const vector<int>
for (int i = 0; i < numel; ++i) { // new_strides, T* output) {
int old_idx = 0; // for (int i = 0; i < numel; ++i) {
int idx = i; // int old_idx = 0;
for (int j = 0; j < axis.size(); ++j) { // int idx = i;
int order = axis[j]; // for (int j = 0; j < axis.size(); ++j) {
old_idx += (idx / new_strides[j]) * old_strides[order]; // int order = axis[j];
idx %= new_strides[j]; // old_idx += (idx / new_strides[j]) * old_strides[order];
} // idx %= new_strides[j];
output[i] = input[old_idx]; // }
} // output[i] = input[old_idx];
// }
// }
template <>
bool TransposeKernel<CPU, float>::Init(const TransposeParam& para) const {
return true;
} }
template <> template <>
@@ -44,29 +48,41 @@ void TransposeKernel<CPU, float>::Compute(const TransposeParam& param) const {
const auto* input_x_data = input_x->data<float>(); const auto* input_x_data = input_x->data<float>();
auto* out_data = out->mutable_data<float>(); auto* out_data = out->mutable_data<float>();
size_t axis_size = axis.size(); size_t ndim = axis.size();
std::vector<int> new_dims; std::vector<int> xdim(ndim);
new_dims.reserve(axis_size); std::vector<int> xstride(ndim);
for (auto c : axis) { std::vector<int> xout(ndim);
new_dims.push_back(input_x_dims[c]); for (int i = 0; i < ndim; i++) {
int j = ndim - 1 - i;
xdim[j] = input_x_dims[axis[i]];
xstride[j] = 1;
for (int k = axis[i] + 1; k < ndim; k++) {
xstride[j] *= input_x_dims[k];
}
xout[j] = xstride[j] * xdim[j];
} }
std::vector<int> old_strides; auto numel = input_x->numel();
std::vector<int> new_strides; size_t pind = 0;
for (int i = 0; i < axis.size(); i++) { std::vector<int> ind(ndim);
int temp_old = 1; for (int i = 0; i < numel; i++) {
int temp_new = 1; out_data[i] = input_x_data[pind];
for (int j = i + 1; j < axis.size(); j++) { ind[0]++;
temp_old *= input_x_dims[j]; pind += xstride[0];
temp_new *= new_dims[j]; for (int j = 0; j < ndim - 1; j++) {
if (ind[j] == xdim[j]) {
ind[j + 1]++;
ind[j] = 0;
pind += xstride[j + 1];
pind -= xout[j];
} else {
break;
}
} }
old_strides.push_back(temp_old);
new_strides.push_back(temp_new);
} }
TransposeFunc<float>(input_x->numel(), input_x_data, axis, old_strides,
new_strides, out_data);
} }
} // namespace operators } // namespace operators
} // namespace paddle_mobile } // namespace paddle_mobile
#endif
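The rewritten kernel replaces the old per-element division/modulo indexing of TransposeFunc with an odometer walk: ind[] counts in the output coordinate order while pind tracks the matching flat input offset, adding xstride[0] on each step and, on a carry, stepping the next axis and rewinding the exhausted one via xout. Each element costs one add instead of an ndim loop of divisions. A self-contained sketch of the same walk, checked on a 2x3 -> 3x2 transpose:
// Odometer-style transpose with the same bookkeeping as the kernel above:
// xstride[j] is the input step for output axis j (fastest axis in slot 0),
// xout[j] the span to unwind on carry.
#include <iostream>
#include <vector>
void Transpose(const std::vector<int> &dims, const std::vector<int> &axis,
               const float *in, float *out) {
  const int ndim = static_cast<int>(axis.size());
  std::vector<int> xdim(ndim), xstride(ndim), xout(ndim), ind(ndim, 0);
  int numel = 1;
  for (int i = 0; i < ndim; i++) {
    const int j = ndim - 1 - i;  // output's fastest-varying axis goes to slot 0
    xdim[j] = dims[axis[i]];
    xstride[j] = 1;
    for (int k = axis[i] + 1; k < ndim; k++) xstride[j] *= dims[k];
    xout[j] = xstride[j] * xdim[j];
    numel *= dims[i];
  }
  int pind = 0;
  for (int i = 0; i < numel; i++) {
    out[i] = in[pind];
    ind[0]++;
    pind += xstride[0];
    for (int j = 0; j < ndim - 1; j++) {
      if (ind[j] != xdim[j]) break;
      ind[j] = 0;
      ind[j + 1]++;
      pind += xstride[j + 1] - xout[j];  // carry: step next axis, rewind this one
    }
  }
}
int main() {
  const float in[6] = {0, 1, 2, 3, 4, 5};
  float out[6];
  Transpose({2, 3}, {1, 0}, in, out);
  for (float v : out) std::cout << v << " ";  // 0 3 1 4 2 5
}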
@@ -12,9 +12,12 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#ifdef BATCHNORM_OP
#pragma once
#include "framework/operator.h" #include "framework/operator.h"
#include "operators/op_param.h" #include "operators/op_param.h"
#pragma once;
namespace paddle_mobile { namespace paddle_mobile {
namespace operators { namespace operators {
@@ -26,7 +29,10 @@ class BatchNormKernel
: public framework::OpKernelBase<DeviceType, BatchNormParam> { : public framework::OpKernelBase<DeviceType, BatchNormParam> {
public: public:
void Compute(const BatchNormParam &param) const; void Compute(const BatchNormParam &param) const;
bool Init(const BatchNormParam &para) const;
}; };
} // namespace operators } // namespace operators
} // namespace paddle_mobile } // namespace paddle_mobile
#endif
@@ -12,14 +12,16 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#ifdef BOXCODER_OP
#pragma once
#include <vector> #include <vector>
#include "framework/operator.h" #include "framework/operator.h"
#include "operators/math/transform.h" #include "operators/math/transform.h"
#include "operators/op_param.h" #include "operators/op_param.h"
#pragma once;
namespace paddle_mobile { namespace paddle_mobile {
namespace operators { namespace operators {
@@ -28,6 +30,9 @@ class BoxCoderKernel
: public framework::OpKernelBase<DeviceType, BoxCoderParam> { : public framework::OpKernelBase<DeviceType, BoxCoderParam> {
public: public:
void Compute(const BoxCoderParam& param) const; void Compute(const BoxCoderParam& param) const;
bool Init(const BoxCoderParam& para) const;
}; };
} // namespace operators } // namespace operators
} // namespace paddle_mobile } // namespace paddle_mobile
#endif
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifdef BATCHNORM_OP
#pragma once
#include "operators/op_param.h"
namespace paddle_mobile {
namespace operators {
template <typename P>
void BatchnormCompute(const BatchNormParam &param) {
const Tensor *input_x = param.InputX();
auto input_x_ptr = input_x->data<float>();
const auto &x_dims = input_x->dims();
const int N = x_dims[0];
const int C = x_dims[1];
const int H = x_dims[2];
const int W = x_dims[3];
const int stride0 = C * H * W;
const int stride1 = H * W;
const int stride2 = W;
Tensor *out = param.OutputY();
auto out_ptr = out->mutable_data<float>();
const float epsilon = param.Epsilon();
const Tensor *mean = param.InputMean();
const Tensor *variance = param.InputVariance();
const Tensor *scale = param.InputScale();
const Tensor *bias = param.InputBias();
auto mean_ptr = mean->data<float>();
auto variance_ptr = variance->data<float>();
auto scale_ptr = scale->data<float>();
auto bias_ptr = bias->data<float>();
// Tensor inv_std;
// auto inv_std_ptr = inv_std.mutable_data<float>(make_ddim({C}));
PADDLE_MOBILE_ENFORCE(C == variance->numel(),
"C must equal to variance.numel()");
int HXW = H * W;
if (HXW > 32) {
int NXC = N * C;
float *inv_std_ptr = new float[NXC * 4];
float *volatile new_scale_ptr = new float[NXC * 4];
float *volatile new_bias_ptr = new float[NXC * 4];
/// std = (var + epsilon).sqrt();
/// inv_std = 1 / std;
for (int i = 0; i < C * 4; i += 4) {
int index = i / 4;
inv_std_ptr[i] =
1 / static_cast<float>(pow((variance_ptr[index] + epsilon), 0.5));
inv_std_ptr[i + 1] = inv_std_ptr[i];
inv_std_ptr[i + 2] = inv_std_ptr[i];
inv_std_ptr[i + 3] = inv_std_ptr[i];
new_scale_ptr[i] = inv_std_ptr[i] * scale_ptr[index];
new_scale_ptr[i + 1] = new_scale_ptr[i];
new_scale_ptr[i + 2] = new_scale_ptr[i];
new_scale_ptr[i + 3] = new_scale_ptr[i];
new_bias_ptr[i] =
bias_ptr[index] - mean_ptr[index] * inv_std_ptr[i] * scale_ptr[index];
new_bias_ptr[i + 1] = new_bias_ptr[i];
new_bias_ptr[i + 2] = new_bias_ptr[i];
new_bias_ptr[i + 3] = new_bias_ptr[i];
}
for (int j = C * 4; j < NXC * 4; ++j) {
new_scale_ptr[j] = new_scale_ptr[j - C * 4];
new_bias_ptr[j] = new_bias_ptr[j - C * 4];
}
asm volatile(
"subs %[N], %[N], #1 \n\t"
"blt end_n_%= \n\t"
"loop_n_%=: \n\t"
"subs %[C], %[C], #1 \n\t"
"blt end_c_%= \n\t"
"loop_c_%=: \n\t"
"vld1.32 {q9}, [%[new_scale_ptr]]! \n\t"
"vld1.32 {q10}, [%[new_bias_ptr]]! \n\t"
"mov r6, %[HXW] \n\t"
"subs r6, r6, #32 \n\t"
"blt end_hw_%= \n\t"
"loop_hw_%=: \n\t"
"vld1.32 {q1, q2}, [%[input_x_ptr]]! \n\t"
"vld1.32 {q3, q4}, [%[input_x_ptr]]! \n\t"
"vld1.32 {q5, q6}, [%[input_x_ptr]]! \n\t"
"vld1.32 {q7, q8}, [%[input_x_ptr]]! \n\t"
"vmul.f32 q1, q1, q9 \n\t"
"vmul.f32 q2, q2, q9 \n\t"
"vmul.f32 q3, q3, q9 \n\t"
"vmul.f32 q4, q4, q9 \n\t"
"vmul.f32 q5, q5, q9 \n\t"
"vmul.f32 q6, q6, q9 \n\t"
"vmul.f32 q7, q7, q9 \n\t"
"vmul.f32 q8, q8, q9 \n\t"
"vadd.f32 q1, q1, q10 \n\t"
"vadd.f32 q2, q2, q10 \n\t"
"vadd.f32 q3, q3, q10 \n\t"
"vadd.f32 q4, q4, q10 \n\t"
"vadd.f32 q5, q5, q10 \n\t"
"vadd.f32 q6, q6, q10 \n\t"
"vadd.f32 q7, q7, q10 \n\t"
"vadd.f32 q8, q8, q10 \n\t"
"vst1.32 {q1, q2}, [%[out_ptr]]! \n\t"
"vst1.32 {q3, q4}, [%[out_ptr]]! \n\t"
"vst1.32 {q5, q6}, [%[out_ptr]]! \n\t"
"vst1.32 {q7, q8}, [%[out_ptr]]! \n\t"
"subs r6, r6, #32 \n\t"
"bge loop_hw_%= \n\t"
"end_hw_%=: \n\t"
"cmp r6, #0 \n\t"
"bge end_remainder_%= \n\t"
"mov r5, #4 \n\t"
"mul r6, r6, r5 \n\t"
"add %[input_x_ptr], %[input_x_ptr], r6 \n\t"
"vld1.32 {q1, q2}, [%[input_x_ptr]]! \n\t"
"vld1.32 {q3, q4}, [%[input_x_ptr]]! \n\t"
"vld1.32 {q5, q6}, [%[input_x_ptr]]! \n\t"
"vld1.32 {q7, q8}, [%[input_x_ptr]]! \n\t"
"vmul.f32 q1, q1, q9 \n\t"
"vmul.f32 q2, q2, q9 \n\t"
"vmul.f32 q3, q3, q9 \n\t"
"vmul.f32 q4, q4, q9 \n\t"
"vmul.f32 q5, q5, q9 \n\t"
"vmul.f32 q6, q6, q9 \n\t"
"vmul.f32 q7, q7, q9 \n\t"
"vmul.f32 q8, q8, q9 \n\t"
"vadd.f32 q1, q1, q10 \n\t"
"vadd.f32 q2, q2, q10 \n\t"
"vadd.f32 q3, q3, q10 \n\t"
"vadd.f32 q4, q4, q10 \n\t"
"vadd.f32 q5, q5, q10 \n\t"
"vadd.f32 q6, q6, q10 \n\t"
"vadd.f32 q7, q7, q10 \n\t"
"vadd.f32 q8, q8, q10 \n\t"
"add %[out_ptr], %[out_ptr], r6 \n\t"
"vst1.32 {q1, q2}, [%[out_ptr]]! \n\t"
"vst1.32 {q3, q4}, [%[out_ptr]]! \n\t"
"vst1.32 {q5, q6}, [%[out_ptr]]! \n\t"
"vst1.32 {q7, q8}, [%[out_ptr]]! \n\t"
"end_remainder_%=: \n\t"
"subs %[C], %[C], #1 \n\t"
"bge loop_c_%= \n\t"
"end_c_%=: \n\t"
"subs %[N], %[N], #1 \n\t"
"bge loop_n_%= \n\t"
"end_n_%=: \n\t"
:
: [input_x_ptr] "r"(input_x_ptr), [out_ptr] "r"(out_ptr),
[new_scale_ptr] "r"(new_scale_ptr), [new_bias_ptr] "r"(new_bias_ptr),
[N] "r"(N), [C] "r"(C), [HXW] "r"(HXW)
: "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8", "q9",
"q10", "r5", "r6");
delete[] inv_std_ptr;
delete[] new_scale_ptr;
delete[] new_bias_ptr;
} else {
float *inv_std_ptr = new float[C];
for (int i = 0; i < C; i++) {
inv_std_ptr[i] =
1 / static_cast<float>(pow((variance_ptr[i] + epsilon), 0.5));
}
Tensor new_scale;
auto new_scale_ptr =
new_scale.mutable_data<float>(framework::make_ddim({C}));
Tensor new_bias;
auto new_bias_ptr = new_bias.mutable_data<float>(framework::make_ddim({C}));
    /// (x - est_mean) * inv_var * scale + bias is equal to
    /// x * (inv_var * scale) + (bias - est_mean * inv_var * scale)
    for (int i = 0; i < C; i++) {
      new_scale_ptr[i] = inv_std_ptr[i] * scale_ptr[i];
      new_bias_ptr[i] =
          bias_ptr[i] - mean_ptr[i] * inv_std_ptr[i] * scale_ptr[i];
      // Apply the folded per-channel scale/bias to every element of channel i.
      for (int n = 0; n < N; n++) {
        for (int h = 0; h < H; h++) {
          int tmp_index = n * stride0 + i * stride1 + h * stride2;
          for (int w = 0; w < W; w++) {
            int index = tmp_index + w;
            out_ptr[index] =
                input_x_ptr[index] * new_scale_ptr[i] + new_bias_ptr[i];
          }
        }
      }
    }
delete[] inv_std_ptr;
}
}
} // namespace operators
} // namespace paddle_mobile
#endif
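Both branches above rest on the same algebraic folding: (x - mean) / sqrt(var + eps) * scale + bias == x * new_scale + new_bias, with new_scale = scale / sqrt(var + eps) and new_bias = bias - mean * new_scale, so the per-element work collapses to one multiply-add. A minimal stand-alone check of that identity (plain C++, made-up values, no paddle-mobile dependencies):

#include <cassert>
#include <cmath>
#include <cstdio>

int main() {
  // Hypothetical per-channel batch-norm parameters.
  const float x = 2.5f, mean = 0.8f, var = 1.7f, eps = 1e-5f;
  const float scale = 1.3f, bias = -0.2f;

  const float inv_std = 1.f / std::sqrt(var + eps);
  // Direct form, as the batch-norm definition states it.
  const float direct = (x - mean) * inv_std * scale + bias;
  // Folded form, as the kernel precomputes it once per channel.
  const float new_scale = inv_std * scale;
  const float new_bias = bias - mean * new_scale;
  const float folded = x * new_scale + new_bias;

  assert(std::fabs(direct - folded) < 1e-5f);
  std::printf("direct=%f folded=%f\n", direct, folded);
  return 0;
}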
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifdef FUSION_CONVADD_RELU_OP
#pragma once
#include <vector>
#include "operators/op_param.h"
namespace paddle_mobile {
namespace operators {
template <typename P>
void ConvAddReluCompute(const FusionConvAddReluParam &param) {
const Tensor *input = param.Input();
Tensor filter = *param.Filter();
Tensor bias = *param.Bias();
int axis = param.Axis();
Tensor *output = param.Output();
math::expand_bias(bias, axis, output->dims());
output->ShareDataWith(bias);
int groups = param.Groups();
std::vector<int> strides = param.Strides();
std::vector<int> paddings = param.Paddings();
std::vector<int> dilations = param.Dilations();
const int batch_size = static_cast<int>(input->dims()[0]);
std::vector<int64_t> filter_shape_vec(framework::vectorize(filter.dims()));
std::vector<int64_t> output_shape_vec(framework::vectorize(output->dims()));
size_t data_dim = filter_shape_vec.size() - 2;
std::vector<int64_t> col_shape_vec(1 + 2 * data_dim);
col_shape_vec[0] = input->dims()[1] / groups;
for (size_t j = 0; j < data_dim; ++j) {
col_shape_vec[j + 1] = filter_shape_vec[j + 2];
col_shape_vec[j + 1 + data_dim] = output_shape_vec[j + 2];
}
framework::DDim col_shape(framework::make_ddim(col_shape_vec));
framework::DDim col_matrix_shape =
framework::flatten_to_2d(col_shape, data_dim + 1);
bool is_expand =
math::IsExpand(filter_shape_vec, strides, paddings, dilations);
Tensor col;
Tensor col_matrix;
if (is_expand) {
col.mutable_data<float>(col_shape);
col_matrix.ShareDataWith(col);
col_matrix.Resize(col_matrix_shape);
}
framework::DDim input_shape = framework::slice_ddim(
input->dims(), 1, static_cast<int>(input->dims().size()));
framework::DDim filter_matrix_shape = {filter.dims()[0],
filter.numel() / filter.dims()[0]};
filter.Resize(filter_matrix_shape);
framework::DDim output_matrix_shape = {
output->dims()[1],
output->numel() / (output->dims()[0] * output->dims()[1])};
// convolution operator: im2col(or vol2col) + gemm
int in_step = static_cast<int>(input->dims()[1]) / groups;
int out_step = static_cast<int>(output->dims()[1]) / groups;
math::Vol2ColFunctor<CPU, float> vol2col;
math::Im2ColFunctor<math::ColFormat::kCFO, CPU, float> im2col;
for (int i = 0; i < batch_size; i++) {
Tensor in_batch = input->Slice(i, i + 1).Resize(input_shape);
Tensor out_batch = output->Slice(i, i + 1).Resize(output_matrix_shape);
for (int g = 0; g < groups; g++) {
Tensor in_slice = in_batch.Slice(g * in_step, (g + 1) * in_step);
if (!is_expand) {
col.ShareDataWith(in_slice);
col_matrix.ShareDataWith(col);
col_matrix.Resize(col_matrix_shape);
} else if (data_dim == 2U) {
// im2col
im2col(in_slice, dilations, strides,
std::vector<int>{paddings[0], paddings[1], paddings[0],
paddings[1]},
&col);
} else if (data_dim == 3U) {
// vol2col
vol2col(in_slice, dilations, strides, paddings, &col);
}
// gemm
Tensor out_slice = out_batch.Slice(g * out_step, (g + 1) * out_step);
Tensor filter_slice = filter.Slice(g * out_step, (g + 1) * out_step);
math::matmul<float>(filter_slice, false, col_matrix, false,
static_cast<float>(1), &out_slice,
static_cast<float>(1), true);
}
}
}
} // namespace operators
} // namespace paddle_mobile
#endif
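One detail worth spelling out: the kernel pre-fills the output with the axis-expanded bias (output->ShareDataWith(bias)) and then calls math::matmul with beta = 1, so the GEMM accumulates onto the bias instead of overwriting it; the trailing true argument is, by all appearances, a fused-relu switch in this fork's matmul. A scalar sketch of that contract (the gemm_bias_relu name and signature are illustrative, not paddle-mobile API):

#include <algorithm>
#include <cstdio>

// Sketch of C = relu(alpha * A * B + beta * C): C arrives pre-filled with the
// broadcast bias and beta == 1, so the bias add and the relu both ride along
// with the GEMM, exactly the contract the kernel above relies on.
void gemm_bias_relu(const float *A, const float *B, float *C, int M, int N,
                    int K, float alpha, float beta, bool fuse_relu) {
  for (int m = 0; m < M; ++m) {
    for (int n = 0; n < N; ++n) {
      float acc = 0.f;
      for (int k = 0; k < K; ++k) acc += A[m * K + k] * B[k * N + n];
      float v = alpha * acc + beta * C[m * N + n];
      C[m * N + n] = fuse_relu ? std::max(v, 0.f) : v;
    }
  }
}

int main() {
  const float A[2] = {1.f, -3.f}, B[1] = {2.f};  // M=2, K=1, N=1
  float C[2] = {0.5f, 0.5f};                     // pre-filled "bias"
  gemm_bias_relu(A, B, C, 2, 1, 1, 1.f, 1.f, true);
  std::printf("%f %f\n", C[0], C[1]);  // 2.5, and 0 (negative clamped)
  return 0;
}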
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifdef CONV_OP
#pragma once
#include <vector>
#include "operators/math/conv_func.h"
#include "operators/op_param.h"
namespace paddle_mobile {
namespace operators {
template <typename P>
void ConvCompute(const ConvParam &param) {
const Tensor *input = param.Input();
Tensor filter = *param.Filter();
Tensor *output = param.Output();
output->mutable_data<float>();
int groups = param.Groups();
std::vector<int> strides = param.Strides();
std::vector<int> paddings = param.Paddings();
std::vector<int> dilations = param.Dilations();
const int batch_size = static_cast<int>(input->dims()[0]);
std::vector<int64_t> filter_shape_vec(framework::vectorize(filter.dims()));
std::vector<int64_t> output_shape_vec(framework::vectorize(output->dims()));
size_t data_dim = filter_shape_vec.size() - 2;
std::vector<int64_t> col_shape_vec(1 + 2 * data_dim);
col_shape_vec[0] = input->dims()[1] / groups;
for (size_t j = 0; j < data_dim; ++j) {
col_shape_vec[j + 1] = filter_shape_vec[j + 2];
col_shape_vec[j + 1 + data_dim] = output_shape_vec[j + 2];
}
framework::DDim col_shape(framework::make_ddim(col_shape_vec));
framework::DDim col_matrix_shape =
framework::flatten_to_2d(col_shape, data_dim + 1);
bool is_expand =
math::IsExpand(filter_shape_vec, strides, paddings, dilations);
Tensor col;
Tensor col_matrix;
if (is_expand) {
col.mutable_data<float>(col_shape);
col_matrix.ShareDataWith(col);
col_matrix.Resize(col_matrix_shape);
}
framework::DDim input_shape = framework::slice_ddim(
input->dims(), 1, static_cast<int>(input->dims().size()));
framework::DDim filter_matrix_shape = {filter.dims()[0],
filter.numel() / filter.dims()[0]};
filter.Resize(filter_matrix_shape);
framework::DDim output_matrix_shape = {
output->dims()[1],
output->numel() / (output->dims()[0] * output->dims()[1])};
// convolution operator: im2col(or vol2col) + gemm
int in_step = static_cast<int>(input->dims()[1]) / groups;
int out_step = static_cast<int>(output->dims()[1]) / groups;
math::Vol2ColFunctor<CPU, float> vol2col;
math::Im2ColFunctor<math::ColFormat::kCFO, CPU, float> im2col;
for (int i = 0; i < batch_size; i++) {
Tensor in_batch = input->Slice(i, i + 1).Resize(input_shape);
Tensor out_batch = output->Slice(i, i + 1).Resize(output_matrix_shape);
for (int g = 0; g < groups; g++) {
Tensor in_slice = in_batch.Slice(g * in_step, (g + 1) * in_step);
if (!is_expand) {
col.ShareDataWith(in_slice);
col_matrix.ShareDataWith(col);
col_matrix.Resize(col_matrix_shape);
} else if (data_dim == 2U) {
// im2col
im2col(in_slice, dilations, strides,
std::vector<int>{paddings[0], paddings[1], paddings[0],
paddings[1]},
&col);
} else if (data_dim == 3U) {
// vol2col
vol2col(in_slice, dilations, strides, paddings, &col);
}
// gemm
Tensor out_slice = out_batch.Slice(g * out_step, (g + 1) * out_step);
Tensor filter_slice = filter.Slice(g * out_step, (g + 1) * out_step);
math::matmul<float>(filter_slice, false, col_matrix, false,
static_cast<float>(1), &out_slice,
static_cast<float>(0));
}
}
}
} // namespace operators
} // namespace paddle_mobile
#endif
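The shape bookkeeping above is the whole trick: per group, im2col unrolls the input into a (C/g * kh * kw) x (oh * ow) matrix, the filter is viewed as (oc/g) x (C/g * kh * kw), and one GEMM yields the (oc/g) x (oh * ow) output slice. A small sketch with hypothetical dimensions:

#include <cstdio>

int main() {
  // Hypothetical convolution: 8 input channels, 16 output channels,
  // 3x3 kernel, 32x32 output, 2 groups -- mirroring the shape math above.
  const int ic = 8, oc = 16, kh = 3, kw = 3, oh = 32, ow = 32, groups = 2;

  // col buffer: [ic/groups, kh, kw, oh, ow], flattened to 2-D after dim 3.
  const long col_rows = (ic / groups) * kh * kw;  // 4*3*3 = 36
  const long col_cols = (long)oh * ow;            // 1024

  // filter matrix slice: one row per output channel of the group.
  const long filter_rows = oc / groups;  // 8
  const long filter_cols = col_rows;     // 36

  // GEMM result per group: (oc/groups) x (oh*ow), written into out_slice.
  std::printf("col: %ldx%ld, filter: %ldx%ld, out: %ldx%ld\n", col_rows,
              col_cols, filter_rows, filter_cols, filter_rows, col_cols);
  return 0;
}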
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifdef DEPTHWISECONV_OP
#pragma once
#include <vector>
#include "operators/math/conv_func.h"
#include "operators/op_param.h"
namespace paddle_mobile {
namespace operators {
template <typename P>
void DepthwiseConvCompute(const ConvParam &param) {
const Tensor *input = param.Input();
Tensor filter = *param.Filter();
Tensor *output = param.Output();
output->mutable_data<float>();
int groups = param.Groups();
std::vector<int> strides = param.Strides();
std::vector<int> paddings = param.Paddings();
std::vector<int> dilations = param.Dilations();
// DLOG << " compute end get Attrs " << strides[0];
const int batch_size = static_cast<int>(input->dims()[0]);
std::vector<int64_t> filter_shape_vec(framework::vectorize(filter.dims()));
std::vector<int64_t> output_shape_vec(framework::vectorize(output->dims()));
size_t data_dim = filter_shape_vec.size() - 2;
std::vector<int64_t> col_shape_vec(1 + 2 * data_dim);
col_shape_vec[0] = input->dims()[1] / groups;
for (size_t j = 0; j < data_dim; ++j) {
col_shape_vec[j + 1] = filter_shape_vec[j + 2];
col_shape_vec[j + 1 + data_dim] = output_shape_vec[j + 2];
}
framework::DDim col_shape(framework::make_ddim(col_shape_vec));
framework::DDim col_matrix_shape =
framework::flatten_to_2d(col_shape, data_dim + 1);
bool is_expand =
math::IsExpand(filter_shape_vec, strides, paddings, dilations);
Tensor col;
Tensor col_matrix;
if (is_expand) {
col.mutable_data<float>(col_shape);
col_matrix.ShareDataWith(col);
col_matrix.Resize(col_matrix_shape);
}
framework::DDim input_shape = framework::slice_ddim(
input->dims(), 1, static_cast<int>(input->dims().size()));
framework::DDim filter_matrix_shape = {filter.dims()[0],
filter.numel() / filter.dims()[0]};
filter.Resize(filter_matrix_shape);
framework::DDim output_matrix_shape = {
output->dims()[1],
output->numel() / (output->dims()[0] * output->dims()[1])};
// convolution operator: im2col(or vol2col) + gemm
int in_step = static_cast<int>(input->dims()[1]) / groups;
int out_step = static_cast<int>(output->dims()[1]) / groups;
math::Vol2ColFunctor<CPU, float> vol2col;
math::Im2ColFunctor<math::ColFormat::kCFO, CPU, float> im2col;
for (int i = 0; i < batch_size; i++) {
Tensor in_batch = input->Slice(i, i + 1).Resize(input_shape);
Tensor out_batch = output->Slice(i, i + 1).Resize(output_matrix_shape);
for (int g = 0; g < groups; g++) {
Tensor in_slice = in_batch.Slice(g * in_step, (g + 1) * in_step);
if (!is_expand) {
col.ShareDataWith(in_slice);
col_matrix.ShareDataWith(col);
col_matrix.Resize(col_matrix_shape);
} else if (data_dim == 2U) {
// im2col
im2col(in_slice, dilations, strides,
std::vector<int>{paddings[0], paddings[1], paddings[0],
paddings[1]},
&col);
} else if (data_dim == 3U) {
// vol2col
vol2col(in_slice, dilations, strides, paddings, &col);
}
// gemm
Tensor out_slice = out_batch.Slice(g * out_step, (g + 1) * out_step);
Tensor filter_slice = filter.Slice(g * out_step, (g + 1) * out_step);
math::matmul<float>(filter_slice, false, col_matrix, false,
static_cast<float>(1), &out_slice,
static_cast<float>(0));
}
}
}
} // namespace operators
} // namespace paddle_mobile
#endif
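The depthwise variant reuses the grouped path unchanged; what makes it depthwise is the parameterization, groups == input channels, so in_step and out_step collapse to 1 and every group convolves a single channel. A tiny sketch of that slicing arithmetic (hypothetical channel counts):

#include <cstdio>

int main() {
  // The grouped loop above slices channels in blocks of in_step/out_step.
  const int ic = 32, oc = 32, groups = 32;  // hypothetical depthwise config
  const int in_step = ic / groups;          // 1
  const int out_step = oc / groups;         // 1
  for (int g = 0; g < 3; ++g) {             // first three groups only
    std::printf("group %d: in channels [%d, %d), out channels [%d, %d)\n", g,
                g * in_step, (g + 1) * in_step, g * out_step,
                (g + 1) * out_step);
  }
  return 0;
}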
@@ -12,6 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
+#ifdef CONCAT_OP
+
 #pragma once
 #include "framework/operator.h"
 #include "operators/op_param.h"
@@ -25,7 +27,10 @@ template <typename DeviceType, typename T>
 class ConcatKernel : public framework::OpKernelBase<DeviceType, ConcatParam> {
  public:
   void Compute(const ConcatParam &param) const;
+  bool Init(const ConcatParam &para) const;
 };
 }  // namespace operators
 }  // namespace paddle_mobile
+
+#endif
@@ -12,24 +12,36 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
+#ifdef FUSION_CONVADD_OP
+
 #pragma once
-#include "framework.pb.h"
-#include "lod_tensor.h"
-#include "selected_rows.h"
-#include "variable.h"
+#include <vector>
+#if __ARM_NEON
+#include <arm_neon.h>
+#endif
+#include "framework/ddim.h"
+#include "framework/operator.h"
+#include "operators/math/conv_func.h"
+#include "operators/math/im2col.h"
+#include "operators/math/math_function.h"
+#include "operators/math/vol2col.h"
+#include "operators/op_param.h"
 namespace paddle_mobile {
-namespace framework {
-inline proto::VarType::Type ToVarType(std::type_index type) {
-  if (type.hash_code() == typeid(LoDTensor).hash_code()) {
-    return proto::VarType_Type_LOD_TENSOR;
-  } else if (type.hash_code() == typeid(SelectedRows).hash_code()) {
-    return proto::VarType_Type_SELECTED_ROWS;
-  } else {
-    // PADDLE_THROW("ToVarType:Unsupported type %s",
-    //              type.name());
-  }
-}
-}  // namespace framework
+namespace operators {
+
+using framework::DDim;
+using framework::OpKernelBase;
+
+template <typename DeviceType, typename T>
+class ConvAddKernel : public OpKernelBase<DeviceType, FusionConvAddParam> {
+ public:
+  void Compute(const FusionConvAddParam &param) const;
+  bool Init(const FusionConvAddParam &para) const;
+};
+}  // namespace operators
 }  // namespace paddle_mobile
+#endif
@@ -14,24 +14,32 @@ limitations under the License. */
 #pragma once
-#include <functional>
-#include <utility>
-#include <vector>
-#include "framework/op_kernel_type.h"
-#include "framework/selected_rows.h"
-#include "framework/tensor.h"
-#include "framework/variable.h"
+#ifdef FUSION_CONVADD_RELU_OP
+
+#include <vector>
+#include "framework/ddim.h"
+#include "framework/operator.h"
+#include "operators/math/conv_func.h"
+#include "operators/math/im2col.h"
+#include "operators/math/math_function.h"
+#include "operators/math/vol2col.h"
+#include "operators/op_param.h"
 namespace paddle_mobile {
-namespace framework {
-void DataTransform(const OpKernelType &expected_kernel_type,
-                   const OpKernelType &kernel_type_for_var,
-                   const Tensor &input_tensor, Tensor *out);
-void CopyVariableWithTensor(const Variable &in_var, const Tensor &tensor,
-                            Variable *out_var);
-}  // namespace framework
+namespace operators {
+
+using framework::DDim;
+using framework::OpKernelBase;
+
+template <typename DeviceType, typename T>
+class ConvAddReluKernel
+    : public OpKernelBase<DeviceType, FusionConvAddReluParam> {
+ public:
+  void Compute(const FusionConvAddReluParam &param) const;
+  bool Init(const FusionConvAddReluParam &para) const;
+};
+}  // namespace operators
 }  // namespace paddle_mobile
+#endif
@@ -12,6 +12,10 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
+#ifdef CONV_OP
+
+#pragma once
+
 #include <vector>
 #include "framework/operator.h"
 #include "operators/math/im2col.h"
@@ -19,8 +23,6 @@ limitations under the License. */
 #include "operators/math/vol2col.h"
 #include "operators/op_param.h"
-
-#pragma once;
 namespace paddle_mobile {
 namespace operators {
@@ -30,22 +32,10 @@ template <typename DeviceType, typename T>
 class ConvKernel : public OpKernelBase<DeviceType, ConvParam> {
  public:
   void Compute(const ConvParam &param) const;
+  bool Init(const ConvParam &para) const;
 };
-inline bool IsExpand(const std::vector<int64_t> &filter_dim,
-                     const std::vector<int> &strides,
-                     const std::vector<int> &paddings,
-                     const std::vector<int> &dilations) {
-  bool filter_1 = true, strides_1 = true, padding_0 = true, dilation_1 = true;
-  for (size_t j = 0; j < strides.size(); ++j) {
-    filter_1 = filter_1 && (static_cast<int>(filter_dim[j + 2]) == 1);
-    strides_1 = strides_1 && (strides[j] == 1);
-    padding_0 = padding_0 && (paddings[j] == 0);
-    dilation_1 = dilation_1 && (dilations[j] == 1);
-  }
-  return !(filter_1 && strides_1 && padding_0 && dilation_1);
-}
 }  // namespace operators
 }  // namespace paddle_mobile
+#endif
@@ -12,14 +12,16 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
+#ifdef DEPTHWISECONV_OP
+
+#pragma once
+
 #include "framework/operator.h"
 #include "operators/math/im2col.h"
 #include "operators/math/math_function.h"
 #include "operators/math/vol2col.h"
 #include "operators/op_param.h"
-
-#pragma once;
 namespace paddle_mobile {
 namespace operators {
@@ -29,6 +31,9 @@ template <typename DeviceType, typename T>
 class DepthwiseConvKernel : public OpKernelBase<DeviceType, ConvParam> {
  public:
   void Compute(const ConvParam &param) const;
+  bool Init(const ConvParam &para) const;
 };
 }  // namespace operators
 }  // namespace paddle_mobile
+#endif
@@ -12,7 +12,9 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
-#pragma once;
+#ifdef ELEMENTWISEADD_OP
+
+#pragma once
 #include "framework/operator.h"
 #include "operators/math/elementwise_op_function.h"
@@ -28,6 +30,9 @@ class ElementwiseAddKernel
     : public framework::OpKernelBase<DeviceType, ElementwiseAddParam> {
  public:
   void Compute(const ElementwiseAddParam &param) const;
+  bool Init(const ElementwiseAddParam &para) const;
 };
 }  // namespace operators
 }  // namespace paddle_mobile
+#endif
@@ -12,13 +12,23 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
+#ifdef CONV_OP
+
+#include "operators/kernel/conv_kernel.h"
+
 namespace paddle_mobile {
 namespace operators {
-// template<>
-// void ConvKernel<FPGA, float>::Compute(const ConvParam &param) const
-// {}
-//
-// template class ConvKernel<FPGA, float>;
+template <>
+bool ConvKernel<FPGA, float>::Init(const ConvParam &para) const {
+  return true;
+}
+
+template <>
+void ConvKernel<FPGA, float>::Compute(const ConvParam &param) const {}
+
+template class ConvKernel<FPGA, float>;
 }  // namespace operators
 }  // namespace paddle_mobile
+#endif
@@ -12,20 +12,25 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
+#ifdef FUSION_FC_OP
+
+#pragma once
+
 #include "framework/operator.h"
 #include "operators/math/math_function.h"
 #include "operators/op_param.h"
-
-#pragma once;
 namespace paddle_mobile {
 namespace operators {
 template <typename DeviceType, typename T>
-class FushionFcKernel
-    : public framework::OpKernelBase<DeviceType, FushionFcParam> {
+class FusionFcKernel
+    : public framework::OpKernelBase<DeviceType, FusionFcParam> {
  public:
-  void Compute(const FushionFcParam& param) const;
+  void Compute(const FusionFcParam& param) const;
+  bool Init(const FusionFcParam& para) const;
 };
 }  // namespace operators
 }  // namespace paddle_mobile
+#endif
@@ -12,9 +12,17 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
+#ifdef LRN_OP
+
 #include "framework/operator.h"
 #include "operators/op_param.h"
-
-#pragma once;
+#include <cmath>
+#ifdef __ARM_NEON
+#include "arm_neon.h"
+#include "operators/math/math_func_neon.h"
+#endif
 namespace paddle_mobile {
 namespace operators {
@@ -24,42 +32,137 @@ using namespace framework;
 template <typename T>
 struct LRNFunctor {
   void operator()(const framework::Tensor &input, framework::Tensor *out, int N,
-                  int C, int H, int W, int n, T k, T alpha, T beta) {
-    auto input_ptr = input.data<T>();
+                  int C, int H, int W, int n, float k, float alpha,
+                  float beta) {
+    const float *input_ptr = input.data<float>();
     const int start = -(n - 1) / 2;
     const int end = start + n;
-    auto out_ptr = out->data<T>();
     const int stride0 = C * H * W;
     const int stride1 = H * W;
    const int stride2 = W;
-    const int stride3 = 1;
     framework::Tensor sqr_buffer;
-    auto sqr_buffer_ptr = sqr_buffer.mutable_data<T>(input.dims());
-    std::fill(sqr_buffer_ptr, sqr_buffer_ptr + sqr_buffer.numel(), k);
+    auto sqr_buffer_ptr = sqr_buffer.mutable_data<float>(input.dims());
+    std::fill(sqr_buffer_ptr, sqr_buffer_ptr + sqr_buffer.numel(), 0.0);
     for (int a = 0; a < N; a++) {
       for (int b = 0; b < C; b++) {
         for (int index = start; index < end; index++) {
           int channel = b + index;
           if (channel >= 0 && channel < C) {
-            int tmp_u = a * stride0 + b * stride1;
-            int tmp_i = a * stride0 + channel * stride1;
-            for (int c = 0; c < H; c++) {
-              for (int d = 0; d < W; d++) {
-                int tmp = c * stride2 + d;
-                int u = tmp_u + tmp;
-                int i = tmp_i + tmp;
-                sqr_buffer_ptr[u] += alpha * input_ptr[i] * input_ptr[i];
-              }
-            }
+            int tmp_s = a * stride0 + b * stride1;
+            int tmp_c = a * stride0 + channel * stride1;
+#ifdef __ARM_NEON
+            int n4 = stride1 / 4;
+            int m4 = stride1 % 4;
+            float32x4_t sqr0;
+            float32x4_t in0;
+            float32x4_t res0;
+            for (int i = 0; i < n4; i++) {
+              sqr0 = vld1q_f32(sqr_buffer_ptr + tmp_s);
+              in0 = vld1q_f32(input_ptr + tmp_c);
+              res0 = vmlaq_f32(sqr0, in0, in0);
+              vst1q_f32(sqr_buffer_ptr + tmp_s, res0);
+              tmp_s += 4;
+              tmp_c += 4;
+            }
+            for (int i = 0; i < m4; i++) {
+              int s_i = tmp_s + i;
+              int c_i = tmp_c + i;
+              sqr_buffer_ptr[s_i] += input_ptr[c_i] * input_ptr[c_i];
+            }
+#else
+            for (int tmp = 0; tmp < stride1; tmp++) {
+              int s_i = tmp_s + tmp;
+              int c_i = tmp_c + tmp;
+              sqr_buffer_ptr[s_i] += input_ptr[c_i] * input_ptr[c_i];
+            }
+#endif
           }
         }
       }
     }
+    auto out_ptr = out->data<T>();
+#ifdef __ARM_NEON
+    float32x4_t sqr1, sqr2, sqr3, sqr4;
+    float32x4_t alpha4;
+    float32x4_t k4;
+    float32x4_t beta4;
+    float32x4_t res1, res2, res3, res4;
+    float32x4_t in1, in2, in3, in4;
+    beta4 = vdupq_n_f32(beta);
+    alpha4 = vdupq_n_f32(alpha);
+    k4 = vdupq_n_f32(k);
+    auto out_tmp_ptr = out_ptr;
+    int n16 = input.numel() / 16;
+    int m16 = input.numel() % 16;
+    int m16n4 = m16 / 4;
+    int m16m4 = m16 % 4;
+    for (int i = 0; i < n16; i++) {
+      sqr1 = vld1q_f32(sqr_buffer_ptr);
+      sqr2 = vld1q_f32(sqr_buffer_ptr + 4);
+      sqr3 = vld1q_f32(sqr_buffer_ptr + 8);
+      sqr4 = vld1q_f32(sqr_buffer_ptr + 12);
+      in1 = vld1q_f32(input_ptr);
+      in2 = vld1q_f32(input_ptr + 4);
+      in3 = vld1q_f32(input_ptr + 8);
+      in4 = vld1q_f32(input_ptr + 12);
+      sqr1 = vmlaq_f32(k4, sqr1, alpha4);
+      sqr2 = vmlaq_f32(k4, sqr2, alpha4);
+      sqr3 = vmlaq_f32(k4, sqr3, alpha4);
+      sqr4 = vmlaq_f32(k4, sqr4, alpha4);
+      sqr1 = pow_ps(sqr1, -beta4);
+      sqr2 = pow_ps(sqr2, -beta4);
+      sqr3 = pow_ps(sqr3, -beta4);
+      sqr4 = pow_ps(sqr4, -beta4);
+      sqr1 = vmulq_f32(sqr1, in1);
+      sqr2 = vmulq_f32(sqr2, in2);
+      sqr3 = vmulq_f32(sqr3, in3);
+      sqr4 = vmulq_f32(sqr4, in4);
+      vst1q_f32(out_tmp_ptr, sqr1);
+      vst1q_f32(out_tmp_ptr + 4, sqr2);
+      vst1q_f32(out_tmp_ptr + 8, sqr3);
+      vst1q_f32(out_tmp_ptr + 12, sqr4);
+      sqr_buffer_ptr += 4 * 4;
+      input_ptr += 4 * 4;
+      out_tmp_ptr += 4 * 4;
+    }
+    for (int i = 0; i < m16n4; i++) {
+      sqr4 = vld1q_f32(sqr_buffer_ptr);
+      in4 = vld1q_f32(input_ptr);
+      sqr4 = vmlaq_f32(k4, sqr4, alpha4);
+      sqr4 = pow_ps(sqr4, -beta4);
+      sqr4 = vmulq_f32(sqr4, in4);
+      vst1q_f32(out_tmp_ptr, sqr4);
+      sqr_buffer_ptr += 4;
+      input_ptr += 4;
+      out_tmp_ptr += 4;
+    }
+    for (int i = 0; i < m16m4; i++) {
+      out_tmp_ptr[i] = input_ptr[i] / pow(k + alpha * sqr_buffer_ptr[i], beta);
+    }
+#else
     for (int i = 0; i < input.numel(); i++) {
-      out_ptr[i] = input_ptr[i] / pow(sqr_buffer_ptr[i], beta);
+      out_ptr[i] = input_ptr[i] / pow(k + alpha * sqr_buffer_ptr[i], beta);
     }
+#endif
   }
 };
@@ -67,6 +170,9 @@ template <typename DeviceType, typename T>
 class LrnKernel : public framework::OpKernelBase<DeviceType, LrnParam> {
  public:
   void Compute(const LrnParam &param) const;
+  bool Init(const LrnParam &para) const;
 };
 }  // namespace operators
 }  // namespace paddle_mobile
+#endif
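Both the NEON and the scalar paths in the new lrn_kernel.h compute the same across-channel formula, out = in / (k + alpha * sum of in^2 over the n nearest channels)^beta. A scalar reference (plain C++, hypothetical sizes) that is handy as a cross-check for the vectorized path:

#include <cmath>
#include <cstdio>

// Scalar reference for across-channel LRN, matching the formula the NEON
// path above implements: out = in / (k + alpha * local_sum_sq) ^ beta.
void lrn_ref(const float *in, float *out, int N, int C, int H, int W, int n,
             float k, float alpha, float beta) {
  const int hw = H * W;
  for (int a = 0; a < N; ++a)
    for (int c = 0; c < C; ++c)
      for (int p = 0; p < hw; ++p) {
        float sum = 0.f;
        // Window of n channels centered on c, clipped at the channel edges,
        // same as the start/end bounds in LRNFunctor.
        for (int j = c - (n - 1) / 2; j < c - (n - 1) / 2 + n; ++j)
          if (j >= 0 && j < C) {
            float v = in[(a * C + j) * hw + p];
            sum += v * v;
          }
        out[(a * C + c) * hw + p] =
            in[(a * C + c) * hw + p] / std::pow(k + alpha * sum, beta);
      }
}

int main() {
  const float in[4] = {1.f, 2.f, 3.f, 4.f};  // N=1, C=4, H=W=1
  float out[4];
  lrn_ref(in, out, 1, 4, 1, 1, /*n=*/5, /*k=*/2.f, /*alpha=*/1e-4f,
          /*beta=*/0.75f);
  for (float v : out) std::printf("%f ", v);
  std::printf("\n");
  return 0;
}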
Subproject commit 591027fcffea084100c756e48356e0f8a48e35e5
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#if USE_ACL == 1
#include "acl_operator.h"
unsigned int bypass_acl_class_layer =
(0 | FLAGS_ENABLE_ACL_CONCAT |
/*0xffffffff |*/ /*FLAGS_ENABLE_ACL_FC |*/ /*FLAGS_ENABLE_ACL_LRN
|*/
0);
int enable_schedule = 0;
#ifdef USE_PROFILING
#include "arm_neon.h"
unsigned int acl_log_flags =
(0 | MASK_LOG_APP_TIME | /*MASK_LOG_ALLOCATE | */ /*MASK_LOG_ALLOCATE | */
/*MASK_LOG_RUN | */ /*MASK_LOG_CONFIG | */ /*MASK_LOG_COPY | */
MASK_LOG_ABSVAL | MASK_LOG_BNLL | MASK_LOG_CONV | MASK_LOG_FC |
MASK_LOG_LRN | MASK_LOG_POOLING | MASK_LOG_RELU | MASK_LOG_SIGMOID |
MASK_LOG_SOFTMAX | MASK_LOG_TANH | MASK_LOG_LC | MASK_LOG_BN |
MASK_LOG_CONCAT | 0);
#include <stdio.h> /* printf */
#include <stdlib.h> /* getenv */
#endif // USE_PROFILING
static bool force_enable_gpu = false;
bool AclEnableSchedule(int enable) {
enable_schedule = enable;
if (enable) {
force_enable_gpu = true;
}
return true;
}
int isScheduleEnable() { return enable_schedule; }
namespace paddle_mobile {
namespace operators {
namespace acl {
bool ACLOperator::init_gpu_env = true;
#ifdef USE_OPENCL
bool ACLOperator::support_opencl_ = false;
bool opencl_is_available() { return arm_compute::opencl_is_available(); }
#elif defined(USE_OPENGLES)
bool ACLOperator::support_opengles_ = false;
#endif
ACLOperator::ACLOperator(bool is_gpu)
: operator_state_(operator_not_init),
force_bypass_acl_path_(false),
target_hint_(TargetHint::DONT_CARE),
convolution_method_hint_(ConvolutionMethodHint::GEMM),
_group(1),
name_(""),
input_idx_(0),
output_idx_(0),
is_gpu_(is_gpu) {
const char* pBypassACL;
if (init_gpu_env) {
#ifdef USE_OPENCL
try {
if (opencl_is_available()) {
arm_compute::CLScheduler::get().default_init();
support_opencl_ = true;
}
} catch (std::exception& e) {
support_opencl_ = false;
}
#elif defined(USE_OPENGLES)
try {
arm_compute::GCScheduler::get().default_init();
support_opengles_ = true;
} catch (std::exception& e) {
support_opengles_ = false;
}
#endif
init_gpu_env = false;
}
if (force_enable_gpu) is_gpu_ = true;
pBypassACL = getenv("BYPASSACL");
if (pBypassACL) {
unsigned int bacl;
sscanf(pBypassACL, "%i", &bacl);
if (bacl != bypass_acl_class_layer) {
bypass_acl_class_layer = bacl;
printf("BYPASSACL<%s>\n", pBypassACL);
printf("BYPASSACL: %x\n", bypass_acl_class_layer);
}
}
#ifdef USE_PROFILING
const char* pLogACL;
pLogACL = getenv("LOGACL");
if (pLogACL) {
unsigned int alf;
sscanf(pLogACL, "%i", &alf);
if (alf != acl_log_flags) {
acl_log_flags = alf;
printf("LOGACL<%s>\n", pLogACL);
printf("LOGACL: %x\n", acl_log_flags);
}
}
#endif // USE_PROFILING
const char* pEnableSchedule;
pEnableSchedule = getenv("ENABLESCHEDULE");
if (pEnableSchedule) {
int bshedule;
sscanf(pEnableSchedule, "%i", &bshedule);
if (bshedule != enable_schedule) {
enable_schedule = bshedule;
printf("ENABLESCHEDULE<%s>\n", pEnableSchedule);
printf("ENABLESCHEDULE: %x\n", enable_schedule);
}
if (enable_schedule) {
AclEnableSchedule(1);
}
}
}
ACLOperator::~ACLOperator() {}
bool ACLOperator::new_tensor(std::unique_ptr<ACLTensor>& tensor,
arm_compute::TensorShape& shape, void* mem,
bool commit) {
auto acl_tensor =
new ACLTensor(arm_compute::TensorInfo(shape, arm_compute::Format::F32));
acl_tensor->set_target(getTargetHint());
acl_tensor->bindmem(mem);
if (commit) acl_tensor->commit();
  tensor.reset(acl_tensor);
return true;
}
bool ACLOperator::new_tensor(std::unique_ptr<ACLSubTensor>& tensor,
std::unique_ptr<ACLTensor>& parent,
arm_compute::TensorShape& shape,
arm_compute::Coordinates& coord) {
auto acl_tensor = new ACLSubTensor(parent, shape, coord);
acl_tensor->set_target(getTargetHint());
  tensor.reset(acl_tensor);
return true;
}
void ACLTensor::commit(TensorType type) {
settensortype(type);
if (mem_) {
if (!allocate_) {
#ifdef USE_PROFILING
logtime_util log_time(ACL_ALLOCATE_INFO);
#endif // USE_PROFILING
allocate();
allocate_ = true;
}
if (type_ != tensor_output) {
tensor_copy(mem_);
}
mem_ = nullptr;
}
}
int BaseACLTensor::tensor_copy(arm_compute::ITensor* tensor, void* mem,
bool toTensor) {
#ifdef USE_PROFILING
logtime_util log_time(ACL_COPY_INFO);
#endif // USE_PROFILING
arm_compute::Window window;
// Iterate through the rows (not each element)
window.use_tensor_dimensions(tensor->info()->tensor_shape(),
/* first_dimension =*/arm_compute::Window::DimY);
int width = tensor->info()->tensor_shape()[0];
int height = tensor->info()->tensor_shape()[1];
  int depth = tensor->info()->tensor_shape()[2];
map();
// Create an iterator:
arm_compute::Iterator it(tensor, window);
// Except it works for an arbitrary number of dimensions
if (toTensor) { // mem->tensor
arm_compute::execute_window_loop(
window,
[&](const arm_compute::Coordinates& id) {
memcpy(it.ptr(),
((char*)mem) +
                   ((id[3] * (width * height * depth) +
id.z() * (width * height) + id.y() * width + id.x()) *
tensor->info()->element_size()),
width * tensor->info()->element_size());
},
it);
} else { // tensor-->mem
arm_compute::execute_window_loop(
window,
[&](const arm_compute::Coordinates& id) {
          memcpy(((char*)mem) + ((id[3] * (width * height * depth) +
id.z() * (width * height) + id.y() * width) *
tensor->info()->element_size()),
it.ptr(), width * tensor->info()->element_size());
},
it);
}
unmap();
return 0;
}
} // namespace acl
} // namespace operators
} // namespace paddle_mobile
#endif
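The bypass mechanism above is a plain bitmask: bypass_acl_class_layer is OR-ed together from the FLAGS_ENABLE_ACL_* bits and can be overridden at runtime through the BYPASSACL environment variable (parsed with sscanf "%i", so hex literals work). A minimal sketch of the convention, assuming a set bit means that layer skips ACL and falls back to the plain CPU kernel, which is how the variable name reads (the per-op checks themselves live outside this excerpt):

#include <cstdio>
#include <cstdlib>

// Bit values copied from acl_operator.h below; each layer type owns one bit.
#define FLAGS_ENABLE_ACL_CONV 0x00000004
#define FLAGS_ENABLE_ACL_CONCAT 0x00001000

int main() {
  unsigned int bypass = 0 | FLAGS_ENABLE_ACL_CONCAT;  // compile-time default
  if (const char *env = std::getenv("BYPASSACL")) {
    std::sscanf(env, "%i", &bypass);  // "%i" accepts 0x... hex, as above
  }
  if (bypass & FLAGS_ENABLE_ACL_CONV) {
    std::printf("conv: bypassing ACL, using the plain CPU kernel\n");
  } else {
    std::printf("conv: running through ACL\n");
  }
  return 0;
}

With this reading, running a binary as BYPASSACL=0x4 ./demo would route convolution off the ACL path while leaving the other layers on it.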
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifndef ACL_OPERATOR_H_
#define ACL_OPERATOR_H_
#include <framework/tensor.h>
#include <operators/op_param.h>
#if USE_ACL == 1
#include "arm_compute/runtime/NEON/functions/NEActivationLayer.h"
#include "arm_compute/runtime/NEON/functions/NEBatchNormalizationLayer.h"
#include "arm_compute/runtime/NEON/functions/NEConvolutionLayer.h"
#include "arm_compute/runtime/NEON/functions/NEDepthConcatenateLayer.h"
#include "arm_compute/runtime/NEON/functions/NEDirectConvolutionLayer.h"
#include "arm_compute/runtime/NEON/functions/NEFullyConnectedLayer.h"
#include "arm_compute/runtime/NEON/functions/NELocallyConnectedLayer.h"
#include "arm_compute/runtime/NEON/functions/NENormalizationLayer.h"
#include "arm_compute/runtime/NEON/functions/NEPoolingLayer.h"
#include "arm_compute/runtime/NEON/functions/NESoftmaxLayer.h"
#include "arm_compute/runtime/Tensor.h"
#ifdef PADDLE_MOBILE_MALI_GPU
#include "arm_compute/core/CL/OpenCL.h"
#include "arm_compute/runtime/CL/CLScheduler.h"
#include "arm_compute/runtime/CL/CLTensor.h"
#include "arm_compute/runtime/CL/functions/CLActivationLayer.h"
#include "arm_compute/runtime/CL/functions/CLBatchNormalizationLayer.h"
#include "arm_compute/runtime/CL/functions/CLConvolutionLayer.h"
#include "arm_compute/runtime/CL/functions/CLDepthConcatenateLayer.h"
#include "arm_compute/runtime/CL/functions/CLDirectConvolutionLayer.h"
#include "arm_compute/runtime/CL/functions/CLFullyConnectedLayer.h"
#include "arm_compute/runtime/CL/functions/CLLocallyConnectedLayer.h"
#include "arm_compute/runtime/CL/functions/CLNormalizationLayer.h"
#include "arm_compute/runtime/CL/functions/CLPoolingLayer.h"
#include "arm_compute/runtime/CL/functions/CLSoftmaxLayer.h"
#endif
#ifdef USE_OPENGLES
#include "arm_compute/runtime/GLES_COMPUTE/GCScheduler.h"
#include "arm_compute/runtime/GLES_COMPUTE/GCTensor.h"
#include "arm_compute/runtime/GLES_COMPUTE/functions/GCActivationLayer.h"
#include "arm_compute/runtime/GLES_COMPUTE/functions/GCBatchNormalizationLayer.h"
#include "arm_compute/runtime/GLES_COMPUTE/functions/GCConvolutionLayer.h"
#include "arm_compute/runtime/GLES_COMPUTE/functions/GCDepthConcatenateLayer.h"
#include "arm_compute/runtime/GLES_COMPUTE/functions/GCDirectConvolutionLayer.h"
#include "arm_compute/runtime/GLES_COMPUTE/functions/GCFullyConnectedLayer.h"
#include "arm_compute/runtime/GLES_COMPUTE/functions/GCNormalizationLayer.h"
#include "arm_compute/runtime/GLES_COMPUTE/functions/GCPoolingLayer.h"
#include "arm_compute/runtime/GLES_COMPUTE/functions/GCSoftmaxLayer.h"
#endif
#include "acl_tensor.h"
#define FLAGS_ENABLE_ACL_ABSVAL 0x00000001
#define FLAGS_ENABLE_ACL_BNLL 0x00000002
#define FLAGS_ENABLE_ACL_CONV 0x00000004
#define FLAGS_ENABLE_ACL_FC 0x00000008
#define FLAGS_ENABLE_ACL_LRN 0x00000010
#define FLAGS_ENABLE_ACL_POOLING 0x00000020
#define FLAGS_ENABLE_ACL_RELU 0x00000040
#define FLAGS_ENABLE_ACL_SIGMOID 0x00000080
#define FLAGS_ENABLE_ACL_SOFTMAX 0x00000100
#define FLAGS_ENABLE_ACL_TANH 0x00000200
#define FLAGS_ENABLE_ACL_LC 0x00000400
#define FLAGS_ENABLE_ACL_BN 0x00000800
#define FLAGS_ENABLE_ACL_CONCAT 0x00001000
extern unsigned int bypass_acl_class_layer;
#ifdef USE_PROFILING
#include <sys/time.h>
#define NANO_SEC_CONV 1000000
#define MASK_LOG_APP_TIME 0x00000001
#define MASK_LOG_ALLOCATE 0x00000002
#define MASK_LOG_RUN 0x00000004
#define MASK_LOG_CONFIG 0x00000008
#define MASK_LOG_COPY 0x00000010
#define MASK_LOG_ABSVAL 0x00000020
#define MASK_LOG_BNLL 0x00000040
#define MASK_LOG_CONV 0x00000080
#define MASK_LOG_FC 0x00000100
#define MASK_LOG_LRN 0x00000200
#define MASK_LOG_POOLING 0x00000400
#define MASK_LOG_RELU 0x00000800
#define MASK_LOG_SIGMOID 0x00001000
#define MASK_LOG_SOFTMAX 0x00002000
#define MASK_LOG_TANH 0x00004000
#define MASK_LOG_LC 0x00008000
#define MASK_LOG_BN 0x00010000
#define MASK_LOG_CONCAT 0x00020000
#define APP_TIME_INFO MASK_LOG_APP_TIME, "time: \t"
#define ACL_ALLOCATE_INFO MASK_LOG_ALLOCATE, "allocate: \t\t"
#define ACL_RUN_INFO MASK_LOG_RUN, "run: \t\t\t"
#define ACL_CONFIG_INFO MASK_LOG_CONFIG, "configure: \t\t\t\t"
#define ACL_COPY_INFO MASK_LOG_COPY, "tensor_copy:\t\t\t\t\t"
#define ACL_ABSVAL_INFO MASK_LOG_ABSVAL, "ACL_ABSVAL :\t\t\t\t\t\t"
#define ACL_BNLL_INFO MASK_LOG_BNLL, "ACL_BNLL :\t\t\t\t\t\t\t"
#define ACL_CONV_INFO MASK_LOG_CONV, "ACL_CONV :\t\t\t\t\t\t\t\t"
#define ACL_FC_INFO MASK_LOG_FC, "ACL_FC :\t\t\t\t\t\t\t\t\t"
#define ACL_LRN_INFO MASK_LOG_LRN, "ACL_LRN :\t\t\t\t\t\t\t\t\t\t"
#define ACL_POOLING_INFO MASK_LOG_POOLING, "ACL_POOLING:\t\t\t\t\t\t\t\t\t\t\t"
#define ACL_RELU_INFO MASK_LOG_RELU, "ACL_RELU :\t\t\t\t\t\t\t\t\t\t\t\t"
#define ACL_SIGMOID_INFO \
MASK_LOG_SIGMOID, "ACL_SIGMOID:\t\t\t\t\t\t\t\t\t\t\t\t\t"
#define ACL_SOFTMAX_INFO \
MASK_LOG_SOFTMAX, "ACL_SOFTMAX:\t\t\t\t\t\t\t\t\t\t\t\t\t\t"
#define ACL_TANH_INFO \
MASK_LOG_TANH, "ACL_TANH :\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t"
#define ACL_LC_INFO MASK_LOG_LC, "ACL_LC :\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t"
#define ACL_BN_INFO \
MASK_LOG_BN, "ACL_BN :\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t"
#define ACL_CONCAT_INFO \
MASK_LOG_CONCAT, "ACL_CONCAT :\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t"
extern unsigned int acl_log_flags;
class logtime_util {
public:
logtime_util() { mask = 0; }
logtime_util(int mask_, const char *information_) {
setlogtime_info(mask_, information_);
}
void setlogtime_info(int mask_, const char *information_) {
mask = mask_;
if (acl_log_flags & mask) {
strncpy(information, information_, 255);
gettimeofday(&tv[0], NULL);
}
}
~logtime_util() {
if (acl_log_flags & mask) {
      long long time[2];  // tv_sec * NANO_SEC_CONV overflows a 32-bit int
      gettimeofday(&tv[1], NULL);
      time[0] = tv[0].tv_sec * (long long)NANO_SEC_CONV + tv[0].tv_usec;
      time[1] = tv[1].tv_sec * (long long)NANO_SEC_CONV + tv[1].tv_usec;
printf("%s %.6lf\n", information,
(((double)time[1] - time[0]) / NANO_SEC_CONV));
}
}
void log_time(bool start) {
if (acl_log_flags & mask) {
if (start) {
gettimeofday(&tv[0], NULL);
} else {
        long long time[2];  // tv_sec * NANO_SEC_CONV overflows a 32-bit int
        gettimeofday(&tv[1], NULL);
        time[0] = tv[0].tv_sec * (long long)NANO_SEC_CONV + tv[0].tv_usec;
        time[1] = tv[1].tv_sec * (long long)NANO_SEC_CONV + tv[1].tv_usec;
printf("%s %.6lf\n", information,
(((double)time[1] - time[0]) / NANO_SEC_CONV));
}
}
}
private:
struct timeval tv[2];
int mask;
char information[256];
};
#endif // USE_PROFILING
namespace paddle_mobile {
namespace operators {
namespace acl {
class AclParameters {
public:
AclParameters() {
dilated = false;
dim = 2;
num_group = 1;
}
int batch;
int in_depth;
int in_rows;
int in_cols;
int out_depth;
int out_rows;
int out_cols;
int out_num;
int filter_rows;
int filter_cols;
int stride_rows;
int stride_cols;
int pad_rows;
int pad_cols;
int dilation_rows;
int dilation_cols;
int num_group;
bool dilated;
int dim;
int epsilon;
int nsize;
float alpha;
float beta;
float knorm;
void *input_data;
void *output_data;
void *weight_data;
void *biases_data;
void *mean_data;
void *var_data;
std::string pool_type;
std::string act_type;
std::string data_layout;
bool is_global_pool;
bool is_channel_concat;
std::vector<framework::LoDTensor *> in_tensor;
};
enum TensorType {
tensor_input,
tensor_output,
tensor_weights,
tensor_biases,
tensor_mean,
tensor_var,
tensor_beta,
tensor_gamma,
tensor_concat,
tensor_data,
};
enum OperatorState {
operator_not_init,
operator_init_done,
operator_reinit,
};
enum OperateType {
operate_type_pooling,
operate_type_activation,
operate_type_lrn,
operate_type_conv,
operate_type_lc,
operate_type_fc,
operate_type_bn,
operate_type_softmax,
operate_type_concat,
};
class BaseACLTensor {
public:
BaseACLTensor() : type_(tensor_input), allocate_(false) {}
virtual ~BaseACLTensor() {}
virtual void bindmem(void *mem) { mem_ = mem; }
virtual void settensortype(TensorType type) { type_ = type; }
virtual void map(bool blocking = true) {}
virtual void unmap() {}
virtual void commit(TensorType type = tensor_data) {}
int tensor_copy(arm_compute::ITensor *tensor, void *mem,
bool toTensor = true);
protected:
void *mem_;
TensorType type_;
bool allocate_;
};
class ACLTensor : public BaseACLTensor, public Tensor {
public:
explicit ACLTensor(arm_compute::TensorInfo &&info) : Tensor(info) {}
virtual void map(bool blocking = true) {
if (!allocate_) {
Tensor::allocate();
allocate_ = true;
}
Tensor::map(blocking);
}
virtual int tensor_copy(void *mem, bool toTensor = true) {
auto acl_tensor = this;
arm_compute::ITensor *tensor = acl_tensor->tensor();
BaseACLTensor::tensor_copy(tensor, mem, toTensor);
return 0;
}
virtual void unmap() { Tensor::unmap(); }
virtual void commit(TensorType type = tensor_data);
};
class ACLSubTensor : public BaseACLTensor, public SubTensor {
public:
ACLSubTensor(std::unique_ptr<ACLTensor> &parent,
arm_compute::TensorShape &shape, arm_compute::Coordinates &coord)
: SubTensor(parent.get(), shape, coord) {}
virtual int tensor_copy(void *mem, bool toTensor = true) { return 0; }
};
template <typename T>
class TensorPair {
public:
TensorPair() {}
~TensorPair() {}
TensorType type;
std::unique_ptr<T> tensor;
};
template <typename T>
std::unique_ptr<T> &tensor_item(
std::vector<std::unique_ptr<TensorPair<T>>> &pool, TensorType type,
int idx) {
int count = 0;
for (auto &item : pool) {
if (item.get()->type == type) {
++count;
}
if (item.get()->type == type && idx == count - 1) {
return item.get()->tensor;
}
}
  pool.push_back(std::unique_ptr<TensorPair<T>>(new TensorPair<T>()));
  auto item = pool[pool.size() - 1].get();
  item->type = type;
  item->tensor = nullptr;
return item->tensor;
}
class ACLOperator {
public:
virtual void commit() {
for (auto &item : tensor_pool_) {
if (item.get()->tensor) item.get()->tensor->commit(item.get()->type);
}
}
inline void run() {
commit();
#ifdef USE_PROFILING
logtime_util log_time(ACL_RUN_INFO);
#endif // USE_PROFILING
for (auto &c : funcs_) {
c->run();
}
}
inline std::vector<std::unique_ptr<arm_compute::IFunction>> &funcs() {
return funcs_;
}
inline std::unique_ptr<ACLSubTensor> &sinput(int idx = 0) {
return tensor_item(subtensor_pool_, tensor_input, idx);
}
inline std::unique_ptr<ACLSubTensor> &soutput(int idx = 0) {
return tensor_item(subtensor_pool_, tensor_output, idx);
}
inline std::unique_ptr<ACLSubTensor> &sweights(int idx = 0) {
return tensor_item(subtensor_pool_, tensor_weights, idx);
}
inline std::unique_ptr<ACLSubTensor> &sbiases(int idx = 0) {
return tensor_item(subtensor_pool_, tensor_biases, idx);
}
inline std::unique_ptr<ACLTensor> &cinput(int idx = 0) {
return tensor_item(tensor_pool_, tensor_concat, idx);
}
inline std::unique_ptr<ACLTensor> &input(int idx = 0) {
return tensor_item(tensor_pool_, tensor_input, idx);
}
inline std::unique_ptr<ACLTensor> &output(int idx = 0) {
return tensor_item(tensor_pool_, tensor_output, idx);
}
inline std::unique_ptr<ACLTensor> &weights(int idx = 0) {
return tensor_item(tensor_pool_, tensor_weights, idx);
}
inline std::unique_ptr<ACLTensor> &biases(int idx = 0) {
return tensor_item(tensor_pool_, tensor_biases, idx);
}
inline std::unique_ptr<ACLTensor> &mean(int idx = 0) {
return tensor_item(tensor_pool_, tensor_mean, idx);
}
inline std::unique_ptr<ACLTensor> &var(int idx = 0) {
return tensor_item(tensor_pool_, tensor_var, idx);
}
inline std::unique_ptr<ACLTensor> &beta(int idx = 0) {
return tensor_item(tensor_pool_, tensor_beta, idx);
}
inline std::unique_ptr<ACLTensor> &gamma(int idx = 0) {
return tensor_item(tensor_pool_, tensor_gamma, idx);
}
inline std::unique_ptr<ACLTensor> &tensor(TensorType type) {
switch (type) {
case tensor_biases:
return biases();
break;
case tensor_weights:
return weights();
break;
case tensor_output:
return output();
break;
default:
case tensor_input:
return input();
break;
}
return input();
}
explicit ACLOperator(bool is_gpu = false);
virtual ~ACLOperator();
inline TargetHint getTargetHint() {
#ifdef USE_OPENCL
if (target_hint_ == TargetHint::DONT_CARE) {
if (is_gpu_) {
return TargetHint::OPENCL;
}
return TargetHint::NEON;
}
return target_hint_;
#elif defined(USE_OPENGLES)
if (target_hint_ == TargetHint::DONT_CARE) {
if (is_gpu_) {
return TargetHint::OPENGLES;
}
return TargetHint::NEON;
}
return target_hint_;
#else
return TargetHint::NEON;
#endif
}
inline void setTargetHint(TargetHint hint) { target_hint_ = hint; }
inline ConvolutionMethodHint &getConvMethod() {
return convolution_method_hint_;
}
inline void setConvMethod() {
convolution_method_hint_ = ConvolutionMethodHint::DIRECT;
}
inline bool tensor_mem(std::unique_ptr<ACLTensor> &tensor, void *mem) {
tensor->bindmem(mem);
return true;
}
inline bool tensor_mem(void *mem, std::unique_ptr<ACLTensor> &tensor) {
tensor->tensor_copy(mem, false);
return true;
}
bool new_tensor(std::unique_ptr<ACLTensor> &tensor,
arm_compute::TensorShape &shape, void *mem = nullptr,
bool commit = false);
bool new_tensor(std::unique_ptr<ACLSubTensor> &tensor,
std::unique_ptr<ACLTensor> &parent,
arm_compute::TensorShape &shape,
arm_compute::Coordinates &coord);
inline int &group() { return _group; }
inline void set_operator_property(OperateType type, const char *name) {
name_ = name;
type_ = type;
}
inline void acl_run(void *input_data, void *output_data) {
if (input_data) tensor_mem(input(), input_data);
run();
tensor_mem(output_data, output());
}
inline int &input_idx() { return input_idx_; }
inline int &output_idx() { return output_idx_; }
protected:
inline bool isGPUMode() {
#ifdef USE_OPENCL
if (!support_opencl_) return false;
return getTargetHint() == TargetHint::OPENCL;
#elif defined(USE_OPENGLES)
if (!support_opengles_) return false;
return getTargetHint() == TargetHint::OPENGLES;
#endif
return false;
}
inline OperatorState &opstate() { return operator_state_; }
inline bool is_operator_init_done(arm_compute::TensorShape shape,
TensorType type = tensor_input) {
checkreshape(shape, type);
return operator_state_ == operator_init_done;
}
inline void set_operator_init_done() {
opstate() = operator_init_done;
set_bypass_state(false);
}
inline void set_bypass_state(bool state = false) {
force_bypass_acl_path_ = state;
}
inline OperatorState checkreshape(arm_compute::TensorShape shape,
TensorType type = tensor_input) {
opstate() = reshape(shape, type);
if (opstate() == operator_reinit) {
freeres();
}
return opstate();
}
inline OperatorState reshape(arm_compute::TensorShape &shape,
TensorType type) {
arm_compute::TensorShape _shape;
std::unique_ptr<ACLTensor> &acl_tensor = tensor(type);
if (!acl_tensor.get()) return operator_not_init;
_shape = acl_tensor->info().tensor_shape();
if (_shape.total_size() == shape.total_size() && _shape[0] == shape[0] &&
_shape[1] == shape[1]) {
return operator_init_done;
}
return operator_reinit;
}
inline void freeres() {
tensor_pool_.clear();
subtensor_pool_.clear();
funcs_.clear();
}
inline const char *&name() { return name_; }
inline void set_in_out_index(int indata_idx, int outdata_idx) {
input_idx() = indata_idx;
output_idx() = outdata_idx;
}
protected:
std::vector<std::unique_ptr<TensorPair<ACLTensor>>> tensor_pool_;
std::vector<std::unique_ptr<TensorPair<ACLSubTensor>>> subtensor_pool_;
std::vector<std::unique_ptr<arm_compute::IFunction>> funcs_;
OperatorState operator_state_;
bool force_bypass_acl_path_;
TargetHint target_hint_;
ConvolutionMethodHint convolution_method_hint_;
static bool support_opengles_;
static bool support_opencl_;
static bool init_gpu_env;
int _group;
const char *name_;
OperateType type_;
int input_idx_, output_idx_;
bool is_gpu_;
};
int isScheduleEnable();
template <typename OperatorType, typename TensorType>
std::unique_ptr<arm_compute::IFunction> instantiate_function(
arm_compute::ITensor *input, arm_compute::ITensor *output) {
auto op = cpp14::make_unique<OperatorType>();
op->configure(dynamic_cast<TensorType *>(input),
dynamic_cast<TensorType *>(output));
return std::move(op);
}
template <typename OperatorType, typename TensorType>
std::unique_ptr<arm_compute::IFunction> instantiate(
arm_compute::ITensor *input, arm_compute::ITensor *output) {
return instantiate_function<OperatorType, TensorType>(input, output);
}
template <typename OpType, typename OpTensor>
std::unique_ptr<arm_compute::IFunction> instantiate_op_func(
std::unique_ptr<ACLTensor> &input, std::unique_ptr<ACLTensor> &output,
TargetHint &hint) {
std::unique_ptr<arm_compute::IFunction> func;
func = instantiate<OpType, OpTensor>(input->tensor(), output->tensor());
return func;
}
template <typename OperatorType, typename TensorType, typename VectorTensor>
std::unique_ptr<arm_compute::IFunction> instantiate_function(
VectorTensor inputs, arm_compute::ITensor *output) {
auto op = cpp14::make_unique<OperatorType>();
op->configure(inputs, dynamic_cast<TensorType *>(output));
return std::move(op);
}
template <typename OperatorType, typename TensorType, typename VectorTensor>
std::unique_ptr<arm_compute::IFunction> instantiate(
VectorTensor inputs, arm_compute::ITensor *output) {
return instantiate_function<OperatorType, TensorType, VectorTensor>(inputs,
output);
}
template <typename OpType, typename OpTensor>
std::unique_ptr<arm_compute::IFunction> instantiate_op_func_lists(
ACLOperator *&acl_op, std::unique_ptr<ACLTensor> &output, int num,
TargetHint &hint) {
std::unique_ptr<arm_compute::IFunction> func;
static std::vector<OpTensor *> tensors;
tensors.clear();
for (int i = 0; i < num; ++i) {
tensors.push_back(
dynamic_cast<OpTensor *>(acl_op->cinput(i).get()->tensor()));
}
func = instantiate<OpType, OpTensor, std::vector<OpTensor *>>(
tensors, output->tensor());
return func;
}
template <typename OperatorType, typename TensorType, typename OperatorInfo>
std::unique_ptr<arm_compute::IFunction> instantiate_function(
arm_compute::ITensor *input, arm_compute::ITensor *output,
const OperatorInfo &info) {
auto op = cpp14::make_unique<OperatorType>();
op->configure(dynamic_cast<TensorType *>(input),
dynamic_cast<TensorType *>(output), info);
return std::move(op);
}
template <typename OperatorType, typename TensorType, typename OperatorInfo>
std::unique_ptr<arm_compute::IFunction> instantiate(
arm_compute::ITensor *input, arm_compute::ITensor *output,
const OperatorInfo &info) {
return instantiate_function<OperatorType, TensorType, OperatorInfo>(
input, output, info);
}
template <typename OpType, typename OpTensor, typename OperatorInfo>
std::unique_ptr<arm_compute::IFunction> instantiate_op_func(
std::unique_ptr<ACLTensor> &input, std::unique_ptr<ACLTensor> &output,
const OperatorInfo &info, TargetHint &hint) {
std::unique_ptr<arm_compute::IFunction> func;
func = instantiate<OpType, OpTensor, OperatorInfo>(input->tensor(),
output->tensor(), info);
return func;
}
template <typename OperatorType, typename TensorType, typename OperatorInfo>
std::unique_ptr<arm_compute::IFunction> instantiate_function(
arm_compute::ITensor *input, arm_compute::ITensor *weights,
arm_compute::ITensor *biases, arm_compute::ITensor *output,
const OperatorInfo &info) {
auto op = cpp14::make_unique<OperatorType>();
op->configure(dynamic_cast<TensorType *>(input),
dynamic_cast<TensorType *>(weights),
dynamic_cast<TensorType *>(biases),
dynamic_cast<TensorType *>(output), info);
return std::move(op);
}
template <typename OperatorType, typename TensorType, typename OperatorInfo>
std::unique_ptr<arm_compute::IFunction> instantiate(
arm_compute::ITensor *input, arm_compute::ITensor *weights,
arm_compute::ITensor *biases, arm_compute::ITensor *output,
const OperatorInfo &info) {
return instantiate_function<OperatorType, TensorType, OperatorInfo>(
input, weights, biases, output, info);
}
template <typename OpType, typename OpTensor, typename OperatorInfo,
typename ACLTensor>
std::unique_ptr<arm_compute::IFunction> instantiate_op_func(
std::unique_ptr<ACLTensor> &input, std::unique_ptr<ACLTensor> &weights,
std::unique_ptr<ACLTensor> &biases, std::unique_ptr<ACLTensor> &output,
const OperatorInfo &info, TargetHint &hint) {
std::unique_ptr<arm_compute::IFunction> func;
arm_compute::ITensor *biases_tensor = NULL;
if (biases.get()) {
biases_tensor = biases->tensor();
}
func = instantiate<OpType, OpTensor, OperatorInfo>(
input->tensor(), weights->tensor(), biases_tensor, output->tensor(),
info);
return func;
}
template <typename Dtype, typename OperatorType, typename TensorType>
std::unique_ptr<arm_compute::IFunction> instantiate_function(
arm_compute::ITensor *input, arm_compute::ITensor *output,
arm_compute::ITensor *mean, arm_compute::ITensor *var,
arm_compute::ITensor *beta, arm_compute::ITensor *gamma, Dtype &eps) {
auto op = cpp14::make_unique<OperatorType>();
op->configure(
dynamic_cast<TensorType *>(input), dynamic_cast<TensorType *>(output),
dynamic_cast<TensorType *>(mean), dynamic_cast<TensorType *>(var),
dynamic_cast<TensorType *>(beta), dynamic_cast<TensorType *>(gamma), eps);
return std::move(op);
}
template <typename Dtype, typename OperatorType, typename TensorType>
std::unique_ptr<arm_compute::IFunction> instantiate(
arm_compute::ITensor *input, arm_compute::ITensor *output,
arm_compute::ITensor *mean, arm_compute::ITensor *var,
arm_compute::ITensor *beta, arm_compute::ITensor *gamma, Dtype eps) {
return instantiate_function<Dtype, OperatorType, TensorType>(
input, output, mean, var, beta, gamma, eps);
}
template <typename Dtype, typename OpType, typename OpTensor>
std::unique_ptr<arm_compute::IFunction> instantiate_op_func(
std::unique_ptr<ACLTensor> &input, std::unique_ptr<ACLTensor> &output,
std::unique_ptr<ACLTensor> &mean, std::unique_ptr<ACLTensor> &var,
std::unique_ptr<ACLTensor> &beta, std::unique_ptr<ACLTensor> &gamma,
Dtype eps, TargetHint hint) {
std::unique_ptr<arm_compute::IFunction> func;
func = instantiate<Dtype, OpType, OpTensor>(
input->tensor(), output->tensor(), mean->tensor(), var->tensor(),
beta->tensor(), gamma->tensor(), eps);
return func;
}
template <typename OperatorInfo>
bool instantiate_op_pooling(
ACLOperator *acl_op,
std::vector<std::unique_ptr<arm_compute::IFunction>> &func,
std::unique_ptr<ACLTensor> &input, std::unique_ptr<ACLTensor> &output,
TargetHint hint, const OperatorInfo &info) {
#ifdef USE_OPENCL
if (hint == TargetHint::OPENCL) {
func.push_back(
instantiate_op_func<arm_compute::CLPoolingLayer, arm_compute::ICLTensor,
arm_compute::PoolingLayerInfo>(input, output, info,
hint));
return true;
}
#elif defined(USE_OPENGLES)
if (hint == TargetHint::OPENGLES) {
func.push_back(
instantiate_op_func<arm_compute::GCPoolingLayer, arm_compute::IGCTensor,
arm_compute::PoolingLayerInfo>(input, output, info,
hint));
return true;
}
#endif
{
func.push_back(
instantiate_op_func<arm_compute::NEPoolingLayer, arm_compute::ITensor,
arm_compute::PoolingLayerInfo>(input, output, info,
hint));
}
return true;
}
template <typename OperatorInfo>
bool instantiate_op_activation(
ACLOperator *acl_op,
std::vector<std::unique_ptr<arm_compute::IFunction>> &func,
std::unique_ptr<ACLTensor> &input, std::unique_ptr<ACLTensor> &output,
TargetHint hint, const OperatorInfo &info) {
#ifdef USE_OPENCL
if (hint == TargetHint::OPENCL) {
func.push_back(instantiate_op_func<arm_compute::CLActivationLayer,
arm_compute::ICLTensor,
arm_compute::ActivationLayerInfo>(
input, output, info, hint));
return true;
}
#elif defined(USE_OPENGLES)
if (hint == TargetHint::OPENGLES) {
func.push_back(instantiate_op_func<arm_compute::GCActivationLayer,
arm_compute::IGCTensor,
arm_compute::ActivationLayerInfo>(
input, output, info, hint));
return true;
}
#endif
{
func.push_back(instantiate_op_func<arm_compute::NEActivationLayer,
arm_compute::ITensor,
arm_compute::ActivationLayerInfo>(
input, output, info, hint));
}
return true;
}
template <typename OperatorInfo>
bool instantiate_op_lrn(
ACLOperator *acl_op,
std::vector<std::unique_ptr<arm_compute::IFunction>> &func,
std::unique_ptr<ACLTensor> &input, std::unique_ptr<ACLTensor> &output,
TargetHint hint, const OperatorInfo &info) {
#ifdef USE_OPENCL
if (hint == TargetHint::OPENCL) {
func.push_back(instantiate_op_func<arm_compute::CLNormalizationLayer,
arm_compute::ICLTensor,
arm_compute::NormalizationLayerInfo>(
input, output, info, hint));
return true;
}
#elif defined(USE_OPENGLES)
if (hint == TargetHint::OPENGLES) {
func.push_back(instantiate_op_func<arm_compute::GCNormalizationLayer,
arm_compute::IGCTensor,
arm_compute::NormalizationLayerInfo>(
input, output, info, hint));
return true;
}
#endif
{
func.push_back(instantiate_op_func<arm_compute::NENormalizationLayer,
arm_compute::ITensor,
arm_compute::NormalizationLayerInfo>(
input, output, info, hint));
}
return true;
}
template <typename OperatorInfo>
bool instantiate_op_conv(
ACLOperator *acl_op,
std::vector<std::unique_ptr<arm_compute::IFunction>> &func,
std::unique_ptr<ACLTensor> &input, std::unique_ptr<ACLTensor> &output,
TargetHint hint, const OperatorInfo &info) {
std::unique_ptr<ACLTensor> &weights = acl_op->weights();
std::unique_ptr<ACLTensor> &biases = acl_op->biases();
ConvolutionMethodHint &conv_method = acl_op->getConvMethod();
  bool has_biases = (biases.get() != nullptr);
int &groups = acl_op->group();
arm_compute::TensorShape input_shape = input->info().tensor_shape();
arm_compute::TensorShape weights_shape = weights->info().tensor_shape();
arm_compute::TensorShape biases_shape;
if (has_biases) {
biases_shape = biases->info().tensor_shape();
}
arm_compute::TensorShape output_shape = output->info().tensor_shape();
if (groups == 1) {
if (conv_method == ConvolutionMethodHint::GEMM) {
#ifdef USE_OPENCL
if (hint == TargetHint::OPENCL) {
func.push_back(instantiate_op_func<arm_compute::CLConvolutionLayer,
arm_compute::ICLTensor,
arm_compute::PadStrideInfo>(
acl_op->input(), acl_op->weights(), acl_op->biases(),
acl_op->output(), info, hint));
return true;
}
#elif defined(USE_OPENGLES)
if (hint == TargetHint::OPENGLES) {
func.push_back(instantiate_op_func<arm_compute::GCConvolutionLayer,
arm_compute::IGCTensor,
arm_compute::PadStrideInfo>(
acl_op->input(), acl_op->weights(), acl_op->biases(),
acl_op->output(), info, hint));
return true;
}
#endif
{
func.push_back(instantiate_op_func<arm_compute::NEConvolutionLayer,
arm_compute::ITensor,
arm_compute::PadStrideInfo>(
acl_op->input(), acl_op->weights(), acl_op->biases(),
acl_op->output(), info, hint));
}
} else {
#ifdef USE_OPENCL
if (hint == TargetHint::OPENCL) {
func.push_back(
instantiate_op_func<arm_compute::CLDirectConvolutionLayer,
arm_compute::ICLTensor,
arm_compute::PadStrideInfo>(
acl_op->input(), acl_op->weights(), acl_op->biases(),
acl_op->output(), info, hint));
return true;
}
#elif defined(USE_OPENGLES)
if (hint == TargetHint::OPENGLES) {
func.push_back(
instantiate_op_func<arm_compute::GCDirectConvolutionLayer,
arm_compute::IGCTensor,
arm_compute::PadStrideInfo>(
acl_op->input(), acl_op->weights(), acl_op->biases(),
acl_op->output(), info, hint));
return true;
}
#endif
{
func.push_back(
instantiate_op_func<arm_compute::NEDirectConvolutionLayer,
arm_compute::ITensor,
arm_compute::PadStrideInfo>(
acl_op->input(), acl_op->weights(), acl_op->biases(),
acl_op->output(), info, hint));
}
}
return true;
}
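  // Grouped convolution falls through to here: the input/output channel
  // dimensions and the weights' OFM dimension are split into `groups` equal
  // slices. Worked example: groups == 2, C_in == 8, C_out == 16 gives
  // input_split == 4, output_split == 8 and weights_split == 8, i.e. each
  // sub-convolution maps a 4-channel input slice to an 8-channel output slice.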
// Calculate sub-tensor splits
const int input_split = input_shape.z() / groups;
const int output_split = output_shape.z() / groups;
const int weights_split = weights_shape[3] / groups;
  const int biases_split = has_biases ? biases_shape.x() / groups : 0;
// Calculate sub-tensor shapes
input_shape.set(2, input_split);
output_shape.set(2, output_split);
weights_shape.set(3, weights_split);
  if (has_biases) {
    biases_shape.set(0, biases_split);
  }
for (auto i = 0; i < groups; ++i) {
// Calculate sub-tensors starting coordinates
arm_compute::Coordinates input_coord(0, 0, input_split * i);
arm_compute::Coordinates output_coord(0, 0, output_split * i);
arm_compute::Coordinates weights_coord(0, 0, 0, weights_split * i);
arm_compute::Coordinates biases_coord(biases_split * i);
// Create sub-tensors for input, output, weights and bias
acl_op->new_tensor(acl_op->sinput(i), acl_op->input(), input_shape,
input_coord);
acl_op->new_tensor(acl_op->soutput(i), acl_op->output(), output_shape,
output_coord);
acl_op->new_tensor(acl_op->sweights(i), acl_op->weights(), weights_shape,
weights_coord);
if (has_biases) {
acl_op->new_tensor(acl_op->sbiases(i), acl_op->biases(), biases_shape,
biases_coord);
}
bool use_opencl = false;
if (conv_method == ConvolutionMethodHint::GEMM) {
#ifdef USE_OPENCL
if (hint == TargetHint::OPENCL) {
use_opencl = true;
func.push_back(
instantiate_op_func<arm_compute::CLConvolutionLayer,
arm_compute::ICLTensor,
arm_compute::PadStrideInfo, ACLSubTensor>(
acl_op->sinput(i), acl_op->sweights(i), acl_op->sbiases(i),
acl_op->soutput(i), info, hint));
}
#endif
if (!use_opencl) {
func.push_back(
instantiate_op_func<arm_compute::NEConvolutionLayer,
arm_compute::ITensor,
arm_compute::PadStrideInfo, ACLSubTensor>(
acl_op->sinput(i), acl_op->sweights(i), acl_op->sbiases(i),
acl_op->soutput(i), info, hint));
}
} else {
#ifdef USE_OPENCL
if (hint == TargetHint::OPENCL) {
use_opencl = true;
func.push_back(
instantiate_op_func<arm_compute::CLDirectConvolutionLayer,
arm_compute::ICLTensor,
arm_compute::PadStrideInfo, ACLSubTensor>(
acl_op->sinput(i), acl_op->sweights(i), acl_op->sbiases(i),
acl_op->soutput(i), info, hint));
}
#endif
if (!use_opencl) {
func.push_back(
instantiate_op_func<arm_compute::NEDirectConvolutionLayer,
arm_compute::ITensor,
arm_compute::PadStrideInfo, ACLSubTensor>(
acl_op->sinput(i), acl_op->sweights(i), acl_op->sbiases(i),
acl_op->soutput(i), info, hint));
}
}
}
return true;
}
template <typename OperatorInfo>
bool instantiate_op_lc(
ACLOperator *acl_op,
std::vector<std::unique_ptr<arm_compute::IFunction>> &func,
std::unique_ptr<ACLTensor> &input, std::unique_ptr<ACLTensor> &output,
TargetHint hint, const OperatorInfo &info) {
std::unique_ptr<ACLTensor> &weights = acl_op->weights();
std::unique_ptr<ACLTensor> &biases = acl_op->biases();
#ifdef USE_OPENCL
if (hint == TargetHint::OPENCL) {
func.push_back(
instantiate_op_func<arm_compute::CLLocallyConnectedLayer,
arm_compute::ICLTensor, arm_compute::PadStrideInfo>(
input, weights, biases, output, info, hint));
return true;
}
#endif
{
func.push_back(
instantiate_op_func<arm_compute::NELocallyConnectedLayer,
arm_compute::ITensor, arm_compute::PadStrideInfo>(
input, weights, biases, output, info, hint));
}
return true;
}
template <typename OperatorInfo>
bool instantiate_op_fc(
ACLOperator *acl_op,
std::vector<std::unique_ptr<arm_compute::IFunction>> &func,
std::unique_ptr<ACLTensor> &input, std::unique_ptr<ACLTensor> &output,
TargetHint hint, const OperatorInfo &info) {
std::unique_ptr<ACLTensor> &weights = acl_op->weights();
std::unique_ptr<ACLTensor> &biases = acl_op->biases();
#ifdef USE_OPENCL
if (hint == TargetHint::OPENCL) {
func.push_back(instantiate_op_func<arm_compute::CLFullyConnectedLayer,
arm_compute::ICLTensor, bool>(
input, weights, biases, output, info, hint));
return true;
}
#elif defined(USE_OPENGLES)
if (hint == TargetHint::OPENGLES) {
func.push_back(instantiate_op_func<arm_compute::GCFullyConnectedLayer,
arm_compute::IGCTensor, bool>(
input, weights, biases, output, info, hint));
return true;
}
#endif
{
func.push_back(instantiate_op_func<arm_compute::NEFullyConnectedLayer,
arm_compute::ITensor, bool>(
input, weights, biases, output, info, hint));
}
return true;
}
template <typename Dtype>
bool instantiate_op_bn(
ACLOperator *acl_op,
std::vector<std::unique_ptr<arm_compute::IFunction>> &func,
std::unique_ptr<ACLTensor> &input, std::unique_ptr<ACLTensor> &output,
TargetHint hint, Dtype eps) {
std::unique_ptr<ACLTensor> &mean = acl_op->mean();
std::unique_ptr<ACLTensor> &var = acl_op->var();
std::unique_ptr<ACLTensor> &beta = acl_op->beta();
std::unique_ptr<ACLTensor> &gamma = acl_op->gamma();
#ifdef USE_OPENCL
if (hint == TargetHint::OPENCL) {
func.push_back(
instantiate_op_func<Dtype, arm_compute::CLBatchNormalizationLayer,
arm_compute::ICLTensor>(input, output, mean, var,
beta, gamma, eps, hint));
return true;
}
#elif defined(USE_OPENGLES)
if (hint == TargetHint::OPENGLES) {
func.push_back(
instantiate_op_func<Dtype, arm_compute::GCBatchNormalizationLayer,
arm_compute::IGCTensor>(input, output, mean, var,
beta, gamma, eps, hint));
return true;
}
#endif
{
func.push_back(
instantiate_op_func<Dtype, arm_compute::NEBatchNormalizationLayer,
arm_compute::ITensor>(input, output, mean, var,
beta, gamma, eps, hint));
}
return true;
}
inline bool instantiate_op_softmax(
ACLOperator *acl_op,
std::vector<std::unique_ptr<arm_compute::IFunction>> &func,
std::unique_ptr<ACLTensor> &input, std::unique_ptr<ACLTensor> &output,
TargetHint hint, void *data) {
#ifdef USE_OPENCL
if (hint == TargetHint::OPENCL) {
func.push_back(
instantiate_op_func<arm_compute::CLSoftmaxLayer,
arm_compute::ICLTensor>(input, output, hint));
return true;
}
#elif defined(USE_OPENGLES)
if (hint == TargetHint::OPENGLES) {
func.push_back(
instantiate_op_func<arm_compute::GCSoftmaxLayer,
arm_compute::IGCTensor>(input, output, hint));
return true;
}
#endif
{
func.push_back(
instantiate_op_func<arm_compute::NESoftmaxLayer, arm_compute::ITensor>(
input, output, hint));
}
return true;
}
inline bool instantiate_op_concat(
ACLOperator *acl_op,
std::vector<std::unique_ptr<arm_compute::IFunction>> &func,
std::unique_ptr<ACLTensor> &input, std::unique_ptr<ACLTensor> &output,
TargetHint hint, int num) {
#ifdef USE_OPENCL
if (hint == TargetHint::OPENCL) {
func.push_back(
instantiate_op_func_lists<arm_compute::CLDepthConcatenateLayer,
arm_compute::ICLTensor>(acl_op, output, num,
hint));
return true;
}
#elif defined(USE_OPENGLES)
if (hint == TargetHint::OPENGLES) {
func.push_back(
instantiate_op_func_lists<arm_compute::GCDepthConcatenateLayer,
arm_compute::IGCTensor>(acl_op, output, num,
hint));
return true;
}
#endif
{
func.push_back(
instantiate_op_func_lists<arm_compute::NEDepthConcatenateLayer,
arm_compute::ITensor>(acl_op, output, num,
hint));
}
return true;
}
template <typename Dtype>
void *InputdataPtr(ACLOperator *op,
const std::vector<framework::LoDTensor *> &input_data,
Dtype type, int index = -1) {
if (index == -1) index = 0;
return (void *)(input_data[index]->mutable_data<Dtype>());
}
template <typename Dtype>
void acl_run(ACLOperator *op,
const std::vector<framework::LoDTensor *> &in_data, void *out_data,
Dtype type, bool multi_input_run = true) {
  for (int i = 0; i < static_cast<int>(in_data.size()); ++i) {
op->tensor_mem(op->cinput(i), InputdataPtr(op, in_data, type, i));
}
op->acl_run(NULL, out_data);
}
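// acl_run above binds each LoDTensor's buffer to the operator's i-th cached
// input (cinput(i)) via tensor_mem, then forwards the output pointer to
// ACLOperator::acl_run; the otherwise-unused `type` argument exists only so
// Dtype can be deduced at the call site.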
} // namespace acl
} // namespace operators
} // namespace paddle_mobile
#ifdef USE_PROFILING
#define acl_configure(opname, acl_op, args...) \
{ \
set_operator_property(acl::operate_type_##opname, #opname); \
logtime_util log_time(ACL_CONFIG_INFO); \
instantiate_op_##opname(acl_op, acl_op->funcs(), acl_op->input(), \
acl_op->output(), acl_op->getTargetHint(), args); \
}
#else
#define acl_configure(opname, acl_op, args...) \
{ \
set_operator_property(acl::operate_type_##opname, #opname); \
instantiate_op_##opname(acl_op, acl_op->funcs(), acl_op->input(), \
acl_op->output(), acl_op->getTargetHint(), args); \
}
#endif
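// Expansion sketch (profiling disabled): a call like
//   acl_configure(conv, this, conv_info);
// becomes, roughly,
//   {
//     set_operator_property(acl::operate_type_conv, "conv");
//     instantiate_op_conv(this, this->funcs(), this->input(), this->output(),
//                         this->getTargetHint(), conv_info);
//   }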
#define ACLOp_Ptr(a) dynamic_cast<ACLOperator *>(a)
#endif // USE_ACL
#endif // ACL_OPERATOR_H_
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "acl_tensor.h"
namespace paddle_mobile {
namespace operators {
namespace acl {
#ifdef USE_ACL
template <typename TensorType>
std::unique_ptr<arm_compute::ITensor> initialise_tensor(
arm_compute::TensorInfo &info) {
auto tensor = cpp14::make_unique<TensorType>();
tensor->allocator()->init(info);
return std::move(tensor);
}
template <typename TensorType>
void tensor_allocate(arm_compute::ITensor &tensor) {
auto itensor = dynamic_cast<TensorType *>(&tensor);
itensor->allocator()->allocate();
}
Tensor::Tensor(arm_compute::TensorInfo &info) noexcept
: _target(TargetHint::DONT_CARE), _info(info), _tensor(nullptr) {}
Tensor::Tensor(Tensor &&src) noexcept
: _target(src._target),
_info(std::move(src._info)),
_tensor(std::move(src._tensor)) {}
arm_compute::ITensor *Tensor::set_target(TargetHint target) {
switch (target) {
#ifdef USE_OPENCL
case TargetHint::OPENCL:
_tensor = initialise_tensor<arm_compute::CLTensor>(_info);
break;
#elif defined(USE_OPENGLES)
case TargetHint::OPENGLES:
_tensor = initialise_tensor<arm_compute::GCTensor>(_info);
break;
#endif
case TargetHint::NEON:
_tensor = initialise_tensor<arm_compute::Tensor>(_info);
break;
default:
break;
}
_target = target;
return _tensor.get();
}
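// set_target lazily materializes the backing tensor for the chosen backend;
// allocate()/map()/unmap() below must dispatch on the same _target so that
// their dynamic_casts to the concrete tensor type succeed.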
void Tensor::allocate() {
switch (_target) {
#ifdef USE_OPENCL
case TargetHint::OPENCL:
tensor_allocate<arm_compute::CLTensor>(*_tensor);
break;
#elif defined(USE_OPENGLES)
case TargetHint::OPENGLES:
tensor_allocate<arm_compute::GCTensor>(*_tensor);
break;
#endif
case TargetHint::NEON:
tensor_allocate<arm_compute::Tensor>(*_tensor);
break;
default:
break;
}
}
void Tensor::map(bool blocking) {
#ifdef USE_OPENCL
if (_target == TargetHint::OPENCL)
dynamic_cast<arm_compute::CLTensor *>(tensor())->map(blocking);
#elif defined(USE_OPENGLES)
if (_target == TargetHint::OPENGLES)
dynamic_cast<arm_compute::GCTensor *>(tensor())->map(blocking);
#endif
}
void Tensor::unmap() {
#ifdef USE_OPENCL
if (_target == TargetHint::OPENCL)
dynamic_cast<arm_compute::CLTensor *>(tensor())->unmap();
#elif defined(USE_OPENGLES)
if (_target == TargetHint::OPENGLES)
dynamic_cast<arm_compute::GCTensor *>(tensor())->unmap();
#endif
}
template <typename SubTensorType, typename ParentTensorType>
std::unique_ptr<arm_compute::ITensor> initialise_subtensor(
arm_compute::ITensor *parent, arm_compute::TensorShape shape,
arm_compute::Coordinates coords) {
auto ptensor = dynamic_cast<ParentTensorType *>(parent);
auto subtensor = cpp14::make_unique<SubTensorType>(ptensor, shape, coords);
return std::move(subtensor);
}
SubTensor::SubTensor(Tensor *parent, arm_compute::TensorShape &tensor_shape,
arm_compute::Coordinates &coords) noexcept
: _target(TargetHint::DONT_CARE),
_tensor_shape(tensor_shape),
_coords(coords),
_parent(nullptr),
_subtensor(nullptr) {
_parent = parent->tensor();
_target = parent->target();
instantiate_subtensor();
}
arm_compute::ITensor *SubTensor::set_target(TargetHint target) {
return (target == _target) ? _subtensor.get() : nullptr;
}
arm_compute::ITensor *SubTensor::tensor() { return _subtensor.get(); }
const arm_compute::ITensor *SubTensor::tensor() const {
return _subtensor.get();
}
TargetHint SubTensor::target() const { return _target; }
void SubTensor::allocate() {
// NOP for sub-tensors
}
void SubTensor::instantiate_subtensor() {
switch (_target) {
#ifdef USE_OPENCL
case TargetHint::OPENCL:
_subtensor = initialise_subtensor<arm_compute::CLSubTensor,
arm_compute::ICLTensor>(
_parent, _tensor_shape, _coords);
break;
#endif
default:
case TargetHint::NEON:
_subtensor =
initialise_subtensor<arm_compute::SubTensor, arm_compute::ITensor>(
_parent, _tensor_shape, _coords);
break;
}
}
#endif
} // namespace acl
} // namespace operators
} // namespace paddle_mobile
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifndef ACL_TENSOR_H_
#define ACL_TENSOR_H_
#ifdef USE_ACL
#ifdef USE_OPENCL
#include "arm_compute/runtime/CL/CLSubTensor.h"
#include "arm_compute/runtime/CL/CLTensor.h"
#elif defined(USE_OPENGLES)
#include "arm_compute/runtime/GLES_COMPUTE/GCTensor.h"
#endif
#include "arm_compute/runtime/SubTensor.h"
#include "arm_compute/runtime/Tensor.h"
#include <memory>
namespace paddle_mobile {
namespace operators {
namespace acl {
enum class TargetHint {
DONT_CARE,
OPENCL,
OPENGLES,
NEON,
};
enum class ConvolutionMethodHint {
GEMM,
DIRECT,
};
namespace cpp14 {
template <class T>
struct _Unique_if {
typedef std::unique_ptr<T> _Single_object;
};
template <class T>
struct _Unique_if<T[]> {
typedef std::unique_ptr<T[]> _Unknown_bound;
};
template <class T, size_t N>
struct _Unique_if<T[N]> {
typedef void _Known_bound;
};
template <class T, class... Args>
typename _Unique_if<T>::_Single_object make_unique(Args &&... args) {
return std::unique_ptr<T>(new T(std::forward<Args>(args)...));
}
template <class T>
typename _Unique_if<T>::_Unknown_bound make_unique(size_t n) {
typedef typename std::remove_extent<T>::type U;
return std::unique_ptr<T>(new U[n]());
}
template <class T, class... Args>
typename _Unique_if<T>::_Known_bound make_unique(Args &&...);
} // namespace cpp14
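// cpp14::make_unique is the usual C++14 backport for C++11 builds; the
// _Unique_if machinery selects between the single-object and unknown-bound
// array forms and rejects arrays of known bound. Illustration only:
//   auto info = cpp14::make_unique<arm_compute::TensorInfo>();
//   auto buf = cpp14::make_unique<float[]>(16);  // value-initialized to 0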
class Tensor {
public:
explicit Tensor(arm_compute::TensorInfo &info) noexcept;
virtual ~Tensor() {}
Tensor(Tensor &&src) noexcept;
  void set_info(arm_compute::TensorInfo &&info) { _info = std::move(info); }
arm_compute::ITensor *set_target(TargetHint target);
const arm_compute::TensorInfo &info() const { return _info; }
arm_compute::ITensor *tensor() { return _tensor.get(); }
void allocate();
void init() {}
TargetHint target() const { return _target; }
virtual void map(bool blocking = true);
virtual void unmap();
private:
TargetHint _target;
arm_compute::TensorInfo _info;
std::unique_ptr<arm_compute::ITensor> _tensor;
};
class SubTensor {
public:
SubTensor(Tensor *parent, arm_compute::TensorShape &tensor_shape,
arm_compute::Coordinates &coords) noexcept;
~SubTensor() {}
arm_compute::ITensor *tensor();
const arm_compute::ITensor *tensor() const;
TargetHint target() const;
void allocate();
arm_compute::ITensor *set_target(TargetHint target);
private:
/** Instantiates a sub-tensor */
void instantiate_subtensor();
private:
/**< Target that this tensor is pinned on */
TargetHint _target;
/**< SubTensor shape */
arm_compute::TensorShape _tensor_shape;
/**< SubTensor Coordinates */
arm_compute::Coordinates _coords;
/**< Parent tensor */
arm_compute::ITensor *_parent;
/**< SubTensor */
std::unique_ptr<arm_compute::ITensor> _subtensor;
};
} // namespace acl
} // namespace operators
} // namespace paddle_mobile
#endif
#endif // ACL_TENSOR_H_
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifdef BATCHNORM_OP
#include "operators/kernel/batchnorm_kernel.h"
#ifdef PADDLE_MOBILE_MALI_GPU
#include "acl_operator.h"
#include "framework/operator.h"
#include "operators/op_param.h"
namespace paddle_mobile {
namespace operators {
template <typename DeviceType, typename T>
class AclBatchNormOp : public acl::ACLOperator {
public:
AclBatchNormOp() {
this->force_bypass_acl_path_ = bypass_acl_class_layer & FLAGS_ENABLE_ACL_BN;
}
~AclBatchNormOp() = default;
AclBatchNormOp(const AclBatchNormOp&) = delete;
AclBatchNormOp& operator=(const AclBatchNormOp&) = delete;
AclBatchNormOp(AclBatchNormOp&&) = delete;
AclBatchNormOp& operator=(AclBatchNormOp&&) = delete;
acl::AclParameters& getargs() { return args; }
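  // Per-channel batch normalization: y = gamma * (x - mean) / sqrt(var + eps)
  // + beta. Note the mapping below: ACL's beta tensor is fed from the Paddle
  // bias input and gamma from the scale input.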
void InitAclLayer(const BatchNormParam& param) {
setTargetHint(acl::TargetHint::OPENCL);
arm_compute::TensorShape input_shape(args.in_cols, args.in_rows,
args.in_depth, args.batch);
arm_compute::TensorShape output_shape(args.out_cols, args.out_rows,
args.out_depth, args.out_num);
if (is_operator_init_done(input_shape)) return;
set_operator_init_done();
this->force_bypass_acl_path_ = false;
arm_compute::TensorShape mean_shape(args.in_depth);
arm_compute::TensorShape var_shape = mean_shape;
arm_compute::TensorShape beta_shape = mean_shape;
arm_compute::TensorShape gamma_shape = mean_shape;
//[width, height, IFM]
new_tensor(input(), input_shape, args.input_data);
//[width, height, OFM]
new_tensor(output(), output_shape, args.output_data);
new_tensor(mean(), mean_shape, args.mean_data);
new_tensor(var(), var_shape, args.var_data);
new_tensor(beta(), beta_shape, args.biases_data);
new_tensor(gamma(), gamma_shape, args.weight_data);
acl_configure(bn, this, args.epsilon);
}
void RunAcl(void* input, void* output) {
acl::ACLOperator::acl_run(input, output);
}
bool Bypass_acl(const BatchNormParam& param) {
bool bypass_acl = false;
AclParametersByContext(param);
    // bypass the ACL path when requested via the bypass flags
if (this->force_bypass_acl_path_) {
bypass_acl = true;
}
return bypass_acl;
}
private:
void AclParametersByContext(const BatchNormParam& param) {
const Tensor* in_x = param.InputX();
Tensor* out = param.OutputY();
const Tensor* scale = param.InputScale();
const Tensor* bias = param.InputBias();
const Tensor* saved_mean = param.InputMean();
const Tensor* saved_variance = param.InputVariance();
const T* input_data = in_x->data<T>();
T* output_data = out->mutable_data<T>();
const T* weight_data = scale->data<T>();
const T* bias_data = bias->data<T>();
const T* mean_data = saved_mean->data<T>();
const T* var_data = saved_variance->data<T>();
float epsilon = param.Epsilon();
args.input_data = (void*)input_data;
args.output_data = (void*)output_data;
// args.weight_data = (void*)weight_data;
// args.biases_data = (void*)bias_data;
args.mean_data = (void*)mean_data;
args.var_data = (void*)var_data;
args.epsilon = epsilon;
args.dim = in_x->dims().size();
args.batch = in_x->dims()[0];
args.in_depth = in_x->dims()[1];
args.in_rows = in_x->dims()[2];
args.in_cols = in_x->dims()[3];
args.out_num = out->dims()[0];
args.out_depth = out->dims()[1];
args.out_rows = out->dims()[2];
args.out_cols = out->dims()[3];
args.weight_data = (void*)weight_data;
args.biases_data = (void*)bias_data;
// std::cout
// << "Out C: " << args.out_depth
// << " H: " << args.out_rows << " W: " << args.out_cols << "\n";
}
acl::AclParameters args;
};
template <>
bool BatchNormKernel<GPU_MALI, float>::Init(const BatchNormParam& param) const {
AclBatchNormOp<GPU_MALI, float>* acl_op =
reinterpret_cast<AclBatchNormOp<GPU_MALI, float>*>(this->GetAclOp());
if (acl_op == nullptr) {
acl_op = new AclBatchNormOp<GPU_MALI, float>();
this->SetAclOp((void*)acl_op, (void*)this);
}
return true;
}
template <>
void BatchNormKernel<GPU_MALI, float>::Compute(
const BatchNormParam& param) const {
std::cout << "init acl" << std::endl;
AclBatchNormOp<GPU_MALI, float>* acl_op =
reinterpret_cast<AclBatchNormOp<GPU_MALI, float>*>(this->GetAclOp());
if (acl_op == nullptr) {
return;
}
if (acl_op->Bypass_acl(param)) {
std::cout << "init acl failed" << std::endl;
return;
}
acl::AclParameters& args = acl_op->getargs();
const float* input_data = (const float*)args.input_data;
const float* output_data = (const float*)args.output_data;
acl_op->InitAclLayer(param);
acl_op->RunAcl((void*)input_data, (void*)output_data);
}
template class BatchNormKernel<GPU_MALI, float>;
} // namespace operators
} // namespace paddle_mobile
#endif
#endif
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifdef CONCAT_OP
#include "operators/kernel/concat_kernel.h"
#ifdef PADDLE_MOBILE_MALI_GPU
#include "acl_operator.h"
#include "framework/operator.h"
#include "operators/op_param.h"
namespace paddle_mobile {
namespace operators {
template <typename DeviceType, typename T>
class AclConcatOp : public acl::ACLOperator {
public:
AclConcatOp() {
this->force_bypass_acl_path_ =
bypass_acl_class_layer & FLAGS_ENABLE_ACL_CONCAT;
}
~AclConcatOp() = default;
AclConcatOp(const AclConcatOp&) = delete;
AclConcatOp& operator=(const AclConcatOp&) = delete;
AclConcatOp(AclConcatOp&&) = delete;
AclConcatOp& operator=(AclConcatOp&&) = delete;
acl::AclParameters& getargs() { return args; }
void InitAclLayer(const ConcatParam& param) {
setTargetHint(acl::TargetHint::OPENCL);
const std::vector<framework::LoDTensor*>* input_data = &args.in_tensor;
arm_compute::TensorShape output_shape(args.out_cols, args.out_rows,
args.out_depth, args.batch);
if (is_operator_init_done(output_shape)) return;
set_operator_init_done();
this->force_bypass_acl_path_ = false;
T type;
    for (int i = 0; i < static_cast<int>(input_data->size()); i++) {
      // NCHW layout: dims()[1] = channels, dims()[2] = height, dims()[3] = width.
      int in_channels = (*input_data)[i]->dims()[1];
      int in_rows = (*input_data)[i]->dims()[2];
      int in_cols = (*input_data)[i]->dims()[3];
      // ACL TensorShape expects (width, height, channels).
      arm_compute::TensorShape in_shape(in_cols, in_rows, in_channels);
new_tensor(cinput(i), in_shape,
acl::InputdataPtr(this, args.in_tensor, type, i));
}
//[width, height, OFM]
new_tensor(output(), output_shape, args.output_data);
acl_configure(concat, this, input_data->size());
}
void RunAcl(const std::vector<framework::LoDTensor*>& input, void* output) {
T type;
acl::acl_run(this, input, output, type);
}
bool Bypass_acl(const ConcatParam& param) {
bool bypass_acl = false;
AclParametersByContext(param);
    // bypass when requested via flags, or when not concatenating on channels
if (this->force_bypass_acl_path_ || !args.is_channel_concat) {
bypass_acl = true;
}
return bypass_acl;
}
private:
void AclParametersByContext(const ConcatParam& param) {
auto inputs = param.Inputs();
auto* output = param.Out();
int64_t axis = param.Axis();
T* output_data = output->mutable_data<T>();
args.is_channel_concat = (axis == 1);
args.in_tensor = inputs;
args.output_data = (void*)output_data;
args.batch = output->dims()[0];
args.out_depth = output->dims()[1];
args.out_rows = output->dims()[2];
args.out_cols = output->dims()[3];
}
acl::AclParameters args;
};
template <>
bool ConcatKernel<GPU_MALI, float>::Init(const ConcatParam& param) const {
AclConcatOp<GPU_MALI, float>* acl_op =
reinterpret_cast<AclConcatOp<GPU_MALI, float>*>(this->GetAclOp());
if (acl_op == nullptr) {
acl_op = new AclConcatOp<GPU_MALI, float>();
this->SetAclOp((void*)acl_op, (void*)this);
}
return true;
}
template <>
void ConcatKernel<GPU_MALI, float>::Compute(const ConcatParam& param) const {
std::cout << "init acl" << std::endl;
AclConcatOp<GPU_MALI, float>* acl_op =
reinterpret_cast<AclConcatOp<GPU_MALI, float>*>(this->GetAclOp());
if (acl_op == nullptr) {
return;
}
if (acl_op->Bypass_acl(param)) {
std::cout << "init acl failed" << std::endl;
return;
}
acl::AclParameters& args = acl_op->getargs();
std::vector<framework::LoDTensor*> temp_data = args.in_tensor;
const float* output_data = (const float*)args.output_data;
acl_op->InitAclLayer(param);
acl_op->RunAcl(temp_data, (void*)output_data);
}
template class ConcatKernel<GPU_MALI, float>;
} // namespace operators
} // namespace paddle_mobile
#endif
#endif
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifdef FUSION_CONVADD_OP
#include "operators/kernel/conv_add_kernel.h"
#ifdef PADDLE_MOBILE_MALI_GPU
#include "acl_operator.h"
#include "framework/operator.h"
#include "operators/op_param.h"
namespace paddle_mobile {
namespace operators {
template <typename DeviceType, typename T>
class AclConvAddOp : public acl::ACLOperator {
public:
AclConvAddOp() {
this->force_bypass_acl_path_ =
bypass_acl_class_layer & FLAGS_ENABLE_ACL_CONV;
}
~AclConvAddOp() = default;
AclConvAddOp(const AclConvAddOp&) = delete;
AclConvAddOp& operator=(const AclConvAddOp&) = delete;
AclConvAddOp(AclConvAddOp&&) = delete;
AclConvAddOp& operator=(AclConvAddOp&&) = delete;
acl::AclParameters& getargs() { return args; }
void InitAclLayer(const FusionConvAddParam& param) {
setTargetHint(acl::TargetHint::OPENCL);
arm_compute::TensorShape input_shape(args.in_cols, args.in_rows,
args.in_depth, args.batch);
arm_compute::TensorShape output_shape(args.out_cols, args.out_rows,
args.out_depth, args.out_num);
arm_compute::TensorShape weights_shape(args.filter_cols, args.filter_rows,
args.in_depth / args.num_group,
args.out_depth);
arm_compute::TensorShape biases_shape(args.out_depth);
arm_compute::PadStrideInfo conv_info(
args.stride_cols, args.stride_rows, args.pad_cols, args.pad_rows,
arm_compute::DimensionRoundingType::FLOOR);
if (is_operator_init_done(input_shape)) return;
set_operator_init_done();
this->force_bypass_acl_path_ = false;
check_direct_conv();
//[kernel_x, kernel_y, IFM, OFM]
new_tensor(weights(), weights_shape, args.weight_data);
//[OFM]
if (args.biases_data) {
new_tensor(biases(), biases_shape, args.biases_data);
}
group() = args.num_group;
//[width, height, IFM]
new_tensor(input(), input_shape, args.input_data);
//[width, height, OFM]
new_tensor(output(), output_shape, args.output_data);
acl_configure(conv, this, conv_info);
}
void RunAcl(void* input, void* output) {
acl::ACLOperator::acl_run(input, output);
}
bool Bypass_acl(const FusionConvAddParam& param) {
bool bypass_acl = false;
AclParametersByContext(param);
    // many groups hurt GPU performance, so bypass ACL for group counts >= 5
if (this->force_bypass_acl_path_ || args.num_group >= 5) {
bypass_acl = true;
}
if (args.dim > 2) {
bypass_acl = true;
}
if (args.dilated) {
bypass_acl = true;
}
return bypass_acl;
}
private:
void check_direct_conv() {
    bool use_direct_conv = false;
    const char* pDirectConv = getenv("DIRECTCONV");
    if (pDirectConv) {
      int bdirectconv = 0;
      sscanf(pDirectConv, "%d", &bdirectconv);
      if (bdirectconv != 0) {
        use_direct_conv = true;
        printf("DIRECTCONV<%s>\n", pDirectConv);
        printf("DIRECTCONV: %d\n", static_cast<int>(use_direct_conv));
      }
    }
int pad_data[2], kernel[2];
pad_data[1] = args.pad_rows;
pad_data[0] = args.pad_cols;
kernel[1] = args.filter_rows;
kernel[0] = args.filter_cols;
if (use_direct_conv && ((kernel[0] == 1 && kernel[1] == 1 &&
pad_data[0] == 0 && pad_data[1] == 0) ||
(kernel[0] == 3 && kernel[1] == 3 &&
pad_data[0] <= 1 && pad_data[1] <= 1))) {
setConvMethod(); // NEDirectConvolutionLayer only for 1x1 and 3x3
}
}
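  // Usage sketch (hypothetical invocation): exporting DIRECTCONV=1 before
  // running opts in to the direct-convolution path; the shape check above
  // restricts it to 1x1/pad-0 and 3x3/pad<=1 kernels.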
void AclParametersByContext(const FusionConvAddParam& param) {
const Tensor* input = param.Input();
Tensor filter = *param.Filter();
Tensor* output = param.Output();
    Tensor* bias = nullptr;
int groups = param.Groups();
std::vector<int> strides = param.Strides();
std::vector<int> paddings = param.Paddings();
std::vector<int> dilations = param.Dilations();
const T* input_data = input->data<T>();
T* output_data = output->mutable_data<T>();
const T* weight_data = filter.data<T>();
args.input_data = (void*)input_data;
args.output_data = (void*)output_data;
args.weight_data = (void*)weight_data;
args.biases_data = nullptr;
    try {
      bias = param.Bias();
    } catch (const std::exception& e) {
      // Bias is optional; leave args.biases_data as nullptr if absent.
    }
if (bias) {
const T* biases_data = bias->data<T>();
args.biases_data = (void*)biases_data;
}
args.num_group = groups;
args.dilation_rows = dilations[0];
args.dilation_cols = dilations[1];
if (dilations[0] != 1 || dilations[1] != 1) {
args.dilated = true;
}
// NCHW
// std::cout << "In dims: " << (input->dims()).size() << std::endl;
args.batch = input->dims()[0];
args.in_depth = input->dims()[1];
args.in_rows = input->dims()[2];
args.in_cols = input->dims()[3];
// std::cout <<"In N: " << args.batch << " C: " << args.in_depth
// << " H: " << args.in_rows << " W: " << args.in_cols << "\n";
// NCHW
// std::cout << "Out dims: " << (output->dims()).size() << std::endl;
args.out_num = output->dims()[0];
args.out_depth = output->dims()[1];
args.out_rows = output->dims()[2];
args.out_cols = output->dims()[3];
// std::cout <<"Out N: " << static_cast<int>(output->dims()[0])
// << " C: " << args.out_depth
// << " H: " << args.out_rows << " W: " << args.out_cols << "\n";
// MCHW = OIHW
args.filter_rows = filter.dims()[2];
args.filter_cols = filter.dims()[3];
// std::cout <<"Filter O: " << static_cast<int>(filter.dims()[0])
// << " I: " << static_cast<int>(filter.dims()[1])
// << " H: " << args.filter_rows << " W: " << args.filter_cols << "\n";
// strides(h_stride, w_stride)
args.stride_rows = strides[0];
args.stride_cols = strides[1];
// std::cout <<"Stride H: " << args.stride_rows << " W: " <<
// args.stride_cols << "\n";
// paddings(h_pad, w_pad)
args.pad_rows = paddings[0];
args.pad_cols = paddings[1];
// std::cout <<"Pad H: " << args.pad_rows << " W: " << args.pad_cols <<
// "\n";
}
acl::AclParameters args;
};
template <>
bool ConvAddKernel<GPU_MALI, float>::Init(
const FusionConvAddParam& param) const {
AclConvAddOp<GPU_MALI, float>* acl_op =
reinterpret_cast<AclConvAddOp<GPU_MALI, float>*>(this->GetAclOp());
if (acl_op == nullptr) {
acl_op = new AclConvAddOp<GPU_MALI, float>();
this->SetAclOp((void*)acl_op, (void*)this);
}
return true;
}
template <>
void ConvAddKernel<GPU_MALI, float>::Compute(
const FusionConvAddParam& param) const {
std::cout << "init acl" << std::endl;
AclConvAddOp<GPU_MALI, float>* acl_op =
reinterpret_cast<AclConvAddOp<GPU_MALI, float>*>(this->GetAclOp());
if (acl_op == nullptr) {
return;
}
if (acl_op->Bypass_acl(param)) {
std::cout << "init acl failed" << std::endl;
return;
}
acl::AclParameters& args = acl_op->getargs();
const float* input_data = (const float*)args.input_data;
const float* output_data = (const float*)args.output_data;
acl_op->InitAclLayer(param);
acl_op->RunAcl((void*)input_data, (void*)output_data);
}
template class ConvAddKernel<GPU_MALI, float>;
} // namespace operators
} // namespace paddle_mobile
#endif
#endif
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifdef CONV_OP
#include "operators/kernel/conv_kernel.h"
#ifdef PADDLE_MOBILE_MALI_GPU
#include "acl_operator.h"
#include "framework/operator.h"
#include "operators/op_param.h"
namespace paddle_mobile {
namespace operators {
template <typename DeviceType, typename T>
class AclConvOp : public acl::ACLOperator {
public:
AclConvOp() {
this->force_bypass_acl_path_ =
bypass_acl_class_layer & FLAGS_ENABLE_ACL_CONV;
}
~AclConvOp() = default;
AclConvOp(const AclConvOp&) = delete;
AclConvOp& operator=(const AclConvOp&) = delete;
AclConvOp(AclConvOp&&) = delete;
AclConvOp& operator=(AclConvOp&&) = delete;
acl::AclParameters& getargs() { return args; }
void InitAclLayer(const ConvParam& param) {
setTargetHint(acl::TargetHint::OPENCL);
arm_compute::TensorShape input_shape(args.in_cols, args.in_rows,
args.in_depth, args.batch);
arm_compute::TensorShape output_shape(args.out_cols, args.out_rows,
args.out_depth, args.out_num);
arm_compute::TensorShape weights_shape(args.filter_cols, args.filter_rows,
args.in_depth / args.num_group,
args.out_depth);
// arm_compute::TensorShape biases_shape(args.out_depth);
arm_compute::PadStrideInfo conv_info(
args.stride_cols, args.stride_rows, args.pad_cols, args.pad_rows,
arm_compute::DimensionRoundingType::FLOOR);
if (is_operator_init_done(input_shape)) return;
set_operator_init_done();
this->force_bypass_acl_path_ = false;
check_direct_conv();
//[kernel_x, kernel_y, IFM, OFM]
new_tensor(weights(), weights_shape, args.weight_data);
//[OFM]
// if (args.biases_data) {
// new_tensor(biases(),biases_shape,args.biases_data);
//}
group() = args.num_group;
//[width, height, IFM]
new_tensor(input(), input_shape, args.input_data);
//[width, height, OFM]
new_tensor(output(), output_shape, args.output_data);
acl_configure(conv, this, conv_info);
}
void RunAcl(void* input, void* output) {
acl::ACLOperator::acl_run(input, output);
}
bool Bypass_acl(const ConvParam& param) {
bool bypass_acl = false;
AclParametersByContext(param);
    // many groups hurt GPU performance, so bypass ACL for group counts >= 5
if (this->force_bypass_acl_path_ || args.num_group >= 5) {
bypass_acl = true;
}
if (args.dim > 2) {
bypass_acl = true;
}
if (args.dilated) {
bypass_acl = true;
}
return bypass_acl;
}
private:
void check_direct_conv() {
    bool use_direct_conv = false;
    const char* pDirectConv = getenv("DIRECTCONV");
    if (pDirectConv) {
      int bdirectconv = 0;
      sscanf(pDirectConv, "%d", &bdirectconv);
      if (bdirectconv != 0) {
        use_direct_conv = true;
        printf("DIRECTCONV<%s>\n", pDirectConv);
        printf("DIRECTCONV: %d\n", static_cast<int>(use_direct_conv));
      }
    }
int pad_data[2], kernel[2];
pad_data[1] = args.pad_rows;
pad_data[0] = args.pad_cols;
kernel[1] = args.filter_rows;
kernel[0] = args.filter_cols;
if (use_direct_conv && ((kernel[0] == 1 && kernel[1] == 1 &&
pad_data[0] == 0 && pad_data[1] == 0) ||
(kernel[0] == 3 && kernel[1] == 3 &&
pad_data[0] <= 1 && pad_data[1] <= 1))) {
setConvMethod(); // NEDirectConvolutionLayer only for 1x1 and 3x3
}
}
void AclParametersByContext(const ConvParam& param) {
const Tensor* input = param.Input();
Tensor filter = *param.Filter();
Tensor* output = param.Output();
int groups = param.Groups();
std::vector<int> strides = param.Strides();
std::vector<int> paddings = param.Paddings();
std::vector<int> dilations = param.Dilations();
const T* input_data = input->data<T>();
T* output_data = output->mutable_data<T>();
const T* weight_data = filter.data<T>();
args.input_data = (void*)input_data;
args.output_data = (void*)output_data;
args.weight_data = (void*)weight_data;
args.biases_data = nullptr;
// try {
// bias = context.Input<framework::Tensor>("Bias");
// } catch (const std::exception& e) {
// }
// if (bias) {
// const T* biases_data = bias->data<T>();
// args.biases_data = (void*)biases_data;
// }
args.num_group = groups;
args.dilation_rows = dilations[0];
args.dilation_cols = dilations[1];
if (dilations[0] != 1 || dilations[1] != 1) {
args.dilated = true;
}
// NCHW
// std::cout << "In dims: " << (input->dims()).size() << std::endl;
args.batch = input->dims()[0];
args.in_depth = input->dims()[1];
args.in_rows = input->dims()[2];
args.in_cols = input->dims()[3];
std::cout << "In N: " << args.batch << " C: " << args.in_depth
<< " H: " << args.in_rows << " W: " << args.in_cols << "\n";
// NCHW
// std::cout << "Out dims: " << (output->dims()).size() << std::endl;
args.out_num = output->dims()[0];
args.out_depth = output->dims()[1];
args.out_rows = output->dims()[2];
args.out_cols = output->dims()[3];
// std::cout <<"Out N: " << static_cast<int>(output->dims()[0])
// << " C: " << args.out_depth
// << " H: " << args.out_rows << " W: " << args.out_cols << "\n";
// MCHW = OIHW
args.filter_rows = filter.dims()[2];
args.filter_cols = filter.dims()[3];
// std::cout <<"Filter O: " << static_cast<int>(filter.dims()[0])
// << " I: " << static_cast<int>(filter.dims()[1])
// << " H: " << args.filter_rows << " W: " << args.filter_cols << "\n";
// strides(h_stride, w_stride)
args.stride_rows = strides[0];
args.stride_cols = strides[1];
// std::cout <<"Stride H: " << args.stride_rows << " W: " <<
// args.stride_cols << "\n";
// paddings(h_pad, w_pad)
args.pad_rows = paddings[0];
args.pad_cols = paddings[1];
// std::cout <<"Pad H: " << args.pad_rows << " W: " << args.pad_cols <<
// "\n";
}
acl::AclParameters args;
};
template <>
bool ConvKernel<GPU_MALI, float>::Init(const ConvParam& param) const {
AclConvOp<GPU_MALI, float>* acl_op =
reinterpret_cast<AclConvOp<GPU_MALI, float>*>(this->GetAclOp());
if (acl_op == nullptr) {
acl_op = new AclConvOp<GPU_MALI, float>();
this->SetAclOp((void*)acl_op, (void*)this);
}
return true;
}
template <>
void ConvKernel<GPU_MALI, float>::Compute(const ConvParam& param) const {
std::cout << "init acl" << std::endl;
AclConvOp<GPU_MALI, float>* acl_op =
reinterpret_cast<AclConvOp<GPU_MALI, float>*>(this->GetAclOp());
if (acl_op == nullptr) {
return;
}
if (acl_op->Bypass_acl(param)) {
std::cout << "init acl failed" << std::endl;
return;
}
acl::AclParameters& args = acl_op->getargs();
const float* input_data = (const float*)args.input_data;
const float* output_data = (const float*)args.output_data;
acl_op->InitAclLayer(param);
acl_op->RunAcl((void*)input_data, (void*)output_data);
}
template class ConvKernel<GPU_MALI, float>;
} // namespace operators
} // namespace paddle_mobile
#endif
#endif
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifdef ELEMENTWISEADD_OP
#pragma once
#include "operators/kernel/elementwise_add_kernel.h"
namespace paddle_mobile {
namespace operators {
template <typename T>
struct AddFunctor {
inline T operator()(T a, T b) const { return a + b; }
};
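// ElementwiseComputeEx applies AddFunctor element-wise, broadcasting input_y
// along `axis`. Assumed semantics (the usual Paddle broadcast rule): x of
// shape [N, C, H, W] plus y of shape [C] with axis == 1 adds y[c] to every
// x[n][c][h][w].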
template <>
bool ElementwiseAddKernel<GPU_MALI, float>::Init(
const ElementwiseAddParam &para) const {
return true;
}
template <>
void ElementwiseAddKernel<GPU_MALI, float>::Compute(
const ElementwiseAddParam &param) const {
const Tensor *input_x = param.InputX();
const Tensor *input_y = param.InputY();
Tensor *Out = param.Out();
Out->mutable_data<float>();
int axis = param.Axis();
ElementwiseComputeEx<AddFunctor<float>, float>(input_x, input_y, axis,
AddFunctor<float>(), Out);
}
template class ElementwiseAddKernel<GPU_MALI, float>;
} // namespace operators
} // namespace paddle_mobile
#endif
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifdef FUSION_FC_OP
#pragma once
#include "operators/kernel/fusion_fc_kernel.h"
namespace paddle_mobile {
namespace operators {
template <>
bool FusionFcKernel<GPU_MALI, float>::Init(const FusionFcParam &para) const {
return true;
}
template <>
void FusionFcKernel<GPU_MALI, float>::Compute(
const FusionFcParam &param) const {
const Tensor *input_x = param.InputX();
const Tensor *input_y = param.InputY();
const Tensor *input_z = param.InputZ();
auto *input_z_data = input_z->data<float>();
int axis = param.Axis();
Tensor *out = param.Out();
auto *out_data = out->mutable_data<float>();
const Tensor x_matrix =
input_x->dims().size() > 2
? framework::ReshapeToMatrix(*input_x, param.XNumColDims())
: *input_x;
const Tensor y_matrix =
input_y->dims().size() > 2
? framework::ReshapeToMatrix(*input_y, param.YNumColDims())
: *input_y;
auto out_dim = out->dims();
if (out_dim.size() != 2) {
out->Resize({x_matrix.dims()[0], y_matrix.dims()[1]});
}
PADDLE_MOBILE_ENFORCE(out_dim.size() == 2, " out_dim.size must be 2.");
  PADDLE_MOBILE_ENFORCE(input_z->dims().size() == 1,
                        "input_z dims size must be 1");
  PADDLE_MOBILE_ENFORCE(out_dim[1] == input_z->dims()[0],
                        "out_dim[1] must equal input_z dims[0]");
  axis = (axis == -1 ? out_dim.size() - input_z->dims().size() : axis);
  PADDLE_MOBILE_ENFORCE(axis == 1, "to fit broadcast, axis must be 1.");
int64_t classes = input_z->numel();
for (int i = 0; i < out_dim[0]; i++) {
memory::Copy(out_data + i * classes, input_z_data, sizeof(float) * classes);
}
for (int i = 0; i < out->numel(); i++) {
DLOG << out_data[i];
}
math::matmul<float>(x_matrix, false, y_matrix, false, static_cast<float>(1),
out, static_cast<float>(1));
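  // Net effect: out = x_matrix * y_matrix + broadcast(input_z). The bias is
  // first copied into every row of `out`, and matmul is then called with
  // alpha == 1 and beta == 1 so the GEMM accumulates into the pre-filled bias.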
PADDLE_MOBILE_ENFORCE(out_dim.size() == 2, " out_dim.size must be 2.");
// if (out_dim.size() != 2) {
// out->Resize(out_dim);
// }
}
} // namespace operators
} // namespace paddle_mobile
#endif
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifdef LRN_OP
#pragma once
#include "operators/kernel/lrn_kernel.h"
#ifdef PADDLE_MOBILE_MALI_GPU
#include "acl_operator.h"
#include "framework/operator.h"
#include "operators/op_param.h"
namespace paddle_mobile {
namespace operators {
template <typename DeviceType, typename T>
class AclLrnOp : public acl::ACLOperator {
public:
AclLrnOp() {
this->force_bypass_acl_path_ =
bypass_acl_class_layer & FLAGS_ENABLE_ACL_LRN;
}
~AclLrnOp() = default;
AclLrnOp(const AclLrnOp&) = delete;
AclLrnOp& operator=(const AclLrnOp&) = delete;
AclLrnOp(AclLrnOp&&) = delete;
AclLrnOp& operator=(AclLrnOp&&) = delete;
acl::AclParameters& getargs() { return args; }
void InitAclLayer(const LrnParam& param) {
setTargetHint(acl::TargetHint::OPENCL);
arm_compute::TensorShape shape(args.in_cols, args.in_rows, args.in_depth);
if (is_operator_init_done(shape)) return;
set_operator_init_done();
this->force_bypass_acl_path_ = false;
arm_compute::NormalizationLayerInfo norm_info(
arm_compute::NormType::CROSS_MAP, args.nsize, args.alpha, args.beta,
args.knorm);
//[width, height, IFM]
new_tensor(input(), shape, args.input_data);
//[width, height, OFM]
new_tensor(output(), shape, args.output_data);
acl_configure(lrn, this, norm_info);
}
void RunAcl(void* input, void* output) {
acl::ACLOperator::acl_run(input, output);
}
bool Bypass_acl(const LrnParam& param) {
bool bypass_acl = false;
AclParametersByContext(param);
    // bypass the ACL path when requested via the bypass flags
if (this->force_bypass_acl_path_) {
bypass_acl = true;
}
return bypass_acl;
}
private:
void AclParametersByContext(const LrnParam& param) {
const Tensor* in_x = param.InputX();
Tensor* out = param.Out();
int n = param.N();
T alpha = param.Alpha();
T beta = param.Beta();
T k = param.K();
const T* input_data = in_x->data<T>();
T* output_data = out->mutable_data<T>();
args.input_data = (void*)input_data;
args.output_data = (void*)output_data;
args.nsize = n;
args.alpha = alpha;
args.beta = beta;
args.knorm = k;
// NCHW
args.batch = in_x->dims()[0];
args.in_depth = in_x->dims()[1];
args.in_rows = in_x->dims()[2];
args.in_cols = in_x->dims()[3];
// std::cout
// << "Out C: " << args.out_depth
// << " H: " << args.out_rows << " W: " << args.out_cols << "\n";
}
acl::AclParameters args;
};
template <>
bool LrnKernel<GPU_MALI, float>::Init(const LrnParam& param) const {
AclLrnOp<GPU_MALI, float>* acl_op =
reinterpret_cast<AclLrnOp<GPU_MALI, float>*>(this->GetAclOp());
if (acl_op == nullptr) {
acl_op = new AclLrnOp<GPU_MALI, float>();
this->SetAclOp((void*)acl_op, (void*)this);
}
return true;
}
template <>
void LrnKernel<GPU_MALI, float>::Compute(const LrnParam& param) const {
std::cout << "init acl" << std::endl;
AclLrnOp<GPU_MALI, float>* acl_op =
reinterpret_cast<AclLrnOp<GPU_MALI, float>*>(this->GetAclOp());
if (acl_op == nullptr) {
return;
}
if (acl_op->Bypass_acl(param)) {
std::cout << "init acl failed" << std::endl;
return;
}
acl::AclParameters& args = acl_op->getargs();
const float* input_data = (const float*)args.input_data;
const float* output_data = (const float*)args.output_data;
acl_op->InitAclLayer(param);
for (int n = 0; n < args.batch; ++n) {
acl_op->RunAcl((void*)input_data, (void*)output_data);
input_data += args.in_depth * args.in_cols * args.in_rows;
output_data += args.in_depth * args.in_cols * args.in_rows;
}
}
template class LrnKernel<GPU_MALI, float>;
} // namespace operators
} // namespace paddle_mobile
#endif
#endif
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifdef MUL_OP
#pragma once
#include "operators/kernel/mul_kernel.h"
namespace paddle_mobile {
namespace operators {
template <>
bool MulKernel<GPU_MALI, float>::Init(const MulParam &para) const {
return true;
}
template <>
void MulKernel<GPU_MALI, float>::Compute(const MulParam &param) const {
const Tensor *input_x = param.InputX();
const Tensor *input_y = param.InputY();
Tensor *out = param.Out();
out->mutable_data<float>();
const Tensor x_matrix =
input_x->dims().size() > 2
? framework::ReshapeToMatrix(*input_x, param.XNumColDims())
: *input_x;
const Tensor y_matrix =
input_y->dims().size() > 2
? framework::ReshapeToMatrix(*input_y, param.YNumColDims())
: *input_y;
auto out_dim = out->dims();
if (out_dim.size() != 2) {
out->Resize({x_matrix.dims()[0], y_matrix.dims()[1]});
}
math::matmul<float>(x_matrix, false, y_matrix, false, static_cast<float>(1),
out, static_cast<float>(0));
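  // Plain GEMM: out = x_matrix * y_matrix (alpha == 1, beta == 0); the
  // original >2-D output shape, if any, is restored below.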
if (out_dim.size() != 2) {
out->Resize(out_dim);
}
}
template class MulKernel<GPU_MALI, float>;
} // namespace operators
} // namespace paddle_mobile
#endif
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifdef POOL_OP
#pragma once
#include "operators/kernel/pool_kernel.h"
#ifdef PADDLE_MOBILE_MALI_GPU
#include "acl_operator.h"
#include "framework/operator.h"
#include "operators/op_param.h"
namespace paddle_mobile {
namespace operators {
template <typename DeviceType, typename T>
class AclPoolOp : public acl::ACLOperator {
public:
AclPoolOp() {
this->force_bypass_acl_path_ =
bypass_acl_class_layer & FLAGS_ENABLE_ACL_POOLING;
}
~AclPoolOp() = default;
AclPoolOp(const AclPoolOp&) = delete;
AclPoolOp& operator=(const AclPoolOp&) = delete;
AclPoolOp(AclPoolOp&&) = delete;
AclPoolOp& operator=(AclPoolOp&&) = delete;
acl::AclParameters& getargs() { return args; }
void InitAclLayer(const PoolParam& param) {
setTargetHint(acl::TargetHint::OPENCL);
arm_compute::TensorShape input_shape(args.in_cols, args.in_rows,
args.in_depth);
arm_compute::TensorShape output_shape(args.out_cols, args.out_rows,
args.out_depth);
// arm_compute::TensorShape weights_shape(
// args.filter_cols, args.filter_rows, args.in_depth, args.out_depth);
// arm_compute::TensorShape biases_shape(args.out_depth);
arm_compute::PoolingLayerInfo pool_info;
if (args.pool_type == "max") {
pool_info = arm_compute::PoolingLayerInfo(
arm_compute::PoolingType::MAX, args.filter_rows,
arm_compute::PadStrideInfo(args.stride_cols, args.stride_rows,
args.pad_cols, args.pad_rows,
arm_compute::DimensionRoundingType::CEIL));
} else {
pool_info = arm_compute::PoolingLayerInfo(
arm_compute::PoolingType::AVG, args.filter_rows,
arm_compute::PadStrideInfo(args.stride_cols, args.stride_rows,
args.pad_cols, args.pad_rows,
arm_compute::DimensionRoundingType::CEIL));
}
if (is_operator_init_done(input_shape)) return;
set_operator_init_done();
this->force_bypass_acl_path_ = false;
//[width, height, IFM]
new_tensor(input(), input_shape, args.input_data);
//[width, height, OFM]
new_tensor(output(), output_shape, args.output_data);
acl_configure(pooling, this, pool_info);
}
void RunAcl(void* input, void* output) {
acl::ACLOperator::acl_run(input, output);
}
bool Bypass_acl(const PoolParam& param) {
bool bypass_acl = false;
AclParametersByContext(param);
    // bypass the ACL path when requested via the bypass flags
if (this->force_bypass_acl_path_) {
bypass_acl = true;
}
if (args.pool_type != "max" && args.pool_type != "avg") {
bypass_acl = true;
}
if (args.filter_rows != args.filter_cols) {
bypass_acl = true;
}
// if (args.filter_rows!=2 && args.filter_rows!=3) {
// bypass_acl = true;
// }
return bypass_acl;
}
private:
void AclParametersByContext(const PoolParam& param) {
const Tensor* in_x = param.Input();
Tensor* out = param.Output();
std::string pooling_type = param.PoolingType();
std::vector<int> ksize = param.Ksize();
std::vector<int> strides = param.Strides();
std::vector<int> paddings = param.Paddings();
bool is_global_pooling = param.isGlobalPooling();
const T* input_data = in_x->data<T>();
T* output_data = out->mutable_data<T>();
args.input_data = (void*)input_data;
args.output_data = (void*)output_data;
args.is_global_pool = is_global_pooling;
args.pool_type = pooling_type;
args.filter_rows = ksize[0];
args.filter_cols = ksize[1];
args.dim = ksize.size();
// NCHW
args.batch = in_x->dims()[0];
args.in_depth = in_x->dims()[1];
args.in_rows = in_x->dims()[2];
args.in_cols = in_x->dims()[3];
// std::cout <<"In N: " << args.batch << " C: " << args.in_depth
// << " H: " << args.in_rows << " W: " << args.in_cols << "\n";
// NCHW
// std::cout <<"Out N: " << static_cast<int>(output->dims()[0])
// << " C: " << args.out_depth
// << " H: " << args.out_rows << " W: " << args.out_cols << "\n";
// MCHW = OIHW
// std::cout <<"Filter O: " << static_cast<int>(filter->dims()[0])
// << " I: " << static_cast<int>(filter->dims()[1])
// << " H: " << args.filter_rows << " W: " << args.filter_cols << "\n";
// strides(h_stride, w_stride)
args.stride_rows = strides[0];
args.stride_cols = strides[1];
// std::cout <<"PoolingType: " << args.pool_type << "\n";
// std::cout <<"Stride H: " << args.stride_rows << " W: " <<
// args.stride_cols << "\n";
// paddings(h_pad, w_pad)
args.pad_rows = paddings[0];
args.pad_cols = paddings[1];
// std::cout <<"Pad H: " << args.pad_rows << " W: " << args.pad_cols <<
// "\n";
    args.out_depth = args.in_depth;
    // apply global pooling before deriving the output size; otherwise the
    // stale ksize would be used and out_rows/out_cols would not be 1
    if (is_global_pooling) {
      args.filter_rows = args.in_rows;
      args.filter_cols = args.in_cols;
      args.pad_rows = 0;
      args.pad_cols = 0;
    }
    // args.out_rows = out->dims()[2];
    // args.out_cols = out->dims()[3];
    args.out_rows = static_cast<int>(ceil(static_cast<float>(args.in_rows +
                                                             2 * args.pad_rows -
                                                             args.filter_rows) /
                                          args.stride_rows)) +
                    1;
    args.out_cols = static_cast<int>(ceil(static_cast<float>(args.in_cols +
                                                             2 * args.pad_cols -
                                                             args.filter_cols) /
                                          args.stride_cols)) +
                    1;
}
acl::AclParameters args;
};
template <>
bool PoolKernel<GPU_MALI, float>::Init(const PoolParam& param) const {
AclPoolOp<GPU_MALI, float>* acl_op =
reinterpret_cast<AclPoolOp<GPU_MALI, float>*>(this->GetAclOp());
if (acl_op == nullptr) {
acl_op = new AclPoolOp<GPU_MALI, float>();
this->SetAclOp((void*)acl_op, (void*)this);
}
return true;
}
template <>
void PoolKernel<GPU_MALI, float>::Compute(const PoolParam& param) const {
std::cout << "init acl" << std::endl;
AclPoolOp<GPU_MALI, float>* acl_op =
reinterpret_cast<AclPoolOp<GPU_MALI, float>*>(this->GetAclOp());
if (acl_op == nullptr) {
return;
}
if (acl_op->Bypass_acl(param)) {
std::cout << "init acl failed" << std::endl;
return;
}
acl::AclParameters& args = acl_op->getargs();
const float* input_data = (const float*)args.input_data;
const float* output_data = (const float*)args.output_data;
acl_op->InitAclLayer(param);
for (int n = 0; n < args.batch; ++n) {
acl_op->RunAcl((void*)input_data, (void*)output_data);
input_data += args.in_depth * args.in_cols * args.in_rows;
output_data += args.in_depth * args.out_cols * args.out_rows;
}
}
template class PoolKernel<GPU_MALI, float>;
} // namespace operators
} // namespace paddle_mobile
#endif
#endif
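The pool kernel derives the output spatial size by hand instead of reading it back from out->dims(). A minimal self-check sketch of that arithmetic (the helper name is illustrative; only the formula is taken from AclParametersByContext above):

#include <cmath>
// out = ceil((in + 2*pad - filter) / stride) + 1, matching the
// arm_compute::DimensionRoundingType::CEIL rounding chosen in InitAclLayer.
static int PoolOutSizeCeil(int in, int filter, int pad, int stride) {
  return static_cast<int>(
             std::ceil(static_cast<float>(in + 2 * pad - filter) / stride)) +
         1;
}
// Example: a 7x7 input, 2x2 window, stride 2, pad 0 ->
// PoolOutSizeCeil(7, 2, 0, 2) == 4, i.e. a 4x4 output.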
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifdef RELU_OP
#pragma once
#include "operators/kernel/relu_kernel.h"
#ifdef PADDLE_MOBILE_MALI_GPU
#include "acl_operator.h"
#include "framework/operator.h"
#include "operators/op_param.h"
namespace paddle_mobile {
namespace operators {
template <typename DeviceType, typename T>
class AclReluOp : public acl::ACLOperator {
public:
AclReluOp() {
this->force_bypass_acl_path_ =
bypass_acl_class_layer & FLAGS_ENABLE_ACL_RELU;
}
~AclReluOp() = default;
AclReluOp(const AclReluOp&) = delete;
AclReluOp& operator=(const AclReluOp&) = delete;
AclReluOp(AclReluOp&&) = delete;
AclReluOp& operator=(AclReluOp&&) = delete;
acl::AclParameters& getargs() { return args; }
void InitAclLayer(const ReluParam& param) {
setTargetHint(acl::TargetHint::OPENCL);
arm_compute::TensorShape input_shape(args.in_cols * args.in_rows *
args.in_depth * args.batch);
arm_compute::TensorShape output_shape(args.in_cols * args.in_rows *
args.in_depth * args.out_num);
// arm_compute::TensorShape weights_shape(
// args.filter_cols, args.filter_rows, args.in_depth, args.out_depth);
// arm_compute::TensorShape biases_shape(args.out_depth);
arm_compute::ActivationLayerInfo::ActivationFunction type;
type = arm_compute::ActivationLayerInfo::ActivationFunction::RELU;
arm_compute::ActivationLayerInfo act_info(type);
if (is_operator_init_done(input_shape)) return;
set_operator_init_done();
this->force_bypass_acl_path_ = false;
//[width, height, IFM]
new_tensor(input(), input_shape, args.input_data);
//[width, height, OFM]
new_tensor(output(), output_shape, args.output_data);
acl_configure(activation, this, act_info);
}
void RunAcl(void* input, void* output) {
acl::ACLOperator::acl_run(input, output);
}
bool Bypass_acl(const ReluParam& param) {
bool bypass_acl = false;
AclParametersByContext(param);
    // fall back when the ACL path is force-bypassed for this op type
if (this->force_bypass_acl_path_) {
bypass_acl = true;
}
return bypass_acl;
}
private:
void AclParametersByContext(const ReluParam& param) {
const auto* input_x = param.InputX();
auto* out = param.Out();
const T* input_data = input_x->data<T>();
T* output_data = out->mutable_data<T>();
args.input_data = (void*)input_data;
args.output_data = (void*)output_data;
args.batch = input_x->dims()[0];
args.in_depth = input_x->dims()[1];
args.in_rows = input_x->dims()[2];
args.in_cols = input_x->dims()[3];
args.out_num = out->dims()[0];
}
acl::AclParameters args;
};
template <>
bool ReluKernel<GPU_MALI, float>::Init(const ReluParam& param) const {
AclReluOp<GPU_MALI, float>* acl_op =
reinterpret_cast<AclReluOp<GPU_MALI, float>*>(this->GetAclOp());
if (acl_op == nullptr) {
acl_op = new AclReluOp<GPU_MALI, float>();
this->SetAclOp((void*)acl_op, (void*)this);
}
return true;
}
template <>
void ReluKernel<GPU_MALI, float>::Compute(const ReluParam& param) const {
std::cout << "init acl" << std::endl;
AclReluOp<GPU_MALI, float>* acl_op =
reinterpret_cast<AclReluOp<GPU_MALI, float>*>(this->GetAclOp());
if (acl_op == nullptr) {
return;
}
if (acl_op->Bypass_acl(param)) {
std::cout << "init acl failed" << std::endl;
return;
}
acl::AclParameters& args = acl_op->getargs();
const float* input_data = (const float*)args.input_data;
const float* output_data = (const float*)args.output_data;
acl_op->InitAclLayer(param);
acl_op->RunAcl((void*)input_data, (void*)output_data);
}
template class ReluKernel<GPU_MALI, float>;
} // namespace operators
} // namespace paddle_mobile
#endif
#endif
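The ACL-backed kernels in this commit (pool, relu, softmax) all follow the same three-step protocol. A hedged usage sketch, with param construction elided because it is framework-specific:

// AclReluOp<GPU_MALI, float> op;
// if (!op.Bypass_acl(param)) {   // fills op.getargs() from param, checks limits
//   op.InitAclLayer(param);      // builds ACL tensors/layer once per input shape
//   acl::AclParameters &args = op.getargs();
//   op.RunAcl(args.input_data, args.output_data);  // runs the OpenCL layer
// }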
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifdef RESHAPE_OP
#pragma once
#include "operators/kernel/reshape_kernel.h"
namespace paddle_mobile {
namespace operators {
template <>
bool ReshapeKernel<GPU_MALI, float>::Init(const ReshapeParam &para) const {
return true;
}
template <>
void ReshapeKernel<GPU_MALI, float>::Compute(const ReshapeParam &param) const {
const auto *input_x = param.InputX();
const auto &input_x_dims = input_x->dims();
auto *out = param.Out();
framework::DDim out_dims = out->dims();
const auto *input_shape = param.InputShape();
if (input_shape) {
auto *shape_data = input_shape->data<int>();
framework::Tensor cpu_shape_tensor;
auto shape =
std::vector<int>(shape_data, shape_data + input_shape->numel());
out_dims = ValidateShape(shape, input_x->dims());
}
bool inplace = param.Inplace();
out->Resize(out_dims);
if (!inplace) {
out->mutable_data<float>();
framework::TensorCopy(*input_x, out);
out->Resize(out_dims);
} else {
out->ShareDataWith(*input_x);
out->Resize(out_dims);
}
}
} // namespace operators
} // namespace paddle_mobile
#endif
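ValidateShape is declared in reshape_kernel.h and its body is not shown here. The conventional rule such a helper implements, sketched under that assumption (illustrative name, not the actual source), is that a single -1 entry is inferred so the element count is preserved:

#include <vector>
static std::vector<int> InferReshapeSketch(std::vector<int> shape, int numel) {
  int known = 1;
  int neg = -1;
  for (int i = 0; i < static_cast<int>(shape.size()); ++i) {
    if (shape[i] == -1) {
      neg = i;  // at most one -1 entry is allowed
    } else {
      known *= shape[i];
    }
  }
  if (neg >= 0) {
    shape[neg] = numel / known;  // preserve the total element count
  }
  return shape;
}
// Example: numel = 24, shape = {2, -1, 4} -> {2, 3, 4}.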
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifdef SOFTMAX_OP
#pragma once
#include "operators/kernel/softmax_kernel.h"
#ifdef PADDLE_MOBILE_MALI_GPU
#include "acl_operator.h"
#include "framework/operator.h"
#include "operators/op_param.h"
namespace paddle_mobile {
namespace operators {
template <typename DeviceType, typename T>
class AclSoftmaxOp : public acl::ACLOperator {
public:
AclSoftmaxOp() {
this->force_bypass_acl_path_ =
bypass_acl_class_layer & FLAGS_ENABLE_ACL_SOFTMAX;
}
~AclSoftmaxOp() = default;
AclSoftmaxOp(const AclSoftmaxOp&) = delete;
AclSoftmaxOp& operator=(const AclSoftmaxOp&) = delete;
AclSoftmaxOp(AclSoftmaxOp&&) = delete;
AclSoftmaxOp& operator=(AclSoftmaxOp&&) = delete;
acl::AclParameters& getargs() { return args; }
void InitAclLayer(const SoftmaxParam& param) {
setTargetHint(acl::TargetHint::OPENCL);
arm_compute::TensorShape shape(args.in_depth, args.batch);
if (is_operator_init_done(shape)) return;
set_operator_init_done();
this->force_bypass_acl_path_ = false;
//[width, height, IFM]
new_tensor(input(), shape, args.input_data);
//[width, height, OFM]
new_tensor(output(), shape, args.output_data);
acl_configure(softmax, this, NULL);
}
void RunAcl(void* input, void* output) {
acl::ACLOperator::acl_run(input, output);
}
bool Bypass_acl(const SoftmaxParam& param) {
bool bypass_acl = false;
AclParametersByContext(param);
    // fall back when the ACL path is force-bypassed for this op type
if (this->force_bypass_acl_path_) {
bypass_acl = true;
}
return bypass_acl;
}
private:
void AclParametersByContext(const SoftmaxParam& param) {
const framework::Tensor* in_x = param.InputX();
framework::Tensor* out = param.Out();
auto x_dims = in_x->dims();
out->Resize(x_dims);
const T* input_data = in_x->data<T>();
    T* output_data = out->mutable_data<T>();  // allocate before writing
args.input_data = (void*)input_data;
args.output_data = (void*)output_data;
// NCHW
args.batch = in_x->dims()[0];
args.in_depth = in_x->dims()[1];
args.out_num = out->dims()[0];
// std::cout
// << "Out C: " << args.out_depth
// << " H: " << args.out_rows << " W: " << args.out_cols << "\n";
}
acl::AclParameters args;
};
template <>
bool SoftmaxKernel<GPU_MALI, float>::Init(const SoftmaxParam& param) const {
AclSoftmaxOp<GPU_MALI, float>* acl_op =
reinterpret_cast<AclSoftmaxOp<GPU_MALI, float>*>(this->GetAclOp());
if (acl_op == nullptr) {
acl_op = new AclSoftmaxOp<GPU_MALI, float>();
this->SetAclOp((void*)acl_op, (void*)this);
}
return true;
}
template <>
void SoftmaxKernel<GPU_MALI, float>::Compute(const SoftmaxParam& param) const {
std::cout << "init acl" << std::endl;
AclSoftmaxOp<GPU_MALI, float>* acl_op =
reinterpret_cast<AclSoftmaxOp<GPU_MALI, float>*>(this->GetAclOp());
if (acl_op == nullptr) {
return;
}
if (acl_op->Bypass_acl(param)) {
std::cout << "init acl failed" << std::endl;
return;
}
acl::AclParameters& args = acl_op->getargs();
const float* input_data = (const float*)args.input_data;
const float* output_data = (const float*)args.output_data;
acl_op->InitAclLayer(param);
for (int n = 0; n < args.out_num; ++n) {
acl_op->RunAcl((void*)input_data, (void*)output_data);
input_data += args.in_depth;
output_data += args.in_depth;
}
}
template class SoftmaxKernel<GPU_MALI, float>;
} // namespace operators
} // namespace paddle_mobile
#endif
#endif
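Compute above applies softmax row by row: for an [N, C] input, RunAcl is called N times with both pointers advanced by C floats per iteration. As a scalar reference for what each call normalizes (a sketch; whether ACL subtracts the row max internally is not visible from this file):

#include <algorithm>
#include <cmath>
static void SoftmaxRowReference(const float *x, float *y, int c) {
  float max_v = x[0];
  for (int i = 1; i < c; ++i) max_v = std::max(max_v, x[i]);  // for stability
  float sum = 0.f;
  for (int i = 0; i < c; ++i) {
    y[i] = std::exp(x[i] - max_v);
    sum += y[i];
  }
  for (int i = 0; i < c; ++i) y[i] /= sum;
}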
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */

#ifdef MUL_OP

#pragma once

#include "framework/operator.h"
#include "operators/math/math_function.h"
#include "operators/op_param.h"

namespace paddle_mobile {
namespace operators {

template <typename DeviceType, typename T>
class MulKernel : public framework::OpKernelBase<DeviceType, MulParam> {
 public:
  void Compute(const MulParam &param) const;
  bool Init(const MulParam &para) const;
};
} // namespace operators
} // namespace paddle_mobile

#endif
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */

#ifdef MULTICLASSNMS_OP

#pragma once

#include "framework/operator.h"
#include "operators/op_param.h"

namespace paddle_mobile {
namespace operators {

template <typename DeviceType, typename T>
class MultiClassNMSKernel
    : public framework::OpKernelBase<DeviceType, MultiClassNMSParam> {
 public:
  void Compute(const MultiClassNMSParam& param) const;
  bool Init(const MultiClassNMSParam& para) const;
};
} // namespace operators
} // namespace paddle_mobile

#endif
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */

#ifdef POOL_OP

#pragma once

#include "framework/operator.h"

namespace paddle_mobile {
namespace operators {

template <typename DeviceType, typename T>
class PoolKernel : public OpKernelBase<DeviceType, PoolParam> {
 public:
  void Compute(const PoolParam &param) const override;
  bool Init(const PoolParam &para) const;
};
} // namespace operators
} // namespace paddle_mobile

#endif
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */

#ifdef PRIORBOX_OP

#pragma once

#include <algorithm>
#include <cmath>
#include <vector>

#include "framework/operator.h"
#include "operators/math/transform.h"
#include "operators/op_param.h"

namespace paddle_mobile {
namespace operators {

template <typename DeviceType, typename T>
class PriorBoxKernel
    : public framework::OpKernelBase<DeviceType, PriorBoxParam> {
 public:
  void Compute(const PriorBoxParam& param) const;
  bool Init(const PriorBoxParam& para) const;
};
} // namespace operators
} // namespace paddle_mobile

#endif
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */

#ifdef RELU_OP

#pragma once

#include "framework/operator.h"
#include "operators/op_param.h"

namespace paddle_mobile {
namespace operators {

template <typename DeviceType, typename T>
class ReluKernel : public framework::OpKernelBase<DeviceType, ReluParam> {
 public:
  void Compute(const ReluParam& param) const;
  bool Init(const ReluParam& para) const;
};
} // namespace operators
} // namespace paddle_mobile

#endif
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */

#ifdef RESHAPE_OP

#pragma once

#include <vector>

#include "framework/operator.h"
#include "operators/op_param.h"

namespace paddle_mobile {
namespace operators {

template <typename DeviceType, typename T>
class ReshapeKernel : public framework::OpKernelBase<DeviceType, ReshapeParam> {
 public:
  void Compute(const ReshapeParam& param) const;
  bool Init(const ReshapeParam& para) const;
};
} // namespace operators
} // namespace paddle_mobile

#endif
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */

#ifdef SIGMOID_OP

#pragma once

#include "framework/operator.h"

namespace paddle_mobile {
namespace operators {

template <typename DeviceType, typename T>
class SigmoidKernel : public OpKernelBase<DeviceType, SigmoidParam> {
 public:
  void Compute(const SigmoidParam& param) const override;
  bool Init(const SigmoidParam& para) const;
};
} // namespace operators
} // namespace paddle_mobile

#endif
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */

#ifdef SOFTMAX_OP

#pragma once

#include "framework/operator.h"

namespace paddle_mobile {
namespace operators {

template <typename DeviceType, typename T>
class SoftmaxKernel : public OpKernelBase<DeviceType, SoftmaxParam> {
 public:
  void Compute(const SoftmaxParam &param) const override;
  bool Init(const SoftmaxParam &para) const;
};
} // namespace operators
} // namespace paddle_mobile

#endif
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */

#ifdef TRANSPOSE_OP

#pragma once

#include <vector>

#include "framework/operator.h"
#include "operators/op_param.h"

namespace paddle_mobile {
namespace operators {

template <typename DeviceType, typename T>
class TransposeKernel
    : public framework::OpKernelBase<DeviceType, TransposeParam> {
 public:
  void Compute(const TransposeParam& param) const;
  bool Init(const TransposeParam& para) const;
};
} // namespace operators
} // namespace paddle_mobile

#endif
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */

#ifdef LRN_OP

#include "lrn_op.h"

namespace paddle_mobile {
namespace operators {

template <typename Dtype, typename T>
void LrnOp<Dtype, T>::InferShape() const {
  auto x_dims = this->param_.InputX()->dims();
  this->param_.Out()->Resize(x_dims);
}
template class LrnOp<CPU, float>;

} // namespace operators
} // namespace paddle_mobile

namespace ops = paddle_mobile::operators;
#ifdef PADDLE_MOBILE_CPU
USE_OP_CPU(lrn);
REGISTER_OPERATOR_CPU(lrn, ops::LrnOp);
#endif
#ifdef PADDLE_MOBILE_MALI_GPU
USE_OP_MALI_GPU(lrn);
REGISTER_OPERATOR_MALI_GPU(lrn, ops::LrnOp);
#endif
#ifdef PADDLE_MOBILE_FPGA
#endif

#endif
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */

#ifdef LRN_OP

#pragma once

#include <string>

namespace paddle_mobile {
namespace operators {

using std::string;
template <typename DeviceType, typename T>
class LrnOp : public framework::OperatorWithKernel<
                  DeviceType, LrnParam, operators::LrnKernel<DeviceType, T>> {
 public:
  LrnOp(const string &type, const VariableNameMap &inputs,
        const VariableNameMap &outputs, const framework::AttributeMap &attrs,
        std::shared_ptr<framework::Scope> scope)
      : framework::OperatorWithKernel<DeviceType, LrnParam,
                                      operators::LrnKernel<DeviceType, T>>(
            type, inputs, outputs, attrs, scope) {}

  using framework::OperatorWithKernel<
      DeviceType, LrnParam,
      operators::LrnKernel<DeviceType, T>>::OperatorWithKernel;

  void InferShape() const override;

 protected:
};
} // namespace operators
} // namespace paddle_mobile

#endif
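With this change the dispatch that LrnOp::RunImpl used to do by hand moves into the OperatorWithKernel base, now templated on the param and kernel types. Conceptually the base provides something like the following (a sketch of the pattern only, not the actual framework source):

// template <typename Dev, typename ParamT, typename KernelT>
// class OperatorWithKernel : public OperatorBase<Dev> {
//  public:
//   void RunImpl() const { kernel_.Compute(param_); }
//  protected:
//   ParamT param_;    // built from inputs/outputs/attrs/scope
//   KernelT kernel_;  // e.g. LrnKernel<Dev, T>
// };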
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#if __ARM_NEON
#include <arm_neon.h>
#endif
#include "framework/ddim.h"
#include "framework/tensor.h"
namespace paddle_mobile {
namespace operators {
namespace math {
using framework::DDim;
using framework::Tensor;
inline int ConvOutputSize(int input_size, int filter_size, int dilation,
int padding, int stride) {
const int dkernel = dilation * (filter_size - 1) + 1;
int output_size = (input_size + 2 * padding - dkernel) / stride + 1;
return output_size;
}
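// Worked example: input_size 224, filter_size 3, dilation 1, padding 1,
// stride 2 -> dkernel = 3 and output_size = (224 + 2 - 3) / 2 + 1 = 112.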
inline void expand_bias(Tensor &bias, int axis, const DDim &dDim) {
auto bias_ptr = bias.data<float>();
const DDim bias_ddim = bias.dims();
PADDLE_MOBILE_ENFORCE(bias.dims().size() == 1,
"the bias tensor's dims size != 1")
DDim outer_ddim = paddle_mobile::framework::slice_ddim(dDim, 0, axis + 1);
DDim inner_ddim =
paddle_mobile::framework::slice_ddim(dDim, axis + 1, dDim.size());
int outer_size = paddle_mobile::framework::product(outer_ddim);
int inner_size = paddle_mobile::framework::product(inner_ddim);
bias.Resize(dDim);
auto new_ptr = bias.mutable_data<float>();
int axis_size = dDim[axis];
#if __ARM_NEON
for (int i = 0; i < outer_size; ++i) {
int inner_num = inner_size >> 4;
int remain = inner_size - (inner_num << 4);
float v_bias = bias_ptr[i * axis_size / outer_size];
for (; inner_num > 0; inner_num--) {
float32x4_t v_newptr1 = vdupq_n_f32(v_bias);
float32x4_t v_newptr2 = vdupq_n_f32(v_bias);
float32x4_t v_newptr3 = vdupq_n_f32(v_bias);
float32x4_t v_newptr4 = vdupq_n_f32(v_bias);
vst1q_f32(new_ptr, v_newptr1);
new_ptr += 4;
vst1q_f32(new_ptr, v_newptr2);
new_ptr += 4;
vst1q_f32(new_ptr, v_newptr3);
new_ptr += 4;
vst1q_f32(new_ptr, v_newptr4);
new_ptr += 4;
}
for (; remain > 0; remain--) {
*new_ptr = v_bias;
new_ptr++;
}
}
#else
for (int i = 0; i < outer_size; ++i) {
float v_bias = bias_ptr[i * axis_size / outer_size];
for (int j = 0; j < inner_size; ++j) {
new_ptr[i * inner_size + j] = v_bias;
}
}
#endif
}
inline bool IsExpand(const std::vector<int64_t> &filter_dim,
const std::vector<int> &strides,
const std::vector<int> &paddings,
const std::vector<int> &dilations) {
bool filter_1 = true, strides_1 = true, padding_0 = true, dilation_1 = true;
for (size_t j = 0; j < strides.size(); ++j) {
filter_1 = filter_1 && (static_cast<int>(filter_dim[j + 2]) == 1);
strides_1 = strides_1 && (strides[j] == 1);
padding_0 = padding_0 && (paddings[j] == 0);
dilation_1 = dilation_1 && (dilations[j] == 1);
}
return !(filter_1 && strides_1 && padding_0 && dilation_1);
}
} // namespace math
} // namespace operators
} // namespace paddle_mobile
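Two common cases make the IsExpand decision above concrete (dims are illustrative {out_c, in_c, kh, kw} values):

// IsExpand({oc, ic, 1, 1}, {1, 1}, {0, 0}, {1, 1}) == false  // 1x1 conv: plain GEMM
// IsExpand({oc, ic, 3, 3}, {2, 2}, {0, 0}, {1, 1}) == true   // 3x3 stride-2: im2col first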
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "operators/math/depthwiseconv3x3s1p1.h"
#include <arm_neon.h>
namespace paddle_mobile {
namespace operators {
namespace math {
using framework::Tensor;
void DepthwiseConv3x3s1p1(const Tensor *input, Tensor filter, Tensor *output,
Tensor bias, bool if_bias) {
const float *input_data = input->data<float>();
const float *filter_data = filter.data<float>();
float *output_data = output->data<float>();
const float *bias_data = bias.data<float>();
const int h = static_cast<int>(input->dims()[2]);
const int w = static_cast<int>(input->dims()[3]);
  const int l = h;  // this kernel assumes square feature maps (h == w)
const int batch_size = static_cast<int>(input->dims()[0]);
const int c = static_cast<int>(input->dims()[1]);
const int hxw = h * w;
float32x4_t vbias = vdupq_n_f32(0.0);
for (int b = 0; b < batch_size; ++b) {
const float *filter_data_tmp = filter_data;
for (int j = 0; j < c; ++j) {
      float bias_val = 0.f;
      if (if_bias) {
        bias_val = bias_data[j];
        vbias = vdupq_n_f32(bias_val);
      }
int l_mid = l - 2; // l=1->l_mid=-1,l=2->l_mid=0
float w00 = filter_data_tmp[0];
float w01 = filter_data_tmp[1];
float w02 = filter_data_tmp[2];
float w10 = filter_data_tmp[3];
float w11 = filter_data_tmp[4];
float w12 = filter_data_tmp[5];
float w20 = filter_data_tmp[6];
float w21 = filter_data_tmp[7];
float w22 = filter_data_tmp[8];
      // four corners of the output (scalar); bias_val is 0 when !if_bias,
      // so bias_data is never read for bias-less layers
      output_data[0] = w11 * input_data[0] + w12 * input_data[1] +
                       w21 * input_data[l] + w22 * input_data[l + 1] +
                       bias_val;
      output_data[l - 1] = w10 * input_data[l - 2] + w11 * input_data[l - 1] +
                           w20 * input_data[2 * l - 2] +
                           w21 * input_data[2 * l - 1] + bias_val;
      output_data[(l - 1) * l] =
          w01 * input_data[(l - 2) * l] + w02 * input_data[(l - 2) * l + 1] +
          w11 * input_data[(l - 1) * l] + w12 * input_data[(l - 1) * l + 1] +
          bias_val;
      output_data[l * l - 1] = w00 * input_data[(l - 2) * (l + 1)] +
                               w01 * input_data[(l - 2) * (l + 1) + 1] +
                               w10 * input_data[l * l - 2] +
                               w11 * input_data[l * l - 1] + bias_val;
      // left and right output columns (scalar, padded taps dropped)
      for (int i = 1; i < l - 1; ++i) {
        output_data[i * l] =
            w01 * input_data[i * l - l] + w02 * input_data[i * l - l + 1] +
            w11 * input_data[i * l] + w12 * input_data[i * l + 1] +
            w21 * input_data[i * l + l] + w22 * input_data[i * l + l + 1] +
            bias_val;
        output_data[i * l + l - 1] = w00 * input_data[i * l + l - 1 - l - 1] +
                                     w01 * input_data[i * l + l - 1 - l] +
                                     w10 * input_data[i * l + l - 1 - 1] +
                                     w11 * input_data[i * l + l - 1] +
                                     w20 * input_data[i * l + l - 1 + l - 1] +
                                     w21 * input_data[i * l + l - 1 + l] +
                                     bias_val;
}
// top 1 row and bottom 1 row
const float *input_tmp = input_data;
float32x4_t in0, in1, in2, in3, in4, in5, in6, in7, tmp0, tmp1, tmp2,
tmp3, tmp4, tmp5, out0;
in0 = vld1q_f32(input_tmp);
in2 = vld1q_f32(input_tmp + l);
const float *input_tmp_end = input_tmp + (l - 2) * l;
in4 = vld1q_f32(input_tmp_end);
in6 = vld1q_f32(input_tmp_end + l);
int c_mid = l_mid;
auto output_ptr = output_data + 1;
for (; c_mid > 3; c_mid -= 4) {
in1 = vld1q_f32(input_tmp + 4);
in3 = vld1q_f32(input_tmp + l + 4);
tmp0 = vextq_f32(in0, in1, 1);
tmp1 = vextq_f32(in0, in1, 2);
tmp2 = vextq_f32(in2, in3, 1);
tmp3 = vextq_f32(in2, in3, 2);
out0 = vmulq_n_f32(in0, w10);
out0 = vmlaq_n_f32(out0, tmp0, w11);
out0 = vmlaq_n_f32(out0, tmp1, w12);
out0 = vmlaq_n_f32(out0, in2, w20);
out0 = vmlaq_n_f32(out0, tmp2, w21);
out0 = vmlaq_n_f32(out0, tmp3, w22);
out0 = vaddq_f32(out0, vbias);
vst1q_f32(output_ptr, out0);
in5 = vld1q_f32(input_tmp_end + 4);
in7 = vld1q_f32(input_tmp_end + l + 4);
tmp0 = vextq_f32(in4, in5, 1);
tmp1 = vextq_f32(in4, in5, 2);
tmp2 = vextq_f32(in6, in7, 1);
tmp3 = vextq_f32(in6, in7, 2);
out0 = vmulq_n_f32(in4, w00);
out0 = vmlaq_n_f32(out0, tmp0, w01);
out0 = vmlaq_n_f32(out0, tmp1, w02);
out0 = vmlaq_n_f32(out0, in6, w10);
out0 = vmlaq_n_f32(out0, tmp2, w11);
out0 = vmlaq_n_f32(out0, tmp3, w12);
out0 = vaddq_f32(out0, vbias);
vst1q_f32(output_ptr + (l - 1) * l, out0);
        // could be optimized further to process 8 elements per iteration.
input_tmp += 4;
input_tmp_end += 4;
output_ptr += 4;
in0 = in1;
in2 = in3;
in4 = in5;
in6 = in7;
}
// top right pad
float32x4_t pad0 = vdupq_n_f32(input_data[l - 1]);
float32x4_t pad1 = vdupq_n_f32(input_data[2 * l - 1]);
tmp0 = vextq_f32(in0, pad0, 1);
tmp1 = vextq_f32(in0, pad0, 2);
tmp2 = vextq_f32(in2, pad1, 1);
tmp3 = vextq_f32(in2, pad1, 2);
out0 = vmulq_n_f32(in0, w10);
out0 = vmlaq_n_f32(out0, tmp0, w11);
out0 = vmlaq_n_f32(out0, tmp1, w12);
out0 = vmlaq_n_f32(out0, in2, w20);
out0 = vmlaq_n_f32(out0, tmp2, w21);
out0 = vmlaq_n_f32(out0, tmp3, w22);
out0 = vaddq_f32(out0, vbias);
for (int i = 0; i < c_mid; ++i) {
if (i == 0) {
vst1q_lane_f32(output_ptr + i, out0, 0);
}
if (i == 1) {
vst1q_lane_f32(output_ptr + i, out0, 1);
}
if (i == 2) {
vst1q_lane_f32(output_ptr + i, out0, 2);
}
}
// bottom right pad
float32x4_t pad2 = vdupq_n_f32(input_data[l * l - 1 - l]);
float32x4_t pad3 = vdupq_n_f32(input_data[l * l - 1]);
tmp0 = vextq_f32(in4, pad2, 1);
tmp1 = vextq_f32(in4, pad2, 2);
tmp2 = vextq_f32(in6, pad3, 1);
tmp3 = vextq_f32(in6, pad3, 2);
out0 = vmulq_n_f32(in4, w00);
out0 = vmlaq_n_f32(out0, tmp0, w01);
out0 = vmlaq_n_f32(out0, tmp1, w02);
out0 = vmlaq_n_f32(out0, in6, w10);
out0 = vmlaq_n_f32(out0, tmp2, w11);
out0 = vmlaq_n_f32(out0, tmp3, w12);
out0 = vaddq_f32(out0, vbias);
for (int i = 0; i < c_mid; ++i) {
if (i == 0) {
vst1q_lane_f32(output_ptr + (l - 1) * l + i, out0, 0);
}
if (i == 1) {
vst1q_lane_f32(output_ptr + (l - 1) * l + i, out0, 1);
}
if (i == 2) {
vst1q_lane_f32(output_ptr + (l - 1) * l + i, out0, 2);
}
}
// mid
for (int i = 0; i < l - 2; ++i) {
auto output_ptr = output_data + (i + 1) * l + 1;
input_tmp = input_data + i * l;
auto in0_tmp = vld1q_f32(input_tmp);
auto in2_tmp = vld1q_f32(input_tmp + l);
auto in4_tmp = vld1q_f32(input_tmp + l + l);
c_mid = l_mid;
for (; c_mid > 3; c_mid -= 4) {
auto in1_tmp = vld1q_f32(input_tmp + 4);
auto in3_tmp = vld1q_f32(input_tmp + l + 4);
auto in5_tmp = vld1q_f32(input_tmp + l + l + 4);
tmp0 = vextq_f32(in0_tmp, in1_tmp, 1);
tmp1 = vextq_f32(in0_tmp, in1_tmp, 2);
tmp2 = vextq_f32(in2_tmp, in3_tmp, 1);
tmp3 = vextq_f32(in2_tmp, in3_tmp, 2);
tmp4 = vextq_f32(in4_tmp, in5_tmp, 1);
tmp5 = vextq_f32(in4_tmp, in5_tmp, 2);
out0 = vmulq_n_f32(in0_tmp, w00);
out0 = vmlaq_n_f32(out0, tmp0, w01);
out0 = vmlaq_n_f32(out0, tmp1, w02);
out0 = vmlaq_n_f32(out0, in2_tmp, w10);
out0 = vmlaq_n_f32(out0, tmp2, w11);
out0 = vmlaq_n_f32(out0, tmp3, w12);
out0 = vmlaq_n_f32(out0, in4_tmp, w20);
out0 = vmlaq_n_f32(out0, tmp4, w21);
out0 = vmlaq_n_f32(out0, tmp5, w22);
out0 = vaddq_f32(out0, vbias);
vst1q_f32(output_ptr, out0);
output_ptr += 4;
input_tmp += 4;
in0_tmp = in1_tmp;
in2_tmp = in3_tmp;
in4_tmp = in5_tmp;
}
float32x4_t pad0 = vdupq_n_f32(input_data[i * l + l - 1]);
float32x4_t pad1 = vdupq_n_f32(input_data[i * l + l - 1 + l]);
float32x4_t pad2 = vdupq_n_f32(input_data[i * l + l - 1 + l + l]);
tmp0 = vextq_f32(in0_tmp, pad0, 1);
tmp1 = vextq_f32(in0_tmp, pad0, 2);
tmp2 = vextq_f32(in2_tmp, pad1, 1);
tmp3 = vextq_f32(in2_tmp, pad1, 2);
tmp4 = vextq_f32(in4_tmp, pad2, 1);
tmp5 = vextq_f32(in4_tmp, pad2, 2);
out0 = vmulq_n_f32(in0_tmp, w00);
out0 = vmlaq_n_f32(out0, tmp0, w01);
out0 = vmlaq_n_f32(out0, tmp1, w02);
out0 = vmlaq_n_f32(out0, in2_tmp, w10);
out0 = vmlaq_n_f32(out0, tmp2, w11);
out0 = vmlaq_n_f32(out0, tmp3, w12);
out0 = vmlaq_n_f32(out0, in4_tmp, w20);
out0 = vmlaq_n_f32(out0, tmp4, w21);
out0 = vmlaq_n_f32(out0, tmp5, w22);
out0 = vaddq_f32(out0, vbias);
for (int i = 0; i < c_mid; ++i) {
if (i == 0) {
vst1q_lane_f32(output_ptr + i, out0, 0);
}
if (i == 1) {
vst1q_lane_f32(output_ptr + i, out0, 1);
}
if (i == 2) {
vst1q_lane_f32(output_ptr + i, out0, 2);
}
}
}
output_data += hxw;
input_data += hxw;
filter_data_tmp += 9;
}
}
}
} // namespace math
} // namespace operators
} // namespace paddle_mobile
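A hedged usage sketch for the kernel above; tensor construction is framework-specific and elided. All tensors are float32 NCHW, the spatial size must be square (see const int l = h;), and with a 3x3 kernel, stride 1, pad 1 the output keeps the input's h x w:

// framework::Tensor in;      // [n, c, h, h]
// framework::Tensor filter;  // 9 weights per channel, consumed in order
// framework::Tensor out;     // [n, c, h, h], pre-allocated
// framework::Tensor bias;    // [c], read only when if_bias is true
// math::DepthwiseConv3x3s1p1(&in, filter, &out, bias, /*if_bias=*/true);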
See the License for the specific language governing permissions and
limitations under the License. */

#pragma once

#include "framework/tensor.h"

namespace paddle_mobile {
namespace operators {
namespace math {
using framework::Tensor;

void DepthwiseConv3x3s1p1(const Tensor *input, Tensor filter, Tensor *output,
                          Tensor bias, bool if_bias);
} // namespace math
} // namespace operators
} // namespace paddle_mobile
See the License for the specific language governing permissions and
limitations under the License. */

#include "operators/math/gemm.h"
#include "common/log.h"
#include "memory/t_malloc.h"
#ifndef X86
#include <arm_neon.h>
#endif

namespace paddle_mobile {
namespace operators {
namespace math {

alignas(64) float packedA[MC * KC];
alignas(64) float packedB[KC * NC];
alignas(64) float ab[MR * NR];

// Pack a block of matrix A into contiguous memory (ColMajor)
void PackMatrixA(int m, int k, int m_tail, const float *A, int lda,
                 float *buffer) {
  int i, j;
  const float *Aij;
  for (i = 0; i < m - m_tail; i += MR) {
    for (j = 0; j < k; ++j) {
      Aij = &A(i, j);
      *buffer++ = *Aij;
      *buffer++ = *(Aij + 1);
      *buffer++ = *(Aij + 2);
      *buffer++ = *(Aij + 3);
    }
  }
  if (m_tail != 0) {
    for (j = 0; j < k; ++j) {
      Aij = &A(m - m_tail, j);
      for (i = 0; i < m_tail; ++i) {
        *buffer++ = *(Aij + i);
      }
      for (i = m_tail; i < MR; ++i) {
        *buffer++ = 0;
      }
    }
  }
}

// Pack a block of matrix A into contiguous memory (RowMajor)
void PackMatrixA_(int m, int k, int m_tail, const float *A, int lda,
                  float *buffer) {
  int i, j;
  const float *Ai, *Ai1, *Ai2, *Ai3;
  for (i = 0; i < m - m_tail; i += MR) {
    Ai = &A(i, 0);
    Ai1 = &A(i + 1, 0);
    Ai2 = &A(i + 2, 0);
    Ai3 = &A(i + 3, 0);
    for (j = 0; j < k; ++j) {
      *buffer++ = *Ai++;
      *buffer++ = *Ai1++;
      *buffer++ = *Ai2++;
      *buffer++ = *Ai3++;
    }
  }
  if (m_tail != 0) {
    for (j = 0; j < k; ++j) {
      for (i = m - m_tail; i < m; ++i) {
        *buffer++ = A(i, j);
      }
      for (i = m; i < m + (MR - m_tail); ++i) {
        *buffer++ = 0;
      }
    }
  }
}

// Pack a block of matrix B into contiguous memory (ColMajor)
void PackMatrixB(int k, int n, int n_tail, const float *B, int ldb,
                 float *buffer) {
  int i, j;
  const float *Bj, *Bj1, *Bj2, *Bj3;
  for (j = 0; j < n - n_tail; j += NR) {
    Bj = &B(0, j);
    Bj1 = &B(0, j + 1);
    Bj2 = &B(0, j + 2);
    Bj3 = &B(0, j + 3);
    for (i = 0; i < k; ++i) {
      *buffer++ = *Bj++;
      *buffer++ = *Bj1++;
      *buffer++ = *Bj2++;
      *buffer++ = *Bj3++;
    }
  }
  if (n_tail != 0) {
    for (i = 0; i < k; ++i) {
      for (int j = n - n_tail; j < n; ++j) {
        *buffer++ = B(i, j);
      }
      for (int j = n; j < n + (NR - n_tail); ++j) {
        *buffer++ = 0;
      }
    }
  }
}

// Pack a block of matrix B into contiguous memory (RowMajor)
void PackMatrixB_(int k, int n, int n_tail, const float *B, int ldb,
                  float *buffer) {
  int i, j;
  const float *Bij;
  for (j = 0; j < n - n_tail; j += NR) {
    for (i = 0; i < k; ++i) {
      Bij = &B(i, j);
      // copy NR (= 4) consecutive floats with one NEON load/store
      asm volatile(
          "vld1.32 {q0}, [%[Bij]]        \n\t"
          "vst1.32 {q0}, [%[buffer]]!    \n\t"
          : [buffer] "+r"(buffer)
          : [Bij] "r"(Bij)
          : "memory", "q0");
    }
  }
  if (n_tail != 0) {
    for (i = 0; i < k; ++i) {
      Bij = &B(i, n - n_tail);
      for (int j = n - n_tail; j < n; ++j) {
        *buffer++ = *Bij++;
      }
      for (int j = n; j < n + (NR - n_tail); ++j) {
        *buffer++ = 0;
      }
    }
  }
}

// Blocked matrix multiplication over one MC x NC panel of C
void InnerKernel(int m, int n, int k, float alpha, const float *A, int lda,
                 const float *B, int ldb, float beta, float *C, int ldc,
                 int first_time) {
  int m_block = (m + MR - 1) / MR * MR;
  int n_block = (n + NR - 1) / NR * NR;

  int m_tail = m % MR;
  int n_tail = n % NR;

  if (first_time) {
    PackMatrixB_(k, n, n_tail, B, ldb, packedB);
  }
  PackMatrixA_(m, k, m_tail, A, lda, packedA);

  int i, j, mc, nc;

  // take NR columns of B at a time (packed, cache-warm)
  for (j = 0; j < n_block; j += NR) {
    nc = (n - j) < NR ? n_tail : NR;
    // take MR rows of A at a time (packed, cache-warm)
    for (i = 0; i < m_block; i += MR) {
      mc = (m - i) < MR ? m_tail : MR;
      AddDot4x4(k, alpha, &packedA[i * k], 4, &packedB[j * k], k, beta,
                &C(i, j), ldc, mc, nc);
    }
  }
}

// Blocked matrix multiplication with fused ReLU
void InnerKernel_relu(int m, int n, int k, float alpha, const float *A, int lda,
                      const float *B, int ldb, float beta, float *C, int ldc,
                      int first_time, bool relu = false) {
  int m_block = (m + MR - 1) / MR * MR;
  int n_block = (n + NR - 1) / NR * NR;

  int m_tail = m % MR;
  int n_tail = n % NR;

  if (first_time) {
    PackMatrixB_(k, n, n_tail, B, ldb, packedB);
  }
  PackMatrixA_(m, k, m_tail, A, lda, packedA);

  int i, j, mc, nc;

  // take NR columns of B at a time (packed, cache-warm)
  for (j = 0; j < n_block; j += NR) {
    nc = (n - j) < NR ? n_tail : NR;
    // take MR rows of A at a time (packed, cache-warm)
    for (i = 0; i < m_block; i += MR) {
      mc = (m - i) < MR ? m_tail : MR;
      AddDot4x4_relu(k, alpha, &packedA[i * k], 4, &packedB[j * k], k, beta,
                     &C(i, j), ldc, mc, nc, relu);
    }
  }
}
// Compute a smaller 4 x 4 block of the C matrix
#if defined(IOS)
void AddDot4x4(int k, float alpha, const float *a, int lda, const float *b,
int ldb, float beta, float *C, int ldc, int mc, int nc) {
// init C
float32x4_t cv0 = vdupq_n_f32(0.0);
float32x4_t cv1 = vdupq_n_f32(0.0);
float32x4_t cv2 = vdupq_n_f32(0.0);
float32x4_t cv3 = vdupq_n_f32(0.0);
float32x4_t av;
float32x4_t bv;
float32x2_t av01;
float32x2_t av23;
for (int p = 0; p < k; p += 1) {
av = vld1q_f32(a);
bv = vld1q_f32(b);
av01 = vget_low_f32(av);
cv0 = vmlaq_lane_f32(cv0, bv, av01, 0);
cv1 = vmlaq_lane_f32(cv1, bv, av01, 1);
av23 = vget_high_f32(av);
cv2 = vmlaq_lane_f32(cv2, bv, av23, 0);
cv3 = vmlaq_lane_f32(cv3, bv, av23, 1);
a += MR;
b += NR;
}
float32x4x4_t cv = {cv0, cv1, cv2, cv3};
int i, j;
for (i = 0; i < mc; ++i) {
for (j = 0; j < nc; ++j) {
if (beta == 0.0) {
C(i, j) = 0.0;
} else if (beta != 1.0) {
C(i, j) *= beta;
}
if (j == 0) {
C(i, j) += alpha * vgetq_lane_f32(cv.val[i], 0);
} else if (j == 1) {
C(i, j) += alpha * vgetq_lane_f32(cv.val[i], 1);
} else if (j == 2) {
C(i, j) += alpha * vgetq_lane_f32(cv.val[i], 2);
} else if (j == 3) {
C(i, j) += alpha * vgetq_lane_f32(cv.val[i], 3);
}
}
}
}
void AddDot4x4_relu(int k, float alpha, const float *a, int lda, const float *b,
int ldb, float beta, float *C, int ldc, int mc, int nc,
bool relu = false) {
// init C
float32x4_t cv0 = vdupq_n_f32(0.0);
float32x4_t cv1 = vdupq_n_f32(0.0);
float32x4_t cv2 = vdupq_n_f32(0.0);
float32x4_t cv3 = vdupq_n_f32(0.0);
float32x4_t av;
float32x4_t bv;
float32x2_t av01;
float32x2_t av23;
for (int p = 0; p < k; p += 1) {
av = vld1q_f32(a);
bv = vld1q_f32(b);
av01 = vget_low_f32(av);
cv0 = vmlaq_lane_f32(cv0, bv, av01, 0);
cv1 = vmlaq_lane_f32(cv1, bv, av01, 1);
av23 = vget_high_f32(av);
cv2 = vmlaq_lane_f32(cv2, bv, av23, 0);
cv3 = vmlaq_lane_f32(cv3, bv, av23, 1);
a += MR;
b += NR;
}
float32x4x4_t cv = {cv0, cv1, cv2, cv3};
int i, j;
for (i = 0; i < mc; ++i) {
for (j = 0; j < nc; ++j) {
if (beta == 0.0) {
C(i, j) = 0.0;
} else if (beta != 1.0) {
C(i, j) *= beta;
}
if (j == 0) {
C(i, j) += alpha * vgetq_lane_f32(cv.val[i], 0);
} else if (j == 1) {
C(i, j) += alpha * vgetq_lane_f32(cv.val[i], 1);
} else if (j == 2) {
C(i, j) += alpha * vgetq_lane_f32(cv.val[i], 2);
} else if (j == 3) {
C(i, j) += alpha * vgetq_lane_f32(cv.val[i], 3);
}
if (C(i, j) < 0) {
C(i, j) = 0;
}
}
}
}
#elif defined(ARMV7)
void AddDot4x4(int k, float alpha, const float *a, int lda, const float *b,
int ldb, float beta, float *C, int ldc, int mc, int nc) {
int kc1 = k / 4, kc2 = k % 4;
int bytes_ldc = 4 * ldc;
int flag_alpha = (alpha == 1.0) ? 1 : 2;
int flag_beta;
if (beta == 0.0) {
flag_beta = 0;
} else if (beta == 1.0) {
flag_beta = 1;
} else {
flag_beta = 2;
}
asm volatile(
"pld [%[a]] \n\t"
"pld [%[b]] \n\t"
"vmov.f32 q10, #0.0 \n\t"
"vmov.f32 q11, #0.0 \n\t"
"vmov.f32 q12, #0.0 \n\t"
"vmov.f32 q13, #0.0 \n\t"
"subs %[kc1], %[kc1], #1 \n\t"
"blt end_kc1_%= \n\t"
"loop_kc1_%=: \n\t"
"pld [%[a], #64] \n\t"
"pld [%[b], #64] \n\t"
"vld1.32 {q0, q1}, [%[a]]! \n\t"
"vld1.32 {q2, q3}, [%[b]]! \n\t"
"vmla.f32 q10, q2, d0[0] \n\t"
"vmla.f32 q11, q2, d0[1] \n\t"
"vmla.f32 q12, q2, d1[0] \n\t"
"vmla.f32 q13, q2, d1[1] \n\t"
"vmla.f32 q10, q3, d2[0] \n\t"
"vmla.f32 q11, q3, d2[1] \n\t"
"vmla.f32 q12, q3, d3[0] \n\t"
"vmla.f32 q13, q3, d3[1] \n\t"
"vld1.32 {q0, q1}, [%[a]]! \n\t"
"vld1.32 {q2, q3}, [%[b]]! \n\t"
"vmla.f32 q10, q2, d0[0] \n\t"
"vmla.f32 q11, q2, d0[1] \n\t"
"vmla.f32 q12, q2, d1[0] \n\t"
"vmla.f32 q13, q2, d1[1] \n\t"
"vmla.f32 q10, q3, d2[0] \n\t"
"vmla.f32 q11, q3, d2[1] \n\t"
"vmla.f32 q12, q3, d3[0] \n\t"
"vmla.f32 q13, q3, d3[1] \n\t"
"subs %[kc1], %[kc1], #1 \n\t"
"bge loop_kc1_%= \n\t"
"end_kc1_%=: \n\t"
"subs %[kc2], %[kc2], #1 \n\t"
"blt end_kc2_%= \n\t"
"loop_kc2_%=: \n\t"
"vld1.32 {q0}, [%[a]]! \n\t"
"vld1.32 {q1}, [%[b]]! \n\t"
"vmla.f32 q10, q1, d0[0] \n\t"
"vmla.f32 q11, q1, d0[1] \n\t"
"vmla.f32 q12, q1, d1[0] \n\t"
"vmla.f32 q13, q1, d1[1] \n\t"
"subs %[kc2], %[kc2], #1 \n\t"
"bge loop_kc2_%= \n\t"
"end_kc2_%=: \n\t"
"cmp %[mc], #4 \n\t"
"bne temp_%= \n\t"
"cmp %[nc], #4 \n\t"
"bne temp_%= \n\t"
"vmov.f32 d8[0], %[alpha] \n\t"
"vmov.f32 d8[1], %[beta] \n\t"
"cmp %[flag_alpha], #1 \n\t"
"bne alpha_%= \n\t"
"alpha_%=: \n\t"
"vmul.f32 q10, q10, d8[0] \n\t"
"vmul.f32 q11, q11, d8[0] \n\t"
"vmul.f32 q12, q12, d8[0] \n\t"
"vmul.f32 q13, q13, d8[0] \n\t"
"beta_%=: \n\t"
"cmp %[flag_beta], #0 \n\t"
"beq memory_%= \n\t"
"mov r4, %[C] \n\t"
"mov r6, %[bytes_ldc]\n\t"
"vld1.32 {q0}, [r4], r6 \n\t"
"vld1.32 {q1}, [r4], r6 \n\t"
"vld1.32 {q2}, [r4], r6 \n\t"
"vld1.32 {q3}, [r4] \n\t"
"cmp %[flag_beta], #1 \n\t"
"beq beta_eq1_%= \n\t"
"bne beta_ne1_%= \n\t"
"beta_eq1_%=: \n\t"
"vadd.f32 q10, q10, q0 \n\t"
"vadd.f32 q11, q11, q1 \n\t"
"vadd.f32 q12, q12, q2 \n\t"
"vadd.f32 q13, q13, q3 \n\t"
"b memory_%= \n\t"
"beta_ne1_%=: \n\t"
"vmla.f32 q10, q0, d8[1] \n\t"
"vmla.f32 q11, q1, d8[1] \n\t"
"vmla.f32 q12, q2, d8[1] \n\t"
"vmla.f32 q13, q3, d8[1] \n\t"
"memory_%=: \n\t"
"mov r5, %[C] \n\t"
"mov r6, %[bytes_ldc]\n\t"
"vst1.32 {q10}, [r5], r6 \n\t"
"vst1.32 {q11}, [r5], r6 \n\t"
"vst1.32 {q12}, [r5], r6 \n\t"
"vst1.32 {q13}, [r5] \n\t"
"b end_%= \n\t"
"temp_%=: \n\t"
"vst1.32 {q10, q11}, [%[ab]]!\n\t"
"vst1.32 {q12, q13}, [%[ab]] \n\t"
"end_%=: \n\t"
:
: [a] "r"(a), [b] "r"(b), [C] "r"(C), [ab] "r"(ab), [kc1] "r"(kc1),
[kc2] "r"(kc2), [mc] "r"(mc), [nc] "r"(nc), [alpha] "r"(alpha),
[beta] "r"(beta), [bytes_ldc] "r"(bytes_ldc),
[flag_alpha] "r"(flag_alpha), [flag_beta] "r"(flag_beta)
: "memory", "q0", "q1", "q2", "q3", "q4", "q10", "q11", "q12", "q13");
if (mc != MR || nc != NR) {
int i, j;
for (i = 0; i < mc; ++i) {
for (j = 0; j < nc; ++j) {
if (beta == 0.0) {
if (alpha != 1.0) {
C(i, j) = alpha * ab[i * MR + j];
} else {
C(i, j) = ab[i * MR + j];
}
} else {
if (beta != 1.0) {
C(i, j) *= beta;
}
if (alpha != 1.0) {
C(i, j) += alpha * ab[i * MR + j];
} else {
C(i, j) += ab[i * MR + j];
}
}
}
}
}
}
void AddDot4x4_relu(int k, float alpha, const float *a, int lda, const float *b,
int ldb, float beta, float *C, int ldc, int mc, int nc,
bool relu = false) {
int kc1 = k / 4, kc2 = k % 4;
int bytes_ldc = 4 * ldc;
int flag_alpha = (alpha == 1.0) ? 1 : 2;
int flag_beta;
if (beta == 0.0) {
flag_beta = 0;
} else if (beta == 1.0) {
flag_beta = 1;
} else {
flag_beta = 2;
}
asm volatile(
"pld [%[a]] \n\t"
"pld [%[b]] \n\t"
"vmov.f32 q10, #0.0 \n\t"
"vmov.f32 q11, #0.0 \n\t"
"vmov.f32 q12, #0.0 \n\t"
"vmov.f32 q13, #0.0 \n\t"
"subs %[kc1], %[kc1], #1 \n\t"
"blt end_kc1_%= \n\t"
"loop_kc1_%=: \n\t"
"pld [%[a], #64] \n\t"
"pld [%[b], #64] \n\t"
"vld1.32 {q0, q1}, [%[a]]! \n\t"
"vld1.32 {q2, q3}, [%[b]]! \n\t"
"vmla.f32 q10, q2, d0[0] \n\t"
"vmla.f32 q11, q2, d0[1] \n\t"
"vmla.f32 q12, q2, d1[0] \n\t"
"vmla.f32 q13, q2, d1[1] \n\t"
"vmla.f32 q10, q3, d2[0] \n\t"
"vmla.f32 q11, q3, d2[1] \n\t"
"vmla.f32 q12, q3, d3[0] \n\t"
"vmla.f32 q13, q3, d3[1] \n\t"
"vld1.32 {q0, q1}, [%[a]]! \n\t"
"vld1.32 {q2, q3}, [%[b]]! \n\t"
"vmla.f32 q10, q2, d0[0] \n\t"
"vmla.f32 q11, q2, d0[1] \n\t"
"vmla.f32 q12, q2, d1[0] \n\t"
"vmla.f32 q13, q2, d1[1] \n\t"
"vmla.f32 q10, q3, d2[0] \n\t"
"vmla.f32 q11, q3, d2[1] \n\t"
"vmla.f32 q12, q3, d3[0] \n\t"
"vmla.f32 q13, q3, d3[1] \n\t"
"subs %[kc1], %[kc1], #1 \n\t"
"bge loop_kc1_%= \n\t"
"end_kc1_%=: \n\t"
"subs %[kc2], %[kc2], #1 \n\t"
"blt end_kc2_%= \n\t"
"loop_kc2_%=: \n\t"
"vld1.32 {q0}, [%[a]]! \n\t"
"vld1.32 {q1}, [%[b]]! \n\t"
"vmla.f32 q10, q1, d0[0] \n\t"
"vmla.f32 q11, q1, d0[1] \n\t"
"vmla.f32 q12, q1, d1[0] \n\t"
"vmla.f32 q13, q1, d1[1] \n\t"
"subs %[kc2], %[kc2], #1 \n\t"
"bge loop_kc2_%= \n\t"
"end_kc2_%=: \n\t"
"cmp %[mc], #4 \n\t"
"bne temp_%= \n\t"
"cmp %[nc], #4 \n\t"
"bne temp_%= \n\t"
"vmov.f32 d8[0], %[alpha] \n\t"
"vmov.f32 d8[1], %[beta] \n\t"
"cmp %[flag_alpha], #1 \n\t"
"bne alpha_%= \n\t"
"alpha_%=: \n\t"
"vmul.f32 q10, q10, d8[0] \n\t"
"vmul.f32 q11, q11, d8[0] \n\t"
"vmul.f32 q12, q12, d8[0] \n\t"
"vmul.f32 q13, q13, d8[0] \n\t"
"beta_%=: \n\t"
"cmp %[flag_beta], #0 \n\t"
"beq memory_%= \n\t"
"mov r4, %[C] \n\t"
"mov r6, %[bytes_ldc]\n\t"
"vld1.32 {q0}, [r4], r6 \n\t"
"vld1.32 {q1}, [r4], r6 \n\t"
"vld1.32 {q2}, [r4], r6 \n\t"
"vld1.32 {q3}, [r4] \n\t"
"cmp %[flag_beta], #1 \n\t"
"beq beta_eq1_%= \n\t"
"bne beta_ne1_%= \n\t"
"beta_eq1_%=: \n\t"
"vadd.f32 q10, q10, q0 \n\t"
"vadd.f32 q11, q11, q1 \n\t"
"vadd.f32 q12, q12, q2 \n\t"
"vadd.f32 q13, q13, q3 \n\t"
"b memory_%= \n\t"
"beta_ne1_%=: \n\t"
"vmla.f32 q10, q0, d8[1] \n\t"
"vmla.f32 q11, q1, d8[1] \n\t"
"vmla.f32 q12, q2, d8[1] \n\t"
"vmla.f32 q13, q3, d8[1] \n\t"
"memory_%=: \n\t"
"vmax.f32 q10, q10, q14 \n\t"
"vmax.f32 q11, q11, q14 \n\t"
"vmax.f32 q12, q12, q14 \n\t"
"vmax.f32 q13, q13, q14 \n\t"
"mov r5, %[C] \n\t"
"mov r6, %[bytes_ldc]\n\t"
"vst1.32 {q10}, [r5], r6 \n\t"
"vst1.32 {q11}, [r5], r6 \n\t"
"vst1.32 {q12}, [r5], r6 \n\t"
"vst1.32 {q13}, [r5] \n\t"
"b end_%= \n\t"
"temp_%=: \n\t"
"vst1.32 {q10, q11}, [%[ab]]!\n\t"
"vst1.32 {q12, q13}, [%[ab]] \n\t"
"end_%=: \n\t"
:
: [a] "r"(a), [b] "r"(b), [C] "r"(C), [ab] "r"(ab), [kc1] "r"(kc1),
[kc2] "r"(kc2), [mc] "r"(mc), [nc] "r"(nc), [alpha] "r"(alpha),
[beta] "r"(beta), [bytes_ldc] "r"(bytes_ldc),
[flag_alpha] "r"(flag_alpha), [flag_beta] "r"(flag_beta)
: "memory", "q0", "q1", "q2", "q3", "q4", "q10", "q11", "q12", "q13",
"q14");
if (mc != MR || nc != NR) {
int i, j;
for (i = 0; i < mc; ++i) {
for (j = 0; j < nc; ++j) {
if (beta == 0.0) {
if (alpha != 1.0) {
C(i, j) = alpha * ab[i * MR + j];
} else {
C(i, j) = ab[i * MR + j];
}
} else {
if (beta != 1.0) {
C(i, j) *= beta;
}
if (alpha != 1.0) {
C(i, j) += alpha * ab[i * MR + j];
} else {
C(i, j) += ab[i * MR + j];
}
}
if (relu) {
if (C(i, j) < 0) {
C(i, j) = 0;
}
}
}
}
}
}
#else
void AddDot4x4(int k, float alpha, const float *a, int lda, const float *b,
               int ldb, float beta, float *C, int ldc, int mc, int nc) {
  float c[16] = {0};
  float reg_a0, reg_a1, reg_a2, reg_a3, reg_b0, reg_b1, reg_b2, reg_b3;
  for (int p = 0; p < k; p += 1) {
    reg_b0 = *b++;
    reg_b1 = *b++;
    reg_b2 = *b++;
    reg_b3 = *b++;
reg_a0 = *a++;
reg_a1 = *a++;
reg_a2 = *a++;
reg_a3 = *a++;
// first row
c[0] += reg_a0 * reg_b0;
c[1] += reg_a0 * reg_b1;
c[2] += reg_a0 * reg_b2;
c[3] += reg_a0 * reg_b3;
// second row
c[4] += reg_a1 * reg_b0;
c[5] += reg_a1 * reg_b1;
c[6] += reg_a1 * reg_b2;
c[7] += reg_a1 * reg_b3;
// third row
c[8] += reg_a2 * reg_b0;
c[9] += reg_a2 * reg_b1;
c[10] += reg_a2 * reg_b2;
c[11] += reg_a2 * reg_b3;
// fourth row
c[12] += reg_a3 * reg_b0;
c[13] += reg_a3 * reg_b1;
c[14] += reg_a3 * reg_b2;
c[15] += reg_a3 * reg_b3;
}
int i, j;
for (i = 0; i < mc; ++i) {
for (j = 0; j < nc; ++j) {
if (beta == 0.0) {
C(i, j) = 0.0;
} else if (beta != 1.0) {
C(i, j) *= beta;
}
if (alpha != 1.0) {
C(i, j) += alpha * c[i * MR + j];
} else {
C(i, j) += c[i * MR + j];
}
}
}
}
void AddDot4x4_relu(int k, float alpha, const float *a, int lda, const float *b,
int ldb, float beta, float *C, int ldc, int mc, int nc,
bool relu) {
float c[16] = {0};
float reg_a0, reg_a1, reg_a2, reg_a3, reg_b0, reg_b1, reg_b2, reg_b3;
  for (int p = 0; p < k; p += 1) {
    reg_b0 = *b++;
@@ -229,15 +731,26 @@ void AddDot4x4(int k, float alpha, const float *a, int lda, const float *b,
      } else {
        C(i, j) += c[i * MR + j];
      }
if (relu) {
if (C(i, j) < 0) {
C(i, j) = 0;
}
}
    }
  }
}
#endif
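// Note (added): the sgemm below follows a GotoBLAS-style blocking scheme,
// as the loops that follow show: n is split into NC-wide panels, k into
// KC-deep panels and m into MC-tall panels, with the AddDot4x4 micro-kernel
// computing MR x NR (4 x 4) tiles. On every k panel except the first
// (p != 0), beta_ is forced to 1.0 so partial products accumulate into C
// instead of being rescaled by beta.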
// 32-bit float matrix multiplication
void sgemm(int m, int n, int k, float alpha, const float *A, int lda,
           const float *B, int ldb, float beta, float *C, int ldc) {
  int i, j, p, mc, nc, kc;
  float beta_;
if (m == 1) {
VectorKernel(1, n, k, alpha, A, lda, B, ldb, beta, C, ldc);
return;
}
  for (j = 0; j < n; j += NC) {
    nc = s_min(n - j, NC);
    for (p = 0; p < k; p += KC) {
@@ -256,6 +769,248 @@ void sgemm(int m, int n, int k, float alpha, const float *A, int lda,
  }
}
void sgemm_relu(int m, int n, int k, float alpha, const float *A, int lda,
const float *B, int ldb, float beta, float *C, int ldc) {
int i, j, p, mc, nc, kc;
float beta_;
for (j = 0; j < n; j += NC) {
nc = s_min(n - j, NC);
for (p = 0; p < k; p += KC) {
kc = s_min(k - p, KC);
for (i = 0; i < m; i += MC) {
mc = s_min(m - i, MC);
if (p != 0) {
beta_ = 1.0;
} else {
beta_ = beta;
}
if (p + KC >= k) {
InnerKernel_relu(mc, nc, kc, alpha, &A(i, p), lda, &B(p, j), ldb,
beta_, &C(i, j), ldc, i == 0, true);
} else {
InnerKernel(mc, nc, kc, alpha, &A(i, p), lda, &B(p, j), ldb, beta_,
&C(i, j), ldc, i == 0);
}
}
}
}
}
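// Note (added): VectorKernel handles the m == 1 (vector times matrix) case
// dispatched from sgemm above. k is consumed four rows of B at a time
// (kc1 = k / 4, tail kc2), and n is tiled into 16-wide NEON blocks (nc1),
// 4-wide blocks (nc2) and a scalar tail (nc3). Results are staged in
// bufferC and folded into C at the end: accumulated when beta == 1.0,
// otherwise stored directly.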
void VectorKernel(int m, int n, int k, float alpha, const float *A, int lda,
const float *B, int ldb, float beta, float *C, int ldc) {
float *bufferC = static_cast<float *>(memory::Alloc(sizeof(float) * n));
const float *a0, *b0, *b1, *b2, *b3;
float *c0, *C0;
int volatile kc1 = k / 4;
int volatile kc2 = k % 4;
int volatile nc1 = n / 16;
int _nc1 = n % 16;
int volatile nc2 = _nc1 / 4;
int volatile nc3 = _nc1 % 4;
for (int i = 0; i < kc1; i++) {
a0 = A + i * 4;
b0 = B + i * 4 * ldb;
b1 = b0 + ldb;
b2 = b1 + ldb;
b3 = b2 + ldb;
c0 = bufferC;
asm volatile(
"pld [%[a0], #16] \n\t"
"vld1.32 {q0}, [%[a0]] \n\t"
"subs %[nc1], %[nc1], #1 \n\t"
"blt end_nc1_%= \n\t"
"loop_nc1_%=: \n\t"
"cmp %[i], #0 \n\t"
"beq i_eq0_%= \n\t"
"bne i_ne0_%= \n\t"
"i_eq0_%=: \n\t"
"vmov.f32 q10, #0.0 \n\t"
"vmov.f32 q11, #0.0 \n\t"
"vmov.f32 q12, #0.0 \n\t"
"vmov.f32 q13, #0.0 \n\t"
"b gemm_nc1_%= \n\t"
"i_ne0_%=: \n\t"
"pld [%[c0], #64] \n\t"
"vld1.32 {q10, q11}, [%[c0]]! \n\t"
"vld1.32 {q12, q13}, [%[c0]] \n\t"
"sub %[c0], %[c0], #32 \n\t"
"gemm_nc1_%=: \n\t"
"pld [%[b0], #64] \n\t"
"vld1.32 {q2, q3}, [%[b0]]! \n\t"
"vld1.32 {q4, q5}, [%[b0]]! \n\t"
"vmla.f32 q10, q2, d0[0] \n\t"
"vmla.f32 q11, q3, d0[0] \n\t"
"vmla.f32 q12, q4, d0[0] \n\t"
"vmla.f32 q13, q5, d0[0] \n\t"
"pld [%[b1], #64] \n\t"
"vld1.32 {q2, q3}, [%[b1]]! \n\t"
"vld1.32 {q4, q5}, [%[b1]]! \n\t"
"vmla.f32 q10, q2, d0[1] \n\t"
"vmla.f32 q11, q3, d0[1] \n\t"
"vmla.f32 q12, q4, d0[1] \n\t"
"vmla.f32 q13, q5, d0[1] \n\t"
"pld [%[b2], #64] \n\t"
"vld1.32 {q2, q3}, [%[b2]]! \n\t"
"vld1.32 {q4, q5}, [%[b2]]! \n\t"
"vmla.f32 q10, q2, d1[0] \n\t"
"vmla.f32 q11, q3, d1[0] \n\t"
"vmla.f32 q12, q4, d1[0] \n\t"
"vmla.f32 q13, q5, d1[0] \n\t"
"pld [%[b3], #64] \n\t"
"vld1.32 {q2, q3}, [%[b3]]! \n\t"
"vld1.32 {q4, q5}, [%[b3]]! \n\t"
"vmla.f32 q10, q2, d1[1] \n\t"
"vmla.f32 q11, q3, d1[1] \n\t"
"vmla.f32 q12, q4, d1[1] \n\t"
"vmla.f32 q13, q5, d1[1] \n\t"
"vst1.32 {q10, q11}, [%[c0]]! \n\t"
"vst1.32 {q12, q13}, [%[c0]]! \n\t"
"subs %[nc1], %[nc1], #1 \n\t"
"bge loop_nc1_%= \n\t"
"end_nc1_%=: \n\t"
"subs %[nc2], %[nc2], #1 \n\t"
"blt end_nc2_%= \n\t"
"loop_nc2_%=: \n\t"
"cmp %[i], #0 \n\t"
"beq ii_eq0_%= \n\t"
"bne ii_ne0_%= \n\t"
"ii_eq0_%=: \n\t"
"vmov.f32 q10, #0.0 \n\t"
"b gemm_nc2_%= \n\t"
"ii_ne0_%=: \n\t"
"pld [%[c0], #16] \n\t"
"vld1.32 {q10}, [%[c0]] \n\t"
"gemm_nc2_%=: \n\t"
"pld [%[b0], #16] \n\t"
"vld1.32 {q2}, [%[b0]]! \n\t"
"vmla.f32 q10, q2, d0[0] \n\t"
"pld [%[b1], #16] \n\t"
"vld1.32 {q3}, [%[b1]]! \n\t"
"vmla.f32 q10, q3, d0[1] \n\t"
"pld [%[b2], #16] \n\t"
"vld1.32 {q4}, [%[b2]]! \n\t"
"vmla.f32 q10, q4, d1[0] \n\t"
"pld [%[b3], #16] \n\t"
"vld1.32 {q5}, [%[b3]]! \n\t"
"vmla.f32 q10, q5, d1[1] \n\t"
"vst1.32 {q10}, [%[c0]]! \n\t"
"subs %[nc2], %[nc2], #1 \n\t"
"bge loop_nc2_%= \n\t"
"end_nc2_%=: \n\t"
: [b0] "+r"(b0), [b1] "+r"(b1), [b2] "+r"(b2), [b3] "+r"(b3),
[c0] "+r"(c0)
: [a0] "r"(a0), [i] "r"(i), [nc1] "r"(nc1), [nc2] "r"(nc2)
: "memory", "q0", "q2", "q3", "q4", "q5", "q10", "q11", "q12", "q13");
for (int j = 0; j < nc3; j++) {
if (i == 0) {
*c0 = (*a0) * (*b0++);
} else {
*c0 += (*a0) * (*b0++);
}
*c0 += (*(a0 + 1)) * (*b1++);
*c0 += (*(a0 + 2)) * (*b2++);
*c0 += (*(a0 + 3)) * (*b3++);
c0++;
}
}
for (int i = 0; i < kc2; ++i) {
a0 = A + 4 * kc1 + i;
b0 = B + (4 * kc1 + i) * ldb;
c0 = bufferC;
asm volatile(
"pld [%[a0], #16] \n\t"
"vld1.32 {d0}, [%[a0]] \n\t"
"subs %[nc1], %[nc1], #1 \n\t"
"blt end_nc1_%= \n\t"
"loop_nc1_%=: \n\t"
"pld [%[c0], #64] \n\t"
"vld1.32 {q10, q11}, [%[c0]]! \n\t"
"vld1.32 {q12, q13}, [%[c0]] \n\t"
"sub %[c0], %[c0], #32 \n\t"
"gemm_nc1_%=: \n\t"
"pld [%[b0], #64] \n\t"
"vld1.32 {q2, q3}, [%[b0]]! \n\t"
"vld1.32 {q4, q5}, [%[b0]]! \n\t"
"vmla.f32 q10, q2, d0[0] \n\t"
"vmla.f32 q11, q3, d0[0] \n\t"
"vmla.f32 q12, q4, d0[0] \n\t"
"vmla.f32 q13, q5, d0[0] \n\t"
"vst1.32 {q10, q11}, [%[c0]]! \n\t"
"vst1.32 {q12, q13}, [%[c0]]! \n\t"
"subs %[nc1], %[nc1], #1 \n\t"
"bge loop_nc1_%= \n\t"
"end_nc1_%=: \n\t"
"subs %[nc2], %[nc2], #1 \n\t"
"blt end_nc2_%= \n\t"
"loop_nc2_%=: \n\t"
"pld [%[c0], #16] \n\t"
"vld1.32 {q10}, [%[c0]] \n\t"
"gemm_nc2_%=: \n\t"
"vld1.32 {q2}, [%[b0]]! \n\t"
"vmla.f32 q10, q2, d0[0] \n\t"
"vst1.32 {q10}, [%[c0]]! \n\t"
"subs %[nc2], %[nc2], #1 \n\t"
"bge loop_nc2_%= \n\t"
"end_nc2_%=: \n\t"
: [b0] "+r"(b0), [b1] "+r"(b1), [b2] "+r"(b2), [b3] "+r"(b3),
[c0] "+r"(c0)
: [a0] "r"(a0), [nc1] "r"(nc1), [nc2] "r"(nc2)
: "memory", "q0", "q2", "q3", "q4", "q5", "q10", "q11", "q12", "q13");
for (int j = 0; j < nc3; j++) {
*c0 += (*a0) * (*b0++);
c0++;
}
}
c0 = bufferC;
C0 = C;
for (int i = 0; i < n; i++) {
if (beta == 1.0) {
*C0++ += *c0++;
} else {
*C0++ = *c0++;
}
}
}
}  // namespace math
}  // namespace operators
}  // namespace paddle_mobile
@@ -20,9 +20,9 @@ limitations under the License. */
#define C(i, j) C[(i)*ldc + (j)]
// Block sizes for tiled computation; mc and kc correspond to the m and k of a tile
#define MC 128
#define KC 128
#define NC 1024
#define MR 4
#define NR 4
@@ -33,19 +33,19 @@ namespace operators {
namespace math {
// Pack a block of matrix A into contiguous memory (ColMajor)
void PackMatrixA(int m, int k, int m_tail, const float *A, int lda,
                 float *buffer);
// Pack a block of matrix B into contiguous memory (ColMajor)
void PackMatrixB(int k, int n, int n_tail, const float *B, int ldb,
                 float *buffer);
// Pack a block of matrix A into contiguous memory (RowMajor)
void PackMatrixA_(int m, int k, int m_tail, const float *A, int lda,
                  float *buffer);
// Pack a block of matrix B into contiguous memory (RowMajor)
void PackMatrixB_(int k, int n, int n_tail, const float *B, int ldb,
                  float *buffer);
// Tiled matrix multiplication
@@ -53,14 +53,25 @@ void InnerKernel(int m, int n, int k, float alpha, const float *A, int lda,
                 const float *B, int ldb, float beta, float *C, int ldc,
                 int first_time);
// Vector-matrix multiplication (M = 1)
void VectorKernel(int m, int n, int k, float alpha, const float *A, int lda,
const float *B, int ldb, float beta, float *C, int ldc);
// Compute a smaller 4 * 4 tile of the C matrix
void AddDot4x4(int k, float alpha, const float *A, int lda, const float *B,
               int ldb, float beta, float *C, int ldc, int mc, int nc);
void AddDot4x4_relu(int k, float alpha, const float *a, int lda, const float *b,
int ldb, float beta, float *C, int ldc, int mc, int nc,
bool relu);
// 32-bit float matrix multiplication
void sgemm(int m, int n, int k, float alpha, const float *A, int lda,
           const float *B, int ldb, float beta, float *C, int ldc);
void sgemm_relu(int m, int n, int k, float alpha, const float *A, int lda,
const float *B, int ldb, float beta, float *C, int ldc);
// 64-bit double matrix multiplication
void dgemm(int m, int n, int k, float alpha, const double *A, int lda,
           const double *B, int ldb, float beta, double *C, int ldc);
...
@@ -14,8 +14,10 @@ limitations under the License. */
#include "operators/math/im2col.h"
#include <vector>
#ifdef __ARM_NEON
#include "arm_neon.h"
#endif
#include "common/types.h" #include "common/types.h"
namespace paddle_mobile { namespace paddle_mobile {
namespace operators { namespace operators {
namespace math { namespace math {
...@@ -65,9 +67,350 @@ class Im2ColFunctor<ColFormat::kCFO, CPU, T> { ...@@ -65,9 +67,350 @@ class Im2ColFunctor<ColFormat::kCFO, CPU, T> {
// are " "inconsistent."); // are " "inconsistent.");
int channels_col = im_channels * filter_height * filter_width; int channels_col = im_channels * filter_height * filter_width;
const T *im_data = im.data<T>(); const T *im_data = im.data<T>();
T *col_data = col->data<T>(); T *col_data = col->data<T>();
#ifdef __ARM_NEON
const int osize = col_height;
const int isize = im_height;
bool pad1 = padding[0] > 0;
bool pad2 =
(pad1 &&
(((isize - 2 * padding[0] + filter_height) % stride[0] == 0) ? 1 : 0));
int fill = isize % 2;
if (stride[0] == 1 && filter_height == 3 && pad1 && pad2 &&
dilation[0] == 1) {
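      // Note (added): fast path for 3x3 filters with stride 1, same padding
      // and no dilation. Each of the nine im2col output planes (col0..col8,
      // one per filter tap) is just a shifted copy of the input image, so
      // interior rows are bulk-copied with 4-wide NEON loads/stores and only
      // the padded border rows/columns need the explicit zero fills below.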
for (int c = 0; c < im_channels; ++c) {
int oosize = osize * osize;
int nk4 = osize / 4;
int mk4 = osize % 4;
float *col0 = col_data + 0 * oosize + 2 * osize + 2;
float *col1 = col_data + 1 * oosize + 2 * osize + 1;
float *col2 = col_data + 2 * oosize + 2 * osize;
float *col3 = col_data + 3 * oosize + osize + 2;
float *col4 = col_data + 4 * oosize + osize + 1;
float *col5 = col_data + 5 * oosize + osize;
float *col6 = col_data + 6 * oosize + 2;
float *col7 = col_data + 7 * oosize + 1;
float *col8 = col_data + 8 * oosize;
float32x4_t im1;
const float *im_tmp_data = im_data + osize + 1;
int rrsize = oosize - osize - 1;
int nr4 = rrsize / 4;
int mr4 = rrsize % 4;
for (int i = 0; i < nr4; ++i) {
im1 = vld1q_f32(im_tmp_data);
vst1q_f32(col0, im1);
vst1q_f32(col1, im1);
vst1q_f32(col2, im1);
vst1q_f32(col3, im1);
vst1q_f32(col4, im1);
vst1q_f32(col5, im1);
vst1q_f32(col6, im1);
vst1q_f32(col7, im1);
vst1q_f32(col8, im1);
col0 += 4;
col1 += 4;
col2 += 4;
col3 += 4;
col4 += 4;
col5 += 4;
col6 += 4;
col7 += 4;
col8 += 4;
im_tmp_data += 4;
}
for (int i = 0; i < mr4; ++i) {
*col0 = *im_tmp_data;
*col1 = *im_tmp_data;
*col2 = *im_tmp_data;
*col3 = *im_tmp_data;
*col4 = *im_tmp_data;
*col5 = *im_tmp_data;
*col6 = *im_tmp_data;
*col7 = *im_tmp_data;
*col8 = *im_tmp_data;
col0++;
col1++;
col2++;
col3++;
col4++;
col5++;
col6++;
col7++;
col8++;
im_tmp_data++;
}
im_tmp_data = im_data + 1;
col0 = col_data + 0 * oosize + osize + 2;
col1 = col_data + 1 * oosize + osize + 1;
col2 = col_data + 2 * oosize + osize;
col3 = col_data + 3 * oosize + 2;
col4 = col_data + 4 * oosize + 1;
col5 = col_data + 5 * oosize;
for (int i = 0; i < nk4; i++) {
im1 = vld1q_f32(im_tmp_data);
vst1q_f32(col0, im1);
vst1q_f32(col1, im1);
vst1q_f32(col2, im1);
vst1q_f32(col3, im1);
vst1q_f32(col4, im1);
vst1q_f32(col5, im1);
col0 += 4;
col1 += 4;
col2 += 4;
col3 += 4;
col4 += 4;
col5 += 4;
im_tmp_data += 4;
}
for (int i = 0; i < mk4; i++) {
*col0 = *im_tmp_data;
*col1 = *im_tmp_data;
*col2 = *im_tmp_data;
*col3 = *im_tmp_data;
*col4 = *im_tmp_data;
*col5 = *im_tmp_data;
col0++;
col1++;
col2++;
col3++;
col4++;
col5++;
im_tmp_data++;
}
        // zero-fill the padded border columns: left edge of planes 0/3/6,
        // right edge of planes 2/5/8
for (int i = 0; i < osize; ++i) {
col_data[0 * oosize + i * osize] = 0.0;
col_data[3 * oosize + i * osize] = 0.0;
col_data[6 * oosize + i * osize] = 0.0;
col_data[2 * oosize + osize - 1 + i * osize] = 0.0;
col_data[5 * oosize + osize - 1 + i * osize] = 0.0;
col_data[8 * oosize + osize - 1 + i * osize] = 0.0;
}
col_data[0 * oosize + osize + 1] = im_data[0];
col_data[3 * oosize + 1] = im_data[0];
col_data[6 * oosize + 1] = im_data[osize];
col_data[1 * oosize + osize] = im_data[0];
col_data[4 * oosize] = im_data[0];
col_data[7 * oosize] = im_data[osize];
float32x4_t zero4;
zero4 = vdupq_n_f32(0.0);
auto col_z0 = col_data;
auto col_z1 = col_data + oosize;
auto col_z2 = col_data + 2 * oosize;
auto col_z6 = col_data + 6 * oosize + osize * (osize - 1);
auto col_z7 = col_data + 7 * oosize + osize * (osize - 1);
auto col_z8 = col_data + 8 * oosize + osize * (osize - 1);
for (int i = 0; i < nk4; ++i) {
vst1q_f32(col_z0, zero4);
vst1q_f32(col_z1, zero4);
vst1q_f32(col_z2, zero4);
vst1q_f32(col_z6, zero4);
vst1q_f32(col_z7, zero4);
vst1q_f32(col_z8, zero4);
col_z0 += 4;
col_z1 += 4;
col_z2 += 4;
col_z6 += 4;
col_z7 += 4;
col_z8 += 4;
}
for (int i = 0; i < mk4; ++i) {
col_z0[i] = 0.0;
col_z1[i] = 0.0;
col_z2[i] = 0.0;
col_z6[i] = 0.0;
col_z7[i] = 0.0;
col_z8[i] = 0.0;
}
col_data += 9 * oosize;
im_data += isize * isize;
}
} else if (stride[0] == 2 && filter_height == 3 && pad1 &&
dilation[0] == 1) {
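      // Note (added): fast path for 3x3 filters with stride 2. vld2q_f32
      // de-interleaves eight consecutive pixels into even/odd lanes, which
      // matches the stride-2 column sampling; one row pair feeds all nine
      // output planes per iteration, following the "3 2 3 1 0 1 3 2 3" tap
      // map noted below.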
for (int c = 0; c < im_channels; ++c) {
int oosize = osize * osize;
int nk4 = osize / 4;
int mk4 = osize % 4;
// 3 2 3 1 0 1 3 2 3
float *col0 = col_data + 0 * oosize + osize + 1;
float *col1 = col_data + 1 * oosize + osize;
float *col2 = col_data + 2 * oosize + osize;
float *col3 = col_data + 3 * oosize + 1;
float *col4 = col_data + 4 * oosize;
float *col5 = col_data + 5 * oosize;
float *col6 = col_data + 6 * oosize + 1;
float *col7 = col_data + 7 * oosize;
float *col8 = col_data + 8 * oosize;
float32x4x2_t im01;
float32x4x2_t im23;
const float *im_tmp_data0 = im_data;
const float *im_tmp_data2 = im_data + isize;
for (int j = 0; j < osize; ++j) {
for (int i = 0; i < nk4; ++i) {
im01 = vld2q_f32(im_tmp_data0);
im23 = vld2q_f32(im_tmp_data2);
vst1q_f32(col0, im23.val[1]);
vst1q_f32(col1, im23.val[0]);
vst1q_f32(col2, im23.val[1]);
vst1q_f32(col3, im01.val[1]);
vst1q_f32(col4, im01.val[0]);
vst1q_f32(col5, im01.val[1]);
vst1q_f32(col6, im23.val[1]);
vst1q_f32(col7, im23.val[0]);
vst1q_f32(col8, im23.val[1]);
col0 += 4;
col1 += 4;
col2 += 4;
col3 += 4;
col4 += 4;
col5 += 4;
col6 += 4;
col7 += 4;
col8 += 4;
im_tmp_data0 += 8;
im_tmp_data2 += 8;
}
const float *im_tmp_data1 = im_tmp_data0 + 1;
const float *im_tmp_data3 = im_tmp_data2 + 1;
for (int i = 0; i < mk4; ++i) {
*col0 = *im_tmp_data3;
*col1 = *im_tmp_data2;
*col2 = *im_tmp_data3;
*col3 = *im_tmp_data1;
*col4 = *im_tmp_data0;
*col5 = *im_tmp_data1;
*col6 = *im_tmp_data3;
*col7 = *im_tmp_data2;
*col8 = *im_tmp_data3;
col0++;
col1++;
col2++;
col3++;
col4++;
col5++;
col6++;
col7++;
col8++;
im_tmp_data0 += 2;
im_tmp_data1 += 2;
im_tmp_data2 += 2;
im_tmp_data3 += 2;
}
im_tmp_data0 += (isize - fill);
im_tmp_data2 += (isize - fill);
}
for (int i = 0; i < osize; ++i) {
col_data[0 * oosize + i * osize] = 0.0;
col_data[3 * oosize + i * osize] = 0.0;
col_data[6 * oosize + i * osize] = 0.0;
if (pad2) {
col_data[2 * oosize + osize - 1 + i * osize] = 0.0;
col_data[5 * oosize + osize - 1 + i * osize] = 0.0;
col_data[8 * oosize + osize - 1 + i * osize] = 0.0;
}
}
float32x4_t zero4;
zero4 = vdupq_n_f32(0.0);
auto col_z0 = col_data;
auto col_z1 = col_data + oosize;
auto col_z2 = col_data + 2 * oosize;
auto col_z6 = col_data + 6 * oosize + osize * (osize - 1);
auto col_z7 = col_data + 7 * oosize + osize * (osize - 1);
auto col_z8 = col_data + 8 * oosize + osize * (osize - 1);
for (int i = 0; i < nk4; ++i) {
vst1q_f32(col_z0, zero4);
vst1q_f32(col_z1, zero4);
vst1q_f32(col_z2, zero4);
if (pad2) {
vst1q_f32(col_z6, zero4);
vst1q_f32(col_z7, zero4);
vst1q_f32(col_z8, zero4);
}
col_z0 += 4;
col_z1 += 4;
col_z2 += 4;
col_z6 += 4;
col_z7 += 4;
col_z8 += 4;
}
for (int i = 0; i < mk4; ++i) {
col_z0[i] = 0.0;
col_z1[i] = 0.0;
col_z2[i] = 0.0;
if (pad2) {
col_z6[i] = 0.0;
col_z7[i] = 0.0;
col_z8[i] = 0.0;
}
}
col_data[1 * oosize + osize] = im_data[isize];
for (int i = 1; i < osize; ++i) {
col_data[3 * oosize + i] = im_data[(i - 1) * stride[0] + 1];
}
col_data[4 * oosize] = im_data[0];
col_data[7 * oosize] = im_data[isize];
col_data += 9 * oosize;
im_data += isize * isize;
}
} else {
for (int c = 0; c < channels_col; ++c) {
int w_offset = c % filter_width;
int h_offset = (c / filter_width) % filter_height;
int c_im = c / (filter_width * filter_height);
for (int h = 0; h < col_height; ++h) {
int im_row_idx = h * stride[0] - padding[0] + h_offset * dilation[0];
for (int w = 0; w < col_width; ++w) {
int im_col_idx =
w * stride[1] - padding[1] + w_offset * dilation[1];
int col_idx = (c * col_height + h) * col_width + w;
int im_idx =
(im_row_idx + c_im * im_height) * im_width + im_col_idx;
col_data[col_idx] = (im_row_idx < 0 || im_row_idx >= im_height ||
im_col_idx < 0 || im_col_idx >= im_width)
? static_cast<T>(0)
: im_data[im_idx];
}
}
}
}
#else
    for (int c = 0; c < channels_col; ++c) {
      int w_offset = c % filter_width;
      int h_offset = (c / filter_width) % filter_height;
@@ -86,6 +429,7 @@ class Im2ColFunctor<ColFormat::kCFO, CPU, T> {
        }
      }
    }
#endif
  }
};
@@ -158,7 +502,7 @@ class Col2ImFunctor<ColFormat::kCFO, CPU, T> {
};
template class Im2ColFunctor<ColFormat::kCFO, CPU, float>;
// template class Im2ColFunctor<ColFormat::kCFO, CPU, double>;
template class Col2ImFunctor<ColFormat::kCFO, CPU, float>;
template class Col2ImFunctor<ColFormat::kCFO, CPU, double>;
...
@@ -22,7 +22,7 @@ namespace math {
template <>
void matmul<float>(const framework::Tensor &matrix_a, bool trans_a,
                   const framework::Tensor &matrix_b, bool trans_b, float alpha,
                   framework::Tensor *matrix_out, float beta, bool relu) {
  auto dim_a = matrix_a.dims();
  auto dim_b = matrix_b.dims();
  auto dim_out = matrix_out->dims();
@@ -41,14 +41,20 @@ void matmul<float>(const framework::Tensor &matrix_a, bool trans_a,
  int N = dim_out[1];
  int K = (trans_a == false) ? dim_a[1] : dim_a[0];
  if (relu) {
    sgemm_relu(M, N, K, alpha, matrix_a.data<float>(), K,
               matrix_b.data<float>(), N, beta, matrix_out->data<float>(), N);
  } else {
    sgemm(M, N, K, alpha, matrix_a.data<float>(), K, matrix_b.data<float>(), N,
          beta, matrix_out->data<float>(), N);
  }
}
template <>
void matmul<double>(const framework::Tensor &matrix_a, bool trans_a,
                    const framework::Tensor &matrix_b, bool trans_b,
                    double alpha, framework::Tensor *matrix_out, double beta,
                    bool relu) {
  auto dim_a = matrix_a.dims();
  auto dim_b = matrix_b.dims();
  auto dim_out = matrix_out->dims();
...
@@ -25,7 +25,7 @@ namespace math {
template <typename T>
void matmul(const framework::Tensor &matrix_a, bool trans_a,
            const framework::Tensor &matrix_b, bool trans_b, T alpha,
            framework::Tensor *matrix_out, T beta, bool relu = false);
}  // namespace math
}  // namespace operators
}  // namespace paddle_mobile
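// Note (added): a minimal usage sketch of the relu-fused entry point,
// assuming row-major tensors already resized to {M, K}, {K, N} and {M, N}:
//   paddle_mobile::operators::math::matmul<float>(a, false, b, false, 1.0f,
//                                                 &out, 0.0f, true);
// With relu = true the GEMM epilogue clamps negative outputs in place,
// avoiding a separate activation pass over the output tensor.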
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifdef POOL_OP
#include "pool_2x2.h"
namespace paddle_mobile {
namespace operators {
namespace math {
void Pool2x2Max(vector<int> strides, vector<int> paddings, const Tensor *input,
Tensor *output) {
#if __ARM_NEON
const int batch_size = input->dims()[0];
const int input_height = input->dims()[2];
const int input_width = input->dims()[3];
const int output_channels = output->dims()[1];
int output_height = output->dims()[2];
const int output_width = output->dims()[3];
const int ksize_height = 2;
const int ksize_width = 2;
const int stride_height = strides[0];
const int stride_width = strides[1];
const int padding_height = paddings[0];
const int padding_width = paddings[1];
const int input_channel_stride = input_height * input_width;
const int output_channel_stride = output_height * output_width;
const float *input_data = input->data<float>();
float *output_data = output->mutable_data<float>();
int out_w_num = output_width >> 2;
const int in_h_num = output_height >> 1;
const int input_batch_stride = output_channels * input_channel_stride;
const int output_batch_stride = output_channels * output_channel_stride;
  int remain = output_width - (out_w_num << 2);  // scalar tail after the 4-wide vector loop
for (int i = 0; i < batch_size; ++i) {
for (int c = 0; c < output_channels; ++c) {
const float *input_data_chanel_row_next = input_data + input_width;
for (; output_height > 0; output_height--) {
if (out_w_num > 0) {
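          // Note (added): each pass loads eight contiguous floats from two
          // adjacent input rows, takes the vertical max (vmax.f32), then a
          // pairwise horizontal max (vpmax.f32), producing four 2x2-max
          // outputs per iteration. Caveat: out_w_num, remain and
          // output_height are written back by these loops, so they are
          // exhausted after the first pass; per-row copies would be needed
          // to vectorize every row and channel.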
asm volatile(
"max_loop: \n\t"
"vld1.f32 {q0,q1}, [%[in_ptr1]]! \n\t"
"vld1.f32 {q2,q3}, [%[in_ptr2]]! \n\t"
"vmax.f32 q0, q0, q2 \n\t"
"vmax.f32 q1, q1, q3 \n\t"
"vpmax.f32 d4, d0, d1 \n\t"
"vpmax.f32 d5, d2, d3 \n\t"
"subs %[out_w_num], #1 \n\t"
"vst1.32 {q2}, [%[out_ptr]]! \n\t"
"bne max_loop \n\t"
: [in_ptr1] "+r"(input_data),
[in_ptr2] "+r"(input_data_chanel_row_next),
[out_ptr] "+r"(output_data), [out_w_num] "+r"(out_w_num)
:
: "memory", "q0", "q1", "q2", "q3");
}
for (; remain > 0; remain--) {
float max_row1 = std::max(input_data[0], input_data[1]);
float max_row2 = std::max(input_data_chanel_row_next[0],
input_data_chanel_row_next[1]);
*output_data = std::max(max_row1, max_row2);
input_data += 2;
input_data_chanel_row_next += 2;
output_data++;
}
}
input_data += input_channel_stride;
output_data += output_channel_stride;
}
input_data += input_batch_stride;
output_data += output_batch_stride;
}
#endif
}
void Pool2x2Avg(vector<int> strides, vector<int> paddings, const Tensor *input,
Tensor *output) {
#if __ARM_NEON
const int batch_size = input->dims()[0];
const int input_height = input->dims()[2];
const int input_width = input->dims()[3];
const int output_channels = output->dims()[1];
int output_height = output->dims()[2];
const int output_width = output->dims()[3];
const int ksize_height = 2;
const int ksize_width = 2;
const int stride_height = strides[0];
const int stride_width = strides[1];
const int padding_height = paddings[0];
const int padding_width = paddings[1];
const int input_channel_stride = input_height * input_width;
const int output_channel_stride = output_height * output_width;
const float *input_data = input->data<float>();
float *output_data = output->mutable_data<float>();
int out_w_num = output_width >> 2;
const int input_batch_stride = output_channels * input_channel_stride;
const int output_batch_stride = output_channels * output_channel_stride;
float vqua[] = {0.25f, 0.25f, 0.25f, 0.25f};
  int remain = output_width - (out_w_num << 2);  // scalar tail after the 4-wide vector loop
for (int i = 0; i < batch_size; ++i) {
for (int c = 0; c < output_channels; ++c) {
const float *input_data_chanel_row_next = input_data + input_width;
for (; output_height > 0; output_height--) {
if (out_w_num > 0) {
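          // Note (added): same 4-outputs-per-iteration scheme as Pool2x2Max,
          // but with vadd.f32 + vpadd.f32 to form each 2x2 window sum, then
          // a multiply by q4, which holds the 0.25-per-lane constant vqua
          // loaded once before the loop.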
asm volatile(
"avg_loop: \n\t"
"vld1.32 {q0,q1}, [%[in_ptr1]]! \n\t"
"vld1.32 {q2,q3}, [%[in_ptr2]]! \n\t"
"vadd.f32 q0, q0, q2 \n\t"
"vadd.f32 q1, q1, q3 \n\t"
"vpadd.f32 d4, d0, d1 \n\t"
"vpadd.f32 d5, d2, d3 \n\t"
"vld1.32 {q4}, [%[vqua]]! \n\t"
"vmul.f32 q2, q2, q4 \n\t"
"subs %[out_w_num], #1 \n\t"
"vst1.32 {q2}, [%[out_ptr]]! \n\t"
"bne avg_loop \n\t"
: [in_ptr1] "+r"(input_data),
[in_ptr2] "+r"(input_data_chanel_row_next),
[out_ptr] "+r"(output_data), [out_w_num] "+r"(out_w_num)
: [vqua] "r"(vqua)
: "memory", "q0", "q1", "q2", "q3", "q4");
}
        for (; remain > 0; remain--) {
          float sum_row1 = input_data[0] + input_data[1];
          float sum_row2 = input_data_chanel_row_next[0] +
                           input_data_chanel_row_next[1];
          *output_data = (sum_row1 + sum_row2) * 0.25f;
input_data += 2;
input_data_chanel_row_next += 2;
output_data++;
}
}
input_data += input_channel_stride;
output_data += output_channel_stride;
}
input_data += input_batch_stride;
output_data += output_batch_stride;
}
#endif
}
} // namespace math
} // namespace operators
} // namespace paddle_mobile
#endif
@@ -12,16 +12,26 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifdef POOL_OP
#pragma once
#include "framework/tensor.h"
#if __ARM_NEON
#include <arm_neon.h>
#endif  // __ARM_NEON
namespace paddle_mobile {
namespace operators {
namespace math {
using framework::Tensor;
using std::vector;
void Pool2x2Max(vector<int> strides, vector<int> paddings, const Tensor *input,
                Tensor *output);
void Pool2x2Avg(vector<int> strides, vector<int> paddings, const Tensor *in_x,
                Tensor *out);
}  // namespace math
} // namespace operators
} // namespace paddle_mobile
#endif
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifdef POOL_OP
#define __ARM_NEON true
#include "pool_3x3.h"
#include "framework/tensor.h"
#if __ARM_NEON
#include <arm_neon.h>
#endif // __ARM_NEON
#include <climits>
namespace paddle_mobile {
namespace operators {
namespace math {
using framework::Tensor;
using std::max;
using std::min;
using std::vector;
void Pool3x3Max(vector<int> strides, vector<int> paddings, const Tensor *input,
Tensor *output) {
#if __ARM_NEON
const int batch_size = input->dims()[0];
const int input_height = input->dims()[2];
const int input_width = input->dims()[3];
const int output_channels = output->dims()[1];
const int output_height = output->dims()[2];
const int output_width = output->dims()[3];
const int _kernel_size = 3;
const int stride_height = strides[0];
const int stride_width = strides[1];
const int padding_height = paddings[0];
const int padding_width = paddings[1];
const float negative_max = -INT_MAX;
const int input_channel_stride = input_height * input_width;
const int output_channel_stride = output_height * output_width;
const float *input_data = input->data<float>();
float *output_data = output->mutable_data<float>();
const int input_batch_stride = output_channels * input_channel_stride;
const int output_batch_stride = output_channels * output_channel_stride;
const float *pos1, *pos2, *pos3, *output_ptr;
int hstart, wstart, hend, wend;
for (int i = 0; i < batch_size; ++i) {
for (int c = 0; c < output_channels; ++c) {
for (int ph = 0; ph < output_height; ph++) {
for (int pw = 0; pw < output_width; pw++) {
hstart = ph * stride_height - padding_height;
wstart = pw * stride_width - padding_width;
hend = min(hstart + _kernel_size, input_height + padding_height);
wend = min(wstart + _kernel_size, input_width + padding_width);
hstart = max(hstart, 0);
wstart = max(wstart, 0);
hend = min(hend, input_height);
wend = min(wend, input_width);
pos1 = input_data + hstart * input_width + wstart;
pos2 = input_data + (hstart + 1) * input_width + wstart;
pos3 = input_data + (hstart + 2) * input_width + wstart;
output_ptr = output_data + ph * output_width + pw;
if (hend - hstart != 3 || wend - wstart != 3) {
float max_value = -INT_MAX;
for (int h = hstart; h < hend; h++) {
for (int w = wstart; w < wend; w++) {
float value = input_data[h * input_width + w];
if (value > max_value) {
max_value = value;
}
}
}
output_data[ph * output_width + pw] = max_value;
} else {
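            // Note (added): full 3x3 window. Each row load brings in four
            // floats; after the row-wise max, the unused fourth lane is
            // overwritten with negative_max so that the two pairwise vpmax
            // reductions yield the maximum of the nine valid values only.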
#if defined(ARMV7)
asm volatile(
"vld1.32 {q1}, [%[pos1]] \n\t"
"vld1.32 {q2}, [%[pos2]] \n\t"
"vld1.32 {q3}, [%[pos3]] \n\t"
"vmax.f32 q1, q1, q2 \n\t"
"vmax.f32 q2, q1, q3 \n\t"
"vmov.f32 d5[1], %[negative_max] \n\t"
"vpmax.f32 d6, d4, d5 \n\t"
"vpmax.f32 d7, d6, d6 \n\t"
"vst1.32 {d7[0]},[%[output_ptr]] \n\t"
:
: [input_data] "r"(input_data), [pos1] "r"(pos1),
[pos2] "r"(pos2), [pos3] "r"(pos3),
[output_ptr] "r"(output_ptr), [negative_max] "r"(negative_max)
: "memory", "q1", "q2", "q3", "q4");
#else
const float32x4_t data1 = vld1q_f32(pos1);
const float32x4_t data2 = vld1q_f32(pos2);
const float32x4_t data3 = vld1q_f32(pos3);
const float32x4_t max_data =
vmaxq_f32(vmaxq_f32(data1, data3), data2);
float32x2_t res =
vpmax_f32(vget_high_f32(vsetq_lane_f32(-INT_MAX, max_data, 3)),
vget_low_f32(max_data));
res = vpmax_f32(res, res);
output_data[ph * output_width + pw] = vget_lane_f32(res, 0);
#endif
}
}
}
input_data += input_channel_stride;
output_data += output_channel_stride;
}
input_data += input_batch_stride;
output_data += output_batch_stride;
}
#endif
}
void Pool3x3Avg(vector<int> strides, vector<int> paddings, const Tensor *input,
Tensor *output) {
#if __ARM_NEON
const int batch_size = input->dims()[0];
const int input_height = input->dims()[2];
const int input_width = input->dims()[3];
const int output_channels = output->dims()[1];
const int output_height = output->dims()[2];
const int output_width = output->dims()[3];
const int _kernel_size = 3;
const int stride_height = strides[0];
const int stride_width = strides[1];
const int padding_height = paddings[0];
const int padding_width = paddings[1];
const int input_channel_stride = input_height * input_width;
const int output_channel_stride = output_height * output_width;
const float *input_data = input->data<float>();
float *output_data = output->mutable_data<float>();
const float zero = 0;
const float nine = 1.0 / 9.0;
const float nine_ptr[] = {nine, nine};
const int input_batch_stride = output_channels * input_channel_stride;
const int output_batch_stride = output_channels * output_channel_stride;
for (int i = 0; i < batch_size; ++i) {
for (int c = 0; c < output_channels; ++c) {
for (int ph = 0; ph < output_height; ph++) {
for (int pw = 0; pw < output_width; pw++) {
int hstart = ph * stride_height - padding_height;
int wstart = pw * stride_width - padding_width;
int hend = min(hstart + _kernel_size, input_height + padding_height);
int wend = min(wstart + _kernel_size, input_width + padding_width);
hstart = max(hstart, 0);
wstart = max(wstart, 0);
hend = min(hend, input_height);
wend = min(wend, input_width);
const float *pos1 = input_data + hstart * input_width + wstart;
const float *pos2 = input_data + (hstart + 1) * input_width + wstart;
const float *pos3 = input_data + (hstart + 2) * input_width + wstart;
const float *output_ptr = output_data + ph * output_width + pw;
if (hend - hstart != 3 || wend - wstart != 3) {
float sum = 0;
for (int h = hstart; h < hend; h++) {
for (int w = wstart; w < wend; w++) {
sum += input_data[h * input_width + w];
}
}
output_data[ph * output_width + pw] = sum / 9.0;
} else {
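            // Note (added): same spare-lane trick as Pool3x3Max, with adds:
            // the fourth lane is zeroed before the pairwise vpadd reduction,
            // and the 3x3 window sum is scaled by 1/9 to get the average.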
#if defined(ARMV7)
asm volatile(
"vld1.32 {q1}, [%[pos1]] \n\t"
"vld1.32 {q2}, [%[pos2]] \n\t"
"vld1.32 {q3}, [%[pos3]] \n\t"
"vadd.f32 q1, q1, q2 \n\t"
"vadd.f32 q2, q1, q3 \n\t"
"vmov.f32 d5[1], %[zero] \n\t"
"vpadd.f32 d6, d4, d5 \n\t"
"vpadd.f32 d6, d6, d6 \n\t"
"vld1.f32 d7, [%[nine_ptr]]! \n\t"
"vmul.f32 d6,d7 \n\t"
"vst1.32 {d6[0]},[%[output_ptr]] \n\t"
:
: [input_data] "r"(input_data), [pos1] "r"(pos1),
[pos2] "r"(pos2), [pos3] "r"(pos3),
[output_ptr] "r"(output_ptr), [zero] "r"(zero),
[nine_ptr] "r"(nine_ptr)
: "memory", "r6", "q1", "q2", "q3", "q4");
#else
const float32x4_t data1 = vld1q_f32(pos1);
const float32x4_t data2 = vld1q_f32(pos2);
const float32x4_t data3 = vld1q_f32(pos3);
const float32x4_t sum_data =
vaddq_f32(vaddq_f32(data1, data3), data2);
float32x2_t res =
vpadd_f32(vget_high_f32(vsetq_lane_f32(0, sum_data, 3)),
vget_low_f32(sum_data));
res = vpadd_f32(res, res);
output_data[ph * output_width + pw] = vget_lane_f32(res, 0) / 9.0;
#endif
}
}
}
input_data += input_channel_stride;
output_data += output_channel_stride;
}
input_data += input_batch_stride;
output_data += output_batch_stride;
}
#endif
}
} // namespace math
} // namespace operators
} // namespace paddle_mobile
#endif
@@ -12,16 +12,28 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifdef POOL_OP
#pragma once
#include "framework/tensor.h"
#if __ARM_NEON
#include <arm_neon.h>
#endif  // __ARM_NEON
namespace paddle_mobile {
namespace operators {
namespace math {
using framework::Tensor;
using std::vector;
void Pool3x3Max(vector<int> strides, vector<int> paddings, const Tensor *input,
Tensor *output);
void Pool3x3Avg(vector<int> strides, vector<int> paddings, const Tensor *in_x,
Tensor *out);
} // namespace math
} // namespace operators
} // namespace paddle_mobile
#endif
@@ -12,8 +12,10 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifdef POOL_OP
#include "pooling.h" #include "pooling.h"
#include <common/types.h> #include "common/types.h"
namespace paddle_mobile { namespace paddle_mobile {
namespace operators { namespace operators {
...@@ -36,9 +38,7 @@ class PoolFunctor<CPU, PoolProcess, T> { ...@@ -36,9 +38,7 @@ class PoolFunctor<CPU, PoolProcess, T> {
const int input_height = input.dims()[2]; const int input_height = input.dims()[2];
const int input_width = input.dims()[3]; const int input_width = input.dims()[3];
if (output == nullptr) {
DLOG << "output tensor is null";
}
const int output_channels = output->dims()[1]; const int output_channels = output->dims()[1];
const int output_height = output->dims()[2]; const int output_height = output->dims()[2];
@@ -57,7 +57,7 @@ class PoolFunctor<CPU, PoolProcess, T> {
    T *output_data = output->mutable_data<T>();
    for (int i = 0; i < batch_size; i++) {
#pragma omp parallel for
      for (int c = 0; c < output_channels; ++c) {
        for (int ph = 0; ph < output_height; ++ph) {
          int hstart = ph * stride_height - padding_height;
@@ -91,3 +91,5 @@ template class PoolFunctor<CPU, math::MaxPool<float>, float>;
}  // namespace math
}  // namespace operators
}  // namespace paddle_mobile
#endif
@@ -12,10 +12,14 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifdef POOL_OP
#pragma once
#include "common/log.h"
#include "framework/tensor.h"
#include "pool_2x2.h"
#include "pool_3x3.h"
namespace paddle_mobile {
namespace operators {
@@ -64,3 +68,5 @@ class PoolFunctor {
}
}  // namespace operators
}  // namespace paddle_mobile
#endif
@@ -11,6 +11,9 @@ distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifdef SOFTMAX_OP
#include "operators/math/softmax.h" #include "operators/math/softmax.h"
#include "common/types.h" #include "common/types.h"
#if __ARM_NEON #if __ARM_NEON
...@@ -153,3 +156,4 @@ template class SoftmaxFuntor<CPU, float>; ...@@ -153,3 +156,4 @@ template class SoftmaxFuntor<CPU, float>;
} // namespace math } // namespace math
} // namespace operators } // namespace operators
} // namespace paddle_mobile } // namespace paddle_mobile
#endif
@@ -12,6 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifdef SOFTMAX_OP
#pragma once
#include "framework/tensor.h"
namespace paddle_mobile {
@@ -26,3 +27,4 @@ class SoftmaxFuntor {
}  // namespace math
}  // namespace operators
}  // namespace paddle_mobile
#endif
@@ -12,6 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifdef MUL_OP
#include "mul_op.h" #include "mul_op.h"
namespace paddle_mobile { namespace paddle_mobile {
...@@ -19,10 +21,10 @@ namespace operators { ...@@ -19,10 +21,10 @@ namespace operators {
template <typename Dtype, typename T> template <typename Dtype, typename T>
void MulOp<Dtype, T>::InferShape() const { void MulOp<Dtype, T>::InferShape() const {
auto x_dims = param_.InputX()->dims(); auto x_dims = this->param_.InputX()->dims();
auto y_dims = param_.InputY()->dims(); auto y_dims = this->param_.InputY()->dims();
int x_num_col_dims = param_.XNumColDims(); int x_num_col_dims = this->param_.XNumColDims();
int y_num_col_dims = param_.YNumColDims(); int y_num_col_dims = this->param_.YNumColDims();
assert(x_dims.size() > x_num_col_dims); assert(x_dims.size() > x_num_col_dims);
assert(y_dims.size() > y_num_col_dims); assert(y_dims.size() > y_num_col_dims);
...@@ -46,12 +48,22 @@ void MulOp<Dtype, T>::InferShape() const { ...@@ -46,12 +48,22 @@ void MulOp<Dtype, T>::InferShape() const {
} }
framework::DDim ddim = framework::make_ddim(output_dims); framework::DDim ddim = framework::make_ddim(output_dims);
param_.Out()->Resize(ddim); this->param_.Out()->Resize(ddim);
} }
template class MulOp<CPU, float>; template class MulOp<CPU, float>;
} // namespace operators } // namespace operators
} // namespace paddle_mobile } // namespace paddle_mobile
namespace ops = paddle_mobile::operators; namespace ops = paddle_mobile::operators;
USE_OP(mul); #ifdef PADDLE_MOBILE_CPU
REGISTER_OPERATOR(mul, ops::MulOp); USE_OP_CPU(mul);
REGISTER_OPERATOR_CPU(mul, ops::MulOp);
#endif
#ifdef PADDLE_MOBILE_MALI_GPU
USE_OP_MALI_GPU(mul);
REGISTER_OPERATOR_MALI_GPU(mul, ops::MulOp);
#endif
#ifdef PADDLE_MOBILE_FPGA
#endif
#endif
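// Note (added): the per-backend USE_OP_* / REGISTER_OPERATOR_* pairs above
// replace the old unconditional USE_OP / REGISTER_OPERATOR macros, so each
// operator is compiled and registered only for the backends enabled at
// build time via PADDLE_MOBILE_CPU, PADDLE_MOBILE_MALI_GPU or
// PADDLE_MOBILE_FPGA.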
@@ -11,6 +11,9 @@ distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifdef MUL_OP
#pragma once
#include <string>
@@ -22,26 +25,25 @@ namespace paddle_mobile {
namespace operators {
template <typename DeviceType, typename T>
class MulOp : public framework::OperatorWithKernel<
                  DeviceType, MulParam, operators::MulKernel<DeviceType, T>> {
 public:
  MulOp(const std::string &type, const VariableNameMap &inputs,
        const VariableNameMap &outputs, const framework::AttributeMap &attrs,
        std::shared_ptr<framework::Scope> scope)
      : framework::OperatorWithKernel<DeviceType, MulParam,
                                      operators::MulKernel<DeviceType, T>>(
            type, inputs, outputs, attrs, scope) {}
  using framework::OperatorWithKernel<
      DeviceType, MulParam,
      operators::MulKernel<DeviceType, T>>::OperatorWithKernel;
  void InferShape() const override;
 protected:
};
}  // namespace operators
}  // namespace paddle_mobile
#endif
@@ -12,14 +12,16 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifdef MULTICLASSNMS_OP
#include "operators/multiclass_nms_op.h" #include "operators/multiclass_nms_op.h"
namespace paddle_mobile { namespace paddle_mobile {
namespace operators { namespace operators {
template <typename Dtype, typename T> template <typename Dtype, typename T>
void MultiClassNMSOp<Dtype, T>::InferShape() const { void MultiClassNMSOp<Dtype, T>::InferShape() const {
auto input_bboxes_dims = param_.InputBBoxes()->dims(); auto input_bboxes_dims = this->param_.InputBBoxes()->dims();
auto input_scores_dims = param_.InputScores()->dims(); auto input_scores_dims = this->param_.InputScores()->dims();
if (input_scores_dims.size() != 3) { if (input_scores_dims.size() != 3) {
LOG(kLOG_ERROR) << "Input Scores size must be 3"; LOG(kLOG_ERROR) << "Input Scores size must be 3";
} }
...@@ -30,12 +32,20 @@ void MultiClassNMSOp<Dtype, T>::InferShape() const { ...@@ -30,12 +32,20 @@ void MultiClassNMSOp<Dtype, T>::InferShape() const {
LOG(kLOG_ERROR) << "Predict bboxes must be equal"; LOG(kLOG_ERROR) << "Predict bboxes must be equal";
} }
// pre size, will change in Compute. // pre size, will change in Compute.
param_.Out()->Resize(framework::make_ddim({input_bboxes_dims[1], 6})); this->param_.Out()->Resize(framework::make_ddim({input_bboxes_dims[1], 6}));
} }
template class MultiClassNMSOp<CPU, float>; template class MultiClassNMSOp<CPU, float>;
} // namespace operators } // namespace operators
} // namespace paddle_mobile } // namespace paddle_mobile
namespace ops = paddle_mobile::operators; namespace ops = paddle_mobile::operators;
USE_OP(multiclass_nms); #ifdef PADDLE_MOBILE_CPU
REGISTER_OPERATOR(multiclass_nms, ops::MultiClassNMSOp); USE_OP_CPU(multiclass_nms);
REGISTER_OPERATOR_CPU(multiclass_nms, ops::MultiClassNMSOp);
#endif
#ifdef PADDLE_MOBILE_MALI_GPU
#endif
#ifdef PADDLE_MOBILE_FPGA
#endif
#endif
@@ -12,6 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifdef MULTICLASSNMS_OP
#pragma once
#include <string>
@@ -26,27 +28,28 @@ namespace operators {
using paddle_mobile::framework::Tensor;
template <typename DeviceType, typename T>
class MultiClassNMSOp : public framework::OperatorWithKernel<
                            DeviceType, MultiClassNMSParam,
                            operators::MultiClassNMSKernel<DeviceType, T>> {
 public:
  MultiClassNMSOp(const std::string &type, const VariableNameMap &inputs,
                  const VariableNameMap &outputs,
                  const framework::AttributeMap &attrs,
                  std::shared_ptr<framework::Scope> scope)
      : framework::OperatorWithKernel<
            DeviceType, MultiClassNMSParam,
            operators::MultiClassNMSKernel<DeviceType, T>>(
            type, inputs, outputs, attrs, scope) {}
  using framework::OperatorWithKernel<
      DeviceType, MultiClassNMSParam,
      operators::MultiClassNMSKernel<DeviceType, T>>::OperatorWithKernel;
  void InferShape() const override;
 protected:
};
}  // namespace operators
}  // namespace paddle_mobile
#endif
@@ -13,9 +13,10 @@ See the License for the specific language governing permissions and
limitations under the License. */
#include "op_param.h"
namespace paddle_mobile {
namespace operators {
#ifdef CONV_OP
Print &operator<<(Print &printer, const ConvParam &conv_param) { Print &operator<<(Print &printer, const ConvParam &conv_param) {
printer << "parameter of conv: " printer << "parameter of conv: "
<< "\n"; << "\n";
...@@ -36,5 +37,33 @@ Print &operator<<(Print &printer, const ConvParam &conv_param) { ...@@ -36,5 +37,33 @@ Print &operator<<(Print &printer, const ConvParam &conv_param) {
printer << " output dims: " << conv_param.Output()->dims(); printer << " output dims: " << conv_param.Output()->dims();
return printer; return printer;
} }
#endif
#ifdef FUSION_CONVADD_OP
Print &operator<<(Print &printer, const FusionConvAddParam &conv_param) {
printer << "parameter of conv_add: "
<< "\n";
printer << " stride: "
<< " (" << conv_param.Strides()[0] << conv_param.Strides()[1] << ") "
<< "\n";
printer << " paddings: "
<< " (" << conv_param.Paddings()[0] << conv_param.Paddings()[1]
<< ") "
<< "\n";
printer << " dilations: "
<< " (" << conv_param.Dilations()[0] << conv_param.Dilations()[1]
<< ") "
<< "\n";
printer << " groups: " << conv_param.Groups() << "\n";
printer << " input dims: " << conv_param.Input()->dims() << "\n";
printer << " filter dims: " << conv_param.Filter()->dims() << "\n";
printer << " bias dims: " << conv_param.Bias()->dims() << "\n";
printer << " output dims: " << conv_param.Output()->dims();
return printer;
}
#endif
}  // namespace operators
}  // namespace paddle_mobile
@@ -34,7 +34,7 @@ using framework::Tensor;
using std::string;
using std::vector;
class OpParam {
 protected:
  template <typename T>
  static T *InputFrom(const VariableNameMap &inputs, const Scope &scope) {
@@ -165,10 +165,10 @@ class OpParam {
  template <typename T>
  static T *GetVarValue(const string &key, const VariableNameMap &var_map,
                        const Scope &scope) {
    PADDLE_MOBILE_ENFORCE(var_map.count(key) > 0,
                          "%s is not contained in var_map", key.c_str())
    auto var_vec = var_map.at(key);
    if (!var_vec.empty()) {
      auto var = scope.FindVar(var_vec[0]);
      return var->GetMutable<T>();
    } else {
@@ -191,6 +191,7 @@ class OpParam {
  }
};
#ifdef CONV_OP
class ConvParam : OpParam {
 public:
  ConvParam(const VariableNameMap &inputs, const VariableNameMap &outputs,
@@ -230,7 +231,9 @@ class ConvParam : OpParam {
};
Print &operator<<(Print &printer, const ConvParam &conv_param);
#endif
#ifdef ELEMENTWISEADD_OP
class ElementwiseAddParam : OpParam {
 public:
  ElementwiseAddParam(const VariableNameMap &inputs,
@@ -258,6 +261,9 @@ class ElementwiseAddParam : OpParam {
  int axis_;
};
#endif
#ifdef MUL_OP
class MulParam : OpParam {
 public:
  MulParam(const VariableNameMap &inputs, const VariableNameMap &outputs,
@@ -287,7 +293,9 @@ class MulParam : OpParam {
  int x_num_col_dims_;
  int y_num_col_dims_;
};
#endif
#ifdef CONCAT_OP
class ConcatParam : public OpParam {
 public:
  ConcatParam(const VariableNameMap &inputs, const VariableNameMap &outputs,
@@ -309,7 +317,9 @@ class ConcatParam : public OpParam {
  Tensor *out_;
  int axis_;
};
#endif
#ifdef LRN_OP
class LrnParam : public OpParam {
 public:
  LrnParam(const VariableNameMap &inputs, const VariableNameMap &outputs,
@@ -351,6 +361,9 @@ class LrnParam : public OpParam {
  float k_;
  string data_format_;
};
#endif
#ifdef BATCHNORM_OP
class BatchNormParam : OpParam {
 public:
  BatchNormParam(const VariableNameMap &inputs, const VariableNameMap &outputs,
@@ -399,6 +412,9 @@ class BatchNormParam : OpParam {
  bool is_test_;
  string data_format_;
};
#endif
#ifdef POOL_OP
class PoolParam : public OpParam {
 public:
  PoolParam(const VariableNameMap &inputs, const VariableNameMap &outputs,
@@ -442,6 +458,9 @@ class PoolParam : public OpParam {
  bool gloabal_pooling_ = false;
};
#endif
#ifdef PRIORBOX_OP
class PriorBoxParam : public OpParam {
 public:
  PriorBoxParam(const VariableNameMap &inputs, const VariableNameMap &outputs,
@@ -503,7 +522,9 @@ class PriorBoxParam : public OpParam {
  float step_h_;
  float offset_;
};
#endif
#ifdef BOXCODER_OP
class BoxCoderParam : public OpParam {
 public:
  BoxCoderParam(const VariableNameMap &inputs, const VariableNameMap &outputs,
@@ -533,7 +554,9 @@ class BoxCoderParam : public OpParam {
  Tensor *output_box_;
  std::string code_type_;
};
#endif
#ifdef SOFTMAX_OP
class SoftmaxParam : public OpParam {
 public:
  SoftmaxParam(const VariableNameMap &inputs, const VariableNameMap &outputs,
@@ -549,7 +572,9 @@ class SoftmaxParam : public OpParam {
  Tensor *input_x_;
  Tensor *out_;
};
#endif
#ifdef SIGMOID_OP
class SigmoidParam : public OpParam {
 public:
  SigmoidParam(const VariableNameMap &inputs, const VariableNameMap &outputs,
@@ -565,6 +590,9 @@ class SigmoidParam : public OpParam {
  Tensor *input_x_;
  Tensor *out_;
};
#endif
#ifdef MULTICLASSNMS_OP
class MultiClassNMSParam : public OpParam {
 public:
  MultiClassNMSParam(const VariableNameMap &inputs,
@@ -610,6 +638,7 @@ class MultiClassNMSParam : public OpParam {
  float nms_eta_;
  float score_threshold_;
};
#endif
class FeedParam : public OpParam {
 public:
@@ -646,6 +675,7 @@ class FetchParam : public OpParam {
  Tensor *out_;
};
#ifdef TRANSPOSE_OP
class TransposeParam : public OpParam {
 public:
  TransposeParam(const VariableNameMap &inputs, const VariableNameMap &outputs,
@@ -666,7 +696,9 @@ class TransposeParam : public OpParam {
  Tensor *out_;
  vector<int> axis_;
};
#endif
#ifdef RESHAPE_OP
class ReshapeParam : public OpParam {
 public:
  ReshapeParam(const VariableNameMap &inputs, const VariableNameMap &outputs,
@@ -695,7 +727,9 @@ class ReshapeParam : public OpParam {
  vector<int> shape_;
  bool inplace_;
};
#endif
#ifdef RELU_OP
/*
 * @b The op layer instantiates this param and passes it to the kernel layer.
 * */
@@ -715,11 +749,13 @@ class ReluParam : public OpParam {
  Tensor *input_x_;
  Tensor *out_;
};
#endif
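// The comment above ReluParam captures the layering contract used throughout
// this header: the op layer builds the param from inputs/outputs/attrs and
// hands it to the kernel layer, which reads everything from that one object.
// A condensed, hypothetical illustration (DemoParam/DemoKernel/DemoOp are not
// paddle-mobile types):
//
//   struct DemoParam {
//     const Tensor *input;  // resolved from the scope by the op layer
//     Tensor *output;       // resized in InferShape, written by the kernel
//   };
//   struct DemoKernel {
//     void Compute(const DemoParam &p) const;  // stateless: reads the param
//   };
//   struct DemoOp {
//     DemoParam param_;  // instantiated once, in the op layer
//     void Run(const DemoKernel &k) const { k.Compute(param_); }
//   };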
#ifdef FUSION_FC_OP
class FusionFcParam : public OpParam {
 public:
  FusionFcParam(const VariableNameMap &inputs, const VariableNameMap &outputs,
                const AttributeMap &attrs, const Scope &scope) {
    input_x_ = InputXFrom<LoDTensor>(inputs, scope);
    input_y_ = InputYFrom<LoDTensor>(inputs, scope);
    input_z_ = InputZFrom<LoDTensor>(inputs, scope);
@@ -751,6 +787,66 @@ class FushionFcParam : public OpParam {
  int y_num_col_dims_;
  int axis_;
};
#endif
#ifdef FUSION_CONVADD_OP
class FusionConvAddParam : public OpParam {
 public:
  FusionConvAddParam(const VariableNameMap &inputs,
                     const VariableNameMap &outputs, const AttributeMap &attrs,
                     const Scope &scope) {
    bias_ = InputYFrom<LoDTensor>(inputs, scope);
    axis_ = GetAttr<int>("axis", attrs);
    filter_ = FilterFrom<LoDTensor>(inputs, scope);
    input_ = InputFrom<LoDTensor>(inputs, scope);
    output_ = OutFrom<LoDTensor>(outputs, scope);
    strides_ = GetAttr<vector<int>>("strides", attrs);
    paddings_ = GetAttr<vector<int>>("paddings", attrs);
    dilations_ = GetAttr<vector<int>>("dilations", attrs);
    groups = GetAttr<int>("groups", attrs);
  }
  Tensor *Bias() const { return bias_; }
  const int &Axis() const { return axis_; }
  const Tensor *Input() const { return input_; }
  const Tensor *Filter() const { return filter_; }
  Tensor *Output() const { return output_; }
  const vector<int> &Strides() const { return strides_; }
  const vector<int> &Paddings() const { return paddings_; }
  const vector<int> &Dilations() const { return dilations_; }
  const int &Groups() const { return groups; }

 protected:
  Tensor *bias_;
  int axis_;
  Tensor *input_;
  Tensor *output_;
  Tensor *filter_;
  vector<int> strides_;
  vector<int> paddings_;
  vector<int> dilations_;
  int groups;
};
Print &operator<<(Print &printer, const FusionConvAddParam &conv_param);
#endif
#ifdef FUSION_CONVADD_RELU_OP
class FusionConvAddReluParam : public FusionConvAddParam {
 public:
  FusionConvAddReluParam(const VariableNameMap &inputs,
                         const VariableNameMap &outputs,
                         const AttributeMap &attrs, const Scope &scope)
      : FusionConvAddParam(inputs, outputs, attrs, scope) {}
};
#endif
class Im2SequenceParam : public OpParam {
 public:
......
@@ -12,7 +12,11 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifdef POOL_OP

#include "pool_op.h"
#include "framework/op_proto_maker.h"
#include "framework/op_registry.h"

namespace paddle_mobile {
namespace operators {
@@ -30,13 +34,13 @@ int PoolOutputSize(int input_size, int filter_size, int padding, int stride,
}
template <typename DeviceType, typename T>
void PoolOp<DeviceType, T>::InferShape() const {
  auto in_x_dims = this->param_.Input()->dims();
  std::vector<int> ksize = this->param_.Ksize();
  std::vector<int> paddings = this->param_.Paddings();
  std::vector<int> strides = this->param_.Strides();
  bool ceil_mode = this->param_.isCeilMode();
  if (this->param_.isGlobalPooling()) {
    ksize.resize(static_cast<size_t>(in_x_dims.size()) - 2);
    for (size_t i = 0; i < ksize.size(); ++i) {
      paddings[i] = 0;
@@ -48,12 +52,22 @@ void PoolOp<DeviceType, T>::InferShape() const {
    output_shape.push_back(PoolOutputSize(in_x_dims[i + 2], ksize[i],
                                          paddings[i], strides[i], ceil_mode));
  }
  this->param_.Output()->Resize(framework::make_ddim(output_shape));
}
template class PoolOp<CPU, float>;
} // namespace operators
} // namespace paddle_mobile

namespace ops = paddle_mobile::operators;
#ifdef PADDLE_MOBILE_CPU
USE_OP_CPU(pool2d);
REGISTER_OPERATOR_CPU(pool2d, ops::PoolOp);
#endif
#ifdef PADDLE_MOBILE_MALI_GPU
USE_OP_MALI_GPU(pool2d);
REGISTER_OPERATOR_MALI_GPU(pool2d, ops::PoolOp);
#endif
#ifdef PADDLE_MOBILE_FPGA
#endif
#endif
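The per-platform registration blocks above replace the old single USE_OP /
REGISTER_OPERATOR pair. A rough sketch of the idea behind such registries — a
static map from op-type string to a factory, populated by a registrar object
at namespace scope — is shown below. This is an illustrative pattern only, not
the actual expansion of paddle-mobile's USE_OP_CPU / REGISTER_OPERATOR_CPU
macros:

#include <functional>
#include <map>
#include <string>

// Illustrative registry: op name -> factory. Real frameworks key this per
// device (CPU / MALI_GPU / FPGA), which is what the separate
// REGISTER_OPERATOR_CPU / REGISTER_OPERATOR_MALI_GPU macros achieve.
struct OpBase {
  virtual ~OpBase() = default;
};
using OpCreator = std::function<OpBase *()>;

inline std::map<std::string, OpCreator> &CpuRegistry() {
  static std::map<std::string, OpCreator> registry;
  return registry;
}

struct CpuRegistrar {
  CpuRegistrar(const std::string &type, OpCreator creator) {
    CpuRegistry()[type] = std::move(creator);
  }
};

// e.g. a file-scope registrar, analogous to REGISTER_OPERATOR_CPU(pool2d, ...):
// static CpuRegistrar pool2d_reg("pool2d", [] { return new PoolOpImpl; });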
@@ -12,13 +12,16 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifdef POOL_OP

#pragma once

#include <string>
#include "framework/operator.h"
#include "operators/kernel/pool_kernel.h"
#include "operators/op_param.h"

namespace paddle_mobile {
namespace operators {
using framework::AttributeMap;
@@ -26,24 +29,23 @@ using framework::OperatorWithKernel;
using framework::Scope;
using std::string;
template <typename DeviceType, typename T>
class PoolOp : public OperatorWithKernel<DeviceType, PoolParam,
                                         operators::PoolKernel<DeviceType, T>> {
 public:
  PoolOp(const string &type, const VariableNameMap &inputs,
         const VariableNameMap &outputs, const AttributeMap &attrs,
         std::shared_ptr<Scope> scope)
      : OperatorWithKernel<DeviceType, PoolParam,
                           operators::PoolKernel<DeviceType, T>>(
            type, inputs, outputs, attrs, scope) {}

  using OperatorWithKernel<
      DeviceType, PoolParam,
      operators::PoolKernel<DeviceType, T>>::OperatorWithKernel;

  void InferShape() const override;

 private:
};
} // namespace operators
} // namespace paddle_mobile
#endif
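Every operator header in this commit follows the same refactor: the param
struct and the kernel become template arguments of OperatorWithKernel, so the
per-op RunImpl() bodies and param_ members deleted above now live once in the
base class. A minimal sketch of that shape (illustrative only; paddle-mobile's
real OperatorWithKernel also carries the type/inputs/outputs/attrs/scope
plumbing):

#include <utility>

// Illustrative base: one RunImpl and one param_ shared by all ops.
template <typename ParamType, typename KernelType>
class OperatorWithKernelSketch {
 public:
  explicit OperatorWithKernelSketch(ParamType param)
      : param_(std::move(param)) {}

  // Replaces the per-op RunImpl copies removed in this diff.
  void RunImpl() const { kernel_.Compute(param_); }

 protected:
  ParamType param_;
  KernelType kernel_;  // stateless; reads its operands from param_
};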
@@ -12,6 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifdef PRIORBOX_OP

#include "operators/prior_box_op.h"
#include <vector>
namespace paddle_mobile {
@@ -19,13 +21,13 @@ namespace operators {
template <typename Dtype, typename T>
void PriorBoxOp<Dtype, T>::InferShape() const {
  auto input_dims = this->param_.Input()->dims();
  auto input_image_dims = this->param_.InputImage()->dims();
  auto min_sizes = this->param_.MinSizes();
  auto max_sizes = this->param_.MaxSizes();
  auto variances = this->param_.Variances();
  auto aspect_ratios = this->param_.AspectRatios();
  bool flip = this->param_.Flip();
  std::vector<float> aspect_ratios_vec;
  ExpandAspectRatios(aspect_ratios, flip, &aspect_ratios_vec);
@@ -39,13 +41,21 @@ void PriorBoxOp<Dtype, T>::InferShape() const {
  dim_vec[1] = input_dims[3];
  dim_vec[2] = num_priors;
  dim_vec[3] = 4;
  this->param_.OutputBoxes()->Resize(framework::make_ddim(dim_vec));
  this->param_.OutputVariances()->Resize(framework::make_ddim(dim_vec));
}
template class PriorBoxOp<CPU, float>;
} // namespace operators
} // namespace paddle_mobile

namespace ops = paddle_mobile::operators;
#ifdef PADDLE_MOBILE_CPU
USE_OP_CPU(prior_box);
REGISTER_OPERATOR_CPU(prior_box, ops::PriorBoxOp);
#endif
#ifdef PADDLE_MOBILE_MALI_GPU
#endif
#ifdef PADDLE_MOBILE_FPGA
#endif
#endif
@@ -12,6 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifdef PRIORBOX_OP

#pragma once

#include <string>
@@ -26,27 +28,27 @@ namespace operators {
using paddle_mobile::framework::Tensor;
template <typename DeviceType, typename T>
class PriorBoxOp
    : public framework::OperatorWithKernel<
          DeviceType, PriorBoxParam, operators::PriorBoxKernel<DeviceType, T>> {
 public:
  PriorBoxOp(const std::string &type, const VariableNameMap &inputs,
             const VariableNameMap &outputs,
             const framework::AttributeMap &attrs,
             std::shared_ptr<framework::Scope> scope)
      : framework::OperatorWithKernel<DeviceType, PriorBoxParam,
                                      operators::PriorBoxKernel<DeviceType, T>>(
            type, inputs, outputs, attrs, scope) {}

  using framework::OperatorWithKernel<
      DeviceType, PriorBoxParam,
      operators::PriorBoxKernel<DeviceType, T>>::OperatorWithKernel;

  void InferShape() const override;

 protected:
};
} // namespace operators
} // namespace paddle_mobile
#endif
@@ -12,14 +12,16 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifdef RELU_OP

#include "operators/relu_op.h"
namespace paddle_mobile {
namespace operators {
template <typename Dtype, typename T>
void ReluOp<Dtype, T>::InferShape() const {
  auto input_dims = this->param_.InputX()->dims();
  this->param_.Out()->Resize(input_dims);
}
template class ReluOp<CPU, float>;
} // namespace operators
@@ -31,5 +33,15 @@ template class ReluOp<CPU, float>;
 * these all need to correspond to the types in the model
 * */
namespace ops = paddle_mobile::operators;
#ifdef PADDLE_MOBILE_CPU
USE_OP_CPU(relu);
REGISTER_OPERATOR_CPU(relu, ops::ReluOp);
#endif
#ifdef PADDLE_MOBILE_MALI_GPU
USE_OP_MALI_GPU(relu);
REGISTER_OPERATOR_MALI_GPU(relu, ops::ReluOp);
#endif
#ifdef PADDLE_MOBILE_FPGA
#endif
#endif
@@ -12,6 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifdef RELU_OP

#pragma once

#include <string>
@@ -26,36 +28,29 @@ namespace operators {
using paddle_mobile::framework::Tensor;
template <typename DeviceType, typename T>
class ReluOp
    : public framework::OperatorWithKernel<
          DeviceType, ReluParam, operators::ReluKernel<DeviceType, T>> {
 public:
  /*
   * @b Constructor of the op: it must call the parent-class constructor and
   *    instantiate the op's own param struct.
   * */
  ReluOp(const std::string &type, const VariableNameMap &inputs,
         const VariableNameMap &outputs, const framework::AttributeMap &attrs,
         std::shared_ptr<framework::Scope> scope)
      : framework::OperatorWithKernel<DeviceType, ReluParam,
                                      operators::ReluKernel<DeviceType, T>>(
            type, inputs, outputs, attrs, scope) {}

  using framework::OperatorWithKernel<
      DeviceType, ReluParam,
      operators::ReluKernel<DeviceType, T>>::OperatorWithKernel;

  void InferShape() const override;

 protected:
};
} // namespace operators
} // namespace paddle_mobile
#endif
@@ -12,6 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifdef RESHAPE_OP

#include "operators/reshape_op.h"
#include <vector>
namespace paddle_mobile {
@@ -20,15 +22,25 @@ namespace operators {
template <typename Dtype, typename T>
void ReshapeOp<Dtype, T>::InferShape() const {
  /// todo: add InputShape() detection.
  auto &shape = this->param_.Shape();
  auto input_x_dims = this->param_.InputX()->dims();
  auto out_dims = ValidateShape(shape, input_x_dims);
  this->param_.Out()->Resize(out_dims);
}
template class ReshapeOp<CPU, float>;
} // namespace operators
} // namespace paddle_mobile

namespace ops = paddle_mobile::operators;
#ifdef PADDLE_MOBILE_CPU
USE_OP_CPU(reshape);
REGISTER_OPERATOR_CPU(reshape, ops::ReshapeOp);
#endif
#ifdef PADDLE_MOBILE_MALI_GPU
USE_OP_MALI_GPU(reshape);
REGISTER_OPERATOR_MALI_GPU(reshape, ops::ReshapeOp);
#endif
#ifdef PADDLE_MOBILE_FPGA
#endif
#endif
@@ -12,6 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifdef RESHAPE_OP

#pragma once

#include <string>
@@ -26,26 +28,27 @@ namespace operators {
using paddle_mobile::framework::Tensor;
template <typename DeviceType, typename T>
class ReshapeOp
    : public framework::OperatorWithKernel<
          DeviceType, ReshapeParam, operators::ReshapeKernel<DeviceType, T>> {
 public:
  ReshapeOp(const std::string &type, const VariableNameMap &inputs,
            const VariableNameMap &outputs,
            const framework::AttributeMap &attrs,
            std::shared_ptr<framework::Scope> scope)
      : framework::OperatorWithKernel<DeviceType, ReshapeParam,
                                      operators::ReshapeKernel<DeviceType, T>>(
            type, inputs, outputs, attrs, scope) {}

  using framework::OperatorWithKernel<
      DeviceType, ReshapeParam,
      operators::ReshapeKernel<DeviceType, T>>::OperatorWithKernel;

  void InferShape() const override;

 protected:
};
} // namespace operators
} // namespace paddle_mobile
#endif
@@ -12,18 +12,28 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifdef SIGMOID_OP

#include "operators/sigmoid_op.h"
namespace paddle_mobile {
namespace operators {
template <typename DeviceType, typename T>
void SigmoidOp<DeviceType, T>::InferShape() const {
  this->param_.Out()->Resize(this->param_.InputX()->dims());
}
template class SigmoidOp<CPU, float>;
} // namespace operators
} // namespace paddle_mobile

namespace ops = paddle_mobile::operators;
#ifdef PADDLE_MOBILE_CPU
USE_OP_CPU(sigmoid);
REGISTER_OPERATOR_CPU(sigmoid, ops::SigmoidOp);
#endif
#ifdef PADDLE_MOBILE_MALI_GPU
#endif
#ifdef PADDLE_MOBILE_FPGA
#endif
#endif
@@ -12,38 +12,38 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifdef SIGMOID_OP

#pragma once

#include <string>
#include "framework/operator.h"
#include "operators/kernel/sigmoid_kernel.h"
#include "operators/op_param.h"

namespace paddle_mobile {
namespace operators {
template <typename DeviceType, typename T>
class SigmoidOp
    : public framework::OperatorWithKernel<
          DeviceType, SigmoidParam, operators::SigmoidKernel<DeviceType, T>> {
 public:
  SigmoidOp(const std::string &type, const VariableNameMap &inputs,
            const VariableNameMap &outputs,
            const framework::AttributeMap &attrs,
            std::shared_ptr<framework::Scope> scope)
      : framework::OperatorWithKernel<DeviceType, SigmoidParam,
                                      operators::SigmoidKernel<DeviceType, T>>(
            type, inputs, outputs, attrs, scope) {}

  using framework::OperatorWithKernel<
      DeviceType, SigmoidParam,
      operators::SigmoidKernel<DeviceType, T>>::OperatorWithKernel;

  void InferShape() const override;
};
} // namespace operators
} // namespace paddle_mobile
#endif
@@ -12,18 +12,30 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifdef SOFTMAX_OP

#include "operators/softmax_op.h"
namespace paddle_mobile {
namespace operators {
template <typename DeviceType, typename T>
void SoftmaxOp<DeviceType, T>::InferShape() const {
  this->param_.Out()->Resize(this->param_.InputX()->dims());
}
template class SoftmaxOp<CPU, float>;
} // namespace operators
} // namespace paddle_mobile

namespace ops = paddle_mobile::operators;
#ifdef PADDLE_MOBILE_CPU
USE_OP_CPU(softmax);
REGISTER_OPERATOR_CPU(softmax, ops::SoftmaxOp);
#endif
#ifdef PADDLE_MOBILE_MALI_GPU
USE_OP_MALI_GPU(softmax);
REGISTER_OPERATOR_MALI_GPU(softmax, ops::SoftmaxOp);
#endif
#ifdef PADDLE_MOBILE_FPGA
#endif
#endif
@@ -12,38 +12,40 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifdef SOFTMAX_OP

#pragma once

#include <string>
#include "framework/operator.h"
#include "operators/kernel/softmax_kernel.h"
#include "operators/op_param.h"

namespace paddle_mobile {
namespace operators {
template <typename DeviceType, typename T>
class SoftmaxOp
    : public framework::OperatorWithKernel<
          DeviceType, SoftmaxParam, operators::SoftmaxKernel<DeviceType, T>> {
 public:
  SoftmaxOp(const std::string &type, const VariableNameMap &inputs,
            const VariableNameMap &outputs,
            const framework::AttributeMap &attrs,
            std::shared_ptr<framework::Scope> scope)
      : framework::OperatorWithKernel<DeviceType, SoftmaxParam,
                                      operators::SoftmaxKernel<DeviceType, T>>(
            type, inputs, outputs, attrs, scope) {}

  using framework::OperatorWithKernel<
      DeviceType, SoftmaxParam,
      operators::SoftmaxKernel<DeviceType, T>>::OperatorWithKernel;

  void InferShape() const override;

 private:
};
} // namespace operators
} // namespace paddle_mobile
#endif
@@ -12,16 +12,19 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifdef TRANSPOSE_OP

#include <vector>
#include "common/enforce.h"
#include "operators/transpose_op.h"

namespace paddle_mobile {
namespace operators {
template <typename Dtype, typename T>
void TransposeOp<Dtype, T>::InferShape() const {
  auto input_x_dims = this->param_.InputX()->dims();
  auto axis = this->param_.Axis();
  size_t x_dims_size = input_x_dims.size();
  size_t axis_size = axis.size();
@@ -42,12 +45,20 @@ void TransposeOp<Dtype, T>::InferShape() const {
  for (size_t i = 0; i < axis_size; i++) {
    out_dims[i] = input_x_dims[axis[i]];
  }
  this->param_.Out()->Resize(out_dims);
}
template class TransposeOp<CPU, float>;
} // namespace operators
} // namespace paddle_mobile

namespace ops = paddle_mobile::operators;
#ifdef PADDLE_MOBILE_CPU
USE_OP_CPU(transpose);
REGISTER_OPERATOR_CPU(transpose, ops::TransposeOp);
#endif
#ifdef PADDLE_MOBILE_MALI_GPU
#endif
#ifdef PADDLE_MOBILE_FPGA
#endif
#endif
@@ -12,6 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifdef TRANSPOSE_OP

#pragma once

#include <string>
@@ -26,27 +28,26 @@ namespace operators {
using paddle_mobile::framework::Tensor;
template <typename DeviceType, typename T>
class TransposeOp : public framework::OperatorWithKernel<
                        DeviceType, TransposeParam,
                        operators::TransposeKernel<DeviceType, T>> {
 public:
  TransposeOp(const std::string &type, const VariableNameMap &inputs,
              const VariableNameMap &outputs,
              const framework::AttributeMap &attrs,
              std::shared_ptr<framework::Scope> scope)
      : framework::OperatorWithKernel<
            DeviceType, TransposeParam,
            operators::TransposeKernel<DeviceType, T>>(type, inputs, outputs,
                                                       attrs, scope) {}

  using framework::OperatorWithKernel<
      DeviceType, TransposeParam,
      operators::TransposeKernel<DeviceType, T>>::OperatorWithKernel;

  void InferShape() const override;
};
} // namespace operators
} // namespace paddle_mobile
#endif
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <string>
#include <typeindex>
#include "framework/program/tensor_desc.h"
namespace paddle_mobile {
namespace framework {
inline VarType_Type ToDataType(std::type_index type) {
  /*if (typeid(platform::float16).hash_code() == type.hash_code()) {
    return proto::VarType::FP16;
  } else */
  if (typeid(const float).hash_code() == type.hash_code()) {
    // CPPLint complains Using C-style cast. Use
    // static_cast<float>() instead
    // One fix to this is to replace float with const float because
    // typeid(T) == typeid(const T)
    // http://en.cppreference.com/w/cpp/language/typeid
    return VARTYPE_TYPE_FP32;
  } else if (typeid(const double).hash_code() == type.hash_code()) {
    return VARTYPE_TYPE_FP64;
  } else if (typeid(const int).hash_code() == type.hash_code()) {
    return VARTYPE_TYPE_INT32;
  } else if (typeid(const int64_t).hash_code() == type.hash_code()) {
    return VARTYPE_TYPE_INT64;
  } else if (typeid(const bool).hash_code() == type.hash_code()) {
    return VARTYPE_TYPE_BOOL;
  } else {
    // PADDLE_THROW("Not supported");
    // std::cout << "Not supported";
  }
}
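// Usage sketch (not part of this header): ToDataType compares hash_code
// values, and typeid(T) == typeid(const T), so a plain float resolves to
// VARTYPE_TYPE_FP32. <typeindex> is already included at the top of the file.
inline bool ToDataTypeDemo() {
  return ToDataType(std::type_index(typeid(float))) == VARTYPE_TYPE_FP32;
}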
inline std::type_index ToTypeIndex(VarType_Type type) {
  switch (type) {
    // case proto::VarType::FP16:
    //   return typeid(platform::float16);
    case VARTYPE_TYPE_FP32:
      return typeid(float);
    case VARTYPE_TYPE_FP64:
      return typeid(double);
    case VARTYPE_TYPE_INT32:
      return typeid(int);
    case VARTYPE_TYPE_INT64:
      return typeid(int64_t);
    case VARTYPE_TYPE_BOOL:
      return typeid(bool);
    default:
      // PADDLE_THROW("Not support type %d", type);
      printf("Not support type %d", type);
  }
}
template <typename Visitor>
inline void VisitDataType(VarType_Type type, Visitor visitor) {
  switch (type) {
    // case proto::VarType::FP16:
    //   visitor.template operator()<platform::float16>();
    //   break;
    case VARTYPE_TYPE_FP32:
      visitor.template operator()<float>();
      break;
    case VARTYPE_TYPE_FP64:
      visitor.template operator()<double>();
      break;
    case VARTYPE_TYPE_INT32:
      visitor.template operator()<int>();
      break;
    case VARTYPE_TYPE_INT64:
      visitor.template operator()<int64_t>();
      break;
    case VARTYPE_TYPE_BOOL:
      visitor.template operator()<bool>();
      break;
    default:
      // PADDLE_THROW("Not supported");
      printf("Not supported");
  }
}
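// Usage sketch (not part of this header): VisitDataType recovers a
// compile-time type from the runtime enum by calling
// visitor.template operator()<T>(). Any functor with a templated call
// operator works; note the visitor is taken by value above, so results
// must escape through a pointer or reference member.
struct SizeOfVisitor {
  int *out;  // write through a pointer: the visitor object itself is copied
  template <typename T>
  void operator()() {
    *out = static_cast<int>(sizeof(T));
  }
};
// int n = 0;
// VisitDataType(VARTYPE_TYPE_FP32, SizeOfVisitor{&n});  // n becomes 4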
inline std::string DataTypeToString(const VarType_Type type) {
  switch (type) {
    case VARTYPE_TYPE_FP16:
      return "float16";
    case VARTYPE_TYPE_FP32:
      return "float32";
    case VARTYPE_TYPE_FP64:
      return "float64";
    case VARTYPE_TYPE_INT16:
      return "int16";
    case VARTYPE_TYPE_INT32:
      return "int32";
    case VARTYPE_TYPE_INT64:
      return "int64";
    case VARTYPE_TYPE_BOOL:
      return "bool";
    default:
      // PADDLE_THROW("Not support type %d", type);
      printf("Not support type %d", type);
  }
}

inline std::ostream &operator<<(std::ostream &out, const VarType_Type &type) {
  out << DataTypeToString(type);
  return out;
}
} // namespace framework
} // namespace paddle_mobile
set(dir ${CMAKE_CURRENT_SOURCE_DIR})
set(CMAKE_RUNTIME_OUTPUT_DIRECTORY "${dir}/build")

if (googlenet)
    # gen test
    ADD_EXECUTABLE(test-googlenet net/test_googlenet.cpp test_helper.h test_include.h executor_for_test.h)
    target_link_libraries(test-googlenet paddle-mobile)
elseif (mobilenet)
    # gen test
    ADD_EXECUTABLE(test-mobilenet net/test_mobilenet.cpp test_helper.h test_include.h executor_for_test.h)
    target_link_libraries(test-mobilenet paddle-mobile)
elseif (yolo)
    # gen test
    ADD_EXECUTABLE(test-yolo net/test_yolo.cpp test_helper.h test_include.h executor_for_test.h)
    target_link_libraries(test-yolo paddle-mobile)
elseif (squeezenet)
    # gen test
    ADD_EXECUTABLE(test-squeezenet net/test_squeezenet.cpp test_helper.h test_include.h executor_for_test.h)
    target_link_libraries(test-squeezenet paddle-mobile)
elseif(resnet)
    # gen test
    ADD_EXECUTABLE(test-resnet net/test_resnet.cpp test_helper.h test_include.h executor_for_test.h)
    target_link_libraries(test-resnet paddle-mobile)
else ()
    # gen test
    ADD_EXECUTABLE(test-resnet net/test_resnet.cpp test_helper.h test_include.h executor_for_test.h)
    target_link_libraries(test-resnet paddle-mobile)
    # gen test
    ADD_EXECUTABLE(test-squeezenet net/test_squeezenet.cpp test_helper.h test_include.h executor_for_test.h)
    target_link_libraries(test-squeezenet paddle-mobile)
    # gen test
    ADD_EXECUTABLE(test-yolo net/test_yolo.cpp test_helper.h test_include.h executor_for_test.h)
    target_link_libraries(test-yolo paddle-mobile)
    # gen test
    ADD_EXECUTABLE(test-googlenet net/test_googlenet.cpp test_helper.h test_include.h executor_for_test.h)
    target_link_libraries(test-googlenet paddle-mobile)
    # gen test
    ADD_EXECUTABLE(test-conv-op operators/test_cov_op.cpp test_helper.h test_include.h executor_for_test.h)
    target_link_libraries(test-conv-op paddle-mobile)
    # gen test
    ADD_EXECUTABLE(test-mul-op operators/test_mul_op.cpp test_helper.h test_include.h)
    target_link_libraries(test-mul-op paddle-mobile)
    # gen test
    ADD_EXECUTABLE(test-elementwiseadd-op operators/test_elementwise_add_op.cpp test_helper.h test_include.h)
    target_link_libraries(test-elementwiseadd-op paddle-mobile)
    # gen test
    ADD_EXECUTABLE(test-concat-op operators/test_concat_op.cpp test_helper.h test_include.h)
    target_link_libraries(test-concat-op paddle-mobile)
    # gen test
    ADD_EXECUTABLE(test-lrn-op operators/test_lrn_op.cpp test_helper.h test_include.h)
    target_link_libraries(test-lrn-op paddle-mobile)
    # gen test
    ADD_EXECUTABLE(test-batchnorm-op operators/test_batchnorm_op.cpp test_helper.h test_include.h)
    target_link_libraries(test-batchnorm-op paddle-mobile)
    # gen test
    ADD_EXECUTABLE(test-priorbox-op operators/test_prior_box_op.cpp test_helper.h test_include.h)
    target_link_libraries(test-priorbox-op paddle-mobile)
    # gen test
    ADD_EXECUTABLE(test-boxcoder-op operators/test_box_coder_op.cpp test_helper.h test_include.h)
    target_link_libraries(test-boxcoder-op paddle-mobile)
    # gen test
    ADD_EXECUTABLE(test-transpose-op operators/test_transpose_op.cpp test_helper.h test_include.h)
    target_link_libraries(test-transpose-op paddle-mobile)
    # gen test
    ADD_EXECUTABLE(test-multiclassnms-op operators/test_multiclass_nms_op.cpp test_helper.h test_include.h)
    target_link_libraries(test-multiclassnms-op paddle-mobile)
    # gen test
    ADD_EXECUTABLE(test-reshape-op operators/test_reshape_op.cpp test_helper.h test_include.h)
    target_link_libraries(test-reshape-op paddle-mobile)
    # gen test
    ADD_EXECUTABLE(test-relu-op operators/test_relu_op.cpp test_helper.h test_include.h)
    target_link_libraries(test-relu-op paddle-mobile)
    # gen test
    ADD_EXECUTABLE(test-fc-op operators/test_fusion_fc_op.cpp test_helper.h test_include.h)
    target_link_libraries(test-fc-op paddle-mobile)
    # gen test log
    ADD_EXECUTABLE(test-log common/test_log.cpp)
    target_link_libraries(test-log paddle-mobile)
    # gen test log
    ADD_EXECUTABLE(test-load framework/test_load.cpp)
    target_link_libraries(test-load paddle-mobile)
    # gen test
    ADD_EXECUTABLE(test-optimize framework/test_optimize.cpp)
    target_link_libraries(test-optimize paddle-mobile)
    #gen test
    ADD_EXECUTABLE(test-pool operators/test_pool_op.cpp test_helper.h test_include.h executor_for_test.h)
    target_link_libraries(test-pool paddle-mobile)
    #gen test
    ADD_EXECUTABLE(test-softmax operators/test_softmax_op.cpp test_helper.h test_include.h executor_for_test.h)
    target_link_libraries(test-softmax paddle-mobile)
    # gen test
    ADD_EXECUTABLE(test-gemm common/test_gemm.cpp)
    target_link_libraries(test-gemm paddle-mobile)
    # gen test
    ADD_EXECUTABLE(test-enforce common/test_enforce.cpp)
    target_link_libraries(test-enforce paddle-mobile)
    # gen test - test if openmp works
    ADD_EXECUTABLE(test-openmp common/test_openmp.cpp test_helper.h test_include.h executor_for_test.h)
    target_link_libraries(test-openmp paddle-mobile)
    # gen test
    ADD_EXECUTABLE(test-mobilenetssd net/test_mobilenet+ssd.cpp test_helper.h test_include.h executor_for_test.h)
    target_link_libraries(test-mobilenetssd paddle-mobile)
    # gen test
    ADD_EXECUTABLE(test-sigmoid operators/test_sigmoid_op.cpp test_include.h)
    target_link_libraries(test-sigmoid paddle-mobile)
    # gen test
    ADD_EXECUTABLE(test-depthwise-conv-op operators/test_depthwise_conv_op.cpp test_helper.h test_include.h executor_for_test.h)
    target_link_libraries(test-depthwise-conv-op paddle-mobile)
    # gen test
    ADD_EXECUTABLE(test-mobilenet net/test_mobilenet.cpp test_helper.h test_include.h executor_for_test.h)
    target_link_libraries(test-mobilenet paddle-mobile)
    # gen test
    ADD_EXECUTABLE(test-conv-add-relu-op operators/test_conv_add_relu_op.cpp test_helper.h test_include.h executor_for_test.h)
    target_link_libraries(test-conv-add-relu-op paddle-mobile)
    #add_library(test-lib-size SHARED common/test_lib_size.h common/test_lib_size.cpp)
endif()
@@ -13,7 +13,9 @@ See the License for the specific language governing permissions and
limitations under the License. */
#include <iostream>
#include "../test_helper.h"
#include "common/log.h"
#include "memory/t_malloc.h"
#include "operators/math/gemm.h"

#define a(i, j) a[(i)*lda + (j)]
@@ -29,10 +31,15 @@ int main() {
  int ldb = n;
  int ldc = n;

  float *a =
      static_cast<float *>(paddle_mobile::memory::Alloc(sizeof(float) * m * k));
  float *b =
      static_cast<float *>(paddle_mobile::memory::Alloc(sizeof(float) * k * n));
  float *c =
      static_cast<float *>(paddle_mobile::memory::Alloc(sizeof(float) * m * n));
  float *c1 =
      static_cast<float *>(paddle_mobile::memory::Alloc(sizeof(float) * m * n));
  for (int i = 0; i < m * k; ++i) {
    a[i] = 2;
  }
@@ -44,8 +51,11 @@ int main() {
    c1[i] = 2;
  }

  auto time1 = time();
  paddle_mobile::operators::math::sgemm(m, n, k, 0.9, a, lda, b, ldb, 0.3, c,
                                        ldc);
  auto time2 = time();
  DLOG << "gemm cost :" << time_diff(time1, time2) << "ms\n";
  for (int i = 0; i < m * n; ++i) {
    std::cout << c[i] << " | ";
    if (i % n == (n - 1)) {
......
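A natural companion to the timing added above is throughput: an sgemm of sizes
(m, n, k) performs about 2*m*n*k floating-point operations (one multiply and
one add per accumulated term). A small hypothetical helper, not part of the
test:

#include <cstdint>

// GFLOP/s from problem size and elapsed milliseconds.
inline double SgemmGflops(int64_t m, int64_t n, int64_t k, double elapsed_ms) {
  const double flops = 2.0 * static_cast<double>(m) * n * k;
  return flops / (elapsed_ms * 1e6);  // ms -> s, then scale down to 1e9 ops
}

// e.g. for this test's 62 x 63 x 74 problem:
//   double gflops = SgemmGflops(62, 63, 74, time_diff(time1, time2));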
@@ -13,7 +13,9 @@ See the License for the specific language governing permissions and
limitations under the License. */
//
// Created by liuRuiLong on 2018/6/6.
//
#include "test_lib_size.h"
static test_lib_size t;
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
//
// Created by liuRuiLong on 2018/6/6.
//
#ifndef PADDLE_MOBILE_TEST_LIB_SIZE_H
#define PADDLE_MOBILE_TEST_LIB_SIZE_H
#include <pthread.h>
#include <thread>
#include <vector>
//#include <list>
//#include <tuple>
//#include <typeinfo>
//#include <mutex>
//#include <initializer_list>
//#include <map>
//#include <string>
//#include <unordered_map>
//#include <unordered_set>
//#include <algorithm>
//#include <iostream>
//#include <sstream>
//#include <memory>
//#include <stdio.h>
//#include <cstring>
void foo() {
  // char *str = "1234";
  // char dst[10];
  // strcpy(dst, str);
  // std::cout << "12345" << std::endl;

  std::vector<int> vec = {1, 2, 3, 4, 5};
  vec.push_back(2);
  pthread_mutex_init(NULL, NULL);
  pthread_attr_destroy(NULL);
  // std::find(vec.begin(), vec.end(), 1);
  // std::list<int> l;
  // std::mutex mutex_;
  // std::map<int, float> m;
  // std::unordered_map<int, float> u_m;
  // std::unordered_set<int> u_s;
  // std::string ss = "12345";
  // printf("%f", ss.c_str());
  // std::initializer_list<int> init_list = {1, 2};
  // std::tuple<int, int> t = {1, 2};
  // std::tuple_element<I, std::tuple<ARGS...>>::type
  // std::tuple<>
  // int i;
  // int j;
  // if (typeid(i) == typeid(j)){
  //   int z = 10;
  // }
  // std::shared_ptr<int> s1 = std::make_shared<int>();
  // std::stringstream ss;
  // ss << "12345";
}

class test_lib_size {
 public:
  test_lib_size() {}
  // std::shared_ptr<int> Test(){
  //   std::vector<int> vec = {1, 2, 3};
  //   std::shared_ptr<int> si = std::make_shared<int>();
  //   return si;
  // }
  // void test(){
  //   int i = 9;
  // }
};
#endif // PADDLE_MOBILE_TEST_LIB_SIZE_H
@@ -12,14 +12,18 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
//#include <omp.h>
#include <iostream>

int main(void) {
#ifdef PADDLE_MOBILE_USE_OPENMP
#pragma omp parallel num_threads(2)
  {
    // int thread_id = omp_get_thread_num();
    // int nthreads = omp_get_num_threads();
    // std::cout << "Hello, OMP " << thread_id << "/" << nthreads << "\n";
  }
#endif
  return 0;
}
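For reference, the commented-out body corresponds to the classic OpenMP
hello-world below, a sketch that assumes the build defines
PADDLE_MOBILE_USE_OPENMP and compiles with -fopenmp so that <omp.h> is
available:

#include <omp.h>
#include <iostream>

int main() {
#pragma omp parallel num_threads(2)
  {
    int thread_id = omp_get_thread_num();  // 0 or 1 in this region
    int nthreads = omp_get_num_threads();  // 2 inside the region
#pragma omp critical
    std::cout << "Hello, OMP " << thread_id << "/" << nthreads << "\n";
  }
  return 0;
}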
@@ -19,7 +19,7 @@ limitations under the License. */
#include "common/log.h"
#include "framework/op_registry.h"
#include "io/io.h"
#include "operators/conv_op.h"
#include "operators/elementwise_add_op.h"
#include "operators/pool_op.h"
@@ -42,8 +42,10 @@ using std::vector;
template <typename DeviceType, typename OpType>
class Executor4Test : public Executor<DeviceType> {
 public:
  Executor4Test(Program<DeviceType> p, string op_type,
                bool use_optimize = false)
      : Executor<DeviceType>() {
    this->use_optimize_ = use_optimize;
    this->program_ = p;
    if (this->use_optimize_) {
      this->to_predict_program_ = this->program_.optimizeProgram;
@@ -61,13 +63,14 @@ class Executor4Test : public Executor<DeviceType> {
    std::vector<std::shared_ptr<OpDesc>> ops = block_desc->Ops();
    for (std::shared_ptr<OpDesc> op : ops) {
      if (op->Type() == op_type) {
        /// test first meeting op in program
        std::shared_ptr<paddle_mobile::framework::OperatorBase<DeviceType>>
            op_ptr =
                paddle_mobile::framework::OpRegistry<DeviceType>::CreateOp(
                    op->Type(), op->GetInputs(), op->GetOutputs(),
                    op->GetAttrMap(), this->program_.scope);
        this->ops_of_block_[*block_desc.get()].push_back(op_ptr);
        break;
      }
......
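A usage sketch of the widened constructor (hedged: the exact Predict plumbing
lives in the Executor base class used elsewhere in these tests):

// Run only the first "pool2d" op of a loaded program, without the
// fusion-optimized program (use_optimize defaults to false):
//
//   paddle_mobile::Loader<paddle_mobile::CPU> loader;
//   auto program = loader.Load(g_googlenet);
//   Executor4Test<paddle_mobile::CPU,
//                 paddle_mobile::operators::PoolOp<paddle_mobile::CPU, float>>
//       executor(program, "pool2d");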
@@ -13,13 +13,17 @@ See the License for the specific language governing permissions and
limitations under the License. */
#include "../test_helper.h"
#include "io/io.h"

int main() {
  paddle_mobile::Loader<paddle_mobile::CPU> loader;
  // ../../../test/models/googlenet
  // ../../../test/models/mobilenet
  auto program = loader.Load(g_mobilenet_ssd, false, false);
  // auto program = loader.Load(g_googlenet_combine + "/model",
  //                            g_googlenet_combine + "/params", true);
  // program.originProgram->Description("program desc: ");
  return 0;
}
...@@ -15,17 +15,17 @@ limitations under the License. */ ...@@ -15,17 +15,17 @@ limitations under the License. */
#include "../test_helper.h" #include "../test_helper.h"
#include "framework/program/program-optimize/node.h" #include "framework/program/program-optimize/node.h"
#include "framework/program/program-optimize/program_optimize.h" #include "framework/program/program-optimize/program_optimize.h"
#include "io.h" #include "io/io.h"
int main() { int main() {
paddle_mobile::Loader<paddle_mobile::CPU> loader; paddle_mobile::Loader<paddle_mobile::CPU> loader;
// "../../../test/models/googlenet" // "../../../test/models/googlenet"
auto program = loader.Load(g_googlenet); auto program = loader.Load(g_mobilenet_ssd, true);
paddle_mobile::framework::ProgramOptimize optimize; paddle_mobile::framework::ProgramOptimize optimize;
// program.originProgram->Description("origin"); // program.originProgram->Description("origin");
auto optimize_program = optimize.FushionOptimize(program.originProgram); auto optimize_program = optimize.FusionOptimize(program.originProgram);
if (optimize_program != nullptr) { if (optimize_program != nullptr) {
optimize_program->Description("optimize"); // optimize_program->Description("optimize");
} else { } else {
LOG(paddle_mobile::kLOG_ERROR) << "optimize_program is null"; LOG(paddle_mobile::kLOG_ERROR) << "optimize_program is null";
} }
......
...@@ -20,7 +20,9 @@ int main() { ...@@ -20,7 +20,9 @@ int main() {
paddle_mobile::Loader<paddle_mobile::CPU> loader; paddle_mobile::Loader<paddle_mobile::CPU> loader;
bool optimize = true; bool optimize = true;
auto time1 = time(); auto time1 = time();
auto program = loader.Load(g_googlenet, optimize); // auto program = loader.Load(g_googlenet, optimize);
auto program = loader.Load(g_googlenet_combine + "/model",
g_googlenet_combine + "/params", optimize);
auto time2 = time(); auto time2 = time();
DLOG << "load cost :" << time_diff(time1, time2) << "ms\n"; DLOG << "load cost :" << time_diff(time1, time2) << "ms\n";
paddle_mobile::Executor<paddle_mobile::CPU> executor(program, 1, optimize); paddle_mobile::Executor<paddle_mobile::CPU> executor(program, 1, optimize);
...@@ -28,7 +30,11 @@ int main() { ...@@ -28,7 +30,11 @@ int main() {
std::vector<int64_t> dims{1, 3, 224, 224}; std::vector<int64_t> dims{1, 3, 224, 224};
GetInput<float>(g_test_image_1x3x224x224, &input, dims); GetInput<float>(g_test_image_1x3x224x224, &input, dims);
auto time3 = time(); auto time3 = time();
executor.Predict(input, dims);
for (int i = 0; i < 10; ++i) {
executor.Predict(input, dims);
}
auto time4 = time(); auto time4 = time();
DLOG << "predict cost :" << time_diff(time3, time4) << "ms\n"; DLOG << "predict cost :" << time_diff(time3, time4) << "ms\n";
return 0; return 0;
......
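Because the hunk above now times ten back-to-back Predict calls, dividing the elapsed time by the iteration count gives a steadier per-run figure; a one-line sketch, assuming time_diff returns milliseconds as a numeric value (as its other call sites suggest):
DLOG << "predict cost per run :" << time_diff(time3, time4) / 10 << "ms\n";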
...@@ -19,10 +19,10 @@ limitations under the License. */ ...@@ -19,10 +19,10 @@ limitations under the License. */
int main() { int main() {
paddle_mobile::Loader<paddle_mobile::CPU> loader; paddle_mobile::Loader<paddle_mobile::CPU> loader;
auto time1 = time(); auto time1 = time();
auto program = loader.Load(g_mobilenet_ssd, false); auto program = loader.Load(g_mobilenet_ssd, true);
auto time2 = time(); auto time2 = time();
DLOG << "load cost :" << time_diff(time1, time1) << "ms"; DLOG << "load cost :" << time_diff(time1, time1) << "ms";
paddle_mobile::Executor<paddle_mobile::CPU> executor(program, 1, false); paddle_mobile::Executor<paddle_mobile::CPU> executor(program, 1, true);
std::vector<int64_t> dims{1, 3, 300, 300}; std::vector<int64_t> dims{1, 3, 300, 300};
Tensor input_tensor; Tensor input_tensor;
......
...@@ -19,14 +19,14 @@ limitations under the License. */ ...@@ -19,14 +19,14 @@ limitations under the License. */
int main() { int main() {
paddle_mobile::Loader<paddle_mobile::CPU> loader; paddle_mobile::Loader<paddle_mobile::CPU> loader;
auto time1 = time(); auto time1 = time();
auto program = loader.Load(g_mobilenet, false); auto program = loader.Load(g_mobilenet, true);
auto time2 = time(); auto time2 = time();
DLOG << "load cost :" << time_diff(time1, time1) << "ms"; DLOG << "load cost :" << time_diff(time1, time1) << "ms";
paddle_mobile::Executor<paddle_mobile::CPU> executor(program, 2, false); paddle_mobile::Executor<paddle_mobile::CPU> executor(program, 1, true);
std::vector<int64_t> dims{2, 3, 224, 224}; std::vector<int64_t> dims{1, 3, 224, 224};
Tensor input_tensor; Tensor input_tensor;
SetupTensor<float>(&input_tensor, {2, 3, 224, 224}, static_cast<float>(0), SetupTensor<float>(&input_tensor, {1, 3, 224, 224}, static_cast<float>(0),
static_cast<float>(1)); static_cast<float>(1));
std::vector<float> input(input_tensor.data<float>(), std::vector<float> input(input_tensor.data<float>(),
......
...@@ -41,7 +41,7 @@ class TestBatchNormOp { ...@@ -41,7 +41,7 @@ class TestBatchNormOp {
for (int j = 0; j < ops.size(); ++j) { for (int j = 0; j < ops.size(); ++j) {
std::shared_ptr<OpDesc> op = ops[j]; std::shared_ptr<OpDesc> op = ops[j];
if (op->Type() == "batch_norm" && if (op->Type() == "batch_norm" &&
op->Input("X")[0] == "conv2d_0.tmp_0") { op->Input("X")[0] == "conv2d_5.tmp_0") {
DLOG << " mul attr size: " << op->GetAttrMap().size(); DLOG << " mul attr size: " << op->GetAttrMap().size();
DLOG << " inputs size: " << op->GetInputs().size(); DLOG << " inputs size: " << op->GetInputs().size();
DLOG << " outputs size: " << op->GetOutputs().size(); DLOG << " outputs size: " << op->GetOutputs().size();
...@@ -67,29 +67,29 @@ class TestBatchNormOp { ...@@ -67,29 +67,29 @@ class TestBatchNormOp {
const Tensor &t5) { const Tensor &t5) {
// feed // feed
auto scope = program_.scope; auto scope = program_.scope;
Variable *x1_feed_value = scope->Var("conv2d_0.tmp_0"); Variable *x1_feed_value = scope->Var("conv2d_5.tmp_0");
auto tensor_x1 = x1_feed_value->GetMutable<LoDTensor>(); auto tensor_x1 = x1_feed_value->GetMutable<LoDTensor>();
tensor_x1->ShareDataWith(t1); tensor_x1->ShareDataWith(t1);
Variable *mean_feed_value = scope->Var("batch_norm_0.w_1"); Variable *mean_feed_value = scope->Var("batch_norm_10.w_1");
auto tensor_mean = mean_feed_value->GetMutable<LoDTensor>(); auto tensor_mean = mean_feed_value->GetMutable<LoDTensor>();
tensor_mean->ShareDataWith(t2); tensor_mean->ShareDataWith(t2);
Variable *scale_feed_value = scope->Var("batch_norm_0.w_0"); Variable *scale_feed_value = scope->Var("batch_norm_10.w_0");
auto tensor_scale = scale_feed_value->GetMutable<LoDTensor>(); auto tensor_scale = scale_feed_value->GetMutable<LoDTensor>();
tensor_scale->ShareDataWith(t3); tensor_scale->ShareDataWith(t3);
Variable *variance_feed_value = scope->Var("batch_norm_0.w_2"); Variable *variance_feed_value = scope->Var("batch_norm_10.w_2");
auto tensor_variance = variance_feed_value->GetMutable<LoDTensor>(); auto tensor_variance = variance_feed_value->GetMutable<LoDTensor>();
tensor_variance->ShareDataWith(t4); tensor_variance->ShareDataWith(t4);
Variable *bias_feed_value = scope->Var("batch_norm_0.b_0"); Variable *bias_feed_value = scope->Var("batch_norm_10.b_0");
auto tensor_bias = bias_feed_value->GetMutable<LoDTensor>(); auto tensor_bias = bias_feed_value->GetMutable<LoDTensor>();
tensor_bias->ShareDataWith(t5); tensor_bias->ShareDataWith(t5);
Variable *output = scope->Var("batch_norm_0.tmp_2"); Variable *output = scope->Var("batch_norm_10.tmp_2");
auto *output_tensor = output->GetMutable<LoDTensor>(); auto *output_tensor = output->GetMutable<LoDTensor>();
output_tensor->mutable_data<float>({4, 10, 2, 2}); output_tensor->mutable_data<float>({1, 256, 38, 38});
// DLOG << typeid(output_tensor).name(); // DLOG << typeid(output_tensor).name();
// DLOG << "output_tensor dims: " << output_tensor->dims(); // DLOG << "output_tensor dims: " << output_tensor->dims();
...@@ -128,30 +128,32 @@ int main() { ...@@ -128,30 +128,32 @@ int main() {
DLOG << "----------**********----------"; DLOG << "----------**********----------";
DLOG << "begin to run BatchNormOp Test"; DLOG << "begin to run BatchNormOp Test";
paddle_mobile::Loader<paddle_mobile::CPU> loader; paddle_mobile::Loader<paddle_mobile::CPU> loader;
auto program = loader.Load(std::string(g_resnet)); auto program = loader.Load(std::string(g_mobilenet_ssd));
/// input x (4,10,2,2) /// input x (4,10,2,2)
paddle_mobile::framework::Tensor inputx1; paddle_mobile::framework::Tensor inputx1;
SetupTensor<float>(&inputx1, {4, 10, 2, 2}, static_cast<float>(0), SetupTensor<float>(&inputx1, {1, 256, 38, 38}, static_cast<float>(0),
static_cast<float>(1)); static_cast<float>(1));
auto *inputx1_ptr = inputx1.data<float>(); auto *inputx1_ptr = inputx1.data<float>();
paddle_mobile::framework::Tensor mean; paddle_mobile::framework::Tensor mean;
SetupTensor<float>(&mean, {10}, static_cast<float>(0), static_cast<float>(1)); SetupTensor<float>(&mean, {256}, static_cast<float>(0),
static_cast<float>(1));
auto *mean_ptr = mean.data<float>(); auto *mean_ptr = mean.data<float>();
paddle_mobile::framework::Tensor scale; paddle_mobile::framework::Tensor scale;
SetupTensor<float>(&scale, {10}, static_cast<float>(0), SetupTensor<float>(&scale, {256}, static_cast<float>(0),
static_cast<float>(1)); static_cast<float>(1));
auto *scale_ptr = scale.data<float>(); auto *scale_ptr = scale.data<float>();
paddle_mobile::framework::Tensor variance; paddle_mobile::framework::Tensor variance;
SetupTensor<float>(&variance, {10}, static_cast<float>(0), SetupTensor<float>(&variance, {256}, static_cast<float>(0),
static_cast<float>(1)); static_cast<float>(1));
auto *variance_ptr = variance.data<float>(); auto *variance_ptr = variance.data<float>();
paddle_mobile::framework::Tensor bias; paddle_mobile::framework::Tensor bias;
SetupTensor<float>(&bias, {10}, static_cast<float>(0), static_cast<float>(1)); SetupTensor<float>(&bias, {256}, static_cast<float>(0),
static_cast<float>(1));
auto *bias_ptr = bias.data<float>(); auto *bias_ptr = bias.data<float>();
paddle_mobile::framework::TestBatchNormOp<paddle_mobile::CPU> testBatchNormOp( paddle_mobile::framework::TestBatchNormOp<paddle_mobile::CPU> testBatchNormOp(
...@@ -161,11 +163,13 @@ int main() { ...@@ -161,11 +163,13 @@ int main() {
testBatchNormOp.predict_bn(inputx1, mean, scale, variance, bias); testBatchNormOp.predict_bn(inputx1, mean, scale, variance, bias);
auto *output_bn_ptr = output_bn->data<float>(); auto *output_bn_ptr = output_bn->data<float>();
/// [2, 5, 1, 0] DLOG << " (" << inputx1_ptr[0] << " - " << mean_ptr[0] << ")/(("
DLOG << " (" << inputx1_ptr[102] << " - " << mean_ptr[5] << ")/((" << variance_ptr[0] << " + 0.00001"
<< variance_ptr[5] << " + 0.00001" << ")^0.5)* " << scale_ptr[0] << " + " << bias_ptr[0] << " = ";
<< ")^0.5)* " << scale_ptr[5] << " + " << bias_ptr[5] << " = "; DLOG << output_bn_ptr[0];
DLOG << output_bn_ptr[102];
DLOG << "input_ptr 0 : " << inputx1_ptr[0];
DLOG << "output_ptr 0 : " << output_bn_ptr[0];
return 0; return 0;
} }
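The DLOG lines above spell out the batch-norm identity y = (x - mean) / sqrt(variance + eps) * scale + bias. A hedged sketch of computing the expected value directly, for a cross-check against output_bn_ptr[0] (assumes <cmath> is available; eps matches the 0.00001 literal in the log line):
// Reference value for element 0, from the same formula the DLOG prints.
float eps = 0.00001f;
float expected = (inputx1_ptr[0] - mean_ptr[0]) /
                 std::sqrt(variance_ptr[0] + eps) * scale_ptr[0] +
                 bias_ptr[0];
DLOG << "expected output_bn_ptr[0] : " << expected;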
...@@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ...@@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#include "../executor_for_test.h"
#include "../test_include.h" #include "../test_include.h"
#include "operators/concat_op.h" #include "operators/concat_op.h"
......
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "../test_include.h"
#include "operators/fusion_conv_add_relu_op.h"
int main() {
paddle_mobile::Loader<paddle_mobile::CPU> loader;
// ../models/image_classification_resnet.inference.model
auto program = loader.Load(g_googlenet, true);
PADDLE_MOBILE_ENFORCE(program.originProgram != nullptr,
"program file read fail");
Executor4Test<
paddle_mobile::CPU,
paddle_mobile::operators::FusionConvAddReluOp<paddle_mobile::CPU, float>>
executor(program, "fusion_conv_add_relu", true);
paddle_mobile::framework::Tensor input;
GetInput<float>(g_test_image_1x3x224x224, &input, {1, 3, 224, 224});
// // use SetupTensor if there is no local input image.
// SetupTensor<float>(&input, {1, 3, 224, 224}, static_cast<float>(0),
// static_cast<float>(1));
auto out_ddim = paddle_mobile::framework::make_ddim({1, 64, 112, 112});
auto output = executor.Predict(input, "data", "conv2d_0.tmp_2", out_ddim);
auto output_ptr = output->data<float>();
for (int j = 0; j < 25; ++j) {
DLOG << " value of output: " << output_ptr[j];
}
return 0;
}
...@@ -12,19 +12,19 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ...@@ -12,19 +12,19 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#include "../executor_for_test.h"
#include "../test_include.h" #include "../test_include.h"
#include "operators/conv_op.h"
int main() { int main() {
paddle_mobile::Loader<paddle_mobile::CPU> loader; paddle_mobile::Loader<paddle_mobile::GPU_MALI> loader;
// ../models/image_classification_resnet.inference.model // ../models/image_classification_resnet.inference.model
auto program = loader.Load(g_googlenet); auto program = loader.Load(g_googlenet);
PADDLE_MOBILE_ENFORCE(program.originProgram != nullptr, PADDLE_MOBILE_ENFORCE(program.originProgram != nullptr,
"program file read fail"); "program file read fail");
Executor4Test<paddle_mobile::CPU, Executor4Test<paddle_mobile::GPU_MALI, paddle_mobile::operators::ConvOp<
paddle_mobile::operators::ConvOp<paddle_mobile::CPU, float>> paddle_mobile::GPU_MALI, float>>
executor(program, "conv2d"); executor(program, "conv2d");
paddle_mobile::framework::Tensor input; paddle_mobile::framework::Tensor input;
...@@ -37,7 +37,7 @@ int main() { ...@@ -37,7 +37,7 @@ int main() {
auto output = executor.Predict(input, "data", "conv2d_0.tmp_0", out_ddim); auto output = executor.Predict(input, "data", "conv2d_0.tmp_0", out_ddim);
auto output_ptr = output->data<float>(); auto output_ptr = output->data<float>();
for (int j = 0; j < output->numel(); ++j) { for (int j = 0; j < 20; ++j) {
DLOG << " value of output: " << output_ptr[j]; DLOG << " value of output: " << output_ptr[j];
} }
return 0; return 0;
......
...@@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ...@@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#include "../executor_for_test.h"
#include "../test_include.h" #include "../test_include.h"
#include "operators/depthwise_conv_op.h" #include "operators/depthwise_conv_op.h"
......
...@@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ...@@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#include "../executor_for_test.h"
#include "../test_include.h" #include "../test_include.h"
int main() { int main() {
......
...@@ -49,8 +49,8 @@ class TestFcOp { ...@@ -49,8 +49,8 @@ class TestFcOp {
DLOG << " Input Y is : " << op->Input("Y")[0]; DLOG << " Input Y is : " << op->Input("Y")[0];
DLOG << " Input Y is : " << op->Input("Z")[0]; DLOG << " Input Y is : " << op->Input("Z")[0];
DLOG << " Output Out is : " << op->Output("Out")[0]; DLOG << " Output Out is : " << op->Output("Out")[0];
std::shared_ptr<operators::FushionFcOp<Dtype, float>> testOp = std::shared_ptr<operators::FusionFcOp<Dtype, float>> testOp =
std::make_shared<operators::FushionFcOp<Dtype, float>>( std::make_shared<operators::FusionFcOp<Dtype, float>>(
op->Type(), op->GetInputs(), op->GetOutputs(), op->Type(), op->GetInputs(), op->GetOutputs(),
op->GetAttrMap(), program_.scope); op->GetAttrMap(), program_.scope);
ops_of_block_[*block_desc.get()].push_back(testOp); ops_of_block_[*block_desc.get()].push_back(testOp);
...@@ -119,7 +119,7 @@ int main() { ...@@ -119,7 +119,7 @@ int main() {
auto program = loader.Load(g_googlenet); auto program = loader.Load(g_googlenet);
paddle_mobile::framework::ProgramOptimize optimize; paddle_mobile::framework::ProgramOptimize optimize;
// program.originProgram->Description("origin"); // program.originProgram->Description("origin");
auto optimize_program = optimize.FushionOptimize(program.originProgram); auto optimize_program = optimize.FusionOptimize(program.originProgram);
program.optimizeProgram = optimize_program; program.optimizeProgram = optimize_program;
......
...@@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ...@@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#include "../executor_for_test.h"
#include "../test_include.h" #include "../test_include.h"
#include "operators/lrn_op.h" #include "operators/lrn_op.h"
......
...@@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ...@@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#include "../executor_for_test.h"
#include "../test_include.h" #include "../test_include.h"
#include "operators/mul_op.h" #include "operators/mul_op.h"
......
...@@ -12,9 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ...@@ -12,9 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#include "../executor_for_test.h" #include "../test_include.h"
#include "../test_helper.h" #include "operators/pool_op.h"
#include "io.h"
int main() { int main() {
paddle_mobile::Loader<paddle_mobile::CPU> loader; paddle_mobile::Loader<paddle_mobile::CPU> loader;
......
...@@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ...@@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#include "../executor_for_test.h"
#include "../test_include.h" #include "../test_include.h"
#include "operators/relu_op.h" #include "operators/relu_op.h"
......
...@@ -12,9 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ...@@ -12,9 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#include "../executor_for_test.h" #include "../test_include.h"
#include "../test_helper.h" #include "operators/reshape_op.h"
#include "io.h"
int main() { int main() {
paddle_mobile::Loader<paddle_mobile::CPU> loader; paddle_mobile::Loader<paddle_mobile::CPU> loader;
......
...@@ -14,7 +14,7 @@ limitations under the License. */ ...@@ -14,7 +14,7 @@ limitations under the License. */
#include "../../src/operators/kernel/sigmoid_kernel.h" #include "../../src/operators/kernel/sigmoid_kernel.h"
#include "../test_helper.h" #include "../test_helper.h"
#include "io.h" #include "io/io.h"
int main() { int main() {
paddle_mobile::framework::Tensor input; paddle_mobile::framework::Tensor input;
......
...@@ -12,9 +12,9 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ...@@ -12,9 +12,9 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#include "../executor_for_test.h" #include "../test_include.h"
#include "../test_helper.h"
#include "io.h" #include "operators/softmax_op.h"
int main() { int main() {
paddle_mobile::Loader<paddle_mobile::CPU> loader; paddle_mobile::Loader<paddle_mobile::CPU> loader;
......
...@@ -12,10 +12,9 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ...@@ -12,10 +12,9 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#include "../executor_for_test.h"
#include "../test_helper.h" #include "../test_helper.h"
#include "io.h" #include "../test_include.h"
#include "operators/transpose_op.h"
int main() { int main() {
paddle_mobile::Loader<paddle_mobile::CPU> loader; paddle_mobile::Loader<paddle_mobile::CPU> loader;
auto program = loader.Load(std::string(g_mobilenet_ssd)); auto program = loader.Load(std::string(g_mobilenet_ssd));
......
...@@ -22,12 +22,13 @@ limitations under the License. */ ...@@ -22,12 +22,13 @@ limitations under the License. */
#include "framework/ddim.h" #include "framework/ddim.h"
#include "framework/tensor.h" #include "framework/tensor.h"
static const std::string g_googlenet = "../models/googlenet";
static const std::string g_mobilenet = "../models/mobilenet";
static const std::string g_mobilenet_ssd = "../models/mobilenet+ssd"; static const std::string g_mobilenet_ssd = "../models/mobilenet+ssd";
static const std::string g_squeezenet = "../models/squeezenet"; static const std::string g_squeezenet = "../models/squeezenet";
static const std::string g_resnet = static const std::string g_googlenet = "../models/googlenet";
"../models/image_classification_resnet.inference.model"; static const std::string g_mobilenet = "../models/mobilenet";
static const std::string g_resnet_50 = "../models/resnet_50";
static const std::string g_resnet = "../models/resnet";
static const std::string g_googlenet_combine = "../models/googlenet_combine";
static const std::string g_yolo = "../models/yolo"; static const std::string g_yolo = "../models/yolo";
static const std::string g_test_image_1x3x224x224 = static const std::string g_test_image_1x3x224x224 =
"../images/test_image_1x3x224x224_float"; "../images/test_image_1x3x224x224_float";
......
...@@ -21,6 +21,7 @@ limitations under the License. */ ...@@ -21,6 +21,7 @@ limitations under the License. */
#include "./test_helper.h" #include "./test_helper.h"
#include "common/enforce.h" #include "common/enforce.h"
#include "common/log.h" #include "common/log.h"
#include "executor_for_test.h"
#include "framework/lod_tensor.h" #include "framework/lod_tensor.h"
#include "framework/operator.h" #include "framework/operator.h"
#include "framework/program/block_desc.h" #include "framework/program/block_desc.h"
...@@ -29,4 +30,4 @@ limitations under the License. */ ...@@ -29,4 +30,4 @@ limitations under the License. */
#include "framework/scope.h" #include "framework/scope.h"
#include "framework/tensor.h" #include "framework/tensor.h"
#include "framework/variable.h" #include "framework/variable.h"
#include "io.h" #include "io/io.h"
#!/usr/bin/env sh
push_fn () {
MODELS_PATH="../../test/models/*"
MODELS_SRC="../../test/models"
IMAGE_PATH="../../test/images/*"
EXE_FILE="../../test/build/*"
EXE_DIR="data/local/tmp/bin"
adb shell mkdir ${EXE_DIR}
MODELS_DIR="data/local/tmp/models"
adb shell mkdir ${MODELS_DIR}
for file in `ls ${MODELS_SRC}`
do
adb shell mkdir ${MODELS_DIR}"/"${file}
done
if [[ -d "../../src/operators/kernel/mali/ACL_Android/build" ]]; then
ACL_BUILD_PATH="../../src/operators/kernel/mali/ACL_Android/build/*"
adb push ${ACL_BUILD_PATH} ${EXE_DIR}
fi
IMAGES_DIR="data/local/tmp/images"
adb shell mkdir ${IMAGES_DIR}
LIB_PATH="../../build/release/arm-v7a/build/*"
adb push ${EXE_FILE} ${EXE_DIR}
adb push ${LIB_PATH} ${EXE_DIR}
if [[ $1 != "npm" ]]; then
adb push ${IMAGE_PATH} ${IMAGES_DIR}
adb push ${MODELS_PATH} ${MODELS_DIR}
fi
}
if [[ $1 == "npm" ]]; then
push_fn $1
else
push_fn
fi
#!/usr/bin/env sh
push_fn () {
MODELS_PATH="../../test/models/*"
MODELS_SRC="../../test/models"
IMAGE_PATH="../../test/images/*"
EXE_FILE="../../test/build/*"
EXE_DIR="data/local/tmp/bin"
adb shell mkdir ${EXE_DIR}
MODELS_DIR="data/local/tmp/models"
adb shell mkdir ${MODELS_DIR}
for file in `ls ${MODELS_SRC}`
do
adb shell mkdir ${MODELS_DIR}"/"${file}
done
IMAGES_DIR="data/local/tmp/images"
adb shell mkdir ${IMAGES_DIR}
LIB_PATH="../../build/release/arm-v7a/build/*"
adb push ${EXE_FILE} ${EXE_DIR}
adb push ${LIB_PATH} ${EXE_DIR}
if [[ $1 != "npm" ]]; then
adb push ${IMAGE_PATH} ${IMAGES_DIR}
adb push ${MODELS_PATH} ${MODELS_DIR}
fi
echo "test-op or test-net below : "
adb shell ls /data/local/tmp/bin
echo "**** choose OP or NET to test ****"
read -p "which to test : " test_name
adb shell "cd /data/local/tmp/bin; LD_LIBRARY_PATH=. ./${test_name}"
}
if [[ $1 == "npm" ]]; then
push_fn $1
else
push_fn
fi
\ No newline at end of file
set(ARCH "armv7-a")
set(FLOAT_ABI "softfp" CACHE STRING "-mfloat-abi chosen")
set_property(CACHE FLOAT_ABI PROPERTY STRINGS "softfp" "soft" "hard")
set(FPU "neon")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -march=${ARCH} -mfloat-abi=${FLOAT_ABI} -mfpu=${FPU}")
...@@ -15,17 +15,15 @@ build_for_mac() { ...@@ -15,17 +15,15 @@ build_for_mac() {
fi fi
PLATFORM="x86" PLATFORM="x86"
MODE="Release" MODE="Release"
CXX_FLAGS="-std=c++11 -O3 -s" BUILD_DIR=../build/release/"${PLATFORM}"
BUILD_DIR=build/release/"${PLATFORM}"
mkdir -p ${BUILD_DIR}/build mkdir -p ${BUILD_DIR}/build
mkdir -p ${BUILD_DIR}/test mkdir -p ${BUILD_DIR}/test
cp -r test/models ${BUILD_DIR}/test/models cp -r ../test/models ${BUILD_DIR}/test/models
cmake . \ cmake .. \
-B"${BUILD_DIR}" \ -B"${BUILD_DIR}" \
-DCMAKE_BUILD_TYPE="${MODE}" \ -DCMAKE_BUILD_TYPE="${MODE}" \
-DCMAKE_CXX_FLAGS="${CXX_FLAGS}" \
-DIS_MAC=true -DIS_MAC=true
cd ${BUILD_DIR} cd ${BUILD_DIR}
...@@ -33,34 +31,38 @@ build_for_mac() { ...@@ -33,34 +31,38 @@ build_for_mac() {
} }
build_for_android() { build_for_android() {
rm -rf "../build"
if [ -z "${ANDROID_NDK}" ]; then if [ -z "${ANDROID_NDK}" ]; then
echo "ANDROID_NDK not found!" echo "ANDROID_NDK not found!"
exit -1 exit -1
fi fi
PLATFORM="arm-v7a" if [ -z "$PLATFORM" ]; then
# PLATFORM="arm-v8a" PLATFORM="arm-v7a" # Users could choose "arm-v8a" or other platforms from the command line.
fi
if [ "${PLATFORM}" = "arm-v7a" ]; then if [ "${PLATFORM}" = "arm-v7a" ]; then
ABI="armeabi-v7a with NEON" ABI="armeabi-v7a with NEON"
ARM_PLATFORM="V7" ARM_PLATFORM="V7"
CXX_FLAGS="-O3 -std=c++11 -s -march=armv7-a -mfpu=neon -mfloat-abi=softfp -pie -fPIE -w -Wno-error=format-security -llog" CXX_FLAGS="-march=armv7-a -mfpu=neon -mfloat-abi=softfp -pie -fPIE -w -Wno-error=format-security"
elif [ "${PLATFORM}" = "arm-v8a" ]; then elif [ "${PLATFORM}" = "arm-v8a" ]; then
ABI="arm64-v8a" ABI="arm64-v8a"
ARM_PLATFORM="V8" ARM_PLATFORM="V8"
CXX_FLAGS="-O3 -std=c++11 -s -march=armv8-a -pie -fPIE -w -Wno-error=format-security -llog" CXX_FLAGS="-march=armv8-a -pie -fPIE -w -Wno-error=format-security -llog"
else else
echo "unknown platform!" echo "unknown platform!"
exit -1 exit -1
fi fi
MODE="Release" MODE="Release"
ANDROID_PLATFORM_VERSION="android-15" ANDROID_PLATFORM_VERSION="android-22"
TOOLCHAIN_FILE="./tools/android-cmake/android.toolchain.cmake" TOOLCHAIN_FILE="./tools/android-cmake/android.toolchain.cmake"
ANDROID_ARM_MODE="arm" ANDROID_ARM_MODE="arm"
if [ $# -eq 1 ]; then
cmake . \ NET=$1
-B"build/release/${PLATFORM}" \ cmake .. \
-B"../build/release/${PLATFORM}" \
-DANDROID_ABI="${ABI}" \ -DANDROID_ABI="${ABI}" \
-DCMAKE_BUILD_TYPE="${MODE}" \ -DCMAKE_BUILD_TYPE="${MODE}" \
-DCMAKE_TOOLCHAIN_FILE="${TOOLCHAIN_FILE}" \ -DCMAKE_TOOLCHAIN_FILE="${TOOLCHAIN_FILE}" \
...@@ -68,30 +70,55 @@ build_for_android() { ...@@ -68,30 +70,55 @@ build_for_android() {
-DCMAKE_CXX_FLAGS="${CXX_FLAGS}" \ -DCMAKE_CXX_FLAGS="${CXX_FLAGS}" \
-DANDROID_STL=c++_static \ -DANDROID_STL=c++_static \
-DANDROID=true \ -DANDROID=true \
-D"${NET}=true" \
-D"${ARM_PLATFORM}"=true -D"${ARM_PLATFORM}"=true
else
cd "./build/release/${PLATFORM}" cmake .. \
-B"../build/release/${PLATFORM}" \
-DANDROID_ABI="${ABI}" \
-DCMAKE_BUILD_TYPE="${MODE}" \
-DCMAKE_TOOLCHAIN_FILE="${TOOLCHAIN_FILE}" \
-DANDROID_PLATFORM="${ANDROID_PLATFORM_VERSION}" \
-DCMAKE_CXX_FLAGS="${CXX_FLAGS}" \
-DANDROID_STL=c++_static \
-DANDROID=true \
-D"${ARM_PLATFORM}"=true
fi
cd "../build/release/${PLATFORM}"
make -j 8 make -j 8
} }
build_for_ios() { build_for_ios() {
rm -rf "../build"
PLATFORM="ios" PLATFORM="ios"
MODE="Release" MODE="Release"
BUILD_DIR=build/release/"${PLATFORM}" BUILD_DIR=../build/release/"${PLATFORM}"
TOOLCHAIN_FILE="./tools/ios-cmake/ios.toolchain.cmake" TOOLCHAIN_FILE="./tools/ios-cmake/ios.toolchain.cmake"
C_FLAGS="-fobjc-abi-version=2 -fobjc-arc -isysroot ${CMAKE_OSX_SYSROOT}" C_FLAGS="-fobjc-abi-version=2 -fobjc-arc -isysroot ${CMAKE_OSX_SYSROOT}"
CXX_FLAGS="-fobjc-abi-version=2 -fobjc-arc -std=gnu++11 -stdlib=libc++ -isysroot ${CMAKE_OSX_SYSROOT}" CXX_FLAGS="-fobjc-abi-version=2 -fobjc-arc -std=gnu++14 -stdlib=libc++ -isysroot ${CMAKE_OSX_SYSROOT}"
mkdir -p "${BUILD_DIR}" mkdir -p "${BUILD_DIR}"
if [ $# -eq 1 ]; then
cmake . \ NET=$1
-B"${BUILD_DIR}" \ cmake .. \
-DCMAKE_BUILD_TYPE="${MODE}" \ -B"${BUILD_DIR}" \
-DCMAKE_TOOLCHAIN_FILE="${TOOLCHAIN_FILE}" \ -DCMAKE_BUILD_TYPE="${MODE}" \
-DIOS_PLATFORM=OS \ -DCMAKE_TOOLCHAIN_FILE="${TOOLCHAIN_FILE}" \
-DCMAKE_C_FLAGS="${C_FLAGS}" \ -DIOS_PLATFORM=OS \
-DCMAKE_CXX_FLAGS="${CXX_FLAGS}" \ -DCMAKE_C_FLAGS="${C_FLAGS}" \
-DIS_IOS="true" \ -DCMAKE_CXX_FLAGS="${CXX_FLAGS}" \
-D"${NET}"=true \
-DIS_IOS="true"
else
cmake .. \
-B"${BUILD_DIR}" \
-DCMAKE_BUILD_TYPE="${MODE}" \
-DCMAKE_TOOLCHAIN_FILE="${TOOLCHAIN_FILE}" \
-DIOS_PLATFORM=OS \
-DCMAKE_C_FLAGS="${C_FLAGS}" \
-DCMAKE_CXX_FLAGS="${CXX_FLAGS}" \
-DIS_IOS="true"
fi
cd "${BUILD_DIR}" cd "${BUILD_DIR}"
make -j 8 make -j 8
} }
...@@ -105,15 +132,43 @@ if [ $# -lt 1 ]; then ...@@ -105,15 +132,43 @@ if [ $# -lt 1 ]; then
echo "available targets: mac|linux|ios|android" echo "available targets: mac|linux|ios|android"
echo "sample usage: ./build.sh mac" echo "sample usage: ./build.sh mac"
else else
if [ $1 = "mac" ]; then if [ $# -eq 2 ]; then
build_for_mac if [ $2 != "googlenet" -a $2 != "mobilenet" -a $2 != "yolo" -a $2 != "squeezenet" -a $2 != "resnet" ]; then
elif [ $1 = "linux" ]; then if [ $1 = "mac" ]; then
build_for_linux build_for_mac
elif [ $1 = "android" ]; then elif [ $1 = "linux" ]; then
build_for_android build_for_linux
elif [ $1 = "ios" ]; then elif [ $1 = "android" ]; then
build_for_ios build_for_android
else elif [ $1 = "ios" ]; then
build_error build_for_ios
else
build_error
fi
else
if [ $1 = "mac" ]; then
build_for_mac $2
elif [ $1 = "linux" ]; then
build_for_linux $2
elif [ $1 = "android" ]; then
build_for_android $2
elif [ $1 = "ios" ]; then
build_for_ios $2
else
build_error
fi
fi
else
if [ $1 = "mac" ]; then
build_for_mac
elif [ $1 = "linux" ]; then
build_for_linux
elif [ $1 = "android" ]; then
build_for_android
elif [ $1 = "ios" ]; then
build_for_ios
else
build_error
fi
fi fi
fi fi
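With the two-argument dispatch above, a hedged example invocation is ./build.sh android googlenet: only the net names checked at the top of this block are forwarded, so the name reaches build_for_android and is handed to CMake as -D"googlenet=true", letting the build trim the operator set for that net. A plain ./build.sh android configures the full operator set.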
# This file is part of the ios-cmake project. It was retrieved from # This file is based off of the Platform/Darwin.cmake and Platform/UnixPaths.cmake
# https://github.com/cristeab/ios-cmake.git, which is a fork of # files which are included with CMake 2.8.4
# https://code.google.com/p/ios-cmake/. Which in turn is based off of # It has been altered for iOS development
# the Platform/Darwin.cmake and Platform/UnixPaths.cmake files which
# are included with CMake 2.8.4 # Options:
#
# The ios-cmake project is licensed under the new BSD license.
#
# Copyright (c) 2014, Bogdan Cristea and LTE Engineering Software,
# Kitware, Inc., Insight Software Consortium. All rights reserved.
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions
# are met:
# 1. Redistributions of source code must retain the above copyright
# notice, this list of conditions and the following disclaimer.
#
# 2. Redistributions in binary form must reproduce the above copyright
# notice, this list of conditions and the following disclaimer in the
# documentation and/or other materials provided with the distribution.
#
# 3. Neither the name of the copyright holder nor the names of its
# contributors may be used to endorse or promote products derived from
# this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
# FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
# COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
# INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
# BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
# LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
# LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
# ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
# POSSIBILITY OF SUCH DAMAGE.
#
# This file is based off of the Platform/Darwin.cmake and
# Platform/UnixPaths.cmake files which are included with CMake 2.8.4
# It has been altered for iOS development.
#
# Updated by Alex Stewart (alexs.mac@gmail.com)
#
# *****************************************************************************
# Now maintained by Alexander Widerberg (widerbergaren [at] gmail.com)
# under the BSD-Clause-3 licence
# *****************************************************************************
#
# INFORMATION / HELP
#
# The following variables control the behaviour of this toolchain:
#
# IOS_PLATFORM: OS (default) or SIMULATOR or SIMULATOR64 or TVOS or SIMULATOR_TVOS
# OS = Build for iPhoneOS.
# SIMULATOR = Build for x86 i386 iPhone Simulator.
# SIMULATOR64 = Build for x86_64 iPhone Simulator.
# TVOS = Build for AppleTVOS.
# SIMULATOR_TVOS = Build for x86_64 AppleTV Simulator.
# CMAKE_OSX_SYSROOT: Path to the iOS SDK to use. By default this is
# automatically determined from IOS_PLATFORM and xcodebuild, but
# can also be manually specified (although this should not be required).
# CMAKE_IOS_DEVELOPER_ROOT: Path to the Developer directory for the iOS platform
# being compiled for. By default this is automatically determined from
# CMAKE_OSX_SYSROOT, but can also be manually specified (although this should
# not be required).
# ENABLE_BITCODE: (1|0) Enables or disables bitcode support. Default 1 (true)
# ENABLE_ARC: (1|0) Enables or disables ARC support. Default 1 (true, ARC enabled by default)
# IOS_ARCH: (armv7 armv7s arm64 i386 x86_64) If specified, will override the default architectures for the given IOS_PLATFORM
# OS = armv7 armv7s arm64
# SIMULATOR = i386
# SIMULATOR64 = x86_64
# TVOS = arm64
# SIMULATOR_TVOS = x86_64
# #
# This toolchain defines the following variables for use externally: # IOS_PLATFORM = OS (default) or SIMULATOR or SIMULATOR64
# This decides if SDKS will be selected from the iPhoneOS.platform or iPhoneSimulator.platform folders
# OS - the default, used to build for iPhone and iPad physical devices, which have an arm arch.
# SIMULATOR - used to build for the Simulator platforms, which have an x86 arch.
# #
# XCODE_VERSION: Version number (not including Build version) of Xcode detected. # CMAKE_IOS_DEVELOPER_ROOT = automatic(default) or /path/to/platform/Developer folder
# IOS_SDK_VERSION: Version of iOS SDK being used. # By default this location is automatically chosen based on the IOS_PLATFORM value above.
# CMAKE_OSX_ARCHITECTURES: Architectures being compiled for (generated from # If set manually, it will override the default location and force the user of a particular Developer Platform
# IOS_PLATFORM).
# #
# This toolchain defines the following macros for use externally: # CMAKE_IOS_SDK_ROOT = automatic(default) or /path/to/platform/Developer/SDKs/SDK folder
# By default this location is automatically chosen based on the CMAKE_IOS_DEVELOPER_ROOT value.
# In this case it will always be the most up-to-date SDK found in the CMAKE_IOS_DEVELOPER_ROOT path.
# If set manually, this will force the use of a specific SDK version
# Macros:
# #
# set_xcode_property (TARGET XCODE_PROPERTY XCODE_VALUE XCODE_VARIANT) # set_xcode_property (TARGET XCODE_PROPERTY XCODE_VALUE)
# A convenience macro for setting xcode specific properties on targets. # A convenience macro for setting xcode specific properties on targets
# Available variants are: All, Release, RelWithDebInfo, Debug, MinSizeRel # example: set_xcode_property (myioslib IPHONEOS_DEPLOYMENT_TARGET "3.1")
# example: set_xcode_property (myioslib IPHONEOS_DEPLOYMENT_TARGET "3.1" "all").
# #
# find_host_package (PROGRAM ARGS) # find_host_package (PROGRAM ARGS)
# A macro used to find executable programs on the host system, not within the # A macro used to find executable programs on the host system, not within the iOS environment.
# iOS environment. Thanks to the android-cmake project for providing the # Thanks to the android-cmake project for providing the command
# command.
# Standard settings
# Fix for PThread library not in path set (CMAKE_SYSTEM_NAME Darwin)
set(CMAKE_THREAD_LIBS_INIT "-lpthread") set (CMAKE_SYSTEM_VERSION 1)
set(CMAKE_HAVE_THREADS_LIBRARY 1) set (UNIX True)
set(CMAKE_USE_WIN32_THREADS_INIT 0) set (APPLE True)
set(CMAKE_USE_PTHREADS_INIT 1) set (IOS True)
# Get the Xcode version being used. # Required as of cmake 2.8.10
execute_process(COMMAND xcodebuild -version set (CMAKE_OSX_DEPLOYMENT_TARGET "" CACHE STRING "Force unset of the deployment target for iOS" FORCE)
OUTPUT_VARIABLE XCODE_VERSION
ERROR_QUIET # Determine the cmake host system version so we know where to find the iOS SDKs
OUTPUT_STRIP_TRAILING_WHITESPACE) find_program (CMAKE_UNAME uname /bin /usr/bin /usr/local/bin)
string(REGEX MATCH "Xcode [0-9\\.]+" XCODE_VERSION "${XCODE_VERSION}") if (CMAKE_UNAME)
string(REGEX REPLACE "Xcode ([0-9\\.]+)" "\\1" XCODE_VERSION "${XCODE_VERSION}") exec_program(uname ARGS -r OUTPUT_VARIABLE CMAKE_HOST_SYSTEM_VERSION)
message(STATUS "Building with Xcode version: ${XCODE_VERSION}") string (REGEX REPLACE "^([0-9]+)\\.([0-9]+).*$" "\\1" DARWIN_MAJOR_VERSION "${CMAKE_HOST_SYSTEM_VERSION}")
# Default to building for iPhoneOS if not specified otherwise, and we cannot endif (CMAKE_UNAME)
# determine the platform from the CMAKE_OSX_ARCHITECTURES variable. The use
# of CMAKE_OSX_ARCHITECTURES is such that try_compile() projects can correctly # Force the compilers to gcc for iOS
# determine the value of IOS_PLATFORM from the root project, as #include (CMakeForceCompiler)
# CMAKE_OSX_ARCHITECTURES is propagated to them by CMake. #CMAKE_C_COMPILER (/usr/bin/gcc)
if (NOT DEFINED IOS_PLATFORM) #CMAKE_CXX_COMPILER (/usr/bin/g++)
if (CMAKE_OSX_ARCHITECTURES) set(CMAKE_C_COMPILER /usr/bin/gcc)
if (CMAKE_OSX_ARCHITECTURES MATCHES ".*arm.*") set(CMAKE_CXX_COMPILER /usr/bin/g++)
set(IOS_PLATFORM "OS")
elseif (CMAKE_OSX_ARCHITECTURES MATCHES "i386")
set(IOS_PLATFORM "SIMULATOR")
elseif (CMAKE_OSX_ARCHITECTURES MATCHES "x86_64")
set(IOS_PLATFORM "SIMULATOR64")
endif()
endif()
if (NOT IOS_PLATFORM)
set(IOS_PLATFORM "OS")
endif()
endif()
set(IOS_PLATFORM ${IOS_PLATFORM} CACHE STRING
"Type of iOS platform for which to build.")
# Determine the platform name and architectures for use in xcodebuild commands
# from the specified IOS_PLATFORM name.
if (IOS_PLATFORM STREQUAL "OS")
set(XCODE_IOS_PLATFORM iphoneos)
if(NOT IOS_ARCH)
set(IOS_ARCH armv7 armv7s arm64)
endif()
elseif (IOS_PLATFORM STREQUAL "SIMULATOR")
set(XCODE_IOS_PLATFORM iphonesimulator)
if(NOT IOS_ARCH)
set(IOS_ARCH i386)
endif()
elseif(IOS_PLATFORM STREQUAL "SIMULATOR64")
set(XCODE_IOS_PLATFORM iphonesimulator)
if(NOT IOS_ARCH)
set(IOS_ARCH x86_64)
endif()
elseif (IOS_PLATFORM STREQUAL "TVOS")
set(XCODE_IOS_PLATFORM appletvos)
if(NOT IOS_ARCH)
set(IOS_ARCH arm64)
endif()
elseif (IOS_PLATFORM STREQUAL "SIMULATOR_TVOS")
set(XCODE_IOS_PLATFORM appletvsimulator)
if(NOT IOS_ARCH)
set(IOS_ARCH x86_64)
endif()
else()
message(FATAL_ERROR "Invalid IOS_PLATFORM: ${IOS_PLATFORM}")
endif()
message(STATUS "Configuring iOS build for platform: ${IOS_PLATFORM}, "
"architecture(s): ${IOS_ARCH}")
# If user did not specify the SDK root to use, then query xcodebuild for it.
if (NOT CMAKE_OSX_SYSROOT)
execute_process(COMMAND xcodebuild -version -sdk ${XCODE_IOS_PLATFORM} Path
OUTPUT_VARIABLE CMAKE_OSX_SYSROOT
ERROR_QUIET
OUTPUT_STRIP_TRAILING_WHITESPACE)
message(STATUS "Using SDK: ${CMAKE_OSX_SYSROOT} for platform: ${IOS_PLATFORM}")
endif()
if (NOT EXISTS ${CMAKE_OSX_SYSROOT})
message(FATAL_ERROR "Invalid CMAKE_OSX_SYSROOT: ${CMAKE_OSX_SYSROOT} "
"does not exist.")
endif()
# Specify minimum version of deployment target.
if (NOT DEFINED IOS_DEPLOYMENT_TARGET)
# Unless specified, SDK version 8.0 is used by default as minimum target version.
set(IOS_DEPLOYMENT_TARGET "8.0"
CACHE STRING "Minimum iOS version to build for." )
message(STATUS "Using the default min-version since IOS_DEPLOYMENT_TARGET not provided!")
endif()
# Use bitcode or not
if (NOT DEFINED ENABLE_BITCODE)
# Unless specified, enable bitcode support by default
set(ENABLE_BITCODE TRUE CACHE BOOL "Whether or not to enable bitcode")
message(STATUS "Enabling bitcode support by default. ENABLE_BITCODE not provided!")
endif()
# Use ARC or not
if (NOT DEFINED ENABLE_ARC)
# Unless specified, enable ARC support by default
set(ENABLE_ARC TRUE CACHE BOOL "Whether or not to enable ARC")
message(STATUS "Enabling ARC support by default. ENABLE_ARC not provided!")
endif()
# Get the SDK version information.
execute_process(COMMAND xcodebuild -sdk ${CMAKE_OSX_SYSROOT} -version SDKVersion
OUTPUT_VARIABLE IOS_SDK_VERSION
ERROR_QUIET
OUTPUT_STRIP_TRAILING_WHITESPACE)
# Find the Developer root for the specific iOS platform being compiled for
# from CMAKE_OSX_SYSROOT. Should be ../../ from SDK specified in
# CMAKE_OSX_SYSROOT. There does not appear to be a direct way to obtain
# this information from xcrun or xcodebuild.
if (NOT CMAKE_IOS_DEVELOPER_ROOT)
get_filename_component(IOS_PLATFORM_SDK_DIR ${CMAKE_OSX_SYSROOT} PATH)
get_filename_component(CMAKE_IOS_DEVELOPER_ROOT ${IOS_PLATFORM_SDK_DIR} PATH)
endif()
if (NOT EXISTS ${CMAKE_IOS_DEVELOPER_ROOT})
message(FATAL_ERROR "Invalid CMAKE_IOS_DEVELOPER_ROOT: "
"${CMAKE_IOS_DEVELOPER_ROOT} does not exist.")
endif()
# Find the C & C++ compilers for the specified SDK.
if (NOT CMAKE_C_COMPILER)
execute_process(COMMAND xcrun -sdk ${CMAKE_OSX_SYSROOT} -find clang
OUTPUT_VARIABLE CMAKE_C_COMPILER
ERROR_QUIET
OUTPUT_STRIP_TRAILING_WHITESPACE)
message(STATUS "Using C compiler: ${CMAKE_C_COMPILER}")
endif()
if (NOT CMAKE_CXX_COMPILER)
execute_process(COMMAND xcrun -sdk ${CMAKE_OSX_SYSROOT} -find clang++
OUTPUT_VARIABLE CMAKE_CXX_COMPILER
ERROR_QUIET
OUTPUT_STRIP_TRAILING_WHITESPACE)
message(STATUS "Using CXX compiler: ${CMAKE_CXX_COMPILER}")
endif()
# Find (Apple's) libtool.
execute_process(COMMAND xcrun -sdk ${CMAKE_OSX_SYSROOT} -find libtool
OUTPUT_VARIABLE IOS_LIBTOOL
ERROR_QUIET
OUTPUT_STRIP_TRAILING_WHITESPACE)
message(STATUS "Using libtool: ${IOS_LIBTOOL}")
# Configure libtool to be used instead of ar + ranlib to build static libraries.
# This is required on Xcode 7+, but should also work on previous versions of
# Xcode.
set(CMAKE_C_CREATE_STATIC_LIBRARY
"${IOS_LIBTOOL} -static -o <TARGET> <LINK_FLAGS> <OBJECTS> ")
set(CMAKE_CXX_CREATE_STATIC_LIBRARY
"${IOS_LIBTOOL} -static -o <TARGET> <LINK_FLAGS> <OBJECTS> ")
# Get the version of Darwin (OS X) of the host.
execute_process(COMMAND uname -r
OUTPUT_VARIABLE CMAKE_HOST_SYSTEM_VERSION
ERROR_QUIET
OUTPUT_STRIP_TRAILING_WHITESPACE)
# Standard settings.
set(CMAKE_SYSTEM_NAME Darwin CACHE INTERNAL "")
set(CMAKE_SYSTEM_VERSION ${IOS_SDK_VERSION} CACHE INTERNAL "")
set(UNIX TRUE CACHE BOOL "")
set(APPLE TRUE CACHE BOOL "")
set(IOS TRUE CACHE BOOL "")
set(CMAKE_AR ar CACHE FILEPATH "" FORCE) set(CMAKE_AR ar CACHE FILEPATH "" FORCE)
set(CMAKE_RANLIB ranlib CACHE FILEPATH "" FORCE)
# Force unset of OS X-specific deployment target (otherwise autopopulated), # Skip the platform compiler checks for cross compiling
# required as of cmake 2.8.10. set (CMAKE_CXX_COMPILER_WORKS TRUE)
set(CMAKE_OSX_DEPLOYMENT_TARGET "" CACHE STRING set (CMAKE_C_COMPILER_WORKS TRUE)
"Must be empty for iOS builds." FORCE)
# Set the architectures for which to build. # All iOS/Darwin specific settings - some may be redundant
set(CMAKE_OSX_ARCHITECTURES ${IOS_ARCH} CACHE STRING "Build architecture for iOS") set (CMAKE_SHARED_LIBRARY_PREFIX "lib")
# Skip the platform compiler checks for cross compiling. set (CMAKE_SHARED_LIBRARY_SUFFIX ".dylib")
set(CMAKE_CXX_COMPILER_FORCED TRUE) set (CMAKE_SHARED_MODULE_PREFIX "lib")
set(CMAKE_CXX_COMPILER_WORKS TRUE) set (CMAKE_SHARED_MODULE_SUFFIX ".so")
set(CMAKE_C_COMPILER_FORCED TRUE) set (CMAKE_MODULE_EXISTS 1)
set(CMAKE_C_COMPILER_WORKS TRUE) set (CMAKE_DL_LIBS "")
# All iOS/Darwin specific settings - some may be redundant.
set(CMAKE_SHARED_LIBRARY_PREFIX "lib") set (CMAKE_C_OSX_COMPATIBILITY_VERSION_FLAG "-compatibility_version ")
set(CMAKE_SHARED_LIBRARY_SUFFIX ".dylib") set (CMAKE_C_OSX_CURRENT_VERSION_FLAG "-current_version ")
set(CMAKE_SHARED_MODULE_PREFIX "lib") set (CMAKE_CXX_OSX_COMPATIBILITY_VERSION_FLAG "${CMAKE_C_OSX_COMPATIBILITY_VERSION_FLAG}")
set(CMAKE_SHARED_MODULE_SUFFIX ".so") set (CMAKE_CXX_OSX_CURRENT_VERSION_FLAG "${CMAKE_C_OSX_CURRENT_VERSION_FLAG}")
set(CMAKE_MODULE_EXISTS 1)
set(CMAKE_DL_LIBS "") # Hidden visibility is required for cxx on iOS
set(CMAKE_C_OSX_COMPATIBILITY_VERSION_FLAG "-compatibility_version ") set (CMAKE_C_FLAGS_INIT "")
set(CMAKE_C_OSX_CURRENT_VERSION_FLAG "-current_version ") set (CMAKE_CXX_FLAGS_INIT "-fvisibility=hidden -fvisibility-inlines-hidden")
set(CMAKE_CXX_OSX_COMPATIBILITY_VERSION_FLAG "${CMAKE_C_OSX_COMPATIBILITY_VERSION_FLAG}")
set(CMAKE_CXX_OSX_CURRENT_VERSION_FLAG "${CMAKE_C_OSX_CURRENT_VERSION_FLAG}") set (CMAKE_C_LINK_FLAGS "-Wl,-search_paths_first ${CMAKE_C_LINK_FLAGS}")
message(STATUS "Building for minimum iOS version: ${IOS_DEPLOYMENT_TARGET}" set (CMAKE_CXX_LINK_FLAGS "-Wl,-search_paths_first ${CMAKE_CXX_LINK_FLAGS}")
" (SDK version: ${IOS_SDK_VERSION})")
# Note that only Xcode 7+ supports the newer more specific: set (CMAKE_PLATFORM_HAS_INSTALLNAME 1)
# -m${XCODE_IOS_PLATFORM}-version-min flags, older versions of Xcode use: set (CMAKE_SHARED_LIBRARY_CREATE_C_FLAGS "-dynamiclib -headerpad_max_install_names")
# -m(ios/ios-simulator)-version-min instead. set (CMAKE_SHARED_MODULE_CREATE_C_FLAGS "-bundle -headerpad_max_install_names")
if (IOS_PLATFORM STREQUAL "OS") set (CMAKE_SHARED_MODULE_LOADER_C_FLAG "-Wl,-bundle_loader,")
if (XCODE_VERSION VERSION_LESS 7.0) set (CMAKE_SHARED_MODULE_LOADER_CXX_FLAG "-Wl,-bundle_loader,")
set(XCODE_IOS_PLATFORM_VERSION_FLAGS set (CMAKE_FIND_LIBRARY_SUFFIXES ".dylib" ".so" ".a")
"-mios-version-min=${IOS_DEPLOYMENT_TARGET}")
else() # hack: if a new cmake (which uses CMAKE_INSTALL_NAME_TOOL) runs on an old build tree
# Xcode 7.0+ uses flags we can build directly from XCODE_IOS_PLATFORM. # (where install_name_tool was hardcoded) and where CMAKE_INSTALL_NAME_TOOL isn't in the cache
set(XCODE_IOS_PLATFORM_VERSION_FLAGS # and still cmake didn't fail in CMakeFindBinUtils.cmake (because it isn't rerun)
"-m${XCODE_IOS_PLATFORM}-version-min=${IOS_DEPLOYMENT_TARGET}") # hardcode CMAKE_INSTALL_NAME_TOOL here to install_name_tool, so it behaves as it did before, Alex
endif()
elseif (IOS_PLATFORM STREQUAL "TVOS")
set(XCODE_IOS_PLATFORM_VERSION_FLAGS
"-mtvos-version-min=${IOS_DEPLOYMENT_TARGET}")
elseif (IOS_PLATFORM STREQUAL "SIMULATOR_TVOS")
set(XCODE_IOS_PLATFORM_VERSION_FLAGS
"-mtvos-simulator-version-min=${IOS_DEPLOYMENT_TARGET}")
else()
# SIMULATOR or SIMULATOR64 both use -mios-simulator-version-min.
set(XCODE_IOS_PLATFORM_VERSION_FLAGS
"-mios-simulator-version-min=${IOS_DEPLOYMENT_TARGET}")
endif()
message(STATUS "Version flags set to: ${XCODE_IOS_PLATFORM_VERSION_FLAGS}")
if (ENABLE_BITCODE)
set(BITCODE "-fembed-bitcode")
message(STATUS "Enabling bitcode support.")
else()
set(BITCODE "")
message(STATUS "Disabling bitcode support.")
endif()
if (ENABLE_ARC)
set(FOBJC_ARC "-fobjc-arc")
message(STATUS "Enabling ARC support.")
else()
set(FOBJC_ARC "-fno-objc-arc")
message(STATUS "Disabling ARC support.")
endif()
set(CMAKE_C_FLAGS
"${XCODE_IOS_PLATFORM_VERSION_FLAGS} ${BITCODE} -fobjc-abi-version=2 ${FOBJC_ARC} ${C_FLAGS}")
# Hidden visibility is required for C++ on iOS.
set(CMAKE_CXX_FLAGS
"${XCODE_IOS_PLATFORM_VERSION_FLAGS} ${BITCODE} -fvisibility=hidden -fvisibility-inlines-hidden -fobjc-abi-version=2 ${FOBJC_ARC} ${CXX_FLAGS}")
set(CMAKE_CXX_FLAGS_MINSIZEREL "${CMAKE_CXX_FLAGS} -DNDEBUG -Os -fomit-frame-pointer -ffast-math ${BITCODE} ${CXX_FLAGS_MINSIZEREL}")
set(CMAKE_CXX_FLAGS_RELWITHDEBINFO "${CMAKE_CXX_FLAGS} -DNDEBUG -O2 -g -fomit-frame-pointer -ffast-math ${BITCODE} ${CXX_FLAGS_RELWITHDEBINFO}")
set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS} -DNDEBUG -O3 -fomit-frame-pointer -ffast-math ${BITCODE} ${CXX_FLAGS_RELEASE}")
set(CMAKE_C_LINK_FLAGS "${XCODE_IOS_PLATFORM_VERSION_FLAGS} -Wl,-search_paths_first ${C_LINK_FLAGS}")
set(CMAKE_CXX_LINK_FLAGS "${XCODE_IOS_PLATFORM_VERSION_FLAGS} -Wl,-search_paths_first ${CXX_LINK_FLAGS}")
# In order to ensure that the updated compiler flags are used in try_compile()
# tests, we have to forcibly set them in the CMake cache, not merely set them
# in the local scope.
list(APPEND VARS_TO_FORCE_IN_CACHE
CMAKE_C_FLAGS
CMAKE_CXX_FLAGS
CMAKE_CXX_FLAGS_RELWITHDEBINFO
CMAKE_CXX_FLAGS_MINSIZEREL
CMAKE_CXX_FLAGS_RELEASE
CMAKE_C_LINK_FLAGS
CMAKE_CXX_LINK_FLAGS)
foreach(VAR_TO_FORCE ${VARS_TO_FORCE_IN_CACHE})
set(${VAR_TO_FORCE} "${${VAR_TO_FORCE}}" CACHE STRING "" FORCE)
endforeach()
set(CMAKE_PLATFORM_HAS_INSTALLNAME 1)
set(CMAKE_SHARED_LIBRARY_CREATE_C_FLAGS "-dynamiclib -headerpad_max_install_names")
set(CMAKE_SHARED_MODULE_CREATE_C_FLAGS "-bundle -headerpad_max_install_names")
set(CMAKE_SHARED_MODULE_LOADER_C_FLAG "-Wl,-bundle_loader,")
set(CMAKE_SHARED_MODULE_LOADER_CXX_FLAG "-Wl,-bundle_loader,")
set(CMAKE_FIND_LIBRARY_SUFFIXES ".dylib" ".so" ".a")
# Hack: if a new cmake (which uses CMAKE_INSTALL_NAME_TOOL) runs on an old
# build tree (where install_name_tool was hardcoded) and where
# CMAKE_INSTALL_NAME_TOOL isn't in the cache and still cmake didn't fail in
# CMakeFindBinUtils.cmake (because it isn't rerun) hardcode
# CMAKE_INSTALL_NAME_TOOL here to install_name_tool, so it behaves as it did
# before, Alex.
if (NOT DEFINED CMAKE_INSTALL_NAME_TOOL) if (NOT DEFINED CMAKE_INSTALL_NAME_TOOL)
find_program(CMAKE_INSTALL_NAME_TOOL install_name_tool) find_program(CMAKE_INSTALL_NAME_TOOL install_name_tool)
endif (NOT DEFINED CMAKE_INSTALL_NAME_TOOL) endif (NOT DEFINED CMAKE_INSTALL_NAME_TOOL)
# Set the find root to the iOS developer roots and to user defined paths.
set(CMAKE_FIND_ROOT_PATH ${CMAKE_IOS_DEVELOPER_ROOT} ${CMAKE_OSX_SYSROOT} # Setup iOS platform unless specified manually with IOS_PLATFORM
${CMAKE_PREFIX_PATH} CACHE string "iOS find search path root" FORCE) if (NOT DEFINED IOS_PLATFORM)
# Default to searching for frameworks first. set (IOS_PLATFORM "OS")
set(CMAKE_FIND_FRAMEWORK FIRST) endif (NOT DEFINED IOS_PLATFORM)
# Set up the default search directories for frameworks. set (IOS_PLATFORM ${IOS_PLATFORM} CACHE STRING "Type of iOS Platform")
set(CMAKE_SYSTEM_FRAMEWORK_PATH
${CMAKE_OSX_SYSROOT}/System/Library/Frameworks # Setup building for arm64 or not
${CMAKE_OSX_SYSROOT}/System/Library/PrivateFrameworks if (NOT DEFINED BUILD_ARM64)
${CMAKE_OSX_SYSROOT}/Developer/Library/Frameworks) set (BUILD_ARM64 true)
# Only search the specified iOS SDK, not the remainder of the host filesystem. endif (NOT DEFINED BUILD_ARM64)
set(CMAKE_FIND_ROOT_PATH_MODE_PROGRAM ONLY) set (BUILD_ARM64 ${BUILD_ARM64} CACHE STRING "Build arm64 arch or not")
set(CMAKE_FIND_ROOT_PATH_MODE_LIBRARY ONLY)
set(CMAKE_FIND_ROOT_PATH_MODE_INCLUDE ONLY) # Check the platform selection and setup for developer root
# This little macro lets you set any XCode specific property. if (${IOS_PLATFORM} STREQUAL "OS")
macro(set_xcode_property TARGET XCODE_PROPERTY XCODE_VALUE XCODE_RELVERSION) set (IOS_PLATFORM_LOCATION "iPhoneOS.platform")
set(XCODE_RELVERSION_I "${XCODE_RELVERSION}")
if (XCODE_RELVERSION_I STREQUAL "All") # This causes the installers to properly locate the output libraries
set_property(TARGET ${TARGET} PROPERTY set (CMAKE_XCODE_EFFECTIVE_PLATFORMS "-iphoneos")
XCODE_ATTRIBUTE_${XCODE_PROPERTY} "${XCODE_VALUE}") elseif (${IOS_PLATFORM} STREQUAL "SIMULATOR")
else() set (SIMULATOR true)
set_property(TARGET ${TARGET} PROPERTY set (IOS_PLATFORM_LOCATION "iPhoneSimulator.platform")
XCODE_ATTRIBUTE_${XCODE_PROPERTY}[variant=${XCODE_RELVERSION_I}] "${XCODE_VALUE}")
endif() # This causes the installers to properly locate the output libraries
endmacro(set_xcode_property) set (CMAKE_XCODE_EFFECTIVE_PLATFORMS "-iphonesimulator")
# This macro lets you find executable programs on the host system. elseif (${IOS_PLATFORM} STREQUAL "SIMULATOR64")
macro(find_host_package) set (SIMULATOR true)
set(CMAKE_FIND_ROOT_PATH_MODE_PROGRAM NEVER) set (IOS_PLATFORM_LOCATION "iPhoneSimulator.platform")
set(CMAKE_FIND_ROOT_PATH_MODE_LIBRARY NEVER)
set(CMAKE_FIND_ROOT_PATH_MODE_INCLUDE NEVER) # This causes the installers to properly locate the output libraries
set(IOS FALSE) set (CMAKE_XCODE_EFFECTIVE_PLATFORMS "-iphonesimulator")
else (${IOS_PLATFORM} STREQUAL "OS")
message (FATAL_ERROR "Unsupported IOS_PLATFORM value selected. Please choose OS or SIMULATOR")
endif (${IOS_PLATFORM} STREQUAL "OS")
# Setup iOS developer location unless specified manually with CMAKE_IOS_DEVELOPER_ROOT
# Note Xcode 4.3 changed the installation location, choose the most recent one available
exec_program(/usr/bin/xcode-select ARGS -print-path OUTPUT_VARIABLE CMAKE_XCODE_DEVELOPER_DIR)
set (XCODE_POST_43_ROOT "${CMAKE_XCODE_DEVELOPER_DIR}/Platforms/${IOS_PLATFORM_LOCATION}/Developer")
set (XCODE_PRE_43_ROOT "/Developer/Platforms/${IOS_PLATFORM_LOCATION}/Developer")
if (NOT DEFINED CMAKE_IOS_DEVELOPER_ROOT)
if (EXISTS ${XCODE_POST_43_ROOT})
set (CMAKE_IOS_DEVELOPER_ROOT ${XCODE_POST_43_ROOT})
elseif(EXISTS ${XCODE_PRE_43_ROOT})
set (CMAKE_IOS_DEVELOPER_ROOT ${XCODE_PRE_43_ROOT})
endif (EXISTS ${XCODE_POST_43_ROOT})
endif (NOT DEFINED CMAKE_IOS_DEVELOPER_ROOT)
set (CMAKE_IOS_DEVELOPER_ROOT ${CMAKE_IOS_DEVELOPER_ROOT} CACHE PATH "Location of iOS Platform")
# Find and use the most recent iOS sdk unless specified manually with CMAKE_IOS_SDK_ROOT
if (NOT DEFINED CMAKE_IOS_SDK_ROOT)
file (GLOB _CMAKE_IOS_SDKS "${CMAKE_IOS_DEVELOPER_ROOT}/SDKs/*")
if (_CMAKE_IOS_SDKS)
list (SORT _CMAKE_IOS_SDKS)
list (REVERSE _CMAKE_IOS_SDKS)
list (GET _CMAKE_IOS_SDKS 0 CMAKE_IOS_SDK_ROOT)
else (_CMAKE_IOS_SDKS)
message (FATAL_ERROR "No iOS SDK's found in default search path ${CMAKE_IOS_DEVELOPER_ROOT}. Manually set CMAKE_IOS_SDK_ROOT or install the iOS SDK.")
endif (_CMAKE_IOS_SDKS)
message (STATUS "Toolchain using default iOS SDK: ${CMAKE_IOS_SDK_ROOT}")
endif (NOT DEFINED CMAKE_IOS_SDK_ROOT)
set (CMAKE_IOS_SDK_ROOT ${CMAKE_IOS_SDK_ROOT} CACHE PATH "Location of the selected iOS SDK")
# Set the sysroot default to the most recent SDK
set (CMAKE_OSX_SYSROOT ${CMAKE_IOS_SDK_ROOT} CACHE PATH "Sysroot used for iOS support")
# set the architecture for iOS
if (${IOS_PLATFORM} STREQUAL "OS")
set (IOS_ARCH armv7 armv7s arm64)
elseif (${IOS_PLATFORM} STREQUAL "SIMULATOR")
set (IOS_ARCH i386)
elseif (${IOS_PLATFORM} STREQUAL "SIMULATOR64")
set (IOS_ARCH x86_64)
endif (${IOS_PLATFORM} STREQUAL "OS")
set (CMAKE_OSX_ARCHITECTURES ${IOS_ARCH} CACHE string "Build architecture for iOS")
# Set the find root to the iOS developer roots and to user defined paths
set (CMAKE_FIND_ROOT_PATH ${CMAKE_IOS_DEVELOPER_ROOT} ${CMAKE_IOS_SDK_ROOT} ${CMAKE_PREFIX_PATH} CACHE string "iOS find search path root")
# default to searching for frameworks first
set (CMAKE_FIND_FRAMEWORK FIRST)
# set up the default search directories for frameworks
set (CMAKE_SYSTEM_FRAMEWORK_PATH
${CMAKE_IOS_SDK_ROOT}/System/Library/Frameworks
${CMAKE_IOS_SDK_ROOT}/System/Library/PrivateFrameworks
${CMAKE_IOS_SDK_ROOT}/Developer/Library/Frameworks
)
# only search the iOS sdks, not the remainder of the host filesystem
set (CMAKE_FIND_ROOT_PATH_MODE_PROGRAM ONLY)
set (CMAKE_FIND_ROOT_PATH_MODE_LIBRARY ONLY)
set (CMAKE_FIND_ROOT_PATH_MODE_INCLUDE ONLY)
# This little macro lets you set any XCode specific property
macro (set_xcode_property TARGET XCODE_PROPERTY XCODE_VALUE)
set_property (TARGET ${TARGET} PROPERTY XCODE_ATTRIBUTE_${XCODE_PROPERTY} ${XCODE_VALUE})
endmacro (set_xcode_property)
# This macro lets you find executable programs on the host system
macro (find_host_package)
set (CMAKE_FIND_ROOT_PATH_MODE_PROGRAM NEVER)
set (CMAKE_FIND_ROOT_PATH_MODE_LIBRARY NEVER)
set (CMAKE_FIND_ROOT_PATH_MODE_INCLUDE NEVER)
set (IOS FALSE)
find_package(${ARGN}) find_package(${ARGN})
set(IOS TRUE)
set(CMAKE_FIND_ROOT_PATH_MODE_PROGRAM ONLY) set (IOS TRUE)
set(CMAKE_FIND_ROOT_PATH_MODE_LIBRARY ONLY) set (CMAKE_FIND_ROOT_PATH_MODE_PROGRAM ONLY)
set(CMAKE_FIND_ROOT_PATH_MODE_INCLUDE ONLY) set (CMAKE_FIND_ROOT_PATH_MODE_LIBRARY ONLY)
endmacro(find_host_package) set (CMAKE_FIND_ROOT_PATH_MODE_INCLUDE ONLY)
endmacro (find_host_package)
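# Hypothetical usage sketch for find_host_package: while cross-compiling for
# iOS, a tool that must run on the build machine can be located with
#   find_host_package(Protobuf REQUIRED)
# which lifts the find-root restrictions for the duration of the call and then
# restores them. (Protobuf here is only an illustration, not a dependency this
# project is known to resolve this way.)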
set(NET "googlenet" CACHE STRING "select net type")
set_property(CACHE NET PROPERTY STRINGS "default" "googlenet" "mobilenet" "yolo" "squeezenet" "resnet")
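# Illustrative configure-time selection (value names follow the STRINGS list
# above; the build-directory layout is an assumption):
#   cmake -DNET=mobilenet ..
# compiles in only the operators that net needs; any other value falls
# through to the full operator set in the else() branch below.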
if (NET EQUAL "googlenet")
set(CONCAT_OP ON)
set(CONV_OP ON)
set(LRN_OP ON)
set(MUL_OP ON)
set(ELEMENTWISEADD_OP ON)
set(FUSION_FC_OP ON)
set(POOL_OP ON)
set(RELU_OP ON)
set(FUSION_CONVADD_OP ON)
set(FUSION_CONVADD_RELU_OP ON)
elseif (NET EQUAL "mobilenet")
set(CONV_OP ON)
set(ELEMENTWISEADD_OP ON)
set(RELU_OP ON)
set(SOFTMAX_OP ON)
set(DEPTHWISECONV_OP ON)
set(BATCHNORM_OP ON)
set(POOL_OP ON)
set(RESHAPE_OP ON)
elseif (NET EQUAL "yolo")
set(BATCHNORM_OP ON)
set(CONV_OP ON)
set(RELU_OP ON)
set(ELEMENTWISEADD_OP ON)
elseif (NET EQUAL "squeezenet")
set(CONCAT_OP ON)
set(CONV_OP ON)
set(RELU_OP ON)
set(ELEMENTWISEADD_OP ON)
set(POOL_OP ON)
set(RESHAPE_OP ON)
set(SOFTMAX_OP ON)
elseif (NET EQUAL "resnet")
set(CONV_OP ON)
set(BATCHNORM_OP ON)
set(ELEMENTWISEADD_OP ON)
set(SOFTMAX_OP ON)
set(MUL_OP ON)
set(POOL_OP ON)
set(RELU_OP ON)
else ()
set(BATCHNORM_OP ON)
set(BOXCODER_OP ON)
set(CONCAT_OP ON)
set(CONV_OP ON)
set(DEPTHWISECONV_OP ON)
set(ELEMENTWISEADD_OP ON)
set(FUSION_CONVADD_OP ON)
set(CONVADDRELU_OP ON)
set(FUSION_FC_OP ON)
set(LRN_OP ON)
set(MUL_OP ON)
set(MULTICLASSNMS_OP ON)
set(POOL_OP ON)
set(PRIORBOX_OP ON)
set(RELU_OP ON)
set(RESHAPE_OP ON)
set(SIGMOID_OP ON)
set(SOFTMAX_OP ON)
set(TRANSPOSE_OP ON)
set(FUSION_CONVADD_RELU_OP ON)
# option(BATCHNORM_OP "" ON)
# option(BOXCODER_OP "" ON)
# option(CONCAT_OP "" ON)
# option(CONV_OP "" ON)
# option(DEPTHWISECONV_OP "" ON)
# option(ELEMENTWISEADD_OP "" ON)
# option(FUSION_CONVADD_OP "" ON)
# option(CONVADDRELU_OP "" ON)
# option(FUSION_FC_OP "" ON)
# option(LRN_OP "" ON)
# option(MUL_OP "" ON)
# option(MULTICLASSNMS_OP "" ON)
# option(POOL_OP "" ON)
# option(PRIORBOX_OP "" ON)
# option(RELU_OP "" ON)
# option(RESHAPE_OP "" ON)
# option(SIGMOID_OP "" ON)
# option(SOFTMAX_OP "" ON)
# option(TRANSPOSE_OP "" ON)
# option(FUSION_CONVADD_RELU_OP "" ON)
endif ()
if (BATCHNORM_OP)
add_definitions(-DBATCHNORM_OP)
endif()
if (BOXCODER_OP)
add_definitions(-DBOXCODER_OP)
endif()
if (CONCAT_OP)
add_definitions(-DCONCAT_OP)
endif()
if (CONV_OP)
add_definitions(-DCONV_OP)
endif()
if (DEPTHWISECONV_OP)
add_definitions(-DDEPTHWISECONV_OP)
endif()
if (ELEMENTWISEADD_OP)
add_definitions(-DELEMENTWISEADD_OP)
endif()
if (FUSION_CONVADD_OP)
add_definitions(-DFUSION_CONVADD_OP)
endif()
if (CONVADDRELU_OP)
add_definitions(-DCONVADDRELU_OP)
endif()
if (FUSION_FC_OP)
add_definitions(-DFUSION_FC_OP)
endif()
if (LRN_OP)
add_definitions(-DLRN_OP)
endif()
if (MUL_OP)
add_definitions(-DMUL_OP)
endif()
if (MULTICLASSNMS_OP)
add_definitions(-DMULTICLASSNMS_OP)
endif()
if (POOL_OP)
add_definitions(-DPOOL_OP)
endif()
if (PRIORBOX_OP)
add_definitions(-DPRIORBOX_OP)
endif()
if (RELU_OP)
add_definitions(-DRELU_OP)
endif()
if (RESHAPE_OP)
add_definitions(-DRESHAPE_OP)
endif()
if (SIGMOID_OP)
add_definitions(-DSIGMOID_OP)
endif()
if (SOFTMAX_OP)
add_definitions(-DSOFTMAX_OP)
endif()
if (TRANSPOSE_OP)
add_definitions(-DTRANSPOSE_OP)
endif()
if (FUSION_CONVADD_RELU_OP)
add_definitions(-DFUSION_CONVADD_RELU_OP)
endif()
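# The *_OP definitions above act as preprocessor guards around operator
# registration in the sources. An illustrative (not verbatim) shape:
#   #ifdef CONV_OP
#   // the conv2d operator and its kernels are compiled and registered
#   #endif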
if ! [[ $version == *"$VERSION"* ]]; then
  exit -1
fi

# https://medicineyeh.wordpress.com/2017/07/13/clang-format-with-pragma/
shift
perl -i -pe 's|#pragma\s+omp|// <TRICKY-CLANG-FORMAT-PRAGMA-FIX> #pragma omp|' "$@"
clang-format -i $@
perl -i -pe 's|// <TRICKY-CLANG-FORMAT-PRAGMA-FIX> ||' "$@"
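# Worked example of the pragma trick above (hypothetical input): a source line
#   #pragma omp parallel for
# is rewritten by the first perl pass to
#   // <TRICKY-CLANG-FORMAT-PRAGMA-FIX> #pragma omp parallel for
# so clang-format sees an ordinary comment and leaves the pragma untouched;
# the second perl pass strips the marker, restoring the original line.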
#!/usr/bin/env sh
cat <<EOF
<html>
<head>
<style>
html, body {
position: absolute;
width: 100%;
height: 100%;
margin: 0;
}
div.timeview {
width: 100%;
position: relative;
overflow: scroll;
}
ul {
position: absolute;
margin: 0;
list-style:none;
padding: 0;
margin: 0;
}
li {
height: 15px;
position: absolute;
background: blue;
}
li:nth-child(odd) {
background: blue;
}
li:nth-child(even) {
background: rebeccapurple;
}
ul.timeline {
z-index: -1;
}
ul.timeline li {
position: relative;
height: 15px;
width: 100%;
}
ul.timeline li:nth-child(odd) {
background: beige;
}
ul.timeline li:nth-child(even) {
background: antiquewhite;
}
</style>
</head>
<body>
<div class="timeview">
<ul>
EOF
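# Assumed layout of the profile dump in $1, inferred from the field names used
# in the awk below: $1 opid, $2 optype, $3 tid, $4 clock-begin, $5 clock-end,
# $6 elapsed. min/max recover the global time span used to scale the timeline.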
min=$(awk 'NR==1{min=$4} NR>1{if($4 < min) min=$4} END{print min}' $1)
max=$(awk 'NR==1{max=$5} NR>1{if($5 > max) max=$5} END{print max}' $1)
sort $1 -k1,1n | awk -v max="$max" -v min="$min" '
BEGIN {
total = max - min
}
{
opid = $1
optype = $2
tid = $3
cb = $4
ce = $5
cl = $6
sum += ce - cb  # accumulate per-op elapsed time (not printed in this pass)
print "<li class=\"timeline\"" \
" data-opid=\"" opid "\"" \
" data-optype=\"" optype "\"" \
" data-tid=\"" tid "\"" \
" data-begin=\"" cb "\"" \
" data-end=\"" ce "\"" \
"></li>"
}
'
cat <<EOF
</ul>
</div>
<pre>
EOF
echo "==================[ profile ]==================="
cat $1 | awk '
NR>1{
optype = $2
sum += $5 - $4
count[$2] += $6
}
END {
for (t in count) {
msg = sprintf("%-16s\t%-10d\t%-.4f", t, count[t], count[t]*100 / sum);
print msg
}
}' | sort -k2,2nr
cat $1 | awk '
NR>1{
sum += $5 - $4
}
END {
msg = sprintf("%-16s\t%-10d\t%-.4f", "total", sum, 100);
print msg
}'
cat <<EOF
</pre>
<script>
const min = $min;
const max = $max;
const px_per_nanosecond = 1/1000000;
const scale = px_per_nanosecond;
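// Assuming the dumped clocks are in nanoseconds (as the constant's name
// suggests), 1/1e6 px per ns renders 1 ms as 1 px, so a 1 s run spans 1000 px.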
const li = document.querySelectorAll('li');
const thread = new Set();
for (let i = 0; i < li.length; i++) {
const prof = li[i].dataset;
li[i].style.width = (prof.end - prof.begin)*scale + 'px';
li[i].style.left = (prof.begin - min)*scale + 'px';
li[i].style.top = prof.tid * 15 + 'px';
thread.add(prof.tid);
}
const ul = document.createElement('ul');
ul.classList.add('timeline');
ul.style.width = (max - min)*scale + 'px';
thread.forEach(i => {
const l = document.createElement('li');
ul.appendChild(l);
});
const timeview = document.querySelector('.timeview');
timeview.appendChild(ul);
timeview.style.height = thread.size * 15 + 'px';
</script>
</body>
</html>
EOF
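# Hypothetical invocation (script and file names are assumptions):
#   sh profile_to_html.sh profile.txt > timeline.html
# emits a self-contained HTML page: the timeline laid out by the inline
# script above, plus the per-operator time breakdown inside <pre>.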
set(ANDROID_ARM_NEON ON)
include("${CMAKE_CURRENT_LIST_DIR}/../android-cmake/android.toolchain.cmake")
# CMake toolchain file for building ARM software on Linux environment
set(CMAKE_SYSTEM_NAME Linux)
set(CMAKE_SYSTEM_VERSION 1)
set(CMAKE_C_COMPILER /usr/bin/arm-linux-gnueabi-gcc)
set(CMAKE_CXX_COMPILER /usr/bin/arm-linux-gnueabi-g++)
set(CMAKE_STRIP /usr/bin/arm-linux-gnueabi-strip)
set(CMAKE_FIND_ROOT_PATH /usr/arm-linux-gnueabi)
set(CMAKE_FIND_ROOT_PATH_MODE_PROGRAM NEVER)
set(CMAKE_FIND_ROOT_PATH_MODE_LIBRARY ONLY)
set(CMAKE_FIND_ROOT_PATH_MODE_INCLUDE ONLY)
include("${CMAKE_CURRENT_LIST_DIR}/../arm-platform.cmake")