Commit 1a220755 authored by qnqinan

update src code with remote

Parent 48e8715a
cmake_minimum_required(VERSION 3.0)
project(paddle-mobile)
# select the platform to build
option(CPU "armv7 with neon support" ON)
option(MALI_GPU "mali gpu support" OFF)
option(FPGA "fpga support" OFF)
cmake_minimum_required(VERSION 3.0.0)
option(USE_OPENMP "openmp support" OFF)
option(USE_OPENMP "openmp support" ON)
option(DEBUGING "enable debug mode" ON)
option(USE_EXCEPTION "use std exception" OFF)
option(USE_EXCEPTION "use std exception" ON)
option(SYMBOL_HIDDEN "symbol hidden" OFF) # on when use jni or ios io
option(LOG_PROFILE "log profile" OFF)
# select the platform to build
option(CPU "armv7 with neon" ON)
option(GPU_MALI "mali gpu" OFF)
option(GPU_CL "opencl gpu" OFF)
option(FPGA "fpga" OFF)
if(FPGA)
option(FPGAV1 "fpga v1" ON)
option(FPGAV2 "fpga v2" OFF)
endif()
project(paddle-mobile)
file(GLOB_RECURSE PADDLE_MOBILE_CC src/*.cc src/*.cpp src/*.c src/*.mm)
file(GLOB_RECURSE PADDLE_MOBILE_H src/*.h)
@@ -29,10 +36,10 @@ if(DEBUGING)
message(STATUS "debugging mode")
add_definitions(-DPADDLE_MOBILE_DEBUG)
else()
if(FPGA)
else()
add_definitions(-fvisibility=hidden -fvisibility-inlines-hidden)
endif()
endif()
if(SYMBOL_HIDDEN)
add_definitions(-fvisibility=hidden -fvisibility-inlines-hidden)
endif()
if(USE_EXCEPTION)
@@ -70,7 +77,27 @@ else()
endforeach()
endif()
if(MALI_GPU)
if (GPU_CL)
add_definitions(-DPADDLE_MOBILE_CL)
# opencl version
add_definitions(-DCL_TARGET_OPENCL_VERSION=220)
link_libraries(${CMAKE_CURRENT_LIST_DIR}/third_party/opencl/libOpenCL.so)
include_directories(third_party/opencl/OpenCL-Headers)
else()
file(GLOB_RECURSE _tmp_list src/framework/cl/*.cpp src/operators/kernel/cl/*.cpp)
foreach(f ${_tmp_list})
list(REMOVE_ITEM PADDLE_MOBILE_CC ${f})
endforeach()
file(GLOB_RECURSE _tmp_list_h src/framework/cl/*.h)
foreach(f ${_tmp_list_h})
list(REMOVE_ITEM PADDLE_MOBILE_H ${f})
endforeach()
endif()
if (GPU_MALI)
add_definitions(-DPADDLE_MOBILE_MALI_GPU)
add_definitions(-DUSE_ACL=1)
add_definitions(-DUSE_OPENCL)
@@ -96,8 +123,43 @@ else()
endif()
if(FPGA)
message("FPGA mode enabled")
add_definitions(-DPADDLE_MOBILE_FPGA)
file(GLOB_RECURSE _tmp_list src/operators/math/*.cpp src/operators/kernel/fpga/*.cc)
foreach(f ${_tmp_list})
list(REMOVE_ITEM PADDLE_MOBILE_CC ${f})
endforeach()
file(GLOB_RECURSE _tmp_list_h src/operators/math/*.h)
foreach(f ${_tmp_list_h})
list(REMOVE_ITEM PADDLE_MOBILE_H ${f})
endforeach()
list(APPEND PADDLE_MOBILE_CC src/operators/math/softmax.cpp)
list(APPEND PADDLE_MOBILE_H src/operators/math/softmax.h)
list(APPEND PADDLE_MOBILE_H src/operators/math/math_func_neon.h)
if(FPGAV1)
message("FPGA_V1 enabled")
add_definitions(-DPADDLE_MOBILE_FPGA_V1)
file(GLOB_RECURSE _tmp_list src/operators/kernel/fpga/V2/*.cpp src/fpga/V2/*.cpp)
foreach(f ${_tmp_list})
list(REMOVE_ITEM PADDLE_MOBILE_CC ${f})
endforeach()
file(GLOB_RECURSE _tmp_list src/operators/kernel/fpga/V2/*.h src/fpga/V2/*.h)
foreach(f ${_tmp_list})
list(REMOVE_ITEM PADDLE_MOBILE_CC ${f})
endforeach()
endif()
if(FPGAV2)
message("FPGA_V2 enabled")
add_definitions(-DPADDLE_MOBILE_FPGA_V2)
file(GLOB_RECURSE _tmp_list src/operators/kernel/fpga/V1/*.cpp src/fpga/V1/*.cpp)
foreach(f ${_tmp_list})
list(REMOVE_ITEM PADDLE_MOBILE_CC ${f})
endforeach()
file(GLOB_RECURSE _tmp_list src/operators/kernel/fpga/V1/*.h src/fpga/V1/*.h)
foreach(f ${_tmp_list})
list(REMOVE_ITEM PADDLE_MOBILE_CC ${f})
endforeach()
endif()
else()
file(GLOB_RECURSE _tmp_list src/operators/kernel/fpga/*.cpp src/operators/kernel/fpga/*.cc)
foreach(f ${_tmp_list})
@@ -124,17 +186,17 @@ endif()
if(ANDROID_NDK_TOOLCHAIN_INCLUDED)
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -llog")
else()
list(REMOVE_ITEM PADDLE_MOBILE_H ${CMAKE_CURRENT_SOURCE_DIR}/src/jni/paddle_mobile_jni.h)
list(REMOVE_ITEM PADDLE_MOBILE_CC ${CMAKE_CURRENT_SOURCE_DIR}/src/jni/paddle_mobile_jni.cpp)
list(REMOVE_ITEM PADDLE_MOBILE_H ${CMAKE_CURRENT_SOURCE_DIR}/src/io/jni/paddle_mobile_jni.h)
list(REMOVE_ITEM PADDLE_MOBILE_CC ${CMAKE_CURRENT_SOURCE_DIR}/src/io/jni/paddle_mobile_jni.cpp)
list(REMOVE_ITEM PADDLE_MOBILE_H ${CMAKE_CURRENT_SOURCE_DIR}/src/operators/math/math_func_neon.h)
endif()
if(IS_IOS)
else()
list(REMOVE_ITEM PADDLE_MOBILE_H ${CMAKE_CURRENT_SOURCE_DIR}/src/ios_io/PaddleMobileCPU.h)
list(REMOVE_ITEM PADDLE_MOBILE_CC ${CMAKE_CURRENT_SOURCE_DIR}/src/ios_io/PaddleMobileCPU.mm)
list(REMOVE_ITEM PADDLE_MOBILE_H ${CMAKE_CURRENT_SOURCE_DIR}/src/ios_io/op_symbols.h)
endif()
list(REMOVE_ITEM PADDLE_MOBILE_H ${CMAKE_CURRENT_SOURCE_DIR}/src/io/ios_io/PaddleMobileCPU.h)
list(REMOVE_ITEM PADDLE_MOBILE_CC ${CMAKE_CURRENT_SOURCE_DIR}/src/io/ios_io/PaddleMobileCPU.mm)
list(REMOVE_ITEM PADDLE_MOBILE_H ${CMAKE_CURRENT_SOURCE_DIR}/src/io/ios_io/op_symbols.h)
endif ()
set(CMAKE_VERBOSE_MAKEFILE ON)
set(CMAKE_EXPORT_COMPILE_COMMANDS ON)
@@ -143,8 +205,10 @@ set(CMAKE_LIBRARY_OUTPUT_DIRECTORY build)
set(CMAKE_RUNTIME_OUTPUT_DIRECTORY build)
# NET default
if(FPGA)
set(NET "FPGAnets" CACHE STRING "select net type")
if(FPGAV1)
set(NET "FPGA_NET_V1" CACHE STRING "select net type")
elseif(FPGAV2)
set(NET "FPGA_NET_V2" CACHE STRING "select net type")
else()
set(NET "default" CACHE STRING "select net type")
endif()
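# Example (usage sketch): configuring with `cmake .. -DFPGA=ON -DFPGAV1=ON`
# selects NET=FPGA_NET_V1; passing -DNET=<name> on the command line pre-populates
# the cache, so these cached defaults will not override it.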
......
@@ -8,46 +8,23 @@
[![License](https://img.shields.io/badge/license-Apache%202-blue.svg)](LICENSE)-->
Welcome to the Paddle-Mobile GitHub project.
Paddle-Mobile is a project under the PaddlePaddle organization: a deep learning framework dedicated to embedded platforms. Its design stays closely aligned with the latest fluid version of PaddlePaddle, while adding extensive optimizations for embedded use; performance, binary size, power consumption, and hardware-platform coverage were all considered from the very start of the design.
## Live results in the Simple Search app
The GIF below shows the production object-detection feature of the Simple Search app:
![ezgif-1-050a733dfb](http://otkwwi4x8.bkt.clouddn.com/2018-07-05-ezgif-1-050a733dfb.gif)
## Demo directory
[Click here](https://github.com/PaddlePaddle/paddle-mobile/tree/develop/demo)
Welcome to the Paddle-Mobile GitHub project. Paddle-Mobile is a project under the PaddlePaddle organization: a deep learning framework dedicated to embedded platforms.
## Features
- **ARM CPU**
- **Mali GPU**
- **GPU Metal implementation for Apple devices**
- **FPGA**
- High-performance ARM CPU support
- Mali GPU support
- Adreno GPU support
- GPU Metal support on Apple devices
- Support for FPGA development boards such as ZU5 and ZU9
- Support for arm-linux boards such as Raspberry Pi
The ZCU102 development board is currently supported.
## Demo
- [ANDROID](https://github.com/xiebaiyuan/paddle-mobile-demo)
- **Flexibility**
  * The paddle-mobile CPU build depends on no third-party libraries and can be integrated quickly.
  * Template specialization is used to switch platforms, so CPU, GPU, and other coprocessors can be swapped flexibly (see the sketch after this list).
  * Only the ops required by a particular network need to be compiled, cutting build time and binary size.
  * Docker-based builds provide a uniform build environment.
  * Highly extensible: other coprocessors are easy to add, and the high-performance ARM operator implementations make integration convenient for coprocessor developers.
  * Directly compatible with paddle-fluid models; no extra conversion step is needed.
- **Size**
paddle-mobile has treated mobile binary size as a first-class concern since its initial design, and the CPU implementation has no external dependencies. Ops that a given network does not need are never compiled in, and build-option tuning shrinks the binary further.
Beyond the binary itself, we also keep the source compact; the code size of the whole repository is very small.
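A minimal sketch of the template-specialization switching described above, modeled on the `DeviceType` aliases from `src/common/types.h`; `ReluKernel` is a hypothetical kernel name used only for illustration:
```
// Compile-time platform switching via template specialization (sketch).
#include <iostream>

enum DeviceTypeEnum { kINVALID = -1, kCPU = 0, kFPGA = 1, kGPU_MALI = 2, kGPU_CL = 3 };

template <DeviceTypeEnum T>
struct DeviceType {};

typedef DeviceType<kCPU> CPU;
typedef DeviceType<kGPU_CL> GPU_CL;

// Hypothetical kernel: one specialization per device tag.
template <typename Device>
struct ReluKernel;

template <>
struct ReluKernel<CPU> {
  void Run() { std::cout << "relu on ARM CPU (NEON path)\n"; }
};

template <>
struct ReluKernel<GPU_CL> {
  void Run() { std::cout << "relu on GPU via OpenCL\n"; }
};

int main() {
  ReluKernel<CPU>().Run();     // the device is picked at compile time
  ReluKernel<GPU_CL>().Run();  // no runtime dispatch is involved
  return 0;
}
```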
### Original demo directory
[https://github.com/PaddlePaddle/paddle-mobile/tree/develop/demo](https://github.com/PaddlePaddle/paddle-mobile/tree/develop/demo)
## Documentation
@@ -62,6 +39,7 @@ Paddle-Mobile is a project under the PaddlePaddle organization, a deep learning framework dedicated to embedded
* [iOS](https://github.com/PaddlePaddle/paddle-mobile/blob/develop/doc/development_ios.md)
* [Android](https://github.com/PaddlePaddle/paddle-mobile/blob/develop/doc/development_android.md)
* [FPGA](https://github.com/PaddlePaddle/paddle-mobile/blob/develop/doc/development_fpga.md)
* [ARM_LINUX](https://github.com/PaddlePaddle/paddle-mobile/blob/develop/doc/development_arm_linux.md)
### Contributing
- [Contribution guide](https://github.com/PaddlePaddle/paddle-mobile/blob/develop/CONTRIBUTING.md)
@@ -73,18 +51,22 @@ Paddle-Mobile is a project under the PaddlePaddle organization, a deep learning framework dedicated to embedded
### 1. Train directly with Paddle Fluid
This is the most reliable approach and the recommended one.
### 2. Convert a Caffe model to Paddle Fluid
[Link](https://github.com/PaddlePaddle/models/tree/develop/fluid/image_classification/caffe2fluid)
[https://github.com/PaddlePaddle/models/tree/develop/fluid/image_classification/caffe2fluid](https://github.com/PaddlePaddle/models/tree/develop/fluid/image_classification/caffe2fluid)
### 3. ONNX
ONNX stands for "Open Neural Network Exchange". The project's goal is to let different neural-network frameworks interoperate.
Besides training a fluid model directly with PaddlePaddle, individual Paddle fluid models can also be obtained through ONNX conversion.
Baidu is also working on ONNX support. The conversion project lives here: [paddle-onnx](https://github.com/PaddlePaddle/paddle-onnx)
![](http://7xop3k.com1.z0.glb.clouddn.com/15311951836000.jpg)
Baidu is also working on ONNX support. The conversion project lives here:
[https://github.com/PaddlePaddle/paddle-onnx](https://github.com/PaddlePaddle/paddle-onnx)
### 4. Download some test models and test images
[Download link](http://mms-graph.bj.bcebos.com/paddle-mobile%2FmodelsAndImages.zip)
[http://mms-graph.bj.bcebos.com/paddle-mobile%2FmodelsAndImages.zip](http://mms-graph.bj.bcebos.com/paddle-mobile%2FmodelsAndImages.zip)
<!--## Live results in the Simple Search app
The GIF below shows the production object-detection feature of the Simple Search app:
![ezgif-1-050a733dfb](http://otkwwi4x8.bkt.clouddn.com/2018-07-05-ezgif-1-050a733dfb.gif)-->
## Troubleshooting
@@ -96,5 +78,3 @@ Paddle-Mobile is released under the relatively permissive Apache-2.0 license [Apache-2.0 license](L
## Legacy Mobile-Deep-Learning
The original MDL (Mobile-Deep-Learning) project has moved here: [Mobile-Deep-Learning](https://github.com/allonli/mobile-deep-learning)
## How to run the demo
- Android demo download link:
http://mms-graph.bj.bcebos.com/paddle-mobile%2FPaddleMobile_Android.zip
- iOS demo download link:
http://mms-graph.bj.bcebos.com/paddle-mobile%2FPaddleMobileDemo_iOS.zip
Run the demo download script in the demo directory
## Demo download links
- [ANDROID](http://mms-graph.bj.bcebos.com/paddle-mobile%2FPaddleMobile_Android.zip)
- [IOS](http://mms-graph.bj.bcebos.com/paddle-mobile%2FPaddleMobileDemo_iOS.zip)
- Alternatively, the demos can be fetched with getDemo.sh:
```
sh getDemo.sh
```
The demo project is then downloaded and unpacked into the current directory.
\ No newline at end of file
## paddle-mobile GPU development documentation
For build-environment setup, see the development_android.md document.
1. Download paddle-mobile
```
git clone https://github.com/PaddlePaddle/paddle-mobile.git
adb pull /system/vendor/lib/libOpenCL.so paddle-mobile/third_party/opencl
# Edit paddle-mobile/CMakeLists.txt and turn on the OpenCL GPU option:
#   option(GPU_CL "opencl gpu" OFF)  ->  option(GPU_CL "opencl gpu" ON)
cd paddle-mobile/tools
sh build.sh android
```
2. Deploy the unit-test executable and model to the phone
Download the mobilenet model and the test_image_1x3x224x224_float file needed for testing from: http://mms-graph.bj.bcebos.com/paddle-mobile/opencl_test_src.zip
```
cd ../test
mkdir models
mkdir images
```
Copy mobilenet into the paddle-mobile/test/models directory.
Copy test_image_1x3x224x224_float into the paddle-mobile/test/images directory.
Run the following commands to deploy the executable and the files needed for inference to the phone:
```
cd ../tools/android-debug-script
sh push2android.sh
```
3. Run the corresponding executable in an adb shell (currently only mobilenet is supported; more network models will follow)
```
adb shell
cd /data/local/tmp/bin/
export LD_LIBRARY_PATH=.
./test-mobilenetgpu
```
4. mobilenet CPU inference
Assuming mobilenet and test_image_1x3x224x224_float have already been pushed to the phone, run the following commands for mobilenet CPU inference:
```
adb shell
cd /data/local/tmp/bin/
export LD_LIBRARY_PATH=.
./test-mobilenet
```
5. Inference results

| Device | mobilenet GPU | mobilenet CPU (1 thread) | mobilenet CPU (2 threads) | mobilenet CPU (4 threads) |
| --- | --- | --- | --- | --- |
| Xiaomi 6 (CPU 835, GPU Adreno 540) | ~41 ms | 108 ms | 65 ms | 38 ms |
| OPPO Find X (CPU 845, GPU Adreno 630) | ~27 ms | 90 ms | 50 ms | 29 ms |
# ARM_LINUX development documentation
paddle-mobile can currently be compiled directly on arm_linux platforms.
## Example: Raspberry Pi 3
### Build
From the paddle-mobile root directory, run:
```
cd tools
/bin/bash build.sh arm_linux googlenet
```
When the build finishes, the generated .so is under paddle-mobile/build/release/arm-linux/build, and the unit-test executables are under test/build.
### Run
```
cd ../build/release/arm-linux/build
export LD_LIBRARY_PATH=.
cd ../../../../test/build/
./test-googlenet
```
*Note 1: if there is no model under the local test directory, the official demo model is downloaded and unpacked automatically.*
*Note 2: because arm_linux devices have limited compute, it is recommended to build only the model you need (e.g. googlenet) or to enlarge the system swap space, so the build does not hang.*
## Other ARM_LINUX platforms
Other arm_linux platforms can be built by adjusting the relevant build parameters in tools/build.sh; refer to the build options for the platform in question.
Note in particular that for Android you should follow the Android development documentation.
# FPGA development documentation
The FPGA code has been tested with Resnet50 on a Xilinx ZCU102 revision 1.0 board, and the inference results are correct.
The FPGA code comes in two versions, V1 and V2. V1 has been tested with Resnet50 on a Xilinx ZCU102 revision 1.0 board, and the inference results are correct. The description below covers reproducing the V1 results.
## Prepare the hardware
___
@@ -17,7 +17,7 @@ ___
## Build the project
___
1. Copy the latest paddle-mobile code onto the ZCU102 board.
2. In the paddle-mobile root directory, set the platform in CMakeLists.txt to option(FPGA "fpga support" ON), and set the CPU and MALI\_GPU options to OFF.
2. In the paddle-mobile root directory, set the platform in CMakeLists.txt to option(FPGA "fpga support" ON), and set the CPU and MALI\_GPU options to OFF. Also set option(FPGAV1 "fpga v1" ON) and option(FPGAV2 "fpga v2" OFF).
3. Run the following commands to generate the test-resnet50 executable under ./test/build:
* mkdir build
* cd build
......
@@ -13,7 +13,9 @@ See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <chrono>
#include <chrono> // NOLINT
namespace paddle_mobile {
using Time = decltype(std::chrono::high_resolution_clock::now());
@@ -25,3 +27,5 @@ inline double time_diff(Time t1, Time t2) {
ms counter = std::chrono::duration_cast<ms>(diff);
return counter.count() / 1000.0;
}
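// Usage sketch:
//   Time t1 = std::chrono::high_resolution_clock::now();
//   /* ... work ... */
//   double elapsed = time_diff(t1, std::chrono::high_resolution_clock::now());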
} // namespace paddle_mobile
@@ -46,7 +46,8 @@ struct PaddleMobileException : public std::exception {
std::string detail(buffer); \
throw paddle_mobile::PaddleMobileException("Custom Exception", buffer, \
__FILE__, __LINE__); \
}
} \
exit(0);
#define PADDLE_MOBILE_ENFORCE(stat, ...) \
{ \
......
@@ -82,6 +82,7 @@ std::unordered_map<
{G_OP_TYPE_FUSION_CONV_ADD, {{"Input"}, {"Out"}}},
{G_OP_TYPE_RELU, {{"X"}, {"Out"}}},
{G_OP_TYPE_SOFTMAX, {{"X"}, {"Out"}}},
{G_OP_TYPE_SIGMOID, {{"X"}, {"Out"}}},
{G_OP_TYPE_MUL, {{"X"}, {"Out"}}},
{G_OP_TYPE_ELEMENTWISE_ADD, {{"X", "Y"}, {"Out"}}},
{G_OP_TYPE_POOL2D, {{"X"}, {"Out"}}},
......
@@ -39,7 +39,13 @@ struct PrecisionTrait<Precision::FP16> {
};
//! device type
enum DeviceTypeEnum { kINVALID = -1, kCPU = 0, kFPGA = 1, kGPU_MALI = 2 };
enum DeviceTypeEnum {
kINVALID = -1,
kCPU = 0,
kFPGA = 1,
kGPU_MALI = 2,
kGPU_CL = 3
};
template <DeviceTypeEnum T>
struct DeviceType {};
@@ -47,6 +53,7 @@ struct DeviceType {};
typedef DeviceType<kCPU> CPU;
typedef DeviceType<kFPGA> FPGA;
typedef DeviceType<kGPU_MALI> GPU_MALI;
typedef DeviceType<kGPU_CL> GPU_CL;
//! data type
enum DataType {
......
@@ -12,15 +12,15 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "fpga/api.h"
#include "fpga/V1/api.h"
#include <fcntl.h>
#include <sys/ioctl.h>
#include <sys/mman.h>
#include <algorithm>
#include <map>
#include "fpga/bias_scale.h"
#include "fpga/filter.h"
#include "fpga/image.h"
#include "fpga/V1/bias_scale.h"
#include "fpga/V1/filter.h"
#include "fpga/V1/image.h"
#define FPGA_TEST_MODE
#define PADDLE_MOBILE_OS_LINUX
......
@@ -12,9 +12,9 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "fpga/bias_scale.h"
#include "fpga/V1/bias_scale.h"
#include <memory.h>
#include "fpga/api.h"
#include "fpga/V1/api.h"
namespace paddle_mobile {
namespace fpga {
......
@@ -12,10 +12,10 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "fpga/filter.h"
#include "fpga/V1/filter.h"
#include <memory.h>
#include <algorithm>
#include "fpga/api.h"
#include "fpga/V1/api.h"
namespace paddle_mobile {
namespace fpga {
......
@@ -12,10 +12,10 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "fpga/image.h"
#include "fpga/V1/image.h"
#include <memory.h>
#include <algorithm>
#include "fpga/api.h"
#include "fpga/V1/api.h"
namespace paddle_mobile {
namespace fpga {
......
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "fpga/V2/api.h"
#include <algorithm>
#include "fpga/V2/bias_scale.h"
#include "fpga/V2/config.h"
#include "fpga/V2/filter.h"
#include "fpga/V2/image.h"
namespace paddle_mobile {
namespace fpga {
static std::map<void *, size_t> memory_map;
int open_device() {
int ret = open_device_driver();
return ret;
}
int close_device() {
int ret = close_device_driver();
return ret;
}
void *fpga_malloc(size_t size) {
static uint64_t counter = 0;
#ifdef PADDLE_MOBILE_ZU5
auto ptr = fpga_malloc_driver(size);
#else
auto ptr = malloc(size);
#endif
counter += size;
memory_map.insert(std::make_pair(ptr, size));
// DLOG << "Address: " << ptr << ", " << size << " bytes allocated. Total "
// << counter << " bytes";
return ptr;
}
void fpga_free(void *ptr) {
static uint64_t counter = 0;
size_t size = 0;
auto iter = memory_map.find(ptr); // std::map<void *, size_t>::iterator
if (iter != memory_map.end()) {
size = iter->second;
memory_map.erase(iter);
#ifdef PADDLE_MOBILE_ZU5
fpga_free_driver(ptr);
#else
free(ptr);
#endif
counter += size;
// DLOG << "Address: " << ptr << ", " << size << " bytes freed. Total "
// << counter << " bytes";
} else {
DLOG << "Invalid pointer";
}
}
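// fp32_2_fp16 repacks an IEEE-754 float into half precision by shifting the
// sign, exponent and mantissa fields into place. The (112 << 10) term rebiases
// the exponent from FP32 (bias 127) to FP16 (bias 15), since 127 - 15 = 112;
// bit 0x1000 is the highest truncated mantissa bit, used to round up.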
half fp32_2_fp16(float fp32_num) {
unsigned long tmp = *(unsigned long *)(&fp32_num); // NOLINT
auto t = (half)(((tmp & 0x007fffff) >> 13) | ((tmp & 0x80000000) >> 16) |
(((tmp & 0x7f800000) >> 13) - (112 << 10)));
if (tmp & 0x1000) {
t++; // roundoff
}
return t;
}
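// fp16_2_fp32 reverses the packing above; adding 112 to the stored exponent
// restores the FP32 bias (15 + 112 = 127).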
float fp16_2_fp32(half fp16_num) {
int frac = (fp16_num & 0x3ff);
int exp = ((fp16_num & 0x7c00) >> 10) + 112;
int s = fp16_num & 0x8000;
int tmp = 0;
float fp32_num;
tmp = s << 16 | exp << 23 | frac << 13;
fp32_num = *(float *)&tmp; // NOLINT
return fp32_num;
}
void format_image(framework::Tensor *image_tensor) {
auto dims = image_tensor->dims();
auto channel = dims[1], height = dims[2], width = dims[3];
auto data_ptr = image_tensor->data<float>();
size_t memory_size = channel * height * width * sizeof(float);
auto new_data = (float *)fpga_malloc(memory_size); // NOLINT
memcpy(new_data, data_ptr, memory_size);
int aligned_channel = filter::calc_aligned_channel((int)channel); // NOLINT
image::format_image(&new_data, (int)channel, (int)height, // NOLINT
(int)width, // NOLINT
aligned_channel);
image_tensor->reset_data_ptr(new_data);
}
void format_fp16_ofm(framework::Tensor *ofm_tensor, int aligned_channel) {
auto dims = ofm_tensor->dims();
size_t memory_size = 0;
if (dims.size() == 4) {
auto height = dims[2], width = dims[3];
memory_size = (height + 1) / 2 * 2 * width * aligned_channel * sizeof(half);
} else if (dims.size() == 2) {
memory_size = aligned_channel * sizeof(half);
} else {
DLOG << "Wrong ofm dimension";
}
auto p = fpga_malloc(memory_size);
memset(p, 0, memory_size);
ofm_tensor->reset_data_ptr(p);
}
void format_fp32_ofm(framework::Tensor *ofm_tensor, int aligned_channel) {
auto dims = ofm_tensor->dims();
size_t memory_size = 0;
if (dims.size() == 4) {
auto height = dims[2], width = dims[3];
memory_size = height * width * aligned_channel * sizeof(float);
} else if (dims.size() == 2) {
memory_size = aligned_channel * sizeof(float);
} else {
DLOG << "Wrong ofm dimension";
}
auto p = fpga_malloc(memory_size);
memset(p, 0, memory_size);
ofm_tensor->reset_data_ptr(p);
}
float filter_find_max(framework::Tensor *filter_tensor) {
auto filter_ptr = filter_tensor->data<float>();
return filter::find_max(filter_ptr, (int)filter_tensor->numel()); // NOLINT
}
int get_aligned_channel_num(int channel_num) {
return filter::calc_aligned_channel(channel_num);
}
int get_aligned_filter_num(framework::Tensor *filter_tensor) {
auto dims = filter_tensor->dims();
return filter::calc_aligned_num((int)dims[0], (int)dims[1]); // NOLINT
}
int get_conv_output_channel(framework::Tensor *filter_tensor) {
int aligned_filter_num = get_aligned_filter_num(filter_tensor);
return get_aligned_channel_num(aligned_filter_num);
}
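// Filters are quantized to int8 against the per-tensor max: scale[0] (max/127)
// maps int8 values back to float, and scale[1] (127/max) maps float to int8.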
void format_filter(framework::Tensor *filter_tensor, float max_value,
int group_num) {
filter_tensor->scale[0] = float(max_value / 127.0); // NOLINT
filter_tensor->scale[1] = float(127.0 / max_value); // NOLINT
auto dims = filter_tensor->dims();
auto num = dims[0], channel = dims[1], height = dims[2], width = dims[3];
auto data_ptr = filter_tensor->data<float>();
size_t memory_size = num * channel * height * width * sizeof(float);
auto new_data = (float *)fpga_malloc(memory_size); // NOLINT
memcpy(new_data, data_ptr, memory_size);
filter::format_filter(&new_data, (int)num, (int)channel, // NOLINT
(int)height, // NOLINT
(int)width, group_num, max_value); // NOLINT
filter_tensor->reset_data_ptr(new_data);
}
void format_fc_filter(framework::Tensor *filter_tensor, float max_value) {
filter_tensor->scale[0] = float(max_value / 127.0); // NOLINT
filter_tensor->scale[1] = float(127.0 / max_value); // NOLINT
auto dims = filter_tensor->dims();
auto num = dims[0], channel = dims[1], height = dims[2], width = dims[3];
auto data_ptr = filter_tensor->data<float>();
size_t memory_size = num * channel * height * width * sizeof(float);
auto new_data = (float *)fpga_malloc(memory_size); // NOLINT
memcpy(new_data, data_ptr, memory_size);
filter::format_fc_filter(&new_data, (int)num, (int)channel, // NOLINT
(int)height, // NOLINT
(int)width, 1, max_value); // NOLINT
filter_tensor->reset_data_ptr(new_data);
}
void format_bias_scale_array(float **bias_scale_array, int filter_num,
int filter_channel) {
int num_after_alignment =
filter::calc_aligned_num(filter_channel, filter_channel);
bias_scale::format_bias_scale_array(bias_scale_array, filter_num,
num_after_alignment);
}
void format_concat_output(framework::Tensor *out, int height, int width,
uint32_t out_channel) {
auto data_ptr = fpga_malloc(out_channel * height * width * sizeof(half));
auto ddim = framework::make_ddim({1, out_channel, height, width});
out->Resize(ddim);
out->reset_data_ptr(data_ptr);
}
int format_conv_data(framework::Tensor *filter_tensor,
framework::Tensor *ofm_tensor, float *bs_ptr, int group) {
float max_value = fpga::filter_find_max(filter_tensor);
fpga::format_filter(filter_tensor, max_value, group);
int aligned_num = get_aligned_filter_num(filter_tensor);
fpga::format_bias_scale_array(&bs_ptr,
(int)filter_tensor->dims()[0], // NOLINT
aligned_num);
int aligned_channel = fpga::get_conv_output_channel(filter_tensor);
fpga::format_fp16_ofm(ofm_tensor, aligned_channel);
DLOG << aligned_channel;
return aligned_channel;
}
int format_fc_data(framework::Tensor *filter_tensor,
framework::Tensor *ofm_tensor, float *bs_ptr) {
float max_value = fpga::filter_find_max(filter_tensor);
fpga::format_fc_filter(filter_tensor, max_value);
int aligned_num = get_aligned_filter_num(filter_tensor);
fpga::format_bias_scale_array(&bs_ptr,
(int)filter_tensor->dims()[0], // NOLINT
aligned_num);
int aligned_channel = fpga::get_conv_output_channel(filter_tensor);
fpga::format_fp16_ofm(ofm_tensor, aligned_channel);
DLOG << aligned_channel;
return aligned_channel;
}
void fill_split_arg(struct SplitConvArgs *arg, framework::Tensor *input,
framework::Tensor *out, framework::Tensor *filter,
bool relu_enabled, int group_num, int stride_h,
int stride_w, int padding_h, int padding_w, float *bs_ptr) {
auto input_ptr = input->data<float>();
auto filter_ptr = filter->data<float>();
auto out_ptr = out->data<float>();
arg->group_num = (uint32_t)group_num;
arg->split_num = 1;
arg->filter_num = (uint32_t)filter->dims()[0];
arg->output.address = out_ptr;
arg->output.scale_address = out->scale;
arg->conv_args =
(ConvArgs *)fpga_malloc(arg->split_num * sizeof(ConvArgs)); // NOLINT
arg->concat_arg.image_num = arg->split_num;
arg->concat_arg.image_out = out_ptr;
arg->concat_arg.scale_out = out->scale;
arg->concat_arg.height = (uint32_t)out->dims()[2];
arg->concat_arg.width = (uint32_t)out->dims()[3];
int n = arg->split_num;
arg->concat_arg.images_in =
(half **)fpga_malloc(n * sizeof(int *)); // NOLINT
arg->concat_arg.scales_in =
(float **)fpga_malloc(n * sizeof(float *)); // NOLINT
arg->concat_arg.channel_num =
(uint32_t *)fpga_malloc(n * sizeof(uint32_t)); // NOLINT
for (int i = 0; i < n; i++) {
arg->conv_args[i].relu_enabled = relu_enabled;
arg->conv_args[i].sb_address = bs_ptr;
arg->conv_args[i].filter_address = (int8_t *)filter_ptr; // NOLINT
arg->conv_args[i].filter_scale_address = filter->scale;
arg->conv_args[i].filter_num = arg->filter_num;
arg->conv_args[i].group_num = (uint32_t)group_num;
arg->conv_args[i].kernel.stride_h = (uint32_t)stride_h;
arg->conv_args[i].kernel.stride_w = (uint32_t)stride_w;
arg->conv_args[i].kernel.height = (uint32_t)filter->dims()[2];
arg->conv_args[i].kernel.width = (uint32_t)filter->dims()[3];
arg->conv_args[i].image.address = input_ptr;
arg->conv_args[i].image.scale_address = input->scale;
arg->conv_args[i].image.channels = (uint32_t)input->dims()[1];
arg->conv_args[i].image.height = (uint32_t)input->dims()[2];
arg->conv_args[i].image.width = (uint32_t)input->dims()[3];
arg->conv_args[i].image.pad_height = (uint32_t)padding_h;
arg->conv_args[i].image.pad_width = (uint32_t)padding_w;
arg->conv_args[i].output.address = out_ptr;
arg->conv_args[i].output.scale_address = out->scale;
}
}
} // namespace fpga
} // namespace paddle_mobile
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <stdint.h>
#include <cstddef>
#include <iostream>
#include <limits>
#include "fpga/V2/driver/driver.h"
#include "fpga/V2/driver/pe.h"
#include "framework/tensor.h"
namespace paddle_mobile {
namespace fpga {
enum DataType {
DATA_TYPE_FP32 = 1,
DATA_TYPE_FP16 = 0,
};
enum LayoutType {
LAYOUT_CHW = 1,
LAYOUT_HWC = 0,
};
struct KernelArgs {
uint32_t width;
uint32_t height;
uint32_t stride_w;
uint32_t stride_h;
};
struct ImageInputArgs {
void* address; // input featuremap virtual address
float* scale_address; // input scale address;
uint32_t channels;
uint32_t width; // featuremap width
uint32_t height;
uint32_t pad_width; // padding width;
uint32_t pad_height;
};
struct ImageOutputArgs {
void* address; // output result address;
float* scale_address; // output scale address;
uint64_t timer_cnt; // time counter for FPGA computation
};
struct ConvArgs {
bool relu_enabled;
void* sb_address; // scale and bias are interlaced;
void* filter_address;
float* filter_scale_address;
uint32_t filter_num;
uint32_t group_num;
struct KernelArgs kernel;
struct ImageInputArgs image; // input image;
struct ImageOutputArgs output;
};
struct ConcatArgs {
uint32_t image_num;
half** images_in;
float** scales_in;
void* image_out;
float* scale_out;
uint32_t* channel_num;
uint32_t* aligned_channel_num;
uint32_t out_channel;
uint32_t height;
uint32_t width;
};
struct SplitConvArgs {
uint32_t split_num;
uint32_t group_num;
uint32_t filter_num;
struct ImageOutputArgs output;
struct ConvArgs* conv_args;
struct ConcatArgs concat_arg;
};
struct PoolingArgs {
int16_t mode; // mode: 0:max, 1:avg
half kernel_reciprocal;
struct KernelArgs kernel;
struct ImageInputArgs image; // input image;
struct ImageOutputArgs output;
};
struct EWAddArgs {
bool relu_enabled;
uint32_t const0; // output0 = const0 x input0 + const1 x input1;
uint32_t const1;
struct ImageInputArgs image0;
struct ImageInputArgs image1;
struct ImageOutputArgs output;
};
struct BypassArgs {
enum DataType input_data_type;
enum DataType output_data_type;
enum LayoutType input_layout_type;
enum LayoutType output_layout_type;
struct ImageInputArgs image;
struct ImageOutputArgs output;
};
int open_device();
int close_device();
void* fpga_malloc(size_t size);
void fpga_free(void* ptr);
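// Rounds num up to the next multiple of x, e.g. align_to_x(5, 4) == 8.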
static inline int align_to_x(int num, int x) { return (num + x - 1) / x * x; }
float filter_find_max(framework::Tensor* filter_tensor);
int get_aligned_channel_num(int channel_num);
int get_aligned_filter_num(framework::Tensor* filter_tensor);
int get_conv_output_channel(framework::Tensor* filter_tensor);
void format_image(framework::Tensor* image_tensor);
void format_fp16_ofm(framework::Tensor* ofm_tensor,
int aligned_channel); // only allocate memory
void format_fp32_ofm(framework::Tensor* ofm_tensor, int aligned_channel);
void format_filter(framework::Tensor* filter_tensor, float max_value,
int group_num);
void format_fc_filter(framework::Tensor* filter_tensor, float max_value);
void format_bias_scale_array(float** bias_scale_array, int filter_num,
int filter_channel);
void format_concat_output(framework::Tensor* out, int height, int width,
uint32_t out_channel);
int format_conv_data(framework::Tensor* filter_tensor,
framework::Tensor* ofm_tensor, float* bs_ptr, int group);
int format_fc_data(framework::Tensor* filter_tensor,
framework::Tensor* ofm_tensor, float* bs_ptr);
void fill_split_arg(struct SplitConvArgs* arg, framework::Tensor* input,
framework::Tensor* out, framework::Tensor* filter,
bool relu_enabled, int group_num, int stride_h,
int stride_w, int padding_h, int padding_w, float* bs_ptr);
half fp32_2_fp16(float fp32_num);
float fp16_2_fp32(half fp16_num);
} // namespace fpga
} // namespace paddle_mobile
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "fpga/V2/bias_scale.h"
#include <memory.h>
#include "fpga/V2/api.h"
namespace paddle_mobile {
namespace fpga {
namespace bias_scale {
void align_element(float **data_in, int num, int num_after_alignment) {
float *ptr_unaligned = *data_in;
int total_element = 2 * num_after_alignment; // including bias & scale
float *ptr_aligned =
(float *)fpga_malloc(total_element * sizeof(float)); // NOLINT
memset(ptr_aligned, 0, total_element * sizeof(float));
for (int i = 0; i < num; i++) {  // start at 0 so bias[0]/scale[0] are kept
ptr_aligned[i * 2 + 0] = ptr_unaligned[i];
ptr_aligned[i * 2 + 1] = ptr_unaligned[i + num];
}
fpga_free(ptr_unaligned);
*data_in = ptr_aligned;
}
void format_bias_scale_array(float **data_in, int num,
int num_after_alignment) {
align_element(data_in, num, num_after_alignment);
}
} // namespace bias_scale
} // namespace fpga
} // namespace paddle_mobile
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
namespace paddle_mobile {
namespace fpga {
namespace bias_scale {
void align_element(float **data_in, int num, int num_after_alignment);
void format_bias_scale_array(float **data_in, int num, int num_after_alignment);
} // namespace bias_scale
} // namespace fpga
} // namespace paddle_mobile
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#define PADDLE_MOBILE_ZU5
#define FPGA_PRINT_MODE
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "fpga/V2/driver/bitmap.h"
namespace fpga_bitmap {
void bitmap_set(uint64_t *map, unsigned int start, int len) {
uint64_t *p = map + BIT_WORD(start);
const unsigned int size = start + len;
int bits_to_set = BITS_PER_LONG - (start % BITS_PER_LONG);
uint64_t mask_to_set = BITMAP_FIRST_WORD_MASK(start);
while (len - bits_to_set >= 0) {
*p |= mask_to_set;
len -= bits_to_set;
bits_to_set = BITS_PER_LONG;
mask_to_set = ~0UL;
p++;
}
if (len) {
mask_to_set &= BITMAP_LAST_WORD_MASK(size);
*p |= mask_to_set;
}
}
void bitmap_clear(uint64_t *map, unsigned int start, int len) {
uint64_t *p = map + BIT_WORD(start);
const unsigned int size = start + len;
int bits_to_clear = BITS_PER_LONG - (start % BITS_PER_LONG);
uint64_t mask_to_clear = BITMAP_FIRST_WORD_MASK(start);
while (len - bits_to_clear >= 0) {
*p &= ~mask_to_clear;
len -= bits_to_clear;
bits_to_clear = BITS_PER_LONG;
mask_to_clear = ~0UL;
p++;
}
if (len) {
mask_to_clear &= BITMAP_LAST_WORD_MASK(size);
*p &= ~mask_to_clear;
}
}
static uint64_t ffs(uint64_t data) {
uint64_t bit = 0;
int i = 0;
// Scan all 64 bits; sizeof(data) would only cover the first 8.
for (i = 0; i < 64; i++) {
if (data & (1UL << i)) {
bit = i;
break;
}
}
return bit;
}
static uint64_t _find_next_bit(const uint64_t *addr, uint64_t nbits,
uint64_t start, uint64_t invert) {
uint64_t tmp = 0;
if (!nbits || start >= nbits) return nbits;
tmp = addr[start / BITS_PER_LONG] ^ invert;
/* Handle 1st word. */
tmp &= BITMAP_FIRST_WORD_MASK(start);
start = round_down(start, BITS_PER_LONG);
while (!tmp) {
start += BITS_PER_LONG;
if (start >= nbits) return nbits;
tmp = addr[start / BITS_PER_LONG] ^ invert;
}
return (start + ffs(tmp)) < nbits ? (start + ffs(tmp)) : nbits;
}
uint64_t find_next_zero_bit(const uint64_t *addr, uint64_t size,
uint64_t offset) {
return _find_next_bit(addr, size, offset, ~0UL);
}
uint64_t find_next_bit(const uint64_t *addr, uint64_t size, uint64_t offset) {
return _find_next_bit(addr, size, offset, 0UL);
}
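// First-fit search for a run of nr zero bits: find the next zero bit, align it,
// and if a set bit interrupts the candidate window, retry just past it.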
uint64_t bitmap_find_next_zero_area_off(uint64_t *map, uint64_t size,
uint64_t start, unsigned int nr,
uint64_t align_mask,
uint64_t align_offset) {
uint64_t index = 0;
uint64_t end = 0;
uint64_t i = 0;
again:
index = find_next_zero_bit(map, size, start);
/* Align allocation */
index = __ALIGN_MASK(index + align_offset, align_mask) - align_offset;
end = index + nr;
if (end > size) return end;
i = find_next_bit(map, end, index);
if (i < end) {
start = i + 1;
goto again;
}
return index;
}
uint64_t bitmap_find_next_zero_area(uint64_t *map, uint64_t size,
uint64_t start, unsigned int nr,
uint64_t align_mask) {
return bitmap_find_next_zero_area_off(map, size, start, nr, align_mask, 0);
}
} // namespace fpga_bitmap
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <stdint.h>
#include <stdio.h>
#define BITS_PER_LONG 64
#define BIT_WORD(nr) ((nr) / BITS_PER_LONG)
#define BITMAP_FIRST_WORD_MASK(start) (~0UL << ((start) & (BITS_PER_LONG - 1)))
#define BITMAP_LAST_WORD_MASK(nbits) (~0UL >> (-(nbits) & (BITS_PER_LONG - 1)))
#define __ALIGN_KERNEL_MASK(x, mask) (((x) + (mask)) & ~(mask))
#define __ALIGN_MASK(x, mask) __ALIGN_KERNEL_MASK((x), (mask))
#define round_down(x, y) ((x) & ~((y)-1))
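// Note: round_down assumes y is a power of two.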
namespace fpga_bitmap {
void bitmap_set(uint64_t *map, unsigned int start, int len);
void bitmap_clear(uint64_t *map, unsigned int start, int len);
uint64_t bitmap_find_next_zero_area(uint64_t *map, uint64_t size,
uint64_t start, unsigned int nr,
uint64_t align_mask);
} // namespace fpga_bitmap
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <errno.h>
#include <fcntl.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <sys/mman.h>
#include <unistd.h>
#include <algorithm>
#include <cstddef>
#include <cstring>
#include <fstream>
#include <iomanip>
#include <iostream>
#include "common/enforce.h"
#include "fpga/V2/driver/bitmap.h"
#include "fpga/V2/driver/driver.h"
namespace paddle_mobile {
namespace fpga {
struct FPGA_INFO g_fpgainfo;
int open_drvdevice() {
if (g_fpgainfo.fd_drv == -1) {
g_fpgainfo.fd_drv = open(g_fpgainfo.drvdevice_path, O_RDWR);
}
return g_fpgainfo.fd_drv;
}
int open_memdevice() {
if (g_fpgainfo.fd_mem == -1) {
g_fpgainfo.fd_mem = open(g_fpgainfo.memdevice_path, O_RDWR | O_DSYNC);
}
return g_fpgainfo.fd_mem;
}
void pl_reset() {
// DLOG << "PL RESET";
// reg_writeq(0x5a, REG_FPGA_RESET);
usleep(100 * 1000);
}
void setup_pe(struct pe_data_s *pe_data, struct fpga_pe *pe,
char const *type_name, int pe_idx) {
memset(pe, 0, sizeof(struct fpga_pe));
pe->outer = pe_data;
snprintf(pe->type_name, MAX_TYPE_NAME_LENTH, "%s", type_name);
pe->status = IDLE;
pe->interrupt_cnt = 0;
pe_data->pes[pe_idx] = pe;
pe_data->pe_num++;
}
void pl_init() {
struct pe_data_s *pe_data = nullptr;
pl_reset();
pe_data = (struct pe_data_s *)malloc(sizeof(struct pe_data_s));
if (pe_data == nullptr) {
DLOG << "pe_data malloc error!";
return;
}
memset(pe_data, 0, sizeof(struct pe_data_s));
pthread_mutex_init(&pe_data->mutex, 0);
setup_pe(pe_data, &pe_data->pe_conv, "CONV", PE_IDX_CONV);
setup_pe(pe_data, &pe_data->pe_pooling, "POOLING", PE_IDX_POOLING);
setup_pe(pe_data, &pe_data->pe_ew, "EW", PE_IDX_EW);
setup_pe(pe_data, &pe_data->pe_bypass, "BYPASS", PE_IDX_BYPASS);
g_fpgainfo.pe_data = pe_data;
}
void pl_destroy() {
struct pe_data_s *pe_data = g_fpgainfo.pe_data;
pthread_mutex_destroy(&pe_data->mutex);
free(pe_data);
}
void pl_start() {
struct pe_data_s *pe_data = g_fpgainfo.pe_data;
pthread_mutex_unlock(&pe_data->mutex);
}
void pl_stop() {
struct pe_data_s *pe_data = g_fpgainfo.pe_data;
pthread_mutex_lock(&pe_data->mutex);
}
void pl_reinit() {
struct pe_data_s *pe_data = g_fpgainfo.pe_data;
struct fpga_pe *pe = nullptr;
int i = 0;
pl_stop();
pl_reset();
pl_start();
for (i = 0; i < pe_data->pe_num; i++) {
pe = pe_data->pes[i];
pe->status = IDLE;
pe->interrupt_cnt = 0;
}
pl_start();
}
int pl_get_status() { return 0; }
/* time is in microseconds */
int fpga_regpoll(uint64_t reg, uint64_t val, int time) {
uint64_t i = 0;
/* accuracy of the timeout is to be confirmed */
int64_t timeout = time * CPU_FREQ / 1000000;
for (i = 0; i < timeout; i++) {
if (val == reg_readq(reg)) {
break;
}
}
if (i < timeout) {  // i == timeout means the poll timed out
return 0;
} else {
return -1;
}
}
/* memory management */
int memory_request(struct fpga_memory *memory, size_t size, uint64_t *addr) {
uint64_t _nr = DIV_ROUND_UP(size, FPGA_PAGE_SIZE);
unsigned int nr = (unsigned int)_nr;
int ret = 0;
pthread_mutex_lock(&memory->mutex);
unsigned int pos = (unsigned int)fpga_bitmap::bitmap_find_next_zero_area(
memory->bitmap, memory->page_num, 0, nr, 0);
if (pos <= memory->page_num) {
uint64_t address_ofset =
memory->mem_start + ((uint64_t)pos) * FPGA_PAGE_SIZE;
fpga_bitmap::bitmap_set(memory->bitmap, pos, nr);
memory->nr[pos] = nr;
*addr = address_ofset;
} else {
ret = -ENOMEM;
}
pthread_mutex_unlock(&memory->mutex);
return ret;
}
void memory_release(struct fpga_memory *memory) {
pthread_mutex_lock(&memory->mutex);
fpga_bitmap::bitmap_clear(memory->bitmap, 0, memory->page_num);
pthread_mutex_unlock(&memory->mutex);
}
int create_fpga_memory_inner(struct fpga_memory *memory, size_t memory_size) {
int rc = 0;
uint64_t *bitmap = nullptr;
unsigned int *nr = nullptr;
// Only a single memory instance may be created, so building the memory struct
// needs no mutual exclusion.
// pthread_mutex_lock(&memory->mutex);
memory->page_num = (unsigned int)(memory_size / FPGA_PAGE_SIZE);
memory->page_num_long = DIV_ROUND_UP(memory->page_num, BITS_PER_LONG);
bitmap =
(uint64_t *)malloc(sizeof(int64_t) * memory->page_num_long); // NOLINT
if (!bitmap) {
rc = -EFAULT;
return rc;
}
memory->bitmap = bitmap;
nr = (unsigned int *)calloc(memory->page_num, sizeof(unsigned int));
if (!nr) {
rc = -EFAULT;
free(bitmap);
return rc;
}
memory->nr = nr;
memory->mem_start = FPGA_MEM_PHY_ADDR;
memory->mem_end = FPGA_MEM_SIZE;
// pthread_mutex_unlock(memory->mutex);
return rc;
}
int create_fpga_memory(struct fpga_memory **memory_info) {
int rc = 0;
*memory_info = (struct fpga_memory *)malloc(sizeof(struct fpga_memory));
if (*memory_info == NULL) {
rc = -EFAULT;
return rc;
}
pthread_mutex_init(&((*memory_info)->mutex), nullptr);
rc = create_fpga_memory_inner(*memory_info, FPGA_MEM_SIZE);
if (rc) {
free(*memory_info);
}
return rc;
}
int init_fpga_memory(struct fpga_memory *memory) {
int rc = 0;
if (!memory) {
rc = -EFAULT;
return rc;
}
// spin_lock_init(&memory->spin);
fpga_bitmap::bitmap_clear(memory->bitmap, 0, memory->page_num);
fpga_bitmap::bitmap_set(memory->bitmap, 0, 1); // NOTE reserve fpga page 0.
return 0;
}
void destroy_fpga_memory(struct fpga_memory *memory) {
if (memory) {
free(memory->nr);
free(memory->bitmap);
free(memory);
}
}
int fpga_memory_add() {
int rc = 0;
rc = create_fpga_memory(&g_fpgainfo.memory_info);
if (rc) {
return rc;
}
rc = init_fpga_memory(g_fpgainfo.memory_info);
if (rc) {
destroy_fpga_memory(g_fpgainfo.memory_info);
return rc;
}
return 0;
}
uint64_t vaddr_to_paddr(void *address) {
uint64_t paddr = 0;
auto iter = g_fpgainfo.fpga_vaddr2paddr_map.find(address);
if (iter != g_fpgainfo.fpga_vaddr2paddr_map.end()) {
paddr = iter->second;
} else {
DLOG << "Invalid pointer";
}
return paddr;
}
void *fpga_reg_malloc(size_t size) {
void *ret = nullptr;
ret = mmap64(nullptr, size, PROT_READ | PROT_WRITE, MAP_SHARED,
g_fpgainfo.fd_drv, FPGA_REG_PHY_ADDR);
// PADDLE_MOBILE_ENFORCE(ret != (void *)-1, "Should not be -1");
g_fpgainfo.fpga_addr2size_map.insert(std::make_pair(ret, size));
return ret;
}
void *fpga_malloc_driver(size_t size) {
void *ret = nullptr;
uint64_t phy_addr = 0;
memory_request(g_fpgainfo.memory_info, size, &phy_addr);
ret = mmap64(nullptr, size, PROT_READ | PROT_WRITE, MAP_SHARED,
g_fpgainfo.fd_mem, phy_addr);
PADDLE_MOBILE_ENFORCE(ret != (void *)-1, "Should not be -1");
g_fpgainfo.fpga_vaddr2paddr_map.insert(std::make_pair(ret, phy_addr));
g_fpgainfo.fpga_addr2size_map.insert(std::make_pair(ret, size));
return ret;
}
void fpga_free_driver(void *ptr) {
size_t size = 0;
auto iter = g_fpgainfo.fpga_addr2size_map.find(ptr);
if (iter != g_fpgainfo.fpga_addr2size_map.end()) {
size = iter->second;
g_fpgainfo.fpga_addr2size_map.erase(iter);
munmap(ptr, size);
} else {
DLOG << "Invalid pointer";
}
}
int open_device_driver() {
g_fpgainfo.FpgaRegPhyAddr = FPGA_REG_PHY_ADDR;
g_fpgainfo.FpgaMemPhyAddr = FPGA_MEM_PHY_ADDR;
g_fpgainfo.FpgaRegVirAddr = nullptr;
g_fpgainfo.pe_data = nullptr;
g_fpgainfo.drvdevice_path = "/dev/fpgadrv0";
g_fpgainfo.memdevice_path = "/dev/fpgamem0";
g_fpgainfo.fd_drv = -1;
g_fpgainfo.fd_mem = -1;
int ret = 0;
ret = open_drvdevice();
ret |= open_memdevice();
g_fpgainfo.FpgaRegVirAddr =
(uint64_t *)fpga_reg_malloc(FPGA_REG_SIZE); // NOLINT
fpga_memory_add();
pl_init();
return ret;
}
int close_device_driver() {
pl_destroy();
fpga_free_driver(g_fpgainfo.FpgaRegVirAddr);
memory_release(g_fpgainfo.memory_info);
destroy_fpga_memory(g_fpgainfo.memory_info);
return 0;
}
} // namespace fpga
} // namespace paddle_mobile
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <ctype.h>
#include <stdio.h>
#include <stdlib.h>
#include <cstring>
#include <map>
#include "common/log.h"
namespace paddle_mobile {
namespace fpga {
#define DIV_ROUND_UP(n, d) (((n) + (d)-1) / (d))
#define FPGA_REG_PHY_ADDR 0xa0000000
#define FPGA_REG_SIZE 0x1000
#define FPGA_MEM_PHY_ADDR 0x20000000
#define FPGA_MEM_SIZE 0x20000000
#define CPU_FREQ 1000000000
#define FPGA_PAGE_SIZE (16UL * 1024UL)
// PE related macros
const int MAX_NUM_PES = 6;
const size_t MAX_TYPE_NAME_LENTH = 8;
const int PE_IDX_CONV = 0;
const int PE_IDX_POOLING = 1;
const int PE_IDX_EW = 2;
const int PE_IDX_BYPASS = 3;
enum pe_status { IDLE = 0, BUSY = 1 };
struct fpga_pe {
char type_name[MAX_TYPE_NAME_LENTH + 1];
struct pe_data_s *outer;
pe_status status; // 0=idle 1=busy -1=fail
uint64_t interrupt_cnt;
};
struct pe_data_s {
pthread_mutex_t mutex;
struct fpga_pe pe_conv;
struct fpga_pe pe_pooling;
struct fpga_pe pe_ew;
struct fpga_pe pe_bypass;
struct fpga_pe *pes[MAX_NUM_PES];
int pe_num;
};
struct fpga_memory {
pthread_mutex_t mutex;
uint64_t *bitmap;
unsigned int *nr;
unsigned int page_num;
unsigned int page_num_long;
uint64_t mem_start;
uint64_t mem_end;
};
struct FPGA_INFO {
uint64_t FpgaRegPhyAddr;
uint64_t FpgaMemPhyAddr;
pthread_t poll_pid;
void *FpgaRegVirAddr;
struct pe_data_s *pe_data;
std::map<void *, size_t> fpga_addr2size_map;
std::map<void *, uint64_t> fpga_vaddr2paddr_map;
const char *drvdevice_path;
const char *memdevice_path;
struct fpga_memory *memory_info;
int fd_drv;
int fd_mem;
};
extern struct FPGA_INFO g_fpgainfo;
inline uint64_t reg_readq(uint32_t offset) {
// DLOG << "offset : " << offset;
uint64_t value =
*(uint64_t *)((uint8_t *)g_fpgainfo.FpgaRegVirAddr + offset); // NOLINT
return value;
}
inline void reg_writeq(uint64_t value, uint32_t offset) {
// DLOG << "offset : " << offset << ", value : " << value;
*(uint64_t *)((uint8_t *)g_fpgainfo.FpgaRegVirAddr + offset) = // NOLINT
value;
}
int open_device_driver();
int close_device_driver();
void *fpga_malloc_driver(size_t size);
void fpga_free_driver(void *ptr);
/*pe*/
uint64_t vaddr_to_paddr(void *address);
int fpga_regpoll(uint64_t reg, uint64_t val, int time);
} // namespace fpga
} // namespace paddle_mobile
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "fpga/V2/driver/pe.h"
#include "fpga/V2/config.h"
#include "fpga/V2/driver/driver.h"
#include "fpga/V2/filter.h"
#include "fpga/V2/image.h"
namespace paddle_mobile {
namespace fpga {
#define MUL8(x) ((x)*8)
#define BYPASS_DONE 1
float Findfp16Max() {
uint16_t abs_vals[16];
uint64_t max_fp16;
max_fp16 = reg_readq(MUL8(49));
abs_vals[0] = (uint16_t)(0x0000007f & (max_fp16)); // NOLINT
abs_vals[1] = (uint16_t)(0x0000007f & (max_fp16 >> 16)); // NOLINT
abs_vals[2] = (uint16_t)(0x0000007f & (max_fp16 >> 32)); // NOLINT
abs_vals[3] = (uint16_t)(0x0000007f & (max_fp16 >> 48)); // NOLINT
max_fp16 = reg_readq(MUL8(50));
abs_vals[4] = (uint16_t)(0x0000007f & (max_fp16)); // NOLINT
abs_vals[5] = (uint16_t)(0x0000007f & (max_fp16 >> 16)); // NOLINT
abs_vals[6] = (uint16_t)(0x0000007f & (max_fp16 >> 32)); // NOLINT
abs_vals[7] = (uint16_t)(0x0000007f & (max_fp16 >> 48)); // NOLINT
max_fp16 = reg_readq(MUL8(51));
abs_vals[8] = (uint16_t)(0x0000007f & (max_fp16)); // NOLINT
abs_vals[9] = (uint16_t)(0x0000007f & (max_fp16 >> 16)); // NOLINT
abs_vals[10] = (uint16_t)(0x0000007f & (max_fp16 >> 32)); // NOLINT
abs_vals[11] = (uint16_t)(0x0000007f & (max_fp16 >> 48)); // NOLINT
max_fp16 = reg_readq(MUL8(52));
abs_vals[12] = (uint16_t)(0x0000007f & (max_fp16));
abs_vals[13] = (uint16_t)(0x0000007f & (max_fp16 >> 16)); // NOLINT
abs_vals[14] = (uint16_t)(0x0000007f & (max_fp16 >> 32)); // NOLINT
abs_vals[15] = (uint16_t)(0x0000007f & (max_fp16 >> 48)); // NOLINT
uint16_t tmp = 0;
for (int i = 0; i < 16; i++) {
if (tmp < abs_vals[i]) {
tmp = abs_vals[i];
}
}
return fp16_2_fp32(tmp) / 127.0f;
}
int ComputeFpgaConv(const struct SplitConvArgs &args) {
return ComputeBasicConv(args.conv_args[0]);
}
int ComputeBasicConv(const struct ConvArgs &args) {
#ifdef FPGA_PRINT_MODE
DLOG << "======Compute Basic Conv======";
DLOG << " relu_enabled:" << args.relu_enabled
<< " sb_address:" << args.sb_address
<< " filter_address:" << args.filter_address
<< " filter_num:" << args.filter_num
<< " group_num:" << args.group_num;
DLOG << " image_address:" << args.image.address
<< " image_scale_address:" << args.image.scale_address
<< " image_channels:" << args.image.channels
<< " image_height:" << args.image.height
<< " image_width:" << args.image.width
<< " pad_height:" << args.image.pad_height
<< " pad_width:" << args.image.pad_width;
DLOG << " kernel_height:" << args.kernel.height
<< " kernel_width:" << args.kernel.width
<< " stride_h:" << args.kernel.stride_h
<< " stride_w:" << args.kernel.stride_w;
DLOG << " out_address:" << args.output.address
<< " out_scale_address:" << args.output.scale_address;
#endif
#ifndef PADDLE_MOBILE_ZU5
return 0;
#endif
return 0;
}
int ComputeFpgaPool(const struct PoolingArgs &args) {
#ifdef FPGA_PRINT_MODE
DLOG << "=============ComputeFpgaPool===========";
DLOG << " mode:" << args.mode
<< " kernel_reciprocal:" << fp16_2_fp32(args.kernel_reciprocal);
DLOG << " image_address:" << args.image.address
<< " image_scale_address:" << args.image.scale_address
<< " image_channels:" << args.image.channels
<< " image_height:" << args.image.height
<< " image_width:" << args.image.width
<< " pad_height:" << args.image.pad_height
<< " pad_width:" << args.image.pad_width;
DLOG << " kernel_height:" << args.kernel.height
<< " kernel_width:" << args.kernel.width
<< " stride_h:" << args.kernel.stride_h
<< " stride_w:" << args.kernel.stride_w;
DLOG << " out_address:" << args.output.address
<< " out_scale_address:" << args.output.scale_address;
#endif
#ifndef PADDLE_MOBILE_ZU5
return 0;
#endif
return 0;
}
int ComputeFpgaEWAdd(const struct EWAddArgs &args) {
#ifdef FPGA_PRINT_MODE
DLOG << "=============ComputeFpgaEWAdd===========";
DLOG << " relu_enabled:" << args.relu_enabled
<< " const0:" << fp16_2_fp32(int16_t(args.const0))
<< " const1:" << fp16_2_fp32(int16_t(args.const1));
DLOG << " image0_address:" << args.image0.address
<< " image0_scale_address:" << args.image0.scale_address
<< " image0_channels:" << args.image0.channels
<< " image0_height:" << args.image0.height
<< " image0_width:" << args.image0.width
<< " pad0_height:" << args.image0.pad_height
<< " pad0_width:" << args.image0.pad_width;
DLOG << " image1_address:" << args.image1.address
<< " image1_scale_address:" << args.image1.scale_address
<< " image1_channels:" << args.image1.channels
<< " image1_height:" << args.image1.height
<< " image1_width:" << args.image1.width
<< " pad1_height:" << args.image1.pad_height
<< " pad_width:" << args.image1.pad_width;
DLOG << " out_address:" << args.output.address
<< " out_scale_address:" << args.output.scale_address;
#endif
#ifndef PADDLE_MOBILE_ZU5
return 0;
#endif
return 0;
}
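// PerformBypass streams a feature map through the FPGA bypass engine to convert
// precision. The high byte of bp_enable selects the mode (0x88 fp32->fp16,
// 0x8a fp16->fp32, 0x89 fp16->fp16 find-max) and the low bits carry the
// transfer length in bytes.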
int PerformBypass(const struct BypassArgs &args) {
#ifdef FPGA_PRINT_MODE
DLOG << "=============ComputeFpgaBypass===========";
DLOG << " input_type:" << args.input_data_type
<< " output_type:" << args.output_data_type
<< " input_layout_type:" << args.input_layout_type
<< " output_layout_type:" << args.output_layout_type;
DLOG << " image_address:" << args.image.address
<< " image_scale_address:" << args.image.scale_address
<< " image_channels:" << args.image.channels
<< " image_height:" << args.image.height
<< " image_width:" << args.image.width
<< " pad_height:" << args.image.pad_height
<< " pad_width:" << args.image.pad_width;
DLOG << " out_address:" << args.output.address
<< " out_scale_address:" << args.output.scale_address;
#endif
#ifndef PADDLE_MOBILE_ZU5
return 0;
#endif
uint64_t ifm_src_paddr = vaddr_to_paddr(args.image.address);
uint64_t ifm_dst_paddr = vaddr_to_paddr(args.output.address);
uint64_t bp_enable;
int64_t length;
uint64_t pixels;
// fp32->fp16
if ((args.input_data_type) && (!args.output_data_type)) {
pixels = (args.image.channels) * (args.image.width) * (args.image.height);
length = pixels * sizeof(float);
bp_enable = 0x8800000000000000 + length;
}
// fp16->fp32
else if ((!args.input_data_type) && (args.output_data_type)) {
pixels = filter::calc_aligned_channel((args.image.channels)) *
(args.image.width) * (args.image.height);
length = pixels * sizeof(short);
length = align_to_x((int)length, 64); // NOLINT
bp_enable = 0x8a00000000000000 + length;
}
// fp16->fp16 findmax
else if ((!args.input_data_type) && (!args.output_data_type)) {
pixels = (args.image.channels) * (args.image.width) * (args.image.height);
length = pixels * sizeof(short);
bp_enable = 0x8900000000000000 + length;
} else {
return -1;
}
// start bypass
reg_writeq(ifm_src_paddr, MUL8(27));
reg_writeq(ifm_dst_paddr, MUL8(28));
reg_writeq(0, MUL8(0));
reg_writeq(bp_enable, MUL8(0));
// poll
int ret = -1;
ret = fpga_regpoll(MUL8(48), BYPASS_DONE, 0xffffffff);
if (ret != -1) {
// clear "irq"
reg_readq(MUL8(63));
}
// get max value
if ((!args.input_data_type) && (!args.output_data_type)) {
float scale = Findfp16Max();
args.output.scale_address[0] = (float)(1.0 / scale); // NOLINT
args.output.scale_address[1] = scale;
}
return ret;
}
int ComputeFPGAConcat(const struct ConcatArgs &args) {
#ifdef FPGA_PRINT_MODE
DLOG << "=============ComputeFpgaConcat===========";
DLOG << " Image_num: " << args.image_num
<< " out_address:" << args.image_out
<< " out_scale_address:" << args.scale_out
<< " out_channel:" << args.out_channel;
DLOG << " image_height:" << args.height << " image_width:" << args.width;
for (int i = 0; i < args.image_num; i++) {
DLOG << " " << i << "th: ";
DLOG << " channel_num:" << args.channel_num[i]
<< " aligned_channel_num:" << args.aligned_channel_num[i]
<< " image_address:" << args.images_in[i]
<< " image_scale_address:" << args.scales_in[i];
}
#endif
image::concat_images(args.images_in, args.scales_in, args.image_out,
args.scale_out, args.image_num, args.channel_num,
args.height, args.width, args.aligned_channel_num,
args.out_channel);
return 0;
}
} // namespace fpga
} // namespace paddle_mobile
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include "fpga/V2/api.h"
namespace paddle_mobile {
namespace fpga {
int PerformBypass(const struct BypassArgs& args);
int ComputeBasicConv(const struct ConvArgs& args);
int ComputeFpgaPool(const struct PoolingArgs& args);
int ComputeFpgaEWAdd(const struct EWAddArgs& args);
int ComputeFpgaConv(const struct SplitConvArgs& args);
int ComputeFPGAConcat(const struct ConcatArgs& args);
} // namespace fpga
} // namespace paddle_mobile
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "fpga/V2/filter.h"
#include <memory.h>
#include <algorithm>
#include "fpga/V2/api.h"
namespace paddle_mobile {
namespace fpga {
namespace filter {
int calc_channel_parallelism(int channel) {
if (channel <= 16) {
return 16;
} else if (channel <= 32) {
return 32;
} else if (channel <= 64) {
return 64;
} else {
return 128;
}
}
int calc_aligned_channel(int channel) {
return align_to_x(channel, calc_channel_parallelism(channel));
}
int calc_num_parallelism(int channel) {
return FILTER_PARALLELISM / calc_channel_parallelism(channel);
}
int calc_aligned_num(int num, int channel) {
return align_to_x(num, calc_num_parallelism(channel));
}
int calc_aligned_total_pixel_num(int num, int channel, int height, int width) {
int aligned_channel = calc_aligned_channel(channel);
int aligned_filter_num = calc_aligned_num(num, channel);
return aligned_filter_num * aligned_channel * height * width;
}
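// Worked example of the alignment math above (editor's sketch): for
// channel = 20, calc_channel_parallelism returns 32, so
// calc_aligned_channel(20) = align_to_x(20, 32) = 32. With
// FILTER_PARALLELISM = 1024, calc_num_parallelism(20) = 1024 / 32 = 32, so
// five 20x3x3 filters align to calc_aligned_num(5, 20) = 32 filters and
// calc_aligned_total_pixel_num(5, 20, 3, 3) = 32 * 32 * 3 * 3 = 9216 padded
// elements.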
void convert_to_hwc(float **data_in, int num, int channel, int height,
int width) {
float *tmp = *data_in;
int chw = channel * height * width;
float *data_tmp = (float *)fpga_malloc(chw * num * sizeof(float)); // NOLINT
for (int n = 0; n < num; n++) {
int64_t amount_per_row = width * channel;
for (int c = 0; c < channel; c++) {
for (int h = 0; h < height; h++) {
int64_t offset_height = h * amount_per_row;
for (int w = 0; w < width; w++) {
*(data_tmp + n * chw + offset_height + w * channel + c) =
*((*data_in)++);
}
}
}
}
*data_in = data_tmp;
fpga_free(tmp);
}
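// Index mapping used above: the NCHW element at (n, c, h, w) lands at
// n * C * H * W + h * W * C + w * C + c in the NHWC output, i.e. channels
// become the fastest-varying dimension.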
void align_filter(float **data_in, int num, int channel, int height,
int width) {
int aligned_channel = calc_channel_parallelism(channel);
int hw = height * width;
int pixel_num = calc_aligned_total_pixel_num(num, channel, height, width);
float *new_data = (float *)fpga_malloc(pixel_num * sizeof(float)); // NOLINT
float *temp = *data_in;
memset(new_data, 0, pixel_num * sizeof(float));
for (int i = 0; i < num; i++) {
for (int j = 0; j < hw; j++) {
memcpy(new_data + i * aligned_channel * hw + j * aligned_channel,
temp + i * channel * hw + j * channel, channel * sizeof(float));
}
}
*data_in = new_data;
fpga_free(temp);
}
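// Padding example (editor's sketch): for channel = 3 the aligned channel
// count is 16, so each of the num * height * width pixels copies its 3
// floats and leaves the remaining 13 slots at the zero value written by
// memset.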
void format_filter(float **data_in, int num, int channel, int height, int width,
                   int group_num, float max) {
  convert_to_hwc(data_in, num, channel, height, width);
  align_filter(data_in, num, channel, height, width);
  int pixel_num = calc_aligned_total_pixel_num(num, channel, height, width);
  // make the rearranged filter visible to the device (fpga_flush is assumed
  // to be declared via fpga/V2/api.h, as the other fpga_* helpers are)
  fpga_flush(*data_in, pixel_num * sizeof(float));
}
void convert_fc_filter(float **data_in, int num, int chw) {
float *tmp = *data_in;
float *data_tmp = (float *)fpga_malloc(chw * num * sizeof(float)); // NOLINT
for (int n = 0; n < num; n++) {
for (int c = 0; c < chw; c++) {
data_tmp[n * chw + c] = (*data_in)[num * c + n];
}
}
*data_in = data_tmp;
fpga_free(tmp);
}
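// convert_fc_filter transposes the fully connected weights from a
// (chw, num) layout to (num, chw): with num = 2 and chw = 3, the weight for
// filter 0 at position 2 moves from source index num * 2 + 0 = 4 to
// destination index 0 * 3 + 2 = 2.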
void format_fc_filter(float **data_in, int num, int channel, int height,
int width, int group_num, float max) {
int chw = channel * height * width;
convert_fc_filter(data_in, num, chw);
align_filter(data_in, num, channel, height, width);
}
float find_max(float *data_in, int data_size) {
float max = 0.0;
for (int i = 0; i < data_size; ++i) {
float value = data_in[i];
float abs = value > 0 ? value : -value;
max = std::max(max, abs);
}
return max;
}
signed char float_to_int8(float fdata) {
if (fdata < 0.0) {
fdata -= 0.5;
} else {
fdata += 0.5;
}
return (signed char)fdata;
}
void quantize(float **data_in, int data_size, float max) {
float *tmp = *data_in;
float fix_range = 127;
float scale = fix_range / max;
signed char *tmp_data = (signed char *)fpga_malloc(data_size * sizeof(char));
for (int i = 0; i < data_size; i++) {
tmp_data[i] = float_to_int8(
(*data_in)[i] * scale); // (signed char)((*data_in)[i] * scale);
}
*data_in = (float *)tmp_data; // NOLINT
fpga_free(tmp);
}
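// Worked example (editor's sketch): with max = 6.35 the scale is
// 127 / 6.35 = 20, so a weight of 0.5 becomes float_to_int8(0.5 * 20) == 10
// and max itself maps to the full-range value 127. The quantized int8 buffer
// is handed back to the caller through the float pointer.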
} // namespace filter
} // namespace fpga
} // namespace paddle_mobile
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#define FILTER_PARALLELISM 1024
namespace paddle_mobile {
namespace fpga {
namespace filter {
int calc_channel_parallelism(int channel);
int calc_aligned_channel(int channel);
int calc_num_parallelism(int channel);
int calc_aligned_num(int num, int channel);
int calc_aligned_total_pixel_num(int num, int channel, int height, int width);
void convert_to_hwc(float** data_in, int num, int channel, int height,
int width);
void format_filter(float** data_in, int num, int channel, int height, int width,
int group_num, float max);
void convert_fc_filter(float** data_in, int num, int chw);
void format_fc_filter(float** data_in, int num, int channel, int height,
int width, int group_num, float max);
float find_max(float* data_in, int data_size);
} // namespace filter
} // namespace fpga
} // namespace paddle_mobile
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "fpga/V2/image.h"
#include <memory.h>
#include <algorithm>
#include "fpga/V2/api.h"
namespace paddle_mobile {
namespace fpga {
namespace image {
void convert_to_hwc(float **data_in, int channel, int height, int width) {
float *tmp = *data_in;
float *data_tmp =
(float *)fpga_malloc(channel * height * width * sizeof(float)); // NOLINT
int64_t amount_per_row = width * channel;
for (int c = 0; c < channel; c++) {
for (int h = 0; h < height; h++) {
int64_t offset_height = h * amount_per_row;
for (int w = 0; w < width; w++) {
*(data_tmp + offset_height + w * channel + c) = *((*data_in)++);
}
}
}
*data_in = data_tmp;
fpga_free(tmp);
}
void align_image(float **data_in, int channel, int height, int width,
int aligned_channel) {
if (channel == aligned_channel) return;
float *tmp = *data_in;
float *new_data =
(float *)fpga_malloc(aligned_channel * height * width * // NOLINT
sizeof(float)); // NOLINT
memset(new_data, 0, aligned_channel * height * width * sizeof(float));
for (int i = 0; i < height * width; i++) {
memcpy(new_data + i * aligned_channel, tmp + i * channel,
channel * sizeof(float));
}
*data_in = new_data;
fpga_free(tmp);
}
void format_image(float **data_in, int channel, int height, int width,
int aligned_channel) {
convert_to_hwc(data_in, channel, height, width);
align_image(data_in, channel, height, width, aligned_channel);
}
void concat_images(int16_t **images_in, float **scales_in, void *image_out,
float *scale_out, int image_num, const uint32_t *channel_num,
int height, int width, const uint32_t *aligned_channel_num,
int out_channel) {
int hw = height * width;
scale_out[0] = 0.0;
scale_out[1] = 0.0;
for (int i = 0; i < image_num; i++) {
    scale_out[0] = std::max(scale_out[0], scales_in[i][0]);
}
scale_out[1] = 1 / scale_out[0];
for (int j = 0; j < hw; j++) {
int tmp_channel_sum = 0;
for (int i = 0; i < image_num; i++) {
memcpy(
(int16_t *)image_out + j * out_channel + tmp_channel_sum, // NOLINT
images_in[i] + j * aligned_channel_num[i],
channel_num[i] * sizeof(int16_t));
tmp_channel_sum += channel_num[i];
}
}
}
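// Layout example (editor's sketch): for two inputs with channel_num {2, 3}
// and out_channel = 5, every output pixel j holds
// [in0_c0, in0_c1, in1_c0, in1_c1, in1_c2]. The per-image
// aligned_channel_num strides only affect where each source pixel is read
// from; the output is packed densely.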
} // namespace image
} // namespace fpga
} // namespace paddle_mobile
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <stdint.h>
namespace paddle_mobile {
namespace fpga {
namespace image {
void convert_to_hwc(float **data_in, int channel, int height, int width);
void align_image(float **data_in, int channel, int height, int width,
int aligned_channel);
void format_image(float **data_in, int channel, int height, int width,
int aligned_channel);
void concat_images(
int16_t **images_in, float **scales_in, void *image_out, float *scale_out,
int image_num, const uint32_t *channel_num, int height, int width,
const uint32_t *aligned_channel_num,
int out_channel); // Concat featuremaps along channel direction
} // namespace image
} // namespace fpga
} // namespace paddle_mobile
......@@ -117,9 +117,9 @@ class Attribute {
template <typename Vistor>
static typename Vistor::type_t ApplyVistor(Vistor vistor, Attribute attr) {
if (attr.variant_.TypeId() == typeid(int).hash_code()) {
if (attr.variant_.TypeId() == typeid(int).hash_code()) { // NOLINT
return vistor(attr.variant_.Get<int>());
} else if (attr.variant_.TypeId() == typeid(float).hash_code()) {
} else if (attr.variant_.TypeId() == typeid(float).hash_code()) { // NOLINT
return vistor(attr.variant_.Get<float>());
} else if (attr.variant_.TypeId() == typeid(string).hash_code()) {
return vistor(attr.variant_.GetString());
......@@ -129,7 +129,7 @@ class Attribute {
return vistor(attr.variant_.Get<vector<float>>());
} else if (attr.variant_.TypeId() == typeid(vector<string>).hash_code()) {
return vistor(attr.variant_.Get<vector<string>>());
} else if (attr.variant_.TypeId() == typeid(bool).hash_code()) {
} else if (attr.variant_.TypeId() == typeid(bool).hash_code()) { // NOLINT
return vistor(attr.variant_.Get<bool>());
} else if (attr.variant_.TypeId() == typeid(vector<bool>).hash_code()) {
return vistor(attr.variant_.Get<vector<bool>>());
......@@ -137,7 +137,6 @@ class Attribute {
return vistor(attr.variant_.Get<int64_t>());
} else {
PADDLE_MOBILE_THROW_EXCEPTION("type not support");
exit(0);
}
}
......
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include "CL/cl.h"
struct CLKernelDeleter {
template <class T>
void operator()(T *clKernelObj) {
clReleaseKernel(clKernelObj);
}
};
struct CLMemDeleter {
template <class T>
void operator()(T *clMemObj) {
clReleaseMemObject(clMemObj);
}
};
struct CLEventDeleter {
template <class T>
void operator()(T *clEventObj) {
clReleaseEvent(clEventObj);
}
};
struct CLCommQueueDeleter {
template <class T>
void operator()(T *clQueueObj) {
clReleaseCommandQueue(clQueueObj);
}
};
struct CLContextDeleter {
template <class T>
void operator()(T *clContextObj) {
clReleaseContext(clContextObj);
}
};
struct CLProgramDeleter {
template <class T>
void operator()(T *clProgramObj) {
clReleaseProgram(clProgramObj);
}
};
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "framework/cl/cl_engine.h"
#include "CL/cl.h"
#include "framework/cl/cl_tool.h"
#include <cstdlib>
#include <cstring>
namespace paddle_mobile {
namespace framework {
bool CLEngine::Init() {
if (initialized_) {
return true;
}
SetPlatform();
SetClDeviceId();
initialized_ = true;
return initialized_;
// setClCommandQueue();
// std::string filename = "./HelloWorld_Kernel.cl";
// loadKernelFromFile(filename.c_str());
// buildProgram();
}
CLEngine *CLEngine::Instance() {
static CLEngine cl_engine_;
cl_engine_.Init();
return &cl_engine_;
}
bool CLEngine::SetPlatform() {
  platform_ = NULL;      // the chosen platform
  cl_uint numPlatforms;  // the number of available platforms
cl_int status = clGetPlatformIDs(0, NULL, &numPlatforms);
/**For clarity, choose the first available platform. */
if (numPlatforms > 0) {
cl_platform_id *platforms = reinterpret_cast<cl_platform_id *>(
malloc(numPlatforms * sizeof(cl_platform_id)));
status = clGetPlatformIDs(numPlatforms, platforms, NULL);
platform_ = platforms[0];
free(platforms);
return true;
} else {
return false;
}
}
bool CLEngine::SetClDeviceId() {
cl_uint numDevices = 0;
devices_ = NULL;
cl_int status =
clGetDeviceIDs(platform_, CL_DEVICE_TYPE_GPU, 0, NULL, &numDevices);
if (numDevices > 0) {
devices_ = reinterpret_cast<cl_device_id *>(
malloc(numDevices * sizeof(cl_device_id)));
status = clGetDeviceIDs(platform_, CL_DEVICE_TYPE_GPU, numDevices, devices_,
NULL);
return true;
}
return false;
}
// std::unique_ptr<_cl_kernel, clKernel_deleter> CLEngine::GSetKernel(
// const std::string &kernel_name) {
// std::unique_ptr<_cl_kernel, clKernel_deleter> kernel(
// clCreateKernel(program_.get(), kernel_name.c_str(), NULL));
// return std::move(kernel);
//}
//
// bool CLEngine::SetClCommandQueue() {
// cl_int status;
// command_queue_.reset(
// clCreateCommandQueue(context_.get(), devices_[0], 0, &status));
// return true;
//}
// bool CLEngine::SetClContext() {
// context_.reset(clCreateContext(NULL, 1, devices_, NULL, NULL, NULL));
// return true;
//}
// bool CLEngine::LoadKernelFromFile(const char *kernel_file) {
// size_t size;
// char *str;
// std::fstream f(kernel_file, (std::fstream::in | std::fstream::binary));
//
// if (!f.is_open()) {
// return false;
// }
//
// size_t fileSize;
// f.seekg(0, std::fstream::end);
// size = fileSize = (size_t)f.tellg();
// f.seekg(0, std::fstream::beg);
// str = new char[size + 1];
// if (!str) {
// f.close();
// return 0;
// }
//
// f.read(str, fileSize);
// f.close();
// str[size] = '\0';
// const char *source = str;
// size_t sourceSize[] = {strlen(source)};
// program_.reset(
// clCreateProgramWithSource(context_.get(), 1, &source, sourceSize,
// NULL));
// return true;
//}
} // namespace framework
} // namespace paddle_mobile
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <memory>
#include <string>
#include "CL/cl.h"
#include "common/enforce.h"
#include "common/log.h"
#include "framework/cl/cl_deleter.h"
#include "framework/cl/cl_tool.h"
namespace paddle_mobile {
namespace framework {
class CLEngine {
public:
static CLEngine *Instance();
bool Init();
std::unique_ptr<_cl_context, CLContextDeleter> CreateContext() {
cl_int status;
cl_context c = clCreateContext(NULL, 1, devices_, NULL, NULL, &status);
std::unique_ptr<_cl_context, CLContextDeleter> context_ptr(c);
CL_CHECK_ERRORS(status);
return std::move(context_ptr);
}
std::unique_ptr<_cl_command_queue, CLCommQueueDeleter> CreateClCommandQueue(
cl_context context) {
cl_int status;
cl_command_queue queue =
clCreateCommandQueue(context, devices_[0], 0, &status);
std::unique_ptr<_cl_command_queue, CLCommQueueDeleter> command_queue_ptr(
queue);
CL_CHECK_ERRORS(status);
return std::move(command_queue_ptr);
}
std::unique_ptr<_cl_program, CLProgramDeleter> CreateProgramWith(
cl_context context, std::string file_name) {
FILE *file = fopen(file_name.c_str(), "rb");
PADDLE_MOBILE_ENFORCE(file != nullptr, "can't open file: %s ",
file_name.c_str());
fseek(file, 0, SEEK_END);
int64_t size = ftell(file);
PADDLE_MOBILE_ENFORCE(size > 0, "size is too small");
rewind(file);
char *data = new char[size + 1];
size_t bytes_read = fread(data, 1, size, file);
data[size] = '\0';
PADDLE_MOBILE_ENFORCE(bytes_read == size,
"read binary file bytes do not match with fseek");
fclose(file);
const char *source = data;
size_t sourceSize[] = {strlen(source)};
cl_program p =
clCreateProgramWithSource(context, 1, &source, sourceSize, &status_);
DLOG << " cl kernel file name: " << file_name;
DLOG << " source size: " << sourceSize[0];
CL_CHECK_ERRORS(status_);
std::unique_ptr<_cl_program, CLProgramDeleter> program_ptr(p);
return std::move(program_ptr);
}
std::unique_ptr<_cl_event, CLEventDeleter> CreateEvent(cl_context context) {
cl_event event = clCreateUserEvent(context, &status_);
std::unique_ptr<_cl_event, CLEventDeleter> event_ptr(event);
CL_CHECK_ERRORS(status_);
return std::move(event_ptr);
}
bool BuildProgram(cl_program program) {
cl_int status;
std::string path = "-cl-fast-relaxed-math -I " +
CLEngine::Instance()->GetCLPath() + "/cl_kernel";
status = clBuildProgram(program, 0, 0, path.c_str(), 0, 0);
CL_CHECK_ERRORS(status);
    if (status == CL_BUILD_PROGRAM_FAILURE) {
size_t log_size;
clGetProgramBuildInfo(program, CLEngine::Instance()->DeviceID(),
CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size);
char *log = reinterpret_cast<char *>(malloc(log_size));
clGetProgramBuildInfo(program, CLEngine::Instance()->DeviceID(),
CL_PROGRAM_BUILD_LOG, log_size, log, NULL);
DLOG << " program build error: " << log;
}
if (status == CL_SUCCESS) {
return true;
} else {
return false;
}
}
cl_device_id DeviceID(int index = 0) { return devices_[index]; }
std::string GetCLPath() { return cl_path_; }
void setClPath(std::string cl_path) { cl_path_ = cl_path; }
private:
CLEngine() { initialized_ = false; }
bool SetPlatform();
bool SetClDeviceId();
bool initialized_;
cl_platform_id platform_;
cl_device_id *devices_;
cl_int status_;
std::string cl_path_;
std::unique_ptr<_cl_program, CLProgramDeleter> program_;
// bool SetClContext();
// bool SetClCommandQueue();
// bool LoadKernelFromFile(const char *kernel_file);
// bool BuildProgram();
};
} // namespace framework
} // namespace paddle_mobile
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <cstdint>
namespace paddle_mobile {
namespace framework {
typedef uint16_t half_t;
half_t Float2Half(float f);
float Half2Float(half_t h);
void FloatArray2HalfArray(float *f_array, half_t *h_array, int count);
void HalfArray2FloatArray(half_t *h_array, float *f_array, int count);
} // namespace framework
} // namespace paddle_mobile
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <string>
#include <type_traits>
#include <vector>
#include "common/log.h"
#include "framework/cl/cl_deleter.h"
#include "framework/cl/cl_image.h"
#include "framework/cl/cl_scope.h"
namespace paddle_mobile {
namespace framework {
class CLHelper {
public:
CLHelper() = default;
explicit CLHelper(CLScope *scope) : scope_(scope) {}
void AddKernel(const std::string &kernel_name, const std::string &file_name) {
DLOG << " begin add kernel ";
auto kernel = scope_->GetKernel(kernel_name, file_name);
DLOG << " add kernel ing ";
kernels.emplace_back(std::move(kernel));
}
cl_kernel KernelAt(const int index) {
DLOG << " kernel count: " << kernels.size();
return kernels[index].get();
}
cl_command_queue CLCommandQueue() { return scope_->CommandQueue(); }
cl_context CLContext() { return scope_->Context(); }
std::vector<size_t> DefaultWorkSize(const CLImage &image) {
// n c h w
auto image_dim = image.dims();
if (image_dim.size() == 4) {
auto n = image_dim[0];
auto h = image_dim[2];
auto w = image_dim[3];
auto image_width = image.ImageWidth();
auto work_size_0 = image_width / w;
auto work_size_1 = w;
auto work_size_2 = n * h;
return {work_size_0, work_size_1, work_size_2};
} else if (image_dim.size() == 2) {
return {1, image.ImageWidth(), image.ImageHeight()};
} else if (image_dim.size() == 1) {
return {1, image.ImageWidth(), 1};
}
PADDLE_MOBILE_THROW_EXCEPTION(" not support this dim, need imp ");
}
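  // Worked example (editor's sketch, assuming the default/folder image
  // layout): for a 4-D image with dims {N=1, C=8, H=16, W=16},
  // ImageWidth() = W * ((C + 3) / 4) = 32, so the work size becomes
  // {32 / 16, 16, 1 * 16} = {2, 16, 16} -- one work item per RGBA channel
  // block, image column, and (n, h) row.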
private:
CLScope *scope_;
std::vector<std::unique_ptr<_cl_kernel, CLKernelDeleter>> kernels;
};
} // namespace framework
} // namespace paddle_mobile
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "framework/cl/cl_image.h"
namespace paddle_mobile {
namespace framework {
void CLImageToTensor(CLImage *cl_image, Tensor *tensor,
cl_command_queue commandQueue) {
// TODO(yangfei): need imp
}
void TensorToCLImage(const Tensor *tensor, CLImage *cl_image,
cl_command_queue commandQueue) {
// TODO(yangfei): need imp
}
#ifdef PADDLE_MOBILE_DEBUG
Print &operator<<(Print &printer, const CLImage &cl_image) {
int width = cl_image.ImageDims()[0];
int height = cl_image.ImageDims()[1];
half_t *image_data = new half_t[height * width * 4];
cl_int err;
cl_mem image = cl_image.GetCLImage();
size_t origin[3] = {0, 0, 0};
size_t region[3] = {width, height, 1};
err = clEnqueueReadImage(cl_image.CommandQueue(), image, CL_TRUE, origin,
region, 0, 0, image_data, 0, NULL, NULL);
CL_CHECK_ERRORS(err);
float *tensor_data = new float[cl_image.numel()];
auto converter = cl_image.Converter();
converter->ImageToNCHW(image_data, tensor_data, cl_image.ImageDims(),
cl_image.dims());
int stride = cl_image.numel() / 20;
stride = stride > 0 ? stride : 1;
printer << " dims: " << cl_image.dims() << "\n";
for (int i = 0; i < cl_image.numel(); i += stride) {
printer << tensor_data[i] << " ";
}
delete[](tensor_data);
delete[](image_data);
return printer;
}
#endif
} // namespace framework
} // namespace paddle_mobile
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <vector>
#include "CL/cl.h"
#include "framework/cl/cl_deleter.h"
#include "framework/cl/cl_engine.h"
#include "framework/cl/cl_half.h"
#include "framework/cl/cl_image_converter.h"
#include "framework/cl/cl_tool.h"
#include "framework/ddim.h"
#include "framework/tensor.h"
namespace paddle_mobile {
namespace framework {
class CLImage {
public:
CLImage() = default;
~CLImage() {
if (tensor_data_ != nullptr) {
delete[](tensor_data_);
}
if (image_converter_) {
delete (image_converter_);
}
}
/*
* will not hold input tensor data, memcpy in this method
* */
void SetTensorData(float *tensorData, const DDim &dim) {
int numel = product(dim);
if (tensor_data_ != nullptr) {
delete[](tensor_data_);
tensor_data_ = nullptr;
}
tensor_data_ = new float[numel];
memcpy(tensor_data_, tensorData, numel * sizeof(float));
tensor_dims_ = dim;
}
/*
* need call SetTensorData first
*
* folder when one dim or two dim
* */
void InitCLImage(cl_context context, cl_command_queue command_queue) {
PADDLE_MOBILE_ENFORCE(tensor_data_ != nullptr,
" need call SetTensorData first");
CLImageConverterFolder *folder_converter = new CLImageConverterFolder();
InitCLImage(context, command_queue, folder_converter);
}
void InitCLImage(cl_context context, cl_command_queue command_queue,
CLImageConverterBase *converter) {
if (image_converter_ != nullptr) {
delete (image_converter_);
}
PADDLE_MOBILE_ENFORCE(tensor_data_ != nullptr,
" need call SetTensorData first");
DLOG << " begin init cl image ";
image_dims_ = converter->InitImageDimInfoWith(tensor_dims_);
half_t *image_data = new half_t[product(image_dims_) * 4];
DLOG << " convert to image";
converter->NCHWToImage(tensor_data_, image_data, tensor_dims_);
DLOG << " end convert to image";
InitCLImage(context, image_dims_[0], image_dims_[1], image_data);
delete[](image_data);
delete[](tensor_data_);
command_queue_ = command_queue;
tensor_data_ = nullptr;
image_converter_ = converter;
initialized_ = true;
DLOG << " end init cl image";
}
void InitNImage(cl_context context, cl_command_queue command_queue) {
if (tensor_data_ == nullptr) {
PADDLE_MOBILE_THROW_EXCEPTION(" need call SetTensorData first");
}
    CLImageConverterNWBlock *nw_converter = new CLImageConverterNWBlock();
    InitCLImage(context, command_queue, nw_converter);
PADDLE_MOBILE_ENFORCE(tensor_dims_.size() == 4, " tensor dim is not 4");
}
void InitDWImage(cl_context context, cl_command_queue command_queue) {
if (tensor_data_ == nullptr) {
PADDLE_MOBILE_THROW_EXCEPTION(" need call SetTensorData first");
}
CLImageConverterDWBlock *dw_converter = new CLImageConverterDWBlock();
InitCLImage(context, command_queue, dw_converter);
PADDLE_MOBILE_ENFORCE(tensor_dims_.size() == 4, " tensor dim is not 4");
}
void InitEmptyImage(cl_context context, cl_command_queue command_queue,
const DDim &dim) {
PADDLE_MOBILE_ENFORCE(tensor_data_ == nullptr,
" empty image tensor data shouldn't have value");
CLImageConverterFolder *folder_converter = new CLImageConverterFolder();
DLOG << " to get image dims ";
image_dims_ = folder_converter->InitImageDimInfoWith(dim);
DLOG << " end get image dims " << image_dims_;
InitCLImage(context, image_dims_[0], image_dims_[1], nullptr);
tensor_dims_ = dim;
command_queue_ = command_queue;
image_converter_ = folder_converter;
cl_event_ = CLEngine::Instance()->CreateEvent(context);
initialized_ = true;
DLOG << " end init cl image";
}
cl_mem GetCLImage() const { return cl_image_.get(); }
const DDim &ImageDims() const { return image_dims_; }
inline size_t ImageWidth() const { return image_dims_[0]; }
inline size_t ImageHeight() const { return image_dims_[1]; }
inline cl_command_queue CommandQueue() const { return command_queue_; }
/*
* resize original tensor dim
* */
inline CLImage &Resize(const DDim &dims) {
tensor_dims_ = dims;
return *this;
}
template <typename T>
T *data() const {
if (initialized_) {
PADDLE_MOBILE_THROW_EXCEPTION(
" cl image has initialized, tensor data has been deleted, can't use "
"tensor data");
}
return reinterpret_cast<T *>(tensor_data_);
}
/*
* numel of tensor dim
* */
inline int64_t numel() const { return product(tensor_dims_); }
/*
* original tensor dim
* */
const DDim &dims() const { return tensor_dims_; }
cl_event GetClEvent() const { return cl_event_.get(); }
CLImageConverterBase *Converter() const { return image_converter_; }
private:
void InitCLImage(cl_context context, int width, int height, void *data) {
cl_image_format cf = {.image_channel_order = CL_RGBA,
.image_channel_data_type = CL_HALF_FLOAT};
cl_image_desc cid = {
.image_type = CL_MEM_OBJECT_IMAGE2D,
.image_width = width,
.image_height = height,
.image_depth = 1,
.image_array_size = 1,
.image_row_pitch = 0,
.image_slice_pitch = 0,
.num_mip_levels = 0,
.num_samples = 0,
// .buffer = nullptr
};
cid.buffer = nullptr;
cl_int err;
cl_mem cl_image = clCreateImage(
context, CL_MEM_READ_WRITE | (data ? CL_MEM_COPY_HOST_PTR : 0),
&cf, // const cl_image_format *image_format
&cid, // const cl_image_desc *image_desc
data, // void *host_ptr
&err);
cl_image_.reset(cl_image);
if (err != CL_SUCCESS) {
CL_CHECK_ERRORS(err);
PADDLE_MOBILE_THROW_EXCEPTION(" create image 2d error ");
}
}
bool initialized_ = false;
std::unique_ptr<_cl_mem, CLMemDeleter> cl_image_;
std::unique_ptr<_cl_event, CLEventDeleter> cl_event_;
DDim tensor_dims_;
DDim image_dims_;
float *tensor_data_ = nullptr;
cl_context context_;
cl_command_queue command_queue_;
CLImageConverterBase *image_converter_ = nullptr;
};
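// Typical flow (editor's sketch; `host_weights` is a hypothetical host
// buffer): for a parameter image, call
//   CLImage image;
//   image.SetTensorData(host_weights, make_ddim({n, c, h, w}));
//   image.InitCLImage(context, command_queue);
// after which the host-side tensor copy is released and only the OpenCL
// image remains; output images instead use
// InitEmptyImage(context, command_queue, dim).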
void TensorToCLImage(Tensor *tensor, CLImage *image,
cl_command_queue commandQueue);
void CLImageToTensor(CLImage *image, Tensor *tensor,
cl_command_queue commandQueue);
#ifdef PADDLE_MOBILE_DEBUG
Print &operator<<(Print &printer, const CLImage &image);
#endif
} // namespace framework
} // namespace paddle_mobile
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "framework/cl/cl_image_converter.h"
namespace paddle_mobile {
namespace framework {
const DDim CLImageConverterDefault::InitImageDimInfoWith(
    const DDim &tensor_dim) {
size_t new_dims[] = {1, 1, 1, 1};
for (int j = 0; j < tensor_dim.size(); ++j) {
new_dims[4 - tensor_dim.size() + j] = tensor_dim[j];
}
size_t N, C, H, W;
N = new_dims[0];
C = new_dims[1];
H = new_dims[2];
W = new_dims[3];
size_t width = W * ((C + 3) / 4);
size_t height = H * N;
return make_ddim({width, height});
}
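// Example (editor's sketch): a tensor of dims {1, 8, 16, 16} packs four
// consecutive channels into one RGBA texel, giving an image of
// width = 16 * ((8 + 3) / 4) = 32 and height = 16 * 1 = 16.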
void CLImageConverterDefault::NCHWToImage(float *nchw, half_t *image,
const DDim &tensor_dim) {
size_t new_dims[] = {1, 1, 1, 1};
for (int j = 0; j < tensor_dim.size(); ++j) {
new_dims[4 - tensor_dim.size() + j] = tensor_dim[j];
}
size_t N, C, H, W;
N = new_dims[0];
C = new_dims[1];
H = new_dims[2];
W = new_dims[3];
DDim in_image_dim = InitImageDimInfoWith(tensor_dim);
DLOG << " tensor dim " << tensor_dim;
DLOG << " image dim " << in_image_dim;
size_t width = in_image_dim[0];
size_t height = in_image_dim[1];
int w_block = width / W;
float *p = nchw;
size_t i0 = 0;
for (int n = 0; n < N; n++) {
for (int c = 0; c < w_block * 4; c++) {
size_t i1 = i0 + (c / 4) * W;
for (int h = 0; h < H; h++) {
size_t i2 = (i1 << 2) + c % 4;
for (int w = 0; w < W; w++) {
if (c < C) {
// int x = (n * width * H + h * width + (c / 4) * W + w) * 4 +
// (c % 4);
image[i2] = Float2Half(*p);
i2 += 4;
p++;
} else {
image[i2] = 0.0;
i2 += 4;
}
}
i1 += width;
}
}
i0 += width * H;
}
}
void CLImageConverterDefault::ImageToNCHW(half_t *image, float *tensor,
const DDim &image_dim,
const DDim &tensor_dim) {
size_t new_dims[] = {1, 1, 1, 1};
for (int j = 0; j < tensor_dim.size(); ++j) {
new_dims[4 - tensor_dim.size() + j] = tensor_dim[j];
}
size_t N, C, H, W;
N = new_dims[0];
C = new_dims[1];
H = new_dims[2];
W = new_dims[3];
int width = image_dim[0];
  int height = image_dim[1];
float *p = tensor;
size_t i0 = 0;
for (int n = 0; n < N; n++) {
for (int c = 0; c < C; c++) {
size_t i1 = i0 + (c / 4) * W;
for (int h = 0; h < H; h++) {
size_t i2 = (i1 << 2) + c % 4;
for (int w = 0; w < W; w++) {
*p = Half2Float(image[i2]);
i2 += 4;
p++;
}
i1 += width;
}
}
i0 += width * H;
}
}
const DDim CLImageConverterFolder::InitImageDimInfoWith(
    const DDim &tensor_dim) {
if (tensor_dim.size() <= 2) {
int tdim[2] = {1, 1};
if (tensor_dim.size() == 1) {
tdim[1] = tensor_dim[0];
} else {
tdim[0] = tensor_dim[0];
tdim[1] = tensor_dim[1];
}
int width = (tdim[1] + 3) / 4;
int height = tdim[0];
width_of_one_block_ = width;
height_of_one_block_ = height;
c_block_ = 1;
return make_ddim({width, height});
} else {
size_t new_dims[] = {1, 1, 1, 1};
for (int j = 0; j < tensor_dim.size(); ++j) {
new_dims[4 - tensor_dim.size() + j] = tensor_dim[j];
}
size_t N, C, H, W;
N = new_dims[0];
C = new_dims[1];
H = new_dims[2];
W = new_dims[3];
size_t width = W * ((C + 3) / 4);
size_t height = H * N;
width_of_one_block_ = W;
height_of_one_block_ = H;
c_block_ = width / W;
return make_ddim({width, height});
}
}
void CLImageConverterFolder::NCHWToImage(float *tensor, half_t *image,
const DDim &tensor_dim) {
PADDLE_MOBILE_ENFORCE(tensor_dim.size() <= 4 && tensor_dim.size() > 0,
"tensor dim is not support ");
if (tensor_dim.size() > 2) {
CLImageConverterDefault default_converter;
default_converter.NCHWToImage(tensor, image, tensor_dim);
} else {
int tdim[2] = {1, 1};
if (tensor_dim.size() == 1) {
tdim[1] = tensor_dim[0];
} else {
tdim[0] = tensor_dim[0];
tdim[1] = tensor_dim[1];
}
DDim image_dim = InitImageDimInfoWith(tensor_dim);
int width = image_dim[0];
for (int h = 0; h < tdim[0]; h++) {
for (int w = 0; w < tdim[1]; w++) {
image[(h * width + w / 4) * 4 + (w % 4)] =
Float2Half(tensor[h * tdim[1] + w]);
}
}
}
}
void CLImageConverterFolder::ImageToNCHW(half_t *image, float *tensor,
const DDim &image_dim,
const DDim &tensor_dim) {
if (tensor_dim.size() > 2) {
CLImageConverterDefault default_converter;
default_converter.ImageToNCHW(image, tensor, image_dim, tensor_dim);
} else {
int width = image_dim[0];
int height = image_dim[1];
int H, W;
if (tensor_dim.size() == 2) {
H = tensor_dim[0];
W = tensor_dim[1];
} else if (tensor_dim.size() == 1) {
H = 1;
W = tensor_dim[0];
}
float *p = tensor;
for (int h = 0; h < H; h++) {
for (int w = 0; w < W; w++) {
p[h * W + w] = Half2Float(image[(h * width + w / 4) * 4 + (w % 4)]);
}
}
}
}
const DDim CLImageConverterNWBlock::InitImageDimInfoWith(
    const DDim &tensor_dim) {
PADDLE_MOBILE_ENFORCE(tensor_dim.size() == 4, " tensor dim is not 4");
size_t N, C, H, W;
N = tensor_dim[0];
C = tensor_dim[1];
H = tensor_dim[2];
W = tensor_dim[3];
size_t width = W * ((N + 3) / 4);
size_t height = C * H;
return make_ddim({width, height});
}
void CLImageConverterNWBlock::NCHWToImage(float *tensor, half_t *image,
const DDim &tensor_dim) {
PADDLE_MOBILE_ENFORCE(tensor_dim.size() == 4, " tensor dim is not 4");
auto image_dim = InitImageDimInfoWith(tensor_dim);
float *p = tensor;
int N = tensor_dim[0];
int C = tensor_dim[1];
int H = tensor_dim[2];
int W = tensor_dim[3];
int width = image_dim[0];
int height = image_dim[1];
int block = image_dim[0] / tensor_dim[3];
for (int n = 0; n < block * 4; n++) {
for (int c = 0; c < C; c++) {
for (int h = 0; h < H; ++h) {
for (int w = 0; w < W; ++w) {
int index = 4 * c * (width * H) + 4 * h * width + 4 * W * (n / 4) +
w * 4 + n % 4;
if (n < N) {
image[index] = Float2Half(*p);
p++;
} else {
image[index] = 0.0;
}
if (index >= (width * height * 4)) {
DLOG << " index out of range ";
}
}
}
}
}
DLOG << " init done";
}
void CLImageConverterNWBlock::ImageToNCHW(half_t *image, float *tensor,
const DDim &image_dim,
const DDim &tensor_dim) {
PADDLE_MOBILE_ENFORCE(tensor_dim.size() == 4, " tensor dim is not 4");
float *p = tensor;
int N = tensor_dim[0];
int C = tensor_dim[1];
int H = tensor_dim[2];
int W = tensor_dim[3];
int width = image_dim[0];
int height = image_dim[1];
int block = image_dim[0] / tensor_dim[3];
for (int n = 0; n < N; n++) {
for (int c = 0; c < C; c++) {
for (int h = 0; h < H; ++h) {
for (int w = 0; w < W; ++w) {
int index = 4 * c * (width * H) + 4 * h * width + 4 * W * (n / 4) +
w * 4 + n % 4;
*p = Half2Float(image[index]);
p++;
if (index >= (width * height * 4)) {
DLOG << " index out of range ";
}
}
}
}
}
DLOG << " init done";
}
const DDim CLImageConverterDWBlock::InitImageDimInfoWith(
    const DDim &tensor_dim) {
PADDLE_MOBILE_ENFORCE(tensor_dim.size() == 4, " tensor dim is not 4");
size_t N, C, H, W;
N = tensor_dim[0];
C = tensor_dim[1];
H = tensor_dim[2];
W = tensor_dim[3];
size_t width = W * ((N + 3) / 4);
size_t height = C * H;
return make_ddim({width, height});
}
void CLImageConverterDWBlock::NCHWToImage(float *tensor, half_t *image,
const DDim &tensor_dim) {
size_t new_dims[] = {1, 1, 1, 1};
for (int j = 0; j < tensor_dim.size(); ++j) {
new_dims[4 - tensor_dim.size() + j] = tensor_dim[j];
}
size_t N, C, H, W;
N = new_dims[1];
C = new_dims[0];
H = new_dims[2];
W = new_dims[3];
DDim in_image_dim = InitImageDimInfoWith(tensor_dim);
DLOG << " tensor dim " << tensor_dim;
DLOG << " image dim " << in_image_dim;
size_t width = in_image_dim[0];
size_t height = in_image_dim[1];
int w_block = width / W;
float *p = tensor;
size_t i0 = 0;
for (int n = 0; n < N; n++) {
for (int c = 0; c < w_block * 4; c++) {
size_t i1 = i0 + (c / 4) * W;
for (int h = 0; h < H; h++) {
size_t i2 = (i1 << 2) + c % 4;
for (int w = 0; w < W; w++) {
if (c < C) {
// int x = (n * width * H + h * width + (c / 4) * W + w) * 4 +
// (c % 4);
image[i2] = Float2Half(*p);
i2 += 4;
p++;
} else {
image[i2] = 0.0;
i2 += 4;
}
}
i1 += width;
}
}
i0 += width * H;
}
}
void CLImageConverterDWBlock::ImageToNCHW(half_t *image, float *tensor,
const DDim &image_dim,
const DDim &tensor_dim) {
PADDLE_MOBILE_ENFORCE(tensor_dim.size() == 4, " tensor dim is not 4");
float *p = tensor;
int N = tensor_dim[1];
int C = tensor_dim[0];
int H = tensor_dim[2];
int W = tensor_dim[3];
int width = image_dim[0];
  int height = image_dim[1];
size_t i0 = 0;
for (int n = 0; n < N; n++) {
for (int c = 0; c < C; c++) {
size_t i1 = i0 + (c / 4) * W;
for (int h = 0; h < H; h++) {
size_t i2 = (i1 << 2) + c % 4;
for (int w = 0; w < W; w++) {
*p = Half2Float(image[i2]);
i2 += 4;
p++;
}
i1 += width;
}
}
i0 += width * H;
}
}
} // namespace framework
} // namespace paddle_mobile
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include "framework/cl/cl_half.h"
#include "framework/ddim.h"
namespace paddle_mobile {
namespace framework {
class CLImageConverterBase {
public:
virtual void NCHWToImage(float *nchw, half_t *image,
const DDim &tensor_dim) = 0;
virtual void ImageToNCHW(half_t *image, float *nchw, const DDim &image_dim,
const DDim &tensor_dim) = 0;
  virtual const DDim InitImageDimInfoWith(const DDim &tensor_dim) = 0;
};
class CLImageConverterDefault : public CLImageConverterBase {
public:
  const DDim InitImageDimInfoWith(const DDim &tensor_dim);
void NCHWToImage(float *nchw, half_t *image, const DDim &tensor_dim);
void ImageToNCHW(half_t *image, float *tensor, const DDim &image_dim,
const DDim &tensor_dim);
};
class CLImageConverterFolder : public CLImageConverterBase {
public:
  const DDim InitImageDimInfoWith(const DDim &tensor_dim);
void NCHWToImage(float *tensor, half_t *image, const DDim &tensor_dim);
void ImageToNCHW(half_t *image, float *tensor, const DDim &image_dim,
const DDim &tensor_dim);
/*
* width of original tensor
* */
inline size_t WidthOfOneBlock() const { return width_of_one_block_; }
/*
* height of original tensor
* */
inline size_t HeightOfOneBlock() const { return height_of_one_block_; }
int GetCBlock() const { return c_block_; }
private:
int c_block_;
int width_of_one_block_;
int height_of_one_block_;
};
class CLImageConverterNWBlock : public CLImageConverterBase {
  const DDim InitImageDimInfoWith(const DDim &tensor_dim);
void NCHWToImage(float *tensor, half_t *image, const DDim &tensor_dim);
void ImageToNCHW(half_t *image, float *tensor, const DDim &image_dim,
const DDim &tensor_dim);
};
class CLImageConverterDWBlock : public CLImageConverterBase {
  const DDim InitImageDimInfoWith(const DDim &tensor_dim);
void NCHWToImage(float *tensor, half_t *image, const DDim &tensor_dim);
void ImageToNCHW(half_t *image, float *tensor, const DDim &image_dim,
const DDim &tensor_dim);
};
} // namespace framework
} // namespace paddle_mobile
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <memory>
#include <string>
#include <unordered_map>
#include "CL/cl.h"
#include "framework/cl/cl_deleter.h"
#include "framework/cl/cl_engine.h"
#include "framework/cl/cl_tool.h"
namespace paddle_mobile {
namespace framework {
class CLScope {
public:
  CLScope() {
    CLEngine *engine = CLEngine::Instance();
    context_ = engine->CreateContext();
    command_queue_ = engine->CreateClCommandQueue(context_.get());
  }
cl_command_queue CommandQueue() { return command_queue_.get(); }
std::unique_ptr<_cl_kernel, CLKernelDeleter> GetKernel(
const std::string &kernel_name, const std::string &file_name) {
DLOG << " to get program " << file_name;
auto program = Program(file_name);
DLOG << " end get program ~ ";
DLOG << " to create kernel: " << kernel_name;
std::unique_ptr<_cl_kernel, CLKernelDeleter> kernel(
clCreateKernel(program, kernel_name.c_str(), &status_));
CL_CHECK_ERRORS(status_);
DLOG << " end create kernel ~ ";
return std::move(kernel);
}
cl_context Context() { return context_.get(); }
cl_program Program(const std::string &file_name) {
auto it = programs_.find(file_name);
if (it != programs_.end()) {
return it->second.get();
}
auto program = CLEngine::Instance()->CreateProgramWith(
context_.get(),
CLEngine::Instance()->GetCLPath() + "/cl_kernel/" + file_name);
DLOG << " --- begin build program -> " << file_name << " --- ";
CLEngine::Instance()->BuildProgram(program.get());
DLOG << " --- end build program -> " << file_name << " --- ";
programs_[file_name] = std::move(program);
return programs_[file_name].get();
}
private:
cl_int status_;
std::unique_ptr<_cl_context, CLContextDeleter> context_;
std::unique_ptr<_cl_command_queue, CLCommQueueDeleter> command_queue_;
std::unordered_map<std::string,
std::unique_ptr<_cl_program, CLProgramDeleter>>
programs_;
};
} // namespace framework
} // namespace paddle_mobile
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <memory>
#include <string>
#include <vector>
#include "CL/cl.h"
#include "framework/cl/cl_deleter.h"
#include "framework/cl/cl_engine.h"
#include "framework/tensor_base.h"
namespace paddle_mobile {
namespace framework {
class CLTensor : TensorBase {
public:
CLTensor(cl_context context, cl_command_queue command_queue)
: context_(context), command_queue_(command_queue) {}
CLTensor() = default;
/*
* if init method haven't set context and command_queue, need set
* */
void SetContextAndCommandQueue(cl_context context,
cl_command_queue command_queue) {
context_ = context;
command_queue_ = command_queue;
}
/*! Resize the dimensions of the memory block. */
inline CLTensor &Resize(const DDim &dims) {
dims_ = dims;
return *this;
}
template <typename T>
inline cl_mem mutable_with_data(const T *data) {
int64_t size = numel() * sizeof(T);
holder_.reset(new PlaceholderImpl(
size, reinterpret_cast<void *>(const_cast<T *>(data)), typeid(T),
context_, command_queue_));
return reinterpret_cast<cl_mem>(holder_->ptr());
}
inline cl_mem mutable_data(std::type_index type) {
if (holder_ != nullptr) {
holder_->set_type(type);
}
PADDLE_MOBILE_ENFORCE(numel() >= 0, "the Tensor's numel must >=0.")
int64_t size = numel() * SizeOfType(type);
if (holder_ == nullptr || holder_->size() < size + offset_) {
holder_.reset(new PlaceholderImpl(size, type, context_, command_queue_));
offset_ = 0;
}
return reinterpret_cast<cl_mem>(holder_->ptr());
}
/**
* @brief Return a pointer to cl buffer.
* @note If not exist, then allocation.
*/
template <typename T>
inline cl_mem mutable_data() {
return reinterpret_cast<cl_mem>(mutable_data(typeid(T)));
}
/**
* @brief Return a pointer to cl buffer.
*
* @param[in] dims The dimensions of the memory block.
* @param[in] place The place of the memory block.
*
* @note If not exist, then allocation.
*/
template <typename T>
inline cl_mem mutable_data(DDim dims) {
Resize(dims);
return mutable_data<T>();
}
inline cl_mem CLBuffer() {
check_memory_size();
return reinterpret_cast<cl_mem>(
reinterpret_cast<uintptr_t>(holder_->ptr()));
}
template <typename T>
inline T *Data() {
if (host_ptr_) {
      delete[] static_cast<char *>(host_ptr_);
host_ptr_ = nullptr;
}
cl_mem buffer = CLBuffer();
host_ptr_ = new char[holder_->size()];
cl_int status;
status = clEnqueueReadBuffer(command_queue_, buffer, CL_TRUE, 0,
holder_->size(), host_ptr_, 0, NULL, NULL);
CL_CHECK_ERRORS(status);
return reinterpret_cast<T *>(host_ptr_);
}
int memorySize() { return holder_->size(); }
~CLTensor() {
DLOG << "~CLTensor";
if (host_ptr_) {
DLOG << " delete host ptr ";
      delete[] static_cast<char *>(host_ptr_);
host_ptr_ = nullptr;
}
}
private:
cl_context context_;
cl_command_queue command_queue_;
void *host_ptr_ = nullptr;
struct PlaceholderImpl : public Placeholder {
PlaceholderImpl(size_t size, void *input, std::type_index type,
cl_context context, cl_command_queue command_queue)
: ptr_(clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR,
size, reinterpret_cast<void *>(input), NULL)),
size_(size),
type_(type),
command_queue_(command_queue) {}
PlaceholderImpl(size_t size, std::type_index type, cl_context context,
cl_command_queue command_queue)
: ptr_(clCreateBuffer(context, CL_MEM_READ_WRITE, size, NULL, NULL)),
size_(size),
type_(type),
command_queue_(command_queue) {}
virtual size_t size() const { return size_; }
virtual void *ptr() const { return static_cast<void *>(ptr_.get()); }
virtual std::type_index type() const { return type_; }
virtual void set_type(std::type_index type) { type_ = type; }
std::unique_ptr<_cl_mem, CLMemDeleter> ptr_;
size_t size_;
/* the current type of memory */
std::type_index type_;
cl_command_queue command_queue_;
};
};
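// Typical flow (editor's sketch): allocate a device buffer sized by the
// tensor dims, then read it back for inspection:
//   CLTensor t(context, command_queue);
//   cl_mem buf = t.mutable_data<float>(make_ddim({1, 16}));
//   float *host = t.Data<float>();  // blocking clEnqueueReadBuffer
// mutable_with_data() instead uploads an existing host pointer via
// CL_MEM_COPY_HOST_PTR.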
} // namespace framework
} // namespace paddle_mobile
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "framework/cl/cl_tool.h"
namespace paddle_mobile {
namespace framework {
const char *opencl_error_to_str(cl_int error) {
#define CASE_CL_CONSTANT(NAME) \
case NAME: \
return #NAME;
// Suppose that no combinations are possible.
switch (error) {
CASE_CL_CONSTANT(CL_SUCCESS)
CASE_CL_CONSTANT(CL_DEVICE_NOT_FOUND)
CASE_CL_CONSTANT(CL_DEVICE_NOT_AVAILABLE)
CASE_CL_CONSTANT(CL_COMPILER_NOT_AVAILABLE)
CASE_CL_CONSTANT(CL_MEM_OBJECT_ALLOCATION_FAILURE)
CASE_CL_CONSTANT(CL_OUT_OF_RESOURCES)
CASE_CL_CONSTANT(CL_OUT_OF_HOST_MEMORY)
CASE_CL_CONSTANT(CL_PROFILING_INFO_NOT_AVAILABLE)
CASE_CL_CONSTANT(CL_MEM_COPY_OVERLAP)
CASE_CL_CONSTANT(CL_IMAGE_FORMAT_MISMATCH)
CASE_CL_CONSTANT(CL_IMAGE_FORMAT_NOT_SUPPORTED)
CASE_CL_CONSTANT(CL_BUILD_PROGRAM_FAILURE)
CASE_CL_CONSTANT(CL_MAP_FAILURE)
CASE_CL_CONSTANT(CL_MISALIGNED_SUB_BUFFER_OFFSET)
CASE_CL_CONSTANT(CL_EXEC_STATUS_ERROR_FOR_EVENTS_IN_WAIT_LIST)
CASE_CL_CONSTANT(CL_INVALID_VALUE)
CASE_CL_CONSTANT(CL_INVALID_DEVICE_TYPE)
CASE_CL_CONSTANT(CL_INVALID_PLATFORM)
CASE_CL_CONSTANT(CL_INVALID_DEVICE)
CASE_CL_CONSTANT(CL_INVALID_CONTEXT)
CASE_CL_CONSTANT(CL_INVALID_QUEUE_PROPERTIES)
CASE_CL_CONSTANT(CL_INVALID_COMMAND_QUEUE)
CASE_CL_CONSTANT(CL_INVALID_HOST_PTR)
CASE_CL_CONSTANT(CL_INVALID_MEM_OBJECT)
CASE_CL_CONSTANT(CL_INVALID_IMAGE_FORMAT_DESCRIPTOR)
CASE_CL_CONSTANT(CL_INVALID_IMAGE_SIZE)
CASE_CL_CONSTANT(CL_INVALID_SAMPLER)
CASE_CL_CONSTANT(CL_INVALID_BINARY)
CASE_CL_CONSTANT(CL_INVALID_BUILD_OPTIONS)
CASE_CL_CONSTANT(CL_INVALID_PROGRAM)
CASE_CL_CONSTANT(CL_INVALID_PROGRAM_EXECUTABLE)
CASE_CL_CONSTANT(CL_INVALID_KERNEL_NAME)
CASE_CL_CONSTANT(CL_INVALID_KERNEL_DEFINITION)
CASE_CL_CONSTANT(CL_INVALID_KERNEL)
CASE_CL_CONSTANT(CL_INVALID_ARG_INDEX)
CASE_CL_CONSTANT(CL_INVALID_ARG_VALUE)
CASE_CL_CONSTANT(CL_INVALID_ARG_SIZE)
CASE_CL_CONSTANT(CL_INVALID_KERNEL_ARGS)
CASE_CL_CONSTANT(CL_INVALID_WORK_DIMENSION)
CASE_CL_CONSTANT(CL_INVALID_WORK_GROUP_SIZE)
CASE_CL_CONSTANT(CL_INVALID_WORK_ITEM_SIZE)
CASE_CL_CONSTANT(CL_INVALID_GLOBAL_OFFSET)
CASE_CL_CONSTANT(CL_INVALID_EVENT_WAIT_LIST)
CASE_CL_CONSTANT(CL_INVALID_EVENT)
CASE_CL_CONSTANT(CL_INVALID_OPERATION)
CASE_CL_CONSTANT(CL_INVALID_GL_OBJECT)
CASE_CL_CONSTANT(CL_INVALID_BUFFER_SIZE)
CASE_CL_CONSTANT(CL_INVALID_MIP_LEVEL)
CASE_CL_CONSTANT(CL_INVALID_GLOBAL_WORK_SIZE)
CASE_CL_CONSTANT(CL_INVALID_PROPERTY)
default:
return "UNKNOWN ERROR CODE";
}
#undef CASE_CL_CONSTANT
}
} // namespace framework
} // namespace paddle_mobile
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include "CL/cl.h"
namespace paddle_mobile {
namespace framework {
const char* opencl_error_to_str(cl_int error);
#define CL_CHECK_ERRORS(ERR)                                              \
  do {                                                                    \
    if ((ERR) != CL_SUCCESS) {                                            \
      printf(                                                             \
          "OpenCL error with code %s happened in file %s at line %d.\n",  \
          paddle_mobile::framework::opencl_error_to_str(ERR), __FILE__,   \
          __LINE__);                                                      \
    }                                                                     \
  } while (0)
} // namespace framework
} // namespace paddle_mobile
......@@ -41,7 +41,6 @@ inline DataLayout StringToDataLayout(const std::string &str) {
return DataLayout::kAnyLayout;
} else {
PADDLE_MOBILE_THROW_EXCEPTION("Unknown storage order string: %s", s.c_str())
exit(0);
}
}
......@@ -55,7 +54,6 @@ inline std::string DataLayoutToString(const DataLayout &data_layout) {
return "ANY_LAYOUT";
default:
PADDLE_MOBILE_THROW_EXCEPTION("Unknown storage order string ")
exit(0);
break;
}
}
......
......@@ -42,7 +42,7 @@ struct Dim {
: head(idx % size.head), tail(idx / size.head, size.tail) {}
/** Construct a Dim with each dimension set to the given index */
Dim(int64_t idx) : head(idx), tail(idx) {}
explicit Dim(int64_t idx) : head(idx), tail(idx) {}
bool operator==(const Dim<i> &o) const {
return (head == o.head) && (tail == o.tail);
......@@ -65,7 +65,7 @@ template <>
struct Dim<0> {
static constexpr int dimensions = 0;
Dim(int64_t _head) {}
explicit Dim(int64_t _head) {}
Dim() {}
......@@ -131,7 +131,6 @@ int64_t &indexer(Dim<D> &dim, int idx) {
template <>
int64_t &indexer<0>(Dim<0> &dim, int idx) {
PADDLE_MOBILE_THROW_EXCEPTION("Invalid index")
exit(0);
}
template <int D>
......@@ -148,7 +147,6 @@ int64_t indexer(const Dim<D> &dim, int idx) {
template <>
int64_t indexer<0>(const Dim<0> &dim, int idx) {
PADDLE_MOBILE_THROW_EXCEPTION("Invalid index")
exit(0);
}
} // namespace
......
......@@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "io/executor.h"
#include "framework/executor.h"
#include <algorithm>
#include <utility>
#include <vector>
......@@ -26,12 +26,25 @@ limitations under the License. */
#include "framework/program/var_desc.h"
#include "framework/scope.h"
#include "framework/tensor.h"
#include "operators/math/gemm.h"
#ifdef PADDLE_EXECUTOR_MULTITHREAD
#include <queue>
#include <utility>
#include "common/threadpool.h"
#endif
#ifdef PADDLE_MOBILE_CL
#include "framework/cl/cl_image.h"
#endif
namespace paddle_mobile {
namespace framework {
using framework::Variable;
using framework::Variable;
#pragma mark - executor
template <typename Dtype, Precision P>
Executor<Dtype, P>::Executor(const framework::Program<Dtype> p, int batch_size,
const bool use_optimize, const bool loddable)
......@@ -390,13 +403,18 @@ std::vector<typename Executor<Dtype, P>::Ptype> Executor<Dtype, P>::Predict(
const std::vector<Ptype> &input, const std::vector<int64_t> &dims) {
framework::Tensor tensor(input, framework::make_ddim(dims));
std::shared_ptr<framework::Tensor> output_tensor = Predict(tensor, 0);
Executor<Dtype, P>::Ptype *output_ptr =
output_tensor->data<typename Executor<Dtype, P>::Ptype>();
std::vector<typename Executor<Dtype, P>::Ptype> result_vector;
for (int j = 0; j < output_tensor->numel(); ++j) {
result_vector.push_back(output_ptr[j]);
if (output_tensor != nullptr) {
Executor<Dtype, P>::Ptype *output_ptr =
output_tensor->data<typename Executor<Dtype, P>::Ptype>();
std::vector<typename Executor<Dtype, P>::Ptype> result_vector;
for (int j = 0; j < output_tensor->numel(); ++j) {
result_vector.push_back(output_ptr[j]);
}
return result_vector;
} else {
DLOG << "return empty vector";
return {};
}
return result_vector;
}
#ifdef PADDLE_MOBILE_FPGA
......@@ -470,8 +488,236 @@ void Executor<Dtype, P>::Predict_To(int end) {
}
#endif
#ifdef PADDLE_MOBILE_CL
template <typename Dtype, Precision P>
void Executor<Dtype, P>::LoadMemory(const framework::VarDesc var_desc,
float *tensorInput, char **data) {}
template <>
void Executor<GPU_CL, Precision::FP32>::LoadMemory(
const framework::VarDesc var_desc, float *tensorInput, char **data) {
// 1. version
uint32_t version = *reinterpret_cast<uint32_t *>(*data);
(*data) += sizeof(uint32_t);
// 2 Lod information
  uint64_t lod_level = 0;
  memcpy(&lod_level, *data, sizeof(uint64_t));
  (*data) += sizeof(uint64_t);
for (uint64_t i = 0; i < lod_level; ++i) {
uint64_t size = *reinterpret_cast<uint64_t *>(*data);
(*data) += sizeof(uint64_t);
std::vector<size_t> tmp(size / sizeof(size_t));
for (int k = 0; k < tmp.size(); ++k) {
tmp[k] = *reinterpret_cast<size_t *>(*data);
(*data) += sizeof(size_t);
}
}
// 3. tensor version
uint32_t tensor_version = *reinterpret_cast<uint32_t *>(*data);
(*data) += sizeof(uint32_t);
// 4. tensor desc
int32_t size = *reinterpret_cast<int32_t *>(*data);
(*data) += sizeof(int32_t);
std::unique_ptr<char[]> buf(new char[size]);
memcpy(buf.get(), *data, size);
(*data) += (sizeof(char) * size);
const framework::TensorDesc &desc = var_desc.Tensor_desc();
int memory_size = 1;
for (auto l : desc.Dims()) {
memory_size *= l;
}
void *memory = nullptr;
// TODO: honor desc.DataType(); only 4-byte FP32 parameters are handled here.
int type_size = 4;
memory = tensorInput;
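// Dequantization: each stored uint8 code c with per-tensor range [min_value,
// max_value] is restored as c * (max_value - min_value) / 255 + min_value.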
if (program_.quantification) {
float min_value;
float max_value;
memcpy(&min_value, *data, sizeof(float));
memcpy(&max_value, *data + sizeof(float), sizeof(float));
*data += 2 * sizeof(float);
const float factor = (max_value - min_value) / 255.0;
uint8_t *uint8_data = reinterpret_cast<uint8_t *>(*data);
for (int k = 0; k < memory_size; ++k) {
static_cast<float *>(memory)[k] = uint8_data[k] * factor + min_value;
}
*data += (memory_size * sizeof(uint8_t));
} else {
for (int n = 0; n < memory_size; n++) {
float value;
memcpy(&value, *data + n * type_size, type_size);
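// values in (-1e-30, 1e-30) are flushed to exact zero below, presumably
// to avoid denormal-range floats at inference time (not stated in source)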
if (value < 1e-30 && value > -1e-30) {
static_cast<float *>(memory)[n] = 0.0;
} else {
static_cast<float *>(memory)[n] = value;
}
}
(*data) += (sizeof(char) * memory_size * type_size);
}
}
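// A hypothetical helper (not in this commit) mirroring the header walk above;
// it advances *data past steps 1-4 of one parameter blob and returns the
// TensorDesc proto bytes, leaving *data at the raw payload. Assumes
// <cstdint>, <cstring> and <vector> are available in this translation unit.
static std::vector<char> SkipParamHeaderSketch(char **data) {
  *data += sizeof(uint32_t);                    // 1. version
  uint64_t lod_level = 0;
  memcpy(&lod_level, *data, sizeof(uint64_t));  // 2. LoD level
  *data += sizeof(uint64_t);
  for (uint64_t i = 0; i < lod_level; ++i) {
    uint64_t size = 0;
    memcpy(&size, *data, sizeof(uint64_t));
    *data += sizeof(uint64_t) + size;           //    one LoD vector
  }
  *data += sizeof(uint32_t);                    // 3. tensor version
  int32_t proto_size = 0;
  memcpy(&proto_size, *data, sizeof(int32_t));  // 4. TensorDesc proto
  *data += sizeof(int32_t);
  std::vector<char> proto(*data, *data + proto_size);
  *data += proto_size;
  return proto;                                 // payload starts at *data now
}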
template <>
void Executor<GPU_CL, Precision::FP32>::InitMemory() {
for (const auto &block : to_predict_program_->Blocks()) {
for (const auto &var_desc : block->Vars()) {
auto var = program_.scope->Var(var_desc->Name());
if (var_desc->Persistable()) {
CLImage *cl_image = nullptr;
if (var_desc->Name() == "feed" || var_desc->Name() == "fetch") {
var->template GetMutable<framework::LoDTensor>();
continue;
} else {
cl_image = var->template GetMutable<framework::CLImage>();
}
char *origin_data =
ReadFileToBuff(program_.model_path + "/" + var_desc->Name());
char *data = origin_data;
cl_context context = program_.scope->GetCLScope()->Context();
const framework::TensorDesc &desc = var_desc->Tensor_desc();
int numel = 1;
for (auto l : desc.Dims()) {
numel *= l;
}
DLOG << var_desc->Name();
float *tensorInput = static_cast<float *>(
paddle_mobile::memory::Alloc(sizeof(float) * numel));
LoadMemory(*var_desc, tensorInput, &data);
framework::DDim ddim = framework::make_ddim(desc.Dims());
// the CLImage has no device image yet; stage the host data and dims
cl_image->SetTensorData(tensorInput, ddim);
delete[] origin_data;  // buffer from ReadFileToBuff (new[] allocation assumed)
paddle_mobile::memory::Free(tensorInput);
} else {
if (var_desc->Type() == framework::VARTYPE_TYPE_LOD_TENSOR) {
auto cl_image = var->template GetMutable<framework::CLImage>();
cl_context context = program_.scope->GetCLScope()->Context();
cl_command_queue command_queue =
program_.scope->GetCLScope()->CommandQueue();
const framework::TensorDesc &desc = var_desc->Tensor_desc();
// framework::DDim ddim = framework::make_ddim(desc.Dims());
framework::DDim ddim = cl_image->dims();
DLOG << var_desc->Name();
cl_image->InitEmptyImage(context, command_queue, ddim);
}
}
}
}
}
template <>
void Executor<GPU_CL, Precision::FP32>::InitCombineMemory() {
char *origin_data = nullptr;
bool self_alloc = false;
if (program_.combined_params_buf && program_.combined_params_len) {
LOG(kLOG_INFO) << "use outer memory";
origin_data = reinterpret_cast<char *>(program_.combined_params_buf);
} else {
LOG(kLOG_INFO) << "begin init combined memory";
self_alloc = true;
origin_data = ReadFileToBuff(program_.para_path);
}
PADDLE_MOBILE_ENFORCE(origin_data != nullptr, "origin_data==nullptr!!!");
char *data = origin_data;  // LoadMemory advances origin_data; keep the start pointer for freeing
for (const auto &block : to_predict_program_->Blocks()) {
for (const auto &var_desc : block->Vars()) {
auto var = program_.scope->Var(var_desc->Name());
if (var_desc->Persistable()) {
CLImage *cl_image = nullptr;
if (var_desc->Name() == "feed" || var_desc->Name() == "fetch") {
var->template GetMutable<framework::LoDTensor>();
continue;
} else {
cl_image = var->template GetMutable<framework::CLImage>();
}
cl_context context = program_.scope->GetCLScope()->Context();
const framework::TensorDesc &desc = var_desc->Tensor_desc();
framework::DDim ddim = framework::make_ddim(desc.Dims());
int numel = 1;
for (int i = 0; i < ddim.size(); i++) {
numel = numel * ddim[i];
}
float *tensorInput = static_cast<float *>(
paddle_mobile::memory::Alloc(sizeof(float) * numel));
LoadMemory(*var_desc, tensorInput, &origin_data);
// the CLImage has no device image yet; stage the host data and dims
cl_image->SetTensorData(tensorInput, ddim);
paddle_mobile::memory::Free(tensorInput);
} else {
auto cl_image = var->template GetMutable<framework::CLImage>();
cl_context context = program_.scope->GetCLScope()->Context();
cl_command_queue command_queue =
program_.scope->GetCLScope()->CommandQueue();
const framework::TensorDesc &desc = var_desc->Tensor_desc();
framework::DDim ddim = cl_image->dims();
// framework::DDim ddim = framework::make_ddim(desc.Dims());
cl_image->InitEmptyImage(context, command_queue, ddim);
}
}
}
if (self_alloc) {
  // origin_data has been advanced by LoadMemory, so release through the
  // saved start pointer (new[] allocation by ReadFileToBuff assumed)
  delete[] data;
}
LOG(kLOG_INFO) << "end init combined memory";
}
#endif
template class Executor<CPU, Precision::FP32>;
template class Executor<GPU_MALI, Precision::FP32>;
template class Executor<FPGA, Precision::FP32>;
template class Executor<GPU_CL, Precision::FP32>;
template class Executor<GPU_MALI, Precision::FP32>;
} // namespace framework
} // namespace paddle_mobile
......@@ -26,6 +26,7 @@ limitations under the License. */
#include "framework/tensor.h"
namespace paddle_mobile {
namespace framework {
template <typename Dtype = CPU, Precision P = Precision::FP32>
class Executor {
......@@ -79,7 +80,10 @@ class Executor {
void LoadMemory(void **data,
const std::shared_ptr<framework::VarDesc> var_desc,
framework::LoDTensor *tensor);
#ifdef PADDLE_MOBILE_CL
void LoadMemory(const framework::VarDesc var_desc, float *tensorInput,
char **data);
#endif
framework::Program<Dtype> program_;
int batch_size_ = 1;
std::shared_ptr<framework::ProgramDesc> to_predict_program_;
......@@ -97,4 +101,5 @@ class Executor {
bool loddable_ = false;
};
} // namespace framework
} // namespace paddle_mobile
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <string>
#include "common/types.h"
#include "framework/program/program.h"
namespace paddle_mobile {
namespace framework {
template <typename Dtype = CPU, Precision P = Precision::FP32>
class Loader {
public:
/*
* @b load a fluid model stored in the separated format (one file per parameter)
* */
const Program<Dtype, P> Load(const std::string &dirname,
bool optimize = false,
bool quantification = false,
bool can_add_split = false);
/*
* @b load a fluid model stored in the combined format (single params file)
* */
const Program<Dtype, P> Load(const std::string &model_path,
const std::string &para_path,
bool optimize = false,
bool quantification = false);
const Program<Dtype, P> LoadCombinedMemory(size_t model_len,
const uint8_t *model_buf,
size_t combined_params_len,
uint8_t *combined_params_buf,
bool optimize = false,
bool quantification = false);
private:
const Program<Dtype, P> LoadProgram(const std::string &model_path,
bool optimize = false,
bool quantification = false,
bool can_add_split = false);
void InitMemoryFromProgram(
const std::shared_ptr<ProgramDesc> &originProgramDesc,
const std::shared_ptr<Scope> &scope);
};
} // namespace framework
} // namespace paddle_mobile
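// Hedged usage sketch for the Loader declared above (paths are illustrative):
//   paddle_mobile::framework::Loader<paddle_mobile::CPU> loader;
//   auto separated = loader.Load("./mobilenet_dir", /*optimize=*/true);
//   auto combined  = loader.Load("./m/model", "./m/params");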
......@@ -14,8 +14,10 @@ limitations under the License. */
#pragma once
#include <memory>
#include <string>
#include <tuple>
#include "common/log.h"
#include "common/type_define.h"
#include "framework/op_info.h"
......@@ -120,5 +122,8 @@ class OpRegistry {
#define REGISTER_OPERATOR_FPGA(op_type, op_class) \
REGISTER_OPERATOR(op_type, op_class, fpga, paddle_mobile::FPGA);
#define REGISTER_OPERATOR_CL(op_type, op_class) \
REGISTER_OPERATOR(op_type, op_class, cl, paddle_mobile::GPU_CL);
} // namespace framework
} // namespace paddle_mobile
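// Illustrative only (op and class names are placeholders, not from this
// commit): an OpenCL kernel implementation would register itself with the
// new macro, e.g.
//   REGISTER_OPERATOR_CL(conv2d, ops::ConvOp);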
......@@ -56,7 +56,7 @@ template <typename Dtype>
void OperatorBase<Dtype>::CheckAllInputOutputSet() const {}
template <typename Dtype>
void OperatorBase<Dtype>::Run() const {
void OperatorBase<Dtype>::Run() {
RunImpl();
#ifdef PADDLE_MOBILE_DEBUG
DLOG << "-------------" << type_ << "----------------------------";
......@@ -84,9 +84,57 @@ void OperatorBase<Dtype>::Run() const {
#endif
}
#ifdef PADDLE_MOBILE_CL
template <>
void OperatorBase<GPU_CL>::Run() {
RunImpl();
#ifdef PADDLE_MOBILE_DEBUG
DLOG << "-------------" << type_ << "----------------------------";
vector<string> input_keys = GetInputKeys();
for (const auto &key : input_keys) {
auto var_vec_in = inputs_.at(key);
for (int i = 0; i < var_vec_in.size(); ++i) {
auto vari = scope_->FindVar(var_vec_in[i]);
if (vari->IsInitialized()) {
if (type_ == "feed") {
Tensor *tensor = vari->template GetMutable<framework::LoDTensor>();
if (tensor) DLOG << type_ << " input- " << key << "=" << *tensor;
} else {
CLImage *cl_image = vari->template GetMutable<framework::CLImage>();
if (cl_image) {
DLOG << type_ << " input- " << key << "=" << *cl_image;
}
}
}
}
}
for (const auto &key : GetOutKeys()) {
auto var_vec_out = outputs_.at(key);
for (int i = 0; i < var_vec_out.size(); ++i) {
auto vari = scope_->FindVar(var_vec_out[i]);
if (vari->IsInitialized()) {
if (type_ == "fetch") {
Tensor *tensor = vari->template GetMutable<framework::LoDTensor>();
if (tensor) {
DLOG << type_ << " output- " << key << "=" << *tensor;
}
} else {
CLImage *cl_image = vari->template GetMutable<framework::CLImage>();
if (cl_image) {
DLOG << type_ << " output- " << key << "=" << *cl_image;
}
}
}
}
}
#endif
}
#endif
template class OperatorBase<CPU>;
template class OperatorBase<FPGA>;
template class OperatorBase<GPU_MALI>;
template class OperatorBase<GPU_CL>;
} // namespace framework
} // namespace paddle_mobile
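// The GPU_CL Run() above encodes a convention: only "feed"/"fetch" variables
// live in host-side LoDTensors, while every other variable is a device-side
// CLImage. A hypothetical predicate making that explicit (requires <string>):
inline bool IsHostIoOpSketch(const std::string &op_type) {
  return op_type == "feed" || op_type == "fetch";
}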
This diff is collapsed.
......@@ -18,6 +18,8 @@ limitations under the License. */
#include "framework/program/program_desc.h"
#include "framework/scope.h"
#include <string>
namespace paddle_mobile {
namespace framework {
......@@ -32,7 +34,7 @@ class Program {
bool combined = false;
bool quantification = false;
size_t combined_params_len;
const uint8_t *combined_params_buf;
uint8_t *combined_params_buf;
};
} // namespace framework
......
......@@ -15,8 +15,14 @@ limitations under the License. */
#pragma once
#include <list>
#include <string>
#include <unordered_map>
#include "variable.h"
#include <vector>
#ifdef PADDLE_MOBILE_CL
#include "framework/cl/cl_scope.h"
#endif
#include "framework/variable.h"
namespace paddle_mobile {
namespace framework {
......@@ -33,6 +39,10 @@ class Scope {
delete kid;
}
kids_.clear();
#ifdef PADDLE_MOBILE_CL
delete cl_scope_;
#endif
}
Scope &NewScope() const;
......@@ -72,6 +82,10 @@ class Scope {
Variable *FindVarLocally(const std::string &name) const;
#ifdef PADDLE_MOBILE_CL
CLScope *GetCLScope() { return cl_scope_; }
#endif
private:
// Call Scope::NewScope for a sub-scope.
explicit Scope(Scope const *parent) : parent_(parent) {}
......@@ -79,6 +93,10 @@ class Scope {
mutable std::unordered_map<std::string, Variable *> vars_;
mutable std::list<Scope *> kids_;
Scope const *parent_{nullptr};
#ifdef PADDLE_MOBILE_CL
CLScope *cl_scope_ = new CLScope();
#endif
};
} // namespace framework
} // namespace paddle_mobile
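// Hedged usage sketch: kernels reach the per-scope OpenCL state through the
// accessor above, as the executor code earlier in this diff does, e.g.
//   cl_context ctx = scope->GetCLScope()->Context();
//   cl_command_queue queue = scope->GetCLScope()->CommandQueue();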
This diff is collapsed.
This diff is collapsed.
......@@ -29,7 +29,9 @@ PaddleMobilePredictor<Dtype, P>::PaddleMobilePredictor(
template <typename Dtype, Precision P>
bool PaddleMobilePredictor<Dtype, P>::Init(const PaddleMobileConfig &config) {
paddle_mobile_.reset(new PaddleMobile<Dtype, P>());
#ifdef PADDLE_MOBILE_CL
paddle_mobile_->SetCLPath(config.cl_path);
#endif
if (config.memory_pack.from_memory) {
DLOG << "load from memory!";
paddle_mobile_->LoadCombinedMemory(config.memory_pack.model_size,
......@@ -126,6 +128,8 @@ CreatePaddlePredictor<PaddleMobileConfig, PaddleEngineKind::kPaddleMobile>(
x.reset(new PaddleMobilePredictor<FPGA, Precision::FP32>(config));
} else if (config.device == PaddleMobileConfig::kGPU_MALI) {
x.reset(new PaddleMobilePredictor<GPU_MALI, Precision::FP32>(config));
} else if (config.device == PaddleMobileConfig::kGPU_CL) {
x.reset(new PaddleMobilePredictor<GPU_CL, Precision::FP32>(config));
} else {
LOG(kLOG_ERROR) << "unsupport device type!";
return nullptr;
......
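// Hedged end-to-end sketch of the new GPU_CL device path through the API
// above (the cl_path value is illustrative; all other names appear in this
// diff):
//   PaddleMobileConfig config;
//   config.device = PaddleMobileConfig::kGPU_CL;
//   config.cl_path = "/data/local/tmp/cl_kernels";  // forwarded to SetCLPath
//   auto predictor = CreatePaddlePredictor<PaddleMobileConfig,
//                        PaddleEngineKind::kPaddleMobile>(config);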
The remaining 152 file diffs are collapsed.