diff --git a/CMakeLists.txt b/CMakeLists.txt index bdbf5a6ea604400fb5087976df0e1e9c279fd78d..7f1cffd332dfc4f1614ca63ed60f358acf59a74b 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -1,15 +1,22 @@ -cmake_minimum_required(VERSION 3.0) -project(paddle-mobile) - -# select the platform to build -option(CPU "armv7 with neon support" ON) -option(MALI_GPU "mali gpu support" OFF) -option(FPGA "fpga support" OFF) +cmake_minimum_required(VERSION 3.0.0) -option(USE_OPENMP "openmp support" OFF) +option(USE_OPENMP "openmp support" ON) option(DEBUGING "enable debug mode" ON) -option(USE_EXCEPTION "use std exception" OFF) +option(USE_EXCEPTION "use std exception" ON) +option(SYMBOL_HIDDEN "symbol hidden" OFF) # on when use jni or ios io option(LOG_PROFILE "log profile" OFF) +# select the platform to build +option(CPU "armv7 with neon" ON) +option(GPU_MALI "mali gpu" OFF) +option(GPU_CL "opencl gpu" OFF) +option(FPGA "fpga" OFF) +if(FPGA) + option(FPGAV1 "fpga v1" ON) + option(FPGAV2 "fpga v2" OFF) +endif() + + +project(paddle-mobile) file(GLOB_RECURSE PADDLE_MOBILE_CC src/*.cc src/*.cpp src/*.c src/*.mm) file(GLOB_RECURSE PADDLE_MOBILE_H src/*.h) @@ -29,10 +36,10 @@ if(DEBUGING) message(STATUS "debugging mode") add_definitions(-DPADDLE_MOBILE_DEBUG) else() - if(FPGA) - else() - add_definitions(-fvisibility=hidden -fvisibility-inlines-hidden) - endif() +endif() + +if(SYMBOL_HIDDEN) + add_definitions(-fvisibility=hidden -fvisibility-inlines-hidden) endif() if(USE_EXCEPTION) @@ -70,7 +77,27 @@ else() endforeach() endif() -if(MALI_GPU) +if (GPU_CL) + add_definitions(-DPADDLE_MOBILE_CL) + + # opencl version + add_definitions(-DCL_TARGET_OPENCL_VERSION=220) + + link_libraries(${CMAKE_CURRENT_LIST_DIR}/third_party/opencl/libOpenCL.so) + include_directories(third_party/opencl/OpenCL-Headers) +else() + file(GLOB_RECURSE _tmp_list src/framework/cl/*.cpp src/operators/kernel/cl/*.cpp) + foreach(f ${_tmp_list}) + list(REMOVE_ITEM PADDLE_MOBILE_CC ${f}) + endforeach() + + 
file(GLOB_RECURSE _tmp_list_h src/framework/cl/*.h) + foreach(f ${_tmp_list_h}) + list(REMOVE_ITEM PADDLE_MOBILE_H ${f}) + endforeach() +endif() + +if (GPU_MALI) add_definitions(-DPADDLE_MOBILE_MALI_GPU) add_definitions(-DUSE_ACL=1) add_definitions(-DUSE_OPENCL) @@ -96,8 +123,43 @@ else() endif() if(FPGA) - message("FPGA mode enabled") add_definitions(-DPADDLE_MOBILE_FPGA) + file(GLOB_RECURSE _tmp_list src/operators/math/*.cpp src/operators/kernel/fpga/*.cc) + foreach(f ${_tmp_list}) + list(REMOVE_ITEM PADDLE_MOBILE_CC ${f}) + endforeach() + file(GLOB_RECURSE _tmp_list_h src/operators/math/*.h) + foreach(f ${_tmp_list_h}) + list(REMOVE_ITEM PADDLE_MOBILE_H ${f}) + endforeach() + list(APPEND PADDLE_MOBILE_CC src/operators/math/softmax.cpp) + list(APPEND PADDLE_MOBILE_H src/operators/math/softmax.h) + list(APPEND PADDLE_MOBILE_H src/operators/math/math_func_neon.h) + if(FPGAV1) + message("FPGA_V1 enabled") + add_definitions(-DPADDLE_MOBILE_FPGA_V1) + file(GLOB_RECURSE _tmp_list src/operators/kernel/fpga/V2/*.cpp src/fpga/V2/*.cpp) + foreach(f ${_tmp_list}) + list(REMOVE_ITEM PADDLE_MOBILE_CC ${f}) + endforeach() + file(GLOB_RECURSE _tmp_list src/operators/kernel/fpga/V2/*.h src/fpga/V2/*.h) + foreach(f ${_tmp_list}) + list(REMOVE_ITEM PADDLE_MOBILE_H ${f}) + endforeach() + endif() + if(FPGAV2) + message("FPGA_V2 enabled") + add_definitions(-DPADDLE_MOBILE_FPGA_V2) + file(GLOB_RECURSE _tmp_list src/operators/kernel/fpga/V1/*.cpp src/fpga/V1/*.cpp) + foreach(f ${_tmp_list}) + list(REMOVE_ITEM PADDLE_MOBILE_CC ${f}) + endforeach() + file(GLOB_RECURSE _tmp_list src/operators/kernel/fpga/V1/*.h src/fpga/V1/*.h) + foreach(f ${_tmp_list}) + list(REMOVE_ITEM PADDLE_MOBILE_H ${f}) + endforeach() + endif() + else() file(GLOB_RECURSE _tmp_list src/operators/kernel/fpga/*.cpp src/operators/kernel/fpga/*.cc) foreach(f ${_tmp_list}) list(REMOVE_ITEM PADDLE_MOBILE_CC ${f}) endforeach() @@ -124,17 +186,17 @@ endif() if(ANDROID_NDK_TOOLCHAIN_INCLUDED) set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -llog") else() - list(REMOVE_ITEM 
PADDLE_MOBILE_H ${CMAKE_CURRENT_SOURCE_DIR}/src/jni/paddle_mobile_jni.h) - list(REMOVE_ITEM PADDLE_MOBILE_CC ${CMAKE_CURRENT_SOURCE_DIR}/src/jni/paddle_mobile_jni.cpp) + list(REMOVE_ITEM PADDLE_MOBILE_H ${CMAKE_CURRENT_SOURCE_DIR}/src/io/jni/paddle_mobile_jni.h) + list(REMOVE_ITEM PADDLE_MOBILE_CC ${CMAKE_CURRENT_SOURCE_DIR}/src/io/jni/paddle_mobile_jni.cpp) list(REMOVE_ITEM PADDLE_MOBILE_H ${CMAKE_CURRENT_SOURCE_DIR}/src/operators/math/math_func_neon.h) endif() if(IS_IOS) else() - list(REMOVE_ITEM PADDLE_MOBILE_H ${CMAKE_CURRENT_SOURCE_DIR}/src/ios_io/PaddleMobileCPU.h) - list(REMOVE_ITEM PADDLE_MOBILE_CC ${CMAKE_CURRENT_SOURCE_DIR}/src/ios_io/PaddleMobileCPU.mm) - list(REMOVE_ITEM PADDLE_MOBILE_H ${CMAKE_CURRENT_SOURCE_DIR}/src/ios_io/op_symbols.h) -endif() + list(REMOVE_ITEM PADDLE_MOBILE_H ${CMAKE_CURRENT_SOURCE_DIR}/src/io/ios_io/PaddleMobileCPU.h) + list(REMOVE_ITEM PADDLE_MOBILE_CC ${CMAKE_CURRENT_SOURCE_DIR}/src/io/ios_io/PaddleMobileCPU.mm) + list(REMOVE_ITEM PADDLE_MOBILE_H ${CMAKE_CURRENT_SOURCE_DIR}/src/io/ios_io/op_symbols.h) +endif () set(CMAKE_VERBOSE_MAKEFILE ON) set(CMAKE_EXPORT_COMPILE_COMMANDS ON) @@ -143,8 +205,10 @@ set(CMAKE_LIBRARY_OUTPUT_DIRECTORY build) set(CMAKE_RUNTIME_OUTPUT_DIRECTORY build) # NET default -if(FPGA) - set(NET "FPGAnets" CACHE STRING "select net type") +if(FPGAV1) + set(NET "FPGA_NET_V1" CACHE STRING "select net type") +elseif(FPGAV2) + set(NET "FPGA_NET_V2" CACHE STRING "select net type") else() set(NET "default" CACHE STRING "select net type") endif() diff --git a/README.md b/README.md index ee4e20513186979fe76c1259e7fc3ca962426843..2572f25444dc4268e7a6a3f43cfdc1b38dae8e02 100644 --- a/README.md +++ b/README.md @@ -8,46 +8,23 @@ [![License](https://img.shields.io/badge/license-Apache%202-blue.svg)](LICENSE)--> -欢迎来到 Paddle-Mobile GitHub 项目。 - -Paddle-Mobile是PaddlePaddle组织下的项目,是一个致力于嵌入式平台的深度学习的框架。Paddle-Mobile设计思想和PaddlePaddle的最新版fluid版本保持了高度一致,同时针对嵌入式做了大量优化。设计之初就对嵌入式的性能、体积、能耗、硬件平台覆盖等方面做了考虑。 - -## 简单搜索线上效果 - 
-如下gif是简单搜索app的线上主体检测应用效果 - -![ezgif-1-050a733dfb](http://otkwwi4x8.bkt.clouddn.com/2018-07-05-ezgif-1-050a733dfb.gif) - -## Demo目录 - -[点我](https://github.com/PaddlePaddle/paddle-mobile/tree/develop/demo) +欢迎来到 Paddle-Mobile GitHub 项目。Paddle-Mobile是PaddlePaddle组织下的项目,是一个致力于嵌入式平台的深度学习的框架。 ## Features -- **ARM CPU** - -- **Mali GPU** - -- **苹果设备的GPU Metal实现** - -- **FPGA** +- 高性能支持ARM CPU +- 支持Mali GPU +- 支持Adreno GPU +- 支持苹果设备的GPU Metal实现 +- 支持ZU5、ZU9等FPGA开发板 +- 支持树莓派等arm-linux开发板 - 目前已经支持 ZCU102 开发板。 +## Demo +- [ANDROID](https://github.com/xiebaiyuan/paddle-mobile-demo) -- **灵活性** - - * paddle-mobile cpu版不依赖任何第三库, 可进行快速集成。 - * 使用泛型特化进行平台切换, 可灵活切换 cpu、gpu 和其他协处理器。 - * 可根据特定的常见网络, 进行编译特定的 op, 降低编译时间, 减小包大小。 - * 使用 docker 编译, 提供统一的编译环境。 - * 高可拓展性, 方便拓展其他协处理器, 提供高性能 arm 算子实现, 方便其他协处理器开发者集成开发。 - * 直接兼容 paddle-fluid 模型, 不需要额外的转换操作。 - -- **体积** - - paddle-mobile从设计之初就深入考虑到移动端的包体积的问题,cpu实现中没有外部依赖。在编译过程中,如果该网络不需要的op是完全不会被打入的。同时编译选项优化也为体积压缩提供了帮助。 - 除了二进制体积,我们对代码体积极力避免过大。整个仓库的代码体积也非常小。 +### 原Demo目录 +[https://github.com/PaddlePaddle/paddle-mobile/tree/develop/demo](https://github.com/PaddlePaddle/paddle-mobile/tree/develop/demo) ## 文档 @@ -62,6 +39,7 @@ Paddle-Mobile是PaddlePaddle组织下的项目,是一个致力于嵌入式平 * [iOS](https://github.com/PaddlePaddle/paddle-mobile/blob/develop/doc/development_ios.md) * [Android](https://github.com/PaddlePaddle/paddle-mobile/blob/develop/doc/development_android.md) * [FPGA](https://github.com/PaddlePaddle/paddle-mobile/blob/develop/doc/development_fpga.md) +* [ARM_LINUX](https://github.com/PaddlePaddle/paddle-mobile/blob/develop/doc/development_arm_linux.md) ### 贡献文档 - [贡献文档链接](https://github.com/PaddlePaddle/paddle-mobile/blob/develop/CONTRIBUTING.md) @@ -73,18 +51,22 @@ Paddle-Mobile是PaddlePaddle组织下的项目,是一个致力于嵌入式平 ### 1. 直接使用Paddle Fluid训练 该方式最为可靠,推荐方式 ### 2. 
caffe转为Paddle Fluid模型 -[链接](https://github.com/PaddlePaddle/models/tree/develop/fluid/image_classification/caffe2fluid) +[https://github.com/PaddlePaddle/models/tree/develop/fluid/image_classification/caffe2fluid](https://github.com/PaddlePaddle/models/tree/develop/fluid/image_classification/caffe2fluid) ### 3. ONNX ONNX全称为“Open Neural Network Exchange”,即“开放的神经网络切换”。该项目的目的是让不同的神经网络开发框架做到互通互用。 除直接使用PaddlePaddle训练fluid版本的模型外,还可以通过onnx转换得到个别Paddle fluid模型。 -目前,百度也在做onnx支持工作。相关转换项目在这里:[paddle-onnx](https://github.com/PaddlePaddle/paddle-onnx)。 - -![](http://7xop3k.com1.z0.glb.clouddn.com/15311951836000.jpg) +目前,百度也在做onnx支持工作。相关转换项目在这里: +[https://github.com/PaddlePaddle/paddle-onnx](https://github.com/PaddlePaddle/paddle-onnx) ### 4. 部分测试模型和测试图片下载 -[下载链接](http://mms-graph.bj.bcebos.com/paddle-mobile%2FmodelsAndImages.zip) +[http://mms-graph.bj.bcebos.com/paddle-mobile%2FmodelsAndImages.zip](http://mms-graph.bj.bcebos.com/paddle-mobile%2FmodelsAndImages.zip) + + ## 问题解决 @@ -96,5 +78,3 @@ Paddle-Mobile 提供相对宽松的Apache-2.0开源协议 [Apache-2.0 license](L ## 旧版 Mobile-Deep-Learning 原MDL(Mobile-Deep-Learning)工程被迁移到了这里 [Mobile-Deep-Learning](https://github.com/allonli/mobile-deep-learning) - - diff --git a/demo/ReadMe.md b/demo/ReadMe.md index aa71f75cb7526234bb0bb32e2e5e1f93c1789711..c6d7b3def9fb44db86ea4456396c91354953d99d 100644 --- a/demo/ReadMe.md +++ b/demo/ReadMe.md @@ -1,11 +1,10 @@ -## 如何运行demo -- Android demo下载路径 - http://mms-graph.bj.bcebos.com/paddle-mobile%2FPaddleMobile_Android.zip -- iOS demo下载路径: - http://mms-graph.bj.bcebos.com/paddle-mobile%2FPaddleMobileDemo_iOS.zip - -在demo目录下执行下载demo的脚本 +## Demo 下载路径 +- [ANDROID](http://mms-graph.bj.bcebos.com/paddle-mobile%2FPaddleMobile_Android.zip) + +- [IOS](http://mms-graph.bj.bcebos.com/paddle-mobile%2FPaddleMobileDemo_iOS.zip) + +- 原demo亦可使用getDemo.sh进行下载 + ``` sh getDemo.sh ``` -demo工程就下载解压到当前目录中了。 \ No newline at end of file diff --git a/doc/development_android_GPU.md b/doc/development_android_GPU.md new file mode 
100644 index 0000000000000000000000000000000000000000..03750260cf343692e52fd667cb797e27e7b6983d --- /dev/null +++ b/doc/development_android_GPU.md @@ -0,0 +1,85 @@ +## paddle-mobile GPU开发文档 + +编译环境配置方法请参考development_android.md文档 + +1. 下载 paddle-mobile + +``` +git clone https://github.com/PaddlePaddle/paddle-mobile.git + +adb pull /system/vendor/lib/libOpenCL.so paddle-mobile/third_party/opencl + +修改paddle-mobile/CMakeLists.txt文件,执行如下操作: +option(GPU_CL "opencl gpu" OFF)->option(GPU_CL "opencl gpu" ON) + +cd paddle-mobile/tools + +sh build.sh android + +``` +2. 将单测可执行文件和模型部署到手机 + +下载测试需要的mobilenet和test_image_1x3x224x224_float文件,下载地址:http://mms-graph.bj.bcebos.com/paddle-mobile/opencl_test_src.zip + +``` +cd ../test +mkdir models +mkdir images + +``` +将mobilenet复制到paddle-mobile/test/models目录下 +将test_image_1x3x224x224_float复制到paddle-mobile/test/images目录下 + +执行下面命令将可执行文件和预测需要的文件部署到手机 + +``` +cd ../tools/android-debug-script +sh push2android.sh + +``` +3. 在adb shell中执行对应的可执行文件(目前只支持mobilenet,后续会支持更多的网络模型) + +``` +adb shell +cd /data/local/tmp/bin/ +export LD_LIBRARY_PATH=. +./test-mobilenetgpu + +``` +4. mobilenet cpu模型预测结果 + +假设mobilenet和test_image_1x3x224x224_float文件已经推送到手机上,执行下面命令进行mobilenet cpu的预测 + +``` +adb shell +cd /data/local/tmp/bin/ +export LD_LIBRARY_PATH=. +./test-mobilenet + +``` +5. 
预测结果 + + 手机型号:小米6(CPU 835,GPU Adreno 540) + + mobilenet gpu:预测性能,耗时41ms左右。 + + mobilenet cpu: + + 1线程:108ms + 2线程:65ms + 4线程:38ms + + 手机型号:OPPO Findx(CPU 845,GPU Adreno 630) + + mobilenet gpu:预测性能,耗时27ms左右。 + + mobilenet cpu: + + 1线程:90ms + 2线程:50ms + 4线程:29ms + + + + + diff --git a/doc/development_arm_linux.md b/doc/development_arm_linux.md index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..7874179480ec579c8c7e5b46cd0f1905fb4f9c43 100644 --- a/doc/development_arm_linux.md +++ b/doc/development_arm_linux.md @@ -0,0 +1,28 @@ +# ARM_LINUX开发文档 +目前支持直接在arm_linux平台上编译paddle-mobile + +## 以Raspberrypi3为例: +### 执行编译 +在paddle-mobile根目录中,执行以下命令: +``` +cd tools +/bin/bash build.sh arm_linux googlenet +``` +执行完毕后,生成的so位于paddle-mobile/build/release/arm-linux/build目录中,单测可执行文件位于test/build目录中。 + +### 运行 +``` +cd ../build/release/arm-linux/build +export LD_LIBRARY_PATH=. +cd ../../../../test/build/ +./test-googlenet +``` +*注1:如果本地test目录下没有模型的话,会自动下载官方demo模型并解压.* + +*注2:因为arm_linux设备算力限制,建议编译时,根据需要指定编译某个模型(如googlenet)或扩大系统的swap交换空间,避免编译时卡死.* + +## 其他ARM_LINUX平台 + +其他的arm_linux平台可以修改 tools/build.sh中的相关编译参数进行编译。可以参考对应平台的编译选项。 +特别说明的是Android平台请参考Android开发文档. + diff --git a/doc/development_fpga.md b/doc/development_fpga.md index 3389ddde676a5d1c7b452dc734880eb50170bd3e..1f0d6ffb364fc35cda306ad748c45c085d5986d6 100644 --- a/doc/development_fpga.md +++ b/doc/development_fpga.md @@ -1,6 +1,6 @@ # FPGA开发文档 -FPGA平台的代码在Xilinx ZCU102 revision 1.0开发板测试Resnet50成功,预测结果正确。 +FPGA平台的代码分为V1和V2。其中V1在Xilinx ZCU102 revision 1.0开发板测试Resnet50成功,预测结果正确。以下描述适用于复现V1运行的结果。 ## 准备硬件 ___ @@ -17,7 +17,7 @@ ___ ## 编译工程 ___ 1. 将最新的paddle mobile 代码复制到ZCU102开发板中。 -2. 进入paddle-mobile根目录, CMakeLists.txt 设置平台为 option(FPGA "fpga support" ON)。CPU和MALI\_GPU选项设置为OFF。 +2. 进入paddle-mobile根目录, CMakeLists.txt 设置平台为 option(FPGA "fpga support" ON)。CPU和MALI\_GPU选项设置为OFF。设置option(FPGAV1 "fpga v1" ON), option(FPGAV2 "fpga v2" OFF)。 2. 
执行以下命令,可在./test/build下生成test-resnet50可执行程序。 * mkdir build * cd build diff --git a/src/common/common.h b/src/common/common.h index 12157b5e946490d041f0cc0d235142a13a3a2527..c7a681f426f788bcd8ee8f52dbfab3c6e1afeb8f 100644 --- a/src/common/common.h +++ b/src/common/common.h @@ -13,7 +13,9 @@ See the License for the specific language governing permissions and limitations under the License. */ #pragma once -#include +#include // NOLINT + +namespace paddle_mobile { using Time = decltype(std::chrono::high_resolution_clock::now()); @@ -25,3 +27,5 @@ inline double time_diff(Time t1, Time t2) { ms counter = std::chrono::duration_cast(diff); return counter.count() / 1000.0; } + +} // namespace paddle_mobile diff --git a/src/common/enforce.h b/src/common/enforce.h index aebe2a58031cb1341596f07dbf653be4a5e01900..bf21b5b9a2fe5f70b3bd23a581f0c1dfbf373f42 100644 --- a/src/common/enforce.h +++ b/src/common/enforce.h @@ -46,7 +46,8 @@ struct PaddleMobileException : public std::exception { std::string detail(buffer); \ throw paddle_mobile::PaddleMobileException("Custom Exception", buffer, \ __FILE__, __LINE__); \ - } + } \ + exit(0); #define PADDLE_MOBILE_ENFORCE(stat, ...) 
\ { \ diff --git a/src/common/types.cpp b/src/common/types.cpp index 8c8de7765161e61dc75036a87a34fc6abd2df43e..510313d9fee0940d7162ea2c6b09426f6d9ce17a 100644 --- a/src/common/types.cpp +++ b/src/common/types.cpp @@ -40,9 +40,11 @@ const char *G_OP_TYPE_POOL2D = "pool2d"; const char *G_OP_TYPE_PRIOR_BOX = "prior_box"; const char *G_OP_TYPE_RELU = "relu"; const char *G_OP_TYPE_RESHAPE = "reshape"; +const char *G_OP_TYPE_RESHAPE2 = "reshape2"; const char *G_OP_TYPE_SIGMOID = "sigmoid"; const char *G_OP_TYPE_SOFTMAX = "softmax"; const char *G_OP_TYPE_TRANSPOSE = "transpose"; +const char *G_OP_TYPE_TRANSPOSE2 = "transpose2"; const char *G_OP_TYPE_SPLIT = "split"; const char *G_OP_TYPE_FEED = "feed"; const char *G_OP_TYPE_FETCH = "fetch"; @@ -69,6 +71,8 @@ const char *G_OP_TYPE_SUM = "sum"; const char *G_OP_TYPE_QUANTIZE = "quantize"; const char *G_OP_TYPE_DEQUANTIZE = "dequantize"; +extern const char *G_OP_TYPE_TANH = "tanh"; +extern const char *G_OP_TYPE_FUSION_DECONV_RELU = "fusion_deconv_relu"; std::unordered_map< std::string, std::pair, std::vector>> @@ -80,6 +84,7 @@ std::unordered_map< {G_OP_TYPE_FUSION_CONV_ADD, {{"Input"}, {"Out"}}}, {G_OP_TYPE_RELU, {{"X"}, {"Out"}}}, {G_OP_TYPE_SOFTMAX, {{"X"}, {"Out"}}}, + {G_OP_TYPE_SIGMOID, {{"X"}, {"Out"}}}, {G_OP_TYPE_MUL, {{"X"}, {"Out"}}}, {G_OP_TYPE_ELEMENTWISE_ADD, {{"X", "Y"}, {"Out"}}}, {G_OP_TYPE_POOL2D, {{"X"}, {"Out"}}}, @@ -90,6 +95,7 @@ std::unordered_map< {G_OP_TYPE_FEED, {{"X"}, {"Out"}}}, {G_OP_TYPE_FETCH, {{"X"}, {"Out"}}}, {G_OP_TYPE_TRANSPOSE, {{"X"}, {"Out"}}}, + {G_OP_TYPE_TRANSPOSE2, {{"X"}, {"Out", "XShape"}}}, {G_OP_TYPE_BOX_CODER, {{"PriorBox", "PriorBoxVar", "TargetBox"}, {"OutputBox"}}}, {G_OP_TYPE_FUSION_CONV_ADD_BN_RELU, {{"Input"}, {"Out"}}}, @@ -99,6 +105,7 @@ std::unordered_map< {G_OP_TYPE_POLYGON_BOX_TRANSFORM, {{"Input"}, {"Output"}}}, {G_OP_TYPE_FC, {{"X", "Y", "Z"}, {"Out"}}}, {G_OP_TYPE_RESHAPE, {{"X"}, {"Out"}}}, + {G_OP_TYPE_RESHAPE2, {{"X"}, {"Out", "XShape"}}}, 
{G_OP_TYPE_DEPTHWISE_CONV, {{"Input"}, {"Output"}}}, {G_OP_TYPE_FILL_CONSTANT, {{}, {"Out"}}}, {G_OP_TYPE_FUSION_CONV_ADD_RELU, {{"Input"}, {"Out"}}}, @@ -124,5 +131,7 @@ std::unordered_map< {G_OP_TYPE_SUM, {{"X"}, {"Out"}}}, {G_OP_TYPE_ELEMENTWISE_MUL, {{"X", "Y"}, {"Out"}}}, {G_OP_TYPE_QUANTIZE, {{"X"}, {"Out", "OutScale"}}}, - {G_OP_TYPE_DEQUANTIZE, {{"X", "Scale"}, {"Out"}}}}; + {G_OP_TYPE_DEQUANTIZE, {{"X", "Scale"}, {"Out"}}}, + {G_OP_TYPE_TANH, {{"X"}, {"Out"}}}, + {G_OP_TYPE_FUSION_DECONV_RELU, {{"Input"}, {"Out"}}}}; } // namespace paddle_mobile diff --git a/src/common/types.h b/src/common/types.h index 0855bd053f0dc804b6f3289796f3818657675864..4cd35ac91084f6518858c97cf4c0e8da5b09555b 100644 --- a/src/common/types.h +++ b/src/common/types.h @@ -39,7 +39,13 @@ struct PrecisionTrait { }; //! device type -enum DeviceTypeEnum { kINVALID = -1, kCPU = 0, kFPGA = 1, kGPU_MALI = 2 }; +enum DeviceTypeEnum { + kINVALID = -1, + kCPU = 0, + kFPGA = 1, + kGPU_MALI = 2, + kGPU_CL = 3 +}; template struct DeviceType {}; @@ -47,6 +53,7 @@ struct DeviceType {}; typedef DeviceType CPU; typedef DeviceType FPGA; typedef DeviceType GPU_MALI; +typedef DeviceType GPU_CL; //! data type enum DataType { @@ -132,6 +139,9 @@ extern const char *G_OP_TYPE_ELEMENTWISE_MUL; extern const char *G_OP_TYPE_QUANTIZE; extern const char *G_OP_TYPE_DEQUANTIZE; +extern const char *G_OP_TYPE_TANH; +extern const char *G_OP_TYPE_FUSION_DECONV_RELU; + extern std::unordered_map< std::string, std::pair, std::vector>> op_input_output_key; diff --git a/src/fpga/api.cpp b/src/fpga/V1/api.cpp similarity index 99% rename from src/fpga/api.cpp rename to src/fpga/V1/api.cpp index d3f473a7f43714592779de941ed1a6ea53baea83..04e51ab9b09fabc41fcd1cd73864bc285d183821 100644 --- a/src/fpga/api.cpp +++ b/src/fpga/V1/api.cpp @@ -12,15 +12,15 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ -#include "fpga/api.h" +#include "fpga/V1/api.h" #include #include #include #include #include -#include "fpga/bias_scale.h" -#include "fpga/filter.h" -#include "fpga/image.h" +#include "fpga/V1/bias_scale.h" +#include "fpga/V1/filter.h" +#include "fpga/V1/image.h" #define FPGA_TEST_MODE #define PADDLE_MOBILE_OS_LINUX diff --git a/src/fpga/api.h b/src/fpga/V1/api.h similarity index 100% rename from src/fpga/api.h rename to src/fpga/V1/api.h diff --git a/src/fpga/bias_scale.cpp b/src/fpga/V1/bias_scale.cpp similarity index 98% rename from src/fpga/bias_scale.cpp rename to src/fpga/V1/bias_scale.cpp index 50f1ed03f0121b5afdc41d427e5b52675994bd1e..3c2c04dc1d7f76953b04a879fbcfa8377dd7ba8a 100644 --- a/src/fpga/bias_scale.cpp +++ b/src/fpga/V1/bias_scale.cpp @@ -12,9 +12,9 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include "fpga/bias_scale.h" +#include "fpga/V1/bias_scale.h" #include -#include "fpga/api.h" +#include "fpga/V1/api.h" namespace paddle_mobile { namespace fpga { diff --git a/src/fpga/bias_scale.h b/src/fpga/V1/bias_scale.h similarity index 100% rename from src/fpga/bias_scale.h rename to src/fpga/V1/bias_scale.h diff --git a/src/fpga/filter.cpp b/src/fpga/V1/filter.cpp similarity index 99% rename from src/fpga/filter.cpp rename to src/fpga/V1/filter.cpp index db851b926bbbd549205ee5d75bc46a6c04888098..3f4a3e2c876f0b54546f0e385d4a5e8bbfacdf3c 100644 --- a/src/fpga/filter.cpp +++ b/src/fpga/V1/filter.cpp @@ -12,10 +12,10 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ -#include "fpga/filter.h" +#include "fpga/V1/filter.h" #include #include -#include "fpga/api.h" +#include "fpga/V1/api.h" namespace paddle_mobile { namespace fpga { diff --git a/src/fpga/filter.h b/src/fpga/V1/filter.h similarity index 100% rename from src/fpga/filter.h rename to src/fpga/V1/filter.h diff --git a/src/fpga/image.cpp b/src/fpga/V1/image.cpp similarity index 98% rename from src/fpga/image.cpp rename to src/fpga/V1/image.cpp index dac6e2a633155e593550ede4d738c5606cec3283..73be05c942d6a848db830148d25bc8b3e14b53e4 100644 --- a/src/fpga/image.cpp +++ b/src/fpga/V1/image.cpp @@ -12,10 +12,10 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include "fpga/image.h" +#include "fpga/V1/image.h" #include #include -#include "fpga/api.h" +#include "fpga/V1/api.h" namespace paddle_mobile { namespace fpga { diff --git a/src/fpga/image.h b/src/fpga/V1/image.h similarity index 100% rename from src/fpga/image.h rename to src/fpga/V1/image.h diff --git a/src/fpga/V2/api.cpp b/src/fpga/V2/api.cpp new file mode 100644 index 0000000000000000000000000000000000000000..2f8a9f119e643b3836ef2c541e098f39ab3cbd17 --- /dev/null +++ b/src/fpga/V2/api.cpp @@ -0,0 +1,295 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "fpga/V2/api.h" +#include +#include "fpga/V2/bias_scale.h" +#include "fpga/V2/config.h" +#include "fpga/V2/driver/driver.h" +#include "fpga/V2/filter.h" +#include "fpga/V2/image.h" + +namespace paddle_mobile { +namespace fpga { + +static std::map memory_map; + +int open_device() { + int ret = driver::open_device_driver(); + return ret; +} + +int close_device() { + int ret = driver::close_device_driver(); + return ret; +} + +void *fpga_malloc(size_t size) { + static uint64_t counter = 0; +#ifdef PADDLE_MOBILE_ZU5 + auto ptr = driver::fpga_malloc_driver(size); +#else + auto ptr = malloc(size); +#endif + counter += size; + memory_map.insert(std::make_pair(ptr, size)); + // DLOG << "Address: " << ptr << ", " << size << " bytes allocated. Total " + // << counter << " bytes"; + return ptr; +} + +void fpga_free(void *ptr) { + static uint64_t counter = 0; + size_t size = 0; + auto iter = memory_map.find(ptr); // std::map::iterator + if (iter != memory_map.end()) { + size = iter->second; + memory_map.erase(iter); +#ifdef PADDLE_MOBILE_ZU5 + driver::fpga_free_driver(ptr); +#else + free(ptr); +#endif + counter += size; + // DLOG << "Address: " << ptr << ", " << size << " bytes freed. 
Total " + // << counter << " bytes"; + } else { + DLOG << "Invalid pointer"; + } +} +void fpga_copy(void *dest, const void *src, size_t num) { +#ifdef PADDLE_MOBILE_ZU5 + driver::fpga_copy_driver(dest, src, num); +#else + memcpy(dest, src, num); +#endif +} + +int fpga_flush(void *address, size_t size) { +#ifdef PADDLE_MOBILE_ZU5 + return driver::fpga_flush_driver(address, size); +#else + return 0; +#endif +} +int fpga_invalidate(void *address, size_t size) { +#ifdef PADDLE_MOBILE_ZU5 + return driver::fpga_invalidate_driver(address, size); +#else + return 0; +#endif +} + +void format_image(framework::Tensor *image_tensor) { + auto dims = image_tensor->dims(); + auto channel = dims[1], height = dims[2], width = dims[3]; + auto data_ptr = image_tensor->data(); + size_t memory_size = channel * height * width * sizeof(float); + auto new_data = (float *)fpga_malloc(memory_size); // NOLINT + memcpy(new_data, data_ptr, memory_size); + int aligned_channel = filter::calc_aligned_channel((int)channel); // NOLINT + image::format_image(&new_data, (int)channel, (int)height, // NOLINT + (int)width, // NOLINT + aligned_channel); + image_tensor->reset_data_ptr(new_data); +} + +void format_fp16_ofm(framework::Tensor *ofm_tensor, int aligned_channel) { + auto dims = ofm_tensor->dims(); + size_t memory_size = 0; + if (dims.size() == 4) { + auto height = dims[2], width = dims[3]; + memory_size = (height + 1) / 2 * 2 * width * aligned_channel * sizeof(half); + } else if (dims.size() == 2) { + memory_size = aligned_channel * sizeof(half); + } else { + DLOG << "Wrong ofm dimension"; + } + auto p = fpga_malloc(memory_size); + memset(p, 0, memory_size); + ofm_tensor->reset_data_ptr(p); +} + +void format_fp32_ofm(framework::Tensor *ofm_tensor, int aligned_channel) { + auto dims = ofm_tensor->dims(); + size_t memory_size = 0; + if (dims.size() == 4) { + auto height = dims[2], width = dims[3]; + memory_size = height * width * aligned_channel * sizeof(float); + } else if (dims.size() == 2) { + 
memory_size = aligned_channel * sizeof(float); + } else { + DLOG << "Wrong ofm dimension"; + } + auto p = fpga_malloc(memory_size); + memset(p, 0, memory_size); + ofm_tensor->reset_data_ptr(p); +} + +float filter_find_max(framework::Tensor *filter_tensor) { + auto filter_ptr = filter_tensor->data(); + return filter::find_max(filter_ptr, (int)filter_tensor->numel()); // NOLINT +} + +int get_aligned_channel_num(int channel_num) { + return filter::calc_aligned_channel(channel_num); +} + +int get_aligned_filter_num(framework::Tensor *filter_tensor) { + auto dims = filter_tensor->dims(); + return filter::calc_aligned_num((int)dims[0], (int)dims[1]); // NOLINT +} + +int get_conv_output_channel(framework::Tensor *filter_tensor) { + int aligned_filter_num = get_aligned_filter_num(filter_tensor); + return get_aligned_channel_num(aligned_filter_num); +} +void format_filter(framework::Tensor *filter_tensor, float max_value, + int group_num) { + filter_tensor->scale[0] = float(max_value / 127.0); // NOLINT + filter_tensor->scale[1] = float(127.0 / max_value); // NOLINT + auto dims = filter_tensor->dims(); + auto num = dims[0], channel = dims[1], height = dims[2], width = dims[3]; + auto data_ptr = filter_tensor->data(); + size_t memory_size = num * channel * height * width * sizeof(float); + auto new_data = (float *)fpga_malloc(memory_size); // NOLINT + memcpy(new_data, data_ptr, memory_size); + filter::format_filter(&new_data, (int)num, (int)channel, // NOLINT + (int)height, // NOLINT + (int)width, group_num, max_value); // NOLINT + filter_tensor->reset_data_ptr(new_data); +} + +void format_fc_filter(framework::Tensor *filter_tensor, float max_value) { + filter_tensor->scale[0] = float(max_value / 127.0); // NOLINT + filter_tensor->scale[1] = float(127.0 / max_value); // NOLINT + auto dims = filter_tensor->dims(); + auto num = dims[0], channel = dims[1], height = dims[2], width = dims[3]; + auto data_ptr = filter_tensor->data(); + size_t memory_size = num * channel * height * 
width * sizeof(float); + auto new_data = (float *)fpga_malloc(memory_size); // NOLINT + memcpy(new_data, data_ptr, memory_size); + filter::format_fc_filter(&new_data, (int)num, (int)channel, // NOLINT + (int)height, // NOLINT + (int)width, 1, max_value); // NOLINT + filter_tensor->reset_data_ptr(new_data); +} + +void format_bias_scale_array(float **bias_scale_array, int filter_num, + int filter_channel) { + int num_after_alignment = + filter::calc_aligned_num(filter_channel, filter_channel); + bias_scale::format_bias_scale_array(bias_scale_array, filter_num, + num_after_alignment); +} + +void format_concat_output(framework::Tensor *out, int height, int width, + uint32_t out_channel) { + auto data_ptr = fpga_malloc(out_channel * height * width * sizeof(half)); + auto ddim = framework::make_ddim({1, out_channel, height, width}); + out->Resize(ddim); + out->reset_data_ptr(data_ptr); +} + +int format_conv_data(framework::Tensor *filter_tensor, + framework::Tensor *ofm_tensor, float *bs_ptr, int group) { + float max_value = fpga::filter_find_max(filter_tensor); + fpga::format_filter(filter_tensor, max_value, group); + int aligned_num = get_aligned_filter_num(filter_tensor); + fpga::format_bias_scale_array(&bs_ptr, + (int)filter_tensor->dims()[0], // NOLINT + aligned_num); + int aligned_channel = fpga::get_conv_output_channel(filter_tensor); + fpga::format_fp16_ofm(ofm_tensor, aligned_channel); + DLOG << aligned_channel; + return aligned_channel; +} + +int format_fc_data(framework::Tensor *filter_tensor, + framework::Tensor *ofm_tensor, float *bs_ptr) { + float max_value = fpga::filter_find_max(filter_tensor); + fpga::format_fc_filter(filter_tensor, max_value); + int aligned_num = get_aligned_filter_num(filter_tensor); + fpga::format_bias_scale_array(&bs_ptr, + (int)filter_tensor->dims()[0], // NOLINT + aligned_num); + int aligned_channel = fpga::get_conv_output_channel(filter_tensor); + fpga::format_fp16_ofm(ofm_tensor, aligned_channel); + DLOG << aligned_channel; + 
return aligned_channel; +} + +void fill_split_arg(struct SplitConvArgs *arg, framework::Tensor *input, + framework::Tensor *out, framework::Tensor *filter, + bool relu_enabled, int group_num, int stride_h, + int stride_w, int padding_h, int padding_w, float *bs_ptr) { + auto input_ptr = input->data(); + auto filter_ptr = filter->data(); + auto out_ptr = out->data(); + + arg->group_num = (uint32_t)group_num; + arg->split_num = 1; + arg->filter_num = (uint32_t)filter->dims()[0]; + arg->output.address = out_ptr; + arg->output.scale_address = out->scale; + arg->conv_arg = + (ConvArgs *)fpga_malloc(arg->split_num * sizeof(ConvArgs)); // NOLINT + + arg->concat_arg.image_num = arg->split_num; + arg->concat_arg.image_out = out_ptr; + arg->concat_arg.scale_out = out->scale; + arg->concat_arg.height = (uint32_t)out->dims()[2]; + arg->concat_arg.width = (uint32_t)out->dims()[3]; + + int n = arg->split_num; + arg->concat_arg.images_in = + (half **)fpga_malloc(n * sizeof(int *)); // NOLINT + arg->concat_arg.scales_in = + (float **)fpga_malloc(n * sizeof(float *)); // NOLINT + arg->concat_arg.channel_num = + (uint32_t *)fpga_malloc(n * sizeof(uint32_t)); // NOLINT + + for (int i = 0; i < n; i++) { + arg->conv_arg[i].relu_enabled = relu_enabled; + arg->conv_arg[i].sb_address = bs_ptr; + arg->conv_arg[i].filter_address = (int8_t *)filter_ptr; // NOLINT + arg->conv_arg[i].filter_scale_address = filter->scale; + arg->conv_arg[i].filter_num = arg->filter_num; + arg->conv_arg[i].group_num = (uint32_t)group_num; + + arg->conv_arg[i].kernel.stride_h = (uint32_t)stride_h; + arg->conv_arg[i].kernel.stride_w = (uint32_t)stride_w; + arg->conv_arg[i].kernel.height = (uint32_t)filter->dims()[2]; + arg->conv_arg[i].kernel.width = (uint32_t)filter->dims()[3]; + + arg->conv_arg[i].image.address = input_ptr; + arg->conv_arg[i].image.scale_address = input->scale; + arg->conv_arg[i].image.channels = (uint32_t)input->dims()[1]; + arg->conv_arg[i].image.height = (uint32_t)input->dims()[2]; + 
arg->conv_arg[i].image.width = (uint32_t)input->dims()[3]; + arg->conv_arg[i].image.pad_height = (uint32_t)padding_h; + arg->conv_arg[i].image.pad_width = (uint32_t)padding_w; + + arg->conv_arg[i].output.address = out_ptr; + arg->conv_arg[i].output.scale_address = out->scale; + + int num_after_alignment = + filter::calc_aligned_num((int)input->dims()[1], arg->filter_num); + arg->conv_arg[i].free_space = + fpga_malloc(num_after_alignment * 2 * sizeof(half)); + } +} + +} // namespace fpga +} // namespace paddle_mobile diff --git a/src/fpga/V2/api.h b/src/fpga/V2/api.h new file mode 100644 index 0000000000000000000000000000000000000000..1f4a203936b517d93e2d417b08a8b8456cc1fc93 --- /dev/null +++ b/src/fpga/V2/api.h @@ -0,0 +1,59 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#pragma once + +#include "fpga/V2/driver/pe.h" +#include "fpga/V2/fpga_common.h" +#include "framework/tensor.h" + +namespace paddle_mobile { +namespace fpga { + +int open_device(); +int close_device(); +void* fpga_malloc(size_t size); +void fpga_free(void* ptr); +void fpga_copy(void* dest, const void* src, size_t num); +int fpga_flush(void* address, size_t size); +int fpga_invalidate(void* address, size_t size); + +float filter_find_max(framework::Tensor* filter_tensor); +int get_aligned_channel_num(int channel_num); +int get_aligned_filter_num(framework::Tensor* filter_tensor); +int get_conv_output_channel(framework::Tensor* filter_tensor); + +void format_image(framework::Tensor* image_tensor); +void format_fp16_ofm(framework::Tensor* ofm_tensor, + int aligned_channel); // only allocate memory +void format_fp32_ofm(framework::Tensor* ofm_tensor, int aligned_channel); + +void format_filter(framework::Tensor* filter_tensor, float max_value, + int group_num); +void format_fc_filter(framework::Tensor* filter_tensor, float max_value); +void format_bias_scale_array(float** bias_scale_array, int filter_num, + int filter_channel); +void format_concat_output(framework::Tensor* out, int height, int width, + uint32_t out_channel); +int format_conv_data(framework::Tensor* filter_tensor, + framework::Tensor* ofm_tensor, float* bs_ptr, int group); +int format_fc_data(framework::Tensor* filter_tensor, + framework::Tensor* ofm_tensor, float* bs_ptr); +void fill_split_arg(struct SplitConvArgs* arg, framework::Tensor* input, + framework::Tensor* out, framework::Tensor* filter, + bool relu_enabled, int group_num, int stride_h, + int stride_w, int padding_h, int padding_w, float* bs_ptr); + +} // namespace fpga +} // namespace paddle_mobile diff --git a/src/fpga/V2/bias_scale.cpp b/src/fpga/V2/bias_scale.cpp new file mode 100644 index 0000000000000000000000000000000000000000..3afd3f51bbb10e3bb2d66195fcc54d25c56e2393 --- /dev/null +++ b/src/fpga/V2/bias_scale.cpp @@ -0,0 +1,47 
@@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "fpga/V2/bias_scale.h" +#include +#include "fpga/V2/api.h" + +namespace paddle_mobile { +namespace fpga { +namespace bias_scale { + +void align_element(float **data_in, int num, int num_after_alignment) { + float *ptr_unaligned = *data_in; + int total_element = 2 * num_after_alignment; // including bias & scale + float *ptr_aligned = + (float *)fpga_malloc(total_element * sizeof(float)); // NOLINT + memset(ptr_aligned, 0, total_element * sizeof(float)); + + for (int i = 0; i < num; i++) { + ptr_aligned[i * 2 + 0] = ptr_unaligned[i]; + ptr_aligned[i * 2 + 1] = ptr_unaligned[i + num]; + } + + fpga_free(ptr_unaligned); + *data_in = ptr_aligned; +} + +void format_bias_scale_array(float **data_in, int num, + int num_after_alignment) { + align_element(data_in, num, num_after_alignment); + fpga_flush(*data_in, 2 * num_after_alignment * sizeof(float)); +} + +} // namespace bias_scale +} // namespace fpga +} // namespace paddle_mobile diff --git a/src/fpga/V2/bias_scale.h b/src/fpga/V2/bias_scale.h new file mode 100644 index 0000000000000000000000000000000000000000..6040c0bef138631e2d1ada280d7a1fc593915e36 --- /dev/null +++ b/src/fpga/V2/bias_scale.h @@ -0,0 +1,26 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +namespace paddle_mobile { +namespace fpga { +namespace bias_scale { + +void align_element(float **data_in, int num, int num_after_alignment); +void format_bias_scale_array(float **data_in, int num, int num_after_alignment); + +} // namespace bias_scale +} // namespace fpga +} // namespace paddle_mobile diff --git a/src/fpga/V2/config.h b/src/fpga/V2/config.h new file mode 100644 index 0000000000000000000000000000000000000000..27187c7b854c84d501949db41fe89f9dca1d2bf1 --- /dev/null +++ b/src/fpga/V2/config.h @@ -0,0 +1,18 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#define PADDLE_MOBILE_ZU5 +#define FPGA_PRINT_MODE diff --git a/src/fpga/V2/driver/bitmap.cpp b/src/fpga/V2/driver/bitmap.cpp new file mode 100644 index 0000000000000000000000000000000000000000..c612faa6aed11b683ff81fffdf6c57a6fed9536d --- /dev/null +++ b/src/fpga/V2/driver/bitmap.cpp @@ -0,0 +1,131 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. 
+ +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "fpga/V2/driver/bitmap.h" + +namespace fpga_bitmap { +void bitmap_set(uint64_t *map, unsigned int start, int len) { + uint64_t *p = map + BIT_WORD(start); + const unsigned int size = start + len; + int bits_to_set = BITS_PER_LONG - (start % BITS_PER_LONG); + uint64_t mask_to_set = BITMAP_FIRST_WORD_MASK(start); + + while (len - bits_to_set >= 0) { + *p |= mask_to_set; + len -= bits_to_set; + bits_to_set = BITS_PER_LONG; + mask_to_set = ~0UL; + p++; + } + if (len) { + mask_to_set &= BITMAP_LAST_WORD_MASK(size); + *p |= mask_to_set; + } +} + +void bitmap_clear(uint64_t *map, unsigned int start, int len) { + uint64_t *p = map + BIT_WORD(start); + const unsigned int size = start + len; + int bits_to_clear = BITS_PER_LONG - (start % BITS_PER_LONG); + uint64_t mask_to_clear = BITMAP_FIRST_WORD_MASK(start); + + while (len - bits_to_clear >= 0) { + *p &= ~mask_to_clear; + len -= bits_to_clear; + bits_to_clear = BITS_PER_LONG; + mask_to_clear = ~0UL; + p++; + } + if (len) { + mask_to_clear &= BITMAP_LAST_WORD_MASK(size); + *p &= ~mask_to_clear; + } +} + +static uint64_t ffs(uint64_t data) { + uint64_t bit = 0; + int i = 0; + + for (i = 0; i < sizeof(data) * 8; i++) { + if (data & (1UL << i)) { + bit = i; + break; + } + } + + return bit; +} + +static uint64_t _find_next_bit(const uint64_t *addr, uint64_t nbits, + uint64_t start, uint64_t invert) { + uint64_t tmp = 0; + + if (!nbits || start >= nbits) return nbits; + + tmp = addr[start / 
BITS_PER_LONG] ^ invert; + + /* Handle 1st word. */ + tmp &= BITMAP_FIRST_WORD_MASK(start); + start = round_down(start, BITS_PER_LONG); + + while (!tmp) { + start += BITS_PER_LONG; + if (start >= nbits) return nbits; + + tmp = addr[start / BITS_PER_LONG] ^ invert; + } + + return (start + ffs(tmp)) < nbits ? (start + ffs(tmp)) : nbits; +} + +uint64_t find_next_zero_bit(const uint64_t *addr, uint64_t size, + uint64_t offset) { + return _find_next_bit(addr, size, offset, ~0UL); +} + +uint64_t find_next_bit(const uint64_t *addr, uint64_t size, uint64_t offset) { + return _find_next_bit(addr, size, offset, 0UL); +} + +uint64_t bitmap_find_next_zero_area_off(uint64_t *map, uint64_t size, + uint64_t start, unsigned int nr, + uint64_t align_mask, + uint64_t align_offset) { + uint64_t index = 0; + uint64_t end = 0; + uint64_t i = 0; + +again: + index = find_next_zero_bit(map, size, start); + + /* Align allocation */ + index = __ALIGN_MASK(index + align_offset, align_mask) - align_offset; + + end = index + nr; + if (end > size) return end; + i = find_next_bit(map, end, index); + if (i < end) { + start = i + 1; + goto again; + } + + return index; +} + +uint64_t bitmap_find_next_zero_area(uint64_t *map, uint64_t size, + uint64_t start, unsigned int nr, + uint64_t align_mask) { + return bitmap_find_next_zero_area_off(map, size, start, nr, align_mask, 0); +} +} // namespace fpga_bitmap diff --git a/src/fpga/V2/driver/bitmap.h b/src/fpga/V2/driver/bitmap.h new file mode 100644 index 0000000000000000000000000000000000000000..4cb1673d91d61c1ec27bbc6923e49e8dd04e3a37 --- /dev/null +++ b/src/fpga/V2/driver/bitmap.h @@ -0,0 +1,37 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include +#include + +#define BITS_PER_LONG 64 +#define BIT_WORD(nr) ((nr) / BITS_PER_LONG) +#define BITMAP_FIRST_WORD_MASK(start) (~0UL << ((start) & (BITS_PER_LONG - 1))) +#define BITMAP_LAST_WORD_MASK(nbits) (~0UL >> (-(nbits) & (BITS_PER_LONG - 1))) + +#define __ALIGN_KERNEL_MASK(x, mask) (((x) + (mask)) & ~(mask)) +#define __ALIGN_MASK(x, mask) __ALIGN_KERNEL_MASK((x), (mask)) + +#define round_down(x, y) ((x) & ~((y)-1)) + +namespace fpga_bitmap { +void bitmap_set(uint64_t *map, unsigned int start, int len); +void bitmap_clear(uint64_t *map, unsigned int start, int len); +uint64_t bitmap_find_next_zero_area(uint64_t *map, uint64_t size, + uint64_t start, unsigned int nr, + uint64_t align_mask); + +} // namespace fpga_bitmap diff --git a/src/fpga/V2/driver/driver.cpp b/src/fpga/V2/driver/driver.cpp new file mode 100644 index 0000000000000000000000000000000000000000..d7e71782676fd350f938847c03e9736ff0adb64a --- /dev/null +++ b/src/fpga/V2/driver/driver.cpp @@ -0,0 +1,432 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+See the License for the specific language governing permissions and +limitations under the License. */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "common/enforce.h" +#include "fpga/V2/driver/bitmap.h" +#include "fpga/V2/driver/driver.h" + +namespace paddle_mobile { +namespace fpga { +namespace driver { +struct FPGA_INFO g_fpgainfo; + +int open_drvdevice() { + if (g_fpgainfo.fd_drv == -1) { + g_fpgainfo.fd_drv = open(g_fpgainfo.drvdevice_path, O_RDWR); + } + return g_fpgainfo.fd_drv; +} + +int open_memdevice() { + if (g_fpgainfo.fd_mem == -1) { + // g_fpgainfo.fd_mem = open(g_fpgainfo.memdevice_path, O_RDWR | O_DSYNC); + g_fpgainfo.fd_mem = open(g_fpgainfo.memdevice_path, O_RDWR); + } + return g_fpgainfo.fd_mem; +} + +void pl_reset() { + // DLOG << "PL RESET"; + + usleep(100 * 1000); +} + +void setup_pe(struct pe_data_s *pe_data, struct fpga_pe *pe, + char const *type_name, int pe_idx) { + memset(pe, 0, sizeof(struct fpga_pe)); + + pe->outer = pe_data; + snprintf(pe->type_name, MAX_TYPE_NAME_LENTH, "%s", type_name); + + pe->status = IDLE; + pe->interrupt_cnt = 0; + pe_data->pes[pe_idx] = pe; + pe_data->pe_num++; +} + +void pl_init() { + struct pe_data_s *pe_data = nullptr; + + pl_reset(); + + pe_data = (struct pe_data_s *)malloc(sizeof(struct pe_data_s)); + if (pe_data == nullptr) { + DLOG << "pe_data malloc error!"; + return; + } + memset(pe_data, 0, sizeof(struct pe_data_s)); + pthread_mutex_init(&pe_data->mutex, 0); + + setup_pe(pe_data, &pe_data->pe_conv, "CONV", PE_IDX_CONV); + setup_pe(pe_data, &pe_data->pe_pooling, "POOLING", PE_IDX_POOLING); + setup_pe(pe_data, &pe_data->pe_ew, "EW", PE_IDX_EW); + setup_pe(pe_data, &pe_data->pe_bypass, "BYPASS", PE_IDX_BYPASS); + + g_fpgainfo.pe_data = pe_data; +} + +void pl_destroy() { + struct pe_data_s *pe_data = g_fpgainfo.pe_data; + pthread_mutex_destroy(&pe_data->mutex); + free(pe_data); +} + +void pl_start() 
{ + struct pe_data_s *pe_data = g_fpgainfo.pe_data; + + pthread_mutex_unlock(&pe_data->mutex); +} + +void pl_stop() { + struct pe_data_s *pe_data = g_fpgainfo.pe_data; + + pthread_mutex_lock(&pe_data->mutex); +} + +void pl_reinit() { + struct pe_data_s *pe_data = g_fpgainfo.pe_data; + struct fpga_pe *pe = nullptr; + int i = 0; + + pl_stop(); + pl_reset(); + pl_start(); + + for (i = 0; i < pe_data->pe_num; i++) { + pe = pe_data->pes[i]; + pe->status = IDLE; + pe->interrupt_cnt = 0; + } + + pl_start(); +} + +int pl_get_status() { return 0; } + +/*tmie单位us*/ +int fpga_regpoll(uint64_t reg, uint64_t val, int time) { + uint64_t i = 0; + /*timeout精确性待确认*/ + int64_t timeout = time * 6; + + for (i = 0; i < timeout; i++) { + if (val == reg_readq(reg)) { + break; + } + } + + if (i <= timeout) { + return 0; + } else { + return -1; + } +} + +/*内存管理*/ +int memory_request(struct fpga_memory *memory, size_t size, uint64_t *addr) { + uint64_t _nr = DIV_ROUND_UP(size, FPGA_PAGE_SIZE); + unsigned int nr = (unsigned int)_nr; + int ret = 0; + + pthread_mutex_lock(&memory->mutex); + + unsigned int pos = (unsigned int)fpga_bitmap::bitmap_find_next_zero_area( + memory->bitmap, memory->page_num, 0, nr, 0); + if (pos <= memory->page_num) { + uint64_t address_ofset = + memory->mem_start + ((uint64_t)pos) * FPGA_PAGE_SIZE; + fpga_bitmap::bitmap_set(memory->bitmap, pos, nr); + memory->nr[pos] = nr; + + *addr = address_ofset; + } else { + ret = -ENOMEM; + } + + pthread_mutex_unlock(&memory->mutex); + + return ret; +} + +void memory_release(struct fpga_memory *memory) { + void *ptr = nullptr; + + /*unmap memory*/ + std::map map = g_fpgainfo.fpga_addr2size_map; + std::map::iterator iter; + for (iter = map.begin(); iter != map.end(); iter++) { + fpga_free_driver(ptr); + } +} + +int create_fpga_memory_inner(struct fpga_memory *memory, size_t memory_size) { + int rc = 0; + + uint64_t *bitmap = nullptr; + unsigned int *nr = nullptr; + + // 不允许多份memory创建,所以创建memory结构体不存在互斥 + // 
pthread_mutex_lock(&memory->mutex); + memory->page_num = (unsigned int)(memory_size / FPGA_PAGE_SIZE); + memory->page_num_long = DIV_ROUND_UP(memory->page_num, BITS_PER_LONG); + + bitmap = + (uint64_t *)malloc(sizeof(int64_t) * memory->page_num_long); // NOLINT + if (!bitmap) { + rc = -EFAULT; + return rc; + } + memory->bitmap = bitmap; + + nr = (unsigned int *)calloc(memory->page_num, sizeof(unsigned int)); + if (!nr) { + rc = -EFAULT; + free(bitmap); + return rc; + } + memory->nr = nr; + + memory->mem_start = FPGA_MEM_PHY_ADDR; + memory->mem_end = FPGA_MEM_SIZE; + // pthread_mutex_unlock(memory->mutex); + + return rc; +} + +int create_fpga_memory(struct fpga_memory **memory_info) { + int rc = 0; + + *memory_info = (struct fpga_memory *)malloc(sizeof(struct fpga_memory)); + if (*memory_info == NULL) { + rc = -EFAULT; + return rc; + } + pthread_mutex_init(&((*memory_info)->mutex), nullptr); + + rc = create_fpga_memory_inner(*memory_info, FPGA_MEM_SIZE); + if (rc) { + free(*memory_info); + } + + return rc; +} + +int init_fpga_memory(struct fpga_memory *memory) { + int rc = 0; + + if (!memory) { + rc = -EFAULT; + return rc; + } + + fpga_bitmap::bitmap_clear(memory->bitmap, 0, memory->page_num); + fpga_bitmap::bitmap_set(memory->bitmap, 0, 1); // NOTE reserve fpga page 0. 
+ + return 0; +} + +void destroy_fpga_memory(struct fpga_memory *memory) { + if (memory) { + free(memory->nr); + free(memory->bitmap); + free(memory); + } +} + +int fpga_memory_add() { + int rc = 0; + + rc = create_fpga_memory(&g_fpgainfo.memory_info); + if (rc) { + return rc; + } + + rc = init_fpga_memory(g_fpgainfo.memory_info); + if (rc) { + destroy_fpga_memory(g_fpgainfo.memory_info); + return rc; + } + + return 0; +} + +uint64_t vaddr_to_paddr(void *address) { + uint64_t paddr = 0; + auto iter = g_fpgainfo.fpga_vaddr2paddr_map.find(address); + if (iter != g_fpgainfo.fpga_vaddr2paddr_map.end()) { + paddr = iter->second; + } else { + DLOG << "Invalid pointer"; + } + + return paddr; +} + +void *fpga_reg_malloc(size_t size) { + void *ret = nullptr; + ret = mmap64(nullptr, size, PROT_READ | PROT_WRITE, MAP_SHARED, + g_fpgainfo.fd_drv, FPGA_REG_PHY_ADDR); + // PADDLE_MOBILE_ENFORCE(ret != (void *)-1, "Should not be -1"); + + g_fpgainfo.fpga_addr2size_map.insert(std::make_pair(ret, size)); + + return ret; +} + +void *fpga_reg_free(void *ptr) { + size_t size = 0; + + auto iter = g_fpgainfo.fpga_addr2size_map.find(ptr); + if (iter != g_fpgainfo.fpga_addr2size_map.end()) { + size = iter->second; + g_fpgainfo.fpga_addr2size_map.erase(iter); + munmap(ptr, size); + } else { + DLOG << "Invalid pointer"; + } +} + +void *fpga_malloc_driver(size_t size) { + void *ret = nullptr; + uint64_t phy_addr = 0; + int i = 0; + + memory_request(g_fpgainfo.memory_info, size, &phy_addr); + + ret = mmap64(nullptr, size, PROT_READ | PROT_WRITE, MAP_SHARED, + g_fpgainfo.fd_mem, phy_addr); + PADDLE_MOBILE_ENFORCE(ret != (void *)-1, "Should not be -1"); + + g_fpgainfo.fpga_vaddr2paddr_map.insert(std::make_pair(ret, phy_addr)); + g_fpgainfo.fpga_addr2size_map.insert(std::make_pair(ret, size)); + + return ret; +} + +void fpga_free_driver(void *ptr) { + size_t size = 0; + uint32_t pos = 0; + uint64_t p_addr = 0; + + auto iter = g_fpgainfo.fpga_addr2size_map.find(ptr); + if (iter != 
g_fpgainfo.fpga_addr2size_map.end()) { + size = iter->second; + g_fpgainfo.fpga_addr2size_map.erase(iter); + munmap(ptr, size); + + p_addr = vaddr_to_paddr(ptr); + pos = (p_addr - g_fpgainfo.memory_info->mem_start) / FPGA_PAGE_SIZE; + + /*clear bitmap*/ + pthread_mutex_lock(&g_fpgainfo.memory_info->mutex); + fpga_bitmap::bitmap_clear(g_fpgainfo.memory_info->bitmap, pos, + g_fpgainfo.memory_info->nr[pos]); + pthread_mutex_unlock(&g_fpgainfo.memory_info->mutex); + } else { + DLOG << "Invalid pointer"; + } +} + +static inline int do_ioctl(unsigned long req, const void *arg) { + return ioctl(g_fpgainfo.fd_mem, req, arg); +} + +int fpga_flush_driver(void *address, size_t size) { + struct MemoryCacheArgs args; + uint64_t p_addr; + + p_addr = vaddr_to_paddr(address); + + args.offset = (void *)(p_addr - FPGA_MEM_PHY_ADDR); + args.size = size; + + return do_ioctl(IOCTL_MEMCACHE_FLUSH, &args); +} + +int fpga_invalidate_driver(void *address, size_t size) { + struct MemoryCacheArgs args; + uint64_t p_addr; + + p_addr = vaddr_to_paddr(address); + + args.offset = (void *)(p_addr - FPGA_MEM_PHY_ADDR); + args.size = size; + + return do_ioctl(IOCTL_MEMCACHE_INVAL, &args); +} + +void fpga_copy_driver(void *dest, const void *src, size_t num) { + uint64_t i; + + DLOG << "dest:" << dest << " src:" << src << " size:" << num; + + for (i = 0; i < num; i++) { + // DLOG << "i:" << i << " val:" << *((int8_t *)src + i); + // usleep(1); + *((int8_t *)dest + i) = *((int8_t *)src + i); + } + + return; +} + +int open_device_driver() { + g_fpgainfo.FpgaRegPhyAddr = FPGA_REG_PHY_ADDR; + g_fpgainfo.FpgaMemPhyAddr = FPGA_MEM_PHY_ADDR; + g_fpgainfo.FpgaRegVirAddr = nullptr; + g_fpgainfo.pe_data = nullptr; + g_fpgainfo.drvdevice_path = "/dev/fpgadrv0"; + g_fpgainfo.memdevice_path = "/dev/fpgamem0"; + g_fpgainfo.fd_drv = -1; + g_fpgainfo.fd_mem = -1; + + int ret = 0; + ret = open_drvdevice(); + ret |= open_memdevice(); + + g_fpgainfo.FpgaRegVirAddr = + (uint64_t *)fpga_reg_malloc(FPGA_REG_SIZE); // 
NOLINT + fpga_memory_add(); + + pl_init(); + + return ret; +} + +int close_device_driver() { + pl_destroy(); + fpga_reg_free(g_fpgainfo.FpgaRegVirAddr); + memory_release(g_fpgainfo.memory_info); + destroy_fpga_memory(g_fpgainfo.memory_info); + + return 0; +} + +} // namespace driver +} // namespace fpga +} // namespace paddle_mobile diff --git a/src/fpga/V2/driver/driver.h b/src/fpga/V2/driver/driver.h new file mode 100644 index 0000000000000000000000000000000000000000..633e95ea8204ada2a330a6bb4fab4ce8fe23248b --- /dev/null +++ b/src/fpga/V2/driver/driver.h @@ -0,0 +1,143 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#pragma once + +#include +#include +#include +#include +#include + +#include "common/log.h" + +namespace paddle_mobile { +namespace fpga { +namespace driver { + +#define DIV_ROUND_UP(n, d) (((n) + (d)-1) / (d)) + +#define FPGA_REG_PHY_ADDR 0xa0000000 +#define FPGA_REG_SIZE 0x1000 +#define FPGA_MEM_PHY_ADDR 0x20000000 +#define FPGA_MEM_SIZE 0x20000000 + +#define CPU_FREQ 1000000000 + +#define FPGA_PAGE_SIZE (16UL * 1024UL) + +// PE related macros +const int MAX_NUM_PES = 6; +const size_t MAX_TYPE_NAME_LENTH = 8; + +const int PE_IDX_CONV = 0; +const int PE_IDX_POOLING = 1; +const int PE_IDX_EW = 2; +const int PE_IDX_BYPASS = 3; + +enum pe_status { IDLE = 0, BUSY = 1 }; + +struct MemoryCacheArgs { + void *offset; + size_t size; +}; + +#define IOCTL_FPGA_MAGIC 'FPGA' +#define IOCTL_MEMCACHE_INVAL _IOW(IOCTL_FPGA_MAGIC, 12, struct MemoryCacheArgs) +#define IOCTL_MEMCACHE_FLUSH _IOW(IOCTL_FPGA_MAGIC, 13, struct MemoryCacheArgs) + +struct fpga_pe { + char type_name[MAX_TYPE_NAME_LENTH + 1]; + struct pe_data_s *outer; + pe_status status; // 0=idle 1=busy -1=fail + uint64_t interrupt_cnt; +}; + +struct pe_data_s { + pthread_mutex_t mutex; + struct fpga_pe pe_conv; + struct fpga_pe pe_pooling; + struct fpga_pe pe_ew; + struct fpga_pe pe_bypass; + + struct fpga_pe *pes[MAX_NUM_PES]; + int pe_num; +}; + +struct fpga_memory { + pthread_mutex_t mutex; + uint64_t *bitmap; + unsigned int *nr; + unsigned int page_num; + unsigned int page_num_long; + uint64_t mem_start; + uint64_t mem_end; +}; + +struct FPGA_INFO { + uint64_t FpgaRegPhyAddr; + uint64_t FpgaMemPhyAddr; + pthread_t poll_pid; + void *FpgaRegVirAddr; + struct pe_data_s *pe_data; + + std::map fpga_addr2size_map; + std::map fpga_vaddr2paddr_map; + const char *drvdevice_path; + const char *memdevice_path; + struct fpga_memory *memory_info; + int fd_drv; + int fd_mem; +}; + +extern struct FPGA_INFO g_fpgainfo; + +inline uint64_t reg_readq(uint32_t offset) { + // DLOG << "offset : " << offset; + uint64_t value = 
*(volatile uint64_t *)((uint8_t *)g_fpgainfo.FpgaRegVirAddr + + offset); // NOLINT + + return value; +} + +inline void reg_writeq(uint64_t value, uint32_t offset) { + // DLOG << "offset : " << offset << ", value : " << value; + *(volatile uint64_t *)((uint8_t *)g_fpgainfo.FpgaRegVirAddr + + offset) = // NOLINT + value; +} + +int open_device_driver(); + +int close_device_driver(); + +void *fpga_malloc_driver(size_t size); + +void fpga_free_driver(void *ptr); + +void fpga_copy_driver(void *dest, const void *src, size_t num); + +int fpga_flush_driver(void *address, size_t size); + +int fpga_invalidate_driver(void *address, size_t size); + +/*pe*/ + +uint64_t vaddr_to_paddr(void *address); + +int fpga_regpoll(uint64_t reg, uint64_t val, int time); + +} // namespace driver +} // namespace fpga +} // namespace paddle_mobile diff --git a/src/fpga/V2/driver/pe.cpp b/src/fpga/V2/driver/pe.cpp new file mode 100644 index 0000000000000000000000000000000000000000..2e806bfb37c131fad1c011c960bc79aa1b121186 --- /dev/null +++ b/src/fpga/V2/driver/pe.cpp @@ -0,0 +1,244 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "fpga/V2/driver/pe.h" +#include "fpga/V2/config.h" +#include "fpga/V2/driver/driver.h" +#include "fpga/V2/filter.h" +#include "fpga/V2/image.h" + +namespace paddle_mobile { +namespace fpga { +#define MUL8(x) ((x)*8) +#define BYPASS_DONE 1 + +float Findfp16Max() { + uint16_t abs_vals[16]; + uint64_t max_fp16; + + max_fp16 = driver::reg_readq(MUL8(49)); + abs_vals[0] = (uint16_t)(0x0000007f & (max_fp16)); // NOLINT + abs_vals[1] = (uint16_t)(0x0000007f & (max_fp16 >> 16)); // NOLINT + abs_vals[2] = (uint16_t)(0x0000007f & (max_fp16 >> 32)); // NOLINT + abs_vals[3] = (uint16_t)(0x0000007f & (max_fp16 >> 48)); // NOLINT + max_fp16 = driver::reg_readq(MUL8(50)); + abs_vals[4] = (uint16_t)(0x0000007f & (max_fp16)); // NOLINT + abs_vals[5] = (uint16_t)(0x0000007f & (max_fp16 >> 16)); // NOLINT + abs_vals[6] = (uint16_t)(0x0000007f & (max_fp16 >> 32)); // NOLINT + abs_vals[7] = (uint16_t)(0x0000007f & (max_fp16 >> 48)); // NOLINT + max_fp16 = driver::reg_readq(MUL8(51)); + abs_vals[8] = (uint16_t)(0x0000007f & (max_fp16)); // NOLINT + abs_vals[9] = (uint16_t)(0x0000007f & (max_fp16 >> 16)); // NOLINT + abs_vals[10] = (uint16_t)(0x0000007f & (max_fp16 >> 32)); // NOLINT + abs_vals[11] = (uint16_t)(0x0000007f & (max_fp16 >> 48)); // NOLINT + max_fp16 = driver::reg_readq(MUL8(52)); + abs_vals[12] = (uint16_t)(0x0000007f & (max_fp16)); + abs_vals[13] = (uint16_t)(0x0000007f & (max_fp16 >> 16)); // NOLINT + abs_vals[14] = (uint16_t)(0x0000007f & (max_fp16 >> 32)); // NOLINT + abs_vals[15] = (uint16_t)(0x0000007f & (max_fp16 >> 48)); // NOLINT + + uint16_t tmp = 0; + for (int i = 0; i < 16; i++) { + if (tmp < abs_vals[i]) { + tmp = abs_vals[i]; + } + } + return fp16_2_fp32(tmp) / 127.0f; +} + +int ComputeFpgaConv(const struct SplitConvArgs &args) { + ComputeBasicConv(args.conv_arg[0]); +} + +int ComputeBasicConv(const struct ConvArgs &args) { +#ifdef FPGA_PRINT_MODE + DLOG << "======Compute Basic Conv======"; + DLOG << " relu_enabled:" << args.relu_enabled + << " 
sb_address:" << args.sb_address + << " filter_address:" << args.filter_address + << " filter_num:" << args.filter_num + << " group_num:" << args.group_num; + DLOG << " image_address:" << args.image.address + << " image_scale_address:" << args.image.scale_address + << " image_channels:" << args.image.channels + << " image_height:" << args.image.height + << " image_width:" << args.image.width + << " pad_height:" << args.image.pad_height + << " pad_width:" << args.image.pad_width; + DLOG << " kernel_height:" << args.kernel.height + << " kernel_width:" << args.kernel.width + << " stride_h:" << args.kernel.stride_h + << " stride_w:" << args.kernel.stride_w; + DLOG << " out_address:" << args.output.address + << " out_scale_address:" << args.output.scale_address; +#endif + +#ifndef PADDLE_MOBILE_ZU5 + return 0; +#endif + + return 0; +} + +int ComputeFpgaPool(const struct PoolingArgs &args) { +#ifdef FPGA_PRINT_MODE + DLOG << "=============ComputeFpgaPool==========="; + DLOG << " mode:" << args.mode + << " kernel_reciprocal:" << fp16_2_fp32(args.kernel_reciprocal); + DLOG << " image_address:" << args.image.address + << " image_scale_address:" << args.image.scale_address + << " image_channels:" << args.image.channels + << " image_height:" << args.image.height + << " image_width:" << args.image.width + << " pad_height:" << args.image.pad_height + << " pad_width:" << args.image.pad_width; + DLOG << " kernel_height:" << args.kernel.height + << " kernel_width:" << args.kernel.width + << " stride_h:" << args.kernel.stride_h + << " stride_w:" << args.kernel.stride_w; + DLOG << " out_address:" << args.output.address + << " out_scale_address:" << args.output.scale_address; +#endif +#ifndef PADDLE_MOBILE_ZU5 + return 0; +#endif + return 0; +} + +int ComputeFpgaEWAdd(const struct EWAddArgs &args) { +#ifdef FPGA_PRINT_MODE + DLOG << "=============ComputeFpgaEWAdd==========="; + DLOG << " relu_enabled:" << args.relu_enabled + << " const0:" << fp16_2_fp32(int16_t(args.const0)) + << " 
const1:" << fp16_2_fp32(int16_t(args.const1)); + DLOG << " image0_address:" << args.image0.address + << " image0_scale_address:" << args.image0.scale_address + << " image0_channels:" << args.image0.channels + << " image0_height:" << args.image0.height + << " image0_width:" << args.image0.width + << " pad0_height:" << args.image0.pad_height + << " pad0_width:" << args.image0.pad_width; + DLOG << " image1_address:" << args.image1.address + << " image1_scale_address:" << args.image1.scale_address + << " image1_channels:" << args.image1.channels + << " image1_height:" << args.image1.height + << " image1_width:" << args.image1.width + << " pad1_height:" << args.image1.pad_height + << " pad_width:" << args.image1.pad_width; + DLOG << " out_address:" << args.output.address + << " out_scale_address:" << args.output.scale_address; +#endif +#ifndef PADDLE_MOBILE_ZU5 + return 0; +#endif + return 0; +} + +int PerformBypass(const struct BypassArgs &args) { +#ifdef FPGA_PRINT_MODE + DLOG << "=============ComputeFpgaBypass==========="; + DLOG << " input_type:" << args.input_data_type + << " output_type:" << args.output_data_type + << " input_layout_type:" << args.input_layout_type + << " output_layout_type:" << args.output_layout_type; + DLOG << " image_address:" << args.image.address + << " image_scale_address:" << args.image.scale_address + << " image_channels:" << args.image.channels + << " image_height:" << args.image.height + << " image_width:" << args.image.width + << " pad_height:" << args.image.pad_height + << " pad_width:" << args.image.pad_width; + DLOG << " out_address:" << args.output.address + << " out_scale_address:" << args.output.scale_address; +#endif +#ifndef PADDLE_MOBILE_ZU5 + return 0; +#endif + + uint64_t ifm_src_paddr = driver::vaddr_to_paddr(args.image.address); + uint64_t ifm_dst_paddr = driver::vaddr_to_paddr(args.output.address); + uint64_t bp_enable; + int64_t length; + uint64_t pixels; + + // fp32->fp16 + if ((args.input_data_type) && 
(!args.output_data_type)) { + pixels = (args.image.channels) * (args.image.width) * (args.image.height); + length = pixels * sizeof(float); + bp_enable = 0x8800000000000000 + length; + } + // fp16->fp32 + else if ((!args.input_data_type) && (args.output_data_type)) { + pixels = filter::calc_aligned_channel((args.image.channels)) * + (args.image.width) * (args.image.height); + length = pixels * sizeof(short); + length = align_to_x((int)length, 64); // NOLINT + bp_enable = 0x8a00000000000000 + length; + } + // fp16->fp16 findmax + else if ((!args.input_data_type) && (!args.output_data_type)) { + pixels = (args.image.channels) * (args.image.width) * (args.image.height); + length = pixels * sizeof(short); + bp_enable = 0x8900000000000000 + length; + } else { + return -1; + } + + // start bypass + driver::reg_writeq(ifm_src_paddr, MUL8(27)); + driver::reg_writeq(ifm_dst_paddr, MUL8(28)); + driver::reg_writeq(0, MUL8(0)); + driver::reg_writeq(bp_enable, MUL8(0)); + // poll + int ret = -1; + ret = driver::fpga_regpoll(MUL8(48), BYPASS_DONE, 0xffffffff); + if (ret != -1) { + // clear "irq" + driver::reg_readq(MUL8(63)); + } + // get max value + if ((!args.input_data_type) && (!args.output_data_type)) { + float scale = Findfp16Max(); + args.output.scale_address[0] = (float)(1.0 / scale); // NOLINT + args.output.scale_address[1] = scale; + } + return ret; +} + +int ComputeFPGAConcat(const struct ConcatArgs &args) { +#ifdef FPGA_PRINT_MODE + DLOG << "=============ComputeFpgaConcat==========="; + DLOG << " Image_num: " << args.image_num + << " out_address:" << args.image_out + << " out_scale_address:" << args.scale_out + << " out_channel:" << args.out_channel; + DLOG << " image_height:" << args.height << " image_width:" << args.width; + for (int i = 0; i < args.image_num; i++) { + DLOG << " " << i << "th: "; + DLOG << " channel_num:" << args.channel_num[i] + << " aligned_channel_num:" << args.aligned_channel_num[i] + << " image_address:" << args.images_in[i] + << " 
image_scale_address:" << args.scales_in[i]; + } +#endif + + image::concat_images(args.images_in, args.scales_in, args.image_out, + args.scale_out, args.image_num, args.channel_num, + args.height, args.width, args.aligned_channel_num, + args.out_channel); + return 0; +} + +} // namespace fpga +} // namespace paddle_mobile diff --git a/src/fpga/V2/driver/pe.h b/src/fpga/V2/driver/pe.h new file mode 100644 index 0000000000000000000000000000000000000000..4903bf4c33f6b5d5899c56eeaada8c7a21d1a875 --- /dev/null +++ b/src/fpga/V2/driver/pe.h @@ -0,0 +1,30 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ +#pragma once + +#include "fpga/V2/fpga_common.h" + +namespace paddle_mobile { +namespace fpga { + +int PerformBypass(const struct BypassArgs& args); +int ComputeBasicConv(const struct ConvArgs& args); +int ComputeFpgaPool(const struct PoolingArgs& args); +int ComputeFpgaEWAdd(const struct EWAddArgs& args); + +int ComputeFpgaConv(const struct SplitConvArgs& args); +int ComputeFPGAConcat(const struct ConcatArgs& args); + +} // namespace fpga +} // namespace paddle_mobile diff --git a/src/fpga/V2/filter.cpp b/src/fpga/V2/filter.cpp new file mode 100644 index 0000000000000000000000000000000000000000..ce278edbeed64f2ca413c1f75ff620ee1f44c83d --- /dev/null +++ b/src/fpga/V2/filter.cpp @@ -0,0 +1,156 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. 
+ +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "fpga/V2/filter.h" +#include +#include +#include "fpga/V2/api.h" + +namespace paddle_mobile { +namespace fpga { +namespace filter { + +int calc_channel_parallelism(int channel) { + if (channel <= 16) { + return 16; + } else if (channel <= 32) { + return 32; + } else if (channel <= 64) { + return 64; + } else { + return 128; + } +} +int calc_aligned_channel(int channel) { + return align_to_x(channel, calc_channel_parallelism(channel)); +} + +int calc_num_parallelism(int channel) { + return FILTER_PARALLELISM / calc_channel_parallelism(channel); +} + +int calc_aligned_num(int num, int channel) { + return align_to_x(num, calc_num_parallelism(channel)); +} + +int calc_aligned_total_pixel_num(int num, int channel, int height, int width) { + int aligned_channel = calc_aligned_channel(channel); + int aligned_filter_num = calc_aligned_num(num, channel); + return aligned_filter_num * aligned_channel * height * width; +} + +void convert_to_hwc(float **data_in, int num, int channel, int height, + int width) { + float *tmp = *data_in; + int chw = channel * height * width; + float *data_tmp = (float *)fpga_malloc(chw * num * sizeof(float)); // NOLINT + for (int n = 0; n < num; n++) { + int64_t amount_per_row = width * channel; + for (int c = 0; c < channel; c++) { + for (int h = 0; h < height; h++) { + int64_t offset_height = h * amount_per_row; + for (int w = 0; w < width; w++) { + *(data_tmp + n * chw + offset_height + w * channel + c) = 
+ *((*data_in)++); + } + } + } + } + *data_in = data_tmp; + fpga_free(tmp); +} + +void align_filter(float **data_in, int num, int channel, int height, + int width) { + int aligned_channel = calc_channel_parallelism(channel); + int hw = height * width; + int pixel_num = calc_aligned_total_pixel_num(num, channel, height, width); + float *new_data = (float *)fpga_malloc(pixel_num * sizeof(float)); // NOLINT + float *temp = *data_in; + memset(new_data, 0, pixel_num * sizeof(float)); + for (int i = 0; i < num; i++) { + for (int j = 0; j < hw; j++) { + memcpy(new_data + i * aligned_channel * hw + j * aligned_channel, + temp + i * channel * hw + j * channel, channel * sizeof(float)); + } + } + *data_in = new_data; + fpga_free(temp); +} + +void format_filter(float **data_in, int num, int channel, int height, int width, + int group_num, float max) { + convert_to_hwc(data_in, num, channel, height, width); + align_filter(data_in, num, channel, height, width); + int pixel_num = calc_aligned_total_pixel_num(num, channel, height, width); + fpga_flush(*data_in, pixel_num * sizeof(float)); +} + +void convert_fc_filter(float **data_in, int num, int chw) { + float *tmp = *data_in; + float *data_tmp = (float *)fpga_malloc(chw * num * sizeof(float)); // NOLINT + for (int n = 0; n < num; n++) { + for (int c = 0; c < chw; c++) { + data_tmp[n * chw + c] = (*data_in)[num * c + n]; + } + } + *data_in = data_tmp; + fpga_free(tmp); +} + +void format_fc_filter(float **data_in, int num, int channel, int height, + int width, int group_num, float max) { + int chw = channel * height * width; + convert_fc_filter(data_in, num, chw); + align_filter(data_in, num, channel, height, width); + int pixel_num = calc_aligned_total_pixel_num(num, channel, height, width); + fpga_flush(*data_in, pixel_num * sizeof(float)); +} + +float find_max(float *data_in, int data_size) { + float max = 0.0; + for (int i = 0; i < data_size; ++i) { + float value = data_in[i]; + float abs = value > 0 ? 
value : -value; + max = std::max(max, abs); + } + return max; +} + +signed char float_to_int8(float fdata) { + if (fdata < 0.0) { + fdata -= 0.5; + } else { + fdata += 0.5; + } + return (signed char)fdata; +} + +void quantize(float **data_in, int data_size, float max) { + float *tmp = *data_in; + float fix_range = 127; + float scale = fix_range / max; + + signed char *tmp_data = (signed char *)fpga_malloc(data_size * sizeof(char)); + for (int i = 0; i < data_size; i++) { + tmp_data[i] = float_to_int8( + (*data_in)[i] * scale); // (signed char)((*data_in)[i] * scale); + } + *data_in = (float *)tmp_data; // NOLINT + fpga_free(tmp); +} + +} // namespace filter +} // namespace fpga +} // namespace paddle_mobile diff --git a/src/fpga/V2/filter.h b/src/fpga/V2/filter.h new file mode 100644 index 0000000000000000000000000000000000000000..08c758bca4a65d232f6dd69eef9c752558b01da0 --- /dev/null +++ b/src/fpga/V2/filter.h @@ -0,0 +1,37 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#pragma once + +#define FILTER_PARALLELISM 1024 +namespace paddle_mobile { +namespace fpga { +namespace filter { + +int calc_channel_parallelism(int channel); +int calc_aligned_channel(int channel); +int calc_num_parallelism(int channel); +int calc_aligned_num(int num, int channel); +int calc_aligned_total_pixel_num(int num, int channel, int height, int width); +void convert_to_hwc(float** data_in, int num, int channel, int height, + int width); +void format_filter(float** data_in, int num, int channel, int height, int width, + int group_num, float max); +void convert_fc_filter(float** data_in, int num, int chw); +void format_fc_filter(float** data_in, int num, int channel, int height, + int width, int group_num, float max); +float find_max(float* data_in, int data_size); +} // namespace filter +} // namespace fpga +} // namespace paddle_mobile diff --git a/src/fpga/V2/fpga_common.cpp b/src/fpga/V2/fpga_common.cpp new file mode 100644 index 0000000000000000000000000000000000000000..01bca30a9ccf79232e1f28bbf77b1c030632f5bc --- /dev/null +++ b/src/fpga/V2/fpga_common.cpp @@ -0,0 +1,44 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include +namespace paddle_mobile { +namespace fpga { + +int16_t fp32_2_fp16(float fp32_num) { + unsigned long tmp = *(unsigned long *)(&fp32_num); // NOLINT + auto t = (int16_t)(((tmp & 0x007fffff) >> 13) | ((tmp & 0x80000000) >> 16) | + (((tmp & 0x7f800000) >> 13) - (112 << 10))); + if (tmp & 0x1000) { + t++; // roundoff + } + return t; +} + +float fp16_2_fp32(int16_t fp16_num) { + if (0 == fp16_num) { + return 0; + } + int frac = (fp16_num & 0x3ff); + int exp = ((fp16_num & 0x7c00) >> 10) + 112; + int s = fp16_num & 0x8000; + int tmp = 0; + float fp32_num; + tmp = s << 16 | exp << 23 | frac << 13; + fp32_num = *(float *)&tmp; // NOLINT + return fp32_num; +} + +} // namespace fpga +} // namespace paddle_mobile diff --git a/src/fpga/V2/fpga_common.h b/src/fpga/V2/fpga_common.h new file mode 100644 index 0000000000000000000000000000000000000000..1862d843503ee8faf58caf038202e198ca079905 --- /dev/null +++ b/src/fpga/V2/fpga_common.h @@ -0,0 +1,125 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#pragma once + +#include + +namespace paddle_mobile { +namespace fpga { + +enum DataType { + DATA_TYPE_FP32 = 1, + DATA_TYPE_FP16 = 0, +}; + +enum LayoutType { + LAYOUT_CHW = 1, + LAYOUT_HWC = 0, +}; + +struct KernelArgs { + uint32_t width; + uint32_t height; + uint32_t stride_w; + uint32_t stride_h; +}; + +struct ImageInputArgs { + void* address; // input featuremap virtual address + float* scale_address; // input scale address; + uint32_t channels; + uint32_t width; // featuremap width + uint32_t height; + uint32_t pad_width; // padding width; + uint32_t pad_height; +}; + +struct ImageOutputArgs { + void* address; // output result address; + float* scale_address; // output scale address; + uint64_t timer_cnt; // time counter for FPGA computation +}; + +struct ConvArgs { + bool relu_enabled; + void* sb_address; // scale and bias + void* filter_address; + float* filter_scale_address; + void* free_space; // used by FPGA logic + uint32_t filter_num; + uint32_t group_num; + + struct KernelArgs kernel; + struct ImageInputArgs image; // input image; + struct ImageOutputArgs output; +}; + +struct ConcatArgs { + uint32_t image_num; + int16_t** images_in; + float** scales_in; + void* image_out; + float* scale_out; + uint32_t* channel_num; + uint32_t* aligned_channel_num; + uint32_t out_channel; + uint32_t height; + uint32_t width; +}; + +struct SplitConvArgs { + uint32_t split_num; + uint32_t group_num; + uint32_t filter_num; + struct ImageOutputArgs output; + struct ConvArgs* conv_arg; + struct ConcatArgs concat_arg; +}; + +struct PoolingArgs { + int16_t mode; // mode: 0:max, 1:avg + int16_t kernel_reciprocal; + struct KernelArgs kernel; + struct ImageInputArgs image; // input image; + struct ImageOutputArgs output; +}; + +struct EWAddArgs { + bool relu_enabled; + uint32_t const0; // output0 = const0 x input0 + const1 x input1; + uint32_t const1; + struct ImageInputArgs image0; + struct ImageInputArgs image1; + struct ImageOutputArgs output; +}; + +struct BypassArgs 
{ + enum DataType input_data_type; + enum DataType output_data_type; + enum LayoutType input_layout_type; + enum LayoutType output_layout_type; + struct ImageInputArgs image; + struct ImageOutputArgs output; +}; + +struct DeconvArgs { + struct ConvArgs conv_arg; +}; +static inline int align_to_x(int num, int x) { return (num + x - 1) / x * x; } +int16_t fp32_2_fp16(float fp32_num); +float fp16_2_fp32(int16_t fp16_num); + +} // namespace fpga +} // namespace paddle_mobile diff --git a/src/fpga/V2/image.cpp b/src/fpga/V2/image.cpp new file mode 100644 index 0000000000000000000000000000000000000000..26829bfba65f2375b27251070b33b2bbe57d069b --- /dev/null +++ b/src/fpga/V2/image.cpp @@ -0,0 +1,94 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "fpga/V2/image.h" +#include +#include +#include "fpga/V2/api.h" + +namespace paddle_mobile { +namespace fpga { +namespace image { + +void convert_to_hwc(float **data_in, int channel, int height, int width) { + float *tmp = *data_in; + float *data_tmp = + (float *)fpga_malloc(channel * height * width * sizeof(float)); // NOLINT + int64_t amount_per_row = width * channel; + for (int c = 0; c < channel; c++) { + for (int h = 0; h < height; h++) { + int64_t offset_height = h * amount_per_row; + for (int w = 0; w < width; w++) { + *(data_tmp + offset_height + w * channel + c) = *((*data_in)++); + } + } + } + *data_in = data_tmp; + fpga_free(tmp); +} +void align_image(float **data_in, int channel, int height, int width, + int aligned_channel) { + if (channel == aligned_channel) return; + float *tmp = *data_in; + float *new_data = + (float *)fpga_malloc(aligned_channel * height * width * // NOLINT + sizeof(float)); // NOLINT + memset(new_data, 0, aligned_channel * height * width * sizeof(float)); + + for (int i = 0; i < height * width; i++) { + memcpy(new_data + i * aligned_channel, tmp + i * channel, + channel * sizeof(float)); + } + *data_in = new_data; + fpga_free(tmp); +} + +void format_image(float **data_in, int channel, int height, int width, + int aligned_channel) { + convert_to_hwc(data_in, channel, height, width); + align_image(data_in, channel, height, width, aligned_channel); + fpga_flush(*data_in, aligned_channel * height * width * sizeof(float)); +} + +void concat_images(int16_t **images_in, float **scales_in, void *image_out, + float *scale_out, int image_num, const uint32_t *channel_num, + int height, int width, const uint32_t *aligned_channel_num, + int out_channel) { + int hw = height * width; + scale_out[0] = 0.0; + scale_out[1] = 0.0; + for (int i = 0; i < image_num; i++) { + scale_out[0] = std::max(*scale_out, scales_in[i][0]); + fpga_invalidate(images_in[i], + height * width * aligned_channel_num[i] * sizeof(int16_t)); + } + 
scale_out[1] = 1 / scale_out[0]; + + for (int j = 0; j < hw; j++) { + int tmp_channel_sum = 0; + for (int i = 0; i < image_num; i++) { + memcpy( + (int16_t *)image_out + j * out_channel + tmp_channel_sum, // NOLINT + images_in[i] + j * aligned_channel_num[i], + channel_num[i] * sizeof(int16_t)); + + tmp_channel_sum += channel_num[i]; + } + } + fpga_flush(image_out, hw * out_channel * sizeof(int16_t)); +} + +} // namespace image +} // namespace fpga +} // namespace paddle_mobile diff --git a/src/fpga/V2/image.h b/src/fpga/V2/image.h new file mode 100644 index 0000000000000000000000000000000000000000..df20e583fc64e3544eb24bee7aeaf3652331180c --- /dev/null +++ b/src/fpga/V2/image.h @@ -0,0 +1,36 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#pragma once + +#include + +namespace paddle_mobile { +namespace fpga { +namespace image { + +void convert_to_hwc(float **data_in, int channel, int height, int width); +void align_image(float **data_in, int channel, int height, int width, + int aligned_channel); +void format_image(float **data_in, int channel, int height, int width, + int aligned_channel); +void concat_images( + int16_t **images_in, float **scales_in, void *image_out, float *scale_out, + int image_num, const uint32_t *channel_num, int height, int width, + const uint32_t *aligned_channel_num, + int out_channel); // Concat featuremaps along channel direction + +} // namespace image +} // namespace fpga +} // namespace paddle_mobile diff --git a/src/framework/attribute.h b/src/framework/attribute.h index a94346bc7ab321b0f5710a98fb3cc60198f148b0..a21e0a4ec321dbfe08f87160cc2f0c159594920d 100644 --- a/src/framework/attribute.h +++ b/src/framework/attribute.h @@ -117,9 +117,9 @@ class Attribute { template static typename Vistor::type_t ApplyVistor(Vistor vistor, Attribute attr) { - if (attr.variant_.TypeId() == typeid(int).hash_code()) { + if (attr.variant_.TypeId() == typeid(int).hash_code()) { // NOLINT return vistor(attr.variant_.Get()); - } else if (attr.variant_.TypeId() == typeid(float).hash_code()) { + } else if (attr.variant_.TypeId() == typeid(float).hash_code()) { // NOLINT return vistor(attr.variant_.Get()); } else if (attr.variant_.TypeId() == typeid(string).hash_code()) { return vistor(attr.variant_.GetString()); @@ -129,7 +129,7 @@ class Attribute { return vistor(attr.variant_.Get>()); } else if (attr.variant_.TypeId() == typeid(vector).hash_code()) { return vistor(attr.variant_.Get>()); - } else if (attr.variant_.TypeId() == typeid(bool).hash_code()) { + } else if (attr.variant_.TypeId() == typeid(bool).hash_code()) { // NOLINT return vistor(attr.variant_.Get()); } else if (attr.variant_.TypeId() == typeid(vector).hash_code()) { return vistor(attr.variant_.Get>()); @@ -137,7 +137,6 @@ 
class Attribute { return vistor(attr.variant_.Get()); } else { PADDLE_MOBILE_THROW_EXCEPTION("type not support"); - exit(0); } } diff --git a/src/framework/cl/cl_deleter.h b/src/framework/cl/cl_deleter.h new file mode 100644 index 0000000000000000000000000000000000000000..55af631174ae9f2a7815c2da35ebadda3ebfd9e9 --- /dev/null +++ b/src/framework/cl/cl_deleter.h @@ -0,0 +1,59 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include "CL/cl.h" + +struct CLKernelDeleter { + template + void operator()(T *clKernelObj) { + clReleaseKernel(clKernelObj); + } +}; + +struct CLMemDeleter { + template + void operator()(T *clMemObj) { + clReleaseMemObject(clMemObj); + } +}; + +struct CLEventDeleter { + template + void operator()(T *clEventObj) { + clReleaseEvent(clEventObj); + } +}; + +struct CLCommQueueDeleter { + template + void operator()(T *clQueueObj) { + clReleaseCommandQueue(clQueueObj); + } +}; + +struct CLContextDeleter { + template + void operator()(T *clContextObj) { + clReleaseContext(clContextObj); + } +}; + +struct CLProgramDeleter { + template + void operator()(T *clProgramObj) { + clReleaseProgram(clProgramObj); + } +}; diff --git a/src/framework/cl/cl_engine.cpp b/src/framework/cl/cl_engine.cpp new file mode 100644 index 0000000000000000000000000000000000000000..04d1675227aac0967f8dee94aa7a27ae5ea73c0f --- /dev/null +++ b/src/framework/cl/cl_engine.cpp @@ -0,0 +1,131 @@ +/* Copyright (c) 2018 
PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "framework/cl/cl_engine.h" +#include "CL/cl.h" +#include "framework/cl/cl_tool.h" + +#include +#include + +namespace paddle_mobile { +namespace framework { + +bool CLEngine::Init() { + if (initialized_) { + return true; + } + cl_int status; + SetPlatform(); + SetClDeviceId(); + + initialized_ = true; + return initialized_; + // setClCommandQueue(); + // std::string filename = "./HelloWorld_Kernel.cl"; + // loadKernelFromFile(filename.c_str()); + // buildProgram(); +} + +CLEngine *CLEngine::Instance() { + static CLEngine cl_engine_; + cl_engine_.Init(); + return &cl_engine_; +} + +bool CLEngine::SetPlatform() { + platform_ = NULL; // the chosen platform + cl_uint numPlatforms; // the NO. of platforms + cl_int status = clGetPlatformIDs(0, NULL, &numPlatforms); + + /**For clarity, choose the first available platform. 
*/ + if (numPlatforms > 0) { + cl_platform_id *platforms = reinterpret_cast( + malloc(numPlatforms * sizeof(cl_platform_id))); + status = clGetPlatformIDs(numPlatforms, platforms, NULL); + platform_ = platforms[0]; + free(platforms); + return true; + } else { + return false; + } +} + +bool CLEngine::SetClDeviceId() { + cl_uint numDevices = 0; + devices_ = NULL; + cl_int status = + clGetDeviceIDs(platform_, CL_DEVICE_TYPE_GPU, 0, NULL, &numDevices); + + if (numDevices > 0) { + devices_ = reinterpret_cast( + malloc(numDevices * sizeof(cl_device_id))); + status = clGetDeviceIDs(platform_, CL_DEVICE_TYPE_GPU, numDevices, devices_, + NULL); + return true; + } + return false; +} + +// std::unique_ptr<_cl_kernel, clKernel_deleter> CLEngine::GSetKernel( +// const std::string &kernel_name) { +// std::unique_ptr<_cl_kernel, clKernel_deleter> kernel( +// clCreateKernel(program_.get(), kernel_name.c_str(), NULL)); +// return std::move(kernel); +//} +// +// bool CLEngine::SetClCommandQueue() { +// cl_int status; +// command_queue_.reset( +// clCreateCommandQueue(context_.get(), devices_[0], 0, &status)); +// return true; +//} + +// bool CLEngine::SetClContext() { +// context_.reset(clCreateContext(NULL, 1, devices_, NULL, NULL, NULL)); +// return true; +//} + +// bool CLEngine::LoadKernelFromFile(const char *kernel_file) { +// size_t size; +// char *str; +// std::fstream f(kernel_file, (std::fstream::in | std::fstream::binary)); +// +// if (!f.is_open()) { +// return false; +// } +// +// size_t fileSize; +// f.seekg(0, std::fstream::end); +// size = fileSize = (size_t)f.tellg(); +// f.seekg(0, std::fstream::beg); +// str = new char[size + 1]; +// if (!str) { +// f.close(); +// return 0; +// } +// +// f.read(str, fileSize); +// f.close(); +// str[size] = '\0'; +// const char *source = str; +// size_t sourceSize[] = {strlen(source)}; +// program_.reset( +// clCreateProgramWithSource(context_.get(), 1, &source, sourceSize, +// NULL)); +// return true; +//} + +} // namespace 
framework +} // namespace paddle_mobile diff --git a/src/framework/cl/cl_engine.h b/src/framework/cl/cl_engine.h new file mode 100644 index 0000000000000000000000000000000000000000..d7b1c912dac304660f39e0e294122d0d27eb9bb6 --- /dev/null +++ b/src/framework/cl/cl_engine.h @@ -0,0 +1,150 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include +#include + +#include "CL/cl.h" +#include "common/enforce.h" +#include "common/log.h" +#include "framework/cl/cl_deleter.h" +#include "framework/cl/cl_tool.h" + +namespace paddle_mobile { +namespace framework { + +class CLEngine { + public: + static CLEngine *Instance(); + + bool Init(); + + std::unique_ptr<_cl_context, CLContextDeleter> CreateContext() { + cl_int status; + cl_context c = clCreateContext(NULL, 1, devices_, NULL, NULL, &status); + std::unique_ptr<_cl_context, CLContextDeleter> context_ptr(c); + CL_CHECK_ERRORS(status); + return std::move(context_ptr); + } + + std::unique_ptr<_cl_command_queue, CLCommQueueDeleter> CreateClCommandQueue( + cl_context context) { + cl_int status; + cl_command_queue queue = + clCreateCommandQueue(context, devices_[0], 0, &status); + std::unique_ptr<_cl_command_queue, CLCommQueueDeleter> command_queue_ptr( + queue); + CL_CHECK_ERRORS(status); + return std::move(command_queue_ptr); + } + + std::unique_ptr<_cl_program, CLProgramDeleter> CreateProgramWith( + cl_context context, std::string file_name) { + FILE *file = 
fopen(file_name.c_str(), "rb"); + PADDLE_MOBILE_ENFORCE(file != nullptr, "can't open file: %s ", + file_name.c_str()); + fseek(file, 0, SEEK_END); + int64_t size = ftell(file); + PADDLE_MOBILE_ENFORCE(size > 0, "size is too small"); + rewind(file); + char *data = new char[size + 1]; + size_t bytes_read = fread(data, 1, size, file); + data[size] = '\0'; + PADDLE_MOBILE_ENFORCE(bytes_read == size, + "read binary file bytes do not match with fseek"); + fclose(file); + + const char *source = data; + size_t sourceSize[] = {strlen(source)}; + cl_program p = + clCreateProgramWithSource(context, 1, &source, sourceSize, &status_); + + DLOG << " cl kernel file name: " << file_name; + DLOG << " source size: " << sourceSize[0]; + CL_CHECK_ERRORS(status_); + + std::unique_ptr<_cl_program, CLProgramDeleter> program_ptr(p); + + return std::move(program_ptr); + } + + std::unique_ptr<_cl_event, CLEventDeleter> CreateEvent(cl_context context) { + cl_event event = clCreateUserEvent(context, &status_); + std::unique_ptr<_cl_event, CLEventDeleter> event_ptr(event); + CL_CHECK_ERRORS(status_); + return std::move(event_ptr); + } + + bool BuildProgram(cl_program program) { + cl_int status; + std::string path = "-cl-fast-relaxed-math -I " + + CLEngine::Instance()->GetCLPath() + "/cl_kernel"; + + status = clBuildProgram(program, 0, 0, path.c_str(), 0, 0); + + CL_CHECK_ERRORS(status); + + if (status_ == CL_BUILD_PROGRAM_FAILURE) { + size_t log_size; + clGetProgramBuildInfo(program, CLEngine::Instance()->DeviceID(), + CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size); + char *log = reinterpret_cast(malloc(log_size)); + clGetProgramBuildInfo(program, CLEngine::Instance()->DeviceID(), + CL_PROGRAM_BUILD_LOG, log_size, log, NULL); + DLOG << " program build error: " << log; + } + + if (status == CL_SUCCESS) { + return true; + } else { + return false; + } + } + + cl_device_id DeviceID(int index = 0) { return devices_[index]; } + + std::string GetCLPath() { return cl_path_; } + void setClPath(std::string 
cl_path) { cl_path_ = cl_path; } + + private: + CLEngine() { initialized_ = false; } + + bool SetPlatform(); + + bool SetClDeviceId(); + + bool initialized_; + + cl_platform_id platform_; + + cl_device_id *devices_; + + cl_int status_; + + std::string cl_path_; + std::unique_ptr<_cl_program, CLProgramDeleter> program_; + + // bool SetClContext(); + + // bool SetClCommandQueue(); + + // bool LoadKernelFromFile(const char *kernel_file); + + // bool BuildProgram(); +}; + +} // namespace framework +} // namespace paddle_mobile diff --git a/src/framework/cl/cl_half.cpp b/src/framework/cl/cl_half.cpp new file mode 100644 index 0000000000000000000000000000000000000000..2877289325d983d0c7d9756732254e0a4ed831b6 --- /dev/null +++ b/src/framework/cl/cl_half.cpp @@ -0,0 +1,518 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +// ftp://ftp.fox-toolkit.org/pub/fasthalffloatconversion.pdf + +#include "framework/cl/cl_half.h" + +namespace paddle_mobile { +namespace framework { + +static const uint32_t mantissatable[2048] = { + 0x00000000, 0x33800000, 0x34000000, 0x34400000, 0x34800000, 0x34a00000, + 0x34c00000, 0x34e00000, 0x35000000, 0x35100000, 0x35200000, 0x35300000, + 0x35400000, 0x35500000, 0x35600000, 0x35700000, 0x35800000, 0x35880000, + 0x35900000, 0x35980000, 0x35a00000, 0x35a80000, 0x35b00000, 0x35b80000, + 0x35c00000, 0x35c80000, 0x35d00000, 0x35d80000, 0x35e00000, 0x35e80000, + 0x35f00000, 0x35f80000, 0x36000000, 0x36040000, 0x36080000, 0x360c0000, + 0x36100000, 0x36140000, 0x36180000, 0x361c0000, 0x36200000, 0x36240000, + 0x36280000, 0x362c0000, 0x36300000, 0x36340000, 0x36380000, 0x363c0000, + 0x36400000, 0x36440000, 0x36480000, 0x364c0000, 0x36500000, 0x36540000, + 0x36580000, 0x365c0000, 0x36600000, 0x36640000, 0x36680000, 0x366c0000, + 0x36700000, 0x36740000, 0x36780000, 0x367c0000, 0x36800000, 0x36820000, + 0x36840000, 0x36860000, 0x36880000, 0x368a0000, 0x368c0000, 0x368e0000, + 0x36900000, 0x36920000, 0x36940000, 0x36960000, 0x36980000, 0x369a0000, + 0x369c0000, 0x369e0000, 0x36a00000, 0x36a20000, 0x36a40000, 0x36a60000, + 0x36a80000, 0x36aa0000, 0x36ac0000, 0x36ae0000, 0x36b00000, 0x36b20000, + 0x36b40000, 0x36b60000, 0x36b80000, 0x36ba0000, 0x36bc0000, 0x36be0000, + 0x36c00000, 0x36c20000, 0x36c40000, 0x36c60000, 0x36c80000, 0x36ca0000, + 0x36cc0000, 0x36ce0000, 0x36d00000, 0x36d20000, 0x36d40000, 0x36d60000, + 0x36d80000, 0x36da0000, 0x36dc0000, 0x36de0000, 0x36e00000, 0x36e20000, + 0x36e40000, 0x36e60000, 0x36e80000, 0x36ea0000, 0x36ec0000, 0x36ee0000, + 0x36f00000, 0x36f20000, 0x36f40000, 0x36f60000, 0x36f80000, 0x36fa0000, + 0x36fc0000, 0x36fe0000, 0x37000000, 0x37010000, 0x37020000, 0x37030000, + 0x37040000, 0x37050000, 0x37060000, 0x37070000, 0x37080000, 0x37090000, + 0x370a0000, 0x370b0000, 0x370c0000, 0x370d0000, 0x370e0000, 0x370f0000, + 0x37100000, 
0x37110000, 0x37120000, 0x37130000, 0x37140000, 0x37150000, + 0x37160000, 0x37170000, 0x37180000, 0x37190000, 0x371a0000, 0x371b0000, + 0x371c0000, 0x371d0000, 0x371e0000, 0x371f0000, 0x37200000, 0x37210000, + 0x37220000, 0x37230000, 0x37240000, 0x37250000, 0x37260000, 0x37270000, + 0x37280000, 0x37290000, 0x372a0000, 0x372b0000, 0x372c0000, 0x372d0000, + 0x372e0000, 0x372f0000, 0x37300000, 0x37310000, 0x37320000, 0x37330000, + 0x37340000, 0x37350000, 0x37360000, 0x37370000, 0x37380000, 0x37390000, + 0x373a0000, 0x373b0000, 0x373c0000, 0x373d0000, 0x373e0000, 0x373f0000, + 0x37400000, 0x37410000, 0x37420000, 0x37430000, 0x37440000, 0x37450000, + 0x37460000, 0x37470000, 0x37480000, 0x37490000, 0x374a0000, 0x374b0000, + 0x374c0000, 0x374d0000, 0x374e0000, 0x374f0000, 0x37500000, 0x37510000, + 0x37520000, 0x37530000, 0x37540000, 0x37550000, 0x37560000, 0x37570000, + 0x37580000, 0x37590000, 0x375a0000, 0x375b0000, 0x375c0000, 0x375d0000, + 0x375e0000, 0x375f0000, 0x37600000, 0x37610000, 0x37620000, 0x37630000, + 0x37640000, 0x37650000, 0x37660000, 0x37670000, 0x37680000, 0x37690000, + 0x376a0000, 0x376b0000, 0x376c0000, 0x376d0000, 0x376e0000, 0x376f0000, + 0x37700000, 0x37710000, 0x37720000, 0x37730000, 0x37740000, 0x37750000, + 0x37760000, 0x37770000, 0x37780000, 0x37790000, 0x377a0000, 0x377b0000, + 0x377c0000, 0x377d0000, 0x377e0000, 0x377f0000, 0x37800000, 0x37808000, + 0x37810000, 0x37818000, 0x37820000, 0x37828000, 0x37830000, 0x37838000, + 0x37840000, 0x37848000, 0x37850000, 0x37858000, 0x37860000, 0x37868000, + 0x37870000, 0x37878000, 0x37880000, 0x37888000, 0x37890000, 0x37898000, + 0x378a0000, 0x378a8000, 0x378b0000, 0x378b8000, 0x378c0000, 0x378c8000, + 0x378d0000, 0x378d8000, 0x378e0000, 0x378e8000, 0x378f0000, 0x378f8000, + 0x37900000, 0x37908000, 0x37910000, 0x37918000, 0x37920000, 0x37928000, + 0x37930000, 0x37938000, 0x37940000, 0x37948000, 0x37950000, 0x37958000, + 0x37960000, 0x37968000, 0x37970000, 0x37978000, 0x37980000, 0x37988000, + 0x37990000, 
0x37998000, 0x379a0000, 0x379a8000, 0x379b0000, 0x379b8000, + 0x379c0000, 0x379c8000, 0x379d0000, 0x379d8000, 0x379e0000, 0x379e8000, + 0x379f0000, 0x379f8000, 0x37a00000, 0x37a08000, 0x37a10000, 0x37a18000, + 0x37a20000, 0x37a28000, 0x37a30000, 0x37a38000, 0x37a40000, 0x37a48000, + 0x37a50000, 0x37a58000, 0x37a60000, 0x37a68000, 0x37a70000, 0x37a78000, + 0x37a80000, 0x37a88000, 0x37a90000, 0x37a98000, 0x37aa0000, 0x37aa8000, + 0x37ab0000, 0x37ab8000, 0x37ac0000, 0x37ac8000, 0x37ad0000, 0x37ad8000, + 0x37ae0000, 0x37ae8000, 0x37af0000, 0x37af8000, 0x37b00000, 0x37b08000, + 0x37b10000, 0x37b18000, 0x37b20000, 0x37b28000, 0x37b30000, 0x37b38000, + 0x37b40000, 0x37b48000, 0x37b50000, 0x37b58000, 0x37b60000, 0x37b68000, + 0x37b70000, 0x37b78000, 0x37b80000, 0x37b88000, 0x37b90000, 0x37b98000, + 0x37ba0000, 0x37ba8000, 0x37bb0000, 0x37bb8000, 0x37bc0000, 0x37bc8000, + 0x37bd0000, 0x37bd8000, 0x37be0000, 0x37be8000, 0x37bf0000, 0x37bf8000, + 0x37c00000, 0x37c08000, 0x37c10000, 0x37c18000, 0x37c20000, 0x37c28000, + 0x37c30000, 0x37c38000, 0x37c40000, 0x37c48000, 0x37c50000, 0x37c58000, + 0x37c60000, 0x37c68000, 0x37c70000, 0x37c78000, 0x37c80000, 0x37c88000, + 0x37c90000, 0x37c98000, 0x37ca0000, 0x37ca8000, 0x37cb0000, 0x37cb8000, + 0x37cc0000, 0x37cc8000, 0x37cd0000, 0x37cd8000, 0x37ce0000, 0x37ce8000, + 0x37cf0000, 0x37cf8000, 0x37d00000, 0x37d08000, 0x37d10000, 0x37d18000, + 0x37d20000, 0x37d28000, 0x37d30000, 0x37d38000, 0x37d40000, 0x37d48000, + 0x37d50000, 0x37d58000, 0x37d60000, 0x37d68000, 0x37d70000, 0x37d78000, + 0x37d80000, 0x37d88000, 0x37d90000, 0x37d98000, 0x37da0000, 0x37da8000, + 0x37db0000, 0x37db8000, 0x37dc0000, 0x37dc8000, 0x37dd0000, 0x37dd8000, + 0x37de0000, 0x37de8000, 0x37df0000, 0x37df8000, 0x37e00000, 0x37e08000, + 0x37e10000, 0x37e18000, 0x37e20000, 0x37e28000, 0x37e30000, 0x37e38000, + 0x37e40000, 0x37e48000, 0x37e50000, 0x37e58000, 0x37e60000, 0x37e68000, + 0x37e70000, 0x37e78000, 0x37e80000, 0x37e88000, 0x37e90000, 0x37e98000, + 0x37ea0000, 
0x37ea8000, 0x37eb0000, 0x37eb8000, 0x37ec0000, 0x37ec8000, + 0x37ed0000, 0x37ed8000, 0x37ee0000, 0x37ee8000, 0x37ef0000, 0x37ef8000, + 0x37f00000, 0x37f08000, 0x37f10000, 0x37f18000, 0x37f20000, 0x37f28000, + 0x37f30000, 0x37f38000, 0x37f40000, 0x37f48000, 0x37f50000, 0x37f58000, + 0x37f60000, 0x37f68000, 0x37f70000, 0x37f78000, 0x37f80000, 0x37f88000, + 0x37f90000, 0x37f98000, 0x37fa0000, 0x37fa8000, 0x37fb0000, 0x37fb8000, + 0x37fc0000, 0x37fc8000, 0x37fd0000, 0x37fd8000, 0x37fe0000, 0x37fe8000, + 0x37ff0000, 0x37ff8000, 0x38000000, 0x38004000, 0x38008000, 0x3800c000, + 0x38010000, 0x38014000, 0x38018000, 0x3801c000, 0x38020000, 0x38024000, + 0x38028000, 0x3802c000, 0x38030000, 0x38034000, 0x38038000, 0x3803c000, + 0x38040000, 0x38044000, 0x38048000, 0x3804c000, 0x38050000, 0x38054000, + 0x38058000, 0x3805c000, 0x38060000, 0x38064000, 0x38068000, 0x3806c000, + 0x38070000, 0x38074000, 0x38078000, 0x3807c000, 0x38080000, 0x38084000, + 0x38088000, 0x3808c000, 0x38090000, 0x38094000, 0x38098000, 0x3809c000, + 0x380a0000, 0x380a4000, 0x380a8000, 0x380ac000, 0x380b0000, 0x380b4000, + 0x380b8000, 0x380bc000, 0x380c0000, 0x380c4000, 0x380c8000, 0x380cc000, + 0x380d0000, 0x380d4000, 0x380d8000, 0x380dc000, 0x380e0000, 0x380e4000, + 0x380e8000, 0x380ec000, 0x380f0000, 0x380f4000, 0x380f8000, 0x380fc000, + 0x38100000, 0x38104000, 0x38108000, 0x3810c000, 0x38110000, 0x38114000, + 0x38118000, 0x3811c000, 0x38120000, 0x38124000, 0x38128000, 0x3812c000, + 0x38130000, 0x38134000, 0x38138000, 0x3813c000, 0x38140000, 0x38144000, + 0x38148000, 0x3814c000, 0x38150000, 0x38154000, 0x38158000, 0x3815c000, + 0x38160000, 0x38164000, 0x38168000, 0x3816c000, 0x38170000, 0x38174000, + 0x38178000, 0x3817c000, 0x38180000, 0x38184000, 0x38188000, 0x3818c000, + 0x38190000, 0x38194000, 0x38198000, 0x3819c000, 0x381a0000, 0x381a4000, + 0x381a8000, 0x381ac000, 0x381b0000, 0x381b4000, 0x381b8000, 0x381bc000, + 0x381c0000, 0x381c4000, 0x381c8000, 0x381cc000, 0x381d0000, 0x381d4000, + 0x381d8000, 
0x381dc000, 0x381e0000, 0x381e4000, 0x381e8000, 0x381ec000, + 0x381f0000, 0x381f4000, 0x381f8000, 0x381fc000, 0x38200000, 0x38204000, + 0x38208000, 0x3820c000, 0x38210000, 0x38214000, 0x38218000, 0x3821c000, + 0x38220000, 0x38224000, 0x38228000, 0x3822c000, 0x38230000, 0x38234000, + 0x38238000, 0x3823c000, 0x38240000, 0x38244000, 0x38248000, 0x3824c000, + 0x38250000, 0x38254000, 0x38258000, 0x3825c000, 0x38260000, 0x38264000, + 0x38268000, 0x3826c000, 0x38270000, 0x38274000, 0x38278000, 0x3827c000, + 0x38280000, 0x38284000, 0x38288000, 0x3828c000, 0x38290000, 0x38294000, + 0x38298000, 0x3829c000, 0x382a0000, 0x382a4000, 0x382a8000, 0x382ac000, + 0x382b0000, 0x382b4000, 0x382b8000, 0x382bc000, 0x382c0000, 0x382c4000, + 0x382c8000, 0x382cc000, 0x382d0000, 0x382d4000, 0x382d8000, 0x382dc000, + 0x382e0000, 0x382e4000, 0x382e8000, 0x382ec000, 0x382f0000, 0x382f4000, + 0x382f8000, 0x382fc000, 0x38300000, 0x38304000, 0x38308000, 0x3830c000, + 0x38310000, 0x38314000, 0x38318000, 0x3831c000, 0x38320000, 0x38324000, + 0x38328000, 0x3832c000, 0x38330000, 0x38334000, 0x38338000, 0x3833c000, + 0x38340000, 0x38344000, 0x38348000, 0x3834c000, 0x38350000, 0x38354000, + 0x38358000, 0x3835c000, 0x38360000, 0x38364000, 0x38368000, 0x3836c000, + 0x38370000, 0x38374000, 0x38378000, 0x3837c000, 0x38380000, 0x38384000, + 0x38388000, 0x3838c000, 0x38390000, 0x38394000, 0x38398000, 0x3839c000, + 0x383a0000, 0x383a4000, 0x383a8000, 0x383ac000, 0x383b0000, 0x383b4000, + 0x383b8000, 0x383bc000, 0x383c0000, 0x383c4000, 0x383c8000, 0x383cc000, + 0x383d0000, 0x383d4000, 0x383d8000, 0x383dc000, 0x383e0000, 0x383e4000, + 0x383e8000, 0x383ec000, 0x383f0000, 0x383f4000, 0x383f8000, 0x383fc000, + 0x38400000, 0x38404000, 0x38408000, 0x3840c000, 0x38410000, 0x38414000, + 0x38418000, 0x3841c000, 0x38420000, 0x38424000, 0x38428000, 0x3842c000, + 0x38430000, 0x38434000, 0x38438000, 0x3843c000, 0x38440000, 0x38444000, + 0x38448000, 0x3844c000, 0x38450000, 0x38454000, 0x38458000, 0x3845c000, + 0x38460000, 
0x38464000, 0x38468000, 0x3846c000, 0x38470000, 0x38474000, + 0x38478000, 0x3847c000, 0x38480000, 0x38484000, 0x38488000, 0x3848c000, + 0x38490000, 0x38494000, 0x38498000, 0x3849c000, 0x384a0000, 0x384a4000, + 0x384a8000, 0x384ac000, 0x384b0000, 0x384b4000, 0x384b8000, 0x384bc000, + 0x384c0000, 0x384c4000, 0x384c8000, 0x384cc000, 0x384d0000, 0x384d4000, + 0x384d8000, 0x384dc000, 0x384e0000, 0x384e4000, 0x384e8000, 0x384ec000, + 0x384f0000, 0x384f4000, 0x384f8000, 0x384fc000, 0x38500000, 0x38504000, + 0x38508000, 0x3850c000, 0x38510000, 0x38514000, 0x38518000, 0x3851c000, + 0x38520000, 0x38524000, 0x38528000, 0x3852c000, 0x38530000, 0x38534000, + 0x38538000, 0x3853c000, 0x38540000, 0x38544000, 0x38548000, 0x3854c000, + 0x38550000, 0x38554000, 0x38558000, 0x3855c000, 0x38560000, 0x38564000, + 0x38568000, 0x3856c000, 0x38570000, 0x38574000, 0x38578000, 0x3857c000, + 0x38580000, 0x38584000, 0x38588000, 0x3858c000, 0x38590000, 0x38594000, + 0x38598000, 0x3859c000, 0x385a0000, 0x385a4000, 0x385a8000, 0x385ac000, + 0x385b0000, 0x385b4000, 0x385b8000, 0x385bc000, 0x385c0000, 0x385c4000, + 0x385c8000, 0x385cc000, 0x385d0000, 0x385d4000, 0x385d8000, 0x385dc000, + 0x385e0000, 0x385e4000, 0x385e8000, 0x385ec000, 0x385f0000, 0x385f4000, + 0x385f8000, 0x385fc000, 0x38600000, 0x38604000, 0x38608000, 0x3860c000, + 0x38610000, 0x38614000, 0x38618000, 0x3861c000, 0x38620000, 0x38624000, + 0x38628000, 0x3862c000, 0x38630000, 0x38634000, 0x38638000, 0x3863c000, + 0x38640000, 0x38644000, 0x38648000, 0x3864c000, 0x38650000, 0x38654000, + 0x38658000, 0x3865c000, 0x38660000, 0x38664000, 0x38668000, 0x3866c000, + 0x38670000, 0x38674000, 0x38678000, 0x3867c000, 0x38680000, 0x38684000, + 0x38688000, 0x3868c000, 0x38690000, 0x38694000, 0x38698000, 0x3869c000, + 0x386a0000, 0x386a4000, 0x386a8000, 0x386ac000, 0x386b0000, 0x386b4000, + 0x386b8000, 0x386bc000, 0x386c0000, 0x386c4000, 0x386c8000, 0x386cc000, + 0x386d0000, 0x386d4000, 0x386d8000, 0x386dc000, 0x386e0000, 0x386e4000, + 0x386e8000, 
0x386ec000, 0x386f0000, 0x386f4000, 0x386f8000, 0x386fc000, + 0x38700000, 0x38704000, 0x38708000, 0x3870c000, 0x38710000, 0x38714000, + 0x38718000, 0x3871c000, 0x38720000, 0x38724000, 0x38728000, 0x3872c000, + 0x38730000, 0x38734000, 0x38738000, 0x3873c000, 0x38740000, 0x38744000, + 0x38748000, 0x3874c000, 0x38750000, 0x38754000, 0x38758000, 0x3875c000, + 0x38760000, 0x38764000, 0x38768000, 0x3876c000, 0x38770000, 0x38774000, + 0x38778000, 0x3877c000, 0x38780000, 0x38784000, 0x38788000, 0x3878c000, + 0x38790000, 0x38794000, 0x38798000, 0x3879c000, 0x387a0000, 0x387a4000, + 0x387a8000, 0x387ac000, 0x387b0000, 0x387b4000, 0x387b8000, 0x387bc000, + 0x387c0000, 0x387c4000, 0x387c8000, 0x387cc000, 0x387d0000, 0x387d4000, + 0x387d8000, 0x387dc000, 0x387e0000, 0x387e4000, 0x387e8000, 0x387ec000, + 0x387f0000, 0x387f4000, 0x387f8000, 0x387fc000, 0x38000000, 0x38002000, + 0x38004000, 0x38006000, 0x38008000, 0x3800a000, 0x3800c000, 0x3800e000, + 0x38010000, 0x38012000, 0x38014000, 0x38016000, 0x38018000, 0x3801a000, + 0x3801c000, 0x3801e000, 0x38020000, 0x38022000, 0x38024000, 0x38026000, + 0x38028000, 0x3802a000, 0x3802c000, 0x3802e000, 0x38030000, 0x38032000, + 0x38034000, 0x38036000, 0x38038000, 0x3803a000, 0x3803c000, 0x3803e000, + 0x38040000, 0x38042000, 0x38044000, 0x38046000, 0x38048000, 0x3804a000, + 0x3804c000, 0x3804e000, 0x38050000, 0x38052000, 0x38054000, 0x38056000, + 0x38058000, 0x3805a000, 0x3805c000, 0x3805e000, 0x38060000, 0x38062000, + 0x38064000, 0x38066000, 0x38068000, 0x3806a000, 0x3806c000, 0x3806e000, + 0x38070000, 0x38072000, 0x38074000, 0x38076000, 0x38078000, 0x3807a000, + 0x3807c000, 0x3807e000, 0x38080000, 0x38082000, 0x38084000, 0x38086000, + 0x38088000, 0x3808a000, 0x3808c000, 0x3808e000, 0x38090000, 0x38092000, + 0x38094000, 0x38096000, 0x38098000, 0x3809a000, 0x3809c000, 0x3809e000, + 0x380a0000, 0x380a2000, 0x380a4000, 0x380a6000, 0x380a8000, 0x380aa000, + 0x380ac000, 0x380ae000, 0x380b0000, 0x380b2000, 0x380b4000, 0x380b6000, + 0x380b8000, 
0x380ba000, 0x380bc000, 0x380be000, 0x380c0000, 0x380c2000, + 0x380c4000, 0x380c6000, 0x380c8000, 0x380ca000, 0x380cc000, 0x380ce000, + 0x380d0000, 0x380d2000, 0x380d4000, 0x380d6000, 0x380d8000, 0x380da000, + 0x380dc000, 0x380de000, 0x380e0000, 0x380e2000, 0x380e4000, 0x380e6000, + 0x380e8000, 0x380ea000, 0x380ec000, 0x380ee000, 0x380f0000, 0x380f2000, + 0x380f4000, 0x380f6000, 0x380f8000, 0x380fa000, 0x380fc000, 0x380fe000, + 0x38100000, 0x38102000, 0x38104000, 0x38106000, 0x38108000, 0x3810a000, + 0x3810c000, 0x3810e000, 0x38110000, 0x38112000, 0x38114000, 0x38116000, + 0x38118000, 0x3811a000, 0x3811c000, 0x3811e000, 0x38120000, 0x38122000, + 0x38124000, 0x38126000, 0x38128000, 0x3812a000, 0x3812c000, 0x3812e000, + 0x38130000, 0x38132000, 0x38134000, 0x38136000, 0x38138000, 0x3813a000, + 0x3813c000, 0x3813e000, 0x38140000, 0x38142000, 0x38144000, 0x38146000, + 0x38148000, 0x3814a000, 0x3814c000, 0x3814e000, 0x38150000, 0x38152000, + 0x38154000, 0x38156000, 0x38158000, 0x3815a000, 0x3815c000, 0x3815e000, + 0x38160000, 0x38162000, 0x38164000, 0x38166000, 0x38168000, 0x3816a000, + 0x3816c000, 0x3816e000, 0x38170000, 0x38172000, 0x38174000, 0x38176000, + 0x38178000, 0x3817a000, 0x3817c000, 0x3817e000, 0x38180000, 0x38182000, + 0x38184000, 0x38186000, 0x38188000, 0x3818a000, 0x3818c000, 0x3818e000, + 0x38190000, 0x38192000, 0x38194000, 0x38196000, 0x38198000, 0x3819a000, + 0x3819c000, 0x3819e000, 0x381a0000, 0x381a2000, 0x381a4000, 0x381a6000, + 0x381a8000, 0x381aa000, 0x381ac000, 0x381ae000, 0x381b0000, 0x381b2000, + 0x381b4000, 0x381b6000, 0x381b8000, 0x381ba000, 0x381bc000, 0x381be000, + 0x381c0000, 0x381c2000, 0x381c4000, 0x381c6000, 0x381c8000, 0x381ca000, + 0x381cc000, 0x381ce000, 0x381d0000, 0x381d2000, 0x381d4000, 0x381d6000, + 0x381d8000, 0x381da000, 0x381dc000, 0x381de000, 0x381e0000, 0x381e2000, + 0x381e4000, 0x381e6000, 0x381e8000, 0x381ea000, 0x381ec000, 0x381ee000, + 0x381f0000, 0x381f2000, 0x381f4000, 0x381f6000, 0x381f8000, 0x381fa000, + 0x381fc000, 
0x381fe000, 0x38200000, 0x38202000, 0x38204000, 0x38206000, + 0x38208000, 0x3820a000, 0x3820c000, 0x3820e000, 0x38210000, 0x38212000, + 0x38214000, 0x38216000, 0x38218000, 0x3821a000, 0x3821c000, 0x3821e000, + 0x38220000, 0x38222000, 0x38224000, 0x38226000, 0x38228000, 0x3822a000, + 0x3822c000, 0x3822e000, 0x38230000, 0x38232000, 0x38234000, 0x38236000, + 0x38238000, 0x3823a000, 0x3823c000, 0x3823e000, 0x38240000, 0x38242000, + 0x38244000, 0x38246000, 0x38248000, 0x3824a000, 0x3824c000, 0x3824e000, + 0x38250000, 0x38252000, 0x38254000, 0x38256000, 0x38258000, 0x3825a000, + 0x3825c000, 0x3825e000, 0x38260000, 0x38262000, 0x38264000, 0x38266000, + 0x38268000, 0x3826a000, 0x3826c000, 0x3826e000, 0x38270000, 0x38272000, + 0x38274000, 0x38276000, 0x38278000, 0x3827a000, 0x3827c000, 0x3827e000, + 0x38280000, 0x38282000, 0x38284000, 0x38286000, 0x38288000, 0x3828a000, + 0x3828c000, 0x3828e000, 0x38290000, 0x38292000, 0x38294000, 0x38296000, + 0x38298000, 0x3829a000, 0x3829c000, 0x3829e000, 0x382a0000, 0x382a2000, + 0x382a4000, 0x382a6000, 0x382a8000, 0x382aa000, 0x382ac000, 0x382ae000, + 0x382b0000, 0x382b2000, 0x382b4000, 0x382b6000, 0x382b8000, 0x382ba000, + 0x382bc000, 0x382be000, 0x382c0000, 0x382c2000, 0x382c4000, 0x382c6000, + 0x382c8000, 0x382ca000, 0x382cc000, 0x382ce000, 0x382d0000, 0x382d2000, + 0x382d4000, 0x382d6000, 0x382d8000, 0x382da000, 0x382dc000, 0x382de000, + 0x382e0000, 0x382e2000, 0x382e4000, 0x382e6000, 0x382e8000, 0x382ea000, + 0x382ec000, 0x382ee000, 0x382f0000, 0x382f2000, 0x382f4000, 0x382f6000, + 0x382f8000, 0x382fa000, 0x382fc000, 0x382fe000, 0x38300000, 0x38302000, + 0x38304000, 0x38306000, 0x38308000, 0x3830a000, 0x3830c000, 0x3830e000, + 0x38310000, 0x38312000, 0x38314000, 0x38316000, 0x38318000, 0x3831a000, + 0x3831c000, 0x3831e000, 0x38320000, 0x38322000, 0x38324000, 0x38326000, + 0x38328000, 0x3832a000, 0x3832c000, 0x3832e000, 0x38330000, 0x38332000, + 0x38334000, 0x38336000, 0x38338000, 0x3833a000, 0x3833c000, 0x3833e000, + 0x38340000, 
0x38342000, 0x38344000, 0x38346000, 0x38348000, 0x3834a000, + 0x3834c000, 0x3834e000, 0x38350000, 0x38352000, 0x38354000, 0x38356000, + 0x38358000, 0x3835a000, 0x3835c000, 0x3835e000, 0x38360000, 0x38362000, + 0x38364000, 0x38366000, 0x38368000, 0x3836a000, 0x3836c000, 0x3836e000, + 0x38370000, 0x38372000, 0x38374000, 0x38376000, 0x38378000, 0x3837a000, + 0x3837c000, 0x3837e000, 0x38380000, 0x38382000, 0x38384000, 0x38386000, + 0x38388000, 0x3838a000, 0x3838c000, 0x3838e000, 0x38390000, 0x38392000, + 0x38394000, 0x38396000, 0x38398000, 0x3839a000, 0x3839c000, 0x3839e000, + 0x383a0000, 0x383a2000, 0x383a4000, 0x383a6000, 0x383a8000, 0x383aa000, + 0x383ac000, 0x383ae000, 0x383b0000, 0x383b2000, 0x383b4000, 0x383b6000, + 0x383b8000, 0x383ba000, 0x383bc000, 0x383be000, 0x383c0000, 0x383c2000, + 0x383c4000, 0x383c6000, 0x383c8000, 0x383ca000, 0x383cc000, 0x383ce000, + 0x383d0000, 0x383d2000, 0x383d4000, 0x383d6000, 0x383d8000, 0x383da000, + 0x383dc000, 0x383de000, 0x383e0000, 0x383e2000, 0x383e4000, 0x383e6000, + 0x383e8000, 0x383ea000, 0x383ec000, 0x383ee000, 0x383f0000, 0x383f2000, + 0x383f4000, 0x383f6000, 0x383f8000, 0x383fa000, 0x383fc000, 0x383fe000, + 0x38400000, 0x38402000, 0x38404000, 0x38406000, 0x38408000, 0x3840a000, + 0x3840c000, 0x3840e000, 0x38410000, 0x38412000, 0x38414000, 0x38416000, + 0x38418000, 0x3841a000, 0x3841c000, 0x3841e000, 0x38420000, 0x38422000, + 0x38424000, 0x38426000, 0x38428000, 0x3842a000, 0x3842c000, 0x3842e000, + 0x38430000, 0x38432000, 0x38434000, 0x38436000, 0x38438000, 0x3843a000, + 0x3843c000, 0x3843e000, 0x38440000, 0x38442000, 0x38444000, 0x38446000, + 0x38448000, 0x3844a000, 0x3844c000, 0x3844e000, 0x38450000, 0x38452000, + 0x38454000, 0x38456000, 0x38458000, 0x3845a000, 0x3845c000, 0x3845e000, + 0x38460000, 0x38462000, 0x38464000, 0x38466000, 0x38468000, 0x3846a000, + 0x3846c000, 0x3846e000, 0x38470000, 0x38472000, 0x38474000, 0x38476000, + 0x38478000, 0x3847a000, 0x3847c000, 0x3847e000, 0x38480000, 0x38482000, + 0x38484000, 
0x38486000, 0x38488000, 0x3848a000, 0x3848c000, 0x3848e000, + 0x38490000, 0x38492000, 0x38494000, 0x38496000, 0x38498000, 0x3849a000, + 0x3849c000, 0x3849e000, 0x384a0000, 0x384a2000, 0x384a4000, 0x384a6000, + 0x384a8000, 0x384aa000, 0x384ac000, 0x384ae000, 0x384b0000, 0x384b2000, + 0x384b4000, 0x384b6000, 0x384b8000, 0x384ba000, 0x384bc000, 0x384be000, + 0x384c0000, 0x384c2000, 0x384c4000, 0x384c6000, 0x384c8000, 0x384ca000, + 0x384cc000, 0x384ce000, 0x384d0000, 0x384d2000, 0x384d4000, 0x384d6000, + 0x384d8000, 0x384da000, 0x384dc000, 0x384de000, 0x384e0000, 0x384e2000, + 0x384e4000, 0x384e6000, 0x384e8000, 0x384ea000, 0x384ec000, 0x384ee000, + 0x384f0000, 0x384f2000, 0x384f4000, 0x384f6000, 0x384f8000, 0x384fa000, + 0x384fc000, 0x384fe000, 0x38500000, 0x38502000, 0x38504000, 0x38506000, + 0x38508000, 0x3850a000, 0x3850c000, 0x3850e000, 0x38510000, 0x38512000, + 0x38514000, 0x38516000, 0x38518000, 0x3851a000, 0x3851c000, 0x3851e000, + 0x38520000, 0x38522000, 0x38524000, 0x38526000, 0x38528000, 0x3852a000, + 0x3852c000, 0x3852e000, 0x38530000, 0x38532000, 0x38534000, 0x38536000, + 0x38538000, 0x3853a000, 0x3853c000, 0x3853e000, 0x38540000, 0x38542000, + 0x38544000, 0x38546000, 0x38548000, 0x3854a000, 0x3854c000, 0x3854e000, + 0x38550000, 0x38552000, 0x38554000, 0x38556000, 0x38558000, 0x3855a000, + 0x3855c000, 0x3855e000, 0x38560000, 0x38562000, 0x38564000, 0x38566000, + 0x38568000, 0x3856a000, 0x3856c000, 0x3856e000, 0x38570000, 0x38572000, + 0x38574000, 0x38576000, 0x38578000, 0x3857a000, 0x3857c000, 0x3857e000, + 0x38580000, 0x38582000, 0x38584000, 0x38586000, 0x38588000, 0x3858a000, + 0x3858c000, 0x3858e000, 0x38590000, 0x38592000, 0x38594000, 0x38596000, + 0x38598000, 0x3859a000, 0x3859c000, 0x3859e000, 0x385a0000, 0x385a2000, + 0x385a4000, 0x385a6000, 0x385a8000, 0x385aa000, 0x385ac000, 0x385ae000, + 0x385b0000, 0x385b2000, 0x385b4000, 0x385b6000, 0x385b8000, 0x385ba000, + 0x385bc000, 0x385be000, 0x385c0000, 0x385c2000, 0x385c4000, 0x385c6000, + 0x385c8000, 
0x385ca000, 0x385cc000, 0x385ce000, 0x385d0000, 0x385d2000, + 0x385d4000, 0x385d6000, 0x385d8000, 0x385da000, 0x385dc000, 0x385de000, + 0x385e0000, 0x385e2000, 0x385e4000, 0x385e6000, 0x385e8000, 0x385ea000, + 0x385ec000, 0x385ee000, 0x385f0000, 0x385f2000, 0x385f4000, 0x385f6000, + 0x385f8000, 0x385fa000, 0x385fc000, 0x385fe000, 0x38600000, 0x38602000, + 0x38604000, 0x38606000, 0x38608000, 0x3860a000, 0x3860c000, 0x3860e000, + 0x38610000, 0x38612000, 0x38614000, 0x38616000, 0x38618000, 0x3861a000, + 0x3861c000, 0x3861e000, 0x38620000, 0x38622000, 0x38624000, 0x38626000, + 0x38628000, 0x3862a000, 0x3862c000, 0x3862e000, 0x38630000, 0x38632000, + 0x38634000, 0x38636000, 0x38638000, 0x3863a000, 0x3863c000, 0x3863e000, + 0x38640000, 0x38642000, 0x38644000, 0x38646000, 0x38648000, 0x3864a000, + 0x3864c000, 0x3864e000, 0x38650000, 0x38652000, 0x38654000, 0x38656000, + 0x38658000, 0x3865a000, 0x3865c000, 0x3865e000, 0x38660000, 0x38662000, + 0x38664000, 0x38666000, 0x38668000, 0x3866a000, 0x3866c000, 0x3866e000, + 0x38670000, 0x38672000, 0x38674000, 0x38676000, 0x38678000, 0x3867a000, + 0x3867c000, 0x3867e000, 0x38680000, 0x38682000, 0x38684000, 0x38686000, + 0x38688000, 0x3868a000, 0x3868c000, 0x3868e000, 0x38690000, 0x38692000, + 0x38694000, 0x38696000, 0x38698000, 0x3869a000, 0x3869c000, 0x3869e000, + 0x386a0000, 0x386a2000, 0x386a4000, 0x386a6000, 0x386a8000, 0x386aa000, + 0x386ac000, 0x386ae000, 0x386b0000, 0x386b2000, 0x386b4000, 0x386b6000, + 0x386b8000, 0x386ba000, 0x386bc000, 0x386be000, 0x386c0000, 0x386c2000, + 0x386c4000, 0x386c6000, 0x386c8000, 0x386ca000, 0x386cc000, 0x386ce000, + 0x386d0000, 0x386d2000, 0x386d4000, 0x386d6000, 0x386d8000, 0x386da000, + 0x386dc000, 0x386de000, 0x386e0000, 0x386e2000, 0x386e4000, 0x386e6000, + 0x386e8000, 0x386ea000, 0x386ec000, 0x386ee000, 0x386f0000, 0x386f2000, + 0x386f4000, 0x386f6000, 0x386f8000, 0x386fa000, 0x386fc000, 0x386fe000, + 0x38700000, 0x38702000, 0x38704000, 0x38706000, 0x38708000, 0x3870a000, + 0x3870c000, 
0x3870e000, 0x38710000, 0x38712000, 0x38714000, 0x38716000, + 0x38718000, 0x3871a000, 0x3871c000, 0x3871e000, 0x38720000, 0x38722000, + 0x38724000, 0x38726000, 0x38728000, 0x3872a000, 0x3872c000, 0x3872e000, + 0x38730000, 0x38732000, 0x38734000, 0x38736000, 0x38738000, 0x3873a000, + 0x3873c000, 0x3873e000, 0x38740000, 0x38742000, 0x38744000, 0x38746000, + 0x38748000, 0x3874a000, 0x3874c000, 0x3874e000, 0x38750000, 0x38752000, + 0x38754000, 0x38756000, 0x38758000, 0x3875a000, 0x3875c000, 0x3875e000, + 0x38760000, 0x38762000, 0x38764000, 0x38766000, 0x38768000, 0x3876a000, + 0x3876c000, 0x3876e000, 0x38770000, 0x38772000, 0x38774000, 0x38776000, + 0x38778000, 0x3877a000, 0x3877c000, 0x3877e000, 0x38780000, 0x38782000, + 0x38784000, 0x38786000, 0x38788000, 0x3878a000, 0x3878c000, 0x3878e000, + 0x38790000, 0x38792000, 0x38794000, 0x38796000, 0x38798000, 0x3879a000, + 0x3879c000, 0x3879e000, 0x387a0000, 0x387a2000, 0x387a4000, 0x387a6000, + 0x387a8000, 0x387aa000, 0x387ac000, 0x387ae000, 0x387b0000, 0x387b2000, + 0x387b4000, 0x387b6000, 0x387b8000, 0x387ba000, 0x387bc000, 0x387be000, + 0x387c0000, 0x387c2000, 0x387c4000, 0x387c6000, 0x387c8000, 0x387ca000, + 0x387cc000, 0x387ce000, 0x387d0000, 0x387d2000, 0x387d4000, 0x387d6000, + 0x387d8000, 0x387da000, 0x387dc000, 0x387de000, 0x387e0000, 0x387e2000, + 0x387e4000, 0x387e6000, 0x387e8000, 0x387ea000, 0x387ec000, 0x387ee000, + 0x387f0000, 0x387f2000, 0x387f4000, 0x387f6000, 0x387f8000, 0x387fa000, + 0x387fc000, 0x387fe000}; + +static const uint16_t offsettable[64] = { + 0x0000, 0x0400, 0x0400, 0x0400, 0x0400, 0x0400, 0x0400, 0x0400, + 0x0400, 0x0400, 0x0400, 0x0400, 0x0400, 0x0400, 0x0400, 0x0400, + 0x0400, 0x0400, 0x0400, 0x0400, 0x0400, 0x0400, 0x0400, 0x0400, + 0x0400, 0x0400, 0x0400, 0x0400, 0x0400, 0x0400, 0x0400, 0x0400, + 0x0000, 0x0400, 0x0400, 0x0400, 0x0400, 0x0400, 0x0400, 0x0400, + 0x0400, 0x0400, 0x0400, 0x0400, 0x0400, 0x0400, 0x0400, 0x0400, + 0x0400, 0x0400, 0x0400, 0x0400, 0x0400, 0x0400, 0x0400, 0x0400, 
+ 0x0400, 0x0400, 0x0400, 0x0400, 0x0400, 0x0400, 0x0400, 0x0400}; + +static const uint32_t exponenttable[64] = { + 0x00000000, 0x00800000, 0x01000000, 0x01800000, 0x02000000, 0x02800000, + 0x03000000, 0x03800000, 0x04000000, 0x04800000, 0x05000000, 0x05800000, + 0x06000000, 0x06800000, 0x07000000, 0x07800000, 0x08000000, 0x08800000, + 0x09000000, 0x09800000, 0x0a000000, 0x0a800000, 0x0b000000, 0x0b800000, + 0x0c000000, 0x0c800000, 0x0d000000, 0x0d800000, 0x0e000000, 0x0e800000, + 0x0f000000, 0x47800000, 0x80000000, 0x80800000, 0x81000000, 0x81800000, + 0x82000000, 0x82800000, 0x83000000, 0x83800000, 0x84000000, 0x84800000, + 0x85000000, 0x85800000, 0x86000000, 0x86800000, 0x87000000, 0x87800000, + 0x88000000, 0x88800000, 0x89000000, 0x89800000, 0x8a000000, 0x8a800000, + 0x8b000000, 0x8b800000, 0x8c000000, 0x8c800000, 0x8d000000, 0x8d800000, + 0x8e000000, 0x8e800000, 0x8f000000, 0xc7800000}; + +static const uint16_t basetable[512] = { + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0001, 0x0002, 0x0004, 0x0008, 0x0010, + 0x0020, 0x0040, 0x0080, 0x0100, 0x0200, 0x0400, 0x0800, 0x0c00, 0x1000, + 0x1400, 0x1800, 0x1c00, 0x2000, 0x2400, 0x2800, 0x2c00, 0x3000, 0x3400, + 0x3800, 
0x3c00, 0x4000, 0x4400, 0x4800, 0x4c00, 0x5000, 0x5400, 0x5800, + 0x5c00, 0x6000, 0x6400, 0x6800, 0x6c00, 0x7000, 0x7400, 0x7800, 0x7c00, + 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, + 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, + 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, + 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, + 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, + 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, + 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, + 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, + 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, + 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, + 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, + 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, + 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, + 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, + 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, + 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, + 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, + 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, + 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, + 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, + 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, + 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, + 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, + 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8001, + 0x8002, 0x8004, 0x8008, 0x8010, 0x8020, 0x8040, 0x8080, 0x8100, 0x8200, + 0x8400, 
0x8800, 0x8c00, 0x9000, 0x9400, 0x9800, 0x9c00, 0xa000, 0xa400, + 0xa800, 0xac00, 0xb000, 0xb400, 0xb800, 0xbc00, 0xc000, 0xc400, 0xc800, + 0xcc00, 0xd000, 0xd400, 0xd800, 0xdc00, 0xe000, 0xe400, 0xe800, 0xec00, + 0xf000, 0xf400, 0xf800, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, + 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, + 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, + 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, + 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, + 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, + 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, + 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, + 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, + 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, + 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, + 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, + 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00}; + +static const uint8_t shifttable[512] = { + 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, + 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, + 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, + 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, + 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, + 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, + 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, + 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, + 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x17, 0x16, 0x15, 0x14, 0x13, + 0x12, 0x11, 0x10, 0x0f, 0x0e, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, + 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 
0x0d, 0x0d, 0x0d, 0x0d, 0x0d, + 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x18, + 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, + 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, + 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, + 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, + 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, + 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, + 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, + 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, + 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, + 0x18, 0x18, 0x18, 0x0d, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, + 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, + 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, + 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, + 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, + 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, + 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, + 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, + 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x17, + 0x16, 0x15, 0x14, 0x13, 0x12, 0x11, 0x10, 0x0f, 0x0e, 0x0d, 0x0d, 0x0d, + 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, + 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, + 0x0d, 0x0d, 0x0d, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, + 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, + 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, + 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, + 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 
0x18, 0x18, 0x18, 0x18, 0x18, + 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, + 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, + 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, + 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, + 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x0d}; + +half_t Float2Half(float f) { + uint32_t v = *reinterpret_cast(&f); + return basetable[(v >> 23) & 0x1ff] + + ((v & 0x007fffff) >> shifttable[(v >> 23) & 0x1ff]); +} + +float Half2Float(half_t h) { + uint32_t v = mantissatable[offsettable[h >> 10] + (h & 0x3ff)] + + exponenttable[h >> 10]; + return *reinterpret_cast(&v); +} + +void FloatArray2HalfArray(float *f_array, half_t *h_array, int count) { + for (int i = 0; i < count; ++i) { + h_array[i] = Float2Half(f_array[i]); + } +} + +void HalfArray2FloatArray(half_t *h_array, float *f_array, int count) { + for (int i = 0; i < count; ++i) { + f_array[i] = Half2Float(h_array[i]); + } +} + +} // namespace framework +} // namespace paddle_mobile diff --git a/src/framework/cl/cl_half.h b/src/framework/cl/cl_half.h new file mode 100644 index 0000000000000000000000000000000000000000..9b05740f1e19af66036a1562243102e5ba42ab1b --- /dev/null +++ b/src/framework/cl/cl_half.h @@ -0,0 +1,32 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#pragma once +#include + +namespace paddle_mobile { +namespace framework { + +typedef uint16_t half_t; + +half_t Float2Half(float f); + +float Half2Float(half_t h); + +void FloatArray2HalfArray(float *f_array, half_t *h_array, int count); + +void HalfArray2FloatArray(half_t *h_array, float *f_array, int count); + +} // namespace framework +} // namespace paddle_mobile diff --git a/src/framework/cl/cl_helper.h b/src/framework/cl/cl_helper.h new file mode 100644 index 0000000000000000000000000000000000000000..bea91ee24ceb5e9011708bd277629a07beb4b8ef --- /dev/null +++ b/src/framework/cl/cl_helper.h @@ -0,0 +1,77 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#pragma once + +#include +#include +#include + +#include "common/log.h" +#include "framework/cl/cl_deleter.h" +#include "framework/cl/cl_image.h" +#include "framework/cl/cl_scope.h" + +namespace paddle_mobile { +namespace framework { + +class CLHelper { + public: + CLHelper() = default; + + explicit CLHelper(CLScope *scope) : scope_(scope) {} + + void AddKernel(const std::string &kernel_name, const std::string &file_name) { + DLOG << " begin add kernel "; + auto kernel = scope_->GetKernel(kernel_name, file_name); + DLOG << " add kernel ing "; + kernels.emplace_back(std::move(kernel)); + } + + cl_kernel KernelAt(const int index) { + DLOG << " kernel count: " << kernels.size(); + return kernels[index].get(); + } + + cl_command_queue CLCommandQueue() { return scope_->CommandQueue(); } + + cl_context CLContext() { return scope_->Context(); } + + std::vector DefaultWorkSize(const CLImage &image) { + // n c h w + auto image_dim = image.dims(); + if (image_dim.size() == 4) { + auto n = image_dim[0]; + auto h = image_dim[2]; + auto w = image_dim[3]; + auto image_width = image.ImageWidth(); + auto work_size_0 = image_width / w; + auto work_size_1 = w; + auto work_size_2 = n * h; + return {work_size_0, work_size_1, work_size_2}; + } else if (image_dim.size() == 2) { + return {1, image.ImageWidth(), image.ImageHeight()}; + } else if (image_dim.size() == 1) { + return {1, image.ImageWidth(), 1}; + } + PADDLE_MOBILE_THROW_EXCEPTION(" not support this dim, need imp "); + } + + private: + CLScope *scope_; + std::vector> kernels; +}; + +} // namespace framework +} // namespace paddle_mobile diff --git a/src/framework/cl/cl_image.cpp b/src/framework/cl/cl_image.cpp new file mode 100644 index 0000000000000000000000000000000000000000..f32de0a61461d9a9b28d4a0cf5e13ecc9d564cf5 --- /dev/null +++ b/src/framework/cl/cl_image.cpp @@ -0,0 +1,64 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. 
+ +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "framework/cl/cl_image.h" + +namespace paddle_mobile { +namespace framework { + +void CLImageToTensor(CLImage *cl_image, Tensor *tensor, + cl_command_queue commandQueue) { + // TODO(yangfei): need imp +} + +void TensorToCLImage(const Tensor *tensor, CLImage *cl_image, + cl_command_queue commandQueue) { + // TODO(yangfei): need imp +} + +#ifdef PADDLE_MOBILE_DEBUG +Print &operator<<(Print &printer, const CLImage &cl_image) { + int width = cl_image.ImageDims()[0]; + int height = cl_image.ImageDims()[1]; + + half_t *image_data = new half_t[height * width * 4]; + cl_int err; + cl_mem image = cl_image.GetCLImage(); + size_t origin[3] = {0, 0, 0}; + size_t region[3] = {width, height, 1}; + err = clEnqueueReadImage(cl_image.CommandQueue(), image, CL_TRUE, origin, + region, 0, 0, image_data, 0, NULL, NULL); + + CL_CHECK_ERRORS(err); + + float *tensor_data = new float[cl_image.numel()]; + auto converter = cl_image.Converter(); + converter->ImageToNCHW(image_data, tensor_data, cl_image.ImageDims(), + cl_image.dims()); + int stride = cl_image.numel() / 20; + stride = stride > 0 ? 
stride : 1; + + printer << " dims: " << cl_image.dims() << "\n"; + for (int i = 0; i < cl_image.numel(); i += stride) { + printer << tensor_data[i] << " "; + } + + delete[](tensor_data); + delete[](image_data); + + return printer; +} +#endif +} // namespace framework +} // namespace paddle_mobile diff --git a/src/framework/cl/cl_image.h b/src/framework/cl/cl_image.h new file mode 100644 index 0000000000000000000000000000000000000000..35f60d3b773937d381447b23b64985ce543fddee --- /dev/null +++ b/src/framework/cl/cl_image.h @@ -0,0 +1,234 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#pragma once + +#include + +#include "CL/cl.h" + +#include "framework/cl/cl_deleter.h" +#include "framework/cl/cl_engine.h" +#include "framework/cl/cl_half.h" +#include "framework/cl/cl_image_converter.h" +#include "framework/cl/cl_tool.h" +#include "framework/ddim.h" +#include "framework/tensor.h" + +namespace paddle_mobile { +namespace framework { + +class CLImage { + public: + CLImage() = default; + + ~CLImage() { + if (tensor_data_ != nullptr) { + delete[](tensor_data_); + } + + if (image_converter_) { + delete (image_converter_); + } + } + /* + * will not hold input tensor data, memcpy in this method + * */ + void SetTensorData(float *tensorData, const DDim &dim) { + int numel = product(dim); + if (tensor_data_ != nullptr) { + delete[](tensor_data_); + tensor_data_ = nullptr; + } + tensor_data_ = new float[numel]; + memcpy(tensor_data_, tensorData, numel * sizeof(float)); + tensor_dims_ = dim; + } + + /* + * need call SetTensorData first + * + * folder when one dim or two dim + * */ + void InitCLImage(cl_context context, cl_command_queue command_queue) { + PADDLE_MOBILE_ENFORCE(tensor_data_ != nullptr, + " need call SetTensorData first"); + CLImageConverterFolder *folder_converter = new CLImageConverterFolder(); + InitCLImage(context, command_queue, folder_converter); + } + + void InitCLImage(cl_context context, cl_command_queue command_queue, + CLImageConverterBase *converter) { + if (image_converter_ != nullptr) { + delete (image_converter_); + } + + PADDLE_MOBILE_ENFORCE(tensor_data_ != nullptr, + " need call SetTensorData first"); + + DLOG << " begin init cl image "; + image_dims_ = converter->InitImageDimInfoWith(tensor_dims_); + + half_t *image_data = new half_t[product(image_dims_) * 4]; + + DLOG << " convert to image"; + converter->NCHWToImage(tensor_data_, image_data, tensor_dims_); + DLOG << " end convert to image"; + + InitCLImage(context, image_dims_[0], image_dims_[1], image_data); + + delete[](image_data); + delete[](tensor_data_); + + 
command_queue_ = command_queue; + tensor_data_ = nullptr; + image_converter_ = converter; + initialized_ = true; + DLOG << " end init cl image"; + } + + void InitNImage(cl_context context, cl_command_queue command_queue) { + if (tensor_data_ == nullptr) { + PADDLE_MOBILE_THROW_EXCEPTION(" need call SetTensorData first"); + } + CLImageConverterNWBlock *folder_converter = new CLImageConverterNWBlock(); + InitCLImage(context, command_queue, folder_converter); + PADDLE_MOBILE_ENFORCE(tensor_dims_.size() == 4, " tensor dim is not 4"); + } + void InitDWImage(cl_context context, cl_command_queue command_queue) { + if (tensor_data_ == nullptr) { + PADDLE_MOBILE_THROW_EXCEPTION(" need call SetTensorData first"); + } + CLImageConverterDWBlock *dw_converter = new CLImageConverterDWBlock(); + InitCLImage(context, command_queue, dw_converter); + PADDLE_MOBILE_ENFORCE(tensor_dims_.size() == 4, " tensor dim is not 4"); + } + + void InitEmptyImage(cl_context context, cl_command_queue command_queue, + const DDim &dim) { + PADDLE_MOBILE_ENFORCE(tensor_data_ == nullptr, + " empty image tensor data shouldn't have value"); + + CLImageConverterFolder *folder_converter = new CLImageConverterFolder(); + + DLOG << " to get image dims "; + image_dims_ = folder_converter->InitImageDimInfoWith(dim); + DLOG << " end get image dims " << image_dims_; + + InitCLImage(context, image_dims_[0], image_dims_[1], nullptr); + + tensor_dims_ = dim; + command_queue_ = command_queue; + image_converter_ = folder_converter; + cl_event_ = CLEngine::Instance()->CreateEvent(context); + initialized_ = true; + DLOG << " end init cl image"; + } + + cl_mem GetCLImage() const { return cl_image_.get(); } + + const DDim &ImageDims() const { return image_dims_; } + + inline size_t ImageWidth() const { return image_dims_[0]; } + + inline size_t ImageHeight() const { return image_dims_[1]; } + + inline cl_command_queue CommandQueue() const { return command_queue_; } + + /* + * resize original tensor dim + * */ + inline 
CLImage &Resize(const DDim &dims) { + tensor_dims_ = dims; + return *this; + } + + template + T *data() const { + if (initialized_) { + PADDLE_MOBILE_THROW_EXCEPTION( + " cl image has initialized, tensor data has been deleted, can't use " + "tensor data"); + } + return reinterpret_cast(tensor_data_); + } + + /* + * numel of tensor dim + * */ + inline int64_t numel() const { return product(tensor_dims_); } + + /* + * original tensor dim + * */ + const DDim &dims() const { return tensor_dims_; } + + cl_event GetClEvent() const { return cl_event_.get(); } + + CLImageConverterBase *Converter() const { return image_converter_; } + + private: + void InitCLImage(cl_context context, int width, int height, void *data) { + cl_image_format cf = {.image_channel_order = CL_RGBA, + .image_channel_data_type = CL_HALF_FLOAT}; + cl_image_desc cid = { + .image_type = CL_MEM_OBJECT_IMAGE2D, + .image_width = width, + .image_height = height, + .image_depth = 1, + .image_array_size = 1, + .image_row_pitch = 0, + .image_slice_pitch = 0, + .num_mip_levels = 0, + .num_samples = 0, + // .buffer = nullptr + }; + cid.buffer = nullptr; + cl_int err; + cl_mem cl_image = clCreateImage( + context, CL_MEM_READ_WRITE | (data ? 
CL_MEM_COPY_HOST_PTR : 0), + &cf, // const cl_image_format *image_format + &cid, // const cl_image_desc *image_desc + data, // void *host_ptr + &err); + cl_image_.reset(cl_image); + if (err != CL_SUCCESS) { + CL_CHECK_ERRORS(err); + PADDLE_MOBILE_THROW_EXCEPTION(" create image 2d error "); + } + } + + bool initialized_ = false; + std::unique_ptr<_cl_mem, CLMemDeleter> cl_image_; + std::unique_ptr<_cl_event, CLEventDeleter> cl_event_; + DDim tensor_dims_; + DDim image_dims_; + float *tensor_data_ = nullptr; + cl_context context_; + cl_command_queue command_queue_; + CLImageConverterBase *image_converter_ = nullptr; +}; + +void TensorToCLImage(Tensor *tensor, CLImage *image, + cl_command_queue commandQueue); + +void CLImageToTensor(CLImage *image, Tensor *tensor, + cl_command_queue commandQueue); + +#ifdef PADDLE_MOBILE_DEBUG +Print &operator<<(Print &printer, const CLImage &image); +#endif + +} // namespace framework +} // namespace paddle_mobile diff --git a/src/framework/cl/cl_image_converter.cpp b/src/framework/cl/cl_image_converter.cpp new file mode 100644 index 0000000000000000000000000000000000000000..13094a8d05ac6f7f8d2451a3498da058b37ee98b --- /dev/null +++ b/src/framework/cl/cl_image_converter.cpp @@ -0,0 +1,393 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "framework/cl/cl_image_converter.h" + +namespace paddle_mobile { +namespace framework { + +const DDim &CLImageConverterDefault::InitImageDimInfoWith( + const DDim &tensor_dim) { + size_t new_dims[] = {1, 1, 1, 1}; + for (int j = 0; j < tensor_dim.size(); ++j) { + new_dims[4 - tensor_dim.size() + j] = tensor_dim[j]; + } + size_t N, C, H, W; + N = new_dims[0]; + C = new_dims[1]; + H = new_dims[2]; + W = new_dims[3]; + size_t width = W * ((C + 3) / 4); + size_t height = H * N; + return make_ddim({width, height}); +} + +void CLImageConverterDefault::NCHWToImage(float *nchw, half_t *image, + const DDim &tensor_dim) { + size_t new_dims[] = {1, 1, 1, 1}; + for (int j = 0; j < tensor_dim.size(); ++j) { + new_dims[4 - tensor_dim.size() + j] = tensor_dim[j]; + } + + size_t N, C, H, W; + N = new_dims[0]; + C = new_dims[1]; + H = new_dims[2]; + W = new_dims[3]; + + DDim in_image_dim = InitImageDimInfoWith(tensor_dim); + + DLOG << " tensor dim " << tensor_dim; + DLOG << " image dim " << in_image_dim; + + size_t width = in_image_dim[0]; + size_t height = in_image_dim[1]; + + int w_block = width / W; + + float *p = nchw; + size_t i0 = 0; + for (int n = 0; n < N; n++) { + for (int c = 0; c < w_block * 4; c++) { + size_t i1 = i0 + (c / 4) * W; + for (int h = 0; h < H; h++) { + size_t i2 = (i1 << 2) + c % 4; + for (int w = 0; w < W; w++) { + if (c < C) { + // int x = (n * width * H + h * width + (c / 4) * W + w) * 4 + + // (c % 4); + image[i2] = Float2Half(*p); + i2 += 4; + p++; + } else { + image[i2] = 0.0; + i2 += 4; + } + } + i1 += width; + } + } + i0 += width * H; + } +} + +void CLImageConverterDefault::ImageToNCHW(half_t *image, float *tensor, + const DDim &image_dim, + const DDim &tensor_dim) { + size_t new_dims[] = {1, 1, 1, 1}; + for (int j = 0; j < tensor_dim.size(); ++j) { + new_dims[4 - tensor_dim.size() + j] = tensor_dim[j]; + } + + size_t N, C, H, W; + N = new_dims[0]; + C = new_dims[1]; + H = new_dims[2]; + W = new_dims[3]; + + int width = image_dim[0]; 
+ int height = image_dim[0]; + + float *p = tensor; + + size_t i0 = 0; + for (int n = 0; n < N; n++) { + for (int c = 0; c < C; c++) { + size_t i1 = i0 + (c / 4) * W; + for (int h = 0; h < H; h++) { + size_t i2 = (i1 << 2) + c % 4; + for (int w = 0; w < W; w++) { + *p = Half2Float(image[i2]); + i2 += 4; + p++; + } + i1 += width; + } + } + i0 += width * H; + } +} + +const DDim &CLImageConverterFolder::InitImageDimInfoWith( + const DDim &tensor_dim) { + if (tensor_dim.size() <= 2) { + int tdim[2] = {1, 1}; + if (tensor_dim.size() == 1) { + tdim[1] = tensor_dim[0]; + } else { + tdim[0] = tensor_dim[0]; + tdim[1] = tensor_dim[1]; + } + int width = (tdim[1] + 3) / 4; + int height = tdim[0]; + + width_of_one_block_ = width; + height_of_one_block_ = height; + c_block_ = 1; + + return make_ddim({width, height}); + + } else { + size_t new_dims[] = {1, 1, 1, 1}; + for (int j = 0; j < tensor_dim.size(); ++j) { + new_dims[4 - tensor_dim.size() + j] = tensor_dim[j]; + } + size_t N, C, H, W; + N = new_dims[0]; + C = new_dims[1]; + H = new_dims[2]; + W = new_dims[3]; + size_t width = W * ((C + 3) / 4); + size_t height = H * N; + + width_of_one_block_ = W; + height_of_one_block_ = H; + c_block_ = width / W; + + return make_ddim({width, height}); + } +} + +void CLImageConverterFolder::NCHWToImage(float *tensor, half_t *image, + const DDim &tensor_dim) { + PADDLE_MOBILE_ENFORCE(tensor_dim.size() <= 4 && tensor_dim.size() > 0, + "tensor dim is not support "); + + if (tensor_dim.size() > 2) { + CLImageConverterDefault default_converter; + default_converter.NCHWToImage(tensor, image, tensor_dim); + + } else { + int tdim[2] = {1, 1}; + if (tensor_dim.size() == 1) { + tdim[1] = tensor_dim[0]; + } else { + tdim[0] = tensor_dim[0]; + tdim[1] = tensor_dim[1]; + } + + DDim image_dim = InitImageDimInfoWith(tensor_dim); + int width = image_dim[0]; + + for (int h = 0; h < tdim[0]; h++) { + for (int w = 0; w < tdim[1]; w++) { + image[(h * width + w / 4) * 4 + (w % 4)] = + Float2Half(tensor[h * 
tdim[1] + w]); + } + } + } +} + +void CLImageConverterFolder::ImageToNCHW(half_t *image, float *tensor, + const DDim &image_dim, + const DDim &tensor_dim) { + if (tensor_dim.size() > 2) { + CLImageConverterDefault default_converter; + default_converter.ImageToNCHW(image, tensor, image_dim, tensor_dim); + + } else { + int width = image_dim[0]; + int height = image_dim[1]; + int H, W; + + if (tensor_dim.size() == 2) { + H = tensor_dim[0]; + W = tensor_dim[1]; + } else if (tensor_dim.size() == 1) { + H = 1; + W = tensor_dim[0]; + } + float *p = tensor; + + for (int h = 0; h < H; h++) { + for (int w = 0; w < W; w++) { + p[h * W + w] = Half2Float(image[(h * width + w / 4) * 4 + (w % 4)]); + } + } + } +} + +const DDim &CLImageConverterNWBlock::InitImageDimInfoWith( + const DDim &tensor_dim) { + PADDLE_MOBILE_ENFORCE(tensor_dim.size() == 4, " tensor dim is not 4"); + size_t N, C, H, W; + N = tensor_dim[0]; + C = tensor_dim[1]; + H = tensor_dim[2]; + W = tensor_dim[3]; + size_t width = W * ((N + 3) / 4); + size_t height = C * H; + return make_ddim({width, height}); +} + +void CLImageConverterNWBlock::NCHWToImage(float *tensor, half_t *image, + const DDim &tensor_dim) { + PADDLE_MOBILE_ENFORCE(tensor_dim.size() == 4, " tensor dim is not 4"); + auto image_dim = InitImageDimInfoWith(tensor_dim); + float *p = tensor; + int N = tensor_dim[0]; + int C = tensor_dim[1]; + int H = tensor_dim[2]; + int W = tensor_dim[3]; + int width = image_dim[0]; + int height = image_dim[1]; + int block = image_dim[0] / tensor_dim[3]; + + for (int n = 0; n < block * 4; n++) { + for (int c = 0; c < C; c++) { + for (int h = 0; h < H; ++h) { + for (int w = 0; w < W; ++w) { + int index = 4 * c * (width * H) + 4 * h * width + 4 * W * (n / 4) + + w * 4 + n % 4; + if (n < N) { + image[index] = Float2Half(*p); + p++; + } else { + image[index] = 0.0; + } + if (index >= (width * height * 4)) { + DLOG << " index out of range "; + } + } + } + } + } + DLOG << " init done"; +} + +void 
CLImageConverterNWBlock::ImageToNCHW(half_t *image, float *tensor, + const DDim &image_dim, + const DDim &tensor_dim) { + PADDLE_MOBILE_ENFORCE(tensor_dim.size() == 4, " tensor dim is not 4"); + float *p = tensor; + int N = tensor_dim[0]; + int C = tensor_dim[1]; + int H = tensor_dim[2]; + int W = tensor_dim[3]; + int width = image_dim[0]; + int height = image_dim[1]; + int block = image_dim[0] / tensor_dim[3]; + + for (int n = 0; n < N; n++) { + for (int c = 0; c < C; c++) { + for (int h = 0; h < H; ++h) { + for (int w = 0; w < W; ++w) { + int index = 4 * c * (width * H) + 4 * h * width + 4 * W * (n / 4) + + w * 4 + n % 4; + *p = Half2Float(image[index]); + p++; + if (index >= (width * height * 4)) { + DLOG << " index out of range "; + } + } + } + } + } + DLOG << " init done"; +} + +const DDim &CLImageConverterDWBlock::InitImageDimInfoWith( + const DDim &tensor_dim) { + PADDLE_MOBILE_ENFORCE(tensor_dim.size() == 4, " tensor dim is not 4"); + size_t N, C, H, W; + N = tensor_dim[0]; + C = tensor_dim[1]; + H = tensor_dim[2]; + W = tensor_dim[3]; + size_t width = W * ((N + 3) / 4); + size_t height = C * H; + return make_ddim({width, height}); +} + +void CLImageConverterDWBlock::NCHWToImage(float *tensor, half_t *image, + const DDim &tensor_dim) { + size_t new_dims[] = {1, 1, 1, 1}; + for (int j = 0; j < tensor_dim.size(); ++j) { + new_dims[4 - tensor_dim.size() + j] = tensor_dim[j]; + } + + size_t N, C, H, W; + N = new_dims[1]; + C = new_dims[0]; + H = new_dims[2]; + W = new_dims[3]; + + DDim in_image_dim = InitImageDimInfoWith(tensor_dim); + + DLOG << " tensor dim " << tensor_dim; + DLOG << " image dim " << in_image_dim; + + size_t width = in_image_dim[0]; + size_t height = in_image_dim[1]; + + int w_block = width / W; + + float *p = tensor; + size_t i0 = 0; + for (int n = 0; n < N; n++) { + for (int c = 0; c < w_block * 4; c++) { + size_t i1 = i0 + (c / 4) * W; + for (int h = 0; h < H; h++) { + size_t i2 = (i1 << 2) + c % 4; + for (int w = 0; w < W; w++) { + if (c < 
C) { + // int x = (n * width * H + h * width + (c / 4) * W + w) * 4 + + // (c % 4); + image[i2] = Float2Half(*p); + i2 += 4; + p++; + } else { + image[i2] = 0.0; + i2 += 4; + } + } + i1 += width; + } + } + i0 += width * H; + } +} + +void CLImageConverterDWBlock::ImageToNCHW(half_t *image, float *tensor, + const DDim &image_dim, + const DDim &tensor_dim) { + PADDLE_MOBILE_ENFORCE(tensor_dim.size() == 4, " tensor dim is not 4"); + float *p = tensor; + int N = tensor_dim[1]; + int C = tensor_dim[0]; + int H = tensor_dim[2]; + int W = tensor_dim[3]; + int width = image_dim[0]; + int height = image_dim[0]; + + size_t i0 = 0; + for (int n = 0; n < N; n++) { + for (int c = 0; c < C; c++) { + size_t i1 = i0 + (c / 4) * W; + for (int h = 0; h < H; h++) { + size_t i2 = (i1 << 2) + c % 4; + for (int w = 0; w < W; w++) { + *p = Half2Float(image[i2]); + i2 += 4; + p++; + } + i1 += width; + } + } + i0 += width * H; + } +} + +} // namespace framework +} // namespace paddle_mobile diff --git a/src/framework/cl/cl_image_converter.h b/src/framework/cl/cl_image_converter.h new file mode 100644 index 0000000000000000000000000000000000000000..02887b0cd468a45630122bb3f236c0775ac1eaa1 --- /dev/null +++ b/src/framework/cl/cl_image_converter.h @@ -0,0 +1,80 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#pragma once + +#include "framework/cl/cl_half.h" +#include "framework/ddim.h" + +namespace paddle_mobile { +namespace framework { + +class CLImageConverterBase { + public: + virtual void NCHWToImage(float *nchw, half_t *image, + const DDim &tensor_dim) = 0; + + virtual void ImageToNCHW(half_t *image, float *nchw, const DDim &image_dim, + const DDim &tensor_dim) = 0; + virtual const DDim &InitImageDimInfoWith(const DDim &tensor_dim) = 0; +}; + +class CLImageConverterDefault : public CLImageConverterBase { + public: + const DDim &InitImageDimInfoWith(const DDim &tensor_dim); + void NCHWToImage(float *nchw, half_t *image, const DDim &tensor_dim); + void ImageToNCHW(half_t *image, float *tensor, const DDim &image_dim, + const DDim &tensor_dim); +}; + +class CLImageConverterFolder : public CLImageConverterBase { + public: + const DDim &InitImageDimInfoWith(const DDim &tensor_dim); + void NCHWToImage(float *tensor, half_t *image, const DDim &tensor_dim); + void ImageToNCHW(half_t *image, float *tensor, const DDim &image_dim, + const DDim &tensor_dim); + + /* + * width of original tensor + * */ + inline size_t WidthOfOneBlock() const { return width_of_one_block_; } + + /* + * height of original tensor + * */ + inline size_t HeightOfOneBlock() const { return height_of_one_block_; } + + int GetCBlock() const { return c_block_; } + + private: + int c_block_; + int width_of_one_block_; + int height_of_one_block_; +}; + +class CLImageConverterNWBlock : public CLImageConverterBase { + const DDim &InitImageDimInfoWith(const DDim &tensor_dim); + void NCHWToImage(float *tensor, half_t *image, const DDim &tensor_dim); + void ImageToNCHW(half_t *image, float *tensor, const DDim &image_dim, + const DDim &tensor_dim); +}; +class CLImageConverterDWBlock : public CLImageConverterBase { + const DDim &InitImageDimInfoWith(const DDim &tensor_dim); + void NCHWToImage(float *tensor, half_t *image, const DDim &tensor_dim); + void ImageToNCHW(half_t *image, float *tensor, const DDim 
&image_dim, + const DDim &tensor_dim); +}; + +} // namespace framework +} // namespace paddle_mobile diff --git a/src/framework/cl/cl_scope.h b/src/framework/cl/cl_scope.h new file mode 100644 index 0000000000000000000000000000000000000000..c7c06ca75f47cd65d2350dfa6930068aca73ced0 --- /dev/null +++ b/src/framework/cl/cl_scope.h @@ -0,0 +1,83 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include +#include +#include + +#include "CL/cl.h" +#include "framework/cl/cl_deleter.h" +#include "framework/cl/cl_engine.h" +#include "framework/cl/cl_tool.h" + +namespace paddle_mobile { +namespace framework { + +class CLScope { + public: + CLScope() { + CLEngine *engin = CLEngine::Instance(); + context_ = engin->CreateContext(); + command_queue_ = engin->CreateClCommandQueue(context_.get()); + } + + cl_command_queue CommandQueue() { return command_queue_.get(); } + + std::unique_ptr<_cl_kernel, CLKernelDeleter> GetKernel( + const std::string &kernel_name, const std::string &file_name) { + DLOG << " to get program " << file_name; + auto program = Program(file_name); + DLOG << " end get program ~ "; + DLOG << " to create kernel: " << kernel_name; + std::unique_ptr<_cl_kernel, CLKernelDeleter> kernel( + clCreateKernel(program, kernel_name.c_str(), &status_)); + CL_CHECK_ERRORS(status_); + DLOG << " end create kernel ~ "; + return std::move(kernel); + } + + cl_context Context() { return context_.get(); } + + 
cl_program Program(const std::string &file_name) { + auto it = programs_.find(file_name); + if (it != programs_.end()) { + return it->second.get(); + } + + auto program = CLEngine::Instance()->CreateProgramWith( + context_.get(), + CLEngine::Instance()->GetCLPath() + "/cl_kernel/" + file_name); + + DLOG << " --- begin build program -> " << file_name << " --- "; + CLEngine::Instance()->BuildProgram(program.get()); + DLOG << " --- end build program -> " << file_name << " --- "; + + programs_[file_name] = std::move(program); + + return programs_[file_name].get(); + } + + private: + cl_int status_; + std::unique_ptr<_cl_context, CLContextDeleter> context_; + std::unique_ptr<_cl_command_queue, CLCommQueueDeleter> command_queue_; + std::unordered_map> + programs_; +}; + +} // namespace framework +} // namespace paddle_mobile diff --git a/src/framework/cl/cl_tensor.h b/src/framework/cl/cl_tensor.h new file mode 100644 index 0000000000000000000000000000000000000000..b853fa0e8d734c38de2fdc53f766d735dc72bb20 --- /dev/null +++ b/src/framework/cl/cl_tensor.h @@ -0,0 +1,170 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#pragma once + +#include +#include +#include + +#include "CL/cl.h" +#include "framework/cl/cl_deleter.h" +#include "framework/cl/cl_engine.h" +#include "framework/tensor_base.h" + +namespace paddle_mobile { +namespace framework { + +class CLTensor : TensorBase { + public: + CLTensor(cl_context context, cl_command_queue command_queue) + : context_(context), command_queue_(command_queue) {} + + CLTensor() = default; + + /* + * if init method haven't set context and command_queue, need set + * */ + void SetContextAndCommandQueue(cl_context context, + cl_command_queue command_queue) { + context_ = context; + command_queue_ = command_queue; + } + + /*! Resize the dimensions of the memory block. */ + inline CLTensor &Resize(const DDim &dims) { + dims_ = dims; + return *this; + } + + template + inline cl_mem mutable_with_data(const T *data) { + int64_t size = numel() * sizeof(T); + + holder_.reset(new PlaceholderImpl( + size, reinterpret_cast(const_cast(data)), typeid(T), + context_, command_queue_)); + return reinterpret_cast(holder_->ptr()); + } + + inline cl_mem mutable_data(std::type_index type) { + if (holder_ != nullptr) { + holder_->set_type(type); + } + PADDLE_MOBILE_ENFORCE(numel() >= 0, "the Tensor's numel must >=0.") + int64_t size = numel() * SizeOfType(type); + if (holder_ == nullptr || holder_->size() < size + offset_) { + holder_.reset(new PlaceholderImpl(size, type, context_, command_queue_)); + offset_ = 0; + } + return reinterpret_cast(holder_->ptr()); + } + + /** + * @brief Return a pointer to cl buffer. + * @note If not exist, then allocation. + */ + template + inline cl_mem mutable_data() { + return reinterpret_cast(mutable_data(typeid(T))); + } + + /** + * @brief Return a pointer to cl buffer. + * + * @param[in] dims The dimensions of the memory block. + * @param[in] place The place of the memory block. + * + * @note If not exist, then allocation. 
+ */ + template + inline cl_mem mutable_data(DDim dims) { + Resize(dims); + return mutable_data(); + } + + inline cl_mem CLBuffer() { + check_memory_size(); + return reinterpret_cast( + reinterpret_cast(holder_->ptr())); + } + + template + inline T *Data() { + if (host_ptr_) { + delete (host_ptr_); + host_ptr_ = nullptr; + } + cl_mem buffer = CLBuffer(); + host_ptr_ = new char[holder_->size()]; + cl_int status; + status = clEnqueueReadBuffer(command_queue_, buffer, CL_TRUE, 0, + holder_->size(), host_ptr_, 0, NULL, NULL); + CL_CHECK_ERRORS(status); + return reinterpret_cast(host_ptr_); + } + + int memorySize() { return holder_->size(); } + + ~CLTensor() { + DLOG << "~CLTensor"; + if (host_ptr_) { + DLOG << " delete host ptr "; + delete (host_ptr_); + host_ptr_ = nullptr; + } + } + + private: + cl_context context_; + cl_command_queue command_queue_; + void *host_ptr_ = nullptr; + + struct PlaceholderImpl : public Placeholder { + PlaceholderImpl(size_t size, void *input, std::type_index type, + cl_context context, cl_command_queue command_queue) + : ptr_(clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, + size, reinterpret_cast(input), NULL)), + size_(size), + type_(type), + command_queue_(command_queue) {} + + PlaceholderImpl(size_t size, std::type_index type, cl_context context, + cl_command_queue command_queue) + : ptr_(clCreateBuffer(context, CL_MEM_READ_WRITE, size, NULL, NULL)), + size_(size), + type_(type), + command_queue_(command_queue) {} + + virtual size_t size() const { return size_; } + + virtual void *ptr() const { return static_cast(ptr_.get()); } + + virtual std::type_index type() const { return type_; } + + virtual void set_type(std::type_index type) { type_ = type; } + + std::unique_ptr<_cl_mem, CLMemDeleter> ptr_; + + size_t size_; + + /* the current type of memory */ + std::type_index type_; + + cl_command_queue command_queue_; + }; +}; + +} // namespace framework +} // namespace paddle_mobile diff --git 
a/src/framework/cl/cl_tool.cpp b/src/framework/cl/cl_tool.cpp new file mode 100644 index 0000000000000000000000000000000000000000..827642b6b73cfaee02f4053dce798bf6b3c52f4b --- /dev/null +++ b/src/framework/cl/cl_tool.cpp @@ -0,0 +1,84 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "framework/cl/cl_tool.h" + +namespace paddle_mobile { +namespace framework { + +const char *opencl_error_to_str(cl_int error) { +#define CASE_CL_CONSTANT(NAME) \ + case NAME: \ + return #NAME; + // Suppose that no combinations are possible. 
+ switch (error) { + CASE_CL_CONSTANT(CL_SUCCESS) + CASE_CL_CONSTANT(CL_DEVICE_NOT_FOUND) + CASE_CL_CONSTANT(CL_DEVICE_NOT_AVAILABLE) + CASE_CL_CONSTANT(CL_COMPILER_NOT_AVAILABLE) + CASE_CL_CONSTANT(CL_MEM_OBJECT_ALLOCATION_FAILURE) + CASE_CL_CONSTANT(CL_OUT_OF_RESOURCES) + CASE_CL_CONSTANT(CL_OUT_OF_HOST_MEMORY) + CASE_CL_CONSTANT(CL_PROFILING_INFO_NOT_AVAILABLE) + CASE_CL_CONSTANT(CL_MEM_COPY_OVERLAP) + CASE_CL_CONSTANT(CL_IMAGE_FORMAT_MISMATCH) + CASE_CL_CONSTANT(CL_IMAGE_FORMAT_NOT_SUPPORTED) + CASE_CL_CONSTANT(CL_BUILD_PROGRAM_FAILURE) + CASE_CL_CONSTANT(CL_MAP_FAILURE) + CASE_CL_CONSTANT(CL_MISALIGNED_SUB_BUFFER_OFFSET) + CASE_CL_CONSTANT(CL_EXEC_STATUS_ERROR_FOR_EVENTS_IN_WAIT_LIST) + CASE_CL_CONSTANT(CL_INVALID_VALUE) + CASE_CL_CONSTANT(CL_INVALID_DEVICE_TYPE) + CASE_CL_CONSTANT(CL_INVALID_PLATFORM) + CASE_CL_CONSTANT(CL_INVALID_DEVICE) + CASE_CL_CONSTANT(CL_INVALID_CONTEXT) + CASE_CL_CONSTANT(CL_INVALID_QUEUE_PROPERTIES) + CASE_CL_CONSTANT(CL_INVALID_COMMAND_QUEUE) + CASE_CL_CONSTANT(CL_INVALID_HOST_PTR) + CASE_CL_CONSTANT(CL_INVALID_MEM_OBJECT) + CASE_CL_CONSTANT(CL_INVALID_IMAGE_FORMAT_DESCRIPTOR) + CASE_CL_CONSTANT(CL_INVALID_IMAGE_SIZE) + CASE_CL_CONSTANT(CL_INVALID_SAMPLER) + CASE_CL_CONSTANT(CL_INVALID_BINARY) + CASE_CL_CONSTANT(CL_INVALID_BUILD_OPTIONS) + CASE_CL_CONSTANT(CL_INVALID_PROGRAM) + CASE_CL_CONSTANT(CL_INVALID_PROGRAM_EXECUTABLE) + CASE_CL_CONSTANT(CL_INVALID_KERNEL_NAME) + CASE_CL_CONSTANT(CL_INVALID_KERNEL_DEFINITION) + CASE_CL_CONSTANT(CL_INVALID_KERNEL) + CASE_CL_CONSTANT(CL_INVALID_ARG_INDEX) + CASE_CL_CONSTANT(CL_INVALID_ARG_VALUE) + CASE_CL_CONSTANT(CL_INVALID_ARG_SIZE) + CASE_CL_CONSTANT(CL_INVALID_KERNEL_ARGS) + CASE_CL_CONSTANT(CL_INVALID_WORK_DIMENSION) + CASE_CL_CONSTANT(CL_INVALID_WORK_GROUP_SIZE) + CASE_CL_CONSTANT(CL_INVALID_WORK_ITEM_SIZE) + CASE_CL_CONSTANT(CL_INVALID_GLOBAL_OFFSET) + CASE_CL_CONSTANT(CL_INVALID_EVENT_WAIT_LIST) + CASE_CL_CONSTANT(CL_INVALID_EVENT) + CASE_CL_CONSTANT(CL_INVALID_OPERATION) + 
CASE_CL_CONSTANT(CL_INVALID_GL_OBJECT) + CASE_CL_CONSTANT(CL_INVALID_BUFFER_SIZE) + CASE_CL_CONSTANT(CL_INVALID_MIP_LEVEL) + CASE_CL_CONSTANT(CL_INVALID_GLOBAL_WORK_SIZE) + CASE_CL_CONSTANT(CL_INVALID_PROPERTY) + + default: + return "UNKNOWN ERROR CODE"; + } +#undef CASE_CL_CONSTANT +} + +} // namespace framework +} // namespace paddle_mobile diff --git a/src/framework/cl/cl_tool.h b/src/framework/cl/cl_tool.h new file mode 100644 index 0000000000000000000000000000000000000000..25d5bfc584b59e4fe9d22a922b601f8c32892fd1 --- /dev/null +++ b/src/framework/cl/cl_tool.h @@ -0,0 +1,34 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include "CL/cl.h" + +namespace paddle_mobile { +namespace framework { + +const char* opencl_error_to_str(cl_int error); + +#define CL_CHECK_ERRORS(ERR) \ + if (ERR != CL_SUCCESS) { \ + printf( \ + "OpenCL error with code %s happened in file %s at line %d. 
" \ + "Exiting.\n", \ + paddle_mobile::framework::opencl_error_to_str(ERR), __FILE__, \ + __LINE__); \ + } + +} // namespace framework +} // namespace paddle_mobile diff --git a/src/framework/data_layout.h b/src/framework/data_layout.h index 0ba31ef9b7016b453b34cc4a023b0841b2110540..665b5315bc1c0fca7b9e62f89062f375a9a011be 100644 --- a/src/framework/data_layout.h +++ b/src/framework/data_layout.h @@ -41,7 +41,6 @@ inline DataLayout StringToDataLayout(const std::string &str) { return DataLayout::kAnyLayout; } else { PADDLE_MOBILE_THROW_EXCEPTION("Unknown storage order string: %s", s.c_str()) - exit(0); } } @@ -55,7 +54,6 @@ inline std::string DataLayoutToString(const DataLayout &data_layout) { return "ANY_LAYOUT"; default: PADDLE_MOBILE_THROW_EXCEPTION("Unknown storage order string ") - exit(0); break; } } diff --git a/src/framework/dim.h b/src/framework/dim.h index 85e86076e1de53fa80b75f56237901da49e22eb9..7c78659e3baacdf707dc46884c099dfd0cd284bb 100644 --- a/src/framework/dim.h +++ b/src/framework/dim.h @@ -42,7 +42,7 @@ struct Dim { : head(idx % size.head), tail(idx / size.head, size.tail) {} /** Construct a Dim with each dimension set to the given index */ - Dim(int64_t idx) : head(idx), tail(idx) {} + explicit Dim(int64_t idx) : head(idx), tail(idx) {} bool operator==(const Dim &o) const { return (head == o.head) && (tail == o.tail); @@ -65,7 +65,7 @@ template <> struct Dim<0> { static constexpr int dimensions = 0; - Dim(int64_t _head) {} + explicit Dim(int64_t _head) {} Dim() {} @@ -131,7 +131,6 @@ int64_t &indexer(Dim &dim, int idx) { template <> int64_t &indexer<0>(Dim<0> &dim, int idx) { PADDLE_MOBILE_THROW_EXCEPTION("Invalid index") - exit(0); } template @@ -148,7 +147,6 @@ int64_t indexer(const Dim &dim, int idx) { template <> int64_t indexer<0>(const Dim<0> &dim, int idx) { PADDLE_MOBILE_THROW_EXCEPTION("Invalid index") - exit(0); } } // namespace diff --git a/src/io/executor.cpp b/src/framework/executor.cpp similarity index 64% rename from 
src/io/executor.cpp rename to src/framework/executor.cpp index 9efec27c9df3d51a3411db87faee924b374d2ac7..c7ef09ed5a1466a7396ec9c177eb3c48abd91ad7 100644 --- a/src/io/executor.cpp +++ b/src/framework/executor.cpp @@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include "io/executor.h" +#include "framework/executor.h" #include #include #include @@ -26,12 +26,26 @@ limitations under the License. */ #include "framework/program/var_desc.h" #include "framework/scope.h" #include "framework/tensor.h" -#include "operators/math/gemm.h" +#include "memory/t_malloc.h" + +#ifdef PADDLE_EXECUTOR_MULTITHREAD +#include +#include +#include "common/threadpool.h" +#endif + +#ifdef PADDLE_MOBILE_CL +#include "framework/cl/cl_image.h" +#endif namespace paddle_mobile { +namespace framework { +using framework::Variable; using framework::Variable; +#pragma mark - executor + template Executor::Executor(const framework::Program p, int batch_size, const bool use_optimize, const bool loddable) @@ -73,8 +87,10 @@ Executor::Executor(const framework::Program p, int batch_size, } std::shared_ptr to_predict_block = to_predict_program_->Block(0); + int i = 0; auto &ops = ops_of_block_[*to_predict_block.get()]; for (const auto &op : ops) { + DLOG << "Initialize op[" << i++ << "]: " << op->Type(); op->Init(); } } @@ -89,8 +105,8 @@ static void LoadMemInternal(void **data, framework::LoDTensor *tensor, // should be moved into operator init function float min_value; float max_value; - memcpy(&min_value, data_buf, sizeof(float)); - memcpy(&max_value, data_buf + sizeof(float), sizeof(float)); + memory::Copy(&min_value, data_buf, sizeof(float)); + memory::Copy(&max_value, data_buf + sizeof(float), sizeof(float)); data_buf += 2 * sizeof(float); const float factor = (max_value - min_value) / 255.0; const uint8_t *uint8_data = reinterpret_cast(data_buf); @@ -99,7 
+115,7 @@ static void LoadMemInternal(void **data, framework::LoDTensor *tensor, } data_buf += size * sizeof(uint8_t); } else { - memcpy(tensor_data, *data_buf, size * sizeof(Dtype)); + memory::Copy(tensor_data, *data_buf, size * sizeof(Dtype)); *data_buf += size * sizeof(Dtype); } } @@ -115,7 +131,7 @@ void Executor::LoadMemory( // lod information // uint64_t lod_level = *(reinterpret_cast(*data_buf)); uint64_t lod_level = 0; - memcpy(&lod_level, *data_buf, sizeof(uint64_t)); + memory::Copy(&lod_level, *data_buf, sizeof(uint64_t)); *data_buf += sizeof(uint64_t); auto *lod = tensor->mutable_lod(); @@ -124,7 +140,7 @@ void Executor::LoadMemory( uint64_t size = *(reinterpret_cast(*data_buf)); *data_buf += sizeof(uint64_t); std::vector tmp_dim(size / sizeof(size_t)); - memcpy(tmp_dim.data(), *data_buf, size); + memory::Copy(tmp_dim.data(), *data_buf, size); (*lod)[i] = std::move(tmp_dim); *data_buf += size; } @@ -390,13 +406,18 @@ std::vector::Ptype> Executor::Predict( const std::vector &input, const std::vector &dims) { framework::Tensor tensor(input, framework::make_ddim(dims)); std::shared_ptr output_tensor = Predict(tensor, 0); - Executor::Ptype *output_ptr = - output_tensor->data::Ptype>(); - std::vector::Ptype> result_vector; - for (int j = 0; j < output_tensor->numel(); ++j) { - result_vector.push_back(output_ptr[j]); + if (output_tensor != nullptr) { + Executor::Ptype *output_ptr = + output_tensor->data::Ptype>(); + std::vector::Ptype> result_vector; + for (int j = 0; j < output_tensor->numel(); ++j) { + result_vector.push_back(output_ptr[j]); + } + return result_vector; + } else { + DLOG << "return empty vector"; + return {}; } - return result_vector; } #ifdef PADDLE_MOBILE_FPGA @@ -470,8 +491,236 @@ void Executor::Predict_To(int end) { } #endif +#ifdef PADDLE_MOBILE_CL +template +void Executor::LoadMemory(const framework::VarDesc var_desc, + float *tensorInput, char **data) {} + +template <> +void Executor::LoadMemory( + const framework::VarDesc var_desc, 
float *tensorInput, char **data) { + // 1. version + uint32_t version = *reinterpret_cast(*data); + + (*data) += sizeof(uint32_t); + + // 2 Lod information + uint64_t *lod_level_ptr = new uint64_t(); + memcpy(lod_level_ptr, (*data), sizeof(uint64_t)); + uint64_t lod_level = *lod_level_ptr; + delete lod_level_ptr; + (*data) += sizeof(uint64_t); + + for (uint64_t i = 0; i < lod_level; ++i) { + uint64_t size = *reinterpret_cast(*data); + (*data) += sizeof(uint64_t); + std::vector tmp(size / sizeof(size_t)); + + for (int k = 0; k < tmp.size(); ++k) { + tmp[k] = *reinterpret_cast(*data); + (*data) += sizeof(size_t); + } + } + + // 3. tensor version + uint32_t tensor_version = *reinterpret_cast(*data); + (*data) += sizeof(uint32_t); + + // 4. tensor desc + int32_t size = *reinterpret_cast(*data); + (*data) += sizeof(int32_t); + + std::unique_ptr buf(new char[size]); + for (int m = 0; m < size; ++m) { + buf.get()[m] = (*data)[m]; + } + (*data) += (sizeof(char) * size); + + const framework::TensorDesc &desc = var_desc.Tensor_desc(); + int memory_size = 1; + for (auto l : desc.Dims()) { + memory_size *= l; + } + + void *memory = nullptr; + // int type_size = 0; + // switch (desc.DataType()) { + // case framework::VARTYPE_TYPE_FP16: + // type_size = 2; + // break; + // case framework::VARTYPE_TYPE_FP32: + // type_size = 4; + // memory = tensor->mutable_data(); + // break; + // case framework::VARTYPE_TYPE_FP64: + // type_size = 8; + // break; + // case framework::VARTYPE_TYPE_INT32: + // memory = tensor->mutable_data(); + // type_size = 4; + // break; + // case framework::VARTYPE_TYPE_INT64: + // type_size = 8; + // break; + // case framework::VARTYPE_TYPE_BOOL: + // type_size = 1; + // break; + // default: + // break; + // } + int type_size = 4; + memory = tensorInput; + if (program_.quantification) { + float min_value; + float max_value; + + memcpy(&min_value, *data, sizeof(float)); + memcpy(&max_value, *data + sizeof(float), sizeof(float)); + *data += 2 * sizeof(float); + 
const float factor = (max_value - min_value) / 255.0; + uint8_t *uint8_data = reinterpret_cast(*data); + for (int k = 0; k < memory_size; ++k) { + static_cast(memory)[k] = uint8_data[k] * factor + min_value; + } + *data += (memory_size * sizeof(uint8_t)); + } else { + for (int n = 0; n < memory_size; n++) { + float value; + memcpy(&value, *data + n * type_size, type_size); + if (value < 1e-30 && value > -1e-30) { + static_cast(memory)[n] = 0.0; + } else { + static_cast(memory)[n] = value; + } + } + (*data) += (sizeof(char) * memory_size * type_size); + } +} + +template <> +void Executor::InitMemory() { + for (const auto &block : to_predict_program_->Blocks()) { + for (const auto &var_desc : block->Vars()) { + auto var = program_.scope->Var(var_desc->Name()); + if (var_desc->Persistable()) { + CLImage *cl_image = nullptr; + if (var_desc->Name() == "feed" || var_desc->Name() == "fetch") { + var->template GetMutable(); + continue; + } else { + cl_image = var->template GetMutable(); + } + + char *origin_data = + ReadFileToBuff(program_.model_path + "/" + var_desc->Name()); + char *data = origin_data; + cl_context context = program_.scope->GetCLScpoe()->Context(); + const framework::TensorDesc &desc = var_desc->Tensor_desc(); + int numel = 1; + for (auto l : desc.Dims()) { + numel *= l; + } + DLOG << var_desc->Name(); + float *tensorInput = static_cast( + paddle_mobile::memory::Alloc(sizeof(float) * numel)); + LoadMemory(*var_desc, tensorInput, &data); + + framework::DDim ddim = framework::make_ddim(desc.Dims()); + + // has not init + cl_image->SetTensorData(tensorInput, ddim); + + delete origin_data; + paddle_mobile::memory::Free(tensorInput); + } else { + if (var_desc->Type() == framework::VARTYPE_TYPE_LOD_TENSOR) { + auto cl_image = var->template GetMutable(); + cl_context context = program_.scope->GetCLScpoe()->Context(); + cl_command_queue command_queue = + program_.scope->GetCLScpoe()->CommandQueue(); + + const framework::TensorDesc &desc = 
var_desc->Tensor_desc(); + // framework::DDim ddim = framework::make_ddim(desc.Dims()); + framework::DDim ddim = cl_image->dims(); + DLOG << var_desc->Name(); + cl_image->InitEmptyImage(context, command_queue, ddim); + } + } + } + } +} + +template <> +void Executor::InitCombineMemory() { + char *origin_data = nullptr; + bool self_alloc = false; + if (program_.combined_params_buf && program_.combined_params_len) { + LOG(kLOG_INFO) << "use outter memory"; + origin_data = reinterpret_cast(program_.combined_params_buf); + } else { + LOG(kLOG_INFO) << " begin init combine memory"; + self_alloc = true; + origin_data = ReadFileToBuff(program_.para_path); + } + PADDLE_MOBILE_ENFORCE(origin_data != nullptr, "origin_data==nullptr!!!"); + float *data = reinterpret_cast(origin_data); + + for (const auto &block : to_predict_program_->Blocks()) { + for (const auto &var_desc : block->Vars()) { + auto var = program_.scope->Var(var_desc->Name()); + if (var_desc->Persistable()) { + CLImage *cl_image = nullptr; + if (var_desc->Name() == "feed" || var_desc->Name() == "fetch") { + var->template GetMutable(); + continue; + } else { + cl_image = var->template GetMutable(); + } + + cl_context context = program_.scope->GetCLScpoe()->Context(); + + const framework::TensorDesc &desc = var_desc->Tensor_desc(); + framework::DDim ddim = framework::make_ddim(desc.Dims()); + + int numel = 1; + for (int i = 0; i < ddim.size(); i++) { + numel = numel * ddim[i]; + } + float *tensorInput = static_cast( + paddle_mobile::memory::Alloc(sizeof(float) * numel)); + LoadMemory(*var_desc, tensorInput, &origin_data); + + // has not init + cl_image->SetTensorData(tensorInput, ddim); + + paddle_mobile::memory::Free(tensorInput); + } else { + auto cl_image = var->template GetMutable(); + cl_context context = program_.scope->GetCLScpoe()->Context(); + cl_command_queue command_queue = + program_.scope->GetCLScpoe()->CommandQueue(); + const framework::TensorDesc &desc = var_desc->Tensor_desc(); + framework::DDim 
ddim = cl_image->dims(); + // framework::DDim ddim = framework::make_ddim(desc.Dims()); + cl_image->InitEmptyImage(context, command_queue, ddim); + } + } + } + if (self_alloc) { + delete data; + } + LOG(kLOG_INFO) << " end init combine memory "; +} + +#endif + template class Executor; -template class Executor; + template class Executor; +template class Executor; + +template class Executor; + +} // namespace framework } // namespace paddle_mobile diff --git a/src/io/executor.h b/src/framework/executor.h similarity index 95% rename from src/io/executor.h rename to src/framework/executor.h index 98906749effb7e46318157085c4505c57726ec62..be1c87e239c9c2ace9b4791f9769c176c9d5ef8e 100644 --- a/src/io/executor.h +++ b/src/framework/executor.h @@ -26,6 +26,7 @@ limitations under the License. */ #include "framework/tensor.h" namespace paddle_mobile { +namespace framework { template class Executor { @@ -79,7 +80,10 @@ class Executor { void LoadMemory(void **data, const std::shared_ptr var_desc, framework::LoDTensor *tensor); - +#ifdef PADDLE_MOBILE_CL + void LoadMemory(const framework::VarDesc var_desc, float *tensorInput, + char **data); +#endif framework::Program program_; int batch_size_ = 1; std::shared_ptr to_predict_program_; @@ -97,4 +101,5 @@ class Executor { bool loddable_ = false; }; +} // namespace framework } // namespace paddle_mobile diff --git a/src/framework/load_ops.h b/src/framework/load_ops.h index 4c6842572e49daa283efa2d92bd43e4687d92e26..982f1c0f3525afde8475866c0121343fafc9d5a0 100644 --- a/src/framework/load_ops.h +++ b/src/framework/load_ops.h @@ -109,9 +109,15 @@ LOAD_FUSION_MATCHER(fusion_conv_add_bn_relu); #ifdef RESHAPE_OP LOAD_OP2(reshape, CPU, MALI_GPU); #endif +#ifdef RESHAPE2_OP +LOAD_OP2(reshape2, CPU, MALI_GPU); +#endif #ifdef TRANSPOSE_OP LOAD_OP1(transpose, CPU); #endif +#ifdef TRANSPOSE2_OP +LOAD_OP1(transpose2, CPU); +#endif #ifdef PRIORBOX_OP LOAD_OP1(prior_box, CPU); #endif diff --git a/src/io/loader.cpp b/src/framework/loader.cpp 
similarity index 51% rename from src/io/loader.cpp rename to src/framework/loader.cpp index 7dd55950be240a88a7521d4be260416625419015..5587d0698fa2b9a04532deae618545d15ecd631f 100644 --- a/src/io/loader.cpp +++ b/src/framework/loader.cpp @@ -12,13 +12,16 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include "io/loader.h" +#include "framework/loader.h" #include "framework/lod_tensor.h" #include "framework/program/program-optimize/program_optimize.h" +#ifdef PADDLE_MOBILE_CL +#include "framework/cl/cl_image.h" +#endif namespace paddle_mobile { -using framework::Variable; +namespace framework { /** * muteandresize tensor as originProgramDesc and scope in loadParams @@ -26,23 +29,57 @@ using framework::Variable; * @param originProgramDesc * @param scope */ -void InitMemoryFromProgram( - std::shared_ptr &originProgramDesc, // NOLINT - std::shared_ptr &scope) { // NOLINT +template +void Loader::InitMemoryFromProgram( + const std::shared_ptr &originProgramDesc, + const std::shared_ptr &scope) { + for (const auto &block : originProgramDesc.get()->Blocks()) { + for (const auto &var_desc : block->Vars()) { + auto var = scope.get()->Var(var_desc->Name()); + if (var_desc->Type() == VARTYPE_TYPE_LOD_TENSOR) { + if (var_desc->Persistable()) { + auto dim = var_desc->Tensor_desc().Dims(); + auto tensor = var->GetMutable(); + tensor->Resize(make_ddim(dim)); + } else { + auto dim = var_desc->Tensor_desc().Dims(); + PADDLE_MOBILE_ENFORCE(dim.size() > 0, "dim size is 0"); + // dim[0] = 1; + for (auto &d : dim) { + if (d < 0) { + d *= -1; + } + } + auto tensor = var->GetMutable(); + tensor->Resize(make_ddim(dim)); + } + } else { + // TODO(codeWorm): some. 
+ } + } + } +} + +#ifdef PADDLE_MOBILE_CL +template <> +void Loader::InitMemoryFromProgram( + const std::shared_ptr &originProgramDesc, + const std::shared_ptr &scope) { for (const auto &block : originProgramDesc.get()->Blocks()) { for (const auto &var_desc : block->Vars()) { auto var = scope.get()->Var(var_desc->Name()); - if (var_desc->Type() == framework::VARTYPE_TYPE_LOD_TENSOR) { + if (var_desc->Type() == VARTYPE_TYPE_LOD_TENSOR) { if (var_desc->Persistable()) { auto dim = var_desc->Tensor_desc().Dims(); - auto tensor = var->GetMutable(); - tensor->Resize(framework::make_ddim(dim)); + // auto tensor = var->GetMutable(); + auto cl_image = var->GetMutable(); + cl_image->Resize(make_ddim(dim)); } else { auto dim = var_desc->Tensor_desc().Dims(); PADDLE_MOBILE_ENFORCE(dim.size() > 0, "dim size is 0"); dim[0] = 1; - auto tensor = var->GetMutable(); - tensor->Resize(framework::make_ddim(dim)); + auto cl_image = var->GetMutable(); + cl_image->Resize(make_ddim(dim)); } } else { // TODO(codeWorm): some. 
@@ -50,6 +87,56 @@ void InitMemoryFromProgram( } } } +template <> +const Program +Loader::LoadCombinedMemory( + size_t read_size, const uint8_t *buf, size_t combined_params_len, + uint8_t *combined_params_buf, bool optimize, bool quantification) { + bool can_add_split = false; + + PaddleMobile__Framework__Proto__ProgramDesc *c_program; + PADDLE_MOBILE_ENFORCE(buf != nullptr, "read from __model__ is null"); + + c_program = paddle_mobile__framework__proto__program_desc__unpack( + nullptr, read_size, buf); + // + PADDLE_MOBILE_ENFORCE(c_program != nullptr, "program is null"); + // + DLOG << "n_ops: " << (*c_program->blocks)->n_ops; + // + + auto originProgramDesc = std::make_shared(c_program); + + Program program; + program.combined = true; + program.originProgram = originProgramDesc; + program.quantification = quantification; + program.combined_params_len = combined_params_len; + program.combined_params_buf = combined_params_buf; + + auto scope = std::make_shared(); + program.scope = scope; + InitMemoryFromProgram(originProgramDesc, scope); + if (optimize) { + ProgramOptimize program_optimize; + program.optimizeProgram = + program_optimize.FusionOptimize(originProgramDesc, can_add_split); + if (!program.optimizeProgram) { + program.optimizeProgram = originProgramDesc; + } + } + if (optimize) { + program.optimizeProgram->Description("optimize: "); + } else { + originProgramDesc->Description("program: "); + } + paddle_mobile__framework__proto__program_desc__free_unpacked(c_program, + nullptr); + return program; +} + +#endif + /** * fusion and print someinfos * @tparam Dtype @@ -61,19 +148,18 @@ void InitMemoryFromProgram( */ template void FusionAndPrintInfos( - bool optimize, bool can_add_split, - framework::Program &program, // NOLINT - const std::shared_ptr &originProgramDesc) { + bool optimize, bool can_add_split, Program *program, + const std::shared_ptr &originProgramDesc) { if (optimize) { - framework::ProgramOptimize program_optimize; - program.optimizeProgram = 
+ ProgramOptimize program_optimize; + program->optimizeProgram = program_optimize.FusionOptimize(originProgramDesc, can_add_split); - if (!program.optimizeProgram) { - program.optimizeProgram = originProgramDesc; + if (!program->optimizeProgram) { + program->optimizeProgram = originProgramDesc; } } if (optimize) { - program.optimizeProgram->Description("optimize: "); + program->optimizeProgram->Description("optimize: "); } else { originProgramDesc->Description("program: "); } @@ -102,9 +188,10 @@ static size_t ReadBuffer(const char *file_name, uint8_t **out) { } template -const framework::Program Loader::Load( - const std::string &dirname, bool optimize, bool quantification, - bool can_add_split) { +const Program Loader::Load(const std::string &dirname, + bool optimize, + bool quantification, + bool can_add_split) { auto program = this->LoadProgram(dirname + "/__model__", optimize, quantification, can_add_split); program.model_path = dirname; @@ -112,9 +199,10 @@ const framework::Program Loader::Load( } template -const framework::Program Loader::Load( - const std::string &model_path, const std::string ¶_path, bool optimize, - bool quantification) { +const Program Loader::Load(const std::string &model_path, + const std::string ¶_path, + bool optimize, + bool quantification) { auto program = this->LoadProgram(model_path, optimize, quantification); program.para_path = para_path; @@ -124,7 +212,7 @@ const framework::Program Loader::Load( } template -const framework::Program Loader::LoadProgram( +const Program Loader::LoadProgram( const std::string &model_path, bool optimize, bool quantification, bool can_add_split) { std::string model_filename = model_path; @@ -141,29 +229,29 @@ const framework::Program Loader::LoadProgram( // DLOG << "n_ops: " << (*c_program->blocks)->n_ops; // - auto originProgramDesc = std::make_shared(c_program); + auto originProgramDesc = std::make_shared(c_program); - framework::Program program; + Program program; program.originProgram = 
originProgramDesc; program.quantification = quantification; program.combined_params_len = 0; program.combined_params_buf = nullptr; - auto scope = std::make_shared(); + auto scope = std::make_shared(); program.scope = scope; // use originProgramDesc and scope to init tensors InitMemoryFromProgram(originProgramDesc, scope); // perform fusion and print infos - FusionAndPrintInfos(optimize, can_add_split, program, originProgramDesc); + FusionAndPrintInfos(optimize, can_add_split, &program, originProgramDesc); paddle_mobile__framework__proto__program_desc__free_unpacked(c_program, NULL); return program; } template -const framework::Program Loader::LoadCombinedMemory( +const Program Loader::LoadCombinedMemory( size_t read_size, const uint8_t *buf, size_t combined_params_len, - const uint8_t *combined_params_buf, bool optimize, bool quantification) { + uint8_t *combined_params_buf, bool optimize, bool quantification) { bool can_add_split = false; PaddleMobile__Framework__Proto__ProgramDesc *c_program; @@ -177,26 +265,31 @@ const framework::Program Loader::LoadCombinedMemory( DLOG << "n_ops: " << (*c_program->blocks)->n_ops; // - auto originProgramDesc = std::make_shared(c_program); + auto originProgramDesc = std::make_shared(c_program); - framework::Program program; + Program program; program.combined = true; program.originProgram = originProgramDesc; program.quantification = quantification; program.combined_params_len = combined_params_len; program.combined_params_buf = combined_params_buf; - auto scope = std::make_shared(); + auto scope = std::make_shared(); program.scope = scope; InitMemoryFromProgram(originProgramDesc, scope); - FusionAndPrintInfos(optimize, can_add_split, program, originProgramDesc); + FusionAndPrintInfos(optimize, can_add_split, &program, originProgramDesc); paddle_mobile__framework__proto__program_desc__free_unpacked(c_program, nullptr); return program; } template class Loader; + template class Loader; + template class Loader; +template class 
Loader; + +} // namespace framework } // namespace paddle_mobile diff --git a/src/framework/loader.h b/src/framework/loader.h new file mode 100644 index 0000000000000000000000000000000000000000..3200f0b25368fa123b80c51000cfd6c6a6d084b6 --- /dev/null +++ b/src/framework/loader.h @@ -0,0 +1,65 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include + +#include "common/types.h" +#include "framework/program/program.h" + +namespace paddle_mobile { +namespace framework { + +template +class Loader { + public: + /* + * @b load separate format fluid model + * @b 加载分开形式的 fluid 模型 + * */ + const Program Load(const std::string &dirname, + bool optimize = false, + bool quantification = false, + bool can_add_split = false); + + /* + * @b load combine format fluid mode + * @b 加载结合在一起格式的模型 + * */ + const Program Load(const std::string &model_path, + const std::string ¶_path, + bool optimize = false, + bool quantification = false); + + const Program LoadCombinedMemory(size_t model_len, + const uint8_t *model_buf, + size_t combined_params_len, + uint8_t *combined_params_buf, + bool optimize = false, + bool quantification = false); + + private: + const Program LoadProgram(const std::string &model_path, + bool optimize = false, + bool quantification = false, + bool can_add_split = false); + + void InitMemoryFromProgram( + const std::shared_ptr &originProgramDesc, + const std::shared_ptr &scope); +}; + +} // namespace 
framework +} // namespace paddle_mobile diff --git a/src/framework/op_registry.h b/src/framework/op_registry.h index 32954531d0854b3318185aacdf99314051f98f6a..219385ab1429fefddc9d380799259f7562e0030f 100644 --- a/src/framework/op_registry.h +++ b/src/framework/op_registry.h @@ -14,8 +14,10 @@ limitations under the License. */ #pragma once +#include #include #include + #include "common/log.h" #include "common/type_define.h" #include "framework/op_info.h" @@ -120,5 +122,8 @@ class OpRegistry { #define REGISTER_OPERATOR_FPGA(op_type, op_class) \ REGISTER_OPERATOR(op_type, op_class, fpga, paddle_mobile::FPGA); +#define REGISTER_OPERATOR_CL(op_type, op_class) \ + REGISTER_OPERATOR(op_type, op_class, cl, paddle_mobile::GPU_CL); + } // namespace framework } // namespace paddle_mobile diff --git a/src/framework/operator.cpp b/src/framework/operator.cpp index 21b14dfcac682e7d310dcf4e8c47afaa0fb68fb3..e0b40cebf7f14e0b927e4666d63e740213918333 100644 --- a/src/framework/operator.cpp +++ b/src/framework/operator.cpp @@ -56,7 +56,7 @@ template void OperatorBase::CheckAllInputOutputSet() const {} template -void OperatorBase::Run() const { +void OperatorBase::Run() { RunImpl(); #ifdef PADDLE_MOBILE_DEBUG DLOG << "-------------" << type_ << "----------------------------"; @@ -84,9 +84,57 @@ void OperatorBase::Run() const { #endif } +#ifdef PADDLE_MOBILE_CL +template <> +void OperatorBase::Run() { + RunImpl(); +#ifdef PADDLE_MOBILE_DEBUG + DLOG << "-------------" << type_ << "----------------------------"; + vector input_keys = GetInputKeys(); + for (const auto key : input_keys) { + auto var_vec_in = inputs_.at(key); + for (int i = 0; i < var_vec_in.size(); ++i) { + auto vari = scope_->FindVar(var_vec_in[i]); + if (vari->IsInitialized()) { + if (type_ == "feed") { + Tensor *tensor = vari->template GetMutable(); + if (tensor) DLOG << type_ << " input- " << key << "=" << *tensor; + } else { + CLImage *cl_image = vari->template GetMutable(); + if (cl_image) { + DLOG << type_ << " 
input- " << key << "=" << *cl_image; + } + } + } + } + } + for (const auto key : GetOutKeys()) { + auto var_vec_out = outputs_.at(key); + for (int i = 0; i < var_vec_out.size(); ++i) { + auto vari = scope_->FindVar(var_vec_out[i]); + if (vari->IsInitialized()) { + if (type_ == "fetch") { + Tensor *tensor = vari->template GetMutable(); + if (tensor) { + DLOG << type_ << " output- " << key << "=" << *tensor; + } + } else { + CLImage *cl_image = vari->template GetMutable(); + if (cl_image) { + DLOG << type_ << " output- " << key << "=" << *cl_image; + } + } + } + } + } +#endif +} +#endif + template class OperatorBase; template class OperatorBase; template class OperatorBase; +template class OperatorBase; } // namespace framework } // namespace paddle_mobile diff --git a/src/framework/operator.h b/src/framework/operator.h index 0a9127f079f3c30acbc9f9c7cf0518d7354b5431..464910b613322451d05adcc772825079d0d8f677 100644 --- a/src/framework/operator.h +++ b/src/framework/operator.h @@ -32,7 +32,10 @@ limitations under the License. 
*/ #include "framework/scope.h" #include "framework/tensor.h" #include "framework/variable.h" - +#ifdef PADDLE_MOBILE_CL +#include "framework/cl/cl_helper.h" +#include "framework/cl/cl_scope.h" +#endif namespace paddle_mobile { namespace framework { using std::string; @@ -60,10 +63,10 @@ class OperatorBase { const VariableNameMap &outputs, const AttributeMap &attrs, std::shared_ptr scope); virtual ~OperatorBase() {} - void Run() const; + void Run(); std::vector GetOutKeys() const; std::vector GetInputKeys() const; - virtual void RunImpl() const = 0; + virtual void RunImpl() = 0; virtual void Init() = 0; /* @@ -113,9 +116,13 @@ class OperatorWithKernel : public OperatorBase { const VariableNameMap &outputs, const AttributeMap &attrs, std::shared_ptr scope) : OperatorBase(type, inputs, outputs, attrs, scope), - param_(inputs, outputs, attrs, *scope) {} + param_(inputs, outputs, attrs, *scope) { +#ifdef PADDLE_MOBILE_CL + kernel_.InitCLHelper(scope->GetCLScpoe()); +#endif + } - virtual void RunImpl() const { this->kernel_.Compute(this->param_); } + virtual void RunImpl() { this->kernel_.Compute(this->param_); } virtual void InferShape() const = 0; @@ -135,22 +142,35 @@ class OperatorWithKernel : public OperatorBase { template class OpKernelBase { public: - /* - * @b 所有kernel 需实现 Compute 方法 - * @p para 这个参数为 kernel 运算时所需要用到参数组成的一个结构体, - * 所有结构体存在与: paddle-mobile/src/operators/op_param.h - * */ -#ifdef PADDLE_MOBILE_MALI_GPU + OpKernelBase() = default; + +#ifdef PADDLE_MOBILE_CL + virtual void InitCLHelper(CLScope *clScope) { + cl_helper_ = CLHelper(clScope); + } +#endif + + /* + * @b 所有kernel 需实现 Compute 方法 + * @p para 这个参数为 kernel 运算时所需要用到参数组成的一个结构体, + * 所有结构体存在与: paddle-mobile/src/operators/op_param.h + * */ +#ifdef PADDLE_McOBILE_MALI_GPU OpKernelBase() { acl_op_ = nullptr; } void *GetAclOp() const { return acl_op_; } void SetAclOp(void *op, void *ob) const { reinterpret_cast *>(ob)->acl_op_ = op; } #endif - virtual void Compute(const P ¶) const = 0; + virtual void 
Compute(const P ¶) = 0; virtual bool Init(P *para) { return true; } virtual ~OpKernelBase() = default; + protected: +#ifdef PADDLE_MOBILE_CL + CLHelper cl_helper_; +#endif + private: #ifdef PADDLE_MOBILE_MALI_GPU void *acl_op_; diff --git a/src/framework/program/program.h b/src/framework/program/program.h index 696cf75b91ff88837cffd3304f5fe3cd491e77eb..6a25b1c40bd5c1b74ded54ee4134d71c77b15244 100644 --- a/src/framework/program/program.h +++ b/src/framework/program/program.h @@ -18,6 +18,8 @@ limitations under the License. */ #include "framework/program/program_desc.h" #include "framework/scope.h" +#include + namespace paddle_mobile { namespace framework { @@ -32,7 +34,7 @@ class Program { bool combined = false; bool quantification = false; size_t combined_params_len; - const uint8_t *combined_params_buf; + uint8_t *combined_params_buf; }; } // namespace framework diff --git a/src/framework/scope.h b/src/framework/scope.h index 054f141ff68895e0879fd31e15d90c76ea038135..abc727231a0d119ab53d765ab020085aaab9102d 100644 --- a/src/framework/scope.h +++ b/src/framework/scope.h @@ -15,8 +15,14 @@ limitations under the License. */ #pragma once #include +#include #include -#include "variable.h" +#include + +#ifdef PADDLE_MOBILE_CL +#include "framework/cl/cl_scope.h" +#endif +#include "framework/variable.h" namespace paddle_mobile { namespace framework { @@ -33,6 +39,10 @@ class Scope { delete kid; } kids_.clear(); + +#ifdef PADDLE_MOBILE_CL + delete cl_scope_; +#endif } Scope &NewScope() const; @@ -72,6 +82,10 @@ class Scope { Variable *FindVarLocally(const std::string &name) const; +#ifdef PADDLE_MOBILE_CL + CLScope *GetCLScpoe() { return cl_scope_; } +#endif + private: // Call Scope::NewScope for a sub-scope. 
explicit Scope(Scope const *parent) : parent_(parent) {} @@ -79,6 +93,10 @@ class Scope { mutable std::unordered_map vars_; mutable std::list kids_; Scope const *parent_{nullptr}; + +#ifdef PADDLE_MOBILE_CL + CLScope *cl_scope_ = new CLScope(); +#endif }; } // namespace framework } // namespace paddle_mobile diff --git a/src/framework/tensor.h b/src/framework/tensor.h index 8609d8d1530495526302ee50dd5b83ea3d220b1a..9e6ae7288b755d40973264f8744c7c54f73193bd 100644 --- a/src/framework/tensor.h +++ b/src/framework/tensor.h @@ -24,65 +24,24 @@ limitations under the License. */ #include #include "common/enforce.h" -#include "common/types.h" #include "framework/data_layout.h" -#include "framework/ddim.h" +#include "framework/tensor_base.h" #include "memory/t_malloc.h" namespace paddle_mobile { namespace framework { -template -struct SizeOfTypeFunctor; - -template -struct SizeOfTypeFunctor { - size_t operator()(std::type_index type) const { - if (typeid(T).hash_code() == type.hash_code()) { - return sizeof(T); - } else { - return 0UL; - } - } -}; - -template <> -struct SizeOfTypeFunctor<> { - size_t operator()(std::type_index type) const { return 0UL; } -}; - -template -struct SizeOfTypeFunctor { - size_t operator()(std::type_index type) const { - SizeOfTypeFunctor head; - size_t head_size = head(type); - if (head_size != 0) { - return head_size; - } - SizeOfTypeFunctor tail; - return tail(type); - } -}; - -static inline size_t SizeOfType(std::type_index type) { - SizeOfTypeFunctor - functor; - size_t size = functor(type); - - PADDLE_MOBILE_ENFORCE(size != 0UL, "Cannot get size of type %s", type.name()); - return size; -} class LoDTensor; -class Tensor { +class Tensor : public TensorBase { public: - Tensor() : offset_(0) {} + Tensor() {} template - Tensor(std::vector input, DDim ddim) : offset_(0) { + Tensor(std::vector input, DDim ddim) { PADDLE_MOBILE_ENFORCE( input.size() == framework::product(ddim), "input vector'length should be equal to tensor's length"); + auto 
input_ptr = mutable_data(ddim); for (int i = 0; i < input.size(); ++i) { input_ptr[i] = input[i]; @@ -95,44 +54,19 @@ class Tensor { this->offset_ = inTensor.offset_; } - /*! Return a pointer to mutable memory block. */ - template - inline T *data() { - check_memory_size(); - PADDLE_MOBILE_ENFORCE( - (std::is_same::value || - holder_->type().hash_code() == typeid(T).hash_code()), - "Tensor holds the wrong type, it holds %s", - this->holder_->type().name()); - - return reinterpret_cast(reinterpret_cast(holder_->ptr()) + - offset_); - } - - /*! Return a pointer to constant memory block. */ - template - inline const T *data() const { - check_memory_size(); - PADDLE_MOBILE_ENFORCE( - (std::is_same::value || - holder_->type().hash_code() == typeid(T).hash_code()), - "Tensor holds the wrong type, it holds %s ,requested:%s", - this->holder_->type().name(), typeid(T).name()); - - return reinterpret_cast( - reinterpret_cast(holder_->ptr()) + offset_); + /*! Resize the dimensions of the memory block. */ + inline Tensor &Resize(const DDim &dims) { + dims_ = dims; + return *this; } - inline bool IsInitialized() const { return holder_ != nullptr; } - - /** - * @brief Return a pointer to mutable memory block. - * @note If not exist, then allocation. - */ - template - inline T *mutable_data() { - static_assert(std::is_pod::value, "T must be POD"); - return reinterpret_cast(mutable_data(typeid(T))); + /*! The internal of two tensors share the same memory block. */ + inline Tensor &ShareDataWith(const Tensor &src) { + src.check_memory_size(); + if (holder_.get() != src.holder_.get()) { + *this = src; + } + return *this; } inline void *mutable_data(std::type_index type) { @@ -149,6 +83,16 @@ class Tensor { reinterpret_cast(holder_->ptr()) + offset_); } + /** + * @brief Return a pointer to mutable memory block. + * @note If not exist, then allocation. 
+ */ + template + inline T *mutable_data() { + static_assert(std::is_pod::value, "T must be POD"); + return reinterpret_cast(mutable_data(typeid(T))); + } + /** * @brief Return a pointer to mutable memory block. * @@ -164,27 +108,6 @@ class Tensor { return mutable_data(); } - /*! Return the dimensions of the memory block. */ - inline const DDim &dims() const { return dims_; } - - /*! Return the numel of the memory block. */ - inline int64_t numel() const { return product(dims_); } - - /*! Resize the dimensions of the memory block. */ - inline Tensor &Resize(const DDim &dims) { - dims_ = dims; - return *this; - } - - /*! The internal of two tensors share the same memory block. */ - inline Tensor &ShareDataWith(const Tensor &src) { - src.check_memory_size(); - if (holder_.get() != src.holder_.get()) { - *this = src; - } - return *this; - } - /** * @brief Return a sub-tensor of the given tensor. * @@ -218,44 +141,35 @@ class Tensor { } } - std::type_index type() const { + /*! Return a pointer to mutable memory block. */ + template + inline T *data() { + check_memory_size(); PADDLE_MOBILE_ENFORCE( - holder_ != nullptr, - "Tensor not initialized yet when Tensor::type() is called.") - return holder_->type(); - } + (std::is_same::value || + holder_->type().hash_code() == typeid(T).hash_code()), + "Tensor holds the wrong type, it holds %s", + this->holder_->type().name()); - // memory size returns the holding memory size in byte. - size_t memory_size() const { - return holder_ == nullptr ? 0UL : holder_->size() - offset_; + return reinterpret_cast(reinterpret_cast(holder_->ptr()) + + offset_); } - inline void check_memory_size() const { + /*! Return a pointer to constant memory block. */ + template + inline const T *data() const { + check_memory_size(); PADDLE_MOBILE_ENFORCE( - holder_ != nullptr, - "Tensor holds no memory. Call Tensor::mutable_data first."); - PADDLE_MOBILE_ENFORCE(numel() * SizeOfType(type()) <= memory_size(), - "Tensor's dims_ is out of bound. 
"); + (std::is_same::value || + holder_->type().hash_code() == typeid(T).hash_code()), + "Tensor holds the wrong type, it holds %s ,requested:%s", + this->holder_->type().name(), typeid(T).name()); + + return reinterpret_cast( + reinterpret_cast(holder_->ptr()) + offset_); } private: - /** - * @note Placeholder hides type T, so it doesn't appear as a - * template - * parameter of Variable. - */ - struct Placeholder { - virtual ~Placeholder() = default; - - virtual void *ptr() const = 0; - - virtual size_t size() const = 0; - - virtual std::type_index type() const = 0; - - virtual void set_type(std::type_index type) = 0; - }; - struct PlaceholderImpl : public Placeholder { PlaceholderImpl(size_t size, std::type_index type) : ptr_(static_cast(memory::Alloc(size)), @@ -283,27 +197,6 @@ class Tensor { std::type_index type_; }; - /*! holds the memory block if allocated. */ - std::shared_ptr holder_; - - /** - * @brief points to elements dimensions. - * - * @note dims_ do not indicate the memory block size. - */ - - DDim dims_; - - /** - * @brief A PlaceHolder may be shared by more than one tensor. - * - * @note Some of them may be slices of the others. So the offset_ - * is introduced here to indicate the byte offset between - * PlaceHolder::ptr_ and where the tensor data really - * begins. - */ - size_t offset_; - #ifdef PADDLE_MOBILE_FPGA public: // NOLINT inline void reset_data_ptr(void *p) { diff --git a/src/framework/tensor_base.h b/src/framework/tensor_base.h new file mode 100644 index 0000000000000000000000000000000000000000..e1539d2e681973b39eeca5b30e2ed35b535be8cb --- /dev/null +++ b/src/framework/tensor_base.h @@ -0,0 +1,142 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include +#include + +#include "common/enforce.h" +#include "common/types.h" +#include "framework/ddim.h" + +namespace paddle_mobile { +namespace framework { + +template +struct SizeOfTypeFunctor; + +template +struct SizeOfTypeFunctor { + size_t operator()(std::type_index type) const { + if (typeid(T).hash_code() == type.hash_code()) { + return sizeof(T); + } else { + return 0UL; + } + } +}; + +template <> +struct SizeOfTypeFunctor<> { + size_t operator()(std::type_index type) const { return 0UL; } +}; + +template +struct SizeOfTypeFunctor { + size_t operator()(std::type_index type) const { + SizeOfTypeFunctor head; + size_t head_size = head(type); + if (head_size != 0) { + return head_size; + } + SizeOfTypeFunctor tail; + return tail(type); + } +}; + +static inline size_t SizeOfType(std::type_index type) { + SizeOfTypeFunctor + functor; + size_t size = functor(type); + + PADDLE_MOBILE_ENFORCE(size != 0UL, "Cannot get size of type %s", type.name()); + return size; +} + +class TensorBase { + public: + virtual inline TensorBase &Resize(const DDim &dims) = 0; + + inline bool IsInitialized() const { return holder_ != nullptr; } + + /*! Return the dimensions of the memory block. */ + inline const DDim &dims() const { return dims_; } + + /*! Return the numel of the memory block. 
*/ + inline int64_t numel() const { return product(dims_); } + + std::type_index type() const { + PADDLE_MOBILE_ENFORCE( + holder_ != nullptr, + "Tensor not initialized yet when Tensor::type() is called.") + return holder_->type(); + } + + // memory size returns the holding memory size in byte. + size_t memory_size() const { + return holder_ == nullptr ? 0UL : holder_->size() - offset_; + } + + inline void check_memory_size() const { + PADDLE_MOBILE_ENFORCE( + holder_ != nullptr, + "Tensor holds no memory. Call Tensor::mutable_data first."); + PADDLE_MOBILE_ENFORCE(numel() * SizeOfType(type()) <= memory_size(), + "Tensor's dims_ is out of bound. "); + } + + protected: + /** + * @note Placeholder hides type T, so it doesn't appear as a + * template + * parameter of Variable. + */ + struct Placeholder { + virtual ~Placeholder() = default; + + virtual void *ptr() const = 0; + + virtual size_t size() const = 0; + + virtual std::type_index type() const = 0; + + virtual void set_type(std::type_index type) = 0; + }; + + /** + * @brief points to elements dimensions. + * + * @note dims_ do not indicate the memory block size. + */ + + DDim dims_; + + /*! holds the memory block if allocated. */ + std::shared_ptr holder_; + + /** + * @brief A PlaceHolder may be shared by more than one tensor. + * + * @note Some of them may be slices of the others. So the offset_ + * is introduced here to indicate the byte offset between + * PlaceHolder::ptr_ and where the tensor data really + * begins. 
+ */ + size_t offset_ = 0; +}; + +} // namespace framework +} // namespace paddle_mobile diff --git a/src/io/api_paddle_mobile.cc b/src/io/api_paddle_mobile.cc index 6a7dff597af7fa5de06c90304136e81390fe06af..8088f0b8c9f600ce2422af500ab66a68e1341fc8 100644 --- a/src/io/api_paddle_mobile.cc +++ b/src/io/api_paddle_mobile.cc @@ -29,7 +29,9 @@ PaddleMobilePredictor::PaddleMobilePredictor( template bool PaddleMobilePredictor::Init(const PaddleMobileConfig &config) { paddle_mobile_.reset(new PaddleMobile()); - +#ifdef PADDLE_MOBILE_CL + paddle_mobile_->SetCLPath(config.cl_path); +#endif if (config.memory_pack.from_memory) { DLOG << "load from memory!"; paddle_mobile_->LoadCombinedMemory(config.memory_pack.model_size, @@ -50,7 +52,6 @@ bool PaddleMobilePredictor::Init(const PaddleMobileConfig &config) { paddle_mobile_->SetThreadNum(config.thread_num); return true; } - template bool PaddleMobilePredictor::Run( const std::vector &inputs, @@ -126,6 +127,8 @@ CreatePaddlePredictor( x.reset(new PaddleMobilePredictor(config)); } else if (config.device == PaddleMobileConfig::kGPU_MALI) { x.reset(new PaddleMobilePredictor(config)); + } else if (config.device == PaddleMobileConfig::kGPU_CL) { + x.reset(new PaddleMobilePredictor(config)); } else { LOG(kLOG_ERROR) << "unsupport device type!"; return nullptr; diff --git a/src/ios_io/PaddleMobileCPU.h b/src/io/ios_io/PaddleMobileCPU.h similarity index 100% rename from src/ios_io/PaddleMobileCPU.h rename to src/io/ios_io/PaddleMobileCPU.h diff --git a/src/ios_io/PaddleMobileCPU.mm b/src/io/ios_io/PaddleMobileCPU.mm similarity index 100% rename from src/ios_io/PaddleMobileCPU.mm rename to src/io/ios_io/PaddleMobileCPU.mm diff --git a/src/jni/PML.java b/src/io/jni/PML.java similarity index 100% rename from src/jni/PML.java rename to src/io/jni/PML.java diff --git a/src/jni/paddle_mobile_jni.cpp b/src/io/jni/paddle_mobile_jni.cpp similarity index 100% rename from src/jni/paddle_mobile_jni.cpp rename to src/io/jni/paddle_mobile_jni.cpp 
diff --git a/src/jni/paddle_mobile_jni.h b/src/io/jni/paddle_mobile_jni.h similarity index 100% rename from src/jni/paddle_mobile_jni.h rename to src/io/jni/paddle_mobile_jni.h diff --git a/src/io/paddle_inference_api.h b/src/io/paddle_inference_api.h index 16756a61bf3265a0b6d7c2ec731d2c3d17bf9c3c..5326f864a4b5238c8498ee1fe9e5810ca0a657cf 100644 --- a/src/io/paddle_inference_api.h +++ b/src/io/paddle_inference_api.h @@ -44,7 +44,7 @@ class PaddleBuf { PaddleBuf(void* data, size_t length) : data_(data), length_(length), memory_owned_{false} {} // Own memory. - PaddleBuf(size_t length) + explicit PaddleBuf(size_t length) : data_(new char[length]), length_(length), memory_owned_(true) {} // Resize to `length` bytes. void Resize(size_t length); @@ -98,7 +98,6 @@ class PaddlePredictor { virtual bool Run(const std::vector& inputs, std::vector* output_data, int batch_size = -1) = 0; - // Destroy the Predictor. virtual ~PaddlePredictor() = default; @@ -121,7 +120,7 @@ struct PaddleModelMemoryPack { struct PaddleMobileConfig : public PaddlePredictor::Config { enum Precision { FP32 = 0 }; - enum Device { kCPU = 0, kFPGA = 1, kGPU_MALI = 2 }; + enum Device { kCPU = 0, kFPGA = 1, kGPU_MALI = 2, kGPU_CL = 3 }; enum Precision precision; enum Device device; @@ -132,6 +131,7 @@ struct PaddleMobileConfig : public PaddlePredictor::Config { int thread_num = 1; std::string prog_file; std::string param_file; + std::string cl_path; struct PaddleModelMemoryPack memory_pack; }; diff --git a/src/io/paddle_mobile.cpp b/src/io/paddle_mobile.cpp index ec1fd1af45319192585f60fa1f90500fa2deaf46..8e4a72dcadf1bc5105e1fc5f9b8c96bfb6d9aa3d 100644 --- a/src/io/paddle_mobile.cpp +++ b/src/io/paddle_mobile.cpp @@ -13,7 +13,12 @@ See the License for the specific language governing permissions and limitations under the License. 
*/ #include "io/paddle_mobile.h" - +#ifdef PADDLE_MOBILE_CL +#include +#include "framework/cl/cl_tensor.h" +#endif +#include "common/common.h" +#include "operators/math/gemm.h" namespace paddle_mobile { template @@ -28,13 +33,13 @@ bool PaddleMobile::Load(const std::string &dirname, bool optimize, bool quantification, int batch_size, bool loddable) { if (loader_.get() == nullptr) { - loader_ = std::make_shared>(); + loader_ = std::make_shared>(); } else { LOG(kLOG_INFO) << "loader inited"; } if (executor_.get() == nullptr) { - executor_ = std::make_shared>( + executor_ = std::make_shared>( loader_->Load(dirname, optimize, quantification), batch_size, optimize, loddable); } else { @@ -50,13 +55,13 @@ bool PaddleMobile::Load(const std::string &model_path, bool quantification, int batch_size, bool loddable) { if (loader_.get() == nullptr) { - loader_ = std::make_shared>(); + loader_ = std::make_shared>(); } else { LOG(kLOG_INFO) << "loader inited"; } if (executor_.get() == nullptr) { - executor_ = std::make_shared>( + executor_ = std::make_shared>( loader_->Load(model_path, para_path, optimize, quantification), batch_size, optimize, loddable); } else { @@ -67,21 +72,22 @@ bool PaddleMobile::Load(const std::string &model_path, } template -bool PaddleMobile::LoadCombinedMemory( - size_t model_len, const uint8_t *model_buf, size_t combined_params_len, - const uint8_t *combined_params_buf) { +bool PaddleMobile::LoadCombinedMemory(size_t model_len, + const uint8_t *model_buf, + size_t combined_params_len, + uint8_t *combined_params_buf) { int batch_size = 1; bool optimise = true; bool quantification = false; if (loader_.get() == nullptr) { - loader_ = std::make_shared>(); + loader_ = std::make_shared>(); } else { LOG(kLOG_INFO) << "loader inited"; } if (executor_.get() == nullptr) { - executor_ = std::make_shared>( + executor_ = std::make_shared>( loader_->LoadCombinedMemory(model_len, model_buf, combined_params_len, combined_params_buf, optimise, quantification), @@ 
-117,6 +123,40 @@ void PaddleMobile::Clear() { loader_ = nullptr; } +template +double PaddleMobile::GetPredictTime() { + int m = 32; + int n = 224 * 224; + int k = 27; + int lda = k; + int ldb = n; + int ldc = n; + float *a = + static_cast(paddle_mobile::memory::Alloc(sizeof(float) * m * k)); + float *b = + static_cast(paddle_mobile::memory::Alloc(sizeof(float) * k * n)); + float *c = + static_cast(paddle_mobile::memory::Alloc(sizeof(float) * m * n)); + int t1 = 1; + int t2 = 1; + for (int i = 0; i < m * k; ++i) { + a[i] = t1 + rand() % t2; + } + for (int i = 0; i < k * n; ++i) { + b[i] = t1 + rand() % t2; + } + paddle_mobile::operators::math::Gemm gemm; + auto time1 = paddle_mobile::time(); + // gemm.Sgemm(m, n, k, static_cast(1), a, lda, b, ldb, + // static_cast(0), c, ldc, false, nullptr); + auto time2 = paddle_mobile::time(); + double cost = paddle_mobile::time_diff(time1, time2); + paddle_mobile::memory::Free(a); + paddle_mobile::memory::Free(b); + paddle_mobile::memory::Free(c); + return cost; +} + template PaddleMobile::~PaddleMobile() { executor_ = nullptr; @@ -157,8 +197,223 @@ void PaddleMobile::Predict_To(int end) { } #endif +#ifdef PADDLE_MOBILE_CL +static std::mutex lc; +template +void PaddleMobile::SetCLPath(std::string path) { + std::lock_guard lock(lc); + if (framework::CLEngine::Instance()->GetCLPath() == "") { + framework::CLEngine::Instance()->setClPath(path); + } +} +template <> +double PaddleMobile::GetPredictTime() { + cl_int status; + cl_uint nPlatform; + clGetPlatformIDs(0, NULL, &nPlatform); + cl_platform_id *listPlatform = + (cl_platform_id *)malloc(nPlatform * sizeof(cl_platform_id)); + clGetPlatformIDs(nPlatform, listPlatform, NULL); + cl_uint nDevice = 0; + clGetDeviceIDs(listPlatform[0], CL_DEVICE_TYPE_GPU, 0, NULL, &nDevice); + cl_device_id *listDevice = + (cl_device_id *)malloc(nDevice * sizeof(cl_device_id)); + clGetDeviceIDs(listPlatform[0], CL_DEVICE_TYPE_GPU, nDevice, listDevice, + NULL); + cl_context context = + 
clCreateContext(NULL, nDevice, listDevice, NULL, NULL, &status); + cl_command_queue queue = + clCreateCommandQueue(context, listDevice[0], 0, &status); + + int n = 1; + int c = 3; + int h = 224; + int w = 224; + float *input = static_cast( + paddle_mobile::memory::Alloc(sizeof(float) * 3 * 224 * 224)); + float *filter = static_cast( + paddle_mobile::memory::Alloc(sizeof(float) * 32 * 27)); + int input_w = w * (c + 3) / 4; + int input_h = n * h; + int filter_w = 3 * (3 + 3) / 4; + int filter_h = 32 * 3; + int output_w = 224 * (32 + 3) / 4; + int output_h = 1 * 224; + + framework::DDim input_dims = {1, 3, 224, 224}; + framework::CLTensor input_cl_tensor(context, queue); + input_cl_tensor.Resize(input_dims); + cl_mem inputBuffer = input_cl_tensor.mutable_with_data(input); + + framework::DDim filter_dims = {32, 3, 3, 3}; + framework::CLTensor filter_cl_tensor(context, queue); + input_cl_tensor.Resize(filter_dims); + cl_mem filterBuffer = filter_cl_tensor.mutable_with_data(filter); + + cl_mem cl_filter_image = NULL; + cl_mem cl_input_image = NULL; + cl_mem cl_output_image = NULL; + cl_image_format cf = {.image_channel_order = CL_RGBA, + .image_channel_data_type = CL_HALF_FLOAT}; + cl_input_image = clCreateImage2D(context, CL_MEM_READ_WRITE | 0, &cf, input_w, + input_h, 0, NULL, &status); + cl_filter_image = clCreateImage2D(context, CL_MEM_READ_WRITE | 0, &cf, + filter_w, filter_h, 0, NULL, &status); + cl_output_image = clCreateImage2D(context, CL_MEM_READ_WRITE | 0, &cf, + output_w, output_h, 0, NULL, &status); + char *code; + std::string path = framework::CLEngine::Instance()->GetCLPath() + + "/cl_kernel/feed_kernel.cl"; + size_t length = readText(path.c_str(), &code); + cl_program program = clCreateProgramWithSource( + context, 1, (const char **)&code, &length, NULL); + std::string path1 = "-cl-fast-relaxed-math -I " + + framework::CLEngine::Instance()->GetCLPath() + + "/cl_kernel"; + clBuildProgram(program, 0, 0, path1.c_str(), NULL, NULL); + cl_kernel kernel = 
clCreateKernel(program, "feed", &status); + + status = clSetKernelArg(kernel, 0, sizeof(cl_mem), &inputBuffer); + CL_CHECK_ERRORS(status); + status = clSetKernelArg(kernel, 1, sizeof(cl_mem), &cl_input_image); + CL_CHECK_ERRORS(status); + status = clSetKernelArg(kernel, 2, sizeof(cl_int), &input_w); + CL_CHECK_ERRORS(status); + status = clSetKernelArg(kernel, 3, sizeof(cl_int), &input_h); + CL_CHECK_ERRORS(status); + status = clSetKernelArg(kernel, 4, sizeof(cl_int), &c); + CL_CHECK_ERRORS(status); + + size_t global_work_size[2] = {input_w, input_h}; + + // cl_event out_event = param.Out()->GetClEvent(); + + status = clEnqueueNDRangeKernel(queue, kernel, 2, NULL, global_work_size, + NULL, 0, NULL, NULL); + CL_CHECK_ERRORS(status); + + status = clSetKernelArg(kernel, 0, sizeof(cl_mem), &filterBuffer); + CL_CHECK_ERRORS(status); + status = clSetKernelArg(kernel, 1, sizeof(cl_mem), &cl_filter_image); + CL_CHECK_ERRORS(status); + status = clSetKernelArg(kernel, 2, sizeof(cl_int), &filter_w); + CL_CHECK_ERRORS(status); + status = clSetKernelArg(kernel, 3, sizeof(cl_int), &filter_h); + CL_CHECK_ERRORS(status); + status = clSetKernelArg(kernel, 4, sizeof(cl_int), &c); + CL_CHECK_ERRORS(status); + + size_t global_work_size1[2] = {filter_w, filter_h}; + + // cl_event out_event = param.Out()->GetClEvent(); + + status = clEnqueueNDRangeKernel(queue, kernel, 2, NULL, global_work_size1, + NULL, 0, NULL, NULL); + CL_CHECK_ERRORS(status); + + clFinish(queue); + queue = clCreateCommandQueue(context, listDevice[0], 0, &status); + + path = framework::CLEngine::Instance()->GetCLPath() + + "/cl_kernel/conv_kernel.cl"; + size_t length1 = readText(path.c_str(), &code); + program = clCreateProgramWithSource(context, 1, (const char **)&code, + &length1, &status); + CL_CHECK_ERRORS(status); + clBuildProgram(program, 0, 0, path1.c_str(), NULL, NULL); + kernel = clCreateKernel(program, "conv_3x3", &status); + CL_CHECK_ERRORS(status); + + int c_block = (32 + 3) / 4; + int nh = n * h; + int 
stride = 1; + int offset = 0; + int input_c = (c + 3) / 4; + int dilation = 1; + int input_width = 224; + int input_height = 224; + int output_width = 224; + int output_height = 224; + status = clSetKernelArg(kernel, 0, sizeof(int), &c_block); + CL_CHECK_ERRORS(status); + status = clSetKernelArg(kernel, 1, sizeof(int), &w); + CL_CHECK_ERRORS(status); + status = clSetKernelArg(kernel, 2, sizeof(int), &nh); + CL_CHECK_ERRORS(status); + status = clSetKernelArg(kernel, 3, sizeof(cl_mem), &cl_input_image); + CL_CHECK_ERRORS(status); + status = clSetKernelArg(kernel, 4, sizeof(cl_mem), &cl_filter_image); + CL_CHECK_ERRORS(status); + status = clSetKernelArg(kernel, 5, sizeof(cl_mem), &cl_output_image); + CL_CHECK_ERRORS(status); + status = clSetKernelArg(kernel, 6, sizeof(int), &stride); + CL_CHECK_ERRORS(status); + status = clSetKernelArg(kernel, 7, sizeof(int), &offset); + CL_CHECK_ERRORS(status); + status = clSetKernelArg(kernel, 8, sizeof(int), &input_c); + CL_CHECK_ERRORS(status); + status = clSetKernelArg(kernel, 9, sizeof(int), &dilation); + CL_CHECK_ERRORS(status); + status = clSetKernelArg(kernel, 10, sizeof(int), &input_width); + CL_CHECK_ERRORS(status); + status = clSetKernelArg(kernel, 11, sizeof(int), &input_height); + CL_CHECK_ERRORS(status); + status = clSetKernelArg(kernel, 12, sizeof(int), &output_width); + CL_CHECK_ERRORS(status); + status = clSetKernelArg(kernel, 13, sizeof(int), &output_height); + CL_CHECK_ERRORS(status); + + // cl_event out_event = param.Output()->GetClEvent(); + // cl_event wait_event = param.Input()->GetClEvent(); + size_t global_work_size2[3] = {8, 224, 224}; + auto time1 = paddle_mobile::time(); + status = clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size2, + NULL, 0, NULL, NULL); + CL_CHECK_ERRORS(status); + clFinish(queue); + auto time2 = paddle_mobile::time(); + paddle_mobile::memory::Free(input); + paddle_mobile::memory::Free(filter); + return paddle_mobile::time_diff(time1, time2); +} +template +int 
PaddleMobile::readText( + const char *kernelPath, + char **pcode) // 读取文本文件放入 pcode,返回字符串长度 +{ + FILE *fp; + int size; + // printf(" File: %s\n", kernelPath); + fp = fopen(kernelPath, "rb"); + if (!fp) { + printf(" Open file failed\n"); + return -1; + } + if (fseek(fp, 0, SEEK_END) != 0) { + printf(" Seek end of file failed\n"); + return -1; + } + if ((size = ftell(fp)) < 0) { + printf(" Get file position failed\n"); + return -1; + } + rewind(fp); + if ((*pcode = (char *)malloc(size + 1)) == NULL) { + printf(" Allocate space failed\n"); + return -1; + } + fread(*pcode, 1, size, fp); + (*pcode)[size] = '\0'; + fclose(fp); + return size + 1; +} + +#endif + template class PaddleMobile; template class PaddleMobile; template class PaddleMobile; +template class PaddleMobile; + } // namespace paddle_mobile diff --git a/src/io/paddle_mobile.h b/src/io/paddle_mobile.h index e0ff51d246b179e3f91e1c94f3b26c5ff9ba3d8f..ab148e7361c160bc658403d4696b806323595c54 100644 --- a/src/io/paddle_mobile.h +++ b/src/io/paddle_mobile.h @@ -22,10 +22,13 @@ limitations under the License. */ #endif // _OPENMP #include "common/types.h" +#include "framework/executor.h" #include "framework/load_ops.h" +#include "framework/loader.h" #include "framework/tensor.h" -#include "io/executor.h" -#include "io/loader.h" +#ifdef PADDLE_MOBILE_CL +#include "framework/cl/cl_engine.h" +#endif namespace paddle_mobile { @@ -34,7 +37,13 @@ class PaddleMobile { typedef typename PrecisionTrait

::ptype Ptype; public: - PaddleMobile() {} + PaddleMobile() { +#ifndef PADDLE_MOBILE_CL + bool is_gpu = std::is_same, Dtype>::value; + PADDLE_MOBILE_ENFORCE(!is_gpu, + "Not Enable GPU in CmakeList but run gpu codes "); +#endif + } bool Load(const std::string &dirname, bool optimize = false, bool quantification = false, int batch_size = 1, bool loddable = false); @@ -52,10 +61,11 @@ class PaddleMobile { bool LoadCombinedMemory(size_t model_len, const uint8_t *model_buf, size_t combined_params_len, - const uint8_t *combined_params_buf); + uint8_t *combined_params_buf); void SetThreadNum(int num); void Clear(); + double GetPredictTime(); ~PaddleMobile(); @@ -68,9 +78,16 @@ class PaddleMobile { void Predict_To(int end); #endif +#ifdef PADDLE_MOBILE_CL + public: + void SetCLPath(std::string cl_path); + int readText(const char *kernelPath, + char **pcode); // 读取文本文件放入 pcode,返回字符串长度 +#endif + private: - std::shared_ptr> loader_; - std::shared_ptr> executor_; + std::shared_ptr> loader_; + std::shared_ptr> executor_; }; } // namespace paddle_mobile diff --git a/src/io/paddle_test_inference_api.cpp b/src/io/paddle_test_inference_api.cpp new file mode 100644 index 0000000000000000000000000000000000000000..97410ff32e31298bfd35abcc7dfc8cef61fe017a --- /dev/null +++ b/src/io/paddle_test_inference_api.cpp @@ -0,0 +1,35 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "io/paddle_test_inference_api.h" +#include "io/paddle_mobile.h" +namespace paddle_mobile { +template +double PaddleTester::CaculatePredictTime(std::string *cl_path) { + PaddleMobile paddle_mobile; +#ifdef PADDLE_MOBILE_CL + if (cl_path) { + paddle_mobile.SetCLPath(*cl_path); + } + +#endif + return paddle_mobile.GetPredictTime(); +} +template class PaddleTester; +template class PaddleTester; +template class PaddleTester; + +template class PaddleTester; + +} // namespace paddle_mobile diff --git a/src/io/paddle_test_inference_api.h b/src/io/paddle_test_inference_api.h new file mode 100644 index 0000000000000000000000000000000000000000..b203bac43d17cafd7655911df5a5116b215413bd --- /dev/null +++ b/src/io/paddle_test_inference_api.h @@ -0,0 +1,32 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +/* + * This file contains the definition of a simple Inference API for Paddle. + * + * ATTENTION: It requires some C++ features, for lower version C++ or C, we + * might release another API. 
+ */ + +#pragma once +#include "common/types.h" +#include "string" +namespace paddle_mobile { +template +class PaddleTester { + public: + double CaculatePredictTime(std::string *cl_path = nullptr); +}; + +} // namespace paddle_mobile diff --git a/src/memory/t_malloc.cpp b/src/memory/t_malloc.cpp index 2bd4c0ac6ba3c7b066cc7ad2439ab6bebb7c3cd9..2fb74d18809f174810866a990396bb0279d256f5 100644 --- a/src/memory/t_malloc.cpp +++ b/src/memory/t_malloc.cpp @@ -16,10 +16,12 @@ limitations under the License. */ #include #include -#ifdef PADDLE_MOBILE_FPGA - -#include "fpga/api.h" +#ifdef PADDLE_MOBILE_FPGA_V1 +#include "fpga/V1/api.h" +#endif +#ifdef PADDLE_MOBILE_FPGA_V2 +#include "fpga/V2/api.h" #endif namespace paddle_mobile { @@ -30,7 +32,7 @@ const int MALLOC_ALIGN = 64; namespace fpga = paddle_mobile::fpga; void Copy(void *dst, const void *src, size_t num) { - std::memcpy(dst, src, num); + fpga::fpga_copy(dst, src, num); } void *Alloc(size_t size) { return fpga::fpga_malloc(size); } diff --git a/src/operators/batchnorm_op.cpp b/src/operators/batchnorm_op.cpp index f820908404ea637d9680c32d5c4b5568e191dd7e..89220dd2489c93a84bc8a141c06a151b8044a4e4 100644 --- a/src/operators/batchnorm_op.cpp +++ b/src/operators/batchnorm_op.cpp @@ -14,7 +14,7 @@ limitations under the License. 
*/ #ifdef BATCHNORM_OP -#include "batchnorm_op.h" +#include "operators/batchnorm_op.h" #include "framework/op_proto_maker.h" #include "framework/op_registry.h" @@ -40,4 +40,8 @@ REGISTER_OPERATOR_MALI_GPU(batch_norm, ops::BatchNormOp); #ifdef PADDLE_MOBILE_FPGA #endif +#ifdef PADDLE_MOBILE_CL +REGISTER_OPERATOR_CL(batch_norm, ops::BatchNormOp); +#endif + #endif diff --git a/src/operators/bilinear_interp_op.h b/src/operators/bilinear_interp_op.h index 1b17406c546d336fd42b0a818d16627c87aedb09..2bb61d129d5ba45900f1c67b8c202e958a004bb7 100644 --- a/src/operators/bilinear_interp_op.h +++ b/src/operators/bilinear_interp_op.h @@ -40,10 +40,6 @@ class BilinearOp : public framework::OperatorWithKernel< DeviceType, BilinearInterpParam, operators::BilinearInterpKernel>( type, inputs, outputs, attrs, scope) {} - - using framework::OperatorWithKernel< - DeviceType, BilinearInterpParam, - operators::BilinearInterpKernel>::OperatorWithKernel; void InferShape() const override; }; diff --git a/src/operators/box_coder_op.h b/src/operators/box_coder_op.h index c06ca8265dd495acb79e4e2ec6c497941b822b21..3a3048c6624996892333a71773c33ee2f6e18e0a 100644 --- a/src/operators/box_coder_op.h +++ b/src/operators/box_coder_op.h @@ -39,10 +39,6 @@ class BoxCoderOp : public framework::OperatorWithKernel< : framework::OperatorWithKernel, operators::BoxCoderKernel>( type, inputs, outputs, attrs, scope) {} - - using framework::OperatorWithKernel< - DeviceType, BoxCoderParam, - operators::BoxCoderKernel>::OperatorWithKernel; void InferShape() const override; protected: diff --git a/src/operators/concat_op.h b/src/operators/concat_op.h index eb257d47228ab854c00574a001f6454e239cfbbd..a01e066edd1082bc109ba7eb0f31a2ac42ab865a 100644 --- a/src/operators/concat_op.h +++ b/src/operators/concat_op.h @@ -34,10 +34,6 @@ class ConcatOp : public framework::OperatorWithKernel< : framework::OperatorWithKernel, operators::ConcatKernel>( type, inputs, outputs, attrs, scope) {} - - using 
framework::OperatorWithKernel< - DeviceType, ConcatParam, - operators::ConcatKernel>::OperatorWithKernel; void InferShape() const override; protected: diff --git a/src/operators/conv_op.cpp b/src/operators/conv_op.cpp index c4601995219b32db75f22c7c2ed959e18af85f36..2c70f42f56530c2d21252d6b51c228e7c49ca8bf 100644 --- a/src/operators/conv_op.cpp +++ b/src/operators/conv_op.cpp @@ -62,4 +62,8 @@ REGISTER_OPERATOR_MALI_GPU(conv2d, ops::ConvOp); REGISTER_OPERATOR_FPGA(conv2d, ops::ConvOp); #endif +#ifdef PADDLE_MOBILE_CL +REGISTER_OPERATOR_CL(conv2d, ops::ConvOp); +#endif + #endif diff --git a/src/operators/conv_op.h b/src/operators/conv_op.h index 23c022e584f9be6cb0b4c2c416ca96e61b3c131f..1b8bd70805ccff8946c1ab12a207618849fc9ca4 100644 --- a/src/operators/conv_op.h +++ b/src/operators/conv_op.h @@ -34,10 +34,6 @@ class ConvOp : public framework::OperatorWithKernel< : framework::OperatorWithKernel, operators::ConvKernel>( type, inputs, outputs, attrs, scope) {} - - using framework::OperatorWithKernel< - DeviceType, ConvParam, - operators::ConvKernel>::OperatorWithKernel; void InferShape() const override; private: diff --git a/src/operators/conv_transpose_op.cpp b/src/operators/conv_transpose_op.cpp index 4d9eefaa85be51c9c2409ca044a6da4874566e1c..d09a7937453f3bd2c20d9e6bc1a03d4375d57491 100644 --- a/src/operators/conv_transpose_op.cpp +++ b/src/operators/conv_transpose_op.cpp @@ -27,6 +27,7 @@ REGISTER_OPERATOR_CPU(conv2d_transpose, ops::ConvOpTranspose); #ifdef PADDLE_MOBILE_MALI_GPU #endif #ifdef PADDLE_MOBILE_FPGA +REGISTER_OPERATOR_FPGA(conv2d_transpose, ops::ConvOpTranspose); #endif #endif diff --git a/src/operators/crf_op.h b/src/operators/crf_op.h index 9b7487ee958467dac451c3bcb743e6122842c7f1..dca481bb2dd08dc65fb94e41d0573277c9b143c7 100644 --- a/src/operators/crf_op.h +++ b/src/operators/crf_op.h @@ -37,10 +37,6 @@ class CrfOp : public framework::OperatorWithKernel< : framework::OperatorWithKernel, operators::CrfKernel>( type, inputs, outputs, attrs, scope) {} - 
- using framework::OperatorWithKernel< - DeviceType, CrfParam, - operators::CrfKernel>::OperatorWithKernel; void InferShape() const override; }; diff --git a/src/operators/depthwise_conv_op.h b/src/operators/depthwise_conv_op.h index 845c59a19e613bfcf299b445b778eff4d99c7295..102d65670d3e50acd15745e95b85d7b843994ed7 100644 --- a/src/operators/depthwise_conv_op.h +++ b/src/operators/depthwise_conv_op.h @@ -36,10 +36,6 @@ class DepthwiseConvOp : public framework::OperatorWithKernel< DeviceType, ConvParam, operators::DepthwiseConvKernel>( type, inputs, outputs, attrs, scope) {} - - using framework::OperatorWithKernel< - DeviceType, ConvParam, - operators::DepthwiseConvKernel>::OperatorWithKernel; void InferShape() const override; private: diff --git a/src/operators/dropout_op.h b/src/operators/dropout_op.h index 65f3587c2336b3e581a30328c41ad397b2848b34..ce8acd5966439808f7a03f18cf3d29a1b5c0487e 100644 --- a/src/operators/dropout_op.h +++ b/src/operators/dropout_op.h @@ -38,10 +38,6 @@ class DropoutOp : public framework::OperatorWithKernel< : framework::OperatorWithKernel, operators::DropoutKernel>( type, inputs, outputs, attrs, scope) {} - - // using framework::OperatorWithKernel, - // operators::DropoutKernel>; void InferShape() const override; protected: diff --git a/src/operators/elementwise_add_op.cpp b/src/operators/elementwise_add_op.cpp index 93e447d51f0e9ce2fdf75c60332ad52950d68c3d..281cd3d5084a1a15502e1e06865e1024d3b2b639 100644 --- a/src/operators/elementwise_add_op.cpp +++ b/src/operators/elementwise_add_op.cpp @@ -14,7 +14,7 @@ limitations under the License. 
*/ #ifdef ELEMENTWISEADD_OP -#include "elementwise_add_op.h" +#include "operators/elementwise_add_op.h" namespace paddle_mobile { namespace operators { @@ -36,4 +36,12 @@ REGISTER_OPERATOR_CPU(elementwise_add, ops::ElementwiseAddOp); REGISTER_OPERATOR_MALI_GPU(elementwise_add, ops::ElementwiseAddOp); #endif +#ifdef PADDLE_MOBILE_CL +REGISTER_OPERATOR_CL(elementwise_add, ops::ElementwiseAddOp); +#endif + +#ifdef PADDLE_MOBILE_FPGA +REGISTER_OPERATOR_FPGA(elementwise_add, ops::ElementwiseAddOp); +#endif + #endif diff --git a/src/operators/elementwise_add_op.h b/src/operators/elementwise_add_op.h index a1360eba5480a46395cedb445a4df4e4ca0ab279..a853b40ff7ccf323911f2ea1bf6e23d67d111db2 100644 --- a/src/operators/elementwise_add_op.h +++ b/src/operators/elementwise_add_op.h @@ -37,10 +37,6 @@ class ElementwiseAddOp : public framework::OperatorWithKernel< DeviceType, ElementwiseAddParam, operators::ElementwiseAddKernel>( type, inputs, outputs, attrs, scope) {} - - using framework::OperatorWithKernel< - DeviceType, ElementwiseAddParam, - operators::ElementwiseAddKernel>::OperatorWithKernel; void InferShape() const override; protected: diff --git a/src/operators/feed_op.cpp b/src/operators/feed_op.cpp index 41f9e687bb4024d245a89df3dc785e1254b5a9a7..c3211b9fa9cc4b973788af4104c7ebe7bea2f54f 100644 --- a/src/operators/feed_op.cpp +++ b/src/operators/feed_op.cpp @@ -14,6 +14,19 @@ limitations under the License. 
*/ #include "operators/feed_op.h" +namespace paddle_mobile { +namespace operators { + +template +void FeedOp::InferShape() const { + auto out_dims = this->param_.Out()->dims(); + out_dims[0] = this->param_.BatchSize(); + this->param_.Out()->Resize(out_dims); +} + +} // namespace operators +} // namespace paddle_mobile + namespace ops = paddle_mobile::operators; #ifdef PADDLE_MOBILE_CPU @@ -25,3 +38,6 @@ REGISTER_OPERATOR_MALI_GPU(feed, ops::FeedOp); #ifdef PADDLE_MOBILE_FPGA REGISTER_OPERATOR_FPGA(feed, ops::FeedOp); #endif +#ifdef PADDLE_MOBILE_CL +REGISTER_OPERATOR_CL(feed, ops::FeedOp); +#endif diff --git a/src/operators/feed_op.h b/src/operators/feed_op.h index c7e77fcca40a3c533e442d10604c8cd9bcc1e74b..57932474184fd5431e5b6ac5756ab28faa2b1b9e 100644 --- a/src/operators/feed_op.h +++ b/src/operators/feed_op.h @@ -16,68 +16,29 @@ limitations under the License. */ #include #include "framework/operator.h" +#include "operators/kernel/feed_kernel.h" #include "operators/op_param.h" namespace paddle_mobile { namespace operators { +using std::string; + template -class FeedOp : public framework::OperatorBase { +class FeedOp + : public framework::OperatorWithKernel, + FeedKernel> { public: FeedOp(const std::string &type, const VariableNameMap &inputs, const VariableNameMap &outputs, const framework::AttributeMap attrs, std::shared_ptr scope) - : framework::OperatorBase(type, inputs, outputs, attrs, - scope), - param_(inputs, outputs, attrs, scope.get()) {} - - void InferShape() const { - auto out_dims = param_.Out()->dims(); - out_dims[0] = param_.BatchSize(); - param_.Out()->Resize(out_dims); - } - -#ifdef PADDLE_MOBILE_FPGA - - void Init() { - Tensor *output = param_.Out(); - fpga::format_fp16_ofm(output); - } - - void RunImpl() const { - auto input = (Tensor *)const_cast(param_.InputX()); // NOLINT - fpga::format_image(input); - auto input_ptr = input->data(); - Tensor *output = param_.Out(); - auto output_ptr = output->data(); - - fpga::BypassArgs args = 
{fpga::DATA_TYPE_FP32}; - - args.input_data_type = fpga::DATA_TYPE_FP32; - args.output_data_type = fpga::DATA_TYPE_FP16; - args.input_layout_type = fpga::LAYOUT_CHW; - args.output_layout_type = fpga::LAYOUT_HWC; - args.image.address = (void *)input_ptr; // NOLINT - args.image.channels = (uint32_t)input->dims()[1]; - args.image.height = (uint32_t)input->dims()[2]; - args.image.width = (uint32_t)input->dims()[3]; - args.image.pad_height = 0; - args.image.pad_width = 0; - args.output.address = output_ptr; - args.output.scale_address = output->scale; - fpga::PerformBypass(args); - } -#else - void Init() {} - void RunImpl() const { - param_.Out()->ShareDataWith(*param_.InputX()); - param_.Out()->set_lod(param_.InputX()->lod()); - } -#endif + : framework::OperatorWithKernel, + FeedKernel>( + type, inputs, outputs, attrs, scope) {} + void InferShape() const override; protected: - FeedParam param_; }; } // namespace operators diff --git a/src/operators/fetch_op.cpp b/src/operators/fetch_op.cpp index 6c5d1341db12db5e602bad08aaa33f26b2ac3396..50e53c30cfd06a8fae8c9e18dd4aa985a056a13e 100644 --- a/src/operators/fetch_op.cpp +++ b/src/operators/fetch_op.cpp @@ -13,6 +13,17 @@ See the License for the specific language governing permissions and limitations under the License. 
*/ #include "operators/fetch_op.h" +namespace paddle_mobile { +namespace operators { + +template +void FetchOp::InferShape() const { + auto x_dims = this->param_.InputX()->dims(); + this->param_.Out()->Resize(x_dims); +} + +} // namespace operators +} // namespace paddle_mobile namespace ops = paddle_mobile::operators; #ifdef PADDLE_MOBILE_CPU @@ -24,3 +35,6 @@ REGISTER_OPERATOR_MALI_GPU(fetch, ops::FetchOp); #ifdef PADDLE_MOBILE_FPGA REGISTER_OPERATOR_FPGA(fetch, ops::FetchOp); #endif +#ifdef PADDLE_MOBILE_CL +REGISTER_OPERATOR_CL(fetch, ops::FetchOp); +#endif diff --git a/src/operators/fetch_op.h b/src/operators/fetch_op.h index 9fbfc2f417b52162950612beb2979fe640cbdcc4..f92c66a05f121b3f6b78c244dd01d81393fa5c68 100644 --- a/src/operators/fetch_op.h +++ b/src/operators/fetch_op.h @@ -16,6 +16,7 @@ limitations under the License. */ #include #include "framework/operator.h" +#include "operators/kernel/fetch_kernel.h" #include "operators/op_param.h" namespace paddle_mobile { @@ -23,25 +24,20 @@ namespace operators { using std::string; template -class FetchOp : public framework::OperatorBase { +class FetchOp + : public framework::OperatorWithKernel, + FetchKernel> { public: FetchOp(const string &type, const VariableNameMap &inputs, const VariableNameMap &outputs, const framework::AttributeMap attrs, std::shared_ptr scope) - : framework::OperatorBase(type, inputs, outputs, attrs, - scope), - param_(inputs, outputs, attrs, *scope) {} - void RunImpl() const { param_.Out()->ShareDataWith(*param_.InputX()); } + : framework::OperatorWithKernel, + FetchKernel>( + type, inputs, outputs, attrs, scope) {} - void Init() {} - - void InferShape() const { - auto x_dims = param_.InputX()->dims(); - param_.Out()->Resize(x_dims); - } + void InferShape() const override; protected: - FetchParam param_; }; } // namespace operators diff --git a/src/operators/fill_constant_op.cpp b/src/operators/fill_constant_op.cpp index 
6d7c4f44f1b769c47d6f741d139118158292a40f..0c13c57ceb53933c750f8c1adaa8b4e24ff948c8 100644 --- a/src/operators/fill_constant_op.cpp +++ b/src/operators/fill_constant_op.cpp @@ -20,9 +20,6 @@ namespace ops = paddle_mobile::operators; #ifdef PADDLE_MOBILE_CPU REGISTER_OPERATOR_CPU(fill_constant, ops::FillConstantOp); #endif -#ifdef PADDLE_MOBILE_MALI_GPU -REGISTER_OPERATOR_MALI_GPU(fill_constant, ops::FillConstantOp); -#endif #ifdef PADDLE_MOBILE_FPGA REGISTER_OPERATOR_FPGA(fill_constant, ops::FillConstantOp); #endif diff --git a/src/operators/fill_constant_op.h b/src/operators/fill_constant_op.h index 78eb162efc8ccd42b9fba363d49d1dbc4052f6b2..e24cecd363630a845f147e2e429b973dad24f63d 100644 --- a/src/operators/fill_constant_op.h +++ b/src/operators/fill_constant_op.h @@ -37,7 +37,7 @@ class FillConstantOp : public framework::OperatorBase { : framework::OperatorBase(type, inputs, outputs, attrs, scope), param_(inputs, outputs, attrs, *scope) {} - void RunImpl() const { + void RunImpl() { auto data_type = static_cast<_PaddleMobile__Framework__Proto__VarType__Type>( param_.DataDtype()); diff --git a/src/operators/flatten_op.h b/src/operators/flatten_op.h index e935ae308cf5c28b9c435086b2b5e4d4407c319a..a7a91e60701cf559cb35238aa2966c02c869e844 100644 --- a/src/operators/flatten_op.h +++ b/src/operators/flatten_op.h @@ -17,6 +17,7 @@ limitations under the License. 
*/ #pragma once #include +#include #include "framework/operator.h" #include "operators/kernel/flatten_kernel.h" @@ -53,10 +54,6 @@ class FlattenOp : public framework::OperatorWithKernel< : framework::OperatorWithKernel, operators::FlattenKernel>( type, inputs, outputs, attrs, scope) {} - - using framework::OperatorWithKernel< - DeviceType, FlattenParam, - operators::FlattenKernel>::OperatorWithKernel; void InferShape() const override; }; diff --git a/src/operators/fusion_conv_add_add_prelu_op.h b/src/operators/fusion_conv_add_add_prelu_op.h index 7893ff95a671447adbeebeeaf4096235e7a37964..4ec76b500812f95eb64e27564d0e63b2c1b2c2d3 100644 --- a/src/operators/fusion_conv_add_add_prelu_op.h +++ b/src/operators/fusion_conv_add_add_prelu_op.h @@ -17,6 +17,7 @@ limitations under the License. */ #pragma once #include +#include #include #include "framework/operator.h" #include "framework/program/program-optimize/fusion_op_register.h" @@ -67,10 +68,6 @@ class FusionConvAddAddPReluOp DeviceType, FusionConvAddAddPReluParam, operators::ConvAddAddPReluKernel>( type, inputs, outputs, attrs, scope) {} - - using framework::OperatorWithKernel< - DeviceType, FusionConvAddAddPReluParam, - operators::ConvAddAddPReluKernel>::OperatorWithKernel; void InferShape() const override; protected: diff --git a/src/operators/fusion_conv_add_bn_relu_op.cpp b/src/operators/fusion_conv_add_bn_relu_op.cpp index e7d6ee59f2dadbdca0af72af1e786f0430c58d63..b9bc948fe0e77741a36f959e29eb2a4c82e82b72 100644 --- a/src/operators/fusion_conv_add_bn_relu_op.cpp +++ b/src/operators/fusion_conv_add_bn_relu_op.cpp @@ -58,5 +58,7 @@ REGISTER_OPERATOR_CPU(fusion_conv_add_bn_relu, ops::FusionConvAddBNReluOp); #ifdef PADDLE_MOBILE_FPGA REGISTER_OPERATOR_FPGA(fusion_conv_add_bn_relu, ops::FusionConvAddBNReluOp); #endif - +#ifdef PADDLE_MOBILE_CL +REGISTER_OPERATOR_CL(fusion_conv_add_bn_relu, ops::FusionConvAddBNReluOp); +#endif #endif diff --git a/src/operators/fusion_conv_add_bn_relu_op.h 
b/src/operators/fusion_conv_add_bn_relu_op.h index 07bb0146b3f481e09d0a944c4791237e7eea08e4..6ecc9bdc4a90530221c70651c52457874e3eaaa8 100644 --- a/src/operators/fusion_conv_add_bn_relu_op.h +++ b/src/operators/fusion_conv_add_bn_relu_op.h @@ -20,8 +20,8 @@ limitations under the License. */ #include #include "framework/operator.h" #include "framework/program/program-optimize/fusion_op_register.h" -#include "op_param.h" #include "operators/kernel/conv_add_bn_relu_kernel.h" +#include "operators/op_param.h" namespace paddle_mobile { namespace operators { @@ -66,10 +66,6 @@ class FusionConvAddBNReluOp DeviceType, FusionConvAddBNReluParam, operators::ConvAddBNReluKernel>( type, inputs, outputs, attrs, scope) {} - - using framework::OperatorWithKernel< - DeviceType, FusionConvAddBNReluParam, - operators::ConvAddBNReluKernel>::OperatorWithKernel; void InferShape() const override; protected: diff --git a/src/operators/fusion_conv_add_op.cpp b/src/operators/fusion_conv_add_op.cpp index 485ba1be9baee2034dbd5c47f64372b701026e44..1b32ec39b65f8b16fd8967be3f45f4b31db5ca16 100644 --- a/src/operators/fusion_conv_add_op.cpp +++ b/src/operators/fusion_conv_add_op.cpp @@ -58,4 +58,8 @@ REGISTER_OPERATOR_CPU(fusion_conv_add, ops::FusionConvAddOp); REGISTER_OPERATOR_MALI_GPU(fusion_conv_add, ops::FusionConvAddOp); #endif +#ifdef PADDLE_MOBILE_CL +REGISTER_OPERATOR_CL(fusion_conv_add, ops::FusionConvAddOp); +#endif + #endif diff --git a/src/operators/fusion_conv_add_op.h b/src/operators/fusion_conv_add_op.h index 365e3afa97c2c2fd82c629302f8a5fddf8abb406..eef143ce8716ce856784bb01dd3d58a26746b4e8 100644 --- a/src/operators/fusion_conv_add_op.h +++ b/src/operators/fusion_conv_add_op.h @@ -19,8 +19,8 @@ limitations under the License. 
*/ #include #include "framework/operator.h" #include "framework/program/program-optimize/fusion_op_register.h" -#include "op_param.h" #include "operators/kernel/conv_add_kernel.h" +#include "operators/op_param.h" namespace paddle_mobile { namespace operators { @@ -56,10 +56,6 @@ class FusionConvAddOp : public framework::OperatorWithKernel< FusionConvAddParam, operators::ConvAddKernel>( type, inputs, outputs, attrs, scope) {} - - using framework::OperatorWithKernel< - DeviceType, FusionConvAddParam, - operators::ConvAddKernel>::OperatorWithKernel; void InferShape() const override; protected: diff --git a/src/operators/fusion_conv_add_prelu_op.h b/src/operators/fusion_conv_add_prelu_op.h index 0b0763e781daf3d882d0463205b07fdef53b90f5..fc1143099e16b8b7f7c44d7fe5a5694a278a1906 100644 --- a/src/operators/fusion_conv_add_prelu_op.h +++ b/src/operators/fusion_conv_add_prelu_op.h @@ -39,10 +39,7 @@ class FusionConvAddPReluOpMatcher : public framework::FusionOpMatcher { std::vector> *removed_nodes) { node->Folder(node_.Depth(), Type(), {{G_OP_TYPE_ELEMENTWISE_ADD, {{"Y", "Y"}}}, - {G_OP_TYPE_PRELU, {{"Alpha", "Alpha"}}} - - }, - + {G_OP_TYPE_PRELU, {{"Alpha", "Alpha"}}}}, removed_nodes); } std::string Type() { return G_OP_TYPE_FUSION_CONV_ADD_PRELU; } @@ -63,9 +60,6 @@ class FusionConvAddPReluOp operators::ConvAddPReluKernel>(type, inputs, outputs, attrs, scope) {} - using framework::OperatorWithKernel< - DeviceType, FusionConvAddPReluParam, - operators::ConvAddPReluKernel>::OperatorWithKernel; void InferShape() const override; protected: diff --git a/src/operators/fusion_conv_add_relu_op.cpp b/src/operators/fusion_conv_add_relu_op.cpp index 486221f0f6b2e1b0d78d2632c8d735a6a6a101bb..bb4b6666a881de0989d43840806b9d5d720b3b66 100644 --- a/src/operators/fusion_conv_add_relu_op.cpp +++ b/src/operators/fusion_conv_add_relu_op.cpp @@ -56,5 +56,7 @@ REGISTER_OPERATOR_CPU(fusion_conv_add_relu, ops::FusionConvAddReluOp); #ifdef PADDLE_MOBILE_FPGA 
REGISTER_OPERATOR_FPGA(fusion_conv_add_relu, ops::FusionConvAddReluOp); #endif - +#ifdef PADDLE_MOBILE_CL +REGISTER_OPERATOR_CL(fusion_conv_add_relu, ops::FusionConvAddReluOp); +#endif #endif diff --git a/src/operators/fusion_conv_add_relu_op.h b/src/operators/fusion_conv_add_relu_op.h index 1335ce7b6ca5151e3d396856055f38825710f4b1..22ba67c617ecdb0f3be2f5757504b6ba530b092c 100644 --- a/src/operators/fusion_conv_add_relu_op.h +++ b/src/operators/fusion_conv_add_relu_op.h @@ -29,9 +29,8 @@ namespace operators { class FusionConvAddReluOpMatcher : public framework::FusionOpMatcher { public: FusionConvAddReluOpMatcher() { - node_ = framework::Node(G_OP_TYPE_CONV); - node_ > std::make_shared(G_OP_TYPE_ELEMENTWISE_ADD) > - std::make_shared(G_OP_TYPE_RELU); + // node_ = framework::Node(G_OP_TYPE_FUSION_CONV_ADD); + // node_ > std::make_shared(G_OP_TYPE_RELU); } void FolderNodes( @@ -57,9 +56,6 @@ class FusionConvAddReluOp : public framework::OperatorWithKernel< operators::ConvAddReluKernel>(type, inputs, outputs, attrs, scope) {} - using framework::OperatorWithKernel< - DeviceType, FusionConvAddReluParam, - operators::ConvAddReluKernel>::OperatorWithKernel; void InferShape() const override; protected: diff --git a/src/operators/fusion_conv_bn_add_relu_op.h b/src/operators/fusion_conv_bn_add_relu_op.h index b2f911363acc4f9d5b3c4407317107efadf3996d..303668a89bf7869e72a4b546c5d96be24b26c4ec 100644 --- a/src/operators/fusion_conv_bn_add_relu_op.h +++ b/src/operators/fusion_conv_bn_add_relu_op.h @@ -17,11 +17,12 @@ limitations under the License. 
*/ #pragma once #include +#include #include #include "framework/operator.h" #include "framework/program/program-optimize/fusion_op_register.h" -#include "op_param.h" #include "operators/kernel/conv_bn_add_relu_kernel.h" +#include "operators/op_param.h" namespace paddle_mobile { namespace operators { @@ -71,10 +72,6 @@ class FusionConvBNAddReluOp DeviceType, FusionConvBNAddReluParam, operators::ConvBNAddReluKernel>( type, inputs, outputs, attrs, scope) {} - - using framework::OperatorWithKernel< - DeviceType, FusionConvBNAddReluParam, - operators::ConvBNAddReluKernel>::OperatorWithKernel; void InferShape() const override; protected: diff --git a/src/operators/fusion_conv_bn_relu_op.h b/src/operators/fusion_conv_bn_relu_op.h index a6bbe72500ccfe2b43e21496c5abc18b9a562d47..9bc534fe333c76e8f533c904560b8228760c66e5 100644 --- a/src/operators/fusion_conv_bn_relu_op.h +++ b/src/operators/fusion_conv_bn_relu_op.h @@ -63,10 +63,6 @@ class FusionConvBNReluOp : public framework::OperatorWithKernel< DeviceType, FusionConvBNReluParam, operators::ConvBNReluKernel>(type, inputs, outputs, attrs, scope) {} - - using framework::OperatorWithKernel< - DeviceType, FusionConvBNReluParam, - operators::ConvBNReluKernel>::OperatorWithKernel; void InferShape() const override; protected: diff --git a/src/operators/fusion_deconv_relu_op.cpp b/src/operators/fusion_deconv_relu_op.cpp new file mode 100644 index 0000000000000000000000000000000000000000..daae39c951b34fa05962f936c28381f7d5d4e15c --- /dev/null +++ b/src/operators/fusion_deconv_relu_op.cpp @@ -0,0 +1,32 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#ifdef FUSION_DECONVRELU_OP + +#include "operators/fusion_deconv_relu_op.h" + +namespace paddle_mobile { +namespace operators {} +} // namespace paddle_mobile + +namespace ops = paddle_mobile::operators; +#ifdef PADDLE_MOBILE_CPU +#endif +#ifdef PADDLE_MOBILE_MALI_GPU +#endif +#ifdef PADDLE_MOBILE_FPGA +REGISTER_OPERATOR_FPGA(fusion_deconv_relu, ops::FusionDeconvReluOp); +#endif + +#endif diff --git a/src/operators/fusion_deconv_relu_op.h b/src/operators/fusion_deconv_relu_op.h new file mode 100644 index 0000000000000000000000000000000000000000..e87d5d3798930d745b82c8e5a3cca793c12ee4b1 --- /dev/null +++ b/src/operators/fusion_deconv_relu_op.h @@ -0,0 +1,107 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ +#ifdef FUSION_DECONVRELU_OP +#pragma once +#include +#include + +#include "framework/operator.h" +#include "framework/program/program-optimize/fusion_op_register.h" +#include "operators/kernel/deconv_relu_kernel.h" + +namespace paddle_mobile { +namespace operators { +using std::string; +using std::vector; +class FusionDeconvReluMatcher : public framework::FusionOpMatcher { + public: + FusionDeconvReluMatcher() { + node_ = framework::Node(G_OP_TYPE_CONV_TRANSPOSE); + node_ > std::make_shared(G_OP_TYPE_RELU); + } + + void FolderNodes( + framework::Node *node, + std::vector> *removed_nodes) { + node->Folder(node_.Depth(), Type(), {}, removed_nodes); + } + + std::string Type() { return G_OP_TYPE_FUSION_FC_RELU; } +}; + +template +class FusionDeconvReluOp : public framework::OperatorWithKernel< + DeviceType, FusionDeconvReluParam, + operators::DeconvReluKernel> { + public: + FusionDeconvReluOp(const string &type, const VariableNameMap &inputs, + const VariableNameMap &outputs, + const framework::AttributeMap &attrs, + std::shared_ptr scope) + : framework::OperatorWithKernel< + DeviceType, FusionDeconvReluParam, + operators::DeconvReluKernel>(type, inputs, outputs, + attrs, scope) {} + + void InferShape() const { + auto input = this->param_.Input(); + auto in_dims = input->dims(); + + auto filter = this->param_.Filter(); + auto filter_dims = filter->dims(); + + std::vector strides = this->param_.Strides(); + std::vector paddings = this->param_.Paddings(); + std::vector dilations = this->param_.Dilations(); + + int groups = this->param_.Groups(); + + PADDLE_MOBILE_ENFORCE( + in_dims.size() == 4 || in_dims.size() == 5, + "ConvTransposeOp intput should be 4-D or 5-D tensor."); + PADDLE_MOBILE_ENFORCE( + in_dims.size() == filter_dims.size(), + "ConvTransposeOp input dimension and filter dimension " + "should be the same."); + PADDLE_MOBILE_ENFORCE( + in_dims.size() - strides.size() == 2U, + "ConvTransposeOp input dimension and strides dimension should " + "be 
consistent."); + PADDLE_MOBILE_ENFORCE(paddings.size() == strides.size(), + "ConvTransposeOp paddings dimension and strides " + "dimension should be the same."); + PADDLE_MOBILE_ENFORCE(paddings.size() == dilations.size(), + "ConvTransposeOp paddings dimension and dilations " + "dimension should be the same."); + PADDLE_MOBILE_ENFORCE( + in_dims[1] == filter_dims[0], + "In ConvTransposeOp, The number of input channels should " + "be equal to the number of filter's channels."); + + std::vector output_shape({in_dims[0], filter_dims[1] * groups}); + for (size_t i = 0; i < strides.size(); ++i) { + auto filter_extent = dilations[i] * (filter_dims[i + 2] - 1) + 1; + output_shape.push_back((in_dims[i + 2] - 1) * strides[i] - + 2 * paddings[i] + filter_extent); + } + this->param_.Output()->Resize(framework::make_ddim(output_shape)); + } + + protected: +}; + +} // namespace operators +} // namespace paddle_mobile + +#endif // FUSION_FC_RELU_OP diff --git a/src/operators/fusion_dwconv_bn_relu_op.h b/src/operators/fusion_dwconv_bn_relu_op.h index 44a1f845bc9b2dc0251fb729de9f9c00071fd492..d7a74d896e904971e21c28fab29771b34a049921 100644 --- a/src/operators/fusion_dwconv_bn_relu_op.h +++ b/src/operators/fusion_dwconv_bn_relu_op.h @@ -20,8 +20,8 @@ limitations under the License. 
*/ #include #include "framework/operator.h" #include "framework/program/program-optimize/fusion_op_register.h" -#include "op_param.h" #include "operators/kernel/dwconv_bn_relu_kernel.h" +#include "operators/op_param.h" namespace paddle_mobile { namespace operators { @@ -65,9 +65,6 @@ class FusionDWConvBNReluOp operators::DWConvBNReluKernel>(type, inputs, outputs, attrs, scope) {} - using framework::OperatorWithKernel< - DeviceType, FusionDWConvBNReluParam, - operators::DWConvBNReluKernel>::OperatorWithKernel; void InferShape() const override; protected: diff --git a/src/operators/fusion_fc_op.h b/src/operators/fusion_fc_op.h index 722c5225bc035df2761154a08a521a09b34a1e82..26cb40aac8e47203f125417e1f6b5df75d7835b5 100644 --- a/src/operators/fusion_fc_op.h +++ b/src/operators/fusion_fc_op.h @@ -56,10 +56,6 @@ class FusionFcOp : public framework::OperatorWithKernel< operators::FusionFcKernel>( type, inputs, outputs, attrs, scope) {} - using framework::OperatorWithKernel< - DeviceType, FusionFcParam, - operators::FusionFcKernel>::OperatorWithKernel; - void InferShape() const override; }; diff --git a/src/operators/fusion_fc_relu_op.h b/src/operators/fusion_fc_relu_op.h index 5cd884f04e819ac881c3b2a4ad666591ea610117..7324f94138e59c4a4a93fe2658b38ddbdf6fa651 100644 --- a/src/operators/fusion_fc_relu_op.h +++ b/src/operators/fusion_fc_relu_op.h @@ -56,9 +56,6 @@ class FusionFcReluOp : public framework::OperatorWithKernel< operators::FusionFcReluKernel>(type, inputs, outputs, attrs, scope) {} - using framework::OperatorWithKernel< - DeviceType, FusionFcReluParam, - operators::FusionFcReluKernel>::OperatorWithKernel; void InferShape() const override; protected: diff --git a/src/operators/gru_op.h b/src/operators/gru_op.h index a45d3efe5b4c59f8582c534f85de7cc1ac82df85..5e66b497af15c498e2af5ff5903ef88a16db1832 100644 --- a/src/operators/gru_op.h +++ b/src/operators/gru_op.h @@ -37,10 +37,6 @@ class GruOp : public framework::OperatorWithKernel< : framework::OperatorWithKernel, 
operators::GruKernel>( type, inputs, outputs, attrs, scope) {} - - using framework::OperatorWithKernel< - DeviceType, GruParam, - operators::GruKernel>::OperatorWithKernel; void InferShape() const override; }; diff --git a/src/operators/im2sequence_op.h b/src/operators/im2sequence_op.h index 50d5664c1a3ce999a0c163225d20126961804a22..036b496ca8293432aa30ae86542e78880143f086 100644 --- a/src/operators/im2sequence_op.h +++ b/src/operators/im2sequence_op.h @@ -16,15 +16,14 @@ limitations under the License. */ #pragma once -#include +#include #include "framework/operator.h" #include "operators/kernel/im2sequence_kernel.h" +#include "operators/op_param.h" namespace paddle_mobile { namespace operators { -using namespace framework; - template class Im2SequenceOp : public framework::OperatorWithKernel< DeviceType, Im2SequenceParam, @@ -39,9 +38,6 @@ class Im2SequenceOp : public framework::OperatorWithKernel< operators::Im2SequenceKernel>(type, inputs, outputs, attrs, scope) {} - // using framework::OperatorWithKernel< - // DeviceType, Im2SequenceParam, - // operators::Im2SequenceKernel>::OperatorWithKernel; void InferShape() const override; private: diff --git a/src/operators/kernel/arm/batchnorm_kernel.cpp b/src/operators/kernel/arm/batchnorm_kernel.cpp index c420727f425092240994ee834117225c72abeec2..f31c4426db7d28234692742fcd670cb26ec50ab0 100644 --- a/src/operators/kernel/arm/batchnorm_kernel.cpp +++ b/src/operators/kernel/arm/batchnorm_kernel.cpp @@ -26,8 +26,7 @@ bool BatchNormKernel::Init(BatchNormParam *param) { } template <> -void BatchNormKernel::Compute( - const BatchNormParam ¶m) const { +void BatchNormKernel::Compute(const BatchNormParam ¶m) { BatchnormCompute(param); } diff --git a/src/operators/kernel/arm/bilinear_interp_kernel.cpp b/src/operators/kernel/arm/bilinear_interp_kernel.cpp index 4888f7a37a47fe80ffcbaee7e3f80b1d5c1f20f4..85192e28edf8351bd8be540b27aa986b2c458d0d 100644 --- a/src/operators/kernel/arm/bilinear_interp_kernel.cpp +++ 
b/src/operators/kernel/arm/bilinear_interp_kernel.cpp @@ -27,7 +27,7 @@ bool BilinearInterpKernel::Init(BilinearInterpParam *param) { template <> void BilinearInterpKernel::Compute( - const BilinearInterpParam ¶m) const { + const BilinearInterpParam ¶m) { BilinearInterpCompute(param); } diff --git a/src/operators/kernel/arm/box_coder_kernel.cpp b/src/operators/kernel/arm/box_coder_kernel.cpp index b769d4fbbaa7570ee741476f960d9e5b60c61917..30ede12dffe0eed7673c9ae1f7c836fd1b5b7096 100644 --- a/src/operators/kernel/arm/box_coder_kernel.cpp +++ b/src/operators/kernel/arm/box_coder_kernel.cpp @@ -26,8 +26,7 @@ bool BoxCoderKernel::Init(BoxCoderParam *param) { } template <> -void BoxCoderKernel::Compute( - const BoxCoderParam ¶m) const { +void BoxCoderKernel::Compute(const BoxCoderParam ¶m) { BoxCoderCompute(param); } diff --git a/src/operators/kernel/arm/concat_kernel.cpp b/src/operators/kernel/arm/concat_kernel.cpp index 04c590e6b432fbf88cd136eac942485adf9a9003..8cdf6cb01afa85239bfd0d48bbce02790ba5250d 100644 --- a/src/operators/kernel/arm/concat_kernel.cpp +++ b/src/operators/kernel/arm/concat_kernel.cpp @@ -26,7 +26,7 @@ bool ConcatKernel::Init(ConcatParam *param) { } template <> -void ConcatKernel::Compute(const ConcatParam ¶m) const { +void ConcatKernel::Compute(const ConcatParam ¶m) { ConcatCompute(param); param.Out()->set_lod(param.Inputs()[0]->lod()); } diff --git a/src/operators/kernel/arm/conv_add_add_prelu_kernel.cpp b/src/operators/kernel/arm/conv_add_add_prelu_kernel.cpp index 74b88f5d4f5e24b1401803c8c48d99319f412d1b..2f6f5f3ac719b3fd32aac54ce36eb534f7d99dd7 100644 --- a/src/operators/kernel/arm/conv_add_add_prelu_kernel.cpp +++ b/src/operators/kernel/arm/conv_add_add_prelu_kernel.cpp @@ -28,7 +28,7 @@ bool ConvAddAddPReluKernel::Init( template <> void ConvAddAddPReluKernel::Compute( - const FusionConvAddAddPReluParam ¶m) const { + const FusionConvAddAddPReluParam ¶m) { ConvAddAddPReluCompute(param); } template class ConvAddAddPReluKernel; diff --git 
a/src/operators/kernel/arm/conv_add_bn_relu_kernel.cpp b/src/operators/kernel/arm/conv_add_bn_relu_kernel.cpp index ca53ebea8e4577fdc52fad066691d4351eaf12f9..eb55920621db34d191a9536f287ec50747e1ce3c 100644 --- a/src/operators/kernel/arm/conv_add_bn_relu_kernel.cpp +++ b/src/operators/kernel/arm/conv_add_bn_relu_kernel.cpp @@ -55,7 +55,7 @@ bool ConvAddBNReluKernel::Init( template <> void ConvAddBNReluKernel::Compute( - const FusionConvAddBNReluParam ¶m) const { + const FusionConvAddBNReluParam ¶m) { ConvAddBNReluCompute(param); } template class ConvAddBNReluKernel; diff --git a/src/operators/kernel/arm/conv_add_kernel.cpp b/src/operators/kernel/arm/conv_add_kernel.cpp index 1af1c3db1159cd4fed007ebf153ba15b804eee75..e016b8efbd15472ae0d77423d84dc19671bfa316 100644 --- a/src/operators/kernel/arm/conv_add_kernel.cpp +++ b/src/operators/kernel/arm/conv_add_kernel.cpp @@ -25,8 +25,7 @@ bool ConvAddKernel::Init(FusionConvAddParam *param) { } template <> -void ConvAddKernel::Compute( - const FusionConvAddParam ¶m) const { +void ConvAddKernel::Compute(const FusionConvAddParam ¶m) { ConvAddCompute(param); } diff --git a/src/operators/kernel/arm/conv_add_prelu_kernel.cpp b/src/operators/kernel/arm/conv_add_prelu_kernel.cpp index 5930cfdcfc0f983c9f07754113dc37d5122d19f0..f04a9a7d746f2d970196945707bd05409c5fa340 100644 --- a/src/operators/kernel/arm/conv_add_prelu_kernel.cpp +++ b/src/operators/kernel/arm/conv_add_prelu_kernel.cpp @@ -27,7 +27,7 @@ bool ConvAddPReluKernel::Init(FusionConvAddPReluParam *param) { template <> void ConvAddPReluKernel::Compute( - const FusionConvAddPReluParam ¶m) const { + const FusionConvAddPReluParam ¶m) { ConvAddPReluCompute(param); } template class ConvAddPReluKernel; diff --git a/src/operators/kernel/arm/conv_add_relu_kernel.cpp b/src/operators/kernel/arm/conv_add_relu_kernel.cpp index f50e1e3900bb5fce35a29100d6c2cb6004b4af74..211d6d8487bfd4afc71d74e5ecbff149ad34e466 100644 --- a/src/operators/kernel/arm/conv_add_relu_kernel.cpp +++ 
b/src/operators/kernel/arm/conv_add_relu_kernel.cpp @@ -27,7 +27,7 @@ bool ConvAddReluKernel::Init(FusionConvAddReluParam *param) { template <> void ConvAddReluKernel::Compute( - const FusionConvAddReluParam ¶m) const { + const FusionConvAddReluParam ¶m) { ConvAddReluCompute(param); } template class ConvAddReluKernel; diff --git a/src/operators/kernel/arm/conv_bn_add_relu_kernel.cpp b/src/operators/kernel/arm/conv_bn_add_relu_kernel.cpp index 785b13dde2ec1196792d17b253bb0d904da799f5..a0f21dd6126ed81cf5e96f99bd0f8ed5211f96a4 100644 --- a/src/operators/kernel/arm/conv_bn_add_relu_kernel.cpp +++ b/src/operators/kernel/arm/conv_bn_add_relu_kernel.cpp @@ -55,7 +55,7 @@ bool ConvBNAddReluKernel::Init( template <> void ConvBNAddReluKernel::Compute( - const FusionConvBNAddReluParam ¶m) const { + const FusionConvBNAddReluParam ¶m) { ConvBNAddReluCompute(param); } template class ConvBNAddReluKernel; diff --git a/src/operators/kernel/arm/conv_bn_relu_kernel.cpp b/src/operators/kernel/arm/conv_bn_relu_kernel.cpp index 6b9ea0428fa496980a234c7c895ef9cbf1245b51..d8acb8d2083b732da026a9bff19c2d7732568597 100644 --- a/src/operators/kernel/arm/conv_bn_relu_kernel.cpp +++ b/src/operators/kernel/arm/conv_bn_relu_kernel.cpp @@ -57,7 +57,7 @@ bool ConvBNReluKernel::Init(FusionConvBNReluParam *param) { template <> void ConvBNReluKernel::Compute( - const FusionConvBNReluParam ¶m) const { + const FusionConvBNReluParam ¶m) { ConvBNReluCompute(param); } template class ConvBNReluKernel; diff --git a/src/operators/kernel/arm/conv_kernel.cpp b/src/operators/kernel/arm/conv_kernel.cpp index be518d3a2cac2f3a749a7bbbbd0c15a17cf2904c..93aaea4afd7026f792a007b337a35c2bde48ad48 100644 --- a/src/operators/kernel/arm/conv_kernel.cpp +++ b/src/operators/kernel/arm/conv_kernel.cpp @@ -55,7 +55,7 @@ bool ConvKernel::Init(ConvParam *param) { } template <> -void ConvKernel::Compute(const ConvParam ¶m) const { +void ConvKernel::Compute(const ConvParam ¶m) { switch (param.ExecMode()) { case 
ConvParam::EXEC_GEMM_INT8: GemmConv(param); diff --git a/src/operators/kernel/arm/conv_transpose_kernel.cpp b/src/operators/kernel/arm/conv_transpose_kernel.cpp index 94f8a79101ca4b1f4085a4d172fee761714dc3d2..771a846ed65e5c69090698ce813103077dedaccf 100644 --- a/src/operators/kernel/arm/conv_transpose_kernel.cpp +++ b/src/operators/kernel/arm/conv_transpose_kernel.cpp @@ -27,7 +27,7 @@ bool ConvTransposeKernel::Init(ConvTransposeParam *param) { template <> void ConvTransposeKernel::Compute( - const ConvTransposeParam ¶m) const { + const ConvTransposeParam ¶m) { ConvTransposeCompute(param); } diff --git a/src/operators/kernel/arm/crf_kernel.cpp b/src/operators/kernel/arm/crf_kernel.cpp index 89769c50a6fc05b28192ebf584ba3cb12f19ac2c..d30c28b3576e2a8a8a108ae6c86edc2f4310b83f 100644 --- a/src/operators/kernel/arm/crf_kernel.cpp +++ b/src/operators/kernel/arm/crf_kernel.cpp @@ -27,7 +27,7 @@ bool CrfKernel::Init(CrfParam *param) { } template <> -void CrfKernel::Compute(const CrfParam ¶m) const { +void CrfKernel::Compute(const CrfParam ¶m) { CrfCompute(param); } diff --git a/src/operators/kernel/arm/depthwise_conv_kernel.cpp b/src/operators/kernel/arm/depthwise_conv_kernel.cpp index fd5e068afb6f7f2a069a7d8fccc459d4c2a6828d..000d59baa8c804201cbd2e2a731c2077196b698f 100644 --- a/src/operators/kernel/arm/depthwise_conv_kernel.cpp +++ b/src/operators/kernel/arm/depthwise_conv_kernel.cpp @@ -26,8 +26,7 @@ bool DepthwiseConvKernel::Init(ConvParam *param) { } template <> -void DepthwiseConvKernel::Compute( - const ConvParam ¶m) const { +void DepthwiseConvKernel::Compute(const ConvParam ¶m) { DepthwiseConvCompute(param); } diff --git a/src/operators/kernel/arm/dequantize_kernel.cpp b/src/operators/kernel/arm/dequantize_kernel.cpp index 64da460da1b90bcc9b16500b9562e270a4110f78..03122047f61c585c3955ca18243ab849fb498728 100644 --- a/src/operators/kernel/arm/dequantize_kernel.cpp +++ b/src/operators/kernel/arm/dequantize_kernel.cpp @@ -29,8 +29,7 @@ bool 
DequantizeKernel::Init(DequantizeParam *param) { } template <> -void DequantizeKernel::Compute( - const DequantizeParam ¶m) const { +void DequantizeKernel::Compute(const DequantizeParam ¶m) { const Tensor *input = param.input_; Tensor *output = param.out_; float activation_scale = param.activation_scale_->data()[0]; diff --git a/src/operators/kernel/arm/dropout_kernel.cpp b/src/operators/kernel/arm/dropout_kernel.cpp index 4578ac6607d87c316853f6201f02f8204bc41de1..964773ad696ea53fccec62a394f00fa70daf7145 100644 --- a/src/operators/kernel/arm/dropout_kernel.cpp +++ b/src/operators/kernel/arm/dropout_kernel.cpp @@ -27,7 +27,7 @@ bool DropoutKernel::Init(DropoutParam *para) { template struct DropoutFunctor { - DropoutFunctor(T drop_pro) : dropout_pro_(drop_pro) {} + explicit DropoutFunctor(T drop_pro) : dropout_pro_(drop_pro) {} inline T operator()(T in) const { return (1 - dropout_pro_) * in; } private: @@ -35,7 +35,7 @@ struct DropoutFunctor { }; template <> -void DropoutKernel::Compute(const DropoutParam ¶m) const { +void DropoutKernel::Compute(const DropoutParam ¶m) { const auto *input_x = param.InputX(); auto *input_x_ptr = input_x->data(); auto *out = param.Out(); diff --git a/src/operators/kernel/arm/dwconv_bn_relu_kernel.cpp b/src/operators/kernel/arm/dwconv_bn_relu_kernel.cpp index b85701bb936b2ccc0323e4d534424abb726a69be..f92d9a273467bf15d9d7fad43237af5385d3d54e 100644 --- a/src/operators/kernel/arm/dwconv_bn_relu_kernel.cpp +++ b/src/operators/kernel/arm/dwconv_bn_relu_kernel.cpp @@ -54,7 +54,7 @@ bool DWConvBNReluKernel::Init(FusionDWConvBNReluParam *param) { template <> void DWConvBNReluKernel::Compute( - const FusionDWConvBNReluParam ¶m) const { + const FusionDWConvBNReluParam ¶m) { DWConvBNReluCompute(param); } template class DWConvBNReluKernel; diff --git a/src/operators/kernel/arm/elementwise_add_kernel.cpp b/src/operators/kernel/arm/elementwise_add_kernel.cpp index 9c6f4a3316385b803a8fdb833490f1fe9e7f41ac..043d27e72f16ab4b38f31d6cff60bd2f4e89a649 
100644 --- a/src/operators/kernel/arm/elementwise_add_kernel.cpp +++ b/src/operators/kernel/arm/elementwise_add_kernel.cpp @@ -27,7 +27,7 @@ bool ElementwiseAddKernel::Init(ElementwiseAddParam *param) { template <> void ElementwiseAddKernel::Compute( - const ElementwiseAddParam ¶m) const { + const ElementwiseAddParam ¶m) { ElementwiseAddCompute(param); param.Out()->set_lod(param.InputX()->lod()); } diff --git a/src/operators/kernel/arm/elementwise_mul_kernel.cpp b/src/operators/kernel/arm/elementwise_mul_kernel.cpp index 00205952a2567aae5927e318c494c90bc4a5ffbb..9c245707da31d07e2419439c68343f7014beb416 100644 --- a/src/operators/kernel/arm/elementwise_mul_kernel.cpp +++ b/src/operators/kernel/arm/elementwise_mul_kernel.cpp @@ -27,7 +27,7 @@ bool ElementwiseMulKernel::Init(ElementwiseMulParam *param) { template <> void ElementwiseMulKernel::Compute( - const ElementwiseMulParam ¶m) const { + const ElementwiseMulParam ¶m) { ElementwiseMulCompute(param); param.Out()->set_lod(param.InputX()->lod()); } diff --git a/src/operators/kernel/arm/elementwise_sub_kernel.cpp b/src/operators/kernel/arm/elementwise_sub_kernel.cpp index d78b3e31098ef7ef929a0d2c00043fab7193b01c..30f607155c4a91f4f523c6596f09c2379970108c 100644 --- a/src/operators/kernel/arm/elementwise_sub_kernel.cpp +++ b/src/operators/kernel/arm/elementwise_sub_kernel.cpp @@ -27,7 +27,7 @@ bool ElementwiseSubKernel::Init(ElementwiseSubParam *param) { template <> void ElementwiseSubKernel::Compute( - const ElementwiseSubParam ¶m) const { + const ElementwiseSubParam ¶m) { ElementwiseSubCompute(param); param.Out()->set_lod(param.InputX()->lod()); } diff --git a/src/operators/kernel/arm/feed_kernel.cpp b/src/operators/kernel/arm/feed_kernel.cpp new file mode 100644 index 0000000000000000000000000000000000000000..598f6df01b16683f4d6e06f6418a2930a7ec8736 --- /dev/null +++ b/src/operators/kernel/arm/feed_kernel.cpp @@ -0,0 +1,34 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. 
+ +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "operators/kernel/feed_kernel.h" + +namespace paddle_mobile { +namespace operators { + +template <> +bool FeedKernel::Init(FeedParam *param) { + return true; +} + +template <> +void FeedKernel::Compute(const FeedParam ¶m) { + param.Out()->ShareDataWith(*(param.InputX())); + param.Out()->set_lod(param.InputX()->lod()); +} + +template class FeedKernel; + +} // namespace operators +} // namespace paddle_mobile diff --git a/src/operators/kernel/arm/fetch_kernel.cpp b/src/operators/kernel/arm/fetch_kernel.cpp new file mode 100644 index 0000000000000000000000000000000000000000..6c25514857dee9029afa3a7a80d5c89a97bbe9be --- /dev/null +++ b/src/operators/kernel/arm/fetch_kernel.cpp @@ -0,0 +1,24 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + http://www.apache.org/licenses/LICENSE-2.0 + Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ +#include "operators/kernel/fetch_kernel.h" +namespace paddle_mobile { +namespace operators { +template <> +bool FetchKernel::Init(FetchParam *param) { + return true; +} +template <> +void FetchKernel::Compute(const FetchParam ¶m) { + param.Out()->ShareDataWith(*(param.InputX())); +} +template class FetchKernel; +} // namespace operators +} // namespace paddle_mobile diff --git a/src/operators/kernel/arm/flatten_kernel.cpp b/src/operators/kernel/arm/flatten_kernel.cpp index ef4fe913c4800526f46daa75760afe82fdbee591..4d00e494544557ce05f2af16bb59979ea2b8927f 100644 --- a/src/operators/kernel/arm/flatten_kernel.cpp +++ b/src/operators/kernel/arm/flatten_kernel.cpp @@ -26,7 +26,7 @@ bool FlattenKernel::Init(FlattenParam *param) { } template <> -void FlattenKernel::Compute(const FlattenParam ¶m) const { +void FlattenKernel::Compute(const FlattenParam ¶m) { FlattenCompute(param); } diff --git a/src/operators/kernel/arm/fusion_fc_kernel.cpp b/src/operators/kernel/arm/fusion_fc_kernel.cpp index d9d112e7a762705efe041c74eea9ddb7d5162918..c503edab643def7af0585a18d774b14ca0a3c39d 100644 --- a/src/operators/kernel/arm/fusion_fc_kernel.cpp +++ b/src/operators/kernel/arm/fusion_fc_kernel.cpp @@ -26,8 +26,7 @@ bool FusionFcKernel::Init(FusionFcParam *param) { } template <> -void FusionFcKernel::Compute( - const FusionFcParam ¶m) const { +void FusionFcKernel::Compute(const FusionFcParam ¶m) { FusionFcCompute(param); param.Out()->set_lod(param.InputX()->lod()); } diff --git a/src/operators/kernel/arm/gru_kernel.cpp b/src/operators/kernel/arm/gru_kernel.cpp index 168471185e07a9c1814c708238996a82c1ee0891..a4e89ff42a3d70c0a9a3d1bd7316e18d015a0926 100644 --- a/src/operators/kernel/arm/gru_kernel.cpp +++ b/src/operators/kernel/arm/gru_kernel.cpp @@ -26,7 +26,7 @@ bool GruKernel::Init(GruParam *param) { } template <> -void GruKernel::Compute(const GruParam ¶m) const { +void GruKernel::Compute(const GruParam ¶m) { GruCompute(param); param.OutHidden()->set_lod(param.InputInput()->lod()); 
// DLOG << "________________" << param.OutHidden()->dims(); diff --git a/src/operators/kernel/arm/im2sequence_kernel.cpp b/src/operators/kernel/arm/im2sequence_kernel.cpp index 8295fd94a31db2ad1c10d32a8c639b067e422f45..07ce0314fa08467d4fc63bc0745a49b8a3b2f263 100644 --- a/src/operators/kernel/arm/im2sequence_kernel.cpp +++ b/src/operators/kernel/arm/im2sequence_kernel.cpp @@ -33,9 +33,9 @@ inline int Im2SeqOutputSize(int input_size, int filter_size, int padding_0, template <> void Im2SequenceKernel::Compute( - const Im2SequenceParam ¶m) const { + const Im2SequenceParam ¶m) { const Tensor *in_x = param.Input(); - Tensor *out = param.Output(); + framework::LoDTensor *out = param.Output(); out->mutable_data(); std::vector kernels = param.Kernels(); @@ -52,22 +52,31 @@ void Im2SequenceKernel::Compute( paddings[2], strides[0]); int output_width = Im2SeqOutputSize(img_width, kernels[1], paddings[1], paddings[3], strides[1]); - const std::vector dilations({1, 1}); - // TODO: verify + out->mutable_data({batch_size * output_height * output_width, + img_channels * kernels[0] * kernels[1]}); + const std::vector dilations({1, 1}); + // TODO(): verify auto out_dims = out->dims(); out->Resize({batch_size, out->numel() / batch_size}); - for (int i = 0; i < batch_size; i++) { const Tensor src = in_x->Slice(i, i + 1).Resize({img_channels, img_height, img_width}); Tensor dst = out->Slice(i, i + 1).Resize( {output_height, output_width, img_channels, kernels[0], kernels[1]}); - math::Im2ColFunctor f; f(src, dilations, strides, paddings, &dst); } out->Resize(out_dims); + framework::LoD lod(1); + lod[0].reserve(batch_size + 1); + int offset = 0; + lod[0].push_back(offset); + for (int i = 0; i < batch_size; ++i) { + offset += output_height * output_width; + lod[0].push_back(offset); + } + out->set_lod(lod); } template class Im2SequenceKernel; diff --git a/src/operators/kernel/arm/lookup_kernel.cpp b/src/operators/kernel/arm/lookup_kernel.cpp index 
584c497c701bd0598e0a151774fe60b7c7fee718..0e6df6ab6bf19f67b0c5f5a873d4a47215167e45 100644 --- a/src/operators/kernel/arm/lookup_kernel.cpp +++ b/src/operators/kernel/arm/lookup_kernel.cpp @@ -25,7 +25,7 @@ bool LookupKernel::Init(LookupParam *param) { } template <> -void LookupKernel::Compute(const LookupParam ¶m) const { +void LookupKernel::Compute(const LookupParam ¶m) { LookupCompute(param); param.Out()->set_lod(param.InputIds()->lod()); } diff --git a/src/operators/kernel/arm/lrn_kernel.cpp b/src/operators/kernel/arm/lrn_kernel.cpp index 3ec1bdd9a0e2ebbce555eef944fe56750505430f..bf049053fc5b9157f24c50233742eea3c0ca2de1 100644 --- a/src/operators/kernel/arm/lrn_kernel.cpp +++ b/src/operators/kernel/arm/lrn_kernel.cpp @@ -26,7 +26,7 @@ bool LrnKernel::Init(LrnParam *param) { } template <> -void LrnKernel::Compute(const LrnParam ¶m) const { +void LrnKernel::Compute(const LrnParam ¶m) { LrnCompute(param); } diff --git a/src/operators/kernel/arm/mul_kernel.cpp b/src/operators/kernel/arm/mul_kernel.cpp index 276281f963e449af9d55f7c5ca58ef5da17e6f93..59d16600d71d247c42bb7625a3dddd5952a33705 100644 --- a/src/operators/kernel/arm/mul_kernel.cpp +++ b/src/operators/kernel/arm/mul_kernel.cpp @@ -26,7 +26,7 @@ bool MulKernel::Init(MulParam *param) { } template <> -void MulKernel::Compute(const MulParam ¶m) const { +void MulKernel::Compute(const MulParam ¶m) { MulCompute(param); param.Out()->set_lod(param.InputX()->lod()); } diff --git a/src/operators/kernel/arm/multiclass_nms_kernel.cpp b/src/operators/kernel/arm/multiclass_nms_kernel.cpp index 938f81cf485eb64f408c0fb274eeec673349e306..61638da0051c7b27b695752c445f0fd6b20114b5 100644 --- a/src/operators/kernel/arm/multiclass_nms_kernel.cpp +++ b/src/operators/kernel/arm/multiclass_nms_kernel.cpp @@ -27,7 +27,7 @@ bool MultiClassNMSKernel::Init(MultiClassNMSParam *param) { template <> void MultiClassNMSKernel::Compute( - const MultiClassNMSParam ¶m) const { + const MultiClassNMSParam ¶m) { MultiClassNMSCompute(param); } diff 
--git a/src/operators/kernel/arm/polygon_box_transform_kernel.cpp b/src/operators/kernel/arm/polygon_box_transform_kernel.cpp index e72c29135e9898d3b5342d1c4b4f0176f105a62a..1ae11aba41f1b2dbd9207e0808990a262bb80f56 100644 --- a/src/operators/kernel/arm/polygon_box_transform_kernel.cpp +++ b/src/operators/kernel/arm/polygon_box_transform_kernel.cpp @@ -28,7 +28,7 @@ bool PolygonBoxTransformKernel::Init( template <> void PolygonBoxTransformKernel::Compute( - const PolygonBoxTransformParam ¶m) const { + const PolygonBoxTransformParam ¶m) { PolygonBoxTransformCompute(param); } diff --git a/src/operators/kernel/arm/pool_kernel.cpp b/src/operators/kernel/arm/pool_kernel.cpp index 60d6f1401876b957649d08889218b88cf1fe5eef..58d6359efa48b0db215269a631e7e4cb57c429d9 100644 --- a/src/operators/kernel/arm/pool_kernel.cpp +++ b/src/operators/kernel/arm/pool_kernel.cpp @@ -25,7 +25,7 @@ bool PoolKernel::Init(PoolParam *param) { } template <> -void PoolKernel::Compute(const PoolParam ¶m) const { +void PoolKernel::Compute(const PoolParam ¶m) { PoolCompute(param); } } // namespace operators diff --git a/src/operators/kernel/arm/prelu_kernel.cpp b/src/operators/kernel/arm/prelu_kernel.cpp index e1ec927fb13d1f4a2e600d46f65f2806448059d9..591bd644165f1a271a879073b27429d1780cbfb5 100644 --- a/src/operators/kernel/arm/prelu_kernel.cpp +++ b/src/operators/kernel/arm/prelu_kernel.cpp @@ -35,7 +35,7 @@ struct PReluFunctor { * @b 特化到具体平台的实现, param 从 op 层传入 * */ template <> -void PReluKernel::Compute(const PReluParam ¶m) const { +void PReluKernel::Compute(const PReluParam ¶m) { auto *x = param.InputX(); auto *alpha = param.InputAlpha(); auto *out = param.Out(); diff --git a/src/operators/kernel/arm/prior_box_kernel.cpp b/src/operators/kernel/arm/prior_box_kernel.cpp index 71011fa2112f36d573b5bdc55f1b5bf92318c448..c067d3388dd928b032178add99c6567a8add20d3 100644 --- a/src/operators/kernel/arm/prior_box_kernel.cpp +++ b/src/operators/kernel/arm/prior_box_kernel.cpp @@ -26,8 +26,7 @@ bool 
PriorBoxKernel::Init(PriorBoxParam *param) { } template <> -void PriorBoxKernel::Compute( - const PriorBoxParam ¶m) const { +void PriorBoxKernel::Compute(const PriorBoxParam ¶m) { PriorBoxCompute(param); } diff --git a/src/operators/kernel/arm/quantize_kernel.cpp b/src/operators/kernel/arm/quantize_kernel.cpp index 11a1f0a53d4886e1a07d258b76b3827671471dca..17f442abe4e03d936eb3b317d5b6f164ac0924e7 100644 --- a/src/operators/kernel/arm/quantize_kernel.cpp +++ b/src/operators/kernel/arm/quantize_kernel.cpp @@ -279,8 +279,7 @@ bool QuantizeKernel::Init(QuantizeParam *param) { } template <> -void QuantizeKernel::Compute( - const QuantizeParam ¶m) const { +void QuantizeKernel::Compute(const QuantizeParam ¶m) { float max_abs = 0.f; const Tensor *input = param.input_; Tensor *output = param.out_; diff --git a/src/operators/kernel/arm/relu_kernel.cpp b/src/operators/kernel/arm/relu_kernel.cpp index 6e04e6013aa8dd5c50dcc22a720b470b08ecd648..8ee103484eb753913e5554b64d6dac523066322a 100644 --- a/src/operators/kernel/arm/relu_kernel.cpp +++ b/src/operators/kernel/arm/relu_kernel.cpp @@ -26,7 +26,7 @@ bool ReluKernel::Init(ReluParam *param) { } template <> -void ReluKernel::Compute(const ReluParam ¶m) const { +void ReluKernel::Compute(const ReluParam ¶m) { ReluCompute(param); } diff --git a/src/operators/kernel/arm/reshape2_kernel.cpp b/src/operators/kernel/arm/reshape2_kernel.cpp new file mode 100644 index 0000000000000000000000000000000000000000..093105f906da2287015417ec05b709aebd4a1fb2 --- /dev/null +++ b/src/operators/kernel/arm/reshape2_kernel.cpp @@ -0,0 +1,36 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#ifdef RESHAPE2_OP + +#include "operators/kernel/reshape2_kernel.h" +#include "operators/kernel/central-arm-func/reshape2_arm_func.h" + +namespace paddle_mobile { +namespace operators { + +template <> +bool Reshape2Kernel::Init(Reshape2Param *param) { + return true; +} + +template <> +void Reshape2Kernel::Compute(const Reshape2Param ¶m) { + Reshape2Compute(param); +} + +} // namespace operators +} // namespace paddle_mobile + +#endif diff --git a/src/operators/kernel/arm/reshape_kernel.cpp b/src/operators/kernel/arm/reshape_kernel.cpp index 235288ae13e2c557e6f7310727f5d8e6e83cedf6..800808f9c23cd07d17f8207b9b51e96d3feb34f3 100644 --- a/src/operators/kernel/arm/reshape_kernel.cpp +++ b/src/operators/kernel/arm/reshape_kernel.cpp @@ -26,7 +26,7 @@ bool ReshapeKernel::Init(ReshapeParam *param) { } template <> -void ReshapeKernel::Compute(const ReshapeParam ¶m) const { +void ReshapeKernel::Compute(const ReshapeParam ¶m) { ReshapeCompute(param); } diff --git a/src/operators/kernel/arm/resize_kernel.cpp b/src/operators/kernel/arm/resize_kernel.cpp index 5c0c186554a31454447b1df47a1b7573fd948fb9..b53b7545e33c929fe0b55bccd68e7b955db0d676 100644 --- a/src/operators/kernel/arm/resize_kernel.cpp +++ b/src/operators/kernel/arm/resize_kernel.cpp @@ -108,7 +108,7 @@ void ResizeTensor(const Tensor* src, Tensor* dst) { } template <> -void ResizeKernel::Compute(const ResizeParam& param) const { +void ResizeKernel::Compute(const ResizeParam& param) { const auto* input_x = param.InputX(); const auto& input_x_dims = input_x->dims(); auto* out = param.Out(); diff --git 
a/src/operators/kernel/arm/scale_kernel.cpp b/src/operators/kernel/arm/scale_kernel.cpp index 299132ea00f40838249022c45d994e7d88547eaa..bded56275f80741c552d4978bb238d6f0d6339db 100644 --- a/src/operators/kernel/arm/scale_kernel.cpp +++ b/src/operators/kernel/arm/scale_kernel.cpp @@ -23,7 +23,7 @@ namespace operators { * @b 特化到具体平台的实现, param 从 op 层传入 * */ template <> -void ScaleKernel::Compute(const ScaleParam ¶m) const { +void ScaleKernel::Compute(const ScaleParam ¶m) { const auto *input_x = param.InputX(); auto *input_x_ptr = input_x->data(); auto *out = param.Out(); diff --git a/src/operators/kernel/arm/shape_kernel.cpp b/src/operators/kernel/arm/shape_kernel.cpp index 1687cfb4cdaf12eb2be9d465a83b82034b59f7cc..4adbf8fa1321c57330b480068ff1f7df7454d7e6 100644 --- a/src/operators/kernel/arm/shape_kernel.cpp +++ b/src/operators/kernel/arm/shape_kernel.cpp @@ -26,7 +26,7 @@ bool ShapeKernel::Init(ShapeParam *param) { } template <> -void ShapeKernel::Compute(const ShapeParam ¶m) const { +void ShapeKernel::Compute(const ShapeParam ¶m) { ShapeCompute(param); } diff --git a/src/operators/kernel/arm/sigmoid_kernel.cpp b/src/operators/kernel/arm/sigmoid_kernel.cpp index 7912fd8762b693cd40c632d6b152406ed4b0c568..3d6e14ffea80169172431229e34309cde331d588 100644 --- a/src/operators/kernel/arm/sigmoid_kernel.cpp +++ b/src/operators/kernel/arm/sigmoid_kernel.cpp @@ -32,7 +32,7 @@ bool SigmoidKernel::Init(SigmoidParam *param) { } template <> -void SigmoidKernel::Compute(const SigmoidParam ¶m) const { +void SigmoidKernel::Compute(const SigmoidParam ¶m) { SigmoidCompute(param); } diff --git a/src/operators/kernel/arm/slice_kernel.cpp b/src/operators/kernel/arm/slice_kernel.cpp index 62efec9d2fb01568a108df8f3516085d81865bf7..e373b569870c81587377ac02e578397518513a85 100644 --- a/src/operators/kernel/arm/slice_kernel.cpp +++ b/src/operators/kernel/arm/slice_kernel.cpp @@ -17,6 +17,14 @@ limitations under the License. 
*/ #include "operators/kernel/slice_kernel.h" namespace paddle_mobile { -namespace operators {} +namespace operators { + +template <> +bool SliceKernel::Init(SliceParam* param) { + return true; +} +template <> +void SliceKernel::Compute(const SliceParam& param) {} +} // namespace operators } // namespace paddle_mobile #endif diff --git a/src/operators/kernel/arm/softmax_kernel.cpp b/src/operators/kernel/arm/softmax_kernel.cpp index f86a10601aa3a67300736f2f4c751c05bf41a781..d5a1009fd79d57d8815d313ed61bbc5d7bf32bbe 100644 --- a/src/operators/kernel/arm/softmax_kernel.cpp +++ b/src/operators/kernel/arm/softmax_kernel.cpp @@ -26,7 +26,7 @@ bool SoftmaxKernel::Init(SoftmaxParam *param) { } template <> -void SoftmaxKernel::Compute(const SoftmaxParam ¶m) const { +void SoftmaxKernel::Compute(const SoftmaxParam ¶m) { SoftmaxCompute(param); } diff --git a/src/operators/kernel/arm/split_kernel.cpp b/src/operators/kernel/arm/split_kernel.cpp index d2ca34f764adc50154fb58e3a6248f9311bbface..13c7567e3db137f0c579ad0e33b1856aaf8334f2 100644 --- a/src/operators/kernel/arm/split_kernel.cpp +++ b/src/operators/kernel/arm/split_kernel.cpp @@ -26,7 +26,7 @@ bool SplitKernel::Init(SplitParam *param) { } template <> -void SplitKernel::Compute(const SplitParam ¶m) const { +void SplitKernel::Compute(const SplitParam ¶m) { SplitCompute(param); } diff --git a/src/operators/kernel/arm/sum_kernel.cpp b/src/operators/kernel/arm/sum_kernel.cpp index 0290037522a2bf3b3c88ce129eda277a401fecb5..2b36a382a1681b08e5f6c87b9031492e81a579cd 100644 --- a/src/operators/kernel/arm/sum_kernel.cpp +++ b/src/operators/kernel/arm/sum_kernel.cpp @@ -26,7 +26,7 @@ bool SumKernel::Init(SumParam *param) { } template <> -void SumKernel::Compute(const SumParam ¶m) const { +void SumKernel::Compute(const SumParam ¶m) { SumCompute(param); param.Out()->set_lod(param.Inputs()[0]->lod()); } diff --git a/src/operators/kernel/arm/transpose2_kernel.cpp b/src/operators/kernel/arm/transpose2_kernel.cpp new file mode 100644 index 
0000000000000000000000000000000000000000..228f210ea1c52f1bfe601bd46f741347dabd6cce --- /dev/null +++ b/src/operators/kernel/arm/transpose2_kernel.cpp @@ -0,0 +1,35 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ +#ifdef TRANSPOSE2_OP + +#include "operators/kernel/transpose2_kernel.h" +#include "operators/kernel/central-arm-func/transpose2_arm_func.h" + +namespace paddle_mobile { +namespace operators { + +template <> +bool Transpose2Kernel::Init(Transpose2Param *param) { + return true; +} + +template <> +void Transpose2Kernel::Compute(const Transpose2Param ¶m) { + Transpose2Compute(param); +} + +} // namespace operators +} // namespace paddle_mobile + +#endif diff --git a/src/operators/kernel/arm/transpose_kernel.cpp b/src/operators/kernel/arm/transpose_kernel.cpp index bb7a881bdc1d2706a25a77833ca38695ede2fec7..f90376eb507253badb209838a3db4bafbcfbb5b9 100644 --- a/src/operators/kernel/arm/transpose_kernel.cpp +++ b/src/operators/kernel/arm/transpose_kernel.cpp @@ -25,8 +25,7 @@ bool TransposeKernel::Init(TransposeParam *param) { } template <> -void TransposeKernel::Compute( - const TransposeParam ¶m) const { +void TransposeKernel::Compute(const TransposeParam ¶m) { TransposeCompute(param); } diff --git a/src/operators/kernel/batchnorm_kernel.h b/src/operators/kernel/batchnorm_kernel.h index beac7399583d074956fa4564fdd9312b2d7985f0..1f2db456d360d6eb6c684fb98e3807b07cc89b92 100644 --- 
a/src/operators/kernel/batchnorm_kernel.h +++ b/src/operators/kernel/batchnorm_kernel.h @@ -22,13 +22,11 @@ limitations under the License. */ namespace paddle_mobile { namespace operators { -using namespace framework; - template class BatchNormKernel : public framework::OpKernelBase> { public: - void Compute(const BatchNormParam ¶m) const; + void Compute(const BatchNormParam ¶m); bool Init(BatchNormParam *param); }; diff --git a/src/operators/kernel/bilinear_interp_kernel.h b/src/operators/kernel/bilinear_interp_kernel.h index ac3dfcb16190315f72dc60da54c4f944874e4458..9a68fe65a562a8567dab2e5977506e083f7889a2 100644 --- a/src/operators/kernel/bilinear_interp_kernel.h +++ b/src/operators/kernel/bilinear_interp_kernel.h @@ -29,7 +29,7 @@ class BilinearInterpKernel : public framework::OpKernelBase> { public: - void Compute(const BilinearInterpParam& param) const; + void Compute(const BilinearInterpParam& param); bool Init(BilinearInterpParam* param); }; } // namespace operators diff --git a/src/operators/kernel/box_coder_kernel.h b/src/operators/kernel/box_coder_kernel.h index 58144a87349ed3a6504e0074903594be3aa6fe8f..eadb21b3d5ecb95ef82cfef2ac8c3245e925ec7c 100644 --- a/src/operators/kernel/box_coder_kernel.h +++ b/src/operators/kernel/box_coder_kernel.h @@ -29,7 +29,7 @@ template class BoxCoderKernel : public framework::OpKernelBase> { public: - void Compute(const BoxCoderParam& param) const; + void Compute(const BoxCoderParam& param); bool Init(BoxCoderParam* param); }; } // namespace operators diff --git a/src/operators/kernel/central-arm-func/fusion_fc_arm_func.h b/src/operators/kernel/central-arm-func/fusion_fc_arm_func.h index 42c01d2825e052a52e7021a1b2a97997fb9c915b..45d5dc76d1e95668638706a252cc24d7ff2dec40 100644 --- a/src/operators/kernel/central-arm-func/fusion_fc_arm_func.h +++ b/src/operators/kernel/central-arm-func/fusion_fc_arm_func.h @@ -29,10 +29,9 @@ void FusionFcCompute(const FusionFcParam ¶m) { auto *input_z_data = input_z->data(); int axis = 
param.Axis(); Tensor *out = param.Out(); - auto *out_data = out->mutable_data(); // int m = out->dims()[0]; // int n = out->dims()[1]; - + auto *out_data = out->mutable_data(); const Tensor x_matrix = input_x->dims().size() > 2 ? framework::ReshapeToMatrix(*input_x, param.XNumColDims()) diff --git a/src/operators/kernel/central-arm-func/pool_arm_func.h b/src/operators/kernel/central-arm-func/pool_arm_func.h index 37479c22efe95b6506054cf3ded5855aa766c34c..941c237865707bce854aedba56029a4f5de9b2bf 100644 --- a/src/operators/kernel/central-arm-func/pool_arm_func.h +++ b/src/operators/kernel/central-arm-func/pool_arm_func.h @@ -83,6 +83,7 @@ void PoolCompute(const PoolParam ¶m) { #if __aarch64__ PoolBasic(pooling_type, ksize, strides, paddings, in_x, out); #else + /// todo: fix bug in Pool2x2 if (pooling_type == "max") { math::Pool2x2Maxs2p0(strides, paddings, in_x, out); } else if (pooling_type == "avg") { diff --git a/src/operators/kernel/central-arm-func/reshape2_arm_func.h b/src/operators/kernel/central-arm-func/reshape2_arm_func.h new file mode 100644 index 0000000000000000000000000000000000000000..c22cf120313b039944932fb4e6cc52aa59a68fd4 --- /dev/null +++ b/src/operators/kernel/central-arm-func/reshape2_arm_func.h @@ -0,0 +1,59 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#ifdef RESHAPE2_OP +#pragma once + +#include +#include "operators/kernel/reshape_kernel.h" +#include "operators/op_param.h" + +namespace paddle_mobile { +namespace operators { + +template +void Reshape2Compute(const Reshape2Param ¶m) { + const auto *input_x = param.InputX(); + const auto &input_x_dims = input_x->dims(); + auto *out = param.Out(); + framework::DDim out_dims = out->dims(); + const auto *input_shape = param.InputShape(); + + if (input_shape) { + auto *shape_data = input_shape->data(); + framework::Tensor cpu_shape_tensor; + auto shape = + std::vector(shape_data, shape_data + input_shape->numel()); + out_dims = ValidateShape(shape, input_x->dims()); + } else { + auto &shape = param.Shape(); + out_dims = ValidateShape(shape, input_x_dims); + } + + bool inplace = param.Inplace(); + out->Resize(out_dims); + if (!inplace) { + out->mutable_data(); + framework::TensorCopy(*input_x, out); + out->Resize(out_dims); + } else { + out->ShareDataWith(*input_x); + out->Resize(out_dims); + } +} + +} // namespace operators +} // namespace paddle_mobile + +#endif diff --git a/src/operators/kernel/central-arm-func/softmax_arm_func.h b/src/operators/kernel/central-arm-func/softmax_arm_func.h index d311d97984a7207df9075befe71a9806092966e1..a94c8299c514bc9e2937daf57b1a845d7be56b16 100644 --- a/src/operators/kernel/central-arm-func/softmax_arm_func.h +++ b/src/operators/kernel/central-arm-func/softmax_arm_func.h @@ -24,6 +24,7 @@ void SoftmaxCompute(const SoftmaxParam ¶m) { Tensor *out = param.Out(); auto x_dims = in_x->dims(); out->Resize(x_dims); + out->mutable_data(); math::SoftmaxFuntor()(in_x, out); } } // namespace operators diff --git a/src/operators/kernel/central-arm-func/transpose2_arm_func.h b/src/operators/kernel/central-arm-func/transpose2_arm_func.h new file mode 100644 index 0000000000000000000000000000000000000000..dea90e863b20f19820d60d9cce67b6849d3c467b --- /dev/null +++ b/src/operators/kernel/central-arm-func/transpose2_arm_func.h @@ -0,0 +1,70 @@ 
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#ifdef TRANSPOSE2_OP +#pragma once + +#include +#include "operators/op_param.h" + +namespace paddle_mobile { +namespace operators { + +template +void Transpose2Compute(const Transpose2Param& param) { + const auto* input_x = param.InputX(); + const auto input_x_dims = input_x->dims(); + auto* out = param.Out(); + const auto axis = param.Axis(); + const auto* input_x_data = input_x->data(); + auto* out_data = out->mutable_data(); + + size_t ndim = axis.size(); + std::vector xdim(ndim); + std::vector xstride(ndim); + std::vector xout(ndim); + for (int i = 0; i < ndim; i++) { + int j = ndim - 1 - i; + xdim[j] = input_x_dims[axis[i]]; + xstride[j] = 1; + for (int k = axis[i] + 1; k < ndim; k++) { + xstride[j] *= input_x_dims[k]; + } + xout[j] = xstride[j] * xdim[j]; + } + + auto numel = input_x->numel(); + size_t pind = 0; + std::vector ind(ndim); + for (int i = 0; i < numel; i++) { + out_data[i] = input_x_data[pind]; + ind[0]++; + pind += xstride[0]; + for (int j = 0; j < ndim - 1; j++) { + if (ind[j] == xdim[j]) { + ind[j + 1]++; + ind[j] = 0; + pind += xstride[j + 1]; + pind -= xout[j]; + } else { + break; + } + } + } +} + +} // namespace operators +} // namespace paddle_mobile + +#endif diff --git a/src/operators/kernel/cl/batchnorm_kernel.cpp b/src/operators/kernel/cl/batchnorm_kernel.cpp new file mode 100644 index 
0000000000000000000000000000000000000000..0d5695cb80736dcc126ce5f726c0a2566884fe45 --- /dev/null +++ b/src/operators/kernel/cl/batchnorm_kernel.cpp @@ -0,0 +1,98 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#ifdef BATCHNORM_OP + +#include "operators/kernel/batchnorm_kernel.h" + +namespace paddle_mobile { +namespace operators { + +template <> +bool BatchNormKernel::Init(BatchNormParam *param) { + this->cl_helper_.AddKernel("batchnorm", "batchnorm_kernel.cl"); + const framework::CLImage *mean = param->InputMean(); + const framework::CLImage *variance = param->InputVariance(); + const framework::CLImage *scale = param->InputScale(); + const framework::CLImage *bias = param->InputBias(); + const float epsilon = param->Epsilon(); + + auto mean_ptr = mean->data(); + auto variance_ptr = variance->data(); + auto scale_ptr = scale->data(); + auto bias_ptr = bias->data(); + + const int C = mean->numel(); + float inv_std_ptr[C]; + for (int i = 0; i < C; i++) { + inv_std_ptr[i] = + 1 / static_cast(pow((variance_ptr[i] + epsilon), 0.5)); + } + float *new_scale_ptr = new float[C]; + float *new_bias_ptr = new float[C]; + + for (int i = 0; i < C; i++) { + new_scale_ptr[i] = inv_std_ptr[i] * scale_ptr[i]; + new_bias_ptr[i] = bias_ptr[i] - mean_ptr[i] * inv_std_ptr[i] * scale_ptr[i]; + } + + framework::CLImage *new_scale = new framework::CLImage(); + new_scale->SetTensorData(new_scale_ptr, variance->dims()); + 
new_scale->InitCLImage(this->cl_helper_.CLContext(), + this->cl_helper_.CLCommandQueue()); + + framework::CLImage *new_bias = new framework::CLImage(); + new_bias->SetTensorData(new_bias_ptr, variance->dims()); + new_bias->InitCLImage(this->cl_helper_.CLContext(), + this->cl_helper_.CLCommandQueue()); + + param->SetNewScale(new_scale); + param->SetNewBias(new_bias); + + delete[](new_scale_ptr); + delete[](new_bias_ptr); + + return true; +} + +template <> +void BatchNormKernel::Compute( + const BatchNormParam ¶m) { + auto kernel = this->cl_helper_.KernelAt(0); + auto default_work_size = this->cl_helper_.DefaultWorkSize(*param.OutputY()); + + auto input = param.InputX()->GetCLImage(); + auto out = param.OutputY()->GetCLImage(); + auto new_scale = param.NewScale()->GetCLImage(); + auto new_bias = param.NewBias()->GetCLImage(); + const int out_width = default_work_size[1]; + + clSetKernelArg(kernel, 1, sizeof(int), &out_width); + clSetKernelArg(kernel, 2, sizeof(cl_mem), &input); + clSetKernelArg(kernel, 3, sizeof(cl_mem), &new_scale); + clSetKernelArg(kernel, 4, sizeof(cl_mem), &new_bias); + clSetKernelArg(kernel, 5, sizeof(cl_mem), &out); + + // cl_event out_event = param.OutputY()->GetClEvent(); + // cl_event wait_event = param.InputX()->GetClEvent(); + clEnqueueNDRangeKernel(this->cl_helper_.CLCommandQueue(), kernel, 3, NULL, + default_work_size.data(), NULL, 0, NULL, NULL); +} + +template class BatchNormKernel; + +} // namespace operators +} // namespace paddle_mobile + +#endif diff --git a/src/operators/kernel/cl/cl_kernel/batchnorm_kernel.cl b/src/operators/kernel/cl/cl_kernel/batchnorm_kernel.cl new file mode 100644 index 0000000000000000000000000000000000000000..9d0857a45e0766482e2dbb6ded77edb07517bc0f --- /dev/null +++ b/src/operators/kernel/cl/cl_kernel/batchnorm_kernel.cl @@ -0,0 +1,37 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. 
+ +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma OPENCL EXTENSION cl_khr_fp16 : enable + +__kernel void batchnorm(__private const int out_width, + __read_only image2d_t input, + __read_only image2d_t new_scale_image, + __read_only image2d_t new_bias_image, + __write_only image2d_t output) { + const int out_c = get_global_id(0); + const int out_w = get_global_id(1); + const int out_nh = get_global_id(2); + + const sampler_t sampler = + CLK_NORMALIZED_COORDS_TRUE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST; + + half4 new_scale = read_imageh(new_scale_image, sampler, (int2)(out_c, 0)); + half4 new_bias = read_imageh(new_bias_image, sampler, (int2)(out_c, 0)); + + int pos_x = mad24(out_c, out_width, out_w); + half4 in = read_imageh(input, sampler, (int2)(pos_x, out_nh)); + half4 out = mad(in, new_scale, new_bias); + + write_imageh(output, (int2)(pos_x, out_nh), out); +} diff --git a/src/operators/kernel/cl/cl_kernel/channel_add_kernel.cl b/src/operators/kernel/cl/cl_kernel/channel_add_kernel.cl new file mode 100644 index 0000000000000000000000000000000000000000..1f2e36687ab04be2b8c18b26e868b7709bc3c231 --- /dev/null +++ b/src/operators/kernel/cl/cl_kernel/channel_add_kernel.cl @@ -0,0 +1,30 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma OPENCL EXTENSION cl_khr_fp16 : enable +__kernel void channel_add(__global image2d_t input, __global image2d_t bias,__write_only image2d_t outputImage,int w) { + int x = get_global_id(0); + int y = get_global_id(1); + const sampler_t sampler = CLK_NORMALIZED_COORDS_TRUE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST; + int2 coords; + coords.x = x; + coords.y = y; + int2 coords_bias; + coords_bias.x = x/w; + coords_bias.y = 0; + half4 in = read_imageh(input, sampler, coords); + half4 biase = read_imageh(bias, sampler, coords_bias); + half4 output = in + biase; + write_imageh(outputImage,coords,output); + } diff --git a/src/operators/kernel/cl/cl_kernel/cl_common.h b/src/operators/kernel/cl/cl_kernel/cl_common.h new file mode 100644 index 0000000000000000000000000000000000000000..34f36eb9a3ffbdc5781c974926ea4a7d5258636b --- /dev/null +++ b/src/operators/kernel/cl/cl_kernel/cl_common.h @@ -0,0 +1,34 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#pragma once + +#pragma OPENCL EXTENSION cl_khr_fp16 : enable + +inline half4 activation(half4 in +#ifdef PRELU + , + half4 prelu_alpha +#endif +) { + half4 output; +#ifdef PRELU + output = select(prelu_alpha * in, in, in >= (half4)0.0); +#endif + +#ifdef RELU + output = fmax(in, (half4)(0.0f)); +#endif + return output; +} diff --git a/src/operators/kernel/cl/cl_kernel/conv_add_bn_relu_kernel.cl b/src/operators/kernel/cl/cl_kernel/conv_add_bn_relu_kernel.cl new file mode 100644 index 0000000000000000000000000000000000000000..aa3eaedda5634294f231831d550296dfdba0dd48 --- /dev/null +++ b/src/operators/kernel/cl/cl_kernel/conv_add_bn_relu_kernel.cl @@ -0,0 +1,19 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#define BIASE +#define BATCH_NORM +#define RELU + +#include "conv_kernel.inc.cl" diff --git a/src/operators/kernel/cl/cl_kernel/conv_add_kernel.cl b/src/operators/kernel/cl/cl_kernel/conv_add_kernel.cl new file mode 100644 index 0000000000000000000000000000000000000000..b8bf7e7d7d9fbb9eb9e930e9c1c3a58bb3391efc --- /dev/null +++ b/src/operators/kernel/cl/cl_kernel/conv_add_kernel.cl @@ -0,0 +1,17 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#define BIASE + +#include "conv_kernel.inc.cl" diff --git a/src/operators/kernel/cl/cl_kernel/conv_add_relu_kernel.cl b/src/operators/kernel/cl/cl_kernel/conv_add_relu_kernel.cl new file mode 100644 index 0000000000000000000000000000000000000000..8d686c20dfaa31204a4c44105fb479423352fb9e --- /dev/null +++ b/src/operators/kernel/cl/cl_kernel/conv_add_relu_kernel.cl @@ -0,0 +1,17 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#define BIASE +#define RELU +#include "conv_kernel.inc.cl" diff --git a/src/operators/kernel/cl/cl_kernel/conv_kernel.cl b/src/operators/kernel/cl/cl_kernel/conv_kernel.cl new file mode 100644 index 0000000000000000000000000000000000000000..2a5c823295c7562361433414cf35be81d2fbf00c --- /dev/null +++ b/src/operators/kernel/cl/cl_kernel/conv_kernel.cl @@ -0,0 +1,15 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "conv_kernel.inc.cl" diff --git a/src/operators/kernel/cl/cl_kernel/conv_kernel.inc.cl b/src/operators/kernel/cl/cl_kernel/conv_kernel.inc.cl new file mode 100644 index 0000000000000000000000000000000000000000..63e6e62345c8034ef914b4c385e6fd976b267c4c --- /dev/null +++ b/src/operators/kernel/cl/cl_kernel/conv_kernel.inc.cl @@ -0,0 +1,701 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +/* +conv +conv_bn +conv_add +conv_relu +conv_bn_relu +conv_add_relu +conv_add_bn_relu +*/ + +#include "cl_common.h" + +__kernel void conv_3x3(__private const int global_size_dim0, + __private const int global_size_dim1, + __private const int global_size_dim2, + __read_only image2d_t input_image, + __read_only image2d_t filter, + +#ifdef BIASE + __read_only image2d_t bias, +#endif + +#ifdef BATCH_NORM + __read_only image2d_t new_scale, + __read_only image2d_t new_biase, +#endif + + __write_only image2d_t output_image, + __private const int stride, + __private const int offset, + __private const int input_c, + __private const int dilation, + __private const int input_width,/* of one block */ + __private const int input_height,/* of one block */ + __private const int output_width, + __private const int output_height) { + + const int out_c = get_global_id(0); + const int out_w = get_global_id(1); + const int out_nh = get_global_id(2); + + if (out_c >= global_size_dim0 || + out_w >= global_size_dim1 || + out_nh >= global_size_dim2) { + return; + } + + + int2 stride_xy; + stride_xy.x = stride; + stride_xy.y = stride; + + int2 ouput_pos_in_one_block; + ouput_pos_in_one_block.x = out_w; + ouput_pos_in_one_block.y = out_nh; + + + const sampler_t sampler = CLK_NORMALIZED_COORDS_TRUE | + CLK_ADDRESS_CLAMP | + CLK_FILTER_NEAREST; + + int2 in_pos_in_one_block; + in_pos_in_one_block.x = ouput_pos_in_one_block.x * stride + offset; + in_pos_in_one_block.y = ouput_pos_in_one_block.y * stride + offset; + +#ifdef BIASE + half4 output = read_imageh(bias, sampler, (int2)(out_c, 0)); +#else + half4 output = 0.0f; +#endif + + half4 input[9]; + + for (int i = 0; i < input_c; ++i) { + int2 pos_in = (int2)(i * input_width + in_pos_in_one_block.x, in_pos_in_one_block.y); + input[0] = select(read_imageh(input_image, sampler, + (int2)(pos_in.x - dilation, pos_in.y - dilation)), + (half4)(0.0f), + (ushort4)((in_pos_in_one_block.x - dilation < 0 || in_pos_in_one_block.y - dilation < 0 || 
in_pos_in_one_block.x - dilation >= input_width || in_pos_in_one_block.y - dilation >= input_height) << 15)); + + input[1] = select(read_imageh(input_image, sampler, + (int2)(pos_in.x, pos_in.y - dilation)), + (half4)(0.0f), + (ushort4)((in_pos_in_one_block.x < 0 || in_pos_in_one_block.y - dilation < 0 || in_pos_in_one_block.x >= input_width || in_pos_in_one_block.y - dilation >= input_height) << 15)); + + input[2] = select(read_imageh(input_image, sampler, + (int2)(pos_in.x + dilation, pos_in.y - dilation)), + (half4)(0.0f), + (ushort4)((in_pos_in_one_block.x + dilation < 0 || in_pos_in_one_block.y - dilation < 0 || in_pos_in_one_block.x + dilation >= input_width || in_pos_in_one_block.y - dilation >= input_height) << 15)); + + input[3] = select(read_imageh(input_image, sampler, + (int2)(pos_in.x - dilation, pos_in.y)), + (half4)(0.0f), + (ushort4)((in_pos_in_one_block.x - dilation < 0 || in_pos_in_one_block.y < 0 || in_pos_in_one_block.x - dilation >= input_width || in_pos_in_one_block.y >= input_height) << 15)); + + input[4] = select(read_imageh(input_image, sampler, + (int2)(pos_in.x, pos_in.y)), + (half4)(0.0f), + (ushort4)((in_pos_in_one_block.x < 0 || in_pos_in_one_block.y < 0 || in_pos_in_one_block.x >= input_width || in_pos_in_one_block.y >= input_height) << 15)); + + input[5] = select(read_imageh(input_image, sampler, + (int2)(pos_in.x + dilation, pos_in.y)), + (half4)(0.0f), + (ushort4)((in_pos_in_one_block.x + dilation < 0 || in_pos_in_one_block.y < 0 || in_pos_in_one_block.x + dilation >= input_width || in_pos_in_one_block.y >= input_height) << 15)); + + input[6] = select(read_imageh(input_image, sampler, + (int2)(pos_in.x - dilation, pos_in.y + dilation)), + (half4)(0.0f), + (ushort4)((in_pos_in_one_block.x - dilation < 0 || in_pos_in_one_block.y + dilation < 0 || in_pos_in_one_block.x - dilation >= input_width || in_pos_in_one_block.y + dilation >= input_height) << 15)); + + input[7] = select(read_imageh(input_image, sampler, + (int2)(pos_in.x, 
pos_in.y + dilation)), + (half4)(0.0f), + (ushort4)((in_pos_in_one_block.x < 0 || in_pos_in_one_block.y + dilation < 0 || in_pos_in_one_block.x >= input_width || in_pos_in_one_block.y + dilation >= input_height) << 15)); + + input[8] = select(read_imageh(input_image, sampler, + (int2)(pos_in.x + dilation, pos_in.y + dilation)), + (half4)(0.0f), + (ushort4)((in_pos_in_one_block.x + dilation < 0 || in_pos_in_one_block.y + dilation < 0 || in_pos_in_one_block.x + dilation >= input_width || in_pos_in_one_block.y + dilation >= input_height) << 15)); + + +/* + for (int j = 0; j < 9; ++j) { + int2 pos_of_weight; + pos_of_weight.x = i * 3 + j % 3; + pos_of_weight.y = out_c * 4 * 3 + 0 * 3 + j / 3; + float4 weight_x = read_imagef(filter, sampler, pos_of_weight); + output.x += dot(input[j], weight_x); + + pos_of_weight.y = out_c * 4 * 3 + 1 * 3 + j / 3; + float4 weight_y = read_imagef(filter, sampler, pos_of_weight); + output.y += dot(input[j], weight_y); + + pos_of_weight.y = out_c * 4 * 3 + 2 * 3 + j / 3; + float4 weight_z = read_imagef(filter, sampler, pos_of_weight); + output.z += dot(input[j], weight_z); + + pos_of_weight.y = out_c * 4 * 3 + 3 * 3 + j / 3; + float4 weight_w = read_imagef(filter, sampler, pos_of_weight); + output.w += dot(input[j], weight_w); + } +*/ + int j = 0; + int2 pos_of_weight; + pos_of_weight.x = i * 3 + j % 3; + pos_of_weight.y = out_c * 4 * 3 + 0 * 3 + j / 3; + half4 weight_x = read_imageh(filter, sampler, pos_of_weight); + output.x += dot(input[j], weight_x); + + pos_of_weight.y = out_c * 4 * 3 + 1 * 3 + j / 3; + half4 weight_y = read_imageh(filter, sampler, pos_of_weight); + output.y += dot(input[j], weight_y); + + pos_of_weight.y = out_c * 4 * 3 + 2 * 3 + j / 3; + half4 weight_z = read_imageh(filter, sampler, pos_of_weight); + output.z += dot(input[j], weight_z); + + pos_of_weight.y = out_c * 4 * 3 + 3 * 3 + j / 3; + half4 weight_w = read_imageh(filter, sampler, pos_of_weight); + output.w += dot(input[j], weight_w); + + j = 1; + 
pos_of_weight.x = i * 3 + j % 3; + pos_of_weight.y = out_c * 4 * 3 + 0 * 3 + j / 3; + weight_x = read_imageh(filter, sampler, pos_of_weight); + output.x += dot(input[j], weight_x); + + pos_of_weight.y = out_c * 4 * 3 + 1 * 3 + j / 3; + weight_y = read_imageh(filter, sampler, pos_of_weight); + output.y += dot(input[j], weight_y); + + pos_of_weight.y = out_c * 4 * 3 + 2 * 3 + j / 3; + weight_z = read_imageh(filter, sampler, pos_of_weight); + output.z += dot(input[j], weight_z); + + pos_of_weight.y = out_c * 4 * 3 + 3 * 3 + j / 3; + weight_w = read_imageh(filter, sampler, pos_of_weight); + output.w += dot(input[j], weight_w); + + j = 2; + pos_of_weight.x = i * 3 + j % 3; + pos_of_weight.y = out_c * 4 * 3 + 0 * 3 + j / 3; + weight_x = read_imageh(filter, sampler, pos_of_weight); + output.x += dot(input[j], weight_x); + + pos_of_weight.y = out_c * 4 * 3 + 1 * 3 + j / 3; + weight_y = read_imageh(filter, sampler, pos_of_weight); + output.y += dot(input[j], weight_y); + + pos_of_weight.y = out_c * 4 * 3 + 2 * 3 + j / 3; + weight_z = read_imageh(filter, sampler, pos_of_weight); + output.z += dot(input[j], weight_z); + + pos_of_weight.y = out_c * 4 * 3 + 3 * 3 + j / 3; + weight_w = read_imageh(filter, sampler, pos_of_weight); + output.w += dot(input[j], weight_w); + + j = 3; + pos_of_weight.x = i * 3 + j % 3; + pos_of_weight.y = out_c * 4 * 3 + 0 * 3 + j / 3; + weight_x = read_imageh(filter, sampler, pos_of_weight); + output.x += dot(input[j], weight_x); + + pos_of_weight.y = out_c * 4 * 3 + 1 * 3 + j / 3; + weight_y = read_imageh(filter, sampler, pos_of_weight); + output.y += dot(input[j], weight_y); + + pos_of_weight.y = out_c * 4 * 3 + 2 * 3 + j / 3; + weight_z = read_imageh(filter, sampler, pos_of_weight); + output.z += dot(input[j], weight_z); + + pos_of_weight.y = out_c * 4 * 3 + 3 * 3 + j / 3; + weight_w = read_imageh(filter, sampler, pos_of_weight); + output.w += dot(input[j], weight_w); + + j = 4; + pos_of_weight.x = i * 3 + j % 3; + pos_of_weight.y = out_c * 4 * 3 
+ 0 * 3 + j / 3; + weight_x = read_imageh(filter, sampler, pos_of_weight); + output.x += dot(input[j], weight_x); + + pos_of_weight.y = out_c * 4 * 3 + 1 * 3 + j / 3; + weight_y = read_imageh(filter, sampler, pos_of_weight); + output.y += dot(input[j], weight_y); + + pos_of_weight.y = out_c * 4 * 3 + 2 * 3 + j / 3; + weight_z = read_imageh(filter, sampler, pos_of_weight); + output.z += dot(input[j], weight_z); + + pos_of_weight.y = out_c * 4 * 3 + 3 * 3 + j / 3; + weight_w = read_imageh(filter, sampler, pos_of_weight); + output.w += dot(input[j], weight_w); + + j = 5; + pos_of_weight.x = i * 3 + j % 3; + pos_of_weight.y = out_c * 4 * 3 + 0 * 3 + j / 3; + weight_x = read_imageh(filter, sampler, pos_of_weight); + output.x += dot(input[j], weight_x); + + pos_of_weight.y = out_c * 4 * 3 + 1 * 3 + j / 3; + weight_y = read_imageh(filter, sampler, pos_of_weight); + output.y += dot(input[j], weight_y); + + pos_of_weight.y = out_c * 4 * 3 + 2 * 3 + j / 3; + weight_z = read_imageh(filter, sampler, pos_of_weight); + output.z += dot(input[j], weight_z); + + pos_of_weight.y = out_c * 4 * 3 + 3 * 3 + j / 3; + weight_w = read_imageh(filter, sampler, pos_of_weight); + output.w += dot(input[j], weight_w); + + j = 6; + pos_of_weight.x = i * 3 + j % 3; + pos_of_weight.y = out_c * 4 * 3 + 0 * 3 + j / 3; + weight_x = read_imageh(filter, sampler, pos_of_weight); + output.x += dot(input[j], weight_x); + + pos_of_weight.y = out_c * 4 * 3 + 1 * 3 + j / 3; + weight_y = read_imageh(filter, sampler, pos_of_weight); + output.y += dot(input[j], weight_y); + + pos_of_weight.y = out_c * 4 * 3 + 2 * 3 + j / 3; + weight_z = read_imageh(filter, sampler, pos_of_weight); + output.z += dot(input[j], weight_z); + + pos_of_weight.y = out_c * 4 * 3 + 3 * 3 + j / 3; + weight_w = read_imageh(filter, sampler, pos_of_weight); + output.w += dot(input[j], weight_w); + + j = 7; + pos_of_weight.x = i * 3 + j % 3; + pos_of_weight.y = out_c * 4 * 3 + 0 * 3 + j / 3; + weight_x = read_imageh(filter, sampler, 
// Depthwise 3x3 convolution over half-precision image tiles.
// One work-item produces one half4 output pixel: out_c selects the channel
// block, (out_w, out_nh) the spatial position (out_nh folds batch * height).
// Optional bias / batch-norm / relu stages are compiled in via the
// BIASE / BATCH_NORM / RELU macros.
__kernel void depth_conv_3x3(__private const int global_size_dim0,
                             __private const int global_size_dim1,
                             __private const int global_size_dim2,
                             __read_only image2d_t input,
                             __read_only image2d_t filter,
#ifdef BIASE
                             __read_only image2d_t bias,
#endif
#ifdef BATCH_NORM
                             __read_only image2d_t new_scale,
                             __read_only image2d_t new_biase,
#endif
                             __write_only image2d_t output_image,
                             __private const int stride,
                             __private const int offset,
                             __private const int input_c,
                             __private const int dilation,
                             __private const int input_width,  /* of one block */
                             __private const int input_height, /* of one block */
                             __private const int output_width,
                             __private const int output_height) {
  const int out_c = get_global_id(0);
  const int out_w = get_global_id(1);
  const int out_nh = get_global_id(2);

  const int2 output_pos = (int2)(out_c * global_size_dim1 + out_w, out_nh);

  const sampler_t sampler =
      CLK_NORMALIZED_COORDS_TRUE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;

  // Split the folded n*h coordinate back into batch index and row-in-batch.
  const int batch_index = out_nh / output_height;
  const int out_nh_in_one_batch = out_nh % output_height;

  const int2 stride_xy = (int2)(stride, stride);
  const int2 ouput_pos_in_one_block = (int2)(out_w, out_nh_in_one_batch);
  const int2 in_pos_in_one_block =
      ouput_pos_in_one_block * stride_xy + (int2)(offset, offset);

#ifdef BIASE
  half4 output = read_imageh(bias, sampler, (int2)(out_c, 0));
#else
  half4 output = 0.0f;
#endif

  const int filter_width = 3;
  const int filter_height = 3;

  const int2 pos_in_input_block =
      (int2)(out_c * input_width, batch_index * input_height);
  const int2 pos_in_filter_block =
      (int2)(out_c * filter_width, batch_index * filter_height);
  const int filter_x = pos_in_filter_block.x;
  const int filter_y = pos_in_filter_block.y;

  // Gather the 3x3 neighbourhood; taps outside the current block are zeroed.
  // The (cond << 15) trick builds an all-lanes select mask by setting the
  // sign (MSB) bit of every ushort lane when the tap is out of bounds.
  half4 inputs[9];
  for (int dy = -1; dy <= 1; ++dy) {
    for (int dx = -1; dx <= 1; ++dx) {
      const int ix = in_pos_in_one_block.x + dx;
      const int iy = in_pos_in_one_block.y + dy;
      inputs[(dy + 1) * 3 + (dx + 1)] = select(
          read_imageh(input, sampler,
                      (int2)(pos_in_input_block.x + ix,
                             pos_in_input_block.y + iy)),
          (half4)(0.0f),
          (ushort4)((ix < 0 || iy < 0 || ix >= input_width ||
                     iy >= input_height)
                    << 15));
    }
  }

  // The nine filter taps for this channel block, row-major.
  half4 filters[9];
  for (int k = 0; k < 9; ++k) {
    filters[k] = read_imageh(filter, sampler,
                             (int2)(filter_x + k % 3, filter_y + k / 3));
  }

  for (int i = 0; i < 9; ++i) {
    output += inputs[i] * filters[i];
  }

#ifdef BATCH_NORM
  output = output * read_imageh(new_scale, sampler, (int2)(out_c, 0)) +
           read_imageh(new_biase, sampler, (int2)(out_c, 0));
#endif

#ifdef RELU
  output = activation(output);
#endif

  write_imageh(output_image, output_pos, output);
}

// 1x1 convolution: each output half4 is a weighted sum over all input
// channel blocks sampled at the (strided, offset) source position.
// Four filter rows per input block expand one input half4 into the four
// output lanes via mad() accumulation.
__kernel void conv_1x1(__private const int global_size_dim0,
                       __private const int global_size_dim1,
                       __private const int global_size_dim2,
                       __read_only image2d_t input_image,
                       __read_only image2d_t filter,
#ifdef BIASE
                       __read_only image2d_t bias,
#endif
#ifdef BATCH_NORM
                       __read_only image2d_t new_scale,
                       __read_only image2d_t new_biase,
#endif
                       __write_only image2d_t output_image,
                       __private const int stride,
                       __private const int offset,
                       __private const int input_c,
                       __private const int dilation,
                       __private const int input_width,  /* of one block */
                       __private const int input_height, /* of one block */
                       __private const int output_width,
                       __private const int output_height) {
  const int out_c = get_global_id(0);
  const int out_w = get_global_id(1);
  const int out_nh = get_global_id(2);

  const sampler_t sampler =
      CLK_NORMALIZED_COORDS_TRUE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;

  const uint kernelHXW = 1;
  const int2 stride_xy = (int2)(stride, stride);
  const int2 ouput_pos_in_one_block = (int2)(out_w, out_nh);
  const int2 in_pos_in_one_block =
      ouput_pos_in_one_block * stride_xy + (int2)(offset, offset);

#ifdef BIASE
  half4 output = read_imageh(bias, sampler, (int2)(out_c, 0));
#else
  half4 output = 0.0f;
#endif

  for (int i = 0; i < input_c; ++i) {
    // Source pixel for input channel block i.
    const int2 pos_in =
        (int2)(i * input_width + in_pos_in_one_block.x, in_pos_in_one_block.y);
    const half4 input = read_imageh(input_image, sampler, pos_in);

    const half4 weight0 = read_imageh(filter, sampler, (int2)(out_c, i * 4 + 0));
    const half4 weight1 = read_imageh(filter, sampler, (int2)(out_c, i * 4 + 1));
    const half4 weight2 = read_imageh(filter, sampler, (int2)(out_c, i * 4 + 2));
    const half4 weight3 = read_imageh(filter, sampler, (int2)(out_c, i * 4 + 3));

    output = mad(input.x, weight0, output);
    output = mad(input.y, weight1, output);
    output = mad(input.z, weight2, output);
    output = mad(input.w, weight3, output);
  }

#ifdef BATCH_NORM
  output = output * read_imageh(new_scale, sampler, (int2)(out_c, 0)) +
           read_imageh(new_biase, sampler, (int2)(out_c, 0));
#endif

#ifdef RELU
  output = activation(output);
#endif

  const int2 output_pos = (int2)(out_c * global_size_dim1 + out_w, out_nh);
  write_imageh(output_image, output_pos, output);
}
half4 output3 = read_imageh(bias, sampler, (int2)(out_c + 3, 0)); +#else + half4 output0 = 0.0f; + half4 output1 = 0.0f; + half4 output2 = 0.0f; + half4 output3 = 0.0f; +#endif + + for (int i = 0; i < input_c; ++i) { + int2 pos_in = (int2)(i * input_width + in_pos_in_one_block.x, in_pos_in_one_block.y); + half4 input = read_imageh(input_image, sampler, pos_in); + + half4 weight0_0 = read_imageh(filter, sampler, (int2)(out_c, i * 4 + 0)); + half4 weight0_1 = read_imageh(filter, sampler, (int2)(out_c, i * 4 + 1)); + half4 weight0_2 = read_imageh(filter, sampler, (int2)(out_c, i * 4 + 2)); + half4 weight0_3 = read_imageh(filter, sampler, (int2)(out_c, i * 4 + 3)); + + output0 = mad(input.x, weight0_0, output0); + output0 = mad(input.y, weight0_1, output0); + output0 = mad(input.z, weight0_2, output0); + output0 = mad(input.w, weight0_3, output0); + + half4 weight1_0 = read_imageh(filter, sampler, (int2)(out_c + 1, i * 4 + 0)); + half4 weight1_1 = read_imageh(filter, sampler, (int2)(out_c + 1, i * 4 + 1)); + half4 weight1_2 = read_imageh(filter, sampler, (int2)(out_c + 1, i * 4 + 2)); + half4 weight1_3 = read_imageh(filter, sampler, (int2)(out_c + 1, i * 4 + 3)); + + output1 = mad(input.x, weight1_0, output1); + output1 = mad(input.y, weight1_1, output1); + output1 = mad(input.z, weight1_2, output1); + output1 = mad(input.w, weight1_3, output1); + + half4 weight2_0 = read_imageh(filter, sampler, (int2)(out_c + 2, i * 4 + 0)); + half4 weight2_1 = read_imageh(filter, sampler, (int2)(out_c + 2, i * 4 + 1)); + half4 weight2_2 = read_imageh(filter, sampler, (int2)(out_c + 2, i * 4 + 2)); + half4 weight2_3 = read_imageh(filter, sampler, (int2)(out_c + 2, i * 4 + 3)); + + output2 = mad(input.x, weight2_0, output2); + output2 = mad(input.y, weight2_1, output2); + output2 = mad(input.z, weight2_2, output2); + output2 = mad(input.w, weight2_3, output2); + + half4 weight3_0 = read_imageh(filter, sampler, (int2)(out_c + 3, i * 4 + 0)); + half4 weight3_1 = read_imageh(filter, 
sampler, (int2)(out_c + 3, i * 4 + 1)); + half4 weight3_2 = read_imageh(filter, sampler, (int2)(out_c + 3, i * 4 + 2)); + half4 weight3_3 = read_imageh(filter, sampler, (int2)(out_c + 3, i * 4 + 3)); + + output3 = mad(input.x, weight3_0, output3); + output3 = mad(input.y, weight3_1, output3); + output3 = mad(input.z, weight3_2, output3); + output3 = mad(input.w, weight3_3, output3); + + } + +#ifdef BATCH_NORM + output0 = output0 * read_imageh(new_scale, sampler, (int2)(out_c + 0, 0)) + read_imageh(new_biase, sampler, (int2)(out_c + 0, 0)); + + output1 = output1 * read_imageh(new_scale, sampler, (int2)(out_c + 1, 0)) + read_imageh(new_biase, sampler, (int2)(out_c + 1, 0)); + + output2 = output2 * read_imageh(new_scale, sampler, (int2)(out_c + 2, 0)) + read_imageh(new_biase, sampler, (int2)(out_c + 2, 0)); + + output3 = output3 * read_imageh(new_scale, sampler, (int2)(out_c + 3, 0)) + read_imageh(new_biase, sampler, (int2)(out_c + 3, 0)); + +#endif + +#ifdef RELU + output0 = activation(output0); + output1 = activation(output1); + output2 = activation(output2); + output3 = activation(output3); +#endif + + int2 output_pos0 = (int2)(out_c * global_size_dim1 + out_w, out_nh); + write_imageh(output_image, output_pos0, output0); + + + int2 output_pos1 = (int2)((out_c + 1) * global_size_dim1 + out_w, out_nh); + write_imageh(output_image, output_pos1, output1); + + + int2 output_pos2 = (int2)((out_c + 2) * global_size_dim1 + out_w, out_nh); + write_imageh(output_image, output_pos2, output2); + + + int2 output_pos3 = (int2)((out_c + 3) * global_size_dim1 + out_w, out_nh); + write_imageh(output_image, output_pos3, output3); +} + +*/ + + + + + + + + diff --git a/src/operators/kernel/cl/cl_kernel/depthwise_conv_add_bn_relu_kernel.cl b/src/operators/kernel/cl/cl_kernel/depthwise_conv_add_bn_relu_kernel.cl new file mode 100644 index 0000000000000000000000000000000000000000..3c3497f917d8a16c7c7e304edf00a4250066dce7 --- /dev/null +++ 
b/src/operators/kernel/cl/cl_kernel/depthwise_conv_add_bn_relu_kernel.cl @@ -0,0 +1,18 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#define BIASE +#define BATCH_NORM +#define RELU +#include "conv_kernel.inc.cl" diff --git a/src/operators/kernel/cl/cl_kernel/depthwise_conv_kernel.cl b/src/operators/kernel/cl/cl_kernel/depthwise_conv_kernel.cl new file mode 100644 index 0000000000000000000000000000000000000000..2a5c823295c7562361433414cf35be81d2fbf00c --- /dev/null +++ b/src/operators/kernel/cl/cl_kernel/depthwise_conv_kernel.cl @@ -0,0 +1,15 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
#pragma OPENCL EXTENSION cl_khr_fp16 : enable

// Element-wise addition of two half-precision images:
//   out(x, y) = input(x, y) + bias(x, y), one work-item per pixel.
// Fix: image kernel parameters must be declared with an image access
// qualifier (__read_only / __write_only), not the __global address space;
// "__global image2d_t" is not valid OpenCL C and fails strict compilers.
__kernel void elementwise_add(__read_only image2d_t input,
                              __read_only image2d_t bias,
                              __write_only image2d_t outputImage) {
  const int x = get_global_id(0);
  const int y = get_global_id(1);

  const sampler_t sampler =
      CLK_NORMALIZED_COORDS_TRUE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;

  const int2 coords = (int2)(x, y);
  const half4 in = read_imageh(input, sampler, coords);
  const half4 biase = read_imageh(bias, sampler, coords);
  write_imageh(outputImage, coords, in + biase);
}
#pragma OPENCL EXTENSION cl_khr_fp16 : enable

// Packs a planar (channel-major) float buffer into a half4 image:
// channel k of element (i, j) lands in lane k of pixel (j, i); lanes for
// missing channels are zero-filled.
// NOTE(review): only the first three channels are read and .w is always 0 —
// presumably inputs have c <= 3 (e.g. RGB); confirm behaviour for c >= 4.
__kernel void feed(__global float *in, __write_only image2d_t outputImage,
                   int h, int w, int c) {
  const int i = get_global_id(0);  // row
  const int j = get_global_id(1);  // column
  const int plane = h * w;         // elements per channel plane
  const int idx = i * w + j;       // offset within a plane

  half4 pixel;
  pixel.x = convert_half(in[idx]);
  pixel.y = (c >= 2) ? convert_half(in[plane + idx]) : (half)(0.0f);
  pixel.z = (c >= 3) ? convert_half(in[2 * plane + idx]) : (half)(0.0f);
  pixel.w = 0.0;

  write_imageh(outputImage, (int2)(j, i), pixel);
}
#pragma OPENCL EXTENSION cl_khr_fp16 : enable

// Unpacks a half4 image back into a planar float buffer.
// Each work-item writes the four channel lanes of one pixel; lane k lands
// size_ch elements apart in the flat output (channel-major layout).
__kernel void fetch(__private const int in_height,
                    __private const int in_width,
                    __read_only image2d_t input,
                    __global float *out,
                    __private const int size_ch,
                    __private const int size_block,
                    __private const int size_batch) {
  const int in_c = get_global_id(0);
  const int in_w = get_global_id(1);
  const int in_nh = get_global_id(2);
  const int in_n = in_nh / in_height;   // batch index
  const int in_h = in_nh % in_height;   // row within batch

  const sampler_t sampler =
      CLK_NORMALIZED_COORDS_TRUE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;

  const half4 pixel =
      read_imageh(input, sampler, (int2)(mad24(in_c, in_width, in_w), in_nh));

  const int base =
      in_n * size_batch + in_c * size_block + in_h * in_width + in_w;
  out[base] = convert_float(pixel.x);
  out[base + size_ch] = convert_float(pixel.y);
  out[base + size_ch * 2] = convert_float(pixel.z);
  out[base + size_ch * 3] = convert_float(pixel.w);
}

// 2-D variant: pixels are copied out as four consecutive floats each
// (lane order x, y, z, w).
__kernel void fetch_2d(__private const int in_height,
                       __private const int in_width,
                       __read_only image2d_t input,
                       __global float *out) {
  const int in_w = get_global_id(1);
  const int in_h = get_global_id(2);

  const sampler_t sampler =
      CLK_NORMALIZED_COORDS_TRUE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;

  const half4 pixel = read_imageh(input, sampler, (int2)(in_w, in_h));

  const int base = (in_h * in_width + in_w) * 4;
  out[base] = convert_float(pixel.x);
  out[base + 1] = convert_float(pixel.y);
  out[base + 2] = convert_float(pixel.z);
  out[base + 3] = convert_float(pixel.w);
}
#pragma OPENCL EXTENSION cl_khr_fp16 : enable
#define MIN_VALUE -FLT_MAX

// Max pooling over a (possibly padded) window.
// Fix: the window end must be derived from the *unclamped* start
// (out * stride - pad). The previous code clamped the start to 0 first and
// then added ksize, which shifted the window right/down by up to `pad`
// elements whenever padding clipped its beginning (e.g. out=0, pad=1,
// ksize=3 pooled rows 0..2 instead of the intended 0..1).
__kernel void pool_max(
    __private const int in_height, __private const int in_width,
    __private const int out_height, __private const int out_width,
    __private const int pad_top, __private const int pad_left,
    __private const int stride_h, __private const int stride_w,
    __private const int ksize_h, __private const int ksize_w,
    __read_only image2d_t input, __write_only image2d_t output) {
  const int out_c = get_global_id(0);
  const int out_w = get_global_id(1);
  const int out_nh = get_global_id(2);
  const int out_n = out_nh / out_height;   // batch index
  const int out_h = out_nh % out_height;   // row within batch

  const sampler_t sampler =
      CLK_NORMALIZED_COORDS_TRUE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;

  // Unclamped window origin, then intersect the window with the input.
  const int raw_start_h = out_h * stride_h - pad_top;
  const int raw_start_w = out_w * stride_w - pad_left;
  const int start_h = max(raw_start_h, 0);
  const int end_h = min(raw_start_h + ksize_h, in_height);
  const int start_w = max(raw_start_w, 0);
  const int end_w = min(raw_start_w + ksize_w, in_width);

  const int pos_in_x = out_c * in_width;
  const int pos_in_y = out_n * in_height;
  half4 max_value = (half4)(MIN_VALUE);
  for (int y = start_h; y < end_h; ++y) {
    for (int x = start_w; x < end_w; ++x) {
      max_value = max(max_value,
                      read_imageh(input, sampler,
                                  (int2)(pos_in_x + x, pos_in_y + y)));
    }
  }

  write_imageh(output, (int2)(mad24(out_c, out_width, out_w), out_nh),
               max_value);
}

// Average pooling; padding is excluded from the divisor (num counts only
// in-bounds elements). Same window-origin fix as pool_max above.
__kernel void pool_avg(
    __private const int in_height, __private const int in_width,
    __private const int out_height, __private const int out_width,
    __private const int pad_top, __private const int pad_left,
    __private const int stride_h, __private const int stride_w,
    __private const int ksize_h, __private const int ksize_w,
    __read_only image2d_t input, __write_only image2d_t output) {
  const int out_c = get_global_id(0);
  const int out_w = get_global_id(1);
  const int out_nh = get_global_id(2);
  const int out_n = out_nh / out_height;
  const int out_h = out_nh % out_height;

  const sampler_t sampler =
      CLK_NORMALIZED_COORDS_TRUE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;

  const int raw_start_h = out_h * stride_h - pad_top;
  const int raw_start_w = out_w * stride_w - pad_left;
  const int start_h = max(raw_start_h, 0);
  const int end_h = min(raw_start_h + ksize_h, in_height);
  const int start_w = max(raw_start_w, 0);
  const int end_w = min(raw_start_w + ksize_w, in_width);

  const int pos_in_x = out_c * in_width;
  const int pos_in_y = out_n * in_height;
  half4 sum = (half4)(0.0f);
  int num = 0;
  for (int y = start_h; y < end_h; ++y) {
    for (int x = start_w; x < end_w; ++x) {
      sum += read_imageh(input, sampler, (int2)(pos_in_x + x, pos_in_y + y));
      num++;
    }
  }
  half4 avg = sum / num;
  write_imageh(output, (int2)(mad24(out_c, out_width, out_w), out_nh), avg);
}
#pragma OPENCL EXTENSION cl_khr_fp16 : enable

// relu: out(x, y) = max(0, in(x, y)), one work-item per pixel.
__kernel void relu(__read_only image2d_t input,
                   __write_only image2d_t output) {
  const int x = get_global_id(0);
  const int y = get_global_id(1);

  const sampler_t sampler =
      CLK_NORMALIZED_COORDS_TRUE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;

  const half4 v = read_imageh(input, sampler, (int2)(x, y));
  write_imageh(output, (int2)(x, y),
               max((half4)(0.0f, 0.0f, 0.0f, 0.0f), v));
}

// relu_p0: identical clamp-at-zero variant, kept as a separate entry point
// for kernel-name dispatch.
__kernel void relu_p0(__read_only image2d_t input,
                      __write_only image2d_t output) {
  const int x = get_global_id(0);
  const int y = get_global_id(1);

  const sampler_t sampler =
      CLK_NORMALIZED_COORDS_TRUE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;

  const half4 v = read_imageh(input, sampler, (int2)(x, y));
  write_imageh(output, (int2)(x, y),
               max((half4)(0.0f, 0.0f, 0.0f, 0.0f), v));
}

// relu_p1: pure copy — no clamping is applied.
// NOTE(review): looks like an intentional pass-through variant; confirm
// callers expect identity behaviour here.
__kernel void relu_p1(__read_only image2d_t input,
                      __write_only image2d_t output) {
  const int x = get_global_id(0);
  const int y = get_global_id(1);

  const sampler_t sampler =
      CLK_NORMALIZED_COORDS_TRUE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;

  const half4 v = read_imageh(input, sampler, (int2)(x, y));
  write_imageh(output, (int2)(x, y), v);
}
#pragma OPENCL EXTENSION cl_khr_fp16 : enable

// reshape: currently a straight pixel copy. The destination extents
// (d0..d3) and source extents (x0..x3) are accepted but unused.
// NOTE(review): a layout-permuting implementation exists only as
// commented-out code upstream — this copy is only correct when source and
// destination images share the same pixel layout; confirm callers.
__kernel void reshape(__read_only image2d_t input,
                      __write_only image2d_t output,
                      __private const int d0,
                      __private const int d1,
                      __private const int d2,
                      __private const int d3,
                      __private const int x0,
                      __private const int x1,
                      __private const int x2,
                      __private const int x3) {
  const int x = get_global_id(0);
  const int y = get_global_id(1);

  const sampler_t sampler =
      CLK_NORMALIZED_COORDS_TRUE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;

  write_imageh(output, (int2)(x, y),
               read_imageh(input, sampler, (int2)(x, y)));
}
int i0 = oindex; + int ix = (i1 / 4) * d3 + i3; + int iy = i0 * d2 + i2; + half4 p = read_imageh(input, sampler, (int2)(ix, iy)); + ((half*)&r)[i] = ((half*)&p)[i1%4]; + } + write_imageh(output, (int2)(x, y), r); +} + +*/ diff --git a/src/operators/kernel/cl/cl_kernel/softmax.cl b/src/operators/kernel/cl/cl_kernel/softmax.cl new file mode 100644 index 0000000000000000000000000000000000000000..215ec69fc283dcb2b538300cb5591b2b9e4b6a13 --- /dev/null +++ b/src/operators/kernel/cl/cl_kernel/softmax.cl @@ -0,0 +1,81 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#pragma OPENCL EXTENSION cl_khr_fp16 : enable + +__kernel void softmax(__read_only image2d_t input_image, + __write_only image2d_t output_image, + __private const int group + ) { + const int out_c = get_global_id(0); // block index + const int out_w = get_global_id(1); // index in one block + const int out_nh = get_global_id(2); + + + const sampler_t sampler = CLK_NORMALIZED_COORDS_TRUE | + CLK_ADDRESS_CLAMP | + CLK_FILTER_NEAREST; + + half maxv = 0.0f; + for (int i = 0; i < group; ++i) { + half4 temp = read_imageh(input_image, sampler, (int2)(i, 0)); + maxv = max(maxv, max(temp.x, max(temp.y, max(temp.z, temp.w)))); + } + + + half4 rsum = (half4)(0.0f); + for (int i = 0; i < group; ++i) { + half4 r = read_imageh(input_image, sampler, (int2)(i, 0)); + rsum += convert_half4(exp(convert_float4(r - maxv))); + } + + float sum = rsum.x + rsum.y + rsum.z + rsum.w; + + half4 rr = read_imageh(input_image, sampler, (int2)(out_w, out_nh)); + half4 result = convert_half4(exp(convert_float4(rr - maxv)) / sum); + write_imageh(output_image, (int2)(out_w, out_nh), result); +} + +/* + +__kernel void softmax(__read_only image2d_t input, + __write_only image2d_t output, + __private const int d0, + __private const int d1, + __private const int d2, + __private const int d3) { + const int z = get_global_id(0); + const int x = get_global_id(1); + const int y = get_global_id(2); + const sampler_t sampler = CLK_NORMALIZED_COORDS_TRUE | + CLK_ADDRESS_CLAMP | + CLK_FILTER_NEAREST; + half4 cv = read_imageh(input, sampler, (int2)(x, y)); + half4 maxv = cv; + for (int i = 0; i < d3; i++) { + half4 temp = read_imageh(input, sampler, (int2)(z * d3 + i, y)); + maxv = max(maxv, temp); + } + half4 sum = (half4)0.0f; + // half4 x = = (half4)0.0f; + for (int i = 0; i < d3; i++) { + half4 temp = read_imageh(input, sampler, (int2)(z * d3 + i, y)); + sum += exp(temp - maxv); + } + half4 r = exp(cv - maxv) / sum; + + write_imageh(output, (int2)(z * d3 + x, y), r); +} + +*/ diff --git 
a/src/operators/kernel/cl/conv_add_bn_relu_kernel.cpp b/src/operators/kernel/cl/conv_add_bn_relu_kernel.cpp new file mode 100644 index 0000000000000000000000000000000000000000..33172e4f0343f1bb26e34f6c7d3b009629b60430 --- /dev/null +++ b/src/operators/kernel/cl/conv_add_bn_relu_kernel.cpp @@ -0,0 +1,289 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#ifdef FUSION_CONVADDBNRELU_OP + +#include "operators/kernel/conv_add_bn_relu_kernel.h" +#include "framework/cl/cl_image.h" +#include "framework/cl/cl_tool.h" + +namespace paddle_mobile { +namespace operators { + +template <> +bool ConvAddBNReluKernel::Init( + FusionConvAddBNReluParam *param) { + PADDLE_MOBILE_ENFORCE( + param->Filter()->dims()[2] == param->Filter()->dims()[3] && + param->Paddings()[0] == param->Paddings()[1], + "need equal"); + + param->Bias()->InitCLImage(cl_helper_.CLContext(), + cl_helper_.CLCommandQueue()); + + // const CL *mean = param->InputMean(); + const framework::CLImage *mean = param->InputMean(); + const framework::CLImage *variance = param->InputVariance(); + const framework::CLImage *scale = param->InputScale(); + const framework::CLImage *bias = param->InputBias(); + const float epsilon = param->Epsilon(); + + const int C = mean->numel(); + + // for (int j = 0; j < C; ++j) { + // DLOG << " mean - " << j << mean->data()[j]; + // } + // + // for (int j = 0; j < C; ++j) { + // DLOG << " variance - " << j << variance->data()[j]; + // } + // 
+ // for (int j = 0; j < C; ++j) { + // DLOG << " scale - " << j << scale->data()[j]; + // } + // + // for (int j = 0; j < C; ++j) { + // DLOG << " bias - " << j << bias->data()[j]; + // } + + // + // DLOG << " climage mean: " << *mean; + // DLOG << " climage variance: " << *variance; + // DLOG << " climage scale: " << *scale; + // DLOG << " climage bias: " << *bias; + + auto mean_ptr = mean->data(); + auto variance_ptr = variance->data(); + auto scale_ptr = scale->data(); + auto bias_ptr = bias->data(); + + float inv_std_ptr[C]; + for (int i = 0; i < C; i++) { + inv_std_ptr[i] = + 1 / static_cast(pow((variance_ptr[i] + epsilon), 0.5)); + } + float *new_scale_ptr = new float[C]; + float *new_bias_ptr = new float[C]; + + for (int i = 0; i < C; i++) { + new_scale_ptr[i] = inv_std_ptr[i] * scale_ptr[i]; + new_bias_ptr[i] = bias_ptr[i] - mean_ptr[i] * inv_std_ptr[i] * scale_ptr[i]; + } + + framework::CLImage *new_scale = new framework::CLImage(); + + // for (int j = 0; j < C; ++j) { + // DLOG << " new scale - " << j << new_scale_ptr[j]; + // } + // + // for (int j = 0; j < C; ++j) { + // DLOG << " new bias - " << j << new_bias_ptr[j]; + // } + + new_scale->SetTensorData(new_scale_ptr, variance->dims()); + new_scale->InitCLImage(this->cl_helper_.CLContext(), + cl_helper_.CLCommandQueue()); + + // DLOG << " climage - y bias: " << *(param->Bias()); + // + // DLOG << " climage - new scale: " << *new_scale; + + framework::CLImage *new_bias = new framework::CLImage(); + + new_bias->SetTensorData(new_bias_ptr, variance->dims()); + new_bias->InitCLImage(this->cl_helper_.CLContext(), + cl_helper_.CLCommandQueue()); + + // DLOG << " climage - new bias: " << *new_bias; + // + // DLOG << " climage - filter: " << *(param->Filter()); + + param->SetNewScale(new_scale); + param->SetNewBias(new_bias); + + delete[](new_scale_ptr); + delete[](new_bias_ptr); + + PADDLE_MOBILE_ENFORCE( + param->Filter()->dims()[2] == param->Filter()->dims()[3] && + param->Paddings()[0] == 
param->Paddings()[1], + "need equal"); + + int offset = static_cast(param->Filter()->dims()[2]) / 2 - + static_cast(param->Paddings()[1]); + + param->SetOffset(offset); + + /* + if (param->Filter()->dims()[2] == 1 && + param->Filter()->dims()[3] == 1 && + (param->Filter()->dims()[0] % 16) == 0) { + param->Filter()->InitNImage(cl_helper_.CLContext(), + cl_helper_.CLCommandQueue()); + this->cl_helper_.AddKernel("conv_1x1_4", "conv_add_bn_relu_kernel.cl"); + DLOG << " conv add bn relu conv 1x1 4"; + } + */ + if (param->Filter()->dims()[2] == 1 && param->Filter()->dims()[3] == 1) { + param->Filter()->InitNImage(cl_helper_.CLContext(), + cl_helper_.CLCommandQueue()); + this->cl_helper_.AddKernel("conv_1x1", "conv_add_bn_relu_kernel.cl"); + DLOG << " conv add bn relu conv 1x1"; + } else if (param->Filter()->dims()[1] == 1 && + param->Input()->dims()[1] == param->Output()->dims()[1] && + param->Filter()->dims()[2] == 3) { + param->Filter()->InitDWImage(cl_helper_.CLContext(), + cl_helper_.CLCommandQueue()); + this->cl_helper_.AddKernel("depth_conv_3x3", "conv_add_bn_relu_kernel.cl"); + DLOG << " conv add bn relu depth_conv_3x3"; + + } else if (param->Filter()->dims()[2] == 3 && + param->Filter()->dims()[3] == 3) { + param->Filter()->InitCLImage(cl_helper_.CLContext(), + cl_helper_.CLCommandQueue()); + + this->cl_helper_.AddKernel("conv_3x3", "conv_add_bn_relu_kernel.cl"); + DLOG << " conv add bn relu conv_3x3"; + } else { + PADDLE_MOBILE_THROW_EXCEPTION(" not support "); + } + + return true; +} + +template <> +void ConvAddBNReluKernel::Compute( + const FusionConvAddBNReluParam ¶m) { + auto kernel = this->cl_helper_.KernelAt(0); + auto default_work_size = this->cl_helper_.DefaultWorkSize(*param.Output()); + int c_block = default_work_size[0]; + int w = default_work_size[1]; + int nh = default_work_size[2]; + auto input = param.Input()->GetCLImage(); + auto filter = param.Filter()->GetCLImage(); + auto biase = param.Bias()->GetCLImage(); + auto new_scale = 
param.NewScale()->GetCLImage(); + auto new_bias = param.NewBias()->GetCLImage(); + auto output = param.Output()->GetCLImage(); + int stride = param.Strides()[0]; + int offset = param.Offset(); + int input_c = reinterpret_cast( + param.Input()->Converter()) + ->GetCBlock(); + int dilation = param.Dilations()[0]; + int input_width = param.Input()->dims()[3]; + int input_height = param.Input()->dims()[2]; + int output_width = param.Output()->dims()[3]; + int output_height = param.Output()->dims()[2]; + + // DLOG << " c block " << c_block; + // DLOG << " w " << w; + // DLOG << " nh " << nh; + // DLOG << " stride " << stride; + // DLOG << " offset " << offset; + // DLOG << " input_c " << input_c; + // DLOG << " dilation " << dilation; + // DLOG << " input width " << input_width; + // DLOG << " input height " << input_height; + // DLOG << " output width " << output_width; + // DLOG << " output height " << output_height; + // DLOG << " input dim " << param.Input()->dims(); + // DLOG << " output dim " << param.Output()->dims(); + // DLOG << " filter dim " << param.Filter()->dims(); + + cl_int status; + + status = clSetKernelArg(kernel, 0, sizeof(int), &c_block); + CL_CHECK_ERRORS(status); + + status = clSetKernelArg(kernel, 1, sizeof(int), &w); + CL_CHECK_ERRORS(status); + + status = clSetKernelArg(kernel, 2, sizeof(int), &nh); + CL_CHECK_ERRORS(status); + + status = clSetKernelArg(kernel, 3, sizeof(cl_mem), &input); + CL_CHECK_ERRORS(status); + + status = clSetKernelArg(kernel, 4, sizeof(cl_mem), &filter); + CL_CHECK_ERRORS(status); + + status = clSetKernelArg(kernel, 5, sizeof(cl_mem), &biase); + CL_CHECK_ERRORS(status); + + status = clSetKernelArg(kernel, 6, sizeof(cl_mem), &new_scale); + CL_CHECK_ERRORS(status); + + status = clSetKernelArg(kernel, 7, sizeof(cl_mem), &new_bias); + CL_CHECK_ERRORS(status); + + status = clSetKernelArg(kernel, 8, sizeof(cl_mem), &output); + CL_CHECK_ERRORS(status); + + status = clSetKernelArg(kernel, 9, sizeof(int), &stride); + 
CL_CHECK_ERRORS(status); + + status = clSetKernelArg(kernel, 10, sizeof(int), &offset); + CL_CHECK_ERRORS(status); + + status = clSetKernelArg(kernel, 11, sizeof(int), &input_c); + CL_CHECK_ERRORS(status); + + status = clSetKernelArg(kernel, 12, sizeof(int), &dilation); + CL_CHECK_ERRORS(status); + + status = clSetKernelArg(kernel, 13, sizeof(int), &input_width); + CL_CHECK_ERRORS(status); + + status = clSetKernelArg(kernel, 14, sizeof(int), &input_height); + CL_CHECK_ERRORS(status); + + status = clSetKernelArg(kernel, 15, sizeof(int), &output_width); + CL_CHECK_ERRORS(status); + + status = clSetKernelArg(kernel, 16, sizeof(int), &output_height); + CL_CHECK_ERRORS(status); + + // cl_event out_event = param.Output()->GetClEvent(); + // cl_event wait_event = param.Input()->GetClEvent(); + + /* + if (param.Filter()->dims()[2] == 1 && + param.Filter()->dims()[3] == 1 && + param.Filter()->dims()[0] % 16 == 0) { + DLOG << " before modifi work size: " << default_work_size; + + default_work_size[0] = default_work_size[0] / 4; + + DLOG << " modification work size: " << default_work_size; + DLOG << " input dims " << param.Input()->dims(); + DLOG << " output dims " << param.Output()->dims(); + DLOG << " filter dims: " << param.Filter()->dims(); + DLOG << " biase dims : " << param.Bias()->dims(); + + } + */ + + status = clEnqueueNDRangeKernel( + this->cl_helper_.CLCommandQueue(), kernel, default_work_size.size(), NULL, + default_work_size.data(), NULL, 0, NULL, NULL); + CL_CHECK_ERRORS(status); +} + +template class ConvAddBNReluKernel; + +} // namespace operators +} // namespace paddle_mobile + +#endif diff --git a/src/operators/kernel/cl/conv_add_kernel.cpp b/src/operators/kernel/cl/conv_add_kernel.cpp new file mode 100644 index 0000000000000000000000000000000000000000..7e30c6d31db645fb5d18bf70ef5b6876a5f683da --- /dev/null +++ b/src/operators/kernel/cl/conv_add_kernel.cpp @@ -0,0 +1,149 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. 
+ +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#ifdef FUSION_CONVADD_OP + +#include "operators/kernel/conv_add_kernel.h" + +namespace paddle_mobile { +namespace operators { + +template <> +bool ConvAddKernel::Init(FusionConvAddParam *param) { + PADDLE_MOBILE_ENFORCE( + param->Filter()->dims()[2] == param->Filter()->dims()[3] && + param->Paddings()[0] == param->Paddings()[1], + "need equal"); + param->Bias()->InitCLImage(cl_helper_.CLContext(), + this->cl_helper_.CLCommandQueue()); + + int offset = static_cast(param->Filter()->dims()[2]) / 2 - + static_cast(param->Paddings()[1]); + param->SetOffset(offset); + + if (param->Filter()->dims()[2] == 1 && param->Filter()->dims()[3] == 1) { + param->Filter()->InitNImage(cl_helper_.CLContext(), + cl_helper_.CLCommandQueue()); + + this->cl_helper_.AddKernel("conv_1x1", "conv_add_kernel.cl"); + } else if (param->Filter()->dims()[1] == 1 && + param->Input()->dims()[1] == param->Output()->dims()[1] && + param->Filter()->dims()[2] == 3) { + param->Filter()->InitDWImage(cl_helper_.CLContext(), + cl_helper_.CLCommandQueue()); + this->cl_helper_.AddKernel("depth_conv_3x3", "conv_add_kernel.cl"); + + } else if (param->Filter()->dims()[2] == 3 && + param->Filter()->dims()[3] == 3) { + param->Filter()->InitCLImage(cl_helper_.CLContext(), + cl_helper_.CLCommandQueue()); + + this->cl_helper_.AddKernel("conv_3x3", "conv_add_kernel.cl"); + + } else { + PADDLE_MOBILE_THROW_EXCEPTION(" not support "); + } + + return true; +} + +template <> +void 
ConvAddKernel::Compute( + const FusionConvAddParam ¶m) { + auto kernel = this->cl_helper_.KernelAt(0); + auto default_work_size = this->cl_helper_.DefaultWorkSize(*param.Output()); + int c_block = default_work_size[0]; + int w = default_work_size[1]; + int nh = default_work_size[2]; + auto input = param.Input()->GetCLImage(); + auto filter = param.Filter()->GetCLImage(); + DLOG << "---yangfei30---"; + DLOG << *param.Filter(); + DLOG << param.Paddings(); + auto biase = param.Bias()->GetCLImage(); + auto output = param.Output()->GetCLImage(); + int stride = param.Strides()[0]; + int offset = param.Offset(); + int input_c = reinterpret_cast( + param.Input()->Converter()) + ->GetCBlock(); + int dilation = param.Dilations()[0]; + + int input_width = param.Input()->dims()[3]; + int input_height = param.Input()->dims()[2]; + int output_width = param.Output()->dims()[3]; + int output_height = param.Output()->dims()[2]; + + cl_int status; + + status = clSetKernelArg(kernel, 0, sizeof(int), &c_block); + CL_CHECK_ERRORS(status); + + status = clSetKernelArg(kernel, 1, sizeof(int), &w); + CL_CHECK_ERRORS(status); + + status = clSetKernelArg(kernel, 2, sizeof(int), &nh); + CL_CHECK_ERRORS(status); + + status = clSetKernelArg(kernel, 3, sizeof(cl_mem), &input); + CL_CHECK_ERRORS(status); + + status = clSetKernelArg(kernel, 4, sizeof(cl_mem), &filter); + CL_CHECK_ERRORS(status); + + status = clSetKernelArg(kernel, 5, sizeof(cl_mem), &biase); + CL_CHECK_ERRORS(status); + + status = clSetKernelArg(kernel, 6, sizeof(cl_mem), &output); + CL_CHECK_ERRORS(status); + + status = clSetKernelArg(kernel, 7, sizeof(int), &stride); + CL_CHECK_ERRORS(status); + + status = clSetKernelArg(kernel, 8, sizeof(int), &offset); + CL_CHECK_ERRORS(status); + + status = clSetKernelArg(kernel, 9, sizeof(int), &input_c); + CL_CHECK_ERRORS(status); + + status = clSetKernelArg(kernel, 10, sizeof(int), &dilation); + CL_CHECK_ERRORS(status); + + status = clSetKernelArg(kernel, 11, sizeof(int), &input_width); + 
CL_CHECK_ERRORS(status); + + status = clSetKernelArg(kernel, 12, sizeof(int), &input_height); + CL_CHECK_ERRORS(status); + + status = clSetKernelArg(kernel, 13, sizeof(int), &output_width); + CL_CHECK_ERRORS(status); + + status = clSetKernelArg(kernel, 14, sizeof(int), &output_height); + CL_CHECK_ERRORS(status); + + // cl_event out_event = param.Output()->GetClEvent(); + // cl_event wait_event = param.Input()->GetClEvent(); + + status = clEnqueueNDRangeKernel( + this->cl_helper_.CLCommandQueue(), kernel, default_work_size.size(), NULL, + default_work_size.data(), NULL, 0, NULL, NULL); + CL_CHECK_ERRORS(status); +} + +template class ConvAddKernel; + +} // namespace operators +} // namespace paddle_mobile + +#endif diff --git a/src/operators/kernel/cl/conv_add_relu_kernel.cpp b/src/operators/kernel/cl/conv_add_relu_kernel.cpp new file mode 100644 index 0000000000000000000000000000000000000000..814cff634cb0c4c2d5dd6e6706b558bb1cd64f22 --- /dev/null +++ b/src/operators/kernel/cl/conv_add_relu_kernel.cpp @@ -0,0 +1,150 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#ifdef FUSION_CONVADDRELU_OP + +#include "operators/kernel/conv_add_relu_kernel.h" + +namespace paddle_mobile { +namespace operators { + +template <> +bool ConvAddReluKernel::Init( + FusionConvAddReluParam *param) { + PADDLE_MOBILE_ENFORCE( + param->Filter()->dims()[2] == param->Filter()->dims()[3] && + param->Paddings()[0] == param->Paddings()[1], + "need equal"); + param->Bias()->InitCLImage(cl_helper_.CLContext(), + this->cl_helper_.CLCommandQueue()); + + int offset = static_cast(param->Filter()->dims()[2]) / 2 - + static_cast(param->Paddings()[1]); + param->SetOffset(offset); + + if (param->Filter()->dims()[2] == 1 && param->Filter()->dims()[3] == 1) { + param->Filter()->InitNImage(cl_helper_.CLContext(), + cl_helper_.CLCommandQueue()); + + this->cl_helper_.AddKernel("conv_1x1", "conv_add_relu_kernel.cl"); + } else if (param->Filter()->dims()[1] == 1 && + param->Input()->dims()[1] == param->Output()->dims()[1] && + param->Filter()->dims()[2] == 3) { + param->Filter()->InitDWImage(cl_helper_.CLContext(), + cl_helper_.CLCommandQueue()); + this->cl_helper_.AddKernel("depth_conv_3x3", "conv_add_relu_kernel.cl"); + + } else if (param->Filter()->dims()[2] == 3 && + param->Filter()->dims()[3] == 3) { + param->Filter()->InitCLImage(cl_helper_.CLContext(), + cl_helper_.CLCommandQueue()); + + this->cl_helper_.AddKernel("conv_3x3", "conv_add_relu_kernel.cl"); + + } else { + PADDLE_MOBILE_THROW_EXCEPTION(" not support "); + } + + return true; +} + +template <> +void ConvAddReluKernel::Compute( + const FusionConvAddReluParam ¶m) { + auto kernel = this->cl_helper_.KernelAt(0); + auto default_work_size = this->cl_helper_.DefaultWorkSize(*param.Output()); + int c_block = default_work_size[0]; + int w = default_work_size[1]; + int nh = default_work_size[2]; + auto input = param.Input()->GetCLImage(); + auto filter = param.Filter()->GetCLImage(); + DLOG << "---yangfei30---"; + DLOG << *param.Filter(); + DLOG << param.Paddings(); + auto biase = param.Bias()->GetCLImage(); + 
auto output = param.Output()->GetCLImage(); + int stride = param.Strides()[0]; + int offset = param.Offset(); + int input_c = reinterpret_cast( + param.Input()->Converter()) + ->GetCBlock(); + int dilation = param.Dilations()[0]; + + int input_width = param.Input()->dims()[3]; + int input_height = param.Input()->dims()[2]; + int output_width = param.Output()->dims()[3]; + int output_height = param.Output()->dims()[2]; + + cl_int status; + + status = clSetKernelArg(kernel, 0, sizeof(int), &c_block); + CL_CHECK_ERRORS(status); + + status = clSetKernelArg(kernel, 1, sizeof(int), &w); + CL_CHECK_ERRORS(status); + + status = clSetKernelArg(kernel, 2, sizeof(int), &nh); + CL_CHECK_ERRORS(status); + + status = clSetKernelArg(kernel, 3, sizeof(cl_mem), &input); + CL_CHECK_ERRORS(status); + + status = clSetKernelArg(kernel, 4, sizeof(cl_mem), &filter); + CL_CHECK_ERRORS(status); + + status = clSetKernelArg(kernel, 5, sizeof(cl_mem), &biase); + CL_CHECK_ERRORS(status); + + status = clSetKernelArg(kernel, 6, sizeof(cl_mem), &output); + CL_CHECK_ERRORS(status); + + status = clSetKernelArg(kernel, 7, sizeof(int), &stride); + CL_CHECK_ERRORS(status); + + status = clSetKernelArg(kernel, 8, sizeof(int), &offset); + CL_CHECK_ERRORS(status); + + status = clSetKernelArg(kernel, 9, sizeof(int), &input_c); + CL_CHECK_ERRORS(status); + + status = clSetKernelArg(kernel, 10, sizeof(int), &dilation); + CL_CHECK_ERRORS(status); + + status = clSetKernelArg(kernel, 11, sizeof(int), &input_width); + CL_CHECK_ERRORS(status); + + status = clSetKernelArg(kernel, 12, sizeof(int), &input_height); + CL_CHECK_ERRORS(status); + + status = clSetKernelArg(kernel, 13, sizeof(int), &output_width); + CL_CHECK_ERRORS(status); + + status = clSetKernelArg(kernel, 14, sizeof(int), &output_height); + CL_CHECK_ERRORS(status); + + // cl_event out_event = param.Output()->GetClEvent(); + // cl_event wait_event = param.Input()->GetClEvent(); + + status = clEnqueueNDRangeKernel( + this->cl_helper_.CLCommandQueue(), 
kernel, default_work_size.size(), NULL, + default_work_size.data(), NULL, 0, NULL, NULL); + CL_CHECK_ERRORS(status); +} + +template class ConvAddReluKernel; + +} // namespace operators +} // namespace paddle_mobile + +#endif diff --git a/src/operators/kernel/cl/conv_kernel.cpp b/src/operators/kernel/cl/conv_kernel.cpp new file mode 100644 index 0000000000000000000000000000000000000000..05cefadce052fb65664cc797c800ec67e43f3a2c --- /dev/null +++ b/src/operators/kernel/cl/conv_kernel.cpp @@ -0,0 +1,140 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#ifdef CONV_OP + +#include "operators/kernel/conv_kernel.h" + +namespace paddle_mobile { +namespace operators { + +template <> +bool ConvKernel::Init(ConvParam *param) { + PADDLE_MOBILE_ENFORCE( + param->Filter()->dims()[2] == param->Filter()->dims()[3] && + param->Paddings()[0] == param->Paddings()[1], + "need equal"); + + auto filter_ddim = param->Filter()->dims(); + + std::vector filter_shape( + {filter_ddim[1], filter_ddim[0], filter_ddim[2], filter_ddim[3]}); + framework::DDim ddim = framework::make_ddim(filter_shape); + if (filter_ddim[1] == 1) { + param->Filter()->Resize(ddim); + } + + param->Filter()->InitCLImage(cl_helper_.CLContext(), + this->cl_helper_.CLCommandQueue()); + + int offset = static_cast(param->Filter()->dims()[2]) / 2 - + static_cast(param->Paddings()[1]); + param->SetOffset(offset); + + DLOG << " init helper: " << &cl_helper_; + DLOG << " conv kernel add kernel ~ "; + DLOG << " width of one block: " << param->Filter()->dims()[3]; + DLOG << " height of one block: " << param->Filter()->dims()[2]; + DLOG << " filter dims: " << param->Filter()->dims(); + + if (param->Filter()->dims()[2] == 1 && param->Filter()->dims()[3] == 1) { + DLOG << " here1 "; + this->cl_helper_.AddKernel("conv_1x1", "conv_kernel.cl"); + + } else if (param->Filter()->dims()[0] == 1 && + param->Input()->dims()[1] == param->Output()->dims()[1] && + param->Filter()->dims()[2] == 3) { + DLOG << " here2 "; + this->cl_helper_.AddKernel("depth_conv_3x3", "depthwise_conv_kernel.cl"); + + } else if (param->Filter()->dims()[2] == 3 && + param->Filter()->dims()[3] == 3) { + DLOG << " here3 "; + this->cl_helper_.AddKernel("conv_3x3", "conv_kernel.cl"); + + } else { + PADDLE_MOBILE_THROW_EXCEPTION(" not support "); + } + + return true; +} + +template <> +void ConvKernel::Compute(const ConvParam ¶m) { + auto kernel = this->cl_helper_.KernelAt(0); + auto default_work_size = this->cl_helper_.DefaultWorkSize(*param.Output()); + int c_block = default_work_size[0]; + int w = 
default_work_size[1]; + int nh = default_work_size[2]; + auto input = param.Input()->GetCLImage(); + auto filter = param.Filter()->GetCLImage(); + auto output = param.Output()->GetCLImage(); + + int stride = param.Strides()[0]; + int offset = param.Offset(); + int input_c = reinterpret_cast( + param.Input()->Converter()) + ->GetCBlock(); + int dilation = param.Dilations()[0]; + + int input_width = param.Input()->dims()[3]; + int input_height = param.Input()->dims()[2]; + int output_width = param.Output()->dims()[3]; + int output_height = param.Output()->dims()[2]; + + cl_int status; + + DLOG << " begin set kernel arg "; + DLOG << " c block " << c_block; + DLOG << " w " << w; + DLOG << " nh " << nh; + DLOG << " stride " << stride; + DLOG << " offset " << offset; + DLOG << " input_c " << input_c; + DLOG << " dilation " << dilation; + DLOG << " input width " << input_width; + DLOG << " input height " << input_height; + DLOG << " output width " << output_width; + DLOG << " output height " << output_height; + + status = clSetKernelArg(kernel, 0, sizeof(int), &c_block); + status = clSetKernelArg(kernel, 1, sizeof(int), &w); + status = clSetKernelArg(kernel, 2, sizeof(int), &nh); + status = clSetKernelArg(kernel, 3, sizeof(cl_mem), &input); + status = clSetKernelArg(kernel, 4, sizeof(cl_mem), &filter); + status = clSetKernelArg(kernel, 5, sizeof(cl_mem), &output); + status = clSetKernelArg(kernel, 6, sizeof(int), &stride); + status = clSetKernelArg(kernel, 7, sizeof(int), &offset); + status = clSetKernelArg(kernel, 8, sizeof(int), &input_c); + status = clSetKernelArg(kernel, 9, sizeof(int), &dilation); + status = clSetKernelArg(kernel, 10, sizeof(int), &input_width); + status = clSetKernelArg(kernel, 11, sizeof(int), &input_height); + status = clSetKernelArg(kernel, 12, sizeof(int), &output_width); + status = clSetKernelArg(kernel, 13, sizeof(int), &output_height); + + // cl_event out_event = param.Output()->GetClEvent(); + // cl_event wait_event = 
param.Input()->GetClEvent(); + + status = clEnqueueNDRangeKernel( + this->cl_helper_.CLCommandQueue(), kernel, default_work_size.size(), NULL, + default_work_size.data(), NULL, 0, NULL, NULL); + CL_CHECK_ERRORS(status); +} + +template class ConvKernel; + +} // namespace operators +} // namespace paddle_mobile + +#endif diff --git a/src/operators/kernel/cl/depthwise_conv_kernel.cpp b/src/operators/kernel/cl/depthwise_conv_kernel.cpp new file mode 100644 index 0000000000000000000000000000000000000000..35813a31f570c8daf956e4c90d0f3e3de1675eb4 --- /dev/null +++ b/src/operators/kernel/cl/depthwise_conv_kernel.cpp @@ -0,0 +1,97 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#ifdef DEPTHWISECONV_OP + +#include "operators/kernel/depthwise_conv_kernel.h" +#include "operators/kernel/central-arm-func/depthwise_conv_arm_func.h" + +namespace paddle_mobile { +namespace operators { + +template <> +bool DepthwiseConvKernel::Init(ConvParam *param) { + DLOG << " depthwise conv kernel init begin "; + PADDLE_MOBILE_ENFORCE( + param->Filter()->dims()[2] == param->Filter()->dims()[3] && + param->Paddings()[0] == param->Paddings()[1], + "need equal"); + param->Filter()->InitCLImage(cl_helper_.CLContext(), + this->cl_helper_.CLCommandQueue()); + int offset = static_cast(param->Filter()->dims()[2]) / 2 - + static_cast(param->Paddings()[1]); + param->SetOffset(offset); + this->cl_helper_.AddKernel("depth_conv_3x3", "conv_add_bn_relu_kernel.cl"); + DLOG << " depthwise conv kernel init end "; + return true; +} + +template <> +void DepthwiseConvKernel::Compute( + const ConvParam ¶m) { + auto kernel = this->cl_helper_.KernelAt(0); + auto default_work_size = this->cl_helper_.DefaultWorkSize(*param.Output()); + int c_block = default_work_size[0]; + int w = default_work_size[1]; + int nh = default_work_size[2]; + auto input = param.Input()->GetCLImage(); + auto filter = param.Filter()->GetCLImage(); + auto output = param.Output()->GetCLImage(); + int stride = param.Strides()[0]; + int offset = param.Offset(); + int input_c = reinterpret_cast( + param.Input()->Converter()) + ->GetCBlock(); + int dilation = param.Dilations()[0]; + + int input_width = param.Input()->dims()[3]; + int input_height = param.Input()->dims()[2]; + int output_width = param.Output()->dims()[3]; + int output_height = param.Output()->dims()[2]; + + cl_int status; + + status = clSetKernelArg(kernel, 0, sizeof(int), &c_block); + status = clSetKernelArg(kernel, 1, sizeof(int), &w); + status = clSetKernelArg(kernel, 2, sizeof(int), &nh); + status = clSetKernelArg(kernel, 3, sizeof(cl_mem), &input); + status = clSetKernelArg(kernel, 4, sizeof(cl_mem), &filter); + status = 
clSetKernelArg(kernel, 5, sizeof(cl_mem), &output); + status = clSetKernelArg(kernel, 6, sizeof(int), &stride); + status = clSetKernelArg(kernel, 7, sizeof(int), &offset); + status = clSetKernelArg(kernel, 8, sizeof(int), &input_c); + status = clSetKernelArg(kernel, 9, sizeof(int), &dilation); + status = clSetKernelArg(kernel, 10, sizeof(int), &input_width); + status = clSetKernelArg(kernel, 11, sizeof(int), &input_height); + status = clSetKernelArg(kernel, 12, sizeof(int), &output_width); + status = clSetKernelArg(kernel, 13, sizeof(int), &output_height); + + CL_CHECK_ERRORS(status); + + // cl_event out_event = param.Output()->GetClEvent(); + // cl_event wait_event = param.Input()->GetClEvent(); + + status = clEnqueueNDRangeKernel( + this->cl_helper_.CLCommandQueue(), kernel, default_work_size.size(), NULL, + default_work_size.data(), NULL, 0, NULL, NULL); + + CL_CHECK_ERRORS(status); +} + +template class DepthwiseConvKernel; + +} // namespace operators +} // namespace paddle_mobile + +#endif diff --git a/src/operators/kernel/cl/elementwise_add_kernel.cpp b/src/operators/kernel/cl/elementwise_add_kernel.cpp new file mode 100644 index 0000000000000000000000000000000000000000..e62714b3fa3182706270627e7fd1a13b06f3b66a --- /dev/null +++ b/src/operators/kernel/cl/elementwise_add_kernel.cpp @@ -0,0 +1,104 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#ifdef ELEMENTWISEADD_OP + +#include "operators/kernel/elementwise_add_kernel.h" + +namespace paddle_mobile { +namespace operators { + +template <> +bool ElementwiseAddKernel::Init( + ElementwiseAddParam *param) { + DLOG << "-----init add-----"; + CLImage *bias = (CLImage *)(param->InputY()); + bias->InitCLImage(cl_helper_.CLContext(), this->cl_helper_.CLCommandQueue()); + DLOG << " bias: " << *bias; + if (bias->dims().size() == 4) { + this->cl_helper_.AddKernel("elementwise_add", "elementwise_add_kernel.cl"); + } else if (param->InputY()->dims().size() == 1) { + this->cl_helper_.AddKernel("channel_add", "channel_add_kernel.cl"); + } else { + DLOG << "error:bias dims is error"; + } + + return true; +} + +template <> +void ElementwiseAddKernel::Compute( + const ElementwiseAddParam ¶m) { + auto input = param.InputX(); + auto bias = param.InputY(); + auto output = param.Out(); + cl_int status; + auto kernel = this->cl_helper_.KernelAt(0); + if (bias->dims().size() == 4) { + cl_mem input_image = input->GetCLImage(); + cl_mem bias_image = bias->GetCLImage(); + cl_mem output_image = output->GetCLImage(); + status = clSetKernelArg(kernel, 0, sizeof(cl_mem), + reinterpret_cast(&input_image)); + CL_CHECK_ERRORS(status); + status = clSetKernelArg(kernel, 1, sizeof(cl_mem), + reinterpret_cast(&bias_image)); + CL_CHECK_ERRORS(status); + status = clSetKernelArg(kernel, 2, sizeof(cl_mem), + reinterpret_cast(&output_image)); + CL_CHECK_ERRORS(status); + int width = input->ImageWidth(); + int height = input->ImageHeight(); + size_t global_work_size[2] = {width, height}; + status = + clEnqueueNDRangeKernel(this->cl_helper_.CLCommandQueue(), kernel, 2, + NULL, global_work_size, NULL, 0, NULL, NULL); + CL_CHECK_ERRORS(status); + } else if (bias->dims().size() == 1) { + cl_mem input_image = input->GetCLImage(); + cl_mem bias_image = bias->GetCLImage(); + cl_mem output_image = output->GetCLImage(); + int tensor_w = input->dims()[3]; + status = clSetKernelArg(kernel, 0, 
sizeof(cl_mem), + reinterpret_cast(&input_image)); + CL_CHECK_ERRORS(status); + status = clSetKernelArg(kernel, 1, sizeof(cl_mem), + reinterpret_cast(&bias_image)); + CL_CHECK_ERRORS(status); + status = clSetKernelArg(kernel, 2, sizeof(cl_mem), + reinterpret_cast(&output_image)); + CL_CHECK_ERRORS(status); + status = clSetKernelArg(kernel, 3, sizeof(cl_int), + reinterpret_cast(&tensor_w)); + CL_CHECK_ERRORS(status); + int width = input->ImageWidth(); + int height = input->ImageHeight(); + size_t global_work_size[2] = {width, height}; + cl_event out_event = param.Out()->GetClEvent(); + cl_event wait_event = param.InputX()->GetClEvent(); + status = + clEnqueueNDRangeKernel(this->cl_helper_.CLCommandQueue(), kernel, 2, + NULL, global_work_size, NULL, 0, NULL, NULL); + CL_CHECK_ERRORS(status); + } else { + DLOG << "error:bias dims is error"; + } +} + +template class ElementwiseAddKernel; + +} // namespace operators +} // namespace paddle_mobile + +#endif diff --git a/src/operators/kernel/cl/feed_kernel.cpp b/src/operators/kernel/cl/feed_kernel.cpp new file mode 100644 index 0000000000000000000000000000000000000000..78f04357a23c70595595cc24489fd96e994162fb --- /dev/null +++ b/src/operators/kernel/cl/feed_kernel.cpp @@ -0,0 +1,68 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "operators/kernel/feed_kernel.h" +#include "framework/cl/cl_tensor.h" +namespace paddle_mobile { +namespace operators { + +template <> +bool FeedKernel::Init(FeedParam *param) { + DLOG << "Init feed"; + this->cl_helper_.AddKernel("feed", "feed_kernel.cl"); + return true; +} + +template <> +void FeedKernel::Compute(const FeedParam ¶m) { + auto kernel = this->cl_helper_.KernelAt(0); + cl_int status; + auto output = param.Out(); + const Tensor *input = param.InputX(); + // DLOG << *input; + const float *input_data = input->data(); + int numel = input->numel(); + cl_mem cl_image = output->GetCLImage(); + int c = input->dims()[1]; + int height = output->dims()[2]; + int width = output->dims()[3]; + CLTensor input_cl_tensor(this->cl_helper_.CLContext(), + this->cl_helper_.CLCommandQueue()); + input_cl_tensor.Resize(input->dims()); + cl_mem inputBuffer = input_cl_tensor.mutable_with_data(input_data); + + status = clSetKernelArg(kernel, 0, sizeof(cl_mem), &inputBuffer); + CL_CHECK_ERRORS(status); + status = clSetKernelArg(kernel, 1, sizeof(cl_mem), &cl_image); + CL_CHECK_ERRORS(status); + status = clSetKernelArg(kernel, 2, sizeof(cl_int), &width); + CL_CHECK_ERRORS(status); + status = clSetKernelArg(kernel, 3, sizeof(cl_int), &height); + CL_CHECK_ERRORS(status); + status = clSetKernelArg(kernel, 4, sizeof(cl_int), &c); + CL_CHECK_ERRORS(status); + + size_t global_work_size[2] = {width, height}; + + // cl_event out_event = param.Out()->GetClEvent(); + + status = clEnqueueNDRangeKernel(this->cl_helper_.CLCommandQueue(), kernel, 2, + NULL, global_work_size, NULL, 0, NULL, NULL); + CL_CHECK_ERRORS(status); +} + +template class FeedKernel; + +} // namespace operators +} // namespace paddle_mobile diff --git a/src/operators/kernel/cl/fetch_kernel.cpp b/src/operators/kernel/cl/fetch_kernel.cpp new file mode 100644 index 0000000000000000000000000000000000000000..31c1d4179cbdfc8145d90bee2353be821e65b40b --- /dev/null +++ b/src/operators/kernel/cl/fetch_kernel.cpp @@ 
-0,0 +1,101 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "operators/kernel/fetch_kernel.h" +#include "framework/cl/cl_tensor.h" +// #include "common/common.h" +// #include + +namespace paddle_mobile { +namespace operators { + +template <> +bool FetchKernel::Init(FetchParam *param) { + if (param->InputX()->dims().size() <= 2) { + this->cl_helper_.AddKernel("fetch_2d", "fetch_kernel.cl"); + } else { + this->cl_helper_.AddKernel("fetch", "fetch_kernel.cl"); + } + auto *out = param->Out(); + out->mutable_data(); + return true; +} + +template <> +void FetchKernel::Compute(const FetchParam ¶m) { + auto kernel = this->cl_helper_.KernelAt(0); + auto default_work_size = this->cl_helper_.DefaultWorkSize(*param.InputX()); + + auto input = param.InputX()->GetCLImage(); + auto *out = param.Out(); + + const auto &dim = param.InputX()->dims(); + size_t new_dims[] = {1, 1, 1, 1}; + + for (int j = 0; j < dim.size(); ++j) { + new_dims[4 - dim.size() + j] = dim[j]; + } + + size_t C, in_height, in_width; + + C = new_dims[1]; + in_height = new_dims[2]; + if (dim.size() <= 2) { + in_width = param.InputX()->ImageWidth(); + } else { + in_width = new_dims[3]; + } + + CLTensor out_cl_tensor(this->cl_helper_.CLContext(), + this->cl_helper_.CLCommandQueue()); + out_cl_tensor.Resize(out->dims()); + cl_mem outBuffer = out_cl_tensor.mutable_data(); + + clSetKernelArg(kernel, 0, sizeof(int), &in_height); + clSetKernelArg(kernel, 1, 
sizeof(int), &in_width); + clSetKernelArg(kernel, 2, sizeof(cl_mem), &input); + clSetKernelArg(kernel, 3, sizeof(cl_mem), &outBuffer); + if (dim.size() > 2) { + int size_ch = in_height * in_width; + int size_block = size_ch * 4; + int size_batch = size_ch * C; + clSetKernelArg(kernel, 4, sizeof(int), &size_ch); + clSetKernelArg(kernel, 5, sizeof(int), &size_block); + clSetKernelArg(kernel, 6, sizeof(int), &size_batch); + } + + // cl_event wait_event = param.InpdutX()->GetClEvent(); + clEnqueueNDRangeKernel(this->cl_helper_.CLCommandQueue(), kernel, 3, NULL, + default_work_size.data(), NULL, 0, NULL, NULL); + + // auto time1 = paddle_mobile::time(); + + // printf(" before finish \n"); + // clFlsh(this->cl_helper_.CLCommandQueue()); + clFinish(this->cl_helper_.CLCommandQueue()); + // printf(" after finish \n"); + + // auto time2 = paddle_mobile::time(); + // + // + // std::cout << " finish cost :" << paddle_mobile::time_diff(time1, time2) + // << "ms" << std::endl; + + memcpy(out->data(), out_cl_tensor.Data(), out->memory_size()); +} + +template class FetchKernel; + +} // namespace operators +} // namespace paddle_mobile diff --git a/src/operators/kernel/cl/pool_kernel.cpp b/src/operators/kernel/cl/pool_kernel.cpp new file mode 100644 index 0000000000000000000000000000000000000000..df79ababadd4c1b959a1eb0fe237a45ab97a6bd8 --- /dev/null +++ b/src/operators/kernel/cl/pool_kernel.cpp @@ -0,0 +1,84 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+See the License for the specific language governing permissions and +limitations under the License. */ + +#ifdef POOL_OP + +#include "operators/kernel/pool_kernel.h" + +namespace paddle_mobile { +namespace operators { + +template <> +bool PoolKernel::Init(PoolParam *param) { + std::string pooling_type = param->PoolingType(); + this->cl_helper_.AddKernel("pool_" + pooling_type, "pool_kernel.cl"); + return true; +} + +template <> +void PoolKernel::Compute(const PoolParam ¶m) { + auto kernel = this->cl_helper_.KernelAt(0); + auto default_work_size = this->cl_helper_.DefaultWorkSize(*param.Output()); + + auto input = param.Input()->GetCLImage(); + auto out = param.Output()->GetCLImage(); + + framework::CLImageConverterFolder *input_folder_converter = + reinterpret_cast( + param.Input()->Converter()); + framework::CLImageConverterFolder *output_folder_converter = + reinterpret_cast( + param.Output()->Converter()); + + const int in_height = input_folder_converter->HeightOfOneBlock(); + const int in_width = input_folder_converter->WidthOfOneBlock(); + const int out_height = output_folder_converter->HeightOfOneBlock(); + const int out_width = output_folder_converter->WidthOfOneBlock(); + + std::string pooling_type = param.PoolingType(); + std::vector ksize = param.Ksize(); + std::vector strides = param.Strides(); + std::vector paddings = param.Paddings(); + const int pad_top = paddings[0]; + const int pad_left = paddings[1]; + const int stride_h = strides[0]; + const int stride_w = strides[1]; + const int ksize_h = ksize[0]; + const int ksize_w = ksize[1]; + + clSetKernelArg(kernel, 0, sizeof(cl_int), &in_height); + clSetKernelArg(kernel, 1, sizeof(cl_int), &in_width); + clSetKernelArg(kernel, 2, sizeof(cl_int), &out_height); + clSetKernelArg(kernel, 3, sizeof(cl_int), &out_width); + clSetKernelArg(kernel, 4, sizeof(cl_int), &pad_top); + clSetKernelArg(kernel, 5, sizeof(cl_int), &pad_left); + clSetKernelArg(kernel, 6, sizeof(cl_int), &stride_h); + clSetKernelArg(kernel, 
7, sizeof(cl_int), &stride_w); + clSetKernelArg(kernel, 8, sizeof(cl_int), &ksize_h); + clSetKernelArg(kernel, 9, sizeof(cl_int), &ksize_w); + clSetKernelArg(kernel, 10, sizeof(cl_mem), &input); + clSetKernelArg(kernel, 11, sizeof(cl_mem), &out); + + // cl_event out_event = param.Output()->GetClEvent(); + // cl_event wait_event = param.Input()->GetClEvent(); + clEnqueueNDRangeKernel(this->cl_helper_.CLCommandQueue(), kernel, 3, NULL, + default_work_size.data(), NULL, 0, NULL, NULL); +} + +template class PoolKernel; + +} // namespace operators +} // namespace paddle_mobile + +#endif diff --git a/src/operators/kernel/cl/relu_kernel.cpp b/src/operators/kernel/cl/relu_kernel.cpp new file mode 100644 index 0000000000000000000000000000000000000000..c3acfe442201a9be59c6f0a0a536cf9aea68c4a2 --- /dev/null +++ b/src/operators/kernel/cl/relu_kernel.cpp @@ -0,0 +1,68 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ +#ifdef RELU_OP + +#include "operators/kernel/relu_kernel.h" + +namespace paddle_mobile { +namespace operators { + +template <> +bool ReluKernel::Init(ReluParam* param) { + this->cl_helper_.AddKernel("relu", "relu.cl"); + // this->cl_helper_.AddKernel("relu_p0", "relu.cl"); + // this->cl_helper_.AddKernel("relu_p1", "relu.cl"); + // const auto dim = + // const_cast(param->InputX())->ImageDims(); + // param->getMidImage().InitEmptyImage(this->cl_helper_.CLContext(), + // this->cl_helper_.CLCommandQueue(), + // dim); + return true; +} + +template <> +void ReluKernel::Compute(const ReluParam& param) { + auto kernel = this->cl_helper_.KernelAt(0); + // auto kernel_p0 = this->cl_helper_.KernelAt(1); + // auto kernel_p1 = this->cl_helper_.KernelAt(2); + const auto* input = param.InputX(); + auto* output = param.Out(); + auto default_work_size = this->cl_helper_.DefaultWorkSize(*output); + auto inputImage = input->GetCLImage(); + auto outputImage = output->GetCLImage(); + // auto tImage = + // const_cast&>(param).getMidImage().GetCLImage(); + clSetKernelArg(kernel, 0, sizeof(cl_mem), &inputImage); + clSetKernelArg(kernel, 1, sizeof(cl_mem), &outputImage); + // clSetKernelArg(kernel_p0, 0, sizeof(cl_mem), &inputImage); + // clSetKernelArg(kernel_p0, 0, sizeof(cl_mem), &tImage); + // clSetKernelArg(kernel_p1, 0, sizeof(cl_mem), &tImage); + // clSetKernelArg(kernel_p1, 1, sizeof(cl_mem), &outputImage); + const size_t work_size[2] = {input->ImageWidth(), input->ImageHeight()}; + + // cl_event out_event = param.Out()->GetClEvent(); + // cl_event wait_event = param.InputX()->GetClEvent(); + + clEnqueueNDRangeKernel(this->cl_helper_.CLCommandQueue(), kernel, 2, NULL, + work_size, NULL, 0, NULL, NULL); + // clEnqueueNDRangeKernel(this->cl_helper_.CLCommandQueue(), kernel_p1, 3, + // NULL, + // work_size, NULL, 0, NULL, NULL); +} + +template class ReluKernel; + +} // namespace operators +} // namespace paddle_mobile +#endif diff --git 
a/src/operators/kernel/cl/reshape_kernel.cpp b/src/operators/kernel/cl/reshape_kernel.cpp new file mode 100644 index 0000000000000000000000000000000000000000..fb3aa9b52f722b21cdc30e54eafadf9dffcfef7a --- /dev/null +++ b/src/operators/kernel/cl/reshape_kernel.cpp @@ -0,0 +1,68 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "operators/kernel/reshape_kernel.h" + +namespace paddle_mobile { +namespace operators { + +template <> +bool ReshapeKernel::Init(ReshapeParam *param) { + this->cl_helper_.AddKernel("reshape", "reshape.cl"); + return true; +} + +template <> +void ReshapeKernel::Compute(const ReshapeParam ¶m) { + auto kernel = this->cl_helper_.KernelAt(0); + const auto *input = param.InputX(); + auto *output = param.Out(); + auto inputImage = input->GetCLImage(); + auto outputImage = output->GetCLImage(); + clSetKernelArg(kernel, 0, sizeof(cl_mem), &inputImage); + clSetKernelArg(kernel, 1, sizeof(cl_mem), &outputImage); + const auto &inputDim = input->dims(); + const auto &outputDim = output->dims(); + int dims[4] = {1, 1, 1, 1}; + int odims[4] = {1, 1, 1, 1}; + // 1 1000 1 1 + for (int i = 0; i < inputDim.size(); i++) { + dims[4 - inputDim.size() + i] = inputDim[i]; + } + + // 1 1 1 1000 + for (int i = 0; i < outputDim.size(); i++) { + odims[4 - outputDim.size() + i] = outputDim[i]; + } + clSetKernelArg(kernel, 2, sizeof(cl_int), &dims); + clSetKernelArg(kernel, 3, sizeof(cl_int), &dims[1]); + 
clSetKernelArg(kernel, 4, sizeof(cl_int), &dims[2]); + clSetKernelArg(kernel, 5, sizeof(cl_int), &dims[3]); + clSetKernelArg(kernel, 6, sizeof(cl_int), &odims); + clSetKernelArg(kernel, 7, sizeof(cl_int), &odims[1]); + clSetKernelArg(kernel, 8, sizeof(cl_int), &odims[1]); + clSetKernelArg(kernel, 9, sizeof(cl_int), &odims[1]); + const size_t work_size[2] = {output->ImageWidth(), output->ImageHeight()}; + + // cl_event out_event = param.Out()->GetClEvent(); + // cl_event wait_event = param.InputX()->GetClEvent(); + + clEnqueueNDRangeKernel(this->cl_helper_.CLCommandQueue(), kernel, 2, NULL, + work_size, NULL, 0, NULL, NULL); +} + +template class ReshapeKernel; + +} // namespace operators +} // namespace paddle_mobile diff --git a/src/operators/kernel/cl/softmax_kernel.cpp b/src/operators/kernel/cl/softmax_kernel.cpp new file mode 100644 index 0000000000000000000000000000000000000000..22e6672ee462b963476dc72895329a9117fc16a8 --- /dev/null +++ b/src/operators/kernel/cl/softmax_kernel.cpp @@ -0,0 +1,72 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#ifdef SOFTMAX_OP + +#include "operators/kernel/softmax_kernel.h" + +namespace paddle_mobile { +namespace operators { + +template <> +bool SoftmaxKernel::Init(SoftmaxParam *param) { + this->cl_helper_.AddKernel("softmax", "softmax.cl"); + return true; +} + +template <> +void SoftmaxKernel::Compute(const SoftmaxParam ¶m) { + auto kernel = this->cl_helper_.KernelAt(0); + auto default_work_size = this->cl_helper_.DefaultWorkSize(*(param.Out())); + const auto *input = param.InputX(); + auto *output = param.Out(); + auto inputImage = input->GetCLImage(); + auto outputImage = output->GetCLImage(); + + int group = output->ImageWidth(); + + cl_int status; + + status = clSetKernelArg(kernel, 0, sizeof(cl_mem), &inputImage); + status = clSetKernelArg(kernel, 1, sizeof(cl_mem), &outputImage); + status = clSetKernelArg(kernel, 2, sizeof(int), &group); + + // const auto &inputDim = input->dims(); + // + // int dims[4] = {1, 1, 1, 1}; + // + // for (int i = 0; i < inputDim.size(); i++) { + // dims[4 - inputDim.size() + i] = inputDim[i]; + // } + // + // clSetKernelArg(kernel, 2, sizeof(int), &dims); + // clSetKernelArg(kernel, 3, sizeof(int), &dims[1]); + // clSetKernelArg(kernel, 4, sizeof(int), &dims[2]); + // clSetKernelArg(kernel, 5, sizeof(int), &dims[3]); + + // cl_event out_event = param.Out()->GetClEvent(); + // cl_event wait_event = param.InputX()->GetClEvent(); + + status = clEnqueueNDRangeKernel( + this->cl_helper_.CLCommandQueue(), kernel, default_work_size.size(), NULL, + default_work_size.data(), NULL, 0, NULL, NULL); + + CL_CHECK_ERRORS(status); +} + +template class SoftmaxKernel; + +} // namespace operators +} // namespace paddle_mobile +#endif diff --git a/src/operators/kernel/concat_kernel.h b/src/operators/kernel/concat_kernel.h index 61100bf5f0e9de43bfb6295a0719f1be0954d128..ac9ebca4d5ab30307303b8720677e67470634b44 100644 --- a/src/operators/kernel/concat_kernel.h +++ b/src/operators/kernel/concat_kernel.h @@ -27,7 +27,7 @@ template class ConcatKernel : 
public framework::OpKernelBase> { public: - void Compute(const ConcatParam ¶m) const; + void Compute(const ConcatParam ¶m); bool Init(ConcatParam *param); }; diff --git a/src/operators/kernel/conv_add_add_prelu_kernel.h b/src/operators/kernel/conv_add_add_prelu_kernel.h index 5715cd46d5a6c7e80ab5ff77ba83c7973e1db811..fadaf7564ceeb7a52215dc335135016be02bc1ab 100644 --- a/src/operators/kernel/conv_add_add_prelu_kernel.h +++ b/src/operators/kernel/conv_add_add_prelu_kernel.h @@ -35,7 +35,7 @@ template class ConvAddAddPReluKernel : public OpKernelBase> { public: - void Compute(const FusionConvAddAddPReluParam ¶m) const; + void Compute(const FusionConvAddAddPReluParam ¶m); bool Init(FusionConvAddAddPReluParam *param); }; diff --git a/src/operators/kernel/conv_add_bn_kernel.h b/src/operators/kernel/conv_add_bn_kernel.h index ee73215c4688c3e604de69cda55b05e63844c0b8..7a921ecc7d0f4498cae80fbb9cea1b13e4c94101 100644 --- a/src/operators/kernel/conv_add_bn_kernel.h +++ b/src/operators/kernel/conv_add_bn_kernel.h @@ -35,7 +35,7 @@ template class ConvAddBNKernel : public OpKernelBase> { public: - void Compute(const FusionConvAddBNParam ¶m) const; + void Compute(const FusionConvAddBNParam ¶m); bool Init(FusionConvAddBNParam *param); }; diff --git a/src/operators/kernel/conv_add_bn_relu_kernel.h b/src/operators/kernel/conv_add_bn_relu_kernel.h index 9faaaedcf8d6f825f818ebf5121dc7685185d5d8..3f088528fc901987873038c7e1dd779dcc2019e7 100644 --- a/src/operators/kernel/conv_add_bn_relu_kernel.h +++ b/src/operators/kernel/conv_add_bn_relu_kernel.h @@ -35,7 +35,7 @@ template class ConvAddBNReluKernel : public OpKernelBase> { public: - void Compute(const FusionConvAddBNReluParam ¶m) const; + void Compute(const FusionConvAddBNReluParam ¶m); bool Init(FusionConvAddBNReluParam *param); }; diff --git a/src/operators/kernel/conv_add_kernel.h b/src/operators/kernel/conv_add_kernel.h index 360cbb6775168885e9c1a25db1f9ffb9e552324b..4e9ff0853f1d502ebb4dc4ef3641d0a879f32b60 100644 --- 
a/src/operators/kernel/conv_add_kernel.h +++ b/src/operators/kernel/conv_add_kernel.h @@ -40,7 +40,7 @@ template class ConvAddKernel : public OpKernelBase> { public: - void Compute(const FusionConvAddParam ¶m) const; + void Compute(const FusionConvAddParam ¶m); bool Init(FusionConvAddParam *param); }; diff --git a/src/operators/kernel/conv_add_prelu_kernel.h b/src/operators/kernel/conv_add_prelu_kernel.h index a109f84cf09b4d0e2469a1885b902c0f70acc6c8..631982789b09c57d0d21186d0a30df7368d2955f 100644 --- a/src/operators/kernel/conv_add_prelu_kernel.h +++ b/src/operators/kernel/conv_add_prelu_kernel.h @@ -35,7 +35,7 @@ template class ConvAddPReluKernel : public OpKernelBase> { public: - void Compute(const FusionConvAddPReluParam ¶m) const; + void Compute(const FusionConvAddPReluParam ¶m); bool Init(FusionConvAddPReluParam *param); }; diff --git a/src/operators/kernel/conv_add_relu_kernel.h b/src/operators/kernel/conv_add_relu_kernel.h index f33b1dc312e1d94be0c23cff55e9e6789a556bc7..e001926b361da96ec3ff76e120bc3d1ad13714fa 100644 --- a/src/operators/kernel/conv_add_relu_kernel.h +++ b/src/operators/kernel/conv_add_relu_kernel.h @@ -35,7 +35,7 @@ template class ConvAddReluKernel : public OpKernelBase> { public: - void Compute(const FusionConvAddReluParam ¶m) const; + void Compute(const FusionConvAddReluParam ¶m); bool Init(FusionConvAddReluParam *param); }; diff --git a/src/operators/kernel/conv_bn_add_relu_kernel.h b/src/operators/kernel/conv_bn_add_relu_kernel.h index 820e5f8bcbf58676e8374e575044b10fe4676efa..dcd8fecf07fbb4ea75b382f5315e24e64e26e939 100644 --- a/src/operators/kernel/conv_bn_add_relu_kernel.h +++ b/src/operators/kernel/conv_bn_add_relu_kernel.h @@ -35,7 +35,7 @@ template class ConvBNAddReluKernel : public OpKernelBase> { public: - void Compute(const FusionConvBNAddReluParam ¶m) const; + void Compute(const FusionConvBNAddReluParam ¶m); bool Init(FusionConvBNAddReluParam *param); }; diff --git a/src/operators/kernel/conv_bn_kernel.h 
b/src/operators/kernel/conv_bn_kernel.h index f740ca836481c1331ea2e889865b3078d48644a6..e669f3bdd85dbd89e3a48d417dcd0cd6b9706062 100644 --- a/src/operators/kernel/conv_bn_kernel.h +++ b/src/operators/kernel/conv_bn_kernel.h @@ -35,7 +35,7 @@ template class ConvBNKernel : public OpKernelBase> { public: - void Compute(const FusionConvBNParam ¶m) const; + void Compute(const FusionConvBNParam ¶m); bool Init(FusionConvBNParam *param); }; diff --git a/src/operators/kernel/conv_bn_relu_kernel.h b/src/operators/kernel/conv_bn_relu_kernel.h index 225976aa5db31096ef691ecefa8b63d4ae3dc277..91b3413116ae22a8e212cf149c4e0c2a8924664a 100644 --- a/src/operators/kernel/conv_bn_relu_kernel.h +++ b/src/operators/kernel/conv_bn_relu_kernel.h @@ -35,7 +35,7 @@ template class ConvBNReluKernel : public OpKernelBase> { public: - void Compute(const FusionConvBNReluParam ¶m) const; + void Compute(const FusionConvBNReluParam ¶m); bool Init(FusionConvBNReluParam *param); }; diff --git a/src/operators/kernel/conv_kernel.h b/src/operators/kernel/conv_kernel.h index 93474adaa97743d1850b53df114ae08f144aebca..cac498c36bd5debef0ff996cdf017355a2371a18 100644 --- a/src/operators/kernel/conv_kernel.h +++ b/src/operators/kernel/conv_kernel.h @@ -31,7 +31,7 @@ using framework::OpKernelBase; template class ConvKernel : public OpKernelBase> { public: - void Compute(const ConvParam ¶m) const; + void Compute(const ConvParam ¶m); bool Init(ConvParam *param); }; diff --git a/src/operators/kernel/conv_transpose_kernel.h b/src/operators/kernel/conv_transpose_kernel.h index 761370095cae9751eb479521d6378c4f7ccaefe5..6341a87d43fdb3a3ca63fadd90239bdf2a6921a8 100644 --- a/src/operators/kernel/conv_transpose_kernel.h +++ b/src/operators/kernel/conv_transpose_kernel.h @@ -28,7 +28,7 @@ template class ConvTransposeKernel : public OpKernelBase> { public: - void Compute(const ConvTransposeParam ¶m) const; + void Compute(const ConvTransposeParam ¶m); bool Init(ConvTransposeParam *param); }; diff --git 
a/src/operators/kernel/crf_kernel.h b/src/operators/kernel/crf_kernel.h index 71c07cf0384d482522de3a6652c6d24a22af656a..1436aafc0603d4c7ba9ecae911f10bd8f297852a 100644 --- a/src/operators/kernel/crf_kernel.h +++ b/src/operators/kernel/crf_kernel.h @@ -28,7 +28,7 @@ template class CrfKernel : public framework::OpKernelBase> { public: - void Compute(const CrfParam& param) const; + void Compute(const CrfParam& param); bool Init(CrfParam* param); }; } // namespace operators diff --git a/src/operators/kernel/deconv_relu_kernel.h b/src/operators/kernel/deconv_relu_kernel.h new file mode 100644 index 0000000000000000000000000000000000000000..bc85f1ffee19abe3941bd9d90fb8dfd04280ce14 --- /dev/null +++ b/src/operators/kernel/deconv_relu_kernel.h @@ -0,0 +1,39 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#ifdef FUSION_DECONVRELU_OP + +#pragma once + +#include "framework/operator.h" +#include "operators/op_param.h" + +namespace paddle_mobile { +namespace operators { + +using framework::OpKernelBase; + +template +class DeconvReluKernel + : public OpKernelBase> { + public: + void Compute(const FusionDeconvReluParam ¶m); + + bool Init(FusionDeconvReluParam *param); +}; + +} // namespace operators +} // namespace paddle_mobile + +#endif diff --git a/src/operators/kernel/depthwise_conv_kernel.h b/src/operators/kernel/depthwise_conv_kernel.h index 605b81cd6ed4ccd54b1803cf7a603b8f4576982d..3ee5bf86e97baa3970239e32b7fd5fc341e09f92 100644 --- a/src/operators/kernel/depthwise_conv_kernel.h +++ b/src/operators/kernel/depthwise_conv_kernel.h @@ -31,7 +31,7 @@ template class DepthwiseConvKernel : public OpKernelBase> { public: - void Compute(const ConvParam ¶m) const; + void Compute(const ConvParam ¶m); bool Init(ConvParam *param); }; } // namespace operators diff --git a/src/operators/kernel/dequantize_kernel.h b/src/operators/kernel/dequantize_kernel.h index d147e3f94ab87165cceac886289e74747906e047..6ba8ec88c52f20ccfcd30d5b9a217eaef658d507 100644 --- a/src/operators/kernel/dequantize_kernel.h +++ b/src/operators/kernel/dequantize_kernel.h @@ -26,7 +26,7 @@ template class DequantizeKernel : public framework::OpKernelBase> { public: - void Compute(const DequantizeParam ¶m) const; + void Compute(const DequantizeParam ¶m); bool Init(DequantizeParam *param); }; diff --git a/src/operators/kernel/dropout_kernel.h b/src/operators/kernel/dropout_kernel.h index b7535095d4fef11ee628aea96a074abcc3562f7f..2f59d01b6723eea274b1ed059ae08863a4937961 100644 --- a/src/operators/kernel/dropout_kernel.h +++ b/src/operators/kernel/dropout_kernel.h @@ -26,7 +26,7 @@ template class DropoutKernel : public framework::OpKernelBase> { public: - void Compute(const DropoutParam& param) const; + void Compute(const DropoutParam& param); bool Init(DropoutParam* para); }; } // namespace operators diff 
--git a/src/operators/kernel/dwconv_bn_relu_kernel.h b/src/operators/kernel/dwconv_bn_relu_kernel.h index 594c594cb00f8f4ddd8a511f3c992c4efbfcdfc6..f2e4c0afbd0aaafff5339816764f9e30592f122c 100644 --- a/src/operators/kernel/dwconv_bn_relu_kernel.h +++ b/src/operators/kernel/dwconv_bn_relu_kernel.h @@ -35,7 +35,7 @@ template class DWConvBNReluKernel : public OpKernelBase> { public: - void Compute(const FusionDWConvBNReluParam ¶m) const; + void Compute(const FusionDWConvBNReluParam ¶m); bool Init(FusionDWConvBNReluParam *param); }; diff --git a/src/operators/kernel/elementwise_add_kernel.h b/src/operators/kernel/elementwise_add_kernel.h index 67182af2e20e23c40effab6b87eefde1e0ab629d..8fa07e519ec0b78baffabd08fb7e524f8259c9eb 100644 --- a/src/operators/kernel/elementwise_add_kernel.h +++ b/src/operators/kernel/elementwise_add_kernel.h @@ -30,7 +30,7 @@ class ElementwiseAddKernel : public framework::OpKernelBase> { public: - void Compute(const ElementwiseAddParam ¶m) const; + void Compute(const ElementwiseAddParam ¶m); bool Init(ElementwiseAddParam *param); }; } // namespace operators diff --git a/src/operators/kernel/elementwise_add_relu_kernel.h b/src/operators/kernel/elementwise_add_relu_kernel.h index 5eda5a0c56c228ad54c888b6faa82ce9417f2dc1..d18c4e27fa3345b1818d0e6149fc8fb83195f644 100644 --- a/src/operators/kernel/elementwise_add_relu_kernel.h +++ b/src/operators/kernel/elementwise_add_relu_kernel.h @@ -29,7 +29,7 @@ class ElementwiseAddReluKernel : public framework::OpKernelBase> { public: - void Compute(const ElementwiseAddReluParam ¶m) const; + void Compute(const ElementwiseAddReluParam ¶m); bool Init(ElementwiseAddReluParam *param); }; } // namespace operators diff --git a/src/operators/kernel/elementwise_mul_kernel.h b/src/operators/kernel/elementwise_mul_kernel.h index 63f0df4815dc143e482140a855eb254bd016d50c..54baa50fcafb8ddbbefecb635ea85f120f16250d 100644 --- a/src/operators/kernel/elementwise_mul_kernel.h +++ b/src/operators/kernel/elementwise_mul_kernel.h 
@@ -28,7 +28,7 @@ class ElementwiseMulKernel : public framework::OpKernelBase> { public: - void Compute(const ElementwiseMulParam ¶m) const; + void Compute(const ElementwiseMulParam ¶m); bool Init(ElementwiseMulParam *param); }; } // namespace operators diff --git a/src/operators/kernel/elementwise_sub_kernel.h b/src/operators/kernel/elementwise_sub_kernel.h index 9516dcbd3de09debe233571eb5f60b3b8b19a2fa..89536b920837b57c4017ccadff7ea6e233cd999e 100644 --- a/src/operators/kernel/elementwise_sub_kernel.h +++ b/src/operators/kernel/elementwise_sub_kernel.h @@ -28,7 +28,7 @@ class ElementwiseSubKernel : public framework::OpKernelBase> { public: - void Compute(const ElementwiseSubParam ¶m) const; + void Compute(const ElementwiseSubParam ¶m); bool Init(ElementwiseSubParam *param); }; diff --git a/src/operators/kernel/fc_relu_kernel.h b/src/operators/kernel/fc_relu_kernel.h index 6e9446da37df4ba83db85d416aa87f216816c4a5..6735a50bee86e25d9f8d091b6218a472f3838aec 100644 --- a/src/operators/kernel/fc_relu_kernel.h +++ b/src/operators/kernel/fc_relu_kernel.h @@ -28,7 +28,7 @@ class FusionFcReluKernel : public framework::OpKernelBase> { public: - void Compute(const FusionFcReluParam& param) const; + void Compute(const FusionFcReluParam& param); bool Init(FusionFcReluParam* param); }; } // namespace operators diff --git a/src/operators/kernel/feed_kernel.h b/src/operators/kernel/feed_kernel.h new file mode 100644 index 0000000000000000000000000000000000000000..2b1220fee534040e5ccae5aee84adf3b4b6290b9 --- /dev/null +++ b/src/operators/kernel/feed_kernel.h @@ -0,0 +1,32 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include "framework/operator.h" +#include "operators/op_param.h" + +namespace paddle_mobile { +namespace operators { +using namespace framework; +template +class FeedKernel + : public framework::OpKernelBase> { + public: + void Compute(const FeedParam ¶m); + bool Init(FeedParam *param); +}; + +} // namespace operators +} // namespace paddle_mobile diff --git a/src/operators/kernel/fetch_kernel.h b/src/operators/kernel/fetch_kernel.h new file mode 100644 index 0000000000000000000000000000000000000000..d9ed91855d0db5149cc8cf4f5d571afd1fbea98f --- /dev/null +++ b/src/operators/kernel/fetch_kernel.h @@ -0,0 +1,34 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#pragma once + +#include "framework/operator.h" +#include "operators/op_param.h" + +namespace paddle_mobile { +namespace operators { + +using namespace framework; + +template +class FetchKernel + : public framework::OpKernelBase> { + public: + void Compute(const FetchParam ¶m); + bool Init(FetchParam *param); +}; + +} // namespace operators +} // namespace paddle_mobile diff --git a/src/operators/kernel/flatten_kernel.h b/src/operators/kernel/flatten_kernel.h index 80d66ccf87c21532c8b4590d992f5bccbe4f00dc..4846725bcb6522389d29e137980b9d53e63f9f32 100644 --- a/src/operators/kernel/flatten_kernel.h +++ b/src/operators/kernel/flatten_kernel.h @@ -28,7 +28,7 @@ template class FlattenKernel : public framework::OpKernelBase> { public: - void Compute(const FlattenParam& param) const; + void Compute(const FlattenParam& param); bool Init(FlattenParam* param); }; } // namespace operators diff --git a/src/operators/kernel/fpga/concat_kernel.cpp b/src/operators/kernel/fpga/V1/concat_kernel.cpp similarity index 99% rename from src/operators/kernel/fpga/concat_kernel.cpp rename to src/operators/kernel/fpga/V1/concat_kernel.cpp index f61afd4a5c514ced87396313ea5d645fe830e12a..6644bfd83e57a7fd147c0cc6383e64eb2ad79e51 100644 --- a/src/operators/kernel/fpga/concat_kernel.cpp +++ b/src/operators/kernel/fpga/V1/concat_kernel.cpp @@ -58,7 +58,7 @@ bool ConcatKernel::Init(ConcatParam *param) { } template <> -void ConcatKernel::Compute(const ConcatParam ¶m) const { +void ConcatKernel::Compute(const ConcatParam ¶m) { ComputeFPGAConcat(param.FpgaArgs()); } template class ConcatKernel; diff --git a/src/operators/kernel/fpga/conv_add_bn_kernel.cpp b/src/operators/kernel/fpga/V1/conv_add_bn_kernel.cpp similarity index 98% rename from src/operators/kernel/fpga/conv_add_bn_kernel.cpp rename to src/operators/kernel/fpga/V1/conv_add_bn_kernel.cpp index 9b3944fc9a9ab308d9fe8b791a34e09651b87e6e..679a95ff54168da821ed0debb80b6bce8eca407b 100644 --- 
a/src/operators/kernel/fpga/conv_add_bn_kernel.cpp +++ b/src/operators/kernel/fpga/V1/conv_add_bn_kernel.cpp @@ -78,7 +78,7 @@ bool ConvAddBNKernel::Init(FusionConvAddBNParam *param) { template <> void ConvAddBNKernel::Compute( - const FusionConvAddBNParam ¶m) const { + const FusionConvAddBNParam ¶m) { fpga::ComputeFpgaConv(param.FpgaArgs()); } diff --git a/src/operators/kernel/fpga/conv_add_bn_relu_kernel.cpp b/src/operators/kernel/fpga/V1/conv_add_bn_relu_kernel.cpp similarity index 98% rename from src/operators/kernel/fpga/conv_add_bn_relu_kernel.cpp rename to src/operators/kernel/fpga/V1/conv_add_bn_relu_kernel.cpp index 83f74e97d04eda29f3aaa6a0cc16ed7d194321d8..6c99750eb824940b32a857ee2baffc72bce05a7a 100644 --- a/src/operators/kernel/fpga/conv_add_bn_relu_kernel.cpp +++ b/src/operators/kernel/fpga/V1/conv_add_bn_relu_kernel.cpp @@ -76,7 +76,7 @@ bool ConvAddBNReluKernel::Init( template <> void ConvAddBNReluKernel::Compute( - const FusionConvAddBNReluParam ¶m) const { + const FusionConvAddBNReluParam ¶m) { fpga::ComputeFpgaConv(param.FpgaArgs()); } diff --git a/src/operators/kernel/fpga/conv_add_relu_kernel.cpp b/src/operators/kernel/fpga/V1/conv_add_relu_kernel.cpp similarity index 97% rename from src/operators/kernel/fpga/conv_add_relu_kernel.cpp rename to src/operators/kernel/fpga/V1/conv_add_relu_kernel.cpp index 4975f2a905dcd76c5b7f013eafaa376dd2bb1646..ce2fbbda0ee4c7e0a1e97b45674ef269df3be3be 100644 --- a/src/operators/kernel/fpga/conv_add_relu_kernel.cpp +++ b/src/operators/kernel/fpga/V1/conv_add_relu_kernel.cpp @@ -58,7 +58,7 @@ bool ConvAddReluKernel::Init(FusionConvAddReluParam *param) { template <> void ConvAddReluKernel::Compute( - const FusionConvAddReluParam ¶m) const { + const FusionConvAddReluParam ¶m) { fpga::ComputeFpgaConv(param.FpgaArgs()); } diff --git a/src/operators/kernel/fpga/conv_bn_kernel.cpp b/src/operators/kernel/fpga/V1/conv_bn_kernel.cpp similarity index 96% rename from src/operators/kernel/fpga/conv_bn_kernel.cpp rename to 
src/operators/kernel/fpga/V1/conv_bn_kernel.cpp index 276e71b6a44e9a7beba0d5db2f51472a9927d8da..ac9f19e411a87bb31e320df504a0e1c88e195454 100644 --- a/src/operators/kernel/fpga/conv_bn_kernel.cpp +++ b/src/operators/kernel/fpga/V1/conv_bn_kernel.cpp @@ -69,8 +69,7 @@ bool ConvBNKernel::Init(FusionConvBNParam *param) { } template <> -void ConvBNKernel::Compute( - const FusionConvBNParam ¶m) const { +void ConvBNKernel::Compute(const FusionConvBNParam ¶m) { fpga::ComputeFpgaConv(param.FpgaArgs()); } diff --git a/src/operators/kernel/fpga/conv_bn_relu_kernel.cpp b/src/operators/kernel/fpga/V1/conv_bn_relu_kernel.cpp similarity index 98% rename from src/operators/kernel/fpga/conv_bn_relu_kernel.cpp rename to src/operators/kernel/fpga/V1/conv_bn_relu_kernel.cpp index f519a37cb57378a603969adae255f88ae8a5df2a..4c9eb391ada9366478877494fbe466d5cf919327 100644 --- a/src/operators/kernel/fpga/conv_bn_relu_kernel.cpp +++ b/src/operators/kernel/fpga/V1/conv_bn_relu_kernel.cpp @@ -70,7 +70,7 @@ bool ConvBNReluKernel::Init(FusionConvBNReluParam *param) { template <> void ConvBNReluKernel::Compute( - const FusionConvBNReluParam ¶m) const { + const FusionConvBNReluParam ¶m) { fpga::ComputeFpgaConv(param.FpgaArgs()); } diff --git a/src/operators/kernel/fpga/dropout_kernel.cpp b/src/operators/kernel/fpga/V1/dropout_kernel.cpp similarity index 91% rename from src/operators/kernel/fpga/dropout_kernel.cpp rename to src/operators/kernel/fpga/V1/dropout_kernel.cpp index b0981c4254060996a16f4ae5beabb7c22edd6d34..8b990d46e0b90bf67eaf36bbf38238fd4432ace6 100644 --- a/src/operators/kernel/fpga/dropout_kernel.cpp +++ b/src/operators/kernel/fpga/V1/dropout_kernel.cpp @@ -26,8 +26,7 @@ bool DropoutKernel::Init(DropoutParam *param) { } template <> -void DropoutKernel::Compute( - const DropoutParam ¶m) const {} +void DropoutKernel::Compute(const DropoutParam ¶m) {} } // namespace operators } // namespace paddle_mobile diff --git a/src/operators/kernel/fpga/elementwise_add_relu_kernel.cpp 
b/src/operators/kernel/fpga/V1/elementwise_add_relu_kernel.cpp similarity index 97% rename from src/operators/kernel/fpga/elementwise_add_relu_kernel.cpp rename to src/operators/kernel/fpga/V1/elementwise_add_relu_kernel.cpp index b592dd6d59a5d5cec8f12ef304099d2b89a10a05..5253d4d0d3e00190b4ed594279d9190659ec6026 100644 --- a/src/operators/kernel/fpga/elementwise_add_relu_kernel.cpp +++ b/src/operators/kernel/fpga/V1/elementwise_add_relu_kernel.cpp @@ -56,7 +56,7 @@ bool ElementwiseAddReluKernel::Init( template <> void ElementwiseAddReluKernel::Compute( - const ElementwiseAddReluParam ¶m) const { + const ElementwiseAddReluParam ¶m) { fpga::ComputeFpgaEWAdd(param.FpgaArgs()); } } // namespace operators diff --git a/src/operators/kernel/fpga/fc_relu_kernel.cpp b/src/operators/kernel/fpga/V1/fc_relu_kernel.cpp similarity index 98% rename from src/operators/kernel/fpga/fc_relu_kernel.cpp rename to src/operators/kernel/fpga/V1/fc_relu_kernel.cpp index 52d7c0a4e69080e11f86d1507829e7e779a69228..2c6b616689dca14474d1cbdc3769b438de1358e4 100644 --- a/src/operators/kernel/fpga/fc_relu_kernel.cpp +++ b/src/operators/kernel/fpga/V1/fc_relu_kernel.cpp @@ -61,7 +61,7 @@ bool FusionFcReluKernel::Init(FusionFcReluParam *param) { } template <> void FusionFcReluKernel::Compute( - const FusionFcReluParam ¶m) const { + const FusionFcReluParam ¶m) { fpga::ComputeFpgaConv(param.FpgaArgs()); } diff --git a/src/operators/kernel/fpga/V1/feed_kernel.cpp b/src/operators/kernel/fpga/V1/feed_kernel.cpp new file mode 100644 index 0000000000000000000000000000000000000000..161d8c9f0cf22ac79d1367e07b8ba3318a7a7123 --- /dev/null +++ b/src/operators/kernel/fpga/V1/feed_kernel.cpp @@ -0,0 +1,55 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "operators/kernel/feed_kernel.h" + +namespace paddle_mobile { +namespace operators { + +template <> +bool FeedKernel::Init(FeedParam *param) { + Tensor *output = param->Out(); + fpga::format_fp16_ofm(output); + return true; +} + +template <> +void FeedKernel::Compute(const FeedParam ¶m) { + auto input = + reinterpret_cast(const_cast(param.InputX())); + auto input_ptr = input->data(); + fpga::format_image(input); + Tensor *output = param.Out(); + auto output_ptr = output->data(); + + fpga::BypassArgs args = {fpga::DATA_TYPE_FP32}; + + args.input_data_type = fpga::DATA_TYPE_FP32; + args.output_data_type = fpga::DATA_TYPE_FP16; + args.input_layout_type = fpga::LAYOUT_CHW; + args.output_layout_type = fpga::LAYOUT_HWC; + args.image.address = reinterpret_cast(input_ptr); + args.image.channels = (uint32_t)input->dims()[1]; + args.image.height = (uint32_t)input->dims()[2]; + args.image.width = (uint32_t)input->dims()[3]; + args.image.pad_height = 0; + args.image.pad_width = 0; + args.output.address = output_ptr; + args.output.scale_address = output->scale; + fpga::PerformBypass(args); +} +template class FeedKernel; + +} // namespace operators +} // namespace paddle_mobile diff --git a/src/operators/kernel/fpga/V1/fetch_kernel.cpp b/src/operators/kernel/fpga/V1/fetch_kernel.cpp new file mode 100644 index 0000000000000000000000000000000000000000..e6e4591168b90cbe19b207cd9e77eaf5cd07de80 --- /dev/null +++ b/src/operators/kernel/fpga/V1/fetch_kernel.cpp @@ -0,0 +1,33 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. 
+ +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "operators/kernel/fetch_kernel.h" + +namespace paddle_mobile { +namespace operators { + +template <> +bool FetchKernel::Init(FetchParam *param) { + return true; +} + +template <> +void FetchKernel::Compute(const FetchParam ¶m) { + param.Out()->ShareDataWith(*(param.InputX())); +} + +template class FetchKernel; + +} // namespace operators +} // namespace paddle_mobile diff --git a/src/operators/kernel/fpga/fusion_fc_kernel.cpp b/src/operators/kernel/fpga/V1/fusion_fc_kernel.cpp similarity index 96% rename from src/operators/kernel/fpga/fusion_fc_kernel.cpp rename to src/operators/kernel/fpga/V1/fusion_fc_kernel.cpp index 407e14238d542604e876ced624d5a0db698a6101..9258fb90e1e6bf9a597a387843ce781858628139 100644 --- a/src/operators/kernel/fpga/fusion_fc_kernel.cpp +++ b/src/operators/kernel/fpga/V1/fusion_fc_kernel.cpp @@ -62,8 +62,7 @@ bool FusionFcKernel::Init(FusionFcParam *param) { } template <> -void FusionFcKernel::Compute( - const FusionFcParam ¶m) const { +void FusionFcKernel::Compute(const FusionFcParam ¶m) { fpga::ComputeFpgaConv(param.FpgaArgs()); } } // namespace operators diff --git a/src/operators/kernel/fpga/pool_kernel.cpp b/src/operators/kernel/fpga/V1/pool_kernel.cpp similarity index 96% rename from src/operators/kernel/fpga/pool_kernel.cpp rename to src/operators/kernel/fpga/V1/pool_kernel.cpp index 6269506836c25d756040cd25cf9b0189fd03d89b..8eefc3e9bea0b3662b4c08409f16f86dab60968a 100644 --- 
a/src/operators/kernel/fpga/pool_kernel.cpp +++ b/src/operators/kernel/fpga/V1/pool_kernel.cpp @@ -34,7 +34,7 @@ bool PoolKernel::Init(PoolParam *param) { fpga::PoolingArgs poolArgs = {0}; poolArgs.mode = pooling_type == "max" ? 0 : 1; // max:0, avg:1 poolArgs.kernel_reciprocal = - fpga::fp32_2_fp16(float(1.0 / (ksize[0] * ksize[1]))); + fpga::fp32_2_fp16(float(1.0 / (ksize[0] * ksize[1]))); // NOLINT poolArgs.image.address = input_ptr; poolArgs.image.channels = (uint32_t)input->dims()[1]; poolArgs.image.height = (uint32_t)input->dims()[2]; @@ -53,7 +53,7 @@ bool PoolKernel::Init(PoolParam *param) { } template <> -void PoolKernel::Compute(const PoolParam ¶m) const { +void PoolKernel::Compute(const PoolParam ¶m) { fpga::ComputeFpgaPool(param.FpgaArgs()); } } // namespace operators diff --git a/src/operators/kernel/fpga/softmax_kernel.cpp b/src/operators/kernel/fpga/V1/softmax_kernel.cpp similarity index 88% rename from src/operators/kernel/fpga/softmax_kernel.cpp rename to src/operators/kernel/fpga/V1/softmax_kernel.cpp index e36db57f4b4f18712df50b2b132cdd1032a41921..37c03e2404f761f3089adb852b94bef27bec1ce9 100644 --- a/src/operators/kernel/fpga/softmax_kernel.cpp +++ b/src/operators/kernel/fpga/V1/softmax_kernel.cpp @@ -14,11 +14,9 @@ limitations under the License. 
*/ #ifdef SOFTMAX_OP -#include "../softmax_kernel.h" -#include "../central-arm-func/softmax_arm_func.h" -#include "common/types.h" -#include "fpga/api.h" -#include "operators/math/softmax.h" +#include "operators/kernel/softmax_kernel.h" +#include "operators/kernel/central-arm-func/softmax_arm_func.h" + namespace paddle_mobile { namespace operators { @@ -47,8 +45,7 @@ bool SoftmaxKernel::Init(SoftmaxParam *param) { } template <> -void SoftmaxKernel::Compute( - const SoftmaxParam ¶m) const { +void SoftmaxKernel::Compute(const SoftmaxParam ¶m) { Tensor *in_x = param.FloatInput(); Tensor *out = param.Out(); diff --git a/src/operators/kernel/fpga/V2/concat_kernel.cpp b/src/operators/kernel/fpga/V2/concat_kernel.cpp new file mode 100644 index 0000000000000000000000000000000000000000..7f9ab66d48489dbecae01f819bd607c582f6145b --- /dev/null +++ b/src/operators/kernel/fpga/V2/concat_kernel.cpp @@ -0,0 +1,79 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#ifdef CONCAT_OP + +#include "operators/kernel/concat_kernel.h" +#include "fpga/V2/api.h" + +namespace paddle_mobile { +namespace operators { + +template <> +bool ConcatKernel::Init(ConcatParam *param) { + auto inputs = param->Inputs(); + auto out = param->Out(); + auto image_num = inputs.size(); + auto images_in = + (half **)fpga::fpga_malloc(image_num * sizeof(int *)); // NOLINT + auto scales_in = + (float **)fpga::fpga_malloc(image_num * sizeof(float *)); // NOLINT + auto channel_num = + (uint32_t *)fpga::fpga_malloc(image_num * sizeof(uint32_t)); // NOLINT + auto aligned_channel_num = + (uint32_t *)fpga::fpga_malloc(image_num * sizeof(uint32_t)); // NOLINT + + auto height = inputs[0]->dims()[2]; + auto width = inputs[0]->dims()[3]; + auto out_channel = + (uint32_t)fpga::get_aligned_channel_num((int)out->dims()[1]); // NOLINT + for (int i = 0; i < image_num; i++) { + auto input = inputs[i]; + PADDLE_MOBILE_ENFORCE( + input->dims()[2] == height && input->dims()[3] == width, + "Image height & width should be unified"); + images_in[i] = (half *)input->data(); // NOLINT + channel_num[i] = (uint32_t)inputs[i]->dims()[1]; + aligned_channel_num[i] = + (uint32_t)fpga::get_aligned_channel_num(channel_num[i]); + scales_in[i] = input->scale; + } + fpga::format_concat_output(out, (int)height, (int)width, // NOLINT + out_channel); + + fpga::ConcatArgs concatArgs = {0}; + concatArgs.image_num = (uint32_t)image_num; + concatArgs.images_in = images_in; + concatArgs.scales_in = scales_in; + concatArgs.image_out = (half *)out->data(); // NOLINT + concatArgs.scale_out = out->scale; + concatArgs.channel_num = channel_num; + concatArgs.aligned_channel_num = aligned_channel_num; + concatArgs.out_channel = out_channel; + concatArgs.height = (uint32_t)height; + concatArgs.width = (uint32_t)width; + param->SetFpgaArgs(concatArgs); + return true; +} + +template <> +void ConcatKernel::Compute(const ConcatParam ¶m) { + fpga::ComputeFPGAConcat(param.FpgaArgs()); +} +template class 
ConcatKernel; + +} // namespace operators +} // namespace paddle_mobile + +#endif diff --git a/src/operators/kernel/fpga/V2/conv_add_bn_kernel.cpp b/src/operators/kernel/fpga/V2/conv_add_bn_kernel.cpp new file mode 100644 index 0000000000000000000000000000000000000000..7c03daf7797dbc09ba85a4f4e32e983571d192df --- /dev/null +++ b/src/operators/kernel/fpga/V2/conv_add_bn_kernel.cpp @@ -0,0 +1,82 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#ifdef FUSION_CONVADDBN_OP + +#include "operators/kernel/conv_add_bn_kernel.h" + +namespace paddle_mobile { +namespace operators { + +template <> +bool ConvAddBNKernel::Init(FusionConvAddBNParam *param) { + bool relu_enabled = false; + auto input = const_cast(param->Input()); + + auto bias = param->Bias(); + auto bias_ptr = bias->data(); + auto filter = const_cast(param->Filter()); + + auto out = param->Output(); + + auto bn_mean_ptr = param->InputMean()->data(); + auto bn_var_ptr = param->InputVariance()->data(); + auto bn_scale_ptr = param->InputScale()->data(); + auto bn_bias_ptr = param->InputBias()->data(); + const float epsilon = param->Epsilon(); + PADDLE_MOBILE_ENFORCE(out->dims()[1] == bias->dims()[0] && + bias->dims()[0] == param->InputBias()->dims()[0], + "Output channel should be equal to bias number"); + + const int channel = out->dims()[1]; + auto bs_ptr = + reinterpret_cast(fpga::fpga_malloc(2 * channel * sizeof(float))); + auto new_scale = new Tensor(); + auto 
new_bias = new Tensor(); + auto new_scale_ptr = new_scale->mutable_data({channel}); + auto new_bias_ptr = new_bias->mutable_data({channel}); + + for (int i = 0; i < channel; i++) { + new_scale_ptr[i] = bn_scale_ptr[i] / + static_cast(pow((bn_var_ptr[i] + epsilon), 0.5)); + new_bias_ptr[i] = + bn_bias_ptr[i] + (bias_ptr[i] - bn_mean_ptr[i]) * new_scale_ptr[i]; + bs_ptr[i + channel] = new_scale_ptr[i]; + bs_ptr[i] = new_bias_ptr[i]; + } + param->SetNewScale(new_scale); + param->SetNewBias(new_bias); + + fpga::format_conv_data(filter, out, bs_ptr, param->Groups()); + + fpga::SplitConvArgs conv_arg = {0}; + fpga::fill_split_arg(&conv_arg, input, out, filter, relu_enabled, + param->Groups(), param->Strides()[0], + param->Strides()[1], param->Paddings()[0], + param->Paddings()[1], bs_ptr); + param->SetFpgaArgs(conv_arg); + + return true; +} + +template <> +void ConvAddBNKernel::Compute( + const FusionConvAddBNParam ¶m) { + fpga::ComputeFpgaConv(param.FpgaArgs()); +} + +} // namespace operators +} // namespace paddle_mobile + +#endif diff --git a/src/operators/kernel/fpga/V2/conv_add_bn_relu_kernel.cpp b/src/operators/kernel/fpga/V2/conv_add_bn_relu_kernel.cpp new file mode 100644 index 0000000000000000000000000000000000000000..8737554e6f8c343491656ca7659e1850d84ea246 --- /dev/null +++ b/src/operators/kernel/fpga/V2/conv_add_bn_relu_kernel.cpp @@ -0,0 +1,79 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#ifdef FUSION_CONVADDBNRELU_OP + +#include "operators/kernel/conv_add_bn_relu_kernel.h" + +namespace paddle_mobile { +namespace operators { + +template <> +bool ConvAddBNReluKernel::Init( + FusionConvAddBNReluParam *param) { + bool relu_enabled = true; + auto input = const_cast(param->Input()); + const Tensor *bias = param->Bias(); + auto bias_ptr = bias->data(); + auto filter = const_cast(param->Filter()); + auto out = param->Output(); + auto bn_mean_ptr = param->InputMean()->data(); + auto bn_var_ptr = param->InputVariance()->data(); + auto bn_scale_ptr = param->InputScale()->data(); + auto bn_bias_ptr = param->InputBias()->data(); + const float epsilon = param->Epsilon(); + PADDLE_MOBILE_ENFORCE(out->dims()[1] == bias->dims()[0] && + bias->dims()[0] == param->InputBias()->dims()[0], + "Output channel should be equal to bias number"); + + const int channel = out->dims()[1]; + auto bs_ptr = + (float *)fpga::fpga_malloc(2 * channel * sizeof(float)); // NOLINT + auto new_scale = new Tensor(); + auto new_bias = new Tensor(); + auto new_scale_ptr = new_scale->mutable_data({channel}); + auto new_bias_ptr = new_bias->mutable_data({channel}); + + for (int i = 0; i < channel; i++) { + new_scale_ptr[i] = bn_scale_ptr[i] / + static_cast(pow((bn_var_ptr[i] + epsilon), 0.5)); + new_bias_ptr[i] = + bn_bias_ptr[i] + (bias_ptr[i] - bn_mean_ptr[i]) * new_scale_ptr[i]; + bs_ptr[i + 2] = new_scale_ptr[i]; + bs_ptr[i] = new_bias_ptr[i]; + } + param->SetNewScale(new_scale); + param->SetNewBias(new_bias); + + fpga::format_conv_data(filter, out, bs_ptr, param->Groups()); + + fpga::SplitConvArgs conv_arg = {0}; + fpga::fill_split_arg(&conv_arg, input, out, filter, relu_enabled, + param->Groups(), param->Strides()[0], + param->Strides()[1], param->Paddings()[0], + param->Paddings()[1], bs_ptr); + param->SetFpgaArgs(conv_arg); + return true; +} + +template <> +void ConvAddBNReluKernel::Compute( + const FusionConvAddBNReluParam ¶m) { + fpga::ComputeFpgaConv(param.FpgaArgs()); +} + +} 
// namespace operators +} // namespace paddle_mobile + +#endif diff --git a/src/operators/kernel/fpga/V2/conv_add_relu_kernel.cpp b/src/operators/kernel/fpga/V2/conv_add_relu_kernel.cpp new file mode 100644 index 0000000000000000000000000000000000000000..a3c4443645e421ee0dce10f53914600fb7af75bf --- /dev/null +++ b/src/operators/kernel/fpga/V2/conv_add_relu_kernel.cpp @@ -0,0 +1,61 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#ifdef FUSION_CONVADDRELU_OP + +#include "operators/kernel/conv_add_relu_kernel.h" + +namespace paddle_mobile { +namespace operators { + +template <> +bool ConvAddReluKernel::Init(FusionConvAddReluParam *param) { + bool relu_enabled = true; + auto input = const_cast(param->Input()); + const Tensor *bias = param->Bias(); + auto bias_ptr = bias->data(); + auto filter = const_cast(param->Filter()); + auto out = param->Output(); + + PADDLE_MOBILE_ENFORCE(out->dims()[1] == bias->dims()[0], + "Output channel should be equal to bias number"); + int channel = out->dims()[1]; + auto bs_ptr = + (float *)fpga::fpga_malloc(2 * channel * sizeof(float)); // NOLINT + for (int i = 0; i < channel; i++) { + bs_ptr[i + channel] = 1; + bs_ptr[i] = bias_ptr[i]; + } + + fpga::format_conv_data(filter, out, bs_ptr, param->Groups()); + + fpga::SplitConvArgs conv_arg = {0}; + fpga::fill_split_arg(&conv_arg, input, out, filter, relu_enabled, + param->Groups(), param->Strides()[0], + param->Strides()[1], 
param->Paddings()[0], + param->Paddings()[1], bs_ptr); + param->SetFpgaArgs(conv_arg); + return true; +} + +template <> +void ConvAddReluKernel::Compute( + const FusionConvAddReluParam ¶m) { + fpga::ComputeFpgaConv(param.FpgaArgs()); +} + +} // namespace operators +} // namespace paddle_mobile + +#endif diff --git a/src/operators/kernel/fpga/V2/conv_bn_kernel.cpp b/src/operators/kernel/fpga/V2/conv_bn_kernel.cpp new file mode 100644 index 0000000000000000000000000000000000000000..070fce98b9e5f0c7055943447602dba8ae78c7c4 --- /dev/null +++ b/src/operators/kernel/fpga/V2/conv_bn_kernel.cpp @@ -0,0 +1,72 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#ifdef FUSION_CONVBN_OP + +#include "operators/kernel/conv_bn_kernel.h" + +namespace paddle_mobile { +namespace operators { + +template <> +bool ConvBNKernel::Init(FusionConvBNParam *param) { + bool relu_enabled = false; + auto input = const_cast(param->Input()); + auto filter = const_cast(param->Filter()); + auto out = param->Output(); + auto bn_mean_ptr = param->InputMean()->data(); + auto bn_var_ptr = param->InputVariance()->data(); + auto bn_scale_ptr = param->InputScale()->data(); + auto bn_bias_ptr = param->InputBias()->data(); + const float epsilon = param->Epsilon(); + PADDLE_MOBILE_ENFORCE(out->dims()[1] == param->InputBias()->dims()[0], + "Output channel should be equal to bias number"); + const int channel = out->dims()[1]; + auto bs_ptr = + (float *)fpga::fpga_malloc(2 * channel * sizeof(float)); // // NOLINT + auto new_scale = new Tensor(); + auto new_bias = new Tensor(); + auto new_scale_ptr = new_scale->mutable_data({channel}); + auto new_bias_ptr = new_bias->mutable_data({channel}); + + for (int i = 0; i < channel; i++) { + new_scale_ptr[i] = bn_scale_ptr[i] / + static_cast(pow((bn_var_ptr[i] + epsilon), 0.5)); + new_bias_ptr[i] = bn_bias_ptr[i] + (0 - bn_mean_ptr[i]) * new_scale_ptr[i]; + bs_ptr[i + channel] = new_scale_ptr[i]; + bs_ptr[i] = new_bias_ptr[i]; + } + param->SetNewScale(new_scale); + param->SetNewBias(new_bias); + + fpga::format_conv_data(filter, out, bs_ptr, param->Groups()); + + fpga::SplitConvArgs conv_arg = {0}; + fpga::fill_split_arg(&conv_arg, input, out, filter, relu_enabled, + param->Groups(), param->Strides()[0], + param->Strides()[1], param->Paddings()[0], + param->Paddings()[1], bs_ptr); + param->SetFpgaArgs(conv_arg); + return true; +} + +template <> +void ConvBNKernel::Compute(const FusionConvBNParam ¶m) { + fpga::ComputeFpgaConv(param.FpgaArgs()); +} + +} // namespace operators +} // namespace paddle_mobile + +#endif diff --git a/src/operators/kernel/fpga/V2/conv_bn_relu_kernel.cpp 
b/src/operators/kernel/fpga/V2/conv_bn_relu_kernel.cpp new file mode 100644 index 0000000000000000000000000000000000000000..95ac74cbf87fe20ef419e748f8a8a04df20c98e3 --- /dev/null +++ b/src/operators/kernel/fpga/V2/conv_bn_relu_kernel.cpp @@ -0,0 +1,73 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#ifdef FUSION_CONVBNRELU_OP + +#include "operators/kernel/conv_bn_relu_kernel.h" + +namespace paddle_mobile { +namespace operators { + +template <> +bool ConvBNReluKernel::Init(FusionConvBNReluParam *param) { + bool relu_enabled = true; + auto input = const_cast(param->Input()); + auto filter = const_cast(param->Filter()); + auto out = param->Output(); + auto bn_mean_ptr = param->InputMean()->data(); + auto bn_var_ptr = param->InputVariance()->data(); + auto bn_scale_ptr = param->InputScale()->data(); + auto bn_bias_ptr = param->InputBias()->data(); + const float epsilon = param->Epsilon(); + PADDLE_MOBILE_ENFORCE(out->dims()[1] == param->InputBias()->dims()[0], + "Output channel should be equal to bias number"); + const int channel = out->dims()[1]; + auto bs_ptr = + (float *)fpga::fpga_malloc(2 * channel * sizeof(float)); // NOLINT + auto new_scale = new Tensor(); + auto new_bias = new Tensor(); + auto new_scale_ptr = new_scale->mutable_data({channel}); + auto new_bias_ptr = new_bias->mutable_data({channel}); + + for (int i = 0; i < channel; i++) { + new_scale_ptr[i] = bn_scale_ptr[i] / + static_cast(pow((bn_var_ptr[i] 
+ epsilon), 0.5)); + new_bias_ptr[i] = bn_bias_ptr[i] + (0 - bn_mean_ptr[i]) * new_scale_ptr[i]; + bs_ptr[i + channel] = new_scale_ptr[i]; + bs_ptr[i] = new_bias_ptr[i]; + } + param->SetNewScale(new_scale); + param->SetNewBias(new_bias); + + fpga::format_conv_data(filter, out, bs_ptr, param->Groups()); + + fpga::SplitConvArgs conv_arg = {0}; + fpga::fill_split_arg(&conv_arg, input, out, filter, relu_enabled, + param->Groups(), param->Strides()[0], + param->Strides()[1], param->Paddings()[0], + param->Paddings()[1], bs_ptr); + param->SetFpgaArgs(conv_arg); + return true; +} + +template <> +void ConvBNReluKernel::Compute( + const FusionConvBNReluParam ¶m) { + fpga::ComputeFpgaConv(param.FpgaArgs()); +} + +} // namespace operators +} // namespace paddle_mobile + +#endif diff --git a/src/operators/kernel/fpga/V2/conv_transpose_kernel.cpp b/src/operators/kernel/fpga/V2/conv_transpose_kernel.cpp new file mode 100644 index 0000000000000000000000000000000000000000..3284ddcdece3ab7fcf4fb4458a59d39c452ad1ce --- /dev/null +++ b/src/operators/kernel/fpga/V2/conv_transpose_kernel.cpp @@ -0,0 +1,34 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#ifdef CONV_TRANSPOSE_OP + +#include "operators/kernel/conv_transpose_kernel.h" + +namespace paddle_mobile { +namespace operators { + +template <> +bool ConvTransposeKernel::Init(ConvTransposeParam *param) { + return true; +} + +template <> +void ConvTransposeKernel::Compute( + const ConvTransposeParam ¶m) {} + +} // namespace operators +} // namespace paddle_mobile + +#endif diff --git a/src/operators/kernel/fpga/V2/deconv_relu_kernel.cpp b/src/operators/kernel/fpga/V2/deconv_relu_kernel.cpp new file mode 100644 index 0000000000000000000000000000000000000000..bf3556609a4ec2476521a9b8e80192f71aef4f52 --- /dev/null +++ b/src/operators/kernel/fpga/V2/deconv_relu_kernel.cpp @@ -0,0 +1,36 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#ifdef FUSION_DECONVRELU_OP + +#include "operators/kernel/deconv_relu_kernel.h" +#include "framework/operator.h" +#include "operators/op_param.h" + +namespace paddle_mobile { +namespace operators { + +template <> +bool DeconvReluKernel::Init(FusionDeconvReluParam *param) { + return true; +} + +template <> +void DeconvReluKernel::Compute( + const FusionDeconvReluParam ¶m) {} + +} // namespace operators +} // namespace paddle_mobile + +#endif diff --git a/src/operators/kernel/fpga/V2/dropout_kernel.cpp b/src/operators/kernel/fpga/V2/dropout_kernel.cpp new file mode 100644 index 0000000000000000000000000000000000000000..8b990d46e0b90bf67eaf36bbf38238fd4432ace6 --- /dev/null +++ b/src/operators/kernel/fpga/V2/dropout_kernel.cpp @@ -0,0 +1,34 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#ifdef DROPOUT_OP + +#include "operators/kernel/dropout_kernel.h" + +namespace paddle_mobile { +namespace operators { + +template <> +bool DropoutKernel::Init(DropoutParam *param) { + param->Out()->ShareDataWith(*param->InputX()); + return true; +} + +template <> +void DropoutKernel::Compute(const DropoutParam ¶m) {} + +} // namespace operators +} // namespace paddle_mobile + +#endif diff --git a/src/operators/kernel/fpga/V2/elementwise_add_kernel.cpp b/src/operators/kernel/fpga/V2/elementwise_add_kernel.cpp new file mode 100644 index 0000000000000000000000000000000000000000..4b5085f26123994effa319826d84f2f249c80847 --- /dev/null +++ b/src/operators/kernel/fpga/V2/elementwise_add_kernel.cpp @@ -0,0 +1,65 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ +#ifdef ELEMENTWISEADD_OP + +#include "operators/kernel/elementwise_add_kernel.h" + +namespace paddle_mobile { +namespace operators { + +template <> +bool ElementwiseAddKernel::Init(ElementwiseAddParam *param) { + bool relu_enabled = false; + auto *input_x = const_cast(param->InputX()); + auto *input_y = const_cast(param->InputY()); + auto *out = param->Out(); + auto input_x_ptr = input_x->data(); + auto input_y_ptr = input_y->data(); + int aligned_channel_num = fpga::get_aligned_channel_num(input_x->dims()[1]); + fpga::format_fp16_ofm(out, aligned_channel_num); + auto out_ptr = out->mutable_data(); + + fpga::EWAddArgs ewaddArgs = {0}; + ewaddArgs.relu_enabled = relu_enabled; + ewaddArgs.const0 = 0x3c00; // =1 + ewaddArgs.const1 = 0x3c00; // =1 + ewaddArgs.image0.address = input_x_ptr; + ewaddArgs.image0.channels = (uint32_t)input_x->dims()[1]; + ewaddArgs.image0.scale_address = input_x->scale; + ewaddArgs.image0.height = (uint32_t)input_x->dims()[2]; + ewaddArgs.image0.width = (uint32_t)input_x->dims()[3]; + ewaddArgs.image0.pad_height = 0; + ewaddArgs.image0.pad_width = 0; + ewaddArgs.image1.address = input_y_ptr; + ewaddArgs.image1.channels = (uint32_t)input_y->dims()[1]; + ewaddArgs.image1.scale_address = input_y->scale; + ewaddArgs.image1.height = (uint32_t)input_y->dims()[2]; + ewaddArgs.image1.width = (uint32_t)input_y->dims()[3]; + ewaddArgs.image1.pad_height = 0; + ewaddArgs.image1.pad_width = 0; + ewaddArgs.output.scale_address = out->scale; + ewaddArgs.output.address = out_ptr; + param->SetFpgaArgs(ewaddArgs); + return true; +} + +template <> +void ElementwiseAddKernel::Compute( + const ElementwiseAddParam ¶m) { + fpga::ComputeFpgaEWAdd(param.FpgaArgs()); +} +} // namespace operators +} // namespace paddle_mobile + +#endif diff --git a/src/operators/kernel/fpga/V2/elementwise_add_relu_kernel.cpp b/src/operators/kernel/fpga/V2/elementwise_add_relu_kernel.cpp new file mode 100644 index 
0000000000000000000000000000000000000000..571987b3bf2a88c0d4ad648c7cb1966b538983a5 --- /dev/null +++ b/src/operators/kernel/fpga/V2/elementwise_add_relu_kernel.cpp @@ -0,0 +1,66 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ +#ifdef FUSION_ELEMENTWISEADDRELU_OP + +#include "operators/kernel/elementwise_add_relu_kernel.h" + +namespace paddle_mobile { +namespace operators { + +template <> +bool ElementwiseAddReluKernel::Init( + ElementwiseAddReluParam *param) { + bool relu_enabled = false; + auto *input_x = const_cast(param->InputX()); + auto *input_y = const_cast(param->InputY()); + auto *out = param->Out(); + auto input_x_ptr = input_x->data(); + auto input_y_ptr = input_y->data(); + int aligned_channel_num = fpga::get_aligned_channel_num(input_x->dims()[1]); + fpga::format_fp16_ofm(out, aligned_channel_num); + auto out_ptr = out->mutable_data(); + + fpga::EWAddArgs ewaddArgs = {0}; + ewaddArgs.relu_enabled = relu_enabled; + ewaddArgs.const0 = 0x3c00; // =1 + ewaddArgs.const1 = 0x3c00; // =1 + ewaddArgs.image0.address = input_x_ptr; + ewaddArgs.image0.channels = (uint32_t)input_x->dims()[1]; + ewaddArgs.image0.scale_address = input_x->scale; + ewaddArgs.image0.height = (uint32_t)input_x->dims()[2]; + ewaddArgs.image0.width = (uint32_t)input_x->dims()[3]; + ewaddArgs.image0.pad_height = 0; + ewaddArgs.image0.pad_width = 0; + ewaddArgs.image1.address = input_y_ptr; + ewaddArgs.image1.channels = 
(uint32_t)input_y->dims()[1]; + ewaddArgs.image1.scale_address = input_y->scale; + ewaddArgs.image1.height = (uint32_t)input_y->dims()[2]; + ewaddArgs.image1.width = (uint32_t)input_y->dims()[3]; + ewaddArgs.image1.pad_height = 0; + ewaddArgs.image1.pad_width = 0; + ewaddArgs.output.scale_address = out->scale; + ewaddArgs.output.address = out_ptr; + param->SetFpgaArgs(ewaddArgs); + return true; +} + +template <> +void ElementwiseAddReluKernel::Compute( + const ElementwiseAddReluParam ¶m) { + fpga::ComputeFpgaEWAdd(param.FpgaArgs()); +} +} // namespace operators +} // namespace paddle_mobile + +#endif diff --git a/src/operators/kernel/fpga/V2/fc_relu_kernel.cpp b/src/operators/kernel/fpga/V2/fc_relu_kernel.cpp new file mode 100644 index 0000000000000000000000000000000000000000..ba869aaca7f3f5d5c598feb3837a59a3a738493b --- /dev/null +++ b/src/operators/kernel/fpga/V2/fc_relu_kernel.cpp @@ -0,0 +1,65 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ +#ifdef FUSION_FCRELU_OP +#include "operators/kernel/fc_relu_kernel.h" + +namespace paddle_mobile { +namespace operators { + +template <> +bool FusionFcReluKernel::Init(FusionFcReluParam *param) { + bool relu_enabled = true; + auto input_x = const_cast(param->InputX()); + auto filter = const_cast(param->InputY()); + auto input_z = param->InputZ(); + auto input_z_ptr = input_z->data(); + auto out = param->Out(); + PADDLE_MOBILE_ENFORCE(input_x->dims()[1] == filter->dims()[0], + "Image channel should be equal to weight number"); + int channel = (uint32_t)out->dims()[1]; + auto bs_ptr = + (float *)fpga::fpga_malloc(2 * channel * sizeof(float)); // NOLINT + for (int i = 0; i < channel; i++) { + bs_ptr[i + channel] = 1; + bs_ptr[i] = input_z_ptr[i]; + } + + int num = (uint32_t)filter->dims()[1]; + int chw = (uint32_t)filter->dims()[0]; + PADDLE_MOBILE_ENFORCE( + chw == input_x->numel(), + "Filter element num should be equal to IFM element num"); + int height = (uint32_t)input_x->dims()[2]; + int width = (uint32_t)input_x->dims()[3]; + int filter_channel = chw / height / width; + + out->Resize(framework::make_ddim({1, channel, 1, 1})); + filter->Resize(framework::make_ddim({num, filter_channel, height, width})); + fpga::format_fc_data(filter, out, bs_ptr); + + fpga::SplitConvArgs conv_arg = {0}; + fpga::fill_split_arg(&conv_arg, input_x, out, filter, relu_enabled, 1, 1, 1, + 0, 0, bs_ptr); + param->SetFpgaArgs(conv_arg); + return true; +} +template <> +void FusionFcReluKernel::Compute( + const FusionFcReluParam ¶m) { + fpga::ComputeFpgaConv(param.FpgaArgs()); +} + +} // namespace operators +} // namespace paddle_mobile +#endif diff --git a/src/operators/kernel/fpga/V2/feed_kernel.cpp b/src/operators/kernel/fpga/V2/feed_kernel.cpp new file mode 100644 index 0000000000000000000000000000000000000000..4092307083bd38346b03857b8e9ec858795f3941 --- /dev/null +++ b/src/operators/kernel/fpga/V2/feed_kernel.cpp @@ -0,0 +1,56 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. 
All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "operators/kernel/feed_kernel.h" + +namespace paddle_mobile { +namespace operators { + +template <> +bool FeedKernel::Init(FeedParam *param) { + Tensor *output = param->Out(); + int aligned_channel = fpga::get_aligned_channel_num(output->dims()[1]); + fpga::format_fp16_ofm(output, aligned_channel); + return true; +} + +template <> +void FeedKernel::Compute(const FeedParam ¶m) { + auto input = + reinterpret_cast(const_cast(param.InputX())); + auto input_ptr = input->data(); + fpga::format_image(input); + Tensor *output = param.Out(); + auto output_ptr = output->data(); + + fpga::BypassArgs args = {fpga::DATA_TYPE_FP32}; + + args.input_data_type = fpga::DATA_TYPE_FP32; + args.output_data_type = fpga::DATA_TYPE_FP16; + args.input_layout_type = fpga::LAYOUT_CHW; + args.output_layout_type = fpga::LAYOUT_HWC; + args.image.address = reinterpret_cast(input_ptr); + args.image.channels = (uint32_t)input->dims()[1]; + args.image.height = (uint32_t)input->dims()[2]; + args.image.width = (uint32_t)input->dims()[3]; + args.image.pad_height = 0; + args.image.pad_width = 0; + args.output.address = output_ptr; + args.output.scale_address = output->scale; + fpga::PerformBypass(args); +} +template class FeedKernel; + +} // namespace operators +} // namespace paddle_mobile diff --git a/src/operators/kernel/fpga/V2/fetch_kernel.cpp b/src/operators/kernel/fpga/V2/fetch_kernel.cpp new file mode 100644 index 
0000000000000000000000000000000000000000..e6e4591168b90cbe19b207cd9e77eaf5cd07de80 --- /dev/null +++ b/src/operators/kernel/fpga/V2/fetch_kernel.cpp @@ -0,0 +1,33 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "operators/kernel/fetch_kernel.h" + +namespace paddle_mobile { +namespace operators { + +template <> +bool FetchKernel::Init(FetchParam *param) { + return true; +} + +template <> +void FetchKernel::Compute(const FetchParam ¶m) { + param.Out()->ShareDataWith(*(param.InputX())); +} + +template class FetchKernel; + +} // namespace operators +} // namespace paddle_mobile diff --git a/src/operators/kernel/fpga/V2/fusion_fc_kernel.cpp b/src/operators/kernel/fpga/V2/fusion_fc_kernel.cpp new file mode 100644 index 0000000000000000000000000000000000000000..589c21d667f39e24e8f62abafd38ab30523dd2de --- /dev/null +++ b/src/operators/kernel/fpga/V2/fusion_fc_kernel.cpp @@ -0,0 +1,66 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+See the License for the specific language governing permissions and +limitations under the License. */ +#ifdef FUSION_FC_OP + +#include "operators/kernel/fusion_fc_kernel.h" + +namespace paddle_mobile { +namespace operators { + +template <> +bool FusionFcKernel::Init(FusionFcParam *param) { + bool relu_enabled = false; + auto input_x = const_cast(param->InputX()); + auto filter = const_cast(param->InputY()); + const Tensor *input_z = param->InputZ(); + auto input_z_ptr = input_z->data(); + auto out = param->Out(); + + PADDLE_MOBILE_ENFORCE(input_x->dims()[1] == filter->dims()[0], + "Image channel should be equal to weight number"); + int channel = (uint32_t)out->dims()[1]; + auto bs_ptr = + (float *)fpga::fpga_malloc(2 * channel * sizeof(float)); // NOLINT + for (int i = 0; i < channel; i++) { + bs_ptr[i + channel] = 1; + bs_ptr[i] = input_z_ptr[i]; + } + int num = (uint32_t)filter->dims()[1]; + int chw = (uint32_t)filter->dims()[0]; + PADDLE_MOBILE_ENFORCE( + chw == input_x->numel(), + "Filter element num should be equal to IFM element num"); + int height = (uint32_t)input_x->dims()[2]; + int width = (uint32_t)input_x->dims()[3]; + int filter_channel = chw / height / width; + + out->Resize(framework::make_ddim({1, channel, 1, 1})); + filter->Resize(framework::make_ddim({num, filter_channel, height, width})); + fpga::format_fc_data(filter, out, bs_ptr); + + fpga::SplitConvArgs conv_arg = {0}; + fpga::fill_split_arg(&conv_arg, input_x, out, filter, relu_enabled, 1, 1, 1, + 0, 0, bs_ptr); + param->SetFpgaArgs(conv_arg); + return true; +} + +template <> +void FusionFcKernel::Compute(const FusionFcParam ¶m) { + fpga::ComputeFpgaConv(param.FpgaArgs()); +} +} // namespace operators +} // namespace paddle_mobile + +#endif diff --git a/src/operators/kernel/fpga/V2/pool_kernel.cpp b/src/operators/kernel/fpga/V2/pool_kernel.cpp new file mode 100644 index 0000000000000000000000000000000000000000..480aca4eb318c18618db4f7bb498d21c10f857c8 --- /dev/null +++ 
b/src/operators/kernel/fpga/V2/pool_kernel.cpp @@ -0,0 +1,64 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ +#ifdef POOL_OP + +#include "operators/kernel/pool_kernel.h" + +class PoolingArgs; +namespace paddle_mobile { +namespace operators { + +template <> +bool PoolKernel::Init(PoolParam *param) { + auto *input = const_cast(param->Input()); + auto input_ptr = input->data(); + Tensor *output = param->Output(); + int aligned_channel_num = + fpga::get_aligned_channel_num((int)output->dims()[1]); // NOLINT + fpga::format_fp16_ofm(output, aligned_channel_num); + auto output_ptr = output->mutable_data(); + vector ksize = param->Ksize(); + vector strides = param->Strides(); + vector paddings = param->Paddings(); + std::string pooling_type = param->PoolingType(); + + fpga::PoolingArgs poolArgs = {0}; + poolArgs.mode = pooling_type == "max" ? 
0 : 1; // max:0, avg:1 + poolArgs.kernel_reciprocal = + fpga::fp32_2_fp16(float(1.0 / (ksize[0] * ksize[1]))); // NOLINT + poolArgs.image.address = input_ptr; + poolArgs.image.channels = (uint32_t)input->dims()[1]; + poolArgs.image.height = (uint32_t)input->dims()[2]; + poolArgs.image.width = (uint32_t)input->dims()[3]; + poolArgs.image.pad_height = (uint32_t)paddings[0]; + poolArgs.image.pad_width = (uint32_t)paddings[1]; + poolArgs.image.scale_address = input->scale; + poolArgs.output.address = output_ptr; + poolArgs.output.scale_address = output->scale; + poolArgs.kernel.height = (uint32_t)ksize[0]; + poolArgs.kernel.width = (uint32_t)ksize[1]; + poolArgs.kernel.stride_h = (uint32_t)strides[0]; + poolArgs.kernel.stride_w = (uint32_t)strides[1]; + param->SetFpgaArgs(poolArgs); + return true; +} + +template <> +void PoolKernel::Compute(const PoolParam ¶m) { + fpga::ComputeFpgaPool(param.FpgaArgs()); +} +} // namespace operators +} // namespace paddle_mobile + +#endif diff --git a/src/operators/kernel/fpga/V2/slice_kernel.cpp b/src/operators/kernel/fpga/V2/slice_kernel.cpp new file mode 100644 index 0000000000000000000000000000000000000000..bc3fbfd796fac693a319ed2ab24023b3ffb84863 --- /dev/null +++ b/src/operators/kernel/fpga/V2/slice_kernel.cpp @@ -0,0 +1,30 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#ifdef SLICE_OP + +#include "operators/kernel/slice_kernel.h" + +namespace paddle_mobile { +namespace operators { +template <> +bool SliceKernel::Init(SliceParam* param) { + return true; +} +template <> +void SliceKernel::Compute(const SliceParam& param) {} + +} // namespace operators +} // namespace paddle_mobile +#endif diff --git a/src/operators/kernel/fpga/V2/softmax_kernel.cpp b/src/operators/kernel/fpga/V2/softmax_kernel.cpp new file mode 100644 index 0000000000000000000000000000000000000000..bbdb35b715b60b25079c007a74b8b1e901cc9a59 --- /dev/null +++ b/src/operators/kernel/fpga/V2/softmax_kernel.cpp @@ -0,0 +1,63 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#ifdef SOFTMAX_OP + +#include "operators/kernel/softmax_kernel.h" +#include "operators/kernel/central-arm-func/softmax_arm_func.h" +namespace paddle_mobile { +namespace operators { + +template <> +bool SoftmaxKernel::Init(SoftmaxParam *param) { + auto input = const_cast(param->InputX()); + auto input_ptr = input->data(); + auto float_input = new Tensor; + float_input->mutable_data({1, input->dims()[1]}); + fpga::format_fp32_ofm(float_input, 8); + + fpga::BypassArgs args = {fpga::DATA_TYPE_FP16}; + args.input_layout_type = fpga::LAYOUT_HWC; + args.output_layout_type = fpga::LAYOUT_CHW; + args.input_data_type = fpga::DATA_TYPE_FP16; + args.output_data_type = fpga::DATA_TYPE_FP32; + args.image.address = input_ptr; + args.image.height = 1; + args.image.width = 1; + args.image.channels = (uint32_t)input->dims()[1]; + args.output.address = float_input->data(); + args.output.scale_address = float_input->scale; + param->SetFloatInput(float_input); + param->SetFpgaArgs(args); + return true; +} + +template <> +void SoftmaxKernel::Compute(const SoftmaxParam ¶m) { + Tensor *in_x = param.FloatInput(); + Tensor *out = param.Out(); + + fpga::PerformBypass(param.FpgaArgs()); + fpga::fpga_invalidate( + (void *)in_x->data(), // NOLINT + fpga::get_aligned_channel_num((int)in_x->dims()[1]) * // NOLINT + sizeof(float)); + math::SoftmaxFuntor()(in_x, out); + fpga::fpga_flush(out->data(), out->memory_size()); +} + +} // namespace operators +} // namespace paddle_mobile + +#endif diff --git a/src/operators/kernel/fpga/V2/tanh_kernel.cpp b/src/operators/kernel/fpga/V2/tanh_kernel.cpp new file mode 100644 index 0000000000000000000000000000000000000000..46dd3a0f6f8819f6485243a445725554943ab2bf --- /dev/null +++ b/src/operators/kernel/fpga/V2/tanh_kernel.cpp @@ -0,0 +1,33 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#ifdef TANH_OP + +#include "operators/kernel/tanh_kernel.h" + +namespace paddle_mobile { +namespace operators { + +template <> +bool TanhKernel::Init(TanhParam *param) { + return true; +} + +template <> +void TanhKernel::Compute(const TanhParam ¶m) {} + +} // namespace operators +} // namespace paddle_mobile + +#endif diff --git a/src/operators/kernel/fusion_fc_kernel.h b/src/operators/kernel/fusion_fc_kernel.h index 06d3981bd23708aee982e38d82ba592d69733a89..b8086bc66fbef7ec952548a3cb863cfa031c504e 100644 --- a/src/operators/kernel/fusion_fc_kernel.h +++ b/src/operators/kernel/fusion_fc_kernel.h @@ -27,7 +27,7 @@ template class FusionFcKernel : public framework::OpKernelBase> { public: - void Compute(const FusionFcParam& param) const; + void Compute(const FusionFcParam& param); bool Init(FusionFcParam* param); }; diff --git a/src/operators/kernel/gru_kernel.h b/src/operators/kernel/gru_kernel.h index 6b02663bd0e2982bdb2480c54632d2a8da9f67fc..b03b2e3ecb514fdf962bde9c06620fa6e64934df 100644 --- a/src/operators/kernel/gru_kernel.h +++ b/src/operators/kernel/gru_kernel.h @@ -28,7 +28,7 @@ template class GruKernel : public framework::OpKernelBase> { public: - void Compute(const GruParam& param) const; + void Compute(const GruParam& param); bool Init(GruParam* param); }; } // namespace operators diff --git a/src/operators/kernel/im2sequence_kernel.h b/src/operators/kernel/im2sequence_kernel.h index df93ea5abacda1a5291caa53dc5dae7ea2b5d710..b15eb68996a990f6bc770db6940be83a0eea0cbf 100644 --- a/src/operators/kernel/im2sequence_kernel.h +++ 
b/src/operators/kernel/im2sequence_kernel.h @@ -29,7 +29,7 @@ template class Im2SequenceKernel : public framework::OpKernelBase> { public: - void Compute(const Im2SequenceParam& param) const; + void Compute(const Im2SequenceParam& param); bool Init(Im2SequenceParam* para); }; } // namespace operators diff --git a/src/operators/kernel/lookup_kernel.h b/src/operators/kernel/lookup_kernel.h index 73f6cfcced078382b40526eae1f6560d7d168b97..8c29349e737b0fba95688e1ebb8fe893a29b2a4f 100644 --- a/src/operators/kernel/lookup_kernel.h +++ b/src/operators/kernel/lookup_kernel.h @@ -28,7 +28,7 @@ template class LookupKernel : public framework::OpKernelBase> { public: - void Compute(const LookupParam& param) const; + void Compute(const LookupParam& param); bool Init(LookupParam* param); }; } // namespace operators diff --git a/src/operators/kernel/lrn_kernel.h b/src/operators/kernel/lrn_kernel.h index 164178f1dcc0ee2523fc9c5fdc4736c14a3e55ce..99dbfe2d658cde17e6399f8ea4bc5b945092cde5 100644 --- a/src/operators/kernel/lrn_kernel.h +++ b/src/operators/kernel/lrn_kernel.h @@ -12,6 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ +#pragma once + #ifdef LRN_OP #ifdef _OPENMP #include @@ -173,7 +175,7 @@ template class LrnKernel : public framework::OpKernelBase> { public: - void Compute(const LrnParam ¶m) const; + void Compute(const LrnParam ¶m); bool Init(LrnParam *param); }; } // namespace operators diff --git a/src/operators/kernel/mali/batchnorm_kernel.cpp b/src/operators/kernel/mali/batchnorm_kernel.cpp index 50f6ef5f566347c089869c30b8f7534a4f8b6779..5d50ca9a7250f66f20b6bfaf0d93db18014d791c 100755 --- a/src/operators/kernel/mali/batchnorm_kernel.cpp +++ b/src/operators/kernel/mali/batchnorm_kernel.cpp @@ -145,7 +145,7 @@ bool BatchNormKernel::Init(BatchNormParam* param) { template <> void BatchNormKernel::Compute( - const BatchNormParam& param) const { + const BatchNormParam& param) { std::cout << "init acl" << std::endl; AclBatchNormOp* acl_op = reinterpret_cast*>(this->GetAclOp()); diff --git a/src/operators/kernel/mali/concat_kernel.cpp b/src/operators/kernel/mali/concat_kernel.cpp index 267c0101a8f66de3d508dbe5795c87ee5027a288..2fb05ab10eccf4e0dca9c74bbcc83067b438e981 100644 --- a/src/operators/kernel/mali/concat_kernel.cpp +++ b/src/operators/kernel/mali/concat_kernel.cpp @@ -118,7 +118,7 @@ bool ConcatKernel::Init(ConcatParam* param) { template <> void ConcatKernel::Compute( - const ConcatParam& param) const { + const ConcatParam& param) { std::cout << "init acl" << std::endl; AclConcatOp* acl_op = reinterpret_cast*>(this->GetAclOp()); diff --git a/src/operators/kernel/mali/conv_add_kernel.cpp b/src/operators/kernel/mali/conv_add_kernel.cpp index 74cace00dd2dead7a5d9ddfc76e2d48c67cccf89..427bcd596f71bf434ea155d04f192c5bdedfded5 100644 --- a/src/operators/kernel/mali/conv_add_kernel.cpp +++ b/src/operators/kernel/mali/conv_add_kernel.cpp @@ -212,7 +212,7 @@ bool ConvAddKernel::Init(FusionConvAddParam* param) { template <> void ConvAddKernel::Compute( - const FusionConvAddParam& param) const { + const FusionConvAddParam& param) { std::cout << "init acl" << std::endl; AclConvAddOp* 
acl_op = reinterpret_cast*>(this->GetAclOp()); diff --git a/src/operators/kernel/mali/conv_kernel.cpp b/src/operators/kernel/mali/conv_kernel.cpp index 7852e64990e5a2cd6f3d7e803e71c23c55aa7a27..7cca16274ecc7ae1707f8d5ed8faf2fde810ab30 100644 --- a/src/operators/kernel/mali/conv_kernel.cpp +++ b/src/operators/kernel/mali/conv_kernel.cpp @@ -211,8 +211,7 @@ bool ConvKernel::Init(ConvParam* param) { } template <> -void ConvKernel::Compute( - const ConvParam& param) const { +void ConvKernel::Compute(const ConvParam& param) { std::cout << "init acl" << std::endl; AclConvOp* acl_op = reinterpret_cast*>(this->GetAclOp()); diff --git a/src/operators/kernel/mali/elementwise_add_kernel.cpp b/src/operators/kernel/mali/elementwise_add_kernel.cpp index 5596476e1bb33ecc2b3122bf237090b099307156..3711a946b508c9ad71f59dd85f2e01c99bccc9e5 100644 --- a/src/operators/kernel/mali/elementwise_add_kernel.cpp +++ b/src/operators/kernel/mali/elementwise_add_kernel.cpp @@ -34,7 +34,7 @@ bool ElementwiseAddKernel::Init( template <> void ElementwiseAddKernel::Compute( - const ElementwiseAddParam ¶m) const { + const ElementwiseAddParam ¶m) { const Tensor *input_x = param.InputX(); const Tensor *input_y = param.InputY(); Tensor *Out = param.Out(); diff --git a/src/operators/kernel/mali/feed_kernel.cpp b/src/operators/kernel/mali/feed_kernel.cpp new file mode 100644 index 0000000000000000000000000000000000000000..6af6c1a88b8031da4a23dad1d3269935ce81b9a8 --- /dev/null +++ b/src/operators/kernel/mali/feed_kernel.cpp @@ -0,0 +1,36 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "operators/kernel/feed_kernel.h" + +namespace paddle_mobile { +namespace operators { + +template <> +bool FeedKernel::Init(FeedParam *param) { + return true; +} + +template <> +void FeedKernel::Compute(const FeedParam ¶m) { + param.Out()->ShareDataWith(*(param.InputX())); + param.Out()->set_lod(param.InputX()->lod()); +} + +template class FeedKernel; + +} // namespace operators +} // namespace paddle_mobile + +#endif diff --git a/src/operators/kernel/mali/fetch_kernel.cpp b/src/operators/kernel/mali/fetch_kernel.cpp new file mode 100644 index 0000000000000000000000000000000000000000..f74280cfb322b8135d99ca7fb7e2652a08588bb3 --- /dev/null +++ b/src/operators/kernel/mali/fetch_kernel.cpp @@ -0,0 +1,36 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ +#ifdef FUSION_CONVADD_OP + +#include "operators/kernel/fetch_kernel.h" + +namespace paddle_mobile { +namespace operators { + +template <> +bool FetchKernel::Init(FetchParam *param) { + return true; +} + +template <> +void FetchKernel::Compute(const FetchParam ¶m) { + param.Out()->ShareDataWith(*(param.InputX())); +} + +template class FetchKernel; + +} // namespace operators +} // namespace paddle_mobile + +#endif diff --git a/src/operators/kernel/mali/fushion_fc_kernel.cpp b/src/operators/kernel/mali/fushion_fc_kernel.cpp index c3197f38c6c6ee1a4f4f684c824a9a9e43d69d4f..5e59215834ce00e902deb19e54e149b3b4cfb8ac 100755 --- a/src/operators/kernel/mali/fushion_fc_kernel.cpp +++ b/src/operators/kernel/mali/fushion_fc_kernel.cpp @@ -26,7 +26,7 @@ bool FusionFcKernel::Init(FusionFcParam *param) { template <> void FusionFcKernel::Compute( - const FusionFcParam ¶m) const { + const FusionFcParam ¶m) { const Tensor *input_x = param.InputX(); const Tensor *input_y = param.InputY(); const Tensor *input_z = param.InputZ(); diff --git a/src/operators/kernel/mali/lrn_kernel.cpp b/src/operators/kernel/mali/lrn_kernel.cpp index fc088f735c538bedc4d5c79593aa31c48acc4fc6..b46c9680d576ead3e7ab309c08894654a9fad04a 100644 --- a/src/operators/kernel/mali/lrn_kernel.cpp +++ b/src/operators/kernel/mali/lrn_kernel.cpp @@ -127,8 +127,7 @@ bool LrnKernel::Init(LrnParam* param) { } template <> -void LrnKernel::Compute( - const LrnParam& param) const { +void LrnKernel::Compute(const LrnParam& param) { std::cout << "init acl" << std::endl; AclLrnOp* acl_op = reinterpret_cast*>(this->GetAclOp()); diff --git a/src/operators/kernel/mali/mul_kernel.cpp b/src/operators/kernel/mali/mul_kernel.cpp index a9e54dad2b51c595be4f68df3916a4803047617e..da69f5e6fe5a4ec95373011d360cd4d9e20a8a61 100644 --- a/src/operators/kernel/mali/mul_kernel.cpp +++ b/src/operators/kernel/mali/mul_kernel.cpp @@ -27,8 +27,7 @@ bool MulKernel::Init(MulParam *param) { } template <> -void MulKernel::Compute( - const MulParam ¶m) 
const { +void MulKernel::Compute(const MulParam ¶m) { const Tensor *input_x = param.InputX(); const Tensor *input_y = param.InputY(); Tensor *out = param.Out(); diff --git a/src/operators/kernel/mali/pool_kernel.cpp b/src/operators/kernel/mali/pool_kernel.cpp index 33b3bd7017739144a519bfb1be247b4751883779..ec5d35a8f600d63a623b468c9c97c3540bf9c3f7 100644 --- a/src/operators/kernel/mali/pool_kernel.cpp +++ b/src/operators/kernel/mali/pool_kernel.cpp @@ -195,8 +195,7 @@ bool PoolKernel::Init(PoolParam* param) { } template <> -void PoolKernel::Compute( - const PoolParam& param) const { +void PoolKernel::Compute(const PoolParam& param) { std::cout << "init acl" << std::endl; AclPoolOp* acl_op = reinterpret_cast*>(this->GetAclOp()); diff --git a/src/operators/kernel/mali/relu_kernel.cpp b/src/operators/kernel/mali/relu_kernel.cpp index 10b270800dee1a0ad8176da1f788100d29b60173..68bb52af3ab9b262218223d971b044edd759b347 100644 --- a/src/operators/kernel/mali/relu_kernel.cpp +++ b/src/operators/kernel/mali/relu_kernel.cpp @@ -115,8 +115,7 @@ bool ReluKernel::Init(ReluParam* param) { } template <> -void ReluKernel::Compute( - const ReluParam& param) const { +void ReluKernel::Compute(const ReluParam& param) { std::cout << "init acl" << std::endl; AclReluOp* acl_op = reinterpret_cast*>(this->GetAclOp()); diff --git a/src/operators/kernel/mali/reshape_kernel.cpp b/src/operators/kernel/mali/reshape_kernel.cpp index 69c077e252162017cb477a000b5f17f5a968fc10..f98906c0a982c10896e75101eaa2732d75d6cdf4 100644 --- a/src/operators/kernel/mali/reshape_kernel.cpp +++ b/src/operators/kernel/mali/reshape_kernel.cpp @@ -28,7 +28,7 @@ bool ReshapeKernel::Init(ReshapeParam *param) { template <> void ReshapeKernel::Compute( - const ReshapeParam ¶m) const { + const ReshapeParam ¶m) { const auto *input_x = param.InputX(); const auto &input_x_dims = input_x->dims(); auto *out = param.Out(); diff --git a/src/operators/kernel/mali/softmax_kernel.cpp b/src/operators/kernel/mali/softmax_kernel.cpp 
index d4f25c96cc47d7baa394645d4e0c84e0e3f7ad29..d6ce1ecb61c2790c68883231eb6b90dcde43a956 100644 --- a/src/operators/kernel/mali/softmax_kernel.cpp +++ b/src/operators/kernel/mali/softmax_kernel.cpp @@ -113,7 +113,7 @@ bool SoftmaxKernel::Init(SoftmaxParam* param) { template <> void SoftmaxKernel::Compute( - const SoftmaxParam& param) const { + const SoftmaxParam& param) { std::cout << "init acl" << std::endl; AclSoftmaxOp* acl_op = reinterpret_cast*>(this->GetAclOp()); diff --git a/src/operators/kernel/mul_kernel.h b/src/operators/kernel/mul_kernel.h index e441de4d4495b736aec248c0ef85191b32bfcbf9..8deb4a2cb74786257ddfc12c805c4a7d56589bbf 100644 --- a/src/operators/kernel/mul_kernel.h +++ b/src/operators/kernel/mul_kernel.h @@ -29,7 +29,7 @@ template class MulKernel : public framework::OpKernelBase> { public: - void Compute(const MulParam ¶m) const; + void Compute(const MulParam ¶m); bool Init(MulParam *param); }; } // namespace operators diff --git a/src/operators/kernel/multiclass_nms_kernel.h b/src/operators/kernel/multiclass_nms_kernel.h index b1b20ddd81b395ea94ae62b1abf2fe861d9257db..6a4ac0c22941aa364f05e38c7abaf29948cd324b 100644 --- a/src/operators/kernel/multiclass_nms_kernel.h +++ b/src/operators/kernel/multiclass_nms_kernel.h @@ -28,7 +28,7 @@ class MultiClassNMSKernel : public framework::OpKernelBase> { public: - void Compute(const MultiClassNMSParam& param) const; + void Compute(const MultiClassNMSParam& param); bool Init(MultiClassNMSParam* param); }; } // namespace operators diff --git a/src/operators/kernel/polygon_box_transform_kernel.h b/src/operators/kernel/polygon_box_transform_kernel.h index d5baf32cc7dca0aee1eb0b7c13895e806f70320a..6ed003a4c794e7293ae3506909a779f95a677579 100644 --- a/src/operators/kernel/polygon_box_transform_kernel.h +++ b/src/operators/kernel/polygon_box_transform_kernel.h @@ -27,7 +27,7 @@ class PolygonBoxTransformKernel : public framework::OpKernelBase> { public: - void Compute(const PolygonBoxTransformParam& param) const; 
+ void Compute(const PolygonBoxTransformParam& param); bool Init(PolygonBoxTransformParam* param); }; } // namespace operators diff --git a/src/operators/kernel/pool_kernel.h b/src/operators/kernel/pool_kernel.h index 2be254444cc410fb95a94125cccb224ca9505545..ff80e0e44536d924026dbbe80a09677c069a8f6b 100644 --- a/src/operators/kernel/pool_kernel.h +++ b/src/operators/kernel/pool_kernel.h @@ -26,7 +26,7 @@ using framework::OpKernelBase; template class PoolKernel : public OpKernelBase> { public: - void Compute(const PoolParam ¶m) const override; + void Compute(const PoolParam ¶m); bool Init(PoolParam *param); }; } // namespace operators diff --git a/src/operators/kernel/prelu_kernel.h b/src/operators/kernel/prelu_kernel.h index f6c7c3ac7f139cf7eafe8843ef48e53c90292082..c043149243f21f2abceeed37c5d0e81a61e5059f 100644 --- a/src/operators/kernel/prelu_kernel.h +++ b/src/operators/kernel/prelu_kernel.h @@ -24,7 +24,7 @@ template class PReluKernel : public framework::OpKernelBase> { public: - void Compute(const PReluParam& param) const; + void Compute(const PReluParam& param); }; } // namespace operators } // namespace paddle_mobile diff --git a/src/operators/kernel/prior_box_kernel.h b/src/operators/kernel/prior_box_kernel.h index 5640375483d42d52965986dab6795254bbf4b908..921d5901a8f24abab61f7aa94663385d91e597a7 100644 --- a/src/operators/kernel/prior_box_kernel.h +++ b/src/operators/kernel/prior_box_kernel.h @@ -54,7 +54,7 @@ template class PriorBoxKernel : public framework::OpKernelBase> { public: - void Compute(const PriorBoxParam& param) const; + void Compute(const PriorBoxParam& param); bool Init(PriorBoxParam* param); }; } // namespace operators diff --git a/src/operators/kernel/quantize_kernel.h b/src/operators/kernel/quantize_kernel.h index c55ca2182acd0f459c785f29d359ea9039a7350a..d864e00d9c80003d06d460f85b6fddda40e6d607 100644 --- a/src/operators/kernel/quantize_kernel.h +++ b/src/operators/kernel/quantize_kernel.h @@ -26,7 +26,7 @@ template class QuantizeKernel 
: public framework::OpKernelBase> { public: - void Compute(const QuantizeParam ¶m) const; + void Compute(const QuantizeParam ¶m); bool Init(QuantizeParam *param); }; diff --git a/src/operators/kernel/relu_kernel.h b/src/operators/kernel/relu_kernel.h index b0c32791d626f14b0840ce1c8f3f12f02b403d97..48f47c2de6df8d3aa9461fba915fd1a6406d4b9f 100644 --- a/src/operators/kernel/relu_kernel.h +++ b/src/operators/kernel/relu_kernel.h @@ -27,7 +27,7 @@ template class ReluKernel : public framework::OpKernelBase> { public: - void Compute(const ReluParam& param) const; + void Compute(const ReluParam& param); bool Init(ReluParam* param); }; } // namespace operators diff --git a/src/operators/kernel/reshape2_kernel.h b/src/operators/kernel/reshape2_kernel.h new file mode 100644 index 0000000000000000000000000000000000000000..c6ab3cf72a29612249d0ff08e56ef60ca30d59a8 --- /dev/null +++ b/src/operators/kernel/reshape2_kernel.h @@ -0,0 +1,36 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#ifdef RESHAPE2_OP + +#pragma once + +#include +#include "framework/operator.h" +#include "operators/op_param.h" + +namespace paddle_mobile { +namespace operators { + +template +class Reshape2Kernel + : public framework::OpKernelBase> { + public: + void Compute(const Reshape2Param& param); + bool Init(Reshape2Param* param); +}; +} // namespace operators +} // namespace paddle_mobile + +#endif diff --git a/src/operators/kernel/reshape_kernel.h b/src/operators/kernel/reshape_kernel.h index 73eb63f797f34ec4eb2baec8c4ab79fafb06f0e2..a5405654874320cdfe3432d16d3a8c6358d2d8e1 100644 --- a/src/operators/kernel/reshape_kernel.h +++ b/src/operators/kernel/reshape_kernel.h @@ -71,7 +71,7 @@ template class ReshapeKernel : public framework::OpKernelBase> { public: - void Compute(const ReshapeParam& param) const; + void Compute(const ReshapeParam& param); bool Init(ReshapeParam* param); }; } // namespace operators diff --git a/src/operators/kernel/resize_kernel.h b/src/operators/kernel/resize_kernel.h index 7102d2f4bc9bc64d53fa40697cf2b7a68d8be566..b25a0dcef5d291f03e4bb1a127eb0b592ee89055 100644 --- a/src/operators/kernel/resize_kernel.h +++ b/src/operators/kernel/resize_kernel.h @@ -74,7 +74,7 @@ template class ResizeKernel : public framework::OpKernelBase> { public: - void Compute(const ResizeParam ¶m) const; + void Compute(const ResizeParam ¶m); }; } // namespace operators } // namespace paddle_mobile diff --git a/src/operators/kernel/scale_kernel.h b/src/operators/kernel/scale_kernel.h index 2da92d8d3c8b0d7867e7e6e628a04a853dd69464..a17e57652224992b2ee7127e6081804bf3253fb1 100644 --- a/src/operators/kernel/scale_kernel.h +++ b/src/operators/kernel/scale_kernel.h @@ -24,7 +24,7 @@ template class ScaleKernel : public framework::OpKernelBase> { public: - void Compute(const ScaleParam& param) const; + void Compute(const ScaleParam& param); }; } // namespace operators } // namespace paddle_mobile diff --git a/src/operators/kernel/shape_kernel.h 
b/src/operators/kernel/shape_kernel.h index 7caf3e427a4f3b469265248708a3090c52d1ca91..9d3c6e1701523acc43410fb0e3402b5679d4f19a 100644 --- a/src/operators/kernel/shape_kernel.h +++ b/src/operators/kernel/shape_kernel.h @@ -28,7 +28,7 @@ template class ShapeKernel : public framework::OpKernelBase> { public: - void Compute(const ShapeParam& param) const; + void Compute(const ShapeParam& param); bool Init(ShapeParam* param); }; } // namespace operators diff --git a/src/operators/kernel/sigmoid_kernel.h b/src/operators/kernel/sigmoid_kernel.h index e68f215b00aa2f9faba850853efe4896752a8f7b..db9fc3dd3cb1e6c0eb56cd5a14a173f5a031263c 100644 --- a/src/operators/kernel/sigmoid_kernel.h +++ b/src/operators/kernel/sigmoid_kernel.h @@ -28,7 +28,7 @@ template class SigmoidKernel : public OpKernelBase> { public: - void Compute(const SigmoidParam& param) const override; + void Compute(const SigmoidParam& param); bool Init(SigmoidParam* param); }; diff --git a/src/operators/kernel/slice_kernel.h b/src/operators/kernel/slice_kernel.h index 17f7fe4a9ebf5b78fc92c41abd4756a7bc6bff45..89dba51d9e11570bd4228adb075ee104b2094fd8 100644 --- a/src/operators/kernel/slice_kernel.h +++ b/src/operators/kernel/slice_kernel.h @@ -24,7 +24,8 @@ template class SliceKernel : public framework::OpKernelBase> { public: - void Compute(const SliceParam& param) const {} + void Compute(const SliceParam& param); + bool Init(SliceParam* param); }; } // namespace operators } // namespace paddle_mobile diff --git a/src/operators/kernel/softmax_kernel.h b/src/operators/kernel/softmax_kernel.h index 67bd9167e8c717355fc326d3025cde410ce66010..d7d7435fd5145e702de848872f93087188fd31fc 100644 --- a/src/operators/kernel/softmax_kernel.h +++ b/src/operators/kernel/softmax_kernel.h @@ -27,7 +27,7 @@ template class SoftmaxKernel : public OpKernelBase> { public: - void Compute(const SoftmaxParam ¶m) const override; + void Compute(const SoftmaxParam ¶m); bool Init(SoftmaxParam *param); }; } // namespace operators diff --git 
a/src/operators/kernel/split_kernel.h b/src/operators/kernel/split_kernel.h index 03a418de59606e42684c67ca3053fa8e39b07940..3a2c03dce718e650ebf9127044f0db44d9d5c9a5 100644 --- a/src/operators/kernel/split_kernel.h +++ b/src/operators/kernel/split_kernel.h @@ -28,7 +28,7 @@ template class SplitKernel : public framework::OpKernelBase> { public: - void Compute(const SplitParam& param) const; + void Compute(const SplitParam& param); bool Init(SplitParam* param); }; } // namespace operators diff --git a/src/operators/kernel/sum_kernel.h b/src/operators/kernel/sum_kernel.h index ed337432e0fd4bf4035b67d4099379ce29918547..967d6f8307beb90254c431beaf324e891898d1a0 100644 --- a/src/operators/kernel/sum_kernel.h +++ b/src/operators/kernel/sum_kernel.h @@ -25,7 +25,7 @@ template class SumKernel : public framework::OpKernelBase> { public: - void Compute(const SumParam ¶m) const; + void Compute(const SumParam ¶m); bool Init(SumParam *param); }; diff --git a/src/operators/kernel/tanh_kernel.h b/src/operators/kernel/tanh_kernel.h new file mode 100644 index 0000000000000000000000000000000000000000..035f64f840b0aae8970f1aa284054a7984fc7ed6 --- /dev/null +++ b/src/operators/kernel/tanh_kernel.h @@ -0,0 +1,37 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#pragma once + +#ifdef TANH_OP + +#include "framework/operator.h" +#include "operators/op_param.h" + +namespace paddle_mobile { +namespace operators { + +using framework::OpKernelBase; + +template +class TanhKernel : public OpKernelBase> { + public: + void Compute(const TanhParam& param); + bool Init(TanhParam* param); +}; + +} // namespace operators +} // namespace paddle_mobile + +#endif diff --git a/src/operators/kernel/transpose2_kernel.h b/src/operators/kernel/transpose2_kernel.h new file mode 100644 index 0000000000000000000000000000000000000000..a1fb186db09520bed6f891ef9381d96a06f648c9 --- /dev/null +++ b/src/operators/kernel/transpose2_kernel.h @@ -0,0 +1,37 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#ifdef TRANSPOSE2_OP + +#pragma once + +#include + +#include "framework/operator.h" +#include "operators/op_param.h" + +namespace paddle_mobile { +namespace operators { + +template +class Transpose2Kernel + : public framework::OpKernelBase> { + public: + void Compute(const Transpose2Param& param); + bool Init(Transpose2Param* param); +}; +} // namespace operators +} // namespace paddle_mobile + +#endif diff --git a/src/operators/kernel/transpose_kernel.h b/src/operators/kernel/transpose_kernel.h index 56c41fd221e080a4db3b34fbd4ab208c9986c2a8..63ee6eb172ff691ff51dd3f74613cd3e412210bf 100644 --- a/src/operators/kernel/transpose_kernel.h +++ b/src/operators/kernel/transpose_kernel.h @@ -28,7 +28,7 @@ template class TransposeKernel : public framework::OpKernelBase> { public: - void Compute(const TransposeParam& param) const; + void Compute(const TransposeParam& param); bool Init(TransposeParam* param); }; } // namespace operators diff --git a/src/operators/lookup_op.h b/src/operators/lookup_op.h index 073e884e9157644670259b5acdb47443d2333e03..b5c3886cf46c9641e919aee32e7af30c6528309a 100644 --- a/src/operators/lookup_op.h +++ b/src/operators/lookup_op.h @@ -37,10 +37,6 @@ class LookupOp : public framework::OperatorWithKernel< : framework::OperatorWithKernel, operators::LookupKernel>( type, inputs, outputs, attrs, scope) {} - - using framework::OperatorWithKernel< - DeviceType, LookupParam, - operators::LookupKernel>::OperatorWithKernel; void InferShape() const override; }; diff --git a/src/operators/lrn_op.h b/src/operators/lrn_op.h index 26415a84aa96abdab91da7508080ce6a095aca62..3e1e92bfe6d9b888f100d07edaabfe0f8c6eaca5 100644 --- a/src/operators/lrn_op.h +++ b/src/operators/lrn_op.h @@ -35,10 +35,6 @@ class LrnOp : public framework::OperatorWithKernel< : framework::OperatorWithKernel, operators::LrnKernel>( type, inputs, outputs, attrs, scope) {} - - using framework::OperatorWithKernel< - DeviceType, LrnParam, - operators::LrnKernel>::OperatorWithKernel; void 
InferShape() const override; protected: diff --git a/src/operators/math/depthwise_conv_3x3.cpp b/src/operators/math/depthwise_conv_3x3.cpp index dc699192a45a3fabe90ac2809f475bae5d5bbc10..b213f82351e03ddebc47efa672f0d21513a3098f 100644 --- a/src/operators/math/depthwise_conv_3x3.cpp +++ b/src/operators/math/depthwise_conv_3x3.cpp @@ -257,8 +257,7 @@ void DepthwiseConv3x3s1p1(const Tensor *input, const Tensor *filter, const int h = static_cast(input->dims()[2]); const int w = static_cast(input->dims()[3]); - const int l = h; - + // const int l = h; const int batch_size = static_cast(input->dims()[0]); const int c = static_cast(input->dims()[1]); const int hxw = h * w; @@ -271,7 +270,7 @@ void DepthwiseConv3x3s1p1(const Tensor *input, const Tensor *filter, vbias = vdupq_n_f32(bias_data[j]); } - int l_mid = l - 2; // l=1->l_mid=-1,l=2->l_mid=0 + int w_mid = w - 2; // l=1->l_mid=-1,l=2->l_mid=0 float w00 = filter_data_tmp[0]; float w01 = filter_data_tmp[1]; float w02 = filter_data_tmp[2]; @@ -283,39 +282,38 @@ void DepthwiseConv3x3s1p1(const Tensor *input, const Tensor *filter, float w22 = filter_data_tmp[8]; output_data[0] = w11 * input_data[0] + w12 * input_data[1] + - w21 * input_data[l] + w22 * input_data[l + 1]; - output_data[l - 1] = w10 * input_data[l - 2] + w11 * input_data[l - 1] + - w20 * input_data[2 * l - 2] + - w21 * input_data[2 * l - 1]; - output_data[(l - 1) * l] = - w01 * input_data[(l - 2) * l] + w02 * input_data[(l - 2) * l + 1] + - w11 * input_data[(l - 1) * l] + w12 * input_data[(l - 1) * l + 1]; - output_data[l * l - 1] = w00 * input_data[(l - 2) * (l + 1)] + - w01 * input_data[(l - 2) * (l + 1) + 1] + - w10 * input_data[l * l - 2] + - w11 * input_data[l * l - 1]; + w21 * input_data[w] + w22 * input_data[w + 1]; + output_data[w - 1] = w10 * input_data[w - 2] + w11 * input_data[w - 1] + + w20 * input_data[2 * w - 2] + + w21 * input_data[2 * w - 1]; + output_data[(h - 1) * w] = + w01 * input_data[(h - 2) * w] + w02 * input_data[(h - 2) * w + 1] + + 
w11 * input_data[(h - 1) * w] + w12 * input_data[(h - 1) * w + 1]; + output_data[h * w - 1] = + w00 * input_data[h * w - w - 2] + w01 * input_data[h * w - w - 1] + + w10 * input_data[h * w - 2] + w11 * input_data[h * w - 1]; if (if_bias) { output_data[0] += bias_data[j]; - output_data[l - 1] += bias_data[j]; - output_data[(l - 1) * l] += bias_data[j]; - output_data[l * l - 1] += bias_data[j]; + output_data[w - 1] += bias_data[j]; + output_data[(h - 1) * w] += bias_data[j]; + output_data[h * w - 1] += bias_data[j]; } - for (int i = 1; i < l - 1; ++i) { - output_data[i * l] = - w01 * input_data[i * l - l] + w02 * input_data[i * l - l + 1] + - w11 * input_data[i * l] + w12 * input_data[i * l + 1] + - w21 * input_data[i * l + l] + w22 * input_data[i * l + l + 1]; - - output_data[i * l + l - 1] = w00 * input_data[i * l + l - 1 - l - 1] + - w01 * input_data[i * l + l - 1 - l] + - w10 * input_data[i * l + l - 1 - 1] + - w11 * input_data[i * l + l - 1] + - w20 * input_data[i * l + l - 1 + l - 1] + - w21 * input_data[i * l + l - 1 + l]; + for (int i = 1; i < h - 1; ++i) { + output_data[i * w] = + w01 * input_data[i * w - w] + w02 * input_data[i * w - w + 1] + + w11 * input_data[i * w] + w12 * input_data[i * w + 1] + + w21 * input_data[i * w + w] + w22 * input_data[i * w + w + 1]; + + output_data[i * w + w - 1] = w00 * input_data[i * w + w - 1 - w - 1] + + w01 * input_data[i * w + w - 1 - w] + + w10 * input_data[i * w + w - 1 - 1] + + w11 * input_data[i * w + w - 1] + + w20 * input_data[i * w + w - 1 + w - 1] + + w21 * input_data[i * w + w - 1 + w]; if (if_bias) { - output_data[i * l] += bias_data[j]; - output_data[i * l + l - 1] += bias_data[j]; + output_data[i * w] += bias_data[j]; + output_data[i * w + w - 1] += bias_data[j]; } } @@ -325,15 +323,15 @@ void DepthwiseConv3x3s1p1(const Tensor *input, const Tensor *filter, float32x4_t in0, in1, in2, in3, in4, in5, in6, in7, tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, out0; in0 = vld1q_f32(input_tmp); - in2 = vld1q_f32(input_tmp + l); 
- const float *input_tmp_end = input_tmp + (l - 2) * l; + in2 = vld1q_f32(input_tmp + w); + const float *input_tmp_end = input_tmp + (h - 2) * w; in4 = vld1q_f32(input_tmp_end); - in6 = vld1q_f32(input_tmp_end + l); - int c_mid = l_mid; + in6 = vld1q_f32(input_tmp_end + w); + int c_mid = w_mid; auto output_ptr = output_data + 1; for (; c_mid > 3; c_mid -= 4) { in1 = vld1q_f32(input_tmp + 4); - in3 = vld1q_f32(input_tmp + l + 4); + in3 = vld1q_f32(input_tmp + w + 4); tmp0 = vextq_f32(in0, in1, 1); tmp1 = vextq_f32(in0, in1, 2); @@ -352,7 +350,7 @@ void DepthwiseConv3x3s1p1(const Tensor *input, const Tensor *filter, vst1q_f32(output_ptr, out0); in5 = vld1q_f32(input_tmp_end + 4); - in7 = vld1q_f32(input_tmp_end + l + 4); + in7 = vld1q_f32(input_tmp_end + w + 4); tmp0 = vextq_f32(in4, in5, 1); tmp1 = vextq_f32(in4, in5, 2); @@ -367,7 +365,7 @@ void DepthwiseConv3x3s1p1(const Tensor *input, const Tensor *filter, out0 = vmlaq_n_f32(out0, tmp3, w12); out0 = vaddq_f32(out0, vbias); - vst1q_f32(output_ptr + (l - 1) * l, out0); + vst1q_f32(output_ptr + (h - 1) * w, out0); // can optimize to each 8 stride. 
input_tmp += 4; @@ -380,8 +378,8 @@ void DepthwiseConv3x3s1p1(const Tensor *input, const Tensor *filter, } // top right pad - float32x4_t pad0 = vdupq_n_f32(input_data[l - 1]); - float32x4_t pad1 = vdupq_n_f32(input_data[2 * l - 1]); + float32x4_t pad0 = vdupq_n_f32(input_data[w - 1]); + float32x4_t pad1 = vdupq_n_f32(input_data[2 * w - 1]); tmp0 = vextq_f32(in0, pad0, 1); tmp1 = vextq_f32(in0, pad0, 2); @@ -409,8 +407,8 @@ void DepthwiseConv3x3s1p1(const Tensor *input, const Tensor *filter, } // bottom right pad - float32x4_t pad2 = vdupq_n_f32(input_data[l * l - 1 - l]); - float32x4_t pad3 = vdupq_n_f32(input_data[l * l - 1]); + float32x4_t pad2 = vdupq_n_f32(input_data[h * w - 1 - w]); + float32x4_t pad3 = vdupq_n_f32(input_data[h * w - 1]); tmp0 = vextq_f32(in4, pad2, 1); tmp1 = vextq_f32(in4, pad2, 2); @@ -427,28 +425,28 @@ void DepthwiseConv3x3s1p1(const Tensor *input, const Tensor *filter, for (int i = 0; i < c_mid; ++i) { if (i == 0) { - vst1q_lane_f32(output_ptr + (l - 1) * l + i, out0, 0); + vst1q_lane_f32(output_ptr + (h - 1) * w + i, out0, 0); } if (i == 1) { - vst1q_lane_f32(output_ptr + (l - 1) * l + i, out0, 1); + vst1q_lane_f32(output_ptr + (h - 1) * w + i, out0, 1); } if (i == 2) { - vst1q_lane_f32(output_ptr + (l - 1) * l + i, out0, 2); + vst1q_lane_f32(output_ptr + (h - 1) * w + i, out0, 2); } } // mid - for (int i = 0; i < l - 2; ++i) { - auto output_ptr = output_data + (i + 1) * l + 1; - input_tmp = input_data + i * l; + for (int i = 0; i < h - 2; ++i) { + auto output_ptr = output_data + (i + 1) * w + 1; + input_tmp = input_data + i * w; auto in0_tmp = vld1q_f32(input_tmp); - auto in2_tmp = vld1q_f32(input_tmp + l); - auto in4_tmp = vld1q_f32(input_tmp + l + l); - c_mid = l_mid; + auto in2_tmp = vld1q_f32(input_tmp + w); + auto in4_tmp = vld1q_f32(input_tmp + w + w); + c_mid = w_mid; for (; c_mid > 3; c_mid -= 4) { auto in1_tmp = vld1q_f32(input_tmp + 4); - auto in3_tmp = vld1q_f32(input_tmp + l + 4); - auto in5_tmp = vld1q_f32(input_tmp + l + 
l + 4); + auto in3_tmp = vld1q_f32(input_tmp + w + 4); + auto in5_tmp = vld1q_f32(input_tmp + w + w + 4); tmp0 = vextq_f32(in0_tmp, in1_tmp, 1); tmp1 = vextq_f32(in0_tmp, in1_tmp, 2); @@ -477,9 +475,9 @@ void DepthwiseConv3x3s1p1(const Tensor *input, const Tensor *filter, in4_tmp = in5_tmp; } - float32x4_t pad0 = vdupq_n_f32(input_data[i * l + l - 1]); - float32x4_t pad1 = vdupq_n_f32(input_data[i * l + l - 1 + l]); - float32x4_t pad2 = vdupq_n_f32(input_data[i * l + l - 1 + l + l]); + float32x4_t pad0 = vdupq_n_f32(input_data[i * w + w - 1]); + float32x4_t pad1 = vdupq_n_f32(input_data[i * w + w - 1 + w]); + float32x4_t pad2 = vdupq_n_f32(input_data[i * w + w - 1 + w + w]); tmp0 = vextq_f32(in0_tmp, pad0, 1); tmp1 = vextq_f32(in0_tmp, pad0, 2); @@ -539,8 +537,9 @@ void DepthwiseConvAddBNRelu3x3s1p1(const Tensor *input, const Tensor *filter, const int hxw = input_height * input_width; - const int l = input_height; - + // const int l = input_height; + const int h = input_height; + const int w = input_width; float32x4_t vzero = vdupq_n_f32(0); for (int b = 0; b < batch_size; b++) { @@ -626,54 +625,53 @@ void DepthwiseConvAddBNRelu3x3s1p1(const Tensor *input, const Tensor *filter, } output_data[0] = w11 * input_data[0] + w12 * input_data[1] + - w21 * input_data[l] + w22 * input_data[l + 1]; - output_data[l - 1] = w10 * input_data[l - 2] + w11 * input_data[l - 1] + - w20 * input_data[2 * l - 2] + - w21 * input_data[2 * l - 1]; - output_data[(l - 1) * l] = - w01 * input_data[(l - 2) * l] + w02 * input_data[(l - 2) * l + 1] + - w11 * input_data[(l - 1) * l] + w12 * input_data[(l - 1) * l + 1]; - output_data[l * l - 1] = w00 * input_data[(l - 2) * (l + 1)] + - w01 * input_data[(l - 2) * (l + 1) + 1] + - w10 * input_data[l * l - 2] + - w11 * input_data[l * l - 1]; + w21 * input_data[w] + w22 * input_data[w + 1]; + output_data[w - 1] = w10 * input_data[w - 2] + w11 * input_data[w - 1] + + w20 * input_data[2 * w - 2] + + w21 * input_data[2 * w - 1]; + output_data[(h - 1) * 
w] = + w01 * input_data[(h - 2) * w] + w02 * input_data[(h - 2) * w + 1] + + w11 * input_data[(h - 1) * w] + w12 * input_data[(h - 1) * w + 1]; + output_data[h * w - 1] = + w00 * input_data[h * w - w - 2] + w01 * input_data[h * w - w - 1] + + w10 * input_data[h * w - 2] + w11 * input_data[h * w - 1]; output_data[0] = output_data[0] * newscale_data[c] + newbias_data[c]; - output_data[l - 1] = - output_data[l - 1] * newscale_data[c] + newbias_data[c]; - output_data[(l - 1) * l] = - output_data[(l - 1) * l] * newscale_data[c] + newbias_data[c]; - output_data[l * l - 1] = - output_data[l * l - 1] * newscale_data[c] + newbias_data[c]; + output_data[w - 1] = + output_data[w - 1] * newscale_data[c] + newbias_data[c]; + output_data[(h - 1) * w] = + output_data[(h - 1) * w] * newscale_data[c] + newbias_data[c]; + output_data[h * w - 1] = + output_data[h * w - 1] * newscale_data[c] + newbias_data[c]; if (if_relu) { output_data[0] = output_data[0] < 0 ? 0 : output_data[0]; - output_data[l - 1] = output_data[l - 1] < 0 ? 0 : output_data[l - 1]; - output_data[(l - 1) * l] = - output_data[(l - 1) * l] < 0 ? 0 : output_data[(l - 1) * l]; - output_data[l * l - 1] = - output_data[l * l - 1] < 0 ? 0 : output_data[l * l - 1]; + output_data[w - 1] = output_data[w - 1] < 0 ? 0 : output_data[w - 1]; + output_data[(h - 1) * w] = + output_data[(h - 1) * w] < 0 ? 0 : output_data[(h - 1) * w]; + output_data[h * w - 1] = + output_data[h * w - 1] < 0 ? 
0 : output_data[h * w - 1]; } - for (int i = 1; i < l - 1; ++i) { - output_data[i * l] = - w01 * input_data[i * l - l] + w02 * input_data[i * l - l + 1] + - w11 * input_data[i * l] + w12 * input_data[i * l + 1] + - w21 * input_data[i * l + l] + w22 * input_data[i * l + l + 1]; - - output_data[i * l + l - 1] = w00 * input_data[i * l + l - 1 - l - 1] + - w01 * input_data[i * l + l - 1 - l] + - w10 * input_data[i * l + l - 1 - 1] + - w11 * input_data[i * l + l - 1] + - w20 * input_data[i * l + l - 1 + l - 1] + - w21 * input_data[i * l + l - 1 + l]; - output_data[i * l] = - output_data[i * l] * newscale_data[c] + newbias_data[c]; - output_data[i * l + l - 1] = - output_data[i * l + l - 1] * newscale_data[c] + newbias_data[c]; + for (int i = 1; i < h - 1; ++i) { + output_data[i * w] = + w01 * input_data[i * w - w] + w02 * input_data[i * w - w + 1] + + w11 * input_data[i * w] + w12 * input_data[i * w + 1] + + w21 * input_data[i * w + w] + w22 * input_data[i * w + w + 1]; + + output_data[i * w + w - 1] = w00 * input_data[i * w + w - 1 - w - 1] + + w01 * input_data[i * w + w - 1 - w] + + w10 * input_data[i * w + w - 1 - 1] + + w11 * input_data[i * w + w - 1] + + w20 * input_data[i * w + w - 1 + w - 1] + + w21 * input_data[i * w + w - 1 + w]; + output_data[i * w] = + output_data[i * w] * newscale_data[c] + newbias_data[c]; + output_data[i * w + w - 1] = + output_data[i * w + w - 1] * newscale_data[c] + newbias_data[c]; if (if_relu) { - output_data[i * l] = output_data[i * l] < 0 ? 0 : output_data[i * l]; - output_data[i * l + l - 1] = - output_data[i * l + l - 1] < 0 ? 0 : output_data[i * l + l - 1]; + output_data[i * w] = output_data[i * w] < 0 ? 0 : output_data[i * w]; + output_data[i * w + w - 1] = + output_data[i * w + w - 1] < 0 ? 
0 : output_data[i * w + w - 1]; } } @@ -776,7 +774,7 @@ void DepthwiseConvAddBNRelu3x3s1p1(const Tensor *input, const Tensor *filter, const int h = static_cast(input->dims()[2]); const int w = static_cast(input->dims()[3]); - const int l = h; +// const int l = h; const int batch_size = static_cast(input->dims()[0]); const int c = static_cast(input->dims()[1]); @@ -792,7 +790,7 @@ void DepthwiseConvAddBNRelu3x3s1p1(const Tensor *input, const Tensor *filter, vnewbias = vdupq_n_f32(newbias_data[j]); vnewscale = vdupq_n_f32(newscale_data[j]); - int l_mid = l - 2; // l=1->l_mid=-1,l=2->l_mid=0 + int w_mid = w - 2; // l=1->l_mid=-1,l=2->l_mid=0 float w00 = filter_data_tmp[0]; float w01 = filter_data_tmp[1]; float w02 = filter_data_tmp[2]; @@ -804,49 +802,49 @@ void DepthwiseConvAddBNRelu3x3s1p1(const Tensor *input, const Tensor *filter, float w22 = filter_data_tmp[8]; output_data[0] = w11 * input_data[0] + w12 * input_data[1] + - w21 * input_data[l] + w22 * input_data[l + 1]; - - output_data[l - 1] = w10 * input_data[l - 2] + w11 * input_data[l - - 1] + w20 * input_data[2 * l - 2] + w21 * input_data[2 * l - 1]; - - output_data[(l - 1) * l] = - w01 * input_data[(l - 2) * l] + w02 * input_data[(l - 2) * l + - 1] + w11 * input_data[(l - 1) * l] + w12 * input_data[(l - 1) * l + 1]; - output_data[l * l - 1] = w00 * input_data[(l - 2) * (l + 1)] + - w01 * input_data[(l - 2) * (l + 1) + 1] + - w10 * input_data[l * l - 2] + - w11 * input_data[l * l - 1]; + w21 * input_data[w] + w22 * input_data[w + 1]; + + output_data[w - 1] = w10 * input_data[w - 2] + w11 * input_data[w - + 1] + w20 * input_data[2 * w - 2] + w21 * input_data[2 * w - 1]; + + output_data[(h - 1) * w] = + w01 * input_data[(h - 2) * w] + w02 * input_data[(h - 2) * w + + 1] + w11 * input_data[(h - 1) * w] + w12 * input_data[(h - 1) * w + 1]; + output_data[h * w - 1] = w00 * input_data[h*w-w-2] + + w01 * input_data[h*w-w-1] + + w10 * input_data[h * w - 2] + + w11 * input_data[h * w - 1]; output_data[0] = 
output_data[0] * newscale_data[j] + - newbias_data[j]; output_data[l - 1] = output_data[l - 1] * - newscale_data[j] + newbias_data[j]; output_data[(l - 1) * l] = - output_data[(l - 1) * l] * newscale_data[j] + newbias_data[j]; - output_data[l * l - 1] = - output_data[l * l - 1] * newscale_data[j] + newbias_data[j]; + newbias_data[j]; output_data[w - 1] = output_data[w - 1] * + newscale_data[j] + newbias_data[j]; output_data[(h - 1) * w] = + output_data[(h - 1) * w] * newscale_data[j] + newbias_data[j]; + output_data[h * w - 1] = + output_data[h * w - 1] * newscale_data[j] + newbias_data[j]; if (if_relu) { output_data[0] = output_data[0] < 0 ? 0 : output_data[0]; - output_data[l - 1] = output_data[l - 1] < 0 ? 0 : output_data[l - - 1]; output_data[(l - 1) * l] = output_data[(l - 1) * l] < 0 ? 0 : - output_data[(l - 1) * l]; output_data[l * l - 1] = output_data[l * l - 1] - < 0 ? 0 : output_data[l * l - 1]; + output_data[w - 1] = output_data[w - 1] < 0 ? 0 : output_data[w - + 1]; output_data[(h - 1) * w] = output_data[(h - 1) * w] < 0 ? 0 : + output_data[(h - 1) * w]; output_data[h * w - 1] = output_data[h * w - 1] + < 0 ? 
0 : output_data[h * w - 1]; } - for (int i = 1; i < l - 1; ++i) { - output_data[i * l] = - w01 * input_data[i * l - l] + w02 * input_data[i * l - l + 1] - + w11 * input_data[i * l] + w12 * input_data[i * l + 1] + w21 * - input_data[i * l + l] + w22 * input_data[i * l + l + 1]; output_data[i * - l + l - 1] = w00 * input_data[i * l + l - 1 - l - 1] + w01 * input_data[i - * l + l - 1 - l] + w10 * input_data[i * l + l - 1 - 1] + w11 * - input_data[i * l + l - 1] + w20 * input_data[i * l + l - 1 + l - 1] + w21 - * input_data[i * l + l - 1 + l]; output_data[i * l] = output_data[i * l] - * newscale_data[j] + newbias_data[j]; output_data[i * l + l - 1] = - output_data[i * l + l - 1] * newscale_data[j] + + for (int i = 1; i < h - 1; ++i) { + output_data[i * w] = + w01 * input_data[i * w - w] + w02 * input_data[i * w - w + 1] + + w11 * input_data[i * w] + w12 * input_data[i * w + 1] + w21 * + input_data[i * w + w] + w22 * input_data[i * w + w + 1]; output_data[i * + w + w - 1] = w00 * input_data[i * w + w - 1 - w - 1] + w01 * input_data[i + * w + w - 1 - w] + w10 * input_data[i * w + w - 1 - 1] + w11 * + input_data[i * w + w - 1] + w20 * input_data[i * w + w - 1 + w - 1] + w21 + * input_data[i * w + w - 1 + w]; output_data[i * w] = output_data[i * w] + * newscale_data[j] + newbias_data[j]; output_data[i * w + w - 1] = + output_data[i * w + w - 1] * newscale_data[j] + newbias_data[j]; if (if_relu) { - output_data[i * l] = output_data[i * l] < 0 ? 0 : output_data[i - * l]; output_data[i * l + l - 1] = output_data[i * l + l - 1] < 0 ? 0 : - output_data[i * l + l - 1]; + output_data[i * w] = output_data[i * w] < 0 ? 0 : output_data[i + * w]; output_data[i * w + w - 1] = output_data[i * w + w - 1] < 0 ? 
0 : + output_data[i * w + w - 1]; } } @@ -855,11 +853,11 @@ void DepthwiseConvAddBNRelu3x3s1p1(const Tensor *input, const Tensor *filter, float32x4_t in0, in1, in2, in3, in4, in5, in6, in7, tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, out0; in0 = vld1q_f32(input_tmp); in2 = - vld1q_f32(input_tmp + l); const float *input_tmp_end = input_tmp + (l - - 2) * l; in4 = vld1q_f32(input_tmp_end); in6 = vld1q_f32(input_tmp_end + - l); int c_mid = l_mid; auto output_ptr = output_data + 1; for (; c_mid > + vld1q_f32(input_tmp + w); const float *input_tmp_end = input_tmp + (h - + 2) * w; in4 = vld1q_f32(input_tmp_end); in6 = vld1q_f32(input_tmp_end + + w); int c_mid = w_mid; auto output_ptr = output_data + 1; for (; c_mid > 3; c_mid -= 4) { in1 = vld1q_f32(input_tmp + 4); in3 = - vld1q_f32(input_tmp + l + 4); + vld1q_f32(input_tmp + w + 4); tmp0 = vextq_f32(in0, in1, 1); tmp1 = vextq_f32(in0, in1, 2); @@ -880,7 +878,7 @@ void DepthwiseConvAddBNRelu3x3s1p1(const Tensor *input, const Tensor *filter, vst1q_f32(output_ptr, out0); in5 = vld1q_f32(input_tmp_end + 4); - in7 = vld1q_f32(input_tmp_end + l + 4); + in7 = vld1q_f32(input_tmp_end + w + 4); tmp0 = vextq_f32(in4, in5, 1); tmp1 = vextq_f32(in4, in5, 2); @@ -897,7 +895,7 @@ void DepthwiseConvAddBNRelu3x3s1p1(const Tensor *input, const Tensor *filter, if (if_relu) { out0 = vmaxq_f32(out0, vzero); } - vst1q_f32(output_ptr + (l - 1) * l, out0); + vst1q_f32(output_ptr + (h - 1) * w, out0); // can optimize to each 8 stride. 
input_tmp += 4; @@ -910,8 +908,8 @@ void DepthwiseConvAddBNRelu3x3s1p1(const Tensor *input, const Tensor *filter, } // top right pad - float32x4_t pad0 = vdupq_n_f32(input_data[l - 1]); - float32x4_t pad1 = vdupq_n_f32(input_data[2 * l - 1]); + float32x4_t pad0 = vdupq_n_f32(input_data[w - 1]); + float32x4_t pad1 = vdupq_n_f32(input_data[2 * w - 1]); tmp0 = vextq_f32(in0, pad0, 1); tmp1 = vextq_f32(in0, pad0, 2); @@ -941,8 +939,8 @@ void DepthwiseConvAddBNRelu3x3s1p1(const Tensor *input, const Tensor *filter, } // bottom right pad - float32x4_t pad2 = vdupq_n_f32(input_data[l * l - 1 - l]); - float32x4_t pad3 = vdupq_n_f32(input_data[l * l - 1]); + float32x4_t pad2 = vdupq_n_f32(input_data[h * w - 1 - w]); + float32x4_t pad3 = vdupq_n_f32(input_data[h * w - 1]); tmp0 = vextq_f32(in4, pad2, 1); tmp1 = vextq_f32(in4, pad2, 2); @@ -961,29 +959,29 @@ void DepthwiseConvAddBNRelu3x3s1p1(const Tensor *input, const Tensor *filter, } for (int i = 0; i < c_mid; ++i) { if (i == 0) { - vst1q_lane_f32(output_ptr + (l - 1) * l + i, out0, 0); + vst1q_lane_f32(output_ptr + (h - 1) * w + i, out0, 0); } if (i == 1) { - vst1q_lane_f32(output_ptr + (l - 1) * l + i, out0, 1); + vst1q_lane_f32(output_ptr + (h - 1) * w + i, out0, 1); } if (i == 2) { - vst1q_lane_f32(output_ptr + (l - 1) * l + i, out0, 2); + vst1q_lane_f32(output_ptr + (h - 1) * w + i, out0, 2); } } // mid - for (int i = 0; i < l - 2; ++i) { - auto output_ptr = output_data + (i + 1) * l + 1; - input_tmp = input_data + i * l; + for (int i = 0; i < h - 2; ++i) { + auto output_ptr = output_data + (i + 1) * w + 1; + input_tmp = input_data + i * w; auto in0_tmp = vld1q_f32(input_tmp); - auto in2_tmp = vld1q_f32(input_tmp + l); - auto in4_tmp = vld1q_f32(input_tmp + l + l); - c_mid = l_mid; + auto in2_tmp = vld1q_f32(input_tmp + w); + auto in4_tmp = vld1q_f32(input_tmp + w + w); + c_mid = w_mid; for (; c_mid > 3; c_mid -= 4) { auto in1_tmp = vld1q_f32(input_tmp + 4); - auto in3_tmp = vld1q_f32(input_tmp + l + 4); - auto in5_tmp 
= vld1q_f32(input_tmp + l + l + 4); + auto in3_tmp = vld1q_f32(input_tmp + w + 4); + auto in5_tmp = vld1q_f32(input_tmp + w + w + 4); tmp0 = vextq_f32(in0_tmp, in1_tmp, 1); tmp1 = vextq_f32(in0_tmp, in1_tmp, 2); @@ -1014,9 +1012,9 @@ void DepthwiseConvAddBNRelu3x3s1p1(const Tensor *input, const Tensor *filter, in4_tmp = in5_tmp; } - float32x4_t pad0 = vdupq_n_f32(input_data[i * l + l - 1]); - float32x4_t pad1 = vdupq_n_f32(input_data[i * l + l - 1 + l]); - float32x4_t pad2 = vdupq_n_f32(input_data[i * l + l - 1 + l + l]); + float32x4_t pad0 = vdupq_n_f32(input_data[i * w + w - 1]); + float32x4_t pad1 = vdupq_n_f32(input_data[i * w + w - 1 + w]); + float32x4_t pad2 = vdupq_n_f32(input_data[i * w + w - 1 + w + w]); tmp0 = vextq_f32(in0_tmp, pad0, 1); tmp1 = vextq_f32(in0_tmp, pad0, 2); @@ -1060,6 +1058,7 @@ void DepthwiseConvAddBNRelu3x3s1p1(const Tensor *input, const Tensor *filter, #endif } +/// w!=h not fix void DepthwiseConvAddBNRelu3x3s2p1(const Tensor *input, const Tensor *filter, Tensor *output, const Tensor *new_scale, const Tensor *new_bias, bool if_relu) { @@ -1275,7 +1274,9 @@ void DepthwiseConv3x3s2p1v2(const Tensor *input, const Tensor *filter, const int in_l = in_h; const int inhxw = in_h * in_w; const int outhxw = out_h * out_w; - const int if_pad = in_l - 1 == (out_l - 1) * 2 ? 1 : 0; + /// todo : fix if_pad when w != h + const int if_pad_r = in_w - 1 == (out_w - 1) * 2 ? 1 : 0; + const int if_pad_b = in_h - 1 == (out_h - 1) * 2 ? 
1 : 0; const int batch_size = static_cast(input->dims()[0]); const int c = static_cast(input->dims()[1]); const float *input_row_ptr; @@ -1366,7 +1367,7 @@ void DepthwiseConv3x3s2p1v2(const Tensor *input, const Tensor *filter, elewise_res0 = vmlaq_n_f32(elewise_res0, input_buff_mid.val[0], w10); elewise_res2 = vmlaq_n_f32(elewise_res2, input_buff_mid.val[0], w12); - if (!if_pad) { + if (!if_pad_b) { elewise_res1 = vmlaq_n_f32(elewise_res1, input_buff_bottom[w4].val[1], w21); elewise_res0 = @@ -1381,9 +1382,9 @@ void DepthwiseConv3x3s2p1v2(const Tensor *input, const Tensor *filter, if ((w4 != w_times)) { vst1q_f32(output_row_ptr, res3); } else { - if (out_l - 2 - w_times * 3 == 1) { + if (out_w - 2 - w_times * 3 == 1) { vst1q_lane_f32(output_row_ptr, res3, 0); - } else if (out_l - 2 - w_times * 3 == 2) { + } else if (out_w - 2 - w_times * 3 == 2) { vst1q_lane_f32(output_row_ptr, res3, 0); vst1q_lane_f32(output_row_ptr + 1, res3, 1); } @@ -1393,64 +1394,65 @@ void DepthwiseConv3x3s2p1v2(const Tensor *input, const Tensor *filter, } output_data_tmp[0] = input_const[0] * w11 + input_const[1] * w12 + - input_const[in_l] * w21 + - input_const[in_l + 1] * w22; + input_const[in_w] * w21 + + input_const[in_w + 1] * w22; - out2in_mid = (out_l - 1) * 2; - output_data_tmp[out_l - 1] = + out2in_mid = (out_w - 1) * 2; + output_data_tmp[out_w - 1] = w10 * input_const[out2in_mid - 1] + w11 * input_const[out2in_mid] + w20 * input_const[out2in_mid + in_w - 1] + w21 * input_const[out2in_mid + in_w] + - (1 - if_pad) * (w12 * input_const[out2in_mid + 1] + - w22 * input_const[out2in_mid + in_w + 1]); + (1 - if_pad_r) * (w12 * input_const[out2in_mid + 1] + + w22 * input_const[out2in_mid + in_w + 1]); - out2in_mid = (out_l - 1) * 2 * in_w; + out2in_mid = (out_h - 1) * 2 * in_w; - output_data_tmp[out_l * (out_l - 1)] = + output_data_tmp[out_w * (out_h - 1)] = w01 * input_const[out2in_mid - in_w] + w02 * input_const[out2in_mid - in_w + 1] + w11 * input_const[out2in_mid] + w12 * 
input_const[out2in_mid + 1] + - (1 - if_pad) * (w21 * input_const[out2in_mid + in_w] + - w22 * input_const[out2in_mid + in_w + 1]); - out2in_mid = (out_l - 1) * 2 * in_w + (out_l - 1) * 2; + (1 - if_pad_b) * (w21 * input_const[out2in_mid + in_w] + + w22 * input_const[out2in_mid + in_w + 1]); + out2in_mid = (out_h - 1) * 2 * in_w + (out_w - 1) * 2; - output_data_tmp[out_l * out_l - 1] = + output_data_tmp[out_h * out_w - 1] = w00 * input_const[out2in_mid - in_w - 1] + w01 * input_const[out2in_mid - in_w] + w10 * input_const[out2in_mid - 1] + w11 * input_const[out2in_mid] + - (1 - if_pad) * (w20 * input_const[out2in_mid + in_w - 1] + - w21 * input_const[out2in_mid + in_w] + - w02 * input_const[out2in_mid - in_w + 1] + - w12 * input_const[out2in_mid + 1] + - w22 * input_const[out2in_mid + in_w + 1]); + (1 - if_pad_r) * (w20 * input_const[out2in_mid + in_w - 1] + + w21 * input_const[out2in_mid + in_w]) + + (1 - if_pad_b) * (w02 * input_const[out2in_mid - in_w + 1] + + w12 * input_const[out2in_mid + 1]) + + (1 - if_pad_r) * (1 - if_pad_b) * w22 * + input_const[out2in_mid + in_w + 1]; if (if_bias) { output_data_tmp[0] += bias_data[j]; - output_data_tmp[out_l - 1] += bias_data[j]; - output_data_tmp[out_l * (out_l - 1)] += bias_data[j]; - output_data_tmp[out_l * out_l - 1] += bias_data[j]; + output_data_tmp[out_w - 1] += bias_data[j]; + output_data_tmp[out_w * (out_h - 1)] += bias_data[j]; + output_data_tmp[out_h * out_w - 1] += bias_data[j]; } for (int i = 1; i < out_h - 1; i++) { out2in_mid = i * 2 * in_w; - output_data_tmp[i * out_l] = w01 * input_const[out2in_mid - in_w] + + output_data_tmp[i * out_w] = w01 * input_const[out2in_mid - in_w] + w02 * input_const[out2in_mid - in_w + 1] + w11 * input_const[out2in_mid] + w12 * input_const[out2in_mid + 1] + w21 * input_const[out2in_mid + in_w] + w22 * input_const[out2in_mid + in_w + 1]; - out2in_mid = i * 2 * in_w + (out_l - 1) * 2; - output_data_tmp[i * out_l + out_l - 1] = + out2in_mid = i * 2 * in_w + (out_w - 1) * 2; + 
output_data_tmp[i * out_w + out_w - 1] = w00 * input_const[out2in_mid - in_w - 1] + w01 * input_const[out2in_mid - in_w] + w10 * input_const[out2in_mid - 1] + w11 * input_const[out2in_mid] + w20 * input_const[out2in_mid + in_w - 1] + w21 * input_const[out2in_mid + in_w] + - (1 - if_pad) * (w02 * input_const[out2in_mid - in_w + 1] + - w12 * input_const[out2in_mid + 1] + - w22 * input_const[out2in_mid + in_w + 1]); + (1 - if_pad_r) * (w02 * input_const[out2in_mid - in_w + 1] + + w12 * input_const[out2in_mid + 1] + + w22 * input_const[out2in_mid + in_w + 1]); if (if_bias) { - output_data_tmp[i * out_l] += bias_data[j]; - output_data_tmp[i * out_l + out_l - 1] += bias_data[j]; + output_data_tmp[i * out_w] += bias_data[j]; + output_data_tmp[i * out_w + out_w - 1] += bias_data[j]; } } filter_data_tmp += 9; @@ -1657,11 +1659,13 @@ void DepthwiseConvAddBNRelu3x3s2p1v2(const Tensor *input, const Tensor *filter, const int in_w = static_cast(input->dims()[3]); const int out_h = static_cast(output->dims()[2]); const int out_w = static_cast(output->dims()[3]); - const int out_l = out_h; - const int in_l = in_h; + // const int out_l = out_h; + // const int in_l = in_h; const int inhxw = in_h * in_w; const int outhxw = out_h * out_w; - const int if_pad = in_l - 1 == (out_l - 1) * 2 ? 1 : 0; + /// todo : fix if_pad when w != h + const int if_pad_r = in_w - 1 == (out_w - 1) * 2 ? 1 : 0; + const int if_pad_b = in_h - 1 == (out_h - 1) * 2 ? 
1 : 0; const int batch_size = static_cast(input->dims()[0]); const int c = static_cast(input->dims()[1]); const int w_times = (out_w - 2) / 3; @@ -1755,7 +1759,7 @@ void DepthwiseConvAddBNRelu3x3s2p1v2(const Tensor *input, const Tensor *filter, elewise_res0 = vmlaq_n_f32(elewise_res0, input_buff_mid.val[0], w10); elewise_res2 = vmlaq_n_f32(elewise_res2, input_buff_mid.val[0], w12); - if (!if_pad) { + if (!if_pad_b) { elewise_res1 = vmlaq_n_f32(elewise_res1, input_buff_bottom[w4].val[1], w21); elewise_res0 = @@ -1775,9 +1779,9 @@ void DepthwiseConvAddBNRelu3x3s2p1v2(const Tensor *input, const Tensor *filter, vst1q_lane_f32(output_row_ptr + 1, res3, 1); vst1q_lane_f32(output_row_ptr + 2, res3, 2); } else { - if (out_l - 2 - w_times * 3 == 1) { + if (out_w - 2 - w_times * 3 == 1) { vst1q_lane_f32(output_row_ptr, res3, 0); - } else if (out_l - 2 - w_times * 3 == 2) { + } else if (out_w - 2 - w_times * 3 == 2) { vst1q_lane_f32(output_row_ptr, res3, 0); vst1q_lane_f32(output_row_ptr + 1, res3, 1); } @@ -1787,90 +1791,91 @@ void DepthwiseConvAddBNRelu3x3s2p1v2(const Tensor *input, const Tensor *filter, } output_data_tmp[0] = input_const[0] * w11 + input_const[1] * w12 + - input_const[in_l] * w21 + - input_const[in_l + 1] * w22; + input_const[in_w] * w21 + + input_const[in_w + 1] * w22; - out2in_mid = (out_l - 1) * 2; - output_data_tmp[out_l - 1] = + out2in_mid = (out_w - 1) * 2; + output_data_tmp[out_w - 1] = w10 * input_const[out2in_mid - 1] + w11 * input_const[out2in_mid] + w20 * input_const[out2in_mid + in_w - 1] + w21 * input_const[out2in_mid + in_w] + - (1 - if_pad) * (w12 * input_const[out2in_mid + 1] + - w22 * input_const[out2in_mid + in_w + 1]); + (1 - if_pad_r) * (w12 * input_const[out2in_mid + 1] + + w22 * input_const[out2in_mid + in_w + 1]); - out2in_mid = (out_l - 1) * 2 * in_w; + out2in_mid = (out_h - 1) * 2 * in_w; - output_data_tmp[out_l * (out_l - 1)] = + output_data_tmp[out_w * (out_h - 1)] = w01 * input_const[out2in_mid - in_w] + w02 * 
input_const[out2in_mid - in_w + 1] + w11 * input_const[out2in_mid] + w12 * input_const[out2in_mid + 1] + - (1 - if_pad) * (w21 * input_const[out2in_mid + in_w] + - w22 * input_const[out2in_mid + in_w + 1]); - out2in_mid = (out_l - 1) * 2 * in_w + (out_l - 1) * 2; + (1 - if_pad_b) * (w21 * input_const[out2in_mid + in_w] + + w22 * input_const[out2in_mid + in_w + 1]); + out2in_mid = (out_h - 1) * 2 * in_w + (out_w - 1) * 2; - output_data_tmp[out_l * out_l - 1] = + output_data_tmp[out_h * out_w - 1] = w00 * input_const[out2in_mid - in_w - 1] + w01 * input_const[out2in_mid - in_w] + w10 * input_const[out2in_mid - 1] + w11 * input_const[out2in_mid] + - (1 - if_pad) * (w20 * input_const[out2in_mid + in_w - 1] + - w21 * input_const[out2in_mid + in_w] + - w02 * input_const[out2in_mid - in_w + 1] + - w12 * input_const[out2in_mid + 1] + - w22 * input_const[out2in_mid + in_w + 1]); + (1 - if_pad_r) * (w20 * input_const[out2in_mid + in_w - 1] + + w21 * input_const[out2in_mid + in_w]) + + (1 - if_pad_b) * (w02 * input_const[out2in_mid - in_w + 1] + + w12 * input_const[out2in_mid + 1]) + + (1 - if_pad_r) * (1 - if_pad_b) * w22 * + input_const[out2in_mid + in_w + 1]; output_data_tmp[0] = output_data_tmp[0] * newscale_data[j] + newbias_data[j]; - output_data_tmp[out_l - 1] = - output_data_tmp[out_l - 1] * newscale_data[j] + newbias_data[j]; - output_data_tmp[out_l * (out_l - 1)] = - output_data_tmp[out_l * (out_l - 1)] * newscale_data[j] + + output_data_tmp[out_w - 1] = + output_data_tmp[out_w - 1] * newscale_data[j] + newbias_data[j]; + output_data_tmp[out_w * (out_h - 1)] = + output_data_tmp[out_w * (out_h - 1)] * newscale_data[j] + newbias_data[j]; - output_data_tmp[out_l * out_l - 1] = - output_data_tmp[out_l * out_l - 1] * newscale_data[j] + + output_data_tmp[out_h * out_w - 1] = + output_data_tmp[out_h * out_w - 1] * newscale_data[j] + newbias_data[j]; if (if_relu) { output_data_tmp[0] = output_data_tmp[0] < 0 ? 
0 : output_data_tmp[0]; - output_data_tmp[out_l - 1] = - output_data_tmp[out_l - 1] < 0 ? 0 : output_data_tmp[out_l - 1]; - output_data_tmp[out_l * (out_l - 1)] = - output_data_tmp[out_l * (out_l - 1)] < 0 + output_data_tmp[out_w - 1] = + output_data_tmp[out_w - 1] < 0 ? 0 : output_data_tmp[out_w - 1]; + output_data_tmp[out_w * (out_h - 1)] = + output_data_tmp[out_w * (out_h - 1)] < 0 ? 0 - : output_data_tmp[out_l * (out_l - 1)]; - output_data_tmp[out_l * out_l - 1] = - output_data_tmp[out_l * out_l - 1] < 0 + : output_data_tmp[out_w * (out_h - 1)]; + output_data_tmp[out_h * out_w - 1] = + output_data_tmp[out_h * out_w - 1] < 0 ? 0 - : output_data_tmp[out_l * out_l - 1]; + : output_data_tmp[out_h * out_w - 1]; } for (int i = 1; i < out_h - 1; i++) { out2in_mid = i * 2 * in_w; - output_data_tmp[i * out_l] = w01 * input_const[out2in_mid - in_w] + + output_data_tmp[i * out_w] = w01 * input_const[out2in_mid - in_w] + w02 * input_const[out2in_mid - in_w + 1] + w11 * input_const[out2in_mid] + w12 * input_const[out2in_mid + 1] + w21 * input_const[out2in_mid + in_w] + w22 * input_const[out2in_mid + in_w + 1]; - out2in_mid = i * 2 * in_w + (out_l - 1) * 2; - output_data_tmp[i * out_l + out_l - 1] = + out2in_mid = i * 2 * in_w + (out_w - 1) * 2; + output_data_tmp[i * out_w + out_w - 1] = w00 * input_const[out2in_mid - in_w - 1] + w01 * input_const[out2in_mid - in_w] + w10 * input_const[out2in_mid - 1] + w11 * input_const[out2in_mid] + w20 * input_const[out2in_mid + in_w - 1] + w21 * input_const[out2in_mid + in_w] + - (1 - if_pad) * (w02 * input_const[out2in_mid - in_w + 1] + - w12 * input_const[out2in_mid + 1] + - w22 * input_const[out2in_mid + in_w + 1]); - output_data_tmp[i * out_l] = - output_data_tmp[i * out_l] * newscale_data[j] + newbias_data[j]; - output_data_tmp[i * out_l + out_l - 1] = - output_data_tmp[i * out_l + out_l - 1] * newscale_data[j] + + (1 - if_pad_r) * (w02 * input_const[out2in_mid - in_w + 1] + + w12 * input_const[out2in_mid + 1] + + w22 * 
input_const[out2in_mid + in_w + 1]); + output_data_tmp[i * out_w] = + output_data_tmp[i * out_w] * newscale_data[j] + newbias_data[j]; + output_data_tmp[i * out_w + out_w - 1] = + output_data_tmp[i * out_w + out_w - 1] * newscale_data[j] + newbias_data[j]; if (if_relu) { - output_data_tmp[i * out_l] = - output_data_tmp[i * out_l] < 0 ? 0 : output_data_tmp[i * out_l]; - output_data_tmp[i * out_l + out_l - 1] = - output_data_tmp[i * out_l + out_l - 1] < 0 + output_data_tmp[i * out_w] = + output_data_tmp[i * out_w] < 0 ? 0 : output_data_tmp[i * out_w]; + output_data_tmp[i * out_w + out_w - 1] = + output_data_tmp[i * out_w + out_w - 1] < 0 ? 0 - : output_data_tmp[i * out_l + out_l - 1]; + : output_data_tmp[i * out_w + out_w - 1]; } } } diff --git a/src/operators/math/gemm.cpp b/src/operators/math/gemm.cpp index 44621ba99a92a3ed456b8d7d0959e3580662d910..d3e6de3134ff91f47c66c927194a5ba688e931b0 100644 --- a/src/operators/math/gemm.cpp +++ b/src/operators/math/gemm.cpp @@ -3230,6 +3230,8 @@ void Gemm::Sgemm_omp(int m, int n, int k, float alpha, const float *A, int lda, int L1 = 64 / max_threads * 1024; KC = k; + zero = static_cast(paddle_mobile::memory::Alloc(sizeof(float) * KC)); + memset(static_cast(zero), 0, sizeof(float) * KC); if (m > n) { // 对 A 分块 MC = L1 / (KC * sizeof(float)); @@ -3255,7 +3257,7 @@ void Gemm::Sgemm_omp(int m, int n, int k, float alpha, const float *A, int lda, packedB = static_cast( paddle_mobile::memory::Alloc(sizeof(float) * KC * NC)); - (*this.*procPackB)(KC, NC, NC % NR, B, ldb, packedB); + (*this.*procPackB)(KC, n, n % NR, B, ldb, packedB); packedA = static_cast( paddle_mobile::memory::Alloc(sizeof(float) * MC * KC * max_threads)); } else { @@ -3284,12 +3286,10 @@ void Gemm::Sgemm_omp(int m, int n, int k, float alpha, const float *A, int lda, packedA = static_cast( paddle_mobile::memory::Alloc(sizeof(float) * MC * KC)); - (*this.*procPackA)(MC, KC, MC % MR, A, lda, packedA); + (*this.*procPackA)(m, KC, m % MR, A, lda, packedA); packedB = 
static_cast( paddle_mobile::memory::Alloc(sizeof(float) * KC * NC * max_threads)); } - zero = static_cast(paddle_mobile::memory::Alloc(sizeof(float) * KC)); - memset(static_cast(zero), 0, sizeof(float) * KC); packedC = static_cast( paddle_mobile::memory::Alloc(sizeof(float) * MC * NC * max_threads)); @@ -3307,8 +3307,13 @@ void Gemm::Sgemm_omp(int m, int n, int k, float alpha, const float *A, int lda, float *local_A = packedA + MC * KC * local_threads; float *local_C = packedC + MC * NC * local_threads; (*this.*procPackA)(mc, KC, mc % MR, &A(i, 0), lda, local_A); - InnerKernelWithBias(mc, n, alpha, local_A, packedB, beta, local_C, - &C(i, 0), ldc, relu, bias + i); + if (bias == nullptr) { + InnerKernelWithBias(mc, n, alpha, local_A, packedB, beta, local_C, + &C(i, 0), ldc, relu, nullptr); + } else { + InnerKernelWithBias(mc, n, alpha, local_A, packedB, beta, local_C, + &C(i, 0), ldc, relu, bias + i); + } } } else { #pragma omp parallel for @@ -3347,6 +3352,8 @@ void Gemm::SgemmWithBn_omp(int m, int n, int k, float alpha, const float *A, int L1 = 64 / max_threads * 1024; KC = k; + zero = static_cast(paddle_mobile::memory::Alloc(sizeof(float) * KC)); + memset(static_cast(zero), 0, sizeof(float) * KC); if (m > n) { // 对 A 分块 MC = L1 / (KC * sizeof(float)); @@ -3372,7 +3379,7 @@ void Gemm::SgemmWithBn_omp(int m, int n, int k, float alpha, const float *A, packedB = static_cast( paddle_mobile::memory::Alloc(sizeof(float) * KC * NC)); - (*this.*procPackB)(KC, NC, NC % NR, B, ldb, packedB); + (*this.*procPackB)(KC, n, n % NR, B, ldb, packedB); packedA = static_cast( paddle_mobile::memory::Alloc(sizeof(float) * MC * KC * max_threads)); } else { @@ -3400,12 +3407,10 @@ void Gemm::SgemmWithBn_omp(int m, int n, int k, float alpha, const float *A, packedA = static_cast( paddle_mobile::memory::Alloc(sizeof(float) * MC * KC)); - (*this.*procPackA)(MC, KC, MC % MR, A, lda, packedA); + (*this.*procPackA)(m, KC, m % MR, A, lda, packedA); packedB = static_cast( 
paddle_mobile::memory::Alloc(sizeof(float) * KC * NC * max_threads)); } - zero = static_cast(paddle_mobile::memory::Alloc(sizeof(float) * KC)); - memset(static_cast(zero), 0, sizeof(float) * KC); packedC = static_cast( paddle_mobile::memory::Alloc(sizeof(float) * MC * NC * max_threads)); @@ -3475,6 +3480,8 @@ void Gemm::SgemmWithPRelu_omp(int m, int n, int k, const float *A, int lda, int L1 = 8 * 1024; KC = k; + zero = static_cast(paddle_mobile::memory::Alloc(sizeof(float) * KC)); + memset(static_cast(zero), 0, sizeof(float) * KC); if (m > n) { // 对 A 分块 MC = L1 / (KC * sizeof(float)); @@ -3500,7 +3507,7 @@ void Gemm::SgemmWithPRelu_omp(int m, int n, int k, const float *A, int lda, packedB = static_cast( paddle_mobile::memory::Alloc(sizeof(float) * KC * NC)); - (*this.*procPackB)(KC, NC, NC % NR, B, ldb, packedB); + (*this.*procPackB)(KC, n, n % NR, B, ldb, packedB); packedA = static_cast( paddle_mobile::memory::Alloc(sizeof(float) * MC * KC * max_threads)); } else { @@ -3528,12 +3535,10 @@ void Gemm::SgemmWithPRelu_omp(int m, int n, int k, const float *A, int lda, packedA = static_cast( paddle_mobile::memory::Alloc(sizeof(float) * MC * KC)); - (*this.*procPackA)(MC, KC, MC % MR, A, lda, packedA); + (*this.*procPackA)(m, KC, m % MR, A, lda, packedA); packedB = static_cast( paddle_mobile::memory::Alloc(sizeof(float) * KC * NC * max_threads)); } - zero = static_cast(paddle_mobile::memory::Alloc(sizeof(float) * KC)); - memset(static_cast(zero), 0, sizeof(float) * KC); packedC = static_cast( paddle_mobile::memory::Alloc(sizeof(float) * MC * NC * max_threads)); diff --git a/src/operators/math/gru_compute.cpp b/src/operators/math/gru_compute.cpp index 8ebf92059b5f5205b3169a6992039d3f050b3b4b..9e77f572c53bc2ba9be57f5edbd2b4bf85f5305e 100644 --- a/src/operators/math/gru_compute.cpp +++ b/src/operators/math/gru_compute.cpp @@ -30,20 +30,34 @@ struct GRUUnitFunctor { const ActivationType active_gate) { Gemm gemm; if (value.prev_out_value) { +#ifdef _OPENMP + 
gemm.Sgemm_omp(batch_size, frame_size * 2, frame_size, 1, + value.prev_out_value, frame_size, value.gate_weight, + frame_size * 2, 1, value.gate_value, frame_size * 3, false, + nullptr); +#else gemm.Sgemm(batch_size, frame_size * 2, frame_size, 1, value.prev_out_value, frame_size, value.gate_weight, frame_size * 2, 1, value.gate_value, frame_size * 3, false, nullptr); +#endif } forward_reset_output(forward::gru_resetOutput(), value, frame_size, batch_size, active_gate); if (value.prev_out_value) { +#ifdef _OPENMP + gemm.Sgemm_omp(batch_size, frame_size, frame_size, 1, + value.reset_output_value, frame_size, value.state_weight, + frame_size, 1, value.gate_value + frame_size * 2, + frame_size * 3, false, nullptr); +#else gemm.Sgemm(batch_size, frame_size, frame_size, 1, value.reset_output_value, frame_size, value.state_weight, frame_size, 1, value.gate_value + frame_size * 2, frame_size * 3, false, nullptr); +#endif } forward_final_output(forward::gru_finalOutput(), value, frame_size, diff --git a/src/operators/math/im2col.cpp b/src/operators/math/im2col.cpp index 8d460688bbedf3d2a4e5dadaa5eebb1ca709cf05..9449ad70819f2ea114fac8848f6ee023871d47f2 100644 --- a/src/operators/math/im2col.cpp +++ b/src/operators/math/im2col.cpp @@ -117,7 +117,7 @@ void Im2ColFunctor::operator()( (((isize - 2 * padding[0] + filter_height) % stride[0] == 0) ? 
1 : 0)); int fill = isize % 2; if (stride[0] == 1 && filter_height == 3 && pad1 && pad2 && - dilation[0] == 1 && im_height > 2) { + dilation[0] == 1 && im_height > 2 && im_height == im_width) { for (int c = 0; c < im_channels; ++c) { int oosize = osize * osize; int nk4 = osize / 4; @@ -289,7 +289,7 @@ void Im2ColFunctor::operator()( im_data += isize * isize; } } else if (stride[0] == 2 && filter_height == 3 && pad1 && dilation[0] == 1 && - im_height > 2) { + im_height > 2 && im_height == im_width) { for (int c = 0; c < im_channels; ++c) { int oosize = osize * osize; int nk4 = osize / 4; @@ -676,7 +676,6 @@ class Im2ColFunctor { const T *im_data = im.data(); T *col_data = col->data(); - for (int col_row_idx = 0; col_row_idx < col_height; ++col_row_idx) { for (int col_col_idx = 0; col_col_idx < col_width; ++col_col_idx) { for (int channel = 0; channel < im_channels; ++channel) { @@ -688,7 +687,6 @@ class Im2ColFunctor { ++filter_col_idx) { int im_col_offset = col_col_idx * stride[1] + filter_col_idx - padding[1]; - int col_offset = ((((col_row_idx)*col_width + col_col_idx) * im_channels + channel) * @@ -696,7 +694,6 @@ class Im2ColFunctor { filter_row_idx) * filter_width + filter_col_idx; - int im_offset = (channel * im_height + im_row_offset) * im_width + im_col_offset; col_data[col_offset] = diff --git a/src/operators/math/pool_2x2.cpp b/src/operators/math/pool_2x2.cpp index 9dc3dbafed990de2f4057d98a2accdd8ce2fd7db..88bf866b73f6f06d28f6e1868031ae1a25b9b31c 100644 --- a/src/operators/math/pool_2x2.cpp +++ b/src/operators/math/pool_2x2.cpp @@ -58,7 +58,7 @@ void Pool2x2Maxs2p0(vector strides, vector paddings, const float *in_ptr1 = input_data + i * input_batch_stride + c * input_channel_stride + ph * input_width; const float *in_ptr2 = in_ptr1 + input_width; - if (ph + 1 >= input_height) { + if (ph != input_height && ph + 1 >= input_height) { in_ptr2 = static_cast( paddle_mobile::memory::Alloc(sizeof(float) * input_width)); memset(static_cast(const_cast(in_ptr2)), 
-FLT_MAX, @@ -122,19 +122,30 @@ void Pool2x2Maxs2p0(vector strides, vector paddings, #endif if (_w2 != 0) { - in_ptr1 += 16 * w1 + 4 * w2; - in_ptr2 += 16 * w1 + 4 * w2; - out_ptr += 8 * w1 + 2 * w2; + in_ptr1 = input_data + i * input_batch_stride + + c * input_channel_stride + ph * input_width + 16 * w1 + + 4 * w2; + in_ptr2 = in_ptr1 + input_width; + out_ptr = output_data + i * output_batch_stride + + c * output_channel_stride + ph / 2 * output_width + 8 * w1 + + 2 * w2; if (_w2 == 1) { *out_ptr = (*in_ptr1 > *in_ptr2) ? *in_ptr1 : *in_ptr2; } else if (_w2 == 2) { - float temp = (*in_ptr1++ > *in_ptr2++) ? *in_ptr1++ : *in_ptr2++; + float temp = (*in_ptr1 > *in_ptr2) ? *in_ptr1 : *in_ptr2; + in_ptr1++; + in_ptr2++; float temp1 = (*in_ptr1 > *in_ptr2) ? *in_ptr1 : *in_ptr2; *out_ptr = (temp > temp1) ? temp : temp1; } else if (_w2 == 3) { - float temp = (*in_ptr1++ > *in_ptr2++) ? *in_ptr1++ : *in_ptr2++; - float temp1 = (*in_ptr1++ > *in_ptr2++) ? *in_ptr1++ : *in_ptr2++; - *out_ptr++ = (temp > temp1) ? temp : temp1; + float temp = (*in_ptr1 > *in_ptr2) ? *in_ptr1 : *in_ptr2; + in_ptr1++; + in_ptr2++; + float temp1 = (*in_ptr1 > *in_ptr2) ? *in_ptr1 : *in_ptr2; + in_ptr1++; + in_ptr2++; + *out_ptr = (temp > temp1) ? temp : temp1; + out_ptr++; *out_ptr = (*in_ptr1 > *in_ptr2) ? 
*in_ptr1 : *in_ptr2; } } @@ -173,7 +184,7 @@ void Pool2x2Avgs2p0(vector strides, vector paddings, int w2 = _w1 / 4; int _w2 = _w1 % 4; - float quarter = 1 / 4; + float quarter = 0.25; for (int i = 0; i < batch_size; ++i) { for (int c = 0; c < output_channels; ++c) { for (int ph = 0; ph < input_height; ph += 2) { @@ -250,25 +261,32 @@ void Pool2x2Avgs2p0(vector strides, vector paddings, #endif if (_w2 != 0) { - in_ptr1 += 16 * w1 + 4 * w2; - in_ptr2 += 16 * w1 + 4 * w2; - out_ptr += 8 * w1 + 2 * w2; + in_ptr1 = input_data + i * input_batch_stride + + c * input_channel_stride + ph * input_width + 16 * w1 + + 4 * w2; + in_ptr2 = in_ptr1 + input_width; + out_ptr = output_data + i * output_batch_stride + + c * output_channel_stride + ph / 2 * output_width + 8 * w1 + + 2 * w2; if (_w2 == 1) { *out_ptr = 0.5 * (*in_ptr1 + *in_ptr2); } else if (_w2 == 2) { float temp = 0; - temp += *in_ptr1++; - temp += *in_ptr2++; temp += *in_ptr1; temp += *in_ptr2; - *out_ptr = 0.5 * temp; + in_ptr1++; + in_ptr2++; + temp += *in_ptr1; + temp += *in_ptr2; + *out_ptr = 0.25 * temp; } else if (_w2 == 3) { float temp = 0; temp += *in_ptr1++; temp += *in_ptr2++; temp += *in_ptr1++; temp += *in_ptr2++; - *out_ptr++ = 0.5 * temp; + *out_ptr = 0.25 * temp; + out_ptr++; *out_ptr = 0.5 * (*in_ptr1 + *in_ptr2); } } diff --git a/src/operators/math/softmax.cpp b/src/operators/math/softmax.cpp index dba88c93969014f2ad0d2636b4141c734dbc2ed5..9c23d99e60f6c7f38f372cbe2d221ae3c1a58592 100644 --- a/src/operators/math/softmax.cpp +++ b/src/operators/math/softmax.cpp @@ -141,13 +141,21 @@ class SoftmaxFuntor { public: void operator()(const framework::Tensor *X, framework::Tensor *Y) { const DDim dDim = X->dims(); + int dim1 = dDim[dDim.size() - 1]; + int dim0 = X->numel() / dim1 / dDim[0]; + framework::DDim matrix_shape = {dim0, dim1}; for (int i = 0; i < dDim[0]; ++i) { framework::Tensor sub_X = X->Slice(i, i + 1); framework::Tensor sub_Y = Y->Slice(i, i + 1); - + sub_X.Resize(matrix_shape); + 
sub_Y.Resize(matrix_shape); + for (int j = 0; j < dim0; j++) { + framework::Tensor sub_x = sub_X.Slice(j, j + 1); + framework::Tensor sub_y = sub_Y.Slice(j, j + 1); #ifdef __ARM_NEON - SoftmaxCacl(&sub_X, &sub_Y); + SoftmaxCacl(&sub_x, &sub_y); #endif + } } } }; diff --git a/src/operators/mul_op.h b/src/operators/mul_op.h index 5cd174db07973461fe699242a2013d9c4ea78732..51e828202e8da2080f014eff2bd60472dd873884 100644 --- a/src/operators/mul_op.h +++ b/src/operators/mul_op.h @@ -35,10 +35,6 @@ class MulOp : public framework::OperatorWithKernel< : framework::OperatorWithKernel, operators::MulKernel>( type, inputs, outputs, attrs, scope) {} - - using framework::OperatorWithKernel< - DeviceType, MulParam, - operators::MulKernel>::OperatorWithKernel; void InferShape() const override; protected: diff --git a/src/operators/multiclass_nms_op.h b/src/operators/multiclass_nms_op.h index 4919ec69b6b5b1a702760f46ddbfc77b16c7875e..059974ab214004bcd1423514c85353da9a9bb6b8 100644 --- a/src/operators/multiclass_nms_op.h +++ b/src/operators/multiclass_nms_op.h @@ -40,10 +40,6 @@ class MultiClassNMSOp : public framework::OperatorWithKernel< DeviceType, MultiClassNMSParam, operators::MultiClassNMSKernel>( type, inputs, outputs, attrs, scope) {} - - using framework::OperatorWithKernel< - DeviceType, MultiClassNMSParam, - operators::MultiClassNMSKernel>::OperatorWithKernel; void InferShape() const override; protected: diff --git a/src/operators/op_param.h b/src/operators/op_param.h index fb45cc9ac7fd60471f406f5208f906f000338011..0862ed9b69faa079b2bf841b014f451f9b44e855 100644 --- a/src/operators/op_param.h +++ b/src/operators/op_param.h @@ -23,8 +23,17 @@ limitations under the License. 
*/ #include "framework/scope.h" #include "framework/tensor.h" #include "framework/variable.h" -#ifdef PADDLE_MOBILE_FPGA -#include "fpga/api.h" + +#ifdef PADDLE_MOBILE_FPGA_V1 +#include "fpga/V1/api.h" +#endif + +#ifdef PADDLE_MOBILE_FPGA_V2 +#include "fpga/V2/api.h" +#endif + +#ifdef PADDLE_MOBILE_CL +#include "framework/cl/cl_image.h" #endif namespace paddle_mobile { @@ -48,6 +57,17 @@ struct DtypeTensorTrait { typedef framework::Tensor rtype; }; +#ifdef PADDLE_MOBILE_CL +template <> +struct DtypeTensorTrait { + // This is the type we obtained in variable. + typedef framework::CLImage gtype; + // This type will be the parent class type + // or the same type. + typedef framework::CLImage rtype; +}; +#endif + class OpParam { protected: template @@ -243,6 +263,12 @@ class OpParam { return GetVarValue("Y", outputs, scope); } + template + static T *OutputXShapeFrom(const VariableNameMap &outputs, + const Scope &scope) { + return GetVarValue("XShape", outputs, scope); + } + template static T *OutputBoxesFrom(const VariableNameMap &outputs, const Scope &scope) { @@ -403,6 +429,13 @@ class ConvParam : public OpParam { const int &Groups() const { return groups; } +#ifdef PADDLE_MOBILE_CL + int Offset() const { return offset_; } + + int SetOffset(int in_offset) { offset_ = in_offset; } + +#endif + private: RType *input_; mutable RType *output_; @@ -412,6 +445,20 @@ class ConvParam : public OpParam { vector dilations_; mutable enum ExecMode exec_mode_; int groups; + +#ifdef PADDLE_MOBILE_CL + int offset_; +#endif + +#ifdef PADDLE_MOBILE_FPGA + + private: + fpga::SplitConvArgs fpga_conv_args; + + public: + const fpga::SplitConvArgs &FpgaArgs() const { return fpga_conv_args; } + void SetFpgaArgs(const fpga::SplitConvArgs &args) { fpga_conv_args = args; } +#endif }; template Print &operator<<(Print &printer, const ConvParam &conv_param); @@ -556,15 +603,6 @@ class MulParam : OpParam { GType *out_; int x_num_col_dims_; int y_num_col_dims_; -#ifdef PADDLE_MOBILE_FPGA - - 
private: - fpga::SplitConvArgs fpga_conv_args; - - public: - const fpga::SplitConvArgs &FpgaArgs() const { return fpga_conv_args; } - void SetFpgaArgs(const fpga::SplitConvArgs &args) { fpga_conv_args = args; } -#endif }; #endif @@ -722,6 +760,14 @@ class BatchNormParam : OpParam { const string &DataFormat() const { return data_format_; } + void SetNewScale(RType *new_scale) { new_scale_ = new_scale; } + + void SetNewBias(RType *new_bias) { new_bias_ = new_bias; } + + const RType *NewScale() const { return new_scale_; } + + const RType *NewBias() const { return new_bias_; } + private: RType *input_x_; RType *output_y_; @@ -733,6 +779,8 @@ class BatchNormParam : OpParam { float momentum_; bool is_test_; string data_format_; + RType *new_bias_; + RType *new_scale_; }; #endif @@ -1041,18 +1089,18 @@ class FeedParam : public OpParam { public: FeedParam(const VariableNameMap &inputs, const VariableNameMap &outputs, - const AttributeMap &attrs, Scope *scope) { - input_x_ = InputXFrom(inputs, *scope); - out_ = OutFrom(outputs, *scope); - auto var = scope->Var("batch_size"); + const AttributeMap &attrs, const Scope &scope) { + input_x_ = InputXFrom(inputs, scope); + out_ = OutFrom(outputs, scope); + auto var = scope.FindVar("batch_size"); batch_size = var->GetValue(); } - const GType *InputX() const { return input_x_; } + const LoDTensor *InputX() const { return input_x_; } GType *Out() const { return out_; } const int BatchSize() const { return batch_size; } private: - GType *input_x_; + LoDTensor *input_x_; GType *out_; int batch_size; }; @@ -1066,14 +1114,19 @@ class FetchParam : public OpParam { FetchParam(const VariableNameMap &inputs, const VariableNameMap &outputs, const AttributeMap &attrs, const Scope &scope) { input_x_ = InputXFrom(inputs, scope); - out_ = OutFrom(outputs, scope); + out_ = OutFrom(outputs, scope); } + const RType *InputX() const { return input_x_; } - RType *Out() const { return out_; } + Tensor *Out() const { return out_; } + + static Tensor 
*OutFrom(const VariableNameMap &outputs, const Scope &scope) { + return GetVarValue("Out", outputs, scope); + } private: RType *input_x_; - RType *out_; + Tensor *out_; }; #ifdef FILL_CONSTANT_OP @@ -1139,6 +1192,37 @@ class TransposeParam : public OpParam { }; #endif +#ifdef TRANSPOSE2_OP +template +class Transpose2Param : public OpParam { + typedef typename DtypeTensorTrait::gtype GType; + typedef typename DtypeTensorTrait::rtype RType; + + public: + Transpose2Param(const VariableNameMap &inputs, const VariableNameMap &outputs, + const AttributeMap &attrs, const Scope &scope) { + input_x_ = InputXFrom(inputs, scope); + out_ = OutFrom(outputs, scope); + output_xshape_ = OutputXShapeFrom(outputs, scope); + axis_ = GetAttr>("axis", attrs); + } + + const RType *InputX() const { return input_x_; } + + RType *Out() const { return out_; } + + RType *OutputXShape() const { return output_xshape_; } + + const vector &Axis() const { return axis_; } + + private: + RType *input_x_; + RType *out_; + RType *output_xshape_; + vector axis_; +}; +#endif + #ifdef LOOKUP_OP template class LookupParam : public OpParam { @@ -1246,6 +1330,49 @@ class ReshapeParam : public OpParam { }; #endif +#ifdef RESHAPE2_OP +template +class Reshape2Param : public OpParam { + typedef typename DtypeTensorTrait::gtype GType; + typedef typename DtypeTensorTrait::rtype RType; + + public: + Reshape2Param(const VariableNameMap &inputs, const VariableNameMap &outputs, + const AttributeMap &attrs, const Scope &scope) { + input_x_ = InputXFrom(inputs, scope); + input_shape_ = InputShapeFrom(inputs, scope); + out_ = OutFrom(outputs, scope); + output_xshape_ = OutputXShapeFrom(outputs, scope); + shape_ = GetAttr>("shape", attrs); + if (HasAttr("inplace", attrs)) { + inplace_ = GetAttr("inplace", attrs); + } else { + inplace_ = false; + } + } + + const GType *InputX() const { return input_x_; } + + const GType *InputShape() const { return input_shape_; } + + GType *Out() const { return out_; } + + GType 
*OutputXShape() const { return output_xshape_; } + + const vector &Shape() const { return shape_; } + + const bool &Inplace() const { return inplace_; } + + private: + GType *input_x_; + GType *input_shape_; + GType *out_; + GType *output_xshape_; + vector shape_; + bool inplace_; +}; +#endif + #ifdef SCALE_OP template class ScaleParam : public OpParam { @@ -1380,13 +1507,13 @@ class ResizeParam : public OpParam { * @b op 层实例化好这个 param 传递给 kernel 层使用 * */ template -class ReluParam : public OpParam { +class ReluParamBase : public OpParam { typedef typename DtypeTensorTrait::gtype GType; typedef typename DtypeTensorTrait::rtype RType; public: - ReluParam(const VariableNameMap &inputs, const VariableNameMap &outputs, - const AttributeMap &attrs, const Scope &scope) { + ReluParamBase(const VariableNameMap &inputs, const VariableNameMap &outputs, + const AttributeMap &attrs, const Scope &scope) { input_x_ = InputXFrom(inputs, scope); out_ = OutFrom(outputs, scope); } @@ -1399,6 +1526,46 @@ class ReluParam : public OpParam { RType *input_x_; RType *out_; }; + +template +class ReluParam : public ReluParamBase { + public: + using ReluParamBase::ReluParamBase; +}; + +#ifdef PADDLE_MOBILE_CL +template <> +class ReluParam : public ReluParamBase { + public: + using ReluParamBase::ReluParamBase; + framework::CLImage &getMidImage() { return midImage; } + + private: + framework::CLImage midImage; +}; +#endif + +#endif + +#ifdef TANH_OP +template +class TanhParam : public OpParam { + typedef typename DtypeTensorTrait::gtype GType; + typedef typename DtypeTensorTrait::rtype RType; + + public: + TanhParam(const VariableNameMap &inputs, const VariableNameMap &outputs, + const AttributeMap &attrs, const Scope &scope) { + input_x_ = InputXFrom(inputs, scope); + out_ = OutFrom(outputs, scope); + } + const RType *InputX() const { return input_x_; } + RType *Out() const { return out_; } + + private: + RType *input_x_; + RType *out_; +}; #endif #ifdef PRELU_OP @@ -1509,15 +1676,6 @@ class 
FusionConvAddParam : public ConvParam { RType *bias_; int axis_; RType *output_; -#ifdef PADDLE_MOBILE_FPGA - - private: - fpga::SplitConvArgs fpga_conv_args; - - public: - const fpga::SplitConvArgs &FpgaArgs() const { return fpga_conv_args; } - void SetFpgaArgs(const fpga::SplitConvArgs &args) { fpga_conv_args = args; } -#endif }; template @@ -1564,15 +1722,6 @@ class FusionConvAddPReluParam : public ConvParam { RType *output_; RType *alpha_; std::string mode_; -#ifdef PADDLE_MOBILE_FPGA - - private: - fpga::SplitConvArgs fpga_conv_args; - - public: - const fpga::SplitConvArgs &FpgaArgs() const { return fpga_conv_args; } - void SetFpgaArgs(const fpga::SplitConvArgs &args) { fpga_conv_args = args; } -#endif }; #endif @@ -1622,15 +1771,6 @@ class FusionConvAddAddPReluParam : public ConvParam { std::string keyOutput_; std::string keyX1_; std::string keyY1_; -#ifdef PADDLE_MOBILE_FPGA - - private: - fpga::SplitConvArgs fpga_conv_args; - - public: - const fpga::SplitConvArgs &FpgaArgs() const { return fpga_conv_args; } - void SetFpgaArgs(const fpga::SplitConvArgs &args) { fpga_conv_args = args; } -#endif }; #endif @@ -1697,15 +1837,6 @@ class FusionConvAddBNReluParam : public ConvParam { bool is_test_; RType *new_bias_; RType *new_scale_; -#ifdef PADDLE_MOBILE_FPGA - - private: - fpga::SplitConvArgs fpga_conv_args; - - public: - const fpga::SplitConvArgs &FpgaArgs() const { return fpga_conv_args; } - void SetFpgaArgs(const fpga::SplitConvArgs &args) { fpga_conv_args = args; } -#endif }; #endif @@ -1783,15 +1914,6 @@ class FusionConvBNAddReluParam : public ConvParam { std::string keyBNY_; std::string keyX_; std::string keyY_; -#ifdef PADDLE_MOBILE_FPGA - - private: - fpga::SplitConvArgs fpga_conv_args; - - public: - const fpga::SplitConvArgs &FpgaArgs() const { return fpga_conv_args; } - void SetFpgaArgs(const fpga::SplitConvArgs &args) { fpga_conv_args = args; } -#endif }; #endif @@ -1850,15 +1972,6 @@ class FusionConvBNParam : public ConvParam { bool is_test_; RType 
*new_bias_; RType *new_scale_; -#ifdef PADDLE_MOBILE_FPGA - - private: - fpga::SplitConvArgs fpga_conv_args; - - public: - const fpga::SplitConvArgs &FpgaArgs() const { return fpga_conv_args; } - void SetFpgaArgs(const fpga::SplitConvArgs &args) { fpga_conv_args = args; } -#endif }; #endif @@ -1925,15 +2038,6 @@ class FusionConvAddBNParam : public ConvParam { bool is_test_; RType *new_bias_; RType *new_scale_; -#ifdef PADDLE_MOBILE_FPGA - - private: - fpga::SplitConvArgs fpga_conv_args; - - public: - const fpga::SplitConvArgs &FpgaArgs() const { return fpga_conv_args; } - void SetFpgaArgs(const fpga::SplitConvArgs &args) { fpga_conv_args = args; } -#endif }; #endif @@ -2051,15 +2155,6 @@ class FusionConvBNReluParam : public ConvParam { bool is_test_; RType *new_bias_; RType *new_scale_; -#ifdef PADDLE_MOBILE_FPGA - - private: - fpga::SplitConvArgs fpga_conv_args; - - public: - const fpga::SplitConvArgs &FpgaArgs() const { return fpga_conv_args; } - void SetFpgaArgs(const fpga::SplitConvArgs &args) { fpga_conv_args = args; } -#endif }; #endif @@ -2080,9 +2175,9 @@ class Im2SequenceParam : public OpParam { paddings_ = GetAttr>("paddings", attrs); } - const RType *Input() const { return input_x_; } + const GType *Input() const { return input_x_; } - RType *Output() const { return out_; } + GType *Output() const { return out_; } const vector &Kernels() const { return kernels_; } @@ -2091,8 +2186,8 @@ class Im2SequenceParam : public OpParam { const vector &Paddings() const { return paddings_; } private: - RType *input_x_; - RType *out_; + GType *input_x_; + GType *out_; vector kernels_; vector strides_; vector paddings_; @@ -2168,9 +2263,24 @@ class ConvTransposeParam : public OpParam { vector paddings_; vector dilations_; int groups; + +#ifdef PADDLE_MOBILE_FPGA + + private: + fpga::DeconvArgs fpga_conv_args; + + public: + const fpga::DeconvArgs &FpgaArgs() const { return fpga_conv_args; } + void SetFpgaArgs(const fpga::DeconvArgs &args) { fpga_conv_args = args; } 
+#endif }; #endif +#ifdef FUSION_DECONVRELU_OP +template +using FusionDeconvReluParam = ConvTransposeParam; +#endif + #ifdef GRU_OP template class GruParam : public OpParam { diff --git a/src/operators/pool_op.cpp b/src/operators/pool_op.cpp index dd23059ea01a332aff45137b7f7ed4c9f6c2e1bb..241f278ec0c5dd10e103b3ab1aa6f296323eebce 100644 --- a/src/operators/pool_op.cpp +++ b/src/operators/pool_op.cpp @@ -14,7 +14,8 @@ limitations under the License. */ #ifdef POOL_OP -#include "pool_op.h" +#include "operators/pool_op.h" +#include #include "framework/op_proto_maker.h" #include "framework/op_registry.h" @@ -68,5 +69,8 @@ REGISTER_OPERATOR_MALI_GPU(pool2d, ops::PoolOp); #ifdef PADDLE_MOBILE_FPGA REGISTER_OPERATOR_FPGA(pool2d, ops::PoolOp); #endif +#ifdef PADDLE_MOBILE_CL +REGISTER_OPERATOR_CL(pool2d, ops::PoolOp); +#endif #endif diff --git a/src/operators/pool_op.h b/src/operators/pool_op.h index 9880599ce5fc71048d6a555b3fa4848c5d7a8220..8f3957e29ee0802576f604900f8d15f86a864d53 100644 --- a/src/operators/pool_op.h +++ b/src/operators/pool_op.h @@ -38,9 +38,6 @@ class PoolOp : public OperatorWithKernel, : OperatorWithKernel, operators::PoolKernel>( type, inputs, outputs, attrs, scope) {} - using OperatorWithKernel< - DeviceType, PoolParam, - operators::PoolKernel>::OperatorWithKernel; void InferShape() const override; private: diff --git a/src/operators/prelu_op.h b/src/operators/prelu_op.h index af33476b7298a5728a6ef944506d55f422a2fa8c..5d0458f896941ece4208ca4b4931db189b4f436e 100644 --- a/src/operators/prelu_op.h +++ b/src/operators/prelu_op.h @@ -38,10 +38,6 @@ class PReluOp : public framework::OperatorWithKernel< : framework::OperatorWithKernel, operators::PReluKernel>( type, inputs, outputs, attrs, scope) {} - - using framework::OperatorWithKernel< - DeviceType, PReluParam, - operators::PReluKernel>::OperatorWithKernel; void InferShape() const override; protected: diff --git a/src/operators/prior_box_op.h b/src/operators/prior_box_op.h index 
f7e02802ae82368319d5e9095c73afcac295b4fc..f7e26430a0536cde011de14f670a9f46b8f517c1 100644 --- a/src/operators/prior_box_op.h +++ b/src/operators/prior_box_op.h @@ -40,9 +40,6 @@ class PriorBoxOp : public framework::OperatorWithKernel< operators::PriorBoxKernel>( type, inputs, outputs, attrs, scope) {} - using framework::OperatorWithKernel< - DeviceType, PriorBoxParam, - operators::PriorBoxKernel>::OperatorWithKernel; void InferShape() const override; protected: diff --git a/src/operators/relu_op.cpp b/src/operators/relu_op.cpp index 933e1cfce064d63664ebc35b7ac331d4f32b74b9..d6d83475ee7879f8bc967439dac2094df12c8617 100644 --- a/src/operators/relu_op.cpp +++ b/src/operators/relu_op.cpp @@ -39,5 +39,10 @@ REGISTER_OPERATOR_CPU(relu, ops::ReluOp); #ifdef PADDLE_MOBILE_MALI_GPU REGISTER_OPERATOR_MALI_GPU(relu, ops::ReluOp); #endif +#ifdef PADDLE_MOBILE_FPGA +#endif +#ifdef PADDLE_MOBILE_CL +REGISTER_OPERATOR_CL(relu, ops::ReluOp); +#endif #endif diff --git a/src/operators/relu_op.h b/src/operators/relu_op.h index 584c9da3c80c4e3e9e69fdb70a602cdd486e26b8..1c94a7f6d71484d0a4bd14e89d8518f6e73a660b 100644 --- a/src/operators/relu_op.h +++ b/src/operators/relu_op.h @@ -41,10 +41,6 @@ class ReluOp : public framework::OperatorWithKernel< : framework::OperatorWithKernel, operators::ReluKernel>( type, inputs, outputs, attrs, scope) {} - - using framework::OperatorWithKernel< - DeviceType, ReluParam, - operators::ReluKernel>::OperatorWithKernel; void InferShape() const override; protected: diff --git a/src/operators/reshape2_op.cpp b/src/operators/reshape2_op.cpp new file mode 100644 index 0000000000000000000000000000000000000000..d1623076570d466fc53f885374060c5e744365ed --- /dev/null +++ b/src/operators/reshape2_op.cpp @@ -0,0 +1,47 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#ifdef RESHAPE2_OP + +#include "operators/reshape2_op.h" +#include +#include "operators/kernel/reshape_kernel.h" +namespace paddle_mobile { +namespace operators { + +template +void Reshape2Op::InferShape() const { + auto &shape = this->param_.Shape(); + auto input_x_dims = this->param_.InputX()->dims(); + auto out_dims = ValidateShape(shape, input_x_dims); + this->param_.Out()->Resize(out_dims); + std::vector xshape_dims(input_x_dims.size() + 1, 0); + for (int i = 0; i < input_x_dims.size(); ++i) { + xshape_dims[i + 1] = input_x_dims[i]; + } + this->param_.OutputXShape()->Resize(framework::make_ddim(xshape_dims)); +} + +} // namespace operators +} // namespace paddle_mobile + +namespace ops = paddle_mobile::operators; +#ifdef PADDLE_MOBILE_CPU +REGISTER_OPERATOR_CPU(reshape2, ops::Reshape2Op); +#endif +#ifdef PADDLE_MOBILE_MALI_GPU +REGISTER_OPERATOR_MALI_GPU(reshape2, ops::Reshape2Op); +#endif + +#endif diff --git a/src/operators/reshape2_op.h b/src/operators/reshape2_op.h new file mode 100644 index 0000000000000000000000000000000000000000..3a06c2b9b90233b6ad0bacb6176f4cc274ff1cc0 --- /dev/null +++ b/src/operators/reshape2_op.h @@ -0,0 +1,54 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#ifdef RESHAPE2_OP + +#pragma once + +#include + +#include "framework/operator.h" +#include "operators/kernel/reshape2_kernel.h" +#include "operators/op_param.h" + +namespace paddle_mobile { +namespace operators { + +using paddle_mobile::framework::Tensor; + +template +class Reshape2Op : public framework::OperatorWithKernel< + DeviceType, Reshape2Param, + operators::Reshape2Kernel> { + public: + Reshape2Op(const std::string &type, const VariableNameMap &inputs, + const VariableNameMap &outputs, + const framework::AttributeMap &attrs, + std::shared_ptr scope) + : framework::OperatorWithKernel, + operators::Reshape2Kernel>( + type, inputs, outputs, attrs, scope) {} + + using framework::OperatorWithKernel< + DeviceType, Reshape2Param, + operators::Reshape2Kernel>::OperatorWithKernel; + void InferShape() const override; + + protected: +}; + +} // namespace operators +} // namespace paddle_mobile + +#endif diff --git a/src/operators/reshape_op.cpp b/src/operators/reshape_op.cpp index 214007545844e19cf698c6294416a6501a595b58..8ceb157d28764de469e5de5108ad483387ba8ca9 100644 --- a/src/operators/reshape_op.cpp +++ b/src/operators/reshape_op.cpp @@ -38,5 +38,8 @@ REGISTER_OPERATOR_CPU(reshape, ops::ReshapeOp); #ifdef PADDLE_MOBILE_MALI_GPU REGISTER_OPERATOR_MALI_GPU(reshape, ops::ReshapeOp); #endif +#ifdef PADDLE_MOBILE_CL +REGISTER_OPERATOR_CL(reshape, ops::ReshapeOp); +#endif #endif diff --git a/src/operators/reshape_op.h b/src/operators/reshape_op.h index a7347ddd8c6511224d4422f66eac71e61bf48549..3109303ff0e6007d0dbec133102924ff7bb30306 100644 --- 
a/src/operators/reshape_op.h +++ b/src/operators/reshape_op.h @@ -39,10 +39,6 @@ class ReshapeOp : public framework::OperatorWithKernel< : framework::OperatorWithKernel, operators::ReshapeKernel>( type, inputs, outputs, attrs, scope) {} - - using framework::OperatorWithKernel< - DeviceType, ReshapeParam, - operators::ReshapeKernel>::OperatorWithKernel; void InferShape() const override; protected: diff --git a/src/operators/resize_op.h b/src/operators/resize_op.h index c0b38bb1cf4048af4b07d05f28a88a5ac8056ea3..954b3a82f8d2b5ccba242045c3d5e0f28553d484 100644 --- a/src/operators/resize_op.h +++ b/src/operators/resize_op.h @@ -38,10 +38,6 @@ class ResizeOp : public framework::OperatorWithKernel< : framework::OperatorWithKernel, operators::ResizeKernel>( type, inputs, outputs, attrs, scope) {} - - using framework::OperatorWithKernel< - DeviceType, ResizeParam, - operators::ResizeKernel>::OperatorWithKernel; void InferShape() const override; protected: diff --git a/src/operators/scale_op.h b/src/operators/scale_op.h index 4c5f5e620f25bef88533e80cdd78b243fef9bc70..56265259fe3a10feda67cc5c5732b2ba44e0730e 100644 --- a/src/operators/scale_op.h +++ b/src/operators/scale_op.h @@ -38,10 +38,6 @@ class ScaleOp : public framework::OperatorWithKernel< : framework::OperatorWithKernel, operators::ScaleKernel>( type, inputs, outputs, attrs, scope) {} - - using framework::OperatorWithKernel< - DeviceType, ScaleParam, - operators::ScaleKernel>::OperatorWithKernel; void InferShape() const override; protected: diff --git a/src/operators/shape_op.h b/src/operators/shape_op.h index 37b4fef1f4667051e51adbd96d6ada36bf36b647..116751c48e9ca3cc9ec936b1bcbaa72b6950bbc5 100644 --- a/src/operators/shape_op.h +++ b/src/operators/shape_op.h @@ -38,10 +38,6 @@ class ShapeOp : public framework::OperatorWithKernel< : framework::OperatorWithKernel, operators::ShapeKernel>( type, inputs, outputs, attrs, scope) {} - - using framework::OperatorWithKernel< - DeviceType, ShapeParam, - 
operators::ShapeKernel>::OperatorWithKernel; void InferShape() const override; }; diff --git a/src/operators/sigmoid_op.h b/src/operators/sigmoid_op.h index 62fc65dce1025fff629dd81ea4a7f797ded1a1d6..7150a8a473e4cb1dba7230d63799bd263ef19812 100644 --- a/src/operators/sigmoid_op.h +++ b/src/operators/sigmoid_op.h @@ -36,11 +36,6 @@ class SigmoidOp : public framework::OperatorWithKernel< : framework::OperatorWithKernel, operators::SigmoidKernel>( type, inputs, outputs, attrs, scope) {} - - using framework::OperatorWithKernel< - DeviceType, SigmoidParam, - operators::SigmoidKernel>::OperatorWithKernel; - void InferShape() const override; }; diff --git a/src/operators/slice_op.cpp b/src/operators/slice_op.cpp index ac6c434c9450905931abeb395b294bed64c036b0..5704737902c03c476907ab527495b46c52567ed5 100644 --- a/src/operators/slice_op.cpp +++ b/src/operators/slice_op.cpp @@ -34,5 +34,7 @@ REGISTER_OPERATOR_CPU(slice, ops::SliceOp); #ifdef PADDLE_MOBILE_MALI_GPU REGISTER_OPERATOR_MALI_GPU(slice, ops::SliceOp); #endif - +#ifdef PADDLE_MOBILE_FPGA +REGISTER_OPERATOR_FPGA(slice, ops::SliceOp); +#endif #endif diff --git a/src/operators/slice_op.h b/src/operators/slice_op.h index 6bcb6fa0b9e88cefb3c88dfc096e1073ad261c1b..c45061696577dbe6948fb9cab7edebbaf8e15f2f 100644 --- a/src/operators/slice_op.h +++ b/src/operators/slice_op.h @@ -38,10 +38,6 @@ class SliceOp : public framework::OperatorWithKernel< : framework::OperatorWithKernel, operators::SliceKernel>( type, inputs, outputs, attrs, scope) {} - - using framework::OperatorWithKernel< - DeviceType, SliceParam, - operators::SliceKernel>::OperatorWithKernel; void InferShape() const override; protected: diff --git a/src/operators/softmax_op.cpp b/src/operators/softmax_op.cpp index e85edc69c3291c794f2eeb8119b91b2926c4d870..e605864706a6c59a35205b3072dd432b009c5d1f 100644 --- a/src/operators/softmax_op.cpp +++ b/src/operators/softmax_op.cpp @@ -36,5 +36,8 @@ REGISTER_OPERATOR_MALI_GPU(softmax, ops::SoftmaxOp); #ifdef 
PADDLE_MOBILE_FPGA REGISTER_OPERATOR_FPGA(softmax, ops::SoftmaxOp); #endif +#ifdef PADDLE_MOBILE_CL +REGISTER_OPERATOR_CL(softmax, ops::SoftmaxOp); +#endif #endif diff --git a/src/operators/softmax_op.h b/src/operators/softmax_op.h index cee5993174a02f610c1de0ad47ca6b73477fd946..422213feeaf2bc2301832de2f9c69827342a5062 100644 --- a/src/operators/softmax_op.h +++ b/src/operators/softmax_op.h @@ -36,11 +36,6 @@ class SoftmaxOp : public framework::OperatorWithKernel< : framework::OperatorWithKernel, operators::SoftmaxKernel>( type, inputs, outputs, attrs, scope) {} - - using framework::OperatorWithKernel< - DeviceType, SoftmaxParam, - operators::SoftmaxKernel>::OperatorWithKernel; - void InferShape() const override; private: diff --git a/src/operators/split_op.h b/src/operators/split_op.h index d37bf7a0f93005a4c95e7e82c7c90313fda409cb..fc733c18520b971107e00003b3107b8c0aa9b36d 100644 --- a/src/operators/split_op.h +++ b/src/operators/split_op.h @@ -38,10 +38,6 @@ class SplitOp : public framework::OperatorWithKernel< : framework::OperatorWithKernel, operators::SplitKernel>( type, inputs, outputs, attrs, scope) {} - - using framework::OperatorWithKernel< - DeviceType, SplitParam, - operators::SplitKernel>::OperatorWithKernel; void InferShape() const override; }; } // namespace operators diff --git a/src/operators/tanh_op.cpp b/src/operators/tanh_op.cpp new file mode 100644 index 0000000000000000000000000000000000000000..454cdfa26942eda225a811317e907b1989bcf61b --- /dev/null +++ b/src/operators/tanh_op.cpp @@ -0,0 +1,35 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#ifdef TANH_OP + +#include "operators/tanh_op.h" + +namespace paddle_mobile { +namespace operators { + +template +void TanhOp::InferShape() const { + this->param_.Out()->Resize(this->param_.InputX()->dims()); +} + +} // namespace operators +} // namespace paddle_mobile + +namespace ops = paddle_mobile::operators; +#ifdef PADDLE_MOBILE_FPGA +REGISTER_OPERATOR_FPGA(Tanh, ops::TanhOp); +#endif + +#endif diff --git a/src/operators/tanh_op.h b/src/operators/tanh_op.h new file mode 100644 index 0000000000000000000000000000000000000000..82b0e4e9a07ae4fd3e4885790d5832065ed3eb49 --- /dev/null +++ b/src/operators/tanh_op.h @@ -0,0 +1,44 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#ifdef TANH_OP + +#pragma once + +#include +#include "framework/operator.h" +#include "operators/kernel/tanh_kernel.h" +#include "operators/op_param.h" + +namespace paddle_mobile { +namespace operators { + +template +class TanhOp : public framework::OperatorWithKernel< + DeviceType, TanhParam, + operators::TanhKernel> { + public: + TanhOp(const std::string &type, const VariableNameMap &inputs, + const VariableNameMap &outputs, const framework::AttributeMap &attrs, + std::shared_ptr scope) + : framework::OperatorWithKernel, + operators::TanhKernel>( + type, inputs, outputs, attrs, scope) {} + void InferShape() const override; +}; + +} // namespace operators +} // namespace paddle_mobile + +#endif diff --git a/src/operators/transpose2_op.cpp b/src/operators/transpose2_op.cpp new file mode 100644 index 0000000000000000000000000000000000000000..64d07991f60b4057e3d2841afa1bfe6483f31a88 --- /dev/null +++ b/src/operators/transpose2_op.cpp @@ -0,0 +1,64 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#ifdef TRANSPOSE2_OP + +#include + +#include "common/enforce.h" +#include "operators/transpose2_op.h" +namespace paddle_mobile { +namespace operators { + +template +void Transpose2Op::InferShape() const { + auto input_x_dims = this->param_.InputX()->dims(); + auto axis = this->param_.Axis(); + + size_t x_dims_size = input_x_dims.size(); + size_t axis_size = axis.size(); + + PADDLE_MOBILE_ENFORCE((x_dims_size == axis_size), + "input_dims must " + "be equal to the axis_size. ") + + std::vector count(axis_size, 0); + for (size_t i = 0; i < axis_size; i++) { + PADDLE_MOBILE_ENFORCE( + axis[i] < static_cast(axis_size) && ++count[axis[i]] == 1, + "Each element of Attribute axis should be a unique value " + "range from 0 to (dims - 1), " + "where the dims is the axis's size"); + } + framework::DDim out_dims(input_x_dims); + for (size_t i = 0; i < axis_size; i++) { + out_dims[i] = input_x_dims[axis[i]]; + } + this->param_.Out()->Resize(out_dims); + std::vector xshape_dims(input_x_dims.size() + 1, 0); + for (int i = 0; i < input_x_dims.size(); ++i) { + xshape_dims[i + 1] = input_x_dims[i]; + } + this->param_.OutputXShape()->Resize(framework::make_ddim(xshape_dims)); +} + +} // namespace operators +} // namespace paddle_mobile + +namespace ops = paddle_mobile::operators; +#ifdef PADDLE_MOBILE_CPU +REGISTER_OPERATOR_CPU(transpose2, ops::Transpose2Op); +#endif + +#endif // TRANSPOSE_OP diff --git a/src/operators/transpose2_op.h b/src/operators/transpose2_op.h new file mode 100644 index 0000000000000000000000000000000000000000..f1339cc59e0c71a232eddd5dcef47f62994b80da --- /dev/null +++ b/src/operators/transpose2_op.h @@ -0,0 +1,53 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#ifdef TRANSPOSE2_OP + +#pragma once + +#include + +#include "framework/operator.h" +#include "operators/kernel/transpose2_kernel.h" +#include "operators/op_param.h" + +namespace paddle_mobile { +namespace operators { + +using paddle_mobile::framework::Tensor; + +template +class Transpose2Op : public framework::OperatorWithKernel< + DeviceType, Transpose2Param, + operators::Transpose2Kernel> { + public: + Transpose2Op(const std::string &type, const VariableNameMap &inputs, + const VariableNameMap &outputs, + const framework::AttributeMap &attrs, + std::shared_ptr scope) + : framework::OperatorWithKernel< + DeviceType, Transpose2Param, + operators::Transpose2Kernel>(type, inputs, outputs, + attrs, scope) {} + + using framework::OperatorWithKernel< + DeviceType, Transpose2Param, + operators::Transpose2Kernel>::OperatorWithKernel; + void InferShape() const override; +}; + +} // namespace operators +} // namespace paddle_mobile + +#endif diff --git a/src/operators/transpose_op.h b/src/operators/transpose_op.h index 7e5f72058d4e06f5b5b1fef81ade0350ea78f21c..eb98ce235491632aa1149acc158552955c2c1e0c 100644 --- a/src/operators/transpose_op.h +++ b/src/operators/transpose_op.h @@ -40,10 +40,6 @@ class TransposeOp : public framework::OperatorWithKernel< DeviceType, TransposeParam, operators::TransposeKernel>(type, inputs, outputs, attrs, scope) {} - - using framework::OperatorWithKernel< - DeviceType, TransposeParam, - operators::TransposeKernel>::OperatorWithKernel; void InferShape() const override; }; diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt index 
8f24392f7a7acf8dd7529619c4e950dd3598f1d5..38dc540e206ade4adb1427bf2121475217b2d730 100644 --- a/test/CMakeLists.txt +++ b/test/CMakeLists.txt @@ -62,15 +62,21 @@ if (CON GREATER -1) endif () -list(FIND NET "FPGAnets" CON) +list(FIND NET "FPGA_NET_V1" CON) if (CON GREATER -1) ADD_EXECUTABLE(test-resnet50 fpga/test_resnet50.cpp test_helper.h test_include.h executor_for_test.h) target_link_libraries(test-resnet50 paddle-mobile) - -# ADD_EXECUTABLE(test-resnet net/test_resnet.cpp test_helper.h test_include.h executor_for_test.h) -# target_link_libraries(test-resnet paddle-mobile) set(FOUND_MATCH ON) +endif () +list(FIND NET "FPGA_NET_V2" CON) +if (CON GREATER -1) + ADD_EXECUTABLE(test-resnet50 fpga/test_resnet50.cpp test_helper.h test_include.h executor_for_test.h) + target_link_libraries(test-resnet50 paddle-mobile) + + ADD_EXECUTABLE(test-pe fpga/test_pe.cpp) + target_link_libraries(test-pe paddle-mobile) + set(FOUND_MATCH ON) endif () list(FIND NET "mobilenetssd" CON) @@ -184,6 +190,10 @@ if (NOT FOUND_MATCH) ADD_EXECUTABLE(test-transpose-op operators/test_transpose_op.cpp test_helper.h test_include.h) target_link_libraries(test-transpose-op paddle-mobile) + # gen test + ADD_EXECUTABLE(test-transpose2-op operators/test_transpose2_op.cpp test_helper.h test_include.h) + target_link_libraries(test-transpose2-op paddle-mobile) + # gen test ADD_EXECUTABLE(test-multiclassnms-op operators/test_multiclass_nms_op.cpp test_helper.h test_include.h) target_link_libraries(test-multiclassnms-op paddle-mobile) @@ -200,6 +210,10 @@ if (NOT FOUND_MATCH) ADD_EXECUTABLE(test-reshape-op operators/test_reshape_op.cpp test_helper.h test_include.h) target_link_libraries(test-reshape-op paddle-mobile) + # gen test + ADD_EXECUTABLE(test-reshape2-op operators/test_reshape2_op.cpp test_helper.h test_include.h) + target_link_libraries(test-reshape2-op paddle-mobile) + # gen test ADD_EXECUTABLE(test-relu-op operators/test_relu_op.cpp test_helper.h test_include.h) 
target_link_libraries(test-relu-op paddle-mobile) @@ -330,6 +344,14 @@ if (NOT FOUND_MATCH) ADD_EXECUTABLE(test-fssd net/test_mobilenet_025_fssd.cpp test_helper.h test_include.h) target_link_libraries(test-fssd paddle-mobile) + # gen test + ADD_EXECUTABLE(test-mobilenetgpu net/test_mobilenet_GPU.cpp test_helper.h test_include.h) + target_link_libraries(test-mobilenetgpu paddle-mobile) + + # gen test + ADD_EXECUTABLE(test-yologpu net/test_yologpu.cpp test_helper.h test_include.h executor_for_test.h) + target_link_libraries(test-yologpu paddle-mobile) + # gen test ADD_EXECUTABLE(test-multi-process net/test_multi_inference_predict.cpp test_helper.h test_include.h) target_link_libraries(test-multi-process paddle-mobile) @@ -338,5 +360,9 @@ if (NOT FOUND_MATCH) ADD_EXECUTABLE(test-benchmark net/test_benchmark.cpp) target_link_libraries(test-benchmark paddle-mobile) + # gen test + ADD_EXECUTABLE(test-eng net/test_eng.cpp test_helper.h test_include.h) + target_link_libraries(test-eng paddle-mobile) + #add_library(test-lib-size SHARED common/test_lib_size.h common/test_lib_size.cpp) endif () diff --git a/test/executor_for_test.h b/test/executor_for_test.h index 60f1856bb9294c6f9b4bd5cfb7d44f984c6f0794..970eff2400a1806c4db96cb6112c4d64dfc7eb3b 100644 --- a/test/executor_for_test.h +++ b/test/executor_for_test.h @@ -18,8 +18,8 @@ limitations under the License. */ #include #include "common/log.h" +#include "framework/executor.h" #include "framework/op_registry.h" -#include "io/executor.h" #include "operators/conv_op.h" #include "operators/elementwise_add_op.h" #include "operators/pool_op.h" @@ -29,9 +29,9 @@ limitations under the License. 
*/ #include "operators/softmax_op.h" #include "operators/transpose_op.h" -using paddle_mobile::Executor; using paddle_mobile::framework::BlockDesc; using paddle_mobile::framework::DDim; +using paddle_mobile::framework::Executor; using paddle_mobile::framework::LoDTensor; using paddle_mobile::framework::OpDesc; using paddle_mobile::framework::Program; diff --git a/test/fpga/test_concat_op.cpp b/test/fpga/test_concat_op.cpp index 5d1a5828b36b3d9ed371a271af6db82657ff1596..44b9f4971bbd5cc69e1f663ae71e27e69c31a04b 100644 --- a/test/fpga/test_concat_op.cpp +++ b/test/fpga/test_concat_op.cpp @@ -16,7 +16,7 @@ limitations under the License. */ #include "operators/concat_op.h" int main() { - paddle_mobile::Loader loader; + paddle_mobile::framework::Loader loader; auto program = loader.Load(g_googlenet); PADDLE_MOBILE_ENFORCE(program.originProgram != nullptr, "program file read fail"); diff --git a/test/fpga/test_pe.cpp b/test/fpga/test_pe.cpp new file mode 100644 index 0000000000000000000000000000000000000000..f5f2708b9e628af80433be4e7ccbb205d3fcd6f6 --- /dev/null +++ b/test/fpga/test_pe.cpp @@ -0,0 +1,111 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#ifdef PADDLE_MOBILE_FPGA_V2 +#include "fpga/V2/api.h" +#include "fpga/V2/filter.h" + +namespace fpga = paddle_mobile::fpga; + +static const uint32_t N = 64; +static const uint32_t C = 3; +static const uint32_t H = 224; +static const uint32_t W = 224; +static const uint32_t G = 1; + +fpga::DataType input_type = fpga::DATA_TYPE_FP32; +fpga::DataType output_type = fpga::DATA_TYPE_FP16; + +void* ifm = nullptr; +void* ofm = nullptr; +void* filter = nullptr; +void* ifm_scale = nullptr; +void* ofm_scale = nullptr; +void* filter_scale = nullptr; + +int ifm_size = 0, ofm_size = 0; + +void format_data() { + ifm_scale = fpga::fpga_malloc(8); + ofm_scale = fpga::fpga_malloc(8); + int ifm_channel = fpga::filter::calc_aligned_channel(C); + int ofm_channel = fpga::filter::calc_aligned_channel(N); + int num = fpga::filter::calc_aligned_num(N, C); + DLOG << "ifm_channel = " << ifm_channel; + DLOG << "ofm_channel = " << ofm_channel; + DLOG << "aligned_num = " << num; + ifm_size = ifm_channel * H * W; + ofm_size = ofm_channel * H * W; + ifm = fpga::fpga_malloc(ifm_size * sizeof(float)); + ofm = fpga::fpga_malloc(ofm_size * sizeof(int16_t)); + memset(ifm, 0, ifm_size * sizeof(float)); + memset(ofm, 0, ofm_size * sizeof(int16_t)); + + for (int h = 0; h < H; h++) { + for (int w = 0; w < W; w++) { + for (int c = 0; c < C; c++) { + int index = h * W * ifm_channel + w * ifm_channel + c; + (reinterpret_cast(ifm))[index] = h + w + c * 0.1f; + // DLOG << index << ":" << ((float *) ifm)[index]; + } + } + } + fpga::fpga_flush(ifm, ifm_size * sizeof(float)); + fpga::fpga_flush(ofm, ofm_size * sizeof(int16_t)); +} + +void print_fp16(int16_t* ptr, int total_size, int num) { + fpga::fpga_invalidate(ptr, total_size * sizeof(int16_t)); + int stride = total_size / num; + for (int i = 0; i < total_size; i += stride) { + DLOG << fpga::fp16_2_fp32(ptr[i]); + } +} + +void print_fp32(float* ptr, int total_size, int num) { + fpga::fpga_invalidate(ptr, total_size * sizeof(float)); + int stride = 
total_size / num; + for (int i = 0; i < total_size; i += stride) { + DLOG << ptr[i]; + } +} + +void test_bypass() { + fpga::BypassArgs args; + args.input_data_type = input_type; + args.output_data_type = output_type; + args.image.address = ifm; + args.image.height = H; + args.image.width = W; + args.image.channels = C; + args.image.scale_address = reinterpret_cast(ifm_scale); + args.output.address = ofm; + args.output.scale_address = reinterpret_cast(ofm_scale); + fpga::PerformBypass(args); +} + +int main() { + paddle_mobile::fpga::open_device(); + format_data(); + DLOG << "format data done"; + print_fp32(reinterpret_cast(ifm), ifm_size, 200); + DLOG << "print input done"; + test_bypass(); + DLOG << "test done"; + print_fp16(reinterpret_cast(ofm), ifm_size, 200); + std::cout << "Computation done" << std::endl; + return 0; +} + +#endif diff --git a/test/fpga/test_resnet50.cpp b/test/fpga/test_resnet50.cpp index 8a6a9dc8af836010695c6c6dc30e81ba224c7ffd..4d05328179fa2acc771e08a6dfddea4f770d9780 100644 --- a/test/fpga/test_resnet50.cpp +++ b/test/fpga/test_resnet50.cpp @@ -13,7 +13,14 @@ See the License for the specific language governing permissions and limitations under the License. */ #include #include "../test_include.h" -#include "fpga/api.h" + +#ifdef PADDLE_MOBILE_FPGA_V1 +#include "fpga/V1/api.h" +#endif +#ifdef PADDLE_MOBILE_FPGA_V2 +#include "fpga/V2/api.h" +#endif + void readStream(std::string filename, float *buf) { std::ifstream in; in.open(filename, std::ios::in); diff --git a/test/framework/test_load.cpp b/test/framework/test_load.cpp index 64fa42658be6b39fabe9bb26296a426949d31197..3d1b6af935b2f3e7f0c60f5c0cbbcc696f6aeba2 100644 --- a/test/framework/test_load.cpp +++ b/test/framework/test_load.cpp @@ -12,21 +12,22 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ +#include #include #include "../test_helper.h" -#include "io/loader.h" +#include "framework/loader.h" int main() { - paddle_mobile::Loader loader; + paddle_mobile::framework::Loader loader; // ../../../test/models/googlenet // ../../../test/models/mobilenet - // auto program = loader.Load(g_googlenet, true); // auto program = loader.Load(g_mobilenet_ssd, true); - auto program = loader.Load(std::string(g_ocr) + "/model", - std::string(g_ocr) + "/params", false); + // auto program = loader.Load(std::string(g_ocr) + "/model", + // std::string(g_ocr) + "/params", false); // program.originProgram->Description("program desc: "); + return 0; } diff --git a/test/framework/test_load_memory_inference_api.cpp b/test/framework/test_load_memory_inference_api.cpp index 05d51910172547c6dab7adc8231663be55c916bf..5b2773f8f1a21c3b9253b34fc5c18cd64ece27e7 100644 --- a/test/framework/test_load_memory_inference_api.cpp +++ b/test/framework/test_load_memory_inference_api.cpp @@ -55,11 +55,11 @@ static char *Get_binary_data(std::string filename) { paddle_mobile::PaddleMobileConfig GetConfig() { paddle_mobile::PaddleMobileConfig config; config.precision = paddle_mobile::PaddleMobileConfig::FP32; - config.device = paddle_mobile::PaddleMobileConfig::kCPU; + config.device = paddle_mobile::PaddleMobileConfig::kGPU_CL; const std::shared_ptr &memory_pack = std::make_shared(); - auto model_path = std::string(g_genet_combine) + "/model"; - auto params_path = std::string(g_genet_combine) + "/params"; + auto model_path = std::string(g_mobilenet_combined) + "/model"; + auto params_path = std::string(g_mobilenet_combined) + "/params"; memory_pack->model_size = ReadBuffer(model_path.c_str(), &memory_pack->model_buf); std::cout << "sizeBuf: " << memory_pack->model_size << std::endl; diff --git a/test/framework/test_optimize.cpp b/test/framework/test_optimize.cpp index 3cae963eca048da221d69c4c336dd4fdfecbb584..0392020789096e921865afed0b0fc51fa5999c6b 100644 --- a/test/framework/test_optimize.cpp +++ 
b/test/framework/test_optimize.cpp @@ -13,12 +13,12 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "../test_helper.h" +#include "framework/loader.h" #include "framework/program/program-optimize/node.h" #include "framework/program/program-optimize/program_optimize.h" -#include "io/loader.h" int main() { - paddle_mobile::Loader loader; + paddle_mobile::framework::Loader loader; // "../../../test/models/googlenet" auto program = loader.Load(g_mobilenet_ssd, true); paddle_mobile::framework::ProgramOptimize optimize; diff --git a/test/net/test_eng.cpp b/test/net/test_eng.cpp new file mode 100644 index 0000000000000000000000000000000000000000..b99a6c927a44ca4032b352731b3971b63cf26b4f --- /dev/null +++ b/test/net/test_eng.cpp @@ -0,0 +1,50 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include +#include "../test_helper.h" +#include "../test_include.h" + +int main() { +#ifdef PADDLE_MOBILE_CPU + paddle_mobile::PaddleMobile paddle_mobile; +#endif + // paddle_mobile.SetThreadNum(4); + auto time1 = time(); + if (paddle_mobile.Load(std::string(g_eng) + "/model", + std::string(g_eng) + "/params", true, false, 1, + true)) { + auto time2 = time(); + std::cout << "load cost :" << time_diff(time1, time1) << "ms" << std::endl; + std::vector dims{1, 1, 48, 400}; + LoDTensor input_tensor; + SetupTensor(&input_tensor, {1, 1, 48, 400}, static_cast(0), + static_cast(1)); + + std::vector input(input_tensor.data(), + input_tensor.data() + input_tensor.numel()); + // 预热十次 + for (int i = 0; i < 1; ++i) { + paddle_mobile.PredictLod(input_tensor); + } + auto time3 = time(); + for (int i = 0; i < 1; ++i) { + paddle_mobile.PredictLod(input_tensor); + } + auto time4 = time(); + std::cout << "predict cost :" << time_diff(time3, time4) << "ms" + << std::endl; + } + return 0; +} diff --git a/test/net/test_googlenet.cpp b/test/net/test_googlenet.cpp index a61df31e39c653e346c467c6ca17d5df3e08673e..c3379df609fc1e18b8c3545e25849f8a7ff0461b 100644 --- a/test/net/test_googlenet.cpp +++ b/test/net/test_googlenet.cpp @@ -28,8 +28,9 @@ int main() { bool optimize = true; auto time1 = time(); if (paddle_mobile.Load(g_googlenet, optimize)) { - auto time2 = time(); - std::cout << "load cost: " << time_diff(time1, time2) << "ms\n"; + auto time2 = paddle_mobile::time(); + std::cout << "load cost :" << paddle_mobile::time_diff(time1, time2) << "ms" + << std::endl; std::vector input; std::vector output; std::vector dims{1, 3, 224, 224}; diff --git a/test/net/test_mobilenet.cpp b/test/net/test_mobilenet.cpp index 4ed7d3b756cfef9554028e1d33f4dd86bf58e4b8..5cce53e866df0530d6c8e1f35bc7159ba6e5ba9b 100644 --- a/test/net/test_mobilenet.cpp +++ b/test/net/test_mobilenet.cpp @@ -19,14 +19,15 @@ limitations under the License. 
*/ int main() { paddle_mobile::PaddleMobile paddle_mobile; paddle_mobile.SetThreadNum(4); - auto time1 = time(); + auto time1 = paddle_mobile::time(); // auto isok = paddle_mobile.Load(std::string(g_mobilenet_detect) + "/model", // std::string(g_mobilenet_detect) + "/params", true); auto isok = paddle_mobile.Load(g_mobilenet, true); if (isok) { - auto time2 = time(); - std::cout << "load cost :" << time_diff(time1, time1) << "ms" << std::endl; + auto time2 = paddle_mobile::time(); + std::cout << "load cost :" << paddle_mobile::time_diff(time1, time1) << "ms" + << std::endl; std::vector input; std::vector dims{1, 3, 224, 224}; @@ -42,14 +43,14 @@ int main() { for (int i = 0; i < 10; ++i) { auto vec_result = paddle_mobile.Predict(input, dims); } - auto time3 = time(); + auto time3 = paddle_mobile::time(); for (int i = 0; i < 10; ++i) { auto vec_result = paddle_mobile.Predict(input, dims); } DLOG << vec_result; - auto time4 = time(); - std::cout << "predict cost :" << time_diff(time3, time4) / 10 << "ms" - << std::endl; + auto time4 = paddle_mobile::time(); + std::cout << "predict cost :" << paddle_mobile::time_diff(time3, time4) / 10 + << "ms" << std::endl; } std::cout << "如果结果Nan请查看: test/images/g_test_image_1x3x224x224_banana " diff --git a/test/net/test_mobilenet_GPU.cpp b/test/net/test_mobilenet_GPU.cpp new file mode 100644 index 0000000000000000000000000000000000000000..e410baf77616584995f1e3687b47ca0af337a231 --- /dev/null +++ b/test/net/test_mobilenet_GPU.cpp @@ -0,0 +1,64 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include +#include "../../src/common/types.h" +#include "../test_helper.h" +#include "../test_include.h" + +int main() { + paddle_mobile::PaddleMobile paddle_mobile; + // paddle_mobile.SetThreadNum(4); + auto time1 = paddle_mobile::time(); +#ifdef PADDLE_MOBILE_CL + paddle_mobile.SetCLPath("/data/local/tmp/bin"); +#endif + + auto isok = + paddle_mobile.Load(std::string(g_mobilenet_mul) + "/model", + std::string(g_mobilenet_mul) + "/params", true); + + // auto isok = paddle_mobile.Load(std::string(g_mobilenet_mul), true); + if (isok) { + auto time2 = paddle_mobile::time(); + std::cout << "load cost :" << paddle_mobile::time_diff(time1, time2) << "ms" + << std::endl; + + std::vector input; + std::vector dims{1, 3, 224, 224}; + GetInput(g_test_image_1x3x224x224_banana, &input, dims); + + std::vector vec_result = paddle_mobile.Predict(input, dims); + + auto time3 = paddle_mobile::time(); + int max = 10; + for (int i = 0; i < max; ++i) { + vec_result = paddle_mobile.Predict(input, dims); + } + auto time4 = paddle_mobile::time(); + + std::cout << "predict cost :" + << paddle_mobile::time_diff(time3, time4) / max << "ms" + << std::endl; + std::vector::iterator biggest = + std::max_element(std::begin(vec_result), std::end(vec_result)); + std::cout << " Max element is " << *biggest << " at position " + << std::distance(std::begin(vec_result), biggest) << std::endl; + } + + std::cout << "如果结果Nan请查看: test/images/g_test_image_1x3x224x224_banana " + "是否存在?" 
+ << std::endl; + return 0; +} diff --git a/test/net/test_yologpu.cpp b/test/net/test_yologpu.cpp new file mode 100644 index 0000000000000000000000000000000000000000..0215ded59e5f74f0c103d4b51abe06b487bd50ab --- /dev/null +++ b/test/net/test_yologpu.cpp @@ -0,0 +1,189 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include +#include +#include "../../src/common/types.h" +#include "../../src/io/paddle_test_inference_api.h" +#include "../test_helper.h" +#include "../test_include.h" +void t1() { + paddle_mobile::PaddleMobile paddle_mobile_gpu; + paddle_mobile::PaddleMobile paddle_mobile_cpu; + paddle_mobile::PaddleTester paddle_test_cpu; + paddle_mobile::PaddleTester paddle_test_gpu; + printf("cpu time:%f\n", paddle_test_cpu.CaculatePredictTime()); + std::string path = "/data/local/tmp/bin"; + printf("gpu time:%f\n", paddle_test_gpu.CaculatePredictTime(&path)); + // paddle_mobile.SetThreadNum(4); +#ifdef PADDLE_MOBILE_CL + paddle_mobile_gpu.SetCLPath("/data/local/tmp/bin"); +#endif + auto time1 = paddle_mobile::time(); + auto isok = paddle_mobile_gpu.Load(std::string(g_yolo_mul) + "/model", + std::string(g_yolo_mul) + "/params", true); + + // auto isok = paddle_mobile.Load(std::string(g_yolo_mul), true); + if (isok) { + auto time2 = paddle_mobile::time(); + std::cout << "load cost :" << paddle_mobile::time_diff(time1, time2) << "ms" + << std::endl; + + std::vector input; + std::vector dims{1, 3, 416, 416}; + 
GetInput(g_yolo_img, &input, dims); + + std::vector vec_result; + // = paddle_mobile.Predict(input, dims); + + auto time3 = paddle_mobile::time(); + int max = 10; + for (int i = 0; i < max; ++i) { + vec_result = paddle_mobile_gpu.Predict(input, dims); + } + auto time4 = paddle_mobile::time(); + + // auto time3 = paddle_mobile::time(); + + // for (int i = 0; i < 10; ++i) { + // auto vec_result = paddle_mobile.Predict(input, dims); + // } + + // auto time4 = paddle_mobile::time(); + + std::cout << "predict cost :" + << paddle_mobile::time_diff(time3, time4) / max << "ms" + << std::endl; + std::vector::iterator biggest = + std::max_element(std::begin(vec_result), std::end(vec_result)); + std::cout << " Max element is " << *biggest << " at position " + << std::distance(std::begin(vec_result), biggest) << std::endl; + // for (float i : vec_result) { + // std::cout << i << std::endl; + // } + } +} + +void t2() { + paddle_mobile::PaddleMobile paddle_mobile; + // paddle_mobile.SetThreadNum(4); +#ifdef PADDLE_MOBILE_CL + paddle_mobile.SetCLPath("/data/local/tmp/bin"); +#endif + auto time1 = paddle_mobile::time(); + auto isok = paddle_mobile.Load(std::string(g_yolo_mul) + "/model", + std::string(g_yolo_mul) + "/params", true); + + // auto isok = paddle_mobile.Load(std::string(g_yolo_mul), true); + if (isok) { + auto time2 = paddle_mobile::time(); + std::cout << "load cost :" << paddle_mobile::time_diff(time1, time2) << "ms" + << std::endl; + + std::vector input; + std::vector dims{1, 3, 416, 416}; + GetInput(g_yolo_img, &input, dims); + + std::vector vec_result; + // = paddle_mobile.Predict(input, dims); + + auto time3 = paddle_mobile::time(); + int max = 10; + for (int i = 0; i < max; ++i) { + vec_result = paddle_mobile.Predict(input, dims); + } + auto time4 = paddle_mobile::time(); + + // auto time3 = paddle_mobile::time(); + + // for (int i = 0; i < 10; ++i) { + // auto vec_result = paddle_mobile.Predict(input, dims); + // } + + // auto time4 = paddle_mobile::time(); + + 
std::cout << "predict cost :" + << paddle_mobile::time_diff(time3, time4) / max << "ms" + << std::endl; + std::vector::iterator biggest = + std::max_element(std::begin(vec_result), std::end(vec_result)); + std::cout << " Max element is " << *biggest << " at position " + << std::distance(std::begin(vec_result), biggest) << std::endl; + // for (float i : vec_result) { + // std::cout << i << std::endl; + // } + } +} + +void t3() { + paddle_mobile::PaddleMobile paddle_mobile; + // paddle_mobile.SetThreadNum(4); + //#ifdef PADDLE_MOBILE_CL + // paddle_mobile.SetCLPath("/data/local/tmp/bin"); + //#endif + auto time1 = paddle_mobile::time(); + auto isok = paddle_mobile.Load(std::string(g_yolo_mul) + "/model", + std::string(g_yolo_mul) + "/params", true); + + // auto isok = paddle_mobile.Load(std::string(g_yolo_mul), true); + if (isok) { + auto time2 = paddle_mobile::time(); + std::cout << "load cost :" << paddle_mobile::time_diff(time1, time2) << "ms" + << std::endl; + + std::vector input; + std::vector dims{1, 3, 416, 416}; + GetInput(g_yolo_img, &input, dims); + + std::vector vec_result = paddle_mobile.Predict(input, dims); + + auto time3 = paddle_mobile::time(); + int max = 10; + for (int i = 0; i < max; ++i) { + vec_result = paddle_mobile.Predict(input, dims); + } + auto time4 = paddle_mobile::time(); + + // auto time3 = paddle_mobile::time(); + + // for (int i = 0; i < 10; ++i) { + // auto vec_result = paddle_mobile.Predict(input, dims); + // } + + // auto time4 = paddle_mobile::time(); + + std::cout << "predict cost :" + << paddle_mobile::time_diff(time3, time4) / max << "ms" + << std::endl; + std::vector::iterator biggest = + std::max_element(std::begin(vec_result), std::end(vec_result)); + std::cout << " Max element is " << *biggest << " at position " + << std::distance(std::begin(vec_result), biggest) << std::endl; + // for (float i : vec_result) { + // std::cout << i << std::endl; + // } + } +} + +int main() { + // std::thread th1(t1); + // std::thread th2(t2); 
+ // std::thread th3(t3); + std::thread th1(t1); + // th1.join(); + // th2.join(); + // th3.join(); + th1.join(); + return 0; +} diff --git a/test/operators/test_batchnorm_op.cpp b/test/operators/test_batchnorm_op.cpp index 4ccad8c1512036c2400a09575b3775e75b26acce..c027d4bd31d5ff41f42e9cd333618f8630aad5d9 100644 --- a/test/operators/test_batchnorm_op.cpp +++ b/test/operators/test_batchnorm_op.cpp @@ -12,8 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#pragma once - #include "../test_helper.h" #include "../test_include.h" #include "operators/batchnorm_op.h" @@ -127,7 +125,7 @@ template class TestBatchNormOp; int main() { DLOG << "----------**********----------"; DLOG << "begin to run BatchNormOp Test"; - paddle_mobile::Loader loader; + paddle_mobile::framework::Loader loader; auto program = loader.Load(std::string(g_mobilenet_ssd)); /// input x (4,10,2,2) diff --git a/test/operators/test_box_coder_op.cpp b/test/operators/test_box_coder_op.cpp index 92cba3995c866c67c00491ad5cc38fb094594ad3..721e691107c2c2d0117fdedecf219484556c9541 100644 --- a/test/operators/test_box_coder_op.cpp +++ b/test/operators/test_box_coder_op.cpp @@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ -#pragma once #include "../test_include.h" #include "operators/box_coder_op.h" @@ -115,7 +114,7 @@ template class TestBoxCoderOp; int main() { DLOG << "----------**********----------"; DLOG << "begin to run BoxCoderOp Test"; - paddle_mobile::Loader loader; + paddle_mobile::framework::Loader loader; auto program = loader.Load(std::string(g_mobilenet_ssd)); paddle_mobile::framework::Tensor priorbox; diff --git a/test/operators/test_concat_op.cpp b/test/operators/test_concat_op.cpp index edaa4ce1ddba251886c90262895333b0a56c3a07..1a347a9c37a96f3c31506d0b45f95e05b64292ff 100644 --- a/test/operators/test_concat_op.cpp +++ b/test/operators/test_concat_op.cpp @@ -16,7 +16,7 @@ limitations under the License. */ #include "operators/concat_op.h" int main() { - paddle_mobile::Loader loader; + paddle_mobile::framework::Loader loader; auto program = loader.Load(g_googlenet); PADDLE_MOBILE_ENFORCE(program.originProgram != nullptr, "program file read fail"); diff --git a/test/operators/test_conv_add_relu_op.cpp b/test/operators/test_conv_add_relu_op.cpp index 987f52cd62f91b3bc00cc1ef49bd21913e288d75..f170719218b98d341985a61ca6160884afe4ad3b 100644 --- a/test/operators/test_conv_add_relu_op.cpp +++ b/test/operators/test_conv_add_relu_op.cpp @@ -16,7 +16,7 @@ limitations under the License. */ #include "operators/fusion_conv_add_relu_op.h" int main() { - paddle_mobile::Loader loader; + paddle_mobile::framework::Loader loader; // ../models/image_classification_resnet.inference.model auto program = loader.Load(g_googlenet, true); diff --git a/test/operators/test_depthwise_conv_op.cpp b/test/operators/test_depthwise_conv_op.cpp index bd2aad19eda896bad3da8a47f5b70b1a923dc1a7..77c76eedc5690412dfee95dd11e8a3fe9ed6ecbe 100644 --- a/test/operators/test_depthwise_conv_op.cpp +++ b/test/operators/test_depthwise_conv_op.cpp @@ -16,7 +16,7 @@ limitations under the License. 
*/ #include "operators/depthwise_conv_op.h" int main() { - paddle_mobile::Loader loader; + paddle_mobile::framework::Loader loader; // ../models/image_classification_resnet.inference.model auto program = loader.Load(g_mobilenet_ssd); diff --git a/test/operators/test_elementwise_add_op.cpp b/test/operators/test_elementwise_add_op.cpp index 0a5e9f7e92701e748df51078b21eb46eec90599d..3922b216cfc6ecf55be251ded02c0c064e2c3ffc 100644 --- a/test/operators/test_elementwise_add_op.cpp +++ b/test/operators/test_elementwise_add_op.cpp @@ -15,7 +15,7 @@ limitations under the License. */ #include "../test_include.h" int main() { - paddle_mobile::Loader loader; + paddle_mobile::framework::Loader loader; auto program = loader.Load(g_resnet); PADDLE_MOBILE_ENFORCE(program.originProgram != nullptr, "program file read fail"); diff --git a/test/operators/test_elementwise_sub_op.cpp b/test/operators/test_elementwise_sub_op.cpp index cfac83eff7a012d52d47f96e088bd8519603cadc..e1030852976a68db827ebb7629caf8bb199a2456 100644 --- a/test/operators/test_elementwise_sub_op.cpp +++ b/test/operators/test_elementwise_sub_op.cpp @@ -12,8 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ -#pragma once - #include "../test_helper.h" #include "../test_include.h" #include "operators/elementwise_sub_op.h" @@ -106,7 +104,7 @@ template class TestElementwiseSubOp; int main() { DLOG << "----------**********----------"; DLOG << "begin to run ElementwiseSub Test"; - paddle_mobile::Loader loader; + paddle_mobile::framework::Loader loader; auto program = loader.Load(std::string(g_ocr) + "/model", std::string(g_ocr) + "/params"); diff --git a/test/operators/test_fill_constant_op.cpp b/test/operators/test_fill_constant_op.cpp index b099217d1641eb221b3d0d86d780fb6ecfa929bd..9dc7bb13884efb8860a6670e088bd5af67c1f0ea 100644 --- a/test/operators/test_fill_constant_op.cpp +++ b/test/operators/test_fill_constant_op.cpp @@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#pragma once #include "../test_include.h" #include "operators/fill_constant_op.h" @@ -95,7 +94,7 @@ template class TestFillConstantOp; int main() { DLOG << "----------**********----------"; DLOG << "begin to run FillConstant Test"; - paddle_mobile::Loader loader; + paddle_mobile::framework::Loader loader; auto program = loader.Load(std::string(g_ocr) + "/model", std::string(g_ocr) + "/params"); diff --git a/test/operators/test_fusion_conv_add_bn_relu_op.cpp b/test/operators/test_fusion_conv_add_bn_relu_op.cpp index 7764d95ed72da613459233bd55ddcffdc444318f..347bcb40a6156a576842af34920bde838dd83cd8 100644 --- a/test/operators/test_fusion_conv_add_bn_relu_op.cpp +++ b/test/operators/test_fusion_conv_add_bn_relu_op.cpp @@ -17,7 +17,7 @@ limitations under the License. 
*/ #include "operators/fusion_conv_add_bn_relu_op.h" int main() { - paddle_mobile::Loader loader; + paddle_mobile::framework::Loader loader; // ../models/image_classification_resnet.inference.model auto program = loader.Load(g_mobilenet, true); diff --git a/test/operators/test_fusion_fc_op.cpp b/test/operators/test_fusion_fc_op.cpp index a23bde45cb74f0f75e655821b15e66b1cef4c081..a8ec4883aab4218aa526e7b90267998754d1eb30 100644 --- a/test/operators/test_fusion_fc_op.cpp +++ b/test/operators/test_fusion_fc_op.cpp @@ -12,8 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#pragma once - #include #include "../test_include.h" #include "operators/fusion_fc_op.h" @@ -114,7 +112,7 @@ template class TestFcOp; int main() { DLOG << "----------**********----------"; DLOG << "begin to run Fc Test"; - paddle_mobile::Loader loader; + paddle_mobile::framework::Loader loader; // "../../../test/models/googlenet" auto program = loader.Load(g_googlenet); paddle_mobile::framework::ProgramOptimize optimize; diff --git a/test/operators/test_gru_op.cpp b/test/operators/test_gru_op.cpp index 52ab8b54d709391ea263b74a395a635ce50a18af..f2ce833661bfd1b3d751a7ac2d54cfb70114a6c6 100644 --- a/test/operators/test_gru_op.cpp +++ b/test/operators/test_gru_op.cpp @@ -16,7 +16,7 @@ limitations under the License. 
*/ #include "operators/gru_op.h" int main() { - paddle_mobile::Loader loader; + paddle_mobile::framework::Loader loader; auto program = loader.Load(g_nlp); PADDLE_MOBILE_ENFORCE(program.originProgram != nullptr, "program file read fail"); diff --git a/test/operators/test_im2sequence_op.cpp b/test/operators/test_im2sequence_op.cpp index b45e437e12f95cd9f7050247fc03a152246d8122..3cd172d99bb1bb9c24f035d501dce362476909c2 100644 --- a/test/operators/test_im2sequence_op.cpp +++ b/test/operators/test_im2sequence_op.cpp @@ -12,8 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#pragma once - #include "../test_helper.h" #include "../test_include.h" #include "operators/im2sequence_op.h" @@ -62,7 +60,6 @@ class TestIm2SequenceOp { Variable *x1_feed_value = scope->Var("conv2d_19.tmp_1"); auto tensor_x1 = x1_feed_value->GetMutable(); tensor_x1->ShareDataWith(t1); - Variable *output = scope->Var("im2sequence_0.tmp_0"); auto *output_tensor = output->GetMutable(); output_tensor->mutable_data({2, 12}); @@ -102,7 +99,7 @@ template class TestIm2SequenceOp; int main() { DLOG << "----------**********----------"; DLOG << "begin to run Im2Sequence Test"; - paddle_mobile::Loader loader; + paddle_mobile::framework::Loader loader; auto program = loader.Load(std::string(g_eng) + "/model", std::string(g_eng) + "/params"); diff --git a/test/operators/test_lrn_op.cpp b/test/operators/test_lrn_op.cpp index d4d9f8da802fc0f5f885a3b2e81cba695776c29e..5d1ac9b4dd7225112ace8bfbb13f926502c77b94 100644 --- a/test/operators/test_lrn_op.cpp +++ b/test/operators/test_lrn_op.cpp @@ -16,7 +16,7 @@ limitations under the License. 
*/ #include "operators/lrn_op.h" int main() { - paddle_mobile::Loader loader; + paddle_mobile::framework::Loader loader; auto program = loader.Load(g_googlenet); PADDLE_MOBILE_ENFORCE(program.originProgram != nullptr, "program file read fail"); diff --git a/test/operators/test_multiclass_nms_op.cpp b/test/operators/test_multiclass_nms_op.cpp index d1b98d4965fd182ab1adc480279f38cea53974be..32c2c1f6bd682fdac8d9b81155b8aa044b87232b 100644 --- a/test/operators/test_multiclass_nms_op.cpp +++ b/test/operators/test_multiclass_nms_op.cpp @@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#pragma once #include "../test_include.h" #include "operators/multiclass_nms_op.h" @@ -31,14 +30,12 @@ class TestMultiClassNMSOp { const std::vector> blocks = to_predict_program_->Blocks(); - // DLOG << " **block size " << blocks.size(); for (auto block_desc : blocks) { std::vector> ops = block_desc->Ops(); - // DLOG << " ops " << ops.size(); for (auto op : ops) { if (op->Type() == "multiclass_nms" && op->Input("BBoxes")[0] == "box_coder_0.tmp_0") { - DLOG << " mul attr size: " << op->GetAttrMap().size(); + DLOG << " attr size: " << op->GetAttrMap().size(); DLOG << " inputs size: " << op->GetInputs().size(); DLOG << " outputs size: " << op->GetOutputs().size(); DLOG << " BBoxes is : " << op->Input("BBoxes")[0]; @@ -55,14 +52,6 @@ class TestMultiClassNMSOp { << op->GetAttrMap().at("nms_top_k").Get(); DLOG << " score_threshold : " << op->GetAttrMap().at("score_threshold").Get(); - // DLOG << " variances : " << - // op->GetAttrMap().at("variances").Get>(); - // DLOG << " aspect_ratios : " << - // op->GetAttrMap().at("aspect_ratios").Get>(); - // DLOG << " min_sizes : " << - // op->GetAttrMap().at("min_sizes").Get>(); - // DLOG << " max_sizes : " << - // op->GetAttrMap().at("max_sizes").Get>(); std::shared_ptr> priorbox = std::make_shared>( op->Type(), 
op->GetInputs(), op->GetOutputs(), @@ -88,16 +77,12 @@ class TestMultiClassNMSOp { auto *output_tensor = output->GetMutable(); output_tensor->mutable_data({1917, 6}); - // DLOG << typeid(output_tensor).name(); - // DLOG << "output_tensor dims: " << output_tensor->dims(); - std::shared_ptr out_tensor = std::make_shared(); out_tensor.reset(output_tensor); predict(t1, t2, 0); return out_tensor; - // return outvars_tensor; } private: @@ -126,9 +111,8 @@ template class TestMultiClassNMSOp; int main() { DLOG << "----------**********----------"; DLOG << "begin to run MulticlassNMS Test"; - paddle_mobile::Loader loader; + paddle_mobile::framework::Loader loader; auto program = loader.Load(std::string(g_mobilenet_ssd)); - paddle_mobile::framework::Tensor inputx1; SetupTensor(&inputx1, {1, 2, 4}, static_cast(0), static_cast(1)); diff --git a/test/operators/test_polygon_box_transform_op.cpp b/test/operators/test_polygon_box_transform_op.cpp index a71177ddbd8e4d8b0f204fd6ec9c948882499cbd..2347f06989153b9ce5994fa0e4d09673ab2698f1 100644 --- a/test/operators/test_polygon_box_transform_op.cpp +++ b/test/operators/test_polygon_box_transform_op.cpp @@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ -#pragma once #include "../test_include.h" #include "operators/polygon_box_transform_op.h" @@ -97,7 +96,7 @@ template class TestPolygonBoxTransformOp; int main() { DLOG << "----------**********----------"; DLOG << "begin to run PolygonBoxTransform Test"; - paddle_mobile::Loader loader; + paddle_mobile::framework::Loader loader; auto program = loader.Load(std::string(g_ocr)); paddle_mobile::framework::Tensor input; diff --git a/test/operators/test_pool_op.cpp b/test/operators/test_pool_op.cpp index 2daecd7b4c1a50c612bc784c801208d2e6f31482..09470caf82eb90df56f7aa79b6873c2a6b94fbef 100644 --- a/test/operators/test_pool_op.cpp +++ b/test/operators/test_pool_op.cpp @@ -16,7 +16,7 @@ limitations under the License. */ #include "operators/pool_op.h" int main() { - paddle_mobile::Loader loader; + paddle_mobile::framework::Loader loader; auto program = loader.Load(std::string(g_googlenet)); if (program.originProgram == nullptr) { DLOG << "program read file"; diff --git a/test/operators/test_prelu_op.cpp b/test/operators/test_prelu_op.cpp index e93d8732d18496721b24cfba1df296250169f8b2..f98c9904ae3799cb863142b0fcb332c74c91ba98 100644 --- a/test/operators/test_prelu_op.cpp +++ b/test/operators/test_prelu_op.cpp @@ -17,7 +17,7 @@ limitations under the License. */ #include "operators/prelu_op.h" int main() { - paddle_mobile::Loader loader; + paddle_mobile::framework::Loader loader; auto program = loader.Load(g_resnet); PADDLE_MOBILE_ENFORCE(program.originProgram != nullptr, "program file read fail"); diff --git a/test/operators/test_prior_box_op.cpp b/test/operators/test_prior_box_op.cpp index 8c697a9a7982f05b71caa5bb5f4d12e50dc9d418..424f2443f8627002cff0adc19600f9aba50ad0fb 100644 --- a/test/operators/test_prior_box_op.cpp +++ b/test/operators/test_prior_box_op.cpp @@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ -#pragma once #include "../test_include.h" #include "operators/prior_box_op.h" @@ -126,7 +125,7 @@ template class TestPriorBoxOp; int main() { DLOG << "----------**********----------"; DLOG << "begin to run PriorBoxOp Test"; - paddle_mobile::Loader loader; + paddle_mobile::framework::Loader loader; auto program = loader.Load(std::string(g_mobilenet_ssd)); /// input x (1,3,300,300) diff --git a/test/operators/test_relu_op.cpp b/test/operators/test_relu_op.cpp index fad0d0c30a126cc2730e4aa8b87364eee9fc8209..542d3d18f6a383c1e03962ba845b39c04a51631b 100644 --- a/test/operators/test_relu_op.cpp +++ b/test/operators/test_relu_op.cpp @@ -16,7 +16,7 @@ limitations under the License. */ #include "operators/relu_op.h" int main() { - paddle_mobile::Loader loader; + paddle_mobile::framework::Loader loader; auto program = loader.Load(g_resnet); PADDLE_MOBILE_ENFORCE(program.originProgram != nullptr, "program file read fail"); diff --git a/test/operators/test_reshape2_op.cpp b/test/operators/test_reshape2_op.cpp new file mode 100644 index 0000000000000000000000000000000000000000..d0d51f984a617ea37713e5830adf6b5d248fb434 --- /dev/null +++ b/test/operators/test_reshape2_op.cpp @@ -0,0 +1,142 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "../test_include.h" +#include "operators/reshape2_op.h" + +namespace paddle_mobile { +namespace framework { + +template +class TestReshape2Op { + public: + explicit TestReshape2Op(const Program p) : program_(p) { + if (use_optimize_) { + to_predict_program_ = program_.optimizeProgram; + } else { + to_predict_program_ = program_.originProgram; + } + const std::vector> blocks = + to_predict_program_->Blocks(); + for (auto block_desc : blocks) { + std::vector> ops = block_desc->Ops(); + for (auto op : ops) { + if (op->Type() == "reshape2") { + DLOG << " attr size: " << op->GetAttrMap().size(); + std::unordered_map attrs = op->GetAttrMap(); + for (std::unordered_map::iterator it = + attrs.begin(); + it != attrs.end(); ++it) { + DLOG << " " << it->first << " " << it->second; + } + + DLOG << " inputs size: " << op->GetInputs().size(); + VariableNameMap inputs = op->GetInputs(); + for (VariableNameMap::iterator it = inputs.begin(); + it != inputs.end(); ++it) { + DLOG << " " << it->first << " " << it->second; + } + + DLOG << " outputs size: " << op->GetOutputs().size(); + VariableNameMap outputs = op->GetOutputs(); + for (VariableNameMap::iterator it = outputs.begin(); + it != outputs.end(); ++it) { + DLOG << " " << it->first << " " << it->second; + } + + input_var_name = op->Input("X")[0]; + output_var_name = op->Output("Out")[0]; + std::shared_ptr> op_ptr = + std::make_shared>( + op->Type(), op->GetInputs(), op->GetOutputs(), + op->GetAttrMap(), program_.scope); + ops_of_block_[*block_desc.get()].push_back(op_ptr); + return; + } + } + } + } + + std::shared_ptr predict(const Tensor &t) { + auto scope = program_.scope; + Variable *input_feed_value = scope->Var(input_var_name); + auto tensor_input = input_feed_value->GetMutable(); + tensor_input->ShareDataWith(t); + + Variable *output = scope->Var(output_var_name); + auto *output_tensor = output->GetMutable(); + + std::shared_ptr out_tensor = std::make_shared(); + out_tensor.reset(output_tensor); + + 
predict(t, 0); + + return out_tensor; + } + + private: + const framework::Program program_; + std::shared_ptr to_predict_program_; + std::map>>> + ops_of_block_; + bool use_optimize_ = false; + string input_var_name; + string output_var_name; + + void predict(const Tensor &t, int block_id) { + std::shared_ptr to_predict_block = + to_predict_program_->Block(block_id); + for (int j = 0; j < ops_of_block_[*to_predict_block.get()].size(); ++j) { + auto op = ops_of_block_[*to_predict_block.get()][j]; + op->Run(); + } + } +}; + +template class TestReshape2Op; +} // namespace framework +} // namespace paddle_mobile + +int main() { + DLOG << "----------**********----------"; + DLOG << "begin to run Reshape2 Test"; + paddle_mobile::framework::Loader loader; + auto program = loader.Load(std::string(g_ocr) + "/model", + std::string(g_ocr) + "/params"); + + paddle_mobile::framework::Tensor input; + SetupTensor(&input, {1, 4, 4}, static_cast(0), + static_cast(1)); + auto *input_ptr = input.data(); + for (int i = 0; i < 16; ++i) { + *(input_ptr + i) = i; + } + DLOG << "input : "; + for (int i = 0; i < input.numel(); ++i) { + DLOG << " index " << i << " : " << input_ptr[i]; + } + + paddle_mobile::framework::TestReshape2Op testReshape2Op( + program); + + auto output = testReshape2Op.predict(input); + auto *output_ptr = output->data(); + + DLOG << "output : "; + for (int i = 0; i < output->numel(); ++i) { + DLOG << " index " << i << " : " << output_ptr[i]; + } + return 0; +} diff --git a/test/operators/test_reshape_op.cpp b/test/operators/test_reshape_op.cpp index 3541151d8a1a286527e715f402df381d2efc094c..ff3299f5e818d8169a356323213707417d747dba 100644 --- a/test/operators/test_reshape_op.cpp +++ b/test/operators/test_reshape_op.cpp @@ -16,7 +16,7 @@ limitations under the License. 
*/ #include "operators/reshape_op.h" int main() { - paddle_mobile::Loader loader; + paddle_mobile::framework::Loader loader; auto program = loader.Load(std::string(g_mobilenet_ssd)); if (program.originProgram == nullptr) { DLOG << "program read file"; diff --git a/test/operators/test_resize_op.cpp b/test/operators/test_resize_op.cpp index f4dcaa6885d92a727e8c97d5106c3b6913a4ab33..c452ef8d850f97f6988688c4e47d5041220cb828 100644 --- a/test/operators/test_resize_op.cpp +++ b/test/operators/test_resize_op.cpp @@ -16,7 +16,7 @@ limitations under the License. */ #include "operators/resize_op.h" int main() { - paddle_mobile::Loader loader; + paddle_mobile::framework::Loader loader; auto program = loader.Load(std::string(g_mobilenet_ssd)); if (program.originProgram == nullptr) { DLOG << "program read file"; diff --git a/test/operators/test_sigmoid_op.cpp b/test/operators/test_sigmoid_op.cpp index 739c594ad7044025eaa3637d8669c43f1c6c6348..df93da1529ae1e03561643ebeef4cb821f10d211 100644 --- a/test/operators/test_sigmoid_op.cpp +++ b/test/operators/test_sigmoid_op.cpp @@ -15,7 +15,7 @@ limitations under the License. */ #include "../../src/operators/kernel/central-arm-func/sigmoid_arm_func.h" #include "../../src/operators/kernel/sigmoid_kernel.h" #include "../test_helper.h" -#include "io/executor.h" +#include "framework/executor.h" int main() { paddle_mobile::framework::Tensor input; diff --git a/test/operators/test_softmax_op.cpp b/test/operators/test_softmax_op.cpp index a0184729a8bc5e6b0ba952923eecd5242cfe36d4..f31bcb4e455a6b9699cf96271310681e51d4c6a7 100644 --- a/test/operators/test_softmax_op.cpp +++ b/test/operators/test_softmax_op.cpp @@ -17,7 +17,7 @@ limitations under the License. 
*/ #include "operators/softmax_op.h" int main() { - paddle_mobile::Loader loader; + paddle_mobile::framework::Loader loader; auto program = loader.Load(std::string(g_mobilenet)); if (program.originProgram == nullptr) { DLOG << "program read file"; diff --git a/test/operators/test_sum_op.cpp b/test/operators/test_sum_op.cpp index e51d1cff5e99c5d9c444db046e78eee6a03f9243..9cabf1212525a7d4d6f36c45f81cba438694843d 100644 --- a/test/operators/test_sum_op.cpp +++ b/test/operators/test_sum_op.cpp @@ -12,8 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#pragma once - #include "../test_helper.h" #include "../test_include.h" #include "operators/sum_op.h" @@ -105,7 +103,7 @@ template class TestSumOp; int main() { DLOG << "----------**********----------"; DLOG << "begin to run Sum Test"; - paddle_mobile::Loader loader; + paddle_mobile::framework::Loader loader; auto program = loader.Load(std::string(g_eng) + "/model", std::string(g_eng) + "/params"); diff --git a/test/operators/test_transpose2_op.cpp b/test/operators/test_transpose2_op.cpp new file mode 100644 index 0000000000000000000000000000000000000000..5da0faaf119c553e2fb019de76bb40f875f9d673 --- /dev/null +++ b/test/operators/test_transpose2_op.cpp @@ -0,0 +1,143 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "../test_include.h" +#include "operators/transpose2_op.h" + +namespace paddle_mobile { +namespace framework { + +template +class TestTranspose2Op { + public: + explicit TestTranspose2Op(const Program p) : program_(p) { + if (use_optimize_) { + to_predict_program_ = program_.optimizeProgram; + } else { + to_predict_program_ = program_.originProgram; + } + const std::vector> blocks = + to_predict_program_->Blocks(); + for (auto block_desc : blocks) { + std::vector> ops = block_desc->Ops(); + for (auto op : ops) { + if (op->Type() == "transpose2") { + DLOG << " attr size: " << op->GetAttrMap().size(); + std::unordered_map attrs = op->GetAttrMap(); + for (std::unordered_map::iterator it = + attrs.begin(); + it != attrs.end(); ++it) { + DLOG << " " << it->first << " " << it->second; + } + + DLOG << " inputs size: " << op->GetInputs().size(); + VariableNameMap inputs = op->GetInputs(); + for (VariableNameMap::iterator it = inputs.begin(); + it != inputs.end(); ++it) { + DLOG << " " << it->first << " " << it->second; + } + + DLOG << " outputs size: " << op->GetOutputs().size(); + VariableNameMap outputs = op->GetOutputs(); + for (VariableNameMap::iterator it = outputs.begin(); + it != outputs.end(); ++it) { + DLOG << " " << it->first << " " << it->second; + } + + input_var_name = op->Input("X")[0]; + output_var_name = op->Output("Out")[0]; + std::shared_ptr> op_ptr = + std::make_shared>( + op->Type(), op->GetInputs(), op->GetOutputs(), + op->GetAttrMap(), program_.scope); + ops_of_block_[*block_desc.get()].push_back(op_ptr); + return; + } + } + } + } + + std::shared_ptr predict(const Tensor &t) { + auto scope = program_.scope; + Variable *input_feed_value = scope->Var(input_var_name); + auto tensor_input = input_feed_value->GetMutable(); + tensor_input->ShareDataWith(t); + + Variable *output = scope->Var(output_var_name); + auto *output_tensor = output->GetMutable(); + output_tensor->mutable_data({1, 2, 8}); + + std::shared_ptr out_tensor = 
std::make_shared(); + out_tensor.reset(output_tensor); + + predict(t, 0); + + return out_tensor; + } + + private: + const framework::Program program_; + std::shared_ptr to_predict_program_; + std::map>>> + ops_of_block_; + bool use_optimize_ = false; + string input_var_name; + string output_var_name; + + void predict(const Tensor &t, int block_id) { + std::shared_ptr to_predict_block = + to_predict_program_->Block(block_id); + for (int j = 0; j < ops_of_block_[*to_predict_block.get()].size(); ++j) { + auto op = ops_of_block_[*to_predict_block.get()][j]; + op->Run(); + } + } +}; + +template class TestTranspose2Op; +} // namespace framework +} // namespace paddle_mobile + +int main() { + DLOG << "----------**********----------"; + DLOG << "begin to run Transpose2 Test"; + paddle_mobile::framework::Loader loader; + auto program = loader.Load(std::string(g_ocr) + "/model", + std::string(g_ocr) + "/params"); + + paddle_mobile::framework::Tensor input; + SetupTensor(&input, {1, 8, 2}, static_cast(0), + static_cast(1)); + auto *input_ptr = input.data(); + for (int i = 0; i < 16; ++i) { + *(input_ptr + i) = i; + } + DLOG << "input : "; + for (int i = 0; i < input.numel(); ++i) { + DLOG << " index " << i << " : " << input_ptr[i]; + } + + paddle_mobile::framework::TestTranspose2Op + testTranspose2Op(program); + + auto output = testTranspose2Op.predict(input); + auto *output_ptr = output->data(); + + DLOG << "output : "; + for (int i = 0; i < output->numel(); ++i) { + DLOG << " index " << i << " : " << output_ptr[i]; + } + return 0; +} diff --git a/test/operators/test_transpose_op.cpp b/test/operators/test_transpose_op.cpp index f83ee23c25d8f2588e0fe40d5fabc6114129b995..263fdcfa0ed448b126f4b9cb01ace889318eeddb 100644 --- a/test/operators/test_transpose_op.cpp +++ b/test/operators/test_transpose_op.cpp @@ -16,7 +16,7 @@ limitations under the License. 
*/ #include "../test_include.h" #include "operators/transpose_op.h" int main() { - paddle_mobile::Loader loader; + paddle_mobile::framework::Loader loader; auto program = loader.Load(std::string(g_mobilenet_ssd)); if (program.originProgram == nullptr) { DLOG << "program read file"; diff --git a/test/test_helper.h b/test/test_helper.h index 41d6faed5229be8944178ea62786477ceadd6416..0eb11efd19b7d937f93eec14e163c8c42cb77f12 100644 --- a/test/test_helper.h +++ b/test/test_helper.h @@ -36,6 +36,7 @@ static const char *g_squeezenet = "../models/squeezenet"; static const char *g_googlenet = "../models/googlenet"; static const char *g_googlenet_quali = "../models/googlenet_combine_quali"; static const char *g_mobilenet = "../models/mobilenet"; +static const char *g_mobilenet_mul = "../models/mobilenet_mul"; static const char *g_alexnet = "../models/alexnet"; static const char *g_inceptionv4 = "../models/inceptionv4"; static const char *g_nlp = "../models/nlp"; @@ -44,8 +45,8 @@ static const char *g_resnet = "../models/resnet"; static const char *g_googlenet_combine = "../models/googlenet_combine"; static const char *g_yolo = "../models/yolo"; static const char *g_yolo_combined = "../models/yolo_combined"; +static const char *g_yolo_mul = "../models/yolo_mul"; static const char *g_fluid_fssd_new = "../models/fluid_fssd_new"; - static const char *g_test_image_1x3x224x224 = "../images/test_image_1x3x224x224_float"; static const char *g_test_image_1x3x224x224_banana = @@ -57,9 +58,12 @@ static const char *g_moto = "../images/moto_300x300_float"; static const char *g_imgfssd_ar = "../images/test_image_ssd_ar"; static const char *g_imgfssd_ar1 = "../images/003_0001.txt"; static const char *g_img = "../images/img.bin"; +static const char *g_yolo_img = "../images/in_put_1_3_416_416_2"; +static const char *g_mobilenet_img = "../images/image"; using paddle_mobile::framework::DDim; using paddle_mobile::framework::Tensor; +using namespace paddle_mobile; template void 
SetupTensor(paddle_mobile::framework::Tensor *input, diff --git a/third_party/opencl/OpenCL-Headers/CL/cl.h b/third_party/opencl/OpenCL-Headers/CL/cl.h new file mode 100644 index 0000000000000000000000000000000000000000..7224ed38faad33d8ed9c25acaeee26400c716aa6 --- /dev/null +++ b/third_party/opencl/OpenCL-Headers/CL/cl.h @@ -0,0 +1,1783 @@ +/******************************************************************************* + * Copyright (c) 2008-2018 The Khronos Group Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and/or associated documentation files (the + * "Materials"), to deal in the Materials without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sublicense, and/or sell copies of the Materials, and to + * permit persons to whom the Materials are furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be included + * in all copies or substantial portions of the Materials. + * + * MODIFICATIONS TO THIS FILE MAY MEAN IT NO LONGER ACCURATELY REFLECTS + * KHRONOS STANDARDS. THE UNMODIFIED, NORMATIVE VERSIONS OF KHRONOS + * SPECIFICATIONS AND HEADER INFORMATION ARE LOCATED AT + * https://www.khronos.org/registry/ + * + * THE MATERIALS ARE PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY + * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * MATERIALS OR THE USE OR OTHER DEALINGS IN THE MATERIALS. 
+ ******************************************************************************/ + +#ifndef __OPENCL_CL_H +#define __OPENCL_CL_H + +#ifdef __APPLE__ +#include +#include +#else +#include +#include +#endif + +#ifdef __cplusplus +extern "C" { +#endif + +/******************************************************************************/ + +typedef struct _cl_platform_id * cl_platform_id; +typedef struct _cl_device_id * cl_device_id; +typedef struct _cl_context * cl_context; +typedef struct _cl_command_queue * cl_command_queue; +typedef struct _cl_mem * cl_mem; +typedef struct _cl_program * cl_program; +typedef struct _cl_kernel * cl_kernel; +typedef struct _cl_event * cl_event; +typedef struct _cl_sampler * cl_sampler; + +typedef cl_uint cl_bool; /* WARNING! Unlike cl_ types in cl_platform.h, cl_bool is not guaranteed to be the same size as the bool in kernels. */ +typedef cl_ulong cl_bitfield; +typedef cl_bitfield cl_device_type; +typedef cl_uint cl_platform_info; +typedef cl_uint cl_device_info; +typedef cl_bitfield cl_device_fp_config; +typedef cl_uint cl_device_mem_cache_type; +typedef cl_uint cl_device_local_mem_type; +typedef cl_bitfield cl_device_exec_capabilities; +#ifdef CL_VERSION_2_0 +typedef cl_bitfield cl_device_svm_capabilities; +#endif +typedef cl_bitfield cl_command_queue_properties; +#ifdef CL_VERSION_1_2 +typedef intptr_t cl_device_partition_property; +typedef cl_bitfield cl_device_affinity_domain; +#endif + +typedef intptr_t cl_context_properties; +typedef cl_uint cl_context_info; +#ifdef CL_VERSION_2_0 +typedef cl_bitfield cl_queue_properties; +#endif +typedef cl_uint cl_command_queue_info; +typedef cl_uint cl_channel_order; +typedef cl_uint cl_channel_type; +typedef cl_bitfield cl_mem_flags; +#ifdef CL_VERSION_2_0 +typedef cl_bitfield cl_svm_mem_flags; +#endif +typedef cl_uint cl_mem_object_type; +typedef cl_uint cl_mem_info; +#ifdef CL_VERSION_1_2 +typedef cl_bitfield cl_mem_migration_flags; +#endif +typedef cl_uint cl_image_info; +#ifdef 
CL_VERSION_1_1 +typedef cl_uint cl_buffer_create_type; +#endif +typedef cl_uint cl_addressing_mode; +typedef cl_uint cl_filter_mode; +typedef cl_uint cl_sampler_info; +typedef cl_bitfield cl_map_flags; +#ifdef CL_VERSION_2_0 +typedef intptr_t cl_pipe_properties; +typedef cl_uint cl_pipe_info; +#endif +typedef cl_uint cl_program_info; +typedef cl_uint cl_program_build_info; +#ifdef CL_VERSION_1_2 +typedef cl_uint cl_program_binary_type; +#endif +typedef cl_int cl_build_status; +typedef cl_uint cl_kernel_info; +#ifdef CL_VERSION_1_2 +typedef cl_uint cl_kernel_arg_info; +typedef cl_uint cl_kernel_arg_address_qualifier; +typedef cl_uint cl_kernel_arg_access_qualifier; +typedef cl_bitfield cl_kernel_arg_type_qualifier; +#endif +typedef cl_uint cl_kernel_work_group_info; +#ifdef CL_VERSION_2_1 +typedef cl_uint cl_kernel_sub_group_info; +#endif +typedef cl_uint cl_event_info; +typedef cl_uint cl_command_type; +typedef cl_uint cl_profiling_info; +#ifdef CL_VERSION_2_0 +typedef cl_bitfield cl_sampler_properties; +typedef cl_uint cl_kernel_exec_info; +#endif + +typedef struct _cl_image_format { + cl_channel_order image_channel_order; + cl_channel_type image_channel_data_type; +} cl_image_format; + +#ifdef CL_VERSION_1_2 + +typedef struct _cl_image_desc { + cl_mem_object_type image_type; + size_t image_width; + size_t image_height; + size_t image_depth; + size_t image_array_size; + size_t image_row_pitch; + size_t image_slice_pitch; + cl_uint num_mip_levels; + cl_uint num_samples; +#ifdef __GNUC__ + __extension__ /* Prevents warnings about anonymous union in -pedantic builds */ +#endif + union { + cl_mem buffer; + cl_mem mem_object; + }; +} cl_image_desc; + +#endif + +#ifdef CL_VERSION_1_1 + +typedef struct _cl_buffer_region { + size_t origin; + size_t size; +} cl_buffer_region; + +#endif + +/******************************************************************************/ + +/* Error Codes */ +#define CL_SUCCESS 0 +#define CL_DEVICE_NOT_FOUND -1 +#define 
CL_DEVICE_NOT_AVAILABLE -2 +#define CL_COMPILER_NOT_AVAILABLE -3 +#define CL_MEM_OBJECT_ALLOCATION_FAILURE -4 +#define CL_OUT_OF_RESOURCES -5 +#define CL_OUT_OF_HOST_MEMORY -6 +#define CL_PROFILING_INFO_NOT_AVAILABLE -7 +#define CL_MEM_COPY_OVERLAP -8 +#define CL_IMAGE_FORMAT_MISMATCH -9 +#define CL_IMAGE_FORMAT_NOT_SUPPORTED -10 +#define CL_BUILD_PROGRAM_FAILURE -11 +#define CL_MAP_FAILURE -12 +#ifdef CL_VERSION_1_1 +#define CL_MISALIGNED_SUB_BUFFER_OFFSET -13 +#define CL_EXEC_STATUS_ERROR_FOR_EVENTS_IN_WAIT_LIST -14 +#endif +#ifdef CL_VERSION_1_2 +#define CL_COMPILE_PROGRAM_FAILURE -15 +#define CL_LINKER_NOT_AVAILABLE -16 +#define CL_LINK_PROGRAM_FAILURE -17 +#define CL_DEVICE_PARTITION_FAILED -18 +#define CL_KERNEL_ARG_INFO_NOT_AVAILABLE -19 +#endif + +#define CL_INVALID_VALUE -30 +#define CL_INVALID_DEVICE_TYPE -31 +#define CL_INVALID_PLATFORM -32 +#define CL_INVALID_DEVICE -33 +#define CL_INVALID_CONTEXT -34 +#define CL_INVALID_QUEUE_PROPERTIES -35 +#define CL_INVALID_COMMAND_QUEUE -36 +#define CL_INVALID_HOST_PTR -37 +#define CL_INVALID_MEM_OBJECT -38 +#define CL_INVALID_IMAGE_FORMAT_DESCRIPTOR -39 +#define CL_INVALID_IMAGE_SIZE -40 +#define CL_INVALID_SAMPLER -41 +#define CL_INVALID_BINARY -42 +#define CL_INVALID_BUILD_OPTIONS -43 +#define CL_INVALID_PROGRAM -44 +#define CL_INVALID_PROGRAM_EXECUTABLE -45 +#define CL_INVALID_KERNEL_NAME -46 +#define CL_INVALID_KERNEL_DEFINITION -47 +#define CL_INVALID_KERNEL -48 +#define CL_INVALID_ARG_INDEX -49 +#define CL_INVALID_ARG_VALUE -50 +#define CL_INVALID_ARG_SIZE -51 +#define CL_INVALID_KERNEL_ARGS -52 +#define CL_INVALID_WORK_DIMENSION -53 +#define CL_INVALID_WORK_GROUP_SIZE -54 +#define CL_INVALID_WORK_ITEM_SIZE -55 +#define CL_INVALID_GLOBAL_OFFSET -56 +#define CL_INVALID_EVENT_WAIT_LIST -57 +#define CL_INVALID_EVENT -58 +#define CL_INVALID_OPERATION -59 +#define CL_INVALID_GL_OBJECT -60 +#define CL_INVALID_BUFFER_SIZE -61 +#define CL_INVALID_MIP_LEVEL -62 +#define CL_INVALID_GLOBAL_WORK_SIZE -63 +#ifdef 
CL_VERSION_1_1 +#define CL_INVALID_PROPERTY -64 +#endif +#ifdef CL_VERSION_1_2 +#define CL_INVALID_IMAGE_DESCRIPTOR -65 +#define CL_INVALID_COMPILER_OPTIONS -66 +#define CL_INVALID_LINKER_OPTIONS -67 +#define CL_INVALID_DEVICE_PARTITION_COUNT -68 +#endif +#ifdef CL_VERSION_2_0 +#define CL_INVALID_PIPE_SIZE -69 +#define CL_INVALID_DEVICE_QUEUE -70 +#endif +#ifdef CL_VERSION_2_2 +#define CL_INVALID_SPEC_ID -71 +#define CL_MAX_SIZE_RESTRICTION_EXCEEDED -72 +#endif + + +/* cl_bool */ +#define CL_FALSE 0 +#define CL_TRUE 1 +#ifdef CL_VERSION_1_2 +#define CL_BLOCKING CL_TRUE +#define CL_NON_BLOCKING CL_FALSE +#endif + +/* cl_platform_info */ +#define CL_PLATFORM_PROFILE 0x0900 +#define CL_PLATFORM_VERSION 0x0901 +#define CL_PLATFORM_NAME 0x0902 +#define CL_PLATFORM_VENDOR 0x0903 +#define CL_PLATFORM_EXTENSIONS 0x0904 +#ifdef CL_VERSION_2_1 +#define CL_PLATFORM_HOST_TIMER_RESOLUTION 0x0905 +#endif + +/* cl_device_type - bitfield */ +#define CL_DEVICE_TYPE_DEFAULT (1 << 0) +#define CL_DEVICE_TYPE_CPU (1 << 1) +#define CL_DEVICE_TYPE_GPU (1 << 2) +#define CL_DEVICE_TYPE_ACCELERATOR (1 << 3) +#ifdef CL_VERSION_1_2 +#define CL_DEVICE_TYPE_CUSTOM (1 << 4) +#endif +#define CL_DEVICE_TYPE_ALL 0xFFFFFFFF + +/* cl_device_info */ +#define CL_DEVICE_TYPE 0x1000 +#define CL_DEVICE_VENDOR_ID 0x1001 +#define CL_DEVICE_MAX_COMPUTE_UNITS 0x1002 +#define CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS 0x1003 +#define CL_DEVICE_MAX_WORK_GROUP_SIZE 0x1004 +#define CL_DEVICE_MAX_WORK_ITEM_SIZES 0x1005 +#define CL_DEVICE_PREFERRED_VECTOR_WIDTH_CHAR 0x1006 +#define CL_DEVICE_PREFERRED_VECTOR_WIDTH_SHORT 0x1007 +#define CL_DEVICE_PREFERRED_VECTOR_WIDTH_INT 0x1008 +#define CL_DEVICE_PREFERRED_VECTOR_WIDTH_LONG 0x1009 +#define CL_DEVICE_PREFERRED_VECTOR_WIDTH_FLOAT 0x100A +#define CL_DEVICE_PREFERRED_VECTOR_WIDTH_DOUBLE 0x100B +#define CL_DEVICE_MAX_CLOCK_FREQUENCY 0x100C +#define CL_DEVICE_ADDRESS_BITS 0x100D +#define CL_DEVICE_MAX_READ_IMAGE_ARGS 0x100E +#define CL_DEVICE_MAX_WRITE_IMAGE_ARGS 0x100F 
+#define CL_DEVICE_MAX_MEM_ALLOC_SIZE 0x1010 +#define CL_DEVICE_IMAGE2D_MAX_WIDTH 0x1011 +#define CL_DEVICE_IMAGE2D_MAX_HEIGHT 0x1012 +#define CL_DEVICE_IMAGE3D_MAX_WIDTH 0x1013 +#define CL_DEVICE_IMAGE3D_MAX_HEIGHT 0x1014 +#define CL_DEVICE_IMAGE3D_MAX_DEPTH 0x1015 +#define CL_DEVICE_IMAGE_SUPPORT 0x1016 +#define CL_DEVICE_MAX_PARAMETER_SIZE 0x1017 +#define CL_DEVICE_MAX_SAMPLERS 0x1018 +#define CL_DEVICE_MEM_BASE_ADDR_ALIGN 0x1019 +#define CL_DEVICE_MIN_DATA_TYPE_ALIGN_SIZE 0x101A +#define CL_DEVICE_SINGLE_FP_CONFIG 0x101B +#define CL_DEVICE_GLOBAL_MEM_CACHE_TYPE 0x101C +#define CL_DEVICE_GLOBAL_MEM_CACHELINE_SIZE 0x101D +#define CL_DEVICE_GLOBAL_MEM_CACHE_SIZE 0x101E +#define CL_DEVICE_GLOBAL_MEM_SIZE 0x101F +#define CL_DEVICE_MAX_CONSTANT_BUFFER_SIZE 0x1020 +#define CL_DEVICE_MAX_CONSTANT_ARGS 0x1021 +#define CL_DEVICE_LOCAL_MEM_TYPE 0x1022 +#define CL_DEVICE_LOCAL_MEM_SIZE 0x1023 +#define CL_DEVICE_ERROR_CORRECTION_SUPPORT 0x1024 +#define CL_DEVICE_PROFILING_TIMER_RESOLUTION 0x1025 +#define CL_DEVICE_ENDIAN_LITTLE 0x1026 +#define CL_DEVICE_AVAILABLE 0x1027 +#define CL_DEVICE_COMPILER_AVAILABLE 0x1028 +#define CL_DEVICE_EXECUTION_CAPABILITIES 0x1029 +#define CL_DEVICE_QUEUE_PROPERTIES 0x102A /* deprecated */ +#ifdef CL_VERSION_2_0 +#define CL_DEVICE_QUEUE_ON_HOST_PROPERTIES 0x102A +#endif +#define CL_DEVICE_NAME 0x102B +#define CL_DEVICE_VENDOR 0x102C +#define CL_DRIVER_VERSION 0x102D +#define CL_DEVICE_PROFILE 0x102E +#define CL_DEVICE_VERSION 0x102F +#define CL_DEVICE_EXTENSIONS 0x1030 +#define CL_DEVICE_PLATFORM 0x1031 +#ifdef CL_VERSION_1_2 +#define CL_DEVICE_DOUBLE_FP_CONFIG 0x1032 +#endif +/* 0x1033 reserved for CL_DEVICE_HALF_FP_CONFIG which is already defined in "cl_ext.h" */ +#ifdef CL_VERSION_1_1 +#define CL_DEVICE_PREFERRED_VECTOR_WIDTH_HALF 0x1034 +#define CL_DEVICE_HOST_UNIFIED_MEMORY 0x1035 /* deprecated */ +#define CL_DEVICE_NATIVE_VECTOR_WIDTH_CHAR 0x1036 +#define CL_DEVICE_NATIVE_VECTOR_WIDTH_SHORT 0x1037 +#define 
CL_DEVICE_NATIVE_VECTOR_WIDTH_INT 0x1038 +#define CL_DEVICE_NATIVE_VECTOR_WIDTH_LONG 0x1039 +#define CL_DEVICE_NATIVE_VECTOR_WIDTH_FLOAT 0x103A +#define CL_DEVICE_NATIVE_VECTOR_WIDTH_DOUBLE 0x103B +#define CL_DEVICE_NATIVE_VECTOR_WIDTH_HALF 0x103C +#define CL_DEVICE_OPENCL_C_VERSION 0x103D +#endif +#ifdef CL_VERSION_1_2 +#define CL_DEVICE_LINKER_AVAILABLE 0x103E +#define CL_DEVICE_BUILT_IN_KERNELS 0x103F +#define CL_DEVICE_IMAGE_MAX_BUFFER_SIZE 0x1040 +#define CL_DEVICE_IMAGE_MAX_ARRAY_SIZE 0x1041 +#define CL_DEVICE_PARENT_DEVICE 0x1042 +#define CL_DEVICE_PARTITION_MAX_SUB_DEVICES 0x1043 +#define CL_DEVICE_PARTITION_PROPERTIES 0x1044 +#define CL_DEVICE_PARTITION_AFFINITY_DOMAIN 0x1045 +#define CL_DEVICE_PARTITION_TYPE 0x1046 +#define CL_DEVICE_REFERENCE_COUNT 0x1047 +#define CL_DEVICE_PREFERRED_INTEROP_USER_SYNC 0x1048 +#define CL_DEVICE_PRINTF_BUFFER_SIZE 0x1049 +#define CL_DEVICE_IMAGE_PITCH_ALIGNMENT 0x104A +#define CL_DEVICE_IMAGE_BASE_ADDRESS_ALIGNMENT 0x104B +#endif +#ifdef CL_VERSION_2_0 +#define CL_DEVICE_MAX_READ_WRITE_IMAGE_ARGS 0x104C +#define CL_DEVICE_MAX_GLOBAL_VARIABLE_SIZE 0x104D +#define CL_DEVICE_QUEUE_ON_DEVICE_PROPERTIES 0x104E +#define CL_DEVICE_QUEUE_ON_DEVICE_PREFERRED_SIZE 0x104F +#define CL_DEVICE_QUEUE_ON_DEVICE_MAX_SIZE 0x1050 +#define CL_DEVICE_MAX_ON_DEVICE_QUEUES 0x1051 +#define CL_DEVICE_MAX_ON_DEVICE_EVENTS 0x1052 +#define CL_DEVICE_SVM_CAPABILITIES 0x1053 +#define CL_DEVICE_GLOBAL_VARIABLE_PREFERRED_TOTAL_SIZE 0x1054 +#define CL_DEVICE_MAX_PIPE_ARGS 0x1055 +#define CL_DEVICE_PIPE_MAX_ACTIVE_RESERVATIONS 0x1056 +#define CL_DEVICE_PIPE_MAX_PACKET_SIZE 0x1057 +#define CL_DEVICE_PREFERRED_PLATFORM_ATOMIC_ALIGNMENT 0x1058 +#define CL_DEVICE_PREFERRED_GLOBAL_ATOMIC_ALIGNMENT 0x1059 +#define CL_DEVICE_PREFERRED_LOCAL_ATOMIC_ALIGNMENT 0x105A +#endif +#ifdef CL_VERSION_2_1 +#define CL_DEVICE_IL_VERSION 0x105B +#define CL_DEVICE_MAX_NUM_SUB_GROUPS 0x105C +#define CL_DEVICE_SUB_GROUP_INDEPENDENT_FORWARD_PROGRESS 0x105D +#endif + +/* 
cl_device_fp_config - bitfield */ +#define CL_FP_DENORM (1 << 0) +#define CL_FP_INF_NAN (1 << 1) +#define CL_FP_ROUND_TO_NEAREST (1 << 2) +#define CL_FP_ROUND_TO_ZERO (1 << 3) +#define CL_FP_ROUND_TO_INF (1 << 4) +#define CL_FP_FMA (1 << 5) +#ifdef CL_VERSION_1_1 +#define CL_FP_SOFT_FLOAT (1 << 6) +#endif +#ifdef CL_VERSION_1_2 +#define CL_FP_CORRECTLY_ROUNDED_DIVIDE_SQRT (1 << 7) +#endif + +/* cl_device_mem_cache_type */ +#define CL_NONE 0x0 +#define CL_READ_ONLY_CACHE 0x1 +#define CL_READ_WRITE_CACHE 0x2 + +/* cl_device_local_mem_type */ +#define CL_LOCAL 0x1 +#define CL_GLOBAL 0x2 + +/* cl_device_exec_capabilities - bitfield */ +#define CL_EXEC_KERNEL (1 << 0) +#define CL_EXEC_NATIVE_KERNEL (1 << 1) + +/* cl_command_queue_properties - bitfield */ +#define CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE (1 << 0) +#define CL_QUEUE_PROFILING_ENABLE (1 << 1) +#ifdef CL_VERSION_2_0 +#define CL_QUEUE_ON_DEVICE (1 << 2) +#define CL_QUEUE_ON_DEVICE_DEFAULT (1 << 3) +#endif + +/* cl_context_info */ +#define CL_CONTEXT_REFERENCE_COUNT 0x1080 +#define CL_CONTEXT_DEVICES 0x1081 +#define CL_CONTEXT_PROPERTIES 0x1082 +#ifdef CL_VERSION_1_1 +#define CL_CONTEXT_NUM_DEVICES 0x1083 +#endif + +/* cl_context_properties */ +#define CL_CONTEXT_PLATFORM 0x1084 +#ifdef CL_VERSION_1_2 +#define CL_CONTEXT_INTEROP_USER_SYNC 0x1085 +#endif + +#ifdef CL_VERSION_1_2 + +/* cl_device_partition_property */ +#define CL_DEVICE_PARTITION_EQUALLY 0x1086 +#define CL_DEVICE_PARTITION_BY_COUNTS 0x1087 +#define CL_DEVICE_PARTITION_BY_COUNTS_LIST_END 0x0 +#define CL_DEVICE_PARTITION_BY_AFFINITY_DOMAIN 0x1088 + +#endif + +#ifdef CL_VERSION_1_2 + +/* cl_device_affinity_domain */ +#define CL_DEVICE_AFFINITY_DOMAIN_NUMA (1 << 0) +#define CL_DEVICE_AFFINITY_DOMAIN_L4_CACHE (1 << 1) +#define CL_DEVICE_AFFINITY_DOMAIN_L3_CACHE (1 << 2) +#define CL_DEVICE_AFFINITY_DOMAIN_L2_CACHE (1 << 3) +#define CL_DEVICE_AFFINITY_DOMAIN_L1_CACHE (1 << 4) +#define CL_DEVICE_AFFINITY_DOMAIN_NEXT_PARTITIONABLE (1 << 5) + +#endif + 
+#ifdef CL_VERSION_2_0 + +/* cl_device_svm_capabilities */ +#define CL_DEVICE_SVM_COARSE_GRAIN_BUFFER (1 << 0) +#define CL_DEVICE_SVM_FINE_GRAIN_BUFFER (1 << 1) +#define CL_DEVICE_SVM_FINE_GRAIN_SYSTEM (1 << 2) +#define CL_DEVICE_SVM_ATOMICS (1 << 3) + +#endif + +/* cl_command_queue_info */ +#define CL_QUEUE_CONTEXT 0x1090 +#define CL_QUEUE_DEVICE 0x1091 +#define CL_QUEUE_REFERENCE_COUNT 0x1092 +#define CL_QUEUE_PROPERTIES 0x1093 +#ifdef CL_VERSION_2_0 +#define CL_QUEUE_SIZE 0x1094 +#endif +#ifdef CL_VERSION_2_1 +#define CL_QUEUE_DEVICE_DEFAULT 0x1095 +#endif + +/* cl_mem_flags and cl_svm_mem_flags - bitfield */ +#define CL_MEM_READ_WRITE (1 << 0) +#define CL_MEM_WRITE_ONLY (1 << 1) +#define CL_MEM_READ_ONLY (1 << 2) +#define CL_MEM_USE_HOST_PTR (1 << 3) +#define CL_MEM_ALLOC_HOST_PTR (1 << 4) +#define CL_MEM_COPY_HOST_PTR (1 << 5) +/* reserved (1 << 6) */ +#ifdef CL_VERSION_1_2 +#define CL_MEM_HOST_WRITE_ONLY (1 << 7) +#define CL_MEM_HOST_READ_ONLY (1 << 8) +#define CL_MEM_HOST_NO_ACCESS (1 << 9) +#endif +#ifdef CL_VERSION_2_0 +#define CL_MEM_SVM_FINE_GRAIN_BUFFER (1 << 10) /* used by cl_svm_mem_flags only */ +#define CL_MEM_SVM_ATOMICS (1 << 11) /* used by cl_svm_mem_flags only */ +#define CL_MEM_KERNEL_READ_AND_WRITE (1 << 12) +#endif + +#ifdef CL_VERSION_1_2 + +/* cl_mem_migration_flags - bitfield */ +#define CL_MIGRATE_MEM_OBJECT_HOST (1 << 0) +#define CL_MIGRATE_MEM_OBJECT_CONTENT_UNDEFINED (1 << 1) + +#endif + +/* cl_channel_order */ +#define CL_R 0x10B0 +#define CL_A 0x10B1 +#define CL_RG 0x10B2 +#define CL_RA 0x10B3 +#define CL_RGB 0x10B4 +#define CL_RGBA 0x10B5 +#define CL_BGRA 0x10B6 +#define CL_ARGB 0x10B7 +#define CL_INTENSITY 0x10B8 +#define CL_LUMINANCE 0x10B9 +#ifdef CL_VERSION_1_1 +#define CL_Rx 0x10BA +#define CL_RGx 0x10BB +#define CL_RGBx 0x10BC +#endif +#ifdef CL_VERSION_1_2 +#define CL_DEPTH 0x10BD +#define CL_DEPTH_STENCIL 0x10BE +#endif +#ifdef CL_VERSION_2_0 +#define CL_sRGB 0x10BF +#define CL_sRGBx 0x10C0 +#define CL_sRGBA 0x10C1 +#define 
CL_sBGRA 0x10C2 +#define CL_ABGR 0x10C3 +#endif + +/* cl_channel_type */ +#define CL_SNORM_INT8 0x10D0 +#define CL_SNORM_INT16 0x10D1 +#define CL_UNORM_INT8 0x10D2 +#define CL_UNORM_INT16 0x10D3 +#define CL_UNORM_SHORT_565 0x10D4 +#define CL_UNORM_SHORT_555 0x10D5 +#define CL_UNORM_INT_101010 0x10D6 +#define CL_SIGNED_INT8 0x10D7 +#define CL_SIGNED_INT16 0x10D8 +#define CL_SIGNED_INT32 0x10D9 +#define CL_UNSIGNED_INT8 0x10DA +#define CL_UNSIGNED_INT16 0x10DB +#define CL_UNSIGNED_INT32 0x10DC +#define CL_HALF_FLOAT 0x10DD +#define CL_FLOAT 0x10DE +#ifdef CL_VERSION_1_2 +#define CL_UNORM_INT24 0x10DF +#endif +#ifdef CL_VERSION_2_1 +#define CL_UNORM_INT_101010_2 0x10E0 +#endif + +/* cl_mem_object_type */ +#define CL_MEM_OBJECT_BUFFER 0x10F0 +#define CL_MEM_OBJECT_IMAGE2D 0x10F1 +#define CL_MEM_OBJECT_IMAGE3D 0x10F2 +#ifdef CL_VERSION_1_2 +#define CL_MEM_OBJECT_IMAGE2D_ARRAY 0x10F3 +#define CL_MEM_OBJECT_IMAGE1D 0x10F4 +#define CL_MEM_OBJECT_IMAGE1D_ARRAY 0x10F5 +#define CL_MEM_OBJECT_IMAGE1D_BUFFER 0x10F6 +#endif +#ifdef CL_VERSION_2_0 +#define CL_MEM_OBJECT_PIPE 0x10F7 +#endif + +/* cl_mem_info */ +#define CL_MEM_TYPE 0x1100 +#define CL_MEM_FLAGS 0x1101 +#define CL_MEM_SIZE 0x1102 +#define CL_MEM_HOST_PTR 0x1103 +#define CL_MEM_MAP_COUNT 0x1104 +#define CL_MEM_REFERENCE_COUNT 0x1105 +#define CL_MEM_CONTEXT 0x1106 +#ifdef CL_VERSION_1_1 +#define CL_MEM_ASSOCIATED_MEMOBJECT 0x1107 +#define CL_MEM_OFFSET 0x1108 +#endif +#ifdef CL_VERSION_2_0 +#define CL_MEM_USES_SVM_POINTER 0x1109 +#endif + +/* cl_image_info */ +#define CL_IMAGE_FORMAT 0x1110 +#define CL_IMAGE_ELEMENT_SIZE 0x1111 +#define CL_IMAGE_ROW_PITCH 0x1112 +#define CL_IMAGE_SLICE_PITCH 0x1113 +#define CL_IMAGE_WIDTH 0x1114 +#define CL_IMAGE_HEIGHT 0x1115 +#define CL_IMAGE_DEPTH 0x1116 +#ifdef CL_VERSION_1_2 +#define CL_IMAGE_ARRAY_SIZE 0x1117 +#define CL_IMAGE_BUFFER 0x1118 +#define CL_IMAGE_NUM_MIP_LEVELS 0x1119 +#define CL_IMAGE_NUM_SAMPLES 0x111A +#endif + +#ifdef CL_VERSION_2_0 + +/* cl_pipe_info */ +#define 
CL_PIPE_PACKET_SIZE 0x1120 +#define CL_PIPE_MAX_PACKETS 0x1121 + +#endif + +/* cl_addressing_mode */ +#define CL_ADDRESS_NONE 0x1130 +#define CL_ADDRESS_CLAMP_TO_EDGE 0x1131 +#define CL_ADDRESS_CLAMP 0x1132 +#define CL_ADDRESS_REPEAT 0x1133 +#ifdef CL_VERSION_1_1 +#define CL_ADDRESS_MIRRORED_REPEAT 0x1134 +#endif + +/* cl_filter_mode */ +#define CL_FILTER_NEAREST 0x1140 +#define CL_FILTER_LINEAR 0x1141 + +/* cl_sampler_info */ +#define CL_SAMPLER_REFERENCE_COUNT 0x1150 +#define CL_SAMPLER_CONTEXT 0x1151 +#define CL_SAMPLER_NORMALIZED_COORDS 0x1152 +#define CL_SAMPLER_ADDRESSING_MODE 0x1153 +#define CL_SAMPLER_FILTER_MODE 0x1154 +#ifdef CL_VERSION_2_0 +#define CL_SAMPLER_MIP_FILTER_MODE 0x1155 +#define CL_SAMPLER_LOD_MIN 0x1156 +#define CL_SAMPLER_LOD_MAX 0x1157 +#endif + +/* cl_map_flags - bitfield */ +#define CL_MAP_READ (1 << 0) +#define CL_MAP_WRITE (1 << 1) +#ifdef CL_VERSION_1_2 +#define CL_MAP_WRITE_INVALIDATE_REGION (1 << 2) +#endif + +/* cl_program_info */ +#define CL_PROGRAM_REFERENCE_COUNT 0x1160 +#define CL_PROGRAM_CONTEXT 0x1161 +#define CL_PROGRAM_NUM_DEVICES 0x1162 +#define CL_PROGRAM_DEVICES 0x1163 +#define CL_PROGRAM_SOURCE 0x1164 +#define CL_PROGRAM_BINARY_SIZES 0x1165 +#define CL_PROGRAM_BINARIES 0x1166 +#ifdef CL_VERSION_1_2 +#define CL_PROGRAM_NUM_KERNELS 0x1167 +#define CL_PROGRAM_KERNEL_NAMES 0x1168 +#endif +#ifdef CL_VERSION_2_1 +#define CL_PROGRAM_IL 0x1169 +#endif +#ifdef CL_VERSION_2_2 +#define CL_PROGRAM_SCOPE_GLOBAL_CTORS_PRESENT 0x116A +#define CL_PROGRAM_SCOPE_GLOBAL_DTORS_PRESENT 0x116B +#endif + +/* cl_program_build_info */ +#define CL_PROGRAM_BUILD_STATUS 0x1181 +#define CL_PROGRAM_BUILD_OPTIONS 0x1182 +#define CL_PROGRAM_BUILD_LOG 0x1183 +#ifdef CL_VERSION_1_2 +#define CL_PROGRAM_BINARY_TYPE 0x1184 +#endif +#ifdef CL_VERSION_2_0 +#define CL_PROGRAM_BUILD_GLOBAL_VARIABLE_TOTAL_SIZE 0x1185 +#endif + +#ifdef CL_VERSION_1_2 + +/* cl_program_binary_type */ +#define CL_PROGRAM_BINARY_TYPE_NONE 0x0 +#define 
CL_PROGRAM_BINARY_TYPE_COMPILED_OBJECT 0x1 +#define CL_PROGRAM_BINARY_TYPE_LIBRARY 0x2 +#define CL_PROGRAM_BINARY_TYPE_EXECUTABLE 0x4 + +#endif + +/* cl_build_status */ +#define CL_BUILD_SUCCESS 0 +#define CL_BUILD_NONE -1 +#define CL_BUILD_ERROR -2 +#define CL_BUILD_IN_PROGRESS -3 + +/* cl_kernel_info */ +#define CL_KERNEL_FUNCTION_NAME 0x1190 +#define CL_KERNEL_NUM_ARGS 0x1191 +#define CL_KERNEL_REFERENCE_COUNT 0x1192 +#define CL_KERNEL_CONTEXT 0x1193 +#define CL_KERNEL_PROGRAM 0x1194 +#ifdef CL_VERSION_1_2 +#define CL_KERNEL_ATTRIBUTES 0x1195 +#endif +#ifdef CL_VERSION_2_1 +#define CL_KERNEL_MAX_NUM_SUB_GROUPS 0x11B9 +#define CL_KERNEL_COMPILE_NUM_SUB_GROUPS 0x11BA +#endif + +#ifdef CL_VERSION_1_2 + +/* cl_kernel_arg_info */ +#define CL_KERNEL_ARG_ADDRESS_QUALIFIER 0x1196 +#define CL_KERNEL_ARG_ACCESS_QUALIFIER 0x1197 +#define CL_KERNEL_ARG_TYPE_NAME 0x1198 +#define CL_KERNEL_ARG_TYPE_QUALIFIER 0x1199 +#define CL_KERNEL_ARG_NAME 0x119A + +#endif + +#ifdef CL_VERSION_1_2 + +/* cl_kernel_arg_address_qualifier */ +#define CL_KERNEL_ARG_ADDRESS_GLOBAL 0x119B +#define CL_KERNEL_ARG_ADDRESS_LOCAL 0x119C +#define CL_KERNEL_ARG_ADDRESS_CONSTANT 0x119D +#define CL_KERNEL_ARG_ADDRESS_PRIVATE 0x119E + +#endif + +#ifdef CL_VERSION_1_2 + +/* cl_kernel_arg_access_qualifier */ +#define CL_KERNEL_ARG_ACCESS_READ_ONLY 0x11A0 +#define CL_KERNEL_ARG_ACCESS_WRITE_ONLY 0x11A1 +#define CL_KERNEL_ARG_ACCESS_READ_WRITE 0x11A2 +#define CL_KERNEL_ARG_ACCESS_NONE 0x11A3 + +#endif + +#ifdef CL_VERSION_1_2 + +/* cl_kernel_arg_type_qualifier */ +#define CL_KERNEL_ARG_TYPE_NONE 0 +#define CL_KERNEL_ARG_TYPE_CONST (1 << 0) +#define CL_KERNEL_ARG_TYPE_RESTRICT (1 << 1) +#define CL_KERNEL_ARG_TYPE_VOLATILE (1 << 2) +#ifdef CL_VERSION_2_0 +#define CL_KERNEL_ARG_TYPE_PIPE (1 << 3) +#endif + +#endif + +/* cl_kernel_work_group_info */ +#define CL_KERNEL_WORK_GROUP_SIZE 0x11B0 +#define CL_KERNEL_COMPILE_WORK_GROUP_SIZE 0x11B1 +#define CL_KERNEL_LOCAL_MEM_SIZE 0x11B2 +#define 
CL_KERNEL_PREFERRED_WORK_GROUP_SIZE_MULTIPLE 0x11B3 +#define CL_KERNEL_PRIVATE_MEM_SIZE 0x11B4 +#ifdef CL_VERSION_1_2 +#define CL_KERNEL_GLOBAL_WORK_SIZE 0x11B5 +#endif + +#ifdef CL_VERSION_2_1 + +/* cl_kernel_sub_group_info */ +#define CL_KERNEL_MAX_SUB_GROUP_SIZE_FOR_NDRANGE 0x2033 +#define CL_KERNEL_SUB_GROUP_COUNT_FOR_NDRANGE 0x2034 +#define CL_KERNEL_LOCAL_SIZE_FOR_SUB_GROUP_COUNT 0x11B8 + +#endif + +#ifdef CL_VERSION_2_0 + +/* cl_kernel_exec_info */ +#define CL_KERNEL_EXEC_INFO_SVM_PTRS 0x11B6 +#define CL_KERNEL_EXEC_INFO_SVM_FINE_GRAIN_SYSTEM 0x11B7 + +#endif + +/* cl_event_info */ +#define CL_EVENT_COMMAND_QUEUE 0x11D0 +#define CL_EVENT_COMMAND_TYPE 0x11D1 +#define CL_EVENT_REFERENCE_COUNT 0x11D2 +#define CL_EVENT_COMMAND_EXECUTION_STATUS 0x11D3 +#ifdef CL_VERSION_1_1 +#define CL_EVENT_CONTEXT 0x11D4 +#endif + +/* cl_command_type */ +#define CL_COMMAND_NDRANGE_KERNEL 0x11F0 +#define CL_COMMAND_TASK 0x11F1 +#define CL_COMMAND_NATIVE_KERNEL 0x11F2 +#define CL_COMMAND_READ_BUFFER 0x11F3 +#define CL_COMMAND_WRITE_BUFFER 0x11F4 +#define CL_COMMAND_COPY_BUFFER 0x11F5 +#define CL_COMMAND_READ_IMAGE 0x11F6 +#define CL_COMMAND_WRITE_IMAGE 0x11F7 +#define CL_COMMAND_COPY_IMAGE 0x11F8 +#define CL_COMMAND_COPY_IMAGE_TO_BUFFER 0x11F9 +#define CL_COMMAND_COPY_BUFFER_TO_IMAGE 0x11FA +#define CL_COMMAND_MAP_BUFFER 0x11FB +#define CL_COMMAND_MAP_IMAGE 0x11FC +#define CL_COMMAND_UNMAP_MEM_OBJECT 0x11FD +#define CL_COMMAND_MARKER 0x11FE +#define CL_COMMAND_ACQUIRE_GL_OBJECTS 0x11FF +#define CL_COMMAND_RELEASE_GL_OBJECTS 0x1200 +#ifdef CL_VERSION_1_1 +#define CL_COMMAND_READ_BUFFER_RECT 0x1201 +#define CL_COMMAND_WRITE_BUFFER_RECT 0x1202 +#define CL_COMMAND_COPY_BUFFER_RECT 0x1203 +#define CL_COMMAND_USER 0x1204 +#endif +#ifdef CL_VERSION_1_2 +#define CL_COMMAND_BARRIER 0x1205 +#define CL_COMMAND_MIGRATE_MEM_OBJECTS 0x1206 +#define CL_COMMAND_FILL_BUFFER 0x1207 +#define CL_COMMAND_FILL_IMAGE 0x1208 +#endif +#ifdef CL_VERSION_2_0 +#define CL_COMMAND_SVM_FREE 0x1209 +#define 
CL_COMMAND_SVM_MEMCPY 0x120A +#define CL_COMMAND_SVM_MEMFILL 0x120B +#define CL_COMMAND_SVM_MAP 0x120C +#define CL_COMMAND_SVM_UNMAP 0x120D +#endif + +/* command execution status */ +#define CL_COMPLETE 0x0 +#define CL_RUNNING 0x1 +#define CL_SUBMITTED 0x2 +#define CL_QUEUED 0x3 + +#ifdef CL_VERSION_1_1 + +/* cl_buffer_create_type */ +#define CL_BUFFER_CREATE_TYPE_REGION 0x1220 + +#endif + +/* cl_profiling_info */ +#define CL_PROFILING_COMMAND_QUEUED 0x1280 +#define CL_PROFILING_COMMAND_SUBMIT 0x1281 +#define CL_PROFILING_COMMAND_START 0x1282 +#define CL_PROFILING_COMMAND_END 0x1283 +#ifdef CL_VERSION_2_0 +#define CL_PROFILING_COMMAND_COMPLETE 0x1284 +#endif + +/********************************************************************************************************/ + +/* Platform API */ +extern CL_API_ENTRY cl_int CL_API_CALL +clGetPlatformIDs(cl_uint /* num_entries */, + cl_platform_id * /* platforms */, + cl_uint * /* num_platforms */) CL_API_SUFFIX__VERSION_1_0; + +extern CL_API_ENTRY cl_int CL_API_CALL +clGetPlatformInfo(cl_platform_id /* platform */, + cl_platform_info /* param_name */, + size_t /* param_value_size */, + void * /* param_value */, + size_t * /* param_value_size_ret */) CL_API_SUFFIX__VERSION_1_0; + +/* Device APIs */ +extern CL_API_ENTRY cl_int CL_API_CALL +clGetDeviceIDs(cl_platform_id /* platform */, + cl_device_type /* device_type */, + cl_uint /* num_entries */, + cl_device_id * /* devices */, + cl_uint * /* num_devices */) CL_API_SUFFIX__VERSION_1_0; + +extern CL_API_ENTRY cl_int CL_API_CALL +clGetDeviceInfo(cl_device_id /* device */, + cl_device_info /* param_name */, + size_t /* param_value_size */, + void * /* param_value */, + size_t * /* param_value_size_ret */) CL_API_SUFFIX__VERSION_1_0; + +#ifdef CL_VERSION_1_2 + +extern CL_API_ENTRY cl_int CL_API_CALL +clCreateSubDevices(cl_device_id /* in_device */, + const cl_device_partition_property * /* properties */, + cl_uint /* num_devices */, + cl_device_id * /* out_devices */, + cl_uint 
* /* num_devices_ret */) CL_API_SUFFIX__VERSION_1_2; + +extern CL_API_ENTRY cl_int CL_API_CALL +clRetainDevice(cl_device_id /* device */) CL_API_SUFFIX__VERSION_1_2; + +extern CL_API_ENTRY cl_int CL_API_CALL +clReleaseDevice(cl_device_id /* device */) CL_API_SUFFIX__VERSION_1_2; + +#endif + +#ifdef CL_VERSION_2_1 + +extern CL_API_ENTRY cl_int CL_API_CALL +clSetDefaultDeviceCommandQueue(cl_context /* context */, + cl_device_id /* device */, + cl_command_queue /* command_queue */) CL_API_SUFFIX__VERSION_2_1; + +extern CL_API_ENTRY cl_int CL_API_CALL +clGetDeviceAndHostTimer(cl_device_id /* device */, + cl_ulong* /* device_timestamp */, + cl_ulong* /* host_timestamp */) CL_API_SUFFIX__VERSION_2_1; + +extern CL_API_ENTRY cl_int CL_API_CALL +clGetHostTimer(cl_device_id /* device */, + cl_ulong * /* host_timestamp */) CL_API_SUFFIX__VERSION_2_1; + +#endif + +/* Context APIs */ +extern CL_API_ENTRY cl_context CL_API_CALL +clCreateContext(const cl_context_properties * /* properties */, + cl_uint /* num_devices */, + const cl_device_id * /* devices */, + void (CL_CALLBACK * /* pfn_notify */)(const char *, const void *, size_t, void *), + void * /* user_data */, + cl_int * /* errcode_ret */) CL_API_SUFFIX__VERSION_1_0; + +extern CL_API_ENTRY cl_context CL_API_CALL +clCreateContextFromType(const cl_context_properties * /* properties */, + cl_device_type /* device_type */, + void (CL_CALLBACK * /* pfn_notify*/ )(const char *, const void *, size_t, void *), + void * /* user_data */, + cl_int * /* errcode_ret */) CL_API_SUFFIX__VERSION_1_0; + +extern CL_API_ENTRY cl_int CL_API_CALL +clRetainContext(cl_context /* context */) CL_API_SUFFIX__VERSION_1_0; + +extern CL_API_ENTRY cl_int CL_API_CALL +clReleaseContext(cl_context /* context */) CL_API_SUFFIX__VERSION_1_0; + +extern CL_API_ENTRY cl_int CL_API_CALL +clGetContextInfo(cl_context /* context */, + cl_context_info /* param_name */, + size_t /* param_value_size */, + void * /* param_value */, + size_t * /* param_value_size_ret 
*/) CL_API_SUFFIX__VERSION_1_0; + +/* Command Queue APIs */ + +#ifdef CL_VERSION_2_0 + +extern CL_API_ENTRY cl_command_queue CL_API_CALL +clCreateCommandQueueWithProperties(cl_context /* context */, + cl_device_id /* device */, + const cl_queue_properties * /* properties */, + cl_int * /* errcode_ret */) CL_API_SUFFIX__VERSION_2_0; + +#endif + +extern CL_API_ENTRY cl_int CL_API_CALL +clRetainCommandQueue(cl_command_queue /* command_queue */) CL_API_SUFFIX__VERSION_1_0; + +extern CL_API_ENTRY cl_int CL_API_CALL +clReleaseCommandQueue(cl_command_queue /* command_queue */) CL_API_SUFFIX__VERSION_1_0; + +extern CL_API_ENTRY cl_int CL_API_CALL +clGetCommandQueueInfo(cl_command_queue /* command_queue */, + cl_command_queue_info /* param_name */, + size_t /* param_value_size */, + void * /* param_value */, + size_t * /* param_value_size_ret */) CL_API_SUFFIX__VERSION_1_0; + +/* Memory Object APIs */ +extern CL_API_ENTRY cl_mem CL_API_CALL +clCreateBuffer(cl_context /* context */, + cl_mem_flags /* flags */, + size_t /* size */, + void * /* host_ptr */, + cl_int * /* errcode_ret */) CL_API_SUFFIX__VERSION_1_0; + +#ifdef CL_VERSION_1_1 + +extern CL_API_ENTRY cl_mem CL_API_CALL +clCreateSubBuffer(cl_mem /* buffer */, + cl_mem_flags /* flags */, + cl_buffer_create_type /* buffer_create_type */, + const void * /* buffer_create_info */, + cl_int * /* errcode_ret */) CL_API_SUFFIX__VERSION_1_1; + +#endif + +#ifdef CL_VERSION_1_2 + +extern CL_API_ENTRY cl_mem CL_API_CALL +clCreateImage(cl_context /* context */, + cl_mem_flags /* flags */, + const cl_image_format * /* image_format */, + const cl_image_desc * /* image_desc */, + void * /* host_ptr */, + cl_int * /* errcode_ret */) CL_API_SUFFIX__VERSION_1_2; + +#endif + +#ifdef CL_VERSION_2_0 + +extern CL_API_ENTRY cl_mem CL_API_CALL +clCreatePipe(cl_context /* context */, + cl_mem_flags /* flags */, + cl_uint /* pipe_packet_size */, + cl_uint /* pipe_max_packets */, + const cl_pipe_properties * /* properties */, + cl_int * /* 
errcode_ret */) CL_API_SUFFIX__VERSION_2_0; + +#endif + +extern CL_API_ENTRY cl_int CL_API_CALL +clRetainMemObject(cl_mem /* memobj */) CL_API_SUFFIX__VERSION_1_0; + +extern CL_API_ENTRY cl_int CL_API_CALL +clReleaseMemObject(cl_mem /* memobj */) CL_API_SUFFIX__VERSION_1_0; + +extern CL_API_ENTRY cl_int CL_API_CALL +clGetSupportedImageFormats(cl_context /* context */, + cl_mem_flags /* flags */, + cl_mem_object_type /* image_type */, + cl_uint /* num_entries */, + cl_image_format * /* image_formats */, + cl_uint * /* num_image_formats */) CL_API_SUFFIX__VERSION_1_0; + +extern CL_API_ENTRY cl_int CL_API_CALL +clGetMemObjectInfo(cl_mem /* memobj */, + cl_mem_info /* param_name */, + size_t /* param_value_size */, + void * /* param_value */, + size_t * /* param_value_size_ret */) CL_API_SUFFIX__VERSION_1_0; + +extern CL_API_ENTRY cl_int CL_API_CALL +clGetImageInfo(cl_mem /* image */, + cl_image_info /* param_name */, + size_t /* param_value_size */, + void * /* param_value */, + size_t * /* param_value_size_ret */) CL_API_SUFFIX__VERSION_1_0; + +#ifdef CL_VERSION_2_0 + +extern CL_API_ENTRY cl_int CL_API_CALL +clGetPipeInfo(cl_mem /* pipe */, + cl_pipe_info /* param_name */, + size_t /* param_value_size */, + void * /* param_value */, + size_t * /* param_value_size_ret */) CL_API_SUFFIX__VERSION_2_0; + +#endif + +#ifdef CL_VERSION_1_1 + +extern CL_API_ENTRY cl_int CL_API_CALL +clSetMemObjectDestructorCallback(cl_mem /* memobj */, + void (CL_CALLBACK * /*pfn_notify*/)( cl_mem /* memobj */, void* /*user_data*/), + void * /*user_data */ ) CL_API_SUFFIX__VERSION_1_1; + +#endif + +/* SVM Allocation APIs */ + +#ifdef CL_VERSION_2_0 + +extern CL_API_ENTRY void * CL_API_CALL +clSVMAlloc(cl_context /* context */, + cl_svm_mem_flags /* flags */, + size_t /* size */, + cl_uint /* alignment */) CL_API_SUFFIX__VERSION_2_0; + +extern CL_API_ENTRY void CL_API_CALL +clSVMFree(cl_context /* context */, + void * /* svm_pointer */) CL_API_SUFFIX__VERSION_2_0; + +#endif + +/* Sampler APIs 
*/ + +#ifdef CL_VERSION_2_0 + +extern CL_API_ENTRY cl_sampler CL_API_CALL +clCreateSamplerWithProperties(cl_context /* context */, + const cl_sampler_properties * /* normalized_coords */, + cl_int * /* errcode_ret */) CL_API_SUFFIX__VERSION_2_0; + +#endif + +extern CL_API_ENTRY cl_int CL_API_CALL +clRetainSampler(cl_sampler /* sampler */) CL_API_SUFFIX__VERSION_1_0; + +extern CL_API_ENTRY cl_int CL_API_CALL +clReleaseSampler(cl_sampler /* sampler */) CL_API_SUFFIX__VERSION_1_0; + +extern CL_API_ENTRY cl_int CL_API_CALL +clGetSamplerInfo(cl_sampler /* sampler */, + cl_sampler_info /* param_name */, + size_t /* param_value_size */, + void * /* param_value */, + size_t * /* param_value_size_ret */) CL_API_SUFFIX__VERSION_1_0; + +/* Program Object APIs */ +extern CL_API_ENTRY cl_program CL_API_CALL +clCreateProgramWithSource(cl_context /* context */, + cl_uint /* count */, + const char ** /* strings */, + const size_t * /* lengths */, + cl_int * /* errcode_ret */) CL_API_SUFFIX__VERSION_1_0; + +extern CL_API_ENTRY cl_program CL_API_CALL +clCreateProgramWithBinary(cl_context /* context */, + cl_uint /* num_devices */, + const cl_device_id * /* device_list */, + const size_t * /* lengths */, + const unsigned char ** /* binaries */, + cl_int * /* binary_status */, + cl_int * /* errcode_ret */) CL_API_SUFFIX__VERSION_1_0; + +#ifdef CL_VERSION_1_2 + +extern CL_API_ENTRY cl_program CL_API_CALL +clCreateProgramWithBuiltInKernels(cl_context /* context */, + cl_uint /* num_devices */, + const cl_device_id * /* device_list */, + const char * /* kernel_names */, + cl_int * /* errcode_ret */) CL_API_SUFFIX__VERSION_1_2; + +#endif + +#ifdef CL_VERSION_2_1 + +extern CL_API_ENTRY cl_program CL_API_CALL +clCreateProgramWithIL(cl_context /* context */, + const void* /* il */, + size_t /* length */, + cl_int* /* errcode_ret */) CL_API_SUFFIX__VERSION_2_1; + +#endif + +extern CL_API_ENTRY cl_int CL_API_CALL +clRetainProgram(cl_program /* program */) CL_API_SUFFIX__VERSION_1_0; + +extern 
CL_API_ENTRY cl_int CL_API_CALL +clReleaseProgram(cl_program /* program */) CL_API_SUFFIX__VERSION_1_0; + +extern CL_API_ENTRY cl_int CL_API_CALL +clBuildProgram(cl_program /* program */, + cl_uint /* num_devices */, + const cl_device_id * /* device_list */, + const char * /* options */, + void (CL_CALLBACK * /* pfn_notify */)(cl_program /* program */, void * /* user_data */), + void * /* user_data */) CL_API_SUFFIX__VERSION_1_0; + +#ifdef CL_VERSION_1_2 + +extern CL_API_ENTRY cl_int CL_API_CALL +clCompileProgram(cl_program /* program */, + cl_uint /* num_devices */, + const cl_device_id * /* device_list */, + const char * /* options */, + cl_uint /* num_input_headers */, + const cl_program * /* input_headers */, + const char ** /* header_include_names */, + void (CL_CALLBACK * /* pfn_notify */)(cl_program /* program */, void * /* user_data */), + void * /* user_data */) CL_API_SUFFIX__VERSION_1_2; + +extern CL_API_ENTRY cl_program CL_API_CALL +clLinkProgram(cl_context /* context */, + cl_uint /* num_devices */, + const cl_device_id * /* device_list */, + const char * /* options */, + cl_uint /* num_input_programs */, + const cl_program * /* input_programs */, + void (CL_CALLBACK * /* pfn_notify */)(cl_program /* program */, void * /* user_data */), + void * /* user_data */, + cl_int * /* errcode_ret */ ) CL_API_SUFFIX__VERSION_1_2; + +#endif + +#ifdef CL_VERSION_2_2 + +extern CL_API_ENTRY cl_int CL_API_CALL +clSetProgramReleaseCallback(cl_program /* program */, + void (CL_CALLBACK * /* pfn_notify */)(cl_program /* program */, void * /* user_data */), + void * /* user_data */) CL_API_SUFFIX__VERSION_2_2; + +extern CL_API_ENTRY cl_int CL_API_CALL +clSetProgramSpecializationConstant(cl_program /* program */, + cl_uint /* spec_id */, + size_t /* spec_size */, + const void* /* spec_value */) CL_API_SUFFIX__VERSION_2_2; + +#endif + +#ifdef CL_VERSION_1_2 + +extern CL_API_ENTRY cl_int CL_API_CALL +clUnloadPlatformCompiler(cl_platform_id /* platform */) 
CL_API_SUFFIX__VERSION_1_2; + +#endif + +extern CL_API_ENTRY cl_int CL_API_CALL +clGetProgramInfo(cl_program /* program */, + cl_program_info /* param_name */, + size_t /* param_value_size */, + void * /* param_value */, + size_t * /* param_value_size_ret */) CL_API_SUFFIX__VERSION_1_0; + +extern CL_API_ENTRY cl_int CL_API_CALL +clGetProgramBuildInfo(cl_program /* program */, + cl_device_id /* device */, + cl_program_build_info /* param_name */, + size_t /* param_value_size */, + void * /* param_value */, + size_t * /* param_value_size_ret */) CL_API_SUFFIX__VERSION_1_0; + +/* Kernel Object APIs */ +extern CL_API_ENTRY cl_kernel CL_API_CALL +clCreateKernel(cl_program /* program */, + const char * /* kernel_name */, + cl_int * /* errcode_ret */) CL_API_SUFFIX__VERSION_1_0; + +extern CL_API_ENTRY cl_int CL_API_CALL +clCreateKernelsInProgram(cl_program /* program */, + cl_uint /* num_kernels */, + cl_kernel * /* kernels */, + cl_uint * /* num_kernels_ret */) CL_API_SUFFIX__VERSION_1_0; + +#ifdef CL_VERSION_2_1 + +extern CL_API_ENTRY cl_kernel CL_API_CALL +clCloneKernel(cl_kernel /* source_kernel */, + cl_int* /* errcode_ret */) CL_API_SUFFIX__VERSION_2_1; + +#endif + +extern CL_API_ENTRY cl_int CL_API_CALL +clRetainKernel(cl_kernel /* kernel */) CL_API_SUFFIX__VERSION_1_0; + +extern CL_API_ENTRY cl_int CL_API_CALL +clReleaseKernel(cl_kernel /* kernel */) CL_API_SUFFIX__VERSION_1_0; + +extern CL_API_ENTRY cl_int CL_API_CALL +clSetKernelArg(cl_kernel /* kernel */, + cl_uint /* arg_index */, + size_t /* arg_size */, + const void * /* arg_value */) CL_API_SUFFIX__VERSION_1_0; + +#ifdef CL_VERSION_2_0 + +extern CL_API_ENTRY cl_int CL_API_CALL +clSetKernelArgSVMPointer(cl_kernel /* kernel */, + cl_uint /* arg_index */, + const void * /* arg_value */) CL_API_SUFFIX__VERSION_2_0; + +extern CL_API_ENTRY cl_int CL_API_CALL +clSetKernelExecInfo(cl_kernel /* kernel */, + cl_kernel_exec_info /* param_name */, + size_t /* param_value_size */, + const void * /* param_value */) 
CL_API_SUFFIX__VERSION_2_0; + +#endif + +extern CL_API_ENTRY cl_int CL_API_CALL +clGetKernelInfo(cl_kernel /* kernel */, + cl_kernel_info /* param_name */, + size_t /* param_value_size */, + void * /* param_value */, + size_t * /* param_value_size_ret */) CL_API_SUFFIX__VERSION_1_0; + +#ifdef CL_VERSION_1_2 + +extern CL_API_ENTRY cl_int CL_API_CALL +clGetKernelArgInfo(cl_kernel /* kernel */, + cl_uint /* arg_indx */, + cl_kernel_arg_info /* param_name */, + size_t /* param_value_size */, + void * /* param_value */, + size_t * /* param_value_size_ret */) CL_API_SUFFIX__VERSION_1_2; + +#endif + +extern CL_API_ENTRY cl_int CL_API_CALL +clGetKernelWorkGroupInfo(cl_kernel /* kernel */, + cl_device_id /* device */, + cl_kernel_work_group_info /* param_name */, + size_t /* param_value_size */, + void * /* param_value */, + size_t * /* param_value_size_ret */) CL_API_SUFFIX__VERSION_1_0; + +#ifdef CL_VERSION_2_1 + +extern CL_API_ENTRY cl_int CL_API_CALL +clGetKernelSubGroupInfo(cl_kernel /* kernel */, + cl_device_id /* device */, + cl_kernel_sub_group_info /* param_name */, + size_t /* input_value_size */, + const void* /*input_value */, + size_t /* param_value_size */, + void* /* param_value */, + size_t* /* param_value_size_ret */ ) CL_API_SUFFIX__VERSION_2_1; + +#endif + +/* Event Object APIs */ +extern CL_API_ENTRY cl_int CL_API_CALL +clWaitForEvents(cl_uint /* num_events */, + const cl_event * /* event_list */) CL_API_SUFFIX__VERSION_1_0; + +extern CL_API_ENTRY cl_int CL_API_CALL +clGetEventInfo(cl_event /* event */, + cl_event_info /* param_name */, + size_t /* param_value_size */, + void * /* param_value */, + size_t * /* param_value_size_ret */) CL_API_SUFFIX__VERSION_1_0; + +#ifdef CL_VERSION_1_1 + +extern CL_API_ENTRY cl_event CL_API_CALL +clCreateUserEvent(cl_context /* context */, + cl_int * /* errcode_ret */) CL_API_SUFFIX__VERSION_1_1; + +#endif + +extern CL_API_ENTRY cl_int CL_API_CALL +clRetainEvent(cl_event /* event */) CL_API_SUFFIX__VERSION_1_0; + 
+extern CL_API_ENTRY cl_int CL_API_CALL +clReleaseEvent(cl_event /* event */) CL_API_SUFFIX__VERSION_1_0; + +#ifdef CL_VERSION_1_1 + +extern CL_API_ENTRY cl_int CL_API_CALL +clSetUserEventStatus(cl_event /* event */, + cl_int /* execution_status */) CL_API_SUFFIX__VERSION_1_1; + +extern CL_API_ENTRY cl_int CL_API_CALL +clSetEventCallback( cl_event /* event */, + cl_int /* command_exec_callback_type */, + void (CL_CALLBACK * /* pfn_notify */)(cl_event, cl_int, void *), + void * /* user_data */) CL_API_SUFFIX__VERSION_1_1; + +#endif + +/* Profiling APIs */ +extern CL_API_ENTRY cl_int CL_API_CALL +clGetEventProfilingInfo(cl_event /* event */, + cl_profiling_info /* param_name */, + size_t /* param_value_size */, + void * /* param_value */, + size_t * /* param_value_size_ret */) CL_API_SUFFIX__VERSION_1_0; + +/* Flush and Finish APIs */ +extern CL_API_ENTRY cl_int CL_API_CALL +clFlush(cl_command_queue /* command_queue */) CL_API_SUFFIX__VERSION_1_0; + +extern CL_API_ENTRY cl_int CL_API_CALL +clFinish(cl_command_queue /* command_queue */) CL_API_SUFFIX__VERSION_1_0; + +/* Enqueued Commands APIs */ +extern CL_API_ENTRY cl_int CL_API_CALL +clEnqueueReadBuffer(cl_command_queue /* command_queue */, + cl_mem /* buffer */, + cl_bool /* blocking_read */, + size_t /* offset */, + size_t /* size */, + void * /* ptr */, + cl_uint /* num_events_in_wait_list */, + const cl_event * /* event_wait_list */, + cl_event * /* event */) CL_API_SUFFIX__VERSION_1_0; + +#ifdef CL_VERSION_1_1 + +extern CL_API_ENTRY cl_int CL_API_CALL +clEnqueueReadBufferRect(cl_command_queue /* command_queue */, + cl_mem /* buffer */, + cl_bool /* blocking_read */, + const size_t * /* buffer_offset */, + const size_t * /* host_offset */, + const size_t * /* region */, + size_t /* buffer_row_pitch */, + size_t /* buffer_slice_pitch */, + size_t /* host_row_pitch */, + size_t /* host_slice_pitch */, + void * /* ptr */, + cl_uint /* num_events_in_wait_list */, + const cl_event * /* event_wait_list */, + cl_event 
* /* event */) CL_API_SUFFIX__VERSION_1_1; + +#endif + +extern CL_API_ENTRY cl_int CL_API_CALL +clEnqueueWriteBuffer(cl_command_queue /* command_queue */, + cl_mem /* buffer */, + cl_bool /* blocking_write */, + size_t /* offset */, + size_t /* size */, + const void * /* ptr */, + cl_uint /* num_events_in_wait_list */, + const cl_event * /* event_wait_list */, + cl_event * /* event */) CL_API_SUFFIX__VERSION_1_0; + +#ifdef CL_VERSION_1_1 + +extern CL_API_ENTRY cl_int CL_API_CALL +clEnqueueWriteBufferRect(cl_command_queue /* command_queue */, + cl_mem /* buffer */, + cl_bool /* blocking_write */, + const size_t * /* buffer_offset */, + const size_t * /* host_offset */, + const size_t * /* region */, + size_t /* buffer_row_pitch */, + size_t /* buffer_slice_pitch */, + size_t /* host_row_pitch */, + size_t /* host_slice_pitch */, + const void * /* ptr */, + cl_uint /* num_events_in_wait_list */, + const cl_event * /* event_wait_list */, + cl_event * /* event */) CL_API_SUFFIX__VERSION_1_1; + +#endif + +#ifdef CL_VERSION_1_2 + +extern CL_API_ENTRY cl_int CL_API_CALL +clEnqueueFillBuffer(cl_command_queue /* command_queue */, + cl_mem /* buffer */, + const void * /* pattern */, + size_t /* pattern_size */, + size_t /* offset */, + size_t /* size */, + cl_uint /* num_events_in_wait_list */, + const cl_event * /* event_wait_list */, + cl_event * /* event */) CL_API_SUFFIX__VERSION_1_2; + +#endif + +extern CL_API_ENTRY cl_int CL_API_CALL +clEnqueueCopyBuffer(cl_command_queue /* command_queue */, + cl_mem /* src_buffer */, + cl_mem /* dst_buffer */, + size_t /* src_offset */, + size_t /* dst_offset */, + size_t /* size */, + cl_uint /* num_events_in_wait_list */, + const cl_event * /* event_wait_list */, + cl_event * /* event */) CL_API_SUFFIX__VERSION_1_0; + +#ifdef CL_VERSION_1_1 + +extern CL_API_ENTRY cl_int CL_API_CALL +clEnqueueCopyBufferRect(cl_command_queue /* command_queue */, + cl_mem /* src_buffer */, + cl_mem /* dst_buffer */, + const size_t * /* src_origin */, + 
const size_t * /* dst_origin */, + const size_t * /* region */, + size_t /* src_row_pitch */, + size_t /* src_slice_pitch */, + size_t /* dst_row_pitch */, + size_t /* dst_slice_pitch */, + cl_uint /* num_events_in_wait_list */, + const cl_event * /* event_wait_list */, + cl_event * /* event */) CL_API_SUFFIX__VERSION_1_1; + +#endif + +extern CL_API_ENTRY cl_int CL_API_CALL +clEnqueueReadImage(cl_command_queue /* command_queue */, + cl_mem /* image */, + cl_bool /* blocking_read */, + const size_t * /* origin[3] */, + const size_t * /* region[3] */, + size_t /* row_pitch */, + size_t /* slice_pitch */, + void * /* ptr */, + cl_uint /* num_events_in_wait_list */, + const cl_event * /* event_wait_list */, + cl_event * /* event */) CL_API_SUFFIX__VERSION_1_0; + +extern CL_API_ENTRY cl_int CL_API_CALL +clEnqueueWriteImage(cl_command_queue /* command_queue */, + cl_mem /* image */, + cl_bool /* blocking_write */, + const size_t * /* origin[3] */, + const size_t * /* region[3] */, + size_t /* input_row_pitch */, + size_t /* input_slice_pitch */, + const void * /* ptr */, + cl_uint /* num_events_in_wait_list */, + const cl_event * /* event_wait_list */, + cl_event * /* event */) CL_API_SUFFIX__VERSION_1_0; + +#ifdef CL_VERSION_1_2 + +extern CL_API_ENTRY cl_int CL_API_CALL +clEnqueueFillImage(cl_command_queue /* command_queue */, + cl_mem /* image */, + const void * /* fill_color */, + const size_t * /* origin[3] */, + const size_t * /* region[3] */, + cl_uint /* num_events_in_wait_list */, + const cl_event * /* event_wait_list */, + cl_event * /* event */) CL_API_SUFFIX__VERSION_1_2; + +#endif + +extern CL_API_ENTRY cl_int CL_API_CALL +clEnqueueCopyImage(cl_command_queue /* command_queue */, + cl_mem /* src_image */, + cl_mem /* dst_image */, + const size_t * /* src_origin[3] */, + const size_t * /* dst_origin[3] */, + const size_t * /* region[3] */, + cl_uint /* num_events_in_wait_list */, + const cl_event * /* event_wait_list */, + cl_event * /* event */) 
CL_API_SUFFIX__VERSION_1_0; + +extern CL_API_ENTRY cl_int CL_API_CALL +clEnqueueCopyImageToBuffer(cl_command_queue /* command_queue */, + cl_mem /* src_image */, + cl_mem /* dst_buffer */, + const size_t * /* src_origin[3] */, + const size_t * /* region[3] */, + size_t /* dst_offset */, + cl_uint /* num_events_in_wait_list */, + const cl_event * /* event_wait_list */, + cl_event * /* event */) CL_API_SUFFIX__VERSION_1_0; + +extern CL_API_ENTRY cl_int CL_API_CALL +clEnqueueCopyBufferToImage(cl_command_queue /* command_queue */, + cl_mem /* src_buffer */, + cl_mem /* dst_image */, + size_t /* src_offset */, + const size_t * /* dst_origin[3] */, + const size_t * /* region[3] */, + cl_uint /* num_events_in_wait_list */, + const cl_event * /* event_wait_list */, + cl_event * /* event */) CL_API_SUFFIX__VERSION_1_0; + +extern CL_API_ENTRY void * CL_API_CALL +clEnqueueMapBuffer(cl_command_queue /* command_queue */, + cl_mem /* buffer */, + cl_bool /* blocking_map */, + cl_map_flags /* map_flags */, + size_t /* offset */, + size_t /* size */, + cl_uint /* num_events_in_wait_list */, + const cl_event * /* event_wait_list */, + cl_event * /* event */, + cl_int * /* errcode_ret */) CL_API_SUFFIX__VERSION_1_0; + +extern CL_API_ENTRY void * CL_API_CALL +clEnqueueMapImage(cl_command_queue /* command_queue */, + cl_mem /* image */, + cl_bool /* blocking_map */, + cl_map_flags /* map_flags */, + const size_t * /* origin[3] */, + const size_t * /* region[3] */, + size_t * /* image_row_pitch */, + size_t * /* image_slice_pitch */, + cl_uint /* num_events_in_wait_list */, + const cl_event * /* event_wait_list */, + cl_event * /* event */, + cl_int * /* errcode_ret */) CL_API_SUFFIX__VERSION_1_0; + +extern CL_API_ENTRY cl_int CL_API_CALL +clEnqueueUnmapMemObject(cl_command_queue /* command_queue */, + cl_mem /* memobj */, + void * /* mapped_ptr */, + cl_uint /* num_events_in_wait_list */, + const cl_event * /* event_wait_list */, + cl_event * /* event */) CL_API_SUFFIX__VERSION_1_0; + 
+#ifdef CL_VERSION_1_2 + +extern CL_API_ENTRY cl_int CL_API_CALL +clEnqueueMigrateMemObjects(cl_command_queue /* command_queue */, + cl_uint /* num_mem_objects */, + const cl_mem * /* mem_objects */, + cl_mem_migration_flags /* flags */, + cl_uint /* num_events_in_wait_list */, + const cl_event * /* event_wait_list */, + cl_event * /* event */) CL_API_SUFFIX__VERSION_1_2; + +#endif + +extern CL_API_ENTRY cl_int CL_API_CALL +clEnqueueNDRangeKernel(cl_command_queue /* command_queue */, + cl_kernel /* kernel */, + cl_uint /* work_dim */, + const size_t * /* global_work_offset */, + const size_t * /* global_work_size */, + const size_t * /* local_work_size */, + cl_uint /* num_events_in_wait_list */, + const cl_event * /* event_wait_list */, + cl_event * /* event */) CL_API_SUFFIX__VERSION_1_0; + +extern CL_API_ENTRY cl_int CL_API_CALL +clEnqueueNativeKernel(cl_command_queue /* command_queue */, + void (CL_CALLBACK * /*user_func*/)(void *), + void * /* args */, + size_t /* cb_args */, + cl_uint /* num_mem_objects */, + const cl_mem * /* mem_list */, + const void ** /* args_mem_loc */, + cl_uint /* num_events_in_wait_list */, + const cl_event * /* event_wait_list */, + cl_event * /* event */) CL_API_SUFFIX__VERSION_1_0; + +#ifdef CL_VERSION_1_2 + +extern CL_API_ENTRY cl_int CL_API_CALL +clEnqueueMarkerWithWaitList(cl_command_queue /* command_queue */, + cl_uint /* num_events_in_wait_list */, + const cl_event * /* event_wait_list */, + cl_event * /* event */) CL_API_SUFFIX__VERSION_1_2; + +extern CL_API_ENTRY cl_int CL_API_CALL +clEnqueueBarrierWithWaitList(cl_command_queue /* command_queue */, + cl_uint /* num_events_in_wait_list */, + const cl_event * /* event_wait_list */, + cl_event * /* event */) CL_API_SUFFIX__VERSION_1_2; + +#endif + +#ifdef CL_VERSION_2_0 + +extern CL_API_ENTRY cl_int CL_API_CALL +clEnqueueSVMFree(cl_command_queue /* command_queue */, + cl_uint /* num_svm_pointers */, + void *[] /* svm_pointers[] */, + void (CL_CALLBACK * 
/*pfn_free_func*/)(cl_command_queue /* queue */, + cl_uint /* num_svm_pointers */, + void *[] /* svm_pointers[] */, + void * /* user_data */), + void * /* user_data */, + cl_uint /* num_events_in_wait_list */, + const cl_event * /* event_wait_list */, + cl_event * /* event */) CL_API_SUFFIX__VERSION_2_0; + +extern CL_API_ENTRY cl_int CL_API_CALL +clEnqueueSVMMemcpy(cl_command_queue /* command_queue */, + cl_bool /* blocking_copy */, + void * /* dst_ptr */, + const void * /* src_ptr */, + size_t /* size */, + cl_uint /* num_events_in_wait_list */, + const cl_event * /* event_wait_list */, + cl_event * /* event */) CL_API_SUFFIX__VERSION_2_0; + +extern CL_API_ENTRY cl_int CL_API_CALL +clEnqueueSVMMemFill(cl_command_queue /* command_queue */, + void * /* svm_ptr */, + const void * /* pattern */, + size_t /* pattern_size */, + size_t /* size */, + cl_uint /* num_events_in_wait_list */, + const cl_event * /* event_wait_list */, + cl_event * /* event */) CL_API_SUFFIX__VERSION_2_0; + +extern CL_API_ENTRY cl_int CL_API_CALL +clEnqueueSVMMap(cl_command_queue /* command_queue */, + cl_bool /* blocking_map */, + cl_map_flags /* flags */, + void * /* svm_ptr */, + size_t /* size */, + cl_uint /* num_events_in_wait_list */, + const cl_event * /* event_wait_list */, + cl_event * /* event */) CL_API_SUFFIX__VERSION_2_0; + +extern CL_API_ENTRY cl_int CL_API_CALL +clEnqueueSVMUnmap(cl_command_queue /* command_queue */, + void * /* svm_ptr */, + cl_uint /* num_events_in_wait_list */, + const cl_event * /* event_wait_list */, + cl_event * /* event */) CL_API_SUFFIX__VERSION_2_0; + +#endif + +#ifdef CL_VERSION_2_1 + +extern CL_API_ENTRY cl_int CL_API_CALL +clEnqueueSVMMigrateMem(cl_command_queue /* command_queue */, + cl_uint /* num_svm_pointers */, + const void ** /* svm_pointers */, + const size_t * /* sizes */, + cl_mem_migration_flags /* flags */, + cl_uint /* num_events_in_wait_list */, + const cl_event * /* event_wait_list */, + cl_event * /* event */) 
CL_API_SUFFIX__VERSION_2_1; + +#endif + +#ifdef CL_VERSION_1_2 + +/* Extension function access + * + * Returns the extension function address for the given function name, + * or NULL if a valid function can not be found. The client must + * check to make sure the address is not NULL, before using or + * calling the returned function address. + */ +extern CL_API_ENTRY void * CL_API_CALL +clGetExtensionFunctionAddressForPlatform(cl_platform_id /* platform */, + const char * /* func_name */) CL_API_SUFFIX__VERSION_1_2; + +#endif + +#ifdef CL_USE_DEPRECATED_OPENCL_1_0_APIS + /* + * WARNING: + * This API introduces mutable state into the OpenCL implementation. It has been REMOVED + * to better facilitate thread safety. The 1.0 API is not thread safe. It is not tested by the + * OpenCL 1.1 conformance test, and consequently may not work or may not work dependably. + * It is likely to be non-performant. Use of this API is not advised. Use at your own risk. + * + * Software developers previously relying on this API are instructed to set the command queue + * properties when creating the queue, instead. 
+ */ + extern CL_API_ENTRY cl_int CL_API_CALL + clSetCommandQueueProperty(cl_command_queue /* command_queue */, + cl_command_queue_properties /* properties */, + cl_bool /* enable */, + cl_command_queue_properties * /* old_properties */) CL_EXT_SUFFIX__VERSION_1_0_DEPRECATED; +#endif /* CL_USE_DEPRECATED_OPENCL_1_0_APIS */ + +/* Deprecated OpenCL 1.1 APIs */ +extern CL_API_ENTRY CL_EXT_PREFIX__VERSION_1_1_DEPRECATED cl_mem CL_API_CALL +clCreateImage2D(cl_context /* context */, + cl_mem_flags /* flags */, + const cl_image_format * /* image_format */, + size_t /* image_width */, + size_t /* image_height */, + size_t /* image_row_pitch */, + void * /* host_ptr */, + cl_int * /* errcode_ret */) CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED; + +extern CL_API_ENTRY CL_EXT_PREFIX__VERSION_1_1_DEPRECATED cl_mem CL_API_CALL +clCreateImage3D(cl_context /* context */, + cl_mem_flags /* flags */, + const cl_image_format * /* image_format */, + size_t /* image_width */, + size_t /* image_height */, + size_t /* image_depth */, + size_t /* image_row_pitch */, + size_t /* image_slice_pitch */, + void * /* host_ptr */, + cl_int * /* errcode_ret */) CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED; + +extern CL_API_ENTRY CL_EXT_PREFIX__VERSION_1_1_DEPRECATED cl_int CL_API_CALL +clEnqueueMarker(cl_command_queue /* command_queue */, + cl_event * /* event */) CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED; + +extern CL_API_ENTRY CL_EXT_PREFIX__VERSION_1_1_DEPRECATED cl_int CL_API_CALL +clEnqueueWaitForEvents(cl_command_queue /* command_queue */, + cl_uint /* num_events */, + const cl_event * /* event_list */) CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED; + +extern CL_API_ENTRY CL_EXT_PREFIX__VERSION_1_1_DEPRECATED cl_int CL_API_CALL +clEnqueueBarrier(cl_command_queue /* command_queue */) CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED; + +extern CL_API_ENTRY CL_EXT_PREFIX__VERSION_1_1_DEPRECATED cl_int CL_API_CALL +clUnloadCompiler(void) CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED; + +extern CL_API_ENTRY 
CL_EXT_PREFIX__VERSION_1_1_DEPRECATED void * CL_API_CALL +clGetExtensionFunctionAddress(const char * /* func_name */) CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED; + +/* Deprecated OpenCL 2.0 APIs */ +extern CL_API_ENTRY CL_EXT_PREFIX__VERSION_1_2_DEPRECATED cl_command_queue CL_API_CALL +clCreateCommandQueue(cl_context /* context */, + cl_device_id /* device */, + cl_command_queue_properties /* properties */, + cl_int * /* errcode_ret */) CL_EXT_SUFFIX__VERSION_1_2_DEPRECATED; + +extern CL_API_ENTRY CL_EXT_PREFIX__VERSION_1_2_DEPRECATED cl_sampler CL_API_CALL +clCreateSampler(cl_context /* context */, + cl_bool /* normalized_coords */, + cl_addressing_mode /* addressing_mode */, + cl_filter_mode /* filter_mode */, + cl_int * /* errcode_ret */) CL_EXT_SUFFIX__VERSION_1_2_DEPRECATED; + +extern CL_API_ENTRY CL_EXT_PREFIX__VERSION_1_2_DEPRECATED cl_int CL_API_CALL +clEnqueueTask(cl_command_queue /* command_queue */, + cl_kernel /* kernel */, + cl_uint /* num_events_in_wait_list */, + const cl_event * /* event_wait_list */, + cl_event * /* event */) CL_EXT_SUFFIX__VERSION_1_2_DEPRECATED; + +#ifdef __cplusplus +} +#endif + +#endif /* __OPENCL_CL_H */ + diff --git a/third_party/opencl/OpenCL-Headers/CL/cl_d3d10.h b/third_party/opencl/OpenCL-Headers/CL/cl_d3d10.h new file mode 100644 index 0000000000000000000000000000000000000000..d5960a43f72123bdd693da50d3ad9a3a82cd032c --- /dev/null +++ b/third_party/opencl/OpenCL-Headers/CL/cl_d3d10.h @@ -0,0 +1,131 @@ +/********************************************************************************** + * Copyright (c) 2008-2015 The Khronos Group Inc. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and/or associated documentation files (the + * "Materials"), to deal in the Materials without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sublicense, and/or sell copies of the Materials, and to + * permit persons to whom the Materials are furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be included + * in all copies or substantial portions of the Materials. + * + * MODIFICATIONS TO THIS FILE MAY MEAN IT NO LONGER ACCURATELY REFLECTS + * KHRONOS STANDARDS. THE UNMODIFIED, NORMATIVE VERSIONS OF KHRONOS + * SPECIFICATIONS AND HEADER INFORMATION ARE LOCATED AT + * https://www.khronos.org/registry/ + * + * THE MATERIALS ARE PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY + * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * MATERIALS OR THE USE OR OTHER DEALINGS IN THE MATERIALS. 
+ **********************************************************************************/ + +/* $Revision: 11708 $ on $Date: 2010-06-13 23:36:24 -0700 (Sun, 13 Jun 2010) $ */ + +#ifndef __OPENCL_CL_D3D10_H +#define __OPENCL_CL_D3D10_H + +#include +#include +#include + +#ifdef __cplusplus +extern "C" { +#endif + +/****************************************************************************** + * cl_khr_d3d10_sharing */ +#define cl_khr_d3d10_sharing 1 + +typedef cl_uint cl_d3d10_device_source_khr; +typedef cl_uint cl_d3d10_device_set_khr; + +/******************************************************************************/ + +/* Error Codes */ +#define CL_INVALID_D3D10_DEVICE_KHR -1002 +#define CL_INVALID_D3D10_RESOURCE_KHR -1003 +#define CL_D3D10_RESOURCE_ALREADY_ACQUIRED_KHR -1004 +#define CL_D3D10_RESOURCE_NOT_ACQUIRED_KHR -1005 + +/* cl_d3d10_device_source_nv */ +#define CL_D3D10_DEVICE_KHR 0x4010 +#define CL_D3D10_DXGI_ADAPTER_KHR 0x4011 + +/* cl_d3d10_device_set_nv */ +#define CL_PREFERRED_DEVICES_FOR_D3D10_KHR 0x4012 +#define CL_ALL_DEVICES_FOR_D3D10_KHR 0x4013 + +/* cl_context_info */ +#define CL_CONTEXT_D3D10_DEVICE_KHR 0x4014 +#define CL_CONTEXT_D3D10_PREFER_SHARED_RESOURCES_KHR 0x402C + +/* cl_mem_info */ +#define CL_MEM_D3D10_RESOURCE_KHR 0x4015 + +/* cl_image_info */ +#define CL_IMAGE_D3D10_SUBRESOURCE_KHR 0x4016 + +/* cl_command_type */ +#define CL_COMMAND_ACQUIRE_D3D10_OBJECTS_KHR 0x4017 +#define CL_COMMAND_RELEASE_D3D10_OBJECTS_KHR 0x4018 + +/******************************************************************************/ + +typedef CL_API_ENTRY cl_int (CL_API_CALL *clGetDeviceIDsFromD3D10KHR_fn)( + cl_platform_id platform, + cl_d3d10_device_source_khr d3d_device_source, + void * d3d_object, + cl_d3d10_device_set_khr d3d_device_set, + cl_uint num_entries, + cl_device_id * devices, + cl_uint * num_devices) CL_API_SUFFIX__VERSION_1_0; + +typedef CL_API_ENTRY cl_mem (CL_API_CALL *clCreateFromD3D10BufferKHR_fn)( + cl_context context, + cl_mem_flags flags, + 
ID3D10Buffer * resource, + cl_int * errcode_ret) CL_API_SUFFIX__VERSION_1_0; + +typedef CL_API_ENTRY cl_mem (CL_API_CALL *clCreateFromD3D10Texture2DKHR_fn)( + cl_context context, + cl_mem_flags flags, + ID3D10Texture2D * resource, + UINT subresource, + cl_int * errcode_ret) CL_API_SUFFIX__VERSION_1_0; + +typedef CL_API_ENTRY cl_mem (CL_API_CALL *clCreateFromD3D10Texture3DKHR_fn)( + cl_context context, + cl_mem_flags flags, + ID3D10Texture3D * resource, + UINT subresource, + cl_int * errcode_ret) CL_API_SUFFIX__VERSION_1_0; + +typedef CL_API_ENTRY cl_int (CL_API_CALL *clEnqueueAcquireD3D10ObjectsKHR_fn)( + cl_command_queue command_queue, + cl_uint num_objects, + const cl_mem * mem_objects, + cl_uint num_events_in_wait_list, + const cl_event * event_wait_list, + cl_event * event) CL_API_SUFFIX__VERSION_1_0; + +typedef CL_API_ENTRY cl_int (CL_API_CALL *clEnqueueReleaseD3D10ObjectsKHR_fn)( + cl_command_queue command_queue, + cl_uint num_objects, + const cl_mem * mem_objects, + cl_uint num_events_in_wait_list, + const cl_event * event_wait_list, + cl_event * event) CL_API_SUFFIX__VERSION_1_0; + +#ifdef __cplusplus +} +#endif + +#endif /* __OPENCL_CL_D3D10_H */ + diff --git a/third_party/opencl/OpenCL-Headers/CL/cl_d3d11.h b/third_party/opencl/OpenCL-Headers/CL/cl_d3d11.h new file mode 100644 index 0000000000000000000000000000000000000000..39f9072398a29ab0c5a91f3a08b8c75034e8ac17 --- /dev/null +++ b/third_party/opencl/OpenCL-Headers/CL/cl_d3d11.h @@ -0,0 +1,131 @@ +/********************************************************************************** + * Copyright (c) 2008-2015 The Khronos Group Inc. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and/or associated documentation files (the + * "Materials"), to deal in the Materials without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sublicense, and/or sell copies of the Materials, and to + * permit persons to whom the Materials are furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be included + * in all copies or substantial portions of the Materials. + * + * MODIFICATIONS TO THIS FILE MAY MEAN IT NO LONGER ACCURATELY REFLECTS + * KHRONOS STANDARDS. THE UNMODIFIED, NORMATIVE VERSIONS OF KHRONOS + * SPECIFICATIONS AND HEADER INFORMATION ARE LOCATED AT + * https://www.khronos.org/registry/ + * + * THE MATERIALS ARE PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY + * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * MATERIALS OR THE USE OR OTHER DEALINGS IN THE MATERIALS. 
+ **********************************************************************************/ + +/* $Revision: 11708 $ on $Date: 2010-06-13 23:36:24 -0700 (Sun, 13 Jun 2010) $ */ + +#ifndef __OPENCL_CL_D3D11_H +#define __OPENCL_CL_D3D11_H + +#include +#include +#include + +#ifdef __cplusplus +extern "C" { +#endif + +/****************************************************************************** + * cl_khr_d3d11_sharing */ +#define cl_khr_d3d11_sharing 1 + +typedef cl_uint cl_d3d11_device_source_khr; +typedef cl_uint cl_d3d11_device_set_khr; + +/******************************************************************************/ + +/* Error Codes */ +#define CL_INVALID_D3D11_DEVICE_KHR -1006 +#define CL_INVALID_D3D11_RESOURCE_KHR -1007 +#define CL_D3D11_RESOURCE_ALREADY_ACQUIRED_KHR -1008 +#define CL_D3D11_RESOURCE_NOT_ACQUIRED_KHR -1009 + +/* cl_d3d11_device_source */ +#define CL_D3D11_DEVICE_KHR 0x4019 +#define CL_D3D11_DXGI_ADAPTER_KHR 0x401A + +/* cl_d3d11_device_set */ +#define CL_PREFERRED_DEVICES_FOR_D3D11_KHR 0x401B +#define CL_ALL_DEVICES_FOR_D3D11_KHR 0x401C + +/* cl_context_info */ +#define CL_CONTEXT_D3D11_DEVICE_KHR 0x401D +#define CL_CONTEXT_D3D11_PREFER_SHARED_RESOURCES_KHR 0x402D + +/* cl_mem_info */ +#define CL_MEM_D3D11_RESOURCE_KHR 0x401E + +/* cl_image_info */ +#define CL_IMAGE_D3D11_SUBRESOURCE_KHR 0x401F + +/* cl_command_type */ +#define CL_COMMAND_ACQUIRE_D3D11_OBJECTS_KHR 0x4020 +#define CL_COMMAND_RELEASE_D3D11_OBJECTS_KHR 0x4021 + +/******************************************************************************/ + +typedef CL_API_ENTRY cl_int (CL_API_CALL *clGetDeviceIDsFromD3D11KHR_fn)( + cl_platform_id platform, + cl_d3d11_device_source_khr d3d_device_source, + void * d3d_object, + cl_d3d11_device_set_khr d3d_device_set, + cl_uint num_entries, + cl_device_id * devices, + cl_uint * num_devices) CL_API_SUFFIX__VERSION_1_2; + +typedef CL_API_ENTRY cl_mem (CL_API_CALL *clCreateFromD3D11BufferKHR_fn)( + cl_context context, + cl_mem_flags flags, + 
ID3D11Buffer * resource, + cl_int * errcode_ret) CL_API_SUFFIX__VERSION_1_2; + +typedef CL_API_ENTRY cl_mem (CL_API_CALL *clCreateFromD3D11Texture2DKHR_fn)( + cl_context context, + cl_mem_flags flags, + ID3D11Texture2D * resource, + UINT subresource, + cl_int * errcode_ret) CL_API_SUFFIX__VERSION_1_2; + +typedef CL_API_ENTRY cl_mem (CL_API_CALL *clCreateFromD3D11Texture3DKHR_fn)( + cl_context context, + cl_mem_flags flags, + ID3D11Texture3D * resource, + UINT subresource, + cl_int * errcode_ret) CL_API_SUFFIX__VERSION_1_2; + +typedef CL_API_ENTRY cl_int (CL_API_CALL *clEnqueueAcquireD3D11ObjectsKHR_fn)( + cl_command_queue command_queue, + cl_uint num_objects, + const cl_mem * mem_objects, + cl_uint num_events_in_wait_list, + const cl_event * event_wait_list, + cl_event * event) CL_API_SUFFIX__VERSION_1_2; + +typedef CL_API_ENTRY cl_int (CL_API_CALL *clEnqueueReleaseD3D11ObjectsKHR_fn)( + cl_command_queue command_queue, + cl_uint num_objects, + const cl_mem * mem_objects, + cl_uint num_events_in_wait_list, + const cl_event * event_wait_list, + cl_event * event) CL_API_SUFFIX__VERSION_1_2; + +#ifdef __cplusplus +} +#endif + +#endif /* __OPENCL_CL_D3D11_H */ + diff --git a/third_party/opencl/OpenCL-Headers/CL/cl_dx9_media_sharing.h b/third_party/opencl/OpenCL-Headers/CL/cl_dx9_media_sharing.h new file mode 100644 index 0000000000000000000000000000000000000000..2729e8b9e89a10dc410863140a904ee67250950d --- /dev/null +++ b/third_party/opencl/OpenCL-Headers/CL/cl_dx9_media_sharing.h @@ -0,0 +1,132 @@ +/********************************************************************************** + * Copyright (c) 2008-2015 The Khronos Group Inc. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and/or associated documentation files (the + * "Materials"), to deal in the Materials without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sublicense, and/or sell copies of the Materials, and to + * permit persons to whom the Materials are furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be included + * in all copies or substantial portions of the Materials. + * + * MODIFICATIONS TO THIS FILE MAY MEAN IT NO LONGER ACCURATELY REFLECTS + * KHRONOS STANDARDS. THE UNMODIFIED, NORMATIVE VERSIONS OF KHRONOS + * SPECIFICATIONS AND HEADER INFORMATION ARE LOCATED AT + * https://www.khronos.org/registry/ + * + * THE MATERIALS ARE PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY + * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * MATERIALS OR THE USE OR OTHER DEALINGS IN THE MATERIALS. 
+ **********************************************************************************/ + +/* $Revision: 11708 $ on $Date: 2010-06-13 23:36:24 -0700 (Sun, 13 Jun 2010) $ */ + +#ifndef __OPENCL_CL_DX9_MEDIA_SHARING_H +#define __OPENCL_CL_DX9_MEDIA_SHARING_H + +#include +#include + +#ifdef __cplusplus +extern "C" { +#endif + +/******************************************************************************/ +/* cl_khr_dx9_media_sharing */ +#define cl_khr_dx9_media_sharing 1 + +typedef cl_uint cl_dx9_media_adapter_type_khr; +typedef cl_uint cl_dx9_media_adapter_set_khr; + +#if defined(_WIN32) +#include +typedef struct _cl_dx9_surface_info_khr +{ + IDirect3DSurface9 *resource; + HANDLE shared_handle; +} cl_dx9_surface_info_khr; +#endif + + +/******************************************************************************/ + +/* Error Codes */ +#define CL_INVALID_DX9_MEDIA_ADAPTER_KHR -1010 +#define CL_INVALID_DX9_MEDIA_SURFACE_KHR -1011 +#define CL_DX9_MEDIA_SURFACE_ALREADY_ACQUIRED_KHR -1012 +#define CL_DX9_MEDIA_SURFACE_NOT_ACQUIRED_KHR -1013 + +/* cl_media_adapter_type_khr */ +#define CL_ADAPTER_D3D9_KHR 0x2020 +#define CL_ADAPTER_D3D9EX_KHR 0x2021 +#define CL_ADAPTER_DXVA_KHR 0x2022 + +/* cl_media_adapter_set_khr */ +#define CL_PREFERRED_DEVICES_FOR_DX9_MEDIA_ADAPTER_KHR 0x2023 +#define CL_ALL_DEVICES_FOR_DX9_MEDIA_ADAPTER_KHR 0x2024 + +/* cl_context_info */ +#define CL_CONTEXT_ADAPTER_D3D9_KHR 0x2025 +#define CL_CONTEXT_ADAPTER_D3D9EX_KHR 0x2026 +#define CL_CONTEXT_ADAPTER_DXVA_KHR 0x2027 + +/* cl_mem_info */ +#define CL_MEM_DX9_MEDIA_ADAPTER_TYPE_KHR 0x2028 +#define CL_MEM_DX9_MEDIA_SURFACE_INFO_KHR 0x2029 + +/* cl_image_info */ +#define CL_IMAGE_DX9_MEDIA_PLANE_KHR 0x202A + +/* cl_command_type */ +#define CL_COMMAND_ACQUIRE_DX9_MEDIA_SURFACES_KHR 0x202B +#define CL_COMMAND_RELEASE_DX9_MEDIA_SURFACES_KHR 0x202C + +/******************************************************************************/ + +typedef CL_API_ENTRY cl_int (CL_API_CALL 
*clGetDeviceIDsFromDX9MediaAdapterKHR_fn)( + cl_platform_id platform, + cl_uint num_media_adapters, + cl_dx9_media_adapter_type_khr * media_adapter_type, + void * media_adapters, + cl_dx9_media_adapter_set_khr media_adapter_set, + cl_uint num_entries, + cl_device_id * devices, + cl_uint * num_devices) CL_API_SUFFIX__VERSION_1_2; + +typedef CL_API_ENTRY cl_mem (CL_API_CALL *clCreateFromDX9MediaSurfaceKHR_fn)( + cl_context context, + cl_mem_flags flags, + cl_dx9_media_adapter_type_khr adapter_type, + void * surface_info, + cl_uint plane, + cl_int * errcode_ret) CL_API_SUFFIX__VERSION_1_2; + +typedef CL_API_ENTRY cl_int (CL_API_CALL *clEnqueueAcquireDX9MediaSurfacesKHR_fn)( + cl_command_queue command_queue, + cl_uint num_objects, + const cl_mem * mem_objects, + cl_uint num_events_in_wait_list, + const cl_event * event_wait_list, + cl_event * event) CL_API_SUFFIX__VERSION_1_2; + +typedef CL_API_ENTRY cl_int (CL_API_CALL *clEnqueueReleaseDX9MediaSurfacesKHR_fn)( + cl_command_queue command_queue, + cl_uint num_objects, + const cl_mem * mem_objects, + cl_uint num_events_in_wait_list, + const cl_event * event_wait_list, + cl_event * event) CL_API_SUFFIX__VERSION_1_2; + +#ifdef __cplusplus +} +#endif + +#endif /* __OPENCL_CL_DX9_MEDIA_SHARING_H */ + diff --git a/third_party/opencl/OpenCL-Headers/CL/cl_dx9_media_sharing_intel.h b/third_party/opencl/OpenCL-Headers/CL/cl_dx9_media_sharing_intel.h new file mode 100644 index 0000000000000000000000000000000000000000..331bab97c74050724573be927774523fb24101df --- /dev/null +++ b/third_party/opencl/OpenCL-Headers/CL/cl_dx9_media_sharing_intel.h @@ -0,0 +1,182 @@ +/********************************************************************************** + * Copyright (c) 2008-2016 The Khronos Group Inc. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and/or associated documentation files (the + * "Materials"), to deal in the Materials without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sublicense, and/or sell copies of the Materials, and to + * permit persons to whom the Materials are furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be included + * in all copies or substantial portions of the Materials. + * + * MODIFICATIONS TO THIS FILE MAY MEAN IT NO LONGER ACCURATELY REFLECTS + * KHRONOS STANDARDS. THE UNMODIFIED, NORMATIVE VERSIONS OF KHRONOS + * SPECIFICATIONS AND HEADER INFORMATION ARE LOCATED AT + * https://www.khronos.org/registry/ + * + * THE MATERIALS ARE PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY + * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * MATERIALS OR THE USE OR OTHER DEALINGS IN THE MATERIALS. + **********************************************************************************/ +/*****************************************************************************\ + +Copyright (c) 2013-2016 Intel Corporation All Rights Reserved. + +THESE MATERIALS ARE PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL INTEL OR ITS +CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY OR TORT (INCLUDING +NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THESE +MATERIALS, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +File Name: cl_dx9_media_sharing_intel.h + +Abstract: + +Notes: + +\*****************************************************************************/ + +#ifndef __OPENCL_CL_DX9_MEDIA_SHARING_INTEL_H +#define __OPENCL_CL_DX9_MEDIA_SHARING_INTEL_H + +#include +#include +#include +#include +#include +#include + +#ifdef __cplusplus +extern "C" { +#endif + +/*************************************** +* cl_intel_dx9_media_sharing extension * +****************************************/ + +#define cl_intel_dx9_media_sharing 1 + +typedef cl_uint cl_dx9_device_source_intel; +typedef cl_uint cl_dx9_device_set_intel; + +/* error codes */ +#define CL_INVALID_DX9_DEVICE_INTEL -1010 +#define CL_INVALID_DX9_RESOURCE_INTEL -1011 +#define CL_DX9_RESOURCE_ALREADY_ACQUIRED_INTEL -1012 +#define CL_DX9_RESOURCE_NOT_ACQUIRED_INTEL -1013 + +/* cl_dx9_device_source_intel */ +#define CL_D3D9_DEVICE_INTEL 0x4022 +#define CL_D3D9EX_DEVICE_INTEL 0x4070 +#define CL_DXVA_DEVICE_INTEL 0x4071 + +/* cl_dx9_device_set_intel */ +#define CL_PREFERRED_DEVICES_FOR_DX9_INTEL 0x4024 +#define CL_ALL_DEVICES_FOR_DX9_INTEL 0x4025 + +/* cl_context_info */ +#define CL_CONTEXT_D3D9_DEVICE_INTEL 0x4026 +#define CL_CONTEXT_D3D9EX_DEVICE_INTEL 0x4072 +#define CL_CONTEXT_DXVA_DEVICE_INTEL 0x4073 + +/* cl_mem_info */ +#define CL_MEM_DX9_RESOURCE_INTEL 0x4027 +#define CL_MEM_DX9_SHARED_HANDLE_INTEL 0x4074 + +/* cl_image_info */ +#define CL_IMAGE_DX9_PLANE_INTEL 0x4075 + +/* cl_command_type */ +#define 
CL_COMMAND_ACQUIRE_DX9_OBJECTS_INTEL 0x402A +#define CL_COMMAND_RELEASE_DX9_OBJECTS_INTEL 0x402B +/******************************************************************************/ + +extern CL_API_ENTRY cl_int CL_API_CALL +clGetDeviceIDsFromDX9INTEL( + cl_platform_id /* platform */, + cl_dx9_device_source_intel /* dx9_device_source */, + void* /* dx9_object */, + cl_dx9_device_set_intel /* dx9_device_set */, + cl_uint /* num_entries */, + cl_device_id* /* devices */, + cl_uint* /* num_devices */) CL_EXT_SUFFIX__VERSION_1_1; + +typedef CL_API_ENTRY cl_int (CL_API_CALL* clGetDeviceIDsFromDX9INTEL_fn)( + cl_platform_id /* platform */, + cl_dx9_device_source_intel /* dx9_device_source */, + void* /* dx9_object */, + cl_dx9_device_set_intel /* dx9_device_set */, + cl_uint /* num_entries */, + cl_device_id* /* devices */, + cl_uint* /* num_devices */) CL_EXT_SUFFIX__VERSION_1_1; + +extern CL_API_ENTRY cl_mem CL_API_CALL +clCreateFromDX9MediaSurfaceINTEL( + cl_context /* context */, + cl_mem_flags /* flags */, + IDirect3DSurface9* /* resource */, + HANDLE /* sharedHandle */, + UINT /* plane */, + cl_int* /* errcode_ret */) CL_EXT_SUFFIX__VERSION_1_1; + +typedef CL_API_ENTRY cl_mem (CL_API_CALL *clCreateFromDX9MediaSurfaceINTEL_fn)( + cl_context /* context */, + cl_mem_flags /* flags */, + IDirect3DSurface9* /* resource */, + HANDLE /* sharedHandle */, + UINT /* plane */, + cl_int* /* errcode_ret */) CL_EXT_SUFFIX__VERSION_1_1; + +extern CL_API_ENTRY cl_int CL_API_CALL +clEnqueueAcquireDX9ObjectsINTEL( + cl_command_queue /* command_queue */, + cl_uint /* num_objects */, + const cl_mem* /* mem_objects */, + cl_uint /* num_events_in_wait_list */, + const cl_event* /* event_wait_list */, + cl_event* /* event */) CL_EXT_SUFFIX__VERSION_1_1; + +typedef CL_API_ENTRY cl_int (CL_API_CALL *clEnqueueAcquireDX9ObjectsINTEL_fn)( + cl_command_queue /* command_queue */, + cl_uint /* num_objects */, + const cl_mem* /* mem_objects */, + cl_uint /* num_events_in_wait_list */, + const 
cl_event* /* event_wait_list */, + cl_event* /* event */) CL_EXT_SUFFIX__VERSION_1_1; + +extern CL_API_ENTRY cl_int CL_API_CALL +clEnqueueReleaseDX9ObjectsINTEL( + cl_command_queue /* command_queue */, + cl_uint /* num_objects */, + cl_mem* /* mem_objects */, + cl_uint /* num_events_in_wait_list */, + const cl_event* /* event_wait_list */, + cl_event* /* event */) CL_EXT_SUFFIX__VERSION_1_1; + +typedef CL_API_ENTRY cl_int (CL_API_CALL *clEnqueueReleaseDX9ObjectsINTEL_fn)( + cl_command_queue /* command_queue */, + cl_uint /* num_objects */, + cl_mem* /* mem_objects */, + cl_uint /* num_events_in_wait_list */, + const cl_event* /* event_wait_list */, + cl_event* /* event */) CL_EXT_SUFFIX__VERSION_1_1; + +#ifdef __cplusplus +} +#endif + +#endif /* __OPENCL_CL_DX9_MEDIA_SHARING_INTEL_H */ + diff --git a/third_party/opencl/OpenCL-Headers/CL/cl_egl.h b/third_party/opencl/OpenCL-Headers/CL/cl_egl.h new file mode 100644 index 0000000000000000000000000000000000000000..a765bd5266c02fc2fd2892f0257b228996d73c5f --- /dev/null +++ b/third_party/opencl/OpenCL-Headers/CL/cl_egl.h @@ -0,0 +1,136 @@ +/******************************************************************************* + * Copyright (c) 2008-2015 The Khronos Group Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and/or associated documentation files (the + * "Materials"), to deal in the Materials without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sublicense, and/or sell copies of the Materials, and to + * permit persons to whom the Materials are furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be included + * in all copies or substantial portions of the Materials. + * + * MODIFICATIONS TO THIS FILE MAY MEAN IT NO LONGER ACCURATELY REFLECTS + * KHRONOS STANDARDS. 
THE UNMODIFIED, NORMATIVE VERSIONS OF KHRONOS + * SPECIFICATIONS AND HEADER INFORMATION ARE LOCATED AT + * https://www.khronos.org/registry/ + * + * THE MATERIALS ARE PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY + * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * MATERIALS OR THE USE OR OTHER DEALINGS IN THE MATERIALS. + ******************************************************************************/ + +#ifndef __OPENCL_CL_EGL_H +#define __OPENCL_CL_EGL_H + +#ifdef __APPLE__ + +#else +#include +#endif + +#ifdef __cplusplus +extern "C" { +#endif + + +/* Command type for events created with clEnqueueAcquireEGLObjectsKHR */ +#define CL_COMMAND_EGL_FENCE_SYNC_OBJECT_KHR 0x202F +#define CL_COMMAND_ACQUIRE_EGL_OBJECTS_KHR 0x202D +#define CL_COMMAND_RELEASE_EGL_OBJECTS_KHR 0x202E + +/* Error type for clCreateFromEGLImageKHR */ +#define CL_INVALID_EGL_OBJECT_KHR -1093 +#define CL_EGL_RESOURCE_NOT_ACQUIRED_KHR -1092 + +/* CLeglImageKHR is an opaque handle to an EGLImage */ +typedef void* CLeglImageKHR; + +/* CLeglDisplayKHR is an opaque handle to an EGLDisplay */ +typedef void* CLeglDisplayKHR; + +/* CLeglSyncKHR is an opaque handle to an EGLSync object */ +typedef void* CLeglSyncKHR; + +/* properties passed to clCreateFromEGLImageKHR */ +typedef intptr_t cl_egl_image_properties_khr; + + +#define cl_khr_egl_image 1 + +extern CL_API_ENTRY cl_mem CL_API_CALL +clCreateFromEGLImageKHR(cl_context /* context */, + CLeglDisplayKHR /* egldisplay */, + CLeglImageKHR /* eglimage */, + cl_mem_flags /* flags */, + const cl_egl_image_properties_khr * /* properties */, + cl_int * /* errcode_ret */) CL_API_SUFFIX__VERSION_1_0; + +typedef CL_API_ENTRY cl_mem (CL_API_CALL 
*clCreateFromEGLImageKHR_fn)( + cl_context context, + CLeglDisplayKHR egldisplay, + CLeglImageKHR eglimage, + cl_mem_flags flags, + const cl_egl_image_properties_khr * properties, + cl_int * errcode_ret); + + +extern CL_API_ENTRY cl_int CL_API_CALL +clEnqueueAcquireEGLObjectsKHR(cl_command_queue /* command_queue */, + cl_uint /* num_objects */, + const cl_mem * /* mem_objects */, + cl_uint /* num_events_in_wait_list */, + const cl_event * /* event_wait_list */, + cl_event * /* event */) CL_API_SUFFIX__VERSION_1_0; + +typedef CL_API_ENTRY cl_int (CL_API_CALL *clEnqueueAcquireEGLObjectsKHR_fn)( + cl_command_queue command_queue, + cl_uint num_objects, + const cl_mem * mem_objects, + cl_uint num_events_in_wait_list, + const cl_event * event_wait_list, + cl_event * event); + + +extern CL_API_ENTRY cl_int CL_API_CALL +clEnqueueReleaseEGLObjectsKHR(cl_command_queue /* command_queue */, + cl_uint /* num_objects */, + const cl_mem * /* mem_objects */, + cl_uint /* num_events_in_wait_list */, + const cl_event * /* event_wait_list */, + cl_event * /* event */) CL_API_SUFFIX__VERSION_1_0; + +typedef CL_API_ENTRY cl_int (CL_API_CALL *clEnqueueReleaseEGLObjectsKHR_fn)( + cl_command_queue command_queue, + cl_uint num_objects, + const cl_mem * mem_objects, + cl_uint num_events_in_wait_list, + const cl_event * event_wait_list, + cl_event * event); + + +#define cl_khr_egl_event 1 + +extern CL_API_ENTRY cl_event CL_API_CALL +clCreateEventFromEGLSyncKHR(cl_context /* context */, + CLeglSyncKHR /* sync */, + CLeglDisplayKHR /* display */, + cl_int * /* errcode_ret */) CL_API_SUFFIX__VERSION_1_0; + +typedef CL_API_ENTRY cl_event (CL_API_CALL *clCreateEventFromEGLSyncKHR_fn)( + cl_context context, + CLeglSyncKHR sync, + CLeglDisplayKHR display, + cl_int * errcode_ret); + +#ifdef __cplusplus +} +#endif + +#endif /* __OPENCL_CL_EGL_H */ diff --git a/third_party/opencl/OpenCL-Headers/CL/cl_ext.h b/third_party/opencl/OpenCL-Headers/CL/cl_ext.h new file mode 100644 index 
0000000000000000000000000000000000000000..af3ce461f3a48e7707caca966e704dfe5eb58e30 --- /dev/null +++ b/third_party/opencl/OpenCL-Headers/CL/cl_ext.h @@ -0,0 +1,723 @@ +/******************************************************************************* + * Copyright (c) 2008-2018 The Khronos Group Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and/or associated documentation files (the + * "Materials"), to deal in the Materials without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sublicense, and/or sell copies of the Materials, and to + * permit persons to whom the Materials are furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be included + * in all copies or substantial portions of the Materials. + * + * MODIFICATIONS TO THIS FILE MAY MEAN IT NO LONGER ACCURATELY REFLECTS + * KHRONOS STANDARDS. THE UNMODIFIED, NORMATIVE VERSIONS OF KHRONOS + * SPECIFICATIONS AND HEADER INFORMATION ARE LOCATED AT + * https://www.khronos.org/registry/ + * + * THE MATERIALS ARE PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY + * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * MATERIALS OR THE USE OR OTHER DEALINGS IN THE MATERIALS. + ******************************************************************************/ + +/* cl_ext.h contains OpenCL extensions which don't have external */ +/* (OpenGL, D3D) dependencies. 
*/ + +#ifndef __CL_EXT_H +#define __CL_EXT_H + +#ifdef __cplusplus +extern "C" { +#endif + +#ifdef __APPLE__ + #include + #include +#else + #include +#endif + +/* cl_khr_fp64 extension - no extension #define since it has no functions */ +/* CL_DEVICE_DOUBLE_FP_CONFIG is defined in CL.h for OpenCL >= 120 */ + +#if CL_TARGET_OPENCL_VERSION <= 110 +#define CL_DEVICE_DOUBLE_FP_CONFIG 0x1032 +#endif + +/* cl_khr_fp16 extension - no extension #define since it has no functions */ +#define CL_DEVICE_HALF_FP_CONFIG 0x1033 + +/* Memory object destruction + * + * Apple extension for use to manage externally allocated buffers used with cl_mem objects with CL_MEM_USE_HOST_PTR + * + * Registers a user callback function that will be called when the memory object is deleted and its resources + * freed. Each call to clSetMemObjectCallbackFn registers the specified user callback function on a callback + * stack associated with memobj. The registered user callback functions are called in the reverse order in + * which they were registered. The user callback functions are called and then the memory object is deleted + * and its resources freed. This provides a mechanism for the application (and libraries) using memobj to be + * notified when the memory referenced by host_ptr, specified when the memory object is created and used as + * the storage bits for the memory object, can be reused or freed. + * + * The application may not call CL api's with the cl_mem object passed to the pfn_notify. + * + * Please check for the "cl_APPLE_SetMemObjectDestructor" extension using clGetDeviceInfo(CL_DEVICE_EXTENSIONS) + * before using. 
+ */ +#define cl_APPLE_SetMemObjectDestructor 1 +cl_int CL_API_ENTRY clSetMemObjectDestructorAPPLE( cl_mem /* memobj */, + void (* /*pfn_notify*/)( cl_mem /* memobj */, void* /*user_data*/), + void * /*user_data */ ) CL_EXT_SUFFIX__VERSION_1_0; + + +/* Context Logging Functions + * + * The next three convenience functions are intended to be used as the pfn_notify parameter to clCreateContext(). + * Please check for the "cl_APPLE_ContextLoggingFunctions" extension using clGetDeviceInfo(CL_DEVICE_EXTENSIONS) + * before using. + * + * clLogMessagesToSystemLog fowards on all log messages to the Apple System Logger + */ +#define cl_APPLE_ContextLoggingFunctions 1 +extern void CL_API_ENTRY clLogMessagesToSystemLogAPPLE( const char * /* errstr */, + const void * /* private_info */, + size_t /* cb */, + void * /* user_data */ ) CL_EXT_SUFFIX__VERSION_1_0; + +/* clLogMessagesToStdout sends all log messages to the file descriptor stdout */ +extern void CL_API_ENTRY clLogMessagesToStdoutAPPLE( const char * /* errstr */, + const void * /* private_info */, + size_t /* cb */, + void * /* user_data */ ) CL_EXT_SUFFIX__VERSION_1_0; + +/* clLogMessagesToStderr sends all log messages to the file descriptor stderr */ +extern void CL_API_ENTRY clLogMessagesToStderrAPPLE( const char * /* errstr */, + const void * /* private_info */, + size_t /* cb */, + void * /* user_data */ ) CL_EXT_SUFFIX__VERSION_1_0; + + +/************************ +* cl_khr_icd extension * +************************/ +#define cl_khr_icd 1 + +/* cl_platform_info */ +#define CL_PLATFORM_ICD_SUFFIX_KHR 0x0920 + +/* Additional Error Codes */ +#define CL_PLATFORM_NOT_FOUND_KHR -1001 + +extern CL_API_ENTRY cl_int CL_API_CALL +clIcdGetPlatformIDsKHR(cl_uint /* num_entries */, + cl_platform_id * /* platforms */, + cl_uint * /* num_platforms */); + +typedef CL_API_ENTRY cl_int (CL_API_CALL *clIcdGetPlatformIDsKHR_fn)( + cl_uint /* num_entries */, + cl_platform_id * /* platforms */, + cl_uint * /* num_platforms */); + + + 
+/******************************* + * cl_khr_il_program extension * + *******************************/ +#define cl_khr_il_program 1 + +/* New property to clGetDeviceInfo for retrieving supported intermediate + * languages + */ +#define CL_DEVICE_IL_VERSION_KHR 0x105B + +/* New property to clGetProgramInfo for retrieving for retrieving the IL of a + * program + */ +#define CL_PROGRAM_IL_KHR 0x1169 + +extern CL_API_ENTRY cl_program + CL_API_CALL clCreateProgramWithILKHR( + cl_context /* context */, + const void * /* il */, + size_t /* length */, + cl_int * /* errcode_ret */); + +typedef CL_API_ENTRY cl_program + (CL_API_CALL *clCreateProgramWithILKHR_fn)( + cl_context /* context */, + const void * /* il */, + size_t /* length */, + cl_int * /* errcode_ret */) CL_EXT_SUFFIX__VERSION_1_2; + +/* Extension: cl_khr_image2D_buffer + * + * This extension allows a 2D image to be created from a cl_mem buffer without a copy. + * The type associated with a 2D image created from a buffer in an OpenCL program is image2d_t. + * Both the sampler and sampler-less read_image built-in functions are supported for 2D images + * and 2D images created from a buffer. Similarly, the write_image built-ins are also supported + * for 2D images created from a buffer. + * + * When the 2D image from buffer is created, the client must specify the width, + * height, image format (i.e. channel order and channel data type) and optionally the row pitch + * + * The pitch specified must be a multiple of CL_DEVICE_IMAGE_PITCH_ALIGNMENT pixels. + * The base address of the buffer must be aligned to CL_DEVICE_IMAGE_BASE_ADDRESS_ALIGNMENT pixels. 
+ */ + +/************************************** + * cl_khr_initialize_memory extension * + **************************************/ + +#define CL_CONTEXT_MEMORY_INITIALIZE_KHR 0x2030 + + +/************************************** + * cl_khr_terminate_context extension * + **************************************/ + +#define CL_DEVICE_TERMINATE_CAPABILITY_KHR 0x2031 +#define CL_CONTEXT_TERMINATE_KHR 0x2032 + +#define cl_khr_terminate_context 1 +extern CL_API_ENTRY cl_int CL_API_CALL clTerminateContextKHR(cl_context /* context */) CL_EXT_SUFFIX__VERSION_1_2; + +typedef CL_API_ENTRY cl_int (CL_API_CALL *clTerminateContextKHR_fn)(cl_context /* context */) CL_EXT_SUFFIX__VERSION_1_2; + + +/* + * Extension: cl_khr_spir + * + * This extension adds support to create an OpenCL program object from a + * Standard Portable Intermediate Representation (SPIR) instance + */ + +#define CL_DEVICE_SPIR_VERSIONS 0x40E0 +#define CL_PROGRAM_BINARY_TYPE_INTERMEDIATE 0x40E1 + + +/***************************************** + * cl_khr_create_command_queue extension * + *****************************************/ +#define cl_khr_create_command_queue 1 + +typedef cl_bitfield cl_queue_properties_khr; + +extern CL_API_ENTRY cl_command_queue CL_API_CALL +clCreateCommandQueueWithPropertiesKHR( cl_context /* context */, + cl_device_id /* device */, + const cl_queue_properties_khr* /* properties */, + cl_int* /* errcode_ret */ ) CL_EXT_SUFFIX__VERSION_1_2; +typedef CL_API_ENTRY cl_command_queue +(CL_API_CALL *clCreateCommandQueueWithPropertiesKHR_fn)( cl_context /* context */, + cl_device_id /* device */, + const cl_queue_properties_khr* /* properties */, + cl_int* /* errcode_ret */ ) CL_EXT_SUFFIX__VERSION_1_2; + + +/****************************************** +* cl_nv_device_attribute_query extension * +******************************************/ + +/* cl_nv_device_attribute_query extension - no extension #define since it has no functions */ +#define CL_DEVICE_COMPUTE_CAPABILITY_MAJOR_NV 0x4000 +#define 
CL_DEVICE_COMPUTE_CAPABILITY_MINOR_NV 0x4001 +#define CL_DEVICE_REGISTERS_PER_BLOCK_NV 0x4002 +#define CL_DEVICE_WARP_SIZE_NV 0x4003 +#define CL_DEVICE_GPU_OVERLAP_NV 0x4004 +#define CL_DEVICE_KERNEL_EXEC_TIMEOUT_NV 0x4005 +#define CL_DEVICE_INTEGRATED_MEMORY_NV 0x4006 + + +/********************************* +* cl_amd_device_attribute_query * +*********************************/ + +#define CL_DEVICE_PROFILING_TIMER_OFFSET_AMD 0x4036 + + +/********************************* +* cl_arm_printf extension +*********************************/ + +#define CL_PRINTF_CALLBACK_ARM 0x40B0 +#define CL_PRINTF_BUFFERSIZE_ARM 0x40B1 + + +/*********************************** +* cl_ext_device_fission extension +***********************************/ +#define cl_ext_device_fission 1 + +extern CL_API_ENTRY cl_int CL_API_CALL +clReleaseDeviceEXT( cl_device_id /*device*/ ) CL_EXT_SUFFIX__VERSION_1_1; + +typedef CL_API_ENTRY cl_int +(CL_API_CALL *clReleaseDeviceEXT_fn)( cl_device_id /*device*/ ) CL_EXT_SUFFIX__VERSION_1_1; + +extern CL_API_ENTRY cl_int CL_API_CALL +clRetainDeviceEXT( cl_device_id /*device*/ ) CL_EXT_SUFFIX__VERSION_1_1; + +typedef CL_API_ENTRY cl_int +(CL_API_CALL *clRetainDeviceEXT_fn)( cl_device_id /*device*/ ) CL_EXT_SUFFIX__VERSION_1_1; + +typedef cl_ulong cl_device_partition_property_ext; +extern CL_API_ENTRY cl_int CL_API_CALL +clCreateSubDevicesEXT( cl_device_id /*in_device*/, + const cl_device_partition_property_ext * /* properties */, + cl_uint /*num_entries*/, + cl_device_id * /*out_devices*/, + cl_uint * /*num_devices*/ ) CL_EXT_SUFFIX__VERSION_1_1; + +typedef CL_API_ENTRY cl_int +( CL_API_CALL * clCreateSubDevicesEXT_fn)( cl_device_id /*in_device*/, + const cl_device_partition_property_ext * /* properties */, + cl_uint /*num_entries*/, + cl_device_id * /*out_devices*/, + cl_uint * /*num_devices*/ ) CL_EXT_SUFFIX__VERSION_1_1; + +/* cl_device_partition_property_ext */ +#define CL_DEVICE_PARTITION_EQUALLY_EXT 0x4050 +#define CL_DEVICE_PARTITION_BY_COUNTS_EXT 0x4051 
+#define CL_DEVICE_PARTITION_BY_NAMES_EXT 0x4052 +#define CL_DEVICE_PARTITION_BY_AFFINITY_DOMAIN_EXT 0x4053 + +/* clDeviceGetInfo selectors */ +#define CL_DEVICE_PARENT_DEVICE_EXT 0x4054 +#define CL_DEVICE_PARTITION_TYPES_EXT 0x4055 +#define CL_DEVICE_AFFINITY_DOMAINS_EXT 0x4056 +#define CL_DEVICE_REFERENCE_COUNT_EXT 0x4057 +#define CL_DEVICE_PARTITION_STYLE_EXT 0x4058 + +/* error codes */ +#define CL_DEVICE_PARTITION_FAILED_EXT -1057 +#define CL_INVALID_PARTITION_COUNT_EXT -1058 +#define CL_INVALID_PARTITION_NAME_EXT -1059 + +/* CL_AFFINITY_DOMAINs */ +#define CL_AFFINITY_DOMAIN_L1_CACHE_EXT 0x1 +#define CL_AFFINITY_DOMAIN_L2_CACHE_EXT 0x2 +#define CL_AFFINITY_DOMAIN_L3_CACHE_EXT 0x3 +#define CL_AFFINITY_DOMAIN_L4_CACHE_EXT 0x4 +#define CL_AFFINITY_DOMAIN_NUMA_EXT 0x10 +#define CL_AFFINITY_DOMAIN_NEXT_FISSIONABLE_EXT 0x100 + +/* cl_device_partition_property_ext list terminators */ +#define CL_PROPERTIES_LIST_END_EXT ((cl_device_partition_property_ext) 0) +#define CL_PARTITION_BY_COUNTS_LIST_END_EXT ((cl_device_partition_property_ext) 0) +#define CL_PARTITION_BY_NAMES_LIST_END_EXT ((cl_device_partition_property_ext) 0 - 1) + + +/*********************************** + * cl_ext_migrate_memobject extension definitions + ***********************************/ +#define cl_ext_migrate_memobject 1 + +typedef cl_bitfield cl_mem_migration_flags_ext; + +#define CL_MIGRATE_MEM_OBJECT_HOST_EXT 0x1 + +#define CL_COMMAND_MIGRATE_MEM_OBJECT_EXT 0x4040 + +extern CL_API_ENTRY cl_int CL_API_CALL +clEnqueueMigrateMemObjectEXT( cl_command_queue /* command_queue */, + cl_uint /* num_mem_objects */, + const cl_mem * /* mem_objects */, + cl_mem_migration_flags_ext /* flags */, + cl_uint /* num_events_in_wait_list */, + const cl_event * /* event_wait_list */, + cl_event * /* event */ ); + +typedef CL_API_ENTRY cl_int +(CL_API_CALL *clEnqueueMigrateMemObjectEXT_fn)( cl_command_queue /* command_queue */, + cl_uint /* num_mem_objects */, + const cl_mem * /* mem_objects */, + 
cl_mem_migration_flags_ext /* flags */, + cl_uint /* num_events_in_wait_list */, + const cl_event * /* event_wait_list */, + cl_event * /* event */ ); + + +/********************************* +* cl_qcom_ext_host_ptr extension +*********************************/ +#define cl_qcom_ext_host_ptr 1 + +#define CL_MEM_EXT_HOST_PTR_QCOM (1 << 29) + +#define CL_DEVICE_EXT_MEM_PADDING_IN_BYTES_QCOM 0x40A0 +#define CL_DEVICE_PAGE_SIZE_QCOM 0x40A1 +#define CL_IMAGE_ROW_ALIGNMENT_QCOM 0x40A2 +#define CL_IMAGE_SLICE_ALIGNMENT_QCOM 0x40A3 +#define CL_MEM_HOST_UNCACHED_QCOM 0x40A4 +#define CL_MEM_HOST_WRITEBACK_QCOM 0x40A5 +#define CL_MEM_HOST_WRITETHROUGH_QCOM 0x40A6 +#define CL_MEM_HOST_WRITE_COMBINING_QCOM 0x40A7 + +typedef cl_uint cl_image_pitch_info_qcom; + +extern CL_API_ENTRY cl_int CL_API_CALL +clGetDeviceImageInfoQCOM(cl_device_id device, + size_t image_width, + size_t image_height, + const cl_image_format *image_format, + cl_image_pitch_info_qcom param_name, + size_t param_value_size, + void *param_value, + size_t *param_value_size_ret); + +typedef struct _cl_mem_ext_host_ptr +{ + /* Type of external memory allocation. */ + /* Legal values will be defined in layered extensions. */ + cl_uint allocation_type; + + /* Host cache policy for this external memory allocation. */ + cl_uint host_cache_policy; + +} cl_mem_ext_host_ptr; + + +/******************************************* +* cl_qcom_ext_host_ptr_iocoherent extension +********************************************/ + +/* Cache policy specifying io-coherence */ +#define CL_MEM_HOST_IOCOHERENT_QCOM 0x40A9 + + +/********************************* +* cl_qcom_ion_host_ptr extension +*********************************/ + +#define CL_MEM_ION_HOST_PTR_QCOM 0x40A8 + +typedef struct _cl_mem_ion_host_ptr +{ + /* Type of external memory allocation. */ + /* Must be CL_MEM_ION_HOST_PTR_QCOM for ION allocations. 
*/ + cl_mem_ext_host_ptr ext_host_ptr; + + /* ION file descriptor */ + int ion_filedesc; + + /* Host pointer to the ION allocated memory */ + void* ion_hostptr; + +} cl_mem_ion_host_ptr; + + +/********************************* +* cl_qcom_android_native_buffer_host_ptr extension +*********************************/ + +#define CL_MEM_ANDROID_NATIVE_BUFFER_HOST_PTR_QCOM 0x40C6 + +typedef struct _cl_mem_android_native_buffer_host_ptr +{ + /* Type of external memory allocation. */ + /* Must be CL_MEM_ANDROID_NATIVE_BUFFER_HOST_PTR_QCOM for Android native buffers. */ + cl_mem_ext_host_ptr ext_host_ptr; + + /* Virtual pointer to the android native buffer */ + void* anb_ptr; + +} cl_mem_android_native_buffer_host_ptr; + + +/****************************************** + * cl_img_yuv_image extension * + ******************************************/ + +/* Image formats used in clCreateImage */ +#define CL_NV21_IMG 0x40D0 +#define CL_YV12_IMG 0x40D1 + + +/****************************************** + * cl_img_cached_allocations extension * + ******************************************/ + +/* Flag values used by clCreteBuffer */ +#define CL_MEM_USE_UNCACHED_CPU_MEMORY_IMG (1 << 26) +#define CL_MEM_USE_CACHED_CPU_MEMORY_IMG (1 << 27) + + +/****************************************** + * cl_img_use_gralloc_ptr extension * + ******************************************/ +#define cl_img_use_gralloc_ptr 1 + +/* Flag values used by clCreteBuffer */ +#define CL_MEM_USE_GRALLOC_PTR_IMG (1 << 28) + +/* To be used by clGetEventInfo: */ +#define CL_COMMAND_ACQUIRE_GRALLOC_OBJECTS_IMG 0x40D2 +#define CL_COMMAND_RELEASE_GRALLOC_OBJECTS_IMG 0x40D3 + +/* Error code from clEnqueueReleaseGrallocObjectsIMG */ +#define CL_GRALLOC_RESOURCE_NOT_ACQUIRED_IMG 0x40D4 + +extern CL_API_ENTRY cl_int CL_API_CALL +clEnqueueAcquireGrallocObjectsIMG(cl_command_queue /* command_queue */, + cl_uint /* num_objects */, + const cl_mem * /* mem_objects */, + cl_uint /* num_events_in_wait_list */, + const cl_event * /* 
event_wait_list */, + cl_event * /* event */) CL_EXT_SUFFIX__VERSION_1_2; + +extern CL_API_ENTRY cl_int CL_API_CALL +clEnqueueReleaseGrallocObjectsIMG(cl_command_queue /* command_queue */, + cl_uint /* num_objects */, + const cl_mem * /* mem_objects */, + cl_uint /* num_events_in_wait_list */, + const cl_event * /* event_wait_list */, + cl_event * /* event */) CL_EXT_SUFFIX__VERSION_1_2; + + +/********************************* +* cl_khr_subgroups extension +*********************************/ +#define cl_khr_subgroups 1 + +#if !defined(CL_VERSION_2_1) +/* For OpenCL 2.1 and newer, cl_kernel_sub_group_info is declared in CL.h. + In hindsight, there should have been a khr suffix on this type for + the extension, but keeping it un-suffixed to maintain backwards + compatibility. */ +typedef cl_uint cl_kernel_sub_group_info; +#endif + +/* cl_kernel_sub_group_info */ +#define CL_KERNEL_MAX_SUB_GROUP_SIZE_FOR_NDRANGE_KHR 0x2033 +#define CL_KERNEL_SUB_GROUP_COUNT_FOR_NDRANGE_KHR 0x2034 + +extern CL_API_ENTRY cl_int CL_API_CALL +clGetKernelSubGroupInfoKHR(cl_kernel /* in_kernel */, + cl_device_id /*in_device*/, + cl_kernel_sub_group_info /* param_name */, + size_t /*input_value_size*/, + const void * /*input_value*/, + size_t /*param_value_size*/, + void* /*param_value*/, + size_t* /*param_value_size_ret*/ ) CL_EXT_SUFFIX__VERSION_2_0_DEPRECATED; + +typedef CL_API_ENTRY cl_int +(CL_API_CALL * clGetKernelSubGroupInfoKHR_fn)(cl_kernel /* in_kernel */, + cl_device_id /*in_device*/, + cl_kernel_sub_group_info /* param_name */, + size_t /*input_value_size*/, + const void * /*input_value*/, + size_t /*param_value_size*/, + void* /*param_value*/, + size_t* /*param_value_size_ret*/ ) CL_EXT_SUFFIX__VERSION_2_0_DEPRECATED; + + +/********************************* +* cl_khr_priority_hints extension +*********************************/ +/* This extension define is for backwards compatibility. + It shouldn't be required since this extension has no new functions. 
*/ +#define cl_khr_priority_hints 1 + +typedef cl_uint cl_queue_priority_khr; + +/* cl_command_queue_properties */ +#define CL_QUEUE_PRIORITY_KHR 0x1096 + +/* cl_queue_priority_khr */ +#define CL_QUEUE_PRIORITY_HIGH_KHR (1<<0) +#define CL_QUEUE_PRIORITY_MED_KHR (1<<1) +#define CL_QUEUE_PRIORITY_LOW_KHR (1<<2) + + +/********************************* +* cl_khr_throttle_hints extension +*********************************/ +/* This extension define is for backwards compatibility. + It shouldn't be required since this extension has no new functions. */ +#define cl_khr_throttle_hints 1 + +typedef cl_uint cl_queue_throttle_khr; + +/* cl_command_queue_properties */ +#define CL_QUEUE_THROTTLE_KHR 0x1097 + +/* cl_queue_throttle_khr */ +#define CL_QUEUE_THROTTLE_HIGH_KHR (1<<0) +#define CL_QUEUE_THROTTLE_MED_KHR (1<<1) +#define CL_QUEUE_THROTTLE_LOW_KHR (1<<2) + + +/********************************* +* cl_khr_subgroup_named_barrier +*********************************/ +/* This extension define is for backwards compatibility. + It shouldn't be required since this extension has no new functions. */ +#define cl_khr_subgroup_named_barrier 1 + +/* cl_device_info */ +#define CL_DEVICE_MAX_NAMED_BARRIER_COUNT_KHR 0x2035 + + +/********************************** + * cl_arm_import_memory extension * + **********************************/ +#define cl_arm_import_memory 1 + +typedef intptr_t cl_import_properties_arm; + +/* Default and valid proporties name for cl_arm_import_memory */ +#define CL_IMPORT_TYPE_ARM 0x40B2 + +/* Host process memory type default value for CL_IMPORT_TYPE_ARM property */ +#define CL_IMPORT_TYPE_HOST_ARM 0x40B3 + +/* DMA BUF memory type value for CL_IMPORT_TYPE_ARM property */ +#define CL_IMPORT_TYPE_DMA_BUF_ARM 0x40B4 + +/* Secure DMA BUF memory type value for CL_IMPORT_TYPE_ARM property */ +#define CL_IMPORT_TYPE_SECURE_ARM 0x40B5 + +/* This extension adds a new function that allows for direct memory import into + * OpenCL via the clImportMemoryARM function. 
+ * + * Memory imported through this interface will be mapped into the device's page + * tables directly, providing zero copy access. It will never fall back to copy + * operations and aliased buffers. + * + * Types of memory supported for import are specified as additional extension + * strings. + * + * This extension produces cl_mem allocations which are compatible with all other + * users of cl_mem in the standard API. + * + * This extension maps pages with the same properties as the normal buffer creation + * function clCreateBuffer. + */ +extern CL_API_ENTRY cl_mem CL_API_CALL +clImportMemoryARM( cl_context context, + cl_mem_flags flags, + const cl_import_properties_arm *properties, + void *memory, + size_t size, + cl_int *errcode_ret) CL_EXT_SUFFIX__VERSION_1_0; + + +/****************************************** + * cl_arm_shared_virtual_memory extension * + ******************************************/ +#define cl_arm_shared_virtual_memory 1 + +/* Used by clGetDeviceInfo */ +#define CL_DEVICE_SVM_CAPABILITIES_ARM 0x40B6 + +/* Used by clGetMemObjectInfo */ +#define CL_MEM_USES_SVM_POINTER_ARM 0x40B7 + +/* Used by clSetKernelExecInfoARM: */ +#define CL_KERNEL_EXEC_INFO_SVM_PTRS_ARM 0x40B8 +#define CL_KERNEL_EXEC_INFO_SVM_FINE_GRAIN_SYSTEM_ARM 0x40B9 + +/* To be used by clGetEventInfo: */ +#define CL_COMMAND_SVM_FREE_ARM 0x40BA +#define CL_COMMAND_SVM_MEMCPY_ARM 0x40BB +#define CL_COMMAND_SVM_MEMFILL_ARM 0x40BC +#define CL_COMMAND_SVM_MAP_ARM 0x40BD +#define CL_COMMAND_SVM_UNMAP_ARM 0x40BE + +/* Flag values returned by clGetDeviceInfo with CL_DEVICE_SVM_CAPABILITIES_ARM as the param_name. 
*/ +#define CL_DEVICE_SVM_COARSE_GRAIN_BUFFER_ARM (1 << 0) +#define CL_DEVICE_SVM_FINE_GRAIN_BUFFER_ARM (1 << 1) +#define CL_DEVICE_SVM_FINE_GRAIN_SYSTEM_ARM (1 << 2) +#define CL_DEVICE_SVM_ATOMICS_ARM (1 << 3) + +/* Flag values used by clSVMAllocARM: */ +#define CL_MEM_SVM_FINE_GRAIN_BUFFER_ARM (1 << 10) +#define CL_MEM_SVM_ATOMICS_ARM (1 << 11) + +typedef cl_bitfield cl_svm_mem_flags_arm; +typedef cl_uint cl_kernel_exec_info_arm; +typedef cl_bitfield cl_device_svm_capabilities_arm; + +extern CL_API_ENTRY void * CL_API_CALL +clSVMAllocARM(cl_context /* context */, + cl_svm_mem_flags_arm /* flags */, + size_t /* size */, + cl_uint /* alignment */) CL_EXT_SUFFIX__VERSION_1_2; + +extern CL_API_ENTRY void CL_API_CALL +clSVMFreeARM(cl_context /* context */, + void * /* svm_pointer */) CL_EXT_SUFFIX__VERSION_1_2; + +extern CL_API_ENTRY cl_int CL_API_CALL +clEnqueueSVMFreeARM(cl_command_queue /* command_queue */, + cl_uint /* num_svm_pointers */, + void *[] /* svm_pointers[] */, + void (CL_CALLBACK * /*pfn_free_func*/)(cl_command_queue /* queue */, + cl_uint /* num_svm_pointers */, + void *[] /* svm_pointers[] */, + void * /* user_data */), + void * /* user_data */, + cl_uint /* num_events_in_wait_list */, + const cl_event * /* event_wait_list */, + cl_event * /* event */) CL_EXT_SUFFIX__VERSION_1_2; + +extern CL_API_ENTRY cl_int CL_API_CALL +clEnqueueSVMMemcpyARM(cl_command_queue /* command_queue */, + cl_bool /* blocking_copy */, + void * /* dst_ptr */, + const void * /* src_ptr */, + size_t /* size */, + cl_uint /* num_events_in_wait_list */, + const cl_event * /* event_wait_list */, + cl_event * /* event */) CL_EXT_SUFFIX__VERSION_1_2; + +extern CL_API_ENTRY cl_int CL_API_CALL +clEnqueueSVMMemFillARM(cl_command_queue /* command_queue */, + void * /* svm_ptr */, + const void * /* pattern */, + size_t /* pattern_size */, + size_t /* size */, + cl_uint /* num_events_in_wait_list */, + const cl_event * /* event_wait_list */, + cl_event * /* event */) 
CL_EXT_SUFFIX__VERSION_1_2; + +extern CL_API_ENTRY cl_int CL_API_CALL +clEnqueueSVMMapARM(cl_command_queue /* command_queue */, + cl_bool /* blocking_map */, + cl_map_flags /* flags */, + void * /* svm_ptr */, + size_t /* size */, + cl_uint /* num_events_in_wait_list */, + const cl_event * /* event_wait_list */, + cl_event * /* event */) CL_EXT_SUFFIX__VERSION_1_2; + +extern CL_API_ENTRY cl_int CL_API_CALL +clEnqueueSVMUnmapARM(cl_command_queue /* command_queue */, + void * /* svm_ptr */, + cl_uint /* num_events_in_wait_list */, + const cl_event * /* event_wait_list */, + cl_event * /* event */) CL_EXT_SUFFIX__VERSION_1_2; + +extern CL_API_ENTRY cl_int CL_API_CALL +clSetKernelArgSVMPointerARM(cl_kernel /* kernel */, + cl_uint /* arg_index */, + const void * /* arg_value */) CL_EXT_SUFFIX__VERSION_1_2; +extern CL_API_ENTRY cl_int CL_API_CALL +clSetKernelExecInfoARM(cl_kernel /* kernel */, + cl_kernel_exec_info_arm /* param_name */, + size_t /* param_value_size */, + const void * /* param_value */) CL_EXT_SUFFIX__VERSION_1_2; + +#ifdef __cplusplus +} +#endif + + +#endif /* __CL_EXT_H */ diff --git a/third_party/opencl/OpenCL-Headers/CL/cl_ext_intel.h b/third_party/opencl/OpenCL-Headers/CL/cl_ext_intel.h new file mode 100644 index 0000000000000000000000000000000000000000..1c358cfc10c5c01fa5b5bfcc65d4e5904f830a9e --- /dev/null +++ b/third_party/opencl/OpenCL-Headers/CL/cl_ext_intel.h @@ -0,0 +1,429 @@ +/******************************************************************************* + * Copyright (c) 2008-2017 The Khronos Group Inc. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and/or associated documentation files (the + * "Materials"), to deal in the Materials without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sublicense, and/or sell copies of the Materials, and to + * permit persons to whom the Materials are furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be included + * in all copies or substantial portions of the Materials. + * + * MODIFICATIONS TO THIS FILE MAY MEAN IT NO LONGER ACCURATELY REFLECTS + * KHRONOS STANDARDS. THE UNMODIFIED, NORMATIVE VERSIONS OF KHRONOS + * SPECIFICATIONS AND HEADER INFORMATION ARE LOCATED AT + * https://www.khronos.org/registry/ + * + * THE MATERIALS ARE PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY + * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * MATERIALS OR THE USE OR OTHER DEALINGS IN THE MATERIALS. + ******************************************************************************/ +/*****************************************************************************\ + +Copyright (c) 2013-2017 Intel Corporation All Rights Reserved. + +THESE MATERIALS ARE PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL INTEL OR ITS +CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY OR TORT (INCLUDING +NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THESE +MATERIALS, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +File Name: cl_ext_intel.h + +Abstract: + +Notes: + +\*****************************************************************************/ + +#ifndef __CL_EXT_INTEL_H +#define __CL_EXT_INTEL_H + +#ifdef __APPLE__ + #include + #include +#else + #include + #include +#endif + +#ifdef __cplusplus +extern "C" { +#endif + +/*************************************** +* cl_intel_thread_local_exec extension * +****************************************/ + +#define cl_intel_thread_local_exec 1 + +#define CL_QUEUE_THREAD_LOCAL_EXEC_ENABLE_INTEL (((cl_bitfield)1) << 31) + +/*********************************************** +* cl_intel_device_partition_by_names extension * +************************************************/ + +#define cl_intel_device_partition_by_names 1 + +#define CL_DEVICE_PARTITION_BY_NAMES_INTEL 0x4052 +#define CL_PARTITION_BY_NAMES_LIST_END_INTEL -1 + +/************************************************ +* cl_intel_accelerator extension * +* cl_intel_motion_estimation extension * +* cl_intel_advanced_motion_estimation extension * +*************************************************/ + +#define cl_intel_accelerator 1 +#define cl_intel_motion_estimation 1 +#define cl_intel_advanced_motion_estimation 1 + +typedef struct _cl_accelerator_intel* cl_accelerator_intel; +typedef cl_uint cl_accelerator_type_intel; +typedef cl_uint cl_accelerator_info_intel; + +typedef struct _cl_motion_estimation_desc_intel { + cl_uint mb_block_type; + cl_uint subpixel_mode; + cl_uint 
sad_adjust_mode; + cl_uint search_path_type; +} cl_motion_estimation_desc_intel; + +/* error codes */ +#define CL_INVALID_ACCELERATOR_INTEL -1094 +#define CL_INVALID_ACCELERATOR_TYPE_INTEL -1095 +#define CL_INVALID_ACCELERATOR_DESCRIPTOR_INTEL -1096 +#define CL_ACCELERATOR_TYPE_NOT_SUPPORTED_INTEL -1097 + +/* cl_accelerator_type_intel */ +#define CL_ACCELERATOR_TYPE_MOTION_ESTIMATION_INTEL 0x0 + +/* cl_accelerator_info_intel */ +#define CL_ACCELERATOR_DESCRIPTOR_INTEL 0x4090 +#define CL_ACCELERATOR_REFERENCE_COUNT_INTEL 0x4091 +#define CL_ACCELERATOR_CONTEXT_INTEL 0x4092 +#define CL_ACCELERATOR_TYPE_INTEL 0x4093 + +/* cl_motion_detect_desc_intel flags */ +#define CL_ME_MB_TYPE_16x16_INTEL 0x0 +#define CL_ME_MB_TYPE_8x8_INTEL 0x1 +#define CL_ME_MB_TYPE_4x4_INTEL 0x2 + +#define CL_ME_SUBPIXEL_MODE_INTEGER_INTEL 0x0 +#define CL_ME_SUBPIXEL_MODE_HPEL_INTEL 0x1 +#define CL_ME_SUBPIXEL_MODE_QPEL_INTEL 0x2 + +#define CL_ME_SAD_ADJUST_MODE_NONE_INTEL 0x0 +#define CL_ME_SAD_ADJUST_MODE_HAAR_INTEL 0x1 + +#define CL_ME_SEARCH_PATH_RADIUS_2_2_INTEL 0x0 +#define CL_ME_SEARCH_PATH_RADIUS_4_4_INTEL 0x1 +#define CL_ME_SEARCH_PATH_RADIUS_16_12_INTEL 0x5 + +#define CL_ME_SKIP_BLOCK_TYPE_16x16_INTEL 0x0 +#define CL_ME_CHROMA_INTRA_PREDICT_ENABLED_INTEL 0x1 +#define CL_ME_LUMA_INTRA_PREDICT_ENABLED_INTEL 0x2 +#define CL_ME_SKIP_BLOCK_TYPE_8x8_INTEL 0x4 + +#define CL_ME_FORWARD_INPUT_MODE_INTEL 0x1 +#define CL_ME_BACKWARD_INPUT_MODE_INTEL 0x2 +#define CL_ME_BIDIRECTION_INPUT_MODE_INTEL 0x3 + +#define CL_ME_BIDIR_WEIGHT_QUARTER_INTEL 16 +#define CL_ME_BIDIR_WEIGHT_THIRD_INTEL 21 +#define CL_ME_BIDIR_WEIGHT_HALF_INTEL 32 +#define CL_ME_BIDIR_WEIGHT_TWO_THIRD_INTEL 43 +#define CL_ME_BIDIR_WEIGHT_THREE_QUARTER_INTEL 48 + +#define CL_ME_COST_PENALTY_NONE_INTEL 0x0 +#define CL_ME_COST_PENALTY_LOW_INTEL 0x1 +#define CL_ME_COST_PENALTY_NORMAL_INTEL 0x2 +#define CL_ME_COST_PENALTY_HIGH_INTEL 0x3 + +#define CL_ME_COST_PRECISION_QPEL_INTEL 0x0 +#define CL_ME_COST_PRECISION_HPEL_INTEL 0x1 +#define 
CL_ME_COST_PRECISION_PEL_INTEL 0x2 +#define CL_ME_COST_PRECISION_DPEL_INTEL 0x3 + +#define CL_ME_LUMA_PREDICTOR_MODE_VERTICAL_INTEL 0x0 +#define CL_ME_LUMA_PREDICTOR_MODE_HORIZONTAL_INTEL 0x1 +#define CL_ME_LUMA_PREDICTOR_MODE_DC_INTEL 0x2 +#define CL_ME_LUMA_PREDICTOR_MODE_DIAGONAL_DOWN_LEFT_INTEL 0x3 + +#define CL_ME_LUMA_PREDICTOR_MODE_DIAGONAL_DOWN_RIGHT_INTEL 0x4 +#define CL_ME_LUMA_PREDICTOR_MODE_PLANE_INTEL 0x4 +#define CL_ME_LUMA_PREDICTOR_MODE_VERTICAL_RIGHT_INTEL 0x5 +#define CL_ME_LUMA_PREDICTOR_MODE_HORIZONTAL_DOWN_INTEL 0x6 +#define CL_ME_LUMA_PREDICTOR_MODE_VERTICAL_LEFT_INTEL 0x7 +#define CL_ME_LUMA_PREDICTOR_MODE_HORIZONTAL_UP_INTEL 0x8 + +#define CL_ME_CHROMA_PREDICTOR_MODE_DC_INTEL 0x0 +#define CL_ME_CHROMA_PREDICTOR_MODE_HORIZONTAL_INTEL 0x1 +#define CL_ME_CHROMA_PREDICTOR_MODE_VERTICAL_INTEL 0x2 +#define CL_ME_CHROMA_PREDICTOR_MODE_PLANE_INTEL 0x3 + +/* cl_device_info */ +#define CL_DEVICE_ME_VERSION_INTEL 0x407E + +#define CL_ME_VERSION_LEGACY_INTEL 0x0 +#define CL_ME_VERSION_ADVANCED_VER_1_INTEL 0x1 +#define CL_ME_VERSION_ADVANCED_VER_2_INTEL 0x2 + +extern CL_API_ENTRY cl_accelerator_intel CL_API_CALL +clCreateAcceleratorINTEL( + cl_context /* context */, + cl_accelerator_type_intel /* accelerator_type */, + size_t /* descriptor_size */, + const void* /* descriptor */, + cl_int* /* errcode_ret */) CL_EXT_SUFFIX__VERSION_1_2; + +typedef CL_API_ENTRY cl_accelerator_intel (CL_API_CALL *clCreateAcceleratorINTEL_fn)( + cl_context /* context */, + cl_accelerator_type_intel /* accelerator_type */, + size_t /* descriptor_size */, + const void* /* descriptor */, + cl_int* /* errcode_ret */) CL_EXT_SUFFIX__VERSION_1_2; + +extern CL_API_ENTRY cl_int CL_API_CALL +clGetAcceleratorInfoINTEL( + cl_accelerator_intel /* accelerator */, + cl_accelerator_info_intel /* param_name */, + size_t /* param_value_size */, + void* /* param_value */, + size_t* /* param_value_size_ret */) CL_EXT_SUFFIX__VERSION_1_2; + +typedef CL_API_ENTRY cl_int (CL_API_CALL 
*clGetAcceleratorInfoINTEL_fn)( + cl_accelerator_intel /* accelerator */, + cl_accelerator_info_intel /* param_name */, + size_t /* param_value_size */, + void* /* param_value */, + size_t* /* param_value_size_ret */) CL_EXT_SUFFIX__VERSION_1_2; + +extern CL_API_ENTRY cl_int CL_API_CALL +clRetainAcceleratorINTEL( + cl_accelerator_intel /* accelerator */) CL_EXT_SUFFIX__VERSION_1_2; + +typedef CL_API_ENTRY cl_int (CL_API_CALL *clRetainAcceleratorINTEL_fn)( + cl_accelerator_intel /* accelerator */) CL_EXT_SUFFIX__VERSION_1_2; + +extern CL_API_ENTRY cl_int CL_API_CALL +clReleaseAcceleratorINTEL( + cl_accelerator_intel /* accelerator */) CL_EXT_SUFFIX__VERSION_1_2; + +typedef CL_API_ENTRY cl_int (CL_API_CALL *clReleaseAcceleratorINTEL_fn)( + cl_accelerator_intel /* accelerator */) CL_EXT_SUFFIX__VERSION_1_2; + +/****************************************** +* cl_intel_simultaneous_sharing extension * +*******************************************/ + +#define cl_intel_simultaneous_sharing 1 + +#define CL_DEVICE_SIMULTANEOUS_INTEROPS_INTEL 0x4104 +#define CL_DEVICE_NUM_SIMULTANEOUS_INTEROPS_INTEL 0x4105 + +/*********************************** +* cl_intel_egl_image_yuv extension * +************************************/ + +#define cl_intel_egl_image_yuv 1 + +#define CL_EGL_YUV_PLANE_INTEL 0x4107 + +/******************************** +* cl_intel_packed_yuv extension * +*********************************/ + +#define cl_intel_packed_yuv 1 + +#define CL_YUYV_INTEL 0x4076 +#define CL_UYVY_INTEL 0x4077 +#define CL_YVYU_INTEL 0x4078 +#define CL_VYUY_INTEL 0x4079 + +/******************************************** +* cl_intel_required_subgroup_size extension * +*********************************************/ + +#define cl_intel_required_subgroup_size 1 + +#define CL_DEVICE_SUB_GROUP_SIZES_INTEL 0x4108 +#define CL_KERNEL_SPILL_MEM_SIZE_INTEL 0x4109 +#define CL_KERNEL_COMPILE_SUB_GROUP_SIZE_INTEL 0x410A + +/**************************************** +* cl_intel_driver_diagnostics extension * 
+*****************************************/ + +#define cl_intel_driver_diagnostics 1 + +typedef cl_uint cl_diagnostics_verbose_level; + +#define CL_CONTEXT_SHOW_DIAGNOSTICS_INTEL 0x4106 + +#define CL_CONTEXT_DIAGNOSTICS_LEVEL_ALL_INTEL ( 0xff ) +#define CL_CONTEXT_DIAGNOSTICS_LEVEL_GOOD_INTEL ( 1 ) +#define CL_CONTEXT_DIAGNOSTICS_LEVEL_BAD_INTEL ( 1 << 1 ) +#define CL_CONTEXT_DIAGNOSTICS_LEVEL_NEUTRAL_INTEL ( 1 << 2 ) + +/******************************** +* cl_intel_planar_yuv extension * +*********************************/ + +#define CL_NV12_INTEL 0x410E + +#define CL_MEM_NO_ACCESS_INTEL ( 1 << 24 ) +#define CL_MEM_ACCESS_FLAGS_UNRESTRICTED_INTEL ( 1 << 25 ) + +#define CL_DEVICE_PLANAR_YUV_MAX_WIDTH_INTEL 0x417E +#define CL_DEVICE_PLANAR_YUV_MAX_HEIGHT_INTEL 0x417F + +/******************************************************* +* cl_intel_device_side_avc_motion_estimation extension * +********************************************************/ + +#define CL_DEVICE_AVC_ME_VERSION_INTEL 0x410B +#define CL_DEVICE_AVC_ME_SUPPORTS_TEXTURE_SAMPLER_USE_INTEL 0x410C +#define CL_DEVICE_AVC_ME_SUPPORTS_PREEMPTION_INTEL 0x410D + +#define CL_AVC_ME_VERSION_0_INTEL 0x0; // No support. +#define CL_AVC_ME_VERSION_1_INTEL 0x1; // First supported version. 
+ +#define CL_AVC_ME_MAJOR_16x16_INTEL 0x0 +#define CL_AVC_ME_MAJOR_16x8_INTEL 0x1 +#define CL_AVC_ME_MAJOR_8x16_INTEL 0x2 +#define CL_AVC_ME_MAJOR_8x8_INTEL 0x3 + +#define CL_AVC_ME_MINOR_8x8_INTEL 0x0 +#define CL_AVC_ME_MINOR_8x4_INTEL 0x1 +#define CL_AVC_ME_MINOR_4x8_INTEL 0x2 +#define CL_AVC_ME_MINOR_4x4_INTEL 0x3 + +#define CL_AVC_ME_MAJOR_FORWARD_INTEL 0x0 +#define CL_AVC_ME_MAJOR_BACKWARD_INTEL 0x1 +#define CL_AVC_ME_MAJOR_BIDIRECTIONAL_INTEL 0x2 + +#define CL_AVC_ME_PARTITION_MASK_ALL_INTEL 0x0 +#define CL_AVC_ME_PARTITION_MASK_16x16_INTEL 0x7E +#define CL_AVC_ME_PARTITION_MASK_16x8_INTEL 0x7D +#define CL_AVC_ME_PARTITION_MASK_8x16_INTEL 0x7B +#define CL_AVC_ME_PARTITION_MASK_8x8_INTEL 0x77 +#define CL_AVC_ME_PARTITION_MASK_8x4_INTEL 0x6F +#define CL_AVC_ME_PARTITION_MASK_4x8_INTEL 0x5F +#define CL_AVC_ME_PARTITION_MASK_4x4_INTEL 0x3F + +#define CL_AVC_ME_SEARCH_WINDOW_EXHAUSTIVE_INTEL 0x0 +#define CL_AVC_ME_SEARCH_WINDOW_SMALL_INTEL 0x1 +#define CL_AVC_ME_SEARCH_WINDOW_TINY_INTEL 0x2 +#define CL_AVC_ME_SEARCH_WINDOW_EXTRA_TINY_INTEL 0x3 +#define CL_AVC_ME_SEARCH_WINDOW_DIAMOND_INTEL 0x4 +#define CL_AVC_ME_SEARCH_WINDOW_LARGE_DIAMOND_INTEL 0x5 +#define CL_AVC_ME_SEARCH_WINDOW_RESERVED0_INTEL 0x6 +#define CL_AVC_ME_SEARCH_WINDOW_RESERVED1_INTEL 0x7 +#define CL_AVC_ME_SEARCH_WINDOW_CUSTOM_INTEL 0x8 +#define CL_AVC_ME_SEARCH_WINDOW_16x12_RADIUS_INTEL 0x9 +#define CL_AVC_ME_SEARCH_WINDOW_4x4_RADIUS_INTEL 0x2 +#define CL_AVC_ME_SEARCH_WINDOW_2x2_RADIUS_INTEL 0xa + +#define CL_AVC_ME_SAD_ADJUST_MODE_NONE_INTEL 0x0 +#define CL_AVC_ME_SAD_ADJUST_MODE_HAAR_INTEL 0x2 + +#define CL_AVC_ME_SUBPIXEL_MODE_INTEGER_INTEL 0x0 +#define CL_AVC_ME_SUBPIXEL_MODE_HPEL_INTEL 0x1 +#define CL_AVC_ME_SUBPIXEL_MODE_QPEL_INTEL 0x3 + +#define CL_AVC_ME_COST_PRECISION_QPEL_INTEL 0x0 +#define CL_AVC_ME_COST_PRECISION_HPEL_INTEL 0x1 +#define CL_AVC_ME_COST_PRECISION_PEL_INTEL 0x2 +#define CL_AVC_ME_COST_PRECISION_DPEL_INTEL 0x3 + +#define CL_AVC_ME_BIDIR_WEIGHT_QUARTER_INTEL 0x10 +#define 
CL_AVC_ME_BIDIR_WEIGHT_THIRD_INTEL 0x15 +#define CL_AVC_ME_BIDIR_WEIGHT_HALF_INTEL 0x20 +#define CL_AVC_ME_BIDIR_WEIGHT_TWO_THIRD_INTEL 0x2B +#define CL_AVC_ME_BIDIR_WEIGHT_THREE_QUARTER_INTEL 0x30 + +#define CL_AVC_ME_BORDER_REACHED_LEFT_INTEL 0x0 +#define CL_AVC_ME_BORDER_REACHED_RIGHT_INTEL 0x2 +#define CL_AVC_ME_BORDER_REACHED_TOP_INTEL 0x4 +#define CL_AVC_ME_BORDER_REACHED_BOTTOM_INTEL 0x8 + +#define CL_AVC_ME_SKIP_BLOCK_PARTITION_16x16_INTEL 0x0 +#define CL_AVC_ME_SKIP_BLOCK_PARTITION_8x8_INTEL 0x4000 + +#define CL_AVC_ME_SKIP_BLOCK_16x16_FORWARD_ENABLE_INTEL ( 0x1 << 24 ) +#define CL_AVC_ME_SKIP_BLOCK_16x16_BACKWARD_ENABLE_INTEL ( 0x2 << 24 ) +#define CL_AVC_ME_SKIP_BLOCK_16x16_DUAL_ENABLE_INTEL ( 0x3 << 24 ) +#define CL_AVC_ME_SKIP_BLOCK_8x8_FORWARD_ENABLE_INTEL ( 0x55 << 24 ) +#define CL_AVC_ME_SKIP_BLOCK_8x8_BACKWARD_ENABLE_INTEL ( 0xAA << 24 ) +#define CL_AVC_ME_SKIP_BLOCK_8x8_DUAL_ENABLE_INTEL ( 0xFF << 24 ) +#define CL_AVC_ME_SKIP_BLOCK_8x8_0_FORWARD_ENABLE_INTEL ( 0x1 << 24 ) +#define CL_AVC_ME_SKIP_BLOCK_8x8_0_BACKWARD_ENABLE_INTEL ( 0x2 << 24 ) +#define CL_AVC_ME_SKIP_BLOCK_8x8_1_FORWARD_ENABLE_INTEL ( 0x1 << 26 ) +#define CL_AVC_ME_SKIP_BLOCK_8x8_1_BACKWARD_ENABLE_INTEL ( 0x2 << 26 ) +#define CL_AVC_ME_SKIP_BLOCK_8x8_2_FORWARD_ENABLE_INTEL ( 0x1 << 28 ) +#define CL_AVC_ME_SKIP_BLOCK_8x8_2_BACKWARD_ENABLE_INTEL ( 0x2 << 28 ) +#define CL_AVC_ME_SKIP_BLOCK_8x8_3_FORWARD_ENABLE_INTEL ( 0x1 << 30 ) +#define CL_AVC_ME_SKIP_BLOCK_8x8_3_BACKWARD_ENABLE_INTEL ( 0x2 << 30 ) + +#define CL_AVC_ME_BLOCK_BASED_SKIP_4x4_INTEL 0x00 +#define CL_AVC_ME_BLOCK_BASED_SKIP_8x8_INTEL 0x80 + +#define CL_AVC_ME_INTRA_16x16_INTEL 0x0 +#define CL_AVC_ME_INTRA_8x8_INTEL 0x1 +#define CL_AVC_ME_INTRA_4x4_INTEL 0x2 + +#define CL_AVC_ME_INTRA_LUMA_PARTITION_MASK_16x16_INTEL 0x6 +#define CL_AVC_ME_INTRA_LUMA_PARTITION_MASK_8x8_INTEL 0x5 +#define CL_AVC_ME_INTRA_LUMA_PARTITION_MASK_4x4_INTEL 0x3 + +#define CL_AVC_ME_INTRA_NEIGHBOR_LEFT_MASK_ENABLE_INTEL 0x60 +#define 
CL_AVC_ME_INTRA_NEIGHBOR_UPPER_MASK_ENABLE_INTEL 0x10 +#define CL_AVC_ME_INTRA_NEIGHBOR_UPPER_RIGHT_MASK_ENABLE_INTEL 0x8 +#define CL_AVC_ME_INTRA_NEIGHBOR_UPPER_LEFT_MASK_ENABLE_INTEL 0x4 + +#define CL_AVC_ME_LUMA_PREDICTOR_MODE_VERTICAL_INTEL 0x0 +#define CL_AVC_ME_LUMA_PREDICTOR_MODE_HORIZONTAL_INTEL 0x1 +#define CL_AVC_ME_LUMA_PREDICTOR_MODE_DC_INTEL 0x2 +#define CL_AVC_ME_LUMA_PREDICTOR_MODE_DIAGONAL_DOWN_LEFT_INTEL 0x3 +#define CL_AVC_ME_LUMA_PREDICTOR_MODE_DIAGONAL_DOWN_RIGHT_INTEL 0x4 +#define CL_AVC_ME_LUMA_PREDICTOR_MODE_PLANE_INTEL 0x4 +#define CL_AVC_ME_LUMA_PREDICTOR_MODE_VERTICAL_RIGHT_INTEL 0x5 +#define CL_AVC_ME_LUMA_PREDICTOR_MODE_HORIZONTAL_DOWN_INTEL 0x6 +#define CL_AVC_ME_LUMA_PREDICTOR_MODE_VERTICAL_LEFT_INTEL 0x7 +#define CL_AVC_ME_LUMA_PREDICTOR_MODE_HORIZONTAL_UP_INTEL 0x8 +#define CL_AVC_ME_CHROMA_PREDICTOR_MODE_DC_INTEL 0x0 +#define CL_AVC_ME_CHROMA_PREDICTOR_MODE_HORIZONTAL_INTEL 0x1 +#define CL_AVC_ME_CHROMA_PREDICTOR_MODE_VERTICAL_INTEL 0x2 +#define CL_AVC_ME_CHROMA_PREDICTOR_MODE_PLANE_INTEL 0x3 + +#define CL_AVC_ME_FRAME_FORWARD_INTEL 0x1 +#define CL_AVC_ME_FRAME_BACKWARD_INTEL 0x2 +#define CL_AVC_ME_FRAME_DUAL_INTEL 0x3 + +#define CL_AVC_ME_SLICE_TYPE_PRED_INTEL 0x0 +#define CL_AVC_ME_SLICE_TYPE_BPRED_INTEL 0x1 +#define CL_AVC_ME_SLICE_TYPE_INTRA_INTEL 0x2 + +#define CL_AVC_ME_INTERLACED_SCAN_TOP_FIELD_INTEL 0x0 +#define CL_AVC_ME_INTERLACED_SCAN_BOTTOM_FIELD_INTEL 0x1 + +#ifdef __cplusplus +} +#endif + +#endif /* __CL_EXT_INTEL_H */ + diff --git a/third_party/opencl/OpenCL-Headers/CL/cl_gl.h b/third_party/opencl/OpenCL-Headers/CL/cl_gl.h new file mode 100644 index 0000000000000000000000000000000000000000..58b6449f9b4e98d561ee9a6f8b3daa6caede9f44 --- /dev/null +++ b/third_party/opencl/OpenCL-Headers/CL/cl_gl.h @@ -0,0 +1,175 @@ +/********************************************************************************** + * Copyright (c) 2008-2018 The Khronos Group Inc. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and/or associated documentation files (the + * "Materials"), to deal in the Materials without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sublicense, and/or sell copies of the Materials, and to + * permit persons to whom the Materials are furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be included + * in all copies or substantial portions of the Materials. + * + * MODIFICATIONS TO THIS FILE MAY MEAN IT NO LONGER ACCURATELY REFLECTS + * KHRONOS STANDARDS. THE UNMODIFIED, NORMATIVE VERSIONS OF KHRONOS + * SPECIFICATIONS AND HEADER INFORMATION ARE LOCATED AT + * https://www.khronos.org/registry/ + * + * THE MATERIALS ARE PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY + * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * MATERIALS OR THE USE OR OTHER DEALINGS IN THE MATERIALS. 
+ **********************************************************************************/ + +#ifndef __OPENCL_CL_GL_H +#define __OPENCL_CL_GL_H + +#ifdef __APPLE__ +#include +#else +#include +#endif + +#ifdef __cplusplus +extern "C" { +#endif + +typedef cl_uint cl_gl_object_type; +typedef cl_uint cl_gl_texture_info; +typedef cl_uint cl_gl_platform_info; +typedef struct __GLsync *cl_GLsync; + +/* cl_gl_object_type = 0x2000 - 0x200F enum values are currently taken */ +#define CL_GL_OBJECT_BUFFER 0x2000 +#define CL_GL_OBJECT_TEXTURE2D 0x2001 +#define CL_GL_OBJECT_TEXTURE3D 0x2002 +#define CL_GL_OBJECT_RENDERBUFFER 0x2003 +#ifdef CL_VERSION_1_2 +#define CL_GL_OBJECT_TEXTURE2D_ARRAY 0x200E +#define CL_GL_OBJECT_TEXTURE1D 0x200F +#define CL_GL_OBJECT_TEXTURE1D_ARRAY 0x2010 +#define CL_GL_OBJECT_TEXTURE_BUFFER 0x2011 +#endif + +/* cl_gl_texture_info */ +#define CL_GL_TEXTURE_TARGET 0x2004 +#define CL_GL_MIPMAP_LEVEL 0x2005 +#ifdef CL_VERSION_1_2 +#define CL_GL_NUM_SAMPLES 0x2012 +#endif + + +extern CL_API_ENTRY cl_mem CL_API_CALL +clCreateFromGLBuffer(cl_context /* context */, + cl_mem_flags /* flags */, + cl_GLuint /* bufobj */, + int * /* errcode_ret */) CL_API_SUFFIX__VERSION_1_0; + +#ifdef CL_VERSION_1_2 + +extern CL_API_ENTRY cl_mem CL_API_CALL +clCreateFromGLTexture(cl_context /* context */, + cl_mem_flags /* flags */, + cl_GLenum /* target */, + cl_GLint /* miplevel */, + cl_GLuint /* texture */, + cl_int * /* errcode_ret */) CL_API_SUFFIX__VERSION_1_2; + +#endif + +extern CL_API_ENTRY cl_mem CL_API_CALL +clCreateFromGLRenderbuffer(cl_context /* context */, + cl_mem_flags /* flags */, + cl_GLuint /* renderbuffer */, + cl_int * /* errcode_ret */) CL_API_SUFFIX__VERSION_1_0; + +extern CL_API_ENTRY cl_int CL_API_CALL +clGetGLObjectInfo(cl_mem /* memobj */, + cl_gl_object_type * /* gl_object_type */, + cl_GLuint * /* gl_object_name */) CL_API_SUFFIX__VERSION_1_0; + +extern CL_API_ENTRY cl_int CL_API_CALL +clGetGLTextureInfo(cl_mem /* memobj */, + cl_gl_texture_info /* 
param_name */, + size_t /* param_value_size */, + void * /* param_value */, + size_t * /* param_value_size_ret */) CL_API_SUFFIX__VERSION_1_0; + +extern CL_API_ENTRY cl_int CL_API_CALL +clEnqueueAcquireGLObjects(cl_command_queue /* command_queue */, + cl_uint /* num_objects */, + const cl_mem * /* mem_objects */, + cl_uint /* num_events_in_wait_list */, + const cl_event * /* event_wait_list */, + cl_event * /* event */) CL_API_SUFFIX__VERSION_1_0; + +extern CL_API_ENTRY cl_int CL_API_CALL +clEnqueueReleaseGLObjects(cl_command_queue /* command_queue */, + cl_uint /* num_objects */, + const cl_mem * /* mem_objects */, + cl_uint /* num_events_in_wait_list */, + const cl_event * /* event_wait_list */, + cl_event * /* event */) CL_API_SUFFIX__VERSION_1_0; + + +/* Deprecated OpenCL 1.1 APIs */ +extern CL_API_ENTRY CL_EXT_PREFIX__VERSION_1_1_DEPRECATED cl_mem CL_API_CALL +clCreateFromGLTexture2D(cl_context /* context */, + cl_mem_flags /* flags */, + cl_GLenum /* target */, + cl_GLint /* miplevel */, + cl_GLuint /* texture */, + cl_int * /* errcode_ret */) CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED; + +extern CL_API_ENTRY CL_EXT_PREFIX__VERSION_1_1_DEPRECATED cl_mem CL_API_CALL +clCreateFromGLTexture3D(cl_context /* context */, + cl_mem_flags /* flags */, + cl_GLenum /* target */, + cl_GLint /* miplevel */, + cl_GLuint /* texture */, + cl_int * /* errcode_ret */) CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED; + +/* cl_khr_gl_sharing extension */ + +#define cl_khr_gl_sharing 1 + +typedef cl_uint cl_gl_context_info; + +/* Additional Error Codes */ +#define CL_INVALID_GL_SHAREGROUP_REFERENCE_KHR -1000 + +/* cl_gl_context_info */ +#define CL_CURRENT_DEVICE_FOR_GL_CONTEXT_KHR 0x2006 +#define CL_DEVICES_FOR_GL_CONTEXT_KHR 0x2007 + +/* Additional cl_context_properties */ +#define CL_GL_CONTEXT_KHR 0x2008 +#define CL_EGL_DISPLAY_KHR 0x2009 +#define CL_GLX_DISPLAY_KHR 0x200A +#define CL_WGL_HDC_KHR 0x200B +#define CL_CGL_SHAREGROUP_KHR 0x200C + +extern CL_API_ENTRY cl_int CL_API_CALL 
+clGetGLContextInfoKHR(const cl_context_properties * /* properties */, + cl_gl_context_info /* param_name */, + size_t /* param_value_size */, + void * /* param_value */, + size_t * /* param_value_size_ret */) CL_API_SUFFIX__VERSION_1_0; + +typedef CL_API_ENTRY cl_int (CL_API_CALL *clGetGLContextInfoKHR_fn)( + const cl_context_properties * properties, + cl_gl_context_info param_name, + size_t param_value_size, + void * param_value, + size_t * param_value_size_ret); + +#ifdef __cplusplus +} +#endif + +#endif /* __OPENCL_CL_GL_H */ diff --git a/third_party/opencl/OpenCL-Headers/CL/cl_gl_ext.h b/third_party/opencl/OpenCL-Headers/CL/cl_gl_ext.h new file mode 100644 index 0000000000000000000000000000000000000000..e3c14c6408c44160103bcb4c0dcd230a674643a5 --- /dev/null +++ b/third_party/opencl/OpenCL-Headers/CL/cl_gl_ext.h @@ -0,0 +1,74 @@ +/********************************************************************************** + * Copyright (c) 2008-2015 The Khronos Group Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and/or associated documentation files (the + * "Materials"), to deal in the Materials without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sublicense, and/or sell copies of the Materials, and to + * permit persons to whom the Materials are furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be included + * in all copies or substantial portions of the Materials. + * + * MODIFICATIONS TO THIS FILE MAY MEAN IT NO LONGER ACCURATELY REFLECTS + * KHRONOS STANDARDS. 
THE UNMODIFIED, NORMATIVE VERSIONS OF KHRONOS + * SPECIFICATIONS AND HEADER INFORMATION ARE LOCATED AT + * https://www.khronos.org/registry/ + * + * THE MATERIALS ARE PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY + * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * MATERIALS OR THE USE OR OTHER DEALINGS IN THE MATERIALS. + **********************************************************************************/ + +/* $Revision: 11708 $ on $Date: 2010-06-13 23:36:24 -0700 (Sun, 13 Jun 2010) $ */ + +/* cl_gl_ext.h contains vendor (non-KHR) OpenCL extensions which have */ +/* OpenGL dependencies. */ + +#ifndef __OPENCL_CL_GL_EXT_H +#define __OPENCL_CL_GL_EXT_H + +#ifdef __cplusplus +extern "C" { +#endif + +#ifdef __APPLE__ + #include +#else + #include +#endif + +/* + * For each extension, follow this template + * cl_VEN_extname extension */ +/* #define cl_VEN_extname 1 + * ... define new types, if any + * ... define new tokens, if any + * ... define new APIs, if any + * + * If you need GLtypes here, mirror them with a cl_GLtype, rather than including a GL header + * This allows us to avoid having to decide whether to include GL headers or GLES here. 
+ */ + +/* + * cl_khr_gl_event extension + * See section 9.9 in the OpenCL 1.1 spec for more information + */ +#define CL_COMMAND_GL_FENCE_SYNC_OBJECT_KHR 0x200D + +extern CL_API_ENTRY cl_event CL_API_CALL +clCreateEventFromGLsyncKHR(cl_context /* context */, + cl_GLsync /* cl_GLsync */, + cl_int * /* errcode_ret */) CL_EXT_SUFFIX__VERSION_1_1; + +#ifdef __cplusplus +} +#endif + +#endif /* __OPENCL_CL_GL_EXT_H */ diff --git a/third_party/opencl/OpenCL-Headers/CL/cl_platform.h b/third_party/opencl/OpenCL-Headers/CL/cl_platform.h new file mode 100644 index 0000000000000000000000000000000000000000..c2f408fed59fc42f9c2573061704610498890b40 --- /dev/null +++ b/third_party/opencl/OpenCL-Headers/CL/cl_platform.h @@ -0,0 +1,1460 @@ +/********************************************************************************** + * Copyright (c) 2008-2018 The Khronos Group Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and/or associated documentation files (the + * "Materials"), to deal in the Materials without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sublicense, and/or sell copies of the Materials, and to + * permit persons to whom the Materials are furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be included + * in all copies or substantial portions of the Materials. + * + * MODIFICATIONS TO THIS FILE MAY MEAN IT NO LONGER ACCURATELY REFLECTS + * KHRONOS STANDARDS. THE UNMODIFIED, NORMATIVE VERSIONS OF KHRONOS + * SPECIFICATIONS AND HEADER INFORMATION ARE LOCATED AT + * https://www.khronos.org/registry/ + * + * THE MATERIALS ARE PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
+ * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY + * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * MATERIALS OR THE USE OR OTHER DEALINGS IN THE MATERIALS. + **********************************************************************************/ + +#ifndef __CL_PLATFORM_H +#define __CL_PLATFORM_H + +#ifdef __APPLE__ + #include + + /* Contains #defines for AVAILABLE_MAC_OS_X_VERSION_10_6_AND_LATER below */ + #include +#else + #include +#endif + +#ifdef __cplusplus +extern "C" { +#endif + +#if defined(_WIN32) + #define CL_API_ENTRY + #define CL_API_CALL __stdcall + #define CL_CALLBACK __stdcall +#else + #define CL_API_ENTRY + #define CL_API_CALL + #define CL_CALLBACK +#endif + +/* + * Deprecation flags refer to the last version of the header in which the + * feature was not deprecated. + * + * E.g. VERSION_1_1_DEPRECATED means the feature is present in 1.1 without + * deprecation but is deprecated in versions later than 1.1. 
+ */ + +#ifdef __APPLE__ + #define CL_EXTENSION_WEAK_LINK __attribute__((weak_import)) + #define CL_API_SUFFIX__VERSION_1_0 AVAILABLE_MAC_OS_X_VERSION_10_6_AND_LATER + #define CL_EXT_SUFFIX__VERSION_1_0 CL_EXTENSION_WEAK_LINK AVAILABLE_MAC_OS_X_VERSION_10_6_AND_LATER + #define CL_API_SUFFIX__VERSION_1_1 AVAILABLE_MAC_OS_X_VERSION_10_7_AND_LATER + #define GCL_API_SUFFIX__VERSION_1_1 AVAILABLE_MAC_OS_X_VERSION_10_7_AND_LATER + #define CL_EXT_SUFFIX__VERSION_1_1 CL_EXTENSION_WEAK_LINK AVAILABLE_MAC_OS_X_VERSION_10_7_AND_LATER + #define CL_EXT_SUFFIX__VERSION_1_0_DEPRECATED CL_EXTENSION_WEAK_LINK AVAILABLE_MAC_OS_X_VERSION_10_6_AND_LATER_BUT_DEPRECATED_IN_MAC_OS_X_VERSION_10_7 + + #ifdef AVAILABLE_MAC_OS_X_VERSION_10_8_AND_LATER + #define CL_API_SUFFIX__VERSION_1_2 AVAILABLE_MAC_OS_X_VERSION_10_8_AND_LATER + #define GCL_API_SUFFIX__VERSION_1_2 AVAILABLE_MAC_OS_X_VERSION_10_8_AND_LATER + #define CL_EXT_SUFFIX__VERSION_1_2 CL_EXTENSION_WEAK_LINK AVAILABLE_MAC_OS_X_VERSION_10_8_AND_LATER + #define CL_EXT_PREFIX__VERSION_1_1_DEPRECATED + #define CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED CL_EXTENSION_WEAK_LINK AVAILABLE_MAC_OS_X_VERSION_10_7_AND_LATER_BUT_DEPRECATED_IN_MAC_OS_X_VERSION_10_8 + #else + #warning This path should never happen outside of internal operating system development. AvailabilityMacros do not function correctly here! 
+ #define CL_API_SUFFIX__VERSION_1_2 AVAILABLE_MAC_OS_X_VERSION_10_7_AND_LATER + #define GCL_API_SUFFIX__VERSION_1_2 AVAILABLE_MAC_OS_X_VERSION_10_7_AND_LATER + #define CL_EXT_SUFFIX__VERSION_1_2 CL_EXTENSION_WEAK_LINK AVAILABLE_MAC_OS_X_VERSION_10_7_AND_LATER + #define CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED CL_EXTENSION_WEAK_LINK AVAILABLE_MAC_OS_X_VERSION_10_7_AND_LATER + #endif +#else + #define CL_EXTENSION_WEAK_LINK + #define CL_API_SUFFIX__VERSION_1_0 + #define CL_EXT_SUFFIX__VERSION_1_0 + #define CL_API_SUFFIX__VERSION_1_1 + #define CL_EXT_SUFFIX__VERSION_1_1 + #define CL_API_SUFFIX__VERSION_1_2 + #define CL_EXT_SUFFIX__VERSION_1_2 + #define CL_API_SUFFIX__VERSION_2_0 + #define CL_EXT_SUFFIX__VERSION_2_0 + #define CL_API_SUFFIX__VERSION_2_1 + #define CL_EXT_SUFFIX__VERSION_2_1 + #define CL_API_SUFFIX__VERSION_2_2 + #define CL_EXT_SUFFIX__VERSION_2_2 + + #ifdef __GNUC__ + #ifdef CL_USE_DEPRECATED_OPENCL_1_0_APIS + #define CL_EXT_SUFFIX__VERSION_1_0_DEPRECATED + #define CL_EXT_PREFIX__VERSION_1_0_DEPRECATED + #else + #define CL_EXT_SUFFIX__VERSION_1_0_DEPRECATED __attribute__((deprecated)) + #define CL_EXT_PREFIX__VERSION_1_0_DEPRECATED + #endif + + #ifdef CL_USE_DEPRECATED_OPENCL_1_1_APIS + #define CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED + #define CL_EXT_PREFIX__VERSION_1_1_DEPRECATED + #else + #define CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED __attribute__((deprecated)) + #define CL_EXT_PREFIX__VERSION_1_1_DEPRECATED + #endif + + #ifdef CL_USE_DEPRECATED_OPENCL_1_2_APIS + #define CL_EXT_SUFFIX__VERSION_1_2_DEPRECATED + #define CL_EXT_PREFIX__VERSION_1_2_DEPRECATED + #else + #define CL_EXT_SUFFIX__VERSION_1_2_DEPRECATED __attribute__((deprecated)) + #define CL_EXT_PREFIX__VERSION_1_2_DEPRECATED + #endif + + #ifdef CL_USE_DEPRECATED_OPENCL_2_0_APIS + #define CL_EXT_SUFFIX__VERSION_2_0_DEPRECATED + #define CL_EXT_PREFIX__VERSION_2_0_DEPRECATED + #else + #define CL_EXT_SUFFIX__VERSION_2_0_DEPRECATED __attribute__((deprecated)) + #define 
CL_EXT_PREFIX__VERSION_2_0_DEPRECATED + #endif + + #ifdef CL_USE_DEPRECATED_OPENCL_2_1_APIS + #define CL_EXT_SUFFIX__VERSION_2_1_DEPRECATED + #define CL_EXT_PREFIX__VERSION_2_1_DEPRECATED + #else + #define CL_EXT_SUFFIX__VERSION_2_1_DEPRECATED __attribute__((deprecated)) + #define CL_EXT_PREFIX__VERSION_2_1_DEPRECATED + #endif + #elif defined(_WIN32) + #ifdef CL_USE_DEPRECATED_OPENCL_1_0_APIS + #define CL_EXT_SUFFIX__VERSION_1_0_DEPRECATED + #define CL_EXT_PREFIX__VERSION_1_0_DEPRECATED + #else + #define CL_EXT_SUFFIX__VERSION_1_0_DEPRECATED + #define CL_EXT_PREFIX__VERSION_1_0_DEPRECATED __declspec(deprecated) + #endif + + #ifdef CL_USE_DEPRECATED_OPENCL_1_1_APIS + #define CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED + #define CL_EXT_PREFIX__VERSION_1_1_DEPRECATED + #else + #define CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED + #define CL_EXT_PREFIX__VERSION_1_1_DEPRECATED __declspec(deprecated) + #endif + + #ifdef CL_USE_DEPRECATED_OPENCL_1_2_APIS + #define CL_EXT_SUFFIX__VERSION_1_2_DEPRECATED + #define CL_EXT_PREFIX__VERSION_1_2_DEPRECATED + #else + #define CL_EXT_SUFFIX__VERSION_1_2_DEPRECATED + #define CL_EXT_PREFIX__VERSION_1_2_DEPRECATED __declspec(deprecated) + #endif + + #ifdef CL_USE_DEPRECATED_OPENCL_2_0_APIS + #define CL_EXT_SUFFIX__VERSION_2_0_DEPRECATED + #define CL_EXT_PREFIX__VERSION_2_0_DEPRECATED + #else + #define CL_EXT_SUFFIX__VERSION_2_0_DEPRECATED + #define CL_EXT_PREFIX__VERSION_2_0_DEPRECATED __declspec(deprecated) + #endif + + #ifdef CL_USE_DEPRECATED_OPENCL_2_1_APIS + #define CL_EXT_SUFFIX__VERSION_2_1_DEPRECATED + #define CL_EXT_PREFIX__VERSION_2_1_DEPRECATED + #else + #define CL_EXT_SUFFIX__VERSION_2_1_DEPRECATED + #define CL_EXT_PREFIX__VERSION_2_1_DEPRECATED __declspec(deprecated) + #endif + #else + #define CL_EXT_SUFFIX__VERSION_1_0_DEPRECATED + #define CL_EXT_PREFIX__VERSION_1_0_DEPRECATED + + #define CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED + #define CL_EXT_PREFIX__VERSION_1_1_DEPRECATED + + #define CL_EXT_SUFFIX__VERSION_1_2_DEPRECATED + #define 
CL_EXT_PREFIX__VERSION_1_2_DEPRECATED + + #define CL_EXT_SUFFIX__VERSION_2_0_DEPRECATED + #define CL_EXT_PREFIX__VERSION_2_0_DEPRECATED + + #define CL_EXT_SUFFIX__VERSION_2_1_DEPRECATED + #define CL_EXT_PREFIX__VERSION_2_1_DEPRECATED + #endif +#endif + +#if (defined (_WIN32) && defined(_MSC_VER)) + +/* scalar types */ +typedef signed __int8 cl_char; +typedef unsigned __int8 cl_uchar; +typedef signed __int16 cl_short; +typedef unsigned __int16 cl_ushort; +typedef signed __int32 cl_int; +typedef unsigned __int32 cl_uint; +typedef signed __int64 cl_long; +typedef unsigned __int64 cl_ulong; + +typedef unsigned __int16 cl_half; +typedef float cl_float; +typedef double cl_double; + +/* Macro names and corresponding values defined by OpenCL */ +#define CL_CHAR_BIT 8 +#define CL_SCHAR_MAX 127 +#define CL_SCHAR_MIN (-127-1) +#define CL_CHAR_MAX CL_SCHAR_MAX +#define CL_CHAR_MIN CL_SCHAR_MIN +#define CL_UCHAR_MAX 255 +#define CL_SHRT_MAX 32767 +#define CL_SHRT_MIN (-32767-1) +#define CL_USHRT_MAX 65535 +#define CL_INT_MAX 2147483647 +#define CL_INT_MIN (-2147483647-1) +#define CL_UINT_MAX 0xffffffffU +#define CL_LONG_MAX ((cl_long) 0x7FFFFFFFFFFFFFFFLL) +#define CL_LONG_MIN ((cl_long) -0x7FFFFFFFFFFFFFFFLL - 1LL) +#define CL_ULONG_MAX ((cl_ulong) 0xFFFFFFFFFFFFFFFFULL) + +#define CL_FLT_DIG 6 +#define CL_FLT_MANT_DIG 24 +#define CL_FLT_MAX_10_EXP +38 +#define CL_FLT_MAX_EXP +128 +#define CL_FLT_MIN_10_EXP -37 +#define CL_FLT_MIN_EXP -125 +#define CL_FLT_RADIX 2 +#define CL_FLT_MAX 340282346638528859811704183484516925440.0f +#define CL_FLT_MIN 1.175494350822287507969e-38f +#define CL_FLT_EPSILON 1.1920928955078125e-7f + +#define CL_HALF_DIG 3 +#define CL_HALF_MANT_DIG 11 +#define CL_HALF_MAX_10_EXP +4 +#define CL_HALF_MAX_EXP +16 +#define CL_HALF_MIN_10_EXP -4 +#define CL_HALF_MIN_EXP -13 +#define CL_HALF_RADIX 2 +#define CL_HALF_MAX 65504.0f +#define CL_HALF_MIN 6.103515625e-05f +#define CL_HALF_EPSILON 9.765625e-04f + +#define CL_DBL_DIG 15 +#define CL_DBL_MANT_DIG 53 
+#define CL_DBL_MAX_10_EXP +308 +#define CL_DBL_MAX_EXP +1024 +#define CL_DBL_MIN_10_EXP -307 +#define CL_DBL_MIN_EXP -1021 +#define CL_DBL_RADIX 2 +#define CL_DBL_MAX 1.7976931348623158e+308 +#define CL_DBL_MIN 2.225073858507201383090e-308 +#define CL_DBL_EPSILON 2.220446049250313080847e-16 + +#define CL_M_E 2.7182818284590452354 +#define CL_M_LOG2E 1.4426950408889634074 +#define CL_M_LOG10E 0.43429448190325182765 +#define CL_M_LN2 0.69314718055994530942 +#define CL_M_LN10 2.30258509299404568402 +#define CL_M_PI 3.14159265358979323846 +#define CL_M_PI_2 1.57079632679489661923 +#define CL_M_PI_4 0.78539816339744830962 +#define CL_M_1_PI 0.31830988618379067154 +#define CL_M_2_PI 0.63661977236758134308 +#define CL_M_2_SQRTPI 1.12837916709551257390 +#define CL_M_SQRT2 1.41421356237309504880 +#define CL_M_SQRT1_2 0.70710678118654752440 + +#define CL_M_E_F 2.718281828f +#define CL_M_LOG2E_F 1.442695041f +#define CL_M_LOG10E_F 0.434294482f +#define CL_M_LN2_F 0.693147181f +#define CL_M_LN10_F 2.302585093f +#define CL_M_PI_F 3.141592654f +#define CL_M_PI_2_F 1.570796327f +#define CL_M_PI_4_F 0.785398163f +#define CL_M_1_PI_F 0.318309886f +#define CL_M_2_PI_F 0.636619772f +#define CL_M_2_SQRTPI_F 1.128379167f +#define CL_M_SQRT2_F 1.414213562f +#define CL_M_SQRT1_2_F 0.707106781f + +#define CL_NAN (CL_INFINITY - CL_INFINITY) +#define CL_HUGE_VALF ((cl_float) 1e50) +#define CL_HUGE_VAL ((cl_double) 1e500) +#define CL_MAXFLOAT CL_FLT_MAX +#define CL_INFINITY CL_HUGE_VALF + +#else + +#include + +/* scalar types */ +typedef int8_t cl_char; +typedef uint8_t cl_uchar; +typedef int16_t cl_short __attribute__((aligned(2))); +typedef uint16_t cl_ushort __attribute__((aligned(2))); +typedef int32_t cl_int __attribute__((aligned(4))); +typedef uint32_t cl_uint __attribute__((aligned(4))); +typedef int64_t cl_long __attribute__((aligned(8))); +typedef uint64_t cl_ulong __attribute__((aligned(8))); + +typedef uint16_t cl_half __attribute__((aligned(2))); +typedef float cl_float 
__attribute__((aligned(4))); +typedef double cl_double __attribute__((aligned(8))); + +/* Macro names and corresponding values defined by OpenCL */ +#define CL_CHAR_BIT 8 +#define CL_SCHAR_MAX 127 +#define CL_SCHAR_MIN (-127-1) +#define CL_CHAR_MAX CL_SCHAR_MAX +#define CL_CHAR_MIN CL_SCHAR_MIN +#define CL_UCHAR_MAX 255 +#define CL_SHRT_MAX 32767 +#define CL_SHRT_MIN (-32767-1) +#define CL_USHRT_MAX 65535 +#define CL_INT_MAX 2147483647 +#define CL_INT_MIN (-2147483647-1) +#define CL_UINT_MAX 0xffffffffU +#define CL_LONG_MAX ((cl_long) 0x7FFFFFFFFFFFFFFFLL) +#define CL_LONG_MIN ((cl_long) -0x7FFFFFFFFFFFFFFFLL - 1LL) +#define CL_ULONG_MAX ((cl_ulong) 0xFFFFFFFFFFFFFFFFULL) + +#define CL_FLT_DIG 6 +#define CL_FLT_MANT_DIG 24 +#define CL_FLT_MAX_10_EXP +38 +#define CL_FLT_MAX_EXP +128 +#define CL_FLT_MIN_10_EXP -37 +#define CL_FLT_MIN_EXP -125 +#define CL_FLT_RADIX 2 +#define CL_FLT_MAX 340282346638528859811704183484516925440.0f +#define CL_FLT_MIN 1.175494350822287507969e-38f +#define CL_FLT_EPSILON 1.1920928955078125e-7f + +#define CL_HALF_DIG 3 +#define CL_HALF_MANT_DIG 11 +#define CL_HALF_MAX_10_EXP +4 +#define CL_HALF_MAX_EXP +16 +#define CL_HALF_MIN_10_EXP -4 +#define CL_HALF_MIN_EXP -13 +#define CL_HALF_RADIX 2 +#define CL_HALF_MAX 65504.0f +#define CL_HALF_MIN 6.103515625e-05f +#define CL_HALF_EPSILON 9.765625e-04f + +#define CL_DBL_DIG 15 +#define CL_DBL_MANT_DIG 53 +#define CL_DBL_MAX_10_EXP +308 +#define CL_DBL_MAX_EXP +1024 +#define CL_DBL_MIN_10_EXP -307 +#define CL_DBL_MIN_EXP -1021 +#define CL_DBL_RADIX 2 +#define CL_DBL_MAX 179769313486231570814527423731704356798070567525844996598917476803157260780028538760589558632766878171540458953514382464234321326889464182768467546703537516986049910576551282076245490090389328944075868508455133942304583236903222948165808559332123348274797826204144723168738177180919299881250404026184124858368.0 +#define CL_DBL_MIN 2.225073858507201383090e-308 +#define CL_DBL_EPSILON 2.220446049250313080847e-16 + +#define CL_M_E 
2.7182818284590452354 +#define CL_M_LOG2E 1.4426950408889634074 +#define CL_M_LOG10E 0.43429448190325182765 +#define CL_M_LN2 0.69314718055994530942 +#define CL_M_LN10 2.30258509299404568402 +#define CL_M_PI 3.14159265358979323846 +#define CL_M_PI_2 1.57079632679489661923 +#define CL_M_PI_4 0.78539816339744830962 +#define CL_M_1_PI 0.31830988618379067154 +#define CL_M_2_PI 0.63661977236758134308 +#define CL_M_2_SQRTPI 1.12837916709551257390 +#define CL_M_SQRT2 1.41421356237309504880 +#define CL_M_SQRT1_2 0.70710678118654752440 + +#define CL_M_E_F 2.718281828f +#define CL_M_LOG2E_F 1.442695041f +#define CL_M_LOG10E_F 0.434294482f +#define CL_M_LN2_F 0.693147181f +#define CL_M_LN10_F 2.302585093f +#define CL_M_PI_F 3.141592654f +#define CL_M_PI_2_F 1.570796327f +#define CL_M_PI_4_F 0.785398163f +#define CL_M_1_PI_F 0.318309886f +#define CL_M_2_PI_F 0.636619772f +#define CL_M_2_SQRTPI_F 1.128379167f +#define CL_M_SQRT2_F 1.414213562f +#define CL_M_SQRT1_2_F 0.707106781f + +#if defined( __GNUC__ ) + #define CL_HUGE_VALF __builtin_huge_valf() + #define CL_HUGE_VAL __builtin_huge_val() + #define CL_NAN __builtin_nanf( "" ) +#else + #define CL_HUGE_VALF ((cl_float) 1e50) + #define CL_HUGE_VAL ((cl_double) 1e500) + float nanf( const char * ); + #define CL_NAN nanf( "" ) +#endif +#define CL_MAXFLOAT CL_FLT_MAX +#define CL_INFINITY CL_HUGE_VALF + +#endif + +#include + +/* Mirror types to GL types. Mirror types allow us to avoid deciding which 87s to load based on whether we are using GL or GLES here. */ +typedef unsigned int cl_GLuint; +typedef int cl_GLint; +typedef unsigned int cl_GLenum; + +/* + * Vector types + * + * Note: OpenCL requires that all types be naturally aligned. + * This means that vector types must be naturally aligned. + * For example, a vector of four floats must be aligned to + * a 16 byte boundary (calculated as 4 * the natural 4-byte + * alignment of the float). 
The alignment qualifiers here + * will only function properly if your compiler supports them + * and if you don't actively work to defeat them. For example, + * in order for a cl_float4 to be 16 byte aligned in a struct, + * the start of the struct must itself be 16-byte aligned. + * + * Maintaining proper alignment is the user's responsibility. + */ + +/* Define basic vector types */ +#if defined( __VEC__ ) + #include /* may be omitted depending on compiler. AltiVec spec provides no way to detect whether the header is required. */ + typedef vector unsigned char __cl_uchar16; + typedef vector signed char __cl_char16; + typedef vector unsigned short __cl_ushort8; + typedef vector signed short __cl_short8; + typedef vector unsigned int __cl_uint4; + typedef vector signed int __cl_int4; + typedef vector float __cl_float4; + #define __CL_UCHAR16__ 1 + #define __CL_CHAR16__ 1 + #define __CL_USHORT8__ 1 + #define __CL_SHORT8__ 1 + #define __CL_UINT4__ 1 + #define __CL_INT4__ 1 + #define __CL_FLOAT4__ 1 +#endif + +#if defined( __SSE__ ) + #if defined( __MINGW64__ ) + #include + #else + #include + #endif + #if defined( __GNUC__ ) + typedef float __cl_float4 __attribute__((vector_size(16))); + #else + typedef __m128 __cl_float4; + #endif + #define __CL_FLOAT4__ 1 +#endif + +#if defined( __SSE2__ ) + #if defined( __MINGW64__ ) + #include + #else + #include + #endif + #if defined( __GNUC__ ) + typedef cl_uchar __cl_uchar16 __attribute__((vector_size(16))); + typedef cl_char __cl_char16 __attribute__((vector_size(16))); + typedef cl_ushort __cl_ushort8 __attribute__((vector_size(16))); + typedef cl_short __cl_short8 __attribute__((vector_size(16))); + typedef cl_uint __cl_uint4 __attribute__((vector_size(16))); + typedef cl_int __cl_int4 __attribute__((vector_size(16))); + typedef cl_ulong __cl_ulong2 __attribute__((vector_size(16))); + typedef cl_long __cl_long2 __attribute__((vector_size(16))); + typedef cl_double __cl_double2 __attribute__((vector_size(16))); + #else + 
typedef __m128i __cl_uchar16; + typedef __m128i __cl_char16; + typedef __m128i __cl_ushort8; + typedef __m128i __cl_short8; + typedef __m128i __cl_uint4; + typedef __m128i __cl_int4; + typedef __m128i __cl_ulong2; + typedef __m128i __cl_long2; + typedef __m128d __cl_double2; + #endif + #define __CL_UCHAR16__ 1 + #define __CL_CHAR16__ 1 + #define __CL_USHORT8__ 1 + #define __CL_SHORT8__ 1 + #define __CL_INT4__ 1 + #define __CL_UINT4__ 1 + #define __CL_ULONG2__ 1 + #define __CL_LONG2__ 1 + #define __CL_DOUBLE2__ 1 +#endif + +#if defined( __MMX__ ) + #include + #if defined( __GNUC__ ) + typedef cl_uchar __cl_uchar8 __attribute__((vector_size(8))); + typedef cl_char __cl_char8 __attribute__((vector_size(8))); + typedef cl_ushort __cl_ushort4 __attribute__((vector_size(8))); + typedef cl_short __cl_short4 __attribute__((vector_size(8))); + typedef cl_uint __cl_uint2 __attribute__((vector_size(8))); + typedef cl_int __cl_int2 __attribute__((vector_size(8))); + typedef cl_ulong __cl_ulong1 __attribute__((vector_size(8))); + typedef cl_long __cl_long1 __attribute__((vector_size(8))); + typedef cl_float __cl_float2 __attribute__((vector_size(8))); + #else + typedef __m64 __cl_uchar8; + typedef __m64 __cl_char8; + typedef __m64 __cl_ushort4; + typedef __m64 __cl_short4; + typedef __m64 __cl_uint2; + typedef __m64 __cl_int2; + typedef __m64 __cl_ulong1; + typedef __m64 __cl_long1; + typedef __m64 __cl_float2; + #endif + #define __CL_UCHAR8__ 1 + #define __CL_CHAR8__ 1 + #define __CL_USHORT4__ 1 + #define __CL_SHORT4__ 1 + #define __CL_INT2__ 1 + #define __CL_UINT2__ 1 + #define __CL_ULONG1__ 1 + #define __CL_LONG1__ 1 + #define __CL_FLOAT2__ 1 +#endif + +#if defined( __AVX__ ) + #if defined( __MINGW64__ ) + #include + #else + #include + #endif + #if defined( __GNUC__ ) + typedef cl_float __cl_float8 __attribute__((vector_size(32))); + typedef cl_double __cl_double4 __attribute__((vector_size(32))); + #else + typedef __m256 __cl_float8; + typedef __m256d __cl_double4; + #endif 
+ #define __CL_FLOAT8__ 1 + #define __CL_DOUBLE4__ 1 +#endif + +/* Define capabilities for anonymous struct members. */ +#if !defined(__cplusplus) && defined(__STDC_VERSION__) && __STDC_VERSION__ >= 201112L +#define __CL_HAS_ANON_STRUCT__ 1 +#define __CL_ANON_STRUCT__ +#elif defined( __GNUC__) && ! defined( __STRICT_ANSI__ ) +#define __CL_HAS_ANON_STRUCT__ 1 +#define __CL_ANON_STRUCT__ __extension__ +#elif defined( _WIN32) && defined(_MSC_VER) + #if _MSC_VER >= 1500 + /* Microsoft Developer Studio 2008 supports anonymous structs, but + * complains by default. */ + #define __CL_HAS_ANON_STRUCT__ 1 + #define __CL_ANON_STRUCT__ + /* Disable warning C4201: nonstandard extension used : nameless + * struct/union */ + #pragma warning( push ) + #pragma warning( disable : 4201 ) + #endif +#else +#define __CL_HAS_ANON_STRUCT__ 0 +#define __CL_ANON_STRUCT__ +#endif + +/* Define alignment keys */ +#if defined( __GNUC__ ) + #define CL_ALIGNED(_x) __attribute__ ((aligned(_x))) +#elif defined( _WIN32) && (_MSC_VER) + /* Alignment keys neutered on windows because MSVC can't swallow function arguments with alignment requirements */ + /* http://msdn.microsoft.com/en-us/library/373ak2y1%28VS.71%29.aspx */ + /* #include */ + /* #define CL_ALIGNED(_x) _CRT_ALIGN(_x) */ + #define CL_ALIGNED(_x) +#else + #warning Need to implement some method to align data here + #define CL_ALIGNED(_x) +#endif + +/* Indicate whether .xyzw, .s0123 and .hi.lo are supported */ +#if __CL_HAS_ANON_STRUCT__ + /* .xyzw and .s0123...{f|F} are supported */ + #define CL_HAS_NAMED_VECTOR_FIELDS 1 + /* .hi and .lo are supported */ + #define CL_HAS_HI_LO_VECTOR_FIELDS 1 +#endif + +/* Define cl_vector types */ + +/* ---- cl_charn ---- */ +typedef union +{ + cl_char CL_ALIGNED(2) s[2]; +#if __CL_HAS_ANON_STRUCT__ + __CL_ANON_STRUCT__ struct{ cl_char x, y; }; + __CL_ANON_STRUCT__ struct{ cl_char s0, s1; }; + __CL_ANON_STRUCT__ struct{ cl_char lo, hi; }; +#endif +#if defined( __CL_CHAR2__) + __cl_char2 v2; +#endif 
+}cl_char2; + +typedef union +{ + cl_char CL_ALIGNED(4) s[4]; +#if __CL_HAS_ANON_STRUCT__ + __CL_ANON_STRUCT__ struct{ cl_char x, y, z, w; }; + __CL_ANON_STRUCT__ struct{ cl_char s0, s1, s2, s3; }; + __CL_ANON_STRUCT__ struct{ cl_char2 lo, hi; }; +#endif +#if defined( __CL_CHAR2__) + __cl_char2 v2[2]; +#endif +#if defined( __CL_CHAR4__) + __cl_char4 v4; +#endif +}cl_char4; + +/* cl_char3 is identical in size, alignment and behavior to cl_char4. See section 6.1.5. */ +typedef cl_char4 cl_char3; + +typedef union +{ + cl_char CL_ALIGNED(8) s[8]; +#if __CL_HAS_ANON_STRUCT__ + __CL_ANON_STRUCT__ struct{ cl_char x, y, z, w; }; + __CL_ANON_STRUCT__ struct{ cl_char s0, s1, s2, s3, s4, s5, s6, s7; }; + __CL_ANON_STRUCT__ struct{ cl_char4 lo, hi; }; +#endif +#if defined( __CL_CHAR2__) + __cl_char2 v2[4]; +#endif +#if defined( __CL_CHAR4__) + __cl_char4 v4[2]; +#endif +#if defined( __CL_CHAR8__ ) + __cl_char8 v8; +#endif +}cl_char8; + +typedef union +{ + cl_char CL_ALIGNED(16) s[16]; +#if __CL_HAS_ANON_STRUCT__ + __CL_ANON_STRUCT__ struct{ cl_char x, y, z, w, __spacer4, __spacer5, __spacer6, __spacer7, __spacer8, __spacer9, sa, sb, sc, sd, se, sf; }; + __CL_ANON_STRUCT__ struct{ cl_char s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, sA, sB, sC, sD, sE, sF; }; + __CL_ANON_STRUCT__ struct{ cl_char8 lo, hi; }; +#endif +#if defined( __CL_CHAR2__) + __cl_char2 v2[8]; +#endif +#if defined( __CL_CHAR4__) + __cl_char4 v4[4]; +#endif +#if defined( __CL_CHAR8__ ) + __cl_char8 v8[2]; +#endif +#if defined( __CL_CHAR16__ ) + __cl_char16 v16; +#endif +}cl_char16; + + +/* ---- cl_ucharn ---- */ +typedef union +{ + cl_uchar CL_ALIGNED(2) s[2]; +#if __CL_HAS_ANON_STRUCT__ + __CL_ANON_STRUCT__ struct{ cl_uchar x, y; }; + __CL_ANON_STRUCT__ struct{ cl_uchar s0, s1; }; + __CL_ANON_STRUCT__ struct{ cl_uchar lo, hi; }; +#endif +#if defined( __cl_uchar2__) + __cl_uchar2 v2; +#endif +}cl_uchar2; + +typedef union +{ + cl_uchar CL_ALIGNED(4) s[4]; +#if __CL_HAS_ANON_STRUCT__ + __CL_ANON_STRUCT__ struct{ 
cl_uchar x, y, z, w; }; + __CL_ANON_STRUCT__ struct{ cl_uchar s0, s1, s2, s3; }; + __CL_ANON_STRUCT__ struct{ cl_uchar2 lo, hi; }; +#endif +#if defined( __CL_UCHAR2__) + __cl_uchar2 v2[2]; +#endif +#if defined( __CL_UCHAR4__) + __cl_uchar4 v4; +#endif +}cl_uchar4; + +/* cl_uchar3 is identical in size, alignment and behavior to cl_uchar4. See section 6.1.5. */ +typedef cl_uchar4 cl_uchar3; + +typedef union +{ + cl_uchar CL_ALIGNED(8) s[8]; +#if __CL_HAS_ANON_STRUCT__ + __CL_ANON_STRUCT__ struct{ cl_uchar x, y, z, w; }; + __CL_ANON_STRUCT__ struct{ cl_uchar s0, s1, s2, s3, s4, s5, s6, s7; }; + __CL_ANON_STRUCT__ struct{ cl_uchar4 lo, hi; }; +#endif +#if defined( __CL_UCHAR2__) + __cl_uchar2 v2[4]; +#endif +#if defined( __CL_UCHAR4__) + __cl_uchar4 v4[2]; +#endif +#if defined( __CL_UCHAR8__ ) + __cl_uchar8 v8; +#endif +}cl_uchar8; + +typedef union +{ + cl_uchar CL_ALIGNED(16) s[16]; +#if __CL_HAS_ANON_STRUCT__ + __CL_ANON_STRUCT__ struct{ cl_uchar x, y, z, w, __spacer4, __spacer5, __spacer6, __spacer7, __spacer8, __spacer9, sa, sb, sc, sd, se, sf; }; + __CL_ANON_STRUCT__ struct{ cl_uchar s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, sA, sB, sC, sD, sE, sF; }; + __CL_ANON_STRUCT__ struct{ cl_uchar8 lo, hi; }; +#endif +#if defined( __CL_UCHAR2__) + __cl_uchar2 v2[8]; +#endif +#if defined( __CL_UCHAR4__) + __cl_uchar4 v4[4]; +#endif +#if defined( __CL_UCHAR8__ ) + __cl_uchar8 v8[2]; +#endif +#if defined( __CL_UCHAR16__ ) + __cl_uchar16 v16; +#endif +}cl_uchar16; + + +/* ---- cl_shortn ---- */ +typedef union +{ + cl_short CL_ALIGNED(4) s[2]; +#if __CL_HAS_ANON_STRUCT__ + __CL_ANON_STRUCT__ struct{ cl_short x, y; }; + __CL_ANON_STRUCT__ struct{ cl_short s0, s1; }; + __CL_ANON_STRUCT__ struct{ cl_short lo, hi; }; +#endif +#if defined( __CL_SHORT2__) + __cl_short2 v2; +#endif +}cl_short2; + +typedef union +{ + cl_short CL_ALIGNED(8) s[4]; +#if __CL_HAS_ANON_STRUCT__ + __CL_ANON_STRUCT__ struct{ cl_short x, y, z, w; }; + __CL_ANON_STRUCT__ struct{ cl_short s0, s1, s2, s3; }; + 
__CL_ANON_STRUCT__ struct{ cl_short2 lo, hi; }; +#endif +#if defined( __CL_SHORT2__) + __cl_short2 v2[2]; +#endif +#if defined( __CL_SHORT4__) + __cl_short4 v4; +#endif +}cl_short4; + +/* cl_short3 is identical in size, alignment and behavior to cl_short4. See section 6.1.5. */ +typedef cl_short4 cl_short3; + +typedef union +{ + cl_short CL_ALIGNED(16) s[8]; +#if __CL_HAS_ANON_STRUCT__ + __CL_ANON_STRUCT__ struct{ cl_short x, y, z, w; }; + __CL_ANON_STRUCT__ struct{ cl_short s0, s1, s2, s3, s4, s5, s6, s7; }; + __CL_ANON_STRUCT__ struct{ cl_short4 lo, hi; }; +#endif +#if defined( __CL_SHORT2__) + __cl_short2 v2[4]; +#endif +#if defined( __CL_SHORT4__) + __cl_short4 v4[2]; +#endif +#if defined( __CL_SHORT8__ ) + __cl_short8 v8; +#endif +}cl_short8; + +typedef union +{ + cl_short CL_ALIGNED(32) s[16]; +#if __CL_HAS_ANON_STRUCT__ + __CL_ANON_STRUCT__ struct{ cl_short x, y, z, w, __spacer4, __spacer5, __spacer6, __spacer7, __spacer8, __spacer9, sa, sb, sc, sd, se, sf; }; + __CL_ANON_STRUCT__ struct{ cl_short s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, sA, sB, sC, sD, sE, sF; }; + __CL_ANON_STRUCT__ struct{ cl_short8 lo, hi; }; +#endif +#if defined( __CL_SHORT2__) + __cl_short2 v2[8]; +#endif +#if defined( __CL_SHORT4__) + __cl_short4 v4[4]; +#endif +#if defined( __CL_SHORT8__ ) + __cl_short8 v8[2]; +#endif +#if defined( __CL_SHORT16__ ) + __cl_short16 v16; +#endif +}cl_short16; + + +/* ---- cl_ushortn ---- */ +typedef union +{ + cl_ushort CL_ALIGNED(4) s[2]; +#if __CL_HAS_ANON_STRUCT__ + __CL_ANON_STRUCT__ struct{ cl_ushort x, y; }; + __CL_ANON_STRUCT__ struct{ cl_ushort s0, s1; }; + __CL_ANON_STRUCT__ struct{ cl_ushort lo, hi; }; +#endif +#if defined( __CL_USHORT2__) + __cl_ushort2 v2; +#endif +}cl_ushort2; + +typedef union +{ + cl_ushort CL_ALIGNED(8) s[4]; +#if __CL_HAS_ANON_STRUCT__ + __CL_ANON_STRUCT__ struct{ cl_ushort x, y, z, w; }; + __CL_ANON_STRUCT__ struct{ cl_ushort s0, s1, s2, s3; }; + __CL_ANON_STRUCT__ struct{ cl_ushort2 lo, hi; }; +#endif +#if defined( 
__CL_USHORT2__) + __cl_ushort2 v2[2]; +#endif +#if defined( __CL_USHORT4__) + __cl_ushort4 v4; +#endif +}cl_ushort4; + +/* cl_ushort3 is identical in size, alignment and behavior to cl_ushort4. See section 6.1.5. */ +typedef cl_ushort4 cl_ushort3; + +typedef union +{ + cl_ushort CL_ALIGNED(16) s[8]; +#if __CL_HAS_ANON_STRUCT__ + __CL_ANON_STRUCT__ struct{ cl_ushort x, y, z, w; }; + __CL_ANON_STRUCT__ struct{ cl_ushort s0, s1, s2, s3, s4, s5, s6, s7; }; + __CL_ANON_STRUCT__ struct{ cl_ushort4 lo, hi; }; +#endif +#if defined( __CL_USHORT2__) + __cl_ushort2 v2[4]; +#endif +#if defined( __CL_USHORT4__) + __cl_ushort4 v4[2]; +#endif +#if defined( __CL_USHORT8__ ) + __cl_ushort8 v8; +#endif +}cl_ushort8; + +typedef union +{ + cl_ushort CL_ALIGNED(32) s[16]; +#if __CL_HAS_ANON_STRUCT__ + __CL_ANON_STRUCT__ struct{ cl_ushort x, y, z, w, __spacer4, __spacer5, __spacer6, __spacer7, __spacer8, __spacer9, sa, sb, sc, sd, se, sf; }; + __CL_ANON_STRUCT__ struct{ cl_ushort s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, sA, sB, sC, sD, sE, sF; }; + __CL_ANON_STRUCT__ struct{ cl_ushort8 lo, hi; }; +#endif +#if defined( __CL_USHORT2__) + __cl_ushort2 v2[8]; +#endif +#if defined( __CL_USHORT4__) + __cl_ushort4 v4[4]; +#endif +#if defined( __CL_USHORT8__ ) + __cl_ushort8 v8[2]; +#endif +#if defined( __CL_USHORT16__ ) + __cl_ushort16 v16; +#endif +}cl_ushort16; + + +/* ---- cl_halfn ---- */ +typedef union +{ + cl_half CL_ALIGNED(4) s[2]; +#if __CL_HAS_ANON_STRUCT__ + __CL_ANON_STRUCT__ struct{ cl_half x, y; }; + __CL_ANON_STRUCT__ struct{ cl_half s0, s1; }; + __CL_ANON_STRUCT__ struct{ cl_half lo, hi; }; +#endif +#if defined( __CL_HALF2__) + __cl_half2 v2; +#endif +}cl_half2; + +typedef union +{ + cl_half CL_ALIGNED(8) s[4]; +#if __CL_HAS_ANON_STRUCT__ + __CL_ANON_STRUCT__ struct{ cl_half x, y, z, w; }; + __CL_ANON_STRUCT__ struct{ cl_half s0, s1, s2, s3; }; + __CL_ANON_STRUCT__ struct{ cl_half2 lo, hi; }; +#endif +#if defined( __CL_HALF2__) + __cl_half2 v2[2]; +#endif +#if defined( 
__CL_HALF4__) + __cl_half4 v4; +#endif +}cl_half4; + +/* cl_half3 is identical in size, alignment and behavior to cl_half4. See section 6.1.5. */ +typedef cl_half4 cl_half3; + +typedef union +{ + cl_half CL_ALIGNED(16) s[8]; +#if __CL_HAS_ANON_STRUCT__ + __CL_ANON_STRUCT__ struct{ cl_half x, y, z, w; }; + __CL_ANON_STRUCT__ struct{ cl_half s0, s1, s2, s3, s4, s5, s6, s7; }; + __CL_ANON_STRUCT__ struct{ cl_half4 lo, hi; }; +#endif +#if defined( __CL_HALF2__) + __cl_half2 v2[4]; +#endif +#if defined( __CL_HALF4__) + __cl_half4 v4[2]; +#endif +#if defined( __CL_HALF8__ ) + __cl_half8 v8; +#endif +}cl_half8; + +typedef union +{ + cl_half CL_ALIGNED(32) s[16]; +#if __CL_HAS_ANON_STRUCT__ + __CL_ANON_STRUCT__ struct{ cl_half x, y, z, w, __spacer4, __spacer5, __spacer6, __spacer7, __spacer8, __spacer9, sa, sb, sc, sd, se, sf; }; + __CL_ANON_STRUCT__ struct{ cl_half s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, sA, sB, sC, sD, sE, sF; }; + __CL_ANON_STRUCT__ struct{ cl_half8 lo, hi; }; +#endif +#if defined( __CL_HALF2__) + __cl_half2 v2[8]; +#endif +#if defined( __CL_HALF4__) + __cl_half4 v4[4]; +#endif +#if defined( __CL_HALF8__ ) + __cl_half8 v8[2]; +#endif +#if defined( __CL_HALF16__ ) + __cl_half16 v16; +#endif +}cl_half16; + +/* ---- cl_intn ---- */ +typedef union +{ + cl_int CL_ALIGNED(8) s[2]; +#if __CL_HAS_ANON_STRUCT__ + __CL_ANON_STRUCT__ struct{ cl_int x, y; }; + __CL_ANON_STRUCT__ struct{ cl_int s0, s1; }; + __CL_ANON_STRUCT__ struct{ cl_int lo, hi; }; +#endif +#if defined( __CL_INT2__) + __cl_int2 v2; +#endif +}cl_int2; + +typedef union +{ + cl_int CL_ALIGNED(16) s[4]; +#if __CL_HAS_ANON_STRUCT__ + __CL_ANON_STRUCT__ struct{ cl_int x, y, z, w; }; + __CL_ANON_STRUCT__ struct{ cl_int s0, s1, s2, s3; }; + __CL_ANON_STRUCT__ struct{ cl_int2 lo, hi; }; +#endif +#if defined( __CL_INT2__) + __cl_int2 v2[2]; +#endif +#if defined( __CL_INT4__) + __cl_int4 v4; +#endif +}cl_int4; + +/* cl_int3 is identical in size, alignment and behavior to cl_int4. See section 6.1.5. 
*/ +typedef cl_int4 cl_int3; + +typedef union +{ + cl_int CL_ALIGNED(32) s[8]; +#if __CL_HAS_ANON_STRUCT__ + __CL_ANON_STRUCT__ struct{ cl_int x, y, z, w; }; + __CL_ANON_STRUCT__ struct{ cl_int s0, s1, s2, s3, s4, s5, s6, s7; }; + __CL_ANON_STRUCT__ struct{ cl_int4 lo, hi; }; +#endif +#if defined( __CL_INT2__) + __cl_int2 v2[4]; +#endif +#if defined( __CL_INT4__) + __cl_int4 v4[2]; +#endif +#if defined( __CL_INT8__ ) + __cl_int8 v8; +#endif +}cl_int8; + +typedef union +{ + cl_int CL_ALIGNED(64) s[16]; +#if __CL_HAS_ANON_STRUCT__ + __CL_ANON_STRUCT__ struct{ cl_int x, y, z, w, __spacer4, __spacer5, __spacer6, __spacer7, __spacer8, __spacer9, sa, sb, sc, sd, se, sf; }; + __CL_ANON_STRUCT__ struct{ cl_int s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, sA, sB, sC, sD, sE, sF; }; + __CL_ANON_STRUCT__ struct{ cl_int8 lo, hi; }; +#endif +#if defined( __CL_INT2__) + __cl_int2 v2[8]; +#endif +#if defined( __CL_INT4__) + __cl_int4 v4[4]; +#endif +#if defined( __CL_INT8__ ) + __cl_int8 v8[2]; +#endif +#if defined( __CL_INT16__ ) + __cl_int16 v16; +#endif +}cl_int16; + + +/* ---- cl_uintn ---- */ +typedef union +{ + cl_uint CL_ALIGNED(8) s[2]; +#if __CL_HAS_ANON_STRUCT__ + __CL_ANON_STRUCT__ struct{ cl_uint x, y; }; + __CL_ANON_STRUCT__ struct{ cl_uint s0, s1; }; + __CL_ANON_STRUCT__ struct{ cl_uint lo, hi; }; +#endif +#if defined( __CL_UINT2__) + __cl_uint2 v2; +#endif +}cl_uint2; + +typedef union +{ + cl_uint CL_ALIGNED(16) s[4]; +#if __CL_HAS_ANON_STRUCT__ + __CL_ANON_STRUCT__ struct{ cl_uint x, y, z, w; }; + __CL_ANON_STRUCT__ struct{ cl_uint s0, s1, s2, s3; }; + __CL_ANON_STRUCT__ struct{ cl_uint2 lo, hi; }; +#endif +#if defined( __CL_UINT2__) + __cl_uint2 v2[2]; +#endif +#if defined( __CL_UINT4__) + __cl_uint4 v4; +#endif +}cl_uint4; + +/* cl_uint3 is identical in size, alignment and behavior to cl_uint4. See section 6.1.5. 
*/ +typedef cl_uint4 cl_uint3; + +typedef union +{ + cl_uint CL_ALIGNED(32) s[8]; +#if __CL_HAS_ANON_STRUCT__ + __CL_ANON_STRUCT__ struct{ cl_uint x, y, z, w; }; + __CL_ANON_STRUCT__ struct{ cl_uint s0, s1, s2, s3, s4, s5, s6, s7; }; + __CL_ANON_STRUCT__ struct{ cl_uint4 lo, hi; }; +#endif +#if defined( __CL_UINT2__) + __cl_uint2 v2[4]; +#endif +#if defined( __CL_UINT4__) + __cl_uint4 v4[2]; +#endif +#if defined( __CL_UINT8__ ) + __cl_uint8 v8; +#endif +}cl_uint8; + +typedef union +{ + cl_uint CL_ALIGNED(64) s[16]; +#if __CL_HAS_ANON_STRUCT__ + __CL_ANON_STRUCT__ struct{ cl_uint x, y, z, w, __spacer4, __spacer5, __spacer6, __spacer7, __spacer8, __spacer9, sa, sb, sc, sd, se, sf; }; + __CL_ANON_STRUCT__ struct{ cl_uint s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, sA, sB, sC, sD, sE, sF; }; + __CL_ANON_STRUCT__ struct{ cl_uint8 lo, hi; }; +#endif +#if defined( __CL_UINT2__) + __cl_uint2 v2[8]; +#endif +#if defined( __CL_UINT4__) + __cl_uint4 v4[4]; +#endif +#if defined( __CL_UINT8__ ) + __cl_uint8 v8[2]; +#endif +#if defined( __CL_UINT16__ ) + __cl_uint16 v16; +#endif +}cl_uint16; + +/* ---- cl_longn ---- */ +typedef union +{ + cl_long CL_ALIGNED(16) s[2]; +#if __CL_HAS_ANON_STRUCT__ + __CL_ANON_STRUCT__ struct{ cl_long x, y; }; + __CL_ANON_STRUCT__ struct{ cl_long s0, s1; }; + __CL_ANON_STRUCT__ struct{ cl_long lo, hi; }; +#endif +#if defined( __CL_LONG2__) + __cl_long2 v2; +#endif +}cl_long2; + +typedef union +{ + cl_long CL_ALIGNED(32) s[4]; +#if __CL_HAS_ANON_STRUCT__ + __CL_ANON_STRUCT__ struct{ cl_long x, y, z, w; }; + __CL_ANON_STRUCT__ struct{ cl_long s0, s1, s2, s3; }; + __CL_ANON_STRUCT__ struct{ cl_long2 lo, hi; }; +#endif +#if defined( __CL_LONG2__) + __cl_long2 v2[2]; +#endif +#if defined( __CL_LONG4__) + __cl_long4 v4; +#endif +}cl_long4; + +/* cl_long3 is identical in size, alignment and behavior to cl_long4. See section 6.1.5. 
*/ +typedef cl_long4 cl_long3; + +typedef union +{ + cl_long CL_ALIGNED(64) s[8]; +#if __CL_HAS_ANON_STRUCT__ + __CL_ANON_STRUCT__ struct{ cl_long x, y, z, w; }; + __CL_ANON_STRUCT__ struct{ cl_long s0, s1, s2, s3, s4, s5, s6, s7; }; + __CL_ANON_STRUCT__ struct{ cl_long4 lo, hi; }; +#endif +#if defined( __CL_LONG2__) + __cl_long2 v2[4]; +#endif +#if defined( __CL_LONG4__) + __cl_long4 v4[2]; +#endif +#if defined( __CL_LONG8__ ) + __cl_long8 v8; +#endif +}cl_long8; + +typedef union +{ + cl_long CL_ALIGNED(128) s[16]; +#if __CL_HAS_ANON_STRUCT__ + __CL_ANON_STRUCT__ struct{ cl_long x, y, z, w, __spacer4, __spacer5, __spacer6, __spacer7, __spacer8, __spacer9, sa, sb, sc, sd, se, sf; }; + __CL_ANON_STRUCT__ struct{ cl_long s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, sA, sB, sC, sD, sE, sF; }; + __CL_ANON_STRUCT__ struct{ cl_long8 lo, hi; }; +#endif +#if defined( __CL_LONG2__) + __cl_long2 v2[8]; +#endif +#if defined( __CL_LONG4__) + __cl_long4 v4[4]; +#endif +#if defined( __CL_LONG8__ ) + __cl_long8 v8[2]; +#endif +#if defined( __CL_LONG16__ ) + __cl_long16 v16; +#endif +}cl_long16; + + +/* ---- cl_ulongn ---- */ +typedef union +{ + cl_ulong CL_ALIGNED(16) s[2]; +#if __CL_HAS_ANON_STRUCT__ + __CL_ANON_STRUCT__ struct{ cl_ulong x, y; }; + __CL_ANON_STRUCT__ struct{ cl_ulong s0, s1; }; + __CL_ANON_STRUCT__ struct{ cl_ulong lo, hi; }; +#endif +#if defined( __CL_ULONG2__) + __cl_ulong2 v2; +#endif +}cl_ulong2; + +typedef union +{ + cl_ulong CL_ALIGNED(32) s[4]; +#if __CL_HAS_ANON_STRUCT__ + __CL_ANON_STRUCT__ struct{ cl_ulong x, y, z, w; }; + __CL_ANON_STRUCT__ struct{ cl_ulong s0, s1, s2, s3; }; + __CL_ANON_STRUCT__ struct{ cl_ulong2 lo, hi; }; +#endif +#if defined( __CL_ULONG2__) + __cl_ulong2 v2[2]; +#endif +#if defined( __CL_ULONG4__) + __cl_ulong4 v4; +#endif +}cl_ulong4; + +/* cl_ulong3 is identical in size, alignment and behavior to cl_ulong4. See section 6.1.5. 
*/ +typedef cl_ulong4 cl_ulong3; + +typedef union +{ + cl_ulong CL_ALIGNED(64) s[8]; +#if __CL_HAS_ANON_STRUCT__ + __CL_ANON_STRUCT__ struct{ cl_ulong x, y, z, w; }; + __CL_ANON_STRUCT__ struct{ cl_ulong s0, s1, s2, s3, s4, s5, s6, s7; }; + __CL_ANON_STRUCT__ struct{ cl_ulong4 lo, hi; }; +#endif +#if defined( __CL_ULONG2__) + __cl_ulong2 v2[4]; +#endif +#if defined( __CL_ULONG4__) + __cl_ulong4 v4[2]; +#endif +#if defined( __CL_ULONG8__ ) + __cl_ulong8 v8; +#endif +}cl_ulong8; + +typedef union +{ + cl_ulong CL_ALIGNED(128) s[16]; +#if __CL_HAS_ANON_STRUCT__ + __CL_ANON_STRUCT__ struct{ cl_ulong x, y, z, w, __spacer4, __spacer5, __spacer6, __spacer7, __spacer8, __spacer9, sa, sb, sc, sd, se, sf; }; + __CL_ANON_STRUCT__ struct{ cl_ulong s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, sA, sB, sC, sD, sE, sF; }; + __CL_ANON_STRUCT__ struct{ cl_ulong8 lo, hi; }; +#endif +#if defined( __CL_ULONG2__) + __cl_ulong2 v2[8]; +#endif +#if defined( __CL_ULONG4__) + __cl_ulong4 v4[4]; +#endif +#if defined( __CL_ULONG8__ ) + __cl_ulong8 v8[2]; +#endif +#if defined( __CL_ULONG16__ ) + __cl_ulong16 v16; +#endif +}cl_ulong16; + + +/* --- cl_floatn ---- */ + +typedef union +{ + cl_float CL_ALIGNED(8) s[2]; +#if __CL_HAS_ANON_STRUCT__ + __CL_ANON_STRUCT__ struct{ cl_float x, y; }; + __CL_ANON_STRUCT__ struct{ cl_float s0, s1; }; + __CL_ANON_STRUCT__ struct{ cl_float lo, hi; }; +#endif +#if defined( __CL_FLOAT2__) + __cl_float2 v2; +#endif +}cl_float2; + +typedef union +{ + cl_float CL_ALIGNED(16) s[4]; +#if __CL_HAS_ANON_STRUCT__ + __CL_ANON_STRUCT__ struct{ cl_float x, y, z, w; }; + __CL_ANON_STRUCT__ struct{ cl_float s0, s1, s2, s3; }; + __CL_ANON_STRUCT__ struct{ cl_float2 lo, hi; }; +#endif +#if defined( __CL_FLOAT2__) + __cl_float2 v2[2]; +#endif +#if defined( __CL_FLOAT4__) + __cl_float4 v4; +#endif +}cl_float4; + +/* cl_float3 is identical in size, alignment and behavior to cl_float4. See section 6.1.5. 
*/ +typedef cl_float4 cl_float3; + +typedef union +{ + cl_float CL_ALIGNED(32) s[8]; +#if __CL_HAS_ANON_STRUCT__ + __CL_ANON_STRUCT__ struct{ cl_float x, y, z, w; }; + __CL_ANON_STRUCT__ struct{ cl_float s0, s1, s2, s3, s4, s5, s6, s7; }; + __CL_ANON_STRUCT__ struct{ cl_float4 lo, hi; }; +#endif +#if defined( __CL_FLOAT2__) + __cl_float2 v2[4]; +#endif +#if defined( __CL_FLOAT4__) + __cl_float4 v4[2]; +#endif +#if defined( __CL_FLOAT8__ ) + __cl_float8 v8; +#endif +}cl_float8; + +typedef union +{ + cl_float CL_ALIGNED(64) s[16]; +#if __CL_HAS_ANON_STRUCT__ + __CL_ANON_STRUCT__ struct{ cl_float x, y, z, w, __spacer4, __spacer5, __spacer6, __spacer7, __spacer8, __spacer9, sa, sb, sc, sd, se, sf; }; + __CL_ANON_STRUCT__ struct{ cl_float s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, sA, sB, sC, sD, sE, sF; }; + __CL_ANON_STRUCT__ struct{ cl_float8 lo, hi; }; +#endif +#if defined( __CL_FLOAT2__) + __cl_float2 v2[8]; +#endif +#if defined( __CL_FLOAT4__) + __cl_float4 v4[4]; +#endif +#if defined( __CL_FLOAT8__ ) + __cl_float8 v8[2]; +#endif +#if defined( __CL_FLOAT16__ ) + __cl_float16 v16; +#endif +}cl_float16; + +/* --- cl_doublen ---- */ + +typedef union +{ + cl_double CL_ALIGNED(16) s[2]; +#if __CL_HAS_ANON_STRUCT__ + __CL_ANON_STRUCT__ struct{ cl_double x, y; }; + __CL_ANON_STRUCT__ struct{ cl_double s0, s1; }; + __CL_ANON_STRUCT__ struct{ cl_double lo, hi; }; +#endif +#if defined( __CL_DOUBLE2__) + __cl_double2 v2; +#endif +}cl_double2; + +typedef union +{ + cl_double CL_ALIGNED(32) s[4]; +#if __CL_HAS_ANON_STRUCT__ + __CL_ANON_STRUCT__ struct{ cl_double x, y, z, w; }; + __CL_ANON_STRUCT__ struct{ cl_double s0, s1, s2, s3; }; + __CL_ANON_STRUCT__ struct{ cl_double2 lo, hi; }; +#endif +#if defined( __CL_DOUBLE2__) + __cl_double2 v2[2]; +#endif +#if defined( __CL_DOUBLE4__) + __cl_double4 v4; +#endif +}cl_double4; + +/* cl_double3 is identical in size, alignment and behavior to cl_double4. See section 6.1.5. 
*/ +typedef cl_double4 cl_double3; + +typedef union +{ + cl_double CL_ALIGNED(64) s[8]; +#if __CL_HAS_ANON_STRUCT__ + __CL_ANON_STRUCT__ struct{ cl_double x, y, z, w; }; + __CL_ANON_STRUCT__ struct{ cl_double s0, s1, s2, s3, s4, s5, s6, s7; }; + __CL_ANON_STRUCT__ struct{ cl_double4 lo, hi; }; +#endif +#if defined( __CL_DOUBLE2__) + __cl_double2 v2[4]; +#endif +#if defined( __CL_DOUBLE4__) + __cl_double4 v4[2]; +#endif +#if defined( __CL_DOUBLE8__ ) + __cl_double8 v8; +#endif +}cl_double8; + +typedef union +{ + cl_double CL_ALIGNED(128) s[16]; +#if __CL_HAS_ANON_STRUCT__ + __CL_ANON_STRUCT__ struct{ cl_double x, y, z, w, __spacer4, __spacer5, __spacer6, __spacer7, __spacer8, __spacer9, sa, sb, sc, sd, se, sf; }; + __CL_ANON_STRUCT__ struct{ cl_double s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, sA, sB, sC, sD, sE, sF; }; + __CL_ANON_STRUCT__ struct{ cl_double8 lo, hi; }; +#endif +#if defined( __CL_DOUBLE2__) + __cl_double2 v2[8]; +#endif +#if defined( __CL_DOUBLE4__) + __cl_double4 v4[4]; +#endif +#if defined( __CL_DOUBLE8__ ) + __cl_double8 v8[2]; +#endif +#if defined( __CL_DOUBLE16__ ) + __cl_double16 v16; +#endif +}cl_double16; + +/* Macro to facilitate debugging + * Usage: + * Place CL_PROGRAM_STRING_DEBUG_INFO on the line before the first line of your source. + * The first line ends with: CL_PROGRAM_STRING_DEBUG_INFO \" + * Each line thereafter of OpenCL C source must end with: \n\ + * The last line ends in "; + * + * Example: + * + * const char *my_program = CL_PROGRAM_STRING_DEBUG_INFO "\ + * kernel void foo( int a, float * b ) \n\ + * { \n\ + * // my comment \n\ + * *b[ get_global_id(0)] = a; \n\ + * } \n\ + * "; + * + * This should correctly set up the line, (column) and file information for your source + * string so you can do source level debugging. 
+ */ +#define __CL_STRINGIFY( _x ) # _x +#define _CL_STRINGIFY( _x ) __CL_STRINGIFY( _x ) +#define CL_PROGRAM_STRING_DEBUG_INFO "#line " _CL_STRINGIFY(__LINE__) " \"" __FILE__ "\" \n\n" + +#ifdef __cplusplus +} +#endif + +#undef __CL_HAS_ANON_STRUCT__ +#undef __CL_ANON_STRUCT__ +#if defined( _WIN32) && defined(_MSC_VER) + #if _MSC_VER >=1500 + #pragma warning( pop ) + #endif +#endif + +#endif /* __CL_PLATFORM_H */ diff --git a/third_party/opencl/OpenCL-Headers/CL/cl_va_api_media_sharing_intel.h b/third_party/opencl/OpenCL-Headers/CL/cl_va_api_media_sharing_intel.h new file mode 100644 index 0000000000000000000000000000000000000000..28444288573219be06fa449bb50161a20e95acfc --- /dev/null +++ b/third_party/opencl/OpenCL-Headers/CL/cl_va_api_media_sharing_intel.h @@ -0,0 +1,172 @@ +/********************************************************************************** + * Copyright (c) 2008-2016 The Khronos Group Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and/or associated documentation files (the + * "Materials"), to deal in the Materials without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sublicense, and/or sell copies of the Materials, and to + * permit persons to whom the Materials are furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be included + * in all copies or substantial portions of the Materials. + * + * MODIFICATIONS TO THIS FILE MAY MEAN IT NO LONGER ACCURATELY REFLECTS + * KHRONOS STANDARDS. THE UNMODIFIED, NORMATIVE VERSIONS OF KHRONOS + * SPECIFICATIONS AND HEADER INFORMATION ARE LOCATED AT + * https://www.khronos.org/registry/ + * + * THE MATERIALS ARE PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
+ * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY + * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * MATERIALS OR THE USE OR OTHER DEALINGS IN THE MATERIALS. + **********************************************************************************/ +/*****************************************************************************\ + +Copyright (c) 2013-2016 Intel Corporation All Rights Reserved. + +THESE MATERIALS ARE PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL OR ITS +CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY OR TORT (INCLUDING +NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THESE +MATERIALS, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +File Name: cl_va_api_media_sharing_intel.h + +Abstract: + +Notes: + +\*****************************************************************************/ + + +#ifndef __OPENCL_CL_VA_API_MEDIA_SHARING_INTEL_H +#define __OPENCL_CL_VA_API_MEDIA_SHARING_INTEL_H + +#include +#include +#include + +#ifdef __cplusplus +extern "C" { +#endif + +/****************************************** +* cl_intel_va_api_media_sharing extension * +*******************************************/ + +#define cl_intel_va_api_media_sharing 1 + +/* error codes */ +#define CL_INVALID_VA_API_MEDIA_ADAPTER_INTEL -1098 +#define CL_INVALID_VA_API_MEDIA_SURFACE_INTEL -1099 +#define CL_VA_API_MEDIA_SURFACE_ALREADY_ACQUIRED_INTEL -1100 +#define CL_VA_API_MEDIA_SURFACE_NOT_ACQUIRED_INTEL -1101 + +/* cl_va_api_device_source_intel */ +#define CL_VA_API_DISPLAY_INTEL 0x4094 + +/* cl_va_api_device_set_intel */ +#define CL_PREFERRED_DEVICES_FOR_VA_API_INTEL 0x4095 +#define CL_ALL_DEVICES_FOR_VA_API_INTEL 0x4096 + +/* cl_context_info */ +#define CL_CONTEXT_VA_API_DISPLAY_INTEL 0x4097 + +/* cl_mem_info */ +#define CL_MEM_VA_API_MEDIA_SURFACE_INTEL 0x4098 + +/* cl_image_info */ +#define CL_IMAGE_VA_API_PLANE_INTEL 0x4099 + +/* cl_command_type */ +#define CL_COMMAND_ACQUIRE_VA_API_MEDIA_SURFACES_INTEL 0x409A +#define CL_COMMAND_RELEASE_VA_API_MEDIA_SURFACES_INTEL 0x409B + +typedef cl_uint cl_va_api_device_source_intel; +typedef cl_uint cl_va_api_device_set_intel; + +extern CL_API_ENTRY cl_int CL_API_CALL +clGetDeviceIDsFromVA_APIMediaAdapterINTEL( + cl_platform_id /* platform */, + cl_va_api_device_source_intel /* media_adapter_type */, + void* /* media_adapter */, + cl_va_api_device_set_intel /* media_adapter_set */, + cl_uint /* num_entries */, + cl_device_id* /* devices */, + cl_uint* /* num_devices */) CL_EXT_SUFFIX__VERSION_1_2; + +typedef CL_API_ENTRY cl_int (CL_API_CALL * clGetDeviceIDsFromVA_APIMediaAdapterINTEL_fn)( + cl_platform_id /* platform */, + cl_va_api_device_source_intel /* media_adapter_type */, + 
void* /* media_adapter */, + cl_va_api_device_set_intel /* media_adapter_set */, + cl_uint /* num_entries */, + cl_device_id* /* devices */, + cl_uint* /* num_devices */) CL_EXT_SUFFIX__VERSION_1_2; + +extern CL_API_ENTRY cl_mem CL_API_CALL +clCreateFromVA_APIMediaSurfaceINTEL( + cl_context /* context */, + cl_mem_flags /* flags */, + VASurfaceID* /* surface */, + cl_uint /* plane */, + cl_int* /* errcode_ret */) CL_EXT_SUFFIX__VERSION_1_2; + +typedef CL_API_ENTRY cl_mem (CL_API_CALL * clCreateFromVA_APIMediaSurfaceINTEL_fn)( + cl_context /* context */, + cl_mem_flags /* flags */, + VASurfaceID* /* surface */, + cl_uint /* plane */, + cl_int* /* errcode_ret */) CL_EXT_SUFFIX__VERSION_1_2; + +extern CL_API_ENTRY cl_int CL_API_CALL +clEnqueueAcquireVA_APIMediaSurfacesINTEL( + cl_command_queue /* command_queue */, + cl_uint /* num_objects */, + const cl_mem* /* mem_objects */, + cl_uint /* num_events_in_wait_list */, + const cl_event* /* event_wait_list */, + cl_event* /* event */) CL_EXT_SUFFIX__VERSION_1_2; + +typedef CL_API_ENTRY cl_int (CL_API_CALL *clEnqueueAcquireVA_APIMediaSurfacesINTEL_fn)( + cl_command_queue /* command_queue */, + cl_uint /* num_objects */, + const cl_mem* /* mem_objects */, + cl_uint /* num_events_in_wait_list */, + const cl_event* /* event_wait_list */, + cl_event* /* event */) CL_EXT_SUFFIX__VERSION_1_2; + +extern CL_API_ENTRY cl_int CL_API_CALL +clEnqueueReleaseVA_APIMediaSurfacesINTEL( + cl_command_queue /* command_queue */, + cl_uint /* num_objects */, + const cl_mem* /* mem_objects */, + cl_uint /* num_events_in_wait_list */, + const cl_event* /* event_wait_list */, + cl_event* /* event */) CL_EXT_SUFFIX__VERSION_1_2; + +typedef CL_API_ENTRY cl_int (CL_API_CALL *clEnqueueReleaseVA_APIMediaSurfacesINTEL_fn)( + cl_command_queue /* command_queue */, + cl_uint /* num_objects */, + const cl_mem* /* mem_objects */, + cl_uint /* num_events_in_wait_list */, + const cl_event* /* event_wait_list */, + cl_event* /* event */) 
CL_EXT_SUFFIX__VERSION_1_2; + +#ifdef __cplusplus +} +#endif + +#endif /* __OPENCL_CL_VA_API_MEDIA_SHARING_INTEL_H */ + diff --git a/third_party/opencl/OpenCL-Headers/CL/cl_version.h b/third_party/opencl/OpenCL-Headers/CL/cl_version.h new file mode 100644 index 0000000000000000000000000000000000000000..bb766cb9bbddca65a3cd599375a24cb827789d08 --- /dev/null +++ b/third_party/opencl/OpenCL-Headers/CL/cl_version.h @@ -0,0 +1,86 @@ +/******************************************************************************* + * Copyright (c) 2018 The Khronos Group Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and/or associated documentation files (the + * "Materials"), to deal in the Materials without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sublicense, and/or sell copies of the Materials, and to + * permit persons to whom the Materials are furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be included + * in all copies or substantial portions of the Materials. + * + * MODIFICATIONS TO THIS FILE MAY MEAN IT NO LONGER ACCURATELY REFLECTS + * KHRONOS STANDARDS. THE UNMODIFIED, NORMATIVE VERSIONS OF KHRONOS + * SPECIFICATIONS AND HEADER INFORMATION ARE LOCATED AT + * https://www.khronos.org/registry/ + * + * THE MATERIALS ARE PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY + * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * MATERIALS OR THE USE OR OTHER DEALINGS IN THE MATERIALS. 
+ ******************************************************************************/ + +#ifndef __CL_VERSION_H +#define __CL_VERSION_H + +/* Detect which version to target */ +#if !defined(CL_TARGET_OPENCL_VERSION) +#pragma message("cl_version.h: CL_TARGET_OPENCL_VERSION is not defined. Defaulting to 220 (OpenCL 2.2)") +#define CL_TARGET_OPENCL_VERSION 220 +#endif +#if CL_TARGET_OPENCL_VERSION != 100 && \ + CL_TARGET_OPENCL_VERSION != 110 && \ + CL_TARGET_OPENCL_VERSION != 120 && \ + CL_TARGET_OPENCL_VERSION != 200 && \ + CL_TARGET_OPENCL_VERSION != 210 && \ + CL_TARGET_OPENCL_VERSION != 220 +#pragma message("cl_version: CL_TARGET_OPENCL_VERSION is not a valid value (100, 110, 120, 200, 210, 220). Defaulting to 220 (OpenCL 2.2)") +#undef CL_TARGET_OPENCL_VERSION +#define CL_TARGET_OPENCL_VERSION 220 +#endif + + +/* OpenCL Version */ +#if CL_TARGET_OPENCL_VERSION >= 220 && !defined(CL_VERSION_2_2) +#define CL_VERSION_2_2 1 +#endif +#if CL_TARGET_OPENCL_VERSION >= 210 && !defined(CL_VERSION_2_1) +#define CL_VERSION_2_1 1 +#endif +#if CL_TARGET_OPENCL_VERSION >= 200 && !defined(CL_VERSION_2_0) +#define CL_VERSION_2_0 1 +#endif +#if CL_TARGET_OPENCL_VERSION >= 120 && !defined(CL_VERSION_1_2) +#define CL_VERSION_1_2 1 +#endif +#if CL_TARGET_OPENCL_VERSION >= 110 && !defined(CL_VERSION_1_1) +#define CL_VERSION_1_1 1 +#endif +#if CL_TARGET_OPENCL_VERSION >= 100 && !defined(CL_VERSION_1_0) +#define CL_VERSION_1_0 1 +#endif + +/* Allow deprecated APIs for older OpenCL versions. 
*/ +#if CL_TARGET_OPENCL_VERSION <= 210 && !defined(CL_USE_DEPRECATED_OPENCL_2_1_APIS) +#define CL_USE_DEPRECATED_OPENCL_2_1_APIS +#endif +#if CL_TARGET_OPENCL_VERSION <= 200 && !defined(CL_USE_DEPRECATED_OPENCL_2_0_APIS) +#define CL_USE_DEPRECATED_OPENCL_2_0_APIS +#endif +#if CL_TARGET_OPENCL_VERSION <= 120 && !defined(CL_USE_DEPRECATED_OPENCL_1_2_APIS) +#define CL_USE_DEPRECATED_OPENCL_1_2_APIS +#endif +#if CL_TARGET_OPENCL_VERSION <= 110 && !defined(CL_USE_DEPRECATED_OPENCL_1_1_APIS) +#define CL_USE_DEPRECATED_OPENCL_1_1_APIS +#endif +#if CL_TARGET_OPENCL_VERSION <= 100 && !defined(CL_USE_DEPRECATED_OPENCL_1_0_APIS) +#define CL_USE_DEPRECATED_OPENCL_1_0_APIS +#endif + +#endif /* __CL_VERSION_H */ diff --git a/third_party/opencl/OpenCL-Headers/CL/opencl.h b/third_party/opencl/OpenCL-Headers/CL/opencl.h new file mode 100644 index 0000000000000000000000000000000000000000..9855cd75e7da064e094658b660851997c38a8c56 --- /dev/null +++ b/third_party/opencl/OpenCL-Headers/CL/opencl.h @@ -0,0 +1,59 @@ +/******************************************************************************* + * Copyright (c) 2008-2015 The Khronos Group Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and/or associated documentation files (the + * "Materials"), to deal in the Materials without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sublicense, and/or sell copies of the Materials, and to + * permit persons to whom the Materials are furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be included + * in all copies or substantial portions of the Materials. + * + * MODIFICATIONS TO THIS FILE MAY MEAN IT NO LONGER ACCURATELY REFLECTS + * KHRONOS STANDARDS. 
THE UNMODIFIED, NORMATIVE VERSIONS OF KHRONOS + * SPECIFICATIONS AND HEADER INFORMATION ARE LOCATED AT + * https://www.khronos.org/registry/ + * + * THE MATERIALS ARE PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY + * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * MATERIALS OR THE USE OR OTHER DEALINGS IN THE MATERIALS. + ******************************************************************************/ + +/* $Revision: 11708 $ on $Date: 2010-06-13 23:36:24 -0700 (Sun, 13 Jun 2010) $ */ + +#ifndef __OPENCL_H +#define __OPENCL_H + +#ifdef __cplusplus +extern "C" { +#endif + +#ifdef __APPLE__ + +#include +#include +#include +#include + +#else + +#include +#include +#include +#include + +#endif + +#ifdef __cplusplus +} +#endif + +#endif /* __OPENCL_H */ + diff --git a/third_party/opencl/OpenCL-Headers/LICENSE b/third_party/opencl/OpenCL-Headers/LICENSE new file mode 100644 index 0000000000000000000000000000000000000000..020ce65fcac2a60e44dab1626fa4924dec17ea23 --- /dev/null +++ b/third_party/opencl/OpenCL-Headers/LICENSE @@ -0,0 +1,25 @@ +Copyright (c) 2008-2015 The Khronos Group Inc. + +Permission is hereby granted, free of charge, to any person obtaining a +copy of this software and/or associated documentation files (the +"Materials"), to deal in the Materials without restriction, including +without limitation the rights to use, copy, modify, merge, publish, +distribute, sublicense, and/or sell copies of the Materials, and to +permit persons to whom the Materials are furnished to do so, subject to +the following conditions: + +The above copyright notice and this permission notice shall be included +in all copies or substantial portions of the Materials. 
+ +MODIFICATIONS TO THIS FILE MAY MEAN IT NO LONGER ACCURATELY REFLECTS +KHRONOS STANDARDS. THE UNMODIFIED, NORMATIVE VERSIONS OF KHRONOS +SPECIFICATIONS AND HEADER INFORMATION ARE LOCATED AT + https://www.khronos.org/registry/ + +THE MATERIALS ARE PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. +IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY +CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, +TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE +MATERIALS OR THE USE OR OTHER DEALINGS IN THE MATERIALS. diff --git a/third_party/opencl/OpenCL-Headers/README.md b/third_party/opencl/OpenCL-Headers/README.md new file mode 100644 index 0000000000000000000000000000000000000000..757e56e152f8bc2fed68d2cdf38164c3171f929d --- /dev/null +++ b/third_party/opencl/OpenCL-Headers/README.md @@ -0,0 +1,50 @@ +# OpenCLTM API Headers + +This repository contains C language headers for the OpenCL API. + +The authoritative public repository for these headers is located at: + +https://github.com/KhronosGroup/OpenCL-Headers + +Issues, proposed fixes for issues, and other suggested changes should be +created using Github. + +## Branch Structure + +The OpenCL API headers in this repository are Unified headers and are designed +to work with all released OpenCL versions. This differs from previous OpenCL +API headers, where version-specific API headers either existed in separate +branches, or in separate folders in a branch. + +## Compiling for a Specific OpenCL Version + +By default, the OpenCL API headers in this repository are for the latest +OpenCL version (currently OpenCL 2.2). To use these API headers to target +a different OpenCL version, an application may `#define` the preprocessor +value `CL_TARGET_OPENCL_VERSION` before including the OpenCL API headers. 
+The `CL_TARGET_OPENCL_VERSION` is a three digit decimal value representing +the OpenCL API version. + +For example, to enforce usage of no more than the OpenCL 1.2 APIs, you may +include the OpenCL API headers as follows: + +``` +#define CL_TARGET_OPENCL_VERSION 120 +#include +``` + +## Directory Structure + +``` +README.md This file +LICENSE Source license for the OpenCL API headers +CL/ Unified OpenCL API headers tree +``` + +## License + +See [LICENSE](LICENSE). + +--- + +OpenCL and the OpenCL logo are trademarks of Apple Inc. used by permission by Khronos. diff --git a/tools/android-debug-script/push2android.sh b/tools/android-debug-script/push2android.sh index fae1a856123bd16cf3f7a115f61b3e4473ff58a3..68cbc6cf858ed9fbf7f1fd2522cd897309e31f78 100644 --- a/tools/android-debug-script/push2android.sh +++ b/tools/android-debug-script/push2android.sh @@ -5,12 +5,12 @@ MODELS_PATH="../../test/models/*" MODELS_SRC="../../test/models" IMAGE_PATH="../../test/images/*" EXE_FILE="../../test/build/*" -EXE_DIR="data/local/tmp/bin" +EXE_DIR="/data/local/tmp/bin" adb shell mkdir ${EXE_DIR} -MODELS_DIR="data/local/tmp/models" +MODELS_DIR="/data/local/tmp/models" adb shell mkdir ${MODELS_DIR} for file in `ls ${MODELS_SRC}` -do +do adb shell mkdir ${MODELS_DIR}"/"${file} done @@ -19,11 +19,15 @@ ACL_BUILD_PATH="../../src/operators/kernel/mali/ACL_Android/build/*" adb push ${ACL_BUILD_PATH} ${EXE_DIR} fi -IMAGES_DIR="data/local/tmp/images" +IMAGES_DIR="/data/local/tmp/images" adb shell mkdir ${IMAGES_DIR} LIB_PATH="../../build/release/arm-v7a/build/*" adb push ${EXE_FILE} ${EXE_DIR} -adb push ${LIB_PATH} ${EXE_DIR} +for file in ${LIB_PATH} +do + adb push ${file} ${EXE_DIR} +done + if [[ $1 != "npm" ]]; then adb push ${IMAGE_PATH} ${IMAGES_DIR} adb push ${MODELS_PATH} ${MODELS_DIR} diff --git a/tools/build.sh b/tools/build.sh index 330bc208ef2c5e27b7ad113e9a202948a144829c..65d6f58fbfbcff37d9a3325e62a70241fc54aed9 100755 --- a/tools/build.sh +++ b/tools/build.sh @@ -92,6 +92,8 @@ 
build_for_android() { fi cd "../build/release/${PLATFORM}" make -j 8 + mkdir ./build/cl_kernel + cp ../../../src/operators/kernel/cl/cl_kernel/* ./build/cl_kernel/ } diff --git a/tools/op.cmake b/tools/op.cmake index f7a6ed4b134f78ddb23487cd3a861f244e6a86db..ae1ac1a4ffd4a5a563c8a7be0b90c9f26a6b0f70 100644 --- a/tools/op.cmake +++ b/tools/op.cmake @@ -106,9 +106,9 @@ if (CON GREATER -1) set(FOUND_MATCH ON) endif() -list(FIND NET "FPGAnets" CON) +list(FIND NET "FPGA_NET_V1" CON) if (CON GREATER -1) - message("FPGAnets enabled") + message("FPGA_NET_V1 enabled") set(FUSION_CONVADDRELU_OP ON) set(FUSION_CONVADDBNRELU_OP ON) set(FUSION_CONVADDBN_OP ON) @@ -124,6 +124,23 @@ if (CON GREATER -1) set(FOUND_MATCH ON) endif() +list(FIND NET "FPGA_NET_V2" CON) +if (CON GREATER -1) + message("FPGA_NET_V2 enabled") + set(FUSION_ELEMENTWISEADDRELU_OP ON) + set(FUSION_FC_OP ON) + set(POOL_OP ON) + set(SOFTMAX_OP ON) + set(FUSION_CONVBNRELU_OP ON) + set(FUSION_CONVBN_OP ON) + set(CONV_TRANSPOSE_OP ON) + set(FUSION_DECONVRELU_OP ON) + set(SLICE_OP ON) + set(TANH_OP ON) + set(ELEMENTWISEADD_OP ON) + set(FOUND_MATCH ON) +endif() + list(FIND NET "nlp" CON) if (CON GREATER -1) message("nlp enabled") @@ -201,9 +218,11 @@ if(NOT FOUND_MATCH) set(PRIORBOX_OP ON) set(RELU_OP ON) set(RESHAPE_OP ON) + set(RESHAPE2_OP ON) set(SIGMOID_OP ON) set(SOFTMAX_OP ON) set(TRANSPOSE_OP ON) + set(TRANSPOSE2_OP ON) set(FUSION_CONVADDBNRELU_OP ON) set(FUSION_CONVADDADDPRELU_OP ON) set(FUSION_DWCONVBNRELU_OP ON) @@ -246,9 +265,11 @@ endif() # option(PRIORBOX_OP "" ON) # option(RELU_OP "" ON) # option(RESHAPE_OP "" ON) + # option(RESHAPE2_OP "" ON) # option(SIGMOID_OP "" ON) # option(SOFTMAX_OP "" ON) # option(TRANSPOSE_OP "" ON) + # option(TRANSPOSE2_OP "" ON) # endif () if (BATCHNORM_OP) @@ -314,6 +335,9 @@ endif() if (RESHAPE_OP) add_definitions(-DRESHAPE_OP) endif() +if (RESHAPE2_OP) + add_definitions(-DRESHAPE2_OP) +endif() if (SIGMOID_OP) add_definitions(-DSIGMOID_OP) endif() @@ -323,6 +347,9 @@ endif() 
if (TRANSPOSE_OP) add_definitions(-DTRANSPOSE_OP) endif() +if (TRANSPOSE2_OP) + add_definitions(-DTRANSPOSE2_OP) +endif() if (FUSION_CONVADDBNRELU_OP) add_definitions(-DFUSION_CONVADDBNRELU_OP) endif() @@ -420,3 +447,9 @@ if (DEQUANT_OP) add_definitions(-DDEQUANT_OP) endif() +if (TANH_OP) + add_definitions(-DTANH_OP) +endif() +if (FUSION_DECONVRELU_OP) + add_definitions(-DFUSION_DECONVRELU_OP) +endif() \ No newline at end of file diff --git a/tools/pre-commit.hooks/clang-format.hook b/tools/pre-commit.hooks/clang-format.hook index ece9ebc598e3fa63d1d76409dc0068854aaec851..92377d2dd6b53c69aaff41e4ea204b80fef31671 100644 --- a/tools/pre-commit.hooks/clang-format.hook +++ b/tools/pre-commit.hooks/clang-format.hook @@ -17,7 +17,7 @@ shift perl -i -pe 's|^\s+#pragma\s+omp|// #pragma omp|' "$@" ( # remove clang format ios_io folder -flist=$(echo "$@" | perl -pe 's|src/ios_io/[^ ]*||') +flist=$(echo "$@" | perl -pe 's|src/io/ios_io/[^ ]*||') clang-format -i $flist ) perl -i -pe 's|// ||' "$@"