diff --git a/CMakeLists.txt b/CMakeLists.txt
index 7475fed772b96fb373c7fe03c6a6fb650181849c..f5d68712a64b5a47657a7af9c0e6b47604893e23 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -213,7 +213,7 @@ else()
   set(NET "default" CACHE STRING "select net type")
 endif()
-set_property(CACHE NET PROPERTY STRINGS "default" "googlenet" "mobilenet" "yolo" "squeezenet" "FPGAnets" "NLP")
+set_property(CACHE NET PROPERTY STRINGS "default" "googlenet" "mobilenet" "yolo" "squeezenet" "FPGA_NET_V1" "FPGA_NET_V2" "NLP")
 include("${CMAKE_CURRENT_LIST_DIR}/tools/op.cmake")
diff --git a/README.md b/README.md
index 2572f25444dc4268e7a6a3f43cfdc1b38dae8e02..c3a30009825a7f8f9f5c4940a847fc88fe6a840e 100644
--- a/README.md
+++ b/README.md
@@ -37,7 +37,8 @@
 The development documentation mainly covers building and running the library. As a developer, you can use it together with the contribution documentation.
 * [iOS](https://github.com/PaddlePaddle/paddle-mobile/blob/develop/doc/development_ios.md)
-* [Android](https://github.com/PaddlePaddle/paddle-mobile/blob/develop/doc/development_android.md)
+* [Android_CPU](https://github.com/PaddlePaddle/paddle-mobile/blob/develop/doc/development_android.md)
+* [Android_GPU](https://github.com/PaddlePaddle/paddle-mobile/blob/develop/doc/development_android_GPU.md)
 * [FPGA](https://github.com/PaddlePaddle/paddle-mobile/blob/develop/doc/development_fpga.md)
 * [ARM_LINUX](https://github.com/PaddlePaddle/paddle-mobile/blob/develop/doc/development_arm_linux.md)
diff --git a/doc/development_android.md b/doc/development_android.md
index 528d7aa2def78103b8dbdcf0329279f029c85cac..d0da36aa82cfc9d4826f03bd2bdf1dd8f551965a 100644
--- a/doc/development_android.md
+++ b/doc/development_android.md
@@ -1,10 +1,93 @@
 # Android development guide
-Users can cross-compile a paddle-mobile library for the Android platform in either of the following two ways:
+Users can build the library in either of the following two ways:
+- Cross-compiling on macOS or Linux
 - Building inside a Docker container
-- Cross-compiling on Linux
+## Cross-compiling on macOS or Linux
+
+Requirements: NDK r17 or later, CMake 3.0 or later.
+
+### Run the build
+Run the following commands from the paddle-mobile root directory:
+
+```
+
+cd tools
+sh build.sh android
+
+# To build a library that only supports certain networks (this keeps the package size down, since the resulting library contains only the operators needed by those models), use
+
+sh build.sh android mobilenet googlenet
+
+# These networks need to be configured in CMakeLists.txt (https://github.com/PaddlePaddle/paddle-mobile/blob/73769e7d05ef4820a115ad3fb9b1ca3f55179d03/CMakeLists.txt#L216); several common models are already configured there.
+
+```
+
+After the build finishes, the generated .so is placed in the build/release/ directory.
+
+The JNI headers are at [https://github.com/PaddlePaddle/paddle-mobile/tree/develop/src/io/jni](https://github.com/PaddlePaddle/paddle-mobile/tree/develop/src/io/jni)
+
+The C++ header is at [https://github.com/PaddlePaddle/paddle-mobile/blob/develop/src/io/paddle_inference_api.h](https://github.com/PaddlePaddle/paddle-mobile/blob/develop/src/io/paddle_inference_api.h)
+
+The unit-test executables are placed in the test/build directory.
+
+If you run into environment problems, see the environment setup section below.
+
+### Environment setup
+
+##### Download the Android NDK
+
+If Android Studio is installed on your machine, you can download and install the NDK directly from Android Studio.
+
+Alternatively, download it yourself from [https://developer.android.com/ndk/](https://developer.android.com/ndk/), or fetch it with the following commands:
+
+- macOS
+```
+wget https://dl.google.com/android/repository/android-ndk-r17b-darwin-x86_64.zip
+unzip android-ndk-r17b-darwin-x86_64.zip
+
+```
+- Linux
+```
+wget https://dl.google.com/android/repository/android-ndk-r17b-linux-x86_64.zip
+unzip android-ndk-r17b-linux-x86_64.zip
+```
+
+##### Set the environment variable
+The standalone toolchain bundled with the project locates the NDK through the NDK_ROOT environment variable, so it must be configured:
+
+```
+export NDK_ROOT="path to ndk"
+```
+
+##### Install CMake
+
+- macOS
+
+On macOS, CMake can be installed with Homebrew:
+
+```
+brew install cmake
+
+```
+- Linux
+
+On Linux, it can be installed with apt-get:
+```
+apt-get install cmake
+
+```
+
+##### Tips:
+To get a smaller library, you can build one that only supports the model structures you need.
+For example, running
+
+```
+sh build.sh android googlenet
+```
+produces a smaller library that supports only GoogLeNet.
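Taken end to end, the macOS/Linux build described above is only a handful of commands. A minimal sketch for a Linux host, assuming the NDK is unpacked next to the paddle-mobile checkout (the unpack path and the model list are illustrative):

```
# Fetch and unpack the NDK (use the darwin archive on macOS).
wget https://dl.google.com/android/repository/android-ndk-r17b-linux-x86_64.zip
unzip android-ndk-r17b-linux-x86_64.zip

# The bundled standalone toolchain finds the NDK through NDK_ROOT (no spaces around '=').
export NDK_ROOT="$PWD/android-ndk-r17b"

# Cross-compile; naming models keeps only the operators those models need.
cd paddle-mobile/tools
sh build.sh android mobilenet googlenet

# The shared library lands in build/release/, the unit-test binaries in test/build/.
ls ../build/release/ ../test/build/
```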
 ## Building inside a Docker container
 ### 1. Install Docker
@@ -67,115 +150,38 @@ root@5affd29d4fc5:/ # make
 ### 6. Inspect the build output
 The build output can be inspected on the host machine, under build and test/build in the paddle-mobile directory, and can be transferred to the device with adb or scp for execution.

-## Cross-compiling on Linux
-### Preparing the cross-compilation environment
-##### Download the Android NDK
+## Testing
+After the build finishes, an automated test script is provided to push the models and library files needed by the unit-test binaries to an Android device.

-To cross-compile paddle-mobile from source, the cross-compilation environment has to be prepared in advance. The C/C++ cross-compilation toolchain for the Android platform is the [Android NDK](https://developer.android.com/ndk/); you can download it yourself, or fetch it with the following commands:
-- macOS
-```
-wget https://dl.google.com/android/repository/android-ndk-r17b-darwin-x86_64.zip
-unzip android-ndk-r17b-darwin-x86_64.zip
+* Download the [mobilenet model and test_image_1x3x224x224_float (a preprocessed NCHW file)](http://mms-graph.bj.bcebos.com/paddle-mobile/opencl_test_src.zip) needed for testing.

-```
-- Linux
-```
-wget https://dl.google.com/android/repository/android-ndk-r17b-linux-x86_64.zip
-unzip android-ndk-r17b-linux-x86_64.zip
-```
-
-##### Set the environment variable
-The standalone toolchain bundled with the project locates the NDK through the NDK_ROOT environment variable, so it must be configured:
-
-```
-export NDK_ROOT = "path to ndk"
-```
-### Run the build
-Run the following commands from the paddle-mobile root directory:
+* Create the model and image folders:

 ```
-cd tools
-sh build.sh android
-
+cd test
+mkdir models
+mkdir images
 ```
-After the build finishes, the generated .so is placed in the build directory and the unit-test executables in the test/build directory.
-##### Tips:
-To get a smaller library, you can build one that only supports the model structures you need.
-For example, running:
-```
-sh build.sh android googlenet
-```
-produces a smaller library that supports only GoogLeNet.
+* Copy mobilenet into paddle-mobile/test/models and test_image_1x3x224x224_float into paddle-mobile/test/images.

-## Testing
-After the build finishes, an automated test script is provided to push the models and library files needed by the unit-test binaries to an Android device. Run the following commands:
+* Run the following commands to deploy the executables and the files needed for inference to the phone:

 ```
 cd tools/android-debug-script
-sh run_on_android.sh (npm)   The optional argument npm selects whether model files are pushed to the phone.
+sh push2android.sh
 ```
-The following prompt appears:
-```
-**** choose OP or NET to test ****
-which to test :
-```
-Enter a name to run the corresponding test binary.
+* mobilenet CPU inference

-## Deployment
-Android applications can call the underlying C/C++ code through JNI. paddle-mobile exposes the following JNI interfaces:
+Assuming mobilenet and test_image_1x3x224x224_float have been pushed to the phone, run the following commands to run mobilenet inference on the CPU:

-##### 1. The load interface: loading model parameters
-- Used for models whose parameter files are stored separately
 ```
-/**
- * Load separated parameters
- * @param modelDir
- * @return
- */
- public static native boolean load(String modelDir);
+adb shell
+cd /data/local/tmp/bin/
+export LD_LIBRARY_PATH=.
+./test-mobilenet
 ```
-- Used for models whose parameter files are combined into one file
-```
-/**
- * Load combined parameters
- * @param modelPath
- * @param paramPath
- * @return
- */
- public static native boolean loadCombined(String modelPath, String paramPath);
-```
-##### 2. The predict interface: running inference
-- predict interface that accepts a preprocessed RGB array
-```
-/**
- * @param buf input data
- * @return output data
- */
-JNIEXPORT jfloatArray JNICALL Java_com_baidu_paddle_PML_predictImage(
-    JNIEnv *env, jclass thiz, jfloatArray buf);
-```
-- predict interface that accepts raw YUV data
-```
- /**
-  *
-  * @param buf byte array in yuv420 format
-  * @param imgWidth width of the yuv data
-  * @param imgHeight height of the yuv data
-  * @param ddims shape of the input data
-  * @param meanValues per-channel means used when the model was trained
-  * @return
-  */
-
-  public static native float[] predictYuv(byte[] buf, int imgWidth, int imgHeight, int[] ddims, float[] meanValues);
-
-```
-##### 3. The clear interface: destroying the instance and freeing memory
-
-```
-JNIEXPORT void JNICALL Java_com_baidu_paddle_PMLL_clear(JNIEnv *env,
-                                                        jclass thiz);
-```
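The test steps above condense into a similar shell sketch; the source paths for the unpacked mobilenet model and test image are placeholders, while the on-device path /data/local/tmp/bin is the one shown above:

```
# Prepare the model and image folders expected by the unit tests.
cd paddle-mobile/test
mkdir -p models images
cp -r /path/to/opencl_test_src/mobilenet models/
cp /path/to/opencl_test_src/test_image_1x3x224x224_float images/

# Push the test binaries, libraries, models and images to the device.
cd ../tools/android-debug-script
sh push2android.sh

# Run the mobilenet CPU unit test on the device.
adb shell "cd /data/local/tmp/bin && export LD_LIBRARY_PATH=. && ./test-mobilenet"
```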
diff --git a/src/common/types.cpp b/src/common/types.cpp
index 8f284b3fe1115bd8cec78430a405289aae98e898..ba00f639d76ae7c928f5b7484c08cec0b0926904 100644
--- a/src/common/types.cpp
+++ b/src/common/types.cpp
@@ -71,10 +71,10 @@ const char *G_OP_TYPE_SUM = "sum";
 const char *G_OP_TYPE_QUANTIZE = "quantize";
 const char *G_OP_TYPE_DEQUANTIZE = "dequantize";

-extern const char *G_OP_TYPE_TANH = "tanh";
-extern const char *G_OP_TYPE_FUSION_DECONV_RELU = "fusion_deconv_relu";
-extern const char *G_OP_TYPE_FUSION_DECONV_ADD = "fusion_deconv_add";
-extern const char *G_OP_TYPE_FUSION_DECONV_ADD_RELU = "fusion_deconv_add_relu";
+const char *G_OP_TYPE_TANH = "tanh";
+const char *G_OP_TYPE_FUSION_DECONV_RELU = "fusion_deconv_relu";
+const char *G_OP_TYPE_FUSION_DECONV_ADD = "fusion_deconv_add";
+const char *G_OP_TYPE_FUSION_DECONV_ADD_RELU = "fusion_deconv_add_relu";

 std::unordered_map<
     std::string, std::pair, std::vector>>
diff --git a/src/fpga/V1/api.cpp b/src/fpga/V1/api.cpp
index 04e51ab9b09fabc41fcd1cd73864bc285d183821..7c1f15f7c90e0b1ebc15a9ec8f3f6333ff173978 100644
--- a/src/fpga/V1/api.cpp
+++ b/src/fpga/V1/api.cpp
@@ -13,251 +13,13 @@ See the License for the specific language governing permissions and
 limitations under the License. */

 #include "fpga/V1/api.h"
-#include
-#include
-#include
-#include
-#include
 #include "fpga/V1/bias_scale.h"
 #include "fpga/V1/filter.h"
 #include "fpga/V1/image.h"
-#define FPGA_TEST_MODE
-#define PADDLE_MOBILE_OS_LINUX

 namespace paddle_mobile {
 namespace fpga {

-static int fd = -1;
-static const char *device_path = "/dev/fpgadrv0";
-static std::map memory_map;
-
-static inline int do_ioctl(int req, const void *arg) {
-#ifdef PADDLE_MOBILE_OS_LINUX
-  int result = ioctl(fd, req, (uint64_t)arg);
-  PADDLE_MOBILE_ENFORCE(result == 0, "ioctl didn't return correctly");
-  return result;
-#else
-  return -1;
-#endif
-}
-
-int open_device() {
-  if (fd == -1) {
-    fd = open(device_path, O_RDWR);
-  }
-  return fd;
-}
-
-// memory management;
-void *fpga_malloc(size_t size) {
-  static uint64_t counter = 0;
-
-#ifdef PADDLE_MOBILE_OS_LINUX
-  auto ptr = mmap64(nullptr, size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
-#else
-  auto ptr = malloc(size);
-#endif
-  counter += size;
-  memory_map.insert(std::make_pair(ptr, size));
-  // DLOG << "Address: " << ptr << ", " << size << " bytes allocated. 
Total " - // << counter << " bytes"; - return ptr; -} - -void fpga_free(void *ptr) { - static uint64_t counter = 0; - size_t size = 0; - - auto iter = memory_map.find(ptr); // std::map::iterator - if (iter != memory_map.end()) { - size = iter->second; - memory_map.erase(iter); -#ifdef PADDLE_MOBILE_OS_LINUX - munmap(ptr, size); -#else - free(ptr); -#endif - counter += size; - // DLOG << "Address: " << ptr << ", " << size << " bytes freed. Total " - // << counter << " bytes"; - } else { - DLOG << "Invalid pointer"; - } -} - -void fpga_copy(void *dest, const void *src, size_t num) { - memcpy(dest, src, num); -} - -int fpga_flush(void *address, size_t size) { - struct MemoryCacheArgs args = {nullptr}; - args.address = address; - args.size = size; - return do_ioctl(IOCTL_MEMCACHE_FLUSH, &args); -} - -int fpga_invalidate(void *address, size_t size) { - struct MemoryCacheArgs args = {nullptr}; - args.address = address; - args.size = size; - return do_ioctl(IOCTL_MEMCACHE_INVAL, &args); -} - -half fp32_2_fp16(float fp32_num) { - unsigned long tmp = *(unsigned long *)(&fp32_num); // NOLINT - half t = ((tmp & 0x007fffff) >> 13) | ((tmp & 0x80000000) >> 16) | - (((tmp & 0x7f800000) >> 13) - (112 << 10)); - if (tmp & 0x1000) { - t++; // roundoff - } - return t; -} - -float fp16_2_fp32(half fp16_num) { - int frac = (fp16_num & 0x3ff); - int exp = ((fp16_num & 0x7c00) >> 10) + 112; - int s = fp16_num & 0x8000; - int tmp = 0; - float fp32_num; - tmp = s << 16 | exp << 23 | frac << 13; - fp32_num = *(float *)&tmp; // NOLINT - return fp32_num; -} - -int ComputeBasicConv(const struct ConvArgs &args) { -#ifdef FPGA_TEST_MODE - DLOG << "======Compute Basic Conv======"; - DLOG << " relu_enabled:" << args.relu_enabled - << " sb_address:" << args.sb_address - << " filter_address:" << args.filter_address - << " filter_num:" << args.filter_num - << " group_num:" << args.group_num; - DLOG << " image_address:" << args.image.address - << " image_scale_address:" << args.image.scale_address - << " image_channels:" << args.image.channels - << " image_height:" << args.image.height - << " image_width:" << args.image.width - << " pad_height:" << args.image.pad_height - << " pad_width:" << args.image.pad_width; - DLOG << " kernel_height:" << args.kernel.height - << " kernel_width:" << args.kernel.width - << " stride_h:" << args.kernel.stride_h - << " stride_w:" << args.kernel.stride_w; - DLOG << " out_address:" << args.output.address - << " out_scale_address:" << args.output.scale_address; -#endif - return do_ioctl(IOCTL_CONFIG_CONV, &args); -} - -int ComputeFpgaConv(const struct SplitConvArgs &args) { -#ifdef FPGA_TEST_MODE - DLOG << "=============ComputeFPGAConv==========="; - DLOG << " filter_num:" << args.filter_num - << " group_num:" << args.group_num - << " split_num:" << args.split_num; -#endif - - int split_num = args.split_num; - for (int i = 0; i < split_num; i++) { - ComputeBasicConv(args.conv_args[i]); - } - - if (split_num > 1) { - ComputeFPGAConcat(args.concat_arg); - } -} - -int ComputeFpgaPool(const struct PoolingArgs &args) { -#ifdef FPGA_TEST_MODE - DLOG << "=============ComputeFpgaPool==========="; - DLOG << " mode:" << args.mode - << " kernel_reciprocal:" << fp16_2_fp32(args.kernel_reciprocal); - DLOG << " image_address:" << args.image.address - << " image_scale_address:" << args.image.scale_address - << " image_channels:" << args.image.channels - << " image_height:" << args.image.height - << " image_width:" << args.image.width - << " pad_height:" << args.image.pad_height - << " pad_width:" << 
args.image.pad_width; - DLOG << " kernel_height:" << args.kernel.height - << " kernel_width:" << args.kernel.width - << " stride_h:" << args.kernel.stride_h - << " stride_w:" << args.kernel.stride_w; - DLOG << " out_address:" << args.output.address - << " out_scale_address:" << args.output.scale_address; -#endif - - return do_ioctl(IOCTL_CONFIG_POOLING, &args); -} - -int ComputeFpgaEWAdd(const struct EWAddArgs &args) { -#ifdef FPGA_TEST_MODE - DLOG << "=============ComputeFpgaEWAdd==========="; - DLOG << " relu_enabled:" << args.relu_enabled - << " const0:" << fp16_2_fp32(int16_t(args.const0)) - << " const1:" << fp16_2_fp32(int16_t(args.const1)); - DLOG << " image0_address:" << args.image0.address - << " image0_scale_address:" << args.image0.scale_address - << " image0_channels:" << args.image0.channels - << " image0_height:" << args.image0.height - << " image0_width:" << args.image0.width - << " pad0_height:" << args.image0.pad_height - << " pad0_width:" << args.image0.pad_width; - DLOG << " image1_address:" << args.image1.address - << " image1_scale_address:" << args.image1.scale_address - << " image1_channels:" << args.image1.channels - << " image1_height:" << args.image1.height - << " image1_width:" << args.image1.width - << " pad1_height:" << args.image1.pad_height - << " pad_width:" << args.image1.pad_width; - DLOG << " out_address:" << args.output.address - << " out_scale_address:" << args.output.scale_address; -#endif - - return do_ioctl(IOCTL_CONFIG_EW, &args); -} -int PerformBypass(const struct BypassArgs &args) { -#ifdef FPGA_TEST_MODE - DLOG << "=============ComputeFpgaBypass==========="; - DLOG << " input_type:" << args.input_data_type - << " output_type:" << args.output_data_type - << " input_layout_type:" << args.input_layout_type - << " output_layout_type:" << args.output_layout_type; - DLOG << " image_address:" << args.image.address - << " image_scale_address:" << args.image.scale_address - << " image_channels:" << args.image.channels - << " image_height:" << args.image.height - << " image_width:" << args.image.width - << " pad_height:" << args.image.pad_height - << " pad_width:" << args.image.pad_width; - DLOG << " out_address:" << args.output.address - << " out_scale_address:" << args.output.scale_address; -#endif - - return do_ioctl(IOCTL_CONFIG_BYPASS, &args); -} - -int ComputeFPGAConcat(const struct ConcatArgs &args) { -#ifdef FPGA_TEST_MODE - DLOG << "=============ComputeFpgaConcat==========="; - DLOG << " Image_num: " << args.image_num - << " out_address:" << args.image_out - << " out_scale_address:" << args.scale_out; - DLOG << " image_height:" << args.height << " image_width:" << args.width; - for (int i = 0; i < args.image_num; i++) { - DLOG << " " << i << "th: "; - DLOG << " channel_num:" << args.channel_num[i] - << " image_address:" << args.images_in[i] - << " image_scale_address:" << args.scales_in[i]; - } -#endif - - image::concat_images(args.images_in, args.scales_in, args.image_out, - args.scale_out, args.image_num, args.channel_num, - args.height, args.width); - return 0; -} - int get_align_image_cw(int cw) { return align_to_x(cw, IMAGE_ALIGNMENT); } void format_image(framework::Tensor *image_tensor) { @@ -397,7 +159,7 @@ void fill_split_arg(struct SplitConvArgs *arg, framework::Tensor *input, arg->filter_num = (uint32_t)filter->dims()[0]; arg->output.address = out_ptr; arg->output.scale_address = out->scale; - arg->conv_args = + arg->conv_arg = (ConvArgs *)fpga_malloc(arg->split_num * sizeof(ConvArgs)); // NOLINT arg->concat_arg.image_num = 
arg->split_num; @@ -420,44 +182,44 @@ void fill_split_arg(struct SplitConvArgs *arg, framework::Tensor *input, filter->dims()[1] * filter->dims()[2] * filter->dims()[3]); for (int i = 0; i < n; i++) { - arg->conv_args[i].relu_enabled = relu_enabled; - arg->conv_args[i].group_num = (uint32_t)group_num; - arg->conv_args[i].kernel.stride_h = (uint32_t)stride_h; - arg->conv_args[i].kernel.stride_w = (uint32_t)stride_w; - arg->conv_args[i].kernel.height = (uint32_t)filter->dims()[2]; - arg->conv_args[i].kernel.width = (uint32_t)filter->dims()[3]; - arg->conv_args[i].image.address = input_ptr; - arg->conv_args[i].image.channels = (uint32_t)input->dims()[1]; - arg->conv_args[i].image.height = (uint32_t)input->dims()[2]; - arg->conv_args[i].image.width = (uint32_t)input->dims()[3]; - arg->conv_args[i].image.scale_address = input->scale; - arg->conv_args[i].image.pad_height = (uint32_t)padding_h; - arg->conv_args[i].image.pad_width = (uint32_t)padding_w; - arg->conv_args[i].filter_scale_address = filter->scale; - arg->conv_args[i].filter_address = &( + arg->conv_arg[i].relu_enabled = relu_enabled; + arg->conv_arg[i].group_num = (uint32_t)group_num; + arg->conv_arg[i].kernel.stride_h = (uint32_t)stride_h; + arg->conv_arg[i].kernel.stride_w = (uint32_t)stride_w; + arg->conv_arg[i].kernel.height = (uint32_t)filter->dims()[2]; + arg->conv_arg[i].kernel.width = (uint32_t)filter->dims()[3]; + arg->conv_arg[i].image.address = input_ptr; + arg->conv_arg[i].image.channels = (uint32_t)input->dims()[1]; + arg->conv_arg[i].image.height = (uint32_t)input->dims()[2]; + arg->conv_arg[i].image.width = (uint32_t)input->dims()[3]; + arg->conv_arg[i].image.scale_address = input->scale; + arg->conv_arg[i].image.pad_height = (uint32_t)padding_h; + arg->conv_arg[i].image.pad_width = (uint32_t)padding_w; + arg->conv_arg[i].filter_scale_address = filter->scale; + arg->conv_arg[i].filter_address = &( (int8_t *)filter_ptr)[i * element_num * filter_num_per_div]; // NOLINT - arg->conv_args[i].sb_address = &bs_ptr[i * filter_num_per_div * 2]; - arg->conv_args[i].filter_num = (uint32_t)( + arg->conv_arg[i].sb_address = &bs_ptr[i * filter_num_per_div * 2]; + arg->conv_arg[i].filter_num = (uint32_t)( i == n - 1 ? 
channel - (n - 1) * filter_num_per_div // NOLINT : filter_num_per_div); if (n > 1) { - arg->conv_args[i].output.scale_address = + arg->conv_arg[i].output.scale_address = (float *)fpga_malloc(2 * sizeof(float)); // NOLINT - arg->conv_args[i].output.address = fpga_malloc( - input->dims()[2] * - align_to_x(input->dims()[3] * arg->conv_args[i].filter_num, - IMAGE_ALIGNMENT) * - sizeof(half)); + arg->conv_arg[i].output.address = + fpga_malloc(input->dims()[2] * + align_to_x(input->dims()[3] * arg->conv_arg[i].filter_num, + IMAGE_ALIGNMENT) * + sizeof(half)); } else { - arg->conv_args[i].output.scale_address = out->scale; - arg->conv_args[i].output.address = out_ptr; + arg->conv_arg[i].output.scale_address = out->scale; + arg->conv_arg[i].output.address = out_ptr; } arg->concat_arg.images_in[i] = - (half *)arg->conv_args[i].output.address; // NOLINT - arg->concat_arg.scales_in[i] = arg->conv_args[i].output.scale_address; - arg->concat_arg.channel_num[i] = arg->conv_args[i].filter_num; + (half *)arg->conv_arg[i].output.address; // NOLINT + arg->concat_arg.scales_in[i] = arg->conv_arg[i].output.scale_address; + arg->concat_arg.channel_num[i] = arg->conv_arg[i].filter_num; } } diff --git a/src/fpga/V1/api.h b/src/fpga/V1/api.h index f535975a35ecc3c454bbac597b31d8c3670cbf91..daa7902ab4a6cb72a77bba31f8cfe84c897f30a4 100644 --- a/src/fpga/V1/api.h +++ b/src/fpga/V1/api.h @@ -14,178 +14,13 @@ limitations under the License. */ #pragma once -#include -#include -#include -#include +#include "fpga/common/fpga_common.h" +#include "fpga/common/pe.h" #include "framework/tensor.h" namespace paddle_mobile { namespace fpga { -enum DataType { - DATA_TYPE_FP32 = 1, - DATA_TYPE_FP16 = 0, -}; - -enum LayoutType { - LAYOUT_CHW = 1, - LAYOUT_HWC = 0, -}; - -struct VersionArgs { - void* buffer; -}; - -struct MemoryCopyArgs { - void* src; - void* dest; - size_t size; -}; - -struct KernelArgs { - uint32_t width; - uint32_t height; - uint32_t stride_w; - uint32_t stride_h; -}; - -struct ImageInputArgs { - void* address; // input featuremap virtual address - float* scale_address; // input scale address; - uint32_t channels; - uint32_t width; // featuremap width - uint32_t height; - uint32_t pad_width; // padding width; - uint32_t pad_height; -}; - -struct ImageOutputArgs { - void* address; // output result address; - float* scale_address; // output scale address; -}; - -struct ConvArgs { - bool relu_enabled; - void* sb_address; // scale and bias are interlaced; - void* filter_address; - float* filter_scale_address; - uint32_t filter_num; - uint32_t group_num; - - struct KernelArgs kernel; - struct ImageInputArgs image; // input image; - struct ImageOutputArgs output; -}; - -struct ConcatArgs { - uint32_t image_num; - half** images_in; - float** scales_in; - void* image_out; - float* scale_out; - uint32_t* channel_num; - uint32_t height; - uint32_t width; -}; - -struct SplitConvArgs { - uint32_t split_num; - uint32_t group_num; - uint32_t filter_num; - struct ImageOutputArgs output; - struct ConvArgs* conv_args; - struct ConcatArgs concat_arg; -}; - -struct GroupConvArgs { - uint32_t group_num; - uint32_t filter_num; - struct ImageOutputArgs output; - struct SplitConvArgs* conv_args; - struct ConcatArgs concat_arg; -}; - -struct PoolingArgs { - int16_t mode; // mode: 0:max, 1:avg - half kernel_reciprocal; - struct KernelArgs kernel; - struct ImageInputArgs image; // input image; - struct ImageOutputArgs output; -}; - -struct EWAddArgs { - bool relu_enabled; - - uint32_t const0; // output0 = const0 x input0 + const1 x 
input1; - uint32_t const1; - struct ImageInputArgs image0; - struct ImageInputArgs image1; - struct ImageOutputArgs output; -}; - -struct BypassArgs { - enum DataType input_data_type; - enum DataType output_data_type; - enum LayoutType input_layout_type; - enum LayoutType output_layout_type; - struct ImageInputArgs image; - struct ImageOutputArgs output; -}; - -struct FpgaRegWriteArgs { - uint64_t address; // - uint64_t value; -}; - -struct FpgaRegReadArgs { - uint64_t address; - uint64_t value; -}; - -struct MemoryCacheArgs { - void* address; - size_t size; -}; - -#define IOCTL_FPGA_MAGIC 'FPGA' - -#define IOCTL_VERSION _IOW(IOCTL_FPGA_MAGIC, 01, struct VersionArgs) - -#define IOCTL_SEPARATOR_0 10 - -#define IOCTL_MEM_COPY _IOW(IOCTL_FPGA_MAGIC, 11, struct MemoryCopyArgs) -#define IOCTL_MEMCACHE_INVAL _IOW(IOCTL_FPGA_MAGIC, 12, struct MemoryCacheArgs) -#define IOCTL_MEMCACHE_FLUSH _IOW(IOCTL_FPGA_MAGIC, 13, struct MemoryCacheArgs) - -#define IOCTL_SEPARATOR_1 20 - -#define IOCTL_CONFIG_CONV _IOW(IOCTL_FPGA_MAGIC, 21, struct ConvArgs) -#define IOCTL_CONFIG_POOLING _IOW(IOCTL_FPGA_MAGIC, 22, struct PoolingArgs) -#define IOCTL_CONFIG_EW _IOW(IOCTL_FPGA_MAGIC, 23, struct EWAddArgs) -#define IOCTL_CONFIG_BYPASS _IOW(IOCTL_FPGA_MAGIC, 24, struct BypassArgs) -#define IOCTL_FPGA_REG_READ _IOW(IOCTL_FPGA_MAGIC, 28, struct FpgaRegReadArgs) -#define IOCTL_FPGA_REG_WRITE _IOW(IOCTL_FPGA_MAGIC, 29, struct FpgaRegWriteArgs) - -//============================== API ============================= - -int open_device(); -int close_device(); - -void* fpga_malloc(size_t size); -void fpga_free(void* ptr); -void fpga_copy(void* dst, const void* src, size_t num); -int fpga_flush(void* address, size_t size); -int fpga_invalidate(void* address, size_t size); - -int PerformBypass(const struct BypassArgs& args); -int ComputeFpgaConv(const struct SplitConvArgs& args); -int ComputeFpgaPool(const struct PoolingArgs& args); -int ComputeFpgaEWAdd(const struct EWAddArgs& args); -int ComputeFPGAConcat(const struct ConcatArgs& args); - -static inline int align_to_x(int num, int x) { return (num + x - 1) / x * x; } - int get_align_image_cw(int cw); void format_image(framework::Tensor* image_tensor); void format_fp16_ofm(framework::Tensor* ofm_tensor); // only allocate memory @@ -209,8 +44,5 @@ void fill_split_arg(struct SplitConvArgs* arg, framework::Tensor* input, bool relu_enabled, int group_num, int stride_h, int stride_w, int padding_h, int padding_w, float* bs_ptr); -half fp32_2_fp16(float fp32_num); -float fp16_2_fp32(half fp16_num); - } // namespace fpga } // namespace paddle_mobile diff --git a/src/fpga/V1/bias_scale.cpp b/src/fpga/V1/bias_scale.cpp index 3c2c04dc1d7f76953b04a879fbcfa8377dd7ba8a..263a7494c5602c13208aa0d8899ce80d781aa11b 100644 --- a/src/fpga/V1/bias_scale.cpp +++ b/src/fpga/V1/bias_scale.cpp @@ -14,7 +14,7 @@ limitations under the License. */ #include "fpga/V1/bias_scale.h" #include -#include "fpga/V1/api.h" +#include "fpga/common/fpga_common.h" namespace paddle_mobile { namespace fpga { diff --git a/src/fpga/V1/filter.cpp b/src/fpga/V1/filter.cpp index 3f4a3e2c876f0b54546f0e385d4a5e8bbfacdf3c..157ac90a60262cadacb648173cbc5ba6c01e674e 100644 --- a/src/fpga/V1/filter.cpp +++ b/src/fpga/V1/filter.cpp @@ -15,7 +15,7 @@ limitations under the License. 
*/ #include "fpga/V1/filter.h" #include #include -#include "fpga/V1/api.h" +#include "fpga/common/fpga_common.h" namespace paddle_mobile { namespace fpga { @@ -31,20 +31,22 @@ int calc_split_num(int num, int division_capacity) { } int calc_division_number(int num, int group_num, int division_capacity) { - PADDLE_MOBILE_ENFORCE(num % group_num == 0, - "Filter number should be divisible by group number"); + // PADDLE_MOBILE_ENFORCE(num % group_num == 0, + // "Filter number should be divisible by group + // number"); int split_num = calc_split_num(num, division_capacity); - PADDLE_MOBILE_ENFORCE(group_num == 1 || split_num == 1, - "Split number or group number should be 1"); + // PADDLE_MOBILE_ENFORCE(group_num == 1 || split_num == 1, + // "Split number or group number should be 1"); return group_num * split_num; } int calc_num_per_div(int num, int group_num, int division_capacity) { - PADDLE_MOBILE_ENFORCE(num % group_num == 0, - "Filter number should be divisible by group number"); + // PADDLE_MOBILE_ENFORCE(num % group_num == 0, + // "Filter number should be divisible by group + // number"); int split_num = calc_split_num(num, division_capacity); - PADDLE_MOBILE_ENFORCE(group_num == 1 || split_num == 1, - "Split number or group number should be 1"); + // PADDLE_MOBILE_ENFORCE(group_num == 1 || split_num == 1, + // "Split number or group number should be 1"); if (group_num == 1) { if (num > division_capacity) { return division_capacity; diff --git a/src/fpga/V1/image.cpp b/src/fpga/V1/image.cpp index 73be05c942d6a848db830148d25bc8b3e14b53e4..f4142ad58a273691c84db9dd585518e7edcff8a6 100644 --- a/src/fpga/V1/image.cpp +++ b/src/fpga/V1/image.cpp @@ -15,7 +15,7 @@ limitations under the License. */ #include "fpga/V1/image.h" #include #include -#include "fpga/V1/api.h" +#include "fpga/common/fpga_common.h" namespace paddle_mobile { namespace fpga { @@ -111,6 +111,27 @@ void concat_images(int16_t **images_in, float **scales_in, void *image_out, fpga_flush(image_out, height * align_each_out_area_cw * sizeof(int16_t)); } +void split_image(int16_t *image_in, float *scale_in, void **images_out, + float **scales_out, int image_num, uint32_t *channel_nums, + int height, int width) { + int total_channel = 0; + for (int i = 0; i < image_num; i++) { + scales_out[i][0] = scale_in[0]; + scales_out[i][1] = scale_in[1]; + total_channel += channel_nums[i]; + } + + for (int h = 0; h < height; h++) { + int src_offset = h * align_to_x(total_channel * width, IMAGE_ALIGNMENT); + for (int i = 0; i < image_num; i++) { + int des_offset = h * align_to_x(channel_nums[i] * width, IMAGE_ALIGNMENT); + memcpy((int16_t *)images_out[i] + des_offset, image_in + src_offset, + channel_nums[i] * sizeof(int16_t)); + src_offset += channel_nums[i]; + } + } +} + } // namespace image } // namespace fpga } // namespace paddle_mobile diff --git a/src/fpga/V1/image.h b/src/fpga/V1/image.h index 7e004916118ae97d60d24e798300d66a98191211..321967bbe233c5bec889aeb63f98dc23779b4918 100644 --- a/src/fpga/V1/image.h +++ b/src/fpga/V1/image.h @@ -28,6 +28,9 @@ void concat_images(int16_t** images_in, float** scales_in, void* image_out, float* scale_out, int image_num, uint32_t* channel_num, int height, int width); // Concat featuremaps along channel direction +void split_image(int16_t* image_in, float* scale_in, void** images_out, + float** scales_out, int image_num, uint32_t* channel_nums, + int height, int width); } // namespace image } // namespace fpga } // namespace paddle_mobile diff --git a/src/fpga/V1/pe.cpp b/src/fpga/V1/pe.cpp new file 
mode 100644 index 0000000000000000000000000000000000000000..1f0e5768a7017a4c7f928fea86b8f4ef3cdbae3d --- /dev/null +++ b/src/fpga/V1/pe.cpp @@ -0,0 +1,178 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "fpga/common/pe.h" +#include "fpga/V1/filter.h" +#include "fpga/V1/image.h" +#include "fpga/common/config.h" +#include "fpga/common/driver.h" + +namespace paddle_mobile { +namespace fpga { + +int ComputeFpgaConv(const struct SplitConvArgs &args) { + ComputeBasicConv(args.conv_arg[0]); +} + +int ComputeBasicConv(const struct ConvArgs &args) { +#ifdef FPGA_PRINT_MODE + DLOG << "======Compute Basic Conv======"; + DLOG << " relu_enabled:" << args.relu_enabled + << " sb_address:" << args.sb_address + << " filter_address:" << args.filter_address + << " filter_num:" << args.filter_num + << " group_num:" << args.group_num; + DLOG << " image_address:" << args.image.address + << " image_scale_address:" << args.image.scale_address + << " image_channels:" << args.image.channels + << " image_height:" << args.image.height + << " image_width:" << args.image.width + << " pad_height:" << args.image.pad_height + << " pad_width:" << args.image.pad_width; + DLOG << " kernel_height:" << args.kernel.height + << " kernel_width:" << args.kernel.width + << " stride_h:" << args.kernel.stride_h + << " stride_w:" << args.kernel.stride_w; + DLOG << " out_address:" << args.output.address + << " out_scale_address:" << args.output.scale_address; +#endif + +#ifndef PADDLE_MOBILE_ZU5 + return 0; +#endif + + return 0; +} + +int ComputeFpgaPool(const struct PoolingArgs &args) { +#ifdef FPGA_PRINT_MODE + DLOG << "=============ComputeFpgaPool==========="; + DLOG << " mode:" << args.mode + << " kernel_reciprocal:" << fp16_2_fp32(args.kernel_reciprocal); + DLOG << " image_address:" << args.image.address + << " image_scale_address:" << args.image.scale_address + << " image_channels:" << args.image.channels + << " image_height:" << args.image.height + << " image_width:" << args.image.width + << " pad_height:" << args.image.pad_height + << " pad_width:" << args.image.pad_width; + DLOG << " kernel_height:" << args.kernel.height + << " kernel_width:" << args.kernel.width + << " stride_h:" << args.kernel.stride_h + << " stride_w:" << args.kernel.stride_w; + DLOG << " out_address:" << args.output.address + << " out_scale_address:" << args.output.scale_address; +#endif +#ifndef PADDLE_MOBILE_ZU5 + return 0; +#endif + return 0; +} + +int ComputeFpgaEWAdd(const struct EWAddArgs &args) { +#ifdef FPGA_PRINT_MODE + DLOG << "=============ComputeFpgaEWAdd==========="; + DLOG << " relu_enabled:" << args.relu_enabled + << " const0:" << fp16_2_fp32(int16_t(args.const0)) + << " const1:" << fp16_2_fp32(int16_t(args.const1)); + DLOG << " image0_address:" << args.image0.address + << " image0_scale_address:" << args.image0.scale_address + << " image0_channels:" << args.image0.channels + << " image0_height:" << args.image0.height + << " image0_width:" << args.image0.width + << " pad0_height:" 
<< args.image0.pad_height + << " pad0_width:" << args.image0.pad_width; + DLOG << " image1_address:" << args.image1.address + << " image1_scale_address:" << args.image1.scale_address + << " image1_channels:" << args.image1.channels + << " image1_height:" << args.image1.height + << " image1_width:" << args.image1.width + << " pad1_height:" << args.image1.pad_height + << " pad_width:" << args.image1.pad_width; + DLOG << " out_address:" << args.output.address + << " out_scale_address:" << args.output.scale_address; +#endif +#ifndef PADDLE_MOBILE_ZU5 + return 0; +#endif + return 0; +} + +int PerformBypass(const struct BypassArgs &args) { +#ifdef FPGA_PRINT_MODE + DLOG << "=============ComputeFpgaBypass==========="; + DLOG << " input_type:" << args.input_data_type + << " output_type:" << args.output_data_type + << " input_layout_type:" << args.input_layout_type + << " output_layout_type:" << args.output_layout_type; + DLOG << " image_address:" << args.image.address + << " image_scale_address:" << args.image.scale_address + << " image_channels:" << args.image.channels + << " image_height:" << args.image.height + << " image_width:" << args.image.width + << " pad_height:" << args.image.pad_height + << " pad_width:" << args.image.pad_width; + DLOG << " out_address:" << args.output.address + << " out_scale_address:" << args.output.scale_address; +#endif +#ifndef PADDLE_MOBILE_ZU5 + return 0; +#endif + + return 0; +} + +int ComputeFPGAConcat(const struct ConcatArgs &args) { +#ifdef FPGA_PRINT_MODE + DLOG << "=============ComputeFpgaConcat==========="; + DLOG << " Image_num: " << args.image_num + << " out_address:" << args.image_out + << " out_scale_address:" << args.scale_out; + DLOG << " image_height:" << args.height << " image_width:" << args.width; + for (int i = 0; i < args.image_num; i++) { + DLOG << " " << i << "th: "; + DLOG << " channel_num:" << args.channel_num[i] + << " image_address:" << args.images_in[i] + << " image_scale_address:" << args.scales_in[i]; + } +#endif + + image::concat_images(args.images_in, args.scales_in, args.image_out, + args.scale_out, args.image_num, args.channel_num, + args.height, args.width); + return 0; +} + +int ComputeFPGASplit(const struct SplitArgs &args) { +#ifdef FPGA_PRINT_MODE + DLOG << "=============ComputeFpgaSplit==========="; + DLOG << " Image_num: " << args.image_num + << " in_address:" << args.image_in + << " in_scale_address:" << args.scale_in; + DLOG << " image_height:" << args.height << " image_width:" << args.width; + for (int i = 0; i < args.image_num; i++) { + DLOG << " " << i << "th: "; + DLOG << " channel_num:" << args.out_channel_nums[i] + << " image_address:" << args.images_out[i] + << " image_scale_address:" << args.scales_out[i]; + } +#endif + image::split_image(args.image_in, args.scale_in, args.images_out, + args.scales_out, args.image_num, args.out_channel_nums, + args.height, args.width); + return 0; +} + +} // namespace fpga +} // namespace paddle_mobile diff --git a/src/fpga/V2/api.cpp b/src/fpga/V2/api.cpp index 2f8a9f119e643b3836ef2c541e098f39ab3cbd17..5bfd34104600668ce63a9c7d684d4482d5d804fb 100644 --- a/src/fpga/V2/api.cpp +++ b/src/fpga/V2/api.cpp @@ -13,84 +13,13 @@ See the License for the specific language governing permissions and limitations under the License. 
*/ #include "fpga/V2/api.h" -#include #include "fpga/V2/bias_scale.h" -#include "fpga/V2/config.h" -#include "fpga/V2/driver/driver.h" #include "fpga/V2/filter.h" #include "fpga/V2/image.h" namespace paddle_mobile { namespace fpga { -static std::map memory_map; - -int open_device() { - int ret = driver::open_device_driver(); - return ret; -} - -int close_device() { - int ret = driver::close_device_driver(); - return ret; -} - -void *fpga_malloc(size_t size) { - static uint64_t counter = 0; -#ifdef PADDLE_MOBILE_ZU5 - auto ptr = driver::fpga_malloc_driver(size); -#else - auto ptr = malloc(size); -#endif - counter += size; - memory_map.insert(std::make_pair(ptr, size)); - // DLOG << "Address: " << ptr << ", " << size << " bytes allocated. Total " - // << counter << " bytes"; - return ptr; -} - -void fpga_free(void *ptr) { - static uint64_t counter = 0; - size_t size = 0; - auto iter = memory_map.find(ptr); // std::map::iterator - if (iter != memory_map.end()) { - size = iter->second; - memory_map.erase(iter); -#ifdef PADDLE_MOBILE_ZU5 - driver::fpga_free_driver(ptr); -#else - free(ptr); -#endif - counter += size; - // DLOG << "Address: " << ptr << ", " << size << " bytes freed. Total " - // << counter << " bytes"; - } else { - DLOG << "Invalid pointer"; - } -} -void fpga_copy(void *dest, const void *src, size_t num) { -#ifdef PADDLE_MOBILE_ZU5 - driver::fpga_copy_driver(dest, src, num); -#else - memcpy(dest, src, num); -#endif -} - -int fpga_flush(void *address, size_t size) { -#ifdef PADDLE_MOBILE_ZU5 - return driver::fpga_flush_driver(address, size); -#else - return 0; -#endif -} -int fpga_invalidate(void *address, size_t size) { -#ifdef PADDLE_MOBILE_ZU5 - return driver::fpga_invalidate_driver(address, size); -#else - return 0; -#endif -} - void format_image(framework::Tensor *image_tensor) { auto dims = image_tensor->dims(); auto channel = dims[1], height = dims[2], width = dims[3]; @@ -284,8 +213,8 @@ void fill_split_arg(struct SplitConvArgs *arg, framework::Tensor *input, arg->conv_arg[i].output.address = out_ptr; arg->conv_arg[i].output.scale_address = out->scale; - int num_after_alignment = - filter::calc_aligned_num((int)input->dims()[1], arg->filter_num); + int num_after_alignment = filter::calc_aligned_num( + (int)input->dims()[1], arg->filter_num); // NOLINT arg->conv_arg[i].free_space = fpga_malloc(num_after_alignment * 2 * sizeof(half)); } diff --git a/src/fpga/V2/api.h b/src/fpga/V2/api.h index 1f4a203936b517d93e2d417b08a8b8456cc1fc93..1386810164d72ef849162b76a8b83fcf32082907 100644 --- a/src/fpga/V2/api.h +++ b/src/fpga/V2/api.h @@ -14,21 +14,13 @@ limitations under the License. 
*/ #pragma once -#include "fpga/V2/driver/pe.h" -#include "fpga/V2/fpga_common.h" +#include "fpga/common/fpga_common.h" +#include "fpga/common/pe.h" #include "framework/tensor.h" namespace paddle_mobile { namespace fpga { -int open_device(); -int close_device(); -void* fpga_malloc(size_t size); -void fpga_free(void* ptr); -void fpga_copy(void* dest, const void* src, size_t num); -int fpga_flush(void* address, size_t size); -int fpga_invalidate(void* address, size_t size); - float filter_find_max(framework::Tensor* filter_tensor); int get_aligned_channel_num(int channel_num); int get_aligned_filter_num(framework::Tensor* filter_tensor); diff --git a/src/fpga/V2/bias_scale.cpp b/src/fpga/V2/bias_scale.cpp index 3afd3f51bbb10e3bb2d66195fcc54d25c56e2393..c8f587da330c6e6e9e35969d58ae27f4366830d2 100644 --- a/src/fpga/V2/bias_scale.cpp +++ b/src/fpga/V2/bias_scale.cpp @@ -14,7 +14,7 @@ limitations under the License. */ #include "fpga/V2/bias_scale.h" #include -#include "fpga/V2/api.h" +#include "fpga/common/fpga_common.h" namespace paddle_mobile { namespace fpga { diff --git a/src/fpga/V2/filter.cpp b/src/fpga/V2/filter.cpp index ce278edbeed64f2ca413c1f75ff620ee1f44c83d..b17ce4406bf1b6b4619d0e9e75d3f432dfa84fb1 100644 --- a/src/fpga/V2/filter.cpp +++ b/src/fpga/V2/filter.cpp @@ -15,7 +15,7 @@ limitations under the License. */ #include "fpga/V2/filter.h" #include #include -#include "fpga/V2/api.h" +#include "fpga/common/fpga_common.h" namespace paddle_mobile { namespace fpga { @@ -73,7 +73,7 @@ void convert_to_hwc(float **data_in, int num, int channel, int height, void align_filter(float **data_in, int num, int channel, int height, int width) { - int aligned_channel = calc_channel_parallelism(channel); + int aligned_channel = calc_aligned_channel(channel); int hw = height * width; int pixel_num = calc_aligned_total_pixel_num(num, channel, height, width); float *new_data = (float *)fpga_malloc(pixel_num * sizeof(float)); // NOLINT diff --git a/src/fpga/V2/fpga_common.cpp b/src/fpga/V2/fpga_common.cpp deleted file mode 100644 index 01bca30a9ccf79232e1f28bbf77b1c030632f5bc..0000000000000000000000000000000000000000 --- a/src/fpga/V2/fpga_common.cpp +++ /dev/null @@ -1,44 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#include -namespace paddle_mobile { -namespace fpga { - -int16_t fp32_2_fp16(float fp32_num) { - unsigned long tmp = *(unsigned long *)(&fp32_num); // NOLINT - auto t = (int16_t)(((tmp & 0x007fffff) >> 13) | ((tmp & 0x80000000) >> 16) | - (((tmp & 0x7f800000) >> 13) - (112 << 10))); - if (tmp & 0x1000) { - t++; // roundoff - } - return t; -} - -float fp16_2_fp32(int16_t fp16_num) { - if (0 == fp16_num) { - return 0; - } - int frac = (fp16_num & 0x3ff); - int exp = ((fp16_num & 0x7c00) >> 10) + 112; - int s = fp16_num & 0x8000; - int tmp = 0; - float fp32_num; - tmp = s << 16 | exp << 23 | frac << 13; - fp32_num = *(float *)&tmp; // NOLINT - return fp32_num; -} - -} // namespace fpga -} // namespace paddle_mobile diff --git a/src/fpga/V2/image.cpp b/src/fpga/V2/image.cpp index 26829bfba65f2375b27251070b33b2bbe57d069b..3d1ed95df2a805c8c64f9184e0a720f5449d6181 100644 --- a/src/fpga/V2/image.cpp +++ b/src/fpga/V2/image.cpp @@ -15,7 +15,7 @@ limitations under the License. */ #include "fpga/V2/image.h" #include #include -#include "fpga/V2/api.h" +#include "fpga/common/fpga_common.h" namespace paddle_mobile { namespace fpga { diff --git a/src/fpga/V2/driver/pe.cpp b/src/fpga/V2/pe.cpp similarity index 79% rename from src/fpga/V2/driver/pe.cpp rename to src/fpga/V2/pe.cpp index 2e806bfb37c131fad1c011c960bc79aa1b121186..35ef06de2d7b3adf5d8d71722319f66026635eaa 100644 --- a/src/fpga/V2/driver/pe.cpp +++ b/src/fpga/V2/pe.cpp @@ -12,11 +12,11 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include "fpga/V2/driver/pe.h" -#include "fpga/V2/config.h" -#include "fpga/V2/driver/driver.h" +#include "fpga/common/pe.h" #include "fpga/V2/filter.h" #include "fpga/V2/image.h" +#include "fpga/common/config.h" +#include "fpga/common/driver.h" namespace paddle_mobile { namespace fpga { @@ -166,54 +166,54 @@ int PerformBypass(const struct BypassArgs &args) { return 0; #endif - uint64_t ifm_src_paddr = driver::vaddr_to_paddr(args.image.address); - uint64_t ifm_dst_paddr = driver::vaddr_to_paddr(args.output.address); - uint64_t bp_enable; - int64_t length; - uint64_t pixels; - - // fp32->fp16 - if ((args.input_data_type) && (!args.output_data_type)) { - pixels = (args.image.channels) * (args.image.width) * (args.image.height); - length = pixels * sizeof(float); - bp_enable = 0x8800000000000000 + length; - } - // fp16->fp32 - else if ((!args.input_data_type) && (args.output_data_type)) { - pixels = filter::calc_aligned_channel((args.image.channels)) * - (args.image.width) * (args.image.height); - length = pixels * sizeof(short); - length = align_to_x((int)length, 64); // NOLINT - bp_enable = 0x8a00000000000000 + length; - } - // fp16->fp16 findmax - else if ((!args.input_data_type) && (!args.output_data_type)) { - pixels = (args.image.channels) * (args.image.width) * (args.image.height); - length = pixels * sizeof(short); - bp_enable = 0x8900000000000000 + length; - } else { - return -1; - } - - // start bypass - driver::reg_writeq(ifm_src_paddr, MUL8(27)); - driver::reg_writeq(ifm_dst_paddr, MUL8(28)); - driver::reg_writeq(0, MUL8(0)); - driver::reg_writeq(bp_enable, MUL8(0)); - // poll - int ret = -1; - ret = driver::fpga_regpoll(MUL8(48), BYPASS_DONE, 0xffffffff); - if (ret != -1) { - // clear "irq" - driver::reg_readq(MUL8(63)); - } - // get max value - if ((!args.input_data_type) && (!args.output_data_type)) { - float scale = Findfp16Max(); - args.output.scale_address[0] = 
(float)(1.0 / scale); // NOLINT - args.output.scale_address[1] = scale; - } - return ret; + // uint64_t ifm_src_paddr = driver::vaddr_to_paddr(args.image.address); + // uint64_t ifm_dst_paddr = driver::vaddr_to_paddr(args.output.address); + // uint64_t bp_enable; + // int64_t length; + // uint64_t pixels; + // + // // fp32->fp16 + // if ((args.input_data_type) && (!args.output_data_type)) { + // pixels = (args.image.channels) * (args.image.width) * + // (args.image.height); length = pixels * sizeof(float); bp_enable = + // 0x8800000000000000 + length; + // } + // // fp16->fp32 + // else if ((!args.input_data_type) && (args.output_data_type)) { + // pixels = filter::calc_aligned_channel((args.image.channels)) * + // (args.image.width) * (args.image.height); + // length = pixels * sizeof(short); + // length = align_to_x((int)length, 64); // NOLINT + // bp_enable = 0x8a00000000000000 + length; + // } + // // fp16->fp16 findmax + // else if ((!args.input_data_type) && (!args.output_data_type)) { + // pixels = (args.image.channels) * (args.image.width) * + // (args.image.height); length = pixels * sizeof(short); bp_enable = + // 0x8900000000000000 + length; + // } else { + // return -1; + // } + // + // // start bypass + // driver::reg_writeq(ifm_src_paddr, MUL8(27)); + // driver::reg_writeq(ifm_dst_paddr, MUL8(28)); + // driver::reg_writeq(0, MUL8(0)); + // driver::reg_writeq(bp_enable, MUL8(0)); + // // poll + // int ret = -1; + // ret = driver::fpga_regpoll(MUL8(48), BYPASS_DONE, 0xffffffff); + // if (ret != -1) { + // // clear "irq" + // driver::reg_readq(MUL8(63)); + // } + // // get max value + // if ((!args.input_data_type) && (!args.output_data_type)) { + // float scale = Findfp16Max(); + // args.output.scale_address[0] = (float)(1.0 / scale); // NOLINT + // args.output.scale_address[1] = scale; + // } + // return ret; } int ComputeFPGAConcat(const struct ConcatArgs &args) { diff --git a/src/fpga/V2/driver/bitmap.cpp b/src/fpga/common/bitmap.cpp similarity index 99% rename from src/fpga/V2/driver/bitmap.cpp rename to src/fpga/common/bitmap.cpp index c612faa6aed11b683ff81fffdf6c57a6fed9536d..9742a4559927b0520b32eeabc757f5a0f4e3392a 100644 --- a/src/fpga/V2/driver/bitmap.cpp +++ b/src/fpga/common/bitmap.cpp @@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include "fpga/V2/driver/bitmap.h" +#include "fpga/common/bitmap.h" namespace fpga_bitmap { void bitmap_set(uint64_t *map, unsigned int start, int len) { diff --git a/src/fpga/V2/driver/bitmap.h b/src/fpga/common/bitmap.h similarity index 100% rename from src/fpga/V2/driver/bitmap.h rename to src/fpga/common/bitmap.h diff --git a/src/fpga/V2/config.h b/src/fpga/common/config.h similarity index 100% rename from src/fpga/V2/config.h rename to src/fpga/common/config.h diff --git a/src/fpga/V2/driver/driver.cpp b/src/fpga/common/driver.cpp similarity index 96% rename from src/fpga/V2/driver/driver.cpp rename to src/fpga/common/driver.cpp index d7e71782676fd350f938847c03e9736ff0adb64a..8c59ac14fb11282b29a837152194d873bd65d87d 100644 --- a/src/fpga/V2/driver/driver.cpp +++ b/src/fpga/common/driver.cpp @@ -28,8 +28,8 @@ limitations under the License. 
*/ #include #include "common/enforce.h" -#include "fpga/V2/driver/bitmap.h" -#include "fpga/V2/driver/driver.h" +#include "fpga/common/bitmap.h" +#include "fpga/common/driver.h" namespace paddle_mobile { namespace fpga { @@ -353,7 +353,7 @@ void fpga_free_driver(void *ptr) { } } -static inline int do_ioctl(unsigned long req, const void *arg) { +static inline int do_ioctl(int64_t req, const void *arg) { return ioctl(g_fpgainfo.fd_mem, req, arg); } @@ -363,7 +363,7 @@ int fpga_flush_driver(void *address, size_t size) { p_addr = vaddr_to_paddr(address); - args.offset = (void *)(p_addr - FPGA_MEM_PHY_ADDR); + args.offset = (void *)(p_addr - FPGA_MEM_PHY_ADDR); // NOLINT args.size = size; return do_ioctl(IOCTL_MEMCACHE_FLUSH, &args); @@ -375,7 +375,7 @@ int fpga_invalidate_driver(void *address, size_t size) { p_addr = vaddr_to_paddr(address); - args.offset = (void *)(p_addr - FPGA_MEM_PHY_ADDR); + args.offset = (void *)(p_addr - FPGA_MEM_PHY_ADDR); // NOLINT args.size = size; return do_ioctl(IOCTL_MEMCACHE_INVAL, &args); @@ -389,7 +389,7 @@ void fpga_copy_driver(void *dest, const void *src, size_t num) { for (i = 0; i < num; i++) { // DLOG << "i:" << i << " val:" << *((int8_t *)src + i); // usleep(1); - *((int8_t *)dest + i) = *((int8_t *)src + i); + *((int8_t *)dest + i) = *((int8_t *)src + i); // NOLINT } return; diff --git a/src/fpga/V2/driver/driver.h b/src/fpga/common/driver.h similarity index 90% rename from src/fpga/V2/driver/driver.h rename to src/fpga/common/driver.h index 633e95ea8204ada2a330a6bb4fab4ce8fe23248b..2dad07ec5206a7ca64449aa38ebe0603d72b71e3 100644 --- a/src/fpga/V2/driver/driver.h +++ b/src/fpga/common/driver.h @@ -33,8 +33,6 @@ namespace driver { #define FPGA_MEM_PHY_ADDR 0x20000000 #define FPGA_MEM_SIZE 0x20000000 -#define CPU_FREQ 1000000000 - #define FPGA_PAGE_SIZE (16UL * 1024UL) // PE related macros @@ -53,7 +51,7 @@ struct MemoryCacheArgs { size_t size; }; -#define IOCTL_FPGA_MAGIC 'FPGA' +#define IOCTL_FPGA_MAGIC 'F' #define IOCTL_MEMCACHE_INVAL _IOW(IOCTL_FPGA_MAGIC, 12, struct MemoryCacheArgs) #define IOCTL_MEMCACHE_FLUSH _IOW(IOCTL_FPGA_MAGIC, 13, struct MemoryCacheArgs) @@ -105,17 +103,17 @@ extern struct FPGA_INFO g_fpgainfo; inline uint64_t reg_readq(uint32_t offset) { // DLOG << "offset : " << offset; - uint64_t value = *(volatile uint64_t *)((uint8_t *)g_fpgainfo.FpgaRegVirAddr + - offset); // NOLINT + uint64_t value = + *(volatile uint64_t *)((uint8_t *)g_fpgainfo.FpgaRegVirAddr + // NOLINT + offset); // NOLINT return value; } inline void reg_writeq(uint64_t value, uint32_t offset) { // DLOG << "offset : " << offset << ", value : " << value; - *(volatile uint64_t *)((uint8_t *)g_fpgainfo.FpgaRegVirAddr + - offset) = // NOLINT - value; + *(volatile uint64_t *)((uint8_t *)g_fpgainfo.FpgaRegVirAddr + // NOLINT + offset) = value; } int open_device_driver(); diff --git a/src/fpga/common/fpga_common.cpp b/src/fpga/common/fpga_common.cpp new file mode 100644 index 0000000000000000000000000000000000000000..9c7ae838fa4216d121cf38a11ef4897043b9a0dd --- /dev/null +++ b/src/fpga/common/fpga_common.cpp @@ -0,0 +1,117 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "fpga/common/fpga_common.h" +#include +#include +#include "fpga/common/config.h" +#include "fpga/common/driver.h" + +namespace paddle_mobile { +namespace fpga { + +int16_t fp32_2_fp16(float fp32_num) { + unsigned long tmp = *(unsigned long *)(&fp32_num); // NOLINT + auto t = (int16_t)(((tmp & 0x007fffff) >> 13) | ((tmp & 0x80000000) >> 16) | + (((tmp & 0x7f800000) >> 13) - (112 << 10))); + if (tmp & 0x1000) { + t++; // roundoff + } + return t; +} + +float fp16_2_fp32(int16_t fp16_num) { + if (0 == fp16_num) { + return 0; + } + int frac = (fp16_num & 0x3ff); + int exp = ((fp16_num & 0x7c00) >> 10) + 112; + int s = fp16_num & 0x8000; + int tmp = 0; + float fp32_num; + tmp = s << 16 | exp << 23 | frac << 13; + fp32_num = *(float *)&tmp; // NOLINT + return fp32_num; +} + +static std::map memory_map; + +int open_device() { + int ret = driver::open_device_driver(); + return ret; +} + +int close_device() { + int ret = driver::close_device_driver(); + return ret; +} + +void *fpga_malloc(size_t size) { + static uint64_t counter = 0; +#ifdef PADDLE_MOBILE_ZU5 + auto ptr = driver::fpga_malloc_driver(size); +#else + auto ptr = malloc(size); +#endif + counter += size; + memory_map.insert(std::make_pair(ptr, size)); + // DLOG << "Address: " << ptr << ", " << size << " bytes allocated. Total " + // << counter << " bytes"; + return ptr; +} + +void fpga_free(void *ptr) { + static uint64_t counter = 0; + size_t size = 0; + auto iter = memory_map.find(ptr); // std::map::iterator + if (iter != memory_map.end()) { + size = iter->second; + memory_map.erase(iter); +#ifdef PADDLE_MOBILE_ZU5 + driver::fpga_free_driver(ptr); +#else + free(ptr); +#endif + counter += size; + // DLOG << "Address: " << ptr << ", " << size << " bytes freed. Total " + // << counter << " bytes"; + } else { + DLOG << "Invalid pointer"; + } +} +void fpga_copy(void *dest, const void *src, size_t num) { +#ifdef PADDLE_MOBILE_ZU5 + driver::fpga_copy_driver(dest, src, num); +#else + memcpy(dest, src, num); +#endif +} + +int fpga_flush(void *address, size_t size) { +#ifdef PADDLE_MOBILE_ZU5 + return driver::fpga_flush_driver(address, size); +#else + return 0; +#endif +} +int fpga_invalidate(void *address, size_t size) { +#ifdef PADDLE_MOBILE_ZU5 + return driver::fpga_invalidate_driver(address, size); +#else + return 0; +#endif +} + +} // namespace fpga +} // namespace paddle_mobile diff --git a/src/fpga/V2/fpga_common.h b/src/fpga/common/fpga_common.h similarity index 84% rename from src/fpga/V2/fpga_common.h rename to src/fpga/common/fpga_common.h index 1862d843503ee8faf58caf038202e198ca079905..b3f619f2f24aba47d99f7f427c4b67af8c0d430d 100644 --- a/src/fpga/V2/fpga_common.h +++ b/src/fpga/common/fpga_common.h @@ -14,6 +14,7 @@ limitations under the License. 
*/ #pragma once +#include #include namespace paddle_mobile { @@ -73,8 +74,19 @@ struct ConcatArgs { void* image_out; float* scale_out; uint32_t* channel_num; - uint32_t* aligned_channel_num; - uint32_t out_channel; + // uint32_t* aligned_channel_num; + // uint32_t out_channel; + uint32_t height; + uint32_t width; +}; + +struct SplitArgs { + uint32_t image_num; + int16_t* image_in; + float* scale_in; + void** images_out; + float** scales_out; + uint32_t* out_channel_nums; uint32_t height; uint32_t width; }; @@ -117,9 +129,19 @@ struct BypassArgs { struct DeconvArgs { struct ConvArgs conv_arg; }; + static inline int align_to_x(int num, int x) { return (num + x - 1) / x * x; } + int16_t fp32_2_fp16(float fp32_num); float fp16_2_fp32(int16_t fp16_num); +int open_device(); +int close_device(); +void* fpga_malloc(size_t size); +void fpga_free(void* ptr); +void fpga_copy(void* dest, const void* src, size_t num); +int fpga_flush(void* address, size_t size); +int fpga_invalidate(void* address, size_t size); + } // namespace fpga } // namespace paddle_mobile diff --git a/src/fpga/V2/driver/pe.h b/src/fpga/common/pe.h similarity index 91% rename from src/fpga/V2/driver/pe.h rename to src/fpga/common/pe.h index 4903bf4c33f6b5d5899c56eeaada8c7a21d1a875..ae773f25b4171df3e552aaa07bb05af8564d872a 100644 --- a/src/fpga/V2/driver/pe.h +++ b/src/fpga/common/pe.h @@ -13,7 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. */ #pragma once -#include "fpga/V2/fpga_common.h" +#include "fpga/common/fpga_common.h" namespace paddle_mobile { namespace fpga { @@ -25,6 +25,7 @@ int ComputeFpgaEWAdd(const struct EWAddArgs& args); int ComputeFpgaConv(const struct SplitConvArgs& args); int ComputeFPGAConcat(const struct ConcatArgs& args); +int ComputeFPGASplit(const struct SplitArgs& args); } // namespace fpga } // namespace paddle_mobile diff --git a/src/framework/cl/cl_helper.h b/src/framework/cl/cl_helper.h index bea91ee24ceb5e9011708bd277629a07beb4b8ef..8a79d9bad766ba3ef4d3fc1eeae7813ecd60887b 100644 --- a/src/framework/cl/cl_helper.h +++ b/src/framework/cl/cl_helper.h @@ -61,9 +61,16 @@ class CLHelper { auto work_size_2 = n * h; return {work_size_0, work_size_1, work_size_2}; } else if (image_dim.size() == 2) { + auto h = image_dim[0]; + auto w = image_dim[1]; return {1, image.ImageWidth(), image.ImageHeight()}; } else if (image_dim.size() == 1) { return {1, image.ImageWidth(), 1}; + } else if (image_dim.size() == 3) { + int c = image_dim[0]; + int h = image_dim[1]; + int w = image_dim[2]; + return {(c + 3) / 4, w, h}; } PADDLE_MOBILE_THROW_EXCEPTION(" not support this dim, need imp "); } diff --git a/src/framework/cl/cl_image.cpp b/src/framework/cl/cl_image.cpp index f32de0a61461d9a9b28d4a0cf5e13ecc9d564cf5..d6cc52d69cef1249fe9ed62a344d069af0305bc0 100644 --- a/src/framework/cl/cl_image.cpp +++ b/src/framework/cl/cl_image.cpp @@ -13,18 +13,98 @@ See the License for the specific language governing permissions and limitations under the License. 
*/ #include "framework/cl/cl_image.h" +#include "framework/cl/cl_tensor.h" namespace paddle_mobile { namespace framework { -void CLImageToTensor(CLImage *cl_image, Tensor *tensor, - cl_command_queue commandQueue) { - // TODO(yangfei): need imp +void CLImageToTensor(CLImage *cl_image, Tensor *tensor, cl_context context, + cl_command_queue commandQueue, cl_kernel kernel) { + tensor->mutable_data(); + const auto &dim = cl_image->dims(); + size_t new_dims[] = {1, 1, 1, 1}; + for (int j = 0; j < dim.size(); ++j) { + new_dims[4 - dim.size() + j] = dim[j]; + } + size_t C, in_height, in_width; + + C = new_dims[1]; + in_height = new_dims[2]; + in_width = new_dims[3]; + + CLTensor out_cl_tensor(context, commandQueue); + out_cl_tensor.Resize(tensor->dims()); + cl_mem outBuffer = out_cl_tensor.mutable_data(); + + auto input_image = cl_image->GetCLImage(); + + clSetKernelArg(kernel, 0, sizeof(int), &in_height); + clSetKernelArg(kernel, 1, sizeof(int), &in_width); + clSetKernelArg(kernel, 2, sizeof(cl_mem), &input_image); + clSetKernelArg(kernel, 3, sizeof(cl_mem), &outBuffer); + int size_ch = in_height * in_width; + int size_block = size_ch * 4; + int size_batch = size_ch * C; + clSetKernelArg(kernel, 4, sizeof(int), &size_ch); + clSetKernelArg(kernel, 5, sizeof(int), &size_block); + clSetKernelArg(kernel, 6, sizeof(int), &size_batch); + clSetKernelArg(kernel, 7, sizeof(int), &C); + size_t global_work_size[3] = {(new_dims[1] + 3) / 4, new_dims[3], + new_dims[0] * new_dims[2]}; + clEnqueueNDRangeKernel(commandQueue, kernel, 3, NULL, global_work_size, NULL, + 0, NULL, NULL); + memcpy(tensor->data(), out_cl_tensor.Data(), + tensor->memory_size()); } -void TensorToCLImage(const Tensor *tensor, CLImage *cl_image, - cl_command_queue commandQueue) { - // TODO(yangfei): need imp +void TensorToCLImage(Tensor *tensor, CLImage *cl_image, cl_context context, + cl_command_queue commandQueue, cl_kernel kernel) { + const auto &dim = cl_image->dims(); + size_t new_dims[] = {1, 1, 1, 1}; + for (int j = 0; j < dim.size(); ++j) { + new_dims[4 - dim.size() + j] = dim[j]; + } + cl_int status; + auto output = cl_image; + const Tensor *input = tensor; + const float *input_data = input->data(); + auto output_image = output->GetCLImage(); + const int out_C = new_dims[1]; + const int out_H = new_dims[2]; + const int out_W = new_dims[3]; + const int Stride2 = out_C * out_H * out_W; + const int Stride1 = out_H * out_W; + const int Stride0 = out_W; + DLOG << out_C; + DLOG << out_H; + DLOG << out_W; + CLTensor input_cl_tensor(context, commandQueue); + input_cl_tensor.Resize(input->dims()); + cl_mem inputBuffer = input_cl_tensor.mutable_with_data(input_data); + + status = clSetKernelArg(kernel, 0, sizeof(cl_mem), &inputBuffer); + CL_CHECK_ERRORS(status); + status = clSetKernelArg(kernel, 1, sizeof(cl_mem), &output_image); + CL_CHECK_ERRORS(status); + status = clSetKernelArg(kernel, 2, sizeof(cl_int), &out_H); + CL_CHECK_ERRORS(status); + status = clSetKernelArg(kernel, 3, sizeof(cl_int), &out_W); + CL_CHECK_ERRORS(status); + status = clSetKernelArg(kernel, 4, sizeof(cl_int), &out_C); + CL_CHECK_ERRORS(status); + status = clSetKernelArg(kernel, 5, sizeof(cl_int), &Stride0); + CL_CHECK_ERRORS(status); + status = clSetKernelArg(kernel, 6, sizeof(cl_int), &Stride1); + CL_CHECK_ERRORS(status); + status = clSetKernelArg(kernel, 7, sizeof(cl_int), &Stride2); + CL_CHECK_ERRORS(status); + + size_t global_work_size[3] = {(new_dims[1] + 3) / 4, new_dims[3], + new_dims[0] * new_dims[2]}; + status = clEnqueueNDRangeKernel(commandQueue, kernel, 
3, NULL, + global_work_size, NULL, 0, NULL, NULL); + + CL_CHECK_ERRORS(status); } #ifdef PADDLE_MOBILE_DEBUG diff --git a/src/framework/cl/cl_image.h b/src/framework/cl/cl_image.h index 35f60d3b773937d381447b23b64985ce543fddee..f94eba187f2c5610d7a20098e95015244b420ce2 100644 --- a/src/framework/cl/cl_image.h +++ b/src/framework/cl/cl_image.h @@ -120,17 +120,19 @@ class CLImage { PADDLE_MOBILE_ENFORCE(tensor_data_ == nullptr, " empty image tensor data shouldn't have value"); - CLImageConverterFolder *folder_converter = new CLImageConverterFolder(); + // CLImageConverterFolder *folder_converter = new + // CLImageConverterFolder(); + CLImageConverterNormal *normal_converter = new CLImageConverterNormal(); DLOG << " to get image dims "; - image_dims_ = folder_converter->InitImageDimInfoWith(dim); + image_dims_ = normal_converter->InitImageDimInfoWith(dim); DLOG << " end get image dims " << image_dims_; InitCLImage(context, image_dims_[0], image_dims_[1], nullptr); tensor_dims_ = dim; command_queue_ = command_queue; - image_converter_ = folder_converter; + image_converter_ = normal_converter; cl_event_ = CLEngine::Instance()->CreateEvent(context); initialized_ = true; DLOG << " end init cl image"; @@ -220,11 +222,11 @@ class CLImage { CLImageConverterBase *image_converter_ = nullptr; }; -void TensorToCLImage(Tensor *tensor, CLImage *image, - cl_command_queue commandQueue); +void TensorToCLImage(Tensor *tensor, CLImage *image, cl_context context, + cl_command_queue commandQueue, cl_kernel kernel); -void CLImageToTensor(CLImage *image, Tensor *tensor, - cl_command_queue commandQueue); +void CLImageToTensor(CLImage *image, Tensor *tensor, cl_context context, + cl_command_queue commandQueue, cl_kernel kernel); #ifdef PADDLE_MOBILE_DEBUG Print &operator<<(Print &printer, const CLImage &image); diff --git a/src/framework/cl/cl_image_converter.cpp b/src/framework/cl/cl_image_converter.cpp index 13094a8d05ac6f7f8d2451a3498da058b37ee98b..1e63bd05671a5c47ee324554ae84a3f4f71e6020 100644 --- a/src/framework/cl/cl_image_converter.cpp +++ b/src/framework/cl/cl_image_converter.cpp @@ -389,5 +389,42 @@ void CLImageConverterDWBlock::ImageToNCHW(half_t *image, float *tensor, } } +const DDim &CLImageConverterNormal::InitImageDimInfoWith( + const DDim &tensor_dim) { + size_t new_dims[] = {1, 1, 1, 1}; + for (int j = 0; j < tensor_dim.size(); ++j) { + new_dims[4 - tensor_dim.size() + j] = tensor_dim[j]; + } + size_t N, C, H, W; + N = new_dims[0]; + C = new_dims[1]; + H = new_dims[2]; + W = new_dims[3]; + size_t width = W * ((C + 3) / 4); + size_t height = H * N; + + width_of_one_block_ = W; + height_of_one_block_ = H; + c_block_ = width / W; + + return make_ddim({width, height}); +} + +void CLImageConverterNormal::NCHWToImage(float *tensor, half_t *image, + const DDim &tensor_dim) { + PADDLE_MOBILE_ENFORCE(tensor_dim.size() <= 4 && tensor_dim.size() > 0, + "tensor dim is not support "); + + CLImageConverterDefault default_converter; + default_converter.NCHWToImage(tensor, image, tensor_dim); +} + +void CLImageConverterNormal::ImageToNCHW(half_t *image, float *tensor, + const DDim &image_dim, + const DDim &tensor_dim) { + CLImageConverterDefault default_converter; + default_converter.ImageToNCHW(image, tensor, image_dim, tensor_dim); +} + } // namespace framework } // namespace paddle_mobile diff --git a/src/framework/cl/cl_image_converter.h b/src/framework/cl/cl_image_converter.h index 02887b0cd468a45630122bb3f236c0775ac1eaa1..ad5994f852f3b47c6a645b16cfd253a6529e117e 100644 --- 
a/src/framework/cl/cl_image_converter.h +++ b/src/framework/cl/cl_image_converter.h @@ -63,6 +63,31 @@ class CLImageConverterFolder : public CLImageConverterBase { int height_of_one_block_; }; +class CLImageConverterNormal : public CLImageConverterBase { + public: + const DDim &InitImageDimInfoWith(const DDim &tensor_dim); + void NCHWToImage(float *tensor, half_t *image, const DDim &tensor_dim); + void ImageToNCHW(half_t *image, float *tensor, const DDim &image_dim, + const DDim &tensor_dim); + + /* + * width of original tensor + * */ + inline size_t WidthOfOneBlock() const { return width_of_one_block_; } + + /* + * height of original tensor + * */ + inline size_t HeightOfOneBlock() const { return height_of_one_block_; } + + int GetCBlock() const { return c_block_; } + + private: + int c_block_; + int width_of_one_block_; + int height_of_one_block_; +}; + class CLImageConverterNWBlock : public CLImageConverterBase { const DDim &InitImageDimInfoWith(const DDim &tensor_dim); void NCHWToImage(float *tensor, half_t *image, const DDim &tensor_dim); diff --git a/src/io/paddle_mobile.cpp b/src/io/paddle_mobile.cpp index 6a773da00ff6541d55a6a9d04ca470fed5de81a1..4b50f15a868e3bdbb8434af0cc0d49a6cb54c6a5 100644 --- a/src/io/paddle_mobile.cpp +++ b/src/io/paddle_mobile.cpp @@ -143,10 +143,12 @@ double PaddleMobile::GetPredictTime() { int t1 = 1; int t2 = 1; for (int i = 0; i < m * k; ++i) { - a[i] = t1 + rand() % t2; + unsigned int seed = 100; + a[i] = t1 + rand_r(&seed) % t2; } for (int i = 0; i < k * n; ++i) { - b[i] = t1 + rand() % t2; + unsigned int seed = 200; + b[i] = t1 + rand_r(&seed) % t2; } paddle_mobile::operators::math::Gemm gemm; auto time1 = paddle_mobile::time(); @@ -215,13 +217,13 @@ double PaddleMobile::GetPredictTime() { cl_int status; cl_uint nPlatform; clGetPlatformIDs(0, NULL, &nPlatform); - cl_platform_id *listPlatform = - (cl_platform_id *)malloc(nPlatform * sizeof(cl_platform_id)); + cl_platform_id *listPlatform = reinterpret_cast( + malloc(nPlatform * sizeof(cl_platform_id))); clGetPlatformIDs(nPlatform, listPlatform, NULL); cl_uint nDevice = 0; clGetDeviceIDs(listPlatform[0], CL_DEVICE_TYPE_GPU, 0, NULL, &nDevice); cl_device_id *listDevice = - (cl_device_id *)malloc(nDevice * sizeof(cl_device_id)); + reinterpret_cast(malloc(nDevice * sizeof(cl_device_id))); clGetDeviceIDs(listPlatform[0], CL_DEVICE_TYPE_GPU, nDevice, listDevice, NULL); cl_context context = @@ -277,41 +279,66 @@ double PaddleMobile::GetPredictTime() { clBuildProgram(program, 0, 0, path1.c_str(), NULL, NULL); cl_kernel kernel = clCreateKernel(program, "feed", &status); + int out_H = 224; + int out_W = 224; + int out_C = 3; + int Stride2 = out_C * out_H * out_W; + int Stride1 = out_H * out_W; + int Stride0 = out_W; status = clSetKernelArg(kernel, 0, sizeof(cl_mem), &inputBuffer); CL_CHECK_ERRORS(status); status = clSetKernelArg(kernel, 1, sizeof(cl_mem), &cl_input_image); CL_CHECK_ERRORS(status); - status = clSetKernelArg(kernel, 2, sizeof(cl_int), &input_w); + status = clSetKernelArg(kernel, 2, sizeof(cl_int), &out_H); CL_CHECK_ERRORS(status); - status = clSetKernelArg(kernel, 3, sizeof(cl_int), &input_h); + status = clSetKernelArg(kernel, 3, sizeof(cl_int), &out_W); CL_CHECK_ERRORS(status); - status = clSetKernelArg(kernel, 4, sizeof(cl_int), &c); + status = clSetKernelArg(kernel, 4, sizeof(cl_int), &out_C); + CL_CHECK_ERRORS(status); + status = clSetKernelArg(kernel, 5, sizeof(cl_int), &Stride0); + CL_CHECK_ERRORS(status); + status = clSetKernelArg(kernel, 6, sizeof(cl_int), &Stride1); + 
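+  // GetPredictTime now drives the rewritten "feed" kernel, which takes
+  // (out_H, out_W, out_C, Stride0, Stride1, Stride2) instead of (w, h, c).
+  // Stride0 = W, Stride1 = H * W and Stride2 = C * H * W index the flat
+  // NCHW input buffer; with C = 3 there is a single channel block
+  // ((3 + 3) / 4 = 1), which is why global_work_size below is {1, 224, 224}.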
CL_CHECK_ERRORS(status); + status = clSetKernelArg(kernel, 7, sizeof(cl_int), &Stride2); CL_CHECK_ERRORS(status); - size_t global_work_size[2] = {input_w, input_h}; + size_t global_work_size[3] = {1, 224, 224}; // cl_event out_event = param.Out()->GetClEvent(); - status = clEnqueueNDRangeKernel(queue, kernel, 2, NULL, global_work_size, + status = clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, NULL, 0, NULL, NULL); CL_CHECK_ERRORS(status); + out_H = 3; + out_W = 3; + out_C = 3; + Stride2 = out_C * out_H * out_W; + Stride1 = out_H * out_W; + Stride0 = out_W; + status = clSetKernelArg(kernel, 0, sizeof(cl_mem), &filterBuffer); CL_CHECK_ERRORS(status); status = clSetKernelArg(kernel, 1, sizeof(cl_mem), &cl_filter_image); CL_CHECK_ERRORS(status); - status = clSetKernelArg(kernel, 2, sizeof(cl_int), &filter_w); + status = clSetKernelArg(kernel, 2, sizeof(cl_int), &out_H); + CL_CHECK_ERRORS(status); + status = clSetKernelArg(kernel, 3, sizeof(cl_int), &out_W); CL_CHECK_ERRORS(status); - status = clSetKernelArg(kernel, 3, sizeof(cl_int), &filter_h); + status = clSetKernelArg(kernel, 4, sizeof(cl_int), &out_C); CL_CHECK_ERRORS(status); - status = clSetKernelArg(kernel, 4, sizeof(cl_int), &c); + status = clSetKernelArg(kernel, 5, sizeof(cl_int), &Stride0); + CL_CHECK_ERRORS(status); + status = clSetKernelArg(kernel, 6, sizeof(cl_int), &Stride1); + CL_CHECK_ERRORS(status); + status = clSetKernelArg(kernel, 7, sizeof(cl_int), &Stride2); CL_CHECK_ERRORS(status); - size_t global_work_size1[2] = {filter_w, filter_h}; + size_t global_work_size1[3] = {1, 3, 96}; // cl_event out_event = param.Out()->GetClEvent(); - status = clEnqueueNDRangeKernel(queue, kernel, 2, NULL, global_work_size1, + status = clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size1, NULL, 0, NULL, NULL); CL_CHECK_ERRORS(status); @@ -378,13 +405,16 @@ double PaddleMobile::GetPredictTime() { auto time2 = paddle_mobile::time(); paddle_mobile::memory::Free(input); paddle_mobile::memory::Free(filter); - return paddle_mobile::time_diff(time1, time2); + if (status == CL_SUCCESS) { + return paddle_mobile::time_diff(time1, time2); + } else { + return -1; + } } template int PaddleMobile::readText( const char *kernelPath, - char **pcode) // 读取文本文件放入 pcode,返回字符串长度 -{ + char **pcode) { // 读取文本文件放入 pcode,返回字符串长度 FILE *fp; int size; // printf(" File: %s\n", kernelPath); @@ -402,7 +432,7 @@ int PaddleMobile::readText( return -1; } rewind(fp); - if ((*pcode = (char *)malloc(size + 1)) == NULL) { + if ((*pcode = reinterpret_cast(malloc(size + 1))) == NULL) { printf(" Allocate space failed\n"); return -1; } diff --git a/src/operators/kernel/cl/box_coder_kernel.cpp b/src/operators/kernel/cl/box_coder_kernel.cpp index 582f6131bf60f02fb67f0479622123a28e89f713..b98435f9b09864d8ced90955c6fd3167c770bf31 100644 --- a/src/operators/kernel/cl/box_coder_kernel.cpp +++ b/src/operators/kernel/cl/box_coder_kernel.cpp @@ -20,13 +20,57 @@ namespace paddle_mobile { namespace operators { template <> -bool BoxCoderKernel::Init(BoxCoderParam *param) { +bool BoxCoderKernel::Init(BoxCoderParam* param) { + if (param->CodeType() == "decode_center_size") { + this->cl_helper_.AddKernel("box_decoder", "box_coder_kernel.cl"); + } return true; } template <> void BoxCoderKernel::Compute( - const BoxCoderParam ¶m) {} + const BoxCoderParam& param) { + auto kernel = this->cl_helper_.KernelAt(0); + auto default_work_size = this->cl_helper_.DefaultWorkSize(*param.OutputBox()); + const auto* input_priorbox = param.InputPriorBox(); + const auto* 
input_priorboxvar = param.InputPriorBoxVar(); + const auto* input_targetbox = param.InputTargetBox(); + const auto& code_type = param.CodeType(); + if (code_type == "decode_center_size") { + auto prior_box_image = input_priorbox->GetCLImage(); + auto prior_box_var_image = input_priorboxvar->GetCLImage(); + auto target_box_image = input_targetbox->GetCLImage(); + auto output_image = param.OutputBox()->GetCLImage(); + auto& outputDim = param.OutputBox()->dims(); + int new_dims[4] = {1, 1, 1, 1}; + for (int i = 0; i < outputDim.size(); i++) { + new_dims[4 - outputDim.size() + i] = outputDim[i]; + } + int out_C = new_dims[1]; + int out_H = new_dims[2]; + DLOG << "out_C=" << out_C; + DLOG << "out_H=" << out_H; + DLOG << "default_work_size=" << default_work_size; + cl_int status; + status = clSetKernelArg(kernel, 0, sizeof(cl_mem), &prior_box_image); + CL_CHECK_ERRORS(status); + status = clSetKernelArg(kernel, 1, sizeof(cl_mem), &prior_box_var_image); + CL_CHECK_ERRORS(status); + status = clSetKernelArg(kernel, 2, sizeof(cl_mem), &target_box_image); + CL_CHECK_ERRORS(status); + status = clSetKernelArg(kernel, 3, sizeof(cl_mem), &output_image); + CL_CHECK_ERRORS(status); + status = clSetKernelArg(kernel, 4, sizeof(int), &out_C); + CL_CHECK_ERRORS(status); + status = clSetKernelArg(kernel, 5, sizeof(int), &out_H); + CL_CHECK_ERRORS(status); + size_t global_work_size[2] = {default_work_size[0], default_work_size[2]}; + status = + clEnqueueNDRangeKernel(this->cl_helper_.CLCommandQueue(), kernel, 2, + NULL, global_work_size, NULL, 0, NULL, NULL); + CL_CHECK_ERRORS(status); + } +} } // namespace operators } // namespace paddle_mobile diff --git a/src/operators/kernel/cl/cl_kernel/box_coder_kernel.cl b/src/operators/kernel/cl/cl_kernel/box_coder_kernel.cl new file mode 100644 index 0000000000000000000000000000000000000000..60000c994ecbe421d1c951f14077d764d3665d4d --- /dev/null +++ b/src/operators/kernel/cl/cl_kernel/box_coder_kernel.cl @@ -0,0 +1,147 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
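Several of the GPU_CL kernels in this patch (CLImageToTensor, the box decoder above, fetch) share the same rank-padding idiom: a 1-D to 4-D shape is right-aligned into {N, C, H, W} with leading ones before out_C, out_H and out_W are read off. A self-contained sketch of that idiom, with a hypothetical helper name chosen for illustration:

```
#include <array>
#include <cstdint>
#include <iostream>
#include <vector>

// Right-align the shape into {N, C, H, W}, filling leading positions with 1.
// Assumes dims.size() <= 4, like the kernels in this patch.
std::array<int64_t, 4> PadToNCHW(const std::vector<int64_t> &dims) {
  std::array<int64_t, 4> nchw = {1, 1, 1, 1};
  for (size_t j = 0; j < dims.size(); ++j) {
    nchw[4 - dims.size() + j] = dims[j];
  }
  return nchw;
}

int main() {
  auto d = PadToNCHW({10, 5, 5});  // a C x H x W tensor
  std::cout << d[0] << ' ' << d[1] << ' ' << d[2] << ' ' << d[3] << '\n';
  // prints "1 10 5 5", i.e. out_C = 10, out_H = 5, out_W = 5
}
```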
*/ + +#pragma OPENCL EXTENSION cl_khr_fp16 : enable + +__kernel void box_decoder(__read_only image2d_t prior_box_image, + __read_only image2d_t prior_box_var_image, + __read_only image2d_t target_box_image, + __write_only image2d_t output_image, + __private const int out_C, + __private const int out_H + ){ + const int out_c = get_global_id(0); + const int out_nh = get_global_id(1); + const int out_h = out_nh%out_H; + const int out_n = 1; + + const int prior_box_n = 1; + const int prior_box_c = 0; + const int prior_box_h = out_h; + + + const int prior_box_var_n = 1; + const int prior_box_var_c = 0; + const int prior_box_var_h = out_h; + + const int target_box_n = 1; + const int target_box_c = out_c; + const int target_box_h = out_h; + + const sampler_t sampler = CLK_NORMALIZED_COORDS_TRUE | + CLK_ADDRESS_CLAMP | + CLK_FILTER_NEAREST; + int2 prior_box_pos; + int2 prior_box_var_pos; + int2 target_box_pos; + int2 output_pos; + + prior_box_pos.x = prior_box_c * 4; + prior_box_pos.y = prior_box_n * prior_box_h; + + prior_box_var_pos.x = prior_box_var_c * 4; + prior_box_var_pos.y = prior_box_var_n * prior_box_var_h; + + target_box_pos.x = target_box_c * 4; + target_box_pos.y = target_box_n * target_box_h; + + output_pos.x = out_c * 4; + output_pos.y = out_n * out_h; + + half4 prior_box_input[4]; + half4 prior_box_var_input[4]; + half4 target_box_input[4]; + + prior_box_input[0] = read_imageh(prior_box_image, sampler,(int2)(prior_box_pos.x + 0,prior_box_pos.y)); + prior_box_input[1] = read_imageh(prior_box_image, sampler,(int2)(prior_box_pos.x + 1,prior_box_pos.y)); + prior_box_input[2] = read_imageh(prior_box_image, sampler,(int2)(prior_box_pos.x + 2,prior_box_pos.y)); + prior_box_input[3] = read_imageh(prior_box_image, sampler,(int2)(prior_box_pos.x + 3,prior_box_pos.y)); + + prior_box_var_input[0] = read_imageh(prior_box_var_image, sampler,(int2)(prior_box_var_pos.x + 0,prior_box_var_pos.y)); + prior_box_var_input[1] = read_imageh(prior_box_var_image, sampler,(int2)(prior_box_var_pos.x + 1,prior_box_var_pos.y)); + prior_box_var_input[2] = read_imageh(prior_box_var_image, sampler,(int2)(prior_box_var_pos.x + 2,prior_box_var_pos.y)); + prior_box_var_input[3] = read_imageh(prior_box_var_image, sampler,(int2)(prior_box_var_pos.x + 3,prior_box_var_pos.y)); + + + + target_box_input[0] = read_imageh(target_box_image, sampler,(int2)(target_box_pos.x + 0,target_box_pos.y)); + target_box_input[1] = read_imageh(target_box_image, sampler,(int2)(target_box_pos.x + 1,target_box_pos.y)); + target_box_input[2] = read_imageh(target_box_image, sampler,(int2)(target_box_pos.x + 2,target_box_pos.y)); + target_box_input[3] = read_imageh(target_box_image, sampler,(int2)(target_box_pos.x + 3,target_box_pos.y)); + + half prior_box_width = prior_box_input[2].x - prior_box_input[0].x; + half prior_box_height = prior_box_input[3].x - prior_box_input[1].x; + half prior_box_center_x = (prior_box_input[2].x + prior_box_input[0].x)/(half)2; + half prior_box_center_y = (prior_box_input[3].x + prior_box_input[1].x)/(half)2; + + half4 target_box_center_x; + half4 target_box_center_y; + half4 target_box_width; + half4 target_box_height; + half4 output[4]; + + output[0] = 0.0f; + output[1] = 0.0f; + output[2] = 0.0f; + output[3] = 0.0f; + + target_box_center_x.x = prior_box_var_input[0].x * target_box_input[0].x * prior_box_width + prior_box_center_x; + target_box_center_y.x = prior_box_var_input[1].x * target_box_input[1].x * prior_box_height + prior_box_center_y; + target_box_width.x = exp(prior_box_var_input[2].x * 
target_box_input[2].x) * prior_box_width; + target_box_height.x = exp(prior_box_var_input[3].x * target_box_input[3].x) * prior_box_height; + + output[0].x = target_box_center_x.x - target_box_width.x/(half)2; + output[1].x = target_box_center_y.x - target_box_height.x/(half)2; + output[2].x = target_box_center_x.x + target_box_width.x/(half)2; + output[3].x = target_box_center_y.x + target_box_height.x/(half)2; + + if(out_C - out_c * 4 >= 2){ + target_box_center_x.y = prior_box_var_input[0].x * target_box_input[0].y * prior_box_width + prior_box_center_x; + target_box_center_y.y = prior_box_var_input[1].x * target_box_input[1].y * prior_box_height + prior_box_center_y; + target_box_width.y = exp(prior_box_var_input[2].x * target_box_input[2].y) * prior_box_width; + target_box_height.y = exp(prior_box_var_input[3].x * target_box_input[3].y) * prior_box_height; + output[0].y = target_box_center_x.y - target_box_width.y/(half)2; + output[1].y = target_box_center_y.y - target_box_height.y/(half)2; + output[2].y = target_box_center_x.y + target_box_width.y/(half)2; + output[3].y = target_box_center_y.y + target_box_height.y/(half)2; + + } + if(out_C - out_c * 4 >= 3){ + target_box_center_x.z = prior_box_var_input[0].x * target_box_input[0].z * prior_box_width + prior_box_center_x; + target_box_center_y.z = prior_box_var_input[1].x * target_box_input[1].z * prior_box_height + prior_box_center_y; + target_box_width.z = exp(prior_box_var_input[2].x * target_box_input[2].z) * prior_box_width; + target_box_height.z = exp(prior_box_var_input[3].x * target_box_input[3].z) * prior_box_height; + output[0].z = target_box_center_x.z - target_box_width.z/(half)2; + output[1].z = target_box_center_y.z - target_box_height.z/(half)2; + output[2].z = target_box_center_x.z + target_box_width.z/(half)2; + output[3].z = target_box_center_y.z + target_box_height.z/(half)2; + } + if(out_C - out_c * 4 >= 4){ + target_box_center_x.w = prior_box_var_input[0].x * target_box_input[0].w * prior_box_width + prior_box_center_x; + target_box_center_y.w = prior_box_var_input[1].x * target_box_input[1].w * prior_box_height + prior_box_center_y; + target_box_width.w = exp(prior_box_var_input[2].x * target_box_input[2].w) * prior_box_width; + target_box_height.w = exp(prior_box_var_input[3].x * target_box_input[3].w) * prior_box_height; + output[0].w = target_box_center_x.w - target_box_width.w/(half)2; + output[1].w = target_box_center_y.w - target_box_height.w/(half)2; + output[2].w = target_box_center_x.w + target_box_width.w/(half)2; + output[3].w = target_box_center_y.w + target_box_height.w/(half)2; + } + + + write_imageh(output_image, (int2)(output_pos.x + 0, output_pos.y), output[0]); + write_imageh(output_image, (int2)(output_pos.x + 1, output_pos.y), output[1]); + write_imageh(output_image, (int2)(output_pos.x + 2, output_pos.y), output[2]); + write_imageh(output_image, (int2)(output_pos.x + 3, output_pos.y), output[3]); + +} \ No newline at end of file diff --git a/src/operators/kernel/cl/cl_kernel/concat_kernel.cl b/src/operators/kernel/cl/cl_kernel/concat_kernel.cl new file mode 100644 index 0000000000000000000000000000000000000000..b07ee4d819b25ef77729ed868c54b19a3d8699ae --- /dev/null +++ b/src/operators/kernel/cl/cl_kernel/concat_kernel.cl @@ -0,0 +1,154 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
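The box_decoder kernel above applies the decode_center_size transform to each of the four channel lanes independently. A scalar C++ reference of the same math for a single box may be easier to follow; this is an illustrative sketch (box layout [xmin, ymin, xmax, ymax]), not code from the patch:

```
#include <array>
#include <cmath>
#include <iostream>

std::array<float, 4> DecodeCenterSize(const std::array<float, 4> &prior,
                                      const std::array<float, 4> &prior_var,
                                      const std::array<float, 4> &target) {
  // Prior box extent and center.
  float pw = prior[2] - prior[0];
  float ph = prior[3] - prior[1];
  float pcx = (prior[2] + prior[0]) / 2.f;
  float pcy = (prior[3] + prior[1]) / 2.f;

  // Decoded center and size: offsets are scaled by the prior variances.
  float cx = prior_var[0] * target[0] * pw + pcx;
  float cy = prior_var[1] * target[1] * ph + pcy;
  float w = std::exp(prior_var[2] * target[2]) * pw;
  float h = std::exp(prior_var[3] * target[3]) * ph;

  return {cx - w / 2.f, cy - h / 2.f, cx + w / 2.f, cy + h / 2.f};
}

int main() {
  auto box = DecodeCenterSize({0.1f, 0.1f, 0.3f, 0.3f},
                              {0.1f, 0.1f, 0.2f, 0.2f},
                              {0.5f, 0.5f, 0.0f, 0.0f});
  std::cout << box[0] << ' ' << box[1] << ' ' << box[2] << ' ' << box[3]
            << '\n';  // expected: 0.11 0.11 0.31 0.31
}
```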
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma OPENCL EXTENSION cl_khr_fp16 : enable +/* + +__kernel void concatByC(__read_only image2d_t input_image1, + __read_only image2d_t input_image2, + __write_only image2d_t output_image, + __private const int out_C, + __private const int out_H, + __private const int out_W, + __private const int out_C_Start, + __private const int in_W, + __private const int in_H, + __private const int int_C1, + __private const int int_C2) { + + const int in_c = get_global_id(0); + const int in_w = get_global_id(1); + const int in_nh = get_global_id(2); + int out_c1 = (out_C_Start)/4 + in_c; + + int out_c2 = out_c1 + 1; + + int2 output_pos1; + int2 output_pos2; + + output_pos1.x = out_c1 * out_W + in_w; + output_pos1.y = in_nh; + + output_pos2.x = out_c2 * out_W + in_w; + output_pos2.y = in_nh; + + int2 input_pos1; + if(in_c==0){ + input_pos1.x = ((in_C1-1)/4) * in_W + in_w; + }else{ + input_pos1.x = (in_c - 1) * in_W + in_w; + } + + input_pos1.y = in_nh; + + int2 input_pos2; + input_pos2.x = in_c * in_W + in_w; + input_pos2.y = in_nh; + + half4 output1; + half4 output2; + half4 input1; + half4 input2; + const sampler_t sampler = CLK_NORMALIZED_COORDS_TRUE | + CLK_ADDRESS_CLAMP | + CLK_FILTER_NEAREST; + if(in_c==0){ + input1 = read_imageh(input_image1, sampler,input_pos1); + + }else { + input1 = read_imageh(input_image2, sampler,input_pos1); + } + input2 = read_imageh(input_image2, sampler,input_pos2); + output1 = input1; + + if(out_C_Start%4==0){ + output2 = input2; + + }else if(out_C_Start%4==1){ + output1.y = input2.x; + output1.z = input2.y; + output1.w = input2.z; + output2.x = input2.w; + output2.y = 0.0f; + output2.z = 0.0f; + output2.w = 0.0f; + + }else if(out_C_Start%4==2){ + output1.z = input2.x; + output1.w = input2.y; + output2.x = input2.z; + output2.y = input2.w; + output2.z = 0.0f; + output2.w = 0.0f; + + }else if(out_C_Start%4==3){ + output1.w = input2.x; + output2.x = input2.y; + output2.y = input2.z; + output2.z = input2.w; + output2.w = 0.0f; + } + write_imageh(output_image, output_pos1, output1); + write_imageh(output_image, output_pos2, output2); +} + +__kernel void concatByW0(__read_only image2d_t input_image, + __write_only image2d_t output_image, + __private const int out_W) { + + const int in_c = get_global_id(0); + const int in_w = get_global_id(1); + const int in_nh = get_global_id(2); + + int2 input_pos = in_c * out_W + in_w; + + const sampler_t sampler = CLK_NORMALIZED_COORDS_TRUE | + CLK_ADDRESS_CLAMP | + CLK_FILTER_NEAREST; + half4 input; + input = read_imageh(input_image, sampler,input_pos); + + write_imageh(output_image, input_pos, input); + +} +*/ + +__kernel void concatByH(__read_only image2d_t input_image, + __write_only image2d_t output_image, + __private const int out_W, + __private const int out_H_Start) { + + const int in_c = get_global_id(0); + const int in_w = get_global_id(1); + const int in_nh = get_global_id(2); + + int2 input_pos; + input_pos.x = in_c * out_W + in_w; + input_pos.y = in_nh; + + const sampler_t sampler = CLK_NORMALIZED_COORDS_TRUE | + CLK_ADDRESS_CLAMP | + CLK_FILTER_NEAREST; + half4 input; + input = read_imageh(input_image, sampler,input_pos); + 
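+    // concatByH stacks inputs along the image height axis: each input texel
+    // keeps its x coordinate (c_block * out_W + w) and is shifted down by
+    // out_H_Start rows, which the host advances by each input's height.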
+ int2 output_pos; + output_pos.x = input_pos.x; + output_pos.y = out_H_Start + input_pos.y; + + write_imageh(output_image, output_pos, input); + +} + + diff --git a/src/operators/kernel/cl/cl_kernel/conv_kernel.inc.cl b/src/operators/kernel/cl/cl_kernel/conv_kernel.inc.cl index 63e6e62345c8034ef914b4c385e6fd976b267c4c..2247df59fb77a67a87a00bd26de014f94e86a378 100644 --- a/src/operators/kernel/cl/cl_kernel/conv_kernel.inc.cl +++ b/src/operators/kernel/cl/cl_kernel/conv_kernel.inc.cl @@ -138,19 +138,19 @@ __kernel void conv_3x3(__private const int global_size_dim0, int2 pos_of_weight; pos_of_weight.x = i * 3 + j % 3; pos_of_weight.y = out_c * 4 * 3 + 0 * 3 + j / 3; - float4 weight_x = read_imagef(filter, sampler, pos_of_weight); + half4 weight_x = read_imageh(filter, sampler, pos_of_weight); output.x += dot(input[j], weight_x); pos_of_weight.y = out_c * 4 * 3 + 1 * 3 + j / 3; - float4 weight_y = read_imagef(filter, sampler, pos_of_weight); + half4 weight_y = read_imageh(filter, sampler, pos_of_weight); output.y += dot(input[j], weight_y); pos_of_weight.y = out_c * 4 * 3 + 2 * 3 + j / 3; - float4 weight_z = read_imagef(filter, sampler, pos_of_weight); + half4 weight_z = read_imageh(filter, sampler, pos_of_weight); output.z += dot(input[j], weight_z); pos_of_weight.y = out_c * 4 * 3 + 3 * 3 + j / 3; - float4 weight_w = read_imagef(filter, sampler, pos_of_weight); + half4 weight_w = read_imageh(filter, sampler, pos_of_weight); output.w += dot(input[j], weight_w); } */ diff --git a/src/operators/kernel/cl/cl_kernel/feed_kernel.cl b/src/operators/kernel/cl/cl_kernel/feed_kernel.cl index 200a221c9bda49c42f2caff374fc24d6e4df27e5..bb661f3cf7102d5ef35b57f2167face0957129bc 100644 --- a/src/operators/kernel/cl/cl_kernel/feed_kernel.cl +++ b/src/operators/kernel/cl/cl_kernel/feed_kernel.cl @@ -13,26 +13,50 @@ See the License for the specific language governing permissions and limitations under the License. 
*/ #pragma OPENCL EXTENSION cl_khr_fp16 : enable -__kernel void feed(__global float *in, __write_only image2d_t outputImage,int h,int w,int c) - { - int i = get_global_id(0); - int j = get_global_id(1); - half4 pixel; - pixel.x = convert_half(in[(i * w + j)]); - if(c>=2){ - pixel.y = convert_half(in[h * w + (i * w + j)]); - }else{ - pixel.y = 0.0; - } - if(c>=3){ - pixel.z = convert_half(in[2 * h * w + (i * w + j)]); - }else{ - pixel.z = 0.0; - } - pixel.w = 0.0; - int2 coords; - coords.x = j; - coords.y = i; - - write_imageh(outputImage,coords,pixel); +__kernel void feed(__global float *in, + __write_only image2d_t output_image, + __private const int out_H, + __private const int out_W, + __private const int out_C, + __private const int Stride0, + __private const int Stride1, + __private const int Stride2){ + + const int out_c = get_global_id(0); + const int out_w = get_global_id(1); + const int out_nh = get_global_id(2); + const int out_n = out_nh/out_H; + const int out_h = out_nh%out_H; + + const int in_n = out_n; + const int in_c0 = out_c * 4 + 0; + const int in_c1 = out_c * 4 + 1; + const int in_c2 = out_c * 4 + 2; + const int in_c3 = out_c * 4 + 3; + const int in_h = out_h; + const int in_w = out_w; + + + int input_pos0 = in_n * Stride2 + in_c0 * Stride1 + in_h * Stride0 + in_w; + int input_pos1 = in_n * Stride2 + in_c1 * Stride1 + in_h * Stride0 + in_w; + int input_pos2 = in_n * Stride2 + in_c2 * Stride1 + in_h * Stride0 + in_w; + int input_pos3 = in_n * Stride2 + in_c3 * Stride1 + in_h * Stride0 + in_w; + + int2 output_pos; + output_pos.x = out_c * out_W + out_w; + output_pos.y = out_nh; + + half4 output = (half4)0.0f; + output.x = convert_half(in[input_pos0]); + if(out_C - 4 * out_c>=2){ + output.y = convert_half(in[input_pos1]); + } + if(out_C - 4 * out_c>=3){ + output.z = convert_half(in[input_pos2]); + } + if(out_C - 4 * out_c>=4){ + output.w = convert_half(in[input_pos3]); + } + write_imageh(output_image, output_pos, output); + } diff --git a/src/operators/kernel/cl/cl_kernel/prior_box_kernel.cl b/src/operators/kernel/cl/cl_kernel/prior_box_kernel.cl index 053c7b3f06249f426c4a2203ba9c89362ded6a08..886f62df687361fa40f9987659d0fe31e575de6a 100644 --- a/src/operators/kernel/cl/cl_kernel/prior_box_kernel.cl +++ b/src/operators/kernel/cl/cl_kernel/prior_box_kernel.cl @@ -19,47 +19,52 @@ __kernel void prior_box(__private const int global_size_dim0, __private const int global_size_dim2, __global float *box_width, __global float *box_height, - __write_only image2d_t output_image, + __global float *variances_Buffer, + __write_only image2d_t output_boxes, + __write_only image2d_t output_variances, __private const float step_width, __private const float step_height, __private const float offset, __private const int img_width, __private const int img_height, __private const int num_priors, - __private const int C){ - + __private const int C, + __private const int clip){ const int out_c = get_global_id(0); const int out_nh = get_global_id(1); const int out_n = out_nh/num_priors; const int out_h = out_nh%num_priors; - if (out_c >= global_size_dim0 ||out_nh >= global_size_dim2) { - return; - } - const sampler_t sampler = CLK_NORMALIZED_COORDS_TRUE | - CLK_ADDRESS_CLAMP | - CLK_FILTER_NEAREST; int2 output_pos; output_pos.x = out_c * 4; output_pos.y = out_nh; - float center_x0 = (offset + out_c * 4) * step_width; - float center_x1 = (offset + out_c * 4 + 1) * step_width; - float center_x2 = (offset + out_c * 4 + 2) * step_width; - float center_x3 = (offset + out_c * 4 + 3) * step_width; - float 
center_y = (out_n + offset) * step_height; + float center_x0 = (offset + (float)(out_c * 4)) * step_width; + float center_x1 = (offset + (float)(out_c * 4 + 1)) * step_width; + float center_x2 = (offset + (float)(out_c * 4 + 2)) * step_width; + float center_x3 = (offset + (float)(out_c * 4 + 3)) * step_width; + float center_y = ((float)out_n + offset) * step_height; half4 output[4]; - output[0].x = convert_half((center_x0 - box_width[out_h]) / img_width); - output[1].x = convert_half((center_y - box_height[out_h]) / img_height); - output[2].x = convert_half((center_x0 + box_width[out_h]) / img_width); - output[3].x = convert_half((center_y + box_height[out_h]) / img_height); + half4 variances[4]; + output[0].x = convert_half((center_x0 - box_width[out_h]) / (float)img_width); + output[1].x = convert_half((center_y - box_height[out_h]) / (float)img_height); + output[2].x = convert_half((center_x0 + box_width[out_h]) / (float)img_width); + output[3].x = convert_half((center_y + box_height[out_h]) / (float)img_height); + variances[0].x = convert_half(variances_Buffer[0]); + variances[1].x = convert_half(variances_Buffer[1]); + variances[2].x = convert_half(variances_Buffer[2]); + variances[3].x = convert_half(variances_Buffer[3]); if(C - 4 * out_c>=2){ - output[0].y = convert_half((center_x1 - box_width[out_h]) / img_width); - output[1].y = convert_half((center_y - box_height[out_h]) / img_height); - output[2].y = convert_half((center_x1 + box_width[out_h]) / img_width); - output[3].y = convert_half((center_y + box_height[out_h]) / img_height); + output[0].y = convert_half((center_x1 - box_width[out_h]) / (float)img_width); + output[1].y = convert_half((center_y - box_height[out_h]) / (float)img_height); + output[2].y = convert_half((center_x1 + box_width[out_h]) / (float)img_width); + output[3].y = convert_half((center_y + box_height[out_h]) / (float)img_height); + variances[0].y = convert_half(variances_Buffer[0]); + variances[1].y = convert_half(variances_Buffer[1]); + variances[2].y = convert_half(variances_Buffer[2]); + variances[3].y = convert_half(variances_Buffer[3]); }else{ output[0].y = 0.0f; output[1].y = 0.0f; @@ -67,10 +72,14 @@ __kernel void prior_box(__private const int global_size_dim0, output[3].y = 0.0f; } if(C - 4 * out_c>=3){ - output[0].z = convert_half((center_x2 - box_width[out_h]) / img_width); - output[1].z = convert_half((center_y - box_height[out_h]) / img_height); - output[2].z = convert_half((center_x2 + box_width[out_h]) / img_width); - output[3].z = convert_half((center_y + box_height[out_h]) / img_height); + output[0].z = convert_half((center_x2 - box_width[out_h]) / (float)img_width); + output[1].z = convert_half((center_y - box_height[out_h]) / (float)img_height); + output[2].z = convert_half((center_x2 + box_width[out_h]) / (float)img_width); + output[3].z = convert_half((center_y + box_height[out_h]) / (float)img_height); + variances[0].z = convert_half(variances_Buffer[0]); + variances[1].z = convert_half(variances_Buffer[1]); + variances[2].z = convert_half(variances_Buffer[2]); + variances[3].z = convert_half(variances_Buffer[3]); }else{ output[0].z = 0.0f; output[1].z = 0.0f; @@ -78,23 +87,43 @@ __kernel void prior_box(__private const int global_size_dim0, output[3].z = 0.0f; } if(C - 4 * out_c>=4){ - output[0].w = convert_half((center_x3 - box_width[out_h]) / img_width); - output[1].w = convert_half((center_y - box_height[out_h]) / img_height); - output[2].w = convert_half((center_x3 + box_width[out_h]) / img_width); - output[3].w = 
convert_half((center_y + box_height[out_h]) / img_height); + output[0].w = convert_half((center_x3 - box_width[out_h]) / (float)img_width); + output[1].w = convert_half((center_y - box_height[out_h]) / (float)img_height); + output[2].w = convert_half((center_x3 + box_width[out_h]) / (float)img_width); + output[3].w = convert_half((center_y + box_height[out_h]) / (float)img_height); + variances[0].w = convert_half(variances_Buffer[0]); + variances[1].w = convert_half(variances_Buffer[1]); + variances[2].w = convert_half(variances_Buffer[2]); + variances[3].w = convert_half(variances_Buffer[3]); }else{ - output[0].z = 0.0f; - output[1].z = 0.0f; - output[2].z = 0.0f; - output[3].z = 0.0f; + output[0].w = 0.0f; + output[1].w = 0.0f; + output[2].w = 0.0f; + output[3].w = 0.0f; + } + if(clip==1){ + output[0] = min(max((half4)(0.0f, 0.0f, 0.0f, 0.0f), output[0]),(half4)(1.0f, 1.0f, 1.0f, 1.0f)); + output[1] = min(max((half4)(0.0f, 0.0f, 0.0f, 0.0f), output[1]),(half4)(1.0f, 1.0f, 1.0f, 1.0f)); + output[2] = min(max((half4)(0.0f, 0.0f, 0.0f, 0.0f), output[2]),(half4)(1.0f, 1.0f, 1.0f, 1.0f)); + output[3] = min(max((half4)(0.0f, 0.0f, 0.0f, 0.0f), output[3]),(half4)(1.0f, 1.0f, 1.0f, 1.0f)); } - output[0] = min(max((half4)(0.0f, 0.0f, 0.0f, 0.0f), output[0]),(half4)(1.0f, 1.0f, 1.0f, 1.0f)); - output[1] = min(max((half4)(0.0f, 0.0f, 0.0f, 0.0f), output[1]),(half4)(1.0f, 1.0f, 1.0f, 1.0f)); - output[2] = min(max((half4)(0.0f, 0.0f, 0.0f, 0.0f), output[2]),(half4)(1.0f, 1.0f, 1.0f, 1.0f)); - output[3] = min(max((half4)(0.0f, 0.0f, 0.0f, 0.0f), output[3]),(half4)(1.0f, 1.0f, 1.0f, 1.0f)); - write_imageh(output_image, (int2)(output_pos.x + 1, output_pos.y), output[0]); - write_imageh(output_image, (int2)(output_pos.x + 2, output_pos.y), output[1]); - write_imageh(output_image, (int2)(output_pos.x + 3, output_pos.y), output[2]); - write_imageh(output_image, (int2)(output_pos.x + 4, output_pos.y), output[3]); + /* + if(output_pos.x == 0 && output_pos.y == 1){ + float4 out = (float4)(output[0].x, output[1].x, output[2].x, output[3].x); + printf("output = %v4hlf \n", out); + + } + */ + + write_imageh(output_boxes, (int2)(output_pos.x + 0, output_pos.y), output[0]); + write_imageh(output_boxes, (int2)(output_pos.x + 1, output_pos.y), output[1]); + write_imageh(output_boxes, (int2)(output_pos.x + 2, output_pos.y), output[2]); + write_imageh(output_boxes, (int2)(output_pos.x + 3, output_pos.y), output[3]); + + write_imageh(output_variances, (int2)(output_pos.x + 0, output_pos.y), variances[0]); + write_imageh(output_variances, (int2)(output_pos.x + 1, output_pos.y), variances[1]); + write_imageh(output_variances, (int2)(output_pos.x + 2, output_pos.y), variances[2]); + write_imageh(output_variances, (int2)(output_pos.x + 3, output_pos.y), variances[3]); + } \ No newline at end of file diff --git a/src/operators/kernel/cl/cl_kernel/reshape.cl b/src/operators/kernel/cl/cl_kernel/reshape.cl index 0ffc64f15cd531879de4852f976769790b6bafe4..7957001c9659e94e706888b658ebf39640f61a0a 100644 --- a/src/operators/kernel/cl/cl_kernel/reshape.cl +++ b/src/operators/kernel/cl/cl_kernel/reshape.cl @@ -14,26 +14,150 @@ limitations under the License. 
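The reworked prior_box kernel above now writes two images (boxes and per-box variances) and only clamps to [0, 1] when clip is set. For one output lane the box math reduces to the scalar sketch below; the helper name is invented for illustration, and box_w/box_h follow the same convention as the box_width/box_height buffers passed to the kernel:

```
#include <algorithm>
#include <array>
#include <iostream>

// One prior-box lane: coordinates are normalized by the input image size
// and optionally clipped to [0, 1].
std::array<float, 4> PriorBoxAt(int col, int row, float box_w, float box_h,
                                float step_w, float step_h, float offset,
                                int img_w, int img_h, bool clip) {
  const float cx = (offset + static_cast<float>(col)) * step_w;
  const float cy = (offset + static_cast<float>(row)) * step_h;
  std::array<float, 4> box = {(cx - box_w) / img_w, (cy - box_h) / img_h,
                              (cx + box_w) / img_w, (cy + box_h) / img_h};
  if (clip) {
    for (float &v : box) v = std::min(std::max(v, 0.f), 1.f);
  }
  return box;
}

int main() {
  auto b = PriorBoxAt(0, 0, 30.f, 30.f, 16.f, 16.f, 0.5f, 300, 300, true);
  std::cout << b[0] << ' ' << b[1] << ' ' << b[2] << ' ' << b[3] << '\n';
}
```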
*/ #pragma OPENCL EXTENSION cl_khr_fp16 : enable -__kernel void reshape(__read_only image2d_t input, - __write_only image2d_t output, - __private const int d0, - __private const int d1, - __private const int d2, - __private const int d3, - __private const int x0, - __private const int x1, - __private const int x2, - __private const int x3) { - const int x = get_global_id(0); - const int y = get_global_id(1); +__kernel void reshape(__read_only image2d_t input_image, + __write_only image2d_t output_image, + __private const int out_C, + __private const int out_H, + __private const int out_W, + __private const int in_W, + __private const int in_H, + __private const int in_Stride0, + __private const int in_Stride1, + __private const int in_Stride2, + __private const int out_Stride0, + __private const int out_Stride1, + __private const int out_Stride2) { - const sampler_t sampler = CLK_NORMALIZED_COORDS_TRUE | - CLK_ADDRESS_CLAMP | - CLK_FILTER_NEAREST; + const int out_c = get_global_id(0); + const int out_w = get_global_id(1); + const int out_nh = get_global_id(2); + const int out_n = out_nh/out_H; + const int out_h = out_nh%out_H; + const int out_c0 = out_c * 4; + const int out_c1 = out_c * 4 + 1; + const int out_c2 = out_c * 4+ 2; + const int out_c3 = out_c * 4+ 3; + + int count0 = out_n * out_Stride2 + out_c0 * out_Stride1 + out_h * out_Stride0 + out_w; + int count1 = out_n * out_Stride2 + out_c1 * out_Stride1 + out_h * out_Stride0 + out_w; + int count2 = out_n * out_Stride2 + out_c2 * out_Stride1 + out_h * out_Stride0 + out_w; + int count3 = out_n * out_Stride2 + out_c3 * out_Stride1 + out_h * out_Stride0 + out_w; + + int in_n0 = count0/in_Stride2; + int in_n1 = count1/in_Stride2; + int in_n2 = count1/in_Stride2; + int in_n3 = count2/in_Stride2; + + count0 = count0%in_Stride2; + count1 = count1%in_Stride2; + count2 = count2%in_Stride2; + count3 = count3%in_Stride2; + + int in_c0 = count0/in_Stride1; + int in_c1 = count1/in_Stride1; + int in_c2 = count2/in_Stride1; + int in_c3 = count3/in_Stride1; + + int in_h0 = (count0%in_Stride1)/in_Stride0; + int in_h1 = (count1%in_Stride1)/in_Stride0; + int in_h2 = (count2%in_Stride1)/in_Stride0; + int in_h3 = (count3%in_Stride1)/in_Stride0; + + int in_w0 = (count0%in_Stride1)%in_Stride0; + int in_w1 = (count1%in_Stride1)%in_Stride0; + int in_w2 = (count2%in_Stride1)%in_Stride0; + int in_w3 = (count3%in_Stride1)%in_Stride0; + + + int2 input_pos0; + int2 input_pos1; + int2 input_pos2; + int2 input_pos3; + + input_pos0.x = (in_c0/4) * in_W + in_w0; + input_pos0.y = in_n0 * in_H + in_h0; + + input_pos1.x = (in_c1/4) * in_W + in_w1; + input_pos1.y = in_n1 * in_H + in_h1; + + input_pos2.x = (in_c2/4) * in_W + in_w2; + input_pos2.y = in_n2 * in_H + in_h2; + + input_pos3.x = (in_c3/4) * in_W + in_w3; + input_pos3.y = in_n3 * in_H + in_h3; + + int2 output_pos; + output_pos.x = out_c * out_W + out_w; + output_pos.y = out_nh; + + const sampler_t sampler = CLK_NORMALIZED_COORDS_TRUE | + CLK_ADDRESS_CLAMP | + CLK_FILTER_NEAREST; + + half4 input0; + half4 input1; + half4 input2; + half4 input3; + half4 output; + + input0 = read_imageh(input_image, sampler,input_pos0); + if(in_c0%4==0){ + output.x = input0.x; + }else if(in_c0%4==1){ + output.x = input0.y; + }else if(in_c0%4==2){ + output.x = input0.z; + }else{ + output.x = input0.w; + } + if(out_C - out_c * 4>=2){ + input1 = read_imageh(input_image, sampler,input_pos1); + if(in_c1%4==0){ + output.y = input1.x; + }else if(in_c1%4==1){ + output.y = input1.y; + }else if(in_c1%4==2){ + output.y = input1.z; + }else{ + 
output.y = input1.w; + } + + }else{ + output.y = 0.0f; + } + + if(out_C - out_c * 4>=3){ + input2 = read_imageh(input_image, sampler,input_pos2); + + if(in_c2%4==0){ + output.z = input2.x; + }else if(in_c2%4==1){ + output.z = input1.y; + }else if(in_c2%4==2){ + output.z = input2.z; + }else{ + output.z = input2.w; + } + }else{ + output.z = 0.0f; + } - half4 in = read_imageh(input, sampler, (int2)(x, y)); + if(out_C - out_c * 4>=4){ + input3 = read_imageh(input_image, sampler,input_pos3); + if(in_c3%4==0){ + output.w = input3.x; + }else if(in_c3%4==1){ + output.w = input3.y; + }else if(in_c3%4==2){ + output.w = input3.z; + }else{ + output.w = input3.w; + } + }else{ + output.w = 0.0f; + } - write_imageh(output, (int2)(x, y), in); + write_imageh(output_image, output_pos, output); } diff --git a/src/operators/kernel/cl/cl_kernel/softmax.cl b/src/operators/kernel/cl/cl_kernel/softmax.cl index 215ec69fc283dcb2b538300cb5591b2b9e4b6a13..a1fa014e00d021f6ad39ac49a841b54476b63639 100644 --- a/src/operators/kernel/cl/cl_kernel/softmax.cl +++ b/src/operators/kernel/cl/cl_kernel/softmax.cl @@ -16,35 +16,46 @@ limitations under the License. */ __kernel void softmax(__read_only image2d_t input_image, __write_only image2d_t output_image, - __private const int group + __private const int out_W ) { const int out_c = get_global_id(0); // block index const int out_w = get_global_id(1); // index in one block const int out_nh = get_global_id(2); + const int in_c = out_c; + const int in_w = out_w; + const int in_nh = out_nh; - const sampler_t sampler = CLK_NORMALIZED_COORDS_TRUE | - CLK_ADDRESS_CLAMP | - CLK_FILTER_NEAREST; + int2 input_pos; + int2 output_pos; - half maxv = 0.0f; - for (int i = 0; i < group; ++i) { - half4 temp = read_imageh(input_image, sampler, (int2)(i, 0)); - maxv = max(maxv, max(temp.x, max(temp.y, max(temp.z, temp.w)))); - } + input_pos.x = in_c * out_W + in_w; + input_pos.y = in_nh; + output_pos.x = out_c * out_W + out_w; + output_pos.y = out_nh; - half4 rsum = (half4)(0.0f); - for (int i = 0; i < group; ++i) { - half4 r = read_imageh(input_image, sampler, (int2)(i, 0)); - rsum += convert_half4(exp(convert_float4(r - maxv))); - } + const sampler_t sampler = CLK_NORMALIZED_COORDS_TRUE | + CLK_ADDRESS_CLAMP | + CLK_FILTER_NEAREST; + + half4 input_max = 0.0f; + half4 input_tmp; + for(int i=0;i=2){ + input1 = read_imageh(input_image, sampler,input_pos1); + if(out_w%4==0){ + output.y = input1.x; + }else if(out_w%4==1){ + output.y = input1.y; + }else if(out_w%4==2){ + output.y = input1.z; + }else{ + output.y = input1.w; + } + + }else{ + output.y = 0.0f; + } + + if(out_C - out_c * 4>=3){ + input2 = read_imageh(input_image, sampler,input_pos2); + + if(out_w%4==0){ + output.z = input2.x; + }else if(out_w%4==1){ + output.z = input2.y; + }else if(out_w%4==2){ + output.z = input2.z; + }else{ + output.z = input2.w; + } + }else{ + output.z = 0.0f; + } + + if(out_C - out_c * 4>=4){ + input3 = read_imageh(input_image, sampler,input_pos3); + if(out_w%4==0){ + output.w = input3.x; + }else if(out_w%4==1){ + output.w = input3.y; + }else if(out_w%4==2){ + output.w = input3.z; + }else{ + output.w = input3.w; + } + }else{ + output.w = 0.0f; + } + write_imageh(output_image, output_pos, output); +} + +__kernel void transpose( __read_only image2d_t input_image, + __write_only image2d_t output_image, + __private const int out_C, + __private const int out_H, + __private const int out_W, + __private const int in_W + ){ + const int out_c = get_global_id(0); + const int out_w = get_global_id(1); + const int out_nh = 
get_global_id(2); + const int out_n = 1; + const int out_h = out_nh%out_H; + + const int in_n = 1; + const int in_c = out_c; + const int in_w = out_h; + const int in_h = out_w; + + int2 input_pos; + int2 output_pos; + + input_pos.x = in_c * in_W + in_w; + input_pos.y = in_n * in_h; + + output_pos.x = out_c * out_W + out_w; + output_pos.y = out_n * out_h; + + const sampler_t sampler = CLK_NORMALIZED_COORDS_TRUE | + CLK_ADDRESS_CLAMP | + CLK_FILTER_NEAREST; + + half4 input; + half4 output; + input = read_imageh(input_image, sampler,input_pos); + + output = input; + write_imageh(output_image, output_pos, output); + +} \ No newline at end of file diff --git a/src/operators/kernel/cl/concat_kernel.cpp b/src/operators/kernel/cl/concat_kernel.cpp index 48fbd03315957f54873e5ee18bc95896ca306554..3deb31e7aa0c408cc2b87c523d324001f75ade88 100644 --- a/src/operators/kernel/cl/concat_kernel.cpp +++ b/src/operators/kernel/cl/concat_kernel.cpp @@ -21,11 +21,49 @@ namespace operators { template <> bool ConcatKernel::Init(ConcatParam *param) { + if (param->Out()->dims().size() < 4) { + this->cl_helper_.AddKernel("concatByH", "concat_kernel.cl"); + } return true; } template <> -void ConcatKernel::Compute(const ConcatParam ¶m) {} +void ConcatKernel::Compute(const ConcatParam ¶m) { + if (param.Out()->dims().size() < 4) { + auto kernel = this->cl_helper_.KernelAt(0); + auto inputs = param.Inputs(); + auto *output_image = param.Out()->GetCLImage(); + int out_W = 0; + if (param.Out()->dims().size() == 3) { + out_W = param.Out()->dims()[2]; + } else if (param.Out()->dims().size() == 2) { + out_W = param.Out()->dims()[1]; + } + int out_H_Start = 0; + for (int i = 0; i < inputs.size(); i++) { + auto input_image = inputs[i]->GetCLImage(); + auto default_work_size = this->cl_helper_.DefaultWorkSize(*inputs[i]); + cl_int status; + status = clSetKernelArg(kernel, 0, sizeof(cl_mem), &input_image); + CL_CHECK_ERRORS(status); + status = clSetKernelArg(kernel, 1, sizeof(cl_mem), &output_image); + CL_CHECK_ERRORS(status); + status = clSetKernelArg(kernel, 2, sizeof(int), &out_W); + CL_CHECK_ERRORS(status); + status = clSetKernelArg(kernel, 3, sizeof(int), &out_H_Start); + CL_CHECK_ERRORS(status); + status = clEnqueueNDRangeKernel( + this->cl_helper_.CLCommandQueue(), kernel, default_work_size.size(), + NULL, default_work_size.data(), NULL, 0, NULL, NULL); + CL_CHECK_ERRORS(status); + if (param.Out()->dims().size() == 3) { + out_H_Start += inputs[i]->dims()[1]; + } else if (param.Out()->dims().size() == 2) { + out_H_Start += inputs[i]->dims()[0]; + } + } + } +} } // namespace operators } // namespace paddle_mobile diff --git a/src/operators/kernel/cl/feed_kernel.cpp b/src/operators/kernel/cl/feed_kernel.cpp index 941a6cb815541d1eca30ccc193161838ce28da80..e813d08e6b11fbfe330a39532561f062d175bfdb 100644 --- a/src/operators/kernel/cl/feed_kernel.cpp +++ b/src/operators/kernel/cl/feed_kernel.cpp @@ -27,6 +27,7 @@ bool FeedKernel::Init(FeedParam *param) { template <> void FeedKernel::Compute(const FeedParam ¶m) { auto kernel = this->cl_helper_.KernelAt(0); + auto default_work_size = this->cl_helper_.DefaultWorkSize(*(param.Out())); cl_int status; param.Out()->InitEmptyImage(cl_helper_.CLContext(), cl_helper_.CLCommandQueue(), param.Out()->dims()); @@ -35,10 +36,13 @@ void FeedKernel::Compute(const FeedParam ¶m) { // DLOG << *input; const float *input_data = input->data(); int numel = input->numel(); - cl_mem cl_image = output->GetCLImage(); - int c = input->dims()[1]; - int height = output->dims()[2]; - int width = 
output->dims()[3]; + cl_mem output_image = output->GetCLImage(); + const int out_C = output->dims()[1]; + const int out_H = output->dims()[2]; + const int out_W = output->dims()[3]; + const int Stride2 = out_C * out_H * out_W; + const int Stride1 = out_H * out_W; + const int Stride0 = out_W; CLTensor input_cl_tensor(this->cl_helper_.CLContext(), this->cl_helper_.CLCommandQueue()); input_cl_tensor.Resize(input->dims()); @@ -46,21 +50,25 @@ void FeedKernel::Compute(const FeedParam ¶m) { status = clSetKernelArg(kernel, 0, sizeof(cl_mem), &inputBuffer); CL_CHECK_ERRORS(status); - status = clSetKernelArg(kernel, 1, sizeof(cl_mem), &cl_image); + status = clSetKernelArg(kernel, 1, sizeof(cl_mem), &output_image); CL_CHECK_ERRORS(status); - status = clSetKernelArg(kernel, 2, sizeof(cl_int), &width); + status = clSetKernelArg(kernel, 2, sizeof(cl_int), &out_H); CL_CHECK_ERRORS(status); - status = clSetKernelArg(kernel, 3, sizeof(cl_int), &height); + status = clSetKernelArg(kernel, 3, sizeof(cl_int), &out_W); CL_CHECK_ERRORS(status); - status = clSetKernelArg(kernel, 4, sizeof(cl_int), &c); + status = clSetKernelArg(kernel, 4, sizeof(cl_int), &out_C); + CL_CHECK_ERRORS(status); + status = clSetKernelArg(kernel, 5, sizeof(cl_int), &Stride0); + CL_CHECK_ERRORS(status); + status = clSetKernelArg(kernel, 6, sizeof(cl_int), &Stride1); + CL_CHECK_ERRORS(status); + status = clSetKernelArg(kernel, 7, sizeof(cl_int), &Stride2); CL_CHECK_ERRORS(status); - size_t global_work_size[2] = {width, height}; - - // cl_event out_event = param.Out()->GetClEvent(); + status = clEnqueueNDRangeKernel( + this->cl_helper_.CLCommandQueue(), kernel, default_work_size.size(), NULL, + default_work_size.data(), NULL, 0, NULL, NULL); - status = clEnqueueNDRangeKernel(this->cl_helper_.CLCommandQueue(), kernel, 2, - NULL, global_work_size, NULL, 0, NULL, NULL); CL_CHECK_ERRORS(status); } diff --git a/src/operators/kernel/cl/fetch_kernel.cpp b/src/operators/kernel/cl/fetch_kernel.cpp index 8ea0b3ad3d33f0352fba7697fc08ad7a2039e6ab..e13fbcaed6cd6caec495cafa31b9147a89d54753 100644 --- a/src/operators/kernel/cl/fetch_kernel.cpp +++ b/src/operators/kernel/cl/fetch_kernel.cpp @@ -22,11 +22,11 @@ namespace operators { template <> bool FetchKernel::Init(FetchParam *param) { - if (param->InputX()->dims().size() <= 2) { - this->cl_helper_.AddKernel("fetch_2d", "fetch_kernel.cl"); - } else { - this->cl_helper_.AddKernel("fetch", "fetch_kernel.cl"); - } + // if (param->InputX()->dims().size() <= 2) { + // this->cl_helper_.AddKernel("fetch_2d", "fetch_kernel.cl"); + // } else { + this->cl_helper_.AddKernel("fetch", "fetch_kernel.cl"); + // } return true; } @@ -37,6 +37,7 @@ void FetchKernel::Compute(const FetchParam ¶m) { auto input = param.InputX()->GetCLImage(); auto *out = param.Out(); + out->Resize(param.InputX()->dims()); out->mutable_data(); const auto &dim = param.InputX()->dims(); size_t new_dims[] = {1, 1, 1, 1}; @@ -49,11 +50,11 @@ void FetchKernel::Compute(const FetchParam ¶m) { C = new_dims[1]; in_height = new_dims[2]; - if (dim.size() <= 2) { - in_width = param.InputX()->ImageWidth(); - } else { - in_width = new_dims[3]; - } + // if (dim.size() <= 2) { + // in_width = param.InputX()->ImageWidth(); + // } else { + in_width = new_dims[3]; + // } CLTensor out_cl_tensor(this->cl_helper_.CLContext(), this->cl_helper_.CLCommandQueue()); @@ -64,16 +65,16 @@ void FetchKernel::Compute(const FetchParam ¶m) { clSetKernelArg(kernel, 1, sizeof(int), &in_width); clSetKernelArg(kernel, 2, sizeof(cl_mem), &input); clSetKernelArg(kernel, 3, 
sizeof(cl_mem), &outBuffer); - if (dim.size() > 2) { - int size_ch = in_height * in_width; - int size_block = size_ch * 4; - int size_batch = size_ch * C; - int out_c = new_dims[1]; - clSetKernelArg(kernel, 4, sizeof(int), &size_ch); - clSetKernelArg(kernel, 5, sizeof(int), &size_block); - clSetKernelArg(kernel, 6, sizeof(int), &size_batch); - clSetKernelArg(kernel, 7, sizeof(int), &out_c); - } + // if (dim.size() > 2) { + int size_ch = in_height * in_width; + int size_block = size_ch * 4; + int size_batch = size_ch * C; + int out_c = new_dims[1]; + clSetKernelArg(kernel, 4, sizeof(int), &size_ch); + clSetKernelArg(kernel, 5, sizeof(int), &size_block); + clSetKernelArg(kernel, 6, sizeof(int), &size_batch); + clSetKernelArg(kernel, 7, sizeof(int), &out_c); + // } // cl_event wait_event = param.InpdutX()->GetClEvent(); clEnqueueNDRangeKernel(this->cl_helper_.CLCommandQueue(), kernel, 3, NULL, @@ -93,8 +94,6 @@ void FetchKernel::Compute(const FetchParam ¶m) { // << "ms" << std::endl; memcpy(out->data(), out_cl_tensor.Data(), out->memory_size()); - DLOG << *param.InputX(); - DLOG << *out; } template class FetchKernel; diff --git a/src/operators/kernel/cl/multiclass_nms_kernel.cpp b/src/operators/kernel/cl/multiclass_nms_kernel.cpp index e7bf02cde4b5562fd17508a29bd2e820f2e668b6..31ccdc0df5ee50f804f471fc19dae49dcecde3e6 100644 --- a/src/operators/kernel/cl/multiclass_nms_kernel.cpp +++ b/src/operators/kernel/cl/multiclass_nms_kernel.cpp @@ -15,19 +15,323 @@ limitations under the License. */ #ifdef MULTICLASSNMS_OP #include "operators/kernel/multiclass_nms_kernel.h" - +#include "operators/math/poly_util.h" namespace paddle_mobile { namespace operators { template <> bool MultiClassNMSKernel::Init( - MultiClassNMSParam *param) { + MultiClassNMSParam* param) { + this->cl_helper_.AddKernel("fetch", "fetch_kernel.cl"); + this->cl_helper_.AddKernel("feed", "feed_kernel.cl"); return true; } +template +bool SortScorePairDescend(const std::pair& pair1, + const std::pair& pair2) { + return pair1.first > pair2.first; +} + +template +static inline void GetMaxScoreIndex( + const std::vector& scores, const T threshold, int top_k, + std::vector>* sorted_indices) { + for (size_t i = 0; i < scores.size(); ++i) { + if (scores[i] > threshold) { + sorted_indices->push_back(std::make_pair(scores[i], i)); + } + } + // Sort the score pair according to the scores in descending order + std::stable_sort(sorted_indices->begin(), sorted_indices->end(), + SortScorePairDescend); + // Keep top_k scores if needed. + if (top_k > -1 && top_k < static_cast(sorted_indices->size())) { + sorted_indices->resize(top_k); + } +} + +template +static inline T BBoxArea(const T* box, const bool normalized) { + if (box[2] < box[0] || box[3] < box[1]) { + // If coordinate values are is invalid + // (e.g. xmax < xmin or ymax < ymin), return 0. + return static_cast(0.); + } else { + const T w = box[2] - box[0]; + const T h = box[3] - box[1]; + if (normalized) { + return w * h; + } else { + // If coordinate values are not within range [0, 1]. 
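+      // (boxes here are in integer pixel coordinates with inclusive end
+      // points, hence the +1 on both width and height)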
+ return (w + 1) * (h + 1); + } + } +} + +template +static inline T JaccardOverlap(const T* box1, const T* box2, + const bool normalized) { + if (box2[0] > box1[2] || box2[2] < box1[0] || box2[1] > box1[3] || + box2[3] < box1[1]) { + return static_cast(0.); + } else { + const T inter_xmin = std::max(box1[0], box2[0]); + const T inter_ymin = std::max(box1[1], box2[1]); + const T inter_xmax = std::min(box1[2], box2[2]); + const T inter_ymax = std::min(box1[3], box2[3]); + const T inter_w = inter_xmax - inter_xmin; + const T inter_h = inter_ymax - inter_ymin; + const T inter_area = inter_w * inter_h; + const T bbox1_area = BBoxArea(box1, normalized); + const T bbox2_area = BBoxArea(box2, normalized); + return inter_area / (bbox1_area + bbox2_area - inter_area); + } +} + +template +static inline T PolyIoU(const T* box1, const T* box2, const size_t box_size, + const bool normalized) { + T bbox1_area = math::PolyArea(box1, box_size, normalized); + T bbox2_area = math::PolyArea(box2, box_size, normalized); + T inter_area = math::PolyOverlapArea(box1, box2, box_size, normalized); + if (bbox1_area == 0 || bbox2_area == 0 || inter_area == 0) { + // If coordinate values are is invalid + // if area size <= 0, return 0. + return static_cast(0.); + } else { + return inter_area / (bbox1_area + bbox2_area - inter_area); + } +} + +template +static inline void NMSFast(const framework::Tensor& bbox, + const framework::Tensor& scores, + const T score_threshold, const T nms_threshold, + const T eta, const int64_t top_k, + std::vector* selected_indices) { + // The total boxes for each instance. + int64_t num_boxes = bbox.dims()[0]; + // 4: [xmin ymin xmax ymax] + int64_t box_size = bbox.dims()[1]; + + std::vector scores_data(num_boxes); + std::copy_n(scores.data(), num_boxes, scores_data.begin()); + std::vector> sorted_indices; + GetMaxScoreIndex(scores_data, score_threshold, top_k, &sorted_indices); + + selected_indices->clear(); + T adaptive_threshold = nms_threshold; + const T* bbox_data = bbox.data(); + + while (sorted_indices.size() != 0) { + const int idx = sorted_indices.front().second; + bool keep = true; + for (size_t k = 0; k < selected_indices->size(); ++k) { + if (keep) { + const int kept_idx = (*selected_indices)[k]; + T overlap = T(0.); + if (box_size == 4) { + overlap = JaccardOverlap(bbox_data + idx * box_size, + bbox_data + kept_idx * box_size, true); + } else { + overlap = PolyIoU(bbox_data + idx * box_size, + bbox_data + kept_idx * box_size, box_size, true); + } + keep = overlap <= adaptive_threshold; + } else { + break; + } + } + if (keep) { + selected_indices->push_back(idx); + } + sorted_indices.erase(sorted_indices.begin()); + if (keep && eta < 1 && adaptive_threshold > 0.5) { + adaptive_threshold *= eta; + } + } +} + +template +void MultiClassNMS(const framework::Tensor& scores, + const framework::Tensor& bboxes, + std::map>* indices, int* num_nmsed_out, + const int& background_label, const int& nms_top_k, + const int& keep_top_k, const T& nms_threshold, + const T& nms_eta, const T& score_threshold) { + int64_t class_num = scores.dims()[0]; + int64_t predict_dim = scores.dims()[1]; + int num_det = 0; + for (int64_t c = 0; c < class_num; ++c) { + if (c == background_label) continue; + framework::Tensor score = scores.Slice(c, c + 1); + /// [c] is key + NMSFast(bboxes, score, score_threshold, nms_threshold, nms_eta, + nms_top_k, &((*indices)[c])); + num_det += (*indices)[c].size(); + } + + *num_nmsed_out = num_det; + const T* scores_data = scores.data(); + if (keep_top_k > -1 && num_det > 
keep_top_k) { + std::vector>> score_index_pairs; + for (const auto& it : *indices) { + int label = it.first; + const T* sdata = scores_data + label * predict_dim; + const std::vector& label_indices = it.second; + for (size_t j = 0; j < label_indices.size(); ++j) { + int idx = label_indices[j]; + // PADDLE_ENFORCE_LT(idx, predict_dim); + score_index_pairs.push_back( + std::make_pair(sdata[idx], std::make_pair(label, idx))); + } + } + // Keep top k results per image. + std::stable_sort(score_index_pairs.begin(), score_index_pairs.end(), + SortScorePairDescend>); + score_index_pairs.resize(keep_top_k); + + // Store the new indices. + std::map> new_indices; + for (size_t j = 0; j < score_index_pairs.size(); ++j) { + int label = score_index_pairs[j].second.first; + int idx = score_index_pairs[j].second.second; + new_indices[label].push_back(idx); + } + new_indices.swap(*indices); + *num_nmsed_out = keep_top_k; + } +} + +template +void MultiClassOutput(const framework::Tensor& scores, + const framework::Tensor& bboxes, + const std::map>& selected_indices, + framework::Tensor* outs) { + int predict_dim = scores.dims()[1]; + int box_size = bboxes.dims()[1]; + int out_dim = bboxes.dims()[1] + 2; + auto* scores_data = scores.data(); + auto* bboxes_data = bboxes.data(); + auto* odata = outs->data(); + + int count = 0; + for (const auto& it : selected_indices) { + /// one batch + int label = it.first; + const T* sdata = scores_data + label * predict_dim; + const std::vector& indices = it.second; + for (size_t j = 0; j < indices.size(); ++j) { + int idx = indices[j]; + const T* bdata = bboxes_data + idx * box_size; + odata[count * out_dim] = label; // label + odata[count * out_dim + 1] = sdata[idx]; // score + // xmin, ymin, xmax, ymax + std::memcpy(odata + count * out_dim + 2, bdata, box_size * sizeof(T)); + count++; + } + } +} + +template +void MultiClassNMSCompute(const MultiClassNMSParam& param, + cl_context context, cl_command_queue commandQueue, + cl_kernel kernel0, cl_kernel kernel1) { + auto* input_bboxes_image = param.InputBBoxes(); + auto& input_bboxes_dims = input_bboxes_image->dims(); + Tensor* input_bboxes = new Tensor(); + input_bboxes->Resize(input_bboxes_dims); + input_bboxes->mutable_data(); + DLOG << "yangfei20"; + framework::CLImageToTensor(input_bboxes_image, input_bboxes, context, + commandQueue, kernel0); + DLOG << "yangfei20"; + auto* input_scores_image = param.InputScores(); + auto& input_scores_dims = input_scores_image->dims(); + + Tensor* input_scores = new Tensor(); + input_scores->Resize(input_scores_dims); + input_scores->mutable_data(); + framework::CLImageToTensor(input_scores_image, input_scores, context, + commandQueue, kernel0); + DLOG << "yangfei20"; + auto outs_image = param.Out(); + Tensor* outs = new Tensor(); + outs->Resize(outs_image->dims()); + outs->mutable_data(); + DLOG << *input_bboxes; + DLOG << *input_scores; + DLOG << *outs; + auto background_label = param.BackGroundLabel(); + auto nms_top_k = param.NMSTopK(); + auto keep_top_k = param.KeepTopK(); + auto nms_threshold = param.NMSThreshold(); + auto nms_eta = param.NMSEta(); + auto score_threshold = param.ScoreThreshold(); + int64_t batch_size = input_scores_dims[0]; + int64_t class_num = input_scores_dims[1]; + int64_t predict_dim = input_scores_dims[2]; + int64_t box_dim = input_bboxes_dims[2]; + + std::vector>> all_indices; + std::vector batch_starts = {0}; + for (int64_t i = 0; i < batch_size; ++i) { + framework::Tensor ins_score = input_scores->Slice(i, i + 1); + ins_score.Resize({class_num, 
predict_dim}); + + framework::Tensor ins_boxes = input_bboxes->Slice(i, i + 1); + ins_boxes.Resize({predict_dim, box_dim}); + + std::map> indices; + int num_nmsed_out = 0; + MultiClassNMS(ins_score, ins_boxes, &indices, &num_nmsed_out, + background_label, nms_top_k, keep_top_k, nms_threshold, + nms_eta, score_threshold); + all_indices.push_back(indices); + batch_starts.push_back(batch_starts.back() + num_nmsed_out); + } + + int num_kept = batch_starts.back(); + if (num_kept == 0) { + float* od = outs->mutable_data({1}); + od[0] = -1; + } else { + int64_t out_dim = box_dim + 2; + outs->mutable_data({num_kept, out_dim}); + for (int64_t i = 0; i < batch_size; ++i) { + framework::Tensor ins_score = input_scores->Slice(i, i + 1); + ins_score.Resize({class_num, predict_dim}); + + framework::Tensor ins_boxes = input_bboxes->Slice(i, i + 1); + ins_boxes.Resize({predict_dim, box_dim}); + + int64_t s = batch_starts[i]; + int64_t e = batch_starts[i + 1]; + if (e > s) { + framework::Tensor out = outs->Slice(s, e); + MultiClassOutput(ins_score, ins_boxes, all_indices[i], &out); + } + } + } + DLOG << "yangfei20"; + outs_image->InitEmptyImage(context, commandQueue, outs->dims()); + framework::TensorToCLImage(outs, outs_image, context, commandQueue, kernel1); + DLOG << *outs; + delete (input_bboxes); + delete (input_scores); + delete (outs); + DLOG << "yangfei20"; +} template <> void MultiClassNMSKernel::Compute( - const MultiClassNMSParam ¶m) {} + const MultiClassNMSParam& param) { + auto kernel0 = this->cl_helper_.KernelAt(0); + auto kernel1 = this->cl_helper_.KernelAt(1); + MultiClassNMSCompute(param, this->cl_helper_.CLContext(), + this->cl_helper_.CLCommandQueue(), kernel0, + kernel1); +} } // namespace operators } // namespace paddle_mobile diff --git a/src/operators/kernel/cl/prior_box_kernel.cpp b/src/operators/kernel/cl/prior_box_kernel.cpp index 1f8843787bc68c8be681e4c2a79053714b76dc4e..92764b379e8dad8070407fcf012b4bad73fd19a1 100644 --- a/src/operators/kernel/cl/prior_box_kernel.cpp +++ b/src/operators/kernel/cl/prior_box_kernel.cpp @@ -39,6 +39,10 @@ void PriorBoxKernel::Compute( const auto &input_aspect_ratio = param.AspectRatios(); const bool &flip = param.Flip(); const bool &clip = param.Clip(); + int isclip = 0; + if (clip) { + isclip = 1; + } const float &step_w = param.StepW(); const float &step_h = param.StepH(); const float &offset = param.Offset(); @@ -75,6 +79,8 @@ void PriorBoxKernel::Compute( paddle_mobile::memory::Alloc(sizeof(float) * num_priors)); float *box_height = static_cast( paddle_mobile::memory::Alloc(sizeof(float) * num_priors)); + float *variancesptr = + static_cast(paddle_mobile::memory::Alloc(sizeof(float) * 4)); int idx = 0; for (size_t s = 0; s < min_sizes.size(); ++s) { auto min_size = min_sizes[s]; @@ -108,6 +114,9 @@ void PriorBoxKernel::Compute( } } } + for (int i = 0; i < variances.size(); i++) { + variancesptr[i] = variances[i]; + } cl_int status; auto kernel = this->cl_helper_.KernelAt(0); auto default_work_size = @@ -116,7 +125,7 @@ void PriorBoxKernel::Compute( int w = default_work_size[1]; int nh = default_work_size[2]; - std::vector box_shape({1, 1, 1, num_priors}); + std::vector box_shape({num_priors}); framework::DDim ddim = framework::make_ddim(box_shape); framework::CLTensor box_width_cl_tensor(this->cl_helper_.CLContext(), @@ -131,16 +140,33 @@ void PriorBoxKernel::Compute( cl_mem box_height_Buffer = box_height_cl_tensor.mutable_with_data(box_height); - DLOG << "c_block:" << c_block; - DLOG << "w:" << w; - DLOG << "nh:" << nh; - DLOG << "step_width:" 
<< step_width; - DLOG << "step_height:" << step_height; - DLOG << "offset:" << offset; - DLOG << "img_width:" << img_width; - DLOG << "img_height:" << img_height; - DLOG << "num_priors:" << num_priors; - DLOG << "C:" << C; + framework::CLTensor variances_cl_tensor(this->cl_helper_.CLContext(), + this->cl_helper_.CLCommandQueue()); + + std::vector variances_shape({4}); + framework::DDim vddim = framework::make_ddim(variances_shape); + + variances_cl_tensor.Resize(vddim); + cl_mem variances_Buffer = + variances_cl_tensor.mutable_with_data(variancesptr); + + // DLOG << "c_block:" << c_block; + // DLOG << "w:" << w; + // DLOG << "nh:" << nh; + // DLOG << "step_width:" << step_width; + // DLOG << "step_height:" << step_height; + // DLOG << "offset:" << offset; + // DLOG << "img_width:" << img_width; + // DLOG << "img_height:" << img_height; + // DLOG << "num_priors:" << num_priors; + // DLOG << "C:" << C; + // DLOG << "isclip:" << isclip; + // printf("param.MinMaxAspectRatiosOrder() = + // %d\n",param.MinMaxAspectRatiosOrder()); for (int i = 0; i < + // num_priors; i++) { + // DLOG << box_width[i]; + // DLOG << box_height[i]; + // } status = clSetKernelArg(kernel, 0, sizeof(int), &c_block); CL_CHECK_ERRORS(status); status = clSetKernelArg(kernel, 1, sizeof(int), &w); @@ -151,28 +177,36 @@ void PriorBoxKernel::Compute( CL_CHECK_ERRORS(status); status = clSetKernelArg(kernel, 4, sizeof(cl_mem), &box_height_Buffer); CL_CHECK_ERRORS(status); - status = clSetKernelArg(kernel, 5, sizeof(cl_mem), &output_boxes); + status = clSetKernelArg(kernel, 5, sizeof(cl_mem), &variances_Buffer); CL_CHECK_ERRORS(status); - status = clSetKernelArg(kernel, 6, sizeof(float), &step_width); + status = clSetKernelArg(kernel, 6, sizeof(cl_mem), &output_boxes); CL_CHECK_ERRORS(status); - status = clSetKernelArg(kernel, 7, sizeof(float), &step_height); + status = clSetKernelArg(kernel, 7, sizeof(cl_mem), &output_variances); CL_CHECK_ERRORS(status); - status = clSetKernelArg(kernel, 8, sizeof(float), &offset); + status = clSetKernelArg(kernel, 8, sizeof(float), &step_width); CL_CHECK_ERRORS(status); - status = clSetKernelArg(kernel, 9, sizeof(int), &img_width); + status = clSetKernelArg(kernel, 9, sizeof(float), &step_height); CL_CHECK_ERRORS(status); - status = clSetKernelArg(kernel, 10, sizeof(int), &img_height); + status = clSetKernelArg(kernel, 10, sizeof(float), &offset); CL_CHECK_ERRORS(status); - status = clSetKernelArg(kernel, 11, sizeof(int), &num_priors); + status = clSetKernelArg(kernel, 11, sizeof(int), &img_width); CL_CHECK_ERRORS(status); - status = clSetKernelArg(kernel, 12, sizeof(int), &C); + status = clSetKernelArg(kernel, 12, sizeof(int), &img_height); + CL_CHECK_ERRORS(status); + status = clSetKernelArg(kernel, 13, sizeof(int), &num_priors); + CL_CHECK_ERRORS(status); + status = clSetKernelArg(kernel, 14, sizeof(int), &C); + CL_CHECK_ERRORS(status); + status = clSetKernelArg(kernel, 15, sizeof(int), &isclip); CL_CHECK_ERRORS(status); size_t global_work_size[2] = {c_block, nh}; status = clEnqueueNDRangeKernel(this->cl_helper_.CLCommandQueue(), kernel, 2, NULL, global_work_size, NULL, 0, NULL, NULL); CL_CHECK_ERRORS(status); + paddle_mobile::memory::Free(box_width); paddle_mobile::memory::Free(box_height); + paddle_mobile::memory::Free(variancesptr); } template class PriorBoxKernel; diff --git a/src/operators/kernel/cl/reshape_kernel.cpp b/src/operators/kernel/cl/reshape_kernel.cpp index fb3aa9b52f722b21cdc30e54eafadf9dffcfef7a..4e8d3e1d60a4f1fc2cc2f7a93feda18ed0ac0a97 100644 --- 
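Because the variances buffer is inserted at argument index 5 and output_variances at index 7, every later clSetKernelArg index in the prior_box host code shifts by two, and isclip is appended at index 15. The resulting layout is written out below as an illustrative enum; the enumerator names are assumptions, only the order and indices come from the calls above.

```
// Illustrative argument map for the updated prior_box kernel.
enum PriorBoxKernelArg {
  kCBlock = 0,
  kW,
  kNH,
  kBoxWidthBuffer,   // 3
  kBoxHeightBuffer,  // 4
  kVariancesBuffer,  // 5: newly inserted input buffer
  kOutputBoxes,      // 6
  kOutputVariances,  // 7: newly written output image
  kStepWidth,        // 8
  kStepHeight,       // 9
  kOffset,           // 10
  kImgWidth,         // 11
  kImgHeight,        // 12
  kNumPriors,        // 13
  kC,                // 14
  kIsClip            // 15: int flag derived from the bool clip
};
static_assert(kIsClip == 15, "prior_box now takes 16 kernel arguments");

int main() { return 0; }
```

If the .cl kernel's parameter list is edited, these indices have to be kept in lock-step with it, which is what the renumbered clSetKernelArg calls above do.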
a/src/operators/kernel/cl/reshape_kernel.cpp +++ b/src/operators/kernel/cl/reshape_kernel.cpp @@ -26,40 +26,76 @@ bool ReshapeKernel::Init(ReshapeParam *param) { template <> void ReshapeKernel::Compute(const ReshapeParam ¶m) { auto kernel = this->cl_helper_.KernelAt(0); + auto default_work_size = this->cl_helper_.DefaultWorkSize(*param.Out()); const auto *input = param.InputX(); auto *output = param.Out(); - auto inputImage = input->GetCLImage(); - auto outputImage = output->GetCLImage(); - clSetKernelArg(kernel, 0, sizeof(cl_mem), &inputImage); - clSetKernelArg(kernel, 1, sizeof(cl_mem), &outputImage); + auto input_image = input->GetCLImage(); + auto output_image = output->GetCLImage(); const auto &inputDim = input->dims(); const auto &outputDim = output->dims(); - int dims[4] = {1, 1, 1, 1}; - int odims[4] = {1, 1, 1, 1}; + int input_dims[4] = {1, 1, 1, 1}; + int output_dims[4] = {1, 1, 1, 1}; // 1 1000 1 1 for (int i = 0; i < inputDim.size(); i++) { - dims[4 - inputDim.size() + i] = inputDim[i]; + input_dims[4 - inputDim.size() + i] = inputDim[i]; } // 1 1 1 1000 for (int i = 0; i < outputDim.size(); i++) { - odims[4 - outputDim.size() + i] = outputDim[i]; + output_dims[4 - outputDim.size() + i] = outputDim[i]; } - clSetKernelArg(kernel, 2, sizeof(cl_int), &dims); - clSetKernelArg(kernel, 3, sizeof(cl_int), &dims[1]); - clSetKernelArg(kernel, 4, sizeof(cl_int), &dims[2]); - clSetKernelArg(kernel, 5, sizeof(cl_int), &dims[3]); - clSetKernelArg(kernel, 6, sizeof(cl_int), &odims); - clSetKernelArg(kernel, 7, sizeof(cl_int), &odims[1]); - clSetKernelArg(kernel, 8, sizeof(cl_int), &odims[1]); - clSetKernelArg(kernel, 9, sizeof(cl_int), &odims[1]); - const size_t work_size[2] = {output->ImageWidth(), output->ImageHeight()}; - // cl_event out_event = param.Out()->GetClEvent(); - // cl_event wait_event = param.InputX()->GetClEvent(); - - clEnqueueNDRangeKernel(this->cl_helper_.CLCommandQueue(), kernel, 2, NULL, - work_size, NULL, 0, NULL, NULL); + int out_C = output_dims[1]; + int out_H = output_dims[2]; + int out_W = output_dims[3]; + int in_W = input_dims[3]; + int in_H = input_dims[2]; + int in_Stride0 = in_W; + int in_Stride1 = input_dims[2] * input_dims[3]; + int in_Stride2 = input_dims[1] * input_dims[2] * input_dims[3]; + int out_Stride0 = out_W; + int out_Stride1 = out_H * out_W; + int out_Stride2 = out_C * out_H * out_W; + DLOG << "out_C=" << out_C; + DLOG << "out_H=" << out_H; + DLOG << "out_W=" << out_W; + DLOG << "in_W=" << in_W; + DLOG << "default_work_size=" << default_work_size; + DLOG << "in_Stride0=" << in_Stride0; + DLOG << "in_Stride1=" << in_Stride1; + DLOG << "out_Stride0=" << out_Stride0; + DLOG << "out_Stride1=" << out_Stride1; + cl_int status; + status = clSetKernelArg(kernel, 0, sizeof(cl_mem), &input_image); + CL_CHECK_ERRORS(status); + status = clSetKernelArg(kernel, 1, sizeof(cl_mem), &output_image); + CL_CHECK_ERRORS(status); + status = clSetKernelArg(kernel, 2, sizeof(int), &out_C); + CL_CHECK_ERRORS(status); + status = clSetKernelArg(kernel, 3, sizeof(int), &out_H); + CL_CHECK_ERRORS(status); + status = clSetKernelArg(kernel, 4, sizeof(int), &out_W); + CL_CHECK_ERRORS(status); + status = clSetKernelArg(kernel, 5, sizeof(int), &in_W); + CL_CHECK_ERRORS(status); + status = clSetKernelArg(kernel, 6, sizeof(int), &in_H); + CL_CHECK_ERRORS(status); + status = clSetKernelArg(kernel, 7, sizeof(int), &in_Stride0); + CL_CHECK_ERRORS(status); + status = clSetKernelArg(kernel, 8, sizeof(int), &in_Stride1); + CL_CHECK_ERRORS(status); + status = clSetKernelArg(kernel, 9, 
sizeof(int), &in_Stride2); + CL_CHECK_ERRORS(status); + status = clSetKernelArg(kernel, 10, sizeof(int), &out_Stride0); + CL_CHECK_ERRORS(status); + status = clSetKernelArg(kernel, 11, sizeof(int), &out_Stride1); + CL_CHECK_ERRORS(status); + status = clSetKernelArg(kernel, 12, sizeof(int), &out_Stride2); + CL_CHECK_ERRORS(status); + status = clEnqueueNDRangeKernel( + this->cl_helper_.CLCommandQueue(), kernel, default_work_size.size(), NULL, + default_work_size.data(), NULL, 0, NULL, NULL); + CL_CHECK_ERRORS(status); } template class ReshapeKernel; diff --git a/src/operators/kernel/cl/softmax_kernel.cpp b/src/operators/kernel/cl/softmax_kernel.cpp index 22e6672ee462b963476dc72895329a9117fc16a8..6447b68d3376a23e89df6ee635537a8c2ab3bde8 100644 --- a/src/operators/kernel/cl/softmax_kernel.cpp +++ b/src/operators/kernel/cl/softmax_kernel.cpp @@ -33,31 +33,24 @@ void SoftmaxKernel::Compute(const SoftmaxParam ¶m) { auto *output = param.Out(); auto inputImage = input->GetCLImage(); auto outputImage = output->GetCLImage(); + const auto &outputDim = output->dims(); - int group = output->ImageWidth(); + int dims[4] = {1, 1, 1, 1}; + + for (int i = 0; i < outputDim.size(); i++) { + dims[4 - outputDim.size() + i] = outputDim[i]; + } + + const int out_W = dims[3]; cl_int status; status = clSetKernelArg(kernel, 0, sizeof(cl_mem), &inputImage); + CL_CHECK_ERRORS(status); status = clSetKernelArg(kernel, 1, sizeof(cl_mem), &outputImage); - status = clSetKernelArg(kernel, 2, sizeof(int), &group); - - // const auto &inputDim = input->dims(); - // - // int dims[4] = {1, 1, 1, 1}; - // - // for (int i = 0; i < inputDim.size(); i++) { - // dims[4 - inputDim.size() + i] = inputDim[i]; - // } - // - // clSetKernelArg(kernel, 2, sizeof(int), &dims); - // clSetKernelArg(kernel, 3, sizeof(int), &dims[1]); - // clSetKernelArg(kernel, 4, sizeof(int), &dims[2]); - // clSetKernelArg(kernel, 5, sizeof(int), &dims[3]); - - // cl_event out_event = param.Out()->GetClEvent(); - // cl_event wait_event = param.InputX()->GetClEvent(); - + CL_CHECK_ERRORS(status); + status = clSetKernelArg(kernel, 2, sizeof(int), &out_W); + CL_CHECK_ERRORS(status); status = clEnqueueNDRangeKernel( this->cl_helper_.CLCommandQueue(), kernel, default_work_size.size(), NULL, default_work_size.data(), NULL, 0, NULL, NULL); diff --git a/src/operators/kernel/cl/transpose_kernel.cpp b/src/operators/kernel/cl/transpose_kernel.cpp index 3b41753f186967b7dd419fed05201989d45252b6..d3133449b916193b9bdb57f5398e7a1082a65749 100644 --- a/src/operators/kernel/cl/transpose_kernel.cpp +++ b/src/operators/kernel/cl/transpose_kernel.cpp @@ -20,12 +20,113 @@ namespace operators { template <> bool TransposeKernel::Init(TransposeParam *param) { + if (param->Out()->dims().size() == 4) { + this->cl_helper_.AddKernel("transpose_4d", "transpose_kernel.cl"); + } else if (param->Out()->dims().size() < 4) { + this->cl_helper_.AddKernel("transpose", "transpose_kernel.cl"); + } return true; } template <> void TransposeKernel::Compute( - const TransposeParam ¶m) {} + const TransposeParam ¶m) { + if (param.Out()->dims().size() == 4) { + auto kernel = this->cl_helper_.KernelAt(0); + auto default_work_size = this->cl_helper_.DefaultWorkSize(*param.Out()); + int out_C = param.Out()->dims()[1]; + int out_H = param.Out()->dims()[2]; + int out_W = param.Out()->dims()[3]; + int in_W = param.InputX()->dims()[3]; + auto output_image = param.Out()->GetCLImage(); + auto input_image = param.InputX()->GetCLImage(); + DLOG << "out_C=" << out_C; + DLOG << "out_H=" << out_H; + DLOG << "out_W=" << 
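The rewritten reshape host code above passes explicit NCHW strides (Stride0 = W, Stride1 = H*W, Stride2 = C*H*W) instead of the old packed dims arrays. The sketch below, with the 1x1000x1x1 -> 1x1x1x1000 shapes taken from the comments in that hunk, shows how those strides linearize an index and why the reshape leaves the flat offset unchanged; it is a standalone illustration, not the kernel's code.

```
#include <cassert>

// NCHW strides as defined in the reshape host code above:
// Stride0 = W, Stride1 = H * W, Stride2 = C * H * W.
struct Dims { int n, c, h, w; };

static int Offset(const Dims& d, int n, int c, int h, int w) {
  const int stride0 = d.w;
  const int stride1 = d.h * d.w;
  const int stride2 = d.c * d.h * d.w;
  return n * stride2 + c * stride1 + h * stride0 + w;
}

int main() {
  // Reshaping 1x1000x1x1 to 1x1x1x1000 keeps every element's flat offset,
  // which is why only the stride integers change, not the data layout.
  const Dims in{1, 1000, 1, 1};
  const Dims out{1, 1, 1, 1000};
  assert(Offset(in, 0, 123, 0, 0) == Offset(out, 0, 0, 0, 123));
  return 0;
}
```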
out_W; + DLOG << "in_C=" << in_W; + DLOG << "default_work_size=" << default_work_size; + cl_int status; + status = clSetKernelArg(kernel, 0, sizeof(cl_mem), &input_image); + CL_CHECK_ERRORS(status); + status = clSetKernelArg(kernel, 1, sizeof(cl_mem), &output_image); + CL_CHECK_ERRORS(status); + status = clSetKernelArg(kernel, 2, sizeof(int), &out_C); + CL_CHECK_ERRORS(status); + status = clSetKernelArg(kernel, 3, sizeof(int), &out_H); + CL_CHECK_ERRORS(status); + status = clSetKernelArg(kernel, 4, sizeof(int), &out_W); + CL_CHECK_ERRORS(status); + status = clSetKernelArg(kernel, 5, sizeof(int), &in_W); + CL_CHECK_ERRORS(status); + status = clEnqueueNDRangeKernel( + this->cl_helper_.CLCommandQueue(), kernel, default_work_size.size(), + NULL, default_work_size.data(), NULL, 0, NULL, NULL); + CL_CHECK_ERRORS(status); + } else if (param.Out()->dims().size() == 3) { + auto kernel = this->cl_helper_.KernelAt(0); + auto default_work_size = this->cl_helper_.DefaultWorkSize(*param.Out()); + int out_C = param.Out()->dims()[0]; + int out_H = param.Out()->dims()[1]; + int out_W = param.Out()->dims()[2]; + int in_W = param.InputX()->dims()[2]; + auto output_image = param.Out()->GetCLImage(); + auto input_image = param.InputX()->GetCLImage(); + DLOG << "out_C=" << out_C; + DLOG << "out_H=" << out_H; + DLOG << "out_W=" << out_W; + DLOG << "in_C=" << in_W; + DLOG << "default_work_size=" << default_work_size; + cl_int status; + status = clSetKernelArg(kernel, 0, sizeof(cl_mem), &input_image); + CL_CHECK_ERRORS(status); + status = clSetKernelArg(kernel, 1, sizeof(cl_mem), &output_image); + CL_CHECK_ERRORS(status); + status = clSetKernelArg(kernel, 2, sizeof(int), &out_C); + CL_CHECK_ERRORS(status); + status = clSetKernelArg(kernel, 3, sizeof(int), &out_H); + CL_CHECK_ERRORS(status); + status = clSetKernelArg(kernel, 4, sizeof(int), &out_W); + CL_CHECK_ERRORS(status); + status = clSetKernelArg(kernel, 5, sizeof(int), &in_W); + CL_CHECK_ERRORS(status); + status = clEnqueueNDRangeKernel( + this->cl_helper_.CLCommandQueue(), kernel, default_work_size.size(), + NULL, default_work_size.data(), NULL, 0, NULL, NULL); + CL_CHECK_ERRORS(status); + + } else if (param.Out()->dims().size() == 2) { + auto kernel = this->cl_helper_.KernelAt(0); + auto default_work_size = this->cl_helper_.DefaultWorkSize(*param.Out()); + int out_C = 1; + int out_H = param.Out()->dims()[0]; + int out_W = param.Out()->dims()[1]; + int in_W = param.InputX()->dims()[1]; + auto output_image = param.Out()->GetCLImage(); + auto input_image = param.InputX()->GetCLImage(); + DLOG << "out_C=" << out_C; + DLOG << "out_H=" << out_H; + DLOG << "out_W=" << out_W; + DLOG << "in_C=" << in_W; + DLOG << "default_work_size=" << default_work_size; + cl_int status; + status = clSetKernelArg(kernel, 0, sizeof(cl_mem), &input_image); + CL_CHECK_ERRORS(status); + status = clSetKernelArg(kernel, 1, sizeof(cl_mem), &output_image); + CL_CHECK_ERRORS(status); + status = clSetKernelArg(kernel, 2, sizeof(int), &out_C); + CL_CHECK_ERRORS(status); + status = clSetKernelArg(kernel, 3, sizeof(int), &out_H); + CL_CHECK_ERRORS(status); + status = clSetKernelArg(kernel, 4, sizeof(int), &out_W); + CL_CHECK_ERRORS(status); + status = clSetKernelArg(kernel, 5, sizeof(int), &in_W); + CL_CHECK_ERRORS(status); + status = clEnqueueNDRangeKernel( + this->cl_helper_.CLCommandQueue(), kernel, default_work_size.size(), + NULL, default_work_size.data(), NULL, 0, NULL, NULL); + CL_CHECK_ERRORS(status); + } +} } // namespace operators } // namespace paddle_mobile diff --git 
a/src/operators/kernel/fpga/V1/conv_add_bn_relu_kernel.cpp b/src/operators/kernel/fpga/V1/conv_add_kernel.cpp similarity index 58% rename from src/operators/kernel/fpga/V1/conv_add_bn_relu_kernel.cpp rename to src/operators/kernel/fpga/V1/conv_add_kernel.cpp index 6c99750eb824940b32a857ee2baffc72bce05a7a..5ad4c86441f7870b00e6639e7cda22083d3c10d5 100644 --- a/src/operators/kernel/fpga/V1/conv_add_bn_relu_kernel.cpp +++ b/src/operators/kernel/fpga/V1/conv_add_kernel.cpp @@ -12,49 +12,31 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#ifdef FUSION_CONVADDBNRELU_OP +#ifdef FUSION_CONVADD_OP -#include "operators/kernel/conv_add_bn_relu_kernel.h" +#include "operators/kernel/conv_add_kernel.h" namespace paddle_mobile { namespace operators { template <> -bool ConvAddBNReluKernel::Init( - FusionConvAddBNReluParam *param) { - bool relu_enabled = true; +bool ConvAddKernel::Init(FusionConvAddParam *param) { + bool relu_enabled = false; auto input = const_cast(param->Input()); const Tensor *bias = param->Bias(); auto bias_ptr = bias->data(); auto filter = const_cast(param->Filter()); auto out = param->Output(); - auto bn_mean_ptr = param->InputMean()->data(); - auto bn_var_ptr = param->InputVariance()->data(); - auto bn_scale_ptr = param->InputScale()->data(); - auto bn_bias_ptr = param->InputBias()->data(); - const float epsilon = param->Epsilon(); - PADDLE_MOBILE_ENFORCE(out->dims()[1] == bias->dims()[0] && - bias->dims()[0] == param->InputBias()->dims()[0], - "Output channel should be equal to bias number"); - const int channel = out->dims()[1]; + PADDLE_MOBILE_ENFORCE(out->dims()[1] == bias->dims()[0], + "Output channel should be equal to bias number"); + int channel = out->dims()[1]; auto bs_ptr = (float *)fpga::fpga_malloc(2 * channel * sizeof(float)); // NOLINT - auto new_scale = new Tensor(); - auto new_bias = new Tensor(); - auto new_scale_ptr = new_scale->mutable_data({channel}); - auto new_bias_ptr = new_bias->mutable_data({channel}); - for (int i = 0; i < channel; i++) { - new_scale_ptr[i] = bn_scale_ptr[i] / - static_cast(pow((bn_var_ptr[i] + epsilon), 0.5)); - new_bias_ptr[i] = - bn_bias_ptr[i] + (bias_ptr[i] - bn_mean_ptr[i]) * new_scale_ptr[i]; - bs_ptr[i + 2] = new_scale_ptr[i]; - bs_ptr[i] = new_bias_ptr[i]; + bs_ptr[i + channel] = 1; + bs_ptr[i] = bias_ptr[i]; } - param->SetNewScale(new_scale); - param->SetNewBias(new_bias); float max_value = fpga::filter_find_max(filter); fpga::format_filter(filter, max_value, param->Groups()); @@ -75,8 +57,8 @@ bool ConvAddBNReluKernel::Init( } template <> -void ConvAddBNReluKernel::Compute( - const FusionConvAddBNReluParam ¶m) { +void ConvAddKernel::Compute( + const FusionConvAddParam ¶m) { fpga::ComputeFpgaConv(param.FpgaArgs()); } diff --git a/src/operators/kernel/fpga/V2/conv_transpose_kernel.cpp b/src/operators/kernel/fpga/V1/deconv_add_kernel.cpp similarity index 70% rename from src/operators/kernel/fpga/V2/conv_transpose_kernel.cpp rename to src/operators/kernel/fpga/V1/deconv_add_kernel.cpp index 3284ddcdece3ab7fcf4fb4458a59d39c452ad1ce..39d7e818976b56eaea8649392784e7b5dc8b7e1f 100644 --- a/src/operators/kernel/fpga/V2/conv_transpose_kernel.cpp +++ b/src/operators/kernel/fpga/V1/deconv_add_kernel.cpp @@ -12,21 +12,23 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ -#ifdef CONV_TRANSPOSE_OP +#ifdef FUSION_DECONVADD_OP -#include "operators/kernel/conv_transpose_kernel.h" +#include "operators/kernel/deconv_add_kernel.h" +#include "framework/operator.h" +#include "operators/op_param.h" namespace paddle_mobile { namespace operators { template <> -bool ConvTransposeKernel::Init(ConvTransposeParam *param) { +bool DeconvAddKernel::Init(FusionDeconvAddParam *param) { return true; } template <> -void ConvTransposeKernel::Compute( - const ConvTransposeParam ¶m) {} +void DeconvAddKernel::Compute( + const FusionDeconvAddParam ¶m) {} } // namespace operators } // namespace paddle_mobile diff --git a/src/operators/kernel/fpga/V1/deconv_add_relu_kernel.cpp b/src/operators/kernel/fpga/V1/deconv_add_relu_kernel.cpp new file mode 100644 index 0000000000000000000000000000000000000000..ef2556208a8650a86522264f40f42cb596ec4190 --- /dev/null +++ b/src/operators/kernel/fpga/V1/deconv_add_relu_kernel.cpp @@ -0,0 +1,37 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#ifdef FUSION_DECONVADDRELU_OP + +#include "operators/kernel/deconv_add_relu_kernel.h" +#include "framework/operator.h" +#include "operators/op_param.h" + +namespace paddle_mobile { +namespace operators { + +template <> +bool DeconvAddReluKernel::Init( + FusionDeconvAddReluParam *param) { + return true; +} + +template <> +void DeconvAddReluKernel::Compute( + const FusionDeconvAddReluParam ¶m) {} + +} // namespace operators +} // namespace paddle_mobile + +#endif diff --git a/src/operators/kernel/fpga/V1/elementwise_add_kernel.cpp b/src/operators/kernel/fpga/V1/elementwise_add_kernel.cpp new file mode 100644 index 0000000000000000000000000000000000000000..f8eeb53159411276fbab957c676a01cb31b597c8 --- /dev/null +++ b/src/operators/kernel/fpga/V1/elementwise_add_kernel.cpp @@ -0,0 +1,64 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ +#ifdef ELEMENTWISEADD_OP + +#include "operators/kernel/elementwise_add_kernel.h" + +namespace paddle_mobile { +namespace operators { + +template <> +bool ElementwiseAddKernel::Init(ElementwiseAddParam *param) { + bool relu_enabled = false; + auto *input_x = const_cast(param->InputX()); + auto *input_y = const_cast(param->InputY()); + auto *out = param->Out(); + auto input_x_ptr = input_x->data(); + auto input_y_ptr = input_y->data(); + fpga::format_fp16_ofm(out); + auto out_ptr = out->mutable_data(); + + fpga::EWAddArgs ewaddArgs = {0}; + ewaddArgs.relu_enabled = relu_enabled; + ewaddArgs.const0 = 0x3c00; // =1 + ewaddArgs.const1 = 0x3c00; // =1 + ewaddArgs.image0.address = input_x_ptr; + ewaddArgs.image0.channels = (uint32_t)input_x->dims()[1]; + ewaddArgs.image0.scale_address = input_x->scale; + ewaddArgs.image0.height = (uint32_t)input_x->dims()[2]; + ewaddArgs.image0.width = (uint32_t)input_x->dims()[3]; + ewaddArgs.image0.pad_height = 0; + ewaddArgs.image0.pad_width = 0; + ewaddArgs.image1.address = input_y_ptr; + ewaddArgs.image1.channels = (uint32_t)input_y->dims()[1]; + ewaddArgs.image1.scale_address = input_y->scale; + ewaddArgs.image1.height = (uint32_t)input_y->dims()[2]; + ewaddArgs.image1.width = (uint32_t)input_y->dims()[3]; + ewaddArgs.image1.pad_height = 0; + ewaddArgs.image1.pad_width = 0; + ewaddArgs.output.scale_address = out->scale; + ewaddArgs.output.address = out_ptr; + param->SetFpgaArgs(ewaddArgs); + return true; +} + +template <> +void ElementwiseAddKernel::Compute( + const ElementwiseAddParam ¶m) { + fpga::ComputeFpgaEWAdd(param.FpgaArgs()); +} +} // namespace operators +} // namespace paddle_mobile + +#endif diff --git a/src/operators/kernel/fpga/V1/fc_relu_kernel.cpp b/src/operators/kernel/fpga/V1/fc_relu_kernel.cpp deleted file mode 100644 index 2c6b616689dca14474d1cbdc3769b438de1358e4..0000000000000000000000000000000000000000 --- a/src/operators/kernel/fpga/V1/fc_relu_kernel.cpp +++ /dev/null @@ -1,70 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
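In the EW-add arguments above, const0 and const1 are both set to 0x3c00, which the comments note equals 1: it is the IEEE 754 half-precision encoding of 1.0 (sign 0, exponent 01111 with bias 15, zero mantissa), presumably the per-input coefficients of the element-wise add. A small decoding sketch for normal half values follows; subnormals, infinities and NaNs are skipped for brevity.

```
#include <cstdint>
#include <cstdio>

// Decode a normal IEEE 754 half-precision value: 1 sign bit, 5 exponent bits
// (bias 15), 10 mantissa bits.
static float HalfToFloat(uint16_t h) {
  const int sign = (h >> 15) & 0x1;
  const int exp = (h >> 10) & 0x1f;
  const int frac = h & 0x3ff;
  if (exp == 0 || exp == 0x1f) return 0.0f;  // subnormal/inf/NaN: out of scope here
  float value = 1.0f + static_cast<float>(frac) / 1024.0f;
  for (int e = exp - 15; e > 0; --e) value *= 2.0f;
  for (int e = exp - 15; e < 0; ++e) value *= 0.5f;
  return sign ? -value : value;
}

int main() {
  std::printf("0x3c00 -> %g\n", HalfToFloat(0x3c00));  // prints 1
  return 0;
}
```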
*/ -#ifdef FUSION_FCRELU_OP -#include "operators/kernel/fc_relu_kernel.h" - -namespace paddle_mobile { -namespace operators { - -template <> -bool FusionFcReluKernel::Init(FusionFcReluParam *param) { - bool relu_enabled = true; - auto input_x = const_cast(param->InputX()); - auto filter = const_cast(param->InputY()); - auto input_z = param->InputZ(); - auto input_z_ptr = input_z->data(); - auto out = param->Out(); - PADDLE_MOBILE_ENFORCE(input_x->dims()[1] == filter->dims()[0], - "Image channel should be equal to weight number"); - int channel = (uint32_t)out->dims()[1]; - auto bs_ptr = - (float *)fpga::fpga_malloc(2 * channel * sizeof(float)); // NOLINT - for (int i = 0; i < channel; i++) { - bs_ptr[i + channel] = 1; - bs_ptr[i] = input_z_ptr[i]; - } - - int num = (uint32_t)filter->dims()[1]; - int chw = (uint32_t)filter->dims()[0]; - PADDLE_MOBILE_ENFORCE( - chw == input_x->numel(), - "Filter element num should be equal to IFM element num"); - int height = (uint32_t)input_x->dims()[2]; - int width = (uint32_t)input_x->dims()[3]; - int filter_channel = chw / height / width; - - out->Resize(framework::make_ddim({1, channel, 1, 1})); - filter->Resize(framework::make_ddim({num, filter_channel, height, width})); - float max_value = fpga::filter_find_max(filter); - fpga::format_fc_filter(filter, max_value); - - int element_num_per_div = fpga::get_filter_num_per_div(filter, 1); - fpga::format_bias_scale_array(&bs_ptr, element_num_per_div, channel); - fpga::format_fp16_ofm(out); - - fpga::SplitConvArgs conv_arg = {0}; - fpga::fill_split_arg(&conv_arg, input_x, out, filter, relu_enabled, 1, 1, 1, - 0, 0, bs_ptr); - param->SetFpgaArgs(conv_arg); - return true; -} -template <> -void FusionFcReluKernel::Compute( - const FusionFcReluParam ¶m) { - fpga::ComputeFpgaConv(param.FpgaArgs()); -} - -} // namespace operators -} // namespace paddle_mobile -#endif diff --git a/src/operators/kernel/fpga/V1/split_kernel.cpp b/src/operators/kernel/fpga/V1/split_kernel.cpp new file mode 100644 index 0000000000000000000000000000000000000000..b8c0bb3be64d2393b61b0f82375c695000f52b65 --- /dev/null +++ b/src/operators/kernel/fpga/V1/split_kernel.cpp @@ -0,0 +1,64 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#ifdef SPLIT_OP + +#include "operators/kernel/split_kernel.h" + +namespace paddle_mobile { +namespace operators { +template <> +bool SplitKernel::Init(SplitParam *param) { + auto *in = const_cast(param->InputX()); + auto outs = param->Outs(); + auto sections = param->Sections(); + int axis = param->Axis(); + PADDLE_MOBILE_ENFORCE(axis == 1, "Only support split in channel dimension"); + PADDLE_MOBILE_ENFORCE(outs.size() == sections.size(), + "Output number should be equal to section number"); + auto image_num = (uint32_t)outs.size(); + auto images_out = + reinterpret_cast(fpga::fpga_malloc(image_num * sizeof(void *))); + auto scales_out = reinterpret_cast( + fpga::fpga_malloc(image_num * sizeof(float *))); + auto out_channels = reinterpret_cast( + fpga::fpga_malloc(image_num * sizeof(uint32_t))); + for (int i = 0; i < image_num; i++) { + fpga::format_fp16_ofm(outs[i]); + images_out[i] = outs[i]->mutable_data(); + scales_out[i] = outs[i]->scale; + out_channels[i] = (uint32_t)sections[i]; + } + + fpga::SplitArgs arg = {0}; + arg.image_num = image_num; + arg.image_in = (half *)in->data(); + arg.scale_in = in->scale; + arg.images_out = images_out; + arg.scales_out = scales_out; + arg.out_channel_nums = out_channels; + arg.height = (uint32_t)in->dims()[2]; + arg.width = (uint32_t)in->dims()[3]; + + param->SetFpgaArgs(arg); + return true; +} +template <> +void SplitKernel::Compute(const SplitParam ¶m) { + fpga::ComputeFPGASplit(param.FpgaArgs()); +} + +} // namespace operators +} // namespace paddle_mobile +#endif diff --git a/src/operators/kernel/fpga/V1/tanh_kernel.cpp b/src/operators/kernel/fpga/V1/tanh_kernel.cpp new file mode 100644 index 0000000000000000000000000000000000000000..216cb726e3fe93e9ebfaf328a9ab4ca0725b6bb1 --- /dev/null +++ b/src/operators/kernel/fpga/V1/tanh_kernel.cpp @@ -0,0 +1,77 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
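SplitKernel above only accepts axis == 1 and hands the FPGA one output image per section, with out_channel_nums[i] = sections[i]; for a channel-wise partition the sections therefore have to add up to the input's channel count. Below is a tiny illustrative check with hypothetical numbers (64 input channels split as 16/16/32 is an assumption, not taken from the patch).

```
#include <cassert>
#include <numeric>
#include <vector>

int main() {
  // Hypothetical shapes: 64 input channels split along axis 1
  // into sections of 16, 16 and 32 channels.
  const int in_channels = 64;
  const std::vector<int> sections = {16, 16, 32};
  assert(std::accumulate(sections.begin(), sections.end(), 0) == in_channels);

  int start = 0;
  for (int s : sections) {
    // Each output covers input channels [start, start + s), i.e.
    // out_channel_nums[i] = sections[i] in the FPGA SplitArgs.
    start += s;
  }
  assert(start == in_channels);
  return 0;
}
```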
*/ + +#ifdef TANH_OP + +#include "operators/kernel/tanh_kernel.h" +#include +namespace paddle_mobile { +namespace operators { + +template <> +bool TanhKernel::Init(TanhParam *param) { + auto input = const_cast(param->InputX()); + auto input_ptr = input->data(); + auto float_input = new Tensor; + float_input->mutable_data( + {1, input->dims()[1], input->dims()[2], input->dims()[3]}); + fpga::format_fp32_ofm(float_input); + + fpga::BypassArgs args = {fpga::DATA_TYPE_FP16}; + args.input_layout_type = fpga::LAYOUT_HWC; + args.output_layout_type = fpga::LAYOUT_CHW; + args.input_data_type = fpga::DATA_TYPE_FP16; + args.output_data_type = fpga::DATA_TYPE_FP32; + args.image.address = input_ptr; + args.image.height = (uint32_t)input->dims()[2]; + args.image.width = (uint32_t)input->dims()[3]; + args.image.channels = (uint32_t)input->dims()[1]; + args.output.address = float_input->data(); + args.output.scale_address = float_input->scale; + param->SetFloatInput(float_input); + param->SetFpgaArgs(args); + return true; +} + +#define EXP_MAX_INPUT 40.0 +template +T Tanh(const T a) { + T tmp = -2.0 * a; + tmp = (tmp > EXP_MAX_INPUT) ? EXP_MAX_INPUT : tmp; + return (2.0 / (1.0 + exp(tmp))) - 1.0; +} +template +void tanhFuntor(Tensor *input, Tensor *output) { + auto *input_ptr = input->data(); + auto *output_ptr = output->mutable_data(); + for (int i = 0; i < input->numel(); i++) { + *(output_ptr + i) = Tanh(*(input_ptr + i)); + } +} +template <> +void TanhKernel::Compute(const TanhParam ¶m) { + Tensor *in_x = param.FloatInput(); + Tensor *out = param.Out(); + + fpga::PerformBypass(param.FpgaArgs()); + fpga::fpga_invalidate((void *)in_x->data(), + in_x->numel() * sizeof(float)); + tanhFuntor(in_x, out); + fpga::fpga_flush(out->data(), out->memory_size()); +} + +} // namespace operators +} // namespace paddle_mobile + +#endif diff --git a/src/operators/kernel/fpga/V1/transpose2_kernel.cpp b/src/operators/kernel/fpga/V1/transpose2_kernel.cpp new file mode 100644 index 0000000000000000000000000000000000000000..4505401f434c320003e8122a3a0e197441ae8921 --- /dev/null +++ b/src/operators/kernel/fpga/V1/transpose2_kernel.cpp @@ -0,0 +1,37 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
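TanhKernel runs a bypass to turn the fp16 input into fp32 and then applies the functor above on the CPU, computing tanh(a) as 2/(1+exp(-2a)) - 1 and clamping the exponent argument at EXP_MAX_INPUT (40) to avoid overflow in exp. A quick sketch checking that identity against std::tanh (standalone, illustrative only):

```
#include <cmath>
#include <cstdio>

// Same formula as the Tanh functor above: tanh(a) = 2 / (1 + exp(-2a)) - 1,
// with the exponent argument clamped at 40 to avoid overflow.
static double TanhViaExp(double a) {
  double t = -2.0 * a;
  if (t > 40.0) t = 40.0;
  return 2.0 / (1.0 + std::exp(t)) - 1.0;
}

int main() {
  const double xs[] = {-3.0, -0.5, 0.0, 0.5, 3.0};
  for (double x : xs)
    std::printf("x=%+.1f  formula=%+.6f  std::tanh=%+.6f\n", x, TanhViaExp(x), std::tanh(x));
  return 0;
}
```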
*/ +#ifdef TRANSPOSE2_OP + +#include "operators/kernel/transpose2_kernel.h" +#include "operators/kernel/central-arm-func/transpose2_arm_func.h" + +namespace paddle_mobile { +namespace operators { + +template <> +bool Transpose2Kernel::Init(Transpose2Param *param) { + param->Out()->ShareDataWith(*param->InputX()); + return true; +} + +template <> +void Transpose2Kernel::Compute( + const Transpose2Param ¶m) { + // Transpose2Compute(param); +} + +} // namespace operators +} // namespace paddle_mobile + +#endif diff --git a/src/operators/kernel/fpga/V2/fc_relu_kernel.cpp b/src/operators/kernel/fpga/V2/fc_relu_kernel.cpp deleted file mode 100644 index ba869aaca7f3f5d5c598feb3837a59a3a738493b..0000000000000000000000000000000000000000 --- a/src/operators/kernel/fpga/V2/fc_relu_kernel.cpp +++ /dev/null @@ -1,65 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ -#ifdef FUSION_FCRELU_OP -#include "operators/kernel/fc_relu_kernel.h" - -namespace paddle_mobile { -namespace operators { - -template <> -bool FusionFcReluKernel::Init(FusionFcReluParam *param) { - bool relu_enabled = true; - auto input_x = const_cast(param->InputX()); - auto filter = const_cast(param->InputY()); - auto input_z = param->InputZ(); - auto input_z_ptr = input_z->data(); - auto out = param->Out(); - PADDLE_MOBILE_ENFORCE(input_x->dims()[1] == filter->dims()[0], - "Image channel should be equal to weight number"); - int channel = (uint32_t)out->dims()[1]; - auto bs_ptr = - (float *)fpga::fpga_malloc(2 * channel * sizeof(float)); // NOLINT - for (int i = 0; i < channel; i++) { - bs_ptr[i + channel] = 1; - bs_ptr[i] = input_z_ptr[i]; - } - - int num = (uint32_t)filter->dims()[1]; - int chw = (uint32_t)filter->dims()[0]; - PADDLE_MOBILE_ENFORCE( - chw == input_x->numel(), - "Filter element num should be equal to IFM element num"); - int height = (uint32_t)input_x->dims()[2]; - int width = (uint32_t)input_x->dims()[3]; - int filter_channel = chw / height / width; - - out->Resize(framework::make_ddim({1, channel, 1, 1})); - filter->Resize(framework::make_ddim({num, filter_channel, height, width})); - fpga::format_fc_data(filter, out, bs_ptr); - - fpga::SplitConvArgs conv_arg = {0}; - fpga::fill_split_arg(&conv_arg, input_x, out, filter, relu_enabled, 1, 1, 1, - 0, 0, bs_ptr); - param->SetFpgaArgs(conv_arg); - return true; -} -template <> -void FusionFcReluKernel::Compute( - const FusionFcReluParam ¶m) { - fpga::ComputeFpgaConv(param.FpgaArgs()); -} - -} // namespace operators -} // namespace paddle_mobile -#endif diff --git a/src/operators/op_param.h b/src/operators/op_param.h index 1dc1ec8933fbf9a7402a5093d80d7b0400d73bb6..b6597b55a97afbe6c5e3efe8c72e627d34206f0e 100644 --- a/src/operators/op_param.h +++ b/src/operators/op_param.h @@ -863,6 +863,8 @@ class PriorBoxParam : public OpParam { if (HasAttr("min_max_aspect_ratios_order", attrs)) { min_max_aspect_ratios_order_ = GetAttr("min_max_aspect_ratios_order", attrs); + } else { + min_max_aspect_ratios_order_ = false; } flip_ = 
GetAttr("flip", attrs); clip_ = GetAttr("clip", attrs); @@ -1030,9 +1032,9 @@ class MultiClassNMSParam : public OpParam { score_threshold_ = GetAttr("score_threshold", attrs); } - const RType *InputBBoxes() const { return input_bboxes_; } + RType *InputBBoxes() const { return input_bboxes_; } - const RType *InputScores() const { return input_scores_; } + RType *InputScores() const { return input_scores_; } RType *Out() const { return out_; } @@ -1566,6 +1568,20 @@ class TanhParam : public OpParam { private: RType *input_x_; RType *out_; +#ifdef PADDLE_MOBILE_FPGA + + private: + std::shared_ptr float_input_x_; + fpga::BypassArgs fpga_bypass_args; + + public: + RType *FloatInput() const { + return float_input_x_ == nullptr ? input_x_ : float_input_x_.get(); + } + void SetFloatInput(Tensor *input) { float_input_x_.reset(input); } + const fpga::BypassArgs &FpgaArgs() const { return fpga_bypass_args; } + void SetFpgaArgs(const fpga::BypassArgs &args) { fpga_bypass_args = args; } +#endif }; #endif @@ -2223,7 +2239,6 @@ class DropoutParam : public OpParam { }; #endif -#ifdef CONV_TRANSPOSE_OP template class ConvTransposeParam : public OpParam { typedef typename DtypeTensorTrait::gtype GType; @@ -2278,7 +2293,7 @@ class ConvTransposeParam : public OpParam { void SetFpgaArgs(const fpga::DeconvArgs &args) { fpga_conv_args = args; } #endif }; -#endif + #ifdef FUSION_DECONVADD_OP template class FusionDeconvAddParam : public ConvTransposeParam { @@ -2434,6 +2449,15 @@ class SplitParam : public OpParam { int num; std::vector sections; // std::vector out_ts_; +#ifdef PADDLE_MOBILE_FPGA + + private: + fpga::SplitArgs fpga_split_args; + + public: + const fpga::SplitArgs &FpgaArgs() const { return fpga_split_args; } + void SetFpgaArgs(const fpga::SplitArgs &args) { fpga_split_args = args; } +#endif }; #endif diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt index 3202e9248ba55afb438a4b4fc894095531aa534f..bfd125ce5b75091cfac1a2a4e2f2f025da0178dc 100644 --- a/test/CMakeLists.txt +++ b/test/CMakeLists.txt @@ -67,7 +67,7 @@ if (CON GREATER -1) ADD_EXECUTABLE(test-resnet50 fpga/test_resnet50.cpp test_helper.h test_include.h executor_for_test.h) target_link_libraries(test-resnet50 paddle-mobile) - ADD_EXECUTABLE(test-densebox net/test_densebox_combine.cpp test_helper.h test_include.h executor_for_test.h) + ADD_EXECUTABLE(test-densebox fpga/test_densebox_combine.cpp test_helper.h test_include.h executor_for_test.h) target_link_libraries(test-densebox paddle-mobile) set(FOUND_MATCH ON) @@ -81,7 +81,7 @@ if (CON GREATER -1) ADD_EXECUTABLE(test-pe fpga/test_pe.cpp) target_link_libraries(test-pe paddle-mobile) - ADD_EXECUTABLE(test-densebox net/test_densebox_combine.cpp test_helper.h test_include.h executor_for_test.h) + ADD_EXECUTABLE(test-densebox fpga/test_densebox_combine.cpp test_helper.h test_include.h executor_for_test.h) target_link_libraries(test-densebox paddle-mobile) set(FOUND_MATCH ON) @@ -372,5 +372,8 @@ if (NOT FOUND_MATCH) ADD_EXECUTABLE(test-eng net/test_eng.cpp test_helper.h test_include.h) target_link_libraries(test-eng paddle-mobile) + # gen test + ADD_EXECUTABLE(test-super net/test_super.cpp test_helper.h test_include.h) + target_link_libraries(test-super paddle-mobile) #add_library(test-lib-size SHARED common/test_lib_size.h common/test_lib_size.cpp) endif () diff --git a/test/fpga/test_densebox_combine.cpp b/test/fpga/test_densebox_combine.cpp new file mode 100644 index 0000000000000000000000000000000000000000..056bbe52d89f69a444174846e602b44d9c581d03 --- /dev/null +++ 
b/test/fpga/test_densebox_combine.cpp @@ -0,0 +1,49 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include +#include "../test_helper.h" +#include "../test_include.h" + +#ifdef PADDLE_MOBILE_FPGA_V1 +#include "fpga/V1/api.h" +#endif +#ifdef PADDLE_MOBILE_FPGA_V2 +#include "fpga/V2/api.h" +#endif + +static const char *g_densebox_combine = "../models/densebox"; +int main() { + paddle_mobile::fpga::open_device(); + paddle_mobile::PaddleMobile paddle_mobile; + // paddle_mobile.SetThreadNum(4); + if (paddle_mobile.Load(std::string(g_densebox_combine) + "/model", + std::string(g_densebox_combine) + "/params", true)) { + // std::vector input; + // std::vector dims{1, 3, 512, 1024}; + // GetInput(g_test_image_1x3x224x224_banana, &input, dims); + + // auto vec_result = paddle_mobile.Predict(input, dims); + + Tensor input_tensor; + SetupTensor(&input_tensor, {1, 3, 512, 1024}, static_cast(0), + static_cast(1)); + // readStream(g_image_src_float, + // input_tensor.mutable_data({1, 3, 224, 224})); + paddle_mobile.FeedData(input_tensor); + paddle_mobile.Predict_To(-1); + } + + return 0; +} diff --git a/test/net/test_super.cpp b/test/net/test_super.cpp new file mode 100644 index 0000000000000000000000000000000000000000..1ee1fc0f852cd9f30391069032e556796e922fda --- /dev/null +++ b/test/net/test_super.cpp @@ -0,0 +1,64 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include +#include "../../src/common/types.h" +#include "../test_helper.h" +#include "../test_include.h" + +int main() { + paddle_mobile::PaddleMobile paddle_mobile; + // paddle_mobile.SetThreadNum(4); + auto time1 = paddle_mobile::time(); +#ifdef PADDLE_MOBILE_CL + paddle_mobile.SetCLPath("/data/local/tmp/bin"); +#endif + + auto isok = paddle_mobile.Load(std::string(g_super) + "/model", + std::string(g_super) + "/params", true, false, + 1, true); + + // auto isok = paddle_mobile.Load(std::string(g_mobilenet_mul), true); + if (isok) { + auto time2 = paddle_mobile::time(); + std::cout << "load cost :" << paddle_mobile::time_diff(time1, time2) << "ms" + << std::endl; + + std::vector input; + std::vector dims{1, 1, 300, 300}; + GetInput(g_yolo_img, &input, dims); + + std::vector vec_result; + + auto time3 = paddle_mobile::time(); + int max = 10; + for (int i = 0; i < max; ++i) { + vec_result = paddle_mobile.Predict(input, dims); + } + auto time4 = paddle_mobile::time(); + + std::cout << "predict cost :" + << paddle_mobile::time_diff(time3, time4) / max << "ms" + << std::endl; + std::vector::iterator biggest = + std::max_element(std::begin(vec_result), std::end(vec_result)); + std::cout << " Max element is " << *biggest << " at position " + << std::distance(std::begin(vec_result), biggest) << std::endl; + } + + std::cout << "如果结果Nan请查看: test/images/g_test_image_1x3x224x224_banana " + "是否存在?" + << std::endl; + return 0; +} diff --git a/test/test_helper.h b/test/test_helper.h index 0eb11efd19b7d937f93eec14e163c8c42cb77f12..60e907fa6fb301b2e944a6d938f662225a743d41 100644 --- a/test/test_helper.h +++ b/test/test_helper.h @@ -36,16 +36,19 @@ static const char *g_squeezenet = "../models/squeezenet"; static const char *g_googlenet = "../models/googlenet"; static const char *g_googlenet_quali = "../models/googlenet_combine_quali"; static const char *g_mobilenet = "../models/mobilenet"; -static const char *g_mobilenet_mul = "../models/mobilenet_mul"; +static const char *g_mobilenet_mul = "../models/r"; static const char *g_alexnet = "../models/alexnet"; static const char *g_inceptionv4 = "../models/inceptionv4"; +static const char *g_inceptionv3 = + "../models/InceptionV3_Spatial_Attention_Model"; static const char *g_nlp = "../models/nlp"; +static const char *g_super = "../models/superresoltion"; static const char *g_resnet_50 = "../models/resnet_50"; static const char *g_resnet = "../models/resnet"; static const char *g_googlenet_combine = "../models/googlenet_combine"; static const char *g_yolo = "../models/yolo"; static const char *g_yolo_combined = "../models/yolo_combined"; -static const char *g_yolo_mul = "../models/yolo_mul"; +static const char *g_yolo_mul = "../models/d"; static const char *g_fluid_fssd_new = "../models/fluid_fssd_new"; static const char *g_test_image_1x3x224x224 = "../images/test_image_1x3x224x224_float"; diff --git a/tools/op.cmake b/tools/op.cmake index 5901a23a1ff50c357d69cfff63cdfd543dbf8f9d..3a4a0597a44694c4edea8173af47627cb5680df2 100644 --- a/tools/op.cmake +++ b/tools/op.cmake @@ -102,7 +102,6 @@ if (CON GREATER -1) set(MUL_OP ON) set(RESHAPE_OP ON) set(SOFTMAX_OP ON) - set(FOUND_MATCH ON) endif() @@ -110,24 +109,25 @@ list(FIND NET "FPGA_NET_V1" CON) if (CON GREATER -1) message("FPGA_NET_V1 enabled") set(FUSION_CONVADDRELU_OP ON) - set(FUSION_CONVADDBNRELU_OP ON) - set(FUSION_CONVADDBN_OP ON) set(FUSION_ELEMENTWISEADDRELU_OP ON) set(FUSION_FC_OP ON) - set(FUSION_FCRELU_OP ON) set(POOL_OP ON) - set(CONCAT_OP ON) set(SOFTMAX_OP ON) set(FUSION_CONVBNRELU_OP ON) 
     set(FUSION_CONVBN_OP ON)
+    set(TANH_OP ON)
+    set(ELEMENTWISEADD_OP ON)
+    set(TRANSPOSE2_OP ON)
     set(FUSION_CONVADD_OP ON)
+    set(SPLIT_OP ON)
+    set(FUSION_DECONVADD_OP ON)
+    set(FUSION_DECONVADDRELU_OP ON)
     set(FOUND_MATCH ON)
 endif()
 list(FIND NET "FPGA_NET_V2" CON)
 if (CON GREATER -1)
     message("FPGA_NET_V2 enabled")
-    set(FEED_OP ON)
     set(FUSION_CONVADDRELU_OP ON)
     set(FUSION_ELEMENTWISEADDRELU_OP ON)
     set(FUSION_FC_OP ON)
@@ -135,9 +135,6 @@ if (CON GREATER -1)
     set(SOFTMAX_OP ON)
     set(FUSION_CONVBNRELU_OP ON)
     set(FUSION_CONVBN_OP ON)
-    set(CONV_TRANSPOSE_OP ON)
-    set(FUSION_DECONVRELU_OP ON)
-    #set(SLICE_OP ON)
     set(TANH_OP ON)
     set(ELEMENTWISEADD_OP ON)
     set(TRANSPOSE2_OP ON)
@@ -145,8 +142,6 @@ if (CON GREATER -1)
     set(SPLIT_OP ON)
     set(FUSION_DECONVADD_OP ON)
     set(FUSION_DECONVADDRELU_OP ON)
-
-    set(FOUND_MATCH ON)
 endif()