diff --git a/.gitignore b/.gitignore index ed131bdbbad6bd4dad500fa29f40a29fddeb7593..9823f8c945c1be8e717b622a993d402c49517b7c 100644 --- a/.gitignore +++ b/.gitignore @@ -105,3 +105,5 @@ metal/paddle-mobile-demo/paddle-mobile-demo/Resources metal/paddle-mobile-demo/paddle-mobile-demo/Resources/images metal/paddle-mobile-demo/paddle-mobile-demo/Resources/models metal/MobileNetDemo/MobileNetDemo/Resources + +build* diff --git a/cmake/cross_compiling/postproject.cmake b/cmake/cross_compiling/postproject.cmake index 82dd60f4b4391c89e09533418874e0d6d8174d84..3db715ba74945d9e501637af5ef3086e4f11b294 100644 --- a/cmake/cross_compiling/postproject.cmake +++ b/cmake/cross_compiling/postproject.cmake @@ -57,22 +57,20 @@ function(check_linker_flag) endforeach() set(CMAKE_SHARED_LINKER_FLAGS ${CMAKE_SHARED_LINKER_FLAGS} PARENT_SCOPE) endfunction() + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11") if (LITE_ON_TINY_PUBLISH) if((NOT LITE_WITH_PYTHON)) set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fno-exceptions") endif() + if(LITE_WITH_OPENCL AND (ARM_TARGET_LANG STREQUAL "clang")) + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fexceptions") + endif() set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -ffast-math -Ofast -Os -fomit-frame-pointer -fno-asynchronous-unwind-tables -fno-unwind-tables") set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fvisibility=hidden -fvisibility-inlines-hidden -ffunction-sections") check_linker_flag(-Wl,--gc-sections) endif() -if(LITE_WITH_OPENCL) - if(ARM_TARGET_LANG STREQUAL "clang") - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fexceptions") - endif() -endif() - if(LITE_WITH_OPENMP) find_package(OpenMP REQUIRED) if(OPENMP_FOUND OR OpenMP_CXX_FOUND) diff --git a/cmake/lite.cmake b/cmake/lite.cmake index d69232a0d95518217fba9eb3a7e15f7441695778..d16e7af3d7a61fff0ef13cf7cfcbd7af542e7c3f 100644 --- a/cmake/lite.cmake +++ b/cmake/lite.cmake @@ -285,6 +285,11 @@ set(host_kernels CACHE INTERNAL "host kernels") set(kernels_src_list "${CMAKE_BINARY_DIR}/kernels_src_list.txt") 
file(WRITE ${kernels_src_list} "") # clean + +# file to record faked kernels for opt python lib +set(fake_kernels_src_list "${CMAKE_BINARY_DIR}/fake_kernels_src_list.txt") +file(WRITE ${fake_kernels_src_list} "") # clean + if(LITE_BUILD_TAILOR) set(tailored_kernels_list_path "${LITE_OPTMODEL_DIR}/.tailored_kernels_source_list") file(STRINGS ${tailored_kernels_list_path} tailored_kernels_list) @@ -313,56 +318,65 @@ function(add_kernel TARGET device level) return() endif() - if (LITE_ON_MODEL_OPTIMIZE_TOOL) - # the source list will collect for model_optimize_tool to fake kernel generation. - foreach(src ${args_SRCS}) - file(APPEND ${kernels_src_list} "${CMAKE_CURRENT_SOURCE_DIR}/${src}\n") - endforeach() - return() - endif() - - # when compiling the model_optimize_tool, a source file with all the fake kernel definitions will be generated, - # no need to continue the compilation of the true kernel source. - if (LITE_ON_MODEL_OPTIMIZE_TOOL) - return() - endif(LITE_ON_MODEL_OPTIMIZE_TOOL) - if ("${device}" STREQUAL "Host") set(host_kernels "${host_kernels};${TARGET}" CACHE INTERNAL "") endif() if ("${device}" STREQUAL "ARM") if (NOT LITE_WITH_ARM) + foreach(src ${args_SRCS}) + file(APPEND ${fake_kernels_src_list} "${CMAKE_CURRENT_SOURCE_DIR}/${src}\n") + endforeach() return() endif() set(arm_kernels "${arm_kernels};${TARGET}" CACHE INTERNAL "") endif() if ("${device}" STREQUAL "X86") if (NOT LITE_WITH_X86) + foreach(src ${args_SRCS}) + file(APPEND ${fake_kernels_src_list} "${CMAKE_CURRENT_SOURCE_DIR}/${src}\n") + endforeach() + return() + elseif (LITE_ON_MODEL_OPTIMIZE_TOOL) + foreach(src ${args_SRCS}) + file(APPEND ${kernels_src_list} "${CMAKE_CURRENT_SOURCE_DIR}/${src}\n") + endforeach() return() endif() set(x86_kernels "${x86_kernels};${TARGET}" CACHE INTERNAL "") endif() if ("${device}" STREQUAL "NPU") if (NOT LITE_WITH_NPU) + foreach(src ${args_SRCS}) + file(APPEND ${fake_kernels_src_list} "${CMAKE_CURRENT_SOURCE_DIR}/${src}\n") + endforeach() return() endif() 
set(npu_kernels "${npu_kernels};${TARGET}" CACHE INTERNAL "") endif() if ("${device}" STREQUAL "XPU") if (NOT LITE_WITH_XPU) + foreach(src ${args_SRCS}) + file(APPEND ${fake_kernels_src_list} "${CMAKE_CURRENT_SOURCE_DIR}/${src}\n") + endforeach() return() endif() set(xpu_kernels "${xpu_kernels};${TARGET}" CACHE INTERNAL "") endif() if ("${device}" STREQUAL "FPGA") if (NOT LITE_WITH_FPGA) + foreach(src ${args_SRCS}) + file(APPEND ${fake_kernels_src_list} "${CMAKE_CURRENT_SOURCE_DIR}/${src}\n") + endforeach() return() endif() set(fpga_kernels "${fpga_kernels};${TARGET}" CACHE INTERNAL "") endif() if ("${device}" STREQUAL "BM") if (NOT LITE_WITH_BM) + foreach(src ${args_SRCS}) + file(APPEND ${fake_kernels_src_list} "${CMAKE_CURRENT_SOURCE_DIR}/${src}\n") + endforeach() return() endif() set(bm_kernels "${bm_kernels};${TARGET}" CACHE INTERNAL "") @@ -375,6 +389,9 @@ function(add_kernel TARGET device level) endif() if ("${device}" STREQUAL "OPENCL") if (NOT LITE_WITH_OPENCL) + foreach(src ${args_SRCS}) + file(APPEND ${fake_kernels_src_list} "${CMAKE_CURRENT_SOURCE_DIR}/${src}\n") + endforeach() return() endif() set(opencl_kernels "${opencl_kernels};${TARGET}" CACHE INTERNAL "") @@ -382,6 +399,9 @@ function(add_kernel TARGET device level) if ("${device}" STREQUAL "CUDA") if (NOT LITE_WITH_CUDA) + foreach(src ${args_SRCS}) + file(APPEND ${fake_kernels_src_list} "${CMAKE_CURRENT_SOURCE_DIR}/${src}\n") + endforeach() return() endif() set(cuda_kernels "${cuda_kernels};${TARGET}" CACHE INTERNAL "") diff --git a/docs/benchmark/benchmark_tools.md b/docs/benchmark/benchmark_tools.md index 36bf8831f142b1bd6c988b0ece7192437643fcbf..3cf1486307ad79a47dfbfe199e3d6d708c99db4b 100644 --- a/docs/benchmark/benchmark_tools.md +++ b/docs/benchmark/benchmark_tools.md @@ -135,53 +135,53 @@ sh benchmark.sh ./benchmark_bin_v8 ./benchmark_models result_armv8.txt true > 不同手机,不同版本,测试模型的性能数据不同。 ```shell -run benchmark armv7 +run benchmark armv8 -------------------------------------- PaddleLite 
Benchmark Threads=1 Warmup=10 Repeats=30 --- mnasnet avg = 159.8427 ms --- mobilenet_v1 avg = 235.0072 ms --- mobilenet_v2 avg = 173.0387 ms --- shufflenet_v2 avg = 76.0040 ms --- squeezenet_v11 avg = 164.2957 ms +mnasnet min = 19.83500 max = 19.38500 average = 19.65503 +mobilenetv1 min = 32.00600 max = 31.56900 average = 31.81983 +mobilenetv2 min = 22.37900 max = 22.08700 average = 22.28623 +shufflenetv2 min = 10.80400 max = 10.62900 average = 10.68890 +squeezenet min = 17.67400 max = 17.47900 average = 17.57677 Threads=2 Warmup=10 Repeats=30 --- mnasnet avg = 83.1287 ms --- mobilenet_v1 avg = 121.6029 ms --- mobilenet_v2 avg = 86.6175 ms --- shufflenet_v2 avg = 41.5761 ms --- squeezenet_v11 avg = 87.8678 ms +mnasnet min = 11.85600 max = 11.72000 average = 11.77127 +mobilenetv1 min = 18.75000 max = 18.64300 average = 18.70593 +mobilenetv2 min = 14.05100 max = 13.59900 average = 13.71450 +shufflenetv2 min = 6.67200 max = 6.58300 average = 6.63400 +squeezenet min = 12.07100 max = 11.33400 average = 11.41253 Threads=4 Warmup=10 Repeats=30 --- mnasnet avg = 73.3880 ms --- mobilenet_v1 avg = 119.0739 ms --- mobilenet_v2 avg = 85.3050 ms --- shufflenet_v2 avg = 38.0762 ms --- squeezenet_v11 avg = 64.2201 ms +mnasnet min = 7.19300 max = 7.02600 average = 7.08480 +mobilenetv1 min = 10.42000 max = 10.29100 average = 10.34267 +mobilenetv2 min = 8.61900 max = 8.46900 average = 8.54707 +shufflenetv2 min = 4.55200 max = 4.41900 average = 4.46477 +squeezenet min = 8.60000 max = 7.85200 average = 7.98407 -------------------------------------- -run benchmark armv8 +run benchmark armv7 -------------------------------------- PaddleLite Benchmark Threads=1 Warmup=10 Repeats=30 --- mnasnet avg = 165.3073 ms --- mobilenet_v1 avg = 306.0188 ms --- mobilenet_v2 avg = 195.1884 ms --- shufflenet_v2 avg = 99.3692 ms --- squeezenet_v11 avg = 156.6971 ms +mnasnet min = 20.98300 max = 20.81400 average = 20.92527 +mobilenetv1 min = 33.19000 max = 32.81700 average = 33.08490 +mobilenetv2 min = 
25.91400 max = 25.61700 average = 25.73097 +shufflenetv2 min = 11.14300 max = 10.97600 average = 11.06757 +squeezenet min = 19.31800 max = 19.20000 average = 19.26530 Threads=2 Warmup=10 Repeats=30 --- mnasnet avg = 90.2290 ms --- mobilenet_v1 avg = 157.0007 ms --- mobilenet_v2 avg = 118.1607 ms --- shufflenet_v2 avg = 68.6804 ms --- squeezenet_v11 avg = 91.3090 ms +mnasnet min = 12.59900 max = 12.46600 average = 12.52207 +mobilenetv1 min = 19.05800 max = 18.94700 average = 18.97897 +mobilenetv2 min = 15.28400 max = 15.11300 average = 15.19843 +shufflenetv2 min = 6.97000 max = 6.81400 average = 6.90863 +squeezenet min = 12.87900 max = 12.12900 average = 12.22530 Threads=4 Warmup=10 Repeats=30 --- mnasnet avg = 179.9730 ms --- mobilenet_v1 avg = 204.0684 ms --- mobilenet_v2 avg = 181.6486 ms --- shufflenet_v2 avg = 123.2728 ms --- squeezenet_v11 avg = 412.9046 ms +mnasnet min = 7.31400 max = 7.12900 average = 7.20357 +mobilenetv1 min = 11.44000 max = 10.86900 average = 10.94383 +mobilenetv2 min = 9.14900 max = 9.03800 average = 9.09907 +shufflenetv2 min = 4.60600 max = 4.49400 average = 4.53360 +squeezenet min = 8.27000 max = 8.10600 average = 8.19000 -------------------------------------- ``` diff --git a/docs/demo_guides/npu.md b/docs/demo_guides/npu.md index 9722ff6aabda87cb02adc111dd1b29e9bdcf3f55..0bdec8d73a881c186d9c4141e2d59a1b2bf11d8b 100644 --- a/docs/demo_guides/npu.md +++ b/docs/demo_guides/npu.md @@ -103,7 +103,6 @@ $ ./lite/tools/build_npu.sh --arm_os=android --arm_abi=armv7 --arm_lang=gcc --an --optimize_out_type=(protobuf|naive_buffer) \ --optimize_out= \ --valid_targets=npu,arm \ - --prefer_int8_kernel=(true|false) \ --record_tailoring_info =(true|false) ``` - model_optimize_tool生成的模型只是标记了NPU支持的Paddle算子,并没有真正生成NPU HiAI模型,只有在执行时才会将标记的Paddle算子转成HiAI IR,最终生成并执行HiAI模型,具体实现参考PR[2576](https://github.com/PaddlePaddle/Paddle-Lite/pull/2576)。 diff --git a/docs/demo_guides/opencl.md b/docs/demo_guides/opencl.md index 
325a772df31ce3873941f74e8a4ed1069e0b3da2..e255038575796f0c1079f47fb859f8402ac79c1f 100644 --- a/docs/demo_guides/opencl.md +++ b/docs/demo_guides/opencl.md @@ -65,9 +65,11 @@ rm ./lite/api/paddle_use_ops.h --arm_os=android \ --arm_abi=armv8 \ --arm_lang=gcc \ - build_test_arm_opencl + build_opencl ``` +注:如果要调试cl kernel,假设已经完成上述脚本编译(已生成cmake文件)。调试只需要修改`./lite/backends/opencl/cl_kernel/`下对应的kernel文件,保存后在项目根目录执行`python ./lite/tools/cmake_tools/gen_opencl_code.py ./lite/backends/opencl/cl_kernel ./lite/backends/opencl/opencl_kernels_source.cc`,该命令会自动将修改后,再切到build目录下执行`make publish_inference`或者你要编译的单测的可执行文件名,cl kernel文件的内容会随着编译自动打包到产物包如 .so 中或者对应单测可执行文件中。 + ### 编译产物说明 编译产物位于`build.lite.android.armv8.gcc.opencl`下的`inference_lite_lib.android.armv8.opencl`文件夹内,这里仅罗列关键产物: diff --git a/docs/user_guides/library_tailoring.md b/docs/user_guides/library_tailoring.md index e32500bd5851ddad0de3784fb47a7b6326aff6f4..cf0641b7314f112e9cb7ac4f0a9094bdbdaa7ca6 100644 --- a/docs/user_guides/library_tailoring.md +++ b/docs/user_guides/library_tailoring.md @@ -39,7 +39,7 @@ Paddle-Lite支持**根据模型裁剪预测库**功能。Paddle-Lite的一般编 例如: ```bash -./lite/tools/build.sh --arm_os=android --arm_abi=armv7 --arm_lang=gcc --android_stl=c++_static --build_extra=ON --build_tailor=ON --opt_model_dir=../mobilenet_v1NB full_publish +./lite/tools/build.sh --arm_os=android --arm_abi=armv7 --arm_lang=gcc --android_stl=c++_static --build_extra=ON --build_tailor=ON --opt_model_dir=../mobilenet_v1NB tiny_publish ``` **注意**:上面命令中的`../mobilenet_v1NB`是第1步得到的转化模型的输出路径 @@ -88,9 +88,6 @@ Paddle-Lite支持**根据模型裁剪预测库**功能。Paddle-Lite的一般编 #include #include #include "paddle_api.h" // NOLINT -#include "paddle_use_kernels.h" // NOLINT -#include "paddle_use_ops.h" // NOLINT -#include "paddle_use_passes.h" // NOLINT using namespace paddle::lite_api; // NOLINT @@ -182,4 +179,4 @@ int main(int argc, char** argv) { 1. 模型集合**必须**均为combined参数模型或均为非combined参数模型。 2. 
使用非combined参数模型时,模型拓扑文件名应为`__model__`,使用非combined参数模型时,集合中各模型的拓扑与参数名应相同,分别由`--model_filename`和`--param_filename`指定。 3. 模型集合**必须**均为INT8量化模型或均为非INT8量化模型。 -4. 需要使用Paddle-Lite 最新版本(release/v2.1.0之后)代码编译出的model_optimize_tool。 +4. 需要使用Paddle-Lite `release/v2.1.0`之后版本代码编译出的模型优化工具。 diff --git a/docs/user_guides/model_optimize_tool.md b/docs/user_guides/model_optimize_tool.md index 47f663dc75cdcf0950c87bfe45a78e65604ccbaf..c3d5f527048519e851cc8b9e785dc39668e971a4 100644 --- a/docs/user_guides/model_optimize_tool.md +++ b/docs/user_guides/model_optimize_tool.md @@ -83,7 +83,6 @@ PaddlePaddle模型有两种保存格式: --optimize_out_type=(protobuf|naive_buffer) \ --optimize_out= \ --valid_targets=(arm|opencl|x86|npu|xpu) \ - --prefer_int8_kernel=(true|false) \ --record_tailoring_info =(true|false) ``` @@ -95,12 +94,12 @@ PaddlePaddle模型有两种保存格式: | --optimize_out_type | 输出模型类型,目前支持两种类型:protobuf和naive_buffer,其中naive_buffer是一种更轻量级的序列化/反序列化实现。若您需要在mobile端执行模型预测,请将此选项设置为naive_buffer。默认为protobuf。 | | --optimize_out | 优化模型的输出路径。 | | --valid_targets | 指定模型可执行的backend,默认为arm。目前可支持x86、arm、opencl、npu、xpu,可以同时指定多个backend(以空格分隔),Model Optimize Tool将会自动选择最佳方式。如果需要支持华为NPU(Kirin 810/990 Soc搭载的达芬奇架构NPU),应当设置为npu, arm。 | -| --prefer_int8_kernel | 若待优化模型为int8量化模型(如量化训练得到的量化模型),则设置该选项为true以使用int8内核函数进行推理加速,默认为false。 | | --record_tailoring_info | 当使用 [根据模型裁剪库文件](./library_tailoring.html) 功能时,则设置该选项为true,以记录优化后模型含有的kernel和OP信息,默认为false。 | * 如果待优化的fluid模型是非combined形式,请设置`--model_dir`,忽略`--model_file`和`--param_file`。 * 如果待优化的fluid模型是combined形式,请设置`--model_file`和`--param_file`,忽略`--model_dir`。 * 优化后的模型为以`.nb`名称结尾的单个文件。 +* 删除`prefer_int8_kernel`的输入参数,`opt`自动判别是否是量化模型,进行相应的优化操作。 ### 功能二:统计模型算子信息、判断是否支持 diff --git a/docs/user_guides/model_quantization.md b/docs/user_guides/model_quantization.md index d90fa4bae34cccbcf809bdb2cd102eaf8c468b01..cf506cfa61e3942452ddaf1218d9d55c2fffa3fc 100644 --- a/docs/user_guides/model_quantization.md +++ b/docs/user_guides/model_quantization.md @@ -245,7 +245,6 @@ python compress.py \ 
--optimize_out_type=naive_buffer \ --optimize_out=mobilenet_v1_quant_opt \ --valid_targets=arm \ ---prefer_int8_kernel=true ``` 如前所述,量化训练后,float目录下的模型参数范围为int8,但参数数据类型仍为float32类型,这样确实没有起到模型参数压缩的效果。但是,经过model\_optimize\_tool工具优化后对应的量化参数均会以int8类型重新存储达到参数压缩的效果,且模型结构也被优化(如进行了各种operator fuse操作)。 diff --git a/docs/user_guides/post_quant_no_data.md b/docs/user_guides/post_quant_no_data.md index 206045822b896e07fca2651768b32c89c7615cb2..4068249ff7544f42c5f2643c971eb003836b1f59 100644 --- a/docs/user_guides/post_quant_no_data.md +++ b/docs/user_guides/post_quant_no_data.md @@ -86,7 +86,6 @@ WeightQuantization.quantize_weight_to_int(save_model_dir, 参考[模型转换](../user_guides/model_optimize_tool)准备模型转换工具,建议从Release页面下载。 参考[模型转换](../user_guides/model_optimize_tool)使用模型转换工具。 -因为该模型会将量化的权重反量化,然后实际加载并执行FP32预测模型,所以opt命令的输入参数--prefer_int8_kernel不需要设置为true,同时其他参数按照实际情况参考文档设置。 比如在安卓手机ARM端进行预测,模型转换的命令为: ```bash ./opt --model_dir=./mobilenet_v1_quant \ diff --git a/docs/user_guides/post_quant_with_data.md b/docs/user_guides/post_quant_with_data.md index 8b293cc7e47a33037de3706a30fd583c5516d165..0044b47610a2a211859bdc42f83f1921a681d50b 100644 --- a/docs/user_guides/post_quant_with_data.md +++ b/docs/user_guides/post_quant_with_data.md @@ -147,13 +147,12 @@ with fluid.name_scope('skip_quant'): 参考[模型转换](../user_guides/model_optimize_tool)准备模型转换工具,建议从Release页面下载。 -参考[模型转换](../user_guides/model_optimize_tool)使用模型转换工具。注意opt命令的输入参数--prefer_int8_kernel必须设置为true,其他参数按照实际情况参考文档设置。比如在安卓手机ARM端进行预测,模型转换的命令为: +参考[模型转换](../user_guides/model_optimize_tool)使用模型转换工具,参数按照实际情况设置。比如在安卓手机ARM端进行预测,模型转换的命令为: ```bash ./opt --model_dir=./mobilenet_v1_quant \ --optimize_out_type=naive_buffer \ --optimize_out=mobilenet_v1_quant_opt \ - --valid_targets=arm \ - --prefer_int8_kernel=true + --valid_targets=arm ``` ### 3.2 量化模型预测 diff --git a/docs/user_guides/tutorial.md b/docs/user_guides/tutorial.md index 6bb71938cab16a92e1c33e3d8276872fbcea580a..8f8aeb6af124bc4805c281e22e39cca51b507651 100644 --- 
a/docs/user_guides/tutorial.md +++ b/docs/user_guides/tutorial.md @@ -24,8 +24,7 @@ $ ./opt \ --param_file= \ --optimize_out_type=(protobuf|naive_buffer) \ --optimize_out= \ - --valid_targets=(arm|opencl|x86) \ - --prefer_int8_kernel=(ture|false) + --valid_targets=(arm|opencl|x86) ``` 其中,optimize_out为您希望的优化模型的输出路径。optimize_out_type则可以指定输出模型的序列化方式,其目前支持Protobuf与Naive Buffer两种方式,其中Naive Buffer是一种更轻量级的序列化/反序列化实现。如果你需要使用Lite在mobile端进行预测,那么您需要设置optimize_out_type=naive_buffer。 diff --git a/lite/CMakeLists.txt b/lite/CMakeLists.txt index 361a518c280150d167ace0c737a2822665b73ff9..12dd17c5a302259fb8f903735115106526716194 100644 --- a/lite/CMakeLists.txt +++ b/lite/CMakeLists.txt @@ -84,7 +84,16 @@ message(STATUS "publish inference lib to ${INFER_LITE_PUBLISH_ROOT}") if (LITE_WITH_PYTHON) add_custom_target(publish_inference_python_lib ${TARGET} COMMAND mkdir -p "${INFER_LITE_PUBLISH_ROOT}/python/lib" - COMMAND cp "${CMAKE_BINARY_DIR}/lite/api/python/pybind/liblite_pybind.so" "${INFER_LITE_PUBLISH_ROOT}/python/lib/lite_core.so") + COMMAND mkdir -p "${INFER_LITE_PUBLISH_ROOT}/python/install/libs" + COMMAND mkdir -p "${INFER_LITE_PUBLISH_ROOT}/python/install/lite" + COMMAND cp "${CMAKE_BINARY_DIR}/lite/api/python/setup.py" "${INFER_LITE_PUBLISH_ROOT}/python/install/" + COMMAND cp "${CMAKE_SOURCE_DIR}/lite/api/python/__init__.py" "${INFER_LITE_PUBLISH_ROOT}/python/install/lite" + COMMAND cp "${CMAKE_BINARY_DIR}/lite/api/python/pybind/liblite_pybind.so" "${INFER_LITE_PUBLISH_ROOT}/python/install/lite/lite.so" + COMMAND cp "${CMAKE_BINARY_DIR}/lite/api/python/pybind/liblite_pybind.so" "${INFER_LITE_PUBLISH_ROOT}/python/lib/lite.so") + add_custom_target(publish_inference_python_installer ${TARGET} + COMMAND python setup.py bdist_wheel + WORKING_DIRECTORY ${INFER_LITE_PUBLISH_ROOT}/python/install/ + DEPENDS publish_inference_python_lib) add_custom_target(publish_inference_python_light_demo ${TARGET} COMMAND mkdir -p "${INFER_LITE_PUBLISH_ROOT}/demo/python" COMMAND cp 
"${CMAKE_SOURCE_DIR}/lite/demo/python/mobilenetv1_light_api.py" "${INFER_LITE_PUBLISH_ROOT}/demo/python/") @@ -96,6 +105,7 @@ if (LITE_WITH_PYTHON) endif() add_dependencies(publish_inference_python_lib lite_pybind) add_dependencies(publish_inference publish_inference_python_lib) + add_dependencies(publish_inference publish_inference_python_installer) add_dependencies(publish_inference publish_inference_python_light_demo) endif() @@ -213,6 +223,7 @@ if (LITE_WITH_LIGHT_WEIGHT_FRAMEWORK AND LITE_WITH_ARM) add_dependencies(publish_inference tiny_publish_cxx_lib) if(NOT "${CMAKE_BUILD_TYPE}" STREQUAL "Debug") add_custom_command(TARGET tiny_publish_cxx_lib POST_BUILD + COMMAND ${CMAKE_STRIP} "-s" ${INFER_LITE_PUBLISH_ROOT}/cxx/lib/libpaddle_api_light_bundled.a COMMAND ${CMAKE_STRIP} "-s" ${INFER_LITE_PUBLISH_ROOT}/cxx/lib/libpaddle_light_api_shared.so) endif() endif() diff --git a/lite/api/CMakeLists.txt b/lite/api/CMakeLists.txt index 8425d7923447d4245c0895a3f2e9409bfeaecd79..2a93331f4ac179cc35acb65bd9271c68a93d71ad 100644 --- a/lite/api/CMakeLists.txt +++ b/lite/api/CMakeLists.txt @@ -308,6 +308,11 @@ if (LITE_ON_TINY_PUBLISH) return() endif() + +# add library for opt_base +lite_cc_library(opt_base SRCS opt_base.cc cxx_api_impl.cc paddle_api.cc cxx_api.cc DEPS kernel op optimizer mir_passes utils) +add_dependencies(opt_base supported_kernel_op_info_h framework_proto all_kernel_faked_cc kernel_list_h) + if (LITE_ON_MODEL_OPTIMIZE_TOOL) message(STATUS "Compiling opt") lite_cc_binary(opt SRCS opt.cc cxx_api_impl.cc paddle_api.cc cxx_api.cc diff --git a/lite/api/cxx_api.cc b/lite/api/cxx_api.cc index b739c78f7c883d62b39d88ae1a7f4bf76ae8932c..556a9e0af01854ff5c57a14dade72b81ed255964 100644 --- a/lite/api/cxx_api.cc +++ b/lite/api/cxx_api.cc @@ -294,6 +294,32 @@ void Predictor::Build(const cpp::ProgramDesc &desc, inner_places.emplace_back(TARGET(kHost), PRECISION(kAny), DATALAYOUT(kAny)); inner_places.emplace_back( TARGET(kHost), PRECISION(kFloat), DATALAYOUT(kNCHW)); + + 
const std::vector quant_dequant_op = { + "fake_quantize_abs_max", + "fake_quantize_range_abs_max", + "fake_quantize_moving_average_abs_max", + "fake_quantize_dequantize_moving_average_abs_max", + "fake_dequantize_max_abs", + "fake_channel_wise_dequantize_max_abs"}; + bool is_quantized_model = false; + for (size_t i = 0; i < program_desc_.BlocksSize() && !is_quantized_model; + ++i) { + auto *block_desc = program_desc_.GetBlock(i); + for (size_t j = 0; j < block_desc->OpsSize() && !is_quantized_model; ++j) { + auto *op_desc = block_desc->GetOp(j); + std::string op_type = op_desc->Type(); + if (std::find(quant_dequant_op.begin(), + quant_dequant_op.end(), + op_type) != quant_dequant_op.end()) { + is_quantized_model = true; + } + } + } + if (is_quantized_model) { + inner_places.emplace_back(Place{TARGET(kARM), PRECISION(kInt8)}); + } + Program program(desc, scope_, inner_places); core::KernelPickFactor factor; diff --git a/lite/api/cxx_api_bin.cc b/lite/api/cxx_api_bin.cc index 8c929e9c8700a65c868e2facd763b0ec36719e23..eec17cc30e308e7169b7d8c394c0e47eee0c1c3e 100644 --- a/lite/api/cxx_api_bin.cc +++ b/lite/api/cxx_api_bin.cc @@ -67,7 +67,7 @@ void Run(const char* model_dir, int repeat) { int main(int argc, char** argv) { CHECK_EQ(argc, 3) << "usage: ./cmd "; - paddle::lite::Run(argv[1], std::stoi(argv[2])); + paddle::lite::Run(argv[1], atoi(argv[2])); return 0; } diff --git a/lite/api/light_api_impl.cc b/lite/api/light_api_impl.cc index 3965843250abe45c43490bdbb4aaed58915e0908..cdf5b7fb06df35b2e7fb72fc4e33ccb721a0f7f7 100644 --- a/lite/api/light_api_impl.cc +++ b/lite/api/light_api_impl.cc @@ -58,6 +58,7 @@ void LightPredictorImpl::Run() { std::shared_ptr LightPredictorImpl::Clone() { LOG(FATAL) << "The Clone API is not supported in LigthPredictor"; + return nullptr; } std::string LightPredictorImpl::GetVersion() const { return lite::version(); } diff --git a/lite/api/mobilenetv1_test.cc b/lite/api/mobilenetv1_test.cc index 
9164129dcf4566fc02803c1c7dcffd9e97a830d6..5342a36ec154b2bdde44fa72bc21e9d430ad4efe 100644 --- a/lite/api/mobilenetv1_test.cc +++ b/lite/api/mobilenetv1_test.cc @@ -95,7 +95,7 @@ void TestModel(const std::vector& valid_places, if (first_target == TARGET(kOpenCL) || first_target == TARGET(kNPU)) { ASSERT_EQ(out->dims().production(), 1000); - double eps = 0.1; + double eps = first_target == TARGET(kOpenCL) ? 0.12 : 0.1; for (int i = 0; i < ref.size(); ++i) { for (int j = 0; j < ref[i].size(); ++j) { auto result = pdata[j * step + (out->dims()[1] * i)]; @@ -119,21 +119,21 @@ void TestModel(const std::vector& valid_places, // Get detailed result size_t output_tensor_num = predictor.GetOutputNames().size(); - VLOG(1) << "output tesnor num:" << output_tensor_num; + VLOG(1) << "output tensor num:" << output_tensor_num; for (size_t tidx = 0; tidx < output_tensor_num; ++tidx) { auto* output_tensor = predictor.GetOutput(tidx); VLOG(1) << "============= output tensor " << tidx << " =============\n"; auto out_dims = output_tensor->dims(); - VLOG(1) << "out_dims:" << out_dims; - - float sum = 0.f; - for (int i = 0; i < out_dims.production(); ++i) { - sum += output_tensor->data()[i]; - } - VLOG(1) << "out_dims.production():" << out_dims.production(); - VLOG(1) << "output tensor sum value:" << sum; - VLOG(1) << "output tensor mean value:" << sum / out_dims.production(); + auto out_data = output_tensor->data(); + auto out_mean = compute_mean(out_data, out_dims.production()); + auto out_std_dev = compute_standard_deviation( + out_data, out_dims.production(), true, out_mean); + + VLOG(1) << "output tensor dims:" << out_dims; + VLOG(1) << "output tensor elements num:" << out_dims.production(); + VLOG(1) << "output tensor standard deviation:" << out_std_dev; + VLOG(1) << "output tensor mean value:" << out_mean; // print result for (int i = 0; i < out_dims.production(); ++i) { diff --git a/lite/api/mobilenetv2_test.cc b/lite/api/mobilenetv2_test.cc index 
26b9dc93da73e8f637c01fca8f7ea99a8e5e9af0..465f82056c6bb80b706cfb7d875773d75735911b 100644 --- a/lite/api/mobilenetv2_test.cc +++ b/lite/api/mobilenetv2_test.cc @@ -97,7 +97,7 @@ void TestModel(const std::vector& valid_places, if (first_target == TARGET(kOpenCL) || first_target == TARGET(kNPU)) { ASSERT_EQ(out->dims().production(), 1000); - double eps = 0.1; + double eps = first_target == TARGET(kOpenCL) ? 0.15 : 0.1; for (int i = 0; i < ref.size(); ++i) { for (int j = 0; j < ref[i].size(); ++j) { auto result = pdata[j * step + (out->dims()[1] * i)]; @@ -121,21 +121,21 @@ void TestModel(const std::vector& valid_places, // Get detailed result size_t output_tensor_num = predictor.GetOutputNames().size(); - VLOG(1) << "output tesnor num:" << output_tensor_num; + VLOG(1) << "output tensor num:" << output_tensor_num; for (size_t tidx = 0; tidx < output_tensor_num; ++tidx) { auto* output_tensor = predictor.GetOutput(tidx); VLOG(1) << "============= output tensor " << tidx << " =============\n"; auto out_dims = output_tensor->dims(); - VLOG(1) << "out_dims:" << out_dims; - - float sum = 0.f; - for (int i = 0; i < out_dims.production(); ++i) { - sum += output_tensor->data()[i]; - } - VLOG(1) << "out_dims.production():" << out_dims.production(); - VLOG(1) << "output tensor sum value:" << sum; - VLOG(1) << "output tensor mean value:" << sum / out_dims.production(); + auto out_data = output_tensor->data(); + auto out_mean = compute_mean(out_data, out_dims.production()); + auto out_std_dev = compute_standard_deviation( + out_data, out_dims.production(), true, out_mean); + + VLOG(1) << "output tensor dims:" << out_dims; + VLOG(1) << "output tensor elements num:" << out_dims.production(); + VLOG(1) << "output tensor standard deviation:" << out_std_dev; + VLOG(1) << "output tensor mean value:" << out_mean; // print result for (int i = 0; i < out_dims.production(); ++i) { diff --git a/lite/api/model_test.cc b/lite/api/model_test.cc index 
ed4ab75366a0ab669fb8fe6e1d15ad9fd2f5aef5..b0f7a0479f0db91b816838f9d0ee1cc31b9b232a 100644 --- a/lite/api/model_test.cc +++ b/lite/api/model_test.cc @@ -138,7 +138,7 @@ void Run(const std::vector>& input_shapes, std::ofstream out(FLAGS_arg_name + ".txt"); for (size_t i = 0; i < arg_num; ++i) { sum += arg_tensor->data()[i]; - out << std::to_string(arg_tensor->data()[i]) << "\n"; + out << paddle::lite::to_string(arg_tensor->data()[i]) << "\n"; } LOG(INFO) << FLAGS_arg_name << " shape is " << os.str() << ", mean value is " << sum * 1. / arg_num; diff --git a/lite/api/model_test_classify.cc b/lite/api/model_test_classify.cc index bea0ab15e49dc55e0a8f5f29d455b5504345cf19..375d249476bf5323d69ea41c3f11d07e9c8bc711 100644 --- a/lite/api/model_test_classify.cc +++ b/lite/api/model_test_classify.cc @@ -250,7 +250,7 @@ void Run(const std::vector>& input_shapes, std::ofstream out(FLAGS_arg_name + ".txt"); for (size_t i = 0; i < arg_num; ++i) { sum += arg_tensor->data()[i]; - out << std::to_string(arg_tensor->data()[i]) << "\n"; + out << paddle::lite::to_string(arg_tensor->data()[i]) << "\n"; } LOG(INFO) << FLAGS_arg_name << " shape is " << os.str() << ", mean value is " << sum * 1. / arg_num; diff --git a/lite/api/model_test_detection.cc b/lite/api/model_test_detection.cc index 36a23999d33b38d8c54f604850bf5d4120ce3d72..f9be12b2c78c623a2b2c9852850576cc11815bd3 100644 --- a/lite/api/model_test_detection.cc +++ b/lite/api/model_test_detection.cc @@ -264,7 +264,7 @@ void Run(const std::vector>& input_shapes, std::ofstream out(FLAGS_arg_name + ".txt"); for (size_t i = 0; i < arg_num; ++i) { sum += arg_tensor->data()[i]; - out << std::to_string(arg_tensor->data()[i]) << "\n"; + out << paddle::lite::to_string(arg_tensor->data()[i]) << "\n"; } LOG(INFO) << FLAGS_arg_name << " shape is " << os.str() << ", mean value is " << sum * 1. 
/ arg_num; diff --git a/lite/api/opt.cc b/lite/api/opt.cc index 0b995fa8abde5850acefed8dee384b9206258f6a..51f9b565196d30520f0cf73ea41a01fed0cc49e8 100644 --- a/lite/api/opt.cc +++ b/lite/api/opt.cc @@ -67,7 +67,6 @@ DEFINE_string(valid_targets, "arm", "The targets this model optimized for, should be one of (arm, " "opencl, x86), splitted by space"); -DEFINE_bool(prefer_int8_kernel, false, "Prefer to run model with int8 kernels"); DEFINE_bool(print_supported_ops, false, "Print supported operators on the inputed target"); @@ -123,11 +122,6 @@ std::vector ParserValidPlaces() { << "At least one target should be set, should set the " "command argument 'valid_targets'"; - if (FLAGS_prefer_int8_kernel) { - LOG(WARNING) << "Int8 mode is only support by ARM target"; - valid_places.insert(valid_places.begin(), - Place{TARGET(kARM), PRECISION(kInt8)}); - } return valid_places; } @@ -257,7 +251,6 @@ void PrintHelpInfo() { " `--optimize_out_type=(protobuf|naive_buffer)`\n" " `--optimize_out=`\n" " `--valid_targets=(arm|opencl|x86|npu|xpu)`\n" - " `--prefer_int8_kernel=(true|false)`\n" " `--record_tailoring_info=(true|false)`\n" " Arguments of model checking and ops information:\n" " `--print_all_ops=true` Display all the valid operators of " diff --git a/lite/api/opt_base.cc b/lite/api/opt_base.cc new file mode 100644 index 0000000000000000000000000000000000000000..bd86f486248a2daccde13da078ae3860d8e31169 --- /dev/null +++ b/lite/api/opt_base.cc @@ -0,0 +1,364 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and +// limitations under the License. + +#include "lite/api/opt_base.h" +#include "all_kernel_faked.cc" // NOLINT + +namespace paddle { +namespace lite_api { + +void OptBase::SetModelDir(const std::string& model_path) { + opt_config_.set_model_dir(model_path); +} + +void OptBase::SetModelFile(const std::string& model_path) { + opt_config_.set_model_file(model_path); +} + +void OptBase::SetParamFile(const std::string& param_path) { + opt_config_.set_param_file(param_path); +} + +void OptBase::SetModelType(std::string optimize_out_type) { + if (optimize_out_type == "protobuf") { + model_type_ = LiteModelType::kProtobuf; + } else if (optimize_out_type == "naive_buffer") { + model_type_ = LiteModelType::kNaiveBuffer; + } else { + LOG(FATAL) << "Unsupported Model type :" << optimize_out_type; + } +} + +void OptBase::SetValidPlaces(const std::string& valid_places) { + valid_places_.clear(); + auto target_reprs = lite::Split(valid_places, ","); + for (auto& target_repr : target_reprs) { + if (target_repr == "arm") { + valid_places_.emplace_back(TARGET(kARM)); + } else if (target_repr == "opencl") { + valid_places_.emplace_back( + Place{TARGET(kOpenCL), PRECISION(kFP16), DATALAYOUT(kImageDefault)}); + valid_places_.emplace_back( + Place{TARGET(kOpenCL), PRECISION(kFloat), DATALAYOUT(kNCHW)}); + valid_places_.emplace_back( + Place{TARGET(kOpenCL), PRECISION(kAny), DATALAYOUT(kImageDefault)}); + valid_places_.emplace_back( + Place{TARGET(kOpenCL), PRECISION(kAny), DATALAYOUT(kNCHW)}); + valid_places_.emplace_back( + TARGET(kARM)); // enable kARM CPU kernel when no opencl kernel + } else if (target_repr == "x86") { + valid_places_.emplace_back(TARGET(kX86)); + } else if (target_repr == "npu") { + valid_places_.emplace_back(TARGET(kNPU)); + } else if (target_repr == "xpu") { + valid_places_.emplace_back(TARGET(kXPU)); + } else { + LOG(FATAL) << lite::string_format( + "Wrong target '%s' found, please check the 
command flag " + "'valid_targets'", + target_repr.c_str()); + } + } + CHECK(!valid_places_.empty()) + << "At least one target should be set, should set the " + "command argument 'valid_targets'"; +} + +void OptBase::SetOptimizeOut(const std::string& optimized_out_path) { + optimize_out_path_ = optimized_out_path; +} + +void OptBase::RunOptimize(bool record_strip_info) { + CheckIfModelSupported(false); + OpKernelInfoCollector::Global().SetKernel2path(kernel2path_map); + opt_config_.set_valid_places(valid_places_); + if (model_set_dir_ != "") { + RunOptimizeFromModelSet(record_strip_info); + } else { + auto opt_predictor = lite_api::CreatePaddlePredictor(opt_config_); + opt_predictor->SaveOptimizedModel( + optimize_out_path_, model_type_, record_strip_info); + auto resulted_model_name = + record_strip_info ? "information of striped model" : "optimized model"; + std::cout << "Save the " << resulted_model_name + << " into :" << optimize_out_path_ << "successfully"; + } +} + +// collect ops info of modelset +void CollectModelMetaInfo(const std::string& output_dir, + const std::vector& models, + const std::string& filename) { + std::set total; + for (const auto& name : models) { + std::string model_path = + lite::Join({output_dir, name, filename}, "/"); + auto lines = lite::ReadLines(model_path); + total.insert(lines.begin(), lines.end()); + } + std::string output_path = + lite::Join({output_dir, filename}, "/"); + lite::WriteLines(std::vector(total.begin(), total.end()), + output_path); +} + +void OptBase::SetModelSetDir(const std::string& model_set_path) { + model_set_dir_ = model_set_path; +} +void OptBase::RunOptimizeFromModelSet(bool record_strip_info) { + // 1. mkdir of outputed optimized model set. + lite::MkDirRecur(optimize_out_path_); + auto model_dirs = lite::ListDir(model_set_dir_, true); + if (model_dirs.size() == 0) { + LOG(FATAL) << "[" << model_set_dir_ << "] does not contain any model"; + } + + // 2. optimize each model in inputed model set dir. 
+ std::string model_file = opt_config_.model_file(); + std::string param_file = opt_config_.param_file(); + for (const auto& name : model_dirs) { + std::string input_model_dir = + lite::Join({model_set_dir_, name}, "/"); + std::string output_model_dir = + lite::Join({optimize_out_path_, name}, "/"); + + if (opt_config_.model_file() != "" && opt_config_.param_file() != "") { + auto model_file_path = + lite::Join({input_model_dir, model_file}, "/"); + auto param_file_path = + lite::Join({input_model_dir, param_file}, "/"); + } + + std::cout << "Start optimize model: " << input_model_dir; + + opt_config_.set_model_dir(input_model_dir); + opt_config_.set_model_file(model_file); + opt_config_.set_param_file(param_file); + + auto opt_predictor = lite_api::CreatePaddlePredictor(opt_config_); + opt_predictor->SaveOptimizedModel( + optimize_out_path_, model_type_, record_strip_info); + + std::cout << "Optimize done. "; + } + + // 3. if record_strip_info = true, we will record striping info + if (record_strip_info) { + // Collect all models information + CollectModelMetaInfo( + optimize_out_path_, model_dirs, lite::TAILORD_OPS_SOURCE_LIST_FILENAME); + CollectModelMetaInfo( + optimize_out_path_, model_dirs, lite::TAILORD_OPS_LIST_NAME); + CollectModelMetaInfo(optimize_out_path_, + model_dirs, + lite::TAILORD_KERNELS_SOURCE_LIST_FILENAME); + CollectModelMetaInfo( + optimize_out_path_, model_dirs, lite::TAILORD_KERNELS_LIST_NAME); + std::cout << "Record the information of stripped models into :" + << optimize_out_path_ << "successfully"; + } +} + +void OptBase::PrintHelpInfo() { + const std::string opt_version = lite::version(); + const char help_info[] = + "At least one argument should be inputed. 
Valid arguments are listed " + "below:\n" + " Arguments of help information:\n" + " `help()` Print help infomation\n" + " Arguments of model optimization:\n" + " `set_model_dir(model_dir)`\n" + " `set_model_file(model_file_path)`\n" + " `set_param_file(param_file_path)`\n" + " `set_model_type(protobuf|naive_buffer)`\n" + " `set_optimize_out(output_optimize_model_dir)`\n" + " `set_valid_places(arm|opencl|x86|npu|xpu)`\n" + " `run_optimize(false|true)`\n" + " ` ----fasle&true refer to whether to record ops info for " + "tailoring lib, false by default`\n" + " Arguments of model checking and ops information:\n" + " `print_all_ops()` Display all the valid operators of " + "Paddle-Lite\n" + " `print_supported_ops` Display supported operators of valid " + "places\n" + " `check_if_model_supported()` Check if the input model is " + "supported\n"; + + std::cout << "opt version:" << opt_version << std::endl + << help_info << std::endl; +} +// 2. Print supported info of inputed ops +void OptBase::PrintOpsInfo(const std::set& valid_ops) { + std::vector lite_supported_targets = {"kHost", + "kX86", + "kCUDA", + "kARM", + "kOpenCL", + "kFPGA", + "kNPU", + "kXPU", + "kAny", + "kUnk"}; + // Get the lengh of the first column: maximum length of the op_type + size_t maximum_optype_length = 0; + for (auto it = supported_ops.begin(); it != supported_ops.end(); it++) { + maximum_optype_length = it->first.size() > maximum_optype_length + ? it->first.size() + : maximum_optype_length; + } + std::cout << std::setiosflags(std::ios::internal); + // Print the first row: OP_nam taget1 target2 ... 
+ std::cout << std::setw(maximum_optype_length) << "OP_name"; + for (size_t i = 0; i < lite_supported_targets.size(); i++) { + std::cout << std::setw(10) << lite_supported_targets[i].substr(1); + } + std::cout << std::endl; + // Print the name of supported ops and mark if it's supported by each target + // print the support info of inputed ops: valid_ops + for (auto op = valid_ops.begin(); op != valid_ops.end(); op++) { + std::cout << std::setw(maximum_optype_length) << *op; + // Check: If this kernel doesn't match any operator, we will skip it. + if (supported_ops.find(*op) == supported_ops.end()) { + continue; + } + // Print OP info. + auto ops_valid_places = supported_ops.at(*op); + for (size_t i = 0; i < lite_supported_targets.size(); i++) { + if (std::find(ops_valid_places.begin(), + ops_valid_places.end(), + lite_supported_targets[i]) != ops_valid_places.end()) { + std::cout << std::setw(10) << "Y"; + } else { + std::cout << std::setw(10) << " "; + } + } + std::cout << std::endl; + } +} + +void OptBase::DisplayKernelsInfo() { // Display kernel information + std::cout << ::paddle::lite::KernelRegistry::Global().DebugString(); +} +void OptBase::PrintAllOps() { + // 1. Get supported ops on these targets + std::set valid_ops; + for (size_t i = 0; i < supported_ops_target.size(); i++) { + auto ops = supported_ops_target[i]; + valid_ops.insert(ops.begin(), ops.end()); + } + // 2. Print support info of these ops + PrintOpsInfo(valid_ops); +} + +void OptBase::PrintSupportedOps() { + // 1. 
Get the valid hardware targets + std::vector target_types = {}; + for (size_t i = 0; i < valid_places_.size(); i++) { + target_types.push_back(valid_places_[i].target); + } + std::string targets_str = TargetToStr(target_types[0]); + for (size_t i = 1; i < target_types.size(); i++) { + targets_str = targets_str + TargetToStr(target_types[i]); + } + std::cout << "Supported OPs on '" << targets_str << "': " << std::endl; + target_types.push_back(TARGET(kHost)); + target_types.push_back(TARGET(kUnk)); + + // 2. Get supported ops on these targets + std::set valid_ops; + for (size_t i = 0; i < target_types.size(); i++) { + auto ops = supported_ops_target[static_cast(target_types[i])]; + valid_ops.insert(ops.begin(), ops.end()); + } + // 3. Print support info of these ops + PrintOpsInfo(valid_ops); +} + +// test whether this model is supported +void OptBase::CheckIfModelSupported(bool print_ops_info) { + // 1. parse valid places and valid targets + auto valid_ops = supported_ops_target[static_cast(TARGET(kHost))]; + auto valid_unktype_ops = supported_ops_target[static_cast(TARGET(kUnk))]; + valid_ops.insert( + valid_ops.end(), valid_unktype_ops.begin(), valid_unktype_ops.end()); + for (size_t i = 0; i < valid_places_.size(); i++) { + auto target = valid_places_[i].target; + auto ops = supported_ops_target[static_cast(target)]; + valid_ops.insert(valid_ops.end(), ops.begin(), ops.end()); + } + // get valid ops + std::set valid_ops_set(valid_ops.begin(), valid_ops.end()); + + // 2.Load model into program to get ops in model + std::string prog_path = opt_config_.model_dir() + "/__model__"; + if (!(opt_config_.model_file()).empty() && + !(opt_config_.param_file()).empty()) { + prog_path = opt_config_.model_file(); + } + lite::cpp::ProgramDesc cpp_prog; + framework::proto::ProgramDesc pb_proto_prog = + *lite::LoadProgram(prog_path, false); + lite::pb::ProgramDesc pb_prog(&pb_proto_prog); + // Transform to cpp::ProgramDesc + lite::TransformProgramDescAnyToCpp(pb_prog, 
&cpp_prog); + + std::set unsupported_ops; + std::set input_model_ops; + for (size_t index = 0; index < cpp_prog.BlocksSize(); index++) { + auto current_block = cpp_prog.GetBlock(index); + for (size_t i = 0; i < current_block->OpsSize(); ++i) { + auto& op_desc = *current_block->GetOp(i); + auto op_type = op_desc.Type(); + input_model_ops.insert(op_type); + if (valid_ops_set.count(op_type) == 0) { + unsupported_ops.insert(op_type); + } + } + } + // 3. Print ops_info of input model and check if this model is supported + if (print_ops_info) { + std::cout << "OPs in the input model include:\n"; + PrintOpsInfo(input_model_ops); + } + if (!unsupported_ops.empty()) { + std::string unsupported_ops_str = *unsupported_ops.begin(); + for (auto op_str = ++unsupported_ops.begin(); + op_str != unsupported_ops.end(); + op_str++) { + unsupported_ops_str = unsupported_ops_str + ", " + *op_str; + } + std::vector targets = {}; + for (size_t i = 0; i < valid_places_.size(); i++) { + targets.push_back(valid_places_[i].target); + } + std::sort(targets.begin(), targets.end()); + targets.erase(unique(targets.begin(), targets.end()), targets.end()); + std::string targets_str = TargetToStr(targets[0]); + for (size_t i = 1; i < targets.size(); i++) { + targets_str = targets_str + "," + TargetToStr(targets[i]); + } + + LOG(ERROR) << "Error: This model is not supported, because " + << unsupported_ops.size() << " ops are not supported on '" + << targets_str << "'. These unsupported ops are: '" + << unsupported_ops_str << "'."; + exit(1); + } + if (print_ops_info) { + std::cout << "Paddle-Lite supports this model!" << std::endl; + exit(1); + } +} +} // namespace lite_api +} // namespace paddle diff --git a/lite/api/opt_base.h b/lite/api/opt_base.h new file mode 100644 index 0000000000000000000000000000000000000000..a8d6d0390ccd3f1c9b0291b1bcf6eb1ecc47a248 --- /dev/null +++ b/lite/api/opt_base.h @@ -0,0 +1,86 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. 
+// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +/* + * This file defines Opt and basic functions about model transformation. + */ + +#ifndef PADDLE_LITE_OPT_H_ // NOLINT +#define PADDLE_LITE_OPT_H_ +#include +#include +#include +#include +#include +// stores the map that records the source_file path of each kernel. +#include "kernel_src_map.h" // NOLINT +#include "lite/api/cxx_api.h" +// version of Paddle-lite +#include "lite/core/version.h" +// model parser functions to pre-load model to verify if this model is supported +#include "lite/model_parser/compatible_pb.h" +#include "lite/model_parser/pb/program_desc.h" +#include "lite/utils/string.h" +// recorded all the ops supported by paddle-lite +#include "supported_kernel_op_info.h" // NOLINT + +namespace paddle { +namespace lite_api { + +/// The PaddlePredictor defines the basic interfaces for different kinds of +/// predictors. 
+class LITE_API OptBase { + public: + OptBase() = default; + void SetModelSetDir(const std::string &model_set_path); + void SetModelDir(const std::string &model_path); + void SetModelFile(const std::string &model_path); + void SetParamFile(const std::string ¶m_path); + void SetValidPlaces(const std::string &valid_places); + void SetOptimizeOut(const std::string &optimized_out_path); + // set optimized_model type + void SetModelType(std::string model_type); + // transform and save the optimized model + void RunOptimize(bool record_strip_info = false); + + // fuctions of printing info + // 1. help info + void PrintHelpInfo(); + // 2. PrintOpsInfo + void PrintOpsInfo(const std::set &valid_ops = + {}); // print supported ops on target_types + void PrintAllOps(); // print all ops + void PrintSupportedOps(); // print ops supported on valid_places_ + void DisplayKernelsInfo(); // Display kernel information + // 3. Check if this model is supported + void CheckIfModelSupported(bool print_ops_info = true); + + private: + CxxConfig opt_config_; + // valid places for the optimized_model + std::vector valid_places_; + // filename of the optimized_model + std::string optimize_out_path_; + // type of the optimized_model, kNaiveBuffer default. 
+ LiteModelType model_type_{LiteModelType::kNaiveBuffer}; + // Dir path of a set of models, this should be combined with model + std::string model_set_dir_; + + void RunOptimizeFromModelSet(bool record_strip_info = false); +}; + +} // namespace lite_api +} // namespace paddle + +#endif // NOLINT diff --git a/lite/api/python/CMakeLists.txt b/lite/api/python/CMakeLists.txt index 43178a37c663bb09acb7c025e021cbc91bf0cc5d..ba0c6eb2404ce1ffc2ad5950ee5a3476d42f01b8 100644 --- a/lite/api/python/CMakeLists.txt +++ b/lite/api/python/CMakeLists.txt @@ -2,6 +2,23 @@ if (NOT LITE_WITH_PYTHON) return() endif() +# to create setup.py for packeting whl for Paddle-Lite and opt + +execute_process( + COMMAND git describe --tags --exact-match + WORKING_DIRECTORY ${CMAKE_SOURCE_DIR} + OUTPUT_VARIABLE PADDLE_LITE_TAG + OUTPUT_STRIP_TRAILING_WHITESPACE +) + +execute_process( + COMMAND git log -1 --format=%h + WORKING_DIRECTORY ${CMAKE_SOURCE_DIR} + OUTPUT_VARIABLE PADDLE_LITE_COMMIT + OUTPUT_STRIP_TRAILING_WHITESPACE +) +configure_file(${CMAKE_CURRENT_SOURCE_DIR}/setup.py.in + ${CMAKE_CURRENT_BINARY_DIR}/setup.py) add_subdirectory(pybind) #add_subdirectory(interface) diff --git a/lite/api/python/__init__.py b/lite/api/python/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..abf198b97e6e818e1fbe59006f98492640bcee54 --- /dev/null +++ b/lite/api/python/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/lite/api/python/pybind/CMakeLists.txt b/lite/api/python/pybind/CMakeLists.txt index eabb6b150b93a722282118c3932676cd1aee5da8..b1de18d50c1582b0f872ad38d24939665ab1d3b0 100644 --- a/lite/api/python/pybind/CMakeLists.txt +++ b/lite/api/python/pybind/CMakeLists.txt @@ -1,6 +1,6 @@ set(PYBIND_DEPS pybind python paddle_api_light paddle_api) if (NOT LITE_ON_TINY_PUBLISH) - set(PYBIND_DEPS ${PYBIND_DEPS} paddle_api_full) + set(PYBIND_DEPS ${PYBIND_DEPS} paddle_api_full opt_base) endif() lite_cc_library(lite_pybind SHARED SRCS pybind.cc DEPS ${PYBIND_DEPS}) diff --git a/lite/api/python/pybind/pybind.cc b/lite/api/python/pybind/pybind.cc index 40b6db6d42c3a065ec09d535f0e9da22e8fa0399..942d7f8b540a6ff7ae6d62e98e6e573e1af12aa8 100644 --- a/lite/api/python/pybind/pybind.cc +++ b/lite/api/python/pybind/pybind.cc @@ -26,11 +26,12 @@ #ifndef LITE_ON_TINY_PUBLISH #include "lite/api/cxx_api.h" -#include "lite/api/paddle_use_passes.h" +#include "lite/api/opt_base.h" #endif #include "lite/api/light_api.h" #include "lite/api/paddle_api.h" +#include "lite/core/tensor.h" namespace py = pybind11; @@ -48,10 +49,27 @@ using lite_api::DataLayoutType; using lite_api::Place; using lite_api::MLUCoreVersion; using lite::LightPredictorImpl; +using lite_api::OptBase; #ifndef LITE_ON_TINY_PUBLISH using lite::CxxPaddleApiImpl; static void BindLiteCxxPredictor(py::module *m); +void BindLiteOpt(py::module *m) { + py::class_ opt_base(*m, "Opt"); + opt_base.def(py::init<>()) + .def("set_model_dir", &OptBase::SetModelDir) + .def("set_modelset_dir", &OptBase::SetModelSetDir) + .def("set_model_file", &OptBase::SetModelFile) + .def("set_param_file", &OptBase::SetParamFile) + .def("set_valid_places", &OptBase::SetValidPlaces) + .def("set_optimize_out", &OptBase::SetOptimizeOut) + .def("set_model_type", &OptBase::SetModelType) + .def("run_optimize", &OptBase::RunOptimize) + .def("help", 
&OptBase::PrintHelpInfo) + .def("print_supported_ops", &OptBase::PrintSupportedOps) + .def("display_kernels_info", &OptBase::DisplayKernelsInfo) + .def("print_all_ops", &OptBase::PrintAllOps); +} #endif static void BindLiteLightPredictor(py::module *m); static void BindLiteCxxConfig(py::module *m); diff --git a/lite/api/python/pybind/pybind.h b/lite/api/python/pybind/pybind.h index ca05f24b32fd0b0418d9cf595fe6134b34fa725f..15609957e05391be54466262f962e151594ef383 100644 --- a/lite/api/python/pybind/pybind.h +++ b/lite/api/python/pybind/pybind.h @@ -22,11 +22,15 @@ namespace lite { namespace pybind { void BindLiteApi(pybind11::module *m); +void BindLiteOpt(pybind11::module *m); -PYBIND11_MODULE(lite_core, m) { +PYBIND11_MODULE(lite, m) { m.doc() = "C++ core of Paddle-Lite"; BindLiteApi(&m); +#ifndef LITE_ON_TINY_PUBLISH + BindLiteOpt(&m); +#endif } } // namespace pybind diff --git a/lite/api/python/setup.py.in b/lite/api/python/setup.py.in new file mode 100644 index 0000000000000000000000000000000000000000..79028fb7493bf55eab74aa76ee51ac79f418ba0a --- /dev/null +++ b/lite/api/python/setup.py.in @@ -0,0 +1,72 @@ +# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# module of pack whl installer for Paddle-lite + +import shutil +import os +from setuptools import setup, Distribution + + +class BinaryDistribution(Distribution): + 'binary distribution' + def has_ext_modules(foo): + return True + + +# get paddle-lite version, if it's not based on a release tag, we use commit id instead +PADDLELITE_COMMITE = "@PADDLE_LITE_COMMIT@" +PADDLELITE_TAG = "@PADDLE_LITE_TAG@" +if PADDLELITE_TAG == "": + PADDLELITE_VERSION = PADDLELITE_COMMITE +else: + PADDLELITE_VERSION = PADDLELITE_TAG + +# core lib of paddlelite is stored as lite.so +LITE_PATH = '${PADDLE_BINARY_DIR}/inference_lite_lib/python/install/lite' +PACKAGE_DATA = {'paddlelite': ['lite.so']} +# put all thirdparty libraries in paddlelite.libs +PACKAGE_DATA['paddlelite.libs'] = [] +LIB_PATH = '${PADDLE_BINARY_DIR}/inference_lite_lib/python/install/libs' +if '${WITH_MKL}' == 'ON': + shutil.copy('${MKLML_SHARED_IOMP_LIB}', LIB_PATH) + shutil.copy('${MKLML_SHARED_LIB}', LIB_PATH) + PACKAGE_DATA['paddlelite.libs'] += ['libmklml_intel.so', 'libiomp5.so'] + +# link lite.so to paddlelite.libs +COMMAND = "patchelf --set-rpath '$ORIGIN/../libs/' ${PADDLE_BINARY_DIR}\ +/inference_lite_lib/python/install/lite/lite.so" +if os.system(COMMAND) != 0: + raise Exception("patch third_party libs failed, command: %s" % COMMAND) + +# remove unused paddle/libs/__init__.py +if os.path.isfile(LIB_PATH+'/__init__.py'): + os.remove(LIB_PATH+'/__init__.py') + +# set dir path of each package +PACKAGE_DIR = { + # The paddle.fluid.proto will be generated while compiling. + # So that package points to other directory. 
+ 'paddlelite.libs': LIB_PATH, + 'paddlelite': LITE_PATH +} + +setup( + name='paddlelite', + version=PADDLELITE_VERSION, + description='Paddle-Lite Library', + packages=['paddlelite', 'paddlelite.libs'], + package_dir=PACKAGE_DIR, + package_data=PACKAGE_DATA, + distclass=BinaryDistribution +) diff --git a/lite/api/test_helper.h b/lite/api/test_helper.h index 71752c942bb53e7f2ed289ac0d965ae1d1007c55..a17fc331310cfe17ec36be504b94ddacc724e90f 100644 --- a/lite/api/test_helper.h +++ b/lite/api/test_helper.h @@ -17,6 +17,7 @@ #include #include #include +#include // for eval DEFINE_string(model_dir, "", "model dir"); @@ -43,5 +44,31 @@ inline double GetCurrentUS() { return 1e+6 * time.tv_sec + time.tv_usec; } +template +double compute_mean(const T* in, const size_t length) { + double sum = 0.; + for (size_t i = 0; i < length; ++i) { + sum += in[i]; + } + return sum / length; +} + +template +double compute_standard_deviation(const T* in, + const size_t length, + bool has_mean = false, + double mean = 10000) { + if (!has_mean) { + mean = compute_mean(in, length); + } + + double variance = 0.; + for (size_t i = 0; i < length; ++i) { + variance += pow((in[i] - mean), 2); + } + variance /= length; + return sqrt(variance); +} + } // namespace lite } // namespace paddle diff --git a/lite/backends/arm/math/elementwise.cc b/lite/backends/arm/math/elementwise.cc index 186ad19735799dcb91641354af4b4f09692bfce9..47a4d427f5400212a80fc31336e462a1c48bd640 100644 --- a/lite/backends/arm/math/elementwise.cc +++ b/lite/backends/arm/math/elementwise.cc @@ -266,6 +266,72 @@ void elementwise_add_relu_broadcast(const float* dinx, } } +template <> +void elementwise_add_grad(const float* dout_grad, + float* x_grad, + int num) { + int cnt = num >> 4; + int remain = num & 0x0f; +#pragma omp parallel for + for (int i = 0; i < cnt; ++i) { + const float* out_data = dout_grad + 16 * i; + float* x_data = x_grad + 16 * i; + float32x4_t din0 = vld1q_f32(out_data); + float32x4_t din1 = vld1q_f32(out_data 
+ 4); + float32x4_t din2 = vld1q_f32(out_data + 8); + float32x4_t din3 = vld1q_f32(out_data + 12); + vst1q_f32(x_data, din0); + vst1q_f32(x_data + 4, din1); + vst1q_f32(x_data + 8, din2); + vst1q_f32(x_data + 12, din3); + } + if (remain > 0) { + const float* out_data = dout_grad + 16 * cnt; + float* x_data = x_grad + 16 * cnt; + for (int i = 0; i < remain; ++i) { + x_data[i] = out_data[i]; + } + } +} +// we assume that y_data numel less than x_data, otherwise, call this function +// by change x_grad and y_grad position +template <> +void elementwise_add_grad_broadcast(const float* dout_grad, + float* x_grad, + float* y_grad, + int pre, + int n, + int post) { + if (x_grad) { + elementwise_add_grad(dout_grad, x_grad, pre * n * post); + } + if (y_grad) { + memset(y_grad, 0, n * sizeof(float)); +#pragma omp parallel for + for (int i = 0; i < pre; ++i) { + for (int j = 0; j < n; ++j) { + float sum = 0; + int cnt = post >> 2; + int remain = post & 0x03; + const float* out_data = dout_grad + (i * n + j) * post; + float32x4_t sum_v = vdupq_n_f32(0); + for (int ci = 0; ci < cnt; ++ci) { + float32x4_t din = vld1q_f32(out_data + 4 * ci); + sum_v = vaddq_f32(sum_v, din); + } + out_data += 4 * cnt; + for (int ci = 0; ci < remain; ++ci) { + sum += out_data[ci]; + } + float32x2_t high = vget_high_f32(sum_v); + float32x2_t low = vget_low_f32(sum_v); + sum += vget_lane_f32(high, 0) + vget_lane_f32(high, 1) + + vget_lane_f32(low, 0) + vget_lane_f32(low, 1); + y_grad[j] += sum; + } + } + } +} template <> void elementwise_sub(const float* dinx, const float* diny, @@ -510,6 +576,84 @@ void elementwise_sub_relu_broadcast(const float* dinx, } } } +// we assume the formula is x-y +template <> +void elementwise_sub_grad(const float* dout_grad, + float* x_grad, + float* y_grad, + int num) { + if (x_grad) { + elementwise_add_grad(dout_grad, x_grad, num); + } + if (y_grad) { + int cnt = num >> 4; + int remain = num & 0x0f; + float32x4_t minus = vdupq_n_f32(-1); +#pragma omp parallel for + for 
(int i = 0; i < cnt; ++i) { + const float* out_data = dout_grad + 16 * i; + float* y_data = y_grad + 16 * i; + float32x4_t din0 = vld1q_f32(out_data); + float32x4_t din1 = vld1q_f32(out_data + 4); + float32x4_t din2 = vld1q_f32(out_data + 8); + float32x4_t din3 = vld1q_f32(out_data + 12); + din0 = vmulq_f32(din0, minus); + din1 = vmulq_f32(din1, minus); + din2 = vmulq_f32(din2, minus); + din3 = vmulq_f32(din3, minus); + vst1q_f32(y_data, din0); + vst1q_f32(y_data + 4, din1); + vst1q_f32(y_data + 8, din2); + vst1q_f32(y_data + 12, din3); + } + if (remain > 0) { + const float* out_data = dout_grad + 16 * cnt; + float* y_data = y_grad + 16 * cnt; + for (int i = 0; i < remain; ++i) { + y_data[i] = -out_data[i]; + } + } + } +} +// we assume that y_data numel less than x_data, otherwise, call this function +// by change x_grad and y_grad position +template <> +void elementwise_sub_grad_broadcast(const float* dout_grad, + float* x_grad, + float* y_grad, + int pre, + int n, + int post) { + if (x_grad) { + elementwise_add_grad(dout_grad, x_grad, pre * n * post); + } + if (y_grad) { + memset(y_grad, 0, n * sizeof(float)); +#pragma omp parallel for + for (int i = 0; i < pre; ++i) { + for (int j = 0; j < n; ++j) { + float sum = 0; + int cnt = post << 2; + int remain = post & 0x03; + const float* out_data = dout_grad + (i * n + j) * post; + float32x4_t sum_v = vdupq_n_f32(0); + for (int ci = 0; ci < cnt; ++ci) { + float32x4_t din = vld1q_f32(out_data + 4 * ci); + sum_v = vaddq_f32(sum_v, din); + } + out_data += 4 * cnt; + for (int ci = 0; ci < remain; ++ci) { + sum -= out_data[ci]; + } + float32x2_t high = vget_high_f32(sum_v); + float32x2_t low = vget_low_f32(sum_v); + sum -= vget_lane_f32(high, 0) + vget_lane_f32(high, 1) + + vget_lane_f32(low, 0) + vget_lane_f32(low, 1); + y_grad[j] += sum; + } + } + } +} template <> void elementwise_mul(const float* dinx, diff --git a/lite/backends/arm/math/elementwise.h b/lite/backends/arm/math/elementwise.h index 
60d702742dec58f1502837617f5d4059dbb43e22..06ecab08edcaf06614de94b99084be2ee80647aa 100644 --- a/lite/backends/arm/math/elementwise.h +++ b/lite/backends/arm/math/elementwise.h @@ -183,6 +183,13 @@ template void elementwise_add_relu_broadcast( const T* dinx, const T* diny, T* dout, int batch, int channels, int num); +template +void elementwise_add_grad(const T* dout, T* dinx, int num); + +template +void elementwise_add_grad_broadcast( + const T* dout_grad, T* x_grad, T* y_grad, int pre, int n, int post); + template void elementwise_sub(const T* dinx, const T* diny, T* dout, int num); @@ -197,6 +204,13 @@ template void elementwise_sub_relu_broadcast( const T* dinx, const T* diny, T* dout, int batch, int channels, int num); +template +void elementwise_sub_grad(const T* dout, T* dinx, T* diny, int num); + +template +void elementwise_sub_grad_broadcast( + const T* dout_grad, T* x_grad, T* y_grad, int pre, int n, int post); + template void elementwise_mul(const T* dinx, const T* diny, T* dout, int num); diff --git a/lite/backends/arm/math/sgemv.cc b/lite/backends/arm/math/sgemv.cc index d17ce0dea4640899482ba9dd87d0646ca2de705d..a7d4322326c9413878264400ba8118b510fade10 100644 --- a/lite/backends/arm/math/sgemv.cc +++ b/lite/backends/arm/math/sgemv.cc @@ -983,10 +983,12 @@ void sgemv_trans(const int M, "vld1.32 {d8-d11}, [%[in]]! @ load input, q4, q5\n" \ "vld1.32 {d12-d15}, [%[w0]]! @ load weights r0, q6,q7\n" \ "vld1.32 {d16-d19}, [%[w1]]! @ load weights r1, q8,q9\n" \ - "vld1.32 {d20-d23}, [%[w2]]! @ load weights r2, q10,q11\n" \ - "vld1.32 {d24-d27}, [%[w3]]! @ load weights r3, q12,q13\n" \ "vmla.f32 q0, q4, q6 @ mul add\n" \ + "vld1.32 {d20-d23}, [%[w2]]! @ load weights r2, q10,q11\n" \ "vmla.f32 q1, q4, q8 @ mul add\n" \ + "vld1.32 {d24-d27}, [%[w3]]! 
@ load weights r3, q12,q13\n" \ + /*"vmla.f32 q0, q4, q6 @ mul add\n" */ \ + /*"vmla.f32 q1, q4, q8 @ mul add\n" */ \ "vmla.f32 q2, q4, q10 @ mul add\n" \ "vmla.f32 q3, q4, q12 @ mul add\n" \ "subs %[cnt], #1 @ sub loop count \n" \ diff --git a/lite/backends/fpga/KD/debugger.hpp b/lite/backends/fpga/KD/debugger.hpp index 9b1189c407d6d601bb3e5ba8172b1455f04710fd..83b8dff70eb8de7cf1d117585d47118fed539a15 100755 --- a/lite/backends/fpga/KD/debugger.hpp +++ b/lite/backends/fpga/KD/debugger.hpp @@ -106,7 +106,7 @@ inline void read_from_file(lite::Tensor* t, const std::string& path) { inline void save_float(float* data, const std::string& name, int len) { static int counter = 0; - std::string old_string = std::to_string(counter); + std::string old_string = paddle::lite::to_string(counter); std::string new_string = std::string(3 - old_string.length(), '0') + old_string; diff --git a/lite/backends/fpga/KD/tensor.hpp b/lite/backends/fpga/KD/tensor.hpp index 988bc1bb507036de8f13a6c6549c549718bd1256..12a60bd27da832b338dc6b1ca11b1c7d6aa192e4 100644 --- a/lite/backends/fpga/KD/tensor.hpp +++ b/lite/backends/fpga/KD/tensor.hpp @@ -351,10 +351,10 @@ class Tensor { void printScale(std::string type) { printScale(); } std::string dimsFileName() { - return std::to_string(shape_->num()) + "_" + - std::to_string(shape_->channel()) + "_" + - std::to_string(shape_->height()) + "_" + - std::to_string(shape_->width()) + ".txt"; + return paddle::lite::to_string(shape_->num()) + "_" + + paddle::lite::to_string(shape_->channel()) + "_" + + paddle::lite::to_string(shape_->height()) + "_" + + paddle::lite::to_string(shape_->width()) + ".txt"; } void saveToFile() { std::string path = dimsFileName(); } @@ -374,7 +374,7 @@ class Tensor { invalidate(); std::ofstream ofs; static int counter = 0; - std::string npath = std::to_string(counter) + "_" + path; + std::string npath = paddle::lite::to_string(counter) + "_" + path; counter++; save_file_with_name(npath); } diff --git 
a/lite/backends/npu/device.cc b/lite/backends/npu/device.cc index d62ac9cad3e5ab4e6f63e3b667e3fa93e244fec1..345b239c320f04eba8426483a23a352e77a71036 100644 --- a/lite/backends/npu/device.cc +++ b/lite/backends/npu/device.cc @@ -19,8 +19,8 @@ namespace paddle { namespace lite { namespace npu { -std::unique_ptr Device::Build( - std::string& model_name, // NOLINT +std::shared_ptr Device::Build( + const std::string model_name, // NOLINT std::vector& input_nodes, // NOLINT std::vector& output_nodes // NOLINT ) { @@ -41,15 +41,15 @@ std::unique_ptr Device::Build( ir_build.ReleaseModelBuff(om_model_buf); return nullptr; } + // Create a HiAI model manager client to load the HiAI om model - std::unique_ptr model_client( + std::shared_ptr model_client( new hiai::AiModelMngerClient()); if (model_client->Init(nullptr) != hiai::AI_SUCCESS) { LOG(WARNING) << "[NPU] AiModelMngerClient init failed)!"; ir_build.ReleaseModelBuff(om_model_buf); return nullptr; } - model_name = "model_" + std::to_string(model_count_++) + ".om"; auto model_desc = std::make_shared( model_name, freq_level(), framework_type(), model_type(), device_type()); model_desc->SetModelBuffer(om_model_buf.data, om_model_buf.length); diff --git a/lite/backends/npu/device.h b/lite/backends/npu/device.h index 411600ae0a38e4ee1b4a3ce3d6519b927eeb0a1a..6733a7f6dfa085d2c64274a81ba2a028ebe88f3f 100644 --- a/lite/backends/npu/device.h +++ b/lite/backends/npu/device.h @@ -40,8 +40,8 @@ class Device { // Build the HiAI IR graph to om model, return HiAI model manager client to // load om model and run inference. 
- std::unique_ptr Build( - std::string& model_name, // NOLINT + std::shared_ptr Build( + const std::string model_name, // NOLINT std::vector& input_nodes, // NOLINT std::vector& output_nodes // NOLINT ); // NOLINT @@ -51,7 +51,6 @@ class Device { int framework_type_{0}; int model_type_{0}; int device_type_{0}; - int model_count_{0}; }; } // namespace npu diff --git a/lite/backends/opencl/CMakeLists.txt b/lite/backends/opencl/CMakeLists.txt index 3b504fbed6a5cef6ab3cff46c0e9b7009459ac80..0ac8cf310370f34ae5743113efe1d71579979daf 100644 --- a/lite/backends/opencl/CMakeLists.txt +++ b/lite/backends/opencl/CMakeLists.txt @@ -13,6 +13,5 @@ lite_cc_library(cl_image SRCS cl_image.cc DEPS tensor cl_image_converter cl_runt lite_cc_library(cl_caller SRCS cl_caller.cc DEPS cl_context cl_image) lite_cc_library(cl_target_wrapper SRCS target_wrapper.cc DEPS cl_runtime) lite_cc_test(test_cl_functions SRCS cl_functions_test.cc DEPS cl_context cl_image cl_caller cl_wrapper cl_target_wrapper) -lite_cc_test(test_cl_im2col SRCS cl_im2col_test.cc DEPS tensor cl_context cl_wrapper cl_target_wrapper) add_dependencies(cl_wrapper opencl_clhpp) diff --git a/lite/backends/opencl/cl_kernel/image/conv2d_1x1_kernel.cl b/lite/backends/opencl/cl_kernel/image/conv2d_1x1_opt_kernel.cl similarity index 99% rename from lite/backends/opencl/cl_kernel/image/conv2d_1x1_kernel.cl rename to lite/backends/opencl/cl_kernel/image/conv2d_1x1_opt_kernel.cl index d840195dd42c71bab5afda32a11d805f5a96b114..4b2d5ba32072e7eb31adbf347360e0bbcee7bc5b 100644 --- a/lite/backends/opencl/cl_kernel/image/conv2d_1x1_kernel.cl +++ b/lite/backends/opencl/cl_kernel/image/conv2d_1x1_opt_kernel.cl @@ -1,6 +1,6 @@ #include -__kernel void conv2d_1x1(__private const int global_size_dim0, +__kernel void conv2d_1x1_opt(__private const int global_size_dim0, __private const int global_size_dim1, __private const int global_size_dim2, __read_only image2d_t input_image, diff --git 
a/lite/backends/opencl/cl_kernel/image/conv2d_3x3_opt_kernel.cl b/lite/backends/opencl/cl_kernel/image/conv2d_3x3_opt_kernel.cl index 468dd1a8a30ca572d76ed0e20acf59e6906e0e1c..79f3922e89549fc15b7a849efb0e2b6595357102 100644 --- a/lite/backends/opencl/cl_kernel/image/conv2d_3x3_opt_kernel.cl +++ b/lite/backends/opencl/cl_kernel/image/conv2d_3x3_opt_kernel.cl @@ -14,21 +14,22 @@ limitations under the License. */ #include -__kernel void conv2d_3x3_opt(__private const int item_ch, +__kernel void conv2d_3x3_opt(__private const int item_ch, __private const int item_w, - __private const int item_h, + __private const int item_h, __read_only image2d_t input_image, __read_only image2d_t filter_image, #if defined(BIASE_CH) || defined(BIASE_ELE) __read_only image2d_t bias, #endif - __write_only image2d_t output_image, + __write_only image2d_t output_image, __private const int stride, - __private const int pad, + __private const int pad, __private const int dilation, - __private const int in_ch, + __private const int batch, + __private const int in_ch, __private const int in_w, - __private const int in_h, + __private const int in_h, __private const int out_w, __private const int out_h) { @@ -60,7 +61,8 @@ __kernel void conv2d_3x3_opt(__private const int item_ch, #ifdef BIASE_CH CL_DTYPE4 output[5]; - output[0] = READ_IMG_TYPE(CL_DTYPE_CHAR, bias, sampler, (int2)(item_ch_id, 0)); + output[0] = + READ_IMG_TYPE(CL_DTYPE_CHAR, bias, sampler, (int2)(item_ch_id, 0)); output[1] = output[0]; output[2] = output[0]; output[3] = output[0]; @@ -69,23 +71,33 @@ __kernel void conv2d_3x3_opt(__private const int item_ch, #elif defined(BIASE_ELE) CL_DTYPE4 output[5]; - output[0] = - READ_IMG_TYPE(CL_DTYPE_CHAR, bias, sampler, (int2)(out_w_base_id + out_w_id0, item_h_id)); + output[0] = READ_IMG_TYPE(CL_DTYPE_CHAR, + bias, + sampler, + (int2)(out_w_base_id + out_w_id0, item_h_id)); if (out_w_id1 < out_w) { - output[1] = READ_IMG_TYPE(CL_DTYPE_CHAR, bias, sampler, - (int2)(out_w_base_id + 
out_w_id1, item_h_id)); + output[1] = READ_IMG_TYPE(CL_DTYPE_CHAR, + bias, + sampler, + (int2)(out_w_base_id + out_w_id1, item_h_id)); } if (out_w_id2 < out_w) { - output[2] = READ_IMG_TYPE(CL_DTYPE_CHAR, bias, sampler, - (int2)(out_w_base_id + out_w_id2, item_h_id)); + output[2] = READ_IMG_TYPE(CL_DTYPE_CHAR, + bias, + sampler, + (int2)(out_w_base_id + out_w_id2, item_h_id)); } if (out_w_id3 < out_w) { - output[3] = READ_IMG_TYPE(CL_DTYPE_CHAR, bias, sampler, - (int2)(out_w_base_id + out_w_id3, item_h_id)); + output[3] = READ_IMG_TYPE(CL_DTYPE_CHAR, + bias, + sampler, + (int2)(out_w_base_id + out_w_id3, item_h_id)); } if (out_w_id4 < out_w) { - output[4] = READ_IMG_TYPE(CL_DTYPE_CHAR, bias, sampler, - (int2)(out_w_base_id + out_w_id4, item_h_id)); + output[4] = READ_IMG_TYPE(CL_DTYPE_CHAR, + bias, + sampler, + (int2)(out_w_base_id + out_w_id4, item_h_id)); } #else CL_DTYPE4 output[5] = {0.0f}; @@ -108,54 +120,76 @@ __kernel void conv2d_3x3_opt(__private const int item_ch, int filter_w_val = ch * 3; for (int h = 0; h < 3; h++) { - int in_h_val = select(out_batch_id * in_h + in_h_id + h, -1, + int in_h_val = select(out_batch_id * in_h + in_h_id + h, + -1, (out_batch_id * in_h + in_h_id + h < 0 || out_batch_id * in_h + in_h_id + h >= in_h)); for (int w = 0; w < 3; w++) { - int in_w_val0 = select(in_w_base_id + in_w_id0 + w, -1, + int in_w_val0 = select(in_w_base_id + in_w_id0 + w, + -1, (in_w_id0 + w < 0 || in_w_id0 + w >= in_w)); - int in_w_val1 = select(in_w_base_id + in_w_id1 + w, -1, + int in_w_val1 = select(in_w_base_id + in_w_id1 + w, + -1, (in_w_id1 + w < 0 || in_w_id1 + w >= in_w)); - int in_w_val2 = select(in_w_base_id + in_w_id2 + w, -1, + int in_w_val2 = select(in_w_base_id + in_w_id2 + w, + -1, (in_w_id2 + w < 0 || in_w_id2 + w >= in_w)); - int in_w_val3 = select(in_w_base_id + in_w_id3 + w, -1, + int in_w_val3 = select(in_w_base_id + in_w_id3 + w, + -1, (in_w_id3 + w < 0 || in_w_id3 + w >= in_w)); - int in_w_val4 = select(in_w_base_id + in_w_id4 + w, -1, 
+ int in_w_val4 = select(in_w_base_id + in_w_id4 + w, + -1, (in_w_id4 + w < 0 || in_w_id4 + w >= in_w)); - filter[0] = READ_IMG_TYPE(CL_DTYPE_CHAR, - filter_image, sampler, + filter[0] = READ_IMG_TYPE( + CL_DTYPE_CHAR, + filter_image, + sampler, (int2)(filter_w_val + w, filter_h_val0 + h)); // in_ch:0-3,out_ch:0 - filter[1] = READ_IMG_TYPE(CL_DTYPE_CHAR, - filter_image, sampler, + filter[1] = READ_IMG_TYPE( + CL_DTYPE_CHAR, + filter_image, + sampler, (int2)(filter_w_val + w, filter_h_val1 + h)); // in_ch:0-3,out_ch:1 - filter[2] = READ_IMG_TYPE(CL_DTYPE_CHAR, - filter_image, sampler, + filter[2] = READ_IMG_TYPE( + CL_DTYPE_CHAR, + filter_image, + sampler, (int2)(filter_w_val + w, filter_h_val2 + h)); // in_ch:0-3,out_ch:2 - filter[3] = READ_IMG_TYPE(CL_DTYPE_CHAR, - filter_image, sampler, + filter[3] = READ_IMG_TYPE( + CL_DTYPE_CHAR, + filter_image, + sampler, (int2)(filter_w_val + w, filter_h_val3 + h)); // in_ch:0-3,out_ch:3 - filter_trans[0] = (CL_DTYPE4)(filter[0].x, filter[1].x, filter[2].x, - filter[3].x); // in_ch:0,out_ch:0-3 - filter_trans[1] = (CL_DTYPE4)(filter[0].y, filter[1].y, filter[2].y, - filter[3].y); // in_ch:1,out_ch:0-3 - filter_trans[2] = (CL_DTYPE4)(filter[0].z, filter[1].z, filter[2].z, - filter[3].z); // in_ch:2,out_ch:0-3 - filter_trans[3] = (CL_DTYPE4)(filter[0].w, filter[1].w, filter[2].w, - filter[3].w); // in_ch:3,out_ch:0-3 - - input[0] = - READ_IMG_TYPE(CL_DTYPE_CHAR, input_image, sampler, (int2)(in_w_val0, in_h_val)); - input[1] = - READ_IMG_TYPE(CL_DTYPE_CHAR, input_image, sampler, (int2)(in_w_val1, in_h_val)); - input[2] = - READ_IMG_TYPE(CL_DTYPE_CHAR, input_image, sampler, (int2)(in_w_val2, in_h_val)); - input[3] = - READ_IMG_TYPE(CL_DTYPE_CHAR, input_image, sampler, (int2)(in_w_val3, in_h_val)); - input[4] = - READ_IMG_TYPE(CL_DTYPE_CHAR, input_image, sampler, (int2)(in_w_val4, in_h_val)); + filter_trans[0] = (CL_DTYPE4)(filter[0].x, + filter[1].x, + filter[2].x, + filter[3].x); // in_ch:0,out_ch:0-3 + filter_trans[1] = 
(CL_DTYPE4)(filter[0].y, + filter[1].y, + filter[2].y, + filter[3].y); // in_ch:1,out_ch:0-3 + filter_trans[2] = (CL_DTYPE4)(filter[0].z, + filter[1].z, + filter[2].z, + filter[3].z); // in_ch:2,out_ch:0-3 + filter_trans[3] = (CL_DTYPE4)(filter[0].w, + filter[1].w, + filter[2].w, + filter[3].w); // in_ch:3,out_ch:0-3 + + input[0] = READ_IMG_TYPE( + CL_DTYPE_CHAR, input_image, sampler, (int2)(in_w_val0, in_h_val)); + input[1] = READ_IMG_TYPE( + CL_DTYPE_CHAR, input_image, sampler, (int2)(in_w_val1, in_h_val)); + input[2] = READ_IMG_TYPE( + CL_DTYPE_CHAR, input_image, sampler, (int2)(in_w_val2, in_h_val)); + input[3] = READ_IMG_TYPE( + CL_DTYPE_CHAR, input_image, sampler, (int2)(in_w_val3, in_h_val)); + input[4] = READ_IMG_TYPE( + CL_DTYPE_CHAR, input_image, sampler, (int2)(in_w_val4, in_h_val)); output[0] = mad(input[0].x, filter_trans[0], output[0]); output[1] = mad(input[1].x, filter_trans[0], output[1]); @@ -194,23 +228,278 @@ __kernel void conv2d_3x3_opt(__private const int item_ch, output[3] = activation_type4(output[3]); output[4] = activation_type4(output[4]); - WRITE_IMG_TYPE(CL_DTYPE_CHAR, output_image, (int2)(out_w_base_id + out_w_id0, item_h_id), - output[0]); + WRITE_IMG_TYPE(CL_DTYPE_CHAR, + output_image, + (int2)(out_w_base_id + out_w_id0, item_h_id), + output[0]); if (out_w_id1 < out_w) { - WRITE_IMG_TYPE(CL_DTYPE_CHAR, output_image, (int2)(out_w_base_id + out_w_id1, item_h_id), - output[1]); + WRITE_IMG_TYPE(CL_DTYPE_CHAR, + output_image, + (int2)(out_w_base_id + out_w_id1, item_h_id), + output[1]); } if (out_w_id2 < out_w) { - WRITE_IMG_TYPE(CL_DTYPE_CHAR, output_image, (int2)(out_w_base_id + out_w_id2, item_h_id), - output[2]); + WRITE_IMG_TYPE(CL_DTYPE_CHAR, + output_image, + (int2)(out_w_base_id + out_w_id2, item_h_id), + output[2]); } if (out_w_id3 < out_w) { - WRITE_IMG_TYPE(CL_DTYPE_CHAR, output_image, (int2)(out_w_base_id + out_w_id3, item_h_id), - output[3]); + WRITE_IMG_TYPE(CL_DTYPE_CHAR, + output_image, + (int2)(out_w_base_id + out_w_id3, 
item_h_id), + output[3]); } if (out_w_id4 < out_w) { - WRITE_IMG_TYPE(CL_DTYPE_CHAR, output_image, (int2)(out_w_base_id + out_w_id4, item_h_id), - output[4]); + WRITE_IMG_TYPE(CL_DTYPE_CHAR, + output_image, + (int2)(out_w_base_id + out_w_id4, item_h_id), + output[4]); } } +// support batch > 1 +__kernel void conv2d_3x3_multi_batch(__private const int item_ch, + __private const int item_w, + __private const int item_h, + __read_only image2d_t input_image, + __read_only image2d_t filter_image, +#if defined(BIASE_CH) || defined(BIASE_ELE) + __read_only image2d_t bias, +#endif + __write_only image2d_t output_image, + __private const int stride, + __private const int pad, + __private const int dilation, + __private const int batch, + __private const int in_ch, + __private const int in_w, + __private const int in_h, + __private const int out_w, + __private const int out_h) { + + const sampler_t sampler = + CLK_NORMALIZED_COORDS_TRUE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST; + + // item_id + const int item_ch_id = get_global_id(0); + const int item_w_id = get_global_id(1); + const int item_h_id = get_global_id(2); + + // out_width_id_per_blk and out_batch_id + int out_batch_id = item_h_id / in_h; + int out_w_base_id = item_ch_id * out_w; + int out_w_id0 = item_w_id; + int out_w_id1 = out_w_id0 + item_w; + int out_w_id2 = out_w_id1 + item_w; + int out_w_id3 = out_w_id2 + item_w; + int out_w_id4 = out_w_id3 + item_w; + + // in_width_id_per_blk and in_height_id_per_batch + int in_h_id = (item_h_id % out_h) * stride - pad; + int in_w_id0 = item_w_id * stride - pad; + int in_w_id1 = in_w_id0 + item_w * stride; + int in_w_id2 = in_w_id1 + item_w * stride; + int in_w_id3 = in_w_id2 + item_w * stride; + int in_w_id4 = in_w_id3 + item_w * stride; + +#ifdef BIASE_CH + + CL_DTYPE4 output[5]; + output[0] = + READ_IMG_TYPE(CL_DTYPE_CHAR, bias, sampler, (int2)(item_ch_id, 0)); + output[1] = output[0]; + output[2] = output[0]; + output[3] = output[0]; + output[4] = output[0]; + +#elif 
defined(BIASE_ELE) + + CL_DTYPE4 output[5]; + output[0] = READ_IMG_TYPE(CL_DTYPE_CHAR, + bias, + sampler, + (int2)(out_w_base_id + out_w_id0, item_h_id)); + if (out_w_id1 < out_w) { + output[1] = READ_IMG_TYPE(CL_DTYPE_CHAR, + bias, + sampler, + (int2)(out_w_base_id + out_w_id1, item_h_id)); + } + if (out_w_id2 < out_w) { + output[2] = READ_IMG_TYPE(CL_DTYPE_CHAR, + bias, + sampler, + (int2)(out_w_base_id + out_w_id2, item_h_id)); + } + if (out_w_id3 < out_w) { + output[3] = READ_IMG_TYPE(CL_DTYPE_CHAR, + bias, + sampler, + (int2)(out_w_base_id + out_w_id3, item_h_id)); + } + if (out_w_id4 < out_w) { + output[4] = READ_IMG_TYPE(CL_DTYPE_CHAR, + bias, + sampler, + (int2)(out_w_base_id + out_w_id4, item_h_id)); + } +#else + CL_DTYPE4 output[5] = {0.0f}; +#endif + + CL_DTYPE4 filter[4] = {0.0f}; + CL_DTYPE4 filter_trans[4] = {0.0f}; + CL_DTYPE4 input[5] = {0.0f}; + + int filter_h_val0 = item_ch_id * 4 * 3; + int filter_h_val1 = filter_h_val0 + 3; + int filter_h_val2 = filter_h_val1 + 3; + int filter_h_val3 = filter_h_val2 + 3; + + for (int ch = 0; ch < (in_ch + 3) / 4; ch++) { + int ch_surplus = (ch + 1) * 4 - in_ch > 0 ? 
(ch + 1) * 4 - in_ch : 0; + + const int in_w_base_id = mul24(ch, in_w); + + int filter_w_val = ch * 3; + + for (int h = 0; h < 3; h++) { + int in_h_val = select( + out_batch_id * in_h + in_h_id + h, + -1, + (out_batch_id * in_h + in_h_id + h < out_batch_id * in_h || + out_batch_id * in_h + in_h_id + h >= (out_batch_id + 1) * in_h)); + + for (int w = 0; w < 3; w++) { + int in_w_val0 = select(in_w_base_id + in_w_id0 + w, + -1, + (in_w_id0 + w < 0 || in_w_id0 + w >= in_w)); + int in_w_val1 = select(in_w_base_id + in_w_id1 + w, + -1, + (in_w_id1 + w < 0 || in_w_id1 + w >= in_w)); + int in_w_val2 = select(in_w_base_id + in_w_id2 + w, + -1, + (in_w_id2 + w < 0 || in_w_id2 + w >= in_w)); + int in_w_val3 = select(in_w_base_id + in_w_id3 + w, + -1, + (in_w_id3 + w < 0 || in_w_id3 + w >= in_w)); + int in_w_val4 = select(in_w_base_id + in_w_id4 + w, + -1, + (in_w_id4 + w < 0 || in_w_id4 + w >= in_w)); + + filter[0] = READ_IMG_TYPE( + CL_DTYPE_CHAR, + filter_image, + sampler, + (int2)(filter_w_val + w, filter_h_val0 + h)); // in_ch:0-3,out_ch:0 + filter[1] = READ_IMG_TYPE( + CL_DTYPE_CHAR, + filter_image, + sampler, + (int2)(filter_w_val + w, filter_h_val1 + h)); // in_ch:0-3,out_ch:1 + filter[2] = READ_IMG_TYPE( + CL_DTYPE_CHAR, + filter_image, + sampler, + (int2)(filter_w_val + w, filter_h_val2 + h)); // in_ch:0-3,out_ch:2 + filter[3] = READ_IMG_TYPE( + CL_DTYPE_CHAR, + filter_image, + sampler, + (int2)(filter_w_val + w, filter_h_val3 + h)); // in_ch:0-3,out_ch:3 + + filter_trans[0] = (CL_DTYPE4)(filter[0].x, + filter[1].x, + filter[2].x, + filter[3].x); // in_ch:0,out_ch:0-3 + filter_trans[1] = (CL_DTYPE4)(filter[0].y, + filter[1].y, + filter[2].y, + filter[3].y); // in_ch:1,out_ch:0-3 + filter_trans[2] = (CL_DTYPE4)(filter[0].z, + filter[1].z, + filter[2].z, + filter[3].z); // in_ch:2,out_ch:0-3 + filter_trans[3] = (CL_DTYPE4)(filter[0].w, + filter[1].w, + filter[2].w, + filter[3].w); // in_ch:3,out_ch:0-3 + + input[0] = READ_IMG_TYPE( + CL_DTYPE_CHAR, input_image, 
sampler, (int2)(in_w_val0, in_h_val)); + input[1] = READ_IMG_TYPE( + CL_DTYPE_CHAR, input_image, sampler, (int2)(in_w_val1, in_h_val)); + input[2] = READ_IMG_TYPE( + CL_DTYPE_CHAR, input_image, sampler, (int2)(in_w_val2, in_h_val)); + input[3] = READ_IMG_TYPE( + CL_DTYPE_CHAR, input_image, sampler, (int2)(in_w_val3, in_h_val)); + input[4] = READ_IMG_TYPE( + CL_DTYPE_CHAR, input_image, sampler, (int2)(in_w_val4, in_h_val)); + + output[0] = mad(input[0].x, filter_trans[0], output[0]); + output[1] = mad(input[1].x, filter_trans[0], output[1]); + output[2] = mad(input[2].x, filter_trans[0], output[2]); + output[3] = mad(input[3].x, filter_trans[0], output[3]); + output[4] = mad(input[4].x, filter_trans[0], output[4]); + + if (ch_surplus < 3) { + output[0] = mad(input[0].y, filter_trans[1], output[0]); + output[1] = mad(input[1].y, filter_trans[1], output[1]); + output[2] = mad(input[2].y, filter_trans[1], output[2]); + output[3] = mad(input[3].y, filter_trans[1], output[3]); + output[4] = mad(input[4].y, filter_trans[1], output[4]); + } + if (ch_surplus < 2) { + output[0] = mad(input[0].z, filter_trans[2], output[0]); + output[1] = mad(input[1].z, filter_trans[2], output[1]); + output[2] = mad(input[2].z, filter_trans[2], output[2]); + output[3] = mad(input[3].z, filter_trans[2], output[3]); + output[4] = mad(input[4].z, filter_trans[2], output[4]); + } + if (ch_surplus < 1) { + output[0] = mad(input[0].w, filter_trans[3], output[0]); + output[1] = mad(input[1].w, filter_trans[3], output[1]); + output[2] = mad(input[2].w, filter_trans[3], output[2]); + output[3] = mad(input[3].w, filter_trans[3], output[3]); + output[4] = mad(input[4].w, filter_trans[3], output[4]); + } + } + } + } + + output[0] = activation_type4(output[0]); + output[1] = activation_type4(output[1]); + output[2] = activation_type4(output[2]); + output[3] = activation_type4(output[3]); + output[4] = activation_type4(output[4]); + + WRITE_IMG_TYPE(CL_DTYPE_CHAR, + output_image, + (int2)(out_w_base_id + 
out_w_id0, item_h_id), + output[0]); + if (out_w_id1 < out_w) { + WRITE_IMG_TYPE(CL_DTYPE_CHAR, + output_image, + (int2)(out_w_base_id + out_w_id1, item_h_id), + output[1]); + } + if (out_w_id2 < out_w) { + WRITE_IMG_TYPE(CL_DTYPE_CHAR, + output_image, + (int2)(out_w_base_id + out_w_id2, item_h_id), + output[2]); + } + if (out_w_id3 < out_w) { + WRITE_IMG_TYPE(CL_DTYPE_CHAR, + output_image, + (int2)(out_w_base_id + out_w_id3, item_h_id), + output[3]); + } + if (out_w_id4 < out_w) { + WRITE_IMG_TYPE(CL_DTYPE_CHAR, + output_image, + (int2)(out_w_base_id + out_w_id4, item_h_id), + output[4]); + } +} diff --git a/lite/backends/opencl/cl_kernel/image/conv2d_5x5_opt_kernel.cl b/lite/backends/opencl/cl_kernel/image/conv2d_5x5_opt_kernel.cl new file mode 100644 index 0000000000000000000000000000000000000000..4ed2e072022dc4b457a86d634bf4bc21ab62bc45 --- /dev/null +++ b/lite/backends/opencl/cl_kernel/image/conv2d_5x5_opt_kernel.cl @@ -0,0 +1,516 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include + +// opt version of conv5x5 +__kernel void conv2d_5x5_opt(__private const int item_ch, + __private const int item_w, + __private const int item_h, + __read_only image2d_t input_image, + __read_only image2d_t filter_image, +#if defined(BIASE_CH) || defined(BIASE_ELE) + __read_only image2d_t bias, +#endif + __write_only image2d_t output_image, + __private const int stride, + __private const int pad, + __private const int dilation, + __private const int batch, + __private const int in_ch, + __private const int in_w, + __private const int in_h, + __private const int out_w, + __private const int out_h) { + + const sampler_t sampler = + CLK_NORMALIZED_COORDS_TRUE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST; + // filter + const int filter_w = 5; + const int filter_h = 5; + + // item_id + const int item_ch_id = get_global_id(0); + const int item_w_id = get_global_id(1); + const int item_h_id = get_global_id(2); + + // out_width_id_per_blk and out_batch_id + int out_w_base_id = item_ch_id * out_w; + int out_w_id0 = item_w_id; + int out_w_id1 = out_w_id0 + item_w; + int out_w_id2 = out_w_id1 + item_w; + int out_w_id3 = out_w_id2 + item_w; + int out_w_id4 = out_w_id3 + item_w; + + // in_width_id_per_blk and in_height_id_per_batch + int in_h_id = (item_h_id % out_h) * stride - pad; + int in_w_id0 = item_w_id * stride - pad; + int in_w_id1 = in_w_id0 + item_w * stride; + int in_w_id2 = in_w_id1 + item_w * stride; + int in_w_id3 = in_w_id2 + item_w * stride; + int in_w_id4 = in_w_id3 + item_w * stride; + +#ifdef BIASE_CH + + CL_DTYPE4 output[5]; + output[0] = + READ_IMG_TYPE(CL_DTYPE_CHAR, bias, sampler, (int2)(item_ch_id, 0)); + output[1] = output[0]; + output[2] = output[0]; + output[3] = output[0]; + output[4] = output[0]; + +#elif defined(BIASE_ELE) + + CL_DTYPE4 output[5]; + output[0] = READ_IMG_TYPE(CL_DTYPE_CHAR, + bias, + sampler, + (int2)(out_w_base_id + out_w_id0, item_h_id)); + if (out_w_id1 < out_w) { + output[1] = READ_IMG_TYPE(CL_DTYPE_CHAR, + bias, + 
sampler, + (int2)(out_w_base_id + out_w_id1, item_h_id)); + } + if (out_w_id2 < out_w) { + output[2] = READ_IMG_TYPE(CL_DTYPE_CHAR, + bias, + sampler, + (int2)(out_w_base_id + out_w_id2, item_h_id)); + } + if (out_w_id3 < out_w) { + output[3] = READ_IMG_TYPE(CL_DTYPE_CHAR, + bias, + sampler, + (int2)(out_w_base_id + out_w_id3, item_h_id)); + } + if (out_w_id4 < out_w) { + output[4] = READ_IMG_TYPE(CL_DTYPE_CHAR, + bias, + sampler, + (int2)(out_w_base_id + out_w_id4, item_h_id)); + } +#else + CL_DTYPE4 output[5] = {0.0f}; +#endif + + CL_DTYPE4 filter[4] = {0.0f}; + CL_DTYPE4 filter_trans[4] = {0.0f}; + CL_DTYPE4 input[5] = {0.0f}; + + int filter_h_val0 = item_ch_id * 4 * filter_h; + int filter_h_val1 = filter_h_val0 + filter_h; + int filter_h_val2 = filter_h_val1 + filter_h; + int filter_h_val3 = filter_h_val2 + filter_h; + + for (int ch = 0; ch < (in_ch + 3) / 4; ch++) { + int ch_surplus = (ch + 1) * 4 - in_ch > 0 ? (ch + 1) * 4 - in_ch : 0; + + const int in_w_base_id = mul24(ch, in_w); + + int filter_w_val = ch * filter_w; + + for (int h = 0; h < filter_h; h++) { + int in_h_val = + select(in_h_id + h, -1, (in_h_id + h < 0 || in_h_id + h >= in_h)); + + for (int w = 0; w < filter_w; w++) { + int in_w_val0 = select(in_w_base_id + in_w_id0 + w, + -1, + (in_w_id0 + w < 0 || in_w_id0 + w >= in_w)); + int in_w_val1 = select(in_w_base_id + in_w_id1 + w, + -1, + (in_w_id1 + w < 0 || in_w_id1 + w >= in_w)); + int in_w_val2 = select(in_w_base_id + in_w_id2 + w, + -1, + (in_w_id2 + w < 0 || in_w_id2 + w >= in_w)); + int in_w_val3 = select(in_w_base_id + in_w_id3 + w, + -1, + (in_w_id3 + w < 0 || in_w_id3 + w >= in_w)); + int in_w_val4 = select(in_w_base_id + in_w_id4 + w, + -1, + (in_w_id4 + w < 0 || in_w_id4 + w >= in_w)); + + filter[0] = + READ_IMG_TYPE(CL_DTYPE_CHAR, + filter_image, + sampler, + (int2)(filter_w_val + w, + filter_h_val0 + h)); // in_ch:0-3,out_ch:0 + filter[1] = + READ_IMG_TYPE(CL_DTYPE_CHAR, + filter_image, + sampler, + (int2)(filter_w_val + w, + 
filter_h_val1 + h)); // in_ch:0-3,out_ch:1 + filter[2] = + READ_IMG_TYPE(CL_DTYPE_CHAR, + filter_image, + sampler, + (int2)(filter_w_val + w, + filter_h_val2 + h)); // in_ch:0-3,out_ch:2 + filter[3] = + READ_IMG_TYPE(CL_DTYPE_CHAR, + filter_image, + sampler, + (int2)(filter_w_val + w, + filter_h_val3 + h)); // in_ch:0-3,out_ch:3 + + filter_trans[0] = (CL_DTYPE4)(filter[0].x, + filter[1].x, + filter[2].x, + filter[3].x); // in_ch:0,out_ch:0-3 + filter_trans[1] = (CL_DTYPE4)(filter[0].y, + filter[1].y, + filter[2].y, + filter[3].y); // in_ch:1,out_ch:0-3 + filter_trans[2] = (CL_DTYPE4)(filter[0].z, + filter[1].z, + filter[2].z, + filter[3].z); // in_ch:2,out_ch:0-3 + filter_trans[3] = (CL_DTYPE4)(filter[0].w, + filter[1].w, + filter[2].w, + filter[3].w); // in_ch:3,out_ch:0-3 + + input[0] = READ_IMG_TYPE( + CL_DTYPE_CHAR, input_image, sampler, (int2)(in_w_val0, in_h_val)); + input[1] = READ_IMG_TYPE( + CL_DTYPE_CHAR, input_image, sampler, (int2)(in_w_val1, in_h_val)); + input[2] = READ_IMG_TYPE( + CL_DTYPE_CHAR, input_image, sampler, (int2)(in_w_val2, in_h_val)); + input[3] = READ_IMG_TYPE( + CL_DTYPE_CHAR, input_image, sampler, (int2)(in_w_val3, in_h_val)); + input[4] = READ_IMG_TYPE( + CL_DTYPE_CHAR, input_image, sampler, (int2)(in_w_val4, in_h_val)); + + output[0] = mad(input[0].x, filter_trans[0], output[0]); + output[1] = mad(input[1].x, filter_trans[0], output[1]); + output[2] = mad(input[2].x, filter_trans[0], output[2]); + output[3] = mad(input[3].x, filter_trans[0], output[3]); + output[4] = mad(input[4].x, filter_trans[0], output[4]); + + if (ch_surplus < 3) { + output[0] = mad(input[0].y, filter_trans[1], output[0]); + output[1] = mad(input[1].y, filter_trans[1], output[1]); + output[2] = mad(input[2].y, filter_trans[1], output[2]); + output[3] = mad(input[3].y, filter_trans[1], output[3]); + output[4] = mad(input[4].y, filter_trans[1], output[4]); + } + if (ch_surplus < 2) { + output[0] = mad(input[0].z, filter_trans[2], output[0]); + output[1] = 
mad(input[1].z, filter_trans[2], output[1]); + output[2] = mad(input[2].z, filter_trans[2], output[2]); + output[3] = mad(input[3].z, filter_trans[2], output[3]); + output[4] = mad(input[4].z, filter_trans[2], output[4]); + } + if (ch_surplus < 1) { + output[0] = mad(input[0].w, filter_trans[3], output[0]); + output[1] = mad(input[1].w, filter_trans[3], output[1]); + output[2] = mad(input[2].w, filter_trans[3], output[2]); + output[3] = mad(input[3].w, filter_trans[3], output[3]); + output[4] = mad(input[4].w, filter_trans[3], output[4]); + } + } + } + } + + output[0] = activation_type4(output[0]); + output[1] = activation_type4(output[1]); + output[2] = activation_type4(output[2]); + output[3] = activation_type4(output[3]); + output[4] = activation_type4(output[4]); + + WRITE_IMG_TYPE(CL_DTYPE_CHAR, + output_image, + (int2)(out_w_base_id + out_w_id0, item_h_id), + output[0]); + if (out_w_id1 < out_w) { + WRITE_IMG_TYPE(CL_DTYPE_CHAR, + output_image, + (int2)(out_w_base_id + out_w_id1, item_h_id), + output[1]); + } + if (out_w_id2 < out_w) { + WRITE_IMG_TYPE(CL_DTYPE_CHAR, + output_image, + (int2)(out_w_base_id + out_w_id2, item_h_id), + output[2]); + } + if (out_w_id3 < out_w) { + WRITE_IMG_TYPE(CL_DTYPE_CHAR, + output_image, + (int2)(out_w_base_id + out_w_id3, item_h_id), + output[3]); + } + if (out_w_id4 < out_w) { + WRITE_IMG_TYPE(CL_DTYPE_CHAR, + output_image, + (int2)(out_w_base_id + out_w_id4, item_h_id), + output[4]); + } +} +// support batch > 1 +__kernel void conv2d_5x5_multi_batch(__private const int item_ch, + __private const int item_w, + __private const int item_h, + __read_only image2d_t input_image, + __read_only image2d_t filter_image, +#if defined(BIASE_CH) || defined(BIASE_ELE) + __read_only image2d_t bias, +#endif + __write_only image2d_t output_image, + __private const int stride, + __private const int pad, + __private const int dilation, + __private const int batch, + __private const int in_ch, + __private const int in_w, + __private const int 
in_h, + __private const int out_w, + __private const int out_h) { + + const sampler_t sampler = + CLK_NORMALIZED_COORDS_TRUE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST; + // filter + const int filter_w = 5; + const int filter_h = 5; + + // item_id + const int item_ch_id = get_global_id(0); + const int item_w_id = get_global_id(1); + const int item_h_id = get_global_id(2); + + // out_width_id_per_blk and out_batch_id + int out_batch_id = item_h_id / in_h; + int out_w_base_id = item_ch_id * out_w; + int out_w_id0 = item_w_id; + int out_w_id1 = out_w_id0 + item_w; + int out_w_id2 = out_w_id1 + item_w; + int out_w_id3 = out_w_id2 + item_w; + int out_w_id4 = out_w_id3 + item_w; + + // in_width_id_per_blk and in_height_id_per_batch + int in_h_id = (item_h_id % out_h) * stride - pad; + int in_w_id0 = item_w_id * stride - pad; + int in_w_id1 = in_w_id0 + item_w * stride; + int in_w_id2 = in_w_id1 + item_w * stride; + int in_w_id3 = in_w_id2 + item_w * stride; + int in_w_id4 = in_w_id3 + item_w * stride; + +#ifdef BIASE_CH + + CL_DTYPE4 output[5]; + output[0] = + READ_IMG_TYPE(CL_DTYPE_CHAR, bias, sampler, (int2)(item_ch_id, 0)); + output[1] = output[0]; + output[2] = output[0]; + output[3] = output[0]; + output[4] = output[0]; + +#elif defined(BIASE_ELE) + + CL_DTYPE4 output[5]; + output[0] = READ_IMG_TYPE(CL_DTYPE_CHAR, + bias, + sampler, + (int2)(out_w_base_id + out_w_id0, item_h_id)); + if (out_w_id1 < out_w) { + output[1] = READ_IMG_TYPE(CL_DTYPE_CHAR, + bias, + sampler, + (int2)(out_w_base_id + out_w_id1, item_h_id)); + } + if (out_w_id2 < out_w) { + output[2] = READ_IMG_TYPE(CL_DTYPE_CHAR, + bias, + sampler, + (int2)(out_w_base_id + out_w_id2, item_h_id)); + } + if (out_w_id3 < out_w) { + output[3] = READ_IMG_TYPE(CL_DTYPE_CHAR, + bias, + sampler, + (int2)(out_w_base_id + out_w_id3, item_h_id)); + } + if (out_w_id4 < out_w) { + output[4] = READ_IMG_TYPE(CL_DTYPE_CHAR, + bias, + sampler, + (int2)(out_w_base_id + out_w_id4, item_h_id)); + } +#else + CL_DTYPE4 output[5] = 
{0.0f}; +#endif + + CL_DTYPE4 filter[4] = {0.0f}; + CL_DTYPE4 filter_trans[4] = {0.0f}; + CL_DTYPE4 input[5] = {0.0f}; + + int filter_h_val0 = item_ch_id * 4 * filter_h; + int filter_h_val1 = filter_h_val0 + filter_h; + int filter_h_val2 = filter_h_val1 + filter_h; + int filter_h_val3 = filter_h_val2 + filter_h; + + for (int ch = 0; ch < (in_ch + 3) / 4; ch++) { + int ch_surplus = (ch + 1) * 4 - in_ch > 0 ? (ch + 1) * 4 - in_ch : 0; + + const int in_w_base_id = mul24(ch, in_w); + + int filter_w_val = ch * filter_w; + + for (int h = 0; h < filter_h; h++) { + int in_h_val = select( + out_batch_id * in_h + in_h_id + h, + -1, + (out_batch_id * in_h + in_h_id + h < out_batch_id * in_h || + out_batch_id * in_h + in_h_id + h >= (out_batch_id + 1) * in_h)); + + for (int w = 0; w < filter_w; w++) { + int in_w_val0 = select(in_w_base_id + in_w_id0 + w, + -1, + (in_w_id0 + w < 0 || in_w_id0 + w >= in_w)); + int in_w_val1 = select(in_w_base_id + in_w_id1 + w, + -1, + (in_w_id1 + w < 0 || in_w_id1 + w >= in_w)); + int in_w_val2 = select(in_w_base_id + in_w_id2 + w, + -1, + (in_w_id2 + w < 0 || in_w_id2 + w >= in_w)); + int in_w_val3 = select(in_w_base_id + in_w_id3 + w, + -1, + (in_w_id3 + w < 0 || in_w_id3 + w >= in_w)); + int in_w_val4 = select(in_w_base_id + in_w_id4 + w, + -1, + (in_w_id4 + w < 0 || in_w_id4 + w >= in_w)); + + filter[0] = + READ_IMG_TYPE(CL_DTYPE_CHAR, + filter_image, + sampler, + (int2)(filter_w_val + w, + filter_h_val0 + h)); // in_ch:0-3,out_ch:0 + filter[1] = + READ_IMG_TYPE(CL_DTYPE_CHAR, + filter_image, + sampler, + (int2)(filter_w_val + w, + filter_h_val1 + h)); // in_ch:0-3,out_ch:1 + filter[2] = + READ_IMG_TYPE(CL_DTYPE_CHAR, + filter_image, + sampler, + (int2)(filter_w_val + w, + filter_h_val2 + h)); // in_ch:0-3,out_ch:2 + filter[3] = + READ_IMG_TYPE(CL_DTYPE_CHAR, + filter_image, + sampler, + (int2)(filter_w_val + w, + filter_h_val3 + h)); // in_ch:0-3,out_ch:3 + + filter_trans[0] = (CL_DTYPE4)(filter[0].x, + filter[1].x, + filter[2].x, + 
filter[3].x); // in_ch:0,out_ch:0-3 + filter_trans[1] = (CL_DTYPE4)(filter[0].y, + filter[1].y, + filter[2].y, + filter[3].y); // in_ch:1,out_ch:0-3 + filter_trans[2] = (CL_DTYPE4)(filter[0].z, + filter[1].z, + filter[2].z, + filter[3].z); // in_ch:2,out_ch:0-3 + filter_trans[3] = (CL_DTYPE4)(filter[0].w, + filter[1].w, + filter[2].w, + filter[3].w); // in_ch:3,out_ch:0-3 + + input[0] = READ_IMG_TYPE( + CL_DTYPE_CHAR, input_image, sampler, (int2)(in_w_val0, in_h_val)); + input[1] = READ_IMG_TYPE( + CL_DTYPE_CHAR, input_image, sampler, (int2)(in_w_val1, in_h_val)); + input[2] = READ_IMG_TYPE( + CL_DTYPE_CHAR, input_image, sampler, (int2)(in_w_val2, in_h_val)); + input[3] = READ_IMG_TYPE( + CL_DTYPE_CHAR, input_image, sampler, (int2)(in_w_val3, in_h_val)); + input[4] = READ_IMG_TYPE( + CL_DTYPE_CHAR, input_image, sampler, (int2)(in_w_val4, in_h_val)); + + output[0] = mad(input[0].x, filter_trans[0], output[0]); + output[1] = mad(input[1].x, filter_trans[0], output[1]); + output[2] = mad(input[2].x, filter_trans[0], output[2]); + output[3] = mad(input[3].x, filter_trans[0], output[3]); + output[4] = mad(input[4].x, filter_trans[0], output[4]); + + if (ch_surplus < 3) { + output[0] = mad(input[0].y, filter_trans[1], output[0]); + output[1] = mad(input[1].y, filter_trans[1], output[1]); + output[2] = mad(input[2].y, filter_trans[1], output[2]); + output[3] = mad(input[3].y, filter_trans[1], output[3]); + output[4] = mad(input[4].y, filter_trans[1], output[4]); + } + if (ch_surplus < 2) { + output[0] = mad(input[0].z, filter_trans[2], output[0]); + output[1] = mad(input[1].z, filter_trans[2], output[1]); + output[2] = mad(input[2].z, filter_trans[2], output[2]); + output[3] = mad(input[3].z, filter_trans[2], output[3]); + output[4] = mad(input[4].z, filter_trans[2], output[4]); + } + if (ch_surplus < 1) { + output[0] = mad(input[0].w, filter_trans[3], output[0]); + output[1] = mad(input[1].w, filter_trans[3], output[1]); + output[2] = mad(input[2].w, filter_trans[3], 
output[2]); + output[3] = mad(input[3].w, filter_trans[3], output[3]); + output[4] = mad(input[4].w, filter_trans[3], output[4]); + } + } + } + } + + output[0] = activation_type4(output[0]); + output[1] = activation_type4(output[1]); + output[2] = activation_type4(output[2]); + output[3] = activation_type4(output[3]); + output[4] = activation_type4(output[4]); + + WRITE_IMG_TYPE(CL_DTYPE_CHAR, + output_image, + (int2)(out_w_base_id + out_w_id0, item_h_id), + output[0]); + if (out_w_id1 < out_w) { + WRITE_IMG_TYPE(CL_DTYPE_CHAR, + output_image, + (int2)(out_w_base_id + out_w_id1, item_h_id), + output[1]); + } + if (out_w_id2 < out_w) { + WRITE_IMG_TYPE(CL_DTYPE_CHAR, + output_image, + (int2)(out_w_base_id + out_w_id2, item_h_id), + output[2]); + } + if (out_w_id3 < out_w) { + WRITE_IMG_TYPE(CL_DTYPE_CHAR, + output_image, + (int2)(out_w_base_id + out_w_id3, item_h_id), + output[3]); + } + if (out_w_id4 < out_w) { + WRITE_IMG_TYPE(CL_DTYPE_CHAR, + output_image, + (int2)(out_w_base_id + out_w_id4, item_h_id), + output[4]); + } +} \ No newline at end of file diff --git a/lite/backends/opencl/cl_kernel/image/conv2d_7x7_opt_kernel.cl b/lite/backends/opencl/cl_kernel/image/conv2d_7x7_opt_kernel.cl new file mode 100644 index 0000000000000000000000000000000000000000..d82f4b4c96b586b6ecf948827402afd0766dcea4 --- /dev/null +++ b/lite/backends/opencl/cl_kernel/image/conv2d_7x7_opt_kernel.cl @@ -0,0 +1,516 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+See the License for the specific language governing permissions and +limitations under the License. */ + +#include + +// opt version of con7x7 +__kernel void conv2d_7x7_opt(__private const int item_ch, + __private const int item_w, + __private const int item_h, + __read_only image2d_t input_image, + __read_only image2d_t filter_image, +#if defined(BIASE_CH) || defined(BIASE_ELE) + __read_only image2d_t bias, +#endif + __write_only image2d_t output_image, + __private const int stride, + __private const int pad, + __private const int dilation, + __private const int batch, + __private const int in_ch, + __private const int in_w, + __private const int in_h, + __private const int out_w, + __private const int out_h) { + + const sampler_t sampler = + CLK_NORMALIZED_COORDS_TRUE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST; + // filter + const int filter_w = 7; + const int filter_h = 7; + + // item_id + const int item_ch_id = get_global_id(0); + const int item_w_id = get_global_id(1); + const int item_h_id = get_global_id(2); + + // out_width_id_per_blk and out_batch_id + int out_w_base_id = item_ch_id * out_w; + int out_w_id0 = item_w_id; + int out_w_id1 = out_w_id0 + item_w; + int out_w_id2 = out_w_id1 + item_w; + int out_w_id3 = out_w_id2 + item_w; + int out_w_id4 = out_w_id3 + item_w; + + // in_width_id_per_blk and in_height_id_per_batch + int in_h_id = (item_h_id % out_h) * stride - pad; + int in_w_id0 = item_w_id * stride - pad; + int in_w_id1 = in_w_id0 + item_w * stride; + int in_w_id2 = in_w_id1 + item_w * stride; + int in_w_id3 = in_w_id2 + item_w * stride; + int in_w_id4 = in_w_id3 + item_w * stride; + +#ifdef BIASE_CH + + CL_DTYPE4 output[5]; + output[0] = + READ_IMG_TYPE(CL_DTYPE_CHAR, bias, sampler, (int2)(item_ch_id, 0)); + output[1] = output[0]; + output[2] = output[0]; + output[3] = output[0]; + output[4] = output[0]; + +#elif defined(BIASE_ELE) + + CL_DTYPE4 output[5]; + output[0] = READ_IMG_TYPE(CL_DTYPE_CHAR, + bias, + sampler, + (int2)(out_w_base_id + 
out_w_id0, item_h_id)); + if (out_w_id1 < out_w) { + output[1] = READ_IMG_TYPE(CL_DTYPE_CHAR, + bias, + sampler, + (int2)(out_w_base_id + out_w_id1, item_h_id)); + } + if (out_w_id2 < out_w) { + output[2] = READ_IMG_TYPE(CL_DTYPE_CHAR, + bias, + sampler, + (int2)(out_w_base_id + out_w_id2, item_h_id)); + } + if (out_w_id3 < out_w) { + output[3] = READ_IMG_TYPE(CL_DTYPE_CHAR, + bias, + sampler, + (int2)(out_w_base_id + out_w_id3, item_h_id)); + } + if (out_w_id4 < out_w) { + output[4] = READ_IMG_TYPE(CL_DTYPE_CHAR, + bias, + sampler, + (int2)(out_w_base_id + out_w_id4, item_h_id)); + } +#else + CL_DTYPE4 output[5] = {0.0f}; +#endif + + CL_DTYPE4 filter[4] = {0.0f}; + CL_DTYPE4 filter_trans[4] = {0.0f}; + CL_DTYPE4 input[5] = {0.0f}; + + int filter_h_val0 = item_ch_id * 4 * filter_h; + int filter_h_val1 = filter_h_val0 + filter_h; + int filter_h_val2 = filter_h_val1 + filter_h; + int filter_h_val3 = filter_h_val2 + filter_h; + + for (int ch = 0; ch < (in_ch + 3) / 4; ch++) { + int ch_surplus = (ch + 1) * 4 - in_ch > 0 ? 
(ch + 1) * 4 - in_ch : 0; + + const int in_w_base_id = mul24(ch, in_w); + + int filter_w_val = ch * filter_w; + + for (int h = 0; h < filter_h; h++) { + int in_h_val = + select(in_h_id + h, -1, (in_h_id + h < 0 || in_h_id + h >= in_h)); + + for (int w = 0; w < filter_w; w++) { + int in_w_val0 = select(in_w_base_id + in_w_id0 + w, + -1, + (in_w_id0 + w < 0 || in_w_id0 + w >= in_w)); + int in_w_val1 = select(in_w_base_id + in_w_id1 + w, + -1, + (in_w_id1 + w < 0 || in_w_id1 + w >= in_w)); + int in_w_val2 = select(in_w_base_id + in_w_id2 + w, + -1, + (in_w_id2 + w < 0 || in_w_id2 + w >= in_w)); + int in_w_val3 = select(in_w_base_id + in_w_id3 + w, + -1, + (in_w_id3 + w < 0 || in_w_id3 + w >= in_w)); + int in_w_val4 = select(in_w_base_id + in_w_id4 + w, + -1, + (in_w_id4 + w < 0 || in_w_id4 + w >= in_w)); + + filter[0] = + READ_IMG_TYPE(CL_DTYPE_CHAR, + filter_image, + sampler, + (int2)(filter_w_val + w, + filter_h_val0 + h)); // in_ch:0-3,out_ch:0 + filter[1] = + READ_IMG_TYPE(CL_DTYPE_CHAR, + filter_image, + sampler, + (int2)(filter_w_val + w, + filter_h_val1 + h)); // in_ch:0-3,out_ch:1 + filter[2] = + READ_IMG_TYPE(CL_DTYPE_CHAR, + filter_image, + sampler, + (int2)(filter_w_val + w, + filter_h_val2 + h)); // in_ch:0-3,out_ch:2 + filter[3] = + READ_IMG_TYPE(CL_DTYPE_CHAR, + filter_image, + sampler, + (int2)(filter_w_val + w, + filter_h_val3 + h)); // in_ch:0-3,out_ch:3 + + filter_trans[0] = (CL_DTYPE4)(filter[0].x, + filter[1].x, + filter[2].x, + filter[3].x); // in_ch:0,out_ch:0-3 + filter_trans[1] = (CL_DTYPE4)(filter[0].y, + filter[1].y, + filter[2].y, + filter[3].y); // in_ch:1,out_ch:0-3 + filter_trans[2] = (CL_DTYPE4)(filter[0].z, + filter[1].z, + filter[2].z, + filter[3].z); // in_ch:2,out_ch:0-3 + filter_trans[3] = (CL_DTYPE4)(filter[0].w, + filter[1].w, + filter[2].w, + filter[3].w); // in_ch:3,out_ch:0-3 + + input[0] = READ_IMG_TYPE( + CL_DTYPE_CHAR, input_image, sampler, (int2)(in_w_val0, in_h_val)); + input[1] = READ_IMG_TYPE( + CL_DTYPE_CHAR, 
input_image, sampler, (int2)(in_w_val1, in_h_val)); + input[2] = READ_IMG_TYPE( + CL_DTYPE_CHAR, input_image, sampler, (int2)(in_w_val2, in_h_val)); + input[3] = READ_IMG_TYPE( + CL_DTYPE_CHAR, input_image, sampler, (int2)(in_w_val3, in_h_val)); + input[4] = READ_IMG_TYPE( + CL_DTYPE_CHAR, input_image, sampler, (int2)(in_w_val4, in_h_val)); + + output[0] = mad(input[0].x, filter_trans[0], output[0]); + output[1] = mad(input[1].x, filter_trans[0], output[1]); + output[2] = mad(input[2].x, filter_trans[0], output[2]); + output[3] = mad(input[3].x, filter_trans[0], output[3]); + output[4] = mad(input[4].x, filter_trans[0], output[4]); + + if (ch_surplus < 3) { + output[0] = mad(input[0].y, filter_trans[1], output[0]); + output[1] = mad(input[1].y, filter_trans[1], output[1]); + output[2] = mad(input[2].y, filter_trans[1], output[2]); + output[3] = mad(input[3].y, filter_trans[1], output[3]); + output[4] = mad(input[4].y, filter_trans[1], output[4]); + } + if (ch_surplus < 2) { + output[0] = mad(input[0].z, filter_trans[2], output[0]); + output[1] = mad(input[1].z, filter_trans[2], output[1]); + output[2] = mad(input[2].z, filter_trans[2], output[2]); + output[3] = mad(input[3].z, filter_trans[2], output[3]); + output[4] = mad(input[4].z, filter_trans[2], output[4]); + } + if (ch_surplus < 1) { + output[0] = mad(input[0].w, filter_trans[3], output[0]); + output[1] = mad(input[1].w, filter_trans[3], output[1]); + output[2] = mad(input[2].w, filter_trans[3], output[2]); + output[3] = mad(input[3].w, filter_trans[3], output[3]); + output[4] = mad(input[4].w, filter_trans[3], output[4]); + } + } + } + } + + output[0] = activation_type4(output[0]); + output[1] = activation_type4(output[1]); + output[2] = activation_type4(output[2]); + output[3] = activation_type4(output[3]); + output[4] = activation_type4(output[4]); + + WRITE_IMG_TYPE(CL_DTYPE_CHAR, + output_image, + (int2)(out_w_base_id + out_w_id0, item_h_id), + output[0]); + if (out_w_id1 < out_w) { + 
WRITE_IMG_TYPE(CL_DTYPE_CHAR, + output_image, + (int2)(out_w_base_id + out_w_id1, item_h_id), + output[1]); + } + if (out_w_id2 < out_w) { + WRITE_IMG_TYPE(CL_DTYPE_CHAR, + output_image, + (int2)(out_w_base_id + out_w_id2, item_h_id), + output[2]); + } + if (out_w_id3 < out_w) { + WRITE_IMG_TYPE(CL_DTYPE_CHAR, + output_image, + (int2)(out_w_base_id + out_w_id3, item_h_id), + output[3]); + } + if (out_w_id4 < out_w) { + WRITE_IMG_TYPE(CL_DTYPE_CHAR, + output_image, + (int2)(out_w_base_id + out_w_id4, item_h_id), + output[4]); + } +} +// support batch > 1 +__kernel void conv2d_7x7_multi_batch(__private const int item_ch, + __private const int item_w, + __private const int item_h, + __read_only image2d_t input_image, + __read_only image2d_t filter_image, +#if defined(BIASE_CH) || defined(BIASE_ELE) + __read_only image2d_t bias, +#endif + __write_only image2d_t output_image, + __private const int stride, + __private const int pad, + __private const int dilation, + __private const int batch, + __private const int in_ch, + __private const int in_w, + __private const int in_h, + __private const int out_w, + __private const int out_h) { + + const sampler_t sampler = + CLK_NORMALIZED_COORDS_TRUE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST; + // filter + const int filter_w = 7; + const int filter_h = 7; + + // item_id + const int item_ch_id = get_global_id(0); + const int item_w_id = get_global_id(1); + const int item_h_id = get_global_id(2); + + // out_width_id_per_blk and out_batch_id + int out_batch_id = item_h_id / in_h; + int out_w_base_id = item_ch_id * out_w; + int out_w_id0 = item_w_id; + int out_w_id1 = out_w_id0 + item_w; + int out_w_id2 = out_w_id1 + item_w; + int out_w_id3 = out_w_id2 + item_w; + int out_w_id4 = out_w_id3 + item_w; + + // in_width_id_per_blk and in_height_id_per_batch + int in_h_id = (item_h_id % out_h) * stride - pad; + int in_w_id0 = item_w_id * stride - pad; + int in_w_id1 = in_w_id0 + item_w * stride; + int in_w_id2 = in_w_id1 + item_w * stride; 
+ int in_w_id3 = in_w_id2 + item_w * stride; + int in_w_id4 = in_w_id3 + item_w * stride; + +#ifdef BIASE_CH + + CL_DTYPE4 output[5]; + output[0] = + READ_IMG_TYPE(CL_DTYPE_CHAR, bias, sampler, (int2)(item_ch_id, 0)); + output[1] = output[0]; + output[2] = output[0]; + output[3] = output[0]; + output[4] = output[0]; + +#elif defined(BIASE_ELE) + + CL_DTYPE4 output[5]; + output[0] = READ_IMG_TYPE(CL_DTYPE_CHAR, + bias, + sampler, + (int2)(out_w_base_id + out_w_id0, item_h_id)); + if (out_w_id1 < out_w) { + output[1] = READ_IMG_TYPE(CL_DTYPE_CHAR, + bias, + sampler, + (int2)(out_w_base_id + out_w_id1, item_h_id)); + } + if (out_w_id2 < out_w) { + output[2] = READ_IMG_TYPE(CL_DTYPE_CHAR, + bias, + sampler, + (int2)(out_w_base_id + out_w_id2, item_h_id)); + } + if (out_w_id3 < out_w) { + output[3] = READ_IMG_TYPE(CL_DTYPE_CHAR, + bias, + sampler, + (int2)(out_w_base_id + out_w_id3, item_h_id)); + } + if (out_w_id4 < out_w) { + output[4] = READ_IMG_TYPE(CL_DTYPE_CHAR, + bias, + sampler, + (int2)(out_w_base_id + out_w_id4, item_h_id)); + } +#else + CL_DTYPE4 output[5] = {0.0f}; +#endif + + CL_DTYPE4 filter[4] = {0.0f}; + CL_DTYPE4 filter_trans[4] = {0.0f}; + CL_DTYPE4 input[5] = {0.0f}; + + int filter_h_val0 = item_ch_id * 4 * filter_h; + int filter_h_val1 = filter_h_val0 + filter_h; + int filter_h_val2 = filter_h_val1 + filter_h; + int filter_h_val3 = filter_h_val2 + filter_h; + + for (int ch = 0; ch < (in_ch + 3) / 4; ch++) { + int ch_surplus = (ch + 1) * 4 - in_ch > 0 ? 
(ch + 1) * 4 - in_ch : 0; + + const int in_w_base_id = mul24(ch, in_w); + + int filter_w_val = ch * filter_w; + + for (int h = 0; h < filter_h; h++) { + int in_h_val = select( + out_batch_id * in_h + in_h_id + h, + -1, + (out_batch_id * in_h + in_h_id + h < out_batch_id * in_h || + out_batch_id * in_h + in_h_id + h >= (out_batch_id + 1) * in_h)); + + for (int w = 0; w < filter_w; w++) { + int in_w_val0 = select(in_w_base_id + in_w_id0 + w, + -1, + (in_w_id0 + w < 0 || in_w_id0 + w >= in_w)); + int in_w_val1 = select(in_w_base_id + in_w_id1 + w, + -1, + (in_w_id1 + w < 0 || in_w_id1 + w >= in_w)); + int in_w_val2 = select(in_w_base_id + in_w_id2 + w, + -1, + (in_w_id2 + w < 0 || in_w_id2 + w >= in_w)); + int in_w_val3 = select(in_w_base_id + in_w_id3 + w, + -1, + (in_w_id3 + w < 0 || in_w_id3 + w >= in_w)); + int in_w_val4 = select(in_w_base_id + in_w_id4 + w, + -1, + (in_w_id4 + w < 0 || in_w_id4 + w >= in_w)); + + filter[0] = + READ_IMG_TYPE(CL_DTYPE_CHAR, + filter_image, + sampler, + (int2)(filter_w_val + w, + filter_h_val0 + h)); // in_ch:0-3,out_ch:0 + filter[1] = + READ_IMG_TYPE(CL_DTYPE_CHAR, + filter_image, + sampler, + (int2)(filter_w_val + w, + filter_h_val1 + h)); // in_ch:0-3,out_ch:1 + filter[2] = + READ_IMG_TYPE(CL_DTYPE_CHAR, + filter_image, + sampler, + (int2)(filter_w_val + w, + filter_h_val2 + h)); // in_ch:0-3,out_ch:2 + filter[3] = + READ_IMG_TYPE(CL_DTYPE_CHAR, + filter_image, + sampler, + (int2)(filter_w_val + w, + filter_h_val3 + h)); // in_ch:0-3,out_ch:3 + + filter_trans[0] = (CL_DTYPE4)(filter[0].x, + filter[1].x, + filter[2].x, + filter[3].x); // in_ch:0,out_ch:0-3 + filter_trans[1] = (CL_DTYPE4)(filter[0].y, + filter[1].y, + filter[2].y, + filter[3].y); // in_ch:1,out_ch:0-3 + filter_trans[2] = (CL_DTYPE4)(filter[0].z, + filter[1].z, + filter[2].z, + filter[3].z); // in_ch:2,out_ch:0-3 + filter_trans[3] = (CL_DTYPE4)(filter[0].w, + filter[1].w, + filter[2].w, + filter[3].w); // in_ch:3,out_ch:0-3 + + input[0] = READ_IMG_TYPE( + 
CL_DTYPE_CHAR, input_image, sampler, (int2)(in_w_val0, in_h_val)); + input[1] = READ_IMG_TYPE( + CL_DTYPE_CHAR, input_image, sampler, (int2)(in_w_val1, in_h_val)); + input[2] = READ_IMG_TYPE( + CL_DTYPE_CHAR, input_image, sampler, (int2)(in_w_val2, in_h_val)); + input[3] = READ_IMG_TYPE( + CL_DTYPE_CHAR, input_image, sampler, (int2)(in_w_val3, in_h_val)); + input[4] = READ_IMG_TYPE( + CL_DTYPE_CHAR, input_image, sampler, (int2)(in_w_val4, in_h_val)); + + output[0] = mad(input[0].x, filter_trans[0], output[0]); + output[1] = mad(input[1].x, filter_trans[0], output[1]); + output[2] = mad(input[2].x, filter_trans[0], output[2]); + output[3] = mad(input[3].x, filter_trans[0], output[3]); + output[4] = mad(input[4].x, filter_trans[0], output[4]); + + if (ch_surplus < 3) { + output[0] = mad(input[0].y, filter_trans[1], output[0]); + output[1] = mad(input[1].y, filter_trans[1], output[1]); + output[2] = mad(input[2].y, filter_trans[1], output[2]); + output[3] = mad(input[3].y, filter_trans[1], output[3]); + output[4] = mad(input[4].y, filter_trans[1], output[4]); + } + if (ch_surplus < 2) { + output[0] = mad(input[0].z, filter_trans[2], output[0]); + output[1] = mad(input[1].z, filter_trans[2], output[1]); + output[2] = mad(input[2].z, filter_trans[2], output[2]); + output[3] = mad(input[3].z, filter_trans[2], output[3]); + output[4] = mad(input[4].z, filter_trans[2], output[4]); + } + if (ch_surplus < 1) { + output[0] = mad(input[0].w, filter_trans[3], output[0]); + output[1] = mad(input[1].w, filter_trans[3], output[1]); + output[2] = mad(input[2].w, filter_trans[3], output[2]); + output[3] = mad(input[3].w, filter_trans[3], output[3]); + output[4] = mad(input[4].w, filter_trans[3], output[4]); + } + } + } + } + + output[0] = activation_type4(output[0]); + output[1] = activation_type4(output[1]); + output[2] = activation_type4(output[2]); + output[3] = activation_type4(output[3]); + output[4] = activation_type4(output[4]); + + WRITE_IMG_TYPE(CL_DTYPE_CHAR, + 
output_image, + (int2)(out_w_base_id + out_w_id0, item_h_id), + output[0]); + if (out_w_id1 < out_w) { + WRITE_IMG_TYPE(CL_DTYPE_CHAR, + output_image, + (int2)(out_w_base_id + out_w_id1, item_h_id), + output[1]); + } + if (out_w_id2 < out_w) { + WRITE_IMG_TYPE(CL_DTYPE_CHAR, + output_image, + (int2)(out_w_base_id + out_w_id2, item_h_id), + output[2]); + } + if (out_w_id3 < out_w) { + WRITE_IMG_TYPE(CL_DTYPE_CHAR, + output_image, + (int2)(out_w_base_id + out_w_id3, item_h_id), + output[3]); + } + if (out_w_id4 < out_w) { + WRITE_IMG_TYPE(CL_DTYPE_CHAR, + output_image, + (int2)(out_w_base_id + out_w_id4, item_h_id), + output[4]); + } +} \ No newline at end of file diff --git a/lite/backends/x86/jit/gen/blas.h b/lite/backends/x86/jit/gen/blas.h index 39920195b245e1c44ff68ab91af94d25c949bd02..4317d558c6252e9163bc545cba4859fbcb89f804 100644 --- a/lite/backends/x86/jit/gen/blas.h +++ b/lite/backends/x86/jit/gen/blas.h @@ -17,6 +17,7 @@ #include #include "glog/logging.h" #include "lite/backends/x86/jit/gen/jitcode.h" +#include "lite/utils/string.h" namespace paddle { namespace lite { @@ -64,7 +65,7 @@ class VXXJitCode : public JitCode { base += "_Vec"; } base += (with_relu_ ? 
"_Relu" : ""); - base += "_D" + std::to_string(num_); + base += "_D" + paddle::lite::to_string(num_); return base; } void genCode() override; diff --git a/lite/backends/x86/jit/gen/embseqpool.h b/lite/backends/x86/jit/gen/embseqpool.h index 7cae76f9dd99cf904e831b196bd493623ff7eb1d..999960ece4170d561419ad24bd94c512ce167eb0 100644 --- a/lite/backends/x86/jit/gen/embseqpool.h +++ b/lite/backends/x86/jit/gen/embseqpool.h @@ -47,7 +47,7 @@ class EmbSeqPoolJitCode : public JitCode { } else if (type_ == SeqPoolType::kSqrt) { base += "_Sqrt"; } - base += ("_W" + std::to_string(tbl_w_)); + base += ("_W" + paddle::lite::to_string(tbl_w_)); return base; } void genCode() override; diff --git a/lite/backends/x86/jit/gen/matmul.h b/lite/backends/x86/jit/gen/matmul.h index b1b302b7904a5d92952f4385c483eccdc5df3592..e7be6750cf0d232b41d3be61001eb0af4c52a129 100644 --- a/lite/backends/x86/jit/gen/matmul.h +++ b/lite/backends/x86/jit/gen/matmul.h @@ -38,8 +38,8 @@ class MatMulJitCode : public JitCode { std::string name() const override { std::string base = "MatMulJitCode"; - base = base + "_M" + std::to_string(m_) + "_N" + std::to_string(n_) + "_K" + - std::to_string(k_); + base = base + "_M" + paddle::lite::to_string(m_) + "_N" + + paddle::lite::to_string(n_) + "_K" + paddle::lite::to_string(k_); return base; } void genCode() override; diff --git a/lite/backends/x86/jit/gen/seqpool.h b/lite/backends/x86/jit/gen/seqpool.h index 346179cfbbd0e8291dc17b266366c5df07114b7f..60e27993057b58eb8a4a07fcd0a368fc0a9441fc 100644 --- a/lite/backends/x86/jit/gen/seqpool.h +++ b/lite/backends/x86/jit/gen/seqpool.h @@ -47,7 +47,7 @@ class SeqPoolJitCode : public JitCode { } else if (type_ == SeqPoolType::kSqrt) { base += "_Sqrt"; } - base += ("_W" + std::to_string(w_)); + base += ("_W" + paddle::lite::to_string(w_)); return base; } void genCode() override; diff --git a/lite/core/CMakeLists.txt b/lite/core/CMakeLists.txt index 
e3ee1056d6c0816463bb21e95ea38101fd5d27ba..35aad501070282b49cdd8df72185ad9d21dab9fe 100644 --- a/lite/core/CMakeLists.txt +++ b/lite/core/CMakeLists.txt @@ -94,9 +94,13 @@ add_custom_command( OUTPUT ops.h # not a real path to the output to force it execute every time. ) # generate fake kernels for memory_optimize_tool + +#-------------------------------opt---------------------------------------------------------------- +# tricks to create headfiles for opt add_custom_command( COMMAND python ${CMAKE_SOURCE_DIR}/lite/tools/cmake_tools/create_fake_kernel_registry.py ${kernels_src_list} + ${fake_kernels_src_list} ${CMAKE_BINARY_DIR}/all_kernel_faked.cc ${CMAKE_BINARY_DIR}/kernel_src_map.h OUTPUT all_kernel_faked.cc # not a real path to the output to force it execute every time. @@ -104,12 +108,12 @@ add_custom_command( add_custom_target(op_list_h DEPENDS ops.h) add_custom_target(kernel_list_h DEPENDS kernels.h) add_custom_target(all_kernel_faked_cc DEPENDS all_kernel_faked.cc) -#add_custom_target(opencl_kernels_source_cc DEPENDS opencl_kernels_source.cc) # create headfile to restore ops info sorted by suppported platforms add_custom_command( COMMAND python ${CMAKE_SOURCE_DIR}/lite/tools/cmake_tools/record_supported_kernel_op.py ${kernels_src_list} + ${fake_kernels_src_list} ${ops_src_list} ${CMAKE_BINARY_DIR}/supported_kernel_op_info.h OUTPUT supported_kernel_op_info.h # not a real path to the output to force it execute every time. 
diff --git a/lite/core/context.h b/lite/core/context.h index 5f711a51434e90d27ca206724bc5b37593e6f70e..cdab4e473bf44c1b5b4ec6c0715ce44074ac63cf 100644 --- a/lite/core/context.h +++ b/lite/core/context.h @@ -490,7 +490,7 @@ class ContextScheduler { } break; #endif default: -#ifndef LITE_ON_MODEL_OPTIMIZE_TOOL +#if (!defined LITE_ON_MODEL_OPTIMIZE_TOOL) && (!defined LITE_WITH_PYTHON) LOG(FATAL) << "unsupported target " << TargetToStr(target); #endif break; diff --git a/lite/core/mir/graph_visualize_pass.cc b/lite/core/mir/graph_visualize_pass.cc index d3e7a625a7a768936db178b3c325e6ac84a0057e..28ec814fa85451b5292bfde6bddc6b64b57b2f08 100644 --- a/lite/core/mir/graph_visualize_pass.cc +++ b/lite/core/mir/graph_visualize_pass.cc @@ -48,13 +48,16 @@ std::string Visualize(mir::SSAGraph* graph) { auto attr_type = op_info->GetAttrType(attr_name); switch (attr_type) { case AttrType::INT: - os << ":int:" << std::to_string(op_info->GetAttr(attr_name)); + os << ":int:" + << paddle::lite::to_string(op_info->GetAttr(attr_name)); break; case AttrType::FLOAT: - os << ":float:" << std::to_string(op_info->GetAttr(attr_name)); + os << ":float:" + << paddle::lite::to_string(op_info->GetAttr(attr_name)); break; case AttrType::BOOLEAN: - os << ":int:" << std::to_string(op_info->GetAttr(attr_name)); + os << ":int:" + << paddle::lite::to_string(op_info->GetAttr(attr_name)); break; case AttrType::STRING: os << ":string: \"" diff --git a/lite/core/mir/memory_optimize_pass.cc b/lite/core/mir/memory_optimize_pass.cc index ee78fac9a88aa339514778dcc03e2c907487fb39..38293ede76ed35bf05767ce1333947b7dfdbc4ac 100644 --- a/lite/core/mir/memory_optimize_pass.cc +++ b/lite/core/mir/memory_optimize_pass.cc @@ -123,7 +123,8 @@ void MemoryOptimizePass::CollectLifeCycleByDevice( // non-tensor(like tensor_array) variables will not be reused for (auto& node : graph->nodes()) { - if (node.IsArg() && !node.arg()->type->IsTensor()) { + if (node.IsArg() && (node.arg()->type != nullptr) && + 
!node.arg()->type->IsTensor()) { invalid_var_names.insert(node.arg()->name); } } @@ -237,7 +238,7 @@ void MemoryOptimizePass::PerformReusePlan( if (reuse_table.count(name) && reuse_table.at(name) != name) { auto replace_name = reuse_table.at(name); input_node->AsArg().name = - replace_name + "(" + std::to_string(node_append_idx) + ")"; + replace_name + "(" + paddle::lite::to_string(node_append_idx) + ")"; node_append_idx++; } } @@ -261,7 +262,7 @@ void MemoryOptimizePass::PerformReusePlan( if (reuse_table.count(name) && reuse_table.at(name) != name) { auto replace_name = reuse_table.at(name); out_node->AsArg().name = - replace_name + "(" + std::to_string(node_append_idx) + ")"; + replace_name + "(" + paddle::lite::to_string(node_append_idx) + ")"; node_append_idx++; } } diff --git a/lite/core/mir/node.h b/lite/core/mir/node.h index e7c44d2be689a9d890158c097e198314413d1ba3..45b15812fadb0789edea3f89fb00b4612bdb010f 100644 --- a/lite/core/mir/node.h +++ b/lite/core/mir/node.h @@ -85,7 +85,7 @@ class Node { struct Arg { std::string name; int id{0}; - const Type* type{}; + const Type* type{nullptr}; // Weight is a special kind of argument, it is marked as weight explicitly // so that some weight related optimization can take place. 
bool is_weight{false}; diff --git a/lite/core/mir/quantized_op_attributes_inference_pass.cc b/lite/core/mir/quantized_op_attributes_inference_pass.cc index 54a4e779c6b6d0150cad966a4454f30624fe6dae..40cad8f6af75300ab85753b16e391daeeadc6c2f 100644 --- a/lite/core/mir/quantized_op_attributes_inference_pass.cc +++ b/lite/core/mir/quantized_op_attributes_inference_pass.cc @@ -58,6 +58,11 @@ void QuantizedOpAttributesInferencePass::Apply( } if (found) { inst.mutable_op_info()->SetAttr("output_scale", output_scale); + } else if (op_info->HasAttr("output_scale")) { + int bit_length = op_info->GetAttr("bit_length"); + int range = (1 << (bit_length - 1)) - 1; + output_scale = op_info->GetAttr("output_scale"); + inst.mutable_op_info()->SetAttr("output_scale", output_scale / range); } if (op_info->HasAttr("output_scale")) { inst.mutable_op_info()->SetAttr("enable_int8", true); diff --git a/lite/core/mir/static_kernel_pick_pass.h b/lite/core/mir/static_kernel_pick_pass.h index 6628444338830a35cb4ca78334398b0d4378bf3b..6d45be3b898271f0801d289d16235d3fb5fdd706 100644 --- a/lite/core/mir/static_kernel_pick_pass.h +++ b/lite/core/mir/static_kernel_pick_pass.h @@ -145,11 +145,12 @@ class StaticKernelPickPass : public mir::StmtPass { } VLOG(4) << "[score(final)]:" << final_score; - VLOG(4) << "-------- pick summary --------"; - VLOG(4) << " ===> winner_place():" << PrecisionToStr(winner_place.precision) + VLOG(2) << "-------- pick summary for " << instruct.op_type() + << " --------"; + VLOG(2) << " ===> winner_place():" << PrecisionToStr(winner_place.precision) << " " << DataLayoutToStr(winner_place.layout) << " " << TargetToStr(winner_place.target); - VLOG(4) << " ===> kernel.place():" + VLOG(2) << " ===> kernel.place():" << PrecisionToStr(kernel.place().precision) << " " << DataLayoutToStr(kernel.place().layout) << " " << TargetToStr(kernel.place().target); diff --git a/lite/core/mir/subgraph/subgraph_detector.cc b/lite/core/mir/subgraph/subgraph_detector.cc index 
943adc96b4be66eea1da6c71c189702834ccd295..91aa04d99505eac5fa9abc50a5008ec7b5de4fbf 100644 --- a/lite/core/mir/subgraph/subgraph_detector.cc +++ b/lite/core/mir/subgraph/subgraph_detector.cc @@ -66,11 +66,11 @@ std::string SubgraphVisualizer::operator()() { } else { exists_ops[op_type]++; } - auto op_name = op_type + std::to_string(exists_ops[op_type]); + auto op_name = op_type + paddle::lite::to_string(exists_ops[op_type]); std::string op_color = "white"; if (subgraph_indices.count(node)) { auto subgraph_idx = subgraph_indices[node]; - op_name += "_subgraph_" + std::to_string(subgraph_idx); + op_name += "_subgraph_" + paddle::lite::to_string(subgraph_idx); op_color = subgraph_colors[subgraph_idx % subgraph_colors.size()]; } dot.AddNode(op_name, @@ -223,6 +223,7 @@ std::unordered_set SubgraphDetector::GetExcludedNodesFromConfigFile() { std::vector lines = ReadLines(config_file_path); for (std::string line : lines) { + if (line.empty()) continue; std::vector node_info = Split(line, ":"); std::string op_type = node_info.at(0); std::vector in_vars_name; @@ -413,7 +414,7 @@ void SubgraphFuser::InsertNewNode(SSAGraph *graph, cpp::OpDesc subgraph_op_desc; subgraph_op_desc.SetType("subgraph"); - // Create a new sub block desc for storing all of Ops an Vars of the target + // Create a new sub block desc for storing all of Ops and Vars of the target // subgraph and sub_block_idx is set as a attribute of subgraph op, // sub_block_idx < 0 means it's a new subgraph op int sub_block_idx = -(subgraph_idx + 1); diff --git a/lite/core/mir/subgraph/subgraph_detector_test.cc b/lite/core/mir/subgraph/subgraph_detector_test.cc index e96a080d574fbdf4dbf05d79c28e64b2148a98e2..974772a9839c1e089359be3ae98e1833645ccd7a 100644 --- a/lite/core/mir/subgraph/subgraph_detector_test.cc +++ b/lite/core/mir/subgraph/subgraph_detector_test.cc @@ -39,7 +39,7 @@ std::vector AddFCDesc( CHECK_EQ(input_var_names.size(), 1); CHECK_EQ(wshape.size(), 2); static int id = 0; - std::string prefix = "fc_" + 
std::to_string(id); + std::string prefix = "fc_" + paddle::lite::to_string(id); auto* op_desc = block_desc->AddOp(); auto* wgt = block_desc->AddVar(); @@ -76,7 +76,7 @@ std::vector AddElementwiseAddDesc( const std::vector& input_Y_names) { // CHECK_EQ(input_var_names.size(), 2); static int id = 0; - std::string prefix = "elementwise_add_" + std::to_string(id); + std::string prefix = "elementwise_add_" + paddle::lite::to_string(id); auto* op_desc = block_desc->AddOp(); auto* out = block_desc->AddVar(); @@ -100,7 +100,7 @@ std::vector AddFeedDesc( const std::vector& input_X_names) { // CHECK_EQ(input_var_names.size(), 1); static int id = 0; - std::string prefix = "feed_" + std::to_string(id); + std::string prefix = "feed_" + paddle::lite::to_string(id); auto* op_desc = block_desc->AddOp(); auto* out = block_desc->AddVar(); @@ -123,7 +123,7 @@ std::vector AddFetchDesc( const std::vector& input_X_names) { // CHECK_EQ(input_var_names.size(), 1); static int id = 0; - std::string prefix = "fetch_" + std::to_string(id); + std::string prefix = "fetch_" + paddle::lite::to_string(id); auto* op_desc = block_desc->AddOp(); auto* out = block_desc->AddVar(); diff --git a/lite/core/mir/subgraph/subgraph_pass_test.cc b/lite/core/mir/subgraph/subgraph_pass_test.cc index 7cf65a23b8c5646c8ff6c77917dde53b7f036b9c..7117e1b3399fe823194f7f1a4d4c239099580955 100644 --- a/lite/core/mir/subgraph/subgraph_pass_test.cc +++ b/lite/core/mir/subgraph/subgraph_pass_test.cc @@ -17,6 +17,7 @@ #include "lite/api/paddle_api.h" #include "lite/api/test_helper.h" #include "lite/utils/cp_logging.h" +#include "lite/utils/string.h" DEFINE_string(model_file, "", "model file path of combined protobuf model"); DEFINE_string(params_file, "", "params file path of combined protobuf model"); @@ -31,43 +32,17 @@ namespace lite { // The helper functions for loading and running model from command line and // verifying output data std::vector TypeParsing(std::string text) { - std::vector types; - while (!text.empty()) 
{ - size_t index = text.find_first_of(":"); - std::string type = text.substr(0, index); - VLOG(3) << type; - types.push_back(type); - if (index == std::string::npos) { - break; - } else { - text = text.substr(index + 1); - } - } - return types; + return Split(text, ":"); } std::vector> ShapeParsing(std::string text) { std::vector> shapes; - while (!text.empty()) { - size_t index = text.find_first_of(":"); - std::string slice = text.substr(0, index); - std::vector shape; - while (!slice.empty()) { - size_t index = slice.find_first_of(","); - int d = atoi(slice.substr(0, index).c_str()); - VLOG(3) << d; - shape.push_back(d); - if (index == std::string::npos) { - break; - } else { - slice = slice.substr(index + 1); - } - } - shapes.push_back(shape); - if (index == std::string::npos) { - break; - } else { - text = text.substr(index + 1); + std::vector shape_strings = Split(text, ":"); + shapes.resize(shape_strings.size()); + for (int i = 0; i < shape_strings.size(); i++) { + std::vector shape_nums = Split(shape_strings[i], ","); + for (auto shape_num : shape_nums) { + shapes[i].push_back(atoi(shape_num.c_str())); } } return shapes; diff --git a/lite/core/mir/type_layout_cast_pass.cc b/lite/core/mir/type_layout_cast_pass.cc index f517a041200c32f04406bbcd877ed8484488e663..1133e5ba8203ec9fea177844a6311c993f6b8ff7 100644 --- a/lite/core/mir/type_layout_cast_pass.cc +++ b/lite/core/mir/type_layout_cast_pass.cc @@ -41,8 +41,9 @@ void TypeLayoutTransformPass::Apply(const std::unique_ptr& graph) { VLOG(4) << "!node->IsStmt():" << !node->IsStmt(); if (!node->IsStmt() || node->AsStmt().op_type() == "while") continue; auto inlinks = node->inlinks; - VLOG(4) << "node->AsStmt().desc:" << node->AsStmt().desc - << " inlinks.size():" << inlinks.size(); + VLOG(4) << "============== node->AsStmt().op_type():" + << node->AsStmt().op_type() << " inlinks.size():" << inlinks.size() + << " ================"; for (auto* in : inlinks) { ComplementInputs(graph.get(), node, in); } @@ -68,13 
+69,25 @@ void TypeLayoutTransformPass::ComplementInputs(SSAGraph* graph, CHECK(inst.op_info()->GetInputArgname(in_arg_name, &inst_in_tensor_name)); auto decl_arg_type = inst.picked_kernel().GetInputDeclType(inst_in_tensor_name); + CHECK(in->AsArg().type); - VLOG(5) << "\n inst_in_tensor_name:" << inst_in_tensor_name + VLOG(3) << "\n inst_in_tensor_name:" << inst_in_tensor_name << "\n in->AsArg().name:" << in->AsArg().name << "\n *in->AsArg().type:" << *in->AsArg().type << "\n *decl_arg_type:" << *decl_arg_type << "\n inst.op()->DebugString():" << inst.op()->DebugString(); + // TODO(ysh329): conflict if tensor with kARM target but kImageDefault(OpenCL + // layout). + // not a good judge, but don't find the source of this issue from + // static_pick_kernel_pass + // to this pass. + auto* in_arg_type = const_cast(in->AsArg().type); + if (in_arg_type->target() == TARGET(kARM) && + in_arg_type->layout() == DATALAYOUT(kImageDefault)) { + return; + } + if (!DataLayoutCompatible(*in->AsArg().type, *decl_arg_type)) { VLOG(4) << "found Layout unmatched tensor: " << in->AsArg().name << " for kernel " << inst.op()->DebugString() << " " diff --git a/lite/core/mir/type_precision_cast_pass.cc b/lite/core/mir/type_precision_cast_pass.cc index 25b367e73c4e27132b41cb2b5ec83b64a4bf226d..ecccf89fa76287a3f30756f7138fcce229e8f337 100644 --- a/lite/core/mir/type_precision_cast_pass.cc +++ b/lite/core/mir/type_precision_cast_pass.cc @@ -201,7 +201,8 @@ void PrecisionCastPass::AddCastInst(const Type& from, CHECK(in->IsArg()); // auto node_id = [&] { return graph->nodes().size(); }; auto cast_op_output_name = in->AsArg().name + "/precision_trans"; - // in->AsArg().name + "/precision_trans/" + std::to_string(node_id()); + // in->AsArg().name + "/precision_trans/" + + // paddle::lite::to_string(node_id()); auto* cast_op_output_arg = graph->NewArgumentNode(cast_op_output_name); cast_op_output_arg->AsArg().type = LiteType::GetTensorTy(from.target(), to.precision(), from.layout()); diff --git 
a/lite/core/op_lite.h b/lite/core/op_lite.h index 5dec9ed7aace837e3eb085a55d7b9b5382f7dea3..77d8091b4b16cfbce2efc3d549f916a9136c61ab 100644 --- a/lite/core/op_lite.h +++ b/lite/core/op_lite.h @@ -65,6 +65,7 @@ class OpLite : public Registry { virtual bool CheckShape() const { return true; } // Inference the outputs' shape. virtual bool InferShape() const { return true; } + virtual bool SmartInferShape() { return this->InferShape(); } // Run this operator. virtual bool Run(); // Indicate whether the Op runs only once or not @@ -150,6 +151,10 @@ class OpLite : public Registry { std::vector valid_places_; Place kernel_place_{TARGET(kHost), PRECISION(kFloat)}; std::unique_ptr op_info_; + std::vector last_input_shapes; + std::vector last_output_shapes; + std::vector>> last_output_lods; + std::vector>> last_input_lods; }; /* diff --git a/lite/core/profile/precision_profiler.h b/lite/core/profile/precision_profiler.h index d9111e5c46c9217b181e5a3e5a8c7981f46250df..39213a33cebd05d9cfa50d82cdfb09ad3f7ad637 100644 --- a/lite/core/profile/precision_profiler.h +++ b/lite/core/profile/precision_profiler.h @@ -22,18 +22,25 @@ #include #include "lite/core/program.h" +#ifdef LITE_WITH_OPENCL +#include "lite/backends/opencl/cl_image_converter.h" +#include "lite/backends/opencl/cl_include.h" +#include "lite/kernels/opencl/image_helper.h" +#endif + namespace paddle { namespace lite { namespace profile { template -static void write_tensorfile(const Tensor* tensor, const std::string& locate) { +static bool write_tensorfile(const Tensor* tensor, const std::string& locate) { if (locate.find('/') != std::string::npos) { - return; + return false; } FILE* fp = fopen(locate.c_str(), "w"); if (fp == nullptr) { LOG(ERROR) << "file open field " << locate; + return false; } else { const dtype* data = tensor->data(); for (int i = 0; i < tensor->numel(); ++i) { @@ -41,63 +48,227 @@ static void write_tensorfile(const Tensor* tensor, const std::string& locate) { } } fclose(fp); + return true; } 
class PrecisionProfiler { public: - explicit PrecisionProfiler(const Instruction* inst) : inst_(inst) {} - ~PrecisionProfiler() { - LOG(INFO) << ">> Running kernel: " << inst_->op()->op_info()->Repr() - << " on Target " << TargetToStr(inst_->kernel()->target()) << " " - << PrecisionToStr(inst_->kernel()->precision()); - auto tensor_mean = [](const Tensor* in, - PrecisionType ptype, - std::string name = "inst") -> double { - if (!in->data()) { - return -99999; - } - double sum = 0.; - switch (ptype) { + // TODO(ysh329): need to remove `explicit PrecisionProfiler` + // keep this method only for arm/math/conditional + explicit PrecisionProfiler(const Instruction* inst) { + std::string inst_precison_str = GetInstPrecision(inst); + } + + PrecisionProfiler() {} + + std::string GetSummaryHeader() { + using std::setw; + using std::left; + using std::fixed; + STL::stringstream ss; + ss << "========================================= " + << "Detailed Precision Profiler Summary " + << "=========================================" << std::endl; + ss << setw(45) << left << "operator:(kernel_info)" + << " " << setw(70) << left << "output_tensor_name:(tensor_info)" + << " " << setw(15) << left << "dims" + << " " << setw(15) << left << "mean" + << " " << setw(15) << left << "std_deviation" + << " " << setw(15) << left << "ave_grow_rate*" << std::endl; + + return ss.str(); + } + + template + double compute_mean(const T* in, const size_t length) { + double sum = 0.; + for (size_t i = 0; i < length; ++i) { + sum += in[i]; + } + return sum / length; + } + + template + double compute_standard_deviation(const T* in, + const size_t length, + bool has_mean = false, + double mean = 10000) { + if (!has_mean) { + mean = compute_mean(in, length); + } + + double variance = 0.; + for (size_t i = 0; i < length; ++i) { + variance += pow((in[i] - mean), 2); + } + variance /= length; + return sqrt(variance); + } + + template + double compute_average_grow_rate(const T* in, const size_t length) { + const 
double eps = 1e-5; + double ave_grow_rate = 0.0f; + for (size_t i = 1; i < length; ++i) { + ave_grow_rate += (in[i] - in[i - 1]) / (in[i - 1] + eps); + } + ave_grow_rate /= length; + return ave_grow_rate; + } + + // check if output tensor unused + bool is_unused(const Tensor* in) { + if (!in->data()) { + return true; + } + return false; + } + + void compute_tensor_precision_info(const Tensor* in, + TargetType target_type, + PrecisionType precision_type, + DataLayoutType layout_type, + double* mean, + double* std_dev, + double* ave_grow_rate, + std::string name = "inst", + bool write_result_to_file = false) { + std::string unsupported_error_log = + "Unsupported precision profile for kernel registered on" + + TargetToStr(target_type) + "/" + PrecisionToStr(precision_type) + "/" + + DataLayoutToStr(layout_type); + + if (target_type == TARGET(kARM) || target_type == TARGET(kHost) || + target_type == TARGET(kX86)) { + switch (precision_type) { case PRECISION(kFloat): { auto ptr = in->data(); - // write_tensorfile(in, name); - for (int i = 0; i < in->numel(); ++i) { - sum += ptr[i]; - } - return sum / in->numel(); + *mean = compute_mean(ptr, in->numel()); + *std_dev = + compute_standard_deviation(ptr, in->numel(), true, *mean); + *ave_grow_rate = compute_average_grow_rate(ptr, in->numel()); + write_result_to_file&& write_tensorfile(in, name); + return; } case PRECISION(kAny): { auto ptr = in->data(); - // write_tensorfile(in, name); - for (int i = 0; i < in->numel(); ++i) { - sum += ptr[i]; - } - return sum / in->numel(); + *mean = compute_mean(ptr, in->numel()); + *std_dev = + compute_standard_deviation(ptr, in->numel(), true, *mean); + *ave_grow_rate = compute_average_grow_rate(ptr, in->numel()); + write_result_to_file&& write_tensorfile(in, name); + return; } case PRECISION(kInt8): { auto ptr = in->data(); - // write_tensorfile(in, name); - for (int i = 0; i < in->numel(); ++i) { - sum += ptr[i]; - } - return sum / in->numel(); + *mean = compute_mean(ptr, 
in->numel()); + *std_dev = + compute_standard_deviation(ptr, in->numel(), true, *mean); + *ave_grow_rate = compute_average_grow_rate(ptr, in->numel()); + write_result_to_file&& write_tensorfile(in, name); + return; } case PRECISION(kInt32): { auto ptr = in->data(); - // write_tensorfile(in, name); - for (int i = 0; i < in->numel(); ++i) { - sum += ptr[i]; - } - return sum / in->numel(); + *mean = compute_mean(ptr, in->numel()); + *std_dev = compute_standard_deviation( + ptr, in->numel(), true, *mean); + *ave_grow_rate = compute_average_grow_rate(ptr, in->numel()); + write_result_to_file&& write_tensorfile(in, name); + return; } default: - LOG(INFO) << "unsupport data type: " << PrecisionToStr(ptype); - return 0.; + *mean = -333333333333; + *std_dev = -33333333333; + *ave_grow_rate = -33333333333; + LOG(ERROR) << unsupported_error_log; + return; } - }; - if (inst_->op()->op_info()->Type() != "fetch") { - auto op = const_cast(inst_->op()); - auto kernel = inst_->kernel(); +#ifdef LITE_WITH_OPENCL + } else if (target_type == TARGET(kOpenCL)) { + switch (layout_type) { + case DATALAYOUT(kImageDefault): { + paddle::lite::CLImageConverterDefault default_convertor; + auto image_shape = default_convertor.InitImageDimInfoWith(in->dims()); + size_t im_w = image_shape[0]; + size_t im_h = image_shape[1]; + VLOG(1) << "image shape(W,H) of " << name << ": " << im_w << " " + << im_h; + std::vector in_data_v(im_w * im_h * 4); + std::vector real_out_v(in->numel()); + const size_t cl_image2d_row_pitch{0}; + const size_t cl_image2d_slice_pitch{0}; + TargetWrapperCL::ImgcpySync(in_data_v.data(), + in->data(), + im_w, + im_h, + cl_image2d_row_pitch, + cl_image2d_slice_pitch, + IoDirection::DtoH); + default_convertor.ImageToNCHW( + in_data_v.data(), real_out_v.data(), image_shape, in->dims()); + CHECK(real_out_v.size() == in->numel()); + *mean = compute_mean(real_out_v.data(), real_out_v.size()); + *std_dev = compute_standard_deviation( + real_out_v.data(), in->numel(), true, *mean); + 
*ave_grow_rate = compute_average_grow_rate(real_out_v.data(), + real_out_v.size()); + write_result_to_file&& write_tensorfile(in, name); + return; + } + case DATALAYOUT(kNCHW): { + std::vector in_data_v(in->numel(), 0); + TargetWrapperCL::MemcpySync(in_data_v.data(), + in->data(), + in->numel() * sizeof(float), + IoDirection::DtoH); + VLOG(1) << name << ":" << in->numel(); + *mean = compute_mean(in_data_v.data(), in->numel()); + *std_dev = compute_standard_deviation( + in_data_v.data(), in->numel(), true, *mean); + *ave_grow_rate = + compute_average_grow_rate(in_data_v.data(), in->numel()); + write_result_to_file&& write_tensorfile(in, name); + return; + } + default: + *mean = -222222222222; + *std_dev = -22222222222; + *ave_grow_rate = -22222222222; + LOG(ERROR) << unsupported_error_log; + return; + } +#endif + } else { + *mean = -111111111111; + *std_dev = -11111111111; + *ave_grow_rate = -11111111111; + LOG(ERROR) << unsupported_error_log; + return; + } + } + + std::string GetInstPrecision(const Instruction* inst = nullptr) { + using std::setw; + using std::left; + using std::fixed; + STL::stringstream ss; + bool write_result_to_file = false; + + VLOG(1) << ">> Running kernel: " << inst->op()->op_info()->Repr() + << " registered on " << TargetToStr(inst->kernel()->target()) << "/" + << PrecisionToStr(inst->kernel()->precision()) << "/" + << DataLayoutToStr(inst->kernel()->layout()); + + std::string kernel_repr = inst->op()->op_info()->Repr(); + std::string kernel_place = TargetToStr(inst->kernel()->target()) + "/" + + PrecisionToStr(inst->kernel()->precision()) + + "/" + DataLayoutToStr(inst->kernel()->layout()); + std::string op_name = inst->op()->op_info()->Type(); + + if (inst->op()->op_info()->Type() != "fetch") { + auto op = const_cast(inst->op()); + auto kernel = inst->kernel(); auto op_scope = op->scope(); auto out_names = op->op_info()->output_names(); for (auto& out_name : out_names) { @@ -106,32 +277,90 @@ class PrecisionProfiler { auto type = 
kernel->GetOutputDeclType(out_arg_name); if (type->IsTensor()) { - auto tout = op_scope->FindVar(out_name)->GetMutable(); - double mean = tensor_mean(tout, type->precision(), out_name); - LOG(INFO) << "output name: " << out_name << ", dims: " << tout->dims() - << ", precision: " << PrecisionToStr(type->precision()) - << ", mean value: " << mean << " shape:" << tout->dims(); + const Tensor* tout = + op_scope->FindVar(out_name)->GetMutable(); + double mean = -999999; + double std_dev = -100000; + double ave_grow_rate = 99999; + std::string mean_str{"unused"}; + std::string std_dev_str{"unused"}; + std::string ave_grow_rate_str{"unused"}; + + if (!is_unused(tout)) { + compute_tensor_precision_info(tout, + type->target(), + type->precision(), + type->layout(), + &mean, + &std_dev, + &ave_grow_rate, + out_name, + write_result_to_file); + mean_str = std::to_string(mean); + std_dev_str = std::to_string(std_dev); + ave_grow_rate_str = std::to_string(ave_grow_rate); + } + std::string kernel_info = op_name + ":" + kernel_place; + std::string output_arg_info = out_name + ":" + + TargetToStr(type->target()) + "/" + + PrecisionToStr(type->precision()) + + "/" + DataLayoutToStr(type->layout()); + + ss << setw(45) << left << kernel_info << " " << setw(70) << left + << output_arg_info << " " << setw(15) << left << tout->dims() + << " " << setw(15) << left << mean_str << " " << setw(15) << left + << std_dev_str << " " << setw(15) << left << ave_grow_rate_str + << std::endl; } else if (type->IsTensorList()) { - auto tout = + auto touts = op_scope->FindVar(out_name)->GetMutable>(); - for (auto& t : *tout) { - double mean = tensor_mean(&t, type->precision(), out_name); - LOG(INFO) << "output name: " << out_name << ", dims: " << t.dims() - << ", precision: " << PrecisionToStr(type->precision()) - << ", mean value: " << mean; + for (auto t : *touts) { + const Tensor* tout = &t; + double mean = -999999; + double std_dev = -100000; + double ave_grow_rate = 99999; + std::string 
mean_str{"unused"}; + std::string std_dev_str{"unused"}; + std::string ave_grow_rate_str{"unused"}; + + if (!is_unused(tout)) { + compute_tensor_precision_info(tout, + type->target(), + type->precision(), + type->layout(), + &mean, + &std_dev, + &ave_grow_rate, + out_name, + write_result_to_file); + mean_str = std::to_string(mean); + std_dev_str = std::to_string(std_dev); + ave_grow_rate_str = std::to_string(ave_grow_rate); + } + std::string kernel_info = op_name + ":" + kernel_place; + std::string output_arg_info = out_name + ":" + + TargetToStr(type->target()) + "/" + + PrecisionToStr(type->precision()) + + "/" + DataLayoutToStr(type->layout()); + + ss << setw(45) << left << kernel_info << " " << setw(70) << left + << output_arg_info << " " << setw(15) << left << tout->dims() + << " " << setw(15) << left << mean_str << " " << setw(15) << left + << std_dev_str << " " << setw(15) << left << ave_grow_rate_str + << std::endl; } } } } + return ss.str(); } - - private: - const Instruction* inst_{nullptr}; }; } // namespace profile } // namespace lite } // namespace paddle +// TODO(ysh329): need to remove. 
+// keep this method only for arm/math/conditional_block_compute #define LITE_PRECISION_PROFILE(inst) \ { auto a = paddle::lite::profile::PrecisionProfiler(&inst); } diff --git a/lite/core/program.cc b/lite/core/program.cc index 0895643a6adde0095f9d2892c41f263eedd4284f..580389fbad54c0de8efd65ef78c9b69fd3e72893 100644 --- a/lite/core/program.cc +++ b/lite/core/program.cc @@ -136,6 +136,14 @@ void RuntimeProgram::UpdateVarsOfProgram(cpp::ProgramDesc* desc) { } void RuntimeProgram::Run() { +#ifdef LITE_WITH_PROFILE +#ifdef LITE_WITH_PRECISION_PROFILE + auto inst_precision_profiler = paddle::lite::profile::PrecisionProfiler(); + std::string precision_profiler_summary = + inst_precision_profiler.GetSummaryHeader(); +#endif +#endif + for (auto& inst : instructions_) { #ifndef LITE_WITH_FPGA if (inst.is_feed_fetch_op()) continue; @@ -144,13 +152,17 @@ void RuntimeProgram::Run() { #ifdef LITE_WITH_PROFILE #ifdef LITE_WITH_PRECISION_PROFILE #ifndef LITE_WITH_FPGA - LITE_PRECISION_PROFILE(inst) + precision_profiler_summary += + inst_precision_profiler.GetInstPrecision(&inst); #endif #endif // LITE_WITH_PRECISION_PROFILE #endif // LITE_WITH_PROFILE } #ifdef LITE_WITH_PROFILE LOG(INFO) << "\n" << profiler_.Summary(profile::Type::kDispatch, false, 0); +#ifdef LITE_WITH_PRECISION_PROFILE + LOG(INFO) << "\n" << precision_profiler_summary; +#endif // LITE_WITH_PRECISION_PROFILE #endif // LITE_WITH_PROFILE } @@ -274,7 +286,8 @@ void Instruction::Run() { return; } - op_->InferShape(); + // op_->InferShape(); + op_->SmartInferShape(); kernel_->Launch(); has_run_ = true; } diff --git a/lite/core/program_fake_utils.h b/lite/core/program_fake_utils.h index edcbb101aa5ddb090cc585a16597967cb5114936..fbee253872237bce08f3f67b948da79becbae21a 100644 --- a/lite/core/program_fake_utils.h +++ b/lite/core/program_fake_utils.h @@ -30,9 +30,9 @@ Program FakeProgram() { auto add_fc = [&](int id, std::string x) { // create variables - std::string w1 = "w" + std::to_string(id); - std::string b1 = "b" 
+ std::to_string(id); - std::string out1 = "out" + std::to_string(id); + std::string w1 = "w" + paddle::lite::to_string(id); + std::string b1 = "b" + paddle::lite::to_string(id); + std::string out1 = "out" + paddle::lite::to_string(id); auto w1v = program.scope()->Var(w1)->GetMutable(); auto b1v = program.scope()->Var(b1)->GetMutable(); auto out1v = program.scope()->Var(out1)->GetMutable(); diff --git a/lite/core/version.h.in b/lite/core/version.h.in index d34c32073b852a50b5d26984ed4812ac4f38a870..da2d5f3ed99631973d97a94741e1711391237261 100644 --- a/lite/core/version.h.in +++ b/lite/core/version.h.in @@ -53,9 +53,9 @@ static std::string version() { static int64_t int_version(const std::string& version) { const std::vector vec = Split(version, "."); if (vec.size() == 3) { - return std::stoi(vec[0]) * MAJOR_COEFF + - std::stoi(vec[1]) * MINOR_COEFF + - std::stoi(vec[2]) * PATCH_COEFF; + return atoi(vec[0].c_str()) * MAJOR_COEFF + + atoi(vec[1].c_str()) * MINOR_COEFF + + atoi(vec[2].c_str()) * PATCH_COEFF; } return -1; } diff --git a/lite/demo/cxx/mask_detection/mask_detection.cc b/lite/demo/cxx/mask_detection/mask_detection.cc index 67014aef9d1797312bffc05712b57357c4d8204c..09a9c0ee158e7d5913a78877711d831fc5738cf1 100644 --- a/lite/demo/cxx/mask_detection/mask_detection.cc +++ b/lite/demo/cxx/mask_detection/mask_detection.cc @@ -207,7 +207,8 @@ void RunModel(std::string det_model_file, cv::Mat roi = crop_img(img, rec_clip, classify_w, classify_h); // uncomment two lines below, save roi img to disk - // std::string roi_name = "roi_" + std::to_string(i) + ".jpg"; + // std::string roi_name = "roi_" + paddle::lite::to_string(i) + // + ".jpg"; // imwrite(roi_name, roi); // Do PreProcess diff --git a/lite/demo/cxx/mobile_light/mobilenetv1_light_api.cc b/lite/demo/cxx/mobile_light/mobilenetv1_light_api.cc index 3eaf63e7f9be80cf36c475476c644516bbc75fbd..150bcd231c27c25d8510fc8dfa3281a8351514dd 100644 --- a/lite/demo/cxx/mobile_light/mobilenetv1_light_api.cc +++ 
b/lite/demo/cxx/mobile_light/mobilenetv1_light_api.cc @@ -14,6 +14,7 @@ #include #include +#include #include #include #include @@ -36,6 +37,32 @@ std::string ShapePrint(const shape_t& shape) { return shape_str; } +template +double compute_mean(const T* in, const size_t length) { + double sum = 0.; + for (size_t i = 0; i < length; ++i) { + sum += in[i]; + } + return sum / length; +} + +template +double compute_standard_deviation(const T* in, + const size_t length, + bool has_mean = false, + double mean = 10000) { + if (!has_mean) { + mean = compute_mean(in, length); + } + + double variance = 0.; + for (size_t i = 0; i < length; ++i) { + variance += pow((in[i] - mean), 2); + } + variance /= length; + return sqrt(variance); +} + inline double GetCurrentUS() { struct timeval time; gettimeofday(&time, NULL); @@ -101,24 +128,24 @@ void RunModel(std::string model_dir, // 5. Get output std::cout << "\n====== output summary ====== " << std::endl; size_t output_tensor_num = predictor->GetOutputNames().size(); - std::cout << "output tesnor num:" << output_tensor_num << std::endl; + std::cout << "output tensor num:" << output_tensor_num << std::endl; for (size_t tidx = 0; tidx < output_tensor_num; ++tidx) { std::unique_ptr output_tensor = predictor->GetOutput(tidx); std::cout << "\n--- output tensor " << tidx << " ---" << std::endl; auto out_shape = output_tensor->shape(); - std::cout << "out_shape(NCHW):" << ShapePrint(out_shape) << std::endl; + auto out_data = output_tensor->data(); + auto out_mean = compute_mean(out_data, ShapeProduction(out_shape)); + auto out_std_dev = compute_standard_deviation( + out_data, ShapeProduction(out_shape), true, out_mean); - float sum = 0.f; - for (int i = 0; i < ShapeProduction(out_shape); ++i) { - sum += output_tensor->data()[i]; - } + std::cout << "output shape(NCHW):" << ShapePrint(out_shape) << std::endl; std::cout << "output tensor " << tidx << " elem num:" << ShapeProduction(out_shape) << std::endl; - std::cout << "output tensor " << 
tidx << " sum value:" << sum << std::endl; std::cout << "output tensor " << tidx - << " mean value:" << sum / ShapeProduction(out_shape) + << " standard deviation:" << out_std_dev << std::endl; + std::cout << "output tensor " << tidx << " mean value:" << out_mean << std::endl; // print output diff --git a/lite/gen_code/gen_code.cc b/lite/gen_code/gen_code.cc index 0d8f4d0d192f3563d00bb66778ca4e13a17b93b1..6c43f6e0116d9adfc4fc6f315d5653b2634dfe7b 100644 --- a/lite/gen_code/gen_code.cc +++ b/lite/gen_code/gen_code.cc @@ -111,11 +111,11 @@ void Module::AddOpDescHelper(const std::string &op_id, switch (type) { case AttrType::INT: - return std::to_string(desc.GetAttr(name)); + return paddle::lite::to_string(desc.GetAttr(name)); case AttrType::FLOAT: - return std::to_string(desc.GetAttr(name)); + return paddle::lite::to_string(desc.GetAttr(name)); case AttrType::BOOLEAN: - return std::to_string(desc.GetAttr(name)); + return paddle::lite::to_string(desc.GetAttr(name)); case AttrType::STRING: return "\"" + desc.GetAttr(name) + "\""; case AttrType::FLOATS: { diff --git a/lite/gen_code/gen_code.h b/lite/gen_code/gen_code.h index 58a7959f4eb34cb438bf0e25b49b36110435cc6b..d316eac43f99664fa71cba54b3ab5360852300a0 100644 --- a/lite/gen_code/gen_code.h +++ b/lite/gen_code/gen_code.h @@ -153,16 +153,16 @@ class Module { private: std::string WeightUniqueName() const { - return "w_" + std::to_string(weight_counter_++); + return "w_" + paddle::lite::to_string(weight_counter_++); } std::string TmpVarUniqueName() const { - return "tmp_" + std::to_string(tmp_var_counter_++); + return "tmp_" + paddle::lite::to_string(tmp_var_counter_++); } std::string OpUniqueName() const { - return "op_" + std::to_string(op_counter_++); + return "op_" + paddle::lite::to_string(op_counter_++); } std::string KernelUniqueName() const { - return "kernel_" + std::to_string(kernel_counter_++); + return "kernel_" + paddle::lite::to_string(kernel_counter_++); } std::string DataRepr(const std::string &raw_data, 
PrecisionType dtype); diff --git a/lite/kernels/arm/CMakeLists.txt b/lite/kernels/arm/CMakeLists.txt index 514d6069b5db9f1cf0fdd5d8a87a7cf89411dd23..7550d770145d92ebd343f96a82c6f34d72c91ea5 100644 --- a/lite/kernels/arm/CMakeLists.txt +++ b/lite/kernels/arm/CMakeLists.txt @@ -1,6 +1,6 @@ # NOTE we leave the add_kernel not protected by LITE_WITH_LIGHT_WEIGHT_FRAMEWORK so that all the kernels will be registered # to the model_optimize_tool. -if((NOT LITE_ON_MODEL_OPTIMIZE_TOOL) AND (NOT (LITE_WITH_LIGHT_WEIGHT_FRAMEWORK AND LITE_WITH_ARM))) +if((NOT LITE_ON_MODEL_OPTIMIZE_TOOL) AND (NOT LITE_WITH_PYTHON) AND (NOT (LITE_WITH_LIGHT_WEIGHT_FRAMEWORK AND LITE_WITH_ARM))) return() endif() @@ -109,6 +109,8 @@ add_kernel(mean_compute_arm ARM extra SRCS mean_compute.cc DEPS ${lite_kernel_de if(LITE_WITH_TRAIN) add_kernel(mean_grad_compute_arm ARM extra SRCS mean_grad_compute.cc DEPS ${lite_kernel_deps} math_arm) add_kernel(activation_grad_compute_arm ARM basic SRCS activation_grad_compute.cc DEPS ${lite_kernel_deps} math_arm) + add_kernel(elementwise_grad_compute_arm ARM basic SRCS elementwise_grad_compute.cc DEPS ${lite_kernel_deps} math_arm) + add_kernel(mul_grad_compute_arm ARM extra SRCS mul_grad_compute.cc DEPS ${lite_kernel_deps} math_arm) add_kernel(sgd_compute_arm ARM extra SRCS sgd_compute.cc DEPS ${lite_kernel_deps} math_arm) endif() diff --git a/lite/kernels/arm/elementwise_grad_compute.cc b/lite/kernels/arm/elementwise_grad_compute.cc new file mode 100644 index 0000000000000000000000000000000000000000..93bc5853459005137ef4f948f3a5892d76441b7c --- /dev/null +++ b/lite/kernels/arm/elementwise_grad_compute.cc @@ -0,0 +1,238 @@ +// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "lite/kernels/arm/elementwise_grad_compute.h" +#include +#include +#include "lite/backends/arm/math/funcs.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace arm { + +inline DDim trim_trailing_singular_dims(const DDim& dims) { + // Remove trailing dimensions of size 1 for y + auto actual_dims_size = dims.size(); + for (; actual_dims_size != 0; --actual_dims_size) { + if (dims[actual_dims_size - 1] != 1) break; + } + + std::vector trim_dims; + trim_dims.resize(actual_dims_size); + for (int i = 0; i < actual_dims_size; ++i) { + trim_dims[i] = dims[i]; + } + if (trim_dims.size() == 0) { + return DDim(); + } + return DDim(trim_dims); +} + +inline bool is_broadcast(const DDim& x_dims, + const DDim& y_dims, + int axis, + int* pre, + int* n, + int* post) { + if (axis < 0) { + axis = x_dims.size() - y_dims.size(); + } + DDim y_dim_trim = trim_trailing_singular_dims(y_dims); + axis = (y_dim_trim.size() == 0) ? 
x_dims.size() : axis; + if (x_dims.size() == y_dim_trim.size()) { + return false; + } + *pre = 1; + *n = 1; + *post = 1; + for (int i = 0; i < axis; ++i) { + (*pre) *= x_dims[i]; + } + for (int i = 0; i < y_dim_trim.size(); ++i) { + CHECK_EQ(x_dims[i + axis], y_dim_trim[i]) + << "Broadcast dimension mismatch."; + (*n) *= y_dim_trim[i]; + } + for (int i = axis + y_dim_trim.size(); i < x_dims.size(); ++i) { + (*post) *= x_dims[i]; + } + return true; +} + +void ElementwiseAddGradCompute::Run() { + auto& param = Param(); + const float* x_data = param.X->data(); + const float* y_data = param.Y->data(); + const float* out_grad_data = param.OutGrad->data(); + float* x_grad_data; + float* y_grad_data; + if (param.XGrad) { + x_grad_data = param.XGrad->mutable_data(); + } + if (param.YGrad) { + y_grad_data = param.YGrad->mutable_data(); + } + int axis = param.axis; + auto x_dims = param.X->dims(); + auto y_dims = param.Y->dims(); + int pre, n, post; + if (!param.XGrad) { + CHECK(param.YGrad); + lite::arm::math::elementwise_add_grad( + out_grad_data, y_grad_data, y_dims.production()); + return; + } + + if (!param.YGrad) { + CHECK(param.XGrad); + lite::arm::math::elementwise_add_grad( + out_grad_data, x_grad_data, x_dims.production()); + return; + } + + if (x_dims.size() < y_dims.size() && + is_broadcast(y_dims, x_dims, axis, &pre, &n, &post)) { + lite::arm::math::elementwise_add_grad_broadcast( + out_grad_data, y_grad_data, x_grad_data, pre, n, post); + } else if (is_broadcast(x_dims, y_dims, axis, &pre, &n, &post)) { + lite::arm::math::elementwise_add_grad_broadcast( + out_grad_data, x_grad_data, y_grad_data, pre, n, post); + } else { + lite::arm::math::elementwise_add_grad( + out_grad_data, x_grad_data, x_dims.production()); + lite::arm::math::elementwise_add_grad( + out_grad_data, y_grad_data, y_dims.production()); + } +} + +void ElementwiseSubGradCompute::Run() { + auto& param = Param(); + const float* x_data = param.X->data(); + const float* y_data = param.Y->data(); + 
const float* out_data = param.OutGrad->data(); + float* x_grad_data; + float* y_grad_data; + if (param.XGrad) { + x_grad_data = param.XGrad->mutable_data(); + } + if (param.YGrad) { + y_grad_data = param.YGrad->mutable_data(); + } + int axis = param.axis; + auto x_dims = param.X->dims(); + auto y_dims = param.Y->dims(); + int pre, n, post; + + if (!param.XGrad || !param.YGrad) { + CHECK(param.XGrad || param.YGrad); + lite::arm::math::elementwise_sub_grad( + out_data, x_grad_data, y_grad_data, y_dims.production()); + return; + } + + if (x_dims.size() < y_dims.size()) { + LOG(FATAL) << "elewise sub grad don't support x_dims size < y_dims size"; + } + if (is_broadcast(x_dims, y_dims, axis, &pre, &n, &post)) { + lite::arm::math::elementwise_sub_grad_broadcast( + out_data, x_grad_data, y_grad_data, pre, n, post); + } else { + lite::arm::math::elementwise_sub_grad( + out_data, x_grad_data, y_grad_data, x_dims.production()); + } +} + +template +void ElementwiseMulGradCompute::Run() { + LOG(FATAL) << "elementwise mul_grad not implement yet"; +} + +void ElementwiseMaxGradCompute::Run() { + LOG(FATAL) << "elementwise max_grad not implement yet"; +} + +void ElementwiseDivGradCompute::Run() { + LOG(FATAL) << "elementwise div_grad not implement yet"; +} + +} // namespace arm +} // namespace kernels +} // namespace lite +} // namespace paddle + +using elementwise_mul_grad_float = + paddle::lite::kernels::arm::ElementwiseMulGradCompute; + +REGISTER_LITE_KERNEL(elementwise_add_grad, + kARM, + kFloat, + kNCHW, + paddle::lite::kernels::arm::ElementwiseAddGradCompute, + def) + .BindInput("X", {LiteType::GetTensorTy(TARGET(kARM))}) + .BindInput("Y", {LiteType::GetTensorTy(TARGET(kARM))}) + .BindInput("Out@GRAD", {LiteType::GetTensorTy(TARGET(kARM))}) + .BindOutput("X@GRAD", {LiteType::GetTensorTy(TARGET(kARM))}) + .BindOutput("Y@GRAD", {LiteType::GetTensorTy(TARGET(kARM))}) + .Finalize(); + +REGISTER_LITE_KERNEL(elementwise_sub_grad, + kARM, + kFloat, + kNCHW, + 
paddle::lite::kernels::arm::ElementwiseSubGradCompute, + def) + .BindInput("X", {LiteType::GetTensorTy(TARGET(kARM))}) + .BindInput("Y", {LiteType::GetTensorTy(TARGET(kARM))}) + .BindInput("Out@GRAD", {LiteType::GetTensorTy(TARGET(kARM))}) + .BindOutput("X@GRAD", {LiteType::GetTensorTy(TARGET(kARM))}) + .BindOutput("Y@GRAD", {LiteType::GetTensorTy(TARGET(kARM))}) + .Finalize(); + +REGISTER_LITE_KERNEL(elementwise_div_grad, + kARM, + kFloat, + kNCHW, + paddle::lite::kernels::arm::ElementwiseDivGradCompute, + def) + .BindInput("X", {LiteType::GetTensorTy(TARGET(kARM))}) + .BindInput("Y", {LiteType::GetTensorTy(TARGET(kARM))}) + .BindInput("Out@GRAD", {LiteType::GetTensorTy(TARGET(kARM))}) + .BindOutput("X@GRAD", {LiteType::GetTensorTy(TARGET(kARM))}) + .BindOutput("Y@GRAD", {LiteType::GetTensorTy(TARGET(kARM))}) + .Finalize(); + +REGISTER_LITE_KERNEL( + elementwise_mul_grad, kARM, kFloat, kNCHW, elementwise_mul_grad_float, def) + .BindInput("X", {LiteType::GetTensorTy(TARGET(kARM))}) + .BindInput("Y", {LiteType::GetTensorTy(TARGET(kARM))}) + .BindInput("Out@GRAD", {LiteType::GetTensorTy(TARGET(kARM))}) + .BindOutput("X@GRAD", {LiteType::GetTensorTy(TARGET(kARM))}) + .BindOutput("Y@GRAD", {LiteType::GetTensorTy(TARGET(kARM))}) + .Finalize(); + +REGISTER_LITE_KERNEL(elementwise_max_grad, + kARM, + kFloat, + kNCHW, + paddle::lite::kernels::arm::ElementwiseMaxGradCompute, + def) + .BindInput("X", {LiteType::GetTensorTy(TARGET(kARM))}) + .BindInput("Y", {LiteType::GetTensorTy(TARGET(kARM))}) + .BindInput("Out@GRAD", {LiteType::GetTensorTy(TARGET(kARM))}) + .BindOutput("X@GRAD", {LiteType::GetTensorTy(TARGET(kARM))}) + .BindOutput("Y@GRAD", {LiteType::GetTensorTy(TARGET(kARM))}) + .Finalize(); diff --git a/lite/kernels/arm/elementwise_grad_compute.h b/lite/kernels/arm/elementwise_grad_compute.h new file mode 100644 index 0000000000000000000000000000000000000000..1273d8317410ce6689637e28597f9867702e1c2c --- /dev/null +++ b/lite/kernels/arm/elementwise_grad_compute.h @@ -0,0 
+1,68 @@ +// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once +#include +#include "lite/core/kernel.h" +#include "lite/core/op_registry.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace arm { + +class ElementwiseAddGradCompute + : public KernelLite { + public: + void Run() override; + + virtual ~ElementwiseAddGradCompute() = default; +}; + +class ElementwiseSubGradCompute + : public KernelLite { + public: + void Run() override; + + virtual ~ElementwiseSubGradCompute() = default; +}; + +template +class ElementwiseMulGradCompute : public KernelLite { + public: + void Run() override; + + virtual ~ElementwiseMulGradCompute() = default; +}; + +class ElementwiseMaxGradCompute + : public KernelLite { + public: + void Run() override; + + virtual ~ElementwiseMaxGradCompute() = default; +}; + +class ElementwiseDivGradCompute + : public KernelLite { + public: + void Run() override; + + virtual ~ElementwiseDivGradCompute() = default; +}; + +} // namespace arm +} // namespace kernels +} // namespace lite +} // namespace paddle diff --git a/lite/kernels/arm/mul_grad_compute.cc b/lite/kernels/arm/mul_grad_compute.cc new file mode 100644 index 0000000000000000000000000000000000000000..405d61d2ac3e4e060234eac63173e5bdd898d2ae --- /dev/null +++ b/lite/kernels/arm/mul_grad_compute.cc @@ -0,0 +1,163 @@ +// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. 
+// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+
+#include "lite/kernels/arm/mul_grad_compute.h"
+#include <vector>
+#include "lite/backends/arm/math/funcs.h"
+#include "lite/backends/arm/math/sgemm.h"
+#include "lite/core/op_registry.h"
+#include "lite/core/type_system.h"
+
+namespace paddle {
+namespace lite {
+namespace kernels {
+namespace arm {
+
+void MulGradCompute::PrepareForRun() {
+  auto& ctx = this->ctx_->template As<ARMContext>();
+}
+
+void MulGradCompute::Run() {
+  // step1 flatten_2d
+  auto& param = Param<param_t>();
+  const auto x_dims = param.x->dims();
+  const auto y_dims = param.y->dims();
+  const auto out_dims = param.output_grad->dims();
+
+  m_ = static_cast<int>(x_dims.Slice(0, param.x_num_col_dims).production());
+
+  k_ = static_cast<int>(
+      x_dims.Slice(param.x_num_col_dims, x_dims.size()).production());
+  n_ = static_cast<int>(
+      y_dims.Slice(param.y_num_col_dims, y_dims.size()).production());
+
+  const auto* out_grad_data = param.output_grad->data<float>();
+  const auto* x_data = param.x->data<float>();
+  const auto* y_data = param.y->data<float>();
+  float* x_grad_data;
+  float* y_grad_data;
+  if (param.x_grad) {
+    x_grad_data = param.x_grad->mutable_data<float>();
+  }
+
+  if (param.y_grad) {
+    y_grad_data = param.y_grad->mutable_data<float>();
+  }
+
+  paddle::lite::operators::ActivationParam act_param;
+  act_param.has_active = false;
+  // out_grad * y^T = x_grad
+  // (m, n), (n, k) -> (m, k)
+  auto& ctx = this->ctx_->template As<ARMContext>();
+  if (param.x_grad) {
+    if (m_ == 1) {
+      lite::arm::math::sgemv(y_data,
+                             out_grad_data,
+                             x_grad_data,
+                             false,
+                             k_,  // M
+                             n_,  // N
+                             false,
+                             nullptr,
+                             false,
+                             lite_api::ActivationType::kIndentity,
+                             &ctx);
+    } else {
+      paddle::lite::arm::math::sgemm(false,
+                                     true,           // is_transB,
+                                     m_,             // M
+                                     k_,             // N
+                                     n_,             // K
+                                     1.0f,           // alpha
+                                     out_grad_data,  // A
+                                     n_,             // lda
+                                     y_data,         // B
+                                     n_,             // ldb
+                                     0.f,            // beta
+                                     x_grad_data,    // C
+                                     k_,             // ldc
+                                     NULL,           // bias
+                                     false,          // is_bias
+                                     act_param,      // act_param
+                                     &ctx);          // ctx
+    }
+  }
+
+  // x^T * out_grad = y_grad
+  // (k, m) (m, n) -> (k, n)
+  if (param.y_grad) {
+    if (n_ == 1) {
+      lite::arm::math::sgemv(x_data,
+                             out_grad_data,
+                             y_grad_data,
+                             true,
+                             k_,  // M
+                             m_,  // N
+                             false,
+                             nullptr,
+                             false,
+                             lite_api::ActivationType::kIndentity,
+                             &ctx);
+    } else {
+      paddle::lite::arm::math::sgemm(true,           // is_transA
+                                     false,          // is_transB,
+                                     k_,             // M
+                                     n_,             // N
+                                     m_,             // K
+                                     1.0f,           // alpha
+                                     x_data,         // A
+                                     k_,             // lda
+                                     out_grad_data,  // B
+                                     n_,             // ldb
+                                     0.f,            // beta
+                                     y_grad_data,    // C
+                                     n_,             // ldc
+                                     NULL,           // bias
+                                     false,          // is_bias
+                                     act_param,      // act_param
+                                     &ctx);          // ctx
+    }
+  }
+}
+
+}  // namespace arm
+}  // namespace kernels
+}  // namespace lite
+}  // namespace paddle
+
+REGISTER_LITE_KERNEL(mul_grad,
+                     kARM,
+                     kFloat,
+                     kNCHW,
+                     paddle::lite::kernels::arm::MulGradCompute,
+                     def)
+    .BindInput("X", {LiteType::GetTensorTy(TARGET(kARM))})
+    .BindInput("Y", {LiteType::GetTensorTy(TARGET(kARM))})
+    .BindInput("Out@GRAD", {LiteType::GetTensorTy(TARGET(kARM))})
+    .BindOutput("X@GRAD", {LiteType::GetTensorTy(TARGET(kARM))})
+    .BindOutput("Y@GRAD", {LiteType::GetTensorTy(TARGET(kARM))})
+    .Finalize();
diff --git a/lite/kernels/arm/mul_grad_compute.h b/lite/kernels/arm/mul_grad_compute.h
new file mode 100644
index 0000000000000000000000000000000000000000..2cdaff3f10ce0a3c0a9509765f858c7371a75f0c
--- /dev/null
+++ b/lite/kernels/arm/mul_grad_compute.h
@@ -0,0 +1,42 @@
+// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+#include "lite/core/kernel.h"
+#include "lite/core/op_registry.h"
+#include "lite/core/types.h"
+
+namespace paddle {
+namespace lite {
+namespace kernels {
+namespace arm {
+
+class MulGradCompute : public KernelLite<TARGET(kARM), PRECISION(kFloat)> {
+ public:
+  using param_t = operators::MulGradParam;
+
+  void PrepareForRun() override;
+
+  void Run() override;
+
+  virtual ~MulGradCompute() = default;
+
+ private:
+  int m_, n_, k_;
+};
+
+}  // namespace arm
+}  // namespace kernels
+}  // namespace lite
+}  // namespace paddle
diff --git a/lite/kernels/bm/bridges/utility.cc b/lite/kernels/bm/bridges/utility.cc
index aa61462d046e1d21b49517a6362b54a884a6b6de..ffbefa137b9c9caab388fcee865469cea87b83e4 100644
--- a/lite/kernels/bm/bridges/utility.cc
+++ b/lite/kernels/bm/bridges/utility.cc
@@ -33,7 +33,7 @@ std::string UniqueName(const std::string& prefix) {
     counter = ++(it->second);
   }
 
-  return prefix + "_" + std::to_string(counter);
+  return prefix + "_" + paddle::lite::to_string(counter);
 }
 
 bool HasInputArg(const OpInfo* op_info,
diff --git a/lite/kernels/cuda/CMakeLists.txt b/lite/kernels/cuda/CMakeLists.txt
index 9ec335ce81bff6e69fbc5b12914110a445f0afb6..3fb3136bfc0787f9d8e539039811d25559919f4e 100644
--- a/lite/kernels/cuda/CMakeLists.txt
+++ b/lite/kernels/cuda/CMakeLists.txt
@@ -1,4 +1,4 @@
-if((NOT LITE_ON_MODEL_OPTIMIZE_TOOL) AND (NOT LITE_WITH_CUDA))
+if((NOT LITE_ON_MODEL_OPTIMIZE_TOOL) AND (NOT LITE_WITH_PYTHON) AND (NOT LITE_WITH_CUDA))
     return()
 endif()
 
diff --git a/lite/kernels/fpga/CMakeLists.txt b/lite/kernels/fpga/CMakeLists.txt
index f6c3a399490a86e2ac2fcd9cbeb76fca8c8ac479..1f9b84e7db0b98ce45e620cb1840842ba397953e 100755
--- a/lite/kernels/fpga/CMakeLists.txt
+++ b/lite/kernels/fpga/CMakeLists.txt
@@ -1,4 +1,4 @@
-if ((NOT LITE_ON_MODEL_OPTIMIZE_TOOL) AND (NOT LITE_WITH_FPGA))
+if ((NOT LITE_ON_MODEL_OPTIMIZE_TOOL) AND (NOT LITE_WITH_PYTHON) AND (NOT LITE_WITH_FPGA))
     return()
 endif()
 
diff --git a/lite/kernels/npu/bridges/graph.h b/lite/kernels/npu/bridges/graph.h
index cc4a7e2a7ce062090ca890d90e21aa643e37a0d3..67d8a2b1cc708f7530532840df3e71770b5a3695 100644 --- a/lite/kernels/npu/bridges/graph.h +++ b/lite/kernels/npu/bridges/graph.h @@ -87,7 +87,8 @@ class Graph { auto idx = Add(name, node); CHECK_GE(idx, 1); // Generate a unique name for the created HiAI IR - node->set_data(std::make_shared(name + "__" + std::to_string(idx))); + node->set_data( + std::make_shared(name + "__" + paddle::lite::to_string(idx))); return node; } diff --git a/lite/kernels/npu/bridges/split_op.cc b/lite/kernels/npu/bridges/split_op.cc index 44786220d7dd7fa24e012073e63935d6c824eb98..ef2bdb68fa9988b6a1985a34d22320193256de7b 100644 --- a/lite/kernels/npu/bridges/split_op.cc +++ b/lite/kernels/npu/bridges/split_op.cc @@ -64,10 +64,12 @@ int SplitConverter(void* ctx, OpLite* op, KernelBase* kernel) { split_op->create_dynamic_output_y(out_names.size()); int idx = 1; for (auto& out_name : out_names) { - auto zero_node = graph->Add(out_name + "/zero" + std::to_string(idx), 0); + auto zero_node = + graph->Add(out_name + "/zero" + paddle::lite::to_string(idx), 0); auto add_node = graph->Add(out_name); auto add_op = add_node->data(); - add_op->set_input_x1(*split_node->data(), "y" + std::to_string(idx)); + add_op->set_input_x1(*split_node->data(), + "y" + paddle::lite::to_string(idx)); add_op->set_input_x2(*zero_node->data()); idx++; } diff --git a/lite/kernels/npu/subgraph_compute.cc b/lite/kernels/npu/subgraph_compute.cc index 770ea345b633034972cb71cb4f1236ecefff36d7..d7b14a9319951eb827cbc9d346ee8e59e9571aee 100644 --- a/lite/kernels/npu/subgraph_compute.cc +++ b/lite/kernels/npu/subgraph_compute.cc @@ -85,22 +85,31 @@ int SubgraphEngine::BuildDeviceProgram() { << "[NPU] No input nodes found for building NPU model"; CHECK(!device_onames_.empty()) << "[NPU] No output nodes found for building NPU model"; + // Build the HiAI IR graph to HiAI om model as the device program - device_program_ = lite::npu::Device::Global().Build( + if 
(device_program_map_.count(inputs_shape_) > 0) { + return status; + } + auto device_client = lite::npu::Device::Global().Build( model_name_, device_inodes, device_onodes); - if (device_program_ == nullptr) { + if (device_client == nullptr) { LOG(WARNING) << "[NPU] Build model failed!"; return subgraph::FAILED; } + auto device_program = std::make_shared(device_client); + device_program_map_[inputs_shape_] = device_program; // Query and check the dimensions of valid input and output tensors std::vector device_idims, device_odims; - if (device_program_->GetModelIOTensorDim( + if (device_program->client->GetModelIOTensorDim( model_name_, device_idims, device_odims) != hiai::AI_SUCCESS) { LOG(WARNING) << "[NPU] Get the dimensions of input and output tensors failed!"; return subgraph::FAILED; } + device_program->device_idims = device_idims; + device_program->device_odims = device_odims; + CHECK_EQ(device_idims.size(), device_inames_.size()); CHECK_EQ(device_odims.size(), device_onames_.size()); origin_idims_.resize(device_inames_.size()); @@ -109,6 +118,7 @@ int SubgraphEngine::BuildDeviceProgram() { origin_odims_.resize(device_onames_.size()); origin_otensors_.resize(device_onames_.size()); device_otensors_.resize(device_onames_.size()); + for (int i = 0; i < device_inames_.size(); i++) { auto node = graph.Get(device_inames_[i]); auto precision = node->precision(); @@ -130,6 +140,8 @@ int SubgraphEngine::BuildDeviceProgram() { device_itensors_[i].reset(new hiai::AiTensor); device_itensors_[i]->Init(&(device_idims[i])); } + device_program->origin_idims = origin_idims_; + for (int i = 0; i < device_onames_.size(); i++) { auto node = graph.Get(device_onames_[i]); auto precision = node->precision(); @@ -170,6 +182,8 @@ int SubgraphEngine::BuildDeviceProgram() { << PrecisionToStr(precision); break; } + device_program->origin_odims = origin_odims_; + CHECK_EQ(origin_odims_[i].production(), device_odims[i].GetNumber() * device_odims[i].GetChannel() * 
device_odims[i].GetHeight() * device_odims[i].GetWidth()); @@ -181,14 +195,25 @@ int SubgraphEngine::BuildDeviceProgram() { int SubgraphEngine::LaunchDeviceProgram() { // Copy the data of origin input tensors to the buffer of input HiAI tensors + // init device_itensors_, device_otensors_, origin_otensors_ + auto device_program = device_program_map_[inputs_shape_]; for (size_t i = 0; i < device_itensors_.size(); i++) { + device_itensors_[i]->Init(&(device_program->device_idims[i])); std::memcpy(device_itensors_[i]->GetBuffer(), origin_itensors_[i]->raw_data(), origin_itensors_[i]->memory_size()); } + for (size_t i = 0; i < device_otensors_.size(); i++) { + device_otensors_[i]->Init(&(device_program->device_odims[i])); + } + for (size_t i = 0; i < origin_otensors_.size(); i++) { + origin_otensors_[i]->Resize(device_program->origin_odims[i]); + } + // Run the HiAI model by name std::string key = "model_name"; // Note: key seems must be model_name - model_context_.AddPara(key, model_name_); + hiai::AiContext model_context; + model_context.AddPara(key, model_name_); auto GetCurrentUS = []() -> double { struct timeval time; gettimeofday(&time, NULL); @@ -196,11 +221,11 @@ int SubgraphEngine::LaunchDeviceProgram() { }; int istamp; auto start_time = GetCurrentUS(); - CHECK_EQ( - device_program_->Process( - model_context_, device_itensors_, device_otensors_, 1000, istamp), - hiai::AI_SUCCESS); + CHECK_EQ(device_program->client->Process( + model_context, device_itensors_, device_otensors_, 1000, istamp), + hiai::AI_SUCCESS); VLOG(3) << "[NPU] Process cost " << GetCurrentUS() - start_time << " us"; + // Copy the data of output HiAI tensor to the buffer of origin output tensors for (size_t i = 0; i < device_otensors_.size(); i++) { std::memcpy(const_cast(origin_otensors_[i]->raw_data()), @@ -210,6 +235,18 @@ int SubgraphEngine::LaunchDeviceProgram() { return 0; } +bool SubgraphEngine::InputShapeChanged() { + std::vector> new_shape; + for (auto origin_itensor : 
origin_itensors_) { + new_shape.push_back(origin_itensor->dims().Vectorize()); + } + inputs_shape_ = new_shape; + if (device_program_map_.count(inputs_shape_) > 0) { + return false; + } + return true; +} + void SubgraphCompute::PrepareForRun() { auto& param = this->Param(); engine_.reset(new SubgraphEngine(ctx_.get(), diff --git a/lite/kernels/npu/subgraph_compute.h b/lite/kernels/npu/subgraph_compute.h index 29aeb01cdb50e2a9dd6d066a2f11106fd4cb20fb..801f61b0365c03d59c36e2a62ac3c2bb61f46607 100644 --- a/lite/kernels/npu/subgraph_compute.h +++ b/lite/kernels/npu/subgraph_compute.h @@ -14,6 +14,7 @@ #pragma once +#include #include #include #include @@ -38,17 +39,29 @@ class SubgraphEngine : public subgraph::Engine { : subgraph::Engine( ctx, block_idx, block_desc, input_names, output_names, scope) {} + struct device_program_t { + explicit device_program_t(std::shared_ptr _client) + : client(_client) {} + std::shared_ptr client{nullptr}; + std::vector origin_idims{}; + std::vector origin_odims{}; + std::vector device_idims{}; + std::vector device_odims{}; + }; + protected: int BuildDeviceProgram() override; int LaunchDeviceProgram() override; + bool InputShapeChanged() override; - std::string model_name_; - hiai::AiContext model_context_; - std::vector device_inames_; - std::vector device_onames_; - std::vector> device_itensors_; - std::vector> device_otensors_; - std::unique_ptr device_program_{nullptr}; + std::string model_name_{"model.om"}; + std::vector> inputs_shape_{}; + std::map>, std::shared_ptr> + device_program_map_{}; + std::vector device_inames_{}; + std::vector device_onames_{}; + std::vector> device_itensors_{}; + std::vector> device_otensors_{}; }; class SubgraphCompute : public KernelLite { diff --git a/lite/kernels/opencl/CMakeLists.txt b/lite/kernels/opencl/CMakeLists.txt index 25afb2fc399c6a4da8775440c1602031061267f7..652ce2593828c5131c0e3192db0a45a490b3cbc6 100644 --- a/lite/kernels/opencl/CMakeLists.txt +++ b/lite/kernels/opencl/CMakeLists.txt @@ 
-1,4 +1,4 @@ -if ((NOT LITE_ON_MODEL_OPTIMIZE_TOOL) AND (NOT LITE_WITH_OPENCL)) +if ((NOT LITE_ON_MODEL_OPTIMIZE_TOOL) AND (NOT LITE_WITH_PYTHON) AND (NOT LITE_WITH_OPENCL)) return () endif() @@ -128,6 +128,9 @@ add_kernel(io_copy_opencl OPENCL basic SRCS io_copy_buffer_compute.cc DEPS ${ten #lite_cc_test(test_conv_buffer_opencl SRCS conv_buffer_compute_test.cc # DEPS conv_opencl op_registry program context) +#lite_cc_test(test_im2col_buffer_opencl SRCS im2col_buffer_test.cc +# DEPS tensor cl_context cl_wrapper cl_target_wrapper) + #lite_cc_test(test_depthwise_conv2d_buffer_opencl SRCS depthwise_conv2d_buffer_compute_test.cc # DEPS depthwise_conv2d_opencl op_registry program context) diff --git a/lite/kernels/opencl/activation_image_compute.cc b/lite/kernels/opencl/activation_image_compute.cc index d24275f24b6503c777178257ae45305a7abdb02c..dbe487ba91d00c2de4c08edf140526d727bac6b5 100644 --- a/lite/kernels/opencl/activation_image_compute.cc +++ b/lite/kernels/opencl/activation_image_compute.cc @@ -101,6 +101,7 @@ class ActivationComputeImageDefault status = kernel.setArg(++arg_idx, scale_); CL_CHECK_FATAL(status); +#ifndef LITE_SHUTDOWN_LOG VLOG(4) << TargetToStr(param.X->target()); VLOG(4) << TargetToStr(param.Out->target()); VLOG(4) << "image_shape(w,h):" << image_shape["width"] << " " @@ -112,6 +113,7 @@ class ActivationComputeImageDefault VLOG(4) << "threshold:" << threshold_; VLOG(4) << "scale:" << scale_; VLOG(4) << "kernel func name:" << kernel_func_name_; +#endif auto global_work_size = cl::NDRange{static_cast(image_shape["width"]), @@ -177,7 +179,7 @@ REGISTER_LITE_KERNEL( // exp REGISTER_LITE_KERNEL( - exp_act, + exp, kOpenCL, kFP16, kImageDefault, @@ -195,7 +197,7 @@ REGISTER_LITE_KERNEL( // tanh REGISTER_LITE_KERNEL( - tanh_act, + tanh, kOpenCL, kFP16, kImageDefault, diff --git a/lite/kernels/opencl/activation_image_compute_test.cc b/lite/kernels/opencl/activation_image_compute_test.cc index 
40751a44b2b81dae387e2614f281b4a5e4a7bace..2f30ec6743fd488fc88f0b9f9d6544b3ca7642bf 100644 --- a/lite/kernels/opencl/activation_image_compute_test.cc +++ b/lite/kernels/opencl/activation_image_compute_test.cc @@ -109,13 +109,13 @@ TEST(act_image2d_fp16, compute) { func_name = "sigmoid"; break; case 6: // tanh - func_name = "tanh_act"; + func_name = "tanh"; break; case 7: // tanh func_name = "swish"; break; case 8: // tanh - func_name = "exp_act"; + func_name = "exp"; break; } LOG(INFO) << "func_name: " << func_name; @@ -307,7 +307,7 @@ USE_LITE_KERNEL(layout, kOpenCL, kAny, kImageDefault, NCHW_to_ImageDefault); USE_LITE_KERNEL(layout, kOpenCL, kAny, kNCHW, ImageDefault_to_NCHW); // exp -USE_LITE_KERNEL(exp_act, kOpenCL, kFP16, kImageDefault, ImageDefault); +USE_LITE_KERNEL(exp, kOpenCL, kFP16, kImageDefault, ImageDefault); // swish USE_LITE_KERNEL(swish, kOpenCL, kFP16, kImageDefault, ImageDefault); @@ -316,7 +316,7 @@ USE_LITE_KERNEL(swish, kOpenCL, kFP16, kImageDefault, ImageDefault); USE_LITE_KERNEL(leaky_relu, kOpenCL, kFP16, kImageDefault, ImageDefault); // tanh act -USE_LITE_KERNEL(tanh_act, kOpenCL, kFP16, kImageDefault, ImageDefault); +USE_LITE_KERNEL(tanh, kOpenCL, kFP16, kImageDefault, ImageDefault); // relu image2d fp16 USE_LITE_KERNEL(relu, kOpenCL, kFP16, kImageDefault, ImageDefault); diff --git a/lite/kernels/opencl/bilinear_interp_image_compute.cc b/lite/kernels/opencl/bilinear_interp_image_compute.cc index eeab8b043b3344b492fd9bafc3259e8d8ed08438..7e32010c0b5ff5cedad8b0da7ce7233fbf73da6f 100644 --- a/lite/kernels/opencl/bilinear_interp_image_compute.cc +++ b/lite/kernels/opencl/bilinear_interp_image_compute.cc @@ -77,17 +77,21 @@ class BilinearInterpImageCompute int out_h = out_dims[2]; int out_w = out_dims[3]; +#ifndef LITE_SHUTDOWN_LOG VLOG(4) << "x->target():" << TargetToStr(x->target()); VLOG(4) << "out->target():" << TargetToStr(out->target()); VLOG(4) << "x->dims():" << in_dims; VLOG(4) << "out->dims():" << out_dims; +#endif auto 
out_image_shape = InitImageDimInfoWith(out_dims); auto* x_img = x->data(); - // VLOG(4) << "x_image: " << x_img; auto* out_img = out->mutable_data( out_image_shape["width"], out_image_shape["height"]); + +#ifndef LITE_SHUTDOWN_LOG + // VLOG(4) << "x_image: " << x_img; // VLOG(4) << "out_image: " << out_img; VLOG(4) << "out_image_shape[w,h]: " << out_image_shape["width"] << " " << out_image_shape["height"]; @@ -96,6 +100,7 @@ class BilinearInterpImageCompute << ", align_delta: " << align_delta; VLOG(4) << "in_h: " << in_h << ", in_w: " << in_w; VLOG(4) << "out_h: " << out_h << ", out_w: " << out_w; +#endif STL::stringstream kernel_key; kernel_key << kernel_func_name_ << build_options_; @@ -107,8 +112,10 @@ class BilinearInterpImageCompute DDim(std::vector{ static_cast(out_image_shape["width"]), static_cast(out_image_shape["height"])})); +#ifndef LITE_SHUTDOWN_LOG VLOG(4) << "default_work_size: " << default_work_size[0] << ", " << default_work_size[1] << ", " << default_work_size[2]; +#endif cl_int status = kernel.setArg(arg_idx++, *x_img); CL_CHECK_FATAL(status); status = kernel.setArg(arg_idx++, *out_img); @@ -142,9 +149,10 @@ class BilinearInterpImageCompute event_.get()); CL_CHECK_FATAL(status); context.cl_wait_list()->emplace(out_img, event_); - +#ifndef LITE_SHUTDOWN_LOG VLOG(4) << "global_work_size:[2D]:" << global_work_size[0] << " " << global_work_size[1] << " " << global_work_size[2]; +#endif } protected: diff --git a/lite/kernels/opencl/concat_image_compute.cc b/lite/kernels/opencl/concat_image_compute.cc index f1b0cb21bb8ea68248c3caabb1146bbff461c6c9..95e64025662a4b87cd68c211ccc0b0fb7b84a9f2 100644 --- a/lite/kernels/opencl/concat_image_compute.cc +++ b/lite/kernels/opencl/concat_image_compute.cc @@ -123,7 +123,8 @@ class ConcatComputeImage : public KernelLitedims()[inputs[0]->dims().size() - 1]; - VLOG(4) << "concat 输入尺寸: "; +#ifndef LITE_SHUTDOWN_LOG + VLOG(4) << "concat input shape: "; for (size_t i = 0; i < inputs.size(); i++) { VLOG(4) << "inputs [" 
<< i << "]" << "[" << inputs[i]->dims().size() << "D]:" @@ -132,12 +133,13 @@ class ConcatComputeImage : public KernelLitedims()[3]; } - VLOG(4) << "concat 输出尺寸: "; + VLOG(4) << "concat output shape: "; VLOG(4) << " out dims: " << "[" << x_dims.size() << "D]:" << x_dims[0] << " " << x_dims[1] << " " << x_dims[2] << " " << x_dims[3]; VLOG(4) << "axis_: " << axis_; VLOG(4) << "flag_: " << flag_; +#endif auto global_work_size = cl::NDRange{static_cast(x_dims[x_dims.size() - 1]), @@ -145,6 +147,7 @@ class ConcatComputeImage : public KernelLite(image_shape["height"])}; +#ifndef LITE_SHUTDOWN_LOG VLOG(4) << TargetToStr(param.output->target()); VLOG(4) << "image_shape(w,h):" << image_shape["width"] << " " << image_shape["height"]; @@ -157,6 +160,7 @@ class ConcatComputeImage : public KernelLiteGetKernel(kernel_key.str()); int out_w = x_dims[x_dims.size() - 1]; @@ -198,8 +202,10 @@ class ConcatComputeImage : public KernelLitedata(); int in_w = in_dims[in_dims.size() - 1]; +#ifndef LITE_SHUTDOWN_LOG VLOG(4) << "image_shape(w,h):" << image_shape["width"] << " " << image_shape["height"]; +#endif global_work_size = cl::NDRange{static_cast(in_dims[in_dims.size() - 1]), static_cast(image_shape["width"] / diff --git a/lite/kernels/opencl/conv_image_compute.cc b/lite/kernels/opencl/conv_image_compute.cc index 8a6017d1ad295b7ae833cd15de47655e669e5b79..d664e37150fcc661e4bb97ed57a42364dd0d475d 100644 --- a/lite/kernels/opencl/conv_image_compute.cc +++ b/lite/kernels/opencl/conv_image_compute.cc @@ -78,6 +78,7 @@ void ConvImageCompute::PrepareForRun() { VLOG(3) << "dilation_equal:" << dilation_equal; VLOG(3) << "padding :" << paddings[0] << " " << paddings[1] << " " << paddings[2] << " " << paddings[3]; + CHECK(pad_equal && stride_equal && dilation_equal); if (kernel_h == 1 && kernel_w == 1) { @@ -85,9 +86,9 @@ void ConvImageCompute::PrepareForRun() { if (param.x->dims()[1] % 4 == 0) { kernel_func_names_.push_back("conv2d_1x1_simple"); } else { - 
kernel_func_names_.push_back("conv2d_1x1"); + kernel_func_names_.push_back("conv2d_1x1_opt"); } - kernel_func_paths_.push_back("image/conv2d_1x1_kernel.cl"); + kernel_func_paths_.push_back("image/conv2d_1x1_opt_kernel.cl"); CLImageConverterNWBlock converter; const DDim& filter_image_dims = converter.InitImageDimInfoWith(filter_dims); @@ -97,7 +98,7 @@ void ConvImageCompute::PrepareForRun() { filter_gpu_image_.mutable_data( filter_image_dims[0], filter_image_dims[1], filter_image_v.data()); - impl_ = &ConvImageCompute::Conv2d1x1; + impl_ = &ConvImageCompute::Conv2d1x1opt; #define DEPTH_CONV_USE_SPL #ifdef DEPTH_CONV_USE_SPL } else if (filter_dims[1] == 1 && x_dims[1] == output_dims[1] && @@ -141,9 +142,10 @@ void ConvImageCompute::PrepareForRun() { filter_image_dims[0], filter_image_dims[1], filter_image_v.data()); impl_ = &ConvImageCompute::DepthwiseConv2d; - } else if (kernel_h == 3 && kernel_h == 3) { + } else if (kernel_w == 3 && kernel_h == 3) { // conv2d_3x3 - kernel_func_names_.push_back("conv2d_3x3_opt"); + kernel_func_names_.push_back(bs > 1 ? "conv2d_3x3_multi_batch" + : "conv2d_3x3_opt"); kernel_func_paths_.push_back("image/conv2d_3x3_opt_kernel.cl"); CLImageConverterFolder converter; @@ -156,6 +158,8 @@ void ConvImageCompute::PrepareForRun() { impl_ = &ConvImageCompute::Conv2d3x3opt; } else if (kernel_h == 5 && kernel_w == 5) { +#define CONV_5x5_OPT +#ifndef CONV_5x5_OPT // conv2d_5x5 kernel_func_names_.push_back("conv2d_5x5"); kernel_func_paths_.push_back("image/conv2d_5x5_kernel.cl"); @@ -169,7 +173,27 @@ void ConvImageCompute::PrepareForRun() { filter_image_dims[0], filter_image_dims[1], filter_image_v.data()); impl_ = &ConvImageCompute::Conv2d5x5; +#else + // conv2d_5x5_opt + + kernel_func_names_.push_back(bs > 1 ? 
"conv2d_5x5_multi_batch" + : "conv2d_5x5_opt"); + kernel_func_paths_.push_back("image/conv2d_5x5_opt_kernel.cl"); + + CLImageConverterFolder converter; + const DDim& filter_image_dims = converter.InitImageDimInfoWith(filter_dims); + std::vector filter_image_v(filter_image_dims[0] * + filter_image_dims[1] * 4); // 4 : RGBA + converter.NCHWToImage(filter_cpu, filter_image_v.data(), filter_dims); + filter_gpu_image_.mutable_data( + filter_image_dims[0], filter_image_dims[1], filter_image_v.data()); + + impl_ = &ConvImageCompute::Conv2d5x5opt; +#endif +#undef CONV_5x5_OPT } else if (kernel_h == 7 && kernel_w == 7) { +#define CONV_7x7_OPT +#ifndef CONV_7x7_OPT // conv2d_7x7 kernel_func_names_.push_back("conv2d_7x7"); kernel_func_paths_.push_back("image/conv2d_7x7_kernel.cl"); @@ -183,6 +207,25 @@ void ConvImageCompute::PrepareForRun() { filter_image_dims[0], filter_image_dims[1], filter_image_v.data()); impl_ = &ConvImageCompute::Conv2d7x7; + +#else + // conv2d_7x7 + kernel_func_names_.push_back(bs > 1 ? "conv2d_7x7_multi_batch" + : "conv2d_7x7_opt"); + kernel_func_paths_.push_back("image/conv2d_7x7_opt_kernel.cl"); + + CLImageConverterFolder converter; + const DDim& filter_image_dims = converter.InitImageDimInfoWith(filter_dims); + std::vector filter_image_v(filter_image_dims[0] * + filter_image_dims[1] * 4); // 4 : RGBA + converter.NCHWToImage(filter_cpu, filter_image_v.data(), filter_dims); + this->filter_gpu_image_.mutable_data( + filter_image_dims[0], filter_image_dims[1], filter_image_v.data()); + + impl_ = &ConvImageCompute::Conv2d7x7opt; +#endif +#undef CONV_7x7_OPT + } else { LOG(FATAL) << "conv image compute not support this condition yet! 
"; } @@ -229,7 +272,7 @@ void ConvImageCompute::PrepareForRun() { } } -void ConvImageCompute::Conv2d1x1() { +void ConvImageCompute::Conv2d1x1opt() { const auto& param = *param_.get_mutable(); auto input_dims = param.x->dims(); auto paddings = *param.paddings; @@ -269,6 +312,7 @@ void ConvImageCompute::Conv2d1x1() { int w = default_work_size[1]; int nh = default_work_size[2]; +#ifndef LITE_SHUTDOWN_LOG VLOG(4) << "============ conv2d_1x1 params ============"; VLOG(4) << "input_image_shape: " << input_image_shape["width"] << "," << input_image_shape["height"]; @@ -290,7 +334,7 @@ void ConvImageCompute::Conv2d1x1() { VLOG(4) << "default work size{c_block, w, nh}: " << "{" << c_block << ", " << w << ", " << nh << "" << "}"; - +#endif CHECK_GE(dilations.size(), 2); CHECK(dilations[0] == dilations[1]); CHECK_GE(input_dims.size(), 4); @@ -313,10 +357,12 @@ void ConvImageCompute::Conv2d1x1() { auto kernel = context.cl_context()->GetKernel(kernel_key.str()); int maped_w = maptofactor(w, 4); +#ifndef LITE_SHUTDOWN_LOG VLOG(4) << "kernel_key: " << kernel_key.str(); VLOG(4) << "kernel ready ... 
" << kernel_key.str(); VLOG(4) << "maped_w: " << maped_w; VLOG(4) << "hasbias: " << has_bias; +#endif cl_int status; int arg_idx = 0; @@ -363,21 +409,27 @@ void ConvImageCompute::Conv2d1x1() { static_cast(maped_w), static_cast(default_work_size.data()[2])}; +#ifndef LITE_SHUTDOWN_LOG // VLOG(4) << "out_image: " << out_image; VLOG(4) << "global_work_size[3D]: {" << global_work_size[0] << "," << global_work_size[1] << "," << global_work_size[2] << "}"; +#endif size_t max_work_group_size = 0; kernel.getWorkGroupInfo(CLRuntime::Global()->device(), CL_KERNEL_WORK_GROUP_SIZE, &max_work_group_size); cl::NDRange local_work_size = cl::NullRange; +#ifndef LITE_SHUTDOWN_LOG VLOG(4) << "max_work_group_size: " << max_work_group_size; +#endif if (max_work_group_size > 0 && use_lws) { local_work_size = context.cl_context()->LocalWorkSize(global_work_size, max_work_group_size); +#ifndef LITE_SHUTDOWN_LOG VLOG(4) << "local_work_size[3D]: {" << local_work_size[0] << "," << local_work_size[1] << "," << local_work_size[2] << "}"; +#endif } status = context.cl_context()->GetCommandQueue().enqueueNDRangeKernel( @@ -453,6 +505,7 @@ void ConvImageCompute::Conv2d3x3() { int w = default_work_size[1]; int nh = default_work_size[2]; +#ifndef LITE_SHUTDOWN_LOG VLOG(4) << "============ conv2d params ============"; VLOG(4) << "input_image_shape: " << input_image_shape["width"] << "," << input_image_shape["height"]; @@ -477,6 +530,7 @@ void ConvImageCompute::Conv2d3x3() { VLOG(4) << "default work size{c_block, w, nh}: " << "{" << c_block << ", " << w << ", " << nh << "" << "}"; +#endif CHECK_GE(dilations.size(), 2); CHECK(dilations[0] == dilations[1]); @@ -496,9 +550,12 @@ void ConvImageCompute::Conv2d3x3() { STL::stringstream kernel_key; kernel_key << kernel_func_names_[0] << build_options_[0]; auto kernel = context.cl_context()->GetKernel(kernel_key.str()); + +#ifndef LITE_SHUTDOWN_LOG VLOG(4) << "kernel_key: " << kernel_key.str(); VLOG(4) << "kernel ready ... 
" << kernel_key.str(); VLOG(4) << "w: " << w; +#endif cl_int status; int arg_idx = 0; @@ -513,7 +570,9 @@ void ConvImageCompute::Conv2d3x3() { status = kernel.setArg(++arg_idx, *filter_image); CL_CHECK_FATAL(status); if (has_bias) { +#ifndef LITE_SHUTDOWN_LOG VLOG(4) << "set bias_image: "; +#endif status = kernel.setArg(++arg_idx, *bias_image); CL_CHECK_FATAL(status); } @@ -553,9 +612,11 @@ void ConvImageCompute::Conv2d3x3() { static_cast(default_work_size.data()[1]), static_cast(default_work_size.data()[2])}; +#ifndef LITE_SHUTDOWN_LOG // VLOG(4) << "out_image: " << out_image; VLOG(4) << "global_work_size[3D]: {" << global_work_size[0] << "," << global_work_size[1] << "," << global_work_size[2] << "}"; +#endif status = context.cl_context()->GetCommandQueue().enqueueNDRangeKernel( kernel, @@ -586,7 +647,8 @@ void ConvImageCompute::Conv2d3x3opt() { int output_width = output_dims[3]; int output_height = output_dims[2]; int output_channel = output_dims[1]; - + CHECK_EQ(input_dims[0], output_dims[0]); + int batch = input_dims[0]; auto out_image_shape = InitImageDimInfoWith(output_dims); auto* out_image = param.output->mutable_data( out_image_shape["width"], out_image_shape["height"]); @@ -611,8 +673,9 @@ void ConvImageCompute::Conv2d3x3opt() { int h_blk_size = 1; int h_blk = (nh + h_blk_size - 1) / h_blk_size; - // default_work_size[2] = h_blk; +// default_work_size[2] = h_blk; +#ifndef LITE_SHUTDOWN_LOG VLOG(4) << "============ conv2d params ============"; // VLOG(4) << "input_image_shape: " << input_image_shape["width"] << "," // << input_image_shape["height"]; @@ -632,6 +695,7 @@ void ConvImageCompute::Conv2d3x3opt() { VLOG(4) << "default work size{c_block, w, nh}: " << "{" << c_block << ", " << w << ", " << nh << "" << "}"; +#endif CHECK_GE(dilations.size(), 2); CHECK(dilations[0] == dilations[1]); @@ -651,8 +715,11 @@ void ConvImageCompute::Conv2d3x3opt() { STL::stringstream kernel_key; kernel_key << kernel_func_names_[0] << build_options_[0]; auto kernel = 
context.cl_context()->GetKernel(kernel_key.str()); + +#ifndef LITE_SHUTDOWN_LOG VLOG(4) << "kernel_key: " << kernel_key.str(); VLOG(4) << "kernel ready ... " << kernel_key.str(); +#endif cl_int status; int arg_idx = 0; @@ -667,7 +734,9 @@ void ConvImageCompute::Conv2d3x3opt() { status = kernel.setArg(++arg_idx, *filter_image); CL_CHECK_FATAL(status); if (has_bias) { +#ifndef LITE_SHUTDOWN_LOG VLOG(4) << "set bias_image: "; +#endif status = kernel.setArg(++arg_idx, *bias_image); CL_CHECK_FATAL(status); } @@ -681,6 +750,8 @@ void ConvImageCompute::Conv2d3x3opt() { status = kernel.setArg(++arg_idx, dilations[0]); CL_CHECK_FATAL(status); + status = kernel.setArg(++arg_idx, batch); + CL_CHECK_FATAL(status); status = kernel.setArg(++arg_idx, input_channel); CL_CHECK_FATAL(status); status = kernel.setArg(++arg_idx, input_width); @@ -696,22 +767,27 @@ void ConvImageCompute::Conv2d3x3opt() { cl::NDRange{static_cast(default_work_size.data()[0]), static_cast(w_blk), static_cast(h_blk)}; - +#ifndef LITE_SHUTDOWN_LOG // VLOG(4) << "out_image: " << out_image; VLOG(4) << "global_work_size[3D]: {" << global_work_size[0] << "," << global_work_size[1] << "," << global_work_size[2] << "}"; +#endif size_t max_work_group_size = 0; kernel.getWorkGroupInfo(CLRuntime::Global()->device(), CL_KERNEL_WORK_GROUP_SIZE, &max_work_group_size); cl::NDRange local_work_size = cl::NullRange; +#ifndef LITE_SHUTDOWN_LOG VLOG(4) << "max_work_group_size: " << max_work_group_size; +#endif if (max_work_group_size > 0 && use_lws) { local_work_size = context.cl_context()->LocalWorkSize(global_work_size, max_work_group_size); +#ifndef LITE_SHUTDOWN_LOG VLOG(4) << "local_work_size[3D]: {" << local_work_size[0] << "," << local_work_size[1] << "," << local_work_size[2] << "}"; +#endif } status = context.cl_context()->GetCommandQueue().enqueueNDRangeKernel( @@ -767,6 +843,7 @@ void ConvImageCompute::Conv2d5x5() { int w = default_work_size[1]; int nh = default_work_size[2]; +#ifndef LITE_SHUTDOWN_LOG VLOG(4) << 
"============ conv2d params ============"; VLOG(4) << "input_image_shape: " << input_image_shape["width"] << "," << input_image_shape["height"]; @@ -789,6 +866,7 @@ void ConvImageCompute::Conv2d5x5() { VLOG(4) << "default work size{c_block, w, nh}: " << "{" << c_block << ", " << w << ", " << nh << "" << "}"; +#endif CHECK_GE(dilations.size(), 2); CHECK(dilations[0] == dilations[1]); @@ -808,9 +886,12 @@ void ConvImageCompute::Conv2d5x5() { STL::stringstream kernel_key; kernel_key << kernel_func_names_[0] << build_options_[0]; auto kernel = context.cl_context()->GetKernel(kernel_key.str()); + +#ifndef LITE_SHUTDOWN_LOG VLOG(4) << "kernel_key: " << kernel_key.str(); VLOG(4) << "kernel ready ... " << kernel_key.str(); VLOG(4) << "w: " << w; +#endif cl_int status; int arg_idx = 0; @@ -825,7 +906,9 @@ void ConvImageCompute::Conv2d5x5() { status = kernel.setArg(++arg_idx, *filter_image); CL_CHECK_FATAL(status); if (has_bias) { +#ifndef LITE_SHUTDOWN_LOG VLOG(4) << "set bias_image: "; +#endif status = kernel.setArg(++arg_idx, *bias_image); CL_CHECK_FATAL(status); } @@ -855,9 +938,11 @@ void ConvImageCompute::Conv2d5x5() { static_cast(default_work_size.data()[1]), static_cast(default_work_size.data()[2])}; +#ifndef LITE_SHUTDOWN_LOG // VLOG(4) << "out_image: " << out_image; VLOG(4) << "global_work_size[3D]: {" << global_work_size[0] << "," << global_work_size[1] << "," << global_work_size[2] << "}"; +#endif status = context.cl_context()->GetCommandQueue().enqueueNDRangeKernel( kernel, @@ -870,6 +955,172 @@ void ConvImageCompute::Conv2d5x5() { context.cl_wait_list()->emplace(out_image, event_); } +void ConvImageCompute::Conv2d5x5opt() { + const auto& param = *param_.get_mutable(); + auto input_dims = param.x->dims(); + auto paddings = *param.paddings; + auto strides = param.strides; + auto dilations = *param.dilations; + + auto* input_image = param.x->data(); + auto* filter_image = filter_gpu_image_.data(); + auto filter_dims = param.filter->dims(); + auto output_dims = 
param.output->dims(); + + int input_width = input_dims[3]; + int input_height = input_dims[2]; + int input_channel = input_dims[1]; + int output_width = output_dims[3]; + int output_height = output_dims[2]; + int output_channel = output_dims[1]; + CHECK_EQ(input_dims[0], output_dims[0]); + int batch = input_dims[0]; + + auto out_image_shape = InitImageDimInfoWith(output_dims); + auto* out_image = param.output->mutable_data( + out_image_shape["width"], out_image_shape["height"]); + + const bool has_bias = param.bias != nullptr; + const bool is_element_wise_bias = + has_bias && param.output->dims() == param.bias->dims(); + + const std::vector& default_work_size = + DefaultWorkSize(output_dims, + DDim(std::vector{ + static_cast(out_image_shape["width"]), + static_cast(out_image_shape["height"])})); + + int c_block = default_work_size[0]; + int w = default_work_size[1]; + int nh = default_work_size[2]; + + int w_blk_size = 5; + int w_blk = (w + w_blk_size - 1) / w_blk_size; + // default_work_size[1] = w_blk; + + int h_blk_size = 1; + int h_blk = (nh + h_blk_size - 1) / h_blk_size; +// default_work_size[2] = h_blk; +#ifndef LITE_SHUTDOWN_LOG + VLOG(4) << "============ conv2d params ============"; + // VLOG(4) << "input_image_shape: " << input_image_shape["width"] << "," + // << input_image_shape["height"]; + // VLOG(4) << "input_image: " << input_image; + VLOG(4) << "input_dims: " << input_dims; + VLOG(4) << "filter_dims: " << filter_dims; + // VLOG(4) << "filter_image: " << filter_image; + VLOG(4) << "output_dims: " << output_dims; + VLOG(4) << "out_image_shape: " << out_image_shape["width"] << ", " + << out_image_shape["height"]; + VLOG(4) << "paddings: " << paddings[0] << "," << paddings[1]; + VLOG(4) << "has bias: " << has_bias; + VLOG(4) << "is_element_wise_bias : " << is_element_wise_bias; + VLOG(4) << "strides: " << strides[0] << "," << strides[1]; + VLOG(4) << "dilations.size : " << dilations.size(); + VLOG(4) << "dilations: " << dilations[0] << ", " << 
dilations[1]; + VLOG(4) << "default work size{c_block, w, nh}: " + << "{" << c_block << ", " << w << ", " << nh << "" + << "}"; +#endif + CHECK_GE(dilations.size(), 2); + CHECK(dilations[0] == dilations[1]); + CHECK_GE(input_dims.size(), 4); + CHECK_GE(paddings.size(), 2); + CHECK(paddings[0] == paddings[1]); + CHECK_GE(strides.size(), 2); + CHECK(strides[0] == strides[1]); + + const cl::Image2D* bias_image = nullptr; + if (has_bias) { + bias_image = bias_gpu_image_.data(); + } + + auto& context = ctx_->As(); + CHECK(context.cl_context() != nullptr); + STL::stringstream kernel_key; + kernel_key << kernel_func_names_[0] << build_options_[0]; + auto kernel = context.cl_context()->GetKernel(kernel_key.str()); +#ifndef LITE_SHUTDOWN_LOG + VLOG(4) << "kernel_key: " << kernel_key.str(); + VLOG(4) << "kernel ready ... " << kernel_key.str(); +#endif + cl_int status; + int arg_idx = 0; + status = kernel.setArg(arg_idx, c_block); + CL_CHECK_FATAL(status); + status = kernel.setArg(++arg_idx, w_blk); + CL_CHECK_FATAL(status); + status = kernel.setArg(++arg_idx, h_blk); + CL_CHECK_FATAL(status); + status = kernel.setArg(++arg_idx, *input_image); + CL_CHECK_FATAL(status); + status = kernel.setArg(++arg_idx, *filter_image); + CL_CHECK_FATAL(status); + if (has_bias) { + status = kernel.setArg(++arg_idx, *bias_image); + CL_CHECK_FATAL(status); + } + status = kernel.setArg(++arg_idx, *out_image); + CL_CHECK_FATAL(status); + status = kernel.setArg(++arg_idx, strides[0]); + CL_CHECK_FATAL(status); + + status = kernel.setArg(++arg_idx, paddings[0]); + CL_CHECK_FATAL(status); + + status = kernel.setArg(++arg_idx, dilations[0]); + CL_CHECK_FATAL(status); + status = kernel.setArg(++arg_idx, batch); + CL_CHECK_FATAL(status); + status = kernel.setArg(++arg_idx, input_channel); + CL_CHECK_FATAL(status); + status = kernel.setArg(++arg_idx, input_width); + CL_CHECK_FATAL(status); + status = kernel.setArg(++arg_idx, input_height); + CL_CHECK_FATAL(status); + status = kernel.setArg(++arg_idx, 
output_width); + CL_CHECK_FATAL(status); + status = kernel.setArg(++arg_idx, output_height); + CL_CHECK_FATAL(status); + + auto global_work_size = + cl::NDRange{static_cast(default_work_size.data()[0]), + static_cast(w_blk), + static_cast(h_blk)}; + +// VLOG(4) << "out_image: " << out_image; +#ifndef LITE_SHUTDOWN_LOG + VLOG(4) << "global_work_size[3D]: {" << global_work_size[0] << "," + << global_work_size[1] << "," << global_work_size[2] << "}"; +#endif + size_t max_work_group_size = 0; + kernel.getWorkGroupInfo(CLRuntime::Global()->device(), + CL_KERNEL_WORK_GROUP_SIZE, + &max_work_group_size); + cl::NDRange local_work_size = cl::NullRange; +#ifndef LITE_SHUTDOWN_LOG + VLOG(4) << "max_work_group_size: " << max_work_group_size; +#endif + if (max_work_group_size > 0 && use_lws) { + local_work_size = context.cl_context()->LocalWorkSize(global_work_size, + max_work_group_size); +#ifndef LITE_SHUTDOWN_LOG + VLOG(4) << "local_work_size[3D]: {" << local_work_size[0] << "," + << local_work_size[1] << "," << local_work_size[2] << "}"; +#endif + } + + status = context.cl_context()->GetCommandQueue().enqueueNDRangeKernel( + kernel, + cl::NullRange, + global_work_size, + local_work_size, + nullptr, + event_.get()); + CL_CHECK_FATAL(status); + context.cl_wait_list()->emplace(out_image, event_); +} + void ConvImageCompute::Conv2d7x7() { const auto& param = *param_.get_mutable(); auto input_dims = param.x->dims(); @@ -912,6 +1163,7 @@ void ConvImageCompute::Conv2d7x7() { int w = default_work_size[1]; int nh = default_work_size[2]; +#ifndef LITE_SHUTDOWN_LOG VLOG(4) << "============ conv2d params ============"; VLOG(4) << "input_image_shape: " << input_image_shape["width"] << "," << input_image_shape["height"]; @@ -934,6 +1186,7 @@ void ConvImageCompute::Conv2d7x7() { VLOG(4) << "default work size{c_block, w, nh}: " << "{" << c_block << ", " << w << ", " << nh << "" << "}"; +#endif CHECK_GE(dilations.size(), 2); CHECK(dilations[0] == dilations[1]); @@ -953,9 +1206,12 @@ void 
ConvImageCompute::Conv2d7x7() { STL::stringstream kernel_key; kernel_key << kernel_func_names_[0] << build_options_[0]; auto kernel = context.cl_context()->GetKernel(kernel_key.str()); + +#ifndef LITE_SHUTDOWN_LOG VLOG(4) << "kernel_key: " << kernel_key.str(); VLOG(4) << "kernel ready ... " << kernel_key.str(); VLOG(4) << "w: " << w; +#endif cl_int status; int arg_idx = 0; @@ -970,7 +1226,9 @@ void ConvImageCompute::Conv2d7x7() { status = kernel.setArg(++arg_idx, *filter_image); CL_CHECK_FATAL(status); if (has_bias) { +#ifndef LITE_SHUTDOWN_LOG VLOG(4) << "set bias_image: "; +#endif status = kernel.setArg(++arg_idx, *bias_image); CL_CHECK_FATAL(status); } @@ -1000,9 +1258,11 @@ void ConvImageCompute::Conv2d7x7() { static_cast(default_work_size.data()[1]), static_cast(default_work_size.data()[2])}; +#ifndef LITE_SHUTDOWN_LOG // VLOG(4) << "out_image: " << out_image; VLOG(4) << "global_work_size[3D]: {" << global_work_size[0] << "," << global_work_size[1] << "," << global_work_size[2] << "}"; +#endif status = context.cl_context()->GetCommandQueue().enqueueNDRangeKernel( kernel, @@ -1014,7 +1274,167 @@ void ConvImageCompute::Conv2d7x7() { CL_CHECK_FATAL(status); context.cl_wait_list()->emplace(out_image, event_); } +void ConvImageCompute::Conv2d7x7opt() { + const auto& param = *param_.get_mutable(); + auto input_dims = param.x->dims(); + auto paddings = *param.paddings; + auto strides = param.strides; + auto dilations = *param.dilations; + auto* input_image = param.x->data(); + auto* filter_image = filter_gpu_image_.data(); + auto filter_dims = param.filter->dims(); + auto output_dims = param.output->dims(); + + int input_width = input_dims[3]; + int input_height = input_dims[2]; + int input_channel = input_dims[1]; + int output_width = output_dims[3]; + int output_height = output_dims[2]; + int output_channel = output_dims[1]; + CHECK_EQ(input_dims[0], output_dims[0]); + int batch = input_dims[0]; + auto out_image_shape = InitImageDimInfoWith(output_dims); + auto* 
out_image = param.output->mutable_data( + out_image_shape["width"], out_image_shape["height"]); + + const bool has_bias = param.bias != nullptr; + const bool is_element_wise_bias = + has_bias && param.output->dims() == param.bias->dims(); + + const std::vector& default_work_size = + DefaultWorkSize(output_dims, + DDim(std::vector{ + static_cast(out_image_shape["width"]), + static_cast(out_image_shape["height"])})); + + int c_block = default_work_size[0]; + int w = default_work_size[1]; + int nh = default_work_size[2]; + + int w_blk_size = 5; + int w_blk = (w + w_blk_size - 1) / w_blk_size; + // default_work_size[1] = w_blk; + + int h_blk_size = 1; + int h_blk = (nh + h_blk_size - 1) / h_blk_size; +// default_work_size[2] = h_blk; +#ifndef LITE_SHUTDOWN_LOG + VLOG(4) << "============ conv2d 7x7 params ============"; + // VLOG(4) << "input_image_shape: " << input_image_shape["width"] << "," + // << input_image_shape["height"]; + // VLOG(4) << "input_image: " << input_image; + VLOG(4) << "input_dims: " << input_dims; + VLOG(4) << "filter_dims: " << filter_dims; + // VLOG(4) << "filter_image: " << filter_image; + VLOG(4) << "output_dims: " << output_dims; + VLOG(4) << "out_image_shape: " << out_image_shape["width"] << ", " + << out_image_shape["height"]; + VLOG(4) << "paddings: " << paddings[0] << "," << paddings[1]; + VLOG(4) << "has bias: " << has_bias; + VLOG(4) << "is_element_wise_bias : " << is_element_wise_bias; + VLOG(4) << "strides: " << strides[0] << "," << strides[1]; + VLOG(4) << "dilations.size : " << dilations.size(); + VLOG(4) << "dilations: " << dilations[0] << ", " << dilations[1]; + VLOG(4) << "default work size{c_block, w, nh}: " + << "{" << c_block << ", " << w << ", " << nh << "" + << "}"; +#endif + CHECK_GE(dilations.size(), 2); + CHECK(dilations[0] == dilations[1]); + CHECK_GE(input_dims.size(), 4); + CHECK_GE(paddings.size(), 2); + CHECK(paddings[0] == paddings[1]); + CHECK_GE(strides.size(), 2); + CHECK(strides[0] == strides[1]); + + const 
cl::Image2D* bias_image = nullptr; + if (has_bias) { + bias_image = bias_gpu_image_.data(); + } + + auto& context = ctx_->As(); + CHECK(context.cl_context() != nullptr); + STL::stringstream kernel_key; + kernel_key << kernel_func_names_[0] << build_options_[0]; + auto kernel = context.cl_context()->GetKernel(kernel_key.str()); + +#ifndef LITE_SHUTDOWN_LOG + VLOG(4) << "kernel_key: " << kernel_key.str(); + VLOG(4) << "kernel ready ... " << kernel_key.str(); +#endif + + cl_int status; + int arg_idx = 0; + status = kernel.setArg(arg_idx, c_block); + CL_CHECK_FATAL(status); + status = kernel.setArg(++arg_idx, w_blk); + CL_CHECK_FATAL(status); + status = kernel.setArg(++arg_idx, h_blk); + CL_CHECK_FATAL(status); + status = kernel.setArg(++arg_idx, *input_image); + CL_CHECK_FATAL(status); + status = kernel.setArg(++arg_idx, *filter_image); + CL_CHECK_FATAL(status); + if (has_bias) { + status = kernel.setArg(++arg_idx, *bias_image); + CL_CHECK_FATAL(status); + } + status = kernel.setArg(++arg_idx, *out_image); + CL_CHECK_FATAL(status); + status = kernel.setArg(++arg_idx, strides[0]); + CL_CHECK_FATAL(status); + + status = kernel.setArg(++arg_idx, paddings[0]); + CL_CHECK_FATAL(status); + + status = kernel.setArg(++arg_idx, dilations[0]); + CL_CHECK_FATAL(status); + status = kernel.setArg(++arg_idx, batch); + CL_CHECK_FATAL(status); + status = kernel.setArg(++arg_idx, input_channel); + CL_CHECK_FATAL(status); + status = kernel.setArg(++arg_idx, input_width); + CL_CHECK_FATAL(status); + status = kernel.setArg(++arg_idx, input_height); + CL_CHECK_FATAL(status); + status = kernel.setArg(++arg_idx, output_width); + CL_CHECK_FATAL(status); + status = kernel.setArg(++arg_idx, output_height); + CL_CHECK_FATAL(status); + + auto global_work_size = + cl::NDRange{static_cast(default_work_size.data()[0]), + static_cast(w_blk), + static_cast(h_blk)}; +#ifndef LITE_SHUTDOWN_LOG + VLOG(4) << "global_work_size[3D]: {" << global_work_size[0] << "," + << global_work_size[1] << "," << 
global_work_size[2] << "}"; +#endif + size_t max_work_group_size = 0; + kernel.getWorkGroupInfo(CLRuntime::Global()->device(), + CL_KERNEL_WORK_GROUP_SIZE, + &max_work_group_size); + cl::NDRange local_work_size = cl::NullRange; + if (max_work_group_size > 0 && use_lws) { + local_work_size = context.cl_context()->LocalWorkSize(global_work_size, + max_work_group_size); +#ifndef LITE_SHUTDOWN_LOG + VLOG(4) << "local_work_size[3D]: {" << local_work_size[0] << "," + << local_work_size[1] << "," << local_work_size[2] << "}"; +#endif + } + + status = context.cl_context()->GetCommandQueue().enqueueNDRangeKernel( + kernel, + cl::NullRange, + global_work_size, + local_work_size, + nullptr, + event_.get()); + CL_CHECK_FATAL(status); + context.cl_wait_list()->emplace(out_image, event_); +} void ConvImageCompute::DepthwiseConv2d3x3s1() { const auto& param = *param_.get_mutable(); auto x_dims = param.x->dims(); @@ -1071,7 +1491,9 @@ void ConvImageCompute::DepthwiseConv2d3x3s1() { const cl::Image2D* bias_image = nullptr; if (has_bias) { bias_image = bias_gpu_image_.data(); +#ifndef LITE_SHUTDOWN_LOG VLOG(4) << "set bias_image: "; +#endif status = kernel.setArg(++arg_idx, *bias_image); CL_CHECK_FATAL(status); } @@ -1099,12 +1521,16 @@ void ConvImageCompute::DepthwiseConv2d3x3s1() { CL_KERNEL_WORK_GROUP_SIZE, &max_work_group_size); cl::NDRange local_work_size = cl::NullRange; +#ifndef LITE_SHUTDOWN_LOG VLOG(4) << "max_work_group_size: " << max_work_group_size; +#endif if (max_work_group_size > 0 && use_lws) { local_work_size = context.cl_context()->LocalWorkSize(global_work_size, max_work_group_size); +#ifndef LITE_SHUTDOWN_LOG VLOG(4) << "local_work_size[3D]: {" << local_work_size[0] << "," << local_work_size[1] << "," << local_work_size[2] << "}"; +#endif } status = context.cl_context()->GetCommandQueue().enqueueNDRangeKernel( @@ -1153,6 +1579,7 @@ void ConvImageCompute::DepthwiseConv2d3x3() { int nh = output_dims[0] * output_dims[2]; auto global_work_size = cl::NDRange(c_block, 
w, nh); +#ifndef LITE_SHUTDOWN_LOG VLOG(4) << "setArg"; VLOG(4) << "c_block = " << c_block; VLOG(4) << "w = " << w; @@ -1166,6 +1593,7 @@ void ConvImageCompute::DepthwiseConv2d3x3() { VLOG(4) << "x_dims[2] = " << x_dims[2]; VLOG(4) << "output_dims[3] = " << output_dims[3]; VLOG(4) << "output_dims[2] = " << output_dims[2]; +#endif cl_int status; int arg_idx = 0; @@ -1185,7 +1613,9 @@ void ConvImageCompute::DepthwiseConv2d3x3() { const cl::Image2D* bias_image = nullptr; if (has_bias) { bias_image = bias_gpu_image_.data(); +#ifndef LITE_SHUTDOWN_LOG VLOG(4) << "set bias_image: "; +#endif status = kernel.setArg(++arg_idx, *bias_image); CL_CHECK_FATAL(status); } @@ -1261,6 +1691,7 @@ void ConvImageCompute::DepthwiseConv2d() { int w = default_work_size[1]; int nh = default_work_size[2]; +#ifndef LITE_SHUTDOWN_LOG VLOG(4) << "============ depthwise conv2d params ============"; VLOG(4) << "input_image_shape: " << input_image_shape["width"] << "," << input_image_shape["height"]; @@ -1282,6 +1713,7 @@ void ConvImageCompute::DepthwiseConv2d() { VLOG(4) << "default work size{c_block, w, nh}: " << "{" << c_block << ", " << w << ", " << nh << "" << "}"; +#endif CHECK_GE(dilations.size(), 2); CHECK(dilations[0] == dilations[1]); @@ -1303,9 +1735,12 @@ void ConvImageCompute::DepthwiseConv2d() { STL::stringstream kernel_key; kernel_key << kernel_func_names_[0] << build_options_[0]; auto kernel = context.cl_context()->GetKernel(kernel_key.str()); + +#ifndef LITE_SHUTDOWN_LOG VLOG(4) << "kernel_key: " << kernel_key.str(); VLOG(4) << "kernel ready ... 
" << kernel_key.str(); VLOG(4) << "w: " << w; +#endif cl_int status; int arg_idx = 0; @@ -1320,7 +1755,9 @@ void ConvImageCompute::DepthwiseConv2d() { status = kernel.setArg(++arg_idx, *filter_image); CL_CHECK_FATAL(status); if (has_bias) { +#ifndef LITE_SHUTDOWN_LOG VLOG(4) << "set bias_image: "; +#endif status = kernel.setArg(++arg_idx, *bias_image); CL_CHECK_FATAL(status); } @@ -1354,9 +1791,11 @@ void ConvImageCompute::DepthwiseConv2d() { static_cast(default_work_size.data()[1]), static_cast(default_work_size.data()[2])}; +#ifndef LITE_SHUTDOWN_LOG // VLOG(4) << "out_image: " << out_image; VLOG(4) << "global_work_size[3D]: {" << global_work_size[0] << "," << global_work_size[1] << "," << global_work_size[2] << "}"; +#endif status = context.cl_context()->GetCommandQueue().enqueueNDRangeKernel( kernel, diff --git a/lite/kernels/opencl/conv_image_compute.h b/lite/kernels/opencl/conv_image_compute.h index 3f8db82f4a6b3f7cf0abad3cdac4198fd0b516d5..57e4b91e0a842487fc5dfce4799fab244348772d 100644 --- a/lite/kernels/opencl/conv_image_compute.h +++ b/lite/kernels/opencl/conv_image_compute.h @@ -41,11 +41,13 @@ class ConvImageCompute : public KernelLite(filter_v.data(), filter_dim); - // auto* filter_image2d = filter.mutable_data( // filter_image_width, // filter_image_height, diff --git a/lite/kernels/opencl/elementwise_add_buffer_compute.cc b/lite/kernels/opencl/elementwise_add_buffer_compute.cc index 5dff529fb4fbfec023996b0169e948d597afa78e..3961ac7583917fdcd761614558c493e6917d3294 100644 --- a/lite/kernels/opencl/elementwise_add_buffer_compute.cc +++ b/lite/kernels/opencl/elementwise_add_buffer_compute.cc @@ -41,9 +41,11 @@ void ElementwiseAddCompute::Run() { STL::stringstream kernel_key; kernel_key << kernel_func_name_ << build_options_; auto kernel = context.cl_context()->GetKernel(kernel_key.str()); +#ifndef LITE_SHUTDOWN_LOG VLOG(4) << TargetToStr(ele_param_->X->target()); VLOG(4) << TargetToStr(ele_param_->Y->target()); VLOG(4) << 
TargetToStr(ele_param_->Out->target()); +#endif int arg_idx = 0; cl_int status = kernel.setArg(arg_idx, *x_buf); CL_CHECK_FATAL(status); @@ -87,10 +89,12 @@ void ElementwiseAddCompute::UpdateParams() { for (int i = static_cast(y_dims.size() + axis); i < x_dims.size(); ++i) { num_ *= x_dims[i]; } +#ifndef LITE_SHUTDOWN_LOG VLOG(4) << "axis: " << axis; VLOG(4) << "batch: " << batch_; VLOG(4) << "channels: " << channels_; VLOG(4) << "num: " << num_; +#endif } } // namespace opencl diff --git a/lite/kernels/opencl/elementwise_add_image_compute.cc b/lite/kernels/opencl/elementwise_add_image_compute.cc index e9015ab16044f9346bbaa997e4f47dfbcd9bb023..6d0ebf638f0a8967e27a657131e1cac89967ee0b 100644 --- a/lite/kernels/opencl/elementwise_add_image_compute.cc +++ b/lite/kernels/opencl/elementwise_add_image_compute.cc @@ -62,6 +62,7 @@ void ElementwiseAddImageCompute::Run() { auto* out = ele_param_->Out; auto axis = ele_param_->axis; +#ifndef LITE_SHUTDOWN_LOG VLOG(4) << "x->target():" << TargetToStr(x->target()); VLOG(4) << "y->target():" << TargetToStr(y->target()); VLOG(4) << "out->target():" << TargetToStr(out->target()); @@ -69,6 +70,7 @@ void ElementwiseAddImageCompute::Run() { VLOG(4) << "y->dims():" << y->dims(); VLOG(4) << "out->dims():" << out->dims(); VLOG(4) << "axis:" << axis; +#endif paddle::lite::CLImageConverterDefault default_convertor; auto x_img_shape = default_convertor.InitImageDimInfoWith(x->dims()); // w, h @@ -83,10 +85,12 @@ void ElementwiseAddImageCompute::Run() { auto* out_img = out->mutable_data(out_img_shape[0], out_img_shape[1]); +#ifndef LITE_SHUTDOWN_LOG VLOG(4) << "x_img_shape[w,h]:" << x_img_width << " " << x_img_height; VLOG(4) << "y_img_shape[w,h]:" << y_img_shape[0] << " " << y_img_shape[1]; VLOG(4) << "out_img_shape[w,h]:" << out_img_shape[0] << " " << out_img_shape[1]; +#endif STL::stringstream kernel_key; kernel_key << kernel_func_name_ << build_options_; @@ -104,8 +108,9 @@ void ElementwiseAddImageCompute::Run() { } else if 
(y_dims.size() == 1) { if (axis == x->dims().size() - 1 || axis == x->dims().size() - 3) { int tensor_w = x->dims()[x->dims().size() - 1]; +#ifndef LITE_SHUTDOWN_LOG VLOG(4) << "tensor_w:" << tensor_w; - +#endif cl_int status = kernel.setArg(arg_idx, *x_img); CL_CHECK_FATAL(status); status = kernel.setArg(++arg_idx, *y_img); @@ -127,7 +132,9 @@ void ElementwiseAddImageCompute::Run() { auto global_work_size = cl::NDRange{static_cast(x_img_width), static_cast(x_img_height)}; +#ifndef LITE_SHUTDOWN_LOG VLOG(4) << "global_work_size:[2D]:" << x_img_width << " " << x_img_height; +#endif auto status = context.cl_context()->GetCommandQueue().enqueueNDRangeKernel( kernel, cl::NullRange, diff --git a/lite/kernels/opencl/elementwise_mul_image_compute.cc b/lite/kernels/opencl/elementwise_mul_image_compute.cc index c5e43616f957695aa598b9f383135bf603eb42b4..78a025566f24cb604910eb3766cb05c8647e1e03 100644 --- a/lite/kernels/opencl/elementwise_mul_image_compute.cc +++ b/lite/kernels/opencl/elementwise_mul_image_compute.cc @@ -56,7 +56,7 @@ class ElementwiseMulImageCompute } else { kernel_func_name_ = "channel_mul_d2_hw"; } - } else if (y_dims.size() == 4) { + } else if (y_dims.size() == 4 || x_dims.size() == 4) { kernel_func_name_ = "channel_mul_d4"; } else { LOG(FATAL) << "ElementwiseMul not supported y_dims.size():" @@ -80,12 +80,14 @@ class ElementwiseMulImageCompute auto* y = ele_param_->Y; auto* out = ele_param_->Out; +#ifndef LITE_SHUTDOWN_LOG VLOG(4) << "x->target():" << TargetToStr(x->target()); VLOG(4) << "y->target():" << TargetToStr(y->target()); VLOG(4) << "out->target():" << TargetToStr(out->target()); VLOG(4) << "x->dims():" << x->dims(); VLOG(4) << "y->dims():" << y->dims(); VLOG(4) << "out->dims():" << out->dims(); +#endif paddle::lite::CLImageConverterDefault default_convertor; auto x_img_shape = @@ -101,10 +103,12 @@ class ElementwiseMulImageCompute auto* out_img = out->mutable_data(out_img_shape[0], out_img_shape[1]); +#ifndef LITE_SHUTDOWN_LOG VLOG(4) << 
"x_img_shape[w,h]:" << x_img_width << " " << x_img_height; VLOG(4) << "y_img_shape[w,h]:" << y_img_shape[0] << " " << y_img_shape[1]; VLOG(4) << "out_img_shape[w,h]:" << out_img_shape[0] << " " << out_img_shape[1]; +#endif STL::stringstream kernel_key; kernel_key << kernel_func_name_ << build_options_; @@ -123,7 +127,9 @@ class ElementwiseMulImageCompute CL_CHECK_FATAL(status); } else if (y_dims.size() == 1 || y_dims.size() == 4) { auto tensor_w = x_dims[x_dims.size() - 1]; +#ifndef LITE_SHUTDOWN_LOG VLOG(4) << "tensor_w:" << tensor_w; +#endif // kernel: channel_mul_d1 / channel_mul_d4 cl_int status = kernel.setArg(arg_idx, *x_img); CL_CHECK_FATAL(status); @@ -136,7 +142,9 @@ class ElementwiseMulImageCompute } else if (y_dims.size() == 2) { if (x_dims[0] == y_dims[0] && x_dims[1] == y_dims[1]) { auto tensor_w = x_dims[x_dims.size() - 1]; +#ifndef LITE_SHUTDOWN_LOG VLOG(4) << "tensor_w:" << tensor_w; +#endif // kernel: channel_mul_d2_nc cl_int status = kernel.setArg(arg_idx, *x_img); CL_CHECK_FATAL(status); @@ -149,7 +157,9 @@ class ElementwiseMulImageCompute } else { auto y_tensor_h = y->dims()[0]; auto y_tensor_w = y->dims()[1]; +#ifndef LITE_SHUTDOWN_LOG VLOG(4) << "y_tensor_w:" << y_tensor_w << " y_tensor_h:" << y_tensor_h; +#endif // kernel: channel_mul_d2_hw cl_int status = kernel.setArg(arg_idx, *x_img); CL_CHECK_FATAL(status); @@ -162,6 +172,18 @@ class ElementwiseMulImageCompute status = kernel.setArg(++arg_idx, static_cast(y_tensor_h)); CL_CHECK_FATAL(status); } + } else if (x_dims.size() == 4) { + auto tensor_w = y_dims[y_dims.size() - 1]; + VLOG(4) << "tensor_w:" << tensor_w; + // kernel: channel_mul_d4 + cl_int status = kernel.setArg(arg_idx, *y_img); + CL_CHECK_FATAL(status); + status = kernel.setArg(++arg_idx, *x_img); + CL_CHECK_FATAL(status); + status = kernel.setArg(++arg_idx, *out_img); + CL_CHECK_FATAL(status); + status = kernel.setArg(++arg_idx, static_cast(tensor_w)); + CL_CHECK_FATAL(status); } else { LOG(FATAL) << "ElementwiseMul not 
supported y_dims.size():" << y_dims.size(); @@ -179,8 +201,9 @@ class ElementwiseMulImageCompute event_.get()); CL_CHECK_FATAL(status); context.cl_wait_list()->emplace(out_img, event_); - +#ifndef LITE_SHUTDOWN_LOG VLOG(4) << "global_work_size:[2D]:" << x_img_width << " " << x_img_height; +#endif } protected: diff --git a/lite/kernels/opencl/elementwise_sub_image_compute.cc b/lite/kernels/opencl/elementwise_sub_image_compute.cc index 3a18501dfb38d3a11f432751c6abd51ce1c7a180..0bc867d7f124582660b7a0a9a95d026d910fc2d3 100644 --- a/lite/kernels/opencl/elementwise_sub_image_compute.cc +++ b/lite/kernels/opencl/elementwise_sub_image_compute.cc @@ -62,6 +62,7 @@ void ElementwiseSubImageCompute::Run() { auto* out = ele_param_->Out; auto axis = ele_param_->axis; +#ifndef LITE_SHUTDOWN_LOG VLOG(4) << "x->target():" << TargetToStr(x->target()); VLOG(4) << "y->target():" << TargetToStr(y->target()); VLOG(4) << "out->target():" << TargetToStr(out->target()); @@ -69,6 +70,7 @@ void ElementwiseSubImageCompute::Run() { VLOG(4) << "y->dims():" << y->dims(); VLOG(4) << "out->dims():" << out->dims(); VLOG(4) << "axis:" << axis; +#endif paddle::lite::CLImageConverterDefault default_convertor; auto x_img_shape = default_convertor.InitImageDimInfoWith(x->dims()); // w, h @@ -83,10 +85,12 @@ void ElementwiseSubImageCompute::Run() { auto* out_img = out->mutable_data(out_img_shape[0], out_img_shape[1]); +#ifndef LITE_SHUTDOWN_LOG VLOG(4) << "x_img_shape[w,h]:" << x_img_width << " " << x_img_height; VLOG(4) << "y_img_shape[w,h]:" << y_img_shape[0] << " " << y_img_shape[1]; VLOG(4) << "out_img_shape[w,h]:" << out_img_shape[0] << " " << out_img_shape[1]; +#endif STL::stringstream kernel_key; kernel_key << kernel_func_name_ << build_options_; @@ -104,8 +108,9 @@ void ElementwiseSubImageCompute::Run() { } else if (y_dims.size() == 1) { if (axis == x->dims().size() - 1 || axis == x->dims().size() - 3) { int tensor_w = x->dims()[x->dims().size() - 1]; +#ifndef LITE_SHUTDOWN_LOG VLOG(4) << 
"tensor_w:" << tensor_w; - +#endif cl_int status = kernel.setArg(arg_idx, *x_img); CL_CHECK_FATAL(status); status = kernel.setArg(++arg_idx, *y_img); @@ -127,7 +132,10 @@ void ElementwiseSubImageCompute::Run() { auto global_work_size = cl::NDRange{static_cast(x_img_width), static_cast(x_img_height)}; +#ifndef LITE_SHUTDOWN_LOG VLOG(4) << "global_work_size:[2D]:" << x_img_width << " " << x_img_height; +#endif + auto status = context.cl_context()->GetCommandQueue().enqueueNDRangeKernel( kernel, cl::NullRange, diff --git a/lite/kernels/opencl/grid_sampler_image_compute.cc b/lite/kernels/opencl/grid_sampler_image_compute.cc index e174286ca1fefa3c56bca04b433015ac769cfcbf..243737a81331a7159834d30ccfb2fab181baeebe 100644 --- a/lite/kernels/opencl/grid_sampler_image_compute.cc +++ b/lite/kernels/opencl/grid_sampler_image_compute.cc @@ -57,10 +57,12 @@ class GridSamplerImageCompute : public KernelLitedims(); auto in_dims = x->dims(); +#ifndef LITE_SHUTDOWN_LOG VLOG(4) << "x->target():" << TargetToStr(x->target()); VLOG(4) << "out->target():" << TargetToStr(out->target()); VLOG(4) << "x->dims():" << in_dims; VLOG(4) << "out->dims():" << out_dims; +#endif auto out_image_shape = InitImageDimInfoWith(out_dims); auto* x_img = x->data(); @@ -71,10 +73,11 @@ class GridSamplerImageCompute : public KernelLitemutable_data( out_image_shape["width"], out_image_shape["height"]); +#ifndef LITE_SHUTDOWN_LOG // VLOG(4) << "out_image" << out_img; VLOG(4) << "out_image_shape[w,h]:" << out_image_shape["width"] << " " << out_image_shape["height"]; - +#endif STL::stringstream kernel_key; kernel_key << kernel_func_name_ << build_options_; auto kernel = context.cl_context()->GetKernel(kernel_key.str()); @@ -87,8 +90,10 @@ class GridSamplerImageCompute : public KernelLite{ static_cast(out_image_shape["width"]), static_cast(out_image_shape["height"])})); +#ifndef LITE_SHUTDOWN_LOG VLOG(4) << "default_work_size: " << default_work_size[0] << ", " << default_work_size[1] << ", " << 
default_work_size[2]; +#endif cl_int status = kernel.setArg(arg_idx++, *x_img); CL_CHECK_FATAL(status); status = kernel.setArg(arg_idx++, *grid_img); @@ -114,9 +119,10 @@ class GridSamplerImageCompute : public KernelLiteemplace(out_img, event_); - +#ifndef LITE_SHUTDOWN_LOG VLOG(4) << "global_work_size:[2D]:" << global_work_size[0] << " " << global_work_size[1] << " " << global_work_size[2]; +#endif } protected: diff --git a/lite/backends/opencl/cl_im2col_test.cc b/lite/kernels/opencl/im2col_buffer_test.cc similarity index 100% rename from lite/backends/opencl/cl_im2col_test.cc rename to lite/kernels/opencl/im2col_buffer_test.cc diff --git a/lite/kernels/opencl/instance_norm_image_compute.cc b/lite/kernels/opencl/instance_norm_image_compute.cc index d90acdb02d75958b72d986453d0fe6adacb43c0f..176b4149b2656c6197f43336753bc53d5fb18769 100644 --- a/lite/kernels/opencl/instance_norm_image_compute.cc +++ b/lite/kernels/opencl/instance_norm_image_compute.cc @@ -89,19 +89,23 @@ class InstanceNormImageCompute : public KernelLitetarget():" << TargetToStr(x->target()); VLOG(4) << "out->target():" << TargetToStr(out->target()); VLOG(4) << "x->dims():" << in_dims; +#endif auto out_image_shape = InitImageDimInfoWith(in_dims); auto* x_img = x->data(); - auto* out_img = out->mutable_data( out_image_shape["width"], out_image_shape["height"]); + +#ifndef LITE_SHUTDOWN_LOG VLOG(4) << "out_image_shape[w,h]: " << out_image_shape["width"] << " " << out_image_shape["height"]; VLOG(4) << "in_h: " << in_h << ", in_w: " << in_w; +#endif int threads = 512; int group_size_x = (channel + 3) / 4; @@ -113,10 +117,13 @@ class InstanceNormImageCompute : public KernelLite(group_size_x * threads), static_cast(group_size_y), static_cast(1)}; + +#ifndef LITE_SHUTDOWN_LOG VLOG(4) << "local_work_size:[2D]:" << local_work_size[0] << " " << local_work_size[1] << " " << local_work_size[2]; VLOG(4) << "global_work_size:[2D]:" << global_work_size[0] << " " << global_work_size[1] << " " << global_work_size[2]; 
+#endif STL::stringstream kernel_key; kernel_key << kernel_func_name_ << build_options_; diff --git a/lite/kernels/opencl/io_copy_buffer_compute.cc b/lite/kernels/opencl/io_copy_buffer_compute.cc index 0e9a5941c0a3484ffbb72012f64c07296694078b..6a49cc2577a58690e5e0b6a6ede82df0bdc99bb1 100644 --- a/lite/kernels/opencl/io_copy_buffer_compute.cc +++ b/lite/kernels/opencl/io_copy_buffer_compute.cc @@ -42,11 +42,13 @@ class IoCopyHostToOpenCLCompute CHECK(param.x->target() == TARGET(kHost) || param.x->target() == TARGET(kARM)); auto mem_size = param.x->memory_size(); +#ifndef LITE_SHUTDOWN_LOG VLOG(2) << "param.x->memory_size():" << mem_size; VLOG(2) << "param.x->dims().size():" << param.x->dims().size(); VLOG(2) << "param.x->dims():" << param.x->dims(); VLOG(2) << "param.y->dims().size():" << param.y->dims().size(); VLOG(2) << "param.y->dims():" << param.y->dims(); +#endif auto* data = param.y->mutable_data(TARGET(kOpenCL), mem_size); CopyFromHostSync(data, param.x->raw_data(), mem_size); } @@ -85,12 +87,14 @@ class IoCopykOpenCLToHostCompute CHECK(param.x->target() == TARGET(kOpenCL)); auto mem_size = param.x->memory_size(); +#ifndef LITE_SHUTDOWN_LOG VLOG(2) << "copy size " << mem_size; VLOG(2) << "param.x->dims().size():" << param.x->dims().size(); VLOG(2) << "param.x->dims():" << param.x->dims(); VLOG(2) << "param.y->dims().size():" << param.y->dims().size(); VLOG(2) << "param.y->dims():" << param.y->dims(); VLOG(2) << "param.process_type:" << param.process_type; +#endif auto* data = param.y->mutable_data(TARGET(kHost), mem_size); const cl::Buffer* x_ptr; @@ -104,7 +108,9 @@ class IoCopykOpenCLToHostCompute auto* wait_list = context.cl_wait_list(); auto it = wait_list->find(x_ptr); if (it != wait_list->end()) { +#ifndef LITE_SHUTDOWN_LOG VLOG(2) << "--- Find the sync event for the target cl tensor. 
---"; +#endif auto& event = *(it->second); event.wait(); } else { diff --git a/lite/kernels/opencl/layout_image_compute.cc b/lite/kernels/opencl/layout_image_compute.cc index 9ddaf9c6e5afd549ff950e2f708bc8336bed8f52..22b3533e123bc248b0ec59df593cd51fe0ad1391 100644 --- a/lite/kernels/opencl/layout_image_compute.cc +++ b/lite/kernels/opencl/layout_image_compute.cc @@ -74,6 +74,7 @@ class LayoutComputeBufferChwToImageDefault const int Stride1 = out_H * out_W; const int Stride0 = out_W; +#ifndef LITE_SHUTDOWN_LOG VLOG(2) << "param.process_type:" << param.process_type; VLOG(2) << "x_dims:" << x_dims; VLOG(2) << "param.x->memory_size():" << param.x->memory_size(); @@ -89,6 +90,7 @@ class LayoutComputeBufferChwToImageDefault VLOG(2) << "Stride2:" << Stride2; VLOG(2) << "Stride1:" << Stride1; VLOG(2) << "Stride0:" << Stride0; +#endif auto& context = ctx_->As(); CHECK(context.cl_context() != nullptr); @@ -177,6 +179,7 @@ class LayoutComputeImageDefaultToBufferChw new_dims[4 - x_dims.size() + j] = x_dims[j]; } +#ifndef LITE_SHUTDOWN_LOG VLOG(2) << "param.process_type:" << param.process_type; VLOG(2) << "x_dims:" << x_dims; VLOG(2) << "param.x->memory_size():" << param.x->memory_size(); @@ -186,6 +189,7 @@ class LayoutComputeImageDefaultToBufferChw << new_dims[1] << " " << new_dims[2] << " " << new_dims[3]; VLOG(2) << "y_dims:" << y_dims; VLOG(2) << "param.y->memory_size():" << param.y->memory_size(); +#endif size_t C = new_dims[1]; size_t in_height = new_dims[2]; @@ -217,8 +221,10 @@ class LayoutComputeImageDefaultToBufferChw CL_CHECK_FATAL(status); status = kernel.setArg(++arg_idx, static_cast(C)); CL_CHECK_FATAL(status); +#ifndef LITE_SHUTDOWN_LOG VLOG(2) << "gws:[3D]" << ((new_dims[1] + 3) / 4) << " " << new_dims[3] << " " << (new_dims[0] * new_dims[2]); +#endif auto global_work_size = cl::NDRange{static_cast((new_dims[1] + 3) / 4), static_cast(new_dims[3]), diff --git a/lite/kernels/opencl/lrn_image_compute.cc b/lite/kernels/opencl/lrn_image_compute.cc index 
bb19e044ae4a7b296fbace00797b0c05521c8adb..edce0368ddc9cda54fdab44b472fcd0e771413ae 100644 --- a/lite/kernels/opencl/lrn_image_compute.cc +++ b/lite/kernels/opencl/lrn_image_compute.cc @@ -65,6 +65,7 @@ class LrnImageCompute : public KernelLitedims(); auto in_dims = x->dims(); +#ifndef LITE_SHUTDOWN_LOG VLOG(4) << "x->target(): " << TargetToStr(x->target()); VLOG(4) << "out->target(): " << TargetToStr(out->target()); VLOG(4) << "x->dims(): " << in_dims; @@ -74,6 +75,7 @@ class LrnImageCompute : public KernelLitedata(); @@ -81,9 +83,12 @@ class LrnImageCompute : public KernelLitemutable_data( out_image_shape["width"], out_image_shape["height"]); + +#ifndef LITE_SHUTDOWN_LOG // VLOG(4) << "out_image" << out_img; VLOG(4) << "out_image_shape[w,h]:" << out_image_shape["width"] << " " << out_image_shape["height"]; +#endif STL::stringstream kernel_key; kernel_key << kernel_func_name_ << build_options_; @@ -97,8 +102,10 @@ class LrnImageCompute : public KernelLite{ static_cast(out_image_shape["width"]), static_cast(out_image_shape["height"])})); +#ifndef LITE_SHUTDOWN_LOG VLOG(4) << "default_work_size: " << default_work_size[0] << ", " << default_work_size[1] << ", " << default_work_size[3]; +#endif cl_int status = kernel.setArg(arg_idx++, *x_img); CL_CHECK_FATAL(status); status = kernel.setArg(arg_idx++, *out_img); @@ -130,9 +137,10 @@ class LrnImageCompute : public KernelLiteemplace(out_img, event_); - +#ifndef LITE_SHUTDOWN_LOG VLOG(4) << "global_work_size:[2D]:" << global_work_size[0] << " " << global_work_size[1] << " " << global_work_size[2]; +#endif } protected: diff --git a/lite/kernels/opencl/nearest_interp_image_compute.cc b/lite/kernels/opencl/nearest_interp_image_compute.cc index c34019161000bf25522c061194815e38932ba4d2..082f21ab1ae792ae33e9e2a368073274258b8884 100644 --- a/lite/kernels/opencl/nearest_interp_image_compute.cc +++ b/lite/kernels/opencl/nearest_interp_image_compute.cc @@ -87,6 +87,7 @@ class NearestInterpComputeImageDefault status = 
kernel.setArg(++arg_idx, static_cast(out_dims_w)); CL_CHECK_FATAL(status); +#ifndef LITE_SHUTDOWN_LOG VLOG(4) << TargetToStr(param.X->target()); VLOG(4) << TargetToStr(param.Out->target()); VLOG(4) << "out_image_shape(w,h):" << out_image_shape["width"] << " " @@ -95,6 +96,7 @@ class NearestInterpComputeImageDefault << x_dims[1] << " " << x_dims[2] << " " << x_dims[3]; VLOG(4) << "y_dims[" << y_dims.size() << "D]:" << y_dims[0] << " " << y_dims[1] << " " << y_dims[2] << " " << y_dims[3]; +#endif const std::vector& default_work_size = DefaultWorkSize(y_dims, diff --git a/lite/kernels/opencl/pad2d_image_compute.cc b/lite/kernels/opencl/pad2d_image_compute.cc index 7f4838149d1e2364baf0b1b2286fef4a74ee9a4b..1be4729ee1b24ac77383de4d7c111e9d37d29d6b 100644 --- a/lite/kernels/opencl/pad2d_image_compute.cc +++ b/lite/kernels/opencl/pad2d_image_compute.cc @@ -71,10 +71,12 @@ class Pad2dCompute : public KernelLitetarget():" << TargetToStr(x->target()); VLOG(4) << "out->target():" << TargetToStr(out->target()); VLOG(4) << "x->dims():" << in_dims; VLOG(4) << "out->dims():" << out_dims; +#endif auto out_image_shape = InitImageDimInfoWith(out_dims); auto* x_img = x->data(); @@ -82,11 +84,13 @@ class Pad2dCompute : public KernelLitemutable_data( out_image_shape["width"], out_image_shape["height"]); +#ifndef LITE_SHUTDOWN_LOG VLOG(4) << "out_image_shape[w,h]: " << out_image_shape["width"] << " " << out_image_shape["height"]; VLOG(4) << "in_h: " << in_h << ", in_w: " << in_w; VLOG(4) << "out_h: " << out_h << ", out_w: " << out_w; +#endif STL::stringstream kernel_key; kernel_key << kernel_func_name_ << build_options_; @@ -98,9 +102,10 @@ class Pad2dCompute : public KernelLite{ static_cast(out_image_shape["width"]), static_cast(out_image_shape["height"])})); +#ifndef LITE_SHUTDOWN_LOG VLOG(4) << "default_work_size: " << default_work_size[0] << ", " << default_work_size[1] << ", " << default_work_size[2]; - +#endif int pad_h0 = pad2d_param_->paddings[0]; int pad_h1 = 
pad2d_param_->paddings[1]; int pad_w0 = pad2d_param_->paddings[2]; @@ -144,9 +149,10 @@ class Pad2dCompute : public KernelLiteemplace(out_img, event_); - +#ifndef LITE_SHUTDOWN_LOG VLOG(4) << "global_work_size:[2D]:" << global_work_size[0] << " " << global_work_size[1] << " " << global_work_size[2]; +#endif } protected: diff --git a/lite/kernels/opencl/pad2d_image_compute_test.cc b/lite/kernels/opencl/pad2d_image_compute_test.cc index d1e1e3bb4c8fc80fabacff52b66f20387dd7766f..c2371d07f31caf569cfe4b299bf2f88373eb3b9f 100644 --- a/lite/kernels/opencl/pad2d_image_compute_test.cc +++ b/lite/kernels/opencl/pad2d_image_compute_test.cc @@ -89,7 +89,7 @@ void pad2d_ref(const float *x_data, } } -#define LOOP_TEST +// #define LOOP_TEST // #define PRINT_RESULT TEST(pad2d_image2d, compute) { LOG(INFO) << "main steps of test: host -> layout(buf2img) -> " diff --git a/lite/kernels/opencl/pool_image_compute.cc b/lite/kernels/opencl/pool_image_compute.cc index c2a8f7c7cf87ba709beb5f30a0149dc2cd92d11b..39da325ebb10c85f153e349173aa833bbf5e1f6e 100644 --- a/lite/kernels/opencl/pool_image_compute.cc +++ b/lite/kernels/opencl/pool_image_compute.cc @@ -59,10 +59,14 @@ class PoolComputeImage2D : public KernelLite paddings = *param.paddings; std::vector strides = param.strides; std::vector ksize = param.ksize; + +#ifndef LITE_SHUTDOWN_LOG VLOG(4) << "global_pooling: " << global_pooling; VLOG(4) << "pooling_type: " << pooling_type; VLOG(4) << "paddings : " << paddings[0] << " " << paddings[1] << " " << paddings[2] << " " << paddings[3] << " "; +#endif + if (global_pooling) { for (size_t i = 0; i < ksize.size(); ++i) { paddings[2 * i] = 0; @@ -70,6 +74,8 @@ class PoolComputeImage2D : public KernelLite(in_dims[i + 2]); } } + +#ifndef LITE_SHUTDOWN_LOG VLOG(4) << "in_dims : [" << in_dims.size() << "]" << in_dims[0] << " " << in_dims[1] << " " << in_dims[2] << " " << in_dims[3]; VLOG(4) << "out_dims : [" << out_dims.size() << "]" << out_dims[0] << " " @@ -82,6 +88,8 @@ class PoolComputeImage2D 
: public KernelLitemutable_data( out_image_shape["width"], out_image_shape["height"]); // VLOG(4) << "out_image" << out_img; @@ -109,8 +119,10 @@ class PoolComputeImage2D : public KernelLite(); const Tensor* const x = param.x; @@ -64,8 +62,9 @@ class ReshapeComputeFloatImage : public KernelLitemutable_data( out_image_shape.at("width"), out_image_shape.at("height")); +#ifndef LITE_SHUTDOWN_LOG VLOG(4) << "out_dims= " << out_dims; - +#endif const std::vector& default_work_size = DefaultWorkSize( out_dims, DDim(std::vector{ @@ -94,6 +93,8 @@ class ReshapeComputeFloatImage : public KernelLitemutable_data( out_image_shape["width"], out_image_shape["height"]); // LOG(INFO) << "out_image" << out_img; diff --git a/lite/kernels/xpu/bridges/graph.cc b/lite/kernels/xpu/bridges/graph.cc index 43aaad3402b7873dbaa67d4c4897b5378e098500..4af8a2bd3464efaaec6937996445736068f0f656 100644 --- a/lite/kernels/xpu/bridges/graph.cc +++ b/lite/kernels/xpu/bridges/graph.cc @@ -49,7 +49,7 @@ std::shared_ptr Graph::Add(const std::string& name, CHECK_GE(idx, 1); node->set_data(std::make_shared(layer)); // Generate a unique name for the current XTCL layer - builder_.SetLayer(name + "__" + std::to_string(idx)); + builder_.SetLayer(name + "__" + paddle::lite::to_string(idx)); return node; } diff --git a/lite/model_parser/naive_buffer/naive_buffer_test.cc b/lite/model_parser/naive_buffer/naive_buffer_test.cc index 8b6ffb4dcf481bbb8df92e7e15c1d569d575bcae..98789e8006817fceb4745bffd0c095da7ad360fc 100644 --- a/lite/model_parser/naive_buffer/naive_buffer_test.cc +++ b/lite/model_parser/naive_buffer/naive_buffer_test.cc @@ -155,7 +155,7 @@ TEST(ListBuilder, basic) { for (int i = 0; i < num_elems; i++) { auto* elem = li.New(); - elem->set("elem-" + std::to_string(i)); + elem->set("elem-" + paddle::lite::to_string(i)); } li.Save(); table.SaveToFile("2.bf"); @@ -169,7 +169,7 @@ TEST(ListBuilder, basic) { li1.Load(); for (int i = 0; i < num_elems; i++) { - ASSERT_EQ(li1.Get(i).data(), "elem-" + 
std::to_string(i)); + ASSERT_EQ(li1.Get(i).data(), "elem-" + paddle::lite::to_string(i)); } } diff --git a/lite/operators/CMakeLists.txt b/lite/operators/CMakeLists.txt index 34c7b8d6669b4eddfa6fecaa67cf4523b5c36566..48e27560317c089446e8dbc5040786f34ca962c4 100644 --- a/lite/operators/CMakeLists.txt +++ b/lite/operators/CMakeLists.txt @@ -144,6 +144,8 @@ add_operator(mean_op extra SRCS mean_op.cc DEPS ${op_DEPS}) if (LITE_WITH_TRAIN) add_operator(mean_grad_op extra SRCS mean_grad_op.cc DEPS ${op_DEPS}) add_operator(activation_grad_ops basic SRCS activation_grad_ops.cc DEPS ${op_DEPS}) + add_operator(elementwise_grad_op extra SRCS elementwise_grad_ops.cc DEPS ${op_DEPS}) + add_operator(mul_grad_op basic SRCS mul_grad_op.cc DEPS ${op_DEPS}) add_operator(sgd_op extra SRCS sgd_op.cc DEPS ${op_DEPS}) endif() diff --git a/lite/operators/conv_op.cc b/lite/operators/conv_op.cc index 9ae52d1cb6a406dc8d1059ad97f3757dbc0a31fa..70ad3a32a83003e449524205a71dcc7536b9a11e 100644 --- a/lite/operators/conv_op.cc +++ b/lite/operators/conv_op.cc @@ -80,6 +80,34 @@ void UpdatePaddingAndDilation(std::vector* paddings, } } +bool ConvOpLite::SmartInferShape() { + if (!last_input_shapes.empty()) { + if (last_input_shapes[0] == param_.x->dims() && + last_input_lods[0] == param_.x->lod()) { + param_.output->Resize(last_output_shapes[0]); + param_.output->set_lod(last_output_lods[0]); + return true; + } + } + + this->InferShape(); + + if (!last_input_shapes.empty()) { + last_input_shapes.clear(); + last_input_lods.clear(); + } + last_input_shapes.push_back(param_.x->dims()); + last_input_lods.push_back(param_.x->lod()); + + if (!last_output_shapes.empty()) { + last_output_shapes.clear(); + last_output_lods.clear(); + } + last_output_shapes.push_back(param_.output->dims()); + last_output_lods.push_back(param_.output->lod()); + + return true; +} bool ConvOpLite::InferShape() const { const auto in_dims = param_.x->dims(); const auto filter_dims = param_.filter->dims(); @@ -104,9 +132,9 @@ bool 
ConvOpLite::InferShape() const { // Set output dims param_.output->Resize(lite::DDim(output_shape)); - // share LoD - // param_.output->set_lod(param_.x->lod()); + param_.output->set_lod(param_.x->lod()); + return true; } diff --git a/lite/operators/conv_op.h b/lite/operators/conv_op.h index 63107022f1ef69a21d37373c4a257625f8b0f5e3..3379fb409529e261f4af38ef2ee3483f17cc8a3b 100644 --- a/lite/operators/conv_op.h +++ b/lite/operators/conv_op.h @@ -36,6 +36,7 @@ class ConvOpLite : public OpLite { bool CheckShape() const override; bool InferShape() const override; + bool SmartInferShape() override; // TODO(Superjomn) replace framework::OpDesc with a lite one. bool AttachImpl(const cpp::OpDesc& op_desc, lite::Scope* scope) override { diff --git a/lite/operators/elementwise_grad_ops.cc b/lite/operators/elementwise_grad_ops.cc new file mode 100644 index 0000000000000000000000000000000000000000..9d964bf9e36889f2bc72b2656d23bf4022cc121c --- /dev/null +++ b/lite/operators/elementwise_grad_ops.cc @@ -0,0 +1,77 @@ +// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "lite/operators/elementwise_grad_ops.h" +#include +#include +#include "lite/core/op_registry.h" +namespace paddle { +namespace lite { +namespace operators { + +bool ElementwiseGradOp::CheckShape() const { + CHECK_OR_FALSE(param_.XGrad || param_.YGrad); + CHECK_OR_FALSE(param_.OutGrad); + return true; +} + +bool ElementwiseGradOp::InferShape() const { + auto x_dim = param_.X->dims(); + auto y_dim = param_.Y->dims(); + if (param_.XGrad) { + param_.XGrad->Resize(x_dim); + } + if (param_.YGrad) { + param_.YGrad->Resize(y_dim); + } + return true; +} + +bool ElementwiseGradOp::AttachImpl(const cpp::OpDesc& opdesc, + lite::Scope* scope) { + auto Y_name = opdesc.Input("Y").front(); + auto X_name = opdesc.Input("X").front(); + auto Out_name = opdesc.Input("Out@GRAD").front(); + CHECK(!opdesc.Output("X@GRAD").empty() || !opdesc.Output("Y@GRAD").empty()) + << "at least one of 'X@GRAD' and 'Y@GRAD' is not empty"; + + if (!opdesc.Output("X@GRAD").empty()) { + auto x_grad_name = opdesc.Output("X@GRAD").front(); + param_.XGrad = GetMutableVar(scope, x_grad_name); + } + if (!opdesc.Output("Y@GRAD").empty()) { + auto y_grad_name = opdesc.Output("Y@GRAD").front(); + param_.YGrad = GetMutableVar(scope, y_grad_name); + } + + param_.X = GetVar(scope, X_name); + param_.Y = GetVar(scope, Y_name); + param_.OutGrad = GetVar(scope, Out_name); + param_.axis = opdesc.GetAttr("axis"); + return true; +} + +} // namespace operators +} // namespace lite +} // namespace paddle + +REGISTER_LITE_OP(elementwise_sub_grad, + paddle::lite::operators::ElementwiseGradOp); +REGISTER_LITE_OP(elementwise_add_grad, + paddle::lite::operators::ElementwiseGradOp); + +REGISTER_LITE_OP(elementwise_grad_mul, + paddle::lite::operators::ElementwiseGradOp); +REGISTER_LITE_OP(elementwise_grad_max, + paddle::lite::operators::ElementwiseGradOp); diff --git a/lite/operators/elementwise_grad_ops.h b/lite/operators/elementwise_grad_ops.h new file mode 100644 index 
0000000000000000000000000000000000000000..c45d581936207f0b37ee70a0505b912d0b509e35 --- /dev/null +++ b/lite/operators/elementwise_grad_ops.h @@ -0,0 +1,44 @@ +// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once +#include +#include +#include "lite/core/op_lite.h" + +namespace paddle { +namespace lite { +namespace operators { + +class ElementwiseGradOp : public OpLite { + public: + explicit ElementwiseGradOp(const std::string& op_type) : OpLite(op_type) {} + + bool CheckShape() const override; + + bool InferShape() const override; + + bool AttachImpl(const cpp::OpDesc& opdesc, lite::Scope* scope) override; + + void AttachKernel(KernelBase* kernel) override { kernel->SetParam(param_); } + + std::string DebugString() const override { return "elementwise_grad_op"; } + + private: + mutable operators::ElementwiseGradParam param_; +}; + +} // namespace operators +} // namespace lite +} // namespace paddle diff --git a/lite/operators/elementwise_ops.cc b/lite/operators/elementwise_ops.cc index 3dc6f06955d421bc1f25994139cfee5dee9bc472..044126b3c22fa853d4908c06c307f32278fa5b9b 100644 --- a/lite/operators/elementwise_ops.cc +++ b/lite/operators/elementwise_ops.cc @@ -26,7 +26,38 @@ bool ElementwiseOp::CheckShape() const { CHECK_OR_FALSE(param_.Out); return true; } +bool ElementwiseOp::SmartInferShape() { + if (!last_input_shapes.empty()) { + if (last_input_shapes[0] == param_.X->dims() && + 
last_input_shapes[1] == param_.Y->dims() && + last_input_lods[0] == param_.X->lod() && + last_input_lods[1] == param_.Y->lod()) { + param_.Out->Resize(last_output_shapes[0]); + param_.Out->set_lod(last_output_lods[0]); + return true; + } + } + + this->InferShape(); + + if (!last_input_shapes.empty()) { + last_input_shapes.clear(); + last_input_lods.clear(); + } + last_input_shapes.push_back(param_.X->dims()); + last_input_lods.push_back(param_.X->lod()); + last_input_shapes.push_back(param_.Y->dims()); + last_input_lods.push_back(param_.Y->lod()); + + if (!last_output_shapes.empty()) { + last_output_shapes.clear(); + last_output_lods.clear(); + } + last_output_shapes.push_back(param_.Out->dims()); + last_output_lods.push_back(param_.Out->lod()); + return true; +} bool ElementwiseOp::InferShape() const { auto x_dim = param_.X->dims(); auto y_dim = param_.Y->dims(); @@ -81,6 +112,7 @@ bool ElementwiseOp::InferShape() const { auto out_lod = param_.Out->mutable_lod(); *out_lod = param_.X->lod(); } + return true; } diff --git a/lite/operators/elementwise_ops.h b/lite/operators/elementwise_ops.h index d888e3d1c14b5d3129e01d12c75e1f590c17f297..9d6e5781b9754eb22be11da0d7f77b764eb25912 100644 --- a/lite/operators/elementwise_ops.h +++ b/lite/operators/elementwise_ops.h @@ -28,6 +28,7 @@ class ElementwiseOp : public OpLite { bool CheckShape() const override; bool InferShape() const override; + bool SmartInferShape() override; bool AttachImpl(const cpp::OpDesc& opdesc, lite::Scope* scope) override; diff --git a/lite/operators/fc_op.cc b/lite/operators/fc_op.cc index eff9300fea4caf412186bfc8d0ad136686507be5..345fc0d605ccd68e3a6ef72429e20400a772568c 100644 --- a/lite/operators/fc_op.cc +++ b/lite/operators/fc_op.cc @@ -48,6 +48,33 @@ bool FcOpLite::CheckShape() const { return true; } +bool FcOpLite::SmartInferShape() { + if (!last_input_shapes.empty() && !last_output_shapes.empty()) { + if (last_input_shapes[0] == param_.input->dims() && + last_input_lods[0] == 
param_.input->lod()) { + param_.output->Resize(last_output_shapes[0]); + param_.output->set_lod(last_output_lods[0]); + return true; + } + } + + this->InferShape(); + + if (!last_input_shapes.empty()) { + last_input_shapes.clear(); + last_input_lods.clear(); + } + last_input_shapes.push_back(param_.input->dims()); + last_input_lods.push_back(param_.input->lod()); + if (!last_output_shapes.empty()) { + last_output_shapes.clear(); + last_output_lods.clear(); + } + last_output_shapes.push_back(param_.output->dims()); + last_output_lods.push_back(param_.output->lod()); + + return true; +} bool FcOpLite::InferShape() const { const auto& input_dims = param_.input->dims(); const auto& w_dims = param_.w->dims(); @@ -64,6 +91,7 @@ bool FcOpLite::InferShape() const { // share LoD param_.output->set_lod(param_.input->lod()); + return true; } diff --git a/lite/operators/fc_op.h b/lite/operators/fc_op.h index ec449cd4bdc33f191c33fc04f215ad672b283215..f5dc302e27a220ee1f1e0679cbb3c2ed257747dd 100644 --- a/lite/operators/fc_op.h +++ b/lite/operators/fc_op.h @@ -36,6 +36,7 @@ class FcOpLite : public OpLite { bool CheckShape() const override; bool InferShape() const override; + bool SmartInferShape() override; bool AttachImpl(const cpp::OpDesc &op_desc, lite::Scope *scope) override; diff --git a/lite/operators/mul_grad_op.cc b/lite/operators/mul_grad_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..8215521637cbc29a4bdcc4b735b9658fc4cc4840 --- /dev/null +++ b/lite/operators/mul_grad_op.cc @@ -0,0 +1,101 @@ +// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "lite/operators/mul_grad_op.h" +#include "lite/core/op_registry.h" +#include "lite/core/type_system.h" + +namespace paddle { +namespace lite { +namespace operators { + +bool MulGradOpLite::CheckShape() const { + CHECK_OR_FALSE(param_.x); + CHECK_OR_FALSE(param_.y); + CHECK_OR_FALSE(param_.output_grad); + CHECK_OR_FALSE(param_.x_grad || param_.y_grad); + CHECK_OR_FALSE(param_.x_num_col_dims); + CHECK_OR_FALSE(param_.y_num_col_dims); + + const auto x_dims = param_.x->dims(); + const auto y_dims = param_.y->dims(); + const auto out_dims = param_.output_grad->dims(); + + CHECK_GT_OR_FALSE(x_dims.size(), static_cast(param_.x_num_col_dims)); + CHECK_GT_OR_FALSE(y_dims.size(), static_cast(param_.y_num_col_dims)); + + auto x_flatten_dims = flatten_2d(x_dims, param_.x_num_col_dims); + auto y_flatten_dims = flatten_2d(y_dims, param_.y_num_col_dims); + auto out_flatten_dims = flatten_2d(out_dims, param_.x_num_col_dims); + + // Out = X * Y; + CHECK_EQ_OR_FALSE(x_flatten_dims[1], y_flatten_dims[0]); + CHECK_EQ_OR_FALSE(x_flatten_dims[0], out_flatten_dims[0]); + CHECK_EQ_OR_FALSE(y_flatten_dims[1], out_flatten_dims[1]); + return true; +} + +bool MulGradOpLite::InferShape() const { + const auto x_dims = param_.x->dims(); + const auto y_dims = param_.y->dims(); + if (param_.x_grad) { + param_.x_grad->Resize(x_dims); + param_.x_grad->set_lod(param_.x->lod()); + } + if (param_.y_grad) { + param_.y_grad->Resize(y_dims); + param_.y_grad->set_lod(param_.y->lod()); + } +} + +bool MulGradOpLite::AttachImpl(const cpp::OpDesc &op_desc, lite::Scope *scope) { + 
CHECK(!op_desc.Input("X").empty()); + CHECK(!op_desc.Input("Y").empty()); + CHECK(!op_desc.Input("Out@GRAD").empty()); + CHECK(!op_desc.Output("X@GRAD").empty() || !op_desc.Output("Y@GRAD").empty()) + << "at least one of 'X@GRAD' and 'Y@GRAD' is not empty"; + + auto *x_var = scope->FindVar(op_desc.Input("X").front()); + CHECK(x_var); + param_.x = &x_var->Get(); + + auto *y_var = scope->FindVar(op_desc.Input("Y").front()); + CHECK(y_var); + param_.y = &y_var->Get(); + + auto *out_grad_var = scope->FindVar(op_desc.Input("Out@GRAD").front()); + CHECK(out_grad_var); + param_.output_grad = &out_grad_var->Get(); + + if (!op_desc.Output("X@GRAD").empty()) { + auto *x_grad_var = scope->FindVar(op_desc.Output("X@GRAD").front()); + CHECK(x_grad_var); + param_.x_grad = x_grad_var->GetMutable(); + } + + if (!op_desc.Output("Y@GRAD").empty()) { + auto *y_grad_var = scope->FindVar(op_desc.Output("Y@GRAD").front()); + CHECK(y_grad_var); + param_.y_grad = y_grad_var->GetMutable(); + } + param_.x_num_col_dims = op_desc.GetAttr("x_num_col_dims"); + param_.y_num_col_dims = op_desc.GetAttr("y_num_col_dims"); + return true; +} + +} // namespace operators +} // namespace lite +} // namespace paddle + +REGISTER_LITE_OP(mul_grad, paddle::lite::operators::MulGradOpLite); diff --git a/lite/operators/mul_grad_op.h b/lite/operators/mul_grad_op.h new file mode 100644 index 0000000000000000000000000000000000000000..ef61f54f9b88cd691ab98c4d8904b848dcea66b5 --- /dev/null +++ b/lite/operators/mul_grad_op.h @@ -0,0 +1,62 @@ +// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once +#include +#include +#include "lite/core/kernel.h" +#include "lite/core/op_lite.h" +#include "lite/core/scope.h" +#include "lite/operators/op_params.h" +#include "lite/utils/all.h" + +namespace paddle { +namespace lite { +namespace operators { + +class MulGradOpLite : public OpLite { + public: + MulGradOpLite() {} + + explicit MulGradOpLite(const std::string &type) : OpLite(type) {} + + bool CheckShape() const override; + + bool InferShape() const override; + + void AttachKernel(KernelBase *kernel) override { kernel->SetParam(param_); } + + bool AttachImpl(const cpp::OpDesc &op_desc, lite::Scope *scope) override; + + std::string DebugString() const override { return "mul_grad"; } + + private: + mutable MulGradParam param_; +}; + +std::vector flatten_2d(DDim dims, int num_col_dims) { + std::vector flatten_dims{1, 1}; + for (int i = 0; i < dims.size(); i++) { + if (i < num_col_dims) { + flatten_dims[0] *= dims[i]; + } else { + flatten_dims[1] *= dims[i]; + } + } + return flatten_dims; +} + +} // namespace operators +} // namespace lite +} // namespace paddle diff --git a/lite/operators/mul_op.h b/lite/operators/mul_op.h index e53168e00e0e541e6b317e1633a8afbf33018d6e..caf7bf6ae902ac4e4f22d4a9aadfa108fa7622da 100644 --- a/lite/operators/mul_op.h +++ b/lite/operators/mul_op.h @@ -66,28 +66,6 @@ class MulOpLite : public OpLite { mutable MulParam param_; }; -#ifdef LITE_WITH_TRAIN -class MulGradOpLite : public OpLite { - public: - MulGradOpLite() {} - - explicit MulGradOpLite(const std::string &type) : OpLite(type) {} - - bool CheckShape() 
const override; - - bool InferShape() const override; - - void AttachKernel(KernelBase *kernel) override { kernel->SetParam(param_); } - - bool AttachImpl(const cpp::OpDesc &op_desc, lite::Scope *scope) override; - - std::string DebugString() const override { return "mul_grad"; } - - private: - mutable MulGradParam param_; -}; -#endif - } // namespace operators } // namespace lite } // namespace paddle diff --git a/lite/operators/op_params.h b/lite/operators/op_params.h index 6d18f1bf348530fc111499ca7cbb89e9bec88d9d..36d3b42c6b315a3858f475bd5756579137528051 100644 --- a/lite/operators/op_params.h +++ b/lite/operators/op_params.h @@ -387,10 +387,11 @@ struct ElementwiseParam { }; struct ElementwiseGradParam { + const lite::Tensor* X{}; const lite::Tensor* Y{}; - const lite::Tensor* Out_grad{}; - lite::Tensor* X_grad{}; - lite::Tensor* Y_grad{}; + const lite::Tensor* OutGrad{}; + lite::Tensor* XGrad{}; + lite::Tensor* YGrad{}; int axis{-1}; // for broadcasting. }; diff --git a/lite/operators/softmax_op.cc b/lite/operators/softmax_op.cc index 1e89fc1a2af407ebbe11f207bd33a1dabb811dc0..0989c9139763a435d67deb21a2ab233e1c2f3bd9 100644 --- a/lite/operators/softmax_op.cc +++ b/lite/operators/softmax_op.cc @@ -29,10 +29,39 @@ bool SoftmaxOp::CheckShape() const { return true; } +bool SoftmaxOp::SmartInferShape() { + if (!last_input_shapes.empty() && !last_output_shapes.empty()) { + if (param_.x->dims() == last_input_shapes[0] && + param_.x->lod() == last_input_lods[0]) { + param_.output->Resize(last_output_shapes[0]); + param_.output->set_lod(last_output_lods[0]); + return true; + } + } + + this->InferShape(); + + if (!last_input_shapes.empty()) { + last_input_shapes.clear(); + last_input_lods.clear(); + } + last_input_shapes.push_back(param_.x->dims()); + last_input_lods.push_back(param_.x->lod()); + + if (!last_output_shapes.empty()) { + last_output_shapes.clear(); + last_output_lods.clear(); + } + last_output_shapes.push_back(param_.output->dims()); + 
last_output_lods.push_back(param_.output->lod()); + return true; +} + bool SoftmaxOp::InferShape() const { param_.output->Resize(param_.x->dims()); auto out_lod = param_.output->mutable_lod(); *out_lod = param_.x->lod(); + return true; } diff --git a/lite/operators/softmax_op.h b/lite/operators/softmax_op.h index bb24acad344f02fe3677484fd2c4c31326683a13..c65d039fda02c5396eff829bede3b4ffdeac0051 100644 --- a/lite/operators/softmax_op.h +++ b/lite/operators/softmax_op.h @@ -31,6 +31,7 @@ class SoftmaxOp : public OpLite { bool CheckShape() const override; bool InferShape() const override; + bool SmartInferShape() override; bool AttachImpl(const cpp::OpDesc &opdesc, lite::Scope *scope) override; diff --git a/lite/tests/kernels/CMakeLists.txt b/lite/tests/kernels/CMakeLists.txt index 41e7c619489cdb974b238f6584032cc778f9e919..f4afe9ee3c3c0f9b325ac55a0c2c6a6454617e57 100644 --- a/lite/tests/kernels/CMakeLists.txt +++ b/lite/tests/kernels/CMakeLists.txt @@ -65,6 +65,8 @@ if(LITE_BUILD_EXTRA) if (LITE_WITH_TRAIN) lite_cc_test(test_kernel_mean_compute SRCS mean_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) lite_cc_test(test_kernel_activation_grad_compute SRCS activation_grad_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) + lite_cc_test(test_kernel_elementwise_grad_compute SRCS elementwise_grad_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) + lite_cc_test(test_kernel_mul_grad_compute SRCS mul_grad_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) lite_cc_test(test_kernel_sgd_compute SRCS sgd_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${x86_kernels} ${cuda_kernels} ${arm_kernels} 
${lite_ops} ${host_kernels}) endif() diff --git a/lite/tests/kernels/concat_compute_test.cc b/lite/tests/kernels/concat_compute_test.cc index 3e30035f1011405ad9beffefd0df91132747a609..18e4701bdf3e99fbb6f76ed9ac78bbbbfda60a1c 100644 --- a/lite/tests/kernels/concat_compute_test.cc +++ b/lite/tests/kernels/concat_compute_test.cc @@ -128,7 +128,7 @@ class ConcateComputeTester : public arena::TestCase { for (int i = 0; i < x_dims_.production(); i++) { x_data[i] = static_cast(i + n); } - const std::string x_name = "x_tensor_" + std::to_string(n); + const std::string x_name = "x_tensor_" + paddle::lite::to_string(n); x_vct_.push_back(x_name); SetCommonTensor(x_name, x_dims_, x_data.data()); } diff --git a/lite/tests/kernels/elementwise_grad_compute_test.cc b/lite/tests/kernels/elementwise_grad_compute_test.cc new file mode 100644 index 0000000000000000000000000000000000000000..2b5fbbb65d3d7e17bf90afb71f5c8154f0d88488 --- /dev/null +++ b/lite/tests/kernels/elementwise_grad_compute_test.cc @@ -0,0 +1,541 @@ +// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "lite/kernels/arm/elementwise_grad_compute.h" +#include +#include "lite/core/op_registry.h" +#include "lite/kernels/arm/elementwise_compute.h" +#include "lite/tests/utils/fill_data.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace arm { + +using param_t = operators::ElementwiseParam; +using grad_param_t = operators::ElementwiseGradParam; +using kernel_add_t = ElementwiseAddCompute; +using grad_kernel_add_t = ElementwiseAddGradCompute; +using kernel_sub_t = ElementwiseSubCompute; +using grad_kernel_sub_t = ElementwiseSubGradCompute; + +void elementwise_common(grad_param_t& param, // NOLINT + std::vector& out_grad, // NOLINT + std::vector& x_grad, // NOLINT + std::vector& y_grad, // NOLINT + std::string flag) { + auto x_dims = param.X->dims(); + auto y_dims = param.Y->dims(); + if (x_dims == y_dims) { + for (int i = 0; i < x_dims.production(); ++i) { + if (flag == "add") { + x_grad[i] = out_grad[i]; + y_grad[i] = out_grad[i]; + } + if (flag == "sub") { + x_grad[i] = out_grad[i]; + y_grad[i] = -out_grad[i]; + } + } + } else { + LOG(FATAL) << "unsupport dims"; + } +} + +class ElementwiseAddGradTester { + public: + explicit ElementwiseAddGradTester(const DDim& x_dims, + const DDim& y_dims, + int axis) + : x_dims_(x_dims), y_dims_(y_dims), axis_(axis) {} + + void prepare_kernel() { + std::unique_ptr ctx1(new KernelContext); + ctx1->As(); + kernel_.SetContext(std::move(ctx1)); + + std::unique_ptr ctx3(new KernelContext); + ctx3->As(); + grad_kernel_.SetContext(std::move(ctx3)); + } + + void run_forward(param_t* param, + kernel_add_t* kernel, + const std::vector& x_vec, + const std::vector& y_vec, + float* out_vec) { + Tensor x; + Tensor y; + Tensor output; + x.Resize(x_dims_); + y.Resize(y_dims_); + output.Resize(DDim(out_dims_)); + auto* x_data = x.mutable_data(); + auto* y_data = y.mutable_data(); + for (int i = 0; i < x_dims_.production(); i++) { + x_data[i] = x_vec[i]; + } + for (int i = 0; i < y_dims_.production(); i++) { + 
y_data[i] = y_vec[i]; + } + + param->X = &x; + param->Y = &y; + param->Out = &output; + param->axis = axis_; + kernel->SetParam(*param); + kernel->Launch(); + + auto* output_data = output.mutable_data(); + for (int i = 0; i < out_dims_.production(); i++) { + out_vec[i] = output_data[i]; + } + } + + void run_backward(grad_param_t* param, + grad_kernel_add_t* kernel, + const std::vector& x_vec, + const std::vector& y_vec, + const std::vector& out_grad_vec, + float* x_grad_vec, + float* y_grad_vec) { + Tensor x; + Tensor x_grad; + Tensor y; + Tensor y_grad; + Tensor out_grad; + x.Resize(x_dims_); + x_grad.Resize(x_dims_); + y.Resize(y_dims_); + y_grad.Resize(y_dims_); + out_grad.Resize(out_dims_); + auto* x_data = x.mutable_data(); + auto* y_data = y.mutable_data(); + auto* out_grad_data = out_grad.mutable_data(); + for (int i = 0; i < x_dims_.production(); i++) { + x_data[i] = x_vec[i]; + } + for (int i = 0; i < y_dims_.production(); i++) { + y_data[i] = y_vec[i]; + } + for (int i = 0; i < out_dims_.production(); i++) { + out_grad_data[i] = out_grad_vec[i]; + } + + param->X = &x; + param->XGrad = &x_grad; + param->Y = &y; + param->YGrad = &y_grad; + param->OutGrad = &out_grad; + param->axis = axis_; + + kernel->SetParam(*param); + kernel->Launch(); + + auto* x_grad_data = x_grad.mutable_data(); + auto* y_grad_data = y_grad.mutable_data(); + for (int i = 0; i < x_dims_.production(); i++) { + x_grad_vec[i] = x_grad_data[i]; + } + for (int i = 0; i < y_dims_.production(); i++) { + y_grad_vec[i] = y_grad_data[i]; + } + } + + void check_grad(float delta2, float max_grad_delta2) { + std::vector out_shape; + // infer shape + auto x_dim = x_dims_; + auto y_dim = y_dims_; + if (x_dim == y_dim) { + out_dims_ = x_dim; + } else { + int max_dim = (x_dim.size() > y_dim.size() ? x_dim.size() : y_dim.size()); + int axis = param_.axis; + axis = + (axis == -1 ? 
std::abs(static_cast(x_dim.size() - y_dim.size())) + : axis); + std::vector x_dims_array(max_dim); + std::vector y_dims_array(max_dim); + std::vector out_dims_array(max_dim); + + if (x_dim.size() > y_dim.size()) { + for (int i = 0; i < axis; ++i) { + y_dims_array[i] = 1; + } + if (axis + y_dim.size() < max_dim) { + for (int i = axis + y_dim.size(); i < max_dim; ++i) { + y_dims_array[i] = 1; + } + } + x_dims_array = x_dim.Vectorize(); + for (int i = 0; i < y_dim.size(); ++i) { + y_dims_array[i + axis] = y_dim[i]; + } + } else { + for (int i = 0; i < axis; ++i) { + x_dims_array[i] = 1; + } + if (axis + x_dim.size() < max_dim) { + for (int i = axis + x_dim.size(); i < max_dim; ++i) { + x_dims_array[i] = 1; + } + } + y_dims_array = y_dim.Vectorize(); + for (int i = 0; i < x_dim.size(); ++i) { + x_dims_array[i + axis] = x_dim[i]; + } + } + for (int i = 0; i < max_dim; i++) { + if (x_dims_array[i] == -1 || y_dims_array[i] == -1) { + out_dims_array[i] = -1; + } else { + out_dims_array[i] = std::max(x_dims_array[i], y_dims_array[i]); + } + } + out_dims_ = DDim(out_dims_array); + } + // infer end + // forward + std::vector x(x_dims_.production()); + std::vector y(y_dims_.production()); + std::vector out(out_dims_.production()); + fill_data_rand(x.data(), -1.f, 1.f, x_dims_.production()); + fill_data_rand(y.data(), -1.f, 1.f, y_dims_.production()); + this->run_forward(¶m_, &kernel_, x, y, out.data()); + + for (int i = 0; i < x_dims_.production(); i++) { + LOG(INFO) << "x_" << i << ": " << x[i]; + } + + for (int i = 0; i < y_dims_.production(); i++) { + LOG(INFO) << "y_" << i << ": " << y[i]; + } + + for (int i = 0; i < out_dims_.production(); i++) { + LOG(INFO) << "out_" << i << ": " << out[i]; + } + + // backward + std::vector out_grad(out_dims_.production()); + std::vector x_grad(x_dims_.production()); + std::vector y_grad(y_dims_.production()); + for (int i = 0; i < out_dims_.production(); i++) { + out_grad[i] = 1.0; + } + this->run_backward(&grad_param_, + &grad_kernel_, 
+ x, + y, + out_grad, + x_grad.data(), + y_grad.data()); + + for (int i = 0; i < x_grad.size(); i++) { + LOG(INFO) << "x_grad_" << i << ": " << x_grad[i]; + } + + for (int i = 0; i < y_grad.size(); i++) { + LOG(INFO) << "y_grad_" << i << ": " << y_grad[i]; + } + + // get numeric gradient + std::vector x_delta(x_dims_.production()); + std::vector y_delta(y_dims_.production()); + std::vector out_delta(out_dims_.production()); + Tensor tensor_x; + Tensor tensor_y; + tensor_x.Resize(x_dims_); + tensor_y.Resize(y_dims_); + grad_param_.X = &tensor_x; + grad_param_.Y = &tensor_y; + + elementwise_common(grad_param_, out_grad, x_delta, y_delta, "add"); + + float max_grad_delta = 0.0005; + for (int i = 0; i < x_dims_.production(); i++) { + EXPECT_NEAR(x_grad[i], x_delta[i], max_grad_delta); + EXPECT_NEAR(y_grad[i], y_delta[i], max_grad_delta); + } + } + + private: + DDim x_dims_; + DDim y_dims_; + DDim out_dims_; + int axis_; + kernel_add_t kernel_; + grad_kernel_add_t grad_kernel_; + param_t param_; + grad_param_t grad_param_; +}; + +class ElementwiseSubGradTester { + public: + explicit ElementwiseSubGradTester(const DDim& x_dims, + const DDim& y_dims, + int axis) + : x_dims_(x_dims), y_dims_(y_dims), axis_(axis) {} + + void prepare_kernel() { + std::unique_ptr ctx1(new KernelContext); + ctx1->As(); + kernel_.SetContext(std::move(ctx1)); + + std::unique_ptr ctx3(new KernelContext); + ctx3->As(); + grad_kernel_.SetContext(std::move(ctx3)); + } + + void run_forward(param_t* param, + kernel_sub_t* kernel, + const std::vector& x_vec, + const std::vector& y_vec, + float* out_vec) { + Tensor x; + Tensor y; + Tensor output; + x.Resize(x_dims_); + y.Resize(y_dims_); + output.Resize(DDim(out_dims_)); + auto* x_data = x.mutable_data(); + auto* y_data = y.mutable_data(); + for (int i = 0; i < x_dims_.production(); i++) { + x_data[i] = x_vec[i]; + } + for (int i = 0; i < y_dims_.production(); i++) { + y_data[i] = y_vec[i]; + } + + param->X = &x; + param->Y = &y; + param->Out = &output; 
+ param->axis = axis_; + kernel->SetParam(*param); + kernel->Launch(); + + auto* output_data = output.mutable_data(); + for (int i = 0; i < out_dims_.production(); i++) { + out_vec[i] = output_data[i]; + } + } + + void run_backward(grad_param_t* param, + grad_kernel_sub_t* kernel, + const std::vector& x_vec, + const std::vector& y_vec, + const std::vector& out_grad_vec, + float* x_grad_vec, + float* y_grad_vec) { + Tensor x; + Tensor x_grad; + Tensor y; + Tensor y_grad; + Tensor out_grad; + x.Resize(x_dims_); + x_grad.Resize(x_dims_); + y.Resize(y_dims_); + y_grad.Resize(y_dims_); + out_grad.Resize(out_dims_); + auto* x_data = x.mutable_data(); + auto* y_data = y.mutable_data(); + auto* out_grad_data = out_grad.mutable_data(); + for (int i = 0; i < x_dims_.production(); i++) { + x_data[i] = x_vec[i]; + } + for (int i = 0; i < y_dims_.production(); i++) { + y_data[i] = y_vec[i]; + } + for (int i = 0; i < out_dims_.production(); i++) { + out_grad_data[i] = out_grad_vec[i]; + } + + param->X = &x; + param->XGrad = &x_grad; + param->Y = &y; + param->YGrad = &y_grad; + param->OutGrad = &out_grad; + param->axis = axis_; + + kernel->SetParam(*param); + kernel->Launch(); + + auto* x_grad_data = x_grad.mutable_data(); + auto* y_grad_data = y_grad.mutable_data(); + for (int i = 0; i < x_dims_.production(); i++) { + x_grad_vec[i] = x_grad_data[i]; + } + for (int i = 0; i < y_dims_.production(); i++) { + y_grad_vec[i] = y_grad_data[i]; + } + } + + void check_grad(float delta2, float max_grad_delta2) { + std::vector out_shape; + // infer shape + auto x_dim = x_dims_; + auto y_dim = y_dims_; + if (x_dim == y_dim) { + out_dims_ = x_dim; + } else { + int max_dim = (x_dim.size() > y_dim.size() ? x_dim.size() : y_dim.size()); + int axis = param_.axis; + axis = + (axis == -1 ? 
std::abs(static_cast(x_dim.size() - y_dim.size())) + : axis); + std::vector x_dims_array(max_dim); + std::vector y_dims_array(max_dim); + std::vector out_dims_array(max_dim); + + if (x_dim.size() > y_dim.size()) { + for (int i = 0; i < axis; ++i) { + y_dims_array[i] = 1; + } + if (axis + y_dim.size() < max_dim) { + for (int i = axis + y_dim.size(); i < max_dim; ++i) { + y_dims_array[i] = 1; + } + } + x_dims_array = x_dim.Vectorize(); + for (int i = 0; i < y_dim.size(); ++i) { + y_dims_array[i + axis] = y_dim[i]; + } + } else { + for (int i = 0; i < axis; ++i) { + x_dims_array[i] = 1; + } + if (axis + x_dim.size() < max_dim) { + for (int i = axis + x_dim.size(); i < max_dim; ++i) { + x_dims_array[i] = 1; + } + } + y_dims_array = y_dim.Vectorize(); + for (int i = 0; i < x_dim.size(); ++i) { + x_dims_array[i + axis] = x_dim[i]; + } + } + for (int i = 0; i < max_dim; i++) { + if (x_dims_array[i] == -1 || y_dims_array[i] == -1) { + out_dims_array[i] = -1; + } else { + out_dims_array[i] = std::max(x_dims_array[i], y_dims_array[i]); + } + } + out_dims_ = DDim(out_dims_array); + } + // infer end + // forward + std::vector x(x_dims_.production()); + std::vector y(y_dims_.production()); + std::vector out(out_dims_.production()); + fill_data_rand(x.data(), -1.f, 1.f, x_dims_.production()); + fill_data_rand(y.data(), -1.f, 1.f, y_dims_.production()); + this->run_forward(¶m_, &kernel_, x, y, out.data()); + + for (int i = 0; i < x_dims_.production(); i++) { + LOG(INFO) << "x_" << i << ": " << x[i]; + } + + for (int i = 0; i < y_dims_.production(); i++) { + LOG(INFO) << "y_" << i << ": " << y[i]; + } + + for (int i = 0; i < out_dims_.production(); i++) { + LOG(INFO) << "out_" << i << ": " << out[i]; + } + + // backward + std::vector out_grad(out_dims_.production()); + std::vector x_grad(x_dims_.production()); + std::vector y_grad(y_dims_.production()); + for (int i = 0; i < out_dims_.production(); i++) { + out_grad[i] = 1.0; + } + this->run_backward(&grad_param_, + &grad_kernel_, 
+ x, + y, + out_grad, + x_grad.data(), + y_grad.data()); + + for (int i = 0; i < x_grad.size(); i++) { + LOG(INFO) << "x_grad_" << i << ": " << x_grad[i]; + } + + for (int i = 0; i < y_grad.size(); i++) { + LOG(INFO) << "y_grad_" << i << ": " << y_grad[i]; + } + + // get numeric gradient + std::vector x_delta(x_dims_.production()); + std::vector y_delta(y_dims_.production()); + std::vector out_delta(out_dims_.production()); + Tensor tensor_x; + Tensor tensor_y; + tensor_x.Resize(x_dims_); + tensor_y.Resize(y_dims_); + grad_param_.X = &tensor_x; + grad_param_.Y = &tensor_y; + + elementwise_common(grad_param_, out_grad, x_delta, y_delta, "sub"); + + float max_grad_delta = 0.0005; + for (int i = 0; i < x_dims_.production(); i++) { + EXPECT_NEAR(x_grad[i], x_delta[i], max_grad_delta); + EXPECT_NEAR(y_grad[i], y_delta[i], max_grad_delta); + } + } + + private: + DDim x_dims_; + DDim y_dims_; + DDim out_dims_; + int axis_; + kernel_sub_t kernel_; + grad_kernel_sub_t grad_kernel_; + param_t param_; + grad_param_t grad_param_; +}; +void TestNormalCase(const std::vector& x_dims, + const std::vector& y_dims, + int axis) { + std::unique_ptr tester_add( + new ElementwiseAddGradTester(DDim(x_dims), DDim(y_dims), axis)); + std::unique_ptr tester_sub( + new ElementwiseSubGradTester(DDim(x_dims), DDim(y_dims), axis)); + + tester_add->prepare_kernel(); + tester_sub->prepare_kernel(); + float delta = 0.001; + float max_grad_delta = 0.005; + tester_add->check_grad(delta, max_grad_delta); + tester_sub->check_grad(delta, max_grad_delta); +} + +TEST(mul_grad_arm, compute) { + LOG(INFO) << "Test Elementwise grad"; + DeviceInfo::Init(); + TestNormalCase({3, 2}, {3, 2}, 0); + TestNormalCase({3, 5}, {3, 5}, 1); + TestNormalCase({3, 4, 3}, {3, 4, 3}, 0); + TestNormalCase({9, 2, 5}, {9, 2, 5}, 1); +} + +} // namespace arm +} // namespace kernels +} // namespace lite +} // namespace paddle +USE_LITE_KERNEL(elementwise_add_grad, kARM, kFloat, kNCHW, def); +USE_LITE_KERNEL(elementwise_add, kARM, 
kFloat, kNCHW, def); diff --git a/lite/tests/kernels/fill_constant_compute_test.cc b/lite/tests/kernels/fill_constant_compute_test.cc index 465b7becffe967ab77d0c1a237fe6a4951031b3a..bc2cfce7842c935898bd9ecddc6c2d0ac4c39af5 100644 --- a/lite/tests/kernels/fill_constant_compute_test.cc +++ b/lite/tests/kernels/fill_constant_compute_test.cc @@ -52,7 +52,8 @@ class FillConstantComputeTester : public arena::TestCase { is_use_shape_tensor_list_(is_use_shape_tensor_list) { if (is_use_shape_tensor_list) { for (int i = 0; i < shape.size(); i++) { - shape_tensor_list_.push_back(shape_tensor_ + std::to_string(i)); + shape_tensor_list_.push_back(shape_tensor_ + + paddle::lite::to_string(i)); } } } diff --git a/lite/tests/kernels/mul_compute_test.cc b/lite/tests/kernels/mul_compute_test.cc index d9bbfaa8d049cf2bbcdea9b9c5e58d201e156a67..d070292332b65ed577ec6cefdb220ee691eb99e9 100644 --- a/lite/tests/kernels/mul_compute_test.cc +++ b/lite/tests/kernels/mul_compute_test.cc @@ -109,6 +109,7 @@ void TestMul(const std::vector& x_dims, int y_num_col_dims, const Place& place, float abs_error) { + LOG(INFO) << "run test arm"; std::unique_ptr tester(new MulComputeTester(place, "def", DDim(x_dims), @@ -131,7 +132,6 @@ TEST(Mul, precision) { #else return; #endif - TestMul({4, 5}, {5, 4}, 1, 1, place, abs_error); TestMul({4, 5}, {5, 4, 3, 2}, 1, 1, place, abs_error); TestMul({4, 20}, {5, 4, 3, 2}, 1, 2, place, abs_error); diff --git a/lite/tests/kernels/mul_grad_compute_test.cc b/lite/tests/kernels/mul_grad_compute_test.cc new file mode 100644 index 0000000000000000000000000000000000000000..95cbb2f8b54dd41d6756f7ae0222a34a7bb18c1d --- /dev/null +++ b/lite/tests/kernels/mul_grad_compute_test.cc @@ -0,0 +1,265 @@ +// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "lite/kernels/arm/mul_grad_compute.h" +#include +#include "lite/core/op_registry.h" +#include "lite/kernels/arm/mul_compute.h" +#include "lite/tests/utils/fill_data.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace arm { + +using param_t = operators::MulParam; +using grad_param_t = operators::MulGradParam; +using kernel_t = MulCompute; +using grad_kernel_t = MulGradCompute; + +class MulGradTester { + public: + explicit MulGradTester(const DDim& x_dims, + const DDim& y_dims, + int x_num_col_dims, + int y_num_col_dims) + : x_dims_(x_dims), + y_dims_(y_dims), + x_num_col_dims_(x_num_col_dims), + y_num_col_dims_(y_num_col_dims) {} + + void prepare_kernel() { + std::unique_ptr ctx1(new KernelContext); + ctx1->As(); + kernel_.SetContext(std::move(ctx1)); + + std::unique_ptr ctx2(new KernelContext); + ctx2->As(); + delta_kernel_.SetContext(std::move(ctx2)); + + std::unique_ptr ctx3(new KernelContext); + ctx3->As(); + grad_kernel_.SetContext(std::move(ctx3)); + } + + void run_forward(param_t* param, + kernel_t* kernel, + const std::vector& x_vec, + const std::vector& y_vec, + float* out_vec) { + Tensor x; + Tensor y; + Tensor output; + x.Resize(x_dims_); + y.Resize(y_dims_); + output.Resize(DDim(out_dims_)); + auto* x_data = x.mutable_data(); + auto* y_data = y.mutable_data(); + for (int i = 0; i < x_dims_.production(); i++) { + x_data[i] = x_vec[i]; + } + for (int i = 0; i < y_dims_.production(); i++) { + y_data[i] = y_vec[i]; + } + + param->x = &x; + param->y = &y; + param->output = &output; + param->x_num_col_dims = 
x_num_col_dims_; + param->y_num_col_dims = y_num_col_dims_; + kernel->SetParam(*param); + kernel->Launch(); + + auto* output_data = output.mutable_data(); + for (int i = 0; i < out_dims_.production(); i++) { + out_vec[i] = output_data[i]; + } + } + + void run_backward(grad_param_t* param, + grad_kernel_t* kernel, + const std::vector& x_vec, + const std::vector& y_vec, + const std::vector& out_grad_vec, + float* x_grad_vec, + float* y_grad_vec) { + Tensor x; + Tensor x_grad; + Tensor y; + Tensor y_grad; + Tensor out_grad; + x.Resize(x_dims_); + x_grad.Resize(x_dims_); + y.Resize(y_dims_); + y_grad.Resize(y_dims_); + out_grad.Resize(out_dims_); + auto* x_data = x.mutable_data(); + auto* y_data = y.mutable_data(); + auto* out_grad_data = out_grad.mutable_data(); + for (int i = 0; i < x_dims_.production(); i++) { + x_data[i] = x_vec[i]; + } + for (int i = 0; i < y_dims_.production(); i++) { + y_data[i] = y_vec[i]; + } + for (int i = 0; i < out_dims_.production(); i++) { + out_grad_data[i] = out_grad_vec[i]; + } + + param->x = &x; + param->x_grad = &x_grad; + param->y = &y; + param->y_grad = &y_grad; + param->output_grad = &out_grad; + param->x_num_col_dims = x_num_col_dims_; + param->y_num_col_dims = y_num_col_dims_; + kernel->SetParam(*param); + kernel->Launch(); + + auto* x_grad_data = x_grad.mutable_data(); + auto* y_grad_data = y_grad.mutable_data(); + for (int i = 0; i < x_dims_.production(); i++) { + x_grad_vec[i] = x_grad_data[i]; + } + for (int i = 0; i < y_dims_.production(); i++) { + y_grad_vec[i] = y_grad_data[i]; + } + } + + void check_grad() { + std::vector out_shape; + for (int i = 0; i < x_num_col_dims_; i++) { + out_shape.push_back(x_dims_[i]); + } + for (int i = y_num_col_dims_; i < y_dims_.size(); i++) { + out_shape.push_back(y_dims_[i]); + } + out_dims_ = DDim(out_shape); + + // forward + std::vector x(x_dims_.production()); + std::vector y(y_dims_.production()); + std::vector out(out_dims_.production()); + fill_data_rand(x.data(), -1.f, 1.f, 
x_dims_.production()); + fill_data_rand(y.data(), -1.f, 1.f, y_dims_.production()); + this->run_forward(¶m_, &kernel_, x, y, out.data()); + + // backward + std::vector out_grad(out_dims_.production()); + std::vector x_grad(x_dims_.production()); + std::vector y_grad(y_dims_.production()); + for (int i = 0; i < out_dims_.production(); i++) { + out_grad[i] = 1.0; + } + this->run_backward(&grad_param_, + &grad_kernel_, + x, + y, + out_grad, + x_grad.data(), + y_grad.data()); + + // get numeric gradient + std::vector x_delta(x_dims_.production()); + std::vector y_delta(y_dims_.production()); + std::vector out_delta(out_dims_.production()); + + float delta = 0.001; + float max_grad_delta = 0.005; + for (int i = 0; i < x_dims_.production(); i++) { + for (int j = 0; j < x_dims_.production(); j++) { + if (i == j) { + x_delta[j] = x[j] + delta; + } else { + x_delta[j] = x[j]; + } + } + this->run_forward( + &delta_param_, &delta_kernel_, x_delta, y, out_delta.data()); + + float sum = 0; + for (int j = 0; j < out_dims_.production(); j++) { + sum += (out_delta[j] - out[j]); + } + + EXPECT_NEAR(x_grad[i], sum / delta, max_grad_delta); + } + + for (int i = 0; i < y_dims_.production(); i++) { + for (int j = 0; j < y_dims_.production(); j++) { + y_delta[j] = i == j ? 
y[j] + delta : y[j]; + } + this->run_forward( + &delta_param_, &delta_kernel_, x, y_delta, out_delta.data()); + float sum = 0; + for (int j = 0; j < out_dims_.production(); j++) { + sum += out_delta[j] - out[j]; + } + + EXPECT_NEAR(y_grad[i], sum / delta, max_grad_delta); + } + } + + private: + DDim x_dims_; + DDim y_dims_; + DDim out_dims_; + int x_num_col_dims_; + int y_num_col_dims_; + kernel_t kernel_; + kernel_t delta_kernel_; + grad_kernel_t grad_kernel_; + param_t param_; + param_t delta_param_; + grad_param_t grad_param_; +}; + +void TestNormalCase(const std::vector& x_dims, + const std::vector& y_dims, + int x_num_col_dims, + int y_num_col_dims) { + std::unique_ptr tester(new MulGradTester( + DDim(x_dims), DDim(y_dims), x_num_col_dims, y_num_col_dims)); + + tester->prepare_kernel(); + + tester->check_grad(); +} + +TEST(mul_grad_arm, compute) { + LOG(INFO) << "Test Mul grad"; + DeviceInfo::Init(); + TestNormalCase({1, 3}, {3, 2}, 1, 1); + TestNormalCase({3, 2}, {2, 1}, 1, 1); + TestNormalCase({3, 1}, {1, 7}, 1, 1); + TestNormalCase({2, 3}, {3, 2}, 1, 1); + TestNormalCase({4, 5}, {5, 4}, 1, 1); + TestNormalCase({4, 5}, {5, 4, 3, 2}, 1, 1); + TestNormalCase({3, 4}, {2, 2, 3}, 1, 2); + TestNormalCase({4, 20}, {5, 4, 3, 2}, 1, 2); + TestNormalCase({4, 60}, {5, 4, 3, 2}, 1, 3); + TestNormalCase({2, 3, 4, 5}, {60, 4}, 1, 1); + TestNormalCase({2, 3, 4, 5}, {20, 4}, 2, 1); + TestNormalCase({2, 3, 4, 5}, {5, 4}, 3, 1); + TestNormalCase({2, 3, 4, 5}, {60, 3, 4, 5}, 1, 1); + TestNormalCase({2, 3, 4, 5}, {4, 5, 6, 2}, 2, 2); + TestNormalCase({2, 3, 4, 5}, {5, 1, 4, 2}, 3, 2); +} + +} // namespace arm +} // namespace kernels +} // namespace lite +} // namespace paddle +USE_LITE_KERNEL(mul, kARM, kFloat, kNCHW, def); +USE_LITE_KERNEL(mul_grad, kARM, kFloat, kNCHW, def); diff --git a/lite/tests/kernels/reshape_compute_test.cc b/lite/tests/kernels/reshape_compute_test.cc index 1b20c8eaa2164eaf4d658fba72c28b860b5bea74..4fba28e2ab982b1f15e48c95dfa247b2ea56c1ae 100644 --- 
a/lite/tests/kernels/reshape_compute_test.cc +++ b/lite/tests/kernels/reshape_compute_test.cc @@ -45,7 +45,8 @@ class ReshapeComputeTester : public arena::TestCase { : TestCase(place, alias), dims_(dims) { if (is_shape_tensor_vct) { for (size_t i = 0; i < shape.size(); i++) { - shape_tensor_vct_.emplace_back(op_type_ + "/shape" + std::to_string(i)); + shape_tensor_vct_.emplace_back(op_type_ + "/shape" + + paddle::lite::to_string(i)); } } else if (is_shape_tensor) { shape_tensor_ = op_type_ + "/shape"; diff --git a/lite/tests/kernels/slice_compute_test.cc b/lite/tests/kernels/slice_compute_test.cc index e8c63e2d729c931578de555cdf16cb066cd40e06..4d698ebc0d42a34cf07a85735c09bd49b3fb1284 100644 --- a/lite/tests/kernels/slice_compute_test.cc +++ b/lite/tests/kernels/slice_compute_test.cc @@ -168,8 +168,9 @@ class SliceComputeTester : public arena::TestCase { std::vector ends_tensor_list_; for (int i = 0; i < starts_.size(); ++i) { starts_tensor_list_.push_back("starts_tensor_list_" + - std::to_string(i)); - ends_tensor_list_.push_back("ends_tensor_list_" + std::to_string(i)); + paddle::lite::to_string(i)); + ends_tensor_list_.push_back("ends_tensor_list_" + + paddle::lite::to_string(i)); } op_desc->SetInput("StartsTensorList", {starts_tensor_list_}); op_desc->SetInput("EndsTensorList", {ends_tensor_list_}); @@ -203,15 +204,15 @@ class SliceComputeTester : public arena::TestCase { } else if (use_tensor_list_) { Scope& scope_ = this->scope(); for (int i = 0; i < starts_.size(); ++i) { - auto* tensor = - scope_.NewTensor("starts_tensor_list_" + std::to_string(i)); + auto* tensor = scope_.NewTensor("starts_tensor_list_" + + paddle::lite::to_string(i)); tensor->Resize(DDim({1})); auto* d = tensor->mutable_data(); d[0] = starts_[i]; } for (int i = 0; i < ends_.size(); ++i) { auto* tensor = - scope_.NewTensor("ends_tensor_list_" + std::to_string(i)); + scope_.NewTensor("ends_tensor_list_" + paddle::lite::to_string(i)); tensor->Resize(DDim({1})); auto* d = 
tensor->mutable_data(); d[0] = ends_[i]; diff --git a/lite/tests/kernels/unsqueeze_compute_test.cc b/lite/tests/kernels/unsqueeze_compute_test.cc index aba7bed4f1508d6dc2e813b16450470972b95de4..461ef7215e3ceb779b2522adbd5bb286036a0d8e 100644 --- a/lite/tests/kernels/unsqueeze_compute_test.cc +++ b/lite/tests/kernels/unsqueeze_compute_test.cc @@ -123,7 +123,7 @@ class UnsqueezeComputeTester : public arena::TestCase { } else if (input_axes_flag_ == 3) { std::string name = "axes_tensor_"; for (size_t i = 0; i < axes_.size(); i++) { - name = name + std::to_string(i); + name = name + paddle::lite::to_string(i); axes_tensor_list_.push_back(name); SetCommonTensor(name, DDim({1}), &axes_[i]); } diff --git a/lite/tools/build.sh b/lite/tools/build.sh index 93bc95fa4a7136d2127370b076c6b51ccb29c9b5..e28dd6c53e53c477e56e044ada926b4056f1e4e1 100755 --- a/lite/tools/build.sh +++ b/lite/tools/build.sh @@ -291,6 +291,8 @@ function make_ios { -DLITE_ON_TINY_PUBLISH=ON \ -DLITE_WITH_OPENMP=OFF \ -DWITH_ARM_DOTPROD=OFF \ + -DLITE_BUILD_TAILOR=$BUILD_TAILOR \ + -DLITE_OPTMODEL_DIR=$OPTMODEL_DIR \ -DLITE_WITH_LIGHT_WEIGHT_FRAMEWORK=ON \ -DARM_TARGET_ARCH_ABI=$abi \ -DLITE_BUILD_EXTRA=$BUILD_EXTRA \ @@ -354,10 +356,12 @@ function make_x86 { -DWITH_LITE=ON \ -DLITE_WITH_LIGHT_WEIGHT_FRAMEWORK=OFF \ -DLITE_WITH_ARM=OFF \ + -DLITE_WITH_PYTHON=$BUILD_PYTHON \ -DWITH_GPU=OFF \ + -DLITE_WITH_PYTHON=${BUILD_PYTHON} \ -DLITE_BUILD_EXTRA=ON \ -DLITE_WITH_XPU=$BUID_XPU \ - -DXPU_SDK_ROOT=$XPU_SDK_ROOT \ + -DXPU_SDK_ROOT=$XPU_SDK_ROOT make publish_inference -j$NUM_PROC cd - diff --git a/lite/tools/ci_build.sh b/lite/tools/ci_build.sh index 884576793db29bd745bc5397ca7d155c9701cd31..703da69fa59f3aa99bad9fb04c0decb591486058 100755 --- a/lite/tools/ci_build.sh +++ b/lite/tools/ci_build.sh @@ -184,7 +184,7 @@ function build_opencl { return 0 fi - build_dir=$cur_dir/build.lite.${os}.${abi}.${lang}.opencl + build_dir=$cur_dir/build.lite.${os}.${abi}.${lang} mkdir -p $build_dir cd $build_dir @@ -193,11 
+193,10 @@ function build_opencl { cmake_opencl ${os} ${abi} ${lang} make opencl_clhpp -j$NUM_CORES_FOR_COMPILE build $TESTS_FILE - - # test publish inference lib - make publish_inference -j$NUM_CORES_FOR_COMPILE } + + # This method is only called in CI. function cmake_x86_for_CI { prepare_workspace # fake an empty __generated_code__.cc to pass cmake. @@ -387,7 +386,7 @@ function test_arm_android { echo "test name: ${test_name}" adb_work_dir="/data/local/tmp" - skip_list=("test_model_parser" "test_mobilenetv1" "test_mobilenetv2" "test_resnet50" "test_inceptionv4" "test_light_api" "test_apis" "test_paddle_api" "test_cxx_api" "test_gen_code" "test_mobilenetv1_int8" "test_subgraph_pass") + skip_list=("test_model_parser" "test_mobilenetv1" "test_mobilenetv2" "test_resnet50" "test_inceptionv4" "test_light_api" "test_apis" "test_paddle_api" "test_cxx_api" "test_gen_code" "test_mobilenetv1_int8" "test_subgraph_pass" "test_grid_sampler_image_opencl" "test_lrn_image_opencl" "test_pad2d_image_opencl") for skip_name in ${skip_list[@]} ; do [[ $skip_name =~ (^|[[:space:]])$test_name($|[[:space:]]) ]] && echo "skip $test_name" && return done @@ -755,16 +754,58 @@ function arm_push_necessary_file { adb -s ${device} push ${testpath} ${adb_work_dir} } + +function test_opencl { + os=$1 + abi=$2 + lang=$3 + device=$4 + + if [[ ${os} == "armlinux" ]]; then + # TODO(hongming): enable test armlinux on armv8, armv7 and armv7hf + echo "Skip test arm linux yet. 
armlinux must in another docker" + return 0 + fi + + if [[ ${os} == "android" && ${abi} == "armv7hf" ]]; then + echo "android do not need armv7hf" + return 0 + fi + + # prepare for CXXApi test + local adb="adb -s ${device}" + $adb shell mkdir -p /data/local/tmp/lite_naive_model_opt + + # opencl test should be marked with `opencl` + opencl_test_mark="opencl" + + for _test in $(cat $TESTS_FILE); do + # tell if this test is marked with `opencl` + if [[ $_test == *$opencl_test_mark* ]]; then + test_arm_android $_test $device + fi + done + +} + function build_test_arm_opencl { ######################################################################## cur=$PWD + # job 1-4 must be in one runner + prepare_adb_devices # job 1 build_opencl "android" "armv8" "gcc" + adb -s $device_armv8 shell 'rm -rf /data/local/tmp/*' + run_gen_code_test ${device_armv8} + test_opencl "android" "armv8" "gcc" ${device_armv8} cd $cur # job 2 build_opencl "android" "armv7" "gcc" + adb -s $device_armv7 shell 'rm -rf /data/local/tmp/*' + run_gen_code_test ${device_armv7} + test_opencl "android" "armv7" "gcc" ${device_armv7} cd $cur echo "Done" @@ -1099,6 +1140,8 @@ function main { ;; build_test_arm_opencl) build_test_arm_opencl + build_test_arm_subtask_model test_mobilenetv1 mobilenet_v1 + build_test_arm_subtask_model test_mobilenetv2 mobilenet_v2_relu shift ;; build_test_arm_subtask_android) diff --git a/lite/tools/cmake_tools/create_fake_kernel_registry.py b/lite/tools/cmake_tools/create_fake_kernel_registry.py index 35012d5b163aac2b6998790b4cfcf31e16cb1454..0b96652c6f78ee6bcf5498b9247f0a2391c70473 100644 --- a/lite/tools/cmake_tools/create_fake_kernel_registry.py +++ b/lite/tools/cmake_tools/create_fake_kernel_registry.py @@ -1,4 +1,4 @@ -# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. 
# # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -11,6 +11,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. +# this module will record kernels in unvalid_places into all_kernel_faked.cc from __future__ import print_function import sys @@ -18,12 +19,13 @@ import logging from ast import RegisterLiteKernelParser from utils import * -if len(sys.argv) != 4: +if len(sys.argv) != 5: print("Error: create_fake_kernel_registry.py requires three inputs!") exit(1) -ops_list_path = sys.argv[1] -dest_path = sys.argv[2] -kernelmap_path = sys.argv[3] +kernels_list_path = sys.argv[1] +faked_kernels_list_path = sys.argv[2] +dest_path = sys.argv[3] +kernelmap_path = sys.argv[4] out_lines = [ '#pragma once', @@ -77,68 +79,85 @@ const std::map kernel2path_map{ ''' ] +def parse_fake_kernels_from_path(list_path): + with open(list_path) as f: + paths = set([path for path in f]) + for path in paths: + print('path', path) + with open(path.strip()) as g: + c = g.read() + kernel_parser = RegisterLiteKernelParser(c) + kernel_parser.parse() + + for k in kernel_parser.kernels: + kernel_name = "{op_type}_{target}_{precision}_{data_layout}_{alias}_class".format( + op_type=k.op_type, + target=k.target, + precision=k.precision, + data_layout=k.data_layout, + alias=k.alias + ) + + kernel_define = fake_kernel % ( + kernel_name, + k.target, + k.precision, + k.data_layout, + kernel_name + ) + + out_lines.append(kernel_define) + out_lines.append("") + + + key = "REGISTER_LITE_KERNEL(%s, %s, %s, %s, %s, %s)" % ( + k.op_type, + k.target, + k.precision, + k.data_layout, + '::paddle::lite::' + kernel_name, + k.alias + ) + out_lines.append(key) + + for input in k.inputs: + io = ' .BindInput("%s", {%s})' % (input.name, input.type) + out_lines.append(io) + for output in k.outputs: + io = ' 
.BindOutput("%s", {%s})' % (output.name, output.type) + out_lines.append(io) + out_lines.append(" .Finalize();") + out_lines.append("") + out_lines.append(gen_use_kernel_statement(k.op_type, k.target, k.precision, k.data_layout, k.alias)) + +def parse_sppported_kernels_from_path(list_path): + with open(list_path) as f: + paths = set([path for path in f]) + for path in paths: + print('path', path) + with open(path.strip()) as g: + c = g.read() + kernel_parser = RegisterLiteKernelParser(c) + kernel_parser.parse() + + for k in kernel_parser.kernels: + index = path.rindex('/') + filename = path[index + 1:] + map_element = ' {"%s,%s,%s,%s,%s", "%s"},' % ( + k.op_type, + k.target, + k.precision, + k.data_layout, + k.alias, + filename.strip() + ) + kernel_src_map_lines.append(map_element) + + +parse_fake_kernels_from_path(faked_kernels_list_path) +parse_sppported_kernels_from_path(faked_kernels_list_path) +parse_sppported_kernels_from_path(kernels_list_path) -with open(ops_list_path) as f: - paths = set([path for path in f]) - for path in paths: - print('path', path) - with open(path.strip()) as g: - c = g.read() - kernel_parser = RegisterLiteKernelParser(c) - kernel_parser.parse() - - for k in kernel_parser.kernels: - kernel_name = "{op_type}_{target}_{precision}_{data_layout}_{alias}_class".format( - op_type = k.op_type, - target = k.target, - precision = k.precision, - data_layout = k.data_layout, - alias = k.alias, - ) - - kernel_define = fake_kernel % ( - kernel_name, - k.target, - k.precision, - k.data_layout, - kernel_name, - ) - - out_lines.append(kernel_define) - out_lines.append("") - - - key = "REGISTER_LITE_KERNEL(%s, %s, %s, %s, %s, %s)" % ( - k.op_type, - k.target, - k.precision, - k.data_layout, - '::paddle::lite::' + kernel_name, - k.alias, - ) - out_lines.append(key) - - for input in k.inputs: - io = ' .BindInput("%s", {%s})' % (input.name, input.type) - out_lines.append(io) - for output in k.outputs: - io = ' .BindOutput("%s", {%s})' % (output.name, 
output.type) - out_lines.append(io) - out_lines.append(" .Finalize();") - out_lines.append("") - out_lines.append(gen_use_kernel_statement(k.op_type, k.target, k.precision, k.data_layout, k.alias)) - - index = path.rindex('/') - filename = path[index + 1:] - map_element = ' {"%s,%s,%s,%s,%s", "%s"},' % ( - k.op_type, - k.target, - k.precision, - k.data_layout, - k.alias, - filename.strip() - ) - kernel_src_map_lines.append(map_element) with open(dest_path, 'w') as f: logging.info("write kernel list to %s" % dest_path) f.write('\n'.join(out_lines)) diff --git a/lite/tools/cmake_tools/record_supported_kernel_op.py b/lite/tools/cmake_tools/record_supported_kernel_op.py index f6a3af6bd3e5a2decfb6b3b65b0357bff8b4a378..560174bc632bec89b9655ff89fd5eeb9e7db7786 100644 --- a/lite/tools/cmake_tools/record_supported_kernel_op.py +++ b/lite/tools/cmake_tools/record_supported_kernel_op.py @@ -1,4 +1,4 @@ -# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -11,6 +11,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
+# this module will record supported ops from kernels_src.txt from __future__ import print_function import sys @@ -18,12 +19,13 @@ import logging from ast import RegisterLiteKernelParser from ast import RegisterLiteOpParser -if len(sys.argv) != 4: - print("Error: record_supported_kernel_op.py requires three inputs!") - exit(1) +if len(sys.argv) != 5: + print("Error: record_supported_kernel_op.py requires four inputs!") + sys.exit(1) kernels_list_path = sys.argv[1] -ops_list_path = sys.argv[2] -kernel_op_map_dest_path = sys.argv[3] +faked_kernels_list_path = sys.argv[2] +ops_list_path = sys.argv[3] +kernel_op_map_dest_path = sys.argv[4] out_lines = [ @@ -51,11 +53,11 @@ const std::vector> supported_ops_target = { ''' ] -ops_lines=[] +ops_lines = [] # valid targets and valid_ops valid_targets = ["kUnk", "kHost", "kX86", "kCUDA", "kARM", "kOpenCL", "kAny", "kFPGA", "kNPU", "kXPU"] -valid_ops = [[],[],[],[],[],[],[],[],[],[]] +valid_ops = [[], [], [], [], [], [], [], [], [], []] class TargetType: kUnk = 0 kHost = 1 @@ -78,8 +80,21 @@ with open(kernels_list_path) as f: kernel_parser.parse() for k in kernel_parser.kernels: if hasattr(TargetType, k.target): - index=getattr(TargetType, k.target) + index = getattr(TargetType, k.target) valid_ops[index].append(k.op_type) +# record op_info of valid kernels into `valid_ops` according to different target type +with open(faked_kernels_list_path) as f: + paths = set([path for path in f]) + for path in paths: + with open(path.strip()) as g: + c = g.read() + kernel_parser = RegisterLiteKernelParser(c) + kernel_parser.parse() + for k in kernel_parser.kernels: + if hasattr(TargetType, k.target): + index = getattr(TargetType, k.target) + valid_ops[index].append(k.op_type) + # clear the repeated ops for target in valid_targets: @@ -114,7 +129,7 @@ with open(kernel_op_map_dest_path, 'w') as f: f.write('\n'.join(out_lines)) # write kernels into head file for target in valid_targets: - if len(valid_ops[getattr(TargetType, target)]) == 0 : 
+ if len(valid_ops[getattr(TargetType, target)]) == 0: f.write("\n // %s_OPS: " %target) f.write('\n {},') else: diff --git a/lite/utils/cv/CMakeLists.txt b/lite/utils/cv/CMakeLists.txt index 153487623bd3539505543ba1bdc155f77f6c22c9..f07350a4720d7f7eaa268fcaaddf8de31357725d 100644 --- a/lite/utils/cv/CMakeLists.txt +++ b/lite/utils/cv/CMakeLists.txt @@ -1,6 +1,7 @@ if(LITE_WITH_CV AND (NOT LITE_WITH_FPGA) AND LITE_WITH_ARM) lite_cc_library(paddle_cv_arm SRCS image_convert.cc + bgr_rotate.cc paddle_image_preprocess.cc image2tensor.cc image_flip.cc diff --git a/lite/utils/cv/bgr_rotate.cc b/lite/utils/cv/bgr_rotate.cc new file mode 100644 index 0000000000000000000000000000000000000000..93d280b89de8b729af3ed2b1c86d6b2c7e8771c8 --- /dev/null +++ b/lite/utils/cv/bgr_rotate.cc @@ -0,0 +1,1507 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +// ncnn license +// Tencent is pleased to support the open source community by making ncnn +// available. +// +// Copyright (C) 2018 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this +// file except +// in compliance with the License. 
You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software +// distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +#include "lite/utils/cv/bgr_rotate.h" +#include +#include +#include +namespace paddle { +namespace lite { +namespace utils { +namespace cv { +void rotate90_hwc(const uint8_t* src, uint8_t* dst, int w_in, int h_in); + +void rotate270_hwc(const uint8_t* src, uint8_t* dst, int w_in, int h_in); + +void rotate180_hwc(const uint8_t* src, uint8_t* dst, int w_in, int h_in); + +void bgr_rotate_hwc( + const uint8_t* src, uint8_t* dst, int w_in, int h_in, int angle) { + if (angle == 90) { + rotate90_hwc(src, dst, w_in, h_in); + } + if (angle == 270) { + rotate270_hwc(src, dst, w_in, h_in); + } + if (angle == 180) { + rotate180_hwc(src, dst, w_in, h_in); + } +} + +/* +bgr1 bgr2 bgr3 +bgr4 bgr5 bgr6 +bgr7 bgr8 bgr9 +rotate: +bgr7 bgr4 bgr1 +bgr8 bgr5 bgr2 +bgr9 bgr6 bgr3 +*/ +#ifdef __aarch64__ +void rotate90_hwc(const uint8_t* src, uint8_t* dst, int w_in, int h_in) { + int w_out = h_in; + int h_out = w_in; + int win = w_in * 3; + int wout = w_out * 3; + int64_t stride_h = 4 * win; + int64_t stride_h_w = 4 * win - 24; + int ww = w_out - 8; + [w_out * h_out * 3]; + // block 8*8. 
-- 8*8 + int i = 0; + for (i = 0; i < h_in - 7; i += 8) { + const uint8_t* inptr0 = src + i * win; + const uint8_t* inptr1 = inptr0 + win; + const uint8_t* inptr2 = inptr1 + win; + const uint8_t* inptr3 = inptr2 + win; + + asm volatile( + "prfm pldl1keep, [%[ptr0]] \n" + "prfm pldl1keep, [%[ptr0], #64] \n" + "prfm pldl1keep, [%[ptr1]] \n" + "prfm pldl1keep, [%[ptr1], #64] \n" + "prfm pldl1keep, [%[ptr2]] \n" + "prfm pldl1keep, [%[ptr2], #64] \n" + "prfm pldl1keep, [%[ptr3]] \n" + "prfm pldl1keep, [%[ptr3], #64] \n" + : + : [ptr0] "r"(inptr0), + [ptr1] "r"(inptr1), + [ptr2] "r"(inptr2), + [ptr3] "r"(inptr3) + : "memory"); + int j = 0; + for (; j < w_in - 7; j += 8) { + uint8_t* outptr0 = dst + j * wout + (ww - i) * 3; + uint8_t* outptr1 = outptr0 + wout; + uint8_t* outptr2 = outptr1 + wout; + uint8_t* outptr3 = outptr2 + wout; + uint8_t* outptr4 = outptr3 + wout; + uint8_t* outptr5 = outptr4 + wout; + uint8_t* outptr6 = outptr5 + wout; + uint8_t* outptr7 = outptr6 + wout; + asm volatile( + "ld3 {v0.8b, v1.8b, v2.8b}, [%[inptr0]] \n" // v0={00,01,02, 03, + // 04, 05, 06, 07}" + "ld3 {v3.8b, v4.8b, v5.8b}, [%[inptr1]] \n" // v0={10,11,12, 13, + // 14, 15, 16, 17}" + "ld3 {v6.8b, v7.8b, v8.8b}, [%[inptr2]] \n" // v0={20,21,22, 23, + // 24, 25, 26, 27}" + "ld3 {v9.8b, v10.8b, v11.8b}, [%[inptr3]] \n" // v0={30,31,32, + // 33, 34, 35, 36, + // 37}" + + "add %[inptr0], %[inptr0], %[stride_h] \n" // 4 + 4*w_in + "add %[inptr1], %[inptr1], %[stride_h] \n" // 5 + "add %[inptr2], %[inptr2], %[stride_h] \n" // 6 + "add %[inptr3], %[inptr3], %[stride_h] \n" // 7 + + // b + "trn1 v12.8b, v0.8b, v3.8b \n" // v4={00 10 02 12 04 14 + // 06 16 } + "trn1 v15.8b, v6.8b, v9.8b \n" // v4={20 30 22 32 24 34 + // 26 36 } + + "trn2 v18.8b, v0.8b, v3.8b \n" // v5={01 11 03 13 05 15 + // 07 17 } + "trn2 v21.8b, v6.8b, v9.8b \n" // v7={21 31 23 33 25 35 + // 27 37 } + + // g + "trn1 v13.8b, v1.8b, v4.8b \n" // v4={00 10 02 12 04 14 + // 06 16 } + "trn1 v16.8b, v7.8b, v10.8b \n" // v4={20 30 
22 32 24 34 + // 26 36 } + + "trn2 v19.8b, v1.8b, v4.8b \n" // v5={01 11 03 13 05 15 + // 07 17 } + "trn2 v22.8b, v7.8b, v10.8b \n" // v7={21 31 23 33 25 35 + // 27 37 } + + // r + "trn1 v14.8b, v2.8b, v5.8b \n" // v4={00 10 02 12 04 14 + // 06 16 } + "trn1 v17.8b, v8.8b, v11.8b \n" // v4={20 30 22 32 24 34 + // 26 36 } + + "trn2 v20.8b, v2.8b, v5.8b \n" // v5={01 11 03 13 05 15 + // 07 17 } + "trn2 v23.8b, v8.8b, v11.8b \n" // v7={21 31 23 33 25 35 + // 27 37 } + + // b1 + "trn1 v24.4h, v12.4h, v15.4h \n" // v0={00 10 20 30 04 14 + // 24 34} + "trn1 v27.4h, v18.4h, v21.4h \n" // v2={01 11 21 31 05 15 + // 25 35} + + "trn2 v0.4h, v12.4h, v15.4h \n" // v1={02 12 22 32 06 16 + // 26 36} + "trn2 v3.4h, v18.4h, v21.4h \n" // v3={03 13 23 33 07 17 + // 27 37} + + // g1 + "trn1 v25.4h, v13.4h, v16.4h \n" // v0={00 10 20 30 04 14 + // 24 34} + "trn1 v28.4h, v19.4h, v22.4h \n" // v2={01 11 21 31 05 15 + // 25 35} + + "trn2 v1.4h, v13.4h, v16.4h \n" // v1={02 12 22 32 06 16 + // 26 36} + "trn2 v4.4h, v19.4h, v22.4h \n" // v3={03 13 23 33 07 17 + // 27 37} + + // r1 + "trn1 v26.4h, v14.4h, v17.4h \n" // v0={00 10 20 30 04 14 + // 24 34} + "trn1 v29.4h, v20.4h, v23.4h \n" // v2={01 11 21 31 05 15 + // 25 35} + + "trn2 v2.4h, v14.4h, v17.4h \n" // v1={02 12 22 32 06 16 + // 26 36} + "trn2 v5.4h, v20.4h, v23.4h \n" // v3={03 13 23 33 07 17 + // 27 37} + + "ld3 {v12.8b, v13.8b, v14.8b}, [%[inptr0]] \n" // v0={00,01,02, + // 03, 04, 05, 06, + // 07}" + "ld3 {v15.8b, v16.8b, v17.8b}, [%[inptr1]] \n" // v0={10,11,12, + // 13, 14, 15, 16, + // 17}" + "ld3 {v6.8b, v7.8b, v8.8b}, [%[inptr2]] \n" // v0={20,21,22, 23, + // 24, 25, 26, 27}" + "ld3 {v9.8b, v10.8b, v11.8b}, [%[inptr3]] \n" // v0={30,31,32, + // 33, 34, 35, 36, + // 37}" + + "sub %[inptr0], %[inptr0], %[stride_h_w] \n" // 4 - 4*w_in + 8 + "sub %[inptr1], %[inptr1], %[stride_h_w] \n" // 5 + "sub %[inptr2], %[inptr2], %[stride_h_w] \n" // 6 + "sub %[inptr3], %[inptr3], %[stride_h_w] \n" // 7 + + // b2 + "trn1 v18.8b, v12.8b, 
v15.8b \n" // v4={00 10 02 12 04 14 + // 06 16 } + "trn1 v21.8b, v6.8b, v9.8b \n" // v4={20 30 22 32 24 34 + // 26 36 } + // g2 + "trn1 v19.8b, v13.8b, v16.8b \n" // v4={00 10 02 12 04 14 + // 06 16 } + "trn1 v22.8b, v7.8b, v10.8b \n" // v4={20 30 22 32 24 34 + // 26 36 } + // r2 + "trn1 v20.8b, v14.8b, v17.8b \n" // v4={00 10 02 12 04 14 + // 06 16 } + "trn1 v23.8b, v8.8b, v11.8b \n" // v4={20 30 22 32 24 34 + // 26 36 } + + "trn2 v12.8b, v12.8b, v15.8b \n" // v5={01 11 03 13 05 15 + // 07 17 } + "trn2 v13.8b, v13.8b, v16.8b \n" // v5={01 11 03 13 05 15 + // 07 17 } + "trn2 v14.8b, v14.8b, v17.8b \n" // v5={01 11 03 13 05 15 + // 07 17 } + + "trn2 v15.8b, v6.8b, v9.8b \n" // v7={21 31 23 33 25 35 + // 27 37 } + "trn2 v16.8b, v7.8b, v10.8b \n" // v7={21 31 23 33 25 35 + // 27 37 } + "trn2 v17.8b, v8.8b, v11.8b \n" // v7={21 31 23 33 25 35 + // 27 37 } + + // b2 + "trn1 v6.4h, v18.4h, v21.4h \n" // v0={00 10 20 30 04 14 + // 24 34} + // g2 + "trn1 v7.4h, v19.4h, v22.4h \n" // v0={00 10 20 30 04 14 + // 24 34} + // r2 + "trn1 v8.4h, v20.4h, v23.4h \n" // v0={00 10 20 30 04 14 + // 24 34} + + // bgr + "trn1 v9.4h, v12.4h, v15.4h \n" // v2={01 11 21 31 05 15 + // 25 35} + "trn1 v10.4h, v13.4h, v16.4h \n" // v2={01 11 21 31 05 15 + // 25 35} + "trn1 v11.4h, v14.4h, v17.4h \n" // v2={01 11 21 31 05 15 + // 25 35} + + // bgr + "trn2 v18.4h, v18.4h, v21.4h \n" // v1={02 12 22 32 06 16 + // 26 36} + "trn2 v19.4h, v19.4h, v22.4h \n" // v1={02 12 22 32 06 16 + // 26 36} + "trn2 v20.4h, v20.4h, v23.4h \n" // v1={02 12 22 32 06 16 + // 26 36} + + // bgr + "trn2 v21.4h, v12.4h, v15.4h \n" // v3={03 13 23 33 07 17 + // 27 37} + "trn2 v22.4h, v13.4h, v16.4h \n" // v3={03 13 23 33 07 17 + // 27 37} + "trn2 v23.4h, v14.4h, v17.4h \n" // v3={03 13 23 33 07 17 + // 27 37} + + // b1 b2 + "trn1 v12.2s, v24.2s, v6.2s \n" // v8={00 10 20 30 40 50 + // 60 70} b + "trn1 v13.2s, v25.2s, v7.2s \n" // v6={00 10 20 30 40 50 + // 60 70} g + "trn1 v14.2s, v26.2s, v8.2s \n" // v6={00 10 20 30 40 
50 + // 60 70} r + + // b1 b2 + "trn2 v15.2s, v24.2s, v6.2s \n" // v8={04 14 24 34 44 54 + // 64 74} b + "trn2 v16.2s, v25.2s, v7.2s \n" // v6={04 14 24 34 44 54 + // 64 74} g + "trn2 v17.2s, v26.2s, v8.2s \n" // v6={04 14 24 34 44 54 + // 64 74} r + + // b1 b2 + "trn1 v6.2s, v27.2s, v9.2s \n" // v8={01 11 20 30 40 50 + // 60 70} b + "trn1 v7.2s, v28.2s, v10.2s \n" // v6={01 10 20 30 40 50 + // 60 70} g + "trn1 v8.2s, v29.2s, v11.2s \n" // v6={01 10 20 30 40 50 + // 60 70} r + + "rev64 v12.8b, v12.8b \n" //@ reverse 07 06 05 04 03 + // 02 01 00 b + "rev64 v13.8b, v13.8b \n" //@ reverse 07 06 05 04 03 + // 02 01 00 g + "rev64 v14.8b, v14.8b \n" //@ reverse 07 06 05 04 03 + // 02 01 00 r + "rev64 v15.8b, v15.8b \n" //@ reverse 07 06 05 04 03 + // 02 01 00 b + "rev64 v16.8b, v16.8b \n" //@ reverse 07 06 05 04 03 + // 02 01 00 g + "rev64 v17.8b, v17.8b \n" //@ reverse 07 06 05 04 03 + // 02 01 00 r + + // b1 b2 + "trn2 v24.2s, v27.2s, v9.2s \n" // v8={05 10 20 30 40 50 + // 60 70} b + "trn2 v25.2s, v28.2s, v10.2s \n" // v6={05 10 20 30 40 50 + // 60 70} g + "trn2 v26.2s, v29.2s, v11.2s \n" // v6={05 10 20 30 40 50 + // 60 70} r + + // "st3 {v12.8b, v13.8b, v14.8b}, [%[outptr0]], #24 \n" + // //00 10 20 30 04 14 24 34 + // "st3 {v15.8b, v16.8b, v17.8b}, [%[outptr4]], #24 \n" + // //02 12 22 32 + "st3 {v12.8b, v13.8b, v14.8b}, [%[outptr0]], #24 \n" // 00 10 20 30 04 14 24 34 + "st3 {v15.8b, v16.8b, v17.8b}, [%[outptr4]], #24 \n" // 02 12 22 32 + // b1 b2 + "trn1 v9.2s, v0.2s, v18.2s \n" // v8={02 11 20 30 40 50 + // 60 70} b + "trn1 v10.2s, v1.2s, v19.2s \n" // v6={02 10 20 30 40 50 + // 60 70} g + "trn1 v11.2s, v2.2s, v20.2s \n" // v6={02 10 20 30 40 50 + // 60 70} r + + "trn2 v27.2s, v0.2s, v18.2s \n" // v8={06 11 20 30 40 50 + // 60 70} b + "trn2 v28.2s, v1.2s, v19.2s \n" // v6={06 10 20 30 40 50 + // 60 70} g + "trn2 v29.2s, v2.2s, v20.2s \n" // v6={06 10 20 30 40 50 + // 60 70} r + + // b1 b2 + "trn1 v0.2s, v3.2s, v21.2s \n" // v8={03 11 20 30 40 50 + // 60 70} b + 
"trn1 v1.2s, v4.2s, v22.2s \n" // v6={03 10 20 30 40 50 + // 60 70} g + "trn1 v2.2s, v5.2s, v23.2s \n" // v6={03 10 20 30 40 50 + // 60 70} r + + "trn2 v18.2s, v3.2s, v21.2s \n" // v8={07 11 20 30 40 50 + // 60 70} b + "trn2 v19.2s, v4.2s, v22.2s \n" // v6={07 10 20 30 40 50 + // 60 70} g + "trn2 v20.2s, v5.2s, v23.2s \n" // v6={07 10 20 30 40 50 + // 60 70} r + + "rev64 v6.8b, v6.8b \n" //@ reverse 07 06 05 04 03 02 + // 01 00 b + "rev64 v7.8b, v7.8b \n" //@ reverse 07 06 05 04 03 02 + // 01 00 g + "rev64 v8.8b, v8.8b \n" //@ reverse 07 06 05 04 03 02 + // 01 00 r + + "rev64 v24.8b, v24.8b \n" //@ reverse 07 06 05 04 03 + // 02 01 00 b + "rev64 v25.8b, v25.8b \n" //@ reverse 07 06 05 04 03 + // 02 01 00 g + "rev64 v26.8b, v26.8b \n" //@ reverse 07 06 05 04 03 + // 02 01 00 r + + "rev64 v9.8b, v9.8b \n" //@ reverse 07 06 05 04 03 02 + // 01 00 b + "rev64 v10.8b, v10.8b \n" //@ reverse 07 06 05 04 03 + // 02 01 00 g + "rev64 v11.8b, v11.8b \n" //@ reverse 07 06 05 04 03 + // 02 01 00 r + + "rev64 v27.8b, v27.8b \n" //@ reverse 07 06 05 04 03 + // 02 01 00 b + "rev64 v28.8b, v28.8b \n" //@ reverse 07 06 05 04 03 + // 02 01 00 g + "rev64 v29.8b, v29.8b \n" //@ reverse 07 06 05 04 03 + // 02 01 00 r + + "rev64 v0.8b, v0.8b \n" //@ reverse 07 06 05 04 03 02 + // 01 00 b + "rev64 v1.8b, v1.8b \n" //@ reverse 07 06 05 04 03 02 + // 01 00 g + "rev64 v2.8b, v2.8b \n" //@ reverse 07 06 05 04 03 02 + // 01 00 r + + "rev64 v18.8b, v18.8b \n" //@ reverse 07 06 05 04 03 + // 02 01 00 b + "rev64 v19.8b, v19.8b \n" //@ reverse 07 06 05 04 03 + // 02 01 00 g + "rev64 v20.8b, v20.8b \n" //@ reverse 07 06 05 04 03 + // 02 01 00 r + + "st3 {v6.8b, v7.8b, v8.8b}, [%[outptr1]], #24 \n" // 00 + // 10 + // 20 + // 30 + // 04 + // 14 + // 24 + // 34 + "st3 {v24.8b, v25.8b, v26.8b}, [%[outptr5]], #24 \n" // 02 12 22 32 + + "st3 {v9.8b, v10.8b, v11.8b}, [%[outptr2]], #24 \n" // 00 + // 10 + // 20 + // 30 + // 04 + // 14 + // 24 + // 34 + "st3 {v27.8b, v28.8b, v29.8b}, [%[outptr6]], #24 \n" 
// 02 12 22 32 + + "st3 {v0.8b, v1.8b, v2.8b}, [%[outptr3]], #24 \n" // 00 + // 10 + // 20 + // 30 + // 04 + // 14 + // 24 + // 34 + "st3 {v18.8b, v19.8b, v20.8b}, [%[outptr7]], #24 \n" // 02 12 22 32 + + : [inptr0] "+r"(inptr0), + [inptr1] "+r"(inptr1), + [inptr2] "+r"(inptr2), + [inptr3] "+r"(inptr3), + [outptr0] "+r"(outptr0), + [outptr1] "+r"(outptr1), + [outptr2] "+r"(outptr2), + [outptr3] "+r"(outptr3), + [outptr4] "+r"(outptr4), + [outptr5] "+r"(outptr5), + [outptr6] "+r"(outptr6), + [outptr7] "+r"(outptr7), + [stride_h] "+r"(stride_h), + [stride_h_w] "+r"(stride_h_w) + : + : "v0", + "v1", + "v2", + "v3", + "v4", + "v5", + "v6", + "v7", + "v8", + "v9", + "v10", + "v11", + "v12", + "v13", + "v14", + "v15", + "v16", + "v17", + "v18", + "v19", + "v20", + "v21", + "v22", + "v23", + "v24", + "v25", + "v26", + "v27", + "v28", + "v29", + "v30"); + } + const uint8_t* inptr4 = inptr3 + win; + const uint8_t* inptr5 = inptr4 + win; + const uint8_t* inptr6 = inptr5 + win; + const uint8_t* inptr7 = inptr6 + win; + for (; j < w_in; j++) { + int tmpx = (ww - i) * 3; + uint8_t* outptr = dst + j * wout + tmpx; + *outptr++ = *inptr7++; + *outptr++ = *inptr7++; + *outptr++ = *inptr7++; + + *outptr++ = *inptr6++; + *outptr++ = *inptr6++; + *outptr++ = *inptr6++; + + *outptr++ = *inptr5++; + *outptr++ = *inptr5++; + *outptr++ = *inptr5++; + + *outptr++ = *inptr4++; + *outptr++ = *inptr4++; + *outptr++ = *inptr4++; + + *outptr++ = *inptr3++; + *outptr++ = *inptr3++; + *outptr++ = *inptr3++; + + *outptr++ = *inptr2++; + *outptr++ = *inptr2++; + *outptr++ = *inptr2++; + + *outptr++ = *inptr1++; + *outptr++ = *inptr1++; + *outptr++ = *inptr1++; + + *outptr++ = *inptr0++; + *outptr++ = *inptr0++; + *outptr++ = *inptr0++; + } + } + for (; i < h_in; i++) { + const uint8_t* inptr0 = src + i * win; + for (int j = 0; j < w_in; j++) { + uint8_t* outptr0 = dst + j * wout + (w_out - 1 - i) * 3; + *outptr0++ = *inptr0++; + *outptr0++ = *inptr0++; + *outptr0++ = *inptr0++; + } + } +} +#else 
+void rotate90_hwc(const uint8_t* src, uint8_t* dst, int w_in, int h_in) { + int w_out = h_in; + int h_out = w_in; + int win = w_in * 3; + int wout = w_out * 3; + int hremain = h_in % 8; + int stride_h = 4 * win; + int stride_h_w = 4 * win - 24; + int ww = w_out - 8; + // block 8*8. -- 8*8 + int i = 0; + for (i = 0; i < h_in - 7; i += 8) { + const uint8_t* inptr0 = src + i * win; + const uint8_t* inptr1 = inptr0 + win; + const uint8_t* inptr2 = inptr1 + win; + const uint8_t* inptr3 = inptr2 + win; + const uint8_t* inptr4 = inptr3 + win; + const uint8_t* inptr5 = inptr4 + win; + const uint8_t* inptr6 = inptr5 + win; + const uint8_t* inptr7 = inptr6 + win; + asm volatile( + "pld [%[ptr0]] @ preload a, 64byte\n" + "pld [%[ptr0], #64] @ preload a, 64byte\n" + "pld [%[ptr1]] @ preload a, 64byte\n" + "pld [%[ptr1], #64] @ preload a, 64byte\n" + "pld [%[ptr2]] @ preload a, 64byte\n" + "pld [%[ptr2], #64] @ preload a, 64byte\n" + "pld [%[ptr3]] @ preload a, 64byte\n" + "pld [%[ptr3], #64] @ preload a, 64byte\n" + "pld [%[ptr4]] @ preload a, 64byte\n" + "pld [%[ptr4], #64] @ preload a, 64byte\n" + "pld [%[ptr5]] @ preload a, 64byte\n" + "pld [%[ptr5], #64] @ preload a, 64byte\n" + "pld [%[ptr6]] @ preload a, 64byte\n" + "pld [%[ptr6], #64] @ preload a, 64byte\n" + "pld [%[ptr7]] @ preload a, 64byte\n" + "pld [%[ptr7], #64] @ preload a, 64byte\n" + : + : [ptr0] "r"(inptr0), + [ptr1] "r"(inptr1), + [ptr2] "r"(inptr2), + [ptr3] "r"(inptr3), + [ptr4] "r"(inptr4), + [ptr5] "r"(inptr5), + [ptr6] "r"(inptr6), + [ptr7] "r"(inptr7) + : "memory"); + int j = 0; + for (; j < w_in; j++) { + int tmpx = (ww - i) * 3; + uint8_t* outptr = dst + j * wout + tmpx; + *outptr++ = *inptr7++; + *outptr++ = *inptr7++; + *outptr++ = *inptr7++; + + *outptr++ = *inptr6++; + *outptr++ = *inptr6++; + *outptr++ = *inptr6++; + + *outptr++ = *inptr5++; + *outptr++ = *inptr5++; + *outptr++ = *inptr5++; + + *outptr++ = *inptr4++; + *outptr++ = *inptr4++; + *outptr++ = *inptr4++; + + *outptr++ = *inptr3++; + 
*outptr++ = *inptr3++; + *outptr++ = *inptr3++; + + *outptr++ = *inptr2++; + *outptr++ = *inptr2++; + *outptr++ = *inptr2++; + + *outptr++ = *inptr1++; + *outptr++ = *inptr1++; + *outptr++ = *inptr1++; + + *outptr++ = *inptr0++; + *outptr++ = *inptr0++; + *outptr++ = *inptr0++; + } + } + ww = w_out - 1; + for (; i < h_in; i++) { + const uint8_t* inptr0 = src + i * win; + for (int j = 0; j < w_in; j++) { + uint8_t* outptr0 = dst + j * wout + (ww - i) * 3; + *outptr0++ = *inptr0++; + *outptr0++ = *inptr0++; + *outptr0++ = *inptr0++; + } + } +} +#endif +/* +bgr1 bgr2 bgr3 +bgr4 bgr5 bgr6 +bgr7 bgr8 bgr9 +rotate: +bgr3 bgr6 bgr9 +bgr2 bgr5 bgr8 +bgr1 bgr4 bgr7 +*/ +// dst = (h_out - 1) * w_out +// 类似rotate90,将输出结果倒着输出 或者先rotate90,然后沿Y轴翻转 +#ifdef __aarch64__ +void rotate270_hwc(const uint8_t* src, uint8_t* dst, int w_in, int h_in) { + int w_out = h_in; + int h_out = w_in; + int win = w_in * 3; + int wout = w_out * 3; + int64_t stride_h = 4 * win; + int64_t stride_h_w = 4 * win - 24; + int hout = h_out - 1; + // block 8*8. 
-- 8*8 + int i = 0; + for (; i < h_in - 7; i += 8) { + const uint8_t* inptr0 = src + i * win; + const uint8_t* inptr1 = inptr0 + win; + const uint8_t* inptr2 = inptr1 + win; + const uint8_t* inptr3 = inptr2 + win; + + asm volatile( + "prfm pldl1keep, [%[ptr0]] \n" + "prfm pldl1keep, [%[ptr0], #64] \n" + "prfm pldl1keep, [%[ptr1]] \n" + "prfm pldl1keep, [%[ptr1], #64] \n" + "prfm pldl1keep, [%[ptr2]] \n" + "prfm pldl1keep, [%[ptr2], #64] \n" + "prfm pldl1keep, [%[ptr3]] \n" + "prfm pldl1keep, [%[ptr3], #64] \n" + : + : [ptr0] "r"(inptr0), + [ptr1] "r"(inptr1), + [ptr2] "r"(inptr2), + [ptr3] "r"(inptr3) + : "memory"); + int j = 0; + for (; j < w_in - 7; j += 8) { + uint8_t* outptr0 = dst + (hout - j) * wout + i * 3; + uint8_t* outptr1 = outptr0 - wout; + uint8_t* outptr2 = outptr1 - wout; + uint8_t* outptr3 = outptr2 - wout; + uint8_t* outptr4 = outptr3 - wout; + uint8_t* outptr5 = outptr4 - wout; + uint8_t* outptr6 = outptr5 - wout; + uint8_t* outptr7 = outptr6 - wout; + asm volatile( + "ld3 {v0.8b, v1.8b, v2.8b}, [%[inptr0]] \n" // v0={00,01,02, 03, + // 04, 05, 06, 07}" + "ld3 {v3.8b, v4.8b, v5.8b}, [%[inptr1]] \n" // v0={10,11,12, 13, + // 14, 15, 16, 17}" + "ld3 {v6.8b, v7.8b, v8.8b}, [%[inptr2]] \n" // v0={20,21,22, 23, + // 24, 25, 26, 27}" + "ld3 {v9.8b, v10.8b, v11.8b}, [%[inptr3]] \n" // v0={30,31,32, + // 33, 34, 35, 36, + // 37}" + + "add %[inptr0], %[inptr0], %[stride_h] \n" // 4 + 4*w_in + "add %[inptr1], %[inptr1], %[stride_h] \n" // 5 + "add %[inptr2], %[inptr2], %[stride_h] \n" // 6 + "add %[inptr3], %[inptr3], %[stride_h] \n" // 7 + + // b + "trn1 v12.8b, v0.8b, v3.8b \n" // v4={00 10 02 12 04 14 + // 06 16 } + "trn1 v15.8b, v6.8b, v9.8b \n" // v4={20 30 22 32 24 34 + // 26 36 } + + "trn2 v18.8b, v0.8b, v3.8b \n" // v5={01 11 03 13 05 15 + // 07 17 } + "trn2 v21.8b, v6.8b, v9.8b \n" // v7={21 31 23 33 25 35 + // 27 37 } + + // g + "trn1 v13.8b, v1.8b, v4.8b \n" // v4={00 10 02 12 04 14 + // 06 16 } + "trn1 v16.8b, v7.8b, v10.8b \n" // v4={20 30 22 
32 24 34 + // 26 36 } + + "trn2 v19.8b, v1.8b, v4.8b \n" // v5={01 11 03 13 05 15 + // 07 17 } + "trn2 v22.8b, v7.8b, v10.8b \n" // v7={21 31 23 33 25 35 + // 27 37 } + + // r + "trn1 v14.8b, v2.8b, v5.8b \n" // v4={00 10 02 12 04 14 + // 06 16 } + "trn1 v17.8b, v8.8b, v11.8b \n" // v4={20 30 22 32 24 34 + // 26 36 } + + "trn2 v20.8b, v2.8b, v5.8b \n" // v5={01 11 03 13 05 15 + // 07 17 } + "trn2 v23.8b, v8.8b, v11.8b \n" // v7={21 31 23 33 25 35 + // 27 37 } + + // b1 + "trn1 v24.4h, v12.4h, v15.4h \n" // v0={00 10 20 30 04 14 + // 24 34} + "trn1 v27.4h, v18.4h, v21.4h \n" // v2={01 11 21 31 05 15 + // 25 35} + + "trn2 v0.4h, v12.4h, v15.4h \n" // v1={02 12 22 32 06 16 + // 26 36} + "trn2 v3.4h, v18.4h, v21.4h \n" // v3={03 13 23 33 07 17 + // 27 37} + + // g1 + "trn1 v25.4h, v13.4h, v16.4h \n" // v0={00 10 20 30 04 14 + // 24 34} + "trn1 v28.4h, v19.4h, v22.4h \n" // v2={01 11 21 31 05 15 + // 25 35} + + "trn2 v1.4h, v13.4h, v16.4h \n" // v1={02 12 22 32 06 16 + // 26 36} + "trn2 v4.4h, v19.4h, v22.4h \n" // v3={03 13 23 33 07 17 + // 27 37} + + // r1 + "trn1 v26.4h, v14.4h, v17.4h \n" // v0={00 10 20 30 04 14 + // 24 34} + "trn1 v29.4h, v20.4h, v23.4h \n" // v2={01 11 21 31 05 15 + // 25 35} + + "trn2 v2.4h, v14.4h, v17.4h \n" // v1={02 12 22 32 06 16 + // 26 36} + "trn2 v5.4h, v20.4h, v23.4h \n" // v3={03 13 23 33 07 17 + // 27 37} + + "ld3 {v12.8b, v13.8b, v14.8b}, [%[inptr0]] \n" // v0={00,01,02, + // 03, 04, 05, 06, + // 07}" + "ld3 {v15.8b, v16.8b, v17.8b}, [%[inptr1]] \n" // v0={10,11,12, + // 13, 14, 15, 16, + // 17}" + "ld3 {v6.8b, v7.8b, v8.8b}, [%[inptr2]] \n" // v0={20,21,22, 23, + // 24, 25, 26, 27}" + "ld3 {v9.8b, v10.8b, v11.8b}, [%[inptr3]] \n" // v0={30,31,32, + // 33, 34, 35, 36, + // 37}" + + "sub %[inptr0], %[inptr0], %[stride_h_w] \n" // 4 - 4*w_in + 8 + "sub %[inptr1], %[inptr1], %[stride_h_w] \n" // 5 + "sub %[inptr2], %[inptr2], %[stride_h_w] \n" // 6 + "sub %[inptr3], %[inptr3], %[stride_h_w] \n" // 7 + + // b2 + "trn1 v18.8b, v12.8b, 
v15.8b \n" // v4={00 10 02 12 04 14 + // 06 16 } + "trn1 v21.8b, v6.8b, v9.8b \n" // v4={20 30 22 32 24 34 + // 26 36 } + // g2 + "trn1 v19.8b, v13.8b, v16.8b \n" // v4={00 10 02 12 04 14 + // 06 16 } + "trn1 v22.8b, v7.8b, v10.8b \n" // v4={20 30 22 32 24 34 + // 26 36 } + // r2 + "trn1 v20.8b, v14.8b, v17.8b \n" // v4={00 10 02 12 04 14 + // 06 16 } + "trn1 v23.8b, v8.8b, v11.8b \n" // v4={20 30 22 32 24 34 + // 26 36 } + + "trn2 v12.8b, v12.8b, v15.8b \n" // v5={01 11 03 13 05 15 + // 07 17 } + "trn2 v13.8b, v13.8b, v16.8b \n" // v5={01 11 03 13 05 15 + // 07 17 } + "trn2 v14.8b, v14.8b, v17.8b \n" // v5={01 11 03 13 05 15 + // 07 17 } + + "trn2 v15.8b, v6.8b, v9.8b \n" // v7={21 31 23 33 25 35 + // 27 37 } + "trn2 v16.8b, v7.8b, v10.8b \n" // v7={21 31 23 33 25 35 + // 27 37 } + "trn2 v17.8b, v8.8b, v11.8b \n" // v7={21 31 23 33 25 35 + // 27 37 } + + // b2 + "trn1 v6.4h, v18.4h, v21.4h \n" // v0={00 10 20 30 04 14 + // 24 34} + // g2 + "trn1 v7.4h, v19.4h, v22.4h \n" // v0={00 10 20 30 04 14 + // 24 34} + // r2 + "trn1 v8.4h, v20.4h, v23.4h \n" // v0={00 10 20 30 04 14 + // 24 34} + + // bgr + "trn1 v9.4h, v12.4h, v15.4h \n" // v2={01 11 21 31 05 15 + // 25 35} + "trn1 v10.4h, v13.4h, v16.4h \n" // v2={01 11 21 31 05 15 + // 25 35} + "trn1 v11.4h, v14.4h, v17.4h \n" // v2={01 11 21 31 05 15 + // 25 35} + + // bgr + "trn2 v18.4h, v18.4h, v21.4h \n" // v1={02 12 22 32 06 16 + // 26 36} + "trn2 v19.4h, v19.4h, v22.4h \n" // v1={02 12 22 32 06 16 + // 26 36} + "trn2 v20.4h, v20.4h, v23.4h \n" // v1={02 12 22 32 06 16 + // 26 36} + + // bgr + "trn2 v21.4h, v12.4h, v15.4h \n" // v3={03 13 23 33 07 17 + // 27 37} + "trn2 v22.4h, v13.4h, v16.4h \n" // v3={03 13 23 33 07 17 + // 27 37} + "trn2 v23.4h, v14.4h, v17.4h \n" // v3={03 13 23 33 07 17 + // 27 37} + + // b1 b2 + "trn1 v12.2s, v24.2s, v6.2s \n" // v8={00 10 20 30 40 50 + // 60 70} b + "trn1 v13.2s, v25.2s, v7.2s \n" // v6={00 10 20 30 40 50 + // 60 70} g + "trn1 v14.2s, v26.2s, v8.2s \n" // v6={00 10 20 30 40 
50 + // 60 70} r + + // b1 b2 + "trn2 v15.2s, v24.2s, v6.2s \n" // v8={04 14 24 34 44 54 + // 64 74} b + "trn2 v16.2s, v25.2s, v7.2s \n" // v6={04 14 24 34 44 54 + // 64 74} g + "trn2 v17.2s, v26.2s, v8.2s \n" // v6={04 14 24 34 44 54 + // 64 74} r + + // b1 b2 + "trn1 v6.2s, v27.2s, v9.2s \n" // v8={01 11 20 30 40 50 + // 60 70} b + "trn1 v7.2s, v28.2s, v10.2s \n" // v6={01 10 20 30 40 50 + // 60 70} g + "trn1 v8.2s, v29.2s, v11.2s \n" // v6={01 10 20 30 40 50 + // 60 70} r + + // b1 b2 + "trn2 v24.2s, v27.2s, v9.2s \n" // v8={05 10 20 30 40 50 + // 60 70} b + "trn2 v25.2s, v28.2s, v10.2s \n" // v6={05 10 20 30 40 50 + // 60 70} g + "trn2 v26.2s, v29.2s, v11.2s \n" // v6={05 10 20 30 40 50 + // 60 70} r + + "st3 {v12.8b, v13.8b, v14.8b}, [%[outptr0]], #24 \n" // 00 10 20 30 04 14 24 34 + "st3 {v15.8b, v16.8b, v17.8b}, [%[outptr4]], #24 \n" // 02 12 22 32 + // b1 b2 + "trn1 v9.2s, v0.2s, v18.2s \n" // v8={02 11 20 30 40 50 + // 60 70} b + "trn1 v10.2s, v1.2s, v19.2s \n" // v6={02 10 20 30 40 50 + // 60 70} g + "trn1 v11.2s, v2.2s, v20.2s \n" // v6={02 10 20 30 40 50 + // 60 70} r + + "trn2 v27.2s, v0.2s, v18.2s \n" // v8={06 11 20 30 40 50 + // 60 70} b + "trn2 v28.2s, v1.2s, v19.2s \n" // v6={06 10 20 30 40 50 + // 60 70} g + "trn2 v29.2s, v2.2s, v20.2s \n" // v6={06 10 20 30 40 50 + // 60 70} r + + // b1 b2 + "trn1 v0.2s, v3.2s, v21.2s \n" // v8={03 11 20 30 40 50 + // 60 70} b + "trn1 v1.2s, v4.2s, v22.2s \n" // v6={03 10 20 30 40 50 + // 60 70} g + "trn1 v2.2s, v5.2s, v23.2s \n" // v6={03 10 20 30 40 50 + // 60 70} r + + "trn2 v18.2s, v3.2s, v21.2s \n" // v8={07 11 20 30 40 50 + // 60 70} b + "trn2 v19.2s, v4.2s, v22.2s \n" // v6={07 10 20 30 40 50 + // 60 70} g + "trn2 v20.2s, v5.2s, v23.2s \n" // v6={07 10 20 30 40 50 + // 60 70} r + + "st3 {v6.8b, v7.8b, v8.8b}, [%[outptr1]], #24 \n" // 00 + // 10 + // 20 + // 30 + // 04 + // 14 + // 24 + // 34 + "st3 {v24.8b, v25.8b, v26.8b}, [%[outptr5]], #24 \n" // 02 12 22 32 + + "st3 {v9.8b, v10.8b, v11.8b}, 
[%[outptr2]], #24 \n" // 00 + // 10 + // 20 + // 30 + // 04 + // 14 + // 24 + // 34 + "st3 {v27.8b, v28.8b, v29.8b}, [%[outptr6]], #24 \n" // 02 12 22 32 + + "st3 {v0.8b, v1.8b, v2.8b}, [%[outptr3]], #24 \n" // 00 + // 10 + // 20 + // 30 + // 04 + // 14 + // 24 + // 34 + "st3 {v18.8b, v19.8b, v20.8b}, [%[outptr7]], #24 \n" // 02 12 22 32 + + : [inptr0] "+r"(inptr0), + [inptr1] "+r"(inptr1), + [inptr2] "+r"(inptr2), + [inptr3] "+r"(inptr3), + [outptr0] "+r"(outptr0), + [outptr1] "+r"(outptr1), + [outptr2] "+r"(outptr2), + [outptr3] "+r"(outptr3), + [outptr4] "+r"(outptr4), + [outptr5] "+r"(outptr5), + [outptr6] "+r"(outptr6), + [outptr7] "+r"(outptr7), + [stride_h] "+r"(stride_h), + [stride_h_w] "+r"(stride_h_w) + : + : "v0", + "v1", + "v2", + "v3", + "v4", + "v5", + "v6", + "v7", + "v8", + "v9", + "v10", + "v11", + "v12", + "v13", + "v14", + "v15", + "v16", + "v17", + "v18", + "v19", + "v20", + "v21", + "v22", + "v23", + "v24", + "v25", + "v26", + "v27", + "v28", + "v29"); + } + const uint8_t* inptr4 = inptr3 + win; + const uint8_t* inptr5 = inptr4 + win; + const uint8_t* inptr6 = inptr5 + win; + const uint8_t* inptr7 = inptr6 + win; + for (; j < w_in; j++) { + int tmpx = i * 3; + uint8_t* outptr = dst + (hout - j) * wout + tmpx; + *outptr++ = *inptr0++; + *outptr++ = *inptr0++; + *outptr++ = *inptr0++; + + *outptr++ = *inptr1++; + *outptr++ = *inptr1++; + *outptr++ = *inptr1++; + + *outptr++ = *inptr2++; + *outptr++ = *inptr2++; + *outptr++ = *inptr2++; + + *outptr++ = *inptr3++; + *outptr++ = *inptr3++; + *outptr++ = *inptr3++; + + *outptr++ = *inptr4++; + *outptr++ = *inptr4++; + *outptr++ = *inptr4++; + + *outptr++ = *inptr5++; + *outptr++ = *inptr5++; + *outptr++ = *inptr5++; + + *outptr++ = *inptr6++; + *outptr++ = *inptr6++; + *outptr++ = *inptr6++; + + *outptr++ = *inptr7++; + *outptr++ = *inptr7++; + *outptr++ = *inptr7++; + } + } + for (; i < h_in; i++) { + const uint8_t* inptr0 = src + i * win; + for (int j = 0; j < w_in; j++) { + uint8_t* outptr0 = dst 
+ (hout - j) * wout + i * 3; + *outptr0++ = *inptr0++; + *outptr0++ = *inptr0++; + *outptr0++ = *inptr0++; + } + } +} +#else +void rotate270_hwc(const uint8_t* src, uint8_t* dst, int w_in, int h_in) { + int w_out = h_in; + int h_out = w_in; + int win = w_in * 3; + int wout = w_out * 3; + int hremain = h_in % 8; + int stride_h = 4 * win; + int stride_h_w = 4 * win - 24; + int hout = h_out - 1; + // block 8*8. -- 8*8 + int i = 0; + for (; i < h_in - 7; i += 8) { + const uint8_t* inptr0 = src + i * win; + const uint8_t* inptr1 = inptr0 + win; + const uint8_t* inptr2 = inptr1 + win; + const uint8_t* inptr3 = inptr2 + win; + const uint8_t* inptr4 = inptr3 + win; + const uint8_t* inptr5 = inptr4 + win; + const uint8_t* inptr6 = inptr5 + win; + const uint8_t* inptr7 = inptr6 + win; + asm volatile( + "pld [%[ptr0]] @ preload a, 64byte\n" + "pld [%[ptr0], #64] @ preload a, 64byte\n" + "pld [%[ptr1]] @ preload a, 64byte\n" + "pld [%[ptr1], #64] @ preload a, 64byte\n" + "pld [%[ptr2]] @ preload a, 64byte\n" + "pld [%[ptr2], #64] @ preload a, 64byte\n" + "pld [%[ptr3]] @ preload a, 64byte\n" + "pld [%[ptr3], #64] @ preload a, 64byte\n" + "pld [%[ptr4]] @ preload a, 64byte\n" + "pld [%[ptr4], #64] @ preload a, 64byte\n" + "pld [%[ptr5]] @ preload a, 64byte\n" + "pld [%[ptr5], #64] @ preload a, 64byte\n" + "pld [%[ptr6]] @ preload a, 64byte\n" + "pld [%[ptr6], #64] @ preload a, 64byte\n" + "pld [%[ptr7]] @ preload a, 64byte\n" + "pld [%[ptr7], #64] @ preload a, 64byte\n" + : + : [ptr0] "r"(inptr0), + [ptr1] "r"(inptr1), + [ptr2] "r"(inptr2), + [ptr3] "r"(inptr3), + [ptr4] "r"(inptr4), + [ptr5] "r"(inptr5), + [ptr6] "r"(inptr6), + [ptr7] "r"(inptr7) + : "memory"); + int j = 0; + for (; j < w_in; j++) { + int tmpx = i * 3; + uint8_t* outptr = dst + (hout - j) * wout + tmpx; + *outptr++ = *inptr0++; + *outptr++ = *inptr0++; + *outptr++ = *inptr0++; + + *outptr++ = *inptr1++; + *outptr++ = *inptr1++; + *outptr++ = *inptr1++; + + *outptr++ = *inptr2++; + *outptr++ = *inptr2++; + 
*outptr++ = *inptr2++; + + *outptr++ = *inptr3++; + *outptr++ = *inptr3++; + *outptr++ = *inptr3++; + + *outptr++ = *inptr4++; + *outptr++ = *inptr4++; + *outptr++ = *inptr4++; + + *outptr++ = *inptr5++; + *outptr++ = *inptr5++; + *outptr++ = *inptr5++; + + *outptr++ = *inptr6++; + *outptr++ = *inptr6++; + *outptr++ = *inptr6++; + + *outptr++ = *inptr7++; + *outptr++ = *inptr7++; + *outptr++ = *inptr7++; + } + } + for (; i < h_in; i++) { + const uint8_t* inptr0 = src + i * win; + for (int j = 0; j < w_in; j++) { + uint8_t* outptr0 = dst + (hout - j) * wout + i * 3; + *outptr0++ = *inptr0++; + *outptr0++ = *inptr0++; + *outptr0++ = *inptr0++; + } + } +} +#endif +/* +bgr1 bgr2 bgr3 +bgr4 bgr5 bgr6 +bgr7 bgr8 bgr9 +rotate: +bgr9 bgr8 bgr7 +bgr6 bgr5 bgr4 +bgr3 bgr2 bgr1 +*/ +// filp y +#ifdef __aarch64__ +void rotate180_hwc(const uint8_t* src, uint8_t* dst, int w, int h_in) { + int w_in = w * 3; + uint8_t zerobuff[30000]; // [w_in]; + memset(zerobuff, 0, w_in * sizeof(uint8_t)); + int64_t stride_w = 24; + for (int i = 0; i < h_in; i += 4) { + const uint8_t* inptr0 = src + i * w_in; + const uint8_t* inptr1 = inptr0 + w_in; + const uint8_t* inptr2 = inptr1 + w_in; + const uint8_t* inptr3 = inptr2 + w_in; + + uint8_t* outptr0 = dst + (h_in - i) * w_in - stride_w; // last col + uint8_t* outptr1 = outptr0 - w_in; + uint8_t* outptr2 = outptr1 - w_in; + uint8_t* outptr3 = outptr2 - w_in; + + asm volatile( + "prfm pldl1keep, [%[ptr0]] \n" + "prfm pldl1keep, [%[ptr1]] \n" + "prfm pldl1keep, [%[ptr2]] \n" + "prfm pldl1keep, [%[ptr3]] \n" + : + : [ptr0] "r"(inptr0), + [ptr1] "r"(inptr1), + [ptr2] "r"(inptr2), + [ptr3] "r"(inptr3) + : "memory"); + int j = 0; + for (; j < w - 7; j += 8) { + if (i + 3 >= h_in) { + switch ((i + 3) - h_in) { + case 3: + inptr0 = zerobuff; + outptr0 = zerobuff; + case 2: + inptr1 = zerobuff; + outptr1 = zerobuff; + case 1: + inptr2 = zerobuff; + outptr2 = zerobuff; + case 0: + inptr3 = zerobuff; + outptr3 = zerobuff; + default: + break; + } + } + asm 
volatile( + "ld3 {v0.8b, v1.8b, v2.8b}, [%[inptr0]], #24 \n" // v0={00,01,02, + // 03, 04, 05, + // 06, 07}" + "ld3 {v3.8b, v4.8b, v5.8b}, [%[inptr1]], #24 \n" // v0={10,11,12, + // 13, 14, 15, + // 16, 17}" + "ld3 {v6.8b, v7.8b, v8.8b}, [%[inptr2]], #24 \n" // v0={20,21,22, + // 23, 24, 25, + // 26, 27}" + "ld3 {v9.8b, v10.8b, v11.8b}, [%[inptr3]], #24 \n" // v0={30,31,32, + // 33, 34, 35, + // 36, 37}" + + "rev64 v12.8b, v0.8b \n" //@ reverse 07 06 05 04 03 + // 02 01 00 b + "rev64 v13.8b, v1.8b \n" //@ reverse 07 06 05 04 03 + // 02 01 00 g + "rev64 v14.8b, v2.8b \n" //@ reverse 07 06 05 04 03 + // 02 01 00 r + + "rev64 v15.8b, v3.8b \n" //@ reverse 07 06 05 04 03 + // 02 01 00 + "rev64 v16.8b, v4.8b \n" //@ reverse 07 06 05 04 03 + // 02 01 00 + "rev64 v17.8b, v5.8b \n" //@ reverse 07 06 05 04 03 + // 02 01 00 + + "rev64 v18.8b, v6.8b \n" //@ reverse 07 06 05 04 03 + // 02 01 00 + "rev64 v19.8b, v7.8b \n" //@ reverse 07 06 05 04 03 + // 02 01 00 + "rev64 v20.8b, v8.8b \n" //@ reverse 07 06 05 04 03 + // 02 01 00 + + "rev64 v21.8b, v9.8b \n" //@ reverse 07 06 05 04 03 + // 02 01 00 + "rev64 v22.8b, v10.8b \n" //@ reverse 07 06 05 04 03 + // 02 01 00 + "rev64 v23.8b, v11.8b \n" //@ reverse 07 06 05 04 03 + // 02 01 00 + + "prfm pldl1keep, [%[inptr0]] \n" + "prfm pldl1keep, [%[inptr1]] \n" + "prfm pldl1keep, [%[inptr2]] \n" + "prfm pldl1keep, [%[inptr3]] \n" + + "st3 {v12.8b, v13.8b, v14.8b}, [%[outptr0]] \n" // 00 10 + // 20 30 + // 04 14 + // 24 34 + "st3 {v15.8b, v16.8b, v17.8b}, [%[outptr1]] \n" // 02 12 + // 22 32 + "st3 {v18.8b, v19.8b, v20.8b}, [%[outptr2]] \n" // 01 11 + // 21 31 + "st3 {v21.8b, v22.8b, v23.8b}, [%[outptr3]] \n" // 03 13 + // 23 33 + + "sub %[outptr0], %[outptr0], %[stride_w] \n" //@ ptr - stride_w + "sub %[outptr1], %[outptr1], %[stride_w] \n" + "sub %[outptr2], %[outptr2], %[stride_w] \n" + "sub %[outptr3], %[outptr3], %[stride_w] \n" + + : [inptr0] "+r"(inptr0), + [inptr1] "+r"(inptr1), + [inptr2] "+r"(inptr2), + [inptr3] "+r"(inptr3), 
+ [outptr0] "+r"(outptr0), + [outptr1] "+r"(outptr1), + [outptr2] "+r"(outptr2), + [outptr3] "+r"(outptr3), + [stride_w] "+r"(stride_w) + : + : "v0", + "v1", + "v2", + "v3", + "v4", + "v5", + "v6", + "v7", + "v8", + "v9", + "v10", + "v11", + "v12", + "v13", + "v14", + "v15", + "v16", + "v17", + "v18", + "v19", + "v20", + "v21", + "v22", + "v23"); + } + outptr3 += stride_w - 3; + outptr2 += stride_w - 3; + outptr1 += stride_w - 3; + outptr0 += stride_w - 3; + for (; j < w; j++) { + if (i + 3 >= h_in) { + switch ((i + 3) - h_in) { + case 0: + *outptr2++ = *inptr2++; + *outptr2++ = *inptr2++; + *outptr2++ = *inptr2++; + outptr2 -= 6; + case 1: + *outptr1++ = *inptr1++; + *outptr1++ = *inptr1++; + *outptr1++ = *inptr1++; + outptr1 -= 6; + case 2: + *outptr0++ = *inptr0++; + *outptr0++ = *inptr0++; + *outptr0++ = *inptr0++; + outptr0 -= 6; + case 3: + // inptr3 = zerobuff; + default: + break; + } + } else { + *outptr3++ = *inptr3++; + *outptr3++ = *inptr3++; + *outptr3++ = *inptr3++; + outptr3 -= 6; + + *outptr2++ = *inptr2++; + *outptr2++ = *inptr2++; + *outptr2++ = *inptr2++; + outptr2 -= 6; + + *outptr1++ = *inptr1++; + *outptr1++ = *inptr1++; + *outptr1++ = *inptr1++; + outptr1 -= 6; + + *outptr0++ = *inptr0++; + *outptr0++ = *inptr0++; + *outptr0++ = *inptr0++; + outptr0 -= 6; + } + } + } + delete[] zerobuff; +} +#else +void rotate180_hwc(const uint8_t* src, uint8_t* dst, int w, int h_in) { + int w_in = w * 3; + uint8_t zerobuff[30000]; // w_in + memset(zerobuff, 0, w_in * sizeof(uint8_t)); + int stride_w = 24; + // 4*8 + for (int i = 0; i < h_in; i += 4) { + const uint8_t* inptr0 = src + i * w_in; + const uint8_t* inptr1 = inptr0 + w_in; + const uint8_t* inptr2 = inptr1 + w_in; + const uint8_t* inptr3 = inptr2 + w_in; + + uint8_t* outptr0 = dst + (h_in - i) * w_in - stride_w; // last + uint8_t* outptr1 = outptr0 - w_in; + uint8_t* outptr2 = outptr1 - w_in; + uint8_t* outptr3 = outptr2 - w_in; + asm volatile( + "pld [%[ptr0]] @ preload a, 64byte\n" + "pld [%[ptr1]] 
@ preload a, 64byte\n" + "pld [%[ptr2]] @ preload a, 64byte\n" + "pld [%[ptr3]] @ preload a, 64byte\n" + : + : [ptr0] "r"(inptr0), + [ptr1] "r"(inptr1), + [ptr2] "r"(inptr2), + [ptr3] "r"(inptr3) + : "memory"); + int j = 0; + for (; j < w - 7; j += 8) { + if (i + 3 >= h_in) { + switch ((i + 3) - h_in) { + case 3: + inptr0 = zerobuff; + outptr0 = zerobuff; + case 2: + inptr1 = zerobuff; + outptr1 = zerobuff; + case 1: + inptr2 = zerobuff; + outptr2 = zerobuff; + case 0: + inptr3 = zerobuff; + outptr3 = zerobuff; + default: + break; + } + } + asm volatile( + "vld3.8 {d0, d1, d2}, [%[inptr0]]! @ zip load r0, d0 =00 01 02 03 " + "04 05 06 07\n" + "vld3.8 {d3, d4, d5}, [%[inptr1]]! @ zip load r1, d2 =10 11 12 13 " + "14 15 16 17\n" + "vld3.8 {d6, d7, d8}, [%[inptr2]]! @ zip load r1, d4 =20 21 22 23 " + "24 25 26 27\n" + "vld3.8 {d9, d10, d11}, [%[inptr3]]! @ zip load r1, d6 = 30 31 32 " + "33 34 35 36 37\n" + + "vrev64.8 d12, d0 @ reverse 07 06 05 04 03 02 01 00 \n" + "vrev64.8 d13, d1 @ reverse 07 06 05 04 03 02 01 00 \n" + "vrev64.8 d14, d2 @ reverse 07 06 05 04 03 02 01 00 \n" + + "vrev64.8 d15, d3 @ reverse 07 06 05 04 03 02 01 00 \n" + "vrev64.8 d16, d4 @ reverse 07 06 05 04 03 02 01 00 \n" + "vrev64.8 d17, d5 @ reverse 07 06 05 04 03 02 01 00 \n" + + "vrev64.8 d18, d6 @ reverse 07 06 05 04 03 02 01 00 \n" + "vrev64.8 d19, d7 @ reverse 07 06 05 04 03 02 01 00 \n" + "vrev64.8 d20, d8 @ reverse 07 06 05 04 03 02 01 00 \n" + + "vrev64.8 d21, d9 @ reverse 07 06 05 04 03 02 01 00 \n" + "vrev64.8 d22, d10 @ reverse 07 06 05 04 03 02 01 00 " + "\n" + "vrev64.8 d23, d11 @ reverse 07 06 05 04 03 02 01 00 " + "\n" + + "pld [%[inptr0]] @ preload a, 64byte\n" + "pld [%[inptr1]] @ preload a, 64byte\n" + "pld [%[inptr2]] @ preload a, 64byte\n" + "pld [%[inptr3]] @ preload a, 64byte\n" + + "vst3.8 {d12, d13, d14}, [%[outptr0]] @ write " + "d0(q0,low),r00,r10 20 30\n" + "vst3.8 {d15, d16, d17}, [%[outptr1]] @ write " + "d4(q0,low),r01,r11 21 31\n" + "vst3.8 {d18, d19, d20}, 
[%[outptr2]] @ write " + "d4(q0,low),r01,r11 21 31\n" + "vst3.8 {d21, d22, d23}, [%[outptr3]] @ write " + "d4(q0,low),r01,r11 21 31\n" + + "sub %[outptr0], %[stride_w] @ ptr - stride_w \n" + "sub %[outptr1], %[stride_w] @ ptr - stride_w \n" + "sub %[outptr2], %[stride_w] @ ptr - stride_w \n" + "sub %[outptr3], %[stride_w] @ ptr - stride_w \n" + + : [inptr0] "+r"(inptr0), + [inptr1] "+r"(inptr1), + [inptr2] "+r"(inptr2), + [inptr3] "+r"(inptr3), + [outptr0] "+r"(outptr0), + [outptr1] "+r"(outptr1), + [outptr2] "+r"(outptr2), + [outptr3] "+r"(outptr3), + [stride_w] "+r"(stride_w) + : + : "q0", + "q1", + "q2", + "q3", + "q4", + "q5", + "q6", + "q7", + "q8", + "q9", + "q10", + "q11", + "q12"); + } + outptr3 += stride_w - 3; + outptr2 += stride_w - 3; + outptr1 += stride_w - 3; + outptr0 += stride_w - 3; + for (; j < w; j++) { + if (i + 3 >= h_in) { + switch ((i + 3) - h_in) { + case 0: + *outptr2++ = *inptr2++; + *outptr2++ = *inptr2++; + *outptr2++ = *inptr2++; + outptr2 -= 6; + case 1: + *outptr1++ = *inptr1++; + *outptr1++ = *inptr1++; + *outptr1++ = *inptr1++; + outptr1 -= 6; + case 2: + *outptr0++ = *inptr0++; + *outptr0++ = *inptr0++; + *outptr0++ = *inptr0++; + outptr0 -= 6; + case 3: + // inptr3 = zerobuff; + default: + break; + } + } else { + *outptr3++ = *inptr3++; + *outptr3++ = *inptr3++; + *outptr3++ = *inptr3++; + outptr3 -= 6; + + *outptr2++ = *inptr2++; + *outptr2++ = *inptr2++; + *outptr2++ = *inptr2++; + outptr2 -= 6; + + *outptr1++ = *inptr1++; + *outptr1++ = *inptr1++; + *outptr1++ = *inptr1++; + outptr1 -= 6; + + *outptr0++ = *inptr0++; + *outptr0++ = *inptr0++; + *outptr0++ = *inptr0++; + outptr0 -= 6; + } + } + } + delete[] zerobuff; +} +#endif +} // namespace cv +} // namespace utils +} // namespace lite +} // namespace paddle diff --git a/lite/utils/cv/bgr_rotate.h b/lite/utils/cv/bgr_rotate.h new file mode 100644 index 0000000000000000000000000000000000000000..bb85da56955154863eb17595ebb5b58d79cd6a83 --- /dev/null +++ 
b/lite/utils/cv/bgr_rotate.h @@ -0,0 +1,27 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include +namespace paddle { +namespace lite { +namespace utils { +namespace cv { +void bgr_rotate_hwc( + const uint8_t* src, uint8_t* dst, int w_in, int h_in, int angle); +} // namespace cv +} // namespace utils +} // namespace lite +} // namespace paddle diff --git a/lite/utils/cv/image_resize.cc b/lite/utils/cv/image_resize.cc index 2fc884a0fa4fd359c150115a20bdf751094b4687..f4a80ed6255186b8c1b59a8d56fd64b78c9bc1d2 100644 --- a/lite/utils/cv/image_resize.cc +++ b/lite/utils/cv/image_resize.cc @@ -664,15 +664,6 @@ void resize(const uint8_t* src, memcpy(dst, src, sizeof(uint8_t) * size); return; } - double scale_x = static_cast(srcw) / dstw; - double scale_y = static_cast(srch) / dsth; - - int* buf = new int[dstw * 2 + dsth * 3]; - - int* xofs = buf; - int* yofs = buf + dstw; - int16_t* ialpha = reinterpret_cast(buf + dstw + dsth); - int16_t* ibeta = reinterpret_cast(buf + 2 * dstw + dsth); int w_out = dstw; int w_in = srcw; @@ -692,12 +683,19 @@ void resize(const uint8_t* src, w_in = srcw * 3; w_out = dstw * 3; num = 3; - } else if (srcFormat == BGRA || srcFormat == RGBA) { w_in = srcw * 4; w_out = dstw * 4; num = 4; } + double scale_x = static_cast(srcw) / dstw; + double scale_y = static_cast(srch) / dsth; + + int* buf = new int[dstw * 2 + dsth * 3]; + int* xofs = buf; + int* 
yofs = buf + dstw; + int16_t* ialpha = reinterpret_cast(buf + dstw + dsth); + int16_t* ibeta = reinterpret_cast(buf + 2 * dstw + dsth); compute_xy( srcw, srch, dstw, orih, num, scale_x, scale_y, xofs, yofs, ialpha, ibeta); @@ -726,10 +724,10 @@ void resize(const uint8_t* src, int remain = w_out % 8; int32x4_t _v2 = vdupq_n_s32(2); int prev_sy1 = -1; + int16_t* rowsbuf0 = new int16_t[w_out + 1]; + int16_t* rowsbuf1 = new int16_t[w_out + 1]; #pragma omp parallel for for (int dy = 0; dy < dsth; dy++) { - int16_t* rowsbuf0 = new int16_t[w_out + 1]; - int16_t* rowsbuf1 = new int16_t[w_out + 1]; int sy = yofs[dy]; if (dy >= orih) { xofs = xofs1; @@ -853,8 +851,6 @@ void resize(const uint8_t* src, 2); } ibeta += 2; - delete[] rowsbuf0; - delete[] rowsbuf1; } if (orih < dsth) { // uv delete[] xofs1; @@ -862,6 +858,8 @@ void resize(const uint8_t* src, delete[] ialpha1; } delete[] buf; + delete[] rowsbuf0; + delete[] rowsbuf1; } // compute xofs, yofs, alpha, beta void compute_xy(int srcw, diff --git a/lite/utils/cv/image_rotate.cc b/lite/utils/cv/image_rotate.cc index 4ef757793ec009f4a4807499b1c48ac908393966..c87fc4def24220e240168a7114910c7c9ecee5ba 100644 --- a/lite/utils/cv/image_rotate.cc +++ b/lite/utils/cv/image_rotate.cc @@ -15,6 +15,7 @@ #include "lite/utils/cv/image_rotate.h" #include #include +#include "lite/utils/cv/bgr_rotate.h" namespace paddle { namespace lite { namespace utils { @@ -31,7 +32,8 @@ void ImageRotate::choose(const uint8_t* src, if (srcFormat == GRAY) { rotate_hwc1(src, dst, srcw, srch, degree); } else if (srcFormat == BGR || srcFormat == RGB) { - rotate_hwc3(src, dst, srcw, srch, degree); + // rotate_hwc3(src, dst, srcw, srch, degree); + bgr_rotate_hwc(src, dst, srcw, srch, static_cast(degree)); } else if (srcFormat == BGRA || srcFormat == RGBA) { rotate_hwc4(src, dst, srcw, srch, degree); } else { diff --git a/lite/utils/logging.h b/lite/utils/logging.h index c2c999fd70f3eee78c1deaf5ec2c4fea4e4f3fd1..3d97f4dbec1e4973295248c94c4156563dfb4f5d 100644 
--- a/lite/utils/logging.h +++ b/lite/utils/logging.h @@ -29,6 +29,7 @@ #include #include #include "lite/utils/replace_stl/stream.h" +#include "lite/utils/string.h" #ifdef LITE_WITH_ANDROID #include @@ -171,7 +172,7 @@ class VLogMessage { if (GLOG_v_int < level_int) { return; } - const char* level = std::to_string(level_int).c_str(); + const char* level = paddle::lite::to_string(level_int).c_str(); paddle::lite::gen_log(log_stream_, file, func, lineno, level); } diff --git a/lite/utils/replace_stl/stream.cc b/lite/utils/replace_stl/stream.cc index e72f2717293d0cc07ac28c6d51dd4d2bb5ae7874..37b02d3c50b8ed78bb8335a1618f753f645fd00b 100644 --- a/lite/utils/replace_stl/stream.cc +++ b/lite/utils/replace_stl/stream.cc @@ -15,6 +15,7 @@ #include "lite/utils/replace_stl/stream.h" #include #include +#include "lite/utils/string.h" #ifdef LITE_ON_TINY_PUBLISH @@ -39,9 +40,9 @@ void ostream::pad(const std::string& text) { #ifdef LITE_SHUTDOWN_LOG #define ADD_DATA_AS_STRING(data_, obj_) #else -#define ADD_DATA_AS_STRING(data_, obj_) \ - std::string text = std::to_string(obj_); \ - pad(text); \ +#define ADD_DATA_AS_STRING(data_, obj_) \ + std::string text = paddle::lite::to_string(obj_); \ + pad(text); \ data_ = data_ + text; #endif diff --git a/lite/utils/string.h b/lite/utils/string.h index d96b2aac20549989afdc730e34af4fc40541329d..5269525b64f473f1018e183613c087886dba97d6 100644 --- a/lite/utils/string.h +++ b/lite/utils/string.h @@ -48,7 +48,14 @@ template static std::string to_string_with_precision(const T& v, const int n = 6) { STL::stringstream ss; ss.precision(n); - // ss << std::fixed << v; + ss << v; + return ss.str(); +} + +template +static std::string to_string(const T& v) { + STL::stringstream ss; + ss << v; return ss.str(); }