Commit f4e27ad1 authored by jackzhang235

Merge remote-tracking branch 'upstream/develop' into develop

@@ -105,3 +105,5 @@ metal/paddle-mobile-demo/paddle-mobile-demo/Resources
metal/paddle-mobile-demo/paddle-mobile-demo/Resources/images
metal/paddle-mobile-demo/paddle-mobile-demo/Resources/models
metal/MobileNetDemo/MobileNetDemo/Resources
build*
@@ -57,22 +57,20 @@ function(check_linker_flag)
  endforeach()
  set(CMAKE_SHARED_LINKER_FLAGS ${CMAKE_SHARED_LINKER_FLAGS} PARENT_SCOPE)
endfunction()
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11")
if (LITE_ON_TINY_PUBLISH)
  if((NOT LITE_WITH_PYTHON))
    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fno-exceptions")
  endif()
  if(LITE_WITH_OPENCL AND (ARM_TARGET_LANG STREQUAL "clang"))
    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fexceptions")
  endif()
  set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -ffast-math -Ofast -Os -fomit-frame-pointer -fno-asynchronous-unwind-tables -fno-unwind-tables")
  set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fvisibility=hidden -fvisibility-inlines-hidden -ffunction-sections")
  check_linker_flag(-Wl,--gc-sections)
endif()
if(LITE_WITH_OPENCL)
  if(ARM_TARGET_LANG STREQUAL "clang")
    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fexceptions")
  endif()
endif()
if(LITE_WITH_OPENMP)
  find_package(OpenMP REQUIRED)
  if(OPENMP_FOUND OR OpenMP_CXX_FOUND)
......
@@ -285,6 +285,11 @@ set(host_kernels CACHE INTERNAL "host kernels")
set(kernels_src_list "${CMAKE_BINARY_DIR}/kernels_src_list.txt")
file(WRITE ${kernels_src_list} "") # clean
# file to record faked kernels for opt python lib
set(fake_kernels_src_list "${CMAKE_BINARY_DIR}/fake_kernels_src_list.txt")
file(WRITE ${fake_kernels_src_list} "") # clean
if(LITE_BUILD_TAILOR)
  set(tailored_kernels_list_path "${LITE_OPTMODEL_DIR}/.tailored_kernels_source_list")
  file(STRINGS ${tailored_kernels_list_path} tailored_kernels_list)
@@ -313,56 +318,65 @@ function(add_kernel TARGET device level)
    return()
  endif()
  if (LITE_ON_MODEL_OPTIMIZE_TOOL)
    # the source list is collected so that model_optimize_tool can generate fake kernels.
    foreach(src ${args_SRCS})
      file(APPEND ${kernels_src_list} "${CMAKE_CURRENT_SOURCE_DIR}/${src}\n")
    endforeach()
    return()
  endif()
  # when compiling model_optimize_tool, a source file containing all the fake kernel
  # definitions is generated, so there is no need to compile the real kernel sources.
  if (LITE_ON_MODEL_OPTIMIZE_TOOL)
    return()
  endif(LITE_ON_MODEL_OPTIMIZE_TOOL)
if ("${device}" STREQUAL "Host") if ("${device}" STREQUAL "Host")
set(host_kernels "${host_kernels};${TARGET}" CACHE INTERNAL "") set(host_kernels "${host_kernels};${TARGET}" CACHE INTERNAL "")
endif() endif()
if ("${device}" STREQUAL "ARM") if ("${device}" STREQUAL "ARM")
if (NOT LITE_WITH_ARM) if (NOT LITE_WITH_ARM)
foreach(src ${args_SRCS})
file(APPEND ${fake_kernels_src_list} "${CMAKE_CURRENT_SOURCE_DIR}/${src}\n")
endforeach()
return() return()
endif() endif()
set(arm_kernels "${arm_kernels};${TARGET}" CACHE INTERNAL "") set(arm_kernels "${arm_kernels};${TARGET}" CACHE INTERNAL "")
endif() endif()
if ("${device}" STREQUAL "X86") if ("${device}" STREQUAL "X86")
if (NOT LITE_WITH_X86) if (NOT LITE_WITH_X86)
foreach(src ${args_SRCS})
file(APPEND ${fake_kernels_src_list} "${CMAKE_CURRENT_SOURCE_DIR}/${src}\n")
endforeach()
return()
elseif (LITE_ON_MODEL_OPTIMIZE_TOOL)
foreach(src ${args_SRCS})
file(APPEND ${kernels_src_list} "${CMAKE_CURRENT_SOURCE_DIR}/${src}\n")
endforeach()
return() return()
endif() endif()
set(x86_kernels "${x86_kernels};${TARGET}" CACHE INTERNAL "") set(x86_kernels "${x86_kernels};${TARGET}" CACHE INTERNAL "")
endif() endif()
if ("${device}" STREQUAL "NPU") if ("${device}" STREQUAL "NPU")
if (NOT LITE_WITH_NPU) if (NOT LITE_WITH_NPU)
foreach(src ${args_SRCS})
file(APPEND ${fake_kernels_src_list} "${CMAKE_CURRENT_SOURCE_DIR}/${src}\n")
endforeach()
return() return()
endif() endif()
set(npu_kernels "${npu_kernels};${TARGET}" CACHE INTERNAL "") set(npu_kernels "${npu_kernels};${TARGET}" CACHE INTERNAL "")
endif() endif()
if ("${device}" STREQUAL "XPU") if ("${device}" STREQUAL "XPU")
if (NOT LITE_WITH_XPU) if (NOT LITE_WITH_XPU)
foreach(src ${args_SRCS})
file(APPEND ${fake_kernels_src_list} "${CMAKE_CURRENT_SOURCE_DIR}/${src}\n")
endforeach()
return() return()
endif() endif()
set(xpu_kernels "${xpu_kernels};${TARGET}" CACHE INTERNAL "") set(xpu_kernels "${xpu_kernels};${TARGET}" CACHE INTERNAL "")
endif() endif()
if ("${device}" STREQUAL "FPGA") if ("${device}" STREQUAL "FPGA")
if (NOT LITE_WITH_FPGA) if (NOT LITE_WITH_FPGA)
foreach(src ${args_SRCS})
file(APPEND ${fake_kernels_src_list} "${CMAKE_CURRENT_SOURCE_DIR}/${src}\n")
endforeach()
return() return()
endif() endif()
set(fpga_kernels "${fpga_kernels};${TARGET}" CACHE INTERNAL "") set(fpga_kernels "${fpga_kernels};${TARGET}" CACHE INTERNAL "")
endif() endif()
if ("${device}" STREQUAL "BM") if ("${device}" STREQUAL "BM")
if (NOT LITE_WITH_BM) if (NOT LITE_WITH_BM)
foreach(src ${args_SRCS})
file(APPEND ${fake_kernels_src_list} "${CMAKE_CURRENT_SOURCE_DIR}/${src}\n")
endforeach()
return() return()
endif() endif()
set(bm_kernels "${bm_kernels};${TARGET}" CACHE INTERNAL "") set(bm_kernels "${bm_kernels};${TARGET}" CACHE INTERNAL "")
...@@ -375,6 +389,9 @@ function(add_kernel TARGET device level) ...@@ -375,6 +389,9 @@ function(add_kernel TARGET device level)
endif() endif()
if ("${device}" STREQUAL "OPENCL") if ("${device}" STREQUAL "OPENCL")
if (NOT LITE_WITH_OPENCL) if (NOT LITE_WITH_OPENCL)
foreach(src ${args_SRCS})
file(APPEND ${fake_kernels_src_list} "${CMAKE_CURRENT_SOURCE_DIR}/${src}\n")
endforeach()
return() return()
endif() endif()
set(opencl_kernels "${opencl_kernels};${TARGET}" CACHE INTERNAL "") set(opencl_kernels "${opencl_kernels};${TARGET}" CACHE INTERNAL "")
...@@ -382,6 +399,9 @@ function(add_kernel TARGET device level) ...@@ -382,6 +399,9 @@ function(add_kernel TARGET device level)
if ("${device}" STREQUAL "CUDA") if ("${device}" STREQUAL "CUDA")
if (NOT LITE_WITH_CUDA) if (NOT LITE_WITH_CUDA)
foreach(src ${args_SRCS})
file(APPEND ${fake_kernels_src_list} "${CMAKE_CURRENT_SOURCE_DIR}/${src}\n")
endforeach()
return() return()
endif() endif()
set(cuda_kernels "${cuda_kernels};${TARGET}" CACHE INTERNAL "") set(cuda_kernels "${cuda_kernels};${TARGET}" CACHE INTERNAL "")
......
@@ -135,53 +135,53 @@ sh benchmark.sh ./benchmark_bin_v8 ./benchmark_models result_armv8.txt true
> Performance numbers differ from phone to phone and from version to version.
```shell
run benchmark armv8
--------------------------------------
PaddleLite Benchmark
Threads=1 Warmup=10 Repeats=30
mnasnet        min = 19.83500    max = 19.38500    average = 19.65503
mobilenetv1    min = 32.00600    max = 31.56900    average = 31.81983
mobilenetv2    min = 22.37900    max = 22.08700    average = 22.28623
shufflenetv2   min = 10.80400    max = 10.62900    average = 10.68890
squeezenet     min = 17.67400    max = 17.47900    average = 17.57677
Threads=2 Warmup=10 Repeats=30
mnasnet        min = 11.85600    max = 11.72000    average = 11.77127
mobilenetv1    min = 18.75000    max = 18.64300    average = 18.70593
mobilenetv2    min = 14.05100    max = 13.59900    average = 13.71450
shufflenetv2   min = 6.67200     max = 6.58300     average = 6.63400
squeezenet     min = 12.07100    max = 11.33400    average = 11.41253
Threads=4 Warmup=10 Repeats=30
mnasnet        min = 7.19300     max = 7.02600     average = 7.08480
mobilenetv1    min = 10.42000    max = 10.29100    average = 10.34267
mobilenetv2    min = 8.61900     max = 8.46900     average = 8.54707
shufflenetv2   min = 4.55200     max = 4.41900     average = 4.46477
squeezenet     min = 8.60000     max = 7.85200     average = 7.98407
--------------------------------------
run benchmark armv7
--------------------------------------
PaddleLite Benchmark
Threads=1 Warmup=10 Repeats=30
mnasnet        min = 20.98300    max = 20.81400    average = 20.92527
mobilenetv1    min = 33.19000    max = 32.81700    average = 33.08490
mobilenetv2    min = 25.91400    max = 25.61700    average = 25.73097
shufflenetv2   min = 11.14300    max = 10.97600    average = 11.06757
squeezenet     min = 19.31800    max = 19.20000    average = 19.26530
Threads=2 Warmup=10 Repeats=30
mnasnet        min = 12.59900    max = 12.46600    average = 12.52207
mobilenetv1    min = 19.05800    max = 18.94700    average = 18.97897
mobilenetv2    min = 15.28400    max = 15.11300    average = 15.19843
shufflenetv2   min = 6.97000     max = 6.81400     average = 6.90863
squeezenet     min = 12.87900    max = 12.12900    average = 12.22530
Threads=4 Warmup=10 Repeats=30
mnasnet        min = 7.31400     max = 7.12900     average = 7.20357
mobilenetv1    min = 11.44000    max = 10.86900    average = 10.94383
mobilenetv2    min = 9.14900     max = 9.03800     average = 9.09907
shufflenetv2   min = 4.60600     max = 4.49400     average = 4.53360
squeezenet     min = 8.27000     max = 8.10600     average = 8.19000
--------------------------------------
```
@@ -103,7 +103,6 @@ $ ./lite/tools/build_npu.sh --arm_os=android --arm_abi=armv7 --arm_lang=gcc --an
        --optimize_out_type=(protobuf|naive_buffer) \
        --optimize_out=<output_optimize_model_dir> \
        --valid_targets=npu,arm \
        --prefer_int8_kernel=(true|false) \
        --record_tailoring_info =(true|false)
```
- The model generated by model_optimize_tool only marks the Paddle operators supported by the NPU; it does not actually produce an NPU HiAI model. The marked Paddle operators are converted to HiAI IR at execution time, and only then is the HiAI model generated and executed. See PR [2576](https://github.com/PaddlePaddle/Paddle-Lite/pull/2576) for the implementation details.
......
@@ -65,9 +65,11 @@ rm ./lite/api/paddle_use_ops.h
    --arm_os=android \
    --arm_abi=armv8 \
    --arm_lang=gcc \
    build_opencl
```
Note: to debug a cl kernel, assuming the build script above has already been run (i.e. the cmake files have been generated), simply edit the corresponding kernel file under `./lite/backends/opencl/cl_kernel/`, save it, and run `python ./lite/tools/cmake_tools/gen_opencl_code.py ./lite/backends/opencl/cl_kernel ./lite/backends/opencl/opencl_kernels_source.cc` from the project root; this command regenerates the embedded kernel sources from the modified files. Then switch to the build directory and run `make publish_inference` (or the executable name of the unit test you want to build); the cl kernel contents are packed automatically during compilation into the build artifacts, e.g. the .so or the corresponding unit-test executable.
### Build artifacts
The build artifacts are located in the `inference_lite_lib.android.armv8.opencl` folder under `build.lite.android.armv8.gcc.opencl`; only the key artifacts are listed here:
......
@@ -39,7 +39,7 @@ Paddle-Lite supports **tailoring the inference library according to the model**.
For example:
```bash
./lite/tools/build.sh --arm_os=android --arm_abi=armv7 --arm_lang=gcc --android_stl=c++_static --build_extra=ON --build_tailor=ON --opt_model_dir=../mobilenet_v1NB tiny_publish
```
**Note**: `../mobilenet_v1NB` in the command above is the output directory of the model converted in step 1.
@@ -88,9 +88,6 @@ Paddle-Lite supports **tailoring the inference library according to the model**.
#include <stdio.h>
#include <vector>
#include "paddle_api.h"          // NOLINT
#include "paddle_use_kernels.h"  // NOLINT
#include "paddle_use_ops.h"      // NOLINT
#include "paddle_use_passes.h"   // NOLINT
using namespace paddle::lite_api;  // NOLINT
@@ -182,4 +179,4 @@ int main(int argc, char** argv) {
1. The models in the set **must** either all be combined-parameter models or all be non-combined-parameter models.
2. When using non-combined-parameter models, the model topology file must be named `__model__`; when using combined-parameter models, the topology and parameter file names must be the same across all models in the set and are specified with `--model_filename` and `--param_filename` respectively.
3. The models in the set **must** either all be INT8 quantized models or all be non-quantized models.
4. The model optimization tool must be built from Paddle-Lite source newer than `release/v2.1.0`.
@@ -83,7 +83,6 @@ PaddlePaddle models come in two saved formats:
    --optimize_out_type=(protobuf|naive_buffer) \
    --optimize_out=<output_optimize_model_dir> \
    --valid_targets=(arm|opencl|x86|npu|xpu) \
    --prefer_int8_kernel=(true|false) \
    --record_tailoring_info =(true|false)
```
@@ -95,12 +94,12 @@ PaddlePaddle models come in two saved formats:
| --optimize_out_type | Output model format. Two formats are currently supported: protobuf and naive_buffer, where naive_buffer is a more lightweight serialization/deserialization implementation. If you need to run inference on mobile, set this option to naive_buffer. Defaults to protobuf. |
| --optimize_out | Output path of the optimized model. |
| --valid_targets | The backends the model can run on; defaults to arm. Currently x86, arm, opencl, npu and xpu are supported, and several backends can be specified at once (separated by spaces); the Model Optimize Tool will automatically pick the best one. To support Huawei NPU (the DaVinci-architecture NPU in Kirin 810/990 SoCs), set this to npu, arm. |
| --prefer_int8_kernel | If the model to be optimized is an int8 quantized model (e.g. produced by quantization-aware training), set this option to true to use int8 kernels for faster inference; defaults to false. |
| --record_tailoring_info | When using the [tailor the library according to the model](./library_tailoring.html) feature, set this option to true to record the kernel and OP information contained in the optimized model; defaults to false. |
* If the fluid model to be optimized is in non-combined form, set `--model_dir` and ignore `--model_file` and `--param_file`.
* If the fluid model to be optimized is in combined form, set `--model_file` and `--param_file` and ignore `--model_dir`.
* The optimized model is a single file whose name ends in `.nb`.
* The `prefer_int8_kernel` input argument has been removed; `opt` now detects automatically whether the model is quantized and applies the corresponding optimizations.
### Feature 2: Collect a model's operator information and check whether it is supported
......
@@ -245,7 +245,6 @@ python compress.py \
    --optimize_out_type=naive_buffer \
    --optimize_out=mobilenet_v1_quant_opt \
    --valid_targets=arm \
    --prefer_int8_kernel=true
```
As mentioned earlier, after quantization-aware training the model parameters in the float directory already lie in the int8 value range, but they are still stored as float32, so no real parameter compression has happened yet. After optimization with model_optimize_tool, however, the corresponding quantized parameters are re-stored as int8, which achieves the compression, and the model structure is optimized as well (e.g. various operator fuse passes are applied).
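To make the compression effect concrete, here is a small illustrative sketch (not part of this patch; the tensor shape and values are invented for illustration) of why re-storing int8-range weights as int8 shrinks them to a quarter of their float32 size:

```python
import numpy as np

# Hypothetical weight tensor after quantization-aware training: the values
# already lie in the int8 range [-127, 127] but are still stored as float32.
w_float = np.random.randint(-127, 128, size=(256, 256)).astype(np.float32)

# Re-storing the same values as int8 (what the optimization step effectively
# does for quantized weights) keeps the values but quarters the storage.
w_int8 = w_float.astype(np.int8)

print(w_float.nbytes)  # 262144 bytes as float32
print(w_int8.nbytes)   # 65536 bytes as int8
```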
......
@@ -86,7 +86,6 @@ WeightQuantization.quantize_weight_to_int(save_model_dir,
Refer to [model conversion](../user_guides/model_optimize_tool) to obtain the model conversion tool; downloading it from the Release page is recommended.
Refer to [model conversion](../user_guides/model_optimize_tool) for how to use the tool.
Because this model dequantizes the quantized weights and then actually loads and runs an FP32 inference model, the opt argument --prefer_int8_kernel does not need to be set to true; set the remaining arguments according to your situation and the documentation.
For example, to run inference on the ARM side of an Android phone, the model conversion command is:
```bash
./opt --model_dir=./mobilenet_v1_quant \
......
@@ -147,13 +147,12 @@ with fluid.name_scope('skip_quant'):
Refer to [model conversion](../user_guides/model_optimize_tool) to obtain the model conversion tool; downloading it from the Release page is recommended.
Refer to [model conversion](../user_guides/model_optimize_tool) for how to use the tool, setting the arguments according to your situation. For example, to run inference on the ARM side of an Android phone, the model conversion command is:
```bash
./opt --model_dir=./mobilenet_v1_quant \
    --optimize_out_type=naive_buffer \
    --optimize_out=mobilenet_v1_quant_opt \
    --valid_targets=arm
    --prefer_int8_kernel=true
```
### 3.2 Quantized model inference
......
@@ -24,8 +24,7 @@ $ ./opt \
    --param_file=<param_path> \
    --optimize_out_type=(protobuf|naive_buffer) \
    --optimize_out=<output_optimize_model_dir> \
    --valid_targets=(arm|opencl|x86)
    --prefer_int8_kernel=(ture|false)
```
Here, optimize_out is the output path for the optimized model, and optimize_out_type specifies the serialization format of the output model; Protobuf and Naive Buffer are currently supported, with Naive Buffer being a more lightweight serialization/deserialization implementation. If you need to run inference on mobile with Lite, set optimize_out_type=naive_buffer.
......
@@ -84,7 +84,16 @@ message(STATUS "publish inference lib to ${INFER_LITE_PUBLISH_ROOT}")
if (LITE_WITH_PYTHON)
    add_custom_target(publish_inference_python_lib ${TARGET}
            COMMAND mkdir -p "${INFER_LITE_PUBLISH_ROOT}/python/lib"
            COMMAND mkdir -p "${INFER_LITE_PUBLISH_ROOT}/python/install/libs"
            COMMAND mkdir -p "${INFER_LITE_PUBLISH_ROOT}/python/install/lite"
            COMMAND cp "${CMAKE_BINARY_DIR}/lite/api/python/setup.py" "${INFER_LITE_PUBLISH_ROOT}/python/install/"
            COMMAND cp "${CMAKE_SOURCE_DIR}/lite/api/python/__init__.py" "${INFER_LITE_PUBLISH_ROOT}/python/install/lite"
            COMMAND cp "${CMAKE_BINARY_DIR}/lite/api/python/pybind/liblite_pybind.so" "${INFER_LITE_PUBLISH_ROOT}/python/install/lite/lite.so"
            COMMAND cp "${CMAKE_BINARY_DIR}/lite/api/python/pybind/liblite_pybind.so" "${INFER_LITE_PUBLISH_ROOT}/python/lib/lite.so")
    add_custom_target(publish_inference_python_installer ${TARGET}
            COMMAND python setup.py bdist_wheel
            WORKING_DIRECTORY ${INFER_LITE_PUBLISH_ROOT}/python/install/
            DEPENDS publish_inference_python_lib)
    add_custom_target(publish_inference_python_light_demo ${TARGET}
            COMMAND mkdir -p "${INFER_LITE_PUBLISH_ROOT}/demo/python"
            COMMAND cp "${CMAKE_SOURCE_DIR}/lite/demo/python/mobilenetv1_light_api.py" "${INFER_LITE_PUBLISH_ROOT}/demo/python/")
@@ -96,6 +105,7 @@ if (LITE_WITH_PYTHON)
    endif()
    add_dependencies(publish_inference_python_lib lite_pybind)
    add_dependencies(publish_inference publish_inference_python_lib)
    add_dependencies(publish_inference publish_inference_python_installer)
    add_dependencies(publish_inference publish_inference_python_light_demo)
endif()
@@ -213,6 +223,7 @@ if (LITE_WITH_LIGHT_WEIGHT_FRAMEWORK AND LITE_WITH_ARM)
        add_dependencies(publish_inference tiny_publish_cxx_lib)
        if(NOT "${CMAKE_BUILD_TYPE}" STREQUAL "Debug")
            add_custom_command(TARGET tiny_publish_cxx_lib POST_BUILD
                    COMMAND ${CMAKE_STRIP} "-s" ${INFER_LITE_PUBLISH_ROOT}/cxx/lib/libpaddle_api_light_bundled.a
                    COMMAND ${CMAKE_STRIP} "-s" ${INFER_LITE_PUBLISH_ROOT}/cxx/lib/libpaddle_light_api_shared.so)
        endif()
    endif()
......
@@ -308,6 +308,11 @@ if (LITE_ON_TINY_PUBLISH)
    return()
endif()
# add library for opt_base
lite_cc_library(opt_base SRCS opt_base.cc cxx_api_impl.cc paddle_api.cc cxx_api.cc DEPS kernel op optimizer mir_passes utils)
add_dependencies(opt_base supported_kernel_op_info_h framework_proto all_kernel_faked_cc kernel_list_h)
if (LITE_ON_MODEL_OPTIMIZE_TOOL)
    message(STATUS "Compiling opt")
    lite_cc_binary(opt SRCS opt.cc cxx_api_impl.cc paddle_api.cc cxx_api.cc
......
@@ -294,6 +294,32 @@ void Predictor::Build(const cpp::ProgramDesc &desc,
  inner_places.emplace_back(TARGET(kHost), PRECISION(kAny), DATALAYOUT(kAny));
  inner_places.emplace_back(
      TARGET(kHost), PRECISION(kFloat), DATALAYOUT(kNCHW));
  const std::vector<std::string> quant_dequant_op = {
      "fake_quantize_abs_max",
      "fake_quantize_range_abs_max",
      "fake_quantize_moving_average_abs_max",
      "fake_quantize_dequantize_moving_average_abs_max",
      "fake_dequantize_max_abs",
      "fake_channel_wise_dequantize_max_abs"};
  bool is_quantized_model = false;
  for (size_t i = 0; i < program_desc_.BlocksSize() && !is_quantized_model;
       ++i) {
    auto *block_desc = program_desc_.GetBlock<cpp::BlockDesc>(i);
    for (size_t j = 0; j < block_desc->OpsSize() && !is_quantized_model; ++j) {
      auto *op_desc = block_desc->GetOp<cpp::OpDesc>(j);
      std::string op_type = op_desc->Type();
      if (std::find(quant_dequant_op.begin(),
                    quant_dequant_op.end(),
                    op_type) != quant_dequant_op.end()) {
        is_quantized_model = true;
      }
    }
  }
  if (is_quantized_model) {
    inner_places.emplace_back(Place{TARGET(kARM), PRECISION(kInt8)});
  }
  Program program(desc, scope_, inner_places);
  core::KernelPickFactor factor;
......
@@ -67,7 +67,7 @@ void Run(const char* model_dir, int repeat) {
int main(int argc, char** argv) {
  CHECK_EQ(argc, 3) << "usage: ./cmd <model_dir> <repeat>";
  paddle::lite::Run(argv[1], atoi(argv[2]));
  return 0;
}
......
@@ -58,6 +58,7 @@ void LightPredictorImpl::Run() {
std::shared_ptr<lite_api::PaddlePredictor> LightPredictorImpl::Clone() {
  LOG(FATAL) << "The Clone API is not supported in LigthPredictor";
  return nullptr;
}
std::string LightPredictorImpl::GetVersion() const { return lite::version(); }
......
@@ -95,7 +95,7 @@ void TestModel(const std::vector<Place>& valid_places,
  if (first_target == TARGET(kOpenCL) || first_target == TARGET(kNPU)) {
    ASSERT_EQ(out->dims().production(), 1000);
    double eps = first_target == TARGET(kOpenCL) ? 0.12 : 0.1;
    for (int i = 0; i < ref.size(); ++i) {
      for (int j = 0; j < ref[i].size(); ++j) {
        auto result = pdata[j * step + (out->dims()[1] * i)];
@@ -119,21 +119,21 @@ void TestModel(const std::vector<Place>& valid_places,
  // Get detailed result
  size_t output_tensor_num = predictor.GetOutputNames().size();
  VLOG(1) << "output tensor num:" << output_tensor_num;
  for (size_t tidx = 0; tidx < output_tensor_num; ++tidx) {
    auto* output_tensor = predictor.GetOutput(tidx);
    VLOG(1) << "============= output tensor " << tidx << " =============\n";
    auto out_dims = output_tensor->dims();
    auto out_data = output_tensor->data<float>();
    auto out_mean = compute_mean<float>(out_data, out_dims.production());
    auto out_std_dev = compute_standard_deviation<float>(
        out_data, out_dims.production(), true, out_mean);
    VLOG(1) << "output tensor dims:" << out_dims;
    VLOG(1) << "output tensor elements num:" << out_dims.production();
    VLOG(1) << "output tensor standard deviation:" << out_std_dev;
    VLOG(1) << "output tensor mean value:" << out_mean;
    // print result
    for (int i = 0; i < out_dims.production(); ++i) {
......
@@ -97,7 +97,7 @@ void TestModel(const std::vector<Place>& valid_places,
  if (first_target == TARGET(kOpenCL) || first_target == TARGET(kNPU)) {
    ASSERT_EQ(out->dims().production(), 1000);
    double eps = first_target == TARGET(kOpenCL) ? 0.15 : 0.1;
    for (int i = 0; i < ref.size(); ++i) {
      for (int j = 0; j < ref[i].size(); ++j) {
        auto result = pdata[j * step + (out->dims()[1] * i)];
@@ -121,21 +121,21 @@ void TestModel(const std::vector<Place>& valid_places,
  // Get detailed result
  size_t output_tensor_num = predictor.GetOutputNames().size();
  VLOG(1) << "output tensor num:" << output_tensor_num;
  for (size_t tidx = 0; tidx < output_tensor_num; ++tidx) {
    auto* output_tensor = predictor.GetOutput(tidx);
    VLOG(1) << "============= output tensor " << tidx << " =============\n";
    auto out_dims = output_tensor->dims();
    auto out_data = output_tensor->data<float>();
    auto out_mean = compute_mean<float>(out_data, out_dims.production());
    auto out_std_dev = compute_standard_deviation<float>(
        out_data, out_dims.production(), true, out_mean);
    VLOG(1) << "output tensor dims:" << out_dims;
    VLOG(1) << "output tensor elements num:" << out_dims.production();
    VLOG(1) << "output tensor standard deviation:" << out_std_dev;
    VLOG(1) << "output tensor mean value:" << out_mean;
    // print result
    for (int i = 0; i < out_dims.production(); ++i) {
......
@@ -138,7 +138,7 @@ void Run(const std::vector<std::vector<int64_t>>& input_shapes,
    std::ofstream out(FLAGS_arg_name + ".txt");
    for (size_t i = 0; i < arg_num; ++i) {
      sum += arg_tensor->data<float>()[i];
      out << paddle::lite::to_string(arg_tensor->data<float>()[i]) << "\n";
    }
    LOG(INFO) << FLAGS_arg_name << " shape is " << os.str()
              << ", mean value is " << sum * 1. / arg_num;
......
@@ -250,7 +250,7 @@ void Run(const std::vector<std::vector<int64_t>>& input_shapes,
    std::ofstream out(FLAGS_arg_name + ".txt");
    for (size_t i = 0; i < arg_num; ++i) {
      sum += arg_tensor->data<float>()[i];
      out << paddle::lite::to_string(arg_tensor->data<float>()[i]) << "\n";
    }
    LOG(INFO) << FLAGS_arg_name << " shape is " << os.str()
              << ", mean value is " << sum * 1. / arg_num;
......
@@ -264,7 +264,7 @@ void Run(const std::vector<std::vector<int64_t>>& input_shapes,
    std::ofstream out(FLAGS_arg_name + ".txt");
    for (size_t i = 0; i < arg_num; ++i) {
      sum += arg_tensor->data<float>()[i];
      out << paddle::lite::to_string(arg_tensor->data<float>()[i]) << "\n";
    }
    LOG(INFO) << FLAGS_arg_name << " shape is " << os.str()
              << ", mean value is " << sum * 1. / arg_num;
......
@@ -67,7 +67,6 @@ DEFINE_string(valid_targets,
              "arm",
              "The targets this model optimized for, should be one of (arm, "
              "opencl, x86), splitted by space");
DEFINE_bool(prefer_int8_kernel, false, "Prefer to run model with int8 kernels");
DEFINE_bool(print_supported_ops,
            false,
            "Print supported operators on the inputed target");
@@ -123,11 +122,6 @@ std::vector<Place> ParserValidPlaces() {
      << "At least one target should be set, should set the "
         "command argument 'valid_targets'";
  if (FLAGS_prefer_int8_kernel) {
    LOG(WARNING) << "Int8 mode is only support by ARM target";
    valid_places.insert(valid_places.begin(),
                        Place{TARGET(kARM), PRECISION(kInt8)});
  }
  return valid_places;
}
@@ -257,7 +251,6 @@ void PrintHelpInfo() {
      "  `--optimize_out_type=(protobuf|naive_buffer)`\n"
      "  `--optimize_out=<output_optimize_model_dir>`\n"
      "  `--valid_targets=(arm|opencl|x86|npu|xpu)`\n"
      "  `--prefer_int8_kernel=(true|false)`\n"
      "  `--record_tailoring_info=(true|false)`\n"
      "  Arguments of model checking and ops information:\n"
      "  `--print_all_ops=true` Display all the valid operators of "
......
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "lite/api/opt_base.h"
#include "all_kernel_faked.cc" // NOLINT
namespace paddle {
namespace lite_api {
void OptBase::SetModelDir(const std::string& model_path) {
opt_config_.set_model_dir(model_path);
}
void OptBase::SetModelFile(const std::string& model_path) {
opt_config_.set_model_file(model_path);
}
void OptBase::SetParamFile(const std::string& param_path) {
opt_config_.set_param_file(param_path);
}
void OptBase::SetModelType(std::string optimize_out_type) {
if (optimize_out_type == "protobuf") {
model_type_ = LiteModelType::kProtobuf;
} else if (optimize_out_type == "naive_buffer") {
model_type_ = LiteModelType::kNaiveBuffer;
} else {
LOG(FATAL) << "Unsupported Model type :" << optimize_out_type;
}
}
void OptBase::SetValidPlaces(const std::string& valid_places) {
valid_places_.clear();
auto target_reprs = lite::Split(valid_places, ",");
for (auto& target_repr : target_reprs) {
if (target_repr == "arm") {
valid_places_.emplace_back(TARGET(kARM));
} else if (target_repr == "opencl") {
valid_places_.emplace_back(
Place{TARGET(kOpenCL), PRECISION(kFP16), DATALAYOUT(kImageDefault)});
valid_places_.emplace_back(
Place{TARGET(kOpenCL), PRECISION(kFloat), DATALAYOUT(kNCHW)});
valid_places_.emplace_back(
Place{TARGET(kOpenCL), PRECISION(kAny), DATALAYOUT(kImageDefault)});
valid_places_.emplace_back(
Place{TARGET(kOpenCL), PRECISION(kAny), DATALAYOUT(kNCHW)});
valid_places_.emplace_back(
TARGET(kARM)); // enable kARM CPU kernel when no opencl kernel
} else if (target_repr == "x86") {
valid_places_.emplace_back(TARGET(kX86));
} else if (target_repr == "npu") {
valid_places_.emplace_back(TARGET(kNPU));
} else if (target_repr == "xpu") {
valid_places_.emplace_back(TARGET(kXPU));
} else {
LOG(FATAL) << lite::string_format(
"Wrong target '%s' found, please check the command flag "
"'valid_targets'",
target_repr.c_str());
}
}
CHECK(!valid_places_.empty())
<< "At least one target should be set, should set the "
"command argument 'valid_targets'";
}
void OptBase::SetOptimizeOut(const std::string& optimized_out_path) {
optimize_out_path_ = optimized_out_path;
}
void OptBase::RunOptimize(bool record_strip_info) {
CheckIfModelSupported(false);
OpKernelInfoCollector::Global().SetKernel2path(kernel2path_map);
opt_config_.set_valid_places(valid_places_);
if (model_set_dir_ != "") {
RunOptimizeFromModelSet(record_strip_info);
} else {
auto opt_predictor = lite_api::CreatePaddlePredictor(opt_config_);
opt_predictor->SaveOptimizedModel(
optimize_out_path_, model_type_, record_strip_info);
auto resulted_model_name =
record_strip_info ? "information of striped model" : "optimized model";
std::cout << "Save the " << resulted_model_name
<< " into :" << optimize_out_path_ << "successfully";
}
}
// collect ops info of modelset
void CollectModelMetaInfo(const std::string& output_dir,
const std::vector<std::string>& models,
const std::string& filename) {
std::set<std::string> total;
for (const auto& name : models) {
std::string model_path =
lite::Join<std::string>({output_dir, name, filename}, "/");
auto lines = lite::ReadLines(model_path);
total.insert(lines.begin(), lines.end());
}
std::string output_path =
lite::Join<std::string>({output_dir, filename}, "/");
lite::WriteLines(std::vector<std::string>(total.begin(), total.end()),
output_path);
}
void OptBase::SetModelSetDir(const std::string& model_set_path) {
model_set_dir_ = model_set_path;
}
void OptBase::RunOptimizeFromModelSet(bool record_strip_info) {
// 1. mkdir of outputed optimized model set.
lite::MkDirRecur(optimize_out_path_);
auto model_dirs = lite::ListDir(model_set_dir_, true);
if (model_dirs.size() == 0) {
LOG(FATAL) << "[" << model_set_dir_ << "] does not contain any model";
}
// 2. optimize each model in inputed model set dir.
std::string model_file = opt_config_.model_file();
std::string param_file = opt_config_.param_file();
for (const auto& name : model_dirs) {
std::string input_model_dir =
lite::Join<std::string>({model_set_dir_, name}, "/");
std::string output_model_dir =
lite::Join<std::string>({optimize_out_path_, name}, "/");
if (opt_config_.model_file() != "" && opt_config_.param_file() != "") {
auto model_file_path =
lite::Join<std::string>({input_model_dir, model_file}, "/");
auto param_file_path =
lite::Join<std::string>({input_model_dir, param_file}, "/");
}
std::cout << "Start optimize model: " << input_model_dir;
opt_config_.set_model_dir(input_model_dir);
opt_config_.set_model_file(model_file);
opt_config_.set_param_file(param_file);
auto opt_predictor = lite_api::CreatePaddlePredictor(opt_config_);
opt_predictor->SaveOptimizedModel(
optimize_out_path_, model_type_, record_strip_info);
std::cout << "Optimize done. ";
}
// 3. if record_strip_info = true, we will record striping info
if (record_strip_info) {
// Collect all models information
CollectModelMetaInfo(
optimize_out_path_, model_dirs, lite::TAILORD_OPS_SOURCE_LIST_FILENAME);
CollectModelMetaInfo(
optimize_out_path_, model_dirs, lite::TAILORD_OPS_LIST_NAME);
CollectModelMetaInfo(optimize_out_path_,
model_dirs,
lite::TAILORD_KERNELS_SOURCE_LIST_FILENAME);
CollectModelMetaInfo(
optimize_out_path_, model_dirs, lite::TAILORD_KERNELS_LIST_NAME);
std::cout << "Record the information of stripped models into :"
<< optimize_out_path_ << "successfully";
}
}
void OptBase::PrintHelpInfo() {
const std::string opt_version = lite::version();
const char help_info[] =
"At least one argument should be inputed. Valid arguments are listed "
"below:\n"
" Arguments of help information:\n"
" `help()` Print help infomation\n"
" Arguments of model optimization:\n"
" `set_model_dir(model_dir)`\n"
" `set_model_file(model_file_path)`\n"
" `set_param_file(param_file_path)`\n"
" `set_model_type(protobuf|naive_buffer)`\n"
" `set_optimize_out(output_optimize_model_dir)`\n"
" `set_valid_places(arm|opencl|x86|npu|xpu)`\n"
" `run_optimize(false|true)`\n"
" ` ----fasle&true refer to whether to record ops info for "
"tailoring lib, false by default`\n"
" Arguments of model checking and ops information:\n"
" `print_all_ops()` Display all the valid operators of "
"Paddle-Lite\n"
" `print_supported_ops` Display supported operators of valid "
"places\n"
" `check_if_model_supported()` Check if the input model is "
"supported\n";
std::cout << "opt version:" << opt_version << std::endl
<< help_info << std::endl;
}
// 2. Print the support info of the input ops
void OptBase::PrintOpsInfo(const std::set<std::string>& valid_ops) {
std::vector<std::string> lite_supported_targets = {"kHost",
"kX86",
"kCUDA",
"kARM",
"kOpenCL",
"kFPGA",
"kNPU",
"kXPU",
"kAny",
"kUnk"};
// Get the length of the first column: maximum length of the op_type
size_t maximum_optype_length = 0;
for (auto it = supported_ops.begin(); it != supported_ops.end(); it++) {
maximum_optype_length = it->first.size() > maximum_optype_length
? it->first.size()
: maximum_optype_length;
}
std::cout << std::setiosflags(std::ios::internal);
// Print the first row: OP_name target1 target2 ...
std::cout << std::setw(maximum_optype_length) << "OP_name";
for (size_t i = 0; i < lite_supported_targets.size(); i++) {
std::cout << std::setw(10) << lite_supported_targets[i].substr(1);
}
std::cout << std::endl;
// Print the name of supported ops and mark if it's supported by each target
// print the support info of the input ops: valid_ops
for (auto op = valid_ops.begin(); op != valid_ops.end(); op++) {
std::cout << std::setw(maximum_optype_length) << *op;
// Check: If this kernel doesn't match any operator, we will skip it.
if (supported_ops.find(*op) == supported_ops.end()) {
continue;
}
// Print OP info.
auto ops_valid_places = supported_ops.at(*op);
for (size_t i = 0; i < lite_supported_targets.size(); i++) {
if (std::find(ops_valid_places.begin(),
ops_valid_places.end(),
lite_supported_targets[i]) != ops_valid_places.end()) {
std::cout << std::setw(10) << "Y";
} else {
std::cout << std::setw(10) << " ";
}
}
std::cout << std::endl;
}
}
void OptBase::DisplayKernelsInfo() { // Display kernel information
std::cout << ::paddle::lite::KernelRegistry::Global().DebugString();
}
void OptBase::PrintAllOps() {
// 1. Get supported ops on these targets
std::set<std::string> valid_ops;
for (size_t i = 0; i < supported_ops_target.size(); i++) {
auto ops = supported_ops_target[i];
valid_ops.insert(ops.begin(), ops.end());
}
// 2. Print support info of these ops
PrintOpsInfo(valid_ops);
}
void OptBase::PrintSupportedOps() {
// 1. Get the valid hardware targets
std::vector<TargetType> target_types = {};
for (size_t i = 0; i < valid_places_.size(); i++) {
target_types.push_back(valid_places_[i].target);
}
std::string targets_str = TargetToStr(target_types[0]);
for (size_t i = 1; i < target_types.size(); i++) {
targets_str = targets_str + TargetToStr(target_types[i]);
}
std::cout << "Supported OPs on '" << targets_str << "': " << std::endl;
target_types.push_back(TARGET(kHost));
target_types.push_back(TARGET(kUnk));
// 2. Get supported ops on these targets
std::set<std::string> valid_ops;
for (size_t i = 0; i < target_types.size(); i++) {
auto ops = supported_ops_target[static_cast<int>(target_types[i])];
valid_ops.insert(ops.begin(), ops.end());
}
// 3. Print support info of these ops
PrintOpsInfo(valid_ops);
}
// test whether this model is supported
void OptBase::CheckIfModelSupported(bool print_ops_info) {
// 1. parse valid places and valid targets
auto valid_ops = supported_ops_target[static_cast<int>(TARGET(kHost))];
auto valid_unktype_ops = supported_ops_target[static_cast<int>(TARGET(kUnk))];
valid_ops.insert(
valid_ops.end(), valid_unktype_ops.begin(), valid_unktype_ops.end());
for (size_t i = 0; i < valid_places_.size(); i++) {
auto target = valid_places_[i].target;
auto ops = supported_ops_target[static_cast<int>(target)];
valid_ops.insert(valid_ops.end(), ops.begin(), ops.end());
}
// get valid ops
std::set<std::string> valid_ops_set(valid_ops.begin(), valid_ops.end());
// 2.Load model into program to get ops in model
std::string prog_path = opt_config_.model_dir() + "/__model__";
if (!(opt_config_.model_file()).empty() &&
!(opt_config_.param_file()).empty()) {
prog_path = opt_config_.model_file();
}
lite::cpp::ProgramDesc cpp_prog;
framework::proto::ProgramDesc pb_proto_prog =
*lite::LoadProgram(prog_path, false);
lite::pb::ProgramDesc pb_prog(&pb_proto_prog);
// Transform to cpp::ProgramDesc
lite::TransformProgramDescAnyToCpp(pb_prog, &cpp_prog);
std::set<std::string> unsupported_ops;
std::set<std::string> input_model_ops;
for (size_t index = 0; index < cpp_prog.BlocksSize(); index++) {
auto current_block = cpp_prog.GetBlock<lite::cpp::BlockDesc>(index);
for (size_t i = 0; i < current_block->OpsSize(); ++i) {
auto& op_desc = *current_block->GetOp<lite::cpp::OpDesc>(i);
auto op_type = op_desc.Type();
input_model_ops.insert(op_type);
if (valid_ops_set.count(op_type) == 0) {
unsupported_ops.insert(op_type);
}
}
}
// 3. Print ops_info of input model and check if this model is supported
if (print_ops_info) {
std::cout << "OPs in the input model include:\n";
PrintOpsInfo(input_model_ops);
}
if (!unsupported_ops.empty()) {
std::string unsupported_ops_str = *unsupported_ops.begin();
for (auto op_str = ++unsupported_ops.begin();
op_str != unsupported_ops.end();
op_str++) {
unsupported_ops_str = unsupported_ops_str + ", " + *op_str;
}
std::vector<TargetType> targets = {};
for (size_t i = 0; i < valid_places_.size(); i++) {
targets.push_back(valid_places_[i].target);
}
std::sort(targets.begin(), targets.end());
targets.erase(unique(targets.begin(), targets.end()), targets.end());
std::string targets_str = TargetToStr(targets[0]);
for (size_t i = 1; i < targets.size(); i++) {
targets_str = targets_str + "," + TargetToStr(targets[i]);
}
LOG(ERROR) << "Error: This model is not supported, because "
<< unsupported_ops.size() << " ops are not supported on '"
<< targets_str << "'. These unsupported ops are: '"
<< unsupported_ops_str << "'.";
exit(1);
}
if (print_ops_info) {
std::cout << "Paddle-Lite supports this model!" << std::endl;
exit(1);
}
}
} // namespace lite_api
} // namespace paddle
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
/*
* This file defines Opt and basic functions about model transformation.
*/
#ifndef PADDLE_LITE_OPT_H_ // NOLINT
#define PADDLE_LITE_OPT_H_
#include <algorithm>
#include <iomanip>
#include <set>
#include <string>
#include <vector>
// stores the map that records the source_file path of each kernel.
#include "kernel_src_map.h" // NOLINT
#include "lite/api/cxx_api.h"
// version of Paddle-lite
#include "lite/core/version.h"
// model parser functions to pre-load model to verify if this model is supported
#include "lite/model_parser/compatible_pb.h"
#include "lite/model_parser/pb/program_desc.h"
#include "lite/utils/string.h"
// recorded all the ops supported by paddle-lite
#include "supported_kernel_op_info.h" // NOLINT
namespace paddle {
namespace lite_api {
/// OptBase provides the basic interfaces of the model optimization tool
/// (opt), which transforms and saves an optimized Paddle-Lite model.
class LITE_API OptBase {
public:
OptBase() = default;
void SetModelSetDir(const std::string &model_set_path);
void SetModelDir(const std::string &model_path);
void SetModelFile(const std::string &model_path);
void SetParamFile(const std::string &param_path);
void SetValidPlaces(const std::string &valid_places);
void SetOptimizeOut(const std::string &optimized_out_path);
// set optimized_model type
void SetModelType(std::string model_type);
// transform and save the optimized model
void RunOptimize(bool record_strip_info = false);
// functions for printing info
// 1. help info
void PrintHelpInfo();
// 2. PrintOpsInfo
void PrintOpsInfo(const std::set<std::string> &valid_ops =
{}); // print supported ops on target_types
void PrintAllOps(); // print all ops
void PrintSupportedOps(); // print ops supported on valid_places_
void DisplayKernelsInfo(); // Display kernel information
// 3. Check if this model is supported
void CheckIfModelSupported(bool print_ops_info = true);
private:
CxxConfig opt_config_;
// valid places for the optimized_model
std::vector<Place> valid_places_;
// filename of the optimized_model
std::string optimize_out_path_;
// type of the optimized_model, kNaiveBuffer default.
LiteModelType model_type_{LiteModelType::kNaiveBuffer};
// Dir path of a set of models, this should be combined with model
std::string model_set_dir_;
void RunOptimizeFromModelSet(bool record_strip_info = false);
};
} // namespace lite_api
} // namespace paddle
#endif // NOLINT
@@ -2,6 +2,23 @@ if (NOT LITE_WITH_PYTHON)
    return()
endif()
# create setup.py for packaging the whl for Paddle-Lite and opt
execute_process(
COMMAND git describe --tags --exact-match
WORKING_DIRECTORY ${CMAKE_SOURCE_DIR}
OUTPUT_VARIABLE PADDLE_LITE_TAG
OUTPUT_STRIP_TRAILING_WHITESPACE
)
execute_process(
COMMAND git log -1 --format=%h
WORKING_DIRECTORY ${CMAKE_SOURCE_DIR}
OUTPUT_VARIABLE PADDLE_LITE_COMMIT
OUTPUT_STRIP_TRAILING_WHITESPACE
)
configure_file(${CMAKE_CURRENT_SOURCE_DIR}/setup.py.in
${CMAKE_CURRENT_BINARY_DIR}/setup.py)
add_subdirectory(pybind)
#add_subdirectory(interface)
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
set(PYBIND_DEPS pybind python paddle_api_light paddle_api)
if (NOT LITE_ON_TINY_PUBLISH)
    set(PYBIND_DEPS ${PYBIND_DEPS} paddle_api_full opt_base)
endif()
lite_cc_library(lite_pybind SHARED SRCS pybind.cc DEPS ${PYBIND_DEPS})
......
@@ -26,11 +26,12 @@
#ifndef LITE_ON_TINY_PUBLISH
#include "lite/api/cxx_api.h"
#include "lite/api/opt_base.h"
#endif
#include "lite/api/light_api.h"
#include "lite/api/paddle_api.h"
#include "lite/core/tensor.h"
namespace py = pybind11;
@@ -48,10 +49,27 @@ using lite_api::DataLayoutType;
using lite_api::Place;
using lite_api::MLUCoreVersion;
using lite::LightPredictorImpl;
using lite_api::OptBase;
#ifndef LITE_ON_TINY_PUBLISH
using lite::CxxPaddleApiImpl;
static void BindLiteCxxPredictor(py::module *m);
void BindLiteOpt(py::module *m) {
py::class_<OptBase> opt_base(*m, "Opt");
opt_base.def(py::init<>())
.def("set_model_dir", &OptBase::SetModelDir)
.def("set_modelset_dir", &OptBase::SetModelSetDir)
.def("set_model_file", &OptBase::SetModelFile)
.def("set_param_file", &OptBase::SetParamFile)
.def("set_valid_places", &OptBase::SetValidPlaces)
.def("set_optimize_out", &OptBase::SetOptimizeOut)
.def("set_model_type", &OptBase::SetModelType)
.def("run_optimize", &OptBase::RunOptimize)
.def("help", &OptBase::PrintHelpInfo)
.def("print_supported_ops", &OptBase::PrintSupportedOps)
.def("display_kernels_info", &OptBase::DisplayKernelsInfo)
.def("print_all_ops", &OptBase::PrintAllOps);
}
#endif
static void BindLiteLightPredictor(py::module *m);
static void BindLiteCxxConfig(py::module *m);
......
@@ -22,11 +22,15 @@ namespace lite {
namespace pybind {
void BindLiteApi(pybind11::module *m);
void BindLiteOpt(pybind11::module *m);
PYBIND11_MODULE(lite, m) {
  m.doc() = "C++ core of Paddle-Lite";
  BindLiteApi(&m);
#ifndef LITE_ON_TINY_PUBLISH
  BindLiteOpt(&m);
#endif
}
} // namespace pybind
......
# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# module of pack whl installer for Paddle-lite
import shutil
import os
from setuptools import setup, Distribution
class BinaryDistribution(Distribution):
'binary distribution'
def has_ext_modules(foo):
return True
# get paddle-lite version, if it's not based on a release tag, we use commit id instead
PADDLELITE_COMMITE = "@PADDLE_LITE_COMMIT@"
PADDLELITE_TAG = "@PADDLE_LITE_TAG@"
if PADDLELITE_TAG == "":
PADDLELITE_VERSION = PADDLELITE_COMMITE
else:
PADDLELITE_VERSION = PADDLELITE_TAG
# core lib of paddlelite is stored as lite.so
LITE_PATH = '${PADDLE_BINARY_DIR}/inference_lite_lib/python/install/lite'
PACKAGE_DATA = {'paddlelite': ['lite.so']}
# put all thirdparty libraries in paddlelite.libs
PACKAGE_DATA['paddlelite.libs'] = []
LIB_PATH = '${PADDLE_BINARY_DIR}/inference_lite_lib/python/install/libs'
if '${WITH_MKL}' == 'ON':
shutil.copy('${MKLML_SHARED_IOMP_LIB}', LIB_PATH)
shutil.copy('${MKLML_SHARED_LIB}', LIB_PATH)
PACKAGE_DATA['paddlelite.libs'] += ['libmklml_intel.so', 'libiomp5.so']
# set the rpath of lite.so so it can find the third-party libs in paddlelite.libs
COMMAND = "patchelf --set-rpath '$ORIGIN/../libs/' ${PADDLE_BINARY_DIR}\
/inference_lite_lib/python/install/lite/lite.so"
if os.system(COMMAND) != 0:
raise Exception("patch third_party libs failed, command: %s" % COMMAND)
# remove unused paddle/libs/__init__.py
if os.path.isfile(LIB_PATH+'/__init__.py'):
os.remove(LIB_PATH+'/__init__.py')
# set dir path of each package
PACKAGE_DIR = {
# The paddle.fluid.proto package is generated while compiling,
# so that package points to another directory.
'paddlelite.libs': LIB_PATH,
'paddlelite': LITE_PATH
}
setup(
name='paddlelite',
version=PADDLELITE_VERSION,
description='Paddle-Lite Library',
packages=['paddlelite', 'paddlelite.libs'],
package_dir=PACKAGE_DIR,
package_data=PACKAGE_DATA,
distclass=BinaryDistribution
)
...@@ -17,6 +17,7 @@ ...@@ -17,6 +17,7 @@
#include <gflags/gflags.h> #include <gflags/gflags.h>
#include <sys/time.h> #include <sys/time.h>
#include <time.h> #include <time.h>
#include <cmath>
// for eval // for eval
DEFINE_string(model_dir, "", "model dir"); DEFINE_string(model_dir, "", "model dir");
...@@ -43,5 +44,31 @@ inline double GetCurrentUS() { ...@@ -43,5 +44,31 @@ inline double GetCurrentUS() {
return 1e+6 * time.tv_sec + time.tv_usec; return 1e+6 * time.tv_sec + time.tv_usec;
} }
template <typename T>
double compute_mean(const T* in, const size_t length) {
double sum = 0.;
for (size_t i = 0; i < length; ++i) {
sum += in[i];
}
return sum / length;
}
template <typename T>
double compute_standard_deviation(const T* in,
const size_t length,
bool has_mean = false,
double mean = 10000) {
if (!has_mean) {
mean = compute_mean<T>(in, length);
}
double variance = 0.;
for (size_t i = 0; i < length; ++i) {
variance += pow((in[i] - mean), 2);
}
variance /= length;
return sqrt(variance);
}
} // namespace lite } // namespace lite
} // namespace paddle } // namespace paddle
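These statistics helpers are straightforward, but the has_mean flag deserves a note: passing a precomputed mean skips the second pass over the buffer. Below is a minimal standalone usage sketch (the header path lite/api/test_helper.h is assumed; the numbers are illustrative, not from any model):

#include <iostream>
#include <vector>
#include "lite/api/test_helper.h"  // assumed location of the helpers above

int main() {
  std::vector<float> out{1.f, 2.f, 3.f, 4.f};
  double mean = paddle::lite::compute_mean<float>(out.data(), out.size());
  // Reusing the precomputed mean avoids a second pass over the data.
  double dev = paddle::lite::compute_standard_deviation<float>(
      out.data(), out.size(), true, mean);
  std::cout << "mean=" << mean << " std=" << dev << std::endl;  // mean=2.5 std~=1.118
  return 0;
}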
...@@ -266,6 +266,72 @@ void elementwise_add_relu_broadcast<float>(const float* dinx, ...@@ -266,6 +266,72 @@ void elementwise_add_relu_broadcast<float>(const float* dinx,
} }
} }
template <>
void elementwise_add_grad<float>(const float* dout_grad,
float* x_grad,
int num) {
int cnt = num >> 4;
int remain = num & 0x0f;
#pragma omp parallel for
for (int i = 0; i < cnt; ++i) {
const float* out_data = dout_grad + 16 * i;
float* x_data = x_grad + 16 * i;
float32x4_t din0 = vld1q_f32(out_data);
float32x4_t din1 = vld1q_f32(out_data + 4);
float32x4_t din2 = vld1q_f32(out_data + 8);
float32x4_t din3 = vld1q_f32(out_data + 12);
vst1q_f32(x_data, din0);
vst1q_f32(x_data + 4, din1);
vst1q_f32(x_data + 8, din2);
vst1q_f32(x_data + 12, din3);
}
if (remain > 0) {
const float* out_data = dout_grad + 16 * cnt;
float* x_data = x_grad + 16 * cnt;
for (int i = 0; i < remain; ++i) {
x_data[i] = out_data[i];
}
}
}
// we assume y_data has fewer elements than x_data; otherwise, call this
// function with the x_grad and y_grad arguments swapped
template <>
void elementwise_add_grad_broadcast<float>(const float* dout_grad,
float* x_grad,
float* y_grad,
int pre,
int n,
int post) {
if (x_grad) {
elementwise_add_grad(dout_grad, x_grad, pre * n * post);
}
if (y_grad) {
memset(y_grad, 0, n * sizeof(float));
#pragma omp parallel for
for (int i = 0; i < pre; ++i) {
for (int j = 0; j < n; ++j) {
float sum = 0;
int cnt = post >> 2;
int remain = post & 0x03;
const float* out_data = dout_grad + (i * n + j) * post;
float32x4_t sum_v = vdupq_n_f32(0);
for (int ci = 0; ci < cnt; ++ci) {
float32x4_t din = vld1q_f32(out_data + 4 * ci);
sum_v = vaddq_f32(sum_v, din);
}
out_data += 4 * cnt;
for (int ci = 0; ci < remain; ++ci) {
sum += out_data[ci];
}
float32x2_t high = vget_high_f32(sum_v);
float32x2_t low = vget_low_f32(sum_v);
sum += vget_lane_f32(high, 0) + vget_lane_f32(high, 1) +
vget_lane_f32(low, 0) + vget_lane_f32(low, 1);
y_grad[j] += sum;
}
}
}
}
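To make the calling convention in the comment above concrete, the sketch below mirrors the reference semantics in plain scalar C++ (illustrative only, not the NEON implementation): dX receives a copy of dOut, each dY[j] accumulates dOut over the pre and post dimensions, and when Y is actually the larger operand the same routine is called with the gradient pointers swapped.

#include <cstring>
#include <iostream>
#include <vector>

// Reference semantics of elementwise_add_grad_broadcast, with Y the smaller operand.
void add_grad_broadcast_ref(const float* dout, float* dx, float* dy,
                            int pre, int n, int post) {
  if (dx) std::memcpy(dx, dout, sizeof(float) * pre * n * post);
  if (dy) {
    std::memset(dy, 0, sizeof(float) * n);
    for (int i = 0; i < pre; ++i)
      for (int j = 0; j < n; ++j)
        for (int k = 0; k < post; ++k) dy[j] += dout[(i * n + j) * post + k];
  }
}

int main() {
  // X has shape {2, 3}, Y has shape {3}: pre = 2, n = 3, post = 1.
  std::vector<float> dout{1, 2, 3, 4, 5, 6}, dx(6), dy(3);
  add_grad_broadcast_ref(dout.data(), dx.data(), dy.data(), 2, 3, 1);
  // If X were the smaller tensor instead, pass the pointers swapped:
  // add_grad_broadcast_ref(dout.data(), dy.data(), dx.data(), pre, n, post);
  std::cout << dy[0] << " " << dy[1] << " " << dy[2] << std::endl;  // 5 7 9
  return 0;
}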
template <> template <>
void elementwise_sub<float>(const float* dinx, void elementwise_sub<float>(const float* dinx,
const float* diny, const float* diny,
...@@ -510,6 +576,84 @@ void elementwise_sub_relu_broadcast<float>(const float* dinx, ...@@ -510,6 +576,84 @@ void elementwise_sub_relu_broadcast<float>(const float* dinx,
} }
} }
} }
// we assume the formula is x-y
template <>
void elementwise_sub_grad<float>(const float* dout_grad,
float* x_grad,
float* y_grad,
int num) {
if (x_grad) {
elementwise_add_grad(dout_grad, x_grad, num);
}
if (y_grad) {
int cnt = num >> 4;
int remain = num & 0x0f;
float32x4_t minus = vdupq_n_f32(-1);
#pragma omp parallel for
for (int i = 0; i < cnt; ++i) {
const float* out_data = dout_grad + 16 * i;
float* y_data = y_grad + 16 * i;
float32x4_t din0 = vld1q_f32(out_data);
float32x4_t din1 = vld1q_f32(out_data + 4);
float32x4_t din2 = vld1q_f32(out_data + 8);
float32x4_t din3 = vld1q_f32(out_data + 12);
din0 = vmulq_f32(din0, minus);
din1 = vmulq_f32(din1, minus);
din2 = vmulq_f32(din2, minus);
din3 = vmulq_f32(din3, minus);
vst1q_f32(y_data, din0);
vst1q_f32(y_data + 4, din1);
vst1q_f32(y_data + 8, din2);
vst1q_f32(y_data + 12, din3);
}
if (remain > 0) {
const float* out_data = dout_grad + 16 * cnt;
float* y_data = y_grad + 16 * cnt;
for (int i = 0; i < remain; ++i) {
y_data[i] = -out_data[i];
}
}
}
}
// we assume y_data has fewer elements than x_data; otherwise, call this
// function with the x_grad and y_grad arguments swapped
template <>
void elementwise_sub_grad_broadcast<float>(const float* dout_grad,
float* x_grad,
float* y_grad,
int pre,
int n,
int post) {
if (x_grad) {
elementwise_add_grad(dout_grad, x_grad, pre * n * post);
}
if (y_grad) {
memset(y_grad, 0, n * sizeof(float));
#pragma omp parallel for
for (int i = 0; i < pre; ++i) {
for (int j = 0; j < n; ++j) {
float sum = 0;
int cnt = post >> 2;
int remain = post & 0x03;
const float* out_data = dout_grad + (i * n + j) * post;
float32x4_t sum_v = vdupq_n_f32(0);
for (int ci = 0; ci < cnt; ++ci) {
float32x4_t din = vld1q_f32(out_data + 4 * ci);
sum_v = vaddq_f32(sum_v, din);
}
out_data += 4 * cnt;
for (int ci = 0; ci < remain; ++ci) {
sum -= out_data[ci];
}
float32x2_t high = vget_high_f32(sum_v);
float32x2_t low = vget_low_f32(sum_v);
sum -= vget_lane_f32(high, 0) + vget_lane_f32(high, 1) +
vget_lane_f32(low, 0) + vget_lane_f32(low, 1);
y_grad[j] += sum;
}
}
}
}
template <> template <>
void elementwise_mul<float>(const float* dinx, void elementwise_mul<float>(const float* dinx,
......
...@@ -183,6 +183,13 @@ template <typename T> ...@@ -183,6 +183,13 @@ template <typename T>
void elementwise_add_relu_broadcast( void elementwise_add_relu_broadcast(
const T* dinx, const T* diny, T* dout, int batch, int channels, int num); const T* dinx, const T* diny, T* dout, int batch, int channels, int num);
template <typename T>
void elementwise_add_grad(const T* dout, T* dinx, int num);
template <typename T>
void elementwise_add_grad_broadcast(
const T* dout_grad, T* x_grad, T* y_grad, int pre, int n, int post);
template <typename T> template <typename T>
void elementwise_sub(const T* dinx, const T* diny, T* dout, int num); void elementwise_sub(const T* dinx, const T* diny, T* dout, int num);
...@@ -197,6 +204,13 @@ template <typename T> ...@@ -197,6 +204,13 @@ template <typename T>
void elementwise_sub_relu_broadcast( void elementwise_sub_relu_broadcast(
const T* dinx, const T* diny, T* dout, int batch, int channels, int num); const T* dinx, const T* diny, T* dout, int batch, int channels, int num);
template <typename T>
void elementwise_sub_grad(const T* dout, T* dinx, T* diny, int num);
template <typename T>
void elementwise_sub_grad_broadcast(
const T* dout_grad, T* x_grad, T* y_grad, int pre, int n, int post);
template <typename T> template <typename T>
void elementwise_mul(const T* dinx, const T* diny, T* dout, int num); void elementwise_mul(const T* dinx, const T* diny, T* dout, int num);
......
...@@ -983,10 +983,12 @@ void sgemv_trans(const int M, ...@@ -983,10 +983,12 @@ void sgemv_trans(const int M,
"vld1.32 {d8-d11}, [%[in]]! @ load input, q4, q5\n" \ "vld1.32 {d8-d11}, [%[in]]! @ load input, q4, q5\n" \
"vld1.32 {d12-d15}, [%[w0]]! @ load weights r0, q6,q7\n" \ "vld1.32 {d12-d15}, [%[w0]]! @ load weights r0, q6,q7\n" \
"vld1.32 {d16-d19}, [%[w1]]! @ load weights r1, q8,q9\n" \ "vld1.32 {d16-d19}, [%[w1]]! @ load weights r1, q8,q9\n" \
"vld1.32 {d20-d23}, [%[w2]]! @ load weights r2, q10,q11\n" \
"vld1.32 {d24-d27}, [%[w3]]! @ load weights r3, q12,q13\n" \
"vmla.f32 q0, q4, q6 @ mul add\n" \ "vmla.f32 q0, q4, q6 @ mul add\n" \
"vld1.32 {d20-d23}, [%[w2]]! @ load weights r2, q10,q11\n" \
"vmla.f32 q1, q4, q8 @ mul add\n" \ "vmla.f32 q1, q4, q8 @ mul add\n" \
"vld1.32 {d24-d27}, [%[w3]]! @ load weights r3, q12,q13\n" \
/*"vmla.f32 q0, q4, q6 @ mul add\n" */ \
/*"vmla.f32 q1, q4, q8 @ mul add\n" */ \
"vmla.f32 q2, q4, q10 @ mul add\n" \ "vmla.f32 q2, q4, q10 @ mul add\n" \
"vmla.f32 q3, q4, q12 @ mul add\n" \ "vmla.f32 q3, q4, q12 @ mul add\n" \
"subs %[cnt], #1 @ sub loop count \n" \ "subs %[cnt], #1 @ sub loop count \n" \
......
...@@ -106,7 +106,7 @@ inline void read_from_file(lite::Tensor* t, const std::string& path) { ...@@ -106,7 +106,7 @@ inline void read_from_file(lite::Tensor* t, const std::string& path) {
inline void save_float(float* data, const std::string& name, int len) { inline void save_float(float* data, const std::string& name, int len) {
static int counter = 0; static int counter = 0;
std::string old_string = std::to_string(counter); std::string old_string = paddle::lite::to_string(counter);
std::string new_string = std::string new_string =
std::string(3 - old_string.length(), '0') + old_string; std::string(3 - old_string.length(), '0') + old_string;
......
...@@ -351,10 +351,10 @@ class Tensor { ...@@ -351,10 +351,10 @@ class Tensor {
void printScale(std::string type) { printScale(); } void printScale(std::string type) { printScale(); }
std::string dimsFileName() { std::string dimsFileName() {
return std::to_string(shape_->num()) + "_" + return paddle::lite::to_string(shape_->num()) + "_" +
std::to_string(shape_->channel()) + "_" + paddle::lite::to_string(shape_->channel()) + "_" +
std::to_string(shape_->height()) + "_" + paddle::lite::to_string(shape_->height()) + "_" +
std::to_string(shape_->width()) + ".txt"; paddle::lite::to_string(shape_->width()) + ".txt";
} }
void saveToFile() { std::string path = dimsFileName(); } void saveToFile() { std::string path = dimsFileName(); }
...@@ -374,7 +374,7 @@ class Tensor { ...@@ -374,7 +374,7 @@ class Tensor {
invalidate(); invalidate();
std::ofstream ofs; std::ofstream ofs;
static int counter = 0; static int counter = 0;
std::string npath = std::to_string(counter) + "_" + path; std::string npath = paddle::lite::to_string(counter) + "_" + path;
counter++; counter++;
save_file_with_name(npath); save_file_with_name(npath);
} }
......
...@@ -19,8 +19,8 @@ namespace paddle { ...@@ -19,8 +19,8 @@ namespace paddle {
namespace lite { namespace lite {
namespace npu { namespace npu {
std::unique_ptr<hiai::AiModelMngerClient> Device::Build( std::shared_ptr<hiai::AiModelMngerClient> Device::Build(
std::string& model_name, // NOLINT const std::string model_name, // NOLINT
std::vector<ge::Operator>& input_nodes, // NOLINT std::vector<ge::Operator>& input_nodes, // NOLINT
std::vector<ge::Operator>& output_nodes // NOLINT std::vector<ge::Operator>& output_nodes // NOLINT
) { ) {
...@@ -41,15 +41,15 @@ std::unique_ptr<hiai::AiModelMngerClient> Device::Build( ...@@ -41,15 +41,15 @@ std::unique_ptr<hiai::AiModelMngerClient> Device::Build(
ir_build.ReleaseModelBuff(om_model_buf); ir_build.ReleaseModelBuff(om_model_buf);
return nullptr; return nullptr;
} }
// Create a HiAI model manager client to load the HiAI om model // Create a HiAI model manager client to load the HiAI om model
std::unique_ptr<hiai::AiModelMngerClient> model_client( std::shared_ptr<hiai::AiModelMngerClient> model_client(
new hiai::AiModelMngerClient()); new hiai::AiModelMngerClient());
if (model_client->Init(nullptr) != hiai::AI_SUCCESS) { if (model_client->Init(nullptr) != hiai::AI_SUCCESS) {
LOG(WARNING) << "[NPU] AiModelMngerClient init failed)!"; LOG(WARNING) << "[NPU] AiModelMngerClient init failed)!";
ir_build.ReleaseModelBuff(om_model_buf); ir_build.ReleaseModelBuff(om_model_buf);
return nullptr; return nullptr;
} }
model_name = "model_" + std::to_string(model_count_++) + ".om";
auto model_desc = std::make_shared<hiai::AiModelDescription>( auto model_desc = std::make_shared<hiai::AiModelDescription>(
model_name, freq_level(), framework_type(), model_type(), device_type()); model_name, freq_level(), framework_type(), model_type(), device_type());
model_desc->SetModelBuffer(om_model_buf.data, om_model_buf.length); model_desc->SetModelBuffer(om_model_buf.data, om_model_buf.length);
......
...@@ -40,8 +40,8 @@ class Device { ...@@ -40,8 +40,8 @@ class Device {
// Build the HiAI IR graph to om model, return HiAI model manager client to // Build the HiAI IR graph to om model, return HiAI model manager client to
// load om model and run inference. // load om model and run inference.
std::unique_ptr<hiai::AiModelMngerClient> Build( std::shared_ptr<hiai::AiModelMngerClient> Build(
std::string& model_name, // NOLINT const std::string model_name, // NOLINT
std::vector<ge::Operator>& input_nodes, // NOLINT std::vector<ge::Operator>& input_nodes, // NOLINT
std::vector<ge::Operator>& output_nodes // NOLINT std::vector<ge::Operator>& output_nodes // NOLINT
); // NOLINT ); // NOLINT
...@@ -51,7 +51,6 @@ class Device { ...@@ -51,7 +51,6 @@ class Device {
int framework_type_{0}; int framework_type_{0};
int model_type_{0}; int model_type_{0};
int device_type_{0}; int device_type_{0};
int model_count_{0};
}; };
} // namespace npu } // namespace npu
......
...@@ -13,6 +13,5 @@ lite_cc_library(cl_image SRCS cl_image.cc DEPS tensor cl_image_converter cl_runt ...@@ -13,6 +13,5 @@ lite_cc_library(cl_image SRCS cl_image.cc DEPS tensor cl_image_converter cl_runt
lite_cc_library(cl_caller SRCS cl_caller.cc DEPS cl_context cl_image) lite_cc_library(cl_caller SRCS cl_caller.cc DEPS cl_context cl_image)
lite_cc_library(cl_target_wrapper SRCS target_wrapper.cc DEPS cl_runtime) lite_cc_library(cl_target_wrapper SRCS target_wrapper.cc DEPS cl_runtime)
lite_cc_test(test_cl_functions SRCS cl_functions_test.cc DEPS cl_context cl_image cl_caller cl_wrapper cl_target_wrapper) lite_cc_test(test_cl_functions SRCS cl_functions_test.cc DEPS cl_context cl_image cl_caller cl_wrapper cl_target_wrapper)
lite_cc_test(test_cl_im2col SRCS cl_im2col_test.cc DEPS tensor cl_context cl_wrapper cl_target_wrapper)
add_dependencies(cl_wrapper opencl_clhpp) add_dependencies(cl_wrapper opencl_clhpp)
#include <cl_common.h> #include <cl_common.h>
__kernel void conv2d_1x1(__private const int global_size_dim0, __kernel void conv2d_1x1_opt(__private const int global_size_dim0,
__private const int global_size_dim1, __private const int global_size_dim1,
__private const int global_size_dim2, __private const int global_size_dim2,
__read_only image2d_t input_image, __read_only image2d_t input_image,
......
...@@ -26,6 +26,7 @@ __kernel void conv2d_3x3_opt(__private const int item_ch, ...@@ -26,6 +26,7 @@ __kernel void conv2d_3x3_opt(__private const int item_ch,
__private const int stride, __private const int stride,
__private const int pad, __private const int pad,
__private const int dilation, __private const int dilation,
__private const int batch,
__private const int in_ch, __private const int in_ch,
__private const int in_w, __private const int in_w,
__private const int in_h, __private const int in_h,
...@@ -60,7 +61,8 @@ __kernel void conv2d_3x3_opt(__private const int item_ch, ...@@ -60,7 +61,8 @@ __kernel void conv2d_3x3_opt(__private const int item_ch,
#ifdef BIASE_CH #ifdef BIASE_CH
CL_DTYPE4 output[5]; CL_DTYPE4 output[5];
output[0] = READ_IMG_TYPE(CL_DTYPE_CHAR, bias, sampler, (int2)(item_ch_id, 0)); output[0] =
READ_IMG_TYPE(CL_DTYPE_CHAR, bias, sampler, (int2)(item_ch_id, 0));
output[1] = output[0]; output[1] = output[0];
output[2] = output[0]; output[2] = output[0];
output[3] = output[0]; output[3] = output[0];
...@@ -69,22 +71,32 @@ __kernel void conv2d_3x3_opt(__private const int item_ch, ...@@ -69,22 +71,32 @@ __kernel void conv2d_3x3_opt(__private const int item_ch,
#elif defined(BIASE_ELE) #elif defined(BIASE_ELE)
CL_DTYPE4 output[5]; CL_DTYPE4 output[5];
output[0] = output[0] = READ_IMG_TYPE(CL_DTYPE_CHAR,
READ_IMG_TYPE(CL_DTYPE_CHAR, bias, sampler, (int2)(out_w_base_id + out_w_id0, item_h_id)); bias,
sampler,
(int2)(out_w_base_id + out_w_id0, item_h_id));
if (out_w_id1 < out_w) { if (out_w_id1 < out_w) {
output[1] = READ_IMG_TYPE(CL_DTYPE_CHAR, bias, sampler, output[1] = READ_IMG_TYPE(CL_DTYPE_CHAR,
bias,
sampler,
(int2)(out_w_base_id + out_w_id1, item_h_id)); (int2)(out_w_base_id + out_w_id1, item_h_id));
} }
if (out_w_id2 < out_w) { if (out_w_id2 < out_w) {
output[2] = READ_IMG_TYPE(CL_DTYPE_CHAR, bias, sampler, output[2] = READ_IMG_TYPE(CL_DTYPE_CHAR,
bias,
sampler,
(int2)(out_w_base_id + out_w_id2, item_h_id)); (int2)(out_w_base_id + out_w_id2, item_h_id));
} }
if (out_w_id3 < out_w) { if (out_w_id3 < out_w) {
output[3] = READ_IMG_TYPE(CL_DTYPE_CHAR, bias, sampler, output[3] = READ_IMG_TYPE(CL_DTYPE_CHAR,
bias,
sampler,
(int2)(out_w_base_id + out_w_id3, item_h_id)); (int2)(out_w_base_id + out_w_id3, item_h_id));
} }
if (out_w_id4 < out_w) { if (out_w_id4 < out_w) {
output[4] = READ_IMG_TYPE(CL_DTYPE_CHAR, bias, sampler, output[4] = READ_IMG_TYPE(CL_DTYPE_CHAR,
bias,
sampler,
(int2)(out_w_base_id + out_w_id4, item_h_id)); (int2)(out_w_base_id + out_w_id4, item_h_id));
} }
#else #else
...@@ -108,54 +120,76 @@ __kernel void conv2d_3x3_opt(__private const int item_ch, ...@@ -108,54 +120,76 @@ __kernel void conv2d_3x3_opt(__private const int item_ch,
int filter_w_val = ch * 3; int filter_w_val = ch * 3;
for (int h = 0; h < 3; h++) { for (int h = 0; h < 3; h++) {
int in_h_val = select(out_batch_id * in_h + in_h_id + h, -1, int in_h_val = select(out_batch_id * in_h + in_h_id + h,
-1,
(out_batch_id * in_h + in_h_id + h < 0 || (out_batch_id * in_h + in_h_id + h < 0 ||
out_batch_id * in_h + in_h_id + h >= in_h)); out_batch_id * in_h + in_h_id + h >= in_h));
for (int w = 0; w < 3; w++) { for (int w = 0; w < 3; w++) {
int in_w_val0 = select(in_w_base_id + in_w_id0 + w, -1, int in_w_val0 = select(in_w_base_id + in_w_id0 + w,
-1,
(in_w_id0 + w < 0 || in_w_id0 + w >= in_w)); (in_w_id0 + w < 0 || in_w_id0 + w >= in_w));
int in_w_val1 = select(in_w_base_id + in_w_id1 + w, -1, int in_w_val1 = select(in_w_base_id + in_w_id1 + w,
-1,
(in_w_id1 + w < 0 || in_w_id1 + w >= in_w)); (in_w_id1 + w < 0 || in_w_id1 + w >= in_w));
int in_w_val2 = select(in_w_base_id + in_w_id2 + w, -1, int in_w_val2 = select(in_w_base_id + in_w_id2 + w,
-1,
(in_w_id2 + w < 0 || in_w_id2 + w >= in_w)); (in_w_id2 + w < 0 || in_w_id2 + w >= in_w));
int in_w_val3 = select(in_w_base_id + in_w_id3 + w, -1, int in_w_val3 = select(in_w_base_id + in_w_id3 + w,
-1,
(in_w_id3 + w < 0 || in_w_id3 + w >= in_w)); (in_w_id3 + w < 0 || in_w_id3 + w >= in_w));
int in_w_val4 = select(in_w_base_id + in_w_id4 + w, -1, int in_w_val4 = select(in_w_base_id + in_w_id4 + w,
-1,
(in_w_id4 + w < 0 || in_w_id4 + w >= in_w)); (in_w_id4 + w < 0 || in_w_id4 + w >= in_w));
filter[0] = READ_IMG_TYPE(CL_DTYPE_CHAR, filter[0] = READ_IMG_TYPE(
filter_image, sampler, CL_DTYPE_CHAR,
filter_image,
sampler,
(int2)(filter_w_val + w, filter_h_val0 + h)); // in_ch:0-3,out_ch:0 (int2)(filter_w_val + w, filter_h_val0 + h)); // in_ch:0-3,out_ch:0
filter[1] = READ_IMG_TYPE(CL_DTYPE_CHAR, filter[1] = READ_IMG_TYPE(
filter_image, sampler, CL_DTYPE_CHAR,
filter_image,
sampler,
(int2)(filter_w_val + w, filter_h_val1 + h)); // in_ch:0-3,out_ch:1 (int2)(filter_w_val + w, filter_h_val1 + h)); // in_ch:0-3,out_ch:1
filter[2] = READ_IMG_TYPE(CL_DTYPE_CHAR, filter[2] = READ_IMG_TYPE(
filter_image, sampler, CL_DTYPE_CHAR,
filter_image,
sampler,
(int2)(filter_w_val + w, filter_h_val2 + h)); // in_ch:0-3,out_ch:2 (int2)(filter_w_val + w, filter_h_val2 + h)); // in_ch:0-3,out_ch:2
filter[3] = READ_IMG_TYPE(CL_DTYPE_CHAR, filter[3] = READ_IMG_TYPE(
filter_image, sampler, CL_DTYPE_CHAR,
filter_image,
sampler,
(int2)(filter_w_val + w, filter_h_val3 + h)); // in_ch:0-3,out_ch:3 (int2)(filter_w_val + w, filter_h_val3 + h)); // in_ch:0-3,out_ch:3
filter_trans[0] = (CL_DTYPE4)(filter[0].x, filter[1].x, filter[2].x, filter_trans[0] = (CL_DTYPE4)(filter[0].x,
filter[1].x,
filter[2].x,
filter[3].x); // in_ch:0,out_ch:0-3 filter[3].x); // in_ch:0,out_ch:0-3
filter_trans[1] = (CL_DTYPE4)(filter[0].y, filter[1].y, filter[2].y, filter_trans[1] = (CL_DTYPE4)(filter[0].y,
filter[1].y,
filter[2].y,
filter[3].y); // in_ch:1,out_ch:0-3 filter[3].y); // in_ch:1,out_ch:0-3
filter_trans[2] = (CL_DTYPE4)(filter[0].z, filter[1].z, filter[2].z, filter_trans[2] = (CL_DTYPE4)(filter[0].z,
filter[1].z,
filter[2].z,
filter[3].z); // in_ch:2,out_ch:0-3 filter[3].z); // in_ch:2,out_ch:0-3
filter_trans[3] = (CL_DTYPE4)(filter[0].w, filter[1].w, filter[2].w, filter_trans[3] = (CL_DTYPE4)(filter[0].w,
filter[1].w,
filter[2].w,
filter[3].w); // in_ch:3,out_ch:0-3 filter[3].w); // in_ch:3,out_ch:0-3
input[0] = input[0] = READ_IMG_TYPE(
READ_IMG_TYPE(CL_DTYPE_CHAR, input_image, sampler, (int2)(in_w_val0, in_h_val)); CL_DTYPE_CHAR, input_image, sampler, (int2)(in_w_val0, in_h_val));
input[1] = input[1] = READ_IMG_TYPE(
READ_IMG_TYPE(CL_DTYPE_CHAR, input_image, sampler, (int2)(in_w_val1, in_h_val)); CL_DTYPE_CHAR, input_image, sampler, (int2)(in_w_val1, in_h_val));
input[2] = input[2] = READ_IMG_TYPE(
READ_IMG_TYPE(CL_DTYPE_CHAR, input_image, sampler, (int2)(in_w_val2, in_h_val)); CL_DTYPE_CHAR, input_image, sampler, (int2)(in_w_val2, in_h_val));
input[3] = input[3] = READ_IMG_TYPE(
READ_IMG_TYPE(CL_DTYPE_CHAR, input_image, sampler, (int2)(in_w_val3, in_h_val)); CL_DTYPE_CHAR, input_image, sampler, (int2)(in_w_val3, in_h_val));
input[4] = input[4] = READ_IMG_TYPE(
READ_IMG_TYPE(CL_DTYPE_CHAR, input_image, sampler, (int2)(in_w_val4, in_h_val)); CL_DTYPE_CHAR, input_image, sampler, (int2)(in_w_val4, in_h_val));
output[0] = mad(input[0].x, filter_trans[0], output[0]); output[0] = mad(input[0].x, filter_trans[0], output[0]);
output[1] = mad(input[1].x, filter_trans[0], output[1]); output[1] = mad(input[1].x, filter_trans[0], output[1]);
...@@ -194,23 +228,278 @@ __kernel void conv2d_3x3_opt(__private const int item_ch, ...@@ -194,23 +228,278 @@ __kernel void conv2d_3x3_opt(__private const int item_ch,
output[3] = activation_type4(output[3]); output[3] = activation_type4(output[3]);
output[4] = activation_type4(output[4]); output[4] = activation_type4(output[4]);
WRITE_IMG_TYPE(CL_DTYPE_CHAR, output_image, (int2)(out_w_base_id + out_w_id0, item_h_id), WRITE_IMG_TYPE(CL_DTYPE_CHAR,
output_image,
(int2)(out_w_base_id + out_w_id0, item_h_id),
output[0]); output[0]);
if (out_w_id1 < out_w) { if (out_w_id1 < out_w) {
WRITE_IMG_TYPE(CL_DTYPE_CHAR, output_image, (int2)(out_w_base_id + out_w_id1, item_h_id), WRITE_IMG_TYPE(CL_DTYPE_CHAR,
output_image,
(int2)(out_w_base_id + out_w_id1, item_h_id),
output[1]); output[1]);
} }
if (out_w_id2 < out_w) { if (out_w_id2 < out_w) {
WRITE_IMG_TYPE(CL_DTYPE_CHAR, output_image, (int2)(out_w_base_id + out_w_id2, item_h_id), WRITE_IMG_TYPE(CL_DTYPE_CHAR,
output_image,
(int2)(out_w_base_id + out_w_id2, item_h_id),
output[2]); output[2]);
} }
if (out_w_id3 < out_w) { if (out_w_id3 < out_w) {
WRITE_IMG_TYPE(CL_DTYPE_CHAR, output_image, (int2)(out_w_base_id + out_w_id3, item_h_id), WRITE_IMG_TYPE(CL_DTYPE_CHAR,
output_image,
(int2)(out_w_base_id + out_w_id3, item_h_id),
output[3]); output[3]);
} }
if (out_w_id4 < out_w) { if (out_w_id4 < out_w) {
WRITE_IMG_TYPE(CL_DTYPE_CHAR, output_image, (int2)(out_w_base_id + out_w_id4, item_h_id), WRITE_IMG_TYPE(CL_DTYPE_CHAR,
output_image,
(int2)(out_w_base_id + out_w_id4, item_h_id),
output[4]); output[4]);
} }
} }
// support batch > 1
__kernel void conv2d_3x3_multi_batch(__private const int item_ch,
__private const int item_w,
__private const int item_h,
__read_only image2d_t input_image,
__read_only image2d_t filter_image,
#if defined(BIASE_CH) || defined(BIASE_ELE)
__read_only image2d_t bias,
#endif
__write_only image2d_t output_image,
__private const int stride,
__private const int pad,
__private const int dilation,
__private const int batch,
__private const int in_ch,
__private const int in_w,
__private const int in_h,
__private const int out_w,
__private const int out_h) {
const sampler_t sampler =
CLK_NORMALIZED_COORDS_TRUE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;
// item_id
const int item_ch_id = get_global_id(0);
const int item_w_id = get_global_id(1);
const int item_h_id = get_global_id(2);
// out_width_id_per_blk and out_batch_id
int out_batch_id = item_h_id / in_h;
int out_w_base_id = item_ch_id * out_w;
int out_w_id0 = item_w_id;
int out_w_id1 = out_w_id0 + item_w;
int out_w_id2 = out_w_id1 + item_w;
int out_w_id3 = out_w_id2 + item_w;
int out_w_id4 = out_w_id3 + item_w;
// in_width_id_per_blk and in_height_id_per_batch
int in_h_id = (item_h_id % out_h) * stride - pad;
int in_w_id0 = item_w_id * stride - pad;
int in_w_id1 = in_w_id0 + item_w * stride;
int in_w_id2 = in_w_id1 + item_w * stride;
int in_w_id3 = in_w_id2 + item_w * stride;
int in_w_id4 = in_w_id3 + item_w * stride;
#ifdef BIASE_CH
CL_DTYPE4 output[5];
output[0] =
READ_IMG_TYPE(CL_DTYPE_CHAR, bias, sampler, (int2)(item_ch_id, 0));
output[1] = output[0];
output[2] = output[0];
output[3] = output[0];
output[4] = output[0];
#elif defined(BIASE_ELE)
CL_DTYPE4 output[5];
output[0] = READ_IMG_TYPE(CL_DTYPE_CHAR,
bias,
sampler,
(int2)(out_w_base_id + out_w_id0, item_h_id));
if (out_w_id1 < out_w) {
output[1] = READ_IMG_TYPE(CL_DTYPE_CHAR,
bias,
sampler,
(int2)(out_w_base_id + out_w_id1, item_h_id));
}
if (out_w_id2 < out_w) {
output[2] = READ_IMG_TYPE(CL_DTYPE_CHAR,
bias,
sampler,
(int2)(out_w_base_id + out_w_id2, item_h_id));
}
if (out_w_id3 < out_w) {
output[3] = READ_IMG_TYPE(CL_DTYPE_CHAR,
bias,
sampler,
(int2)(out_w_base_id + out_w_id3, item_h_id));
}
if (out_w_id4 < out_w) {
output[4] = READ_IMG_TYPE(CL_DTYPE_CHAR,
bias,
sampler,
(int2)(out_w_base_id + out_w_id4, item_h_id));
}
#else
CL_DTYPE4 output[5] = {0.0f};
#endif
CL_DTYPE4 filter[4] = {0.0f};
CL_DTYPE4 filter_trans[4] = {0.0f};
CL_DTYPE4 input[5] = {0.0f};
int filter_h_val0 = item_ch_id * 4 * 3;
int filter_h_val1 = filter_h_val0 + 3;
int filter_h_val2 = filter_h_val1 + 3;
int filter_h_val3 = filter_h_val2 + 3;
for (int ch = 0; ch < (in_ch + 3) / 4; ch++) {
int ch_surplus = (ch + 1) * 4 - in_ch > 0 ? (ch + 1) * 4 - in_ch : 0;
const int in_w_base_id = mul24(ch, in_w);
int filter_w_val = ch * 3;
for (int h = 0; h < 3; h++) {
int in_h_val = select(
out_batch_id * in_h + in_h_id + h,
-1,
(out_batch_id * in_h + in_h_id + h < out_batch_id * in_h ||
out_batch_id * in_h + in_h_id + h >= (out_batch_id + 1) * in_h));
for (int w = 0; w < 3; w++) {
int in_w_val0 = select(in_w_base_id + in_w_id0 + w,
-1,
(in_w_id0 + w < 0 || in_w_id0 + w >= in_w));
int in_w_val1 = select(in_w_base_id + in_w_id1 + w,
-1,
(in_w_id1 + w < 0 || in_w_id1 + w >= in_w));
int in_w_val2 = select(in_w_base_id + in_w_id2 + w,
-1,
(in_w_id2 + w < 0 || in_w_id2 + w >= in_w));
int in_w_val3 = select(in_w_base_id + in_w_id3 + w,
-1,
(in_w_id3 + w < 0 || in_w_id3 + w >= in_w));
int in_w_val4 = select(in_w_base_id + in_w_id4 + w,
-1,
(in_w_id4 + w < 0 || in_w_id4 + w >= in_w));
filter[0] = READ_IMG_TYPE(
CL_DTYPE_CHAR,
filter_image,
sampler,
(int2)(filter_w_val + w, filter_h_val0 + h)); // in_ch:0-3,out_ch:0
filter[1] = READ_IMG_TYPE(
CL_DTYPE_CHAR,
filter_image,
sampler,
(int2)(filter_w_val + w, filter_h_val1 + h)); // in_ch:0-3,out_ch:1
filter[2] = READ_IMG_TYPE(
CL_DTYPE_CHAR,
filter_image,
sampler,
(int2)(filter_w_val + w, filter_h_val2 + h)); // in_ch:0-3,out_ch:2
filter[3] = READ_IMG_TYPE(
CL_DTYPE_CHAR,
filter_image,
sampler,
(int2)(filter_w_val + w, filter_h_val3 + h)); // in_ch:0-3,out_ch:3
filter_trans[0] = (CL_DTYPE4)(filter[0].x,
filter[1].x,
filter[2].x,
filter[3].x); // in_ch:0,out_ch:0-3
filter_trans[1] = (CL_DTYPE4)(filter[0].y,
filter[1].y,
filter[2].y,
filter[3].y); // in_ch:1,out_ch:0-3
filter_trans[2] = (CL_DTYPE4)(filter[0].z,
filter[1].z,
filter[2].z,
filter[3].z); // in_ch:2,out_ch:0-3
filter_trans[3] = (CL_DTYPE4)(filter[0].w,
filter[1].w,
filter[2].w,
filter[3].w); // in_ch:3,out_ch:0-3
input[0] = READ_IMG_TYPE(
CL_DTYPE_CHAR, input_image, sampler, (int2)(in_w_val0, in_h_val));
input[1] = READ_IMG_TYPE(
CL_DTYPE_CHAR, input_image, sampler, (int2)(in_w_val1, in_h_val));
input[2] = READ_IMG_TYPE(
CL_DTYPE_CHAR, input_image, sampler, (int2)(in_w_val2, in_h_val));
input[3] = READ_IMG_TYPE(
CL_DTYPE_CHAR, input_image, sampler, (int2)(in_w_val3, in_h_val));
input[4] = READ_IMG_TYPE(
CL_DTYPE_CHAR, input_image, sampler, (int2)(in_w_val4, in_h_val));
output[0] = mad(input[0].x, filter_trans[0], output[0]);
output[1] = mad(input[1].x, filter_trans[0], output[1]);
output[2] = mad(input[2].x, filter_trans[0], output[2]);
output[3] = mad(input[3].x, filter_trans[0], output[3]);
output[4] = mad(input[4].x, filter_trans[0], output[4]);
if (ch_surplus < 3) {
output[0] = mad(input[0].y, filter_trans[1], output[0]);
output[1] = mad(input[1].y, filter_trans[1], output[1]);
output[2] = mad(input[2].y, filter_trans[1], output[2]);
output[3] = mad(input[3].y, filter_trans[1], output[3]);
output[4] = mad(input[4].y, filter_trans[1], output[4]);
}
if (ch_surplus < 2) {
output[0] = mad(input[0].z, filter_trans[2], output[0]);
output[1] = mad(input[1].z, filter_trans[2], output[1]);
output[2] = mad(input[2].z, filter_trans[2], output[2]);
output[3] = mad(input[3].z, filter_trans[2], output[3]);
output[4] = mad(input[4].z, filter_trans[2], output[4]);
}
if (ch_surplus < 1) {
output[0] = mad(input[0].w, filter_trans[3], output[0]);
output[1] = mad(input[1].w, filter_trans[3], output[1]);
output[2] = mad(input[2].w, filter_trans[3], output[2]);
output[3] = mad(input[3].w, filter_trans[3], output[3]);
output[4] = mad(input[4].w, filter_trans[3], output[4]);
}
}
}
}
output[0] = activation_type4(output[0]);
output[1] = activation_type4(output[1]);
output[2] = activation_type4(output[2]);
output[3] = activation_type4(output[3]);
output[4] = activation_type4(output[4]);
WRITE_IMG_TYPE(CL_DTYPE_CHAR,
output_image,
(int2)(out_w_base_id + out_w_id0, item_h_id),
output[0]);
if (out_w_id1 < out_w) {
WRITE_IMG_TYPE(CL_DTYPE_CHAR,
output_image,
(int2)(out_w_base_id + out_w_id1, item_h_id),
output[1]);
}
if (out_w_id2 < out_w) {
WRITE_IMG_TYPE(CL_DTYPE_CHAR,
output_image,
(int2)(out_w_base_id + out_w_id2, item_h_id),
output[2]);
}
if (out_w_id3 < out_w) {
WRITE_IMG_TYPE(CL_DTYPE_CHAR,
output_image,
(int2)(out_w_base_id + out_w_id3, item_h_id),
output[3]);
}
if (out_w_id4 < out_w) {
WRITE_IMG_TYPE(CL_DTYPE_CHAR,
output_image,
(int2)(out_w_base_id + out_w_id4, item_h_id),
output[4]);
}
}
...@@ -17,6 +17,7 @@ ...@@ -17,6 +17,7 @@
#include <string> #include <string>
#include "glog/logging.h" #include "glog/logging.h"
#include "lite/backends/x86/jit/gen/jitcode.h" #include "lite/backends/x86/jit/gen/jitcode.h"
#include "lite/utils/string.h"
namespace paddle { namespace paddle {
namespace lite { namespace lite {
...@@ -64,7 +65,7 @@ class VXXJitCode : public JitCode { ...@@ -64,7 +65,7 @@ class VXXJitCode : public JitCode {
base += "_Vec"; base += "_Vec";
} }
base += (with_relu_ ? "_Relu" : ""); base += (with_relu_ ? "_Relu" : "");
base += "_D" + std::to_string(num_); base += "_D" + paddle::lite::to_string(num_);
return base; return base;
} }
void genCode() override; void genCode() override;
......
...@@ -47,7 +47,7 @@ class EmbSeqPoolJitCode : public JitCode { ...@@ -47,7 +47,7 @@ class EmbSeqPoolJitCode : public JitCode {
} else if (type_ == SeqPoolType::kSqrt) { } else if (type_ == SeqPoolType::kSqrt) {
base += "_Sqrt"; base += "_Sqrt";
} }
base += ("_W" + std::to_string(tbl_w_)); base += ("_W" + paddle::lite::to_string(tbl_w_));
return base; return base;
} }
void genCode() override; void genCode() override;
......
...@@ -38,8 +38,8 @@ class MatMulJitCode : public JitCode { ...@@ -38,8 +38,8 @@ class MatMulJitCode : public JitCode {
std::string name() const override { std::string name() const override {
std::string base = "MatMulJitCode"; std::string base = "MatMulJitCode";
base = base + "_M" + std::to_string(m_) + "_N" + std::to_string(n_) + "_K" + base = base + "_M" + paddle::lite::to_string(m_) + "_N" +
std::to_string(k_); paddle::lite::to_string(n_) + "_K" + paddle::lite::to_string(k_);
return base; return base;
} }
void genCode() override; void genCode() override;
......
...@@ -47,7 +47,7 @@ class SeqPoolJitCode : public JitCode { ...@@ -47,7 +47,7 @@ class SeqPoolJitCode : public JitCode {
} else if (type_ == SeqPoolType::kSqrt) { } else if (type_ == SeqPoolType::kSqrt) {
base += "_Sqrt"; base += "_Sqrt";
} }
base += ("_W" + std::to_string(w_)); base += ("_W" + paddle::lite::to_string(w_));
return base; return base;
} }
void genCode() override; void genCode() override;
......
...@@ -94,9 +94,13 @@ add_custom_command( ...@@ -94,9 +94,13 @@ add_custom_command(
OUTPUT ops.h # not a real path to the output to force it execute every time. OUTPUT ops.h # not a real path to the output to force it execute every time.
) )
# generate fake kernels for memory_optimize_tool # generate fake kernels for memory_optimize_tool
#-------------------------------opt----------------------------------------------------------------
# tricks to create header files for opt
add_custom_command( add_custom_command(
COMMAND python ${CMAKE_SOURCE_DIR}/lite/tools/cmake_tools/create_fake_kernel_registry.py COMMAND python ${CMAKE_SOURCE_DIR}/lite/tools/cmake_tools/create_fake_kernel_registry.py
${kernels_src_list} ${kernels_src_list}
${fake_kernels_src_list}
${CMAKE_BINARY_DIR}/all_kernel_faked.cc ${CMAKE_BINARY_DIR}/all_kernel_faked.cc
${CMAKE_BINARY_DIR}/kernel_src_map.h ${CMAKE_BINARY_DIR}/kernel_src_map.h
OUTPUT all_kernel_faked.cc # not a real path to the output to force it execute every time. OUTPUT all_kernel_faked.cc # not a real path to the output to force it execute every time.
...@@ -104,12 +108,12 @@ add_custom_command( ...@@ -104,12 +108,12 @@ add_custom_command(
add_custom_target(op_list_h DEPENDS ops.h) add_custom_target(op_list_h DEPENDS ops.h)
add_custom_target(kernel_list_h DEPENDS kernels.h) add_custom_target(kernel_list_h DEPENDS kernels.h)
add_custom_target(all_kernel_faked_cc DEPENDS all_kernel_faked.cc) add_custom_target(all_kernel_faked_cc DEPENDS all_kernel_faked.cc)
#add_custom_target(opencl_kernels_source_cc DEPENDS opencl_kernels_source.cc)
# create header file to store ops info sorted by supported platforms # create header file to store ops info sorted by supported platforms
add_custom_command( add_custom_command(
COMMAND python ${CMAKE_SOURCE_DIR}/lite/tools/cmake_tools/record_supported_kernel_op.py COMMAND python ${CMAKE_SOURCE_DIR}/lite/tools/cmake_tools/record_supported_kernel_op.py
${kernels_src_list} ${kernels_src_list}
${fake_kernels_src_list}
${ops_src_list} ${ops_src_list}
${CMAKE_BINARY_DIR}/supported_kernel_op_info.h ${CMAKE_BINARY_DIR}/supported_kernel_op_info.h
OUTPUT supported_kernel_op_info.h # not a real path to the output to force it execute every time. OUTPUT supported_kernel_op_info.h # not a real path to the output to force it execute every time.
......
...@@ -490,7 +490,7 @@ class ContextScheduler { ...@@ -490,7 +490,7 @@ class ContextScheduler {
} break; } break;
#endif #endif
default: default:
#ifndef LITE_ON_MODEL_OPTIMIZE_TOOL #if (!defined LITE_ON_MODEL_OPTIMIZE_TOOL) && (!defined LITE_WITH_PYTHON)
LOG(FATAL) << "unsupported target " << TargetToStr(target); LOG(FATAL) << "unsupported target " << TargetToStr(target);
#endif #endif
break; break;
......
...@@ -48,13 +48,16 @@ std::string Visualize(mir::SSAGraph* graph) { ...@@ -48,13 +48,16 @@ std::string Visualize(mir::SSAGraph* graph) {
auto attr_type = op_info->GetAttrType(attr_name); auto attr_type = op_info->GetAttrType(attr_name);
switch (attr_type) { switch (attr_type) {
case AttrType::INT: case AttrType::INT:
os << ":int:" << std::to_string(op_info->GetAttr<int>(attr_name)); os << ":int:"
<< paddle::lite::to_string(op_info->GetAttr<int>(attr_name));
break; break;
case AttrType::FLOAT: case AttrType::FLOAT:
os << ":float:" << std::to_string(op_info->GetAttr<float>(attr_name)); os << ":float:"
<< paddle::lite::to_string(op_info->GetAttr<float>(attr_name));
break; break;
case AttrType::BOOLEAN: case AttrType::BOOLEAN:
os << ":int:" << std::to_string(op_info->GetAttr<bool>(attr_name)); os << ":int:"
<< paddle::lite::to_string(op_info->GetAttr<bool>(attr_name));
break; break;
case AttrType::STRING: case AttrType::STRING:
os << ":string: \"" os << ":string: \""
......
...@@ -123,7 +123,8 @@ void MemoryOptimizePass::CollectLifeCycleByDevice( ...@@ -123,7 +123,8 @@ void MemoryOptimizePass::CollectLifeCycleByDevice(
// non-tensor(like tensor_array) variables will not be reused // non-tensor(like tensor_array) variables will not be reused
for (auto& node : graph->nodes()) { for (auto& node : graph->nodes()) {
if (node.IsArg() && !node.arg()->type->IsTensor()) { if (node.IsArg() && (node.arg()->type != nullptr) &&
!node.arg()->type->IsTensor()) {
invalid_var_names.insert(node.arg()->name); invalid_var_names.insert(node.arg()->name);
} }
} }
...@@ -237,7 +238,7 @@ void MemoryOptimizePass::PerformReusePlan( ...@@ -237,7 +238,7 @@ void MemoryOptimizePass::PerformReusePlan(
if (reuse_table.count(name) && reuse_table.at(name) != name) { if (reuse_table.count(name) && reuse_table.at(name) != name) {
auto replace_name = reuse_table.at(name); auto replace_name = reuse_table.at(name);
input_node->AsArg().name = input_node->AsArg().name =
replace_name + "(" + std::to_string(node_append_idx) + ")"; replace_name + "(" + paddle::lite::to_string(node_append_idx) + ")";
node_append_idx++; node_append_idx++;
} }
} }
...@@ -261,7 +262,7 @@ void MemoryOptimizePass::PerformReusePlan( ...@@ -261,7 +262,7 @@ void MemoryOptimizePass::PerformReusePlan(
if (reuse_table.count(name) && reuse_table.at(name) != name) { if (reuse_table.count(name) && reuse_table.at(name) != name) {
auto replace_name = reuse_table.at(name); auto replace_name = reuse_table.at(name);
out_node->AsArg().name = out_node->AsArg().name =
replace_name + "(" + std::to_string(node_append_idx) + ")"; replace_name + "(" + paddle::lite::to_string(node_append_idx) + ")";
node_append_idx++; node_append_idx++;
} }
} }
......
...@@ -85,7 +85,7 @@ class Node { ...@@ -85,7 +85,7 @@ class Node {
struct Arg { struct Arg {
std::string name; std::string name;
int id{0}; int id{0};
const Type* type{}; const Type* type{nullptr};
// Weight is a special kind of argument, it is marked as weight explicitly // Weight is a special kind of argument, it is marked as weight explicitly
// so that some weight related optimization can take place. // so that some weight related optimization can take place.
bool is_weight{false}; bool is_weight{false};
......
...@@ -58,6 +58,11 @@ void QuantizedOpAttributesInferencePass::Apply( ...@@ -58,6 +58,11 @@ void QuantizedOpAttributesInferencePass::Apply(
} }
if (found) { if (found) {
inst.mutable_op_info()->SetAttr("output_scale", output_scale); inst.mutable_op_info()->SetAttr("output_scale", output_scale);
} else if (op_info->HasAttr("output_scale")) {
int bit_length = op_info->GetAttr<int>("bit_length");
int range = (1 << (bit_length - 1)) - 1;
output_scale = op_info->GetAttr<float>("output_scale");
inst.mutable_op_info()->SetAttr("output_scale", output_scale / range);
} }
if (op_info->HasAttr("output_scale")) { if (op_info->HasAttr("output_scale")) {
inst.mutable_op_info()->SetAttr("enable_int8", true); inst.mutable_op_info()->SetAttr("enable_int8", true);
......
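The fallback branch added above rescales a pre-existing output_scale attribute by the quantization range derived from bit_length. A standalone sketch of that arithmetic (the attribute values are illustrative):

#include <iostream>

int main() {
  int bit_length = 8;                       // typical int8 quantization
  int range = (1 << (bit_length - 1)) - 1;  // (1 << 7) - 1 = 127
  float raw_output_scale = 25.4f;           // illustrative attribute value
  std::cout << raw_output_scale / range << std::endl;  // stored scale: 0.2
  return 0;
}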
...@@ -145,11 +145,12 @@ class StaticKernelPickPass : public mir::StmtPass { ...@@ -145,11 +145,12 @@ class StaticKernelPickPass : public mir::StmtPass {
} }
VLOG(4) << "[score(final)]:" << final_score; VLOG(4) << "[score(final)]:" << final_score;
VLOG(4) << "-------- pick summary --------"; VLOG(2) << "-------- pick summary for " << instruct.op_type()
VLOG(4) << " ===> winner_place():" << PrecisionToStr(winner_place.precision) << " --------";
VLOG(2) << " ===> winner_place():" << PrecisionToStr(winner_place.precision)
<< " " << DataLayoutToStr(winner_place.layout) << " " << " " << DataLayoutToStr(winner_place.layout) << " "
<< TargetToStr(winner_place.target); << TargetToStr(winner_place.target);
VLOG(4) << " ===> kernel.place():" VLOG(2) << " ===> kernel.place():"
<< PrecisionToStr(kernel.place().precision) << " " << PrecisionToStr(kernel.place().precision) << " "
<< DataLayoutToStr(kernel.place().layout) << " " << DataLayoutToStr(kernel.place().layout) << " "
<< TargetToStr(kernel.place().target); << TargetToStr(kernel.place().target);
......
...@@ -66,11 +66,11 @@ std::string SubgraphVisualizer::operator()() { ...@@ -66,11 +66,11 @@ std::string SubgraphVisualizer::operator()() {
} else { } else {
exists_ops[op_type]++; exists_ops[op_type]++;
} }
auto op_name = op_type + std::to_string(exists_ops[op_type]); auto op_name = op_type + paddle::lite::to_string(exists_ops[op_type]);
std::string op_color = "white"; std::string op_color = "white";
if (subgraph_indices.count(node)) { if (subgraph_indices.count(node)) {
auto subgraph_idx = subgraph_indices[node]; auto subgraph_idx = subgraph_indices[node];
op_name += "_subgraph_" + std::to_string(subgraph_idx); op_name += "_subgraph_" + paddle::lite::to_string(subgraph_idx);
op_color = subgraph_colors[subgraph_idx % subgraph_colors.size()]; op_color = subgraph_colors[subgraph_idx % subgraph_colors.size()];
} }
dot.AddNode(op_name, dot.AddNode(op_name,
...@@ -223,6 +223,7 @@ std::unordered_set<Node *> SubgraphDetector::GetExcludedNodesFromConfigFile() { ...@@ -223,6 +223,7 @@ std::unordered_set<Node *> SubgraphDetector::GetExcludedNodesFromConfigFile() {
std::vector<std::string> lines = ReadLines(config_file_path); std::vector<std::string> lines = ReadLines(config_file_path);
for (std::string line : lines) { for (std::string line : lines) {
if (line.empty()) continue;
std::vector<std::string> node_info = Split(line, ":"); std::vector<std::string> node_info = Split(line, ":");
std::string op_type = node_info.at(0); std::string op_type = node_info.at(0);
std::vector<std::string> in_vars_name; std::vector<std::string> in_vars_name;
...@@ -413,7 +414,7 @@ void SubgraphFuser::InsertNewNode(SSAGraph *graph, ...@@ -413,7 +414,7 @@ void SubgraphFuser::InsertNewNode(SSAGraph *graph,
cpp::OpDesc subgraph_op_desc; cpp::OpDesc subgraph_op_desc;
subgraph_op_desc.SetType("subgraph"); subgraph_op_desc.SetType("subgraph");
// Create a new sub block desc for storing all of Ops an Vars of the target // Create a new sub block desc for storing all of Ops and Vars of the target
// subgraph and sub_block_idx is set as a attribute of subgraph op, // subgraph and sub_block_idx is set as a attribute of subgraph op,
// sub_block_idx < 0 means it's a new subgraph op // sub_block_idx < 0 means it's a new subgraph op
int sub_block_idx = -(subgraph_idx + 1); int sub_block_idx = -(subgraph_idx + 1);
......
...@@ -39,7 +39,7 @@ std::vector<std::string> AddFCDesc( ...@@ -39,7 +39,7 @@ std::vector<std::string> AddFCDesc(
CHECK_EQ(input_var_names.size(), 1); CHECK_EQ(input_var_names.size(), 1);
CHECK_EQ(wshape.size(), 2); CHECK_EQ(wshape.size(), 2);
static int id = 0; static int id = 0;
std::string prefix = "fc_" + std::to_string(id); std::string prefix = "fc_" + paddle::lite::to_string(id);
auto* op_desc = block_desc->AddOp<cpp::OpDesc>(); auto* op_desc = block_desc->AddOp<cpp::OpDesc>();
auto* wgt = block_desc->AddVar<cpp::VarDesc>(); auto* wgt = block_desc->AddVar<cpp::VarDesc>();
...@@ -76,7 +76,7 @@ std::vector<std::string> AddElementwiseAddDesc( ...@@ -76,7 +76,7 @@ std::vector<std::string> AddElementwiseAddDesc(
const std::vector<std::string>& input_Y_names) { const std::vector<std::string>& input_Y_names) {
// CHECK_EQ(input_var_names.size(), 2); // CHECK_EQ(input_var_names.size(), 2);
static int id = 0; static int id = 0;
std::string prefix = "elementwise_add_" + std::to_string(id); std::string prefix = "elementwise_add_" + paddle::lite::to_string(id);
auto* op_desc = block_desc->AddOp<cpp::OpDesc>(); auto* op_desc = block_desc->AddOp<cpp::OpDesc>();
auto* out = block_desc->AddVar<cpp::VarDesc>(); auto* out = block_desc->AddVar<cpp::VarDesc>();
...@@ -100,7 +100,7 @@ std::vector<std::string> AddFeedDesc( ...@@ -100,7 +100,7 @@ std::vector<std::string> AddFeedDesc(
const std::vector<std::string>& input_X_names) { const std::vector<std::string>& input_X_names) {
// CHECK_EQ(input_var_names.size(), 1); // CHECK_EQ(input_var_names.size(), 1);
static int id = 0; static int id = 0;
std::string prefix = "feed_" + std::to_string(id); std::string prefix = "feed_" + paddle::lite::to_string(id);
auto* op_desc = block_desc->AddOp<cpp::OpDesc>(); auto* op_desc = block_desc->AddOp<cpp::OpDesc>();
auto* out = block_desc->AddVar<cpp::VarDesc>(); auto* out = block_desc->AddVar<cpp::VarDesc>();
...@@ -123,7 +123,7 @@ std::vector<std::string> AddFetchDesc( ...@@ -123,7 +123,7 @@ std::vector<std::string> AddFetchDesc(
const std::vector<std::string>& input_X_names) { const std::vector<std::string>& input_X_names) {
// CHECK_EQ(input_var_names.size(), 1); // CHECK_EQ(input_var_names.size(), 1);
static int id = 0; static int id = 0;
std::string prefix = "fetch_" + std::to_string(id); std::string prefix = "fetch_" + paddle::lite::to_string(id);
auto* op_desc = block_desc->AddOp<cpp::OpDesc>(); auto* op_desc = block_desc->AddOp<cpp::OpDesc>();
auto* out = block_desc->AddVar<cpp::VarDesc>(); auto* out = block_desc->AddVar<cpp::VarDesc>();
......
...@@ -17,6 +17,7 @@ ...@@ -17,6 +17,7 @@
#include "lite/api/paddle_api.h" #include "lite/api/paddle_api.h"
#include "lite/api/test_helper.h" #include "lite/api/test_helper.h"
#include "lite/utils/cp_logging.h" #include "lite/utils/cp_logging.h"
#include "lite/utils/string.h"
DEFINE_string(model_file, "", "model file path of combined protobuf model"); DEFINE_string(model_file, "", "model file path of combined protobuf model");
DEFINE_string(params_file, "", "params file path of combined protobuf model"); DEFINE_string(params_file, "", "params file path of combined protobuf model");
...@@ -31,43 +32,17 @@ namespace lite { ...@@ -31,43 +32,17 @@ namespace lite {
// The helper functions for loading and running model from command line and // The helper functions for loading and running model from command line and
// verifying output data // verifying output data
std::vector<std::string> TypeParsing(std::string text) { std::vector<std::string> TypeParsing(std::string text) {
std::vector<std::string> types; return Split(text, ":");
while (!text.empty()) {
size_t index = text.find_first_of(":");
std::string type = text.substr(0, index);
VLOG(3) << type;
types.push_back(type);
if (index == std::string::npos) {
break;
} else {
text = text.substr(index + 1);
}
}
return types;
} }
std::vector<std::vector<int64_t>> ShapeParsing(std::string text) { std::vector<std::vector<int64_t>> ShapeParsing(std::string text) {
std::vector<std::vector<int64_t>> shapes; std::vector<std::vector<int64_t>> shapes;
while (!text.empty()) { std::vector<std::string> shape_strings = Split(text, ":");
size_t index = text.find_first_of(":"); shapes.resize(shape_strings.size());
std::string slice = text.substr(0, index); for (int i = 0; i < shape_strings.size(); i++) {
std::vector<int64_t> shape; std::vector<std::string> shape_nums = Split(shape_strings[i], ",");
while (!slice.empty()) { for (auto shape_num : shape_nums) {
size_t index = slice.find_first_of(","); shapes[i].push_back(atoi(shape_num.c_str()));
int d = atoi(slice.substr(0, index).c_str());
VLOG(3) << d;
shape.push_back(d);
if (index == std::string::npos) {
break;
} else {
slice = slice.substr(index + 1);
}
}
shapes.push_back(shape);
if (index == std::string::npos) {
break;
} else {
text = text.substr(index + 1);
} }
} }
return shapes; return shapes;
......
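A standalone sketch of what the rewritten ShapeParsing produces for a typical shape string; a local split helper is used here so the example builds without the Lite utils header:

#include <cstdint>
#include <cstdlib>
#include <iostream>
#include <sstream>
#include <string>
#include <vector>

static std::vector<std::string> SplitBy(const std::string& s, char delim) {
  std::vector<std::string> parts;
  std::stringstream ss(s);
  std::string item;
  while (std::getline(ss, item, delim)) parts.push_back(item);
  return parts;
}

int main() {
  std::string text = "1,3,224,224:1,10";  // two input shapes separated by ':'
  std::vector<std::vector<int64_t>> shapes;
  for (const auto& shape_str : SplitBy(text, ':')) {
    std::vector<int64_t> shape;
    for (const auto& num : SplitBy(shape_str, ','))
      shape.push_back(atoi(num.c_str()));
    shapes.push_back(shape);
  }
  std::cout << shapes.size() << " shapes, first has " << shapes[0].size()
            << " dims" << std::endl;  // 2 shapes, first has 4 dims
  return 0;
}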
...@@ -41,8 +41,9 @@ void TypeLayoutTransformPass::Apply(const std::unique_ptr<SSAGraph>& graph) { ...@@ -41,8 +41,9 @@ void TypeLayoutTransformPass::Apply(const std::unique_ptr<SSAGraph>& graph) {
VLOG(4) << "!node->IsStmt():" << !node->IsStmt(); VLOG(4) << "!node->IsStmt():" << !node->IsStmt();
if (!node->IsStmt() || node->AsStmt().op_type() == "while") continue; if (!node->IsStmt() || node->AsStmt().op_type() == "while") continue;
auto inlinks = node->inlinks; auto inlinks = node->inlinks;
VLOG(4) << "node->AsStmt().desc:" << node->AsStmt().desc VLOG(4) << "============== node->AsStmt().op_type():"
<< " inlinks.size():" << inlinks.size(); << node->AsStmt().op_type() << " inlinks.size():" << inlinks.size()
<< " ================";
for (auto* in : inlinks) { for (auto* in : inlinks) {
ComplementInputs(graph.get(), node, in); ComplementInputs(graph.get(), node, in);
} }
...@@ -68,13 +69,25 @@ void TypeLayoutTransformPass::ComplementInputs(SSAGraph* graph, ...@@ -68,13 +69,25 @@ void TypeLayoutTransformPass::ComplementInputs(SSAGraph* graph,
CHECK(inst.op_info()->GetInputArgname(in_arg_name, &inst_in_tensor_name)); CHECK(inst.op_info()->GetInputArgname(in_arg_name, &inst_in_tensor_name));
auto decl_arg_type = auto decl_arg_type =
inst.picked_kernel().GetInputDeclType(inst_in_tensor_name); inst.picked_kernel().GetInputDeclType(inst_in_tensor_name);
CHECK(in->AsArg().type); CHECK(in->AsArg().type);
VLOG(5) << "\n inst_in_tensor_name:" << inst_in_tensor_name VLOG(3) << "\n inst_in_tensor_name:" << inst_in_tensor_name
<< "\n in->AsArg().name:" << in->AsArg().name << "\n in->AsArg().name:" << in->AsArg().name
<< "\n *in->AsArg().type:" << *in->AsArg().type << "\n *in->AsArg().type:" << *in->AsArg().type
<< "\n *decl_arg_type:" << *decl_arg_type << "\n *decl_arg_type:" << *decl_arg_type
<< "\n inst.op()->DebugString():" << inst.op()->DebugString(); << "\n inst.op()->DebugString():" << inst.op()->DebugString();
// TODO(ysh329): conflicts when a tensor has the kARM target but the
// kImageDefault (OpenCL) layout.
// This is not an ideal check, but the source of the issue has not been
// traced from static_kernel_pick_pass to this pass yet.
auto* in_arg_type = const_cast<Type*>(in->AsArg().type);
if (in_arg_type->target() == TARGET(kARM) &&
in_arg_type->layout() == DATALAYOUT(kImageDefault)) {
return;
}
if (!DataLayoutCompatible(*in->AsArg().type, *decl_arg_type)) { if (!DataLayoutCompatible(*in->AsArg().type, *decl_arg_type)) {
VLOG(4) << "found Layout unmatched tensor: " << in->AsArg().name VLOG(4) << "found Layout unmatched tensor: " << in->AsArg().name
<< " for kernel " << inst.op()->DebugString() << " " << " for kernel " << inst.op()->DebugString() << " "
......
...@@ -201,7 +201,8 @@ void PrecisionCastPass::AddCastInst(const Type& from, ...@@ -201,7 +201,8 @@ void PrecisionCastPass::AddCastInst(const Type& from,
CHECK(in->IsArg()); CHECK(in->IsArg());
// auto node_id = [&] { return graph->nodes().size(); }; // auto node_id = [&] { return graph->nodes().size(); };
auto cast_op_output_name = in->AsArg().name + "/precision_trans"; auto cast_op_output_name = in->AsArg().name + "/precision_trans";
// in->AsArg().name + "/precision_trans/" + std::to_string(node_id()); // in->AsArg().name + "/precision_trans/" +
// paddle::lite::to_string(node_id());
auto* cast_op_output_arg = graph->NewArgumentNode(cast_op_output_name); auto* cast_op_output_arg = graph->NewArgumentNode(cast_op_output_name);
cast_op_output_arg->AsArg().type = cast_op_output_arg->AsArg().type =
LiteType::GetTensorTy(from.target(), to.precision(), from.layout()); LiteType::GetTensorTy(from.target(), to.precision(), from.layout());
......
...@@ -65,6 +65,7 @@ class OpLite : public Registry { ...@@ -65,6 +65,7 @@ class OpLite : public Registry {
virtual bool CheckShape() const { return true; } virtual bool CheckShape() const { return true; }
// Inference the outputs' shape. // Inference the outputs' shape.
virtual bool InferShape() const { return true; } virtual bool InferShape() const { return true; }
virtual bool SmartInferShape() { return this->InferShape(); }
// Run this operator. // Run this operator.
virtual bool Run(); virtual bool Run();
// Indicate whether the Op runs only once or not // Indicate whether the Op runs only once or not
...@@ -150,6 +151,10 @@ class OpLite : public Registry { ...@@ -150,6 +151,10 @@ class OpLite : public Registry {
std::vector<Place> valid_places_; std::vector<Place> valid_places_;
Place kernel_place_{TARGET(kHost), PRECISION(kFloat)}; Place kernel_place_{TARGET(kHost), PRECISION(kFloat)};
std::unique_ptr<OpInfo> op_info_; std::unique_ptr<OpInfo> op_info_;
std::vector<DDimLite> last_input_shapes;
std::vector<DDimLite> last_output_shapes;
std::vector<std::vector<std::vector<uint64_t>>> last_output_lods;
std::vector<std::vector<std::vector<uint64_t>>> last_input_lods;
}; };
/* /*
......
...@@ -22,18 +22,25 @@ ...@@ -22,18 +22,25 @@
#include <vector> #include <vector>
#include "lite/core/program.h" #include "lite/core/program.h"
#ifdef LITE_WITH_OPENCL
#include "lite/backends/opencl/cl_image_converter.h"
#include "lite/backends/opencl/cl_include.h"
#include "lite/kernels/opencl/image_helper.h"
#endif
namespace paddle { namespace paddle {
namespace lite { namespace lite {
namespace profile { namespace profile {
template <typename dtype> template <typename dtype>
static void write_tensorfile(const Tensor* tensor, const std::string& locate) { static bool write_tensorfile(const Tensor* tensor, const std::string& locate) {
if (locate.find('/') != std::string::npos) { if (locate.find('/') != std::string::npos) {
return; return false;
} }
FILE* fp = fopen(locate.c_str(), "w"); FILE* fp = fopen(locate.c_str(), "w");
if (fp == nullptr) { if (fp == nullptr) {
LOG(ERROR) << "file open field " << locate; LOG(ERROR) << "file open field " << locate;
return false;
} else { } else {
const dtype* data = tensor->data<dtype>(); const dtype* data = tensor->data<dtype>();
for (int i = 0; i < tensor->numel(); ++i) { for (int i = 0; i < tensor->numel(); ++i) {
...@@ -41,63 +48,227 @@ static void write_tensorfile(const Tensor* tensor, const std::string& locate) { ...@@ -41,63 +48,227 @@ static void write_tensorfile(const Tensor* tensor, const std::string& locate) {
} }
} }
fclose(fp); fclose(fp);
return true;
} }
class PrecisionProfiler { class PrecisionProfiler {
public: public:
explicit PrecisionProfiler(const Instruction* inst) : inst_(inst) {} // TODO(ysh329): need to remove `explicit PrecisionProfiler`
~PrecisionProfiler() { // keep this method only for arm/math/conditional
LOG(INFO) << ">> Running kernel: " << inst_->op()->op_info()->Repr() explicit PrecisionProfiler(const Instruction* inst) {
<< " on Target " << TargetToStr(inst_->kernel()->target()) << " " std::string inst_precison_str = GetInstPrecision(inst);
<< PrecisionToStr(inst_->kernel()->precision()); }
auto tensor_mean = [](const Tensor* in,
PrecisionType ptype, PrecisionProfiler() {}
std::string name = "inst") -> double {
if (!in->data<int8_t>()) { std::string GetSummaryHeader() {
return -99999; using std::setw;
using std::left;
using std::fixed;
STL::stringstream ss;
ss << "========================================= "
<< "Detailed Precision Profiler Summary "
<< "=========================================" << std::endl;
ss << setw(45) << left << "operator:(kernel_info)"
<< " " << setw(70) << left << "output_tensor_name:(tensor_info)"
<< " " << setw(15) << left << "dims"
<< " " << setw(15) << left << "mean"
<< " " << setw(15) << left << "std_deviation"
<< " " << setw(15) << left << "ave_grow_rate*" << std::endl;
return ss.str();
} }
template <typename T>
double compute_mean(const T* in, const size_t length) {
double sum = 0.; double sum = 0.;
switch (ptype) { for (size_t i = 0; i < length; ++i) {
sum += in[i];
}
return sum / length;
}
template <typename T>
double compute_standard_deviation(const T* in,
const size_t length,
bool has_mean = false,
double mean = 10000) {
if (!has_mean) {
mean = compute_mean<T>(in, length);
}
double variance = 0.;
for (size_t i = 0; i < length; ++i) {
variance += pow((in[i] - mean), 2);
}
variance /= length;
return sqrt(variance);
}
template <typename T>
double compute_average_grow_rate(const T* in, const size_t length) {
const double eps = 1e-5;
double ave_grow_rate = 0.0f;
for (size_t i = 1; i < length; ++i) {
ave_grow_rate += (in[i] - in[i - 1]) / (in[i - 1] + eps);
}
ave_grow_rate /= length;
return ave_grow_rate;
}
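The "ave_grow_rate" column reported in the summary is the mean relative step-to-step change over the flattened tensor, and it divides by the full length rather than length - 1. A quick standalone check with illustrative values:

#include <iostream>
#include <vector>

int main() {
  std::vector<float> in{1.f, 2.f, 4.f};
  const double eps = 1e-5;
  double rate = 0.0;
  for (size_t i = 1; i < in.size(); ++i)
    rate += (in[i] - in[i - 1]) / (in[i - 1] + eps);
  rate /= in.size();  // divides by 3, matching the helper above
  std::cout << rate << std::endl;  // ~0.6667
  return 0;
}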
// check if output tensor unused
bool is_unused(const Tensor* in) {
if (!in->data<int8_t>()) {
return true;
}
return false;
}
void compute_tensor_precision_info(const Tensor* in,
TargetType target_type,
PrecisionType precision_type,
DataLayoutType layout_type,
double* mean,
double* std_dev,
double* ave_grow_rate,
std::string name = "inst",
bool write_result_to_file = false) {
std::string unsupported_error_log =
"Unsupported precision profile for kernel registered on" +
TargetToStr(target_type) + "/" + PrecisionToStr(precision_type) + "/" +
DataLayoutToStr(layout_type);
if (target_type == TARGET(kARM) || target_type == TARGET(kHost) ||
target_type == TARGET(kX86)) {
switch (precision_type) {
case PRECISION(kFloat): { case PRECISION(kFloat): {
auto ptr = in->data<float>(); auto ptr = in->data<float>();
// write_tensorfile<float>(in, name); *mean = compute_mean<float>(ptr, in->numel());
for (int i = 0; i < in->numel(); ++i) { *std_dev =
sum += ptr[i]; compute_standard_deviation<float>(ptr, in->numel(), true, *mean);
} *ave_grow_rate = compute_average_grow_rate<float>(ptr, in->numel());
return sum / in->numel(); write_result_to_file&& write_tensorfile<float>(in, name);
return;
} }
case PRECISION(kAny): { case PRECISION(kAny): {
auto ptr = in->data<float>(); auto ptr = in->data<float>();
// write_tensorfile<float>(in, name); *mean = compute_mean<float>(ptr, in->numel());
for (int i = 0; i < in->numel(); ++i) { *std_dev =
sum += ptr[i]; compute_standard_deviation<float>(ptr, in->numel(), true, *mean);
} *ave_grow_rate = compute_average_grow_rate<float>(ptr, in->numel());
return sum / in->numel(); write_result_to_file&& write_tensorfile<float>(in, name);
return;
} }
case PRECISION(kInt8): { case PRECISION(kInt8): {
auto ptr = in->data<int8_t>(); auto ptr = in->data<int8_t>();
// write_tensorfile<int8_t>(in, name); *mean = compute_mean<int8_t>(ptr, in->numel());
for (int i = 0; i < in->numel(); ++i) { *std_dev =
sum += ptr[i]; compute_standard_deviation<int8_t>(ptr, in->numel(), true, *mean);
} *ave_grow_rate = compute_average_grow_rate<int8_t>(ptr, in->numel());
return sum / in->numel(); write_result_to_file&& write_tensorfile<int8_t>(in, name);
return;
} }
case PRECISION(kInt32): { case PRECISION(kInt32): {
auto ptr = in->data<int32_t>(); auto ptr = in->data<int32_t>();
// write_tensorfile<int32_t>(in, name); *mean = compute_mean<int32_t>(ptr, in->numel());
for (int i = 0; i < in->numel(); ++i) { *std_dev = compute_standard_deviation<int32_t>(
sum += ptr[i]; ptr, in->numel(), true, *mean);
*ave_grow_rate = compute_average_grow_rate<int32_t>(ptr, in->numel());
write_result_to_file&& write_tensorfile<int32_t>(in, name);
return;
} }
return sum / in->numel(); default:
*mean = -333333333333;
*std_dev = -33333333333;
*ave_grow_rate = -33333333333;
LOG(ERROR) << unsupported_error_log;
return;
}
#ifdef LITE_WITH_OPENCL
} else if (target_type == TARGET(kOpenCL)) {
switch (layout_type) {
case DATALAYOUT(kImageDefault): {
paddle::lite::CLImageConverterDefault default_convertor;
auto image_shape = default_convertor.InitImageDimInfoWith(in->dims());
size_t im_w = image_shape[0];
size_t im_h = image_shape[1];
VLOG(1) << "image shape(W,H) of " << name << ": " << im_w << " "
<< im_h;
std::vector<uint16_t> in_data_v(im_w * im_h * 4);
std::vector<float> real_out_v(in->numel());
const size_t cl_image2d_row_pitch{0};
const size_t cl_image2d_slice_pitch{0};
TargetWrapperCL::ImgcpySync(in_data_v.data(),
in->data<uint16_t, cl::Image2D>(),
im_w,
im_h,
cl_image2d_row_pitch,
cl_image2d_slice_pitch,
IoDirection::DtoH);
default_convertor.ImageToNCHW(
in_data_v.data(), real_out_v.data(), image_shape, in->dims());
CHECK(real_out_v.size() == in->numel());
*mean = compute_mean<float>(real_out_v.data(), real_out_v.size());
*std_dev = compute_standard_deviation<float>(
real_out_v.data(), in->numel(), true, *mean);
*ave_grow_rate = compute_average_grow_rate<float>(real_out_v.data(),
real_out_v.size());
write_result_to_file&& write_tensorfile<float>(in, name);
return;
}
case DATALAYOUT(kNCHW): {
std::vector<float> in_data_v(in->numel(), 0);
TargetWrapperCL::MemcpySync(in_data_v.data(),
in->data<float>(),
in->numel() * sizeof(float),
IoDirection::DtoH);
VLOG(1) << name << ":" << in->numel();
*mean = compute_mean<float>(in_data_v.data(), in->numel());
*std_dev = compute_standard_deviation<float>(
in_data_v.data(), in->numel(), true, *mean);
*ave_grow_rate =
compute_average_grow_rate<float>(in_data_v.data(), in->numel());
write_result_to_file&& write_tensorfile<float>(in, name);
return;
} }
default: default:
LOG(INFO) << "unsupport data type: " << PrecisionToStr(ptype); *mean = -222222222222;
return 0.; *std_dev = -22222222222;
*ave_grow_rate = -22222222222;
LOG(ERROR) << unsupported_error_log;
return;
}
#endif
} else {
*mean = -111111111111;
*std_dev = -11111111111;
*ave_grow_rate = -11111111111;
LOG(ERROR) << unsupported_error_log;
return;
}
} }
};
if (inst_->op()->op_info()->Type() != "fetch") { std::string GetInstPrecision(const Instruction* inst = nullptr) {
auto op = const_cast<lite::OpLite*>(inst_->op()); using std::setw;
auto kernel = inst_->kernel(); using std::left;
using std::fixed;
STL::stringstream ss;
bool write_result_to_file = false;
VLOG(1) << ">> Running kernel: " << inst->op()->op_info()->Repr()
<< " registered on " << TargetToStr(inst->kernel()->target()) << "/"
<< PrecisionToStr(inst->kernel()->precision()) << "/"
<< DataLayoutToStr(inst->kernel()->layout());
std::string kernel_repr = inst->op()->op_info()->Repr();
std::string kernel_place = TargetToStr(inst->kernel()->target()) + "/" +
PrecisionToStr(inst->kernel()->precision()) +
"/" + DataLayoutToStr(inst->kernel()->layout());
std::string op_name = inst->op()->op_info()->Type();
if (inst->op()->op_info()->Type() != "fetch") {
auto op = const_cast<lite::OpLite*>(inst->op());
auto kernel = inst->kernel();
auto op_scope = op->scope(); auto op_scope = op->scope();
auto out_names = op->op_info()->output_names(); auto out_names = op->op_info()->output_names();
for (auto& out_name : out_names) { for (auto& out_name : out_names) {
...@@ -106,32 +277,90 @@ class PrecisionProfiler { ...@@ -106,32 +277,90 @@ class PrecisionProfiler {
auto type = kernel->GetOutputDeclType(out_arg_name); auto type = kernel->GetOutputDeclType(out_arg_name);
if (type->IsTensor()) { if (type->IsTensor()) {
auto tout = op_scope->FindVar(out_name)->GetMutable<Tensor>(); const Tensor* tout =
double mean = tensor_mean(tout, type->precision(), out_name); op_scope->FindVar(out_name)->GetMutable<Tensor>();
LOG(INFO) << "output name: " << out_name << ", dims: " << tout->dims() double mean = -999999;
<< ", precision: " << PrecisionToStr(type->precision()) double std_dev = -100000;
<< ", mean value: " << mean << " shape:" << tout->dims(); double ave_grow_rate = 99999;
std::string mean_str{"unused"};
std::string std_dev_str{"unused"};
std::string ave_grow_rate_str{"unused"};
if (!is_unused(tout)) {
compute_tensor_precision_info(tout,
type->target(),
type->precision(),
type->layout(),
&mean,
&std_dev,
&ave_grow_rate,
out_name,
write_result_to_file);
mean_str = std::to_string(mean);
std_dev_str = std::to_string(std_dev);
ave_grow_rate_str = std::to_string(ave_grow_rate);
}
std::string kernel_info = op_name + ":" + kernel_place;
std::string output_arg_info = out_name + ":" +
TargetToStr(type->target()) + "/" +
PrecisionToStr(type->precision()) +
"/" + DataLayoutToStr(type->layout());
ss << setw(45) << left << kernel_info << " " << setw(70) << left
<< output_arg_info << " " << setw(15) << left << tout->dims()
<< " " << setw(15) << left << mean_str << " " << setw(15) << left
<< std_dev_str << " " << setw(15) << left << ave_grow_rate_str
<< std::endl;
} else if (type->IsTensorList()) { } else if (type->IsTensorList()) {
auto tout = auto touts =
op_scope->FindVar(out_name)->GetMutable<std::vector<Tensor>>(); op_scope->FindVar(out_name)->GetMutable<std::vector<Tensor>>();
for (auto& t : *tout) { for (auto t : *touts) {
double mean = tensor_mean(&t, type->precision(), out_name); const Tensor* tout = &t;
LOG(INFO) << "output name: " << out_name << ", dims: " << t.dims() double mean = -999999;
<< ", precision: " << PrecisionToStr(type->precision()) double std_dev = -100000;
<< ", mean value: " << mean; double ave_grow_rate = 99999;
std::string mean_str{"unused"};
std::string std_dev_str{"unused"};
std::string ave_grow_rate_str{"unused"};
if (!is_unused(tout)) {
compute_tensor_precision_info(tout,
type->target(),
type->precision(),
type->layout(),
&mean,
&std_dev,
&ave_grow_rate,
out_name,
write_result_to_file);
mean_str = std::to_string(mean);
std_dev_str = std::to_string(std_dev);
ave_grow_rate_str = std::to_string(ave_grow_rate);
} }
std::string kernel_info = op_name + ":" + kernel_place;
std::string output_arg_info = out_name + ":" +
TargetToStr(type->target()) + "/" +
PrecisionToStr(type->precision()) +
"/" + DataLayoutToStr(type->layout());
ss << setw(45) << left << kernel_info << " " << setw(70) << left
<< output_arg_info << " " << setw(15) << left << tout->dims()
<< " " << setw(15) << left << mean_str << " " << setw(15) << left
<< std_dev_str << " " << setw(15) << left << ave_grow_rate_str
<< std::endl;
} }
} }
} }
} }
return ss.str();
private: }
const Instruction* inst_{nullptr};
}; };
} // namespace profile } // namespace profile
} // namespace lite } // namespace lite
} // namespace paddle } // namespace paddle
// TODO(ysh329): need to remove.
// keep this method only for arm/math/conditional_block_compute
#define LITE_PRECISION_PROFILE(inst) \ #define LITE_PRECISION_PROFILE(inst) \
{ auto a = paddle::lite::profile::PrecisionProfiler(&inst); } { auto a = paddle::lite::profile::PrecisionProfiler(&inst); }
...@@ -136,6 +136,14 @@ void RuntimeProgram::UpdateVarsOfProgram(cpp::ProgramDesc* desc) { ...@@ -136,6 +136,14 @@ void RuntimeProgram::UpdateVarsOfProgram(cpp::ProgramDesc* desc) {
} }
void RuntimeProgram::Run() { void RuntimeProgram::Run() {
#ifdef LITE_WITH_PROFILE
#ifdef LITE_WITH_PRECISION_PROFILE
auto inst_precision_profiler = paddle::lite::profile::PrecisionProfiler();
std::string precision_profiler_summary =
inst_precision_profiler.GetSummaryHeader();
#endif
#endif
for (auto& inst : instructions_) { for (auto& inst : instructions_) {
#ifndef LITE_WITH_FPGA #ifndef LITE_WITH_FPGA
if (inst.is_feed_fetch_op()) continue; if (inst.is_feed_fetch_op()) continue;
...@@ -144,13 +152,17 @@ void RuntimeProgram::Run() { ...@@ -144,13 +152,17 @@ void RuntimeProgram::Run() {
#ifdef LITE_WITH_PROFILE #ifdef LITE_WITH_PROFILE
#ifdef LITE_WITH_PRECISION_PROFILE #ifdef LITE_WITH_PRECISION_PROFILE
#ifndef LITE_WITH_FPGA #ifndef LITE_WITH_FPGA
LITE_PRECISION_PROFILE(inst) precision_profiler_summary +=
inst_precision_profiler.GetInstPrecision(&inst);
#endif #endif
#endif // LITE_WITH_PRECISION_PROFILE #endif // LITE_WITH_PRECISION_PROFILE
#endif // LITE_WITH_PROFILE #endif // LITE_WITH_PROFILE
} }
#ifdef LITE_WITH_PROFILE #ifdef LITE_WITH_PROFILE
LOG(INFO) << "\n" << profiler_.Summary(profile::Type::kDispatch, false, 0); LOG(INFO) << "\n" << profiler_.Summary(profile::Type::kDispatch, false, 0);
#ifdef LITE_WITH_PRECISION_PROFILE
LOG(INFO) << "\n" << precision_profiler_summary;
#endif // LITE_WITH_PRECISION_PROFILE
#endif // LITE_WITH_PROFILE #endif // LITE_WITH_PROFILE
} }
...@@ -274,7 +286,8 @@ void Instruction::Run() { ...@@ -274,7 +286,8 @@ void Instruction::Run() {
return; return;
} }
op_->InferShape(); // op_->InferShape();
op_->SmartInferShape();
kernel_->Launch(); kernel_->Launch();
has_run_ = true; has_run_ = true;
} }
......
...@@ -30,9 +30,9 @@ Program FakeProgram() { ...@@ -30,9 +30,9 @@ Program FakeProgram() {
auto add_fc = [&](int id, std::string x) { auto add_fc = [&](int id, std::string x) {
// create variables // create variables
std::string w1 = "w" + std::to_string(id); std::string w1 = "w" + paddle::lite::to_string(id);
std::string b1 = "b" + std::to_string(id); std::string b1 = "b" + paddle::lite::to_string(id);
std::string out1 = "out" + std::to_string(id); std::string out1 = "out" + paddle::lite::to_string(id);
auto w1v = program.scope()->Var(w1)->GetMutable<lite::Tensor>(); auto w1v = program.scope()->Var(w1)->GetMutable<lite::Tensor>();
auto b1v = program.scope()->Var(b1)->GetMutable<lite::Tensor>(); auto b1v = program.scope()->Var(b1)->GetMutable<lite::Tensor>();
auto out1v = program.scope()->Var(out1)->GetMutable<lite::Tensor>(); auto out1v = program.scope()->Var(out1)->GetMutable<lite::Tensor>();
......
...@@ -53,9 +53,9 @@ static std::string version() { ...@@ -53,9 +53,9 @@ static std::string version() {
static int64_t int_version(const std::string& version) { static int64_t int_version(const std::string& version) {
const std::vector<std::string> vec = Split(version, "."); const std::vector<std::string> vec = Split(version, ".");
if (vec.size() == 3) { if (vec.size() == 3) {
return std::stoi(vec[0]) * MAJOR_COEFF + return atoi(vec[0].c_str()) * MAJOR_COEFF +
std::stoi(vec[1]) * MINOR_COEFF + atoi(vec[1].c_str()) * MINOR_COEFF +
std::stoi(vec[2]) * PATCH_COEFF; atoi(vec[2].c_str()) * PATCH_COEFF;
} }
return -1; return -1;
} }
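// Worked example (coefficient values are illustrative; MAJOR_COEFF etc. are
// defined elsewhere): assuming MAJOR_COEFF = 1000000, MINOR_COEFF = 1000 and
// PATCH_COEFF = 1, int_version("2.6.1") returns 2 * 1000000 + 6 * 1000 + 1 =
// 2006001, while any string that does not split into three fields returns -1.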
......
...@@ -207,7 +207,8 @@ void RunModel(std::string det_model_file, ...@@ -207,7 +207,8 @@ void RunModel(std::string det_model_file,
cv::Mat roi = crop_img(img, rec_clip, classify_w, classify_h); cv::Mat roi = crop_img(img, rec_clip, classify_w, classify_h);
// uncomment two lines below, save roi img to disk // uncomment two lines below, save roi img to disk
// std::string roi_name = "roi_" + std::to_string(i) + ".jpg"; // std::string roi_name = "roi_" + paddle::lite::to_string(i)
// + ".jpg";
// imwrite(roi_name, roi); // imwrite(roi_name, roi);
// Do PreProcess // Do PreProcess
......
...@@ -14,6 +14,7 @@ ...@@ -14,6 +14,7 @@
#include <sys/time.h> #include <sys/time.h>
#include <time.h> #include <time.h>
#include <cmath>
#include <iostream> #include <iostream>
#include <string> #include <string>
#include <vector> #include <vector>
...@@ -36,6 +37,32 @@ std::string ShapePrint(const shape_t& shape) { ...@@ -36,6 +37,32 @@ std::string ShapePrint(const shape_t& shape) {
return shape_str; return shape_str;
} }
template <typename T>
double compute_mean(const T* in, const size_t length) {
double sum = 0.;
for (size_t i = 0; i < length; ++i) {
sum += in[i];
}
return sum / length;
}
template <typename T>
double compute_standard_deviation(const T* in,
const size_t length,
bool has_mean = false,
double mean = 10000) {
if (!has_mean) {
mean = compute_mean<T>(in, length);
}
double variance = 0.;
for (size_t i = 0; i < length; ++i) {
variance += pow((in[i] - mean), 2);
}
variance /= length;
return sqrt(variance);
}
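// Illustrative check (hypothetical data, not from this demo): for
// out_data = {0.1f, 0.3f, 0.2f}, compute_mean returns 0.2 and
// compute_standard_deviation(out_data, 3, true, 0.2) returns
// sqrt((0.01 + 0.01 + 0.0) / 3) ~= 0.0816; these are the statistics printed
// in the output summary below.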
inline double GetCurrentUS() { inline double GetCurrentUS() {
struct timeval time; struct timeval time;
gettimeofday(&time, NULL); gettimeofday(&time, NULL);
...@@ -101,24 +128,24 @@ void RunModel(std::string model_dir, ...@@ -101,24 +128,24 @@ void RunModel(std::string model_dir,
// 5. Get output // 5. Get output
std::cout << "\n====== output summary ====== " << std::endl; std::cout << "\n====== output summary ====== " << std::endl;
size_t output_tensor_num = predictor->GetOutputNames().size(); size_t output_tensor_num = predictor->GetOutputNames().size();
std::cout << "output tesnor num:" << output_tensor_num << std::endl; std::cout << "output tensor num:" << output_tensor_num << std::endl;
for (size_t tidx = 0; tidx < output_tensor_num; ++tidx) { for (size_t tidx = 0; tidx < output_tensor_num; ++tidx) {
std::unique_ptr<const paddle::lite_api::Tensor> output_tensor = std::unique_ptr<const paddle::lite_api::Tensor> output_tensor =
predictor->GetOutput(tidx); predictor->GetOutput(tidx);
std::cout << "\n--- output tensor " << tidx << " ---" << std::endl; std::cout << "\n--- output tensor " << tidx << " ---" << std::endl;
auto out_shape = output_tensor->shape(); auto out_shape = output_tensor->shape();
std::cout << "out_shape(NCHW):" << ShapePrint(out_shape) << std::endl; auto out_data = output_tensor->data<float>();
auto out_mean = compute_mean<float>(out_data, ShapeProduction(out_shape));
auto out_std_dev = compute_standard_deviation<float>(
out_data, ShapeProduction(out_shape), true, out_mean);
float sum = 0.f; std::cout << "output shape(NCHW):" << ShapePrint(out_shape) << std::endl;
for (int i = 0; i < ShapeProduction(out_shape); ++i) {
sum += output_tensor->data<float>()[i];
}
std::cout << "output tensor " << tidx std::cout << "output tensor " << tidx
<< " elem num:" << ShapeProduction(out_shape) << std::endl; << " elem num:" << ShapeProduction(out_shape) << std::endl;
std::cout << "output tensor " << tidx << " sum value:" << sum << std::endl;
std::cout << "output tensor " << tidx std::cout << "output tensor " << tidx
<< " mean value:" << sum / ShapeProduction(out_shape) << " standard deviation:" << out_std_dev << std::endl;
std::cout << "output tensor " << tidx << " mean value:" << out_mean
<< std::endl; << std::endl;
// print output // print output
......
...@@ -111,11 +111,11 @@ void Module::AddOpDescHelper(const std::string &op_id, ...@@ -111,11 +111,11 @@ void Module::AddOpDescHelper(const std::string &op_id,
switch (type) { switch (type) {
case AttrType::INT: case AttrType::INT:
return std::to_string(desc.GetAttr<int>(name)); return paddle::lite::to_string(desc.GetAttr<int>(name));
case AttrType::FLOAT: case AttrType::FLOAT:
return std::to_string(desc.GetAttr<float>(name)); return paddle::lite::to_string(desc.GetAttr<float>(name));
case AttrType::BOOLEAN: case AttrType::BOOLEAN:
return std::to_string(desc.GetAttr<bool>(name)); return paddle::lite::to_string(desc.GetAttr<bool>(name));
case AttrType::STRING: case AttrType::STRING:
return "\"" + desc.GetAttr<std::string>(name) + "\""; return "\"" + desc.GetAttr<std::string>(name) + "\"";
case AttrType::FLOATS: { case AttrType::FLOATS: {
......
...@@ -153,16 +153,16 @@ class Module { ...@@ -153,16 +153,16 @@ class Module {
private: private:
std::string WeightUniqueName() const { std::string WeightUniqueName() const {
return "w_" + std::to_string(weight_counter_++); return "w_" + paddle::lite::to_string(weight_counter_++);
} }
std::string TmpVarUniqueName() const { std::string TmpVarUniqueName() const {
return "tmp_" + std::to_string(tmp_var_counter_++); return "tmp_" + paddle::lite::to_string(tmp_var_counter_++);
} }
std::string OpUniqueName() const { std::string OpUniqueName() const {
return "op_" + std::to_string(op_counter_++); return "op_" + paddle::lite::to_string(op_counter_++);
} }
std::string KernelUniqueName() const { std::string KernelUniqueName() const {
return "kernel_" + std::to_string(kernel_counter_++); return "kernel_" + paddle::lite::to_string(kernel_counter_++);
} }
std::string DataRepr(const std::string &raw_data, PrecisionType dtype); std::string DataRepr(const std::string &raw_data, PrecisionType dtype);
......
# NOTE we leave the add_kernel not protected by LITE_WITH_LIGHT_WEIGHT_FRAMEWORK so that all the kernels will be registered # NOTE we leave the add_kernel not protected by LITE_WITH_LIGHT_WEIGHT_FRAMEWORK so that all the kernels will be registered
# to the model_optimize_tool. # to the model_optimize_tool.
if((NOT LITE_ON_MODEL_OPTIMIZE_TOOL) AND (NOT (LITE_WITH_LIGHT_WEIGHT_FRAMEWORK AND LITE_WITH_ARM))) if((NOT LITE_ON_MODEL_OPTIMIZE_TOOL) AND (NOT LITE_WITH_PYTHON) AND (NOT (LITE_WITH_LIGHT_WEIGHT_FRAMEWORK AND LITE_WITH_ARM)))
return() return()
endif() endif()
...@@ -109,6 +109,8 @@ add_kernel(mean_compute_arm ARM extra SRCS mean_compute.cc DEPS ${lite_kernel_de ...@@ -109,6 +109,8 @@ add_kernel(mean_compute_arm ARM extra SRCS mean_compute.cc DEPS ${lite_kernel_de
if(LITE_WITH_TRAIN) if(LITE_WITH_TRAIN)
add_kernel(mean_grad_compute_arm ARM extra SRCS mean_grad_compute.cc DEPS ${lite_kernel_deps} math_arm) add_kernel(mean_grad_compute_arm ARM extra SRCS mean_grad_compute.cc DEPS ${lite_kernel_deps} math_arm)
add_kernel(activation_grad_compute_arm ARM basic SRCS activation_grad_compute.cc DEPS ${lite_kernel_deps} math_arm) add_kernel(activation_grad_compute_arm ARM basic SRCS activation_grad_compute.cc DEPS ${lite_kernel_deps} math_arm)
add_kernel(elementwise_grad_compute_arm ARM basic SRCS elementwise_grad_compute.cc DEPS ${lite_kernel_deps} math_arm)
add_kernel(mul_grad_compute_arm ARM extra SRCS mul_grad_compute.cc DEPS ${lite_kernel_deps} math_arm)
add_kernel(sgd_compute_arm ARM extra SRCS sgd_compute.cc DEPS ${lite_kernel_deps} math_arm) add_kernel(sgd_compute_arm ARM extra SRCS sgd_compute.cc DEPS ${lite_kernel_deps} math_arm)
endif() endif()
......
// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "lite/kernels/arm/elementwise_grad_compute.h"
#include <string>
#include <vector>
#include "lite/backends/arm/math/funcs.h"
namespace paddle {
namespace lite {
namespace kernels {
namespace arm {
inline DDim trim_trailing_singular_dims(const DDim& dims) {
// Remove trailing dimensions of size 1 for y
auto actual_dims_size = dims.size();
for (; actual_dims_size != 0; --actual_dims_size) {
if (dims[actual_dims_size - 1] != 1) break;
}
std::vector<int64_t> trim_dims;
trim_dims.resize(actual_dims_size);
for (int i = 0; i < actual_dims_size; ++i) {
trim_dims[i] = dims[i];
}
if (trim_dims.size() == 0) {
return DDim();
}
return DDim(trim_dims);
}
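// For example (hypothetical shapes): trim_trailing_singular_dims of {3, 4, 1, 1}
// yields {3, 4}, and an all-ones shape such as {1, 1} yields an empty DDim, in
// which case is_broadcast below resets axis to x_dims.size().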
inline bool is_broadcast(const DDim& x_dims,
const DDim& y_dims,
int axis,
int* pre,
int* n,
int* post) {
if (axis < 0) {
axis = x_dims.size() - y_dims.size();
}
DDim y_dim_trim = trim_trailing_singular_dims(y_dims);
axis = (y_dim_trim.size() == 0) ? x_dims.size() : axis;
if (x_dims.size() == y_dim_trim.size()) {
return false;
}
*pre = 1;
*n = 1;
*post = 1;
for (int i = 0; i < axis; ++i) {
(*pre) *= x_dims[i];
}
for (int i = 0; i < y_dim_trim.size(); ++i) {
CHECK_EQ(x_dims[i + axis], y_dim_trim[i])
<< "Broadcast dimension mismatch.";
(*n) *= y_dim_trim[i];
}
for (int i = axis + y_dim_trim.size(); i < x_dims.size(); ++i) {
(*post) *= x_dims[i];
}
return true;
}
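// Example decomposition (hypothetical shapes): x_dims = {2, 3, 4, 5},
// y_dims = {3, 4}, axis = 1 gives *pre = 2, *n = 3 * 4 = 12, *post = 5, i.e. y is
// broadcast over the leading "pre" and trailing "post" extents of x. When the
// trimmed y has the same rank as x the function returns false and the plain
// element-wise (non-broadcast) gradient path is taken.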
void ElementwiseAddGradCompute::Run() {
auto& param = Param<operators::ElementwiseGradParam>();
const float* x_data = param.X->data<float>();
const float* y_data = param.Y->data<float>();
const float* out_grad_data = param.OutGrad->data<float>();
float* x_grad_data;
float* y_grad_data;
if (param.XGrad) {
x_grad_data = param.XGrad->mutable_data<float>();
}
if (param.YGrad) {
y_grad_data = param.YGrad->mutable_data<float>();
}
int axis = param.axis;
auto x_dims = param.X->dims();
auto y_dims = param.Y->dims();
int pre, n, post;
if (!param.XGrad) {
CHECK(param.YGrad);
lite::arm::math::elementwise_add_grad(
out_grad_data, y_grad_data, y_dims.production());
return;
}
if (!param.YGrad) {
CHECK(param.XGrad);
lite::arm::math::elementwise_add_grad(
out_grad_data, x_grad_data, x_dims.production());
return;
}
if (x_dims.size() < y_dims.size() &&
is_broadcast(y_dims, x_dims, axis, &pre, &n, &post)) {
lite::arm::math::elementwise_add_grad_broadcast(
out_grad_data, y_grad_data, x_grad_data, pre, n, post);
} else if (is_broadcast(x_dims, y_dims, axis, &pre, &n, &post)) {
lite::arm::math::elementwise_add_grad_broadcast(
out_grad_data, x_grad_data, y_grad_data, pre, n, post);
} else {
lite::arm::math::elementwise_add_grad(
out_grad_data, x_grad_data, x_dims.production());
lite::arm::math::elementwise_add_grad(
out_grad_data, y_grad_data, y_dims.production());
}
}
void ElementwiseSubGradCompute::Run() {
auto& param = Param<operators::ElementwiseGradParam>();
const float* x_data = param.X->data<float>();
const float* y_data = param.Y->data<float>();
const float* out_data = param.OutGrad->data<float>();
float* x_grad_data;
float* y_grad_data;
if (param.XGrad) {
x_grad_data = param.XGrad->mutable_data<float>();
}
if (param.YGrad) {
y_grad_data = param.YGrad->mutable_data<float>();
}
int axis = param.axis;
auto x_dims = param.X->dims();
auto y_dims = param.Y->dims();
int pre, n, post;
if (!param.XGrad || !param.YGrad) {
CHECK(param.XGrad || param.YGrad);
lite::arm::math::elementwise_sub_grad(
out_data, x_grad_data, y_grad_data, y_dims.production());
return;
}
if (x_dims.size() < y_dims.size()) {
LOG(FATAL) << "elewise sub grad don't support x_dims size < y_dims size";
}
if (is_broadcast(x_dims, y_dims, axis, &pre, &n, &post)) {
lite::arm::math::elementwise_sub_grad_broadcast(
out_data, x_grad_data, y_grad_data, pre, n, post);
} else {
lite::arm::math::elementwise_sub_grad(
out_data, x_grad_data, y_grad_data, x_dims.production());
}
}
template <typename T, PrecisionType PType>
void ElementwiseMulGradCompute<T, PType>::Run() {
LOG(FATAL) << "elementwise mul_grad not implement yet";
}
void ElementwiseMaxGradCompute::Run() {
LOG(FATAL) << "elementwise max_grad not implement yet";
}
void ElementwiseDivGradCompute::Run() {
LOG(FATAL) << "elementwise div_grad not implement yet";
}
} // namespace arm
} // namespace kernels
} // namespace lite
} // namespace paddle
using elementwise_mul_grad_float =
paddle::lite::kernels::arm::ElementwiseMulGradCompute<float,
PRECISION(kFloat)>;
REGISTER_LITE_KERNEL(elementwise_add_grad,
kARM,
kFloat,
kNCHW,
paddle::lite::kernels::arm::ElementwiseAddGradCompute,
def)
.BindInput("X", {LiteType::GetTensorTy(TARGET(kARM))})
.BindInput("Y", {LiteType::GetTensorTy(TARGET(kARM))})
.BindInput("Out@GRAD", {LiteType::GetTensorTy(TARGET(kARM))})
.BindOutput("X@GRAD", {LiteType::GetTensorTy(TARGET(kARM))})
.BindOutput("Y@GRAD", {LiteType::GetTensorTy(TARGET(kARM))})
.Finalize();
REGISTER_LITE_KERNEL(elementwise_sub_grad,
kARM,
kFloat,
kNCHW,
paddle::lite::kernels::arm::ElementwiseSubGradCompute,
def)
.BindInput("X", {LiteType::GetTensorTy(TARGET(kARM))})
.BindInput("Y", {LiteType::GetTensorTy(TARGET(kARM))})
.BindInput("Out@GRAD", {LiteType::GetTensorTy(TARGET(kARM))})
.BindOutput("X@GRAD", {LiteType::GetTensorTy(TARGET(kARM))})
.BindOutput("Y@GRAD", {LiteType::GetTensorTy(TARGET(kARM))})
.Finalize();
REGISTER_LITE_KERNEL(elementwise_div_grad,
kARM,
kFloat,
kNCHW,
paddle::lite::kernels::arm::ElementwiseDivGradCompute,
def)
.BindInput("X", {LiteType::GetTensorTy(TARGET(kARM))})
.BindInput("Y", {LiteType::GetTensorTy(TARGET(kARM))})
.BindInput("Out@GRAD", {LiteType::GetTensorTy(TARGET(kARM))})
.BindOutput("X@GRAD", {LiteType::GetTensorTy(TARGET(kARM))})
.BindOutput("Y@GRAD", {LiteType::GetTensorTy(TARGET(kARM))})
.Finalize();
REGISTER_LITE_KERNEL(
elementwise_mul_grad, kARM, kFloat, kNCHW, elementwise_mul_grad_float, def)
.BindInput("X", {LiteType::GetTensorTy(TARGET(kARM))})
.BindInput("Y", {LiteType::GetTensorTy(TARGET(kARM))})
.BindInput("Out@GRAD", {LiteType::GetTensorTy(TARGET(kARM))})
.BindOutput("X@GRAD", {LiteType::GetTensorTy(TARGET(kARM))})
.BindOutput("Y@GRAD", {LiteType::GetTensorTy(TARGET(kARM))})
.Finalize();
REGISTER_LITE_KERNEL(elementwise_max_grad,
kARM,
kFloat,
kNCHW,
paddle::lite::kernels::arm::ElementwiseMaxGradCompute,
def)
.BindInput("X", {LiteType::GetTensorTy(TARGET(kARM))})
.BindInput("Y", {LiteType::GetTensorTy(TARGET(kARM))})
.BindInput("Out@GRAD", {LiteType::GetTensorTy(TARGET(kARM))})
.BindOutput("X@GRAD", {LiteType::GetTensorTy(TARGET(kARM))})
.BindOutput("Y@GRAD", {LiteType::GetTensorTy(TARGET(kARM))})
.Finalize();
// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <algorithm>
#include "lite/core/kernel.h"
#include "lite/core/op_registry.h"
namespace paddle {
namespace lite {
namespace kernels {
namespace arm {
class ElementwiseAddGradCompute
: public KernelLite<TARGET(kARM), PRECISION(kFloat)> {
public:
void Run() override;
virtual ~ElementwiseAddGradCompute() = default;
};
class ElementwiseSubGradCompute
: public KernelLite<TARGET(kARM), PRECISION(kFloat)> {
public:
void Run() override;
virtual ~ElementwiseSubGradCompute() = default;
};
template <typename T, PrecisionType PType>
class ElementwiseMulGradCompute : public KernelLite<TARGET(kARM), PType> {
public:
void Run() override;
virtual ~ElementwiseMulGradCompute() = default;
};
class ElementwiseMaxGradCompute
: public KernelLite<TARGET(kARM), PRECISION(kFloat)> {
public:
void Run() override;
virtual ~ElementwiseMaxGradCompute() = default;
};
class ElementwiseDivGradCompute
: public KernelLite<TARGET(kARM), PRECISION(kFloat)> {
public:
void Run() override;
virtual ~ElementwiseDivGradCompute() = default;
};
} // namespace arm
} // namespace kernels
} // namespace lite
} // namespace paddle
// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "lite/kernels/arm/mul_grad_compute.h"
#include <vector>
#include "lite/backends/arm/math/funcs.h"
#include "lite/backends/arm/math/sgemm.h"
#include "lite/core/op_registry.h"
#include "lite/core/type_system.h"
namespace paddle {
namespace lite {
namespace kernels {
namespace arm {
void MulGradCompute::PrepareForRun() {
auto& ctx = this->ctx_->template As<ARMContext>();
}
void MulGradCompute::Run() {
// step1 flatten_2d
auto& param = Param<param_t>();
const auto x_dims = param.x->dims();
const auto y_dims = param.y->dims();
const auto out_dims = param.output_grad->dims();
m_ = static_cast<int>(x_dims.Slice(0, param.x_num_col_dims).production());
k_ = static_cast<int>(
x_dims.Slice(param.x_num_col_dims, x_dims.size()).production());
n_ = static_cast<int>(
y_dims.Slice(param.y_num_col_dims, y_dims.size()).production());
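// Shape bookkeeping (illustrative values): with x dims {2, 3, 4, 5} and
// x_num_col_dims = 2, x is flattened to a (m_ = 2 * 3 = 6) x (k_ = 4 * 5 = 20)
// matrix; with y dims {20, 7} and y_num_col_dims = 1, n_ = 7. The two GEMMs
// below then compute x_grad = out_grad * y^T as (6, 7) x (7, 20) -> (6, 20)
// and y_grad = x^T * out_grad as (20, 6) x (6, 7) -> (20, 7).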
const auto* out_grad_data = param.output_grad->data<float>();
const auto* x_data = param.x->data<float>();
const auto* y_data = param.y->data<float>();
float* x_grad_data;
float* y_grad_data;
if (param.x_grad) {
x_grad_data = param.x_grad->mutable_data<float>();
}
if (param.y_grad) {
y_grad_data = param.y_grad->mutable_data<float>();
}
paddle::lite::operators::ActivationParam act_param;
act_param.has_active = false;
// out_grad * y^T = x_grad
// (m, n), (n, k) -> (m, k)
auto& ctx = this->ctx_->template As<ARMContext>();
if (param.x_grad) {
if (m_ == 1) {
lite::arm::math::sgemv(y_data,
out_grad_data,
x_grad_data,
false,
k_, // M
n_, // N
false,
nullptr,
false,
lite_api::ActivationType::kIndentity,
&ctx);
} else {
paddle::lite::arm::math::sgemm(false,
true, // is_transB,
m_, // M
k_, // N
n_, // K
1.0f, // alpha
out_grad_data, // A
n_, // lda
y_data, // B
n_, // ldb
0.f, // beta
x_grad_data, // C
k_, // ldc
NULL, // bias
false, // is_bias
act_param, // act_param
&ctx); // ctx
}
}
// x^T * out_grad = y_grad
// (k, m) (m, n) -> (k, n)
if (param.y_grad) {
if (n_ == 1) {
lite::arm::math::sgemv(x_data,
out_grad_data,
y_grad_data,
true,
k_, // M
m_, // N
false,
nullptr,
false,
lite_api::ActivationType::kIndentity,
&ctx);
} else {
paddle::lite::arm::math::sgemm(true, // is_transA
false, // is_transB,
k_, // M
n_, // N
m_, // K
1.0f, // alpha
x_data, // A
k_, // lda
out_grad_data, // B
n_, // ldb
0.f, // beta
y_grad_data, // C
n_, // ldc
NULL, // bias
false, // is_bias
act_param, // act_param
&ctx); // ctx
}
}
}
} // namespace arm
} // namespace kernels
} // namespace lite
} // namespace paddle
REGISTER_LITE_KERNEL(mul_grad,
kARM,
kFloat,
kNCHW,
paddle::lite::kernels::arm::MulGradCompute,
def)
.BindInput("X", {LiteType::GetTensorTy(TARGET(kARM))})
.BindInput("Y", {LiteType::GetTensorTy(TARGET(kARM))})
.BindInput("Out@GRAD", {LiteType::GetTensorTy(TARGET(kARM))})
.BindOutput("X@GRAD", {LiteType::GetTensorTy(TARGET(kARM))})
.BindOutput("Y@GRAD", {LiteType::GetTensorTy(TARGET(kARM))})
.Finalize();
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include "lite/core/kernel.h"
#include "lite/core/op_registry.h"
#include "lite/core/types.h"
namespace paddle {
namespace lite {
namespace kernels {
namespace arm {
class MulGradCompute : public KernelLite<TARGET(kARM), PRECISION(kFloat)> {
public:
using param_t = operators::MulGradParam;
void PrepareForRun() override;
void Run() override;
virtual ~MulGradCompute() = default;
private:
int m_, n_, k_;
};
} // namespace arm
} // namespace kernels
} // namespace lite
} // namespace paddle
...@@ -33,7 +33,7 @@ std::string UniqueName(const std::string& prefix) { ...@@ -33,7 +33,7 @@ std::string UniqueName(const std::string& prefix) {
counter = ++(it->second); counter = ++(it->second);
} }
return prefix + "_" + std::to_string(counter); return prefix + "_" + paddle::lite::to_string(counter);
} }
bool HasInputArg(const OpInfo* op_info, bool HasInputArg(const OpInfo* op_info,
......
if((NOT LITE_ON_MODEL_OPTIMIZE_TOOL) AND (NOT LITE_WITH_CUDA)) if((NOT LITE_ON_MODEL_OPTIMIZE_TOOL) AND (NOT LITE_WITH_PYTHON) AND (NOT LITE_WITH_CUDA))
return() return()
endif() endif()
......
if ((NOT LITE_ON_MODEL_OPTIMIZE_TOOL) AND (NOT LITE_WITH_FPGA)) if ((NOT LITE_ON_MODEL_OPTIMIZE_TOOL) AND (NOT LITE_WITH_PYTHON) AND (NOT LITE_WITH_FPGA))
return() return()
endif() endif()
......
...@@ -87,7 +87,8 @@ class Graph { ...@@ -87,7 +87,8 @@ class Graph {
auto idx = Add(name, node); auto idx = Add(name, node);
CHECK_GE(idx, 1); CHECK_GE(idx, 1);
// Generate a unique name for the created HiAI IR // Generate a unique name for the created HiAI IR
node->set_data(std::make_shared<T>(name + "__" + std::to_string(idx))); node->set_data(
std::make_shared<T>(name + "__" + paddle::lite::to_string(idx)));
return node; return node;
} }
......
...@@ -64,10 +64,12 @@ int SplitConverter(void* ctx, OpLite* op, KernelBase* kernel) { ...@@ -64,10 +64,12 @@ int SplitConverter(void* ctx, OpLite* op, KernelBase* kernel) {
split_op->create_dynamic_output_y(out_names.size()); split_op->create_dynamic_output_y(out_names.size());
int idx = 1; int idx = 1;
for (auto& out_name : out_names) { for (auto& out_name : out_names) {
auto zero_node = graph->Add(out_name + "/zero" + std::to_string(idx), 0); auto zero_node =
graph->Add(out_name + "/zero" + paddle::lite::to_string(idx), 0);
auto add_node = graph->Add<ge::op::Add>(out_name); auto add_node = graph->Add<ge::op::Add>(out_name);
auto add_op = add_node->data<ge::op::Add>(); auto add_op = add_node->data<ge::op::Add>();
add_op->set_input_x1(*split_node->data(), "y" + std::to_string(idx)); add_op->set_input_x1(*split_node->data(),
"y" + paddle::lite::to_string(idx));
add_op->set_input_x2(*zero_node->data()); add_op->set_input_x2(*zero_node->data());
idx++; idx++;
} }
......
...@@ -85,22 +85,31 @@ int SubgraphEngine::BuildDeviceProgram() { ...@@ -85,22 +85,31 @@ int SubgraphEngine::BuildDeviceProgram() {
<< "[NPU] No input nodes found for building NPU model"; << "[NPU] No input nodes found for building NPU model";
CHECK(!device_onames_.empty()) CHECK(!device_onames_.empty())
<< "[NPU] No output nodes found for building NPU model"; << "[NPU] No output nodes found for building NPU model";
// Build the HiAI IR graph to HiAI om model as the device program // Build the HiAI IR graph to HiAI om model as the device program
device_program_ = lite::npu::Device::Global().Build( if (device_program_map_.count(inputs_shape_) > 0) {
return status;
}
auto device_client = lite::npu::Device::Global().Build(
model_name_, device_inodes, device_onodes); model_name_, device_inodes, device_onodes);
if (device_program_ == nullptr) { if (device_client == nullptr) {
LOG(WARNING) << "[NPU] Build model failed!"; LOG(WARNING) << "[NPU] Build model failed!";
return subgraph::FAILED; return subgraph::FAILED;
} }
auto device_program = std::make_shared<device_program_t>(device_client);
device_program_map_[inputs_shape_] = device_program;
// Query and check the dimensions of valid input and output tensors // Query and check the dimensions of valid input and output tensors
std::vector<hiai::TensorDimension> device_idims, device_odims; std::vector<hiai::TensorDimension> device_idims, device_odims;
if (device_program_->GetModelIOTensorDim( if (device_program->client->GetModelIOTensorDim(
model_name_, device_idims, device_odims) != hiai::AI_SUCCESS) { model_name_, device_idims, device_odims) != hiai::AI_SUCCESS) {
LOG(WARNING) LOG(WARNING)
<< "[NPU] Get the dimensions of input and output tensors failed!"; << "[NPU] Get the dimensions of input and output tensors failed!";
return subgraph::FAILED; return subgraph::FAILED;
} }
device_program->device_idims = device_idims;
device_program->device_odims = device_odims;
CHECK_EQ(device_idims.size(), device_inames_.size()); CHECK_EQ(device_idims.size(), device_inames_.size());
CHECK_EQ(device_odims.size(), device_onames_.size()); CHECK_EQ(device_odims.size(), device_onames_.size());
origin_idims_.resize(device_inames_.size()); origin_idims_.resize(device_inames_.size());
...@@ -109,6 +118,7 @@ int SubgraphEngine::BuildDeviceProgram() { ...@@ -109,6 +118,7 @@ int SubgraphEngine::BuildDeviceProgram() {
origin_odims_.resize(device_onames_.size()); origin_odims_.resize(device_onames_.size());
origin_otensors_.resize(device_onames_.size()); origin_otensors_.resize(device_onames_.size());
device_otensors_.resize(device_onames_.size()); device_otensors_.resize(device_onames_.size());
for (int i = 0; i < device_inames_.size(); i++) { for (int i = 0; i < device_inames_.size(); i++) {
auto node = graph.Get(device_inames_[i]); auto node = graph.Get(device_inames_[i]);
auto precision = node->precision(); auto precision = node->precision();
...@@ -130,6 +140,8 @@ int SubgraphEngine::BuildDeviceProgram() { ...@@ -130,6 +140,8 @@ int SubgraphEngine::BuildDeviceProgram() {
device_itensors_[i].reset(new hiai::AiTensor); device_itensors_[i].reset(new hiai::AiTensor);
device_itensors_[i]->Init(&(device_idims[i])); device_itensors_[i]->Init(&(device_idims[i]));
} }
device_program->origin_idims = origin_idims_;
for (int i = 0; i < device_onames_.size(); i++) { for (int i = 0; i < device_onames_.size(); i++) {
auto node = graph.Get(device_onames_[i]); auto node = graph.Get(device_onames_[i]);
auto precision = node->precision(); auto precision = node->precision();
...@@ -170,6 +182,8 @@ int SubgraphEngine::BuildDeviceProgram() { ...@@ -170,6 +182,8 @@ int SubgraphEngine::BuildDeviceProgram() {
<< PrecisionToStr(precision); << PrecisionToStr(precision);
break; break;
} }
device_program->origin_odims = origin_odims_;
CHECK_EQ(origin_odims_[i].production(), CHECK_EQ(origin_odims_[i].production(),
device_odims[i].GetNumber() * device_odims[i].GetChannel() * device_odims[i].GetNumber() * device_odims[i].GetChannel() *
device_odims[i].GetHeight() * device_odims[i].GetWidth()); device_odims[i].GetHeight() * device_odims[i].GetWidth());
...@@ -181,14 +195,25 @@ int SubgraphEngine::BuildDeviceProgram() { ...@@ -181,14 +195,25 @@ int SubgraphEngine::BuildDeviceProgram() {
int SubgraphEngine::LaunchDeviceProgram() { int SubgraphEngine::LaunchDeviceProgram() {
// Copy the data of origin input tensors to the buffer of input HiAI tensors // Copy the data of origin input tensors to the buffer of input HiAI tensors
// init device_itensors_, device_otensors_, origin_otensors_
auto device_program = device_program_map_[inputs_shape_];
for (size_t i = 0; i < device_itensors_.size(); i++) { for (size_t i = 0; i < device_itensors_.size(); i++) {
device_itensors_[i]->Init(&(device_program->device_idims[i]));
std::memcpy(device_itensors_[i]->GetBuffer(), std::memcpy(device_itensors_[i]->GetBuffer(),
origin_itensors_[i]->raw_data(), origin_itensors_[i]->raw_data(),
origin_itensors_[i]->memory_size()); origin_itensors_[i]->memory_size());
} }
for (size_t i = 0; i < device_otensors_.size(); i++) {
device_otensors_[i]->Init(&(device_program->device_odims[i]));
}
for (size_t i = 0; i < origin_otensors_.size(); i++) {
origin_otensors_[i]->Resize(device_program->origin_odims[i]);
}
// Run the HiAI model by name // Run the HiAI model by name
std::string key = "model_name"; // Note: key seems must be model_name std::string key = "model_name"; // Note: key seems must be model_name
model_context_.AddPara(key, model_name_); hiai::AiContext model_context;
model_context.AddPara(key, model_name_);
auto GetCurrentUS = []() -> double { auto GetCurrentUS = []() -> double {
struct timeval time; struct timeval time;
gettimeofday(&time, NULL); gettimeofday(&time, NULL);
...@@ -196,11 +221,11 @@ int SubgraphEngine::LaunchDeviceProgram() { ...@@ -196,11 +221,11 @@ int SubgraphEngine::LaunchDeviceProgram() {
}; };
int istamp; int istamp;
auto start_time = GetCurrentUS(); auto start_time = GetCurrentUS();
CHECK_EQ( CHECK_EQ(device_program->client->Process(
device_program_->Process( model_context, device_itensors_, device_otensors_, 1000, istamp),
model_context_, device_itensors_, device_otensors_, 1000, istamp),
hiai::AI_SUCCESS); hiai::AI_SUCCESS);
VLOG(3) << "[NPU] Process cost " << GetCurrentUS() - start_time << " us"; VLOG(3) << "[NPU] Process cost " << GetCurrentUS() - start_time << " us";
// Copy the data of output HiAI tensor to the buffer of origin output tensors // Copy the data of output HiAI tensor to the buffer of origin output tensors
for (size_t i = 0; i < device_otensors_.size(); i++) { for (size_t i = 0; i < device_otensors_.size(); i++) {
std::memcpy(const_cast<void*>(origin_otensors_[i]->raw_data()), std::memcpy(const_cast<void*>(origin_otensors_[i]->raw_data()),
...@@ -210,6 +235,18 @@ int SubgraphEngine::LaunchDeviceProgram() { ...@@ -210,6 +235,18 @@ int SubgraphEngine::LaunchDeviceProgram() {
return 0; return 0;
} }
bool SubgraphEngine::InputShapeChanged() {
std::vector<std::vector<int64_t>> new_shape;
for (auto origin_itensor : origin_itensors_) {
new_shape.push_back(origin_itensor->dims().Vectorize());
}
inputs_shape_ = new_shape;
if (device_program_map_.count(inputs_shape_) > 0) {
return false;
}
return true;
}
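// Note on the shape-keyed cache (example shapes are hypothetical): the vector of
// input shapes, e.g. {{1, 3, 224, 224}, {1, 1}} for a two-input subgraph, is used
// as the key of device_program_map_. If a HiAI om model has already been built for
// exactly these shapes, InputShapeChanged() returns false and BuildDeviceProgram()
// returns early, so the cached client, origin_odims and device_idims/odims are
// reused by LaunchDeviceProgram().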
void SubgraphCompute::PrepareForRun() { void SubgraphCompute::PrepareForRun() {
auto& param = this->Param<param_t>(); auto& param = this->Param<param_t>();
engine_.reset(new SubgraphEngine(ctx_.get(), engine_.reset(new SubgraphEngine(ctx_.get(),
......
...@@ -14,6 +14,7 @@ ...@@ -14,6 +14,7 @@
#pragma once #pragma once
#include <map>
#include <memory> #include <memory>
#include <string> #include <string>
#include <vector> #include <vector>
...@@ -38,17 +39,29 @@ class SubgraphEngine : public subgraph::Engine { ...@@ -38,17 +39,29 @@ class SubgraphEngine : public subgraph::Engine {
: subgraph::Engine( : subgraph::Engine(
ctx, block_idx, block_desc, input_names, output_names, scope) {} ctx, block_idx, block_desc, input_names, output_names, scope) {}
struct device_program_t {
explicit device_program_t(std::shared_ptr<hiai::AiModelMngerClient> _client)
: client(_client) {}
std::shared_ptr<hiai::AiModelMngerClient> client{nullptr};
std::vector<DDim> origin_idims{};
std::vector<DDim> origin_odims{};
std::vector<hiai::TensorDimension> device_idims{};
std::vector<hiai::TensorDimension> device_odims{};
};
protected: protected:
int BuildDeviceProgram() override; int BuildDeviceProgram() override;
int LaunchDeviceProgram() override; int LaunchDeviceProgram() override;
bool InputShapeChanged() override;
std::string model_name_; std::string model_name_{"model.om"};
hiai::AiContext model_context_; std::vector<std::vector<int64_t>> inputs_shape_{};
std::vector<std::string> device_inames_; std::map<std::vector<std::vector<int64_t>>, std::shared_ptr<device_program_t>>
std::vector<std::string> device_onames_; device_program_map_{};
std::vector<std::shared_ptr<hiai::AiTensor>> device_itensors_; std::vector<std::string> device_inames_{};
std::vector<std::shared_ptr<hiai::AiTensor>> device_otensors_; std::vector<std::string> device_onames_{};
std::unique_ptr<hiai::AiModelMngerClient> device_program_{nullptr}; std::vector<std::shared_ptr<hiai::AiTensor>> device_itensors_{};
std::vector<std::shared_ptr<hiai::AiTensor>> device_otensors_{};
}; };
class SubgraphCompute : public KernelLite<TARGET(kNPU), PRECISION(kAny)> { class SubgraphCompute : public KernelLite<TARGET(kNPU), PRECISION(kAny)> {
......
if ((NOT LITE_ON_MODEL_OPTIMIZE_TOOL) AND (NOT LITE_WITH_OPENCL)) if ((NOT LITE_ON_MODEL_OPTIMIZE_TOOL) AND (NOT LITE_WITH_PYTHON) AND (NOT LITE_WITH_OPENCL))
return () return ()
endif() endif()
...@@ -128,6 +128,9 @@ add_kernel(io_copy_opencl OPENCL basic SRCS io_copy_buffer_compute.cc DEPS ${ten ...@@ -128,6 +128,9 @@ add_kernel(io_copy_opencl OPENCL basic SRCS io_copy_buffer_compute.cc DEPS ${ten
#lite_cc_test(test_conv_buffer_opencl SRCS conv_buffer_compute_test.cc #lite_cc_test(test_conv_buffer_opencl SRCS conv_buffer_compute_test.cc
# DEPS conv_opencl op_registry program context) # DEPS conv_opencl op_registry program context)
#lite_cc_test(test_im2col_buffer_opencl SRCS im2col_buffer_test.cc
# DEPS tensor cl_context cl_wrapper cl_target_wrapper)
#lite_cc_test(test_depthwise_conv2d_buffer_opencl SRCS depthwise_conv2d_buffer_compute_test.cc #lite_cc_test(test_depthwise_conv2d_buffer_opencl SRCS depthwise_conv2d_buffer_compute_test.cc
# DEPS depthwise_conv2d_opencl op_registry program context) # DEPS depthwise_conv2d_opencl op_registry program context)
......
...@@ -101,6 +101,7 @@ class ActivationComputeImageDefault ...@@ -101,6 +101,7 @@ class ActivationComputeImageDefault
status = kernel.setArg(++arg_idx, scale_); status = kernel.setArg(++arg_idx, scale_);
CL_CHECK_FATAL(status); CL_CHECK_FATAL(status);
#ifndef LITE_SHUTDOWN_LOG
VLOG(4) << TargetToStr(param.X->target()); VLOG(4) << TargetToStr(param.X->target());
VLOG(4) << TargetToStr(param.Out->target()); VLOG(4) << TargetToStr(param.Out->target());
VLOG(4) << "image_shape(w,h):" << image_shape["width"] << " " VLOG(4) << "image_shape(w,h):" << image_shape["width"] << " "
...@@ -112,6 +113,7 @@ class ActivationComputeImageDefault ...@@ -112,6 +113,7 @@ class ActivationComputeImageDefault
VLOG(4) << "threshold:" << threshold_; VLOG(4) << "threshold:" << threshold_;
VLOG(4) << "scale:" << scale_; VLOG(4) << "scale:" << scale_;
VLOG(4) << "kernel func name:" << kernel_func_name_; VLOG(4) << "kernel func name:" << kernel_func_name_;
#endif
auto global_work_size = auto global_work_size =
cl::NDRange{static_cast<cl::size_type>(image_shape["width"]), cl::NDRange{static_cast<cl::size_type>(image_shape["width"]),
...@@ -177,7 +179,7 @@ REGISTER_LITE_KERNEL( ...@@ -177,7 +179,7 @@ REGISTER_LITE_KERNEL(
// exp // exp
REGISTER_LITE_KERNEL( REGISTER_LITE_KERNEL(
exp_act, exp,
kOpenCL, kOpenCL,
kFP16, kFP16,
kImageDefault, kImageDefault,
...@@ -195,7 +197,7 @@ REGISTER_LITE_KERNEL( ...@@ -195,7 +197,7 @@ REGISTER_LITE_KERNEL(
// tanh // tanh
REGISTER_LITE_KERNEL( REGISTER_LITE_KERNEL(
tanh_act, tanh,
kOpenCL, kOpenCL,
kFP16, kFP16,
kImageDefault, kImageDefault,
......
...@@ -109,13 +109,13 @@ TEST(act_image2d_fp16, compute) { ...@@ -109,13 +109,13 @@ TEST(act_image2d_fp16, compute) {
func_name = "sigmoid"; func_name = "sigmoid";
break; break;
case 6: // tanh case 6: // tanh
func_name = "tanh_act"; func_name = "tanh";
break; break;
case 7: // tanh case 7: // tanh
func_name = "swish"; func_name = "swish";
break; break;
case 8: // tanh case 8: // tanh
func_name = "exp_act"; func_name = "exp";
break; break;
} }
LOG(INFO) << "func_name: " << func_name; LOG(INFO) << "func_name: " << func_name;
...@@ -307,7 +307,7 @@ USE_LITE_KERNEL(layout, kOpenCL, kAny, kImageDefault, NCHW_to_ImageDefault); ...@@ -307,7 +307,7 @@ USE_LITE_KERNEL(layout, kOpenCL, kAny, kImageDefault, NCHW_to_ImageDefault);
USE_LITE_KERNEL(layout, kOpenCL, kAny, kNCHW, ImageDefault_to_NCHW); USE_LITE_KERNEL(layout, kOpenCL, kAny, kNCHW, ImageDefault_to_NCHW);
// exp // exp
USE_LITE_KERNEL(exp_act, kOpenCL, kFP16, kImageDefault, ImageDefault); USE_LITE_KERNEL(exp, kOpenCL, kFP16, kImageDefault, ImageDefault);
// swish // swish
USE_LITE_KERNEL(swish, kOpenCL, kFP16, kImageDefault, ImageDefault); USE_LITE_KERNEL(swish, kOpenCL, kFP16, kImageDefault, ImageDefault);
...@@ -316,7 +316,7 @@ USE_LITE_KERNEL(swish, kOpenCL, kFP16, kImageDefault, ImageDefault); ...@@ -316,7 +316,7 @@ USE_LITE_KERNEL(swish, kOpenCL, kFP16, kImageDefault, ImageDefault);
USE_LITE_KERNEL(leaky_relu, kOpenCL, kFP16, kImageDefault, ImageDefault); USE_LITE_KERNEL(leaky_relu, kOpenCL, kFP16, kImageDefault, ImageDefault);
// tanh act // tanh act
USE_LITE_KERNEL(tanh_act, kOpenCL, kFP16, kImageDefault, ImageDefault); USE_LITE_KERNEL(tanh, kOpenCL, kFP16, kImageDefault, ImageDefault);
// relu image2d fp16 // relu image2d fp16
USE_LITE_KERNEL(relu, kOpenCL, kFP16, kImageDefault, ImageDefault); USE_LITE_KERNEL(relu, kOpenCL, kFP16, kImageDefault, ImageDefault);
......
...@@ -77,17 +77,21 @@ class BilinearInterpImageCompute ...@@ -77,17 +77,21 @@ class BilinearInterpImageCompute
int out_h = out_dims[2]; int out_h = out_dims[2];
int out_w = out_dims[3]; int out_w = out_dims[3];
#ifndef LITE_SHUTDOWN_LOG
VLOG(4) << "x->target():" << TargetToStr(x->target()); VLOG(4) << "x->target():" << TargetToStr(x->target());
VLOG(4) << "out->target():" << TargetToStr(out->target()); VLOG(4) << "out->target():" << TargetToStr(out->target());
VLOG(4) << "x->dims():" << in_dims; VLOG(4) << "x->dims():" << in_dims;
VLOG(4) << "out->dims():" << out_dims; VLOG(4) << "out->dims():" << out_dims;
#endif
auto out_image_shape = InitImageDimInfoWith(out_dims); auto out_image_shape = InitImageDimInfoWith(out_dims);
auto* x_img = x->data<half_t, cl::Image2D>(); auto* x_img = x->data<half_t, cl::Image2D>();
// VLOG(4) << "x_image: " << x_img;
auto* out_img = out->mutable_data<half_t, cl::Image2D>( auto* out_img = out->mutable_data<half_t, cl::Image2D>(
out_image_shape["width"], out_image_shape["height"]); out_image_shape["width"], out_image_shape["height"]);
#ifndef LITE_SHUTDOWN_LOG
// VLOG(4) << "x_image: " << x_img;
// VLOG(4) << "out_image: " << out_img; // VLOG(4) << "out_image: " << out_img;
VLOG(4) << "out_image_shape[w,h]: " << out_image_shape["width"] << " " VLOG(4) << "out_image_shape[w,h]: " << out_image_shape["width"] << " "
<< out_image_shape["height"]; << out_image_shape["height"];
...@@ -96,6 +100,7 @@ class BilinearInterpImageCompute ...@@ -96,6 +100,7 @@ class BilinearInterpImageCompute
<< ", align_delta: " << align_delta; << ", align_delta: " << align_delta;
VLOG(4) << "in_h: " << in_h << ", in_w: " << in_w; VLOG(4) << "in_h: " << in_h << ", in_w: " << in_w;
VLOG(4) << "out_h: " << out_h << ", out_w: " << out_w; VLOG(4) << "out_h: " << out_h << ", out_w: " << out_w;
#endif
STL::stringstream kernel_key; STL::stringstream kernel_key;
kernel_key << kernel_func_name_ << build_options_; kernel_key << kernel_func_name_ << build_options_;
...@@ -107,8 +112,10 @@ class BilinearInterpImageCompute ...@@ -107,8 +112,10 @@ class BilinearInterpImageCompute
DDim(std::vector<DDim::value_type>{ DDim(std::vector<DDim::value_type>{
static_cast<int64_t>(out_image_shape["width"]), static_cast<int64_t>(out_image_shape["width"]),
static_cast<int64_t>(out_image_shape["height"])})); static_cast<int64_t>(out_image_shape["height"])}));
#ifndef LITE_SHUTDOWN_LOG
VLOG(4) << "default_work_size: " << default_work_size[0] << ", " VLOG(4) << "default_work_size: " << default_work_size[0] << ", "
<< default_work_size[1] << ", " << default_work_size[2]; << default_work_size[1] << ", " << default_work_size[2];
#endif
cl_int status = kernel.setArg(arg_idx++, *x_img); cl_int status = kernel.setArg(arg_idx++, *x_img);
CL_CHECK_FATAL(status); CL_CHECK_FATAL(status);
status = kernel.setArg(arg_idx++, *out_img); status = kernel.setArg(arg_idx++, *out_img);
...@@ -142,9 +149,10 @@ class BilinearInterpImageCompute ...@@ -142,9 +149,10 @@ class BilinearInterpImageCompute
event_.get()); event_.get());
CL_CHECK_FATAL(status); CL_CHECK_FATAL(status);
context.cl_wait_list()->emplace(out_img, event_); context.cl_wait_list()->emplace(out_img, event_);
#ifndef LITE_SHUTDOWN_LOG
VLOG(4) << "global_work_size:[2D]:" << global_work_size[0] << " " VLOG(4) << "global_work_size:[2D]:" << global_work_size[0] << " "
<< global_work_size[1] << " " << global_work_size[2]; << global_work_size[1] << " " << global_work_size[2];
#endif
} }
protected: protected:
......
...@@ -123,7 +123,8 @@ class ConcatComputeImage : public KernelLite<TARGET(kOpenCL), ...@@ -123,7 +123,8 @@ class ConcatComputeImage : public KernelLite<TARGET(kOpenCL),
int arg_idx = 0; int arg_idx = 0;
int width = inputs[0]->dims()[inputs[0]->dims().size() - 1]; int width = inputs[0]->dims()[inputs[0]->dims().size() - 1];
VLOG(4) << "concat 输入尺寸: "; #ifndef LITE_SHUTDOWN_LOG
VLOG(4) << "concat input shape: ";
for (size_t i = 0; i < inputs.size(); i++) { for (size_t i = 0; i < inputs.size(); i++) {
VLOG(4) << "inputs [" << i << "]" VLOG(4) << "inputs [" << i << "]"
<< "[" << inputs[i]->dims().size() << "D]:" << "[" << inputs[i]->dims().size() << "D]:"
...@@ -132,12 +133,13 @@ class ConcatComputeImage : public KernelLite<TARGET(kOpenCL), ...@@ -132,12 +133,13 @@ class ConcatComputeImage : public KernelLite<TARGET(kOpenCL),
<< inputs[i]->dims()[3]; << inputs[i]->dims()[3];
} }
VLOG(4) << "concat 输出尺寸: "; VLOG(4) << "concat output shape: ";
VLOG(4) << " out dims: " VLOG(4) << " out dims: "
<< "[" << x_dims.size() << "D]:" << x_dims[0] << " " << x_dims[1] << "[" << x_dims.size() << "D]:" << x_dims[0] << " " << x_dims[1]
<< " " << x_dims[2] << " " << x_dims[3]; << " " << x_dims[2] << " " << x_dims[3];
VLOG(4) << "axis_: " << axis_; VLOG(4) << "axis_: " << axis_;
VLOG(4) << "flag_: " << flag_; VLOG(4) << "flag_: " << flag_;
#endif
auto global_work_size = auto global_work_size =
cl::NDRange{static_cast<cl::size_type>(x_dims[x_dims.size() - 1]), cl::NDRange{static_cast<cl::size_type>(x_dims[x_dims.size() - 1]),
...@@ -145,6 +147,7 @@ class ConcatComputeImage : public KernelLite<TARGET(kOpenCL), ...@@ -145,6 +147,7 @@ class ConcatComputeImage : public KernelLite<TARGET(kOpenCL),
x_dims[x_dims.size() - 1]), x_dims[x_dims.size() - 1]),
static_cast<cl::size_type>(image_shape["height"])}; static_cast<cl::size_type>(image_shape["height"])};
#ifndef LITE_SHUTDOWN_LOG
VLOG(4) << TargetToStr(param.output->target()); VLOG(4) << TargetToStr(param.output->target());
VLOG(4) << "image_shape(w,h):" << image_shape["width"] << " " VLOG(4) << "image_shape(w,h):" << image_shape["width"] << " "
<< image_shape["height"]; << image_shape["height"];
...@@ -157,6 +160,7 @@ class ConcatComputeImage : public KernelLite<TARGET(kOpenCL), ...@@ -157,6 +160,7 @@ class ConcatComputeImage : public KernelLite<TARGET(kOpenCL),
VLOG(4) << "global_work_size: " << x_dims[x_dims.size() - 1] << " " VLOG(4) << "global_work_size: " << x_dims[x_dims.size() - 1] << " "
<< (image_shape["width"] / x_dims[x_dims.size() - 1]) << " " << (image_shape["width"] / x_dims[x_dims.size() - 1]) << " "
<< (image_shape["height"]); << (image_shape["height"]);
#endif
auto kernel = context.cl_context()->GetKernel(kernel_key.str()); auto kernel = context.cl_context()->GetKernel(kernel_key.str());
int out_w = x_dims[x_dims.size() - 1]; int out_w = x_dims[x_dims.size() - 1];
...@@ -198,8 +202,10 @@ class ConcatComputeImage : public KernelLite<TARGET(kOpenCL), ...@@ -198,8 +202,10 @@ class ConcatComputeImage : public KernelLite<TARGET(kOpenCL),
image_shape = InitImageDimInfoWith(in_dims); image_shape = InitImageDimInfoWith(in_dims);
auto* x_buf = inputs[i]->data<half_t, cl::Image2D>(); auto* x_buf = inputs[i]->data<half_t, cl::Image2D>();
int in_w = in_dims[in_dims.size() - 1]; int in_w = in_dims[in_dims.size() - 1];
#ifndef LITE_SHUTDOWN_LOG
VLOG(4) << "image_shape(w,h):" << image_shape["width"] << " " VLOG(4) << "image_shape(w,h):" << image_shape["width"] << " "
<< image_shape["height"]; << image_shape["height"];
#endif
global_work_size = global_work_size =
cl::NDRange{static_cast<cl::size_type>(in_dims[in_dims.size() - 1]), cl::NDRange{static_cast<cl::size_type>(in_dims[in_dims.size() - 1]),
static_cast<cl::size_type>(image_shape["width"] / static_cast<cl::size_type>(image_shape["width"] /
......
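For reference, the 3-D global work size used by the concat kernel above can be derived from the NCHW shape and the image2d layout. The plain-C++ sketch below assumes the default image converter maps NCHW to width = W * ceil(C/4) and height = N * H; the helper name is illustrative, not the library API.

    // Sketch (no OpenCL dependency): deriving the concat kernel's work size.
    #include <cstdint>
    #include <iostream>

    struct ImageShape { int64_t width, height; };

    ImageShape DefaultImageShape(int64_t n, int64_t c, int64_t h, int64_t w) {
      return { w * ((c + 3) / 4), n * h };  // 4 channels packed per RGBA texel
    }

    int main() {
      const int64_t n = 1, c = 6, h = 32, w = 32;
      const ImageShape img = DefaultImageShape(n, c, h, w);
      // Mirrors NDRange{ W, image_width / W, image_height } from the patch.
      const int64_t gws[3] = { w, img.width / w, img.height };
      std::cout << gws[0] << " " << gws[1] << " " << gws[2] << std::endl;
      return 0;
    }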
...@@ -41,11 +41,13 @@ class ConvImageCompute : public KernelLite<TARGET(kOpenCL), ...@@ -41,11 +41,13 @@ class ConvImageCompute : public KernelLite<TARGET(kOpenCL),
void Run() override; void Run() override;
private: private:
void Conv2d1x1(); void Conv2d1x1opt();
void Conv2d3x3(); void Conv2d3x3();
void Conv2d3x3opt(); void Conv2d3x3opt();
void Conv2d5x5(); void Conv2d5x5();
void Conv2d5x5opt();
void Conv2d7x7(); void Conv2d7x7();
void Conv2d7x7opt();
void DepthwiseConv2d3x3s1(); void DepthwiseConv2d3x3s1();
void DepthwiseConv2d3x3(); void DepthwiseConv2d3x3();
void DepthwiseConv2d(); void DepthwiseConv2d();
......
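The header above renames Conv2d1x1 to Conv2d1x1opt and adds Conv2d5x5opt / Conv2d7x7opt. The diff does not show how these private methods are chosen, but a dispatch keyed on filter size is the likely shape of it. The sketch below is hypothetical except for the method names taken from the header.

    // Hypothetical selection of the per-filter-size kernels; not the actual
    // ConvImageCompute implementation, just a trimmed stand-in.
    #include <functional>
    #include <stdexcept>

    class ConvImageCompute {
     public:
      void PrepareForRun(int kh, int kw, bool depthwise) {
        if (depthwise) {
          impl_ = [this] { DepthwiseConv2d(); };
        } else if (kh == 1 && kw == 1) {
          impl_ = [this] { Conv2d1x1opt(); };
        } else if (kh == 3 && kw == 3) {
          impl_ = [this] { Conv2d3x3opt(); };
        } else if (kh == 5 && kw == 5) {
          impl_ = [this] { Conv2d5x5opt(); };
        } else if (kh == 7 && kw == 7) {
          impl_ = [this] { Conv2d7x7opt(); };
        } else {
          throw std::runtime_error("unsupported filter size");
        }
      }
      void Run() { impl_(); }

     private:
      // Empty bodies stand in for the real OpenCL launches.
      void Conv2d1x1opt() {}
      void Conv2d3x3opt() {}
      void Conv2d5x5opt() {}
      void Conv2d7x7opt() {}
      void DepthwiseConv2d() {}
      std::function<void()> impl_;
    };

    int main() {
      ConvImageCompute conv;
      conv.PrepareForRun(3, 3, /*depthwise=*/false);
      conv.Run();
      return 0;
    }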
...@@ -510,7 +510,7 @@ TEST(conv2d, compute_image2d_3x3) { ...@@ -510,7 +510,7 @@ TEST(conv2d, compute_image2d_3x3) {
const int dilation = 1; const int dilation = 1;
const int stride = 2; const int stride = 2;
const int group = 1; const int group = 1;
for (int batch_size = 1; batch_size < 2; ++batch_size) { for (int batch_size = 1; batch_size < 4; ++batch_size) {
for (int oc = 1; oc < 10; oc += 1) { // oc for (int oc = 1; oc < 10; oc += 1) { // oc
for (int ih = 5; ih < 9; ih += 1) { // ih for (int ih = 5; ih < 9; ih += 1) { // ih
int iw = ih; int iw = ih;
...@@ -532,7 +532,7 @@ const int stride = 2; ...@@ -532,7 +532,7 @@ const int stride = 2;
#else // big scale with group #else // big scale with group
const int stride = 1; const int stride = 1;
const int group = 32 / 1; const int group = 32 / 1;
const int batch_size = 1; const int batch_size = 2;
const int ic = 32 / 1; const int ic = 32 / 1;
const int ih = 112 / 1; const int ih = 112 / 1;
const int iw = 112 / 1; const int iw = 112 / 1;
...@@ -558,7 +558,8 @@ const int stride = 2; ...@@ -558,7 +558,8 @@ const int stride = 2;
PRECISION(kFP16), PRECISION(kFP16),
DATALAYOUT(kImageDefault)); DATALAYOUT(kImageDefault));
ASSERT_FALSE(kernels.empty()); ASSERT_FALSE(kernels.empty());
CHECK(batch_size == 1) << "conv3x3 only supprt batch_size == 1"; // CHECK(batch_size == 1) << "conv3x3 only supprt
// batch_size == 1";
auto kernel = std::move(kernels.front()); auto kernel = std::move(kernels.front());
SHADOW_LOG << "created conv2d kernel"; SHADOW_LOG << "created conv2d kernel";
...@@ -886,13 +887,14 @@ TEST(conv2d, compute_image2d_5x5) { ...@@ -886,13 +887,14 @@ TEST(conv2d, compute_image2d_5x5) {
// int loop_cnt = 0; // int loop_cnt = 0;
#ifdef LOOP_TEST #ifdef LOOP_TEST
for (int batch_size = 2; batch_size < 4; ++batch_size) { for (int batch_size = 1; batch_size < 4; ++batch_size) {
for (int oc = 1; oc < 10; oc += 1) { // oc for (int oc = 1; oc < 5; oc += 1) { // oc
for (int ih = 5; ih < 9; ih += 1) { // ih for (int ih = 5; ih < 8; ih += 1) { // ih
int iw = ih; int iw = ih;
for (int ic = 2; ic < 10; ic += 1) { // ic for (int ic = 2; ic < 6; ic += 1) { // ic
for (bool bias_flag : {true, false}) { for (bool bias_flag : {true, false}) {
for (std::string relu_flag : {/*true,*/ "relu"}) { for (std::string relu_flag : {""
"relu"}) {
#else #else
const int batch_size = 2; const int batch_size = 2;
const int oc = 1; const int oc = 1;
...@@ -1006,10 +1008,10 @@ TEST(conv2d, compute_image2d_5x5) { ...@@ -1006,10 +1008,10 @@ TEST(conv2d, compute_image2d_5x5) {
SHADOW_LOG << "gen input and filter ..."; SHADOW_LOG << "gen input and filter ...";
for (auto& i : input_v) { for (auto& i : input_v) {
i = 0.01 * gen(engine); i = 0.5 * gen(engine);
} }
for (auto& f : filter_v) { for (auto& f : filter_v) {
f = 0.01 * gen(engine); f = 0.5 * gen(engine);
} }
SHADOW_LOG << "after gen input and filter ..."; SHADOW_LOG << "after gen input and filter ...";
...@@ -1216,9 +1218,10 @@ TEST(conv2d, compute_image2d_5x5) { ...@@ -1216,9 +1218,10 @@ TEST(conv2d, compute_image2d_5x5) {
#undef LOOP_TEST #undef LOOP_TEST
#undef PRINT_RESULT #undef PRINT_RESULT
#endif #endif
#ifdef TEST_CONV_IMAGE_7x7 #ifdef TEST_CONV_IMAGE_7x7
#undef FP16_ABS_DIFF // #undef FP16_ABS_DIFF
#define FP16_ABS_DIFF (1e0) // #define FP16_ABS_DIFF (1e-1)
// #define LOOP_TEST // #define LOOP_TEST
TEST(conv2d, compute_image2d_7x7) { TEST(conv2d, compute_image2d_7x7) {
// conv infos // conv infos
...@@ -1230,13 +1233,13 @@ TEST(conv2d, compute_image2d_7x7) { ...@@ -1230,13 +1233,13 @@ TEST(conv2d, compute_image2d_7x7) {
// int loop_cnt = 0; // int loop_cnt = 0;
#ifdef LOOP_TEST #ifdef LOOP_TEST
for (int batch_size = 2; batch_size < 4; ++batch_size) { for (int batch_size = 1; batch_size < 4; ++batch_size) {
for (int oc = 1; oc < 10; oc += 1) { // oc for (int oc = 1; oc < 6; oc += 1) { // oc
for (int ih = 7; ih < 15; ih += 1) { // ih for (int ih = 7; ih < 8; ih += 1) { // ih
int iw = ih; int iw = ih;
for (int ic = 2; ic < 10; ic += 1) { // ic for (int ic = 2; ic < 4; ic += 1) { // ic
for (bool bias_flag : {true, false}) { for (bool bias_flag : {false, true}) {
for (std::string relu_flag : {"relu"}) { for (std::string relu_flag : {"", "relu"}) {
#else #else
const int batch_size = 2; const int batch_size = 2;
const int oc = 1; const int oc = 1;
...@@ -1343,14 +1346,16 @@ TEST(conv2d, compute_image2d_7x7) { ...@@ -1343,14 +1346,16 @@ TEST(conv2d, compute_image2d_7x7) {
SHADOW_LOG << "gen input and filter ..."; SHADOW_LOG << "gen input and filter ...";
for (auto& i : input_v) { for (auto& i : input_v) {
i = gen(engine); i = 0.1 * gen(engine);
#ifdef TEST_CONV_IMAGE_ALL_1 #ifdef TEST_CONV_IMAGE_ALL_1
i = 1; i = 1;
#endif #endif
} }
int fiii = 1;
for (auto& f : filter_v) { for (auto& f : filter_v) {
f = gen(engine); f = 0.1 * gen(engine);
#ifdef TEST_CONV_IMAGE_ALL_1 #ifdef TEST_CONV_IMAGE_ALL_1
// f = fiii++;
f = 1; f = 1;
#endif #endif
} }
...@@ -1424,7 +1429,8 @@ TEST(conv2d, compute_image2d_7x7) { ...@@ -1424,7 +1429,8 @@ TEST(conv2d, compute_image2d_7x7) {
filter.Assign<float, lite::DDim, TARGET(kARM)>(filter_v.data(), filter.Assign<float, lite::DDim, TARGET(kARM)>(filter_v.data(),
filter_dim); filter_dim);
// auto* filter_image2d = filter.mutable_data<float, // auto* filter_image2d =
// filter.mutable_data < float,
// cl::Image2D>( // cl::Image2D>(
// filter_image_width, // filter_image_width,
// filter_image_height, // filter_image_height,
......
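The test changes above widen the batch_size loops, scale the random inputs and filters (e.g. 0.1 * gen(engine)), and stop overriding FP16_ABS_DIFF; all of this keeps FP16 accumulation error inside the comparison tolerance. Below is a sketch of the kind of absolute-difference check such tests rely on; the helper and the sample values are illustrative only.

    // Illustrative element-wise tolerance check, EXPECT_NEAR-style.
    #include <cmath>
    #include <cstdio>
    #include <vector>

    static bool AllClose(const std::vector<float>& out,
                         const std::vector<float>& ref,
                         float abs_diff) {
      for (size_t i = 0; i < out.size(); ++i) {
        if (std::fabs(out[i] - ref[i]) > abs_diff) {
          std::printf("mismatch at %zu: %f vs %f\n", i, out[i], ref[i]);
          return false;
        }
      }
      return true;
    }

    int main() {
      const std::vector<float> gpu = {0.50f, 1.01f};
      const std::vector<float> cpu = {0.50f, 1.00f};
      // Smaller random inputs keep FP16 accumulation error under a tolerance
      // in this ballpark.
      const float kFp16AbsDiff = 1e-1f;
      return AllClose(gpu, cpu, kFp16AbsDiff) ? 0 : 1;
    }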
...@@ -41,9 +41,11 @@ void ElementwiseAddCompute::Run() { ...@@ -41,9 +41,11 @@ void ElementwiseAddCompute::Run() {
STL::stringstream kernel_key; STL::stringstream kernel_key;
kernel_key << kernel_func_name_ << build_options_; kernel_key << kernel_func_name_ << build_options_;
auto kernel = context.cl_context()->GetKernel(kernel_key.str()); auto kernel = context.cl_context()->GetKernel(kernel_key.str());
#ifndef LITE_SHUTDOWN_LOG
VLOG(4) << TargetToStr(ele_param_->X->target()); VLOG(4) << TargetToStr(ele_param_->X->target());
VLOG(4) << TargetToStr(ele_param_->Y->target()); VLOG(4) << TargetToStr(ele_param_->Y->target());
VLOG(4) << TargetToStr(ele_param_->Out->target()); VLOG(4) << TargetToStr(ele_param_->Out->target());
#endif
int arg_idx = 0; int arg_idx = 0;
cl_int status = kernel.setArg(arg_idx, *x_buf); cl_int status = kernel.setArg(arg_idx, *x_buf);
CL_CHECK_FATAL(status); CL_CHECK_FATAL(status);
...@@ -87,10 +89,12 @@ void ElementwiseAddCompute::UpdateParams() { ...@@ -87,10 +89,12 @@ void ElementwiseAddCompute::UpdateParams() {
for (int i = static_cast<int>(y_dims.size() + axis); i < x_dims.size(); ++i) { for (int i = static_cast<int>(y_dims.size() + axis); i < x_dims.size(); ++i) {
num_ *= x_dims[i]; num_ *= x_dims[i];
} }
#ifndef LITE_SHUTDOWN_LOG
VLOG(4) << "axis: " << axis; VLOG(4) << "axis: " << axis;
VLOG(4) << "batch: " << batch_; VLOG(4) << "batch: " << batch_;
VLOG(4) << "channels: " << channels_; VLOG(4) << "channels: " << channels_;
VLOG(4) << "num: " << num_; VLOG(4) << "num: " << num_;
#endif
} }
} // namespace opencl } // namespace opencl
......
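UpdateParams above logs axis, batch, channels and num; these come from flattening X to [batch, channels, num] so that a smaller Y can be broadcast along the middle dimension, exactly as the loop over x_dims in the diff computes num_. A plain-C++ sketch of that decomposition follows; the helper is illustrative, not the library API.

    // Broadcast decomposition for elementwise add: X viewed as
    // [batch, channels, num], Y of size `channels` broadcast over the middle.
    #include <cstdint>
    #include <iostream>
    #include <vector>

    struct BroadcastShape { int64_t batch = 1, channels = 1, num = 1; };

    BroadcastShape Decompose(const std::vector<int64_t>& x_dims,
                             const std::vector<int64_t>& y_dims, int axis) {
      if (axis < 0) axis = static_cast<int>(x_dims.size() - y_dims.size());
      BroadcastShape s;
      for (int i = 0; i < axis; ++i) s.batch *= x_dims[i];
      for (size_t i = 0; i < y_dims.size(); ++i) s.channels *= y_dims[i];
      for (size_t i = y_dims.size() + axis; i < x_dims.size(); ++i) s.num *= x_dims[i];
      return s;
    }

    int main() {
      const BroadcastShape s = Decompose({2, 3, 4, 5}, {3}, 1);
      std::cout << "batch: " << s.batch << " channels: " << s.channels
                << " num: " << s.num << std::endl;  // 2, 3, 20
      return 0;
    }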
...@@ -62,6 +62,7 @@ void ElementwiseAddImageCompute::Run() { ...@@ -62,6 +62,7 @@ void ElementwiseAddImageCompute::Run() {
auto* out = ele_param_->Out; auto* out = ele_param_->Out;
auto axis = ele_param_->axis; auto axis = ele_param_->axis;
#ifndef LITE_SHUTDOWN_LOG
VLOG(4) << "x->target():" << TargetToStr(x->target()); VLOG(4) << "x->target():" << TargetToStr(x->target());
VLOG(4) << "y->target():" << TargetToStr(y->target()); VLOG(4) << "y->target():" << TargetToStr(y->target());
VLOG(4) << "out->target():" << TargetToStr(out->target()); VLOG(4) << "out->target():" << TargetToStr(out->target());
...@@ -69,6 +70,7 @@ void ElementwiseAddImageCompute::Run() { ...@@ -69,6 +70,7 @@ void ElementwiseAddImageCompute::Run() {
VLOG(4) << "y->dims():" << y->dims(); VLOG(4) << "y->dims():" << y->dims();
VLOG(4) << "out->dims():" << out->dims(); VLOG(4) << "out->dims():" << out->dims();
VLOG(4) << "axis:" << axis; VLOG(4) << "axis:" << axis;
#endif
paddle::lite::CLImageConverterDefault default_convertor; paddle::lite::CLImageConverterDefault default_convertor;
auto x_img_shape = default_convertor.InitImageDimInfoWith(x->dims()); // w, h auto x_img_shape = default_convertor.InitImageDimInfoWith(x->dims()); // w, h
...@@ -83,10 +85,12 @@ void ElementwiseAddImageCompute::Run() { ...@@ -83,10 +85,12 @@ void ElementwiseAddImageCompute::Run() {
auto* out_img = out->mutable_data<half_t, cl::Image2D>(out_img_shape[0], auto* out_img = out->mutable_data<half_t, cl::Image2D>(out_img_shape[0],
out_img_shape[1]); out_img_shape[1]);
#ifndef LITE_SHUTDOWN_LOG
VLOG(4) << "x_img_shape[w,h]:" << x_img_width << " " << x_img_height; VLOG(4) << "x_img_shape[w,h]:" << x_img_width << " " << x_img_height;
VLOG(4) << "y_img_shape[w,h]:" << y_img_shape[0] << " " << y_img_shape[1]; VLOG(4) << "y_img_shape[w,h]:" << y_img_shape[0] << " " << y_img_shape[1];
VLOG(4) << "out_img_shape[w,h]:" << out_img_shape[0] << " " VLOG(4) << "out_img_shape[w,h]:" << out_img_shape[0] << " "
<< out_img_shape[1]; << out_img_shape[1];
#endif
STL::stringstream kernel_key; STL::stringstream kernel_key;
kernel_key << kernel_func_name_ << build_options_; kernel_key << kernel_func_name_ << build_options_;
...@@ -104,8 +108,9 @@ void ElementwiseAddImageCompute::Run() { ...@@ -104,8 +108,9 @@ void ElementwiseAddImageCompute::Run() {
} else if (y_dims.size() == 1) { } else if (y_dims.size() == 1) {
if (axis == x->dims().size() - 1 || axis == x->dims().size() - 3) { if (axis == x->dims().size() - 1 || axis == x->dims().size() - 3) {
int tensor_w = x->dims()[x->dims().size() - 1]; int tensor_w = x->dims()[x->dims().size() - 1];
#ifndef LITE_SHUTDOWN_LOG
VLOG(4) << "tensor_w:" << tensor_w; VLOG(4) << "tensor_w:" << tensor_w;
#endif
cl_int status = kernel.setArg(arg_idx, *x_img); cl_int status = kernel.setArg(arg_idx, *x_img);
CL_CHECK_FATAL(status); CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, *y_img); status = kernel.setArg(++arg_idx, *y_img);
...@@ -127,7 +132,9 @@ void ElementwiseAddImageCompute::Run() { ...@@ -127,7 +132,9 @@ void ElementwiseAddImageCompute::Run() {
auto global_work_size = cl::NDRange{static_cast<cl::size_type>(x_img_width), auto global_work_size = cl::NDRange{static_cast<cl::size_type>(x_img_width),
static_cast<cl::size_type>(x_img_height)}; static_cast<cl::size_type>(x_img_height)};
#ifndef LITE_SHUTDOWN_LOG
VLOG(4) << "global_work_size:[2D]:" << x_img_width << " " << x_img_height; VLOG(4) << "global_work_size:[2D]:" << x_img_width << " " << x_img_height;
#endif
auto status = context.cl_context()->GetCommandQueue().enqueueNDRangeKernel( auto status = context.cl_context()->GetCommandQueue().enqueueNDRangeKernel(
kernel, kernel,
cl::NullRange, cl::NullRange,
......
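In the image path above, a 1-D Y is only broadcast when axis points at the width axis (rank - 1) or the channel axis (rank - 3); in the default image2d layout both of these vary along the image width, which is presumably why only these two cases pass tensor_w to the kernel. A small illustrative check of that condition, under those assumptions:

    // Illustrative validity check for 1-D broadcast in the image kernel path.
    #include <cstdint>
    #include <iostream>
    #include <vector>

    bool ImageBroadcastSupported(const std::vector<int64_t>& x_dims,
                                 const std::vector<int64_t>& y_dims, int axis) {
      if (y_dims.size() != 1) return false;
      const int rank = static_cast<int>(x_dims.size());
      if (axis < 0) axis = rank - 1;  // 1-D Y aligned to the trailing dim
      return axis == rank - 1 || axis == rank - 3;
    }

    int main() {
      std::cout << ImageBroadcastSupported({1, 32, 16, 16}, {32}, 1) << "\n";  // 1 (channel)
      std::cout << ImageBroadcastSupported({1, 32, 16, 16}, {16}, 3) << "\n";  // 1 (width)
      std::cout << ImageBroadcastSupported({1, 32, 16, 16}, {16}, 2) << "\n";  // 0
      return 0;
    }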
...@@ -57,10 +57,12 @@ class GridSamplerImageCompute : public KernelLite<TARGET(kOpenCL), ...@@ -57,10 +57,12 @@ class GridSamplerImageCompute : public KernelLite<TARGET(kOpenCL),
auto out_dims = out->dims(); auto out_dims = out->dims();
auto in_dims = x->dims(); auto in_dims = x->dims();
#ifndef LITE_SHUTDOWN_LOG
VLOG(4) << "x->target():" << TargetToStr(x->target()); VLOG(4) << "x->target():" << TargetToStr(x->target());
VLOG(4) << "out->target():" << TargetToStr(out->target()); VLOG(4) << "out->target():" << TargetToStr(out->target());
VLOG(4) << "x->dims():" << in_dims; VLOG(4) << "x->dims():" << in_dims;
VLOG(4) << "out->dims():" << out_dims; VLOG(4) << "out->dims():" << out_dims;
#endif
auto out_image_shape = InitImageDimInfoWith(out_dims); auto out_image_shape = InitImageDimInfoWith(out_dims);
auto* x_img = x->data<half_t, cl::Image2D>(); auto* x_img = x->data<half_t, cl::Image2D>();
...@@ -71,10 +73,11 @@ class GridSamplerImageCompute : public KernelLite<TARGET(kOpenCL), ...@@ -71,10 +73,11 @@ class GridSamplerImageCompute : public KernelLite<TARGET(kOpenCL),
auto* out_img = out->mutable_data<half_t, cl::Image2D>( auto* out_img = out->mutable_data<half_t, cl::Image2D>(
out_image_shape["width"], out_image_shape["height"]); out_image_shape["width"], out_image_shape["height"]);
#ifndef LITE_SHUTDOWN_LOG
// VLOG(4) << "out_image" << out_img; // VLOG(4) << "out_image" << out_img;
VLOG(4) << "out_image_shape[w,h]:" << out_image_shape["width"] << " " VLOG(4) << "out_image_shape[w,h]:" << out_image_shape["width"] << " "
<< out_image_shape["height"]; << out_image_shape["height"];
#endif
STL::stringstream kernel_key; STL::stringstream kernel_key;
kernel_key << kernel_func_name_ << build_options_; kernel_key << kernel_func_name_ << build_options_;
auto kernel = context.cl_context()->GetKernel(kernel_key.str()); auto kernel = context.cl_context()->GetKernel(kernel_key.str());
...@@ -87,8 +90,10 @@ class GridSamplerImageCompute : public KernelLite<TARGET(kOpenCL), ...@@ -87,8 +90,10 @@ class GridSamplerImageCompute : public KernelLite<TARGET(kOpenCL),
DDim(std::vector<DDim::value_type>{ DDim(std::vector<DDim::value_type>{
static_cast<int64_t>(out_image_shape["width"]), static_cast<int64_t>(out_image_shape["width"]),
static_cast<int64_t>(out_image_shape["height"])})); static_cast<int64_t>(out_image_shape["height"])}));
#ifndef LITE_SHUTDOWN_LOG
VLOG(4) << "default_work_size: " << default_work_size[0] << ", " VLOG(4) << "default_work_size: " << default_work_size[0] << ", "
<< default_work_size[1] << ", " << default_work_size[2]; << default_work_size[1] << ", " << default_work_size[2];
#endif
cl_int status = kernel.setArg(arg_idx++, *x_img); cl_int status = kernel.setArg(arg_idx++, *x_img);
CL_CHECK_FATAL(status); CL_CHECK_FATAL(status);
status = kernel.setArg(arg_idx++, *grid_img); status = kernel.setArg(arg_idx++, *grid_img);
...@@ -114,9 +119,10 @@ class GridSamplerImageCompute : public KernelLite<TARGET(kOpenCL), ...@@ -114,9 +119,10 @@ class GridSamplerImageCompute : public KernelLite<TARGET(kOpenCL),
event_.get()); event_.get());
CL_CHECK_FATAL(status); CL_CHECK_FATAL(status);
context.cl_wait_list()->emplace(out_img, event_); context.cl_wait_list()->emplace(out_img, event_);
#ifndef LITE_SHUTDOWN_LOG
VLOG(4) << "global_work_size:[2D]:" << global_work_size[0] << " " VLOG(4) << "global_work_size:[2D]:" << global_work_size[0] << " "
<< global_work_size[1] << " " << global_work_size[2]; << global_work_size[1] << " " << global_work_size[2];
#endif
} }
protected: protected:
......
(29 additional file diffs in this commit are collapsed and not shown.)