Commit f4e27ad1 authored by J jackzhang235

Merge remote-tracking branch 'upstream/develop' into develop

......@@ -105,3 +105,5 @@ metal/paddle-mobile-demo/paddle-mobile-demo/Resources
metal/paddle-mobile-demo/paddle-mobile-demo/Resources/images
metal/paddle-mobile-demo/paddle-mobile-demo/Resources/models
metal/MobileNetDemo/MobileNetDemo/Resources
build*
......@@ -57,22 +57,20 @@ function(check_linker_flag)
endforeach()
set(CMAKE_SHARED_LINKER_FLAGS ${CMAKE_SHARED_LINKER_FLAGS} PARENT_SCOPE)
endfunction()
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11")
if (LITE_ON_TINY_PUBLISH)
if((NOT LITE_WITH_PYTHON))
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fno-exceptions")
endif()
if(LITE_WITH_OPENCL AND (ARM_TARGET_LANG STREQUAL "clang"))
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fexceptions")
endif()
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -ffast-math -Ofast -Os -fomit-frame-pointer -fno-asynchronous-unwind-tables -fno-unwind-tables")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fvisibility=hidden -fvisibility-inlines-hidden -ffunction-sections")
check_linker_flag(-Wl,--gc-sections)
endif()
if(LITE_WITH_OPENCL)
if(ARM_TARGET_LANG STREQUAL "clang")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fexceptions")
endif()
endif()
if(LITE_WITH_OPENMP)
find_package(OpenMP REQUIRED)
if(OPENMP_FOUND OR OpenMP_CXX_FOUND)
......
......@@ -285,6 +285,11 @@ set(host_kernels CACHE INTERNAL "host kernels")
set(kernels_src_list "${CMAKE_BINARY_DIR}/kernels_src_list.txt")
file(WRITE ${kernels_src_list} "") # clean
# file to record faked kernels for opt python lib
set(fake_kernels_src_list "${CMAKE_BINARY_DIR}/fake_kernels_src_list.txt")
file(WRITE ${fake_kernels_src_list} "") # clean
if(LITE_BUILD_TAILOR)
set(tailored_kernels_list_path "${LITE_OPTMODEL_DIR}/.tailored_kernels_source_list")
file(STRINGS ${tailored_kernels_list_path} tailored_kernels_list)
......@@ -313,56 +318,65 @@ function(add_kernel TARGET device level)
return()
endif()
if (LITE_ON_MODEL_OPTIMIZE_TOOL)
# the source list is collected so that model_optimize_tool can generate fake kernels.
foreach(src ${args_SRCS})
file(APPEND ${kernels_src_list} "${CMAKE_CURRENT_SOURCE_DIR}/${src}\n")
endforeach()
return()
endif()
# when compiling the model_optimize_tool, a source file with all the fake kernel definitions is generated,
# so there is no need to continue compiling the real kernel sources.
if (LITE_ON_MODEL_OPTIMIZE_TOOL)
return()
endif(LITE_ON_MODEL_OPTIMIZE_TOOL)
if ("${device}" STREQUAL "Host")
set(host_kernels "${host_kernels};${TARGET}" CACHE INTERNAL "")
endif()
if ("${device}" STREQUAL "ARM")
if (NOT LITE_WITH_ARM)
foreach(src ${args_SRCS})
file(APPEND ${fake_kernels_src_list} "${CMAKE_CURRENT_SOURCE_DIR}/${src}\n")
endforeach()
return()
endif()
set(arm_kernels "${arm_kernels};${TARGET}" CACHE INTERNAL "")
endif()
if ("${device}" STREQUAL "X86")
if (NOT LITE_WITH_X86)
foreach(src ${args_SRCS})
file(APPEND ${fake_kernels_src_list} "${CMAKE_CURRENT_SOURCE_DIR}/${src}\n")
endforeach()
return()
elseif (LITE_ON_MODEL_OPTIMIZE_TOOL)
foreach(src ${args_SRCS})
file(APPEND ${kernels_src_list} "${CMAKE_CURRENT_SOURCE_DIR}/${src}\n")
endforeach()
return()
endif()
set(x86_kernels "${x86_kernels};${TARGET}" CACHE INTERNAL "")
endif()
if ("${device}" STREQUAL "NPU")
if (NOT LITE_WITH_NPU)
foreach(src ${args_SRCS})
file(APPEND ${fake_kernels_src_list} "${CMAKE_CURRENT_SOURCE_DIR}/${src}\n")
endforeach()
return()
endif()
set(npu_kernels "${npu_kernels};${TARGET}" CACHE INTERNAL "")
endif()
if ("${device}" STREQUAL "XPU")
if (NOT LITE_WITH_XPU)
foreach(src ${args_SRCS})
file(APPEND ${fake_kernels_src_list} "${CMAKE_CURRENT_SOURCE_DIR}/${src}\n")
endforeach()
return()
endif()
set(xpu_kernels "${xpu_kernels};${TARGET}" CACHE INTERNAL "")
endif()
if ("${device}" STREQUAL "FPGA")
if (NOT LITE_WITH_FPGA)
foreach(src ${args_SRCS})
file(APPEND ${fake_kernels_src_list} "${CMAKE_CURRENT_SOURCE_DIR}/${src}\n")
endforeach()
return()
endif()
set(fpga_kernels "${fpga_kernels};${TARGET}" CACHE INTERNAL "")
endif()
if ("${device}" STREQUAL "BM")
if (NOT LITE_WITH_BM)
foreach(src ${args_SRCS})
file(APPEND ${fake_kernels_src_list} "${CMAKE_CURRENT_SOURCE_DIR}/${src}\n")
endforeach()
return()
endif()
set(bm_kernels "${bm_kernels};${TARGET}" CACHE INTERNAL "")
......@@ -375,6 +389,9 @@ function(add_kernel TARGET device level)
endif()
if ("${device}" STREQUAL "OPENCL")
if (NOT LITE_WITH_OPENCL)
foreach(src ${args_SRCS})
file(APPEND ${fake_kernels_src_list} "${CMAKE_CURRENT_SOURCE_DIR}/${src}\n")
endforeach()
return()
endif()
set(opencl_kernels "${opencl_kernels};${TARGET}" CACHE INTERNAL "")
......@@ -382,6 +399,9 @@ function(add_kernel TARGET device level)
if ("${device}" STREQUAL "CUDA")
if (NOT LITE_WITH_CUDA)
foreach(src ${args_SRCS})
file(APPEND ${fake_kernels_src_list} "${CMAKE_CURRENT_SOURCE_DIR}/${src}\n")
endforeach()
return()
endif()
set(cuda_kernels "${cuda_kernels};${TARGET}" CACHE INTERNAL "")
......
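For orientation, a hypothetical `add_kernel` call is sketched below (the kernel name, source file and deps are illustrative, not taken from this commit): when the matching `LITE_WITH_*` option is OFF, or when building the model optimize tool, the listed sources are only appended to `fake_kernels_src_list.txt` / `kernels_src_list.txt` and no real library target is created.

```cmake
# Hypothetical registration of an ARM kernel (names are placeholders).
# With LITE_WITH_ARM=ON the target is appended to the cached arm_kernels list;
# with LITE_WITH_ARM=OFF its sources only go into fake_kernels_src_list.txt.
add_kernel(relu_compute_arm ARM basic SRCS relu_compute.cc DEPS ${lite_kernel_deps})
```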
......@@ -135,53 +135,53 @@ sh benchmark.sh ./benchmark_bin_v8 ./benchmark_models result_armv8.txt true
> Performance numbers of the test models differ across phones and library versions.
```shell
run benchmark armv7
run benchmark armv8
--------------------------------------
PaddleLite Benchmark
Threads=1 Warmup=10 Repeats=30
-- mnasnet avg = 159.8427 ms
-- mobilenet_v1 avg = 235.0072 ms
-- mobilenet_v2 avg = 173.0387 ms
-- shufflenet_v2 avg = 76.0040 ms
-- squeezenet_v11 avg = 164.2957 ms
mnasnet min = 19.83500 max = 19.38500 average = 19.65503
mobilenetv1 min = 32.00600 max = 31.56900 average = 31.81983
mobilenetv2 min = 22.37900 max = 22.08700 average = 22.28623
shufflenetv2 min = 10.80400 max = 10.62900 average = 10.68890
squeezenet min = 17.67400 max = 17.47900 average = 17.57677
Threads=2 Warmup=10 Repeats=30
-- mnasnet avg = 83.1287 ms
-- mobilenet_v1 avg = 121.6029 ms
-- mobilenet_v2 avg = 86.6175 ms
-- shufflenet_v2 avg = 41.5761 ms
-- squeezenet_v11 avg = 87.8678 ms
mnasnet min = 11.85600 max = 11.72000 average = 11.77127
mobilenetv1 min = 18.75000 max = 18.64300 average = 18.70593
mobilenetv2 min = 14.05100 max = 13.59900 average = 13.71450
shufflenetv2 min = 6.67200 max = 6.58300 average = 6.63400
squeezenet min = 12.07100 max = 11.33400 average = 11.41253
Threads=4 Warmup=10 Repeats=30
-- mnasnet avg = 73.3880 ms
-- mobilenet_v1 avg = 119.0739 ms
-- mobilenet_v2 avg = 85.3050 ms
-- shufflenet_v2 avg = 38.0762 ms
-- squeezenet_v11 avg = 64.2201 ms
mnasnet min = 7.19300 max = 7.02600 average = 7.08480
mobilenetv1 min = 10.42000 max = 10.29100 average = 10.34267
mobilenetv2 min = 8.61900 max = 8.46900 average = 8.54707
shufflenetv2 min = 4.55200 max = 4.41900 average = 4.46477
squeezenet min = 8.60000 max = 7.85200 average = 7.98407
--------------------------------------
run benchmark armv8
run benchmark armv7
--------------------------------------
PaddleLite Benchmark
Threads=1 Warmup=10 Repeats=30
-- mnasnet avg = 165.3073 ms
-- mobilenet_v1 avg = 306.0188 ms
-- mobilenet_v2 avg = 195.1884 ms
-- shufflenet_v2 avg = 99.3692 ms
-- squeezenet_v11 avg = 156.6971 ms
mnasnet min = 20.98300 max = 20.81400 average = 20.92527
mobilenetv1 min = 33.19000 max = 32.81700 average = 33.08490
mobilenetv2 min = 25.91400 max = 25.61700 average = 25.73097
shufflenetv2 min = 11.14300 max = 10.97600 average = 11.06757
squeezenet min = 19.31800 max = 19.20000 average = 19.26530
Threads=2 Warmup=10 Repeats=30
-- mnasnet avg = 90.2290 ms
-- mobilenet_v1 avg = 157.0007 ms
-- mobilenet_v2 avg = 118.1607 ms
-- shufflenet_v2 avg = 68.6804 ms
-- squeezenet_v11 avg = 91.3090 ms
mnasnet min = 12.59900 max = 12.46600 average = 12.52207
mobilenetv1 min = 19.05800 max = 18.94700 average = 18.97897
mobilenetv2 min = 15.28400 max = 15.11300 average = 15.19843
shufflenetv2 min = 6.97000 max = 6.81400 average = 6.90863
squeezenet min = 12.87900 max = 12.12900 average = 12.22530
Threads=4 Warmup=10 Repeats=30
-- mnasnet avg = 179.9730 ms
-- mobilenet_v1 avg = 204.0684 ms
-- mobilenet_v2 avg = 181.6486 ms
-- shufflenet_v2 avg = 123.2728 ms
-- squeezenet_v11 avg = 412.9046 ms
mnasnet min = 7.31400 max = 7.12900 average = 7.20357
mobilenetv1 min = 11.44000 max = 10.86900 average = 10.94383
mobilenetv2 min = 9.14900 max = 9.03800 average = 9.09907
shufflenetv2 min = 4.60600 max = 4.49400 average = 4.53360
squeezenet min = 8.27000 max = 8.10600 average = 8.19000
--------------------------------------
```
......@@ -103,7 +103,6 @@ $ ./lite/tools/build_npu.sh --arm_os=android --arm_abi=armv7 --arm_lang=gcc --an
--optimize_out_type=(protobuf|naive_buffer) \
--optimize_out=<output_optimize_model_dir> \
--valid_targets=npu,arm \
--prefer_int8_kernel=(true|false) \
--record_tailoring_info =(true|false)
```
- The model produced by model_optimize_tool only marks the Paddle operators supported by the NPU; it does not actually generate an NPU HiAI model. Only at execution time are the marked Paddle operators converted into HiAI IR, and the HiAI model is finally generated and executed. See PR [2576](https://github.com/PaddlePaddle/Paddle-Lite/pull/2576) for the implementation.
......
......@@ -65,9 +65,11 @@ rm ./lite/api/paddle_use_ops.h
--arm_os=android \
--arm_abi=armv8 \
--arm_lang=gcc \
build_test_arm_opencl
build_opencl
```
Note: to debug a cl kernel, assuming the build script above has already been run (the cmake files have been generated), you only need to edit the corresponding kernel file under `./lite/backends/opencl/cl_kernel/`. After saving it, run `python ./lite/tools/cmake_tools/gen_opencl_code.py ./lite/backends/opencl/cl_kernel ./lite/backends/opencl/opencl_kernels_source.cc` from the project root; this command re-embeds the modified kernels automatically. Then switch to the build directory and run `make publish_inference` (or the executable name of the unit test you want to build); the contents of the cl kernel files are packed into the build artifacts, such as the .so or the corresponding unit-test executable, automatically during compilation.
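Put together, the loop described above is roughly the following (the build directory name matches the artifact section below; the single-test target is a placeholder):

```shell
# 1. edit a kernel under ./lite/backends/opencl/cl_kernel/, then re-embed it:
python ./lite/tools/cmake_tools/gen_opencl_code.py \
    ./lite/backends/opencl/cl_kernel \
    ./lite/backends/opencl/opencl_kernels_source.cc
# 2. rebuild from the existing build directory:
cd build.lite.android.armv8.gcc.opencl
make publish_inference   # or the executable name of the unit test you want
```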
### Build artifacts
The build artifacts are located in the `inference_lite_lib.android.armv8.opencl` folder under `build.lite.android.armv8.gcc.opencl`; only the key artifacts are listed here:
......
......@@ -39,7 +39,7 @@ Paddle-Lite支持**根据模型裁剪预测库**功能。Paddle-Lite的一般编
For example:
```bash
./lite/tools/build.sh --arm_os=android --arm_abi=armv7 --arm_lang=gcc --android_stl=c++_static --build_extra=ON --build_tailor=ON --opt_model_dir=../mobilenet_v1NB full_publish
./lite/tools/build.sh --arm_os=android --arm_abi=armv7 --arm_lang=gcc --android_stl=c++_static --build_extra=ON --build_tailor=ON --opt_model_dir=../mobilenet_v1NB tiny_publish
```
**Note**: `../mobilenet_v1NB` in the commands above is the output path of the converted model obtained in step 1.
......@@ -88,9 +88,6 @@ Paddle-Lite支持**根据模型裁剪预测库**功能。Paddle-Lite的一般编
#include <stdio.h>
#include <vector>
#include "paddle_api.h" // NOLINT
#include "paddle_use_kernels.h" // NOLINT
#include "paddle_use_ops.h" // NOLINT
#include "paddle_use_passes.h" // NOLINT
using namespace paddle::lite_api; // NOLINT
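With the tailored library, a demo program only needs `paddle_api.h`. A minimal sketch is shown below; the model path is a placeholder and the exact `MobileConfig` setter used here (`set_model_from_file`) should be checked against the Paddle-Lite version you build:

```cpp
#include <iostream>
#include "paddle_api.h"                    // NOLINT
using namespace paddle::lite_api;          // NOLINT

int main() {
  MobileConfig config;
  config.set_model_from_file("./mobilenet_v1.nb");  // optimized .nb model (placeholder path)
  auto predictor = CreatePaddlePredictor<MobileConfig>(config);

  auto input = predictor->GetInput(0);
  input->Resize({1, 3, 224, 224});
  auto* data = input->mutable_data<float>();
  for (int i = 0; i < 3 * 224 * 224; ++i) data[i] = 1.0f;  // dummy input

  predictor->Run();
  auto output = predictor->GetOutput(0);
  std::cout << "output[0] = " << output->data<float>()[0] << std::endl;
  return 0;
}
```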
......@@ -182,4 +179,4 @@ int main(int argc, char** argv) {
1. The model set **must** contain only combined-param models or only non-combined-param models.
2. When non-combined-param models are used, the topology file must be named `__model__`; when combined-param models are used, the topology and parameter file names must be identical across the models in the set and are specified by `--model_filename` and `--param_filename` respectively (see the example layout after this list).
3. The model set **must** contain only INT8 quantized models or only non-INT8 models.
4. model_optimize_tool must be built from the latest Paddle-Lite code (after release/v2.1.0).
4. The model optimization tool must be built from Paddle-Lite code newer than `release/v2.1.0`.
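A directory layout satisfying the constraints above could look like the following for a non-combined-param model set (directory and parameter names are placeholders):

```
model_set_dir/
  ├── model_A/
  │     ├── __model__                    # topology file, must be named __model__
  │     └── conv1_0.w_0, fc_0.b_0, ...   # one file per parameter
  └── model_B/
        ├── __model__
        └── ...
```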
......@@ -83,7 +83,6 @@ PaddlePaddle模型有两种保存格式:
--optimize_out_type=(protobuf|naive_buffer) \
--optimize_out=<output_optimize_model_dir> \
--valid_targets=(arm|opencl|x86|npu|xpu) \
--prefer_int8_kernel=(true|false) \
--record_tailoring_info =(true|false)
```
......@@ -95,12 +94,12 @@ PaddlePaddle模型有两种保存格式:
| --optimize_out_type | Output model type. Two types are currently supported: protobuf and naive_buffer, where naive_buffer is a more lightweight serialization/deserialization implementation. If you need to run inference on mobile, set this option to naive_buffer. Default: protobuf. |
| --optimize_out | Output path of the optimized model. |
| --valid_targets | The backends the model can run on; default is arm. Currently x86, arm, opencl, npu and xpu are supported, and several backends can be specified at the same time (separated by spaces); the Model Optimize Tool then picks the best implementation automatically. To support the Huawei NPU (the DaVinci-architecture NPU in Kirin 810/990 SoCs), set it to npu, arm. |
| --prefer_int8_kernel | If the model to be optimized is an int8 quantized model (e.g. obtained by quantization-aware training), set this option to true to accelerate inference with int8 kernels; default is false. |
| --record_tailoring_info | When using the [tailor the library by model](./library_tailoring.html) feature, set this option to true to record the kernels and OPs contained in the optimized model; default is false. |
* If the fluid model to be optimized is in non-combined form, set `--model_dir` and ignore `--model_file` and `--param_file`.
* If the fluid model to be optimized is in combined form, set `--model_file` and `--param_file` and ignore `--model_dir`.
* The optimized model is a single file whose name ends with `.nb`.
* The `prefer_int8_kernel` input argument has been removed; `opt` now detects automatically whether the model is quantized and applies the corresponding optimizations.
### Feature 2: report the operators in a model and check whether they are supported
......
......@@ -245,7 +245,6 @@ python compress.py \
--optimize_out_type=naive_buffer \
--optimize_out=mobilenet_v1_quant_opt \
--valid_targets=arm \
--prefer_int8_kernel=true
```
As mentioned above, after quantization-aware training the parameters of the model in the float directory fall within the int8 range, but their data type is still float32, so at this point there is no real parameter compression. However, after optimization with the model\_optimize\_tool the corresponding quantized parameters are re-stored as int8, which achieves parameter compression, and the model structure is also optimized (e.g. various operator fuse passes are applied).
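As a rough illustration of the storage saving, a generic symmetric max-abs int8 quantization of a float32 weight tensor is sketched below (a simplified stand-in; the exact scheme used by the quantization passes may differ):

```python
import numpy as np

w = np.random.randn(64, 64).astype(np.float32)      # float32 weights, 4 bytes each
scale = np.abs(w).max() / 127.0                      # per-tensor scale
w_int8 = np.clip(np.round(w / scale), -127, 127).astype(np.int8)  # 1 byte each

print(w.nbytes, "->", w_int8.nbytes)                 # roughly 4x smaller on disk
```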
......
......@@ -86,7 +86,6 @@ WeightQuantization.quantize_weight_to_int(save_model_dir,
Prepare the model conversion tool as described in [model conversion](../user_guides/model_optimize_tool); downloading it from the Release page is recommended.
Use the model conversion tool as described in [model conversion](../user_guides/model_optimize_tool).
Because this model dequantizes the quantized weights and then actually loads and runs an FP32 inference model, the opt argument --prefer_int8_kernel does not need to be set to true; set the other arguments to your actual situation with reference to the documentation.
For example, to run inference on the ARM side of an Android phone, the model conversion command is:
```bash
./opt --model_dir=./mobilenet_v1_quant \
......
......@@ -147,13 +147,12 @@ with fluid.name_scope('skip_quant'):
Prepare the model conversion tool as described in [model conversion](../user_guides/model_optimize_tool); downloading it from the Release page is recommended.
Use the model conversion tool as described in [model conversion](../user_guides/model_optimize_tool). Note that the opt argument --prefer_int8_kernel must be set to true; set the other arguments to your actual situation with reference to the documentation. For example, to run inference on the ARM side of an Android phone, the model conversion command is:
Use the model conversion tool as described in [model conversion](../user_guides/model_optimize_tool), setting the arguments to your actual situation. For example, to run inference on the ARM side of an Android phone, the model conversion command is:
```bash
./opt --model_dir=./mobilenet_v1_quant \
--optimize_out_type=naive_buffer \
--optimize_out=mobilenet_v1_quant_opt \
--valid_targets=arm \
--prefer_int8_kernel=true
--valid_targets=arm
```
### 3.2 量化模型预测
......
......@@ -24,8 +24,7 @@ $ ./opt \
--param_file=<param_path> \
--optimize_out_type=(protobuf|naive_buffer) \
--optimize_out=<output_optimize_model_dir> \
--valid_targets=(arm|opencl|x86) \
--prefer_int8_kernel=(ture|false)
--valid_targets=(arm|opencl|x86)
```
Here, optimize_out is the desired output path for the optimized model. optimize_out_type specifies how the output model is serialized; Protobuf and Naive Buffer are currently supported, where Naive Buffer is a more lightweight serialization/deserialization implementation. If you need to run inference with Lite on mobile, set optimize_out_type=naive_buffer.
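For example, a typical invocation for mobile deployment (paths are placeholders) would be:

```bash
./opt --model_dir=./mobilenet_v1 \
      --optimize_out_type=naive_buffer \
      --optimize_out=./mobilenet_v1_opt \
      --valid_targets=arm
```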
......
......@@ -84,7 +84,16 @@ message(STATUS "publish inference lib to ${INFER_LITE_PUBLISH_ROOT}")
if (LITE_WITH_PYTHON)
add_custom_target(publish_inference_python_lib ${TARGET}
COMMAND mkdir -p "${INFER_LITE_PUBLISH_ROOT}/python/lib"
COMMAND cp "${CMAKE_BINARY_DIR}/lite/api/python/pybind/liblite_pybind.so" "${INFER_LITE_PUBLISH_ROOT}/python/lib/lite_core.so")
COMMAND mkdir -p "${INFER_LITE_PUBLISH_ROOT}/python/install/libs"
COMMAND mkdir -p "${INFER_LITE_PUBLISH_ROOT}/python/install/lite"
COMMAND cp "${CMAKE_BINARY_DIR}/lite/api/python/setup.py" "${INFER_LITE_PUBLISH_ROOT}/python/install/"
COMMAND cp "${CMAKE_SOURCE_DIR}/lite/api/python/__init__.py" "${INFER_LITE_PUBLISH_ROOT}/python/install/lite"
COMMAND cp "${CMAKE_BINARY_DIR}/lite/api/python/pybind/liblite_pybind.so" "${INFER_LITE_PUBLISH_ROOT}/python/install/lite/lite.so"
COMMAND cp "${CMAKE_BINARY_DIR}/lite/api/python/pybind/liblite_pybind.so" "${INFER_LITE_PUBLISH_ROOT}/python/lib/lite.so")
add_custom_target(publish_inference_python_installer ${TARGET}
COMMAND python setup.py bdist_wheel
WORKING_DIRECTORY ${INFER_LITE_PUBLISH_ROOT}/python/install/
DEPENDS publish_inference_python_lib)
add_custom_target(publish_inference_python_light_demo ${TARGET}
COMMAND mkdir -p "${INFER_LITE_PUBLISH_ROOT}/demo/python"
COMMAND cp "${CMAKE_SOURCE_DIR}/lite/demo/python/mobilenetv1_light_api.py" "${INFER_LITE_PUBLISH_ROOT}/demo/python/")
......@@ -96,6 +105,7 @@ if (LITE_WITH_PYTHON)
endif()
add_dependencies(publish_inference_python_lib lite_pybind)
add_dependencies(publish_inference publish_inference_python_lib)
add_dependencies(publish_inference publish_inference_python_installer)
add_dependencies(publish_inference publish_inference_python_light_demo)
endif()
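Assuming the layout produced by the targets above, `bdist_wheel` writes the wheel to a `dist/` folder under `python/install`, so after `make publish_inference` it can be installed directly (the publish-root name depends on the build configuration):

```shell
pip install ./inference_lite_lib*/python/install/dist/paddlelite-*.whl
```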
......@@ -213,6 +223,7 @@ if (LITE_WITH_LIGHT_WEIGHT_FRAMEWORK AND LITE_WITH_ARM)
add_dependencies(publish_inference tiny_publish_cxx_lib)
if(NOT "${CMAKE_BUILD_TYPE}" STREQUAL "Debug")
add_custom_command(TARGET tiny_publish_cxx_lib POST_BUILD
COMMAND ${CMAKE_STRIP} "-s" ${INFER_LITE_PUBLISH_ROOT}/cxx/lib/libpaddle_api_light_bundled.a
COMMAND ${CMAKE_STRIP} "-s" ${INFER_LITE_PUBLISH_ROOT}/cxx/lib/libpaddle_light_api_shared.so)
endif()
endif()
......
......@@ -308,6 +308,11 @@ if (LITE_ON_TINY_PUBLISH)
return()
endif()
# add library for opt_base
lite_cc_library(opt_base SRCS opt_base.cc cxx_api_impl.cc paddle_api.cc cxx_api.cc DEPS kernel op optimizer mir_passes utils)
add_dependencies(opt_base supported_kernel_op_info_h framework_proto all_kernel_faked_cc kernel_list_h)
if (LITE_ON_MODEL_OPTIMIZE_TOOL)
message(STATUS "Compiling opt")
lite_cc_binary(opt SRCS opt.cc cxx_api_impl.cc paddle_api.cc cxx_api.cc
......
......@@ -294,6 +294,32 @@ void Predictor::Build(const cpp::ProgramDesc &desc,
inner_places.emplace_back(TARGET(kHost), PRECISION(kAny), DATALAYOUT(kAny));
inner_places.emplace_back(
TARGET(kHost), PRECISION(kFloat), DATALAYOUT(kNCHW));
const std::vector<std::string> quant_dequant_op = {
"fake_quantize_abs_max",
"fake_quantize_range_abs_max",
"fake_quantize_moving_average_abs_max",
"fake_quantize_dequantize_moving_average_abs_max",
"fake_dequantize_max_abs",
"fake_channel_wise_dequantize_max_abs"};
bool is_quantized_model = false;
for (size_t i = 0; i < program_desc_.BlocksSize() && !is_quantized_model;
++i) {
auto *block_desc = program_desc_.GetBlock<cpp::BlockDesc>(i);
for (size_t j = 0; j < block_desc->OpsSize() && !is_quantized_model; ++j) {
auto *op_desc = block_desc->GetOp<cpp::OpDesc>(j);
std::string op_type = op_desc->Type();
if (std::find(quant_dequant_op.begin(),
quant_dequant_op.end(),
op_type) != quant_dequant_op.end()) {
is_quantized_model = true;
}
}
}
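// A model is treated as quantized if it contains any fake (de)quantization op;
// in that case ARM int8 kernels are added to the candidate places automatically
// (this replaces the removed --prefer_int8_kernel flag of the opt tool).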
if (is_quantized_model) {
inner_places.emplace_back(Place{TARGET(kARM), PRECISION(kInt8)});
}
Program program(desc, scope_, inner_places);
core::KernelPickFactor factor;
......
......@@ -67,7 +67,7 @@ void Run(const char* model_dir, int repeat) {
int main(int argc, char** argv) {
CHECK_EQ(argc, 3) << "usage: ./cmd <model_dir> <repeat>";
paddle::lite::Run(argv[1], std::stoi(argv[2]));
paddle::lite::Run(argv[1], atoi(argv[2]));
return 0;
}
......
......@@ -58,6 +58,7 @@ void LightPredictorImpl::Run() {
std::shared_ptr<lite_api::PaddlePredictor> LightPredictorImpl::Clone() {
LOG(FATAL) << "The Clone API is not supported in LigthPredictor";
return nullptr;
}
std::string LightPredictorImpl::GetVersion() const { return lite::version(); }
......
......@@ -95,7 +95,7 @@ void TestModel(const std::vector<Place>& valid_places,
if (first_target == TARGET(kOpenCL) || first_target == TARGET(kNPU)) {
ASSERT_EQ(out->dims().production(), 1000);
double eps = 0.1;
double eps = first_target == TARGET(kOpenCL) ? 0.12 : 0.1;
for (int i = 0; i < ref.size(); ++i) {
for (int j = 0; j < ref[i].size(); ++j) {
auto result = pdata[j * step + (out->dims()[1] * i)];
......@@ -119,21 +119,21 @@ void TestModel(const std::vector<Place>& valid_places,
// Get detailed result
size_t output_tensor_num = predictor.GetOutputNames().size();
VLOG(1) << "output tesnor num:" << output_tensor_num;
VLOG(1) << "output tensor num:" << output_tensor_num;
for (size_t tidx = 0; tidx < output_tensor_num; ++tidx) {
auto* output_tensor = predictor.GetOutput(tidx);
VLOG(1) << "============= output tensor " << tidx << " =============\n";
auto out_dims = output_tensor->dims();
VLOG(1) << "out_dims:" << out_dims;
float sum = 0.f;
for (int i = 0; i < out_dims.production(); ++i) {
sum += output_tensor->data<float>()[i];
}
VLOG(1) << "out_dims.production():" << out_dims.production();
VLOG(1) << "output tensor sum value:" << sum;
VLOG(1) << "output tensor mean value:" << sum / out_dims.production();
auto out_data = output_tensor->data<float>();
auto out_mean = compute_mean<float>(out_data, out_dims.production());
auto out_std_dev = compute_standard_deviation<float>(
out_data, out_dims.production(), true, out_mean);
VLOG(1) << "output tensor dims:" << out_dims;
VLOG(1) << "output tensor elements num:" << out_dims.production();
VLOG(1) << "output tensor standard deviation:" << out_std_dev;
VLOG(1) << "output tensor mean value:" << out_mean;
// print result
for (int i = 0; i < out_dims.production(); ++i) {
......
......@@ -97,7 +97,7 @@ void TestModel(const std::vector<Place>& valid_places,
if (first_target == TARGET(kOpenCL) || first_target == TARGET(kNPU)) {
ASSERT_EQ(out->dims().production(), 1000);
double eps = 0.1;
double eps = first_target == TARGET(kOpenCL) ? 0.15 : 0.1;
for (int i = 0; i < ref.size(); ++i) {
for (int j = 0; j < ref[i].size(); ++j) {
auto result = pdata[j * step + (out->dims()[1] * i)];
......@@ -121,21 +121,21 @@ void TestModel(const std::vector<Place>& valid_places,
// Get detailed result
size_t output_tensor_num = predictor.GetOutputNames().size();
VLOG(1) << "output tesnor num:" << output_tensor_num;
VLOG(1) << "output tensor num:" << output_tensor_num;
for (size_t tidx = 0; tidx < output_tensor_num; ++tidx) {
auto* output_tensor = predictor.GetOutput(tidx);
VLOG(1) << "============= output tensor " << tidx << " =============\n";
auto out_dims = output_tensor->dims();
VLOG(1) << "out_dims:" << out_dims;
float sum = 0.f;
for (int i = 0; i < out_dims.production(); ++i) {
sum += output_tensor->data<float>()[i];
}
VLOG(1) << "out_dims.production():" << out_dims.production();
VLOG(1) << "output tensor sum value:" << sum;
VLOG(1) << "output tensor mean value:" << sum / out_dims.production();
auto out_data = output_tensor->data<float>();
auto out_mean = compute_mean<float>(out_data, out_dims.production());
auto out_std_dev = compute_standard_deviation<float>(
out_data, out_dims.production(), true, out_mean);
VLOG(1) << "output tensor dims:" << out_dims;
VLOG(1) << "output tensor elements num:" << out_dims.production();
VLOG(1) << "output tensor standard deviation:" << out_std_dev;
VLOG(1) << "output tensor mean value:" << out_mean;
// print result
for (int i = 0; i < out_dims.production(); ++i) {
......
......@@ -138,7 +138,7 @@ void Run(const std::vector<std::vector<int64_t>>& input_shapes,
std::ofstream out(FLAGS_arg_name + ".txt");
for (size_t i = 0; i < arg_num; ++i) {
sum += arg_tensor->data<float>()[i];
out << std::to_string(arg_tensor->data<float>()[i]) << "\n";
out << paddle::lite::to_string(arg_tensor->data<float>()[i]) << "\n";
}
LOG(INFO) << FLAGS_arg_name << " shape is " << os.str()
<< ", mean value is " << sum * 1. / arg_num;
......
......@@ -250,7 +250,7 @@ void Run(const std::vector<std::vector<int64_t>>& input_shapes,
std::ofstream out(FLAGS_arg_name + ".txt");
for (size_t i = 0; i < arg_num; ++i) {
sum += arg_tensor->data<float>()[i];
out << std::to_string(arg_tensor->data<float>()[i]) << "\n";
out << paddle::lite::to_string(arg_tensor->data<float>()[i]) << "\n";
}
LOG(INFO) << FLAGS_arg_name << " shape is " << os.str()
<< ", mean value is " << sum * 1. / arg_num;
......
......@@ -264,7 +264,7 @@ void Run(const std::vector<std::vector<int64_t>>& input_shapes,
std::ofstream out(FLAGS_arg_name + ".txt");
for (size_t i = 0; i < arg_num; ++i) {
sum += arg_tensor->data<float>()[i];
out << std::to_string(arg_tensor->data<float>()[i]) << "\n";
out << paddle::lite::to_string(arg_tensor->data<float>()[i]) << "\n";
}
LOG(INFO) << FLAGS_arg_name << " shape is " << os.str()
<< ", mean value is " << sum * 1. / arg_num;
......
......@@ -67,7 +67,6 @@ DEFINE_string(valid_targets,
"arm",
"The targets this model optimized for, should be one of (arm, "
"opencl, x86), splitted by space");
DEFINE_bool(prefer_int8_kernel, false, "Prefer to run model with int8 kernels");
DEFINE_bool(print_supported_ops,
false,
"Print supported operators on the inputed target");
......@@ -123,11 +122,6 @@ std::vector<Place> ParserValidPlaces() {
<< "At least one target should be set, should set the "
"command argument 'valid_targets'";
if (FLAGS_prefer_int8_kernel) {
LOG(WARNING) << "Int8 mode is only support by ARM target";
valid_places.insert(valid_places.begin(),
Place{TARGET(kARM), PRECISION(kInt8)});
}
return valid_places;
}
......@@ -257,7 +251,6 @@ void PrintHelpInfo() {
" `--optimize_out_type=(protobuf|naive_buffer)`\n"
" `--optimize_out=<output_optimize_model_dir>`\n"
" `--valid_targets=(arm|opencl|x86|npu|xpu)`\n"
" `--prefer_int8_kernel=(true|false)`\n"
" `--record_tailoring_info=(true|false)`\n"
" Arguments of model checking and ops information:\n"
" `--print_all_ops=true` Display all the valid operators of "
......
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "lite/api/opt_base.h"
#include "all_kernel_faked.cc" // NOLINT
namespace paddle {
namespace lite_api {
void OptBase::SetModelDir(const std::string& model_path) {
opt_config_.set_model_dir(model_path);
}
void OptBase::SetModelFile(const std::string& model_path) {
opt_config_.set_model_file(model_path);
}
void OptBase::SetParamFile(const std::string& param_path) {
opt_config_.set_param_file(param_path);
}
void OptBase::SetModelType(std::string optimize_out_type) {
if (optimize_out_type == "protobuf") {
model_type_ = LiteModelType::kProtobuf;
} else if (optimize_out_type == "naive_buffer") {
model_type_ = LiteModelType::kNaiveBuffer;
} else {
LOG(FATAL) << "Unsupported Model type :" << optimize_out_type;
}
}
void OptBase::SetValidPlaces(const std::string& valid_places) {
valid_places_.clear();
auto target_reprs = lite::Split(valid_places, ",");
for (auto& target_repr : target_reprs) {
if (target_repr == "arm") {
valid_places_.emplace_back(TARGET(kARM));
} else if (target_repr == "opencl") {
valid_places_.emplace_back(
Place{TARGET(kOpenCL), PRECISION(kFP16), DATALAYOUT(kImageDefault)});
valid_places_.emplace_back(
Place{TARGET(kOpenCL), PRECISION(kFloat), DATALAYOUT(kNCHW)});
valid_places_.emplace_back(
Place{TARGET(kOpenCL), PRECISION(kAny), DATALAYOUT(kImageDefault)});
valid_places_.emplace_back(
Place{TARGET(kOpenCL), PRECISION(kAny), DATALAYOUT(kNCHW)});
valid_places_.emplace_back(
TARGET(kARM)); // enable kARM CPU kernel when no opencl kernel
} else if (target_repr == "x86") {
valid_places_.emplace_back(TARGET(kX86));
} else if (target_repr == "npu") {
valid_places_.emplace_back(TARGET(kNPU));
} else if (target_repr == "xpu") {
valid_places_.emplace_back(TARGET(kXPU));
} else {
LOG(FATAL) << lite::string_format(
"Wrong target '%s' found, please check the command flag "
"'valid_targets'",
target_repr.c_str());
}
}
CHECK(!valid_places_.empty())
<< "At least one target should be set, should set the "
"command argument 'valid_targets'";
}
void OptBase::SetOptimizeOut(const std::string& optimized_out_path) {
optimize_out_path_ = optimized_out_path;
}
void OptBase::RunOptimize(bool record_strip_info) {
CheckIfModelSupported(false);
OpKernelInfoCollector::Global().SetKernel2path(kernel2path_map);
opt_config_.set_valid_places(valid_places_);
if (model_set_dir_ != "") {
RunOptimizeFromModelSet(record_strip_info);
} else {
auto opt_predictor = lite_api::CreatePaddlePredictor(opt_config_);
opt_predictor->SaveOptimizedModel(
optimize_out_path_, model_type_, record_strip_info);
auto resulted_model_name =
record_strip_info ? "information of stripped model" : "optimized model";
std::cout << "Save the " << resulted_model_name
<< " into :" << optimize_out_path_ << "successfully";
}
}
// collect ops info of modelset
void CollectModelMetaInfo(const std::string& output_dir,
const std::vector<std::string>& models,
const std::string& filename) {
std::set<std::string> total;
for (const auto& name : models) {
std::string model_path =
lite::Join<std::string>({output_dir, name, filename}, "/");
auto lines = lite::ReadLines(model_path);
total.insert(lines.begin(), lines.end());
}
std::string output_path =
lite::Join<std::string>({output_dir, filename}, "/");
lite::WriteLines(std::vector<std::string>(total.begin(), total.end()),
output_path);
}
void OptBase::SetModelSetDir(const std::string& model_set_path) {
model_set_dir_ = model_set_path;
}
void OptBase::RunOptimizeFromModelSet(bool record_strip_info) {
// 1. mkdir for the output optimized model set.
lite::MkDirRecur(optimize_out_path_);
auto model_dirs = lite::ListDir(model_set_dir_, true);
if (model_dirs.size() == 0) {
LOG(FATAL) << "[" << model_set_dir_ << "] does not contain any model";
}
// 2. optimize each model in the input model set dir.
std::string model_file = opt_config_.model_file();
std::string param_file = opt_config_.param_file();
for (const auto& name : model_dirs) {
std::string input_model_dir =
lite::Join<std::string>({model_set_dir_, name}, "/");
std::string output_model_dir =
lite::Join<std::string>({optimize_out_path_, name}, "/");
if (opt_config_.model_file() != "" && opt_config_.param_file() != "") {
auto model_file_path =
lite::Join<std::string>({input_model_dir, model_file}, "/");
auto param_file_path =
lite::Join<std::string>({input_model_dir, param_file}, "/");
}
std::cout << "Start optimize model: " << input_model_dir;
opt_config_.set_model_dir(input_model_dir);
opt_config_.set_model_file(model_file);
opt_config_.set_param_file(param_file);
auto opt_predictor = lite_api::CreatePaddlePredictor(opt_config_);
opt_predictor->SaveOptimizedModel(
optimize_out_path_, model_type_, record_strip_info);
std::cout << "Optimize done. ";
}
// 3. if record_strip_info = true, we will record striping info
if (record_strip_info) {
// Collect all models information
CollectModelMetaInfo(
optimize_out_path_, model_dirs, lite::TAILORD_OPS_SOURCE_LIST_FILENAME);
CollectModelMetaInfo(
optimize_out_path_, model_dirs, lite::TAILORD_OPS_LIST_NAME);
CollectModelMetaInfo(optimize_out_path_,
model_dirs,
lite::TAILORD_KERNELS_SOURCE_LIST_FILENAME);
CollectModelMetaInfo(
optimize_out_path_, model_dirs, lite::TAILORD_KERNELS_LIST_NAME);
std::cout << "Record the information of stripped models into :"
<< optimize_out_path_ << "successfully";
}
}
void OptBase::PrintHelpInfo() {
const std::string opt_version = lite::version();
const char help_info[] =
"At least one argument should be inputed. Valid arguments are listed "
"below:\n"
" Arguments of help information:\n"
" `help()` Print help infomation\n"
" Arguments of model optimization:\n"
" `set_model_dir(model_dir)`\n"
" `set_model_file(model_file_path)`\n"
" `set_param_file(param_file_path)`\n"
" `set_model_type(protobuf|naive_buffer)`\n"
" `set_optimize_out(output_optimize_model_dir)`\n"
" `set_valid_places(arm|opencl|x86|npu|xpu)`\n"
" `run_optimize(false|true)`\n"
" ` ----fasle&true refer to whether to record ops info for "
"tailoring lib, false by default`\n"
" Arguments of model checking and ops information:\n"
" `print_all_ops()` Display all the valid operators of "
"Paddle-Lite\n"
" `print_supported_ops` Display supported operators of valid "
"places\n"
" `check_if_model_supported()` Check if the input model is "
"supported\n";
std::cout << "opt version:" << opt_version << std::endl
<< help_info << std::endl;
}
// 2. Print support info of the input ops
void OptBase::PrintOpsInfo(const std::set<std::string>& valid_ops) {
std::vector<std::string> lite_supported_targets = {"kHost",
"kX86",
"kCUDA",
"kARM",
"kOpenCL",
"kFPGA",
"kNPU",
"kXPU",
"kAny",
"kUnk"};
// Get the length of the first column: maximum length of the op_type
size_t maximum_optype_length = 0;
for (auto it = supported_ops.begin(); it != supported_ops.end(); it++) {
maximum_optype_length = it->first.size() > maximum_optype_length
? it->first.size()
: maximum_optype_length;
}
std::cout << std::setiosflags(std::ios::internal);
// Print the first row: OP_name target1 target2 ...
std::cout << std::setw(maximum_optype_length) << "OP_name";
for (size_t i = 0; i < lite_supported_targets.size(); i++) {
std::cout << std::setw(10) << lite_supported_targets[i].substr(1);
}
std::cout << std::endl;
// Print the name of supported ops and mark if it's supported by each target
// print the support info of the input ops: valid_ops
for (auto op = valid_ops.begin(); op != valid_ops.end(); op++) {
std::cout << std::setw(maximum_optype_length) << *op;
// Check: if this op is not recorded in supported_ops, skip it.
if (supported_ops.find(*op) == supported_ops.end()) {
continue;
}
// Print OP info.
auto ops_valid_places = supported_ops.at(*op);
for (size_t i = 0; i < lite_supported_targets.size(); i++) {
if (std::find(ops_valid_places.begin(),
ops_valid_places.end(),
lite_supported_targets[i]) != ops_valid_places.end()) {
std::cout << std::setw(10) << "Y";
} else {
std::cout << std::setw(10) << " ";
}
}
std::cout << std::endl;
}
}
void OptBase::DisplayKernelsInfo() { // Display kernel information
std::cout << ::paddle::lite::KernelRegistry::Global().DebugString();
}
void OptBase::PrintAllOps() {
// 1. Get supported ops on these targets
std::set<std::string> valid_ops;
for (size_t i = 0; i < supported_ops_target.size(); i++) {
auto ops = supported_ops_target[i];
valid_ops.insert(ops.begin(), ops.end());
}
// 2. Print support info of these ops
PrintOpsInfo(valid_ops);
}
void OptBase::PrintSupportedOps() {
// 1. Get the valid hardware targets
std::vector<TargetType> target_types = {};
for (size_t i = 0; i < valid_places_.size(); i++) {
target_types.push_back(valid_places_[i].target);
}
std::string targets_str = TargetToStr(target_types[0]);
for (size_t i = 1; i < target_types.size(); i++) {
targets_str = targets_str + "," + TargetToStr(target_types[i]);
}
std::cout << "Supported OPs on '" << targets_str << "': " << std::endl;
target_types.push_back(TARGET(kHost));
target_types.push_back(TARGET(kUnk));
// 2. Get supported ops on these targets
std::set<std::string> valid_ops;
for (size_t i = 0; i < target_types.size(); i++) {
auto ops = supported_ops_target[static_cast<int>(target_types[i])];
valid_ops.insert(ops.begin(), ops.end());
}
// 3. Print support info of these ops
PrintOpsInfo(valid_ops);
}
// test whether this model is supported
void OptBase::CheckIfModelSupported(bool print_ops_info) {
// 1. parse valid places and valid targets
auto valid_ops = supported_ops_target[static_cast<int>(TARGET(kHost))];
auto valid_unktype_ops = supported_ops_target[static_cast<int>(TARGET(kUnk))];
valid_ops.insert(
valid_ops.end(), valid_unktype_ops.begin(), valid_unktype_ops.end());
for (size_t i = 0; i < valid_places_.size(); i++) {
auto target = valid_places_[i].target;
auto ops = supported_ops_target[static_cast<int>(target)];
valid_ops.insert(valid_ops.end(), ops.begin(), ops.end());
}
// get valid ops
std::set<std::string> valid_ops_set(valid_ops.begin(), valid_ops.end());
// 2.Load model into program to get ops in model
std::string prog_path = opt_config_.model_dir() + "/__model__";
if (!(opt_config_.model_file()).empty() &&
!(opt_config_.param_file()).empty()) {
prog_path = opt_config_.model_file();
}
lite::cpp::ProgramDesc cpp_prog;
framework::proto::ProgramDesc pb_proto_prog =
*lite::LoadProgram(prog_path, false);
lite::pb::ProgramDesc pb_prog(&pb_proto_prog);
// Transform to cpp::ProgramDesc
lite::TransformProgramDescAnyToCpp(pb_prog, &cpp_prog);
std::set<std::string> unsupported_ops;
std::set<std::string> input_model_ops;
for (size_t index = 0; index < cpp_prog.BlocksSize(); index++) {
auto current_block = cpp_prog.GetBlock<lite::cpp::BlockDesc>(index);
for (size_t i = 0; i < current_block->OpsSize(); ++i) {
auto& op_desc = *current_block->GetOp<lite::cpp::OpDesc>(i);
auto op_type = op_desc.Type();
input_model_ops.insert(op_type);
if (valid_ops_set.count(op_type) == 0) {
unsupported_ops.insert(op_type);
}
}
}
// 3. Print ops_info of input model and check if this model is supported
if (print_ops_info) {
std::cout << "OPs in the input model include:\n";
PrintOpsInfo(input_model_ops);
}
if (!unsupported_ops.empty()) {
std::string unsupported_ops_str = *unsupported_ops.begin();
for (auto op_str = ++unsupported_ops.begin();
op_str != unsupported_ops.end();
op_str++) {
unsupported_ops_str = unsupported_ops_str + ", " + *op_str;
}
std::vector<TargetType> targets = {};
for (size_t i = 0; i < valid_places_.size(); i++) {
targets.push_back(valid_places_[i].target);
}
std::sort(targets.begin(), targets.end());
targets.erase(unique(targets.begin(), targets.end()), targets.end());
std::string targets_str = TargetToStr(targets[0]);
for (size_t i = 1; i < targets.size(); i++) {
targets_str = targets_str + "," + TargetToStr(targets[i]);
}
LOG(ERROR) << "Error: This model is not supported, because "
<< unsupported_ops.size() << " ops are not supported on '"
<< targets_str << "'. These unsupported ops are: '"
<< unsupported_ops_str << "'.";
exit(1);
}
if (print_ops_info) {
std::cout << "Paddle-Lite supports this model!" << std::endl;
exit(1);
}
}
} // namespace lite_api
} // namespace paddle
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
/*
* This file defines Opt and basic functions about model transformation.
*/
#ifndef PADDLE_LITE_OPT_H_ // NOLINT
#define PADDLE_LITE_OPT_H_
#include <algorithm>
#include <iomanip>
#include <set>
#include <string>
#include <vector>
// stores the map that records the source_file path of each kernel.
#include "kernel_src_map.h" // NOLINT
#include "lite/api/cxx_api.h"
// version of Paddle-lite
#include "lite/core/version.h"
// model parser functions to pre-load model to verify if this model is supported
#include "lite/model_parser/compatible_pb.h"
#include "lite/model_parser/pb/program_desc.h"
#include "lite/utils/string.h"
// recorded all the ops supported by paddle-lite
#include "supported_kernel_op_info.h" // NOLINT
namespace paddle {
namespace lite_api {
/// OptBase defines the basic interfaces of the model optimization tool (opt)
/// for transforming and saving optimized models.
class LITE_API OptBase {
public:
OptBase() = default;
void SetModelSetDir(const std::string &model_set_path);
void SetModelDir(const std::string &model_path);
void SetModelFile(const std::string &model_path);
void SetParamFile(const std::string &param_path);
void SetValidPlaces(const std::string &valid_places);
void SetOptimizeOut(const std::string &optimized_out_path);
// set optimized_model type
void SetModelType(std::string model_type);
// transform and save the optimized model
void RunOptimize(bool record_strip_info = false);
// functions for printing info
// 1. help info
void PrintHelpInfo();
// 2. PrintOpsInfo
void PrintOpsInfo(const std::set<std::string> &valid_ops =
{}); // print supported ops on target_types
void PrintAllOps(); // print all ops
void PrintSupportedOps(); // print ops supported on valid_places_
void DisplayKernelsInfo(); // Display kernel information
// 3. Check if this model is supported
void CheckIfModelSupported(bool print_ops_info = true);
private:
CxxConfig opt_config_;
// valid places for the optimized_model
std::vector<Place> valid_places_;
// filename of the optimized_model
std::string optimize_out_path_;
// type of the optimized_model, kNaiveBuffer default.
LiteModelType model_type_{LiteModelType::kNaiveBuffer};
// Dir path of a set of models, this should be combined with model
std::string model_set_dir_;
void RunOptimizeFromModelSet(bool record_strip_info = false);
};
} // namespace lite_api
} // namespace paddle
#endif // NOLINT
......@@ -2,6 +2,23 @@ if (NOT LITE_WITH_PYTHON)
return()
endif()
# create setup.py for packaging the whl for Paddle-Lite and opt
execute_process(
COMMAND git describe --tags --exact-match
WORKING_DIRECTORY ${CMAKE_SOURCE_DIR}
OUTPUT_VARIABLE PADDLE_LITE_TAG
OUTPUT_STRIP_TRAILING_WHITESPACE
)
execute_process(
COMMAND git log -1 --format=%h
WORKING_DIRECTORY ${CMAKE_SOURCE_DIR}
OUTPUT_VARIABLE PADDLE_LITE_COMMIT
OUTPUT_STRIP_TRAILING_WHITESPACE
)
configure_file(${CMAKE_CURRENT_SOURCE_DIR}/setup.py.in
${CMAKE_CURRENT_BINARY_DIR}/setup.py)
add_subdirectory(pybind)
#add_subdirectory(interface)
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
set(PYBIND_DEPS pybind python paddle_api_light paddle_api)
if (NOT LITE_ON_TINY_PUBLISH)
set(PYBIND_DEPS ${PYBIND_DEPS} paddle_api_full)
set(PYBIND_DEPS ${PYBIND_DEPS} paddle_api_full opt_base)
endif()
lite_cc_library(lite_pybind SHARED SRCS pybind.cc DEPS ${PYBIND_DEPS})
......
......@@ -26,11 +26,12 @@
#ifndef LITE_ON_TINY_PUBLISH
#include "lite/api/cxx_api.h"
#include "lite/api/paddle_use_passes.h"
#include "lite/api/opt_base.h"
#endif
#include "lite/api/light_api.h"
#include "lite/api/paddle_api.h"
#include "lite/core/tensor.h"
namespace py = pybind11;
......@@ -48,10 +49,27 @@ using lite_api::DataLayoutType;
using lite_api::Place;
using lite_api::MLUCoreVersion;
using lite::LightPredictorImpl;
using lite_api::OptBase;
#ifndef LITE_ON_TINY_PUBLISH
using lite::CxxPaddleApiImpl;
static void BindLiteCxxPredictor(py::module *m);
void BindLiteOpt(py::module *m) {
py::class_<OptBase> opt_base(*m, "Opt");
opt_base.def(py::init<>())
.def("set_model_dir", &OptBase::SetModelDir)
.def("set_modelset_dir", &OptBase::SetModelSetDir)
.def("set_model_file", &OptBase::SetModelFile)
.def("set_param_file", &OptBase::SetParamFile)
.def("set_valid_places", &OptBase::SetValidPlaces)
.def("set_optimize_out", &OptBase::SetOptimizeOut)
.def("set_model_type", &OptBase::SetModelType)
.def("run_optimize", &OptBase::RunOptimize)
.def("help", &OptBase::PrintHelpInfo)
.def("print_supported_ops", &OptBase::PrintSupportedOps)
.def("display_kernels_info", &OptBase::DisplayKernelsInfo)
.def("print_all_ops", &OptBase::PrintAllOps);
}
#endif
static void BindLiteLightPredictor(py::module *m);
static void BindLiteCxxConfig(py::module *m);
......
......@@ -22,11 +22,15 @@ namespace lite {
namespace pybind {
void BindLiteApi(pybind11::module *m);
void BindLiteOpt(pybind11::module *m);
PYBIND11_MODULE(lite_core, m) {
PYBIND11_MODULE(lite, m) {
m.doc() = "C++ core of Paddle-Lite";
BindLiteApi(&m);
#ifndef LITE_ON_TINY_PUBLISH
BindLiteOpt(&m);
#endif
}
} // namespace pybind
......
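Since the pybind module is renamed from `lite_core` to `lite` and `OptBase` is exposed as `Opt`, the new binding can be driven from Python roughly as follows (the `paddlelite` package/import path is an assumption based on the setup.py below, where the extension ships as `paddlelite/lite.so`):

```python
from paddlelite import lite   # assumed import path for the packaged lite.so

opt = lite.Opt()
opt.set_model_dir("./mobilenet_v1")         # non-combined Paddle model (placeholder path)
opt.set_valid_places("arm")                 # same syntax as the --valid_targets flag
opt.set_model_type("naive_buffer")
opt.set_optimize_out("./mobilenet_v1_opt")
opt.run_optimize(False)                     # False: do not record tailoring info
```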
# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# module for packing the whl installer for Paddle-Lite
import shutil
import os
from setuptools import setup, Distribution
class BinaryDistribution(Distribution):
'binary distribution'
def has_ext_modules(foo):
return True
# get paddle-lite version, if it's not based on a release tag, we use commit id instead
PADDLELITE_COMMITE = "@PADDLE_LITE_COMMIT@"
PADDLELITE_TAG = "@PADDLE_LITE_TAG@"
if PADDLELITE_TAG == "":
PADDLELITE_VERSION = PADDLELITE_COMMITE
else:
PADDLELITE_VERSION = PADDLELITE_TAG
# core lib of paddlelite is stored as lite.so
LITE_PATH = '${PADDLE_BINARY_DIR}/inference_lite_lib/python/install/lite'
PACKAGE_DATA = {'paddlelite': ['lite.so']}
# put all thirdparty libraries in paddlelite.libs
PACKAGE_DATA['paddlelite.libs'] = []
LIB_PATH = '${PADDLE_BINARY_DIR}/inference_lite_lib/python/install/libs'
if '${WITH_MKL}' == 'ON':
shutil.copy('${MKLML_SHARED_IOMP_LIB}', LIB_PATH)
shutil.copy('${MKLML_SHARED_LIB}', LIB_PATH)
PACKAGE_DATA['paddlelite.libs'] += ['libmklml_intel.so', 'libiomp5.so']
# link lite.so to paddlelite.libs
COMMAND = "patchelf --set-rpath '$ORIGIN/../libs/' ${PADDLE_BINARY_DIR}\
/inference_lite_lib/python/install/lite/lite.so"
if os.system(COMMAND) != 0:
raise Exception("patch third_party libs failed, command: %s" % COMMAND)
# remove unused paddle/libs/__init__.py
if os.path.isfile(LIB_PATH+'/__init__.py'):
os.remove(LIB_PATH+'/__init__.py')
# set dir path of each package
PACKAGE_DIR = {
# These directories are generated while compiling,
# so each package points to its build-output directory.
'paddlelite.libs': LIB_PATH,
'paddlelite': LITE_PATH
}
setup(
name='paddlelite',
version=PADDLELITE_VERSION,
description='Paddle-Lite Library',
packages=['paddlelite', 'paddlelite.libs'],
package_dir=PACKAGE_DIR,
package_data=PACKAGE_DATA,
distclass=BinaryDistribution
)
......@@ -17,6 +17,7 @@
#include <gflags/gflags.h>
#include <sys/time.h>
#include <time.h>
#include <cmath>
// for eval
DEFINE_string(model_dir, "", "model dir");
......@@ -43,5 +44,31 @@ inline double GetCurrentUS() {
return 1e+6 * time.tv_sec + time.tv_usec;
}
template <typename T>
double compute_mean(const T* in, const size_t length) {
double sum = 0.;
for (size_t i = 0; i < length; ++i) {
sum += in[i];
}
return sum / length;
}
template <typename T>
double compute_standard_deviation(const T* in,
const size_t length,
bool has_mean = false,
double mean = 10000) {
if (!has_mean) {
mean = compute_mean<T>(in, length);
}
double variance = 0.;
for (size_t i = 0; i < length; ++i) {
variance += pow((in[i] - mean), 2);
}
variance /= length;
return sqrt(variance);
}
} // namespace lite
} // namespace paddle
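A small usage sketch of the two helpers added above (the header path is assumed to be the test helper this hunk belongs to; values are illustrative):

```cpp
#include <iostream>
#include "lite/api/test_helper.h"  // presumed location of the helpers above

int main() {
  const float data[4] = {1.f, 2.f, 3.f, 4.f};
  double mean = paddle::lite::compute_mean<float>(data, 4);  // 2.5
  double std_dev = paddle::lite::compute_standard_deviation<float>(
      data, 4, /*has_mean=*/true, mean);                     // ~1.118 (population std dev)
  std::cout << mean << " " << std_dev << std::endl;
  return 0;
}
```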
......@@ -266,6 +266,72 @@ void elementwise_add_relu_broadcast<float>(const float* dinx,
}
}
template <>
void elementwise_add_grad<float>(const float* dout_grad,
float* x_grad,
int num) {
int cnt = num >> 4;
int remain = num & 0x0f;
#pragma omp parallel for
for (int i = 0; i < cnt; ++i) {
const float* out_data = dout_grad + 16 * i;
float* x_data = x_grad + 16 * i;
float32x4_t din0 = vld1q_f32(out_data);
float32x4_t din1 = vld1q_f32(out_data + 4);
float32x4_t din2 = vld1q_f32(out_data + 8);
float32x4_t din3 = vld1q_f32(out_data + 12);
vst1q_f32(x_data, din0);
vst1q_f32(x_data + 4, din1);
vst1q_f32(x_data + 8, din2);
vst1q_f32(x_data + 12, din3);
}
if (remain > 0) {
const float* out_data = dout_grad + 16 * cnt;
float* x_data = x_grad + 16 * cnt;
for (int i = 0; i < remain; ++i) {
x_data[i] = out_data[i];
}
}
}
// We assume y_data has fewer elements than x_data; otherwise, call this
// function with x_grad and y_grad swapped.
template <>
void elementwise_add_grad_broadcast<float>(const float* dout_grad,
float* x_grad,
float* y_grad,
int pre,
int n,
int post) {
if (x_grad) {
elementwise_add_grad(dout_grad, x_grad, pre * n * post);
}
if (y_grad) {
memset(y_grad, 0, n * sizeof(float));
#pragma omp parallel for
for (int i = 0; i < pre; ++i) {
for (int j = 0; j < n; ++j) {
float sum = 0;
int cnt = post >> 2;
int remain = post & 0x03;
const float* out_data = dout_grad + (i * n + j) * post;
float32x4_t sum_v = vdupq_n_f32(0);
for (int ci = 0; ci < cnt; ++ci) {
float32x4_t din = vld1q_f32(out_data + 4 * ci);
sum_v = vaddq_f32(sum_v, din);
}
out_data += 4 * cnt;
for (int ci = 0; ci < remain; ++ci) {
sum += out_data[ci];
}
float32x2_t high = vget_high_f32(sum_v);
float32x2_t low = vget_low_f32(sum_v);
sum += vget_lane_f32(high, 0) + vget_lane_f32(high, 1) +
vget_lane_f32(low, 0) + vget_lane_f32(low, 1);
y_grad[j] += sum;
}
}
}
}
template <>
void elementwise_sub<float>(const float* dinx,
const float* diny,
......@@ -510,6 +576,84 @@ void elementwise_sub_relu_broadcast<float>(const float* dinx,
}
}
}
// we assume the formula is x-y
template <>
void elementwise_sub_grad<float>(const float* dout_grad,
float* x_grad,
float* y_grad,
int num) {
if (x_grad) {
elementwise_add_grad(dout_grad, x_grad, num);
}
if (y_grad) {
int cnt = num >> 4;
int remain = num & 0x0f;
float32x4_t minus = vdupq_n_f32(-1);
#pragma omp parallel for
for (int i = 0; i < cnt; ++i) {
const float* out_data = dout_grad + 16 * i;
float* y_data = y_grad + 16 * i;
float32x4_t din0 = vld1q_f32(out_data);
float32x4_t din1 = vld1q_f32(out_data + 4);
float32x4_t din2 = vld1q_f32(out_data + 8);
float32x4_t din3 = vld1q_f32(out_data + 12);
din0 = vmulq_f32(din0, minus);
din1 = vmulq_f32(din1, minus);
din2 = vmulq_f32(din2, minus);
din3 = vmulq_f32(din3, minus);
vst1q_f32(y_data, din0);
vst1q_f32(y_data + 4, din1);
vst1q_f32(y_data + 8, din2);
vst1q_f32(y_data + 12, din3);
}
if (remain > 0) {
const float* out_data = dout_grad + 16 * cnt;
float* y_data = y_grad + 16 * cnt;
for (int i = 0; i < remain; ++i) {
y_data[i] = -out_data[i];
}
}
}
}
// We assume y_data has fewer elements than x_data; otherwise, call this
// function with x_grad and y_grad swapped.
template <>
void elementwise_sub_grad_broadcast<float>(const float* dout_grad,
float* x_grad,
float* y_grad,
int pre,
int n,
int post) {
if (x_grad) {
elementwise_add_grad(dout_grad, x_grad, pre * n * post);
}
if (y_grad) {
memset(y_grad, 0, n * sizeof(float));
#pragma omp parallel for
for (int i = 0; i < pre; ++i) {
for (int j = 0; j < n; ++j) {
float sum = 0;
int cnt = post >> 2;
int remain = post & 0x03;
const float* out_data = dout_grad + (i * n + j) * post;
float32x4_t sum_v = vdupq_n_f32(0);
for (int ci = 0; ci < cnt; ++ci) {
float32x4_t din = vld1q_f32(out_data + 4 * ci);
sum_v = vaddq_f32(sum_v, din);
}
out_data += 4 * cnt;
for (int ci = 0; ci < remain; ++ci) {
sum -= out_data[ci];
}
float32x2_t high = vget_high_f32(sum_v);
float32x2_t low = vget_low_f32(sum_v);
sum -= vget_lane_f32(high, 0) + vget_lane_f32(high, 1) +
vget_lane_f32(low, 0) + vget_lane_f32(low, 1);
y_grad[j] += sum;
}
}
}
}
template <>
void elementwise_mul<float>(const float* dinx,
......
......@@ -183,6 +183,13 @@ template <typename T>
void elementwise_add_relu_broadcast(
const T* dinx, const T* diny, T* dout, int batch, int channels, int num);
template <typename T>
void elementwise_add_grad(const T* dout, T* dinx, int num);
template <typename T>
void elementwise_add_grad_broadcast(
const T* dout_grad, T* x_grad, T* y_grad, int pre, int n, int post);
template <typename T>
void elementwise_sub(const T* dinx, const T* diny, T* dout, int num);
......@@ -197,6 +204,13 @@ template <typename T>
void elementwise_sub_relu_broadcast(
const T* dinx, const T* diny, T* dout, int batch, int channels, int num);
template <typename T>
void elementwise_sub_grad(const T* dout, T* dinx, T* diny, int num);
template <typename T>
void elementwise_sub_grad_broadcast(
const T* dout_grad, T* x_grad, T* y_grad, int pre, int n, int post);
template <typename T>
void elementwise_mul(const T* dinx, const T* diny, T* dout, int num);
......
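A toy call of the new backward helpers declared above (the header path and namespace are assumed from the file this hunk patches; the float specializations are implemented with ARM NEON):

```cpp
#include "lite/backends/arm/math/elementwise.h"  // presumed header for these declarations

void backward_example() {
  const int num = 32;
  float dout_grad[32], x_grad[32], y_grad[32];
  for (int i = 0; i < num; ++i) dout_grad[i] = 1.0f;

  // dL/dx of z = x + y is dL/dz, copied through:
  paddle::lite::arm::math::elementwise_add_grad<float>(dout_grad, x_grad, num);

  // dL/dx and dL/dy of z = x - y: dL/dx = dL/dz, dL/dy = -dL/dz:
  paddle::lite::arm::math::elementwise_sub_grad<float>(dout_grad, x_grad, y_grad, num);
}
```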
......@@ -983,10 +983,12 @@ void sgemv_trans(const int M,
"vld1.32 {d8-d11}, [%[in]]! @ load input, q4, q5\n" \
"vld1.32 {d12-d15}, [%[w0]]! @ load weights r0, q6,q7\n" \
"vld1.32 {d16-d19}, [%[w1]]! @ load weights r1, q8,q9\n" \
"vld1.32 {d20-d23}, [%[w2]]! @ load weights r2, q10,q11\n" \
"vld1.32 {d24-d27}, [%[w3]]! @ load weights r3, q12,q13\n" \
"vmla.f32 q0, q4, q6 @ mul add\n" \
"vld1.32 {d20-d23}, [%[w2]]! @ load weights r2, q10,q11\n" \
"vmla.f32 q1, q4, q8 @ mul add\n" \
"vld1.32 {d24-d27}, [%[w3]]! @ load weights r3, q12,q13\n" \
/*"vmla.f32 q0, q4, q6 @ mul add\n" */ \
/*"vmla.f32 q1, q4, q8 @ mul add\n" */ \
"vmla.f32 q2, q4, q10 @ mul add\n" \
"vmla.f32 q3, q4, q12 @ mul add\n" \
"subs %[cnt], #1 @ sub loop count \n" \
......
......@@ -106,7 +106,7 @@ inline void read_from_file(lite::Tensor* t, const std::string& path) {
inline void save_float(float* data, const std::string& name, int len) {
static int counter = 0;
std::string old_string = std::to_string(counter);
std::string old_string = paddle::lite::to_string(counter);
std::string new_string =
std::string(3 - old_string.length(), '0') + old_string;
......
......@@ -351,10 +351,10 @@ class Tensor {
void printScale(std::string type) { printScale(); }
std::string dimsFileName() {
return std::to_string(shape_->num()) + "_" +
std::to_string(shape_->channel()) + "_" +
std::to_string(shape_->height()) + "_" +
std::to_string(shape_->width()) + ".txt";
return paddle::lite::to_string(shape_->num()) + "_" +
paddle::lite::to_string(shape_->channel()) + "_" +
paddle::lite::to_string(shape_->height()) + "_" +
paddle::lite::to_string(shape_->width()) + ".txt";
}
void saveToFile() { std::string path = dimsFileName(); }
......@@ -374,7 +374,7 @@ class Tensor {
invalidate();
std::ofstream ofs;
static int counter = 0;
std::string npath = std::to_string(counter) + "_" + path;
std::string npath = paddle::lite::to_string(counter) + "_" + path;
counter++;
save_file_with_name(npath);
}
......
......@@ -19,8 +19,8 @@ namespace paddle {
namespace lite {
namespace npu {
std::unique_ptr<hiai::AiModelMngerClient> Device::Build(
std::string& model_name, // NOLINT
std::shared_ptr<hiai::AiModelMngerClient> Device::Build(
const std::string model_name, // NOLINT
std::vector<ge::Operator>& input_nodes, // NOLINT
std::vector<ge::Operator>& output_nodes // NOLINT
) {
......@@ -41,15 +41,15 @@ std::unique_ptr<hiai::AiModelMngerClient> Device::Build(
ir_build.ReleaseModelBuff(om_model_buf);
return nullptr;
}
// Create a HiAI model manager client to load the HiAI om model
std::unique_ptr<hiai::AiModelMngerClient> model_client(
std::shared_ptr<hiai::AiModelMngerClient> model_client(
new hiai::AiModelMngerClient());
if (model_client->Init(nullptr) != hiai::AI_SUCCESS) {
LOG(WARNING) << "[NPU] AiModelMngerClient init failed)!";
ir_build.ReleaseModelBuff(om_model_buf);
return nullptr;
}
model_name = "model_" + std::to_string(model_count_++) + ".om";
auto model_desc = std::make_shared<hiai::AiModelDescription>(
model_name, freq_level(), framework_type(), model_type(), device_type());
model_desc->SetModelBuffer(om_model_buf.data, om_model_buf.length);
......
......@@ -40,8 +40,8 @@ class Device {
// Build the HiAI IR graph to om model, return HiAI model manager client to
// load om model and run inference.
std::unique_ptr<hiai::AiModelMngerClient> Build(
std::string& model_name, // NOLINT
std::shared_ptr<hiai::AiModelMngerClient> Build(
const std::string model_name, // NOLINT
std::vector<ge::Operator>& input_nodes, // NOLINT
std::vector<ge::Operator>& output_nodes // NOLINT
); // NOLINT
......@@ -51,7 +51,6 @@ class Device {
int framework_type_{0};
int model_type_{0};
int device_type_{0};
int model_count_{0};
};
} // namespace npu
......
......@@ -13,6 +13,5 @@ lite_cc_library(cl_image SRCS cl_image.cc DEPS tensor cl_image_converter cl_runt
lite_cc_library(cl_caller SRCS cl_caller.cc DEPS cl_context cl_image)
lite_cc_library(cl_target_wrapper SRCS target_wrapper.cc DEPS cl_runtime)
lite_cc_test(test_cl_functions SRCS cl_functions_test.cc DEPS cl_context cl_image cl_caller cl_wrapper cl_target_wrapper)
lite_cc_test(test_cl_im2col SRCS cl_im2col_test.cc DEPS tensor cl_context cl_wrapper cl_target_wrapper)
add_dependencies(cl_wrapper opencl_clhpp)
#include <cl_common.h>
__kernel void conv2d_1x1(__private const int global_size_dim0,
__kernel void conv2d_1x1_opt(__private const int global_size_dim0,
__private const int global_size_dim1,
__private const int global_size_dim2,
__read_only image2d_t input_image,
......
......@@ -14,21 +14,22 @@ limitations under the License. */
#include <cl_common.h>
__kernel void conv2d_3x3_opt(__private const int item_ch,
__kernel void conv2d_3x3_opt(__private const int item_ch,
__private const int item_w,
__private const int item_h,
__private const int item_h,
__read_only image2d_t input_image,
__read_only image2d_t filter_image,
#if defined(BIASE_CH) || defined(BIASE_ELE)
__read_only image2d_t bias,
#endif
__write_only image2d_t output_image,
__write_only image2d_t output_image,
__private const int stride,
__private const int pad,
__private const int pad,
__private const int dilation,
__private const int in_ch,
__private const int batch,
__private const int in_ch,
__private const int in_w,
__private const int in_h,
__private const int in_h,
__private const int out_w,
__private const int out_h) {
......@@ -60,7 +61,8 @@ __kernel void conv2d_3x3_opt(__private const int item_ch,
#ifdef BIASE_CH
CL_DTYPE4 output[5];
output[0] = READ_IMG_TYPE(CL_DTYPE_CHAR, bias, sampler, (int2)(item_ch_id, 0));
output[0] =
READ_IMG_TYPE(CL_DTYPE_CHAR, bias, sampler, (int2)(item_ch_id, 0));
output[1] = output[0];
output[2] = output[0];
output[3] = output[0];
......@@ -69,23 +71,33 @@ __kernel void conv2d_3x3_opt(__private const int item_ch,
#elif defined(BIASE_ELE)
CL_DTYPE4 output[5];
output[0] =
READ_IMG_TYPE(CL_DTYPE_CHAR, bias, sampler, (int2)(out_w_base_id + out_w_id0, item_h_id));
output[0] = READ_IMG_TYPE(CL_DTYPE_CHAR,
bias,
sampler,
(int2)(out_w_base_id + out_w_id0, item_h_id));
if (out_w_id1 < out_w) {
output[1] = READ_IMG_TYPE(CL_DTYPE_CHAR, bias, sampler,
(int2)(out_w_base_id + out_w_id1, item_h_id));
output[1] = READ_IMG_TYPE(CL_DTYPE_CHAR,
bias,
sampler,
(int2)(out_w_base_id + out_w_id1, item_h_id));
}
if (out_w_id2 < out_w) {
output[2] = READ_IMG_TYPE(CL_DTYPE_CHAR, bias, sampler,
(int2)(out_w_base_id + out_w_id2, item_h_id));
output[2] = READ_IMG_TYPE(CL_DTYPE_CHAR,
bias,
sampler,
(int2)(out_w_base_id + out_w_id2, item_h_id));
}
if (out_w_id3 < out_w) {
output[3] = READ_IMG_TYPE(CL_DTYPE_CHAR, bias, sampler,
(int2)(out_w_base_id + out_w_id3, item_h_id));
output[3] = READ_IMG_TYPE(CL_DTYPE_CHAR,
bias,
sampler,
(int2)(out_w_base_id + out_w_id3, item_h_id));
}
if (out_w_id4 < out_w) {
output[4] = READ_IMG_TYPE(CL_DTYPE_CHAR, bias, sampler,
(int2)(out_w_base_id + out_w_id4, item_h_id));
output[4] = READ_IMG_TYPE(CL_DTYPE_CHAR,
bias,
sampler,
(int2)(out_w_base_id + out_w_id4, item_h_id));
}
#else
CL_DTYPE4 output[5] = {0.0f};
......@@ -108,54 +120,76 @@ __kernel void conv2d_3x3_opt(__private const int item_ch,
int filter_w_val = ch * 3;
for (int h = 0; h < 3; h++) {
int in_h_val = select(out_batch_id * in_h + in_h_id + h, -1,
int in_h_val = select(out_batch_id * in_h + in_h_id + h,
-1,
(out_batch_id * in_h + in_h_id + h < 0 ||
out_batch_id * in_h + in_h_id + h >= in_h));
for (int w = 0; w < 3; w++) {
int in_w_val0 = select(in_w_base_id + in_w_id0 + w, -1,
int in_w_val0 = select(in_w_base_id + in_w_id0 + w,
-1,
(in_w_id0 + w < 0 || in_w_id0 + w >= in_w));
int in_w_val1 = select(in_w_base_id + in_w_id1 + w, -1,
int in_w_val1 = select(in_w_base_id + in_w_id1 + w,
-1,
(in_w_id1 + w < 0 || in_w_id1 + w >= in_w));
int in_w_val2 = select(in_w_base_id + in_w_id2 + w, -1,
int in_w_val2 = select(in_w_base_id + in_w_id2 + w,
-1,
(in_w_id2 + w < 0 || in_w_id2 + w >= in_w));
int in_w_val3 = select(in_w_base_id + in_w_id3 + w, -1,
int in_w_val3 = select(in_w_base_id + in_w_id3 + w,
-1,
(in_w_id3 + w < 0 || in_w_id3 + w >= in_w));
int in_w_val4 = select(in_w_base_id + in_w_id4 + w, -1,
int in_w_val4 = select(in_w_base_id + in_w_id4 + w,
-1,
(in_w_id4 + w < 0 || in_w_id4 + w >= in_w));
filter[0] = READ_IMG_TYPE(CL_DTYPE_CHAR,
filter_image, sampler,
filter[0] = READ_IMG_TYPE(
CL_DTYPE_CHAR,
filter_image,
sampler,
(int2)(filter_w_val + w, filter_h_val0 + h)); // in_ch:0-3,out_ch:0
filter[1] = READ_IMG_TYPE(CL_DTYPE_CHAR,
filter_image, sampler,
filter[1] = READ_IMG_TYPE(
CL_DTYPE_CHAR,
filter_image,
sampler,
(int2)(filter_w_val + w, filter_h_val1 + h)); // in_ch:0-3,out_ch:1
filter[2] = READ_IMG_TYPE(CL_DTYPE_CHAR,
filter_image, sampler,
filter[2] = READ_IMG_TYPE(
CL_DTYPE_CHAR,
filter_image,
sampler,
(int2)(filter_w_val + w, filter_h_val2 + h)); // in_ch:0-3,out_ch:2
filter[3] = READ_IMG_TYPE(CL_DTYPE_CHAR,
filter_image, sampler,
filter[3] = READ_IMG_TYPE(
CL_DTYPE_CHAR,
filter_image,
sampler,
(int2)(filter_w_val + w, filter_h_val3 + h)); // in_ch:0-3,out_ch:3
filter_trans[0] = (CL_DTYPE4)(filter[0].x, filter[1].x, filter[2].x,
filter[3].x); // in_ch:0,out_ch:0-3
filter_trans[1] = (CL_DTYPE4)(filter[0].y, filter[1].y, filter[2].y,
filter[3].y); // in_ch:1,out_ch:0-3
filter_trans[2] = (CL_DTYPE4)(filter[0].z, filter[1].z, filter[2].z,
filter[3].z); // in_ch:2,out_ch:0-3
filter_trans[3] = (CL_DTYPE4)(filter[0].w, filter[1].w, filter[2].w,
filter[3].w); // in_ch:3,out_ch:0-3
input[0] =
READ_IMG_TYPE(CL_DTYPE_CHAR, input_image, sampler, (int2)(in_w_val0, in_h_val));
input[1] =
READ_IMG_TYPE(CL_DTYPE_CHAR, input_image, sampler, (int2)(in_w_val1, in_h_val));
input[2] =
READ_IMG_TYPE(CL_DTYPE_CHAR, input_image, sampler, (int2)(in_w_val2, in_h_val));
input[3] =
READ_IMG_TYPE(CL_DTYPE_CHAR, input_image, sampler, (int2)(in_w_val3, in_h_val));
input[4] =
READ_IMG_TYPE(CL_DTYPE_CHAR, input_image, sampler, (int2)(in_w_val4, in_h_val));
filter_trans[0] = (CL_DTYPE4)(filter[0].x,
filter[1].x,
filter[2].x,
filter[3].x); // in_ch:0,out_ch:0-3
filter_trans[1] = (CL_DTYPE4)(filter[0].y,
filter[1].y,
filter[2].y,
filter[3].y); // in_ch:1,out_ch:0-3
filter_trans[2] = (CL_DTYPE4)(filter[0].z,
filter[1].z,
filter[2].z,
filter[3].z); // in_ch:2,out_ch:0-3
filter_trans[3] = (CL_DTYPE4)(filter[0].w,
filter[1].w,
filter[2].w,
filter[3].w); // in_ch:3,out_ch:0-3
input[0] = READ_IMG_TYPE(
CL_DTYPE_CHAR, input_image, sampler, (int2)(in_w_val0, in_h_val));
input[1] = READ_IMG_TYPE(
CL_DTYPE_CHAR, input_image, sampler, (int2)(in_w_val1, in_h_val));
input[2] = READ_IMG_TYPE(
CL_DTYPE_CHAR, input_image, sampler, (int2)(in_w_val2, in_h_val));
input[3] = READ_IMG_TYPE(
CL_DTYPE_CHAR, input_image, sampler, (int2)(in_w_val3, in_h_val));
input[4] = READ_IMG_TYPE(
CL_DTYPE_CHAR, input_image, sampler, (int2)(in_w_val4, in_h_val));
output[0] = mad(input[0].x, filter_trans[0], output[0]);
output[1] = mad(input[1].x, filter_trans[0], output[1]);
......@@ -194,23 +228,278 @@ __kernel void conv2d_3x3_opt(__private const int item_ch,
output[3] = activation_type4(output[3]);
output[4] = activation_type4(output[4]);
WRITE_IMG_TYPE(CL_DTYPE_CHAR, output_image, (int2)(out_w_base_id + out_w_id0, item_h_id),
output[0]);
WRITE_IMG_TYPE(CL_DTYPE_CHAR,
output_image,
(int2)(out_w_base_id + out_w_id0, item_h_id),
output[0]);
if (out_w_id1 < out_w) {
WRITE_IMG_TYPE(CL_DTYPE_CHAR, output_image, (int2)(out_w_base_id + out_w_id1, item_h_id),
output[1]);
WRITE_IMG_TYPE(CL_DTYPE_CHAR,
output_image,
(int2)(out_w_base_id + out_w_id1, item_h_id),
output[1]);
}
if (out_w_id2 < out_w) {
WRITE_IMG_TYPE(CL_DTYPE_CHAR, output_image, (int2)(out_w_base_id + out_w_id2, item_h_id),
output[2]);
WRITE_IMG_TYPE(CL_DTYPE_CHAR,
output_image,
(int2)(out_w_base_id + out_w_id2, item_h_id),
output[2]);
}
if (out_w_id3 < out_w) {
WRITE_IMG_TYPE(CL_DTYPE_CHAR, output_image, (int2)(out_w_base_id + out_w_id3, item_h_id),
output[3]);
WRITE_IMG_TYPE(CL_DTYPE_CHAR,
output_image,
(int2)(out_w_base_id + out_w_id3, item_h_id),
output[3]);
}
if (out_w_id4 < out_w) {
WRITE_IMG_TYPE(CL_DTYPE_CHAR, output_image, (int2)(out_w_base_id + out_w_id4, item_h_id),
output[4]);
WRITE_IMG_TYPE(CL_DTYPE_CHAR,
output_image,
(int2)(out_w_base_id + out_w_id4, item_h_id),
output[4]);
}
}
// support batch > 1
__kernel void conv2d_3x3_multi_batch(__private const int item_ch,
__private const int item_w,
__private const int item_h,
__read_only image2d_t input_image,
__read_only image2d_t filter_image,
#if defined(BIASE_CH) || defined(BIASE_ELE)
__read_only image2d_t bias,
#endif
__write_only image2d_t output_image,
__private const int stride,
__private const int pad,
__private const int dilation,
__private const int batch,
__private const int in_ch,
__private const int in_w,
__private const int in_h,
__private const int out_w,
__private const int out_h) {
const sampler_t sampler =
CLK_NORMALIZED_COORDS_TRUE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;
// item_id
const int item_ch_id = get_global_id(0);
const int item_w_id = get_global_id(1);
const int item_h_id = get_global_id(2);
// out_width_id_per_blk and out_batch_id
int out_batch_id = item_h_id / in_h;
int out_w_base_id = item_ch_id * out_w;
int out_w_id0 = item_w_id;
int out_w_id1 = out_w_id0 + item_w;
int out_w_id2 = out_w_id1 + item_w;
int out_w_id3 = out_w_id2 + item_w;
int out_w_id4 = out_w_id3 + item_w;
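  // Each work-item writes up to 5 output pixels along the width
  // (out_w_id0..out_w_id4), spaced item_w columns apart; the
  // `out_w_idN < out_w` guards further down drop the columns that run past
  // the right edge of the output image.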
// in_width_id_per_blk and in_height_id_per_batch
int in_h_id = (item_h_id % out_h) * stride - pad;
int in_w_id0 = item_w_id * stride - pad;
int in_w_id1 = in_w_id0 + item_w * stride;
int in_w_id2 = in_w_id1 + item_w * stride;
int in_w_id3 = in_w_id2 + item_w * stride;
int in_w_id4 = in_w_id3 + item_w * stride;
#ifdef BIASE_CH
CL_DTYPE4 output[5];
output[0] =
READ_IMG_TYPE(CL_DTYPE_CHAR, bias, sampler, (int2)(item_ch_id, 0));
output[1] = output[0];
output[2] = output[0];
output[3] = output[0];
output[4] = output[0];
#elif defined(BIASE_ELE)
CL_DTYPE4 output[5];
output[0] = READ_IMG_TYPE(CL_DTYPE_CHAR,
bias,
sampler,
(int2)(out_w_base_id + out_w_id0, item_h_id));
if (out_w_id1 < out_w) {
output[1] = READ_IMG_TYPE(CL_DTYPE_CHAR,
bias,
sampler,
(int2)(out_w_base_id + out_w_id1, item_h_id));
}
if (out_w_id2 < out_w) {
output[2] = READ_IMG_TYPE(CL_DTYPE_CHAR,
bias,
sampler,
(int2)(out_w_base_id + out_w_id2, item_h_id));
}
if (out_w_id3 < out_w) {
output[3] = READ_IMG_TYPE(CL_DTYPE_CHAR,
bias,
sampler,
(int2)(out_w_base_id + out_w_id3, item_h_id));
}
if (out_w_id4 < out_w) {
output[4] = READ_IMG_TYPE(CL_DTYPE_CHAR,
bias,
sampler,
(int2)(out_w_base_id + out_w_id4, item_h_id));
}
#else
CL_DTYPE4 output[5] = {0.0f};
#endif
CL_DTYPE4 filter[4] = {0.0f};
CL_DTYPE4 filter_trans[4] = {0.0f};
CL_DTYPE4 input[5] = {0.0f};
int filter_h_val0 = item_ch_id * 4 * 3;
int filter_h_val1 = filter_h_val0 + 3;
int filter_h_val2 = filter_h_val1 + 3;
int filter_h_val3 = filter_h_val2 + 3;
for (int ch = 0; ch < (in_ch + 3) / 4; ch++) {
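    // ch_surplus counts the padded lanes in the last CL_DTYPE4 channel block
    // when in_ch is not a multiple of 4; the `if (ch_surplus < N)` groups
    // below skip the y/z/w lanes so padding never reaches the accumulators.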
int ch_surplus = (ch + 1) * 4 - in_ch > 0 ? (ch + 1) * 4 - in_ch : 0;
const int in_w_base_id = mul24(ch, in_w);
int filter_w_val = ch * 3;
for (int h = 0; h < 3; h++) {
int in_h_val = select(
out_batch_id * in_h + in_h_id + h,
-1,
(out_batch_id * in_h + in_h_id + h < out_batch_id * in_h ||
out_batch_id * in_h + in_h_id + h >= (out_batch_id + 1) * in_h));
for (int w = 0; w < 3; w++) {
int in_w_val0 = select(in_w_base_id + in_w_id0 + w,
-1,
(in_w_id0 + w < 0 || in_w_id0 + w >= in_w));
int in_w_val1 = select(in_w_base_id + in_w_id1 + w,
-1,
(in_w_id1 + w < 0 || in_w_id1 + w >= in_w));
int in_w_val2 = select(in_w_base_id + in_w_id2 + w,
-1,
(in_w_id2 + w < 0 || in_w_id2 + w >= in_w));
int in_w_val3 = select(in_w_base_id + in_w_id3 + w,
-1,
(in_w_id3 + w < 0 || in_w_id3 + w >= in_w));
int in_w_val4 = select(in_w_base_id + in_w_id4 + w,
-1,
(in_w_id4 + w < 0 || in_w_id4 + w >= in_w));
filter[0] = READ_IMG_TYPE(
CL_DTYPE_CHAR,
filter_image,
sampler,
(int2)(filter_w_val + w, filter_h_val0 + h)); // in_ch:0-3,out_ch:0
filter[1] = READ_IMG_TYPE(
CL_DTYPE_CHAR,
filter_image,
sampler,
(int2)(filter_w_val + w, filter_h_val1 + h)); // in_ch:0-3,out_ch:1
filter[2] = READ_IMG_TYPE(
CL_DTYPE_CHAR,
filter_image,
sampler,
(int2)(filter_w_val + w, filter_h_val2 + h)); // in_ch:0-3,out_ch:2
filter[3] = READ_IMG_TYPE(
CL_DTYPE_CHAR,
filter_image,
sampler,
(int2)(filter_w_val + w, filter_h_val3 + h)); // in_ch:0-3,out_ch:3
filter_trans[0] = (CL_DTYPE4)(filter[0].x,
filter[1].x,
filter[2].x,
filter[3].x); // in_ch:0,out_ch:0-3
filter_trans[1] = (CL_DTYPE4)(filter[0].y,
filter[1].y,
filter[2].y,
filter[3].y); // in_ch:1,out_ch:0-3
filter_trans[2] = (CL_DTYPE4)(filter[0].z,
filter[1].z,
filter[2].z,
filter[3].z); // in_ch:2,out_ch:0-3
filter_trans[3] = (CL_DTYPE4)(filter[0].w,
filter[1].w,
filter[2].w,
filter[3].w); // in_ch:3,out_ch:0-3
input[0] = READ_IMG_TYPE(
CL_DTYPE_CHAR, input_image, sampler, (int2)(in_w_val0, in_h_val));
input[1] = READ_IMG_TYPE(
CL_DTYPE_CHAR, input_image, sampler, (int2)(in_w_val1, in_h_val));
input[2] = READ_IMG_TYPE(
CL_DTYPE_CHAR, input_image, sampler, (int2)(in_w_val2, in_h_val));
input[3] = READ_IMG_TYPE(
CL_DTYPE_CHAR, input_image, sampler, (int2)(in_w_val3, in_h_val));
input[4] = READ_IMG_TYPE(
CL_DTYPE_CHAR, input_image, sampler, (int2)(in_w_val4, in_h_val));
output[0] = mad(input[0].x, filter_trans[0], output[0]);
output[1] = mad(input[1].x, filter_trans[0], output[1]);
output[2] = mad(input[2].x, filter_trans[0], output[2]);
output[3] = mad(input[3].x, filter_trans[0], output[3]);
output[4] = mad(input[4].x, filter_trans[0], output[4]);
if (ch_surplus < 3) {
output[0] = mad(input[0].y, filter_trans[1], output[0]);
output[1] = mad(input[1].y, filter_trans[1], output[1]);
output[2] = mad(input[2].y, filter_trans[1], output[2]);
output[3] = mad(input[3].y, filter_trans[1], output[3]);
output[4] = mad(input[4].y, filter_trans[1], output[4]);
}
if (ch_surplus < 2) {
output[0] = mad(input[0].z, filter_trans[2], output[0]);
output[1] = mad(input[1].z, filter_trans[2], output[1]);
output[2] = mad(input[2].z, filter_trans[2], output[2]);
output[3] = mad(input[3].z, filter_trans[2], output[3]);
output[4] = mad(input[4].z, filter_trans[2], output[4]);
}
if (ch_surplus < 1) {
output[0] = mad(input[0].w, filter_trans[3], output[0]);
output[1] = mad(input[1].w, filter_trans[3], output[1]);
output[2] = mad(input[2].w, filter_trans[3], output[2]);
output[3] = mad(input[3].w, filter_trans[3], output[3]);
output[4] = mad(input[4].w, filter_trans[3], output[4]);
}
}
}
}
output[0] = activation_type4(output[0]);
output[1] = activation_type4(output[1]);
output[2] = activation_type4(output[2]);
output[3] = activation_type4(output[3]);
output[4] = activation_type4(output[4]);
WRITE_IMG_TYPE(CL_DTYPE_CHAR,
output_image,
(int2)(out_w_base_id + out_w_id0, item_h_id),
output[0]);
if (out_w_id1 < out_w) {
WRITE_IMG_TYPE(CL_DTYPE_CHAR,
output_image,
(int2)(out_w_base_id + out_w_id1, item_h_id),
output[1]);
}
if (out_w_id2 < out_w) {
WRITE_IMG_TYPE(CL_DTYPE_CHAR,
output_image,
(int2)(out_w_base_id + out_w_id2, item_h_id),
output[2]);
}
if (out_w_id3 < out_w) {
WRITE_IMG_TYPE(CL_DTYPE_CHAR,
output_image,
(int2)(out_w_base_id + out_w_id3, item_h_id),
output[3]);
}
if (out_w_id4 < out_w) {
WRITE_IMG_TYPE(CL_DTYPE_CHAR,
output_image,
(int2)(out_w_base_id + out_w_id4, item_h_id),
output[4]);
}
}
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <cl_common.h>
// opt version of conv5x5
__kernel void conv2d_5x5_opt(__private const int item_ch,
__private const int item_w,
__private const int item_h,
__read_only image2d_t input_image,
__read_only image2d_t filter_image,
#if defined(BIASE_CH) || defined(BIASE_ELE)
__read_only image2d_t bias,
#endif
__write_only image2d_t output_image,
__private const int stride,
__private const int pad,
__private const int dilation,
__private const int batch,
__private const int in_ch,
__private const int in_w,
__private const int in_h,
__private const int out_w,
__private const int out_h) {
const sampler_t sampler =
CLK_NORMALIZED_COORDS_TRUE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;
// filter
const int filter_w = 5;
const int filter_h = 5;
// item_id
const int item_ch_id = get_global_id(0);
const int item_w_id = get_global_id(1);
const int item_h_id = get_global_id(2);
// out_width_id_per_blk and out_batch_id
int out_w_base_id = item_ch_id * out_w;
int out_w_id0 = item_w_id;
int out_w_id1 = out_w_id0 + item_w;
int out_w_id2 = out_w_id1 + item_w;
int out_w_id3 = out_w_id2 + item_w;
int out_w_id4 = out_w_id3 + item_w;
// in_width_id_per_blk and in_height_id_per_batch
int in_h_id = (item_h_id % out_h) * stride - pad;
int in_w_id0 = item_w_id * stride - pad;
int in_w_id1 = in_w_id0 + item_w * stride;
int in_w_id2 = in_w_id1 + item_w * stride;
int in_w_id3 = in_w_id2 + item_w * stride;
int in_w_id4 = in_w_id3 + item_w * stride;
#ifdef BIASE_CH
CL_DTYPE4 output[5];
output[0] =
READ_IMG_TYPE(CL_DTYPE_CHAR, bias, sampler, (int2)(item_ch_id, 0));
output[1] = output[0];
output[2] = output[0];
output[3] = output[0];
output[4] = output[0];
#elif defined(BIASE_ELE)
CL_DTYPE4 output[5];
output[0] = READ_IMG_TYPE(CL_DTYPE_CHAR,
bias,
sampler,
(int2)(out_w_base_id + out_w_id0, item_h_id));
if (out_w_id1 < out_w) {
output[1] = READ_IMG_TYPE(CL_DTYPE_CHAR,
bias,
sampler,
(int2)(out_w_base_id + out_w_id1, item_h_id));
}
if (out_w_id2 < out_w) {
output[2] = READ_IMG_TYPE(CL_DTYPE_CHAR,
bias,
sampler,
(int2)(out_w_base_id + out_w_id2, item_h_id));
}
if (out_w_id3 < out_w) {
output[3] = READ_IMG_TYPE(CL_DTYPE_CHAR,
bias,
sampler,
(int2)(out_w_base_id + out_w_id3, item_h_id));
}
if (out_w_id4 < out_w) {
output[4] = READ_IMG_TYPE(CL_DTYPE_CHAR,
bias,
sampler,
(int2)(out_w_base_id + out_w_id4, item_h_id));
}
#else
CL_DTYPE4 output[5] = {0.0f};
#endif
CL_DTYPE4 filter[4] = {0.0f};
CL_DTYPE4 filter_trans[4] = {0.0f};
CL_DTYPE4 input[5] = {0.0f};
int filter_h_val0 = item_ch_id * 4 * filter_h;
int filter_h_val1 = filter_h_val0 + filter_h;
int filter_h_val2 = filter_h_val1 + filter_h;
int filter_h_val3 = filter_h_val2 + filter_h;
for (int ch = 0; ch < (in_ch + 3) / 4; ch++) {
int ch_surplus = (ch + 1) * 4 - in_ch > 0 ? (ch + 1) * 4 - in_ch : 0;
const int in_w_base_id = mul24(ch, in_w);
int filter_w_val = ch * filter_w;
for (int h = 0; h < filter_h; h++) {
int in_h_val =
select(in_h_id + h, -1, (in_h_id + h < 0 || in_h_id + h >= in_h));
for (int w = 0; w < filter_w; w++) {
int in_w_val0 = select(in_w_base_id + in_w_id0 + w,
-1,
(in_w_id0 + w < 0 || in_w_id0 + w >= in_w));
int in_w_val1 = select(in_w_base_id + in_w_id1 + w,
-1,
(in_w_id1 + w < 0 || in_w_id1 + w >= in_w));
int in_w_val2 = select(in_w_base_id + in_w_id2 + w,
-1,
(in_w_id2 + w < 0 || in_w_id2 + w >= in_w));
int in_w_val3 = select(in_w_base_id + in_w_id3 + w,
-1,
(in_w_id3 + w < 0 || in_w_id3 + w >= in_w));
int in_w_val4 = select(in_w_base_id + in_w_id4 + w,
-1,
(in_w_id4 + w < 0 || in_w_id4 + w >= in_w));
filter[0] =
READ_IMG_TYPE(CL_DTYPE_CHAR,
filter_image,
sampler,
(int2)(filter_w_val + w,
filter_h_val0 + h)); // in_ch:0-3,out_ch:0
filter[1] =
READ_IMG_TYPE(CL_DTYPE_CHAR,
filter_image,
sampler,
(int2)(filter_w_val + w,
filter_h_val1 + h)); // in_ch:0-3,out_ch:1
filter[2] =
READ_IMG_TYPE(CL_DTYPE_CHAR,
filter_image,
sampler,
(int2)(filter_w_val + w,
filter_h_val2 + h)); // in_ch:0-3,out_ch:2
filter[3] =
READ_IMG_TYPE(CL_DTYPE_CHAR,
filter_image,
sampler,
(int2)(filter_w_val + w,
filter_h_val3 + h)); // in_ch:0-3,out_ch:3
filter_trans[0] = (CL_DTYPE4)(filter[0].x,
filter[1].x,
filter[2].x,
filter[3].x); // in_ch:0,out_ch:0-3
filter_trans[1] = (CL_DTYPE4)(filter[0].y,
filter[1].y,
filter[2].y,
filter[3].y); // in_ch:1,out_ch:0-3
filter_trans[2] = (CL_DTYPE4)(filter[0].z,
filter[1].z,
filter[2].z,
filter[3].z); // in_ch:2,out_ch:0-3
filter_trans[3] = (CL_DTYPE4)(filter[0].w,
filter[1].w,
filter[2].w,
filter[3].w); // in_ch:3,out_ch:0-3
input[0] = READ_IMG_TYPE(
CL_DTYPE_CHAR, input_image, sampler, (int2)(in_w_val0, in_h_val));
input[1] = READ_IMG_TYPE(
CL_DTYPE_CHAR, input_image, sampler, (int2)(in_w_val1, in_h_val));
input[2] = READ_IMG_TYPE(
CL_DTYPE_CHAR, input_image, sampler, (int2)(in_w_val2, in_h_val));
input[3] = READ_IMG_TYPE(
CL_DTYPE_CHAR, input_image, sampler, (int2)(in_w_val3, in_h_val));
input[4] = READ_IMG_TYPE(
CL_DTYPE_CHAR, input_image, sampler, (int2)(in_w_val4, in_h_val));
output[0] = mad(input[0].x, filter_trans[0], output[0]);
output[1] = mad(input[1].x, filter_trans[0], output[1]);
output[2] = mad(input[2].x, filter_trans[0], output[2]);
output[3] = mad(input[3].x, filter_trans[0], output[3]);
output[4] = mad(input[4].x, filter_trans[0], output[4]);
if (ch_surplus < 3) {
output[0] = mad(input[0].y, filter_trans[1], output[0]);
output[1] = mad(input[1].y, filter_trans[1], output[1]);
output[2] = mad(input[2].y, filter_trans[1], output[2]);
output[3] = mad(input[3].y, filter_trans[1], output[3]);
output[4] = mad(input[4].y, filter_trans[1], output[4]);
}
if (ch_surplus < 2) {
output[0] = mad(input[0].z, filter_trans[2], output[0]);
output[1] = mad(input[1].z, filter_trans[2], output[1]);
output[2] = mad(input[2].z, filter_trans[2], output[2]);
output[3] = mad(input[3].z, filter_trans[2], output[3]);
output[4] = mad(input[4].z, filter_trans[2], output[4]);
}
if (ch_surplus < 1) {
output[0] = mad(input[0].w, filter_trans[3], output[0]);
output[1] = mad(input[1].w, filter_trans[3], output[1]);
output[2] = mad(input[2].w, filter_trans[3], output[2]);
output[3] = mad(input[3].w, filter_trans[3], output[3]);
output[4] = mad(input[4].w, filter_trans[3], output[4]);
}
}
}
}
output[0] = activation_type4(output[0]);
output[1] = activation_type4(output[1]);
output[2] = activation_type4(output[2]);
output[3] = activation_type4(output[3]);
output[4] = activation_type4(output[4]);
WRITE_IMG_TYPE(CL_DTYPE_CHAR,
output_image,
(int2)(out_w_base_id + out_w_id0, item_h_id),
output[0]);
if (out_w_id1 < out_w) {
WRITE_IMG_TYPE(CL_DTYPE_CHAR,
output_image,
(int2)(out_w_base_id + out_w_id1, item_h_id),
output[1]);
}
if (out_w_id2 < out_w) {
WRITE_IMG_TYPE(CL_DTYPE_CHAR,
output_image,
(int2)(out_w_base_id + out_w_id2, item_h_id),
output[2]);
}
if (out_w_id3 < out_w) {
WRITE_IMG_TYPE(CL_DTYPE_CHAR,
output_image,
(int2)(out_w_base_id + out_w_id3, item_h_id),
output[3]);
}
if (out_w_id4 < out_w) {
WRITE_IMG_TYPE(CL_DTYPE_CHAR,
output_image,
(int2)(out_w_base_id + out_w_id4, item_h_id),
output[4]);
}
}
// support batch > 1
__kernel void conv2d_5x5_multi_batch(__private const int item_ch,
__private const int item_w,
__private const int item_h,
__read_only image2d_t input_image,
__read_only image2d_t filter_image,
#if defined(BIASE_CH) || defined(BIASE_ELE)
__read_only image2d_t bias,
#endif
__write_only image2d_t output_image,
__private const int stride,
__private const int pad,
__private const int dilation,
__private const int batch,
__private const int in_ch,
__private const int in_w,
__private const int in_h,
__private const int out_w,
__private const int out_h) {
const sampler_t sampler =
CLK_NORMALIZED_COORDS_TRUE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;
// filter
const int filter_w = 5;
const int filter_h = 5;
// item_id
const int item_ch_id = get_global_id(0);
const int item_w_id = get_global_id(1);
const int item_h_id = get_global_id(2);
// out_width_id_per_blk and out_batch_id
int out_batch_id = item_h_id / in_h;
int out_w_base_id = item_ch_id * out_w;
int out_w_id0 = item_w_id;
int out_w_id1 = out_w_id0 + item_w;
int out_w_id2 = out_w_id1 + item_w;
int out_w_id3 = out_w_id2 + item_w;
int out_w_id4 = out_w_id3 + item_w;
// in_width_id_per_blk and in_height_id_per_batch
int in_h_id = (item_h_id % out_h) * stride - pad;
int in_w_id0 = item_w_id * stride - pad;
int in_w_id1 = in_w_id0 + item_w * stride;
int in_w_id2 = in_w_id1 + item_w * stride;
int in_w_id3 = in_w_id2 + item_w * stride;
int in_w_id4 = in_w_id3 + item_w * stride;
#ifdef BIASE_CH
CL_DTYPE4 output[5];
output[0] =
READ_IMG_TYPE(CL_DTYPE_CHAR, bias, sampler, (int2)(item_ch_id, 0));
output[1] = output[0];
output[2] = output[0];
output[3] = output[0];
output[4] = output[0];
#elif defined(BIASE_ELE)
CL_DTYPE4 output[5];
output[0] = READ_IMG_TYPE(CL_DTYPE_CHAR,
bias,
sampler,
(int2)(out_w_base_id + out_w_id0, item_h_id));
if (out_w_id1 < out_w) {
output[1] = READ_IMG_TYPE(CL_DTYPE_CHAR,
bias,
sampler,
(int2)(out_w_base_id + out_w_id1, item_h_id));
}
if (out_w_id2 < out_w) {
output[2] = READ_IMG_TYPE(CL_DTYPE_CHAR,
bias,
sampler,
(int2)(out_w_base_id + out_w_id2, item_h_id));
}
if (out_w_id3 < out_w) {
output[3] = READ_IMG_TYPE(CL_DTYPE_CHAR,
bias,
sampler,
(int2)(out_w_base_id + out_w_id3, item_h_id));
}
if (out_w_id4 < out_w) {
output[4] = READ_IMG_TYPE(CL_DTYPE_CHAR,
bias,
sampler,
(int2)(out_w_base_id + out_w_id4, item_h_id));
}
#else
CL_DTYPE4 output[5] = {0.0f};
#endif
CL_DTYPE4 filter[4] = {0.0f};
CL_DTYPE4 filter_trans[4] = {0.0f};
CL_DTYPE4 input[5] = {0.0f};
int filter_h_val0 = item_ch_id * 4 * filter_h;
int filter_h_val1 = filter_h_val0 + filter_h;
int filter_h_val2 = filter_h_val1 + filter_h;
int filter_h_val3 = filter_h_val2 + filter_h;
for (int ch = 0; ch < (in_ch + 3) / 4; ch++) {
int ch_surplus = (ch + 1) * 4 - in_ch > 0 ? (ch + 1) * 4 - in_ch : 0;
const int in_w_base_id = mul24(ch, in_w);
int filter_w_val = ch * filter_w;
for (int h = 0; h < filter_h; h++) {
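      // Rows outside the current batch's slice
      // [out_batch_id * in_h, (out_batch_id + 1) * in_h) are mapped to -1, so
      // the sampler's out-of-range handling supplies the padding instead of a
      // neighbouring batch's rows.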
int in_h_val = select(
out_batch_id * in_h + in_h_id + h,
-1,
(out_batch_id * in_h + in_h_id + h < out_batch_id * in_h ||
out_batch_id * in_h + in_h_id + h >= (out_batch_id + 1) * in_h));
for (int w = 0; w < filter_w; w++) {
int in_w_val0 = select(in_w_base_id + in_w_id0 + w,
-1,
(in_w_id0 + w < 0 || in_w_id0 + w >= in_w));
int in_w_val1 = select(in_w_base_id + in_w_id1 + w,
-1,
(in_w_id1 + w < 0 || in_w_id1 + w >= in_w));
int in_w_val2 = select(in_w_base_id + in_w_id2 + w,
-1,
(in_w_id2 + w < 0 || in_w_id2 + w >= in_w));
int in_w_val3 = select(in_w_base_id + in_w_id3 + w,
-1,
(in_w_id3 + w < 0 || in_w_id3 + w >= in_w));
int in_w_val4 = select(in_w_base_id + in_w_id4 + w,
-1,
(in_w_id4 + w < 0 || in_w_id4 + w >= in_w));
filter[0] =
READ_IMG_TYPE(CL_DTYPE_CHAR,
filter_image,
sampler,
(int2)(filter_w_val + w,
filter_h_val0 + h)); // in_ch:0-3,out_ch:0
filter[1] =
READ_IMG_TYPE(CL_DTYPE_CHAR,
filter_image,
sampler,
(int2)(filter_w_val + w,
filter_h_val1 + h)); // in_ch:0-3,out_ch:1
filter[2] =
READ_IMG_TYPE(CL_DTYPE_CHAR,
filter_image,
sampler,
(int2)(filter_w_val + w,
filter_h_val2 + h)); // in_ch:0-3,out_ch:2
filter[3] =
READ_IMG_TYPE(CL_DTYPE_CHAR,
filter_image,
sampler,
(int2)(filter_w_val + w,
filter_h_val3 + h)); // in_ch:0-3,out_ch:3
filter_trans[0] = (CL_DTYPE4)(filter[0].x,
filter[1].x,
filter[2].x,
filter[3].x); // in_ch:0,out_ch:0-3
filter_trans[1] = (CL_DTYPE4)(filter[0].y,
filter[1].y,
filter[2].y,
filter[3].y); // in_ch:1,out_ch:0-3
filter_trans[2] = (CL_DTYPE4)(filter[0].z,
filter[1].z,
filter[2].z,
filter[3].z); // in_ch:2,out_ch:0-3
filter_trans[3] = (CL_DTYPE4)(filter[0].w,
filter[1].w,
filter[2].w,
filter[3].w); // in_ch:3,out_ch:0-3
input[0] = READ_IMG_TYPE(
CL_DTYPE_CHAR, input_image, sampler, (int2)(in_w_val0, in_h_val));
input[1] = READ_IMG_TYPE(
CL_DTYPE_CHAR, input_image, sampler, (int2)(in_w_val1, in_h_val));
input[2] = READ_IMG_TYPE(
CL_DTYPE_CHAR, input_image, sampler, (int2)(in_w_val2, in_h_val));
input[3] = READ_IMG_TYPE(
CL_DTYPE_CHAR, input_image, sampler, (int2)(in_w_val3, in_h_val));
input[4] = READ_IMG_TYPE(
CL_DTYPE_CHAR, input_image, sampler, (int2)(in_w_val4, in_h_val));
output[0] = mad(input[0].x, filter_trans[0], output[0]);
output[1] = mad(input[1].x, filter_trans[0], output[1]);
output[2] = mad(input[2].x, filter_trans[0], output[2]);
output[3] = mad(input[3].x, filter_trans[0], output[3]);
output[4] = mad(input[4].x, filter_trans[0], output[4]);
if (ch_surplus < 3) {
output[0] = mad(input[0].y, filter_trans[1], output[0]);
output[1] = mad(input[1].y, filter_trans[1], output[1]);
output[2] = mad(input[2].y, filter_trans[1], output[2]);
output[3] = mad(input[3].y, filter_trans[1], output[3]);
output[4] = mad(input[4].y, filter_trans[1], output[4]);
}
if (ch_surplus < 2) {
output[0] = mad(input[0].z, filter_trans[2], output[0]);
output[1] = mad(input[1].z, filter_trans[2], output[1]);
output[2] = mad(input[2].z, filter_trans[2], output[2]);
output[3] = mad(input[3].z, filter_trans[2], output[3]);
output[4] = mad(input[4].z, filter_trans[2], output[4]);
}
if (ch_surplus < 1) {
output[0] = mad(input[0].w, filter_trans[3], output[0]);
output[1] = mad(input[1].w, filter_trans[3], output[1]);
output[2] = mad(input[2].w, filter_trans[3], output[2]);
output[3] = mad(input[3].w, filter_trans[3], output[3]);
output[4] = mad(input[4].w, filter_trans[3], output[4]);
}
}
}
}
output[0] = activation_type4(output[0]);
output[1] = activation_type4(output[1]);
output[2] = activation_type4(output[2]);
output[3] = activation_type4(output[3]);
output[4] = activation_type4(output[4]);
WRITE_IMG_TYPE(CL_DTYPE_CHAR,
output_image,
(int2)(out_w_base_id + out_w_id0, item_h_id),
output[0]);
if (out_w_id1 < out_w) {
WRITE_IMG_TYPE(CL_DTYPE_CHAR,
output_image,
(int2)(out_w_base_id + out_w_id1, item_h_id),
output[1]);
}
if (out_w_id2 < out_w) {
WRITE_IMG_TYPE(CL_DTYPE_CHAR,
output_image,
(int2)(out_w_base_id + out_w_id2, item_h_id),
output[2]);
}
if (out_w_id3 < out_w) {
WRITE_IMG_TYPE(CL_DTYPE_CHAR,
output_image,
(int2)(out_w_base_id + out_w_id3, item_h_id),
output[3]);
}
if (out_w_id4 < out_w) {
WRITE_IMG_TYPE(CL_DTYPE_CHAR,
output_image,
(int2)(out_w_base_id + out_w_id4, item_h_id),
output[4]);
}
}
\ No newline at end of file
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <cl_common.h>
// opt version of conv7x7
__kernel void conv2d_7x7_opt(__private const int item_ch,
__private const int item_w,
__private const int item_h,
__read_only image2d_t input_image,
__read_only image2d_t filter_image,
#if defined(BIASE_CH) || defined(BIASE_ELE)
__read_only image2d_t bias,
#endif
__write_only image2d_t output_image,
__private const int stride,
__private const int pad,
__private const int dilation,
__private const int batch,
__private const int in_ch,
__private const int in_w,
__private const int in_h,
__private const int out_w,
__private const int out_h) {
const sampler_t sampler =
CLK_NORMALIZED_COORDS_TRUE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;
// filter
const int filter_w = 7;
const int filter_h = 7;
// item_id
const int item_ch_id = get_global_id(0);
const int item_w_id = get_global_id(1);
const int item_h_id = get_global_id(2);
// out_width_id_per_blk and out_batch_id
int out_w_base_id = item_ch_id * out_w;
int out_w_id0 = item_w_id;
int out_w_id1 = out_w_id0 + item_w;
int out_w_id2 = out_w_id1 + item_w;
int out_w_id3 = out_w_id2 + item_w;
int out_w_id4 = out_w_id3 + item_w;
// in_width_id_per_blk and in_height_id_per_batch
int in_h_id = (item_h_id % out_h) * stride - pad;
int in_w_id0 = item_w_id * stride - pad;
int in_w_id1 = in_w_id0 + item_w * stride;
int in_w_id2 = in_w_id1 + item_w * stride;
int in_w_id3 = in_w_id2 + item_w * stride;
int in_w_id4 = in_w_id3 + item_w * stride;
#ifdef BIASE_CH
CL_DTYPE4 output[5];
output[0] =
READ_IMG_TYPE(CL_DTYPE_CHAR, bias, sampler, (int2)(item_ch_id, 0));
output[1] = output[0];
output[2] = output[0];
output[3] = output[0];
output[4] = output[0];
#elif defined(BIASE_ELE)
CL_DTYPE4 output[5];
output[0] = READ_IMG_TYPE(CL_DTYPE_CHAR,
bias,
sampler,
(int2)(out_w_base_id + out_w_id0, item_h_id));
if (out_w_id1 < out_w) {
output[1] = READ_IMG_TYPE(CL_DTYPE_CHAR,
bias,
sampler,
(int2)(out_w_base_id + out_w_id1, item_h_id));
}
if (out_w_id2 < out_w) {
output[2] = READ_IMG_TYPE(CL_DTYPE_CHAR,
bias,
sampler,
(int2)(out_w_base_id + out_w_id2, item_h_id));
}
if (out_w_id3 < out_w) {
output[3] = READ_IMG_TYPE(CL_DTYPE_CHAR,
bias,
sampler,
(int2)(out_w_base_id + out_w_id3, item_h_id));
}
if (out_w_id4 < out_w) {
output[4] = READ_IMG_TYPE(CL_DTYPE_CHAR,
bias,
sampler,
(int2)(out_w_base_id + out_w_id4, item_h_id));
}
#else
CL_DTYPE4 output[5] = {0.0f};
#endif
CL_DTYPE4 filter[4] = {0.0f};
CL_DTYPE4 filter_trans[4] = {0.0f};
CL_DTYPE4 input[5] = {0.0f};
int filter_h_val0 = item_ch_id * 4 * filter_h;
int filter_h_val1 = filter_h_val0 + filter_h;
int filter_h_val2 = filter_h_val1 + filter_h;
int filter_h_val3 = filter_h_val2 + filter_h;
for (int ch = 0; ch < (in_ch + 3) / 4; ch++) {
int ch_surplus = (ch + 1) * 4 - in_ch > 0 ? (ch + 1) * 4 - in_ch : 0;
const int in_w_base_id = mul24(ch, in_w);
int filter_w_val = ch * filter_w;
for (int h = 0; h < filter_h; h++) {
int in_h_val =
select(in_h_id + h, -1, (in_h_id + h < 0 || in_h_id + h >= in_h));
for (int w = 0; w < filter_w; w++) {
int in_w_val0 = select(in_w_base_id + in_w_id0 + w,
-1,
(in_w_id0 + w < 0 || in_w_id0 + w >= in_w));
int in_w_val1 = select(in_w_base_id + in_w_id1 + w,
-1,
(in_w_id1 + w < 0 || in_w_id1 + w >= in_w));
int in_w_val2 = select(in_w_base_id + in_w_id2 + w,
-1,
(in_w_id2 + w < 0 || in_w_id2 + w >= in_w));
int in_w_val3 = select(in_w_base_id + in_w_id3 + w,
-1,
(in_w_id3 + w < 0 || in_w_id3 + w >= in_w));
int in_w_val4 = select(in_w_base_id + in_w_id4 + w,
-1,
(in_w_id4 + w < 0 || in_w_id4 + w >= in_w));
filter[0] =
READ_IMG_TYPE(CL_DTYPE_CHAR,
filter_image,
sampler,
(int2)(filter_w_val + w,
filter_h_val0 + h)); // in_ch:0-3,out_ch:0
filter[1] =
READ_IMG_TYPE(CL_DTYPE_CHAR,
filter_image,
sampler,
(int2)(filter_w_val + w,
filter_h_val1 + h)); // in_ch:0-3,out_ch:1
filter[2] =
READ_IMG_TYPE(CL_DTYPE_CHAR,
filter_image,
sampler,
(int2)(filter_w_val + w,
filter_h_val2 + h)); // in_ch:0-3,out_ch:2
filter[3] =
READ_IMG_TYPE(CL_DTYPE_CHAR,
filter_image,
sampler,
(int2)(filter_w_val + w,
filter_h_val3 + h)); // in_ch:0-3,out_ch:3
filter_trans[0] = (CL_DTYPE4)(filter[0].x,
filter[1].x,
filter[2].x,
filter[3].x); // in_ch:0,out_ch:0-3
filter_trans[1] = (CL_DTYPE4)(filter[0].y,
filter[1].y,
filter[2].y,
filter[3].y); // in_ch:1,out_ch:0-3
filter_trans[2] = (CL_DTYPE4)(filter[0].z,
filter[1].z,
filter[2].z,
filter[3].z); // in_ch:2,out_ch:0-3
filter_trans[3] = (CL_DTYPE4)(filter[0].w,
filter[1].w,
filter[2].w,
filter[3].w); // in_ch:3,out_ch:0-3
input[0] = READ_IMG_TYPE(
CL_DTYPE_CHAR, input_image, sampler, (int2)(in_w_val0, in_h_val));
input[1] = READ_IMG_TYPE(
CL_DTYPE_CHAR, input_image, sampler, (int2)(in_w_val1, in_h_val));
input[2] = READ_IMG_TYPE(
CL_DTYPE_CHAR, input_image, sampler, (int2)(in_w_val2, in_h_val));
input[3] = READ_IMG_TYPE(
CL_DTYPE_CHAR, input_image, sampler, (int2)(in_w_val3, in_h_val));
input[4] = READ_IMG_TYPE(
CL_DTYPE_CHAR, input_image, sampler, (int2)(in_w_val4, in_h_val));
output[0] = mad(input[0].x, filter_trans[0], output[0]);
output[1] = mad(input[1].x, filter_trans[0], output[1]);
output[2] = mad(input[2].x, filter_trans[0], output[2]);
output[3] = mad(input[3].x, filter_trans[0], output[3]);
output[4] = mad(input[4].x, filter_trans[0], output[4]);
if (ch_surplus < 3) {
output[0] = mad(input[0].y, filter_trans[1], output[0]);
output[1] = mad(input[1].y, filter_trans[1], output[1]);
output[2] = mad(input[2].y, filter_trans[1], output[2]);
output[3] = mad(input[3].y, filter_trans[1], output[3]);
output[4] = mad(input[4].y, filter_trans[1], output[4]);
}
if (ch_surplus < 2) {
output[0] = mad(input[0].z, filter_trans[2], output[0]);
output[1] = mad(input[1].z, filter_trans[2], output[1]);
output[2] = mad(input[2].z, filter_trans[2], output[2]);
output[3] = mad(input[3].z, filter_trans[2], output[3]);
output[4] = mad(input[4].z, filter_trans[2], output[4]);
}
if (ch_surplus < 1) {
output[0] = mad(input[0].w, filter_trans[3], output[0]);
output[1] = mad(input[1].w, filter_trans[3], output[1]);
output[2] = mad(input[2].w, filter_trans[3], output[2]);
output[3] = mad(input[3].w, filter_trans[3], output[3]);
output[4] = mad(input[4].w, filter_trans[3], output[4]);
}
}
}
}
output[0] = activation_type4(output[0]);
output[1] = activation_type4(output[1]);
output[2] = activation_type4(output[2]);
output[3] = activation_type4(output[3]);
output[4] = activation_type4(output[4]);
WRITE_IMG_TYPE(CL_DTYPE_CHAR,
output_image,
(int2)(out_w_base_id + out_w_id0, item_h_id),
output[0]);
if (out_w_id1 < out_w) {
WRITE_IMG_TYPE(CL_DTYPE_CHAR,
output_image,
(int2)(out_w_base_id + out_w_id1, item_h_id),
output[1]);
}
if (out_w_id2 < out_w) {
WRITE_IMG_TYPE(CL_DTYPE_CHAR,
output_image,
(int2)(out_w_base_id + out_w_id2, item_h_id),
output[2]);
}
if (out_w_id3 < out_w) {
WRITE_IMG_TYPE(CL_DTYPE_CHAR,
output_image,
(int2)(out_w_base_id + out_w_id3, item_h_id),
output[3]);
}
if (out_w_id4 < out_w) {
WRITE_IMG_TYPE(CL_DTYPE_CHAR,
output_image,
(int2)(out_w_base_id + out_w_id4, item_h_id),
output[4]);
}
}
// support batch > 1
__kernel void conv2d_7x7_multi_batch(__private const int item_ch,
__private const int item_w,
__private const int item_h,
__read_only image2d_t input_image,
__read_only image2d_t filter_image,
#if defined(BIASE_CH) || defined(BIASE_ELE)
__read_only image2d_t bias,
#endif
__write_only image2d_t output_image,
__private const int stride,
__private const int pad,
__private const int dilation,
__private const int batch,
__private const int in_ch,
__private const int in_w,
__private const int in_h,
__private const int out_w,
__private const int out_h) {
const sampler_t sampler =
CLK_NORMALIZED_COORDS_TRUE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;
// filter
const int filter_w = 7;
const int filter_h = 7;
// item_id
const int item_ch_id = get_global_id(0);
const int item_w_id = get_global_id(1);
const int item_h_id = get_global_id(2);
// out_width_id_per_blk and out_batch_id
int out_batch_id = item_h_id / in_h;
int out_w_base_id = item_ch_id * out_w;
int out_w_id0 = item_w_id;
int out_w_id1 = out_w_id0 + item_w;
int out_w_id2 = out_w_id1 + item_w;
int out_w_id3 = out_w_id2 + item_w;
int out_w_id4 = out_w_id3 + item_w;
// in_width_id_per_blk and in_height_id_per_batch
int in_h_id = (item_h_id % out_h) * stride - pad;
int in_w_id0 = item_w_id * stride - pad;
int in_w_id1 = in_w_id0 + item_w * stride;
int in_w_id2 = in_w_id1 + item_w * stride;
int in_w_id3 = in_w_id2 + item_w * stride;
int in_w_id4 = in_w_id3 + item_w * stride;
#ifdef BIASE_CH
CL_DTYPE4 output[5];
output[0] =
READ_IMG_TYPE(CL_DTYPE_CHAR, bias, sampler, (int2)(item_ch_id, 0));
output[1] = output[0];
output[2] = output[0];
output[3] = output[0];
output[4] = output[0];
#elif defined(BIASE_ELE)
CL_DTYPE4 output[5];
output[0] = READ_IMG_TYPE(CL_DTYPE_CHAR,
bias,
sampler,
(int2)(out_w_base_id + out_w_id0, item_h_id));
if (out_w_id1 < out_w) {
output[1] = READ_IMG_TYPE(CL_DTYPE_CHAR,
bias,
sampler,
(int2)(out_w_base_id + out_w_id1, item_h_id));
}
if (out_w_id2 < out_w) {
output[2] = READ_IMG_TYPE(CL_DTYPE_CHAR,
bias,
sampler,
(int2)(out_w_base_id + out_w_id2, item_h_id));
}
if (out_w_id3 < out_w) {
output[3] = READ_IMG_TYPE(CL_DTYPE_CHAR,
bias,
sampler,
(int2)(out_w_base_id + out_w_id3, item_h_id));
}
if (out_w_id4 < out_w) {
output[4] = READ_IMG_TYPE(CL_DTYPE_CHAR,
bias,
sampler,
(int2)(out_w_base_id + out_w_id4, item_h_id));
}
#else
CL_DTYPE4 output[5] = {0.0f};
#endif
CL_DTYPE4 filter[4] = {0.0f};
CL_DTYPE4 filter_trans[4] = {0.0f};
CL_DTYPE4 input[5] = {0.0f};
int filter_h_val0 = item_ch_id * 4 * filter_h;
int filter_h_val1 = filter_h_val0 + filter_h;
int filter_h_val2 = filter_h_val1 + filter_h;
int filter_h_val3 = filter_h_val2 + filter_h;
for (int ch = 0; ch < (in_ch + 3) / 4; ch++) {
int ch_surplus = (ch + 1) * 4 - in_ch > 0 ? (ch + 1) * 4 - in_ch : 0;
const int in_w_base_id = mul24(ch, in_w);
int filter_w_val = ch * filter_w;
for (int h = 0; h < filter_h; h++) {
int in_h_val = select(
out_batch_id * in_h + in_h_id + h,
-1,
(out_batch_id * in_h + in_h_id + h < out_batch_id * in_h ||
out_batch_id * in_h + in_h_id + h >= (out_batch_id + 1) * in_h));
for (int w = 0; w < filter_w; w++) {
int in_w_val0 = select(in_w_base_id + in_w_id0 + w,
-1,
(in_w_id0 + w < 0 || in_w_id0 + w >= in_w));
int in_w_val1 = select(in_w_base_id + in_w_id1 + w,
-1,
(in_w_id1 + w < 0 || in_w_id1 + w >= in_w));
int in_w_val2 = select(in_w_base_id + in_w_id2 + w,
-1,
(in_w_id2 + w < 0 || in_w_id2 + w >= in_w));
int in_w_val3 = select(in_w_base_id + in_w_id3 + w,
-1,
(in_w_id3 + w < 0 || in_w_id3 + w >= in_w));
int in_w_val4 = select(in_w_base_id + in_w_id4 + w,
-1,
(in_w_id4 + w < 0 || in_w_id4 + w >= in_w));
filter[0] =
READ_IMG_TYPE(CL_DTYPE_CHAR,
filter_image,
sampler,
(int2)(filter_w_val + w,
filter_h_val0 + h)); // in_ch:0-3,out_ch:0
filter[1] =
READ_IMG_TYPE(CL_DTYPE_CHAR,
filter_image,
sampler,
(int2)(filter_w_val + w,
filter_h_val1 + h)); // in_ch:0-3,out_ch:1
filter[2] =
READ_IMG_TYPE(CL_DTYPE_CHAR,
filter_image,
sampler,
(int2)(filter_w_val + w,
filter_h_val2 + h)); // in_ch:0-3,out_ch:2
filter[3] =
READ_IMG_TYPE(CL_DTYPE_CHAR,
filter_image,
sampler,
(int2)(filter_w_val + w,
filter_h_val3 + h)); // in_ch:0-3,out_ch:3
filter_trans[0] = (CL_DTYPE4)(filter[0].x,
filter[1].x,
filter[2].x,
filter[3].x); // in_ch:0,out_ch:0-3
filter_trans[1] = (CL_DTYPE4)(filter[0].y,
filter[1].y,
filter[2].y,
filter[3].y); // in_ch:1,out_ch:0-3
filter_trans[2] = (CL_DTYPE4)(filter[0].z,
filter[1].z,
filter[2].z,
filter[3].z); // in_ch:2,out_ch:0-3
filter_trans[3] = (CL_DTYPE4)(filter[0].w,
filter[1].w,
filter[2].w,
filter[3].w); // in_ch:3,out_ch:0-3
input[0] = READ_IMG_TYPE(
CL_DTYPE_CHAR, input_image, sampler, (int2)(in_w_val0, in_h_val));
input[1] = READ_IMG_TYPE(
CL_DTYPE_CHAR, input_image, sampler, (int2)(in_w_val1, in_h_val));
input[2] = READ_IMG_TYPE(
CL_DTYPE_CHAR, input_image, sampler, (int2)(in_w_val2, in_h_val));
input[3] = READ_IMG_TYPE(
CL_DTYPE_CHAR, input_image, sampler, (int2)(in_w_val3, in_h_val));
input[4] = READ_IMG_TYPE(
CL_DTYPE_CHAR, input_image, sampler, (int2)(in_w_val4, in_h_val));
output[0] = mad(input[0].x, filter_trans[0], output[0]);
output[1] = mad(input[1].x, filter_trans[0], output[1]);
output[2] = mad(input[2].x, filter_trans[0], output[2]);
output[3] = mad(input[3].x, filter_trans[0], output[3]);
output[4] = mad(input[4].x, filter_trans[0], output[4]);
if (ch_surplus < 3) {
output[0] = mad(input[0].y, filter_trans[1], output[0]);
output[1] = mad(input[1].y, filter_trans[1], output[1]);
output[2] = mad(input[2].y, filter_trans[1], output[2]);
output[3] = mad(input[3].y, filter_trans[1], output[3]);
output[4] = mad(input[4].y, filter_trans[1], output[4]);
}
if (ch_surplus < 2) {
output[0] = mad(input[0].z, filter_trans[2], output[0]);
output[1] = mad(input[1].z, filter_trans[2], output[1]);
output[2] = mad(input[2].z, filter_trans[2], output[2]);
output[3] = mad(input[3].z, filter_trans[2], output[3]);
output[4] = mad(input[4].z, filter_trans[2], output[4]);
}
if (ch_surplus < 1) {
output[0] = mad(input[0].w, filter_trans[3], output[0]);
output[1] = mad(input[1].w, filter_trans[3], output[1]);
output[2] = mad(input[2].w, filter_trans[3], output[2]);
output[3] = mad(input[3].w, filter_trans[3], output[3]);
output[4] = mad(input[4].w, filter_trans[3], output[4]);
}
}
}
}
output[0] = activation_type4(output[0]);
output[1] = activation_type4(output[1]);
output[2] = activation_type4(output[2]);
output[3] = activation_type4(output[3]);
output[4] = activation_type4(output[4]);
WRITE_IMG_TYPE(CL_DTYPE_CHAR,
output_image,
(int2)(out_w_base_id + out_w_id0, item_h_id),
output[0]);
if (out_w_id1 < out_w) {
WRITE_IMG_TYPE(CL_DTYPE_CHAR,
output_image,
(int2)(out_w_base_id + out_w_id1, item_h_id),
output[1]);
}
if (out_w_id2 < out_w) {
WRITE_IMG_TYPE(CL_DTYPE_CHAR,
output_image,
(int2)(out_w_base_id + out_w_id2, item_h_id),
output[2]);
}
if (out_w_id3 < out_w) {
WRITE_IMG_TYPE(CL_DTYPE_CHAR,
output_image,
(int2)(out_w_base_id + out_w_id3, item_h_id),
output[3]);
}
if (out_w_id4 < out_w) {
WRITE_IMG_TYPE(CL_DTYPE_CHAR,
output_image,
(int2)(out_w_base_id + out_w_id4, item_h_id),
output[4]);
}
}
\ No newline at end of file
......@@ -17,6 +17,7 @@
#include <string>
#include "glog/logging.h"
#include "lite/backends/x86/jit/gen/jitcode.h"
#include "lite/utils/string.h"
namespace paddle {
namespace lite {
......@@ -64,7 +65,7 @@ class VXXJitCode : public JitCode {
base += "_Vec";
}
base += (with_relu_ ? "_Relu" : "");
base += "_D" + std::to_string(num_);
base += "_D" + paddle::lite::to_string(num_);
return base;
}
void genCode() override;
......
......@@ -47,7 +47,7 @@ class EmbSeqPoolJitCode : public JitCode {
} else if (type_ == SeqPoolType::kSqrt) {
base += "_Sqrt";
}
base += ("_W" + std::to_string(tbl_w_));
base += ("_W" + paddle::lite::to_string(tbl_w_));
return base;
}
void genCode() override;
......
......@@ -38,8 +38,8 @@ class MatMulJitCode : public JitCode {
std::string name() const override {
std::string base = "MatMulJitCode";
base = base + "_M" + std::to_string(m_) + "_N" + std::to_string(n_) + "_K" +
std::to_string(k_);
base = base + "_M" + paddle::lite::to_string(m_) + "_N" +
paddle::lite::to_string(n_) + "_K" + paddle::lite::to_string(k_);
return base;
}
void genCode() override;
......
......@@ -47,7 +47,7 @@ class SeqPoolJitCode : public JitCode {
} else if (type_ == SeqPoolType::kSqrt) {
base += "_Sqrt";
}
base += ("_W" + std::to_string(w_));
base += ("_W" + paddle::lite::to_string(w_));
return base;
}
void genCode() override;
......
......@@ -94,9 +94,13 @@ add_custom_command(
OUTPUT ops.h # not a real path to the output to force it execute every time.
)
# generate fake kernels for memory_optimize_tool
#-------------------------------opt----------------------------------------------------------------
# tricks to create header files for opt
add_custom_command(
COMMAND python ${CMAKE_SOURCE_DIR}/lite/tools/cmake_tools/create_fake_kernel_registry.py
${kernels_src_list}
${fake_kernels_src_list}
${CMAKE_BINARY_DIR}/all_kernel_faked.cc
${CMAKE_BINARY_DIR}/kernel_src_map.h
OUTPUT all_kernel_faked.cc # not a real path to the output to force it execute every time.
......@@ -104,12 +108,12 @@ add_custom_command(
add_custom_target(op_list_h DEPENDS ops.h)
add_custom_target(kernel_list_h DEPENDS kernels.h)
add_custom_target(all_kernel_faked_cc DEPENDS all_kernel_faked.cc)
#add_custom_target(opencl_kernels_source_cc DEPENDS opencl_kernels_source.cc)
# create header file to record ops info sorted by supported platforms
add_custom_command(
COMMAND python ${CMAKE_SOURCE_DIR}/lite/tools/cmake_tools/record_supported_kernel_op.py
${kernels_src_list}
${fake_kernels_src_list}
${ops_src_list}
${CMAKE_BINARY_DIR}/supported_kernel_op_info.h
OUTPUT supported_kernel_op_info.h # not a real path to the output to force it execute every time.
......
......@@ -490,7 +490,7 @@ class ContextScheduler {
} break;
#endif
default:
#ifndef LITE_ON_MODEL_OPTIMIZE_TOOL
#if (!defined LITE_ON_MODEL_OPTIMIZE_TOOL) && (!defined LITE_WITH_PYTHON)
LOG(FATAL) << "unsupported target " << TargetToStr(target);
#endif
break;
......
......@@ -48,13 +48,16 @@ std::string Visualize(mir::SSAGraph* graph) {
auto attr_type = op_info->GetAttrType(attr_name);
switch (attr_type) {
case AttrType::INT:
os << ":int:" << std::to_string(op_info->GetAttr<int>(attr_name));
os << ":int:"
<< paddle::lite::to_string(op_info->GetAttr<int>(attr_name));
break;
case AttrType::FLOAT:
os << ":float:" << std::to_string(op_info->GetAttr<float>(attr_name));
os << ":float:"
<< paddle::lite::to_string(op_info->GetAttr<float>(attr_name));
break;
case AttrType::BOOLEAN:
os << ":int:" << std::to_string(op_info->GetAttr<bool>(attr_name));
os << ":int:"
<< paddle::lite::to_string(op_info->GetAttr<bool>(attr_name));
break;
case AttrType::STRING:
os << ":string: \""
......
......@@ -123,7 +123,8 @@ void MemoryOptimizePass::CollectLifeCycleByDevice(
// non-tensor(like tensor_array) variables will not be reused
for (auto& node : graph->nodes()) {
if (node.IsArg() && !node.arg()->type->IsTensor()) {
if (node.IsArg() && (node.arg()->type != nullptr) &&
!node.arg()->type->IsTensor()) {
invalid_var_names.insert(node.arg()->name);
}
}
......@@ -237,7 +238,7 @@ void MemoryOptimizePass::PerformReusePlan(
if (reuse_table.count(name) && reuse_table.at(name) != name) {
auto replace_name = reuse_table.at(name);
input_node->AsArg().name =
replace_name + "(" + std::to_string(node_append_idx) + ")";
replace_name + "(" + paddle::lite::to_string(node_append_idx) + ")";
node_append_idx++;
}
}
......@@ -261,7 +262,7 @@ void MemoryOptimizePass::PerformReusePlan(
if (reuse_table.count(name) && reuse_table.at(name) != name) {
auto replace_name = reuse_table.at(name);
out_node->AsArg().name =
replace_name + "(" + std::to_string(node_append_idx) + ")";
replace_name + "(" + paddle::lite::to_string(node_append_idx) + ")";
node_append_idx++;
}
}
......
......@@ -85,7 +85,7 @@ class Node {
struct Arg {
std::string name;
int id{0};
const Type* type{};
const Type* type{nullptr};
// Weight is a special kind of argument, it is marked as weight explicitly
// so that some weight related optimization can take place.
bool is_weight{false};
......
......@@ -58,6 +58,11 @@ void QuantizedOpAttributesInferencePass::Apply(
}
if (found) {
inst.mutable_op_info()->SetAttr("output_scale", output_scale);
} else if (op_info->HasAttr("output_scale")) {
int bit_length = op_info->GetAttr<int>("bit_length");
int range = (1 << (bit_length - 1)) - 1;
output_scale = op_info->GetAttr<float>("output_scale");
inst.mutable_op_info()->SetAttr("output_scale", output_scale / range);
}
if (op_info->HasAttr("output_scale")) {
inst.mutable_op_info()->SetAttr("enable_int8", true);
......
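For reference, the new else-branch above assumes the recorded output_scale is still expressed in the quantized integer range: with the common bit_length = 8, range = (1 << 7) - 1 = 127, so a stored attribute of, say, 25.4 is rewritten as a per-unit scale of 25.4 / 127 = 0.2.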
......@@ -145,11 +145,12 @@ class StaticKernelPickPass : public mir::StmtPass {
}
VLOG(4) << "[score(final)]:" << final_score;
VLOG(4) << "-------- pick summary --------";
VLOG(4) << " ===> winner_place():" << PrecisionToStr(winner_place.precision)
VLOG(2) << "-------- pick summary for " << instruct.op_type()
<< " --------";
VLOG(2) << " ===> winner_place():" << PrecisionToStr(winner_place.precision)
<< " " << DataLayoutToStr(winner_place.layout) << " "
<< TargetToStr(winner_place.target);
VLOG(4) << " ===> kernel.place():"
VLOG(2) << " ===> kernel.place():"
<< PrecisionToStr(kernel.place().precision) << " "
<< DataLayoutToStr(kernel.place().layout) << " "
<< TargetToStr(kernel.place().target);
......
......@@ -66,11 +66,11 @@ std::string SubgraphVisualizer::operator()() {
} else {
exists_ops[op_type]++;
}
auto op_name = op_type + std::to_string(exists_ops[op_type]);
auto op_name = op_type + paddle::lite::to_string(exists_ops[op_type]);
std::string op_color = "white";
if (subgraph_indices.count(node)) {
auto subgraph_idx = subgraph_indices[node];
op_name += "_subgraph_" + std::to_string(subgraph_idx);
op_name += "_subgraph_" + paddle::lite::to_string(subgraph_idx);
op_color = subgraph_colors[subgraph_idx % subgraph_colors.size()];
}
dot.AddNode(op_name,
......@@ -223,6 +223,7 @@ std::unordered_set<Node *> SubgraphDetector::GetExcludedNodesFromConfigFile() {
std::vector<std::string> lines = ReadLines(config_file_path);
for (std::string line : lines) {
if (line.empty()) continue;
std::vector<std::string> node_info = Split(line, ":");
std::string op_type = node_info.at(0);
std::vector<std::string> in_vars_name;
......@@ -413,7 +414,7 @@ void SubgraphFuser::InsertNewNode(SSAGraph *graph,
cpp::OpDesc subgraph_op_desc;
subgraph_op_desc.SetType("subgraph");
// Create a new sub block desc for storing all of Ops an Vars of the target
// Create a new sub block desc for storing all of Ops and Vars of the target
// subgraph and sub_block_idx is set as a attribute of subgraph op,
// sub_block_idx < 0 means it's a new subgraph op
int sub_block_idx = -(subgraph_idx + 1);
......
......@@ -39,7 +39,7 @@ std::vector<std::string> AddFCDesc(
CHECK_EQ(input_var_names.size(), 1);
CHECK_EQ(wshape.size(), 2);
static int id = 0;
std::string prefix = "fc_" + std::to_string(id);
std::string prefix = "fc_" + paddle::lite::to_string(id);
auto* op_desc = block_desc->AddOp<cpp::OpDesc>();
auto* wgt = block_desc->AddVar<cpp::VarDesc>();
......@@ -76,7 +76,7 @@ std::vector<std::string> AddElementwiseAddDesc(
const std::vector<std::string>& input_Y_names) {
// CHECK_EQ(input_var_names.size(), 2);
static int id = 0;
std::string prefix = "elementwise_add_" + std::to_string(id);
std::string prefix = "elementwise_add_" + paddle::lite::to_string(id);
auto* op_desc = block_desc->AddOp<cpp::OpDesc>();
auto* out = block_desc->AddVar<cpp::VarDesc>();
......@@ -100,7 +100,7 @@ std::vector<std::string> AddFeedDesc(
const std::vector<std::string>& input_X_names) {
// CHECK_EQ(input_var_names.size(), 1);
static int id = 0;
std::string prefix = "feed_" + std::to_string(id);
std::string prefix = "feed_" + paddle::lite::to_string(id);
auto* op_desc = block_desc->AddOp<cpp::OpDesc>();
auto* out = block_desc->AddVar<cpp::VarDesc>();
......@@ -123,7 +123,7 @@ std::vector<std::string> AddFetchDesc(
const std::vector<std::string>& input_X_names) {
// CHECK_EQ(input_var_names.size(), 1);
static int id = 0;
std::string prefix = "fetch_" + std::to_string(id);
std::string prefix = "fetch_" + paddle::lite::to_string(id);
auto* op_desc = block_desc->AddOp<cpp::OpDesc>();
auto* out = block_desc->AddVar<cpp::VarDesc>();
......
......@@ -17,6 +17,7 @@
#include "lite/api/paddle_api.h"
#include "lite/api/test_helper.h"
#include "lite/utils/cp_logging.h"
#include "lite/utils/string.h"
DEFINE_string(model_file, "", "model file path of combined protobuf model");
DEFINE_string(params_file, "", "params file path of combined protobuf model");
......@@ -31,43 +32,17 @@ namespace lite {
// The helper functions for loading and running model from command line and
// verifying output data
std::vector<std::string> TypeParsing(std::string text) {
std::vector<std::string> types;
while (!text.empty()) {
size_t index = text.find_first_of(":");
std::string type = text.substr(0, index);
VLOG(3) << type;
types.push_back(type);
if (index == std::string::npos) {
break;
} else {
text = text.substr(index + 1);
}
}
return types;
return Split(text, ":");
}
std::vector<std::vector<int64_t>> ShapeParsing(std::string text) {
std::vector<std::vector<int64_t>> shapes;
while (!text.empty()) {
size_t index = text.find_first_of(":");
std::string slice = text.substr(0, index);
std::vector<int64_t> shape;
while (!slice.empty()) {
size_t index = slice.find_first_of(",");
int d = atoi(slice.substr(0, index).c_str());
VLOG(3) << d;
shape.push_back(d);
if (index == std::string::npos) {
break;
} else {
slice = slice.substr(index + 1);
}
}
shapes.push_back(shape);
if (index == std::string::npos) {
break;
} else {
text = text.substr(index + 1);
std::vector<std::string> shape_strings = Split(text, ":");
shapes.resize(shape_strings.size());
for (int i = 0; i < shape_strings.size(); i++) {
std::vector<std::string> shape_nums = Split(shape_strings[i], ",");
for (auto shape_num : shape_nums) {
shapes[i].push_back(atoi(shape_num.c_str()));
}
}
return shapes;
......
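A rough illustration of the rewritten parsers (the input strings here are hypothetical; TypeParsing, ShapeParsing, and Split come from the patch):

  // TypeParsing("float32:int64")     -> {"float32", "int64"}
  // ShapeParsing("1,3,224,224:1,10") -> {{1, 3, 224, 224}, {1, 10}}
  std::vector<std::vector<int64_t>> shapes = ShapeParsing("1,3,224,224:1,10");
  CHECK_EQ(shapes.size(), 2u);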
......@@ -41,8 +41,9 @@ void TypeLayoutTransformPass::Apply(const std::unique_ptr<SSAGraph>& graph) {
VLOG(4) << "!node->IsStmt():" << !node->IsStmt();
if (!node->IsStmt() || node->AsStmt().op_type() == "while") continue;
auto inlinks = node->inlinks;
VLOG(4) << "node->AsStmt().desc:" << node->AsStmt().desc
<< " inlinks.size():" << inlinks.size();
VLOG(4) << "============== node->AsStmt().op_type():"
<< node->AsStmt().op_type() << " inlinks.size():" << inlinks.size()
<< " ================";
for (auto* in : inlinks) {
ComplementInputs(graph.get(), node, in);
}
......@@ -68,13 +69,25 @@ void TypeLayoutTransformPass::ComplementInputs(SSAGraph* graph,
CHECK(inst.op_info()->GetInputArgname(in_arg_name, &inst_in_tensor_name));
auto decl_arg_type =
inst.picked_kernel().GetInputDeclType(inst_in_tensor_name);
CHECK(in->AsArg().type);
VLOG(5) << "\n inst_in_tensor_name:" << inst_in_tensor_name
VLOG(3) << "\n inst_in_tensor_name:" << inst_in_tensor_name
<< "\n in->AsArg().name:" << in->AsArg().name
<< "\n *in->AsArg().type:" << *in->AsArg().type
<< "\n *decl_arg_type:" << *decl_arg_type
<< "\n inst.op()->DebugString():" << inst.op()->DebugString();
  // TODO(ysh329): a tensor whose target is kARM but whose layout is
  // kImageDefault (an OpenCL layout) conflicts here. This check is not an
  // ideal fix, but the root cause between static_kernel_pick_pass and this
  // pass has not been located yet.
auto* in_arg_type = const_cast<Type*>(in->AsArg().type);
if (in_arg_type->target() == TARGET(kARM) &&
in_arg_type->layout() == DATALAYOUT(kImageDefault)) {
return;
}
if (!DataLayoutCompatible(*in->AsArg().type, *decl_arg_type)) {
VLOG(4) << "found Layout unmatched tensor: " << in->AsArg().name
<< " for kernel " << inst.op()->DebugString() << " "
......
......@@ -201,7 +201,8 @@ void PrecisionCastPass::AddCastInst(const Type& from,
CHECK(in->IsArg());
// auto node_id = [&] { return graph->nodes().size(); };
auto cast_op_output_name = in->AsArg().name + "/precision_trans";
// in->AsArg().name + "/precision_trans/" + std::to_string(node_id());
// in->AsArg().name + "/precision_trans/" +
// paddle::lite::to_string(node_id());
auto* cast_op_output_arg = graph->NewArgumentNode(cast_op_output_name);
cast_op_output_arg->AsArg().type =
LiteType::GetTensorTy(from.target(), to.precision(), from.layout());
......
......@@ -65,6 +65,7 @@ class OpLite : public Registry {
virtual bool CheckShape() const { return true; }
// Inference the outputs' shape.
virtual bool InferShape() const { return true; }
virtual bool SmartInferShape() { return this->InferShape(); }
// Run this operator.
virtual bool Run();
// Indicate whether the Op runs only once or not
......@@ -150,6 +151,10 @@ class OpLite : public Registry {
std::vector<Place> valid_places_;
Place kernel_place_{TARGET(kHost), PRECISION(kFloat)};
std::unique_ptr<OpInfo> op_info_;
std::vector<DDimLite> last_input_shapes;
std::vector<DDimLite> last_output_shapes;
std::vector<std::vector<std::vector<uint64_t>>> last_output_lods;
std::vector<std::vector<std::vector<uint64_t>>> last_input_lods;
};
/*
......
......@@ -22,18 +22,25 @@
#include <vector>
#include "lite/core/program.h"
#ifdef LITE_WITH_OPENCL
#include "lite/backends/opencl/cl_image_converter.h"
#include "lite/backends/opencl/cl_include.h"
#include "lite/kernels/opencl/image_helper.h"
#endif
namespace paddle {
namespace lite {
namespace profile {
template <typename dtype>
static void write_tensorfile(const Tensor* tensor, const std::string& locate) {
static bool write_tensorfile(const Tensor* tensor, const std::string& locate) {
if (locate.find('/') != std::string::npos) {
return;
return false;
}
FILE* fp = fopen(locate.c_str(), "w");
if (fp == nullptr) {
LOG(ERROR) << "file open field " << locate;
return false;
} else {
const dtype* data = tensor->data<dtype>();
for (int i = 0; i < tensor->numel(); ++i) {
......@@ -41,63 +48,227 @@ static void write_tensorfile(const Tensor* tensor, const std::string& locate) {
}
}
fclose(fp);
return true;
}
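Because write_tensorfile now reports success, a caller can gate the dump on a flag in a single expression. The snippet below only restates the idiom used later in this diff; write_result_to_file, tout and out_name are the names from that later code, shown here for illustration.

// && short-circuits: write_tensorfile is only called (and the file only
// written) when write_result_to_file is true.
write_result_to_file && write_tensorfile<float>(tout, out_name);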
class PrecisionProfiler {
public:
explicit PrecisionProfiler(const Instruction* inst) : inst_(inst) {}
~PrecisionProfiler() {
LOG(INFO) << ">> Running kernel: " << inst_->op()->op_info()->Repr()
<< " on Target " << TargetToStr(inst_->kernel()->target()) << " "
<< PrecisionToStr(inst_->kernel()->precision());
auto tensor_mean = [](const Tensor* in,
PrecisionType ptype,
std::string name = "inst") -> double {
if (!in->data<int8_t>()) {
return -99999;
}
double sum = 0.;
switch (ptype) {
  // TODO(ysh329): need to remove `explicit PrecisionProfiler`;
  // keep this method only for arm/math/conditional_block_compute
explicit PrecisionProfiler(const Instruction* inst) {
std::string inst_precison_str = GetInstPrecision(inst);
}
PrecisionProfiler() {}
std::string GetSummaryHeader() {
using std::setw;
using std::left;
using std::fixed;
STL::stringstream ss;
ss << "========================================= "
<< "Detailed Precision Profiler Summary "
<< "=========================================" << std::endl;
ss << setw(45) << left << "operator:(kernel_info)"
<< " " << setw(70) << left << "output_tensor_name:(tensor_info)"
<< " " << setw(15) << left << "dims"
<< " " << setw(15) << left << "mean"
<< " " << setw(15) << left << "std_deviation"
<< " " << setw(15) << left << "ave_grow_rate*" << std::endl;
return ss.str();
}
template <typename T>
double compute_mean(const T* in, const size_t length) {
double sum = 0.;
for (size_t i = 0; i < length; ++i) {
sum += in[i];
}
return sum / length;
}
template <typename T>
double compute_standard_deviation(const T* in,
const size_t length,
bool has_mean = false,
double mean = 10000) {
if (!has_mean) {
mean = compute_mean<T>(in, length);
}
double variance = 0.;
for (size_t i = 0; i < length; ++i) {
variance += pow((in[i] - mean), 2);
}
variance /= length;
return sqrt(variance);
}
template <typename T>
double compute_average_grow_rate(const T* in, const size_t length) {
const double eps = 1e-5;
double ave_grow_rate = 0.0f;
for (size_t i = 1; i < length; ++i) {
ave_grow_rate += (in[i] - in[i - 1]) / (in[i - 1] + eps);
}
ave_grow_rate /= length;
return ave_grow_rate;
}
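A small worked example of the three statistics above, with arbitrary values: for {1, 2, 4, 8} the mean is 3.75, the standard deviation is sqrt(28.75 / 4) ≈ 2.68, and the average grow rate sums (2-1)/1 + (4-2)/2 + (8-4)/4 ≈ 3 (ignoring eps) and divides by the length, giving ≈ 0.75.

  // Illustration only; values are arbitrary.
  paddle::lite::profile::PrecisionProfiler p;
  const float v[] = {1.f, 2.f, 4.f, 8.f};
  double m = p.compute_mean<float>(v, 4);                         // 3.75
  double s = p.compute_standard_deviation<float>(v, 4, true, m);  // ~2.68
  double g = p.compute_average_grow_rate<float>(v, 4);            // ~0.75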
  // check whether the output tensor is unused
bool is_unused(const Tensor* in) {
if (!in->data<int8_t>()) {
return true;
}
return false;
}
void compute_tensor_precision_info(const Tensor* in,
TargetType target_type,
PrecisionType precision_type,
DataLayoutType layout_type,
double* mean,
double* std_dev,
double* ave_grow_rate,
std::string name = "inst",
bool write_result_to_file = false) {
std::string unsupported_error_log =
"Unsupported precision profile for kernel registered on" +
TargetToStr(target_type) + "/" + PrecisionToStr(precision_type) + "/" +
DataLayoutToStr(layout_type);
if (target_type == TARGET(kARM) || target_type == TARGET(kHost) ||
target_type == TARGET(kX86)) {
switch (precision_type) {
case PRECISION(kFloat): {
auto ptr = in->data<float>();
// write_tensorfile<float>(in, name);
for (int i = 0; i < in->numel(); ++i) {
sum += ptr[i];
}
return sum / in->numel();
*mean = compute_mean<float>(ptr, in->numel());
*std_dev =
compute_standard_deviation<float>(ptr, in->numel(), true, *mean);
*ave_grow_rate = compute_average_grow_rate<float>(ptr, in->numel());
write_result_to_file&& write_tensorfile<float>(in, name);
return;
}
case PRECISION(kAny): {
auto ptr = in->data<float>();
// write_tensorfile<float>(in, name);
for (int i = 0; i < in->numel(); ++i) {
sum += ptr[i];
}
return sum / in->numel();
*mean = compute_mean<float>(ptr, in->numel());
*std_dev =
compute_standard_deviation<float>(ptr, in->numel(), true, *mean);
*ave_grow_rate = compute_average_grow_rate<float>(ptr, in->numel());
write_result_to_file&& write_tensorfile<float>(in, name);
return;
}
case PRECISION(kInt8): {
auto ptr = in->data<int8_t>();
// write_tensorfile<int8_t>(in, name);
for (int i = 0; i < in->numel(); ++i) {
sum += ptr[i];
}
return sum / in->numel();
*mean = compute_mean<int8_t>(ptr, in->numel());
*std_dev =
compute_standard_deviation<int8_t>(ptr, in->numel(), true, *mean);
*ave_grow_rate = compute_average_grow_rate<int8_t>(ptr, in->numel());
write_result_to_file&& write_tensorfile<int8_t>(in, name);
return;
}
case PRECISION(kInt32): {
auto ptr = in->data<int32_t>();
// write_tensorfile<int32_t>(in, name);
for (int i = 0; i < in->numel(); ++i) {
sum += ptr[i];
}
return sum / in->numel();
*mean = compute_mean<int32_t>(ptr, in->numel());
*std_dev = compute_standard_deviation<int32_t>(
ptr, in->numel(), true, *mean);
*ave_grow_rate = compute_average_grow_rate<int32_t>(ptr, in->numel());
write_result_to_file&& write_tensorfile<int32_t>(in, name);
return;
}
default:
LOG(INFO) << "unsupport data type: " << PrecisionToStr(ptype);
return 0.;
*mean = -333333333333;
*std_dev = -33333333333;
*ave_grow_rate = -33333333333;
LOG(ERROR) << unsupported_error_log;
return;
}
};
if (inst_->op()->op_info()->Type() != "fetch") {
auto op = const_cast<lite::OpLite*>(inst_->op());
auto kernel = inst_->kernel();
#ifdef LITE_WITH_OPENCL
} else if (target_type == TARGET(kOpenCL)) {
switch (layout_type) {
case DATALAYOUT(kImageDefault): {
paddle::lite::CLImageConverterDefault default_convertor;
auto image_shape = default_convertor.InitImageDimInfoWith(in->dims());
size_t im_w = image_shape[0];
size_t im_h = image_shape[1];
VLOG(1) << "image shape(W,H) of " << name << ": " << im_w << " "
<< im_h;
std::vector<uint16_t> in_data_v(im_w * im_h * 4);
std::vector<float> real_out_v(in->numel());
const size_t cl_image2d_row_pitch{0};
const size_t cl_image2d_slice_pitch{0};
TargetWrapperCL::ImgcpySync(in_data_v.data(),
in->data<uint16_t, cl::Image2D>(),
im_w,
im_h,
cl_image2d_row_pitch,
cl_image2d_slice_pitch,
IoDirection::DtoH);
default_convertor.ImageToNCHW(
in_data_v.data(), real_out_v.data(), image_shape, in->dims());
CHECK(real_out_v.size() == in->numel());
*mean = compute_mean<float>(real_out_v.data(), real_out_v.size());
*std_dev = compute_standard_deviation<float>(
real_out_v.data(), in->numel(), true, *mean);
*ave_grow_rate = compute_average_grow_rate<float>(real_out_v.data(),
real_out_v.size());
write_result_to_file&& write_tensorfile<float>(in, name);
return;
}
case DATALAYOUT(kNCHW): {
std::vector<float> in_data_v(in->numel(), 0);
TargetWrapperCL::MemcpySync(in_data_v.data(),
in->data<float>(),
in->numel() * sizeof(float),
IoDirection::DtoH);
VLOG(1) << name << ":" << in->numel();
*mean = compute_mean<float>(in_data_v.data(), in->numel());
*std_dev = compute_standard_deviation<float>(
in_data_v.data(), in->numel(), true, *mean);
*ave_grow_rate =
compute_average_grow_rate<float>(in_data_v.data(), in->numel());
write_result_to_file&& write_tensorfile<float>(in, name);
return;
}
default:
*mean = -222222222222;
*std_dev = -22222222222;
*ave_grow_rate = -22222222222;
LOG(ERROR) << unsupported_error_log;
return;
}
#endif
} else {
*mean = -111111111111;
*std_dev = -11111111111;
*ave_grow_rate = -11111111111;
LOG(ERROR) << unsupported_error_log;
return;
}
}
std::string GetInstPrecision(const Instruction* inst = nullptr) {
using std::setw;
using std::left;
using std::fixed;
STL::stringstream ss;
bool write_result_to_file = false;
VLOG(1) << ">> Running kernel: " << inst->op()->op_info()->Repr()
<< " registered on " << TargetToStr(inst->kernel()->target()) << "/"
<< PrecisionToStr(inst->kernel()->precision()) << "/"
<< DataLayoutToStr(inst->kernel()->layout());
std::string kernel_repr = inst->op()->op_info()->Repr();
std::string kernel_place = TargetToStr(inst->kernel()->target()) + "/" +
PrecisionToStr(inst->kernel()->precision()) +
"/" + DataLayoutToStr(inst->kernel()->layout());
std::string op_name = inst->op()->op_info()->Type();
if (inst->op()->op_info()->Type() != "fetch") {
auto op = const_cast<lite::OpLite*>(inst->op());
auto kernel = inst->kernel();
auto op_scope = op->scope();
auto out_names = op->op_info()->output_names();
for (auto& out_name : out_names) {
......@@ -106,32 +277,90 @@ class PrecisionProfiler {
auto type = kernel->GetOutputDeclType(out_arg_name);
if (type->IsTensor()) {
auto tout = op_scope->FindVar(out_name)->GetMutable<Tensor>();
double mean = tensor_mean(tout, type->precision(), out_name);
LOG(INFO) << "output name: " << out_name << ", dims: " << tout->dims()
<< ", precision: " << PrecisionToStr(type->precision())
<< ", mean value: " << mean << " shape:" << tout->dims();
const Tensor* tout =
op_scope->FindVar(out_name)->GetMutable<Tensor>();
double mean = -999999;
double std_dev = -100000;
double ave_grow_rate = 99999;
std::string mean_str{"unused"};
std::string std_dev_str{"unused"};
std::string ave_grow_rate_str{"unused"};
if (!is_unused(tout)) {
compute_tensor_precision_info(tout,
type->target(),
type->precision(),
type->layout(),
&mean,
&std_dev,
&ave_grow_rate,
out_name,
write_result_to_file);
mean_str = std::to_string(mean);
std_dev_str = std::to_string(std_dev);
ave_grow_rate_str = std::to_string(ave_grow_rate);
}
std::string kernel_info = op_name + ":" + kernel_place;
std::string output_arg_info = out_name + ":" +
TargetToStr(type->target()) + "/" +
PrecisionToStr(type->precision()) +
"/" + DataLayoutToStr(type->layout());
ss << setw(45) << left << kernel_info << " " << setw(70) << left
<< output_arg_info << " " << setw(15) << left << tout->dims()
<< " " << setw(15) << left << mean_str << " " << setw(15) << left
<< std_dev_str << " " << setw(15) << left << ave_grow_rate_str
<< std::endl;
} else if (type->IsTensorList()) {
auto tout =
auto touts =
op_scope->FindVar(out_name)->GetMutable<std::vector<Tensor>>();
for (auto& t : *tout) {
double mean = tensor_mean(&t, type->precision(), out_name);
LOG(INFO) << "output name: " << out_name << ", dims: " << t.dims()
<< ", precision: " << PrecisionToStr(type->precision())
<< ", mean value: " << mean;
for (auto t : *touts) {
const Tensor* tout = &t;
double mean = -999999;
double std_dev = -100000;
double ave_grow_rate = 99999;
std::string mean_str{"unused"};
std::string std_dev_str{"unused"};
std::string ave_grow_rate_str{"unused"};
if (!is_unused(tout)) {
compute_tensor_precision_info(tout,
type->target(),
type->precision(),
type->layout(),
&mean,
&std_dev,
&ave_grow_rate,
out_name,
write_result_to_file);
mean_str = std::to_string(mean);
std_dev_str = std::to_string(std_dev);
ave_grow_rate_str = std::to_string(ave_grow_rate);
}
std::string kernel_info = op_name + ":" + kernel_place;
std::string output_arg_info = out_name + ":" +
TargetToStr(type->target()) + "/" +
PrecisionToStr(type->precision()) +
"/" + DataLayoutToStr(type->layout());
ss << setw(45) << left << kernel_info << " " << setw(70) << left
<< output_arg_info << " " << setw(15) << left << tout->dims()
<< " " << setw(15) << left << mean_str << " " << setw(15) << left
<< std_dev_str << " " << setw(15) << left << ave_grow_rate_str
<< std::endl;
}
}
}
}
return ss.str();
}
private:
const Instruction* inst_{nullptr};
};
} // namespace profile
} // namespace lite
} // namespace paddle
// TODO(ysh329): need to remove.
// keep this method only for arm/math/conditional_block_compute
#define LITE_PRECISION_PROFILE(inst) \
{ auto a = paddle::lite::profile::PrecisionProfiler(&inst); }
......@@ -136,6 +136,14 @@ void RuntimeProgram::UpdateVarsOfProgram(cpp::ProgramDesc* desc) {
}
void RuntimeProgram::Run() {
#ifdef LITE_WITH_PROFILE
#ifdef LITE_WITH_PRECISION_PROFILE
auto inst_precision_profiler = paddle::lite::profile::PrecisionProfiler();
std::string precision_profiler_summary =
inst_precision_profiler.GetSummaryHeader();
#endif
#endif
for (auto& inst : instructions_) {
#ifndef LITE_WITH_FPGA
if (inst.is_feed_fetch_op()) continue;
......@@ -144,13 +152,17 @@ void RuntimeProgram::Run() {
#ifdef LITE_WITH_PROFILE
#ifdef LITE_WITH_PRECISION_PROFILE
#ifndef LITE_WITH_FPGA
LITE_PRECISION_PROFILE(inst)
precision_profiler_summary +=
inst_precision_profiler.GetInstPrecision(&inst);
#endif
#endif // LITE_WITH_PRECISION_PROFILE
#endif // LITE_WITH_PROFILE
}
#ifdef LITE_WITH_PROFILE
LOG(INFO) << "\n" << profiler_.Summary(profile::Type::kDispatch, false, 0);
#ifdef LITE_WITH_PRECISION_PROFILE
LOG(INFO) << "\n" << precision_profiler_summary;
#endif // LITE_WITH_PRECISION_PROFILE
#endif // LITE_WITH_PROFILE
}
......@@ -274,7 +286,8 @@ void Instruction::Run() {
return;
}
op_->InferShape();
// op_->InferShape();
op_->SmartInferShape();
kernel_->Launch();
has_run_ = true;
}
......
......@@ -30,9 +30,9 @@ Program FakeProgram() {
auto add_fc = [&](int id, std::string x) {
// create variables
std::string w1 = "w" + std::to_string(id);
std::string b1 = "b" + std::to_string(id);
std::string out1 = "out" + std::to_string(id);
std::string w1 = "w" + paddle::lite::to_string(id);
std::string b1 = "b" + paddle::lite::to_string(id);
std::string out1 = "out" + paddle::lite::to_string(id);
auto w1v = program.scope()->Var(w1)->GetMutable<lite::Tensor>();
auto b1v = program.scope()->Var(b1)->GetMutable<lite::Tensor>();
auto out1v = program.scope()->Var(out1)->GetMutable<lite::Tensor>();
......
......@@ -53,9 +53,9 @@ static std::string version() {
static int64_t int_version(const std::string& version) {
const std::vector<std::string> vec = Split(version, ".");
if (vec.size() == 3) {
return std::stoi(vec[0]) * MAJOR_COEFF +
std::stoi(vec[1]) * MINOR_COEFF +
std::stoi(vec[2]) * PATCH_COEFF;
return atoi(vec[0].c_str()) * MAJOR_COEFF +
atoi(vec[1].c_str()) * MINOR_COEFF +
atoi(vec[2].c_str()) * PATCH_COEFF;
}
return -1;
}
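For illustration only, assuming hypothetical coefficient values MAJOR_COEFF = 1000000, MINOR_COEFF = 1000 and PATCH_COEFF = 1 (the real values are defined elsewhere in this file and may differ), the conversion behaves as follows:

// Hypothetical coefficients, illustration only.
// int_version("2.6.1") == 2 * 1000000 + 6 * 1000 + 1 == 2006001
// int_version("2.6")   == -1   (not a three-part version string)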
......
......@@ -207,7 +207,8 @@ void RunModel(std::string det_model_file,
cv::Mat roi = crop_img(img, rec_clip, classify_w, classify_h);
    // uncomment the lines below to save the roi image to disk
// std::string roi_name = "roi_" + std::to_string(i) + ".jpg";
// std::string roi_name = "roi_" + paddle::lite::to_string(i)
// + ".jpg";
// imwrite(roi_name, roi);
// Do PreProcess
......
......@@ -14,6 +14,7 @@
#include <sys/time.h>
#include <time.h>
#include <cmath>
#include <iostream>
#include <string>
#include <vector>
......@@ -36,6 +37,32 @@ std::string ShapePrint(const shape_t& shape) {
return shape_str;
}
template <typename T>
double compute_mean(const T* in, const size_t length) {
double sum = 0.;
for (size_t i = 0; i < length; ++i) {
sum += in[i];
}
return sum / length;
}
template <typename T>
double compute_standard_deviation(const T* in,
const size_t length,
bool has_mean = false,
double mean = 10000) {
if (!has_mean) {
mean = compute_mean<T>(in, length);
}
double variance = 0.;
for (size_t i = 0; i < length; ++i) {
variance += pow((in[i] - mean), 2);
}
variance /= length;
return sqrt(variance);
}
inline double GetCurrentUS() {
struct timeval time;
gettimeofday(&time, NULL);
......@@ -101,24 +128,24 @@ void RunModel(std::string model_dir,
// 5. Get output
std::cout << "\n====== output summary ====== " << std::endl;
size_t output_tensor_num = predictor->GetOutputNames().size();
std::cout << "output tesnor num:" << output_tensor_num << std::endl;
std::cout << "output tensor num:" << output_tensor_num << std::endl;
for (size_t tidx = 0; tidx < output_tensor_num; ++tidx) {
std::unique_ptr<const paddle::lite_api::Tensor> output_tensor =
predictor->GetOutput(tidx);
std::cout << "\n--- output tensor " << tidx << " ---" << std::endl;
auto out_shape = output_tensor->shape();
std::cout << "out_shape(NCHW):" << ShapePrint(out_shape) << std::endl;
auto out_data = output_tensor->data<float>();
auto out_mean = compute_mean<float>(out_data, ShapeProduction(out_shape));
auto out_std_dev = compute_standard_deviation<float>(
out_data, ShapeProduction(out_shape), true, out_mean);
float sum = 0.f;
for (int i = 0; i < ShapeProduction(out_shape); ++i) {
sum += output_tensor->data<float>()[i];
}
std::cout << "output shape(NCHW):" << ShapePrint(out_shape) << std::endl;
std::cout << "output tensor " << tidx
<< " elem num:" << ShapeProduction(out_shape) << std::endl;
std::cout << "output tensor " << tidx << " sum value:" << sum << std::endl;
std::cout << "output tensor " << tidx
<< " mean value:" << sum / ShapeProduction(out_shape)
<< " standard deviation:" << out_std_dev << std::endl;
std::cout << "output tensor " << tidx << " mean value:" << out_mean
<< std::endl;
// print output
......
......@@ -111,11 +111,11 @@ void Module::AddOpDescHelper(const std::string &op_id,
switch (type) {
case AttrType::INT:
return std::to_string(desc.GetAttr<int>(name));
return paddle::lite::to_string(desc.GetAttr<int>(name));
case AttrType::FLOAT:
return std::to_string(desc.GetAttr<float>(name));
return paddle::lite::to_string(desc.GetAttr<float>(name));
case AttrType::BOOLEAN:
return std::to_string(desc.GetAttr<bool>(name));
return paddle::lite::to_string(desc.GetAttr<bool>(name));
case AttrType::STRING:
return "\"" + desc.GetAttr<std::string>(name) + "\"";
case AttrType::FLOATS: {
......
......@@ -153,16 +153,16 @@ class Module {
private:
std::string WeightUniqueName() const {
return "w_" + std::to_string(weight_counter_++);
return "w_" + paddle::lite::to_string(weight_counter_++);
}
std::string TmpVarUniqueName() const {
return "tmp_" + std::to_string(tmp_var_counter_++);
return "tmp_" + paddle::lite::to_string(tmp_var_counter_++);
}
std::string OpUniqueName() const {
return "op_" + std::to_string(op_counter_++);
return "op_" + paddle::lite::to_string(op_counter_++);
}
std::string KernelUniqueName() const {
return "kernel_" + std::to_string(kernel_counter_++);
return "kernel_" + paddle::lite::to_string(kernel_counter_++);
}
std::string DataRepr(const std::string &raw_data, PrecisionType dtype);
......
# NOTE we leave the add_kernel not protected by LITE_WITH_LIGHT_WEIGHT_FRAMEWORK so that all the kernels will be registered
# to the model_optimize_tool.
if((NOT LITE_ON_MODEL_OPTIMIZE_TOOL) AND (NOT (LITE_WITH_LIGHT_WEIGHT_FRAMEWORK AND LITE_WITH_ARM)))
if((NOT LITE_ON_MODEL_OPTIMIZE_TOOL) AND (NOT LITE_WITH_PYTHON) AND (NOT (LITE_WITH_LIGHT_WEIGHT_FRAMEWORK AND LITE_WITH_ARM)))
return()
endif()
......@@ -109,6 +109,8 @@ add_kernel(mean_compute_arm ARM extra SRCS mean_compute.cc DEPS ${lite_kernel_de
if(LITE_WITH_TRAIN)
add_kernel(mean_grad_compute_arm ARM extra SRCS mean_grad_compute.cc DEPS ${lite_kernel_deps} math_arm)
add_kernel(activation_grad_compute_arm ARM basic SRCS activation_grad_compute.cc DEPS ${lite_kernel_deps} math_arm)
add_kernel(elementwise_grad_compute_arm ARM basic SRCS elementwise_grad_compute.cc DEPS ${lite_kernel_deps} math_arm)
add_kernel(mul_grad_compute_arm ARM extra SRCS mul_grad_compute.cc DEPS ${lite_kernel_deps} math_arm)
add_kernel(sgd_compute_arm ARM extra SRCS sgd_compute.cc DEPS ${lite_kernel_deps} math_arm)
endif()
......
// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "lite/kernels/arm/elementwise_grad_compute.h"
#include <string>
#include <vector>
#include "lite/backends/arm/math/funcs.h"
namespace paddle {
namespace lite {
namespace kernels {
namespace arm {
inline DDim trim_trailing_singular_dims(const DDim& dims) {
// Remove trailing dimensions of size 1 for y
auto actual_dims_size = dims.size();
for (; actual_dims_size != 0; --actual_dims_size) {
if (dims[actual_dims_size - 1] != 1) break;
}
std::vector<int64_t> trim_dims;
trim_dims.resize(actual_dims_size);
for (int i = 0; i < actual_dims_size; ++i) {
trim_dims[i] = dims[i];
}
if (trim_dims.size() == 0) {
return DDim();
}
return DDim(trim_dims);
}
inline bool is_broadcast(const DDim& x_dims,
const DDim& y_dims,
int axis,
int* pre,
int* n,
int* post) {
if (axis < 0) {
axis = x_dims.size() - y_dims.size();
}
DDim y_dim_trim = trim_trailing_singular_dims(y_dims);
axis = (y_dim_trim.size() == 0) ? x_dims.size() : axis;
if (x_dims.size() == y_dim_trim.size()) {
return false;
}
*pre = 1;
*n = 1;
*post = 1;
for (int i = 0; i < axis; ++i) {
(*pre) *= x_dims[i];
}
for (int i = 0; i < y_dim_trim.size(); ++i) {
CHECK_EQ(x_dims[i + axis], y_dim_trim[i])
<< "Broadcast dimension mismatch.";
(*n) *= y_dim_trim[i];
}
for (int i = axis + y_dim_trim.size(); i < x_dims.size(); ++i) {
(*post) *= x_dims[i];
}
return true;
}
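A worked example of the two helpers above, with arbitrary shapes: for x_dims = {2, 3, 4, 5}, y_dims = {3, 4, 1} and axis = 1, trim_trailing_singular_dims drops the trailing 1 so y becomes {3, 4}; is_broadcast then returns true with pre = 2 (product of the x dims before axis), n = 12 (product of the trimmed y dims) and post = 5 (product of the remaining x dims).

// Illustration only; shapes are arbitrary.
int pre = 0, n = 0, post = 0;
bool bc = is_broadcast(DDim(std::vector<int64_t>{2, 3, 4, 5}),
                       DDim(std::vector<int64_t>{3, 4, 1}),
                       /*axis=*/1, &pre, &n, &post);
// bc == true, pre == 2, n == 12, post == 5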
void ElementwiseAddGradCompute::Run() {
auto& param = Param<operators::ElementwiseGradParam>();
const float* x_data = param.X->data<float>();
const float* y_data = param.Y->data<float>();
const float* out_grad_data = param.OutGrad->data<float>();
float* x_grad_data;
float* y_grad_data;
if (param.XGrad) {
x_grad_data = param.XGrad->mutable_data<float>();
}
if (param.YGrad) {
y_grad_data = param.YGrad->mutable_data<float>();
}
int axis = param.axis;
auto x_dims = param.X->dims();
auto y_dims = param.Y->dims();
int pre, n, post;
if (!param.XGrad) {
CHECK(param.YGrad);
lite::arm::math::elementwise_add_grad(
out_grad_data, y_grad_data, y_dims.production());
return;
}
if (!param.YGrad) {
CHECK(param.XGrad);
lite::arm::math::elementwise_add_grad(
out_grad_data, x_grad_data, x_dims.production());
return;
}
if (x_dims.size() < y_dims.size() &&
is_broadcast(y_dims, x_dims, axis, &pre, &n, &post)) {
lite::arm::math::elementwise_add_grad_broadcast(
out_grad_data, y_grad_data, x_grad_data, pre, n, post);
} else if (is_broadcast(x_dims, y_dims, axis, &pre, &n, &post)) {
lite::arm::math::elementwise_add_grad_broadcast(
out_grad_data, x_grad_data, y_grad_data, pre, n, post);
} else {
lite::arm::math::elementwise_add_grad(
out_grad_data, x_grad_data, x_dims.production());
lite::arm::math::elementwise_add_grad(
out_grad_data, y_grad_data, y_dims.production());
}
}
void ElementwiseSubGradCompute::Run() {
auto& param = Param<operators::ElementwiseGradParam>();
const float* x_data = param.X->data<float>();
const float* y_data = param.Y->data<float>();
const float* out_data = param.OutGrad->data<float>();
float* x_grad_data;
float* y_grad_data;
if (param.XGrad) {
x_grad_data = param.XGrad->mutable_data<float>();
}
if (param.YGrad) {
y_grad_data = param.YGrad->mutable_data<float>();
}
int axis = param.axis;
auto x_dims = param.X->dims();
auto y_dims = param.Y->dims();
int pre, n, post;
if (!param.XGrad || !param.YGrad) {
CHECK(param.XGrad || param.YGrad);
lite::arm::math::elementwise_sub_grad(
out_data, x_grad_data, y_grad_data, y_dims.production());
return;
}
if (x_dims.size() < y_dims.size()) {
LOG(FATAL) << "elewise sub grad don't support x_dims size < y_dims size";
}
if (is_broadcast(x_dims, y_dims, axis, &pre, &n, &post)) {
lite::arm::math::elementwise_sub_grad_broadcast(
out_data, x_grad_data, y_grad_data, pre, n, post);
} else {
lite::arm::math::elementwise_sub_grad(
out_data, x_grad_data, y_grad_data, x_dims.production());
}
}
template <typename T, PrecisionType PType>
void ElementwiseMulGradCompute<T, PType>::Run() {
LOG(FATAL) << "elementwise mul_grad not implement yet";
}
void ElementwiseMaxGradCompute::Run() {
LOG(FATAL) << "elementwise max_grad not implement yet";
}
void ElementwiseDivGradCompute::Run() {
LOG(FATAL) << "elementwise div_grad not implement yet";
}
} // namespace arm
} // namespace kernels
} // namespace lite
} // namespace paddle
using elementwise_mul_grad_float =
paddle::lite::kernels::arm::ElementwiseMulGradCompute<float,
PRECISION(kFloat)>;
REGISTER_LITE_KERNEL(elementwise_add_grad,
kARM,
kFloat,
kNCHW,
paddle::lite::kernels::arm::ElementwiseAddGradCompute,
def)
.BindInput("X", {LiteType::GetTensorTy(TARGET(kARM))})
.BindInput("Y", {LiteType::GetTensorTy(TARGET(kARM))})
.BindInput("Out@GRAD", {LiteType::GetTensorTy(TARGET(kARM))})
.BindOutput("X@GRAD", {LiteType::GetTensorTy(TARGET(kARM))})
.BindOutput("Y@GRAD", {LiteType::GetTensorTy(TARGET(kARM))})
.Finalize();
REGISTER_LITE_KERNEL(elementwise_sub_grad,
kARM,
kFloat,
kNCHW,
paddle::lite::kernels::arm::ElementwiseSubGradCompute,
def)
.BindInput("X", {LiteType::GetTensorTy(TARGET(kARM))})
.BindInput("Y", {LiteType::GetTensorTy(TARGET(kARM))})
.BindInput("Out@GRAD", {LiteType::GetTensorTy(TARGET(kARM))})
.BindOutput("X@GRAD", {LiteType::GetTensorTy(TARGET(kARM))})
.BindOutput("Y@GRAD", {LiteType::GetTensorTy(TARGET(kARM))})
.Finalize();
REGISTER_LITE_KERNEL(elementwise_div_grad,
kARM,
kFloat,
kNCHW,
paddle::lite::kernels::arm::ElementwiseDivGradCompute,
def)
.BindInput("X", {LiteType::GetTensorTy(TARGET(kARM))})
.BindInput("Y", {LiteType::GetTensorTy(TARGET(kARM))})
.BindInput("Out@GRAD", {LiteType::GetTensorTy(TARGET(kARM))})
.BindOutput("X@GRAD", {LiteType::GetTensorTy(TARGET(kARM))})
.BindOutput("Y@GRAD", {LiteType::GetTensorTy(TARGET(kARM))})
.Finalize();
REGISTER_LITE_KERNEL(
elementwise_mul_grad, kARM, kFloat, kNCHW, elementwise_mul_grad_float, def)
.BindInput("X", {LiteType::GetTensorTy(TARGET(kARM))})
.BindInput("Y", {LiteType::GetTensorTy(TARGET(kARM))})
.BindInput("Out@GRAD", {LiteType::GetTensorTy(TARGET(kARM))})
.BindOutput("X@GRAD", {LiteType::GetTensorTy(TARGET(kARM))})
.BindOutput("Y@GRAD", {LiteType::GetTensorTy(TARGET(kARM))})
.Finalize();
REGISTER_LITE_KERNEL(elementwise_max_grad,
kARM,
kFloat,
kNCHW,
paddle::lite::kernels::arm::ElementwiseMaxGradCompute,
def)
.BindInput("X", {LiteType::GetTensorTy(TARGET(kARM))})
.BindInput("Y", {LiteType::GetTensorTy(TARGET(kARM))})
.BindInput("Out@GRAD", {LiteType::GetTensorTy(TARGET(kARM))})
.BindOutput("X@GRAD", {LiteType::GetTensorTy(TARGET(kARM))})
.BindOutput("Y@GRAD", {LiteType::GetTensorTy(TARGET(kARM))})
.Finalize();
// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <algorithm>
#include "lite/core/kernel.h"
#include "lite/core/op_registry.h"
namespace paddle {
namespace lite {
namespace kernels {
namespace arm {
class ElementwiseAddGradCompute
: public KernelLite<TARGET(kARM), PRECISION(kFloat)> {
public:
void Run() override;
virtual ~ElementwiseAddGradCompute() = default;
};
class ElementwiseSubGradCompute
: public KernelLite<TARGET(kARM), PRECISION(kFloat)> {
public:
void Run() override;
virtual ~ElementwiseSubGradCompute() = default;
};
template <typename T, PrecisionType PType>
class ElementwiseMulGradCompute : public KernelLite<TARGET(kARM), PType> {
public:
void Run() override;
virtual ~ElementwiseMulGradCompute() = default;
};
class ElementwiseMaxGradCompute
: public KernelLite<TARGET(kARM), PRECISION(kFloat)> {
public:
void Run() override;
virtual ~ElementwiseMaxGradCompute() = default;
};
class ElementwiseDivGradCompute
: public KernelLite<TARGET(kARM), PRECISION(kFloat)> {
public:
void Run() override;
virtual ~ElementwiseDivGradCompute() = default;
};
} // namespace arm
} // namespace kernels
} // namespace lite
} // namespace paddle
// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "lite/kernels/arm/mul_grad_compute.h"
#include <vector>
#include "lite/backends/arm/math/funcs.h"
#include "lite/backends/arm/math/sgemm.h"
#include "lite/core/op_registry.h"
#include "lite/core/type_system.h"
namespace paddle {
namespace lite {
namespace kernels {
namespace arm {
void MulGradCompute::PrepareForRun() {
auto& ctx = this->ctx_->template As<ARMContext>();
}
void MulGradCompute::Run() {
// step1 flatten_2d
auto& param = Param<param_t>();
const auto x_dims = param.x->dims();
const auto y_dims = param.y->dims();
const auto out_dims = param.output_grad->dims();
m_ = static_cast<int>(x_dims.Slice(0, param.x_num_col_dims).production());
k_ = static_cast<int>(
x_dims.Slice(param.x_num_col_dims, x_dims.size()).production());
n_ = static_cast<int>(
y_dims.Slice(param.y_num_col_dims, y_dims.size()).production());
const auto* out_grad_data = param.output_grad->data<float>();
const auto* x_data = param.x->data<float>();
const auto* y_data = param.y->data<float>();
float* x_grad_data;
float* y_grad_data;
if (param.x_grad) {
x_grad_data = param.x_grad->mutable_data<float>();
}
if (param.y_grad) {
y_grad_data = param.y_grad->mutable_data<float>();
}
paddle::lite::operators::ActivationParam act_param;
act_param.has_active = false;
// out_grad * y^T = x_grad
// (m, n), (n, k) -> (m, k)
auto& ctx = this->ctx_->template As<ARMContext>();
if (param.x_grad) {
if (m_ == 1) {
lite::arm::math::sgemv(y_data,
out_grad_data,
x_grad_data,
false,
k_, // M
n_, // N
false,
nullptr,
false,
lite_api::ActivationType::kIndentity,
&ctx);
} else {
paddle::lite::arm::math::sgemm(false,
true, // is_transB,
m_, // M
k_, // N
n_, // K
1.0f, // alpha
out_grad_data, // A
n_, // lda
y_data, // B
n_, // ldb
0.f, // beta
x_grad_data, // C
k_, // ldc
NULL, // bias
false, // is_bias
act_param, // act_param
&ctx); // ctx
}
}
// x^T * out_grad = y_grad
// (k, m) (m, n) -> (k, n)
if (param.y_grad) {
if (n_ == 1) {
lite::arm::math::sgemv(x_data,
out_grad_data,
y_grad_data,
true,
k_, // M
m_, // N
false,
nullptr,
false,
lite_api::ActivationType::kIndentity,
&ctx);
} else {
paddle::lite::arm::math::sgemm(true, // is_transA
false, // is_transB,
k_, // M
n_, // N
m_, // K
1.0f, // alpha
x_data, // A
k_, // lda
out_grad_data, // B
n_, // ldb
0.f, // beta
y_grad_data, // C
n_, // ldc
NULL, // bias
false, // is_bias
act_param, // act_param
&ctx); // ctx
}
}
}
} // namespace arm
} // namespace kernels
} // namespace lite
} // namespace paddle
REGISTER_LITE_KERNEL(mul_grad,
kARM,
kFloat,
kNCHW,
paddle::lite::kernels::arm::MulGradCompute,
def)
.BindInput("X", {LiteType::GetTensorTy(TARGET(kARM))})
.BindInput("Y", {LiteType::GetTensorTy(TARGET(kARM))})
.BindInput("Out@GRAD", {LiteType::GetTensorTy(TARGET(kARM))})
.BindOutput("X@GRAD", {LiteType::GetTensorTy(TARGET(kARM))})
.BindOutput("Y@GRAD", {LiteType::GetTensorTy(TARGET(kARM))})
.Finalize();
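A worked shape example for the mul_grad kernel above, with sizes chosen for illustration: flattening x with x_num_col_dims and y with y_num_col_dims gives the m_, k_, n_ values computed in Run(), and the two GEMMs in the kernel follow the comments "out_grad * y^T = x_grad" and "x^T * out_grad = y_grad".

// Illustration only; sizes are arbitrary.
// x: {8, 3, 4},  x_num_col_dims = 1  ->  flattened x: (m=8, k=12)
// y: {12, 6},    y_num_col_dims = 1  ->  flattened y: (k=12, n=6)
// out_grad: (8, 6)
// x_grad = out_grad * y^T : (8, 6) x (6, 12) -> (8, 12)
// y_grad = x^T * out_grad : (12, 8) x (8, 6) -> (12, 6)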
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include "lite/core/kernel.h"
#include "lite/core/op_registry.h"
#include "lite/core/types.h"
namespace paddle {
namespace lite {
namespace kernels {
namespace arm {
class MulGradCompute : public KernelLite<TARGET(kARM), PRECISION(kFloat)> {
public:
using param_t = operators::MulGradParam;
void PrepareForRun() override;
void Run() override;
virtual ~MulGradCompute() = default;
private:
int m_, n_, k_;
};
} // namespace arm
} // namespace kernels
} // namespace lite
} // namespace paddle
......@@ -33,7 +33,7 @@ std::string UniqueName(const std::string& prefix) {
counter = ++(it->second);
}
return prefix + "_" + std::to_string(counter);
return prefix + "_" + paddle::lite::to_string(counter);
}
bool HasInputArg(const OpInfo* op_info,
......
if((NOT LITE_ON_MODEL_OPTIMIZE_TOOL) AND (NOT LITE_WITH_CUDA))
if((NOT LITE_ON_MODEL_OPTIMIZE_TOOL) AND (NOT LITE_WITH_PYTHON) AND (NOT LITE_WITH_CUDA))
return()
endif()
......
if ((NOT LITE_ON_MODEL_OPTIMIZE_TOOL) AND (NOT LITE_WITH_FPGA))
if ((NOT LITE_ON_MODEL_OPTIMIZE_TOOL) AND (NOT LITE_WITH_PYTHON) AND (NOT LITE_WITH_FPGA))
return()
endif()
......
......@@ -87,7 +87,8 @@ class Graph {
auto idx = Add(name, node);
CHECK_GE(idx, 1);
// Generate a unique name for the created HiAI IR
node->set_data(std::make_shared<T>(name + "__" + std::to_string(idx)));
node->set_data(
std::make_shared<T>(name + "__" + paddle::lite::to_string(idx)));
return node;
}
......
......@@ -64,10 +64,12 @@ int SplitConverter(void* ctx, OpLite* op, KernelBase* kernel) {
split_op->create_dynamic_output_y(out_names.size());
int idx = 1;
for (auto& out_name : out_names) {
auto zero_node = graph->Add(out_name + "/zero" + std::to_string(idx), 0);
auto zero_node =
graph->Add(out_name + "/zero" + paddle::lite::to_string(idx), 0);
auto add_node = graph->Add<ge::op::Add>(out_name);
auto add_op = add_node->data<ge::op::Add>();
add_op->set_input_x1(*split_node->data(), "y" + std::to_string(idx));
add_op->set_input_x1(*split_node->data(),
"y" + paddle::lite::to_string(idx));
add_op->set_input_x2(*zero_node->data());
idx++;
}
......
......@@ -85,22 +85,31 @@ int SubgraphEngine::BuildDeviceProgram() {
<< "[NPU] No input nodes found for building NPU model";
CHECK(!device_onames_.empty())
<< "[NPU] No output nodes found for building NPU model";
// Build the HiAI IR graph to HiAI om model as the device program
device_program_ = lite::npu::Device::Global().Build(
if (device_program_map_.count(inputs_shape_) > 0) {
return status;
}
auto device_client = lite::npu::Device::Global().Build(
model_name_, device_inodes, device_onodes);
if (device_program_ == nullptr) {
if (device_client == nullptr) {
LOG(WARNING) << "[NPU] Build model failed!";
return subgraph::FAILED;
}
auto device_program = std::make_shared<device_program_t>(device_client);
device_program_map_[inputs_shape_] = device_program;
// Query and check the dimensions of valid input and output tensors
std::vector<hiai::TensorDimension> device_idims, device_odims;
if (device_program_->GetModelIOTensorDim(
if (device_program->client->GetModelIOTensorDim(
model_name_, device_idims, device_odims) != hiai::AI_SUCCESS) {
LOG(WARNING)
<< "[NPU] Get the dimensions of input and output tensors failed!";
return subgraph::FAILED;
}
device_program->device_idims = device_idims;
device_program->device_odims = device_odims;
CHECK_EQ(device_idims.size(), device_inames_.size());
CHECK_EQ(device_odims.size(), device_onames_.size());
origin_idims_.resize(device_inames_.size());
......@@ -109,6 +118,7 @@ int SubgraphEngine::BuildDeviceProgram() {
origin_odims_.resize(device_onames_.size());
origin_otensors_.resize(device_onames_.size());
device_otensors_.resize(device_onames_.size());
for (int i = 0; i < device_inames_.size(); i++) {
auto node = graph.Get(device_inames_[i]);
auto precision = node->precision();
......@@ -130,6 +140,8 @@ int SubgraphEngine::BuildDeviceProgram() {
device_itensors_[i].reset(new hiai::AiTensor);
device_itensors_[i]->Init(&(device_idims[i]));
}
device_program->origin_idims = origin_idims_;
for (int i = 0; i < device_onames_.size(); i++) {
auto node = graph.Get(device_onames_[i]);
auto precision = node->precision();
......@@ -170,6 +182,8 @@ int SubgraphEngine::BuildDeviceProgram() {
<< PrecisionToStr(precision);
break;
}
device_program->origin_odims = origin_odims_;
CHECK_EQ(origin_odims_[i].production(),
device_odims[i].GetNumber() * device_odims[i].GetChannel() *
device_odims[i].GetHeight() * device_odims[i].GetWidth());
......@@ -181,14 +195,25 @@ int SubgraphEngine::BuildDeviceProgram() {
int SubgraphEngine::LaunchDeviceProgram() {
// Copy the data of origin input tensors to the buffer of input HiAI tensors
// init device_itensors_, device_otensors_, origin_otensors_
auto device_program = device_program_map_[inputs_shape_];
for (size_t i = 0; i < device_itensors_.size(); i++) {
device_itensors_[i]->Init(&(device_program->device_idims[i]));
std::memcpy(device_itensors_[i]->GetBuffer(),
origin_itensors_[i]->raw_data(),
origin_itensors_[i]->memory_size());
}
for (size_t i = 0; i < device_otensors_.size(); i++) {
device_otensors_[i]->Init(&(device_program->device_odims[i]));
}
for (size_t i = 0; i < origin_otensors_.size(); i++) {
origin_otensors_[i]->Resize(device_program->origin_odims[i]);
}
// Run the HiAI model by name
std::string key = "model_name"; // Note: key seems must be model_name
model_context_.AddPara(key, model_name_);
hiai::AiContext model_context;
model_context.AddPara(key, model_name_);
auto GetCurrentUS = []() -> double {
struct timeval time;
gettimeofday(&time, NULL);
......@@ -196,11 +221,11 @@ int SubgraphEngine::LaunchDeviceProgram() {
};
int istamp;
auto start_time = GetCurrentUS();
CHECK_EQ(
device_program_->Process(
model_context_, device_itensors_, device_otensors_, 1000, istamp),
hiai::AI_SUCCESS);
CHECK_EQ(device_program->client->Process(
model_context, device_itensors_, device_otensors_, 1000, istamp),
hiai::AI_SUCCESS);
VLOG(3) << "[NPU] Process cost " << GetCurrentUS() - start_time << " us";
// Copy the data of output HiAI tensor to the buffer of origin output tensors
for (size_t i = 0; i < device_otensors_.size(); i++) {
std::memcpy(const_cast<void*>(origin_otensors_[i]->raw_data()),
......@@ -210,6 +235,18 @@ int SubgraphEngine::LaunchDeviceProgram() {
return 0;
}
bool SubgraphEngine::InputShapeChanged() {
std::vector<std::vector<int64_t>> new_shape;
for (auto origin_itensor : origin_itensors_) {
new_shape.push_back(origin_itensor->dims().Vectorize());
}
inputs_shape_ = new_shape;
if (device_program_map_.count(inputs_shape_) > 0) {
return false;
}
return true;
}
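The device_program_map_ above is keyed by the full vector of input shapes, so each distinct shape combination gets its own compiled HiAI program; std::map works here because std::vector provides lexicographic operator<. A minimal sketch of the lookup pattern inside the engine, with hypothetical shapes:

  // Illustration only; shapes are arbitrary.
  std::vector<std::vector<int64_t>> key = {{1, 3, 224, 224}, {1, 1}};
  auto it = device_program_map_.find(key);
  if (it == device_program_map_.end()) {
    // First time this shape combination is seen: BuildDeviceProgram()
    // compiles a new om model and inserts it into the map.
  } else {
    // Reuse the cached client and its recorded input/output dims.
    auto cached = it->second;
    (void)cached;
  }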
void SubgraphCompute::PrepareForRun() {
auto& param = this->Param<param_t>();
engine_.reset(new SubgraphEngine(ctx_.get(),
......
......@@ -14,6 +14,7 @@
#pragma once
#include <map>
#include <memory>
#include <string>
#include <vector>
......@@ -38,17 +39,29 @@ class SubgraphEngine : public subgraph::Engine {
: subgraph::Engine(
ctx, block_idx, block_desc, input_names, output_names, scope) {}
struct device_program_t {
explicit device_program_t(std::shared_ptr<hiai::AiModelMngerClient> _client)
: client(_client) {}
std::shared_ptr<hiai::AiModelMngerClient> client{nullptr};
std::vector<DDim> origin_idims{};
std::vector<DDim> origin_odims{};
std::vector<hiai::TensorDimension> device_idims{};
std::vector<hiai::TensorDimension> device_odims{};
};
protected:
int BuildDeviceProgram() override;
int LaunchDeviceProgram() override;
bool InputShapeChanged() override;
std::string model_name_;
hiai::AiContext model_context_;
std::vector<std::string> device_inames_;
std::vector<std::string> device_onames_;
std::vector<std::shared_ptr<hiai::AiTensor>> device_itensors_;
std::vector<std::shared_ptr<hiai::AiTensor>> device_otensors_;
std::unique_ptr<hiai::AiModelMngerClient> device_program_{nullptr};
std::string model_name_{"model.om"};
std::vector<std::vector<int64_t>> inputs_shape_{};
std::map<std::vector<std::vector<int64_t>>, std::shared_ptr<device_program_t>>
device_program_map_{};
std::vector<std::string> device_inames_{};
std::vector<std::string> device_onames_{};
std::vector<std::shared_ptr<hiai::AiTensor>> device_itensors_{};
std::vector<std::shared_ptr<hiai::AiTensor>> device_otensors_{};
};
class SubgraphCompute : public KernelLite<TARGET(kNPU), PRECISION(kAny)> {
......
if ((NOT LITE_ON_MODEL_OPTIMIZE_TOOL) AND (NOT LITE_WITH_OPENCL))
if ((NOT LITE_ON_MODEL_OPTIMIZE_TOOL) AND (NOT LITE_WITH_PYTHON) AND (NOT LITE_WITH_OPENCL))
return ()
endif()
......@@ -128,6 +128,9 @@ add_kernel(io_copy_opencl OPENCL basic SRCS io_copy_buffer_compute.cc DEPS ${ten
#lite_cc_test(test_conv_buffer_opencl SRCS conv_buffer_compute_test.cc
# DEPS conv_opencl op_registry program context)
#lite_cc_test(test_im2col_buffer_opencl SRCS im2col_buffer_test.cc
# DEPS tensor cl_context cl_wrapper cl_target_wrapper)
#lite_cc_test(test_depthwise_conv2d_buffer_opencl SRCS depthwise_conv2d_buffer_compute_test.cc
# DEPS depthwise_conv2d_opencl op_registry program context)
......
......@@ -101,6 +101,7 @@ class ActivationComputeImageDefault
status = kernel.setArg(++arg_idx, scale_);
CL_CHECK_FATAL(status);
#ifndef LITE_SHUTDOWN_LOG
VLOG(4) << TargetToStr(param.X->target());
VLOG(4) << TargetToStr(param.Out->target());
VLOG(4) << "image_shape(w,h):" << image_shape["width"] << " "
......@@ -112,6 +113,7 @@ class ActivationComputeImageDefault
VLOG(4) << "threshold:" << threshold_;
VLOG(4) << "scale:" << scale_;
VLOG(4) << "kernel func name:" << kernel_func_name_;
#endif
auto global_work_size =
cl::NDRange{static_cast<cl::size_type>(image_shape["width"]),
......@@ -177,7 +179,7 @@ REGISTER_LITE_KERNEL(
// exp
REGISTER_LITE_KERNEL(
exp_act,
exp,
kOpenCL,
kFP16,
kImageDefault,
......@@ -195,7 +197,7 @@ REGISTER_LITE_KERNEL(
// tanh
REGISTER_LITE_KERNEL(
tanh_act,
tanh,
kOpenCL,
kFP16,
kImageDefault,
......
......@@ -109,13 +109,13 @@ TEST(act_image2d_fp16, compute) {
func_name = "sigmoid";
break;
case 6: // tanh
func_name = "tanh_act";
func_name = "tanh";
break;
case 7: // tanh
func_name = "swish";
break;
case 8: // tanh
func_name = "exp_act";
func_name = "exp";
break;
}
LOG(INFO) << "func_name: " << func_name;
......@@ -307,7 +307,7 @@ USE_LITE_KERNEL(layout, kOpenCL, kAny, kImageDefault, NCHW_to_ImageDefault);
USE_LITE_KERNEL(layout, kOpenCL, kAny, kNCHW, ImageDefault_to_NCHW);
// exp
USE_LITE_KERNEL(exp_act, kOpenCL, kFP16, kImageDefault, ImageDefault);
USE_LITE_KERNEL(exp, kOpenCL, kFP16, kImageDefault, ImageDefault);
// swish
USE_LITE_KERNEL(swish, kOpenCL, kFP16, kImageDefault, ImageDefault);
......@@ -316,7 +316,7 @@ USE_LITE_KERNEL(swish, kOpenCL, kFP16, kImageDefault, ImageDefault);
USE_LITE_KERNEL(leaky_relu, kOpenCL, kFP16, kImageDefault, ImageDefault);
// tanh act
USE_LITE_KERNEL(tanh_act, kOpenCL, kFP16, kImageDefault, ImageDefault);
USE_LITE_KERNEL(tanh, kOpenCL, kFP16, kImageDefault, ImageDefault);
// relu image2d fp16
USE_LITE_KERNEL(relu, kOpenCL, kFP16, kImageDefault, ImageDefault);
......
......@@ -77,17 +77,21 @@ class BilinearInterpImageCompute
int out_h = out_dims[2];
int out_w = out_dims[3];
#ifndef LITE_SHUTDOWN_LOG
VLOG(4) << "x->target():" << TargetToStr(x->target());
VLOG(4) << "out->target():" << TargetToStr(out->target());
VLOG(4) << "x->dims():" << in_dims;
VLOG(4) << "out->dims():" << out_dims;
#endif
auto out_image_shape = InitImageDimInfoWith(out_dims);
auto* x_img = x->data<half_t, cl::Image2D>();
// VLOG(4) << "x_image: " << x_img;
auto* out_img = out->mutable_data<half_t, cl::Image2D>(
out_image_shape["width"], out_image_shape["height"]);
#ifndef LITE_SHUTDOWN_LOG
// VLOG(4) << "x_image: " << x_img;
// VLOG(4) << "out_image: " << out_img;
VLOG(4) << "out_image_shape[w,h]: " << out_image_shape["width"] << " "
<< out_image_shape["height"];
......@@ -96,6 +100,7 @@ class BilinearInterpImageCompute
<< ", align_delta: " << align_delta;
VLOG(4) << "in_h: " << in_h << ", in_w: " << in_w;
VLOG(4) << "out_h: " << out_h << ", out_w: " << out_w;
#endif
STL::stringstream kernel_key;
kernel_key << kernel_func_name_ << build_options_;
......@@ -107,8 +112,10 @@ class BilinearInterpImageCompute
DDim(std::vector<DDim::value_type>{
static_cast<int64_t>(out_image_shape["width"]),
static_cast<int64_t>(out_image_shape["height"])}));
#ifndef LITE_SHUTDOWN_LOG
VLOG(4) << "default_work_size: " << default_work_size[0] << ", "
<< default_work_size[1] << ", " << default_work_size[2];
#endif
cl_int status = kernel.setArg(arg_idx++, *x_img);
CL_CHECK_FATAL(status);
status = kernel.setArg(arg_idx++, *out_img);
......@@ -142,9 +149,10 @@ class BilinearInterpImageCompute
event_.get());
CL_CHECK_FATAL(status);
context.cl_wait_list()->emplace(out_img, event_);
#ifndef LITE_SHUTDOWN_LOG
VLOG(4) << "global_work_size:[2D]:" << global_work_size[0] << " "
<< global_work_size[1] << " " << global_work_size[2];
#endif
}
protected:
......
......@@ -123,7 +123,8 @@ class ConcatComputeImage : public KernelLite<TARGET(kOpenCL),
int arg_idx = 0;
int width = inputs[0]->dims()[inputs[0]->dims().size() - 1];
VLOG(4) << "concat 输入尺寸: ";
#ifndef LITE_SHUTDOWN_LOG
VLOG(4) << "concat input shape: ";
for (size_t i = 0; i < inputs.size(); i++) {
VLOG(4) << "inputs [" << i << "]"
<< "[" << inputs[i]->dims().size() << "D]:"
......@@ -132,12 +133,13 @@ class ConcatComputeImage : public KernelLite<TARGET(kOpenCL),
<< inputs[i]->dims()[3];
}
VLOG(4) << "concat 输出尺寸: ";
VLOG(4) << "concat output shape: ";
VLOG(4) << " out dims: "
<< "[" << x_dims.size() << "D]:" << x_dims[0] << " " << x_dims[1]
<< " " << x_dims[2] << " " << x_dims[3];
VLOG(4) << "axis_: " << axis_;
VLOG(4) << "flag_: " << flag_;
#endif
auto global_work_size =
cl::NDRange{static_cast<cl::size_type>(x_dims[x_dims.size() - 1]),
......@@ -145,6 +147,7 @@ class ConcatComputeImage : public KernelLite<TARGET(kOpenCL),
x_dims[x_dims.size() - 1]),
static_cast<cl::size_type>(image_shape["height"])};
#ifndef LITE_SHUTDOWN_LOG
VLOG(4) << TargetToStr(param.output->target());
VLOG(4) << "image_shape(w,h):" << image_shape["width"] << " "
<< image_shape["height"];
......@@ -157,6 +160,7 @@ class ConcatComputeImage : public KernelLite<TARGET(kOpenCL),
VLOG(4) << "global_work_size: " << x_dims[x_dims.size() - 1] << " "
<< (image_shape["width"] / x_dims[x_dims.size() - 1]) << " "
<< (image_shape["height"]);
#endif
auto kernel = context.cl_context()->GetKernel(kernel_key.str());
int out_w = x_dims[x_dims.size() - 1];
......@@ -198,8 +202,10 @@ class ConcatComputeImage : public KernelLite<TARGET(kOpenCL),
image_shape = InitImageDimInfoWith(in_dims);
auto* x_buf = inputs[i]->data<half_t, cl::Image2D>();
int in_w = in_dims[in_dims.size() - 1];
#ifndef LITE_SHUTDOWN_LOG
VLOG(4) << "image_shape(w,h):" << image_shape["width"] << " "
<< image_shape["height"];
#endif
global_work_size =
cl::NDRange{static_cast<cl::size_type>(in_dims[in_dims.size() - 1]),
static_cast<cl::size_type>(image_shape["width"] /
......
......@@ -78,6 +78,7 @@ void ConvImageCompute::PrepareForRun() {
VLOG(3) << "dilation_equal:" << dilation_equal;
VLOG(3) << "padding :" << paddings[0] << " " << paddings[1] << " "
<< paddings[2] << " " << paddings[3];
CHECK(pad_equal && stride_equal && dilation_equal);
if (kernel_h == 1 && kernel_w == 1) {
......@@ -85,9 +86,9 @@ void ConvImageCompute::PrepareForRun() {
if (param.x->dims()[1] % 4 == 0) {
kernel_func_names_.push_back("conv2d_1x1_simple");
} else {
kernel_func_names_.push_back("conv2d_1x1");
kernel_func_names_.push_back("conv2d_1x1_opt");
}
kernel_func_paths_.push_back("image/conv2d_1x1_kernel.cl");
kernel_func_paths_.push_back("image/conv2d_1x1_opt_kernel.cl");
CLImageConverterNWBlock converter;
const DDim& filter_image_dims = converter.InitImageDimInfoWith(filter_dims);
......@@ -97,7 +98,7 @@ void ConvImageCompute::PrepareForRun() {
filter_gpu_image_.mutable_data<half_t, cl::Image2D>(
filter_image_dims[0], filter_image_dims[1], filter_image_v.data());
impl_ = &ConvImageCompute::Conv2d1x1;
impl_ = &ConvImageCompute::Conv2d1x1opt;
#define DEPTH_CONV_USE_SPL
#ifdef DEPTH_CONV_USE_SPL
} else if (filter_dims[1] == 1 && x_dims[1] == output_dims[1] &&
......@@ -141,9 +142,10 @@ void ConvImageCompute::PrepareForRun() {
filter_image_dims[0], filter_image_dims[1], filter_image_v.data());
impl_ = &ConvImageCompute::DepthwiseConv2d;
} else if (kernel_h == 3 && kernel_h == 3) {
} else if (kernel_w == 3 && kernel_h == 3) {
// conv2d_3x3
kernel_func_names_.push_back("conv2d_3x3_opt");
kernel_func_names_.push_back(bs > 1 ? "conv2d_3x3_multi_batch"
: "conv2d_3x3_opt");
kernel_func_paths_.push_back("image/conv2d_3x3_opt_kernel.cl");
CLImageConverterFolder converter;
......@@ -156,6 +158,8 @@ void ConvImageCompute::PrepareForRun() {
impl_ = &ConvImageCompute::Conv2d3x3opt;
} else if (kernel_h == 5 && kernel_w == 5) {
#define CONV_5x5_OPT
#ifndef CONV_5x5_OPT
// conv2d_5x5
kernel_func_names_.push_back("conv2d_5x5");
kernel_func_paths_.push_back("image/conv2d_5x5_kernel.cl");
......@@ -169,7 +173,27 @@ void ConvImageCompute::PrepareForRun() {
filter_image_dims[0], filter_image_dims[1], filter_image_v.data());
impl_ = &ConvImageCompute::Conv2d5x5;
#else
// conv2d_5x5_opt
kernel_func_names_.push_back(bs > 1 ? "conv2d_5x5_multi_batch"
: "conv2d_5x5_opt");
kernel_func_paths_.push_back("image/conv2d_5x5_opt_kernel.cl");
CLImageConverterFolder converter;
const DDim& filter_image_dims = converter.InitImageDimInfoWith(filter_dims);
std::vector<half_t> filter_image_v(filter_image_dims[0] *
filter_image_dims[1] * 4); // 4 : RGBA
converter.NCHWToImage(filter_cpu, filter_image_v.data(), filter_dims);
filter_gpu_image_.mutable_data<half_t, cl::Image2D>(
filter_image_dims[0], filter_image_dims[1], filter_image_v.data());
impl_ = &ConvImageCompute::Conv2d5x5opt;
#endif
#undef CONV_5x5_OPT
} else if (kernel_h == 7 && kernel_w == 7) {
#define CONV_7x7_OPT
#ifndef CONV_7x7_OPT
// conv2d_7x7
kernel_func_names_.push_back("conv2d_7x7");
kernel_func_paths_.push_back("image/conv2d_7x7_kernel.cl");
......@@ -183,6 +207,25 @@ void ConvImageCompute::PrepareForRun() {
filter_image_dims[0], filter_image_dims[1], filter_image_v.data());
impl_ = &ConvImageCompute::Conv2d7x7;
#else
// conv2d_7x7
kernel_func_names_.push_back(bs > 1 ? "conv2d_7x7_multi_batch"
: "conv2d_7x7_opt");
kernel_func_paths_.push_back("image/conv2d_7x7_opt_kernel.cl");
CLImageConverterFolder converter;
const DDim& filter_image_dims = converter.InitImageDimInfoWith(filter_dims);
std::vector<half_t> filter_image_v(filter_image_dims[0] *
filter_image_dims[1] * 4); // 4 : RGBA
converter.NCHWToImage(filter_cpu, filter_image_v.data(), filter_dims);
this->filter_gpu_image_.mutable_data<half_t, cl::Image2D>(
filter_image_dims[0], filter_image_dims[1], filter_image_v.data());
impl_ = &ConvImageCompute::Conv2d7x7opt;
#endif
#undef CONV_7x7_OPT
} else {
LOG(FATAL) << "conv image compute not support this condition yet! ";
}
......@@ -229,7 +272,7 @@ void ConvImageCompute::PrepareForRun() {
}
}
void ConvImageCompute::Conv2d1x1() {
void ConvImageCompute::Conv2d1x1opt() {
const auto& param = *param_.get_mutable<param_t>();
auto input_dims = param.x->dims();
auto paddings = *param.paddings;
......@@ -269,6 +312,7 @@ void ConvImageCompute::Conv2d1x1() {
int w = default_work_size[1];
int nh = default_work_size[2];
#ifndef LITE_SHUTDOWN_LOG
VLOG(4) << "============ conv2d_1x1 params ============";
VLOG(4) << "input_image_shape: " << input_image_shape["width"] << ","
<< input_image_shape["height"];
......@@ -290,7 +334,7 @@ void ConvImageCompute::Conv2d1x1() {
VLOG(4) << "default work size{c_block, w, nh}: "
<< "{" << c_block << ", " << w << ", " << nh << ""
<< "}";
#endif
CHECK_GE(dilations.size(), 2);
CHECK(dilations[0] == dilations[1]);
CHECK_GE(input_dims.size(), 4);
......@@ -313,10 +357,12 @@ void ConvImageCompute::Conv2d1x1() {
auto kernel = context.cl_context()->GetKernel(kernel_key.str());
int maped_w = maptofactor(w, 4);
#ifndef LITE_SHUTDOWN_LOG
VLOG(4) << "kernel_key: " << kernel_key.str();
VLOG(4) << "kernel ready ... " << kernel_key.str();
VLOG(4) << "maped_w: " << maped_w;
VLOG(4) << "hasbias: " << has_bias;
#endif
cl_int status;
int arg_idx = 0;
......@@ -363,21 +409,27 @@ void ConvImageCompute::Conv2d1x1() {
static_cast<size_t>(maped_w),
static_cast<size_t>(default_work_size.data()[2])};
#ifndef LITE_SHUTDOWN_LOG
// VLOG(4) << "out_image: " << out_image;
VLOG(4) << "global_work_size[3D]: {" << global_work_size[0] << ","
<< global_work_size[1] << "," << global_work_size[2] << "}";
#endif
size_t max_work_group_size = 0;
kernel.getWorkGroupInfo<size_t>(CLRuntime::Global()->device(),
CL_KERNEL_WORK_GROUP_SIZE,
&max_work_group_size);
cl::NDRange local_work_size = cl::NullRange;
#ifndef LITE_SHUTDOWN_LOG
VLOG(4) << "max_work_group_size: " << max_work_group_size;
#endif
if (max_work_group_size > 0 && use_lws) {
local_work_size = context.cl_context()->LocalWorkSize(global_work_size,
max_work_group_size);
#ifndef LITE_SHUTDOWN_LOG
VLOG(4) << "local_work_size[3D]: {" << local_work_size[0] << ","
<< local_work_size[1] << "," << local_work_size[2] << "}";
#endif
}
status = context.cl_context()->GetCommandQueue().enqueueNDRangeKernel(
......@@ -453,6 +505,7 @@ void ConvImageCompute::Conv2d3x3() {
int w = default_work_size[1];
int nh = default_work_size[2];
#ifndef LITE_SHUTDOWN_LOG
VLOG(4) << "============ conv2d params ============";
VLOG(4) << "input_image_shape: " << input_image_shape["width"] << ","
<< input_image_shape["height"];
......@@ -477,6 +530,7 @@ void ConvImageCompute::Conv2d3x3() {
VLOG(4) << "default work size{c_block, w, nh}: "
<< "{" << c_block << ", " << w << ", " << nh << ""
<< "}";
#endif
CHECK_GE(dilations.size(), 2);
CHECK(dilations[0] == dilations[1]);
......@@ -496,9 +550,12 @@ void ConvImageCompute::Conv2d3x3() {
STL::stringstream kernel_key;
kernel_key << kernel_func_names_[0] << build_options_[0];
auto kernel = context.cl_context()->GetKernel(kernel_key.str());
#ifndef LITE_SHUTDOWN_LOG
VLOG(4) << "kernel_key: " << kernel_key.str();
VLOG(4) << "kernel ready ... " << kernel_key.str();
VLOG(4) << "w: " << w;
#endif
cl_int status;
int arg_idx = 0;
......@@ -513,7 +570,9 @@ void ConvImageCompute::Conv2d3x3() {
status = kernel.setArg(++arg_idx, *filter_image);
CL_CHECK_FATAL(status);
if (has_bias) {
#ifndef LITE_SHUTDOWN_LOG
VLOG(4) << "set bias_image: ";
#endif
status = kernel.setArg(++arg_idx, *bias_image);
CL_CHECK_FATAL(status);
}
......@@ -553,9 +612,11 @@ void ConvImageCompute::Conv2d3x3() {
static_cast<size_t>(default_work_size.data()[1]),
static_cast<size_t>(default_work_size.data()[2])};
#ifndef LITE_SHUTDOWN_LOG
// VLOG(4) << "out_image: " << out_image;
VLOG(4) << "global_work_size[3D]: {" << global_work_size[0] << ","
<< global_work_size[1] << "," << global_work_size[2] << "}";
#endif
status = context.cl_context()->GetCommandQueue().enqueueNDRangeKernel(
kernel,
......@@ -586,7 +647,8 @@ void ConvImageCompute::Conv2d3x3opt() {
int output_width = output_dims[3];
int output_height = output_dims[2];
int output_channel = output_dims[1];
CHECK_EQ(input_dims[0], output_dims[0]);
int batch = input_dims[0];
auto out_image_shape = InitImageDimInfoWith(output_dims);
auto* out_image = param.output->mutable_data<half_t, cl::Image2D>(
out_image_shape["width"], out_image_shape["height"]);
......@@ -611,8 +673,9 @@ void ConvImageCompute::Conv2d3x3opt() {
int h_blk_size = 1;
int h_blk = (nh + h_blk_size - 1) / h_blk_size;
// default_work_size[2] = h_blk;
// default_work_size[2] = h_blk;
#ifndef LITE_SHUTDOWN_LOG
VLOG(4) << "============ conv2d params ============";
// VLOG(4) << "input_image_shape: " << input_image_shape["width"] << ","
// << input_image_shape["height"];
......@@ -632,6 +695,7 @@ void ConvImageCompute::Conv2d3x3opt() {
VLOG(4) << "default work size{c_block, w, nh}: "
<< "{" << c_block << ", " << w << ", " << nh << ""
<< "}";
#endif
CHECK_GE(dilations.size(), 2);
CHECK(dilations[0] == dilations[1]);
......@@ -651,8 +715,11 @@ void ConvImageCompute::Conv2d3x3opt() {
STL::stringstream kernel_key;
kernel_key << kernel_func_names_[0] << build_options_[0];
auto kernel = context.cl_context()->GetKernel(kernel_key.str());
#ifndef LITE_SHUTDOWN_LOG
VLOG(4) << "kernel_key: " << kernel_key.str();
VLOG(4) << "kernel ready ... " << kernel_key.str();
#endif
cl_int status;
int arg_idx = 0;
......@@ -667,7 +734,9 @@ void ConvImageCompute::Conv2d3x3opt() {
status = kernel.setArg(++arg_idx, *filter_image);
CL_CHECK_FATAL(status);
if (has_bias) {
#ifndef LITE_SHUTDOWN_LOG
VLOG(4) << "set bias_image: ";
#endif
status = kernel.setArg(++arg_idx, *bias_image);
CL_CHECK_FATAL(status);
}
......@@ -681,6 +750,8 @@ void ConvImageCompute::Conv2d3x3opt() {
status = kernel.setArg(++arg_idx, dilations[0]);
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, batch);
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, input_channel);
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, input_width);
......@@ -696,22 +767,27 @@ void ConvImageCompute::Conv2d3x3opt() {
cl::NDRange{static_cast<size_t>(default_work_size.data()[0]),
static_cast<size_t>(w_blk),
static_cast<size_t>(h_blk)};
#ifndef LITE_SHUTDOWN_LOG
// VLOG(4) << "out_image: " << out_image;
VLOG(4) << "global_work_size[3D]: {" << global_work_size[0] << ","
<< global_work_size[1] << "," << global_work_size[2] << "}";
#endif
size_t max_work_group_size = 0;
kernel.getWorkGroupInfo<size_t>(CLRuntime::Global()->device(),
CL_KERNEL_WORK_GROUP_SIZE,
&max_work_group_size);
cl::NDRange local_work_size = cl::NullRange;
#ifndef LITE_SHUTDOWN_LOG
VLOG(4) << "max_work_group_size: " << max_work_group_size;
#endif
if (max_work_group_size > 0 && use_lws) {
local_work_size = context.cl_context()->LocalWorkSize(global_work_size,
max_work_group_size);
#ifndef LITE_SHUTDOWN_LOG
VLOG(4) << "local_work_size[3D]: {" << local_work_size[0] << ","
<< local_work_size[1] << "," << local_work_size[2] << "}";
#endif
}
status = context.cl_context()->GetCommandQueue().enqueueNDRangeKernel(
......@@ -767,6 +843,7 @@ void ConvImageCompute::Conv2d5x5() {
int w = default_work_size[1];
int nh = default_work_size[2];
#ifndef LITE_SHUTDOWN_LOG
VLOG(4) << "============ conv2d params ============";
VLOG(4) << "input_image_shape: " << input_image_shape["width"] << ","
<< input_image_shape["height"];
......@@ -789,6 +866,7 @@ void ConvImageCompute::Conv2d5x5() {
VLOG(4) << "default work size{c_block, w, nh}: "
<< "{" << c_block << ", " << w << ", " << nh << ""
<< "}";
#endif
CHECK_GE(dilations.size(), 2);
CHECK(dilations[0] == dilations[1]);
......@@ -808,9 +886,12 @@ void ConvImageCompute::Conv2d5x5() {
STL::stringstream kernel_key;
kernel_key << kernel_func_names_[0] << build_options_[0];
auto kernel = context.cl_context()->GetKernel(kernel_key.str());
#ifndef LITE_SHUTDOWN_LOG
VLOG(4) << "kernel_key: " << kernel_key.str();
VLOG(4) << "kernel ready ... " << kernel_key.str();
VLOG(4) << "w: " << w;
#endif
cl_int status;
int arg_idx = 0;
......@@ -825,7 +906,9 @@ void ConvImageCompute::Conv2d5x5() {
status = kernel.setArg(++arg_idx, *filter_image);
CL_CHECK_FATAL(status);
if (has_bias) {
#ifndef LITE_SHUTDOWN_LOG
VLOG(4) << "set bias_image: ";
#endif
status = kernel.setArg(++arg_idx, *bias_image);
CL_CHECK_FATAL(status);
}
......@@ -855,9 +938,11 @@ void ConvImageCompute::Conv2d5x5() {
static_cast<size_t>(default_work_size.data()[1]),
static_cast<size_t>(default_work_size.data()[2])};
#ifndef LITE_SHUTDOWN_LOG
// VLOG(4) << "out_image: " << out_image;
VLOG(4) << "global_work_size[3D]: {" << global_work_size[0] << ","
<< global_work_size[1] << "," << global_work_size[2] << "}";
#endif
status = context.cl_context()->GetCommandQueue().enqueueNDRangeKernel(
kernel,
......@@ -870,6 +955,172 @@ void ConvImageCompute::Conv2d5x5() {
context.cl_wait_list()->emplace(out_image, event_);
}
void ConvImageCompute::Conv2d5x5opt() {
const auto& param = *param_.get_mutable<param_t>();
auto input_dims = param.x->dims();
auto paddings = *param.paddings;
auto strides = param.strides;
auto dilations = *param.dilations;
auto* input_image = param.x->data<half_t, cl::Image2D>();
auto* filter_image = filter_gpu_image_.data<half_t, cl::Image2D>();
auto filter_dims = param.filter->dims();
auto output_dims = param.output->dims();
int input_width = input_dims[3];
int input_height = input_dims[2];
int input_channel = input_dims[1];
int output_width = output_dims[3];
int output_height = output_dims[2];
int output_channel = output_dims[1];
CHECK_EQ(input_dims[0], output_dims[0]);
int batch = input_dims[0];
auto out_image_shape = InitImageDimInfoWith(output_dims);
auto* out_image = param.output->mutable_data<half_t, cl::Image2D>(
out_image_shape["width"], out_image_shape["height"]);
const bool has_bias = param.bias != nullptr;
const bool is_element_wise_bias =
has_bias && param.output->dims() == param.bias->dims();
const std::vector<size_t>& default_work_size =
DefaultWorkSize(output_dims,
DDim(std::vector<DDim::value_type>{
static_cast<int64_t>(out_image_shape["width"]),
static_cast<int64_t>(out_image_shape["height"])}));
int c_block = default_work_size[0];
int w = default_work_size[1];
int nh = default_work_size[2];
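// w_blk/h_blk are ceil-divided launch sizes: each work-item along the width is
// expected to handle w_blk_size output columns, so rounding up keeps the last
// partial block covered.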
int w_blk_size = 5;
int w_blk = (w + w_blk_size - 1) / w_blk_size;
// default_work_size[1] = w_blk;
int h_blk_size = 1;
int h_blk = (nh + h_blk_size - 1) / h_blk_size;
// default_work_size[2] = h_blk;
#ifndef LITE_SHUTDOWN_LOG
VLOG(4) << "============ conv2d params ============";
// VLOG(4) << "input_image_shape: " << input_image_shape["width"] << ","
// << input_image_shape["height"];
// VLOG(4) << "input_image: " << input_image;
VLOG(4) << "input_dims: " << input_dims;
VLOG(4) << "filter_dims: " << filter_dims;
// VLOG(4) << "filter_image: " << filter_image;
VLOG(4) << "output_dims: " << output_dims;
VLOG(4) << "out_image_shape: " << out_image_shape["width"] << ", "
<< out_image_shape["height"];
VLOG(4) << "paddings: " << paddings[0] << "," << paddings[1];
VLOG(4) << "has bias: " << has_bias;
VLOG(4) << "is_element_wise_bias : " << is_element_wise_bias;
VLOG(4) << "strides: " << strides[0] << "," << strides[1];
VLOG(4) << "dilations.size : " << dilations.size();
VLOG(4) << "dilations: " << dilations[0] << ", " << dilations[1];
VLOG(4) << "default work size{c_block, w, nh}: "
<< "{" << c_block << ", " << w << ", " << nh << ""
<< "}";
#endif
CHECK_GE(dilations.size(), 2);
CHECK(dilations[0] == dilations[1]);
CHECK_GE(input_dims.size(), 4);
CHECK_GE(paddings.size(), 2);
CHECK(paddings[0] == paddings[1]);
CHECK_GE(strides.size(), 2);
CHECK(strides[0] == strides[1]);
const cl::Image2D* bias_image = nullptr;
if (has_bias) {
bias_image = bias_gpu_image_.data<half_t, cl::Image2D>();
}
auto& context = ctx_->As<OpenCLContext>();
CHECK(context.cl_context() != nullptr);
STL::stringstream kernel_key;
kernel_key << kernel_func_names_[0] << build_options_[0];
auto kernel = context.cl_context()->GetKernel(kernel_key.str());
#ifndef LITE_SHUTDOWN_LOG
VLOG(4) << "kernel_key: " << kernel_key.str();
VLOG(4) << "kernel ready ... " << kernel_key.str();
#endif
cl_int status;
int arg_idx = 0;
status = kernel.setArg(arg_idx, c_block);
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, w_blk);
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, h_blk);
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, *input_image);
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, *filter_image);
CL_CHECK_FATAL(status);
if (has_bias) {
status = kernel.setArg(++arg_idx, *bias_image);
CL_CHECK_FATAL(status);
}
status = kernel.setArg(++arg_idx, *out_image);
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, strides[0]);
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, paddings[0]);
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, dilations[0]);
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, batch);
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, input_channel);
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, input_width);
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, input_height);
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, output_width);
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, output_height);
CL_CHECK_FATAL(status);
auto global_work_size =
cl::NDRange{static_cast<size_t>(default_work_size.data()[0]),
static_cast<size_t>(w_blk),
static_cast<size_t>(h_blk)};
// VLOG(4) << "out_image: " << out_image;
#ifndef LITE_SHUTDOWN_LOG
VLOG(4) << "global_work_size[3D]: {" << global_work_size[0] << ","
<< global_work_size[1] << "," << global_work_size[2] << "}";
#endif
size_t max_work_group_size = 0;
kernel.getWorkGroupInfo<size_t>(CLRuntime::Global()->device(),
CL_KERNEL_WORK_GROUP_SIZE,
&max_work_group_size);
cl::NDRange local_work_size = cl::NullRange;
#ifndef LITE_SHUTDOWN_LOG
VLOG(4) << "max_work_group_size: " << max_work_group_size;
#endif
if (max_work_group_size > 0 && use_lws) {
local_work_size = context.cl_context()->LocalWorkSize(global_work_size,
max_work_group_size);
#ifndef LITE_SHUTDOWN_LOG
VLOG(4) << "local_work_size[3D]: {" << local_work_size[0] << ","
<< local_work_size[1] << "," << local_work_size[2] << "}";
#endif
}
status = context.cl_context()->GetCommandQueue().enqueueNDRangeKernel(
kernel,
cl::NullRange,
global_work_size,
local_work_size,
nullptr,
event_.get());
CL_CHECK_FATAL(status);
context.cl_wait_list()->emplace(out_image, event_);
}
void ConvImageCompute::Conv2d7x7() {
const auto& param = *param_.get_mutable<param_t>();
auto input_dims = param.x->dims();
......@@ -912,6 +1163,7 @@ void ConvImageCompute::Conv2d7x7() {
int w = default_work_size[1];
int nh = default_work_size[2];
#ifndef LITE_SHUTDOWN_LOG
VLOG(4) << "============ conv2d params ============";
VLOG(4) << "input_image_shape: " << input_image_shape["width"] << ","
<< input_image_shape["height"];
......@@ -934,6 +1186,7 @@ void ConvImageCompute::Conv2d7x7() {
VLOG(4) << "default work size{c_block, w, nh}: "
<< "{" << c_block << ", " << w << ", " << nh << ""
<< "}";
#endif
CHECK_GE(dilations.size(), 2);
CHECK(dilations[0] == dilations[1]);
......@@ -953,9 +1206,12 @@ void ConvImageCompute::Conv2d7x7() {
STL::stringstream kernel_key;
kernel_key << kernel_func_names_[0] << build_options_[0];
auto kernel = context.cl_context()->GetKernel(kernel_key.str());
#ifndef LITE_SHUTDOWN_LOG
VLOG(4) << "kernel_key: " << kernel_key.str();
VLOG(4) << "kernel ready ... " << kernel_key.str();
VLOG(4) << "w: " << w;
#endif
cl_int status;
int arg_idx = 0;
......@@ -970,7 +1226,9 @@ void ConvImageCompute::Conv2d7x7() {
status = kernel.setArg(++arg_idx, *filter_image);
CL_CHECK_FATAL(status);
if (has_bias) {
#ifndef LITE_SHUTDOWN_LOG
VLOG(4) << "set bias_image: ";
#endif
status = kernel.setArg(++arg_idx, *bias_image);
CL_CHECK_FATAL(status);
}
......@@ -1000,9 +1258,11 @@ void ConvImageCompute::Conv2d7x7() {
static_cast<size_t>(default_work_size.data()[1]),
static_cast<size_t>(default_work_size.data()[2])};
#ifndef LITE_SHUTDOWN_LOG
// VLOG(4) << "out_image: " << out_image;
VLOG(4) << "global_work_size[3D]: {" << global_work_size[0] << ","
<< global_work_size[1] << "," << global_work_size[2] << "}";
#endif
status = context.cl_context()->GetCommandQueue().enqueueNDRangeKernel(
kernel,
......@@ -1014,7 +1274,167 @@ void ConvImageCompute::Conv2d7x7() {
CL_CHECK_FATAL(status);
context.cl_wait_list()->emplace(out_image, event_);
}
void ConvImageCompute::Conv2d7x7opt() {
const auto& param = *param_.get_mutable<param_t>();
auto input_dims = param.x->dims();
auto paddings = *param.paddings;
auto strides = param.strides;
auto dilations = *param.dilations;
auto* input_image = param.x->data<half_t, cl::Image2D>();
auto* filter_image = filter_gpu_image_.data<half_t, cl::Image2D>();
auto filter_dims = param.filter->dims();
auto output_dims = param.output->dims();
int input_width = input_dims[3];
int input_height = input_dims[2];
int input_channel = input_dims[1];
int output_width = output_dims[3];
int output_height = output_dims[2];
int output_channel = output_dims[1];
CHECK_EQ(input_dims[0], output_dims[0]);
int batch = input_dims[0];
auto out_image_shape = InitImageDimInfoWith(output_dims);
auto* out_image = param.output->mutable_data<half_t, cl::Image2D>(
out_image_shape["width"], out_image_shape["height"]);
const bool has_bias = param.bias != nullptr;
const bool is_element_wise_bias =
has_bias && param.output->dims() == param.bias->dims();
const std::vector<size_t>& default_work_size =
DefaultWorkSize(output_dims,
DDim(std::vector<DDim::value_type>{
static_cast<int64_t>(out_image_shape["width"]),
static_cast<int64_t>(out_image_shape["height"])}));
int c_block = default_work_size[0];
int w = default_work_size[1];
int nh = default_work_size[2];
int w_blk_size = 5;
int w_blk = (w + w_blk_size - 1) / w_blk_size;
// default_work_size[1] = w_blk;
int h_blk_size = 1;
int h_blk = (nh + h_blk_size - 1) / h_blk_size;
// default_work_size[2] = h_blk;
#ifndef LITE_SHUTDOWN_LOG
VLOG(4) << "============ conv2d 7x7 params ============";
// VLOG(4) << "input_image_shape: " << input_image_shape["width"] << ","
// << input_image_shape["height"];
// VLOG(4) << "input_image: " << input_image;
VLOG(4) << "input_dims: " << input_dims;
VLOG(4) << "filter_dims: " << filter_dims;
// VLOG(4) << "filter_image: " << filter_image;
VLOG(4) << "output_dims: " << output_dims;
VLOG(4) << "out_image_shape: " << out_image_shape["width"] << ", "
<< out_image_shape["height"];
VLOG(4) << "paddings: " << paddings[0] << "," << paddings[1];
VLOG(4) << "has bias: " << has_bias;
VLOG(4) << "is_element_wise_bias : " << is_element_wise_bias;
VLOG(4) << "strides: " << strides[0] << "," << strides[1];
VLOG(4) << "dilations.size : " << dilations.size();
VLOG(4) << "dilations: " << dilations[0] << ", " << dilations[1];
VLOG(4) << "default work size{c_block, w, nh}: "
<< "{" << c_block << ", " << w << ", " << nh << ""
<< "}";
#endif
CHECK_GE(dilations.size(), 2);
CHECK(dilations[0] == dilations[1]);
CHECK_GE(input_dims.size(), 4);
CHECK_GE(paddings.size(), 2);
CHECK(paddings[0] == paddings[1]);
CHECK_GE(strides.size(), 2);
CHECK(strides[0] == strides[1]);
const cl::Image2D* bias_image = nullptr;
if (has_bias) {
bias_image = bias_gpu_image_.data<half_t, cl::Image2D>();
}
auto& context = ctx_->As<OpenCLContext>();
CHECK(context.cl_context() != nullptr);
STL::stringstream kernel_key;
kernel_key << kernel_func_names_[0] << build_options_[0];
auto kernel = context.cl_context()->GetKernel(kernel_key.str());
#ifndef LITE_SHUTDOWN_LOG
VLOG(4) << "kernel_key: " << kernel_key.str();
VLOG(4) << "kernel ready ... " << kernel_key.str();
#endif
cl_int status;
int arg_idx = 0;
status = kernel.setArg(arg_idx, c_block);
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, w_blk);
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, h_blk);
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, *input_image);
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, *filter_image);
CL_CHECK_FATAL(status);
if (has_bias) {
status = kernel.setArg(++arg_idx, *bias_image);
CL_CHECK_FATAL(status);
}
status = kernel.setArg(++arg_idx, *out_image);
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, strides[0]);
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, paddings[0]);
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, dilations[0]);
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, batch);
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, input_channel);
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, input_width);
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, input_height);
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, output_width);
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, output_height);
CL_CHECK_FATAL(status);
auto global_work_size =
cl::NDRange{static_cast<size_t>(default_work_size.data()[0]),
static_cast<size_t>(w_blk),
static_cast<size_t>(h_blk)};
#ifndef LITE_SHUTDOWN_LOG
VLOG(4) << "global_work_size[3D]: {" << global_work_size[0] << ","
<< global_work_size[1] << "," << global_work_size[2] << "}";
#endif
size_t max_work_group_size = 0;
kernel.getWorkGroupInfo<size_t>(CLRuntime::Global()->device(),
CL_KERNEL_WORK_GROUP_SIZE,
&max_work_group_size);
cl::NDRange local_work_size = cl::NullRange;
if (max_work_group_size > 0 && use_lws) {
local_work_size = context.cl_context()->LocalWorkSize(global_work_size,
max_work_group_size);
#ifndef LITE_SHUTDOWN_LOG
VLOG(4) << "local_work_size[3D]: {" << local_work_size[0] << ","
<< local_work_size[1] << "," << local_work_size[2] << "}";
#endif
}
status = context.cl_context()->GetCommandQueue().enqueueNDRangeKernel(
kernel,
cl::NullRange,
global_work_size,
local_work_size,
nullptr,
event_.get());
CL_CHECK_FATAL(status);
context.cl_wait_list()->emplace(out_image, event_);
}
void ConvImageCompute::DepthwiseConv2d3x3s1() {
const auto& param = *param_.get_mutable<param_t>();
auto x_dims = param.x->dims();
......@@ -1071,7 +1491,9 @@ void ConvImageCompute::DepthwiseConv2d3x3s1() {
const cl::Image2D* bias_image = nullptr;
if (has_bias) {
bias_image = bias_gpu_image_.data<half_t, cl::Image2D>();
#ifndef LITE_SHUTDOWN_LOG
VLOG(4) << "set bias_image: ";
#endif
status = kernel.setArg(++arg_idx, *bias_image);
CL_CHECK_FATAL(status);
}
......@@ -1099,12 +1521,16 @@ void ConvImageCompute::DepthwiseConv2d3x3s1() {
CL_KERNEL_WORK_GROUP_SIZE,
&max_work_group_size);
cl::NDRange local_work_size = cl::NullRange;
#ifndef LITE_SHUTDOWN_LOG
VLOG(4) << "max_work_group_size: " << max_work_group_size;
#endif
if (max_work_group_size > 0 && use_lws) {
local_work_size = context.cl_context()->LocalWorkSize(global_work_size,
max_work_group_size);
#ifndef LITE_SHUTDOWN_LOG
VLOG(4) << "local_work_size[3D]: {" << local_work_size[0] << ","
<< local_work_size[1] << "," << local_work_size[2] << "}";
#endif
}
status = context.cl_context()->GetCommandQueue().enqueueNDRangeKernel(
......@@ -1153,6 +1579,7 @@ void ConvImageCompute::DepthwiseConv2d3x3() {
int nh = output_dims[0] * output_dims[2];
auto global_work_size = cl::NDRange(c_block, w, nh);
#ifndef LITE_SHUTDOWN_LOG
VLOG(4) << "setArg";
VLOG(4) << "c_block = " << c_block;
VLOG(4) << "w = " << w;
......@@ -1166,6 +1593,7 @@ void ConvImageCompute::DepthwiseConv2d3x3() {
VLOG(4) << "x_dims[2] = " << x_dims[2];
VLOG(4) << "output_dims[3] = " << output_dims[3];
VLOG(4) << "output_dims[2] = " << output_dims[2];
#endif
cl_int status;
int arg_idx = 0;
......@@ -1185,7 +1613,9 @@ void ConvImageCompute::DepthwiseConv2d3x3() {
const cl::Image2D* bias_image = nullptr;
if (has_bias) {
bias_image = bias_gpu_image_.data<half_t, cl::Image2D>();
#ifndef LITE_SHUTDOWN_LOG
VLOG(4) << "set bias_image: ";
#endif
status = kernel.setArg(++arg_idx, *bias_image);
CL_CHECK_FATAL(status);
}
......@@ -1261,6 +1691,7 @@ void ConvImageCompute::DepthwiseConv2d() {
int w = default_work_size[1];
int nh = default_work_size[2];
#ifndef LITE_SHUTDOWN_LOG
VLOG(4) << "============ depthwise conv2d params ============";
VLOG(4) << "input_image_shape: " << input_image_shape["width"] << ","
<< input_image_shape["height"];
......@@ -1282,6 +1713,7 @@ void ConvImageCompute::DepthwiseConv2d() {
VLOG(4) << "default work size{c_block, w, nh}: "
<< "{" << c_block << ", " << w << ", " << nh << ""
<< "}";
#endif
CHECK_GE(dilations.size(), 2);
CHECK(dilations[0] == dilations[1]);
......@@ -1303,9 +1735,12 @@ void ConvImageCompute::DepthwiseConv2d() {
STL::stringstream kernel_key;
kernel_key << kernel_func_names_[0] << build_options_[0];
auto kernel = context.cl_context()->GetKernel(kernel_key.str());
#ifndef LITE_SHUTDOWN_LOG
VLOG(4) << "kernel_key: " << kernel_key.str();
VLOG(4) << "kernel ready ... " << kernel_key.str();
VLOG(4) << "w: " << w;
#endif
cl_int status;
int arg_idx = 0;
......@@ -1320,7 +1755,9 @@ void ConvImageCompute::DepthwiseConv2d() {
status = kernel.setArg(++arg_idx, *filter_image);
CL_CHECK_FATAL(status);
if (has_bias) {
#ifndef LITE_SHUTDOWN_LOG
VLOG(4) << "set bias_image: ";
#endif
status = kernel.setArg(++arg_idx, *bias_image);
CL_CHECK_FATAL(status);
}
......@@ -1354,9 +1791,11 @@ void ConvImageCompute::DepthwiseConv2d() {
static_cast<size_t>(default_work_size.data()[1]),
static_cast<size_t>(default_work_size.data()[2])};
#ifndef LITE_SHUTDOWN_LOG
// VLOG(4) << "out_image: " << out_image;
VLOG(4) << "global_work_size[3D]: {" << global_work_size[0] << ","
<< global_work_size[1] << "," << global_work_size[2] << "}";
#endif
status = context.cl_context()->GetCommandQueue().enqueueNDRangeKernel(
kernel,
......
......@@ -41,11 +41,13 @@ class ConvImageCompute : public KernelLite<TARGET(kOpenCL),
void Run() override;
private:
void Conv2d1x1();
void Conv2d1x1opt();
void Conv2d3x3();
void Conv2d3x3opt();
void Conv2d5x5();
void Conv2d5x5opt();
void Conv2d7x7();
void Conv2d7x7opt();
void DepthwiseConv2d3x3s1();
void DepthwiseConv2d3x3();
void DepthwiseConv2d();
......
......@@ -510,7 +510,7 @@ TEST(conv2d, compute_image2d_3x3) {
const int dilation = 1;
const int stride = 2;
const int group = 1;
for (int batch_size = 1; batch_size < 2; ++batch_size) {
for (int batch_size = 1; batch_size < 4; ++batch_size) {
for (int oc = 1; oc < 10; oc += 1) { // oc
for (int ih = 5; ih < 9; ih += 1) { // ih
int iw = ih;
......@@ -532,7 +532,7 @@ const int stride = 2;
#else // big scale with group
const int stride = 1;
const int group = 32 / 1;
const int batch_size = 1;
const int batch_size = 2;
const int ic = 32 / 1;
const int ih = 112 / 1;
const int iw = 112 / 1;
......@@ -558,7 +558,8 @@ const int stride = 2;
PRECISION(kFP16),
DATALAYOUT(kImageDefault));
ASSERT_FALSE(kernels.empty());
CHECK(batch_size == 1) << "conv3x3 only supprt batch_size == 1";
// CHECK(batch_size == 1) << "conv3x3 only supports
// batch_size == 1";
auto kernel = std::move(kernels.front());
SHADOW_LOG << "created conv2d kernel";
......@@ -886,13 +887,14 @@ TEST(conv2d, compute_image2d_5x5) {
// int loop_cnt = 0;
#ifdef LOOP_TEST
for (int batch_size = 2; batch_size < 4; ++batch_size) {
for (int oc = 1; oc < 10; oc += 1) { // oc
for (int ih = 5; ih < 9; ih += 1) { // ih
for (int batch_size = 1; batch_size < 4; ++batch_size) {
for (int oc = 1; oc < 5; oc += 1) { // oc
for (int ih = 5; ih < 8; ih += 1) { // ih
int iw = ih;
for (int ic = 2; ic < 10; ic += 1) { // ic
for (int ic = 2; ic < 6; ic += 1) { // ic
for (bool bias_flag : {true, false}) {
for (std::string relu_flag : {/*true,*/ "relu"}) {
for (std::string relu_flag : {"", "relu"}) {
#else
const int batch_size = 2;
const int oc = 1;
......@@ -1006,10 +1008,10 @@ TEST(conv2d, compute_image2d_5x5) {
SHADOW_LOG << "gen input and filter ...";
for (auto& i : input_v) {
i = 0.01 * gen(engine);
i = 0.5 * gen(engine);
}
for (auto& f : filter_v) {
f = 0.01 * gen(engine);
f = 0.5 * gen(engine);
}
SHADOW_LOG << "after gen input and filter ...";
......@@ -1216,9 +1218,10 @@ TEST(conv2d, compute_image2d_5x5) {
#undef LOOP_TEST
#undef PRINT_RESULT
#endif
#ifdef TEST_CONV_IMAGE_7x7
#undef FP16_ABS_DIFF
#define FP16_ABS_DIFF (1e0)
// #undef FP16_ABS_DIFF
// #define FP16_ABS_DIFF (1e-1)
// #define LOOP_TEST
TEST(conv2d, compute_image2d_7x7) {
// conv infos
......@@ -1230,13 +1233,13 @@ TEST(conv2d, compute_image2d_7x7) {
// int loop_cnt = 0;
#ifdef LOOP_TEST
for (int batch_size = 2; batch_size < 4; ++batch_size) {
for (int oc = 1; oc < 10; oc += 1) { // oc
for (int ih = 7; ih < 15; ih += 1) { // ih
for (int batch_size = 1; batch_size < 4; ++batch_size) {
for (int oc = 1; oc < 6; oc += 1) { // oc
for (int ih = 7; ih < 8; ih += 1) { // ih
int iw = ih;
for (int ic = 2; ic < 10; ic += 1) { // ic
for (bool bias_flag : {true, false}) {
for (std::string relu_flag : {"relu"}) {
for (int ic = 2; ic < 4; ic += 1) { // ic
for (bool bias_flag : {false, true}) {
for (std::string relu_flag : {"", "relu"}) {
#else
const int batch_size = 2;
const int oc = 1;
......@@ -1343,14 +1346,16 @@ TEST(conv2d, compute_image2d_7x7) {
SHADOW_LOG << "gen input and filter ...";
for (auto& i : input_v) {
i = gen(engine);
i = 0.1 * gen(engine);
#ifdef TEST_CONV_IMAGE_ALL_1
i = 1;
#endif
}
int fiii = 1;
for (auto& f : filter_v) {
f = gen(engine);
f = 0.1 * gen(engine);
#ifdef TEST_CONV_IMAGE_ALL_1
// f = fiii++;
f = 1;
#endif
}
......@@ -1424,7 +1429,8 @@ TEST(conv2d, compute_image2d_7x7) {
filter.Assign<float, lite::DDim, TARGET(kARM)>(filter_v.data(),
filter_dim);
// auto* filter_image2d = filter.mutable_data<float,
// auto* filter_image2d =
// filter.mutable_data < float,
// cl::Image2D>(
// filter_image_width,
// filter_image_height,
......
......@@ -41,9 +41,11 @@ void ElementwiseAddCompute::Run() {
STL::stringstream kernel_key;
kernel_key << kernel_func_name_ << build_options_;
auto kernel = context.cl_context()->GetKernel(kernel_key.str());
#ifndef LITE_SHUTDOWN_LOG
VLOG(4) << TargetToStr(ele_param_->X->target());
VLOG(4) << TargetToStr(ele_param_->Y->target());
VLOG(4) << TargetToStr(ele_param_->Out->target());
#endif
int arg_idx = 0;
cl_int status = kernel.setArg(arg_idx, *x_buf);
CL_CHECK_FATAL(status);
......@@ -87,10 +89,12 @@ void ElementwiseAddCompute::UpdateParams() {
for (int i = static_cast<int>(y_dims.size() + axis); i < x_dims.size(); ++i) {
num_ *= x_dims[i];
}
#ifndef LITE_SHUTDOWN_LOG
VLOG(4) << "axis: " << axis;
VLOG(4) << "batch: " << batch_;
VLOG(4) << "channels: " << channels_;
VLOG(4) << "num: " << num_;
#endif
}
} // namespace opencl
......
......@@ -62,6 +62,7 @@ void ElementwiseAddImageCompute::Run() {
auto* out = ele_param_->Out;
auto axis = ele_param_->axis;
#ifndef LITE_SHUTDOWN_LOG
VLOG(4) << "x->target():" << TargetToStr(x->target());
VLOG(4) << "y->target():" << TargetToStr(y->target());
VLOG(4) << "out->target():" << TargetToStr(out->target());
......@@ -69,6 +70,7 @@ void ElementwiseAddImageCompute::Run() {
VLOG(4) << "y->dims():" << y->dims();
VLOG(4) << "out->dims():" << out->dims();
VLOG(4) << "axis:" << axis;
#endif
paddle::lite::CLImageConverterDefault default_convertor;
auto x_img_shape = default_convertor.InitImageDimInfoWith(x->dims()); // w, h
......@@ -83,10 +85,12 @@ void ElementwiseAddImageCompute::Run() {
auto* out_img = out->mutable_data<half_t, cl::Image2D>(out_img_shape[0],
out_img_shape[1]);
#ifndef LITE_SHUTDOWN_LOG
VLOG(4) << "x_img_shape[w,h]:" << x_img_width << " " << x_img_height;
VLOG(4) << "y_img_shape[w,h]:" << y_img_shape[0] << " " << y_img_shape[1];
VLOG(4) << "out_img_shape[w,h]:" << out_img_shape[0] << " "
<< out_img_shape[1];
#endif
STL::stringstream kernel_key;
kernel_key << kernel_func_name_ << build_options_;
......@@ -104,8 +108,9 @@ void ElementwiseAddImageCompute::Run() {
} else if (y_dims.size() == 1) {
if (axis == x->dims().size() - 1 || axis == x->dims().size() - 3) {
int tensor_w = x->dims()[x->dims().size() - 1];
#ifndef LITE_SHUTDOWN_LOG
VLOG(4) << "tensor_w:" << tensor_w;
#endif
cl_int status = kernel.setArg(arg_idx, *x_img);
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, *y_img);
......@@ -127,7 +132,9 @@ void ElementwiseAddImageCompute::Run() {
auto global_work_size = cl::NDRange{static_cast<cl::size_type>(x_img_width),
static_cast<cl::size_type>(x_img_height)};
#ifndef LITE_SHUTDOWN_LOG
VLOG(4) << "global_work_size:[2D]:" << x_img_width << " " << x_img_height;
#endif
auto status = context.cl_context()->GetCommandQueue().enqueueNDRangeKernel(
kernel,
cl::NullRange,
......
......@@ -56,7 +56,7 @@ class ElementwiseMulImageCompute
} else {
kernel_func_name_ = "channel_mul_d2_hw";
}
} else if (y_dims.size() == 4) {
} else if (y_dims.size() == 4 || x_dims.size() == 4) {
kernel_func_name_ = "channel_mul_d4";
} else {
LOG(FATAL) << "ElementwiseMul not supported y_dims.size():"
......@@ -80,12 +80,14 @@ class ElementwiseMulImageCompute
auto* y = ele_param_->Y;
auto* out = ele_param_->Out;
#ifndef LITE_SHUTDOWN_LOG
VLOG(4) << "x->target():" << TargetToStr(x->target());
VLOG(4) << "y->target():" << TargetToStr(y->target());
VLOG(4) << "out->target():" << TargetToStr(out->target());
VLOG(4) << "x->dims():" << x->dims();
VLOG(4) << "y->dims():" << y->dims();
VLOG(4) << "out->dims():" << out->dims();
#endif
paddle::lite::CLImageConverterDefault default_convertor;
auto x_img_shape =
......@@ -101,10 +103,12 @@ class ElementwiseMulImageCompute
auto* out_img = out->mutable_data<half_t, cl::Image2D>(out_img_shape[0],
out_img_shape[1]);
#ifndef LITE_SHUTDOWN_LOG
VLOG(4) << "x_img_shape[w,h]:" << x_img_width << " " << x_img_height;
VLOG(4) << "y_img_shape[w,h]:" << y_img_shape[0] << " " << y_img_shape[1];
VLOG(4) << "out_img_shape[w,h]:" << out_img_shape[0] << " "
<< out_img_shape[1];
#endif
STL::stringstream kernel_key;
kernel_key << kernel_func_name_ << build_options_;
......@@ -123,7 +127,9 @@ class ElementwiseMulImageCompute
CL_CHECK_FATAL(status);
} else if (y_dims.size() == 1 || y_dims.size() == 4) {
auto tensor_w = x_dims[x_dims.size() - 1];
#ifndef LITE_SHUTDOWN_LOG
VLOG(4) << "tensor_w:" << tensor_w;
#endif
// kernel: channel_mul_d1 / channel_mul_d4
cl_int status = kernel.setArg(arg_idx, *x_img);
CL_CHECK_FATAL(status);
......@@ -136,7 +142,9 @@ class ElementwiseMulImageCompute
} else if (y_dims.size() == 2) {
if (x_dims[0] == y_dims[0] && x_dims[1] == y_dims[1]) {
auto tensor_w = x_dims[x_dims.size() - 1];
#ifndef LITE_SHUTDOWN_LOG
VLOG(4) << "tensor_w:" << tensor_w;
#endif
// kernel: channel_mul_d2_nc
cl_int status = kernel.setArg(arg_idx, *x_img);
CL_CHECK_FATAL(status);
......@@ -149,7 +157,9 @@ class ElementwiseMulImageCompute
} else {
auto y_tensor_h = y->dims()[0];
auto y_tensor_w = y->dims()[1];
#ifndef LITE_SHUTDOWN_LOG
VLOG(4) << "y_tensor_w:" << y_tensor_w << " y_tensor_h:" << y_tensor_h;
#endif
// kernel: channel_mul_d2_hw
cl_int status = kernel.setArg(arg_idx, *x_img);
CL_CHECK_FATAL(status);
......@@ -162,6 +172,18 @@ class ElementwiseMulImageCompute
status = kernel.setArg(++arg_idx, static_cast<const int>(y_tensor_h));
CL_CHECK_FATAL(status);
}
} else if (x_dims.size() == 4) {
auto tensor_w = y_dims[y_dims.size() - 1];
VLOG(4) << "tensor_w:" << tensor_w;
// kernel: channel_mul_d4
cl_int status = kernel.setArg(arg_idx, *y_img);
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, *x_img);
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, *out_img);
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, static_cast<const int>(tensor_w));
CL_CHECK_FATAL(status);
} else {
LOG(FATAL) << "ElementwiseMul not supported y_dims.size():"
<< y_dims.size();
......@@ -179,8 +201,9 @@ class ElementwiseMulImageCompute
event_.get());
CL_CHECK_FATAL(status);
context.cl_wait_list()->emplace(out_img, event_);
#ifndef LITE_SHUTDOWN_LOG
VLOG(4) << "global_work_size:[2D]:" << x_img_width << " " << x_img_height;
#endif
}
protected:
......
......@@ -62,6 +62,7 @@ void ElementwiseSubImageCompute::Run() {
auto* out = ele_param_->Out;
auto axis = ele_param_->axis;
#ifndef LITE_SHUTDOWN_LOG
VLOG(4) << "x->target():" << TargetToStr(x->target());
VLOG(4) << "y->target():" << TargetToStr(y->target());
VLOG(4) << "out->target():" << TargetToStr(out->target());
......@@ -69,6 +70,7 @@ void ElementwiseSubImageCompute::Run() {
VLOG(4) << "y->dims():" << y->dims();
VLOG(4) << "out->dims():" << out->dims();
VLOG(4) << "axis:" << axis;
#endif
paddle::lite::CLImageConverterDefault default_convertor;
auto x_img_shape = default_convertor.InitImageDimInfoWith(x->dims()); // w, h
......@@ -83,10 +85,12 @@ void ElementwiseSubImageCompute::Run() {
auto* out_img = out->mutable_data<half_t, cl::Image2D>(out_img_shape[0],
out_img_shape[1]);
#ifndef LITE_SHUTDOWN_LOG
VLOG(4) << "x_img_shape[w,h]:" << x_img_width << " " << x_img_height;
VLOG(4) << "y_img_shape[w,h]:" << y_img_shape[0] << " " << y_img_shape[1];
VLOG(4) << "out_img_shape[w,h]:" << out_img_shape[0] << " "
<< out_img_shape[1];
#endif
STL::stringstream kernel_key;
kernel_key << kernel_func_name_ << build_options_;
......@@ -104,8 +108,9 @@ void ElementwiseSubImageCompute::Run() {
} else if (y_dims.size() == 1) {
if (axis == x->dims().size() - 1 || axis == x->dims().size() - 3) {
int tensor_w = x->dims()[x->dims().size() - 1];
#ifndef LITE_SHUTDOWN_LOG
VLOG(4) << "tensor_w:" << tensor_w;
#endif
cl_int status = kernel.setArg(arg_idx, *x_img);
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, *y_img);
......@@ -127,7 +132,10 @@ void ElementwiseSubImageCompute::Run() {
auto global_work_size = cl::NDRange{static_cast<cl::size_type>(x_img_width),
static_cast<cl::size_type>(x_img_height)};
#ifndef LITE_SHUTDOWN_LOG
VLOG(4) << "global_work_size:[2D]:" << x_img_width << " " << x_img_height;
#endif
auto status = context.cl_context()->GetCommandQueue().enqueueNDRangeKernel(
kernel,
cl::NullRange,
......
......@@ -57,10 +57,12 @@ class GridSamplerImageCompute : public KernelLite<TARGET(kOpenCL),
auto out_dims = out->dims();
auto in_dims = x->dims();
#ifndef LITE_SHUTDOWN_LOG
VLOG(4) << "x->target():" << TargetToStr(x->target());
VLOG(4) << "out->target():" << TargetToStr(out->target());
VLOG(4) << "x->dims():" << in_dims;
VLOG(4) << "out->dims():" << out_dims;
#endif
auto out_image_shape = InitImageDimInfoWith(out_dims);
auto* x_img = x->data<half_t, cl::Image2D>();
......@@ -71,10 +73,11 @@ class GridSamplerImageCompute : public KernelLite<TARGET(kOpenCL),
auto* out_img = out->mutable_data<half_t, cl::Image2D>(
out_image_shape["width"], out_image_shape["height"]);
#ifndef LITE_SHUTDOWN_LOG
// VLOG(4) << "out_image" << out_img;
VLOG(4) << "out_image_shape[w,h]:" << out_image_shape["width"] << " "
<< out_image_shape["height"];
#endif
STL::stringstream kernel_key;
kernel_key << kernel_func_name_ << build_options_;
auto kernel = context.cl_context()->GetKernel(kernel_key.str());
......@@ -87,8 +90,10 @@ class GridSamplerImageCompute : public KernelLite<TARGET(kOpenCL),
DDim(std::vector<DDim::value_type>{
static_cast<int64_t>(out_image_shape["width"]),
static_cast<int64_t>(out_image_shape["height"])}));
#ifndef LITE_SHUTDOWN_LOG
VLOG(4) << "default_work_size: " << default_work_size[0] << ", "
<< default_work_size[1] << ", " << default_work_size[2];
#endif
cl_int status = kernel.setArg(arg_idx++, *x_img);
CL_CHECK_FATAL(status);
status = kernel.setArg(arg_idx++, *grid_img);
......@@ -114,9 +119,10 @@ class GridSamplerImageCompute : public KernelLite<TARGET(kOpenCL),
event_.get());
CL_CHECK_FATAL(status);
context.cl_wait_list()->emplace(out_img, event_);
#ifndef LITE_SHUTDOWN_LOG
VLOG(4) << "global_work_size:[3D]:" << global_work_size[0] << " "
<< global_work_size[1] << " " << global_work_size[2];
#endif
}
protected:
......
......@@ -89,19 +89,23 @@ class InstanceNormImageCompute : public KernelLite<TARGET(kOpenCL),
int in_h = in_dims[2];
int in_w = in_dims[3];
#ifndef LITE_SHUTDOWN_LOG
VLOG(4) << "x->target():" << TargetToStr(x->target());
VLOG(4) << "out->target():" << TargetToStr(out->target());
VLOG(4) << "x->dims():" << in_dims;
#endif
auto out_image_shape = InitImageDimInfoWith(in_dims);
auto* x_img = x->data<half_t, cl::Image2D>();
auto* out_img = out->mutable_data<half_t, cl::Image2D>(
out_image_shape["width"], out_image_shape["height"]);
#ifndef LITE_SHUTDOWN_LOG
VLOG(4) << "out_image_shape[w,h]: " << out_image_shape["width"] << " "
<< out_image_shape["height"];
VLOG(4) << "in_h: " << in_h << ", in_w: " << in_w;
#endif
int threads = 512;
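// Channels are packed four per image pixel, so group_size_x is the number of
// 4-channel slices, i.e. ceil(channel / 4).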
int group_size_x = (channel + 3) / 4;
......@@ -113,10 +117,13 @@ class InstanceNormImageCompute : public KernelLite<TARGET(kOpenCL),
cl::NDRange{static_cast<cl::size_type>(group_size_x * threads),
static_cast<cl::size_type>(group_size_y),
static_cast<cl::size_type>(1)};
#ifndef LITE_SHUTDOWN_LOG
VLOG(4) << "local_work_size:[3D]:" << local_work_size[0] << " "
<< local_work_size[1] << " " << local_work_size[2];
VLOG(4) << "global_work_size:[3D]:" << global_work_size[0] << " "
<< global_work_size[1] << " " << global_work_size[2];
#endif
STL::stringstream kernel_key;
kernel_key << kernel_func_name_ << build_options_;
......
......@@ -42,11 +42,13 @@ class IoCopyHostToOpenCLCompute
CHECK(param.x->target() == TARGET(kHost) ||
param.x->target() == TARGET(kARM));
auto mem_size = param.x->memory_size();
#ifndef LITE_SHUTDOWN_LOG
VLOG(2) << "param.x->memory_size():" << mem_size;
VLOG(2) << "param.x->dims().size():" << param.x->dims().size();
VLOG(2) << "param.x->dims():" << param.x->dims();
VLOG(2) << "param.y->dims().size():" << param.y->dims().size();
VLOG(2) << "param.y->dims():" << param.y->dims();
#endif
auto* data = param.y->mutable_data(TARGET(kOpenCL), mem_size);
CopyFromHostSync(data, param.x->raw_data(), mem_size);
}
......@@ -85,12 +87,14 @@ class IoCopykOpenCLToHostCompute
CHECK(param.x->target() == TARGET(kOpenCL));
auto mem_size = param.x->memory_size();
#ifndef LITE_SHUTDOWN_LOG
VLOG(2) << "copy size " << mem_size;
VLOG(2) << "param.x->dims().size():" << param.x->dims().size();
VLOG(2) << "param.x->dims():" << param.x->dims();
VLOG(2) << "param.y->dims().size():" << param.y->dims().size();
VLOG(2) << "param.y->dims():" << param.y->dims();
VLOG(2) << "param.process_type:" << param.process_type;
#endif
auto* data = param.y->mutable_data(TARGET(kHost), mem_size);
const cl::Buffer* x_ptr;
......@@ -104,7 +108,9 @@ class IoCopykOpenCLToHostCompute
auto* wait_list = context.cl_wait_list();
auto it = wait_list->find(x_ptr);
if (it != wait_list->end()) {
#ifndef LITE_SHUTDOWN_LOG
VLOG(2) << "--- Find the sync event for the target cl tensor. ---";
#endif
auto& event = *(it->second);
event.wait();
} else {
......
......@@ -74,6 +74,7 @@ class LayoutComputeBufferChwToImageDefault
const int Stride1 = out_H * out_W;
const int Stride0 = out_W;
#ifndef LITE_SHUTDOWN_LOG
VLOG(2) << "param.process_type:" << param.process_type;
VLOG(2) << "x_dims:" << x_dims;
VLOG(2) << "param.x->memory_size():" << param.x->memory_size();
......@@ -89,6 +90,7 @@ class LayoutComputeBufferChwToImageDefault
VLOG(2) << "Stride2:" << Stride2;
VLOG(2) << "Stride1:" << Stride1;
VLOG(2) << "Stride0:" << Stride0;
#endif
auto& context = ctx_->As<OpenCLContext>();
CHECK(context.cl_context() != nullptr);
......@@ -177,6 +179,7 @@ class LayoutComputeImageDefaultToBufferChw
new_dims[4 - x_dims.size() + j] = x_dims[j];
}
#ifndef LITE_SHUTDOWN_LOG
VLOG(2) << "param.process_type:" << param.process_type;
VLOG(2) << "x_dims:" << x_dims;
VLOG(2) << "param.x->memory_size():" << param.x->memory_size();
......@@ -186,6 +189,7 @@ class LayoutComputeImageDefaultToBufferChw
<< new_dims[1] << " " << new_dims[2] << " " << new_dims[3];
VLOG(2) << "y_dims:" << y_dims;
VLOG(2) << "param.y->memory_size():" << param.y->memory_size();
#endif
size_t C = new_dims[1];
size_t in_height = new_dims[2];
......@@ -217,8 +221,10 @@ class LayoutComputeImageDefaultToBufferChw
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, static_cast<const int>(C));
CL_CHECK_FATAL(status);
#ifndef LITE_SHUTDOWN_LOG
VLOG(2) << "gws:[3D]" << ((new_dims[1] + 3) / 4) << " " << new_dims[3]
<< " " << (new_dims[0] * new_dims[2]);
#endif
auto global_work_size =
cl::NDRange{static_cast<cl::size_type>((new_dims[1] + 3) / 4),
static_cast<cl::size_type>(new_dims[3]),
......
......@@ -65,6 +65,7 @@ class LrnImageCompute : public KernelLite<TARGET(kOpenCL),
auto out_dims = out->dims();
auto in_dims = x->dims();
#ifndef LITE_SHUTDOWN_LOG
VLOG(4) << "x->target(): " << TargetToStr(x->target());
VLOG(4) << "out->target(): " << TargetToStr(out->target());
VLOG(4) << "x->dims(): " << in_dims;
......@@ -74,6 +75,7 @@ class LrnImageCompute : public KernelLite<TARGET(kOpenCL),
VLOG(4) << "alpha: " << alpha_;
VLOG(4) << "beta: " << beta_;
VLOG(4) << "norm_region: " << norm_region_;
#endif
auto out_image_shape = InitImageDimInfoWith(out_dims);
auto* x_img = x->data<half_t, cl::Image2D>();
......@@ -81,9 +83,12 @@ class LrnImageCompute : public KernelLite<TARGET(kOpenCL),
auto* out_img = out->mutable_data<half_t, cl::Image2D>(
out_image_shape["width"], out_image_shape["height"]);
#ifndef LITE_SHUTDOWN_LOG
// VLOG(4) << "out_image" << out_img;
VLOG(4) << "out_image_shape[w,h]:" << out_image_shape["width"] << " "
<< out_image_shape["height"];
#endif
STL::stringstream kernel_key;
kernel_key << kernel_func_name_ << build_options_;
......@@ -97,8 +102,10 @@ class LrnImageCompute : public KernelLite<TARGET(kOpenCL),
DDim(std::vector<DDim::value_type>{
static_cast<int64_t>(out_image_shape["width"]),
static_cast<int64_t>(out_image_shape["height"])}));
#ifndef LITE_SHUTDOWN_LOG
VLOG(4) << "default_work_size: " << default_work_size[0] << ", "
<< default_work_size[1] << ", " << default_work_size[2];
#endif
cl_int status = kernel.setArg(arg_idx++, *x_img);
CL_CHECK_FATAL(status);
status = kernel.setArg(arg_idx++, *out_img);
......@@ -130,9 +137,10 @@ class LrnImageCompute : public KernelLite<TARGET(kOpenCL),
event_.get());
CL_CHECK_FATAL(status);
context.cl_wait_list()->emplace(out_img, event_);
#ifndef LITE_SHUTDOWN_LOG
VLOG(4) << "global_work_size:[3D]:" << global_work_size[0] << " "
<< global_work_size[1] << " " << global_work_size[2];
#endif
}
protected:
......
......@@ -87,6 +87,7 @@ class NearestInterpComputeImageDefault
status = kernel.setArg(++arg_idx, static_cast<const int>(out_dims_w));
CL_CHECK_FATAL(status);
#ifndef LITE_SHUTDOWN_LOG
VLOG(4) << TargetToStr(param.X->target());
VLOG(4) << TargetToStr(param.Out->target());
VLOG(4) << "out_image_shape(w,h):" << out_image_shape["width"] << " "
......@@ -95,6 +96,7 @@ class NearestInterpComputeImageDefault
<< x_dims[1] << " " << x_dims[2] << " " << x_dims[3];
VLOG(4) << "y_dims[" << y_dims.size() << "D]:" << y_dims[0] << " "
<< y_dims[1] << " " << y_dims[2] << " " << y_dims[3];
#endif
const std::vector<size_t>& default_work_size =
DefaultWorkSize(y_dims,
......
......@@ -71,10 +71,12 @@ class Pad2dCompute : public KernelLite<TARGET(kOpenCL),
int out_h = out_dims[2];
int out_w = out_dims[3];
#ifndef LITE_SHUTDOWN_LOG
VLOG(4) << "x->target():" << TargetToStr(x->target());
VLOG(4) << "out->target():" << TargetToStr(out->target());
VLOG(4) << "x->dims():" << in_dims;
VLOG(4) << "out->dims():" << out_dims;
#endif
auto out_image_shape = InitImageDimInfoWith(out_dims);
auto* x_img = x->data<half_t, cl::Image2D>();
......@@ -82,11 +84,13 @@ class Pad2dCompute : public KernelLite<TARGET(kOpenCL),
auto* out_img = out->mutable_data<half_t, cl::Image2D>(
out_image_shape["width"], out_image_shape["height"]);
#ifndef LITE_SHUTDOWN_LOG
VLOG(4) << "out_image_shape[w,h]: " << out_image_shape["width"] << " "
<< out_image_shape["height"];
VLOG(4) << "in_h: " << in_h << ", in_w: " << in_w;
VLOG(4) << "out_h: " << out_h << ", out_w: " << out_w;
#endif
STL::stringstream kernel_key;
kernel_key << kernel_func_name_ << build_options_;
......@@ -98,9 +102,10 @@ class Pad2dCompute : public KernelLite<TARGET(kOpenCL),
DDim(std::vector<DDim::value_type>{
static_cast<int64_t>(out_image_shape["width"]),
static_cast<int64_t>(out_image_shape["height"])}));
#ifndef LITE_SHUTDOWN_LOG
VLOG(4) << "default_work_size: " << default_work_size[0] << ", "
<< default_work_size[1] << ", " << default_work_size[2];
#endif
int pad_h0 = pad2d_param_->paddings[0];
int pad_h1 = pad2d_param_->paddings[1];
int pad_w0 = pad2d_param_->paddings[2];
......@@ -144,9 +149,10 @@ class Pad2dCompute : public KernelLite<TARGET(kOpenCL),
event_.get());
CL_CHECK_FATAL(status);
context.cl_wait_list()->emplace(out_img, event_);
#ifndef LITE_SHUTDOWN_LOG
VLOG(4) << "global_work_size:[3D]:" << global_work_size[0] << " "
<< global_work_size[1] << " " << global_work_size[2];
#endif
}
protected:
......
......@@ -89,7 +89,7 @@ void pad2d_ref(const float *x_data,
}
}
#define LOOP_TEST
// #define LOOP_TEST
// #define PRINT_RESULT
TEST(pad2d_image2d, compute) {
LOG(INFO) << "main steps of test: host -> layout(buf2img) -> "
......
......@@ -59,10 +59,14 @@ class PoolComputeImage2D : public KernelLite<TARGET(kOpenCL),
std::vector<int> paddings = *param.paddings;
std::vector<int> strides = param.strides;
std::vector<int> ksize = param.ksize;
#ifndef LITE_SHUTDOWN_LOG
VLOG(4) << "global_pooling: " << global_pooling;
VLOG(4) << "pooling_type: " << pooling_type;
VLOG(4) << "paddings : " << paddings[0] << " " << paddings[1] << " "
<< paddings[2] << " " << paddings[3] << " ";
#endif
if (global_pooling) {
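// Global pooling covers the whole spatial extent: zero the paddings and use the
// input H/W as the kernel size.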
for (size_t i = 0; i < ksize.size(); ++i) {
paddings[2 * i] = 0;
......@@ -70,6 +74,8 @@ class PoolComputeImage2D : public KernelLite<TARGET(kOpenCL),
ksize[i] = static_cast<int>(in_dims[i + 2]);
}
}
#ifndef LITE_SHUTDOWN_LOG
VLOG(4) << "in_dims : [" << in_dims.size() << "]" << in_dims[0] << " "
<< in_dims[1] << " " << in_dims[2] << " " << in_dims[3];
VLOG(4) << "out_dims : [" << out_dims.size() << "]" << out_dims[0] << " "
......@@ -82,6 +88,8 @@ class PoolComputeImage2D : public KernelLite<TARGET(kOpenCL),
<< ksize[1] << " " << ksize[2] << " " << ksize[3];
VLOG(4) << "paddings : [" << paddings.size() << "]" << paddings[0] << " "
<< paddings[1] << " " << paddings[2] << " " << paddings[3];
#endif
bool pads_equal =
(paddings[0] == paddings[1]) && (paddings[2] == paddings[3]);
if (!pads_equal) {
......@@ -95,8 +103,10 @@ class PoolComputeImage2D : public KernelLite<TARGET(kOpenCL),
// VLOG(4) << "x_image" << x_img;
auto out_image_shape = InitImageDimInfoWith(out_dims);
#ifndef LITE_SHUTDOWN_LOG
VLOG(4) << "out_image_shape = " << out_image_shape["width"] << " "
<< out_image_shape["height"];
#endif
auto* out_img = param.output->mutable_data<half_t, cl::Image2D>(
out_image_shape["width"], out_image_shape["height"]);
// VLOG(4) << "out_image" << out_img;
......@@ -109,8 +119,10 @@ class PoolComputeImage2D : public KernelLite<TARGET(kOpenCL),
int w = out_dims[3];
int nh = out_dims[0] * out_dims[2];
auto global_work_size = cl::NDRange(c_block, w, nh);
#ifndef LITE_SHUTDOWN_LOG
VLOG(4) << "global_work_size : [" << 3 << "]" << c_block << " " << w
<< " " << nh << " ";
#endif
cl_int status;
int arg_idx = 0;
status = kernel.setArg(arg_idx, *x_img);
......
......@@ -41,8 +41,6 @@ class ReshapeComputeFloatImage : public KernelLite<TARGET(kOpenCL),
}
void Run() override {
VLOG(4) << "reshape_compute run ... ";
auto& param = *param_.get_mutable<param_t>();
const Tensor* const x = param.x;
......@@ -64,8 +62,9 @@ class ReshapeComputeFloatImage : public KernelLite<TARGET(kOpenCL),
InitImageDimInfoWith(out_dims);
cl::Image2D* const out_image = output->mutable_data<half_t, cl::Image2D>(
out_image_shape.at("width"), out_image_shape.at("height"));
#ifndef LITE_SHUTDOWN_LOG
VLOG(4) << "out_dims= " << out_dims;
#endif
const std::vector<size_t>& default_work_size = DefaultWorkSize(
out_dims,
DDim(std::vector<DDim::value_type>{
......@@ -94,6 +93,8 @@ class ReshapeComputeFloatImage : public KernelLite<TARGET(kOpenCL),
int out_Stride0 = out_W;
int out_Stride1 = out_H * out_W;
int out_Stride2 = out_C * out_H * out_W;
#ifndef LITE_SHUTDOWN_LOG
VLOG(4) << "out_C=" << out_C;
VLOG(4) << "out_H=" << out_H;
VLOG(4) << "out_W=" << out_W;
......@@ -104,17 +105,20 @@ class ReshapeComputeFloatImage : public KernelLite<TARGET(kOpenCL),
VLOG(4) << "in_Stride1=" << in_Stride1;
VLOG(4) << "out_Stride0=" << out_Stride0;
VLOG(4) << "out_Stride1=" << out_Stride1;
#endif
auto& context = ctx_->As<OpenCLContext>();
CHECK(context.cl_context() != nullptr);
STL::stringstream kernel_key;
kernel_key << kernel_func_name_ << build_options_;
auto kernel = context.cl_context()->GetKernel(kernel_key.str());
#ifndef LITE_SHUTDOWN_LOG
VLOG(4) << TargetToStr(x->target());
VLOG(4) << TargetToStr(param.output->target());
#endif
int arg_idx = 0;
cl_int status;
status = kernel.setArg(arg_idx, *x_image);
CL_CHECK_FATAL(status);
......@@ -199,8 +203,8 @@ REGISTER_LITE_KERNEL(reshape2,
PRECISION(kFP16),
DATALAYOUT(kImageDefault))})
.BindInput("ShapeTensor", {LiteType::GetTensorTy(TARGET(kOpenCL))})
.BindInput("Shape", {LiteType::GetTensorTy(TARGET(kOpenCL))})
.BindOutput("XShape", {LiteType::GetTensorTy(TARGET(kOpenCL))})
.BindInput("Shape", {LiteType::GetTensorTy(TARGET(kARM))})
.BindOutput("XShape", {LiteType::GetTensorTy(TARGET(kARM))})
.BindOutput("Out",
{LiteType::GetTensorTy(TARGET(kOpenCL),
PRECISION(kFP16),
......@@ -217,7 +221,7 @@ REGISTER_LITE_KERNEL(flatten,
{LiteType::GetTensorTy(TARGET(kOpenCL),
PRECISION(kFP16),
DATALAYOUT(kImageDefault))})
.BindInput("Shape", {LiteType::GetTensorTy(TARGET(kOpenCL))})
.BindInput("Shape", {LiteType::GetTensorTy(TARGET(kARM))})
.BindOutput("Out",
{LiteType::GetTensorTy(TARGET(kOpenCL),
PRECISION(kFP16),
......@@ -235,7 +239,7 @@ REGISTER_LITE_KERNEL(flatten2,
PRECISION(kFP16),
DATALAYOUT(kImageDefault))})
.BindInput("Shape", {LiteType::GetTensorTy(TARGET(kOpenCL))})
.BindOutput("XShape", {LiteType::GetTensorTy(TARGET(kOpenCL))})
.BindOutput("XShape", {LiteType::GetTensorTy(TARGET(kARM))})
.BindOutput("Out",
{LiteType::GetTensorTy(TARGET(kOpenCL),
PRECISION(kFP16),
......
......@@ -51,8 +51,10 @@ class ScaleComputeImage2D : public KernelLite<TARGET(kOpenCL),
// LOG(INFO) << "x_image" << x_img;
auto out_image_shape = InitImageDimInfoWith(in_dims);
LOG(INFO) << "out_image_shape = " << out_image_shape["width"] << " "
<< out_image_shape["height"];
#ifndef LITE_SHUTDOWN_LOG
VLOG(4) << "out_image_shape = " << out_image_shape["width"] << " "
<< out_image_shape["height"];
#endif
auto* out_img = param.output->mutable_data<half_t, cl::Image2D>(
out_image_shape["width"], out_image_shape["height"]);
// LOG(INFO) << "out_image" << out_img;
......
......@@ -49,7 +49,7 @@ std::shared_ptr<Node> Graph::Add(const std::string& name,
CHECK_GE(idx, 1);
node->set_data(std::make_shared<xtcl::xExpr>(layer));
// Generate a unique name for the current XTCL layer
builder_.SetLayer(name + "__" + std::to_string(idx));
builder_.SetLayer(name + "__" + paddle::lite::to_string(idx));
return node;
}
......
......@@ -155,7 +155,7 @@ TEST(ListBuilder, basic) {
for (int i = 0; i < num_elems; i++) {
auto* elem = li.New();
elem->set("elem-" + std::to_string(i));
elem->set("elem-" + paddle::lite::to_string(i));
}
li.Save();
table.SaveToFile("2.bf");
......@@ -169,7 +169,7 @@ TEST(ListBuilder, basic) {
li1.Load();
for (int i = 0; i < num_elems; i++) {
ASSERT_EQ(li1.Get(i).data(), "elem-" + std::to_string(i));
ASSERT_EQ(li1.Get(i).data(), "elem-" + paddle::lite::to_string(i));
}
}
......
......@@ -144,6 +144,8 @@ add_operator(mean_op extra SRCS mean_op.cc DEPS ${op_DEPS})
if (LITE_WITH_TRAIN)
add_operator(mean_grad_op extra SRCS mean_grad_op.cc DEPS ${op_DEPS})
add_operator(activation_grad_ops basic SRCS activation_grad_ops.cc DEPS ${op_DEPS})
add_operator(elementwise_grad_op extra SRCS elementwise_grad_ops.cc DEPS ${op_DEPS})
add_operator(mul_grad_op basic SRCS mul_grad_op.cc DEPS ${op_DEPS})
add_operator(sgd_op extra SRCS sgd_op.cc DEPS ${op_DEPS})
endif()
......
......@@ -80,6 +80,34 @@ void UpdatePaddingAndDilation(std::vector<int>* paddings,
}
}
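// SmartInferShape caches the input dims/LoD from the previous run; when the current
// input matches the cache, the stored output dims/LoD are reused and the full
// InferShape pass is skipped.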
bool ConvOpLite::SmartInferShape() {
if (!last_input_shapes.empty()) {
if (last_input_shapes[0] == param_.x->dims() &&
last_input_lods[0] == param_.x->lod()) {
param_.output->Resize(last_output_shapes[0]);
param_.output->set_lod(last_output_lods[0]);
return true;
}
}
this->InferShape();
if (!last_input_shapes.empty()) {
last_input_shapes.clear();
last_input_lods.clear();
}
last_input_shapes.push_back(param_.x->dims());
last_input_lods.push_back(param_.x->lod());
if (!last_output_shapes.empty()) {
last_output_shapes.clear();
last_output_lods.clear();
}
last_output_shapes.push_back(param_.output->dims());
last_output_lods.push_back(param_.output->lod());
return true;
}
bool ConvOpLite::InferShape() const {
const auto in_dims = param_.x->dims();
const auto filter_dims = param_.filter->dims();
......@@ -104,9 +132,9 @@ bool ConvOpLite::InferShape() const {
// Set output dims
param_.output->Resize(lite::DDim(output_shape));
// share LoD
// param_.output->set_lod(param_.x->lod());
param_.output->set_lod(param_.x->lod());
return true;
}
......
......@@ -36,6 +36,7 @@ class ConvOpLite : public OpLite {
bool CheckShape() const override;
bool InferShape() const override;
bool SmartInferShape() override;
// TODO(Superjomn) replace framework::OpDesc with a lite one.
bool AttachImpl(const cpp::OpDesc& op_desc, lite::Scope* scope) override {
......
// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "lite/operators/elementwise_grad_ops.h"
#include <algorithm>
#include <cmath>
#include "lite/core/op_registry.h"
namespace paddle {
namespace lite {
namespace operators {
bool ElementwiseGradOp::CheckShape() const {
CHECK_OR_FALSE(param_.XGrad || param_.YGrad);
CHECK_OR_FALSE(param_.OutGrad);
return true;
}
bool ElementwiseGradOp::InferShape() const {
auto x_dim = param_.X->dims();
auto y_dim = param_.Y->dims();
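// Each requested gradient takes the shape of its corresponding forward input.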
if (param_.XGrad) {
param_.XGrad->Resize(x_dim);
}
if (param_.YGrad) {
param_.YGrad->Resize(y_dim);
}
return true;
}
bool ElementwiseGradOp::AttachImpl(const cpp::OpDesc& opdesc,
lite::Scope* scope) {
auto Y_name = opdesc.Input("Y").front();
auto X_name = opdesc.Input("X").front();
auto Out_name = opdesc.Input("Out@GRAD").front();
CHECK(!opdesc.Output("X@GRAD").empty() || !opdesc.Output("Y@GRAD").empty())
<< "at least one of 'X@GRAD' and 'Y@GRAD' must not be empty";
if (!opdesc.Output("X@GRAD").empty()) {
auto x_grad_name = opdesc.Output("X@GRAD").front();
param_.XGrad = GetMutableVar<lite::Tensor>(scope, x_grad_name);
}
if (!opdesc.Output("Y@GRAD").empty()) {
auto y_grad_name = opdesc.Output("Y@GRAD").front();
param_.YGrad = GetMutableVar<lite::Tensor>(scope, y_grad_name);
}
param_.X = GetVar<lite::Tensor>(scope, X_name);
param_.Y = GetVar<lite::Tensor>(scope, Y_name);
param_.OutGrad = GetVar<lite::Tensor>(scope, Out_name);
param_.axis = opdesc.GetAttr<int>("axis");
return true;
}
} // namespace operators
} // namespace lite
} // namespace paddle
REGISTER_LITE_OP(elementwise_sub_grad,
paddle::lite::operators::ElementwiseGradOp);
REGISTER_LITE_OP(elementwise_add_grad,
paddle::lite::operators::ElementwiseGradOp);
REGISTER_LITE_OP(elementwise_mul_grad,
paddle::lite::operators::ElementwiseGradOp);
REGISTER_LITE_OP(elementwise_max_grad,
paddle::lite::operators::ElementwiseGradOp);
// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <string>
#include <vector>
#include "lite/core/op_lite.h"
namespace paddle {
namespace lite {
namespace operators {
class ElementwiseGradOp : public OpLite {
public:
explicit ElementwiseGradOp(const std::string& op_type) : OpLite(op_type) {}
bool CheckShape() const override;
bool InferShape() const override;
bool AttachImpl(const cpp::OpDesc& opdesc, lite::Scope* scope) override;
void AttachKernel(KernelBase* kernel) override { kernel->SetParam(param_); }
std::string DebugString() const override { return "elementwise_grad_op"; }
private:
mutable operators::ElementwiseGradParam param_;
};
} // namespace operators
} // namespace lite
} // namespace paddle
......@@ -26,7 +26,38 @@ bool ElementwiseOp::CheckShape() const {
CHECK_OR_FALSE(param_.Out);
return true;
}
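// Same shape-caching shortcut as ConvOpLite::SmartInferShape, but the cache is
// keyed on both X and Y dims/LoD.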
bool ElementwiseOp::SmartInferShape() {
if (!last_input_shapes.empty()) {
if (last_input_shapes[0] == param_.X->dims() &&
last_input_shapes[1] == param_.Y->dims() &&
last_input_lods[0] == param_.X->lod() &&
last_input_lods[1] == param_.Y->lod()) {
param_.Out->Resize(last_output_shapes[0]);
param_.Out->set_lod(last_output_lods[0]);
return true;
}
}
this->InferShape();
if (!last_input_shapes.empty()) {
last_input_shapes.clear();
last_input_lods.clear();
}
last_input_shapes.push_back(param_.X->dims());
last_input_lods.push_back(param_.X->lod());
last_input_shapes.push_back(param_.Y->dims());
last_input_lods.push_back(param_.Y->lod());
if (!last_output_shapes.empty()) {
last_output_shapes.clear();
last_output_lods.clear();
}
last_output_shapes.push_back(param_.Out->dims());
last_output_lods.push_back(param_.Out->lod());
return true;
}
bool ElementwiseOp::InferShape() const {
auto x_dim = param_.X->dims();
auto y_dim = param_.Y->dims();
......@@ -81,6 +112,7 @@ bool ElementwiseOp::InferShape() const {
auto out_lod = param_.Out->mutable_lod();
*out_lod = param_.X->lod();
}
return true;
}
......
......@@ -28,6 +28,7 @@ class ElementwiseOp : public OpLite {
bool CheckShape() const override;
bool InferShape() const override;
bool SmartInferShape() override;
bool AttachImpl(const cpp::OpDesc& opdesc, lite::Scope* scope) override;
......
......@@ -48,6 +48,33 @@ bool FcOpLite::CheckShape() const {
return true;
}
bool FcOpLite::SmartInferShape() {
if (!last_input_shapes.empty() && !last_output_shapes.empty()) {
if (last_input_shapes[0] == param_.input->dims() &&
last_input_lods[0] == param_.input->lod()) {
param_.output->Resize(last_output_shapes[0]);
param_.output->set_lod(last_output_lods[0]);
return true;
}
}
this->InferShape();
if (!last_input_shapes.empty()) {
last_input_shapes.clear();
last_input_lods.clear();
}
last_input_shapes.push_back(param_.input->dims());
last_input_lods.push_back(param_.input->lod());
if (!last_output_shapes.empty()) {
last_output_shapes.clear();
last_output_lods.clear();
}
last_output_shapes.push_back(param_.output->dims());
last_output_lods.push_back(param_.output->lod());
return true;
}
bool FcOpLite::InferShape() const {
const auto& input_dims = param_.input->dims();
const auto& w_dims = param_.w->dims();
......@@ -64,6 +91,7 @@ bool FcOpLite::InferShape() const {
// share LoD
param_.output->set_lod(param_.input->lod());
return true;
}
......
......@@ -36,6 +36,7 @@ class FcOpLite : public OpLite {
bool CheckShape() const override;
bool InferShape() const override;
bool SmartInferShape() override;
bool AttachImpl(const cpp::OpDesc &op_desc, lite::Scope *scope) override;
......
// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "lite/operators/mul_grad_op.h"
#include "lite/core/op_registry.h"
#include "lite/core/type_system.h"
namespace paddle {
namespace lite {
namespace operators {
bool MulGradOpLite::CheckShape() const {
CHECK_OR_FALSE(param_.x);
CHECK_OR_FALSE(param_.y);
CHECK_OR_FALSE(param_.output_grad);
CHECK_OR_FALSE(param_.x_grad || param_.y_grad);
CHECK_OR_FALSE(param_.x_num_col_dims);
CHECK_OR_FALSE(param_.y_num_col_dims);
const auto x_dims = param_.x->dims();
const auto y_dims = param_.y->dims();
const auto out_dims = param_.output_grad->dims();
CHECK_GT_OR_FALSE(x_dims.size(), static_cast<size_t>(param_.x_num_col_dims));
CHECK_GT_OR_FALSE(y_dims.size(), static_cast<size_t>(param_.y_num_col_dims));
auto x_flatten_dims = flatten_2d(x_dims, param_.x_num_col_dims);
auto y_flatten_dims = flatten_2d(y_dims, param_.y_num_col_dims);
auto out_flatten_dims = flatten_2d(out_dims, param_.x_num_col_dims);
// Out = X * Y;
CHECK_EQ_OR_FALSE(x_flatten_dims[1], y_flatten_dims[0]);
CHECK_EQ_OR_FALSE(x_flatten_dims[0], out_flatten_dims[0]);
CHECK_EQ_OR_FALSE(y_flatten_dims[1], out_flatten_dims[1]);
return true;
}
bool MulGradOpLite::InferShape() const {
const auto x_dims = param_.x->dims();
const auto y_dims = param_.y->dims();
if (param_.x_grad) {
param_.x_grad->Resize(x_dims);
param_.x_grad->set_lod(param_.x->lod());
}
if (param_.y_grad) {
param_.y_grad->Resize(y_dims);
param_.y_grad->set_lod(param_.y->lod());
}
return true;
}
bool MulGradOpLite::AttachImpl(const cpp::OpDesc &op_desc, lite::Scope *scope) {
CHECK(!op_desc.Input("X").empty());
CHECK(!op_desc.Input("Y").empty());
CHECK(!op_desc.Input("Out@GRAD").empty());
CHECK(!op_desc.Output("X@GRAD").empty() || !op_desc.Output("Y@GRAD").empty())
<< "at least one of 'X@GRAD' and 'Y@GRAD' is not empty";
auto *x_var = scope->FindVar(op_desc.Input("X").front());
CHECK(x_var);
param_.x = &x_var->Get<Tensor>();
auto *y_var = scope->FindVar(op_desc.Input("Y").front());
CHECK(y_var);
param_.y = &y_var->Get<Tensor>();
auto *out_grad_var = scope->FindVar(op_desc.Input("Out@GRAD").front());
CHECK(out_grad_var);
param_.output_grad = &out_grad_var->Get<Tensor>();
if (!op_desc.Output("X@GRAD").empty()) {
auto *x_grad_var = scope->FindVar(op_desc.Output("X@GRAD").front());
CHECK(x_grad_var);
param_.x_grad = x_grad_var->GetMutable<Tensor>();
}
if (!op_desc.Output("Y@GRAD").empty()) {
auto *y_grad_var = scope->FindVar(op_desc.Output("Y@GRAD").front());
CHECK(y_grad_var);
param_.y_grad = y_grad_var->GetMutable<Tensor>();
}
param_.x_num_col_dims = op_desc.GetAttr<int>("x_num_col_dims");
param_.y_num_col_dims = op_desc.GetAttr<int>("y_num_col_dims");
return true;
}
} // namespace operators
} // namespace lite
} // namespace paddle
REGISTER_LITE_OP(mul_grad, paddle::lite::operators::MulGradOpLite);
// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <string>
#include <vector>
#include "lite/core/kernel.h"
#include "lite/core/op_lite.h"
#include "lite/core/scope.h"
#include "lite/operators/op_params.h"
#include "lite/utils/all.h"
namespace paddle {
namespace lite {
namespace operators {
class MulGradOpLite : public OpLite {
public:
MulGradOpLite() {}
explicit MulGradOpLite(const std::string &type) : OpLite(type) {}
bool CheckShape() const override;
bool InferShape() const override;
void AttachKernel(KernelBase *kernel) override { kernel->SetParam(param_); }
bool AttachImpl(const cpp::OpDesc &op_desc, lite::Scope *scope) override;
std::string DebugString() const override { return "mul_grad"; }
private:
mutable MulGradParam param_;
};
inline std::vector<int64_t> flatten_2d(DDim dims, int num_col_dims) {
std::vector<int64_t> flatten_dims{1, 1};
for (int i = 0; i < dims.size(); i++) {
if (i < num_col_dims) {
flatten_dims[0] *= dims[i];
} else {
flatten_dims[1] *= dims[i];
}
}
return flatten_dims;
}
} // namespace operators
} // namespace lite
} // namespace paddle
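To make the flatten-2d shape rule used by MulGradOpLite::CheckShape concrete, here is a small standalone worked example; the dims mirror the {4, 20} x {5, 4, 3, 2} case exercised by the tests later in this patch, and flatten2d below is a local re-statement for illustration, not the header's function:
#include <cassert>
#include <cstdint>
#include <vector>
// Same folding as flatten_2d above: dims before num_col_dims multiply into the
// row count, the remaining dims multiply into the column count.
static std::vector<int64_t> flatten2d(const std::vector<int64_t>& dims,
                                      int num_col_dims) {
  std::vector<int64_t> out{1, 1};
  for (int i = 0; i < static_cast<int>(dims.size()); ++i) {
    out[i < num_col_dims ? 0 : 1] *= dims[i];
  }
  return out;
}
int main() {
  // X = {4, 20}, x_num_col_dims = 1      -> {4, 20}
  // Y = {5, 4, 3, 2}, y_num_col_dims = 2 -> {20, 6}
  // Out = {4, 6}, flattened with x_num_col_dims = 1 -> {4, 6}
  auto xf = flatten2d({4, 20}, 1);
  auto yf = flatten2d({5, 4, 3, 2}, 2);
  auto of = flatten2d({4, 6}, 1);
  assert(xf[1] == yf[0]);                    // inner sizes agree: 20 == 20
  assert(xf[0] == of[0] && yf[1] == of[1]);  // outer sizes match Out
  return 0;
}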
......@@ -66,28 +66,6 @@ class MulOpLite : public OpLite {
mutable MulParam param_;
};
#ifdef LITE_WITH_TRAIN
class MulGradOpLite : public OpLite {
public:
MulGradOpLite() {}
explicit MulGradOpLite(const std::string &type) : OpLite(type) {}
bool CheckShape() const override;
bool InferShape() const override;
void AttachKernel(KernelBase *kernel) override { kernel->SetParam(param_); }
bool AttachImpl(const cpp::OpDesc &op_desc, lite::Scope *scope) override;
std::string DebugString() const override { return "mul_grad"; }
private:
mutable MulGradParam param_;
};
#endif
} // namespace operators
} // namespace lite
} // namespace paddle
......@@ -387,10 +387,11 @@ struct ElementwiseParam {
};
struct ElementwiseGradParam {
const lite::Tensor* X{};
const lite::Tensor* Y{};
const lite::Tensor* Out_grad{};
lite::Tensor* X_grad{};
lite::Tensor* Y_grad{};
const lite::Tensor* OutGrad{};
lite::Tensor* XGrad{};
lite::Tensor* YGrad{};
int axis{-1}; // for broadcasting.
};
......
......@@ -29,10 +29,39 @@ bool SoftmaxOp::CheckShape() const {
return true;
}
bool SoftmaxOp::SmartInferShape() {
if (!last_input_shapes.empty() && !last_output_shapes.empty()) {
if (param_.x->dims() == last_input_shapes[0] &&
param_.x->lod() == last_input_lods[0]) {
param_.output->Resize(last_output_shapes[0]);
param_.output->set_lod(last_output_lods[0]);
return true;
}
}
this->InferShape();
if (!last_input_shapes.empty()) {
last_input_shapes.clear();
last_input_lods.clear();
}
last_input_shapes.push_back(param_.x->dims());
last_input_lods.push_back(param_.x->lod());
if (!last_output_shapes.empty()) {
last_output_shapes.clear();
last_output_lods.clear();
}
last_output_shapes.push_back(param_.output->dims());
last_output_lods.push_back(param_.output->lod());
return true;
}
bool SoftmaxOp::InferShape() const {
param_.output->Resize(param_.x->dims());
auto out_lod = param_.output->mutable_lod();
*out_lod = param_.x->lod();
return true;
}
......
......@@ -31,6 +31,7 @@ class SoftmaxOp : public OpLite {
bool CheckShape() const override;
bool InferShape() const override;
bool SmartInferShape() override;
bool AttachImpl(const cpp::OpDesc &opdesc, lite::Scope *scope) override;
......
......@@ -65,6 +65,8 @@ if(LITE_BUILD_EXTRA)
if (LITE_WITH_TRAIN)
lite_cc_test(test_kernel_mean_compute SRCS mean_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels})
lite_cc_test(test_kernel_activation_grad_compute SRCS activation_grad_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels})
lite_cc_test(test_kernel_elementwise_grad_compute SRCS elementwise_grad_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels})
lite_cc_test(test_kernel_mul_grad_compute SRCS mul_grad_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels})
lite_cc_test(test_kernel_sgd_compute SRCS sgd_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels})
endif()
......
......@@ -128,7 +128,7 @@ class ConcateComputeTester : public arena::TestCase {
for (int i = 0; i < x_dims_.production(); i++) {
x_data[i] = static_cast<float>(i + n);
}
const std::string x_name = "x_tensor_" + std::to_string(n);
const std::string x_name = "x_tensor_" + paddle::lite::to_string(n);
x_vct_.push_back(x_name);
SetCommonTensor(x_name, x_dims_, x_data.data());
}
......
// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "lite/kernels/arm/elementwise_grad_compute.h"
#include <gtest/gtest.h>
#include "lite/core/op_registry.h"
#include "lite/kernels/arm/elementwise_compute.h"
#include "lite/tests/utils/fill_data.h"
namespace paddle {
namespace lite {
namespace kernels {
namespace arm {
using param_t = operators::ElementwiseParam;
using grad_param_t = operators::ElementwiseGradParam;
using kernel_add_t = ElementwiseAddCompute;
using grad_kernel_add_t = ElementwiseAddGradCompute;
using kernel_sub_t = ElementwiseSubCompute;
using grad_kernel_sub_t = ElementwiseSubGradCompute;
void elementwise_common(grad_param_t& param, // NOLINT
std::vector<float>& out_grad, // NOLINT
std::vector<float>& x_grad, // NOLINT
std::vector<float>& y_grad, // NOLINT
std::string flag) {
auto x_dims = param.X->dims();
auto y_dims = param.Y->dims();
if (x_dims == y_dims) {
for (int i = 0; i < x_dims.production(); ++i) {
if (flag == "add") {
x_grad[i] = out_grad[i];
y_grad[i] = out_grad[i];
}
if (flag == "sub") {
x_grad[i] = out_grad[i];
y_grad[i] = -out_grad[i];
}
}
} else {
LOG(FATAL) << "unsupport dims";
}
}
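// The reference gradients computed above follow directly from the chain rule for
// the same-shape case: for out = x + y, d(out_i)/d(x_i) = d(out_i)/d(y_i) = 1, so
// both x_grad and y_grad equal out_grad; for out = x - y, d(out_i)/d(y_i) = -1, so
// y_grad = -out_grad. Broadcasting cases would additionally need a reduction over
// the broadcast axes, which is why other dim combinations hit LOG(FATAL) here.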
class ElementwiseAddGradTester {
public:
explicit ElementwiseAddGradTester(const DDim& x_dims,
const DDim& y_dims,
int axis)
: x_dims_(x_dims), y_dims_(y_dims), axis_(axis) {}
void prepare_kernel() {
std::unique_ptr<KernelContext> ctx1(new KernelContext);
ctx1->As<ARMContext>();
kernel_.SetContext(std::move(ctx1));
std::unique_ptr<KernelContext> ctx3(new KernelContext);
ctx3->As<ARMContext>();
grad_kernel_.SetContext(std::move(ctx3));
}
void run_forward(param_t* param,
kernel_add_t* kernel,
const std::vector<float>& x_vec,
const std::vector<float>& y_vec,
float* out_vec) {
Tensor x;
Tensor y;
Tensor output;
x.Resize(x_dims_);
y.Resize(y_dims_);
output.Resize(DDim(out_dims_));
auto* x_data = x.mutable_data<float>();
auto* y_data = y.mutable_data<float>();
for (int i = 0; i < x_dims_.production(); i++) {
x_data[i] = x_vec[i];
}
for (int i = 0; i < y_dims_.production(); i++) {
y_data[i] = y_vec[i];
}
param->X = &x;
param->Y = &y;
param->Out = &output;
param->axis = axis_;
kernel->SetParam(*param);
kernel->Launch();
auto* output_data = output.mutable_data<float>();
for (int i = 0; i < out_dims_.production(); i++) {
out_vec[i] = output_data[i];
}
}
void run_backward(grad_param_t* param,
grad_kernel_add_t* kernel,
const std::vector<float>& x_vec,
const std::vector<float>& y_vec,
const std::vector<float>& out_grad_vec,
float* x_grad_vec,
float* y_grad_vec) {
Tensor x;
Tensor x_grad;
Tensor y;
Tensor y_grad;
Tensor out_grad;
x.Resize(x_dims_);
x_grad.Resize(x_dims_);
y.Resize(y_dims_);
y_grad.Resize(y_dims_);
out_grad.Resize(out_dims_);
auto* x_data = x.mutable_data<float>();
auto* y_data = y.mutable_data<float>();
auto* out_grad_data = out_grad.mutable_data<float>();
for (int i = 0; i < x_dims_.production(); i++) {
x_data[i] = x_vec[i];
}
for (int i = 0; i < y_dims_.production(); i++) {
y_data[i] = y_vec[i];
}
for (int i = 0; i < out_dims_.production(); i++) {
out_grad_data[i] = out_grad_vec[i];
}
param->X = &x;
param->XGrad = &x_grad;
param->Y = &y;
param->YGrad = &y_grad;
param->OutGrad = &out_grad;
param->axis = axis_;
kernel->SetParam(*param);
kernel->Launch();
auto* x_grad_data = x_grad.mutable_data<float>();
auto* y_grad_data = y_grad.mutable_data<float>();
for (int i = 0; i < x_dims_.production(); i++) {
x_grad_vec[i] = x_grad_data[i];
}
for (int i = 0; i < y_dims_.production(); i++) {
y_grad_vec[i] = y_grad_data[i];
}
}
void check_grad(float delta2, float max_grad_delta2) {
std::vector<int64_t> out_shape;
// infer shape
auto x_dim = x_dims_;
auto y_dim = y_dims_;
if (x_dim == y_dim) {
out_dims_ = x_dim;
} else {
int max_dim = (x_dim.size() > y_dim.size() ? x_dim.size() : y_dim.size());
int axis = param_.axis;
axis =
(axis == -1 ? std::abs(static_cast<int>(x_dim.size() - y_dim.size()))
: axis);
std::vector<int64_t> x_dims_array(max_dim);
std::vector<int64_t> y_dims_array(max_dim);
std::vector<int64_t> out_dims_array(max_dim);
if (x_dim.size() > y_dim.size()) {
for (int i = 0; i < axis; ++i) {
y_dims_array[i] = 1;
}
if (axis + y_dim.size() < max_dim) {
for (int i = axis + y_dim.size(); i < max_dim; ++i) {
y_dims_array[i] = 1;
}
}
x_dims_array = x_dim.Vectorize();
for (int i = 0; i < y_dim.size(); ++i) {
y_dims_array[i + axis] = y_dim[i];
}
} else {
for (int i = 0; i < axis; ++i) {
x_dims_array[i] = 1;
}
if (axis + x_dim.size() < max_dim) {
for (int i = axis + x_dim.size(); i < max_dim; ++i) {
x_dims_array[i] = 1;
}
}
y_dims_array = y_dim.Vectorize();
for (int i = 0; i < x_dim.size(); ++i) {
x_dims_array[i + axis] = x_dim[i];
}
}
for (int i = 0; i < max_dim; i++) {
if (x_dims_array[i] == -1 || y_dims_array[i] == -1) {
out_dims_array[i] = -1;
} else {
out_dims_array[i] = std::max(x_dims_array[i], y_dims_array[i]);
}
}
out_dims_ = DDim(out_dims_array);
}
// infer end
// forward
std::vector<float> x(x_dims_.production());
std::vector<float> y(y_dims_.production());
std::vector<float> out(out_dims_.production());
fill_data_rand(x.data(), -1.f, 1.f, x_dims_.production());
fill_data_rand(y.data(), -1.f, 1.f, y_dims_.production());
this->run_forward(&param_, &kernel_, x, y, out.data());
for (int i = 0; i < x_dims_.production(); i++) {
LOG(INFO) << "x_" << i << ": " << x[i];
}
for (int i = 0; i < y_dims_.production(); i++) {
LOG(INFO) << "y_" << i << ": " << y[i];
}
for (int i = 0; i < out_dims_.production(); i++) {
LOG(INFO) << "out_" << i << ": " << out[i];
}
// backward
std::vector<float> out_grad(out_dims_.production());
std::vector<float> x_grad(x_dims_.production());
std::vector<float> y_grad(y_dims_.production());
for (int i = 0; i < out_dims_.production(); i++) {
out_grad[i] = 1.0;
}
this->run_backward(&grad_param_,
&grad_kernel_,
x,
y,
out_grad,
x_grad.data(),
y_grad.data());
for (int i = 0; i < x_grad.size(); i++) {
LOG(INFO) << "x_grad_" << i << ": " << x_grad[i];
}
for (int i = 0; i < y_grad.size(); i++) {
LOG(INFO) << "y_grad_" << i << ": " << y_grad[i];
}
// get numeric gradient
std::vector<float> x_delta(x_dims_.production());
std::vector<float> y_delta(y_dims_.production());
std::vector<float> out_delta(out_dims_.production());
Tensor tensor_x;
Tensor tensor_y;
tensor_x.Resize(x_dims_);
tensor_y.Resize(y_dims_);
grad_param_.X = &tensor_x;
grad_param_.Y = &tensor_y;
elementwise_common(grad_param_, out_grad, x_delta, y_delta, "add");
float max_grad_delta = 0.0005;
for (int i = 0; i < x_dims_.production(); i++) {
EXPECT_NEAR(x_grad[i], x_delta[i], max_grad_delta);
EXPECT_NEAR(y_grad[i], y_delta[i], max_grad_delta);
}
}
private:
DDim x_dims_;
DDim y_dims_;
DDim out_dims_;
int axis_;
kernel_add_t kernel_;
grad_kernel_add_t grad_kernel_;
param_t param_;
grad_param_t grad_param_;
};
class ElementwiseSubGradTester {
public:
explicit ElementwiseSubGradTester(const DDim& x_dims,
const DDim& y_dims,
int axis)
: x_dims_(x_dims), y_dims_(y_dims), axis_(axis) {}
void prepare_kernel() {
std::unique_ptr<KernelContext> ctx1(new KernelContext);
ctx1->As<ARMContext>();
kernel_.SetContext(std::move(ctx1));
std::unique_ptr<KernelContext> ctx3(new KernelContext);
ctx3->As<ARMContext>();
grad_kernel_.SetContext(std::move(ctx3));
}
void run_forward(param_t* param,
kernel_sub_t* kernel,
const std::vector<float>& x_vec,
const std::vector<float>& y_vec,
float* out_vec) {
Tensor x;
Tensor y;
Tensor output;
x.Resize(x_dims_);
y.Resize(y_dims_);
output.Resize(DDim(out_dims_));
auto* x_data = x.mutable_data<float>();
auto* y_data = y.mutable_data<float>();
for (int i = 0; i < x_dims_.production(); i++) {
x_data[i] = x_vec[i];
}
for (int i = 0; i < y_dims_.production(); i++) {
y_data[i] = y_vec[i];
}
param->X = &x;
param->Y = &y;
param->Out = &output;
param->axis = axis_;
kernel->SetParam(*param);
kernel->Launch();
auto* output_data = output.mutable_data<float>();
for (int i = 0; i < out_dims_.production(); i++) {
out_vec[i] = output_data[i];
}
}
void run_backward(grad_param_t* param,
grad_kernel_sub_t* kernel,
const std::vector<float>& x_vec,
const std::vector<float>& y_vec,
const std::vector<float>& out_grad_vec,
float* x_grad_vec,
float* y_grad_vec) {
Tensor x;
Tensor x_grad;
Tensor y;
Tensor y_grad;
Tensor out_grad;
x.Resize(x_dims_);
x_grad.Resize(x_dims_);
y.Resize(y_dims_);
y_grad.Resize(y_dims_);
out_grad.Resize(out_dims_);
auto* x_data = x.mutable_data<float>();
auto* y_data = y.mutable_data<float>();
auto* out_grad_data = out_grad.mutable_data<float>();
for (int i = 0; i < x_dims_.production(); i++) {
x_data[i] = x_vec[i];
}
for (int i = 0; i < y_dims_.production(); i++) {
y_data[i] = y_vec[i];
}
for (int i = 0; i < out_dims_.production(); i++) {
out_grad_data[i] = out_grad_vec[i];
}
param->X = &x;
param->XGrad = &x_grad;
param->Y = &y;
param->YGrad = &y_grad;
param->OutGrad = &out_grad;
param->axis = axis_;
kernel->SetParam(*param);
kernel->Launch();
auto* x_grad_data = x_grad.mutable_data<float>();
auto* y_grad_data = y_grad.mutable_data<float>();
for (int i = 0; i < x_dims_.production(); i++) {
x_grad_vec[i] = x_grad_data[i];
}
for (int i = 0; i < y_dims_.production(); i++) {
y_grad_vec[i] = y_grad_data[i];
}
}
void check_grad(float delta2, float max_grad_delta2) {
std::vector<int64_t> out_shape;
// infer shape
auto x_dim = x_dims_;
auto y_dim = y_dims_;
if (x_dim == y_dim) {
out_dims_ = x_dim;
} else {
int max_dim = (x_dim.size() > y_dim.size() ? x_dim.size() : y_dim.size());
int axis = param_.axis;
axis =
(axis == -1 ? std::abs(static_cast<int>(x_dim.size() - y_dim.size()))
: axis);
std::vector<int64_t> x_dims_array(max_dim);
std::vector<int64_t> y_dims_array(max_dim);
std::vector<int64_t> out_dims_array(max_dim);
if (x_dim.size() > y_dim.size()) {
for (int i = 0; i < axis; ++i) {
y_dims_array[i] = 1;
}
if (axis + y_dim.size() < max_dim) {
for (int i = axis + y_dim.size(); i < max_dim; ++i) {
y_dims_array[i] = 1;
}
}
x_dims_array = x_dim.Vectorize();
for (int i = 0; i < y_dim.size(); ++i) {
y_dims_array[i + axis] = y_dim[i];
}
} else {
for (int i = 0; i < axis; ++i) {
x_dims_array[i] = 1;
}
if (axis + x_dim.size() < max_dim) {
for (int i = axis + x_dim.size(); i < max_dim; ++i) {
x_dims_array[i] = 1;
}
}
y_dims_array = y_dim.Vectorize();
for (int i = 0; i < x_dim.size(); ++i) {
x_dims_array[i + axis] = x_dim[i];
}
}
for (int i = 0; i < max_dim; i++) {
if (x_dims_array[i] == -1 || y_dims_array[i] == -1) {
out_dims_array[i] = -1;
} else {
out_dims_array[i] = std::max(x_dims_array[i], y_dims_array[i]);
}
}
out_dims_ = DDim(out_dims_array);
}
// infer end
// forward
std::vector<float> x(x_dims_.production());
std::vector<float> y(y_dims_.production());
std::vector<float> out(out_dims_.production());
fill_data_rand(x.data(), -1.f, 1.f, x_dims_.production());
fill_data_rand(y.data(), -1.f, 1.f, y_dims_.production());
this->run_forward(&param_, &kernel_, x, y, out.data());
for (int i = 0; i < x_dims_.production(); i++) {
LOG(INFO) << "x_" << i << ": " << x[i];
}
for (int i = 0; i < y_dims_.production(); i++) {
LOG(INFO) << "y_" << i << ": " << y[i];
}
for (int i = 0; i < out_dims_.production(); i++) {
LOG(INFO) << "out_" << i << ": " << out[i];
}
// backward
std::vector<float> out_grad(out_dims_.production());
std::vector<float> x_grad(x_dims_.production());
std::vector<float> y_grad(y_dims_.production());
for (int i = 0; i < out_dims_.production(); i++) {
out_grad[i] = 1.0;
}
this->run_backward(&grad_param_,
&grad_kernel_,
x,
y,
out_grad,
x_grad.data(),
y_grad.data());
for (int i = 0; i < x_grad.size(); i++) {
LOG(INFO) << "x_grad_" << i << ": " << x_grad[i];
}
for (int i = 0; i < y_grad.size(); i++) {
LOG(INFO) << "y_grad_" << i << ": " << y_grad[i];
}
// get numeric gradient
std::vector<float> x_delta(x_dims_.production());
std::vector<float> y_delta(y_dims_.production());
std::vector<float> out_delta(out_dims_.production());
Tensor tensor_x;
Tensor tensor_y;
tensor_x.Resize(x_dims_);
tensor_y.Resize(y_dims_);
grad_param_.X = &tensor_x;
grad_param_.Y = &tensor_y;
elementwise_common(grad_param_, out_grad, x_delta, y_delta, "sub");
float max_grad_delta = 0.0005;
for (int i = 0; i < x_dims_.production(); i++) {
EXPECT_NEAR(x_grad[i], x_delta[i], max_grad_delta);
EXPECT_NEAR(y_grad[i], y_delta[i], max_grad_delta);
}
}
private:
DDim x_dims_;
DDim y_dims_;
DDim out_dims_;
int axis_;
kernel_sub_t kernel_;
grad_kernel_sub_t grad_kernel_;
param_t param_;
grad_param_t grad_param_;
};
void TestNormalCase(const std::vector<int64_t>& x_dims,
const std::vector<int64_t>& y_dims,
int axis) {
std::unique_ptr<ElementwiseAddGradTester> tester_add(
new ElementwiseAddGradTester(DDim(x_dims), DDim(y_dims), axis));
std::unique_ptr<ElementwiseSubGradTester> tester_sub(
new ElementwiseSubGradTester(DDim(x_dims), DDim(y_dims), axis));
tester_add->prepare_kernel();
tester_sub->prepare_kernel();
float delta = 0.001;
float max_grad_delta = 0.005;
tester_add->check_grad(delta, max_grad_delta);
tester_sub->check_grad(delta, max_grad_delta);
}
TEST(elementwise_grad_arm, compute) {
LOG(INFO) << "Test Elementwise grad";
DeviceInfo::Init();
TestNormalCase({3, 2}, {3, 2}, 0);
TestNormalCase({3, 5}, {3, 5}, 1);
TestNormalCase({3, 4, 3}, {3, 4, 3}, 0);
TestNormalCase({9, 2, 5}, {9, 2, 5}, 1);
}
} // namespace arm
} // namespace kernels
} // namespace lite
} // namespace paddle
USE_LITE_KERNEL(elementwise_add_grad, kARM, kFloat, kNCHW, def);
USE_LITE_KERNEL(elementwise_add, kARM, kFloat, kNCHW, def);
......@@ -52,7 +52,8 @@ class FillConstantComputeTester : public arena::TestCase {
is_use_shape_tensor_list_(is_use_shape_tensor_list) {
if (is_use_shape_tensor_list) {
for (int i = 0; i < shape.size(); i++) {
shape_tensor_list_.push_back(shape_tensor_ + std::to_string(i));
shape_tensor_list_.push_back(shape_tensor_ +
paddle::lite::to_string(i));
}
}
}
......
......@@ -109,6 +109,7 @@ void TestMul(const std::vector<int64_t>& x_dims,
int y_num_col_dims,
const Place& place,
float abs_error) {
LOG(INFO) << "run test arm";
std::unique_ptr<arena::TestCase> tester(new MulComputeTester(place,
"def",
DDim(x_dims),
......@@ -131,7 +132,6 @@ TEST(Mul, precision) {
#else
return;
#endif
TestMul({4, 5}, {5, 4}, 1, 1, place, abs_error);
TestMul({4, 5}, {5, 4, 3, 2}, 1, 1, place, abs_error);
TestMul({4, 20}, {5, 4, 3, 2}, 1, 2, place, abs_error);
......
// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "lite/kernels/arm/mul_grad_compute.h"
#include <gtest/gtest.h>
#include "lite/core/op_registry.h"
#include "lite/kernels/arm/mul_compute.h"
#include "lite/tests/utils/fill_data.h"
namespace paddle {
namespace lite {
namespace kernels {
namespace arm {
using param_t = operators::MulParam;
using grad_param_t = operators::MulGradParam;
using kernel_t = MulCompute;
using grad_kernel_t = MulGradCompute;
class MulGradTester {
public:
explicit MulGradTester(const DDim& x_dims,
const DDim& y_dims,
int x_num_col_dims,
int y_num_col_dims)
: x_dims_(x_dims),
y_dims_(y_dims),
x_num_col_dims_(x_num_col_dims),
y_num_col_dims_(y_num_col_dims) {}
void prepare_kernel() {
std::unique_ptr<KernelContext> ctx1(new KernelContext);
ctx1->As<ARMContext>();
kernel_.SetContext(std::move(ctx1));
std::unique_ptr<KernelContext> ctx2(new KernelContext);
ctx2->As<ARMContext>();
delta_kernel_.SetContext(std::move(ctx2));
std::unique_ptr<KernelContext> ctx3(new KernelContext);
ctx3->As<ARMContext>();
grad_kernel_.SetContext(std::move(ctx3));
}
void run_forward(param_t* param,
kernel_t* kernel,
const std::vector<float>& x_vec,
const std::vector<float>& y_vec,
float* out_vec) {
Tensor x;
Tensor y;
Tensor output;
x.Resize(x_dims_);
y.Resize(y_dims_);
output.Resize(DDim(out_dims_));
auto* x_data = x.mutable_data<float>();
auto* y_data = y.mutable_data<float>();
for (int i = 0; i < x_dims_.production(); i++) {
x_data[i] = x_vec[i];
}
for (int i = 0; i < y_dims_.production(); i++) {
y_data[i] = y_vec[i];
}
param->x = &x;
param->y = &y;
param->output = &output;
param->x_num_col_dims = x_num_col_dims_;
param->y_num_col_dims = y_num_col_dims_;
kernel->SetParam(*param);
kernel->Launch();
auto* output_data = output.mutable_data<float>();
for (int i = 0; i < out_dims_.production(); i++) {
out_vec[i] = output_data[i];
}
}
void run_backward(grad_param_t* param,
grad_kernel_t* kernel,
const std::vector<float>& x_vec,
const std::vector<float>& y_vec,
const std::vector<float>& out_grad_vec,
float* x_grad_vec,
float* y_grad_vec) {
Tensor x;
Tensor x_grad;
Tensor y;
Tensor y_grad;
Tensor out_grad;
x.Resize(x_dims_);
x_grad.Resize(x_dims_);
y.Resize(y_dims_);
y_grad.Resize(y_dims_);
out_grad.Resize(out_dims_);
auto* x_data = x.mutable_data<float>();
auto* y_data = y.mutable_data<float>();
auto* out_grad_data = out_grad.mutable_data<float>();
for (int i = 0; i < x_dims_.production(); i++) {
x_data[i] = x_vec[i];
}
for (int i = 0; i < y_dims_.production(); i++) {
y_data[i] = y_vec[i];
}
for (int i = 0; i < out_dims_.production(); i++) {
out_grad_data[i] = out_grad_vec[i];
}
param->x = &x;
param->x_grad = &x_grad;
param->y = &y;
param->y_grad = &y_grad;
param->output_grad = &out_grad;
param->x_num_col_dims = x_num_col_dims_;
param->y_num_col_dims = y_num_col_dims_;
kernel->SetParam(*param);
kernel->Launch();
auto* x_grad_data = x_grad.mutable_data<float>();
auto* y_grad_data = y_grad.mutable_data<float>();
for (int i = 0; i < x_dims_.production(); i++) {
x_grad_vec[i] = x_grad_data[i];
}
for (int i = 0; i < y_dims_.production(); i++) {
y_grad_vec[i] = y_grad_data[i];
}
}
void check_grad() {
std::vector<int64_t> out_shape;
for (int i = 0; i < x_num_col_dims_; i++) {
out_shape.push_back(x_dims_[i]);
}
for (int i = y_num_col_dims_; i < y_dims_.size(); i++) {
out_shape.push_back(y_dims_[i]);
}
out_dims_ = DDim(out_shape);
// forward
std::vector<float> x(x_dims_.production());
std::vector<float> y(y_dims_.production());
std::vector<float> out(out_dims_.production());
fill_data_rand(x.data(), -1.f, 1.f, x_dims_.production());
fill_data_rand(y.data(), -1.f, 1.f, y_dims_.production());
this->run_forward(&param_, &kernel_, x, y, out.data());
// backward
std::vector<float> out_grad(out_dims_.production());
std::vector<float> x_grad(x_dims_.production());
std::vector<float> y_grad(y_dims_.production());
for (int i = 0; i < out_dims_.production(); i++) {
out_grad[i] = 1.0;
}
this->run_backward(&grad_param_,
&grad_kernel_,
x,
y,
out_grad,
x_grad.data(),
y_grad.data());
// get numeric gradient
std::vector<float> x_delta(x_dims_.production());
std::vector<float> y_delta(y_dims_.production());
std::vector<float> out_delta(out_dims_.production());
float delta = 0.001;
float max_grad_delta = 0.005;
for (int i = 0; i < x_dims_.production(); i++) {
for (int j = 0; j < x_dims_.production(); j++) {
if (i == j) {
x_delta[j] = x[j] + delta;
} else {
x_delta[j] = x[j];
}
}
this->run_forward(
&delta_param_, &delta_kernel_, x_delta, y, out_delta.data());
float sum = 0;
for (int j = 0; j < out_dims_.production(); j++) {
sum += (out_delta[j] - out[j]);
}
EXPECT_NEAR(x_grad[i], sum / delta, max_grad_delta);
}
for (int i = 0; i < y_dims_.production(); i++) {
for (int j = 0; j < y_dims_.production(); j++) {
y_delta[j] = i == j ? y[j] + delta : y[j];
}
this->run_forward(
&delta_param_, &delta_kernel_, x, y_delta, out_delta.data());
float sum = 0;
for (int j = 0; j < out_dims_.production(); j++) {
sum += out_delta[j] - out[j];
}
EXPECT_NEAR(y_grad[i], sum / delta, max_grad_delta);
}
}
private:
DDim x_dims_;
DDim y_dims_;
DDim out_dims_;
int x_num_col_dims_;
int y_num_col_dims_;
kernel_t kernel_;
kernel_t delta_kernel_;
grad_kernel_t grad_kernel_;
param_t param_;
param_t delta_param_;
grad_param_t grad_param_;
};
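// A note on the finite-difference check in check_grad above: out_grad is filled
// with ones, so the scalar being differentiated is effectively L = sum_j out_j and
// the backward kernel returns dL/dx_i. Re-running the forward pass with x_i
// perturbed by `delta` gives the one-sided estimate
//   dL/dx_i ~= (sum_j out_j(x + delta * e_i) - sum_j out_j(x)) / delta,
// which is exactly the `sum / delta` value compared against x_grad[i] (and
// likewise for y_grad) with EXPECT_NEAR.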
void TestNormalCase(const std::vector<int64_t>& x_dims,
const std::vector<int64_t>& y_dims,
int x_num_col_dims,
int y_num_col_dims) {
std::unique_ptr<MulGradTester> tester(new MulGradTester(
DDim(x_dims), DDim(y_dims), x_num_col_dims, y_num_col_dims));
tester->prepare_kernel();
tester->check_grad();
}
TEST(mul_grad_arm, compute) {
LOG(INFO) << "Test Mul grad";
DeviceInfo::Init();
TestNormalCase({1, 3}, {3, 2}, 1, 1);
TestNormalCase({3, 2}, {2, 1}, 1, 1);
TestNormalCase({3, 1}, {1, 7}, 1, 1);
TestNormalCase({2, 3}, {3, 2}, 1, 1);
TestNormalCase({4, 5}, {5, 4}, 1, 1);
TestNormalCase({4, 5}, {5, 4, 3, 2}, 1, 1);
TestNormalCase({3, 4}, {2, 2, 3}, 1, 2);
TestNormalCase({4, 20}, {5, 4, 3, 2}, 1, 2);
TestNormalCase({4, 60}, {5, 4, 3, 2}, 1, 3);
TestNormalCase({2, 3, 4, 5}, {60, 4}, 1, 1);
TestNormalCase({2, 3, 4, 5}, {20, 4}, 2, 1);
TestNormalCase({2, 3, 4, 5}, {5, 4}, 3, 1);
TestNormalCase({2, 3, 4, 5}, {60, 3, 4, 5}, 1, 1);
TestNormalCase({2, 3, 4, 5}, {4, 5, 6, 2}, 2, 2);
TestNormalCase({2, 3, 4, 5}, {5, 1, 4, 2}, 3, 2);
}
} // namespace arm
} // namespace kernels
} // namespace lite
} // namespace paddle
USE_LITE_KERNEL(mul, kARM, kFloat, kNCHW, def);
USE_LITE_KERNEL(mul_grad, kARM, kFloat, kNCHW, def);
......@@ -45,7 +45,8 @@ class ReshapeComputeTester : public arena::TestCase {
: TestCase(place, alias), dims_(dims) {
if (is_shape_tensor_vct) {
for (size_t i = 0; i < shape.size(); i++) {
shape_tensor_vct_.emplace_back(op_type_ + "/shape" + std::to_string(i));
shape_tensor_vct_.emplace_back(op_type_ + "/shape" +
paddle::lite::to_string(i));
}
} else if (is_shape_tensor) {
shape_tensor_ = op_type_ + "/shape";
......
......@@ -168,8 +168,9 @@ class SliceComputeTester : public arena::TestCase {
std::vector<std::string> ends_tensor_list_;
for (int i = 0; i < starts_.size(); ++i) {
starts_tensor_list_.push_back("starts_tensor_list_" +
std::to_string(i));
ends_tensor_list_.push_back("ends_tensor_list_" + std::to_string(i));
paddle::lite::to_string(i));
ends_tensor_list_.push_back("ends_tensor_list_" +
paddle::lite::to_string(i));
}
op_desc->SetInput("StartsTensorList", {starts_tensor_list_});
op_desc->SetInput("EndsTensorList", {ends_tensor_list_});
......@@ -203,15 +204,15 @@ class SliceComputeTester : public arena::TestCase {
} else if (use_tensor_list_) {
Scope& scope_ = this->scope();
for (int i = 0; i < starts_.size(); ++i) {
auto* tensor =
scope_.NewTensor("starts_tensor_list_" + std::to_string(i));
auto* tensor = scope_.NewTensor("starts_tensor_list_" +
paddle::lite::to_string(i));
tensor->Resize(DDim({1}));
auto* d = tensor->mutable_data<int>();
d[0] = starts_[i];
}
for (int i = 0; i < ends_.size(); ++i) {
auto* tensor =
scope_.NewTensor("ends_tensor_list_" + std::to_string(i));
scope_.NewTensor("ends_tensor_list_" + paddle::lite::to_string(i));
tensor->Resize(DDim({1}));
auto* d = tensor->mutable_data<int>();
d[0] = ends_[i];
......
......@@ -123,7 +123,7 @@ class UnsqueezeComputeTester : public arena::TestCase {
} else if (input_axes_flag_ == 3) {
std::string name = "axes_tensor_";
for (size_t i = 0; i < axes_.size(); i++) {
name = name + std::to_string(i);
name = name + paddle::lite::to_string(i);
axes_tensor_list_.push_back(name);
SetCommonTensor(name, DDim({1}), &axes_[i]);
}
......
......@@ -291,6 +291,8 @@ function make_ios {
-DLITE_ON_TINY_PUBLISH=ON \
-DLITE_WITH_OPENMP=OFF \
-DWITH_ARM_DOTPROD=OFF \
-DLITE_BUILD_TAILOR=$BUILD_TAILOR \
-DLITE_OPTMODEL_DIR=$OPTMODEL_DIR \
-DLITE_WITH_LIGHT_WEIGHT_FRAMEWORK=ON \
-DARM_TARGET_ARCH_ABI=$abi \
-DLITE_BUILD_EXTRA=$BUILD_EXTRA \
......@@ -354,10 +356,12 @@ function make_x86 {
-DWITH_LITE=ON \
-DLITE_WITH_LIGHT_WEIGHT_FRAMEWORK=OFF \
-DLITE_WITH_ARM=OFF \
-DLITE_WITH_PYTHON=$BUILD_PYTHON \
-DWITH_GPU=OFF \
-DLITE_WITH_PYTHON=${BUILD_PYTHON} \
-DLITE_BUILD_EXTRA=ON \
-DLITE_WITH_XPU=$BUID_XPU \
-DXPU_SDK_ROOT=$XPU_SDK_ROOT \
-DXPU_SDK_ROOT=$XPU_SDK_ROOT
make publish_inference -j$NUM_PROC
cd -
......
......@@ -184,7 +184,7 @@ function build_opencl {
return 0
fi
build_dir=$cur_dir/build.lite.${os}.${abi}.${lang}.opencl
build_dir=$cur_dir/build.lite.${os}.${abi}.${lang}
mkdir -p $build_dir
cd $build_dir
......@@ -193,11 +193,10 @@ function build_opencl {
cmake_opencl ${os} ${abi} ${lang}
make opencl_clhpp -j$NUM_CORES_FOR_COMPILE
build $TESTS_FILE
# test publish inference lib
make publish_inference -j$NUM_CORES_FOR_COMPILE
}
# This method is only called in CI.
function cmake_x86_for_CI {
prepare_workspace # fake an empty __generated_code__.cc to pass cmake.
......@@ -387,7 +386,7 @@ function test_arm_android {
echo "test name: ${test_name}"
adb_work_dir="/data/local/tmp"
skip_list=("test_model_parser" "test_mobilenetv1" "test_mobilenetv2" "test_resnet50" "test_inceptionv4" "test_light_api" "test_apis" "test_paddle_api" "test_cxx_api" "test_gen_code" "test_mobilenetv1_int8" "test_subgraph_pass")
skip_list=("test_model_parser" "test_mobilenetv1" "test_mobilenetv2" "test_resnet50" "test_inceptionv4" "test_light_api" "test_apis" "test_paddle_api" "test_cxx_api" "test_gen_code" "test_mobilenetv1_int8" "test_subgraph_pass" "test_grid_sampler_image_opencl" "test_lrn_image_opencl" "test_pad2d_image_opencl")
for skip_name in ${skip_list[@]} ; do
[[ $skip_name =~ (^|[[:space:]])$test_name($|[[:space:]]) ]] && echo "skip $test_name" && return
done
......@@ -755,16 +754,58 @@ function arm_push_necessary_file {
adb -s ${device} push ${testpath} ${adb_work_dir}
}
function test_opencl {
os=$1
abi=$2
lang=$3
device=$4
if [[ ${os} == "armlinux" ]]; then
# TODO(hongming): enable test armlinux on armv8, armv7 and armv7hf
echo "Skip test arm linux yet. armlinux must in another docker"
return 0
fi
if [[ ${os} == "android" && ${abi} == "armv7hf" ]]; then
echo "android do not need armv7hf"
return 0
fi
# prepare for CXXApi test
local adb="adb -s ${device}"
$adb shell mkdir -p /data/local/tmp/lite_naive_model_opt
# opencl test should be marked with `opencl`
opencl_test_mark="opencl"
for _test in $(cat $TESTS_FILE); do
# tell if this test is marked with `opencl`
if [[ $_test == *$opencl_test_mark* ]]; then
test_arm_android $_test $device
fi
done
}
function build_test_arm_opencl {
########################################################################
cur=$PWD
# job 1-4 must be in one runner
prepare_adb_devices
# job 1
build_opencl "android" "armv8" "gcc"
adb -s $device_armv8 shell 'rm -rf /data/local/tmp/*'
run_gen_code_test ${device_armv8}
test_opencl "android" "armv8" "gcc" ${device_armv8}
cd $cur
# job 2
build_opencl "android" "armv7" "gcc"
adb -s $device_armv7 shell 'rm -rf /data/local/tmp/*'
run_gen_code_test ${device_armv7}
test_opencl "android" "armv7" "gcc" ${device_armv7}
cd $cur
echo "Done"
......@@ -1099,6 +1140,8 @@ function main {
;;
build_test_arm_opencl)
build_test_arm_opencl
build_test_arm_subtask_model test_mobilenetv1 mobilenet_v1
build_test_arm_subtask_model test_mobilenetv2 mobilenet_v2_relu
shift
;;
build_test_arm_subtask_android)
......
# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
......@@ -11,6 +11,7 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# this module will record kernels that are not in valid_places into all_kernel_faked.cc
from __future__ import print_function
import sys
......@@ -18,12 +19,13 @@ import logging
from ast import RegisterLiteKernelParser
from utils import *
if len(sys.argv) != 4:
if len(sys.argv) != 5:
print("Error: create_fake_kernel_registry.py requires three inputs!")
exit(1)
ops_list_path = sys.argv[1]
dest_path = sys.argv[2]
kernelmap_path = sys.argv[3]
kernels_list_path = sys.argv[1]
faked_kernels_list_path = sys.argv[2]
dest_path = sys.argv[3]
kernelmap_path = sys.argv[4]
out_lines = [
'#pragma once',
......@@ -77,68 +79,85 @@ const std::map<std::string, std::string> kernel2path_map{
'''
]
def parse_fake_kernels_from_path(list_path):
with open(list_path) as f:
paths = set([path for path in f])
for path in paths:
print('path', path)
with open(path.strip()) as g:
c = g.read()
kernel_parser = RegisterLiteKernelParser(c)
kernel_parser.parse()
for k in kernel_parser.kernels:
kernel_name = "{op_type}_{target}_{precision}_{data_layout}_{alias}_class".format(
op_type=k.op_type,
target=k.target,
precision=k.precision,
data_layout=k.data_layout,
alias=k.alias
)
kernel_define = fake_kernel % (
kernel_name,
k.target,
k.precision,
k.data_layout,
kernel_name
)
out_lines.append(kernel_define)
out_lines.append("")
key = "REGISTER_LITE_KERNEL(%s, %s, %s, %s, %s, %s)" % (
k.op_type,
k.target,
k.precision,
k.data_layout,
'::paddle::lite::' + kernel_name,
k.alias
)
out_lines.append(key)
for input in k.inputs:
io = ' .BindInput("%s", {%s})' % (input.name, input.type)
out_lines.append(io)
for output in k.outputs:
io = ' .BindOutput("%s", {%s})' % (output.name, output.type)
out_lines.append(io)
out_lines.append(" .Finalize();")
out_lines.append("")
out_lines.append(gen_use_kernel_statement(k.op_type, k.target, k.precision, k.data_layout, k.alias))
def parse_supported_kernels_from_path(list_path):
with open(list_path) as f:
paths = set([path for path in f])
for path in paths:
print('path', path)
with open(path.strip()) as g:
c = g.read()
kernel_parser = RegisterLiteKernelParser(c)
kernel_parser.parse()
for k in kernel_parser.kernels:
index = path.rindex('/')
filename = path[index + 1:]
map_element = ' {"%s,%s,%s,%s,%s", "%s"},' % (
k.op_type,
k.target,
k.precision,
k.data_layout,
k.alias,
filename.strip()
)
kernel_src_map_lines.append(map_element)
parse_fake_kernels_from_path(faked_kernels_list_path)
parse_supported_kernels_from_path(faked_kernels_list_path)
parse_supported_kernels_from_path(kernels_list_path)
with open(ops_list_path) as f:
paths = set([path for path in f])
for path in paths:
print('path', path)
with open(path.strip()) as g:
c = g.read()
kernel_parser = RegisterLiteKernelParser(c)
kernel_parser.parse()
for k in kernel_parser.kernels:
kernel_name = "{op_type}_{target}_{precision}_{data_layout}_{alias}_class".format(
op_type = k.op_type,
target = k.target,
precision = k.precision,
data_layout = k.data_layout,
alias = k.alias,
)
kernel_define = fake_kernel % (
kernel_name,
k.target,
k.precision,
k.data_layout,
kernel_name,
)
out_lines.append(kernel_define)
out_lines.append("")
key = "REGISTER_LITE_KERNEL(%s, %s, %s, %s, %s, %s)" % (
k.op_type,
k.target,
k.precision,
k.data_layout,
'::paddle::lite::' + kernel_name,
k.alias,
)
out_lines.append(key)
for input in k.inputs:
io = ' .BindInput("%s", {%s})' % (input.name, input.type)
out_lines.append(io)
for output in k.outputs:
io = ' .BindOutput("%s", {%s})' % (output.name, output.type)
out_lines.append(io)
out_lines.append(" .Finalize();")
out_lines.append("")
out_lines.append(gen_use_kernel_statement(k.op_type, k.target, k.precision, k.data_layout, k.alias))
index = path.rindex('/')
filename = path[index + 1:]
map_element = ' {"%s,%s,%s,%s,%s", "%s"},' % (
k.op_type,
k.target,
k.precision,
k.data_layout,
k.alias,
filename.strip()
)
kernel_src_map_lines.append(map_element)
with open(dest_path, 'w') as f:
logging.info("write kernel list to %s" % dest_path)
f.write('\n'.join(out_lines))
......
# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
......@@ -11,6 +11,7 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# this module will record supported ops from kernels_src.txt
from __future__ import print_function
import sys
......@@ -18,12 +19,13 @@ import logging
from ast import RegisterLiteKernelParser
from ast import RegisterLiteOpParser
if len(sys.argv) != 4:
print("Error: record_supported_kernel_op.py requires three inputs!")
exit(1)
if len(sys.argv) != 5:
print("Error: record_supported_kernel_op.py requires four inputs!")
sys.exit(1)
kernels_list_path = sys.argv[1]
ops_list_path = sys.argv[2]
kernel_op_map_dest_path = sys.argv[3]
faked_kernels_list_path = sys.argv[2]
ops_list_path = sys.argv[3]
kernel_op_map_dest_path = sys.argv[4]
out_lines = [
......@@ -51,11 +53,11 @@ const std::vector<std::vector<std::string>> supported_ops_target = {
'''
]
ops_lines=[]
ops_lines = []
# valid targets and valid_ops
valid_targets = ["kUnk", "kHost", "kX86", "kCUDA", "kARM", "kOpenCL", "kAny", "kFPGA", "kNPU", "kXPU"]
valid_ops = [[],[],[],[],[],[],[],[],[],[]]
valid_ops = [[], [], [], [], [], [], [], [], [], []]
class TargetType:
kUnk = 0
kHost = 1
......@@ -78,8 +80,21 @@ with open(kernels_list_path) as f:
kernel_parser.parse()
for k in kernel_parser.kernels:
if hasattr(TargetType, k.target):
index=getattr(TargetType, k.target)
index = getattr(TargetType, k.target)
valid_ops[index].append(k.op_type)
# record op_info of valid kernels into `valid_ops` according to different target type
with open(faked_kernels_list_path) as f:
paths = set([path for path in f])
for path in paths:
with open(path.strip()) as g:
c = g.read()
kernel_parser = RegisterLiteKernelParser(c)
kernel_parser.parse()
for k in kernel_parser.kernels:
if hasattr(TargetType, k.target):
index = getattr(TargetType, k.target)
valid_ops[index].append(k.op_type)
# clear the repeated ops
for target in valid_targets:
......@@ -114,7 +129,7 @@ with open(kernel_op_map_dest_path, 'w') as f:
f.write('\n'.join(out_lines))
# write kernels into head file
for target in valid_targets:
if len(valid_ops[getattr(TargetType, target)]) == 0 :
if len(valid_ops[getattr(TargetType, target)]) == 0:
f.write("\n // %s_OPS: " %target)
f.write('\n {},')
else:
......
if(LITE_WITH_CV AND (NOT LITE_WITH_FPGA) AND LITE_WITH_ARM)
lite_cc_library(paddle_cv_arm SRCS
image_convert.cc
bgr_rotate.cc
paddle_image_preprocess.cc
image2tensor.cc
image_flip.cc
......
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
// ncnn license
// Tencent is pleased to support the open source community by making ncnn
// available.
//
// Copyright (C) 2018 THL A29 Limited, a Tencent company. All rights reserved.
//
// Licensed under the BSD 3-Clause License (the "License"); you may not use this
// file except
// in compliance with the License. You may obtain a copy of the License at
//
// https://opensource.org/licenses/BSD-3-Clause
//
// Unless required by applicable law or agreed to in writing, software
// distributed
// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
// CONDITIONS OF ANY KIND, either express or implied. See the License for the
// specific language governing permissions and limitations under the License.
#include "lite/utils/cv/bgr_rotate.h"
#include <arm_neon.h>
#include <math.h>
#include <algorithm>
namespace paddle {
namespace lite {
namespace utils {
namespace cv {
void rotate90_hwc(const uint8_t* src, uint8_t* dst, int w_in, int h_in);
void rotate270_hwc(const uint8_t* src, uint8_t* dst, int w_in, int h_in);
void rotate180_hwc(const uint8_t* src, uint8_t* dst, int w_in, int h_in);
void bgr_rotate_hwc(
const uint8_t* src, uint8_t* dst, int w_in, int h_in, int angle) {
if (angle == 90) {
rotate90_hwc(src, dst, w_in, h_in);
}
if (angle == 270) {
rotate270_hwc(src, dst, w_in, h_in);
}
if (angle == 180) {
rotate180_hwc(src, dst, w_in, h_in);
}
}
/*
bgr1 bgr2 bgr3
bgr4 bgr5 bgr6
bgr7 bgr8 bgr9
rotate:
bgr7 bgr4 bgr1
bgr8 bgr5 bgr2
bgr9 bgr6 bgr3
*/
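// A scalar reference for the 90-degree case (illustrative sketch only; the NEON
// kernels below perform the same mapping in 8x8 blocks). Each input pixel (i, j)
// is written to output row j, column (h_in - 1 - i), matching the diagram above
// and the per-pixel tail loops of rotate90_hwc.
inline void rotate90_hwc_reference(const uint8_t* src,
                                   uint8_t* dst,
                                   int w_in,
                                   int h_in) {
  for (int i = 0; i < h_in; ++i) {
    for (int j = 0; j < w_in; ++j) {
      const uint8_t* s = src + (i * w_in + j) * 3;
      uint8_t* d = dst + (j * h_in + (h_in - 1 - i)) * 3;
      d[0] = s[0];
      d[1] = s[1];
      d[2] = s[2];
    }
  }
}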
#ifdef __aarch64__
void rotate90_hwc(const uint8_t* src, uint8_t* dst, int w_in, int h_in) {
int w_out = h_in;
int h_out = w_in;
int win = w_in * 3;
int wout = w_out * 3;
int64_t stride_h = 4 * win;
int64_t stride_h_w = 4 * win - 24;
int ww = w_out - 8;
// block 8*8. -- 8*8
int i = 0;
for (i = 0; i < h_in - 7; i += 8) {
const uint8_t* inptr0 = src + i * win;
const uint8_t* inptr1 = inptr0 + win;
const uint8_t* inptr2 = inptr1 + win;
const uint8_t* inptr3 = inptr2 + win;
asm volatile(
"prfm pldl1keep, [%[ptr0]] \n"
"prfm pldl1keep, [%[ptr0], #64] \n"
"prfm pldl1keep, [%[ptr1]] \n"
"prfm pldl1keep, [%[ptr1], #64] \n"
"prfm pldl1keep, [%[ptr2]] \n"
"prfm pldl1keep, [%[ptr2], #64] \n"
"prfm pldl1keep, [%[ptr3]] \n"
"prfm pldl1keep, [%[ptr3], #64] \n"
:
: [ptr0] "r"(inptr0),
[ptr1] "r"(inptr1),
[ptr2] "r"(inptr2),
[ptr3] "r"(inptr3)
: "memory");
int j = 0;
for (; j < w_in - 7; j += 8) {
uint8_t* outptr0 = dst + j * wout + (ww - i) * 3;
uint8_t* outptr1 = outptr0 + wout;
uint8_t* outptr2 = outptr1 + wout;
uint8_t* outptr3 = outptr2 + wout;
uint8_t* outptr4 = outptr3 + wout;
uint8_t* outptr5 = outptr4 + wout;
uint8_t* outptr6 = outptr5 + wout;
uint8_t* outptr7 = outptr6 + wout;
asm volatile(
"ld3 {v0.8b, v1.8b, v2.8b}, [%[inptr0]] \n" // v0={00,01,02, 03,
// 04, 05, 06, 07}"
"ld3 {v3.8b, v4.8b, v5.8b}, [%[inptr1]] \n" // v0={10,11,12, 13,
// 14, 15, 16, 17}"
"ld3 {v6.8b, v7.8b, v8.8b}, [%[inptr2]] \n" // v0={20,21,22, 23,
// 24, 25, 26, 27}"
"ld3 {v9.8b, v10.8b, v11.8b}, [%[inptr3]] \n" // v0={30,31,32,
// 33, 34, 35, 36,
// 37}"
"add %[inptr0], %[inptr0], %[stride_h] \n" // 4 + 4*w_in
"add %[inptr1], %[inptr1], %[stride_h] \n" // 5
"add %[inptr2], %[inptr2], %[stride_h] \n" // 6
"add %[inptr3], %[inptr3], %[stride_h] \n" // 7
// b
"trn1 v12.8b, v0.8b, v3.8b \n" // v4={00 10 02 12 04 14
// 06 16 }
"trn1 v15.8b, v6.8b, v9.8b \n" // v4={20 30 22 32 24 34
// 26 36 }
"trn2 v18.8b, v0.8b, v3.8b \n" // v5={01 11 03 13 05 15
// 07 17 }
"trn2 v21.8b, v6.8b, v9.8b \n" // v7={21 31 23 33 25 35
// 27 37 }
// g
"trn1 v13.8b, v1.8b, v4.8b \n" // v4={00 10 02 12 04 14
// 06 16 }
"trn1 v16.8b, v7.8b, v10.8b \n" // v4={20 30 22 32 24 34
// 26 36 }
"trn2 v19.8b, v1.8b, v4.8b \n" // v5={01 11 03 13 05 15
// 07 17 }
"trn2 v22.8b, v7.8b, v10.8b \n" // v7={21 31 23 33 25 35
// 27 37 }
// r
"trn1 v14.8b, v2.8b, v5.8b \n" // v4={00 10 02 12 04 14
// 06 16 }
"trn1 v17.8b, v8.8b, v11.8b \n" // v4={20 30 22 32 24 34
// 26 36 }
"trn2 v20.8b, v2.8b, v5.8b \n" // v5={01 11 03 13 05 15
// 07 17 }
"trn2 v23.8b, v8.8b, v11.8b \n" // v7={21 31 23 33 25 35
// 27 37 }
// b1
"trn1 v24.4h, v12.4h, v15.4h \n" // v0={00 10 20 30 04 14
// 24 34}
"trn1 v27.4h, v18.4h, v21.4h \n" // v2={01 11 21 31 05 15
// 25 35}
"trn2 v0.4h, v12.4h, v15.4h \n" // v1={02 12 22 32 06 16
// 26 36}
"trn2 v3.4h, v18.4h, v21.4h \n" // v3={03 13 23 33 07 17
// 27 37}
// g1
"trn1 v25.4h, v13.4h, v16.4h \n" // v0={00 10 20 30 04 14
// 24 34}
"trn1 v28.4h, v19.4h, v22.4h \n" // v2={01 11 21 31 05 15
// 25 35}
"trn2 v1.4h, v13.4h, v16.4h \n" // v1={02 12 22 32 06 16
// 26 36}
"trn2 v4.4h, v19.4h, v22.4h \n" // v3={03 13 23 33 07 17
// 27 37}
// r1
"trn1 v26.4h, v14.4h, v17.4h \n" // v0={00 10 20 30 04 14
// 24 34}
"trn1 v29.4h, v20.4h, v23.4h \n" // v2={01 11 21 31 05 15
// 25 35}
"trn2 v2.4h, v14.4h, v17.4h \n" // v1={02 12 22 32 06 16
// 26 36}
"trn2 v5.4h, v20.4h, v23.4h \n" // v3={03 13 23 33 07 17
// 27 37}
"ld3 {v12.8b, v13.8b, v14.8b}, [%[inptr0]] \n" // v0={00,01,02,
// 03, 04, 05, 06,
// 07}"
"ld3 {v15.8b, v16.8b, v17.8b}, [%[inptr1]] \n" // v0={10,11,12,
// 13, 14, 15, 16,
// 17}"
"ld3 {v6.8b, v7.8b, v8.8b}, [%[inptr2]] \n" // v0={20,21,22, 23,
// 24, 25, 26, 27}"
"ld3 {v9.8b, v10.8b, v11.8b}, [%[inptr3]] \n" // v0={30,31,32,
// 33, 34, 35, 36,
// 37}"
"sub %[inptr0], %[inptr0], %[stride_h_w] \n" // 4 - 4*w_in + 8
"sub %[inptr1], %[inptr1], %[stride_h_w] \n" // 5
"sub %[inptr2], %[inptr2], %[stride_h_w] \n" // 6
"sub %[inptr3], %[inptr3], %[stride_h_w] \n" // 7
// b2
"trn1 v18.8b, v12.8b, v15.8b \n" // v4={00 10 02 12 04 14
// 06 16 }
"trn1 v21.8b, v6.8b, v9.8b \n" // v4={20 30 22 32 24 34
// 26 36 }
// g2
"trn1 v19.8b, v13.8b, v16.8b \n" // v4={00 10 02 12 04 14
// 06 16 }
"trn1 v22.8b, v7.8b, v10.8b \n" // v4={20 30 22 32 24 34
// 26 36 }
// r2
"trn1 v20.8b, v14.8b, v17.8b \n" // v4={00 10 02 12 04 14
// 06 16 }
"trn1 v23.8b, v8.8b, v11.8b \n" // v4={20 30 22 32 24 34
// 26 36 }
"trn2 v12.8b, v12.8b, v15.8b \n" // v5={01 11 03 13 05 15
// 07 17 }
"trn2 v13.8b, v13.8b, v16.8b \n" // v5={01 11 03 13 05 15
// 07 17 }
"trn2 v14.8b, v14.8b, v17.8b \n" // v5={01 11 03 13 05 15
// 07 17 }
"trn2 v15.8b, v6.8b, v9.8b \n" // v7={21 31 23 33 25 35
// 27 37 }
"trn2 v16.8b, v7.8b, v10.8b \n" // v7={21 31 23 33 25 35
// 27 37 }
"trn2 v17.8b, v8.8b, v11.8b \n" // v7={21 31 23 33 25 35
// 27 37 }
// b2
"trn1 v6.4h, v18.4h, v21.4h \n" // v0={00 10 20 30 04 14
// 24 34}
// g2
"trn1 v7.4h, v19.4h, v22.4h \n" // v0={00 10 20 30 04 14
// 24 34}
// r2
"trn1 v8.4h, v20.4h, v23.4h \n" // v0={00 10 20 30 04 14
// 24 34}
// bgr
"trn1 v9.4h, v12.4h, v15.4h \n" // v2={01 11 21 31 05 15
// 25 35}
"trn1 v10.4h, v13.4h, v16.4h \n" // v2={01 11 21 31 05 15
// 25 35}
"trn1 v11.4h, v14.4h, v17.4h \n" // v2={01 11 21 31 05 15
// 25 35}
// bgr
"trn2 v18.4h, v18.4h, v21.4h \n" // v1={02 12 22 32 06 16
// 26 36}
"trn2 v19.4h, v19.4h, v22.4h \n" // v1={02 12 22 32 06 16
// 26 36}
"trn2 v20.4h, v20.4h, v23.4h \n" // v1={02 12 22 32 06 16
// 26 36}
// bgr
"trn2 v21.4h, v12.4h, v15.4h \n" // v3={03 13 23 33 07 17
// 27 37}
"trn2 v22.4h, v13.4h, v16.4h \n" // v3={03 13 23 33 07 17
// 27 37}
"trn2 v23.4h, v14.4h, v17.4h \n" // v3={03 13 23 33 07 17
// 27 37}
// b1 b2
"trn1 v12.2s, v24.2s, v6.2s \n" // v8={00 10 20 30 40 50
// 60 70} b
"trn1 v13.2s, v25.2s, v7.2s \n" // v6={00 10 20 30 40 50
// 60 70} g
"trn1 v14.2s, v26.2s, v8.2s \n" // v6={00 10 20 30 40 50
// 60 70} r
// b1 b2
"trn2 v15.2s, v24.2s, v6.2s \n" // v8={04 14 24 34 44 54
// 64 74} b
"trn2 v16.2s, v25.2s, v7.2s \n" // v6={04 14 24 34 44 54
// 64 74} g
"trn2 v17.2s, v26.2s, v8.2s \n" // v6={04 14 24 34 44 54
// 64 74} r
// b1 b2
"trn1 v6.2s, v27.2s, v9.2s \n" // v8={01 11 20 30 40 50
// 60 70} b
"trn1 v7.2s, v28.2s, v10.2s \n" // v6={01 10 20 30 40 50
// 60 70} g
"trn1 v8.2s, v29.2s, v11.2s \n" // v6={01 10 20 30 40 50
// 60 70} r
"rev64 v12.8b, v12.8b \n" //@ reverse 07 06 05 04 03
// 02 01 00 b
"rev64 v13.8b, v13.8b \n" //@ reverse 07 06 05 04 03
// 02 01 00 g
"rev64 v14.8b, v14.8b \n" //@ reverse 07 06 05 04 03
// 02 01 00 r
"rev64 v15.8b, v15.8b \n" //@ reverse 07 06 05 04 03
// 02 01 00 b
"rev64 v16.8b, v16.8b \n" //@ reverse 07 06 05 04 03
// 02 01 00 g
"rev64 v17.8b, v17.8b \n" //@ reverse 07 06 05 04 03
// 02 01 00 r
// b1 b2
"trn2 v24.2s, v27.2s, v9.2s \n" // v8={05 10 20 30 40 50
// 60 70} b
"trn2 v25.2s, v28.2s, v10.2s \n" // v6={05 10 20 30 40 50
// 60 70} g
"trn2 v26.2s, v29.2s, v11.2s \n" // v6={05 10 20 30 40 50
// 60 70} r
// "st3 {v12.8b, v13.8b, v14.8b}, [%[outptr0]], #24 \n"
// //00 10 20 30 04 14 24 34
// "st3 {v15.8b, v16.8b, v17.8b}, [%[outptr4]], #24 \n"
// //02 12 22 32
"st3 {v12.8b, v13.8b, v14.8b}, [%[outptr0]], #24 \n" // 00 10 20 30 04 14 24 34
"st3 {v15.8b, v16.8b, v17.8b}, [%[outptr4]], #24 \n" // 02 12 22 32
// b1 b2
"trn1 v9.2s, v0.2s, v18.2s \n" // v8={02 11 20 30 40 50
// 60 70} b
"trn1 v10.2s, v1.2s, v19.2s \n" // v6={02 10 20 30 40 50
// 60 70} g
"trn1 v11.2s, v2.2s, v20.2s \n" // v6={02 10 20 30 40 50
// 60 70} r
"trn2 v27.2s, v0.2s, v18.2s \n" // v8={06 11 20 30 40 50
// 60 70} b
"trn2 v28.2s, v1.2s, v19.2s \n" // v6={06 10 20 30 40 50
// 60 70} g
"trn2 v29.2s, v2.2s, v20.2s \n" // v6={06 10 20 30 40 50
// 60 70} r
// b1 b2
"trn1 v0.2s, v3.2s, v21.2s \n" // v8={03 11 20 30 40 50
// 60 70} b
"trn1 v1.2s, v4.2s, v22.2s \n" // v6={03 10 20 30 40 50
// 60 70} g
"trn1 v2.2s, v5.2s, v23.2s \n" // v6={03 10 20 30 40 50
// 60 70} r
"trn2 v18.2s, v3.2s, v21.2s \n" // v8={07 11 20 30 40 50
// 60 70} b
"trn2 v19.2s, v4.2s, v22.2s \n" // v6={07 10 20 30 40 50
// 60 70} g
"trn2 v20.2s, v5.2s, v23.2s \n" // v6={07 10 20 30 40 50
// 60 70} r
"rev64 v6.8b, v6.8b \n" //@ reverse 07 06 05 04 03 02
// 01 00 b
"rev64 v7.8b, v7.8b \n" //@ reverse 07 06 05 04 03 02
// 01 00 g
"rev64 v8.8b, v8.8b \n" //@ reverse 07 06 05 04 03 02
// 01 00 r
"rev64 v24.8b, v24.8b \n" //@ reverse 07 06 05 04 03
// 02 01 00 b
"rev64 v25.8b, v25.8b \n" //@ reverse 07 06 05 04 03
// 02 01 00 g
"rev64 v26.8b, v26.8b \n" //@ reverse 07 06 05 04 03
// 02 01 00 r
"rev64 v9.8b, v9.8b \n" //@ reverse 07 06 05 04 03 02
// 01 00 b
"rev64 v10.8b, v10.8b \n" //@ reverse 07 06 05 04 03
// 02 01 00 g
"rev64 v11.8b, v11.8b \n" //@ reverse 07 06 05 04 03
// 02 01 00 r
"rev64 v27.8b, v27.8b \n" //@ reverse 07 06 05 04 03
// 02 01 00 b
"rev64 v28.8b, v28.8b \n" //@ reverse 07 06 05 04 03
// 02 01 00 g
"rev64 v29.8b, v29.8b \n" //@ reverse 07 06 05 04 03
// 02 01 00 r
"rev64 v0.8b, v0.8b \n" //@ reverse 07 06 05 04 03 02
// 01 00 b
"rev64 v1.8b, v1.8b \n" //@ reverse 07 06 05 04 03 02
// 01 00 g
"rev64 v2.8b, v2.8b \n" //@ reverse 07 06 05 04 03 02
// 01 00 r
"rev64 v18.8b, v18.8b \n" //@ reverse 07 06 05 04 03
// 02 01 00 b
"rev64 v19.8b, v19.8b \n" //@ reverse 07 06 05 04 03
// 02 01 00 g
"rev64 v20.8b, v20.8b \n" //@ reverse 07 06 05 04 03
// 02 01 00 r
"st3 {v6.8b, v7.8b, v8.8b}, [%[outptr1]], #24 \n" // 00
// 10
// 20
// 30
// 04
// 14
// 24
// 34
"st3 {v24.8b, v25.8b, v26.8b}, [%[outptr5]], #24 \n" // 02 12 22 32
"st3 {v9.8b, v10.8b, v11.8b}, [%[outptr2]], #24 \n" // 00
// 10
// 20
// 30
// 04
// 14
// 24
// 34
"st3 {v27.8b, v28.8b, v29.8b}, [%[outptr6]], #24 \n" // 02 12 22 32
"st3 {v0.8b, v1.8b, v2.8b}, [%[outptr3]], #24 \n" // 00
// 10
// 20
// 30
// 04
// 14
// 24
// 34
"st3 {v18.8b, v19.8b, v20.8b}, [%[outptr7]], #24 \n" // 02 12 22 32
: [inptr0] "+r"(inptr0),
[inptr1] "+r"(inptr1),
[inptr2] "+r"(inptr2),
[inptr3] "+r"(inptr3),
[outptr0] "+r"(outptr0),
[outptr1] "+r"(outptr1),
[outptr2] "+r"(outptr2),
[outptr3] "+r"(outptr3),
[outptr4] "+r"(outptr4),
[outptr5] "+r"(outptr5),
[outptr6] "+r"(outptr6),
[outptr7] "+r"(outptr7),
[stride_h] "+r"(stride_h),
[stride_h_w] "+r"(stride_h_w)
:
: "v0",
"v1",
"v2",
"v3",
"v4",
"v5",
"v6",
"v7",
"v8",
"v9",
"v10",
"v11",
"v12",
"v13",
"v14",
"v15",
"v16",
"v17",
"v18",
"v19",
"v20",
"v21",
"v22",
"v23",
"v24",
"v25",
"v26",
"v27",
"v28",
"v29",
"v30");
}
const uint8_t* inptr4 = inptr3 + win;
const uint8_t* inptr5 = inptr4 + win;
const uint8_t* inptr6 = inptr5 + win;
const uint8_t* inptr7 = inptr6 + win;
for (; j < w_in; j++) {
int tmpx = (ww - i) * 3;
uint8_t* outptr = dst + j * wout + tmpx;
*outptr++ = *inptr7++;
*outptr++ = *inptr7++;
*outptr++ = *inptr7++;
*outptr++ = *inptr6++;
*outptr++ = *inptr6++;
*outptr++ = *inptr6++;
*outptr++ = *inptr5++;
*outptr++ = *inptr5++;
*outptr++ = *inptr5++;
*outptr++ = *inptr4++;
*outptr++ = *inptr4++;
*outptr++ = *inptr4++;
*outptr++ = *inptr3++;
*outptr++ = *inptr3++;
*outptr++ = *inptr3++;
*outptr++ = *inptr2++;
*outptr++ = *inptr2++;
*outptr++ = *inptr2++;
*outptr++ = *inptr1++;
*outptr++ = *inptr1++;
*outptr++ = *inptr1++;
*outptr++ = *inptr0++;
*outptr++ = *inptr0++;
*outptr++ = *inptr0++;
}
}
for (; i < h_in; i++) {
const uint8_t* inptr0 = src + i * win;
for (int j = 0; j < w_in; j++) {
uint8_t* outptr0 = dst + j * wout + (w_out - 1 - i) * 3;
*outptr0++ = *inptr0++;
*outptr0++ = *inptr0++;
*outptr0++ = *inptr0++;
}
}
}
#else
void rotate90_hwc(const uint8_t* src, uint8_t* dst, int w_in, int h_in) {
int w_out = h_in;
int h_out = w_in;
int win = w_in * 3;
int wout = w_out * 3;
int hremain = h_in % 8;
int stride_h = 4 * win;
int stride_h_w = 4 * win - 24;
int ww = w_out - 8;
// process the image in 8x8 blocks
int i = 0;
for (i = 0; i < h_in - 7; i += 8) {
const uint8_t* inptr0 = src + i * win;
const uint8_t* inptr1 = inptr0 + win;
const uint8_t* inptr2 = inptr1 + win;
const uint8_t* inptr3 = inptr2 + win;
const uint8_t* inptr4 = inptr3 + win;
const uint8_t* inptr5 = inptr4 + win;
const uint8_t* inptr6 = inptr5 + win;
const uint8_t* inptr7 = inptr6 + win;
asm volatile(
"pld [%[ptr0]] @ preload a, 64byte\n"
"pld [%[ptr0], #64] @ preload a, 64byte\n"
"pld [%[ptr1]] @ preload a, 64byte\n"
"pld [%[ptr1], #64] @ preload a, 64byte\n"
"pld [%[ptr2]] @ preload a, 64byte\n"
"pld [%[ptr2], #64] @ preload a, 64byte\n"
"pld [%[ptr3]] @ preload a, 64byte\n"
"pld [%[ptr3], #64] @ preload a, 64byte\n"
"pld [%[ptr4]] @ preload a, 64byte\n"
"pld [%[ptr4], #64] @ preload a, 64byte\n"
"pld [%[ptr5]] @ preload a, 64byte\n"
"pld [%[ptr5], #64] @ preload a, 64byte\n"
"pld [%[ptr6]] @ preload a, 64byte\n"
"pld [%[ptr6], #64] @ preload a, 64byte\n"
"pld [%[ptr7]] @ preload a, 64byte\n"
"pld [%[ptr7], #64] @ preload a, 64byte\n"
:
: [ptr0] "r"(inptr0),
[ptr1] "r"(inptr1),
[ptr2] "r"(inptr2),
[ptr3] "r"(inptr3),
[ptr4] "r"(inptr4),
[ptr5] "r"(inptr5),
[ptr6] "r"(inptr6),
[ptr7] "r"(inptr7)
: "memory");
int j = 0;
for (; j < w_in; j++) {
int tmpx = (ww - i) * 3;
uint8_t* outptr = dst + j * wout + tmpx;
*outptr++ = *inptr7++;
*outptr++ = *inptr7++;
*outptr++ = *inptr7++;
*outptr++ = *inptr6++;
*outptr++ = *inptr6++;
*outptr++ = *inptr6++;
*outptr++ = *inptr5++;
*outptr++ = *inptr5++;
*outptr++ = *inptr5++;
*outptr++ = *inptr4++;
*outptr++ = *inptr4++;
*outptr++ = *inptr4++;
*outptr++ = *inptr3++;
*outptr++ = *inptr3++;
*outptr++ = *inptr3++;
*outptr++ = *inptr2++;
*outptr++ = *inptr2++;
*outptr++ = *inptr2++;
*outptr++ = *inptr1++;
*outptr++ = *inptr1++;
*outptr++ = *inptr1++;
*outptr++ = *inptr0++;
*outptr++ = *inptr0++;
*outptr++ = *inptr0++;
}
}
ww = w_out - 1;
for (; i < h_in; i++) {
const uint8_t* inptr0 = src + i * win;
for (int j = 0; j < w_in; j++) {
uint8_t* outptr0 = dst + j * wout + (ww - i) * 3;
*outptr0++ = *inptr0++;
*outptr0++ = *inptr0++;
*outptr0++ = *inptr0++;
}
}
}
#endif
/*
bgr1 bgr2 bgr3
bgr4 bgr5 bgr6
bgr7 bgr8 bgr9
rotate:
bgr3 bgr6 bgr9
bgr2 bgr5 bgr8
bgr1 bgr4 bgr7
*/
// dst = (h_out - 1) * w_out
// Similar to rotate90: write the results out in reverse order; equivalently, do rotate90 first and then flip along the Y axis.
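// For clarity, a minimal scalar sketch of the mapping the NEON code below
// implements (this helper is an illustrative addition and is not called by
// the optimized paths): dst(h_out - 1 - j, i) = src(i, j), with w_out = h_in
// and h_out = w_in for a 270-degree rotation of an HWC BGR image.
static inline void rotate270_hwc_reference(const uint8_t* src,
                                           uint8_t* dst,
                                           int w_in,
                                           int h_in) {
  int w_out = h_in;
  int h_out = w_in;
  for (int i = 0; i < h_in; ++i) {
    for (int j = 0; j < w_in; ++j) {
      const uint8_t* sp = src + (i * w_in + j) * 3;
      uint8_t* dp = dst + ((h_out - 1 - j) * w_out + i) * 3;
      dp[0] = sp[0];  // b
      dp[1] = sp[1];  // g
      dp[2] = sp[2];  // r
    }
  }
}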
#ifdef __aarch64__
void rotate270_hwc(const uint8_t* src, uint8_t* dst, int w_in, int h_in) {
int w_out = h_in;
int h_out = w_in;
int win = w_in * 3;
int wout = w_out * 3;
int64_t stride_h = 4 * win;
int64_t stride_h_w = 4 * win - 24;
int hout = h_out - 1;
// process the image in 8x8 blocks
int i = 0;
for (; i < h_in - 7; i += 8) {
const uint8_t* inptr0 = src + i * win;
const uint8_t* inptr1 = inptr0 + win;
const uint8_t* inptr2 = inptr1 + win;
const uint8_t* inptr3 = inptr2 + win;
asm volatile(
"prfm pldl1keep, [%[ptr0]] \n"
"prfm pldl1keep, [%[ptr0], #64] \n"
"prfm pldl1keep, [%[ptr1]] \n"
"prfm pldl1keep, [%[ptr1], #64] \n"
"prfm pldl1keep, [%[ptr2]] \n"
"prfm pldl1keep, [%[ptr2], #64] \n"
"prfm pldl1keep, [%[ptr3]] \n"
"prfm pldl1keep, [%[ptr3], #64] \n"
:
: [ptr0] "r"(inptr0),
[ptr1] "r"(inptr1),
[ptr2] "r"(inptr2),
[ptr3] "r"(inptr3)
: "memory");
int j = 0;
for (; j < w_in - 7; j += 8) {
uint8_t* outptr0 = dst + (hout - j) * wout + i * 3;
uint8_t* outptr1 = outptr0 - wout;
uint8_t* outptr2 = outptr1 - wout;
uint8_t* outptr3 = outptr2 - wout;
uint8_t* outptr4 = outptr3 - wout;
uint8_t* outptr5 = outptr4 - wout;
uint8_t* outptr6 = outptr5 - wout;
uint8_t* outptr7 = outptr6 - wout;
asm volatile(
"ld3 {v0.8b, v1.8b, v2.8b}, [%[inptr0]] \n" // v0={00,01,02, 03,
// 04, 05, 06, 07}"
"ld3 {v3.8b, v4.8b, v5.8b}, [%[inptr1]] \n" // v0={10,11,12, 13,
// 14, 15, 16, 17}"
"ld3 {v6.8b, v7.8b, v8.8b}, [%[inptr2]] \n" // v0={20,21,22, 23,
// 24, 25, 26, 27}"
"ld3 {v9.8b, v10.8b, v11.8b}, [%[inptr3]] \n" // v0={30,31,32,
// 33, 34, 35, 36,
// 37}"
"add %[inptr0], %[inptr0], %[stride_h] \n" // 4 + 4*w_in
"add %[inptr1], %[inptr1], %[stride_h] \n" // 5
"add %[inptr2], %[inptr2], %[stride_h] \n" // 6
"add %[inptr3], %[inptr3], %[stride_h] \n" // 7
// b
"trn1 v12.8b, v0.8b, v3.8b \n" // v4={00 10 02 12 04 14
// 06 16 }
"trn1 v15.8b, v6.8b, v9.8b \n" // v4={20 30 22 32 24 34
// 26 36 }
"trn2 v18.8b, v0.8b, v3.8b \n" // v5={01 11 03 13 05 15
// 07 17 }
"trn2 v21.8b, v6.8b, v9.8b \n" // v7={21 31 23 33 25 35
// 27 37 }
// g
"trn1 v13.8b, v1.8b, v4.8b \n" // v4={00 10 02 12 04 14
// 06 16 }
"trn1 v16.8b, v7.8b, v10.8b \n" // v4={20 30 22 32 24 34
// 26 36 }
"trn2 v19.8b, v1.8b, v4.8b \n" // v5={01 11 03 13 05 15
// 07 17 }
"trn2 v22.8b, v7.8b, v10.8b \n" // v7={21 31 23 33 25 35
// 27 37 }
// r
"trn1 v14.8b, v2.8b, v5.8b \n" // v4={00 10 02 12 04 14
// 06 16 }
"trn1 v17.8b, v8.8b, v11.8b \n" // v4={20 30 22 32 24 34
// 26 36 }
"trn2 v20.8b, v2.8b, v5.8b \n" // v5={01 11 03 13 05 15
// 07 17 }
"trn2 v23.8b, v8.8b, v11.8b \n" // v7={21 31 23 33 25 35
// 27 37 }
// b1
"trn1 v24.4h, v12.4h, v15.4h \n" // v0={00 10 20 30 04 14
// 24 34}
"trn1 v27.4h, v18.4h, v21.4h \n" // v2={01 11 21 31 05 15
// 25 35}
"trn2 v0.4h, v12.4h, v15.4h \n" // v1={02 12 22 32 06 16
// 26 36}
"trn2 v3.4h, v18.4h, v21.4h \n" // v3={03 13 23 33 07 17
// 27 37}
// g1
"trn1 v25.4h, v13.4h, v16.4h \n" // v0={00 10 20 30 04 14
// 24 34}
"trn1 v28.4h, v19.4h, v22.4h \n" // v2={01 11 21 31 05 15
// 25 35}
"trn2 v1.4h, v13.4h, v16.4h \n" // v1={02 12 22 32 06 16
// 26 36}
"trn2 v4.4h, v19.4h, v22.4h \n" // v3={03 13 23 33 07 17
// 27 37}
// r1
"trn1 v26.4h, v14.4h, v17.4h \n" // v0={00 10 20 30 04 14
// 24 34}
"trn1 v29.4h, v20.4h, v23.4h \n" // v2={01 11 21 31 05 15
// 25 35}
"trn2 v2.4h, v14.4h, v17.4h \n" // v1={02 12 22 32 06 16
// 26 36}
"trn2 v5.4h, v20.4h, v23.4h \n" // v3={03 13 23 33 07 17
// 27 37}
"ld3 {v12.8b, v13.8b, v14.8b}, [%[inptr0]] \n" // v0={00,01,02,
// 03, 04, 05, 06,
// 07}"
"ld3 {v15.8b, v16.8b, v17.8b}, [%[inptr1]] \n" // v0={10,11,12,
// 13, 14, 15, 16,
// 17}"
"ld3 {v6.8b, v7.8b, v8.8b}, [%[inptr2]] \n" // v0={20,21,22, 23,
// 24, 25, 26, 27}"
"ld3 {v9.8b, v10.8b, v11.8b}, [%[inptr3]] \n" // v0={30,31,32,
// 33, 34, 35, 36,
// 37}"
"sub %[inptr0], %[inptr0], %[stride_h_w] \n" // 4 - 4*w_in + 8
"sub %[inptr1], %[inptr1], %[stride_h_w] \n" // 5
"sub %[inptr2], %[inptr2], %[stride_h_w] \n" // 6
"sub %[inptr3], %[inptr3], %[stride_h_w] \n" // 7
// b2
"trn1 v18.8b, v12.8b, v15.8b \n" // v4={00 10 02 12 04 14
// 06 16 }
"trn1 v21.8b, v6.8b, v9.8b \n" // v4={20 30 22 32 24 34
// 26 36 }
// g2
"trn1 v19.8b, v13.8b, v16.8b \n" // v4={00 10 02 12 04 14
// 06 16 }
"trn1 v22.8b, v7.8b, v10.8b \n" // v4={20 30 22 32 24 34
// 26 36 }
// r2
"trn1 v20.8b, v14.8b, v17.8b \n" // v4={00 10 02 12 04 14
// 06 16 }
"trn1 v23.8b, v8.8b, v11.8b \n" // v4={20 30 22 32 24 34
// 26 36 }
"trn2 v12.8b, v12.8b, v15.8b \n" // v5={01 11 03 13 05 15
// 07 17 }
"trn2 v13.8b, v13.8b, v16.8b \n" // v5={01 11 03 13 05 15
// 07 17 }
"trn2 v14.8b, v14.8b, v17.8b \n" // v5={01 11 03 13 05 15
// 07 17 }
"trn2 v15.8b, v6.8b, v9.8b \n" // v7={21 31 23 33 25 35
// 27 37 }
"trn2 v16.8b, v7.8b, v10.8b \n" // v7={21 31 23 33 25 35
// 27 37 }
"trn2 v17.8b, v8.8b, v11.8b \n" // v7={21 31 23 33 25 35
// 27 37 }
// b2
"trn1 v6.4h, v18.4h, v21.4h \n" // v0={00 10 20 30 04 14
// 24 34}
// g2
"trn1 v7.4h, v19.4h, v22.4h \n" // v0={00 10 20 30 04 14
// 24 34}
// r2
"trn1 v8.4h, v20.4h, v23.4h \n" // v0={00 10 20 30 04 14
// 24 34}
// bgr
"trn1 v9.4h, v12.4h, v15.4h \n" // v2={01 11 21 31 05 15
// 25 35}
"trn1 v10.4h, v13.4h, v16.4h \n" // v2={01 11 21 31 05 15
// 25 35}
"trn1 v11.4h, v14.4h, v17.4h \n" // v2={01 11 21 31 05 15
// 25 35}
// bgr
"trn2 v18.4h, v18.4h, v21.4h \n" // v1={02 12 22 32 06 16
// 26 36}
"trn2 v19.4h, v19.4h, v22.4h \n" // v1={02 12 22 32 06 16
// 26 36}
"trn2 v20.4h, v20.4h, v23.4h \n" // v1={02 12 22 32 06 16
// 26 36}
// bgr
"trn2 v21.4h, v12.4h, v15.4h \n" // v3={03 13 23 33 07 17
// 27 37}
"trn2 v22.4h, v13.4h, v16.4h \n" // v3={03 13 23 33 07 17
// 27 37}
"trn2 v23.4h, v14.4h, v17.4h \n" // v3={03 13 23 33 07 17
// 27 37}
// b1 b2
"trn1 v12.2s, v24.2s, v6.2s \n" // v8={00 10 20 30 40 50
// 60 70} b
"trn1 v13.2s, v25.2s, v7.2s \n" // v6={00 10 20 30 40 50
// 60 70} g
"trn1 v14.2s, v26.2s, v8.2s \n" // v6={00 10 20 30 40 50
// 60 70} r
// b1 b2
"trn2 v15.2s, v24.2s, v6.2s \n" // v8={04 14 24 34 44 54
// 64 74} b
"trn2 v16.2s, v25.2s, v7.2s \n" // v6={04 14 24 34 44 54
// 64 74} g
"trn2 v17.2s, v26.2s, v8.2s \n" // v6={04 14 24 34 44 54
// 64 74} r
// b1 b2
"trn1 v6.2s, v27.2s, v9.2s \n" // v8={01 11 20 30 40 50
// 60 70} b
"trn1 v7.2s, v28.2s, v10.2s \n" // v6={01 10 20 30 40 50
// 60 70} g
"trn1 v8.2s, v29.2s, v11.2s \n" // v6={01 10 20 30 40 50
// 60 70} r
// b1 b2
"trn2 v24.2s, v27.2s, v9.2s \n" // v8={05 10 20 30 40 50
// 60 70} b
"trn2 v25.2s, v28.2s, v10.2s \n" // v6={05 10 20 30 40 50
// 60 70} g
"trn2 v26.2s, v29.2s, v11.2s \n" // v6={05 10 20 30 40 50
// 60 70} r
"st3 {v12.8b, v13.8b, v14.8b}, [%[outptr0]], #24 \n" // 00 10 20 30 04 14 24 34
"st3 {v15.8b, v16.8b, v17.8b}, [%[outptr4]], #24 \n" // 02 12 22 32
// b1 b2
"trn1 v9.2s, v0.2s, v18.2s \n" // v8={02 11 20 30 40 50
// 60 70} b
"trn1 v10.2s, v1.2s, v19.2s \n" // v6={02 10 20 30 40 50
// 60 70} g
"trn1 v11.2s, v2.2s, v20.2s \n" // v6={02 10 20 30 40 50
// 60 70} r
"trn2 v27.2s, v0.2s, v18.2s \n" // v8={06 11 20 30 40 50
// 60 70} b
"trn2 v28.2s, v1.2s, v19.2s \n" // v6={06 10 20 30 40 50
// 60 70} g
"trn2 v29.2s, v2.2s, v20.2s \n" // v6={06 10 20 30 40 50
// 60 70} r
// b1 b2
"trn1 v0.2s, v3.2s, v21.2s \n" // v8={03 11 20 30 40 50
// 60 70} b
"trn1 v1.2s, v4.2s, v22.2s \n" // v6={03 10 20 30 40 50
// 60 70} g
"trn1 v2.2s, v5.2s, v23.2s \n" // v6={03 10 20 30 40 50
// 60 70} r
"trn2 v18.2s, v3.2s, v21.2s \n" // v8={07 11 20 30 40 50
// 60 70} b
"trn2 v19.2s, v4.2s, v22.2s \n" // v6={07 10 20 30 40 50
// 60 70} g
"trn2 v20.2s, v5.2s, v23.2s \n" // v6={07 10 20 30 40 50
// 60 70} r
"st3 {v6.8b, v7.8b, v8.8b}, [%[outptr1]], #24 \n" // 00
// 10
// 20
// 30
// 04
// 14
// 24
// 34
"st3 {v24.8b, v25.8b, v26.8b}, [%[outptr5]], #24 \n" // 02 12 22 32
"st3 {v9.8b, v10.8b, v11.8b}, [%[outptr2]], #24 \n" // 00
// 10
// 20
// 30
// 04
// 14
// 24
// 34
"st3 {v27.8b, v28.8b, v29.8b}, [%[outptr6]], #24 \n" // 02 12 22 32
"st3 {v0.8b, v1.8b, v2.8b}, [%[outptr3]], #24 \n" // 00
// 10
// 20
// 30
// 04
// 14
// 24
// 34
"st3 {v18.8b, v19.8b, v20.8b}, [%[outptr7]], #24 \n" // 02 12 22 32
: [inptr0] "+r"(inptr0),
[inptr1] "+r"(inptr1),
[inptr2] "+r"(inptr2),
[inptr3] "+r"(inptr3),
[outptr0] "+r"(outptr0),
[outptr1] "+r"(outptr1),
[outptr2] "+r"(outptr2),
[outptr3] "+r"(outptr3),
[outptr4] "+r"(outptr4),
[outptr5] "+r"(outptr5),
[outptr6] "+r"(outptr6),
[outptr7] "+r"(outptr7),
[stride_h] "+r"(stride_h),
[stride_h_w] "+r"(stride_h_w)
:
: "v0",
"v1",
"v2",
"v3",
"v4",
"v5",
"v6",
"v7",
"v8",
"v9",
"v10",
"v11",
"v12",
"v13",
"v14",
"v15",
"v16",
"v17",
"v18",
"v19",
"v20",
"v21",
"v22",
"v23",
"v24",
"v25",
"v26",
"v27",
"v28",
"v29");
}
const uint8_t* inptr4 = inptr3 + win;
const uint8_t* inptr5 = inptr4 + win;
const uint8_t* inptr6 = inptr5 + win;
const uint8_t* inptr7 = inptr6 + win;
for (; j < w_in; j++) {
int tmpx = i * 3;
uint8_t* outptr = dst + (hout - j) * wout + tmpx;
*outptr++ = *inptr0++;
*outptr++ = *inptr0++;
*outptr++ = *inptr0++;
*outptr++ = *inptr1++;
*outptr++ = *inptr1++;
*outptr++ = *inptr1++;
*outptr++ = *inptr2++;
*outptr++ = *inptr2++;
*outptr++ = *inptr2++;
*outptr++ = *inptr3++;
*outptr++ = *inptr3++;
*outptr++ = *inptr3++;
*outptr++ = *inptr4++;
*outptr++ = *inptr4++;
*outptr++ = *inptr4++;
*outptr++ = *inptr5++;
*outptr++ = *inptr5++;
*outptr++ = *inptr5++;
*outptr++ = *inptr6++;
*outptr++ = *inptr6++;
*outptr++ = *inptr6++;
*outptr++ = *inptr7++;
*outptr++ = *inptr7++;
*outptr++ = *inptr7++;
}
}
for (; i < h_in; i++) {
const uint8_t* inptr0 = src + i * win;
for (int j = 0; j < w_in; j++) {
uint8_t* outptr0 = dst + (hout - j) * wout + i * 3;
*outptr0++ = *inptr0++;
*outptr0++ = *inptr0++;
*outptr0++ = *inptr0++;
}
}
}
#else
void rotate270_hwc(const uint8_t* src, uint8_t* dst, int w_in, int h_in) {
int w_out = h_in;
int h_out = w_in;
int win = w_in * 3;
int wout = w_out * 3;
int hremain = h_in % 8;
int stride_h = 4 * win;
int stride_h_w = 4 * win - 24;
int hout = h_out - 1;
// process the image in 8x8 blocks
int i = 0;
for (; i < h_in - 7; i += 8) {
const uint8_t* inptr0 = src + i * win;
const uint8_t* inptr1 = inptr0 + win;
const uint8_t* inptr2 = inptr1 + win;
const uint8_t* inptr3 = inptr2 + win;
const uint8_t* inptr4 = inptr3 + win;
const uint8_t* inptr5 = inptr4 + win;
const uint8_t* inptr6 = inptr5 + win;
const uint8_t* inptr7 = inptr6 + win;
asm volatile(
"pld [%[ptr0]] @ preload a, 64byte\n"
"pld [%[ptr0], #64] @ preload a, 64byte\n"
"pld [%[ptr1]] @ preload a, 64byte\n"
"pld [%[ptr1], #64] @ preload a, 64byte\n"
"pld [%[ptr2]] @ preload a, 64byte\n"
"pld [%[ptr2], #64] @ preload a, 64byte\n"
"pld [%[ptr3]] @ preload a, 64byte\n"
"pld [%[ptr3], #64] @ preload a, 64byte\n"
"pld [%[ptr4]] @ preload a, 64byte\n"
"pld [%[ptr4], #64] @ preload a, 64byte\n"
"pld [%[ptr5]] @ preload a, 64byte\n"
"pld [%[ptr5], #64] @ preload a, 64byte\n"
"pld [%[ptr6]] @ preload a, 64byte\n"
"pld [%[ptr6], #64] @ preload a, 64byte\n"
"pld [%[ptr7]] @ preload a, 64byte\n"
"pld [%[ptr7], #64] @ preload a, 64byte\n"
:
: [ptr0] "r"(inptr0),
[ptr1] "r"(inptr1),
[ptr2] "r"(inptr2),
[ptr3] "r"(inptr3),
[ptr4] "r"(inptr4),
[ptr5] "r"(inptr5),
[ptr6] "r"(inptr6),
[ptr7] "r"(inptr7)
: "memory");
int j = 0;
for (; j < w_in; j++) {
int tmpx = i * 3;
uint8_t* outptr = dst + (hout - j) * wout + tmpx;
*outptr++ = *inptr0++;
*outptr++ = *inptr0++;
*outptr++ = *inptr0++;
*outptr++ = *inptr1++;
*outptr++ = *inptr1++;
*outptr++ = *inptr1++;
*outptr++ = *inptr2++;
*outptr++ = *inptr2++;
*outptr++ = *inptr2++;
*outptr++ = *inptr3++;
*outptr++ = *inptr3++;
*outptr++ = *inptr3++;
*outptr++ = *inptr4++;
*outptr++ = *inptr4++;
*outptr++ = *inptr4++;
*outptr++ = *inptr5++;
*outptr++ = *inptr5++;
*outptr++ = *inptr5++;
*outptr++ = *inptr6++;
*outptr++ = *inptr6++;
*outptr++ = *inptr6++;
*outptr++ = *inptr7++;
*outptr++ = *inptr7++;
*outptr++ = *inptr7++;
}
}
for (; i < h_in; i++) {
const uint8_t* inptr0 = src + i * win;
for (int j = 0; j < w_in; j++) {
uint8_t* outptr0 = dst + (hout - j) * wout + i * 3;
*outptr0++ = *inptr0++;
*outptr0++ = *inptr0++;
*outptr0++ = *inptr0++;
}
}
}
#endif
/*
bgr1 bgr2 bgr3
bgr4 bgr5 bgr6
bgr7 bgr8 bgr9
rotate:
bgr9 bgr8 bgr7
bgr6 bgr5 bgr4
bgr3 bgr2 bgr1
*/
// flip both x and y (equivalent to a 180-degree rotation)
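// A minimal scalar sketch of the 180-degree mapping implemented below
// (illustrative addition only, not called by the NEON paths):
// dst(h_in - 1 - i, w - 1 - j) = src(i, j) for an HWC BGR image of width w.
static inline void rotate180_hwc_reference(const uint8_t* src,
                                           uint8_t* dst,
                                           int w,
                                           int h_in) {
  for (int i = 0; i < h_in; ++i) {
    for (int j = 0; j < w; ++j) {
      const uint8_t* sp = src + (i * w + j) * 3;
      uint8_t* dp = dst + ((h_in - 1 - i) * w + (w - 1 - j)) * 3;
      dp[0] = sp[0];  // b
      dp[1] = sp[1];  // g
      dp[2] = sp[2];  // r
    }
  }
}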
#ifdef __aarch64__
void rotate180_hwc(const uint8_t* src, uint8_t* dst, int w, int h_in) {
int w_in = w * 3;
uint8_t* zerobuff = new uint8_t[w_in];  // heap row buffer so the delete[] at the end of this function is valid
memset(zerobuff, 0, w_in * sizeof(uint8_t));
int64_t stride_w = 24;
for (int i = 0; i < h_in; i += 4) {
const uint8_t* inptr0 = src + i * w_in;
const uint8_t* inptr1 = inptr0 + w_in;
const uint8_t* inptr2 = inptr1 + w_in;
const uint8_t* inptr3 = inptr2 + w_in;
uint8_t* outptr0 = dst + (h_in - i) * w_in - stride_w; // last col
uint8_t* outptr1 = outptr0 - w_in;
uint8_t* outptr2 = outptr1 - w_in;
uint8_t* outptr3 = outptr2 - w_in;
asm volatile(
"prfm pldl1keep, [%[ptr0]] \n"
"prfm pldl1keep, [%[ptr1]] \n"
"prfm pldl1keep, [%[ptr2]] \n"
"prfm pldl1keep, [%[ptr3]] \n"
:
: [ptr0] "r"(inptr0),
[ptr1] "r"(inptr1),
[ptr2] "r"(inptr2),
[ptr3] "r"(inptr3)
: "memory");
int j = 0;
for (; j < w - 7; j += 8) {
if (i + 3 >= h_in) {
switch ((i + 3) - h_in) {
case 3:
inptr0 = zerobuff;
outptr0 = zerobuff;
case 2:
inptr1 = zerobuff;
outptr1 = zerobuff;
case 1:
inptr2 = zerobuff;
outptr2 = zerobuff;
case 0:
inptr3 = zerobuff;
outptr3 = zerobuff;
default:
break;
}
}
asm volatile(
"ld3 {v0.8b, v1.8b, v2.8b}, [%[inptr0]], #24 \n" // v0={00,01,02,
// 03, 04, 05,
// 06, 07}"
"ld3 {v3.8b, v4.8b, v5.8b}, [%[inptr1]], #24 \n" // v0={10,11,12,
// 13, 14, 15,
// 16, 17}"
"ld3 {v6.8b, v7.8b, v8.8b}, [%[inptr2]], #24 \n" // v0={20,21,22,
// 23, 24, 25,
// 26, 27}"
"ld3 {v9.8b, v10.8b, v11.8b}, [%[inptr3]], #24 \n" // v0={30,31,32,
// 33, 34, 35,
// 36, 37}"
"rev64 v12.8b, v0.8b \n" //@ reverse 07 06 05 04 03
// 02 01 00 b
"rev64 v13.8b, v1.8b \n" //@ reverse 07 06 05 04 03
// 02 01 00 g
"rev64 v14.8b, v2.8b \n" //@ reverse 07 06 05 04 03
// 02 01 00 r
"rev64 v15.8b, v3.8b \n" //@ reverse 07 06 05 04 03
// 02 01 00
"rev64 v16.8b, v4.8b \n" //@ reverse 07 06 05 04 03
// 02 01 00
"rev64 v17.8b, v5.8b \n" //@ reverse 07 06 05 04 03
// 02 01 00
"rev64 v18.8b, v6.8b \n" //@ reverse 07 06 05 04 03
// 02 01 00
"rev64 v19.8b, v7.8b \n" //@ reverse 07 06 05 04 03
// 02 01 00
"rev64 v20.8b, v8.8b \n" //@ reverse 07 06 05 04 03
// 02 01 00
"rev64 v21.8b, v9.8b \n" //@ reverse 07 06 05 04 03
// 02 01 00
"rev64 v22.8b, v10.8b \n" //@ reverse 07 06 05 04 03
// 02 01 00
"rev64 v23.8b, v11.8b \n" //@ reverse 07 06 05 04 03
// 02 01 00
"prfm pldl1keep, [%[inptr0]] \n"
"prfm pldl1keep, [%[inptr1]] \n"
"prfm pldl1keep, [%[inptr2]] \n"
"prfm pldl1keep, [%[inptr3]] \n"
"st3 {v12.8b, v13.8b, v14.8b}, [%[outptr0]] \n" // 00 10
// 20 30
// 04 14
// 24 34
"st3 {v15.8b, v16.8b, v17.8b}, [%[outptr1]] \n" // 02 12
// 22 32
"st3 {v18.8b, v19.8b, v20.8b}, [%[outptr2]] \n" // 01 11
// 21 31
"st3 {v21.8b, v22.8b, v23.8b}, [%[outptr3]] \n" // 03 13
// 23 33
"sub %[outptr0], %[outptr0], %[stride_w] \n" //@ ptr - stride_w
"sub %[outptr1], %[outptr1], %[stride_w] \n"
"sub %[outptr2], %[outptr2], %[stride_w] \n"
"sub %[outptr3], %[outptr3], %[stride_w] \n"
: [inptr0] "+r"(inptr0),
[inptr1] "+r"(inptr1),
[inptr2] "+r"(inptr2),
[inptr3] "+r"(inptr3),
[outptr0] "+r"(outptr0),
[outptr1] "+r"(outptr1),
[outptr2] "+r"(outptr2),
[outptr3] "+r"(outptr3),
[stride_w] "+r"(stride_w)
:
: "v0",
"v1",
"v2",
"v3",
"v4",
"v5",
"v6",
"v7",
"v8",
"v9",
"v10",
"v11",
"v12",
"v13",
"v14",
"v15",
"v16",
"v17",
"v18",
"v19",
"v20",
"v21",
"v22",
"v23");
}
outptr3 += stride_w - 3;
outptr2 += stride_w - 3;
outptr1 += stride_w - 3;
outptr0 += stride_w - 3;
for (; j < w; j++) {
if (i + 3 >= h_in) {
switch ((i + 3) - h_in) {
case 0:
*outptr2++ = *inptr2++;
*outptr2++ = *inptr2++;
*outptr2++ = *inptr2++;
outptr2 -= 6;
case 1:
*outptr1++ = *inptr1++;
*outptr1++ = *inptr1++;
*outptr1++ = *inptr1++;
outptr1 -= 6;
case 2:
*outptr0++ = *inptr0++;
*outptr0++ = *inptr0++;
*outptr0++ = *inptr0++;
outptr0 -= 6;
case 3:
// inptr3 = zerobuff;
default:
break;
}
} else {
*outptr3++ = *inptr3++;
*outptr3++ = *inptr3++;
*outptr3++ = *inptr3++;
outptr3 -= 6;
*outptr2++ = *inptr2++;
*outptr2++ = *inptr2++;
*outptr2++ = *inptr2++;
outptr2 -= 6;
*outptr1++ = *inptr1++;
*outptr1++ = *inptr1++;
*outptr1++ = *inptr1++;
outptr1 -= 6;
*outptr0++ = *inptr0++;
*outptr0++ = *inptr0++;
*outptr0++ = *inptr0++;
outptr0 -= 6;
}
}
}
delete[] zerobuff;
}
#else
void rotate180_hwc(const uint8_t* src, uint8_t* dst, int w, int h_in) {
int w_in = w * 3;
uint8_t* zerobuff = new uint8_t[w_in];  // heap row buffer so the delete[] at the end of this function is valid
memset(zerobuff, 0, w_in * sizeof(uint8_t));
int stride_w = 24;
// 4*8
for (int i = 0; i < h_in; i += 4) {
const uint8_t* inptr0 = src + i * w_in;
const uint8_t* inptr1 = inptr0 + w_in;
const uint8_t* inptr2 = inptr1 + w_in;
const uint8_t* inptr3 = inptr2 + w_in;
uint8_t* outptr0 = dst + (h_in - i) * w_in - stride_w;  // last col
uint8_t* outptr1 = outptr0 - w_in;
uint8_t* outptr2 = outptr1 - w_in;
uint8_t* outptr3 = outptr2 - w_in;
asm volatile(
"pld [%[ptr0]] @ preload a, 64byte\n"
"pld [%[ptr1]] @ preload a, 64byte\n"
"pld [%[ptr2]] @ preload a, 64byte\n"
"pld [%[ptr3]] @ preload a, 64byte\n"
:
: [ptr0] "r"(inptr0),
[ptr1] "r"(inptr1),
[ptr2] "r"(inptr2),
[ptr3] "r"(inptr3)
: "memory");
int j = 0;
for (; j < w - 7; j += 8) {
if (i + 3 >= h_in) {
switch ((i + 3) - h_in) {
case 3:
inptr0 = zerobuff;
outptr0 = zerobuff;
case 2:
inptr1 = zerobuff;
outptr1 = zerobuff;
case 1:
inptr2 = zerobuff;
outptr2 = zerobuff;
case 0:
inptr3 = zerobuff;
outptr3 = zerobuff;
default:
break;
}
}
asm volatile(
"vld3.8 {d0, d1, d2}, [%[inptr0]]! @ zip load r0, d0 =00 01 02 03 "
"04 05 06 07\n"
"vld3.8 {d3, d4, d5}, [%[inptr1]]! @ zip load r1, d2 =10 11 12 13 "
"14 15 16 17\n"
"vld3.8 {d6, d7, d8}, [%[inptr2]]! @ zip load r1, d4 =20 21 22 23 "
"24 25 26 27\n"
"vld3.8 {d9, d10, d11}, [%[inptr3]]! @ zip load r1, d6 = 30 31 32 "
"33 34 35 36 37\n"
"vrev64.8 d12, d0 @ reverse 07 06 05 04 03 02 01 00 \n"
"vrev64.8 d13, d1 @ reverse 07 06 05 04 03 02 01 00 \n"
"vrev64.8 d14, d2 @ reverse 07 06 05 04 03 02 01 00 \n"
"vrev64.8 d15, d3 @ reverse 07 06 05 04 03 02 01 00 \n"
"vrev64.8 d16, d4 @ reverse 07 06 05 04 03 02 01 00 \n"
"vrev64.8 d17, d5 @ reverse 07 06 05 04 03 02 01 00 \n"
"vrev64.8 d18, d6 @ reverse 07 06 05 04 03 02 01 00 \n"
"vrev64.8 d19, d7 @ reverse 07 06 05 04 03 02 01 00 \n"
"vrev64.8 d20, d8 @ reverse 07 06 05 04 03 02 01 00 \n"
"vrev64.8 d21, d9 @ reverse 07 06 05 04 03 02 01 00 \n"
"vrev64.8 d22, d10 @ reverse 07 06 05 04 03 02 01 00 "
"\n"
"vrev64.8 d23, d11 @ reverse 07 06 05 04 03 02 01 00 "
"\n"
"pld [%[inptr0]] @ preload a, 64byte\n"
"pld [%[inptr1]] @ preload a, 64byte\n"
"pld [%[inptr2]] @ preload a, 64byte\n"
"pld [%[inptr3]] @ preload a, 64byte\n"
"vst3.8 {d12, d13, d14}, [%[outptr0]] @ write "
"d0(q0,low),r00,r10 20 30\n"
"vst3.8 {d15, d16, d17}, [%[outptr1]] @ write "
"d4(q0,low),r01,r11 21 31\n"
"vst3.8 {d18, d19, d20}, [%[outptr2]] @ write "
"d4(q0,low),r01,r11 21 31\n"
"vst3.8 {d21, d22, d23}, [%[outptr3]] @ write "
"d4(q0,low),r01,r11 21 31\n"
"sub %[outptr0], %[stride_w] @ ptr - stride_w \n"
"sub %[outptr1], %[stride_w] @ ptr - stride_w \n"
"sub %[outptr2], %[stride_w] @ ptr - stride_w \n"
"sub %[outptr3], %[stride_w] @ ptr - stride_w \n"
: [inptr0] "+r"(inptr0),
[inptr1] "+r"(inptr1),
[inptr2] "+r"(inptr2),
[inptr3] "+r"(inptr3),
[outptr0] "+r"(outptr0),
[outptr1] "+r"(outptr1),
[outptr2] "+r"(outptr2),
[outptr3] "+r"(outptr3),
[stride_w] "+r"(stride_w)
:
: "q0",
"q1",
"q2",
"q3",
"q4",
"q5",
"q6",
"q7",
"q8",
"q9",
"q10",
"q11",
"q12");
}
outptr3 += stride_w - 3;
outptr2 += stride_w - 3;
outptr1 += stride_w - 3;
outptr0 += stride_w - 3;
for (; j < w; j++) {
if (i + 3 >= h_in) {
switch ((i + 3) - h_in) {
case 0:
*outptr2++ = *inptr2++;
*outptr2++ = *inptr2++;
*outptr2++ = *inptr2++;
outptr2 -= 6;
case 1:
*outptr1++ = *inptr1++;
*outptr1++ = *inptr1++;
*outptr1++ = *inptr1++;
outptr1 -= 6;
case 2:
*outptr0++ = *inptr0++;
*outptr0++ = *inptr0++;
*outptr0++ = *inptr0++;
outptr0 -= 6;
case 3:
// inptr3 = zerobuff;
default:
break;
}
} else {
*outptr3++ = *inptr3++;
*outptr3++ = *inptr3++;
*outptr3++ = *inptr3++;
outptr3 -= 6;
*outptr2++ = *inptr2++;
*outptr2++ = *inptr2++;
*outptr2++ = *inptr2++;
outptr2 -= 6;
*outptr1++ = *inptr1++;
*outptr1++ = *inptr1++;
*outptr1++ = *inptr1++;
outptr1 -= 6;
*outptr0++ = *inptr0++;
*outptr0++ = *inptr0++;
*outptr0++ = *inptr0++;
outptr0 -= 6;
}
}
}
delete[] zerobuff;
}
#endif
} // namespace cv
} // namespace utils
} // namespace lite
} // namespace paddle
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <stdint.h>
namespace paddle {
namespace lite {
namespace utils {
namespace cv {
void bgr_rotate_hwc(
const uint8_t* src, uint8_t* dst, int w_in, int h_in, int angle);
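// Usage sketch (an assumption for illustration; angle is expected to be one
// of 90, 180 or 270, matching the rotate*_hwc kernels in bgr_rotate.cc):
//   std::vector<uint8_t> rotated(w_in * h_in * 3);
//   paddle::lite::utils::cv::bgr_rotate_hwc(src, rotated.data(), w_in, h_in, 90);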
} // namespace cv
} // namespace utils
} // namespace lite
} // namespace paddle
......@@ -664,15 +664,6 @@ void resize(const uint8_t* src,
memcpy(dst, src, sizeof(uint8_t) * size);
return;
}
double scale_x = static_cast<double>(srcw) / dstw;
double scale_y = static_cast<double>(srch) / dsth;
int* buf = new int[dstw * 2 + dsth * 3];
int* xofs = buf;
int* yofs = buf + dstw;
int16_t* ialpha = reinterpret_cast<int16_t*>(buf + dstw + dsth);
int16_t* ibeta = reinterpret_cast<int16_t*>(buf + 2 * dstw + dsth);
int w_out = dstw;
int w_in = srcw;
......@@ -692,12 +683,19 @@ void resize(const uint8_t* src,
w_in = srcw * 3;
w_out = dstw * 3;
num = 3;
} else if (srcFormat == BGRA || srcFormat == RGBA) {
w_in = srcw * 4;
w_out = dstw * 4;
num = 4;
}
double scale_x = static_cast<double>(srcw) / dstw;
double scale_y = static_cast<double>(srch) / dsth;
int* buf = new int[dstw * 2 + dsth * 3];
int* xofs = buf;
int* yofs = buf + dstw;
int16_t* ialpha = reinterpret_cast<int16_t*>(buf + dstw + dsth);
int16_t* ibeta = reinterpret_cast<int16_t*>(buf + 2 * dstw + dsth);
compute_xy(
srcw, srch, dstw, orih, num, scale_x, scale_y, xofs, yofs, ialpha, ibeta);
......@@ -726,10 +724,10 @@ void resize(const uint8_t* src,
int remain = w_out % 8;
int32x4_t _v2 = vdupq_n_s32(2);
int prev_sy1 = -1;
int16_t* rowsbuf0 = new int16_t[w_out + 1];
int16_t* rowsbuf1 = new int16_t[w_out + 1];
#pragma omp parallel for
for (int dy = 0; dy < dsth; dy++) {
int16_t* rowsbuf0 = new int16_t[w_out + 1];
int16_t* rowsbuf1 = new int16_t[w_out + 1];
int sy = yofs[dy];
if (dy >= orih) {
xofs = xofs1;
......@@ -853,8 +851,6 @@ void resize(const uint8_t* src,
2);
}
ibeta += 2;
delete[] rowsbuf0;
delete[] rowsbuf1;
}
if (orih < dsth) { // uv
delete[] xofs1;
......@@ -862,6 +858,8 @@ void resize(const uint8_t* src,
delete[] ialpha1;
}
delete[] buf;
delete[] rowsbuf0;
delete[] rowsbuf1;
}
// compute xofs, yofs, alpha, beta
void compute_xy(int srcw,
......
......@@ -15,6 +15,7 @@
#include "lite/utils/cv/image_rotate.h"
#include <math.h>
#include <string.h>
#include "lite/utils/cv/bgr_rotate.h"
namespace paddle {
namespace lite {
namespace utils {
......@@ -31,7 +32,8 @@ void ImageRotate::choose(const uint8_t* src,
if (srcFormat == GRAY) {
rotate_hwc1(src, dst, srcw, srch, degree);
} else if (srcFormat == BGR || srcFormat == RGB) {
rotate_hwc3(src, dst, srcw, srch, degree);
// rotate_hwc3(src, dst, srcw, srch, degree);
bgr_rotate_hwc(src, dst, srcw, srch, static_cast<int>(degree));
} else if (srcFormat == BGRA || srcFormat == RGBA) {
rotate_hwc4(src, dst, srcw, srch, degree);
} else {
......
......@@ -29,6 +29,7 @@
#include <cstring>
#include <string>
#include "lite/utils/replace_stl/stream.h"
#include "lite/utils/string.h"
#ifdef LITE_WITH_ANDROID
#include <android/log.h>
......@@ -171,7 +172,7 @@ class VLogMessage {
if (GLOG_v_int < level_int) {
return;
}
const char* level = std::to_string(level_int).c_str();
const char* level = paddle::lite::to_string(level_int).c_str();
paddle::lite::gen_log(log_stream_, file, func, lineno, level);
}
......
......@@ -15,6 +15,7 @@
#include "lite/utils/replace_stl/stream.h"
#include <assert.h>
#include <stdio.h>
#include "lite/utils/string.h"
#ifdef LITE_ON_TINY_PUBLISH
......@@ -39,9 +40,9 @@ void ostream::pad(const std::string& text) {
#ifdef LITE_SHUTDOWN_LOG
#define ADD_DATA_AS_STRING(data_, obj_)
#else
#define ADD_DATA_AS_STRING(data_, obj_) \
std::string text = std::to_string(obj_); \
pad(text); \
#define ADD_DATA_AS_STRING(data_, obj_) \
std::string text = paddle::lite::to_string(obj_); \
pad(text); \
data_ = data_ + text;
#endif
......
......@@ -48,7 +48,14 @@ template <typename T>
static std::string to_string_with_precision(const T& v, const int n = 6) {
STL::stringstream ss;
ss.precision(n);
// ss << std::fixed << v;
ss << v;
return ss.str();
}
template <typename T>
static std::string to_string(const T& v) {
STL::stringstream ss;
ss << v;
return ss.str();
}
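// Example use (illustrative only): a drop-in for std::to_string built on
// STL::stringstream, e.g.
//   std::string s = paddle::lite::to_string(42);           // "42"
//   std::string p = to_string_with_precision(3.14159, 3);  // "3.14"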
......