Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
PaddlePaddle
Paddle-Lite
提交
917620e8
P
Paddle-Lite
项目概览
PaddlePaddle
/
Paddle-Lite
通知
338
Star
4
Fork
1
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
271
列表
看板
标记
里程碑
合并请求
78
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
P
Paddle-Lite
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
271
Issue
271
列表
看板
标记
里程碑
合并请求
78
合并请求
78
Pages
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
提交
Issue看板
未验证
提交
917620e8
编写于
4月 07, 2020
作者:
H
huzhiqiang
提交者:
GitHub
4月 07, 2020
浏览文件
操作
浏览文件
下载
差异文件
Merge branch 'develop' into test_result
上级
7446e257
c3d78a6e
变更
92
展开全部
隐藏空白更改
内联
并排
Showing
92 changed file
with
2456 addition
and
1168 deletion
+2456
-1168
CMakeLists.txt
CMakeLists.txt
+1
-1
cmake/configure.cmake
cmake/configure.cmake
+4
-3
cmake/lite.cmake
cmake/lite.cmake
+7
-7
lite/CMakeLists.txt
lite/CMakeLists.txt
+2
-1
lite/api/CMakeLists.txt
lite/api/CMakeLists.txt
+17
-13
lite/api/benchmark.cc
lite/api/benchmark.cc
+7
-13
lite/api/cxx_api.cc
lite/api/cxx_api.cc
+6
-1
lite/api/light_api.cc
lite/api/light_api.cc
+1
-5
lite/api/light_api_shared.cc
lite/api/light_api_shared.cc
+0
-29
lite/api/opt.cc
lite/api/opt.cc
+1
-0
lite/api/test_classify_lite_bm.cc
lite/api/test_classify_lite_bm.cc
+8
-9
lite/api/transform_test.cc
lite/api/transform_test.cc
+2
-0
lite/backends/opencl/cl_context.cc
lite/backends/opencl/cl_context.cc
+75
-27
lite/backends/opencl/cl_context.h
lite/backends/opencl/cl_context.h
+7
-5
lite/backends/opencl/cl_runtime.cc
lite/backends/opencl/cl_runtime.cc
+20
-3
lite/backends/opencl/cl_runtime.h
lite/backends/opencl/cl_runtime.h
+14
-1
lite/backends/opencl/cl_utility.h
lite/backends/opencl/cl_utility.h
+4
-1
lite/core/mir/fusion/conv_bn_fuse_pass.cc
lite/core/mir/fusion/conv_bn_fuse_pass.cc
+2
-1
lite/core/mir/fusion/conv_bn_fuser.cc
lite/core/mir/fusion/conv_bn_fuser.cc
+51
-13
lite/core/mir/fusion/quant_dequant_fuse_pass.cc
lite/core/mir/fusion/quant_dequant_fuse_pass.cc
+3
-5
lite/core/mir/fusion/quant_dequant_op_fuser.cc
lite/core/mir/fusion/quant_dequant_op_fuser.cc
+85
-175
lite/core/mir/fusion/quant_dequant_op_fuser.h
lite/core/mir/fusion/quant_dequant_op_fuser.h
+2
-10
lite/core/op_lite.h
lite/core/op_lite.h
+26
-0
lite/core/profile/precision_profiler.h
lite/core/profile/precision_profiler.h
+1
-0
lite/core/program.cc
lite/core/program.cc
+3
-7
lite/demo/cxx/train_demo/README.md
lite/demo/cxx/train_demo/README.md
+191
-0
lite/demo/cxx/train_demo/cplus_train/CMakeLists.txt
lite/demo/cxx/train_demo/cplus_train/CMakeLists.txt
+24
-0
lite/demo/cxx/train_demo/cplus_train/data_reader.cc
lite/demo/cxx/train_demo/cplus_train/data_reader.cc
+109
-0
lite/demo/cxx/train_demo/cplus_train/demo_trainer.cc
lite/demo/cxx/train_demo/cplus_train/demo_trainer.cc
+145
-0
lite/demo/cxx/train_demo/cplus_train/include/data_reader.h
lite/demo/cxx/train_demo/cplus_train/include/data_reader.h
+37
-0
lite/demo/cxx/train_demo/cplus_train/run_build.sh
lite/demo/cxx/train_demo/cplus_train/run_build.sh
+21
-0
lite/demo/cxx/train_demo/image/lr_loss.png
lite/demo/cxx/train_demo/image/lr_loss.png
+0
-0
lite/demo/cxx/train_demo/train.py
lite/demo/cxx/train_demo/train.py
+135
-0
lite/kernels/arm/CMakeLists.txt
lite/kernels/arm/CMakeLists.txt
+6
-7
lite/kernels/bm/bridges/CMakeLists.txt
lite/kernels/bm/bridges/CMakeLists.txt
+4
-0
lite/kernels/bm/bridges/act_op.cc
lite/kernels/bm/bridges/act_op.cc
+13
-5
lite/kernels/bm/bridges/assign_value_op.cc
lite/kernels/bm/bridges/assign_value_op.cc
+66
-0
lite/kernels/bm/bridges/conv_op.cc
lite/kernels/bm/bridges/conv_op.cc
+2
-0
lite/kernels/bm/bridges/elementwise_ops.cc
lite/kernels/bm/bridges/elementwise_ops.cc
+1
-0
lite/kernels/bm/bridges/fill_constant_op.cc
lite/kernels/bm/bridges/fill_constant_op.cc
+66
-0
lite/kernels/bm/bridges/mul_op.cc
lite/kernels/bm/bridges/mul_op.cc
+39
-15
lite/kernels/bm/bridges/paddle_use_bridges.h
lite/kernels/bm/bridges/paddle_use_bridges.h
+2
-0
lite/kernels/bm/subgraph_compute.cc
lite/kernels/bm/subgraph_compute.cc
+1
-1
lite/kernels/host/CMakeLists.txt
lite/kernels/host/CMakeLists.txt
+0
-3
lite/kernels/host/multiclass_nms_compute.cc
lite/kernels/host/multiclass_nms_compute.cc
+1
-0
lite/kernels/host/multiclass_nms_compute_test.cc
lite/kernels/host/multiclass_nms_compute_test.cc
+0
-368
lite/kernels/opencl/activation_buffer_compute.cc
lite/kernels/opencl/activation_buffer_compute.cc
+12
-6
lite/kernels/opencl/activation_image_compute.cc
lite/kernels/opencl/activation_image_compute.cc
+68
-29
lite/kernels/opencl/bilinear_interp_image_compute.cc
lite/kernels/opencl/bilinear_interp_image_compute.cc
+6
-3
lite/kernels/opencl/concat_buffer_compute.cc
lite/kernels/opencl/concat_buffer_compute.cc
+6
-3
lite/kernels/opencl/concat_image_compute.cc
lite/kernels/opencl/concat_image_compute.cc
+6
-3
lite/kernels/opencl/conv_buffer_compute.cc
lite/kernels/opencl/conv_buffer_compute.cc
+8
-5
lite/kernels/opencl/conv_buffer_compute.h
lite/kernels/opencl/conv_buffer_compute.h
+2
-0
lite/kernels/opencl/conv_image_compute.cc
lite/kernels/opencl/conv_image_compute.cc
+93
-101
lite/kernels/opencl/conv_image_compute.h
lite/kernels/opencl/conv_image_compute.h
+16
-13
lite/kernels/opencl/depthwise_conv2d_buffer_compute.cc
lite/kernels/opencl/depthwise_conv2d_buffer_compute.cc
+6
-3
lite/kernels/opencl/dropout_image_compute.cc
lite/kernels/opencl/dropout_image_compute.cc
+6
-3
lite/kernels/opencl/elementwise_add_buffer_compute.cc
lite/kernels/opencl/elementwise_add_buffer_compute.cc
+5
-3
lite/kernels/opencl/elementwise_add_buffer_compute.h
lite/kernels/opencl/elementwise_add_buffer_compute.h
+2
-0
lite/kernels/opencl/elementwise_add_image_compute.cc
lite/kernels/opencl/elementwise_add_image_compute.cc
+86
-68
lite/kernels/opencl/elementwise_add_image_compute.h
lite/kernels/opencl/elementwise_add_image_compute.h
+19
-0
lite/kernels/opencl/elementwise_mul_image_compute.cc
lite/kernels/opencl/elementwise_mul_image_compute.cc
+6
-3
lite/kernels/opencl/elementwise_sub_image_compute.cc
lite/kernels/opencl/elementwise_sub_image_compute.cc
+5
-3
lite/kernels/opencl/elementwise_sub_image_compute.h
lite/kernels/opencl/elementwise_sub_image_compute.h
+2
-0
lite/kernels/opencl/fc_buffer_compute.cc
lite/kernels/opencl/fc_buffer_compute.cc
+77
-47
lite/kernels/opencl/fusion_elementwise_add_activation_buffer_compute.cc
...pencl/fusion_elementwise_add_activation_buffer_compute.cc
+4
-2
lite/kernels/opencl/fusion_elementwise_add_activation_image_compute.cc
...opencl/fusion_elementwise_add_activation_image_compute.cc
+5
-2
lite/kernels/opencl/grid_sampler_image_compute.cc
lite/kernels/opencl/grid_sampler_image_compute.cc
+75
-51
lite/kernels/opencl/image_helper.h
lite/kernels/opencl/image_helper.h
+6
-0
lite/kernels/opencl/instance_norm_image_compute.cc
lite/kernels/opencl/instance_norm_image_compute.cc
+11
-6
lite/kernels/opencl/lrn_image_compute.cc
lite/kernels/opencl/lrn_image_compute.cc
+3
-2
lite/kernels/opencl/mul_buffer_compute.cc
lite/kernels/opencl/mul_buffer_compute.cc
+7
-3
lite/kernels/opencl/nearest_interp_image_compute.cc
lite/kernels/opencl/nearest_interp_image_compute.cc
+6
-3
lite/kernels/opencl/pad2d_image_compute.cc
lite/kernels/opencl/pad2d_image_compute.cc
+6
-3
lite/kernels/opencl/pool_buffer_compute.cc
lite/kernels/opencl/pool_buffer_compute.cc
+6
-3
lite/kernels/opencl/pool_image_compute.cc
lite/kernels/opencl/pool_image_compute.cc
+3
-2
lite/kernels/opencl/reshape_image_compute.cc
lite/kernels/opencl/reshape_image_compute.cc
+6
-3
lite/kernels/opencl/scale_image_compute.cc
lite/kernels/opencl/scale_image_compute.cc
+53
-30
lite/kernels/opencl/slice_image_compute.cc
lite/kernels/opencl/slice_image_compute.cc
+6
-3
lite/kernels/xpu/bridges/CMakeLists.txt
lite/kernels/xpu/bridges/CMakeLists.txt
+2
-0
lite/kernels/xpu/bridges/paddle_use_bridges.h
lite/kernels/xpu/bridges/paddle_use_bridges.h
+1
-0
lite/kernels/xpu/bridges/yolo_box_op.cc
lite/kernels/xpu/bridges/yolo_box_op.cc
+85
-0
lite/kernels/xpu/subgraph_compute.cc
lite/kernels/xpu/subgraph_compute.cc
+3
-5
lite/model_parser/model_parser.cc
lite/model_parser/model_parser.cc
+1
-1
lite/operators/CMakeLists.txt
lite/operators/CMakeLists.txt
+6
-7
lite/operators/multiclass_nms_op.cc
lite/operators/multiclass_nms_op.cc
+2
-8
lite/tests/kernels/CMakeLists.txt
lite/tests/kernels/CMakeLists.txt
+1
-0
lite/tests/kernels/multiclass_nms_compute_test.cc
lite/tests/kernels/multiclass_nms_compute_test.cc
+491
-0
lite/tests/kernels/yolo_box_compute_test.cc
lite/tests/kernels/yolo_box_compute_test.cc
+15
-11
lite/tools/build.sh
lite/tools/build.sh
+7
-0
lite/tools/build_bm.sh
lite/tools/build_bm.sh
+1
-1
lite/tools/ci_build.sh
lite/tools/ci_build.sh
+1
-0
未找到文件。
CMakeLists.txt
浏览文件 @
917620e8
...
@@ -67,7 +67,7 @@ lite_option(LITE_WITH_OPENCL "Enable OpenCL support in lite" OFF)
...
@@ -67,7 +67,7 @@ lite_option(LITE_WITH_OPENCL "Enable OpenCL support in lite" OFF)
lite_option
(
LITE_WITH_FPGA
"Enable FPGA support in lite"
OFF
)
lite_option
(
LITE_WITH_FPGA
"Enable FPGA support in lite"
OFF
)
lite_option
(
LITE_WITH_LIGHT_WEIGHT_FRAMEWORK
"Enable light-weight framework"
OFF
)
lite_option
(
LITE_WITH_LIGHT_WEIGHT_FRAMEWORK
"Enable light-weight framework"
OFF
)
lite_option
(
LITE_WITH_PROFILE
"Enable profile mode in lite framework"
OFF
)
lite_option
(
LITE_WITH_PROFILE
"Enable profile mode in lite framework"
OFF
)
lite_option
(
LITE_WITH_PRECISION_PROFILE
"Enable precision profile in profile mode ON in lite"
OFF
IF LITE_WITH_PROFILE
)
lite_option
(
LITE_WITH_PRECISION_PROFILE
"Enable precision profile in profile mode ON in lite"
OFF
)
lite_option
(
LITE_SHUTDOWN_LOG
"Shutdown log system or not."
OFF
)
lite_option
(
LITE_SHUTDOWN_LOG
"Shutdown log system or not."
OFF
)
lite_option
(
LITE_ON_TINY_PUBLISH
"Publish tiny predictor lib."
OFF
)
lite_option
(
LITE_ON_TINY_PUBLISH
"Publish tiny predictor lib."
OFF
)
lite_option
(
LITE_ON_MODEL_OPTIMIZE_TOOL
"Build the model optimize tool"
OFF
)
lite_option
(
LITE_ON_MODEL_OPTIMIZE_TOOL
"Build the model optimize tool"
OFF
)
...
...
cmake/configure.cmake
浏览文件 @
917620e8
...
@@ -152,9 +152,10 @@ endif()
...
@@ -152,9 +152,10 @@ endif()
if
(
LITE_WITH_PROFILE
)
if
(
LITE_WITH_PROFILE
)
add_definitions
(
"-DLITE_WITH_PROFILE"
)
add_definitions
(
"-DLITE_WITH_PROFILE"
)
if
(
LITE_WITH_PRECISION_PROFILE
)
endif
()
add_definitions
(
"-DLITE_WITH_PRECISION_PROFILE"
)
endif
()
if
(
LITE_WITH_PRECISION_PROFILE
)
add_definitions
(
"-DLITE_WITH_PRECISION_PROFILE"
)
endif
()
endif
()
if
(
LITE_WITH_LIGHT_WEIGHT_FRAMEWORK
)
if
(
LITE_WITH_LIGHT_WEIGHT_FRAMEWORK
)
...
...
cmake/lite.cmake
浏览文件 @
917620e8
...
@@ -307,6 +307,9 @@ function(add_kernel TARGET device level)
...
@@ -307,6 +307,9 @@ function(add_kernel TARGET device level)
if
(
"
${
level
}
"
STREQUAL
"extra"
AND
(
NOT LITE_BUILD_EXTRA
))
if
(
"
${
level
}
"
STREQUAL
"extra"
AND
(
NOT LITE_BUILD_EXTRA
))
return
()
return
()
endif
()
endif
()
if
(
"
${
level
}
"
STREQUAL
"train"
AND
(
NOT LITE_WITH_TRAIN
))
return
()
endif
()
if
(
"
${
device
}
"
STREQUAL
"Host"
)
if
(
"
${
device
}
"
STREQUAL
"Host"
)
...
@@ -322,16 +325,11 @@ function(add_kernel TARGET device level)
...
@@ -322,16 +325,11 @@ function(add_kernel TARGET device level)
set
(
arm_kernels
"
${
arm_kernels
}
;
${
TARGET
}
"
CACHE INTERNAL
""
)
set
(
arm_kernels
"
${
arm_kernels
}
;
${
TARGET
}
"
CACHE INTERNAL
""
)
endif
()
endif
()
if
(
"
${
device
}
"
STREQUAL
"X86"
)
if
(
"
${
device
}
"
STREQUAL
"X86"
)
if
(
NOT LITE_WITH_X86
)
if
(
NOT LITE_WITH_X86
OR LITE_ON_MODEL_OPTIMIZE_TOOL
)
foreach
(
src
${
args_SRCS
}
)
foreach
(
src
${
args_SRCS
}
)
file
(
APPEND
${
fake_kernels_src_list
}
"
${
CMAKE_CURRENT_SOURCE_DIR
}
/
${
src
}
\n
"
)
file
(
APPEND
${
fake_kernels_src_list
}
"
${
CMAKE_CURRENT_SOURCE_DIR
}
/
${
src
}
\n
"
)
endforeach
()
endforeach
()
return
()
return
()
elseif
(
LITE_ON_MODEL_OPTIMIZE_TOOL
)
foreach
(
src
${
args_SRCS
}
)
file
(
APPEND
${
kernels_src_list
}
"
${
CMAKE_CURRENT_SOURCE_DIR
}
/
${
src
}
\n
"
)
endforeach
()
return
()
endif
()
endif
()
set
(
x86_kernels
"
${
x86_kernels
}
;
${
TARGET
}
"
CACHE INTERNAL
""
)
set
(
x86_kernels
"
${
x86_kernels
}
;
${
TARGET
}
"
CACHE INTERNAL
""
)
endif
()
endif
()
...
@@ -434,11 +432,13 @@ function(add_operator TARGET level)
...
@@ -434,11 +432,13 @@ function(add_operator TARGET level)
ARGS
)
ARGS
)
cmake_parse_arguments
(
args
"
${
options
}
"
"
${
oneValueArgs
}
"
"
${
multiValueArgs
}
"
${
ARGN
}
)
cmake_parse_arguments
(
args
"
${
options
}
"
"
${
oneValueArgs
}
"
"
${
multiValueArgs
}
"
${
ARGN
}
)
if
(
"
${
level
}
"
STREQUAL
"extra"
AND
(
NOT LITE_BUILD_EXTRA
))
if
(
"
${
level
}
"
STREQUAL
"extra"
AND
(
NOT LITE_BUILD_EXTRA
))
return
()
return
()
endif
()
endif
()
if
(
"
${
level
}
"
STREQUAL
"train"
AND
(
NOT LITE_WITH_TRAIN
))
return
()
endif
()
foreach
(
src
${
args_SRCS
}
)
foreach
(
src
${
args_SRCS
}
)
if
(
LITE_BUILD_TAILOR
)
if
(
LITE_BUILD_TAILOR
)
...
...
lite/CMakeLists.txt
浏览文件 @
917620e8
...
@@ -192,7 +192,8 @@ if (LITE_WITH_LIGHT_WEIGHT_FRAMEWORK AND LITE_WITH_ARM)
...
@@ -192,7 +192,8 @@ if (LITE_WITH_LIGHT_WEIGHT_FRAMEWORK AND LITE_WITH_ARM)
add_dependencies
(
publish_inference publish_inference_cxx_lib
)
add_dependencies
(
publish_inference publish_inference_cxx_lib
)
if
(
NOT
"
${
CMAKE_BUILD_TYPE
}
"
STREQUAL
"Debug"
)
if
(
NOT
"
${
CMAKE_BUILD_TYPE
}
"
STREQUAL
"Debug"
)
add_custom_command
(
TARGET publish_inference_cxx_lib POST_BUILD
add_custom_command
(
TARGET publish_inference_cxx_lib POST_BUILD
COMMAND
${
CMAKE_STRIP
}
"--strip-debug"
${
INFER_LITE_PUBLISH_ROOT
}
/cxx/lib/*.a
)
COMMAND
${
CMAKE_STRIP
}
"--strip-debug"
${
INFER_LITE_PUBLISH_ROOT
}
/cxx/lib/*.a
COMMAND
${
CMAKE_STRIP
}
"--strip-debug"
${
INFER_LITE_PUBLISH_ROOT
}
/cxx/lib/*.so
)
endif
()
endif
()
endif
()
endif
()
else
()
else
()
...
...
lite/api/CMakeLists.txt
浏览文件 @
917620e8
...
@@ -8,11 +8,12 @@ if (LITE_ON_TINY_PUBLISH)
...
@@ -8,11 +8,12 @@ if (LITE_ON_TINY_PUBLISH)
set
(
CMAKE_CXX_FLAGS_RELEASE
"-Os -DNDEBUG"
)
set
(
CMAKE_CXX_FLAGS_RELEASE
"-Os -DNDEBUG"
)
set
(
CMAKE_C_FLAGS_RELEASE
"-Os -DNDEBUG"
)
set
(
CMAKE_C_FLAGS_RELEASE
"-Os -DNDEBUG"
)
endif
()
endif
()
set
(
light_lib_DEPS light_api paddle_api paddle_api_light optimizer
)
if
((
NOT LITE_ON_TINY_PUBLISH
)
AND
(
LITE_WITH_CUDA OR LITE_WITH_X86 OR ARM_TARGET_OS STREQUAL
"android"
OR ARM_TARGET_OS STREQUAL
"armlinux"
))
set
(
light_lib_DEPS light_api paddle_api paddle_api_light
)
if
((
NOT LITE_ON_TINY_PUBLISH
)
AND
(
LITE_WITH_CUDA OR LITE_WITH_X86 OR LITE_WITH_BM OR ARM_TARGET_OS STREQUAL
"android"
OR ARM_TARGET_OS STREQUAL
"armlinux"
))
#full api dynamic library
#full api dynamic library
add_library
(
paddle_full_api_shared SHARED
""
)
lite_cc_library
(
paddle_full_api_shared SHARED SRCS paddle_api.cc light_api.cc cxx_api.cc cxx_api_impl.cc light_api_impl.cc
target_sources
(
paddle_full_api_shared PUBLIC
${
__lite_cc_files
}
paddle_api.cc light_api.cc cxx_api.cc cxx_api_impl.cc light_api_impl.cc
)
DEPS paddle_api paddle_api_light paddle_api_full
)
add_dependencies
(
paddle_full_api_shared op_list_h kernel_list_h framework_proto
)
add_dependencies
(
paddle_full_api_shared op_list_h kernel_list_h framework_proto
)
target_link_libraries
(
paddle_full_api_shared framework_proto
)
target_link_libraries
(
paddle_full_api_shared framework_proto
)
if
(
LITE_WITH_X86
)
if
(
LITE_WITH_X86
)
...
@@ -27,13 +28,13 @@ if ((NOT LITE_ON_TINY_PUBLISH) AND (LITE_WITH_CUDA OR LITE_WITH_X86 OR ARM_TARGE
...
@@ -27,13 +28,13 @@ if ((NOT LITE_ON_TINY_PUBLISH) AND (LITE_WITH_CUDA OR LITE_WITH_X86 OR ARM_TARGE
endif
(
LITE_WITH_CUDA
)
endif
(
LITE_WITH_CUDA
)
#light api dynamic library
#light api dynamic library
lite_cc_library
(
paddle_light_api_shared
MODULE
lite_cc_library
(
paddle_light_api_shared
SHARED SRCS paddle_api.cc light_api.cc light_api_impl.cc
SRCS light_api_shared.cc
DEPS
${
light_lib_DEPS
}
DEPS
${
light_lib_DEPS
}
ARM_DEPS
${
arm_kernels
}
ARM_DEPS
${
arm_kernels
}
CV_DEPS paddle_cv_arm
CV_DEPS paddle_cv_arm
NPU_DEPS
${
npu_kernels
}
NPU_DEPS
${
npu_kernels
}
)
)
add_dependencies
(
paddle_light_api_shared op_list_h kernel_list_h
)
target_link_libraries
(
paddle_light_api_shared
${
light_lib_DEPS
}
${
arm_kernels
}
${
npu_kernels
}
)
target_link_libraries
(
paddle_light_api_shared
${
light_lib_DEPS
}
${
arm_kernels
}
${
npu_kernels
}
)
set
(
LINK_MAP_FILE
"
${
PADDLE_SOURCE_DIR
}
/lite/core/lite.map"
)
set
(
LINK_MAP_FILE
"
${
PADDLE_SOURCE_DIR
}
/lite/core/lite.map"
)
set
(
LINK_FLAGS
"-Wl,--version-script
${
LINK_MAP_FILE
}
"
)
set
(
LINK_FLAGS
"-Wl,--version-script
${
LINK_MAP_FILE
}
"
)
...
@@ -262,7 +263,10 @@ if (NOT LITE_ON_TINY_PUBLISH)
...
@@ -262,7 +263,10 @@ if (NOT LITE_ON_TINY_PUBLISH)
CV_DEPS paddle_cv_arm
CV_DEPS paddle_cv_arm
NPU_DEPS
${
npu_kernels
}
NPU_DEPS
${
npu_kernels
}
CL_DEPS
${
opencl_kernels
}
CL_DEPS
${
opencl_kernels
}
FPGA_DEPS
${
fpga_kernels
}
)
FPGA_DEPS
${
fpga_kernels
}
CV_DEPS paddle_cv_arm
NPU_DEPS
${
npu_kernels
}
BM_DEPS
${
bm_kernels
}
)
# The final inference library for just MobileConfig.
# The final inference library for just MobileConfig.
bundle_static_library
(
paddle_api_full paddle_api_full_bundled bundle_full_api
)
bundle_static_library
(
paddle_api_full paddle_api_full_bundled bundle_full_api
)
target_link_libraries
(
paddle_api_full
${
cuda_deps
}
)
target_link_libraries
(
paddle_api_full
${
cuda_deps
}
)
...
@@ -311,7 +315,7 @@ add_dependencies(opt_base supported_kernel_op_info_h framework_proto all_kernel_
...
@@ -311,7 +315,7 @@ add_dependencies(opt_base supported_kernel_op_info_h framework_proto all_kernel_
if
(
LITE_ON_MODEL_OPTIMIZE_TOOL
)
if
(
LITE_ON_MODEL_OPTIMIZE_TOOL
)
message
(
STATUS
"Compiling opt"
)
message
(
STATUS
"Compiling opt"
)
lite_cc_binary
(
opt SRCS opt.cc cxx_api_impl.cc paddle_api.cc cxx_api.cc
lite_cc_binary
(
opt SRCS opt.cc cxx_api_impl.cc paddle_api.cc cxx_api.cc
DEPS gflags kernel op optimizer mir_passes utils
)
DEPS gflags kernel op optimizer mir_passes utils
${
host_kernels
}
)
add_dependencies
(
opt op_list_h kernel_list_h all_kernel_faked_cc supported_kernel_op_info_h
)
add_dependencies
(
opt op_list_h kernel_list_h all_kernel_faked_cc supported_kernel_op_info_h
)
endif
(
LITE_ON_MODEL_OPTIMIZE_TOOL
)
endif
(
LITE_ON_MODEL_OPTIMIZE_TOOL
)
...
...
lite/api/benchmark.cc
浏览文件 @
917620e8
...
@@ -44,7 +44,10 @@ DEFINE_string(input_shape,
...
@@ -44,7 +44,10 @@ DEFINE_string(input_shape,
"set input shapes according to the model, "
"set input shapes according to the model, "
"separated by colon and comma, "
"separated by colon and comma, "
"such as 1,3,244,244"
);
"such as 1,3,244,244"
);
DEFINE_string
(
input_img_path
,
""
,
"the path of input image"
);
DEFINE_string
(
input_img_path
,
""
,
"the path of input image, if not set "
"input_img_path, the input of model will be 1.0."
);
DEFINE_int32
(
warmup
,
0
,
"warmup times"
);
DEFINE_int32
(
warmup
,
0
,
"warmup times"
);
DEFINE_int32
(
repeats
,
1
,
"repeats times"
);
DEFINE_int32
(
repeats
,
1
,
"repeats times"
);
DEFINE_int32
(
power_mode
,
DEFINE_int32
(
power_mode
,
...
@@ -57,16 +60,11 @@ DEFINE_int32(power_mode,
...
@@ -57,16 +60,11 @@ DEFINE_int32(power_mode,
DEFINE_int32
(
threads
,
1
,
"threads num"
);
DEFINE_int32
(
threads
,
1
,
"threads num"
);
DEFINE_string
(
result_filename
,
DEFINE_string
(
result_filename
,
"result.txt"
,
"result.txt"
,
"save benchmark "
"save the inference time to the file."
);
"result to the file"
);
DEFINE_bool
(
run_model_optimize
,
DEFINE_bool
(
run_model_optimize
,
false
,
false
,
"if set true, apply model_optimize_tool to "
"if set true, apply model_optimize_tool to "
"model and use optimized model to test. "
);
"model and use optimized model to test. "
);
DEFINE_bool
(
is_quantized_model
,
false
,
"if set true, "
"test the performance of the quantized model. "
);
namespace
paddle
{
namespace
paddle
{
namespace
lite_api
{
namespace
lite_api
{
...
@@ -87,10 +85,6 @@ void OutputOptModel(const std::string& save_optimized_model_dir) {
...
@@ -87,10 +85,6 @@ void OutputOptModel(const std::string& save_optimized_model_dir) {
std
::
vector
<
Place
>
vaild_places
=
{
std
::
vector
<
Place
>
vaild_places
=
{
Place
{
TARGET
(
kARM
),
PRECISION
(
kFloat
)},
Place
{
TARGET
(
kARM
),
PRECISION
(
kFloat
)},
};
};
if
(
FLAGS_is_quantized_model
)
{
vaild_places
.
insert
(
vaild_places
.
begin
(),
Place
{
TARGET
(
kARM
),
PRECISION
(
kInt8
)});
}
config
.
set_valid_places
(
vaild_places
);
config
.
set_valid_places
(
vaild_places
);
auto
predictor
=
lite_api
::
CreatePaddlePredictor
(
config
);
auto
predictor
=
lite_api
::
CreatePaddlePredictor
(
config
);
...
@@ -181,8 +175,8 @@ void Run(const std::vector<int64_t>& input_shape,
...
@@ -181,8 +175,8 @@ void Run(const std::vector<int64_t>& input_shape,
int
main
(
int
argc
,
char
**
argv
)
{
int
main
(
int
argc
,
char
**
argv
)
{
gflags
::
ParseCommandLineFlags
(
&
argc
,
&
argv
,
true
);
gflags
::
ParseCommandLineFlags
(
&
argc
,
&
argv
,
true
);
if
(
FLAGS_model_dir
==
""
||
FLAGS_result_filename
==
""
)
{
if
(
FLAGS_model_dir
==
""
)
{
LOG
(
INFO
)
<<
"
p
lease run ./benchmark_bin --help to obtain usage."
;
LOG
(
INFO
)
<<
"
P
lease run ./benchmark_bin --help to obtain usage."
;
exit
(
0
);
exit
(
0
);
}
}
...
...
lite/api/cxx_api.cc
浏览文件 @
917620e8
...
@@ -19,6 +19,7 @@
...
@@ -19,6 +19,7 @@
#include <string>
#include <string>
#include <utility>
#include <utility>
#include <vector>
#include <vector>
#include "lite/api/paddle_use_passes.h"
#include "lite/utils/io.h"
#include "lite/utils/io.h"
namespace
paddle
{
namespace
paddle
{
...
@@ -298,6 +299,9 @@ void Predictor::Build(const std::shared_ptr<cpp::ProgramDesc> &desc,
...
@@ -298,6 +299,9 @@ void Predictor::Build(const std::shared_ptr<cpp::ProgramDesc> &desc,
inner_places
.
emplace_back
(
TARGET
(
kHost
),
PRECISION
(
kAny
),
DATALAYOUT
(
kAny
));
inner_places
.
emplace_back
(
TARGET
(
kHost
),
PRECISION
(
kAny
),
DATALAYOUT
(
kAny
));
inner_places
.
emplace_back
(
inner_places
.
emplace_back
(
TARGET
(
kHost
),
PRECISION
(
kFloat
),
DATALAYOUT
(
kNCHW
));
TARGET
(
kHost
),
PRECISION
(
kFloat
),
DATALAYOUT
(
kNCHW
));
// Analysis whether the modle is quantized.
// For quantized model, add place(arm, int8) to inner_places
const
std
::
vector
<
std
::
string
>
quant_dequant_op
=
{
const
std
::
vector
<
std
::
string
>
quant_dequant_op
=
{
"fake_quantize_abs_max"
,
"fake_quantize_abs_max"
,
"fake_quantize_range_abs_max"
,
"fake_quantize_range_abs_max"
,
...
@@ -320,7 +324,8 @@ void Predictor::Build(const std::shared_ptr<cpp::ProgramDesc> &desc,
...
@@ -320,7 +324,8 @@ void Predictor::Build(const std::shared_ptr<cpp::ProgramDesc> &desc,
}
}
}
}
if
(
is_quantized_model
)
{
if
(
is_quantized_model
)
{
inner_places
.
emplace_back
(
Place
{
TARGET
(
kARM
),
PRECISION
(
kInt8
)});
inner_places
.
insert
(
inner_places
.
begin
(),
Place
{
TARGET
(
kARM
),
PRECISION
(
kInt8
)});
}
}
Program
program
(
*
desc
.
get
(),
scope_
,
inner_places
);
Program
program
(
*
desc
.
get
(),
scope_
,
inner_places
);
...
...
lite/api/light_api.cc
浏览文件 @
917620e8
...
@@ -13,13 +13,9 @@
...
@@ -13,13 +13,9 @@
// limitations under the License.
// limitations under the License.
#include "lite/api/light_api.h"
#include "lite/api/light_api.h"
#include <algorithm>
#include "paddle_use_kernels.h" // NOLINT
#include "paddle_use_kernels.h" // NOLINT
#include "paddle_use_ops.h" // NOLINT
#include "paddle_use_ops.h" // NOLINT
#ifndef LITE_ON_TINY_PUBLISH
#include "lite/api/paddle_use_passes.h"
#endif
#include <algorithm>
namespace
paddle
{
namespace
paddle
{
namespace
lite
{
namespace
lite
{
...
...
lite/api/light_api_shared.cc
已删除
100644 → 0
浏览文件 @
7446e257
/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "lite/api/paddle_api.h"
namespace
paddle
{
namespace
lite_api
{
void
RunModel
()
{
// 1. Set MobileConfig
MobileConfig
mobile_config
;
// 2. Create PaddlePredictor by MobileConfig
std
::
shared_ptr
<
PaddlePredictor
>
mobile_predictor
=
CreatePaddlePredictor
<
MobileConfig
>
(
mobile_config
);
}
}
// namespace lite_api
}
// namespace paddle
lite/api/opt.cc
浏览文件 @
917620e8
...
@@ -23,6 +23,7 @@
...
@@ -23,6 +23,7 @@
#include "kernel_src_map.h" // NOLINT
#include "kernel_src_map.h" // NOLINT
#include "lite/api/cxx_api.h"
#include "lite/api/cxx_api.h"
#include "lite/api/paddle_api.h"
#include "lite/api/paddle_api.h"
#include "lite/api/paddle_use_kernels.h"
#include "lite/api/paddle_use_ops.h"
#include "lite/api/paddle_use_ops.h"
#include "lite/api/paddle_use_passes.h"
#include "lite/api/paddle_use_passes.h"
#include "lite/core/op_registry.h"
#include "lite/core/op_registry.h"
...
...
lite/api/test_classify_lite_bm.cc
浏览文件 @
917620e8
...
@@ -36,7 +36,8 @@ void TestModel(const std::vector<Place>& valid_places) {
...
@@ -36,7 +36,8 @@ void TestModel(const std::vector<Place>& valid_places) {
predictor
.
Build
(
FLAGS_model_dir
,
""
,
""
,
valid_places
,
passes
);
predictor
.
Build
(
FLAGS_model_dir
,
""
,
""
,
valid_places
,
passes
);
auto
*
input_tensor
=
predictor
.
GetInput
(
0
);
auto
*
input_tensor
=
predictor
.
GetInput
(
0
);
input_tensor
->
Resize
(
DDim
(
std
::
vector
<
DDim
::
value_type
>
({
1
,
3
,
224
,
224
})));
input_tensor
->
Resize
(
DDim
(
std
::
vector
<
DDim
::
value_type
>
({
1
,
3
,
FLAGS_im_height
,
FLAGS_im_width
})));
auto
*
data
=
input_tensor
->
mutable_data
<
float
>
();
auto
*
data
=
input_tensor
->
mutable_data
<
float
>
();
auto
item_size
=
input_tensor
->
dims
().
production
();
auto
item_size
=
input_tensor
->
dims
().
production
();
if
(
FLAGS_input_img_txt_path
.
empty
())
{
if
(
FLAGS_input_img_txt_path
.
empty
())
{
...
@@ -67,15 +68,13 @@ void TestModel(const std::vector<Place>& valid_places) {
...
@@ -67,15 +68,13 @@ void TestModel(const std::vector<Place>& valid_places) {
<<
", spend "
<<
(
GetCurrentUS
()
-
start
)
/
FLAGS_repeats
/
1000.0
<<
", spend "
<<
(
GetCurrentUS
()
-
start
)
/
FLAGS_repeats
/
1000.0
<<
" ms in average."
;
<<
" ms in average."
;
auto
*
out
=
predictor
.
GetOutput
(
0
);
auto
out
=
predictor
.
GetOutputs
();
ASSERT_EQ
(
out
->
dims
().
size
(),
2
);
ASSERT_EQ
(
out
->
dims
()[
0
],
1
);
ASSERT_EQ
(
out
->
dims
()[
1
],
1000
);
auto
*
out_data
=
out
->
data
<
float
>
();
FILE
*
fp
=
fopen
(
"result.txt"
,
"wb"
);
FILE
*
fp
=
fopen
(
"result.txt"
,
"wb"
);
for
(
int
i
=
0
;
i
<
out
->
numel
();
i
++
)
{
for
(
int
i
=
0
;
i
<
out
.
size
();
i
++
)
{
fprintf
(
fp
,
"%f
\n
"
,
out_data
[
i
]);
auto
*
out_data
=
out
[
i
]
->
data
<
float
>
();
for
(
int
j
=
0
;
j
<
out
[
i
]
->
numel
();
j
++
)
{
fprintf
(
fp
,
"%f
\n
"
,
out_data
[
j
]);
}
}
}
fclose
(
fp
);
fclose
(
fp
);
}
}
...
...
lite/api/transform_test.cc
浏览文件 @
917620e8
...
@@ -13,7 +13,9 @@
...
@@ -13,7 +13,9 @@
// limitations under the License.
// limitations under the License.
#include <gflags/gflags.h>
#include <gflags/gflags.h>
#ifdef PADDLE_WITH_TESTING
#include <gtest/gtest.h>
#include <gtest/gtest.h>
#endif
#include <string>
#include <string>
#include <vector>
#include <vector>
#include "lite/api/cxx_api.h"
#include "lite/api/cxx_api.h"
...
...
lite/backends/opencl/cl_context.cc
浏览文件 @
917620e8
...
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
...
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
limitations under the License. */
limitations under the License. */
#include "lite/backends/opencl/cl_context.h"
#include "lite/backends/opencl/cl_context.h"
#include <algorithm>
#include <memory>
#include <memory>
#include <string>
#include <string>
#include <utility>
#include <utility>
...
@@ -35,8 +36,10 @@ cl::Program &CLContext::GetProgram(const std::string &file_name,
...
@@ -35,8 +36,10 @@ cl::Program &CLContext::GetProgram(const std::string &file_name,
STL
::
stringstream
program_key_ss
;
STL
::
stringstream
program_key_ss
;
program_key_ss
<<
file_name
<<
options
;
program_key_ss
<<
file_name
<<
options
;
std
::
string
program_key
=
program_key_ss
.
str
();
std
::
string
program_key
=
program_key_ss
.
str
();
auto
it
=
programs_
.
find
(
program_key
);
if
(
it
!=
programs_
.
end
())
{
auto
&
programs
=
CLRuntime
::
Global
()
->
programs
();
auto
it
=
programs
.
find
(
program_key
);
if
(
it
!=
programs
.
end
())
{
VLOG
(
3
)
<<
" --- program -> "
<<
program_key
<<
" has been built --- "
;
VLOG
(
3
)
<<
" --- program -> "
<<
program_key
<<
" has been built --- "
;
return
*
(
it
->
second
);
return
*
(
it
->
second
);
}
}
...
@@ -47,14 +50,15 @@ cl::Program &CLContext::GetProgram(const std::string &file_name,
...
@@ -47,14 +50,15 @@ cl::Program &CLContext::GetProgram(const std::string &file_name,
CLRuntime
::
Global
()
->
BuildProgram
(
program
.
get
(),
options
);
CLRuntime
::
Global
()
->
BuildProgram
(
program
.
get
(),
options
);
VLOG
(
3
)
<<
" --- end build program -> "
<<
program_key
<<
" --- "
;
VLOG
(
3
)
<<
" --- end build program -> "
<<
program_key
<<
" --- "
;
programs
_
[
program_key
]
=
std
::
move
(
program
);
programs
[
program_key
]
=
std
::
move
(
program
);
return
*
(
programs
_
[
program_key
]);
return
*
(
programs
[
program_key
]);
}
}
void
CLContext
::
AddKernel
(
const
std
::
string
&
kernel_name
,
void
CLContext
::
AddKernel
(
const
std
::
string
&
kernel_name
,
const
std
::
string
&
file_name
,
const
std
::
string
&
file_name
,
const
std
::
string
&
options
)
{
const
std
::
string
&
options
,
const
std
::
string
&
time_stamp
)
{
cl_int
status
{
CL_SUCCESS
};
cl_int
status
{
CL_SUCCESS
};
VLOG
(
3
)
<<
" --- to get program "
<<
file_name
<<
" --- "
;
VLOG
(
3
)
<<
" --- to get program "
<<
file_name
<<
" --- "
;
auto
program
=
GetProgram
(
file_name
,
options
);
auto
program
=
GetProgram
(
file_name
,
options
);
...
@@ -64,25 +68,30 @@ void CLContext::AddKernel(const std::string &kernel_name,
...
@@ -64,25 +68,30 @@ void CLContext::AddKernel(const std::string &kernel_name,
new
cl
::
Kernel
(
program
,
kernel_name
.
c_str
(),
&
status
));
new
cl
::
Kernel
(
program
,
kernel_name
.
c_str
(),
&
status
));
CL_CHECK_FATAL
(
status
);
CL_CHECK_FATAL
(
status
);
VLOG
(
3
)
<<
" --- end create kernel --- "
;
VLOG
(
3
)
<<
" --- end create kernel --- "
;
kernels_
.
emplace_back
(
std
::
move
(
kernel
));
auto
&
kernels
=
CLRuntime
::
Global
()
->
kernels
();
auto
&
kernel_offset_map
=
CLRuntime
::
Global
()
->
kernel_offset
();
kernels
.
emplace_back
(
std
::
move
(
kernel
));
STL
::
stringstream
kernel_key
;
STL
::
stringstream
kernel_key
;
kernel_key
<<
kernel_name
<<
options
;
kernel_key
<<
kernel_name
<<
options
<<
time_stamp
;
kernel_offset_
[
kernel_key
.
str
()]
=
kernels_
.
size
()
-
1
;
kernel_offset_
map
[
kernel_key
.
str
()]
=
kernels
.
size
()
-
1
;
}
}
cl
::
Kernel
&
CLContext
::
GetKernel
(
const
int
index
)
{
cl
::
Kernel
&
CLContext
::
GetKernel
(
const
int
index
)
{
VLOG
(
3
)
<<
" --- kernel count: "
<<
kernels_
.
size
()
<<
" --- "
;
auto
&
kernels
=
CLRuntime
::
Global
()
->
kernels
();
CHECK
(
static_cast
<
size_t
>
(
index
)
<
kernels_
.
size
())
VLOG
(
3
)
<<
" --- kernel count: "
<<
kernels
.
size
()
<<
" --- "
;
CHECK
(
static_cast
<
size_t
>
(
index
)
<
kernels
.
size
())
<<
"The index must be less than the size of kernels."
;
<<
"The index must be less than the size of kernels."
;
CHECK
(
kernels
_
[
index
]
!=
nullptr
)
CHECK
(
kernels
[
index
]
!=
nullptr
)
<<
"The target kernel pointer cannot be null."
;
<<
"The target kernel pointer cannot be null."
;
return
*
(
kernels
_
[
index
]);
return
*
(
kernels
[
index
]);
}
}
cl
::
Kernel
&
CLContext
::
GetKernel
(
const
std
::
string
&
name
)
{
cl
::
Kernel
&
CLContext
::
GetKernel
(
const
std
::
string
&
name
)
{
auto
it
=
kernel_offset_
.
find
(
name
);
auto
&
kernel_offset_map
=
CLRuntime
::
Global
()
->
kernel_offset
();
CHECK
(
it
!=
kernel_offset_
.
end
())
<<
"Cannot find the kernel function: "
auto
it
=
kernel_offset_map
.
find
(
name
);
<<
name
;
CHECK
(
it
!=
kernel_offset_map
.
end
())
<<
"Cannot find the kernel function: "
<<
name
;
return
GetKernel
(
it
->
second
);
return
GetKernel
(
it
->
second
);
}
}
...
@@ -121,14 +130,53 @@ cl::NDRange CLContext::DefaultWorkSize(const CLImage &image) {
...
@@ -121,14 +130,53 @@ cl::NDRange CLContext::DefaultWorkSize(const CLImage &image) {
}
}
}
}
cl
::
NDRange
CLContext
::
LocalWorkSizeTurn
(
cl
::
NDRange
global_work_size
,
size_t
max_work_size
,
int
divisor
)
{
int
preferred_lws
=
0
;
#if 1
auto
gws0
=
global_work_size
[
0
];
auto
gws1
=
global_work_size
[
1
];
auto
gws2
=
global_work_size
[
2
];
#else
auto
gws2
=
global_work_size
[
0
];
auto
gws1
=
global_work_size
[
1
];
auto
gws0
=
global_work_size
[
2
];
#endif
if
(
divisor
>
1
)
{
max_work_size
/=
divisor
;
}
if
(
preferred_lws
>
0
&&
preferred_lws
<=
max_work_size
)
{
max_work_size
=
preferred_lws
;
}
while
(
gws1
>
max_work_size
&&
max_work_size
>
0
)
{
gws1
=
gws1
%
2
==
0
?
gws1
/
2
:
1
;
}
while
(
gws2
*
gws1
>
max_work_size
&&
max_work_size
>
0
)
{
gws2
=
gws2
%
2
==
0
?
gws2
/
2
:
1
;
}
while
(
gws0
*
gws1
*
gws2
>
max_work_size
&&
max_work_size
>
0
)
{
gws0
=
gws0
%
2
==
0
?
gws0
/
2
:
1
;
}
#if 1
return
cl
::
NDRange
{
static_cast
<
size_t
>
(
gws0
),
static_cast
<
size_t
>
(
gws1
),
static_cast
<
size_t
>
(
gws2
)};
#else
return
cl
::
NDRange
{
static_cast
<
size_t
>
(
gws2
),
static_cast
<
size_t
>
(
gws1
),
static_cast
<
size_t
>
(
gws0
)};
#endif
}
cl
::
NDRange
CLContext
::
LocalWorkSize
(
cl
::
NDRange
global_work_size
,
cl
::
NDRange
CLContext
::
LocalWorkSize
(
cl
::
NDRange
global_work_size
,
size_t
max_work_size
)
{
size_t
max_work_size
)
{
int
preferred_lws
=
0
;
int
preferred_lws
=
0
;
int
divisor
=
2
;
int
divisor
=
2
;
auto
tmp
0
=
global_work_size
[
0
];
auto
gws
0
=
global_work_size
[
0
];
auto
tmp
1
=
global_work_size
[
1
];
auto
gws
1
=
global_work_size
[
1
];
auto
tmp
2
=
global_work_size
[
2
];
auto
gws
2
=
global_work_size
[
2
];
if
(
divisor
>
1
)
{
if
(
divisor
>
1
)
{
max_work_size
/=
divisor
;
max_work_size
/=
divisor
;
...
@@ -136,18 +184,18 @@ cl::NDRange CLContext::LocalWorkSize(cl::NDRange global_work_size,
...
@@ -136,18 +184,18 @@ cl::NDRange CLContext::LocalWorkSize(cl::NDRange global_work_size,
if
(
preferred_lws
>
0
&&
preferred_lws
<=
max_work_size
)
{
if
(
preferred_lws
>
0
&&
preferred_lws
<=
max_work_size
)
{
max_work_size
=
preferred_lws
;
max_work_size
=
preferred_lws
;
}
}
while
(
tmp
1
>
max_work_size
&&
max_work_size
>
0
)
{
while
(
gws
1
>
max_work_size
&&
max_work_size
>
0
)
{
tmp1
=
tmp1
%
2
==
0
?
tmp
1
/
2
:
1
;
gws1
=
gws1
%
2
==
0
?
gws
1
/
2
:
1
;
}
}
while
(
tmp2
*
tmp
1
>
max_work_size
&&
max_work_size
>
0
)
{
while
(
gws2
*
gws
1
>
max_work_size
&&
max_work_size
>
0
)
{
tmp2
=
tmp2
%
2
==
0
?
tmp
2
/
2
:
1
;
gws2
=
gws2
%
2
==
0
?
gws
2
/
2
:
1
;
}
}
while
(
tmp0
*
tmp1
*
tmp
2
>
max_work_size
&&
max_work_size
>
0
)
{
while
(
gws0
*
gws1
*
gws
2
>
max_work_size
&&
max_work_size
>
0
)
{
tmp0
=
tmp0
%
2
==
0
?
tmp
0
/
2
:
1
;
gws0
=
gws0
%
2
==
0
?
gws
0
/
2
:
1
;
}
}
return
cl
::
NDRange
{
static_cast
<
size_t
>
(
tmp
0
),
return
cl
::
NDRange
{
static_cast
<
size_t
>
(
gws
0
),
static_cast
<
size_t
>
(
tmp
1
),
static_cast
<
size_t
>
(
gws
1
),
static_cast
<
size_t
>
(
tmp
2
)};
static_cast
<
size_t
>
(
gws
2
)};
}
}
}
// namespace lite
}
// namespace lite
...
...
lite/backends/opencl/cl_context.h
浏览文件 @
917620e8
...
@@ -36,7 +36,8 @@ class CLContext {
...
@@ -36,7 +36,8 @@ class CLContext {
void
AddKernel
(
const
std
::
string
&
kernel_name
,
void
AddKernel
(
const
std
::
string
&
kernel_name
,
const
std
::
string
&
file_name
,
const
std
::
string
&
file_name
,
const
std
::
string
&
options
=
""
);
const
std
::
string
&
options
=
""
,
const
std
::
string
&
time_stamp
=
""
);
cl
::
Kernel
&
GetKernel
(
const
int
index
);
cl
::
Kernel
&
GetKernel
(
const
int
index
);
...
@@ -46,10 +47,11 @@ class CLContext {
...
@@ -46,10 +47,11 @@ class CLContext {
cl
::
NDRange
LocalWorkSize
(
cl
::
NDRange
global_work_size
,
size_t
max_work_size
);
cl
::
NDRange
LocalWorkSize
(
cl
::
NDRange
global_work_size
,
size_t
max_work_size
);
private:
cl
::
NDRange
LocalWorkSizeTurn
(
cl
::
NDRange
global_work_size
,
std
::
unordered_map
<
std
::
string
,
std
::
unique_ptr
<
cl
::
Program
>>
programs_
;
size_t
max_work_size
,
std
::
vector
<
std
::
unique_ptr
<
cl
::
Kernel
>>
kernels_
;
int
divitor
=
2
);
std
::
map
<
std
::
string
,
int
>
kernel_offset_
;
// cl::NDRange LocalWorkSizeConv1x1(cl::NDRange global_work_size,
// size_t max_work_size);
};
};
}
// namespace lite
}
// namespace lite
...
...
lite/backends/opencl/cl_runtime.cc
浏览文件 @
917620e8
...
@@ -14,6 +14,7 @@ limitations under the License. */
...
@@ -14,6 +14,7 @@ limitations under the License. */
#include "lite/backends/opencl/cl_runtime.h"
#include "lite/backends/opencl/cl_runtime.h"
#include <string>
#include <string>
#include <unordered_map>
#include <utility>
#include <utility>
#include <vector>
#include <vector>
#include "lite/utils/cp_logging.h"
#include "lite/utils/cp_logging.h"
...
@@ -29,10 +30,26 @@ CLRuntime* CLRuntime::Global() {
...
@@ -29,10 +30,26 @@ CLRuntime* CLRuntime::Global() {
CLRuntime
::~
CLRuntime
()
{
CLRuntime
::~
CLRuntime
()
{
if
(
command_queue_
!=
nullptr
)
{
if
(
command_queue_
!=
nullptr
)
{
command_queue_
->
flush
();
command_queue_
->
finish
();
command_queue_
->
finish
();
}
}
// For controlling the destruction order:
for
(
size_t
kidx
=
0
;
kidx
<
kernels_
.
size
();
++
kidx
)
{
clReleaseKernel
(
kernels_
[
kidx
]
->
get
());
kernels_
[
kidx
].
reset
();
}
kernels_
.
clear
();
kernel_offset_
.
clear
();
for
(
auto
&
p
:
programs_
)
{
clReleaseProgram
(
p
.
second
->
get
());
}
programs_
.
clear
();
// For controlling the destruction order
command_queue_
&&
clReleaseCommandQueue
(
command_queue_
->
get
());
command_queue_
.
reset
();
command_queue_
.
reset
();
context_
&&
clReleaseContext
(
context_
->
get
());
context_
.
reset
();
context_
.
reset
();
device_
.
reset
();
device_
.
reset
();
platform_
.
reset
();
platform_
.
reset
();
...
@@ -73,14 +90,14 @@ cl::CommandQueue& CLRuntime::command_queue() {
...
@@ -73,14 +90,14 @@ cl::CommandQueue& CLRuntime::command_queue() {
return
*
command_queue_
;
return
*
command_queue_
;
}
}
std
::
unique
_ptr
<
cl
::
Program
>
CLRuntime
::
CreateProgram
(
std
::
shared
_ptr
<
cl
::
Program
>
CLRuntime
::
CreateProgram
(
const
cl
::
Context
&
context
,
std
::
string
file_name
)
{
const
cl
::
Context
&
context
,
std
::
string
file_name
)
{
auto
cl_file
=
opencl_kernels_files
.
find
(
file_name
);
auto
cl_file
=
opencl_kernels_files
.
find
(
file_name
);
std
::
string
content
(
cl_file
->
second
.
begin
(),
cl_file
->
second
.
end
());
std
::
string
content
(
cl_file
->
second
.
begin
(),
cl_file
->
second
.
end
());
cl
::
Program
::
Sources
sources
;
cl
::
Program
::
Sources
sources
;
sources
.
push_back
(
content
);
sources
.
push_back
(
content
);
auto
prog
=
auto
prog
=
std
::
unique
_ptr
<
cl
::
Program
>
(
new
cl
::
Program
(
context
,
sources
,
&
status_
));
std
::
shared
_ptr
<
cl
::
Program
>
(
new
cl
::
Program
(
context
,
sources
,
&
status_
));
VLOG
(
4
)
<<
"OpenCL kernel file name: "
<<
file_name
;
VLOG
(
4
)
<<
"OpenCL kernel file name: "
<<
file_name
;
VLOG
(
4
)
<<
"Program source size: "
<<
content
.
size
();
VLOG
(
4
)
<<
"Program source size: "
<<
content
.
size
();
CL_CHECK_FATAL
(
status_
);
CL_CHECK_FATAL
(
status_
);
...
...
lite/backends/opencl/cl_runtime.h
浏览文件 @
917620e8
...
@@ -18,6 +18,7 @@ limitations under the License. */
...
@@ -18,6 +18,7 @@ limitations under the License. */
#include <map>
#include <map>
#include <memory>
#include <memory>
#include <string>
#include <string>
#include <unordered_map>
#include <vector>
#include <vector>
#include "lite/backends/opencl/cl_include.h"
#include "lite/backends/opencl/cl_include.h"
#include "lite/backends/opencl/cl_utility.h"
#include "lite/backends/opencl/cl_utility.h"
...
@@ -42,7 +43,7 @@ class CLRuntime {
...
@@ -42,7 +43,7 @@ class CLRuntime {
cl
::
CommandQueue
&
command_queue
();
cl
::
CommandQueue
&
command_queue
();
std
::
unique
_ptr
<
cl
::
Program
>
CreateProgram
(
const
cl
::
Context
&
context
,
std
::
shared
_ptr
<
cl
::
Program
>
CreateProgram
(
const
cl
::
Context
&
context
,
std
::
string
file_name
);
std
::
string
file_name
);
std
::
unique_ptr
<
cl
::
UserEvent
>
CreateEvent
(
const
cl
::
Context
&
context
);
std
::
unique_ptr
<
cl
::
UserEvent
>
CreateEvent
(
const
cl
::
Context
&
context
);
...
@@ -57,6 +58,12 @@ class CLRuntime {
...
@@ -57,6 +58,12 @@ class CLRuntime {
std
::
map
<
std
::
string
,
size_t
>&
GetDeviceInfo
();
std
::
map
<
std
::
string
,
size_t
>&
GetDeviceInfo
();
std
::
unordered_map
<
std
::
string
,
std
::
shared_ptr
<
cl
::
Program
>>&
programs
()
{
return
programs_
;
}
std
::
vector
<
std
::
unique_ptr
<
cl
::
Kernel
>>&
kernels
()
{
return
kernels_
;
}
std
::
map
<
std
::
string
,
int
>&
kernel_offset
()
{
return
kernel_offset_
;
}
private:
private:
CLRuntime
()
=
default
;
CLRuntime
()
=
default
;
...
@@ -98,6 +105,12 @@ class CLRuntime {
...
@@ -98,6 +105,12 @@ class CLRuntime {
std
::
shared_ptr
<
cl
::
CommandQueue
>
command_queue_
{
nullptr
};
std
::
shared_ptr
<
cl
::
CommandQueue
>
command_queue_
{
nullptr
};
std
::
unordered_map
<
std
::
string
,
std
::
shared_ptr
<
cl
::
Program
>>
programs_
{};
std
::
vector
<
std
::
unique_ptr
<
cl
::
Kernel
>>
kernels_
{};
std
::
map
<
std
::
string
,
int
>
kernel_offset_
{};
cl_int
status_
{
CL_SUCCESS
};
cl_int
status_
{
CL_SUCCESS
};
bool
initialized_
{
false
};
bool
initialized_
{
false
};
...
...
lite/backends/opencl/cl_utility.h
浏览文件 @
917620e8
...
@@ -32,7 +32,7 @@ const char* opencl_error_to_str(cl_int error);
...
@@ -32,7 +32,7 @@ const char* opencl_error_to_str(cl_int error);
__FILE__, \
__FILE__, \
__LINE__); \
__LINE__); \
}
}
#ifndef LITE_SHUTDOWN_LOG
#define CL_CHECK_FATAL(err_code__) \
#define CL_CHECK_FATAL(err_code__) \
if (err_code__ != CL_SUCCESS) { \
if (err_code__ != CL_SUCCESS) { \
LOG(FATAL) << string_format( \
LOG(FATAL) << string_format( \
...
@@ -42,5 +42,8 @@ const char* opencl_error_to_str(cl_int error);
...
@@ -42,5 +42,8 @@ const char* opencl_error_to_str(cl_int error);
__FILE__, \
__FILE__, \
__LINE__); \
__LINE__); \
}
}
#else
#define CL_CHECK_FATAL(err_code__)
#endif
}
// namespace lite
}
// namespace lite
}
// namespace paddle
}
// namespace paddle
lite/core/mir/fusion/conv_bn_fuse_pass.cc
浏览文件 @
917620e8
...
@@ -26,7 +26,8 @@ namespace mir {
...
@@ -26,7 +26,8 @@ namespace mir {
void
ConvBNFusePass
::
Apply
(
const
std
::
unique_ptr
<
SSAGraph
>&
graph
)
{
void
ConvBNFusePass
::
Apply
(
const
std
::
unique_ptr
<
SSAGraph
>&
graph
)
{
// initialze fuser params
// initialze fuser params
std
::
vector
<
bool
>
conv_has_bias_cases
{
true
,
false
};
std
::
vector
<
bool
>
conv_has_bias_cases
{
true
,
false
};
std
::
vector
<
std
::
string
>
conv_type_cases
{
"conv2d"
,
"depthwise_conv2d"
};
std
::
vector
<
std
::
string
>
conv_type_cases
{
"conv2d"
,
"depthwise_conv2d"
,
"conv2d_transpose"
};
// start fuse using params
// start fuse using params
for
(
auto
conv_has_bias
:
conv_has_bias_cases
)
{
for
(
auto
conv_has_bias
:
conv_has_bias_cases
)
{
for
(
auto
conv_type
:
conv_type_cases
)
{
for
(
auto
conv_type
:
conv_type_cases
)
{
...
...
lite/core/mir/fusion/conv_bn_fuser.cc
浏览文件 @
917620e8
...
@@ -103,10 +103,17 @@ void ConvBNFuser::InsertNewNode(SSAGraph* graph, const key2nodes_t& matched) {
...
@@ -103,10 +103,17 @@ void ConvBNFuser::InsertNewNode(SSAGraph* graph, const key2nodes_t& matched) {
std
::
string
conv_weight_name
=
matched
.
at
(
"conv_weight"
)
->
arg
()
->
name
;
std
::
string
conv_weight_name
=
matched
.
at
(
"conv_weight"
)
->
arg
()
->
name
;
auto
conv_weight_t
=
auto
conv_weight_t
=
scope
->
FindVar
(
conv_weight_name
)
->
GetMutable
<
lite
::
Tensor
>
();
scope
->
FindVar
(
conv_weight_name
)
->
GetMutable
<
lite
::
Tensor
>
();
CHECK_EQ
(
static_cast
<
size_t
>
(
bn_scale_t
->
data_size
()),
if
(
conv_type_
==
"conv2d_transpose"
)
{
static_cast
<
size_t
>
(
conv_weight_t
->
dims
()[
0
]))
CHECK_EQ
(
static_cast
<
size_t
>
(
bn_scale_t
->
data_size
()),
<<
"The BN bias's size should be equal to the size of the first "
static_cast
<
size_t
>
(
conv_weight_t
->
dims
()[
1
]))
<<
"dim size of the conv weights"
;
<<
"The BN bias's size should be equal to the size of the first "
<<
"dim size of the conv weights"
;
}
else
{
CHECK_EQ
(
static_cast
<
size_t
>
(
bn_scale_t
->
data_size
()),
static_cast
<
size_t
>
(
conv_weight_t
->
dims
()[
0
]))
<<
"The BN bias's size should be equal to the size of the first "
<<
"dim size of the conv weights"
;
}
size_t
weight_num
=
conv_weight_t
->
data_size
();
size_t
weight_num
=
conv_weight_t
->
data_size
();
bool
enable_int8
=
conv_op_desc
->
HasAttr
(
"enable_int8"
)
?
true
:
false
;
bool
enable_int8
=
conv_op_desc
->
HasAttr
(
"enable_int8"
)
?
true
:
false
;
bool
is_weight_quantization
=
bool
is_weight_quantization
=
...
@@ -153,12 +160,29 @@ void ConvBNFuser::InsertNewNode(SSAGraph* graph, const key2nodes_t& matched) {
...
@@ -153,12 +160,29 @@ void ConvBNFuser::InsertNewNode(SSAGraph* graph, const key2nodes_t& matched) {
// compute new conv_weight for int8
// compute new conv_weight for int8
auto
weight_scale
=
auto
weight_scale
=
conv_op_desc
->
GetAttr
<
std
::
vector
<
float
>>
(
"weight_scale"
);
conv_op_desc
->
GetAttr
<
std
::
vector
<
float
>>
(
"weight_scale"
);
for
(
unsigned
int
i
=
0
;
i
<
h
;
++
i
)
{
if
(
conv_type_
==
"conv2d_transpose"
)
{
weight_scale
[
i
]
*=
fabsf
(
alpha_data
[
i
]);
int
c_size
=
conv_weight_t
->
dims
()[
1
]
*
conv_weight_t
->
dims
()[
2
]
*
if
(
alpha_data
[
i
]
<
0.
f
)
{
conv_weight_t
->
dims
()[
3
];
auto
ptr_row
=
conv_weight_d
+
i
*
w
;
int
hw
=
conv_weight_t
->
dims
()[
2
]
*
conv_weight_t
->
dims
()[
3
];
for
(
unsigned
int
j
=
0
;
j
<
w
;
++
j
)
{
for
(
unsigned
int
k
=
0
;
k
<
conv_weight_t
->
dims
()[
0
];
++
k
)
{
ptr_row
[
j
]
*=
-
1
;
for
(
unsigned
int
i
=
0
;
i
<
h
;
++
i
)
{
weight_scale
[
i
]
*=
fabsf
(
alpha_data
[
i
]);
if
(
alpha_data
[
i
]
<
0.
f
)
{
auto
ptr_row
=
conv_weight_d
+
k
*
c_size
+
i
*
hw
;
for
(
unsigned
int
j
=
0
;
j
<
hw
;
++
j
)
{
ptr_row
[
j
]
*=
-
1
;
}
}
}
}
}
else
{
for
(
unsigned
int
i
=
0
;
i
<
h
;
++
i
)
{
weight_scale
[
i
]
*=
fabsf
(
alpha_data
[
i
]);
if
(
alpha_data
[
i
]
<
0.
f
)
{
auto
ptr_row
=
conv_weight_d
+
i
*
w
;
for
(
unsigned
int
j
=
0
;
j
<
w
;
++
j
)
{
ptr_row
[
j
]
*=
-
1
;
}
}
}
}
}
}
}
...
@@ -176,9 +200,23 @@ void ConvBNFuser::InsertNewNode(SSAGraph* graph, const key2nodes_t& matched) {
...
@@ -176,9 +200,23 @@ void ConvBNFuser::InsertNewNode(SSAGraph* graph, const key2nodes_t& matched) {
}
else
{
}
else
{
// compute new conv_weight
// compute new conv_weight
auto
conv_weight_d
=
conv_weight_t
->
mutable_data
<
float
>
();
auto
conv_weight_d
=
conv_weight_t
->
mutable_data
<
float
>
();
for
(
unsigned
int
i
=
0
;
i
<
h
;
++
i
)
{
// n: conv2d output channels
if
(
conv_type_
==
"conv2d_transpose"
)
{
for
(
unsigned
int
j
=
0
;
j
<
w
;
++
j
)
{
// w: conv2d input channels
int
c_size
=
conv_weight_t
->
dims
()[
1
]
*
conv_weight_t
->
dims
()[
2
]
*
conv_weight_d
[
i
*
w
+
j
]
*=
alpha_data
[
i
];
conv_weight_t
->
dims
()[
3
];
int
hw
=
conv_weight_t
->
dims
()[
2
]
*
conv_weight_t
->
dims
()[
3
];
for
(
unsigned
int
k
=
0
;
k
<
conv_weight_t
->
dims
()[
0
];
++
k
)
{
for
(
unsigned
int
i
=
0
;
i
<
h
;
++
i
)
{
auto
ptr_row
=
conv_weight_d
+
k
*
c_size
+
i
*
hw
;
for
(
unsigned
int
j
=
0
;
j
<
hw
;
++
j
)
{
ptr_row
[
j
]
*=
alpha_data
[
i
];
}
}
}
}
else
{
for
(
unsigned
int
i
=
0
;
i
<
h
;
++
i
)
{
// n: conv2d output channels
for
(
unsigned
int
j
=
0
;
j
<
w
;
++
j
)
{
// w: conv2d input channels
conv_weight_d
[
i
*
w
+
j
]
*=
alpha_data
[
i
];
}
}
}
}
}
}
}
...
...
lite/core/mir/fusion/quant_dequant_fuse_pass.cc
浏览文件 @
917620e8
...
@@ -44,11 +44,9 @@ void QuantDequantFusePass::Apply(const std::unique_ptr<SSAGraph>& graph) {
...
@@ -44,11 +44,9 @@ void QuantDequantFusePass::Apply(const std::unique_ptr<SSAGraph>& graph) {
fuser
(
graph
.
get
());
fuser
(
graph
.
get
());
}
}
// delete quant_dequant_node
// process quant_dequant_node
for
(
auto
op_type
:
{
"pool2d"
,
"softmax"
,
"elementwise_add"
})
{
fusion
::
DeleteQuantDequantOpFuser
dqd_fuser
;
fusion
::
DeleteQuantDequantOpFuser
fuser
(
op_type
);
dqd_fuser
(
graph
.
get
());
fuser
(
graph
.
get
());
}
}
}
}
// namespace mir
}
// namespace mir
...
...
lite/core/mir/fusion/quant_dequant_op_fuser.cc
浏览文件 @
917620e8
...
@@ -50,7 +50,7 @@ void DeleteQuantOpFuser::InsertNewNode(SSAGraph* graph,
...
@@ -50,7 +50,7 @@ void DeleteQuantOpFuser::InsertNewNode(SSAGraph* graph,
auto
*
output_scale_node
=
matched
.
at
(
"output_scale_node"
);
auto
*
output_scale_node
=
matched
.
at
(
"output_scale_node"
);
auto
*
output_act_node
=
matched
.
at
(
"output_act_node"
);
auto
*
output_act_node
=
matched
.
at
(
"output_act_node"
);
// obtain
values, save value
s and relink node
// obtain
scale, save attr
s and relink node
int
bit_length
=
quant_node
->
stmt
()
->
op_info
()
->
GetAttr
<
int
>
(
"bit_length"
);
int
bit_length
=
quant_node
->
stmt
()
->
op_info
()
->
GetAttr
<
int
>
(
"bit_length"
);
int
range
=
((
1
<<
(
bit_length
-
1
))
-
1
);
int
range
=
((
1
<<
(
bit_length
-
1
))
-
1
);
auto
*
scope
=
quant_node
->
stmt
()
->
op
()
->
scope
();
auto
*
scope
=
quant_node
->
stmt
()
->
op
()
->
scope
();
...
@@ -58,11 +58,22 @@ void DeleteQuantOpFuser::InsertNewNode(SSAGraph* graph,
...
@@ -58,11 +58,22 @@ void DeleteQuantOpFuser::InsertNewNode(SSAGraph* graph,
->
GetMutable
<
lite
::
Tensor
>
();
->
GetMutable
<
lite
::
Tensor
>
();
float
scale_value
=
scale_tensor
->
data
<
float
>
()[
0
]
/
range
;
float
scale_value
=
scale_tensor
->
data
<
float
>
()[
0
]
/
range
;
auto
in_act_name
=
input_act_node
->
arg
()
->
name
;
auto
out_act_name
=
output_act_node
->
arg
()
->
name
;
auto
outlinks
=
output_act_node
->
outlinks
;
auto
outlinks
=
output_act_node
->
outlinks
;
for
(
auto
*
quantized_node
:
outlinks
)
{
for
(
auto
*
quantized_node
:
outlinks
)
{
auto
*
op_desc
=
quantized_node
->
stmt
()
->
mutable_op_info
();
// save input scale in quantized op by input argname + index
op_desc
->
SetAttr
<
int
>
(
"bit_length"
,
bit_length
);
auto
op_desc
=
*
quantized_node
->
stmt
()
->
mutable_op_info
();
op_desc
->
SetAttr
<
float
>
(
"input_scale"
,
scale_value
);
std
::
string
argname
;
int
index
;
op_desc
.
GetInputArgname
(
out_act_name
,
&
argname
);
op_desc
.
GetInputIndex
(
out_act_name
,
&
index
);
op_desc
.
SetAttr
<
float
>
(
argname
+
std
::
to_string
(
index
)
+
"_input_scale"
,
scale_value
);
op_desc
.
SetAttr
<
float
>
(
"input_scale"
,
scale_value
);
// save it for now
op_desc
.
SetAttr
<
int
>
(
"bit_length"
,
bit_length
);
op_desc
.
UpdateAllInputs
(
out_act_name
,
in_act_name
);
quantized_node
->
stmt
()
->
ResetOp
(
op_desc
,
graph
->
valid_places
());
IR_NODE_LINK_TO
(
input_act_node
,
quantized_node
)
IR_NODE_LINK_TO
(
input_act_node
,
quantized_node
)
}
}
...
@@ -125,19 +136,18 @@ void DequantOpFuser::InsertNewNode(SSAGraph* graph,
...
@@ -125,19 +136,18 @@ void DequantOpFuser::InsertNewNode(SSAGraph* graph,
auto
*
dequant_op
=
matched
.
at
(
"dequant_op"
);
auto
*
dequant_op
=
matched
.
at
(
"dequant_op"
);
auto
*
dequant_op_out
=
matched
.
at
(
"dequant_op_out"
);
auto
*
dequant_op_out
=
matched
.
at
(
"dequant_op_out"
);
// obtain
input_scale and weight_scal
e
// obtain
weight_scale from max_rang
e
auto
*
scope
=
quantized_op
->
stmt
()
->
op
()
->
scope
();
auto
*
scope
=
quantized_op
->
stmt
()
->
op
()
->
scope
();
auto
&
valid_places
=
quantized_op
->
stmt
()
->
op
()
->
valid_places
();
auto
&
valid_places
=
quantized_op
->
stmt
()
->
op
()
->
valid_places
();
int
bit_length
=
quantized_op
->
stmt
()
->
op_info
()
->
GetAttr
<
int
>
(
"bit_length"
);
int
bit_length
=
quantized_op
->
stmt
()
->
op_info
()
->
GetAttr
<
int
>
(
"bit_length"
);
int
range
=
((
1
<<
(
bit_length
-
1
))
-
1
);
int
range
=
((
1
<<
(
bit_length
-
1
))
-
1
);
float
input_scale
=
quantized_op
->
stmt
()
->
op_info
()
->
GetAttr
<
float
>
(
"input_scale"
);
float
max_range
=
dequant_op
->
stmt
()
->
op_info
()
->
GetAttr
<
float
>
(
"max_range"
);
float
max_range
=
dequant_op
->
stmt
()
->
op_info
()
->
GetAttr
<
float
>
(
"max_range"
);
float
whole_weight_scale
=
float
whole_weight_scale
=
static_cast
<
float
>
(
range
*
range
)
/
max_range
/
range
;
static_cast
<
float
>
(
range
*
range
)
/
max_range
/
range
;
// max_range = range * range / max(abs(weight))
// As: max_range = range * range / max(abs(weight))
// weight_scale = range * range / (range * range / max(abs(weight))) / range
// So: whole_weight_scale
// = max(abs(weight)) / range
// = range * range / (range * range / max(abs(weight))) / range
// = max(abs(weight)) / range
// set op desc
// set op desc
cpp
::
OpDesc
op_desc
=
*
quantized_op
->
stmt
()
->
op_info
();
cpp
::
OpDesc
op_desc
=
*
quantized_op
->
stmt
()
->
op_info
();
...
@@ -153,7 +163,7 @@ void DequantOpFuser::InsertNewNode(SSAGraph* graph,
...
@@ -153,7 +163,7 @@ void DequantOpFuser::InsertNewNode(SSAGraph* graph,
// Conv weight shape: Cout * Cin * kh * hw, the weight_scale_size should
// Conv weight shape: Cout * Cin * kh * hw, the weight_scale_size should
// be Cout.
// be Cout.
weight_scale_size
=
quantized_weight_t
->
dims
()[
0
];
weight_scale_size
=
quantized_weight_t
->
dims
()[
0
];
}
else
if
(
quantized_op_type_
==
"mul"
)
{
}
else
if
(
quantized_op_type_
==
"mul"
||
quantized_op_type_
==
"matmul"
)
{
op_desc
.
SetInput
(
"X"
,
{
quantized_op_input
->
arg
()
->
name
});
op_desc
.
SetInput
(
"X"
,
{
quantized_op_input
->
arg
()
->
name
});
op_desc
.
SetOutput
(
"Out"
,
{
dequant_op_out
->
arg
()
->
name
});
op_desc
.
SetOutput
(
"Out"
,
{
dequant_op_out
->
arg
()
->
name
});
// Fc weight: Cin * Cout, the weight_scale_size should be Cout.
// Fc weight: Cin * Cout, the weight_scale_size should be Cout.
...
@@ -163,7 +173,6 @@ void DequantOpFuser::InsertNewNode(SSAGraph* graph,
...
@@ -163,7 +173,6 @@ void DequantOpFuser::InsertNewNode(SSAGraph* graph,
weight_scale
.
push_back
(
whole_weight_scale
);
weight_scale
.
push_back
(
whole_weight_scale
);
}
}
op_desc
.
SetAttr
(
"enable_int8"
,
true
);
op_desc
.
SetAttr
(
"enable_int8"
,
true
);
op_desc
.
SetAttr
(
"input_scale"
,
input_scale
);
op_desc
.
SetAttr
(
"weight_scale"
,
weight_scale
);
op_desc
.
SetAttr
(
"weight_scale"
,
weight_scale
);
// change the weight from the float type to int8 type.
// change the weight from the float type to int8 type.
...
@@ -209,6 +218,7 @@ void ChannelWiseDequantOpFuser::BuildPattern() {
...
@@ -209,6 +218,7 @@ void ChannelWiseDequantOpFuser::BuildPattern() {
->
assert_is_op_output
(
quantized_op_type_
)
->
assert_is_op_output
(
quantized_op_type_
)
->
assert_is_op_input
(
dequant_op_type
,
"X"
)
->
assert_is_op_input
(
dequant_op_type
,
"X"
)
->
AsIntermediate
();
->
AsIntermediate
();
// The scale var_node of input activation is deleted in DeleteQuantOpFuser
auto
*
dequant_op_channel_scale
=
VarNode
(
"dequant_op_channel_scale"
)
auto
*
dequant_op_channel_scale
=
VarNode
(
"dequant_op_channel_scale"
)
->
assert_is_op_input
(
dequant_op_type
)
->
assert_is_op_input
(
dequant_op_type
)
->
AsIntermediate
();
->
AsIntermediate
();
...
@@ -237,11 +247,9 @@ void ChannelWiseDequantOpFuser::InsertNewNode(SSAGraph* graph,
...
@@ -237,11 +247,9 @@ void ChannelWiseDequantOpFuser::InsertNewNode(SSAGraph* graph,
auto
*
dequant_op
=
matched
.
at
(
"dequant_op"
);
auto
*
dequant_op
=
matched
.
at
(
"dequant_op"
);
auto
*
dequant_op_out
=
matched
.
at
(
"dequant_op_out"
);
auto
*
dequant_op_out
=
matched
.
at
(
"dequant_op_out"
);
// obtain input
_scale and weight_scale
// obtain input
weight_scale from fake_dequant op
auto
*
scope
=
quantized_op
->
stmt
()
->
op
()
->
scope
();
auto
*
scope
=
quantized_op
->
stmt
()
->
op
()
->
scope
();
auto
&
valid_places
=
quantized_op
->
stmt
()
->
op
()
->
valid_places
();
auto
&
valid_places
=
quantized_op
->
stmt
()
->
op
()
->
valid_places
();
float
input_scale
=
quantized_op
->
stmt
()
->
op_info
()
->
GetAttr
<
float
>
(
"input_scale"
);
std
::
vector
<
float
>
weight_scale
;
std
::
vector
<
float
>
weight_scale
;
std
::
vector
<
int
>
quant_bits
=
std
::
vector
<
int
>
quant_bits
=
...
@@ -258,11 +266,15 @@ void ChannelWiseDequantOpFuser::InsertNewNode(SSAGraph* graph,
...
@@ -258,11 +266,15 @@ void ChannelWiseDequantOpFuser::InsertNewNode(SSAGraph* graph,
// set op desc
// set op desc
cpp
::
OpDesc
op_desc
=
*
quantized_op
->
stmt
()
->
op_info
();
cpp
::
OpDesc
op_desc
=
*
quantized_op
->
stmt
()
->
op_info
();
op_desc
.
SetInput
(
"Input"
,
{
quantized_op_input
->
arg
()
->
name
});
if
(
quantized_op_type_
==
"conv2d"
||
op_desc
.
SetOutput
(
"Output"
,
{
dequant_op_out
->
arg
()
->
name
});
quantized_op_type_
==
"depthwise_conv2d"
)
{
op_desc
.
SetInput
(
"Input"
,
{
quantized_op_input
->
arg
()
->
name
});
op_desc
.
SetOutput
(
"Output"
,
{
dequant_op_out
->
arg
()
->
name
});
}
else
if
(
quantized_op_type_
==
"mul"
||
quantized_op_type_
==
"matmul"
)
{
op_desc
.
SetInput
(
"X"
,
{
quantized_op_input
->
arg
()
->
name
});
op_desc
.
SetOutput
(
"Out"
,
{
dequant_op_out
->
arg
()
->
name
});
}
op_desc
.
SetAttr
(
"enable_int8"
,
true
);
op_desc
.
SetAttr
(
"enable_int8"
,
true
);
op_desc
.
SetAttr
(
"input_scale"
,
input_scale
);
op_desc
.
SetAttr
(
"weight_scale"
,
weight_scale
);
op_desc
.
SetAttr
(
"weight_scale"
,
weight_scale
);
// change the weight from the float type to int8 type.
// change the weight from the float type to int8 type.
...
@@ -297,167 +309,65 @@ cpp::OpDesc ChannelWiseDequantOpFuser::GenOpDesc(const key2nodes_t& matched) {
...
@@ -297,167 +309,65 @@ cpp::OpDesc ChannelWiseDequantOpFuser::GenOpDesc(const key2nodes_t& matched) {
void
DeleteQuantDequantOpFuser
::
BuildPattern
()
{
void
DeleteQuantDequantOpFuser
::
BuildPattern
()
{
std
::
string
quant_dequant_op_type
=
std
::
string
quant_dequant_op_type
=
"fake_quantize_dequantize_moving_average_abs_max"
;
"fake_quantize_dequantize_moving_average_abs_max"
;
if
(
quantized_op_type_
==
"pool2d"
||
quantized_op_type_
==
"softmax"
)
{
auto
*
input_scale_node
=
auto
*
input_scale_node
=
VarNode
(
"input_scale_node"
)
VarNode
(
"input_scale_node"
)
->
assert_is_op_input
(
quant_dequant_op_type
,
"InScale"
);
->
assert_is_op_input
(
quant_dequant_op_type
,
"InScale"
);
auto
*
input_act_node
=
auto
*
input_act_node
=
VarNode
(
"input_act_node"
)
VarNode
(
"input_act_node"
)
->
assert_is_op_input
(
quant_dequant_op_type
,
"X"
);
->
assert_is_op_input
(
quant_dequant_op_type
,
"X"
);
auto
*
quant_dequant_node
=
OpNode
(
"quant_dequant_node"
,
quant_dequant_op_type
)
auto
*
quant_dequant_node
=
->
assert_is_op
(
quant_dequant_op_type
);
OpNode
(
"quant_dequant_node"
,
quant_dequant_op_type
)
auto
*
output_scale_node
=
->
assert_is_op
(
quant_dequant_op_type
);
VarNode
(
"output_scale_node"
)
auto
*
output_scale_node
=
->
assert_is_op_output
(
quant_dequant_op_type
,
"OutScale"
);
VarNode
(
"output_scale_node"
)
auto
*
output_act_node
=
->
assert_is_op_output
(
quant_dequant_op_type
,
"OutScale"
);
VarNode
(
"output_act_node"
)
auto
*
output_act_node
=
->
assert_is_op_output
(
quant_dequant_op_type
,
"Out"
);
VarNode
(
"output_act_node"
)
->
assert_is_op_output
(
quant_dequant_op_type
,
"Out"
);
quant_dequant_node
->
LinksFrom
({
input_scale_node
,
input_act_node
});
auto
*
quantized_node
=
OpNode
(
"quantized_node"
,
quantized_op_type_
)
output_scale_node
->
LinksFrom
({
quant_dequant_node
});
->
assert_is_op
(
quantized_op_type_
);
output_act_node
->
LinksFrom
({
quant_dequant_node
});
quant_dequant_node
->
LinksFrom
({
input_scale_node
,
input_act_node
});
output_scale_node
->
LinksFrom
({
quant_dequant_node
});
output_act_node
->
LinksFrom
({
quant_dequant_node
});
quantized_node
->
LinksFrom
({
output_act_node
});
}
else
if
(
quantized_op_type_
==
"elementwise_add"
)
{
auto
*
input_scale_left_node
=
VarNode
(
"input_scale_left_node"
)
->
assert_is_op_input
(
quant_dequant_op_type
,
"InScale"
);
auto
*
input_act_left_node
=
VarNode
(
"input_act_left_node"
)
->
assert_is_op_input
(
quant_dequant_op_type
,
"X"
);
auto
*
quant_dequant_left_node
=
OpNode
(
"quant_dequant_left_node"
,
quant_dequant_op_type
)
->
assert_is_op
(
quant_dequant_op_type
);
auto
*
output_scale_left_node
=
VarNode
(
"output_scale_left_node"
)
->
assert_is_op_output
(
quant_dequant_op_type
,
"OutScale"
);
auto
*
output_act_left_node
=
VarNode
(
"output_act_left_node"
)
->
assert_is_op_output
(
quant_dequant_op_type
,
"Out"
)
->
assert_is_op_input
(
quantized_op_type_
,
"X"
);
quant_dequant_left_node
->
LinksFrom
(
{
input_scale_left_node
,
input_act_left_node
});
output_scale_left_node
->
LinksFrom
({
quant_dequant_left_node
});
output_act_left_node
->
LinksFrom
({
quant_dequant_left_node
});
auto
*
input_scale_right_node
=
VarNode
(
"input_scale_right_node"
)
->
assert_is_op_input
(
quant_dequant_op_type
,
"InScale"
);
auto
*
input_act_right_node
=
VarNode
(
"input_act_right_node"
)
->
assert_is_op_input
(
quant_dequant_op_type
,
"X"
);
auto
*
quant_dequant_right_node
=
OpNode
(
"quant_dequant_right_node"
,
quant_dequant_op_type
)
->
assert_is_op
(
quant_dequant_op_type
);
auto
*
output_scale_right_node
=
VarNode
(
"output_scale_right_node"
)
->
assert_is_op_output
(
quant_dequant_op_type
,
"OutScale"
);
auto
*
output_act_right_node
=
VarNode
(
"output_act_right_node"
)
->
assert_is_op_output
(
quant_dequant_op_type
,
"Out"
)
->
assert_is_op_input
(
quantized_op_type_
,
"Y"
);
quant_dequant_right_node
->
LinksFrom
(
{
input_scale_right_node
,
input_act_right_node
});
output_scale_right_node
->
LinksFrom
({
quant_dequant_right_node
});
output_act_right_node
->
LinksFrom
({
quant_dequant_right_node
});
auto
*
quantized_node
=
OpNode
(
"quantized_node"
,
quantized_op_type_
)
->
assert_is_op
(
quantized_op_type_
);
quantized_node
->
LinksFrom
({
output_act_left_node
,
output_act_right_node
});
}
else
{
LOG
(
FATAL
)
<<
"No support quantized_op_type:"
<<
quantized_op_type_
;
}
VLOG
(
4
)
<<
"DeleteQuantDequantOpFuser BuildPattern op_type:"
<<
quantized_op_type_
;
}
}
void
DeleteQuantDequantOpFuser
::
InsertNewNode
(
SSAGraph
*
graph
,
void
DeleteQuantDequantOpFuser
::
InsertNewNode
(
SSAGraph
*
graph
,
const
key2nodes_t
&
matched
)
{
const
key2nodes_t
&
matched
)
{
if
(
quantized_op_type_
==
"pool2d"
||
quantized_op_type_
==
"softmax"
)
{
auto
*
input_scale_node
=
matched
.
at
(
"input_scale_node"
);
auto
*
input_scale_node
=
matched
.
at
(
"input_scale_node"
);
auto
*
input_act_node
=
matched
.
at
(
"input_act_node"
);
auto
*
input_act_node
=
matched
.
at
(
"input_act_node"
);
auto
*
quant_dequant_node
=
matched
.
at
(
"quant_dequant_node"
);
auto
*
quant_dequant_node
=
matched
.
at
(
"quant_dequant_node"
);
auto
*
output_scale_node
=
matched
.
at
(
"output_scale_node"
);
auto
*
output_scale_node
=
matched
.
at
(
"output_scale_node"
);
auto
*
output_act_node
=
matched
.
at
(
"output_act_node"
);
auto
*
output_act_node
=
matched
.
at
(
"output_act_node"
);
auto
input_act_name
=
input_act_node
->
arg
()
->
name
;
auto
*
quantized_node
=
matched
.
at
(
"quantized_node"
);
auto
output_act_name
=
output_act_node
->
arg
()
->
name
;
// obtain values, save values and relink node
// Get scale value from scale var node
int
bit_length
=
int
bit_length
=
quant_dequant_node
->
stmt
()
->
op_info
()
->
GetAttr
<
int
>
(
"bit_length"
);
quant_dequant_node
->
stmt
()
->
op_info
()
->
GetAttr
<
int
>
(
"bit_length"
);
int
range
=
((
1
<<
(
bit_length
-
1
))
-
1
);
int
range
=
((
1
<<
(
bit_length
-
1
))
-
1
);
auto
*
scope
=
quant_dequant_node
->
stmt
()
->
op
()
->
scope
();
auto
*
scope
=
quant_dequant_node
->
stmt
()
->
op
()
->
scope
();
auto
*
scale_tensor
=
scope
->
FindVar
(
output_scale_node
->
arg
()
->
name
)
auto
*
scale_tensor
=
scope
->
FindVar
(
output_scale_node
->
arg
()
->
name
)
->
GetMutable
<
lite
::
Tensor
>
();
->
GetMutable
<
lite
::
Tensor
>
();
float
scale_value
=
scale_tensor
->
data
<
float
>
()[
0
]
/
range
;
float
scale_value
=
scale_tensor
->
data
<
float
>
()[
0
]
/
range
;
auto
*
op_desc
=
quantized_node
->
stmt
()
->
mutable_op_info
();
auto
quantized_nodes
=
output_act_node
->
outlinks
;
op_desc
->
SetAttr
<
int
>
(
"bit_length"
,
bit_length
);
for
(
auto
*
quantized_node
:
quantized_nodes
)
{
op_desc
->
SetAttr
<
float
>
(
"input_scale"
,
scale_value
);
// Save quantization info in op_info attr
op_desc
->
SetInput
(
"X"
,
{
input_act_node
->
arg
()
->
name
});
auto
op_info
=
*
quantized_node
->
stmt
()
->
op_info
();
IR_NODE_LINK_TO
(
input_act_node
,
quantized_node
)
std
::
string
argname
;
auto
update_op_desc
=
*
quantized_node
->
stmt
()
->
mutable_op_info
();
int
index
;
quantized_node
->
stmt
()
->
ResetOp
(
update_op_desc
,
graph
->
valid_places
());
op_info
.
GetInputArgname
(
output_act_name
,
&
argname
);
op_info
.
GetInputIndex
(
output_act_name
,
&
index
);
// delete nodes and edges
op_info
.
SetAttr
<
float
>
(
argname
+
std
::
to_string
(
index
)
+
"_input_scale"
,
std
::
unordered_set
<
const
Node
*>
nodes2rm
=
{
input_scale_node
,
scale_value
);
quant_dequant_node
,
op_info
.
SetAttr
<
float
>
(
"input_scale"
,
scale_value
);
// Save it for now
output_scale_node
,
op_info
.
SetAttr
<
int
>
(
"bit_length"
,
bit_length
);
output_act_node
};
GraphSafeRemoveNodes
(
graph
,
nodes2rm
);
op_info
.
UpdateAllInputs
(
output_act_name
,
input_act_name
);
}
else
if
(
quantized_op_type_
==
"elementwise_add"
)
{
quantized_node
->
stmt
()
->
ResetOp
(
op_info
,
graph
->
valid_places
());
auto
*
input_scale_left_node
=
matched
.
at
(
"input_scale_left_node"
);
IR_NODE_LINK_TO
(
input_act_node
,
quantized_node
);
auto
*
input_act_left_node
=
matched
.
at
(
"input_act_left_node"
);
auto
*
quant_dequant_left_node
=
matched
.
at
(
"quant_dequant_left_node"
);
auto
*
output_scale_left_node
=
matched
.
at
(
"output_scale_left_node"
);
auto
*
output_act_left_node
=
matched
.
at
(
"output_act_left_node"
);
auto
*
input_scale_right_node
=
matched
.
at
(
"input_scale_right_node"
);
auto
*
input_act_right_node
=
matched
.
at
(
"input_act_right_node"
);
auto
*
quant_dequant_right_node
=
matched
.
at
(
"quant_dequant_right_node"
);
auto
*
output_scale_right_node
=
matched
.
at
(
"output_scale_right_node"
);
auto
*
output_act_right_node
=
matched
.
at
(
"output_act_right_node"
);
auto
*
quantized_node
=
matched
.
at
(
"quantized_node"
);
// obtain values, save values and relink node
int
bit_length
=
quant_dequant_left_node
->
stmt
()
->
op_info
()
->
GetAttr
<
int
>
(
"bit_length"
);
int
range
=
((
1
<<
(
bit_length
-
1
))
-
1
);
auto
*
scope
=
quant_dequant_left_node
->
stmt
()
->
op
()
->
scope
();
auto
*
left_scale_tensor
=
scope
->
FindVar
(
output_scale_left_node
->
arg
()
->
name
)
->
GetMutable
<
lite
::
Tensor
>
();
float
left_scale_value
=
left_scale_tensor
->
data
<
float
>
()[
0
]
/
range
;
auto
*
right_scale_tensor
=
scope
->
FindVar
(
output_scale_right_node
->
arg
()
->
name
)
->
GetMutable
<
lite
::
Tensor
>
();
float
right_scale_value
=
right_scale_tensor
->
data
<
float
>
()[
0
]
/
range
;
auto
*
op_desc
=
quantized_node
->
stmt
()
->
mutable_op_info
();
op_desc
->
SetAttr
<
int
>
(
"bit_length"
,
bit_length
);
op_desc
->
SetAttr
<
float
>
(
"x_input_scale"
,
left_scale_value
);
op_desc
->
SetAttr
<
float
>
(
"y_input_scale"
,
right_scale_value
);
op_desc
->
SetInput
(
"X"
,
{
input_act_left_node
->
arg
()
->
name
});
op_desc
->
SetInput
(
"Y"
,
{
input_act_right_node
->
arg
()
->
name
});
IR_NODE_LINK_TO
(
input_act_left_node
,
quantized_node
)
IR_NODE_LINK_TO
(
input_act_right_node
,
quantized_node
)
auto
update_op_desc
=
*
quantized_node
->
stmt
()
->
mutable_op_info
();
quantized_node
->
stmt
()
->
ResetOp
(
update_op_desc
,
graph
->
valid_places
());
// delete nodes and edges
std
::
unordered_set
<
const
Node
*>
nodes2rm
=
{
input_scale_left_node
,
quant_dequant_left_node
,
output_scale_left_node
,
output_act_left_node
,
input_scale_right_node
,
quant_dequant_right_node
,
output_scale_right_node
,
output_act_right_node
};
GraphSafeRemoveNodes
(
graph
,
nodes2rm
);
}
else
{
LOG
(
FATAL
)
<<
"No support quantized_op_type:"
<<
quantized_op_type_
;
}
}
// delete nodes and edges
std
::
unordered_set
<
const
Node
*>
nodes2rm
=
{
input_scale_node
,
quant_dequant_node
,
output_scale_node
,
output_act_node
};
GraphSafeRemoveNodes
(
graph
,
nodes2rm
);
}
}
cpp
::
OpDesc
DeleteQuantDequantOpFuser
::
GenOpDesc
(
const
key2nodes_t
&
matched
)
{
cpp
::
OpDesc
DeleteQuantDequantOpFuser
::
GenOpDesc
(
const
key2nodes_t
&
matched
)
{
...
...
lite/core/mir/fusion/quant_dequant_op_fuser.h
浏览文件 @
917620e8
...
@@ -87,24 +87,16 @@ class ChannelWiseDequantOpFuser : public FuseBase {
...
@@ -87,24 +87,16 @@ class ChannelWiseDequantOpFuser : public FuseBase {
};
};
/* The pattern like "fake_quantize_dequantize_moving_average_abs_max +
/* The pattern like "fake_quantize_dequantize_moving_average_abs_max +
* pooled/elementwise_add" can be deteted by this fuser. The fuser
* quantized_op" can be deteted by this fuser. The fuser modifies the input
* extract the input_scale form fake_quant_dequant_op and save into
* scale for the quantized_op and deletes the fake_quant_dequant_op.
* the quantized_op. Besides, the fuser delete fake_quant_dequant_op in
* the graph.
*/
*/
class
DeleteQuantDequantOpFuser
:
public
FuseBase
{
class
DeleteQuantDequantOpFuser
:
public
FuseBase
{
public:
public:
explicit
DeleteQuantDequantOpFuser
(
const
std
::
string
&
quantized_op_type
)
:
quantized_op_type_
(
quantized_op_type
)
{}
void
BuildPattern
()
override
;
void
BuildPattern
()
override
;
void
InsertNewNode
(
SSAGraph
*
graph
,
const
key2nodes_t
&
matched
)
override
;
void
InsertNewNode
(
SSAGraph
*
graph
,
const
key2nodes_t
&
matched
)
override
;
private:
private:
cpp
::
OpDesc
GenOpDesc
(
const
key2nodes_t
&
matched
)
override
;
cpp
::
OpDesc
GenOpDesc
(
const
key2nodes_t
&
matched
)
override
;
private:
std
::
string
quantized_op_type_
{};
};
};
}
// namespace fusion
}
// namespace fusion
...
...
lite/core/op_lite.h
浏览文件 @
917620e8
...
@@ -225,6 +225,32 @@ class OpInfo : public cpp::OpDesc {
...
@@ -225,6 +225,32 @@ class OpInfo : public cpp::OpDesc {
return
false
;
return
false
;
}
}
// For the input variable name, find the index of the corresponding
// input argname
bool
GetInputIndex
(
const
std
::
string
&
value_name
,
int
*
out
)
const
{
for
(
auto
&
item
:
inputs_
)
{
auto
it
=
std
::
find
(
item
.
second
.
begin
(),
item
.
second
.
end
(),
value_name
);
if
(
it
!=
item
.
second
.
end
())
{
*
out
=
it
-
item
.
second
.
begin
();
return
true
;
}
}
return
false
;
}
// For the output variable name, find the index of the corresponding
// output argname
bool
GetOutputIndex
(
const
std
::
string
&
value_name
,
int
*
out
)
const
{
for
(
auto
&
item
:
outputs_
)
{
auto
it
=
std
::
find
(
item
.
second
.
begin
(),
item
.
second
.
end
(),
value_name
);
if
(
it
!=
item
.
second
.
end
())
{
*
out
=
it
-
item
.
second
.
begin
();
return
true
;
}
}
return
false
;
}
void
UpdateAllInputs
(
const
std
::
string
&
from
,
const
std
::
string
&
to
)
{
void
UpdateAllInputs
(
const
std
::
string
&
from
,
const
std
::
string
&
to
)
{
for
(
auto
&
item
:
inputs_
)
{
for
(
auto
&
item
:
inputs_
)
{
for
(
auto
&
var
:
item
.
second
)
{
for
(
auto
&
var
:
item
.
second
)
{
...
...
lite/core/profile/precision_profiler.h
浏览文件 @
917620e8
...
@@ -18,6 +18,7 @@
...
@@ -18,6 +18,7 @@
* of each kernel.
* of each kernel.
*/
*/
#pragma once
#pragma once
#include <cmath>
#include <string>
#include <string>
#include <vector>
#include <vector>
#include "lite/core/program.h"
#include "lite/core/program.h"
...
...
lite/core/program.cc
浏览文件 @
917620e8
...
@@ -20,7 +20,7 @@
...
@@ -20,7 +20,7 @@
#include "lite/operators/conditional_block_op.h"
#include "lite/operators/conditional_block_op.h"
#include "lite/operators/subgraph_op.h"
#include "lite/operators/subgraph_op.h"
#include "lite/operators/while_op.h"
#include "lite/operators/while_op.h"
#ifdef LITE_WITH_PROFILE
#ifdef LITE_WITH_PR
ECISION_PR
OFILE
#include "lite/core/profile/precision_profiler.h"
#include "lite/core/profile/precision_profiler.h"
#endif
#endif
...
@@ -136,12 +136,10 @@ void RuntimeProgram::UpdateVarsOfProgram(cpp::ProgramDesc* desc) {
...
@@ -136,12 +136,10 @@ void RuntimeProgram::UpdateVarsOfProgram(cpp::ProgramDesc* desc) {
}
}
void
RuntimeProgram
::
Run
()
{
void
RuntimeProgram
::
Run
()
{
#ifdef LITE_WITH_PROFILE
#ifdef LITE_WITH_PRECISION_PROFILE
#ifdef LITE_WITH_PRECISION_PROFILE
auto
inst_precision_profiler
=
paddle
::
lite
::
profile
::
PrecisionProfiler
();
auto
inst_precision_profiler
=
paddle
::
lite
::
profile
::
PrecisionProfiler
();
std
::
string
precision_profiler_summary
=
std
::
string
precision_profiler_summary
=
inst_precision_profiler
.
GetSummaryHeader
();
inst_precision_profiler
.
GetSummaryHeader
();
#endif
#endif
#endif
for
(
auto
&
inst
:
instructions_
)
{
for
(
auto
&
inst
:
instructions_
)
{
...
@@ -149,21 +147,19 @@ void RuntimeProgram::Run() {
...
@@ -149,21 +147,19 @@ void RuntimeProgram::Run() {
if
(
inst
.
is_feed_fetch_op
())
continue
;
if
(
inst
.
is_feed_fetch_op
())
continue
;
#endif
#endif
inst
.
Run
();
inst
.
Run
();
#ifdef LITE_WITH_PROFILE
#ifdef LITE_WITH_PRECISION_PROFILE
#ifdef LITE_WITH_PRECISION_PROFILE
#ifndef LITE_WITH_FPGA
#ifndef LITE_WITH_FPGA
precision_profiler_summary
+=
precision_profiler_summary
+=
inst_precision_profiler
.
GetInstPrecision
(
&
inst
);
inst_precision_profiler
.
GetInstPrecision
(
&
inst
);
#endif
#endif
#endif // LITE_WITH_PRECISION_PROFILE
#endif // LITE_WITH_PRECISION_PROFILE
#endif // LITE_WITH_PROFILE
}
}
#ifdef LITE_WITH_PROFILE
#ifdef LITE_WITH_PROFILE
LOG
(
INFO
)
<<
"
\n
"
<<
profiler_
.
Summary
(
profile
::
Type
::
kDispatch
,
false
,
0
);
LOG
(
INFO
)
<<
"
\n
"
<<
profiler_
.
Summary
(
profile
::
Type
::
kDispatch
,
false
,
0
);
#endif
#ifdef LITE_WITH_PRECISION_PROFILE
#ifdef LITE_WITH_PRECISION_PROFILE
LOG
(
INFO
)
<<
"
\n
"
<<
precision_profiler_summary
;
LOG
(
INFO
)
<<
"
\n
"
<<
precision_profiler_summary
;
#endif // LITE_WITH_PRECISION_PROFILE
#endif
#endif // LITE_WITH_PROFILE
}
}
void
Program
::
Build
(
const
cpp
::
ProgramDesc
&
prog
)
{
void
Program
::
Build
(
const
cpp
::
ProgramDesc
&
prog
)
{
...
...
lite/demo/cxx/train_demo/README.md
0 → 100644
浏览文件 @
917620e8
# Introduction
我们都知道,PaddleLite可以做移动端预测,事实上PaddleLite支持在移动端做模型训练。本文给出使用PaddleLite做训练的例子,这一例子对应的任务是“波士顿房价预测”,又称作“fit-a-line”。
你可以通过book库中的
[
文档
](
https://paddlepaddle.org.cn/documentation/docs/zh/user_guides/simple_case/fit_a_line/README.cn.html
)
和
[
源码
](
https://github.com/PaddlePaddle/book/tree/develop/01.fit_a_line
)
进一步了解“波士顿房价预测”这一任务的定义及其建模过程,
其使用线性回归(Linear Regression)
模型做建模。本文主要介绍如何将其迁移至Paddle-Lite进行训练。
注:这是一篇使用C++ API做模型训练的教程,其他API暂时不支持训练功能。
# Requirements
-
一部安卓手机,用于运行训练程序
-
装了Paddle (version: 1.7.0) 的python
# Quick start
## Step1 build paddle-lite
请按照
[
paddle-lite官方文档
](
https://paddle-lite.readthedocs.io/zh/latest/user_guides/source_compile.html#paddlelite
)
的教程编译full_publish的paddle-lite lib。以Linux上编译为例,其具体的命令为:
```
shell
## 配置环境
wget
-c
https://mms-res.cdn.bcebos.com/cmake-3.10.3-Linux-x86_64.tar.gz
--no-check-certificate
tar
xzf cmake-3.10.3-Linux-x86_64.tar.gz
export
PATH
=
${
PWD
}
'/cmake-3.10.3-Linux-x86_64/bin'
:
$PATH
wget https://dl.google.com/android/repository/android-ndk-r17c-linux-x86_64.zip
unzip android-ndk-r17c-linux-x86_64.zip
export
NDK_ROOT
=
/opt/android-ndk-r17c
## 编译
git clone https://github.com/PaddlePaddle/Paddle-Lite.git
cd
Paddle-Lite
./lite/tools/build.sh
\
--arm_os
=
android
\
--arm_abi
=
armv7
\
--build_extra
=
ON
\
--arm_lang
=
gcc
\
--android_stl
=
c++_static
\
--build_train
=
ON full_publish
```
产物:
```
shell
Paddle-Lite/build.lite.android.armv7.gcc/inference_lite_lib.android.armv7/cxx/lib/libpaddle_full_api_shared.so
```
## Step2 编译lr_trainer
```
shell
cd
Paddle-Lite/lite/demo/cxx/train_demo/cplus_train/
sh run_build.sh /path/to/your/Paddle-Lite/build.lite.android.armv7.gcc/ /path/to/your/android-ndk-r17c
```
产物:
```
shell
bin/
`
--
demo_trainer
```
## Step3 download model and run it!
在你的笔记本电脑上,用usb连接到手机,开启开发者模式,在任意目录下执行:
```
shell
local_path
=
/data/local/tmp/linear_regression
adb shell
"mkdir "
${
local_path
}
# download model and push to mobile
wget http://paddle-tar.bj.bcebos.com/paddle-lite/lite_lr_model.tar.gz
tar
-zxvf
lite_lr_model.tar.gz
adb push lite_lr_model/housing.data
${
local_path
}
adb push lite_lr_model/model_dir
${
local_path
}
# push lib and executable file to moblie
adb push libpaddle_full_api_shared.so
${
local_path
}
adb push demo_trainer
${
local_path
}
adb shell
chmod
+x
${
local_path
}
/demo_trainer
# run it!
adb shell
"export LD_LIBRARY_PATH="
${
local_path
}
" && export LIBRARY_PATH="
${
local_path
}
" && cd "
${
local_path
}
" && ./demo_trainer true"
```
期望结果:
```
sample 0: Loss: 564.317
sample 1: Loss: 463.9
sample 2: Loss: 1197.54
sample 3: Loss: 1093.83
sample 4: Loss: 1282.76
sample 5: Loss: 792.097
sample 6: Loss: 491.776
sample 7: Loss: 698.496
sample 8: Loss: 248.445
sample 9: Loss: 325.135
```
# 更多细节
上面提到的模型是直接下载得到的,如果你想自己生成,可以执行以下命令:
```
shell
git clone https://github.com/PaddlePaddle/Paddle-Lite.git
cd
Paddle-Lite/lite/demo/cxx/train_demo/
python train.py
--save_model
```
产物:
```
shell
model_dir/
|-- fc_0.b_0
|-- fc_0.w_0
|-- learning_rate_0
`
--
__model__
md5sum
fc_0.w_0: 2c7b3649b2a9cf7bcd19f8b256ce795d
```
如果你想生成自己的模型用于训练,可以参考
`train.py`
中保存模型的方式。
# 与Paddle训练结果做校对
## 前10个Loss值
为了验证paddle与lite的一致性,我们控制模型参数一致、数据一致、batch size = 1的情况下,训练10个batch, 记录了二者的loss值。
python + paddle 命令:
```
shell
fluid train.py
--num_steps
=
10
--batch_size
=
1
```
python + paddle 结果:
```
shell
Train cost, Step 0, Cost 564.317017
Train cost, Step 1, Cost 463.900238
Train cost, Step 2, Cost 1197.537354
Train cost, Step 3, Cost 1093.833008
Train cost, Step 4, Cost 1282.760254
Train cost, Step 5, Cost 792.097351
Train cost, Step 6, Cost 491.775848
Train cost, Step 7, Cost 698.496033
Train cost, Step 8, Cost 248.444885
Train cost, Step 9, Cost 325.135132
```
c++ 与 paddle-lite命令:
```
./demo_trainer true
```
c++ 与 paddle-lite结果:
```
sample 0: Loss: 564.317
sample 1: Loss: 463.9
sample 2: Loss: 1197.54
sample 3: Loss: 1093.83
sample 4: Loss: 1282.76
sample 5: Loss: 792.097
sample 6: Loss: 491.776
sample 7: Loss: 698.496
sample 8: Loss: 248.445
sample 9: Loss: 325.135
```
## Loss 曲线
控制训练时的batch size为20,每个epoch对训练数据做全局shuffle,训练100个epoch后,paddle和lite的loss曲线对比如下。

如果想复现上述效果,paddle+python的运行命令为:
```
git clone https://github.com/PaddlePaddle/book.git
cd book/01.fit_a_line
python train.py
```
lite + c++的运行命令为:
```
./demo_trainer false
```
lite/demo/cxx/train_demo/cplus_train/CMakeLists.txt
0 → 100644
浏览文件 @
917620e8
cmake_minimum_required
(
VERSION 2.8
)
set
(
CMAKE_CXX_STANDARD 11
)
# Project's name
if
(
NOT DEFINED LITE_ROOT
)
message
(
FATAL_ERROR
"please set LITE_ROOT with
-DLITE_ROOT=/path/to/your/build.lite.android.armv7.gcc/"
)
endif
()
project
(
demo_trainer
)
# Set the output folder where your program will be created
set
(
CMAKE_BINARY_DIR
${
CMAKE_SOURCE_DIR
}
/bin
)
set
(
EXECUTABLE_OUTPUT_PATH
${
CMAKE_BINARY_DIR
}
)
set
(
LIBRARY_OUTPUT_PATH
${
CMAKE_BINARY_DIR
}
)
# The following folder will be included
include_directories
(
"include"
)
include_directories
(
"
${
LITE_ROOT
}
/inference_lite_lib.android.armv7/cxx/include"
)
add_executable
(
demo_trainer
${
PROJECT_SOURCE_DIR
}
/demo_trainer.cc
${
PROJECT_SOURCE_DIR
}
/data_reader.cc
)
TARGET_LINK_LIBRARIES
(
demo_trainer
"
${
LITE_ROOT
}
/inference_lite_lib.android.armv7/cxx/lib/libpaddle_full_api_shared.so"
)
lite/demo/cxx/train_demo/cplus_train/data_reader.cc
0 → 100644
浏览文件 @
917620e8
// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "include/data_reader.h"
#include <limits>
using
std
::
string
;
using
std
::
vector
;
int
FEATURE_NUM
=
13
;
float
rate
=
0.8
;
int
get_samples
(
string
line
,
vector
<
float
>*
feature
,
float
*
label
)
{
std
::
istringstream
reader
(
line
);
std
::
vector
<
float
>
numbers
;
do
{
// read as many numbers as possible.
for
(
float
number
;
reader
>>
number
;)
{
numbers
.
push_back
(
number
);
}
// consume and discard token from stream.
if
(
reader
.
fail
())
{
reader
.
clear
();
std
::
string
token
;
reader
>>
token
;
}
}
while
(
!
reader
.
eof
());
assert
(
numbers
.
size
()
==
FEATURE_NUM
+
1
);
for
(
int
i
=
0
;
i
<
FEATURE_NUM
;
i
++
)
{
feature
->
push_back
(
numbers
[
i
]);
}
*
label
=
numbers
[
FEATURE_NUM
];
return
0
;
}
int
normalize
(
const
vector
<
vector
<
float
>>&
origin_features
,
vector
<
vector
<
float
>>*
features
,
float
rate
)
{
int
inf
=
std
::
numeric_limits
<
int
>::
max
();
vector
<
float
>
min_vec
(
FEATURE_NUM
,
static_cast
<
float
>
(
inf
));
vector
<
float
>
max_vec
(
FEATURE_NUM
,
-
(
static_cast
<
float
>
(
inf
)));
vector
<
float
>
sum_vec
(
FEATURE_NUM
,
0
);
vector
<
float
>
avg_vec
(
FEATURE_NUM
,
0
);
for
(
int
i
=
0
;
i
<
origin_features
.
size
();
i
++
)
{
for
(
int
j
=
0
;
j
<
FEATURE_NUM
;
j
++
)
{
min_vec
[
j
]
=
min
(
min_vec
[
j
],
origin_features
[
i
][
j
]);
max_vec
[
j
]
=
max
(
max_vec
[
j
],
origin_features
[
i
][
j
]);
sum_vec
[
j
]
+=
origin_features
[
i
][
j
];
}
}
for
(
int
i
=
0
;
i
<
FEATURE_NUM
;
i
++
)
{
avg_vec
[
i
]
=
sum_vec
[
i
]
/
origin_features
.
size
();
}
for
(
int
i
=
0
;
i
<
origin_features
.
size
()
*
rate
-
1
;
i
++
)
{
vector
<
float
>
feat
;
for
(
int
j
=
0
;
j
<
FEATURE_NUM
;
j
++
)
{
feat
.
push_back
((
origin_features
[
i
][
j
]
-
avg_vec
[
j
])
/
(
max_vec
[
j
]
-
min_vec
[
j
]));
}
features
->
push_back
(
feat
);
}
}
int
read_samples
(
const
string
fname
,
vector
<
vector
<
float
>>*
features
,
vector
<
float
>*
labels
)
{
fstream
fin
;
fin
.
open
(
fname
);
if
(
!
static_cast
<
bool
>
(
fin
))
{
return
1
;
}
vector
<
vector
<
float
>>
origin_features
;
vector
<
string
>
lines
;
string
line
;
while
(
getline
(
fin
,
line
))
{
lines
.
push_back
(
line
);
}
fin
.
close
();
for
(
int
i
=
0
;
i
<
lines
.
size
();
i
++
)
{
vector
<
float
>
feat
;
float
lbl
=
0
;
get_samples
(
lines
[
i
],
&
feat
,
&
lbl
);
origin_features
.
push_back
(
feat
);
if
(
i
<
lines
.
size
()
*
rate
-
1
)
{
labels
->
push_back
(
lbl
);
}
}
cout
<<
"finish read fata"
<<
endl
;
normalize
(
origin_features
,
features
,
rate
);
assert
(
features
->
size
()
==
labels
->
size
());
return
0
;
}
lite/demo/cxx/train_demo/cplus_train/demo_trainer.cc
0 → 100644
浏览文件 @
917620e8
// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <math.h>
#include <algorithm>
#include <iomanip>
#include <iostream>
#include <vector>
#include "include/data_reader.h"
#include "paddle_api.h" // NOLINT
using
namespace
paddle
::
lite_api
;
// NOLINT
class
LRModel
{
public:
void
InitModel
()
{
// 1. Set CxxConfig
CxxConfig
config
;
config
.
set_model_dir
(
"model_dir"
);
std
::
vector
<
Place
>
valid_places
{
Place
{
TARGET
(
kARM
),
PRECISION
(
kFloat
)}};
config
.
set_valid_places
(
valid_places
);
predictor_
=
CreatePaddlePredictor
<
CxxConfig
>
(
config
);
}
float
Predict
(
const
vector
<
vector
<
float
>>&
features
,
const
vector
<
float
>&
labels
)
{
// Create Tensor
assert
(
features
.
size
()
==
labels
.
size
());
int
batch_size
=
features
.
size
();
std
::
unique_ptr
<
Tensor
>
input_tensor
(
std
::
move
(
predictor_
->
GetInput
(
0
)));
input_tensor
->
Resize
(
shape_t
({
batch_size
,
FEATURE_NUM
}));
auto
*
data
=
input_tensor
->
mutable_data
<
float
>
();
for
(
int
i
=
0
;
i
<
batch_size
;
i
++
)
{
for
(
int
j
=
0
;
j
<
FEATURE_NUM
;
j
++
)
{
data
[
FEATURE_NUM
*
i
+
j
]
=
features
[
i
][
j
];
}
}
std
::
unique_ptr
<
Tensor
>
y_tensor
(
std
::
move
(
predictor_
->
GetInput
(
1
)));
y_tensor
->
Resize
(
shape_t
({
batch_size
,
1
}));
auto
*
y_data
=
y_tensor
->
mutable_data
<
float
>
();
for
(
int
i
=
0
;
i
<
batch_size
;
i
++
)
{
y_data
[
i
]
=
labels
[
i
];
}
predictor_
->
Run
();
std
::
unique_ptr
<
const
Tensor
>
output_tensor
(
std
::
move
(
predictor_
->
GetOutput
(
0
)));
return
output_tensor
->
data
<
float
>
()[
0
];
}
private:
std
::
shared_ptr
<
PaddlePredictor
>
predictor_
;
};
int
shuffle
(
vector
<
vector
<
float
>>*
features
,
vector
<
float
>*
labels
)
{
assert
(
features
->
size
()
==
labels
->
size
());
vector
<
int
>
index
;
for
(
int
i
=
0
;
i
<
features
->
size
();
i
++
)
{
index
.
push_back
(
i
);
}
random_shuffle
(
index
.
begin
(),
index
.
end
());
vector
<
vector
<
float
>>
tmp_features
;
vector
<
float
>
tmp_labels
;
for
(
int
i
=
0
;
i
<
features
->
size
();
i
++
)
{
tmp_features
.
push_back
((
*
features
)[
index
[
i
]]);
tmp_labels
.
push_back
((
*
labels
)[
index
[
i
]]);
}
for
(
int
i
=
0
;
i
<
features
->
size
();
i
++
)
{
for
(
int
j
=
0
;
j
<
FEATURE_NUM
;
j
++
)
{
(
*
features
)[
i
][
j
]
=
tmp_features
[
i
][
j
];
}
(
*
labels
)[
i
]
=
tmp_labels
[
i
];
}
return
0
;
}
int
main
(
int
argc
,
char
*
argv
[])
{
if
(
argc
<
2
)
{
cerr
<<
"usage: ./demo_trainer is_small"
<<
endl
;
cerr
<<
" if is_small is true, the batch size is set to 1, "
<<
endl
;
cerr
<<
" and it will only runs for 10 steps."
<<
endl
;
return
1
;
}
string
is_small
=
argv
[
1
];
vector
<
vector
<
float
>>
features
;
vector
<
float
>
labels
;
read_samples
(
"housing.data"
,
&
features
,
&
labels
);
cout
<<
"sample count: "
<<
features
.
size
()
<<
" "
<<
endl
;
std
::
shared_ptr
<
LRModel
>
local_model
(
new
LRModel
());
local_model
->
InitModel
();
if
(
is_small
==
"true"
)
{
cout
<<
"small mode"
<<
endl
;
for
(
int
i
;
i
<
10
;
i
++
)
{
vector
<
vector
<
float
>>
batch_feature
;
vector
<
float
>
batch_label
;
batch_feature
.
push_back
(
features
[
i
]);
batch_label
.
push_back
(
labels
[
i
]);
auto
loss
=
local_model
->
Predict
(
batch_feature
,
batch_label
);
cout
<<
"sample "
<<
i
<<
": "
<<
loss
<<
endl
;
}
}
else
if
(
is_small
==
"false"
)
{
// shuffle
cout
<<
"full model"
<<
endl
;
int
epoch
=
100
;
int
batch_size
=
20
;
int
step
=
0
;
for
(
int
i
;
i
<
epoch
;
i
++
)
{
shuffle
(
&
features
,
&
labels
);
for
(
int
j
=
0
;
j
<
ceil
(
static_cast
<
float
>
(
features
.
size
())
/
batch_size
);
j
++
)
{
int
start_idx
=
j
*
batch_size
;
int
end_idx
=
min
((
j
+
1
)
*
batch_size
,
static_cast
<
int
>
(
features
.
size
()));
auto
batch_feature
=
vector
<
vector
<
float
>>
(
features
.
begin
()
+
start_idx
,
features
.
begin
()
+
end_idx
);
auto
batch_label
=
vector
<
float
>
(
labels
.
begin
()
+
start_idx
,
labels
.
begin
()
+
end_idx
);
auto
loss
=
local_model
->
Predict
(
batch_feature
,
batch_label
);
if
(
step
%
10
==
0
)
{
std
::
cout
<<
"batch: "
<<
i
<<
", step: "
<<
step
<<
", Loss: "
<<
loss
<<
endl
;
}
step
+=
1
;
}
}
}
else
{
cerr
<<
"wrong arg for is_small: "
<<
is_small
<<
endl
;
}
}
lite/demo/cxx/train_demo/cplus_train/include/data_reader.h
0 → 100644
浏览文件 @
917620e8
// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <assert.h>
#include <fstream>
#include <iostream>
#include <sstream>
#include <string>
#include <vector>
using
std
::
string
;
using
std
::
vector
;
using
std
::
cerr
;
using
std
::
cout
;
using
std
::
endl
;
using
std
::
min
;
using
std
::
max
;
using
std
::
fstream
;
extern
int
FEATURE_NUM
;
int
get_samples
(
string
line
,
const
vector
<
float
>&
feature
,
float
*
label
);
int
read_samples
(
const
string
fname
,
vector
<
vector
<
float
>>*
features
,
vector
<
float
>*
labels
);
lite/demo/cxx/train_demo/cplus_train/run_build.sh
0 → 100644
浏览文件 @
917620e8
rm
-rf
build
mkdir
build
cd
build
LITE_ROOT
=
$1
NDK_ROOT
=
$2
cmake ..
\
-DLITE_ROOT
=
${
LITE_ROOT
}
\
-DNDK_ROOT
=
${
NDK_ROOT
}
\
-DCMAKE_TOOLCHAIN_FILE
=
${
NDK_ROOT
}
/build/cmake/android.toolchain.cmake
\
-DANDROID_TOOLCHAIN
=
gcc
\
-DANDROID_ABI
=
"armeabi-v7a"
\
-DANDROID_PLATFORM
=
android-23
\
-DANDROID
=
true
\
-DANDROID_STL
=
c++_static
make
cd
..
# ./bin/demo_trainer
lite/demo/cxx/train_demo/image/lr_loss.png
0 → 100644
浏览文件 @
917620e8
51.0 KB
lite/demo/cxx/train_demo/train.py
0 → 100644
浏览文件 @
917620e8
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from
__future__
import
print_function
import
sys
import
argparse
import
math
import
numpy
import
paddle
import
paddle.fluid
as
fluid
def
parse_args
():
parser
=
argparse
.
ArgumentParser
(
"fit_a_line"
)
parser
.
add_argument
(
'--save_model'
,
action
=
'store_true'
,
help
=
"Whether to save main program"
)
parser
.
add_argument
(
'--num_steps'
,
type
=
int
,
default
=
1000000000000
,
help
=
"train steps"
)
parser
.
add_argument
(
'--num_epochs'
,
type
=
int
,
default
=
100
,
help
=
"number of epochs."
)
parser
.
add_argument
(
'--batch_size'
,
type
=
int
,
default
=
20
,
help
=
"batch size."
)
parser
.
add_argument
(
'--shuffle'
,
action
=
'store_true'
,
help
=
"Whether to shuffle train data."
)
args
=
parser
.
parse_args
()
return
args
# For training test cost
def
train_test
(
executor
,
program
,
reader
,
feeder
,
fetch_list
):
accumulated
=
1
*
[
0
]
count
=
0
for
data_test
in
reader
():
outs
=
executor
.
run
(
program
=
program
,
feed
=
feeder
.
feed
(
data_test
),
fetch_list
=
fetch_list
)
accumulated
=
[
x_c
[
0
]
+
x_c
[
1
][
0
]
for
x_c
in
zip
(
accumulated
,
outs
)]
count
+=
1
return
[
x_d
/
count
for
x_d
in
accumulated
]
def
main
():
if
args
.
shuffle
:
print
(
"doing shuffle"
)
train_reader
=
paddle
.
batch
(
paddle
.
reader
.
shuffle
(
paddle
.
dataset
.
uci_housing
.
train
(),
buf_size
=
500
),
batch_size
=
args
.
batch_size
)
else
:
train_reader
=
paddle
.
batch
(
paddle
.
dataset
.
uci_housing
.
train
(),
batch_size
=
args
.
batch_size
)
# feature vector of length 13
x
=
fluid
.
data
(
name
=
'x'
,
shape
=
[
None
,
13
],
dtype
=
'float32'
)
y
=
fluid
.
data
(
name
=
'y'
,
shape
=
[
None
,
1
],
dtype
=
'float32'
)
main_program
=
fluid
.
default_main_program
()
startup_program
=
fluid
.
default_startup_program
()
main_program
.
random_seed
=
90
startup_program
.
random_seed
=
90
y_predict
=
fluid
.
layers
.
fc
(
input
=
x
,
size
=
1
,
act
=
None
)
cost
=
fluid
.
layers
.
square_error_cost
(
input
=
y_predict
,
label
=
y
)
avg_loss
=
fluid
.
layers
.
mean
(
cost
)
test_program
=
main_program
.
clone
(
for_test
=
True
)
sgd_optimizer
=
fluid
.
optimizer
.
SGD
(
learning_rate
=
0.001
)
sgd_optimizer
.
minimize
(
avg_loss
)
place
=
fluid
.
CPUPlace
()
exe
=
fluid
.
Executor
(
place
)
num_epochs
=
args
.
num_epochs
# main train loop.
feeder
=
fluid
.
DataFeeder
(
place
=
place
,
feed_list
=
[
x
,
y
])
exe
.
run
(
startup_program
)
if
args
.
save_model
:
fluid
.
io
.
save_persistables
(
exe
,
"model_dir"
)
# add feed and fetch op
feeded_var_names
=
[
'x'
,
'y'
]
fetch_var_names
=
[
'mean_0.tmp_0'
]
fluid
.
io
.
prepend_feed_ops
(
main_program
,
feeded_var_names
)
fluid
.
io
.
append_fetch_ops
(
main_program
,
fetch_var_names
)
with
open
(
"model_dir/__model__"
,
"wb"
)
as
f
:
f
.
write
(
main_program
.
desc
.
serialize_to_string
())
with
open
(
"debug_main_program"
,
"w"
)
as
f
:
f
.
write
(
str
(
main_program
))
print
(
"train model saved to model_dir"
)
return
train_prompt
=
"Train cost"
step
=
0
for
pass_id
in
range
(
num_epochs
):
for
data_train
in
train_reader
():
avg_loss_value
,
=
exe
.
run
(
main_program
,
feed
=
feeder
.
feed
(
data_train
),
fetch_list
=
[
avg_loss
])
print
(
"%s, Step %d, Cost %f"
%
(
train_prompt
,
step
,
avg_loss_value
[
0
]))
if
step
==
args
.
num_steps
-
1
:
return
step
+=
1
if
math
.
isnan
(
float
(
avg_loss_value
[
0
])):
sys
.
exit
(
"got NaN loss, training failed."
)
if
__name__
==
'__main__'
:
args
=
parse_args
()
main
()
lite/kernels/arm/CMakeLists.txt
浏览文件 @
917620e8
...
@@ -106,13 +106,12 @@ add_kernel(lstm_arm ARM extra SRCS lstm_compute.cc DEPS ${lite_kernel_deps} math
...
@@ -106,13 +106,12 @@ add_kernel(lstm_arm ARM extra SRCS lstm_compute.cc DEPS ${lite_kernel_deps} math
# 4. training kernels
# 4. training kernels
add_kernel
(
mean_compute_arm ARM extra SRCS mean_compute.cc DEPS
${
lite_kernel_deps
}
math_arm
)
add_kernel
(
mean_compute_arm ARM extra SRCS mean_compute.cc DEPS
${
lite_kernel_deps
}
math_arm
)
if
(
LITE_WITH_TRAIN
)
add_kernel
(
mean_grad_compute_arm ARM extra SRCS mean_grad_compute.cc DEPS
${
lite_kernel_deps
}
math_arm
)
add_kernel
(
mean_grad_compute_arm ARM train SRCS mean_grad_compute.cc DEPS
${
lite_kernel_deps
}
math_arm
)
add_kernel
(
activation_grad_compute_arm ARM basic SRCS activation_grad_compute.cc DEPS
${
lite_kernel_deps
}
math_arm
)
add_kernel
(
activation_grad_compute_arm ARM train SRCS activation_grad_compute.cc DEPS
${
lite_kernel_deps
}
math_arm
)
add_kernel
(
elementwise_grad_compute_arm ARM basic SRCS elementwise_grad_compute.cc DEPS
${
lite_kernel_deps
}
math_arm
)
add_kernel
(
elementwise_grad_compute_arm ARM train SRCS elementwise_grad_compute.cc DEPS
${
lite_kernel_deps
}
math_arm
)
add_kernel
(
mul_grad_compute_arm ARM extra SRCS mul_grad_compute.cc DEPS
${
lite_kernel_deps
}
math_arm
)
add_kernel
(
mul_grad_compute_arm ARM train SRCS mul_grad_compute.cc DEPS
${
lite_kernel_deps
}
math_arm
)
add_kernel
(
sgd_compute_arm ARM extra SRCS sgd_compute.cc DEPS
${
lite_kernel_deps
}
math_arm
)
add_kernel
(
sgd_compute_arm ARM train SRCS sgd_compute.cc DEPS
${
lite_kernel_deps
}
math_arm
)
endif
()
lite_cc_test
(
test_scale_compute_arm SRCS scale_compute_test.cc DEPS scale_compute_arm
)
lite_cc_test
(
test_scale_compute_arm SRCS scale_compute_test.cc DEPS scale_compute_arm
)
lite_cc_test
(
test_softmax_compute_arm SRCS softmax_compute_test.cc DEPS softmax_compute_arm
)
lite_cc_test
(
test_softmax_compute_arm SRCS softmax_compute_test.cc DEPS softmax_compute_arm
)
...
...
lite/kernels/bm/bridges/CMakeLists.txt
浏览文件 @
917620e8
...
@@ -30,6 +30,8 @@ lite_cc_library(subgraph_bridge_conv_transpose_op_bm SRCS conv_transpose_op.cc D
...
@@ -30,6 +30,8 @@ lite_cc_library(subgraph_bridge_conv_transpose_op_bm SRCS conv_transpose_op.cc D
lite_cc_library
(
subgraph_bridge_reduce_full_op_bm SRCS reduce_full_op.cc DEPS
${
bm_subgraph_bridge_deps
}
)
lite_cc_library
(
subgraph_bridge_reduce_full_op_bm SRCS reduce_full_op.cc DEPS
${
bm_subgraph_bridge_deps
}
)
lite_cc_library
(
subgraph_bridge_squeeze_op_bm SRCS squeeze_op.cc DEPS
${
bm_subgraph_bridge_deps
}
)
lite_cc_library
(
subgraph_bridge_squeeze_op_bm SRCS squeeze_op.cc DEPS
${
bm_subgraph_bridge_deps
}
)
lite_cc_library
(
subgraph_bridge_cast_op_bm SRCS cast_op.cc DEPS
${
bm_subgraph_bridge_deps
}
)
lite_cc_library
(
subgraph_bridge_cast_op_bm SRCS cast_op.cc DEPS
${
bm_subgraph_bridge_deps
}
)
lite_cc_library
(
subgraph_bridge_fill_constant_op_bm SRCS fill_constant_op.cc DEPS
${
bm_subgraph_bridge_deps
}
)
lite_cc_library
(
subgraph_bridge_assign_value_op_bm SRCS assign_value_op.cc DEPS
${
bm_subgraph_bridge_deps
}
)
set
(
bm_subgraph_bridges
set
(
bm_subgraph_bridges
subgraph_bridge_registry
subgraph_bridge_registry
...
@@ -58,4 +60,6 @@ set(bm_subgraph_bridges
...
@@ -58,4 +60,6 @@ set(bm_subgraph_bridges
subgraph_bridge_reduce_full_op_bm
subgraph_bridge_reduce_full_op_bm
subgraph_bridge_squeeze_op_bm
subgraph_bridge_squeeze_op_bm
subgraph_bridge_cast_op_bm
subgraph_bridge_cast_op_bm
subgraph_bridge_fill_constant_op_bm
subgraph_bridge_assign_value_op_bm
CACHE INTERNAL
"bm_subgraph_bridges"
)
CACHE INTERNAL
"bm_subgraph_bridges"
)
lite/kernels/bm/bridges/act_op.cc
浏览文件 @
917620e8
...
@@ -13,6 +13,7 @@
...
@@ -13,6 +13,7 @@
// limitations under the License.
// limitations under the License.
#include <bmcompiler_if.h>
#include <bmcompiler_if.h>
#include <bmcompiler_if_lite.h>
#include <bmcompiler_op_code.h>
#include <bmcompiler_op_code.h>
#include "lite/kernels/bm/bridges/graph.h"
#include "lite/kernels/bm/bridges/graph.h"
#include "lite/kernels/npu/bridges/registry.h"
#include "lite/kernels/npu/bridges/registry.h"
...
@@ -35,16 +36,14 @@ int ActConverter(void* ctx, OpLite* op, KernelBase* kernel) {
...
@@ -35,16 +36,14 @@ int ActConverter(void* ctx, OpLite* op, KernelBase* kernel) {
auto
output_var_name
=
op_info
->
Output
(
"Out"
).
front
();
auto
output_var_name
=
op_info
->
Output
(
"Out"
).
front
();
auto
output
=
scope
->
FindVar
(
output_var_name
)
->
GetMutable
<
lite
::
Tensor
>
();
auto
output
=
scope
->
FindVar
(
output_var_name
)
->
GetMutable
<
lite
::
Tensor
>
();
auto
output_dims
=
output
->
dims
();
auto
output_dims
=
output
->
dims
();
const
int64_t
*
x_shape_data
=
const_cast
<
const
int64_t
*>
(
&
x_dims
.
data
()[
0
]);
bool
x_is_const
=
!
graph
->
HasNode
(
x_var_name
);
const
int64_t
*
output_shape_data
=
const_cast
<
const
int64_t
*>
(
&
output_dims
.
data
()[
0
]);
std
::
vector
<
int32_t
>
i_x_shape_data
(
x_dims
.
size
());
std
::
vector
<
int32_t
>
i_x_shape_data
(
x_dims
.
size
());
std
::
vector
<
int32_t
>
i_output_shape_data
(
output_dims
.
size
());
std
::
vector
<
int32_t
>
i_output_shape_data
(
output_dims
.
size
());
for
(
size_t
i
=
0
;
i
<
x_dims
.
size
();
i
++
)
{
for
(
size_t
i
=
0
;
i
<
x_dims
.
size
();
i
++
)
{
i_x_shape_data
[
i
]
=
static_cast
<
int
>
(
x_shape_data
[
i
])
;
i_x_shape_data
[
i
]
=
x_dims
[
i
]
;
}
}
for
(
size_t
i
=
0
;
i
<
output_dims
.
size
();
i
++
)
{
for
(
size_t
i
=
0
;
i
<
output_dims
.
size
();
i
++
)
{
i_output_shape_data
[
i
]
=
static_cast
<
int
>
(
output_shape_data
[
i
])
;
i_output_shape_data
[
i
]
=
output_dims
[
i
]
;
}
}
float
alpha
=
0.
f
;
float
alpha
=
0.
f
;
int
active_type_id
=
0
;
int
active_type_id
=
0
;
...
@@ -59,6 +58,15 @@ int ActConverter(void* ctx, OpLite* op, KernelBase* kernel) {
...
@@ -59,6 +58,15 @@ int ActConverter(void* ctx, OpLite* op, KernelBase* kernel) {
LOG
(
FATAL
)
<<
"[BM] unsupport act type"
;
LOG
(
FATAL
)
<<
"[BM] unsupport act type"
;
return
FAILED
;
return
FAILED
;
}
}
const
float
*
x_data
=
const_cast
<
const
float
*>
(
x
->
mutable_data
<
float
>
());
if
(
x_is_const
)
{
bm_add_const_tensor
(
graph
->
GetCompilerHandle
(),
static_cast
<
const
char
*>
(
x_var_name
.
c_str
()),
const_cast
<
const
int
*>
(
&
i_x_shape_data
[
0
]),
x_dims
.
size
(),
static_cast
<
bm_data_type_t
>
(
DTYPE_FP32
),
static_cast
<
const
void
*>
(
x_data
));
}
if
(
op_type
==
"relu"
||
op_type
==
"leaky_relu"
)
{
if
(
op_type
==
"relu"
||
op_type
==
"leaky_relu"
)
{
add_relu_layer
(
graph
->
GetCompilerHandle
(),
add_relu_layer
(
graph
->
GetCompilerHandle
(),
const_cast
<
const
int
*>
(
&
i_x_shape_data
[
0
]),
const_cast
<
const
int
*>
(
&
i_x_shape_data
[
0
]),
...
...
lite/kernels/bm/bridges/assign_value_op.cc
0 → 100644
浏览文件 @
917620e8
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <bmcompiler_defs.h>
#include <bmcompiler_if.h>
#include <bmcompiler_if_lite.h>
#include "lite/kernels/bm/bridges/graph.h"
#include "lite/kernels/bm/bridges/utility.h"
#include "lite/kernels/npu/bridges/registry.h"
namespace
paddle
{
namespace
lite
{
namespace
subgraph
{
namespace
bm
{
int
AssignValueConverter
(
void
*
ctx
,
OpLite
*
op
,
KernelBase
*
kernel
)
{
CHECK
(
ctx
!=
nullptr
);
CHECK
(
op
!=
nullptr
);
auto
graph
=
static_cast
<
Graph
*>
(
ctx
);
auto
scope
=
op
->
scope
();
auto
op_info
=
op
->
op_info
();
auto
output_var_name
=
op_info
->
Output
(
"Out"
).
front
();
auto
output
=
scope
->
FindVar
(
output_var_name
)
->
GetMutable
<
lite
::
Tensor
>
();
auto
output_dims
=
output
->
dims
();
std
::
vector
<
int32_t
>
i_output_shape_data
(
output_dims
.
size
());
int
buffer_size
=
1
;
for
(
size_t
i
=
0
;
i
<
output_dims
.
size
();
i
++
)
{
i_output_shape_data
[
i
]
=
static_cast
<
int
>
(
output_dims
[
i
]);
buffer_size
*=
i_output_shape_data
[
i
];
}
auto
fp32_values
=
op_info
->
GetAttr
<
std
::
vector
<
float
>>
(
"fp32_values"
);
float
*
assign_data
=
reinterpret_cast
<
float
*>
(
malloc
(
buffer_size
*
sizeof
(
float
)));
CHECK
(
assign_data
!=
nullptr
);
CHECK_EQ
(
buffer_size
,
fp32_values
.
size
());
bm_add_const_tensor
(
graph
->
GetCompilerHandle
(),
static_cast
<
const
char
*>
(
output_var_name
.
c_str
()),
const_cast
<
const
int
*>
(
i_output_shape_data
.
data
()),
output_dims
.
size
(),
static_cast
<
bm_data_type_t
>
(
DTYPE_FP32
),
reinterpret_cast
<
const
void
*>
(
assign_data
));
graph
->
AddNode
(
output_var_name
);
return
SUCCESS
;
}
}
// namespace bm
}
// namespace subgraph
}
// namespace lite
}
// namespace paddle
REGISTER_SUBGRAPH_BRIDGE
(
assign_value
,
kBM
,
paddle
::
lite
::
subgraph
::
bm
::
AssignValueConverter
);
lite/kernels/bm/bridges/conv_op.cc
浏览文件 @
917620e8
...
@@ -39,6 +39,7 @@ int ConvConverter(void* ctx, OpLite* op, KernelBase* kernel) {
...
@@ -39,6 +39,7 @@ int ConvConverter(void* ctx, OpLite* op, KernelBase* kernel) {
auto
filter_var_name
=
op_info
->
Input
(
"Filter"
).
front
();
auto
filter_var_name
=
op_info
->
Input
(
"Filter"
).
front
();
auto
filter
=
scope
->
FindVar
(
filter_var_name
)
->
GetMutable
<
lite
::
Tensor
>
();
auto
filter
=
scope
->
FindVar
(
filter_var_name
)
->
GetMutable
<
lite
::
Tensor
>
();
auto
filter_dims
=
filter
->
dims
();
auto
filter_dims
=
filter
->
dims
();
CHECK_EQ
(
input_dims
.
size
(),
4
);
CHECK_EQ
(
input_dims
.
size
(),
4
);
CHECK_EQ
(
output_dims
.
size
(),
4
);
CHECK_EQ
(
output_dims
.
size
(),
4
);
CHECK_EQ
(
filter_dims
.
size
(),
4
);
CHECK_EQ
(
filter_dims
.
size
(),
4
);
...
@@ -90,6 +91,7 @@ int ConvConverter(void* ctx, OpLite* op, KernelBase* kernel) {
...
@@ -90,6 +91,7 @@ int ConvConverter(void* ctx, OpLite* op, KernelBase* kernel) {
dilations
[
1
],
dilations
[
1
],
static_cast
<
int
>
(
has_bias
));
static_cast
<
int
>
(
has_bias
));
graph
->
AddNode
(
output_var_name
);
graph
->
AddNode
(
output_var_name
);
LOG
(
INFO
)
<<
output_var_name
<<
input_dims
<<
" "
<<
output_dims
;
return
SUCCESS
;
return
SUCCESS
;
}
}
...
...
lite/kernels/bm/bridges/elementwise_ops.cc
浏览文件 @
917620e8
...
@@ -65,6 +65,7 @@ int ElementwiseConverter(void* ctx, OpLite* op, KernelBase* kernel) {
...
@@ -65,6 +65,7 @@ int ElementwiseConverter(void* ctx, OpLite* op, KernelBase* kernel) {
auto
output_dims
=
output
->
dims
();
auto
output_dims
=
output
->
dims
();
const
int64_t
*
output_shape_data
=
const
int64_t
*
output_shape_data
=
const_cast
<
const
int64_t
*>
(
&
output_dims
.
data
()[
0
]);
const_cast
<
const
int64_t
*>
(
&
output_dims
.
data
()[
0
]);
LOG
(
INFO
)
<<
x_dims
<<
" "
<<
output_dims
;
std
::
vector
<
int32_t
>
i_output_shape_data
(
output_dims
.
size
());
std
::
vector
<
int32_t
>
i_output_shape_data
(
output_dims
.
size
());
for
(
size_t
i
=
0
;
i
<
output_dims
.
size
();
i
++
)
{
for
(
size_t
i
=
0
;
i
<
output_dims
.
size
();
i
++
)
{
i_output_shape_data
[
i
]
=
static_cast
<
int
>
(
output_shape_data
[
i
]);
i_output_shape_data
[
i
]
=
static_cast
<
int
>
(
output_shape_data
[
i
]);
...
...
lite/kernels/bm/bridges/fill_constant_op.cc
0 → 100644
浏览文件 @
917620e8
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <bmcompiler_defs.h>
#include <bmcompiler_if.h>
#include <bmcompiler_if_lite.h>
#include "lite/kernels/bm/bridges/graph.h"
#include "lite/kernels/bm/bridges/utility.h"
#include "lite/kernels/npu/bridges/registry.h"
namespace
paddle
{
namespace
lite
{
namespace
subgraph
{
namespace
bm
{
int
FillConstantConverter
(
void
*
ctx
,
OpLite
*
op
,
KernelBase
*
kernel
)
{
CHECK
(
ctx
!=
nullptr
);
CHECK
(
op
!=
nullptr
);
auto
graph
=
static_cast
<
Graph
*>
(
ctx
);
auto
scope
=
op
->
scope
();
auto
op_info
=
op
->
op_info
();
auto
output_var_name
=
op_info
->
Output
(
"Out"
).
front
();
auto
output
=
scope
->
FindVar
(
output_var_name
)
->
GetMutable
<
lite
::
Tensor
>
();
auto
output_dims
=
output
->
dims
();
std
::
vector
<
int32_t
>
i_output_shape_data
(
output_dims
.
size
());
int
buffer_size
=
1
;
for
(
size_t
i
=
0
;
i
<
output_dims
.
size
();
i
++
)
{
i_output_shape_data
[
i
]
=
static_cast
<
int
>
(
output_dims
[
i
]);
}
float
*
const_data
=
reinterpret_cast
<
float
*>
(
malloc
(
buffer_size
*
sizeof
(
float
)));
CHECK
(
const_data
!=
nullptr
);
auto
value
=
op_info
->
GetAttr
<
float
>
(
"value"
);
for
(
size_t
i
=
0
;
i
<
buffer_size
;
i
++
)
{
const_data
[
i
]
=
value
;
}
bm_add_const_tensor
(
graph
->
GetCompilerHandle
(),
static_cast
<
const
char
*>
(
output_var_name
.
c_str
()),
const_cast
<
const
int
*>
(
i_output_shape_data
.
data
()),
output_dims
.
size
(),
static_cast
<
bm_data_type_t
>
(
DTYPE_FP32
),
reinterpret_cast
<
const
void
*>
(
const_data
));
graph
->
AddNode
(
output_var_name
);
return
SUCCESS
;
}
}
// namespace bm
}
// namespace subgraph
}
// namespace lite
}
// namespace paddle
REGISTER_SUBGRAPH_BRIDGE
(
fill_constant
,
kBM
,
paddle
::
lite
::
subgraph
::
bm
::
FillConstantConverter
);
lite/kernels/bm/bridges/mul_op.cc
浏览文件 @
917620e8
...
@@ -29,7 +29,6 @@ int MulConverter(void* ctx, OpLite* op, KernelBase* kernel) {
...
@@ -29,7 +29,6 @@ int MulConverter(void* ctx, OpLite* op, KernelBase* kernel) {
auto
op_info
=
op
->
op_info
();
auto
op_info
=
op
->
op_info
();
auto
op_type
=
op_info
->
Type
();
auto
op_type
=
op_info
->
Type
();
auto
unique_op_name
=
lite
::
subgraph
::
bm
::
UniqueName
(
op_type
);
auto
unique_op_name
=
lite
::
subgraph
::
bm
::
UniqueName
(
op_type
);
// only support y is const
// input
// input
auto
x_var_name
=
op_info
->
Input
(
"X"
).
front
();
auto
x_var_name
=
op_info
->
Input
(
"X"
).
front
();
auto
x
=
scope
->
FindVar
(
x_var_name
)
->
GetMutable
<
lite
::
Tensor
>
();
auto
x
=
scope
->
FindVar
(
x_var_name
)
->
GetMutable
<
lite
::
Tensor
>
();
...
@@ -61,6 +60,12 @@ int MulConverter(void* ctx, OpLite* op, KernelBase* kernel) {
...
@@ -61,6 +60,12 @@ int MulConverter(void* ctx, OpLite* op, KernelBase* kernel) {
auto
y_var_name
=
op_info
->
Input
(
"Y"
).
front
();
auto
y_var_name
=
op_info
->
Input
(
"Y"
).
front
();
auto
y
=
scope
->
FindVar
(
y_var_name
)
->
GetMutable
<
lite
::
Tensor
>
();
auto
y
=
scope
->
FindVar
(
y_var_name
)
->
GetMutable
<
lite
::
Tensor
>
();
auto
y_dims
=
y
->
dims
();
auto
y_dims
=
y
->
dims
();
bool
y_is_const
=
!
graph
->
HasNode
(
y_var_name
);
CHECK_EQ
(
y_dims
.
size
(),
2
);
int
i_y_shape_data
[
2
];
for
(
size_t
i
=
0
;
i
<
2
;
i
++
)
{
i_y_shape_data
[
i
]
=
y_dims
[
i
];
}
// output
// output
auto
output_var_name
=
op_info
->
Output
(
"Out"
).
front
();
auto
output_var_name
=
op_info
->
Output
(
"Out"
).
front
();
auto
output
=
scope
->
FindVar
(
output_var_name
)
->
GetMutable
<
lite
::
Tensor
>
();
auto
output
=
scope
->
FindVar
(
output_var_name
)
->
GetMutable
<
lite
::
Tensor
>
();
...
@@ -71,20 +76,39 @@ int MulConverter(void* ctx, OpLite* op, KernelBase* kernel) {
...
@@ -71,20 +76,39 @@ int MulConverter(void* ctx, OpLite* op, KernelBase* kernel) {
for
(
size_t
i
=
0
;
i
<
output_dims
.
size
();
i
++
)
{
for
(
size_t
i
=
0
;
i
<
output_dims
.
size
();
i
++
)
{
i_output_shape_data
[
i
]
=
static_cast
<
int
>
(
output_shape_data
[
i
]);
i_output_shape_data
[
i
]
=
static_cast
<
int
>
(
output_shape_data
[
i
]);
}
}
add_fc_layer
(
graph
->
GetCompilerHandle
(),
if
(
y_is_const
)
{
const_cast
<
const
int
*>
(
&
i_x_reshape_shape_data
[
0
]),
add_fc_layer
(
graph
->
GetCompilerHandle
(),
2
,
const_cast
<
const
int
*>
(
&
i_x_reshape_shape_data
[
0
]),
static_cast
<
const
char
*>
(
unique_op_reshape_name
.
c_str
()),
2
,
const_cast
<
const
int
*>
(
&
i_output_shape_data
[
0
]),
static_cast
<
const
char
*>
(
unique_op_reshape_name
.
c_str
()),
output_dims
.
size
(),
const_cast
<
const
int
*>
(
&
i_output_shape_data
[
0
]),
static_cast
<
const
char
*>
(
output_var_name
.
c_str
()),
output_dims
.
size
(),
static_cast
<
const
char
*>
(
unique_op_name
.
c_str
()),
static_cast
<
const
char
*>
(
output_var_name
.
c_str
()),
i_x_reshape_shape_data
[
1
],
static_cast
<
const
char
*>
(
unique_op_name
.
c_str
()),
i_output_shape_data
[
1
],
i_x_reshape_shape_data
[
1
],
static_cast
<
const
float
*>
(
y
->
mutable_data
<
float
>
()),
i_output_shape_data
[
1
],
nullptr
,
static_cast
<
const
float
*>
(
y
->
mutable_data
<
float
>
()),
0
,
nullptr
,
0
);
0
,
0
);
}
else
{
add_fc_weight_layer
(
graph
->
GetCompilerHandle
(),
const_cast
<
const
int
*>
(
&
i_x_reshape_shape_data
[
0
]),
2
,
static_cast
<
const
char
*>
(
unique_op_reshape_name
.
c_str
()),
const_cast
<
const
int
*>
(
&
i_output_shape_data
[
0
]),
output_dims
.
size
(),
static_cast
<
const
char
*>
(
output_var_name
.
c_str
()),
static_cast
<
const
char
*>
(
unique_op_name
.
c_str
()),
const_cast
<
const
int
*>
(
&
i_y_shape_data
[
0
]),
2
,
static_cast
<
const
char
*>
(
y_var_name
.
c_str
()),
i_x_reshape_shape_data
[
1
],
nullptr
,
0
,
0
);
}
graph
->
AddNode
(
output_var_name
);
graph
->
AddNode
(
output_var_name
);
return
SUCCESS
;
return
SUCCESS
;
}
}
...
...
lite/kernels/bm/bridges/paddle_use_bridges.h
浏览文件 @
917620e8
...
@@ -51,3 +51,5 @@ USE_SUBGRAPH_BRIDGE(reduce_mean, kBM);
...
@@ -51,3 +51,5 @@ USE_SUBGRAPH_BRIDGE(reduce_mean, kBM);
USE_SUBGRAPH_BRIDGE
(
squeeze
,
kBM
);
USE_SUBGRAPH_BRIDGE
(
squeeze
,
kBM
);
USE_SUBGRAPH_BRIDGE
(
squeeze2
,
kBM
);
USE_SUBGRAPH_BRIDGE
(
squeeze2
,
kBM
);
USE_SUBGRAPH_BRIDGE
(
cast
,
kBM
);
USE_SUBGRAPH_BRIDGE
(
cast
,
kBM
);
USE_SUBGRAPH_BRIDGE
(
fill_constant
,
kBM
);
USE_SUBGRAPH_BRIDGE
(
assign_value
,
kBM
);
lite/kernels/bm/subgraph_compute.cc
浏览文件 @
917620e8
...
@@ -35,7 +35,7 @@ int SubgraphEngine::BuildDeviceProgram() {
...
@@ -35,7 +35,7 @@ int SubgraphEngine::BuildDeviceProgram() {
graph
.
CreateCompilerHandle
();
graph
.
CreateCompilerHandle
();
auto
&
ctx
=
this
->
ctx_
->
template
As
<
BMContext
>();
auto
&
ctx
=
this
->
ctx_
->
template
As
<
BMContext
>();
for
(
auto
&
inst
:
origin_program_
)
{
for
(
auto
&
inst
:
origin_program_
)
{
auto
op
=
inst
.
op
(
);
auto
op
=
const_cast
<
OpLite
*>
(
inst
.
op
()
);
CHECK
(
op
);
CHECK
(
op
);
op
->
CheckShape
();
op
->
CheckShape
();
op
->
InferShape
();
op
->
InferShape
();
...
...
lite/kernels/host/CMakeLists.txt
浏览文件 @
917620e8
...
@@ -5,6 +5,3 @@ add_kernel(fetch_compute_host Host basic SRCS fetch_compute.cc DEPS ${lite_kerne
...
@@ -5,6 +5,3 @@ add_kernel(fetch_compute_host Host basic SRCS fetch_compute.cc DEPS ${lite_kerne
add_kernel
(
reshape_compute_host Host basic SRCS reshape_compute.cc DEPS
${
lite_kernel_deps
}
reshape_op
)
add_kernel
(
reshape_compute_host Host basic SRCS reshape_compute.cc DEPS
${
lite_kernel_deps
}
reshape_op
)
add_kernel
(
multiclass_nms_compute_host Host basic SRCS multiclass_nms_compute.cc DEPS
${
lite_kernel_deps
}
)
add_kernel
(
multiclass_nms_compute_host Host basic SRCS multiclass_nms_compute.cc DEPS
${
lite_kernel_deps
}
)
add_kernel
(
crf_decoding_compute_host Host extra SRCS crf_decoding_compute.cc DEPS
${
lite_kernel_deps
}
)
add_kernel
(
crf_decoding_compute_host Host extra SRCS crf_decoding_compute.cc DEPS
${
lite_kernel_deps
}
)
#lite_cc_test(test_reshape_compute_host SRCS reshape_compute_test.cc DEPS reshape_compute_host any)
#lite_cc_test(test_multiclass_nms_compute_host SRCS multiclass_nms_compute_test.cc DEPS multiclass_nms_compute_host any)
lite/kernels/host/multiclass_nms_compute.cc
浏览文件 @
917620e8
...
@@ -370,6 +370,7 @@ void MulticlassNmsCompute::Run() {
...
@@ -370,6 +370,7 @@ void MulticlassNmsCompute::Run() {
}
}
}
else
{
}
else
{
outs
->
Resize
({
static_cast
<
int64_t
>
(
num_kept
),
out_dim
});
outs
->
Resize
({
static_cast
<
int64_t
>
(
num_kept
),
out_dim
});
outs
->
mutable_data
<
float
>
();
int
offset
=
0
;
int
offset
=
0
;
int
*
oindices
=
nullptr
;
int
*
oindices
=
nullptr
;
for
(
int
i
=
0
;
i
<
n
;
++
i
)
{
for
(
int
i
=
0
;
i
<
n
;
++
i
)
{
...
...
lite/kernels/host/multiclass_nms_compute_test.cc
已删除
100644 → 0
浏览文件 @
7446e257
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "lite/kernels/host/multiclass_nms_compute.h"
#include <gtest/gtest.h>
#include <map>
#include <utility>
#include <vector>
namespace
paddle
{
namespace
lite
{
namespace
kernels
{
namespace
host
{
template
<
typename
dtype
>
static
bool
sort_score_pair_descend
(
const
std
::
pair
<
float
,
dtype
>&
pair1
,
const
std
::
pair
<
float
,
dtype
>&
pair2
)
{
return
pair1
.
first
>
pair2
.
first
;
}
template
<
typename
dtype
>
void
get_max_score_index
(
const
dtype
*
scores
,
int
num
,
float
threshold
,
int
top_k
,
std
::
vector
<
std
::
pair
<
dtype
,
int
>>*
score_index_vec
)
{
//! Generate index score pairs.
for
(
int
i
=
0
;
i
<
num
;
++
i
)
{
if
(
scores
[
i
]
>
threshold
)
{
score_index_vec
->
push_back
(
std
::
make_pair
(
scores
[
i
],
i
));
}
}
//! Sort the score pair according to the scores in descending order
std
::
stable_sort
(
score_index_vec
->
begin
(),
score_index_vec
->
end
(),
sort_score_pair_descend
<
int
>
);
//! Keep top_k scores if needed.
if
(
top_k
>
-
1
&&
top_k
<
score_index_vec
->
size
())
{
score_index_vec
->
resize
(
top_k
);
}
}
template
<
typename
dtype
>
dtype
bbox_size
(
const
dtype
*
bbox
,
bool
normalized
=
true
)
{
if
(
bbox
[
2
]
<
bbox
[
0
]
||
bbox
[
3
]
<
bbox
[
1
])
{
// If bbox is invalid (e.g. xmax < xmin or ymax < ymin), return 0.
return
dtype
(
0.
);
}
else
{
const
dtype
width
=
bbox
[
2
]
-
bbox
[
0
];
const
dtype
height
=
bbox
[
3
]
-
bbox
[
1
];
if
(
normalized
)
{
return
width
*
height
;
}
else
{
// If bbox is not within range [0, 1].
return
(
width
+
1
)
*
(
height
+
1
);
}
}
}
template
<
typename
dtype
>
dtype
jaccard_overlap
(
const
dtype
*
bbox1
,
const
dtype
*
bbox2
)
{
if
(
bbox2
[
0
]
>
bbox1
[
2
]
||
bbox2
[
2
]
<
bbox1
[
0
]
||
bbox2
[
1
]
>
bbox1
[
3
]
||
bbox2
[
3
]
<
bbox1
[
1
])
{
return
dtype
(
0.
);
}
else
{
const
dtype
inter_xmin
=
std
::
max
(
bbox1
[
0
],
bbox2
[
0
]);
const
dtype
inter_ymin
=
std
::
max
(
bbox1
[
1
],
bbox2
[
1
]);
const
dtype
inter_xmax
=
std
::
min
(
bbox1
[
2
],
bbox2
[
2
]);
const
dtype
inter_ymax
=
std
::
min
(
bbox1
[
3
],
bbox2
[
3
]);
const
dtype
inter_width
=
inter_xmax
-
inter_xmin
;
const
dtype
inter_height
=
inter_ymax
-
inter_ymin
;
const
dtype
inter_size
=
inter_width
*
inter_height
;
const
dtype
bbox1_size
=
bbox_size
(
bbox1
);
const
dtype
bbox2_size
=
bbox_size
(
bbox2
);
return
inter_size
/
(
bbox1_size
+
bbox2_size
-
inter_size
);
}
}
template
<
typename
dtype
>
void
apply_nms_fast
(
const
dtype
*
bboxes
,
const
dtype
*
scores
,
int
num
,
float
score_threshold
,
float
nms_threshold
,
float
eta
,
int
top_k
,
std
::
vector
<
int
>*
indices
)
{
// Get top_k scores (with corresponding indices).
std
::
vector
<
std
::
pair
<
dtype
,
int
>>
score_index_vec
;
get_max_score_index
(
scores
,
num
,
score_threshold
,
top_k
,
&
score_index_vec
);
// Do nms.
float
adaptive_threshold
=
nms_threshold
;
indices
->
clear
();
while
(
score_index_vec
.
size
()
!=
0
)
{
const
int
idx
=
score_index_vec
.
front
().
second
;
bool
keep
=
true
;
for
(
int
k
=
0
;
k
<
indices
->
size
();
++
k
)
{
if
(
keep
)
{
const
int
kept_idx
=
(
*
indices
)[
k
];
float
overlap
=
jaccard_overlap
(
bboxes
+
idx
*
4
,
bboxes
+
kept_idx
*
4
);
keep
=
overlap
<=
adaptive_threshold
;
}
else
{
break
;
}
}
if
(
keep
)
{
indices
->
push_back
(
idx
);
}
score_index_vec
.
erase
(
score_index_vec
.
begin
());
if
(
keep
&&
eta
<
1
&&
adaptive_threshold
>
0.5
)
{
adaptive_threshold
*=
eta
;
}
}
}
template
<
typename
dtype
>
void
multiclass_nms_compute_ref
(
const
operators
::
MulticlassNmsParam
&
param
,
int
class_num
,
const
std
::
vector
<
int
>&
priors
,
bool
share_location
,
std
::
vector
<
float
>*
result
)
{
int
background_id
=
param
.
background_label
;
int
keep_topk
=
param
.
keep_top_k
;
int
nms_topk
=
param
.
nms_top_k
;
float
conf_thresh
=
param
.
score_threshold
;
float
nms_thresh
=
param
.
nms_threshold
;
float
nms_eta
=
param
.
nms_eta
;
const
dtype
*
bbox_data
=
param
.
bboxes
->
data
<
const
dtype
>
();
const
dtype
*
conf_data
=
param
.
scores
->
data
<
const
dtype
>
();
dtype
*
out
=
param
.
out
->
mutable_data
<
dtype
>
();
(
*
result
).
clear
();
int
num_kept
=
0
;
std
::
vector
<
std
::
map
<
int
,
std
::
vector
<
int
>>>
all_indices
;
int64_t
conf_offset
=
0
;
int64_t
bbox_offset
=
0
;
for
(
int
i
=
0
;
i
<
priors
.
size
();
++
i
)
{
std
::
map
<
int
,
std
::
vector
<
int
>>
indices
;
int
num_det
=
0
;
int
num_priors
=
priors
[
i
];
int
conf_idx
=
class_num
*
conf_offset
;
int
bbox_idx
=
share_location
?
bbox_offset
*
4
:
bbox_offset
*
4
*
class_num
;
for
(
int
c
=
0
;
c
<
class_num
;
++
c
)
{
if
(
c
==
background_id
)
{
// Ignore background class
continue
;
}
const
dtype
*
cur_conf_data
=
conf_data
+
conf_idx
+
c
*
num_priors
;
const
dtype
*
cur_bbox_data
=
bbox_data
+
bbox_idx
;
if
(
!
share_location
)
{
cur_bbox_data
+=
c
*
num_priors
*
4
;
}
apply_nms_fast
(
cur_bbox_data
,
cur_conf_data
,
num_priors
,
conf_thresh
,
nms_thresh
,
nms_eta
,
nms_topk
,
&
(
indices
[
c
]));
num_det
+=
indices
[
c
].
size
();
}
if
(
keep_topk
>
-
1
&&
num_det
>
keep_topk
)
{
std
::
vector
<
std
::
pair
<
float
,
std
::
pair
<
int
,
int
>>>
score_index_pairs
;
for
(
auto
it
=
indices
.
begin
();
it
!=
indices
.
end
();
++
it
)
{
int
label
=
it
->
first
;
const
std
::
vector
<
int
>&
label_indices
=
it
->
second
;
for
(
int
j
=
0
;
j
<
label_indices
.
size
();
++
j
)
{
int
idx
=
label_indices
[
j
];
float
score
=
conf_data
[
conf_idx
+
label
*
num_priors
+
idx
];
score_index_pairs
.
push_back
(
std
::
make_pair
(
score
,
std
::
make_pair
(
label
,
idx
)));
}
}
// Keep top k results per image.
std
::
stable_sort
(
score_index_pairs
.
begin
(),
score_index_pairs
.
end
(),
sort_score_pair_descend
<
std
::
pair
<
int
,
int
>>
);
score_index_pairs
.
resize
(
keep_topk
);
// Store the new indices.
std
::
map
<
int
,
std
::
vector
<
int
>>
new_indices
;
for
(
int
j
=
0
;
j
<
score_index_pairs
.
size
();
++
j
)
{
int
label
=
score_index_pairs
[
j
].
second
.
first
;
int
idx
=
score_index_pairs
[
j
].
second
.
second
;
new_indices
[
label
].
push_back
(
idx
);
}
all_indices
.
push_back
(
new_indices
);
num_kept
+=
keep_topk
;
}
else
{
all_indices
.
push_back
(
indices
);
num_kept
+=
num_det
;
}
conf_offset
+=
num_priors
;
bbox_offset
+=
num_priors
;
}
if
(
num_kept
==
0
)
{
(
*
result
).
clear
();
(
*
result
).
resize
(
1
);
(
*
result
)[
0
]
=
-
1
;
return
;
}
else
{
(
*
result
).
resize
(
num_kept
*
6
);
}
int
count
=
0
;
conf_offset
=
0
;
bbox_offset
=
0
;
for
(
int
i
=
0
;
i
<
priors
.
size
();
++
i
)
{
int
num_priors
=
priors
[
i
];
int
conf_idx
=
class_num
*
conf_offset
;
int
bbox_idx
=
share_location
?
bbox_offset
*
4
:
bbox_offset
*
4
*
class_num
;
for
(
auto
it
=
all_indices
[
i
].
begin
();
it
!=
all_indices
[
i
].
end
();
++
it
)
{
int
label
=
it
->
first
;
std
::
vector
<
int
>&
indices
=
it
->
second
;
const
dtype
*
cur_conf_data
=
conf_data
+
conf_idx
+
label
*
num_priors
;
const
dtype
*
cur_bbox_data
=
bbox_data
+
bbox_idx
;
if
(
!
share_location
)
{
cur_bbox_data
+=
label
*
num_priors
*
4
;
}
for
(
int
j
=
0
;
j
<
indices
.
size
();
++
j
)
{
int
idx
=
indices
[
j
];
(
*
result
)[
count
*
6
]
=
label
;
(
*
result
)[
count
*
6
+
1
]
=
cur_conf_data
[
idx
];
for
(
int
k
=
0
;
k
<
4
;
++
k
)
{
(
*
result
)[
count
*
6
+
2
+
k
]
=
cur_bbox_data
[
idx
*
4
+
k
];
}
++
count
;
}
}
conf_offset
+=
num_priors
;
bbox_offset
+=
num_priors
;
}
}
TEST
(
multiclass_nms_host
,
init
)
{
MulticlassNmsCompute
multiclass_nms
;
ASSERT_EQ
(
multiclass_nms
.
precision
(),
PRECISION
(
kFloat
));
ASSERT_EQ
(
multiclass_nms
.
target
(),
TARGET
(
kHost
));
}
TEST
(
multiclass_nms_host
,
retrive_op
)
{
auto
multiclass_nms
=
KernelRegistry
::
Global
().
Create
<
TARGET
(
kHost
),
PRECISION
(
kFloat
)
>
(
"multiclass_nms"
);
ASSERT_FALSE
(
multiclass_nms
.
empty
());
ASSERT_TRUE
(
multiclass_nms
.
front
());
}
TEST
(
multiclass_nms_host
,
compute
)
{
MulticlassNmsCompute
multiclass_nms
;
operators
::
MulticlassNmsParam
param
;
lite
::
Tensor
bbox
,
conf
,
out
;
std
::
vector
<
float
>
out_ref
;
for
(
std
::
vector
<
int
>
priors
:
{
std
::
vector
<
int
>
({
2
,
2
,
2
})})
{
int
N
=
priors
.
size
();
for
(
bool
share_location
:
{
true
})
{
for
(
int
class_num
:
{
1
,
4
,
10
})
{
DDim
*
bbox_dim
;
DDim
*
conf_dim
;
int
M
=
priors
[
0
];
if
(
share_location
)
{
bbox_dim
=
new
DDim
({
N
,
M
,
4
});
}
else
{
bbox_dim
=
new
DDim
({
class_num
,
M
,
4
});
}
conf_dim
=
new
DDim
({
N
,
class_num
,
M
});
bbox
.
Resize
(
*
bbox_dim
);
conf
.
Resize
(
*
conf_dim
);
for
(
int
background_id
:
{
0
})
{
for
(
int
keep_topk
:
{
1
,
5
,
10
})
{
for
(
int
nms_topk
:
{
1
,
5
,
10
})
{
for
(
float
nms_eta
:
{
1.0
,
0.99
,
0.9
})
{
for
(
float
nms_thresh
:
{
0.5
,
0.7
})
{
for
(
float
conf_thresh
:
{
0.5
,
0.7
})
{
auto
*
conf_data
=
conf
.
mutable_data
<
float
>
();
auto
*
bbox_data
=
bbox
.
mutable_data
<
float
>
();
for
(
int
i
=
0
;
i
<
bbox_dim
->
production
();
++
i
)
{
bbox_data
[
i
]
=
i
*
1.
/
bbox_dim
->
production
();
}
for
(
int
i
=
0
;
i
<
conf_dim
->
production
();
++
i
)
{
conf_data
[
i
]
=
i
*
1.
/
conf_dim
->
production
();
}
param
.
bboxes
=
&
bbox
;
param
.
scores
=
&
conf
;
param
.
out
=
&
out
;
param
.
background_label
=
background_id
;
param
.
keep_top_k
=
keep_topk
;
param
.
nms_top_k
=
nms_topk
;
param
.
score_threshold
=
conf_thresh
;
param
.
nms_threshold
=
nms_thresh
;
param
.
nms_eta
=
nms_eta
;
multiclass_nms
.
SetParam
(
param
);
multiclass_nms
.
Run
();
auto
*
out_data
=
out
.
mutable_data
<
float
>
();
out_ref
.
clear
();
multiclass_nms_compute_ref
<
float
>
(
param
,
class_num
,
priors
,
share_location
,
&
out_ref
);
EXPECT_EQ
(
out
.
dims
().
production
(),
out_ref
.
size
());
if
(
out
.
dims
().
production
()
==
out_ref
.
size
())
{
auto
*
out_ref_data
=
out_ref
.
data
();
for
(
int
i
=
0
;
i
<
out
.
dims
().
production
();
i
++
)
{
EXPECT_NEAR
(
out_data
[
i
],
out_ref_data
[
i
],
1e-5
);
}
}
}
}
}
}
}
}
delete
bbox_dim
;
delete
conf_dim
;
}
}
}
}
}
// namespace host
}
// namespace kernels
}
// namespace lite
}
// namespace paddle
USE_LITE_KERNEL
(
multiclass_nms
,
kHost
,
kFloat
,
kNCHW
,
def
);
lite/kernels/opencl/activation_buffer_compute.cc
浏览文件 @
917620e8
...
@@ -32,8 +32,10 @@ class ReluCompute
...
@@ -32,8 +32,10 @@ class ReluCompute
std
::
string
doc
()
const
override
{
return
"Relu using cl::Buffer, kFloat"
;
}
std
::
string
doc
()
const
override
{
return
"Relu using cl::Buffer, kFloat"
;
}
void
PrepareForRun
()
override
{
void
PrepareForRun
()
override
{
auto
&
context
=
ctx_
->
As
<
OpenCLContext
>
();
auto
&
context
=
ctx_
->
As
<
OpenCLContext
>
();
context
.
cl_context
()
->
AddKernel
(
context
.
cl_context
()
->
AddKernel
(
kernel_func_name_
,
kernel_func_name_
,
"buffer/relu_kernel.cl"
,
build_options_
);
"buffer/relu_kernel.cl"
,
build_options_
,
time_stamp_
);
}
}
void
Run
()
override
{
void
Run
()
override
{
...
@@ -46,7 +48,7 @@ class ReluCompute
...
@@ -46,7 +48,7 @@ class ReluCompute
auto
*
x_buf
=
param
.
X
->
data
<
float
,
cl
::
Buffer
>
();
auto
*
x_buf
=
param
.
X
->
data
<
float
,
cl
::
Buffer
>
();
auto
*
out_buf
=
param
.
Out
->
mutable_data
<
float
,
cl
::
Buffer
>
(
TARGET
(
kOpenCL
));
auto
*
out_buf
=
param
.
Out
->
mutable_data
<
float
,
cl
::
Buffer
>
(
TARGET
(
kOpenCL
));
STL
::
stringstream
kernel_key
;
STL
::
stringstream
kernel_key
;
kernel_key
<<
kernel_func_name_
<<
build_options_
;
kernel_key
<<
kernel_func_name_
<<
build_options_
<<
time_stamp_
;
auto
kernel
=
context
.
cl_context
()
->
GetKernel
(
kernel_key
.
str
());
auto
kernel
=
context
.
cl_context
()
->
GetKernel
(
kernel_key
.
str
());
VLOG
(
4
)
<<
TargetToStr
(
param
.
X
->
target
());
VLOG
(
4
)
<<
TargetToStr
(
param
.
X
->
target
());
VLOG
(
4
)
<<
TargetToStr
(
param
.
Out
->
target
());
VLOG
(
4
)
<<
TargetToStr
(
param
.
Out
->
target
());
...
@@ -74,6 +76,7 @@ class ReluCompute
...
@@ -74,6 +76,7 @@ class ReluCompute
private:
private:
std
::
string
kernel_func_name_
{
"relu"
};
std
::
string
kernel_func_name_
{
"relu"
};
std
::
string
build_options_
{
"-DCL_DTYPE_float -DRELU"
};
std
::
string
build_options_
{
"-DCL_DTYPE_float -DRELU"
};
std
::
string
time_stamp_
{
GetTimeStamp
()};
std
::
shared_ptr
<
cl
::
Event
>
event_
{
new
cl
::
Event
};
std
::
shared_ptr
<
cl
::
Event
>
event_
{
new
cl
::
Event
};
};
};
...
@@ -87,8 +90,10 @@ class SigmoidCompute
...
@@ -87,8 +90,10 @@ class SigmoidCompute
}
}
void
PrepareForRun
()
override
{
void
PrepareForRun
()
override
{
auto
&
context
=
ctx_
->
As
<
OpenCLContext
>
();
auto
&
context
=
ctx_
->
As
<
OpenCLContext
>
();
context
.
cl_context
()
->
AddKernel
(
context
.
cl_context
()
->
AddKernel
(
kernel_func_name_
,
kernel_func_name_
,
"buffer/sigmoid_kernel.cl"
,
build_options_
);
"buffer/sigmoid_kernel.cl"
,
build_options_
,
time_stamp_
);
}
}
void
Run
()
override
{
void
Run
()
override
{
...
@@ -101,7 +106,7 @@ class SigmoidCompute
...
@@ -101,7 +106,7 @@ class SigmoidCompute
auto
*
x_buf
=
param
.
X
->
data
<
float
,
cl
::
Buffer
>
();
auto
*
x_buf
=
param
.
X
->
data
<
float
,
cl
::
Buffer
>
();
auto
*
out_buf
=
param
.
Out
->
mutable_data
<
float
,
cl
::
Buffer
>
(
TARGET
(
kOpenCL
));
auto
*
out_buf
=
param
.
Out
->
mutable_data
<
float
,
cl
::
Buffer
>
(
TARGET
(
kOpenCL
));
STL
::
stringstream
kernel_key
;
STL
::
stringstream
kernel_key
;
kernel_key
<<
kernel_func_name_
<<
build_options_
;
kernel_key
<<
kernel_func_name_
<<
build_options_
<<
time_stamp
;
auto
kernel
=
context
.
cl_context
()
->
GetKernel
(
kernel_key
.
str
());
auto
kernel
=
context
.
cl_context
()
->
GetKernel
(
kernel_key
.
str
());
VLOG
(
4
)
<<
TargetToStr
(
param
.
X
->
target
());
VLOG
(
4
)
<<
TargetToStr
(
param
.
X
->
target
());
VLOG
(
4
)
<<
TargetToStr
(
param
.
Out
->
target
());
VLOG
(
4
)
<<
TargetToStr
(
param
.
Out
->
target
());
...
@@ -129,6 +134,7 @@ class SigmoidCompute
...
@@ -129,6 +134,7 @@ class SigmoidCompute
private:
private:
std
::
string
kernel_func_name_
{
"sigmoid"
};
std
::
string
kernel_func_name_
{
"sigmoid"
};
std
::
string
build_options_
{
"-DCL_DTYPE_float -DSIGMOID"
};
std
::
string
build_options_
{
"-DCL_DTYPE_float -DSIGMOID"
};
std
::
string
time_stamp_
{
GetTimeStamp
()};
std
::
shared_ptr
<
cl
::
Event
>
event_
{
new
cl
::
Event
};
std
::
shared_ptr
<
cl
::
Event
>
event_
{
new
cl
::
Event
};
};
};
...
...
lite/kernels/opencl/activation_image_compute.cc
浏览文件 @
917620e8
...
@@ -37,11 +37,12 @@ class ActivationComputeImageDefault
...
@@ -37,11 +37,12 @@ class ActivationComputeImageDefault
}
}
void
PrepareForRun
()
override
{
void
PrepareForRun
()
override
{
auto
&
context
=
ctx_
->
As
<
OpenCLContext
>
();
act_param_
=
param_
.
get_mutable
<
param_t
>
();
act_param_
=
param_
.
get_mutable
<
param_t
>
();
int
act_type
=
static_cast
<
int
>
(
act_param_
->
active_type
);
int
act_type
=
static_cast
<
int
>
(
act_param_
->
active_type
);
#ifndef LITE_SHUTDOWN_LOG
VLOG
(
1
)
<<
"ActivationTypeToStr(act_param_->active_type):"
VLOG
(
1
)
<<
"ActivationTypeToStr(act_param_->active_type):"
<<
ActivationTypeToStr
(
act_param_
->
active_type
);
<<
ActivationTypeToStr
(
act_param_
->
active_type
);
#endif
switch
(
act_type
)
{
switch
(
act_type
)
{
case
1
:
case
1
:
kernel_func_name_
=
"relu"
;
kernel_func_name_
=
"relu"
;
...
@@ -71,41 +72,70 @@ class ActivationComputeImageDefault
...
@@ -71,41 +72,70 @@ class ActivationComputeImageDefault
LOG
(
FATAL
)
<<
"This act type:"
<<
act_type
<<
" doesn't support."
;
LOG
(
FATAL
)
<<
"This act type:"
<<
act_type
<<
" doesn't support."
;
return
;
return
;
}
}
#ifndef LITE_SHUTDOWN_LOG
VLOG
(
1
)
<<
"kernel_func_name_:"
<<
kernel_func_name_
;
VLOG
(
1
)
<<
"kernel_func_name_:"
<<
kernel_func_name_
;
context
.
cl_context
()
->
AddKernel
(
#endif
kernel_func_name_
,
"image/activation_kernel.cl"
,
build_options_
);
}
void
Run
()
override
{
auto
&
param
=
*
param_
.
get_mutable
<
param_t
>
();
const
auto
&
x_dims
=
param
.
X
->
dims
();
auto
*
x_img
=
param
.
X
->
data
<
half_t
,
cl
::
Image2D
>
();
auto
image_shape
=
InitImageDimInfoWith
(
x_dims
);
auto
*
out_img
=
param
.
Out
->
mutable_data
<
half_t
,
cl
::
Image2D
>
(
image_shape
[
"width"
],
image_shape
[
"height"
]);
const
auto
&
y_dims
=
param
.
Out
->
dims
();
// useless: check dim only
auto
&
context
=
ctx_
->
As
<
OpenCLContext
>
();
auto
&
context
=
ctx_
->
As
<
OpenCLContext
>
();
CHECK
(
context
.
cl_context
()
!=
nullptr
);
context
.
cl_context
()
->
AddKernel
(
kernel_func_name_
,
"image/activation_kernel.cl"
,
build_options_
,
time_stamp_
);
STL
::
stringstream
kernel_key
;
STL
::
stringstream
kernel_key
;
kernel_key
<<
kernel_func_name_
<<
build_options_
;
kernel_key
<<
kernel_func_name_
<<
build_options_
<<
time_stamp_
;
auto
kernel
=
context
.
cl_context
()
->
GetKernel
(
kernel_key
.
str
());
kernel_
=
context
.
cl_context
()
->
GetKernel
(
kernel_key
.
str
());
}
int
arg_idx
=
0
;
void
ReInitWhenNeeded
()
override
{
cl_int
status
=
kernel
.
setArg
(
arg_idx
,
*
x_img
);
act_param_
=
param_
.
get_mutable
<
param_t
>
();
auto
x_dims
=
act_param_
->
X
->
dims
();
if
((
!
first_epoch_for_reinit_
&&
x_dims
!=
last_x_dims_
)
||
first_epoch_for_reinit_
)
{
last_x_dims_
=
x_dims
;
first_epoch_for_reinit_
=
false
;
// compute image shape
paddle
::
lite
::
CLImageConverterDefault
default_convertor
;
x_img_shape_
=
default_convertor
.
InitImageDimInfoWith
(
act_param_
->
X
->
dims
());
// w, h
out_img_shape_
=
default_convertor
.
InitImageDimInfoWith
(
act_param_
->
Out
->
dims
());
// w, h
// compute global work size
GetGlobalWorkSize
();
}
}
void
GetGlobalWorkSize
()
{
global_work_size_
=
cl
::
NDRange
{
static_cast
<
cl
::
size_type
>
(
x_img_shape_
[
0
]),
static_cast
<
cl
::
size_type
>
(
x_img_shape_
[
1
])};
}
void
Run
()
override
{
auto
*
x_img
=
act_param_
->
X
->
data
<
half_t
,
cl
::
Image2D
>
();
auto
*
out_img
=
act_param_
->
Out
->
mutable_data
<
half_t
,
cl
::
Image2D
>
(
out_img_shape_
[
0
],
out_img_shape_
[
1
]);
auto
kernel
=
kernel_
;
cl_int
status
;
status
=
kernel
.
setArg
(
0
,
*
x_img
);
CL_CHECK_FATAL
(
status
);
CL_CHECK_FATAL
(
status
);
status
=
kernel
.
setArg
(
++
arg_idx
,
*
out_img
);
status
=
kernel
.
setArg
(
1
,
*
out_img
);
CL_CHECK_FATAL
(
status
);
CL_CHECK_FATAL
(
status
);
status
=
kernel
.
setArg
(
++
arg_idx
,
threshold_
);
status
=
kernel
.
setArg
(
2
,
threshold_
);
CL_CHECK_FATAL
(
status
);
CL_CHECK_FATAL
(
status
);
status
=
kernel
.
setArg
(
++
arg_idx
,
scale_
);
status
=
kernel
.
setArg
(
3
,
scale_
);
CL_CHECK_FATAL
(
status
);
CL_CHECK_FATAL
(
status
);
#ifndef LITE_SHUTDOWN_LOG
#ifndef LITE_SHUTDOWN_LOG
VLOG
(
4
)
<<
TargetToStr
(
param
.
X
->
target
());
const
auto
&
x_dims
=
act_param_
->
X
->
dims
();
VLOG
(
4
)
<<
TargetToStr
(
param
.
Out
->
target
());
const
auto
&
y_dims
=
act_param_
->
Out
->
dims
();
// useless: check dim only
VLOG
(
4
)
<<
"image_shape(w,h):"
<<
image_shape
[
"width"
]
<<
" "
VLOG
(
4
)
<<
TargetToStr
(
act_param_
->
X
->
target
());
<<
image_shape
[
"height"
];
VLOG
(
4
)
<<
TargetToStr
(
act_param_
->
Out
->
target
());
VLOG
(
4
)
<<
"x_img_shape_(w,h):"
<<
x_img_shape_
[
0
]
<<
" "
<<
x_img_shape_
[
1
];
VLOG
(
4
)
<<
"x_dims["
<<
x_dims
.
size
()
<<
"D]:"
<<
x_dims
[
0
]
<<
" "
VLOG
(
4
)
<<
"x_dims["
<<
x_dims
.
size
()
<<
"D]:"
<<
x_dims
[
0
]
<<
" "
<<
x_dims
[
1
]
<<
" "
<<
x_dims
[
2
]
<<
" "
<<
x_dims
[
3
];
<<
x_dims
[
1
]
<<
" "
<<
x_dims
[
2
]
<<
" "
<<
x_dims
[
3
];
VLOG
(
4
)
<<
"y_dims["
<<
y_dims
.
size
()
<<
"D]:"
<<
y_dims
[
0
]
<<
" "
VLOG
(
4
)
<<
"y_dims["
<<
y_dims
.
size
()
<<
"D]:"
<<
y_dims
[
0
]
<<
" "
...
@@ -115,13 +145,12 @@ class ActivationComputeImageDefault
...
@@ -115,13 +145,12 @@ class ActivationComputeImageDefault
VLOG
(
4
)
<<
"kernel func name:"
<<
kernel_func_name_
;
VLOG
(
4
)
<<
"kernel func name:"
<<
kernel_func_name_
;
#endif
#endif
auto
global_work_size
=
auto
&
context
=
ctx_
->
As
<
OpenCLContext
>
();
cl
::
NDRange
{
static_cast
<
cl
::
size_type
>
(
image_shape
[
"width"
]),
CHECK
(
context
.
cl_context
()
!=
nullptr
);
static_cast
<
cl
::
size_type
>
(
image_shape
[
"height"
])};
status
=
context
.
cl_context
()
->
GetCommandQueue
().
enqueueNDRangeKernel
(
status
=
context
.
cl_context
()
->
GetCommandQueue
().
enqueueNDRangeKernel
(
kernel
,
kernel
,
cl
::
NullRange
,
cl
::
NullRange
,
global_work_size
,
global_work_size
_
,
cl
::
NullRange
,
cl
::
NullRange
,
nullptr
,
nullptr
,
event_
.
get
());
event_
.
get
());
...
@@ -131,10 +160,20 @@ class ActivationComputeImageDefault
...
@@ -131,10 +160,20 @@ class ActivationComputeImageDefault
private:
private:
param_t
*
act_param_
{
nullptr
};
param_t
*
act_param_
{
nullptr
};
DDim
x_img_shape_
=
DDim
(
std
::
vector
<
DDim
::
value_type
>
(
{
static_cast
<
DDim
::
value_type
>
(
1
),
static_cast
<
DDim
::
value_type
>
(
1
)}));
DDim
out_img_shape_
=
DDim
(
std
::
vector
<
DDim
::
value_type
>
(
{
static_cast
<
DDim
::
value_type
>
(
1
),
static_cast
<
DDim
::
value_type
>
(
1
)}));
DDim
last_x_dims_
;
std
::
string
kernel_func_name_
{};
std
::
string
kernel_func_name_
{};
float
threshold_
{
6.
f
};
float
threshold_
{
6.
f
};
float
scale_
{
1.
f
};
float
scale_
{
1.
f
};
cl
::
Kernel
kernel_
;
bool
first_epoch_for_reinit_
{
true
};
cl
::
NDRange
global_work_size_
=
cl
::
NDRange
{
static_cast
<
size_t
>
(
1
),
static_cast
<
size_t
>
(
1
),
static_cast
<
size_t
>
(
1
)};
std
::
string
build_options_
{
"-DCL_DTYPE_half"
};
std
::
string
build_options_
{
"-DCL_DTYPE_half"
};
std
::
string
time_stamp_
{
GetTimeStamp
()};
std
::
shared_ptr
<
cl
::
Event
>
event_
{
new
cl
::
Event
};
std
::
shared_ptr
<
cl
::
Event
>
event_
{
new
cl
::
Event
};
};
};
}
// namespace opencl
}
// namespace opencl
...
...
lite/kernels/opencl/bilinear_interp_image_compute.cc
浏览文件 @
917620e8
...
@@ -43,8 +43,10 @@ class BilinearInterpImageCompute
...
@@ -43,8 +43,10 @@ class BilinearInterpImageCompute
bilinear_interp_param_
=
param_
.
get_mutable
<
param_t
>
();
bilinear_interp_param_
=
param_
.
get_mutable
<
param_t
>
();
auto
&
context
=
ctx_
->
As
<
OpenCLContext
>
();
auto
&
context
=
ctx_
->
As
<
OpenCLContext
>
();
context
.
cl_context
()
->
AddKernel
(
context
.
cl_context
()
->
AddKernel
(
kernel_func_name_
,
kernel_func_name_
,
"image/bilinear_interp_kernel.cl"
,
build_options_
);
"image/bilinear_interp_kernel.cl"
,
build_options_
,
time_stamp_
);
VLOG
(
1
)
<<
"kernel_func_name_:"
<<
kernel_func_name_
;
VLOG
(
1
)
<<
"kernel_func_name_:"
<<
kernel_func_name_
;
}
}
...
@@ -103,7 +105,7 @@ class BilinearInterpImageCompute
...
@@ -103,7 +105,7 @@ class BilinearInterpImageCompute
#endif
#endif
STL
::
stringstream
kernel_key
;
STL
::
stringstream
kernel_key
;
kernel_key
<<
kernel_func_name_
<<
build_options_
;
kernel_key
<<
kernel_func_name_
<<
build_options_
<<
time_stamp_
;
auto
kernel
=
context
.
cl_context
()
->
GetKernel
(
kernel_key
.
str
());
auto
kernel
=
context
.
cl_context
()
->
GetKernel
(
kernel_key
.
str
());
int
arg_idx
=
0
;
int
arg_idx
=
0
;
...
@@ -159,6 +161,7 @@ class BilinearInterpImageCompute
...
@@ -159,6 +161,7 @@ class BilinearInterpImageCompute
param_t
*
bilinear_interp_param_
{
nullptr
};
param_t
*
bilinear_interp_param_
{
nullptr
};
std
::
string
kernel_func_name_
{
"bilinear_interp"
};
std
::
string
kernel_func_name_
{
"bilinear_interp"
};
std
::
string
build_options_
{
"-DCL_DTYPE_half"
};
std
::
string
build_options_
{
"-DCL_DTYPE_half"
};
std
::
string
time_stamp_
{
GetTimeStamp
()};
std
::
shared_ptr
<
cl
::
Event
>
event_
{
new
cl
::
Event
};
std
::
shared_ptr
<
cl
::
Event
>
event_
{
new
cl
::
Event
};
};
};
...
...
lite/kernels/opencl/concat_buffer_compute.cc
浏览文件 @
917620e8
...
@@ -38,8 +38,10 @@ class ConcatCompute : public KernelLite<TARGET(kOpenCL),
...
@@ -38,8 +38,10 @@ class ConcatCompute : public KernelLite<TARGET(kOpenCL),
}
else
{
}
else
{
kernel_func_name_
=
"concat_mul"
;
kernel_func_name_
=
"concat_mul"
;
}
}
context
.
cl_context
()
->
AddKernel
(
context
.
cl_context
()
->
AddKernel
(
kernel_func_name_
,
kernel_func_name_
,
"buffer/concat_kernel.cl"
,
build_options_
);
"buffer/concat_kernel.cl"
,
build_options_
,
time_stamp_
);
auto
axis
=
concat_param_
->
axis
;
auto
axis
=
concat_param_
->
axis
;
auto
inputs
=
concat_param_
->
x
;
auto
inputs
=
concat_param_
->
x
;
...
@@ -88,7 +90,7 @@ class ConcatCompute : public KernelLite<TARGET(kOpenCL),
...
@@ -88,7 +90,7 @@ class ConcatCompute : public KernelLite<TARGET(kOpenCL),
auto
&
context
=
ctx_
->
As
<
OpenCLContext
>
();
auto
&
context
=
ctx_
->
As
<
OpenCLContext
>
();
CHECK
(
context
.
cl_context
()
!=
nullptr
);
CHECK
(
context
.
cl_context
()
!=
nullptr
);
STL
::
stringstream
kernel_key
;
STL
::
stringstream
kernel_key
;
kernel_key
<<
kernel_func_name_
<<
build_options_
;
kernel_key
<<
kernel_func_name_
<<
build_options_
<<
time_stamp_
;
auto
inputs
=
param
.
x
;
auto
inputs
=
param
.
x
;
int
arg_idx
=
0
;
int
arg_idx
=
0
;
...
@@ -177,6 +179,7 @@ class ConcatCompute : public KernelLite<TARGET(kOpenCL),
...
@@ -177,6 +179,7 @@ class ConcatCompute : public KernelLite<TARGET(kOpenCL),
param_t
*
concat_param_
{
nullptr
};
param_t
*
concat_param_
{
nullptr
};
std
::
string
kernel_func_name_
{};
std
::
string
kernel_func_name_
{};
std
::
string
build_options_
{
"-DCL_DTYPE_float"
};
std
::
string
build_options_
{
"-DCL_DTYPE_float"
};
std
::
string
time_stamp_
{
GetTimeStamp
()};
std
::
shared_ptr
<
cl
::
Event
>
event_
{
new
cl
::
Event
};
std
::
shared_ptr
<
cl
::
Event
>
event_
{
new
cl
::
Event
};
};
};
...
...
lite/kernels/opencl/concat_image_compute.cc
浏览文件 @
917620e8
...
@@ -40,8 +40,10 @@ class ConcatComputeImage : public KernelLite<TARGET(kOpenCL),
...
@@ -40,8 +40,10 @@ class ConcatComputeImage : public KernelLite<TARGET(kOpenCL),
kernel_func_name_
=
"concat_mul"
;
kernel_func_name_
=
"concat_mul"
;
}
}
VLOG
(
1
)
<<
"kernel_func_name_:"
<<
kernel_func_name_
;
VLOG
(
1
)
<<
"kernel_func_name_:"
<<
kernel_func_name_
;
context
.
cl_context
()
->
AddKernel
(
context
.
cl_context
()
->
AddKernel
(
kernel_func_name_
,
kernel_func_name_
,
"image/concat_kernel.cl"
,
build_options_
);
"image/concat_kernel.cl"
,
build_options_
,
time_stamp_
);
auto
axis
=
concat_param_
->
axis
;
auto
axis
=
concat_param_
->
axis
;
auto
inputs
=
concat_param_
->
x
;
auto
inputs
=
concat_param_
->
x
;
...
@@ -117,7 +119,7 @@ class ConcatComputeImage : public KernelLite<TARGET(kOpenCL),
...
@@ -117,7 +119,7 @@ class ConcatComputeImage : public KernelLite<TARGET(kOpenCL),
auto
&
context
=
ctx_
->
As
<
OpenCLContext
>
();
auto
&
context
=
ctx_
->
As
<
OpenCLContext
>
();
CHECK
(
context
.
cl_context
()
!=
nullptr
);
CHECK
(
context
.
cl_context
()
!=
nullptr
);
STL
::
stringstream
kernel_key
;
STL
::
stringstream
kernel_key
;
kernel_key
<<
kernel_func_name_
<<
build_options_
;
kernel_key
<<
kernel_func_name_
<<
build_options_
<<
time_stamp_
;
auto
inputs
=
param
.
x
;
auto
inputs
=
param
.
x
;
int
arg_idx
=
0
;
int
arg_idx
=
0
;
...
@@ -251,6 +253,7 @@ class ConcatComputeImage : public KernelLite<TARGET(kOpenCL),
...
@@ -251,6 +253,7 @@ class ConcatComputeImage : public KernelLite<TARGET(kOpenCL),
param_t
*
concat_param_
{
nullptr
};
param_t
*
concat_param_
{
nullptr
};
std
::
string
kernel_func_name_
{};
std
::
string
kernel_func_name_
{};
std
::
string
build_options_
{
" -DCL_DTYPE_half"
};
std
::
string
build_options_
{
" -DCL_DTYPE_half"
};
std
::
string
time_stamp_
{
GetTimeStamp
()};
std
::
shared_ptr
<
cl
::
Event
>
event_
{
new
cl
::
Event
};
std
::
shared_ptr
<
cl
::
Event
>
event_
{
new
cl
::
Event
};
};
};
...
...
lite/kernels/opencl/conv_buffer_compute.cc
浏览文件 @
917620e8
...
@@ -114,8 +114,10 @@ void ConvCompute::PrepareForRun() {
...
@@ -114,8 +114,10 @@ void ConvCompute::PrepareForRun() {
}
}
for
(
size_t
i
=
0
;
i
<
kernel_func_names_
.
size
();
i
++
)
{
for
(
size_t
i
=
0
;
i
<
kernel_func_names_
.
size
();
i
++
)
{
context
.
cl_context
()
->
AddKernel
(
context
.
cl_context
()
->
AddKernel
(
kernel_func_names_
[
i
],
kernel_func_names_
[
i
],
kernel_func_paths_
[
i
],
build_options_
[
i
]);
kernel_func_paths_
[
i
],
build_options_
[
i
],
time_stamp_
);
}
}
}
}
...
@@ -153,7 +155,7 @@ void ConvCompute::GemmlikeConv2d() {
...
@@ -153,7 +155,7 @@ void ConvCompute::GemmlikeConv2d() {
auto
&
context
=
ctx_
->
As
<
OpenCLContext
>
();
auto
&
context
=
ctx_
->
As
<
OpenCLContext
>
();
std
::
stringstream
kernel_key
;
std
::
stringstream
kernel_key
;
kernel_key
<<
kernel_func_names_
[
0
]
<<
build_options_
[
0
];
kernel_key
<<
kernel_func_names_
[
0
]
<<
build_options_
[
0
]
<<
time_stamp_
;
auto
img2col_kernel
=
context
.
cl_context
()
->
GetKernel
(
kernel_key
.
str
());
auto
img2col_kernel
=
context
.
cl_context
()
->
GetKernel
(
kernel_key
.
str
());
int
n_threads
=
c_in
*
h_out
*
w_out
;
int
n_threads
=
c_in
*
h_out
*
w_out
;
...
@@ -218,7 +220,7 @@ void ConvCompute::GemmlikeConv2d() {
...
@@ -218,7 +220,7 @@ void ConvCompute::GemmlikeConv2d() {
int
n
=
h_out
*
w_out
;
int
n
=
h_out
*
w_out
;
VLOG
(
4
)
<<
"m = "
<<
m
<<
" n = "
<<
n
<<
" k = "
<<
k
;
VLOG
(
4
)
<<
"m = "
<<
m
<<
" n = "
<<
n
<<
" k = "
<<
k
;
kernel_key
.
str
(
""
);
kernel_key
.
str
(
""
);
kernel_key
<<
kernel_func_names_
[
1
]
<<
build_options_
[
1
];
kernel_key
<<
kernel_func_names_
[
1
]
<<
build_options_
[
1
]
<<
time_stamp_
;
auto
gemm_kernel
=
context
.
cl_context
()
->
GetKernel
(
kernel_key
.
str
());
auto
gemm_kernel
=
context
.
cl_context
()
->
GetKernel
(
kernel_key
.
str
());
GemmBatched
(
GemmBatched
(
gemm_kernel
,
col_buf
,
filter_buf
,
bias_buf
,
output_buf
,
bs
,
m
,
n
,
k
);
gemm_kernel
,
col_buf
,
filter_buf
,
bias_buf
,
output_buf
,
bs
,
m
,
n
,
k
);
...
@@ -249,7 +251,8 @@ void ConvCompute::Conv2d1x1() {
...
@@ -249,7 +251,8 @@ void ConvCompute::Conv2d1x1() {
auto
&
context
=
ctx_
->
As
<
OpenCLContext
>
();
auto
&
context
=
ctx_
->
As
<
OpenCLContext
>
();
std
::
stringstream
kernel_key
;
std
::
stringstream
kernel_key
;
kernel_key
<<
kernel_func_names_
.
front
()
<<
build_options_
.
front
();
kernel_key
<<
kernel_func_names_
.
front
()
<<
build_options_
.
front
()
<<
time_stamp_
;
auto
kernel
=
context
.
cl_context
()
->
GetKernel
(
kernel_key
.
str
());
auto
kernel
=
context
.
cl_context
()
->
GetKernel
(
kernel_key
.
str
());
GemmBatched
(
kernel
,
x_d
,
filter_d
,
bias_d
,
output_d
,
batch_size
,
m
,
n
,
k
);
GemmBatched
(
kernel
,
x_d
,
filter_d
,
bias_d
,
output_d
,
batch_size
,
m
,
n
,
k
);
...
...
lite/kernels/opencl/conv_buffer_compute.h
浏览文件 @
917620e8
...
@@ -21,6 +21,7 @@
...
@@ -21,6 +21,7 @@
#include "lite/backends/opencl/cl_include.h"
#include "lite/backends/opencl/cl_include.h"
#include "lite/core/kernel.h"
#include "lite/core/kernel.h"
#include "lite/core/tensor.h"
#include "lite/core/tensor.h"
#include "lite/kernels/opencl/image_helper.h"
#include "lite/operators/op_params.h"
#include "lite/operators/op_params.h"
namespace
paddle
{
namespace
paddle
{
...
@@ -55,6 +56,7 @@ class ConvCompute
...
@@ -55,6 +56,7 @@ class ConvCompute
std
::
vector
<
std
::
string
>
kernel_func_names_
{};
std
::
vector
<
std
::
string
>
kernel_func_names_
{};
std
::
vector
<
std
::
string
>
kernel_func_paths_
{};
std
::
vector
<
std
::
string
>
kernel_func_paths_
{};
std
::
vector
<
std
::
string
>
build_options_
{};
std
::
vector
<
std
::
string
>
build_options_
{};
std
::
string
time_stamp_
{
GetTimeStamp
()};
std
::
shared_ptr
<
cl
::
Event
>
event_
{
new
cl
::
Event
};
std
::
shared_ptr
<
cl
::
Event
>
event_
{
new
cl
::
Event
};
};
};
...
...
lite/kernels/opencl/conv_image_compute.cc
浏览文件 @
917620e8
...
@@ -369,15 +369,17 @@ void ConvImageCompute::PrepareForRun() {
...
@@ -369,15 +369,17 @@ void ConvImageCompute::PrepareForRun() {
build_options_
.
push_back
(
build_options_single
);
build_options_
.
push_back
(
build_options_single
);
for
(
size_t
i
=
0
;
i
<
kernel_func_names_
.
size
();
i
++
)
{
for
(
size_t
i
=
0
;
i
<
kernel_func_names_
.
size
();
i
++
)
{
context
.
cl_context
()
->
AddKernel
(
context
.
cl_context
()
->
AddKernel
(
kernel_func_names_
[
i
],
kernel_func_names_
[
i
],
kernel_func_paths_
[
i
],
build_options_
[
i
]);
kernel_func_paths_
[
i
],
build_options_
[
i
],
time_stamp_
);
}
}
VLOG
(
4
)
<<
"global_work_size_[3D]: {"
<<
global_work_size_
[
0
]
<<
","
VLOG
(
4
)
<<
"global_work_size_[3D]: {"
<<
global_work_size_
[
0
]
<<
","
<<
global_work_size_
[
1
]
<<
","
<<
global_work_size_
[
2
]
<<
"}"
;
<<
global_work_size_
[
1
]
<<
","
<<
global_work_size_
[
2
]
<<
"}"
;
std
::
stringstream
kernel_key
;
std
::
stringstream
kernel_key
;
kernel_key
<<
kernel_func_names_
[
0
]
<<
build_options_
[
0
];
kernel_key
<<
kernel_func_names_
[
0
]
<<
build_options_
[
0
]
<<
time_stamp_
;
kernel_
=
context
.
cl_context
()
->
GetKernel
(
kernel_key
.
str
());
kernel_
=
context
.
cl_context
()
->
GetKernel
(
kernel_key
.
str
());
VLOG
(
4
)
<<
"kernel_key: "
<<
kernel_key
.
str
();
VLOG
(
4
)
<<
"kernel_key: "
<<
kernel_key
.
str
();
VLOG
(
4
)
<<
"kernel ready ... "
<<
kernel_key
.
str
();
VLOG
(
4
)
<<
"kernel ready ... "
<<
kernel_key
.
str
();
...
@@ -388,18 +390,43 @@ void ConvImageCompute::PrepareForRun() {
...
@@ -388,18 +390,43 @@ void ConvImageCompute::PrepareForRun() {
VLOG
(
4
)
<<
"max_work_group_size: "
<<
max_work_group_size
;
VLOG
(
4
)
<<
"max_work_group_size: "
<<
max_work_group_size
;
if
(
max_work_group_size
>
0
&&
use_lws
)
{
if
(
max_work_group_size
>
0
&&
use_lws_
)
{
// local_work_size_ = context.cl_context()->LocalWorkSizeConv1x1(
double
min_turn_time
=
DBL_MAX
;
// global_work_size_, max_work_group_size);
cl
::
NDRange
best_local_work_size
=
context
.
cl_context
()
->
LocalWorkSize
(
local_work_size_
=
context
.
cl_context
()
->
LocalWorkSize
(
global_work_size_
,
global_work_size_
,
max_work_group_size
);
max_work_group_size
);
cl
::
NDRange
last_local_work_size
=
cl
::
NDRange
{
static_cast
<
size_t
>
(
0
),
static_cast
<
size_t
>
(
0
),
static_cast
<
size_t
>
(
0
)};
if
(
use_turn_
)
{
for
(
size_t
i
=
1
;
i
<
15
;
i
++
)
{
if
(
kernel_h
==
1
&&
kernel_w
==
1
)
{
// todo use diff logics
local_work_size_
=
context
.
cl_context
()
->
LocalWorkSizeTurn
(
global_work_size_
,
max_work_group_size
,
i
);
}
else
{
local_work_size_
=
context
.
cl_context
()
->
LocalWorkSizeTurn
(
global_work_size_
,
max_work_group_size
,
i
);
}
if
(
last_local_work_size
[
0
]
==
local_work_size_
[
0
]
&&
last_local_work_size
[
1
]
==
local_work_size_
[
1
]
&&
last_local_work_size
[
2
]
==
local_work_size_
[
2
])
{
// skiped turned lws
continue
;
}
auto
turn_time
=
this
->
Turn
(
5
);
if
(
min_turn_time
>
turn_time
)
{
min_turn_time
=
turn_time
;
best_local_work_size
=
local_work_size_
;
}
last_local_work_size
=
local_work_size_
;
}
}
local_work_size_
=
best_local_work_size
;
VLOG
(
4
)
<<
"local_work_size_[3D]: {"
<<
local_work_size_
[
0
]
<<
","
VLOG
(
4
)
<<
"local_work_size_[3D]: {"
<<
local_work_size_
[
0
]
<<
","
<<
local_work_size_
[
1
]
<<
","
<<
local_work_size_
[
2
]
<<
"}"
;
<<
local_work_size_
[
1
]
<<
","
<<
local_work_size_
[
2
]
<<
"}"
;
}
}
}
}
void
ConvImageCompute
::
Conv2d1x1opt
()
{
void
ConvImageCompute
::
Conv2d1x1opt
(
bool
is_turn
)
{
auto
&
context
=
ctx_
->
As
<
OpenCLContext
>
();
auto
&
context
=
ctx_
->
As
<
OpenCLContext
>
();
CHECK
(
context
.
cl_context
()
!=
nullptr
);
CHECK
(
context
.
cl_context
()
!=
nullptr
);
const
auto
&
param
=
*
param_
.
get_mutable
<
param_t
>
();
const
auto
&
param
=
*
param_
.
get_mutable
<
param_t
>
();
...
@@ -431,23 +458,6 @@ void ConvImageCompute::Conv2d1x1opt() {
...
@@ -431,23 +458,6 @@ void ConvImageCompute::Conv2d1x1opt() {
int
input_c
=
input_dims
[
1
];
int
input_c
=
input_dims
[
1
];
auto
dilations
=
*
param
.
dilations
;
auto
dilations
=
*
param
.
dilations
;
// const std::vector<size_t>& default_work_size =
// DefaultWorkSize(output_dims,
// DDim(std::vector<DDim::value_type>{
// static_cast<int64_t>(out_image_shape["width"]),
// static_cast<int64_t>(out_image_shape["height"])}));
// int c_block = default_work_size[0];
// int w = default_work_size[1];
// int nh = default_work_size[2];
// int maped_w = maptofactor(w, 4);
// auto global_work_size_ =
// cl::NDRange{static_cast<size_t>(default_work_size.data()[0]),
// static_cast<size_t>(maped_w),
// static_cast<size_t>(default_work_size.data()[2])};
#ifndef LITE_SHUTDOWN_LOG
#ifndef LITE_SHUTDOWN_LOG
// VLOG(4) << "out_image: " << out_image;
// VLOG(4) << "out_image: " << out_image;
VLOG
(
4
)
<<
"global_work_size_[3D]: {"
<<
global_work_size_
[
0
]
<<
","
VLOG
(
4
)
<<
"global_work_size_[3D]: {"
<<
global_work_size_
[
0
]
<<
","
...
@@ -541,73 +551,12 @@ void ConvImageCompute::Conv2d1x1opt() {
...
@@ -541,73 +551,12 @@ void ConvImageCompute::Conv2d1x1opt() {
event_
.
get
());
event_
.
get
());
CL_CHECK_FATAL
(
status
);
CL_CHECK_FATAL
(
status
);
context
.
cl_wait_list
()
->
emplace
(
out_image
,
event_
);
context
.
cl_wait_list
()
->
emplace
(
out_image
,
event_
);
if
(
is_turn
)
{
#ifdef PROFILE_CONV_KERNEL
event_
->
wait
();
bool
use_profile
=
false
;
auto
GetCurrentUS
=
[]()
->
double
{
struct
timeval
time
;
gettimeofday
(
&
time
,
NULL
);
return
1e+6
*
time
.
tv_sec
+
time
.
tv_usec
;
};
double
start
=
GetCurrentUS
();
if
(
use_profile
)
{
status
=
context
.
cl_context
()
->
GetCommandQueue
().
enqueueNDRangeKernel
(
kernel
,
cl
::
NullRange
,
global_work_size_
,
local_work_size_
,
nullptr
,
event_
.
get
());
CL_CHECK_FATAL
(
status
);
context
.
cl_wait_list
()
->
emplace
(
out_image
,
event_
);
}
else
{
int
count
=
50
;
double
sumtime
=
0
;
if
(
!
use_profile
)
{
count
=
1
;
}
for
(
size_t
i
=
0
;
i
<
count
;
i
++
)
{
start
=
GetCurrentUS
();
status
=
context
.
cl_context
()
->
GetCommandQueue
().
enqueueNDRangeKernel
(
kernel
,
cl
::
NullRange
,
global_work_size_
,
local_work_size_
,
nullptr
,
event_
.
get
());
CL_CHECK_FATAL
(
status
);
context
.
cl_wait_list
()
->
emplace
(
out_image
,
event_
);
if
(
use_profile
)
{
event_
->
wait
();
double
duration
=
GetCurrentUS
()
-
start
;
sumtime
+=
duration
;
}
}
auto
dims_string
=
[](
DDimLite
dims
)
->
std
::
string
{
std
::
ostringstream
stream
;
stream
<<
"["
<<
dims
[
0
]
<<
","
<<
dims
[
1
]
<<
","
<<
dims
[
2
]
<<
","
<<
dims
[
3
]
<<
"]"
;
return
stream
.
str
();
};
if
(
use_profile
)
{
// LOG(INFO) << "input: " << input_dims;
// LOG(INFO) << "filter: " << filter_dims;
// LOG(INFO) << "output: " << output_dims;
std
::
cout
<<
std
::
setw
(
25
)
<<
std
::
left
<<
dims_string
(
input_dims
)
<<
std
::
setw
(
25
)
<<
std
::
left
<<
dims_string
(
filter_dims
)
<<
std
::
setw
(
25
)
<<
std
::
left
<<
dims_string
(
output_dims
)
<<
std
::
setw
(
25
)
<<
std
::
left
<<
sumtime
/
count
<<
std
::
endl
;
}
else
{
dims_string
(
input_dims
);
}
}
}
#endif
}
}
void
ConvImageCompute
::
Conv2d3x3
()
{
void
ConvImageCompute
::
Conv2d3x3
(
bool
is_turn
)
{
auto
&
context
=
ctx_
->
As
<
OpenCLContext
>
();
auto
&
context
=
ctx_
->
As
<
OpenCLContext
>
();
CHECK
(
context
.
cl_context
()
!=
nullptr
);
CHECK
(
context
.
cl_context
()
!=
nullptr
);
const
auto
&
param
=
*
param_
.
get_mutable
<
param_t
>
();
const
auto
&
param
=
*
param_
.
get_mutable
<
param_t
>
();
...
@@ -767,9 +716,13 @@ void ConvImageCompute::Conv2d3x3() {
...
@@ -767,9 +716,13 @@ void ConvImageCompute::Conv2d3x3() {
event_
.
get
());
event_
.
get
());
CL_CHECK_FATAL
(
status
);
CL_CHECK_FATAL
(
status
);
context
.
cl_wait_list
()
->
emplace
(
out_image
,
event_
);
context
.
cl_wait_list
()
->
emplace
(
out_image
,
event_
);
if
(
is_turn
)
{
event_
->
wait
();
}
}
}
void
ConvImageCompute
::
Conv2d3x3opt
()
{
void
ConvImageCompute
::
Conv2d3x3opt
(
bool
is_turn
)
{
auto
&
context
=
ctx_
->
As
<
OpenCLContext
>
();
auto
&
context
=
ctx_
->
As
<
OpenCLContext
>
();
CHECK
(
context
.
cl_context
()
!=
nullptr
);
CHECK
(
context
.
cl_context
()
!=
nullptr
);
const
auto
&
param
=
*
param_
.
get_mutable
<
param_t
>
();
const
auto
&
param
=
*
param_
.
get_mutable
<
param_t
>
();
...
@@ -890,9 +843,12 @@ void ConvImageCompute::Conv2d3x3opt() {
...
@@ -890,9 +843,12 @@ void ConvImageCompute::Conv2d3x3opt() {
event_
.
get
());
event_
.
get
());
CL_CHECK_FATAL
(
status
);
CL_CHECK_FATAL
(
status
);
context
.
cl_wait_list
()
->
emplace
(
out_image
,
event_
);
context
.
cl_wait_list
()
->
emplace
(
out_image
,
event_
);
if
(
is_turn
)
{
event_
->
wait
();
}
}
}
void
ConvImageCompute
::
Conv2d5x5
()
{
void
ConvImageCompute
::
Conv2d5x5
(
bool
is_turn
)
{
auto
&
context
=
ctx_
->
As
<
OpenCLContext
>
();
auto
&
context
=
ctx_
->
As
<
OpenCLContext
>
();
CHECK
(
context
.
cl_context
()
!=
nullptr
);
CHECK
(
context
.
cl_context
()
!=
nullptr
);
const
auto
&
param
=
*
param_
.
get_mutable
<
param_t
>
();
const
auto
&
param
=
*
param_
.
get_mutable
<
param_t
>
();
...
@@ -1018,9 +974,12 @@ void ConvImageCompute::Conv2d5x5() {
...
@@ -1018,9 +974,12 @@ void ConvImageCompute::Conv2d5x5() {
event_
.
get
());
event_
.
get
());
CL_CHECK_FATAL
(
status
);
CL_CHECK_FATAL
(
status
);
context
.
cl_wait_list
()
->
emplace
(
out_image
,
event_
);
context
.
cl_wait_list
()
->
emplace
(
out_image
,
event_
);
if
(
is_turn
)
{
event_
->
wait
();
}
}
}
void
ConvImageCompute
::
Conv2d5x5opt
()
{
void
ConvImageCompute
::
Conv2d5x5opt
(
bool
is_turn
)
{
auto
&
context
=
ctx_
->
As
<
OpenCLContext
>
();
auto
&
context
=
ctx_
->
As
<
OpenCLContext
>
();
CHECK
(
context
.
cl_context
()
!=
nullptr
);
CHECK
(
context
.
cl_context
()
!=
nullptr
);
const
auto
&
param
=
*
param_
.
get_mutable
<
param_t
>
();
const
auto
&
param
=
*
param_
.
get_mutable
<
param_t
>
();
...
@@ -1134,9 +1093,12 @@ void ConvImageCompute::Conv2d5x5opt() {
...
@@ -1134,9 +1093,12 @@ void ConvImageCompute::Conv2d5x5opt() {
event_
.
get
());
event_
.
get
());
CL_CHECK_FATAL
(
status
);
CL_CHECK_FATAL
(
status
);
context
.
cl_wait_list
()
->
emplace
(
out_image
,
event_
);
context
.
cl_wait_list
()
->
emplace
(
out_image
,
event_
);
if
(
is_turn
)
{
event_
->
wait
();
}
}
}
void
ConvImageCompute
::
Conv2d7x7
()
{
void
ConvImageCompute
::
Conv2d7x7
(
bool
is_turn
)
{
auto
&
context
=
ctx_
->
As
<
OpenCLContext
>
();
auto
&
context
=
ctx_
->
As
<
OpenCLContext
>
();
CHECK
(
context
.
cl_context
()
!=
nullptr
);
CHECK
(
context
.
cl_context
()
!=
nullptr
);
const
auto
&
param
=
*
param_
.
get_mutable
<
param_t
>
();
const
auto
&
param
=
*
param_
.
get_mutable
<
param_t
>
();
...
@@ -1262,8 +1224,12 @@ void ConvImageCompute::Conv2d7x7() {
...
@@ -1262,8 +1224,12 @@ void ConvImageCompute::Conv2d7x7() {
event_
.
get
());
event_
.
get
());
CL_CHECK_FATAL
(
status
);
CL_CHECK_FATAL
(
status
);
context
.
cl_wait_list
()
->
emplace
(
out_image
,
event_
);
context
.
cl_wait_list
()
->
emplace
(
out_image
,
event_
);
if
(
is_turn
)
{
event_
->
wait
();
}
}
}
void
ConvImageCompute
::
Conv2d7x7opt
()
{
void
ConvImageCompute
::
Conv2d7x7opt
(
bool
is_turn
)
{
auto
&
context
=
ctx_
->
As
<
OpenCLContext
>
();
auto
&
context
=
ctx_
->
As
<
OpenCLContext
>
();
CHECK
(
context
.
cl_context
()
!=
nullptr
);
CHECK
(
context
.
cl_context
()
!=
nullptr
);
const
auto
&
param
=
*
param_
.
get_mutable
<
param_t
>
();
const
auto
&
param
=
*
param_
.
get_mutable
<
param_t
>
();
...
@@ -1374,8 +1340,12 @@ void ConvImageCompute::Conv2d7x7opt() {
...
@@ -1374,8 +1340,12 @@ void ConvImageCompute::Conv2d7x7opt() {
event_
.
get
());
event_
.
get
());
CL_CHECK_FATAL
(
status
);
CL_CHECK_FATAL
(
status
);
context
.
cl_wait_list
()
->
emplace
(
out_image
,
event_
);
context
.
cl_wait_list
()
->
emplace
(
out_image
,
event_
);
if
(
is_turn
)
{
event_
->
wait
();
}
}
}
void
ConvImageCompute
::
DepthwiseConv2d3x3s1
()
{
void
ConvImageCompute
::
DepthwiseConv2d3x3s1
(
bool
is_turn
)
{
auto
&
context
=
ctx_
->
As
<
OpenCLContext
>
();
auto
&
context
=
ctx_
->
As
<
OpenCLContext
>
();
CHECK
(
context
.
cl_context
()
!=
nullptr
);
CHECK
(
context
.
cl_context
()
!=
nullptr
);
const
auto
&
param
=
*
param_
.
get_mutable
<
param_t
>
();
const
auto
&
param
=
*
param_
.
get_mutable
<
param_t
>
();
...
@@ -1454,9 +1424,13 @@ void ConvImageCompute::DepthwiseConv2d3x3s1() {
...
@@ -1454,9 +1424,13 @@ void ConvImageCompute::DepthwiseConv2d3x3s1() {
event_
.
get
());
event_
.
get
());
CL_CHECK_FATAL
(
status
);
CL_CHECK_FATAL
(
status
);
context
.
cl_wait_list
()
->
emplace
(
output_img
,
event_
);
context
.
cl_wait_list
()
->
emplace
(
output_img
,
event_
);
if
(
is_turn
)
{
event_
->
wait
();
}
}
}
void
ConvImageCompute
::
DepthwiseConv2d3x3
()
{
void
ConvImageCompute
::
DepthwiseConv2d3x3
(
bool
is_turn
)
{
auto
&
context
=
ctx_
->
As
<
OpenCLContext
>
();
auto
&
context
=
ctx_
->
As
<
OpenCLContext
>
();
CHECK
(
context
.
cl_context
()
!=
nullptr
);
CHECK
(
context
.
cl_context
()
!=
nullptr
);
const
auto
&
param
=
*
param_
.
get_mutable
<
param_t
>
();
const
auto
&
param
=
*
param_
.
get_mutable
<
param_t
>
();
...
@@ -1548,9 +1522,13 @@ void ConvImageCompute::DepthwiseConv2d3x3() {
...
@@ -1548,9 +1522,13 @@ void ConvImageCompute::DepthwiseConv2d3x3() {
event_
.
get
());
event_
.
get
());
CL_CHECK_FATAL
(
status
);
CL_CHECK_FATAL
(
status
);
context
.
cl_wait_list
()
->
emplace
(
output_img
,
event_
);
context
.
cl_wait_list
()
->
emplace
(
output_img
,
event_
);
if
(
is_turn
)
{
event_
->
wait
();
}
}
}
void
ConvImageCompute
::
DepthwiseConv2d
()
{
void
ConvImageCompute
::
DepthwiseConv2d
(
bool
is_turn
)
{
auto
&
context
=
ctx_
->
As
<
OpenCLContext
>
();
auto
&
context
=
ctx_
->
As
<
OpenCLContext
>
();
CHECK
(
context
.
cl_context
()
!=
nullptr
);
CHECK
(
context
.
cl_context
()
!=
nullptr
);
const
auto
&
param
=
*
param_
.
get_mutable
<
param_t
>
();
const
auto
&
param
=
*
param_
.
get_mutable
<
param_t
>
();
...
@@ -1683,8 +1661,22 @@ void ConvImageCompute::DepthwiseConv2d() {
...
@@ -1683,8 +1661,22 @@ void ConvImageCompute::DepthwiseConv2d() {
context
.
cl_wait_list
()
->
emplace
(
out_image
,
event_
);
context
.
cl_wait_list
()
->
emplace
(
out_image
,
event_
);
}
}
void
ConvImageCompute
::
Run
()
{
(
this
->*
impl_
)();
}
void
ConvImageCompute
::
Run
()
{
(
this
->*
impl_
)(
false
);
}
#undef PROFILE_CONV_KERNEL
double
ConvImageCompute
::
Turn
(
int
times
)
{
auto
GetCurrentUS
=
[]()
->
double
{
struct
timeval
time
;
gettimeofday
(
&
time
,
NULL
);
return
1e+6
*
time
.
tv_sec
+
time
.
tv_usec
;
};
auto
start
=
GetCurrentUS
();
for
(
size_t
i
=
0
;
i
<
times
;
i
++
)
{
(
this
->*
impl_
)(
true
);
}
auto
time_diff
=
(
GetCurrentUS
()
-
start
)
/
times
;
return
time_diff
;
}
}
// namespace opencl
}
// namespace opencl
}
// namespace kernels
}
// namespace kernels
}
// namespace lite
}
// namespace lite
...
...
lite/kernels/opencl/conv_image_compute.h
浏览文件 @
917620e8
...
@@ -22,40 +22,42 @@
...
@@ -22,40 +22,42 @@
#include "lite/backends/opencl/cl_include.h"
#include "lite/backends/opencl/cl_include.h"
#include "lite/core/kernel.h"
#include "lite/core/kernel.h"
#include "lite/core/tensor.h"
#include "lite/core/tensor.h"
#include "lite/kernels/opencl/image_helper.h"
#include "lite/operators/op_params.h"
#include "lite/operators/op_params.h"
namespace
paddle
{
namespace
paddle
{
namespace
lite
{
namespace
lite
{
namespace
kernels
{
namespace
kernels
{
namespace
opencl
{
namespace
opencl
{
class
ConvImageCompute
:
public
KernelLite
<
TARGET
(
kOpenCL
),
class
ConvImageCompute
:
public
KernelLite
<
TARGET
(
kOpenCL
),
PRECISION
(
kFP16
),
PRECISION
(
kFP16
),
DATALAYOUT
(
kImageDefault
)
>
{
DATALAYOUT
(
kImageDefault
)
>
{
public:
public:
using
param_t
=
operators
::
ConvParam
;
using
param_t
=
operators
::
ConvParam
;
using
kernel_t
=
void
(
ConvImageCompute
::*
)();
using
kernel_t
=
void
(
ConvImageCompute
::*
)(
bool
);
void
PrepareForRun
()
override
;
void
PrepareForRun
()
override
;
void
Run
()
override
;
void
Run
()
override
;
double
Turn
(
int
times
=
5
);
private:
private:
void
Conv2d1x1opt
();
void
Conv2d1x1opt
(
bool
is_turn
=
false
);
void
Conv2d3x3
();
void
Conv2d3x3
(
bool
is_turn
=
false
);
void
Conv2d3x3opt
();
void
Conv2d3x3opt
(
bool
is_turn
=
false
);
void
Conv2d5x5
();
void
Conv2d5x5
(
bool
is_turn
=
false
);
void
Conv2d5x5opt
();
void
Conv2d5x5opt
(
bool
is_turn
=
false
);
void
Conv2d7x7
();
void
Conv2d7x7
(
bool
is_turn
=
false
);
void
Conv2d7x7opt
();
void
Conv2d7x7opt
(
bool
is_turn
=
false
);
void
DepthwiseConv2d3x3s1
();
void
DepthwiseConv2d3x3s1
(
bool
is_turn
=
false
);
void
DepthwiseConv2d3x3
();
void
DepthwiseConv2d3x3
(
bool
is_turn
=
false
);
void
DepthwiseConv2d
();
void
DepthwiseConv2d
(
bool
is_turn
=
false
);
kernel_t
impl_
;
kernel_t
impl_
;
std
::
vector
<
std
::
string
>
kernel_func_names_
{};
std
::
vector
<
std
::
string
>
kernel_func_names_
{};
std
::
vector
<
std
::
string
>
kernel_func_paths_
{};
std
::
vector
<
std
::
string
>
kernel_func_paths_
{};
std
::
vector
<
std
::
string
>
build_options_
{};
std
::
vector
<
std
::
string
>
build_options_
{};
std
::
string
time_stamp_
{
GetTimeStamp
()};
std
::
shared_ptr
<
cl
::
Event
>
event_
{
new
cl
::
Event
};
std
::
shared_ptr
<
cl
::
Event
>
event_
{
new
cl
::
Event
};
Tensor
filter_gpu_image_
;
Tensor
filter_gpu_image_
;
Tensor
bias_gpu_image_
;
Tensor
bias_gpu_image_
;
...
@@ -72,7 +74,8 @@ class ConvImageCompute : public KernelLite<TARGET(kOpenCL),
...
@@ -72,7 +74,8 @@ class ConvImageCompute : public KernelLite<TARGET(kOpenCL),
cl
::
Kernel
kernel_
;
cl
::
Kernel
kernel_
;
cl
::
NDRange
local_work_size_
=
cl
::
NDRange
{
cl
::
NDRange
local_work_size_
=
cl
::
NDRange
{
static_cast
<
size_t
>
(
1
),
static_cast
<
size_t
>
(
1
),
static_cast
<
size_t
>
(
1
)};
static_cast
<
size_t
>
(
1
),
static_cast
<
size_t
>
(
1
),
static_cast
<
size_t
>
(
1
)};
bool
use_lws
{
true
};
bool
use_lws_
{
true
};
bool
use_turn_
{
false
};
};
};
}
// namespace opencl
}
// namespace opencl
...
...
lite/kernels/opencl/depthwise_conv2d_buffer_compute.cc
浏览文件 @
917620e8
...
@@ -44,8 +44,10 @@ class DepthwiseConv2dCompute
...
@@ -44,8 +44,10 @@ class DepthwiseConv2dCompute
build_options_
+=
" -DRELU6"
;
build_options_
+=
" -DRELU6"
;
}
}
auto
&
context
=
ctx_
->
As
<
OpenCLContext
>
();
auto
&
context
=
ctx_
->
As
<
OpenCLContext
>
();
context
.
cl_context
()
->
AddKernel
(
context
.
cl_context
()
->
AddKernel
(
kernel_func_name_
,
kernel_func_name_
,
"buffer/depthwise_conv2d_kernel.cl"
,
build_options_
);
"buffer/depthwise_conv2d_kernel.cl"
,
build_options_
,
time_stamp_
);
}
}
void
Run
()
override
{
void
Run
()
override
{
...
@@ -67,7 +69,7 @@ class DepthwiseConv2dCompute
...
@@ -67,7 +69,7 @@ class DepthwiseConv2dCompute
param
.
output
->
mutable_data
<
float
,
cl
::
Buffer
>
(
TARGET
(
kOpenCL
));
param
.
output
->
mutable_data
<
float
,
cl
::
Buffer
>
(
TARGET
(
kOpenCL
));
STL
::
stringstream
kernel_key
;
STL
::
stringstream
kernel_key
;
kernel_key
<<
kernel_func_name_
<<
build_options_
;
kernel_key
<<
kernel_func_name_
<<
build_options_
<<
time_stamp_
;
auto
kernel
=
context
.
cl_context
()
->
GetKernel
(
kernel_key
.
str
());
auto
kernel
=
context
.
cl_context
()
->
GetKernel
(
kernel_key
.
str
());
cl_int
status
;
cl_int
status
;
...
@@ -120,6 +122,7 @@ class DepthwiseConv2dCompute
...
@@ -120,6 +122,7 @@ class DepthwiseConv2dCompute
private:
private:
std
::
string
kernel_func_name_
{
"depthwise_conv2d"
};
std
::
string
kernel_func_name_
{
"depthwise_conv2d"
};
std
::
string
build_options_
{
"-DCL_DTYPE_float"
};
std
::
string
build_options_
{
"-DCL_DTYPE_float"
};
std
::
string
time_stamp_
{
GetTimeStamp
()};
std
::
shared_ptr
<
cl
::
Event
>
event_
{
new
cl
::
Event
};
std
::
shared_ptr
<
cl
::
Event
>
event_
{
new
cl
::
Event
};
};
};
...
...
lite/kernels/opencl/dropout_image_compute.cc
浏览文件 @
917620e8
...
@@ -40,8 +40,10 @@ class DropoutComputeImage2D : public KernelLite<TARGET(kOpenCL),
...
@@ -40,8 +40,10 @@ class DropoutComputeImage2D : public KernelLite<TARGET(kOpenCL),
void
PrepareForRun
()
override
{
void
PrepareForRun
()
override
{
auto
&
context
=
ctx_
->
As
<
OpenCLContext
>
();
auto
&
context
=
ctx_
->
As
<
OpenCLContext
>
();
VLOG
(
1
)
<<
"kernel_func_name_:"
<<
kernel_func_name_
;
VLOG
(
1
)
<<
"kernel_func_name_:"
<<
kernel_func_name_
;
context
.
cl_context
()
->
AddKernel
(
context
.
cl_context
()
->
AddKernel
(
kernel_func_name_
,
kernel_func_name_
,
"image/dropout_kernel.cl"
,
build_options_
);
"image/dropout_kernel.cl"
,
build_options_
,
time_stamp_
);
}
}
void
Run
()
override
{
void
Run
()
override
{
...
@@ -63,7 +65,7 @@ class DropoutComputeImage2D : public KernelLite<TARGET(kOpenCL),
...
@@ -63,7 +65,7 @@ class DropoutComputeImage2D : public KernelLite<TARGET(kOpenCL),
auto
&
context
=
ctx_
->
As
<
OpenCLContext
>
();
auto
&
context
=
ctx_
->
As
<
OpenCLContext
>
();
CHECK
(
context
.
cl_context
()
!=
nullptr
);
CHECK
(
context
.
cl_context
()
!=
nullptr
);
STL
::
stringstream
kernel_key
;
STL
::
stringstream
kernel_key
;
kernel_key
<<
kernel_func_name_
<<
build_options_
;
kernel_key
<<
kernel_func_name_
<<
build_options_
<<
time_stamp_
;
auto
kernel
=
context
.
cl_context
()
->
GetKernel
(
kernel_key
.
str
());
auto
kernel
=
context
.
cl_context
()
->
GetKernel
(
kernel_key
.
str
());
cl_int
status
;
cl_int
status
;
...
@@ -101,6 +103,7 @@ class DropoutComputeImage2D : public KernelLite<TARGET(kOpenCL),
...
@@ -101,6 +103,7 @@ class DropoutComputeImage2D : public KernelLite<TARGET(kOpenCL),
private:
private:
std
::
string
kernel_func_name_
{
"dropout"
};
std
::
string
kernel_func_name_
{
"dropout"
};
std
::
string
build_options_
{
"-DCL_DTYPE_half"
};
std
::
string
build_options_
{
"-DCL_DTYPE_half"
};
std
::
string
time_stamp_
{
GetTimeStamp
()};
std
::
shared_ptr
<
cl
::
Event
>
event_
{
new
cl
::
Event
};
std
::
shared_ptr
<
cl
::
Event
>
event_
{
new
cl
::
Event
};
};
};
...
...
lite/kernels/opencl/elementwise_add_buffer_compute.cc
浏览文件 @
917620e8
...
@@ -25,8 +25,10 @@ namespace opencl {
...
@@ -25,8 +25,10 @@ namespace opencl {
void
ElementwiseAddCompute
::
PrepareForRun
()
{
void
ElementwiseAddCompute
::
PrepareForRun
()
{
auto
&
context
=
ctx_
->
As
<
OpenCLContext
>
();
auto
&
context
=
ctx_
->
As
<
OpenCLContext
>
();
context
.
cl_context
()
->
AddKernel
(
context
.
cl_context
()
->
AddKernel
(
kernel_func_name_
,
kernel_func_name_
,
"buffer/elementwise_add_kernel.cl"
,
build_options_
);
"buffer/elementwise_add_kernel.cl"
,
build_options_
,
time_stamp_
);
ele_param_
=
param_
.
get_mutable
<
param_t
>
();
ele_param_
=
param_
.
get_mutable
<
param_t
>
();
UpdateParams
();
UpdateParams
();
}
}
...
@@ -39,7 +41,7 @@ void ElementwiseAddCompute::Run() {
...
@@ -39,7 +41,7 @@ void ElementwiseAddCompute::Run() {
auto
*
out_buf
=
ele_param_
->
Out
->
template
mutable_data
<
float
,
cl
::
Buffer
>(
auto
*
out_buf
=
ele_param_
->
Out
->
template
mutable_data
<
float
,
cl
::
Buffer
>(
TARGET
(
kOpenCL
));
TARGET
(
kOpenCL
));
STL
::
stringstream
kernel_key
;
STL
::
stringstream
kernel_key
;
kernel_key
<<
kernel_func_name_
<<
build_options_
;
kernel_key
<<
kernel_func_name_
<<
build_options_
<<
time_stamp_
;
auto
kernel
=
context
.
cl_context
()
->
GetKernel
(
kernel_key
.
str
());
auto
kernel
=
context
.
cl_context
()
->
GetKernel
(
kernel_key
.
str
());
#ifndef LITE_SHUTDOWN_LOG
#ifndef LITE_SHUTDOWN_LOG
VLOG
(
4
)
<<
TargetToStr
(
ele_param_
->
X
->
target
());
VLOG
(
4
)
<<
TargetToStr
(
ele_param_
->
X
->
target
());
...
...
lite/kernels/opencl/elementwise_add_buffer_compute.h
浏览文件 @
917620e8
...
@@ -16,6 +16,7 @@
...
@@ -16,6 +16,7 @@
#include <memory>
#include <memory>
#include <string>
#include <string>
#include "lite/core/kernel.h"
#include "lite/core/kernel.h"
#include "lite/kernels/opencl/image_helper.h"
#include "lite/operators/op_params.h"
#include "lite/operators/op_params.h"
#include "lite/utils/cp_logging.h"
#include "lite/utils/cp_logging.h"
...
@@ -46,6 +47,7 @@ class ElementwiseAddCompute
...
@@ -46,6 +47,7 @@ class ElementwiseAddCompute
param_t
*
ele_param_
{
nullptr
};
param_t
*
ele_param_
{
nullptr
};
std
::
string
kernel_func_name_
{
"elementwise_add"
};
std
::
string
kernel_func_name_
{
"elementwise_add"
};
std
::
string
build_options_
{
"-DCL_DTYPE_float"
};
std
::
string
build_options_
{
"-DCL_DTYPE_float"
};
std
::
string
time_stamp_
{
GetTimeStamp
()};
std
::
shared_ptr
<
cl
::
Event
>
event_
{
new
cl
::
Event
};
std
::
shared_ptr
<
cl
::
Event
>
event_
{
new
cl
::
Event
};
};
};
...
...
lite/kernels/opencl/elementwise_add_image_compute.cc
浏览文件 @
917620e8
...
@@ -23,44 +23,84 @@ namespace lite {
...
@@ -23,44 +23,84 @@ namespace lite {
namespace
kernels
{
namespace
kernels
{
namespace
opencl
{
namespace
opencl
{
void
ElementwiseAddImageCompute
::
PrepareForRun
()
{
void
ElementwiseAddImageCompute
::
PrepareForRun
()
{}
ele_param_
=
param_
.
get_mutable
<
param_t
>
();
auto
*
x
=
ele_param_
->
X
;
auto
*
y
=
ele_param_
->
Y
;
auto
axis
=
ele_param_
->
axis
;
if
(
y
->
dims
().
size
()
==
4
)
{
void
ElementwiseAddImageCompute
::
ReInitWhenNeeded
()
{
kernel_func_name_
=
"elementwise_add"
;
// y: ImageDefault
ele_param_
=
param_
.
get_mutable
<
param_t
>
();
}
else
if
(
y
->
dims
().
size
()
==
1
)
{
auto
x_dims
=
ele_param_
->
X
->
dims
();
if
(
axis
==
x
->
dims
().
size
()
-
1
)
{
if
((
!
first_epoch_for_reinit_
&&
x_dims
!=
last_x_dims_
)
||
kernel_func_name_
=
"width_add"
;
// y: ImageDefault
first_epoch_for_reinit_
)
{
}
else
if
(
axis
==
x
->
dims
().
size
()
-
3
)
{
last_x_dims_
=
x_dims
;
kernel_func_name_
=
"channel_add"
;
// y: ImageFolder
first_epoch_for_reinit_
=
false
;
// choose kernel
auto
*
x
=
ele_param_
->
X
;
auto
*
y
=
ele_param_
->
Y
;
auto
*
out
=
ele_param_
->
Out
;
auto
axis
=
ele_param_
->
axis
;
if
(
y
->
dims
().
size
()
==
4
)
{
kernel_func_name_
=
"elementwise_add"
;
// y: ImageDefault
}
else
if
(
y
->
dims
().
size
()
==
1
)
{
if
(
axis
==
x
->
dims
().
size
()
-
1
)
{
kernel_func_name_
=
"width_add"
;
// y: ImageDefault
}
else
if
(
axis
==
x
->
dims
().
size
()
-
3
)
{
kernel_func_name_
=
"channel_add"
;
// y: ImageFolder
}
else
{
LOG
(
FATAL
)
<<
"ElementwiseAddImage doesn't support axis:"
<<
axis
<<
", x->dims().size():"
<<
x
->
dims
().
size
()
<<
", y->dims.size():"
<<
y
->
dims
().
size
();
}
}
else
{
}
else
{
LOG
(
FATAL
)
<<
"ElementwiseAddImage doesn't support axis:"
<<
axis
LOG
(
FATAL
)
<<
"ElementwiseAddImage doesn't support axis:"
<<
axis
<<
", x->dims().size():"
<<
x
->
dims
().
size
()
<<
", x->dims().size():"
<<
x
->
dims
().
size
()
<<
", y->dims.size():"
<<
y
->
dims
().
size
();
<<
", y->dims.size():"
<<
y
->
dims
().
size
();
}
}
}
else
{
VLOG
(
1
)
<<
"kernel_func_name_:"
<<
kernel_func_name_
;
LOG
(
FATAL
)
<<
"ElementwiseAddImage doesn't support axis:"
<<
axis
<<
", x->dims().size():"
<<
x
->
dims
().
size
()
auto
&
context
=
ctx_
->
As
<
OpenCLContext
>
();
<<
", y->dims.size():"
<<
y
->
dims
().
size
();
context
.
cl_context
()
->
AddKernel
(
kernel_func_name_
,
"image/elementwise_add_kernel.cl"
,
build_options_
,
time_stamp_
);
STL
::
stringstream
kernel_key
;
kernel_key
<<
kernel_func_name_
<<
build_options_
<<
time_stamp_
;
kernel_
=
context
.
cl_context
()
->
GetKernel
(
kernel_key
.
str
());
// compute image shape
paddle
::
lite
::
CLImageConverterDefault
default_convertor
;
x_img_shape_
=
default_convertor
.
InitImageDimInfoWith
(
x
->
dims
());
// w, h
y_img_shape_
=
default_convertor
.
InitImageDimInfoWith
(
y
->
dims
());
out_img_shape_
=
default_convertor
.
InitImageDimInfoWith
(
out
->
dims
());
// w, h
// compute global work size
GetGlobalWorkSize
();
}
}
VLOG
(
1
)
<<
"kernel_func_name_:"
<<
kernel_func_name_
;
}
auto
&
context
=
ctx_
->
As
<
OpenCLContext
>
();
void
ElementwiseAddImageCompute
::
GetGlobalWorkSize
()
{
context
.
cl_context
()
->
AddKernel
(
global_work_size_
=
cl
::
NDRange
{
static_cast
<
cl
::
size_type
>
(
x_img_shape_
[
0
]),
kernel_func_name_
,
"image/elementwise_add_kernel.cl"
,
build_options_
);
static_cast
<
cl
::
size_type
>
(
x_img_shape_
[
1
])};
#ifndef LITE_SHUTDOWN_LOG
VLOG
(
4
)
<<
"global_work_size:[2D]:"
<<
x_img_shape_
[
0
]
<<
" "
<<
x_img_shape_
[
1
];
#endif
}
}
void
ElementwiseAddImageCompute
::
Run
()
{
void
ElementwiseAddImageCompute
::
Run
()
{
auto
&
context
=
ctx_
->
As
<
OpenCLContext
>
();
CHECK
(
context
.
cl_context
()
!=
nullptr
);
auto
*
x
=
ele_param_
->
X
;
auto
*
x
=
ele_param_
->
X
;
auto
*
y
=
ele_param_
->
Y
;
auto
*
y
=
ele_param_
->
Y
;
auto
*
out
=
ele_param_
->
Out
;
auto
*
out
=
ele_param_
->
Out
;
auto
axis
=
ele_param_
->
axis
;
auto
axis
=
ele_param_
->
axis
;
auto
x_dims
=
x
->
dims
();
auto
y_dims
=
y
->
dims
();
auto
*
x_img
=
x
->
data
<
half_t
,
cl
::
Image2D
>
();
auto
*
y_img
=
y
->
data
<
half_t
,
cl
::
Image2D
>
();
auto
*
out_img
=
out
->
mutable_data
<
half_t
,
cl
::
Image2D
>
(
out_img_shape_
[
0
],
out_img_shape_
[
1
]);
#ifndef LITE_SHUTDOWN_LOG
#ifndef LITE_SHUTDOWN_LOG
VLOG
(
4
)
<<
"x->target():"
<<
TargetToStr
(
x
->
target
());
VLOG
(
4
)
<<
"x->target():"
<<
TargetToStr
(
x
->
target
());
...
@@ -70,75 +110,53 @@ void ElementwiseAddImageCompute::Run() {
...
@@ -70,75 +110,53 @@ void ElementwiseAddImageCompute::Run() {
VLOG
(
4
)
<<
"y->dims():"
<<
y
->
dims
();
VLOG
(
4
)
<<
"y->dims():"
<<
y
->
dims
();
VLOG
(
4
)
<<
"out->dims():"
<<
out
->
dims
();
VLOG
(
4
)
<<
"out->dims():"
<<
out
->
dims
();
VLOG
(
4
)
<<
"axis:"
<<
axis
;
VLOG
(
4
)
<<
"axis:"
<<
axis
;
#endif
paddle
::
lite
::
CLImageConverterDefault
default_convertor
;
auto
x_img_shape
=
default_convertor
.
InitImageDimInfoWith
(
x
->
dims
());
// w, h
auto
x_img_width
=
x_img_shape
[
0
];
auto
x_img_height
=
x_img_shape
[
1
];
auto
out_img_shape
=
default_convertor
.
InitImageDimInfoWith
(
out
->
dims
());
// w, h
auto
y_img_shape
=
default_convertor
.
InitImageDimInfoWith
(
y
->
dims
());
auto
*
x_img
=
x
->
data
<
half_t
,
cl
::
Image2D
>
();
VLOG
(
4
)
<<
"x_img_shape_[w,h]:"
<<
x_img_shape_
[
0
]
<<
" "
<<
x_img_shape_
[
1
];
auto
*
y_img
=
y
->
data
<
half_t
,
cl
::
Image2D
>
();
VLOG
(
4
)
<<
"y_img_shape_[w,h]:"
<<
y_img_shape_
[
0
]
<<
" "
<<
y_img_shape_
[
1
];
auto
*
out_img
=
out
->
mutable_data
<
half_t
,
cl
::
Image2D
>
(
out_img_shape
[
0
],
VLOG
(
4
)
<<
"out_img_shape_[w,h]:"
<<
out_img_shape_
[
0
]
<<
" "
out_img_shape
[
1
]);
<<
out_img_shape_
[
1
];
#ifndef LITE_SHUTDOWN_LOG
VLOG
(
4
)
<<
"x_img_shape[w,h]:"
<<
x_img_width
<<
" "
<<
x_img_height
;
VLOG
(
4
)
<<
"y_img_shape[w,h]:"
<<
y_img_shape
[
0
]
<<
" "
<<
y_img_shape
[
1
];
VLOG
(
4
)
<<
"out_img_shape[w,h]:"
<<
out_img_shape
[
0
]
<<
" "
<<
out_img_shape
[
1
];
#endif
#endif
STL
::
stringstream
kernel_key
;
cl_int
status
;
kernel_key
<<
kernel_func_name_
<<
build_options_
;
auto
kernel
=
kernel_
;
auto
kernel
=
context
.
cl_context
()
->
GetKernel
(
kernel_key
.
str
());
int
arg_idx
=
0
;
auto
y_dims
=
y
->
dims
();
if
(
y_dims
.
size
()
==
4
)
{
if
(
y_dims
.
size
()
==
4
)
{
cl_int
status
=
kernel
.
setArg
(
arg_idx
,
*
x_img
);
status
=
kernel
.
setArg
(
0
,
*
x_img
);
CL_CHECK_FATAL
(
status
);
CL_CHECK_FATAL
(
status
);
status
=
kernel
.
setArg
(
++
arg_idx
,
*
y_img
);
status
=
kernel
.
setArg
(
1
,
*
y_img
);
CL_CHECK_FATAL
(
status
);
CL_CHECK_FATAL
(
status
);
status
=
kernel
.
setArg
(
++
arg_idx
,
*
out_img
);
status
=
kernel
.
setArg
(
2
,
*
out_img
);
CL_CHECK_FATAL
(
status
);
CL_CHECK_FATAL
(
status
);
}
else
if
(
y_dims
.
size
()
==
1
)
{
}
else
if
(
y_dims
.
size
()
==
1
)
{
if
(
axis
==
x
->
dims
().
size
()
-
1
||
axis
==
x
->
dims
()
.
size
()
-
3
)
{
if
(
axis
==
x
_dims
.
size
()
-
1
||
axis
==
x_dims
.
size
()
-
3
)
{
int
tensor_w
=
x
->
dims
()[
x
->
dims
()
.
size
()
-
1
];
const
int
tensor_w
=
x_dims
[
x_dims
.
size
()
-
1
];
#ifndef LITE_SHUTDOWN_LOG
#ifndef LITE_SHUTDOWN_LOG
VLOG
(
4
)
<<
"tensor_w:"
<<
tensor_w
;
VLOG
(
4
)
<<
"tensor_w:"
<<
tensor_w
;
#endif
#endif
cl_int
status
=
kernel
.
setArg
(
arg_idx
,
*
x_img
);
status
=
kernel
.
setArg
(
0
,
*
x_img
);
CL_CHECK_FATAL
(
status
);
CL_CHECK_FATAL
(
status
);
status
=
kernel
.
setArg
(
++
arg_idx
,
*
y_img
);
status
=
kernel
.
setArg
(
1
,
*
y_img
);
CL_CHECK_FATAL
(
status
);
CL_CHECK_FATAL
(
status
);
status
=
kernel
.
setArg
(
++
arg_idx
,
*
out_img
);
status
=
kernel
.
setArg
(
2
,
*
out_img
);
CL_CHECK_FATAL
(
status
);
CL_CHECK_FATAL
(
status
);
status
=
kernel
.
setArg
(
++
arg_idx
,
static_cast
<
const
int
>
(
tensor_w
)
);
status
=
kernel
.
setArg
(
3
,
tensor_w
);
CL_CHECK_FATAL
(
status
);
CL_CHECK_FATAL
(
status
);
}
else
{
}
else
{
LOG
(
FATAL
)
<<
"ElementwiseAddImage doesn't support axis:"
<<
axis
LOG
(
FATAL
)
<<
"ElementwiseAddImage doesn't support axis:"
<<
axis
<<
", x->dims().size():"
<<
x
->
dims
()
.
size
()
<<
", x->dims().size():"
<<
x
_dims
.
size
()
<<
", y->dims.size():"
<<
y
->
dims
()
.
size
();
<<
", y->dims.size():"
<<
y
_dims
.
size
();
}
}
}
else
{
}
else
{
LOG
(
FATAL
)
<<
"ElementwiseAddImage doesn't support axis:"
<<
axis
LOG
(
FATAL
)
<<
"ElementwiseAddImage doesn't support axis:"
<<
axis
<<
", x->dims().size():"
<<
x
->
dims
()
.
size
()
<<
", x->dims().size():"
<<
x
_dims
.
size
()
<<
", y->dims.size():"
<<
y
->
dims
()
.
size
();
<<
", y->dims.size():"
<<
y
_dims
.
size
();
}
}
auto
global_work_size
=
cl
::
NDRange
{
static_cast
<
cl
::
size_type
>
(
x_img_width
),
auto
&
context
=
ctx_
->
As
<
OpenCLContext
>
();
static_cast
<
cl
::
size_type
>
(
x_img_height
)};
CHECK
(
context
.
cl_context
()
!=
nullptr
);
#ifndef LITE_SHUTDOWN_LOG
status
=
context
.
cl_context
()
->
GetCommandQueue
().
enqueueNDRangeKernel
(
VLOG
(
4
)
<<
"global_work_size:[2D]:"
<<
x_img_width
<<
" "
<<
x_img_height
;
#endif
auto
status
=
context
.
cl_context
()
->
GetCommandQueue
().
enqueueNDRangeKernel
(
kernel
,
kernel
,
cl
::
NullRange
,
cl
::
NullRange
,
global_work_size
,
global_work_size
_
,
cl
::
NullRange
,
cl
::
NullRange
,
nullptr
,
nullptr
,
event_
.
get
());
event_
.
get
());
...
...
lite/kernels/opencl/elementwise_add_image_compute.h
浏览文件 @
917620e8
...
@@ -15,8 +15,10 @@
...
@@ -15,8 +15,10 @@
#include <memory>
#include <memory>
#include <string>
#include <string>
#include <vector>
#include "lite/backends/opencl/cl_half.h"
#include "lite/backends/opencl/cl_half.h"
#include "lite/core/kernel.h"
#include "lite/core/kernel.h"
#include "lite/kernels/opencl/image_helper.h"
#include "lite/operators/op_params.h"
#include "lite/operators/op_params.h"
#include "lite/utils/cp_logging.h"
#include "lite/utils/cp_logging.h"
...
@@ -34,6 +36,10 @@ class ElementwiseAddImageCompute
...
@@ -34,6 +36,10 @@ class ElementwiseAddImageCompute
void
PrepareForRun
()
override
;
void
PrepareForRun
()
override
;
void
ReInitWhenNeeded
()
override
;
void
GetGlobalWorkSize
();
void
Run
()
override
;
void
Run
()
override
;
std
::
string
doc
()
const
override
{
std
::
string
doc
()
const
override
{
...
@@ -42,8 +48,21 @@ class ElementwiseAddImageCompute
...
@@ -42,8 +48,21 @@ class ElementwiseAddImageCompute
protected:
protected:
param_t
*
ele_param_
{
nullptr
};
param_t
*
ele_param_
{
nullptr
};
DDim
last_x_dims_
;
DDim
x_img_shape_
=
DDim
(
std
::
vector
<
DDim
::
value_type
>
(
{
static_cast
<
DDim
::
value_type
>
(
1
),
static_cast
<
DDim
::
value_type
>
(
1
)}));
DDim
y_img_shape_
=
DDim
(
std
::
vector
<
DDim
::
value_type
>
(
{
static_cast
<
DDim
::
value_type
>
(
1
),
static_cast
<
DDim
::
value_type
>
(
1
)}));
DDim
out_img_shape_
=
DDim
(
std
::
vector
<
DDim
::
value_type
>
(
{
static_cast
<
DDim
::
value_type
>
(
1
),
static_cast
<
DDim
::
value_type
>
(
1
)}));
std
::
string
kernel_func_name_
{
"elementwise_add"
};
std
::
string
kernel_func_name_
{
"elementwise_add"
};
std
::
string
build_options_
{
"-DCL_DTYPE_half"
};
std
::
string
build_options_
{
"-DCL_DTYPE_half"
};
std
::
string
time_stamp_
{
GetTimeStamp
()};
bool
first_epoch_for_reinit_
{
true
};
cl
::
Kernel
kernel_
;
cl
::
NDRange
global_work_size_
=
cl
::
NDRange
{
static_cast
<
size_t
>
(
1
),
static_cast
<
size_t
>
(
1
),
static_cast
<
size_t
>
(
1
)};
std
::
shared_ptr
<
cl
::
Event
>
event_
{
new
cl
::
Event
};
std
::
shared_ptr
<
cl
::
Event
>
event_
{
new
cl
::
Event
};
};
};
...
...
lite/kernels/opencl/elementwise_mul_image_compute.cc
浏览文件 @
917620e8
...
@@ -71,8 +71,10 @@ class ElementwiseMulImageCompute
...
@@ -71,8 +71,10 @@ class ElementwiseMulImageCompute
VLOG
(
4
)
<<
"bias_dims.size():"
<<
bias_dims
.
size
();
VLOG
(
4
)
<<
"bias_dims.size():"
<<
bias_dims
.
size
();
auto
&
context
=
ctx_
->
As
<
OpenCLContext
>
();
auto
&
context
=
ctx_
->
As
<
OpenCLContext
>
();
context
.
cl_context
()
->
AddKernel
(
context
.
cl_context
()
->
AddKernel
(
kernel_func_name_
,
kernel_func_name_
,
"image/elementwise_mul_kernel.cl"
,
build_options_
);
"image/elementwise_mul_kernel.cl"
,
build_options_
,
time_stamp_
);
}
}
void
Run
()
override
{
void
Run
()
override
{
...
@@ -114,7 +116,7 @@ class ElementwiseMulImageCompute
...
@@ -114,7 +116,7 @@ class ElementwiseMulImageCompute
#endif
#endif
STL
::
stringstream
kernel_key
;
STL
::
stringstream
kernel_key
;
kernel_key
<<
kernel_func_name_
<<
build_options_
;
kernel_key
<<
kernel_func_name_
<<
build_options_
<<
time_stamp_
;
auto
kernel
=
context
.
cl_context
()
->
GetKernel
(
kernel_key
.
str
());
auto
kernel
=
context
.
cl_context
()
->
GetKernel
(
kernel_key
.
str
());
auto
bias_dims
=
y
->
dims
();
auto
bias_dims
=
y
->
dims
();
...
@@ -201,6 +203,7 @@ class ElementwiseMulImageCompute
...
@@ -201,6 +203,7 @@ class ElementwiseMulImageCompute
param_t
*
ele_param_
{
nullptr
};
param_t
*
ele_param_
{
nullptr
};
std
::
string
kernel_func_name_
{
"elementwise_mul"
};
std
::
string
kernel_func_name_
{
"elementwise_mul"
};
std
::
string
build_options_
{
"-DCL_DTYPE_half"
};
std
::
string
build_options_
{
"-DCL_DTYPE_half"
};
std
::
string
time_stamp_
{
GetTimeStamp
()};
std
::
shared_ptr
<
cl
::
Event
>
event_
{
new
cl
::
Event
};
std
::
shared_ptr
<
cl
::
Event
>
event_
{
new
cl
::
Event
};
};
};
...
...
lite/kernels/opencl/elementwise_sub_image_compute.cc
浏览文件 @
917620e8
...
@@ -49,8 +49,10 @@ void ElementwiseSubImageCompute::PrepareForRun() {
...
@@ -49,8 +49,10 @@ void ElementwiseSubImageCompute::PrepareForRun() {
VLOG
(
1
)
<<
"kernel_func_name_:"
<<
kernel_func_name_
;
VLOG
(
1
)
<<
"kernel_func_name_:"
<<
kernel_func_name_
;
auto
&
context
=
ctx_
->
As
<
OpenCLContext
>
();
auto
&
context
=
ctx_
->
As
<
OpenCLContext
>
();
context
.
cl_context
()
->
AddKernel
(
context
.
cl_context
()
->
AddKernel
(
kernel_func_name_
,
kernel_func_name_
,
"image/elementwise_sub_kernel.cl"
,
build_options_
);
"image/elementwise_sub_kernel.cl"
,
build_options_
,
time_stamp_
);
}
}
void
ElementwiseSubImageCompute
::
Run
()
{
void
ElementwiseSubImageCompute
::
Run
()
{
...
@@ -93,7 +95,7 @@ void ElementwiseSubImageCompute::Run() {
...
@@ -93,7 +95,7 @@ void ElementwiseSubImageCompute::Run() {
#endif
#endif
STL
::
stringstream
kernel_key
;
STL
::
stringstream
kernel_key
;
kernel_key
<<
kernel_func_name_
<<
build_options_
;
kernel_key
<<
kernel_func_name_
<<
build_options_
<<
time_stamp_
;
auto
kernel
=
context
.
cl_context
()
->
GetKernel
(
kernel_key
.
str
());
auto
kernel
=
context
.
cl_context
()
->
GetKernel
(
kernel_key
.
str
());
int
arg_idx
=
0
;
int
arg_idx
=
0
;
...
...
lite/kernels/opencl/elementwise_sub_image_compute.h
浏览文件 @
917620e8
...
@@ -17,6 +17,7 @@
...
@@ -17,6 +17,7 @@
#include <string>
#include <string>
#include "lite/backends/opencl/cl_half.h"
#include "lite/backends/opencl/cl_half.h"
#include "lite/core/kernel.h"
#include "lite/core/kernel.h"
#include "lite/kernels/opencl/image_helper.h"
#include "lite/operators/op_params.h"
#include "lite/operators/op_params.h"
#include "lite/utils/cp_logging.h"
#include "lite/utils/cp_logging.h"
...
@@ -44,6 +45,7 @@ class ElementwiseSubImageCompute
...
@@ -44,6 +45,7 @@ class ElementwiseSubImageCompute
param_t
*
ele_param_
{
nullptr
};
param_t
*
ele_param_
{
nullptr
};
std
::
string
kernel_func_name_
{
"elementwise_sub"
};
std
::
string
kernel_func_name_
{
"elementwise_sub"
};
std
::
string
build_options_
{
"-DCL_DTYPE_half"
};
std
::
string
build_options_
{
"-DCL_DTYPE_half"
};
std
::
string
time_stamp_
{
GetTimeStamp
()};
std
::
shared_ptr
<
cl
::
Event
>
event_
{
new
cl
::
Event
};
std
::
shared_ptr
<
cl
::
Event
>
event_
{
new
cl
::
Event
};
};
};
...
...
lite/kernels/opencl/fc_buffer_compute.cc
浏览文件 @
917620e8
...
@@ -16,6 +16,7 @@
...
@@ -16,6 +16,7 @@
#include "lite/backends/opencl/cl_include.h"
#include "lite/backends/opencl/cl_include.h"
#include "lite/core/kernel.h"
#include "lite/core/kernel.h"
#include "lite/core/op_registry.h"
#include "lite/core/op_registry.h"
#include "lite/kernels/opencl/image_helper.h"
#include "lite/operators/op_params.h"
#include "lite/operators/op_params.h"
#include "lite/utils/replace_stl/stream.h"
#include "lite/utils/replace_stl/stream.h"
#include "lite/utils/string.h"
#include "lite/utils/string.h"
...
@@ -30,74 +31,98 @@ class FcCompute
...
@@ -30,74 +31,98 @@ class FcCompute
public:
public:
using
param_t
=
operators
::
FcParam
;
using
param_t
=
operators
::
FcParam
;
void
PrepareForRun
()
override
{
void
PrepareForRun
()
override
{}
const
auto
&
param
=
*
param_
.
get_mutable
<
param_t
>
();
const
auto
x_dims
=
param
.
input
->
dims
();
const
auto
w_dims
=
param
.
w
->
dims
();
CHECK_GE
(
x_dims
.
size
(),
2UL
);
CHECK_GE
(
w_dims
.
size
(),
2UL
);
CHECK_EQ
(
param
.
output
->
dims
().
size
(),
2UL
);
m_
=
x_dims
.
Slice
(
0
,
param
.
in_num_col_dims
).
production
();
k_
=
x_dims
.
Slice
(
param
.
in_num_col_dims
,
x_dims
.
size
()).
production
();
n_
=
w_dims
[
1
];
CHECK_EQ
(
k_
,
static_cast
<
int
>
(
w_dims
[
0
]));
VLOG
(
4
)
<<
"x_dims:"
<<
x_dims
[
0
]
<<
" "
<<
x_dims
[
1
]
<<
" "
<<
x_dims
[
2
]
<<
" "
<<
x_dims
[
3
];
VLOG
(
4
)
<<
"w_dims:"
<<
w_dims
[
0
]
<<
" "
<<
w_dims
[
1
]
<<
" "
<<
w_dims
[
2
]
<<
" "
<<
w_dims
[
3
];
VLOG
(
4
)
<<
"m_: "
<<
m_
<<
" n_: "
<<
n_
<<
" k_: "
<<
k_
;
void
ReInitWhenNeeded
()
override
{
fc_param_
=
param_
.
get_mutable
<
param_t
>
();
const
auto
x_dims
=
fc_param_
->
input
->
dims
();
if
((
!
first_epoch_for_reinit_
&&
x_dims
!=
last_x_dims_
)
||
first_epoch_for_reinit_
)
{
last_x_dims_
=
x_dims
;
first_epoch_for_reinit_
=
false
;
// compute m,n,k
const
auto
w_dims
=
fc_param_
->
w
->
dims
();
CHECK_GE
(
x_dims
.
size
(),
2UL
);
CHECK_GE
(
w_dims
.
size
(),
2UL
);
CHECK_EQ
(
fc_param_
->
output
->
dims
().
size
(),
2UL
);
m_
=
x_dims
.
Slice
(
0
,
fc_param_
->
in_num_col_dims
).
production
();
k_
=
x_dims
.
Slice
(
fc_param_
->
in_num_col_dims
,
x_dims
.
size
()).
production
();
n_
=
w_dims
[
1
];
CHECK_EQ
(
k_
,
static_cast
<
int
>
(
w_dims
[
0
]));
#ifndef LITE_SHUTDOWN_LOG
VLOG
(
4
)
<<
"x_dims:"
<<
x_dims
[
0
]
<<
" "
<<
x_dims
[
1
]
<<
" "
<<
x_dims
[
2
]
<<
" "
<<
x_dims
[
3
];
VLOG
(
4
)
<<
"w_dims:"
<<
w_dims
[
0
]
<<
" "
<<
w_dims
[
1
]
<<
" "
<<
w_dims
[
2
]
<<
" "
<<
w_dims
[
3
];
VLOG
(
4
)
<<
"m_: "
<<
m_
<<
" n_: "
<<
n_
<<
" k_: "
<<
k_
;
#endif
// choose kernel
if
(
m_
==
1
)
{
// gemv
kernel_func_name_
=
"fc_gemv_1x4"
;
}
else
{
// gemm
kernel_func_name_
=
"fc_gemm_4x4"
;
}
#ifndef LITE_SHUTDOWN_LOG
VLOG
(
1
)
<<
"kernel_func_name_:"
<<
kernel_func_name_
;
#endif
if
(
fc_param_
->
activation_type
==
"relu"
)
{
build_options_
+=
"-DRELU"
;
}
auto
&
context
=
ctx_
->
As
<
OpenCLContext
>
();
context
.
cl_context
()
->
AddKernel
(
kernel_func_name_
,
"buffer/fc_kernel.cl"
,
build_options_
,
time_stamp_
);
STL
::
stringstream
kernel_key
;
kernel_key
<<
kernel_func_name_
<<
build_options_
<<
time_stamp_
;
kernel_
=
context
.
cl_context
()
->
GetKernel
(
kernel_key
.
str
());
// compute global work size
GetGlobalWorkSize
();
}
}
void
GetGlobalWorkSize
()
{
if
(
m_
==
1
)
{
// gemv
if
(
m_
==
1
)
{
// gemv
kernel_func_name_
=
"fc_gemv_1x4"
;
global_work_size_
=
cl
::
NDRange
{
static_cast
<
size_t
>
((
n_
+
3
)
/
4
)};
global_work_size_
=
cl
::
NDRange
{
static_cast
<
size_t
>
((
n_
+
3
)
/
4
)};
}
else
{
// gemm
}
else
{
// gemm
kernel_func_name_
=
"fc_gemm_4x4"
;
global_work_size_
=
cl
::
NDRange
{
static_cast
<
size_t
>
((
m_
+
3
)
/
4
),
global_work_size_
=
cl
::
NDRange
{
static_cast
<
size_t
>
((
m_
+
3
)
/
4
),
static_cast
<
size_t
>
((
n_
+
3
)
/
4
)};
static_cast
<
size_t
>
((
n_
+
3
)
/
4
)};
}
}
VLOG
(
1
)
<<
"kernel_func_name_:"
<<
kernel_func_name_
;
if
(
param
.
activation_type
==
"relu"
)
{
build_options_
+=
"-DRELU"
;
}
auto
&
context
=
ctx_
->
As
<
OpenCLContext
>
();
context
.
cl_context
()
->
AddKernel
(
kernel_func_name_
,
"buffer/fc_kernel.cl"
,
build_options_
);
}
}
void
Run
()
override
{
void
Run
()
override
{
const
auto
&
param
=
*
param_
.
get_mutable
<
param_t
>
();
auto
*
x_buf
=
fc_param_
->
input
->
data
<
float
,
cl
::
Buffer
>
();
auto
&
context
=
ctx_
->
As
<
OpenCLContext
>
();
auto
*
w_buf
=
fc_param_
->
w
->
data
<
float
,
cl
::
Buffer
>
();
CHECK
(
context
.
cl_context
()
!=
nullptr
);
auto
*
bias_buf
=
fc_param_
->
bias
->
data
<
float
,
cl
::
Buffer
>
();
auto
*
x_buf
=
param
.
input
->
data
<
float
,
cl
::
Buffer
>
();
auto
*
w_buf
=
param
.
w
->
data
<
float
,
cl
::
Buffer
>
();
auto
*
bias_buf
=
param
.
bias
->
data
<
float
,
cl
::
Buffer
>
();
auto
*
out_buf
=
auto
*
out_buf
=
param
.
output
->
mutable_data
<
float
,
cl
::
Buffer
>
(
TARGET
(
kOpenCL
));
fc_param_
->
output
->
mutable_data
<
float
,
cl
::
Buffer
>
(
TARGET
(
kOpenCL
));
STL
::
stringstream
kernel_key
;
kernel_key
<<
kernel_func_name_
<<
build_options_
;
auto
kernel
=
context
.
cl_context
()
->
GetKernel
(
kernel_key
.
str
());
auto
kernel
=
kernel_
;
cl_int
status
;
cl_int
status
;
int
arg_idx
=
0
;
status
=
kernel
.
setArg
(
0
,
*
x_buf
);
status
=
kernel
.
setArg
(
arg_idx
,
*
x_buf
);
CL_CHECK_FATAL
(
status
);
CL_CHECK_FATAL
(
status
);
status
=
kernel
.
setArg
(
++
arg_idx
,
*
w_buf
);
status
=
kernel
.
setArg
(
1
,
*
w_buf
);
CL_CHECK_FATAL
(
status
);
CL_CHECK_FATAL
(
status
);
status
=
kernel
.
setArg
(
++
arg_idx
,
*
bias_buf
);
status
=
kernel
.
setArg
(
2
,
*
bias_buf
);
CL_CHECK_FATAL
(
status
);
CL_CHECK_FATAL
(
status
);
status
=
kernel
.
setArg
(
++
arg_idx
,
*
out_buf
);
status
=
kernel
.
setArg
(
3
,
*
out_buf
);
CL_CHECK_FATAL
(
status
);
CL_CHECK_FATAL
(
status
);
status
=
kernel
.
setArg
(
++
arg_idx
,
static_cast
<
const
int
>
(
m_
));
status
=
kernel
.
setArg
(
4
,
static_cast
<
const
int
>
(
m_
));
CL_CHECK_FATAL
(
status
);
CL_CHECK_FATAL
(
status
);
status
=
kernel
.
setArg
(
++
arg_idx
,
static_cast
<
const
int
>
(
n_
));
status
=
kernel
.
setArg
(
5
,
static_cast
<
const
int
>
(
n_
));
CL_CHECK_FATAL
(
status
);
CL_CHECK_FATAL
(
status
);
status
=
kernel
.
setArg
(
++
arg_idx
,
static_cast
<
const
int
>
(
k_
));
status
=
kernel
.
setArg
(
6
,
static_cast
<
const
int
>
(
k_
));
CL_CHECK_FATAL
(
status
);
CL_CHECK_FATAL
(
status
);
auto
&
context
=
ctx_
->
As
<
OpenCLContext
>
();
CHECK
(
context
.
cl_context
()
!=
nullptr
);
status
=
context
.
cl_context
()
->
GetCommandQueue
().
enqueueNDRangeKernel
(
status
=
context
.
cl_context
()
->
GetCommandQueue
().
enqueueNDRangeKernel
(
kernel
,
kernel
,
cl
::
NullRange
,
cl
::
NullRange
,
...
@@ -111,9 +136,14 @@ class FcCompute
...
@@ -111,9 +136,14 @@ class FcCompute
private:
private:
int
m_
,
n_
,
k_
;
int
m_
,
n_
,
k_
;
param_t
*
fc_param_
{
nullptr
};
std
::
string
kernel_func_name_
{};
std
::
string
kernel_func_name_
{};
std
::
string
build_options_
{
"-DCL_DTYPE_float "
};
std
::
string
build_options_
{
"-DCL_DTYPE_float "
};
std
::
string
time_stamp_
{
GetTimeStamp
()};
bool
first_epoch_for_reinit_
{
true
};
DDim
last_x_dims_
;
cl
::
NDRange
global_work_size_
;
cl
::
NDRange
global_work_size_
;
cl
::
Kernel
kernel_
;
std
::
shared_ptr
<
cl
::
Event
>
event_
{
new
cl
::
Event
};
std
::
shared_ptr
<
cl
::
Event
>
event_
{
new
cl
::
Event
};
};
};
...
...
lite/kernels/opencl/fusion_elementwise_add_activation_buffer_compute.cc
浏览文件 @
917620e8
...
@@ -28,8 +28,10 @@ class FusionElementwiseAddActivationCompute : public ElementwiseAddCompute {
...
@@ -28,8 +28,10 @@ class FusionElementwiseAddActivationCompute : public ElementwiseAddCompute {
void
PrepareForRun
()
override
{
void
PrepareForRun
()
override
{
build_options_
+=
" -DRELU"
;
build_options_
+=
" -DRELU"
;
auto
&
context
=
ctx_
->
As
<
OpenCLContext
>
();
auto
&
context
=
ctx_
->
As
<
OpenCLContext
>
();
context
.
cl_context
()
->
AddKernel
(
context
.
cl_context
()
->
AddKernel
(
kernel_func_name_
,
kernel_func_name_
,
"buffer/elementwise_add_kernel.cl"
,
build_options_
);
"buffer/elementwise_add_kernel.cl"
,
build_options_
,
time_stamp_
);
ele_param_
=
param_
.
get_mutable
<
param_t
>
();
ele_param_
=
param_
.
get_mutable
<
param_t
>
();
UpdateParams
();
UpdateParams
();
auto
act_t
=
static_cast
<
param_t
*>
(
ele_param_
)
->
act_type
;
auto
act_t
=
static_cast
<
param_t
*>
(
ele_param_
)
->
act_type
;
...
...
lite/kernels/opencl/fusion_elementwise_add_activation_image_compute.cc
浏览文件 @
917620e8
...
@@ -16,6 +16,7 @@
...
@@ -16,6 +16,7 @@
#include "lite/backends/opencl/cl_include.h"
#include "lite/backends/opencl/cl_include.h"
#include "lite/core/op_registry.h"
#include "lite/core/op_registry.h"
#include "lite/kernels/opencl/elementwise_add_image_compute.h"
#include "lite/kernels/opencl/elementwise_add_image_compute.h"
#include "lite/kernels/opencl/image_helper.h"
namespace
paddle
{
namespace
paddle
{
namespace
lite
{
namespace
lite
{
...
@@ -30,8 +31,10 @@ class FusionElementwiseAddActivationImageCompute
...
@@ -30,8 +31,10 @@ class FusionElementwiseAddActivationImageCompute
void
PrepareForRun
()
override
{
void
PrepareForRun
()
override
{
build_options_
+=
" -DRELU"
;
build_options_
+=
" -DRELU"
;
auto
&
context
=
ctx_
->
As
<
OpenCLContext
>
();
auto
&
context
=
ctx_
->
As
<
OpenCLContext
>
();
context
.
cl_context
()
->
AddKernel
(
context
.
cl_context
()
->
AddKernel
(
kernel_func_name_
,
kernel_func_name_
,
"image/elementwise_add_kernel.cl"
,
build_options_
);
"image/elementwise_add_kernel.cl"
,
build_options_
,
time_stamp_
);
ele_param_
=
param_
.
get_mutable
<
param_t
>
();
ele_param_
=
param_
.
get_mutable
<
param_t
>
();
auto
act_t
=
static_cast
<
param_t
*>
(
ele_param_
)
->
act_type
;
auto
act_t
=
static_cast
<
param_t
*>
(
ele_param_
)
->
act_type
;
VLOG
(
4
)
<<
"act: "
<<
act_t
;
VLOG
(
4
)
<<
"act: "
<<
act_t
;
...
...
lite/kernels/opencl/grid_sampler_image_compute.cc
浏览文件 @
917620e8
...
@@ -39,96 +39,120 @@ class GridSamplerImageCompute : public KernelLite<TARGET(kOpenCL),
...
@@ -39,96 +39,120 @@ class GridSamplerImageCompute : public KernelLite<TARGET(kOpenCL),
}
}
void
PrepareForRun
()
override
{
void
PrepareForRun
()
override
{
auto
&
context
=
ctx_
->
As
<
OpenCLContext
>
();
context
.
cl_context
()
->
AddKernel
(
kernel_func_name_
,
"image/grid_sampler_kernel.cl"
,
build_options_
,
time_stamp_
);
VLOG
(
1
)
<<
"kernel_func_name_:"
<<
kernel_func_name_
;
STL
::
stringstream
kernel_key
;
kernel_key
<<
kernel_func_name_
<<
build_options_
<<
time_stamp_
;
kernel_
=
context
.
cl_context
()
->
GetKernel
(
kernel_key
.
str
());
VLOG
(
4
)
<<
"kernel_key: "
<<
kernel_key
.
str
();
}
void
ReInitWhenNeeded
()
override
{
grid_param_
=
param_
.
get_mutable
<
param_t
>
();
grid_param_
=
param_
.
get_mutable
<
param_t
>
();
auto
x_dims
=
grid_param_
->
x
->
dims
();
if
((
!
first_epoch_for_reinit_
&&
x_dims
!=
last_x_dims_
)
||
first_epoch_for_reinit_
)
{
last_x_dims_
=
x_dims
;
first_epoch_for_reinit_
=
false
;
// compute image shape
paddle
::
lite
::
CLImageConverterDefault
default_convertor
;
out_img_shape_
=
default_convertor
.
InitImageDimInfoWith
(
grid_param_
->
out
->
dims
());
// compute global work size
GetGlobalWorkSize
();
}
}
auto
&
context
=
ctx_
->
As
<
OpenCLContext
>
();
void
GetGlobalWorkSize
()
{
context
.
cl_context
()
->
AddKernel
(
auto
default_work_size
=
kernel_func_name_
,
"image/grid_sampler_kernel.cl"
,
build_options_
);
DefaultWorkSize
(
grid_param_
->
out
->
dims
(),
VLOG
(
4
)
<<
"kernel_func_name_:"
<<
kernel_func_name_
;
DDim
(
std
::
vector
<
DDim
::
value_type
>
{
static_cast
<
int64_t
>
(
out_img_shape_
[
0
]),
static_cast
<
int64_t
>
(
out_img_shape_
[
1
])}));
global_work_size_
=
cl
::
NDRange
{
static_cast
<
cl
::
size_type
>
(
default_work_size
[
0
]),
static_cast
<
cl
::
size_type
>
(
default_work_size
[
1
]),
static_cast
<
cl
::
size_type
>
(
default_work_size
[
2
]
/
4
)};
#ifndef LITE_SHUTDOWN_LOG
VLOG
(
4
)
<<
"default_work_size: "
<<
default_work_size
[
0
]
<<
", "
<<
default_work_size
[
1
]
<<
", "
<<
default_work_size
[
2
];
VLOG
(
4
)
<<
"global_work_size_:[2D]:"
<<
global_work_size_
[
0
]
<<
" "
<<
global_work_size_
[
1
]
<<
" "
<<
global_work_size_
[
2
];
#endif
}
}
void
Run
()
override
{
void
Run
()
override
{
auto
&
context
=
ctx_
->
As
<
OpenCLContext
>
();
CHECK
(
context
.
cl_context
()
!=
nullptr
);
auto
*
x
=
grid_param_
->
x
;
auto
*
x
=
grid_param_
->
x
;
auto
*
out
=
grid_param_
->
out
;
auto
*
grid
=
grid_param_
->
grid
;
auto
*
grid
=
grid_param_
->
grid
;
auto
*
out
=
grid_param_
->
out
;
auto
out_dims
=
out
->
dims
();
auto
out_dims
=
out
->
dims
();
auto
in_dims
=
x
->
dims
();
int
out_height
=
out_dims
[
2
];
int
out_width
=
out_dims
[
3
];
auto
*
x_img
=
x
->
data
<
half_t
,
cl
::
Image2D
>
();
auto
*
grid_img
=
x
->
data
<
half_t
,
cl
::
Image2D
>
();
auto
*
out_img
=
out
->
mutable_data
<
half_t
,
cl
::
Image2D
>
(
out_img_shape_
[
0
],
out_img_shape_
[
1
]);
#ifndef LITE_SHUTDOWN_LOG
#ifndef LITE_SHUTDOWN_LOG
auto
in_dims
=
x
->
dims
();
VLOG
(
4
)
<<
"x->target():"
<<
TargetToStr
(
x
->
target
());
VLOG
(
4
)
<<
"x->target():"
<<
TargetToStr
(
x
->
target
());
VLOG
(
4
)
<<
"out->target():"
<<
TargetToStr
(
out
->
target
());
VLOG
(
4
)
<<
"out->target():"
<<
TargetToStr
(
out
->
target
());
VLOG
(
4
)
<<
"x->dims():"
<<
in_dims
;
VLOG
(
4
)
<<
"x->dims():"
<<
in_dims
;
VLOG
(
4
)
<<
"out->dims():"
<<
out_dims
;
VLOG
(
4
)
<<
"out->dims():"
<<
out_dims
;
#endif
auto
out_image_shape
=
InitImageDimInfoWith
(
out_dims
);
auto
*
x_img
=
x
->
data
<
half_t
,
cl
::
Image2D
>
();
// VLOG(4) << "x_image: " << x_img;
// VLOG(4) << "x_image: " << x_img;
auto
*
grid_img
=
x
->
data
<
half_t
,
cl
::
Image2D
>
();
// VLOG(4) << "grid_img: " << grid_img;
// VLOG(4) << "grid_img: " << grid_img;
auto
*
out_img
=
out
->
mutable_data
<
half_t
,
cl
::
Image2D
>
(
out_image_shape
[
"width"
],
out_image_shape
[
"height"
]);
#ifndef LITE_SHUTDOWN_LOG
// VLOG(4) << "out_image" << out_img;
// VLOG(4) << "out_image" << out_img;
VLOG
(
4
)
<<
"out_im
age_shape[w,h]:"
<<
out_image_shape
[
"width"
]
<<
" "
VLOG
(
4
)
<<
"out_im
g_shape_[w,h]:"
<<
out_img_shape_
[
0
]
<<
" "
<<
out_im
age_shape
[
"height"
];
<<
out_im
g_shape_
[
1
];
#endif
#endif
STL
::
stringstream
kernel_key
;
kernel_key
<<
kernel_func_name_
<<
build_options_
;
auto
kernel
=
context
.
cl_context
()
->
GetKernel
(
kernel_key
.
str
());
int
arg_idx
=
0
;
cl_int
status
;
int
out_height
=
out_dims
[
2
];
auto
kernel
=
kernel_
;
int
out_width
=
out_dims
[
3
];
status
=
kernel
.
setArg
(
0
,
*
x_img
);
auto
default_work_size
=
DefaultWorkSize
(
out_dims
,
DDim
(
std
::
vector
<
DDim
::
value_type
>
{
static_cast
<
int64_t
>
(
out_image_shape
[
"width"
]),
static_cast
<
int64_t
>
(
out_image_shape
[
"height"
])}));
#ifndef LITE_SHUTDOWN_LOG
VLOG
(
4
)
<<
"default_work_size: "
<<
default_work_size
[
0
]
<<
", "
<<
default_work_size
[
1
]
<<
", "
<<
default_work_size
[
2
];
#endif
cl_int
status
=
kernel
.
setArg
(
arg_idx
++
,
*
x_img
);
CL_CHECK_FATAL
(
status
);
CL_CHECK_FATAL
(
status
);
status
=
kernel
.
setArg
(
arg_idx
++
,
*
grid_img
);
status
=
kernel
.
setArg
(
1
,
*
grid_img
);
CL_CHECK_FATAL
(
status
);
CL_CHECK_FATAL
(
status
);
status
=
kernel
.
setArg
(
arg_idx
++
,
*
out_img
);
status
=
kernel
.
setArg
(
2
,
*
out_img
);
CL_CHECK_FATAL
(
status
);
CL_CHECK_FATAL
(
status
);
status
=
kernel
.
setArg
(
arg_idx
++
,
out_height
);
status
=
kernel
.
setArg
(
3
,
out_height
);
CL_CHECK_FATAL
(
status
);
CL_CHECK_FATAL
(
status
);
status
=
kernel
.
setArg
(
arg_idx
++
,
out_width
);
status
=
kernel
.
setArg
(
4
,
out_width
);
CL_CHECK_FATAL
(
status
);
CL_CHECK_FATAL
(
status
);
auto
global_work_size
=
auto
&
context
=
ctx_
->
As
<
OpenCLContext
>
();
cl
::
NDRange
{
static_cast
<
cl
::
size_type
>
(
default_work_size
[
0
]),
CHECK
(
context
.
cl_context
()
!=
nullptr
);
static_cast
<
cl
::
size_type
>
(
default_work_size
[
1
]),
static_cast
<
cl
::
size_type
>
(
default_work_size
[
2
]
/
4
)};
status
=
context
.
cl_context
()
->
GetCommandQueue
().
enqueueNDRangeKernel
(
status
=
context
.
cl_context
()
->
GetCommandQueue
().
enqueueNDRangeKernel
(
kernel
,
kernel
,
cl
::
NullRange
,
cl
::
NullRange
,
global_work_size
,
global_work_size
_
,
cl
::
NullRange
,
cl
::
NullRange
,
nullptr
,
nullptr
,
event_
.
get
());
event_
.
get
());
CL_CHECK_FATAL
(
status
);
CL_CHECK_FATAL
(
status
);
context
.
cl_wait_list
()
->
emplace
(
out_img
,
event_
);
context
.
cl_wait_list
()
->
emplace
(
out_img
,
event_
);
#ifndef LITE_SHUTDOWN_LOG
VLOG
(
4
)
<<
"global_work_size:[2D]:"
<<
global_work_size
[
0
]
<<
" "
<<
global_work_size
[
1
]
<<
" "
<<
global_work_size
[
2
];
#endif
}
}
protected:
protected:
param_t
*
grid_param_
{
nullptr
};
param_t
*
grid_param_
{
nullptr
};
bool
first_epoch_for_reinit_
{
true
};
DDim
last_x_dims_
;
DDim
out_img_shape_
=
DDim
(
std
::
vector
<
DDim
::
value_type
>
(
{
static_cast
<
DDim
::
value_type
>
(
1
),
static_cast
<
DDim
::
value_type
>
(
1
)}));
std
::
string
kernel_func_name_
{
"grid_sampler"
};
std
::
string
kernel_func_name_
{
"grid_sampler"
};
cl
::
Kernel
kernel_
;
cl
::
NDRange
global_work_size_
=
cl
::
NDRange
{
static_cast
<
size_t
>
(
1
),
static_cast
<
size_t
>
(
1
),
static_cast
<
size_t
>
(
1
)};
std
::
string
build_options_
{
"-DCL_DTYPE_half"
};
std
::
string
build_options_
{
"-DCL_DTYPE_half"
};
std
::
string
time_stamp_
{
GetTimeStamp
()};
std
::
shared_ptr
<
cl
::
Event
>
event_
{
new
cl
::
Event
};
std
::
shared_ptr
<
cl
::
Event
>
event_
{
new
cl
::
Event
};
};
};
...
...
lite/kernels/opencl/image_helper.h
浏览文件 @
917620e8
...
@@ -74,6 +74,12 @@ static std::vector<size_t> DefaultWorkSize(const DDim& image_dim,
...
@@ -74,6 +74,12 @@ static std::vector<size_t> DefaultWorkSize(const DDim& image_dim,
LOG
(
FATAL
)
<<
" not support this dim, need imp "
;
LOG
(
FATAL
)
<<
" not support this dim, need imp "
;
}
}
static
const
std
::
string
GetTimeStamp
()
{
struct
timeval
time
;
gettimeofday
(
&
time
,
NULL
);
return
std
::
to_string
(
time
.
tv_usec
);
}
}
// namespace opencl
}
// namespace opencl
}
// namespace kernels
}
// namespace kernels
}
// namespace lite
}
// namespace lite
...
...
lite/kernels/opencl/instance_norm_image_compute.cc
浏览文件 @
917620e8
...
@@ -60,8 +60,10 @@ class InstanceNormImageCompute : public KernelLite<TARGET(kOpenCL),
...
@@ -60,8 +60,10 @@ class InstanceNormImageCompute : public KernelLite<TARGET(kOpenCL),
}
}
auto
&
context
=
ctx_
->
As
<
OpenCLContext
>
();
auto
&
context
=
ctx_
->
As
<
OpenCLContext
>
();
context
.
cl_context
()
->
AddKernel
(
context
.
cl_context
()
->
AddKernel
(
kernel_func_name_
,
kernel_func_name_
,
"image/instance_norm_kernel.cl"
,
build_options_
);
"image/instance_norm_kernel.cl"
,
build_options_
,
time_stamp_
);
VLOG
(
1
)
<<
"kernel_func_name_:"
<<
kernel_func_name_
;
VLOG
(
1
)
<<
"kernel_func_name_:"
<<
kernel_func_name_
;
}
}
...
@@ -115,7 +117,7 @@ class InstanceNormImageCompute : public KernelLite<TARGET(kOpenCL),
...
@@ -115,7 +117,7 @@ class InstanceNormImageCompute : public KernelLite<TARGET(kOpenCL),
out_image_shape
[
"width"
],
out_image_shape
[
"height"
]);
out_image_shape
[
"width"
],
out_image_shape
[
"height"
]);
STL
::
stringstream
kernel_key
;
STL
::
stringstream
kernel_key
;
kernel_key
<<
kernel_func_name_
<<
build_options_
;
kernel_key
<<
kernel_func_name_
<<
build_options_
<<
time_stamp_
;
auto
kernel
=
context
.
cl_context
()
->
GetKernel
(
kernel_key
.
str
());
auto
kernel
=
context
.
cl_context
()
->
GetKernel
(
kernel_key
.
str
());
cl_int
status
=
kernel
.
setArg
(
0
,
out_w
);
cl_int
status
=
kernel
.
setArg
(
0
,
out_w
);
...
@@ -180,8 +182,10 @@ class InstanceNormImageCompute : public KernelLite<TARGET(kOpenCL),
...
@@ -180,8 +182,10 @@ class InstanceNormImageCompute : public KernelLite<TARGET(kOpenCL),
bias_image_
.
mutable_data
<
half_t
,
cl
::
Image2D
>
(
bias_image_
.
mutable_data
<
half_t
,
cl
::
Image2D
>
(
scale_img_size
[
0
],
scale_img_size
[
1
],
bias_img
.
data
());
scale_img_size
[
0
],
scale_img_size
[
1
],
bias_img
.
data
());
auto
&
context
=
ctx_
->
As
<
OpenCLContext
>
();
auto
&
context
=
ctx_
->
As
<
OpenCLContext
>
();
context
.
cl_context
()
->
AddKernel
(
context
.
cl_context
()
->
AddKernel
(
kernel_func_name_
,
kernel_func_name_
,
"image/instance_norm_kernel.cl"
,
build_options_
);
"image/instance_norm_kernel.cl"
,
build_options_
,
time_stamp_
);
VLOG
(
1
)
<<
"kernel_func_name_:"
<<
kernel_func_name_
;
VLOG
(
1
)
<<
"kernel_func_name_:"
<<
kernel_func_name_
;
}
}
...
@@ -234,7 +238,7 @@ class InstanceNormImageCompute : public KernelLite<TARGET(kOpenCL),
...
@@ -234,7 +238,7 @@ class InstanceNormImageCompute : public KernelLite<TARGET(kOpenCL),
#endif
#endif
STL
::
stringstream
kernel_key
;
STL
::
stringstream
kernel_key
;
kernel_key
<<
kernel_func_name_
<<
build_options_
;
kernel_key
<<
kernel_func_name_
<<
build_options_
<<
time_stamp_
;
auto
kernel
=
context
.
cl_context
()
->
GetKernel
(
kernel_key
.
str
());
auto
kernel
=
context
.
cl_context
()
->
GetKernel
(
kernel_key
.
str
());
auto
*
scale_img
=
scale_image_
.
data
<
half_t
,
cl
::
Image2D
>
();
auto
*
scale_img
=
scale_image_
.
data
<
half_t
,
cl
::
Image2D
>
();
auto
*
bias_img
=
bias_image_
.
data
<
half_t
,
cl
::
Image2D
>
();
auto
*
bias_img
=
bias_image_
.
data
<
half_t
,
cl
::
Image2D
>
();
...
@@ -271,6 +275,7 @@ class InstanceNormImageCompute : public KernelLite<TARGET(kOpenCL),
...
@@ -271,6 +275,7 @@ class InstanceNormImageCompute : public KernelLite<TARGET(kOpenCL),
param_t
*
instance_norm_param_
{
nullptr
};
param_t
*
instance_norm_param_
{
nullptr
};
std
::
string
kernel_func_name_
{
"instance_norm_onnx"
};
std
::
string
kernel_func_name_
{
"instance_norm_onnx"
};
std
::
string
build_options_
{
"-DCL_DTYPE_half"
};
std
::
string
build_options_
{
"-DCL_DTYPE_half"
};
std
::
string
time_stamp_
{
GetTimeStamp
()};
std
::
shared_ptr
<
cl
::
Event
>
event_
{
new
cl
::
Event
};
std
::
shared_ptr
<
cl
::
Event
>
event_
{
new
cl
::
Event
};
Tensor
scale_image_
;
Tensor
scale_image_
;
Tensor
bias_image_
;
Tensor
bias_image_
;
...
...
lite/kernels/opencl/lrn_image_compute.cc
浏览文件 @
917620e8
...
@@ -48,7 +48,7 @@ class LrnImageCompute : public KernelLite<TARGET(kOpenCL),
...
@@ -48,7 +48,7 @@ class LrnImageCompute : public KernelLite<TARGET(kOpenCL),
beta_
=
lrn_param_
->
beta
;
beta_
=
lrn_param_
->
beta
;
norm_region_
=
lrn_param_
->
norm_region
;
norm_region_
=
lrn_param_
->
norm_region
;
context
.
cl_context
()
->
AddKernel
(
context
.
cl_context
()
->
AddKernel
(
kernel_func_name_
,
"image/lrn_kernel.cl"
,
build_options_
);
kernel_func_name_
,
"image/lrn_kernel.cl"
,
build_options_
,
time_stamp_
);
VLOG
(
1
)
<<
"kernel_func_name_:"
<<
kernel_func_name_
;
VLOG
(
1
)
<<
"kernel_func_name_:"
<<
kernel_func_name_
;
}
}
...
@@ -91,7 +91,7 @@ class LrnImageCompute : public KernelLite<TARGET(kOpenCL),
...
@@ -91,7 +91,7 @@ class LrnImageCompute : public KernelLite<TARGET(kOpenCL),
#endif
#endif
STL
::
stringstream
kernel_key
;
STL
::
stringstream
kernel_key
;
kernel_key
<<
kernel_func_name_
<<
build_options_
;
kernel_key
<<
kernel_func_name_
<<
build_options_
<<
time_stamp_
;
auto
kernel
=
context
.
cl_context
()
->
GetKernel
(
kernel_key
.
str
());
auto
kernel
=
context
.
cl_context
()
->
GetKernel
(
kernel_key
.
str
());
int
arg_idx
=
0
;
int
arg_idx
=
0
;
...
@@ -152,6 +152,7 @@ class LrnImageCompute : public KernelLite<TARGET(kOpenCL),
...
@@ -152,6 +152,7 @@ class LrnImageCompute : public KernelLite<TARGET(kOpenCL),
std
::
string
norm_region_
{
"AcrossChannels"
};
std
::
string
norm_region_
{
"AcrossChannels"
};
std
::
string
kernel_func_name_
{
"lrn"
};
std
::
string
kernel_func_name_
{
"lrn"
};
std
::
string
build_options_
{
"-DCL_DTYPE_half"
};
std
::
string
build_options_
{
"-DCL_DTYPE_half"
};
std
::
string
time_stamp_
{
GetTimeStamp
()};
std
::
shared_ptr
<
cl
::
Event
>
event_
{
new
cl
::
Event
};
std
::
shared_ptr
<
cl
::
Event
>
event_
{
new
cl
::
Event
};
};
};
...
...
lite/kernels/opencl/mul_buffer_compute.cc
浏览文件 @
917620e8
...
@@ -16,6 +16,7 @@
...
@@ -16,6 +16,7 @@
#include "lite/backends/opencl/cl_include.h"
#include "lite/backends/opencl/cl_include.h"
#include "lite/core/kernel.h"
#include "lite/core/kernel.h"
#include "lite/core/op_registry.h"
#include "lite/core/op_registry.h"
#include "lite/kernels/opencl/image_helper.h"
#include "lite/operators/op_params.h"
#include "lite/operators/op_params.h"
#include "lite/utils/replace_stl/stream.h"
#include "lite/utils/replace_stl/stream.h"
#include "lite/utils/string.h"
#include "lite/utils/string.h"
...
@@ -32,8 +33,10 @@ class MulCompute
...
@@ -32,8 +33,10 @@ class MulCompute
void
PrepareForRun
()
override
{
void
PrepareForRun
()
override
{
auto
&
context
=
ctx_
->
As
<
OpenCLContext
>
();
auto
&
context
=
ctx_
->
As
<
OpenCLContext
>
();
context
.
cl_context
()
->
AddKernel
(
context
.
cl_context
()
->
AddKernel
(
kernel_func_name_
,
kernel_func_name_
,
"buffer/mat_mul_kernel.cl"
,
build_options_
);
"buffer/mat_mul_kernel.cl"
,
build_options_
,
time_stamp_
);
const
auto
&
param
=
*
param_
.
get_mutable
<
param_t
>
();
const
auto
&
param
=
*
param_
.
get_mutable
<
param_t
>
();
const
auto
*
x_data
=
param
.
x
->
data
<
float
>
();
const
auto
*
x_data
=
param
.
x
->
data
<
float
>
();
const
auto
*
y_data
=
param
.
y
->
data
<
float
>
();
const
auto
*
y_data
=
param
.
y
->
data
<
float
>
();
...
@@ -68,7 +71,7 @@ class MulCompute
...
@@ -68,7 +71,7 @@ class MulCompute
param
.
output
->
mutable_data
<
float
,
cl
::
Buffer
>
(
TARGET
(
kOpenCL
));
param
.
output
->
mutable_data
<
float
,
cl
::
Buffer
>
(
TARGET
(
kOpenCL
));
STL
::
stringstream
kernel_key
;
STL
::
stringstream
kernel_key
;
kernel_key
<<
kernel_func_name_
<<
build_options_
;
kernel_key
<<
kernel_func_name_
<<
build_options_
<<
time_stamp_
;
auto
kernel
=
context
.
cl_context
()
->
GetKernel
(
kernel_key
.
str
());
auto
kernel
=
context
.
cl_context
()
->
GetKernel
(
kernel_key
.
str
());
cl_int
status
;
cl_int
status
;
...
@@ -103,6 +106,7 @@ class MulCompute
...
@@ -103,6 +106,7 @@ class MulCompute
int
m_
,
n_
,
k_
;
int
m_
,
n_
,
k_
;
std
::
string
kernel_func_name_
{
"mat_mul"
};
std
::
string
kernel_func_name_
{
"mat_mul"
};
std
::
string
build_options_
{
"-DCL_DTYPE_float"
};
std
::
string
build_options_
{
"-DCL_DTYPE_float"
};
std
::
string
time_stamp_
{
GetTimeStamp
()};
std
::
shared_ptr
<
cl
::
Event
>
event_
{
new
cl
::
Event
};
std
::
shared_ptr
<
cl
::
Event
>
event_
{
new
cl
::
Event
};
};
};
...
...
lite/kernels/opencl/nearest_interp_image_compute.cc
浏览文件 @
917620e8
...
@@ -38,8 +38,10 @@ class NearestInterpComputeImageDefault
...
@@ -38,8 +38,10 @@ class NearestInterpComputeImageDefault
void
PrepareForRun
()
override
{
void
PrepareForRun
()
override
{
auto
&
context
=
ctx_
->
As
<
OpenCLContext
>
();
auto
&
context
=
ctx_
->
As
<
OpenCLContext
>
();
context
.
cl_context
()
->
AddKernel
(
context
.
cl_context
()
->
AddKernel
(
kernel_func_name_
,
kernel_func_name_
,
"image/nearest_interp_kernel.cl"
,
build_options_
);
"image/nearest_interp_kernel.cl"
,
build_options_
,
time_stamp_
);
VLOG
(
1
)
<<
"kernel_func_name_:"
<<
kernel_func_name_
;
VLOG
(
1
)
<<
"kernel_func_name_:"
<<
kernel_func_name_
;
}
}
...
@@ -66,7 +68,7 @@ class NearestInterpComputeImageDefault
...
@@ -66,7 +68,7 @@ class NearestInterpComputeImageDefault
auto
&
context
=
ctx_
->
As
<
OpenCLContext
>
();
auto
&
context
=
ctx_
->
As
<
OpenCLContext
>
();
CHECK
(
context
.
cl_context
()
!=
nullptr
);
CHECK
(
context
.
cl_context
()
!=
nullptr
);
STL
::
stringstream
kernel_key
;
STL
::
stringstream
kernel_key
;
kernel_key
<<
kernel_func_name_
<<
build_options_
;
kernel_key
<<
kernel_func_name_
<<
build_options_
<<
time_stamp_
;
auto
kernel
=
context
.
cl_context
()
->
GetKernel
(
kernel_key
.
str
());
auto
kernel
=
context
.
cl_context
()
->
GetKernel
(
kernel_key
.
str
());
int
arg_idx
=
0
;
int
arg_idx
=
0
;
...
@@ -121,6 +123,7 @@ class NearestInterpComputeImageDefault
...
@@ -121,6 +123,7 @@ class NearestInterpComputeImageDefault
private:
private:
std
::
string
kernel_func_name_
{
"nearest_interp"
};
std
::
string
kernel_func_name_
{
"nearest_interp"
};
std
::
string
build_options_
{
" -DCL_DTYPE_half"
};
std
::
string
build_options_
{
" -DCL_DTYPE_half"
};
std
::
string
time_stamp_
{
GetTimeStamp
()};
std
::
shared_ptr
<
cl
::
Event
>
event_
{
new
cl
::
Event
};
std
::
shared_ptr
<
cl
::
Event
>
event_
{
new
cl
::
Event
};
};
};
...
...
lite/kernels/opencl/pad2d_image_compute.cc
浏览文件 @
917620e8
...
@@ -52,8 +52,10 @@ class Pad2dCompute : public KernelLite<TARGET(kOpenCL),
...
@@ -52,8 +52,10 @@ class Pad2dCompute : public KernelLite<TARGET(kOpenCL),
}
}
auto
&
context
=
ctx_
->
As
<
OpenCLContext
>
();
auto
&
context
=
ctx_
->
As
<
OpenCLContext
>
();
context
.
cl_context
()
->
AddKernel
(
context
.
cl_context
()
->
AddKernel
(
kernel_func_name_
,
kernel_func_name_
,
"image/pad2d_kernel.cl"
,
build_options_
);
"image/pad2d_kernel.cl"
,
build_options_
,
time_stamp_
);
VLOG
(
1
)
<<
"kernel_func_name_:"
<<
kernel_func_name_
;
VLOG
(
1
)
<<
"kernel_func_name_:"
<<
kernel_func_name_
;
}
}
...
@@ -93,7 +95,7 @@ class Pad2dCompute : public KernelLite<TARGET(kOpenCL),
...
@@ -93,7 +95,7 @@ class Pad2dCompute : public KernelLite<TARGET(kOpenCL),
#endif
#endif
STL
::
stringstream
kernel_key
;
STL
::
stringstream
kernel_key
;
kernel_key
<<
kernel_func_name_
<<
build_options_
;
kernel_key
<<
kernel_func_name_
<<
build_options_
<<
time_stamp_
;
auto
kernel
=
context
.
cl_context
()
->
GetKernel
(
kernel_key
.
str
());
auto
kernel
=
context
.
cl_context
()
->
GetKernel
(
kernel_key
.
str
());
int
arg_idx
=
0
;
int
arg_idx
=
0
;
...
@@ -159,6 +161,7 @@ class Pad2dCompute : public KernelLite<TARGET(kOpenCL),
...
@@ -159,6 +161,7 @@ class Pad2dCompute : public KernelLite<TARGET(kOpenCL),
param_t
*
pad2d_param_
{
nullptr
};
param_t
*
pad2d_param_
{
nullptr
};
std
::
string
kernel_func_name_
{};
std
::
string
kernel_func_name_
{};
std
::
string
build_options_
{
"-DCL_DTYPE_half"
};
std
::
string
build_options_
{
"-DCL_DTYPE_half"
};
std
::
string
time_stamp_
{
GetTimeStamp
()};
std
::
shared_ptr
<
cl
::
Event
>
event_
{
new
cl
::
Event
};
std
::
shared_ptr
<
cl
::
Event
>
event_
{
new
cl
::
Event
};
};
};
...
...
lite/kernels/opencl/pool_buffer_compute.cc
浏览文件 @
917620e8
...
@@ -37,8 +37,10 @@ class PoolCompute
...
@@ -37,8 +37,10 @@ class PoolCompute
const
auto
&
param
=
*
param_
.
get_mutable
<
param_t
>
();
const
auto
&
param
=
*
param_
.
get_mutable
<
param_t
>
();
kernel_func_name_
+=
param
.
pooling_type
;
kernel_func_name_
+=
param
.
pooling_type
;
auto
&
context
=
ctx_
->
As
<
OpenCLContext
>
();
auto
&
context
=
ctx_
->
As
<
OpenCLContext
>
();
context
.
cl_context
()
->
AddKernel
(
context
.
cl_context
()
->
AddKernel
(
kernel_func_name_
,
kernel_func_name_
,
"buffer/pool_kernel.cl"
,
build_options_
);
"buffer/pool_kernel.cl"
,
build_options_
,
time_stamp_
);
}
}
void
Run
()
override
{
void
Run
()
override
{
...
@@ -69,7 +71,7 @@ class PoolCompute
...
@@ -69,7 +71,7 @@ class PoolCompute
auto
*
output_buf
=
auto
*
output_buf
=
param
.
output
->
mutable_data
<
float
,
cl
::
Buffer
>
(
TARGET
(
kOpenCL
));
param
.
output
->
mutable_data
<
float
,
cl
::
Buffer
>
(
TARGET
(
kOpenCL
));
STL
::
stringstream
kernel_key
;
STL
::
stringstream
kernel_key
;
kernel_key
<<
kernel_func_name_
<<
build_options_
;
kernel_key
<<
kernel_func_name_
<<
build_options_
<<
time_stamp_
;
auto
kernel
=
context
.
cl_context
()
->
GetKernel
(
kernel_key
.
str
());
auto
kernel
=
context
.
cl_context
()
->
GetKernel
(
kernel_key
.
str
());
cl_int
status
;
cl_int
status
;
auto
numel
=
out_dims
.
production
();
auto
numel
=
out_dims
.
production
();
...
@@ -117,6 +119,7 @@ class PoolCompute
...
@@ -117,6 +119,7 @@ class PoolCompute
private:
private:
std
::
string
kernel_func_name_
{
"pool_"
};
std
::
string
kernel_func_name_
{
"pool_"
};
std
::
string
build_options_
{
"-DCL_DTYPE_float"
};
std
::
string
build_options_
{
"-DCL_DTYPE_float"
};
std
::
string
time_stamp_
{
GetTimeStamp
()};
std
::
shared_ptr
<
cl
::
Event
>
event_
{
new
cl
::
Event
};
std
::
shared_ptr
<
cl
::
Event
>
event_
{
new
cl
::
Event
};
};
};
...
...
lite/kernels/opencl/pool_image_compute.cc
浏览文件 @
917620e8
...
@@ -47,7 +47,7 @@ class PoolComputeImage2D : public KernelLite<TARGET(kOpenCL),
...
@@ -47,7 +47,7 @@ class PoolComputeImage2D : public KernelLite<TARGET(kOpenCL),
VLOG
(
1
)
<<
"kernel_func_name_:"
<<
kernel_func_name_
;
VLOG
(
1
)
<<
"kernel_func_name_:"
<<
kernel_func_name_
;
auto
&
context
=
ctx_
->
As
<
OpenCLContext
>
();
auto
&
context
=
ctx_
->
As
<
OpenCLContext
>
();
context
.
cl_context
()
->
AddKernel
(
context
.
cl_context
()
->
AddKernel
(
kernel_func_name_
,
"image/pool_kernel.cl"
,
build_options_
);
kernel_func_name_
,
"image/pool_kernel.cl"
,
build_options_
,
time_stamp_
);
}
}
void
Run
()
override
{
void
Run
()
override
{
...
@@ -112,7 +112,7 @@ class PoolComputeImage2D : public KernelLite<TARGET(kOpenCL),
...
@@ -112,7 +112,7 @@ class PoolComputeImage2D : public KernelLite<TARGET(kOpenCL),
// VLOG(4) << "out_image" << out_img;
// VLOG(4) << "out_image" << out_img;
STL
::
stringstream
kernel_key
;
STL
::
stringstream
kernel_key
;
kernel_key
<<
kernel_func_name_
<<
build_options_
;
kernel_key
<<
kernel_func_name_
<<
build_options_
<<
time_stamp_
;
auto
kernel
=
context
.
cl_context
()
->
GetKernel
(
kernel_key
.
str
());
auto
kernel
=
context
.
cl_context
()
->
GetKernel
(
kernel_key
.
str
());
int
c_block
=
(
out_dims
[
1
]
+
3
)
/
4
;
int
c_block
=
(
out_dims
[
1
]
+
3
)
/
4
;
...
@@ -164,6 +164,7 @@ class PoolComputeImage2D : public KernelLite<TARGET(kOpenCL),
...
@@ -164,6 +164,7 @@ class PoolComputeImage2D : public KernelLite<TARGET(kOpenCL),
private:
private:
std
::
string
kernel_func_name_
{
"pool_"
};
std
::
string
kernel_func_name_
{
"pool_"
};
std
::
string
build_options_
{
"-DCL_DTYPE_half"
};
std
::
string
build_options_
{
"-DCL_DTYPE_half"
};
std
::
string
time_stamp_
{
GetTimeStamp
()};
std
::
shared_ptr
<
cl
::
Event
>
event_
{
new
cl
::
Event
};
std
::
shared_ptr
<
cl
::
Event
>
event_
{
new
cl
::
Event
};
};
};
...
...
lite/kernels/opencl/reshape_image_compute.cc
浏览文件 @
917620e8
...
@@ -36,8 +36,10 @@ class ReshapeComputeFloatImage : public KernelLite<TARGET(kOpenCL),
...
@@ -36,8 +36,10 @@ class ReshapeComputeFloatImage : public KernelLite<TARGET(kOpenCL),
void
PrepareForRun
()
override
{
void
PrepareForRun
()
override
{
auto
&
context
=
ctx_
->
As
<
OpenCLContext
>
();
auto
&
context
=
ctx_
->
As
<
OpenCLContext
>
();
VLOG
(
1
)
<<
"kernel_func_name_:"
<<
kernel_func_name_
;
VLOG
(
1
)
<<
"kernel_func_name_:"
<<
kernel_func_name_
;
context
.
cl_context
()
->
AddKernel
(
context
.
cl_context
()
->
AddKernel
(
kernel_func_name_
,
kernel_func_name_
,
"image/reshape_kernel.cl"
,
build_options_
);
"image/reshape_kernel.cl"
,
build_options_
,
time_stamp_
);
}
}
void
Run
()
override
{
void
Run
()
override
{
...
@@ -110,7 +112,7 @@ class ReshapeComputeFloatImage : public KernelLite<TARGET(kOpenCL),
...
@@ -110,7 +112,7 @@ class ReshapeComputeFloatImage : public KernelLite<TARGET(kOpenCL),
auto
&
context
=
ctx_
->
As
<
OpenCLContext
>
();
auto
&
context
=
ctx_
->
As
<
OpenCLContext
>
();
CHECK
(
context
.
cl_context
()
!=
nullptr
);
CHECK
(
context
.
cl_context
()
!=
nullptr
);
STL
::
stringstream
kernel_key
;
STL
::
stringstream
kernel_key
;
kernel_key
<<
kernel_func_name_
<<
build_options_
;
kernel_key
<<
kernel_func_name_
<<
build_options_
<<
time_stamp_
;
auto
kernel
=
context
.
cl_context
()
->
GetKernel
(
kernel_key
.
str
());
auto
kernel
=
context
.
cl_context
()
->
GetKernel
(
kernel_key
.
str
());
#ifndef LITE_SHUTDOWN_LOG
#ifndef LITE_SHUTDOWN_LOG
...
@@ -166,6 +168,7 @@ class ReshapeComputeFloatImage : public KernelLite<TARGET(kOpenCL),
...
@@ -166,6 +168,7 @@ class ReshapeComputeFloatImage : public KernelLite<TARGET(kOpenCL),
private:
private:
std
::
string
kernel_func_name_
{
"reshape"
};
std
::
string
kernel_func_name_
{
"reshape"
};
std
::
string
build_options_
{
"-DCL_DTYPE_half"
};
std
::
string
build_options_
{
"-DCL_DTYPE_half"
};
std
::
string
time_stamp_
{
GetTimeStamp
()};
std
::
shared_ptr
<
cl
::
Event
>
event_
{
new
cl
::
Event
};
std
::
shared_ptr
<
cl
::
Event
>
event_
{
new
cl
::
Event
};
};
};
...
...
lite/kernels/opencl/scale_image_compute.cc
浏览文件 @
917620e8
...
@@ -37,53 +37,66 @@ class ScaleComputeImage2D : public KernelLite<TARGET(kOpenCL),
...
@@ -37,53 +37,66 @@ class ScaleComputeImage2D : public KernelLite<TARGET(kOpenCL),
void
PrepareForRun
()
override
{
void
PrepareForRun
()
override
{
auto
&
context
=
ctx_
->
As
<
OpenCLContext
>
();
auto
&
context
=
ctx_
->
As
<
OpenCLContext
>
();
context
.
cl_context
()
->
AddKernel
(
kernel_func_name_
,
"image/scale_kernel.cl"
,
build_options_
,
time_stamp_
);
VLOG
(
1
)
<<
"kernel_func_name_:"
<<
kernel_func_name_
;
VLOG
(
1
)
<<
"kernel_func_name_:"
<<
kernel_func_name_
;
context
.
cl_context
()
->
AddKernel
(
kernel_func_name_
,
"image/scale_kernel.cl"
,
build_options_
);
STL
::
stringstream
kernel_key
;
kernel_key
<<
kernel_func_name_
<<
build_options_
<<
time_stamp_
;
kernel_
=
context
.
cl_context
()
->
GetKernel
(
kernel_key
.
str
());
}
void
ReInitWhenNeeded
()
override
{
scale_param_
=
param_
.
get_mutable
<
param_t
>
();
auto
x_dims
=
scale_param_
->
x
->
dims
();
if
((
!
first_epoch_for_reinit_
&&
x_dims
!=
last_x_dims_
)
||
first_epoch_for_reinit_
)
{
last_x_dims_
=
x_dims
;
first_epoch_for_reinit_
=
false
;
// compute image shape
paddle
::
lite
::
CLImageConverterDefault
default_convertor
;
out_img_shape_
=
default_convertor
.
InitImageDimInfoWith
(
scale_param_
->
output
->
dims
());
// compute global work size
GetGlobalWorkSize
();
}
}
void
GetGlobalWorkSize
()
{
global_work_size_
=
cl
::
NDRange
{
static_cast
<
cl
::
size_type
>
(
out_img_shape_
[
0
]),
static_cast
<
cl
::
size_type
>
(
out_img_shape_
[
1
])};
}
}
void
Run
()
override
{
void
Run
()
override
{
const
auto
&
param
=
*
param_
.
get_mutable
<
param_t
>
();
auto
*
x_img
=
scale_param_
->
x
->
data
<
half_t
,
cl
::
Image2D
>
();
const
auto
&
in_dims
=
param
.
x
->
dims
();
auto
*
out_img
=
scale_param_
->
output
->
mutable_data
<
half_t
,
cl
::
Image2D
>
(
auto
*
x_img
=
param
.
x
->
data
<
half_t
,
cl
::
Image2D
>
();
out_img_shape_
[
0
],
out_img_shape_
[
1
]);
const
float
scale
=
param
.
scale
;
const
float
scale
=
scale_param_
->
scale
;
const
float
bias
=
param
.
bias
;
const
float
bias
=
scale_param_
->
bias
;
// LOG(INFO) << "x_image" << x_img;
auto
out_image_shape
=
InitImageDimInfoWith
(
in_dims
);
#ifndef LITE_SHUTDOWN_LOG
VLOG
(
4
)
<<
"out_image_shape = "
<<
out_image_shape
[
"width"
]
<<
" "
<<
out_image_shape
[
"height"
];
#endif
auto
*
out_img
=
param
.
output
->
mutable_data
<
half_t
,
cl
::
Image2D
>
(
out_image_shape
[
"width"
],
out_image_shape
[
"height"
]);
// LOG(INFO) << "out_image" << out_img;
auto
&
context
=
ctx_
->
As
<
OpenCLContext
>
();
auto
&
context
=
ctx_
->
As
<
OpenCLContext
>
();
CHECK
(
context
.
cl_context
()
!=
nullptr
);
CHECK
(
context
.
cl_context
()
!=
nullptr
);
STL
::
stringstream
kernel_key
;
kernel_key
<<
kernel_func_name_
<<
build_options_
;
auto
kernel
=
context
.
cl_context
()
->
GetKernel
(
kernel_key
.
str
());
auto
global_work_size
=
cl
::
NDRange
{
static_cast
<
cl
::
size_type
>
(
out_image_shape
[
"width"
]),
static_cast
<
cl
::
size_type
>
(
out_image_shape
[
"height"
])};
auto
kernel
=
kernel_
;
cl_int
status
;
cl_int
status
;
int
arg_idx
=
0
;
status
=
kernel
.
setArg
(
0
,
*
x_img
);
status
=
kernel
.
setArg
(
arg_idx
,
*
x_img
);
CL_CHECK_FATAL
(
status
);
CL_CHECK_FATAL
(
status
);
status
=
kernel
.
setArg
(
++
arg_idx
,
*
out_img
);
status
=
kernel
.
setArg
(
1
,
*
out_img
);
CL_CHECK_FATAL
(
status
);
CL_CHECK_FATAL
(
status
);
status
=
kernel
.
setArg
(
++
arg_idx
,
scale
);
status
=
kernel
.
setArg
(
2
,
scale
);
CL_CHECK_FATAL
(
status
);
CL_CHECK_FATAL
(
status
);
status
=
kernel
.
setArg
(
++
arg_idx
,
bias
);
status
=
kernel
.
setArg
(
3
,
bias
);
CL_CHECK_FATAL
(
status
);
CL_CHECK_FATAL
(
status
);
status
=
context
.
cl_context
()
->
GetCommandQueue
().
enqueueNDRangeKernel
(
status
=
context
.
cl_context
()
->
GetCommandQueue
().
enqueueNDRangeKernel
(
kernel
,
kernel
,
cl
::
NullRange
,
cl
::
NullRange
,
global_work_size
,
global_work_size
_
,
cl
::
NullRange
,
cl
::
NullRange
,
nullptr
,
nullptr
,
event_
.
get
());
event_
.
get
());
...
@@ -94,7 +107,17 @@ class ScaleComputeImage2D : public KernelLite<TARGET(kOpenCL),
...
@@ -94,7 +107,17 @@ class ScaleComputeImage2D : public KernelLite<TARGET(kOpenCL),
private:
private:
std
::
string
kernel_func_name_
{
"scale"
};
std
::
string
kernel_func_name_
{
"scale"
};
std
::
string
build_options_
{
"-DCL_DTYPE_half"
};
std
::
string
build_options_
{
"-DCL_DTYPE_half"
};
std
::
string
time_stamp_
{
GetTimeStamp
()};
std
::
shared_ptr
<
cl
::
Event
>
event_
{
new
cl
::
Event
};
std
::
shared_ptr
<
cl
::
Event
>
event_
{
new
cl
::
Event
};
param_t
*
scale_param_
{
nullptr
};
cl
::
Kernel
kernel_
;
bool
first_epoch_for_reinit_
{
true
};
DDim
last_x_dims_
;
DDim
out_img_shape_
=
DDim
(
std
::
vector
<
DDim
::
value_type
>
(
{
static_cast
<
DDim
::
value_type
>
(
1
),
static_cast
<
DDim
::
value_type
>
(
1
)}));
cl
::
NDRange
global_work_size_
=
cl
::
NDRange
{
static_cast
<
size_t
>
(
1
),
static_cast
<
size_t
>
(
1
),
static_cast
<
size_t
>
(
1
)};
};
};
}
// namespace opencl
}
// namespace opencl
...
...
lite/kernels/opencl/slice_image_compute.cc
浏览文件 @
917620e8
...
@@ -38,8 +38,10 @@ class SliceComputeImage2D : public KernelLite<TARGET(kOpenCL),
...
@@ -38,8 +38,10 @@ class SliceComputeImage2D : public KernelLite<TARGET(kOpenCL),
void
PrepareForRun
()
override
{
void
PrepareForRun
()
override
{
auto
&
context
=
ctx_
->
As
<
OpenCLContext
>
();
auto
&
context
=
ctx_
->
As
<
OpenCLContext
>
();
VLOG
(
1
)
<<
"kernel_func_name_:"
<<
kernel_func_name_
;
VLOG
(
1
)
<<
"kernel_func_name_:"
<<
kernel_func_name_
;
context
.
cl_context
()
->
AddKernel
(
context
.
cl_context
()
->
AddKernel
(
kernel_func_name_
,
kernel_func_name_
,
"image/slice_kernel.cl"
,
build_options_
);
"image/slice_kernel.cl"
,
build_options_
,
time_stamp_
);
}
}
void
Run
()
override
{
void
Run
()
override
{
...
@@ -68,7 +70,7 @@ class SliceComputeImage2D : public KernelLite<TARGET(kOpenCL),
...
@@ -68,7 +70,7 @@ class SliceComputeImage2D : public KernelLite<TARGET(kOpenCL),
auto
&
context
=
ctx_
->
As
<
OpenCLContext
>
();
auto
&
context
=
ctx_
->
As
<
OpenCLContext
>
();
CHECK
(
context
.
cl_context
()
!=
nullptr
);
CHECK
(
context
.
cl_context
()
!=
nullptr
);
STL
::
stringstream
kernel_key
;
STL
::
stringstream
kernel_key
;
kernel_key
<<
kernel_func_name_
<<
build_options_
;
kernel_key
<<
kernel_func_name_
<<
build_options_
<<
time_stamp_
;
auto
kernel
=
context
.
cl_context
()
->
GetKernel
(
kernel_key
.
str
());
auto
kernel
=
context
.
cl_context
()
->
GetKernel
(
kernel_key
.
str
());
cl_int
status
;
cl_int
status
;
...
@@ -108,6 +110,7 @@ class SliceComputeImage2D : public KernelLite<TARGET(kOpenCL),
...
@@ -108,6 +110,7 @@ class SliceComputeImage2D : public KernelLite<TARGET(kOpenCL),
private:
private:
std
::
string
kernel_func_name_
{
"slice"
};
std
::
string
kernel_func_name_
{
"slice"
};
std
::
string
build_options_
{
"-DCL_DTYPE_half"
};
std
::
string
build_options_
{
"-DCL_DTYPE_half"
};
std
::
string
time_stamp_
{
GetTimeStamp
()};
std
::
shared_ptr
<
cl
::
Event
>
event_
{
new
cl
::
Event
};
std
::
shared_ptr
<
cl
::
Event
>
event_
{
new
cl
::
Event
};
};
};
...
...
lite/kernels/xpu/bridges/CMakeLists.txt
浏览文件 @
917620e8
...
@@ -25,6 +25,7 @@ lite_cc_library(subgraph_bridge_layer_norm_op_xpu SRCS layer_norm_op.cc DEPS ${x
...
@@ -25,6 +25,7 @@ lite_cc_library(subgraph_bridge_layer_norm_op_xpu SRCS layer_norm_op.cc DEPS ${x
lite_cc_library
(
subgraph_bridge_dropout_op_xpu SRCS dropout_op.cc DEPS
${
xpu_subgraph_bridge_deps
}
)
lite_cc_library
(
subgraph_bridge_dropout_op_xpu SRCS dropout_op.cc DEPS
${
xpu_subgraph_bridge_deps
}
)
lite_cc_library
(
subgraph_bridge_matmul_op_xpu SRCS matmul_op.cc DEPS
${
xpu_subgraph_bridge_deps
}
)
lite_cc_library
(
subgraph_bridge_matmul_op_xpu SRCS matmul_op.cc DEPS
${
xpu_subgraph_bridge_deps
}
)
lite_cc_library
(
subgraph_bridge_cast_op_xpu SRCS cast_op.cc DEPS
${
xpu_subgraph_bridge_deps
}
)
lite_cc_library
(
subgraph_bridge_cast_op_xpu SRCS cast_op.cc DEPS
${
xpu_subgraph_bridge_deps
}
)
lite_cc_library
(
subgraph_bridge_yolo_box_op_xpu SRCS yolo_box_op.cc DEPS
${
xpu_subgraph_bridge_deps
}
)
set
(
xpu_subgraph_bridges
set
(
xpu_subgraph_bridges
subgraph_bridge_registry
subgraph_bridge_registry
...
@@ -48,6 +49,7 @@ set(xpu_subgraph_bridges
...
@@ -48,6 +49,7 @@ set(xpu_subgraph_bridges
subgraph_bridge_dropout_op_xpu
subgraph_bridge_dropout_op_xpu
subgraph_bridge_matmul_op_xpu
subgraph_bridge_matmul_op_xpu
subgraph_bridge_cast_op_xpu
subgraph_bridge_cast_op_xpu
subgraph_bridge_yolo_box_op_xpu
CACHE INTERNAL
"xpu_subgraph_bridges"
)
CACHE INTERNAL
"xpu_subgraph_bridges"
)
message
(
STATUS
"+++++ xpu_subgraph_bridges:
${
xpu_subgraph_bridges
}
"
)
message
(
STATUS
"+++++ xpu_subgraph_bridges:
${
xpu_subgraph_bridges
}
"
)
lite/kernels/xpu/bridges/paddle_use_bridges.h
浏览文件 @
917620e8
...
@@ -37,3 +37,4 @@ USE_SUBGRAPH_BRIDGE(gelu, kXPU);
...
@@ -37,3 +37,4 @@ USE_SUBGRAPH_BRIDGE(gelu, kXPU);
USE_SUBGRAPH_BRIDGE
(
dropout
,
kXPU
);
USE_SUBGRAPH_BRIDGE
(
dropout
,
kXPU
);
USE_SUBGRAPH_BRIDGE
(
matmul
,
kXPU
);
USE_SUBGRAPH_BRIDGE
(
matmul
,
kXPU
);
USE_SUBGRAPH_BRIDGE
(
cast
,
kXPU
);
USE_SUBGRAPH_BRIDGE
(
cast
,
kXPU
);
USE_SUBGRAPH_BRIDGE
(
yolo_box
,
kXPU
);
lite/kernels/xpu/bridges/yolo_box_op.cc
0 → 100644
浏览文件 @
917620e8
// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "lite/kernels/npu/bridges/registry.h"
#include "lite/kernels/xpu/bridges/graph.h"
#include "lite/kernels/xpu/bridges/utility.h"
namespace
paddle
{
namespace
lite
{
namespace
subgraph
{
namespace
xpu
{
int
YoloBoxConverter
(
void
*
ctx
,
OpLite
*
op
,
KernelBase
*
kernel
)
{
CHECK
(
ctx
!=
nullptr
);
CHECK
(
op
!=
nullptr
);
auto
graph
=
static_cast
<
Graph
*>
(
ctx
);
auto
op_info
=
op
->
op_info
();
auto
op_type
=
op_info
->
Type
();
auto
scope
=
op
->
scope
();
VLOG
(
3
)
<<
"[XPU] Converting "
+
op_type
+
"..."
;
// Get input and output vars and op attributes
auto
x_name
=
op_info
->
Input
(
"X"
).
front
();
auto
x
=
scope
->
FindTensor
(
x_name
);
auto
img_size_name
=
op_info
->
Input
(
"ImgSize"
).
front
();
auto
img_size
=
scope
->
FindTensor
(
img_size_name
);
auto
boxes_name
=
op_info
->
Output
(
"Boxes"
).
front
();
auto
scores_name
=
op_info
->
Output
(
"Scores"
).
front
();
auto
anchors
=
op_info
->
GetAttr
<
std
::
vector
<
int
>>
(
"anchors"
);
auto
class_num
=
op_info
->
GetAttr
<
int
>
(
"class_num"
);
auto
conf_thresh
=
op_info
->
GetAttr
<
float
>
(
"conf_thresh"
);
auto
downsample_ratio
=
op_info
->
GetAttr
<
int
>
(
"downsample_ratio"
);
// X node
std
::
shared_ptr
<
Node
>
x_node
=
nullptr
;
if
(
graph
->
Has
(
x_name
))
{
x_node
=
graph
->
Get
(
x_name
);
}
else
{
x_node
=
graph
->
Add
(
x_name
,
*
x
);
}
// ImgSize node
std
::
shared_ptr
<
Node
>
img_size_node
=
nullptr
;
if
(
graph
->
Has
(
img_size_name
))
{
img_size_node
=
graph
->
Get
(
img_size_name
);
}
else
{
img_size_node
=
graph
->
Add
(
img_size_name
,
*
img_size
);
}
// Softmax node
auto
yolo_box_data
=
graph
->
builder_
.
CreateYoloBox
(
*
x_node
->
data
(),
*
img_size_node
->
data
(),
CvtShape
<
xtcl
::
Integer
>
(
anchors
),
class_num
,
conf_thresh
,
downsample_ratio
);
graph
->
Add
(
boxes_name
,
graph
->
builder_
.
GetField
(
yolo_box_data
,
0
));
graph
->
Add
(
scores_name
,
graph
->
builder_
.
GetField
(
yolo_box_data
,
1
));
return
SUCCESS
;
}
}
// namespace xpu
}
// namespace subgraph
}
// namespace lite
}
// namespace paddle
REGISTER_SUBGRAPH_BRIDGE
(
yolo_box
,
kXPU
,
paddle
::
lite
::
subgraph
::
xpu
::
YoloBoxConverter
);
lite/kernels/xpu/subgraph_compute.cc
浏览文件 @
917620e8
...
@@ -34,7 +34,7 @@ int SubgraphEngine::BuildDeviceProgram() {
...
@@ -34,7 +34,7 @@ int SubgraphEngine::BuildDeviceProgram() {
subgraph
::
xpu
::
Graph
graph
;
subgraph
::
xpu
::
Graph
graph
;
const
auto
&
bridges
=
subgraph
::
Registry
::
Instance
();
const
auto
&
bridges
=
subgraph
::
Registry
::
Instance
();
for
(
auto
&
inst
:
origin_program_
)
{
for
(
auto
&
inst
:
origin_program_
)
{
auto
op
=
inst
.
op
(
);
auto
op
=
const_cast
<
OpLite
*>
(
inst
.
op
()
);
CHECK
(
op
);
CHECK
(
op
);
op
->
CheckShape
();
op
->
CheckShape
();
op
->
InferShape
();
op
->
InferShape
();
...
@@ -43,10 +43,8 @@ int SubgraphEngine::BuildDeviceProgram() {
...
@@ -43,10 +43,8 @@ int SubgraphEngine::BuildDeviceProgram() {
return
subgraph
::
FAILED
;
return
subgraph
::
FAILED
;
}
}
auto
kernel
=
inst
.
kernel
();
auto
kernel
=
inst
.
kernel
();
status
|=
status
|=
bridges
.
Select
(
op_type
,
TARGET
(
kXPU
))(
bridges
.
Select
(
op_type
,
TARGET
(
kXPU
))(
reinterpret_cast
<
void
*>
(
&
graph
),
reinterpret_cast
<
void
*>
(
&
graph
),
op
,
const_cast
<
KernelBase
*>
(
kernel
));
const_cast
<
OpLite
*>
(
op
),
const_cast
<
KernelBase
*>
(
kernel
));
if
(
subgraph
::
CHECK_FAILED
(
status
))
{
if
(
subgraph
::
CHECK_FAILED
(
status
))
{
return
subgraph
::
FAILED
;
return
subgraph
::
FAILED
;
}
}
...
...
lite/model_parser/model_parser.cc
浏览文件 @
917620e8
...
@@ -382,7 +382,7 @@ void TensorToStream(std::ostream &os, const lite::Tensor &tensor) {
...
@@ -382,7 +382,7 @@ void TensorToStream(std::ostream &os, const lite::Tensor &tensor) {
pb_dims
->
Resize
(
static_cast
<
int
>
(
dims
.
size
()),
0
);
pb_dims
->
Resize
(
static_cast
<
int
>
(
dims
.
size
()),
0
);
auto
dims_vec
=
dims
.
Vectorize
();
auto
dims_vec
=
dims
.
Vectorize
();
std
::
copy
(
dims_vec
.
begin
(),
dims_vec
.
end
(),
pb_dims
->
begin
());
std
::
copy
(
dims_vec
.
begin
(),
dims_vec
.
end
(),
pb_dims
->
begin
());
int32_t
size
=
desc
.
ByteSize
();
int32_t
size
=
desc
.
ByteSize
Long
();
os
.
write
(
reinterpret_cast
<
const
char
*>
(
&
size
),
sizeof
(
size
));
os
.
write
(
reinterpret_cast
<
const
char
*>
(
&
size
),
sizeof
(
size
));
auto
out
=
desc
.
SerializeAsString
();
auto
out
=
desc
.
SerializeAsString
();
os
.
write
(
out
.
data
(),
size
);
os
.
write
(
out
.
data
(),
size
);
...
...
lite/operators/CMakeLists.txt
浏览文件 @
917620e8
...
@@ -141,13 +141,12 @@ add_operator(lstm_op extra SRCS lstm_op.cc DEPS ${op_DEPS})
...
@@ -141,13 +141,12 @@ add_operator(lstm_op extra SRCS lstm_op.cc DEPS ${op_DEPS})
# 4. training op
# 4. training op
add_operator
(
mean_op extra SRCS mean_op.cc DEPS
${
op_DEPS
}
)
add_operator
(
mean_op extra SRCS mean_op.cc DEPS
${
op_DEPS
}
)
if
(
LITE_WITH_TRAIN
)
add_operator
(
mean_grad_op extra SRCS mean_grad_op.cc DEPS
${
op_DEPS
}
)
add_operator
(
mean_grad_op train SRCS mean_grad_op.cc DEPS
${
op_DEPS
}
)
add_operator
(
activation_grad_ops basic SRCS activation_grad_ops.cc DEPS
${
op_DEPS
}
)
add_operator
(
activation_grad_ops train SRCS activation_grad_ops.cc DEPS
${
op_DEPS
}
)
add_operator
(
elementwise_grad_op extra SRCS elementwise_grad_ops.cc DEPS
${
op_DEPS
}
)
add_operator
(
elementwise_grad_op train SRCS elementwise_grad_ops.cc DEPS
${
op_DEPS
}
)
add_operator
(
mul_grad_op basic SRCS mul_grad_op.cc DEPS
${
op_DEPS
}
)
add_operator
(
mul_grad_op train SRCS mul_grad_op.cc DEPS
${
op_DEPS
}
)
add_operator
(
sgd_op extra SRCS sgd_op.cc DEPS
${
op_DEPS
}
)
add_operator
(
sgd_op train SRCS sgd_op.cc DEPS
${
op_DEPS
}
)
endif
()
if
(
NOT LITE_WITH_X86
)
if
(
NOT LITE_WITH_X86
)
lite_cc_test
(
test_fc_op SRCS fc_op_test.cc
lite_cc_test
(
test_fc_op SRCS fc_op_test.cc
...
...
lite/operators/multiclass_nms_op.cc
浏览文件 @
917620e8
...
@@ -42,14 +42,8 @@ bool MulticlassNmsOpLite::CheckShape() const {
...
@@ -42,14 +42,8 @@ bool MulticlassNmsOpLite::CheckShape() const {
}
}
bool
MulticlassNmsOpLite
::
InferShapeImpl
()
const
{
bool
MulticlassNmsOpLite
::
InferShapeImpl
()
const
{
auto
box_dims
=
param_
.
bboxes
->
dims
();
// InferShape is useless for multiclass_nms
auto
score_dims
=
param_
.
scores
->
dims
();
// out's dim is not sure before the end of calculation
auto
score_size
=
score_dims
.
size
();
if
(
score_size
==
3
)
{
param_
.
out
->
Resize
({
box_dims
[
1
],
box_dims
[
2
],
3
});
}
else
{
param_
.
out
->
Resize
({
-
1
,
box_dims
[
2
]
+
2
});
}
return
true
;
return
true
;
}
}
...
...
lite/tests/kernels/CMakeLists.txt
浏览文件 @
917620e8
...
@@ -32,6 +32,7 @@ if((NOT LITE_WITH_OPENCL AND NOT LITE_WITH_FPGA AND NOT LITE_WITH_BM) AND (LITE_
...
@@ -32,6 +32,7 @@ if((NOT LITE_WITH_OPENCL AND NOT LITE_WITH_FPGA AND NOT LITE_WITH_BM) AND (LITE_
lite_cc_test
(
test_kernel_dropout_compute SRCS dropout_compute_test.cc DEPS arena_framework
${
xpu_kernels
}
${
npu_kernels
}
${
x86_kernels
}
${
cuda_kernels
}
${
arm_kernels
}
${
lite_ops
}
${
host_kernels
}
)
lite_cc_test
(
test_kernel_dropout_compute SRCS dropout_compute_test.cc DEPS arena_framework
${
xpu_kernels
}
${
npu_kernels
}
${
x86_kernels
}
${
cuda_kernels
}
${
arm_kernels
}
${
lite_ops
}
${
host_kernels
}
)
lite_cc_test
(
test_kernel_softmax_compute SRCS softmax_compute_test.cc DEPS arena_framework
${
xpu_kernels
}
${
npu_kernels
}
${
x86_kernels
}
${
cuda_kernels
}
${
arm_kernels
}
${
lite_ops
}
${
host_kernels
}
)
lite_cc_test
(
test_kernel_softmax_compute SRCS softmax_compute_test.cc DEPS arena_framework
${
xpu_kernels
}
${
npu_kernels
}
${
x86_kernels
}
${
cuda_kernels
}
${
arm_kernels
}
${
lite_ops
}
${
host_kernels
}
)
lite_cc_test
(
test_kernel_mul_compute SRCS mul_compute_test.cc DEPS arena_framework
${
xpu_kernels
}
${
npu_kernels
}
${
x86_kernels
}
${
cuda_kernels
}
${
arm_kernels
}
${
lite_ops
}
${
host_kernels
}
)
lite_cc_test
(
test_kernel_mul_compute SRCS mul_compute_test.cc DEPS arena_framework
${
xpu_kernels
}
${
npu_kernels
}
${
x86_kernels
}
${
cuda_kernels
}
${
arm_kernels
}
${
lite_ops
}
${
host_kernels
}
)
lite_cc_test
(
test_kernel_multiclass_nms_compute SRCS multiclass_nms_compute_test.cc DEPS arena_framework
${
xpu_kernels
}
${
npu_kernels
}
${
x86_kernels
}
${
cuda_kernels
}
${
arm_kernels
}
${
lite_ops
}
${
host_kernels
}
)
lite_cc_test
(
test_kernel_batch_norm_compute SRCS batch_norm_compute_test.cc DEPS arena_framework
${
xpu_kernels
}
${
npu_kernels
}
${
x86_kernels
}
${
cuda_kernels
}
${
arm_kernels
}
${
lite_ops
}
${
host_kernels
}
)
lite_cc_test
(
test_kernel_batch_norm_compute SRCS batch_norm_compute_test.cc DEPS arena_framework
${
xpu_kernels
}
${
npu_kernels
}
${
x86_kernels
}
${
cuda_kernels
}
${
arm_kernels
}
${
lite_ops
}
${
host_kernels
}
)
lite_cc_test
(
test_kernel_pool_compute SRCS pool_compute_test.cc DEPS arena_framework
${
xpu_kernels
}
${
npu_kernels
}
${
x86_kernels
}
${
cuda_kernels
}
${
arm_kernels
}
${
lite_ops
}
${
host_kernels
}
)
lite_cc_test
(
test_kernel_pool_compute SRCS pool_compute_test.cc DEPS arena_framework
${
xpu_kernels
}
${
npu_kernels
}
${
x86_kernels
}
${
cuda_kernels
}
${
arm_kernels
}
${
lite_ops
}
${
host_kernels
}
)
lite_cc_test
(
test_kernel_fill_constant_compute SRCS fill_constant_compute_test.cc DEPS arena_framework
${
xpu_kernels
}
${
npu_kernels
}
${
x86_kernels
}
${
cuda_kernels
}
${
arm_kernels
}
${
lite_ops
}
${
host_kernels
}
)
lite_cc_test
(
test_kernel_fill_constant_compute SRCS fill_constant_compute_test.cc DEPS arena_framework
${
xpu_kernels
}
${
npu_kernels
}
${
x86_kernels
}
${
cuda_kernels
}
${
arm_kernels
}
${
lite_ops
}
${
host_kernels
}
)
...
...
lite/tests/kernels/multiclass_nms_compute_test.cc
0 → 100644
浏览文件 @
917620e8
// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <gtest/gtest.h>
#include <cmath>
#include <string>
#include "lite/api/paddle_use_kernels.h"
#include "lite/api/paddle_use_ops.h"
#include "lite/core/arena/framework.h"
#include "lite/tests/utils/fill_data.h"
namespace
paddle
{
namespace
lite
{
template
<
class
T
>
bool
SortScorePairDescend
(
const
std
::
pair
<
float
,
T
>&
pair1
,
const
std
::
pair
<
float
,
T
>&
pair2
)
{
return
pair1
.
first
>
pair2
.
first
;
}
template
<
class
T
>
static
void
GetMaxScoreIndex
(
const
std
::
vector
<
T
>&
scores
,
const
T
threshold
,
int
top_k
,
std
::
vector
<
std
::
pair
<
T
,
int
>>*
sorted_indices
)
{
for
(
size_t
i
=
0
;
i
<
scores
.
size
();
++
i
)
{
if
(
scores
[
i
]
>
threshold
)
{
sorted_indices
->
push_back
(
std
::
make_pair
(
scores
[
i
],
i
));
}
}
// Sort the score pair according to the scores in descending order
std
::
stable_sort
(
sorted_indices
->
begin
(),
sorted_indices
->
end
(),
SortScorePairDescend
<
int
>
);
// Keep top_k scores if needed.
if
(
top_k
>
-
1
&&
top_k
<
static_cast
<
int
>
(
sorted_indices
->
size
()))
{
sorted_indices
->
resize
(
top_k
);
}
}
template
<
class
T
>
static
T
BBoxArea
(
const
T
*
box
,
const
bool
normalized
)
{
if
(
box
[
2
]
<
box
[
0
]
||
box
[
3
]
<
box
[
1
])
{
// If coordinate values are is invalid
// (e.g. xmax < xmin or ymax < ymin), return 0.
return
static_cast
<
T
>
(
0.
);
}
else
{
const
T
w
=
box
[
2
]
-
box
[
0
];
const
T
h
=
box
[
3
]
-
box
[
1
];
if
(
normalized
)
{
return
w
*
h
;
}
else
{
// If coordinate values are not within range [0, 1].
return
(
w
+
1
)
*
(
h
+
1
);
}
}
}
template
<
class
T
>
static
T
JaccardOverlap
(
const
T
*
box1
,
const
T
*
box2
,
const
bool
normalized
)
{
if
(
box2
[
0
]
>
box1
[
2
]
||
box2
[
2
]
<
box1
[
0
]
||
box2
[
1
]
>
box1
[
3
]
||
box2
[
3
]
<
box1
[
1
])
{
return
static_cast
<
T
>
(
0.
);
}
else
{
const
T
inter_xmin
=
std
::
max
(
box1
[
0
],
box2
[
0
]);
const
T
inter_ymin
=
std
::
max
(
box1
[
1
],
box2
[
1
]);
const
T
inter_xmax
=
std
::
min
(
box1
[
2
],
box2
[
2
]);
const
T
inter_ymax
=
std
::
min
(
box1
[
3
],
box2
[
3
]);
T
norm
=
normalized
?
static_cast
<
T
>
(
0.
)
:
static_cast
<
T
>
(
1.
);
T
inter_w
=
inter_xmax
-
inter_xmin
+
norm
;
T
inter_h
=
inter_ymax
-
inter_ymin
+
norm
;
const
T
inter_area
=
inter_w
*
inter_h
;
const
T
bbox1_area
=
BBoxArea
<
T
>
(
box1
,
normalized
);
const
T
bbox2_area
=
BBoxArea
<
T
>
(
box2
,
normalized
);
return
inter_area
/
(
bbox1_area
+
bbox2_area
-
inter_area
);
}
}
template
<
class
T
>
void
SliceOneClass
(
const
Tensor
&
items
,
const
int
class_id
,
Tensor
*
one_class_item
)
{
T
*
item_data
=
one_class_item
->
mutable_data
<
T
>
();
const
T
*
items_data
=
items
.
data
<
T
>
();
const
int64_t
num_item
=
items
.
dims
()[
0
];
const
int64_t
class_num
=
items
.
dims
()[
1
];
if
(
items
.
dims
().
size
()
==
3
)
{
int64_t
item_size
=
items
.
dims
()[
2
];
for
(
int
i
=
0
;
i
<
num_item
;
++
i
)
{
std
::
memcpy
(
item_data
+
i
*
item_size
,
items_data
+
i
*
class_num
*
item_size
+
class_id
*
item_size
,
sizeof
(
T
)
*
item_size
);
}
}
else
{
for
(
int
i
=
0
;
i
<
num_item
;
++
i
)
{
item_data
[
i
]
=
items_data
[
i
*
class_num
+
class_id
];
}
}
}
template
<
typename
T
>
void
NMSFast
(
const
Tensor
&
bbox
,
const
Tensor
&
scores
,
const
T
score_threshold
,
const
T
nms_threshold
,
const
T
eta
,
const
int64_t
top_k
,
std
::
vector
<
int
>*
selected_indices
,
const
bool
normalized
)
{
// The total boxes for each instance.
int64_t
num_boxes
=
bbox
.
dims
()[
0
];
// 4: [xmin ymin xmax ymax]
// 8: [x1 y1 x2 y2 x3 y3 x4 y4]
// 16, 24, or 32: [x1 y1 x2 y2 ... xn yn], n = 8, 12 or 16
int64_t
box_size
=
bbox
.
dims
()[
1
];
std
::
vector
<
T
>
scores_data
(
num_boxes
);
std
::
copy_n
(
scores
.
data
<
T
>
(),
num_boxes
,
scores_data
.
begin
());
std
::
vector
<
std
::
pair
<
T
,
int
>>
sorted_indices
;
GetMaxScoreIndex
(
scores_data
,
score_threshold
,
top_k
,
&
sorted_indices
);
selected_indices
->
clear
();
T
adaptive_threshold
=
nms_threshold
;
const
T
*
bbox_data
=
bbox
.
data
<
T
>
();
while
(
sorted_indices
.
size
()
!=
0
)
{
const
int
idx
=
sorted_indices
.
front
().
second
;
bool
keep
=
true
;
for
(
size_t
k
=
0
;
k
<
selected_indices
->
size
();
++
k
)
{
if
(
keep
)
{
const
int
kept_idx
=
(
*
selected_indices
)[
k
];
T
overlap
=
T
(
0.
);
// 4: [xmin ymin xmax ymax]
if
(
box_size
==
4
)
{
overlap
=
JaccardOverlap
<
T
>
(
bbox_data
+
idx
*
box_size
,
bbox_data
+
kept_idx
*
box_size
,
normalized
);
}
else
{
LOG
(
FATAL
)
<<
"not support"
;
}
keep
=
overlap
<=
adaptive_threshold
;
}
else
{
break
;
}
}
if
(
keep
)
{
selected_indices
->
push_back
(
idx
);
}
sorted_indices
.
erase
(
sorted_indices
.
begin
());
if
(
keep
&&
eta
<
1
&&
adaptive_threshold
>
0.5
)
{
adaptive_threshold
*=
eta
;
}
}
}
template
<
typename
T
>
void
MultiClassNMS
(
const
Tensor
&
scores
,
const
Tensor
&
bboxes
,
const
int
scores_size
,
std
::
map
<
int
,
std
::
vector
<
int
>>*
indices
,
int
*
num_nmsed_out
,
int64_t
background_label
,
int64_t
nms_top_k
,
int64_t
keep_top_k
,
bool
normalized
,
T
nms_threshold
,
T
nms_eta
,
T
score_threshold
)
{
int
num_det
=
0
;
int64_t
class_num
=
scores_size
==
3
?
scores
.
dims
()[
0
]
:
scores
.
dims
()[
1
];
Tensor
bbox_slice
,
score_slice
;
for
(
int64_t
c
=
0
;
c
<
class_num
;
++
c
)
{
if
(
c
==
background_label
)
continue
;
if
(
scores_size
==
3
)
{
score_slice
=
scores
.
Slice
<
T
>
(
c
,
c
+
1
);
bbox_slice
=
bboxes
;
}
else
{
score_slice
.
Resize
({
scores
.
dims
()[
0
],
1
});
bbox_slice
.
Resize
({
scores
.
dims
()[
0
],
4
});
SliceOneClass
<
T
>
(
scores
,
c
,
&
score_slice
);
SliceOneClass
<
T
>
(
bboxes
,
c
,
&
bbox_slice
);
}
NMSFast
(
bbox_slice
,
score_slice
,
score_threshold
,
nms_threshold
,
nms_eta
,
nms_top_k
,
&
((
*
indices
)[
c
]),
normalized
);
if
(
scores_size
==
2
)
{
std
::
stable_sort
((
*
indices
)[
c
].
begin
(),
(
*
indices
)[
c
].
end
());
}
num_det
+=
(
*
indices
)[
c
].
size
();
}
*
num_nmsed_out
=
num_det
;
const
T
*
scores_data
=
scores
.
data
<
T
>
();
if
(
keep_top_k
>
-
1
&&
num_det
>
keep_top_k
)
{
const
T
*
sdata
;
std
::
vector
<
std
::
pair
<
float
,
std
::
pair
<
int
,
int
>>>
score_index_pairs
;
for
(
const
auto
&
it
:
*
indices
)
{
int
label
=
it
.
first
;
if
(
scores_size
==
3
)
{
sdata
=
scores_data
+
label
*
scores
.
dims
()[
1
];
}
else
{
score_slice
.
Resize
({
scores
.
dims
()[
0
],
1
});
SliceOneClass
<
T
>
(
scores
,
label
,
&
score_slice
);
sdata
=
score_slice
.
data
<
T
>
();
}
const
std
::
vector
<
int
>&
label_indices
=
it
.
second
;
for
(
size_t
j
=
0
;
j
<
label_indices
.
size
();
++
j
)
{
int
idx
=
label_indices
[
j
];
score_index_pairs
.
push_back
(
std
::
make_pair
(
sdata
[
idx
],
std
::
make_pair
(
label
,
idx
)));
}
}
// Keep top k results per image.
std
::
stable_sort
(
score_index_pairs
.
begin
(),
score_index_pairs
.
end
(),
SortScorePairDescend
<
std
::
pair
<
int
,
int
>>
);
score_index_pairs
.
resize
(
keep_top_k
);
// Store the new indices.
std
::
map
<
int
,
std
::
vector
<
int
>>
new_indices
;
for
(
size_t
j
=
0
;
j
<
score_index_pairs
.
size
();
++
j
)
{
int
label
=
score_index_pairs
[
j
].
second
.
first
;
int
idx
=
score_index_pairs
[
j
].
second
.
second
;
new_indices
[
label
].
push_back
(
idx
);
}
if
(
scores_size
==
2
)
{
for
(
const
auto
&
it
:
new_indices
)
{
int
label
=
it
.
first
;
std
::
stable_sort
(
new_indices
[
label
].
begin
(),
new_indices
[
label
].
end
());
}
}
new_indices
.
swap
(
*
indices
);
*
num_nmsed_out
=
keep_top_k
;
}
}
template
<
typename
T
>
void
MultiClassOutput
(
const
Tensor
&
scores
,
const
Tensor
&
bboxes
,
const
std
::
map
<
int
,
std
::
vector
<
int
>>&
selected_indices
,
const
int
scores_size
,
Tensor
*
outs
,
int
*
oindices
=
nullptr
,
const
int
offset
=
0
)
{
int64_t
class_num
=
scores
.
dims
()[
1
];
int64_t
predict_dim
=
scores
.
dims
()[
1
];
int64_t
box_size
=
bboxes
.
dims
()[
1
];
if
(
scores_size
==
2
)
{
box_size
=
bboxes
.
dims
()[
2
];
}
int64_t
out_dim
=
box_size
+
2
;
auto
*
scores_data
=
scores
.
data
<
T
>
();
auto
*
bboxes_data
=
bboxes
.
data
<
T
>
();
auto
*
odata
=
outs
->
mutable_data
<
T
>
();
const
T
*
sdata
;
Tensor
bbox
;
bbox
.
Resize
({
scores
.
dims
()[
0
],
box_size
});
int
count
=
0
;
for
(
const
auto
&
it
:
selected_indices
)
{
int
label
=
it
.
first
;
const
std
::
vector
<
int
>&
indices
=
it
.
second
;
if
(
scores_size
==
2
)
{
SliceOneClass
<
T
>
(
bboxes
,
label
,
&
bbox
);
}
else
{
sdata
=
scores_data
+
label
*
predict_dim
;
}
for
(
size_t
j
=
0
;
j
<
indices
.
size
();
++
j
)
{
int
idx
=
indices
[
j
];
odata
[
count
*
out_dim
]
=
label
;
// label
const
T
*
bdata
;
if
(
scores_size
==
3
)
{
bdata
=
bboxes_data
+
idx
*
box_size
;
odata
[
count
*
out_dim
+
1
]
=
sdata
[
idx
];
// score
if
(
oindices
!=
nullptr
)
{
oindices
[
count
]
=
offset
+
idx
;
}
}
else
{
bdata
=
bbox
.
data
<
T
>
()
+
idx
*
box_size
;
odata
[
count
*
out_dim
+
1
]
=
*
(
scores_data
+
idx
*
class_num
+
label
);
if
(
oindices
!=
nullptr
)
{
oindices
[
count
]
=
offset
+
idx
*
class_num
+
label
;
}
}
// xmin, ymin, xmax, ymax or multi-points coordinates
std
::
memcpy
(
odata
+
count
*
out_dim
+
2
,
bdata
,
box_size
*
sizeof
(
T
));
count
++
;
}
}
}
class
MulticlassNmsComputeTester
:
public
arena
::
TestCase
{
protected:
// common attributes for this op.
std
::
string
type_
=
"multiclass_nms"
;
std
::
string
bboxes_
=
"bboxes"
;
std
::
string
scores_
=
"scores"
;
std
::
string
out_
=
"out"
;
DDim
bboxes_dims_
{};
DDim
scores_dims_
{};
int
keep_top_k_
{
2
};
float
nms_threshold_
{
0.45
f
};
float
nms_eta_
{
1.
f
};
int
nms_top_k_
{
1
};
int
background_label_
{
-
1
};
float
score_threshold_
{
0.01
f
};
bool
normalized_
{
false
};
public:
MulticlassNmsComputeTester
(
const
Place
&
place
,
const
std
::
string
&
alias
,
DDim
bboxes_dims
,
DDim
scores_dims
,
int
keep_top_k
=
2
,
float
nms_threshold
=
0.45
f
,
float
nms_eta
=
1.
f
,
int
nms_top_k
=
1
,
int
background_label
=
1
,
float
score_threshold
=
0.01
f
,
bool
normalized
=
false
)
:
TestCase
(
place
,
alias
),
bboxes_dims_
(
bboxes_dims
),
scores_dims_
(
scores_dims
),
keep_top_k_
(
keep_top_k
),
nms_threshold_
(
nms_threshold
),
nms_eta_
(
nms_eta
),
nms_top_k_
(
nms_top_k
),
background_label_
(
background_label
),
score_threshold_
(
score_threshold
),
normalized_
(
normalized
)
{}
void
RunBaseline
(
Scope
*
scope
)
override
{
auto
*
boxes
=
scope
->
FindTensor
(
bboxes_
);
auto
*
scores
=
scope
->
FindTensor
(
scores_
);
auto
*
outs
=
scope
->
NewTensor
(
out_
);
CHECK
(
outs
);
outs
->
set_precision
(
PRECISION
(
kFloat
));
auto
score_size
=
scores_dims_
.
size
();
std
::
vector
<
std
::
map
<
int
,
std
::
vector
<
int
>>>
all_indices
;
std
::
vector
<
uint64_t
>
batch_starts
=
{
0
};
int64_t
batch_size
=
scores_dims_
[
0
];
int64_t
box_dim
=
bboxes_dims_
[
2
];
int64_t
out_dim
=
box_dim
+
2
;
int
num_nmsed_out
=
0
;
Tensor
boxes_slice
,
scores_slice
;
int
n
=
score_size
==
3
?
batch_size
:
boxes
->
lod
().
back
().
size
()
-
1
;
for
(
int
i
=
0
;
i
<
n
;
++
i
)
{
if
(
score_size
==
3
)
{
scores_slice
=
scores
->
Slice
<
float
>
(
i
,
i
+
1
);
scores_slice
.
Resize
({
scores_dims_
[
1
],
scores_dims_
[
2
]});
boxes_slice
=
boxes
->
Slice
<
float
>
(
i
,
i
+
1
);
boxes_slice
.
Resize
({
scores_dims_
[
2
],
box_dim
});
}
else
{
auto
boxes_lod
=
boxes
->
lod
().
back
();
scores_slice
=
scores
->
Slice
<
float
>
(
boxes_lod
[
i
],
boxes_lod
[
i
+
1
]);
boxes_slice
=
boxes
->
Slice
<
float
>
(
boxes_lod
[
i
],
boxes_lod
[
i
+
1
]);
}
std
::
map
<
int
,
std
::
vector
<
int
>>
indices
;
MultiClassNMS
<
float
>
(
scores_slice
,
boxes_slice
,
score_size
,
&
indices
,
&
num_nmsed_out
,
background_label_
,
nms_top_k_
,
keep_top_k_
,
normalized_
,
nms_threshold_
,
nms_eta_
,
score_threshold_
);
all_indices
.
push_back
(
indices
);
batch_starts
.
push_back
(
batch_starts
.
back
()
+
num_nmsed_out
);
}
uint64_t
num_kept
=
batch_starts
.
back
();
if
(
num_kept
==
0
)
{
outs
->
Resize
({
1
,
1
});
float
*
od
=
outs
->
mutable_data
<
float
>
();
od
[
0
]
=
-
1
;
batch_starts
=
{
0
,
1
};
}
else
{
outs
->
Resize
({
static_cast
<
int64_t
>
(
num_kept
),
out_dim
});
outs
->
mutable_data
<
float
>
();
int
offset
=
0
;
int
*
oindices
=
nullptr
;
for
(
int
i
=
0
;
i
<
n
;
++
i
)
{
if
(
score_size
==
3
)
{
scores_slice
=
scores
->
Slice
<
float
>
(
i
,
i
+
1
);
boxes_slice
=
boxes
->
Slice
<
float
>
(
i
,
i
+
1
);
scores_slice
.
Resize
({
scores_dims_
[
1
],
scores_dims_
[
2
]});
boxes_slice
.
Resize
({
scores_dims_
[
2
],
box_dim
});
}
else
{
auto
boxes_lod
=
boxes
->
lod
().
back
();
scores_slice
=
scores
->
Slice
<
float
>
(
boxes_lod
[
i
],
boxes_lod
[
i
+
1
]);
boxes_slice
=
boxes
->
Slice
<
float
>
(
boxes_lod
[
i
],
boxes_lod
[
i
+
1
]);
}
int64_t
s
=
static_cast
<
int64_t
>
(
batch_starts
[
i
]);
int64_t
e
=
static_cast
<
int64_t
>
(
batch_starts
[
i
+
1
]);
if
(
e
>
s
)
{
Tensor
out
=
outs
->
Slice
<
float
>
(
s
,
e
);
MultiClassOutput
<
float
>
(
scores_slice
,
boxes_slice
,
all_indices
[
i
],
scores_dims_
.
size
(),
&
out
,
oindices
,
offset
);
}
}
}
LoD
lod
;
lod
.
emplace_back
(
batch_starts
);
outs
->
set_lod
(
lod
);
}
void
PrepareOpDesc
(
cpp
::
OpDesc
*
op_desc
)
{
op_desc
->
SetType
(
type_
);
op_desc
->
SetInput
(
"BBoxes"
,
{
bboxes_
});
op_desc
->
SetInput
(
"Scores"
,
{
scores_
});
op_desc
->
SetOutput
(
"Out"
,
{
out_
});
op_desc
->
SetAttr
(
"keep_top_k"
,
keep_top_k_
);
op_desc
->
SetAttr
(
"nms_threshold"
,
nms_threshold_
);
op_desc
->
SetAttr
(
"nms_eta"
,
nms_eta_
);
op_desc
->
SetAttr
(
"nms_top_k"
,
nms_top_k_
);
op_desc
->
SetAttr
(
"background_label"
,
background_label_
);
op_desc
->
SetAttr
(
"score_threshold"
,
score_threshold_
);
op_desc
->
SetAttr
(
"normalized"
,
normalized_
);
}
void
PrepareData
()
override
{
std
::
vector
<
float
>
bboxes
(
bboxes_dims_
.
production
());
for
(
int
i
=
0
;
i
<
bboxes_dims_
.
production
();
++
i
)
{
bboxes
[
i
]
=
i
*
1.
/
bboxes_dims_
.
production
();
}
SetCommonTensor
(
bboxes_
,
bboxes_dims_
,
bboxes
.
data
());
std
::
vector
<
float
>
scores
(
scores_dims_
.
production
());
for
(
int
i
=
0
;
i
<
scores_dims_
.
production
();
++
i
)
{
scores
[
i
]
=
i
*
1.
/
scores_dims_
.
production
();
}
SetCommonTensor
(
scores_
,
scores_dims_
,
scores
.
data
());
}
};
void
TestMulticlassNms
(
Place
place
,
float
abs_error
)
{
int
N
=
3
;
int
M
=
2500
;
for
(
int
class_num
:
{
2
,
4
,
10
})
{
std
::
vector
<
int64_t
>
bbox_shape
{
N
,
M
,
4
};
std
::
vector
<
int64_t
>
score_shape
{
N
,
class_num
,
M
};
std
::
unique_ptr
<
arena
::
TestCase
>
tester
(
new
MulticlassNmsComputeTester
(
place
,
"def"
,
DDim
(
bbox_shape
),
DDim
(
score_shape
)));
arena
::
Arena
arena
(
std
::
move
(
tester
),
place
,
abs_error
);
arena
.
TestPrecision
();
}
}
TEST
(
multiclass_nms
,
precision
)
{
float
abs_error
=
2e-5
;
Place
place
;
#if defined(LITE_WITH_ARM)
place
=
TARGET
(
kHost
);
#elif defined(LITE_WITH_XPU)
place
=
TARGET
(
kXPU
);
#else
return
;
#endif
TestMulticlassNms
(
place
,
abs_error
);
}
}
// namespace lite
}
// namespace paddle
lite/tests/kernels/yolo_box_compute_test.cc
浏览文件 @
917620e8
此差异已折叠。
点击以展开。
lite/tools/build.sh
浏览文件 @
917620e8
此差异已折叠。
点击以展开。
lite/tools/build_bm.sh
浏览文件 @
917620e8
...
@@ -5,7 +5,7 @@ set -ex
...
@@ -5,7 +5,7 @@ set -ex
BM_SDK_ROOT
=
"
$(
pwd
)
/third-party/bmlibs/bm_sc3_libs"
# BM SDK
BM_SDK_ROOT
=
"
$(
pwd
)
/third-party/bmlibs/bm_sc3_libs"
# BM SDK
TARGET_NAME
=
"BM1682"
# default target
TARGET_NAME
=
"BM1682"
# default target
BUILD_EXTRA
=
OFF
# ON(with sequence ops)/OFF
BUILD_EXTRA
=
OFF
# ON(with sequence ops)/OFF
WITH_TESTING
=
O
N
# ON/OFF
WITH_TESTING
=
O
FF
# ON/OFF
function
print_usage
{
function
print_usage
{
echo
-e
"
\n
USAGE:"
echo
-e
"
\n
USAGE:"
...
...
lite/tools/ci_build.sh
浏览文件 @
917620e8
...
@@ -192,6 +192,7 @@ function build_opencl {
...
@@ -192,6 +192,7 @@ function build_opencl {
cmake_opencl
${
os
}
${
abi
}
${
lang
}
cmake_opencl
${
os
}
${
abi
}
${
lang
}
make opencl_clhpp
-j
$NUM_CORES_FOR_COMPILE
make opencl_clhpp
-j
$NUM_CORES_FOR_COMPILE
make publish_inference
-j
$NUM_CORES_FOR_COMPILE
build
$TESTS_FILE
build
$TESTS_FILE
}
}
...
...
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录