Commit 809f7fc3
PaddlePaddle / Paddle-Lite

Authored by jackzhang235 on Mar 10, 2020; committed by jackzhang235 on Mar 18, 2020.
Parent: ce58801f

fix some error when compiling (#6)

* fix some error when compiling with mlu-sdk1.2.5
Showing 50 changed files with 567 additions and 135 deletions (+567 -135)
CMakeLists.txt                                      +5   -0
cmake/configure.cmake                               +4   -0
cmake/lite.cmake                                    +25  -7
lite/CMakeLists.txt                                 +1   -0
lite/api/CMakeLists.txt                             +13  -2
lite/api/cxx_api_impl.cc                            +8   -0
lite/api/opt.cc                                     +2   -0
lite/api/paddle_api.h                               +21  -0
lite/api/paddle_place.cc                            +4   -1
lite/api/paddle_place.h                             +4   -1
lite/api/paddle_use_passes.h                        +3   -0
lite/api/python/pybind/pybind.cc                    +22  -0
lite/backends/CMakeLists.txt                        +1   -0
lite/core/CMakeLists.txt                            +2   -1
lite/core/arena/CMakeLists.txt                      +1   -1
lite/core/context.h                                 +98  -0
lite/core/device_info.cc                            +61  -1
lite/core/device_info.h                             +52  -1
lite/core/kernel.h                                  +3   -0
lite/core/memory.cc                                 +16  -0
lite/core/memory.h                                  +9   -0
lite/core/mir/CMakeLists.txt                        +2   -0
lite/core/mir/ssa_graph.cc                          +38  -0
lite/core/mir/ssa_graph.h                           +5   -0
lite/core/mir/subgraph/subgraph_detector.cc         +3   -2
lite/core/mir/subgraph/subgraph_pass.cc             +16  -0
lite/core/mir/subgraph/subgraph_pass.h              +5   -0
lite/core/mir/subgraph_cast_display_pass.cc         +11  -25
lite/core/op_registry.cc                            +12  -0
lite/core/op_registry.h                             +26  -1
lite/core/optimizer.h                               +6   -0
lite/core/workspace.h                               +7   -0
lite/kernels/CMakeLists.txt                         +1   -0
lite/kernels/mlu/bridges/CMakeLists.txt             +8   -8
lite/kernels/mlu/bridges/act_op.cc                  +4   -0
lite/kernels/mlu/bridges/act_op_test.cc             +3   -7
lite/kernels/mlu/bridges/batch_norm_op_test.cc      +1   -5
lite/kernels/mlu/bridges/conv_op_test.cc            +2   -8
lite/kernels/mlu/bridges/elementwise_ops_test.cc    +4   -14
lite/kernels/mlu/bridges/fc_op_test.cc              +1   -3
lite/kernels/mlu/bridges/pool_op_test.cc            +1   -5
lite/kernels/mlu/bridges/softmax_op_test.cc         +1   -5
lite/kernels/mlu/bridges/test_helper.cc             +1   -1
lite/kernels/mlu/io_copy_compute.cc                 +0   -19
lite/kernels/mlu/subgraph_compute.h                 +39  -13
lite/kernels/npu/bridges/CMakeLists.txt             +1   -1
lite/kernels/x86/cast_compute.cc                    +11  -0
lite/tests/cv/CMakeLists.txt                        +1   -1
lite/tests/kernels/CMakeLists.txt                   +1   -1
lite/tests/math/CMakeLists.txt                      +1   -1
CMakeLists.txt

@@ -59,6 +59,7 @@ lite_option(LITE_WITH_CUDA "Enable CUDA in lite mode" OFF)
 lite_option(LITE_WITH_X86   "Enable X86 in lite mode"  ON)
 lite_option(LITE_WITH_ARM   "Enable ARM in lite mode"  OFF)
 lite_option(LITE_WITH_NPU   "Enable NPU in lite mode"  OFF)
+lite_option(LITE_WITH_MLU   "Enable MLU in lite mode"  OFF)
 lite_option(LITE_WITH_XPU   "Enable XPU in lite mode"  OFF)
 lite_option(LITE_WITH_BM    "Enable BM in lite mode"   OFF)
 lite_option(LITE_WITH_TRAIN "Enable training operators and kernels in lite" OFF)
@@ -177,6 +178,10 @@ if(LITE_WITH_XPU)
     include(device/xpu)
 endif()

+if(LITE_WITH_MLU)
+    include(mlu)
+endif()
+
 include(external/mklml)    # download mklml package
 include(external/xbyak)    # download xbyak package
 include(external/libxsmm)  # download, build, install libxsmm
cmake/configure.cmake

@@ -150,6 +150,10 @@ if (LITE_WITH_BM)
   add_definitions("-DLITE_WITH_BM")
 endif()

+if (LITE_WITH_MLU)
+  add_definitions("-DLITE_WITH_MLU")
+endif()
+
 if (LITE_WITH_PROFILE)
   add_definitions("-DLITE_WITH_PROFILE")
   if (LITE_WITH_PRECISION_PROFILE)
cmake/lite.cmake

@@ -22,7 +22,7 @@ endfunction()
 function (lite_deps TARGET)
   set(options "")
   set(oneValueArgs "")
-  set(multiValueArgs DEPS X86_DEPS CUDA_DEPS ARM_DEPS PROFILE_DEPS LIGHT_DEPS HVY_DEPS CL_DEPS FPGA_DEPS BM_DEPS NPU_DEPS XPU_DEPS CV_DEPS ARGS)
+  set(multiValueArgs DEPS X86_DEPS CUDA_DEPS ARM_DEPS PROFILE_DEPS LIGHT_DEPS HVY_DEPS CL_DEPS FPGA_DEPS BM_DEPS NPU_DEPS XPU_DEPS MLU_DEPS CV_DEPS ARGS)
   cmake_parse_arguments(lite_deps "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})

   set(deps ${lite_deps_DEPS})
@@ -100,6 +100,12 @@ function (lite_deps TARGET)
     endforeach(var)
   endif()

+  if (LITE_WITH_MLU)
+    foreach(var ${lite_deps_MLU_DEPS})
+      set(deps ${deps} ${var})
+    endforeach(var)
+  endif()
+
   set(${TARGET} ${deps} PARENT_SCOPE)
 endfunction()
@@ -125,7 +131,7 @@ file(WRITE ${offline_lib_registry_file} "") # clean
 function(lite_cc_library TARGET)
   set(options SHARED shared STATIC static MODULE module)
   set(oneValueArgs "")
-  set(multiValueArgs SRCS DEPS X86_DEPS CUDA_DEPS CL_DEPS ARM_DEPS FPGA_DEPS BM_DEPS NPU_DEPS XPU_DEPS CV_DEPS PROFILE_DEPS LIGHT_DEPS
+  set(multiValueArgs SRCS DEPS X86_DEPS CUDA_DEPS CL_DEPS ARM_DEPS FPGA_DEPS BM_DEPS NPU_DEPS XPU_DEPS MLU_DEPS CV_DEPS PROFILE_DEPS LIGHT_DEPS
     HVY_DEPS EXCLUDE_COMPILE_DEPS ARGS)
   cmake_parse_arguments(args "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
@@ -144,6 +150,7 @@ function(lite_cc_library TARGET)
       PROFILE_DEPS ${args_PROFILE_DEPS}
       LIGHT_DEPS ${args_LIGHT_DEPS}
       HVY_DEPS ${args_HVY_DEPS}
+      MLU_DEPS ${args_MLU_DEPS}
       )

   if (args_SHARED OR ARGS_shared)
@@ -170,7 +177,7 @@ function(lite_cc_binary TARGET)
     set(options " -g ")
   endif()
   set(oneValueArgs "")
-  set(multiValueArgs SRCS DEPS X86_DEPS CUDA_DEPS CL_DEPS ARM_DEPS FPGA_DEPS BM_DEPS NPU_DEPS XPU_DEPS PROFILE_DEPS
+  set(multiValueArgs SRCS DEPS X86_DEPS CUDA_DEPS CL_DEPS ARM_DEPS FPGA_DEPS BM_DEPS NPU_DEPS XPU_DEPS MLU_DEPS PROFILE_DEPS
     LIGHT_DEPS HVY_DEPS EXCLUDE_COMPILE_DEPS CV_DEPS ARGS)
   cmake_parse_arguments(args "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
@@ -189,6 +196,7 @@ function(lite_cc_binary TARGET)
       LIGHT_DEPS ${args_LIGHT_DEPS}
       HVY_DEPS ${args_HVY_DEPS}
       CV_DEPS ${CV_DEPS}
+      MLU_DEPS ${args_MLU_DEPS}
       )
   cc_binary(${TARGET} SRCS ${args_SRCS} DEPS ${deps})
   target_compile_options(${TARGET} BEFORE PRIVATE -Wno-ignored-qualifiers)
@@ -218,7 +226,7 @@ function(lite_cc_test TARGET)
   endif()
   set(options "")
   set(oneValueArgs "")
-  set(multiValueArgs SRCS DEPS X86_DEPS CUDA_DEPS CL_DEPS ARM_DEPS FPGA_DEPS BM_DEPS NPU_DEPS XPU_DEPS PROFILE_DEPS
+  set(multiValueArgs SRCS DEPS X86_DEPS CUDA_DEPS CL_DEPS ARM_DEPS FPGA_DEPS BM_DEPS NPU_DEPS XPU_DEPS MLU_DEPS PROFILE_DEPS
     LIGHT_DEPS HVY_DEPS EXCLUDE_COMPILE_DEPS CV_DEPS
     ARGS
     COMPILE_LEVEL # (basic|extra)
@@ -245,6 +253,7 @@ function(lite_cc_test TARGET)
       LIGHT_DEPS ${args_LIGHT_DEPS}
       HVY_DEPS ${args_HVY_DEPS}
       CV_DEPS ${args_CV_DEPS}
+      MLU_DEPS ${args_MLU_DEPS}
       )
   _lite_cc_test(${TARGET} SRCS ${args_SRCS} DEPS ${deps} ARGS ${args_ARGS})
   # strip binary target to reduce size
@@ -269,6 +278,7 @@ set(cuda_kernels CACHE INTERNAL "cuda kernels")
 set(fpga_kernels CACHE INTERNAL "fpga kernels")
 set(npu_kernels CACHE INTERNAL "npu kernels")
 set(xpu_kernels CACHE INTERNAL "xpu kernels")
+set(mlu_kernels CACHE INTERNAL "mlu kernels")
 set(bm_kernels CACHE INTERNAL "bm kernels")
 set(opencl_kernels CACHE INTERNAL "opencl kernels")
 set(host_kernels CACHE INTERNAL "host kernels")
@@ -280,12 +290,12 @@ if(LITE_BUILD_TAILOR)
   file(STRINGS ${tailored_kernels_list_path} tailored_kernels_list)
 endif()
 # add a kernel for some specific device
-# device: one of (Host, ARM, X86, NPU, FPGA, OPENCL, CUDA, BM)
+# device: one of (Host, ARM, X86, NPU, MLU, FPGA, OPENCL, CUDA, BM)
 # level: one of (basic, extra)
 function(add_kernel TARGET device level)
   set(options "")
   set(oneValueArgs "")
-  set(multiValueArgs SRCS DEPS X86_DEPS CUDA_DEPS CL_DEPS ARM_DEPS FPGA_DEPS BM_DEPS NPU_DEPS XPU_DEPS PROFILE_DEPS
+  set(multiValueArgs SRCS DEPS X86_DEPS CUDA_DEPS CL_DEPS ARM_DEPS FPGA_DEPS BM_DEPS NPU_DEPS XPU_DEPS MLU_DEPS PROFILE_DEPS
     LIGHT_DEPS HVY_DEPS EXCLUDE_COMPILE_DEPS
     ARGS)
   cmake_parse_arguments(args "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
@@ -357,6 +367,12 @@ function(add_kernel TARGET device level)
     endif()
     set(bm_kernels "${bm_kernels};${TARGET}" CACHE INTERNAL "")
   endif()
+  if ("${device}" STREQUAL "MLU")
+    if (NOT LITE_WITH_MLU)
+      return()
+    endif()
+    set(mlu_kernels "${mlu_kernels};${TARGET}" CACHE INTERNAL "")
+  endif()
   if ("${device}" STREQUAL "OPENCL")
     if (NOT LITE_WITH_OPENCL)
       return()
@@ -391,6 +407,7 @@ function(add_kernel TARGET device level)
       NPU_DEPS ${args_NPU_DEPS}
       XPU_DEPS ${args_XPU_DEPS}
       BM_DEPS ${args_BM_DEPS}
+      MLU_DEPS ${args_MLU_DEPS}
       PROFILE_DEPS ${args_PROFILE_DEPS}
       LIGHT_DEPS ${args_LIGHT_DEPS}
       HVY_DEPS ${args_HVY_DEPS}
@@ -409,7 +426,7 @@ endif()
 function(add_operator TARGET level)
   set(options "")
   set(oneValueArgs "")
-  set(multiValueArgs SRCS DEPS X86_DEPS CUDA_DEPS CL_DEPS ARM_DEPS FPGA_DEPS BM_DEPS NPU_DEPS XPU_DEPS PROFILE_DEPS
+  set(multiValueArgs SRCS DEPS X86_DEPS CUDA_DEPS CL_DEPS ARM_DEPS FPGA_DEPS BM_DEPS NPU_DEPS XPU_DEPS MLU_DEPS PROFILE_DEPS
     LIGHT_DEPS HVY_DEPS EXCLUDE_COMPILE_DEPS
     ARGS)
   cmake_parse_arguments(args "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
@@ -442,6 +459,7 @@ function(add_operator TARGET level)
       NPU_DEPS ${args_NPU_DEPS}
       XPU_DEPS ${args_XPU_DEPS}
       BM_DEPS ${args_BM_DEPS}
+      MLU_DEPS ${args_MLU_DEPS}
       PROFILE_DEPS ${args_PROFILE_DEPS}
       LIGHT_DEPS ${args_LIGHT_DEPS}
       HVY_DEPS ${args_HVY_DEPS}
lite/CMakeLists.txt

@@ -9,6 +9,7 @@ message(STATUS "LITE_WITH_OPENCL:\t${LITE_WITH_OPENCL}")
 message(STATUS "LITE_WITH_NPU:\t${LITE_WITH_NPU}")
 message(STATUS "LITE_WITH_XPU:\t${LITE_WITH_XPU}")
 message(STATUS "LITE_WITH_FPGA:\t${LITE_WITH_FPGA}")
+message(STATUS "LITE_WITH_MLU:\t${LITE_WITH_MLU}")
 message(STATUS "LITE_WITH_BM:\t${LITE_WITH_BM}")
 message(STATUS "LITE_WITH_PROFILE:\t${LITE_WITH_PROFILE}")
 message(STATUS "LITE_WITH_CV:\t${LITE_WITH_CV}")
lite/api/CMakeLists.txt

@@ -65,7 +65,8 @@ if (WITH_TESTING)
       CUDA_DEPS ${cuda_kernels}
       X86_DEPS ${x86_kernels}
       XPU_DEPS ${xpu_kernels}
-      BM_DEPS ${bm_kernels})
+      BM_DEPS ${bm_kernels}
+      MLU_DEPS ${mlu_kernels})
 endif()
 if(LITE_WITH_FPGA)
     set(light_api_deps ${light_api_deps} ${fpga_deps})
@@ -87,6 +88,7 @@ message(STATUS "get NPU kernels ${npu_kernels}")
 message(STATUS "get XPU kernels ${xpu_kernels}")
 message(STATUS "get FPGA kernels ${fpga_kernels}")
 message(STATUS "get BM kernels ${bm_kernels}")
+message(STATUS "get MLU kernels ${mlu_kernels}")

 # for full api
 if (NOT LITE_ON_TINY_PUBLISH)
@@ -124,7 +126,8 @@ lite_cc_library(light_api SRCS light_api.cc
       XPU_DEPS ${xpu_kernels}
       CL_DEPS ${opencl_kernels}
       FPGA_DEPS ${fpga_kernels}
-      BM_DEPS ${bm_kernels})
+      BM_DEPS ${bm_kernels}
+      MLU_DEPS ${mlu_kernels})

 include(ExternalProject)
 set(LITE_DEMO_INSTALL_DIR "${THIRD_PARTY_PATH}/inference_demo" CACHE STRING
@@ -143,6 +146,7 @@ if(WITH_TESTING)
       CL_DEPS ${opencl_kernels}
       FPGA_DEPS ${fpga_kernels}
       BM_DEPS ${bm_kernels}
+      MLU_DEPS ${mlu_kernels}
       EXCLUDE_COMPILE_DEPS "ON"
       ARGS --model_dir=${LITE_MODEL_DIR}/lite_naive_model
            --optimized_model=${LITE_MODEL_DIR}/lite_naive_model_opt SERIAL)
@@ -288,6 +292,7 @@ lite_cc_test(test_apis SRCS apis_test.cc
       XPU_DEPS ${xpu_kernels}
       FPGA_DEPS ${fpga_kernels}
       BM_DEPS ${bm_kernels}
+      MLU_DEPS ${mlu_kernels}
       ARGS --model_dir=${LITE_MODEL_DIR}/lite_naive_model
            --optimized_model=${LITE_MODEL_DIR}/lite_naive_model_opt SERIAL)
@@ -320,6 +325,7 @@ lite_cc_test(test_paddle_api SRCS paddle_api_test.cc DEPS paddle_api_full paddle
       X86_DEPS ${x86_kernels}
       FPGA_DEPS ${fpga_kernels}
       BM_DEPS ${bm_kernels}
+      MLU_DEPS ${mlu_kernels}
       ARGS --model_dir=${LITE_MODEL_DIR}/lite_naive_model SERIAL)
 if (WITH_TESTING)
     add_dependencies(test_paddle_api extern_lite_download_lite_naive_model_tar_gz)
@@ -333,6 +339,7 @@ if(NOT IOS)
       CV_DEPS paddle_cv_arm
       NPU_DEPS ${npu_kernels}
       XPU_DEPS ${xpu_kernels}
+      MLU_DEPS ${mlu_kernels}
       CL_DEPS ${opencl_kernels}
       BM_DEPS ${bm_kernels}
       FPGA_DEPS ${fpga_kernels}
@@ -345,6 +352,7 @@ if(NOT IOS)
       CV_DEPS paddle_cv_arm
       NPU_DEPS ${npu_kernels}
       XPU_DEPS ${xpu_kernels}
+      MLU_DEPS ${mlu_kernels}
       CL_DEPS ${opencl_kernels}
       BM_DEPS ${bm_kernels}
       FPGA_DEPS ${fpga_kernels}
@@ -357,6 +365,7 @@ if(NOT IOS)
       CV_DEPS paddle_cv_arm
       NPU_DEPS ${npu_kernels}
       XPU_DEPS ${xpu_kernels}
+      MLU_DEPS ${mlu_kernels}
       CL_DEPS ${opencl_kernels}
       BM_DEPS ${bm_kernels}
       FPGA_DEPS ${fpga_kernels}
@@ -369,6 +378,7 @@ if(NOT IOS)
       CV_DEPS paddle_cv_arm
       NPU_DEPS ${npu_kernels}
       XPU_DEPS ${xpu_kernels}
+      MLU_DEPS ${mlu_kernels}
       CL_DEPS ${opencl_kernels}
       FPGA_DEPS ${fpga_kernels}
       X86_DEPS ${x86_kernels}
@@ -380,6 +390,7 @@ if(NOT IOS)
       CV_DEPS paddle_cv_arm
       NPU_DEPS ${npu_kernels}
       XPU_DEPS ${xpu_kernels}
+      MLU_DEPS ${mlu_kernels}
       CL_DEPS ${opencl_kernels}
       BM_DEPS ${bm_kernels}
       FPGA_DEPS ${fpga_kernels}
lite/api/cxx_api_impl.cc

@@ -34,6 +34,11 @@ void CxxPaddleApiImpl::Init(const lite_api::CxxConfig &config) {
 #ifdef LITE_WITH_CUDA
   Env<TARGET(kCUDA)>::Init();
 #endif
+#ifdef LITE_WITH_MLU
+  Env<TARGET(kMLU)>::Init();
+  mlu_core_version_ = config.mlu_core_version();
+  mlu_core_number_ = config.mlu_core_number();
+#endif  // LITE_WITH_MLU
   auto places = config.valid_places();
   std::vector<std::string> passes{};
   auto use_layout_preprocess_pass =
@@ -82,6 +87,9 @@ std::vector<std::string> CxxPaddleApiImpl::GetOutputNames() {
 void CxxPaddleApiImpl::Run() {
 #ifdef LITE_WITH_ARM
   lite::DeviceInfo::Global().SetRunMode(mode_, threads_);
 #endif
+#ifdef LITE_WITH_MLU
+  lite::DeviceInfo::Global().SetMLURunMode(mlu_core_version_, mlu_core_number_);
+#endif
   raw_predictor_.Run();
 }
lite/api/opt.cc

@@ -109,6 +109,8 @@ std::vector<Place> ParserValidPlaces() {
       valid_places.emplace_back(TARGET(kNPU));
     } else if (target_repr == "xpu") {
       valid_places.emplace_back(TARGET(kXPU));
+    } else if (target_repr == "mlu") {
+      valid_places.emplace_back(TARGET(kMLU));
     } else {
       LOG(FATAL) << lite::string_format(
           "Wrong target '%s' found, please check the command flag "
lite/api/paddle_api.h

@@ -106,6 +106,8 @@ class LITE_API PaddlePredictor {
  protected:
   int threads_{1};
   lite_api::PowerMode mode_{lite_api::LITE_POWER_NO_BIND};
+  lite_api::MLUCoreVersion mlu_core_version_{lite_api::MLU_270};
+  int mlu_core_number_{1};
 };

 /// Base class for all the configs.
@@ -136,6 +138,11 @@ class LITE_API CxxConfig : public ConfigBase {
 #ifdef LITE_WITH_X86
   int x86_math_library_math_threads_ = 1;
 #endif
+  bool use_firstconv_{false};
+  std::vector<float> mean_ = {0.0f};
+  std::vector<float> std_ = {1.0f};
+  lite_api::MLUCoreVersion mlu_core_version_{lite_api::MLUCoreVersion::MLU_270};
+  int mlu_core_number_{1};

  public:
   void set_valid_places(const std::vector<Place>& x) { valid_places_ = x; }
@@ -163,6 +170,20 @@ class LITE_API CxxConfig : public ConfigBase {
     return x86_math_library_math_threads_;
   }
 #endif
+  void set_use_firstconv(const bool firstconv) { use_firstconv_ = firstconv; }
+  void set_mean(const std::vector<float> mean) { mean_ = mean; }
+  void set_std(const std::vector<float> std) { std_ = std; }
+  void set_mlu_core_version(lite_api::MLUCoreVersion core_version) {
+    mlu_core_version_ = core_version;
+  }
+  void set_mlu_core_number(int core_number) { mlu_core_number_ = core_number; }
+  bool use_first_conv() const { return use_firstconv_; }
+  std::vector<float> mean() const { return mean_; }
+  std::vector<float> std() const { return std_; }
+  lite_api::MLUCoreVersion mlu_core_version() const { return mlu_core_version_; }
+  int mlu_core_number() const { return mlu_core_number_; }
 };

 /// MobileConfig is the config for the light weight predictor, it will skip
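Taken together, the new CxxConfig accessors let a caller choose the MLU core version, core count, and the first-conv preprocessing (mean/std) before building a full-API predictor; cxx_api_impl.cc above forwards the two core settings to DeviceInfo::SetMLURunMode() on every Run(). A minimal usage sketch, assuming a LITE_WITH_MLU build; the model path and the exact precision/layout of the MLU place are illustrative choices, not taken from this commit:

#include "lite/api/paddle_api.h"

using namespace paddle::lite_api;  // NOLINT

int main() {
  CxxConfig config;
  config.set_model_dir("./model_dir");  // hypothetical model directory
  // Prefer MLU kernels, with X86 host kernels as fallback.
  config.set_valid_places({
      Place{TARGET(kMLU), PRECISION(kFloat), DATALAYOUT(kNHWC)},
      Place{TARGET(kX86), PRECISION(kFloat)},
  });
  // MLU knobs introduced by this commit:
  config.set_mlu_core_version(MLU_270);  // or MLU_220
  config.set_mlu_core_number(1);
  config.set_use_firstconv(false);
  auto predictor = CreatePaddlePredictor<CxxConfig>(config);
  // ... fill input tensors, then:
  predictor->Run();
  return 0;
}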
lite/api/paddle_place.cc

@@ -71,7 +71,8 @@ const std::string& TargetToStr(TargetType target) {
                                              "fpga",
                                              "npu",
                                              "xpu",
-                                             "bm"};
+                                             "bm",
+                                             "mlu"};
   auto x = static_cast<int>(target);
   CHECK_LT(x, static_cast<int>(TARGET(NUM)));
   return target2string[x];
@@ -111,6 +112,7 @@ const std::string& TargetRepr(TargetType target) {
                                              "kFPGA",
                                              "kNPU",
                                              "kXPU",
+                                             "kMLU",
                                              "kBM"};
   auto x = static_cast<int>(target);
   CHECK_LT(x, static_cast<int>(TARGET(NUM)));
@@ -153,6 +155,7 @@ std::set<TargetType> ExpandValidTargets(TargetType target) {
                                           TARGET(kNPU),
                                           TARGET(kXPU),
                                           TARGET(kBM),
+                                          TARGET(kMLU),
                                           TARGET(kFPGA)});
   if (target == TARGET(kAny)) {
     return valid_set;
lite/api/paddle_place.h

@@ -53,8 +53,9 @@ enum class TargetType : int {
   kNPU = 8,
   kXPU = 9,
   kBM = 10,
+  kMLU = 11,
   kAny = 6,  // any target
-  NUM = 11,  // number of fields.
+  NUM = 12,  // number of fields.
 };
 enum class PrecisionType : int {
   kUnk = 0,
@@ -88,6 +89,8 @@ typedef enum {
   LITE_POWER_RAND_LOW = 5
 } PowerMode;

+typedef enum { MLU_220 = 0, MLU_270 = 1 } MLUCoreVersion;
+
 enum class ActivationType : int {
   kIndentity = 0,
   kRelu = 1,
lite/api/paddle_use_passes.h

@@ -45,5 +45,8 @@ USE_MIR_PASS(memory_optimize_pass);
 USE_MIR_PASS(elementwise_mul_constant_eliminate_pass)
 USE_MIR_PASS(npu_subgraph_pass);
 USE_MIR_PASS(xpu_subgraph_pass);
+USE_MIR_PASS(mlu_subgraph_pass);
+USE_MIR_PASS(mlu_postprocess_pass);
+USE_MIR_PASS(subgraph_cast_display_pass);
 USE_MIR_PASS(weight_quantization_preprocess_pass);
 USE_MIR_PASS(quantized_op_attributes_inference_pass);
lite/api/python/pybind/pybind.cc

@@ -109,6 +109,11 @@ void BindLiteCxxConfig(py::module *m) {
       .def("set_power_mode", &CxxConfig::set_power_mode)
       .def("power_mode", &CxxConfig::power_mode);
 #endif
+#ifdef LITE_WITH_MLU
+  cxx_config.def("set_use_firstconv", &CxxConfig::set_use_firstconv)
+      .def("set_mean", &CxxConfig::set_mean)
+      .def("set_std", &CxxConfig::set_std)
+#endif
 }

 // TODO(sangoly): Should MobileConfig be renamed to LightConfig ??
@@ -150,6 +155,9 @@ void BindLitePlace(py::module *m) {
       .value("OpenCL", TargetType::kOpenCL)
       .value("FPGA", TargetType::kFPGA)
       .value("NPU", TargetType::kNPU)
+#ifdef LITE_WITH_MLU
+      .value("MLU", TargetType::kMLU)
+#endif
       .value("Any", TargetType::kAny);

   // PrecisionType
@@ -230,6 +238,20 @@ void BindLiteTensor(py::module *m) {
   DO_GETTER_ONCE(data_type__, name__##_data)

   DATA_GETTER_SETTER_ONCE(int8_t, int8);
+#ifdef LITE_WITH_MLU
+  tensor.def("set_uint8_data",
+             [](Tensor &self,
+                const std::vector<uint8_t> &data,
+                TargetType type = TargetType::kHost) {
+               if (type == TargetType::kHost) {
+                 self.CopyFromCpu<uint8_t, TargetType::kHost>(data.data());
+               }
+             },
+             py::arg("data"),
+             py::arg("type") = TargetType::kHost);
+
+  DO_GETTER_ONCE(uint8_t, "uint8_data");
+#endif
   DATA_GETTER_SETTER_ONCE(int32_t, int32);
   DATA_GETTER_SETTER_ONCE(float, float);
 #undef DO_GETTER_ONCE
lite/backends/CMakeLists.txt

@@ -6,4 +6,5 @@ add_subdirectory(fpga)
 add_subdirectory(host)
 add_subdirectory(npu)
 add_subdirectory(xpu)
+add_subdirectory(mlu)
 add_subdirectory(bm)
lite/core/CMakeLists.txt

@@ -7,7 +7,8 @@ lite_cc_library(target_wrapper SRCS target_wrapper.cc
   CUDA_DEPS target_wrapper_cuda
   CL_DEPS cl_target_wrapper
   FPGA_DEPS fpga_target_wrapper
-  BM_DEPS target_wrapper_bm)
+  BM_DEPS target_wrapper_bm
+  MLU_DEPS target_wrapper_mlu)

 lite_cc_library(memory SRCS memory.cc DEPS target_wrapper CL_DEPS cl_target_wrapper)
lite/core/arena/CMakeLists.txt

@@ -6,5 +6,5 @@ endif()
 lite_cc_library(arena_framework SRCS framework.cc DEPS program gtest)

 if((NOT LITE_WITH_OPENCL) AND (LITE_WITH_X86 OR LITE_WITH_ARM))
-    lite_cc_test(test_arena_framework SRCS framework_test.cc DEPS arena_framework ${bm_kernels} ${npu_kernels} ${xpu_kernels} ${x86_kernels} ${cuda_kernels} ${fpga_kernels} ${arm_kernels} ${lite_ops} ${host_kernels})
+    lite_cc_test(test_arena_framework SRCS framework_test.cc DEPS arena_framework ${mlu_kernels} ${bm_kernels} ${npu_kernels} ${xpu_kernels} ${x86_kernels} ${cuda_kernels} ${fpga_kernels} ${arm_kernels} ${lite_ops} ${host_kernels})
 endif()
lite/core/context.h

@@ -24,6 +24,11 @@
 #include "lite/backends/opencl/cl_context.h"
 #include "lite/backends/opencl/cl_runtime.h"
 #endif
+#ifdef LITE_WITH_MLU
+#include <cnml.h>
+#include <cnrt.h>
+#include "lite/backends/mlu/mlu_utils.h"
+#endif

 #include <map>
 #include <memory>
@@ -52,6 +57,7 @@ using XPUContext = Context<TargetType::kXPU>;
 using OpenCLContext = Context<TargetType::kOpenCL>;
 using FPGAContext = Context<TargetType::kFPGA>;
 using BMContext = Context<TargetType::kBM>;
+using MLUContext = Context<TargetType::kMLU>;

 template <>
 class Context<TargetType::kHost> {
@@ -171,6 +177,85 @@ class Context<TargetType::kFPGA> {
 };
 #endif

+#ifdef LITE_WITH_MLU
+template <>
+class Context<TargetType::kMLU> {
+ public:
+  typename Env<TargetType::kMLU>::Devs& devs = Env<TargetType::kMLU>::Global();
+
+  void InitOnce() {}
+
+  MLUContext& operator=(const MLUContext& ctx) {
+    this->Init(ctx.device_id_, ctx.exec_queue_id_, ctx.io_queue_id_);
+    return *this;
+  }
+
+  void Init(int dev_id, int exec_queue_id = 0, int io_queue_id = 0) {
+    CHECK_GT(devs.size(), 0UL)
+        << "Env is not initialized or current target is not exit!";
+    if (dev_id >= static_cast<int>(devs.size())) {
+      LOG(WARNING) << "device index exceeds the number of devices, set to "
+                      "default device(0)!";
+      device_id_ = 0;
+    } else {
+      device_id_ = dev_id;
+    }
+    SetMluDevice(device_id_);
+    if (io_queue_id >= devs[dev_id].max_queue()) {
+      LOG(WARNING) << "data queue index exceeds the maximum queue number, "
+                      "set to default qeueu(0)!";
+      io_queue_id = 0;
+    }
+    if (exec_queue_id >= devs[dev_id].max_queue()) {
+      LOG(WARNING) << "exec queue index exceeds the maximum queue number, "
+                      "set to default qeueu(0)!";
+      exec_queue_id = 0;
+    }
+    io_queue_ = devs[dev_id].io_queues()[io_queue_id];
+    exec_queue_ = devs[dev_id].exec_queues()[exec_queue_id];
+
+    exec_queue_id_ = exec_queue_id;
+    io_queue_id_ = io_queue_id;
+  }
+
+  void CopySharedTo(MLUContext* ctx) { ctx->forward_param_ = forward_param_; }
+
+  const cnrtQueue_t& exec_queue() const { return exec_queue_; }
+  void SetExecQueue(cnrtQueue_t queue) { exec_queue_ = queue; }
+
+  const cnrtQueue_t& io_queue() const { return io_queue_; }
+  void SetIoQueue(cnrtQueue_t queue) { io_queue_ = queue; }
+
+  cnmlCoreVersion_t MLUCoreVersion() {
+    return DeviceInfo::Global().MLUCoreVersion();
+  }
+
+  int MLUCoreNumber() { return DeviceInfo::Global().MLUCoreNumber(); }
+
+  u32_t affinity() { return affinity_; }
+
+  cnrtInvokeFuncParam_t forward_param() { return forward_param_; }
+
+  int device_id() { return device_id_; }
+
+  std::string name() const { return "MLUContext"; }
+
+ private:
+  int device_id_;
+  // overall information
+  int exec_queue_id_;
+  int io_queue_id_;
+  cnrtQueue_t io_queue_;
+  cnrtQueue_t exec_queue_;
+
+  std::vector<cnrtNotifier_t> input_notifiers_;
+  std::vector<cnrtNotifier_t> output_notifiers_;
+
+  cnrtInvokeFuncParam_t forward_param_;
+  u32_t affinity_ = 0x01;
+};
+#endif  // LITE_WITH_MLU
+
 #ifdef LITE_WITH_CUDA
 // Only works with CUDA kernels.
 template <>
@@ -393,6 +478,16 @@ class ContextScheduler {
         kernel_contexts_[TargetType::kBM].As<BMContext>().CopySharedTo(
             &ctx->As<BMContext>());
         break;
 #endif
+#ifdef LITE_WITH_MLU
+      case TARGET(kMLU): {
+        int dev_id = TargetWrapper<TargetType::kMLU>::GetCurDevice();
+        auto& context = ctx->As<MLUContext>();
+        context.Init(dev_id);
+        kernel_contexts_[TargetType::kMLU].As<MLUContext>().CopySharedTo(
+            &context);
+        LOG(INFO) << "New Context for MLU";
+      } break;
+#endif
       default:
 #ifndef LITE_ON_MODEL_OPTIMIZE_TOOL
@@ -434,6 +529,9 @@ class ContextScheduler {
 #endif
 #ifdef LITE_WITH_BM
     InitContext<TargetType::kBM, BMContext>();
 #endif
+#ifdef LITE_WITH_MLU
+    InitContext<TargetType::kMLU, MLUContext>();
+#endif
   }
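At dispatch time, ContextScheduler::NewContext(TARGET(kMLU)) hands each kernel an MLUContext initialized against the current device. A minimal sketch of how kernel code then reaches the CNRT queue and invoke parameters, assuming an MLU build; RunOnMlu and kernel_ctx are hypothetical stand-ins for a kernel body and the KernelContext* it receives (the real pattern appears in lite/kernels/mlu/subgraph_compute.h below):

#ifdef LITE_WITH_MLU
void RunOnMlu(paddle::lite::KernelContext* kernel_ctx) {
  auto& mlu_ctx = kernel_ctx->As<paddle::lite::MLUContext>();
  // Queues were created per device in Device<TARGET(kMLU)>::CreateQueue().
  cnrtQueue_t exec_queue = mlu_ctx.exec_queue();
  cnrtInvokeFuncParam_t forward_param = mlu_ctx.forward_param();
  u32_t affinity = mlu_ctx.affinity();
  int data_parallelism = 1;
  forward_param.data_parallelism = &data_parallelism;
  forward_param.affinity = &affinity;
  forward_param.end = CNRT_PARAM_END;
  // ... enqueue the compiled CNML graph on exec_queue with forward_param ...
}
#endif  // LITE_WITH_MLU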
lite/core/device_info.cc

@@ -58,7 +58,7 @@
 namespace paddle {
 namespace lite {

-#ifdef LITE_WITH_ARM
+#if ((defined LITE_WITH_ARM_) || (defined LITE_WITH_MLU))
 thread_local lite_api::PowerMode DeviceInfo::mode_;
 thread_local ARMArch DeviceInfo::arch_;
 thread_local int DeviceInfo::mem_size_;
@@ -66,6 +66,11 @@ thread_local std::vector<int> DeviceInfo::active_ids_;
 thread_local TensorLite DeviceInfo::workspace_;
 thread_local int64_t DeviceInfo::count_ = 0;

+#ifdef LITE_WITH_MLU
+thread_local cnmlCoreVersion_t DeviceInfo::mlu_core_version_{CNML_MLU270};
+thread_local int DeviceInfo::mlu_core_number_{1};
+#endif
+
 #ifdef TARGET_IOS
 const int DEFAULT_L1_CACHE_SIZE = 64 * 1024;
 const int DEFAULT_L2_CACHE_SIZE = 2048 * 1024;
@@ -1080,6 +1085,28 @@ int DeviceInfo::Setup() {
   return 0;
 }

+#ifdef LITE_WITH_MLU
+void DeviceInfo::SetMLURunMode(lite_api::MLUCoreVersion core_version,
+                               int core_number) {
+  switch (core_version) {
+    case (lite_api::MLUCoreVersion::MLU_220):
+      mlu_core_version_ = CNML_MLU220;
+      break;
+    case (lite_api::MLUCoreVersion::MLU_270):
+      mlu_core_version_ = CNML_MLU270;
+      break;
+    default:
+      mlu_core_version_ = CNML_MLU270;
+      break;
+  }
+  mlu_core_number_ = core_number;
+}
+
+cnmlCoreVersion_t DeviceInfo::MLUCoreVersion() { return mlu_core_version_; }
+
+int DeviceInfo::MLUCoreNumber() { return mlu_core_number_; }
+#endif  // LITE_WITH_MLU
+
 void DeviceInfo::SetRunMode(lite_api::PowerMode mode, int thread_num) {
 #ifdef ARM_WITH_OMP
   thread_num = std::min(thread_num, core_num_);
@@ -1159,6 +1186,39 @@ bool DeviceInfo::ExtendWorkspace(size_t size) {

 #endif  // LITE_WITH_ARM

+#ifdef LITE_WITH_MLU
+void SetMluDevice(int device_id) {
+  LOG(INFO) << "Set mlu device " << device_id;
+  cnrtDev_t dev_handle;
+  CNRT_CALL(cnrtGetDeviceHandle(&dev_handle, device_id));
+  CNRT_CALL(cnrtSetCurrentDevice(dev_handle));
+}
+
+void Device<TARGET(kMLU)>::Init() {
+  SetMluDevice(idx_);
+  GetInfo();
+  CreateQueue();
+}
+
+void Device<TARGET(kMLU)>::GetInfo() {}
+
+void Device<TARGET(kMLU)>::CreateQueue() {
+  exec_queue_.clear();
+  io_queue_.clear();
+  for (size_t i = 0; i < max_queue_; ++i) {
+    cnrtQueue_t exec_queue;
+    cnrtQueue_t io_queue;
+    cnrtCreateQueue(&exec_queue);
+    cnrtCreateQueue(&io_queue);
+    exec_queue_.push_back(exec_queue);
+    io_queue_.push_back(io_queue);
+
+    cnrtCreateQueue(&exec_queue);
+    exec_queue_.push_back(exec_queue);
+  }
+}
+#endif  // LITE_WITH_MLU
+
 #ifdef LITE_WITH_CUDA
 void Device<TARGET(kCUDA)>::Init() {
lite/core/device_info.h

@@ -19,11 +19,14 @@
 #include <vector>
 #include "lite/core/tensor.h"
 #include "lite/utils/cp_logging.h"
+#ifdef LITE_WITH_MLU
+#include "lite/backends/mlu/mlu_utils.h"
+#endif

 namespace paddle {
 namespace lite {

-#ifdef LITE_WITH_ARM
+#if ((defined LITE_WITH_ARM) || (defined LITE_WITH_MLU))

 typedef enum {
   kAPPLE = 0,
@@ -52,6 +55,11 @@ class DeviceInfo {
   int Setup();

   void SetRunMode(lite_api::PowerMode mode, int thread_num);
+#ifdef LITE_WITH_MLU
+  void SetMLURunMode(lite_api::MLUCoreVersion core_version, int core_number);
+  cnmlCoreVersion_t MLUCoreVersion();
+  int MLUCoreNumber();
+#endif
   void SetCache(int l1size, int l2size, int l3size);
   void SetArch(ARMArch arch) { arch_ = arch; }
@@ -103,6 +111,11 @@ class DeviceInfo {
   static thread_local TensorLite workspace_;
   static thread_local int64_t count_;

+#ifdef LITE_WITH_MLU
+  static thread_local cnmlCoreVersion_t mlu_core_version_;
+  static thread_local int mlu_core_number_;
+#endif
+
   void SetDotInfo(int argc, ...);
   void SetFP16Info(int argc, ...);
   void SetFP32Info(int argc, ...);
@@ -134,6 +147,9 @@ class Env {
     return *devs;
   }
   static void Init(int max_stream = 4) {
+#ifdef LITE_WITH_MLU
+    CNRT_CALL(cnrtInit(0));
+#endif
     Devs& devs = Global();
     if (devs.size() > 0) {
       return;
@@ -156,6 +172,41 @@
   }
 };

+#ifdef LITE_WITH_MLU
+void SetMluDevice(int device_id);
+
+template <>
+class Device<TARGET(kMLU)> {
+ public:
+  Device(int dev_id, int max_queue = 1) : idx_(dev_id), max_queue_(max_queue) {}
+  void Init();
+
+  int id() { return idx_; }
+  int max_queue() { return max_queue_; }
+  void SetId(int idx) { idx_ = idx; }
+  std::string name() { return "MLU"; }
+  int core_num() { return 16; }
+  float max_memory() { return 16 * 1024; }
+  std::vector<cnrtQueue_t> io_queues() { return io_queue_; }
+  std::vector<cnrtQueue_t> exec_queues() { return exec_queue_; }
+
+ private:
+  void CreateQueue();
+  void GetInfo();
+
+ private:
+  int idx_{0};
+  int max_queue_;
+  std::string device_name_;
+  float max_memory_;
+
+  std::vector<cnrtQueue_t> io_queue_;
+  std::vector<cnrtQueue_t> exec_queue_;
+};
+
+template class Env<TARGET(kMLU)>;
+#endif  // LITE_WITH_MLU
+
 #ifdef LITE_WITH_CUDA
 template <>
 class Device<TARGET(kCUDA)> {
lite/core/kernel.h

@@ -83,6 +83,9 @@ class KernelBase {
 #if defined(LITE_WITH_CUDA)
     WorkSpace::Global_CUDA().AllocReset();
 #endif
+#if defined(LITE_WITH_MLU)
+    WorkSpace::Global_MLU().AllocReset();
+#endif
 #ifdef LITE_WITH_PROFILE
     profiler_->StopTiming(profile::Type::kCreate, profile_id_, ctx_.get());
     profiler_->StartTiming(profile::Type::kDispatch, profile_id_, ctx_.get());
lite/core/memory.cc

@@ -45,6 +45,11 @@ void* TargetMalloc(TargetType target, size_t size) {
       data = TargetWrapper<TARGET(kBM)>::Malloc(size);
       break;
 #endif
+#ifdef LITE_WITH_MLU
+    case TargetType::kMLU:
+      data = TargetWrapper<TARGET(kMLU)>::Malloc(size);
+      break;
+#endif  // LITE_WITH_MLU
     default:
       LOG(FATAL) << "Unknown supported target " << TargetToStr(target);
   }
@@ -79,6 +84,11 @@ void TargetFree(TargetType target, void* data) {
       TargetWrapper<TARGET(kBM)>::Free(data);
       break;
 #endif
+#ifdef LITE_WITH_MLU
+    case TargetType::kMLU:
+      TargetWrapper<TARGET(kMLU)>::Free(data);
+      break;
+#endif  // LITE_WITH_MLU
     default:
       LOG(FATAL) << "Unknown type";
   }
@@ -110,6 +120,12 @@ void TargetCopy(TargetType target, void* dst, const void* src, size_t size) {
       TargetWrapper<TARGET(kBM)>::MemcpySync(dst, src, size, IoDirection::DtoD);
       break;
 #endif
+#ifdef LITE_WITH_MLU
+    case TargetType::kMLU:
+      TargetWrapper<TARGET(kMLU)>::MemcpySync(
+          dst, src, size, IoDirection::HtoD);
+      break;
+#endif
 #ifdef LITE_WITH_OPENCL
     case TargetType::kOpenCL:
       TargetWrapperCL::MemcpySync(dst, src, size, IoDirection::DtoD);
lite/core/memory.h

@@ -30,6 +30,10 @@
 #include "lite/backends/bm/target_wrapper.h"
 #endif  // LITE_WITH_BM

+#ifdef LITE_WITH_MLU
+#include "lite/backends/mlu/target_wrapper.h"
+#endif  // LITE_WITH_MLU
+
 namespace paddle {
 namespace lite {
@@ -81,6 +85,11 @@ void CopySync(void* dst, const void* src, size_t size, IoDirection dir) {
     case TARGET(kBM):
       TargetWrapper<TARGET(kBM)>::MemcpySync(dst, src, size, dir);
       break;
 #endif
+#ifdef LITE_WITH_MLU
+    case TARGET(kMLU):
+      TargetWrapperMlu::MemcpySync(dst, src, size, dir);
+      break;
+#endif
     default:
       LOG(FATAL)
lite/core/mir/CMakeLists.txt

@@ -35,6 +35,8 @@ lite_cc_library(mir_passes
     demo_pass.cc
     runtime_context_assign_pass.cc
     memory_optimize_pass.cc
+    mlu_postprocess_pass.cc
+    subgraph_cast_display_pass.cc
     weight_quantization_preprocess_pass.cc
     quantized_op_attributes_inference_pass.cc
   DEPS mir_pass types context ${mir_fusers} ${mir_subgraphs})
lite/core/mir/ssa_graph.cc

@@ -64,6 +64,26 @@ std::map<mir::Node *, std::set<mir::Node *>> SSAGraph::BuildOperationAdjList() {
   return adj_list;
 }

+std::map<mir::Node *, std::set<mir::Node *>> SSAGraph::BuildNodeAdjList() {
+  std::map<mir::Node *, std::set<mir::Node *>> adj_list;
+
+  for (auto &n : mutable_nodes()) {
+    if (adj_list.find(&n) == adj_list.end()) {
+      adj_list[&n] = std::set<mir::Node *>();
+    }
+    std::vector<mir::Node *> nodes;
+    for (auto &var : n.inlinks) {
+      nodes.push_back(var);
+    }
+    std::sort(nodes.begin(),
+              nodes.end(),
+              [](mir::Node *node1, mir::Node *node2) { return node1 > node2; });
+    adj_list[&n].insert(std::make_move_iterator(nodes.begin()),
+                        std::make_move_iterator(nodes.end()));
+  }
+  return adj_list;
+}
+
 void SSAGraph::SortHelper(
     const std::map<mir::Node *, std::set<mir::Node *>> &adj_list,
     mir::Node *node,
@@ -98,6 +118,24 @@ std::vector<mir::Node *> SSAGraph::StmtTopologicalOrder() {
   return res;
 }

+std::vector<mir::Node *> SSAGraph::NodeTopologicalOrder() {
+  CheckBidirectionalConnection();
+
+  std::stack<mir::Node *> stack;
+  std::set<mir::Node *> visited;
+  std::vector<mir::Node *> res;
+
+  auto adj_list = BuildNodeAdjList();
+
+  for (auto adj : adj_list) {
+    if (visited.find(adj.first) == visited.end()) {
+      SortHelper(adj_list, adj.first, &visited, &res);
+    }
+  }
+
+  return res;
+}
+
 Node *SSAGraph::GraphCreateInstructNode(
     const std::shared_ptr<OpLite> &op, const std::vector<Place> &valid_places) {
   node_storage_.emplace_back();
lite/core/mir/ssa_graph.h

@@ -42,6 +42,8 @@ class SSAGraph : GraphBase {

   std::vector<mir::Node *> StmtTopologicalOrder();

+  std::vector<mir::Node *> NodeTopologicalOrder();
+
   // The inputs of the graph.
   std::vector<mir::Node *> inputs();
@@ -86,6 +88,9 @@ class SSAGraph : GraphBase {
   // Build operator inlink edge table.
   std::map<mir::Node *, std::set<mir::Node *>> BuildOperationAdjList();

+  // Build node inlink edge table.
+  std::map<mir::Node *, std::set<mir::Node *>> BuildNodeAdjList();
+
   void SortHelper(const std::map<mir::Node *, std::set<mir::Node *>> &adj_list,
                   mir::Node *node,
                   std::set<mir::Node *> *visited,
lite/core/mir/subgraph/subgraph_detector.cc

@@ -313,8 +313,9 @@ void SubgraphDetector::InitNodes(node_map_t *nodes) {
 std::vector<std::vector<Node *>> SubgraphDetector::ExtractSubgraphs(
     node_map_t *nodes) {
-  for (auto &it : *nodes) {
-    node_dat_t *node = it.second;
+  for (auto &n_tpo : graph_->NodeTopologicalOrder()) {
+    CHECK(nodes->find(n_tpo) != nodes->end());
+    node_dat_t *node = (*nodes)[n_tpo];
     if (!node->marked) {
       continue;
     }
lite/core/mir/subgraph/subgraph_pass.cc

@@ -67,6 +67,20 @@ void BMSubgraphPass::Apply(const std::unique_ptr<SSAGraph>& graph) {
   fuser();
 }

+void MLUSubgraphPass::Apply(const std::unique_ptr<SSAGraph>& graph) {
+  std::unordered_set<std::string> supported_lists;
+#define USE_SUBGRAPH_BRIDGE(op_type, target) supported_lists.insert(#op_type);
+#include "lite/kernels/mlu/bridges/paddle_use_bridges.h"
+#undef USE_SUBGRAPH_BRIDGE
+  auto teller = [&](Node* node) {
+    if (!node->IsStmt()) return false;
+    auto& stmt = node->AsStmt();
+    return supported_lists.count(stmt.op_type()) != 0;
+  };
+  SubgraphFuser fuser(graph.get(), teller, 1 /* min_subgraph_size */);
+  fuser();
+}
+
 }  // namespace mir
 }  // namespace lite
 }  // namespace paddle
@@ -77,3 +91,5 @@ REGISTER_MIR_PASS(xpu_subgraph_pass, paddle::lite::mir::XPUSubgraphPass)
     .BindTargets({TARGET(kXPU)});
 REGISTER_MIR_PASS(bm_subgraph_pass, paddle::lite::mir::BMSubgraphPass)
     .BindTargets({TARGET(kBM)});
+REGISTER_MIR_PASS(mlu_subgraph_pass, paddle::lite::mir::MLUSubgraphPass)
+    .BindTargets({TARGET(kMLU)});
lite/core/mir/subgraph/subgraph_pass.h

@@ -37,6 +37,11 @@ class BMSubgraphPass : public ProgramPass {
   void Apply(const std::unique_ptr<SSAGraph>& graph) override;
 };

+class MLUSubgraphPass : public ProgramPass {
+ public:
+  void Apply(const std::unique_ptr<SSAGraph>& graph) override;
+};
+
 }  // namespace mir
 }  // namespace lite
 }  // namespace paddle
lite/core/mir/subgraph_cast_display_pass.cc

@@ -22,29 +22,15 @@ namespace mir {
 class SubgraphCastDisplayPass : public DebugPass {
  public:
   void Apply(const std::unique_ptr<SSAGraph>& graph) override {
-    VLOG(3) << "== Argument types ==";
-    for (auto& node : graph->mutable_nodes()) {
-      if (!node.IsArg()) continue;
-
-      auto* type = node.AsArg().type;
-      if (type) {
-        VLOG(3) << "* ARG " << node.AsArg().name << " type: " << *type;
-      } else {
-        VLOG(3) << "* ARG " << node.AsArg().name << " type: UNK";
-      }
-    }
-    VLOG(3) << "---------------------";
-
-    VLOG(0) << "== SubgraphOp Debug Info ==";
+    VLOG(4) << "== SubgraphOp Debug Info ==";
     for (auto& node : graph->mutable_nodes()) {
       if (node.IsStmt() && node.AsStmt().op_type() == "subgraph") {
-        VLOG(0) << "FOUND SUBGRAPH OP";
+        VLOG(4) << "FOUND SUBGRAPH OP";
         display_debug_info(node, "subgraph");
         break;
       }
     }
-    VLOG(0) << "---------------------";
+    VLOG(4) << "---------------------";
   }

   void display_debug_info(const Node& node,
@@ -52,17 +38,17 @@ class SubgraphCastDisplayPass : public DebugPass {
                           bool display_in_nodes = true,
                           bool display_out_nodes = true) {
     CHECK(node.IsStmt());
-    VLOG(0) << node.AsStmt();
+    // VLOG(4) << node.AsStmt();
     if (display_in_nodes) {
       for (auto p_in_arg_node : node.inlinks) {
         CHECK(p_in_arg_node->IsArg());
-        VLOG(0) << "* ARG[IN] " << p_in_arg_node->AsArg().name
+        VLOG(4) << "* ARG[IN] " << p_in_arg_node->AsArg().name
                 << " type: " << *p_in_arg_node->AsArg().type
                 << " is_weight: " << p_in_arg_node->AsArg().is_weight
                 << " is_persist: " << p_in_arg_node->AsArg().is_persist
                 << " input_count: " << p_in_arg_node->inlinks.size();
         if (p_in_arg_node->inlinks.size() == 0) {
-          VLOG(0) << "** END with No Op";
+          VLOG(4) << "** END with No Op";
         }
         for (auto p_in_stmt_node : p_in_arg_node->inlinks) {
           CHECK(p_in_stmt_node->IsStmt());
@@ -71,7 +57,7 @@ class SubgraphCastDisplayPass : public DebugPass {
               stmt_op_type == "io_copy") {
             display_debug_info(*p_in_stmt_node, stmt_op_type, true, false);
           } else {
-            VLOG(0) << "** END with op type: " << stmt_op_type;
+            VLOG(4) << "** END with op type: " << stmt_op_type;
           }
         }
       }
@@ -79,13 +65,13 @@ class SubgraphCastDisplayPass : public DebugPass {
     if (display_out_nodes) {
       for (auto p_out_arg_node : node.outlinks) {
         CHECK(p_out_arg_node->IsArg());
-        VLOG(0) << "* ARG[OUT] " << p_out_arg_node->AsArg().name
+        VLOG(4) << "* ARG[OUT] " << p_out_arg_node->AsArg().name
                 << " type: " << *p_out_arg_node->AsArg().type
                 << " is_weight: " << p_out_arg_node->AsArg().is_weight
                 << " is_persist: " << p_out_arg_node->AsArg().is_persist
                 << " output_count: " << p_out_arg_node->outlinks.size();
         if (p_out_arg_node->outlinks.size() == 0) {
-          VLOG(0) << "** END with No Op";
+          VLOG(4) << "** END with No Op";
         }
         for (auto p_out_stmt_node : p_out_arg_node->outlinks) {
           CHECK(p_out_stmt_node->IsStmt());
@@ -94,7 +80,7 @@ class SubgraphCastDisplayPass : public DebugPass {
               stmt_op_type == "io_copy") {
             display_debug_info(*p_out_stmt_node, stmt_op_type, false, true);
           } else {
-            VLOG(0) << "** END with op type: " << stmt_op_type;
+            VLOG(4) << "** END with op type: " << stmt_op_type;
           }
         }
       }
@@ -108,4 +94,4 @@ class SubgraphCastDisplayPass : public DebugPass {
 REGISTER_MIR_PASS(subgraph_cast_display_pass,
                   paddle::lite::mir::SubgraphCastDisplayPass)
-    .BindTargets({TARGET(kAny)});
+    .BindTargets({TARGET(kMLU)});
lite/core/op_registry.cc

@@ -107,6 +107,9 @@ std::list<std::unique_ptr<KernelBase>> KernelRegistry::Create(
     case TARGET(kBM): {
       CREATE_KERNEL(kBM);
     } break;
+    case TARGET(kMLU): {
+      CREATE_KERNEL(kMLU);
+    } break;
     default:
       CHECK(false) << "not supported kernel target " << TargetToStr(target);
   }
@@ -139,6 +142,15 @@ KernelRegistry::KernelRegistry()
   INIT_FOR(kCUDA, kInt64, kNCHW);
   INIT_FOR(kCUDA, kInt64, kNHWC);

+  INIT_FOR(kMLU, kFloat, kNHWC);
+  INIT_FOR(kMLU, kFloat, kNCHW);
+  INIT_FOR(kMLU, kFP16, kNHWC);
+  INIT_FOR(kMLU, kFP16, kNCHW);
+  INIT_FOR(kMLU, kInt8, kNHWC);
+  INIT_FOR(kMLU, kInt8, kNCHW);
+  INIT_FOR(kMLU, kInt16, kNHWC);
+  INIT_FOR(kMLU, kInt16, kNCHW);
+
   INIT_FOR(kHost, kFloat, kNCHW);
   INIT_FOR(kHost, kAny, kNCHW);
   INIT_FOR(kHost, kFloat, kNHWC);
lite/core/op_registry.h

@@ -268,7 +268,32 @@ class KernelRegistry final {
                                 DATALAYOUT(kAny)> *,  //
         KernelRegistryForTarget<TARGET(kFPGA),
                                 PRECISION(kAny),
-                                DATALAYOUT(kAny)> *  //
+                                DATALAYOUT(kAny)> *,  //
+        KernelRegistryForTarget<TARGET(kMLU),
+                                PRECISION(kFloat),
+                                DATALAYOUT(kNHWC)> *,  //
+        KernelRegistryForTarget<TARGET(kMLU),
+                                PRECISION(kFloat),
+                                DATALAYOUT(kNCHW)> *,  //
+        KernelRegistryForTarget<TARGET(kMLU),
+                                PRECISION(kFP16),
+                                DATALAYOUT(kNHWC)> *,  //
+        KernelRegistryForTarget<TARGET(kMLU),
+                                PRECISION(kFP16),
+                                DATALAYOUT(kNCHW)> *,  //
+        KernelRegistryForTarget<TARGET(kMLU),
+                                PRECISION(kInt8),
+                                DATALAYOUT(kNHWC)> *,  //
+        KernelRegistryForTarget<TARGET(kMLU),
+                                PRECISION(kInt8),
+                                DATALAYOUT(kNCHW)> *,  //
+        KernelRegistryForTarget<TARGET(kMLU),
+                                PRECISION(kInt16),
+                                DATALAYOUT(kNHWC)> *,  //
+        KernelRegistryForTarget<TARGET(kMLU),
+                                PRECISION(kInt16),
+                                DATALAYOUT(kNCHW)> *  //
         >;

   KernelRegistry();
lite/core/optimizer.h

@@ -115,9 +115,15 @@ class Optimizer {
            "variable_place_inference_pass",  //
            "argument_type_display_pass",
+
+           "mlu_subgraph_pass",
+           "mlu_postprocess_pass",
+           // subgraph_cast_display_pass
+
            "runtime_context_assign_pass",
            "argument_type_display_pass",
+
            "memory_optimize_pass"}};

   if (passes.size() == 1) {
     passes_local.push_back(passes[0]);
   }
lite/core/workspace.h

@@ -69,6 +69,13 @@ class WorkSpace {
   }
 #endif

+#if defined(LITE_WITH_MLU)
+  static WorkSpace& Global_MLU() {
+    thread_local std::unique_ptr<WorkSpace> x(new WorkSpace(TARGET(kMLU)));
+    return *x;
+  }
+#endif
+
  private:
   explicit WorkSpace(TargetType x) : target_(x) {}
lite/kernels/CMakeLists.txt

@@ -10,4 +10,5 @@ add_subdirectory(opencl)
 add_subdirectory(fpga)
 add_subdirectory(npu)
 add_subdirectory(xpu)
+add_subdirectory(mlu)
 add_subdirectory(bm)
lite/kernels/mlu/bridges/CMakeLists.txt

@@ -29,13 +29,13 @@ set(mlu_subgraph_bridges
         CACHE INTERNAL "mlu_subgraph_bridges")

-# lite_cc_library(subgraph_test_helper_mlu SRCS test_helper.cc DEPS ${mlu_subgraph_bridges})
-# lite_cc_test(test_conv_converter_mlu SRCS conv_op_test.cc DEPS scope optimizer target_wrapper_host model_parser program ${mlu_subgraph_bridges} subgraph_compute_mlu subgraph_test_helper_mlu)
-# lite_cc_test(test_act_converter_mlu SRCS act_op_test.cc DEPS scope optimizer target_wrapper_host model_parser program ${mlu_subgraph_bridges} subgraph_compute_mlu subgraph_test_helper_mlu)
-# lite_cc_test(test_batch_norm_converter_mlu SRCS batch_norm_op_test.cc DEPS scope optimizer target_wrapper_host model_parser program ${mlu_subgraph_bridges} subgraph_compute_mlu subgraph_test_helper_mlu)
-# lite_cc_test(test_elementwise_converter_mlu SRCS elementwise_ops_test.cc DEPS scope optimizer target_wrapper_host model_parser program ${mlu_subgraph_bridges} subgraph_compute_mlu subgraph_test_helper_mlu)
-# lite_cc_test(test_pool_converter_mlu SRCS pool_op_test.cc DEPS scope optimizer target_wrapper_host model_parser program ${mlu_subgraph_bridges} subgraph_compute_mlu subgraph_test_helper_mlu)
-# lite_cc_test(test_softmax_converter_mlu SRCS softmax_op_test.cc DEPS scope optimizer target_wrapper_host model_parser program ${mlu_subgraph_bridges} subgraph_compute_mlu subgraph_test_helper_mlu)
-# lite_cc_test(test_fc_converter_mlu SRCS fc_op_test.cc DEPS scope optimizer target_wrapper_host model_parser program ${mlu_subgraph_bridges} subgraph_compute_mlu subgraph_test_helper_mlu)
+lite_cc_library(subgraph_test_helper_mlu SRCS test_helper.cc DEPS ${mlu_subgraph_bridges})
+lite_cc_test(test_conv_converter_mlu SRCS conv_op_test.cc DEPS scope optimizer target_wrapper_host model_parser program ${mlu_subgraph_bridges} subgraph_compute_mlu subgraph_test_helper_mlu)
+lite_cc_test(test_act_converter_mlu SRCS act_op_test.cc DEPS scope optimizer target_wrapper_host model_parser program ${mlu_subgraph_bridges} subgraph_compute_mlu subgraph_test_helper_mlu)
+lite_cc_test(test_batch_norm_converter_mlu SRCS batch_norm_op_test.cc DEPS scope optimizer target_wrapper_host model_parser program ${mlu_subgraph_bridges} subgraph_compute_mlu subgraph_test_helper_mlu)
+lite_cc_test(test_elementwise_converter_mlu SRCS elementwise_ops_test.cc DEPS scope optimizer target_wrapper_host model_parser program ${mlu_subgraph_bridges} subgraph_compute_mlu subgraph_test_helper_mlu)
+lite_cc_test(test_pool_converter_mlu SRCS pool_op_test.cc DEPS scope optimizer target_wrapper_host model_parser program ${mlu_subgraph_bridges} subgraph_compute_mlu subgraph_test_helper_mlu)
+lite_cc_test(test_softmax_converter_mlu SRCS softmax_op_test.cc DEPS scope optimizer target_wrapper_host model_parser program ${mlu_subgraph_bridges} subgraph_compute_mlu subgraph_test_helper_mlu)
+lite_cc_test(test_fc_converter_mlu SRCS fc_op_test.cc DEPS scope optimizer target_wrapper_host model_parser program ${mlu_subgraph_bridges} subgraph_compute_mlu subgraph_test_helper_mlu)
 message(STATUS "+++++ mlu_subgraph_bridges: ${mlu_subgraph_bridges}")
lite/kernels/mlu/bridges/act_op.cc

@@ -54,4 +54,8 @@ int ActConverter(void* ctx, OpLite* op, KernelBase* kernel) {
 }  // namespace lite
 }  // namespace paddle

+REGISTER_SUBGRAPH_BRIDGE(sigmoid,
+                         kMLU,
+                         paddle::lite::subgraph::mlu::ActConverter);
 REGISTER_SUBGRAPH_BRIDGE(relu, kMLU, paddle::lite::subgraph::mlu::ActConverter);
+REGISTER_SUBGRAPH_BRIDGE(tanh, kMLU, paddle::lite::subgraph::mlu::ActConverter);
lite/kernels/mlu/bridges/act_op_test.cc

@@ -25,8 +25,6 @@ namespace lite {
 namespace subgraph {
 namespace mlu {

-int ActConverter(void* ctx, OpLite* op);
-
 template void FillTensor<float, int>(Tensor* x,
                                      float lower = -2,
                                      float upper = -2);
@@ -149,8 +147,6 @@ TEST(MLUBridges, activation) {
 }  // namespace lite
 }  // namespace paddle

-REGISTER_SUBGRAPH_BRIDGE(MLU, relu, paddle::lite::subgraph::mlu::ActConverter);
-REGISTER_SUBGRAPH_BRIDGE(
-    MLU, sigmoid, paddle::lite::subgraph::mlu::ActConverter);
-REGISTER_SUBGRAPH_BRIDGE(
-    MLU, tanh, paddle::lite::subgraph::mlu::ActConverter);
+USE_SUBGRAPH_BRIDGE(sigmoid, kMLU)
+USE_SUBGRAPH_BRIDGE(relu, kMLU)
+USE_SUBGRAPH_BRIDGE(tanh, kMLU)
lite/kernels/mlu/bridges/batch_norm_op_test.cc

@@ -23,8 +23,6 @@ namespace lite {
 namespace subgraph {
 namespace mlu {

-int BatchNormConverter(void* ctx, OpLite* op);
-
 template <typename dtype>
 void batch_norm_ref(const std::shared_ptr<operators::BatchNormOp> op) {
   Scope* scope = op->scope();
@@ -181,6 +179,4 @@ TEST(MLUBridges, batch_norm) {
 }  // namespace lite
 }  // namespace paddle

-REGISTER_SUBGRAPH_BRIDGE(
-    MLU, batch_norm, paddle::lite::subgraph::mlu::BatchNormConverter);
+USE_SUBGRAPH_BRIDGE(batch_norm, kMLU)
lite/kernels/mlu/bridges/conv_op_test.cc

@@ -25,8 +25,6 @@ namespace lite {
 namespace subgraph {
 namespace mlu {

-int ConvConverter(void* ctx, OpLite* op);
-
 void conv_ref(const std::shared_ptr<operators::ConvOpLite> op) {
   Scope* scope = op->scope();
   const OpInfo* op_info = op->op_info();
@@ -342,9 +340,5 @@ TEST(MLUBridges, conv) {
 }  // namespace lite
 }  // namespace paddle

-REGISTER_SUBGRAPH_BRIDGE(MLU,
-                         conv2d,
-                         paddle::lite::subgraph::mlu::ConvConverter);
-REGISTER_SUBGRAPH_BRIDGE(MLU,
-                         depthwise_conv2d,
-                         paddle::lite::subgraph::mlu::ConvConverter);
+USE_SUBGRAPH_BRIDGE(conv2d, kMLU)
+USE_SUBGRAPH_BRIDGE(depthwise_conv2d, kMLU)
lite/kernels/mlu/bridges/elementwise_ops_test.cc

@@ -24,8 +24,6 @@ namespace lite {
 namespace subgraph {
 namespace mlu {

-int ElementwiseConverter(void* ctx, OpLite* op);
-
 template <typename dtype>
 void elementwise_add_ref(const std::shared_ptr<operators::ElementwiseOp> op) {
   Scope* scope = op->scope();
@@ -184,15 +182,7 @@ TEST(MLUBridges, elementwise_add) {
 }  // namespace lite
 }  // namespace paddle

-REGISTER_SUBGRAPH_BRIDGE(MLU,
-                         elementwise_add,
-                         paddle::lite::subgraph::mlu::ElementwiseConverter);
-REGISTER_SUBGRAPH_BRIDGE(MLU,
-                         elementwise_sub,
-                         paddle::lite::subgraph::mlu::ElementwiseConverter);
-REGISTER_SUBGRAPH_BRIDGE(MLU,
-                         elementwise_mul,
-                         paddle::lite::subgraph::mlu::ElementwiseConverter);
-REGISTER_SUBGRAPH_BRIDGE(MLU,
-                         elementwise_div,
-                         paddle::lite::subgraph::mlu::ElementwiseConverter);
+USE_SUBGRAPH_BRIDGE(elementwise_add, kMLU)
+USE_SUBGRAPH_BRIDGE(elementwise_sub, kMLU)
+USE_SUBGRAPH_BRIDGE(elementwise_mul, kMLU)
+USE_SUBGRAPH_BRIDGE(elementwise_div, kMLU)
lite/kernels/mlu/bridges/fc_op_test.cc

@@ -24,8 +24,6 @@ namespace lite {
 namespace subgraph {
 namespace mlu {

-int FCConverter(void* ctx, OpLite* op);
-
 void fc_ref(const std::shared_ptr<operators::FcOpLite> op) {
   Scope* scope = op->scope();
   const OpInfo* op_info = op->op_info();
@@ -170,4 +168,4 @@ TEST(MLUBridges, fc) {
 }  // namespace lite
 }  // namespace paddle

-REGISTER_SUBGRAPH_BRIDGE(MLU, fc, paddle::lite::subgraph::mlu::FCConverter);
+USE_SUBGRAPH_BRIDGE(fc, kMLU);
lite/kernels/mlu/bridges/pool_op_test.cc

@@ -24,8 +24,6 @@ namespace lite {
 namespace subgraph {
 namespace mlu {

-int PoolConverter(void* ctx, OpLite* op);
-
 void pool_ref(const std::shared_ptr<operators::PoolOpLite> op) {
   Scope* scope = op->scope();
   const OpInfo* op_info = op->op_info();
@@ -275,6 +273,4 @@ TEST(MLUBridges, pool) {
 }  // namespace lite
 }  // namespace paddle

-REGISTER_SUBGRAPH_BRIDGE(
-    MLU, pool2d, paddle::lite::subgraph::mlu::PoolConverter);
+USE_SUBGRAPH_BRIDGE(pool2d, kMLU)
lite/kernels/mlu/bridges/softmax_op_test.cc

@@ -23,8 +23,6 @@ namespace lite {
 namespace subgraph {
 namespace mlu {

-int SoftmaxConverter(void* ctx, OpLite* op);
-
 template <typename dtype>
 void softmax_ref(const std::shared_ptr<operators::SoftmaxOp> op) {
   Scope* scope = op->scope();
@@ -171,6 +169,4 @@ TEST(MLUBridges, softmax) {
 }  // namespace lite
 }  // namespace paddle

-REGISTER_SUBGRAPH_BRIDGE(
-    MLU, softmax, paddle::lite::subgraph::mlu::SoftmaxConverter);
+USE_SUBGRAPH_BRIDGE(softmax, kMLU)
lite/kernels/mlu/bridges/test_helper.cc

@@ -28,7 +28,7 @@ void LaunchOp(const std::shared_ptr<lite::OpLite> op,
               const std::vector<std::string>& input_var_names,
               const std::vector<std::string>& output_var_names) {
   CNRT_CALL(cnrtInit(0));
-  SetMluDevice(0);
+  ::paddle::lite::SetMluDevice(0);
   cnrtQueue_t queue_;
   cnrtInvokeFuncParam_t forward_param;
   u32_t affinity = 1;
lite/kernels/mlu/io_copy_compute.cc

@@ -133,22 +133,3 @@ REGISTER_LITE_KERNEL(
     .BindInput("Input", {LiteType::GetTensorTy(TARGET(kMLU))})
     .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kHost))})
     .Finalize();
-
-// kMLU,
-// kFloat,
-// kNHWC,
-// paddle::lite::kernels::mlu::IoCopyHostToMluCompute,
-// host_to_device)
-//     .BindInput("Input", {LiteType::GetTensorTy(TARGET(kHost))})
-//     .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kMLU))})
-//     .Finalize();
-//
-//
-// kMLU,
-// kFloat,
-// kNHWC,
-// paddle::lite::kernels::mlu::IoCopyMluToHostCompute,
-// device_to_host)
-//     .BindInput("Input", {LiteType::GetTensorTy(TARGET(kMLU))})
-//     .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kHost))})
-//     .Finalize();
lite/kernels/mlu/subgraph_compute.h

@@ -46,6 +46,32 @@ class SubgraphEngine : public subgraph::Engine {
     graph_.SetFPType(type);
   }

+  int Build() {
+    // In order to attach all of the ops of the block desc, we need to build
+    // the original program firstly.
+    BuildOriginProgram();
+    // Run InferShape() of all of ops, and convert Paddle ops to MLU IR graph
+    build_device_program_status_ = BuildDeviceProgram();
+    return build_device_program_status_;
+  }
+
+  int Launch() {
+    // Rebuild device program when the shapes of input tensors have been
+    // changed.
+    if (subgraph::CHECK_SUCCESS(build_device_program_status_) &&
+        subgraph::CHECK_REBUILD_WHEN_SHAPE_CHANGED(
+            build_device_program_status_) &&
+        InputShapeChanged()) {
+      Build();
+    }
+    if (subgraph::CHECK_FAILED(build_device_program_status_)) {
+      LaunchOriginProgram();
+    } else {
+      LaunchDeviceProgram();
+    }
+    return 0;
+  }
+
  protected:
   int BuildDeviceProgram() override {
     int status = 0;
@@ -108,23 +134,23 @@ class SubgraphEngine : public subgraph::Engine {
       graph_.AddInput(graph_.GetNode(input_name));
     }
     CHECK(!valid_output_names.empty()) << "[MLU] no valid output names";
-    // auto& mlu_context = this->ctx_->template As<MLUContext>();
-    // auto core_version = mlu_context.MLUCoreVersion();
-    // auto core_number = mlu_context.MLUCoreNumber();
-    // graph_.Compile(core_version, core_number);
+    auto& mlu_context = this->ctx_->template As<MLUContext>();
+    auto core_version = mlu_context.MLUCoreVersion();
+    auto core_number = mlu_context.MLUCoreNumber();
+    graph_.Compile(core_version, core_number);
     return status;
   }

   int LaunchDeviceProgram() override {
-    // auto& mlu_context = this->ctx_->template As<MLUContext>();
-    // auto exec_queue = mlu_context.exec_queue();
-    // u32_t affinity = mlu_context.affinity();
-    // cnrtInvokeFuncParam_t forward_param = mlu_context.forward_param();
-    // int data_param = 1;
-    // forward_param.data_parallelism = &data_param;
-    // forward_param.affinity = &affinity;
-    // forward_param.end = CNRT_PARAM_END;
-    // graph_.Compute(forward_param, exec_queue);
+    auto& mlu_context = this->ctx_->template As<MLUContext>();
+    auto exec_queue = mlu_context.exec_queue();
+    u32_t affinity = mlu_context.affinity();
+    cnrtInvokeFuncParam_t forward_param = mlu_context.forward_param();
+    int data_param = 1;
+    forward_param.data_parallelism = &data_param;
+    forward_param.affinity = &affinity;
+    forward_param.end = CNRT_PARAM_END;
+    graph_.Compute(forward_param, exec_queue);
     return 0;
   }
lite/kernels/npu/bridges/CMakeLists.txt

-if(NOT LITE_WITH_NPU AND NOT LITE_WITH_XPU AND NOT LITE_WITH_BM)
+if(NOT LITE_WITH_NPU AND NOT LITE_WITH_XPU AND NOT LITE_WITH_BM AND NOT LITE_WITH_MLU)
     return()
 endif()
lite/kernels/x86/cast_compute.cc

@@ -23,3 +23,14 @@ REGISTER_LITE_KERNEL(cast,
     .BindInput("X", {LiteType::GetTensorTy(TARGET(kX86))})
     .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kX86))})
     .Finalize();
+
+REGISTER_LITE_KERNEL(
+    cast,
+    kX86,
+    kFloat,
+    kNCHW,
+    paddle::lite::kernels::x86::CastCompute<::paddle::lite::fluid::float16>,
+    fp16_to_any)
+    .BindInput("X", {LiteType::GetTensorTy(TARGET(kX86), PRECISION(kFP16))})
+    .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kX86))})
+    .Finalize();
lite/tests/cv/CMakeLists.txt

-if(LITE_WITH_CV AND (NOT LITE_WITH_OPENCL AND NOT LITE_WITH_FPGA) AND LITE_WITH_ARM)
+if(LITE_WITH_CV AND (NOT LITE_WITH_OPENCL AND NOT LITE_WITH_FPGA AND NOT LITE_WITH_MLU) AND LITE_WITH_ARM)
     lite_cc_test(image_convert_test SRCS image_convert_test.cc DEPS paddle_cv_arm)
 endif()
lite/tests/kernels/CMakeLists.txt

-if((NOT LITE_WITH_OPENCL AND NOT LITE_WITH_FPGA AND NOT LITE_WITH_BM) AND (LITE_WITH_X86 OR LITE_WITH_ARM))
+if((NOT LITE_WITH_OPENCL AND NOT LITE_WITH_FPGA AND NOT LITE_WITH_BM AND NOT LITE_WITH_MLU) AND (LITE_WITH_X86 OR LITE_WITH_ARM))
     lite_cc_test(test_kernel_conv_compute SRCS conv_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels})
     lite_cc_test(test_kernel_conv_transpose_compute SRCS conv_transpose_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels})
     lite_cc_test(test_kernel_scale_compute SRCS scale_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels})
lite/tests/math/CMakeLists.txt

-if((NOT LITE_WITH_OPENCL AND NOT LITE_WITH_FPGA) AND (LITE_WITH_X86 OR LITE_WITH_ARM))
+if((NOT LITE_WITH_OPENCL AND NOT LITE_WITH_FPGA AND NOT LITE_WITH_MLU) AND (LITE_WITH_X86 OR LITE_WITH_ARM))
     lite_cc_test(sgemm_compute_test SRCS sgemm_compute_test.cc DEPS arena_framework ${arm_kernels} ${lite_ops} ${host_kernels})
     lite_cc_test(sgemv_compute_test SRCS sgemv_compute_test.cc DEPS arena_framework ${arm_kernels} ${lite_ops} ${host_kernels})
     lite_cc_test(sgemm_c4_compute_test SRCS sgemm_c4_compute_test.cc DEPS arena_framework ${arm_kernels} ${lite_ops} ${host_kernels})