PaddlePaddle / Paddle — commit e75c01f9 (unverified)
Authored by Wang Xin on Apr 07, 2023; committed via GitHub on Apr 07, 2023.
clean up WITH_MLU (#52546)
Parent: 075d6b14
Showing 120 changed files with 10 additions and 15,503 deletions (+10, -15503).
Files changed (120):

CMakeLists.txt  +0 -16
cmake/configure.cmake  +0 -5
cmake/neuware.cmake  +0 -34
cmake/operators.cmake  +0 -23
cmake/third_party.cmake  +0 -5
paddle/fluid/framework/dlpack_tensor.cc  +0 -5
paddle/fluid/framework/executor.cc  +0 -11
paddle/fluid/framework/garbage_collector.cc  +0 -50
paddle/fluid/framework/garbage_collector.h  +0 -43
paddle/fluid/framework/op_registry.h  +0 -9
paddle/fluid/framework/operator.cc  +0 -24
paddle/fluid/framework/parallel_executor.cc  +0 -13
paddle/fluid/framework/phi_utils.cc  +0 -9
paddle/fluid/framework/tensor_util.cc  +2 -114
paddle/fluid/framework/tensor_util.h  +0 -29
paddle/fluid/imperative/CMakeLists.txt  +0 -4
paddle/fluid/imperative/amp_auto_cast.cc  +0 -9
paddle/fluid/imperative/gradient_accumulator.cc  +0 -38
paddle/fluid/imperative/prepared_operator.cc  +0 -58
paddle/fluid/imperative/tracer.cc  +0 -16
paddle/fluid/memory/allocation/allocator_facade.cc  +0 -38
paddle/fluid/memory/allocation/buddy_allocator.cc  +0 -6
paddle/fluid/memory/allocation/buddy_allocator_test.cc  +0 -199
paddle/fluid/memory/allocation/naive_best_fit_allocator.cc  +0 -134
paddle/fluid/memory/allocation/naive_best_fit_allocator_test.cc  +0 -15
paddle/fluid/memory/allocation/system_allocator.cc  +0 -75
paddle/fluid/memory/allocation/system_allocator.h  +0 -15
paddle/fluid/memory/allocation/system_allocator_test.cc  +0 -20
paddle/fluid/memory/memcpy.cc  +0 -224
paddle/fluid/memory/memcpy.h  +0 -3
paddle/fluid/operators/CMakeLists.txt  +0 -8
paddle/fluid/operators/amp/check_finite_and_unscale_op_mlu.cc  +0 -136
paddle/fluid/operators/cast_op.cc  +1 -18
paddle/fluid/operators/coalesce_tensor_op.cc  +0 -20
paddle/fluid/operators/collective/barrier_op_mlu.cc  +0 -63
paddle/fluid/operators/collective/c_allgather_op_mlu.cc  +0 -121
paddle/fluid/operators/collective/c_allreduce_max_op_mlu.cc  +0 -26
paddle/fluid/operators/collective/c_allreduce_min_op_mlu.cc  +0 -26
paddle/fluid/operators/collective/c_allreduce_prod_op_mlu.cc  +0 -26
paddle/fluid/operators/collective/c_allreduce_sum_op_mlu.cc  +0 -26
paddle/fluid/operators/collective/c_broadcast_op_mlu.cc  +0 -99
paddle/fluid/operators/collective/c_reduce_max_op_mlu.cc  +0 -26
paddle/fluid/operators/collective/c_reduce_min_op_mlu.cc  +0 -26
paddle/fluid/operators/collective/c_reduce_prod_op_mlu.cc  +0 -26
paddle/fluid/operators/collective/c_reduce_sum_op_mlu.cc  +0 -26
paddle/fluid/operators/collective/c_sync_calc_stream_op.cc  +0 -15
paddle/fluid/operators/collective/c_sync_comm_stream_op.cc  +0 -3
paddle/fluid/operators/collective/mp_allreduce_sum_op_mlu.cc  +0 -26
paddle/fluid/operators/controlflow/compare_op_mlu.cc  +0 -240
paddle/fluid/operators/controlflow/logical_op_mlu.cc  +0 -81
paddle/fluid/operators/detection/CMakeLists.txt  +0 -5
paddle/fluid/operators/detection/iou_similarity_op_mlu.cc  +0 -279
paddle/fluid/operators/detection/prior_box_op_mlu.cc  +0 -104
paddle/fluid/operators/detection/yolo_box_op_mlu.cc  +0 -137
paddle/fluid/operators/elementwise/elementwise_add_op_mlu.cc  +0 -119
paddle/fluid/operators/elementwise/elementwise_div_op_mlu.cc  +0 -180
paddle/fluid/operators/elementwise/elementwise_max_op_mlu.cc  +0 -51
paddle/fluid/operators/elementwise/elementwise_min_op_mlu.cc  +0 -53
paddle/fluid/operators/elementwise/elementwise_mlu.h  +0 -421
paddle/fluid/operators/elementwise/elementwise_mul_op_mlu.cc  +0 -175
paddle/fluid/operators/elementwise/elementwise_pow_op_mlu.cc  +0 -212
paddle/fluid/operators/elementwise/elementwise_sub_op_mlu.cc  +0 -137
paddle/fluid/operators/expand_v2_op.h  +0 -14
paddle/fluid/operators/math/CMakeLists.txt  +1 -5
paddle/fluid/operators/math/concat_and_split.cc  +0 -111
paddle/fluid/operators/metrics/accuracy_op_mlu.cc  +0 -222
paddle/fluid/operators/mlu/CMakeLists.txt  +0 -10
paddle/fluid/operators/mlu/activation_op_mlu_test.cc  +0 -168
paddle/fluid/operators/mlu/mlu_baseop.cc  +0 -5511
paddle/fluid/operators/mlu/mlu_baseop.h  +0 -2435
paddle/fluid/operators/optimizers/adam_op_mlu.cc  +0 -610
paddle/fluid/operators/optimizers/merged_momentum_op_mlu.cc  +0 -200
paddle/fluid/operators/optimizers/momentum_op_mlu.cc  +0 -115
paddle/fluid/operators/reader/buffered_reader.cc  +0 -66
paddle/fluid/operators/reader/buffered_reader.h  +0 -10
paddle/fluid/operators/reduce_ops/reduce_max_op_mlu.cc  +0 -216
paddle/fluid/operators/reduce_ops/reduce_mean_op_mlu.cc  +0 -103
paddle/fluid/operators/reduce_ops/reduce_min_op_mlu.cc  +0 -104
paddle/fluid/operators/reduce_ops/reduce_op_mlu.h  +0 -85
paddle/fluid/operators/reduce_ops/reduce_prod_op_mlu.cc  +0 -37
paddle/fluid/operators/reduce_ops/reduce_sum_op_mlu.cc  +0 -86
paddle/fluid/operators/softmax_with_cross_entropy_op.cc  +0 -45
paddle/fluid/operators/utils.h  +0 -5
paddle/fluid/platform/CMakeLists.txt  +1 -9
paddle/fluid/platform/device/CMakeLists.txt  +0 -5
paddle/fluid/platform/device/device_wrapper.h  +0 -5
paddle/fluid/platform/device_context.cc  +0 -17
paddle/fluid/platform/device_context.h  +0 -9
paddle/fluid/platform/init.cc  +0 -16
paddle/fluid/platform/init_test.cc  +1 -16
paddle/fluid/platform/place.h  +0 -10
paddle/fluid/platform/place_test.cc  +0 -10
paddle/fluid/platform/profiler/CMakeLists.txt  +1 -7
paddle/fluid/platform/profiler/chrometracing_logger.cc  +0 -4
paddle/fluid/platform/profiler/mlu/CMakeLists.txt  +0 -8
paddle/fluid/platform/profiler/mlu/cnpapi_data_process.cc  +0 -280
paddle/fluid/platform/profiler/mlu/cnpapi_data_process.h  +0 -35
paddle/fluid/platform/profiler/mlu/mlu_tracer.cc  +0 -161
paddle/fluid/platform/profiler/mlu/mlu_tracer.h  +0 -60
paddle/fluid/platform/profiler/profiler.cc  +0 -15
paddle/fluid/platform/profiler_helper.h  +0 -11
paddle/fluid/platform/stream_callback_manager.cc  +0 -17
paddle/fluid/pybind/imperative.cc  +0 -70
paddle/fluid/pybind/parallel_executor.cc  +0 -4
paddle/fluid/pybind/place.cc  +0 -85
paddle/fluid/pybind/tensor.cc  +0 -29
paddle/fluid/pybind/tensor_py.h  +0 -66
paddle/phi/backends/device_memory_aligment.h  +1 -6
paddle/phi/backends/mlu/mlu_info.h  +0 -33
paddle/phi/common/place.h  +0 -10
paddle/phi/core/utils/visit_place.h  +0 -9
paddle/phi/kernels/funcs/activation_functor.h  +0 -4
paddle/phi/kernels/funcs/math_function.cc  +0 -7
paddle/phi/kernels/funcs/strided_memcpy.h  +1 -2
python/paddle/fluid/__init__.py  +0 -2
python/paddle/framework/__init__.py  +0 -1
test/CMakeLists.txt  +0 -3
test/cpp/imperative/CMakeLists.txt  +0 -6
test/cpp/imperative/cncl_context_test.cc  +0 -141
test/cpp/imperative/test_group.cc  +1 -17
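Almost every build-system change in this commit removes one recurring pattern: the WITH_MLU switch and the if(WITH_MLU) guards hanging off it. As a minimal composite sketch of that pattern, assembled from the hunks below rather than copied from any single file:

# Illustrative composite of the WITH_MLU wiring deleted by this commit;
# the individual lines come from the CMakeLists.txt and cmake/configure.cmake hunks below.
option(WITH_MLU "Compile PaddlePaddle with CAMBRICON MLU" OFF)

if(WITH_MLU)
  include(neuware)                    # Cambricon Neuware toolchain (cmake/neuware.cmake, deleted below)
  add_definitions(-DPADDLE_WITH_MLU)  # guards the MLU code paths removed from the C++ sources
endif()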
CMakeLists.txt
@@ -53,7 +53,6 @@ option(WITH_TENSORRT "Compile PaddlePaddle with NVIDIA TensorRT" OFF)
 option(WITH_XPU "Compile PaddlePaddle with BAIDU KUNLUN XPU" OFF)
 option(WITH_XPU_KP "Compile PaddlePaddle with BAIDU XPU compiler " OFF)
 option(WITH_XPU_XFT "Compile PaddlePaddle with BAIDU XPU-XFT" OFF)
-option(WITH_MLU "Compile PaddlePaddle with CAMBRICON MLU" OFF)
 option(WITH_WIN_DUMP_DBG "Compile with windows core dump debug mode" OFF)
 option(WITH_ASCEND "Compile PaddlePaddle with ASCEND" OFF)
 option(WITH_ROCM "Compile PaddlePaddle with ROCM platform" OFF)
@@ -81,9 +80,6 @@ endif()
 if(WITH_GPU AND WITH_ROCM)
   message(FATAL_ERROR "Error when compile CUDA and ROCM at the same time")
 endif()
-if(WITH_GPU AND WITH_MLU)
-  message(FATAL_ERROR "Error when compile GPU and MLU at the same time")
-endif()
 if(WITH_GPU AND NOT APPLE)
   enable_language(CUDA)
@@ -430,14 +426,6 @@ if(NOT WITH_XPU AND WITH_XPU_BKCL)
       CACHE STRING "Disable BKCL when compiling without XPU" FORCE)
 endif()
-if(NOT WITH_MLU AND WITH_CNCL)
-  message(WARNING "Disable CNCL when compiling without MLU. Force WITH_MLU=OFF.")
-  set(WITH_MLU
-      OFF
-      CACHE STRING "Disable CNCL when compiling without MLU" FORCE)
-endif()
 if(WITH_NCCL)
   add_definitions("-DPADDLE_WITH_NCCL")
   include(nccl)
@@ -469,10 +457,6 @@ if(WITH_GPU)
   endif()
 endif()
-if(WITH_MLU)
-  include(neuware)
-endif()
 if(WITH_ROCM)
   include(hip)
   include(miopen) # set miopen libraries, must before configure
cmake/configure.cmake
@@ -116,11 +116,6 @@ if(WITH_IPU)
   add_definitions(-DPADDLE_WITH_IPU)
 endif()
-if(WITH_MLU)
-  message(STATUS "Compile with MLU!")
-  add_definitions(-DPADDLE_WITH_MLU)
-endif()
 if(WITH_GPU)
   add_definitions(-DPADDLE_WITH_CUDA)
   add_definitions(-DEIGEN_USE_GPU)
cmake/neuware.cmake  (file deleted; mode 100644 → 0)
-if(NOT WITH_MLU)
-  return()
-endif()
-
-if(NOT ENV{NEUWARE_HOME})
-  set(NEUWARE_HOME "/usr/local/neuware")
-else()
-  set(NEUWARE_HOME $ENV{NEUWARE_HOME})
-endif()
-message(STATUS "NEUWARE_HOME: " ${NEUWARE_HOME})
-
-set(NEUWARE_INCLUDE_DIR ${NEUWARE_HOME}/include)
-set(NEUWARE_LIB_DIR ${NEUWARE_HOME}/lib64)
-
-include_directories(${NEUWARE_INCLUDE_DIR})
-
-set(CNNL_LIB ${NEUWARE_LIB_DIR}/libcnnl.so)
-set(MLUOP_LIB ${NEUWARE_LIB_DIR}/libmluops.so)
-set(CNRT_LIB ${NEUWARE_LIB_DIR}/libcnrt.so)
-set(CNDRV_LIB ${NEUWARE_LIB_DIR}/libcndrv.so)
-set(CNPAPI_LIB ${NEUWARE_LIB_DIR}/libcnpapi.so)
-
-generate_dummy_static_lib(LIB_NAME "neuware_lib" GENERATOR "neuware.cmake")
-set(NEUWARE_LIB_DEPS ${CNNL_LIB} ${MLUOP_LIB} ${CNRT_LIB} ${CNDRV_LIB}
-                     ${CNPAPI_LIB})
-
-if(WITH_CNCL)
-  message(STATUS "Compile with CNCL!")
-  add_definitions(-DPADDLE_WITH_CNCL)
-  set(CNCL_LIB ${NEUWARE_LIB_DIR}/libcncl.so)
-  list(APPEND NEUWARE_LIB_DEPS ${CNCL_LIB})
-endif()
-
-target_link_libraries(neuware_lib ${NEUWARE_LIB_DEPS})
cmake/operators.cmake
@@ -74,9 +74,6 @@ function(op_library TARGET)
   set(MKLDNN_FILE)
   set(op_common_deps operator op_registry math_function layer
       common_infer_shape_functions)
-  if(WITH_MLU)
-    set(op_common_deps ${op_common_deps} mlu_baseop)
-  endif()
   # Option `UNITY` is used to specify that operator `TARGET` will compiles with Unity Build.
   set(options UNITY)
@@ -169,12 +166,6 @@ function(op_library TARGET)
         list(APPEND xpu_kp_cc_srcs ${TARGET}.kps)
       endif()
     endif()
-    if(WITH_MLU)
-      string(REPLACE "_op" "_op_mlu" MLU_FILE "${TARGET}")
-      if(EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${MLU_FILE}.cc)
-        list(APPEND mlu_cc_srcs ${MLU_FILE}.cc)
-      endif()
-    endif()
   else()
     foreach(src ${op_library_SRCS})
       if(WITH_ROCM AND ${src} MATCHES ".*_cudnn_op.cu$")
@@ -201,8 +192,6 @@ function(op_library TARGET)
         list(APPEND xpu_kp_cc_srcs ${src})
       elseif(WITH_XPU_KP AND ${src} MATCHES ".*\\.kps$")
         list(APPEND xpu_kp_cc_srcs ${src})
-      elseif(WITH_MLU AND ${src} MATCHES ".*_op_mlu.cc$")
-        list(APPEND mlu_cc_srcs ${src})
       elseif(${src} MATCHES ".*\\.cc$")
         list(APPEND cc_srcs ${src})
       elseif((WITH_ROCM OR WITH_GPU) AND ${src} MATCHES ".*\\.kps$")
@@ -519,18 +508,6 @@ function(op_library TARGET)
     endforeach()
   endif()
-  # pybind USE_OP_DEVICE_KERNEL for MLU
-  if(WITH_MLU AND ${mlu_cc_srcs_len} GREATER 0)
-    foreach(mlu_src ${mlu_cc_srcs})
-      set(op_name "")
-      find_register(${mlu_src} "REGISTER_OP_MLU_KERNEL" op_name)
-      if(NOT ${op_name} EQUAL "")
-        file(APPEND ${pybind_file} "USE_OP_DEVICE_KERNEL(${op_name}, MLU);\n")
-        set(pybind_flag 1)
-      endif()
-    endforeach()
-  endif()
   # pybind USE_OP_DEVICE_KERNEL for MKLDNN
   if(WITH_MKLDNN AND ${mkldnn_cc_srcs_len} GREATER 0)
     # Append first implemented MKLDNN activation operator
cmake/third_party.cmake
@@ -356,11 +356,6 @@ if(WITH_XPU)
   list(APPEND third_party_deps extern_xpu)
 endif()
-if(WITH_MLU)
-  include(external/concurrentqueue) # download, build, install concurrentqueue
-  list(APPEND third_party_deps extern_concurrentqueue)
-endif()
 if(WITH_PSLIB)
   include(external/pslib) # download, build, install pslib
   list(APPEND third_party_deps extern_pslib)
paddle/fluid/framework/dlpack_tensor.cc
@@ -99,11 +99,6 @@ struct DLDeviceVisitor
         "platform::NPUPinnedPlace is not supported"));
   }
-  inline ::DLDevice operator()(const platform::MLUPlace &place) const {
-    PADDLE_THROW(
-        platform::errors::Unimplemented("platform::MLUPlace is not supported"));
-  }
   inline ::DLDevice operator()(const platform::CustomPlace &place) const {
     PADDLE_THROW(platform::errors::Unimplemented(
         "platform::CustomPlace is not supported"));
paddle/fluid/framework/executor.cc
@@ -516,17 +516,6 @@ void Executor::RunPartialPreparedContext(ExecutorPrepareContext* ctx,
 #else
     PADDLE_THROW(
         platform::errors::Unimplemented("No IPU gc found in CPU/IPU paddle"));
 #endif
-  } else if (platform::is_mlu_place(place_)) {
-#ifdef PADDLE_WITH_MLU
-    if (IsFastEagerDeletionModeEnabled()) {
-      gc.reset(new MLUUnsafeFastGarbageCollector(place_, max_memory_size));
-    } else {
-      gc.reset(new MLUDefaultStreamGarbageCollector(place_, max_memory_size));
-    }
-#else
-    PADDLE_THROW(
-        platform::errors::Unimplemented("No MLU gc found in CPU/MLU paddle"));
-#endif
   } else if (platform::is_custom_place(place_)) {
 #ifdef PADDLE_WITH_CUSTOM_DEVICE
paddle/fluid/framework/garbage_collector.cc
@@ -125,56 +125,6 @@ void CUDAPinnedGarbageCollector::ClearCallback(
 }
 #endif
-#ifdef PADDLE_WITH_MLU
-MLUDefaultStreamGarbageCollector::MLUDefaultStreamGarbageCollector(
-    const platform::MLUPlace &place, size_t max_memory_size)
-    : GarbageCollector(place, max_memory_size) {}
-
-void MLUDefaultStreamGarbageCollector::Wait() const {
-  static_cast<platform::MLUDeviceContext *>(this->dev_ctx_)
-      ->WaitStreamCallback();
-}
-
-void MLUDefaultStreamGarbageCollector::ClearCallback(
-    const std::function<void()> &callback) {
-  static_cast<platform::MLUDeviceContext *>(this->dev_ctx_)
-      ->AddStreamCallback(callback);
-}
-
-MLUUnsafeFastGarbageCollector::MLUUnsafeFastGarbageCollector(
-    const platform::MLUPlace &place, size_t max_memory_size)
-    : GarbageCollector(place, max_memory_size) {}
-
-void MLUUnsafeFastGarbageCollector::ClearCallback(
-    const std::function<void()> &callback) {
-  callback();
-}
-
-MLUStreamGarbageCollector::MLUStreamGarbageCollector(
-    const platform::MLUPlace &place, size_t max_memory_size)
-    : GarbageCollector(place, max_memory_size) {
-  platform::MLUDeviceGuard guard(place.device);
-  PADDLE_ENFORCE_MLU_SUCCESS(cnrtQueueCreate(&stream_));
-  callback_manager_.reset(
-      new platform::StreamCallbackManager<mluStream>(stream_));
-}
-
-MLUStreamGarbageCollector::~MLUStreamGarbageCollector() {
-  auto place = this->dev_ctx_->GetPlace();
-  platform::MLUDeviceGuard guard(place.device);
-  PADDLE_ENFORCE_MLU_SUCCESS(cnrtQueueSync(stream_));
-  PADDLE_ENFORCE_MLU_SUCCESS(cnrtQueueDestroy(stream_));
-}
-
-mluStream MLUStreamGarbageCollector::stream() const { return stream_; }
-
-void MLUStreamGarbageCollector::Wait() const { callback_manager_->Wait(); }
-
-void MLUStreamGarbageCollector::ClearCallback(
-    const std::function<void()> &callback) {
-  callback_manager_->AddCallback(callback);
-}
-#endif
 #ifdef PADDLE_WITH_CUSTOM_DEVICE
 CustomDefaultStreamGarbageCollector::CustomDefaultStreamGarbageCollector(
     const platform::CustomPlace &place, size_t max_memory_size)
paddle/fluid/framework/garbage_collector.h
@@ -22,9 +22,6 @@
 #include "gflags/gflags.h"
 #include "paddle/fluid/platform/device_context.h"
-#ifdef PADDLE_WITH_MLU
-#include "paddle/fluid/platform/device/mlu/device_context.h"
-#endif
 #include "paddle/fluid/platform/stream_callback_manager.h"

 namespace paddle {
@@ -139,46 +136,6 @@ class CUDAPinnedGarbageCollector : public GarbageCollector {
 };
 #endif
-#ifdef PADDLE_WITH_MLU
-class MLUDefaultStreamGarbageCollector : public GarbageCollector {
- public:
-  MLUDefaultStreamGarbageCollector(const platform::MLUPlace &place,
-                                   size_t max_memory_size);
-
-  void Wait() const override;
-
- protected:
-  void ClearCallback(const std::function<void()> &callback) override;
-};
-
-class MLUUnsafeFastGarbageCollector : public GarbageCollector {
- public:
-  MLUUnsafeFastGarbageCollector(const platform::MLUPlace &place,
-                                size_t max_memory_size);
-
- protected:
-  void ClearCallback(const std::function<void()> &callback) override;
-};
-
-class MLUStreamGarbageCollector : public GarbageCollector {
- public:
-  MLUStreamGarbageCollector(const platform::MLUPlace &place,
-                            size_t max_memory_size);
-
-  ~MLUStreamGarbageCollector();
-
-  void Wait() const override;
-
-  mluStream stream() const;
-
- protected:
-  void ClearCallback(const std::function<void()> &callback) override;
-
- private:
-  mluStream stream_;
-  std::unique_ptr<platform::StreamCallbackManager<mluStream>> callback_manager_;
-};
-#endif
 #ifdef PADDLE_WITH_CUSTOM_DEVICE
 class CustomDefaultStreamGarbageCollector : public GarbageCollector {
  public:
paddle/fluid/framework/op_registry.h
@@ -376,9 +376,6 @@ struct OpKernelRegistrarFunctorEx<PlaceType,
 #define REGISTER_OP_NPU_KERNEL(op_type, ...) \
   REGISTER_OP_KERNEL(op_type, NPU, ::paddle::platform::NPUPlace, __VA_ARGS__)
-#define REGISTER_OP_MLU_KERNEL(op_type, ...) \
-  REGISTER_OP_KERNEL(op_type, MLU, ::paddle::platform::MLUPlace, __VA_ARGS__)
 #define REGISTER_OP_KERNEL_EX(op_type, library_type, place_class, \
                               customized_name, \
                               customized_type_value, \
@@ -421,12 +418,6 @@ struct OpKernelRegistrarFunctorEx<PlaceType,
       ::paddle::framework::OpKernelType::kDefaultCustomizedTypeValue, \
       __VA_ARGS__)
-#define REGISTER_OP_MLU_KERNEL_FUNCTOR(op_type, ...) \
-  REGISTER_OP_KERNEL_EX( \
-      op_type, MLU, ::paddle::platform::MLUPlace, DEFAULT_TYPE, \
-      ::paddle::framework::OpKernelType::kDefaultCustomizedTypeValue, \
-      __VA_ARGS__)
 #define REGISTER_OP_IPU_KERNEL_FUNCTOR(op_type, ...) \
   REGISTER_OP_KERNEL_EX( \
       op_type, IPU, ::paddle::platform::IPUPlace, DEFAULT_TYPE, \
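The two macros removed above were the registration entry points used by the *_op_mlu.cc kernel files deleted elsewhere in this commit. As an illustration only (the operator and kernel-class names below are hypothetical, not taken from this diff), a typical call site looked roughly like:

// Hypothetical example of the removed registration macro in use;
// "relu" and ReluMLUKernel are placeholder names, not confirmed by this commit.
namespace ops = paddle::operators;
namespace plat = paddle::platform;

REGISTER_OP_MLU_KERNEL(relu,
                       ops::ReluMLUKernel<float>,
                       ops::ReluMLUKernel<plat::float16>);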
paddle/fluid/framework/operator.cc
@@ -57,10 +57,6 @@ class DenseTensor;
 #include "paddle/fluid/platform/mkldnn_op_list.h"
 #endif
-#ifdef PADDLE_WITH_MLU
-#include "paddle/fluid/platform/device/mlu/mlu_info.h"
-#endif
 #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
 #include "paddle/fluid/platform/device/gpu/gpu_dnn.h"
 #endif
@@ -770,16 +766,6 @@ void OperatorBase::Run(const Scope& scope, const platform::Place& place) {
 #else
     auto dev_id = place.device;
     platform::SetXPUDeviceId(dev_id);
 #endif
-  } else if (platform::is_mlu_place(place)) {
-#ifndef PADDLE_WITH_MLU
-    PADDLE_THROW(platform::errors::Unavailable(
-        "Cannot run operator on place %s, please recompile paddle or "
-        "reinstall Paddle with MLU support.",
-        place));
-#else
-    auto dev_id = place.device;
-    platform::SetMLUDeviceId(dev_id);
-#endif
   } else if (platform::is_custom_place(place)) {
 #ifndef PADDLE_WITH_CUSTOM_DEVICE
@@ -2301,16 +2287,6 @@ void OperatorWithKernel::ChooseKernel(const ExecutionContext& ctx) const {
   }
 #endif
-#ifdef PADDLE_WITH_MLU
-  if (kernel_iter == kernels.end() &&
-      platform::is_mlu_place(expected_kernel_key.place_)) {
-    VLOG(3) << "missing MLU kernel: " << type_
-            << ", expected_kernel_key:" << expected_kernel_key
-            << ", fallbacking to CPU one!";
-    expected_kernel_key.place_ = platform::CPUPlace();
-    kernel_iter = kernels.find(expected_kernel_key);
-  }
-#endif
 #ifdef PADDLE_WITH_CUSTOM_DEVICE
   if (kernel_iter == kernels.end() &&
       platform::is_custom_place(expected_kernel_key.place_)) {
paddle/fluid/framework/parallel_executor.cc
@@ -522,19 +522,6 @@ ir::Graph *ParallelExecutorPrivate::ApplyMemoryOptimizePass(ir::Graph *graph) {
       PADDLE_THROW(platform::errors::PermissionDenied(
           "Paddle can't use CUDA device since it's not compiled with CUDA,"
           "Please recompile or reinstall Paddle with GPU support."));
 #endif
-    } else if (platform::is_mlu_place(place)) {
-#ifdef PADDLE_WITH_MLU
-      if (IsFastEagerDeletionModeEnabled()) {
-        gc.reset(new MLUUnsafeFastGarbageCollector(place, max_memory_size));
-      } else {
-        gc.reset(new MLUStreamGarbageCollector(place, max_memory_size));
-      }
-      VLOG(10) << "Created " << i << "-th GarbageCollector at " << place;
-#else
-      PADDLE_THROW(platform::errors::PermissionDenied(
-          "Paddle can't use MLU device since it's not compiled with MLU,"
-          "Please recompile or reinstall Paddle with MLU support."));
-#endif
     } else if (platform::is_xpu_place(place)) {
 #if defined(PADDLE_WITH_XPU)
paddle/fluid/framework/phi_utils.cc
@@ -112,15 +112,6 @@ phi::KernelKey FallBackToCpu(const phi::KernelKey& kernel_key,
         phi::Backend::CPU, kernel_key.layout(), kernel_key.dtype());
   }
 #endif
-#ifdef PADDLE_WITH_MLU
-  if (kernel_key.backend() == phi::Backend::MLU) {
-    VLOG(3) << "phi missing MLU kernel: " << op.Type()
-            << ", expected_kernel_key:" << kernel_key
-            << ", fallback to CPU one!";
-    return phi::KernelKey(
-        phi::Backend::CPU, kernel_key.layout(), kernel_key.dtype());
-  }
-#endif
 #ifdef PADDLE_WITH_IPU
   if (kernel_key.backend() == phi::Backend::IPU) {
     VLOG(3) << "phi missing IPU kernel: " << op.Type()
浏览文件 @
e75c01f9
...
@@ -267,59 +267,6 @@ void TensorCopyImpl(const TENSOR& src,
...
@@ -267,59 +267,6 @@ void TensorCopyImpl(const TENSOR& src,
"Copying from %s to %s is not supported."
,
src_place
,
dst_place
));
"Copying from %s to %s is not supported."
,
src_place
,
dst_place
));
}
}
#endif
#endif
#ifdef PADDLE_WITH_MLU
else
if
(
platform
::
is_mlu_place
(
src_place
)
&&
// NOLINT
platform
::
is_cpu_place
(
dst_place
))
{
auto
src_mlu_place
=
src_place
;
auto
dst_cpu_place
=
dst_place
;
auto
stream
=
reinterpret_cast
<
const
platform
::
MLUDeviceContext
&>
(
ctx
).
stream
();
memory
::
Copy
(
dst_cpu_place
,
dst_ptr
,
src_mlu_place
,
src_ptr
,
size
,
stream
);
}
else
if
(
platform
::
is_cpu_place
(
src_place
)
&&
// NOLINT
platform
::
is_mlu_place
(
dst_place
))
{
auto
src_cpu_place
=
src_place
;
auto
dst_mlu_place
=
dst_place
;
auto
stream
=
reinterpret_cast
<
const
platform
::
MLUDeviceContext
&>
(
ctx
).
stream
();
memory
::
Copy
(
dst_mlu_place
,
dst_ptr
,
src_cpu_place
,
src_ptr
,
size
,
stream
);
}
else
if
(
platform
::
is_mlu_place
(
src_place
)
&&
// NOLINT
platform
::
is_mlu_place
(
dst_place
))
{
auto
src_mlu_place
=
src_place
;
auto
dst_mlu_place
=
dst_place
;
auto
stream
=
reinterpret_cast
<
const
platform
::
MLUDeviceContext
&>
(
ctx
).
stream
();
memory
::
Copy
(
dst_mlu_place
,
dst_ptr
,
src_mlu_place
,
src_ptr
,
size
,
stream
);
}
else
{
// NOLINT
PADDLE_THROW
(
platform
::
errors
::
Unimplemented
(
"Copying from %s to %s is not supported."
,
src_place
,
dst_place
));
}
#endif
#ifdef PADDLE_WITH_IPU
else
if
(
platform
::
is_ipu_place
(
src_place
)
&&
// NOLINT
platform
::
is_cpu_place
(
dst_place
))
{
memory
::
Copy
(
dst_place
,
dst_ptr
,
src_place
,
src_ptr
,
size
);
}
else
if
(
platform
::
is_cpu_place
(
src_place
)
&&
// NOLINT
platform
::
is_ipu_place
(
dst_place
))
{
memory
::
Copy
(
dst_place
,
dst_ptr
,
src_place
,
src_ptr
,
size
);
}
else
if
(
platform
::
is_ipu_place
(
src_place
)
&&
// NOLINT
platform
::
is_ipu_place
(
dst_place
))
{
if
(
src_ptr
==
dst_ptr
)
{
VLOG
(
3
)
<<
"Skip copy the same data sync from "
<<
src_place
<<
" to "
<<
dst_place
;
return
;
}
memory
::
Copy
(
dst_place
,
dst_ptr
,
src_place
,
src_ptr
,
size
);
}
else
{
// NOLINT
PADDLE_THROW
(
platform
::
errors
::
Unimplemented
(
"Copying from %s to %s is not supported."
,
src_place
,
dst_place
));
}
#endif
}
}
template
<
typename
TENSOR
>
template
<
typename
TENSOR
>
...
@@ -480,29 +427,6 @@ void TensorCopySync(const phi::DenseTensor& src,
...
@@ -480,29 +427,6 @@ void TensorCopySync(const phi::DenseTensor& src,
"Copy from %s to %s is not supported."
,
src_place
,
dst_place
));
"Copy from %s to %s is not supported."
,
src_place
,
dst_place
));
}
}
#endif
#endif
#ifdef PADDLE_WITH_MLU
else
if
(
platform
::
is_mlu_place
(
src_place
)
&&
// NOLINT
platform
::
is_cpu_place
(
dst_place
))
{
memory
::
Copy
(
dst_place
,
dst_ptr
,
src_place
,
src_ptr
,
size
,
nullptr
);
}
else
if
(
platform
::
is_cpu_place
(
src_place
)
&&
// NOLINT
platform
::
is_mlu_place
(
dst_place
))
{
memory
::
Copy
(
dst_place
,
dst_ptr
,
src_place
,
src_ptr
,
size
,
nullptr
);
}
else
if
(
platform
::
is_mlu_place
(
src_place
)
&&
// NOLINT
platform
::
is_mlu_place
(
dst_place
))
{
if
(
src_ptr
==
dst_ptr
)
{
VLOG
(
3
)
<<
"Skip copy the same data async from "
<<
src_place
<<
" to "
<<
dst_place
;
return
;
}
memory
::
Copy
(
dst_place
,
dst_ptr
,
src_place
,
src_ptr
,
size
,
nullptr
);
}
else
{
// NOLINT
PADDLE_THROW
(
platform
::
errors
::
Unimplemented
(
"Copy from %s to %s is not supported."
,
src_place
,
dst_place
));
}
#endif
#ifdef PADDLE_WITH_IPU
#ifdef PADDLE_WITH_IPU
else
if
(
platform
::
is_ipu_place
(
src_place
)
&&
// NOLINT
else
if
(
platform
::
is_ipu_place
(
src_place
)
&&
// NOLINT
platform
::
is_cpu_place
(
dst_place
))
{
platform
::
is_cpu_place
(
dst_place
))
{
...
@@ -604,31 +528,6 @@ void TensorToStream(std::ostream& os,
...
@@ -604,31 +528,6 @@ void TensorToStream(std::ostream& os,
#else
#else
PADDLE_THROW
(
platform
::
errors
::
Unimplemented
(
PADDLE_THROW
(
platform
::
errors
::
Unimplemented
(
"XPUPlace is not supported when not compiled with XPU"
));
"XPUPlace is not supported when not compiled with XPU"
));
#endif
}
else
if
(
platform
::
is_mlu_place
(
tensor
.
place
()))
{
#ifdef PADDLE_WITH_MLU
constexpr
size_t
kBufSize
=
1024
*
1024
*
64
;
// 64MB
std
::
unique_ptr
<
char
[]
>
buf
(
new
char
[
kBufSize
]);
auto
&
mlu_dev_ctx
=
static_cast
<
const
platform
::
MLUDeviceContext
&>
(
dev_ctx
);
platform
::
CPUPlace
cpu
;
uintptr_t
data
=
reinterpret_cast
<
uintptr_t
>
(
data_ptr
);
while
(
size
!=
0
)
{
size_t
size_to_write
=
std
::
min
(
kBufSize
,
static_cast
<
size_t
>
(
size
));
memory
::
Copy
(
cpu
,
buf
.
get
(),
tensor
.
place
(),
reinterpret_cast
<
const
void
*>
(
data
),
size_to_write
,
mlu_dev_ctx
.
stream
());
mlu_dev_ctx
.
Wait
();
os
.
write
(
buf
.
get
(),
size_to_write
);
data
+=
size_to_write
;
size
-=
size_to_write
;
}
#else
PADDLE_THROW
(
platform
::
errors
::
Unimplemented
(
"MLUPlace is not supported when not compiled with MLU"
));
#endif
#endif
}
else
if
(
platform
::
is_custom_place
(
tensor
.
place
()))
{
}
else
if
(
platform
::
is_custom_place
(
tensor
.
place
()))
{
#ifdef PADDLE_WITH_CUSTOM_DEVICE
#ifdef PADDLE_WITH_CUSTOM_DEVICE
...
@@ -720,8 +619,7 @@ void TensorFromStream(std::istream& is,
...
@@ -720,8 +619,7 @@ void TensorFromStream(std::istream& is,
platform
::
is_npu_place
(
dev_ctx
.
GetPlace
())
||
platform
::
is_npu_place
(
dev_ctx
.
GetPlace
())
||
platform
::
is_custom_place
(
dev_ctx
.
GetPlace
()))
{
platform
::
is_custom_place
(
dev_ctx
.
GetPlace
()))
{
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \
defined(PADDLE_WITH_XPU) || defined(PADDLE_WITH_MLU) || \
defined(PADDLE_WITH_XPU) || defined(PADDLE_WITH_CUSTOM_DEVICE)
defined(PADDLE_WITH_CUSTOM_DEVICE)
phi
::
DenseTensor
cpu_tensor
;
phi
::
DenseTensor
cpu_tensor
;
cpu_tensor
.
Resize
(
phi
::
make_ddim
(
shape
));
cpu_tensor
.
Resize
(
phi
::
make_ddim
(
shape
));
framework
::
VisitDataType
(
framework
::
VisitDataType
(
...
@@ -741,12 +639,6 @@ void TensorFromStream(std::istream& is,
...
@@ -741,12 +639,6 @@ void TensorFromStream(std::istream& is,
}
else
if
(
platform
::
is_xpu_place
(
dev_ctx
.
GetPlace
()))
{
}
else
if
(
platform
::
is_xpu_place
(
dev_ctx
.
GetPlace
()))
{
PADDLE_THROW
(
platform
::
errors
::
Unimplemented
(
PADDLE_THROW
(
platform
::
errors
::
Unimplemented
(
"XPUPlace is not supported when not compiled with XPU"
));
"XPUPlace is not supported when not compiled with XPU"
));
}
else
if
(
platform
::
is_mlu_place
(
dev_ctx
.
GetPlace
()))
{
PADDLE_THROW
(
platform
::
errors
::
Unimplemented
(
"MLUPlace is not supported when not compiled with MLU"
));
}
else
{
PADDLE_THROW
(
platform
::
errors
::
Unimplemented
(
"NPUPlace is not supported when not compiled with NPU"
));
}
}
#endif
#endif
}
else
{
}
else
{
...
@@ -803,8 +695,7 @@ void TensorFromStream(std::istream& is,
...
@@ -803,8 +695,7 @@ void TensorFromStream(std::istream& is,
platform
::
is_npu_place
(
dev_ctx
.
GetPlace
())
||
platform
::
is_npu_place
(
dev_ctx
.
GetPlace
())
||
platform
::
is_custom_place
(
dev_ctx
.
GetPlace
()))
{
platform
::
is_custom_place
(
dev_ctx
.
GetPlace
()))
{
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \
defined(PADDLE_WITH_XPU) || defined(PADDLE_WITH_MLU) || \
defined(PADDLE_WITH_XPU) || defined(PADDLE_WITH_CUSTOM_DEVICE)
defined(PADDLE_WITH_CUSTOM_DEVICE)
phi
::
DenseTensor
cpu_tensor
;
phi
::
DenseTensor
cpu_tensor
;
cpu_tensor
.
Resize
(
phi
::
make_ddim
(
dims
));
cpu_tensor
.
Resize
(
phi
::
make_ddim
(
dims
));
framework
::
VisitDataType
(
framework
::
VisitDataType
(
...
@@ -824,9 +715,6 @@ void TensorFromStream(std::istream& is,
...
@@ -824,9 +715,6 @@ void TensorFromStream(std::istream& is,
}
else
if
(
platform
::
is_xpu_place
(
dev_ctx
.
GetPlace
()))
{
}
else
if
(
platform
::
is_xpu_place
(
dev_ctx
.
GetPlace
()))
{
PADDLE_THROW
(
platform
::
errors
::
Unimplemented
(
PADDLE_THROW
(
platform
::
errors
::
Unimplemented
(
"XPUPlace is not supported when not compiled with XPU"
));
"XPUPlace is not supported when not compiled with XPU"
));
}
else
if
(
platform
::
is_mlu_place
(
dev_ctx
.
GetPlace
()))
{
PADDLE_THROW
(
platform
::
errors
::
Unimplemented
(
"MLUPlace is not supported when not compiled with MLU"
));
}
else
if
(
platform
::
is_npu_place
(
dev_ctx
.
GetPlace
()))
{
}
else
if
(
platform
::
is_npu_place
(
dev_ctx
.
GetPlace
()))
{
PADDLE_THROW
(
platform
::
errors
::
Unimplemented
(
PADDLE_THROW
(
platform
::
errors
::
Unimplemented
(
"NPUPlace is not supported when not compiled with NPU"
));
"NPUPlace is not supported when not compiled with NPU"
));
...
...
paddle/fluid/framework/tensor_util.h
@@ -26,9 +26,6 @@ limitations under the License. */
 #include "paddle/fluid/framework/tensor.h"
 #include "paddle/fluid/memory/allocation/allocator_facade.h"
 #include "paddle/fluid/platform/device_context.h"
-#ifdef PADDLE_WITH_MLU
-#include "paddle/fluid/platform/device/mlu/device_context.h"
-#endif
 #include "paddle/fluid/memory/memory.h"
 #include "paddle/phi/core/dense_tensor.h"
@@ -142,11 +139,6 @@ void TensorFromArray(const T* src,
         reinterpret_cast<const phi::GPUContext&>(ctx).stream());
   }
 #endif
-#ifdef PADDLE_WITH_MLU
-  else if (platform::is_mlu_place(dst_place)) {  // NOLINT
-    memory::Copy(dst_place, dst_ptr, src_place, src_ptr, size, nullptr);
-  }
-#endif
 #ifdef PADDLE_WITH_CUSTOM_DEVICE
   else if (platform::is_custom_place(dst_place)) {  // NOLINT
     memory::Copy(
@@ -193,11 +185,6 @@ void TensorFromVector(const std::vector<T>& src,
         reinterpret_cast<const phi::GPUContext&>(ctx).stream());
   }
 #endif
-#ifdef PADDLE_WITH_MLU
-  else if (platform::is_mlu_place(dst_place)) {  // NOLINT
-    memory::Copy(dst_place, dst_ptr, src_place, src_ptr, size, nullptr);
-  }
-#endif
 #ifdef PADDLE_WITH_CUSTOM_DEVICE
   else if (platform::is_custom_place(dst_place)) {  // NOLINT
     memory::Copy(
@@ -332,17 +319,6 @@ void TensorToVector(const phi::DenseTensor& src,
     memory::Copy(dst_place, dst_ptr, src.place(), src_ptr, size);
   }
 #endif
-#ifdef PADDLE_WITH_MLU
-  else if (platform::is_mlu_place(src.place())) {  // NOLINT
-    memory::Copy(
-        dst_place,
-        dst_ptr,
-        src.place(),
-        src_ptr,
-        size,
-        reinterpret_cast<const platform::MLUDeviceContext&>(ctx).stream());
-  }
-#endif
 #ifdef PADDLE_WITH_CUSTOM_DEVICE
   else if (platform::is_custom_place(src.place())) {  // NOLINT
     memory::Copy(dst_place, dst_ptr, src.place(), src_ptr, size, nullptr);
@@ -385,11 +361,6 @@ inline void TensorToVector(const phi::DenseTensor& src,
     memory::Copy(dst_place, dst_ptr, src.place(), src_ptr, size);
   }
 #endif
-#ifdef PADDLE_WITH_MLU
-  else if (platform::is_mlu_place(src.place())) {  // NOLINT
-    memory::Copy(dst_place, dst_ptr, src.place(), src_ptr, size, nullptr);
-  }
-#endif
 #ifdef PADDLE_WITH_CUSTOM_DEVICE
   else if (platform::is_custom_place(src.place())) {  // NOLINT
     memory::Copy(dst_place, dst_ptr, src.place(), src_ptr, size, nullptr);
paddle/fluid/imperative/CMakeLists.txt
@@ -177,10 +177,6 @@ if(WITH_GLOO)
   endif()
 endif()
-if(WITH_MLU)
-  set(MLU_DEPS mlu_baseop)
-endif()
 if(NOT WITH_ASCEND_CL)
   cc_library(
     gradient_accumulator
paddle/fluid/imperative/amp_auto_cast.cc
@@ -159,15 +159,6 @@ AmpOperators::AmpOperators()
       OpSupportedInfos("XPU", paddle::framework::proto::VarType::BF16));
   unsupported_bf16_ops_->insert(unsupported_ops_xpu_bf16.begin(),
                                 unsupported_ops_xpu_bf16.end());
-#elif defined(PADDLE_WITH_MLU)
-  auto unsupported_ops_mlu_fp16 = std::get<2>(
-      OpSupportedInfos("MLU", paddle::framework::proto::VarType::FP16));
-  unsupported_fp16_ops_->insert(unsupported_ops_mlu_fp16.begin(),
-                                unsupported_ops_mlu_fp16.end());
-  auto unsupported_ops_mlu_bf16 = std::get<2>(
-      OpSupportedInfos("MLU", paddle::framework::proto::VarType::BF16));
-  unsupported_bf16_ops_->insert(unsupported_ops_mlu_bf16.begin(),
-                                unsupported_ops_mlu_bf16.end());
 #endif
   VLOG(4) << allow_ops_->size() << " " << block_ops_->size() << " "
           << unsupported_fp16_ops_->size() << " "
paddle/fluid/imperative/gradient_accumulator.cc
@@ -34,9 +34,6 @@
 #include "paddle/phi/backends/xpu/enforce_xpu.h"
 #include "xpu/refactor/math.h"
 #endif
-#ifdef PADDLE_WITH_MLU
-#include "paddle/fluid/operators/mlu/mlu_baseop.h"
-#endif
 #ifdef PADDLE_WITH_CUSTOM_DEVICE
 #include "paddle/phi/backends/device_manager.h"
 #endif
@@ -288,41 +285,6 @@ void TensorAdd(const VarType& src, VarType* dst) {
   }
 #endif
-#ifdef PADDLE_WITH_MLU
-  if (platform::is_mlu_place(place)) {
-    platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance();
-    platform::DeviceContext* ctx = pool.Get(place);
-    auto dev_ctx = dynamic_cast<platform::MLUDeviceContext*>(ctx);
-    if (data_type == framework::DataTypeTrait<float>::DataType()) {
-      dst_tensor->mutable_data<float>(place);
-    } else if (data_type ==
-               framework::DataTypeTrait<platform::float16>::DataType()) {
-      dst_tensor->mutable_data<platform::float16>(place);
-    } else {
-      PADDLE_THROW(platform::errors::Unimplemented(
-          "Gradient accumulation of data type (%s) on place (%s) is not "
-          "supported in imperative mode",
-          framework::DataTypeToString(data_type),
-          place));
-    }
-    static const float alpha = 1.f;
-    static const float beta = 1.f;
-    operators::MLUCnnlTensorDesc src_tensor_desc(src_tensor);
-    operators::MLUCnnlTensorDesc dst_tensor_desc(*dst_tensor);
-    PADDLE_ENFORCE_MLU_SUCCESS(
-        cnnlAssignAdd(dev_ctx->cnnl_handle(),
-                      static_cast<const void*>(&alpha),
-                      src_tensor_desc.get(),
-                      operators::GetBasePtr(&src_tensor),
-                      nullptr,
-                      0,
-                      static_cast<const void*>(&beta),
-                      dst_tensor_desc.get(),
-                      operators::GetBasePtr(dst_tensor)));
-    return;
-  }
-#endif
   PADDLE_THROW(platform::errors::Unimplemented(
       "Gradient accumulation of data type (%s) on place (%s) is not "
       "supported in imperative mode",
paddle/fluid/imperative/prepared_operator.cc
@@ -150,48 +150,6 @@ PreparedOp::PreparedOp(const framework::OperatorBase& op,
       kernel_signature_(std::move(kernel_signature)),
       phi_kernel_(phi_kernel) {}
-#ifdef PADDLE_WITH_MLU
-static void tokenize(const std::string& ops,
-                     char delim,
-                     std::unordered_set<std::string>* op_set) {
-  std::string::size_type beg = 0;
-  for (uint64_t end = 0; (end = ops.find(delim, end)) != std::string::npos;
-       ++end) {
-    op_set->insert(ops.substr(beg, end - beg));
-    beg = end + 1;
-  }
-
-  op_set->insert(ops.substr(beg));
-}
-
-static bool is_in_mlu_black_list(const std::string& op_name) {
-  static bool inited = false;
-  static std::unordered_set<std::string> mlu_black_list;
-  static std::mutex s_mtx;
-  if (!inited) {
-    std::lock_guard<std::mutex> guard(s_mtx);
-    if (!inited) {
-      if (std::getenv("MLU_BLACK_LIST") != nullptr) {
-        std::string ops(std::getenv("MLU_BLACK_LIST"));
-        tokenize(ops, ',', &mlu_black_list);
-      }
-      inited = true;
-      VLOG(3) << "MLU Black List: ";
-      for (auto iter = mlu_black_list.begin(); iter != mlu_black_list.end();
-           ++iter) {
-        VLOG(3) << *iter << " ";
-      }
-    }
-  }
-  if (mlu_black_list.find(op_name) != mlu_black_list.end()) return true;
-  return false;
-}
-#endif
 template <typename VarType>
 PreparedOp PrepareImpl(
     const NameVarMap<VarType>& ins,
@@ -258,12 +216,6 @@ PreparedOp PrepareImpl(
                                              op.Type(),
                                              expected_kernel_key.dtype());
 #endif
-#ifdef PADDLE_WITH_MLU
-  if (is_in_mlu_black_list(op.Type())) {
-    expected_kernel_key.set_backend(phi::Backend::CPU);
-  }
-#endif
   bool has_phi_kernel = false;
   const auto* arg_map_fn = phi_op_utils_map.GetArgumentMappingFn(op.Type());
@@ -468,16 +420,6 @@ PreparedOp PrepareImpl(
     kernel_iter = kernels.find(fluid_kernel_type);
   }
 #endif
-#ifdef PADDLE_WITH_MLU
-  if (kernel_iter == kernels.end() &&
-      paddle::platform::is_mlu_place(fluid_kernel_type.place_)) {
-    VLOG(3) << "missing MLU kernel: " << op.Type()
-            << ", expected_kernel_key:" << fluid_kernel_type
-            << ", fallbacking to CPU one!";
-    fluid_kernel_type.place_ = platform::CPUPlace();
-    kernel_iter = kernels.find(fluid_kernel_type);
-  }
-#endif
 #ifdef PADDLE_WITH_CUSTOM_DEVICE
   if (kernel_iter == kernels.end() &&
       paddle::platform::is_custom_place(fluid_kernel_type.place_)) {
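The deleted is_in_mlu_black_list() helper above read a comma-separated op list from the MLU_BLACK_LIST environment variable and forced those ops onto the CPU backend. Below is a self-contained sketch of that parsing step, reusing tokenize() exactly as it appears in the removed code; the op names in main() are made-up examples, not values from this commit:

#include <cstdint>
#include <iostream>
#include <string>
#include <unordered_set>

// Copied from the removed code: split a delimiter-separated string into a set.
static void tokenize(const std::string& ops,
                     char delim,
                     std::unordered_set<std::string>* op_set) {
  std::string::size_type beg = 0;
  for (uint64_t end = 0; (end = ops.find(delim, end)) != std::string::npos;
       ++end) {
    op_set->insert(ops.substr(beg, end - beg));
    beg = end + 1;
  }
  op_set->insert(ops.substr(beg));
}

int main() {
  // Example of what a user might have exported as MLU_BLACK_LIST.
  std::unordered_set<std::string> black_list;
  tokenize("conv2d,pool2d,softmax", ',', &black_list);
  std::cout << black_list.count("conv2d") << "\n";  // prints 1: conv2d would fall back to CPU
  return 0;
}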
paddle/fluid/imperative/tracer.cc
@@ -147,15 +147,6 @@ paddle::framework::GarbageCollector* Tracer::MutableGarbageCollectorIfNotExists(
     PADDLE_THROW(platform::errors::PermissionDenied(
         "Paddle can't use IPU device since it's not compiled with IPU,"
         "Please recompile or reinstall Paddle with IPU support."));
 #endif
-  } else if (platform::is_mlu_place(place)) {
-#if defined(PADDLE_WITH_MLU)
-    gc.reset(new framework::MLUDefaultStreamGarbageCollector(place, 0));
-    VLOG(10) << "Created GarbageCollector at " << place;
-#else
-    PADDLE_THROW(platform::errors::PermissionDenied(
-        "Paddle can't use MLU device since it's not compiled with MLU,"
-        "Please recompile or reinstall Paddle with MLU support."));
-#endif
   } else if (platform::is_custom_place(place)) {
 #if defined(PADDLE_WITH_CUSTOM_DEVICE)
@@ -300,13 +291,6 @@ void Tracer::TraceOpImpl(const std::string& type,
   } else if (platform::is_npu_place(place)) {
     PADDLE_THROW(platform::errors::PreconditionNotMet(
         "PaddlePaddle should compile with NPU if use NPUPlace."));
-  } else if (platform::is_mlu_place(place)) {
-#ifdef PADDLE_WITH_MLU
-    platform::SetMLUDeviceId(place.device);
-#else
-    PADDLE_THROW(platform::errors::PreconditionNotMet(
-        "PaddlePaddle should compile with MLU if use MLUPlace."));
-#endif
   } else if (platform::is_custom_place(place)) {
 #ifdef PADDLE_WITH_CUSTOM_DEVICE
     phi::DeviceManager::SetDevice(place);
paddle/fluid/memory/allocation/allocator_facade.cc
@@ -58,10 +58,6 @@
 #include "paddle/fluid/platform/device/ipu/ipu_info.h"
 #endif
-#ifdef PADDLE_WITH_MLU
-#include "paddle/fluid/platform/device/mlu/mlu_info.h"
-#endif
 #ifdef PADDLE_WITH_CUSTOM_DEVICE
 #include "paddle/fluid/memory/allocation/custom_allocator.h"
 #include "paddle/fluid/platform/device/device_wrapper.h"
...
@@ -194,11 +190,6 @@ class AllocatorFacadePrivate {
         InitNaiveBestFitXPUAllocator(platform::XPUPlace(dev_id));
       }
 #endif
-#ifdef PADDLE_WITH_MLU
-      for (int dev_id = 0; dev_id < platform::GetMLUDeviceCount(); ++dev_id) {
-        InitNaiveBestFitMLUAllocator(platform::MLUPlace(dev_id));
-      }
-#endif
 #ifdef PADDLE_WITH_CUSTOM_DEVICE
       auto device_types = phi::DeviceManager::GetAllCustomDeviceTypes();
       for (const auto& dev_type : device_types) {
...
@@ -254,11 +245,6 @@ class AllocatorFacadePrivate {
         InitNaiveBestFitIPUAllocator(platform::IPUPlace(dev_id));
       }
 #endif
-#ifdef PADDLE_WITH_MLU
-      for (int dev_id = 0; dev_id < platform::GetMLUDeviceCount(); ++dev_id) {
-        InitNaiveBestFitMLUAllocator(platform::MLUPlace(dev_id));
-      }
-#endif
 #ifdef PADDLE_WITH_CUSTOM_DEVICE
       auto device_types = phi::DeviceManager::GetAllCustomDeviceTypes();
       for (const auto& dev_type : device_types) {
...
@@ -290,11 +276,6 @@ class AllocatorFacadePrivate {
         InitThreadLocalCUDAAllocator(platform::CUDAPlace(dev_id));
       }
       InitNaiveBestFitCUDAPinnedAllocator();
 #endif
-#ifdef PADDLE_WITH_MLU
-      for (int dev_id = 0; dev_id < platform::GetMLUDeviceCount(); ++dev_id) {
-        InitNaiveBestFitMLUAllocator(platform::MLUPlace(dev_id));
-      }
-#endif
       break;
     }
...
@@ -801,12 +782,6 @@ class AllocatorFacadePrivate {
   }
 #endif
-#ifdef PADDLE_WITH_MLU
-  void InitNaiveBestFitMLUAllocator(platform::MLUPlace p) {
-    allocators_[p] = std::make_shared<NaiveBestFitAllocator>(p);
-  }
-#endif
 #ifdef PADDLE_WITH_CUSTOM_DEVICE
   void InitNaiveBestFitCustomDeviceAllocator(platform::CustomPlace p) {
     allocators_[p] = std::make_shared<NaiveBestFitAllocator>(p);
...
@@ -851,13 +826,6 @@ class AllocatorFacadePrivate {
     system_allocators_[p] = CreateCUDAAllocator(p);
   }
 #endif
-#ifdef PADDLE_WITH_MLU
-  int device_count = platform::GetMLUDeviceCount();
-  for (int i = 0; i < device_count; ++i) {
-    platform::MLUPlace p(i);
-    system_allocators_[p] = std::make_shared<NaiveBestFitAllocator>(p);
-  }
-#endif
 #ifdef PADDLE_WITH_CUSTOM_DEVICE
   auto device_types = phi::DeviceManager::GetAllCustomDeviceTypes();
   for (const auto& dev_type : device_types) {
...
@@ -894,12 +862,6 @@ class AllocatorFacadePrivate {
     places.emplace_back(platform::IPUPlace(dev_id));
   }
 #endif
-#ifdef PADDLE_WITH_MLU
-  int device_count = platform::GetMLUDeviceCount();
-  for (int dev_id = 0; dev_id < device_count; ++dev_id) {
-    places.emplace_back(platform::MLUPlace(dev_id));
-  }
-#endif
 #ifdef PADDLE_WITH_CUSTOM_DEVICE
   auto device_types = phi::DeviceManager::GetAllCustomDeviceTypes();
   for (const auto& dev_type : device_types) {
...
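The removed hunks above all repeat one bookkeeping idea: enumerate the devices of a backend and register one naive best-fit allocator per place in a map. A standalone sketch of that shape, with simplified stand-in Place/Allocator types rather than Paddle's real ones:

#include <map>
#include <memory>
#include <utility>

// Simplified stand-ins; Paddle's real Place and Allocator types are richer.
enum class DeviceType { kCPU, kGPU, kOther };
using Place = std::pair<DeviceType, int>;  // (backend, device id)

struct Allocator {
  explicit Allocator(Place p) : place(p) {}
  Place place;
};

int main() {
  std::map<Place, std::shared_ptr<Allocator>> allocators;

  // One allocator per visible device of a backend, keyed by its place.
  int device_count = 2;  // in Paddle this would come from a Get*DeviceCount() query
  for (int dev_id = 0; dev_id < device_count; ++dev_id) {
    Place p{DeviceType::kGPU, dev_id};
    allocators[p] = std::make_shared<Allocator>(p);
  }
  return 0;
}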
paddle/fluid/memory/allocation/buddy_allocator.cc
@@ -56,9 +56,6 @@ BuddyAllocator::BuddyAllocator(
 #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
     init_allocate_size_func_ = &platform::GpuInitAllocSize;
     re_allocate_size_func_ = &platform::GpuReallocSize;
-#elif defined(PADDLE_WITH_MLU)
-    init_allocate_size_func_ = &platform::MLUInitAllocSize;
-    re_allocate_size_func_ = &platform::MLUReallocSize;
 #endif
   }
 #endif
...
@@ -253,9 +250,6 @@ BuddyAllocator::PoolSet::iterator BuddyAllocator::RefillPool(
 #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
     allocate_bytes = DeviceAllocateSize(
         &platform::GpuInitAllocSize, &platform::GpuReallocSize, request_bytes);
-#elif defined(PADDLE_WITH_MLU)
-    allocate_bytes = DeviceAllocateSize(
-        &platform::MLUInitAllocSize, &platform::MLUReallocSize, request_bytes);
 #endif
 #endif
...
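For context on the two function pointers touched here: the buddy allocator keeps an "initial fill size" policy and a "refill size" policy, and the deleted branch only pointed them at MLU-specific variants. A minimal standalone sketch of that function-pointer selection, with hypothetical size functions standing in for platform::GpuInitAllocSize and friends:

#include <cstddef>
#include <iostream>

// Hypothetical size policies; the real ones depend on GFlags and device memory.
static size_t InitAllocSize() { return 64 << 20; }  // bytes requested on the first fill
static size_t ReallocSize() { return 32 << 20; }    // bytes requested on later refills

int main() {
  // The allocator stores the policies as plain function pointers...
  size_t (*init_allocate_size_func)() = &InitAllocSize;
  size_t (*re_allocate_size_func)() = &ReallocSize;

  // ...and picks one depending on whether the pool is filled for the first time.
  bool first_fill = true;
  size_t bytes = first_fill ? init_allocate_size_func() : re_allocate_size_func();
  std::cout << "request " << bytes << " bytes\n";
  return 0;
}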
paddle/fluid/memory/allocation/buddy_allocator_test.cc
@@ -25,9 +25,6 @@ limitations under the License. */
 #include "gflags/gflags.h"
 #include "gtest/gtest.h"
 #include "paddle/fluid/platform/device/gpu/gpu_info.h"
-#ifdef PADDLE_WITH_MLU
-#include "paddle/fluid/platform/device/mlu/mlu_info.h"
-#endif
 #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
 DECLARE_double(fraction_of_gpu_memory_to_use);
...
@@ -395,202 +392,6 @@ TEST(BuddyAllocator, Release) {
 }
 #endif
-#ifdef PADDLE_WITH_MLU
-TEST(BuddyAllocator, MluFraction) {
-  // In a 16 GB machine, the pool size will be about 160 MB
-  FLAGS_fraction_of_gpu_memory_to_use = 0.01;
-  FLAGS_initial_gpu_memory_in_mb = 0;
-  FLAGS_reallocate_gpu_memory_in_mb = 0;
-
-  BuddyAllocator buddy_allocator(
-      std::unique_ptr<SystemAllocator>(new MLUAllocator(0)),
-      platform::MLUMinChunkSize(), platform::MLUMaxChunkSize());
-
-  // Less than pool size
-  TestBuddyAllocator(&buddy_allocator, 10);
-  TestBuddyAllocator(&buddy_allocator, 10 << 10);
-  TestBuddyAllocator(&buddy_allocator, 10 << 20);
-  buddy_allocator.Release();
-
-  // Greater than max chunk size
-  TestBuddyAllocator(&buddy_allocator, 600 << 20,
-                     /* use_system_allocator = */ true);
-  TestBuddyAllocator(&buddy_allocator, 1 * static_cast<size_t>(1 << 30),
-                     /* use_system_allocator = */ true);
-}
-
-TEST(BuddyAllocator, InitRealloc) {
-  FLAGS_initial_gpu_memory_in_mb = 100;
-  FLAGS_reallocate_gpu_memory_in_mb = 50;
-
-  EXPECT_EQ(platform::MLUMaxChunkSize(), static_cast<size_t>(100 << 20));
-
-  BuddyAllocator buddy_allocator(
-      std::unique_ptr<SystemAllocator>(new MLUAllocator(0)),
-      platform::MLUMinChunkSize(), platform::MLUMaxChunkSize());
-
-  // Less then initial size and reallocate size
-  TestBuddyAllocator(&buddy_allocator, 10 << 20);
-  // Between initial size and reallocate size and not exceed pool
-  TestBuddyAllocator(&buddy_allocator, 80 << 20);
-  TestBuddyAllocator(&buddy_allocator, 99 << 20);
-  // Greater than max chunk size
-  TestBuddyAllocator(&buddy_allocator, 101 << 20,
-                     /* use_system_allocator = */ true);
-  TestBuddyAllocator(&buddy_allocator, 1 * static_cast<size_t>(1 << 30),
-                     /* use_system_allocator = */ true);
-}
-
-TEST(BuddyAllocator, ReallocSizeGreaterThanInit) {
-  FLAGS_initial_gpu_memory_in_mb = 5;
-  FLAGS_reallocate_gpu_memory_in_mb = 10;
-
-  EXPECT_EQ(platform::MLUMaxChunkSize(), static_cast<size_t>(10 << 20));
-
-  BuddyAllocator buddy_allocator(
-      std::unique_ptr<SystemAllocator>(new MLUAllocator(0)),
-      platform::MLUMinChunkSize(), platform::MLUMaxChunkSize());
-
-  // Less than initial size and reallocate size
-  TestBuddyAllocator(&buddy_allocator, 1 << 20);
-  // Between initial size and reallocate size and exceed pool
-  TestBuddyAllocator(&buddy_allocator, 6 << 20);
-  TestBuddyAllocator(&buddy_allocator, 8 << 20);
-  TestBuddyAllocator(&buddy_allocator, 9 << 20);
-  // Greater than max trunk size
-  TestBuddyAllocator(&buddy_allocator, 11 << 20,
-                     /* use_system_allocator = */ true);
-  TestBuddyAllocator(&buddy_allocator, 1 * static_cast<size_t>(1 << 30),
-                     /* use_system_allocator = */ true);
-}
-
-TEST(BuddyAllocator, FractionRefillPool) {
-  FLAGS_fraction_of_gpu_memory_to_use = 0.6;
-  FLAGS_initial_gpu_memory_in_mb = 0;
-  FLAGS_reallocate_gpu_memory_in_mb = 0;
-
-  size_t max_chunk_size = platform::MLUMaxChunkSize();
-  BuddyAllocator buddy_allocator(
-      std::unique_ptr<SystemAllocator>(new MLUAllocator(0)),
-      platform::MLUMinChunkSize(), max_chunk_size);
-
-  // Less than pool size
-  int* p0 = TestBuddyAllocator(&buddy_allocator, max_chunk_size - 1000,
-                               /* use_system_allocator = */ false,
-                               /* free_ptr = */ false);
-  // Max chunk size should be same during allocation
-  EXPECT_EQ(max_chunk_size, buddy_allocator.GetMaxChunkSize());
-
-  size_t alloc =
-      platform::MLUAvailableMemToAlloc() * FLAGS_fraction_of_gpu_memory_to_use;
-  // Exceed pool trigger refilling size of fraction of avaiable mlu, and should
-  // be able to alloc 60% of the remaining MLU
-  int* p1 = TestBuddyAllocator(&buddy_allocator, alloc,
-                               /* use_system_allocator = */ false,
-                               /* free_ptr = */ false);
-  // Max chunk size should be same during allocation
-  EXPECT_EQ(max_chunk_size, buddy_allocator.GetMaxChunkSize());
-
-  alloc =
-      platform::MLUAvailableMemToAlloc() * FLAGS_fraction_of_gpu_memory_to_use;
-  // Exceed pool trigger refilling size of fraction of avaiable mlu, and should
-  // be able to alloc 60% of the remaining MLU
-  TestBuddyAllocator(&buddy_allocator, alloc,
-                     /* use_system_allocator = */ false);
-  // Max chunk size should be same during allocation
-  EXPECT_EQ(max_chunk_size, buddy_allocator.GetMaxChunkSize());
-
-  buddy_allocator.Free(p0);
-  buddy_allocator.Free(p1);
-}
-
-TEST(BuddyAllocator, AllocFromAvailable) {
-  FLAGS_fraction_of_gpu_memory_to_use = 0.7;
-  FLAGS_initial_gpu_memory_in_mb = 0;
-  FLAGS_reallocate_gpu_memory_in_mb = 0;
-
-  size_t total = 0, available = 0;
-  platform::SetMLUDeviceId(0);
-  platform::MLUMemoryUsage(&available, &total);
-
-  // Take half of available MLU
-  void* p;
-  cnrtStatus result = cnrtMalloc(&p, available >> 1);
-  EXPECT_TRUE(result == cnrtSuccess);
-
-  // BuddyAllocator should be able to alloc the remaining MLU
-  BuddyAllocator buddy_allocator(
-      std::unique_ptr<SystemAllocator>(new MLUAllocator(0)),
-      platform::MLUMinChunkSize(), platform::MLUMaxChunkSize());
-
-  TestBuddyAllocator(&buddy_allocator, 10);
-  TestBuddyAllocator(&buddy_allocator, 10 << 10);
-  TestBuddyAllocator(&buddy_allocator, 10 << 20);
-  TestBuddyAllocator(&buddy_allocator, static_cast<size_t>(1 << 30));
-
-  if (p) {
-    EXPECT_TRUE(cnrtFree(p) == cnrtSuccess);
-  }
-}
-
-TEST(BuddyAllocator, AllocFromAvailableWhenFractionIsOne) {
-  FLAGS_fraction_of_gpu_memory_to_use = 1.0;
-  FLAGS_initial_gpu_memory_in_mb = 0;
-  FLAGS_reallocate_gpu_memory_in_mb = 0;
-
-  void* p = nullptr;
-  EXPECT_TRUE(cnrtMalloc(&p, static_cast<size_t>(1) << 30) == cnrtSuccess);
-
-  // BuddyAllocator should be able to alloc the remaining MLU
-  BuddyAllocator buddy_allocator(
-      std::unique_ptr<SystemAllocator>(new MLUAllocator(0)),
-      platform::MLUMinChunkSize(), platform::MLUMaxChunkSize());
-
-  TestBuddyAllocator(&buddy_allocator, static_cast<size_t>(1) << 30);
-  TestBuddyAllocator(&buddy_allocator, static_cast<size_t>(1) << 30);
-
-  if (p) {
-    EXPECT_TRUE(cnrtFree(p) == cnrtSuccess);
-  }
-}
-
-TEST(BuddyAllocator, Release) {
-  // In a 8 GB machine, the pool size will be about 800 MB
-  FLAGS_fraction_of_gpu_memory_to_use = 0.1;
-  FLAGS_initial_gpu_memory_in_mb = 0;
-  FLAGS_reallocate_gpu_memory_in_mb = 0;
-
-  BuddyAllocator buddy_allocator(
-      std::unique_ptr<SystemAllocator>(new MLUAllocator(0)),
-      platform::MLUMinChunkSize(), platform::MLUMaxChunkSize());
-
-  // Less than pool size
-  TestBuddyAllocator(&buddy_allocator, 10);
-  TestBuddyAllocator(&buddy_allocator, 10 << 10);
-  TestBuddyAllocator(&buddy_allocator, 50 << 20);
-
-  buddy_allocator.Release();
-}
-#endif
 
 }  // namespace detail
 }  // namespace memory
 }  // namespace paddle
paddle/fluid/memory/allocation/naive_best_fit_allocator.cc
@@ -420,140 +420,6 @@ uint64_t Release<platform::CUDAPinnedPlace>(
 #endif
 }
 
-// For MLU
-#ifdef PADDLE_WITH_MLU
-class MLUBuddyAllocatorList {
- private:
-  MLUBuddyAllocatorList() : devices_(platform::GetMLUSelectedDevices()) {
-    auto mlu_num = devices_.size();
-    allocators_.resize(mlu_num);
-    init_flags_.reserve(mlu_num);
-    for (size_t i = 0; i < mlu_num; ++i) {
-      init_flags_.emplace_back(new std::once_flag());
-    }
-  }
-
-  static MLUBuddyAllocatorList *CreateNewInstance() {
-    return new MLUBuddyAllocatorList();
-  }
-
- public:
-  static MLUBuddyAllocatorList *Instance() {
-    static auto *instance = CreateNewInstance();
-    return instance;
-  }
-
-  BuddyAllocator *Get(int mlu_id) {
-    auto pos = std::distance(
-        devices_.begin(), std::find(devices_.begin(), devices_.end(), mlu_id));
-    PADDLE_ENFORCE_LT(pos, devices_.size(),
-                      platform::errors::OutOfRange(
-                          "The index exceeds the size of devices, the size of "
-                          "devices is %d, the index is %d",
-                          devices_.size(), pos));
-
-    std::call_once(*init_flags_[pos], [this, pos] {
-      platform::SetMLUDeviceId(devices_[pos]);
-      allocators_[pos].reset(
-          new BuddyAllocator(std::unique_ptr<detail::SystemAllocator>(
-                                 new detail::MLUAllocator(devices_[pos])),
-                             platform::MLUMinChunkSize(),
-                             platform::MLUMaxChunkSize()));
-      VLOG(10) << "\n\nNOTE:\n"
-               << "You can set GFlags environment variable "
-               << "(mlu reuse gpu GFlags) "
-               << "'FLAGS_fraction_of_gpu_memory_to_use' "
-               << "or 'FLAGS_initial_gpu_memory_in_mb' "
-               << "or 'FLAGS_reallocate_gpu_memory_in_mb' "
-               << "to change the memory size for MLU usage.\n"
-               << "Current 'FLAGS_fraction_of_gpu_memory_to_use' value is "
-               << FLAGS_fraction_of_gpu_memory_to_use
-               << ". Current 'FLAGS_initial_gpu_memory_in_mb' value is "
-               << FLAGS_initial_gpu_memory_in_mb
-               << ". Current 'FLAGS_reallocate_gpu_memory_in_mb' value is "
-               << FLAGS_reallocate_gpu_memory_in_mb << "\n\n";
-    });
-
-    return allocators_[pos].get();
-  }
-
- private:
-  std::vector<int> devices_;
-  std::vector<std::unique_ptr<std::once_flag>> init_flags_;
-  std::vector<std::unique_ptr<BuddyAllocator>> allocators_;
-};
-
-BuddyAllocator *GetMLUBuddyAllocator(int mlu_id) {
-  return MLUBuddyAllocatorList::Instance()->Get(mlu_id);
-}
-#endif
-
-template <>
-size_t Used<platform::MLUPlace>(const platform::MLUPlace &place) {
-#ifdef PADDLE_WITH_MLU
-  return GetMLUBuddyAllocator(place.device)->Used();
-#else
-  PADDLE_THROW(platform::errors::PermissionDenied(
-      "'MLUPlace' is not supported in CPU only device."));
-#endif
-}
-
-template <>
-void *Alloc<platform::MLUPlace>(const platform::MLUPlace &place, size_t size) {
-#ifdef PADDLE_WITH_MLU
-  auto *buddy_allocator = GetMLUBuddyAllocator(place.device);
-  auto *ptr = buddy_allocator->Alloc(size);
-  if (ptr == nullptr) {
-    platform::MLUDeviceGuard(place.device);
-    size_t avail = 0, total = 0;
-    platform::MLUMemoryUsage(&avail, &total);
-    PADDLE_THROW(platform::errors::ResourceExhausted(
-        "Cannot allocate %s in MLU %d, avaliable %s, total %s, MLUMinChunkSize "
-        "%s, MLUMinChunkSize %s, MLU memory used: %s.",
-        string::HumanReadableSize(size), place.device,
-        string::HumanReadableSize(avail), string::HumanReadableSize(total),
-        string::HumanReadableSize(buddy_allocator->GetMinChunkSize()),
-        string::HumanReadableSize(buddy_allocator->GetMaxChunkSize()),
-        string::HumanReadableSize(Used<platform::MLUPlace>(place))));
-  } else {
-    if (FLAGS_init_allocated_mem) {
-      cnrtMemset(ptr, 0xEF, size);
-    }
-  }
-  return ptr;
-#else
-  PADDLE_THROW(platform::errors::PermissionDenied(
-      "'MLUPlace' is not supported in CPU only device."));
-#endif
-}
-
-template <>
-void Free<platform::MLUPlace>(const platform::MLUPlace &place,
-                              void *p,
-                              size_t size) {
-#ifdef PADDLE_WITH_MLU
-  VLOG(10) << "Free pointer=" << p << " on " << platform::Place(place);
-  GetMLUBuddyAllocator(place.device)->Free(p);
-#else
-  PADDLE_THROW(platform::errors::PermissionDenied(
-      "'MLUPlace' is not supported in CPU only device."));
-#endif
-}
-
-template <>
-uint64_t Release<platform::MLUPlace>(const platform::MLUPlace &place) {
-#ifdef PADDLE_WITH_MLU
-  return GetMLUBuddyAllocator(place.device)->Release();
-#else
-  PADDLE_THROW(platform::errors::PermissionDenied(
-      "'MLUPlace' is not supported in CPU only device."));
-#endif
-}
-
 // For CustomDevice
 #ifdef PADDLE_WITH_CUSTOM_DEVICE
 class BuddyAllocatorList {
...
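The deleted MLUBuddyAllocatorList mirrors the pattern that remains for the other backends: a process-wide singleton that lazily builds one buddy allocator per device, with one std::once_flag per slot so concurrent first calls initialize each device exactly once. A condensed standalone sketch of that idiom, with generic names and no device API:

#include <iostream>
#include <memory>
#include <mutex>
#include <vector>

struct DeviceAllocator {
  explicit DeviceAllocator(int id) : device_id(id) {}
  int device_id;
};

class AllocatorList {
 public:
  static AllocatorList* Instance() {
    static auto* instance = new AllocatorList(/*device_count=*/4);
    return instance;
  }

  DeviceAllocator* Get(int dev_id) {
    // Each slot is initialized at most once, even under concurrent callers.
    std::call_once(*flags_[dev_id], [this, dev_id] {
      allocators_[dev_id].reset(new DeviceAllocator(dev_id));
    });
    return allocators_[dev_id].get();
  }

 private:
  explicit AllocatorList(int n) : allocators_(n) {
    for (int i = 0; i < n; ++i) flags_.emplace_back(new std::once_flag());
  }
  std::vector<std::unique_ptr<DeviceAllocator>> allocators_;
  std::vector<std::unique_ptr<std::once_flag>> flags_;
};

int main() {
  std::cout << AllocatorList::Instance()->Get(2)->device_id << "\n";  // prints 2
  return 0;
}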
paddle/fluid/memory/allocation/naive_best_fit_allocator_test.cc
@@ -61,21 +61,6 @@ TEST(NaiveBestFitAllocatorTest, CudaPinnedAlloc) {
 }
 #endif
-#ifdef PADDLE_WITH_MLU
-TEST(NaiveBestFitAllocatorTest, MluAlloc) {
-  NaiveBestFitAllocator alloc{platform::MLUPlace(0)};
-  {
-    size_t size = (1 << 20);
-    auto allocation = alloc.Allocate(size);
-  }
-  sleep(10);
-  alloc.Release(platform::MLUPlace(0));
-
-  size_t size = (1 << 20);
-  auto allocation = alloc.Allocate(size);
-  alloc.Release(platform::MLUPlace(0));
-}
-#endif
 
 }  // namespace allocation
 }  // namespace memory
 }  // namespace paddle
paddle/fluid/memory/allocation/system_allocator.cc
@@ -31,9 +31,6 @@ limitations under the License. */
 #include "paddle/fluid/platform/device/gpu/gpu_info.h"
 #include "paddle/fluid/platform/enforce.h"
 #include "paddle/phi/backends/cpu/cpu_info.h"
-#ifdef PADDLE_WITH_MLU
-#include "paddle/fluid/platform/device/mlu/mlu_info.h"
-#endif
 #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
 #include "paddle/fluid/platform/cuda_device_guard.h"
...
@@ -287,78 +284,6 @@ bool CUDAPinnedAllocator::UseGpu() const { return false; }
 #endif
 
-#ifdef PADDLE_WITH_MLU
-void* MLUAllocator::Alloc(size_t* index, size_t size) {
-  if (size <= 0) return nullptr;
-
-  void* p;
-  auto result = platform::RecordedMLUMalloc(&p, size, mlu_id_);
-
-  if (result == cnrtSuccess) {
-    *index = 0;
-    mlu_alloc_size_ += size;
-    return p;
-  } else {
-    size_t avail, total, actual_avail, actual_total;
-    bool is_limited = platform::RecordedMLUMemGetInfo(
-        &avail, &total, &actual_avail, &actual_total, mlu_id_);
-    size_t allocated = total - avail;
-
-    std::string err_msg;
-    if (is_limited) {
-      auto limit_size = (total >> 20);
-      err_msg = string::Sprintf(
-          "\n   3) Set environment variable `FLAGS_gpu_memory_limit_mb` to a "
-          "larger value. Currently `FLAGS_gpu_memory_limit_mb` is %d, so the "
-          "maximum MLU memory usage is limited to %d MB.\n"
-          "      The command is `export FLAGS_gpu_memory_limit_mb=xxx`.",
-          limit_size, limit_size);
-    }
-
-    PADDLE_THROW_BAD_ALLOC(platform::errors::ResourceExhausted(
-        "\n\nOut of memory error on MLU %d. "
-        "Cannot allocate %s memory on MLU %d, %s memory has been allocated and "
-        "available memory is only %s.\n\n"
-        "Please check whether there is any other process using MLU %d.\n"
-        "1. If yes, please stop them, or start PaddlePaddle on another MLU.\n"
-        "2. If no, please try one of the following suggestions:\n"
-        "   1) Decrease the batch size of your model.\n"
-        "   2) FLAGS_fraction_of_gpu_memory_to_use is %.2lf now, "
-        "please set it to a higher value but less than 1.0.\n"
-        "      The command is "
-        "`export FLAGS_fraction_of_gpu_memory_to_use=xxx`.%s\n\n",
-        mlu_id_, string::HumanReadableSize(size), mlu_id_,
-        string::HumanReadableSize(allocated), string::HumanReadableSize(avail),
-        mlu_id_, FLAGS_fraction_of_gpu_memory_to_use, err_msg));
-  }
-}
-
-void MLUAllocator::Free(void* p, size_t size, size_t index) {
-  PADDLE_ENFORCE_EQ(index, 0,
-                    platform::errors::InvalidArgument(
-                        "The index should be 0, index is %d", index));
-  PADDLE_ENFORCE_GE(mlu_alloc_size_, size,
-                    platform::errors::InvalidArgument(
-                        "The size of memory (%d) to free exceeds the size of "
-                        "allocated gpu memory (%d)",
-                        size, mlu_alloc_size_));
-  mlu_alloc_size_ -= size;
-
-  platform::RecordedMLUFree(p, size, mlu_id_);
-}
-
-bool MLUAllocator::UseGpu() const { return true; }
-#endif
-
 #ifdef PADDLE_WITH_CUSTOM_DEVICE
 void* CustomAllocator::Alloc(size_t* index, size_t size) {
   if (size <= 0) return nullptr;
...
paddle/fluid/memory/allocation/system_allocator.h
@@ -68,21 +68,6 @@ class CUDAPinnedAllocator : public SystemAllocator {
 };
 #endif
 
-#ifdef PADDLE_WITH_MLU
-class MLUAllocator : public SystemAllocator {
- public:
-  explicit MLUAllocator(int mlu_id) : mlu_id_(mlu_id) {}
-
-  virtual void* Alloc(size_t* index, size_t size);
-  virtual void Free(void* p, size_t size, size_t index);
-  virtual bool UseGpu() const;
-
- private:
-  size_t mlu_alloc_size_ = 0;
-  int mlu_id_;
-};
-#endif
-
 #ifdef PADDLE_WITH_CUSTOM_DEVICE
 class CustomAllocator : public SystemAllocator {
  public:
...
paddle/fluid/memory/allocation/system_allocator_test.cc
@@ -82,23 +82,3 @@ TEST(GPUAllocator, AllocFailure) {
   }
 }
 #endif
-
-#ifdef PADDLE_WITH_MLU
-TEST(MLUAllocator, Alloc) {
-  paddle::memory::detail::MLUAllocator a(0);
-  TestAllocator(&a, 2048);
-  TestAllocator(&a, 0);
-}
-
-TEST(MLUAllocator, AllocFailure) {
-  paddle::memory::detail::MLUAllocator allocator(0);
-  size_t index;
-  size_t alloc_size = (static_cast<size_t>(1) << 40);  // Very large number
-  try {
-    allocator.Alloc(&index, alloc_size);
-    ASSERT_TRUE(false);
-  } catch (paddle::memory::allocation::BadAlloc&) {
-    PADDLE_ENFORCE_MLU_SUCCESS(cnrtGetLastError());
-  }
-}
-#endif
paddle/fluid/memory/memcpy.cc
@@ -23,10 +23,6 @@ limitations under the License. */
 #include "paddle/fluid/platform/device/xpu/xpu_header.h"
 #endif
 
-#ifdef PADDLE_WITH_MLU
-#include "paddle/fluid/platform/device/mlu/mlu_info.h"
-#endif
-
 namespace paddle {
 namespace memory {
...
@@ -736,226 +732,6 @@ void Copy<phi::Place, phi::GPUPinnedPlace>(phi::Place dst_place,
 }
 #endif
 
-#ifdef PADDLE_WITH_MLU
-template <>
-void Copy<platform::CPUPlace, platform::MLUPlace>(platform::CPUPlace dst_place,
-                                                  void* dst,
-                                                  platform::MLUPlace src_place,
-                                                  const void* src,
-                                                  size_t num,
-                                                  void* stream) {
-  if (UNLIKELY(num == 0)) return;
-
-  platform::SetMLUDeviceId(src_place.device);
-  if (stream) {
-    VLOG(4) << "Async memory::Copy " << num << " Bytes from " << src_place
-            << " to " << dst_place << " by mlu stream(" << stream << ")";
-    platform::RecordEvent record_event(
-        "MLUMemcpyD2HAsync:MLU->CPU", platform::TracerEventType::UserDefined, 1);
-    platform::MLUMemcpyD2HAsync(dst, src, num,
-                                reinterpret_cast<mluStream>(stream));
-  } else {
-    platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance();
-    static_cast<platform::MLUDeviceContext*>(pool.Get(src_place))->Wait();
-
-    VLOG(4) << "Sync memory::Copy " << num << " Bytes from " << src_place
-            << " to " << dst_place;
-    platform::RecordEvent record_event(
-        "MLUMemcpyD2HSync:MLU->CPU", platform::TracerEventType::UserDefined, 1);
-    platform::MLUMemcpyD2HSync(dst, src, num);
-  }
-}
-
-template <>
-void Copy<platform::MLUPlace, platform::CPUPlace>(platform::MLUPlace dst_place,
-                                                  void* dst,
-                                                  platform::CPUPlace src_place,
-                                                  const void* src,
-                                                  size_t num,
-                                                  void* stream) {
-  if (UNLIKELY(num == 0)) return;
-
-  platform::SetMLUDeviceId(dst_place.device);
-  if (stream) {
-    VLOG(4) << "Async memory::Copy " << num << " Bytes from " << src_place
-            << " to " << dst_place << " by mlu stream(" << stream << ")";
-    platform::RecordEvent record_event(
-        "MLUMemcpyH2DAsync:CPU->MLU", platform::TracerEventType::UserDefined, 1);
-    platform::MLUMemcpyH2DAsync(dst, src, num,
-                                reinterpret_cast<mluStream>(stream));
-  } else {
-    platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance();
-    static_cast<platform::MLUDeviceContext*>(pool.Get(dst_place))->Wait();
-
-    VLOG(4) << "Sync memory::Copy " << num << " Bytes from " << src_place
-            << " to " << dst_place;
-    platform::RecordEvent record_event(
-        "MLUMemcpyH2DSync:CPU->MLU", platform::TracerEventType::UserDefined, 1);
-    platform::MLUMemcpyH2DSync(dst, src, num);
-  }
-}
-
-template <>
-void Copy<platform::MLUPlace, platform::MLUPlace>(platform::MLUPlace dst_place,
-                                                  void* dst,
-                                                  platform::MLUPlace src_place,
-                                                  const void* src,
-                                                  size_t num,
-                                                  void* stream) {
-  if (UNLIKELY(num == 0)) return;
-
-  if (dst_place == src_place) {
-    platform::SetMLUDeviceId(dst_place.device);
-    if (stream) {
-      VLOG(4) << "Async memory::Copy " << num << " Bytes from " << src_place
-              << " to " << dst_place << " by mlu stream(" << stream << ")";
-      platform::RecordEvent record_event(
-          "MLUMemcpyD2DAsync(same_mlu):MLU->MLU",
-          platform::TracerEventType::UserDefined, 1);
-      platform::MLUMemcpyD2DAsync(dst, src, num,
-                                  reinterpret_cast<mluStream>(stream));
-    } else {
-      platform::DeviceContextPool& pool =
-          platform::DeviceContextPool::Instance();
-      static_cast<platform::MLUDeviceContext*>(pool.Get(src_place))->Wait();
-
-      VLOG(4) << "Sync memory::Copy " << num << " Bytes from " << src_place
-              << " to " << dst_place;
-      platform::RecordEvent record_event(
-          "MLUMemcpyD2DSync(same_mlu):MLU->MLU",
-          platform::TracerEventType::UserDefined, 1);
-      platform::MLUMemcpyD2DSync(dst, src, num);
-    }
-  } else {
-    if (stream) {
-      VLOG(4) << "Async memory::Copy " << num << " Bytes from " << src_place
-              << " to " << dst_place << " by mlu stream(" << stream << ")";
-      platform::RecordEvent record_event(
-          "MLUMemcpyPeerAsync:MLU->MLU",
-          platform::TracerEventType::UserDefined, 1);
-      platform::MLUMemcpyPeerAsync(dst, dst_place.device, src,
-                                   src_place.device, num,
-                                   reinterpret_cast<mluStream>(stream));
-    } else {
-      VLOG(4) << "Sync memory::Copy " << num << " Bytes from " << src_place
-              << " to " << dst_place;
-      platform::RecordEvent record_event(
-          "MLUMemcpyPeerSync:MLU->MLU",
-          platform::TracerEventType::UserDefined, 1);
-      platform::MLUMemcpyPeerSync(dst, dst_place.device, src,
-                                  src_place.device, num);
-    }
-  }
-}
-
-// NOTE: only for CPUPlace and MLUPlace.
-template <>
-void Copy<phi::Place, phi::Place>(phi::Place dst_place,
-                                  void* dst,
-                                  phi::Place src_place,
-                                  const void* src,
-                                  size_t num,
-                                  void* stream) {
-  if (src_place.GetType() == phi::AllocationType::CPU &&
-      dst_place.GetType() == phi::AllocationType::CPU) {
-    platform::CPUPlace place_dst, place_src;
-    return Copy(place_dst, dst, place_src, src, num);
-  } else if (src_place.GetType() == phi::AllocationType::CPU &&
-             dst_place.GetType() == phi::AllocationType::MLU) {
-    platform::MLUPlace place_dst(dst_place.GetDeviceId());
-    platform::CPUPlace place_src;
-    return Copy(place_dst, dst, place_src, src, num, stream);
-  } else if (src_place.GetType() == phi::AllocationType::MLU &&
-             dst_place.GetType() == phi::AllocationType::CPU) {
-    platform::MLUPlace place_src(src_place.GetDeviceId());
-    platform::CPUPlace place_dst;
-    return Copy(place_dst, dst, place_src, src, num, stream);
-  } else if (src_place.GetType() == phi::AllocationType::MLU &&
-             dst_place.GetType() == phi::AllocationType::MLU) {
-    platform::MLUPlace place_src(src_place.GetDeviceId());
-    platform::MLUPlace place_dst(dst_place.GetDeviceId());
-    return Copy(place_dst, dst, place_src, src, num, stream);
-#ifdef PADDLE_WITH_CUSTOM_DEVICE
-  } else if (src_place.GetType() == phi::AllocationType::CPU &&  // NOLINT
-             dst_place.GetType() == phi::AllocationType::CUSTOM) {
-    platform::CPUPlace place_src;
-    platform::CustomPlace place_dst(dst_place);
-    return Copy(place_dst, dst, place_src, src, num, stream);
-  } else if (src_place.GetType() == phi::AllocationType::CUSTOM &&  // NOLINT
-             dst_place.GetType() == phi::AllocationType::CPU) {
-    platform::CustomPlace place_src(src_place);
-    platform::CPUPlace place_dst;
-    return Copy(place_dst, dst, place_src, src, num, stream);
-  } else if (src_place.GetType() == phi::AllocationType::CUSTOM &&  // NOLINT
-             dst_place.GetType() == phi::AllocationType::CUSTOM) {
-    platform::CustomPlace place_src(src_place);
-    platform::CustomPlace place_dst(dst_place);
-    return Copy(place_dst, dst, place_src, src, num, stream);
-#endif
-  }
-}
-
-// NOTE: only for (CPUPlace and MLUPlace) -> (MLUPlace)
-template <>
-void Copy<phi::MLUPlace, phi::Place>(phi::MLUPlace dst_place,
-                                     void* dst,
-                                     phi::Place src_place,
-                                     const void* src,
-                                     size_t num,
-                                     void* stream) {
-  Copy(phi::Place(dst_place.GetType(), dst_place.GetDeviceId()), dst,
-       src_place, src, num, stream);
-}
-
-// NOTE: only for (MLUPlace) -> (CPUPlace and MLUPlace)
-template <>
-void Copy<phi::Place, phi::MLUPlace>(phi::Place dst_place,
-                                     void* dst,
-                                     phi::MLUPlace src_place,
-                                     const void* src,
-                                     size_t num,
-                                     void* stream) {
-  Copy(dst_place, dst,
-       phi::Place(src_place.GetType(), src_place.GetDeviceId()), src, num,
-       stream);
-}
-
-// NOTE: only for (MLUPlace) -> (CPUPlace) with mluStream.
-template <>
-void Copy<phi::CPUPlace, phi::Place>(phi::CPUPlace dst_place,
-                                     void* dst,
-                                     phi::Place src_place,
-                                     const void* src,
-                                     size_t num,
-                                     void* stream) {
-  Copy(phi::Place(dst_place.GetType()), dst, src_place, src, num, stream);
-}
-
-// NOTE: only for (CPUPlace) -> (MLUPlace) with mluStream.
-template <>
-void Copy<phi::Place, phi::CPUPlace>(phi::Place dst_place,
-                                     void* dst,
-                                     phi::CPUPlace src_place,
-                                     const void* src,
-                                     size_t num,
-                                     void* stream) {
-  Copy(dst_place, dst, phi::Place(src_place.GetType()), src, num, stream);
-}
-#endif  // PADDLE_WITH_MLU
-
 // NOTE: Only for CPUPlace, XPUPlace and PinnedPlace.
 template <>
 void Copy<phi::Place, phi::Place>(phi::Place dst_place,
...
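The removed Copy<phi::Place, phi::Place> specialization is essentially a dispatcher: it inspects the AllocationType of the source and destination places and forwards to the concrete place-to-place overload. A standalone sketch of that dispatch shape, using a simplified enum and plain memcpy in place of the real device copies:

#include <cstring>
#include <iostream>

enum class AllocationType { kCPU, kDevice };

struct Place {
  AllocationType type;
  int device_id = 0;
};

// Concrete overloads; here every path just uses memcpy for illustration.
void CopyCpuToCpu(void* dst, const void* src, size_t n) { std::memcpy(dst, src, n); }
void CopyCpuToDevice(void* dst, const void* src, size_t n) { std::memcpy(dst, src, n); }
void CopyDeviceToCpu(void* dst, const void* src, size_t n) { std::memcpy(dst, src, n); }

// Generic entry point that dispatches on the (src, dst) place kinds.
void Copy(Place dst_place, void* dst, Place src_place, const void* src, size_t n) {
  if (src_place.type == AllocationType::kCPU &&
      dst_place.type == AllocationType::kCPU) {
    CopyCpuToCpu(dst, src, n);
  } else if (src_place.type == AllocationType::kCPU) {
    CopyCpuToDevice(dst, src, n);
  } else if (dst_place.type == AllocationType::kCPU) {
    CopyDeviceToCpu(dst, src, n);
  }
}

int main() {
  char src[4] = {'a', 'b', 'c', 0};
  char dst[4] = {};
  Copy({AllocationType::kCPU}, dst, {AllocationType::kCPU}, src, sizeof(src));
  std::cout << dst << "\n";  // prints abc
  return 0;
}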
paddle/fluid/memory/memcpy.h
@@ -16,9 +16,6 @@ limitations under the License. */
 #include "paddle/fluid/platform/device/gpu/gpu_info.h"
 #include "paddle/fluid/platform/place.h"
 
-#ifdef PADDLE_WITH_MLU
-#include "paddle/fluid/platform/device/mlu/device_context.h"
-#endif
-
 namespace paddle {
 namespace memory {
...
paddle/fluid/operators/CMakeLists.txt
@@ -55,10 +55,6 @@ if (WITH_LITE)
   add_subdirectory(lite)
 endif()
 
-if (WITH_MLU)
-  add_subdirectory(mlu)
-endif()
-
 if (WITH_CINN)
   add_subdirectory(cinn)
 endif()
...
@@ -135,10 +131,6 @@ if (WITH_ASCEND_CL)
   op_library(sync_batch_norm_op)
 endif()
 
-if (WITH_MLU)
-  op_library(sync_batch_norm_op)
-endif()
-
 op_library(lstm_op DEPS ${OP_HEADER_DEPS} lstm_compute)
 op_library(recurrent_op DEPS ${OP_HEADER_DEPS})
...
paddle/fluid/operators/amp/check_finite_and_unscale_op_mlu.cc
deleted  100644 → 0

/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */

#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/amp/fp16_type_traits.h"
#include "paddle/fluid/operators/mlu/mlu_baseop.h"

namespace paddle {
namespace operators {

template <typename T>
class CheckFiniteAndUnscaleMLUKernel : public framework::OpKernel<T> {
  using MPDType = typename details::MPTypeTrait<T>::Type;

 public:
  void Compute(const framework::ExecutionContext& ctx) const {
    auto& dev_ctx = ctx.template device_context<platform::MLUDeviceContext>();
    const auto xs = ctx.MultiInput<phi::DenseTensor>("X");
    const auto* scale = ctx.Input<phi::DenseTensor>("Scale");
    auto outs = ctx.MultiOutput<phi::DenseTensor>("Out");
    auto* found_inf = ctx.Output<phi::DenseTensor>("FoundInfinite");

    found_inf->mutable_data<bool>(dev_ctx.GetPlace());

    MLUCnnlTensorDesc scale_desc(*scale);
    MLUCnnlTensorDesc found_inf_desc(
        *found_inf, CNNL_LAYOUT_ARRAY, ToCnnlDataType<bool>());

    for (size_t i = 0; i < xs.size(); ++i) {
      const auto* x = xs[i];
      auto* out = outs[i];
      out->mutable_data<T>(ctx.GetPlace());

      // check is_finite or is_nan
      phi::DenseTensor is_finite(found_inf->type());
      if (i != 0) {
        is_finite.Resize(phi::make_ddim({1}));
        is_finite.mutable_data<bool>(ctx.GetPlace());
      } else {
        is_finite.ShareDataWith(*found_inf);
      }

      MLUCnnlTensorDesc x_desc(*x);
      MLUCnnlTensorDesc out_desc(*out);

      MLUCnnl::IsNanInf(
          ctx, x_desc.get(), GetBasePtr(x), GetBasePtr(&is_finite));

      // save is_finite by logical_and op after checking every input
      if (i != 0) {
        MLUCnnlTensorDesc is_finite_desc(
            is_finite, CNNL_LAYOUT_ARRAY, ToCnnlDataType<bool>());
        MLUCnnl::Logic(ctx, CNNL_LOGIC_OP_OR,
                       found_inf_desc.get(), GetBasePtr(found_inf),
                       is_finite_desc.get(), GetBasePtr(&is_finite),
                       found_inf_desc.get(), GetBasePtr(found_inf));
      }

      // The normal logic is :
      // out = in, if found_inf = true
      // out = in/scale, if found_inf = false
      // But when found_inf is true, the data of Out should not be used.
      // So, on MLU, we always compute out with in/scale.
      phi::DenseTensor float_x;
      phi::DenseTensor float_out;
      if (std::is_same<T, paddle::platform::float16>::value) {
        float_x.Resize(x->dims());
        float_out.Resize(out->dims());
        float_x.mutable_data<MPDType>(ctx.GetPlace());
        float_out.mutable_data<MPDType>(ctx.GetPlace());

        MLUCnnlTensorDesc float_x_desc(float_x);
        MLUCnnlTensorDesc float_out_desc(float_out);
        auto cast_fp16_type =
            GetCastDataType(DataType::FLOAT16, DataType::FLOAT32);
        MLUCnnl::Cast(ctx, cast_fp16_type, x_desc.get(), GetBasePtr(x),
                      float_x_desc.get(), GetBasePtr(&float_x));

        MLUCnnl::Div(ctx, CNNL_COMPUTATION_HIGH_PRECISION,
                     float_x_desc.get(), GetBasePtr(&float_x),
                     scale_desc.get(), GetBasePtr(scale),
                     float_out_desc.get(), GetBasePtr(&float_out));

        auto cast_fp32_type =
            GetCastDataType(DataType::FLOAT32, DataType::FLOAT16);
        MLUCnnl::Cast(ctx, cast_fp32_type, float_out_desc.get(),
                      GetBasePtr(&float_out), out_desc.get(), GetBasePtr(out));
      } else {
        MLUCnnl::Div(ctx, CNNL_COMPUTATION_HIGH_PRECISION,
                     x_desc.get(), GetBasePtr(x),
                     scale_desc.get(), GetBasePtr(scale),
                     out_desc.get(), GetBasePtr(out));
      }
    }
  }
};

}  // namespace operators
}  // namespace paddle

namespace ops = paddle::operators;
namespace plat = paddle::platform;
REGISTER_OP_MLU_KERNEL(check_finite_and_unscale,
                       ops::CheckFiniteAndUnscaleMLUKernel<float>,
                       ops::CheckFiniteAndUnscaleMLUKernel<plat::float16>);
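The comment block inside the deleted kernel states the op's contract: found_inf becomes true if any input holds a NaN/Inf, and every output is the corresponding input divided by Scale (the MLU path always divides because Out is ignored when found_inf is set). A tiny CPU reference of that contract, using plain std::vector rather than the Paddle kernel machinery:

#include <cmath>
#include <iostream>
#include <vector>

// Reference semantics of check_finite_and_unscale on one float input tensor.
void CheckFiniteAndUnscale(const std::vector<float>& x, float scale,
                           std::vector<float>* out, bool* found_inf) {
  out->resize(x.size());
  for (size_t i = 0; i < x.size(); ++i) {
    if (!std::isfinite(x[i])) *found_inf = true;
    // Out is always x / scale; callers ignore it when found_inf is true.
    (*out)[i] = x[i] / scale;
  }
}

int main() {
  std::vector<float> x = {2.0f, 4.0f}, out;
  bool found_inf = false;
  CheckFiniteAndUnscale(x, /*scale=*/2.0f, &out, &found_inf);
  std::cout << out[0] << " " << out[1] << " " << found_inf << "\n";  // 1 2 0
  return 0;
}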
paddle/fluid/operators/cast_op.cc
@@ -21,9 +21,7 @@ limitations under the License. */
 #include "paddle/phi/core/infermeta_utils.h"
 #include "paddle/phi/infermeta/unary.h"
-#ifdef PADDLE_WITH_MLU
-#include "paddle/fluid/operators/mlu/mlu_baseop.h"
-#endif
 #include "paddle/fluid/prim/api/composite_backward/composite_backward_api.h"
 #include "paddle/fluid/prim/utils/static/composite_grad_desc_maker.h"
 #include "paddle/fluid/prim/utils/static/desc_tensor.h"
...
@@ -119,21 +117,6 @@ class CastOp : public framework::OperatorWithKernel {
     }
     // NOTE(jiahongyu): Above codes originally enclosed by PADDLE_WITH_MKLDNN
 
-#ifdef PADDLE_WITH_MLU
-    auto src_type = static_cast<VT::Type>(ctx.Attr<int>("in_dtype"));
-    auto dst_type = static_cast<VT::Type>(ctx.Attr<int>("out_dtype"));
-    if (src_type == dst_type || MLUSupportsCast(src_type, dst_type)) {
-      return phi::KernelKey(framework::TransToProtoVarType(tensor->dtype()),
-                            tensor_place);
-    } else {
-      VLOG(3) << "MLU not support cast type: "
-              << framework::DataTypeToString(src_type)
-              << " to type: " << framework::DataTypeToString(dst_type)
-              << ", fallbacking to CPU one!";
-      return phi::KernelKey(framework::TransToProtoVarType(tensor->dtype()),
-                            platform::CPUPlace());
-    }
-#endif
     return phi::KernelKey(framework::TransToProtoVarType(tensor->dtype()),
                           tensor_place);
   }
...
paddle/fluid/operators/coalesce_tensor_op.cc
@@ -23,9 +23,6 @@
 #include "paddle/phi/kernels/funcs/math_function.h"
 #include "paddle/fluid/framework/convert_utils.h"
-#ifdef PADDLE_WITH_MLU
-#include "paddle/fluid/operators/mlu/mlu_baseop.h"
-#endif
 #include "paddle/fluid/framework/infershape_utils.h"
 #include "paddle/phi/infermeta/multiary.h"
...
@@ -57,17 +54,8 @@ struct FillConstantVisitor {
   void apply(typename std::enable_if<!(std::is_same<T, int8_t>::value ||
                                        std::is_same<T, int16_t>::value)>::type
                  * = nullptr) const {
-#if defined(PADDLE_WITH_MLU)
-    if (platform::is_mlu_place(context_.GetPlace())) {
-      FillMLUTensorWithHostValue<T>(context_, static_cast<T>(value_), tensor_);
-    } else {
-      phi::funcs::SetConstant<DeviceContext, T> set_constant;
-      set_constant(dev_ctx_, tensor_, static_cast<T>(value_));
-    }
-#else
     phi::funcs::SetConstant<DeviceContext, T> set_constant;
     set_constant(dev_ctx_, tensor_, static_cast<T>(value_));
-#endif
   }
 
   const DeviceContext& dev_ctx_;
...
@@ -509,14 +497,6 @@ REGISTER_OPERATOR(coalesce_tensor,
 namespace ops = paddle::operators;
 namespace plat = paddle::platform;
 
-#if defined(PADDLE_WITH_MLU)
-REGISTER_OP_MLU_KERNEL(
-    coalesce_tensor,
-    ops::CoalesceTensorOpKernel<phi::CPUContext, plat::float16>,
-    ops::CoalesceTensorOpKernel<phi::CPUContext, int>,
-    ops::CoalesceTensorOpKernel<phi::CPUContext, float>);
-#endif
-
 REGISTER_OP_VERSION(coalesce_tensor)
     .AddCheckpoint(
         R"ROC(
...
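FillConstantVisitor::apply above relies on std::enable_if to split the int8/int16 case from every other element type. A minimal standalone sketch of that SFINAE overload split, with generic names:

#include <cstdint>
#include <iostream>
#include <type_traits>

template <typename T>
struct FillVisitor {
  // Chosen when T is int8_t or int16_t.
  template <typename U = T>
  typename std::enable_if<std::is_same<U, std::int8_t>::value ||
                          std::is_same<U, std::int16_t>::value>::type
  apply() const { std::cout << "small-int path\n"; }

  // Chosen for every other element type.
  template <typename U = T>
  typename std::enable_if<!(std::is_same<U, std::int8_t>::value ||
                            std::is_same<U, std::int16_t>::value)>::type
  apply() const { std::cout << "generic path\n"; }
};

int main() {
  FillVisitor<float>().apply();         // generic path
  FillVisitor<std::int16_t>().apply();  // small-int path
  return 0;
}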
paddle/fluid/operators/collective/barrier_op_mlu.cc
deleted  100644 → 0

/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */

#include "paddle/fluid/operators/collective/barrier_op.h"

#if defined(PADDLE_WITH_CNCL)
#include "paddle/fluid/platform/collective_helper.h"
#include "paddle/fluid/platform/device/mlu/cncl_helper.h"
#endif

namespace paddle {
namespace operators {

template <typename T>
class BarrierOpMLUKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& ctx) const override {
#if defined(PADDLE_WITH_CNCL)
    auto in = ctx.Input<phi::DenseTensor>("X");
    auto out = ctx.Output<phi::DenseTensor>("Out");

    auto place = ctx.GetPlace();
    cnclDataType_t dtype =
        platform::ToCNCLDataType(framework::TransToProtoVarType(in->dtype()));
    int64_t numel = in->numel();
    const void* sendbuff = in->data();
    void* recvbuff = out->mutable_data<T>(place);

    int rid = ctx.Attr<int>("ring_id");
    auto cncl_comm = platform::CNCLCommContext::Instance().Get(rid, place);
    auto* comm = cncl_comm->comm();
    auto comm_stream = cncl_comm->stream();
    auto& dev_ctx =
        ctx.template device_context<paddle::platform::MLUDeviceContext>();
    cnclReduceOp_t cncl_red_type = cnclSum;
    dev_ctx.Wait();
    PADDLE_ENFORCE_MLU_SUCCESS(cnclAllReduce(
        sendbuff, recvbuff, numel, dtype, cncl_red_type, comm, comm_stream));
    PADDLE_ENFORCE_MLU_SUCCESS(cnrtQueueSync(comm_stream));
#else
    PADDLE_THROW(platform::errors::Unavailable(
        "PaddlePaddle should compile with CNCL."));
#endif
  }
};

}  // namespace operators
}  // namespace paddle

namespace ops = paddle::operators;
namespace plat = paddle::platform;

REGISTER_OP_MLU_KERNEL(barrier, ops::BarrierOpMLUKernel<int>);
paddle/fluid/operators/collective/c_allgather_op_mlu.cc
deleted  100644 → 0

/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */

#include "paddle/fluid/operators/collective/c_allgather_op.h"

#include "paddle/fluid/operators/mlu/mlu_baseop.h"

#if defined(PADDLE_WITH_CNCL)
#include "paddle/fluid/platform/collective_helper.h"
#include "paddle/fluid/platform/device/mlu/cncl_helper.h"
#endif
#include "paddle/fluid/framework/convert_utils.h"

namespace paddle {
namespace operators {

template <typename T>
class CAllGatherOpMLUKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& ctx) const override {
    auto place = ctx.GetPlace();
    auto dev_ctx = platform::DeviceContextPool::Instance().Get(place);
#if defined(PADDLE_WITH_CNCL)
    auto x = ctx.Input<phi::DenseTensor>("X");
    auto out = ctx.Output<phi::DenseTensor>("Out");
    int nranks = ctx.Attr<int>("nranks");
    int rid = ctx.Attr<int>("ring_id");

    auto comm = platform::CNCLCommContext::Instance().Get(rid, place);
    PADDLE_ENFORCE_EQ(
        nranks, comm->nranks(),
        platform::errors::InvalidArgument(
            "nranks: %s should equal to %s", nranks, comm->nranks()));

    framework::DDim out_dims = x->dims();
    out_dims[0] *= nranks;
    out->mutable_data<T>(out_dims, place);

    uint32_t send_numel = x->numel();
    void* send_buff;
    void* recv_buff;
    phi::DenseTensor in_tensor, out_tensor;
    if (framework::TransToProtoVarType(x->dtype()) ==
        framework::proto::VarType::INT64) {
      // cast from int64 to int32 since cncl do not support int64
      in_tensor.mutable_data<int32_t>(x->dims(), place);
      out_tensor.mutable_data<int32_t>(out->dims(), place);
      MLUCnnlTensorDesc x_int64_desc(*x);
      MLUCnnlTensorDesc x_int32_desc(in_tensor);
      cnnlCastDataType_t cast_type = GetCastDataType(VT::INT64, VT::INT32);
      MLUCnnl::Cast(ctx, cast_type, x_int64_desc.get(), GetBasePtr(x),
                    x_int32_desc.get(), GetBasePtr(&in_tensor));
      send_buff = reinterpret_cast<void*>(in_tensor.data<int32_t>());
      recv_buff = reinterpret_cast<void*>(out_tensor.data<int32_t>());
    } else {
      in_tensor.ShareDataWith(*x);
      out_tensor.ShareDataWith(*out);
      send_buff = reinterpret_cast<void*>(in_tensor.data<T>());
      recv_buff = reinterpret_cast<void*>(out_tensor.data<T>());
    }

    mluStream stream = nullptr;
    if (ctx.Attr<bool>("use_calc_stream")) {
      stream = static_cast<platform::MLUDeviceContext*>(dev_ctx)->stream();
    } else {
      stream = comm->stream();
    }
    cnclDataType_t dtype = platform::ToCNCLDataType(
        framework::TransToProtoVarType(in_tensor.dtype()));

    PADDLE_ENFORCE_MLU_SUCCESS(cnclAllGather(
        send_buff, recv_buff, send_numel, dtype, comm->comm(), stream));
    if (framework::TransToProtoVarType(x->dtype()) ==
        framework::proto::VarType::INT64) {
      // cast back from int64 out_tensor to out
      MLUCnnlTensorDesc out_int64_desc(*out);
      MLUCnnlTensorDesc out_int32_desc(out_tensor);
      cnnlCastDataType_t cast_type = GetCastDataType(VT::INT32, VT::INT64);
      MLUCnnl::Cast(ctx, cast_type, out_int32_desc.get(),
                    GetBasePtr(&out_tensor), out_int64_desc.get(),
                    GetBasePtr(out));
    }
#else
    PADDLE_THROW(platform::errors::PreconditionNotMet(
        "PaddlePaddle should compile with MLU."));
#endif
  }
};

}  // namespace operators
}  // namespace paddle

namespace ops = paddle::operators;
namespace plat = paddle::platform;

REGISTER_OP_MLU_KERNEL(c_allgather,
                       ops::CAllGatherOpMLUKernel<float>,
                       ops::CAllGatherOpMLUKernel<uint8_t>,
                       ops::CAllGatherOpMLUKernel<int>,
                       ops::CAllGatherOpMLUKernel<int8_t>,
                       ops::CAllGatherOpMLUKernel<int16_t>,
                       ops::CAllGatherOpMLUKernel<int64_t>,
                       ops::CAllGatherOpMLUKernel<plat::float16>);
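The deleted allgather kernel works around the limitation noted in its comment: the collective library lacks an int64 type, so int64 tensors are narrowed to int32 before the collective and widened back afterwards. A standalone sketch of that narrow-gather-widen shape, with a fake gather that just concatenates local buffers instead of a communication library:

#include <cstdint>
#include <iostream>
#include <vector>

// Stand-in for the collective: every "rank" contributes the same buffer here.
std::vector<int32_t> FakeAllGather(const std::vector<int32_t>& local, int nranks) {
  std::vector<int32_t> out;
  for (int r = 0; r < nranks; ++r) out.insert(out.end(), local.begin(), local.end());
  return out;
}

int main() {
  std::vector<int64_t> x = {1, 2, 3};
  int nranks = 2;

  // Narrow to int32 (assumes values fit), run the collective, widen back.
  std::vector<int32_t> x32(x.begin(), x.end());
  std::vector<int32_t> gathered32 = FakeAllGather(x32, nranks);
  std::vector<int64_t> gathered(gathered32.begin(), gathered32.end());

  for (int64_t v : gathered) std::cout << v << " ";  // 1 2 3 1 2 3
  std::cout << "\n";
  return 0;
}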
paddle/fluid/operators/collective/c_allreduce_max_op_mlu.cc
deleted  100644 → 0

/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */

#include "paddle/fluid/operators/collective/c_allreduce_op.h"

namespace ops = paddle::operators;
namespace plat = paddle::platform;

REGISTER_OP_MLU_KERNEL(c_allreduce_max,
                       ops::CAllReduceOpMLUKernel<ops::kRedMax, float>,
                       ops::CAllReduceOpMLUKernel<ops::kRedMax, plat::float16>,
                       ops::CAllReduceOpMLUKernel<ops::kRedMax, int>,
                       ops::CAllReduceOpMLUKernel<ops::kRedMax, int16_t>,
                       ops::CAllReduceOpMLUKernel<ops::kRedMax, int8_t>,
                       ops::CAllReduceOpMLUKernel<ops::kRedMax, uint8_t>)
paddle/fluid/operators/collective/c_allreduce_min_op_mlu.cc
deleted  100644 → 0

/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */

#include "paddle/fluid/operators/collective/c_allreduce_op.h"

namespace ops = paddle::operators;
namespace plat = paddle::platform;

REGISTER_OP_MLU_KERNEL(c_allreduce_min,
                       ops::CAllReduceOpMLUKernel<ops::kRedMin, float>,
                       ops::CAllReduceOpMLUKernel<ops::kRedMin, plat::float16>,
                       ops::CAllReduceOpMLUKernel<ops::kRedMin, int>,
                       ops::CAllReduceOpMLUKernel<ops::kRedMin, int16_t>,
                       ops::CAllReduceOpMLUKernel<ops::kRedMin, int8_t>,
                       ops::CAllReduceOpMLUKernel<ops::kRedMin, uint8_t>)
paddle/fluid/operators/collective/c_allreduce_prod_op_mlu.cc
deleted  100644 → 0

/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */

#include "paddle/fluid/operators/collective/c_allreduce_op.h"

namespace ops = paddle::operators;
namespace plat = paddle::platform;

REGISTER_OP_MLU_KERNEL(c_allreduce_prod,
                       ops::CAllReduceOpMLUKernel<ops::kRedProd, float>,
                       ops::CAllReduceOpMLUKernel<ops::kRedProd, plat::float16>,
                       ops::CAllReduceOpMLUKernel<ops::kRedProd, int>,
                       ops::CAllReduceOpMLUKernel<ops::kRedProd, int16_t>,
                       ops::CAllReduceOpMLUKernel<ops::kRedProd, int8_t>,
                       ops::CAllReduceOpMLUKernel<ops::kRedProd, uint8_t>)
paddle/fluid/operators/collective/c_allreduce_sum_op_mlu.cc
deleted  100644 → 0

/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */

#include "paddle/fluid/operators/collective/c_allreduce_op.h"

namespace ops = paddle::operators;
namespace plat = paddle::platform;

REGISTER_OP_MLU_KERNEL(c_allreduce_sum,
                       ops::CAllReduceOpMLUKernel<ops::kRedSum, float>,
                       ops::CAllReduceOpMLUKernel<ops::kRedSum, plat::float16>,
                       ops::CAllReduceOpMLUKernel<ops::kRedSum, int>,
                       ops::CAllReduceOpMLUKernel<ops::kRedSum, int16_t>,
                       ops::CAllReduceOpMLUKernel<ops::kRedSum, int8_t>,
                       ops::CAllReduceOpMLUKernel<ops::kRedSum, uint8_t>)
paddle/fluid/operators/collective/c_broadcast_op_mlu.cc
deleted  100644 → 0

/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */

#include "paddle/fluid/operators/collective/c_broadcast_op.h"

#if defined(PADDLE_WITH_CNCL)
#include "paddle/fluid/platform/collective_helper.h"
#include "paddle/fluid/platform/device/mlu/cncl_helper.h"
#endif

namespace paddle {
namespace operators {

template <typename T>
class CBroadcastOPMLUKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& ctx) const override {
#if defined(PADDLE_WITH_CNCL)
    auto x = ctx.Input<phi::DenseTensor>("X");
    auto out = ctx.Output<phi::DenseTensor>("Out");
    int numel = x->numel();
    cnclDataType_t dtype =
        platform::ToCNCLDataType(framework::TransToProtoVarType(x->dtype()));

    int rid = ctx.Attr<int>("ring_id");
    auto place = ctx.GetPlace();
    auto comm = platform::CNCLCommContext::Instance().Get(rid, place);

    mluStream stream = nullptr;
    if (ctx.Attr<bool>("use_calc_stream")) {
      auto dev_ctx = platform::DeviceContextPool::Instance().Get(place);
      stream = static_cast<platform::MLUDeviceContext*>(dev_ctx)->stream();
    } else {
      stream = comm->stream();
    }

    int root = ctx.Attr<int>("root");
    if (root == comm->rank()) {
      PADDLE_ENFORCE_MLU_SUCCESS(
          cnclBcast(reinterpret_cast<void*>(const_cast<T*>(x->data<T>())),
                    numel, dtype, root, comm->comm(), stream));
      VLOG(3) << "rank " << comm->rank() << " invoke Bcast. sent "
              << x->numel();

      if (out != x) {
        framework::TensorCopy(
            *static_cast<const phi::DenseTensor*>(x), place,
            *platform::DeviceContextPool::Instance().Get(place),
            static_cast<phi::DenseTensor*>(out));
      }
    } else {
      PADDLE_ENFORCE_MLU_SUCCESS(cnclBcast(out->mutable_data<T>(place), numel,
                                           dtype, root, comm->comm(), stream));
      VLOG(3) << "rank " << comm->rank() << " invoke Bcast. received "
              << phi::product(out->dims());
    }

    out->Resize(x->dims());
    out->set_lod(x->lod());
#else
    PADDLE_THROW(platform::errors::PreconditionNotMet(
        "PaddlePaddle should compile with MLU."));
#endif
  }
};

}  // namespace operators
}  // namespace paddle

namespace ops = paddle::operators;
namespace plat = paddle::platform;

REGISTER_OP_MLU_KERNEL(c_broadcast,
                       ops::CBroadcastOPMLUKernel<float>,
                       ops::CBroadcastOPMLUKernel<plat::float16>,
                       ops::CBroadcastOPMLUKernel<int>,
                       ops::CBroadcastOPMLUKernel<int16_t>,
                       ops::CBroadcastOPMLUKernel<int8_t>,
                       ops::CBroadcastOPMLUKernel<uint8_t>);
paddle/fluid/operators/collective/c_reduce_max_op_mlu.cc
deleted  100644 → 0

/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */

#include "paddle/fluid/operators/collective/c_reduce_op.h"

namespace ops = paddle::operators;
namespace plat = paddle::platform;

REGISTER_OP_MLU_KERNEL(c_reduce_max,
                       ops::CReduceOpMLUKernel<ops::kRedMax, float>,
                       ops::CReduceOpMLUKernel<ops::kRedMax, plat::float16>,
                       ops::CReduceOpMLUKernel<ops::kRedMax, int>,
                       ops::CReduceOpMLUKernel<ops::kRedMax, int16_t>,
                       ops::CReduceOpMLUKernel<ops::kRedMax, int8_t>,
                       ops::CReduceOpMLUKernel<ops::kRedMax, uint8_t>)
paddle/fluid/operators/collective/c_reduce_min_op_mlu.cc
Deleted  100644 → 0
/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/collective/c_reduce_op.h"
namespace ops = paddle::operators;
namespace plat = paddle::platform;

REGISTER_OP_MLU_KERNEL(c_reduce_min,
                       ops::CReduceOpMLUKernel<ops::kRedMin, float>,
                       ops::CReduceOpMLUKernel<ops::kRedMin, plat::float16>,
                       ops::CReduceOpMLUKernel<ops::kRedMin, int>,
                       ops::CReduceOpMLUKernel<ops::kRedMin, int16_t>,
                       ops::CReduceOpMLUKernel<ops::kRedMin, int8_t>,
                       ops::CReduceOpMLUKernel<ops::kRedMin, uint8_t>)
paddle/fluid/operators/collective/c_reduce_prod_op_mlu.cc
Deleted  100644 → 0
/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/collective/c_reduce_op.h"
namespace ops = paddle::operators;
namespace plat = paddle::platform;

REGISTER_OP_MLU_KERNEL(c_reduce_prod,
                       ops::CReduceOpMLUKernel<ops::kRedProd, float>,
                       ops::CReduceOpMLUKernel<ops::kRedProd, plat::float16>,
                       ops::CReduceOpMLUKernel<ops::kRedProd, int>,
                       ops::CReduceOpMLUKernel<ops::kRedProd, int16_t>,
                       ops::CReduceOpMLUKernel<ops::kRedProd, int8_t>,
                       ops::CReduceOpMLUKernel<ops::kRedProd, uint8_t>)
paddle/fluid/operators/collective/c_reduce_sum_op_mlu.cc
Deleted  100644 → 0
/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/collective/c_reduce_op.h"
namespace ops = paddle::operators;
namespace plat = paddle::platform;

REGISTER_OP_MLU_KERNEL(c_reduce_sum,
                       ops::CReduceOpMLUKernel<ops::kRedSum, float>,
                       ops::CReduceOpMLUKernel<ops::kRedSum, plat::float16>,
                       ops::CReduceOpMLUKernel<ops::kRedSum, int>,
                       ops::CReduceOpMLUKernel<ops::kRedSum, int16_t>,
                       ops::CReduceOpMLUKernel<ops::kRedSum, int8_t>,
                       ops::CReduceOpMLUKernel<ops::kRedSum, uint8_t>)
paddle/fluid/operators/collective/c_sync_calc_stream_op.cc
View file @ e75c01f9
...
@@ -33,22 +33,7 @@ Call calculation stream synchronization.
 namespace ops = paddle::operators;
 namespace plat = paddle::platform;
-using MLU = plat::MLUPlace;
 REGISTER_OP_WITHOUT_GRADIENT(c_sync_calc_stream,
                              ops::CSyncCalcStreamOp,
                              ops::CSyncCalcStreamOpMaker);
-REGISTER_OP_NPU_KERNEL(c_sync_calc_stream,
-                       ops::CSyncCalcStreamKernel<float, MLU>,
-                       ops::CSyncCalcStreamKernel<double, MLU>,
-                       ops::CSyncCalcStreamKernel<int, MLU>,
-                       ops::CSyncCalcStreamKernel<int64_t, MLU>,
-                       ops::CSyncCalcStreamKernel<plat::float16, MLU>);
-REGISTER_OP_MLU_KERNEL(c_sync_calc_stream,
-                       ops::CSyncCalcStreamKernel<float, MLU>,
-                       ops::CSyncCalcStreamKernel<double, MLU>,
-                       ops::CSyncCalcStreamKernel<int, MLU>,
-                       ops::CSyncCalcStreamKernel<int64_t, MLU>,
-                       ops::CSyncCalcStreamKernel<plat::float16, MLU>);
paddle/fluid/operators/collective/c_sync_comm_stream_op.cc
View file @ e75c01f9
...
@@ -56,6 +56,3 @@ REGISTER_OP_WITHOUT_GRADIENT(c_sync_comm_stream,
 REGISTER_OP_NPU_KERNEL(c_sync_comm_stream,
                        ops::CSyncCommStreamKernel<float, plat::NPUPlace>);
-REGISTER_OP_MLU_KERNEL(c_sync_comm_stream,
-                       ops::CSyncCommStreamKernel<float, plat::MLUPlace>);
paddle/fluid/operators/collective/mp_allreduce_sum_op_mlu.cc
Deleted  100644 → 0
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/operators/collective/c_allreduce_op.h"
namespace ops = paddle::operators;
namespace plat = paddle::platform;

REGISTER_OP_MLU_KERNEL(mp_allreduce_sum,
                       ops::CAllReduceOpMLUKernel<ops::kRedSum, float>,
                       ops::CAllReduceOpMLUKernel<ops::kRedSum, plat::float16>,
                       ops::CAllReduceOpMLUKernel<ops::kRedSum, int>,
                       ops::CAllReduceOpMLUKernel<ops::kRedSum, int16_t>,
                       ops::CAllReduceOpMLUKernel<ops::kRedSum, int8_t>,
                       ops::CAllReduceOpMLUKernel<ops::kRedSum, uint8_t>)
paddle/fluid/operators/controlflow/compare_op_mlu.cc
Deleted  100644 → 0
/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/mlu/mlu_baseop.h"
namespace paddle {
namespace operators {

template <typename DeviceContext, typename T>
class EqualMLUKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& ctx) const override {
    auto* x = ctx.Input<phi::DenseTensor>("X");
    auto* y = ctx.Input<phi::DenseTensor>("Y");
    auto* out = ctx.Output<phi::DenseTensor>("Out");
    out->mutable_data<bool>(ctx.GetPlace());

    MLUCnnlTensorDesc input_x(*x, CNNL_LAYOUT_ARRAY, ToCnnlDataType(x->dtype()));
    MLUCnnlTensorDesc input_y(*y, CNNL_LAYOUT_ARRAY, ToCnnlDataType(y->dtype()));
    MLUCnnlTensorDesc output(*out, CNNL_LAYOUT_ARRAY, ToCnnlDataType(out->dtype()));
    MLUCnnl::Logic(ctx, CNNL_LOGIC_OP_EQ, input_x.get(), GetBasePtr(x),
                   input_y.get(), GetBasePtr(y), output.get(), GetBasePtr(out));
  }
};

template <typename DeviceContext, typename T>
class NotEqualMLUKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& ctx) const override {
    auto* x = ctx.Input<phi::DenseTensor>("X");
    auto* y = ctx.Input<phi::DenseTensor>("Y");
    auto* out = ctx.Output<phi::DenseTensor>("Out");
    out->mutable_data<bool>(ctx.GetPlace());

    MLUCnnlTensorDesc input_x(*x, CNNL_LAYOUT_ARRAY, ToCnnlDataType(x->dtype()));
    MLUCnnlTensorDesc input_y(*y, CNNL_LAYOUT_ARRAY, ToCnnlDataType(y->dtype()));
    MLUCnnlTensorDesc output(*out, CNNL_LAYOUT_ARRAY, ToCnnlDataType(out->dtype()));
    MLUCnnl::Logic(ctx, CNNL_LOGIC_OP_NE, input_x.get(), GetBasePtr(x),
                   input_y.get(), GetBasePtr(y), output.get(), GetBasePtr(out));
  }
};

template <typename DeviceContext, typename T>
class LessThanMLUKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& ctx) const override {
    auto* x = ctx.Input<phi::DenseTensor>("X");
    auto* y = ctx.Input<phi::DenseTensor>("Y");
    auto* out = ctx.Output<phi::DenseTensor>("Out");
    out->mutable_data<bool>(ctx.GetPlace());

    MLUCnnlTensorDesc input_x(*x, CNNL_LAYOUT_ARRAY, ToCnnlDataType(x->dtype()));
    MLUCnnlTensorDesc input_y(*y, CNNL_LAYOUT_ARRAY, ToCnnlDataType(y->dtype()));
    MLUCnnlTensorDesc output(*out, CNNL_LAYOUT_ARRAY, ToCnnlDataType(out->dtype()));
    MLUCnnl::Logic(ctx, CNNL_LOGIC_OP_LT, input_x.get(), GetBasePtr(x),
                   input_y.get(), GetBasePtr(y), output.get(), GetBasePtr(out));
  }
};

template <typename DeviceContext, typename T>
class LessEqualMLUKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& ctx) const override {
    auto* x = ctx.Input<phi::DenseTensor>("X");
    auto* y = ctx.Input<phi::DenseTensor>("Y");
    auto* out = ctx.Output<phi::DenseTensor>("Out");
    out->mutable_data<bool>(ctx.GetPlace());

    MLUCnnlTensorDesc input_x(*x, CNNL_LAYOUT_ARRAY, ToCnnlDataType(x->dtype()));
    MLUCnnlTensorDesc input_y(*y, CNNL_LAYOUT_ARRAY, ToCnnlDataType(y->dtype()));
    MLUCnnlTensorDesc output(*out, CNNL_LAYOUT_ARRAY, ToCnnlDataType(out->dtype()));
    MLUCnnl::Logic(ctx, CNNL_LOGIC_OP_LE, input_x.get(), GetBasePtr(x),
                   input_y.get(), GetBasePtr(y), output.get(), GetBasePtr(out));
  }
};

template <typename DeviceContext, typename T>
class GreaterThanMLUKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& ctx) const override {
    auto* x = ctx.Input<phi::DenseTensor>("X");
    auto* y = ctx.Input<phi::DenseTensor>("Y");
    auto* out = ctx.Output<phi::DenseTensor>("Out");
    out->mutable_data<bool>(ctx.GetPlace());

    MLUCnnlTensorDesc input_x(*x, CNNL_LAYOUT_ARRAY, ToCnnlDataType(x->dtype()));
    MLUCnnlTensorDesc input_y(*y, CNNL_LAYOUT_ARRAY, ToCnnlDataType(y->dtype()));
    MLUCnnlTensorDesc output(*out, CNNL_LAYOUT_ARRAY, ToCnnlDataType(out->dtype()));
    MLUCnnl::Logic(ctx, CNNL_LOGIC_OP_GT, input_x.get(), GetBasePtr(x),
                   input_y.get(), GetBasePtr(y), output.get(), GetBasePtr(out));
  }
};

template <typename DeviceContext, typename T>
class GreaterEqualMLUKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& ctx) const override {
    auto* x = ctx.Input<phi::DenseTensor>("X");
    auto* y = ctx.Input<phi::DenseTensor>("Y");
    auto* out = ctx.Output<phi::DenseTensor>("Out");
    out->mutable_data<bool>(ctx.GetPlace());

    MLUCnnlTensorDesc input_x(*x, CNNL_LAYOUT_ARRAY, ToCnnlDataType(x->dtype()));
    MLUCnnlTensorDesc input_y(*y, CNNL_LAYOUT_ARRAY, ToCnnlDataType(y->dtype()));
    MLUCnnlTensorDesc output(*out, CNNL_LAYOUT_ARRAY, ToCnnlDataType(out->dtype()));
    MLUCnnl::Logic(ctx, CNNL_LOGIC_OP_GE, input_x.get(), GetBasePtr(x),
                   input_y.get(), GetBasePtr(y), output.get(), GetBasePtr(out));
  }
};

}  // namespace operators
}  // namespace paddle

namespace ops = paddle::operators;
namespace plat = paddle::platform;

REGISTER_OP_MLU_KERNEL(equal,
                       ops::EqualMLUKernel<plat::MLUDeviceContext, plat::float16>,
                       ops::EqualMLUKernel<plat::MLUDeviceContext, float>,
                       ops::EqualMLUKernel<plat::MLUDeviceContext, int8_t>,
                       ops::EqualMLUKernel<plat::MLUDeviceContext, uint8_t>,
                       ops::EqualMLUKernel<plat::MLUDeviceContext, int16_t>,
                       ops::EqualMLUKernel<plat::MLUDeviceContext, int>,
                       ops::EqualMLUKernel<plat::MLUDeviceContext, bool>);

REGISTER_OP_MLU_KERNEL(not_equal,
                       ops::NotEqualMLUKernel<plat::MLUDeviceContext, plat::float16>,
                       ops::NotEqualMLUKernel<plat::MLUDeviceContext, float>,
                       ops::NotEqualMLUKernel<plat::MLUDeviceContext, int8_t>,
                       ops::NotEqualMLUKernel<plat::MLUDeviceContext, uint8_t>,
                       ops::NotEqualMLUKernel<plat::MLUDeviceContext, int16_t>,
                       ops::NotEqualMLUKernel<plat::MLUDeviceContext, int>,
                       ops::NotEqualMLUKernel<plat::MLUDeviceContext, bool>);

REGISTER_OP_MLU_KERNEL(less_than,
                       ops::LessThanMLUKernel<plat::MLUDeviceContext, plat::float16>,
                       ops::LessThanMLUKernel<plat::MLUDeviceContext, float>,
                       ops::LessThanMLUKernel<plat::MLUDeviceContext, int8_t>,
                       ops::LessThanMLUKernel<plat::MLUDeviceContext, uint8_t>,
                       ops::LessThanMLUKernel<plat::MLUDeviceContext, int16_t>,
                       ops::LessThanMLUKernel<plat::MLUDeviceContext, int>,
                       ops::LessThanMLUKernel<plat::MLUDeviceContext, bool>);

REGISTER_OP_MLU_KERNEL(less_equal,
                       ops::LessEqualMLUKernel<plat::MLUDeviceContext, plat::float16>,
                       ops::LessEqualMLUKernel<plat::MLUDeviceContext, float>,
                       ops::LessEqualMLUKernel<plat::MLUDeviceContext, int8_t>,
                       ops::LessEqualMLUKernel<plat::MLUDeviceContext, uint8_t>,
                       ops::LessEqualMLUKernel<plat::MLUDeviceContext, int16_t>,
                       ops::LessEqualMLUKernel<plat::MLUDeviceContext, int>,
                       ops::LessEqualMLUKernel<plat::MLUDeviceContext, bool>);

REGISTER_OP_MLU_KERNEL(greater_than,
                       ops::GreaterThanMLUKernel<plat::MLUDeviceContext, plat::float16>,
                       ops::GreaterThanMLUKernel<plat::MLUDeviceContext, float>,
                       ops::GreaterThanMLUKernel<plat::MLUDeviceContext, int8_t>,
                       ops::GreaterThanMLUKernel<plat::MLUDeviceContext, uint8_t>,
                       ops::GreaterThanMLUKernel<plat::MLUDeviceContext, int16_t>,
                       ops::GreaterThanMLUKernel<plat::MLUDeviceContext, int>,
                       ops::GreaterThanMLUKernel<plat::MLUDeviceContext, bool>);

REGISTER_OP_MLU_KERNEL(greater_equal,
                       ops::GreaterEqualMLUKernel<plat::MLUDeviceContext, plat::float16>,
                       ops::GreaterEqualMLUKernel<plat::MLUDeviceContext, float>,
                       ops::GreaterEqualMLUKernel<plat::MLUDeviceContext, int8_t>,
                       ops::GreaterEqualMLUKernel<plat::MLUDeviceContext, uint8_t>,
                       ops::GreaterEqualMLUKernel<plat::MLUDeviceContext, int16_t>,
                       ops::GreaterEqualMLUKernel<plat::MLUDeviceContext, int>,
                       ops::GreaterEqualMLUKernel<plat::MLUDeviceContext, bool>);
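
Side note (not part of the deleted file): the six comparison kernels above are identical except for the cnnlLogicOp_t constant they pass to MLUCnnl::Logic. The logical_op_mlu.cc file that follows shows how a non-type template parameter removes that duplication; below is a minimal sketch of the same idea applied to the comparison kernels, assuming the same MLUCnnl helpers (the class name CompareMLUKernel is hypothetical and does not appear in the repository).

template <typename DeviceContext, typename T, cnnlLogicOp_t logic_op>
class CompareMLUKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& ctx) const override {
    // Same body as the kernels above; only the logic op varies.
    auto* x = ctx.Input<phi::DenseTensor>("X");
    auto* y = ctx.Input<phi::DenseTensor>("Y");
    auto* out = ctx.Output<phi::DenseTensor>("Out");
    out->mutable_data<bool>(ctx.GetPlace());
    MLUCnnlTensorDesc input_x(*x, CNNL_LAYOUT_ARRAY, ToCnnlDataType(x->dtype()));
    MLUCnnlTensorDesc input_y(*y, CNNL_LAYOUT_ARRAY, ToCnnlDataType(y->dtype()));
    MLUCnnlTensorDesc output(*out, CNNL_LAYOUT_ARRAY, ToCnnlDataType(out->dtype()));
    MLUCnnl::Logic(ctx, logic_op, input_x.get(), GetBasePtr(x),
                   input_y.get(), GetBasePtr(y), output.get(), GetBasePtr(out));
  }
};
// e.g. ops::CompareMLUKernel<plat::MLUDeviceContext, float, CNNL_LOGIC_OP_LT>
// would stand in for ops::LessThanMLUKernel<plat::MLUDeviceContext, float>.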
paddle/fluid/operators/controlflow/logical_op_mlu.cc
Deleted  100644 → 0
/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/mlu/mlu_baseop.h"
namespace paddle {
namespace operators {

template <typename T, cnnlLogicOp_t log_method>
class LogicalMLUKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& ctx) const override {
    auto* x = ctx.Input<phi::DenseTensor>("X");
    auto* y = ctx.Input<phi::DenseTensor>("Y");
    auto* out = ctx.Output<phi::DenseTensor>("Out");
    out->mutable_data<T>(ctx.GetPlace());

    if (log_method == CNNL_LOGIC_OP_NOT) {
      y = x;
    }

    MLUCnnlTensorDesc x_desc(*x);
    MLUCnnlTensorDesc y_desc(*y);
    MLUCnnlTensorDesc out_desc(*out);

    MLUCnnl::Logic(ctx, log_method, x_desc.get(), GetBasePtr(x),
                   y_desc.get(), GetBasePtr(y), out_desc.get(), GetBasePtr(out));
  }
};

}  // namespace operators
}  // namespace paddle

namespace ops = paddle::operators;

REGISTER_OP_MLU_KERNEL(logical_not,
                       ops::LogicalMLUKernel<bool, CNNL_LOGIC_OP_NOT>,
                       ops::LogicalMLUKernel<int8_t, CNNL_LOGIC_OP_NOT>,
                       ops::LogicalMLUKernel<int16_t, CNNL_LOGIC_OP_NOT>,
                       ops::LogicalMLUKernel<int, CNNL_LOGIC_OP_NOT>,
                       ops::LogicalMLUKernel<float, CNNL_LOGIC_OP_NOT>);

REGISTER_OP_MLU_KERNEL(logical_and,
                       ops::LogicalMLUKernel<bool, CNNL_LOGIC_OP_AND>,
                       ops::LogicalMLUKernel<int8_t, CNNL_LOGIC_OP_AND>,
                       ops::LogicalMLUKernel<int16_t, CNNL_LOGIC_OP_AND>,
                       ops::LogicalMLUKernel<int, CNNL_LOGIC_OP_AND>,
                       ops::LogicalMLUKernel<float, CNNL_LOGIC_OP_AND>);

REGISTER_OP_MLU_KERNEL(logical_or,
                       ops::LogicalMLUKernel<bool, CNNL_LOGIC_OP_OR>,
                       ops::LogicalMLUKernel<int8_t, CNNL_LOGIC_OP_OR>,
                       ops::LogicalMLUKernel<int16_t, CNNL_LOGIC_OP_OR>,
                       ops::LogicalMLUKernel<int, CNNL_LOGIC_OP_OR>,
                       ops::LogicalMLUKernel<float, CNNL_LOGIC_OP_OR>);

REGISTER_OP_MLU_KERNEL(logical_xor,
                       ops::LogicalMLUKernel<bool, CNNL_LOGIC_OP_XOR>,
                       ops::LogicalMLUKernel<int8_t, CNNL_LOGIC_OP_XOR>,
                       ops::LogicalMLUKernel<int16_t, CNNL_LOGIC_OP_XOR>,
                       ops::LogicalMLUKernel<int, CNNL_LOGIC_OP_XOR>,
                       ops::LogicalMLUKernel<float, CNNL_LOGIC_OP_XOR>);
paddle/fluid/operators/detection/CMakeLists.txt
View file @ e75c01f9
...
@@ -38,11 +38,6 @@ if(WITH_XPU)
   detection_library(prior_box_op SRCS prior_box_op.cc)
   detection_library(yolo_box_op SRCS yolo_box_op.cc)
   detection_library(generate_proposals_v2_op SRCS generate_proposals_v2_op.cc)
-elseif(WITH_MLU)
-  detection_library(iou_similarity_op SRCS iou_similarity_op.cc
-                    iou_similarity_op_mlu.cc)
-  detection_library(prior_box_op SRCS prior_box_op.cc prior_box_op_mlu.cc)
-  detection_library(yolo_box_op SRCS yolo_box_op.cc yolo_box_op_mlu.cc)
 else()
   detection_library(iou_similarity_op SRCS iou_similarity_op.cc
                     iou_similarity_op.cu)
...
paddle/fluid/operators/detection/iou_similarity_op_mlu.cc
Deleted  100644 → 0
/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/detection/iou_similarity_op.h"
#include "paddle/fluid/operators/mlu/mlu_baseop.h"
namespace paddle {
namespace operators {

template <typename T>
struct IouFunction {
 public:
  explicit IouFunction(const framework::ExecutionContext& ctx) : ctx(ctx) {
    place = ctx.GetPlace();
  }
  void Transpose(const phi::DenseTensor* x, phi::DenseTensor* y,
                 const std::vector<int>& axis) {
    // y should be init first
    TransposeFromMLUTensor<T>(ctx, axis, x, y, false /*need_reshape_or_alloc*/);
  }
  void Add(const phi::DenseTensor* x, const phi::DenseTensor* y, phi::DenseTensor* z) {
    // y should be init first
    MLUCnnlTensorDesc x_desc(*x);
    MLUCnnlTensorDesc y_desc(*y);
    MLUCnnlTensorDesc z_desc(*z);
    MLUCnnlOpTensorDesc add_op_desc(CNNL_OP_TENSOR_ADD, ToCnnlDataType<T>(),
                                    CNNL_NOT_PROPAGATE_NAN);
    MLUCnnl::OpTensor(ctx, add_op_desc.get(), x_desc.get(), GetBasePtr(x),
                      y_desc.get(), GetBasePtr(y), z_desc.get(), GetBasePtr(z),
                      ToCnnlDataType<T>());
  }
  void Sub(const phi::DenseTensor* x, const phi::DenseTensor* y, phi::DenseTensor* z) {
    // y should be init first
    MLUCnnlTensorDesc x_desc(*x);
    MLUCnnlTensorDesc y_desc(*y);
    MLUCnnlTensorDesc z_desc(*z);
    MLUCnnlOpTensorDesc sub_op_desc(CNNL_OP_TENSOR_SUB, ToCnnlDataType<T>(),
                                    CNNL_NOT_PROPAGATE_NAN);
    MLUCnnl::OpTensor(ctx, sub_op_desc.get(), x_desc.get(), GetBasePtr(x),
                      y_desc.get(), GetBasePtr(y), z_desc.get(), GetBasePtr(z),
                      ToCnnlDataType<T>());
  }
  void Mul(const phi::DenseTensor* x, const phi::DenseTensor* y, phi::DenseTensor* z) {
    // z should be init first
    MLUCnnlTensorDesc x_desc(*x);
    MLUCnnlTensorDesc y_desc(*y);
    MLUCnnlTensorDesc z_desc(*z);
    MLUCnnlOpTensorDesc mul_op_desc(CNNL_OP_TENSOR_MUL, ToCnnlDataType<T>(),
                                    CNNL_NOT_PROPAGATE_NAN);
    MLUCnnl::OpTensor(ctx, mul_op_desc.get(), x_desc.get(), GetBasePtr(x),
                      y_desc.get(), GetBasePtr(y), z_desc.get(), GetBasePtr(z),
                      ToCnnlDataType<T>());
  }
  void DivNoNan(const phi::DenseTensor* x, const phi::DenseTensor* y,
                phi::DenseTensor* z) {
    // z should be init first
    MLUCnnlTensorDesc x_desc(*x);
    MLUCnnlTensorDesc y_desc(*y);
    MLUCnnlTensorDesc z_desc(*z);
    cnnlComputationPreference_t prefer = CNNL_COMPUTATION_FAST;
    MLUCnnl::DivNoNan(ctx, prefer, x_desc.get(), GetBasePtr(x),
                      y_desc.get(), GetBasePtr(y), z_desc.get(), GetBasePtr(z));
  }
  void Adds(const phi::DenseTensor* x, float scalar, phi::DenseTensor* y) {
    // y should be init first
    MLUCnnlTensorDesc x_desc(*x);
    MLUCnnlTensorDesc y_desc(*y);
    float alpha = 1.0;
    float beta = scalar;
    MLUCnnl::Transform(ctx, &alpha, &beta, x_desc.get(), GetBasePtr(x),
                       y_desc.get(), GetBasePtr(y));
  }
  void Maximum(const phi::DenseTensor* x, const phi::DenseTensor* y,
               phi::DenseTensor* z) {
    // z should be init first
    MLUCnnlTensorDesc x_desc(*x);
    MLUCnnlTensorDesc y_desc(*y);
    MLUCnnlTensorDesc z_desc(*z);
    MLUCnnl::Maximum(ctx, x_desc.get(), GetBasePtr(x), y_desc.get(),
                     GetBasePtr(y), z_desc.get(), GetBasePtr(z));
  }
  void Minimum(const phi::DenseTensor* x, const phi::DenseTensor* y,
               phi::DenseTensor* z) {
    // z should be init first
    MLUCnnlTensorDesc x_desc(*x);
    MLUCnnlTensorDesc y_desc(*y);
    MLUCnnlTensorDesc z_desc(*z);
    MLUCnnl::Minimum(ctx, x_desc.get(), GetBasePtr(x), y_desc.get(),
                     GetBasePtr(y), z_desc.get(), GetBasePtr(z));
  }

 private:
  platform::Place place;
  const framework::ExecutionContext& ctx;
};

template <typename T>
class IouSimilarityMLUKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& ctx) const override {
    auto* x = ctx.Input<phi::DenseTensor>("X");
    auto* y = ctx.Input<phi::DenseTensor>("Y");
    bool normalized = ctx.Attr<bool>("box_normalized");
    auto* out = ctx.Output<phi::DenseTensor>("Out");

    auto _type = x->dtype();
    auto place = ctx.GetPlace();

    IouFunction<T> F(ctx);

    auto N = x->dims()[0];
    auto M = y->dims()[0];

    out->mutable_data<T>({N, M}, place);
    phi::DenseTensor xt(_type);
    phi::DenseTensor yt(_type);
    xt.mutable_data<T>({4, N}, place);
    yt.mutable_data<T>({4, M}, place);
    std::vector<int> vec_trans = {1, 0};
    F.Transpose(x, &xt, vec_trans);
    F.Transpose(y, &yt, vec_trans);
    phi::DenseTensor xmin1 = xt.Slice(0, 1);
    phi::DenseTensor ymin1 = xt.Slice(1, 2);
    phi::DenseTensor xmax1 = xt.Slice(2, 3);
    phi::DenseTensor ymax1 = xt.Slice(3, 4);
    phi::DenseTensor xmin2 = yt.Slice(0, 1);
    phi::DenseTensor ymin2 = yt.Slice(1, 2);
    phi::DenseTensor xmax2 = yt.Slice(2, 3);
    phi::DenseTensor ymax2 = yt.Slice(3, 4);
    xmin1.Resize({N, 1});
    ymin1.Resize({N, 1});
    xmax1.Resize({N, 1});
    ymax1.Resize({N, 1});
    xmin2.Resize({1, M});
    ymin2.Resize({1, M});
    xmax2.Resize({1, M});
    ymax2.Resize({1, M});

    phi::DenseTensor w1(_type);
    phi::DenseTensor h1(_type);
    phi::DenseTensor w2(_type);
    phi::DenseTensor h2(_type);
    phi::DenseTensor area1(_type);
    phi::DenseTensor area2(_type);
    w1.mutable_data<T>({N, 1}, place);
    h1.mutable_data<T>({N, 1}, place);
    w2.mutable_data<T>({1, M}, place);
    h2.mutable_data<T>({1, M}, place);
    area1.mutable_data<T>({N, 1}, place);
    area2.mutable_data<T>({1, M}, place);
    F.Sub(&xmax1, &xmin1, &w1);
    F.Sub(&ymax1, &ymin1, &h1);
    F.Sub(&xmax2, &xmin2, &w2);
    F.Sub(&ymax2, &ymin2, &h2);
    if (!normalized) {
      F.Adds(&w1, 1.0f, &w1);
      F.Adds(&h1, 1.0f, &h1);
      F.Adds(&w2, 1.0f, &w2);
      F.Adds(&h2, 1.0f, &h2);
    }
    F.Mul(&w1, &h1, &area1);
    F.Mul(&w2, &h2, &area2);

    phi::DenseTensor inter_xmax(_type);
    phi::DenseTensor inter_ymax(_type);
    phi::DenseTensor inter_xmin(_type);
    phi::DenseTensor inter_ymin(_type);
    inter_xmax.mutable_data<T>({N, M}, place);
    inter_ymax.mutable_data<T>({N, M}, place);
    inter_xmin.mutable_data<T>({N, M}, place);
    inter_ymin.mutable_data<T>({N, M}, place);
    F.Minimum(&xmax1, &xmax2, &inter_xmax);
    F.Minimum(&ymax1, &ymax2, &inter_ymax);
    F.Maximum(&xmin1, &xmin2, &inter_xmin);
    F.Maximum(&ymin1, &ymin2, &inter_ymin);

    phi::DenseTensor inter_w(_type);
    phi::DenseTensor inter_h(_type);
    inter_w.mutable_data<T>({N, M}, place);
    inter_h.mutable_data<T>({N, M}, place);
    F.Sub(&inter_xmax, &inter_xmin, &inter_w);
    F.Sub(&inter_ymax, &inter_ymin, &inter_h);
    if (!normalized) {
      F.Adds(&inter_w, 1.0f, &inter_w);
      F.Adds(&inter_h, 1.0f, &inter_h);
    }
    phi::DenseTensor zeros(_type);
    zeros.mutable_data<T>({1}, place);
    FillMLUTensorWithHostValue<T>(ctx, static_cast<T>(0), &zeros);
    F.Maximum(&inter_w, &zeros, &inter_w);
    F.Maximum(&inter_h, &zeros, &inter_h);
    F.Mul(&inter_w, &inter_h, out);

    phi::DenseTensor union_area(_type);
    union_area.mutable_data<T>({N, M}, place);
    F.Add(&area1, &area2, &union_area);
    F.Sub(&union_area, out, &union_area);
    F.DivNoNan(out, &union_area, out);
  }
};

}  // namespace operators
}  // namespace paddle

namespace ops = paddle::operators;
namespace plat = paddle::platform;

REGISTER_OP_MLU_KERNEL(iou_similarity,
                       ops::IouSimilarityMLUKernel<float>,
                       ops::IouSimilarityMLUKernel<plat::float16>);
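
Side note (not part of the deleted file): by broadcasting the [N, 1] slices of X against the [1, M] slices of Y, the kernel above evaluates the standard intersection-over-union for every box pair,

\[
\mathrm{IoU}(b_1, b_2) = \frac{A_\cap}{A_1 + A_2 - A_\cap}, \qquad
A_\cap = \max\!\big(0,\ \min(x^{\max}_1, x^{\max}_2) - \max(x^{\min}_1, x^{\min}_2)\big)\cdot
         \max\!\big(0,\ \min(y^{\max}_1, y^{\max}_2) - \max(y^{\min}_1, y^{\min}_2)\big),
\]

with widths and heights offset by +1 when box_normalized is false, and DivNoNan mapping an empty union to 0.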
paddle/fluid/operators/detection/prior_box_op_mlu.cc
Deleted  100644 → 0
/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/tensor_util.h"
#include "paddle/fluid/operators/detection/prior_box_op.h"
#include "paddle/fluid/operators/mlu/mlu_baseop.h"
namespace paddle {
namespace operators {

template <typename T>
class PriorBoxMLUKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& ctx) const override {
    auto* input = ctx.Input<phi::DenseTensor>("Input");
    auto* image = ctx.Input<phi::DenseTensor>("Image");
    auto* boxes = ctx.Output<phi::DenseTensor>("Boxes");
    auto* variances = ctx.Output<phi::DenseTensor>("Variances");

    float step_w = ctx.Attr<float>("step_w");
    float step_h = ctx.Attr<float>("step_h");
    float offset = ctx.Attr<float>("offset");
    bool clip = ctx.Attr<bool>("clip");
    bool min_max_aspect_ratios_order = ctx.Attr<bool>("min_max_aspect_ratios_order");

    int im_width = image->dims()[3];
    int im_height = image->dims()[2];
    int width = input->dims()[3];
    int height = input->dims()[2];

    auto aspect_ratios = ctx.Attr<std::vector<float>>("aspect_ratios");
    bool flip = ctx.Attr<bool>("flip");
    std::vector<float> new_aspect_ratios;
    ExpandAspectRatios(aspect_ratios, flip, &new_aspect_ratios);
    auto& dev_ctx = ctx.template device_context<platform::MLUDeviceContext>();
    phi::DenseTensor ratios;
    paddle::framework::TensorFromVector(new_aspect_ratios, dev_ctx, &ratios);
    MLUOpTensorDesc new_aspect_ratios_desc(ratios);

    auto min_sizes = ctx.Attr<std::vector<float>>("min_sizes");
    phi::DenseTensor min;
    paddle::framework::TensorFromVector(min_sizes, dev_ctx, &min);
    MLUOpTensorDesc min_sizes_desc(min);

    auto max_sizes = ctx.Attr<std::vector<float>>("max_sizes");
    phi::DenseTensor max;
    paddle::framework::TensorFromVector(max_sizes, dev_ctx, &max);
    MLUOpTensorDesc max_sizes_desc(max);

    auto variances_attr = ctx.Attr<std::vector<float>>("variances");
    phi::DenseTensor var_tensor;
    paddle::framework::TensorFromVector(variances_attr, dev_ctx, &var_tensor);
    MLUOpTensorDesc variances_attr_desc(var_tensor);

    auto place = ctx.GetPlace();

    boxes->mutable_data<T>(place);
    variances->mutable_data<T>(place);

    MLUOpTensorDesc var_desc(*variances);
    MLUOpTensorDesc output_desc(*boxes);

    MLUOP::OpPriorBox(ctx,
                      min_sizes_desc.get(),
                      GetBasePtr(&min),
                      new_aspect_ratios_desc.get(),
                      GetBasePtr(&ratios),
                      variances_attr_desc.get(),
                      GetBasePtr(&var_tensor),
                      max_sizes_desc.get(),
                      GetBasePtr(&max),
                      height,
                      width,
                      im_height,
                      im_width,
                      step_h,
                      step_w,
                      offset,
                      clip,
                      min_max_aspect_ratios_order,
                      output_desc.get(),
                      GetBasePtr(boxes),
                      var_desc.get(),
                      GetBasePtr(variances));
  }
};

}  // namespace operators
}  // namespace paddle

namespace ops = paddle::operators;
namespace plat = paddle::platform;

REGISTER_OP_MLU_KERNEL(prior_box, ops::PriorBoxMLUKernel<float>);
paddle/fluid/operators/detection/yolo_box_op_mlu.cc
Deleted  100644 → 0
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/mlu/mlu_baseop.h"
namespace paddle {
namespace operators {

template <typename T>
class YoloBoxMLUKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& ctx) const override {
    auto* x = ctx.Input<phi::DenseTensor>("X");
    auto* img_size = ctx.Input<phi::DenseTensor>("ImgSize");
    auto* boxes = ctx.Output<phi::DenseTensor>("Boxes");
    auto* scores = ctx.Output<phi::DenseTensor>("Scores");
    const std::vector<int> anchors = ctx.Attr<std::vector<int>>("anchors");
    auto class_num = ctx.Attr<int>("class_num");
    auto conf_thresh = ctx.Attr<float>("conf_thresh");
    auto downsample_ratio = ctx.Attr<int>("downsample_ratio");
    auto clip_bbox = ctx.Attr<bool>("clip_bbox");
    auto scale = ctx.Attr<float>("scale_x_y");
    auto iou_aware = ctx.Attr<bool>("iou_aware");
    auto iou_aware_factor = ctx.Attr<float>("iou_aware_factor");

    int anchor_num = anchors.size() / 2;
    int64_t size = anchors.size();

    auto dim_x = x->dims();
    int n = dim_x[0];
    int s = anchor_num;
    int h = dim_x[2];
    int w = dim_x[3];

    // The output of mluOpYoloBox: A 4-D tensor with shape [N, anchor_num, 4,
    // H*W], the coordinates of boxes, and a 4-D tensor with shape [N,
    // anchor_num, :attr:`class_num`, H*W], the classification scores of boxes.
    std::vector<int64_t> boxes_dim_mluops({n, s, 4, h * w});
    std::vector<int64_t> scores_dim_mluops({n, s, class_num, h * w});

    // In Paddle framework: A 3-D tensor with shape [N, M, 4], the coordinates
    // of boxes, and a 3-D tensor with shape [N, M, :attr:`class_num`], the
    // classification scores of boxes.
    std::vector<int64_t> boxes_out_dim({n, s, h * w, 4});
    std::vector<int64_t> scores_out_dim({n, s, h * w, class_num});

    auto& dev_ctx = ctx.template device_context<MLUDeviceContext>();
    phi::DenseTensor boxes_tensor_mluops =
        ctx.AllocateTmpTensor<T, MLUDeviceContext>({n, s, 4, h * w}, dev_ctx);
    phi::DenseTensor scores_tensor_mluops =
        ctx.AllocateTmpTensor<T, MLUDeviceContext>({n, s, class_num, h * w}, dev_ctx);
    MLUOpTensorDesc boxes_trans_desc_mluops(4, boxes_dim_mluops.data(),
                                            ToMluOpDataType<T>());
    MLUCnnlTensorDesc boxes_trans_desc_cnnl(4, boxes_dim_mluops.data(),
                                            ToCnnlDataType<T>());
    MLUOpTensorDesc scores_trans_desc_mluops(4, scores_dim_mluops.data(),
                                             ToMluOpDataType<T>());
    MLUCnnlTensorDesc scores_trans_desc_cnnl(4, scores_dim_mluops.data(),
                                             ToCnnlDataType<T>());

    boxes->mutable_data<T>(ctx.GetPlace());
    scores->mutable_data<T>(ctx.GetPlace());
    FillMLUTensorWithHostValue(ctx, static_cast<T>(0), boxes);
    FillMLUTensorWithHostValue(ctx, static_cast<T>(0), scores);

    MLUOpTensorDesc x_desc(*x, MLUOP_LAYOUT_ARRAY, ToMluOpDataType<T>());
    MLUOpTensorDesc img_size_desc(*img_size, MLUOP_LAYOUT_ARRAY,
                                  ToMluOpDataType<int32_t>());
    phi::DenseTensor anchors_temp(framework::TransToPhiDataType(VT::INT32));
    anchors_temp.Resize({size});
    paddle::framework::TensorFromVector(anchors, ctx.device_context(), &anchors_temp);
    MLUOpTensorDesc anchors_desc(anchors_temp);
    MLUCnnlTensorDesc boxes_desc_cnnl(4, boxes_out_dim.data(), ToCnnlDataType<T>());
    MLUCnnlTensorDesc scores_desc_cnnl(4, scores_out_dim.data(), ToCnnlDataType<T>());

    MLUOP::OpYoloBox(ctx, x_desc.get(), GetBasePtr(x), img_size_desc.get(),
                     GetBasePtr(img_size), anchors_desc.get(),
                     GetBasePtr(&anchors_temp), class_num, conf_thresh,
                     downsample_ratio, clip_bbox, scale, iou_aware,
                     iou_aware_factor, boxes_trans_desc_mluops.get(),
                     GetBasePtr(&boxes_tensor_mluops),
                     scores_trans_desc_mluops.get(),
                     GetBasePtr(&scores_tensor_mluops));

    const std::vector<int> perm = {0, 1, 3, 2};

    // transpose the boxes from [N, S, 4, H*W] to [N, S, H*W, 4]
    MLUCnnl::Transpose(ctx, perm, 4, boxes_trans_desc_cnnl.get(),
                       GetBasePtr(&boxes_tensor_mluops), boxes_desc_cnnl.get(),
                       GetBasePtr(boxes));
    // transpose the scores from [N, S, class_num, H*W] to [N, S, H*W,
    // class_num]
    MLUCnnl::Transpose(ctx, perm, 4, scores_trans_desc_cnnl.get(),
                       GetBasePtr(&scores_tensor_mluops), scores_desc_cnnl.get(),
                       GetBasePtr(scores));
  }
};

}  // namespace operators
}  // namespace paddle

namespace ops = paddle::operators;
namespace plat = paddle::platform;

REGISTER_OP_MLU_KERNEL(yolo_box, ops::YoloBoxMLUKernel<float>);
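
Side note (not part of the deleted file): apart from zero-filling the outputs, the only post-processing the kernel does on the mluOpYoloBox results is the perm = {0, 1, 3, 2} transpose that moves the per-box values into the last dimension. A minimal standalone sketch of that dimension permutation; the concrete sizes (three anchors on a 13x13 grid) are illustrative assumptions, not values from the repository:

#include <array>
#include <cstdint>
#include <cstdio>

int main() {
  // mluOpYoloBox boxes layout [N, S, 4, H*W] with example sizes.
  std::array<int64_t, 4> in_dims = {1, 3, 4, 13 * 13};
  std::array<int, 4> perm = {0, 1, 3, 2};  // same permutation as the kernel
  std::array<int64_t, 4> out_dims{};
  for (int i = 0; i < 4; ++i) out_dims[i] = in_dims[perm[i]];
  // Prints "1 3 169 4": the [N, S, H*W, 4] layout written into "Boxes".
  std::printf("%lld %lld %lld %lld\n",
              (long long)out_dims[0], (long long)out_dims[1],
              (long long)out_dims[2], (long long)out_dims[3]);
  return 0;
}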
paddle/fluid/operators/elementwise/elementwise_add_op_mlu.cc
Deleted  100644 → 0
/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/elementwise/elementwise_mlu.h"
namespace paddle {
namespace operators {

template <typename T>
class ElementwiseAddMLUKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& ctx) const override {
    MLUOpTensorKernel<T>(ctx, CNNL_OP_TENSOR_ADD);
  }
};

template <typename T>
class ElementwiseAddGradMLUKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& ctx) const override {
    auto& dev_ctx = ctx.template device_context<paddle::platform::MLUDeviceContext>();
    auto* x = ctx.Input<phi::DenseTensor>("X");
    auto* y = ctx.Input<phi::DenseTensor>("Y");
    auto* dout = ctx.Input<phi::DenseTensor>(framework::GradVarName("Out"));
    auto* dx = ctx.Output<phi::DenseTensor>(framework::GradVarName("X"));
    auto* dy = ctx.Output<phi::DenseTensor>(framework::GradVarName("Y"));
    int axis = ctx.Attr<int>("axis");
    axis = (axis == -1 ? std::abs(x->dims().size() - y->dims().size()) : axis);

    MLUCnnlTensorDesc dout_desc(*dout);
    if (dx) {
      dx->mutable_data<T>(ctx.GetPlace());
      if (dx->dims() != dout->dims()) {
        std::vector<int> dst_dims_vec;
        std::vector<int> reduce_axes;
        GetReduceAxesAndDstDims(axis, dout->dims(), dx->dims(), &reduce_axes,
                                &dst_dims_vec);

        MLUCnnlReduceDesc reduction_desc(reduce_axes, CNNL_REDUCE_ADD,
                                         ToCnnlDataType<T>(), CNNL_NOT_PROPAGATE_NAN,
                                         CNNL_REDUCE_NO_INDICES, CNNL_32BIT_INDICES);
        MLUCnnlTensorDesc dx_desc(dst_dims_vec.size(), dst_dims_vec.data(),
                                  ToCnnlDataType<T>());
        MLUCnnl::Reduce(ctx, true /*need_workspace*/, reduction_desc.get(), nullptr,
                        dout_desc.get(), GetBasePtr(dout), 0, nullptr, nullptr,
                        dx_desc.get(), GetBasePtr(dx));
      } else {
        framework::TensorCopy(*dout, ctx.GetPlace(), dev_ctx, dx);
      }
    }
    if (dy) {
      dy->mutable_data<T>(ctx.GetPlace());
      if (dy->dims() != dout->dims()) {
        std::vector<int> dst_dims_vec;
        std::vector<int> reduce_axes;
        GetReduceAxesAndDstDims(axis, dout->dims(), dy->dims(), &reduce_axes,
                                &dst_dims_vec);

        MLUCnnlReduceDesc reduction_desc(reduce_axes, CNNL_REDUCE_ADD,
                                         ToCnnlDataType<T>(), CNNL_NOT_PROPAGATE_NAN,
                                         CNNL_REDUCE_NO_INDICES, CNNL_32BIT_INDICES);
        MLUCnnlTensorDesc dy_desc(dst_dims_vec.size(), dst_dims_vec.data(),
                                  ToCnnlDataType<T>());
        MLUCnnl::Reduce(ctx, true /*need_workspace*/, reduction_desc.get(), nullptr,
                        dout_desc.get(), GetBasePtr(dout), 0, nullptr, nullptr,
                        dy_desc.get(), GetBasePtr(dy));
      } else {
        framework::TensorCopy(*dout, ctx.GetPlace(), dev_ctx, dy);
      }
    }
  }
};

}  // namespace operators
}  // namespace paddle

namespace ops = paddle::operators;
namespace plat = paddle::platform;

REGISTER_OP_MLU_KERNEL(elementwise_add,
                       ops::ElementwiseAddMLUKernel<float>,
                       ops::ElementwiseAddMLUKernel<plat::float16>);

REGISTER_OP_MLU_KERNEL(elementwise_add_grad,
                       ops::ElementwiseAddGradMLUKernel<float>,
                       ops::ElementwiseAddGradMLUKernel<plat::float16>);
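
Side note (not part of the deleted file): the grad kernel handles broadcasting by summing dOut over the axes along which the corresponding input was broadcast in the forward pass, i.e. for Out = X + Y

\[
\frac{\partial L}{\partial X} = \sum_{\text{broadcast axes of } X} \frac{\partial L}{\partial \mathrm{Out}},
\qquad
\frac{\partial L}{\partial Y} = \sum_{\text{broadcast axes of } Y} \frac{\partial L}{\partial \mathrm{Out}}.
\]

As a worked example with values chosen here for illustration: for dOut of shape [2, 3, 4], Y of shape [3, 1] and axis = 1, GetReduceAxesAndDstDims yields reduce_axes = {0, 2} and dst_dims_vec = {3}, so dY is dOut reduced with CNNL_REDUCE_ADD over axes 0 and 2; when the shapes already match, the kernel falls back to a plain TensorCopy.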
paddle/fluid/operators/elementwise/elementwise_div_op_mlu.cc
Deleted  100644 → 0
/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <memory>
#include <string>
#include "paddle/fluid/operators/elementwise/elementwise_div_op.h"
#include "paddle/fluid/operators/elementwise/elementwise_mlu.h"
namespace paddle {
namespace operators {

template <typename T>
class ElementwiseDivMLUKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& ctx) const override {
    MLUBinaryOp<DIV, T>(ctx);
  }
};

template <typename T>
class ElementwiseDivGradMLUKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& ctx) const override {
    auto* out = ctx.Input<phi::DenseTensor>("Out");
    auto* x = ctx.Input<phi::DenseTensor>("X");
    auto* y = ctx.Input<phi::DenseTensor>("Y");
    auto* dout = ctx.Input<phi::DenseTensor>(framework::GradVarName("Out"));
    auto* dx = ctx.Output<phi::DenseTensor>(framework::GradVarName("X"));
    auto* dy = ctx.Output<phi::DenseTensor>(framework::GradVarName("Y"));
    int axis = ctx.Attr<int>("axis");

    const auto& x_dims = x->dims();
    const auto& y_dims = y->dims();
    axis = (axis < 0 ? (std::abs(x_dims.size() - y_dims.size()) + axis + 1) : axis);
    int max_dim = std::max(x_dims.size(), y_dims.size());
    std::vector<int> x_dims_array(max_dim);
    std::vector<int> y_dims_array(max_dim);
    std::vector<int> out_dims_array(max_dim);
    GetBroadcastDimsArrays(x_dims, y_dims, x_dims_array.data(), y_dims_array.data(),
                           out_dims_array.data(), max_dim, axis);

    MLUCnnlTensorDesc x_desc(max_dim, x_dims_array.data(), ToCnnlDataType<T>());
    MLUCnnlTensorDesc y_desc(max_dim, y_dims_array.data(), ToCnnlDataType<T>());
    MLUCnnlTensorDesc dout_desc(*dout);
    MLUCnnlOpTensorDesc mul_op_desc(CNNL_OP_TENSOR_MUL, ToCnnlDataType<T>(),
                                    CNNL_NOT_PROPAGATE_NAN);

    // compute dout/y == 1/y * dout
    phi::DenseTensor dout_div_y(dout->dtype());
    dout_div_y.Resize(dout->dims());
    dout_div_y.mutable_data<T>(ctx.GetPlace());
    MLUBinary<DIV>(ctx, CNNL_COMPUTATION_HIGH_PRECISION, dout_desc.get(),
                   GetBasePtr(dout), y_desc.get(), GetBasePtr(y),
                   dout_desc.get(), GetBasePtr(&dout_div_y));

    if (dx) {
      // compute dx = dout/y = 1/y * dout
      if (dx->dims() != dout->dims()) {
        dx->mutable_data<T>(ctx.GetPlace());
        std::vector<int> reduce_axes;
        GetReduceAxes(axis, dout_div_y.dims(), dx->dims(), &reduce_axes);

        MLUCnnlReduceDesc reduction_desc(reduce_axes, CNNL_REDUCE_ADD,
                                         ToCnnlDataType<T>(), CNNL_NOT_PROPAGATE_NAN,
                                         CNNL_REDUCE_NO_INDICES, CNNL_32BIT_INDICES);
        MLUCnnlTensorDesc dx_desc(*dx);
        MLUCnnl::Reduce(ctx, true /*need_workspace*/, reduction_desc.get(), nullptr,
                        dout_desc.get(), GetBasePtr(&dout_div_y), 0, nullptr, nullptr,
                        dx_desc.get(), GetBasePtr(dx));
      } else {
        dx->ShareDataWith(dout_div_y);
      }
    }

    if (dy) {
      // compute dy = -out * (dout/y) = -out/y * dout
      phi::DenseTensor neg_out(out->type());
      neg_out.mutable_data<T>(out->dims(), ctx.GetPlace());

      MLUCnnlTensorDesc out_desc(*out);
      MLUUnary<NEG>(ctx, CNNL_COMPUTATION_HIGH_PRECISION, out_desc.get(),
                    GetBasePtr(out), out_desc.get(), GetBasePtr(&neg_out));

      phi::DenseTensor dy_temp(y->dtype());
      dy_temp.Resize(dout->dims());
      dy_temp.mutable_data<T>(ctx.GetPlace());

      MLUCnnl::OpTensor(ctx, mul_op_desc.get(), dout_desc.get(), GetBasePtr(&neg_out),
                        dout_desc.get(), GetBasePtr(&dout_div_y), dout_desc.get(),
                        GetBasePtr(&dy_temp), ToCnnlDataType<T>());

      if (dy->dims() != dout->dims()) {
        dy->mutable_data<T>(ctx.GetPlace());
        std::vector<int> reduce_axes;
        GetReduceAxes(axis, dy_temp.dims(), dy->dims(), &reduce_axes);

        MLUCnnlReduceDesc reduction_desc(reduce_axes, CNNL_REDUCE_ADD,
                                         ToCnnlDataType<T>(), CNNL_NOT_PROPAGATE_NAN,
                                         CNNL_REDUCE_NO_INDICES, CNNL_32BIT_INDICES);
        MLUCnnlTensorDesc dy_desc(*dy);
        MLUCnnl::Reduce(ctx, true /*need_workspace*/, reduction_desc.get(), nullptr,
                        dout_desc.get(), GetBasePtr(&dy_temp), 0, nullptr, nullptr,
                        dy_desc.get(), GetBasePtr(dy));
      } else {
        dy->ShareDataWith(dy_temp);
      }
    }
  }
};

}  // namespace operators
}  // namespace paddle

namespace ops = paddle::operators;
namespace plat = paddle::platform;

REGISTER_OP_MLU_KERNEL(elementwise_div,
                       ops::ElementwiseDivMLUKernel<int>,
                       ops::ElementwiseDivMLUKernel<float>,
                       ops::ElementwiseDivMLUKernel<plat::float16>);

REGISTER_OP_MLU_KERNEL(elementwise_div_grad,
                       ops::ElementwiseDivGradMLUKernel<int>,
                       ops::ElementwiseDivGradMLUKernel<float>,
                       ops::ElementwiseDivGradMLUKernel<plat::float16>);
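
Side note (not part of the deleted file): the comments in the grad kernel follow directly from differentiating Out = X / Y,

\[
\frac{\partial \mathrm{Out}}{\partial X} = \frac{1}{Y}
\;\Rightarrow\; dX = \frac{d\mathrm{Out}}{Y},
\qquad
\frac{\partial \mathrm{Out}}{\partial Y} = -\frac{X}{Y^{2}} = -\frac{\mathrm{Out}}{Y}
\;\Rightarrow\; dY = -\mathrm{Out}\cdot\frac{d\mathrm{Out}}{Y},
\]

which is why the kernel computes dout_div_y = dOut / Y once, reuses it as dX, and multiplies it by Neg(Out) to obtain dY, reducing over broadcast axes exactly as in the elementwise_add grad kernel above.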
paddle/fluid/operators/elementwise/elementwise_max_op_mlu.cc
Deleted  100644 → 0
/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifdef PADDLE_WITH_MLU
#include "paddle/fluid/operators/elementwise/elementwise_mlu.h"
#include "paddle/fluid/operators/elementwise/elementwise_op.h"
namespace paddle {
namespace operators {

template <typename T>
class ElementwiseMaxMLUKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& ctx) const override {
    MLUBinaryOp<MAXIMUM, T>(ctx);
  }
};

template <typename T>
class ElementwiseMaxGradMLUKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& ctx) const override {
    MLUMinMaxGradHelper<MAXIMUM_GRAD, T>(ctx);
  }
};

}  // namespace operators
}  // namespace paddle

namespace ops = paddle::operators;

REGISTER_OP_MLU_KERNEL(elementwise_max,
                       ops::ElementwiseMaxMLUKernel<int>,
                       ops::ElementwiseMaxMLUKernel<float>,
                       ops::ElementwiseMaxMLUKernel<paddle::platform::float16>);

REGISTER_OP_MLU_KERNEL(elementwise_max_grad,
                       ops::ElementwiseMaxGradMLUKernel<int>,
                       ops::ElementwiseMaxGradMLUKernel<float>,
                       ops::ElementwiseMaxGradMLUKernel<paddle::platform::float16>);
#endif
paddle/fluid/operators/elementwise/elementwise_min_op_mlu.cc
Deleted  100644 → 0
/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <memory>
#include <string>
#include "paddle/fluid/operators/elementwise/elementwise_mlu.h"
namespace paddle {
namespace operators {

template <typename T>
class ElementwiseMinMLUKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& ctx) const override {
    MLUBinaryOp<MINIMUM, T>(ctx);
  }
};

template <typename T>
class ElementwiseMinGradMLUKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& ctx) const override {
    MLUMinMaxGradHelper<MINIMUM_GRAD, T>(ctx);
  }
};

}  // namespace operators
}  // namespace paddle

namespace ops = paddle::operators;
namespace plat = paddle::platform;

REGISTER_OP_MLU_KERNEL(elementwise_min,
                       ops::ElementwiseMinMLUKernel<int>,
                       ops::ElementwiseMinMLUKernel<float>,
                       ops::ElementwiseMinMLUKernel<plat::float16>);

REGISTER_OP_MLU_KERNEL(elementwise_min_grad,
                       ops::ElementwiseMinGradMLUKernel<int>,
                       ops::ElementwiseMinGradMLUKernel<float>,
                       ops::ElementwiseMinGradMLUKernel<plat::float16>);
paddle/fluid/operators/elementwise/elementwise_mlu.h
Deleted  100644 → 0
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#ifdef PADDLE_WITH_MLU
#include <vector>
#include "paddle/fluid/operators/elementwise/elementwise_op.h"
#include "paddle/fluid/operators/mlu/mlu_baseop.h"
namespace paddle {
namespace operators {

inline void GetReduceAxes(const int axis,
                          const framework::DDim& src_ddims,
                          const framework::DDim& target_ddims,
                          std::vector<int>* axes) {
  int64_t src_dim_size = src_ddims.size();
  int64_t target_dim_size = target_ddims.size();
  for (int64_t i = 0; i < src_dim_size; ++i) {
    if (i < axis || i >= target_dim_size + axis) {
      axes->push_back(i);
      continue;
    }
    if (src_ddims[i] > target_ddims[i - axis]) {
      axes->push_back(i);
    }
  }
}

inline void GetReduceAxesAndDstDims(const int axis,
                                    const framework::DDim& src_ddims,
                                    const framework::DDim& target_ddims,
                                    std::vector<int>* reduce_axes,
                                    std::vector<int>* dst_dims_vec) {
  int64_t src_dim_size = src_ddims.size();
  int64_t target_dim_size = target_ddims.size();
  int src_axis = (target_dim_size < src_dim_size ? axis : 0);
  for (int ax = 0; ax < src_dim_size; ++ax) {
    if ((ax < src_axis || ax >= src_axis + target_dim_size) ||
        (src_ddims[ax] > 1 && target_ddims[ax - src_axis] == 1)) {
      reduce_axes->push_back(ax);
    } else {
      dst_dims_vec->push_back(src_ddims[ax]);
    }
  }
  if (dst_dims_vec->size() == 0) {
    // target_var is scalar
    dst_dims_vec->push_back(1);
  }
}

template <typename T>
void MLUOpTensorKernel(const framework::ExecutionContext& ctx,
                       const cnnlOpTensorDesc_t op_tensor_op) {
  PADDLE_ENFORCE_EQ(platform::is_mlu_place(ctx.GetPlace()),
                    true,
                    platform::errors::Unavailable("This kernel only runs on MLU."));
  PADDLE_ENFORCE_EQ((op_tensor_op == CNNL_OP_TENSOR_ADD) ||
                        (op_tensor_op == CNNL_OP_TENSOR_SUB) ||
                        (op_tensor_op == CNNL_OP_TENSOR_MUL),
                    true,
                    platform::errors::Unavailable(
                        "This kernel of MLU only support ADD, SUB, MUL."));

  auto* x = ctx.Input<phi::DenseTensor>("X");
  auto* y = ctx.Input<phi::DenseTensor>("Y");
  auto* out = ctx.Output<phi::DenseTensor>("Out");
  out->mutable_data<T>(ctx.GetPlace());

  int axis = ctx.Attr<int>("axis");
  const auto& x_dims = x->dims();
  const auto& y_dims = y->dims();
  axis = (axis < 0 ? (std::abs(x_dims.size() - y_dims.size()) + axis + 1) : axis);
  int max_dim = std::max(x_dims.size(), y_dims.size());
  std::vector<int> x_dims_array(max_dim);
  std::vector<int> y_dims_array(max_dim);
  std::vector<int> out_dims_array(max_dim);
  GetBroadcastDimsArrays(x_dims, y_dims, x_dims_array.data(), y_dims_array.data(),
                         out_dims_array.data(), max_dim, axis);

  MLUCnnlTensorDesc x_desc(max_dim, x_dims_array.data(), ToCnnlDataType<T>());
  MLUCnnlTensorDesc y_desc(max_dim, y_dims_array.data(), ToCnnlDataType<T>());
  MLUCnnlTensorDesc out_desc(*out);
  MLUCnnlOpTensorDesc op_tensor_desc(op_tensor_op, ToCnnlDataType<T>(),
                                     CNNL_NOT_PROPAGATE_NAN);

  MLUCnnl::OpTensor(ctx, op_tensor_desc.get(), x_desc.get(), GetBasePtr(x),
                    y_desc.get(), GetBasePtr(y), out_desc.get(), GetBasePtr(out),
                    ToCnnlDataType<T>());
}

// ------------------ BinaryOp -----------------
enum BINARY_FUNCTOR {
  DIV,
  DIVNONAN,
  MAXIMUM,
  MINIMUM,
  POW,
};

template <BINARY_FUNCTOR func>
void MLUBinary(const framework::ExecutionContext& ctx,
               cnnlComputationPreference_t prefer,
               const cnnlTensorDescriptor_t x_desc,
               const void* x,
               const cnnlTensorDescriptor_t y_desc,
               const void* y,
               const cnnlTensorDescriptor_t out_desc,
               void* out);

template <>
inline void MLUBinary<DIV>(const framework::ExecutionContext& ctx,
                           cnnlComputationPreference_t prefer,
                           const cnnlTensorDescriptor_t x_desc,
                           const void* x,
                           const cnnlTensorDescriptor_t y_desc,
                           const void* y,
                           const cnnlTensorDescriptor_t out_desc,
                           void* out) {
  MLUCnnl::Div(ctx, prefer, x_desc, x, y_desc, y, out_desc, out);
}

template <>
inline void MLUBinary<MAXIMUM>(
    const framework::ExecutionContext& ctx,
    cnnlComputationPreference_t prefer,  // useless, only for compatible
    const cnnlTensorDescriptor_t x_desc,
    const void* x,
    const cnnlTensorDescriptor_t y_desc,
    const void* y,
    const cnnlTensorDescriptor_t out_desc,
    void* out) {
  MLUCnnl::Maximum(ctx, x_desc, x, y_desc, y, out_desc, out);
}

template <>
inline void MLUBinary<MINIMUM>(const framework::ExecutionContext& ctx,
                               cnnlComputationPreference_t prefer,
                               const cnnlTensorDescriptor_t in1_desc,
                               const void* in1,
                               const cnnlTensorDescriptor_t in2_desc,
                               const void* in2,
                               const cnnlTensorDescriptor_t out_desc,
                               void* out) {
  MLUCnnl::Minimum(ctx, in1_desc, in1, in2_desc, in2, out_desc, out);
}

template <>
inline void MLUBinary<POW>(const framework::ExecutionContext& ctx,
                           cnnlComputationPreference_t prefer,
                           const cnnlTensorDescriptor_t x_desc,
                           const void* x,
                           const cnnlTensorDescriptor_t y_desc,
                           const void* y,
                           const cnnlTensorDescriptor_t out_desc,
                           void* out) {
  MLUCnnl::Pow(ctx, prefer, x_desc, x, y_desc, y, out_desc, out);
}

template <BINARY_FUNCTOR Functor, typename T>
void MLUBinaryOp(const framework::ExecutionContext& ctx) {
  auto* x = ctx.Input<phi::DenseTensor>("X");
  auto* y = ctx.Input<phi::DenseTensor>("Y");
  auto* out = ctx.Output<phi::DenseTensor>("Out");
  out->mutable_data<T>(ctx.GetPlace());

  int axis = ctx.Attr<int>("axis");
  const auto& x_dims = x->dims();
  const auto& y_dims = y->dims();
  axis = (axis < 0 ? (std::abs(x_dims.size() - y_dims.size()) + axis + 1) : axis);
  int max_dim = std::max(x_dims.size(), y_dims.size());
  std::vector<int> x_dims_array(max_dim);
  std::vector<int> y_dims_array(max_dim);
  std::vector<int> out_dims_array(max_dim);
  GetBroadcastDimsArrays(x_dims, y_dims, x_dims_array.data(), y_dims_array.data(),
                         out_dims_array.data(), max_dim, axis);

  MLUCnnlTensorDesc x_desc(max_dim, x_dims_array.data(), ToCnnlDataType<T>());
  MLUCnnlTensorDesc y_desc(max_dim, y_dims_array.data(), ToCnnlDataType<T>());
  MLUCnnlTensorDesc out_desc(*out, CNNL_LAYOUT_ARRAY, ToCnnlDataType<T>());

  cnnlComputationPreference_t prefer_type = CNNL_COMPUTATION_HIGH_PRECISION;
  MLUBinary<Functor>(ctx, prefer_type, x_desc.get(), GetBasePtr(x), y_desc.get(),
                     GetBasePtr(y), out_desc.get(), GetBasePtr(out));
}

// ------------------ UnaryOp -----------------
enum UNARY_FUNCTOR {
  NEG,
  RECIPROCAL,
};

template <UNARY_FUNCTOR func>
void MLUUnary(const framework::ExecutionContext& ctx,
              cnnlComputationPreference_t prefer,
              const cnnlTensorDescriptor_t input_desc,
              const void* input,
              const cnnlTensorDescriptor_t output_desc,
              void* output);

template <>
inline void MLUUnary<NEG>(const framework::ExecutionContext& ctx,
                          cnnlComputationPreference_t prefer,
                          const cnnlTensorDescriptor_t input_desc,
                          const void* input,
                          const cnnlTensorDescriptor_t output_desc,
                          void* output) {
  MLUCnnl::Neg(ctx, input_desc, input, output_desc, output);
}

template <>
inline void MLUUnary<RECIPROCAL>(const framework::ExecutionContext& ctx,
                                 cnnlComputationPreference_t prefer,
                                 const cnnlTensorDescriptor_t input_desc,
                                 const void* input,
                                 const cnnlTensorDescriptor_t output_desc,
                                 void* output) {
  MLUCnnl::Reciprocal(ctx, input_desc, input, output_desc, output);
}

template <UNARY_FUNCTOR Functor, typename Tin, typename Tout = Tin>
void MLUUnaryOp(const framework::ExecutionContext& ctx) {
  auto* x = ctx.Input<phi::DenseTensor>("X");
  auto* out = ctx.Output<phi::DenseTensor>("Out");
  out->mutable_data<Tout>(ctx.GetPlace());

  MLUCnnlTensorDesc x_desc(x, CNNL_LAYOUT_ARRAY, ToCnnlDataType<Tin>());
  MLUCnnlTensorDesc out_desc(*out, CNNL_LAYOUT_ARRAY, ToCnnlDataType<Tout>());

  cnnlComputationPreference_t prefer_type = CNNL_COMPUTATION_HIGH_PRECISION;
  MLUUnary<Functor>(ctx, prefer_type, x_desc.get(), GetBasePtr(x), out_desc.get(),
                    GetBasePtr(out));
}

// ------------------ MLUElementwiseGradOp -----------------
enum MINMAX_GRAD_FUNCTOR {
  MAXIMUM_GRAD,
  MINIMUM_GRAD,
};

template <MINMAX_GRAD_FUNCTOR Functor, typename Tin, typename Tout = Tin>
void MLUMinMaxGradHelper(const framework::ExecutionContext& ctx) {
  auto* x = ctx.Input<phi::DenseTensor>("X");
  auto* y = ctx.Input<phi::DenseTensor>("Y");
  auto* dout = ctx.Input<phi::DenseTensor>(framework::GradVarName("Out"));
  auto* dx = ctx.Output<phi::DenseTensor>(framework::GradVarName("X"));
  auto* dy = ctx.Output<phi::DenseTensor>(framework::GradVarName("Y"));
  int axis = ctx.Attr<int>("axis");

  const auto& x_dims = x->dims();
  const auto& y_dims = y->dims();
  axis = (axis < 0 ? (std::abs(x_dims.size() - y_dims.size()) + axis + 1) : axis);
  int max_dim = std::max(x_dims.size(), y_dims.size());
  std::vector<int> x_dims_array(max_dim);
  std::vector<int> y_dims_array(max_dim);
  std::vector<int> out_dims_array(max_dim);
  GetBroadcastDimsArrays(x_dims, y_dims, x_dims_array.data(), y_dims_array.data(),
                         out_dims_array.data(), max_dim, axis);

  // mask = Logic(x, y) only support min & max
  cnnlLogicOp_t logic =
      Functor == MAXIMUM_GRAD ? CNNL_LOGIC_OP_GE : CNNL_LOGIC_OP_LE;
  phi::DenseTensor mask(x->dtype());
  mask.Resize
(
phi
::
make_ddim
(
out_dims_array
));
mask
.
mutable_data
<
Tin
>
(
ctx
.
GetPlace
());
cnnlDataType_t
data_type
=
ToCnnlDataType
<
Tin
>
();
MLUCnnlTensorDesc
x_desc
(
max_dim
,
x_dims_array
.
data
(),
data_type
);
MLUCnnlTensorDesc
y_desc
(
max_dim
,
y_dims_array
.
data
(),
data_type
);
MLUCnnlTensorDesc
mask_desc
(
max_dim
,
out_dims_array
.
data
(),
data_type
);
MLUCnnl
::
Logic
(
ctx
,
logic
,
x_desc
.
get
(),
GetBasePtr
(
x
),
y_desc
.
get
(),
GetBasePtr
(
y
),
mask_desc
.
get
(),
GetBasePtr
(
&
mask
));
// dx = Mul(dz, mask)
phi
::
DenseTensor
dx_temp
(
x
->
dtype
());
dx_temp
.
Resize
(
dout
->
dims
());
dx_temp
.
mutable_data
<
Tout
>
(
ctx
.
GetPlace
());
MLUCnnlTensorDesc
dout_desc
(
*
dout
);
MLUCnnlOpTensorDesc
mul_op_desc
(
CNNL_OP_TENSOR_MUL
,
data_type
,
CNNL_NOT_PROPAGATE_NAN
);
MLUCnnl
::
OpTensor
(
ctx
,
mul_op_desc
.
get
(),
dout_desc
.
get
(),
GetBasePtr
(
dout
),
dout_desc
.
get
(),
GetBasePtr
(
&
mask
),
dout_desc
.
get
(),
GetBasePtr
(
&
dx_temp
),
data_type
);
// dy = Sub(dz, dx)
phi
::
DenseTensor
dy_temp
(
y
->
dtype
());
dy_temp
.
Resize
(
dout
->
dims
());
dy_temp
.
mutable_data
<
Tout
>
(
ctx
.
GetPlace
());
MLUCnnlOpTensorDesc
sub_op_desc
(
CNNL_OP_TENSOR_SUB
,
data_type
,
CNNL_NOT_PROPAGATE_NAN
);
MLUCnnl
::
OpTensor
(
ctx
,
sub_op_desc
.
get
(),
dout_desc
.
get
(),
GetBasePtr
(
dout
),
dout_desc
.
get
(),
GetBasePtr
(
&
dx_temp
),
dout_desc
.
get
(),
GetBasePtr
(
&
dy_temp
),
data_type
);
if
(
dx
)
{
if
(
dx
->
dims
()
!=
dout
->
dims
())
{
dx
->
mutable_data
<
Tout
>
(
ctx
.
GetPlace
());
std
::
vector
<
int
>
reduce_axes
;
GetReduceAxes
(
axis
,
dx_temp
.
dims
(),
dx
->
dims
(),
&
reduce_axes
);
MLUCnnlReduceDesc
reduction_desc
(
reduce_axes
,
CNNL_REDUCE_ADD
,
data_type
,
CNNL_NOT_PROPAGATE_NAN
,
CNNL_REDUCE_NO_INDICES
,
CNNL_32BIT_INDICES
);
MLUCnnlTensorDesc
dx_desc
(
*
dx
);
MLUCnnl
::
Reduce
(
ctx
,
true
/*need_workspace*/
,
reduction_desc
.
get
(),
nullptr
,
dout_desc
.
get
(),
GetBasePtr
(
&
dx_temp
),
0
,
nullptr
,
nullptr
,
dx_desc
.
get
(),
GetBasePtr
(
dx
));
}
else
{
dx
->
ShareDataWith
(
dx_temp
);
}
}
if
(
dy
)
{
if
(
dy
->
dims
()
!=
dout
->
dims
())
{
dy
->
mutable_data
<
Tout
>
(
ctx
.
GetPlace
());
std
::
vector
<
int
>
reduce_axes
;
GetReduceAxes
(
axis
,
dy_temp
.
dims
(),
dy
->
dims
(),
&
reduce_axes
);
MLUCnnlReduceDesc
reduction_desc
(
reduce_axes
,
CNNL_REDUCE_ADD
,
data_type
,
CNNL_NOT_PROPAGATE_NAN
,
CNNL_REDUCE_NO_INDICES
,
CNNL_32BIT_INDICES
);
MLUCnnlTensorDesc
dy_desc
(
*
dy
);
MLUCnnl
::
Reduce
(
ctx
,
true
/*need_workspace*/
,
reduction_desc
.
get
(),
nullptr
,
dout_desc
.
get
(),
GetBasePtr
(
&
dy_temp
),
0
,
nullptr
,
nullptr
,
dy_desc
.
get
(),
GetBasePtr
(
dy
));
}
else
{
dy
->
ShareDataWith
(
dy_temp
);
}
}
}
}
// namespace operators
}
// namespace paddle
#endif
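The header above is the whole interface the per-op MLU files below rely on: MLUOpTensorKernel for OpTensor-style binaries, MLUBinaryOp/MLUUnaryOp for functor-dispatched ops, and MLUMinMaxGradHelper for the mask-based max/min gradient (dx = dout * mask, dy = dout - dx, then a sum over the broadcast axes). A minimal usage sketch follows; the kernel class names and the elementwise_max registration are hypothetical illustrations of the pattern, not the contents of any of the deleted files.

// Hypothetical sketch of how a kernel pair would consume elementwise_mlu.h;
// it only uses symbols declared in that header.
#include "paddle/fluid/operators/elementwise/elementwise_mlu.h"

namespace paddle {
namespace operators {

template <typename T>
class ElementwiseMaxMLUKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& ctx) const override {
    // Forward: broadcast X and Y, then dispatch the MAXIMUM binary functor.
    MLUBinaryOp<MAXIMUM, T>(ctx);
  }
};

template <typename T>
class ElementwiseMaxGradMLUKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& ctx) const override {
    // Backward: dx = dout * 1[x >= y], dy = dout - dx, each reduced over the
    // broadcast axes whenever its shape is smaller than dout.
    MLUMinMaxGradHelper<MAXIMUM_GRAD, T>(ctx);
  }
};

}  // namespace operators
}  // namespace paddle

namespace ops = paddle::operators;
REGISTER_OP_MLU_KERNEL(elementwise_max,
                       ops::ElementwiseMaxMLUKernel<float>,
                       ops::ElementwiseMaxMLUKernel<paddle::platform::float16>);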
paddle/fluid/operators/elementwise/elementwise_mul_op_mlu.cc (deleted, 100644 → 0)
/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/elementwise/elementwise_mlu.h"
namespace paddle {
namespace operators {

using MLUDeviceContext = platform::MLUDeviceContext;

template <typename T>
class ElementwiseMulMLUKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& ctx) const override {
    MLUOpTensorKernel<T>(ctx, CNNL_OP_TENSOR_MUL);
  }
};

template <typename T>
class ElementwiseMulGradMLUKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& ctx) const override {
    auto* x = ctx.Input<phi::DenseTensor>("X");
    auto* y = ctx.Input<phi::DenseTensor>("Y");
    auto* dout = ctx.Input<phi::DenseTensor>(framework::GradVarName("Out"));
    auto* dx = ctx.Output<phi::DenseTensor>(framework::GradVarName("X"));
    auto* dy = ctx.Output<phi::DenseTensor>(framework::GradVarName("Y"));
    int axis = ctx.Attr<int>("axis");

    const auto& x_dims = x->dims();
    const auto& y_dims = y->dims();
    axis = (axis < 0 ? (std::abs(x_dims.size() - y_dims.size()) + axis + 1)
                     : axis);
    int max_dim = std::max(x_dims.size(), y_dims.size());
    std::vector<int> x_dims_array(max_dim);
    std::vector<int> y_dims_array(max_dim);
    std::vector<int> out_dims_array(max_dim);
    GetBroadcastDimsArrays(x_dims, y_dims, x_dims_array.data(),
                           y_dims_array.data(), out_dims_array.data(),
                           max_dim, axis);

    MLUCnnlTensorDesc x_desc(max_dim, x_dims_array.data(),
                             ToCnnlDataType<T>());
    MLUCnnlTensorDesc y_desc(max_dim, y_dims_array.data(),
                             ToCnnlDataType<T>());
    MLUCnnlTensorDesc dout_desc(*dout);
    MLUCnnlOpTensorDesc mul_op_desc(CNNL_OP_TENSOR_MUL, ToCnnlDataType<T>(),
                                    CNNL_NOT_PROPAGATE_NAN);

    if (dx) {
      dx->mutable_data<T>(ctx.GetPlace());
      if (dx->dims() == dout->dims()) {
        MLUCnnl::OpTensor(ctx, mul_op_desc.get(), dout_desc.get(),
                          GetBasePtr(dout), y_desc.get(), GetBasePtr(y),
                          x_desc.get(), GetBasePtr(dx), ToCnnlDataType<T>());
      } else {
        phi::DenseTensor dx_temp(x->dtype());
        dx_temp.Resize(dout->dims());
        dx_temp.mutable_data<T>(ctx.GetPlace());
        MLUCnnl::OpTensor(ctx, mul_op_desc.get(), dout_desc.get(),
                          GetBasePtr(dout), y_desc.get(), GetBasePtr(y),
                          dout_desc.get(), GetBasePtr(&dx_temp),
                          ToCnnlDataType<T>());

        std::vector<int> reduce_axes;
        GetReduceAxes(axis, dx_temp.dims(), dx->dims(), &reduce_axes);
        MLUCnnlReduceDesc reduction_desc(
            reduce_axes, CNNL_REDUCE_ADD, ToCnnlDataType<T>(),
            CNNL_NOT_PROPAGATE_NAN, CNNL_REDUCE_NO_INDICES, CNNL_32BIT_INDICES);
        MLUCnnlTensorDesc dx_desc(*dx);
        MLUCnnl::Reduce(ctx, true /*need_workspace*/, reduction_desc.get(),
                        nullptr, dout_desc.get(), GetBasePtr(&dx_temp), 0,
                        nullptr, nullptr, dx_desc.get(), GetBasePtr(dx));
      }
    }
    if (dy) {
      dy->mutable_data<T>(ctx.GetPlace());
      if (dy->dims() == dout->dims()) {
        MLUCnnl::OpTensor(ctx, mul_op_desc.get(), dout_desc.get(),
                          GetBasePtr(dout), x_desc.get(), GetBasePtr(x),
                          y_desc.get(), GetBasePtr(dy), ToCnnlDataType<T>());
      } else {
        phi::DenseTensor dy_temp(y->dtype());
        dy_temp.Resize(dout->dims());
        dy_temp.mutable_data<T>(ctx.GetPlace());
        MLUCnnl::OpTensor(ctx, mul_op_desc.get(), dout_desc.get(),
                          GetBasePtr(dout), x_desc.get(), GetBasePtr(x),
                          dout_desc.get(), GetBasePtr(&dy_temp),
                          ToCnnlDataType<T>());

        std::vector<int> reduce_axes;
        GetReduceAxes(axis, dy_temp.dims(), dy->dims(), &reduce_axes);
        MLUCnnlReduceDesc reduction_desc(
            reduce_axes, CNNL_REDUCE_ADD, ToCnnlDataType<T>(),
            CNNL_NOT_PROPAGATE_NAN, CNNL_REDUCE_NO_INDICES, CNNL_32BIT_INDICES);
        MLUCnnlTensorDesc dy_desc(*dy);
        MLUCnnl::Reduce(ctx, true /*need_workspace*/, reduction_desc.get(),
                        nullptr, dout_desc.get(), GetBasePtr(&dy_temp), 0,
                        nullptr, nullptr, dy_desc.get(), GetBasePtr(dy));
      }
    }
  }
};

}  // namespace operators
}  // namespace paddle

namespace ops = paddle::operators;

REGISTER_OP_MLU_KERNEL(elementwise_mul,
                       ops::ElementwiseMulMLUKernel<float>,
                       ops::ElementwiseMulMLUKernel<paddle::platform::float16>,
                       ops::ElementwiseMulMLUKernel<int>);
REGISTER_OP_MLU_KERNEL(
    elementwise_mul_grad,
    ops::ElementwiseMulGradMLUKernel<float>,
    ops::ElementwiseMulGradMLUKernel<paddle::platform::float16>,
    ops::ElementwiseMulGradMLUKernel<int>);
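For reference, the grad kernel registered above applies the product rule under broadcasting; written out (my notation, not part of the deleted file):

\[
\frac{\partial L}{\partial x}=\operatorname{reduce\_sum}_{A_x}\Big(\frac{\partial L}{\partial z}\odot y\Big),\qquad
\frac{\partial L}{\partial y}=\operatorname{reduce\_sum}_{A_y}\Big(\frac{\partial L}{\partial z}\odot x\Big),
\]

where A_x and A_y are the axes along which x and y were broadcast. When the shapes already match, the reduction vanishes, which is exactly the dims() == dout->dims() fast path in the code.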
paddle/fluid/operators/elementwise/elementwise_pow_op_mlu.cc (deleted, 100644 → 0)
/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/elementwise/elementwise_mlu.h"
#include "paddle/fluid/operators/elementwise/elementwise_op.h"
namespace paddle {
namespace operators {

template <typename T>
class ElementwisePowMLUKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& ctx) const override {
    MLUBinaryOp<POW, T>(ctx);
  }
};

template <typename T>
class ElementwisePowGradMLUKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& ctx) const override {
    auto* x = ctx.Input<phi::DenseTensor>("X");
    auto* y = ctx.Input<phi::DenseTensor>("Y");
    auto* dout = ctx.Input<phi::DenseTensor>(framework::GradVarName("Out"));
    auto* dx = ctx.Output<phi::DenseTensor>(framework::GradVarName("X"));
    auto* dy = ctx.Output<phi::DenseTensor>(framework::GradVarName("Y"));
    int axis = ctx.Attr<int>("axis");
    auto place = ctx.GetPlace();

    auto x_dims = x->dims();
    auto y_dims = y->dims();
    axis =
        (axis < 0 ? std::abs(x_dims.size() - y_dims.size()) + axis + 1 : axis);
    int max_dim = std::max(x_dims.size(), y_dims.size());
    std::vector<int> x_dims_array(max_dim);
    std::vector<int> y_dims_array(max_dim);
    std::vector<int> out_dims_array(max_dim);
    GetBroadcastDimsArrays(x_dims, y_dims, x_dims_array.data(),
                           y_dims_array.data(), out_dims_array.data(),
                           max_dim, axis);
    cnnlDataType_t data_type = ToCnnlDataType<T>();
    MLUCnnlTensorDesc x_desc(max_dim, x_dims_array.data(), data_type);
    MLUCnnlTensorDesc y_desc(max_dim, y_dims_array.data(), data_type);
    MLUCnnlTensorDesc out_desc(max_dim, out_dims_array.data(), data_type);

    auto dout_dims = dout->dims();
    if (dx) {
      // dx = dout * y * pow(x, y - 1);
      phi::DenseTensor one_dx(y->type());
      one_dx.mutable_data<T>(phi::make_ddim(y_dims_array), place);
      FillMLUTensorWithHostValue(ctx, static_cast<T>(1), &one_dx);

      phi::DenseTensor sub_dx(y->type());
      sub_dx.mutable_data<T>(phi::make_ddim(y_dims_array), place);
      MLUCnnlOpTensorDesc op_tensor_desc(CNNL_OP_TENSOR_SUB, data_type,
                                         CNNL_NOT_PROPAGATE_NAN);
      MLUCnnl::OpTensor(ctx, op_tensor_desc.get(), y_desc.get(), GetBasePtr(y),
                        y_desc.get(), GetBasePtr(&one_dx), y_desc.get(),
                        GetBasePtr(&sub_dx), data_type);

      phi::DenseTensor tmp_dx(x->type());
      tmp_dx.mutable_data<T>(phi::make_ddim(out_dims_array), place);
      MLUCnnl::Pow(ctx, CNNL_COMPUTATION_HIGH_PRECISION, x_desc.get(),
                   GetBasePtr(x), y_desc.get(), GetBasePtr(&sub_dx),
                   out_desc.get(), GetBasePtr(&tmp_dx));
      MLUCnnl::MulAx(ctx, y_desc.get(), GetBasePtr(y), out_desc.get(),
                     GetBasePtr(&tmp_dx));
      MLUCnnl::MulAx(ctx, out_desc.get(), GetBasePtr(dout), out_desc.get(),
                     GetBasePtr(&tmp_dx));

      if (x_dims != dout_dims) {
        dx->mutable_data<T>(place);
        std::vector<int> reduce_axes;
        GetReduceAxes(axis, dout_dims, x_dims, &reduce_axes);
        if (!reduce_axes.empty()) {
          MLUCnnlReduceDesc reduction_desc(
              reduce_axes, CNNL_REDUCE_ADD, data_type, CNNL_NOT_PROPAGATE_NAN,
              CNNL_REDUCE_NO_INDICES, CNNL_32BIT_INDICES);
          MLUCnnlTensorDesc dx_desc(*dx);
          MLUCnnl::Reduce(ctx, true /*need_workspace*/, reduction_desc.get(),
                          nullptr, out_desc.get(), GetBasePtr(&tmp_dx), 0,
                          nullptr, nullptr, dx_desc.get(), GetBasePtr(dx));
        }
      } else {
        dx->ShareDataWith(tmp_dx);
      }
    }
    if (dy) {
      // dy = dout * log(x) * pow(x, y)
      phi::DenseTensor tmp_dy(y->type());
      tmp_dy.mutable_data<T>(phi::make_ddim(out_dims_array), place);
      MLUCnnl::Pow(ctx, CNNL_COMPUTATION_HIGH_PRECISION, x_desc.get(),
                   GetBasePtr(x), y_desc.get(), GetBasePtr(y), out_desc.get(),
                   GetBasePtr(&tmp_dy));

      phi::DenseTensor log_x(x->type());
      log_x.mutable_data<T>(x->dims(), place);
      MLUCnnl::Log(ctx, CNNL_COMPUTATION_HIGH_PRECISION, CNNL_LOG_E,
                   x_desc.get(), GetBasePtr(x), x_desc.get(),
                   GetBasePtr(&log_x));
      MLUCnnl::MulAx(ctx, x_desc.get(), GetBasePtr(&log_x), out_desc.get(),
                     GetBasePtr(&tmp_dy));
      MLUCnnl::MulAx(ctx, out_desc.get(), GetBasePtr(dout), out_desc.get(),
                     GetBasePtr(&tmp_dy));

      if (y_dims != dout_dims) {
        dy->mutable_data<T>(place);
        std::vector<int> reduce_axes;
        GetReduceAxes(axis, dout_dims, y_dims, &reduce_axes);
        if (!reduce_axes.empty()) {
          MLUCnnlReduceDesc reduction_desc(
              reduce_axes, CNNL_REDUCE_ADD, data_type, CNNL_NOT_PROPAGATE_NAN,
              CNNL_REDUCE_NO_INDICES, CNNL_32BIT_INDICES);
          MLUCnnlTensorDesc dy_desc(*dy);
          MLUCnnl::Reduce(ctx, true /*need_workspace*/, reduction_desc.get(),
                          nullptr, out_desc.get(), GetBasePtr(&tmp_dy), 0,
                          nullptr, nullptr, dy_desc.get(), GetBasePtr(dy));
        }
      } else {
        dy->ShareDataWith(tmp_dy);
      }
    }
    if (!dx && !dy) {
      PADDLE_THROW(platform::errors::Unavailable(
          "Not support all outputs to be empty."));
    }
  }
};

}  // namespace operators
}  // namespace paddle

namespace ops = paddle::operators;
namespace plat = paddle::platform;

REGISTER_OP_MLU_KERNEL(elementwise_pow,
                       ops::ElementwisePowMLUKernel<plat::float16>,
                       ops::ElementwisePowMLUKernel<float>);
REGISTER_OP_MLU_KERNEL(elementwise_pow_grad,
                       ops::ElementwisePowGradMLUKernel<plat::float16>,
                       ops::ElementwisePowGradMLUKernel<float>);
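The two comments inside the pow grad kernel ("dx = dout * y * pow(x, y - 1)" and "dy = dout * log(x) * pow(x, y)") are the chain rule for z = x^y; as a formula (my notation, not from the file):

\[
\frac{\partial z}{\partial x}=y\,x^{\,y-1},\qquad
\frac{\partial z}{\partial y}=x^{y}\ln x,
\]

each term is then multiplied by dout and, as in the other elementwise kernels, summed with CNNL_REDUCE_ADD over any broadcast axes before being written to dx or dy.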
paddle/fluid/operators/elementwise/elementwise_sub_op_mlu.cc (deleted, 100644 → 0)
/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <memory>
#include <string>
#include "paddle/fluid/operators/elementwise/elementwise_mlu.h"
namespace paddle {
namespace operators {

template <typename T>
class ElementwiseSubMLUKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& ctx) const override {
    MLUOpTensorKernel<T>(ctx, CNNL_OP_TENSOR_SUB);
  }
};

template <typename T>
class ElementwiseSubGradMLUKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& ctx) const override {
    auto& dev_ctx =
        ctx.template device_context<paddle::platform::MLUDeviceContext>();
    auto* x = ctx.Input<phi::DenseTensor>("X");
    auto* y = ctx.Input<phi::DenseTensor>("Y");
    auto* dout = ctx.Input<phi::DenseTensor>(framework::GradVarName("Out"));
    auto* dx = ctx.Output<phi::DenseTensor>(framework::GradVarName("X"));
    auto* dy = ctx.Output<phi::DenseTensor>(framework::GradVarName("Y"));
    int axis = ctx.Attr<int>("axis");
    axis = (axis == -1 ? std::abs(x->dims().size() - y->dims().size())
                       : axis);

    MLUCnnlTensorDesc dout_desc(*dout);
    if (dx) {
      dx->mutable_data<T>(ctx.GetPlace());
      if (dx->dims() != dout->dims()) {
        std::vector<int> dst_dims_vec;
        std::vector<int> reduce_axes;
        GetReduceAxesAndDstDims(axis, dout->dims(), dx->dims(), &reduce_axes,
                                &dst_dims_vec);

        MLUCnnlReduceDesc reduction_desc(
            reduce_axes, CNNL_REDUCE_ADD, ToCnnlDataType<T>(),
            CNNL_NOT_PROPAGATE_NAN, CNNL_REDUCE_NO_INDICES, CNNL_32BIT_INDICES);
        MLUCnnlTensorDesc dx_desc(dst_dims_vec.size(), dst_dims_vec.data(),
                                  ToCnnlDataType<T>());
        MLUCnnl::Reduce(ctx, true /*need_workspace*/, reduction_desc.get(),
                        nullptr, dout_desc.get(), GetBasePtr(dout), 0,
                        nullptr, nullptr, dx_desc.get(), GetBasePtr(dx));
      } else {
        framework::TensorCopy(*dout, ctx.GetPlace(), dev_ctx, dx);
      }
    }
    if (dy) {
      dy->mutable_data<T>(ctx.GetPlace());
      phi::DenseTensor* tmp_dout = const_cast<phi::DenseTensor*>(dout);
      if (dy->dims() != dout->dims()) {
        std::vector<int> dst_dims_vec;
        std::vector<int> reduce_axes;
        GetReduceAxesAndDstDims(axis, dout->dims(), dy->dims(), &reduce_axes,
                                &dst_dims_vec);

        MLUCnnlReduceDesc reduction_desc(
            reduce_axes, CNNL_REDUCE_ADD, ToCnnlDataType<T>(),
            CNNL_NOT_PROPAGATE_NAN, CNNL_REDUCE_NO_INDICES, CNNL_32BIT_INDICES);
        MLUCnnlTensorDesc dy_desc(dst_dims_vec.size(), dst_dims_vec.data(),
                                  ToCnnlDataType<T>());
        MLUCnnl::Reduce(ctx, true /*need_workspace*/, reduction_desc.get(),
                        nullptr, dout_desc.get(), GetBasePtr(dout), 0,
                        nullptr, nullptr, dy_desc.get(), GetBasePtr(dy));
        tmp_dout = dy;
      }

      // call neg op, dy = -dout
      MLUCnnlTensorDesc tmp_dout_desc(*tmp_dout);
      MLUCnnlTensorDesc dy_desc(*dy);
      MLUUnary<NEG>(ctx, CNNL_COMPUTATION_HIGH_PRECISION, tmp_dout_desc.get(),
                    GetBasePtr(tmp_dout), dy_desc.get(), GetBasePtr(dy));
    }
  }
};

}  // namespace operators
}  // namespace paddle

namespace ops = paddle::operators;
namespace plat = paddle::platform;

REGISTER_OP_MLU_KERNEL(elementwise_sub,
                       ops::ElementwiseSubMLUKernel<int>,
                       ops::ElementwiseSubMLUKernel<float>,
                       ops::ElementwiseSubMLUKernel<plat::float16>);
REGISTER_OP_MLU_KERNEL(elementwise_sub_grad,
                       ops::ElementwiseSubGradMLUKernel<int>,
                       ops::ElementwiseSubGradMLUKernel<float>,
                       ops::ElementwiseSubGradMLUKernel<plat::float16>);
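The sub grad kernel relies on the derivative of a subtraction being plus or minus one, so (my notation, not from the file):

\[
\frac{\partial L}{\partial x}=\operatorname{reduce\_sum}_{A_x}\Big(\frac{\partial L}{\partial z}\Big),\qquad
\frac{\partial L}{\partial y}=-\operatorname{reduce\_sum}_{A_y}\Big(\frac{\partial L}{\partial z}\Big),
\]

which is why dx is a plain TensorCopy (or a reduce) of dout, while dy takes the extra MLUUnary<NEG> pass.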
paddle/fluid/operators/expand_v2_op.h
@@ -43,13 +43,6 @@ inline std::vector<int> get_expand_shape(
         *shape_tensor, platform::CPUPlace(), &cpu_shape_tensor);
     shape_data = cpu_shape_tensor.data<int>();
   }
 #endif
-#ifdef PADDLE_WITH_MLU
-  if (platform::is_mlu_place(shape_tensor->place())) {
-    paddle::framework::TensorCopySync(
-        *shape_tensor, platform::CPUPlace(), &cpu_shape_tensor);
-    shape_data = cpu_shape_tensor.data<int>();
-  }
-#endif
   auto vec_shape =
       std::vector<int>(shape_data, shape_data + shape_tensor->numel());
@@ -74,13 +67,6 @@ inline std::vector<int> get_expand_shape(
         paddle::framework::TensorCopySync(
             *tensor, platform::CPUPlace(), &temp);
         vec_epxand_shape.push_back(*temp.data<int32_t>());
       }
 #endif
-#ifdef PADDLE_WITH_MLU
-      else if (platform::is_mlu_place(tensor->place())) {  // NOLINT
-        phi::DenseTensor temp;
-        paddle::framework::TensorCopySync(
-            *tensor, platform::CPUPlace(), &temp);
-        vec_epxand_shape.push_back(*temp.data<int32_t>());
-      }
-#endif
       else {  // NOLINT
         vec_epxand_shape.push_back(*tensor->data<int32_t>());
paddle/fluid/operators/math/CMakeLists.txt
@@ -6,11 +6,7 @@ if(WITH_XPU)
 endif()
 # please add new math_library in alphabetical order
-if(WITH_MLU)
-  math_library(concat_and_split DEPS concat_and_split_functor mlu_baseop)
-else()
-  math_library(concat_and_split DEPS concat_and_split_functor)
-endif()
+math_library(concat_and_split DEPS concat_and_split_functor)
 math_library(context_project DEPS im2col math_function)
 math_library(cos_sim_functor)
 math_library(depthwise_conv)
paddle/fluid/operators/math/concat_and_split.cc
@@ -17,9 +17,6 @@ limitations under the License. */
 #include "paddle/phi/kernels/funcs/concat_and_split_functor.h"
-#ifdef PADDLE_WITH_MLU
-#include "paddle/fluid/operators/mlu/mlu_baseop.h"
-#endif
 #include "paddle/phi/common/bfloat16.h"
 #include "paddle/phi/common/float16.h"
@@ -181,100 +178,6 @@ class SplitFunctor<platform::XPUDeviceContext, T> {
 };
 #endif
-#ifdef PADDLE_WITH_MLU
-template <typename T>
-class ConcatFunctor<platform::MLUDeviceContext, T> {
- public:
-  void operator()(const platform::MLUDeviceContext& context,
-                  const std::vector<phi::DenseTensor>& input,
-                  int axis,
-                  phi::DenseTensor* output) {
-    int dev_id = context.GetPlace().GetDeviceId();
-    platform::MLUDeviceGuard guard(dev_id);
-    auto ins_size = input.size();
-    const int axis_t = axis;
-    const int ins_size_t = ins_size;
-    // mlu should do sth
-    // init ins tensors
-    std::vector<const void*> inputs;
-    std::vector<MLUCnnlTensorDesc> input_descs;
-    std::vector<cnnlTensorDescriptor_t> desc_vector;
-    for (size_t i = 0; i < ins_size; i++) {
-      input_descs.emplace_back(MLUCnnlTensorDesc(
-          input[i], CNNL_LAYOUT_ARRAY, ToCnnlDataType(input[i].dtype())));
-      desc_vector.push_back(input_descs.back().get());
-      inputs.push_back(input[i].data());
-    }
-    // init out tensors
-    MLUCnnlTensorDesc output_desc(
-        *output, CNNL_LAYOUT_ARRAY, ToCnnlDataType(output->dtype()));
-    // MLU should do sth
-    MLUCnnl::Concat(context, ins_size_t, axis_t, desc_vector.data(),
-                    inputs.data(), output_desc.get(), GetBasePtr(output));
-  }
-};
-
-template <typename T>
-class SplitFunctor<platform::MLUDeviceContext, T> {
- public:
-  void operator()(const platform::MLUDeviceContext& context,
-                  const phi::DenseTensor& input,
-                  const std::vector<const phi::DenseTensor*>& ref_inputs,
-                  const int axis,
-                  std::vector<phi::DenseTensor*>* outputs) {
-    if (input.numel() == 0) {
-      return;
-    }
-    int dev_id = context.GetPlace().GetDeviceId();
-    platform::MLUDeviceGuard guard(dev_id);
-    auto in_dims = input.dims();
-    auto out_size = outputs->size();
-    std::vector<framework::DDim> outs_dims(out_size, in_dims);
-    for (size_t i = 0; i < out_size; ++i) {
-      outs_dims[i][axis] = ref_inputs[i]->dims()[axis];
-    }
-    // init out tensors
-    std::vector<void*> vct_tensor;
-    std::vector<MLUCnnlTensorDesc> output_descs;
-    std::vector<cnnlTensorDescriptor_t> desc_vector;
-    for (size_t i = 0; i < out_size; i++) {
-      (*outputs)[i]->Resize(outs_dims[i]);
-      output_descs.emplace_back(
-          MLUCnnlTensorDesc(*(*outputs)[i], CNNL_LAYOUT_ARRAY,
                            ToCnnlDataType((*outputs)[i]->dtype())));
-      desc_vector.push_back(output_descs.back().get());
-      vct_tensor.push_back(GetBasePtr((*outputs)[i]));
-    }
-    // init in tensors
-    MLUCnnlTensorDesc input_desc(
-        input, CNNL_LAYOUT_ARRAY, ToCnnlDataType(input.dtype()));
-    // MLU should do sth
-    MLUCnnl::Split(context, out_size, axis, input_desc.get(), input.data(),
-                   desc_vector.data(), vct_tensor.data());
-  }
-};
-#endif
 #define DEFINE_FUNCTOR(type)                            \
   template class ConcatFunctor<phi::CPUContext, type>;  \
   template class SplitFunctor<phi::CPUContext, type>;
@@ -289,20 +192,6 @@ FOR_ALL_TYPES(DEFINE_FUNCTOR);
 DEFINE_XPU_FUNCTOR(float)
 DEFINE_XPU_FUNCTOR(platform::float16)
 #endif
-#ifdef PADDLE_WITH_MLU
-#define DEFINE_MLU_FUNCTOR(type)                                   \
-  template class ConcatFunctor<platform::MLUDeviceContext, type>;  \
-  template class SplitFunctor<platform::MLUDeviceContext, type>;
-DEFINE_MLU_FUNCTOR(float)
-DEFINE_MLU_FUNCTOR(platform::float16)
-DEFINE_MLU_FUNCTOR(int64_t)
-DEFINE_MLU_FUNCTOR(bool)
-DEFINE_MLU_FUNCTOR(int)
-DEFINE_MLU_FUNCTOR(int8_t)
-DEFINE_MLU_FUNCTOR(int16_t)
-DEFINE_MLU_FUNCTOR(uint8_t)
-#endif
 }  // namespace math
 }  // namespace operators
 }  // namespace paddle
paddle/fluid/operators/metrics/accuracy_op_mlu.cc (deleted, 100644 → 0)
/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/tensor.h"
#include "paddle/fluid/operators/mlu/mlu_baseop.h"
namespace paddle {
namespace operators {

template <typename T>
class AccuracyMLUKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& ctx) const override {
    auto* indices = ctx.Input<phi::DenseTensor>("Indices");
    auto* label = ctx.Input<phi::DenseTensor>("Label");

    auto* accuracy = ctx.Output<phi::DenseTensor>("Accuracy");
    auto* correct = ctx.Output<phi::DenseTensor>("Correct");
    auto* total = ctx.Output<phi::DenseTensor>("Total");

    int num_samples = indices->dims()[0];
    if (num_samples == 0) {
      return;
    }

    // cast `indices` or `label` if their type is not INT32
    phi::DenseTensor indices_int32(framework::TransToPhiDataType(VT::INT32));
    phi::DenseTensor label_int32(framework::TransToPhiDataType(VT::INT32));
    auto indices_type = framework::TransToProtoVarType(indices->type());
    if (indices_type != VT::INT32) {
      PADDLE_ENFORCE_EQ(MLUSupportsCast(indices_type, VT::INT32), true,
                        platform::errors::Unimplemented(
                            "In accuracy mlu kernel, cast indices from [%s] to "
                            "[%s] is not supported.",
                            framework::DataTypeToString(indices_type),
                            framework::DataTypeToString(VT::INT32)));
      indices_int32.Resize(indices->dims());
      indices_int32.mutable_data<int>(ctx.GetPlace());
      MLUCnnlTensorDesc org_indices_desc(*indices);
      MLUCnnlTensorDesc indices_int32_desc(indices_int32);
      cnnlCastDataType_t cast_type = GetCastDataType(indices_type, VT::INT32);
      MLUCnnl::Cast(ctx, cast_type, org_indices_desc.get(),
                    GetBasePtr(indices), indices_int32_desc.get(),
                    GetBasePtr(&indices_int32));
    } else {
      indices_int32.ShareDataWith(*indices);
    }
    auto label_type = framework::TransToProtoVarType(label->type());
    if (label_type != VT::INT32) {
      PADDLE_ENFORCE_EQ(MLUSupportsCast(label_type, VT::INT32), true,
                        platform::errors::Unimplemented(
                            "In accuracy mlu kernel, cast label from [%s] to [%s] "
                            "is not supported.",
                            framework::DataTypeToString(label_type),
                            framework::DataTypeToString(VT::INT32)));
      label_int32.Resize(label->dims());
      label_int32.mutable_data<int>(ctx.GetPlace());
      MLUCnnlTensorDesc org_label_desc(*label);
      MLUCnnlTensorDesc label_int32_desc(label_int32);
      cnnlCastDataType_t cast_type = GetCastDataType(label_type, VT::INT32);
      MLUCnnl::Cast(ctx, cast_type, org_label_desc.get(), GetBasePtr(label),
                    label_int32_desc.get(), GetBasePtr(&label_int32));
    } else {
      label_int32.ShareDataWith(*label);
    }

    // equal
    MLUCnnlTensorDesc indices_int32_desc(indices_int32);
    MLUCnnlTensorDesc label_int32_desc(label_int32);
    phi::DenseTensor equal_tensor(framework::TransToPhiDataType(VT::BOOL));
    equal_tensor.Resize(indices->dims());
    equal_tensor.mutable_data<bool>(ctx.GetPlace());
    MLUCnnlTensorDesc equal_tensor_desc(equal_tensor);
    MLUCnnl::Logic(ctx, CNNL_LOGIC_OP_EQ, indices_int32_desc.get(),
                   GetBasePtr(&indices_int32), label_int32_desc.get(),
                   GetBasePtr(&label_int32), equal_tensor_desc.get(),
                   GetBasePtr(&equal_tensor));

    // cast equal
    phi::DenseTensor equal_fp32(framework::TransToPhiDataType(VT::FP32));
    equal_fp32.Resize(indices->dims());
    equal_fp32.mutable_data<float>(ctx.GetPlace());
    MLUCnnlTensorDesc equal_fp32_desc(equal_fp32);
    cnnlCastDataType_t equal_cast_type = GetCastDataType(VT::BOOL, VT::FP32);
    MLUCnnl::Cast(ctx, equal_cast_type, equal_tensor_desc.get(),
                  GetBasePtr(&equal_tensor), equal_fp32_desc.get(),
                  GetBasePtr(&equal_fp32));

    // [correct]
    // reduce_max
    phi::DenseTensor correct_max(framework::TransToPhiDataType(VT::FP32));
    correct_max.Resize(phi::make_ddim({num_samples}));
    correct_max.mutable_data<float>(ctx.GetPlace());
    MLUCnnlTensorDesc correct_max_desc(correct_max);
    MLUCnnlReduceDesc reduce_max_desc({1}, CNNL_REDUCE_MAX,
                                      ToCnnlDataType<float>(),
                                      CNNL_NOT_PROPAGATE_NAN,
                                      CNNL_REDUCE_NO_INDICES,
                                      CNNL_32BIT_INDICES);
    MLUCnnl::Reduce(ctx, true /*need_workspace*/, reduce_max_desc.get(),
                    nullptr, equal_fp32_desc.get(), GetBasePtr(&equal_fp32),
                    0 /*indices_size*/, nullptr, nullptr,
                    correct_max_desc.get(), GetBasePtr(&correct_max));

    // reduce_sum
    phi::DenseTensor correct_sum(framework::TransToPhiDataType(VT::FP32));
    correct_sum.Resize(correct->dims());
    correct_sum.mutable_data<float>(ctx.GetPlace());
    MLUCnnlTensorDesc correct_sum_desc(correct_sum);
    MLUCnnlReduceDesc reduce_sum_desc({0}, CNNL_REDUCE_ADD,
                                      ToCnnlDataType<float>(),
                                      CNNL_NOT_PROPAGATE_NAN,
                                      CNNL_REDUCE_NO_INDICES,
                                      CNNL_32BIT_INDICES);
    MLUCnnl::Reduce(ctx, true /*need_workspace*/, reduce_sum_desc.get(),
                    nullptr, correct_max_desc.get(), GetBasePtr(&correct_max),
                    0 /*indices_size*/, nullptr, nullptr,
                    correct_sum_desc.get(), GetBasePtr(&correct_sum));

    // cast to int
    correct->mutable_data<int>(ctx.GetPlace());
    MLUCnnlTensorDesc correct_desc(*correct);
    cnnlCastDataType_t correct_cast_type = GetCastDataType(VT::FP32, VT::INT32);
    MLUCnnl::Cast(ctx, correct_cast_type, correct_sum_desc.get(),
                  GetBasePtr(&correct_sum), correct_desc.get(),
                  GetBasePtr(correct));

    // [total]
    total->mutable_data<int>(ctx.GetPlace());
    MLUCnnlTensorDesc total_desc(*total);
    MLUCnnl::Fill(ctx, CNNL_POINTER_MODE_HOST, &num_samples, total_desc.get(),
                  GetBasePtr(total));

    // use `total` of type `float32` for calculating accuracy
    phi::DenseTensor total_fp32(framework::TransToPhiDataType(VT::FP32));
    total_fp32.Resize(total->dims());
    total_fp32.mutable_data<float>(ctx.GetPlace());
    MLUCnnlTensorDesc total_fp32_desc(total_fp32);
    float num_samples_fp32 = static_cast<float>(num_samples);
    MLUCnnl::Fill(ctx, CNNL_POINTER_MODE_HOST, &num_samples_fp32,
                  total_fp32_desc.get(), GetBasePtr(&total_fp32));

    // [accuracy]
    accuracy->mutable_data<float>(ctx.GetPlace());
    MLUCnnlTensorDesc accuracy_desc(*accuracy);
    MLUCnnl::Div(ctx, CNNL_COMPUTATION_HIGH_PRECISION, correct_sum_desc.get(),
                 GetBasePtr(&correct_sum), total_fp32_desc.get(),
                 GetBasePtr(&total_fp32), accuracy_desc.get(),
                 GetBasePtr(accuracy));
  }
};

}  // namespace operators
}  // namespace paddle

namespace ops = paddle::operators;
REGISTER_OP_MLU_KERNEL(accuracy,
                       ops::AccuracyMLUKernel<float>,
                       ops::AccuracyMLUKernel<paddle::platform::float16>,
                       ops::AccuracyMLUKernel<int16_t>,
                       ops::AccuracyMLUKernel<int64_t>,
                       ops::AccuracyMLUKernel<uint8_t>,
                       ops::AccuracyMLUKernel<int>);
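The reduce_max followed by reduce_sum in this kernel computes top-k accuracy: a sample counts as correct if any of its k predicted indices matches the label. In formula form (my notation, not from the file), for N samples:

\[
\text{correct}=\sum_{i=1}^{N}\max_{1\le j\le k}\mathbf{1}\,[\text{indices}_{i,j}=\text{label}_i],\qquad
\text{accuracy}=\frac{\text{correct}}{N}.
\]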
paddle/fluid/operators/mlu/CMakeLists.txt (deleted, 100644 → 0)
if(WITH_MLU)
  cc_library(
    mlu_baseop
    SRCS mlu_baseop.cc
    DEPS neuware_lib device_context)
  cc_test(
    activation_op_mlu_test
    SRCS activation_op_mlu_test.cc
    DEPS op_registry activation_op scope device_context executor)
endif()
paddle/fluid/operators/mlu/activation_op_mlu_test.cc (deleted, 100644 → 0)
/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <gtest/gtest.h>
#include "paddle/fluid/operators/activation_op.h"
#include "paddle/fluid/platform/device/mlu/device_context.h"
#include "paddle/fluid/platform/place.h"
#include "paddle/phi/kernels/funcs/math_function.h"
namespace fw = paddle::framework;
namespace plat = paddle::platform;

USE_OP_ITSELF(relu);
USE_OP_DEVICE_KERNEL(relu, MLU);

// relu
template <typename T>
inline T relu(T x) {
  return x > 0 ? x : 0.;
}

template <typename T>
inline T relu_grad_dx(T x, T out, T dout) {
  return out > 0 ? dout : 0;
}

template <typename T>
void Compare(fw::Scope* scope, const plat::DeviceContext& ctx,
             std::string op_type) {
  // init
  auto x = scope->Var("X");
  auto tensor_x = x->GetMutable<phi::DenseTensor>();

  const int num = 10;
  std::vector<T> init_x;
  for (int64_t i = 0; i < num * num; ++i) {
    init_x.push_back(static_cast<T>(i - 50));
  }

  paddle::framework::TensorFromVector(init_x, ctx, tensor_x);
  tensor_x->Resize({num, num});

  auto place = ctx.GetPlace();
  auto out = scope->Var("Out");
  auto tensor_out = out->GetMutable<phi::DenseTensor>();

  fw::AttributeMap attrs;
  auto op = fw::OpRegistry::CreateOp(op_type, {{"X", {"X"}}},
                                     {{"Out", {"Out"}}}, attrs);

  op->Run(*scope, place);
  ctx.Wait();

  // eval time
  struct timeval start, end;
  gettimeofday(&start, NULL);

  for (int i = 0; i < 100; i++) {
    op->Run(*scope, place);
  }

  ctx.Wait();
  gettimeofday(&end, NULL);
  int micros =
      (((end.tv_sec - start.tv_sec) * 1000000) + end.tv_usec) - (start.tv_usec);
  printf("used time: %d\n", micros / 100);

  // eval value
  std::vector<T> out_vec;
  paddle::framework::TensorToVector(*tensor_out, ctx, &out_vec);
  ctx.Wait();

  for (uint32_t i = 0; i < out_vec.size(); i++) {
    EXPECT_FLOAT_EQ(out_vec[i], relu<T>(init_x[i]));
  }
}

template <typename T>
void CompareGrad(fw::Scope* scope, const plat::DeviceContext& ctx,
                 std::string op_type) {
  auto dout = scope->Var("DOut");
  auto tensor_dout = dout->GetMutable<phi::DenseTensor>();
  auto out = scope->Var("Out");
  auto tensor_out = out->GetMutable<phi::DenseTensor>();

  const int num = 10;
  std::vector<T> init_dout;
  for (int64_t i = 0; i < num * num; ++i) {
    init_dout.push_back(static_cast<T>(1.0));
  }

  std::vector<T> init_out;
  for (int64_t i = 0; i < num * num; ++i) {
    init_out.push_back(static_cast<T>(i - 50));
  }

  paddle::framework::TensorFromVector(init_dout, ctx, tensor_dout);
  tensor_dout->Resize({num, num});
  paddle::framework::TensorFromVector(init_out, ctx, tensor_out);
  tensor_out->Resize({num, num});

  auto dx = scope->Var("DX");
  auto tensor_dx = dx->GetMutable<phi::DenseTensor>();

  // run
  auto place = ctx.GetPlace();
  fw::AttributeMap attrs;
  auto op = fw::OpRegistry::CreateOp(op_type,
                                     {{"Out@GRAD", {"DOut"}}, {"Out", {"Out"}}},
                                     {{"X@GRAD", {"DX"}}}, attrs);

  op->Run(*scope, place);
  ctx.Wait();

  // eval time
  struct timeval start, end;
  gettimeofday(&start, NULL);

  for (int i = 0; i < 100; i++) {
    op->Run(*scope, place);
  }

  ctx.Wait();
  gettimeofday(&end, NULL);

  int micros =
      (((end.tv_sec - start.tv_sec) * 1000000) + end.tv_usec) - (start.tv_usec);
  printf("used time: %d\n", micros / 100);

  // eval value
  std::vector<T> dx_vec;
  paddle::framework::TensorToVector(*tensor_dx, ctx, &dx_vec);
  ctx.Wait();

  for (uint32_t i = 0; i < dx_vec.size(); i++) {
    EXPECT_FLOAT_EQ(dx_vec[i],
                    relu_grad_dx<T>(dx_vec[i], init_out[i], init_dout[i]));
  }
}

TEST(relu, MLU_fp32) {
  fw::Scope scope;
  auto* ctx = plat::DeviceContextPool::Instance().Get(plat::MLUPlace(0));
  Compare<float>(&scope, *ctx, "relu");
}

TEST(relu_grad, MLU_fp32) {
  fw::Scope scope;
  auto* ctx = plat::DeviceContextPool::Instance().Get(plat::MLUPlace(0));
  CompareGrad<float>(&scope, *ctx, "relu_grad");
}
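The host references checked by these tests are the usual ReLU and its gradient (stated here for completeness, not taken from the file):

\[
\operatorname{relu}(x)=\max(x,0),\qquad
\frac{\partial L}{\partial x}=\frac{\partial L}{\partial\,\text{out}}\cdot\mathbf{1}\,[\text{out}>0],
\]

which is what relu() and relu_grad_dx() return and what the EXPECT_FLOAT_EQ loops compare against.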
paddle/fluid/operators/mlu/mlu_baseop.cc (deleted, 100644 → 0; source diff not shown because the file is too large, view the blob instead)
paddle/fluid/operators/mlu/mlu_baseop.h (deleted, 100644 → 0)
/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <cn_api.h>
#include <cnnl.h>
#include <concurrentqueue.h>
#include <mlu_op.h>
#include <string>
#include <vector>
#include "paddle/fluid/framework/operator.h"
#include "paddle/fluid/framework/tensor_util.h"
#include "paddle/fluid/framework/type_defs.h"
#include "paddle/fluid/platform/device/mlu/enforce.h"
namespace
paddle
{
namespace
operators
{
using
DataLayout
=
phi
::
DataLayout
;
using
ExecutionContext
=
framework
::
ExecutionContext
;
using
DeviceContextPool
=
platform
::
DeviceContextPool
;
using
MLUDeviceContext
=
platform
::
MLUDeviceContext
;
const
std
::
map
<
std
::
string
,
cnnlReduceOp_t
>
MLUReduceOpMap
=
{
{
"reduce_all"
,
CNNL_REDUCE_AND
},
{
"reduce_any"
,
CNNL_REDUCE_OR
},
{
"reduce_max"
,
CNNL_REDUCE_MAX
},
{
"reduce_mean"
,
CNNL_REDUCE_AVG
},
{
"reduce_min"
,
CNNL_REDUCE_MIN
},
{
"reduce_sum"
,
CNNL_REDUCE_ADD
},
{
"reduce_prod"
,
CNNL_REDUCE_MUL
},
};
const
std
::
map
<
std
::
string
,
cnnlInterpMode_t
>
MLUInterpModeMap
=
{
{
"bilinear"
,
CNNL_INTERP_BILINEAR
},
{
"nearest"
,
CNNL_INTERP_NEAREST
},
{
"linear"
,
CNNL_INTERP_LINEAR
},
{
"trilinear"
,
CNNL_INTERP_TRILINEAR
},
{
"bicubic"
,
CNNL_INTERP_BICUBIC
}};
const
std
::
map
<
std
::
string
,
cnnlInterpBackwardMode_t
>
MLUInterpBackwardModeMap
=
{{
"bilinear"
,
CNNL_INTERP_BACKWARD_BILINEAR
},
{
"nearest"
,
CNNL_INTERP_BACKWARD_NEAREST
},
{
"linear"
,
CNNL_INTERP_BACKWARD_LINEAR
},
{
"trilinear"
,
CNNL_INTERP_BACKWARD_TRILINEAR
},
{
"bicubic"
,
CNNL_INTERP_BACKWARD_BICUBIC
}};
inline
cnnlReduceOp_t
GetMLUCnnlReduceOp
(
const
std
::
string
reduce_name
)
{
auto
iter
=
MLUReduceOpMap
.
find
(
reduce_name
);
if
(
iter
!=
MLUReduceOpMap
.
end
())
{
return
iter
->
second
;
}
PADDLE_THROW
(
platform
::
errors
::
InvalidArgument
(
"Not support reduce op type of MLU Device: %s"
,
reduce_name
));
}
inline
cnnlInterpMode_t
GetMLUCnnlInterpMode
(
const
std
::
string
interp_mode
)
{
auto
iter
=
MLUInterpModeMap
.
find
(
interp_mode
);
if
(
iter
!=
MLUInterpModeMap
.
end
())
{
return
iter
->
second
;
}
PADDLE_THROW
(
platform
::
errors
::
InvalidArgument
(
"Not support interp mode of MLU Device: %s"
,
interp_mode
));
}
inline
cnnlInterpBackwardMode_t
GetMLUCnnlInterpBackwardMode
(
const
std
::
string
interp_mode
)
{
auto
iter
=
MLUInterpBackwardModeMap
.
find
(
interp_mode
);
if
(
iter
!=
MLUInterpBackwardModeMap
.
end
())
{
return
iter
->
second
;
}
PADDLE_THROW
(
platform
::
errors
::
InvalidArgument
(
"Not support interp mode of MLU Device: %s"
,
interp_mode
));
}
inline
const
void
*
GetBasePtr
(
const
phi
::
DenseTensor
*
t
)
{
return
t
->
data
();
}
inline
void
*
GetBasePtr
(
phi
::
DenseTensor
*
t
)
{
return
t
->
data
();
}
inline
cnnlDataType_t
ToCnnlDataType
(
const
phi
::
DataType
&
dtype
)
{
cnnlDataType_t
type
=
CNNL_DTYPE_FLOAT
;
switch
(
dtype
)
{
case
DataType
::
FLOAT16
:
type
=
CNNL_DTYPE_HALF
;
break
;
case
DataType
::
FLOAT32
:
type
=
CNNL_DTYPE_FLOAT
;
break
;
case
DataType
::
FLOAT64
:
type
=
CNNL_DTYPE_DOUBLE
;
break
;
case
DataType
::
INT8
:
type
=
CNNL_DTYPE_INT8
;
break
;
case
DataType
::
INT16
:
type
=
CNNL_DTYPE_INT16
;
break
;
case
DataType
::
INT32
:
type
=
CNNL_DTYPE_INT32
;
break
;
case
DataType
::
INT64
:
type
=
CNNL_DTYPE_INT64
;
break
;
case
DataType
::
BOOL
:
type
=
CNNL_DTYPE_BOOL
;
break
;
case
DataType
::
UINT8
:
type
=
CNNL_DTYPE_UINT8
;
break
;
default:
break
;
}
return
type
;
}
inline
cnnlDataType_t
ToCnnlDataType
(
const
paddle
::
framework
::
proto
::
VarType
::
Type
&
type
)
{
return
ToCnnlDataType
(
framework
::
TransToPhiDataType
(
type
));
}
template
<
typename
T
>
inline
cnnlDataType_t
ToCnnlDataType
()
{
auto
type
=
framework
::
ToDataType
(
std
::
type_index
(
typeid
(
T
)));
return
ToCnnlDataType
(
type
);
}
inline
mluOpDataType_t
ToMluOpDataType
(
const
phi
::
DataType
&
dtype
)
{
mluOpDataType_t
type
=
MLUOP_DTYPE_FLOAT
;
switch
(
dtype
)
{
case
DataType
::
FLOAT16
:
type
=
MLUOP_DTYPE_HALF
;
break
;
case
DataType
::
FLOAT32
:
type
=
MLUOP_DTYPE_FLOAT
;
break
;
case
DataType
::
FLOAT64
:
type
=
MLUOP_DTYPE_DOUBLE
;
break
;
case
DataType
::
INT8
:
type
=
MLUOP_DTYPE_INT8
;
break
;
case
DataType
::
INT16
:
type
=
MLUOP_DTYPE_INT16
;
break
;
case
DataType
::
INT32
:
type
=
MLUOP_DTYPE_INT32
;
break
;
case
DataType
::
INT64
:
type
=
MLUOP_DTYPE_INT64
;
break
;
case
DataType
::
BOOL
:
type
=
MLUOP_DTYPE_BOOL
;
break
;
case
DataType
::
UINT8
:
type
=
MLUOP_DTYPE_UINT8
;
break
;
default:
break
;
}
return
type
;
}
inline
mluOpDataType_t
ToMluOpDataType
(
const
paddle
::
framework
::
proto
::
VarType
::
Type
&
type
)
{
return
ToMluOpDataType
(
framework
::
TransToPhiDataType
(
type
));
}
template
<
typename
T
>
inline
mluOpDataType_t
ToMluOpDataType
()
{
auto
type
=
framework
::
ToDataType
(
std
::
type_index
(
typeid
(
T
)));
return
ToMluOpDataType
(
type
);
}
// Converts (via narrowing) a type T value to a type U, and checks that the
// value has no value change due to the conversion.
template
<
typename
WideT
,
typename
NarrowT
>
NarrowT
CheckedNarrowing
(
const
WideT
&
wide
)
{
NarrowT
narrow
=
wide
;
CHECK_EQ
(
narrow
,
wide
)
<<
"checked narrowing failed; values not equal post-conversion"
;
return
narrow
;
}
inline
static
cnnlHandle_t
GetHandleFromCTX
(
const
ExecutionContext
&
ctx
)
{
return
ctx
.
template
device_context
<
MLUDeviceContext
>().
cnnl_handle
();
}
inline
static
mluOpHandle_t
GetMLUOpHandleFromCTX
(
const
ExecutionContext
&
ctx
)
{
return
ctx
.
template
device_context
<
MLUDeviceContext
>().
mluOp_handle
();
}
inline
static
const
MLUDeviceContext
&
GetDevCtxFromCTX
(
const
ExecutionContext
&
ctx
)
{
return
ctx
.
template
device_context
<
MLUDeviceContext
>();
}
using
VT
=
framework
::
proto
::
VarType
;
const
std
::
map
<
std
::
pair
<
VT
::
Type
,
VT
::
Type
>
,
cnnlCastDataType_t
>
MLU_SUPPORTED_CAST_TYPE
=
{
{{
VT
::
FP32
,
/*cast to*/
VT
::
FP16
},
CNNL_CAST_FLOAT_TO_HALF
},
{{
VT
::
FP32
,
/*cast to*/
VT
::
INT32
},
CNNL_CAST_FLOAT_TO_INT32
},
{{
VT
::
FP32
,
/*cast to*/
VT
::
INT16
},
CNNL_CAST_FLOAT_TO_INT16
},
{{
VT
::
FP32
,
/*cast to*/
VT
::
INT8
},
CNNL_CAST_FLOAT_TO_INT8
},
{{
VT
::
FP32
,
/*cast to*/
VT
::
UINT8
},
CNNL_CAST_FLOAT_TO_UINT8
},
{{
VT
::
FP32
,
/*cast to*/
VT
::
BOOL
},
CNNL_CAST_FLOAT_TO_BOOL
},
{{
VT
::
FP16
,
/*cast to*/
VT
::
FP32
},
CNNL_CAST_HALF_TO_FLOAT
},
{{
VT
::
FP16
,
/*cast to*/
VT
::
INT32
},
CNNL_CAST_HALF_TO_INT32
},
{{
VT
::
FP16
,
/*cast to*/
VT
::
INT16
},
CNNL_CAST_HALF_TO_INT16
},
{{
VT
::
FP16
,
/*cast to*/
VT
::
INT8
},
CNNL_CAST_HALF_TO_INT8
},
{{
VT
::
FP16
,
/*cast to*/
VT
::
UINT8
},
CNNL_CAST_HALF_TO_UINT8
},
{{
VT
::
FP16
,
/*cast to*/
VT
::
BOOL
},
CNNL_CAST_HALF_TO_BOOL
},
{{
VT
::
INT32
,
/*cast to*/
VT
::
FP32
},
CNNL_CAST_INT32_TO_FLOAT
},
{{
VT
::
INT32
,
/*cast to*/
VT
::
FP16
},
CNNL_CAST_INT32_TO_HALF
},
{{
VT
::
INT32
,
/*cast to*/
VT
::
INT8
},
CNNL_CAST_INT32_TO_INT8
},
{{
VT
::
INT32
,
/*cast to*/
VT
::
INT16
},
CNNL_CAST_INT32_TO_INT16
},
{{
VT
::
INT16
,
/*cast to*/
VT
::
FP32
},
CNNL_CAST_INT16_TO_FLOAT
},
{{
VT
::
INT16
,
/*cast to*/
VT
::
FP16
},
CNNL_CAST_INT16_TO_HALF
},
{{
VT
::
INT16
,
/*cast to*/
VT
::
INT32
},
CNNL_CAST_INT16_TO_INT32
},
{{
VT
::
INT8
,
/*cast to*/
VT
::
FP32
},
CNNL_CAST_INT8_TO_FLOAT
},
{{
VT
::
INT8
,
/*cast to*/
VT
::
FP16
},
CNNL_CAST_INT8_TO_HALF
},
{{
VT
::
INT8
,
/*cast to*/
VT
::
INT32
},
CNNL_CAST_INT8_TO_INT32
},
{{
VT
::
UINT8
,
/*cast to*/
VT
::
FP32
},
CNNL_CAST_UINT8_TO_FLOAT
},
{{
VT
::
UINT8
,
/*cast to*/
VT
::
FP16
},
CNNL_CAST_UINT8_TO_HALF
},
{{
VT
::
BOOL
,
/*cast to*/
VT
::
FP32
},
CNNL_CAST_BOOL_TO_FLOAT
},
{{
VT
::
BOOL
,
/*cast to*/
VT
::
FP16
},
CNNL_CAST_BOOL_TO_HALF
},
{{
VT
::
BOOL
,
/*cast to*/
VT
::
INT32
},
CNNL_CAST_BOOL_TO_INT32
},
{{
VT
::
UINT8
,
/*cast to*/
VT
::
INT32
},
CNNL_CAST_UINT8_TO_INT32
},
{{
VT
::
INT32
,
/*cast to*/
VT
::
INT64
},
CNNL_CAST_INT32_TO_INT64
},
{{
VT
::
INT64
,
/*cast to*/
VT
::
INT32
},
CNNL_CAST_INT64_TO_INT32
},
{{
VT
::
INT32
,
/*cast to*/
VT
::
BOOL
},
CNNL_CAST_INT32_TO_BOOL
},
{{
VT
::
UINT8
,
/*cast to*/
VT
::
INT64
},
CNNL_CAST_UINT8_TO_INT64
},
{{
VT
::
INT8
,
/*cast to*/
VT
::
INT16
},
CNNL_CAST_INT8_TO_INT16
},
{{
VT
::
FP32
,
/*cast to*/
VT
::
FP64
},
CNNL_CAST_FLOAT_TO_DOUBLE
},
{{
VT
::
FP64
,
/*cast to*/
VT
::
FP32
},
CNNL_CAST_DOUBLE_TO_FLOAT
},
{{
VT
::
INT64
,
/*cast to*/
VT
::
FP32
},
CNNL_CAST_INT64_TO_FLOAT
},
{{
VT
::
INT64
,
/*cast to*/
VT
::
FP16
},
CNNL_CAST_INT64_TO_HALF
},
{{
VT
::
FP32
,
/*cast to*/
VT
::
INT64
},
CNNL_CAST_FLOAT_TO_INT64
},
{{
VT
::
FP16
,
/*cast to*/
VT
::
INT64
},
CNNL_CAST_HALF_TO_INT64
},
};
cnnlCastDataType_t
GetCastDataType
(
const
VT
::
Type
&
src_type
,
const
VT
::
Type
&
dst_type
);
cnnlCastDataType_t
GetCastDataType
(
const
DataType
&
src_type
,
const
DataType
&
dst_type
);
bool
MLUSupportsCast
(
const
VT
::
Type
&
src_type
,
const
VT
::
Type
&
dst_type
);
cnnlDeviceType_t
GetCnnlDev
(
int
dev_ordinal
);
using
CnnlTensorDesc
=
cnnlTensorDescriptor_t
;
class
MLUCnnlTensorDesc
{
public:
MLUCnnlTensorDesc
()
{}
// SE_DISALLOW_COPY_AND_ASSIGN
MLUCnnlTensorDesc
(
const
MLUCnnlTensorDesc
&
desc
)
=
delete
;
MLUCnnlTensorDesc
&
operator
=
(
const
MLUCnnlTensorDesc
&
)
=
delete
;
MLUCnnlTensorDesc
(
MLUCnnlTensorDesc
&&
rhs
)
:
raw_tensor_desc
(
rhs
.
raw_tensor_desc
)
{
rhs
.
raw_tensor_desc
=
nullptr
;
}
MLUCnnlTensorDesc
&
operator
=
(
MLUCnnlTensorDesc
&&
rhs
);
MLUCnnlTensorDesc
(
const
int
tensor_dim
,
const
int
dim_sizes
[],
const
cnnlDataType_t
tensor_dtype
);
MLUCnnlTensorDesc
(
const
int
tensor_dim
,
const
int
dim_sizes
[],
const
cnnlDataType_t
tensor_dtype
,
const
cnnlTensorLayout_t
layout
);
MLUCnnlTensorDesc
(
const
int
tensor_dim
,
const
int
dim_sizes
[],
const
cnnlDataType_t
tensor_dtype
,
int
position
);
MLUCnnlTensorDesc
(
const
int
tensor_dim
,
const
int64_t
dim_sizes
[],
const
cnnlDataType_t
tensor_dtype
);
MLUCnnlTensorDesc
(
const
int
tensor_dim
,
const
int64_t
dim_sizes
[],
const
cnnlDataType_t
tensor_dtype
,
const
cnnlTensorLayout_t
layout
);
MLUCnnlTensorDesc
(
const
int
tensor_dim
,
const
int64_t
dim_sizes
[],
const
cnnlDataType_t
tensor_dtype
,
int
position
);
MLUCnnlTensorDesc
(
const
phi
::
DenseTensor
&
tensor
,
const
cnnlTensorLayout_t
layout
,
const
cnnlDataType_t
tensor_dtype
);
explicit
MLUCnnlTensorDesc
(
const
phi
::
DenseTensor
&
tensor
);
MLUCnnlTensorDesc
(
const
phi
::
DenseTensor
&
tensor
,
cnnlTensorLayout_t
layout
,
const
cnnlDataType_t
tensor_dtype
,
int
position
);
MLUCnnlTensorDesc
(
const
phi
::
DenseTensor
&
tensor
,
cnnlTensorLayout_t
layout
,
const
cnnlDataType_t
tensor_dtype
,
int
position
,
float
scale
);
~
MLUCnnlTensorDesc
();
const
cnnlTensorDescriptor_t
get
()
const
{
return
raw_tensor_desc
;
}
private:
cnnlTensorDescriptor_t
raw_tensor_desc
=
nullptr
;
};
class
MLUOpTensorDesc
{
public:
MLUOpTensorDesc
()
{}
// SE_DISALLOW_COPY_AND_ASSIGN
MLUOpTensorDesc
(
const
MLUOpTensorDesc
&
desc
)
=
delete
;
MLUOpTensorDesc
&
operator
=
(
const
MLUOpTensorDesc
&
)
=
delete
;
MLUOpTensorDesc
(
MLUOpTensorDesc
&&
rhs
)
:
raw_tensor_desc
(
rhs
.
raw_tensor_desc
)
{
rhs
.
raw_tensor_desc
=
nullptr
;
}
MLUOpTensorDesc
&
operator
=
(
MLUOpTensorDesc
&&
rhs
);
MLUOpTensorDesc
(
const
int
tensor_dim
,
const
int
dim_sizes
[],
const
mluOpDataType_t
tensor_dtype
);
MLUOpTensorDesc
(
const
int
tensor_dim
,
const
int
dim_sizes
[],
const
mluOpDataType_t
tensor_dtype
,
const
mluOpTensorLayout_t
layout
);
MLUOpTensorDesc
(
const
int
tensor_dim
,
const
int
dim_sizes
[],
const
mluOpDataType_t
tensor_dtype
,
int
position
);
MLUOpTensorDesc
(
const
int
tensor_dim
,
const
int64_t
dim_sizes
[],
const
mluOpDataType_t
tensor_dtype
);
MLUOpTensorDesc
(
const
int
tensor_dim
,
const
int64_t
dim_sizes
[],
const
mluOpDataType_t
tensor_dtype
,
const
mluOpTensorLayout_t
layout
);
MLUOpTensorDesc
(
const
int
tensor_dim
,
const
int64_t
dim_sizes
[],
const
mluOpDataType_t
tensor_dtype
,
int
position
);
MLUOpTensorDesc
(
const
phi
::
DenseTensor
&
tensor
,
const
mluOpTensorLayout_t
layout
,
const
mluOpDataType_t
tensor_dtype
);
explicit
MLUOpTensorDesc
(
const
phi
::
DenseTensor
&
tensor
);
MLUOpTensorDesc
(
const
phi
::
DenseTensor
&
tensor
,
mluOpTensorLayout_t
layout
,
const
mluOpDataType_t
tensor_dtype
,
int
position
);
MLUOpTensorDesc
(
const
phi
::
DenseTensor
&
tensor
,
mluOpTensorLayout_t
layout
,
const
mluOpDataType_t
tensor_dtype
,
int
position
,
float
scale
);
~
MLUOpTensorDesc
();
const
mluOpTensorDescriptor_t
get
()
const
{
return
raw_tensor_desc
;
}
private:
mluOpTensorDescriptor_t
raw_tensor_desc
=
nullptr
;
};
class MLUCnnlActivationDesc {
 public:
  MLUCnnlActivationDesc(const MLUCnnlActivationDesc& desc) = delete;
  MLUCnnlActivationDesc& operator=(const MLUCnnlActivationDesc& desc) = delete;
  MLUCnnlActivationDesc(const cnnlActivationMode_t act_mode, const float ceof);
  MLUCnnlActivationDesc(const cnnlActivationMode_t act_mode, const float ceof, const float sliced_dim, const float selu_alpha, const float selu_lambda);

  const cnnlActivationDescriptor_t get() const;
  ~MLUCnnlActivationDesc();

 private:
  cnnlActivationDescriptor_t active_desc_ = nullptr;
};

class MLUCnnlPoolingDesc {
 public:
  MLUCnnlPoolingDesc(const MLUCnnlPoolingDesc& desc) = delete;
  MLUCnnlPoolingDesc& operator=(const MLUCnnlPoolingDesc& desc) = delete;
  MLUCnnlPoolingDesc(const cnnlPoolingMode_t mode, const cnnlNanPropagation_t maxpooling_nan_opt, int window_rows, int window_cols, int64_t pad_up, int64_t pad_down, int64_t pad_left, int64_t pad_right, int row_stride, int col_stride, int row_dilation, int col_dilation, bool ceil_mode);
  MLUCnnlPoolingDesc(const cnnlPoolingMode_t mode, const cnnlNanPropagation_t maxpooling_nan_opt, const int tensor_rank, const std::vector<int>& window, const std::vector<int>& padding, const std::vector<int>& stride);

  const cnnlPoolingDescriptor_t get() const;
  ~MLUCnnlPoolingDesc();

 private:
  cnnlPoolingDescriptor_t pooling_desc_ = nullptr;
};

class MLUCnnlRandomGeneratorDesc {
 public:
  MLUCnnlRandomGeneratorDesc(const ExecutionContext& ctx, const int seed);
  const cnnlRandGenerator_t get() const;
  phi::DenseTensor& get_state();
  ~MLUCnnlRandomGeneratorDesc();

 private:
  phi::DenseTensor mlu_state;
  cnnlRandGenerator_t mlu_generator = nullptr;
};

const std::shared_ptr<MLUCnnlRandomGeneratorDesc>& GetMLURandomGenerator(const ExecutionContext& ctx, const int64_t device_id, const int seed);
class MLUCnnlReduceDesc {
 public:
  MLUCnnlReduceDesc(const MLUCnnlReduceDesc& desc) = delete;
  MLUCnnlReduceDesc& operator=(const MLUCnnlReduceDesc& desc) = delete;
  MLUCnnlReduceDesc(const std::vector<int>& axis_vec, const cnnlReduceOp_t reduce_op, const cnnlDataType_t data_type, const cnnlNanPropagation_t nan_propagation, const cnnlReduceIndices_t reduce_indices, const cnnlIndicesType_t indices_type);

  const cnnlReduceDescriptor_t get() const;
  ~MLUCnnlReduceDesc();

 private:
  cnnlReduceDescriptor_t reduction_desc_ = nullptr;
};

class MLUCnnlOpTensorDesc {
 public:
  MLUCnnlOpTensorDesc(const MLUCnnlOpTensorDesc& desc) = delete;
  void operator=(const MLUCnnlOpTensorDesc&) = delete;
  MLUCnnlOpTensorDesc(cnnlOpTensorDesc_t op_tensor_op, cnnlDataType_t op_tensor_comp_type, cnnlNanPropagation_t op_tensor_nan_opt);

  const cnnlOpTensorDescriptor_t get() const;
  ~MLUCnnlOpTensorDesc();

 private:
  cnnlOpTensorDescriptor_t op_tensor_desc_ = nullptr;
};

class MLUCnnlNMSDesc {
 public:
  MLUCnnlNMSDesc(const MLUCnnlNMSDesc& desc) = delete;
  MLUCnnlNMSDesc& operator=(const MLUCnnlNMSDesc& desc) = delete;
  MLUCnnlNMSDesc(const cnnlNmsOutputMode_t mode, const float iou_threshold, const int max_output_size, const float confidence_threshold, const int input_layout);

  const cnnlNmsDescriptor_t get() const;
  ~MLUCnnlNMSDesc();

 private:
  cnnlNmsDescriptor_t nms_desc_ = nullptr;
};

class MLUCnnlConvolutionDesc {
 public:
  MLUCnnlConvolutionDesc(const int dims, const int pad[], const int stride[], const int dilation[], const int group_count, const cnnlDataType_t tensor_dtype);
  MLUCnnlConvolutionDesc(const int dims, const int64_t pad[], const int64_t stride[], const int64_t dilation[], const int group_count, const cnnlDataType_t tensor_dtype);
  MLUCnnlConvolutionDesc(const MLUCnnlConvolutionDesc& desc) = delete;
  MLUCnnlConvolutionDesc& operator=(const MLUCnnlConvolutionDesc& desc) = delete;

  const cnnlConvolutionDescriptor_t get() const;
  ~MLUCnnlConvolutionDesc();

 private:
  cnnlConvolutionDescriptor_t conv_desc_ = nullptr;
};

class MLUCnnlBatchSpaceDesc {
 public:
  MLUCnnlBatchSpaceDesc(uint32_t block_shape[], uint32_t paddings[], const uint32_t block_shape_size, const uint32_t paddings_size);
  void getBatch2spaceNdextraInputSize(const ExecutionContext& ctx, const cnnlTensorDescriptor_t input_desc);
  void getSpace2batchNdextraInputSize(const ExecutionContext& ctx, const cnnlTensorDescriptor_t input_desc);
  void initSpace2batchNdExtraInput(const ExecutionContext& ctx, const cnnlTensorDescriptor_t input_desc, void* extra_host_input);
  void initBatch2spaceNdExtraInput(const ExecutionContext& ctx, const cnnlTensorDescriptor_t input_desc, void* extra_host_input);
  const cnnlSpaceBatchNdDescriptor_t get() const;
  size_t getExtraInputSize() const;
  ~MLUCnnlBatchSpaceDesc();

 private:
  cnnlSpaceBatchNdDescriptor_t op_desc_ = nullptr;
  size_t extra_input_size_;
};
class MLUCnnlTrigonDesc {
 public:
  explicit MLUCnnlTrigonDesc(const cnnlTrigonFunctionMode_t trigon_function_mode);

  const cnnlTrigonDescriptor_t get() const;
  ~MLUCnnlTrigonDesc();

 private:
  cnnlTrigonDescriptor_t trigon_desc_ = nullptr;
};

class MLUCnnlDCNDesc {
 public:
  MLUCnnlDCNDesc(int dimNb, const int* pad, const int* stride, const int* dilation, int deformable_group, int conv_group, int im2col_step);

  const cnnlDCNDescriptor_t get() const;
  ~MLUCnnlDCNDesc();

 private:
  cnnlDCNDescriptor_t dcn_desc_ = nullptr;
};

class MLUCnnlGridSampleDesc {
 public:
  MLUCnnlGridSampleDesc(const std::string& interp_mode_str, const std::string& padding_mode_str, bool align_corners);

  const cnnlGridSampleDescriptor_t get() const;
  ~MLUCnnlGridSampleDesc();

 private:
  cnnlGridSampleDescriptor_t grid_sample_desc_ = nullptr;
};

class MLUSeqDataDesc {
 public:
  MLUSeqDataDesc(const MLUSeqDataDesc& desc) = delete;
  MLUSeqDataDesc& operator=(const MLUSeqDataDesc& desc) = delete;
  MLUSeqDataDesc(cnnlSeqDataLayout_t layout, cnnlDataType_t dtype, int dimNb, const int dimSize[], int seqLengthArraySize, const int seqLengthArray[], void* paddingFill);

  const cnnlSeqDataDescriptor_t get() const;
  ~MLUSeqDataDesc();

 private:
  cnnlSeqDataDescriptor_t seq_data_desc_ = nullptr;
};

class MLURNNDesc {
 public:
  MLURNNDesc(const MLURNNDesc& desc) = delete;
  MLURNNDesc& operator=(const MLURNNDesc& desc) = delete;
  MLURNNDesc(const int hidden_size, const int num_layers, const cnnlRNNInputMode_t input_mode, const cnnlDirectionMode_t direction, const cnnlRNNMode_t rnn_mode);
  MLURNNDesc(cnnlRNNMode_t cell_mode, cnnlRNNBiasMode_t bias_mode, cnnlDirectionMode_t direction, cnnlRNNInputMode_t input_mode, cnnlDataType_t data_type, cnnlDataType_t math_prec, int input_size, int hidden_size, int proj_size, int layer_num, void* dropout_desc, cnnlRNNPaddingMode_t padding_mode);

  void SetRNNProjectionLayers(const int rec_proj_size, const int out_proj_size) {
    PADDLE_ENFORCE_MLU_SUCCESS(cnnlSetRNNProjectionLayers(rnn_desc_, rec_proj_size, out_proj_size));
  }

  void SetPeepholeMode(const cnnlRNNPeepholeMode_t peephole_mode) {
    PADDLE_ENFORCE_MLU_SUCCESS(cnnlSetRNNPeepholeMode(rnn_desc_, peephole_mode));
  }

  void SetRNNBiasMode(const cnnlRNNBiasMode_t bias_mode) {
    PADDLE_ENFORCE_MLU_SUCCESS(cnnlSetRNNBiasMode(rnn_desc_, bias_mode));
  }

  void SetRNNMaskMode(const cnnlRNNMaskMode_t mask_mode) {
    PADDLE_ENFORCE_MLU_SUCCESS(cnnlSetRNNMaskMode(rnn_desc_, mask_mode));
  }

  void SetRNNClip(const cnnlRNNClipMode_t clip_mode, const cnnlNanPropagation_t clip_nan_opt, const double left_clip, const double right_clip) {
    PADDLE_ENFORCE_MLU_SUCCESS(cnnlSetRNNClip(rnn_desc_, clip_mode, clip_nan_opt, left_clip, right_clip));
  }

  void SetRNNPaddingMode(const cnnlRNNPaddingMode_t padding_mode) {
    PADDLE_ENFORCE_MLU_SUCCESS(cnnlSetRNNPaddingMode(rnn_desc_, padding_mode));
  }

  const cnnlRNNDescriptor_t get() const;
  ~MLURNNDesc();

 private:
  cnnlRNNDescriptor_t rnn_desc_ = nullptr;
};
class MLUCnnl {
 public:
  static void Active(const ExecutionContext& ctx, cnnlActivationDescriptor_t active_desc, const cnnlTensorDescriptor_t input_desc, const void* input, const cnnlTensorDescriptor_t output_desc, void* output);
  static void ActiveGrad(const ExecutionContext& ctx, cnnlActivationDescriptor_t active_desc, const void* alpha, const void* beta, const cnnlTensorDescriptor_t y_desc, const void* y, const cnnlTensorDescriptor_t diff_y_desc, const void* diff_y, const cnnlTensorDescriptor_t x_desc, const void* x, const cnnlTensorDescriptor_t diff_x_desc, void* diff_x);
  static void Concat(const ExecutionContext& ctx, const int pack_num, const int axis, const cnnlTensorDescriptor_t inputs_desc[], const void* const inputs[], const cnnlTensorDescriptor_t output_desc, void* output);
  static void Concat(const MLUDeviceContext& dev_ctx, const int pack_num, const int axis, const cnnlTensorDescriptor_t inputs_desc[], const void* const inputs[], const cnnlTensorDescriptor_t output_desc, void* output);
  static void Cast(const ExecutionContext& ctx, cnnlCastDataType_t cast_type, const cnnlTensorDescriptor_t input_desc, const void* input, const cnnlTensorDescriptor_t output_desc, void* output);
  static void Clip(const ExecutionContext& ctx, const cnnlTensorDescriptor_t input_desc, const void* input, const void* min, const void* max, void* y);
  static void HardtanhBackward(const ExecutionContext& ctx, const cnnlTensorDescriptor_t x_desc, const void* x, const cnnlTensorDescriptor_t diff_y_desc, const void* diff_y, const float max_val, const float min_val, const cnnlTensorDescriptor_t diff_x_desc, void* diff_x);
  static void Div(const ExecutionContext& ctx, cnnlComputationPreference_t prefer, const cnnlTensorDescriptor_t in0_desc, const void* in0, const cnnlTensorDescriptor_t in1_desc, const void* in1, const cnnlTensorDescriptor_t output_desc, void* output);
  static void Fill(const ExecutionContext& ctx, const cnnlPointerMode_t pointer_mode, const void* value_ptr, const cnnlTensorDescriptor_t output_desc, void* output);
  static void LRN(const ExecutionContext& ctx, const int local_size, const double alpha, const double beta, const double k, const cnnlTensorDescriptor_t input_quant_desc, const void* input_quant, const cnnlTensorDescriptor_t output_desc, void* output);
  static void QuantifyOffline(const ExecutionContext& context, cnnlQuantizeMode_t mode, const cnnlTensorDescriptor_t input_desc, const void* input, const cnnlTensorDescriptor_t ouput_desc, void* output);
  static void QuantifyOnline(const ExecutionContext& context, const int bitwidth, const cnnlTensorDescriptor_t input_desc, const void* input, const bool compute_scale, void* position, void* scale, const cnnlTensorDescriptor_t ouput_desc, void* output);
  static void SGD(const ExecutionContext& context, const cnnlTensorDescriptor_t grad_desc, const void* grad, const void* lr, const cnnlTensorDescriptor_t var_desc, void* var);
  static void ApplyAdaGrad(const ExecutionContext& ctx, const cnnlTensorDescriptor_t grad_desc, const void* grad, const cnnlTensorDescriptor_t accum_desc, void* accum, const cnnlTensorDescriptor_t var_desc, void* var, const void* lr, const bool update_slots);
  static void ApplyRMSProp(const ExecutionContext& context, const cnnlTensorDescriptor_t grad_desc, const void* grad, const void* lr, const void* rho, const void* momentum, const void* epsilon, const cnnlTensorDescriptor_t var_desc, void* var, const cnnlTensorDescriptor_t ms_desc, void* ms, const cnnlTensorDescriptor_t mom_desc, void* mom);
  static void ApplyCenterRMSProp(const ExecutionContext& ctx, const cnnlTensorDescriptor_t grad_desc, const void* grad, const void* lr, const void* rho, const void* momentum, const void* epsilon, const cnnlTensorDescriptor_t var_desc, void* var, const cnnlTensorDescriptor_t mg_desc, void* mg, const cnnlTensorDescriptor_t ms_desc, void* ms, const cnnlTensorDescriptor_t mom_desc, void* mom);
  static void ApplyAdam(const ExecutionContext& ctx, const cnnlTensorDescriptor_t var_desc, void* var, const cnnlTensorDescriptor_t m_desc, void* m, const cnnlTensorDescriptor_t v_desc, void* v, const cnnlTensorDescriptor_t grad_desc, const void* grad, const void* lr, const void* beta1, const void* beta2, const void* beta1_power, const void* beta2_power, const void* epsilon, const bool use_nesterov);
  static void ApplyAdaMax(const ExecutionContext& ctx, const cnnlTensorDescriptor_t grad_desc, const cnnlTensorDescriptor_t var_desc, void* var, const cnnlTensorDescriptor_t m_desc, void* m, const cnnlTensorDescriptor_t v_desc, void* v, const void* diff, const void* lr, const void* beta1, const void* beta2, const void* beta1_power, const void* epsilon);
  static void ApplyMomentum(const ExecutionContext& ctx, const cnnlTensorDescriptor_t grad_desc, const void* grad, const bool use_nesterov, const void* lr, const void* momentum, void* var, void* accum);
  static void ApplyKerasMomentum(const ExecutionContext& ctx, const cnnlTensorDescriptor_t grad_desc, const void* grad, const bool use_nesterov, const void* lr, const void* momentum, void* var, void* accum);
  static void ApplyAdadelta(const ExecutionContext& ctx, const cnnlTensorDescriptor_t grad_desc, const void* diff, const void* lr, const void* rho, const void* epsilon, void* var, void* accum, void* accum_update);
  static void SparseSoftmaxXentWithLogits(const ExecutionContext& ctx, cnnlSoftmaxMode_t mode, const cnnlTensorDescriptor_t x_desc, const void* input, const cnnlTensorDescriptor_t label_desc, const void* label, const cnnlTensorDescriptor_t y_desc, void* output, const cnnlTensorDescriptor_t diff_y_desc, void* back_out);
  static void RandomUniform(const ExecutionContext& ctx, const int num, const cnnlDataType_t data_type, const cnnlRandGenerator_t mlu_generator, void* mlu_state, void* output);
  static void FusedDropout(const ExecutionContext& ctx, const cnnlRandGenerator_t generator, const cnnlTensorDescriptor_t input_desc, const void* input, const float p, void* state, const cnnlTensorDescriptor_t mask_desc, const void* mask, const cnnlTensorDescriptor_t output_desc, void* output);
  static void Cumsum(const ExecutionContext& ctx, const int axis, const bool exclusive, const bool reverse, const cnnlTensorDescriptor_t input_desc, const void* input, const cnnlTensorDescriptor_t ouput_desc, void* output);
  static void BroadcastTo(const ExecutionContext& ctx, const cnnlTensorDescriptor_t input_desc, const void* input, const cnnlTensorDescriptor_t output_desc, void* output);
  static void GatherFunctor(const ExecutionContext& ctx, const int axis, const int batch_dims, const cnnlTensorDescriptor_t params_desc, const void* params, const cnnlTensorDescriptor_t indices_desc, const void* indices, const cnnlTensorDescriptor_t output_desc, void* output);
  static void ScatterRefFunctor(const ExecutionContext& ctx, const cnnlTensorDescriptor_t params_desc, const void* params, const cnnlTensorDescriptor_t updates_desc, const void* updates, const cnnlTensorDescriptor_t indices_desc, const void* indices, const cnnlScatterRefMode_t mode);
  static void ScatterFunctor(const ExecutionContext& ctx, const cnnlTensorDescriptor_t params_desc, void* params, const cnnlTensorDescriptor_t updates_desc, const void* updates, const cnnlTensorDescriptor_t indices_desc, const void* indices, const int dim, const cnnlScatterMode_t mode = CNNL_SCATTER);
  static void Range(const ExecutionContext& ctx, const void* start, const void* end, const void* step, const cnnlDataType_t output_dtype, void* output);
  static void Round(const ExecutionContext& ctx, const cnnlTensorDescriptor_t input_desc, const void* input, const cnnlTensorDescriptor_t output_desc, void* output);
  static void TopK(const ExecutionContext& ctx, const int k, const int dim, const bool largest, const bool sorted, const cnnlTensorDescriptor_t input_desc, const void* input, const cnnlTensorDescriptor_t values_output_desc, void* values_out, const cnnlTensorDescriptor_t indices_output_desc, void* indices_out);
  static void StridedSlice(const ExecutionContext& ctx, const int begin[], const int end[], const int strides[], const cnnlTensorDescriptor_t input_desc, const void* input, const cnnlTensorDescriptor_t output_desc, void* output);
  static void Split(const ExecutionContext& ctx, int split_num, int axis, const cnnlTensorDescriptor_t input_desc, const void* input_ptr, const cnnlTensorDescriptor_t output_descs[], void* output_ptrs[]);
  static void Split(const MLUDeviceContext& dev_ctx, int split_num, int axis, const cnnlTensorDescriptor_t input_desc, const void* input_ptr, const cnnlTensorDescriptor_t output_descs[], void* output_ptrs[]);
  static void Scale(const ExecutionContext& ctx, const int axis, const cnnlTensorDescriptor_t input_desc, const void* input, const cnnlTensorDescriptor_t alpha_desc, const void* alpha, const cnnlTensorDescriptor_t beta_desc, const void* beta, const cnnlTensorDescriptor_t output_desc, void* output);
  static void AddN(const ExecutionContext& ctx, uint32_t input_num, const cnnlTensorDescriptor_t inputs_desc[], const void* inputs[], const cnnlTensorDescriptor_t output_desc, void* output);
  static void Log(const ExecutionContext& ctx, cnnlComputationPreference_t prefer, cnnlLogBase_t log_base, const cnnlTensorDescriptor_t input_desc, const void* input, const cnnlTensorDescriptor_t output_desc, void* output);
  static void StridedSliceGrad(const ExecutionContext& ctx, const int begin[], const int end[], const int strides[], const cnnlTensorDescriptor_t input_desc, const void* input, const cnnlTensorDescriptor_t output_desc, void* output);
  static void Logic(const ExecutionContext& ctx, const cnnlLogicOp_t log_method, const cnnlTensorDescriptor_t input1_desc, const void* input1, const cnnlTensorDescriptor_t input2_desc, const void* input2, const cnnlTensorDescriptor_t ouput_desc, void* output);
  static void Select(const ExecutionContext& ctx, const cnnlTensorDescriptor_t condition_desc, const void* condition_ptr, const cnnlTensorDescriptor_t then_desc, const void* then_ptr, const cnnlTensorDescriptor_t else_desc, const void* else_ptr, const cnnlTensorDescriptor_t output_desc, void* output_ptr);
  static void AssignAdd(const ExecutionContext& ctx, const void* alpha, const void* beta, const cnnlTensorDescriptor_t update_desc, const void* update, const cnnlTensorDescriptor_t param_desc, void* param);
  static void AssignSub(const ExecutionContext& ctx, const void* alpha, const void* beta, const cnnlTensorDescriptor_t update_desc, const void* update, const cnnlTensorDescriptor_t param_desc, void* param);
  static void Assign(const ExecutionContext& ctx, const cnnlTensorDescriptor_t update_desc, const void* update, const cnnlTensorDescriptor_t param_desc, void* param);
  static void GatherNd(const ExecutionContext& ctx, const cnnlTensorDescriptor_t params_desc, const void* params, const cnnlTensorDescriptor_t indices_desc, const void* indices, const cnnlTensorDescriptor_t output_desc, void* output);
  static void BatchToSpace(const ExecutionContext& ctx, const cnnlTensorDescriptor_t input_desc, const void* input, const cnnlTensorDescriptor_t output_desc, void* output, const cnnlSpaceBatchParam_t param);
  static void BatchToSpaceNd(const ExecutionContext& ctx, const cnnlTensorDescriptor_t input_desc, const void* input, cnnlSpaceBatchNdDescriptor_t param, void* extra_device_input, size_t extra_input_size, const cnnlTensorDescriptor_t output_desc, void* output);
  static void PoolingForward(const ExecutionContext& ctx, cnnlPoolingMode_t pool_mode, int64_t output_h, int64_t output_w, cnnlPoolingDescriptor_t pooling_desc, const void* alpha, const cnnlTensorDescriptor_t input_desc, const void* input, const void* beta, const void* extra_input_ptr, const cnnlTensorDescriptor_t output_desc, void* output);
  static void AdaptivePoolingForward(const ExecutionContext& ctx, cnnlPoolingMode_t pool_mode, const cnnlTensorDescriptor_t input_desc, const void* input, const cnnlTensorDescriptor_t output_desc, void* output, const cnnlTensorDescriptor_t index_desc, void* index);
  static void Pool3D(const ExecutionContext& ctx, cnnlPoolingMode_t pool_mode, const std::vector<int64_t>& output_shape, cnnlPoolingDescriptor_t pooling_desc, const void* alpha, const cnnlTensorDescriptor_t input_desc, const void* input, const void* beta, const cnnlTensorDescriptor_t output_desc, void* output);
  static void Pad(const ExecutionContext& ctx, const cnnlTensorDescriptor_t input_desc, const void* input, const void* paddings, const void* padding_value, const cnnlTensorDescriptor_t output_desc, void* output);
  static void Matmul(const ExecutionContext& ctx, const bool transpose_a, const bool transpose_b, const cnnlTensorDescriptor_t in0_desc, const void* in0, const cnnlTensorDescriptor_t in1_desc, const void* in1, const cnnlTensorDescriptor_t output_desc, void* output);
  static void BatchMatmul(const ExecutionContext& ctx, const bool transpose_a, const bool transpose_b, const cnnlTensorDescriptor_t in0_desc, const void* in0, const cnnlTensorDescriptor_t in1_desc, const void* in1, const cnnlTensorDescriptor_t output_desc, void* output);
  static void MulAx(const ExecutionContext& ctx, const cnnlTensorDescriptor_t alpha_desc, const void* alpha, const cnnlTensorDescriptor_t output_desc, void* output);
  static void OpTensor(const ExecutionContext& ctx, const cnnlOpTensorDescriptor_t op_tensor_desc, const cnnlTensorDescriptor_t a_desc, const void* a, const cnnlTensorDescriptor_t b_desc, const void* b, const cnnlTensorDescriptor_t output_desc, void* output, const cnnlDataType_t dtype, const float alpha1_float = 1.f, const float alpha2_float = 1.f, const float beta_float = 0.f);
  static void BiasAddGrad(const ExecutionContext& ctx, const int axis, const cnnlTensorDescriptor_t out_backprop_desc, const void* out_backprop, const cnnlTensorDescriptor_t output_desc, void* output);
  static void OneHot(const ExecutionContext& ctx, const cnnlTensorDescriptor_t desc_indices, const void* indices, const int depth, const void* on_value, const void* off_value, const int axis, cnnlDataType_t output_data_type, void* output);
  static void NonMaxSuppression(const ExecutionContext& ctx, const cnnlNmsDescriptor_t nms_desc, const cnnlTensorDescriptor_t boxes_desc, const void* boxes, const cnnlTensorDescriptor_t confidence_desc, const void* confidence, const cnnlTensorDescriptor_t output_desc, void* output, void* output_size);
  static void SoftmaxCrossEntropyWithLogits(const ExecutionContext& ctx, cnnlSoftmaxMode_t mode, cnnlComputationPreference_t prefer, const cnnlTensorDescriptor_t input_desc, const void* logits_in, const cnnlTensorDescriptor_t label_desc, const void* labels_in, const cnnlTensorDescriptor_t loss_out_desc, void* loss_out, const cnnlTensorDescriptor_t back_out_desc, void* back_out);
  static void SoftmaxForward(const ExecutionContext& ctx, cnnlSoftmaxAlgorithm_t algorithm, cnnlSoftmaxMode_t mode, const void* alpha, const cnnlTensorDescriptor_t input_desc, const void* input, const void* beta, const cnnlTensorDescriptor_t output_desc, void* output);
  static void SoftmaxBackward(const ExecutionContext& ctx, cnnlSoftmaxAlgorithm_t algorithm, cnnlSoftmaxMode_t mode, const cnnlTensorDescriptor_t y_desc, const void* y, const cnnlTensorDescriptor_t diff_y_desc, const void* diff_y, const cnnlTensorDescriptor_t diff_x_desc, void* diff_x);
  static void Softplus(const ExecutionContext& ctx, const cnnlTensorDescriptor_t features_desc, const void* features, const cnnlTensorDescriptor_t output_desc, void* output);
  static void SoftplusGrad(const ExecutionContext& ctx, const cnnlTensorDescriptor_t gradients_desc, const void* gradients, const cnnlTensorDescriptor_t features_desc, const void* features, const cnnlTensorDescriptor_t output_desc, void* output);
  static void RsqrtGrad(const ExecutionContext& ctx, const cnnlTensorDescriptor_t data_desc, const void* y, const void* diff_y, void* output);
  static void SqrtGrad(const ExecutionContext& ctx, const cnnlTensorDescriptor_t data_desc, const void* y, const void* diff_y, void* output);
  static void ConvolutionForward(const ExecutionContext& ctx, cnnlConvolutionDescriptor_t conv_desc_, const void* alpha, const void* beta, const cnnlTensorDescriptor_t bias_desc, const void* bias_ptr, const cnnlTensorDescriptor_t input_desc, const void* input, const cnnlTensorDescriptor_t filtet_desc, const void* filter, const cnnlTensorDescriptor_t output_desc, void* output);
  static void FusedConvBNQuantify(const ExecutionContext& ctx, cnnlConvolutionDescriptor_t conv_desc, const void* epsilon_ptr, const int fused_ops_number, const cnnlDataType_t tensor_dtype, const int input_position, const float input_scale, const int filter_position, const float filter_scale, const cnnlTensorDescriptor_t scale_desc, const void* scale_ptr, const cnnlTensorDescriptor_t offset_desc, const void* offset_ptr, const cnnlTensorDescriptor_t mean_desc, const void* mean_ptr, const cnnlTensorDescriptor_t variance_desc, const void* variance_ptr, const cnnlTensorDescriptor_t input_desc, const void* input, const cnnlTensorDescriptor_t filtet_desc, const void* filter, const cnnlTensorDescriptor_t output_desc, void* output);
  static void Tile(const ExecutionContext& ctx, const cnnlTensorDescriptor_t input_desc, const void* input, const cnnlTensorDescriptor_t output_desc, void* output);
  static void UnsortedSegmentSum(const ExecutionContext& ctx, const cnnlTensorDescriptor_t data_desc, const void* data, const cnnlTensorDescriptor_t ids_desc, const int* segment_ids, const cnnlTensorDescriptor_t output_desc, void* output);
  static void Reduce(const ExecutionContext& ctx, const bool need_workspace, const cnnlReduceDescriptor_t reduction_desc, const void* alpha, const cnnlTensorDescriptor_t input_desc, const void* input, const size_t indices_size, void* indices, const void* beta, const cnnlTensorDescriptor_t output_desc, void* output);
  static void FloorDiv(const ExecutionContext& ctx, cnnlComputationPreference_t prefer, const cnnlTensorDescriptor_t input1_desc, const void* input1, const cnnlTensorDescriptor_t input2_desc, const void* input2, const cnnlTensorDescriptor_t output_desc, void* output);
  static void FloorMod(const ExecutionContext& ctx, const cnnlTensorDescriptor_t input1_desc, const void* input1, const cnnlTensorDescriptor_t input2_desc, const void* input2, const cnnlTensorDescriptor_t output_desc, void* output);
  static void Maximum(const ExecutionContext& ctx, const cnnlTensorDescriptor_t input1_desc, const void* input1, const cnnlTensorDescriptor_t input2_desc, const void* input2, const cnnlTensorDescriptor_t output_desc, void* output);
  static void Minimum(const ExecutionContext& ctx, const cnnlTensorDescriptor_t input1_desc, const void* input1, const cnnlTensorDescriptor_t input2_desc, const void* input2, const cnnlTensorDescriptor_t output_desc, void* output);
  static void Pow(const ExecutionContext& ctx, cnnlComputationPreference_t prefer, const cnnlTensorDescriptor_t input1_desc, const void* input1, const cnnlTensorDescriptor_t input2_desc, const void* input2, const cnnlTensorDescriptor_t output_desc, void* output);
  static void PowR(const ExecutionContext& ctx, cnnlComputationPreference_t prefer, const cnnlTensorDescriptor_t input1_desc, const void* input1, const cnnlTensorDescriptor_t input2_desc, const void* input2, const cnnlTensorDescriptor_t output_desc, void* output);
  static void DivNoNan(const ExecutionContext& ctx, cnnlComputationPreference_t prefer, const cnnlTensorDescriptor_t input1_desc, const void* input1, const cnnlTensorDescriptor_t input2_desc, const void* input2, const cnnlTensorDescriptor_t output_desc, void* output);
  static void SquaredDifference(const ExecutionContext& ctx, const cnnlTensorDescriptor_t input1_desc, const void* input1, const cnnlTensorDescriptor_t input2_desc, const void* input2, const cnnlTensorDescriptor_t output_desc, void* output);
  static void L2Loss(const ExecutionContext& ctx, const cnnlTensorDescriptor_t input_desc, const void* input, void* output);
  static void Abs(const ExecutionContext& ctx, const cnnlTensorDescriptor_t input_desc, const void* input, const cnnlTensorDescriptor_t output_desc, void* output);
  static void Neg(const ExecutionContext& ctx, const cnnlTensorDescriptor_t input_desc, const void* input, const cnnlTensorDescriptor_t output_desc, void* output);
  static void Floor(const ExecutionContext& ctx, const cnnlTensorDescriptor_t input_desc, const void* input, const cnnlTensorDescriptor_t output_desc, void* output);
  static void Ceil(const ExecutionContext& ctx, const cnnlTensorDescriptor_t input_desc, const void* input, const cnnlTensorDescriptor_t output_desc, void* output);
  static void IsNan(const ExecutionContext& ctx, const cnnlTensorDescriptor_t input_desc, const void* input, const cnnlTensorDescriptor_t output_desc, void* output);
  static void Square(const ExecutionContext& ctx, const cnnlTensorDescriptor_t input_desc, const void* input, const cnnlTensorDescriptor_t output_desc, void* output);
  static void Sqrt(const ExecutionContext& ctx, cnnlComputationPreference_t prefer, const cnnlTensorDescriptor_t input_desc, const void* input, const cnnlTensorDescriptor_t output_desc, void* output);
  static void Rsqrt(const ExecutionContext& ctx, cnnlComputationPreference_t prefer, const cnnlTensorDescriptor_t input_desc, const void* input, const cnnlTensorDescriptor_t output_desc, void* output);
  static void Cos(const ExecutionContext& ctx, cnnlComputationPreference_t prefer, const cnnlTensorDescriptor_t input_desc, const void* input, const cnnlTensorDescriptor_t output_desc, void* output);
  static void Sin(const ExecutionContext& ctx, cnnlComputationPreference_t prefer, const cnnlTensorDescriptor_t input_desc, const void* input, const cnnlTensorDescriptor_t output_desc, void* output);
  static void TrigonForward(const ExecutionContext& ctx, const cnnlTrigonDescriptor_t trigon_desc, const cnnlTensorDescriptor_t input_desc, const void* input, const cnnlTensorDescriptor_t output_desc, void* output);
  static void Exp(const ExecutionContext& ctx, cnnlComputationPreference_t prefer, const cnnlTensorDescriptor_t input_desc, const void* input, const cnnlTensorDescriptor_t output_desc, void* output);
  static void Sign(const ExecutionContext& ctx, const cnnlTensorDescriptor_t input_desc, const void* input, const cnnlTensorDescriptor_t output_desc, void* output);
  static void IndexSelect(const ExecutionContext& ctx, const int dim, cnnlTensorDescriptor_t input_desc, const void* input, const cnnlTensorDescriptor_t index_desc, const void* index, const cnnlTensorDescriptor_t output_desc, void* output);
  static void IsFinite(const ExecutionContext& ctx, const cnnlTensorDescriptor_t input_desc, const void* input, const cnnlTensorDescriptor_t output_desc, void* output);
  static void IsNanInf(const ExecutionContext& ctx, const cnnlTensorDescriptor_t input_desc, const void* input, void* output);
  static void Erf(const ExecutionContext& ctx, cnnlComputationPreference_t prefer, const cnnlTensorDescriptor_t input_desc, const void* input, const cnnlTensorDescriptor_t output_desc, void* output);
  static void Log1p(const ExecutionContext& ctx, cnnlComputationPreference_t prefer, const cnnlTensorDescriptor_t input_desc, const void* input, const cnnlTensorDescriptor_t output_desc, void* output);
  static void LogicalNot(const ExecutionContext& ctx, const cnnlTensorDescriptor_t input_desc, const void* input, const cnnlTensorDescriptor_t output_desc, void* output);
  static void DynamicStitch(const ExecutionContext& ctx, const cnnlTensorDescriptor_t* indices_desc, const int** indices, const cnnlTensorDescriptor_t* data_desc, const void** data, const int size, int* indices_dims, const cnnlTensorDescriptor_t output_desc, void* output);
  static void CropAndResize(const ExecutionContext& ctx, const std::string method_name, const float extrapolation_value, const cnnlTensorDescriptor_t image_desc, const void* image, const cnnlTensorDescriptor_t boxes_desc, const void* boxes, const cnnlTensorDescriptor_t box_index_desc, const void* box_index, const cnnlTensorDescriptor_t output_desc, void* output);
  static void CropAndResizeBackwardImage(const ExecutionContext& ctx, const std::string method_name, const cnnlTensorDescriptor_t image_desc, const void* image, const cnnlTensorDescriptor_t boxes_desc, const void* boxes, const cnnlTensorDescriptor_t box_idx_desc, const void* box_idx, const cnnlTensorDescriptor_t grads_image_desc, void* grads_image);
  static void CropAndResizeBackwardBoxes(const ExecutionContext& ctx, const cnnlTensorDescriptor_t input_desc, const void* input, const cnnlTensorDescriptor_t image_desc, const void* image, const cnnlTensorDescriptor_t boxes_desc, const void* boxes, const cnnlTensorDescriptor_t box_idx_desc, const void* box_idx, const cnnlTensorDescriptor_t output_desc, void* output);
  static void PoolingBackward(const ExecutionContext& ctx, const cnnlPoolingDescriptor_t pooling_desc, const void* alpha, const cnnlTensorDescriptor_t y_desc, const void* y, const cnnlTensorDescriptor_t diff_y_desc, const void* diff_y, const cnnlTensorDescriptor_t x_desc, const void* x, const void* beta, const cnnlTensorDescriptor_t diff_x_desc, void* diff_x);
  static void AdaptivePoolingBackward(const ExecutionContext& ctx, const cnnlPoolingMode_t pool_mode, const cnnlTensorDescriptor_t y_desc, const void* y, const cnnlTensorDescriptor_t index_desc, const void* index, const cnnlTensorDescriptor_t diff_x_desc, void* diff_x);
  static void PoolingIndex(const ExecutionContext& ctx, const cnnlPoolingDescriptor_t pooling_desc, const cnnlTensorDescriptor_t x_desc, const void* x, const cnnlTensorDescriptor_t y_desc, void* y);
  static void SpaceToBatch(const ExecutionContext& ctx, const cnnlTensorDescriptor_t input_desc, const void* input, const cnnlTensorDescriptor_t output_desc, void* output, const int64_t block_shape[]);
  static void SpaceToBatchNd(const ExecutionContext& ctx, const cnnlTensorDescriptor_t input_desc, const void* input, cnnlSpaceBatchNdDescriptor_t param, void* extra_device_input, size_t extra_input_size, const cnnlTensorDescriptor_t output_desc, void* output);
  static void Interp(const ExecutionContext& ctx, const cnnlInterpMode_t mode, const bool align_corners, const bool half_pixel_centers, const cnnlTensorDescriptor_t input_desc, const void* input, const cnnlTensorDescriptor_t output_desc, void* output);
  static void InterpBackward(const ExecutionContext& ctx, const cnnlInterpBackwardMode_t mode, const bool align_corners, const bool half_pixel_centers, const cnnlTensorDescriptor_t input_desc, const void* input, const cnnlTensorDescriptor_t output_desc, void* output);
  static void QuantizeParam(const ExecutionContext& ctx, const cnnlQuantizeMode_t mode, const int bitwidth, const cnnlTensorDescriptor_t input_desc, const void* input, void* position, void* scale, void* offset);
  static void QuantizeMatMul(const ExecutionContext& ctx, const bool transpose_a, const bool transpose_b, const cnnlTensorDescriptor_t a_desc, const void* a, const void* a_position, const void* a_scale, const void* a_offset, const cnnlTensorDescriptor_t b_desc, const void* b, const void* b_position, const void* b_scale, const void* b_offset, const cnnlDataType_t quant_type, const cnnlDataType_t data_type, const cnnlTensorDescriptor_t output_desc, void* output);
  static void QuantizeBatchMatMul(const ExecutionContext& ctx, const bool adj_x, const bool adj_y, const cnnlTensorDescriptor_t a_desc, const void* a, const void* a_position, const void* a_scale, const void* a_offset, const cnnlTensorDescriptor_t b_desc, const void* b, const void* b_position, const void* b_scale, const void* b_offset, const cnnlDataType_t quant_type, const cnnlDataType_t data_type, const cnnlTensorDescriptor_t output_desc, void* output);
  static void QuantizeBatchMatMulBCast(const ExecutionContext& ctx, const bool adj_x, const bool adj_y, const cnnlTensorDescriptor_t a_desc, const void* a, const void* a_position, const void* a_scale, const void* a_offset, const cnnlTensorDescriptor_t b_desc, const void* b, const void* b_position, const void* b_scale, const void* b_offset, const cnnlDataType_t quant_type, const cnnlDataType_t data_type, const cnnlTensorDescriptor_t output_desc, void* output);
  static void FusedBatchNorm(const ExecutionContext& ctx, const bool is_training, const cnnlTensorDescriptor_t x_desc, const void* x, const cnnlTensorDescriptor_t scale_desc, const void* scale, const void* offset, const void* estimated_mean, const void* estimated_variance, float epsilon, float momentum, const cnnlTensorDescriptor_t output_desc, void* output, void* batch_mean, void* batch_var, void* saved_mean, void* saved_var);
  static void FusedBatchNormGrad(const ExecutionContext& ctx, const bool is_training, const cnnlTensorDescriptor_t y_backprop_desc, const void* y_backprop, const cnnlTensorDescriptor_t x_desc, const void* x, const cnnlTensorDescriptor_t scale_desc, const void* scale, const void* saved_mean, const void* saved_var, float epsilon, const cnnlTensorDescriptor_t x_backprop_desc, void* x_backprop, void* scale_backprop, void* offset_backprop);
  static void LayerNormForward(const ExecutionContext& ctx, int axis, const cnnlTensorDescriptor_t x_desc, const void* x, const cnnlTensorDescriptor_t weight_bias_desc, const void* weight, const void* bias, float eps, const cnnlTensorDescriptor_t y_desc, void* y, const cnnlTensorDescriptor_t mean_rstd_desc, void* saved_mean, void* saved_rstd);
  static void LayerNormBackward(const ExecutionContext& ctx, int axis, const cnnlTensorDescriptor_t x_desc, const void* x, const cnnlTensorDescriptor_t diff_z_desc, const void* diff_z, const cnnlTensorDescriptor_t weight_bias_desc, const void* weight, const cnnlTensorDescriptor_t mean_rstd_desc, const void* saved_mean, const void* saved_rstd, const cnnlTensorDescriptor_t diff_x_desc, void* diff_x, void* diff_weight, void* diff_bias);
  static void Transpose(const ExecutionContext& ctx, const std::vector<int> perm, const int input_dim, const cnnlTensorDescriptor_t input_desc, const void* input, const cnnlTensorDescriptor_t output_desc, void* output);
  static void TrilTriu(const ExecutionContext& ctx, const int diagonal_k, const bool tri_up_mode, const cnnlTensorDescriptor_t input_desc, const void* input, const cnnlTensorDescriptor_t output_desc, void* output);
  static void MatrixBandPart(const ExecutionContext& ctx, const cnnlTensorDescriptor_t data_desc, const void* input, const int num_lower, const int num_upper, void* output);
  static void NumTrue(const ExecutionContext& ctx, const cnnlTensorDescriptor_t x_desc, const void* x, const cnnlTensorDescriptor_t num_true_desc, void* num_true);
  static void Where(const ExecutionContext& ctx, const cnnlTensorDescriptor_t x_desc, const void* x, const cnnlTensorDescriptor_t num_true_desc, const void* num_true, const bool as_tuple, const cnnlTensorDescriptor_t y_desc, void* y);
  static void Conv2D(const ExecutionContext& ctx, const cnnlConvolutionDescriptor_t conv_desc, const cnnlDataType_t tensor_dtype, const cnnlDataType_t dt_onchip, const void* input_position, const void* input_scale, const void* input_offset, const void* filter_position, const void* filter_scale, const void* filter_offset, const cnnlTensorDescriptor_t input_desc, const void* input, const cnnlTensorDescriptor_t filter_desc, const void* filter, const cnnlTensorDescriptor_t bias_desc, const void* bias, const cnnlTensorDescriptor_t output_desc, void* output);
  static void ConvBackpropInput(const ExecutionContext& ctx, const cnnlConvolutionDescriptor_t conv_desc, const cnnlTensorDescriptor_t filter_desc, const void* filter, const cnnlTensorDescriptor_t out_backprop_desc, const void* out_backprop, const cnnlTensorDescriptor_t in_backprop_desc, void* in_backprop);
  static void QuantizeConvBackpropInput(const ExecutionContext& ctx, const cnnlConvolutionDescriptor_t conv_desc, const cnnlDataType_t tensor_dtype, const cnnlDataType_t dt_onchip, const void* filter_position, const void* filter_scale, const void* filter_offset, const void* out_backprop_position, const void* out_backprop_scale, const void* out_backprop_offset, const cnnlTensorDescriptor_t input_desc, const void* filter, const cnnlTensorDescriptor_t out_backprop_desc, const void* out_backprop, const cnnlTensorDescriptor_t in_backprop_desc, void* in_backprop);
  static void ConvBackpropFilter(const ExecutionContext& ctx, const cnnlConvolutionDescriptor_t conv_desc, const cnnlTensorDescriptor_t input_desc, const void* input, const cnnlTensorDescriptor_t out_backprop_desc, const void* out_backprop, const cnnlTensorDescriptor_t filter_backprop_desc, void* filter_backprop);
  static void QuantizeConvBackpropFilter(const ExecutionContext& ctx, const cnnlConvolutionDescriptor_t conv_desc, const cnnlDataType_t tensor_dtype, const cnnlDataType_t dt_onchip, const void* input_position, const void* input_scale, const void* input_offset, const void* out_backprop_position, const void* out_backprop_scale, const void* out_backprop_offset, const cnnlTensorDescriptor_t input_desc, const void* input, const cnnlTensorDescriptor_t out_backprop_desc, const void* out_backprop, const cnnlTensorDescriptor_t filter_backprop_desc, void* filter_backprop);
  static void DCNForward(const ExecutionContext& ctx, const cnnlDCNDescriptor_t dcn_desc, const cnnlTensorDescriptor_t input_desc, const void* input, const cnnlTensorDescriptor_t offset_desc, const void* offset, const cnnlTensorDescriptor_t mask_desc, const void* mask, const cnnlTensorDescriptor_t weight_desc, const void* weight, const cnnlTensorDescriptor_t bias_desc, const void* bias, const cnnlTensorDescriptor_t output_desc, void* output);
  static void DCNBackwardData(const ExecutionContext& ctx, const cnnlDCNDescriptor_t dcn_desc, const cnnlTensorDescriptor_t input_desc, const void* input, const cnnlTensorDescriptor_t offset_desc, const void* offset, const cnnlTensorDescriptor_t mask_desc, const void* mask, const cnnlTensorDescriptor_t weight_desc, const void* weight, const cnnlTensorDescriptor_t grad_output_desc, const void* grad_output, const cnnlTensorDescriptor_t grad_input_desc, void* grad_input, const cnnlTensorDescriptor_t grad_offset_desc, void* grad_offset, const cnnlTensorDescriptor_t grad_mask_desc, void* grad_mask);
  static void DCNBackwardWeight(const ExecutionContext& ctx, const cnnlDCNDescriptor_t dcn_desc, const cnnlTensorDescriptor_t input_desc, const void* input, const cnnlTensorDescriptor_t offset_desc, const void* offset, const cnnlTensorDescriptor_t mask_desc, const void* mask, const cnnlTensorDescriptor_t grad_output_desc, const void* grad_output, const cnnlTensorDescriptor_t grad_weight_desc, void* grad_weight, const cnnlTensorDescriptor_t grad_bias_desc, void* grad_bias);
  static void InTopK(const ExecutionContext& ctx, const cnnlTensorDescriptor_t predictions_desc, const void* predictions, const cnnlTensorDescriptor_t targets_desc, const void* targets, const cnnlTensorDescriptor_t k_desc, const void* k, const int k_int, const cnnlTensorDescriptor_t output_desc, void* output);
  static void ScatterNd(const ExecutionContext& ctx, cnnlScatterNdMode_t mode, const cnnlTensorDescriptor_t indices_desc, const void* indices, const cnnlTensorDescriptor_t updates_desc, const void* updates, const cnnlTensorDescriptor_t input_desc, const void* input, const cnnlTensorDescriptor_t output_desc, void* output);
  static void BitWise(const ExecutionContext& ctx, const cnnlBitComputeOp_t optype, const cnnlTensorDescriptor_t input1_desc, const void* input1, const cnnlTensorDescriptor_t input2_desc, const void* input2, const cnnlTensorDescriptor_t output_desc, void* output);
  static void QR(const ExecutionContext& ctx, const cnnlTensorDescriptor_t a_desc, const void* a, const cnnlTensorDescriptor_t q_desc, void* q, const cnnlTensorDescriptor_t r_desc, void* r, const bool some);
  static void Reciprocal(const ExecutionContext& ctx, const cnnlTensorDescriptor_t input_desc, const void* input, const cnnlTensorDescriptor_t output_desc, void* output);
  static void BceLoss(const ExecutionContext& ctx, const cnnlBceLossReduction_t reduction, const cnnlTensorDescriptor_t input_desc, const void* input, const cnnlTensorDescriptor_t target_desc, const void* target, const cnnlTensorDescriptor_t weight_desc, const void* weight, const cnnlTensorDescriptor_t output_desc, void* output);
  static void BceLossBackward(const ExecutionContext& ctx, const cnnlBceLossReduction_t reduction, const cnnlTensorDescriptor_t grad_desc, const void* grad, const cnnlTensorDescriptor_t input_desc, const void* input, const cnnlTensorDescriptor_t target_desc, const void* target, const cnnlTensorDescriptor_t weight_desc, const void* weight, const cnnlTensorDescriptor_t output_desc, void* output);
  static void SmoothL1LossForward(const ExecutionContext& ctx, const cnnlTensorDescriptor_t x_desc, const void* x, const cnnlTensorDescriptor_t t_desc, const void* target, const float beta, const cnnlSmoothL1LossAlgorithm_t algorithm, const cnnlTensorDescriptor_t y_desc, void* y);
  static void SmoothL1LossBackward(const ExecutionContext& ctx, const cnnlTensorDescriptor_t x_desc, const void* x, const cnnlTensorDescriptor_t target_desc, const void* target, const cnnlTensorDescriptor_t dy_desc, const void* dy, const float beta, const cnnlSmoothL1LossAlgorithm_t algorithm, const cnnlTensorDescriptor_t dx_desc, void* dx);
  static void EmbeddingForward(const ExecutionContext& ctx, const int padding_idx, const cnnlTensorDescriptor_t weight_desc, const void* weight, const cnnlTensorDescriptor_t indices_desc, const int* indices, const cnnlTensorDescriptor_t output_desc, void* output);
  static void RNNForward(const ExecutionContext& ctx, const cnnlRNNDescriptor_t rnn_desc, const int dev_seq_lengths[], const void* weight_param_ptr, size_t weightspace_size, const cnnlSeqDataDescriptor_t x_desc, const void* x, const cnnlSeqDataDescriptor_t y_desc, void* y, const cnnlTensorDescriptor_t h_desc, const void* hx, void* hy, const cnnlTensorDescriptor_t c_desc, const void* cx, void* cy, void* reservespace_ptr);
  static void RNNBackward(const ExecutionContext& ctx, const cnnlRNNDescriptor_t rnn_desc, cnnlWgradMode_t add_grad, const int dev_seq_lengths[], const void* weight_param_ptr, void* dweight_param_ptr, size_t weightspace_size, const cnnlSeqDataDescriptor_t x_desc, const void* x, void* dx, const cnnlSeqDataDescriptor_t y_desc, const void* y, const void* dy, const cnnlTensorDescriptor_t hx_desc, const void* hx, const void* dhy, void* dhx, const cnnlTensorDescriptor_t cx_desc, const void* cx, const void* dcy, void* dcx, void* reservespace_ptr, size_t reservespace_size);
  static void Mask(const ExecutionContext& ctx, cnnlMaskedOp_t masked_mode, const cnnlTensorDescriptor_t input_desc, const void* input, const cnnlTensorDescriptor_t masked_desc, const void* masked, const cnnlTensorDescriptor_t value_desc, const void* value, const cnnlTensorDescriptor_t output_desc, void* output, uint32_t* number);
  static void Transform(const ExecutionContext& ctx, const void* alpha, const void* beta, const cnnlTensorDescriptor_t input_desc, const void* input, const cnnlTensorDescriptor_t output_desc, void* output);
  static void EmbeddingBackward(const ExecutionContext& ctx, int padding_idx, bool scale_grad_by_freq, const cnnlTensorDescriptor_t indices_desc, const void* indices, const cnnlTensorDescriptor_t diff_desc, const void* diff, const cnnlTensorDescriptor_t output_desc, void* output);
  static void BceWithLogits(const ExecutionContext& ctx, cnnlBceWithLogitsReduction_t reduction, const cnnlTensorDescriptor_t input_desc, const void* input, const cnnlTensorDescriptor_t target_desc, const void* target, const cnnlTensorDescriptor_t weight_desc, const void* weight, const cnnlTensorDescriptor_t pos_weight_desc, const void* pos_weight, const cnnlTensorDescriptor_t output_desc, void* output);
  static void BceWithLogitsBackward(const ExecutionContext& ctx, cnnlBceWithLogitsReduction_t reduction, const cnnlTensorDescriptor_t grad_desc, const void* grad, const cnnlTensorDescriptor_t input_desc, const void* input, const cnnlTensorDescriptor_t target_desc, const void* target, const cnnlTensorDescriptor_t weight_desc, const void* weight, const cnnlTensorDescriptor_t pos_weight_desc, const void* pos_weight, const cnnlTensorDescriptor_t diff_input_desc, void* diff_input);
  static void RoiAlign(const ExecutionContext& ctx, const int pooled_height, const int pooled_width, const int sampling_ratio, const float spatial_scale, const bool aligned, const cnnlTensorDescriptor_t input_desc, const void* input, const cnnlTensorDescriptor_t boxes_desc, const void* boxes, const cnnlTensorDescriptor_t output_desc, void* output);
  static void RoiAlignBackward(const ExecutionContext& ctx, const int sampling_ratio, const float spatial_scale, const bool aligned, const cnnlTensorDescriptor_t grads_desc, const void* grads, const cnnlTensorDescriptor_t boxes_desc, const void* boxes, const cnnlTensorDescriptor_t grads_image_desc, void* grads_image);
  static void GridSample(const ExecutionContext& ctx, const cnnlGridSampleDescriptor_t grid_sample_desc, const cnnlTensorDescriptor_t input_desc, const void* input, const cnnlTensorDescriptor_t grid_desc, const void* grid, const cnnlTensorDescriptor_t output_desc, void* output);
  static void SyncBatchNormStats(const ExecutionContext& ctx, const cnnlTensorDescriptor_t x_desc, const void* x, const float eps, const cnnlTensorDescriptor_t mean_desc, void* mean, const cnnlTensorDescriptor_t invstd_desc, void* invstd);
  static void SyncBatchNormGatherStatsWithCounts(const ExecutionContext& ctx, float momentum, float eps, const cnnlTensorDescriptor_t mean_all_desc, const void* mean_all, const cnnlTensorDescriptor_t invstd_all_desc, const void* invstd_all, const cnnlTensorDescriptor_t moving_mean_desc, void* moving_mean, const cnnlTensorDescriptor_t moving_var_desc, void* moving_var, const cnnlTensorDescriptor_t count_all_desc, const void* count_all, const cnnlTensorDescriptor_t mean_desc, void* mean, const cnnlTensorDescriptor_t invstd_desc, void* invstd);
  static void SyncBatchNormElemt(const ExecutionContext& ctx, const cnnlTensorDescriptor_t x_desc, const void* x, const cnnlTensorDescriptor_t mean_desc, const void* mean, const cnnlTensorDescriptor_t invstd_desc, const void* invstd, const cnnlTensorDescriptor_t weight_desc, const void* weight, const cnnlTensorDescriptor_t bias_desc, const void* bias, const cnnlTensorDescriptor_t y_desc, void* y);
  static void SyncBatchnormBackwardReduce(const ExecutionContext& ctx, const cnnlTensorDescriptor_t desc_dz, const void* dz, const cnnlTensorDescriptor_t desc_x, const void* x, const cnnlTensorDescriptor_t desc_mean, const void* mean, const cnnlTensorDescriptor_t desc_invstd, const void* invstd, const cnnlTensorDescriptor_t desc_dweight, void* dweight, const cnnlTensorDescriptor_t desc_dbias, void* dbias, const cnnlTensorDescriptor_t desc_sum_dy, void* sum_dy, const cnnlTensorDescriptor_t desc_sum_dy_xmu, void* sum_dy_xmu, const bool needs_input_grad0, const bool needs_input_grad1, const bool needs_input_grad2);
  static void SyncBatchNormBackwardElemt(const ExecutionContext& ctx, const cnnlTensorDescriptor_t diff_y_desc, const void* diff_y, const cnnlTensorDescriptor_t x_desc, const void* x, const cnnlTensorDescriptor_t mean_desc, const void* mean, const cnnlTensorDescriptor_t invstd_desc, const void* invstd, const cnnlTensorDescriptor_t weight_desc, const void* weight, const cnnlTensorDescriptor_t sum_dy_desc, const void* sum_dy, const cnnlTensorDescriptor_t sum_dy_xmu_desc, const void* sum_dy_xmu, const cnnlTensorDescriptor_t count_desc, const void* count, const cnnlTensorDescriptor_t diff_x_desc, void* diff_x);
};
class MLUOP {
 public:
  static void OpYoloBox(const ExecutionContext& ctx, const mluOpTensorDescriptor_t x_desc, const void* x, const mluOpTensorDescriptor_t img_size_desc, const void* img_size, const mluOpTensorDescriptor_t anchors_desc, const void* anchors, const int class_num, const float conf_thresh, const int downsample_ratio, const bool clip_bbox, const float scale, const bool iou_aware, const float iou_aware_factor, const mluOpTensorDescriptor_t boxes_desc, void* boxes, const mluOpTensorDescriptor_t scores_desc, void* scores);

  static void OpPriorBox(const ExecutionContext& ctx, const mluOpTensorDescriptor_t min_sizes_desc, const void* min_sizes, const mluOpTensorDescriptor_t aspect_ratios_desc, const void* aspect_ratios, const mluOpTensorDescriptor_t variances_desc, const void* variances, const mluOpTensorDescriptor_t max_sizes_desc, const void* max_sizes, const int height, const int width, const int im_height, const int im_width, const float step_h, const float step_w, const float offset, const bool clip, const bool min_max_aspect_ratios_order, const mluOpTensorDescriptor_t output_desc, void* output, const mluOpTensorDescriptor_t var_desc, void* var);
};
const std::map<const std::string, std::pair<std::vector<int>, std::vector<int>>>
    TransPermMap = {
        // trans_mode, (forward_perm, backward_perm)
        {"3D_NCHW2NHWC", {{0, 2, 1}, {0, 2, 1}}},
        {"4D_NCHW2NHWC", {{0, 2, 3, 1}, {0, 3, 1, 2}}},
        {"5D_NCHWD2NDHWC", {{0, 4, 2, 3, 1}, {0, 4, 2, 3, 1}}},
        {"5D_NHWDC2NDHWC", {{0, 3, 1, 2, 4}, {0, 2, 3, 4, 1}}}};
inline void SetMLUTransposePerm(const framework::DDim& dims,
                                const DataLayout& data_layout,
                                std::vector<int>* forward_perm,
                                std::vector<int>* backward_perm,
                                std::vector<int>* out_shape) {
  const int dim_size = dims.size();
  PADDLE_ENFORCE_EQ((dim_size >= 3) && (dim_size <= 5),
                    true,
                    platform::errors::InvalidArgument(
                        "MLUTransposePerm func only support (dim_size >= 3) && "
                        "(dim_size <= 5), but now dim_size is %d.",
                        dim_size));
  PADDLE_ENFORCE_EQ(
      (data_layout == DataLayout::kNCHW) || (data_layout == DataLayout::kNHWC),
      true,
      platform::errors::InvalidArgument(
          "MLUTransposePerm func only support DataLayout: kNCHW or kNHWC, but "
          "now data_layout is %s.",
          data_layout));

  // case 1: NCHW of Paddle != NHWC of MLU when dims==3,4
  // case 2: NHWDC and NCHWD of Paddle != NDHWC of MLU when dims==5
  std::string map_key = "";
  if (data_layout == DataLayout::kNCHW) {
    switch (dim_size) {
      case 3:
        map_key = "3D_NCHW2NHWC";
        break;
      case 4:
        map_key = "4D_NCHW2NHWC";
        break;
      case 5:
        map_key = "5D_NCHWD2NDHWC";
        break;
    }
  } else if (data_layout == DataLayout::kNHWC && dim_size == 5) {
    map_key = "5D_NHWDC2NDHWC";
  }
  assert(map_key != "");
  forward_perm->assign(TransPermMap.at(map_key).first.begin(),
                       TransPermMap.at(map_key).first.end());
  backward_perm->assign(TransPermMap.at(map_key).second.begin(),
                        TransPermMap.at(map_key).second.end());

  auto in_dims = phi::vectorize(dims);
  for (size_t i = 0; i < in_dims.size(); i++) {
    out_shape->push_back(in_dims[forward_perm->at(i)]);
  }
}
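To make the permutation table and the helper above concrete, here is a minimal usage sketch; the shape values and variable names below are invented for illustration and are not part of the deleted file. For a 4-D NCHW input the helper selects the "4D_NCHW2NHWC" entry, so forward_perm becomes {0, 2, 3, 1}, backward_perm becomes {0, 3, 1, 2}, and out_shape is the input shape reordered into NHWC.

// Hypothetical example: dims = [N=2, C=3, H=4, W=5] in kNCHW layout.
std::vector<int> forward_perm, backward_perm, out_shape;
SetMLUTransposePerm(phi::make_ddim({2, 3, 4, 5}),
                    DataLayout::kNCHW,
                    &forward_perm,
                    &backward_perm,
                    &out_shape);
// Expected (under the table above): forward_perm == {0, 2, 3, 1},
// backward_perm == {0, 3, 1, 2}, out_shape == {2, 4, 5, 3}.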
template <typename T>
inline void TransposeFromMLUTensor(const ExecutionContext& ctx,
                                   const std::vector<int> perm,
                                   const phi::DenseTensor* transformed_input,
                                   phi::DenseTensor* transformed_output,
                                   bool need_reshape_or_alloc) {
  const int dim_size = perm.size();
  if (need_reshape_or_alloc) {
    std::vector<int> output_shape;
    auto input_dims = transformed_input->dims();
    for (int i = 0; i < dim_size; ++i) {
      output_shape.push_back(input_dims[perm[i]]);
    }
    transformed_output->mutable_data<T>(
        framework::DDim(output_shape.data(), dim_size), ctx.GetPlace());
  }
  MLUCnnlTensorDesc trans_in_desc(
      *transformed_input, CNNL_LAYOUT_ARRAY, ToCnnlDataType<T>());
  MLUCnnlTensorDesc trans_out_desc(
      *transformed_output, CNNL_LAYOUT_ARRAY, ToCnnlDataType<T>());

  MLUCnnl::Transpose(ctx,
                     perm,
                     dim_size,
                     trans_in_desc.get(),
                     GetBasePtr(transformed_input),
                     trans_out_desc.get(),
                     GetBasePtr(transformed_output));
}
template <typename T>
inline void FillMLUTensorWithHostValue(const ExecutionContext& ctx,
                                       T value,
                                       phi::DenseTensor* out) {
  MLUCnnlTensorDesc out_desc(*out);
  MLUCnnl::Fill(
      ctx, CNNL_POINTER_MODE_HOST, &value, out_desc.get(), GetBasePtr(out));
}
}  // namespace operators
}  // namespace paddle
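For context on how these wrappers compose inside an operator, the following small fragment is a hypothetical sketch (the function name is invented and does not come from the deleted files): it allocates an output tensor and fills it with a host-side scalar via the FillMLUTensorWithHostValue helper declared above, which internally builds a MLUCnnlTensorDesc and dispatches MLUCnnl::Fill.

// Hypothetical fragment: fill `out` with the host-side constant 1.0.
template <typename T>
void FillOnesExample(const ExecutionContext& ctx, phi::DenseTensor* out) {
  out->mutable_data<T>(ctx.GetPlace());  // allocate on the current MLU place
  FillMLUTensorWithHostValue<T>(ctx, static_cast<T>(1.0), out);  // broadcast the scalar
}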
paddle/fluid/operators/optimizers/adam_op_mlu.cc
Deleted  100644 → 0
/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/tensor_util.h"
#include "paddle/fluid/operators/reduce_ops/reduce_op_mlu.h"
namespace paddle {
namespace operators {

template <typename T>
class AdamMLUKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& ctx) const override {
    const auto* param_var = ctx.InputVar("Param");
    PADDLE_ENFORCE_EQ(param_var->IsType<phi::DenseTensor>(), true,
                      platform::errors::InvalidArgument(
                          "The Var(%s)'s type should be phi::DenseTensor, "
                          "but the received is %s",
                          ctx.InputNames("Param").front(),
                          framework::ToTypeName(param_var->Type())));
    auto* param = ctx.Input<phi::DenseTensor>("Param");
    auto* grad_var = ctx.InputVar("Grad");
    PADDLE_ENFORCE_EQ(grad_var->IsType<phi::DenseTensor>(), true,
                      platform::errors::InvalidArgument(
                          "The Grad(%s)'s type should be phi::DenseTensor, "
                          "but the received is %s",
                          ctx.InputNames("Grad").front(),
                          framework::ToTypeName(param_var->Type())));
    auto* grad = ctx.Input<phi::DenseTensor>("Grad");
    auto* mom1 = ctx.Input<phi::DenseTensor>("Moment1");
    auto* mom2 = ctx.Input<phi::DenseTensor>("Moment2");
    auto* lr = ctx.Input<phi::DenseTensor>("LearningRate");
    auto* beta1_pow = ctx.Input<phi::DenseTensor>("Beta1Pow");
    auto* beta2_pow = ctx.Input<phi::DenseTensor>("Beta2Pow");

    auto* param_out = ctx.Output<phi::DenseTensor>("ParamOut");
    auto* mom1_out = ctx.Output<phi::DenseTensor>("Moment1Out");
    auto* mom2_out = ctx.Output<phi::DenseTensor>("Moment2Out");
    auto* beta1_pow_out = ctx.Output<phi::DenseTensor>("Beta1PowOut");
    auto* beta2_pow_out = ctx.Output<phi::DenseTensor>("Beta2PowOut");

    bool skip_update = false;
    if (ctx.HasInput("SkipUpdate")) {
      auto* skip_update_tensor = ctx.Input<phi::DenseTensor>("SkipUpdate");
      PADDLE_ENFORCE_EQ(skip_update_tensor->numel(), 1,
                        platform::errors::InvalidArgument(
                            "Input(SkipUpdate) size must be 1, but get %d",
                            skip_update_tensor->numel()));
      std::vector<bool> skip_update_vec;
      paddle::framework::TensorToVector(*skip_update_tensor, ctx.device_context(),
                                        &skip_update_vec);
      ctx.device_context().Wait();
      skip_update = skip_update_vec[0];
    }
    // skip_update=true, just copy input to output, and TensorCopy will call
    // mutable_data
    if (skip_update) {
      VLOG(4) << "Adam skip update";
      framework::TensorCopy(
          *param, ctx.GetPlace(),
          ctx.template device_context<platform::MLUDeviceContext>(), param_out);
      framework::TensorCopy(
          *mom1, ctx.GetPlace(),
          ctx.template device_context<platform::MLUDeviceContext>(), mom1_out);
      framework::TensorCopy(
          *mom2, ctx.GetPlace(),
          ctx.template device_context<platform::MLUDeviceContext>(), mom2_out);
      framework::TensorCopy(
          *beta1_pow, beta1_pow->place(),
          ctx.template device_context<platform::MLUDeviceContext>(), beta1_pow_out);
      framework::TensorCopy(
          *beta2_pow, beta2_pow->place(),
          ctx.template device_context<platform::MLUDeviceContext>(), beta2_pow_out);
      return;
    }

    bool use_global_beta_pow = ctx.Attr<bool>("use_global_beta_pow");
    VLOG(4) << "use_global_beta_pow:" << use_global_beta_pow;

    param_out->ShareDataWith(*param);
    mom1_out->ShareDataWith(*mom1);
    mom2_out->ShareDataWith(*mom2);

    phi::DenseTensor beta1_pow_tmp;
    phi::DenseTensor beta2_pow_tmp;
    if (beta1_pow->place() == platform::CPUPlace()) {
      T beta1 = *beta1_pow->data<T>();
      beta1_pow_tmp.mutable_data<T>({1}, ctx.GetPlace());
      MLUCnnlTensorDesc beta1_pow_tmp_desc(beta1_pow_tmp);
      MLUCnnl::Fill(ctx, CNNL_POINTER_MODE_HOST, &beta1, beta1_pow_tmp_desc.get(),
                    GetBasePtr(&beta1_pow_tmp));
      beta1_pow = &beta1_pow_tmp;
    }
    if (beta2_pow->place() == platform::CPUPlace()) {
      T beta2 = *beta2_pow->data<T>();
      beta2_pow_tmp.mutable_data<T>({1}, ctx.GetPlace());
      MLUCnnlTensorDesc beta2_pow_tmp_desc(beta2_pow_tmp);
      MLUCnnl::Fill(ctx, CNNL_POINTER_MODE_HOST, &beta2, beta2_pow_tmp_desc.get(),
                    GetBasePtr(&beta2_pow_tmp));
      beta2_pow = &beta2_pow_tmp;
    }

    VLOG(3) << "beta1_pow.numel() : " << beta1_pow->numel()
            << "beta2_pow.numel() : " << beta2_pow->numel();
    VLOG(3) << "param.numel(): " << param->numel();

    PADDLE_ENFORCE_EQ(beta1_pow_out->numel(), 1,
                      platform::errors::InvalidArgument(
                          "beta1 pow output size should be 1, but received "
                          "value is:%d.",
                          beta1_pow_out->numel()));
    PADDLE_ENFORCE_EQ(beta2_pow_out->numel(), 1,
                      platform::errors::InvalidArgument(
                          "beta2 pow output size should be 1, but received "
                          "value is:%d.",
                          beta2_pow_out->numel()));

    const phi::DenseTensor* beta1_tensor = nullptr;
    const phi::DenseTensor* beta2_tensor = nullptr;
    const phi::DenseTensor* epsilon_tensor = nullptr;

    phi::DenseTensor beta1_tmp(phi::DataType::FLOAT32);
    phi::DenseTensor beta2_tmp(phi::DataType::FLOAT32);
    phi::DenseTensor epsilon_tmp(phi::DataType::FLOAT32);

    if (ctx.HasInput("Beta1Tensor")) {
      beta1_tensor = ctx.Input<phi::DenseTensor>("Beta1Tensor");
      PADDLE_ENFORCE_EQ(beta1_tensor->numel(), 1,
                        platform::errors::InvalidArgument(
                            "Input(Beta1Tensor) size must be 1, but get %d",
                            beta1_tensor->numel()));
    } else {
      T beta1 = static_cast<T>(ctx.Attr<float>("beta1"));
      beta1_tmp.mutable_data<T>({1}, ctx.GetPlace());
      MLUCnnlTensorDesc beta1_tmp_desc(beta1_tmp);
      MLUCnnl::Fill(ctx, CNNL_POINTER_MODE_HOST, &beta1, beta1_tmp_desc.get(),
                    GetBasePtr(&beta1_tmp));
      beta1_tensor = &beta1_tmp;
    }
    if (ctx.HasInput("Beta2Tensor")) {
      beta2_tensor = ctx.Input<phi::DenseTensor>("Beta2Tensor");
      PADDLE_ENFORCE_EQ(beta2_tensor->numel(), 1,
                        platform::errors::InvalidArgument(
                            "Input(Beta2Tensor) size must be 1, but get %d",
                            beta2_tensor->numel()));
    } else {
      T beta2 = static_cast<T>(ctx.Attr<float>("beta2"));
      beta2_tmp.mutable_data<T>({1}, ctx.GetPlace());
      MLUCnnlTensorDesc beta2_tmp_desc(beta2_tmp);
      MLUCnnl::Fill(ctx, CNNL_POINTER_MODE_HOST, &beta2, beta2_tmp_desc.get(),
                    GetBasePtr(&beta2_tmp));
      beta2_tensor = &beta2_tmp;
    }
    if (ctx.HasInput("EpsilonTensor")) {
      epsilon_tensor = ctx.Input<phi::DenseTensor>("EpsilonTensor");
      PADDLE_ENFORCE_EQ(epsilon_tensor->numel(), 1,
                        platform::errors::InvalidArgument(
                            "Input(EpsilonTensor) size must be 1, but get %d",
                            epsilon_tensor->numel()));
    } else {
      T epsilon = static_cast<T>(ctx.Attr<float>("epsilon"));
      epsilon_tmp.mutable_data<T>({1}, ctx.GetPlace());
      MLUCnnlTensorDesc epsilon_tmp_desc(epsilon_tmp);
      MLUCnnl::Fill(ctx, CNNL_POINTER_MODE_HOST, &epsilon, epsilon_tmp_desc.get(),
                    GetBasePtr(&epsilon_tmp));
      epsilon_tensor = &epsilon_tmp;
    }

    MLUCnnlTensorDesc param_desc(*param);
    MLUCnnlTensorDesc mom1_desc(*mom1);
    MLUCnnlTensorDesc mom2_desc(*mom2);
    MLUCnnlTensorDesc grad_desc(*grad);
    MLUCnnl::ApplyAdam(ctx, param_desc.get(), GetBasePtr(param_out),
                       mom1_desc.get(), GetBasePtr(mom1_out), mom2_desc.get(),
                       GetBasePtr(mom2_out), grad_desc.get(), GetBasePtr(grad),
                       GetBasePtr(lr), GetBasePtr(beta1_tensor),
                       GetBasePtr(beta2_tensor), GetBasePtr(beta1_pow),
                       GetBasePtr(beta2_pow), GetBasePtr(epsilon_tensor),
                       /*use_nesterov*/ false);

    if (!use_global_beta_pow) {
      beta1_pow_out->mutable_data<T>(ctx.GetPlace());
      beta2_pow_out->mutable_data<T>(ctx.GetPlace());

      MLUCnnlTensorDesc beta1_desc(*beta1_tensor);
      MLUCnnlOpTensorDesc mul_op_desc(CNNL_OP_TENSOR_MUL, ToCnnlDataType<T>(),
                                      CNNL_NOT_PROPAGATE_NAN);
      MLUCnnl::OpTensor(ctx, mul_op_desc.get(), beta1_desc.get(),
                        GetBasePtr(beta1_pow), beta1_desc.get(),
                        GetBasePtr(beta1_tensor), beta1_desc.get(),
                        GetBasePtr(beta1_pow_out), ToCnnlDataType<T>());
      MLUCnnl::OpTensor(ctx, mul_op_desc.get(), beta1_desc.get(),
                        GetBasePtr(beta2_pow), beta1_desc.get(),
                        GetBasePtr(beta2_tensor), beta1_desc.get(),
                        GetBasePtr(beta2_pow_out), ToCnnlDataType<T>());
    }
  }
};

template <typename T>
class AdamWMLUKernel : public AdamMLUKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& ctx) const override {
    VLOG(3) << "MLU AdamW Kernel";
    bool skip_update = false;
    if (ctx.HasInput("SkipUpdate")) {
      VLOG(3) << "Has SkipUpdate";
      auto* skip_update_tensor = ctx.Input<phi::DenseTensor>("SkipUpdate");
      PADDLE_ENFORCE_EQ(skip_update_tensor->numel(), 1,
                        platform::errors::InvalidArgument(
                            "Input(SkipUpdate) size must be 1, but get %d",
                            skip_update_tensor->numel()));
      std::vector<bool> skip_update_vec;
      paddle::framework::TensorToVector(*skip_update_tensor, ctx.device_context(),
                                        &skip_update_vec);
      ctx.device_context().Wait();
      skip_update = skip_update_vec[0];
    }
    bool with_decay = ctx.Attr<bool>("with_decay");
    const bool multi_precision = ctx.Attr<bool>("multi_precision");
    auto* param_out = ctx.Output<phi::DenseTensor>("ParamOut");
    auto* master_param_out = ctx.Output<phi::DenseTensor>("MasterParamOut");
    const auto* master_param = ctx.Input<phi::DenseTensor>("MasterParam");
    VLOG(3) << "Skip update: " << skip_update << ", With decay: " << with_decay;
    if (!skip_update && with_decay) {
      auto* param = ctx.Input<phi::DenseTensor>("Param");
      MLUCnnlTensorDesc param_desc(*param);
      if (multi_precision) {
        VLOG(3) << "[adamw] multi_precision, cast masterparam to param.";
        bool has_master =
            ctx.HasInput("MasterParam") && ctx.HasOutput("MasterParamOut");
        PADDLE_ENFORCE_EQ(has_master, true,
                          platform::errors::InvalidArgument(
                              "The Input(MasterParam) and Output(MasterParamOut) "
                              "should not be null when "
                              "the attr `multi_precision` is true"));
        // cast masterparam (fp32) to param (fp16), then paramout (fp16) to
        // masterparamout (fp32)
        MLUCnnlTensorDesc master_param_desc(*master_param);
        cnnlCastDataType_t cast_type = GetCastDataType(
            framework::TransToProtoVarType(master_param->dtype()),
            framework::TransToProtoVarType(param->dtype()));
        MLUCnnl::Cast(ctx, cast_type, master_param_desc.get(),
                      GetBasePtr(master_param), param_desc.get(),
                      const_cast<void*>(GetBasePtr(param)));
      } else {
        const auto* param_var = ctx.InputVar("Param");
        PADDLE_ENFORCE_EQ(param_var->IsType<phi::DenseTensor>(), true,
                          platform::errors::InvalidArgument(
                              "The Var(%s)'s type should be phi::DenseTensor, "
                              "but the received is %s",
                              ctx.InputNames("Param").front(),
                              framework::ToTypeName(param_var->Type())));
        auto* lr = ctx.Input<phi::DenseTensor>("LearningRate");
        float coeff = ctx.Attr<float>("coeff");

        // update param with decay coeff: mul(-1 * lr, coeff * param) + param
        MLUCnnlTensorDesc lr_desc(*lr);
        MLUCnnlOpTensorDesc mul_op_desc(CNNL_OP_TENSOR_MUL, ToCnnlDataType<T>(),
                                        CNNL_NOT_PROPAGATE_NAN);
        MLUCnnl::OpTensor(ctx, mul_op_desc.get(), lr_desc.get(), GetBasePtr(lr),
                          param_desc.get(), GetBasePtr(param), param_desc.get(),
                          const_cast<void*>(GetBasePtr(param)), ToCnnlDataType<T>(),
                          /*alpha1*/ -1.f, /*alpha2*/ coeff, /*beta*/ 1.f);
      }
    }
    AdamMLUKernel<T>::Compute(ctx);

    if (multi_precision) {
      VLOG(3) << "[adamw] multi_precision, cast paramout to masterparamout.";
      // cast paramout to masterparamout
      master_param_out->mutable_data<float>(ctx.GetPlace());
      cnnlCastDataType_t cast_type = GetCastDataType(
          framework::TransToProtoVarType(param_out->dtype()),
          framework::TransToProtoVarType(master_param_out->dtype()));
      MLUCnnlTensorDesc param_out_desc(*param_out);
      MLUCnnlTensorDesc master_param_out_desc(*master_param_out);
      MLUCnnl::Cast(ctx, cast_type, param_out_desc.get(), GetBasePtr(param_out),
                    master_param_out_desc.get(), GetBasePtr(master_param_out));
    }
  }
};

template <typename T>
class MergedAdamMLUKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& ctx) const override {
    // Get inputs and outputs
    auto params = ctx.MultiInput<phi::DenseTensor>("Param");
    auto grads = ctx.MultiInput<phi::DenseTensor>("Grad");
    auto lrs = ctx.MultiInput<phi::DenseTensor>("LearningRate");
    auto mom1s = ctx.MultiInput<phi::DenseTensor>("Moment1");
    auto mom2s = ctx.MultiInput<phi::DenseTensor>("Moment2");
    auto beta1_pows = ctx.MultiInput<phi::DenseTensor>("Beta1Pow");
    auto beta2_pows = ctx.MultiInput<phi::DenseTensor>("Beta2Pow");
    auto master_params = ctx.MultiInput<phi::DenseTensor>("MasterParam");
    auto param_outs = ctx.MultiOutput<phi::DenseTensor>("ParamOut");
    auto mom1_outs = ctx.MultiOutput<phi::DenseTensor>("Moment1Out");
    auto mom2_outs = ctx.MultiOutput<phi::DenseTensor>("Moment2Out");
    auto beta1_pow_outs = ctx.MultiOutput<phi::DenseTensor>("Beta1PowOut");
    auto beta2_pow_outs = ctx.MultiOutput<phi::DenseTensor>("Beta2PowOut");

    // Check validation of inputs and outputs
    size_t param_num = params.size();
    PADDLE_ENFORCE_EQ(param_num, param_outs.size(),
                      platform::errors::InvalidArgument(
                          "The size of Output(ParamOut) must be equal to "
                          "Input(Param), but got the size of Output(ParamOut) "
                          "is %d, the size of Input(Param) is %d.",
                          param_outs.size(), param_num));

    bool skip_update = false;
    if (ctx.HasInput("SkipUpdate")) {
      auto* skip_update_tensor = ctx.Input<phi::DenseTensor>("SkipUpdate");
      PADDLE_ENFORCE_EQ(skip_update_tensor->numel(), 1,
                        platform::errors::InvalidArgument(
                            "Input(SkipUpdate) size must be 1, but get %d",
                            skip_update_tensor->numel()));
      std::vector<bool> skip_update_vec;
      paddle::framework::TensorToVector(*skip_update_tensor, ctx.device_context(),
                                        &skip_update_vec);
      ctx.device_context().Wait();
      skip_update = skip_update_vec[0];
    }
    // skip_update=true, just copy input to output, and TensorCopy will call
    // mutable_data
    if (skip_update) {
      VLOG(4) << "MergedAdam skip update";
      for (size_t i = 0; i < param_num; ++i) {
        framework::TensorCopy(
            *params[i], ctx.GetPlace(),
            ctx.template device_context<platform::MLUDeviceContext>(), param_outs[i]);
        framework::TensorCopy(
            *mom1s[i], ctx.GetPlace(),
            ctx.template device_context<platform::MLUDeviceContext>(), mom1_outs[i]);
        framework::TensorCopy(
            *mom2s[i], ctx.GetPlace(),
            ctx.template device_context<platform::MLUDeviceContext>(), mom2_outs[i]);
        framework::TensorCopy(
            *beta1_pows[i], beta1_pows[i]->place(),
            ctx.template device_context<platform::MLUDeviceContext>(),
            beta1_pow_outs[i]);
        framework::TensorCopy(
            *beta2_pows[i], beta2_pows[i]->place(),
            ctx.template device_context<platform::MLUDeviceContext>(),
            beta2_pow_outs[i]);
      }
      return;
    }

    bool use_global_beta_pow = ctx.Attr<bool>("use_global_beta_pow");
    VLOG(4) << "use_global_beta_pow:" << use_global_beta_pow;

    // Get beta1, beta2 and epsilon from attribute.
    const phi::DenseTensor* beta1_tensor = nullptr;
    const phi::DenseTensor* beta2_tensor = nullptr;
    const phi::DenseTensor* epsilon_tensor = nullptr;

    phi::DenseTensor beta1_tmp(phi::DataType::FLOAT32);
    phi::DenseTensor beta2_tmp(phi::DataType::FLOAT32);
    phi::DenseTensor epsilon_tmp(phi::DataType::FLOAT32);

    T beta1 = static_cast<T>(ctx.Attr<float>("beta1"));
    T beta2 = static_cast<T>(ctx.Attr<float>("beta2"));
    T epsilon = static_cast<T>(ctx.Attr<float>("epsilon"));
    beta1_tmp.mutable_data<T>({1}, ctx.GetPlace());
    beta2_tmp.mutable_data<T>({1}, ctx.GetPlace());
    epsilon_tmp.mutable_data<T>({1}, ctx.GetPlace());
    MLUCnnlTensorDesc beta1_tmp_desc(beta1_tmp);
    MLUCnnlTensorDesc beta2_tmp_desc(beta2_tmp);
    MLUCnnlTensorDesc epsilon_tmp_desc(epsilon_tmp);
    MLUCnnl::Fill(ctx, CNNL_POINTER_MODE_HOST, &beta1, beta1_tmp_desc.get(),
                  GetBasePtr(&beta1_tmp));
    MLUCnnl::Fill(ctx, CNNL_POINTER_MODE_HOST, &beta2, beta2_tmp_desc.get(),
                  GetBasePtr(&beta2_tmp));
    MLUCnnl::Fill(ctx, CNNL_POINTER_MODE_HOST, &epsilon, epsilon_tmp_desc.get(),
                  GetBasePtr(&epsilon_tmp));
    beta1_tensor = &beta1_tmp;
    beta2_tensor = &beta2_tmp;
    epsilon_tensor = &epsilon_tmp;

    // Loop to compute
    for (size_t i = 0; i < param_num; ++i) {
      VLOG(4) << "[MergedAdam] loop: " << i;
      param_outs[i]->ShareDataWith(*params[i]);
      mom1_outs[i]->ShareDataWith(*mom1s[i]);
      mom2_outs[i]->ShareDataWith(*mom2s[i]);

      phi::DenseTensor beta1_pow_tmp;
      phi::DenseTensor beta2_pow_tmp;
      if (beta1_pows[i]->place() == platform::CPUPlace()) {
        T beta1 = *beta1_pows[i]->data<T>();
        beta1_pow_tmp.mutable_data<T>({1}, ctx.GetPlace());
        MLUCnnlTensorDesc beta1_pow_tmp_desc(beta1_pow_tmp);
        MLUCnnl::Fill(ctx, CNNL_POINTER_MODE_HOST, &beta1, beta1_pow_tmp_desc.get(),
                      GetBasePtr(&beta1_pow_tmp));
        beta1_pows[i] = &beta1_pow_tmp;
      }
      if (beta2_pows[i]->place() == platform::CPUPlace()) {
        T beta2 = *beta2_pows[i]->data<T>();
        beta2_pow_tmp.mutable_data<T>({1}, ctx.GetPlace());
        MLUCnnlTensorDesc beta2_pow_tmp_desc(beta2_pow_tmp);
        MLUCnnl::Fill(ctx, CNNL_POINTER_MODE_HOST, &beta2, beta2_pow_tmp_desc.get(),
                      GetBasePtr(&beta2_pow_tmp));
        beta2_pows[i] = &beta2_pow_tmp;
      }

      VLOG(3) << "beta1_pow.numel() : " << beta1_pows[i]->numel()
              << "beta2_pow.numel() : " << beta2_pows[i]->numel();
      VLOG(3) << "param.numel(): " << params[i]->numel();
      PADDLE_ENFORCE_EQ(beta1_pow_outs[i]->numel(), 1,
                        platform::errors::InvalidArgument(
                            "beta1 pow output size should be 1, but received "
                            "value is:%d.",
                            beta1_pow_outs[i]->numel()));
      PADDLE_ENFORCE_EQ(beta2_pow_outs[i]->numel(), 1,
                        platform::errors::InvalidArgument(
                            "beta2 pow output size should be 1, but received "
                            "value is:%d.",
                            beta2_pow_outs[i]->numel()));

      MLUCnnlTensorDesc param_desc(*params[i]);
      MLUCnnlTensorDesc mom1_desc(*mom1s[i]);
      MLUCnnlTensorDesc mom2_desc(*mom2s[i]);
      MLUCnnlTensorDesc grad_desc(*grads[i]);
      MLUCnnl::ApplyAdam(ctx, param_desc.get(), GetBasePtr(param_outs[i]),
                         mom1_desc.get(), GetBasePtr(mom1_outs[i]), mom2_desc.get(),
                         GetBasePtr(mom2_outs[i]), grad_desc.get(),
                         GetBasePtr(grads[i]), GetBasePtr(lrs[i]),
                         GetBasePtr(beta1_tensor), GetBasePtr(beta2_tensor),
                         GetBasePtr(beta1_pows[i]), GetBasePtr(beta2_pows[i]),
                         GetBasePtr(epsilon_tensor), /*use_nesterov*/ false);

      if (!use_global_beta_pow) {
        beta1_pow_outs[i]->mutable_data<T>(ctx.GetPlace());
        beta2_pow_outs[i]->mutable_data<T>(ctx.GetPlace());

        MLUCnnlTensorDesc beta1_desc(*beta1_tensor);
        MLUCnnlOpTensorDesc mul_op_desc(CNNL_OP_TENSOR_MUL, ToCnnlDataType<T>(),
                                        CNNL_NOT_PROPAGATE_NAN);
        MLUCnnl::OpTensor(ctx, mul_op_desc.get(), beta1_desc.get(),
                          GetBasePtr(beta1_pows[i]), beta1_desc.get(),
                          GetBasePtr(beta1_tensor), beta1_desc.get(),
                          GetBasePtr(beta1_pow_outs[i]), ToCnnlDataType<T>());
        MLUCnnl::OpTensor(ctx, mul_op_desc.get(), beta1_desc.get(),
                          GetBasePtr(beta2_pows[i]), beta1_desc.get(),
                          GetBasePtr(beta2_tensor), beta1_desc.get(),
                          GetBasePtr(beta2_pow_outs[i]), ToCnnlDataType<T>());
      }
    }
  }
};

}  // namespace operators
}  // namespace paddle

namespace ops = paddle::operators;
namespace plat = paddle::platform;

REGISTER_OP_MLU_KERNEL(adam, ops::AdamMLUKernel<float>,
                       ops::AdamMLUKernel<plat::float16>);
REGISTER_OP_MLU_KERNEL(adamw, ops::AdamWMLUKernel<float>,
                       ops::AdamWMLUKernel<plat::float16>);
REGISTER_OP_MLU_KERNEL(merged_adam, ops::MergedAdamMLUKernel<float>,
                       ops::MergedAdamMLUKernel<plat::float16>);
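The deleted kernels above mostly marshal scalars (beta1, beta2, epsilon, and the CPU-resident beta powers) into 1-element MLU tensors and then delegate the element-wise update to MLUCnnl::ApplyAdam; when use_global_beta_pow is false they also advance Beta1Pow/Beta2Pow with one extra multiply. The exact CNNL semantics are not visible in this diff, so the following is only a sketch of the standard Adam recurrence those calls are expected to implement (plain C++; AdamStep is an illustrative name):

#include <cmath>
#include <cstdio>

// Standard per-element Adam step; beta1_pow/beta2_pow play the role of
// beta1^t and beta2^t used for bias correction.
void AdamStep(float& p, float& m, float& v, float g, float lr, float beta1,
              float beta2, float eps, float beta1_pow, float beta2_pow) {
  m = beta1 * m + (1.f - beta1) * g;
  v = beta2 * v + (1.f - beta2) * g * g;
  float m_hat = m / (1.f - beta1_pow);
  float v_hat = v / (1.f - beta2_pow);
  p -= lr * m_hat / (std::sqrt(v_hat) + eps);
}

int main() {
  float p = 1.f, m = 0.f, v = 0.f;
  AdamStep(p, m, v, /*g=*/0.5f, /*lr=*/1e-3f, 0.9f, 0.999f, 1e-8f,
           /*beta1_pow=*/0.9f, /*beta2_pow=*/0.999f);
  std::printf("p=%f m=%f v=%f\n", p, m, v);
  return 0;
}

AdamW differs only in the optional decoupled weight-decay step applied before this update, which is what the kernel's OpTensor(MUL) call with alpha1 = -1, alpha2 = coeff and beta = 1 computes (param = param - lr * coeff * param, per its own comment).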
paddle/fluid/operators/optimizers/merged_momentum_op_mlu.cc
deleted (100644 → 0)
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/operator.h"
#include "paddle/fluid/framework/tensor.h"
#include "paddle/fluid/operators/amp/fp16_type_traits.h"
#include "paddle/fluid/operators/mlu/mlu_baseop.h"
#include "paddle/fluid/platform/for_range.h"
#include "paddle/fluid/platform/macros.h"
#include "paddle/phi/kernels/impl/momentum_kernel_impl.h"
namespace paddle {
namespace operators {

template <typename T>
class MLUMergedMomentumOpKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& ctx) const override {
    auto params = ctx.MultiInput<phi::DenseTensor>("Param");
    auto params_out = ctx.MultiOutput<phi::DenseTensor>("ParamOut");
    size_t n = params.size();
    PADDLE_ENFORCE_EQ(n, params_out.size(),
                      platform::errors::InvalidArgument(
                          "The size of Output(ParamOut) must be equal to "
                          "Input(Param), but got the size of Output(ParamOut) "
                          "is %d, the size of Input(Param) is %d.",
                          params_out.size(), n));
    for (size_t i = 0; i < n; ++i) {
      PADDLE_ENFORCE_EQ(params[i], params_out[i],
                        platform::errors::InvalidArgument(
                            "The size of Input(Param) and Output(ParamOut) "
                            "must be the same Tensors."));
    }

    auto grads = ctx.MultiInput<phi::DenseTensor>("Grad");
    PADDLE_ENFORCE_EQ(
        n, grads.size(),
        platform::errors::InvalidArgument(
            "The size of Input(Grad) must be equal to Input(Param), but got "
            "the size of Input(Grad) is %d, the size of Input(Param) is %d.",
            grads.size(), n));

    auto velocitys = ctx.MultiInput<phi::DenseTensor>("Velocity");
    PADDLE_ENFORCE_EQ(n, velocitys.size(),
                      platform::errors::InvalidArgument(
                          "The size of Input(Velocity) must be equal to "
                          "Input(Param), but got the size of Input(Velocity) "
                          "is %d, the size of Input(Param) is %d.",
                          velocitys.size(), n));

    auto velocitys_out = ctx.MultiOutput<phi::DenseTensor>("VelocityOut");
    PADDLE_ENFORCE_EQ(
        n, velocitys_out.size(),
        platform::errors::InvalidArgument(
            "The size of Output(VelocityOut) must be "
            "equal to Input(Param), but got the size of Output(VelocityOut) is "
            "%d, the size of Input(Param) is %d.",
            velocitys_out.size(), n));
    for (size_t i = 0; i < n; ++i) {
      PADDLE_ENFORCE_EQ(velocitys[i], velocitys_out[i],
                        platform::errors::InvalidArgument(
                            "Input(Velocity) and Output(VelocityOut) must be "
                            "the same Tensors."));
    }

    auto mu = static_cast<T>(ctx.Attr<float>("mu"));
    auto lrs = ctx.MultiInput<phi::DenseTensor>("LearningRate");
    if (lrs.size() != 1) {
      PADDLE_ENFORCE_EQ(
          n, lrs.size(),
          platform::errors::InvalidArgument(
              "If the size of Input(LearningRate) is not 1, the size of "
              "Input(LearningRate) must be "
              "equal to Input(Param), but got the size of Input(LearningRate) "
              "is %d, the size of Input(Param) is %d.",
              lrs.size(), n));
    }
    auto use_nesterov = ctx.Attr<bool>("use_nesterov");
    auto regularization_methods =
        ctx.Attr<std::vector<std::string>>("regularization_method");
    auto regularization_coeffs =
        ctx.Attr<std::vector<float>>("regularization_coeff");
    if (regularization_methods.size() != 0) {
      PADDLE_ENFORCE_EQ(
          n, regularization_methods.size(),
          platform::errors::InvalidArgument(
              "The size of Attr(regularization_method) must be equal "
              "to Input(Param), but got the size of "
              "Attr(regularization_method) is %d, the size of Input(Param) is "
              "%d.",
              regularization_methods.size(), n));
      PADDLE_ENFORCE_EQ(
          n, regularization_coeffs.size(),
          platform::errors::InvalidArgument(
              "The size of Attr(regularization_coeff) must be equal "
              "to Input(Param), but got the size of Attr(regularization_coeff) "
              "is %d, the size of Input(Param) is %d.",
              regularization_coeffs.size(), n));
    }

    VLOG(5) << "use_nesterov: " << use_nesterov
            << ", regularization_methods.size(): " << regularization_methods.size()
            << ", regularization_coeffs.size(): " << regularization_coeffs.size();

    auto& dev_ctx = ctx.template device_context<platform::MLUDeviceContext>();

    phi::DenseTensor mu_tensor =
        ctx.AllocateTmpTensor<T, MLUDeviceContext>({1}, dev_ctx);
    MLUCnnlTensorDesc mu_tensor_desc(mu_tensor);
    MLUCnnl::Fill(ctx, CNNL_POINTER_MODE_HOST, &mu, mu_tensor_desc.get(),
                  GetBasePtr(&mu_tensor));

    for (size_t idx = 0; idx < n; ++idx) {
      phi::RegularizationType regularization_flag =
          regularization_methods.size() > 0 &&
                  regularization_methods[idx] == "l2_decay"
              ? phi::RegularizationType::kL2DECAY
              : phi::RegularizationType::kNONE;
      T regularization_coeff = static_cast<T>(0.0);
      if (regularization_coeffs.size() != 0) {
        regularization_coeff = static_cast<T>(regularization_coeffs[idx]);
      }

      auto learning_rate = lrs.size() > 1 ? lrs[idx] : lrs[0];
      auto param_out = params_out[idx];
      auto velocity_out = velocitys_out[idx];
      auto grad = grads[idx];

      phi::DenseTensor regularized_grad;
      MLUCnnlTensorDesc param_desc(*param_out);
      if (regularization_flag == phi::RegularizationType::kL2DECAY) {
        regularized_grad =
            ctx.AllocateTmpTensor<T, MLUDeviceContext>(param_out->dims(), dev_ctx);
        MLUCnnlOpTensorDesc op_tensor_desc(CNNL_OP_TENSOR_ADD, ToCnnlDataType<T>(),
                                           CNNL_NOT_PROPAGATE_NAN);
        MLUCnnl::OpTensor(ctx, op_tensor_desc.get(), param_desc.get(),
                          GetBasePtr(param_out), param_desc.get(), GetBasePtr(grad),
                          param_desc.get(), GetBasePtr(&regularized_grad),
                          ToCnnlDataType<T>(), regularization_coeff);
      } else {
        regularized_grad = *grad;
      }

      MLUCnnl::ApplyMomentum(ctx, param_desc.get(), GetBasePtr(&regularized_grad),
                             use_nesterov, GetBasePtr(learning_rate),
                             GetBasePtr(&mu_tensor), GetBasePtr(param_out),
                             GetBasePtr(velocity_out));
    }
  }
};

}  // namespace operators
}  // namespace paddle

namespace ops = paddle::operators;
namespace plat = paddle::platform;

REGISTER_OP_MLU_KERNEL(merged_momentum, ops::MLUMergedMomentumOpKernel<float>,
                       ops::MLUMergedMomentumOpKernel<plat::float16>);
paddle/fluid/operators/optimizers/momentum_op_mlu.cc
deleted (100644 → 0)
/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/mlu/mlu_baseop.h"
#include "paddle/fluid/operators/optimizers/momentum_op.h"
#include "paddle/phi/kernels/impl/momentum_kernel_impl.h"
namespace paddle {
namespace operators {

template <typename T>
class MLUMomentumOpKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& ctx) const override {
    auto& dev_ctx = ctx.template device_context<platform::MLUDeviceContext>();

    std::string regularization_method =
        ctx.Attr<std::string>("regularization_method");
    auto regularization_coeff = ctx.Attr<float>("regularization_coeff");
    phi::RegularizationType regularization_flag{
        phi::RegularizationType::kNONE};  // disable regularization
    if (regularization_method == "l2_decay") {
      regularization_flag = phi::RegularizationType::kL2DECAY;
    }

    T mu = static_cast<T>(ctx.Attr<float>("mu"));
    bool use_nesterov = ctx.Attr<bool>("use_nesterov");

    auto learning_rate = ctx.Input<phi::DenseTensor>("LearningRate");
    auto param = ctx.Input<phi::DenseTensor>("Param");
    auto velocity = ctx.Input<phi::DenseTensor>("Velocity");

    auto param_out = ctx.Output<phi::DenseTensor>("ParamOut");
    auto velocity_out = ctx.Output<phi::DenseTensor>("VelocityOut");

    param_out->mutable_data<T>(ctx.GetPlace());
    velocity_out->mutable_data<T>(ctx.GetPlace());

    auto* grad_var = ctx.InputVar("Grad");
    if (grad_var->IsType<phi::DenseTensor>()) {
      auto grad = ctx.Input<phi::DenseTensor>("Grad");
      phi::DenseTensor mu_tensor =
          ctx.AllocateTmpTensor<T, MLUDeviceContext>({1}, dev_ctx);
      MLUCnnlTensorDesc mu_tensor_desc(mu_tensor);
      MLUCnnl::Fill(ctx, CNNL_POINTER_MODE_HOST, &mu, mu_tensor_desc.get(),
                    GetBasePtr(&mu_tensor));

      phi::DenseTensor regularized_grad;
      MLUCnnlTensorDesc param_desc(*param);
      if (regularization_flag == phi::RegularizationType::kL2DECAY) {
        regularized_grad =
            ctx.AllocateTmpTensor<T, MLUDeviceContext>(param->dims(), dev_ctx);
        MLUCnnlOpTensorDesc op_tensor_desc(CNNL_OP_TENSOR_ADD, ToCnnlDataType<T>(),
                                           CNNL_NOT_PROPAGATE_NAN);
        MLUCnnl::OpTensor(ctx, op_tensor_desc.get(), param_desc.get(),
                          GetBasePtr(param), param_desc.get(), GetBasePtr(grad),
                          param_desc.get(), GetBasePtr(&regularized_grad),
                          ToCnnlDataType<T>(), regularization_coeff);
      } else {
        regularized_grad = *grad;
      }

      framework::TensorCopy(*param, ctx.GetPlace(), dev_ctx, param_out);
      framework::TensorCopy(*velocity, ctx.GetPlace(), dev_ctx, velocity_out);
      MLUCnnl::ApplyMomentum(ctx, param_desc.get(), GetBasePtr(&regularized_grad),
                             use_nesterov, GetBasePtr(learning_rate),
                             GetBasePtr(&mu_tensor), GetBasePtr(param_out),
                             GetBasePtr(velocity_out));
    } else if (grad_var->IsType<phi::SelectedRows>()) {
      PADDLE_ENFORCE_EQ(
          false, true,
          platform::errors::PermissionDenied("Unsupport SparseMomentum"));
    } else {
      PADDLE_ENFORCE_EQ(false, true,
                        platform::errors::PermissionDenied(
                            "Unsupported Variable Type of Grad "
                            "in MomentumOp. Excepted LodTensor "
                            "or SelectedRows, But received [%s]",
                            paddle::framework::ToTypeName(grad_var->Type())));
    }
  }
};

}  // namespace operators
}  // namespace paddle

namespace ops = paddle::operators;
namespace plat = paddle::platform;

REGISTER_OP_MLU_KERNEL(momentum, ops::MLUMomentumOpKernel<float>,
                       ops::MLUMomentumOpKernel<plat::float16>);
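Both deleted momentum kernels follow the same recipe: optionally fold L2 decay into the gradient with an OpTensor(ADD) scaled by regularization_coeff, then hand the result to MLUCnnl::ApplyMomentum. The per-element arithmetic lives inside CNNL and is not shown in this diff, so the following is only a scalar sketch of the conventional momentum update under that assumption (plain C++; MomentumStep is an illustrative name, and the placement of the coeff scaling is inferred from the OpTensor call):

#include <cstdio>

// Scalar form of the update the deleted kernels assemble from CNNL primitives:
// L2 decay folded into the gradient, then a (possibly Nesterov) momentum step.
void MomentumStep(float& p, float& vel, float g, float lr, float mu,
                  float l2_coeff, bool use_nesterov) {
  float g_reg = g + l2_coeff * p;  // OpTensor(ADD) with alpha = regularization_coeff
  vel = mu * vel + g_reg;          // velocity update
  p -= use_nesterov ? lr * (g_reg + mu * vel) : lr * vel;
}

int main() {
  float p = 1.f, vel = 0.f;
  MomentumStep(p, vel, /*g=*/0.2f, /*lr=*/0.1f, /*mu=*/0.9f,
               /*l2_coeff=*/1e-4f, /*use_nesterov=*/false);
  std::printf("p=%f vel=%f\n", p, vel);
  return 0;
}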
paddle/fluid/operators/reader/buffered_reader.cc
view file @ e75c01f9

@@ -63,21 +63,6 @@ BufferedReader::BufferedReader(
   }
 #endif
-#ifdef PADDLE_WITH_MLU
-  if (platform::is_mlu_place(place_)) {
-    int dev_idx = place_.device;
-    compute_stream_ =
-        ((platform::MLUDeviceContext *)(platform::DeviceContextPool::Instance()
-                                            .Get(place_)))
-            ->stream();
-    events_.resize(buffer_size);
-    for (auto &event : events_) {
-      event = platform::MluEventResourcePool::Instance().New(dev_idx);
-    }
-    stream_ = platform::MluStreamResourcePool::Instance().New(dev_idx);
-  }
-#endif
 #ifdef PADDLE_WITH_XPU
   if (platform::is_xpu_place(place_)) {
     int dev_idx = place_.device;
...
@@ -260,57 +245,6 @@ void BufferedReader::ReadAsync(size_t i) {
   }
 #endif
-#ifdef PADDLE_WITH_MLU
-  if (platform::is_mlu_place(place_)) {
-    TensorVec &mlu = mlu_buffer_[i];
-    if (mlu.empty()) {
-      mlu.resize(cpu.size());
-    } else {
-      PADDLE_ENFORCE_EQ(
-          mlu.size(), cpu.size(),
-          platform::errors::InvalidArgument(
-              "Input tensor number on MLU and CPU devices are not matched. "
-              "The number on MLU is %d, on CPU is %d",
-              mlu.size(), cpu.size()));
-    }
-
-    std::vector<void *> mlu_ptrs;
-    mlu_ptrs.reserve(cpu.size());
-    for (size_t i = 0; i < cpu.size(); ++i) {
-      mlu[i].Resize(cpu[i].dims());
-      mlu[i].set_layout(cpu[i].layout());
-      mlu_ptrs.emplace_back(mlu[i].mutable_data(place_, cpu[i].type()));
-    }
-
-    platform::SetMLUDeviceId(place_.device);
-    PADDLE_ENFORCE_MLU_SUCCESS(cnPlaceNotifier(events_[i].get(), compute_stream_));
-    PADDLE_ENFORCE_MLU_SUCCESS(cnWaitNotifier(events_[i].get()));
-
-    platform::RecordEvent record_event("BufferedReader:MemoryCopy",
-                                       platform::TracerEventType::UserDefined, 1);
-    for (size_t i = 0; i < cpu.size(); ++i) {
-      auto cpu_place = cpu[i].place();
-      auto cpu_ptr = cpu[i].data();
-      auto mlu_ptr = mlu_ptrs[i];
-      auto size = cpu[i].numel() * phi::SizeOf(cpu[i].dtype());
-      if ((platform::is_mlu_place(cpu_place))) {
-        memory::Copy(place_, mlu_ptr, cpu_place, cpu_ptr, size, stream_.get());
-      } else {
-        memory::Copy(place_, mlu_ptr, cpu_place, cpu_ptr, size, stream_.get());
-        platform::MLUStreamSync(stream_.get());
-      }
-      mlu[i].set_lod(cpu[i].lod());
-    }
-    platform::MLUStreamSync(stream_.get());
-  }
-#endif
 #ifdef PADDLE_WITH_XPU
   if (platform::is_xpu_place(place_)) {
     TensorVec &xpu = xpu_buffer_[i];
...
paddle/fluid/operators/reader/buffered_reader.h
view file @ e75c01f9

@@ -26,10 +26,6 @@
 #include "paddle/fluid/platform/device/gpu/gpu_resource_pool.h"
 #endif
-#ifdef PADDLE_WITH_MLU
-#include "paddle/fluid/platform/device/mlu/mlu_info.h"
-#include "paddle/fluid/platform/device/mlu/mlu_resource_pool.h"
-#endif
 #ifdef PADDLE_WITH_XPU
 #include "paddle/fluid/platform/device/xpu/xpu_info.h"
 #include "paddle/fluid/platform/device/xpu/xpu_resource_pool.h"
...
@@ -92,12 +88,6 @@ class BufferedReader : public framework::DecoratedReader {
   std::vector<std::shared_ptr<platform::CudaEventObject>> events_;
 #endif
-#ifdef PADDLE_WITH_MLU
-  mluStream compute_stream_;
-  std::shared_ptr<platform::MluStreamObject> stream_;
-  std::vector<std::shared_ptr<platform::MluEventObject>> events_;
-#endif
 #ifdef PADDLE_WITH_XPU
   xpuStream compute_stream_;
   std::shared_ptr<platform::XpuStreamObject> stream_;
...
paddle/fluid/operators/reduce_ops/reduce_max_op_mlu.cc
deleted (100644 → 0)
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/operators/mlu/mlu_baseop.h"
#include "paddle/fluid/operators/reduce_ops/reduce_min_max_op.h"
namespace paddle {
namespace operators {

template <typename T>
class ReduceMaxMLUKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& context) const override {
    auto* input = context.Input<phi::DenseTensor>("X");
    auto* output = context.Output<phi::DenseTensor>("Out");
    int out_dtype = context.Attr<int>("out_dtype");
    bool reduce_all = context.Attr<bool>("reduce_all");
    auto dims = context.Attr<std::vector<int>>("dim");
    auto input_dims = input->dims();
    const auto& input_dim_size = input->dims().size();
    std::vector<int> reduce_dims;
    if (reduce_all) {
      for (int i = 0; i < input_dims.size(); i++) {
        reduce_dims.push_back(static_cast<int>(i));
      }
    } else {
      for (size_t i = 0; i < dims.size(); ++i) {
        if (dims[i] < 0) {
          reduce_dims.push_back(dims[i] + input_dim_size);
        } else {
          reduce_dims.push_back(dims[i]);
        }
      }
    }

    auto place = context.GetPlace();
    phi::DenseTensor cast_out(input->type());
    cast_out.Resize(output->dims());
    cast_out.mutable_data<T>(place);

    auto cast_out_dtype = framework::TransToProtoVarType(input->dtype());
    if (out_dtype != -1) {
      cast_out_dtype = static_cast<framework::proto::VarType::Type>(out_dtype);
    }
    if (framework::TransToProtoVarType(input->type()) != cast_out_dtype) {
      if (cast_out_dtype == framework::proto::VarType::FP32) {
        output->mutable_data<float>(place);
      } else if (cast_out_dtype == framework::proto::VarType::FP16) {
        output->mutable_data<paddle::platform::float16>(place);
      } else if (cast_out_dtype == framework::proto::VarType::INT32) {
        output->mutable_data<int32_t>(place);
      }
    } else {
      output->ShareDataWith(cast_out);
    }

    MLUCnnlTensorDesc input_desc(*input, CNNL_LAYOUT_ARRAY,
                                 ToCnnlDataType(input->dtype()));
    MLUCnnlTensorDesc output_desc(*output, CNNL_LAYOUT_ARRAY,
                                  ToCnnlDataType(output->dtype()));

    MLUCnnlReduceDesc reduction_desc(reduce_dims, CNNL_REDUCE_MAX,
                                     ToCnnlDataType<T>(), CNNL_NOT_PROPAGATE_NAN,
                                     CNNL_REDUCE_NO_INDICES, CNNL_32BIT_INDICES);

    MLUCnnl::Reduce(context, true /*need_workspace*/, reduction_desc.get(),
                    nullptr, input_desc.get(), GetBasePtr(input),
                    0 /*indices_size*/, nullptr, nullptr, output_desc.get(),
                    GetBasePtr(output));
  }
};

template <typename T>
class ReduceMaxGradMLUKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& context) const override {
    auto* x = context.Input<phi::DenseTensor>("X");
    auto* out = context.Input<phi::DenseTensor>("Out");
    auto* out_grad = context.Input<phi::DenseTensor>(framework::GradVarName("Out"));
    auto reduce_dims = context.Attr<std::vector<int>>("dim");
    bool reduce_all = context.Attr<bool>("reduce_all");
    int in_dtype = context.Attr<int>("in_dtype");

    PADDLE_ENFORCE_EQ(
        in_dtype == -1, true,
        platform::errors::InvalidArgument(
            "MLU only support in_dtype == -1 in reduce_max_grad op."));

    auto* x_grad = context.Output<phi::DenseTensor>(framework::GradVarName("X"));
    x_grad->mutable_data<T>(context.GetPlace());

    auto place = context.GetPlace();

    // broadcast
    auto x_dims_vec = phi::vectorize(x->dims());
    if (reduce_all) {
      reduce_dims.clear();
      for (size_t d = 0; d < x_dims_vec.size(); ++d) {
        reduce_dims.push_back(static_cast<int>(d));
      }
    }

    phi::DenseTensor tmp_out, tmp_out_grad;
    auto tmp_out_dims_vec = x_dims_vec;
    for (auto d : reduce_dims) {
      if (d < 0) {
        d += x_dims_vec.size();
      }
      tmp_out_dims_vec[d] = 1;
    }

    tmp_out.ShareDataWith(*out);
    tmp_out.Resize(phi::make_ddim(tmp_out_dims_vec));
    tmp_out_grad.ShareDataWith(*out_grad);
    tmp_out_grad.Resize(phi::make_ddim(tmp_out_dims_vec));

    phi::DenseTensor transformed_out(x->type());
    transformed_out.Resize(phi::make_ddim(x_dims_vec));
    transformed_out.mutable_data<T>(place);

    MLUCnnlTensorDesc tmp_out_desc(tmp_out);
    MLUCnnlTensorDesc transformed_out_desc(transformed_out);
    MLUCnnl::BroadcastTo(context, tmp_out_desc.get(), GetBasePtr(&tmp_out),
                         transformed_out_desc.get(), GetBasePtr(&transformed_out));

    phi::DenseTensor transformed_out_grad(x->type());
    transformed_out_grad.Resize(phi::make_ddim(x_dims_vec));
    transformed_out_grad.mutable_data<T>(place);
    MLUCnnlTensorDesc tmp_out_grad_desc(tmp_out_grad);
    MLUCnnlTensorDesc transformed_out_grad_desc(transformed_out_grad);
    MLUCnnl::BroadcastTo(context, tmp_out_grad_desc.get(),
                         GetBasePtr(&tmp_out_grad),
                         transformed_out_grad_desc.get(),
                         GetBasePtr(&transformed_out_grad));

    // compare
    phi::DenseTensor equal_cond;
    equal_cond.mutable_data<bool>(x_grad->dims(), place);
    MLUCnnlTensorDesc x_desc(*x);
    MLUCnnlTensorDesc equal_cond_desc(equal_cond);
    MLUCnnl::Logic(context, CNNL_LOGIC_OP_EQ, x_desc.get(), GetBasePtr(x),
                   transformed_out_desc.get(), GetBasePtr(&transformed_out),
                   equal_cond_desc.get(), GetBasePtr(&equal_cond));

    // select
    phi::DenseTensor t_zero;
    t_zero.mutable_data<T>(x_grad->dims(), place);
    FillMLUTensorWithHostValue<T>(context, static_cast<T>(0), &t_zero);
    t_zero.Resize(x_grad->dims());

    MLUCnnlTensorDesc t_zero_desc(t_zero);
    MLUCnnlTensorDesc x_grad_desc(*x_grad);
    MLUCnnl::Select(context, equal_cond_desc.get(), GetBasePtr(&equal_cond),
                    transformed_out_grad_desc.get(),
                    GetBasePtr(&transformed_out_grad), t_zero_desc.get(),
                    GetBasePtr(&t_zero), x_grad_desc.get(), GetBasePtr(x_grad));
  }
};

}  // namespace operators
}  // namespace paddle

namespace ops = paddle::operators;
namespace plat = paddle::platform;

REGISTER_OP_MLU_KERNEL(reduce_max, ops::ReduceMaxMLUKernel<float>,
                       ops::ReduceMaxMLUKernel<plat::float16>,
                       ops::ReduceMaxMLUKernel<int>);
REGISTER_OP_MLU_KERNEL(reduce_max_grad, ops::ReduceMaxGradMLUKernel<float>,
                       ops::ReduceMaxGradMLUKernel<plat::float16>,
                       ops::ReduceMaxGradMLUKernel<int>);
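The deleted reduce_max_grad kernel routes the upstream gradient only to the positions that attained the maximum: it broadcasts Out and dOut back to X's shape, builds an equality mask with Logic(EQ), and uses Select to pick dOut where the mask holds and zero elsewhere (ties all receive the full gradient). A 1-D sketch of that logic (plain C++, illustrative only):

#include <cstdio>
#include <vector>

// Gradient of max over a 1-D vector: dX[i] = dOut where x[i] equals the max,
// otherwise 0, mirroring the Logic(EQ) + Select pair in the deleted kernel.
int main() {
  std::vector<float> x = {1.f, 3.f, 2.f, 3.f};
  float out = 3.f;       // reduce_max(x)
  float out_grad = 1.f;  // upstream gradient, already broadcast to x's shape
  std::vector<float> x_grad(x.size());
  for (size_t i = 0; i < x.size(); ++i) {
    x_grad[i] = (x[i] == out) ? out_grad : 0.f;
  }
  for (float g : x_grad) std::printf("%g ", g);  // prints: 0 1 0 1
  std::printf("\n");
  return 0;
}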
paddle/fluid/operators/reduce_ops/reduce_mean_op_mlu.cc
deleted (100644 → 0)
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/operators/reduce_ops/reduce_op_mlu.h"
namespace paddle {
namespace operators {

template <typename T>
class ReduceMeanMLUKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& context) const override {
    MLUReduceOp<T>(context, "reduce_mean");
  }
};

template <typename T>
class ReduceMeanGradMLUKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& context) const override {
    auto* input = context.Input<phi::DenseTensor>("X");
    auto* output_grad = context.Input<phi::DenseTensor>(framework::GradVarName("Out"));
    auto* input_grad = context.Output<phi::DenseTensor>(framework::GradVarName("X"));
    input_grad->mutable_data<T>(context.GetPlace());

    bool reduce_all = context.Attr<bool>("reduce_all");
    auto reduce_dims = context.Attr<std::vector<int>>("dim");
    auto input_dims = phi::vectorize(input->dims());

    int reduce_numel = 1;
    if (reduce_all) {
      reduce_dims.clear();
      for (size_t d = 0; d < input_dims.size(); ++d) {
        reduce_dims.push_back(static_cast<int>(d));
      }
    }
    for (auto& d : reduce_dims) {
      if (d < 0) {
        d = d + input_dims.size();
      }
      reduce_numel *= input_dims[d];
    }

    phi::DenseTensor tmp_output_grad(output_grad->dtype());
    auto tmp_output_dims = input_dims;
    for (auto d : reduce_dims) {
      tmp_output_dims[d] = 1;
    }
    tmp_output_grad.ShareDataWith(*output_grad);
    tmp_output_grad.Resize(phi::make_ddim(tmp_output_dims));

    MLUCnnlTensorDesc output_grad_desc(tmp_output_grad, CNNL_LAYOUT_ARRAY,
                                       ToCnnlDataType(tmp_output_grad.dtype()));
    MLUCnnlTensorDesc input_grad_desc(*input_grad, CNNL_LAYOUT_ARRAY,
                                      ToCnnlDataType(input_grad->dtype()));

    auto value = static_cast<T>(1.0 / static_cast<float>(reduce_numel));
    MLUCnnl::Fill(context, CNNL_POINTER_MODE_HOST, &value, input_grad_desc.get(),
                  GetBasePtr(input_grad));

    MLUCnnlOpTensorDesc op_tensor_desc(CNNL_OP_TENSOR_MUL, ToCnnlDataType<T>(),
                                       CNNL_NOT_PROPAGATE_NAN);
    MLUCnnl::OpTensor(context, op_tensor_desc.get(), output_grad_desc.get(),
                      GetBasePtr(&tmp_output_grad), input_grad_desc.get(),
                      GetBasePtr(input_grad), input_grad_desc.get(),
                      GetBasePtr(input_grad), ToCnnlDataType<T>());
  }
};

}  // namespace operators
}  // namespace paddle

namespace ops = paddle::operators;
namespace plat = paddle::platform;

REGISTER_OP_MLU_KERNEL(reduce_mean, ops::ReduceMeanMLUKernel<float>,
                       ops::ReduceMeanMLUKernel<plat::float16>);
REGISTER_OP_MLU_KERNEL(reduce_mean_grad, ops::ReduceMeanGradMLUKernel<float>,
                       ops::ReduceMeanGradMLUKernel<plat::float16>);
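The deleted reduce_mean_grad kernel fills the input gradient with 1/N (N being the product of the reduced extents) and multiplies it by the upstream gradient broadcast through the kept size-1 axes, i.e. dX_i = dOut / N. A 1-D sketch (plain C++, illustrative only):

#include <cstdio>
#include <vector>

// Gradient of mean over a 1-D vector: every element receives dOut / N,
// mirroring the Fill(1/N) followed by OpTensor(MUL) in the deleted kernel.
int main() {
  std::vector<float> x_grad(4, 0.f);
  float out_grad = 2.f;               // upstream gradient of the mean
  float inv_n = 1.f / x_grad.size();  // Fill(input_grad, 1/N)
  for (float& g : x_grad) g = out_grad * inv_n;  // element-wise multiply
  for (float g : x_grad) std::printf("%g ", g);  // prints: 0.5 0.5 0.5 0.5
  std::printf("\n");
  return 0;
}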
paddle/fluid/operators/reduce_ops/reduce_min_op_mlu.cc
deleted (100644 → 0)
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/operators/mlu/mlu_baseop.h"
#include "paddle/fluid/operators/reduce_ops/reduce_min_max_op.h"
namespace paddle {
namespace operators {

template <typename T>
class ReduceMinMLUKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& context) const override {
    auto* input = context.Input<phi::DenseTensor>("X");
    auto* output = context.Output<phi::DenseTensor>("Out");
    int out_dtype = context.Attr<int>("out_dtype");
    bool reduce_all = context.Attr<bool>("reduce_all");
    auto dims = context.Attr<std::vector<int>>("dim");
    auto input_dims = input->dims();
    const auto& input_dim_size = input->dims().size();
    std::vector<int> reduce_dims;
    if (reduce_all) {
      for (int i = 0; i < input_dims.size(); i++) {
        reduce_dims.push_back(static_cast<int>(i));
      }
    } else {
      for (size_t i = 0; i < dims.size(); ++i) {
        if (dims[i] < 0) {
          reduce_dims.push_back(dims[i] + input_dim_size);
        } else {
          reduce_dims.push_back(dims[i]);
        }
      }
    }

    auto place = context.GetPlace();
    phi::DenseTensor cast_out(input->type());
    cast_out.Resize(output->dims());
    cast_out.mutable_data<T>(place);

    auto cast_out_dtype = framework::TransToProtoVarType(input->dtype());
    if (out_dtype != -1) {
      cast_out_dtype = static_cast<framework::proto::VarType::Type>(out_dtype);
    }
    if (framework::TransToProtoVarType(input->type()) != cast_out_dtype) {
      if (cast_out_dtype == framework::proto::VarType::FP32) {
        output->mutable_data<float>(place);
      } else if (cast_out_dtype == framework::proto::VarType::FP16) {
        output->mutable_data<paddle::platform::float16>(place);
      } else if (cast_out_dtype == framework::proto::VarType::INT32) {
        output->mutable_data<int32_t>(place);
      }
    } else {
      output->ShareDataWith(cast_out);
    }

    MLUCnnlTensorDesc input_desc(*input, CNNL_LAYOUT_ARRAY,
                                 ToCnnlDataType(input->dtype()));
    MLUCnnlTensorDesc output_desc(*output, CNNL_LAYOUT_ARRAY,
                                  ToCnnlDataType(output->dtype()));

    MLUCnnlReduceDesc reduction_desc(reduce_dims, CNNL_REDUCE_MIN,
                                     ToCnnlDataType<T>(), CNNL_NOT_PROPAGATE_NAN,
                                     CNNL_REDUCE_NO_INDICES, CNNL_32BIT_INDICES);

    MLUCnnl::Reduce(context, true /*need_workspace*/, reduction_desc.get(),
                    nullptr, input_desc.get(), GetBasePtr(input),
                    0 /*indices_size*/, nullptr, nullptr, output_desc.get(),
                    GetBasePtr(output));
  }
};

}  // namespace operators
}  // namespace paddle

namespace ops = paddle::operators;
namespace plat = paddle::platform;

REGISTER_OP_MLU_KERNEL(reduce_min, ops::ReduceMinMLUKernel<float>,
                       ops::ReduceMinMLUKernel<plat::float16>,
                       ops::ReduceMinMLUKernel<int>);
paddle/fluid/operators/reduce_ops/reduce_op_mlu.h
deleted (100644 → 0)
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#ifdef PADDLE_WITH_MLU
#include <string>
#include <vector>
#include "paddle/fluid/operators/mlu/mlu_baseop.h"
#include "paddle/fluid/operators/reduce_ops/reduce_op.h"
namespace paddle {
namespace operators {

template <typename T>
void MLUReduceOp(const framework::ExecutionContext& context,
                 std::string reduce_name) {
  PADDLE_ENFORCE_EQ(
      platform::is_mlu_place(context.GetPlace()), true,
      platform::errors::Unavailable("This kernel only runs on MLU."));
  auto* input = context.Input<phi::DenseTensor>("X");
  auto* output = context.Output<phi::DenseTensor>("Out");
  output->mutable_data<T>(context.GetPlace());

  bool reduce_all = context.Attr<bool>("reduce_all");
  auto dims = context.Attr<std::vector<int>>("dim");
  auto input_dims = phi::vectorize(input->dims());
  const auto& input_dim_size = input->dims().size();
  std::vector<int> reduce_dims;
  if (reduce_all) {
    for (size_t i = 0; i < input_dims.size(); i++) {
      reduce_dims.push_back(static_cast<int>(i));
    }
  } else {
    for (size_t i = 0; i < dims.size(); ++i) {
      if (dims[i] < 0) {
        reduce_dims.push_back(dims[i] + input_dim_size);
      } else {
        reduce_dims.push_back(dims[i]);
      }
    }
  }

  MLUCnnlTensorDesc input_desc(*input, CNNL_LAYOUT_ARRAY,
                               ToCnnlDataType(input->dtype()));
  MLUCnnlTensorDesc output_desc(*output, CNNL_LAYOUT_ARRAY,
                                ToCnnlDataType(output->dtype()));

  cnnlReduceOp_t reduce_op = GetMLUCnnlReduceOp(reduce_name);
  MLUCnnlReduceDesc reduction_desc(reduce_dims, reduce_op, ToCnnlDataType<T>(),
                                   CNNL_NOT_PROPAGATE_NAN,
                                   CNNL_REDUCE_NO_INDICES, CNNL_32BIT_INDICES);

  MLUCnnl::Reduce(context, true /*need_workspace*/, reduction_desc.get(),
                  nullptr, input_desc.get(), GetBasePtr(input),
                  0 /*indices_size*/, nullptr, nullptr, output_desc.get(),
                  GetBasePtr(output));
}

}  // namespace operators
}  // namespace paddle
#endif
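All of the deleted reduce kernels share the axis bookkeeping in MLUReduceOp above: with reduce_all the reduction covers every axis, otherwise negative axes are wrapped by adding the input rank. A standalone sketch of just that normalization (plain C++; NormalizeReduceDims is an illustrative name, not part of the deleted code):

#include <cstdio>
#include <vector>

// Expand reduce_all to all axes, and wrap negative axes by adding the rank.
std::vector<int> NormalizeReduceDims(const std::vector<int>& dims, int rank,
                                     bool reduce_all) {
  std::vector<int> out;
  if (reduce_all) {
    for (int i = 0; i < rank; ++i) out.push_back(i);
    return out;
  }
  for (int d : dims) out.push_back(d < 0 ? d + rank : d);
  return out;
}

int main() {
  auto dims = NormalizeReduceDims({-1, 0}, /*rank=*/4, /*reduce_all=*/false);
  std::printf("%d %d\n", dims[0], dims[1]);  // prints: 3 0
  return 0;
}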
paddle/fluid/operators/reduce_ops/reduce_prod_op_mlu.cc
deleted (100644 → 0)
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/operators/reduce_ops/reduce_op_mlu.h"
namespace paddle {
namespace operators {

template <typename T>
class ReduceMeanMLUKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& context) const override {
    MLUReduceOp<T>(context, "reduce_prod");
  }
};

}  // namespace operators
}  // namespace paddle

namespace ops = paddle::operators;
namespace plat = paddle::platform;

REGISTER_OP_MLU_KERNEL(reduce_prod, ops::ReduceMeanMLUKernel<float>,
                       ops::ReduceMeanMLUKernel<plat::float16>,
                       ops::ReduceMeanMLUKernel<int>);
paddle/fluid/operators/reduce_ops/reduce_sum_op_mlu.cc
deleted (100644 → 0)
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/operators/reduce_ops/reduce_op_mlu.h"
namespace paddle {
namespace operators {

template <typename T>
class ReduceSumMLUKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& context) const override {
    MLUReduceOp<T>(context, "reduce_sum");
  }
};

template <typename T>
class ReduceSumGradMLUKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& context) const override {
    auto* in = context.Input<phi::DenseTensor>("X");
    auto* out_grad = context.Input<phi::DenseTensor>(framework::GradVarName("Out"));
    auto* in_grad = context.Output<phi::DenseTensor>(framework::GradVarName("X"));
    in_grad->mutable_data<T>(context.GetPlace());

    bool reduce_all = context.Attr<bool>("reduce_all");
    auto reduce_dims = context.Attr<std::vector<int>>("dim");
    auto in_dims = phi::vectorize(in->dims());

    if (reduce_all) {
      reduce_dims.clear();
      for (size_t d = 0; d < in_dims.size(); ++d) {
        reduce_dims.push_back(static_cast<int>(d));
      }
    }
    for (auto& d : reduce_dims) {
      if (d < 0) {
        d = d + in_dims.size();
      }
    }

    phi::DenseTensor tmp_out(out_grad->dtype());
    auto tmp_output_dims = in_dims;
    for (auto d : reduce_dims) {
      tmp_output_dims[d] = 1;
    }
    tmp_out.ShareDataWith(*out_grad);
    tmp_out.Resize(phi::make_ddim(tmp_output_dims));

    MLUCnnlTensorDesc out_desc(tmp_out, CNNL_LAYOUT_ARRAY, ToCnnlDataType<T>());
    MLUCnnlTensorDesc in_grad_desc(*in_grad, CNNL_LAYOUT_ARRAY, ToCnnlDataType<T>());

    MLUCnnl::BroadcastTo(context,
                         out_desc.get(),
                         GetBasePtr(&tmp_out),
                         in_grad_desc.get(),
                         GetBasePtr(in_grad));
  }
};

}  // namespace operators
}  // namespace paddle

namespace ops = paddle::operators;
namespace plat = paddle::platform;

REGISTER_OP_MLU_KERNEL(reduce_sum,
                       ops::ReduceSumMLUKernel<float>,
                       ops::ReduceSumMLUKernel<int>,
                       ops::ReduceSumMLUKernel<plat::float16>);
REGISTER_OP_MLU_KERNEL(reduce_sum_grad,
                       ops::ReduceSumGradMLUKernel<float>,
                       ops::ReduceSumGradMLUKernel<plat::float16>);
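The grad kernel above is just a broadcast: every element contributes with weight 1 to the sum, so the output gradient, reshaped so the reduced axes have size 1, is copied back over the whole input shape. A minimal CPU-only sketch of that idea (plain C++ with an illustrative 2-D shape and axis choice; no Paddle or CNNL types):

#include <cstddef>
#include <vector>

// Gradient of reduce_sum over axis 1 of a rows x cols matrix:
// every input element receives the gradient of the row it was summed into.
std::vector<float> ReduceSumGradAxis1(const std::vector<float>& out_grad,
                                      std::size_t rows, std::size_t cols) {
  std::vector<float> in_grad(rows * cols);
  for (std::size_t r = 0; r < rows; ++r) {
    for (std::size_t c = 0; c < cols; ++c) {
      in_grad[r * cols + c] = out_grad[r];  // broadcast along the reduced axis
    }
  }
  return in_grad;
}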
paddle/fluid/operators/softmax_with_cross_entropy_op.cc
@@ -41,18 +41,6 @@ class SoftmaxWithCrossEntropyOpMaker
               "The outputs value of softmax activation by given the input batch, "
               "which will be used in backward calculation.")
         .AsIntermediate();
-#if defined(PADDLE_WITH_MLU)
-    AddOutput(
-        "Backprop",
-        "(Tensor, default: Tensor<float>), A tensor in same shape with "
-        "Input(Logits). "
-        "The intermediate value used for backward calculation. The calculation "
-        "is :"
-        "exp(logits -max_logits) / sum(exp(logits - max_logits)) - labels, "
-        "where labels is ont-hot."
-        "Currently, the tensor is generated and used in npu/mlu kernel. ")
-        .AsIntermediate();
-#endif
     AddOutput("Loss",
               "(Tensor, default: Tensor<float>), A tensor in same shape with "
               "Input(Logits) "
@@ -135,12 +123,6 @@ class SoftmaxWithCrossEntropyOp : public framework::OperatorWithKernel {
                       true,
                       platform::errors::InvalidArgument(
                           "Output(Softmax) should be not null."));
-#if defined(PADDLE_WITH_MLU)
-    PADDLE_ENFORCE_EQ(ctx->HasOutput("Backprop"),
-                      true,
-                      platform::errors::InvalidArgument(
-                          "Output(Backprop) should be not null."));
-#endif
     PADDLE_ENFORCE_EQ(ctx->HasOutput("Loss"),
                       true,
@@ -235,12 +217,6 @@ class SoftmaxWithCrossEntropyOpGrad : public framework::OperatorWithKernel {
                       true,
                       platform::errors::InvalidArgument(
                           "Input(Softmax) should be not null."));
-#if defined(PADDLE_WITH_MLU)
-    PADDLE_ENFORCE_EQ(ctx->HasInput("Backprop"),
-                      true,
-                      platform::errors::InvalidArgument(
-                          "Input(Backprop) should be not null."));
-#endif
     PADDLE_ENFORCE_EQ(ctx->HasInput("Label"),
                       true,
@@ -324,9 +300,6 @@ class SoftmaxGradMaker : public framework::SingleGradOpMaker<T> {
     grad_op->SetType("softmax_with_cross_entropy_grad");
     grad_op->SetInput("Label", this->Input("Label"));
     grad_op->SetInput("Softmax", this->Output("Softmax"));
-#if defined(PADDLE_WITH_MLU)
-    grad_op->SetInput("Backprop", this->Output("Backprop"));
-#endif
     grad_op->SetInput(framework::GradVarName("Loss"), this->OutputGrad("Loss"));
     grad_op->SetOutput(framework::GradVarName("Logits"),
                        this->InputGrad("Logits"));
@@ -356,26 +329,8 @@ REGISTER_OPERATOR(softmax_with_cross_entropy_grad,
                   ops::SoftmaxWithCrossEntropyGradInplaceInferer);
 REGISTER_OP_VERSION(softmax_with_cross_entropy)
-#if defined(PADDLE_WITH_MLU)
-    .AddCheckpoint(
-        R"ROC(
-              Add a new attribute [use_softmax] )ROC",
-        paddle::framework::compatible::OpVersionDesc().NewAttr(
-            "use_softmax", "A flag to indicate whether to do softmax", true))
-    .AddCheckpoint(
-        R"ROC(
-              Add a new dispensable/intermediate output [backprop] )ROC",
-        paddle::framework::compatible::OpVersionDesc().NewOutput(
-            "Backprop",
-            "The intermediate value used for backward calculation. The "
-            "calculation is :"
-            "exp(logits -max_logits) / sum(exp(logits - max_logits)) - labels, "
-            "where labels is ont-hot."
-            "Currently, the tensor is generated and used in npu/mlu kernel. "));
-#else
     .AddCheckpoint(
         R"ROC(
              Add a new attribute [use_softmax] )ROC",
         paddle::framework::compatible::OpVersionDesc().NewAttr(
             "use_softmax", "A flag to indicate whether to do softmax", true));
-#endif
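The removed `Backprop` output caches exactly the quantity its description spells out: softmax(logits) minus the one-hot label, computed with max-subtraction for numerical stability. A standalone, hedged sketch of that formula for a single row (plain C++, not the operator's actual kernel code):

#include <algorithm>
#include <cmath>
#include <cstddef>
#include <vector>

// softmax(logits) - one_hot(label): the per-row gradient that the removed
// "Backprop" output cached. Subtracting the max keeps exp() from overflowing.
std::vector<float> SoftmaxCrossEntropyBackprop(const std::vector<float>& logits,
                                               std::size_t label) {
  const float max_logit = *std::max_element(logits.begin(), logits.end());
  std::vector<float> grad(logits.size());
  float sum = 0.f;
  for (std::size_t i = 0; i < logits.size(); ++i) {
    grad[i] = std::exp(logits[i] - max_logit);
    sum += grad[i];
  }
  for (std::size_t i = 0; i < logits.size(); ++i) {
    grad[i] = grad[i] / sum - (i == label ? 1.f : 0.f);
  }
  return grad;
}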
paddle/fluid/operators/utils.h
@@ -92,11 +92,6 @@ inline T GetValue(const phi::DenseTensor* x) {
   if (!platform::is_cpu_place(x->place())) {
     phi::DenseTensor cpu_x;
     framework::TensorCopy(*x, platform::CPUPlace(), &cpu_x);
-#if defined(PADDLE_WITH_MLU)
-    platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance();
-    const platform::DeviceContext* dev_ctx = pool.Get(x->place());
-    dev_ctx->Wait();
-#endif
     value = cpu_x.data<T>()[0];
   } else {
     value = x->data<T>()[0];
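The removed MLU branch only waited on the device context so the asynchronous device-to-host copy had finished before `cpu_x` was read. The same ordering requirement in miniature, with `std::async` standing in for a device stream (an analogy, not Paddle's API):

#include <algorithm>
#include <future>
#include <vector>

// The removed block's pattern in miniature: a copy issued asynchronously must
// be waited on before the host reads the destination buffer.
int ReadFirstElementAfterAsyncCopy(const std::vector<int>& device_like_src) {
  std::vector<int> host_dst(device_like_src.size());
  auto copy_done = std::async(std::launch::async, [&] {
    std::copy(device_like_src.begin(), device_like_src.end(), host_dst.begin());
  });
  copy_done.wait();    // analogous to dev_ctx->Wait()
  return host_dst[0];  // only now is the value safe to read
}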
paddle/fluid/platform/CMakeLists.txt
@@ -78,11 +78,7 @@ if(WITH_ASCEND_CL)
   set(NPU_CTX_DEPS npu_stream npu_info)
 endif()
-if(WITH_MLU)
-  set(MLU_CTX_DEPS mlu_device_context)
-endif()
-if(WITH_ASCEND_CL OR WITH_MLU)
+if(WITH_ASCEND_CL)
   cc_library(
     stream_callback_manager
     SRCS stream_callback_manager.cc
@@ -175,10 +171,6 @@ if(WITH_XPU)
   target_link_libraries(device_context xpu_resource_pool)
 endif()
-if(WITH_MLU)
-  target_link_libraries(device_context mlu_resource_pool)
-endif()
 if(WITH_CUSTOM_DEVICE)
   target_link_libraries(device_context custom_device_resource_pool)
 endif()
paddle/fluid/platform/device/CMakeLists.txt
@@ -15,11 +15,6 @@ if(WITH_IPU)
   add_subdirectory(ipu)
 endif()
-# MLU
-if(WITH_MLU)
-  add_subdirectory(mlu)
-endif()
 if(WITH_CUSTOM_DEVICE)
   add_subdirectory(custom)
 endif()
paddle/fluid/platform/device/device_wrapper.h
@@ -25,11 +25,6 @@ limitations under the License. */
 #include "paddle/fluid/platform/device/xpu/xpu_info.h"
 #endif
-#ifdef PADDLE_WITH_MLU
-#include "paddle/fluid/platform/device/mlu/enforce.h"
-#include "paddle/fluid/platform/device/mlu/mlu_info.h"
-#endif
 #ifdef PADDLE_WITH_IPU
 #include "paddle/fluid/platform/device/ipu/ipu_info.h"
 #endif
paddle/fluid/platform/device_context.cc
@@ -33,11 +33,6 @@ limitations under the License. */
 #include "paddle/phi/backends/gpu/gpu_context.h"
 #endif
-#ifdef PADDLE_WITH_MLU
-#include "paddle/fluid/platform/device/mlu/device_context.h"
-#include "paddle/fluid/platform/device/mlu/device_context_allocator.h"
-#endif
 namespace paddle {
 namespace platform {
@@ -224,18 +219,6 @@ void EmplaceDeviceContexts(
         PADDLE_THROW(platform::errors::Unimplemented(
             "CUDAPlace is not supported. Please re-compile with WITH_GPU "
             "option."));
 #endif
-      } else if (platform::is_mlu_place(place)) {
-#ifdef PADDLE_WITH_MLU
-        EmplaceDeviceContext<MLUDeviceContext>(
-            place_to_device_context,
-            place,
-            disable_setting_default_stream_for_allocator,
-            /*unused*/ stream_priority);
-#else
-        PADDLE_THROW(platform::errors::Unimplemented(
-            "MLUPlace is not supported. Please "
-            "re-compile with WITH_MLU option."));
-#endif
       } else if (platform::is_ipu_place(place)) {
 #ifdef PADDLE_WITH_IPU
paddle/fluid/platform/device_context.h
@@ -135,10 +135,6 @@ class IPUDeviceContext
 };
 #endif
-#ifdef PADDLE_WITH_MLU
-class MLUDeviceContext;
-#endif
 #ifdef PADDLE_WITH_XPU
 namespace xpu = baidu::xpu::api;
 using XPUDeviceContext = phi::XPUContext;
@@ -173,11 +169,6 @@ struct DefaultDeviceContextType<phi::IPUPlace> {
 };
 #endif
-#ifdef PADDLE_WITH_MLU
-template <>
-struct DefaultDeviceContextType<phi::MLUPlace>;
-#endif
 #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
 template <>
 struct DefaultDeviceContextType<phi::GPUPinnedPlace> {
paddle/fluid/platform/init.cc
@@ -36,10 +36,6 @@ limitations under the License. */
 #include "paddle/fluid/platform/device/xpu/xpu_info.h"
 #endif
-#ifdef PADDLE_WITH_MLU
-#include "paddle/fluid/platform/device/mlu/mlu_info.h"
-#endif
 #ifdef WITH_WIN_DUMP_DBG
 #include <stdio.h>
 #include <time.h>
@@ -195,14 +191,6 @@ void InitDevices() {
       LOG(WARNING)
           << "Compiled with PADDLE_WITH_IPU, but no IPU found in runtime.";
     }
 #endif
-#ifdef PADDLE_WITH_MLU
-    try {
-      // use user specified MLUs in single-node multi-process mode.
-      devices = platform::GetMLUSelectedDevices();
-    } catch (const std::exception& exp) {
-      LOG(WARNING) << "Compiled with WITH_MLU, but no MLU found in runtime.";
-    }
-#endif
     InitDevices(devices);
   });
@@ -228,10 +216,6 @@ void InitDevices(const std::vector<int> devices) {
 #ifdef PADDLE_WITH_IPU
     places.emplace_back(platform::IPUPlace(devices[i]));
 #endif
-#ifdef PADDLE_WITH_MLU
-    places.emplace_back(platform::MLUPlace(devices[i]));
-#endif
   }
   places.emplace_back(platform::CPUPlace());
 #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
paddle/fluid/platform/init_test.cc
@@ -15,16 +15,13 @@ limitations under the License. */
 #include "gtest/gtest.h"
 #include "paddle/fluid/platform/device_context.h"
-#ifdef PADDLE_WITH_MLU
-#include "paddle/fluid/platform/device/mlu/device_context.h"
-#endif
 TEST(InitDevices, CPU) {
   using paddle::framework::InitDevices;
   using paddle::platform::DeviceContextPool;
 #if !defined(PADDLE_WITH_CUDA) && !defined(PADDLE_WITH_XPU) && \
-    !defined(PADDLE_WITH_HIP) && !defined(PADDLE_WITH_MLU)
+    !defined(PADDLE_WITH_HIP)
   InitDevices();
   DeviceContextPool& pool = DeviceContextPool::Instance();
   ASSERT_EQ(pool.Size(), 1U);
@@ -55,18 +52,6 @@ TEST(InitDevices, XPU) {
 #endif
 }
-TEST(InitDevices, MLU) {
-  using paddle::framework::InitDevices;
-  using paddle::platform::DeviceContextPool;
-#ifdef PADDLE_WITH_MLU
-  int count = paddle::platform::GetMLUDeviceCount();
-  InitDevices();
-  DeviceContextPool& pool = DeviceContextPool::Instance();
-  ASSERT_EQ(pool.Size(), 1U + static_cast<unsigned>(count));
-#endif
-}
 #ifndef _WIN32
 TEST(SignalHandle, SignalHandle) {
   std::string msg = "Signal raises";
paddle/fluid/platform/place.h
@@ -32,7 +32,6 @@ using NPUPlace = phi::NPUPlace;
 using NPUPinnedPlace = phi::NPUPinnedPlace;
 using XPUPlace = phi::XPUPlace;
 using IPUPlace = phi::IPUPlace;
-using MLUPlace = phi::MLUPlace;
 using CustomPlace = phi::CustomPlace;
 using PlaceList = std::vector<Place>;
@@ -110,15 +109,6 @@ typename Visitor::result_type VisitPlace(const Place &place,
       PADDLE_THROW(platform::errors::Unavailable(
           "Paddle is not compiled with IPU. Cannot visit ipu device"));
       return typename Visitor::result_type();
 #endif
     }
-    case phi::AllocationType::MLU: {
-#ifdef PADDLE_WITH_MLU
-      platform::MLUPlace p(place.GetDeviceId());
-      return visitor(p);
-#else
-      PADDLE_THROW(platform::errors::Unavailable(
-          "Paddle is not compiled with MLU. Cannot visit mlu device"));
-#endif
-    }
     case phi::AllocationType::CUSTOM: {
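The `VisitPlace` switch that lost its MLU case follows a simple pattern: dispatch once on the place tag, then hand a strongly typed place to a visitor and return whatever it returns. A self-contained sketch of that pattern with made-up stand-in types (the real Paddle types and tags differ):

#include <string>

struct CPUPlaceLite {};
struct GPUPlaceLite { int device_id; };
enum class AllocKind { kCPU, kGPU };
struct PlaceLite {
  AllocKind kind;
  int device_id;
};

// Dispatch on the runtime tag once, then call the visitor with a typed place.
template <typename Visitor>
typename Visitor::result_type VisitPlaceLite(const PlaceLite& place,
                                             const Visitor& visitor) {
  switch (place.kind) {
    case AllocKind::kCPU:
      return visitor(CPUPlaceLite{});
    case AllocKind::kGPU:
      return visitor(GPUPlaceLite{place.device_id});
  }
  return typename Visitor::result_type();
}

struct NameVisitor {
  using result_type = std::string;
  std::string operator()(const CPUPlaceLite&) const { return "cpu"; }
  std::string operator()(const GPUPlaceLite& p) const {
    return "gpu:" + std::to_string(p.device_id);
  }
};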
paddle/fluid/platform/place_test.cc
@@ -19,7 +19,6 @@ TEST(Place, Equality) {
   paddle::platform::CPUPlace cpu;
   paddle::platform::CUDAPlace g0(0), g1(1), gg0(0);
   paddle::platform::XPUPlace x0(0), x1(1), xx0(0);
-  paddle::platform::MLUPlace m0(0), m1(1), mm0(0);
   EXPECT_EQ(cpu, cpu);
   EXPECT_EQ(g0, g0);
@@ -28,13 +27,9 @@ TEST(Place, Equality) {
   EXPECT_EQ(x0, x0);
   EXPECT_EQ(x1, x1);
   EXPECT_EQ(x0, xx0);
-  EXPECT_EQ(m0, m0);
-  EXPECT_EQ(m1, m1);
-  EXPECT_EQ(m0, mm0);
   EXPECT_NE(g0, g1);
   EXPECT_NE(x0, x1);
-  EXPECT_NE(m0, m1);
   EXPECT_TRUE(paddle::platform::places_are_same_class(g0, gg0));
   EXPECT_TRUE(paddle::platform::places_are_same_class(x0, xx0));
@@ -49,11 +44,6 @@ TEST(Place, Print) {
     ss << paddle::platform::XPUPlace(1);
     EXPECT_EQ("Place(xpu:1)", ss.str());
   }
-  {
-    std::stringstream ss;
-    ss << paddle::platform::MLUPlace(1);
-    EXPECT_EQ("Place(mlu:1)", ss.str());
-  }
   {
     std::stringstream ss;
     ss << paddle::platform::CUDAPlace(1);
paddle/fluid/platform/profiler/CMakeLists.txt
@@ -6,7 +6,6 @@ cc_library(
   cuda_tracer
   SRCS cuda_tracer.cc cupti_data_process.cc
   DEPS workqueue_utils enforce glog)
-add_subdirectory(mlu)
 add_subdirectory(custom_device)
 cc_library(
   event_node
@@ -33,12 +32,7 @@ cc_library(
 cc_library(
   new_profiler
   SRCS profiler.cc
-  DEPS host_tracer
-       cuda_tracer
-       profiler_utils
-       cpu_utilization
-       event_bind
-       mlu_tracer
+  DEPS host_tracer cuda_tracer profiler_utils cpu_utilization event_bind
        custom_tracer)
 cc_test(
   test_event_node
paddle/fluid/platform/profiler/chrometracing_logger.cc
@@ -790,11 +790,7 @@ void ChromeTracingLogger::RefineDisplayName(
       (*it).second * 2 + 1);
   }
-#ifdef PADDLE_WITH_MLU
-  static std::string device_type("MLU");
-#else
   static std::string device_type("GPU");
-#endif
   for (auto it = deviceid_streamid_set_.begin();
        it != deviceid_streamid_set_.end();
paddle/fluid/platform/profiler/mlu/CMakeLists.txt
deleted (file mode 100644 → 0)
if(WITH_MLU)
  set(MLU_INFO mlu_info)
endif()

cc_library(
  mlu_tracer
  SRCS mlu_tracer.cc cnpapi_data_process.cc
  DEPS workqueue_utils enforce glog ${MLU_INFO})
paddle/fluid/platform/profiler/mlu/cnpapi_data_process.cc
deleted (file mode 100644 → 0)
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/platform/profiler/mlu/cnpapi_data_process.h"
#include <cstdio>
#include "paddle/fluid/platform/enforce.h"
#include "paddle/fluid/platform/os_info.h"
#ifdef PADDLE_WITH_MLU
namespace paddle {
namespace platform {

namespace {

inline uint64_t GetTimeGap() {
  static uint64_t time_gap = []() -> uint64_t {
    uint64_t cpu_time = PosixInNsec();
    uint64_t mlu_time = cnpapiGetTimestamp();
    return (cpu_time - mlu_time);
  }();
  return time_gap;
}

void AddKernelRecord(const cnpapiActivityKernel* kernel,
                     uint64_t start_ns,
                     TraceEventCollector* collector) {
  static uint64_t time_gap = GetTimeGap();
  if (kernel->start + time_gap < start_ns) {
    return;
  }
  DeviceTraceEvent event;
  event.name = demangle(kernel->name);
  event.type = TracerEventType::Kernel;
  event.start_ns = kernel->start + time_gap;
  event.end_ns = kernel->end + time_gap;
  event.device_id = kernel->device_id;
  event.context_id = kernel->context_id;
  event.stream_id = kernel->queue_id;
  event.correlation_id = kernel->correlation_id;
  event.kernel_info.block_x = kernel->dimx;
  event.kernel_info.block_y = kernel->dimy;
  event.kernel_info.block_z = kernel->dimz;
  event.kernel_info.grid_x = kernel->kernel_type;
  event.kernel_info.grid_y = 0;
  event.kernel_info.grid_z = 0;
  event.kernel_info.queued = kernel->queued;
  event.kernel_info.submitted = kernel->submitted;
  event.kernel_info.completed = kernel->received;
  collector->AddDeviceEvent(std::move(event));
}

const char* MemcpyKind(cnpapiActivityMemcpyType kind) {
  switch (kind) {
    case CNPAPI_ACTIVITY_MEMCPY_TYPE_HTOD:
      return "MEMCPY_HtoD";
    case CNPAPI_ACTIVITY_MEMCPY_TYPE_DTOH:
      return "MEMCPY_DtoH";
    case CNPAPI_ACTIVITY_MEMCPY_TYPE_DTOD:
      return "MEMCPY_DtoD";
    case CNPAPI_ACTIVITY_MEMCPY_TYPE_HTOH:
      return "MEMCPY_HtoH";
    case CNPAPI_ACTIVITY_MEMCPY_TYPE_PTOP:
      return "MEMCPY_PtoP";
    default:
      break;
  }
  return "MEMCPY";
}

void AddMemcpyRecord(const cnpapiActivityMemcpy* memcpy,
                     uint64_t start_ns,
                     TraceEventCollector* collector) {
  static uint64_t time_gap = GetTimeGap();
  if (memcpy->start + time_gap < start_ns) {
    return;
  }
  DeviceTraceEvent event;
  event.name = MemcpyKind(memcpy->copy_type);
  event.type = TracerEventType::Memcpy;
  event.start_ns = memcpy->start + time_gap;
  event.end_ns = memcpy->end + time_gap;
  event.device_id = memcpy->device_id;
  event.context_id = memcpy->context_id;
  event.stream_id = memcpy->queue_id;
  event.correlation_id = memcpy->correlation_id;
  event.memcpy_info.num_bytes = memcpy->bytes;
  snprintf(event.memcpy_info.copy_kind,
           phi::kMemKindMaxLen,
           "%s",
           MemcpyKind(memcpy->copy_type));
  collector->AddDeviceEvent(std::move(event));
}

void AddMemcpy2Record(const cnpapiActivityMemcpyPtoP* memcpy2,
                      uint64_t start_ns,
                      TraceEventCollector* collector) {
  static uint64_t time_gap = GetTimeGap();
  if (memcpy2->start + time_gap < start_ns) {
    return;
  }
  DeviceTraceEvent event;
  event.name = MemcpyKind(memcpy2->copy_type);
  event.type = TracerEventType::Memcpy;
  event.start_ns = memcpy2->start + time_gap;
  event.end_ns = memcpy2->end + time_gap;
  event.device_id = memcpy2->device_id;
  event.context_id = memcpy2->context_id;
  event.stream_id = memcpy2->queue_id;
  event.correlation_id = memcpy2->correlation_id;
  event.memcpy_info.num_bytes = memcpy2->bytes;
  snprintf(event.memcpy_info.copy_kind,
           phi::kMemKindMaxLen,
           "%s",
           MemcpyKind(memcpy2->copy_type));
  collector->AddDeviceEvent(std::move(event));
}

void AddMemsetRecord(const cnpapiActivityMemset* memset,
                     uint64_t start_ns,
                     TraceEventCollector* collector) {
  static uint64_t time_gap = GetTimeGap();
  if (memset->start + time_gap < start_ns) {
    return;
  }
  DeviceTraceEvent event;
  event.name = "MEMSET";
  event.type = TracerEventType::Memset;
  event.start_ns = memset->start + time_gap;
  event.end_ns = memset->end + time_gap;
  event.device_id = memset->device_id;
  event.context_id = memset->context_id;
  event.stream_id = memset->queue_id;
  event.correlation_id = memset->correlation_id;
  event.memset_info.num_bytes = memset->bytes;
  event.memset_info.value = memset->value;
  collector->AddDeviceEvent(std::move(event));
}

class CnpapiRuntimeCbidStr {
 public:
  static const CnpapiRuntimeCbidStr& GetInstance() {
    static CnpapiRuntimeCbidStr inst;
    return inst;
  }

  std::string RuntimeKind(cnpapi_CallbackId cbid) const {
    auto iter = cbid_str_.find(cbid);
    if (iter == cbid_str_.end()) {
      return "MLU Runtime API " + std::to_string(cbid);
    }
    return iter->second;
  }

 private:
  CnpapiRuntimeCbidStr();
  std::unordered_map<cnpapi_CallbackId, std::string> cbid_str_;
};

CnpapiRuntimeCbidStr::CnpapiRuntimeCbidStr() {
#define REGISTER_RUNTIME_CBID_STR(cbid) \
  cbid_str_[CNPAPI_CNDRV_TRACE_CBID_##cbid] = #cbid
  REGISTER_RUNTIME_CBID_STR(cnMalloc);
  REGISTER_RUNTIME_CBID_STR(cnMallocHost);
  REGISTER_RUNTIME_CBID_STR(cnFree);
  REGISTER_RUNTIME_CBID_STR(cnFreeHost);
  REGISTER_RUNTIME_CBID_STR(cnMemcpy);
  REGISTER_RUNTIME_CBID_STR(cnMemcpyPeer);
  REGISTER_RUNTIME_CBID_STR(cnMemcpyHtoD);
  REGISTER_RUNTIME_CBID_STR(cnMemcpyDtoH);
  REGISTER_RUNTIME_CBID_STR(cnMemcpyDtoD);
  REGISTER_RUNTIME_CBID_STR(cnMemcpyAsync);
  REGISTER_RUNTIME_CBID_STR(cnMemcpyHtoDAsync);
  REGISTER_RUNTIME_CBID_STR(cnMemcpyDtoHAsync);
  REGISTER_RUNTIME_CBID_STR(cnMemcpyDtoDAsync);
  REGISTER_RUNTIME_CBID_STR(cnMemcpyDtoD2D);
  REGISTER_RUNTIME_CBID_STR(cnMemcpyDtoD3D);
  REGISTER_RUNTIME_CBID_STR(cnMemcpy2D);
  REGISTER_RUNTIME_CBID_STR(cnMemcpy3D);
  REGISTER_RUNTIME_CBID_STR(cnMemsetD8);
  REGISTER_RUNTIME_CBID_STR(cnMemsetD16);
  REGISTER_RUNTIME_CBID_STR(cnMemsetD32);
  REGISTER_RUNTIME_CBID_STR(cnMemsetD8Async);
  REGISTER_RUNTIME_CBID_STR(cnMemsetD16Async);
  REGISTER_RUNTIME_CBID_STR(cnMemsetD32Async);
  REGISTER_RUNTIME_CBID_STR(cnInvokeKernel);
  REGISTER_RUNTIME_CBID_STR(cnCreateQueue);
  REGISTER_RUNTIME_CBID_STR(cnDestroyQueue);
  REGISTER_RUNTIME_CBID_STR(cnQueueSync);
  REGISTER_RUNTIME_CBID_STR(cnQueueWaitNotifier);
  REGISTER_RUNTIME_CBID_STR(cnWaitNotifier);
  REGISTER_RUNTIME_CBID_STR(cnCreateNotifier);
  REGISTER_RUNTIME_CBID_STR(cnDestroyNotifier);
  REGISTER_RUNTIME_CBID_STR(cnPlaceNotifier);
  REGISTER_RUNTIME_CBID_STR(cnCtxCreate);
  REGISTER_RUNTIME_CBID_STR(cnCtxDestroy);
  REGISTER_RUNTIME_CBID_STR(cnCtxGetCurrent);
  REGISTER_RUNTIME_CBID_STR(cnCtxSetCurrent);
  REGISTER_RUNTIME_CBID_STR(cnCtxGetDevice);
  REGISTER_RUNTIME_CBID_STR(cnCtxSync);
  REGISTER_RUNTIME_CBID_STR(cnInvokeHostFunc);
#undef REGISTER_RUNTIME_CBID_STR
}

void AddApiRecord(const cnpapiActivityAPI* api,
                  uint64_t start_ns,
                  TraceEventCollector* collector) {
  static uint64_t time_gap = GetTimeGap();
  if (api->start + time_gap < start_ns) {
    return;
  }
  RuntimeTraceEvent event;
  event.name = CnpapiRuntimeCbidStr::GetInstance().RuntimeKind(api->cbid);
  event.start_ns = api->start + time_gap;
  event.end_ns = api->end + time_gap;
  event.process_id = api->process_id;
  event.thread_id = api->thread_id;
  event.correlation_id = api->correlation_id;
  event.callback_id = api->cbid;
  event.type = TracerEventType::MluRuntime;
  collector->AddRuntimeEvent(std::move(event));
}

}  // namespace

namespace details {

void ProcessCnpapiActivityRecord(const cnpapiActivity* record,
                                 uint64_t start_ns,
                                 TraceEventCollector* collector) {
  switch (record->type) {
    case CNPAPI_ACTIVITY_TYPE_KERNEL:
      AddKernelRecord(reinterpret_cast<const cnpapiActivityKernel*>(record),
                      start_ns,
                      collector);
      break;
    case CNPAPI_ACTIVITY_TYPE_MEMCPY:
      AddMemcpyRecord(reinterpret_cast<const cnpapiActivityMemcpy*>(record),
                      start_ns,
                      collector);
      break;
    case CNPAPI_ACTIVITY_TYPE_MEMCPY_PTOP:
      AddMemcpy2Record(reinterpret_cast<const cnpapiActivityMemcpyPtoP*>(record),
                       start_ns,
                       collector);
      break;
    case CNPAPI_ACTIVITY_TYPE_MEMSET:
      AddMemsetRecord(reinterpret_cast<const cnpapiActivityMemset*>(record),
                      start_ns,
                      collector);
      break;
    case CNPAPI_ACTIVITY_TYPE_CNDRV_API:
      AddApiRecord(reinterpret_cast<const cnpapiActivityAPI*>(record),
                   start_ns,
                   collector);
      break;
    default:
      break;
  }
}

}  // namespace details
}  // namespace platform
}  // namespace paddle
#endif
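`GetTimeGap` in the deleted file aligns two clocks: it samples `cpu_time - mlu_time` once and then adds that constant to every device timestamp so host and device events land on a single timeline. A hedged sketch of the same idea with a hypothetical device clock (`FakeDeviceTimestampNs` is invented for the example; a real device clock has its own epoch and query call):

#include <chrono>
#include <cstdint>

// Host clock in nanoseconds.
uint64_t CpuNowNs() {
  return std::chrono::duration_cast<std::chrono::nanoseconds>(
             std::chrono::steady_clock::now().time_since_epoch())
      .count();
}

// Hypothetical device clock: here just the CPU clock minus a fixed skew, so
// the example stays self-contained.
uint64_t FakeDeviceTimestampNs() { return CpuNowNs() - 123456789ULL; }

// Capture the offset once, then shift every device timestamp by it.
uint64_t DeviceToCpuNs(uint64_t device_ns) {
  static const uint64_t time_gap = CpuNowNs() - FakeDeviceTimestampNs();
  return device_ns + time_gap;
}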
paddle/fluid/platform/profiler/mlu/cnpapi_data_process.h
deleted (file mode 100644 → 0)
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <unordered_map>
#ifdef PADDLE_WITH_MLU
#include "paddle/fluid/platform/device/mlu/mlu_info.h"
#endif
#include "paddle/fluid/platform/profiler/trace_event_collector.h"
namespace paddle {
namespace platform {
namespace details {

#ifdef PADDLE_WITH_MLU
void ProcessCnpapiActivityRecord(const cnpapiActivity* record,
                                 uint64_t start_ns,
                                 TraceEventCollector* collector);
#endif

}  // namespace details
}  // namespace platform
}  // namespace paddle
paddle/fluid/platform/profiler/mlu/mlu_tracer.cc
deleted (file mode 100644 → 0)
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/platform/profiler/mlu/mlu_tracer.h"
#include <string>
#include <unordered_map>
#include "glog/logging.h"
#include "paddle/fluid/framework/new_executor/workqueue/workqueue_utils.h"
#include "paddle/fluid/platform/os_info.h"
#include "paddle/fluid/platform/profiler/mlu/cnpapi_data_process.h"
#define CNPAPI_CALL(call) \
do { \
cnpapiResult _status = call; \
if (_status != CNPAPI_SUCCESS) { \
const char* errstr; \
cnpapiGetResultString(_status, &errstr); \
LOG(ERROR) << "Function " << #call << " failed with error " << errstr; \
} \
} while (0)
namespace paddle {
namespace platform {

namespace {

void BufferRequestedCallback(uint64_t** buffer,
                             size_t* size,
                             size_t* max_num_records) {
  constexpr size_t kBufferSize = 1 << 23;  // 8 MB
  constexpr size_t kBufferAlignSize = 8;
  *buffer = reinterpret_cast<uint64_t*>(
      paddle::framework::AlignedMalloc(kBufferSize, kBufferAlignSize));
  *size = kBufferSize;
  *max_num_records = 0;
}

void BufferCompletedCallback(uint64_t* buffer, size_t size, size_t valid_size) {
  if (buffer == nullptr || valid_size == 0) {
    return;
  }
  auto mlu_tracer = &MluTracer::GetInstance();
  mlu_tracer->ProcessCnpapiActivity(buffer, valid_size);
  paddle::framework::AlignedFree(buffer);
}

}  // namespace

MluTracer::MluTracer() {
#ifdef PADDLE_WITH_MLU
  CNPAPI_CALL(cnpapiInit());
  CNPAPI_CALL(cnpapiActivityRegisterCallbacks(BufferRequestedCallback,
                                              BufferCompletedCallback));
#endif
}

void MluTracer::PrepareTracing() {
  PADDLE_ENFORCE_EQ(
      state_ == TracerState::UNINITED || state_ == TracerState::STOPED,
      true,
      platform::errors::PreconditionNotMet("MluTracer must be UNINITED"));
  EnableCnpapiActivity();
  state_ = TracerState::READY;
}

void MluTracer::StartTracing() {
  PADDLE_ENFORCE_EQ(state_ == TracerState::READY,
                    true,
                    platform::errors::PreconditionNotMet(
                        "MluTracer must be READY or STOPPED"));
  tracing_start_ns_ = PosixInNsec();
  state_ = TracerState::STARTED;
}

void MluTracer::StopTracing() {
  PADDLE_ENFORCE_EQ(
      state_,
      TracerState::STARTED,
      platform::errors::PreconditionNotMet("MluTracer must be STARTED"));
  DisableCnpapiActivity();
  state_ = TracerState::STOPED;
}

void MluTracer::CollectTraceData(TraceEventCollector* collector) {
  PADDLE_ENFORCE_EQ(
      state_,
      TracerState::STOPED,
      platform::errors::PreconditionNotMet("MluTracer must be STOPED"));
  for (auto he : collector_.HostEvents()) {
    collector->AddHostEvent(std::move(he));
  }
  for (auto rte : collector_.RuntimeEvents()) {
    collector->AddRuntimeEvent(std::move(rte));
  }
  for (auto de : collector_.DeviceEvents()) {
    collector->AddDeviceEvent(std::move(de));
  }
  for (auto tn : collector_.ThreadNames()) {
    collector->AddThreadName(tn.first, tn.second);
  }
  collector_.ClearAll();
}

void MluTracer::ProcessCnpapiActivity(uint64_t* buffer, size_t valid_size) {
#ifdef PADDLE_WITH_MLU
  cnpapiActivity* record = nullptr;
  while (true) {
    cnpapiResult status =
        cnpapiActivityGetNextRecord(buffer, valid_size, &record);
    if (status == CNPAPI_SUCCESS) {
      details::ProcessCnpapiActivityRecord(
          record, tracing_start_ns_, &collector_);
    } else if (status == CNPAPI_ERROR_INSUFFICIENT_MEMORY ||
               status == CNPAPI_ERROR_MAX_LIMIT_REACHED) {
      break;
    } else {
      CNPAPI_CALL(status);
    }
  }
#endif
}

void MluTracer::EnableCnpapiActivity() {
#ifdef PADDLE_WITH_MLU
  CNPAPI_CALL(cnpapiActivityEnable(CNPAPI_ACTIVITY_TYPE_KERNEL));
  CNPAPI_CALL(cnpapiActivityEnable(CNPAPI_ACTIVITY_TYPE_MEMCPY));
  CNPAPI_CALL(cnpapiActivityEnable(CNPAPI_ACTIVITY_TYPE_MEMCPY_PTOP));
  CNPAPI_CALL(cnpapiActivityEnable(CNPAPI_ACTIVITY_TYPE_MEMSET));
  CNPAPI_CALL(cnpapiActivityEnable(CNPAPI_ACTIVITY_TYPE_CNDRV_API));
  VLOG(3) << "enable cnpapi activity";
#endif
}

void MluTracer::DisableCnpapiActivity() {
#ifdef PADDLE_WITH_MLU
  CNPAPI_CALL(cnpapiActivityFlushAll());
  CNPAPI_CALL(cnpapiActivityDisable(CNPAPI_ACTIVITY_TYPE_KERNEL));
  CNPAPI_CALL(cnpapiActivityDisable(CNPAPI_ACTIVITY_TYPE_MEMCPY));
  CNPAPI_CALL(cnpapiActivityDisable(CNPAPI_ACTIVITY_TYPE_MEMCPY_PTOP));
  CNPAPI_CALL(cnpapiActivityDisable(CNPAPI_ACTIVITY_TYPE_MEMSET));
  CNPAPI_CALL(cnpapiActivityDisable(CNPAPI_ACTIVITY_TYPE_CNDRV_API));
  VLOG(3) << "disable cnpapi activity";
#endif
}

}  // namespace platform
}  // namespace paddle
paddle/fluid/platform/profiler/mlu/mlu_tracer.h
deleted (file mode 100644 → 0)
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <cstdint>
#include <vector>
#ifdef PADDLE_WITH_MLU
#include "paddle/fluid/platform/device/mlu/mlu_info.h"
#endif
#include "paddle/fluid/platform/macros.h"
#include "paddle/fluid/platform/profiler/tracer_base.h"
namespace paddle {
namespace platform {

class MluTracer : public TracerBase {
 public:
  static MluTracer& GetInstance() {
    static MluTracer instance;
    return instance;
  }

  void PrepareTracing() override;

  void StartTracing() override;

  void StopTracing() override;

  void CollectTraceData(TraceEventCollector* collector) override;

  void ProcessCnpapiActivity(uint64_t* buffer, size_t valid_size);

 private:
  MluTracer();

  DISABLE_COPY_AND_ASSIGN(MluTracer);

  void EnableCnpapiActivity();

  void DisableCnpapiActivity();

  uint64_t tracing_start_ns_ = UINT64_MAX;

  TraceEventCollector collector_;
};

}  // namespace platform
}  // namespace paddle
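The deleted tracer enforces a strict lifecycle: UNINITED/STOPED → READY (prepare) → STARTED (start) → STOPED (stop), and data may only be collected once stopped. A minimal, self-contained sketch of that state machine (illustrative only; it mirrors the shape of the class above, not Paddle's actual TracerBase API):

#include <cstdint>
#include <iostream>
#include <vector>

class TinyTracer {
 public:
  enum class State { kUninited, kReady, kStarted, kStoped };

  void PrepareTracing() {
    if (state_ == State::kUninited || state_ == State::kStoped) {
      state_ = State::kReady;
    }
  }
  void StartTracing() {
    if (state_ == State::kReady) {
      events_.clear();
      state_ = State::kStarted;
    }
  }
  void RecordEvent(uint64_t ns) {
    if (state_ == State::kStarted) events_.push_back(ns);  // only while started
  }
  void StopTracing() {
    if (state_ == State::kStarted) state_ = State::kStoped;
  }
  const std::vector<uint64_t>& CollectTraceData() const { return events_; }

 private:
  State state_ = State::kUninited;
  std::vector<uint64_t> events_;
};

int main() {
  TinyTracer tracer;
  tracer.PrepareTracing();
  tracer.StartTracing();
  tracer.RecordEvent(42);
  tracer.StopTracing();
  std::cout << tracer.CollectTraceData().size() << " event(s)\n";
  return 0;
}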
paddle/fluid/platform/profiler/profiler.cc
@@ -29,10 +29,6 @@
 #include "paddle/fluid/platform/profiler/custom_device/custom_tracer.h"
 #include "paddle/fluid/platform/profiler/extra_info.h"
 #include "paddle/fluid/platform/profiler/host_tracer.h"
-#ifdef PADDLE_WITH_MLU
-#include "paddle/fluid/platform/device/mlu/enforce.h"
-#include "paddle/fluid/platform/profiler/mlu/mlu_tracer.h"
-#endif
 #include "paddle/fluid/platform/profiler/trace_event_collector.h"
 #include "paddle/fluid/platform/profiler/utils.h"
 #ifdef PADDLE_WITH_CUSTOM_DEVICE
@@ -49,9 +45,6 @@ void SynchronizeDevice() {
 #ifdef PADDLE_WITH_HIP
   PADDLE_ENFORCE_GPU_SUCCESS(hipDeviceSynchronize());
 #endif
-#ifdef PADDLE_WITH_MLU
-  PADDLE_ENFORCE_MLU_SUCCESS(cnrtSyncDevice());
-#endif
 #ifdef PADDLE_WITH_CUSTOM_DEVICE
   auto dev_types = phi::DeviceManager::GetAllCustomDeviceTypes();
   for (const auto& dev_type : dev_types) {
@@ -86,9 +79,6 @@ bool Profiler::IsCuptiSupported() {
 bool Profiler::IsCnpapiSupported() {
   bool supported = false;
-#ifdef PADDLE_WITH_MLU
-  supported = true;
-#endif
   return supported;
 }
@@ -104,11 +94,6 @@ Profiler::Profiler(const ProfilerOptions& options,
   if (trace_switch.test(kProfileGPUOptionBit)) {
     tracers_.emplace_back(&CudaTracer::GetInstance(), false);
   }
-#ifdef PADDLE_WITH_MLU
-  if (trace_switch.test(kProfileMLUOptionBit)) {
-    tracers_.emplace_back(&MluTracer::GetInstance(), false);
-  }
-#endif
   if (trace_switch.test(kProfileCustomDeviceOptionBit)) {
     for (const auto& dev_type : custom_device_types) {
       tracers_.emplace_back(&CustomTracer::GetInstance(dev_type), false);
paddle/fluid/platform/profiler_helper.h
@@ -34,10 +34,6 @@ limitations under the License. */
 #ifdef PADDLE_WITH_HIP
 #include <hip/hip_runtime.h>
 #endif
-#ifdef PADDLE_WITH_MLU
-#include "paddle/fluid/platform/device/mlu/enforce.h"
-#include "paddle/fluid/platform/device/mlu/mlu_info.h"
-#endif
 #ifdef PADDLE_WITH_CUSTOM_DEVICE
 #include "paddle/phi/backends/device_manager.h"
 #endif
@@ -112,13 +108,6 @@ void SynchronizeAllDevice() {
     PADDLE_ENFORCE_GPU_SUCCESS(hipDeviceSynchronize());
   }
 #endif
-#ifdef PADDLE_WITH_MLU
-  int count = GetMLUDeviceCount();
-  for (int i = 0; i < count; i++) {
-    SetMLUDeviceId(i);
-    PADDLE_ENFORCE_MLU_SUCCESS(cnrtSyncDevice());
-  }
-#endif
 #ifdef PADDLE_WITH_CUSTOM_DEVICE
   auto dev_types = phi::DeviceManager::GetAllCustomDeviceTypes();
   for (const auto& dev_type : dev_types) {
paddle/fluid/platform/stream_callback_manager.cc
@@ -32,10 +32,6 @@ static void StreamCallbackFunc(gpuStream_t stream,
 StreamCallbackFunc(cudaStream_t stream, cudaError_t status, void *user_data)
 #endif
 #endif
-#if PADDLE_WITH_MLU
-static void StreamCallbackFunc(void *user_data)
-#endif
 {
   std::unique_ptr<std::function<void()>> func(
       reinterpret_cast<std::function<void()> *>(user_data));
@@ -71,20 +67,12 @@ void StreamCallbackManager<Stream>::AddCallback(
       cudaStreamAddCallback(stream_, StreamCallbackFunc, func, 0));
 #endif
 #endif
-#if PADDLE_WITH_MLU
-  VLOG(3) << "MLULaunchCallback at stream: " << stream_;
-  cnrtInvokeHostFunc(stream_, StreamCallbackFunc, func);
-#endif
 }

 template <typename Stream>
 void StreamCallbackManager<Stream>::Wait() const {
 #if defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_CUDA)
   platform::GpuStreamSync(stream_);
 #endif
-#ifdef PADDLE_WITH_MLU
-  PADDLE_ENFORCE_MLU_SUCCESS(cnrtQueueSync(stream_));
-#endif
   {
     std::lock_guard<std::mutex> lock(mtx_);
@@ -100,10 +88,5 @@ template struct StreamCallbackManager<gpuStream_t>;
 #ifdef PADDLE_WITH_HIP
 template struct StreamCallbackManager<hipStream_t>;
 #endif
-#ifdef PADDLE_WITH_MLU
-template struct StreamCallbackManager<mluStream>;
-#endif
 }  // namespace platform
 }  // namespace paddle
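All backends in this file funnel a `std::function` through a `void* user_data` parameter: the enqueuer heap-allocates the closure, and the callback reclaims it with `unique_ptr` before invoking it. A stand-alone sketch of that ownership hand-off (`FakeEnqueueOnStream` is a placeholder for `cudaStreamAddCallback`/`cnrtInvokeHostFunc`, not a real driver call):

#include <functional>
#include <memory>
#include <utility>

using HostCallback = void (*)(void* user_data);

// The callback side: take ownership back and run the closure exactly once.
void RunHostCallback(void* user_data) {
  std::unique_ptr<std::function<void()>> func(
      reinterpret_cast<std::function<void()>*>(user_data));
  (*func)();  // unique_ptr frees the closure when it goes out of scope
}

// Stand-in for a stream API: here the "stream" just invokes the callback now.
void FakeEnqueueOnStream(HostCallback cb, void* user_data) { cb(user_data); }

// The enqueue side: heap-allocate the closure so it survives until the
// callback fires on the stream.
void AddCallback(std::function<void()> callback) {
  auto* func = new std::function<void()>(std::move(callback));
  FakeEnqueueOnStream(RunHostCallback, func);
}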
paddle/fluid/pybind/imperative.cc
@@ -151,8 +151,6 @@ static const platform::Place PyObjectToPlace(const py::object &place_obj) {
     return place_obj.cast<platform::IPUPlace>();
   } else if (py::isinstance<platform::Place>(place_obj)) {
     return place_obj.cast<platform::Place>();
-  } else if (py::isinstance<platform::MLUPlace>(place_obj)) {
-    return place_obj.cast<platform::MLUPlace>();
   } else if (py::isinstance<platform::CustomPlace>(place_obj)) {
     return place_obj.cast<platform::CustomPlace>();
   } else {
@@ -207,8 +205,6 @@ static void InitVarBaseAndTensor(imperative::VarBase *self,
     SetTensorFromPyArray<platform::NPUPlace>(tensor, array, place, zero_copy);
   } else if (platform::is_ipu_place(place)) {
     SetTensorFromPyArray<platform::IPUPlace>(tensor, array, place, zero_copy);
-  } else if (platform::is_mlu_place(place)) {
-    SetTensorFromPyArray<platform::MLUPlace>(tensor, array, place, zero_copy);
   } else if (platform::is_custom_place(place)) {
     SetTensorFromPyArray<platform::CustomPlace>(
         tensor, array, place, zero_copy);
@@ -727,14 +723,6 @@ void BindImperative(py::module *m_ptr) {
           py::arg("zero_copy") = false,
           py::arg("name") = "",
           py::arg("stop_gradient") = -1)
-      .def("__init__",
-           &InitVarBaseFromNumpyWithArg<platform::MLUPlace>,
-           py::arg("value"),
-           py::arg("place"),
-           py::arg("persistable") = false,
-           py::arg("zero_copy") = false,
-           py::arg("name") = "",
-           py::arg("stop_gradient") = -1)
      .def("__init__",
           &InitVarBaseFromNumpyWithArg<platform::CustomPlace>,
           py::arg("value"),
@@ -773,11 +761,6 @@ void BindImperative(py::module *m_ptr) {
           py::arg("tensor"),
           py::arg("place"),
           py::arg("name") = "")
-      .def("__init__",
-           &InitVarBaseFromTensorWithArg<platform::MLUPlace>,
-           py::arg("tensor"),
-           py::arg("place"),
-           py::arg("name") = "")
      .def("__init__",
           &InitVarBaseFromTensorWithArg<platform::CustomPlace>,
           py::arg("tensor"),
@@ -1878,18 +1861,6 @@ void BindImperative(py::module *m_ptr) {
            return new_var;
          },
          py::return_value_policy::copy)
-      .def("_copy_to",
-           [](const std::shared_ptr<imperative::VarBase> &self,
-              const platform::MLUPlace &place,
-              bool blocking) {
-             auto new_var = self->NewVarBase(place, blocking);
-             if (!blocking) {
-               IncreaseVarbaseReferenceCountUntilCopyComplete(self, place);
-             }
-             return new_var;
-           },
-           py::return_value_policy::copy)
      .def("_copy_to",
           [](const std::shared_ptr<imperative::VarBase> &self,
@@ -2217,11 +2188,6 @@ void BindImperative(py::module *m_ptr) {
              self.SetExpectedPlace(*p);
              VLOG(4) << "Tracer(" << &self << ")"
                      << " set expected place " << *p;
-            } else if (py::isinstance<platform::MLUPlace>(obj)) {
-              auto p = obj.cast<platform::MLUPlace *>();
-              self.SetExpectedPlace(*p);
-              VLOG(4) << "Tracer(" << &self << ")"
-                      << " set expected place " << *p;
            } else if (py::isinstance<platform::CustomPlace>(obj)) {
              auto p = obj.cast<platform::CustomPlace *>();
              self.SetExpectedPlace(*p);
@@ -2412,28 +2378,6 @@ void BindImperative(py::module *m_ptr) {
                              inplace_map);
            }
          })
-      .def("trace",
-           [](imperative::Tracer &self,
-              const std::string &type,
-              const PyNameVarBaseMap &ins,
-              const PyNameVarBaseMap &outs,
-              framework::AttributeMap attrs,
-              const platform::MLUPlace &place,
-              bool trace_backward,
-              const std::map<std::string, std::string> &inplace_map = {}) {
-             auto ins_map = ConvertToNameVarBaseMap(ins);
-             auto outs_map = ConvertToNameVarBaseMap(outs);
-             {
-               py::gil_scoped_release release;
-               self.TraceOp<imperative::VarBase>(type,
-                                                 std::move(ins_map),
-                                                 std::move(outs_map),
-                                                 std::move(attrs),
-                                                 place,
-                                                 trace_backward,
-                                                 inplace_map);
-             }
-           })
      .def("trace",
           [](imperative::Tracer &self,
             const std::string &type,
@@ -2505,7 +2449,6 @@ void BindImperative(py::module *m_ptr) {
   m.def("varbase_copy", &VarBaseCopy<platform::CUDAPinnedPlace>);
   m.def("varbase_copy", &VarBaseCopy<platform::NPUPlace>);
   m.def("varbase_copy", &VarBaseCopy<platform::CustomPlace>);
-  m.def("varbase_copy", &VarBaseCopy<platform::MLUPlace>);
   m.def("dygraph_partial_grad",
@@ -2616,19 +2559,6 @@ void BindImperative(py::module *m_ptr) {
           py::arg("ring_id"));
 #endif
-#if defined(PADDLE_WITH_CNCL)
-  py::class_<imperative::CNCLParallelContext,
-             imperative::ParallelContext,
-             std::shared_ptr<imperative::CNCLParallelContext>>(
-      m, "CNCLParallelContext")
-      .def(py::init<const imperative::ParallelStrategy &,
-                    const platform::MLUPlace &>())
-      .def("init", [](imperative::CNCLParallelContext &self) { self.Init(); })
-      .def("init_with_ring_id",
-           &imperative::CNCLParallelContext::InitWithRingID,
-           py::arg("ring_id"));
-#endif
 #if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || \
     defined(PADDLE_WITH_XPU_BKCL)
   py::class_<imperative::HeterParallelContext,
paddle/fluid/pybind/parallel_executor.cc
@@ -152,10 +152,6 @@ limitations under the License. */
 #include "paddle/fluid/platform/device/ipu/ipu_info.h"
 #endif
-#ifdef PADDLE_WITH_MLU
-#include "paddle/fluid/platform/device/mlu/mlu_info.h"
-#endif
 #ifdef PADDLE_WITH_CRYPTO
 #include "paddle/fluid/pybind/crypto.h"
 #endif
paddle/fluid/pybind/place.cc
...
@@ -152,10 +152,6 @@ limitations under the License. */
...
@@ -152,10 +152,6 @@ limitations under the License. */
#include "paddle/fluid/platform/device/ipu/ipu_info.h"
#include "paddle/fluid/platform/device/ipu/ipu_info.h"
#endif
#endif
#ifdef PADDLE_WITH_MLU
#include "paddle/fluid/platform/device/mlu/mlu_info.h"
#endif
#ifdef PADDLE_WITH_CRYPTO
#ifdef PADDLE_WITH_CRYPTO
#include "paddle/fluid/pybind/crypto.h"
#include "paddle/fluid/pybind/crypto.h"
#endif
#endif
...
@@ -194,7 +190,6 @@ PyTypeObject *g_cpuplace_pytype = nullptr;
...
@@ -194,7 +190,6 @@ PyTypeObject *g_cpuplace_pytype = nullptr;
PyTypeObject
*
g_xpuplace_pytype
=
nullptr
;
PyTypeObject
*
g_xpuplace_pytype
=
nullptr
;
PyTypeObject
*
g_npuplace_pytype
=
nullptr
;
PyTypeObject
*
g_npuplace_pytype
=
nullptr
;
PyTypeObject
*
g_cudapinnedplace_pytype
=
nullptr
;
PyTypeObject
*
g_cudapinnedplace_pytype
=
nullptr
;
PyTypeObject
*
g_mluplace_pytype
=
nullptr
;
PyTypeObject
*
g_ipuplace_pytype
=
nullptr
;
PyTypeObject
*
g_ipuplace_pytype
=
nullptr
;
template
<
typename
PlaceType
>
template
<
typename
PlaceType
>
...
@@ -371,7 +366,6 @@ void BindPlace(pybind11::module &m) { // NOLINT
...
@@ -371,7 +366,6 @@ void BindPlace(pybind11::module &m) { // NOLINT
.
def
(
"_equals"
,
&
IsSamePlace
<
platform
::
CUDAPlace
,
platform
::
CPUPlace
>
)
.
def
(
"_equals"
,
&
IsSamePlace
<
platform
::
CUDAPlace
,
platform
::
CPUPlace
>
)
.
def
(
"_equals"
,
&
IsSamePlace
<
platform
::
CUDAPlace
,
platform
::
XPUPlace
>
)
.
def
(
"_equals"
,
&
IsSamePlace
<
platform
::
CUDAPlace
,
platform
::
XPUPlace
>
)
.
def
(
"_equals"
,
&
IsSamePlace
<
platform
::
CUDAPlace
,
platform
::
NPUPlace
>
)
.
def
(
"_equals"
,
&
IsSamePlace
<
platform
::
CUDAPlace
,
platform
::
NPUPlace
>
)
.
def
(
"_equals"
,
&
IsSamePlace
<
platform
::
CUDAPlace
,
platform
::
MLUPlace
>
)
.
def
(
"_equals"
,
.
def
(
"_equals"
,
&
IsSamePlace
<
platform
::
CUDAPlace
,
platform
::
CUDAPinnedPlace
>
)
&
IsSamePlace
<
platform
::
CUDAPlace
,
platform
::
CUDAPinnedPlace
>
)
.
def
(
"_get_device_id"
,
.
def
(
"_get_device_id"
,
...
@@ -614,82 +608,8 @@ void BindPlace(pybind11::module &m) { // NOLINT
...
@@ -614,82 +608,8 @@ void BindPlace(pybind11::module &m) { // NOLINT
.
def
(
"_equals"
,
&
IsSamePlace
<
platform
::
IPUPlace
,
platform
::
IPUPlace
>
)
.
def
(
"_equals"
,
&
IsSamePlace
<
platform
::
IPUPlace
,
platform
::
IPUPlace
>
)
.
def
(
"_equals"
,
.
def
(
"_equals"
,
&
IsSamePlace
<
platform
::
IPUPlace
,
platform
::
CUDAPinnedPlace
>
)
&
IsSamePlace
<
platform
::
IPUPlace
,
platform
::
CUDAPinnedPlace
>
)
#ifdef PADDLE_WITH_IPU
      .def("get_device_id",
           [](const platform::IPUPlace &self) { return self.GetDeviceId(); })
#endif
      .def("__str__", string::to_string<const platform::IPUPlace &>);

-  // MLUPlace
-  py::class_<platform::MLUPlace> mluplace(m, "MLUPlace", R"DOC(
-    MLUPlace is a descriptor of a device.
-    It represents a MLU device on which a tensor will be allocated and a model will run.
-    Examples:
-        .. code-block:: python
-          import paddle
-          # required: mlu
-          mlu_place = paddle.MLUPlace(0)
-        )DOC");
-  g_mluplace_pytype = reinterpret_cast<PyTypeObject *>(mluplace.ptr());
-  mluplace
-      .def("__init__",
-           [](platform::MLUPlace &self, int dev_id) {
-#ifdef PADDLE_WITH_MLU
-             if (UNLIKELY(dev_id < 0)) {
-               LOG(ERROR) << string::Sprintf(
-                   "Invalid MLUPlace(%d), device id must be 0 or "
-                   "positive integer",
-                   dev_id);
-               std::exit(-1);
-             }
-             if (UNLIKELY(dev_id >= platform::GetMLUDeviceCount())) {
-               if (platform::GetMLUDeviceCount() == 0) {
-                 LOG(ERROR) << "Cannot use MLU because there is no MLU "
-                               "detected on your "
-                               "machine.";
-                 std::exit(-1);
-               } else {
-                 LOG(ERROR) << string::Sprintf(
-                     "Invalid MLUPlace(%d), must inside [0, %d), because MLU "
-                     "number on your machine is %d",
-                     dev_id,
-                     platform::GetMLUDeviceCount(),
-                     platform::GetMLUDeviceCount());
-                 std::exit(-1);
-               }
-             }
-             new (&self) platform::MLUPlace(dev_id);
-#else
-             LOG(ERROR) << string::Sprintf(
-                 "Cannot use MLU because you have installed CPU/GPU/... "
-                 "version "
-                 "PaddlePaddle.\n"
-                 "If you want to use MLU, please try to install MLU version "
-                 "PaddlePaddle by: pip install paddlepaddle-mlu\n"
-                 "If you only have CPU, please change MLUPlace(%d) to be "
-                 "CPUPlace().\n",
-                 dev_id);
-             std::exit(-1);
-#endif
-           })
-      .def("_type", &PlaceIndex<platform::MLUPlace>)
-#ifdef PADDLE_WITH_MLU
-      .def("_equals", &IsSamePlace<platform::MLUPlace, platform::Place>)
-      .def("_equals", &IsSamePlace<platform::MLUPlace, platform::CUDAPlace>)
-      .def("_equals", &IsSamePlace<platform::MLUPlace, platform::CPUPlace>)
-      .def("_equals", &IsSamePlace<platform::MLUPlace, platform::XPUPlace>)
-      .def("_equals", &IsSamePlace<platform::MLUPlace, platform::NPUPlace>)
-      .def("_equals", &IsSamePlace<platform::MLUPlace, platform::IPUPlace>)
-      .def("_equals", &IsSamePlace<platform::MLUPlace, platform::MLUPlace>)
-      .def("_equals",
-           &IsSamePlace<platform::MLUPlace, platform::CUDAPinnedPlace>)
-      .def("get_device_id",
-           [](const platform::MLUPlace &self) { return self.GetDeviceId(); })
-#endif
-      .def("__str__", string::to_string<const platform::MLUPlace &>);

  py::class_<platform::Place> platformplace(m, "Place");
  g_place_pytype = reinterpret_cast<PyTypeObject *>(platformplace.ptr());
  platformplace.def(py::init<>())
...
@@ -701,7 +621,6 @@ void BindPlace(pybind11::module &m) { // NOLINT
      .def("_equals", &IsSamePlace<platform::Place, platform::NPUPlace>)
      .def("_equals", &IsSamePlace<platform::Place, platform::IPUPlace>)
      .def("_equals", &IsSamePlace<platform::Place, platform::CUDAPinnedPlace>)
-      .def("_equals", &IsSamePlace<platform::Place, platform::MLUPlace>)
      .def("_equals", &IsSamePlace<platform::Place, platform::CustomPlace>)
      .def("is_gpu_place",
           [](platform::Place &self) { return platform::is_gpu_place(self); })
...
@@ -758,10 +677,6 @@ void BindPlace(pybind11::module &m) { // NOLINT
           [](platform::Place &self, const platform::IPUPlace &ipu_place) {
             self = ipu_place;
           })
-      .def("set_place",
-           [](platform::Place &self, const platform::MLUPlace &mlu_place) {
-             self = mlu_place;
-           })
      .def("set_place",
           [](platform::Place &self, const platform::CustomPlace &plug_place) {
             self = plug_place;
...
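The hunks above delete the MLUPlace bindings from BindPlace. For readers unfamiliar with how these place classes reach Python, here is a minimal, hedged sketch of the same py::class_/.def pattern built on a self-contained toy type; ToyPlace, the module name toy_place, and the helper lambdas are hypothetical stand-ins, not Paddle code.

// Minimal sketch of the BindPlace-style pattern, assuming only pybind11.
// ToyPlace and the module name "toy_place" are hypothetical, not Paddle's.
#include <pybind11/pybind11.h>

#include <stdexcept>
#include <string>

namespace py = pybind11;

struct ToyPlace {
  int device_id;
};

PYBIND11_MODULE(toy_place, m) {
  py::class_<ToyPlace>(m, "ToyPlace", R"DOC(
    ToyPlace is a descriptor of a device, mirroring paddle.MLUPlace(dev_id).
  )DOC")
      // Validate the device id at construction time, as the deleted
      // MLUPlace __init__ binding did (it logged and exited instead).
      .def(py::init([](int dev_id) {
        if (dev_id < 0) {
          throw std::invalid_argument(
              "device id must be 0 or a positive integer");
        }
        return ToyPlace{dev_id};
      }))
      .def("get_device_id",
           [](const ToyPlace &self) { return self.device_id; })
      .def("_equals",
           [](const ToyPlace &self, const ToyPlace &other) {
             return self.device_id == other.device_id;
           })
      .def("__str__", [](const ToyPlace &self) {
        return "ToyPlace(" + std::to_string(self.device_id) + ")";
      });
}

In the real binding, __init__ is registered with placement new and equality goes through the IsSamePlace<> templates; the sketch only shows the shape of the API surface that this commit removes for MLU.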
paddle/fluid/pybind/tensor.cc
View file @ e75c01f9
...
@@ -152,10 +152,6 @@ limitations under the License. */
 #include "paddle/fluid/platform/device/ipu/ipu_info.h"
 #endif
-#ifdef PADDLE_WITH_MLU
-#include "paddle/fluid/platform/device/mlu/mlu_info.h"
-#endif
 #ifdef PADDLE_WITH_CRYPTO
 #include "paddle/fluid/pybind/crypto.h"
 #endif
...
@@ -252,10 +248,6 @@ void BindTensor(pybind11::module &m) { // NOLINT
            [](phi::DenseTensor &self, paddle::platform::NPUPlace &place) {
              self.mutable_data<float>(place);
            })
-      .def("_alloc_float",
-           [](phi::DenseTensor &self, paddle::platform::MLUPlace &place) {
-             self.mutable_data<float>(place);
-           })
       .def("_alloc_double",
            [](phi::DenseTensor &self, paddle::platform::CPUPlace &place) {
              self.mutable_data<double>(place);
...
@@ -276,10 +268,6 @@ void BindTensor(pybind11::module &m) { // NOLINT
            [](phi::DenseTensor &self, paddle::platform::CUDAPlace &place) {
              self.mutable_data<int>(place);
            })
-      .def("_alloc_int",
-           [](phi::DenseTensor &self, paddle::platform::MLUPlace &place) {
-             self.mutable_data<int>(place);
-           })
       .def("_alloc_int",
            [](phi::DenseTensor &self, paddle::platform::CUDAPinnedPlace &place) {
...
@@ -325,13 +313,6 @@ void BindTensor(pybind11::module &m) { // NOLINT
              return reinterpret_cast<uintptr_t>(
                  self.mutable_data(place, framework::TransToPhiDataType(type)));
            })
-      .def("_mutable_data",
-           [](phi::DenseTensor &self,
-              paddle::platform::MLUPlace &place,
-              paddle::framework::proto::VarType::Type type) {
-             return reinterpret_cast<uintptr_t>(
-                 self.mutable_data(place, framework::TransToPhiDataType(type)));
-           })
       .def("_clear", &phi::DenseTensor::clear)
       .def("_mutable_data",
            [](phi::DenseTensor &self,
...
@@ -370,11 +351,6 @@ void BindTensor(pybind11::module &m) { // NOLINT
            py::arg("tensor"),
            py::arg("place"),
            py::arg("batch_size") = -1)
-      .def("_copy_from",
-           &TensorCopyFrom<paddle::platform::MLUPlace>,
-           py::arg("tensor"),
-           py::arg("place"),
-           py::arg("batch_size") = -1)
       .def("_copy_from",
            &TensorCopyFrom<paddle::platform::IPUPlace>,
            py::arg("tensor"),
...
@@ -415,11 +391,6 @@ void BindTensor(pybind11::module &m) { // NOLINT
            py::arg("array"),
            py::arg("place"),
            py::arg("zero_copy") = false)
-      .def("set",
-           SetTensorFromPyArray<paddle::platform::MLUPlace>,
-           py::arg("array"),
-           py::arg("place"),
-           py::arg("zero_copy") = false)
       .def("set",
            SetTensorFromPyArray<paddle::platform::CUDAPinnedPlace>,
            py::arg("array"),
...
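The _alloc_float, _alloc_int, _mutable_data, _copy_from and set removals above each drop one overload from a chain of .def calls that differ only in the place argument. pybind11 resolves such overloads by trying them in registration order until the Python arguments convert, which is why tensor.cc registers one lambda per place type. The sketch below illustrates that mechanism with hypothetical ToyTensor/CPUPlace/GPUPlace types and a hypothetical module name, not Paddle's own.

// Hedged sketch of per-place overloads, assuming only pybind11.
// ToyTensor, CPUPlace, GPUPlace and the module name "toy_tensor" are
// hypothetical stand-ins for the Paddle types bound in tensor.cc.
#include <pybind11/pybind11.h>

#include <string>

namespace py = pybind11;

struct CPUPlace {};

struct GPUPlace {
  explicit GPUPlace(int id) : device_id(id) {}
  int device_id;
};

struct ToyTensor {
  std::string last_alloc;
};

PYBIND11_MODULE(toy_tensor, m) {
  py::class_<CPUPlace>(m, "CPUPlace").def(py::init<>());
  py::class_<GPUPlace>(m, "GPUPlace").def(py::init<int>());
  py::class_<ToyTensor>(m, "ToyTensor")
      .def(py::init<>())
      // pybind11 tries each _alloc_float overload in registration order and
      // dispatches to the first one whose place argument matches.
      .def("_alloc_float",
           [](ToyTensor &self, CPUPlace &) { self.last_alloc = "cpu float"; })
      .def("_alloc_float",
           [](ToyTensor &self, GPUPlace &) { self.last_alloc = "gpu float"; })
      .def_readonly("last_alloc", &ToyTensor::last_alloc);
}

From Python, t._alloc_float(toy_tensor.GPUPlace(0)) would hit the second lambda; removing the MLU overloads simply means an MLU place argument no longer converts to anything.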
paddle/fluid/pybind/tensor_py.h
View file @ e75c01f9
...
@@ -292,13 +292,6 @@ T TensorGetElement(const phi::DenseTensor &self, size_t offset) {
     auto p = self.place();
     paddle::memory::Copy(
         platform::CPUPlace(), &b, p, a + offset, sizeof(T), nullptr);
 #endif
-  } else if (platform::is_mlu_place(self.place())) {
-#ifdef PADDLE_WITH_MLU
-    const T *a = self.data<T>();
-    auto p = self.place();
-    paddle::memory::Copy(
-        platform::CPUPlace(), &b, p, a + offset, sizeof(T), nullptr);
-#endif
   } else if (platform::is_custom_place(self.place())) {
 #if defined(PADDLE_WITH_CUSTOM_DEVICE)
...
@@ -336,13 +329,6 @@ void TensorSetElement(phi::DenseTensor *self, size_t offset, T elem) {
     T *a = self->mutable_data<T>(p);
     paddle::memory::Copy(
         p, a + offset, platform::CPUPlace(), &elem, sizeof(T), nullptr);
 #endif
-  } else if (platform::is_mlu_place(self->place())) {
-#ifdef PADDLE_WITH_MLU
-    auto p = self->place();
-    T *a = self->mutable_data<T>(p);
-    paddle::memory::Copy(
-        p, a + offset, platform::CPUPlace(), &elem, sizeof(T), nullptr);
-#endif
   } else if (platform::is_custom_place(self->place())) {
 #if defined(PADDLE_WITH_CUSTOM_DEVICE)
...
@@ -413,21 +399,6 @@ void SetTensorFromPyArrayT(
     PADDLE_THROW(platform::errors::PermissionDenied(
         "Cannot use IPUPlace in CPU/GPU/XPU/NPU version, "
         "Please recompile or reinstall Paddle with IPU support."));
 #endif
-  } else if (paddle::platform::is_mlu_place(place)) {
-#ifdef PADDLE_WITH_MLU
-    platform::Place tmp_place = place;
-    platform::MLUDeviceGuard guard(tmp_place.device);
-    auto dst = self->mutable_data<T>(place);
-    platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance();
-    auto dev_ctx = static_cast<platform::MLUDeviceContext *>(pool.Get(place));
-    paddle::platform::MLUMemcpyH2DAsync(
-        dst, array.data(), array.nbytes(), dev_ctx->stream());
-    dev_ctx->Wait();
-#else
-    PADDLE_THROW(platform::errors::PermissionDenied(
-        "Cannot use MLUPlace in CPU/GPU version, "
-        "Please recompile or reinstall Paddle with MLU support."));
-#endif
   } else if (paddle::platform::is_custom_place(place)) {
 #ifdef PADDLE_WITH_CUSTOM_DEVICE
...
@@ -779,10 +750,6 @@ inline phi::DenseTensor *_getTensor(const phi::DenseTensor &self,
   } else if (platform::is_xpu_place(place)) {
 #ifdef PADDLE_WITH_XPU
     output->mutable_data(place, self.dtype());
 #endif
-  } else if (platform::is_mlu_place(place)) {
-#ifdef PADDLE_WITH_MLU
-    output->mutable_data(place, self.dtype());
-#endif
   } else {
 #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
...
@@ -1064,39 +1031,6 @@ inline py::array TensorToPyArray(const phi::DenseTensor &tensor,
     PADDLE_THROW(platform::errors::PermissionDenied(
         "Cannot use CUDAPlace in CPU only version, "
         "Please recompile or reinstall Paddle with CUDA support."));
 #endif
-  } else if (is_mlu_tensor) {
-#ifdef PADDLE_WITH_MLU
-    py::array py_arr(py::dtype(py_dtype_str.c_str()), py_dims, py_strides);
-    PADDLE_ENFORCE_EQ(py_arr.writeable(),
-                      true,
-                      platform::errors::InvalidArgument(
-                          "PyArray is not writable, in which case memory leak "
-                          "or double free would occur"));
-    PADDLE_ENFORCE_EQ(
-        py_arr.owndata(),
-        true,
-        platform::errors::InvalidArgument(
-            "PyArray does not own data, in which case memory leak "
-            "or double free would occur"));
-    size_t copy_bytes = sizeof_dtype * numel;
-    auto p = tensor.place();
-    platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance();
-    auto &ctx = *pool.Get(tensor.place());
-    paddle::memory::Copy(
-        platform::CPUPlace(),
-        py_arr.mutable_data(),
-        p,
-        tensor_buf_ptr,
-        copy_bytes,
-        reinterpret_cast<const platform::MLUDeviceContext &>(ctx).stream());
-    ctx.Wait();
-    return py_arr;
-#else
-    PADDLE_THROW(platform::errors::PermissionDenied(
-        "Cannot use MLUPlace in CPU/GPU/XPU/NPU version, "
-        "Please recompile or reinstall Paddle with MLU support."));
-#endif
   } else if (is_custom_device_tensor) {
 #ifdef PADDLE_WITH_CUSTOM_DEVICE
...
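TensorGetElement and TensorSetElement above move a single element by copying sizeof(T) bytes at an element offset; the deleted MLU branch was just another source or destination for that copy. A standalone sketch of the same offset-and-copy idea on host memory follows; GetElement and SetElement are hypothetical helpers, and plain std::memcpy stands in for paddle::memory::Copy.

// Standalone sketch of the TensorGetElement/TensorSetElement idea.
// On a device place the std::memcpy below becomes a device<->host
// paddle::memory::Copy on the appropriate stream.
#include <cstddef>
#include <cstring>
#include <iostream>
#include <vector>

template <typename T>
T GetElement(const std::vector<T> &buffer, std::size_t offset) {
  T value;
  std::memcpy(&value, buffer.data() + offset, sizeof(T));
  return value;
}

template <typename T>
void SetElement(std::vector<T> *buffer, std::size_t offset, T value) {
  std::memcpy(buffer->data() + offset, &value, sizeof(T));
}

int main() {
  std::vector<float> buf = {1.0f, 2.0f, 3.0f};
  SetElement(&buf, 1, 7.0f);
  std::cout << GetElement(buf, 1) << std::endl;  // prints 7
  return 0;
}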
paddle/phi/backends/device_memory_aligment.h
View file @ e75c01f9
...
@@ -21,9 +21,6 @@ limitations under the License. */
 #include "paddle/phi/core/errors.h"
 #include "paddle/phi/backends/gpu/gpu_info.h"
-#ifdef PADDLE_WITH_MLU
-#include "paddle/phi/backends/mlu/mlu_info.h"
-#endif
 namespace phi {
...
@@ -42,11 +39,9 @@ inline size_t Alignment(size_t size,
   alignment = phi::backends::gpu::GpuMinChunkSize();
 #elif defined(PADDLE_WITH_XPU)
   alignment = alignment;
-#elif defined(PADDLE_WITH_MLU)
-  alignment = phi::backends::mlu::MLUMinChunkSize();
 #else
   PADDLE_THROW(phi::errors::PreconditionNotMet(
-      "Fluid is not compiled with CUDA/XPU/NPU/MLU."));
+      "Fluid is not compiled with CUDA/XPU/NPU."));
 #endif
 }
 }
...
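The excerpt above only shows Alignment choosing a per-backend minimum chunk size (256 bytes for MLU, per the deleted mlu_info.h below); the requested size is then rounded up to a multiple of that value, although the round-up itself is outside the excerpt. A minimal sketch of that round-up, assuming the usual remainder-based formula; AlignUp and kMinChunkSize are hypothetical names, not the phi implementation.

// Hedged sketch of rounding an allocation size up to a chunk-size alignment.
// kMinChunkSize mirrors MLUMinChunkSize() = 1 << 8 from the deleted header.
#include <cstddef>
#include <iostream>

constexpr std::size_t kMinChunkSize = 1 << 8;  // 256 bytes

std::size_t AlignUp(std::size_t size, std::size_t alignment) {
  const std::size_t remaining = size % alignment;
  return remaining == 0 ? size : size + (alignment - remaining);
}

int main() {
  std::cout << AlignUp(300, kMinChunkSize) << std::endl;  // prints 512
  return 0;
}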
paddle/phi/backends/mlu/mlu_info.h
Deleted 100644 → 0
View file @ 075d6b14
/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */

#pragma once

#ifdef PADDLE_WITH_MLU
namespace phi {
namespace backends {
namespace mlu {

//! Get the minimum chunk size for MLU buddy allocator.
inline size_t MLUMinChunkSize() {
  // Allow to allocate the minimum chunk size is 256 bytes.
  return 1 << 8;
}

}  // namespace mlu
}  // namespace backends
}  // namespace phi
#endif
paddle/phi/common/place.h
View file @ e75c01f9
...
@@ -193,16 +193,6 @@ class IPUPlace : public Place {
       : Place(AllocationType::IPU, place.GetDeviceId()) {}
 };

-class MLUPlace : public Place {
- public:
-  MLUPlace() : Place(AllocationType::MLU, 0) {}
-  explicit MLUPlace(int device_id) : Place(AllocationType::MLU, device_id) {}
-
-  MLUPlace(const MLUPlace&) = default;
-  MLUPlace(const Place& place)  // NOLINT
-      : Place(AllocationType::MLU, place.GetDeviceId()) {}
-};
-
 class CustomPlace : public Place {
  public:
   CustomPlace() : Place(AllocationType::CUSTOM, 0, "") {}
...
paddle/phi/core/utils/visit_place.h
View file @ e75c01f9
...
@@ -62,15 +62,6 @@ typename Visitor::result_type VisitPlace(const phi::Place& place,
       PADDLE_THROW(phi::errors::Unavailable(
           ("Paddle is not compiled with IPU. Cannot visit ipu device")));
       return typename Visitor::result_type();
 #endif
     }
-    case phi::AllocationType::MLU: {
-#ifdef PADDLE_WITH_MLU
-      phi::MLUPlace p(place.GetDeviceId());
-      return visitor(p);
-#else
-      PADDLE_THROW(phi::errors::Unavailable(
-          ("Paddle is not compiled with MLU. Cannot visit mlu device")));
-#endif
-    }
     case phi::AllocationType::CUSTOM: {
...
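VisitPlace dispatches on the place's AllocationType and either calls the visitor with a concrete place type or throws when that backend was not compiled in; the deleted case above is the MLU arm of that switch. Below is a self-contained sketch of the pattern; AllocType, ToyPlace and PrintVisitor are hypothetical, and only the dispatch shape mirrors visit_place.h.

// Standalone sketch of the VisitPlace-style switch.
#include <iostream>
#include <stdexcept>

enum class AllocType { CPU, GPU, MLU };

struct ToyPlace {
  AllocType type;
  int device_id;
};

struct PrintVisitor {
  using result_type = void;
  void operator()(const ToyPlace &p) const {
    std::cout << "visiting device " << p.device_id << std::endl;
  }
};

template <typename Visitor>
typename Visitor::result_type VisitPlace(const ToyPlace &place,
                                         const Visitor &visitor) {
  switch (place.type) {
    case AllocType::CPU:
    case AllocType::GPU:
      // Supported backends hand a concrete place to the visitor.
      return visitor(place);
    case AllocType::MLU:
      // After this commit there is no MLU arm; unsupported backends throw.
      throw std::runtime_error("Paddle is not compiled with MLU.");
  }
  throw std::runtime_error("unknown allocation type");
}

int main() {
  VisitPlace(ToyPlace{AllocType::GPU, 0}, PrintVisitor{});
  return 0;
}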
paddle/phi/kernels/funcs/activation_functor.h
View file @ e75c01f9
...
@@ -1980,11 +1980,7 @@ struct HardSigmoidGradFunctor : public BaseActivationFunctor<T> {
   }

   static constexpr ActBwdOpFwdDeps FwdDeps() {
-#ifdef PADDLE_WITH_MLU
-    return ActBwdOpFwdDeps::kDepX;
-#else
     return ActBwdOpFwdDeps::kDepOut;
-#endif
   }
 };
...
paddle/phi/kernels/funcs/math_function.cc
View file @ e75c01f9
...
@@ -203,13 +203,6 @@ void set_constant_with_place<phi::CPUPlace>(const phi::DeviceContext& context,
   phi::VisitDataType(tensor->dtype(), TensorSetConstantCPU(tensor, value));
 }

-template <>
-void set_constant_with_place<phi::MLUPlace>(const phi::DeviceContext& context,
-                                            phi::DenseTensor* tensor,
-                                            float value) {
-  PADDLE_THROW(phi::errors::Unimplemented("MLUPlace is not supported"));
-}
-
 template <>
 void set_constant_with_place<phi::GPUPinnedPlace>(
     const phi::DeviceContext& context, phi::DenseTensor* tensor, float value) {
...
paddle/phi/kernels/funcs/strided_memcpy.h
View file @ e75c01f9
...
@@ -56,8 +56,7 @@ inline void CopyWithContext(const Context& ctx,
                             const Place& src_place,
                             const void* src,
                             size_t num) {
-#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \
-    defined(PADDLE_WITH_MLU)
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
   memory_utils::Copy(dst_place, dst, src_place, src, num, ctx.stream());
 #else
   PADDLE_THROW(
...
python/paddle/fluid/__init__.py
View file @ e75c01f9
...
@@ -72,7 +72,6 @@ from .core import (
     CUDAPlace,
     CUDAPinnedPlace,
     IPUPlace,
-    MLUPlace,
     CustomPlace,
 )
 from .lod_tensor import create_lod_tensor, create_random_int_lodtensor
...
@@ -127,7 +126,6 @@ __all__ = (
     'CUDAPlace',
     'CUDAPinnedPlace',
     'IPUPlace',
-    'MLUPlace',
     'Tensor',
     'ParamAttr',
     'WeightNormParamAttr',
...
python/paddle/framework/__init__.py
View file @ e75c01f9
...
@@ -25,7 +25,6 @@ from ..fluid.core import IPUPlace  # noqa: F401
 from ..fluid.core import CUDAPlace  # noqa: F401
 from ..fluid.core import CUDAPinnedPlace  # noqa: F401
 from ..fluid.core import NPUPlace  # noqa: F401
-from ..fluid.core import MLUPlace  # noqa: F401
 from ..fluid.core import CustomPlace  # noqa: F401
 from ..fluid import core  # noqa: F401
...
test/CMakeLists.txt
View file @ e75c01f9
...
@@ -170,9 +170,6 @@ if(${len} GREATER_EQUAL 1)
     if(WITH_XPU)
       target_link_libraries(${test_name} xpulib)
     endif()
-    if(WITH_MLU)
-      target_link_libraries(${test_name} neuware_lib)
-    endif()
     if(NOT
        ("${test_name}" STREQUAL "c_broadcast_op_npu_test"
         OR "${test_name}" STREQUAL "c_allreduce_sum_op_npu_test"
...
test/cpp/imperative/CMakeLists.txt
View file @ e75c01f9
...
@@ -28,12 +28,6 @@ else()
       SRCS bkcl_context_test.cc
       DEPS bkcl_context)
   endif()
-  if(WITH_CNCL)
-    cc_test(
-      cncl_context_test
-      SRCS cncl_context_test.cc
-      DEPS cncl_context)
-  endif()
 endif()

 cc_test(
...
test/cpp/imperative/cncl_context_test.cc
Deleted 100644 → 0
View file @ 075d6b14
/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */

#include "paddle/fluid/imperative/cncl_context.h"

#include <thread>  // NOLINT

#include "gtest/gtest.h"
#include "paddle/fluid/framework/tensor_util.h"
#include "paddle/fluid/framework/variable.h"
#include "paddle/fluid/platform/gen_comm_id_helper.h"

namespace imperative = paddle::imperative;
namespace platform = paddle::platform;
namespace framework = paddle::framework;

// Node1: FLAGS_selected_mlus=0 PADDLE_TRAINER_ID=0 ./cncl_context_test
// Node2: FLAGS_selected_mlus=1 PADDLE_TRAINER_ID=1 ./cncl_context_test

int nrings = 1;
imperative::ParallelStrategy GetStrategy(int local_rank) {
  std::vector<std::string> eps = {"127.0.0.1:9866", "localhost:9867"};
  imperative::ParallelStrategy strategy;
  strategy.trainer_endpoints_ = eps;
  strategy.current_endpoint_ = eps[local_rank];
  strategy.nranks_ = 2;
  strategy.local_rank_ = local_rank;
  strategy.nrings_ = nrings;
  return strategy;
}

#if defined(PADDLE_WITH_CNCL)
void Broadcast(int local_rank, int device_id) {
  int data_size = 4;
  float test_data = 7;
  const auto& place = platform::MLUPlace(device_id);
  platform::MLUDeviceContext ctx(place);

  imperative::CNCLParallelContext cpc(GetStrategy(local_rank), place);

  // init
  cpc.Init();

  framework::Variable* src_dev_var(new framework::Variable());
  auto* src_dev_tensor = src_dev_var->GetMutable<phi::DenseTensor>();
  src_dev_tensor->mutable_data<float>(phi::make_ddim({data_size}), place);

  // fill data for rank 0 only
  std::vector<float> src_vec;
  if (local_rank == 0) {
    for (int i = 0; i < data_size; ++i) {
      src_vec.push_back(test_data);
    }
    framework::TensorFromVector(src_vec, ctx, src_dev_tensor);
  }
  ctx.Wait();

  // call broadcast
  cpc.Broadcast(src_dev_var, 0);
  std::this_thread::sleep_for(std::chrono::milliseconds(1000));

  // check result
  std::vector<float> dst_vec;
  framework::TensorToVector(*src_dev_tensor, ctx, &dst_vec);
  ctx.Wait();

  for (int i = 0; i < data_size; ++i) {
    EXPECT_EQ(dst_vec[i], test_data);
  }
}

TEST(Broadcast, Run) {
  if (platform::GetMLUDeviceCount() >= 2) {
    int local_rank = atoi(getenv("PADDLE_TRAINER_ID"));
    int device_id = atoi(getenv("FLAGS_selected_mlus"));
    Broadcast(local_rank, device_id);
  }
}

void AllReduceByStream(int local_rank, int device_id) {
  int data_size = 32;
  const auto& place = platform::MLUPlace(device_id);
  platform::MLUDeviceContext ctx(place);

  imperative::CNCLParallelContext cpc(GetStrategy(local_rank), place);

  // init
  cpc.Init();

  // input data
  framework::Variable* src_dev_var(new framework::Variable());
  auto* src_dev_tensor = src_dev_var->GetMutable<phi::DenseTensor>();
  src_dev_tensor->mutable_data<float>(phi::make_ddim({data_size}), place);

  // fill input data
  std::vector<float> src_vec;
  for (int i = 0; i < data_size; ++i) {
    src_vec.push_back(1.0 + local_rank);
  }
  framework::TensorFromVector(src_vec, ctx, src_dev_tensor);
  ctx.Wait();

  // output data
  framework::Variable* dst_dev_var(new framework::Variable());
  auto* dst_dev_tensor = dst_dev_var->GetMutable<phi::DenseTensor>();
  dst_dev_tensor->mutable_data<float>(phi::make_ddim({data_size}), place);

  // call allreduce
  cpc.AllReduceByStream(*src_dev_var, dst_dev_var, 0, false);
  std::this_thread::sleep_for(std::chrono::milliseconds(1000));

  // check result
  std::vector<float> dst_vec;
  framework::TensorToVector(*dst_dev_tensor, ctx, &dst_vec);
  ctx.Wait();

  EXPECT_EQ(dst_vec.size(), src_vec.size());
  for (int i = 0; i < data_size; ++i) {
    EXPECT_EQ(dst_vec[i], 3.0);
  }
}

TEST(AllReduceByStream, Run) {
  if (platform::GetMLUDeviceCount() >= 2) {
    int local_rank = atoi(getenv("PADDLE_TRAINER_ID"));
    int device_id = atoi(getenv("FLAGS_selected_mlus"));
    AllReduceByStream(local_rank, device_id);
  }
}
#endif
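The deleted test is launched once per rank, with PADDLE_TRAINER_ID selecting the rank and FLAGS_selected_mlus the device, as the comment at the top of the file shows, and it reads both with atoi(getenv(...)). A small standalone sketch of that launch convention with a null-safe fallback; EnvToInt is a hypothetical helper, not part of the original test.

// Hedged sketch of the env-var driven launch used by the deleted test.
// The original calls atoi(getenv(...)) directly and assumes the launcher
// always exports both variables.
#include <cstdlib>
#include <iostream>

int EnvToInt(const char *name, int default_value) {
  const char *value = std::getenv(name);
  return value != nullptr ? std::atoi(value) : default_value;
}

int main() {
  const int local_rank = EnvToInt("PADDLE_TRAINER_ID", 0);
  const int device_id = EnvToInt("FLAGS_selected_mlus", 0);
  std::cout << "rank " << local_rank << " uses device " << device_id
            << std::endl;
  return 0;
}

Run one process per node, for example FLAGS_selected_mlus=0 PADDLE_TRAINER_ID=0 ./cncl_context_test and FLAGS_selected_mlus=1 PADDLE_TRAINER_ID=1 ./cncl_context_test, matching the comment in the deleted source.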
test/cpp/imperative/test_group.cc
View file @ e75c01f9
...
@@ -76,8 +76,7 @@ void GroupConcatSplit(Place place, size_t size) {
     value.push_back(static_cast<T>(1.0 * j));
   }

-  if (std::is_same<Place, platform::CUDAPlace>::value ||
-      std::is_same<Place, platform::MLUPlace>::value) {
+  if (std::is_same<Place, platform::CUDAPlace>::value) {
 #if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || \
     defined(PADDLE_WITH_CNCL)
     paddle::memory::Copy(
...
@@ -185,20 +184,5 @@ TEST(TestGroup, TestXPUConcatSplit) {
   GroupConcatSplit<float>(xpu_place, size);
 }
 #endif

-#if defined(PADDLE_WITH_CNCL)
-TEST(TestGroup, TestMLUConcatSplit) {
-  platform::MLUPlace mlu_place(0);
-  platform::CPUPlace cpu_place;
-
-  int size = 3;
-  GroupConcatSplit<float>(cpu_place, size);
-  GroupConcatSplit<float>(mlu_place, size);
-
-  size = 15;
-  GroupConcatSplit<float>(cpu_place, size);
-  GroupConcatSplit<float>(mlu_place, size);
-}
-#endif
 }  // namespace imperative
 }  // namespace paddle