Crayon鑫 / Paddle (forked from PaddlePaddle / Paddle)
Commit 883ee1a3 (unverified)
Authored by wanghuancoder on Dec 10, 2021; committed by GitHub on Dec 10, 2021.

    Merge branch 'develop' into revert-37926-eager_coreops_500

Parents: 823208fe, 11c785a4

Showing 141 changed files with 7,103 additions and 930 deletions (+7103 -930).
paddle/fluid/distributed/fleet_executor/carrier.h (+5, -0)
paddle/fluid/distributed/fleet_executor/compute_interceptor.cc (+3, -0)
paddle/fluid/eager/accumulation/gradient_accumulation.cc (+16, -0)
paddle/fluid/framework/dlpack_tensor.cc (+5, -0)
paddle/fluid/framework/executor.cc (+8, -0)
paddle/fluid/framework/ir/CMakeLists.txt (+1, -1)
paddle/fluid/framework/ir/fc_elementwise_layernorm_fuse_pass.cc (+7, -0)
paddle/fluid/framework/ir/ipu/avg_shard_pass.cc (+56, -0)
paddle/fluid/framework/ir/ipu/avg_shard_pass.h (+30, -0)
paddle/fluid/framework/ir/ipu/forward_graph_extract_pass.cc (+133, -0)
paddle/fluid/framework/ir/ipu/forward_graph_extract_pass.h (+31, -0)
paddle/fluid/framework/ir/ipu/infer_shape_pass.cc (+108, -0)
paddle/fluid/framework/ir/ipu/infer_shape_pass.h (+30, -0)
paddle/fluid/framework/ir/ipu/inference_postprocess_pass.cc (+89, -0)
paddle/fluid/framework/ir/ipu/inference_postprocess_pass.h (+30, -0)
paddle/fluid/framework/ir/ipu/inference_process_pass.cc (+129, -0)
paddle/fluid/framework/ir/ipu/inference_process_pass.h (+30, -0)
paddle/fluid/framework/ir/ipu/ipu_graph_builder_pass.cc (+52, -0)
paddle/fluid/framework/ir/ipu/ipu_graph_builder_pass.h (+31, -0)
paddle/fluid/framework/ir/ipu/ipu_inplace_pass.cc (+85, -0)
paddle/fluid/framework/ir/ipu/ipu_inplace_pass.h (+30, -0)
paddle/fluid/framework/ir/ipu/ipu_pass_base.cc (+28, -0)
paddle/fluid/framework/ir/ipu/ipu_pass_base.h (+37, -0)
paddle/fluid/framework/ir/ipu/ipu_runtime_replacer_pass.cc (+97, -0)
paddle/fluid/framework/ir/ipu/ipu_runtime_replacer_pass.h (+31, -0)
paddle/fluid/framework/ir/ipu/optimizer_extract_pass.cc (+91, -0)
paddle/fluid/framework/ir/ipu/optimizer_extract_pass.h (+31, -0)
paddle/fluid/framework/ir/ipu/optimizer_state_align_pass.cc (+79, -0)
paddle/fluid/framework/ir/ipu/optimizer_state_align_pass.h (+36, -0)
paddle/fluid/framework/ir/ipu/popart_canonicalization_pass.cc (+68, -0)
paddle/fluid/framework/ir/ipu/popart_canonicalization_pass.h (+30, -0)
paddle/fluid/framework/ir/transpose_flatten_concat_fuse_pass.cc (+31, -1)
paddle/fluid/framework/op_registry.h (+3, -0)
paddle/fluid/framework/paddle2cinn/CMakeLists.txt (+1, -1)
paddle/fluid/framework/paddle2cinn/cinn_cache_key.cc (+51, -37)
paddle/fluid/framework/paddle2cinn/cinn_cache_key.h (+36, -6)
paddle/fluid/framework/paddle2cinn/cinn_cache_key_test.cc (+112, -10)
paddle/fluid/framework/paddle2cinn/cinn_compiler.cc (+29, -6)
paddle/fluid/framework/paddle2cinn/cinn_compiler.h (+13, -2)
paddle/fluid/framework/pten_utils.cc (+4, -2)
paddle/fluid/framework/tensor_util.cc (+46, -5)
paddle/fluid/imperative/gradient_accumulator.cc (+7, -0)
paddle/fluid/imperative/prepared_operator.cc (+8, -0)
paddle/fluid/imperative/reducer.cc (+0, -64)
paddle/fluid/memory/allocation/allocator_facade.cc (+3, -2)
paddle/fluid/memory/allocation/naive_best_fit_allocator.cc (+28, -0)
paddle/fluid/operators/cinn/cinn_launch_context.cc (+97, -70)
paddle/fluid/operators/cinn/cinn_launch_context.h (+19, -18)
paddle/fluid/operators/cinn/cinn_launch_context_test.cc (+38, -33)
paddle/fluid/operators/cinn/cinn_launch_op.cu.cc (+0, -20)
paddle/fluid/operators/cinn/cinn_launch_op.h (+44, -65)
paddle/fluid/operators/cinn/cinn_launch_op_test.cc (+3, -2)
paddle/fluid/operators/complex_view_op.cc (+163, -0)
paddle/fluid/operators/complex_view_op.cu (+29, -0)
paddle/fluid/operators/complex_view_op.h (+60, -0)
paddle/fluid/operators/ipu_runtime_op.cc (+62, -0)
paddle/fluid/operators/ipu_runtime_op.h (+69, -0)
paddle/fluid/operators/math/math_function.cc (+7, -0)
paddle/fluid/operators/py_layer_op.h (+1, -1)
paddle/fluid/operators/reshape_op.cc (+2, -2)
paddle/fluid/platform/device/gpu/cuda/cuda_graph.cc (+14, -0)
paddle/fluid/platform/device/gpu/cuda/cuda_graph.h (+22, -0)
paddle/fluid/platform/device/gpu/gpu_primitives.h (+15, -0)
paddle/fluid/platform/device/ipu/CMakeLists.txt (+1, -1)
paddle/fluid/platform/device/ipu/popart_canonicalization/activation_ops.cc (+72, -0)
paddle/fluid/platform/device/ipu/popart_canonicalization/canonicalization_utils.cc (+185, -0)
paddle/fluid/platform/device/ipu/popart_canonicalization/canonicalization_utils.h (+64, -0)
paddle/fluid/platform/device/ipu/popart_canonicalization/elementwise_ops.cc (+108, -0)
paddle/fluid/platform/device/ipu/popart_canonicalization/logic_ops.cc (+36, -0)
paddle/fluid/platform/device/ipu/popart_canonicalization/math_ops.cc (+259, -0)
paddle/fluid/platform/device/ipu/popart_canonicalization/nn_ops.cc (+301, -0)
paddle/fluid/platform/device/ipu/popart_canonicalization/op_builder.cc (+195, -0)
paddle/fluid/platform/device/ipu/popart_canonicalization/op_builder.h (+85, -0)
paddle/fluid/platform/device/ipu/supported_ops_autogen.h (+2, -0)
paddle/fluid/platform/device_context.cc (+30, -2)
paddle/fluid/platform/place.cc (+7, -0)
paddle/fluid/platform/place.h (+40, -2)
paddle/fluid/pybind/.gitignore (+2, -1)
paddle/fluid/pybind/pybind.cc (+200, -1)
paddle/fluid/pybind/reader_py.cc (+3, -0)
paddle/fluid/pybind/tensor_py.h (+15, -0)
paddle/pten/api/lib/CMakeLists.txt (+4, -0)
paddle/pten/api/lib/kernel_declare.h (+37, -0)
paddle/pten/api/lib/utils.cc (+6, -2)
paddle/pten/core/kernel_alias_name.h (+2, -2)
paddle/pten/core/kernel_factory.h (+1, -8)
paddle/pten/core/kernel_registry.h (+147, -220)
paddle/pten/kernels/cpu/creation.cc (+2, -4)
paddle/pten/kernels/cpu/linalg.cc (+2, -5)
paddle/pten/kernels/cpu/manipulation.cc (+22, -34)
paddle/pten/kernels/cpu/math.cc (+8, -14)
paddle/pten/kernels/cpu/utils.cc (+1, -4)
paddle/pten/kernels/cuda/creation.cu (+2, -4)
paddle/pten/kernels/cuda/linalg.cu (+2, -4)
paddle/pten/kernels/cuda/manipulation.cu (+22, -38)
paddle/pten/kernels/cuda/math.cu (+8, -11)
paddle/pten/kernels/cuda/utils.cu (+1, -4)
paddle/pten/kernels/hybird/cuda/reduce/reduce_cuda_impl.h (+18, -1)
paddle/pten/kernels/xpu/manipulation.cc (+3, -13)
paddle/pten/kernels/xpu/utils.cc (+1, -4)
paddle/pten/tests/api/test_reshape_api.cc (+0, -6)
python/paddle/__init__.py (+11, -0)
python/paddle/device/__init__.py (+46, -3)
python/paddle/distributed/auto_parallel/operators/dist_matmul.py (+442, -0)
python/paddle/distributed/fleet/meta_optimizers/dygraph_optimizer/sharding_optimizer_stage2.py (+65, -17)
python/paddle/distributed/fleet/meta_parallel/sharding/sharding_stage2.py (+54, -14)
python/paddle/distributed/fleet/meta_parallel/sharding/sharding_utils.py (+151, -0)
python/paddle/distributed/fleet/utils/internal_storage.py (+84, -9)
python/paddle/distribution.py (+2, -1)
python/paddle/fluid/__init__.py (+7, -1)
python/paddle/fluid/contrib/slim/quantization/imperative/qat.py (+8, -3)
python/paddle/fluid/contrib/slim/quantization/post_training_quantization.py (+72, -41)
python/paddle/fluid/contrib/slim/tests/CMakeLists.txt (+2, -0)
python/paddle/fluid/contrib/slim/tests/test_post_training_quantization_while.py (+313, -0)
python/paddle/fluid/dataloader/dataloader_iter.py (+4, -0)
python/paddle/fluid/dygraph/dygraph_to_static/utils.py (+5, -1)
python/paddle/fluid/dygraph/varbase_patch_methods.py (+1, -1)
python/paddle/fluid/executor.py (+20, -0)
python/paddle/fluid/reader.py (+4, -1)
python/paddle/fluid/tests/unittests/dygraph_sharding_stage2.py (+10, -9)
python/paddle/fluid/tests/unittests/dygraph_sharding_stage2_offload.py (+115, -0)
python/paddle/fluid/tests/unittests/dygraph_to_static/test_program_translator.py (+14, -0)
python/paddle/fluid/tests/unittests/ir/inference/auto_scan_test.py (+21, -13)
python/paddle/fluid/tests/unittests/ir/inference/test_adaptive_pool2d_convert_global_pass_autoscan.py (+1, -1)
python/paddle/fluid/tests/unittests/ir/inference/test_fc_elementwise_layernorm_fuse_pass.py (+135, -0)
python/paddle/fluid/tests/unittests/ir/inference/test_transpose_flatten_concat_fuse_pass.py (+131, -56)
python/paddle/fluid/tests/unittests/test_auto_search_dist_matmul_op.py (+397, -0)
python/paddle/fluid/tests/unittests/test_complex_view_op.py (+127, -0)
python/paddle/fluid/tests/unittests/test_cuda_graph.py (+44, -1)
python/paddle/fluid/tests/unittests/test_distribution.py (+23, -0)
python/paddle/fluid/tests/unittests/test_dygraph_sharding_stage2.py (+3, -0)
python/paddle/fluid/tests/unittests/test_fleet_executor.py (+12, -0)
python/paddle/fluid/tests/unittests/test_gcd.py (+93, -0)
python/paddle/fluid/tests/unittests/test_lcm.py (+93, -0)
python/paddle/framework/__init__.py (+1, -0)
python/paddle/tensor/__init__.py (+11, -0)
python/paddle/tensor/manipulation.py (+92, -0)
python/paddle/tensor/math.py (+137, -5)
python/paddle/utils/code_gen/api.yaml (+1, -1)
python/paddle/utils/code_gen/api_gen.py (+1, -17)
tools/parallel_UT_rule.py (+2, -2)
paddle/fluid/distributed/fleet_executor/carrier.h (+5, -0)

@@ -75,6 +75,11 @@ class Carrier final {
   bool IsInit() const;
 
+  // NOTE: This mutex will be used in interceptor's RunOps function.
+  // This mutex is used for avoiding forward ops and backward ops run
+  // simultaneously, which will lead to a random hang for some sync ops.
+  std::mutex run;
+
   DISABLE_COPY_AND_ASSIGN(Carrier);
 
  private:
paddle/fluid/distributed/fleet_executor/compute_interceptor.cc (+3, -0)

@@ -13,6 +13,7 @@
 // limitations under the License.
 
 #include "paddle/fluid/distributed/fleet_executor/compute_interceptor.h"
+#include "paddle/fluid/distributed/fleet_executor/carrier.h"
 #include "paddle/fluid/distributed/fleet_executor/task_node.h"
 #include "paddle/fluid/framework/executor_gc_helper.h"
@@ -169,6 +170,8 @@ void ComputeInterceptor::ReplyCompletedToUpStream() {
 }
 
 void ComputeInterceptor::RunOps() {
+  Carrier& carrier_instance = Carrier::Instance();
+  std::unique_lock<std::mutex> lock(carrier_instance.run);
   VLOG(3) << "ComputeInterceptor " << interceptor_id_ << " running ops for the "
           << step_ + 1 << " time.";
   for (auto op : node_->ops()) {
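Taken together, the two hunks above serialize every interceptor's RunOps call behind one Carrier-wide mutex, which is what prevents forward and backward ops from interleaving. Below is a minimal standalone sketch of that locking pattern; MiniCarrier and the plain RunOps function are hypothetical stand-ins for Paddle's Carrier singleton and ComputeInterceptor::RunOps, not the actual classes.

#include <iostream>
#include <mutex>
#include <thread>

// Hypothetical stand-in for Carrier: a singleton owning the shared mutex.
struct MiniCarrier {
  static MiniCarrier& Instance() {
    static MiniCarrier instance;
    return instance;
  }
  std::mutex run;  // guards every RunOps-style critical section
};

// Stand-in for ComputeInterceptor::RunOps: holding the lock for the whole
// op sequence means "forward" and "backward" ops can never interleave.
void RunOps(const char* role) {
  std::unique_lock<std::mutex> lock(MiniCarrier::Instance().run);
  for (int i = 0; i < 3; ++i) {
    std::cout << role << " op " << i << "\n";
  }
}

int main() {
  std::thread fwd(RunOps, "forward");
  std::thread bwd(RunOps, "backward");
  fwd.join();
  bwd.join();
}

Whichever thread acquires the mutex first prints all three of its ops before the other starts, mirroring the random-hang fix described in the carrier.h comment.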
paddle/fluid/eager/accumulation/gradient_accumulation.cc (+16, -0)

@@ -116,6 +116,22 @@ class TensorAddFunctor : public boost::static_visitor<> {
   }
 #endif
 
+#ifdef PADDLE_WITH_IPU
+  void operator()(const paddle::platform::IPUPlace& place) {
+    PADDLE_THROW(paddle::platform::errors::PermissionDenied(
+        "Gradient accumulation on place (%s) "
+        "is not supported in imperative mode",
+        place));
+  }
+#else
+  void operator()(const paddle::platform::IPUPlace& place) {
+    PADDLE_THROW(paddle::platform::errors::PermissionDenied(
+        "Gradient accumulation on place (%s) "
+        "is not supported in imperative mode",
+        place));
+  }
+#endif
+
   void operator()(const paddle::platform::NPUPinnedPlace& place) {
     PADDLE_THROW(paddle::platform::errors::PermissionDenied(
         "Gradient accumulation on place (%s) "
paddle/fluid/framework/dlpack_tensor.cc (+5, -0)

@@ -81,6 +81,11 @@ struct DLDeviceVisitor : public boost::static_visitor<::DLDevice> {
     return device;
   }
 
+  inline ::DLDevice operator()(const platform::IPUPlace &place) const {
+    PADDLE_THROW(
+        platform::errors::Unimplemented("platform::IPUPlace is not supported"));
+  }
+
   inline ::DLDevice operator()(const platform::XPUPlace &place) const {
     PADDLE_THROW(
         platform::errors::Unimplemented("platform::XPUPlace is not supported"));
paddle/fluid/framework/executor.cc (+8, -0)

@@ -463,6 +463,14 @@ void Executor::RunPartialPreparedContext(ExecutorPrepareContext* ctx,
 #else
       PADDLE_THROW(
           platform::errors::Unimplemented("No XPU gc found in CPU/GPU paddle"));
 #endif
+    } else if (platform::is_ipu_place(place_)) {
+#ifdef PADDLE_WITH_IPU
+      gc.reset(new IPUGarbageCollector(
+          BOOST_GET_CONST(platform::IPUPlace, place_), max_memory_size));
+#else
+      PADDLE_THROW(
+          platform::errors::Unimplemented("No IPU gc found in CPU/IPU paddle"));
+#endif
     } else if (platform::is_npu_place(place_)) {
 #ifdef PADDLE_WITH_ASCEND_CL
paddle/fluid/framework/ir/CMakeLists.txt (+1, -1)

@@ -156,7 +156,7 @@
 cc_test(test_repeated_fc_relu_fuse_pass_cc SRCS repeated_fc_relu_fuse_pass_tester.cc DEPS repeated_fc_relu_fuse_pass framework_proto)
 cc_test(test_is_test_pass SRCS is_test_pass_tester.cc DEPS is_test_pass)
 cc_test(test_simplify_with_basic_ops_pass SRCS simplify_with_basic_ops_pass_tester.cc DEPS simplify_with_basic_ops_pass)
-cc_test(test_fc_elementwise_layernorm_fuse_pass SRCS fc_elementwise_layernorm_fuse_pass_tester.cc DEPS fc_elementwise_layernorm_fuse_pass)
+cc_test(test_fc_elementwise_layernorm_fuse_pass_cc SRCS fc_elementwise_layernorm_fuse_pass_tester.cc DEPS fc_elementwise_layernorm_fuse_pass)
 cc_test(test_skip_layernorm_fuse_pass SRCS skip_layernorm_fuse_pass_tester.cc DEPS skip_layernorm_fuse_pass)
 cc_test(test_multihead_matmul_fuse_pass SRCS multihead_matmul_fuse_pass_tester.cc DEPS multihead_matmul_fuse_pass)
 cc_test(test_conv_bn_fuse_pass_cc SRCS conv_bn_fuse_pass_tester.cc DEPS conv_bn_fuse_pass)
paddle/fluid/framework/ir/fc_elementwise_layernorm_fuse_pass.cc (+7, -0)

@@ -17,6 +17,7 @@ limitations under the License. */
 #include <string>
 
 #include "paddle/fluid/framework/ir/graph_pattern_detector.h"
+#include "paddle/fluid/framework/op_version_registry.h"
 
 namespace paddle {
 namespace framework {
@@ -338,3 +339,9 @@ void FCElementwiseLayerNormFusePass::ApplyImpl(ir::Graph *graph) const {
 
 REGISTER_PASS(fc_elementwise_layernorm_fuse_pass,
               paddle::framework::ir::FCElementwiseLayerNormFusePass);
+REGISTER_PASS_CAPABILITY(fc_elementwise_layernorm_fuse_pass)
+    .AddCombination(
+        paddle::framework::compatible::OpVersionComparatorCombination()
+            .EQ("fc", 0)
+            .LE("elementwise_add", 1)
+            .EQ("layer_norm", 0));
paddle/fluid/framework/ir/ipu/avg_shard_pass.cc (new file, +56, -0)

// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include "paddle/fluid/framework/ir/ipu/avg_shard_pass.h"

#include "paddle/fluid/platform/device/ipu/ipu_backend.h"

#include "paddle/fluid/framework/ir/graph_helper.h"
#include "paddle/fluid/framework/ir/pass_tester_helper.h"

namespace paddle {
namespace framework {
namespace ir {

void AvgShardPass::ApplyImpl(ir::Graph* graph) const {
  VLOG(10) << "enter AvgShardPass::ApplyImpl";

  std::shared_ptr<platform::ipu::IpuBackend> ipu_backend =
      platform::ipu::IpuBackend::GetInstance();

  if (ipu_backend->GetIpuStrategy()->need_avg_shard) {
    VLOG(10) << "start AvgShardPass";
    auto nodes = ir::TopologySortOperations(*graph);
    auto num_ipus = ipu_backend->GetIpuStrategy()->num_ipus;

    int shard_position = nodes.size() / num_ipus;
    int index_and_stage = -1;
    for (int i = 0; i < nodes.size(); i++) {
      if ((i % shard_position) == 0 && index_and_stage < num_ipus - 1) {
        index_and_stage++;
      }
      nodes[i]->Op()->SetAttr("ipu_index", index_and_stage);
      nodes[i]->Op()->SetAttr("ipu_stage", index_and_stage);
    }
    VLOG(10) << "end AvgShardPass";
  }

  VLOG(10) << "leave AvgShardPass::ApplyImpl";
}

}  // namespace ir
}  // namespace framework
}  // namespace paddle

REGISTER_PASS(avg_shard_pass, paddle::framework::ir::AvgShardPass);
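The sharding arithmetic above splits the topologically sorted op list into num_ipus contiguous chunks of roughly equal size, with any remainder ops landing on the last IPU. A small self-contained sketch of just that index computation (a hypothetical driver mirroring the loop above, with illustrative sizes):

#include <iostream>
#include <vector>

int main() {
  // Illustrative example: 10 ops spread across 4 IPUs.
  const int num_ops = 10;
  const int num_ipus = 4;
  const int shard_position = num_ops / num_ipus;  // boundary every 2 ops

  int index_and_stage = -1;
  std::vector<int> assignment(num_ops);
  for (int i = 0; i < num_ops; ++i) {
    // Advance to the next IPU every shard_position ops, but never past the
    // last IPU, so the remainder ops all land on IPU num_ipus - 1.
    if ((i % shard_position) == 0 && index_and_stage < num_ipus - 1) {
      ++index_and_stage;
    }
    assignment[i] = index_and_stage;
  }
  for (int i = 0; i < num_ops; ++i) {
    std::cout << "op " << i << " -> IPU " << assignment[i] << "\n";
  }
  // Prints: ops 0-1 -> IPU 0, 2-3 -> IPU 1, 4-5 -> IPU 2, 6-9 -> IPU 3.
}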
paddle/fluid/framework/ir/ipu/avg_shard_pass.h (new file, +30, -0)

// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
// Licensed under the Apache License, Version 2.0 (the "License");
// http://www.apache.org/licenses/LICENSE-2.0

#pragma once

#include "paddle/fluid/framework/ir/ipu/ipu_pass_base.h"

namespace paddle {
namespace framework {
namespace ir {

class AvgShardPass : public IPUPassBase {
 protected:
  void ApplyImpl(ir::Graph* graph) const override;
};

}  // namespace ir
}  // namespace framework
}  // namespace paddle
paddle/fluid/framework/ir/ipu/forward_graph_extract_pass.cc (new file, +133, -0)

// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
// Licensed under the Apache License, Version 2.0 (the "License");
// http://www.apache.org/licenses/LICENSE-2.0

#include "paddle/fluid/framework/ir/ipu/forward_graph_extract_pass.h"

#include "paddle/fluid/framework/ir/pass_tester_helper.h"

namespace paddle {
namespace framework {
namespace ir {

void ForwardGraphExtractPass::ApplyImpl(ir::Graph* graph) const {
  VLOG(10) << "enter ForwardGraphExtractPass::ApplyImpl";

  std::unordered_map<OpRole, std::unordered_set<ir::Node*>> all_ops{
      {OpRole::kForward, {}},  {OpRole::kBackward, {}},
      {OpRole::kOptimize, {}}, {OpRole::kRPC, {}},
      {OpRole::kDist, {}},     {OpRole::kLRSched, {}},
      {OpRole::kLoss, {}},     {OpRole::kNotSpecified, {}}};
  for (auto* node : graph->Nodes()) {
    if (!node->IsOp()) {
      continue;
    }
    auto op_role = BOOST_GET_MUTABLE(int, node->Op()->GetAttr("op_role"));
    if (op_role == static_cast<int>(OpRole::kForward)) {
      all_ops[OpRole::kForward].insert(node);
    } else if (op_role == static_cast<int>(OpRole::kBackward)) {
      all_ops[OpRole::kBackward].insert(node);
    } else if (op_role == static_cast<int>(OpRole::kOptimize)) {
      all_ops[OpRole::kOptimize].insert(node);
    } else if (op_role == static_cast<int>(OpRole::kRPC)) {
    } else if (op_role == static_cast<int>(OpRole::kDist)) {
    } else if (op_role == static_cast<int>(OpRole::kLRSched)) {
    } else if (op_role == static_cast<int>(OpRole::kLoss)) {
      all_ops[OpRole::kLoss].insert(node);
    } else if (op_role == static_cast<int>(OpRole::kNotSpecified)) {
      LOG(WARNING) << "Op: " << node->Name() << " OpRole is NotSpecified ";
    }
  }

  std::unordered_set<ir::Node*> forward_vars;
  std::unordered_set<ir::Node*> backward_vars;
  std::unordered_set<ir::Node*> control_vars;
  // forward_vars
  for (auto& nodes : std::array<std::unordered_set<ir::Node*>, 2>{
           all_ops[OpRole::kForward], all_ops[OpRole::kLoss]}) {
    for (auto* node : nodes) {
      for (auto* in_node : node->inputs) {
        forward_vars.insert(in_node);
      }
      for (auto* out_node : node->outputs) {
        forward_vars.insert(out_node);
      }
    }
  }
  // control_vars & backward_vars
  for (auto* node : graph->Nodes()) {
    if (!node->IsVar()) {
      continue;
    }
    if (node->IsCtrlVar()) {
      control_vars.insert(node);
    }
    for (auto* in_node : node->inputs) {
      if (all_ops[OpRole::kOptimize].count(in_node)) {
        backward_vars.insert(node);
      }
    }
  }
  // all removed node
  std::unordered_set<ir::Node*> rm_nodes;
  for (auto* node : graph->Nodes()) {
    if (backward_vars.count(node)) {
      rm_nodes.insert(node);
    } else if (control_vars.count(node)) {
      rm_nodes.insert(node);
    } else if (all_ops[OpRole::kBackward].count(node)) {
      rm_nodes.insert(node);
    } else if (all_ops[OpRole::kForward].count(node) == 0 &&
               all_ops[OpRole::kLoss].count(node) == 0 &&
               forward_vars.count(node) == 0) {
      rm_nodes.insert(node);
    } else if (node->Name() == "feed" || node->Name() == "fetch") {
      rm_nodes.insert(node);
    }
  }

  VLOG(10) << "Remove Node: ";
  for (auto* node : rm_nodes) {
    // rm node releations
    for (auto* node_in : node->inputs) {
      for (size_t i = 0; i < node_in->outputs.size(); ++i) {
        if (node_in->outputs[i] == node) {
          node_in->outputs.erase(node_in->outputs.begin() + i);
          break;
        }
      }
    }
    for (auto* node_out : node->outputs) {
      for (size_t i = 0; i < node_out->inputs.size(); ++i) {
        if (node_out->inputs[i] == node) {
          node_out->inputs.erase(node_out->inputs.begin() + i);
          break;
        }
      }
    }
    VLOG(10) << "\t" << node->Name();
    graph->RemoveNode(node);
  }

  VLOG(10) << "Post Graph: ";
  VLOG(10) << DebugString(graph);
  VLOG(10) << "leave ForwardGraphExtractPass::ApplyImpl";
}

}  // namespace ir
}  // namespace framework
}  // namespace paddle

REGISTER_PASS(forward_graph_extract_pass,
              paddle::framework::ir::ForwardGraphExtractPass);
paddle/fluid/framework/ir/ipu/forward_graph_extract_pass.h (new file, +31, -0)

// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
// Licensed under the Apache License, Version 2.0 (the "License");
// http://www.apache.org/licenses/LICENSE-2.0

#pragma once

#include "paddle/fluid/framework/ir/graph.h"
#include "paddle/fluid/framework/ir/ipu/ipu_pass_base.h"

namespace paddle {
namespace framework {
namespace ir {

class ForwardGraphExtractPass : public IPUPassBase {
 protected:
  void ApplyImpl(ir::Graph* graph) const override;
};

}  // namespace ir
}  // namespace framework
}  // namespace paddle
paddle/fluid/framework/ir/ipu/infer_shape_pass.cc (new file, +108, -0)

// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
// Licensed under the Apache License, Version 2.0 (the "License");
// http://www.apache.org/licenses/LICENSE-2.0

#include "paddle/fluid/framework/ir/ipu/infer_shape_pass.h"

#include "paddle/fluid/platform/device/ipu/ipu_backend.h"

#include "paddle/fluid/framework/ddim.h"
#include "paddle/fluid/framework/ir/graph_helper.h"
#include "paddle/fluid/framework/ir/pass_tester_helper.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/variable_helper.h"

namespace paddle {
namespace framework {
namespace ir {

void InferShapePass::ApplyImpl(ir::Graph* graph) const {
  VLOG(10) << "enter InferShapePass::ApplyImpl";
  VLOG(10) << "Raw Graph: ";
  VLOG(10) << DebugString(graph);

  std::shared_ptr<platform::ipu::IpuBackend> ipu_backend =
      platform::ipu::IpuBackend::GetInstance();
  auto batch_size = ipu_backend->GetIpuStrategy()->batch_size;

  auto feed_list = Get<std::vector<std::string>>("feed_list");
  for (auto node : graph->Nodes()) {
    if (!node->IsVar()) {
      continue;
    }
    bool is_feed = std::find(feed_list.begin(), feed_list.end(),
                             node->Name()) != feed_list.end();
    if (is_feed) {
      auto input_shape = node->Var()->GetShape();
      if (input_shape[0] <= -1) {
        input_shape[0] = batch_size;
        node->Var()->SetShape(input_shape);
      }
      // int64->int32
      if (node->Var()->GetDataType() == proto::VarType::INT64) {
        node->Var()->SetDataType(proto::VarType::INT32);
      }
    }
  }

  // temp scope for shape inference
  std::shared_ptr<paddle::framework::Scope> scope(
      new paddle::framework::Scope());
  for (auto node : graph->Nodes()) {
    if (!node->IsVar()) {
      continue;
    }
    auto var_desc = node->Var();
    auto* ptr = scope->Var(var_desc->Name());
    paddle::framework::InitializeVariable(ptr, var_desc->GetType());

    auto tensor = ptr->GetMutable<paddle::framework::LoDTensor>();
    tensor->Resize(paddle::framework::make_ddim(var_desc->GetShape()));
  }

  // infer shape
  auto nodes = ir::TopologySortOperations(*graph);
  for (auto node : nodes) {
    auto op_desc = node->Op();
    auto op = paddle::framework::OpRegistry::CreateOp(*op_desc);
    paddle::framework::RuntimeContext ctx(op->Inputs(), op->Outputs(), *scope);
    op->RuntimeInferShape(*scope, paddle::platform::CPUPlace(), ctx);

    for (auto it = ctx.outputs.begin(); it != ctx.outputs.end(); it++) {
      for (int i = 0; i < it->second.size(); i++) {
        auto output_name = op_desc->Output(it->first)[i];
        auto dim =
            it->second[i]->GetMutable<paddle::framework::LoDTensor>()->dims();
        auto new_shape = paddle::framework::vectorize(dim);
        for (auto output_node : node->outputs) {
          if (output_node->Name() == output_name) {
            output_node->Var()->SetShape(new_shape);
          }
        }
      }
    }
  }

  // release the temp scope
  scope.reset();

  VLOG(10) << "Post Graph: ";
  VLOG(10) << DebugString(graph);
  VLOG(10) << "leave InferShapePass::ApplyImpl";
}

}  // namespace ir
}  // namespace framework
}  // namespace paddle

REGISTER_PASS(infer_shape_pass, paddle::framework::ir::InferShapePass)
    .RequirePassAttr("feed_list");
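Before running shape inference, the pass pins any dynamic leading dimension of a feed var to the configured batch size. A tiny sketch of just that substitution (illustrative shape and batch size, not taken from the commit):

#include <iostream>
#include <vector>

int main() {
  // Assume a feed var with a dynamic batch dim, e.g. shape [-1, 3, 224, 224],
  // and a configured batch_size of 8, as in InferShapePass above.
  std::vector<long> input_shape = {-1, 3, 224, 224};
  const long batch_size = 8;
  if (input_shape[0] <= -1) {
    input_shape[0] = batch_size;  // pin the batch dim so all shapes are static
  }
  for (auto d : input_shape) std::cout << d << " ";  // prints: 8 3 224 224
  std::cout << "\n";
}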
paddle/fluid/framework/ir/ipu/infer_shape_pass.h (new file, +30, -0)

// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
// Licensed under the Apache License, Version 2.0 (the "License");
// http://www.apache.org/licenses/LICENSE-2.0

#pragma once

#include "paddle/fluid/framework/ir/ipu/ipu_pass_base.h"

namespace paddle {
namespace framework {
namespace ir {

class InferShapePass : public IPUPassBase {
 protected:
  void ApplyImpl(ir::Graph* graph) const override;
};

}  // namespace ir
}  // namespace framework
}  // namespace paddle
paddle/fluid/framework/ir/ipu/inference_postprocess_pass.cc (new file, +89, -0)

// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
// Licensed under the Apache License, Version 2.0 (the "License");
// http://www.apache.org/licenses/LICENSE-2.0

#include "paddle/fluid/framework/ir/ipu/inference_postprocess_pass.h"

#include "paddle/fluid/framework/ir/fuse_pass_base.h"
#include "paddle/fluid/framework/ir/pass_tester_helper.h"
#include "paddle/fluid/platform/device/ipu/ipu_backend.h"
#include "paddle/fluid/platform/device/ipu/ipu_strategy.h"
#include "paddle/fluid/platform/enforce.h"

namespace paddle {
namespace framework {
namespace ir {

void InferencePostprocessPass::ApplyImpl(ir::Graph* graph) const {
  VLOG(10) << "enter InferencePostprocessPass::ApplyImpl";

  std::vector<std::string> feed_list;
  feed_list = Get<std::vector<std::string>>("feed_list");
  std::vector<std::string> fetch_list;
  fetch_list = Get<std::vector<std::string>>("fetch_list");

  auto* feed_var = new paddle::framework::VarDesc("feed");
  feed_var->SetType(proto::VarType::FEED_MINIBATCH);
  auto* feed_var_node = graph->CreateVarNode(feed_var);

  auto* fetch_var = new paddle::framework::VarDesc("fetch");
  fetch_var->SetType(proto::VarType::FETCH_LIST);
  auto* fetch_var_node = graph->CreateVarNode(fetch_var);

  for (int i = 0; i < feed_list.size(); i++) {
    for (auto node : graph->Nodes()) {
      if (node->Name() == feed_list[i]) {
        auto* op = new paddle::framework::OpDesc();
        op->SetType("feed");
        op->SetInput("X", {"feed"});
        op->SetOutput("Out", {node->Name()});
        op->SetAttr("col", i);
        auto* op_node = graph->CreateOpNode(op);
        node->inputs.push_back(op_node);
        op_node->outputs.push_back(node);
        feed_var_node->outputs.push_back(op_node);
        op_node->inputs.push_back(feed_var_node);
        break;
      }
    }
  }

  for (int i = 0; i < fetch_list.size(); i++) {
    for (auto node : graph->Nodes()) {
      if (node->Name() == fetch_list[i]) {
        auto* op = new paddle::framework::OpDesc();
        op->SetType("fetch");
        op->SetInput("X", {node->Name()});
        op->SetOutput("Out", {"fetch"});
        op->SetAttr("col", i);
        auto* op_node = graph->CreateOpNode(op);
        node->outputs.push_back(op_node);
        op_node->inputs.push_back(node);
        fetch_var_node->inputs.push_back(op_node);
        op_node->outputs.push_back(fetch_var_node);
        break;
      }
    }
  }

  VLOG(10) << "leave InferencePostprocessPass::ApplyImpl";
}

}  // namespace ir
}  // namespace framework
}  // namespace paddle

REGISTER_PASS(inference_postprocess_pass,
              paddle::framework::ir::InferencePostprocessPass)
    .RequirePassAttr("feed_list")
    .RequirePassAttr("fetch_list");
paddle/fluid/framework/ir/ipu/inference_postprocess_pass.h (new file, +30, -0)

// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
// Licensed under the Apache License, Version 2.0 (the "License");
// http://www.apache.org/licenses/LICENSE-2.0

#pragma once

#include "paddle/fluid/framework/ir/ipu/ipu_pass_base.h"

namespace paddle {
namespace framework {
namespace ir {

class InferencePostprocessPass : public IPUPassBase {
 protected:
  void ApplyImpl(ir::Graph* graph) const override;
};

}  // namespace ir
}  // namespace framework
}  // namespace paddle
paddle/fluid/framework/ir/ipu/inference_process_pass.cc (new file, +129, -0)

// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
// Licensed under the Apache License, Version 2.0 (the "License");
// http://www.apache.org/licenses/LICENSE-2.0

#include "paddle/fluid/framework/ir/ipu/inference_process_pass.h"

#include "paddle/fluid/platform/device/ipu/ipu_backend.h"
#include "paddle/fluid/platform/device/ipu/ipu_strategy.h"

#include "paddle/fluid/framework/ir/fuse_pass_base.h"
#include "paddle/fluid/framework/ir/pass_tester_helper.h"
#include "paddle/fluid/platform/enforce.h"

namespace paddle {
namespace framework {
namespace ir {

void InferenceProcessPass::ApplyImpl(ir::Graph* graph) const {
  VLOG(10) << "enter InferenceProcessPass::ApplyImpl";

  // Get a new instance of ipu_backend
  std::shared_ptr<platform::ipu::IpuBackend> ipu_backend =
      platform::ipu::IpuBackend::GetNewInstance();

  // Set scope
  auto& scope = graph->Get<Scope>(kParamScopeAttr);
  ipu_backend->SetScope(scope);

  // Set ipu_strategy
  static std::shared_ptr<platform::ipu::IpuStrategy> ipu_strategy_instance_(
      new platform::ipu::IpuStrategy());
  ipu_strategy_instance_->is_training = false;
  auto num_ipus = graph->Get<int>("num_ipus");
  ipu_strategy_instance_->num_ipus = num_ipus;
  if (num_ipus > 1) {
    ipu_strategy_instance_->popart_options_.virtualGraphMode =
        platform::ipu::VirtualGraphMode::Manual;
  } else {
    ipu_strategy_instance_->popart_options_.virtualGraphMode =
        platform::ipu::VirtualGraphMode::Off;
  }
  auto enable_pipelining = graph->Get<bool>("enable_pipelining");
  ipu_strategy_instance_->popart_options_.enablePipelining = enable_pipelining;
  if (enable_pipelining) {
    auto batches_per_step = graph->Get<int>("batches_per_step");
    PADDLE_ENFORCE_GE(
        batches_per_step, num_ipus,
        platform::errors::InvalidArgument("Batched per step should be equal or "
                                          "greater than the number of IPUs"));
    ipu_strategy_instance_->batches_per_step = batches_per_step;
  }
  ipu_strategy_instance_->batch_size = graph->Get<int>("batch_size");
  ipu_strategy_instance_->need_avg_shard = graph->Get<bool>("need_avg_shard");

  ipu_backend->SetIpuStrategy(*(ipu_strategy_instance_.get()));

  // Get feed_list and fetch list
  std::vector<std::string> feed_list = {};
  std::vector<std::string> fetch_list = {};
  for (auto node : graph->Nodes()) {
    if (node->Name() == "feed") {
      if (node->IsOp()) {
        feed_list.push_back("");
      }
    } else if (node->Name() == "fetch") {
      if (node->IsOp()) {
        fetch_list.push_back("");
      }
    }
  }
  for (auto node : graph->Nodes()) {
    if (node->Name() == "feed") {
      if (node->IsOp()) {
        feed_list[BOOST_GET_CONST(int, node->Op()->GetAttr("col"))] =
            node->outputs[0]->Name();
      }
    } else if (node->Name() == "fetch") {
      if (node->IsOp()) {
        fetch_list[BOOST_GET_CONST(int, node->Op()->GetAttr("col"))] =
            node->inputs[0]->Name();
      }
    }
  }

  // Run passes
  std::vector<std::string> graph_pass = {"forward_graph_extract_pass",
                                         "infer_shape_pass", "avg_shard_pass",
                                         "popart_canonicalization_pass"};
  std::vector<std::string> compile_pass = {
      "ipu_inplace_pass", "ipu_graph_builder_pass",
      "ipu_runtime_replacer_pass", "inference_postprocess_pass"};
  for (auto pass_name : graph_pass) {
    auto pass = PassRegistry::Instance().Get(pass_name);
    if (pass_name == "infer_shape_pass") {
      pass->Set("feed_list", new std::vector<std::string>(feed_list.begin(),
                                                          feed_list.end()));
    }
    pass->Apply(graph);
  }
  for (auto pass_name : compile_pass) {
    auto pass = PassRegistry::Instance().Get(pass_name);
    pass->Set("feed_list",
              new std::vector<std::string>(feed_list.begin(), feed_list.end()));
    pass->Set("fetch_list", new std::vector<std::string>(fetch_list.begin(),
                                                         fetch_list.end()));
    pass->Apply(graph);
  }

  VLOG(10) << "leave InferenceProcessPass::ApplyImpl";
}

}  // namespace ir
}  // namespace framework
}  // namespace paddle

REGISTER_PASS(inference_process_pass,
              paddle::framework::ir::InferenceProcessPass);
paddle/fluid/framework/ir/ipu/inference_process_pass.h (new file, +30, -0)

// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
// Licensed under the Apache License, Version 2.0 (the "License");
// http://www.apache.org/licenses/LICENSE-2.0

#pragma once

#include "paddle/fluid/framework/ir/ipu/ipu_pass_base.h"

namespace paddle {
namespace framework {
namespace ir {

class InferenceProcessPass : public IPUPassBase {
 protected:
  void ApplyImpl(ir::Graph* graph) const override;
};

}  // namespace ir
}  // namespace framework
}  // namespace paddle
paddle/fluid/framework/ir/ipu/ipu_graph_builder_pass.cc (new file, +52, -0)

// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
// Licensed under the Apache License, Version 2.0 (the "License");
// http://www.apache.org/licenses/LICENSE-2.0

#include "paddle/fluid/framework/ir/ipu/ipu_graph_builder_pass.h"

#include "paddle/fluid/framework/ir/pass_tester_helper.h"
#include "paddle/fluid/platform/device/ipu/ipu_backend.h"

namespace paddle {
namespace framework {
namespace ir {

void IpuGraphBuilderPass::ApplyImpl(ir::Graph* graph) const {
  VLOG(10) << "enter IpuGraphBuilderPass::ApplyImpl";
  VLOG(10) << "Raw Graph: ";
  VLOG(10) << DebugString(graph);

  std::vector<std::string> feed_list;
  feed_list = Get<std::vector<std::string>>("feed_list");

  std::vector<std::string> fetch_list;
  fetch_list = Get<std::vector<std::string>>("fetch_list");

  std::shared_ptr<platform::ipu::IpuBackend> ipu_backend =
      platform::ipu::IpuBackend::GetInstance();

  ipu_backend->Compile(graph, feed_list, fetch_list);

  VLOG(10) << "Post Graph: ";
  VLOG(10) << DebugString(graph);
  VLOG(10) << "leave IpuGraphBuilderPass::ApplyImpl";
}

}  // namespace ir
}  // namespace framework
}  // namespace paddle

REGISTER_PASS(ipu_graph_builder_pass,
              paddle::framework::ir::IpuGraphBuilderPass)
    .RequirePassAttr("feed_list")
    .RequirePassAttr("fetch_list");
paddle/fluid/framework/ir/ipu/ipu_graph_builder_pass.h (new file, +31, -0)

// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
// Licensed under the Apache License, Version 2.0 (the "License");
// http://www.apache.org/licenses/LICENSE-2.0

#pragma once

#include "paddle/fluid/framework/ir/graph.h"
#include "paddle/fluid/framework/ir/ipu/ipu_pass_base.h"

namespace paddle {
namespace framework {
namespace ir {

class IpuGraphBuilderPass : public IPUPassBase {
 protected:
  void ApplyImpl(ir::Graph* graph) const override;
};

}  // namespace ir
}  // namespace framework
}  // namespace paddle
paddle/fluid/framework/ir/ipu/ipu_inplace_pass.cc (new file, +85, -0)

// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
// Licensed under the Apache License, Version 2.0 (the "License");
// http://www.apache.org/licenses/LICENSE-2.0

#include "paddle/fluid/framework/ir/ipu/ipu_inplace_pass.h"

#include "paddle/fluid/framework/ir/pass_tester_helper.h"

namespace paddle {
namespace framework {
namespace ir {

std::string GenerateVarName(Node* node) {
  return node->Name() + "_" + std::to_string(node->id());
}

void IpuInplacePass::ApplyImpl(ir::Graph* graph) const {
  // use this pass after forward_graph_extract_pass
  // raise error if the inplaced var both in feed_list & fetch_list
  VLOG(10) << "enter IpuInplacePass::ApplyImpl";
  VLOG(10) << "Raw Graph: ";
  VLOG(10) << DebugString(graph);

  std::vector<std::string> feed_list;
  feed_list = Get<std::vector<std::string>>("feed_list");
  std::vector<std::string> fetch_list;
  fetch_list = Get<std::vector<std::string>>("fetch_list");

  std::map<std::string, int> var_name;
  for (auto* node : graph->Nodes()) {
    if (node->IsVar()) {
      if (var_name.find(node->Name()) == var_name.end()) {
        var_name.emplace(node->Name(), 1);
      } else {
        var_name[node->Name()]++;
      }
    }
  }

  for (auto* node : graph->Nodes()) {
    if (node->IsVar()) {
      if (var_name[node->Name()] > 1) {
        auto is_feed = (std::find(feed_list.begin(), feed_list.end(),
                                  node->Name()) != feed_list.end()) &&
                       (node->inputs.size() == 0);
        auto is_fetch = (std::find(fetch_list.begin(), fetch_list.end(),
                                   node->Name()) != fetch_list.end()) &&
                        (node->outputs.size() == 0);
        if (!is_feed && !is_fetch && !node->Var()->Persistable()) {
          auto old_name = node->Name();
          auto new_name = GenerateVarName(node);
          node->RenameVar(new_name);
          for (auto* op_in : node->inputs) {
            op_in->Op()->RenameOutput(old_name, new_name);
          }
          for (auto* op_out : node->outputs) {
            op_out->Op()->RenameInput(old_name, new_name);
          }
        }
      }
    }
  }

  VLOG(10) << "Post Graph: ";
  VLOG(10) << DebugString(graph);
  VLOG(10) << "leave IpuInplacePass::ApplyImpl";
}

}  // namespace ir
}  // namespace framework
}  // namespace paddle

REGISTER_PASS(ipu_inplace_pass, paddle::framework::ir::IpuInplacePass)
    .RequirePassAttr("feed_list")
    .RequirePassAttr("fetch_list");
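The renaming scheme above disambiguates a variable that appears under the same name on several graph nodes by suffixing the node's graph-unique id. A tiny self-contained sketch of GenerateVarName's behavior, with a hypothetical FakeNode struct standing in for ir::Node:

#include <iostream>
#include <string>

// Hypothetical stand-in for ir::Node: just a name and a graph-unique id.
struct FakeNode {
  std::string name;
  int id;
};

// Mirrors GenerateVarName in ipu_inplace_pass.cc: append "_<id>" so two
// same-named vars become distinct.
std::string GenerateVarName(const FakeNode& node) {
  return node.name + "_" + std::to_string(node.id);
}

int main() {
  FakeNode a{"tmp_0", 12};
  FakeNode b{"tmp_0", 37};
  std::cout << GenerateVarName(a) << "\n";  // tmp_0_12
  std::cout << GenerateVarName(b) << "\n";  // tmp_0_37
}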
paddle/fluid/framework/ir/ipu/ipu_inplace_pass.h (new file, +30, -0)

// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
// Licensed under the Apache License, Version 2.0 (the "License");
// http://www.apache.org/licenses/LICENSE-2.0

#pragma once

#include "paddle/fluid/framework/ir/ipu/ipu_pass_base.h"

namespace paddle {
namespace framework {
namespace ir {

class IpuInplacePass : public IPUPassBase {
 protected:
  void ApplyImpl(ir::Graph* graph) const override;
};

}  // namespace ir
}  // namespace framework
}  // namespace paddle
paddle/fluid/framework/ir/ipu/ipu_pass_base.cc (new file, +28, -0)

// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
// Licensed under the Apache License, Version 2.0 (the "License");
// http://www.apache.org/licenses/LICENSE-2.0

#include "paddle/fluid/framework/ir/ipu/ipu_pass_base.h"

namespace paddle {
namespace framework {
namespace ir {

void IPUPassBase::Init(const std::string& repr, Graph* graph) const {
  repr_ = repr;
  graph_ = graph;
}

}  // namespace ir
}  // namespace framework
}  // namespace paddle
paddle/fluid/framework/ir/ipu/ipu_pass_base.h (new file, +37, -0)

// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
// Licensed under the Apache License, Version 2.0 (the "License");
// http://www.apache.org/licenses/LICENSE-2.0

#pragma once

#include "paddle/fluid/framework/ir/graph.h"
#include "paddle/fluid/framework/ir/pass.h"
#include "paddle/fluid/framework/scope.h"

namespace paddle {
namespace framework {
namespace ir {

class IPUPassBase : public Pass {
 public:
  void Init(const std::string& repr, Graph* graph) const;
  virtual ~IPUPassBase() {}

 protected:
  mutable Graph* graph_;
  mutable std::string repr_;
};

}  // namespace ir
}  // namespace framework
}  // namespace paddle
paddle/fluid/framework/ir/ipu/ipu_runtime_replacer_pass.cc
0 → 100644
浏览文件 @
883ee1a3
// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/framework/ir/ipu/ipu_runtime_replacer_pass.h"
#include "paddle/fluid/framework/ir/graph_pattern_detector.h"
#include "paddle/fluid/framework/ir/pass_tester_helper.h"
namespace paddle {
namespace framework {
namespace ir {

void IpuRuntimeReplacerPass::ApplyImpl(ir::Graph* graph) const {
  VLOG(10) << "enter IpuRuntimeReplacerPass::ApplyImpl";
  VLOG(10) << "Raw Graph: ";
  VLOG(10) << DebugString(graph);

  std::vector<std::string> feed_list;
  feed_list = Get<std::vector<std::string>>("feed_list");
  std::vector<std::string> fetch_list;
  fetch_list = Get<std::vector<std::string>>("fetch_list");

  framework::OpDesc ipu_rt_op_desc;
  ipu_rt_op_desc.SetType("ipu_runtime");
  ipu_rt_op_desc.SetInput("FeedList", feed_list);
  ipu_rt_op_desc.SetOutput("FetchList", fetch_list);
  ipu_rt_op_desc.Flush();

  // Create a new node for the ipu_runtime_op.
  auto* ipu_rt_node = graph->CreateOpNode(&ipu_rt_op_desc);

  for (auto* node : graph->Nodes()) {
    if (node->IsVar()) {
      for (auto feed : feed_list) {
        if (node->Name() == feed) {
          IR_NODE_LINK_TO(node, ipu_rt_node);
        }
      }
      for (auto fetch : fetch_list) {
        if (node->Name() == fetch) {
          IR_NODE_LINK_TO(ipu_rt_node, node);
        }
      }
    }
  }

  // set ipu_runtime_op dtype attr
  if (fetch_list.size() == 1) {
    for (auto* node : graph->Nodes()) {
      if (node->IsVar()) {
        for (auto fetch : fetch_list) {
          if (node->Name() == fetch) {
            ipu_rt_node->Op()->SetAttr("dtype", node->Var()->GetDataType());
          }
        }
      }
    }
  }

  // Remove unneeded nodes.
  std::unordered_set<const Node*> marked_nodes;
  for (auto* node : graph->Nodes()) {
    if (node->IsOp()) {
      auto* op_desc = node->Op();
      if (op_desc->Type() != "ipu_runtime") {
        marked_nodes.insert(node);
      }
    }
  }

  GraphSafeRemoveNodes(graph, marked_nodes);

  VLOG(10) << "Post Graph: ";
  VLOG(10) << DebugString(graph);
  VLOG(10) << "leave IpuRuntimeReplacerPass::ApplyImpl";
}

}  // namespace ir
}  // namespace framework
}  // namespace paddle

REGISTER_PASS(ipu_runtime_replacer_pass,
              paddle::framework::ir::IpuRuntimeReplacerPass)
    .RequirePassAttr("feed_list")
    .RequirePassAttr("fetch_list");
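Because the pass declares feed_list and fetch_list as required attributes, a caller has to supply them before Apply. A hedged usage sketch (the attribute values and variable names are illustrative; the PassRegistry/Set/Apply calls follow the usual Pass API):

// Usage sketch, assuming a std::unique_ptr<Graph> named graph:
auto pass =
    framework::ir::PassRegistry::Instance().Get("ipu_runtime_replacer_pass");
pass->Set("feed_list", new std::vector<std::string>({"image"}));
pass->Set("fetch_list", new std::vector<std::string>({"loss"}));
graph.reset(pass->Apply(graph.release()));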
paddle/fluid/framework/ir/ipu/ipu_runtime_replacer_pass.h
0 → 100644
// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include "paddle/fluid/framework/ir/graph.h"
#include "paddle/fluid/framework/ir/ipu/ipu_pass_base.h"
namespace paddle {
namespace framework {
namespace ir {

class IpuRuntimeReplacerPass : public IPUPassBase {
 protected:
  void ApplyImpl(ir::Graph* graph) const override;
};

}  // namespace ir
}  // namespace framework
}  // namespace paddle
paddle/fluid/framework/ir/ipu/optimizer_extract_pass.cc
0 → 100644
// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/framework/ir/ipu/optimizer_extract_pass.h"
#include "paddle/fluid/framework/ir/pass_tester_helper.h"
#include "paddle/fluid/platform/device/ipu/ipu_backend.h"
namespace paddle {
namespace framework {
namespace ir {

void IpuOptimizerExtractPass::ApplyImpl(ir::Graph* graph) const {
  VLOG(10) << "enter IpuOptimizerExtractPass::ApplyImpl";
  VLOG(10) << "Raw Graph: ";
  VLOG(10) << DebugString(graph);

  auto ipu_backend = paddle::platform::ipu::IpuBackend::GetInstance();

  for (auto* node : graph->Nodes()) {
    if (node->IsOp() && node->Op()) {
      int op_role = BOOST_GET_CONST(
          int, node->Op()->GetAttr(
                   framework::OpProtoAndCheckerMaker::OpRoleAttrName()));

      // A graph usually has multiple optimizer nodes, one per parameter,
      // and these nodes usually share the same type and attribute values.
      if ((op_role == static_cast<int>(framework::OpRole::kOptimize))) {
        ipu_backend->GetExecutor().SetOptimizerType(node->Op()->Type());
        VLOG(10) << "found optimizer type: " << node->Op()->Type();

        for (const std::string& attr_name : node->Op()->AttrNames()) {
          auto attr_type = node->Op()->GetAttrType(attr_name);
          // with adam, attrs are float
          if (attr_type == proto::AttrType::FLOAT) {
            auto attr_value =
                BOOST_GET_CONST(float, node->Op()->GetAttr(attr_name));
            ipu_backend->GetExecutor().SetOptimizerAttr(attr_name, attr_value);
          } else {
            VLOG(10) << "Skip " << attr_type;
          }
        }

        auto lr_var_name = node->Op()->Input("LearningRate");
        PADDLE_ENFORCE_EQ(lr_var_name.size(), 1u,
                          platform::errors::InvalidArgument(
                              "In op(%s), find input(LearningRate) failed.",
                              node->Op()->Type()));
        ipu_backend->GetExecutor().SetLRVarName(lr_var_name[0]);
      }

      if ((op_role == static_cast<int>(framework::OpRole::kLoss))) {
        VLOG(10) << "found loss op type: " << node->Op()->Type();
        auto outputs = node->Op()->Outputs();
        PADDLE_ENFORCE_EQ(
            outputs.size(), 1,
            platform::errors::InvalidArgument("Can only support one loss key"));
        auto losses_name = outputs.begin()->second;
        PADDLE_ENFORCE_EQ(losses_name.size(), 1,
                          platform::errors::InvalidArgument(
                              "Can only support one loss name"));
        ipu_backend->GetExecutor().SetLoss(losses_name[0]);
      }
    }
  }

  VLOG(10) << "Post Graph: ";
  VLOG(10) << DebugString(graph);
  VLOG(10) << "leave IpuOptimizerExtractPass::ApplyImpl";
}

}  // namespace ir
}  // namespace framework
}  // namespace paddle

REGISTER_PASS(optimizer_extract_pass,
              paddle::framework::ir::IpuOptimizerExtractPass);
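For a concrete picture, this is roughly the state the pass would forward to the IPU executor when it meets an adam optimizer node (attribute values here are hypothetical, chosen only to illustrate the calls above):

// Illustrative trace for one adam op node:
//   SetOptimizerType("adam")
//   SetOptimizerAttr("beta1", 0.9f)      // float attrs are forwarded
//   SetOptimizerAttr("beta2", 0.999f)
//   SetOptimizerAttr("epsilon", 1e-8f)
//   SetLRVarName("learning_rate_0")      // from the LearningRate input slot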
paddle/fluid/framework/ir/ipu/optimizer_extract_pass.h
0 → 100644
// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include "paddle/fluid/framework/ir/graph.h"
#include "paddle/fluid/framework/ir/ipu/ipu_pass_base.h"
namespace paddle {
namespace framework {
namespace ir {

class IpuOptimizerExtractPass : public IPUPassBase {
 protected:
  void ApplyImpl(ir::Graph* graph) const override;
};

}  // namespace ir
}  // namespace framework
}  // namespace paddle
paddle/fluid/framework/ir/ipu/optimizer_state_align_pass.cc
0 → 100644
// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/framework/ir/ipu/optimizer_state_align_pass.h"
#include "paddle/fluid/framework/ir/pass_tester_helper.h"
#include "paddle/fluid/platform/device/ipu/common.h"
#include "paddle/fluid/platform/device/ipu/ipu_backend.h"
namespace paddle {
namespace framework {
namespace ir {

using paddle::platform::ipu::IpuBackend;
using framework::ir::Graph;
using framework::ir::Node;

void IpuOptimizerStateAlignPass::ApplyImpl(ir::Graph* graph) const {
  VLOG(10) << "enter IpuOptimizerStateAlignPass::ApplyImpl";
  VLOG(10) << "Raw Graph: ";
  VLOG(10) << DebugString(graph);

  auto ipu_backend = IpuBackend::GetInstance();
  const auto* scope_ = ipu_backend->GetScope();

  for (auto* node : graph->Nodes()) {
    if (node->IsOp() && node->Op()) {
      int op_role = BOOST_GET_CONST(
          int, node->Op()->GetAttr(
                   framework::OpProtoAndCheckerMaker::OpRoleAttrName()));

      if ((op_role == static_cast<int>(framework::OpRole::kOptimize))) {
        auto inputs = node->Op()->Inputs();
        if (inputs.count(platform::ipu::sBeta1Pow)) {
          auto var = scope_->GetVar(inputs.at(platform::ipu::sBeta1Pow)[0]);
          auto data = var->GetMutable<framework::LoDTensor>()->data<float>();
          auto beta = BOOST_GET_CONST(
              float, node->Op()->GetAttr(platform::ipu::sBeta1));

          // The state may currently hold beta1pow rather than a step count:
          // beta1pow = beta1 ^ (step + 1). Recover the step from it, because
          // popart only supports a single Step__ value.
          bool save_with_beta1pow = (data[0] < 1.0f) && (data[0] > 0.0f);
          float step = 0;
          float beta_acc = beta;
          while (beta_acc > data[0] && save_with_beta1pow) {
            beta_acc *= beta;
            step += 1;
          }

          if (save_with_beta1pow) {
            data[0] = step;
          }
        }
      }
    }
  }

  VLOG(10) << "Post Graph: ";
  VLOG(10) << DebugString(graph);
  VLOG(10) << "leave IpuOptimizerStateAlignPass::ApplyImpl";
}

}  // namespace ir
}  // namespace framework
}  // namespace paddle

REGISTER_PASS(optimizer_state_align_pass,
              paddle::framework::ir::IpuOptimizerStateAlignPass);
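The step recovery is just an inverted power: since beta1pow = beta1 ^ (step + 1), the loop counts how many extra multiplications by beta1 are needed to reach the stored value. A worked example with illustrative numbers (exact up to float rounding):

// With beta1 = 0.9 and a stored beta1pow of 0.729 = 0.9^(2+1):
float beta = 0.9f, beta1pow = 0.729f;
float step = 0, beta_acc = beta;
while (beta_acc > beta1pow) {  // 0.9 -> 0.81 -> 0.729, two iterations
  beta_acc *= beta;
  step += 1;
}
// step == 2, so data[0] is rewritten from 0.729 to 2 for popart's Step__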
paddle/fluid/framework/ir/ipu/optimizer_state_align_pass.h
0 → 100644
// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include "paddle/fluid/framework/ir/graph.h"
#include "paddle/fluid/framework/ir/ipu/ipu_pass_base.h"
namespace paddle {
namespace framework {
namespace ir {

/*
 * This pass should only affect optimizers that need bias correction,
 * i.e. Adam/Lamb.
 */
class IpuOptimizerStateAlignPass : public IPUPassBase {
 protected:
  void ApplyImpl(ir::Graph* graph) const override;
};

}  // namespace ir
}  // namespace framework
}  // namespace paddle
paddle/fluid/framework/ir/ipu/popart_canonicalization_pass.cc
0 → 100644
// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/framework/ir/ipu/popart_canonicalization_pass.h"
#include "paddle/fluid/framework/ir/pass_tester_helper.h"
#include "paddle/fluid/platform/device/ipu/popart_canonicalization/canonicalization_utils.h"
#include "paddle/fluid/platform/device/ipu/popart_canonicalization/post_canonicalization.h"
namespace paddle {
namespace framework {
namespace ir {

using framework::ir::Graph;
using framework::ir::Node;
using platform::ipu::SymbolHandler;

void PopartCanonicalizationPass::ApplyImpl(ir::Graph* graph) const {
  VLOG(10) << "enter PopartCanonicalizationPass::ApplyImpl";
  VLOG(10) << "Raw Graph: ";
  VLOG(10) << DebugString(graph);

  auto nodes = graph->Nodes();
  for (auto* node : nodes) {
    if (!node->IsOp()) {
      continue;
    }
    auto* op = node->Op();
    auto op_type = op->Type();

    ir::Node* new_node = nullptr;
    SymbolHandler handler = platform::ipu::GetHandler(op_type);
    if (handler) {
      VLOG(11) << "Raw Paddle Node:";
      VLOG(11) << node->Op()->Proto()->DebugString();
      new_node = handler(graph, node);
      VLOG(11) << "Post Popart Node:";
      VLOG(11) << new_node->Op()->Proto()->DebugString();

      platform::ipu::ClearNode(node);
      graph->RemoveNode(node);
    } else {
      LOG(ERROR) << "Can not find OpHandler for op_type: " << op_type;
    }
  }

  VLOG(10) << "Post Graph: ";
  VLOG(10) << DebugString(graph);
  VLOG(10) << "leave PopartCanonicalizationPass::ApplyImpl";
}

}  // namespace ir
}  // namespace framework
}  // namespace paddle

REGISTER_PASS(popart_canonicalization_pass,
              paddle::framework::ir::PopartCanonicalizationPass);
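The pass only constrains the handler's shape: a SymbolHandler is a callable taking the graph and the original node and returning the replacement node. A heavily hedged sketch of what one handler could look like; the op type and I/O keys below are illustrative inventions, not the real popart conventions.

// Hypothetical handler sketch, inferred only from how handler(graph, node)
// is invoked above:
ir::Node* MyReluHandler(ir::Graph* graph, ir::Node* node) {
  framework::OpDesc new_op;
  new_op.SetType("popart_relu");  // hypothetical canonical op type
  new_op.SetInput("X", node->Op()->Input("X"));
  new_op.SetOutput("Out", node->Op()->Output("Out"));
  auto* new_node = graph->CreateOpNode(&new_op);
  // A real handler would also relink the var nodes to new_node; the pass
  // then clears and removes the original node.
  return new_node;
}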
paddle/fluid/framework/ir/ipu/popart_canonicalization_pass.h
0 → 100644
// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include "paddle/fluid/framework/ir/ipu/ipu_pass_base.h"
namespace paddle {
namespace framework {
namespace ir {

class PopartCanonicalizationPass : public IPUPassBase {
 protected:
  void ApplyImpl(ir::Graph* graph) const override;
};

}  // namespace ir
}  // namespace framework
}  // namespace paddle
paddle/fluid/framework/ir/transpose_flatten_concat_fuse_pass.cc
...
@@ -84,13 +84,16 @@ void TransposeFlattenConcatFusePass::RunTransposeFlattenConcatFuse(
     LOG(WARNING) << "Pass in op compat failed.";
     return;
   }
   const int kNumFields = 5;
   const int kTransOffset = 1;
   const int kTransOutOffset = 2;
   const int kFlattenOffset = 3;
   const int kFlattenOutOffset = 4;
   std::vector<Node*> nodes;
+  std::vector<int> trans_axis0;
+  int flatten_axis0;
   for (int i = 0; i < times; i++) {
     PADDLE_ENFORCE_NOT_NULL(
         subgraph.at(pattern.GetPDNode("transpose" + std::to_string(i))),
...
@@ -112,6 +115,33 @@ void TransposeFlattenConcatFusePass::RunTransposeFlattenConcatFuse(
         platform::errors::NotFound("Can not find %s in subgraph.",
                                    input_nodes[i]->name()));
+
+    if (i == 0) {
+      trans_axis0 = BOOST_GET_CONST(
+          std::vector<int>,
+          subgraph.at(pattern.GetPDNode("transpose" + std::to_string(0)))
+              ->Op()
+              ->GetAttr("axis"));
+      flatten_axis0 = BOOST_GET_CONST(
+          int, subgraph.at(pattern.GetPDNode("flatten" + std::to_string(0)))
+                   ->Op()
+                   ->GetAttr("axis"));
+    } else {
+      std::vector<int> trans_axis = BOOST_GET_CONST(
+          std::vector<int>,
+          subgraph.at(pattern.GetPDNode("transpose" + std::to_string(i)))
+              ->Op()
+              ->GetAttr("axis"));
+      // All axis of transpose should be the same
+      if (trans_axis0 != trans_axis) return;
+
+      int flatten_axis = BOOST_GET_CONST(
+          int, subgraph.at(pattern.GetPDNode("flatten" + std::to_string(0)))
+                   ->Op()
+                   ->GetAttr("axis"));
+      // All axis of flatten should be the same
+      if (flatten_axis0 != flatten_axis) return;
+    }
+
     nodes.push_back(subgraph.at(input_nodes[i]));
     nodes.push_back(
         subgraph.at(pattern.GetPDNode("transpose" + std::to_string(i))));
...
paddle/fluid/framework/op_registry.h
...
@@ -327,6 +327,9 @@ struct OpKernelRegistrarFunctorEx<PlaceType, false, I,
 #define REGISTER_OP_CPU_KERNEL(op_type, ...)                               \
   REGISTER_OP_KERNEL(op_type, CPU, ::paddle::platform::CPUPlace, __VA_ARGS__)
 
+#define REGISTER_OP_IPU_KERNEL(op_type, ...)                               \
+  REGISTER_OP_KERNEL(op_type, IPU, ::paddle::platform::IPUPlace, __VA_ARGS__)
+
 #define REGISTER_OP_XPU_KERNEL(op_type, ...)                               \
   REGISTER_OP_KERNEL(op_type, XPU, ::paddle::platform::XPUPlace, __VA_ARGS__)
...
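By analogy with the existing CPU/XPU macros, an IPU kernel registration would look like the line below; IpuRuntimeKernel is an assumed kernel class name for illustration, not confirmed by this hunk.

// Hedged usage sketch of the new macro:
REGISTER_OP_IPU_KERNEL(ipu_runtime,
                       paddle::operators::IpuRuntimeKernel<float>);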
paddle/fluid/framework/paddle2cinn/CMakeLists.txt
...
@@ -2,7 +2,7 @@ cc_library(cinn_cache_key SRCS cinn_cache_key.cc DEPS boost graph graph_helper l
 cc_library(build_cinn_pass SRCS build_cinn_pass.cc DEPS pass subgraph_detector graph_pattern_detector cinn_compiler errors enforce)
 cc_library(transform_desc SRCS transform_desc.cc DEPS proto_desc cinn)
 cc_library(cinn_graph_symbolization SRCS cinn_graph_symbolization.cc DEPS lod_tensor graph transform_desc cinn)
-cc_library(cinn_compiler SRCS cinn_compiler.cc DEPS framework_proto graph lod_tensor cinn_cache_key cinn_graph_symbolization cinn)
+cc_library(cinn_compiler SRCS cinn_compiler.cc DEPS framework_proto graph lod_tensor cinn_cache_key cinn_graph_symbolization cinn cinn_launch_context)
 if (WITH_TESTING)
   cc_test(cinn_lib_test SRCS cinn_lib_test.cc DEPS cinn)
...
paddle/fluid/framework/paddle2cinn/cinn_cache_key.cc
...
@@ -29,55 +29,32 @@ namespace paddle {
 namespace framework {
 namespace paddle2cinn {
 
+using GraphHashStrategy = CinnCacheKey::GraphHashStrategy;
+
+CinnCacheKey::CinnCacheKey(GraphHashStrategy graph_hash)
+    : graph_hash_(graph_hash) {}
+
 CinnCacheKey::CinnCacheKey(
     const ir::Graph& graph,
     const std::map<std::string, const LoDTensor*>& input_tensors,
-    const std::string& arch_str) {
+    const std::string& arch_str, GraphHashStrategy graph_hash)
+    : graph_hash_(graph_hash) {
   this->SetKey(graph, input_tensors, arch_str);
 }
 
 CinnCacheKey::CinnCacheKey(const ir::Graph& graph,
                            const std::map<std::string, DDim>& input_shapes,
-                           const std::string& arch_str) {
+                           const std::string& arch_str,
+                           GraphHashStrategy graph_hash)
+    : graph_hash_(graph_hash) {
   this->SetKey(graph, input_shapes, arch_str);
 }
 
-size_t CinnCacheKey::HashGraph(const ir::Graph& graph) {
-  // using Dot to unqiue graph
-  inference::analysis::Dot dot;
-  std::unordered_map<const ir::Node*, std::string> node2dot;
-  int id = 0;
-  // Create nodes
-  // graph.Nodes() return unordered_set, the same graph may
-  // return different result?
-  for (const ir::Node* n : graph.Nodes()) {
-    std::string node_id = std::to_string(id++);
-    dot.AddNode(node_id, {}, n->Name(), true);
-    node2dot[n] = node_id;
-  }
-  // Create edges
-  for (const ir::Node* n : graph.Nodes()) {
-    const auto& src_id = node2dot.at(n);
-    for (auto* out : n->outputs) {
-      const auto& dest_id = node2dot.at(out);
-      dot.AddEdge(src_id, dest_id, {});
-    }
-  }
-  const std::string& viz_graph = dot.Build();
-  VLOG(1) << "The hash graph:\n" << viz_graph;
-  size_t hash_val = std::hash<std::string>()(viz_graph);
-  VLOG(4) << "The graph's hash value is: " << hash_val;
-  return hash_val;
-}
-
 void CinnCacheKey::SetKey(
     const ir::Graph& graph,
     const std::map<std::string, const LoDTensor*>& input_tensors,
     const std::string& arch_str) {
-  graph_serialize_str_ = std::to_string(HashGraph(graph));
+  graph_hash_val_ = graph_hash_(graph);
   for (const auto& name_tensor : input_tensors) {
     input_shapes_[name_tensor.first] = name_tensor.second->dims();
   }
...
@@ -87,7 +64,7 @@ void CinnCacheKey::SetKey(
 void CinnCacheKey::SetKey(const ir::Graph& graph,
                           const std::map<std::string, DDim>& input_shapes,
                           const std::string& arch_str) {
-  graph_serialize_str_ = std::to_string(HashGraph(graph));
+  graph_hash_val_ = graph_hash_(graph);
   input_shapes_ = input_shapes;
   arch_str_ = arch_str;
 }
...
@@ -97,7 +74,7 @@ bool CinnCacheKey::operator!=(const CinnCacheKey& other) const {
 }
 
 bool CinnCacheKey::operator==(const CinnCacheKey& other) const {
-  return graph_serialize_str_ == other.graph_serialize_str_ &&
+  return graph_hash_val_ == other.graph_hash_val_ &&
         input_shapes_ == other.input_shapes_ && arch_str_ == other.arch_str_;
 }
...
@@ -114,11 +91,48 @@ size_t CinnCacheKey::Hash::operator()(const CinnCacheKey& key) const {
     ret = hash_combine(ret, string_hasher(name_shape.second.to_str()));
   }
 
-  ret = hash_combine(ret, string_hasher(key.graph_serialize_str_));
+  ret = hash_combine(ret, key.graph_hash_val_);
   ret = hash_combine(ret, string_hasher(key.arch_str_));
   return ret;
 }
 
+size_t CinnCacheKeyByStructure::HashGraph(const ir::Graph& graph) {
+  // sort graph nodes by name and id
+  auto compare = [](ir::Node* n1, ir::Node* n2) {
+    return (n1->Name() == n2->Name()) ? (n1->id() < n2->id())
+                                      : (n1->Name() < n2->Name());
+  };
+
+  // graph.Nodes() returns an unordered_set; use an ordered set here so that
+  // the same graph cannot produce different results
+  std::set<ir::Node *, bool (*)(ir::Node *, ir::Node *)> node_set(compare),
+      output_set(compare);
+  node_set.insert(graph.Nodes().begin(), graph.Nodes().end());
+
+  std::string hash_str;
+  for (ir::Node* n : node_set) {
+    hash_str.append(n->Name());
+
+    output_set.clear();
+    output_set.insert(n->outputs.begin(), n->outputs.end());
+    for (auto* out : output_set) {
+      hash_str.append(out->Name());
+    }
+  }
+
+  VLOG(1) << "The hash graph:\n" << hash_str;
+  size_t hash_val = std::hash<std::string>()(hash_str);
+  VLOG(4) << "The graph's hash value by graph structure is: " << hash_val;
+  return hash_val;
+}
+
+size_t CinnCacheKeyByAddress::HashGraph(const ir::Graph& graph) {
+  size_t hash_val = reinterpret_cast<size_t>(&graph);
+  VLOG(4) << "The graph's hash value by graph address is: " << hash_val;
+  return hash_val;
+}
+
 }  // namespace paddle2cinn
 }  // namespace framework
 }  // namespace paddle
paddle/fluid/framework/paddle2cinn/cinn_cache_key.h
...
@@ -14,6 +14,7 @@
 #pragma once
 
+#include <functional>
 #include <map>
 
 #include "paddle/fluid/framework/ddim.h"
...
@@ -33,14 +34,18 @@ namespace paddle2cinn {
 // shapes.
 class CinnCacheKey {
  public:
+  using GraphHashStrategy = std::function<size_t(const ir::Graph&)>;
+
+  explicit CinnCacheKey(GraphHashStrategy graph_hash);
+
   CinnCacheKey(const ir::Graph& graph,
                const std::map<std::string, const LoDTensor*>& input_tensors,
-               const std::string& arch_str);
+               const std::string& arch_str, GraphHashStrategy graph_hash);
 
   CinnCacheKey(const ir::Graph& graph,
                const std::map<std::string, DDim>& input_shapes,
-               const std::string& arch_str);
+               const std::string& arch_str, GraphHashStrategy graph_hash);
 
-  ~CinnCacheKey() {}
+  ~CinnCacheKey() = default;
 
   void SetKey(const ir::Graph& graph,
               const std::map<std::string, const LoDTensor*>& input_tensors,
...
@@ -58,13 +63,38 @@ class CinnCacheKey {
   };
 
  private:
-  size_t HashGraph(const ir::Graph& graph);
-
-  std::string graph_serialize_str_;
+  GraphHashStrategy graph_hash_;
+  size_t graph_hash_val_;
   std::map<std::string, DDim> input_shapes_;
   std::string arch_str_;
 };
 
+#define CINN_CACHE_KEY_CREATE(NAME)                                     \
+  class NAME : public CinnCacheKey {                                    \
+   public:                                                              \
+    NAME() : CinnCacheKey(HashGraph) {}                                 \
+                                                                        \
+    NAME(const ir::Graph& graph,                                        \
+         const std::map<std::string, const LoDTensor*>& input_tensors,  \
+         const std::string& arch_str)                                   \
+        : CinnCacheKey(graph, input_tensors, arch_str, HashGraph) {}    \
+                                                                        \
+    NAME(const ir::Graph& graph,                                        \
+         const std::map<std::string, DDim>& input_shapes,               \
+         const std::string& arch_str)                                   \
+        : CinnCacheKey(graph, input_shapes, arch_str, HashGraph) {}     \
+                                                                        \
+   private:                                                             \
+    static size_t HashGraph(const ir::Graph& graph);                    \
+  };
+
+// Class to store the keys by graph address for compiling CINN.
+CINN_CACHE_KEY_CREATE(CinnCacheKeyByAddress)
+// Class to store the keys by graph structure for compiling CINN.
+CINN_CACHE_KEY_CREATE(CinnCacheKeyByStructure)
+#undef CINN_CACHE_KEY_CREATE
+
 }  // namespace paddle2cinn
 }  // namespace framework
 }  // namespace paddle
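Written out, CINN_CACHE_KEY_CREATE(CinnCacheKeyByAddress) expands to essentially the class below; only the HashGraph definition (in the .cc above) differs between the two flavors.

// Expansion of CINN_CACHE_KEY_CREATE(CinnCacheKeyByAddress), for reference:
class CinnCacheKeyByAddress : public CinnCacheKey {
 public:
  CinnCacheKeyByAddress() : CinnCacheKey(HashGraph) {}
  CinnCacheKeyByAddress(
      const ir::Graph& graph,
      const std::map<std::string, const LoDTensor*>& input_tensors,
      const std::string& arch_str)
      : CinnCacheKey(graph, input_tensors, arch_str, HashGraph) {}
  CinnCacheKeyByAddress(const ir::Graph& graph,
                        const std::map<std::string, DDim>& input_shapes,
                        const std::string& arch_str)
      : CinnCacheKey(graph, input_shapes, arch_str, HashGraph) {}

 private:
  static size_t HashGraph(const ir::Graph& graph);  // defined per flavor
};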
paddle/fluid/framework/paddle2cinn/cinn_cache_key_test.cc
...
@@ -26,8 +26,8 @@ namespace paddle {
 namespace framework {
 namespace paddle2cinn {
 
-TEST(CinnCacheKeyTest, TestAsUnorderedKey) {
-  std::unordered_set<CinnCacheKey, CinnCacheKey::Hash> test_set;
+TEST(CinnCacheKeyTest, TestAsUnorderedKeyByStructure) {
+  std::unordered_set<CinnCacheKeyByStructure, CinnCacheKey::Hash> test_set;
 
   ProgramDesc empty_program;
   ir::Graph empty_graph(empty_program);
...
@@ -47,19 +47,20 @@ TEST(CinnCacheKeyTest, TestAsUnorderedKey) {
   DDim ddim = paddle::framework::make_ddim({1, 2, 3});
   std::map<std::string, DDim> feed_shapes = {{"X", ddim}};
 
-  CinnCacheKey cache_key0(empty_graph, feed_tensors, "x86");
-  CinnCacheKey cache_key1(empty_graph, feed_shapes, "x86");
+  CinnCacheKeyByStructure cache_key0(empty_graph, feed_tensors, "x86");
+  CinnCacheKeyByStructure cache_key1(empty_graph, feed_shapes, "x86");
   EXPECT_EQ(cache_key0, cache_key1);
 
-  CinnCacheKey cache_key2(graph, feed_shapes, "x86");
-  CinnCacheKey cache_key3(graph, feed_shapes, "nvgpu");
-  CinnCacheKey cache_key4(graph, feed_tensors, "nvgpu");
+  CinnCacheKeyByStructure cache_key2(graph, feed_shapes, "x86");
+  CinnCacheKeyByStructure cache_key3(graph, feed_shapes, "nvgpu");
+  CinnCacheKeyByStructure cache_key4(graph, feed_tensors, "nvgpu");
   EXPECT_NE(cache_key2, cache_key3);
   EXPECT_EQ(cache_key3, cache_key4);
 
-  CinnCacheKey cache_key5(empty_graph,
-                          std::map<std::string, const LoDTensor*>(), "unk");
-  CinnCacheKey cache_key6(empty_graph, std::map<std::string, DDim>(), "unk");
+  CinnCacheKeyByStructure cache_key5(
+      empty_graph, std::map<std::string, const LoDTensor*>(), "unk");
+  CinnCacheKeyByStructure cache_key6(empty_graph,
+                                     std::map<std::string, DDim>(), "unk");
   EXPECT_EQ(cache_key5, cache_key6);
 
   EXPECT_NE(cache_key1, cache_key3);
...
@@ -98,6 +99,107 @@ TEST(CinnCacheKeyTest, TestAsUnorderedKey) {
   EXPECT_EQ(test_set.find(cache_key6), test_set.end());
 }
 
+TEST(CinnCacheKeyTest, TestAsUnorderedKeyByAddress) {
+  std::unordered_set<CinnCacheKeyByAddress, CinnCacheKey::Hash> test_set;
+
+  ProgramDesc empty_program;
+  ir::Graph empty_graph(empty_program);
+
+  ProgramDesc program;
+  auto* global_block = program.MutableBlock(0);
+  auto* x = global_block->Var("X");
+  x->SetType(proto::VarType::LOD_TENSOR);
+  ir::Graph graph(program);
+
+  LoDTensor tensor;
+  tensor.Resize({1, 2, 3});
+  const LoDTensor* tensor_pointer = &tensor;
+  std::map<std::string, const LoDTensor*> feed_tensors = {
+      {"X", tensor_pointer}};
+
+  DDim ddim = paddle::framework::make_ddim({1, 2, 3});
+  std::map<std::string, DDim> feed_shapes = {{"X", ddim}};
+
+  CinnCacheKeyByAddress cache_key0(empty_graph, feed_tensors, "x86");
+  CinnCacheKeyByAddress cache_key1(empty_graph, feed_shapes, "x86");
+  EXPECT_EQ(cache_key0, cache_key1);
+
+  CinnCacheKeyByAddress cache_key2(graph, feed_shapes, "x86");
+  CinnCacheKeyByAddress cache_key3(graph, feed_shapes, "nvgpu");
+  CinnCacheKeyByAddress cache_key4(graph, feed_tensors, "nvgpu");
+  EXPECT_NE(cache_key2, cache_key3);
+  EXPECT_EQ(cache_key3, cache_key4);
+
+  CinnCacheKeyByAddress cache_key5(
+      empty_graph, std::map<std::string, const LoDTensor*>(), "unk");
+  CinnCacheKeyByAddress cache_key6(empty_graph,
+                                   std::map<std::string, DDim>(), "unk");
+  EXPECT_EQ(cache_key5, cache_key6);
+
+  EXPECT_NE(cache_key1, cache_key3);
+  EXPECT_NE(cache_key4, cache_key2);
+
+  EXPECT_NE(cache_key3, cache_key5);
+  EXPECT_NE(cache_key6, cache_key4);
+
+  EXPECT_NE(cache_key5, cache_key1);
+  EXPECT_NE(cache_key2, cache_key6);
+
+  test_set.insert(cache_key0);
+  test_set.insert(cache_key1);
+  test_set.insert(cache_key3);
+  test_set.insert(cache_key4);
+  test_set.insert(cache_key5);
+  test_set.insert(cache_key6);
+  EXPECT_EQ(test_set.size(), 3U);
+
+  auto iter = test_set.find(cache_key0);
+  EXPECT_NE(iter, test_set.end());
+  test_set.erase(iter);
+  EXPECT_EQ(test_set.size(), 2U);
+  EXPECT_EQ(test_set.find(cache_key1), test_set.end());
+
+  iter = test_set.find(cache_key3);
+  EXPECT_NE(iter, test_set.end());
+  test_set.erase(iter);
+  EXPECT_EQ(test_set.size(), 1U);
+  EXPECT_EQ(test_set.find(cache_key4), test_set.end());
+
+  iter = test_set.find(cache_key5);
+  EXPECT_NE(iter, test_set.end());
+  test_set.erase(iter);
+  EXPECT_EQ(test_set.size(), 0U);
+  EXPECT_EQ(test_set.find(cache_key6), test_set.end());
+}
+
+TEST(CinnCacheKeyTest, TestSameGraph) {
+  ProgramDesc program1;
+  auto* global_block1 = program1.MutableBlock(0);
+  auto* x1 = global_block1->Var("X");
+  x1->SetType(proto::VarType::LOD_TENSOR);
+  ir::Graph graph1(program1);
+
+  ProgramDesc program2;
+  auto* global_block2 = program2.MutableBlock(0);
+  auto* x2 = global_block2->Var("X");
+  x2->SetType(proto::VarType::LOD_TENSOR);
+  ir::Graph graph2(program2);
+
+  LoDTensor tensor;
+  tensor.Resize({1, 2, 3});
+  const LoDTensor* tensor_pointer = &tensor;
+  std::map<std::string, const LoDTensor*> feed_tensors = {
+      {"X", tensor_pointer}};
+
+  CinnCacheKeyByAddress cache_key_by_address1(graph1, feed_tensors, "x86");
+  CinnCacheKeyByAddress cache_key_by_address2(graph2, feed_tensors, "x86");
+  EXPECT_NE(cache_key_by_address1, cache_key_by_address2);
+
+  CinnCacheKeyByStructure cache_key_by_struct1(graph1, feed_tensors, "x86");
+  CinnCacheKeyByStructure cache_key_by_struct2(graph2, feed_tensors, "x86");
+  EXPECT_EQ(cache_key_by_struct1, cache_key_by_struct2);
+}
+
 }  // namespace paddle2cinn
 }  // namespace framework
 }  // namespace paddle
paddle/fluid/framework/paddle2cinn/cinn_compiler.cc
...
@@ -41,6 +41,7 @@
 #include "paddle/fluid/framework/rw_lock.h"
 #include "paddle/fluid/framework/tensor.h"
 #include "paddle/fluid/inference/analysis/dot.h"
+#include "paddle/fluid/operators/cinn/cinn_launch_context.h"
 #include "paddle/fluid/platform/enforce.h"
 #include "paddle/fluid/string/string_helper.h"
...
@@ -68,23 +69,41 @@ const CinnCompiledObject& CinnCompiler::Compile(
     const std::map<std::string, const LoDTensor*>& input_tensors,
     const Target& target, void* stream) {
   VLOG(1) << "-- The graph to be compiled is:\n" << VizGraph(graph);
-  CinnCacheKey cur_key(graph, input_tensors, target.arch_str());
+  CinnCacheKeyByAddress cur_key_by_address(graph, input_tensors,
+                                           target.arch_str());
+  CinnCacheKeyByStructure cur_key_by_struct;
+
   bool exist = false;
   {
     AutoRDLock r_guard{&rwlock_};
-    exist = cache_.count(cur_key) != 0;
+    exist = cache_by_address_.count(cur_key_by_address) != 0;
+    // If the graph cannot be found by address, check whether the graph
+    // structure has already been stored in the cache.
+    if (!exist) {
+      // generate the structure cache key
+      cur_key_by_struct.SetKey(graph, input_tensors, target.arch_str());
+
+      // If the graph structure can be found, store the graph address in the
+      // cache for the next query.
+      if (cache_by_struct_.count(cur_key_by_struct) != 0) {
+        exist = true;
+        cache_by_address_[cur_key_by_address] =
+            cache_by_struct_.at(cur_key_by_struct).get();
+      }
+    }
   }
   if (!exist) {
     std::int64_t compiled_num = real_compiled_num_.fetch_add(1);
     auto compiled_res =
         CompileGraph(graph, input_tensors, target, compiled_num, stream);
     AutoWRLock w_guard{&rwlock_};
-    if (!cache_.count(cur_key)) {
-      cache_[cur_key] = std::move(compiled_res);
+    if (!cache_by_struct_.count(cur_key_by_struct)) {
+      cache_by_address_[cur_key_by_address] = compiled_res.get();
+      cache_by_struct_[cur_key_by_struct] = std::move(compiled_res);
     }
   }
   AutoRDLock guard{&rwlock_};
-  const auto& cached_boj = *cache_[cur_key];
+  const auto& cached_boj = *cache_by_address_[cur_key_by_address];
   return cached_boj;
 }
...
@@ -181,7 +200,8 @@ void CinnCompiler::Clear() {
   {
     AutoWRLock guard{&rwlock_};
     graphs_.clear();
-    cache_.clear();
+    cache_by_address_.clear();
+    cache_by_struct_.clear();
   }
   real_compiled_num_.store(0);
 }
...
@@ -217,6 +237,9 @@ std::unique_ptr<CinnCompiledObject> CinnCompiler::CompileGraph(
   *compiled_obj = {std::move(graph_compiler),
                    std::move(compiled_res.runtime_program), scope,
                    symbol.var_model_to_program_map()};
+  compiled_obj->launch_context =
+      std::make_unique<operators::details::CinnLaunchContext>(
+          compiled_obj->paddle2cinn_varmap, compiled_obj->scope);
   return compiled_obj;
 }
...
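Stripped of locking and the real key types, the lookup order this hunk introduces is a two-level cache; a simplified sketch, where AddressKey, StructureKey, by_address, and by_struct are illustrative stand-ins for the types and members above:

// Simplified two-level lookup (not the real implementation):
CinnCompiledObject* Lookup(const AddressKey& akey, const StructureKey& skey) {
  if (auto it = by_address.find(akey); it != by_address.end())
    return it->second;                    // fast path: same Graph object
  if (auto it = by_struct.find(skey); it != by_struct.end()) {
    by_address[akey] = it->second.get();  // back-fill for the next query
    return it->second.get();
  }
  return nullptr;  // caller compiles, then inserts into both maps
}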
paddle/fluid/framework/paddle2cinn/cinn_compiler.h
...
@@ -31,6 +31,13 @@
 #include "paddle/fluid/platform/macros.h"
 
 namespace paddle {
+
+namespace operators {
+namespace details {
+class CinnLaunchContext;
+}  // namespace details
+}  // namespace operators
+
 namespace framework {
 namespace paddle2cinn {
...
@@ -39,6 +46,7 @@ struct CinnCompiledObject {
   std::unique_ptr<::cinn::hlir::framework::Program> runtime_program;
   std::shared_ptr<::cinn::hlir::framework::Scope> scope;
   std::unordered_map<std::string, std::string> paddle2cinn_varmap;
+  std::unique_ptr<operators::details::CinnLaunchContext> launch_context;
 };
 
 // Entrance to use CINN.
...
@@ -87,9 +95,12 @@ class CinnCompiler {
                         void* stream = nullptr) const;
 
   std::unordered_map<std::string, std::unique_ptr<ir::Graph>> graphs_;
-  std::unordered_map<CinnCacheKey, std::unique_ptr<CinnCompiledObject>,
-                     CinnCacheKey::Hash>
-      cache_;
+  std::unordered_map<CinnCacheKeyByAddress, CinnCompiledObject*,
+                     CinnCacheKey::Hash>
+      cache_by_address_;
+  std::unordered_map<CinnCacheKeyByStructure,
+                     std::unique_ptr<CinnCompiledObject>, CinnCacheKey::Hash>
+      cache_by_struct_;
   std::atomic_int64_t real_compiled_num_{0};
   mutable RWLock rwlock_;
...
paddle/fluid/framework/pten_utils.cc
...
@@ -15,6 +15,7 @@ limitations under the License. */
 #include <sstream>
 
 #include "paddle/fluid/framework/pten_utils.h"
+#include "paddle/pten/core/convert_utils.h"
 #include "paddle/pten/core/kernel_factory.h"
 
 #include "paddle/fluid/framework/lod_tensor.h"
...
@@ -190,8 +191,9 @@ KernelArgsNameMakerByOpProto::GetAttrsArgsNames() {
 }
 
 KernelSignature KernelArgsNameMakerByOpProto::GetKernelSignature() {
-  return KernelSignature(op_proto_->type(), GetInputArgsNames(),
-                         GetAttrsArgsNames(), GetOutputArgsNames());
+  return KernelSignature(pten::TransToPtenKernelName(op_proto_->type()),
+                         GetInputArgsNames(), GetAttrsArgsNames(),
+                         GetOutputArgsNames());
 }
 
 std::string KernelSignatureToString(const KernelSignature& signature) {
...
paddle/fluid/framework/tensor_util.cc
...
@@ -76,6 +76,22 @@ void TensorCopy(const Tensor& src, const platform::Place& dst_place,
     memory::Copy(BOOST_GET_CONST(platform::CPUPlace, dst_place), dst_ptr,
                  BOOST_GET_CONST(platform::CPUPlace, src_place), src_ptr, size);
   }
+#ifdef PADDLE_WITH_IPU
+  else if (platform::is_ipu_place(src_place) &&  // NOLINT
+           platform::is_cpu_place(dst_place)) {
+    memory::Copy(BOOST_GET_CONST(platform::CPUPlace, dst_place), dst_ptr,
+                 BOOST_GET_CONST(platform::IPUPlace, src_place), src_ptr, size);
+  } else if (platform::is_cpu_place(src_place) &&
+             platform::is_ipu_place(dst_place)) {
+    memory::Copy(BOOST_GET_CONST(platform::IPUPlace, dst_place), dst_ptr,
+                 BOOST_GET_CONST(platform::CPUPlace, src_place), src_ptr, size);
+  } else if (platform::is_ipu_place(src_place) &&
+             platform::is_ipu_place(dst_place)) {
+    memory::Copy(BOOST_GET_CONST(platform::IPUPlace, dst_place), dst_ptr,
+                 BOOST_GET_CONST(platform::IPUPlace, src_place), src_ptr, size);
+  }
+#endif
 #ifdef PADDLE_WITH_XPU
   else if (platform::is_xpu_place(src_place) &&  // NOLINT
            platform::is_cpu_place(dst_place)) {
...
@@ -386,17 +402,33 @@ void TensorCopySync(const Tensor& src, const platform::Place& dst_place,
     memory::Copy(BOOST_GET_CONST(platform::CPUPlace, dst_place), dst_ptr,
                  BOOST_GET_CONST(platform::CPUPlace, src_place), src_ptr, size);
   }
+#ifdef PADDLE_WITH_IPU
+  else if (platform::is_ipu_place(src_place) &&  // NOLINT
+           platform::is_cpu_place(dst_place)) {
+    memory::Copy(BOOST_GET_CONST(platform::CPUPlace, dst_place), dst_ptr,
+                 BOOST_GET_CONST(platform::IPUPlace, src_place), src_ptr, size);
+  } else if (platform::is_cpu_place(src_place) &&  // NOLINT
+             platform::is_ipu_place(dst_place)) {
+    memory::Copy(BOOST_GET_CONST(platform::IPUPlace, dst_place), dst_ptr,
+                 BOOST_GET_CONST(platform::CPUPlace, src_place), src_ptr, size);
+  } else {  // NOLINT
+    PADDLE_THROW(platform::errors::Unimplemented(
+        "Copy from %s to %s is not supported.", src_place, dst_place));
+  }
+#endif
 #ifdef PADDLE_WITH_XPU
   else if (platform::is_xpu_place(src_place) &&  // NOLINT
            platform::is_cpu_place(dst_place)) {
     memory::Copy(BOOST_GET_CONST(platform::CPUPlace, dst_place), dst_ptr,
                  BOOST_GET_CONST(platform::XPUPlace, src_place), src_ptr, size);
   } else if (platform::is_cpu_place(src_place) &&  // NOLINT
              platform::is_xpu_place(dst_place)) {
     memory::Copy(BOOST_GET_CONST(platform::XPUPlace, dst_place), dst_ptr,
                  BOOST_GET_CONST(platform::CPUPlace, src_place), src_ptr, size);
   } else if (platform::is_xpu_place(src_place) &&  // NOLINT
              platform::is_xpu_place(dst_place)) {
     if (src_ptr == dst_ptr) {
       VLOG(3) << "Skip copy the same data async from " << src_place << " to "
               << dst_place;
...
@@ -404,7 +436,8 @@ void TensorCopySync(const Tensor& src, const platform::Place& dst_place,
     }
     memory::Copy(BOOST_GET_CONST(platform::XPUPlace, dst_place), dst_ptr,
                  BOOST_GET_CONST(platform::XPUPlace, src_place), src_ptr, size);
-  } else {  // NOLINT
+  }
+  else {  // NOLINT
     PADDLE_THROW(platform::errors::Unimplemented(
         "Copy from %s to %s is not supported.", src_place, dst_place));
   }
...
@@ -571,6 +604,11 @@ class AnyVisitor : public boost::static_visitor<bool> {
         platform::errors::Unimplemented("Not supported on place (%s) ", npu));
     // return GetResultHelper(out, npu);
   }
 
+  bool GetResult(const framework::Tensor& out,
+                 const platform::IPUPlace& ipu) const {
+    PADDLE_THROW(
+        platform::errors::Unimplemented("Not supported on place (%s) ", ipu));
+  }
+
   bool GetResult(const framework::Tensor& out,
                  const platform::NPUPinnedPlace& cpu) const {
...
@@ -762,6 +800,9 @@ struct BothFalseVisitor : public boost::static_visitor<> {
   void VisitorImpl(const platform::XPUPlace& xpu) const {
     PADDLE_THROW(platform::errors::Unimplemented("XPUPlace is not supported"));
   }
+  void VisitorImpl(const platform::IPUPlace& ipu) const {
+    PADDLE_THROW(platform::errors::Unimplemented("IPUPlace is not supported"));
+  }
   void VisitorImpl(const platform::CUDAPlace& gpu) const {
 #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
...
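With the new branches, a host/IPU staging copy can go through the ordinary TensorCopySync entry point; a hedged round-trip sketch, assuming a PADDLE_WITH_IPU build and a default-constructed IPUPlace:

// Round-trip sketch using the branches added above:
framework::Tensor cpu_tensor, ipu_tensor;
cpu_tensor.Resize({2, 3});
cpu_tensor.mutable_data<float>(platform::CPUPlace());
framework::TensorCopySync(cpu_tensor, platform::IPUPlace(), &ipu_tensor);
framework::TensorCopySync(ipu_tensor, platform::CPUPlace(), &cpu_tensor);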
paddle/fluid/imperative/gradient_accumulator.cc
...
@@ -155,6 +155,13 @@ class TensorAddFunctor : public boost::static_visitor<> {
                                       "is not supported in imperative mode",
                                       place));
   }
+  // there is NO support in IPUPlace
+  void operator()(const platform::IPUPlace& place) {
+    PADDLE_THROW(platform::errors::PermissionDenied(
+        "Gradient accumulation on place (%s) "
+        "is not supported in imperative mode",
+        place));
+  }
 
  private:
   int64_t numel_;
...
paddle/fluid/imperative/prepared_operator.cc
...
@@ -487,6 +487,14 @@ static void PreparedOpRunImpl(
         op.Type(), outs, dev_ctx->GetPlace());
   }
 
+  if (FLAGS_benchmark) {
+    dev_ctx->Wait();
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
+    PADDLE_ENFORCE_GPU_SUCCESS(platform::GpuGetLastError());
+    VLOG(4) << "Operator(" << op.Type() << "): context wait and get last error";
+#endif
+  }
+
   /**
    * [ Why need handle complex gradient to real gradient? ]
    *
...
paddle/fluid/imperative/reducer.cc
...
@@ -211,70 +211,6 @@ void SplitTensorsWithType<platform::XPUDeviceContext>(
 }
 #endif
 
-// NOTE(liubo48): Only implement operators::math::SplitFunctor for npu now.
-// If later the operators::StridedMemcpyWithAxis0 is supported,
-// then this specific SplitTensorsForAllReduce can be removed.
-#ifdef PADDLE_WITH_ASCEND_CL
-template <>
-void SplitTensorsForAllReduce<platform::NPUDeviceContext, float>(
-    const platform::NPUDeviceContext& context,
-    framework::Variable* p_dense_contents,
-    std::vector<framework::Tensor>* p_dense_tensors) {
-  auto* in = p_dense_contents->GetMutable<framework::LoDTensor>();
-  std::vector<framework::Tensor*> outs;
-  std::vector<const framework::Tensor*> shape_refer;
-
-  outs.reserve(p_dense_tensors->size());
-  shape_refer.reserve(p_dense_tensors->size());
-
-  for (auto& tensor : *p_dense_tensors) {
-    outs.emplace_back(&tensor);
-    shape_refer.emplace_back(&tensor);
-  }
-  operators::math::SplitFunctor<platform::NPUDeviceContext, float>
-      split_functor_;
-  split_functor_(context, *in, shape_refer, 0, &outs);
-}
-
-template <>
-void ConcatTensorsWithType<platform::NPUDeviceContext>(
-    const platform::NPUDeviceContext& context,
-    const std::vector<framework::Tensor>& dense_tensors_,
-    framework::Variable* p_dense_contents,
-    framework::proto::VarType::Type type) {
-  switch (type) {
-    case framework::proto::VarType::FP32:
-      ConcatTensorsForAllReduce<platform::NPUDeviceContext, float>(
-          context, dense_tensors_, p_dense_contents);
-      break;
-    default:
-      PADDLE_THROW(platform::errors::Unimplemented(
-          "Data type (%s) is not supported when it concats tensors for "
-          "allreduce.",
-          framework::DataTypeToString(type)));
-  }
-}
-
-template <>
-void SplitTensorsWithType<platform::NPUDeviceContext>(
-    const platform::NPUDeviceContext& context,
-    framework::Variable* p_dense_contents,
-    std::vector<framework::Tensor>* p_dense_tensors,
-    framework::proto::VarType::Type type) {
-  switch (type) {
-    case framework::proto::VarType::FP32:
-      SplitTensorsForAllReduce<platform::NPUDeviceContext, float>(
-          context, p_dense_contents, p_dense_tensors);
-      break;
-    default:
-      PADDLE_THROW(platform::errors::Unimplemented(
-          "Data type (%s) is not supported when it splits tensors for "
-          "allreduce.",
-          framework::DataTypeToString(type)));
-  }
-}
-#endif
-
 void Group::ConcatTensors(const platform::DeviceContext& context) {
   auto place = context.GetPlace();
   if (platform::is_gpu_place(place)) {
...
paddle/fluid/memory/allocation/allocator_facade.cc
...
@@ -348,13 +348,14 @@ class AllocatorFacadePrivate {
   const AllocatorMap& GetAllocatorMap() {
 #ifdef PADDLE_WITH_CUDA
-    if (UNLIKELY(platform::CUDAGraph::IsCapturing())) {
+    if (UNLIKELY(platform::CUDAGraph::IsThisThreadCapturing())) {
       auto id = platform::CUDAGraph::CapturingID();
       auto iter = cuda_graph_allocator_map_.find(id);
       PADDLE_ENFORCE_NE(
           iter, cuda_graph_allocator_map_.end(),
           platform::errors::PermissionDenied(
               "No memory pool is prepared for CUDA Graph capturing."));
+      VLOG(10) << "Choose CUDA Graph memory pool to allocate memory";
       return iter->second->allocators_;
     } else {
       return allocators_;
...
@@ -405,7 +406,7 @@ class AllocatorFacadePrivate {
 #if defined(PADDLE_WITH_HIP)
     auto cuda_allocator = std::make_shared<CUDAAllocator>(p);
     cuda_allocators_[p][stream] = std::make_shared<AutoGrowthBestFitAllocator>(
-        cuda_allocator, platform::GpuMinChunkSize(), allow_free_idle_chunk_);
+        cuda_allocator, platform::GpuMinChunkSize(), 0, allow_free_idle_chunk_);
 #endif
 #if defined(PADDLE_WITH_CUDA)
...
paddle/fluid/memory/allocation/naive_best_fit_allocator.cc
...
@@ -116,6 +116,34 @@ size_t Used<platform::CPUPlace>(const platform::CPUPlace &place) {
...
@@ -116,6 +116,34 @@ size_t Used<platform::CPUPlace>(const platform::CPUPlace &place) {
   return GetCPUBuddyAllocator()->Used();
 }
 
+// For Graphcore IPU
+template <>
+void *Alloc<platform::IPUPlace>(const platform::IPUPlace &place, size_t size) {
+  VLOG(10) << "Allocate " << size << " bytes on " << platform::Place(place);
+  VLOG(10) << "IPUPlace, Allocate on cpu.";
+
+  void *p = GetCPUBuddyAllocator()->Alloc(size);
+  if (FLAGS_init_allocated_mem) {
+    memset(p, 0xEF, size);
+  }
+  VLOG(10) << "  pointer=" << p;
+  return p;
+}
+
+template <>
+void Free<platform::IPUPlace>(const platform::IPUPlace &place, void *p,
+                              size_t size) {
+  VLOG(10) << "Free pointer=" << p << " on " << platform::Place(place);
+  GetCPUBuddyAllocator()->Free(p);
+}
+
+template <>
+uint64_t Release<platform::IPUPlace>(const platform::IPUPlace &place) {
+  return GetCPUBuddyAllocator()->Release();
+}
+
+template <>
+size_t Used<platform::IPUPlace>(const platform::IPUPlace &place) {
+  return GetCPUBuddyAllocator()->Used();
+}
+
 // For kunlun XPU
 template <>
 void *Alloc<platform::XPUPlace>(const platform::XPUPlace &place, size_t size) {
...
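The IPU specializations above simply forward to the host buddy allocator, because IPU tensors are staged in host memory. A stripped-down sketch of that place-tag dispatch pattern (CpuPlace, IpuPlace, and this Alloc are illustrative, not Paddle's):

#include <cstdlib>
#include <iostream>

struct CpuPlace {};
struct IpuPlace {};  // device whose tensors are staged in host memory

template <typename Place>
void* Alloc(const Place&, size_t size);

template <>
void* Alloc<CpuPlace>(const CpuPlace&, size_t size) {
  return std::malloc(size);  // stand-in for the CPU buddy allocator
}

template <>
void* Alloc<IpuPlace>(const IpuPlace&, size_t size) {
  // same backing store as CPU: the device runtime copies data on/off device
  return Alloc(CpuPlace{}, size);
}

int main() {
  void* p = Alloc(IpuPlace{}, 64);
  std::cout << "allocated " << p << "\n";
  std::free(p);
}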
paddle/fluid/operators/cinn/cinn_launch_context.cc
@@ -32,9 +32,34 @@ CinnLaunchContext::CinnLaunchContext(
       [](const auto& name_view) { return std::string(name_view.data()); });
 }
 
-bool CinnLaunchContext::IsVariableUsed(const std::string& paddle_name) {
-  return paddle2cinn_varmap_.count(paddle_name) > 0 &&
-         cinn_variable_names_.count(paddle2cinn_varmap_.at(paddle_name)) > 0;
+void CinnLaunchContext::UpdateCapturedEnv(const framework::Scope& scope,
+                                          const platform::Place& place) {
+  if (std::addressof(scope) == cached_scope_ &&
+      std::addressof(place) == cached_place_) {
+    VLOG(4) << "Captured scope:" << cached_scope_
+            << ", place:" << cached_place_ << " are not changed";
+    return;
+  }
+  cached_scope_ = std::addressof(scope);
+  cached_place_ = std::addressof(place);
+  cached_temp_scope_ = scope.NewTmpScope();
+  VLOG(4) << "Captured env is update, scope:" << cached_scope_ << "->"
+          << std::addressof(scope) << ", place:" << cached_place_ << "->"
+          << std::addressof(place);
+}
+
+bool CinnLaunchContext::IsArgumentsInitialized() const {
+  if (hold_buffers_.empty() || name2argument_.empty()) {
+    return false;
+  }
+  return true;
+}
+
+bool CinnLaunchContext::IsVariableUsed(
+    const std::string& paddle_var_name) const {
+  return paddle2cinn_varmap_.count(paddle_var_name) > 0 &&
+         cinn_variable_names_.count(paddle2cinn_varmap_.at(paddle_var_name)) >
+             0;
 }
 
 CinnTensor CinnLaunchContext::GetCinnTensor(const std::string& var_name) {
...
@@ -53,99 +78,101 @@ std::unordered_set<std::string> CinnLaunchContext::GetInternalVariableNames() {
   return all_parameters;
 }
 
-void CinnLaunchContext::CheckTensorEquivalent(const std::string& paddle_name,
-                                              const LoDTensor& paddle_tensor,
-                                              const CinnTensor& cinn_tensor) {
+void CinnLaunchContext::CheckTensorEquivalent(
+    const std::string& paddle_var_name, const LoDTensor& paddle_tensor,
+    const CinnTensor& cinn_tensor) {
   // check dimension
   auto cinn_dims = framework::make_ddim(cinn_tensor->shape().data());
   PADDLE_ENFORCE_EQ(paddle_tensor.dims(), cinn_dims,
                     platform::errors::PreconditionNotMet(
                         "Tensors' shape in variable(%s) are not equivalent, "
                         "paddle's shape = [%s], but cinn's shape = [%s].",
-                        paddle_name, paddle_tensor.dims(), cinn_dims));
+                        paddle_var_name, paddle_tensor.dims(), cinn_dims));
 
   // TODO(CtfGo): check the underlying data type after CINN ready
 }
 
-void CinnLaunchContext::AssignExternalVariable(const std::string& paddle_name,
-                                               const platform::Place& place,
-                                               LoDTensor* paddle_tensor) {
-  PADDLE_ENFORCE_EQ(IsVariableUsed(paddle_name), true,
-                    platform::errors::InvalidArgument(
-                        "Paddle variable(%s) not used by cinn", paddle_name));
-
-  const auto& cinn_name = paddle2cinn_varmap_.at(paddle_name);
-  CinnTensor cinn_tensor = GetCinnTensor(cinn_name);
-  if (!paddle_tensor->IsInitialized()) {
-    paddle_tensor->Resize(framework::make_ddim(cinn_tensor->shape().data()));
-  }
-  CheckTensorEquivalent(paddle_name, *paddle_tensor, cinn_tensor);
-  return SetArgument(cinn_name, place, /* free_mem_callback = */ false,
-                     paddle_tensor);
-}
-
-void CinnLaunchContext::AssignInternalVariable(const std::string& cinn_name,
-                                               const platform::Place& place,
-                                               LoDTensor* paddle_tensor) {
-  PADDLE_ENFORCE_GT(cinn_variable_names_.count(cinn_name), 0,
-                    platform::errors::InvalidArgument(
-                        "Variable(%s) not found in cinn socpe.", cinn_name));
-  CinnTensor cinn_tensor = GetCinnTensor(cinn_name);
-  if (!paddle_tensor->IsInitialized()) {
-    paddle_tensor->Resize(framework::make_ddim(cinn_tensor->shape().data()));
-  }
-  CheckTensorEquivalent(cinn_name, *paddle_tensor, cinn_tensor);
-  return SetArgument(cinn_name, place, /* free_mem_callback = */ true,
-                     paddle_tensor);
-}
-
-std::unique_ptr<cinn_buffer_t> CinnLaunchContext::ShareTensorWithCinnBuffer(
-    const platform::Place& place, bool free_mem_callback, LoDTensor* tensor) {
-  // convert paddle dimensions array to cinn format
-  std::vector<cinn_dimension_t> cinn_dims(tensor->dims().size());
-  for (auto i = 0; i < tensor->dims().size(); ++i) {
-    cinn_dims[i] = static_cast<cinn_dimension_t>(tensor->dims().at(i));
-  }
-
-  auto cinn_buffer = std::make_unique<cinn_buffer_t>();
-  // assign size and memory
-  cinn_buffer->resize(cinn_dims.data(), cinn_dims.size());
-  cinn_buffer->external_malloc = new std::function<int(void*, cinn_buffer_t*)>(
-      [place, tensor](void* ctx, cinn_buffer_t* buffer) {
-        buffer->memory =
-            reinterpret_cast<uint8_t*>(tensor->mutable_data<float>(place));
-        return 0;
-      });
-
-  if (free_mem_callback) {
-    cinn_buffer->external_free = new std::function<int(void*, cinn_buffer_t*)>(
-        [tensor](void* ctx, cinn_buffer_t* buffer) {
-          tensor->clear();
-          return 0;
-        });
-    return cinn_buffer;
-  }
-
-  cinn_buffer->external_free = new std::function<int(void*, cinn_buffer_t*)>(
-      [](void* ctx, cinn_buffer_t* buffer) {
-        // Do nothing
-        return 0;
-      });
-  return cinn_buffer;
-}
-
-void CinnLaunchContext::SetArgument(const std::string& cinn_name,
-                                    const platform::Place& place,
-                                    bool free_mem_callback,
-                                    LoDTensor* paddle_tensor) {
-  auto buffer =
-      ShareTensorWithCinnBuffer(place, free_mem_callback, paddle_tensor);
-  name2argument_.emplace(cinn_name, buffer.get());
-  hold_buffers_.emplace_back(std::move(buffer));
-  VLOG(4) << "SetArgument-" << name2argument_.size() << ": "
-          << "name(" << cinn_name << "), dims(" << paddle_tensor->dims()
-          << ").";
-}
+void CinnLaunchContext::AssignExternalVariable(
+    const std::string& paddle_var_name) {
+  PADDLE_ENFORCE_EQ(IsVariableUsed(paddle_var_name), true,
+                    platform::errors::InvalidArgument(
+                        "Paddle variable(%s) not used by cinn",
+                        paddle_var_name));
+
+  const auto& cinn_var_name = paddle2cinn_varmap_.at(paddle_var_name);
+  const auto& paddle_tensor =
+      cached_scope_->GetVar(paddle_var_name)->Get<LoDTensor>();
+  CinnTensor cinn_tensor = GetCinnTensor(cinn_var_name);
+  if (paddle_tensor.IsInitialized()) {
+    CheckTensorEquivalent(paddle_var_name, paddle_tensor, cinn_tensor);
+  }
+
+  auto cinn_buffer = std::make_unique<cinn_buffer_t>();
+  // assign dimensions and alloc/free callback of cinn_buffer_t
+  cinn_buffer->resize(cinn_tensor->shape().data().data(),
+                      cinn_tensor->shape().data().size());
+  cinn_buffer->external_malloc = new std::function<int(void*, cinn_buffer_t*)>(
+      [this, paddle_var_name](void* ctx, cinn_buffer_t* buffer) {
+        auto* tensor =
+            cached_scope_->GetVar(paddle_var_name)->GetMutable<LoDTensor>();
+        tensor->Resize(framework::DDim(buffer->dims, buffer->dimensions));
+        buffer->memory = reinterpret_cast<uint8_t*>(
+            tensor->mutable_data<float>(*cached_place_));
+        return 0;
+      });
+
+  // external variables will be recycled by global gc, so do nothing here
+  cinn_buffer->external_free = new std::function<int(void*, cinn_buffer_t*)>(
+      [](void* ctx, cinn_buffer_t* buffer) {
+        // Do nothing
+        return 0;
+      });
+
+  return SetArgument(cinn_var_name, std::move(cinn_buffer));
+}
+
+void CinnLaunchContext::AssignInternalVariable(
+    const std::string& cinn_var_name) {
+  PADDLE_ENFORCE_GT(cinn_variable_names_.count(cinn_var_name), 0,
+                    platform::errors::InvalidArgument(
+                        "Variable(%s) not found in cinn socpe.",
+                        cinn_var_name));
+  CinnTensor cinn_tensor = GetCinnTensor(cinn_var_name);
+  auto cinn_buffer = std::make_unique<cinn_buffer_t>();
+  // assign dimensions and alloc/free callback of cinn_buffer_t
+  cinn_buffer->resize(cinn_tensor->shape().data().data(),
+                      cinn_tensor->shape().data().size());
+  cinn_buffer->external_malloc = new std::function<int(void*, cinn_buffer_t*)>(
+      [this, cinn_var_name](void* ctx, cinn_buffer_t* buffer) {
+        auto* tensor =
+            cached_temp_scope_->Var(cinn_var_name)->GetMutable<LoDTensor>();
+        tensor->Resize(framework::DDim(buffer->dims, buffer->dimensions));
+        buffer->memory = reinterpret_cast<uint8_t*>(
+            tensor->mutable_data<float>(*cached_place_));
+        return 0;
+      });
+
+  // internal variables should release its buffer immediately
+  // if no instruction use it
+  cinn_buffer->external_free = new std::function<int(void*, cinn_buffer_t*)>(
+      [this, cinn_var_name](void* ctx, cinn_buffer_t* buffer) {
+        auto* tensor =
+            cached_temp_scope_->GetVar(cinn_var_name)->GetMutable<LoDTensor>();
+        tensor->clear();
+        return 0;
+      });
+
+  return SetArgument(cinn_var_name, std::move(cinn_buffer));
+}
+
+void CinnLaunchContext::SetArgument(const std::string& cinn_var_name,
+                                    std::unique_ptr<cinn_buffer_t>&& buffer) {
+  VLOG(4) << "SetArgument-" << name2argument_.size() << ": name("
+          << cinn_var_name << "), dims("
+          << framework::DDim(buffer->dims, buffer->dimensions) << ").";
+
+  name2argument_.emplace(cinn_var_name, buffer.get());
+  hold_buffers_.emplace_back(std::move(buffer));
+}
 
 const std::map<std::string, cinn_pod_value_t>&
...
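The UpdateCapturedEnv change above caches the scope and place by address so the buffer callbacks captured earlier keep pointing at live objects, and skips re-initialization when nothing changed between runs. A stripped-down sketch of that pattern (Env and Ctx are illustrative, not Paddle's classes):

#include <iostream>
#include <memory>

struct Env { int id; };

class Ctx {
 public:
  void UpdateCapturedEnv(const Env& env) {
    if (&env == cached_env_) return;    // unchanged: keep prepared arguments
    cached_env_ = &env;
    state_ = std::make_unique<int>(0);  // rebuild per-environment state
  }
  bool Initialized() const { return state_ != nullptr; }

 private:
  const Env* cached_env_ = nullptr;
  std::unique_ptr<int> state_;
};

int main() {
  Env e{1};
  Ctx c;
  c.UpdateCapturedEnv(e);
  c.UpdateCapturedEnv(e);  // second call with the same env is a no-op
  std::cout << c.Initialized() << "\n";  // 1
}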
paddle/fluid/operators/cinn/cinn_launch_context.h
@@ -24,7 +24,7 @@
#include "cinn/runtime/cinn_runtime.h"
#include "cinn/runtime/cinn_runtime.h"
#include "paddle/fluid/framework/ddim.h"
#include "paddle/fluid/framework/ddim.h"
#include "paddle/fluid/framework/lod_tensor.h"
#include "paddle/fluid/framework/lod_tensor.h"
#include "paddle/fluid/
platform/plac
e.h"
#include "paddle/fluid/
framework/scop
e.h"
namespace
paddle
{
namespace
paddle
{
namespace
operators
{
namespace
operators
{
...
@@ -40,16 +40,22 @@ class CinnLaunchContext {
...
@@ -40,16 +40,22 @@ class CinnLaunchContext {
       const std::unordered_map<std::string, std::string>& paddle2cinn_varmap,
       const std::shared_ptr<CinnScope>& cinn_scope);
 
+  // explicitly update several environment variables captured
+  // by callback of execution arguments
+  void UpdateCapturedEnv(const framework::Scope& scope,
+                         const platform::Place& place);
+
+  // Return whether execution arguments has been initialized
+  bool IsArgumentsInitialized() const;
+
   // Return whether a Paddle variable used on compiled kernels
-  bool IsVariableUsed(const std::string& var_name);
+  bool IsVariableUsed(const std::string& paddle_var_name) const;
 
   // Assign tensor buffer to input or output variables
-  void AssignExternalVariable(const std::string& var_name,
-                              const platform::Place& place, LoDTensor* tensor);
+  void AssignExternalVariable(const std::string& paddle_var_name);
 
   // Assign tensor buffer to internal variables
-  void AssignInternalVariable(const std::string& var_name,
-                              const platform::Place& place, LoDTensor* tensor);
+  void AssignInternalVariable(const std::string& cinn_var_name);
 
   // Extract internal variable names from CinnScope
   // by excluding used input and output variables
...
@@ -58,10 +64,6 @@ class CinnLaunchContext {
   // Finalize all execution arguments and return them
   const std::map<std::string, cinn_pod_value_t>& FinalizeArguments() const;
 
-  std::vector<std::unique_ptr<cinn_buffer_t>> HandoverBuffers() {
-    return std::move(hold_buffers_);
-  }
-
  private:
   // Get CinnTensor with CINN variable name
   CinnTensor GetCinnTensor(const std::string& var_name);
...
@@ -72,16 +74,15 @@ class CinnLaunchContext {
                              const LoDTensor& paddle_tensor,
                              const CinnTensor& cinn_tensor);
 
-  // Share the buffer of a Paddle tensor to CINN by delivering memory address
-  // to a cinn_buffer_t object
-  std::unique_ptr<cinn_buffer_t> ShareTensorWithCinnBuffer(
-      const platform::Place& place, bool free_mem_callback, LoDTensor* tensor);
-
-  // Set an argument with (cinn name)->(paddle tensor) pair
-  void SetArgument(const std::string& cinn_name, const platform::Place& place,
-                   bool free_mem_callback, LoDTensor* paddle_tensor);
+  // Set an argument with (cinn name)->(cinn_buffer_t) pair
+  void SetArgument(const std::string& cinn_var_name,
+                   std::unique_ptr<cinn_buffer_t>&& buffer);
 
  private:
+  const framework::Scope* cached_scope_ = nullptr;
+  const platform::Place* cached_place_ = nullptr;
+  std::unique_ptr<framework::Scope> cached_temp_scope_ = nullptr;
+
   // a variable name map from paddle to cinn
   const std::unordered_map<std::string, std::string>& paddle2cinn_varmap_;
   // the variable scope of cinn
...
paddle/fluid/operators/cinn/cinn_launch_context_test.cc
@@ -45,81 +45,86 @@ std::unique_ptr<CinnLaunchContext> CreateDefaultLaunchContext() {
   return std::make_unique<CinnLaunchContext>(paddle2cinn_varmap, cinn_scope);
 }
 
-TEST(CinnLaunchContextTest, TestIsVariableUsed) {
+TEST(CinnLaunchContextTest, TestBasic) {
   auto launch_context = CreateDefaultLaunchContext();
+  // test IsVariableUsed
   ASSERT_EQ(launch_context->IsVariableUsed("var1"), true);
   ASSERT_EQ(launch_context->IsVariableUsed("var4"), false);
-}
-
-TEST(CinnLaunchContextTest, TestGetInternalVariableNames) {
-  auto launch_context = CreateDefaultLaunchContext();
-  auto internal_variable_names = launch_context->GetInternalVariableNames();
-  ASSERT_EQ(internal_variable_names.size(), 3);
-  EXPECT_NE(internal_variable_names.find("cinn_var2"),
-            internal_variable_names.end());
+  // test UpdateCapturedEnv
+  platform::CPUPlace place;
+  framework::Scope scope;
+  ASSERT_NO_THROW(launch_context->UpdateCapturedEnv(scope, place));
+  // test IsArgumentsInitialized
+  ASSERT_FALSE(launch_context->IsArgumentsInitialized());
 }
 
 TEST(CinnLaunchContextTest, TestCheckTensorEquivalent) {
-  auto launch_context = CreateDefaultLaunchContext();
   platform::CPUPlace place;
   framework::Scope scope;
+  auto launch_context = CreateDefaultLaunchContext();
+  launch_context->UpdateCapturedEnv(scope, place);
   auto* tensor1 = scope.Var("var1")->GetMutable<LoDTensor>();
 
   // CheckTensorEquivalent: tensor dimension not equivalent
   tensor1->mutable_data<float>(framework::make_ddim({3, 5}), place);
-  ASSERT_THROW(launch_context->AssignExternalVariable("var1", place, tensor1),
+  ASSERT_THROW(launch_context->AssignExternalVariable("var1"),
                paddle::platform::EnforceNotMet);
 }
 
 TEST(CinnLaunchContextTest, TestAssignVariablePreCondition) {
-  auto launch_context = CreateDefaultLaunchContext();
   platform::CPUPlace place;
   framework::Scope scope;
+  auto launch_context = CreateDefaultLaunchContext();
+  launch_context->UpdateCapturedEnv(scope, place);
   auto* tensor4 = scope.Var("var4")->GetMutable<LoDTensor>();
 
   // not used
-  ASSERT_THROW(launch_context->AssignExternalVariable("var4", place, tensor4),
+  ASSERT_THROW(launch_context->AssignExternalVariable("var4"),
                paddle::platform::EnforceNotMet);
   // not found
   ASSERT_THROW(
-      launch_context->AssignExternalVariable("cinn_var4", place, tensor4),
+      launch_context->AssignInternalVariable("cinn_var4"),
       paddle::platform::EnforceNotMet);
 }
 
 TEST(CinnLaunchContextTest, TestSetArgument) {
-  auto launch_context = CreateDefaultLaunchContext();
-  platform::CPUPlace place;
+  platform::CPUPlace cpu_place;
+  platform::Place place(cpu_place);
   framework::Scope scope;
+  auto launch_context = CreateDefaultLaunchContext();
+  launch_context->UpdateCapturedEnv(scope, place);
 
+  // assign external variables
   auto* tensor1 = scope.Var("var1")->GetMutable<LoDTensor>();
   float* data1 =
       tensor1->mutable_data<float>(framework::make_ddim({3, 4}), place);
   data1[0] = 9.99f;
   data1[10] = 19.99f;
-
-  // assign external variable
-  ASSERT_NO_THROW(
-      launch_context->AssignExternalVariable("var1", place, tensor1));
-
-  auto* tensor2 = scope.Var("var2")->GetMutable<LoDTensor>();
-  tensor2->mutable_data<float>(framework::make_ddim({6, 7, 8}), place);
-  ASSERT_NO_THROW(
-      launch_context->AssignInternalVariable("cinn_var2", place, tensor2));
-
-  // FinalizeArguments not missed check
-  ASSERT_THROW(launch_context->FinalizeArguments(),
-               paddle::platform::EnforceNotMet);
+  ASSERT_NO_THROW(launch_context->AssignExternalVariable("var1"));
 
   auto* tensor3 = scope.Var("var3")->GetMutable<LoDTensor>();
   tensor3->mutable_data<float>(framework::make_ddim({10, 16}), place);
-  ASSERT_NO_THROW(
-      launch_context->AssignExternalVariable("var3", place, tensor3));
+  ASSERT_NO_THROW(launch_context->AssignExternalVariable("var3"));
+
+  // test get internal variables
+  auto internal_variable_names = launch_context->GetInternalVariableNames();
+  ASSERT_EQ(internal_variable_names.size(), 1);
+  EXPECT_EQ(*internal_variable_names.begin(), "cinn_var2");
 
+  auto* tensor2 = scope.Var("var2")->GetMutable<LoDTensor>();
+  tensor2->mutable_data<float>(framework::make_ddim({6, 7, 8}), place);
+  ASSERT_NO_THROW(launch_context->AssignInternalVariable("cinn_var2"));
+
+  // check argument is set correctly and alloc/free callbacks work well
   auto name2argument = launch_context->FinalizeArguments();
   ASSERT_EQ(name2argument.size(), 3);
   ASSERT_EQ(name2argument.count("cinn_var1"), 1);
-  // check ShareTensorWithCinnBuffer
+  ASSERT_TRUE(launch_context->IsArgumentsInitialized());
   auto* cinn_buffer =
       static_cast<cinn_buffer_t*>(name2argument.at("cinn_var1"));
   ASSERT_EQ(cinn_buffer->memory, nullptr);
   cinn_buffer->external_malloc->operator()(nullptr, cinn_buffer);
   ASSERT_NE(cinn_buffer->memory, nullptr);
...
paddle/fluid/operators/cinn/cinn_launch_op.cu.cc
@@ -31,26 +31,6 @@ namespace operators {
 namespace details {
 
 #ifdef PADDLE_WITH_CUDA
-void CUDART_CB ReleaseScope(void* data) {
-  auto* temp_scope = static_cast<framework::Scope*>(data);
-  delete temp_scope;
-}
-
-void CUDART_CB ReleaseBuffers(void* data) {
-  auto* buffers =
-      static_cast<std::vector<std::unique_ptr<cinn_buffer_t>>*>(data);
-  delete buffers;
-}
-
-template <>
-void ReleaseResource<platform::CUDADeviceContext>(
-    const std::vector<void*>& resources, void* stream) {
-  PADDLE_ENFORCE_GPU_SUCCESS(cudaLaunchHostFunc(
-      static_cast<gpuStream_t>(stream), ReleaseScope, resources[0]));
-  PADDLE_ENFORCE_GPU_SUCCESS(cudaLaunchHostFunc(
-      static_cast<gpuStream_t>(stream), ReleaseBuffers, resources[1]));
-}
-
 template <>
 void* GetStream<platform::CUDADeviceContext>(
     const framework::ExecutionContext& ctx) {
...
paddle/fluid/operators/cinn/cinn_launch_op.h
@@ -56,25 +56,12 @@ void LaunchCinnExecution(const CinnCompiledObject& compiled_obj,
 // Set cinn FLAGS (such as FLAGS_cinn_cudnn_deterministic) with paddle's FLAGS.
 void SetCinnRuntimeFlags();
 
-template <typename DeviceContext>
-void ReleaseResource(const std::vector<void*>& resources, void* stream) {
-  auto* temp_scope = static_cast<framework::Scope*>(resources[0]);
-  auto* buffers =
-      static_cast<std::vector<std::unique_ptr<cinn_buffer_t>>*>(resources[1]);
-  delete temp_scope;
-  delete buffers;
-}
-
 template <typename DeviceContext>
 void* GetStream(const framework::ExecutionContext& ctx) {
   return nullptr;
 }
 
 #ifdef PADDLE_WITH_CUDA
-template <>
-void ReleaseResource<platform::CUDADeviceContext>(
-    const std::vector<void*>& resources, void* stream);
-
 template <>
 void* GetStream<platform::CUDADeviceContext>(
     const framework::ExecutionContext& ctx);
...
@@ -116,56 +103,54 @@ class CinnLaunchOpKernel : public framework::OpKernel<T> {
         compilation_key, inputs_name2tensor, target, stream);
     details::DebugCinnCompiledResult(cinn_compiled_object);
 
-    auto launch_context = std::make_unique<details::CinnLaunchContext>(
-        cinn_compiled_object.paddle2cinn_varmap, cinn_compiled_object.scope);
+    auto* launch_context = cinn_compiled_object.launch_context.get();
 
     // Step 3. Prepare arguments needed for the compiled executable program.
-    VLOG(4) << "CinnLaunchOp prepare arguments";
-
-    // 3.1 Prepare input variables: tensors of input variables have
-    //     been initialized before graph compiled, just check the
-    //     equiality between tensors of paddle and cinn.
-    for (const auto& var_name : input_variable_names) {
-      if (!launch_context->IsVariableUsed(var_name)) {
-        // some input variables don't need for cinn because they are
-        // eliminated by optimized passes or some cinn operators use
-        // less variables
-        VLOG(4) << "Input variable(" << var_name << ") not used by cinn";
-        continue;
-      }
-      launch_context->AssignExternalVariable(
-          var_name, place, scope.GetVar(var_name)->GetMutable<LoDTensor>());
-    }
-
-    // 3.2 Prepare output variables: all output variables should
-    //     be initialized and allocated buffer before
-    //     the runtime program start execution, the compilation result
-    //     includes details of their buffer assginment and we use that to
-    //     allocate space in Paddle. For those variables allocated yet,
-    //     like persistable parameters, just check the equiality between
-    //     Paddle allocation and CINN buffer assginment.
-    auto output_variable_names = ctx.OutputNames(kOutputs);
-    for (const auto var_name : output_variable_names) {
-      PADDLE_ENFORCE_EQ(
-          launch_context->IsVariableUsed(var_name), true,
-          platform::errors::InvalidArgument(
-              "Output variable(%s) not used by cinn", var_name));
-      auto* tensor = scope.GetVar(var_name)->GetMutable<LoDTensor>();
-      launch_context->AssignExternalVariable(var_name, place, tensor);
-    }
-
-    // 3.3 Prepare internal or temporary variables: Create a temporary
-    //     scope to keep internal variables within graph or temporary
-    //     variables needed by the compiled runtime program in addition.
-    //     Here we directly use the names from CinnScope as Paddle variable
-    //     names, because they will not be used outside the graph
-    //     and should be destructed after computation finished.
-    auto internal_variable_names = launch_context->GetInternalVariableNames();
-    framework::Scope* temp_scope = scope.NewTmpScope().release();
-    for (const auto& var_name : internal_variable_names) {
-      auto* tensor = temp_scope->Var(var_name)->GetMutable<LoDTensor>();
-      launch_context->AssignInternalVariable(var_name, place, tensor);
+    launch_context->UpdateCapturedEnv(scope, place);
+    if (!launch_context->IsArgumentsInitialized()) {
+      VLOG(4) << "CinnLaunchOp prepare arguments";
+
+      // 3.1 Prepare input variables: tensors of input variables have
+      //     been initialized before graph compiled, just check the
+      //     equiality between tensors of paddle and cinn.
+      for (const auto& var_name : input_variable_names) {
+        if (!launch_context->IsVariableUsed(var_name)) {
+          // some input variables don't need for cinn because they are
+          // eliminated by optimized passes or some cinn operators use
+          // less variables
+          VLOG(4) << "Input variable(" << var_name << ") not used by cinn";
+          continue;
+        }
+        launch_context->AssignExternalVariable(var_name);
+      }
+
+      // 3.2 Prepare output variables: all output variables should
+      //     be initialized and allocated buffer before
+      //     the runtime program start execution, the compilation result
+      //     includes details of their buffer assginment and we use that to
+      //     allocate space in Paddle. For those variables allocated yet,
+      //     like persistable parameters, just check the equiality between
+      //     Paddle allocation and CINN buffer assginment.
+      auto output_variable_names = ctx.OutputNames(kOutputs);
+      for (const auto var_name : output_variable_names) {
+        PADDLE_ENFORCE_EQ(
+            launch_context->IsVariableUsed(var_name), true,
+            platform::errors::InvalidArgument(
+                "Output variable(%s) not used by cinn", var_name));
+        launch_context->AssignExternalVariable(var_name);
+      }
+
+      // 3.3 Prepare internal or temporary variables: Create a temporary
+      //     scope to keep internal variables within graph or temporary
+      //     variables needed by the compiled runtime program in addition.
+      //     Here we directly use the names from CinnScope as Paddle variable
+      //     names, because they will not be used outside the graph
+      //     and should be destructed after computation finished.
+      auto internal_variable_names =
+          launch_context->GetInternalVariableNames();
+      for (const auto& var_name : internal_variable_names) {
+        launch_context->AssignInternalVariable(var_name);
+      }
     }
 
     // Step 4. Set CINN runtime FLAGS, such as FLAGS_cinn_cudnn_deterministic.
...
@@ -175,12 +160,6 @@ class CinnLaunchOpKernel : public framework::OpKernel<T> {
     VLOG(4) << "Run Cinn compiled executable program with stream: " << stream;
     details::LaunchCinnExecution(cinn_compiled_object, *launch_context, stream);
     VLOG(4) << "CinnLaunchOp launch execution done.";
-
-    // Step 6. Release some resources, such as `temp_scope` and cinn_buffers.
-    auto* buffers_holder = new std::vector<std::unique_ptr<cinn_buffer_t>>{
-        launch_context->HandoverBuffers()};
-    details::ReleaseResource<DeviceContext>({temp_scope, buffers_holder},
-                                            stream);
   }
 };
...
paddle/fluid/operators/cinn/cinn_launch_op_test.cc
@@ -130,8 +130,9 @@ TEST(CinnLaunchOpTest, TestElementwiseAddPass) {
   scope.Var(test_out_name)->GetMutable<LoDTensor>();
   scope.Var(expected_out_name)->GetMutable<LoDTensor>();
 
-  cinn_launch_op->Run(scope, place);
-  elementwise_add_op->Run(scope, place);
+  platform::Place run_place(place);
+  cinn_launch_op->Run(scope, run_place);
+  elementwise_add_op->Run(scope, run_place);
 
   LoDTensor test_out, expected_out;
   TensorCopySync(scope.Var(test_out_name)->Get<LoDTensor>(),
...
paddle/fluid/operators/complex_view_op.cc
new file mode 100644
// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include "paddle/fluid/operators/complex_view_op.h"

#include <memory>
#include <string>
#include <unordered_map>
#include <vector>

#include "paddle/fluid/framework/data_type.h"
#include "paddle/fluid/platform/enforce.h"

namespace paddle {
namespace operators {

class AsComplexOp : public framework::OperatorWithKernel {
 public:
  using framework::OperatorWithKernel::OperatorWithKernel;

  void InferShape(framework::InferShapeContext* ctx) const override {
    OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "as_complex");
    OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", "as_complex");

    auto in_dims = ctx->GetInputDim("X");
    const int input_rank = in_dims.size();
    PADDLE_ENFORCE_GE(
        input_rank, 1,
        platform::errors::InvalidArgument(
            "The rank of input(X) is less than 1. "
            "Expected the rank of input(X) to be equal to or greater than 1."
            "But received rank of input(X) = %d",
            input_rank));
    const int last_dim_size = in_dims[input_rank - 1];
    PADDLE_ENFORCE_EQ(
        last_dim_size, 2,
        platform::errors::InvalidArgument(
            "The size of the last dimension of input(X)"
            "does not equals 2."
            "Expected the size of last dimension of input(X) to be 2."
            "But received %d",
            last_dim_size));

    const framework::DDim out_dims(in_dims.Get(), input_rank - 1);
    ctx->SetOutputDim("Out", out_dims);
    ctx->ShareLoD("X", /*->*/ "Out");
  }
};

class AsComplexOpMaker : public framework::OpProtoAndCheckerMaker {
 public:
  void Make() override {
    AddInput("X", "(Tensor), The input tensor of view_as_complex op.");
    AddOutput("Out", "(Tensor), The output tensor of view_as_complex op.");
    AddComment(R"DOC(
As_complex Operator.

This operator is used to return a complex tensor represented
by an old-fashioned real tensor. The size of the last dimension of
the input tensor should be 2, which corresponds to 'real' and
'complex', respectively.

)DOC");
  }
};

template <typename T>
class AsComplexGradMaker : public framework::SingleGradOpMaker<T> {
 public:
  using framework::SingleGradOpMaker<T>::SingleGradOpMaker;

  void Apply(GradOpPtr<T> retv) const override {
    retv->SetType("as_real");
    retv->SetInput("X", this->OutputGrad("Out"));
    retv->SetAttrMap(this->Attrs());
    retv->SetOutput("Out", this->InputGrad("X"));
  }
};

class AsRealOp : public framework::OperatorWithKernel {
 public:
  using framework::OperatorWithKernel::OperatorWithKernel;

  void InferShape(framework::InferShapeContext* ctx) const override {
    OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "as_real");
    OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", "as_real");

    auto out_dims_v = framework::vectorize(ctx->GetInputDim("X"));
    out_dims_v.push_back(2);
    const framework::DDim out_dims = framework::make_ddim(out_dims_v);
    ctx->SetOutputDim("Out", out_dims);
    ctx->ShareLoD("X", /*->*/ "Out");
  }

 protected:
  framework::OpKernelType GetExpectedKernelType(
      const framework::ExecutionContext& ctx) const override {
    auto input_data_type =
        framework::OperatorWithKernel::IndicateVarDataType(ctx, "X");
    return framework::OpKernelType(framework::ToRealType(input_data_type),
                                   ctx.GetPlace());
  }
};

class AsRealOpMaker : public framework::OpProtoAndCheckerMaker {
 public:
  void Make() override {
    AddInput("X", "(Tensor), The input tensor of as_real op.");
    AddOutput("Out", "(Tensor), The output tensor of as_real op.");
    AddComment(R"DOC(
AsReal Operator.

This operator is used to return an old-fashioned real tensor from a
complex tensor. The size of the last dimension of the output tensor is 2,
which corresponds to 'real' and 'complex', respectively.

)DOC");
  }
};

template <typename T>
class AsRealGradMaker : public framework::SingleGradOpMaker<T> {
 public:
  using framework::SingleGradOpMaker<T>::SingleGradOpMaker;

  void Apply(GradOpPtr<T> retv) const override {
    retv->SetType("as_complex");
    retv->SetInput("X", this->OutputGrad("Out"));
    retv->SetAttrMap(this->Attrs());
    retv->SetOutput("Out", this->InputGrad("X"));
  }
};

}  // namespace operators
}  // namespace paddle

namespace ops = paddle::operators;

REGISTER_OPERATOR(as_complex, ops::AsComplexOp, ops::AsComplexOpMaker,
                  ops::AsComplexGradMaker<paddle::framework::OpDesc>,
                  ops::AsComplexGradMaker<paddle::imperative::OpBase>);

REGISTER_OPERATOR(as_real, ops::AsRealOp, ops::AsRealOpMaker,
                  ops::AsRealGradMaker<paddle::framework::OpDesc>,
                  ops::AsRealGradMaker<paddle::imperative::OpBase>);

REGISTER_OP_CPU_KERNEL(
    as_complex,
    ops::AsComplexKernel<paddle::platform::CPUDeviceContext, float>,
    ops::AsComplexKernel<paddle::platform::CPUDeviceContext, double>);

REGISTER_OP_CPU_KERNEL(
    as_real, ops::AsRealKernel<paddle::platform::CPUDeviceContext, float>,
    ops::AsRealKernel<paddle::platform::CPUDeviceContext, double>);
paddle/fluid/operators/complex_view_op.cu
new file mode 100644
// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include "paddle/fluid/operators/complex_view_op.h"

#include "paddle/fluid/framework/data_type.h"
#include "paddle/fluid/platform/enforce.h"

namespace ops = paddle::operators;

REGISTER_OP_CUDA_KERNEL(
    as_complex,
    ops::AsComplexKernel<paddle::platform::CUDADeviceContext, float>,
    ops::AsComplexKernel<paddle::platform::CUDADeviceContext, double>);

REGISTER_OP_CUDA_KERNEL(
    as_real, ops::AsRealKernel<paddle::platform::CUDADeviceContext, float>,
    ops::AsRealKernel<paddle::platform::CUDADeviceContext, double>);
paddle/fluid/operators/complex_view_op.h
new file mode 100644
// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#pragma once

#include "paddle/fluid/framework/eigen.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/operator.h"
#include "paddle/fluid/operators/math/complex_functors.h"
#include "paddle/fluid/platform/complex.h"
#include "paddle/fluid/platform/for_range.h"

namespace paddle {
namespace operators {

template <typename DeviceContext, typename T>
class AsComplexKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& context) const override {
    const auto* x = context.Input<framework::LoDTensor>("X");
    auto* out = context.Output<framework::LoDTensor>("Out");
    out->mutable_data<platform::complex<T>>(context.GetPlace());

    // TensorCopy also changes output's shape & dtype
    const framework::DDim out_dims_original = out->dims();
    framework::TensorCopy(*x, context.GetPlace(), out);
    out->Resize(out_dims_original);  // restored the shape
    out->mutable_data<platform::complex<T>>(
        context.GetPlace());  // restore the dtype
  }
};

template <typename DeviceContext, typename T>
class AsRealKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& context) const override {
    const auto* x = context.Input<framework::LoDTensor>("X");
    auto* out = context.Output<framework::LoDTensor>("Out");
    out->mutable_data<T>(context.GetPlace());
    const framework::DDim out_dims_original = out->dims();
    framework::TensorCopy(*x, context.GetPlace(), out);
    out->Resize(out_dims_original);            // restored the shape
    out->mutable_data<T>(context.GetPlace());  // restore the dtype
  }
};

}  // namespace operators
}  // namespace paddle
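The as_complex/as_real kernels above are pure reinterpretations: a real tensor of shape [..., 2] and a complex tensor of shape [...] share one memory layout, so a byte-for-byte copy plus a shape/dtype fix-up is enough. A standalone sketch of that layout equivalence using the standard library (not Paddle's types):

#include <complex>
#include <cstring>
#include <iostream>
#include <vector>

int main() {
  // real view, shape [2, 2]: two (real, imag) pairs
  std::vector<float> real = {1.f, 2.f, 3.f, 4.f};
  // complex view, shape [2]: same bytes reinterpreted
  std::vector<std::complex<float>> cplx(2);
  std::memcpy(cplx.data(), real.data(), real.size() * sizeof(float));
  for (const auto& c : cplx) std::cout << c << "\n";  // (1,2) and (3,4)
}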
paddle/fluid/operators/ipu_runtime_op.cc
new file mode 100644
// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include "paddle/fluid/operators/ipu_runtime_op.h"

namespace paddle {
namespace operators {

class IpuRuntimeOp : public framework::OperatorWithKernel {
 public:
  using framework::OperatorWithKernel::OperatorWithKernel;

  void InferShape(framework::InferShapeContext* ctx) const override {}

 protected:
  framework::OpKernelType GetExpectedKernelType(
      const framework::ExecutionContext& ctx) const override {
    return framework::OpKernelType(
        framework::proto::VarType::Type(ctx.Attr<int>("dtype")),
        ctx.device_context());
  }
};

class IpuRuntimeOpMaker : public framework::OpProtoAndCheckerMaker {
 public:
  void Make() override {
    AddInput("FeedList", "FeedList of Graph").AsDuplicable();
    AddOutput("FetchList", "FetchList of Graph").AsDuplicable();
    AddAttr<int>("dtype",
                 "(int, default 5 (FP32)) "
                 "Output data type")
        .SetDefault(framework::proto::VarType::FP32);
    AddComment(R"DOC(
Run graph by PopART runtime.
)DOC");
  }
};

}  // namespace operators
}  // namespace paddle

namespace ops = paddle::operators;

REGISTER_OPERATOR(ipu_runtime, ops::IpuRuntimeOp, ops::IpuRuntimeOpMaker);

REGISTER_OP_IPU_KERNEL(ipu_runtime, ops::IpuRuntimeKernel<float>,
                       ops::IpuRuntimeKernel<double>,
                       ops::IpuRuntimeKernel<int>,
                       ops::IpuRuntimeKernel<int64_t>,
                       ops::IpuRuntimeKernel<bool>,
                       ops::IpuRuntimeKernel<int8_t>,
                       ops::IpuRuntimeKernel<paddle::platform::float16>);
paddle/fluid/operators/ipu_runtime_op.h
new file mode 100644
// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#pragma once

#include <memory>
#include <vector>

#include "paddle/fluid/framework/op_registry.h"
#ifdef PADDLE_WITH_IPU
#include "paddle/fluid/framework/ipu/ipu_backend.h"
#include "paddle/fluid/framework/tensor.h"
#endif

namespace paddle {
namespace operators {

template <typename T>
class IpuRuntimeKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& ctx) const override {
#ifdef PADDLE_WITH_IPU
    auto ipu_backend = framework::ipu::IpuBackend::GetInstance();
    if (!ipu_backend->DeviceIsAttached()) {
      const platform::IPUDeviceContext& ipu_ctx =
          reinterpret_cast<const platform::IPUDeviceContext&>(
              ctx.device_context());
      ipu_backend->AttachDevice(ipu_ctx.DeviceId());
    }

    auto inputs = ctx.MultiInput<framework::Tensor>("FeedList");
    auto outputs = ctx.MultiOutput<framework::Tensor>("FetchList");
    auto output_names = ctx.OutputNames("FetchList");
    VLOG(4) << "IpuRuntime Kernel, begin to run graph";
    ipu_backend->Run(inputs, outputs, ctx);

    // post-run
    // resize tensor when tensor.dims() is empty
    for (size_t i = 0; i < outputs.size(); ++i) {
      auto* out = outputs[i];
      if (out->dims().size() == 0) {
        auto tensor_dtype = out->type();
        auto sizeof_dtype = framework::SizeOfType(tensor_dtype);
        int64_t dim = out->memory_size() / sizeof_dtype;
        out->Resize({dim});
        VLOG(10) << "set ipu_runtime_op output: " << output_names[i]
                 << " dims from () to: "
                 << "(" << dim << ")";
      }
    }
#else
    PADDLE_THROW(platform::errors::PreconditionNotMet(
        "Please compile WITH_IPU option to enable ipu_runtime op"));
#endif
  }
};

}  // namespace operators
}  // namespace paddle
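The post-run fix-up in the kernel above recovers a 1-D shape for outputs whose dims came back empty: the element count is simply the buffer's byte size divided by the element size. A trivially self-contained sketch of that arithmetic:

#include <cstdint>
#include <iostream>

int main() {
  const size_t memory_size = 4096;  // bytes held by the output buffer
  const size_t sizeof_dtype = 4;    // e.g. FP32
  const int64_t dim = memory_size / sizeof_dtype;
  std::cout << "resize dims from () to (" << dim << ")\n";  // (1024)
}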
paddle/fluid/operators/math/math_function.cc
@@ -173,6 +173,13 @@ void set_constant_with_place<platform::NPUPinnedPlace>(
       platform::errors::Unimplemented("NPUPinnedPlace is not supported"));
 }
 
+template <>
+void set_constant_with_place<platform::IPUPlace>(
+    const platform::DeviceContext& context, framework::Tensor* tensor,
+    float value) {
+  PADDLE_THROW(platform::errors::Unimplemented("IPUPlace is not supported"));
+}
+
 template <>
 void set_constant_with_place<platform::CPUPlace>(
     const platform::DeviceContext& context, framework::Tensor* tensor,
...
paddle/fluid/operators/py_layer_op.h
@@ -54,7 +54,7 @@ class PyLayerOp : public framework::OperatorWithKernel {
  protected:
   framework::OpKernelType GetExpectedKernelType(
       const framework::ExecutionContext& ctx) const override {
-    auto data_type = OperatorWithKernel::IndicateVarDataType(ctx, "X");
+    auto data_type = paddle::framework::proto::VarType::Type::VarType_Type_FP32;
     return framework::OpKernelType(data_type, ctx.device_context());
   }
...
paddle/fluid/operators/reshape_op.cc
@@ -555,10 +555,10 @@ class Reshape2Op : public ReshapeOp {
       const framework::ExecutionContext& ctx) const override {
     auto multi_inputs = ctx.MultiInput<framework::Tensor>("ShapeTensor");
     if (multi_inputs.size() > 0) {
-      return framework::KernelSignature("reshape.mulhost",
-                                        {"X", "ShapeTensor"}, {}, {"Out"});
+      return framework::KernelSignature("reshape_mulhost",
+                                        {"X", "ShapeTensor"}, {}, {"Out"});
     } else if (ctx.HasInput("Shape")) {
-      return framework::KernelSignature("reshape.host", {"X", "Shape"}, {},
-                                        {"Out"});
+      return framework::KernelSignature("reshape_host", {"X", "Shape"}, {},
+                                        {"Out"});
     } else {
       return framework::KernelSignature("reshape", {"X"}, {"shape"}, {"Out"});
...
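The Reshape2Op change above only renames the kernel signatures ("reshape.mulhost" to "reshape_mulhost", "reshape.host" to "reshape_host"); the selection logic itself is a simple priority order: a ShapeTensor list beats a Shape tensor, which beats the shape attribute. A plain sketch of that dispatch (function name is illustrative):

#include <iostream>
#include <string>

std::string SelectReshapeKernel(bool has_shape_tensor_list,
                                bool has_shape_tensor) {
  if (has_shape_tensor_list) return "reshape_mulhost";  // runtime shape pieces
  if (has_shape_tensor) return "reshape_host";          // runtime shape tensor
  return "reshape";                                     // static shape attr
}

int main() {
  std::cout << SelectReshapeKernel(false, true) << "\n";  // reshape_host
}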
paddle/fluid/platform/device/gpu/cuda/cuda_graph.cc
@@ -18,6 +18,7 @@ namespace paddle {
 namespace platform {
 
 std::unique_ptr<CUDAGraph> CUDAGraph::capturing_graph_{nullptr};
+paddle::optional<std::thread::id> CUDAGraph::capturing_thread_id_{paddle::none};
 
 void CUDAGraph::Reset() {
   if (is_reset_) return;
...
@@ -58,6 +59,13 @@ void CUDAGraph::BeginSegmentCapture() {
       IsCapturing(), true,
       errors::PermissionDenied("BeginSegmentCapture should be called when CUDA "
                                "Graph is capturing."));
+  if (IsThreadLocalCapturing()) {
+    PADDLE_ENFORCE_EQ(IsThisThreadCapturing(), true,
+                      platform::errors::PermissionDenied(
+                          "When capturing CUDA Graph in the thread local mode, "
+                          "you cannot begin segmented capturing in the thread "
+                          "which is not the one that starts the capturing."));
+  }
   PADDLE_ENFORCE_GPU_SUCCESS(cudaStreamBeginCapture(
       capturing_graph_->stream_, capturing_graph_->capture_mode_));
   PADDLE_ENFORCE_EQ(IsValidCapturing(), true,
...
@@ -82,6 +90,11 @@ void CUDAGraph::BeginCapture(platform::CUDAPlace place, cudaStream_t stream,
   capturing_graph_->place_ = place;
   capturing_graph_->stream_ = stream;
   capturing_graph_->capture_mode_ = mode;
+  if (mode == cudaStreamCaptureModeThreadLocal) {
+    capturing_thread_id_ = std::this_thread::get_id();
+    VLOG(10) << "Capturing CUDA Graph in thread local mode, thread id: "
+             << capturing_thread_id_;
+  }
   BeginSegmentCapture();
 #endif
 }
...
@@ -115,6 +128,7 @@ void CUDAGraph::EndSegmentCapture() {
 std::unique_ptr<CUDAGraph> CUDAGraph::EndCapture() {
   EndSegmentCapture();
+  capturing_thread_id_ = paddle::none;
   return std::move(capturing_graph_);
 }
...
paddle/fluid/platform/device/gpu/cuda/cuda_graph.h
@@ -18,6 +18,7 @@
 #include <functional>
 #include <memory>
 #include <mutex>
+#include <thread>
 #include <vector>
 #include "cuda.h"          // NOLINT
 #include "cuda_runtime.h"  // NOLINT
...
@@ -26,6 +27,7 @@
 #include "paddle/fluid/platform/enforce.h"
 #include "paddle/fluid/platform/macros.h"
 #include "paddle/fluid/platform/place.h"
+#include "paddle/utils/optional.h"
 
 namespace paddle {
 namespace platform {
...
@@ -99,6 +101,25 @@ class CUDAGraph {
   // supported during capturing CUDA Graph.
   static bool IsValidCapturing();
 
+  static bool IsThreadLocalCapturing() {
+#if CUDA_VERSION >= 10010
+    return IsCapturing() &&
+           capturing_graph_->capture_mode_ == cudaStreamCaptureModeThreadLocal;
+#else
+    return false;
+#endif
+  }
+
+  static bool IsThisThreadCapturing() {
+    if (UNLIKELY(IsCapturing())) {
+      return IsThreadLocalCapturing()
+                 ? capturing_thread_id_.get() == std::this_thread::get_id()
+                 : true;
+    } else {
+      return false;
+    }
+  }
+
  private:
   static CUDAGraphID UniqueID() {
     static std::atomic<CUDAGraphID> id;
...
@@ -118,6 +139,7 @@ class CUDAGraph {
   bool is_reset_{false};
   std::mutex mtx_;
 
+  static paddle::optional<std::thread::id> capturing_thread_id_;
   static std::unique_ptr<CUDAGraph> capturing_graph_;
 };
...
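A sketch of the thread-identity guard IsThisThreadCapturing() adds: in cudaStreamCaptureModeThreadLocal only the thread that began the capture should observe the capturing state, so other threads take the normal allocation path. This standalone version uses std::optional instead of paddle::optional and drops the non-thread-local branch, so it is illustrative rather than Paddle's class:

#include <iostream>
#include <optional>
#include <thread>

static std::optional<std::thread::id> g_capturing_thread;

bool IsThisThreadCapturing() {
  return g_capturing_thread &&
         *g_capturing_thread == std::this_thread::get_id();
}

int main() {
  g_capturing_thread = std::this_thread::get_id();  // like BeginCapture
  std::cout << IsThisThreadCapturing() << "\n";     // 1 on this thread
  std::thread([] {
    std::cout << IsThisThreadCapturing() << "\n";   // 0 on another thread
  }).join();
  g_capturing_thread.reset();                       // like EndCapture
}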
paddle/fluid/platform/device/gpu/gpu_primitives.h
@@ -101,6 +101,20 @@ inline static __device__ uint32_t add_to_high_half(uint32_t val, float x) {
   return (val & 0xFFFFu) | (static_cast<uint32_t>(high_half.x) << 16);
 }
 
+#if CUDA_VERSION >= 10000 && defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 700
+static __device__ __forceinline__ float16 CUDAFP16ToPDFP16(__half x) {
+  return *reinterpret_cast<float16 *>(&x);
+}
+
+static __device__ __forceinline__ __half PDFP16ToCUDAFP16(float16 x) {
+  return *reinterpret_cast<__half *>(&x);
+}
+
+CUDA_ATOMIC_WRAPPER(Add, float16) {
+  return CUDAFP16ToPDFP16(
+      atomicAdd(reinterpret_cast<__half *>(address), PDFP16ToCUDAFP16(val)));
+}
+#else
 CUDA_ATOMIC_WRAPPER(Add, float16) {
   // concrete packed float16 value may exsits in lower or higher 16bits
   // of the 32bits address.
...
@@ -133,6 +147,7 @@ CUDA_ATOMIC_WRAPPER(Add, float16) {
   }
 }
 #endif
+#endif
 
 CUDA_ATOMIC_WRAPPER(Add, complex<float>) {
   float *real = reinterpret_cast<float *>(address);
...
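The new branch above uses the hardware fp16 atomicAdd, which only exists for compute capability 7.0+ with CUDA 10+; older targets keep the packed 32-bit CAS emulation. A self-contained CUDA sketch of the same architecture guard (the kernel and the elided fallback are illustrative, not Paddle's code):

#include <cuda_fp16.h>

__global__ void AccumulateHalf(__half* sum, const __half* x, int n) {
  int i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i < n) {
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 700
    atomicAdd(sum, x[i]);  // native fp16 atomic on sm_70+
#else
    // on older architectures, real code falls back to a 32-bit CAS loop
    // over the word containing this half value (see the #else branch above)
#endif
  }
}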
paddle/fluid/platform/device/ipu/CMakeLists.txt
 # IPU
 IF(WITH_IPU)
+    FILE(GLOB POPART_CANONICALIZATION_SRC ${PADDLE_SOURCE_DIR}/paddle/fluid/platform/device/ipu/popart_canonicalization/*.cc)
     cc_library(ipu_device SRCS device.cc DEPS enforce popart)
     cc_library(ipu_utils SRCS ipu_utils.cc DEPS memory framework_proto popart)
     cc_library(ipu_strategy SRCS ipu_strategy.cc DEPS popart graph framework_proto enforce)
...
paddle/fluid/platform/device/ipu/popart_canonicalization/activation_ops.cc
new file mode 100644
// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/platform/device/ipu/popart_canonicalization/canonicalization_utils.h"
#include "paddle/fluid/platform/device/ipu/popart_canonicalization/op_builder.h"
#include "paddle/fluid/platform/enforce.h"
namespace paddle {
namespace platform {
namespace ipu {
namespace {

Node *activation_op_handler(Graph *graph, Node *node,
                            const std::string &type) {
  auto new_node = CreateBaseOp(graph, node, type,
                               {GetInputVarNode("X", node)}, node->outputs);
  return new_node;
}

Node *relu_handler(Graph *graph, Node *node) {
  return activation_op_handler(graph, node, "popart_relu");
}

Node *tanh_handler(Graph *graph, Node *node) {
  return activation_op_handler(graph, node, "popart_tanh");
}

Node *log_handler(Graph *graph, Node *node) {
  return activation_op_handler(graph, node, "popart_log");
}

Node *sigmoid_handler(Graph *graph, Node *node) {
  return activation_op_handler(graph, node, "popart_sigmoid");
}

Node *sqrt_handler(Graph *graph, Node *node) {
  return activation_op_handler(graph, node, "popart_sqrt");
}

Node *gelu_handler(Graph *graph, Node *node) {
  return activation_op_handler(graph, node, "popart_gelu_v2");
}

Node *log_softmax_handler(Graph *graph, Node *node) {
  auto axis = BOOST_GET_CONST(int, node->Op()->GetAttr("axis"));
  auto new_softmax = CreateSoftmaxOpset11(graph, node, node->inputs, {}, axis);
  return CreateBaseOp(graph, node, "popart_log", new_softmax->outputs,
                      node->outputs);
}

REGISTER_HANDLER(relu, relu_handler);
REGISTER_HANDLER(tanh, tanh_handler);
REGISTER_HANDLER(log, log_handler);
REGISTER_HANDLER(sigmoid, sigmoid_handler);
REGISTER_HANDLER(sqrt, sqrt_handler);
REGISTER_HANDLER(gelu, gelu_handler);
REGISTER_HANDLER(log_softmax, log_softmax_handler);

}  // namespace
}  // namespace ipu
}  // namespace platform
}  // namespace paddle
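The pattern above makes adding a new unary activation a two-line change. As a hedged sketch (exp is a hypothetical addition, not part of this commit, and assumes a popart_exp symbol is declared among the supported ops):

// Hypothetical handler for a Paddle `exp` op, following the same pattern:
Node *exp_handler(Graph *graph, Node *node) {
  return activation_op_handler(graph, node, "popart_exp");
}
REGISTER_HANDLER(exp, exp_handler);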
paddle/fluid/platform/device/ipu/popart_canonicalization/canonicalization_utils.cc
0 → 100644
// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/platform/device/ipu/popart_canonicalization/canonicalization_utils.h"
namespace paddle {
namespace platform {
namespace ipu {

// This avoids the static initialisation order fiasco.
std::unordered_map<std::string, SymbolHandler> &SymbolHandlers() {
  static std::unordered_map<std::string, SymbolHandler> symbol_handlers;
  return symbol_handlers;
}

bool RegisterHandler(const std::string &symbol, const SymbolHandler &handler) {
  if (SymbolHandlers().count(symbol) != 0) {
    LOG(WARNING) << "Trying to register popart handler twice for operator: "
                 << symbol;
    return false;
  }
  bool new_handler = SymbolHandlers().emplace(symbol, handler).second;
  return new_handler;
}

// Return a pointer to a handler if one is registered for this kind of node or
// an empty std::function otherwise.
SymbolHandler GetHandler(const std::string &kind) {
  auto it = SymbolHandlers().find(kind);
  if (it != SymbolHandlers().end()) {
    return it->second;
  }
  return {};
}

void ConnectNodes(Node *first_node, Node *next_node) {
  first_node->outputs.push_back(next_node);
  next_node->inputs.push_back(first_node);
}

void DisConnectNodes(Node *first_node, Node *next_node) {
  auto rm_by_value = [&](std::vector<Node *> &vec, Node *n) {
    vec.erase(std::remove(vec.begin(), vec.end(), n), vec.end());
  };
  rm_by_value(first_node->outputs, next_node);
  rm_by_value(next_node->inputs, first_node);
  rm_by_value(first_node->inputs, next_node);
  rm_by_value(next_node->outputs, first_node);
}

void ClearNode(Node *node) {
  auto rm_by_value = [&](std::vector<Node *> &vec, Node *n) {
    vec.erase(std::remove(vec.begin(), vec.end(), n), vec.end());
  };
  for (auto *node_in : node->inputs) {
    rm_by_value(node_in->outputs, node);
  }
  for (auto *node_out : node->outputs) {
    rm_by_value(node_out->inputs, node);
  }
}

void CopyOpAttr(const std::string &attr_name, OpDesc *op, OpDesc *new_op,
                bool override) {
  if (new_op->HasAttr(attr_name) && !override) {
    return;
  }
  if (op->HasAttr(attr_name)) {
    VLOG(10) << "Copying attr: " << attr_name << " from " << op->Type()
             << " to " << new_op->Type();
    new_op->SetAttr(attr_name, op->GetAttr(attr_name));
    new_op->Flush();
  }
}

const int VarType2OnnxDtype(const int type) {
  auto dtype = static_cast<framework::proto::VarType::Type>(type);
  switch (dtype) {
    case framework::proto::VarType::BOOL:
      return static_cast<int>(ONNXDataType::BOOL);
    case framework::proto::VarType::INT16:
      return static_cast<int>(ONNXDataType::INT16);
    case framework::proto::VarType::INT32:
      return static_cast<int>(ONNXDataType::INT32);
    case framework::proto::VarType::INT64:
      return static_cast<int>(ONNXDataType::INT64);
    case framework::proto::VarType::FP16:
      return static_cast<int>(ONNXDataType::FLOAT16);
    case framework::proto::VarType::FP32:
      return static_cast<int>(ONNXDataType::FLOAT);
    case framework::proto::VarType::FP64:
      return static_cast<int>(ONNXDataType::DOUBLE);
    case framework::proto::VarType::UINT8:
      return static_cast<int>(ONNXDataType::UINT8);
    case framework::proto::VarType::INT8:
      return static_cast<int>(ONNXDataType::INT8);
    case framework::proto::VarType::BF16:
      return static_cast<int>(ONNXDataType::BFLOAT16);
    case framework::proto::VarType::COMPLEX64:
      return static_cast<int>(ONNXDataType::COMPLEX64);
    case framework::proto::VarType::COMPLEX128:
      return static_cast<int>(ONNXDataType::COMPLEX128);
    default:
      PADDLE_THROW(platform::errors::Unimplemented(
          "Unsupported data type: %d.", dtype));
  }
}

const std::string VarType2PopStr(const int type) {
  auto dtype = static_cast<framework::proto::VarType::Type>(type);
  switch (dtype) {
    case framework::proto::VarType::UINT8:
      return "UINT8";
    case framework::proto::VarType::INT8:
      return "INT8";
    case framework::proto::VarType::INT16:
      return "INT16";
    case framework::proto::VarType::INT32:
      return "INT32";
    case framework::proto::VarType::INT64:
      return "INT64";
    case framework::proto::VarType::BOOL:
      return "BOOL";
    case framework::proto::VarType::FP64:
      return "DOUBLE";
    case framework::proto::VarType::FP32:
      return "FLOAT";
    case framework::proto::VarType::FP16:
      return "FLOAT16";
    default:
      PADDLE_THROW(
          paddle::platform::errors::Unavailable("Unsupported data type."));
  }
}

Node *GetInputVarNode(const std::string &input_name, const Node *op_node,
                      const int id) {
  auto var_name = op_node->Op()->Input(input_name).at(id);
  return GetInputVarNodeByVarName(var_name, op_node);
}

Node *GetOutputVarNode(const std::string &output_name, const Node *op_node,
                       const int id) {
  auto var_name = op_node->Op()->Output(output_name).at(id);
  return GetOutputVarNodeByVarName(var_name, op_node);
}

Node *GetInputVarNodeByVarName(const std::string &var_name,
                               const Node *op_node) {
  for (auto *var : op_node->inputs) {
    if (var->Name() == var_name) {
      return var;
    }
  }
  return nullptr;
}

Node *GetOutputVarNodeByVarName(const std::string &var_name,
                                const Node *op_node) {
  for (auto *var : op_node->outputs) {
    if (var->Name() == var_name) {
      return var;
    }
  }
  return nullptr;
}

const bool is_float_equal(float a, float b, float eps) {
  return std::fabs(a - b) <= eps;
}

}  // namespace ipu
}  // namespace platform
}  // namespace paddle
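For context, this is roughly how a canonicalization pass is expected to consume the registry (a minimal sketch; the enclosing pass code is not part of this diff, and `graph`/`node` come from that pass):

// Sketch: dispatch one graph node through the handler registry.
auto handler = paddle::platform::ipu::GetHandler(node->Op()->Type());
if (handler) {
  // The handler rewrites `node` into popart_* ops and returns the new op node.
  auto *new_node = handler(graph, node);
  // ... the pass would then detach and remove the original node.
} else {
  // No registered handler: this op cannot be lowered to popart.
}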
paddle/fluid/platform/device/ipu/popart_canonicalization/canonicalization_utils.h
0 → 100644
// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include "paddle/fluid/framework/ir/graph.h"
#include "paddle/fluid/framework/ir/node.h"
#include "paddle/fluid/framework/ir/pass.h"
#include "paddle/fluid/platform/device/ipu/ipu_utils.h"
namespace paddle {
namespace platform {
namespace ipu {

using framework::ir::Graph;
using framework::ir::Node;
using framework::OpDesc;

#define REGISTER_HANDLER(name, func) \
  static bool __UNUSED_##name = \
      paddle::platform::ipu::RegisterHandler(#name, func)

using SymbolHandler = std::function<Node *(Graph *, Node *)>;

std::unordered_map<std::string, SymbolHandler> &SymbolHandlers();

bool RegisterHandler(const std::string &, const SymbolHandler &);

SymbolHandler GetHandler(const std::string &);

void ConnectNodes(Node *first_node, Node *next_node);
void DisConnectNodes(Node *first_node, Node *next_node);
void ClearNode(Node *node);
void CopyOpAttr(const std::string &attr_name, OpDesc *op, OpDesc *new_op,
                bool override = false);

const int VarType2OnnxDtype(const int type);
const std::string VarType2PopStr(const int type);

Node *GetInputVarNode(const std::string &input_name, const Node *op_node,
                      const int id = 0);
Node *GetOutputVarNode(const std::string &output_name, const Node *op_node,
                       const int id = 0);
Node *GetInputVarNodeByVarName(const std::string &var_name,
                               const Node *op_node);
Node *GetOutputVarNodeByVarName(const std::string &var_name,
                                const Node *op_node);

const bool is_float_equal(float a, float b, float eps = 1e-8);

}  // namespace ipu
}  // namespace platform
}  // namespace paddle
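As a rough illustration of the macro above, REGISTER_HANDLER(relu, relu_handler) expands to a file-local static whose initializer runs at load time, which is what populates SymbolHandlers() before any pass asks for a handler (a sketch of the expansion, not compiler output):

// Approximate expansion of REGISTER_HANDLER(relu, relu_handler):
static bool __UNUSED_relu =
    paddle::platform::ipu::RegisterHandler("relu", relu_handler);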
paddle/fluid/platform/device/ipu/popart_canonicalization/elementwise_ops.cc
0 → 100644
// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/platform/device/ipu/popart_canonicalization/canonicalization_utils.h"
#include "paddle/fluid/platform/device/ipu/popart_canonicalization/op_builder.h"
#include "paddle/fluid/platform/enforce.h"
namespace paddle {
namespace platform {
namespace ipu {
namespace {

Node *elementwise_op_handler(Graph *graph, Node *node,
                             const std::string &type) {
  auto *op = node->Op();
  auto x_shape = GetInputVarNode("X", node)->Var()->GetShape();
  int64_t x_rank = x_shape.size();
  auto y_shape = GetInputVarNode("Y", node)->Var()->GetShape();
  int64_t y_rank = y_shape.size();

  auto axis = BOOST_GET_CONST(int, op->GetAttr("axis"));
  if (axis == -1 || axis == x_rank - 1 || x_rank == y_rank) {
    auto new_node =
        CreateBaseOp(graph, node, type,
                     {GetInputVarNode("X", node), GetInputVarNode("Y", node)},
                     node->outputs);
    return new_node;
  } else {
    auto y_new_shape = std::vector<int64_t>(x_rank, 1);
    for (int i = axis; i < axis + y_rank; ++i) {
      y_new_shape[i] = y_shape[i - axis];
    }
    auto attrs = AttributeMap{
        {"value", y_new_shape},
        {"dims", std::vector<int64_t>{x_rank}},
        {"dtype", ONNXDataType::INT64},
    };
    // constant
    auto new_node_const = CreateConst(graph, node, {}, {}, attrs);
    // reshape
    auto new_node_reshape = CreateBaseOp(
        graph, node, "popart_reshape",
        {GetInputVarNode("Y", node), new_node_const->outputs[0]}, {});
    // elementwise_op
    auto new_node = CreateBaseOp(
        graph, node, type,
        {GetInputVarNode("X", node), new_node_reshape->outputs[0]},
        node->outputs);
    return new_node;
  }
}

Node *elementwise_add_handler(Graph *graph, Node *node) {
  return elementwise_op_handler(graph, node, "popart_add");
}

Node *elementwise_sub_handler(Graph *graph, Node *node) {
  return elementwise_op_handler(graph, node, "popart_sub");
}

Node *elementwise_div_handler(Graph *graph, Node *node) {
  return elementwise_op_handler(graph, node, "popart_div");
}

Node *elementwise_mul_handler(Graph *graph, Node *node) {
  return elementwise_op_handler(graph, node, "popart_mul");
}

Node *elementwise_min_handler(Graph *graph, Node *node) {
  return elementwise_op_handler(graph, node, "popart_min");
}

Node *elementwise_max_handler(Graph *graph, Node *node) {
  return elementwise_op_handler(graph, node, "popart_max");
}

Node *elementwise_pow_handler(Graph *graph, Node *node) {
  return elementwise_op_handler(graph, node, "popart_pow");
}

Node *elementwise_mod_handler(Graph *graph, Node *node) {
  return elementwise_op_handler(graph, node, "popart_mod");
}

REGISTER_HANDLER(elementwise_add, elementwise_add_handler);
REGISTER_HANDLER(elementwise_sub, elementwise_sub_handler);
REGISTER_HANDLER(elementwise_div, elementwise_div_handler);
REGISTER_HANDLER(elementwise_mul, elementwise_mul_handler);
REGISTER_HANDLER(elementwise_min, elementwise_min_handler);
REGISTER_HANDLER(elementwise_max, elementwise_max_handler);
REGISTER_HANDLER(elementwise_pow, elementwise_pow_handler);
REGISTER_HANDLER(elementwise_mod, elementwise_mod_handler);

}  // namespace
}  // namespace ipu
}  // namespace platform
}  // namespace paddle
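To make the broadcast branch concrete: with X of shape [2, 3, 4, 5], Y of shape [3, 4] and axis = 1, y_new_shape is [1, 3, 4, 1], so Y is reshaped to X's rank before the binary op. A stand-alone sketch of just that shape computation (BroadcastShape is an illustrative name, not part of the commit):

#include <cstdint>
#include <vector>

// Mirrors the y_new_shape computation above: pad Y's shape with 1s so it
// has the same rank as X, placing Y's dims starting at `axis`.
std::vector<int64_t> BroadcastShape(const std::vector<int64_t> &y_shape,
                                    int64_t x_rank, int axis) {
  std::vector<int64_t> y_new_shape(x_rank, 1);
  for (int i = axis; i < axis + static_cast<int>(y_shape.size()); ++i) {
    y_new_shape[i] = y_shape[i - axis];
  }
  return y_new_shape;  // e.g. ({3, 4}, 4, 1) -> {1, 3, 4, 1}
}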
paddle/fluid/platform/device/ipu/popart_canonicalization/logic_ops.cc
0 → 100644
// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/platform/device/ipu/popart_canonicalization/canonicalization_utils.h"
#include "paddle/fluid/platform/device/ipu/popart_canonicalization/op_builder.h"
#include "paddle/fluid/platform/enforce.h"
namespace paddle {
namespace platform {
namespace ipu {
namespace {

Node *equal_handler(Graph *graph, Node *node) {
  auto new_node = CreateBaseOp(
      graph, node, "popart_equal",
      {GetInputVarNode("X", node), GetInputVarNode("Y", node)},
      node->outputs);
  return new_node;
}

REGISTER_HANDLER(equal, equal_handler);

}  // namespace
}  // namespace ipu
}  // namespace platform
}  // namespace paddle
paddle/fluid/platform/device/ipu/popart_canonicalization/math_ops.cc
0 → 100644
// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/platform/device/ipu/popart_canonicalization/canonicalization_utils.h"
#include "paddle/fluid/platform/device/ipu/popart_canonicalization/op_builder.h"
#include "paddle/fluid/platform/enforce.h"
namespace paddle {
namespace platform {
namespace ipu {
namespace {

Node *mean_handler(Graph *graph, Node *node) {
  return CreateBaseOp(graph, node, "popart_reducemean",
                      {GetInputVarNode("X", node)},
                      {GetOutputVarNode("Out", node)},
                      {
                          {"keepdims", int64_t{0}},
                      });
}

Node *pow_handler(Graph *graph, Node *node) {
  auto *op = node->Op();
  if (op->HasInput("FactorTensor") && !op->Input("FactorTensor").empty()) {
    return CreateBaseOp(
        graph, node, "popart_pow",
        {GetInputVarNode("X", node), GetInputVarNode("FactorTensor", node)},
        node->outputs);
  } else {
    // Op(pow) -> Op(Constant)->Var(const_out)->Op(Pow)
    auto value_ = BOOST_GET_CONST(float, op->GetAttr("factor"));
    auto attrs =
        MakeConstAttrMapFromValue<float>(value_, {1}, ONNXDataType::FLOAT);
    auto new_node_const = CreateConst(graph, node, {}, {}, attrs);
    return CreateBaseOp(
        graph, node, "popart_pow",
        {GetInputVarNode("X", node), new_node_const->outputs[0]},
        node->outputs);
  }
}

Node *mul_handler(Graph *graph, Node *node) {
  auto *op = node->Op();
  auto x_num_col_dims = BOOST_GET_CONST(int, op->GetAttr("x_num_col_dims"));
  auto y_num_col_dims = BOOST_GET_CONST(int, op->GetAttr("y_num_col_dims"));
  auto x_shape_ = GetInputVarNode("X", node)->Var()->GetShape();
  auto y_shape_ = GetInputVarNode("Y", node)->Var()->GetShape();

  // build the shape for reshape
  std::vector<int64_t> reshape_shape_{};
  for (int left = 0; left < x_num_col_dims; left++) {
    reshape_shape_.push_back(int64_t(x_shape_[left]));
  }
  for (int right = y_num_col_dims; right < y_shape_.size(); right++) {
    reshape_shape_.push_back(int64_t(y_shape_[right]));
  }
  auto x_flatten =
      CreateBaseOp(graph, node, "popart_flatten",
                   {GetInputVarNode("X", node)}, {},
                   {{"axis", int64_t(x_num_col_dims)}});
  auto y_flatten =
      CreateBaseOp(graph, node, "popart_flatten",
                   {GetInputVarNode("Y", node)}, {},
                   {{"axis", int64_t(y_num_col_dims)}});
  auto matmul =
      CreateBaseOp(graph, node, "popart_matmul",
                   {x_flatten->outputs[0], y_flatten->outputs[0]}, {}, {});
  auto reshape_const = CreateConst(
      graph, node, {}, {},
      {{"value", reshape_shape_},
       {"dims", std::vector<int64_t>{int64_t(reshape_shape_.size())}},
       {"dtype", ONNXDataType::INT64}});
  return CreateBaseOp(graph, node, "popart_reshape",
                      {matmul->outputs[0], reshape_const->outputs[0]},
                      node->outputs, {});
}

Node *matmul_handler(Graph *graph, Node *node) {
  auto *op = node->Op();
  auto transpose_x = BOOST_GET_CONST(bool, op->GetAttr("transpose_X"));
  auto transpose_y = BOOST_GET_CONST(bool, op->GetAttr("transpose_Y"));
  auto alpha = BOOST_GET_CONST(float, op->GetAttr("alpha"));
  auto x_shape = GetInputVarNode("X", node)->Var()->GetShape();
  auto y_shape = GetInputVarNode("Y", node)->Var()->GetShape();
  int x_rank = x_shape.size();
  std::vector<int64_t> perm;
  if (x_rank == 1) {
    perm = std::vector<int64_t>{0};
  } else if (x_rank == 2) {
    return CreateGemm(graph, node,
                      {GetInputVarNode("X", node), GetInputVarNode("Y", node)},
                      node->outputs, transpose_x, transpose_y, alpha);
  } else if (x_rank == 3) {
    perm = std::vector<int64_t>{0, 2, 1};
  } else if (x_rank == 4) {
    perm = std::vector<int64_t>{0, 1, 3, 2};
  } else {
    PADDLE_THROW(platform::errors::InvalidArgument(
        "op matmul with input rank == %d", x_rank));
  }

  Node *x_node = GetInputVarNode("X", node);
  Node *y_node = GetInputVarNode("Y", node);
  if (transpose_x) {
    x_node = CreateBaseOp(graph, node, "popart_transpose",
                          {GetInputVarNode("X", node)}, {}, {{"perm", perm}});
    x_node = x_node->outputs[0];
  }
  if (transpose_y) {
    y_node = CreateBaseOp(graph, node, "popart_transpose",
                          {GetInputVarNode("Y", node)}, {}, {{"perm", perm}});
    y_node = y_node->outputs[0];
  }
  if (is_float_equal(alpha, 1.0)) {
    auto o_node =
        CreateBaseOp(graph, node, "popart_matmul", {x_node, y_node}, {});
    auto attr = MakeConstAttrMapFromValue(alpha, {1}, ONNXDataType::FLOAT);
    auto const_node = CreateConst(graph, node, {}, {}, attr);
    return CreateBaseOp(graph, node, "popart_mul",
                        {o_node->outputs[0], const_node->outputs[0]},
                        node->outputs);
  } else {
    return CreateBaseOp(graph, node, "popart_matmul", {x_node, y_node},
                        node->outputs);
  }
}

Node *sum_handler(Graph *graph, Node *node) {
  return CreateBaseOp(graph, node, "popart_sum", node->inputs, node->outputs);
}

Node *softmax_handler(Graph *graph, Node *node) {
  auto *op = node->Op();
  auto axis = BOOST_GET_CONST(int, op->GetAttr("axis"));
  return CreateSoftmaxOpset11(graph, node, node->inputs, node->outputs, axis);
}

Node *scale_handler(Graph *graph, Node *node) {
  auto *op = node->Op();
  auto scale_ = BOOST_GET_CONST(float, op->GetAttr("scale"));
  auto bias_ = BOOST_GET_CONST(float, op->GetAttr("bias"));
  auto bias_after_scale_ =
      BOOST_GET_CONST(bool, op->GetAttr("bias_after_scale"));
  auto data_type_ = GetInputVarNode("X", node)->Var()->GetDataType();

  auto new_node_bias_var =
      CreateConst(graph, node, {}, {},
                  {{"value", std::vector<float>{bias_}},
                   {"dims", std::vector<int64_t>{1}},
                   {"dtype", ONNXDataType::FLOAT}});
  new_node_bias_var = new_node_bias_var->outputs[0];

  Node *new_node_scale_var = nullptr;
  if (op->HasInput("ScaleTensor") && !op->Input("ScaleTensor").empty()) {
    new_node_scale_var = GetInputVarNode("ScaleTensor", node);
  } else {
    new_node_scale_var =
        CreateConst(graph, node, {}, {},
                    {{"value", std::vector<float>{scale_}},
                     {"dims", std::vector<int64_t>{1}},
                     {"dtype", ONNXDataType::FLOAT}});
    new_node_scale_var = new_node_scale_var->outputs[0];
  }

  // convert to float32
  auto new_node_cast =
      CreateCast(graph, node, {GetInputVarNode("X", node)}, {},
                 static_cast<int>(framework::proto::VarType::FP32));
  Node *result = nullptr;
  if (bias_after_scale_) {
    auto new_node_mul =
        CreateBaseOp(graph, node, "popart_mul",
                     {new_node_cast->outputs[0], new_node_scale_var}, {}, {});
    result =
        CreateBaseOp(graph, node, "popart_add",
                     {new_node_mul->outputs[0], new_node_bias_var}, {}, {});
  } else {
    auto new_node_add =
        CreateBaseOp(graph, node, "popart_add",
                     {new_node_cast->outputs[0], new_node_bias_var}, {}, {});
    result =
        CreateBaseOp(graph, node, "popart_mul",
                     {new_node_add->outputs[0], new_node_scale_var}, {}, {});
  }
  auto result_after_cast =
      CreateCast(graph, node, result->outputs, node->outputs,
                 static_cast<int>(data_type_));
  return result_after_cast;
}

Node *cross_entropy2_handler(Graph *graph, Node *node) {
  auto *op = node->Op();
  auto ignoreIndex = BOOST_GET_CONST(int, op->GetAttr("ignore_index"));
  auto new_cast = CreateCast(graph, node, {GetInputVarNode("Label", node)},
                             {}, framework::proto::VarType::INT32);
  auto label_shape_ = GetInputVarNode("Label", node)->Var()->GetShape();
  if (label_shape_.size() == 1) {
    return CreateBaseOp(graph, node, "popart_nllloss",
                        {GetInputVarNode("X", node), new_cast->outputs[0]},
                        {GetOutputVarNode("Y", node)},
                        {
                            {"ignoreIndex", ignoreIndex},
                        });
  } else {
    std::vector<int64_t> new_shape_{label_shape_[0]};
    auto const_before_loss = CreateBaseOp(
        graph, node, "popart_constant", {}, {},
        {{"value", new_shape_},
         {"dims",
          std::vector<int64_t>{static_cast<int64_t>(new_shape_.size())}},
         {"dtype", ONNXDataType::INT64}});
    auto reshape_before_loss =
        CreateBaseOp(graph, node, "popart_reshape",
                     {new_cast->outputs[0], const_before_loss->outputs[0]},
                     {}, {});
    auto nllloss = CreateBaseOp(
        graph, node, "popart_nllloss",
        {GetInputVarNode("X", node), reshape_before_loss->outputs[0]}, {},
        {
            {"ignoreIndex", ignoreIndex},
        });
    auto const_after_loss = CreateBaseOp(
        graph, node, "popart_constant", {}, {},
        {{"value", label_shape_},
         {"dims",
          std::vector<int64_t>{static_cast<int64_t>(label_shape_.size())}},
         {"dtype", ONNXDataType::INT64}});
    auto reshape_after_loss =
        CreateBaseOp(graph, node, "popart_reshape",
                     {nllloss->outputs[0], const_after_loss->outputs[0]},
                     {GetOutputVarNode("Y", node)}, {});
    return reshape_after_loss;
  }
}

REGISTER_HANDLER(mean, mean_handler);
REGISTER_HANDLER(pow, pow_handler);
REGISTER_HANDLER(mul, mul_handler);
REGISTER_HANDLER(matmul, matmul_handler);
REGISTER_HANDLER(sum, sum_handler);
REGISTER_HANDLER(softmax, softmax_handler);
REGISTER_HANDLER(scale, scale_handler);
REGISTER_HANDLER(cross_entropy2, cross_entropy2_handler);

}  // namespace
}  // namespace ipu
}  // namespace platform
}  // namespace paddle
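A worked example for mul_handler's reshape bookkeeping: with X of shape [2, 3, 4], Y of shape [12, 5], x_num_col_dims = 1 and y_num_col_dims = 1, the flattened matmul is [2, 12] x [12, 5], and the result is reshaped to [2, 5]. A stand-alone sketch of that shape computation (MulOutputShape is an illustrative name):

#include <cstdint>
#include <vector>

// Mirrors mul_handler's reshape_shape_ computation: keep X's first
// x_num_col_dims dims and Y's dims from y_num_col_dims onward.
std::vector<int64_t> MulOutputShape(const std::vector<int64_t> &x_shape,
                                    const std::vector<int64_t> &y_shape,
                                    int x_num_col_dims, int y_num_col_dims) {
  std::vector<int64_t> out;
  for (int i = 0; i < x_num_col_dims; ++i) out.push_back(x_shape[i]);
  for (size_t i = y_num_col_dims; i < y_shape.size(); ++i)
    out.push_back(y_shape[i]);
  return out;  // ({2, 3, 4}, {12, 5}, 1, 1) -> {2, 5}
}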
paddle/fluid/platform/device/ipu/popart_canonicalization/nn_ops.cc
0 → 100644
// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/platform/device/ipu/popart_canonicalization/canonicalization_utils.h"
#include "paddle/fluid/platform/device/ipu/popart_canonicalization/op_builder.h"
#include "paddle/fluid/platform/enforce.h"
namespace paddle {
namespace platform {
namespace ipu {
namespace {

Node *conv2d_handler(Graph *graph, Node *node) {
  OpDesc *op = node->Op();
  auto dilations_ =
      BOOST_GET_CONST(std::vector<int>, op->GetAttr("dilations"));
  auto dilations = std::vector<int64_t>{dilations_.begin(), dilations_.end()};
  auto group_ = BOOST_GET_CONST(int, op->GetAttr("groups"));
  auto pads_ = BOOST_GET_CONST(std::vector<int>, op->GetAttr("paddings"));
  if (pads_.size() == 2) {
    pads_.push_back(pads_[0]);
    pads_.push_back(pads_[1]);
  }
  auto pads = std::vector<int64_t>{pads_.begin(), pads_.end()};
  auto stride_ = BOOST_GET_CONST(std::vector<int>, op->GetAttr("strides"));
  auto stride = std::vector<int64_t>{stride_.begin(), stride_.end()};
  if (op->HasInput("Bias") && !op->Input("Bias").empty()) {
    return CreateConv(graph, node,
                      {
                          GetInputVarNode("Input", node),
                          GetInputVarNode("Filter", node),
                          GetInputVarNode("Bias", node),
                      },
                      node->outputs, dilations, group_, {}, pads, stride);
  } else {
    return CreateConv(graph, node,
                      {
                          GetInputVarNode("Input", node),
                          GetInputVarNode("Filter", node),
                      },
                      node->outputs, dilations, group_, {}, pads, stride);
  }
}

Node *batch_norm_handler(Graph *graph, Node *node) {
  auto *op = node->Op();
  std::vector<Node *> inputs;
  inputs.push_back(GetInputVarNode("X", node));
  inputs.push_back(GetInputVarNode("Scale", node));
  inputs.push_back(GetInputVarNode("Bias", node));
  inputs.push_back(GetInputVarNode("Mean", node));
  inputs.push_back(GetInputVarNode("Variance", node));
  int64_t num_outputs = 1;
  std::vector<Node *> outputs;
  auto is_test_type = op->GetAttrType("is_test");
  bool is_test;
  if (is_test_type == 0) {
    // int
    is_test = BOOST_GET_CONST(int, op->GetAttr("is_test"));
  } else {
    // bool
    is_test = BOOST_GET_CONST(bool, op->GetAttr("is_test"));
  }
  outputs.push_back(GetOutputVarNode("Y", node));
  if (!is_test) {
    outputs.push_back(GetOutputVarNode("MeanOut", node));
    outputs.push_back(GetOutputVarNode("VarianceOut", node));
    outputs.push_back(GetOutputVarNode("SavedMean", node));
    outputs.push_back(GetOutputVarNode("SavedVariance", node));
    num_outputs = 5;
  }
  // outputs.push_back(GetOutputVarNode("ReserveSpace", node));
  auto momentum = BOOST_GET_CONST(float, op->GetAttr("momentum"));
  auto epsilon = BOOST_GET_CONST(float, op->GetAttr("epsilon"));
  // data_layout
  return CreateBaseOp(graph, node, "popart_batchnormalization", inputs,
                      outputs,
                      {
                          {"momentum", momentum},
                          {"epsilon", epsilon},
                          {"num_outputs", num_outputs},
                      });
}

Node *pool2d_handler(Graph *graph, Node *node) {
  auto *op = node->Op();
  auto pooling_type =
      BOOST_GET_CONST(std::string, op->GetAttr("pooling_type"));
  auto global_pooling = BOOST_GET_CONST(bool, op->GetAttr("global_pooling"));
  if (global_pooling) {
    if (pooling_type == "max") {
      return CreateBaseOp(graph, node, "popart_globalmaxpool", node->inputs,
                          node->outputs);
    } else if (pooling_type == "avg") {
      return CreateBaseOp(graph, node, "popart_globalaveragepool",
                          node->inputs, node->outputs);
    } else {
      PADDLE_THROW(platform::errors::InvalidArgument(
          "op pool2d with unknown pooling_type: %s", pooling_type));
    }
  }
  if (op->HasAttr("padding_algorithm")) {
    auto padding_algorithm =
        BOOST_GET_CONST(std::string, op->GetAttr("padding_algorithm"));
    if (padding_algorithm != "EXPLICIT") {
      PADDLE_THROW(platform::errors::InvalidArgument(
          "op pool2d with unknown padding_algorithm: %s", padding_algorithm));
    }
  }

  auto ksize = BOOST_GET_CONST(std::vector<int>, op->GetAttr("ksize"));
  auto kernel_shape = std::vector<int64_t>{ksize.begin(), ksize.end()};
  auto ceil_mode_ = BOOST_GET_CONST(bool, op->GetAttr("ceil_mode"));
  auto ceil_mode = int64_t(ceil_mode_ ? 1 : 0);
  auto paddings = BOOST_GET_CONST(std::vector<int>, op->GetAttr("paddings"));
  auto pads = std::vector<int64_t>{paddings.begin(), paddings.end()};
  if (pads.size() == 2) {
    pads.push_back(paddings[0]);
    pads.push_back(paddings[1]);
  }
  auto strides_ = BOOST_GET_CONST(std::vector<int>, op->GetAttr("strides"));
  auto strides = std::vector<int64_t>{strides_.begin(), strides_.end()};
  if (pooling_type == "max") {
    int64_t num_outputs = 1;
    auto dilations = std::vector<int64_t>{};
    int64_t storage_order = 0;
    return CreateBaseOp(graph, node, "popart_maxpool", node->inputs,
                        node->outputs,
                        {
                            {"num_outputs", num_outputs},
                            {"kernel_shape", kernel_shape},
                            {"ceil_mode", ceil_mode},
                            {"dilations", dilations},
                            {"pads", pads},
                            {"storage_order", storage_order},
                            {"strides", strides},
                        });
  } else if (pooling_type == "avg") {
    int64_t count_include_pad = 0;
    return CreateBaseOp(graph, node, "popart_averagepool", node->inputs,
                        node->outputs,
                        {
                            {"kernel_shape", kernel_shape},
                            {"ceil_mode", ceil_mode},
                            {"count_include_pad", count_include_pad},
                            {"pads", pads},
                            {"strides", strides},
                        });
  } else {
    PADDLE_THROW(platform::errors::InvalidArgument(
        "op pool2d with unknown pooling_type: %s", pooling_type));
  }
}

Node *group_norm_handler(Graph *graph, Node *node) {
  auto *op = node->Op();
  auto epsilon_ = BOOST_GET_CONST(float, op->GetAttr("epsilon"));
  auto groups_ = BOOST_GET_CONST(int, op->GetAttr("groups"));
  auto groups = int64_t{groups_};
  auto attrs_ = AttributeMap{{"epsilon", epsilon_}, {"num_groups", groups}};
  std::vector<Node *> inputs_ = {GetInputVarNode("X", node),
                                 GetInputVarNode("Scale", node),
                                 GetInputVarNode("Bias", node)};
  std::vector<Node *> outputs_ = {GetOutputVarNode("Y", node),
                                  GetOutputVarNode("Mean", node),
                                  GetOutputVarNode("Variance", node)};
  return CreateBaseOp(graph, node, "popart_groupnormalization_v2", inputs_,
                      outputs_, attrs_);
}

Node *instance_norm_handler(Graph *graph, Node *node) {
  auto *op = node->Op();
  auto epsilon_ = BOOST_GET_CONST(float, op->GetAttr("epsilon"));
  auto attrs_ = AttributeMap{{"epsilon", epsilon_}};
  std::vector<Node *> inputs_ = {GetInputVarNode("X", node),
                                 GetInputVarNode("Scale", node),
                                 GetInputVarNode("Bias", node)};
  std::vector<Node *> outputs_ = {GetOutputVarNode("Y", node)};
  return CreateBaseOp(graph, node, "popart_instancenormalization", inputs_,
                      outputs_, attrs_);
}

Node *layer_norm_handler(Graph *graph, Node *node) {
  auto *op = node->Op();
  auto begin_norm_axis_ =
      BOOST_GET_CONST(int, op->GetAttr("begin_norm_axis"));
  auto input_shape_ = GetInputVarNode("X", node)->Var()->GetShape();

  std::vector<int64_t> norm_shape_{1, 1};
  for (int i = 0; i < input_shape_.size(); i++) {
    if (i < begin_norm_axis_) {
      norm_shape_[0] *= input_shape_[i];
    } else {
      norm_shape_[1] *= input_shape_[i];
    }
  }

  auto attrs1 = AttributeMap{
      {"value", norm_shape_},
      {"dims",
       std::vector<int64_t>{static_cast<int64_t>(norm_shape_.size())}},
      {"dtype", ONNXDataType::INT64}};
  auto reshape1_const =
      CreateBaseOp(graph, node, "popart_constant", {}, {}, attrs1);
  auto new_node_reshape1 = CreateBaseOp(
      graph, node, "popart_reshape",
      {GetInputVarNode("X", node), reshape1_const->outputs[0]}, {}, {});

  auto epsilon_ = BOOST_GET_CONST(float, op->GetAttr("epsilon"));
  int64_t groups_ = 1;
  auto groupnorm_attrs_ =
      AttributeMap{{"epsilon", epsilon_}, {"num_groups", groups_}};
  auto out_Y_ = MakeVarNode(graph, node);
  CreateBaseOp(graph, node, "popart_groupnormalization_v2",
               {new_node_reshape1->outputs[0], GetInputVarNode("Scale", node),
                GetInputVarNode("Bias", node)},
               {out_Y_, GetOutputVarNode("Mean", node),
                GetOutputVarNode("Variance", node)},
               groupnorm_attrs_);

  auto attrs2 = AttributeMap{
      {"value", input_shape_},
      {"dims",
       std::vector<int64_t>{static_cast<int64_t>(input_shape_.size())}},
      {"dtype", ONNXDataType::INT64}};
  auto reshape2_const =
      CreateBaseOp(graph, node, "popart_constant", {}, {}, attrs2);
  auto new_node_reshape2 = CreateBaseOp(
      graph, node, "popart_reshape",
      {out_Y_, reshape2_const->outputs[0]}, {GetOutputVarNode("Y", node)},
      {});
  return new_node_reshape2;
}

Node *dropout_handler(Graph *graph, Node *node) {
  auto *op = node->Op();
  auto dropout_prob_ = BOOST_GET_CONST(float, op->GetAttr("dropout_prob"));
  auto dropout_implementation_ =
      BOOST_GET_CONST(std::string, op->GetAttr("dropout_implementation"));
  auto is_test_type_ = op->GetAttrType("is_test");
  bool is_test_;
  if (is_test_type_ == 0) {
    // int
    is_test_ = BOOST_GET_CONST(int, op->GetAttr("is_test"));
  } else {
    // bool
    is_test_ = BOOST_GET_CONST(bool, op->GetAttr("is_test"));
  }

  if (is_test_) {
    if (dropout_implementation_ == "upscale_in_train") {
      return CreateBaseOp(graph, node, "popart_identity",
                          {GetInputVarNode("X", node)},
                          {GetOutputVarNode("Out", node)}, {});
    } else if (dropout_implementation_ == "downgrade_in_infer") {
      auto scale =
          CreateConst(graph, node, {}, {},
                      {{"value", std::vector<float>{1 - dropout_prob_}},
                       {"dims", std::vector<int64_t>{1}},
                       {"dtype", ONNXDataType::FLOAT}});
      return CreateBaseOp(graph, node, "popart_mul",
                          {GetInputVarNode("X", node), scale->outputs[0]},
                          {GetOutputVarNode("Out", node)}, {});
    } else {
      PADDLE_THROW(platform::errors::InvalidArgument(
          "Invalid dropout_implementation"));
    }
  } else {
    if (dropout_implementation_ == "upscale_in_train") {
      auto attrs_ = AttributeMap{{"num_outputs", (int64_t)1},
                                 {"ratio", dropout_prob_}};
      return CreateBaseOp(graph, node, "popart_dropout",
                          {GetInputVarNode("X", node)},
                          {GetOutputVarNode("Out", node)}, attrs_);
    } else if (dropout_implementation_ == "downgrade_in_infer") {
      PADDLE_THROW(platform::errors::InvalidArgument(
          "Do not support downgrade_in_infer with training"));
    } else {
      PADDLE_THROW(platform::errors::InvalidArgument(
          "Invalid dropout_implementation"));
    }
  }
}

REGISTER_HANDLER(pool2d, pool2d_handler);
REGISTER_HANDLER(batch_norm, batch_norm_handler);
REGISTER_HANDLER(group_norm, group_norm_handler);
REGISTER_HANDLER(instance_norm, instance_norm_handler);
REGISTER_HANDLER(layer_norm, layer_norm_handler);
REGISTER_HANDLER(conv2d, conv2d_handler);
REGISTER_HANDLER(dropout, dropout_handler);

}  // namespace
}  // namespace ipu
}  // namespace platform
}  // namespace paddle
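layer_norm_handler folds the leading begin_norm_axis dims into one group and the rest into another; e.g. for input shape [8, 16, 32] with begin_norm_axis = 1, norm_shape_ becomes [8, 512] before the groupnorm, and the result is reshaped back to [8, 16, 32]. A stand-alone sketch of that fold (LayerNormFoldedShape is an illustrative name):

#include <cstdint>
#include <vector>

// Mirrors layer_norm_handler's norm_shape_ computation: collapse dims into
// {prod(dims[:begin_norm_axis]), prod(dims[begin_norm_axis:])}.
std::vector<int64_t> LayerNormFoldedShape(const std::vector<int64_t> &shape,
                                          int begin_norm_axis) {
  std::vector<int64_t> norm_shape{1, 1};
  for (size_t i = 0; i < shape.size(); ++i) {
    norm_shape[i < static_cast<size_t>(begin_norm_axis) ? 0 : 1] *= shape[i];
  }
  return norm_shape;  // ({8, 16, 32}, 1) -> {8, 512}
}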
paddle/fluid/platform/device/ipu/popart_canonicalization/op_builder.cc
0 → 100644
// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/platform/device/ipu/popart_canonicalization/op_builder.h"
namespace paddle {
namespace platform {
namespace ipu {

// singleton
static int var_count = 0;
static int op_count = 0;

const std::string GenerateVarName() {
  return std::string("_gen_var_") + std::to_string(var_count++);
}

const std::string GenerateOpName() {
  return std::string("_gen_op_") + std::to_string(op_count++);
}

const std::string CreateOpIdentifyId(Node *node) {
  // format: op_type|out_var0|out_var1|...|_gen_*
  // this name will be used as op name when exporting onnx model from popart
  auto op_type = node->Name();
  std::string op_out = "";
  for (auto *out_node : node->outputs) {
    op_out += "|";
    op_out += out_node->Name();
  }
  return {op_type + op_out + "|" + GenerateOpName()};
}

Node *MakeVarNode(Graph *graph, Node *node) {
  auto var_name = GenerateVarName();
  auto var_desc = std::make_unique<framework::VarDesc>(var_name);
  auto var = graph->CreateVarNode(var_desc.get());
  return var;
}

Node *MakeOpNode(Graph *graph, Node *node, const std::string &type,
                 const std::vector<Node *> &inputs,
                 const std::vector<Node *> &outputs) {
  auto op_desc = std::make_unique<framework::OpDesc>();
  op_desc->SetType(type);
  auto op = graph->CreateOpNode(op_desc.get());
  for (auto *in : inputs) {
    ConnectNodes(in, op);
  }
  if (outputs.empty()) {
    auto var = MakeVarNode(graph, node);
    ConnectNodes(op, var);
  } else {
    for (auto *out : outputs) {
      ConnectNodes(op, out);
    }
  }
  // i/o
  std::vector<std::string> input_names;
  for (auto node : op->inputs) {
    input_names.push_back(node->Name());
  }
  op->Op()->SetInput("__inputs__", input_names);
  std::vector<std::string> output_names;
  for (auto node : op->outputs) {
    output_names.push_back(node->Name());
  }
  op->Op()->SetOutput("__outputs__", output_names);
  op->Op()->Flush();
  return op;
}

Node *CreateBaseOp(Graph *graph, Node *node, const std::string &type,
                   const std::vector<Node *> &inputs,
                   const std::vector<Node *> &outputs,
                   const AttributeMap &attrs) {
  auto new_node = MakeOpNode(graph, node, type, inputs, outputs);
  if (!attrs.empty()) {
    new_node->Op()->SetAttrMap(attrs);
  }
  // deal special attr
  if (!new_node->Op()->HasAttr(sIpuIndexAttr)) {
    CopyOpAttr(sIpuIndexAttr, node->Op(), new_node->Op());
  }
  if (!new_node->Op()->HasAttr(sIpuStageAttr)) {
    CopyOpAttr(sIpuStageAttr, node->Op(), new_node->Op());
  }
  {
    new_node->Op()->SetAttr(sOpIdentifyIdAttr, CreateOpIdentifyId(node));
    new_node->Op()->Flush();
  }
  return new_node;
}

Node *CreateConst(Graph *graph, Node *node, const std::vector<Node *> &inputs,
                  const std::vector<Node *> &outputs,
                  const AttributeMap &attrs) {
  return CreateBaseOp(graph, node, "popart_constant", inputs, outputs, attrs);
}

Node *CreateCast(Graph *graph, Node *node, const std::vector<Node *> &inputs,
                 const std::vector<Node *> &outputs, const int otype) {
  auto to = VarType2PopStr(otype);
  return CreateBaseOp(graph, node, "popart_cast", inputs, outputs,
                      {{"to", to}});
}

Node *CreateGemm(Graph *graph, Node *node, const std::vector<Node *> &inputs,
                 const std::vector<Node *> &outputs, int64_t transA,
                 int64_t transB, float alpha, float beta) {
  return CreateBaseOp(graph, node, "popart_gemm", inputs, outputs,
                      {
                          {"alpha", alpha},
                          {"beta", beta},
                          {"transA", transA},
                          {"transB", transB},
                      });
}

Node *CreateReshape(Graph *graph, Node *node,
                    const std::vector<Node *> &inputs,
                    const std::vector<Node *> &outputs,
                    const std::vector<int64_t> &oshape) {
  auto attr = AttributeMap{
      {"value", oshape},
      {"dims", std::vector<int64_t>{static_cast<int64_t>(oshape.size())}},
      {"dtype", ONNXDataType::INT64}};
  auto new_node_const =
      CreateBaseOp(graph, node, "popart_constant", {}, {}, attr);
  auto new_node_reshape =
      CreateBaseOp(graph, node, "popart_reshape",
                   {inputs[0], new_node_const->outputs[0]}, outputs);
  return new_node_reshape;
}

Node *CreateConv(Graph *graph, Node *node, const std::vector<Node *> &inputs,
                 const std::vector<Node *> &outputs,
                 const std::vector<int64_t> &dilations, int64_t group,
                 const std::vector<int64_t> &kernel_shape,
                 const std::vector<int64_t> &pads,
                 const std::vector<int64_t> &strides) {
  auto attrs = AttributeMap{
      {"dilations", dilations},
      {"group", group},
      {"kernel_shape", kernel_shape},
      {"pads", pads},
      {"strides", strides},
  };
  return CreateBaseOp(graph, node, "popart_conv", inputs, outputs, attrs);
}

Node *CreateSoftmaxOpset11(Graph *graph, Node *node,
                           const std::vector<Node *> &inputs,
                           const std::vector<Node *> &outputs, int64_t axis) {
  PADDLE_ENFORCE_EQ(inputs.size(), 1,
                    platform::errors::InvalidArgument(
                        "Softmax op only support one input"));
  auto x_shape = inputs[0]->Var()->GetShape();
  int x_rank = x_shape.size();
  if (axis < 0) {
    axis = axis + x_rank;
  }
  if (axis == x_rank - 1) {
    return CreateBaseOp(graph, node, "popart_softmax", inputs, outputs,
                        {{"axis", int64_t{-1}}});
  } else {
    auto perm = std::vector<int64_t>(x_rank);
    std::iota(perm.begin(), perm.end(), 0);
    perm[x_rank - 1] = axis;
    perm[axis] = x_rank - 1;
    auto new_transpose_pre = CreateBaseOp(graph, node, "popart_transpose",
                                          inputs, {}, {{"perm", perm}});
    auto new_softmax =
        CreateBaseOp(graph, node, "popart_softmax",
                     new_transpose_pre->outputs, {}, {{"axis", int64_t{-1}}});
    return CreateBaseOp(graph, node, "popart_transpose",
                        new_softmax->outputs, outputs, {{"perm", perm}});
  }
}

}  // namespace ipu
}  // namespace platform
}  // namespace paddle
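To illustrate CreateOpIdentifyId's naming format, here is a stand-alone mirror of the string it builds (the _gen_op_ counter value is process-global, so the exact suffix depends on how many ops were generated before):

#include <string>
#include <vector>

// Stand-alone mirror of CreateOpIdentifyId's format, for illustration only.
std::string OpIdentifyId(const std::string &op_type,
                         const std::vector<std::string> &out_names,
                         int gen_op_counter) {
  std::string id = op_type;
  for (const auto &name : out_names) id += "|" + name;
  return id + "|_gen_op_" + std::to_string(gen_op_counter);
}
// OpIdentifyId("relu", {"relu_0.tmp_0"}, 0) == "relu|relu_0.tmp_0|_gen_op_0"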
paddle/fluid/platform/device/ipu/popart_canonicalization/op_builder.h
0 → 100644
// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include "paddle/fluid/platform/device/ipu/common.h"
#include "paddle/fluid/platform/device/ipu/popart_canonicalization/canonicalization_utils.h"
namespace paddle {
namespace platform {
namespace ipu {

using paddle::framework::AttributeMap;

template <typename T>
AttributeMap MakeConstAttrMap(std::vector<T> value, std::vector<int64_t> dims,
                              int dtype) {
  return AttributeMap{{"value", value}, {"dims", dims}, {"dtype", dtype}};
}

template <typename T>
AttributeMap MakeConstAttrMapFromValue(T v, std::vector<int64_t> dims,
                                       int dtype) {
  size_t size = 1;
  for (auto &dim : dims) {
    size *= dim;
  }
  return MakeConstAttrMap<T>(std::vector<T>(size, v), dims, dtype);
}

const std::string GenerateVarName();
const std::string CreateOpIdentifyId(Node *node);

Node *MakeVarNode(Graph *graph, Node *node);
Node *MakeOpNode(Graph *graph, Node *node, const std::string &type,
                 const std::vector<Node *> &inputs,
                 const std::vector<Node *> &outputs);

Node *CreateBaseOp(Graph *graph, Node *node, const std::string &type,
                   const std::vector<Node *> &inputs,
                   const std::vector<Node *> &outputs,
                   const AttributeMap &attrs = {});

Node *CreateConst(Graph *graph, Node *node, const std::vector<Node *> &inputs,
                  const std::vector<Node *> &outputs,
                  const AttributeMap &attrs);

// otype is proto::VarType::Type
Node *CreateCast(Graph *graph, Node *node, const std::vector<Node *> &inputs,
                 const std::vector<Node *> &outputs, const int otype);

Node *CreateGemm(Graph *graph, Node *node, const std::vector<Node *> &inputs,
                 const std::vector<Node *> &outputs, int64_t transA = 0,
                 int64_t transB = 0, float alpha = 1.0f, float beta = 1.0f);

Node *CreateReshape(Graph *graph, Node *node,
                    const std::vector<Node *> &inputs,
                    const std::vector<Node *> &outputs,
                    const std::vector<int64_t> &oshape);

Node *CreateConv(Graph *graph, Node *node, const std::vector<Node *> &inputs,
                 const std::vector<Node *> &outputs,
                 const std::vector<int64_t> &dilations = {1, 1},
                 int64_t group = 1,
                 const std::vector<int64_t> &kernel_shape = {},
                 const std::vector<int64_t> &pads = {0, 0, 0, 0},
                 const std::vector<int64_t> &strides = {1, 1});

Node *CreateSoftmaxOpset11(Graph *graph, Node *node,
                           const std::vector<Node *> &inputs,
                           const std::vector<Node *> &outputs, int64_t axis);

}  // namespace ipu
}  // namespace platform
}  // namespace paddle
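A usage sketch for the const-attribute helpers (the values here are illustrative, not from the commit):

// Builds the AttributeMap for a 2x3 float constant filled with 0.5f:
//   value = {0.5, 0.5, 0.5, 0.5, 0.5, 0.5}, dims = {2, 3}, dtype = FLOAT.
auto attrs = paddle::platform::ipu::MakeConstAttrMapFromValue<float>(
    0.5f, {2, 3}, ONNXDataType::FLOAT);
// Typically fed straight into CreateConst(graph, node, {}, {}, attrs).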
paddle/fluid/platform/device/ipu/supported_ops_autogen.h
...
@@ -195,3 +195,5 @@ OP_DECL(popart_sqrt, aiOnnxOpset.sqrt, NONE) // NOLINT
OP_DECL(popart_tanh, aiOnnxOpset.tanh, NONE)                          // NOLINT
OP_DECL(popart_tile, aiOnnxOpset.tile, NONE)                          // NOLINT
OP_DECL(popart_transpose, aiOnnxOpset.transpose, ARG(INT_VEC, perm))  // NOLINT
// clang-format on
paddle/fluid/platform/device_context.cc
...
@@ -16,6 +16,9 @@ limitations under the License. */
#include "paddle/fluid/memory/allocation/cuda_device_context_allocator.h"
#include "paddle/fluid/platform/cuda_device_guard.h"
#endif
#ifdef PADDLE_WITH_IPU
#include "paddle/fluid/platform/ipu/ipu_backend.h"
#endif
#include "glog/logging.h"
#include "paddle/fluid/platform/profiler.h"
...
@@ -96,8 +99,9 @@ platform::DeviceContext* DeviceContextPool::Get(const platform::Place& place) {
  if (it == device_contexts_.end()) {
    PADDLE_THROW(platform::errors::Unimplemented(
        "Place %s is not supported. Please check that your paddle compiles "
        "with WITH_GPU, WITH_XPU, WITH_IPU or WITH_ASCEND_CL option or check "
        "that your train process set the correct device id if you use "
        "Executor.",
        place));
  }
  return it->second.get().get();
...
@@ -158,6 +162,14 @@ DeviceContextPool::DeviceContextPool(
    PADDLE_THROW(platform::errors::Unimplemented(
        "XPUPlace is not supported. Please "
        "re-compile with WITH_XPU option."));
#endif
  } else if (platform::is_ipu_place(p)) {
#ifdef PADDLE_WITH_IPU
    EmplaceDeviceContext<IPUDeviceContext, IPUPlace>(&device_contexts_, p);
#else
    PADDLE_THROW(platform::errors::Unimplemented(
        "IPUPlace is not supported. Please "
        "re-compile with WITH_IPU option."));
#endif
  } else if (platform::is_npu_place(p)) {
#ifdef PADDLE_WITH_ASCEND_CL
...
@@ -195,6 +207,22 @@ Eigen::DefaultDevice* CPUDeviceContext::eigen_device() const {
Place CPUDeviceContext::GetPlace() const { return place_; }

#ifdef PADDLE_WITH_IPU
IPUDeviceContext::IPUDeviceContext(IPUPlace place) : place_(place) {
  int id = place.GetDeviceId();
  std::shared_ptr<platform::ipu::IpuBackend> ipu_backend =
      platform::ipu::IpuBackend::GetInstance();
  device_ = ipu_backend->GetDevice(id);
}

Place IPUDeviceContext::GetPlace() const { return place_; }

void IPUDeviceContext::Wait() const {
  /*! \brief Wait for all operations completion in the stream. */
}

IPUDeviceContext::~IPUDeviceContext() {}
#endif

#ifdef PADDLE_WITH_XPU
XPUDeviceContext::XPUDeviceContext() {
  context_ = xpu::create_context();
...
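A hedged usage sketch of the new IPU path in the pool (names per the diff above; assumes a build with WITH_IPU so the branch does not throw):

// Fetch the device context for IPU 0 from the global pool; with the change
// above this no longer hits the "Place ... is not supported" throw.
paddle::platform::IPUPlace place(0);
auto &pool = paddle::platform::DeviceContextPool::Instance();
auto *dev_ctx = pool.Get(place);  // an IPUDeviceContext* under the hood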
paddle/fluid/platform/place.cc
...
@@ -36,6 +36,7 @@ class PlacePrinter : public boost::static_visitor<> {
  void operator()(const XPUPlace &p) { os_ << "XPUPlace(" << p.device << ")"; }
  void operator()(const NPUPlace &p) { os_ << "NPUPlace(" << p.device << ")"; }
  void operator()(const NPUPinnedPlace &p) { os_ << "NPUPinnedPlace"; }
  void operator()(const IPUPlace &p) { os_ << "IPUPlace(" << p.device << ")"; }
  void operator()(const CUDAPinnedPlace &p) { os_ << "CUDAPinnedPlace"; }

 private:
...
@@ -56,6 +57,10 @@ bool is_npu_place(const Place &p) {
  return boost::apply_visitor(IsNPUPlace(), p);
}

bool is_ipu_place(const Place &p) {
  return boost::apply_visitor(IsIPUPlace(), p);
}

bool is_cpu_place(const Place &p) {
  return boost::apply_visitor(IsCPUPlace(), p);
}
...
@@ -80,6 +85,8 @@ bool is_same_place(const Place &p1, const Place &p2) {
    return BOOST_GET_CONST(XPUPlace, p1) == BOOST_GET_CONST(XPUPlace, p2);
  } else if (is_npu_place(p1)) {
    return BOOST_GET_CONST(NPUPlace, p1) == BOOST_GET_CONST(NPUPlace, p2);
  } else if (is_ipu_place(p1)) {
    return BOOST_GET_CONST(IPUPlace, p1) == BOOST_GET_CONST(IPUPlace, p2);
  } else {
    return BOOST_GET_CONST(CUDAPlace, p1) == BOOST_GET_CONST(CUDAPlace, p2);
  }
}
...
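A quick sketch of the new predicates in action (illustrative only):

#include <cassert>

paddle::platform::Place a = paddle::platform::IPUPlace(0);
paddle::platform::Place b = paddle::platform::IPUPlace(0);
assert(paddle::platform::is_ipu_place(a));
assert(paddle::platform::is_same_place(a, b));  // same variant, same device id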
paddle/fluid/platform/place.h
...
@@ -95,12 +95,25 @@ struct NPUPinnedPlace {
...
@@ -95,12 +95,25 @@ struct NPUPinnedPlace {
inline
bool
operator
!=
(
const
NPUPinnedPlace
&
)
const
{
return
false
;
}
inline
bool
operator
!=
(
const
NPUPinnedPlace
&
)
const
{
return
false
;
}
inline
bool
operator
<
(
const
NPUPinnedPlace
&
)
const
{
return
false
;
}
inline
bool
operator
<
(
const
NPUPinnedPlace
&
)
const
{
return
false
;
}
};
};
struct
IPUPlace
{
IPUPlace
()
:
IPUPlace
(
0
)
{}
explicit
IPUPlace
(
int
d
)
:
device
(
d
)
{}
inline
int
GetDeviceId
()
const
{
return
device
;
}
// needed for variant equality comparison
inline
bool
operator
==
(
const
IPUPlace
&
o
)
const
{
return
device
==
o
.
device
;
}
inline
bool
operator
!=
(
const
IPUPlace
&
o
)
const
{
return
!
(
*
this
==
o
);
}
inline
bool
operator
<
(
const
IPUPlace
&
o
)
const
{
return
device
<
o
.
device
;
}
int
device
;
};
struct
IsCUDAPlace
:
public
boost
::
static_visitor
<
bool
>
{
struct
IsCUDAPlace
:
public
boost
::
static_visitor
<
bool
>
{
bool
operator
()(
const
CPUPlace
&
)
const
{
return
false
;
}
bool
operator
()(
const
CPUPlace
&
)
const
{
return
false
;
}
bool
operator
()(
const
XPUPlace
&
)
const
{
return
false
;
}
bool
operator
()(
const
XPUPlace
&
)
const
{
return
false
;
}
bool
operator
()(
const
NPUPlace
&
)
const
{
return
false
;
}
bool
operator
()(
const
NPUPlace
&
)
const
{
return
false
;
}
bool
operator
()(
const
NPUPinnedPlace
&
)
const
{
return
false
;
}
bool
operator
()(
const
NPUPinnedPlace
&
)
const
{
return
false
;
}
bool
operator
()(
const
IPUPlace
&
)
const
{
return
false
;
}
bool
operator
()(
const
CUDAPlace
&
)
const
{
return
true
;
}
bool
operator
()(
const
CUDAPlace
&
)
const
{
return
true
;
}
bool
operator
()(
const
CUDAPinnedPlace
&
)
const
{
return
false
;
}
bool
operator
()(
const
CUDAPinnedPlace
&
)
const
{
return
false
;
}
};
};
...
@@ -110,6 +123,7 @@ struct IsCPUPlace : public boost::static_visitor<bool> {
...
@@ -110,6 +123,7 @@ struct IsCPUPlace : public boost::static_visitor<bool> {
bool
operator
()(
const
XPUPlace
&
)
const
{
return
false
;
}
bool
operator
()(
const
XPUPlace
&
)
const
{
return
false
;
}
bool
operator
()(
const
NPUPlace
&
)
const
{
return
false
;
}
bool
operator
()(
const
NPUPlace
&
)
const
{
return
false
;
}
bool
operator
()(
const
NPUPinnedPlace
&
)
const
{
return
false
;
}
bool
operator
()(
const
NPUPinnedPlace
&
)
const
{
return
false
;
}
bool
operator
()(
const
IPUPlace
&
)
const
{
return
false
;
}
bool
operator
()(
const
CUDAPlace
&
)
const
{
return
false
;
}
bool
operator
()(
const
CUDAPlace
&
)
const
{
return
false
;
}
bool
operator
()(
const
CUDAPinnedPlace
&
)
const
{
return
false
;
}
bool
operator
()(
const
CUDAPinnedPlace
&
)
const
{
return
false
;
}
};
};
...
@@ -119,6 +133,7 @@ struct IsCUDAPinnedPlace : public boost::static_visitor<bool> {
...
@@ -119,6 +133,7 @@ struct IsCUDAPinnedPlace : public boost::static_visitor<bool> {
bool
operator
()(
const
XPUPlace
&
)
const
{
return
false
;
}
bool
operator
()(
const
XPUPlace
&
)
const
{
return
false
;
}
bool
operator
()(
const
NPUPlace
&
)
const
{
return
false
;
}
bool
operator
()(
const
NPUPlace
&
)
const
{
return
false
;
}
bool
operator
()(
const
NPUPinnedPlace
&
)
const
{
return
false
;
}
bool
operator
()(
const
NPUPinnedPlace
&
)
const
{
return
false
;
}
bool
operator
()(
const
IPUPlace
&
)
const
{
return
false
;
}
bool
operator
()(
const
CUDAPlace
&
)
const
{
return
false
;
}
bool
operator
()(
const
CUDAPlace
&
)
const
{
return
false
;
}
bool
operator
()(
const
CUDAPinnedPlace
&
cuda_pinned
)
const
{
return
true
;
}
bool
operator
()(
const
CUDAPinnedPlace
&
cuda_pinned
)
const
{
return
true
;
}
};
};
...
@@ -128,6 +143,7 @@ struct IsXPUPlace : public boost::static_visitor<bool> {
  bool operator()(const XPUPlace &) const { return true; }
  bool operator()(const NPUPlace &) const { return false; }
  bool operator()(const NPUPinnedPlace &) const { return false; }
  bool operator()(const IPUPlace &) const { return false; }
  bool operator()(const CUDAPlace &) const { return false; }
  bool operator()(const CUDAPinnedPlace &) const { return false; }
};
...
@@ -137,6 +153,7 @@ struct IsNPUPlace : public boost::static_visitor<bool> {
  bool operator()(const XPUPlace &) const { return false; }
  bool operator()(const NPUPlace &) const { return true; }
  bool operator()(const NPUPinnedPlace &) const { return false; }
  bool operator()(const IPUPlace &) const { return false; }
  bool operator()(const CUDAPlace &) const { return false; }
  bool operator()(const CUDAPinnedPlace &) const { return false; }
};
...
@@ -145,22 +162,33 @@ struct IsNPUPinnedPlace : public boost::static_visitor<bool> {
  bool operator()(const CPUPlace &) const { return false; }
  bool operator()(const XPUPlace &) const { return false; }
  bool operator()(const NPUPlace &) const { return false; }
  bool operator()(const IPUPlace &) const { return false; }
  bool operator()(const CUDAPlace &) const { return false; }
  bool operator()(const CUDAPinnedPlace &) const { return false; }
  bool operator()(const NPUPinnedPlace &) const { return true; }
};

struct IsIPUPlace : public boost::static_visitor<bool> {
  bool operator()(const CPUPlace &) const { return false; }
  bool operator()(const XPUPlace &) const { return false; }
  bool operator()(const NPUPlace &) const { return false; }
  bool operator()(const IPUPlace &) const { return true; }
  bool operator()(const CUDAPlace &) const { return false; }
  bool operator()(const CUDAPinnedPlace &) const { return false; }
  bool operator()(const NPUPinnedPlace &) const { return false; }
};
class Place : public boost::variant<CUDAPlace, XPUPlace, NPUPlace, CPUPlace,
                                    CUDAPinnedPlace, NPUPinnedPlace,
                                    IPUPlace> {
 private:
  using PlaceBase = boost::variant<CUDAPlace, XPUPlace, NPUPlace, CPUPlace,
                                   CUDAPinnedPlace, NPUPinnedPlace, IPUPlace>;

 public:
  Place() = default;
  Place(const CPUPlace &cpu_place) : PlaceBase(cpu_place) {}     // NOLINT
  Place(const XPUPlace &xpu_place) : PlaceBase(xpu_place) {}     // NOLINT
  Place(const NPUPlace &npu_place) : PlaceBase(npu_place) {}     // NOLINT
  Place(const IPUPlace &ipu_place) : PlaceBase(ipu_place) {}     // NOLINT
  Place(const CUDAPlace &cuda_place) : PlaceBase(cuda_place) {}  // NOLINT
  Place(const CUDAPinnedPlace &cuda_pinned_place)                // NOLINT
      : PlaceBase(cuda_pinned_place) {}
...
@@ -180,6 +208,7 @@ using PlaceList = std::vector<Place>;
bool is_gpu_place(const Place &);
bool is_xpu_place(const Place &);
bool is_npu_place(const Place &);
bool is_ipu_place(const Place &);
bool is_cpu_place(const Place &);
bool is_cuda_pinned_place(const Place &);
bool is_npu_pinned_place(const Place &);
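The diff only adds the declaration of is_ipu_place; its definition is not part of this hunk. Mirroring the existing is_*_place helpers, it presumably reduces to applying the new IsIPUPlace visitor (a minimal sketch, assuming the definition lives in place.cc alongside the others):

bool is_ipu_place(const Place &p) {
  return boost::apply_visitor(IsIPUPlace(), p);
}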
...
@@ -228,6 +257,15 @@ struct PlaceVisitorWrapper
    return typename Visitor::result_type();
#endif
  }

  typename Visitor::result_type operator()(const IPUPlace &ipu) const {
#ifdef PADDLE_WITH_IPU
    return visitor_(ipu);
#else
    PADDLE_THROW(platform::errors::Unavailable(
        "Paddle is not compiled with IPU. Cannot visit ipu device"));
    return typename Visitor::result_type();
#endif
  }

  typename Visitor::result_type operator()(const CUDAPlace &cuda) const {
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
...
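PlaceVisitorWrapper forwards to the wrapped visitor only when Paddle was built with the matching device support, and throws otherwise. A sketch of the entry point that presumably drives it (modeled on the visit_place helper elsewhere in place.h; the name and exact signature are an assumption):

template <typename Visitor>
typename Visitor::result_type visit_place(const Place &place,
                                          const Visitor &visitor) {
  // The wrapper supplies the per-device #ifdef guards, so visiting an
  // IPUPlace in a build without PADDLE_WITH_IPU fails loudly instead of
  // silently mis-dispatching.
  return boost::apply_visitor(PlaceVisitorWrapper<Visitor>(visitor), place);
}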
paddle/fluid/pybind/.gitignore

pybind.h
op_function_impl.h
eager_op_function_impl.h
paddle/fluid/pybind/pybind.cc

...
@@ -130,6 +130,10 @@ limitations under the License. */
#endif
#include "paddle/fluid/platform/cuda_graph_with_memory_pool.h"
#ifdef PADDLE_WITH_IPU
#include "paddle/fluid/platform/ipu/ipu_backend.h"
#include "paddle/fluid/platform/ipu_info.h"
#endif
#ifdef PADDLE_WITH_CRYPTO
#include "paddle/fluid/pybind/crypto.h"
...
@@ -199,6 +203,14 @@ bool IsCompiledWithNPU() {
#endif
}
bool IsCompiledWithIPU() {
#ifndef PADDLE_WITH_IPU
  return false;
#else
  return true;
#endif
}

bool IsCompiledWithMKLDNN() {
#ifndef PADDLE_WITH_MKLDNN
  return false;
...
@@ -812,6 +824,8 @@ PYBIND11_MODULE(core_noavx, m) {
           py::arg("array"), py::arg("place"), py::arg("zero_copy") = false)
      .def("set", SetTensorFromPyArray<paddle::platform::NPUPlace>,
           py::arg("array"), py::arg("place"), py::arg("zero_copy") = false)
      .def("set", SetTensorFromPyArray<paddle::platform::IPUPlace>,
           py::arg("array"), py::arg("place"), py::arg("zero_copy") = false)
      .def("set", SetTensorFromPyArray<paddle::platform::CUDAPinnedPlace>,
           py::arg("array"), py::arg("place"), py::arg("zero_copy") = false,
           R"DOC(
...
@@ -819,7 +833,7 @@ PYBIND11_MODULE(core_noavx, m) {
        Args:
          lod (numpy.ndarray): The data to set.
          place (CPUPlace|CUDAPlace|XPUPlace|IPUPlace|CUDAPinnedPlace|NPUPlace):
              The place where the LoDTensor is to be set.
          zero_copy (bool, optional): Whether to share memory with the input numpy array.
              This parameter only works with CPUPlace. Default: False.
...
@@ -1909,6 +1923,58 @@ All parameter, weight, gradient are variables in Paddle.
           [](const platform::NPUPlace &self) { return self.GetDeviceId(); })
      .def("__str__", string::to_string<const platform::NPUPlace &>);

  // IPUPlace
  py::class_<platform::IPUPlace>(m, "IPUPlace", R"DOC(
    IPUPlace is a descriptor of a device.
    It represents an IPU device on which a tensor will be allocated and a model will run.

    Examples:
        .. code-block:: python

            import paddle

            # required: ipu
            ipu_place = paddle.IPUPlace()

    )DOC")
      .def("__init__",
           [](platform::IPUPlace &self) {
#ifdef PADDLE_WITH_IPU
             if (platform::GetIPUDeviceCount() == 0) {
               LOG(ERROR) << "Cannot use IPU because there is no IPU "
                             "detected on your machine.";
               std::exit(-1);
             }
             // use ipu(0) to compile, while running with the number the user
             // configures in sharding and pipelining.
             new (&self) platform::IPUPlace(0);
#else
             LOG(ERROR) << string::Sprintf(
                 "Cannot use IPU because you didn't install IPU version "
                 "PaddlePaddle.\n"
                 "If you want to use IPU, please try to install IPU version "
                 "PaddlePaddle by: pip install paddlepaddle*\n"
                 "If you only have CPU, please change IPUPlace to be "
                 "CPUPlace().\n");
             std::exit(-1);
#endif
           })
      .def("_type", &PlaceIndex<platform::IPUPlace>)
      .def("_equals", &IsSamePlace<platform::IPUPlace, platform::Place>)
      .def("_equals", &IsSamePlace<platform::IPUPlace, platform::CUDAPlace>)
      .def("_equals", &IsSamePlace<platform::IPUPlace, platform::CPUPlace>)
      .def("_equals", &IsSamePlace<platform::IPUPlace, platform::XPUPlace>)
      .def("_equals", &IsSamePlace<platform::IPUPlace, platform::NPUPlace>)
      .def("_equals", &IsSamePlace<platform::IPUPlace, platform::IPUPlace>)
      .def("_equals",
           &IsSamePlace<platform::IPUPlace, platform::CUDAPinnedPlace>)
#ifdef PADDLE_WITH_IPU
      .def("get_device_id",
           [](const platform::IPUPlace &self) { return self.GetDeviceId(); })
#endif
      .def("__str__", string::to_string<const platform::IPUPlace &>);
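The _type and _equals entries above bind small helper templates defined earlier in pybind.cc. Roughly, under that assumption (a simplified sketch, not part of this hunk):

template <typename PlaceType>
static inline int PlaceIndex(const PlaceType &p) {
  // index of the alternative held inside the boost::variant behind Place
  return static_cast<int>(paddle::platform::Place(p).which());
}

template <typename PlaceType1, typename PlaceType2>
static inline bool IsSamePlace(const PlaceType1 &p1, const PlaceType2 &p2) {
  // normalizes both operands to Place, then uses the operators defined
  // on the place structs (e.g. IPUPlace::operator== above)
  return paddle::platform::Place(p1) == paddle::platform::Place(p2);
}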
  py::class_<platform::Place> platformplace(m, "Place");
  g_place_pytype = reinterpret_cast<PyTypeObject *>(platformplace.ptr());
  platformplace.def(py::init<>())
...
@@ -1918,6 +1984,7 @@ All parameter, weight, gradient are variables in Paddle.
      .def("_equals", &IsSamePlace<platform::Place, platform::CPUPlace>)
      .def("_equals", &IsSamePlace<platform::Place, platform::XPUPlace>)
      .def("_equals", &IsSamePlace<platform::Place, platform::NPUPlace>)
      .def("_equals", &IsSamePlace<platform::Place, platform::IPUPlace>)
      .def("_equals", &IsSamePlace<platform::Place, platform::CUDAPinnedPlace>)
      .def("is_gpu_place",
           [](platform::Place &self) { return platform::is_gpu_place(self); })
...
@@ -1927,6 +1994,8 @@ All parameter, weight, gradient are variables in Paddle.
           [](platform::Place &self) { return platform::is_xpu_place(self); })
      .def("is_npu_place",
           [](platform::Place &self) { return platform::is_npu_place(self); })
      .def("is_ipu_place",
           [](platform::Place &self) { return platform::is_ipu_place(self); })
      .def("is_cuda_pinned_place",
           [](platform::Place &self) {
             return platform::is_cuda_pinned_place(self);
...
@@ -1943,6 +2012,10 @@ All parameter, weight, gradient are variables in Paddle.
           [](platform::Place &self) {
             return BOOST_GET_CONST(platform::NPUPlace, self).device;
           })
      .def("ipu_device_id",
           [](platform::Place &self) {
             return BOOST_GET_CONST(platform::IPUPlace, self).device;
           })
      .def("set_place", [](platform::Place &self,
                           const platform::Place &other) { self = other; })
      .def("set_place",
...
@@ -1966,6 +2039,10 @@ All parameter, weight, gradient are variables in Paddle.
           [](platform::Place &self, const platform::NPUPlace &npu_place) {
             self = npu_place;
           })
      .def("set_place",
           [](platform::Place &self, const platform::IPUPlace &ipu_place) {
             self = ipu_place;
           })
      .def("__repr__", string::to_string<const platform::Place &>)
      .def("__str__", string::to_string<const platform::Place &>);
...
@@ -2197,6 +2274,7 @@ All parameter, weight, gradient are variables in Paddle.
  m.def("is_compiled_with_ascend", IsCompiledWithAscend);
  m.def("is_compiled_with_rocm", IsCompiledWithROCM);
  m.def("is_compiled_with_npu", IsCompiledWithNPU);
  m.def("is_compiled_with_ipu", IsCompiledWithIPU);
  m.def("is_compiled_with_xpu", IsCompiledWithXPU);
  m.def("is_compiled_with_mkldnn", IsCompiledWithMKLDNN);
  m.def("is_compiled_with_cinn", IsCompiledWithCINN);
...
@@ -2516,6 +2594,10 @@ All parameter, weight, gradient are variables in Paddle.
  });
#endif

#ifdef PADDLE_WITH_IPU
  m.def("get_ipu_device_count", platform::GetIPUDeviceCount);
#endif

  py::enum_<platform::TracerOption>(m, "TracerOption", py::arithmetic())
      .value("kDefault", platform::TracerOption::kDefault)
      .value("kOpDetail", platform::TracerOption::kOpDetail)
...
@@ -2593,6 +2675,11 @@ All parameter, weight, gradient are variables in Paddle.
                  bool val) { self.Set<bool>(name, new bool(val)); })
      .def("set", [](ir::Pass &self, const std::string &name,
                     int val) { self.Set<const int>(name, new int(val)); })
      .def("set",
           [](ir::Pass &self, const std::string &name,
              std::vector<std::string> set) {
             self.Set(name, new std::vector<std::string>(set));
           })
      .def("set",
           [](ir::Pass &self, const std::string &name,
              std::unordered_set<std::string> set) {
...
@@ -3425,6 +3512,118 @@ All parameter, weight, gradient are variables in Paddle.
      })
      .def("device_count", &ParallelExecutor::DeviceCount);

#ifdef PADDLE_WITH_IPU
  py::class_<platform::ipu::IpuBackend,
             std::shared_ptr<platform::ipu::IpuBackend>>(m, "IpuBackend")
      .def(py::init(&platform::ipu::IpuBackend::GetNewInstance))
      .def("clear", &platform::ipu::IpuBackend::Clear)
      .def("set_scope", &platform::ipu::IpuBackend::SetScope)
      .def("set_ipu_strategy", &platform::ipu::IpuBackend::SetIpuStrategy);

  py::class_<platform::ipu::IpuStrategy>(m, "IpuStrategy")
      .def(py::init())
      .def_property(
          "num_ipus",
          [](const platform::ipu::IpuStrategy &self) { return self.num_ipus; },
          [](platform::ipu::IpuStrategy &self, int num_ipus) {
            self.num_ipus = num_ipus;
          },
          R"DOC(
            Int type, the number of IPUs we need. Default 1.
          )DOC")
      .def_property(
          "accumulationFactor",
          [](const platform::ipu::IpuStrategy &self) {
            return self.popart_options_.accumulationFactor;
          },
          [](platform::ipu::IpuStrategy &self, int accumulationFactor) {
            self.popart_options_.accumulationFactor = accumulationFactor;
          },
          R"DOC(
            Specify the number of micro-batches to accumulate before
            applying the varUpdate. Default 1.
          )DOC")
      .def_property(
          "batches_per_step",
          [](const platform::ipu::IpuStrategy &self) {
            return self.batches_per_step;
          },
          [](platform::ipu::IpuStrategy &self, int batches_per_step) {
            self.batches_per_step = batches_per_step;
          },
          R"DOC(
            Int type, set batches_per_step. Default 1.
          )DOC")
      .def_property(
          "is_training",
          [](const platform::ipu::IpuStrategy &self) {
            return self.is_training;
          },
          [](platform::ipu::IpuStrategy &self, bool is_training) {
            self.is_training = is_training;
          },
          R"DOC(
            Bool type, True for training, False for inference. Default True.
          )DOC")
      .def_property(
          "enable_pipelining",
          [](const platform::ipu::IpuStrategy &self) {
            return self.popart_options_.enablePipelining;
          },
          [](platform::ipu::IpuStrategy &self, bool enable_pipelining) {
            self.popart_options_.enablePipelining = enable_pipelining;
          },
          R"DOC(
            Bool type, True to enable pipelining, otherwise disabled. Default False.
          )DOC")
      .def_property(
          "enable_manual_shard",
          [](const platform::ipu::IpuStrategy &self) {
            return self.popart_options_.virtualGraphMode ==
                   platform::ipu::VirtualGraphMode::Manual;
          },
          [](platform::ipu::IpuStrategy &self, bool enable_ipu_shard) {
            if (enable_ipu_shard) {
              self.popart_options_.virtualGraphMode =
                  platform::ipu::VirtualGraphMode::Manual;
            } else {
              self.popart_options_.virtualGraphMode =
                  platform::ipu::VirtualGraphMode::Off;
            }
          },
          R"DOC(
            Bool type, True to enable model sharding, otherwise disabled.
            Default False.
          )DOC")
      .def_property(
          "need_avg_shard",
          [](const platform::ipu::IpuStrategy &self) {
            return self.need_avg_shard;
          },
          [](platform::ipu::IpuStrategy &self, bool need_avg_shard) {
            self.need_avg_shard = need_avg_shard;
          },
          R"DOC(
            Bool type, True to enable average sharding, otherwise disabled. Default False.
          )DOC")
      .def_property(
          "batch_size",
          [](const platform::ipu::IpuStrategy &self) {
            return self.batch_size;
          },
          [](platform::ipu::IpuStrategy &self, int batch_size) {
            self.batch_size = batch_size;
          },
          R"DOC(
            Int type, used to make batch size fixed. Default 1.
          )DOC")
      .def_property(
          "enable_fp16",
          [](const platform::ipu::IpuStrategy &self) {
            return self.enable_fp16;
          },
          [](platform::ipu::IpuStrategy &self, bool enable_fp16) {
            self.enable_fp16 = enable_fp16;
          },
          R"DOC(
            Bool type, True to enable float16 mode, otherwise disabled. Default False.
          )DOC");
#endif
  BindFleetWrapper(&m);
  BindIO(&m);
...
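Since the IpuStrategy bindings above expose plain public fields one-to-one, the C++ side is direct member assignment. A minimal sketch of configuring a strategy from C++ (illustrative values, not taken from this diff; assumes the fields shown in the bindings above):

platform::ipu::IpuStrategy strategy;
strategy.num_ipus = 2;                             // shard across two IPUs
strategy.is_training = false;                      // inference session
strategy.popart_options_.enablePipelining = true;  // overlap pipeline stages
strategy.popart_options_.virtualGraphMode =
    platform::ipu::VirtualGraphMode::Manual;       // manual sharding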
paddle/fluid/pybind/reader_py.cc

...
@@ -37,6 +37,9 @@ PADDLE_DEFINE_EXPORTED_bool(
    "If set true, the queue.pop will only get data from queue but not "
    "remove the data from queue for speed testing");

// disable auto conversion to list in Python
PYBIND11_MAKE_OPAQUE(paddle::framework::LoDTensorArray);

namespace paddle {
namespace pybind {
...
paddle/fluid/pybind/tensor_py.h

...
@@ -313,6 +313,21 @@ void SetTensorFromPyArrayT(
    PADDLE_THROW(platform::errors::PermissionDenied(
        "Cannot use XPUPlace in CPU/GPU version, "
        "Please recompile or reinstall Paddle with XPU support."));
#endif
  } else if (paddle::platform::is_ipu_place(place)) {
#ifdef PADDLE_WITH_IPU
    if (zero_copy) {
      auto holder = std::make_shared<details::NumpyAllocation<T>>(array);
      auto type = framework::ToDataType(std::type_index(typeid(T)));
      self->ResetHolderWithType(holder, type);
    } else {
      auto dst = self->mutable_data<T>(place);
      std::memcpy(dst, array.data(), array.nbytes());
    }
#else
    PADDLE_THROW(platform::errors::PermissionDenied(
        "Cannot use IPUPlace in CPU/GPU/XPU/NPU version, "
        "Please recompile or reinstall Paddle with IPU support."));
#endif
  } else if (paddle::platform::is_npu_place(place)) {
#ifdef PADDLE_WITH_ASCEND_CL
...
paddle/pten/api/lib/CMakeLists.txt

...
@@ -22,6 +22,10 @@ set(api_source_file ${CMAKE_SOURCE_DIR}/paddle/pten/api/lib/api.cc)
set(api_header_file_tmp ${api_header_file}.tmp)
set(api_source_file_tmp ${api_source_file}.tmp)

if (NOT PYTHON_EXECUTABLE)
  find_package(PythonInterp REQUIRED)
endif()

add_custom_command(
  OUTPUT ${api_header_file} ${api_source_file}
  COMMAND ${PYTHON_EXECUTABLE} -m pip install pyyaml
...
paddle/pten/api/lib/kernel_declare.h (new file, mode 100644)

/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */

#pragma once

#include "paddle/pten/core/kernel_registry.h"

// TODO(chenweihang): After the kernel is split into a single file,
// the kernel declare statement is automatically generated according to the
// file name of the kernel, and this header file will be removed

PT_DECLARE_KERNEL(full_like, CPU);
PT_DECLARE_KERNEL(dot, CPU);
PT_DECLARE_KERNEL(flatten, CPU);
PT_DECLARE_KERNEL(sign, CPU);

#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
PT_DECLARE_KERNEL(full_like, CUDA);
PT_DECLARE_KERNEL(dot, CUDA);
PT_DECLARE_KERNEL(flatten, CUDA);
PT_DECLARE_KERNEL(sign, CUDA);
#endif

#ifdef PADDLE_WITH_XPU
PT_DECLARE_KERNEL(flatten, XPU);
#endif
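What each of these lines buys, per the PT_DECLARE_KERNEL definition introduced later in this commit (kernel_registry.h): the including translation unit references the kernel's "touch" symbol, so the linker cannot discard the object file that performed the static registration. Roughly, PT_DECLARE_KERNEL(flatten, CPU) expands to (UNUSED is Paddle's unused-variable attribute macro):

extern int TouchKernelSymbolFor_flatten_CPU();
UNUSED static int __declare_kernel_symbol_for_flatten_CPU =
    TouchKernelSymbolFor_flatten_CPU();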
paddle/pten/api/lib/utils.cc

...
@@ -25,10 +25,14 @@ limitations under the License. */
 #include "paddle/pten/include/core.h"
 #include "paddle/pten/include/infermeta.h"

-PT_DECLARE_MODULE(UtilsCPU);
+PT_DECLARE_KERNEL(copy, CPU);

 #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
-PT_DECLARE_MODULE(UtilsCUDA);
+PT_DECLARE_KERNEL(copy, CUDA);
 #endif

+#ifdef PADDLE_WITH_XPU
+PT_DECLARE_KERNEL(copy, XPU);
+#endif
+
 namespace paddle {
...
paddle/pten/core/kernel_alias_name.h

...
@@ -27,13 +27,13 @@ const std::unordered_map<std::string, std::string> kernel_alias_name_map = {
     {"fill_any_like", "full_like"},
     {"fill_constant", "full"},
     {"flatten_contiguous_range", "flatten"},
-    // {"matmul_v2", "matmul"},
+    {"matmul_v2", "matmul"},
     {"reduce_mean", "mean"},
     {"reduce_sum", "sum"},
     {"reshape2", "reshape"},
     // fluid kernel "mean/reshape/matmul/flatten/sum" should be deprecated
     {"flatten", "deprecated"},
-    // {"matmul", "deprecated"},
+    {"matmul", "deprecated"},
     {"mean", "deprecated"},
     {"reshape", "deprecated"},
     {"sum", "deprecated"}};
...
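A sketch of how this map is presumably consumed by TransToPtenKernelName in the compat layer (the function itself is outside this diff): aliased fluid op names are rewritten to their pten kernel names, and unknown names pass through unchanged.

std::string TransToPtenKernelName(const std::string &fluid_op_name) {
  auto it = kernel_alias_name_map.find(fluid_op_name);
  return it != kernel_alias_name_map.end() ? it->second : fluid_op_name;
}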
paddle/pten/core/kernel_factory.h

...
@@ -265,12 +265,8 @@ class KernelFactory {
   KernelMap &kernels() { return kernels_; }

-  void InsertCompatibleOpType(const std::string &op_type) {
-    compatible_op_types_.insert(op_type);
-  }
-
   bool HasCompatiblePtenKernel(const std::string &op_type) const {
-    return compatible_op_types_.count(TransToPtenKernelName(op_type)) > 0;
+    return kernels_.find(TransToPtenKernelName(op_type)) != kernels_.end();
   }

   const Kernel &SelectKernelOrThrowError(const KernelName &kernel_name,
...
@@ -288,9 +284,6 @@ class KernelFactory {
   KernelFactory() = default;

   KernelMap kernels_;
-  // Used to be compatible with the original execution system and
-  // quickly confirm whether the new kernel can be called
-  std::unordered_set<std::string> compatible_op_types_;
 };

 /** operator << overload **/
...
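The practical effect of the HasCompatiblePtenKernel change: the factory no longer maintains a parallel compatible_op_types_ set that had to be kept in sync at registration time; a compatibility query is now a direct lookup in the kernel map itself. A call-site sketch (the op name is hypothetical):

if (pten::KernelFactory::Instance().HasCompatiblePtenKernel("reshape2")) {
  // kernel_alias_name_map resolves reshape2 -> reshape, and the registered
  // pten kernel is dispatched instead of the legacy fluid kernel
}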
paddle/pten/core/kernel_registry.h

...
@@ -15,6 +15,7 @@
 #pragma once

 #include <cstring>
+#include <string>
 #include <type_traits>
 #include <typeindex>
 #include <typeinfo>
...
@@ -24,6 +25,8 @@
 #include "paddle/pten/core/kernel_factory.h"
 #include "paddle/pten/core/kernel_utils.h"

+#include "paddle/fluid/platform/enforce.h"
+
 namespace pten {

 #define BACKEND(arg__) pten::Backend::arg__
...
@@ -140,7 +143,6 @@ struct KernelRegistrar {
     Kernel kernel(kernel_fn);
     args_parse_fn(kernel_key, kernel.mutable_args_def());
     args_def_fn(&kernel);
-    KernelFactory::Instance().InsertCompatibleOpType(kernel_name.name());
     KernelFactory::Instance().kernels()[kernel_name][kernel_key] = kernel;
   }
 };
...
@@ -193,64 +195,35 @@ struct KernelRegistrar {
 #define _PT_ARG_N(args) _PT_ARG_N_EXPAND args
 #define _PT_RESQ_N() 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0

+/** PT_REGISTER_KERNEL
+ *
+ * The most frequently used kernel registration macro, used for kernel
+ * registration with only data type as template parameter, and the function
+ * pointer of the corresponding data type is automatically instantiated
+ * during registration.
+ */
 #define PT_REGISTER_KERNEL( \
     kernel_name, backend, layout, meta_kernel_fn, cpp_dtype, ...) \
-  _PT_REGISTER_KERNEL(kernel_name, \
-                      PT_ID, \
-                      backend, \
-                      layout, \
-                      meta_kernel_fn, \
-                      cpp_dtype, \
-                      __VA_ARGS__)
+  PT_STATIC_ASSERT_GLOBAL_NAMESPACE( \
+      pt_register_kernel_ns_check_##kernel_name, \
+      "PT_REGISTER_KERNEL must be called in global namespace."); \
+  _PT_REGISTER_KERNEL( \
+      kernel_name, backend, layout, meta_kernel_fn, cpp_dtype, __VA_ARGS__)

 #ifndef _WIN32
 #define _PT_REGISTER_KERNEL( \
-    kernel_name, func_id, backend, layout, meta_kernel_fn, cpp_dtype, ...) \
-  PT_STATIC_ASSERT_GLOBAL_NAMESPACE( \
-      PT_CONCATENATE(pt_op_kernel_ns_check_, func_id), \
-      "PT_REGISTER_KERNEL must be called in global namespace."); \
+    kernel_name, backend, layout, meta_kernel_fn, cpp_dtype, ...) \
   PT_KERNEL_INSTANTIATION(meta_kernel_fn, cpp_dtype, __VA_ARGS__); \
-  static void PT_CONCATENATE(__PT_KERNEL_args_def_FN_, \
-                             func_id)(::pten::Kernel*); \
+  static void __PT_KERNEL_args_def_FN_##kernel_name(::pten::Kernel*); \
   PT_KERNEL_REGISTRAR_INIT(kernel_name, \
-                           func_id, \
                            backend, \
                            layout, \
-                           &PT_CONCATENATE(__PT_KERNEL_args_def_FN_, func_id), \
+                           &__PT_KERNEL_args_def_FN_##kernel_name, \
                            meta_kernel_fn, \
                            cpp_dtype, \
                            __VA_ARGS__); \
-  void PT_CONCATENATE(__PT_KERNEL_args_def_FN_, \
-                      func_id)(::pten::Kernel * kernel)
+  void __PT_KERNEL_args_def_FN_##kernel_name(::pten::Kernel* kernel)
 #else
-#define _PT_REGISTER_KERNEL( \
-    kernel_name, func_id, backend, layout, meta_kernel_fn, cpp_dtype, ...) \
-  PT_STATIC_ASSERT_GLOBAL_NAMESPACE( \
-      PT_CONCATENATE(pt_op_kernel_ns_check_, func_id), \
-      "PT_REGISTER_KERNEL must be called in global namespace."); \
-  static void PT_CONCATENATE(__PT_KERNEL_args_def_FN_, \
-                             func_id)(::pten::Kernel*); \
-  PT_KERNEL_REGISTRAR_INIT(kernel_name, \
-                           func_id, \
-                           backend, \
-                           layout, \
-                           &PT_CONCATENATE(__PT_KERNEL_args_def_FN_, func_id), \
-                           meta_kernel_fn, \
-                           cpp_dtype, \
-                           __VA_ARGS__); \
-  void PT_CONCATENATE(__PT_KERNEL_args_def_FN_, \
-                      func_id)(::pten::Kernel * kernel)
-#endif
-
-#define PT_KERNEL_INSTANTIATION(meta_kernel_fn, cpp_dtype, ...) \
-  _PT_KERNEL_INSTANTIATION(PT_NARGS(cpp_dtype, __VA_ARGS__), \
-                           meta_kernel_fn, \
-                           cpp_dtype, \
-                           __VA_ARGS__)
-
-#define _PT_KERNEL_INSTANTIATION(N, meta_kernel_fn, cpp_dtype, ...) \
-  PT_CONCATENATE(_PT_KERNEL_INSTANTIATION_, N) \
-  (meta_kernel_fn, cpp_dtype, __VA_ARGS__)
 /**
  * `template decltype(fn) fn` can work on gcc and clang,
  * but msvc will failed, error like:
...
@@ -261,8 +234,30 @@ struct KernelRegistrar {
  *
  * https://stackoverflow.com/questions/63989585/explicit-instantiation-of-function-using-decltype-work-on-g-but-not-on-visua
  *
- *
+ * So we solve the explicit instantiation of kernel by CMake,
+ * and msvc can work without template instantiation
  */
+#define _PT_REGISTER_KERNEL( \
+    kernel_name, backend, layout, meta_kernel_fn, cpp_dtype, ...) \
+  static void __PT_KERNEL_args_def_FN_##kernel_name(::pten::Kernel*); \
+  PT_KERNEL_REGISTRAR_INIT(kernel_name, \
+                           backend, \
+                           layout, \
+                           &__PT_KERNEL_args_def_FN_##kernel_name, \
+                           meta_kernel_fn, \
+                           cpp_dtype, \
+                           __VA_ARGS__); \
+  void __PT_KERNEL_args_def_FN_##kernel_name(::pten::Kernel* kernel)
+#endif
+
+#define PT_KERNEL_INSTANTIATION(meta_kernel_fn, cpp_dtype, ...) \
+  _PT_KERNEL_INSTANTIATION(PT_NARGS(cpp_dtype, __VA_ARGS__), \
+                           meta_kernel_fn, \
+                           cpp_dtype, \
+                           __VA_ARGS__)
+
+#define _PT_KERNEL_INSTANTIATION(N, meta_kernel_fn, cpp_dtype, ...) \
+  PT_CONCATENATE(_PT_KERNEL_INSTANTIATION_, N) \
+  (meta_kernel_fn, cpp_dtype, __VA_ARGS__)
 #define _PT_KERNEL_INSTANTIATION_1(meta_kernel_fn, cpp_dtype, ...) \
   template decltype(meta_kernel_fn<cpp_dtype>) meta_kernel_fn<cpp_dtype>
...
@@ -309,22 +304,15 @@ struct KernelRegistrar {
   template decltype(meta_kernel_fn<cpp_dtype>) meta_kernel_fn<cpp_dtype>; \
   PT_EXPAND(_PT_KERNEL_INSTANTIATION_14(meta_kernel_fn, __VA_ARGS__))

-#define PT_KERNEL_REGISTRAR_INIT(kernel_name, \
-                                 func_id, \
-                                 backend, \
-                                 layout, \
-                                 args_def_fn, \
-                                 meta_kernel_fn, \
-                                 cpp_dtype, \
-                                 ...) \
+#define PT_KERNEL_REGISTRAR_INIT( \
+    kernel_name, backend, layout, args_def_fn, meta_kernel_fn, cpp_dtype, ...) \
   _PT_KERNEL_REGISTRAR_INIT(PT_NARGS(cpp_dtype, __VA_ARGS__), \
                             kernel_name, \
-                            func_id, \
                             backend, \
                             layout, \
                             args_def_fn, \
                             meta_kernel_fn, \
                             cpp_dtype, \
                             __VA_ARGS__)
 // clang-format off
...
@@ -333,7 +321,6 @@ struct KernelRegistrar {
    and multi-line macros cannot be skipped with NOLINT.*/
 #define _PT_KERNEL_REGISTRAR_INIT(N, \
                                   kernel_name, \
-                                  func_id, \
                                   backend, \
                                   layout, \
                                   args_def_fn, \
...
@@ -342,7 +329,6 @@ struct KernelRegistrar {
                                   ...) \
   PT_CONCATENATE(_PT_KERNEL_REGISTRAR_INIT_, N) ( \
     kernel_name, \
-    func_id, \
     PT_ID, \
     backend, \
     layout, \
...
@@ -354,7 +340,6 @@ struct KernelRegistrar {
 // clang-format on
 #define _PT_KERNEL_REGISTRAR_INIT_1(kernel_name, \
-                                    func_id, \
                                     registrar_id, \
                                     backend, \
                                     layout, \
...
@@ -363,17 +348,17 @@ struct KernelRegistrar {
                                     cpp_dtype, \
                                     ...) \
   static const ::pten::KernelRegistrar PT_CONCATENATE( \
-      __reg_pt_op_kernel_##func_id##_, registrar_id)( \
-      kernel_name, \
+      __reg_pt_kernel_##kernel_name##_, registrar_id)( \
+      #kernel_name, \
       BACKEND(backend), \
       DATALAYOUT(layout), \
       ::paddle::experimental::CppTypeToDataType<cpp_dtype>::Type(), \
       ::pten::KernelArgsParseFunctor<decltype( \
           &meta_kernel_fn<cpp_dtype>)>::Parse, \
       args_def_fn, \
-      PT_KERNEL(meta_kernel_fn<cpp_dtype>));
+      PT_KERNEL(meta_kernel_fn<cpp_dtype>)); \
+  int TouchKernelSymbolFor_##kernel_name##_##backend() { return 0; }
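For a concrete registration such as PT_REGISTER_KERNEL(sign, CPU, ANY, pten::Sign, float) (a hypothetical but representative call; kernel and functor names are assumptions), the rewritten _PT_KERNEL_REGISTRAR_INIT_1 now emits roughly the following sketch:

// registrar named after the kernel instead of a PT_ID-derived func_id
static const ::pten::KernelRegistrar __reg_pt_kernel_sign_0(
    "sign",
    pten::Backend::CPU,
    pten::DataLayout::ANY,
    paddle::experimental::CppTypeToDataType<float>::Type(),
    pten::KernelArgsParseFunctor<decltype(&pten::Sign<float>)>::Parse,
    &__PT_KERNEL_args_def_FN_sign,
    PT_KERNEL(pten::Sign<float>));
// touch symbol consumed by PT_DECLARE_KERNEL(sign, CPU)
int TouchKernelSymbolFor_sign_CPU() { return 0; }

Naming the registrar after the kernel makes registrations collide at compile time if the same kernel is registered twice, instead of silently coexisting under distinct counter-derived names.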
 #define _PT_KERNEL_REGISTRAR_INIT_2(kernel_name, \
-                                    func_id, \
                                     registrar_id, \
                                     backend, \
                                     layout, \
...
@@ -382,8 +367,8 @@ struct KernelRegistrar {
   static const ::pten::KernelRegistrar PT_CONCATENATE( \
-      __reg_pt_op_kernel_##func_id##_, registrar_id)( \
-      kernel_name, \
+      __reg_pt_kernel_##kernel_name##_, registrar_id)( \
+      #kernel_name, \
       BACKEND(backend), \
       DATALAYOUT(layout), \
       ::paddle::experimental::CppTypeToDataType<cpp_dtype>::Type(), \
...
@@ -392,7 +377,6 @@ struct KernelRegistrar {
       args_def_fn, \
       PT_KERNEL(meta_kernel_fn<cpp_dtype>)); \
   PT_EXPAND(_PT_KERNEL_REGISTRAR_INIT_1(kernel_name, \
-                                        func_id, \
                                         PT_ID, \
                                         backend, \
                                         layout, \
...
@@ -400,7 +384,6 @@ struct KernelRegistrar {
                                         meta_kernel_fn, \
                                         __VA_ARGS__))

 #define _PT_KERNEL_REGISTRAR_INIT_3(kernel_name, \
-                                    func_id, \
                                     registrar_id, \
                                     backend, \
                                     layout, \
...
@@ -409,8 +392,8 @@ struct KernelRegistrar {
   static const ::pten::KernelRegistrar PT_CONCATENATE( \
-      __reg_pt_op_kernel_##func_id##_, registrar_id)( \
-      kernel_name, \
+      __reg_pt_kernel_##kernel_name##_, registrar_id)( \
+      #kernel_name, \
       BACKEND(backend), \
       DATALAYOUT(layout), \
       ::paddle::experimental::CppTypeToDataType<cpp_dtype>::Type(), \
...
@@ -419,7 +402,6 @@ struct KernelRegistrar {
       args_def_fn, \
       PT_KERNEL(meta_kernel_fn<cpp_dtype>)); \
   PT_EXPAND(_PT_KERNEL_REGISTRAR_INIT_2(kernel_name, \
-                                        func_id, \
                                         PT_ID, \
                                         backend, \
                                         layout, \
...
@@ -427,7 +409,6 @@ struct KernelRegistrar {
                                         meta_kernel_fn, \
                                         __VA_ARGS__))

 #define _PT_KERNEL_REGISTRAR_INIT_4(kernel_name, \
-                                    func_id, \
                                     registrar_id, \
                                     backend, \
                                     layout, \
...
@@ -436,8 +417,8 @@ struct KernelRegistrar {
   static const ::pten::KernelRegistrar PT_CONCATENATE( \
-      __reg_pt_op_kernel_##func_id##_, registrar_id)( \
-      kernel_name, \
+      __reg_pt_kernel_##kernel_name##_, registrar_id)( \
+      #kernel_name, \
       BACKEND(backend), \
       DATALAYOUT(layout), \
       ::paddle::experimental::CppTypeToDataType<cpp_dtype>::Type(), \
...
@@ -446,7 +427,6 @@ struct KernelRegistrar {
       args_def_fn, \
       PT_KERNEL(meta_kernel_fn<cpp_dtype>)); \
   PT_EXPAND(_PT_KERNEL_REGISTRAR_INIT_3(kernel_name, \
-                                        func_id, \
                                         PT_ID, \
                                         backend, \
                                         layout, \
...
@@ -454,7 +434,6 @@ struct KernelRegistrar {
                                         meta_kernel_fn, \
                                         __VA_ARGS__))

 #define _PT_KERNEL_REGISTRAR_INIT_5(kernel_name, \
-                                    func_id, \
                                     registrar_id, \
                                     backend, \
                                     layout, \
...
@@ -463,8 +442,8 @@ struct KernelRegistrar {
   static const ::pten::KernelRegistrar PT_CONCATENATE( \
-      __reg_pt_op_kernel_##func_id##_, registrar_id)( \
-      kernel_name, \
+      __reg_pt_kernel_##kernel_name##_, registrar_id)( \
+      #kernel_name, \
       BACKEND(backend), \
       DATALAYOUT(layout), \
       ::paddle::experimental::CppTypeToDataType<cpp_dtype>::Type(), \
...
@@ -473,7 +452,6 @@ struct KernelRegistrar {
       args_def_fn, \
       PT_KERNEL(meta_kernel_fn<cpp_dtype>)); \
   PT_EXPAND(_PT_KERNEL_REGISTRAR_INIT_4(kernel_name, \
-                                        func_id, \
                                         PT_ID, \
                                         backend, \
                                         layout, \
...
@@ -481,7 +459,6 @@ struct KernelRegistrar {
                                         meta_kernel_fn, \
                                         __VA_ARGS__))

 #define _PT_KERNEL_REGISTRAR_INIT_6(kernel_name, \
-                                    func_id, \
                                     registrar_id, \
                                     backend, \
                                     layout, \
...
@@ -490,8 +467,8 @@ struct KernelRegistrar {
   static const ::pten::KernelRegistrar PT_CONCATENATE( \
-      __reg_pt_op_kernel_##func_id##_, registrar_id)( \
-      kernel_name, \
+      __reg_pt_kernel_##kernel_name##_, registrar_id)( \
+      #kernel_name, \
       BACKEND(backend), \
       DATALAYOUT(layout), \
       ::paddle::experimental::CppTypeToDataType<cpp_dtype>::Type(), \
...
@@ -500,7 +477,6 @@ struct KernelRegistrar {
       args_def_fn, \
       PT_KERNEL(meta_kernel_fn<cpp_dtype>)); \
   PT_EXPAND(_PT_KERNEL_REGISTRAR_INIT_5(kernel_name, \
-                                        func_id, \
                                         PT_ID, \
                                         backend, \
                                         layout, \
...
@@ -508,7 +484,6 @@ struct KernelRegistrar {
                                         meta_kernel_fn, \
                                         __VA_ARGS__))

 #define _PT_KERNEL_REGISTRAR_INIT_7(kernel_name, \
-                                    func_id, \
                                     registrar_id, \
                                     backend, \
                                     layout, \
...
@@ -517,8 +492,8 @@ struct KernelRegistrar {
   static const ::pten::KernelRegistrar PT_CONCATENATE( \
-      __reg_pt_op_kernel_##func_id##_, registrar_id)( \
-      kernel_name, \
+      __reg_pt_kernel_##kernel_name##_, registrar_id)( \
+      #kernel_name, \
       BACKEND(backend), \
       DATALAYOUT(layout), \
       ::paddle::experimental::CppTypeToDataType<cpp_dtype>::Type(), \
...
@@ -527,7 +502,6 @@ struct KernelRegistrar {
       args_def_fn, \
       PT_KERNEL(meta_kernel_fn<cpp_dtype>)); \
   PT_EXPAND(_PT_KERNEL_REGISTRAR_INIT_6(kernel_name, \
-                                        func_id, \
                                         PT_ID, \
                                         backend, \
                                         layout, \
...
@@ -535,7 +509,6 @@ struct KernelRegistrar {
                                         meta_kernel_fn, \
                                         __VA_ARGS__))

 #define _PT_KERNEL_REGISTRAR_INIT_8(kernel_name, \
-                                    func_id, \
                                     registrar_id, \
                                     backend, \
                                     layout, \
...
@@ -544,8 +517,8 @@ struct KernelRegistrar {
   static const ::pten::KernelRegistrar PT_CONCATENATE( \
-      __reg_pt_op_kernel_##func_id##_, registrar_id)( \
-      kernel_name, \
+      __reg_pt_kernel_##kernel_name##_, registrar_id)( \
+      #kernel_name, \
       BACKEND(backend), \
       DATALAYOUT(layout), \
       ::paddle::experimental::CppTypeToDataType<cpp_dtype>::Type(), \
...
@@ -554,7 +527,6 @@ struct KernelRegistrar {
       args_def_fn, \
       PT_KERNEL(meta_kernel_fn<cpp_dtype>)); \
   PT_EXPAND(_PT_KERNEL_REGISTRAR_INIT_7(kernel_name, \
-                                        func_id, \
                                         PT_ID, \
                                         backend, \
                                         layout, \
...
@@ -562,7 +534,6 @@ struct KernelRegistrar {
                                         meta_kernel_fn, \
                                         __VA_ARGS__))

 #define _PT_KERNEL_REGISTRAR_INIT_9(kernel_name, \
-                                    func_id, \
                                     registrar_id, \
                                     backend, \
                                     layout, \
...
@@ -571,8 +542,8 @@ struct KernelRegistrar {
   static const ::pten::KernelRegistrar PT_CONCATENATE( \
-      __reg_pt_op_kernel_##func_id##_, registrar_id)( \
-      kernel_name, \
+      __reg_pt_kernel_##kernel_name##_, registrar_id)( \
+      #kernel_name, \
       BACKEND(backend), \
       DATALAYOUT(layout), \
       ::paddle::experimental::CppTypeToDataType<cpp_dtype>::Type(), \
...
@@ -581,7 +552,6 @@ struct KernelRegistrar {
       args_def_fn, \
       PT_KERNEL(meta_kernel_fn<cpp_dtype>)); \
   PT_EXPAND(_PT_KERNEL_REGISTRAR_INIT_8(kernel_name, \
-                                        func_id, \
                                         PT_ID, \
                                         backend, \
                                         layout, \
...
@@ -589,7 +559,6 @@ struct KernelRegistrar {
                                         meta_kernel_fn, \
                                         __VA_ARGS__))

 #define _PT_KERNEL_REGISTRAR_INIT_10(kernel_name, \
-                                     func_id, \
                                      registrar_id, \
                                      backend, \
                                      layout, \
...
@@ -598,8 +567,8 @@ struct KernelRegistrar {
   static const ::pten::KernelRegistrar PT_CONCATENATE( \
-      __reg_pt_op_kernel_##func_id##_, registrar_id)( \
-      kernel_name, \
+      __reg_pt_kernel_##kernel_name##_, registrar_id)( \
+      #kernel_name, \
       BACKEND(backend), \
       DATALAYOUT(layout), \
       ::paddle::experimental::CppTypeToDataType<cpp_dtype>::Type(), \
...
@@ -608,7 +577,6 @@ struct KernelRegistrar {
       args_def_fn, \
       PT_KERNEL(meta_kernel_fn<cpp_dtype>)); \
   PT_EXPAND(_PT_KERNEL_REGISTRAR_INIT_9(kernel_name, \
-                                        func_id, \
                                         PT_ID, \
                                         backend, \
                                         layout, \
...
@@ -616,7 +584,6 @@ struct KernelRegistrar {
                                         meta_kernel_fn, \
                                         __VA_ARGS__))

 #define _PT_KERNEL_REGISTRAR_INIT_11(kernel_name, \
-                                     func_id, \
                                      registrar_id, \
                                      backend, \
                                      layout, \
...
@@ -625,8 +592,8 @@ struct KernelRegistrar {
   static const ::pten::KernelRegistrar PT_CONCATENATE( \
-      __reg_pt_op_kernel_##func_id##_, registrar_id)( \
-      kernel_name, \
+      __reg_pt_kernel_##kernel_name##_, registrar_id)( \
+      #kernel_name, \
       BACKEND(backend), \
       DATALAYOUT(layout), \
       ::paddle::experimental::CppTypeToDataType<cpp_dtype>::Type(), \
...
@@ -635,7 +602,6 @@ struct KernelRegistrar {
       args_def_fn, \
       PT_KERNEL(meta_kernel_fn<cpp_dtype>)); \
   PT_EXPAND(_PT_KERNEL_REGISTRAR_INIT_10(kernel_name, \
-                                         func_id, \
                                          PT_ID, \
                                          backend, \
                                          layout, \
...
@@ -643,7 +609,6 @@ struct KernelRegistrar {
                                          meta_kernel_fn, \
                                          __VA_ARGS__))

 #define _PT_KERNEL_REGISTRAR_INIT_12(kernel_name, \
-                                     func_id, \
                                      registrar_id, \
                                      backend, \
                                      layout, \
...
@@ -652,8 +617,8 @@ struct KernelRegistrar {
   static const ::pten::KernelRegistrar PT_CONCATENATE( \
-      __reg_pt_op_kernel_##func_id##_, registrar_id)( \
-      kernel_name, \
+      __reg_pt_kernel_##kernel_name##_, registrar_id)( \
+      #kernel_name, \
       BACKEND(backend), \
       DATALAYOUT(layout), \
       ::paddle::experimental::CppTypeToDataType<cpp_dtype>::Type(), \
...
@@ -662,7 +627,6 @@ struct KernelRegistrar {
       args_def_fn, \
       PT_KERNEL(meta_kernel_fn<cpp_dtype>)); \
   PT_EXPAND(_PT_KERNEL_REGISTRAR_INIT_11(kernel_name, \
-                                         func_id, \
                                          PT_ID, \
                                          backend, \
                                          layout, \
...
@@ -670,7 +634,6 @@ struct KernelRegistrar {
                                          meta_kernel_fn, \
                                          __VA_ARGS__))

 #define _PT_KERNEL_REGISTRAR_INIT_13(kernel_name, \
-                                     func_id, \
                                      registrar_id, \
                                      backend, \
                                      layout, \
...
@@ -679,8 +642,8 @@ struct KernelRegistrar {
   static const ::pten::KernelRegistrar PT_CONCATENATE( \
-      __reg_pt_op_kernel_##func_id##_, registrar_id)( \
-      kernel_name, \
+      __reg_pt_kernel_##kernel_name##_, registrar_id)( \
+      #kernel_name, \
       BACKEND(backend), \
       DATALAYOUT(layout), \
       ::paddle::experimental::CppTypeToDataType<cpp_dtype>::Type(), \
...
@@ -689,7 +652,6 @@ struct KernelRegistrar {
       args_def_fn, \
       PT_KERNEL(meta_kernel_fn<cpp_dtype>)); \
   PT_EXPAND(_PT_KERNEL_REGISTRAR_INIT_12(kernel_name, \
-                                         func_id, \
                                          PT_ID, \
                                          backend, \
                                          layout, \
...
@@ -697,7 +659,6 @@ struct KernelRegistrar {
                                          meta_kernel_fn, \
                                          __VA_ARGS__))

 #define _PT_KERNEL_REGISTRAR_INIT_14(kernel_name, \
-                                     func_id, \
                                      registrar_id, \
                                      backend, \
                                      layout, \
...
@@ -706,8 +667,8 @@ struct KernelRegistrar {
   static const ::pten::KernelRegistrar PT_CONCATENATE( \
-      __reg_pt_op_kernel_##func_id##_, registrar_id)( \
-      kernel_name, \
+      __reg_pt_kernel_##kernel_name##_, registrar_id)( \
+      #kernel_name, \
       BACKEND(backend), \
       DATALAYOUT(layout), \
       ::paddle::experimental::CppTypeToDataType<cpp_dtype>::Type(), \
...
@@ -716,7 +677,6 @@ struct KernelRegistrar {
       args_def_fn, \
       PT_KERNEL(meta_kernel_fn<cpp_dtype>)); \
   PT_EXPAND(_PT_KERNEL_REGISTRAR_INIT_13(kernel_name, \
-                                         func_id, \
                                          PT_ID, \
                                          backend, \
                                          layout, \
...
@@ -724,7 +684,6 @@ struct KernelRegistrar {
                                          meta_kernel_fn, \
                                          __VA_ARGS__))

 #define _PT_KERNEL_REGISTRAR_INIT_15(kernel_name, \
-                                     func_id, \
                                      registrar_id, \
                                      backend, \
                                      layout, \
...
@@ -733,8 +692,8 @@ struct KernelRegistrar {
   static const ::pten::KernelRegistrar PT_CONCATENATE( \
-      __reg_pt_op_kernel_##func_id##_, registrar_id)( \
-      kernel_name, \
+      __reg_pt_kernel_##kernel_name##_, registrar_id)( \
+      #kernel_name, \
       BACKEND(backend), \
       DATALAYOUT(layout), \
       ::paddle::experimental::CppTypeToDataType<cpp_dtype>::Type(), \
...
@@ -743,7 +702,6 @@ struct KernelRegistrar {
       args_def_fn, \
       PT_KERNEL(meta_kernel_fn<cpp_dtype>)); \
   PT_EXPAND(_PT_KERNEL_REGISTRAR_INIT_14(kernel_name, \
-                                         func_id, \
                                          PT_ID, \
                                          backend, \
                                          layout, \
@@ -751,90 +709,59 @@ struct KernelRegistrar {
...
@@ -751,90 +709,59 @@ struct KernelRegistrar {
meta_kernel_fn, \
meta_kernel_fn, \
__VA_ARGS__))
__VA_ARGS__))
- #define PT_REGISTER_KERNEL_STANDARD(                                         \
-     kernel_name, backend, layout, dtype, kernel_fn)                          \
-   _PT_REGISTER_KERNEL_STANDARD(                                              \
-       kernel_name, PT_ID, backend, layout, dtype, kernel_fn)
-
- #define _PT_REGISTER_KERNEL_STANDARD(                                        \
-     kernel_name, func_id, backend, layout, dtype, kernel_fn)                 \
-   PT_STATIC_ASSERT_GLOBAL_NAMESPACE(                                         \
-       PT_CONCATENATE(pt_op_kernel_ns_check_, func_id),                       \
-       "_PT_REGISTER_KERNEL_STANDARD must be called in global namespace.");   \
-   template decltype(kernel_fn) kernel_fn;                                    \
-   static void PT_CONCATENATE(__PT_KERNEL_args_def_FN_,                       \
-                              func_id)(::pten::Kernel*);                      \
-   static const ::pten::KernelRegistrar PT_CONCATENATE(__reg_pt_op_kernel_,   \
-                                                       func_id)(              \
-       kernel_name,                                                           \
-       BACKEND(backend),                                                      \
-       DATALAYOUT(layout),                                                    \
-       DATATYPE(dtype),                                                       \
-       ::pten::KernelArgsParseFunctor<decltype(&kernel_fn)>::Parse,           \
-       args_def_fn,                                                           \
-       PT_KERNEL(kernel_fn));                                                 \
-   void PT_CONCATENATE(__PT_KERNEL_args_def_FN_, func_id)(::pten::Kernel*)
+ /** PT_REGISTER_SINGLE_KERNEL
+  *
+  * Used to register a single kernel, pass in the complete function pointer
+  * of the kernel, this registration macro will not do automatic template
+  * instantiation.
+  */
+ #define PT_REGISTER_SINGLE_KERNEL(                                           \
+     kernel_name, backend, layout, dtype, kernel_fn)                          \
+   PT_STATIC_ASSERT_GLOBAL_NAMESPACE(                                         \
+       pt_register_single_kernel_ns_check_##kernel_name,                      \
+       "PT_REGISTER_SINGLE_KERNEL must be called in global namespace.");      \
+   static void __PT_SINGLE_KERNEL_args_def_FN_##kernel_name(::pten::Kernel*); \
+   static const ::pten::KernelRegistrar __reg_pt_single_kernel_##kernel_name( \
+       #kernel_name,                                                          \
+       BACKEND(backend),                                                      \
+       DATALAYOUT(layout),                                                    \
+       DATATYPE(dtype),                                                       \
+       ::pten::KernelArgsParseFunctor<decltype(&kernel_fn)>::Parse,           \
+       args_def_fn,                                                           \
+       PT_KERNEL(kernel_fn));                                                 \
+   int TouchKernelSymbolFor_##kernel_name##_##backend() { return 0; }         \
+   void __PT_SINGLE_KERNEL_args_def_FN_##kernel_name(::pten::Kernel*)

- // use to declare symbol
- #define PT_REGISTER_MODULE(name) \
-   int RegisterSymbolsFor##name() { return 0; }
-
- #define PT_DECLARE_MODULE(name)          \
-   extern int RegisterSymbolsFor##name(); \
-   UNUSED static int use_kernel_module_##name = RegisterSymbolsFor##name()
+ /** PT_REGISTER_KERNEL_ALL_DTYPE
+  *
+  * Used to register a kernel that supports all data types, such as copy and
+  * reshape that are not sensitive to data types.
+  */
+ #define PT_REGISTER_KERNEL_ALL_DTYPE(kernel_name, backend, layout, kernel_fn) \
+   PT_STATIC_ASSERT_GLOBAL_NAMESPACE(                                         \
+       pt_register_kernel_all_dtype_ns_check_##kernel_name,                   \
+       "PT_REGISTER_KERNEL_ALL_DTYPE must be called in global namespace.");   \
+   static void __PT_KERNEL_ALL_DTYPE_args_def_FN_##kernel_name(               \
+       ::pten::Kernel*);                                                      \
+   static const ::pten::KernelRegistrar                                       \
+       __reg_pt_kernel_all_dtype_##kernel_name(                               \
+           #kernel_name,                                                      \
+           BACKEND(backend),                                                  \
+           DATALAYOUT(layout),                                                \
+           ::pten::KernelArgsParseFunctor<decltype(&kernel_fn)>::Parse,       \
+           &__PT_KERNEL_ALL_DTYPE_args_def_FN_##kernel_name,                  \
+           PT_KERNEL(kernel_fn));                                             \
+   int TouchKernelSymbolFor_##kernel_name##_##backend() { return 0; }         \
+   void __PT_KERNEL_ALL_DTYPE_args_def_FN_##kernel_name(::pten::Kernel* kernel)

- // only used in cpp tests
- #define PT_REGISTER_KERNEL_FOR_TEST(                                         \
-     kernel_name, backend, layout, meta_kernel_fn, cpp_dtype, ...)            \
-   _PT_REGISTER_KERNEL_FOR_TEST(kernel_name,                                  \
-                                PT_ID,                                        \
-                                backend,                                      \
-                                layout,                                       \
-                                meta_kernel_fn,                               \
-                                cpp_dtype,                                    \
-                                __VA_ARGS__)
-
- #define _PT_REGISTER_KERNEL_FOR_TEST(                                        \
-     kernel_name, func_id, backend, layout, meta_kernel_fn, cpp_dtype, ...)   \
-   PT_STATIC_ASSERT_GLOBAL_NAMESPACE(                                         \
-       PT_CONCATENATE(pt_op_kernel_for_test_ns_check_, func_id),              \
-       "PT_REGISTER_KERNEL must be called in global namespace.");             \
-   static void PT_CONCATENATE(__PT_KERNEL_for_test_args_def_FN_,              \
-                              func_id)(::pten::Kernel*);                      \
-   PT_KERNEL_REGISTRAR_INIT(                                                  \
-       kernel_name,                                                           \
-       func_id,                                                               \
-       backend,                                                               \
-       layout,                                                                \
-       &PT_CONCATENATE(__PT_KERNEL_for_test_args_def_FN_, func_id),           \
-       meta_kernel_fn,                                                        \
-       cpp_dtype,                                                             \
-       __VA_ARGS__);                                                          \
-   void PT_CONCATENATE(__PT_KERNEL_for_test_args_def_FN_,                     \
-                       func_id)(::pten::Kernel * kernel)
-
- #define PT_REGISTER_KERNEL_WITH_NO_TYPE(                                     \
-     kernel_name, backend, layout, meta_kernel_fn)                            \
-   _PT_REGISTER_KERNEL_WITH_NO_TYPE(                                          \
-       kernel_name, PT_ID, backend, layout, meta_kernel_fn)
-
- #define _PT_REGISTER_KERNEL_WITH_NO_TYPE(                                    \
-     kernel_name, func_id, backend, layout, meta_kernel_fn)                   \
-   PT_STATIC_ASSERT_GLOBAL_NAMESPACE(                                         \
-       PT_CONCATENATE(pt_op_kernel_ns_check_, func_id),                       \
-       "PT_REGISTER_KERNEL must be called in global namespace.");             \
-   decltype(meta_kernel_fn) meta_kernel_fn;                                   \
-   static void PT_CONCATENATE(__PT_KERNEL_args_def_FN_,                       \
-                              func_id)(::pten::Kernel*);                      \
-   static const ::pten::KernelRegistrar PT_CONCATENATE(__reg_pt_op_kernel_,   \
-                                                       func_id)(              \
-       kernel_name,                                                           \
-       BACKEND(backend),                                                      \
-       DATALAYOUT(layout),                                                    \
-       ::pten::KernelArgsParseFunctor<decltype(&meta_kernel_fn)>::Parse,      \
-       &PT_CONCATENATE(__PT_KERNEL_args_def_FN_, func_id),                    \
-       PT_KERNEL(meta_kernel_fn));                                            \
-   void PT_CONCATENATE(__PT_KERNEL_args_def_FN_,                              \
-                       func_id)(::pten::Kernel * kernel)
+ /** PT_DECLARE_KERNEL
+  *
+  * Used to export the symbols of the file where the kernel is located,
+  * to avoid being removed by linker
+  */
+ #define PT_DECLARE_KERNEL(kernel_name, backend)                              \
+   extern int TouchKernelSymbolFor_##kernel_name##_##backend();               \
+   UNUSED static int __declare_kernel_symbol_for_##kernel_name##_##backend =  \
+       TouchKernelSymbolFor_##kernel_name##_##backend()

  }  // namespace pten
paddle/pten/kernels/cpu/creation.cc
@@ -61,9 +61,7 @@ void FillConstant(const CPUContext& dev_ctx,
  }  // namespace pten

- PT_REGISTER_MODULE(CreationCPU);
-
- PT_REGISTER_KERNEL("full_like",
+ PT_REGISTER_KERNEL(full_like,
                     CPU,
                     ANY,
                     pten::FillAnyLike,
...
@@ -74,7 +72,7 @@ PT_REGISTER_KERNEL("full_like",
                     bool,
                     paddle::platform::float16) {}

- PT_REGISTER_KERNEL("full",
+ PT_REGISTER_KERNEL(full,
                     CPU,
                     ANY,
                     pten::FillConstant,
...
paddle/pten/kernels/cpu/linalg.cc
@@ -70,12 +70,10 @@ void Matmul(const CPUContext& dev_ctx,
  }  // namespace pten

- PT_REGISTER_MODULE(LinalgCPU);
-
  using complex64 = ::paddle::platform::complex<float>;
  using complex128 = ::paddle::platform::complex<double>;

- PT_REGISTER_KERNEL("dot",
+ PT_REGISTER_KERNEL(dot,
                     CPU,
                     ANY,
                     pten::Dot,
...
@@ -87,5 +85,4 @@ PT_REGISTER_KERNEL("dot",
                     complex128) {}

- PT_REGISTER_KERNEL("matmul_v2",
+ PT_REGISTER_KERNEL(matmul,
                     CPU,
                     ANY,
                     pten::Matmul,
                     float,
                     double,
                     complex64,
                     complex128) {}
paddle/pten/kernels/cpu/manipulation.cc
@@ -130,12 +130,9 @@ void Cast(const CPUContext& dev_ctx,
  }  // namespace pten

- // TODO(chenweihang): replace by better impl
- PT_REGISTER_MODULE(ManipulationCPU);
-
  // TODO(yuanrisheng): "flatten_contiguous_range" is compatible with old kernel
  // architecture, kernel_name should be "flatten".
- PT_REGISTER_KERNEL("flatten",
+ PT_REGISTER_KERNEL(flatten,
                     CPU,
                     ANY,
                     pten::Flatten,
...
@@ -145,8 +142,7 @@ PT_REGISTER_KERNEL("flatten",
                     int8_t,
                     int,
                     int64_t) {}

- PT_REGISTER_KERNEL("flatten.mid",
+ PT_REGISTER_KERNEL(flatten_mid,
                     CPU,
                     ANY,
                     pten::FlattenWithXShape,
...
@@ -156,7 +152,8 @@ PT_REGISTER_KERNEL("flatten.mid",
                     int8_t,
                     int,
                     int64_t) {}

- PT_REGISTER_KERNEL("cast",
+ PT_REGISTER_KERNEL(cast,
                     CPU,
                     ANY,
                     pten::Cast,
...
@@ -174,42 +171,33 @@ PT_REGISTER_KERNEL("cast",
    kernel->OutputAt(0).SetDataType(paddle::experimental::DataType::UNDEFINED);
  }

- // TODO(yuanrisheng): "reshape2" is compatible with old kernel
- // architecture, kernel_name should be "reshape".
- PT_REGISTER_KERNEL_WITH_NO_TYPE("reshape",
-                                 CPU,
-                                 ANY,
-                                 pten::ReshapeFromVectorVal) {}
+ PT_REGISTER_KERNEL_ALL_DTYPE(reshape,
+                              CPU,
+                              ANY,
+                              pten::ReshapeFromVectorVal) {}

- PT_REGISTER_KERNEL_WITH_NO_TYPE("reshape.mid",
-                                 CPU,
-                                 ANY,
-                                 pten::ReshapeFromVectorValWithXShape) {}
+ PT_REGISTER_KERNEL_ALL_DTYPE(reshape_mid,
+                              CPU,
+                              ANY,
+                              pten::ReshapeFromVectorValWithXShape) {}

- PT_REGISTER_KERNEL_WITH_NO_TYPE("reshape.host",
-                                 CPU,
-                                 ANY,
-                                 pten::ReshapeFromDT) {
+ PT_REGISTER_KERNEL_ALL_DTYPE(reshape_host,
+                              CPU,
+                              ANY,
+                              pten::ReshapeFromDT) {
    kernel->InputAt(1).SetBackend(pten::Backend::CPU);
    kernel->InputAt(1).SetDataType(paddle::experimental::DataType::INT32);
  }

- PT_REGISTER_KERNEL_WITH_NO_TYPE("reshape.host.mid",
-                                 CPU,
-                                 ANY,
-                                 pten::ReshapeFromDTWithXShape) {
+ PT_REGISTER_KERNEL_ALL_DTYPE(reshape_host_mid,
+                              CPU,
+                              ANY,
+                              pten::ReshapeFromDTWithXShape) {
    kernel->InputAt(1).SetBackend(pten::Backend::CPU);
    kernel->InputAt(1).SetDataType(paddle::experimental::DataType::INT32);
  }

- PT_REGISTER_KERNEL_WITH_NO_TYPE("reshape.mulhost",
-                                 CPU,
-                                 ANY,
-                                 pten::ReshapeFromVectorDT) {
+ PT_REGISTER_KERNEL_ALL_DTYPE(reshape_mulhost,
+                              CPU,
+                              ANY,
+                              pten::ReshapeFromVectorDT) {
    kernel->InputAt(1).SetBackend(pten::Backend::CPU);
    kernel->InputAt(1).SetDataType(paddle::experimental::DataType::INT32);
  }

- PT_REGISTER_KERNEL_WITH_NO_TYPE("reshape.mulhost.mid",
-                                 CPU,
-                                 ANY,
-                                 pten::ReshapeFromVectorDTWithXShape) {
+ PT_REGISTER_KERNEL_ALL_DTYPE(reshape_mulhost_mid,
+                              CPU,
+                              ANY,
+                              pten::ReshapeFromVectorDTWithXShape) {
    kernel->InputAt(1).SetBackend(pten::Backend::CPU);
    kernel->InputAt(1).SetDataType(paddle::experimental::DataType::INT32);
  }
paddle/pten/kernels/cpu/math.cc
@@ -106,18 +106,14 @@ DEFINE_CPU_ELEMENTWISE_OP(Mul)
  }  // namespace pten

- // TODO(chenweihang): replace by better impl
- PT_REGISTER_MODULE(MathCPU);
-
  using complex64 = ::paddle::platform::complex<float>;
  using complex128 = ::paddle::platform::complex<double>;
  // NOTE(chenweihang): using bfloat16 will cause redefine with xpu bfloat16
  // using bfloat16 = ::paddle::platform::bfloat16;

- PT_REGISTER_KERNEL("sign", CPU, ANY, pten::Sign, float, double) {}
- PT_REGISTER_KERNEL("mean", CPU, ANY, pten::Mean, float, double, bool) {}
- PT_REGISTER_KERNEL("scale",
+ PT_REGISTER_KERNEL(sign, CPU, ANY, pten::Sign, float, double) {}
+ PT_REGISTER_KERNEL(mean, CPU, ANY, pten::Mean, float, double, bool) {}
+ PT_REGISTER_KERNEL(scale,
                     CPU,
                     ANY,
                     pten::Scale,
...
@@ -129,8 +125,7 @@ PT_REGISTER_KERNEL("scale",
                     int16_t,
                     int,
                     int64_t) {}
- PT_REGISTER_KERNEL("add",
+ PT_REGISTER_KERNEL(add,
                     CPU,
                     ANY,
                     pten::ElementwiseAdd,
...
@@ -140,7 +135,7 @@ PT_REGISTER_KERNEL("add",
                     int64_t,
                     complex64,
                     complex128) {}
- PT_REGISTER_KERNEL("subtract",
+ PT_REGISTER_KERNEL(subtract,
                     CPU,
                     ANY,
                     pten::ElementwiseSub,
...
@@ -150,7 +145,7 @@ PT_REGISTER_KERNEL("subtract",
                     int64_t,
                     complex64,
                     complex128) {}
- PT_REGISTER_KERNEL("divide",
+ PT_REGISTER_KERNEL(divide,
                     CPU,
                     ANY,
                     pten::ElementwiseDiv,
...
@@ -160,7 +155,7 @@ PT_REGISTER_KERNEL("divide",
                     int64_t,
                     complex64,
                     complex128) {}
- PT_REGISTER_KERNEL("multiply",
+ PT_REGISTER_KERNEL(multiply,
                     CPU,
                     ANY,
                     pten::ElementwiseMul,
...
@@ -171,8 +166,7 @@ PT_REGISTER_KERNEL("multiply",
                     bool,
                     complex64,
                     complex128) {}
- PT_REGISTER_KERNEL("sum",
+ PT_REGISTER_KERNEL(sum,
                     CPU,
                     ANY,
                     pten::Sum,
...
paddle/pten/kernels/cpu/utils.cc
@@ -57,7 +57,4 @@ void Copy(const CPUContext& dev_ctx,
  }  // namespace pten

- // TODO(chenweihang): replace by better impl
- PT_REGISTER_MODULE(UtilsCPU);
-
- PT_REGISTER_KERNEL_WITH_NO_TYPE("copy", CPU, ANY, pten::Copy) {}
+ PT_REGISTER_KERNEL_ALL_DTYPE(copy, CPU, ANY, pten::Copy) {}
paddle/pten/kernels/cuda/creation.cu
@@ -62,9 +62,7 @@ void FillConstant(const CUDAContext& dev_ctx,
  }  // namespace pten

- PT_REGISTER_MODULE(CreationCUDA);
-
- PT_REGISTER_KERNEL("full_like",
+ PT_REGISTER_KERNEL(full_like,
                     CUDA,
                     ANY,
                     pten::FillAnyLike,
...
@@ -75,7 +73,7 @@ PT_REGISTER_KERNEL("full_like",
                     bool,
                     paddle::platform::float16) {}

- PT_REGISTER_KERNEL("full",
+ PT_REGISTER_KERNEL(full,
                     CUDA,
                     ANY,
                     pten::FillConstant,
...
paddle/pten/kernels/cuda/linalg.cu
@@ -54,13 +54,11 @@ void Matmul(const CUDAContext& dev_ctx,
  }  // namespace pten

- PT_REGISTER_MODULE(LinalgCUDA);
-
  using float16 = paddle::platform::float16;
  using complex64 = ::paddle::platform::complex<float>;
  using complex128 = ::paddle::platform::complex<double>;

- PT_REGISTER_KERNEL("dot",
+ PT_REGISTER_KERNEL(dot,
                     CUDA,
                     ANY,
                     pten::Dot,
...
@@ -71,7 +69,7 @@ PT_REGISTER_KERNEL("dot",
                     complex64,
                     complex128) {}

- PT_REGISTER_KERNEL("matmul_v2",
+ PT_REGISTER_KERNEL(matmul,
                     CUDA,
                     ANY,
                     pten::Matmul,
...
paddle/pten/kernels/cuda/manipulation.cu
@@ -129,13 +129,9 @@ void Cast(const CUDAContext& dev_ctx,
  }  // namespace pten

- // TODO(chenweihang): replace by better impl
- PT_REGISTER_MODULE(ManipulationCUDA);
-
  using float16 = paddle::platform::float16;
- // TODO(yuanrisheng): "flatten_contiguous_range" is compatible with old kernel
- // architecture, kernel_name should be "flatten".
- PT_REGISTER_KERNEL("flatten",
+ PT_REGISTER_KERNEL(flatten,
                     CUDA,
                     ANY,
                     pten::Flatten,
...
@@ -146,8 +142,7 @@ PT_REGISTER_KERNEL("flatten",
                     int8_t,
                     int,
                     int64_t) {}

- PT_REGISTER_KERNEL("flatten.mid",
+ PT_REGISTER_KERNEL(flatten_mid,
                     CUDA,
                     ANY,
                     pten::FlattenWithXShape,
...
@@ -159,7 +154,7 @@ PT_REGISTER_KERNEL("flatten.mid",
                     int64_t) {}

  #define PTEN_REGISTER_CAST_CUDA_BASE_TYPE(op_name, ...) \
-   PT_REGISTER_KERNEL("cast",                            \
+   PT_REGISTER_KERNEL(cast,                              \
                       CUDA,                              \
                       ANY,                               \
                       pten::Cast,                        \
...
@@ -184,44 +179,33 @@ PTEN_REGISTER_CAST_CUDA_BASE_TYPE(cast, paddle::platform::bfloat16)
  PTEN_REGISTER_CAST_CUDA_BASE_TYPE(cast)
  #endif

- PT_REGISTER_KERNEL_WITH_NO_TYPE("reshape",
-                                 CUDA,
-                                 ANY,
-                                 pten::ReshapeFromVectorVal) {}
+ PT_REGISTER_KERNEL_ALL_DTYPE(reshape,
+                              CUDA,
+                              ANY,
+                              pten::ReshapeFromVectorVal) {}

- PT_REGISTER_KERNEL_WITH_NO_TYPE("reshape.mid",
-                                 CUDA,
-                                 ANY,
-                                 pten::ReshapeFromVectorValWithXShape) {}
+ PT_REGISTER_KERNEL_ALL_DTYPE(reshape_mid,
+                              CUDA,
+                              ANY,
+                              pten::ReshapeFromVectorValWithXShape) {}

- PT_REGISTER_KERNEL_WITH_NO_TYPE("reshape.host",
-                                 CUDA,
-                                 ANY,
-                                 pten::ReshapeFromDT) {
+ PT_REGISTER_KERNEL_ALL_DTYPE(reshape_host,
+                              CUDA,
+                              ANY,
+                              pten::ReshapeFromDT) {
    kernel->InputAt(1).SetBackend(pten::Backend::CPU);
    kernel->InputAt(1).SetDataType(paddle::experimental::DataType::INT32);
  }

- PT_REGISTER_KERNEL_WITH_NO_TYPE("reshape.host.mid",
-                                 CUDA,
-                                 ANY,
-                                 pten::ReshapeFromDTWithXShape) {
+ PT_REGISTER_KERNEL_ALL_DTYPE(reshape_host_mid,
+                              CUDA,
+                              ANY,
+                              pten::ReshapeFromDTWithXShape) {
    kernel->InputAt(1).SetBackend(pten::Backend::CPU);
    kernel->InputAt(1).SetDataType(paddle::experimental::DataType::INT32);
  }

- PT_REGISTER_KERNEL_WITH_NO_TYPE("reshape.mulhost",
-                                 CUDA,
-                                 ANY,
-                                 pten::ReshapeFromVectorDT) {
+ PT_REGISTER_KERNEL_ALL_DTYPE(reshape_mulhost,
+                              CUDA,
+                              ANY,
+                              pten::ReshapeFromVectorDT) {
    kernel->InputAt(1).SetBackend(pten::Backend::CPU);
    kernel->InputAt(1).SetDataType(paddle::experimental::DataType::INT32);
  }

- PT_REGISTER_KERNEL_WITH_NO_TYPE("reshape.mulhost.mid",
-                                 CUDA,
-                                 ANY,
-                                 pten::ReshapeFromVectorDTWithXShape) {
+ PT_REGISTER_KERNEL_ALL_DTYPE(reshape_mulhost_mid,
+                              CUDA,
+                              ANY,
+                              pten::ReshapeFromVectorDTWithXShape) {
    kernel->InputAt(1).SetBackend(pten::Backend::CPU);
    kernel->InputAt(1).SetDataType(paddle::experimental::DataType::INT32);
  }
paddle/pten/kernels/cuda/math.cu
@@ -111,16 +111,13 @@ void Sum(const CUDAContext& dev_ctx,
  }  // namespace pten

- // TODO(chenweihang): replace by better impl
- PT_REGISTER_MODULE(MathCUDA);
-
  using float16 = paddle::platform::float16;
  using complex64 = ::paddle::platform::complex<float>;
  using complex128 = ::paddle::platform::complex<double>;

- PT_REGISTER_KERNEL("sign", CUDA, ANY, pten::Sign, float, double, float16) {}
- PT_REGISTER_KERNEL("mean", CUDA, ANY, pten::Mean, float, double, bool) {}
- PT_REGISTER_KERNEL("scale",
+ PT_REGISTER_KERNEL(sign, CUDA, ANY, pten::Sign, float, double, float16) {}
+ PT_REGISTER_KERNEL(mean, CUDA, ANY, pten::Mean, float, double, bool) {}
+ PT_REGISTER_KERNEL(scale,
                     CUDA,
                     ANY,
                     pten::Scale,
...
@@ -132,7 +129,7 @@ PT_REGISTER_KERNEL("scale",
                     int16_t,
                     int,
                     int64_t) {}
- PT_REGISTER_KERNEL("add",
+ PT_REGISTER_KERNEL(add,
                     CUDA,
                     ANY,
                     pten::ElementwiseAdd,
...
@@ -143,7 +140,7 @@ PT_REGISTER_KERNEL("add",
                     float16,
                     complex64,
                     complex128) {}
- PT_REGISTER_KERNEL("subtract",
+ PT_REGISTER_KERNEL(subtract,
                     CUDA,
                     ANY,
                     pten::ElementwiseSub,
...
@@ -154,7 +151,7 @@ PT_REGISTER_KERNEL("subtract",
                     float16,
                     complex64,
                     complex128) {}
- PT_REGISTER_KERNEL("divide",
+ PT_REGISTER_KERNEL(divide,
                     CUDA,
                     ANY,
                     pten::ElementwiseDiv,
...
@@ -165,7 +162,7 @@ PT_REGISTER_KERNEL("divide",
                     float16,
                     complex64,
                     complex128) {}
- PT_REGISTER_KERNEL("multiply",
+ PT_REGISTER_KERNEL(multiply,
                     CUDA,
                     ANY,
                     pten::ElementwiseMul,
...
@@ -177,7 +174,7 @@ PT_REGISTER_KERNEL("multiply",
                     float16,
                     complex64,
                     complex128) {}
- PT_REGISTER_KERNEL("sum",
+ PT_REGISTER_KERNEL(sum,
                     CUDA,
                     ANY,
                     pten::Sum,
...
paddle/pten/kernels/cuda/utils.cu
@@ -234,7 +234,4 @@ void Copy(const CUDAContext& dev_ctx,
    }
  }  // namespace pten

- // TODO(chenweihang): replace by better impl
- PT_REGISTER_MODULE(UtilsCUDA);
-
- PT_REGISTER_KERNEL_WITH_NO_TYPE("copy", CUDA, ANY, pten::Copy) {}
+ PT_REGISTER_KERNEL_ALL_DTYPE(copy, CUDA, ANY, pten::Copy) {}
paddle/pten/kernels/hybird/cuda/reduce/reduce_cuda_impl.h
@@ -769,6 +769,23 @@ static void LaunchReduceKernel(const Tx* x_data,
    }
  }

+ void TensorCopy(const DenseTensor& src, DenseTensor* dst) {
+   paddle::platform::DeviceContextPool& pool =
+       paddle::platform::DeviceContextPool::Instance();
+   const paddle::platform::CUDADeviceContext* dev_ctx;
+   if (paddle::platform::is_gpu_place(dst->place()) ||
+       paddle::platform::is_npu_place(dst->place())) {
+     dev_ctx = static_cast<paddle::platform::CUDADeviceContext*>(
+         pool.Get(dst->place()));
+   } else {
+     dev_ctx = static_cast<paddle::platform::CUDADeviceContext*>(
+         pool.Get(src.place()));
+   }
+   pten::Copy(*dev_ctx, src, false, dst);
+ }
+
  template <typename Tx,
            typename Ty,
            template <typename, typename> class ReduceOp>
...
@@ -800,7 +817,7 @@ void TensorReduceFunctorImpl(const pten::DenseTensor& x,
    if (config.reduce_num == 1) {
      auto out_dims = y->dims();
      if (x.dtype() == y->dtype()) {
-       pten::Copy(*dev_ctx, x, true, y);
+       TensorCopy(x, y);
        y->Resize(out_dims);
      } else {
        PD_VISIT_ALL_TYPES(y->dtype(), "CastKernelImpl", ([&] {
...
paddle/pten/kernels/xpu/manipulation.cc
@@ -95,12 +95,7 @@ void ReshapeFromVectorDT(const XPUContext& dev_ctx,
  }  // namespace pten

- // TODO(chenweihang): replace by better impl
- PT_REGISTER_MODULE(ManipulationXPU);
-
- // TODO(yuanrisheng): "flatten_contiguous_range" is compatible with old kernel
- // architecture, kernel_name should be "flatten".
- PT_REGISTER_KERNEL("flatten_contiguous_range",
+ PT_REGISTER_KERNEL(flatten,
                     XPU,
                     ANY,
                     pten::Flatten,
...
@@ -112,7 +107,7 @@ PT_REGISTER_KERNEL("flatten_contiguous_range",
                     int,
                     int64_t) {}

- PT_REGISTER_KERNEL("flatten_contiguous_range.mid",
+ PT_REGISTER_KERNEL(flatten_mid,
                     XPU,
                     ANY,
                     pten::FlattenWithXShape,
...
@@ -124,9 +119,4 @@ PT_REGISTER_KERNEL("flatten_contiguous_range.mid",
                     int,
                     int64_t) {}

- // TODO(yuanrisheng): "reshape2" is compatible with old kernel
- // architecture, kernel_name should be "reshape".
- PT_REGISTER_KERNEL_WITH_NO_TYPE("reshape2",
-                                 XPU,
-                                 ANY,
-                                 pten::ReshapeFromVectorVal) {}
+ PT_REGISTER_KERNEL_ALL_DTYPE(reshape, XPU, ANY, pten::ReshapeFromVectorVal) {}
paddle/pten/kernels/xpu/utils.cc
@@ -76,7 +76,4 @@ void Copy(const XPUDeviceContext& dev_ctx,
  }  // namespace pten

- // TODO(chenweihang): replace by better impl
- PT_REGISTER_MODULE(UtilsXPU);
-
- PT_REGISTER_KERNEL_WITH_NO_TYPE("copy", XPU, ANY, pten::Copy) {}
+ PT_REGISTER_KERNEL_ALL_DTYPE(copy, XPU, ANY, pten::Copy) {}
paddle/pten/tests/api/test_reshape_api.cc
@@ -21,12 +21,6 @@ limitations under the License. */
  #include "paddle/pten/core/dense_tensor.h"
  #include "paddle/pten/core/kernel_registry.h"

- PT_DECLARE_MODULE(ManipulationCPU);
-
- #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
- PT_DECLARE_MODULE(ManipulationCUDA);
- #endif
-
  namespace paddle {
  namespace tests {
...
python/paddle/__init__.py
@@ -156,6 +156,9 @@ from .tensor.manipulation import roll  # noqa: F401
  from .tensor.manipulation import chunk  # noqa: F401
  from .tensor.manipulation import tolist  # noqa: F401
  from .tensor.manipulation import tensordot  # noqa: F401
+ from .tensor.manipulation import as_complex  # noqa: F401
+ from .tensor.manipulation import as_real  # noqa: F401
  from .tensor.math import abs  # noqa: F401
  from .tensor.math import acos  # noqa: F401
  from .tensor.math import asin  # noqa: F401
...
@@ -227,6 +230,8 @@ from .tensor.math import lgamma  # noqa: F401
  from .tensor.math import lerp  # noqa: F401
  from .tensor.math import rad2deg  # noqa: F401
  from .tensor.math import deg2rad  # noqa: F401
+ from .tensor.math import gcd  # noqa: F401
+ from .tensor.math import lcm  # noqa: F401
  from .tensor.math import diff  # noqa: F401
  from .tensor.math import angle  # noqa: F401
...
@@ -260,6 +265,7 @@ from .framework.random import set_cuda_rng_state  # noqa: F401
  from .framework import ParamAttr  # noqa: F401
  from .framework import create_parameter  # noqa: F401
  from .framework import CPUPlace  # noqa: F401
+ from .framework import IPUPlace  # noqa: F401
  from .framework import CUDAPlace  # noqa: F401
  from .framework import NPUPlace  # noqa: F401
  from .framework import CUDAPinnedPlace  # noqa: F401
...
@@ -291,6 +297,7 @@ from .fluid.framework import get_flags  # noqa: F401
  from .fluid.framework import set_flags  # noqa: F401
  from .device import is_compiled_with_xpu  # noqa: F401
  from .device import is_compiled_with_npu  # noqa: F401
+ from .device import is_compiled_with_ipu  # noqa: F401
  from .device import XPUPlace  # noqa: F401
  from .fluid.dygraph.base import enable_dygraph as disable_static  # noqa: F401
...
@@ -478,6 +485,8 @@ __all__ = [  # noqa
      'atan2',
      'rad2deg',
      'deg2rad',
+     'gcd',
+     'lcm',
      'expand',
      'broadcast_to',
      'ones_like',
...
@@ -553,6 +562,8 @@ __all__ = [  # noqa
      'einsum',
      'set_flags',
      'get_flags',
+     'as_complex',
+     'as_real',
      'diff',
      'angle',
  ]
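
The two hunks above only re-export existing tensor APIs at the package top level. A minimal usage sketch of my own (not part of the diff), assuming a build that already ships `tensor.manipulation.as_complex`/`as_real` and `tensor.math.gcd`/`lcm`:

  import paddle

  # as_complex pairs up the last axis (size 2) into complex numbers; as_real undoes it.
  x = paddle.to_tensor([[1.0, 2.0], [3.0, 4.0]])
  c = paddle.as_complex(x)   # complex64 tensor of shape [2]: [1+2j, 3+4j]
  r = paddle.as_real(c)      # back to the float view of shape [2, 2]

  # gcd/lcm operate elementwise on integer tensors.
  a = paddle.to_tensor([12, 20])
  b = paddle.to_tensor([20, 8])
  print(paddle.gcd(a, b))    # [4, 4]
  print(paddle.lcm(a, b))    # [60, 40]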
python/paddle/device/__init__.py
@@ -28,7 +28,9 @@ __all__ = [  # noqa
      'set_device',
      'get_device',
      'XPUPlace',
+     'IPUPlace',
      'is_compiled_with_xpu',
+     'is_compiled_with_ipu',
      'is_compiled_with_cinn',
      'is_compiled_with_cuda',
      'is_compiled_with_rocm',
...
@@ -55,6 +57,36 @@ def is_compiled_with_npu():
      return core.is_compiled_with_npu()


+ def is_compiled_with_ipu():
+     """
+     Whether paddle was built with WITH_IPU=ON to support Graphcore IPU.
+
+     Returns (bool): `True` if IPU is supported, otherwise `False`.
+
+     Examples:
+         .. code-block:: python
+
+             import paddle
+             support_ipu = paddle.is_compiled_with_ipu()
+     """
+     return core.is_compiled_with_ipu()
+
+
+ def IPUPlace():
+     """
+     Return a Graphcore IPU Place
+
+     Examples:
+         .. code-block:: python
+
+             # required: ipu
+
+             import paddle
+             place = paddle.device.IPUPlace()
+     """
+     return core.IPUPlace()
+
+
  def is_compiled_with_xpu():
      """
      Whether paddle was built with WITH_XPU=ON to support Baidu Kunlun
...
@@ -143,13 +175,19 @@ def _convert_to_place(device):
          selected_npus = os.getenv("FLAGS_selected_npus", "0").split(",")
          device_id = int(selected_npus[0])
          place = core.NPUPlace(device_id)
+     elif lower_device == 'ipu':
+         if not core.is_compiled_with_ipu():
+             raise ValueError(
+                 "The device should not be 'ipu', " \
+                 "since PaddlePaddle is not compiled with IPU")
+         place = core.IPUPlace()
      else:
          avaliable_gpu_device = re.match(r'gpu:\d+', lower_device)
          avaliable_xpu_device = re.match(r'xpu:\d+', lower_device)
          avaliable_npu_device = re.match(r'npu:\d+', lower_device)
          if not avaliable_gpu_device and not avaliable_xpu_device and not avaliable_npu_device:
              raise ValueError(
-                 "The device must be a string which is like 'cpu', 'gpu', 'gpu:x', 'xpu', 'xpu:x', 'npu' or 'npu:x'"
+                 "The device must be a string which is like 'cpu', 'gpu', 'gpu:x', 'xpu', 'xpu:x', 'npu', 'npu:x' or ipu"
              )
          if avaliable_gpu_device:
              if not core.is_compiled_with_cuda():
...
@@ -183,13 +221,13 @@ def _convert_to_place(device):
  def set_device(device):
      """
-     Paddle supports running calculations on various types of devices, including CPU, GPU, XPU and NPU.
+     Paddle supports running calculations on various types of devices, including CPU, GPU, XPU, NPU and IPU.
      They are represented by string identifiers. This function can specify the global device
      which the OP will run.

      Parameters:
          device(str): This parameter determines the specific running device.
-             It can be ``cpu``, ``gpu``, ``xpu``, ``npu``, ``gpu:x``, ``xpu:x`` and ``npu:x``,
+             It can be ``cpu``, ``gpu``, ``xpu``, ``npu``, ``gpu:x``, ``xpu:x``, ``npu:x`` and ``ipu``,
              where ``x`` is the index of the GPUs, XPUs or NPUs.

      Examples:
...
@@ -236,5 +274,10 @@ def get_device():
      elif isinstance(place, core.NPUPlace):
          device_id = place.get_device_id()
          device = 'npu:' + str(device_id)
+     elif isinstance(place, core.IPUPlace):
+         num_devices = core.get_ipu_device_count()
+         device = "ipus:{{0-{}}}".format(num_devices - 1)
+     else:
+         raise ValueError("The device specification {} is invalid".format(place))

      return device
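
Taken together, the IPU hooks above compose as in this small sketch of my own (not from the diff), which falls back to CPU when the wheel was not built with WITH_IPU=ON:

  import paddle

  if paddle.is_compiled_with_ipu():
      # routed through the new 'ipu' branch in _convert_to_place
      paddle.device.set_device('ipu')
  else:
      paddle.device.set_device('cpu')

  # On an IPU build, get_device() reports the whole device range,
  # e.g. "ipus:{0-3}" when core.get_ipu_device_count() returns 4.
  print(paddle.device.get_device())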
python/paddle/distributed/auto_parallel/operators/dist_matmul.py
@@ -296,6 +296,83 @@ class DistributedMatmulImpl0(DistributedOperatorImpl):
              return False
          return True

+     def is_auto_compatible(self, dist_op):
+         op_desc = dist_op.serial_op.desc
+         op_dist_attr = dist_op.dist_attr
+         x_name = op_desc.input('X')[0]
+         y_name = op_desc.input('Y')[0]
+         out_name = op_desc.output('Out')[0]
+         out_dims_mapping = op_dist_attr.get_output_dims_mapping(out_name)
+         x_dims_mapping = op_dist_attr.get_input_dims_mapping(x_name)
+         y_dims_mapping = op_dist_attr.get_input_dims_mapping(y_name)
+         assert len(x_dims_mapping) >= len(
+             y_dims_mapping), "now just support x dims > y dims"
+         if len(x_dims_mapping) == len(y_dims_mapping) and len(
+                 x_dims_mapping) == 4:
+             if x_dims_mapping[:2] != y_dims_mapping[:2]:
+                 return False
+             if x_dims_mapping[:2] != out_dims_mapping[:2]:
+                 return False
+             x_dims_mapping = x_dims_mapping[-2:]
+             y_dims_mapping = y_dims_mapping[-2:]
+             out_dims_mapping = out_dims_mapping[-2:]
+         elif len(x_dims_mapping) != len(y_dims_mapping) and len(
+                 x_dims_mapping) == 3:
+             if x_dims_mapping[0] != out_dims_mapping[0]:
+                 return False
+             x_dims_mapping = x_dims_mapping[-2:]
+             y_dims_mapping = y_dims_mapping[-2:]
+             out_dims_mapping = out_dims_mapping[-2:]
+         if is_dim_replicate(out_dims_mapping[-1]):
+             return False
+         for mapping in out_dims_mapping[1:-1]:
+             if is_dim_shard(mapping):
+                 return False
+         input_dims_mapping = []
+         ordered_input_shard_dims_mapping = []
+         for dim in (x_dims_mapping + y_dims_mapping):
+             input_dims_mapping.append(dim)
+         for item in input_dims_mapping:
+             if item not in ordered_input_shard_dims_mapping and item != -1:
+                 ordered_input_shard_dims_mapping.append(item)
+         for mapping in out_dims_mapping:
+             if mapping not in input_dims_mapping:
+                 return False
+         if is_dim_shard(x_dims_mapping[0]):
+             order_index = 0
+             for idx, item in enumerate(out_dims_mapping):
+                 if item != -1:
+                     if item != ordered_input_shard_dims_mapping[order_index]:
+                         return False
+                     else:
+                         order_index += 1
+             if order_index != len(ordered_input_shard_dims_mapping):
+                 return False
+         if is_dim_shard(x_dims_mapping[-1]):
+             return False
+         if is_dim_shard(y_dims_mapping[0]) or is_dim_replicate(
+                 y_dims_mapping[1]):
+             return False
+         for mapping in x_dims_mapping[1:-1]:
+             if is_dim_shard(mapping):
+                 return False
+         if is_dim_shard(x_dims_mapping[0]):
+             for mapping in y_dims_mapping[1:]:
+                 if is_dim_shard(mapping) and mapping == x_dims_mapping[0]:
+                     return False
+         return True
+
      def update_dims_mapping(self, dist_op):
          changed = False
          dim_changed = _update_dims_mapping_for_matmul(dist_op)
...
@@ -510,6 +587,95 @@ class DistributedMatmulImpl1(DistributedOperatorImpl):
              return False
          return True

+     def is_auto_compatible(self, dist_op):
+         op_desc = dist_op.serial_op.desc
+         op_dist_attr = dist_op.dist_attr
+         x_name = op_desc.input('X')[0]
+         y_name = op_desc.input('Y')[0]
+         x_dims_mapping = op_dist_attr.get_input_dims_mapping(x_name)
+         y_dims_mapping = op_dist_attr.get_input_dims_mapping(y_name)
+         if op_desc.attr('transpose_X') or op_desc.attr('transpose_Y'):
+             return False
+         out_name = op_desc.output('Out')[0]
+         out_dims_mapping = op_dist_attr.get_output_dims_mapping(out_name)
+         # for gpt2, x dims > y dims, this is a temporary solution
+         assert len(x_dims_mapping) >= len(
+             y_dims_mapping), "now just support x dims > y dims"
+         if len(x_dims_mapping) == len(y_dims_mapping) and len(
+                 x_dims_mapping) == 4:
+             if x_dims_mapping[:2] != y_dims_mapping[:2]:
+                 return False
+             if x_dims_mapping[:2] != out_dims_mapping[:2]:
+                 return False
+             x_dims_mapping = x_dims_mapping[-2:]
+             y_dims_mapping = y_dims_mapping[-2:]
+             out_dims_mapping = out_dims_mapping[-2:]
+         elif len(x_dims_mapping) != len(y_dims_mapping) and len(
+                 x_dims_mapping) == 3:
+             if x_dims_mapping[0] != out_dims_mapping[0]:
+                 return False
+             x_dims_mapping = x_dims_mapping[-2:]
+             y_dims_mapping = y_dims_mapping[-2:]
+             out_dims_mapping = out_dims_mapping[-2:]
+         if is_dim_shard(out_dims_mapping[-1]):
+             return False
+         # Other dimensions must be replicate except the batch dimension
+         for mapping in out_dims_mapping[1:-1]:
+             if is_dim_shard(mapping):
+                 return False
+         if is_dim_replicate(x_dims_mapping[-1]):
+             return False
+         if is_dim_replicate(y_dims_mapping[-2]) or is_dim_shard(
+                 y_dims_mapping[-1]):
+             return False
+         # Other dimensions must be replicate except the batch dimension
+         for mapping in x_dims_mapping[1:-1]:
+             if is_dim_shard(mapping):
+                 return False
+         x_shard_dim_count = 0
+         x_shard_dims = []
+         y_shard_dim_count = 0
+         y_shard_dims = []
+         for dim in x_dims_mapping:
+             if is_dim_shard(dim):
+                 x_shard_dim_count += 1
+                 x_shard_dims.append(dim)
+         for dim in y_dims_mapping:
+             if is_dim_shard(dim):
+                 y_shard_dim_count += 1
+                 y_shard_dims.append(dim)
+         if not x_shard_dims and not y_shard_dims:
+             return False
+         if x_shard_dims[-1] != y_shard_dims[0]:
+             return False
+         if x_shard_dim_count == y_shard_dim_count:
+             for dim in out_dims_mapping:
+                 if is_dim_shard(dim):
+                     return False
+             if x_shard_dims != y_shard_dims:
+                 return False
+         else:
+             if x_shard_dim_count < y_shard_dim_count:
+                 return False
+             output_shard_dims = []
+             for dim in out_dims_mapping:
+                 if is_dim_shard(dim):
+                     output_shard_dims.append(dim)
+             if not output_shard_dims or output_shard_dims[
+                     0] != x_shard_dims[0]:
+                 return False
+         return True
+
      def update_dims_mapping(self, dist_op):
          changed = False
          dim_changed = _update_dims_mapping_for_matmul(dist_op)
...
@@ -710,6 +876,59 @@ class DistributedMatmulImpl2(DistributedOperatorImpl):
          return True

+     def is_auto_compatible(self, dist_op):
+         op_desc = dist_op.serial_op.desc
+         op_dist_attr = dist_op.dist_attr
+         x_name = op_desc.input('X')[0]
+         y_name = op_desc.input('Y')[0]
+         out_name = op_desc.output('Out')[0]
+         out_dims_mapping = op_dist_attr.get_output_dims_mapping(out_name)
+         x_dims_mapping = op_dist_attr.get_input_dims_mapping(x_name)
+         y_dims_mapping = op_dist_attr.get_input_dims_mapping(y_name)
+         assert len(x_dims_mapping) >= len(
+             y_dims_mapping
+         ), "now just support x dims > y dims,but x:{0} and y:{1}".format(
+             x_dims_mapping, y_dims_mapping)
+         if len(x_dims_mapping) == len(y_dims_mapping) and len(
+                 x_dims_mapping) == 4:
+             if x_dims_mapping[:2] != y_dims_mapping[:2]:
+                 return False
+             if x_dims_mapping[:2] != out_dims_mapping[:2]:
+                 return False
+             x_dims_mapping = x_dims_mapping[-2:]
+             y_dims_mapping = y_dims_mapping[-2:]
+             out_dims_mapping = out_dims_mapping[-2:]
+         elif len(x_dims_mapping) != len(y_dims_mapping) and len(
+                 x_dims_mapping) == 3:
+             if x_dims_mapping[0] != out_dims_mapping[0]:
+                 return False
+             x_dims_mapping = x_dims_mapping[-2:]
+             y_dims_mapping = y_dims_mapping[-2:]
+             out_dims_mapping = out_dims_mapping[-2:]
+         if is_dim_shard(out_dims_mapping[-1]):
+             return False
+         if is_valid_list_index(out_dims_mapping,
+                                -2) and is_dim_shard(out_dims_mapping[-2]):
+             return False
+         if is_dim_shard(x_dims_mapping[-1]):
+             return False
+         if is_valid_list_index(x_dims_mapping,
+                                -2) and is_dim_shard(x_dims_mapping[-2]):
+             return False
+         if is_dim_shard(y_dims_mapping[-1]):
+             return False
+         if is_valid_list_index(y_dims_mapping,
+                                -2) and is_dim_shard(y_dims_mapping[-2]):
+             return False
+         return True
+
      def update_dims_mapping(self, dist_op):
          changed = False
          dim_changed = _update_dims_mapping_for_matmul(dist_op)
...
@@ -777,6 +996,86 @@ class DistributedMatmulV2Impl0(DistributedOperatorImpl):
              return False
          return True

+     def is_auto_compatible(self, dist_op):
+         op_desc = dist_op.serial_op.desc
+         op_dist_attr = dist_op.dist_attr
+         x_name = op_desc.input('X')[0]
+         y_name = op_desc.input('Y')[0]
+         out_name = op_desc.output('Out')[0]
+         out_dims_mapping = op_dist_attr.get_output_dims_mapping(out_name)
+         x_dims_mapping = op_dist_attr.get_input_dims_mapping(x_name)
+         y_dims_mapping = op_dist_attr.get_input_dims_mapping(y_name)
+         if op_desc.attr('trans_x') or op_desc.attr('trans_y'):
+             return False
+         assert len(x_dims_mapping) >= len(
+             y_dims_mapping), "now just support x dims > y dims"
+         if len(x_dims_mapping) == len(y_dims_mapping) and len(
+                 x_dims_mapping) == 4:
+             if x_dims_mapping[:2] != y_dims_mapping[:2]:
+                 return False
+             if x_dims_mapping[:2] != out_dims_mapping[:2]:
+                 return False
+             x_dims_mapping = x_dims_mapping[-2:]
+             y_dims_mapping = y_dims_mapping[-2:]
+             out_dims_mapping = out_dims_mapping[-2:]
+         elif len(x_dims_mapping) != len(y_dims_mapping) and len(
+                 x_dims_mapping) == 3:
+             if x_dims_mapping[0] != out_dims_mapping[0]:
+                 return False
+             x_dims_mapping = x_dims_mapping[-2:]
+             y_dims_mapping = y_dims_mapping[-2:]
+             out_dims_mapping = out_dims_mapping[-2:]
+         if is_dim_replicate(out_dims_mapping[-1]):
+             return False
+         for mapping in out_dims_mapping[1:-1]:
+             if is_dim_shard(mapping):
+                 return False
+         input_dims_mapping = []
+         ordered_input_shard_dims_mapping = []
+         for dim in (x_dims_mapping + y_dims_mapping):
+             input_dims_mapping.append(dim)
+         for item in input_dims_mapping:
+             if item not in ordered_input_shard_dims_mapping and item != -1:
+                 ordered_input_shard_dims_mapping.append(item)
+         for mapping in out_dims_mapping:
+             if mapping not in input_dims_mapping:
+                 return False
+         if is_dim_shard(x_dims_mapping[0]):
+             order_index = 0
+             for idx, item in enumerate(out_dims_mapping):
+                 if item != -1:
+                     if item != ordered_input_shard_dims_mapping[order_index]:
+                         return False
+                     else:
+                         order_index += 1
+             if order_index != len(ordered_input_shard_dims_mapping):
+                 return False
+         if is_dim_shard(x_dims_mapping[-1]):
+             return False
+         if is_dim_shard(y_dims_mapping[0]) or is_dim_replicate(
+                 y_dims_mapping[1]):
+             return False
+         for mapping in x_dims_mapping[1:-1]:
+             if is_dim_shard(mapping):
+                 return False
+         if is_dim_shard(x_dims_mapping[0]):
+             for mapping in y_dims_mapping[1:]:
+                 if is_dim_shard(mapping) and mapping == x_dims_mapping[0]:
+                     return False
+         return True
+
      def update_dims_mapping(self, dist_op):
          changed = False
          dim_changed = _update_dims_mapping_for_matmul(dist_op)
...
@@ -985,6 +1284,94 @@ class DistributedMatmulV2Impl1(DistributedOperatorImpl):
              return False
          return True

+     def is_auto_compatible(self, dist_op):
+         op_desc = dist_op.serial_op.desc
+         op_dist_attr = dist_op.dist_attr
+         x_name = op_desc.input('X')[0]
+         y_name = op_desc.input('Y')[0]
+         x_dims_mapping = op_dist_attr.get_input_dims_mapping(x_name)
+         y_dims_mapping = op_dist_attr.get_input_dims_mapping(y_name)
+         if op_desc.attr('trans_x') or op_desc.attr('trans_y'):
+             return False
+         out_name = op_desc.output('Out')[0]
+         out_dims_mapping = op_dist_attr.get_output_dims_mapping(out_name)
+         assert len(x_dims_mapping) >= len(
+             y_dims_mapping), "now just support x dims > y dims"
+         if len(x_dims_mapping) == len(y_dims_mapping) and len(
+                 x_dims_mapping) == 4:
+             if x_dims_mapping[:2] != y_dims_mapping[:2]:
+                 return False
+             if x_dims_mapping[:2] != out_dims_mapping[:2]:
+                 return False
+             x_dims_mapping = x_dims_mapping[-2:]
+             y_dims_mapping = y_dims_mapping[-2:]
+             out_dims_mapping = out_dims_mapping[-2:]
+         elif len(x_dims_mapping) != len(y_dims_mapping) and len(
+                 x_dims_mapping) == 3:
+             if x_dims_mapping[0] != out_dims_mapping[0]:
+                 return False
+             x_dims_mapping = x_dims_mapping[-2:]
+             y_dims_mapping = y_dims_mapping[-2:]
+             out_dims_mapping = out_dims_mapping[-2:]
+         if is_dim_shard(out_dims_mapping[-1]):
+             return False
+         # Other dimensions must be replicate except the batch dimension
+         for mapping in out_dims_mapping[1:-1]:
+             if is_dim_shard(mapping):
+                 return False
+         if is_dim_replicate(x_dims_mapping[-1]):
+             return False
+         if is_dim_replicate(y_dims_mapping[-2]) or is_dim_shard(
+                 y_dims_mapping[-1]):
+             return False
+         # Other dimensions must be replicate except the batch dimension
+         for mapping in x_dims_mapping[1:-1]:
+             if is_dim_shard(mapping):
+                 return False
+         x_shard_dim_count = 0
+         x_shard_dims = []
+         y_shard_dim_count = 0
+         y_shard_dims = []
+         for dim in x_dims_mapping:
+             if is_dim_shard(dim):
+                 x_shard_dim_count += 1
+                 x_shard_dims.append(dim)
+         for dim in y_dims_mapping:
+             if is_dim_shard(dim):
+                 y_shard_dim_count += 1
+                 y_shard_dims.append(dim)
+         if not x_shard_dims and not y_shard_dims:
+             return False
+         if x_shard_dims[-1] != y_shard_dims[0]:
+             return False
+         if x_shard_dim_count == y_shard_dim_count:
+             for dim in out_dims_mapping:
+                 if is_dim_shard(dim):
+                     return False
+             if x_shard_dims != y_shard_dims:
+                 return False
+         else:
+             if x_shard_dim_count < y_shard_dim_count:
+                 return False
+             output_shard_dims = []
+             for dim in out_dims_mapping:
+                 if is_dim_shard(dim):
+                     output_shard_dims.append(dim)
+             if not output_shard_dims or output_shard_dims[
+                     0] != x_shard_dims[0]:
+                 return False
+         return True
+
      def update_dims_mapping(self, dist_op):
          changed = False
          dim_changed = _update_dims_mapping_for_matmul(dist_op)
...
@@ -1183,6 +1570,61 @@ class DistributedMatmulV2Impl2(DistributedOperatorImpl):
          return True

+     def is_auto_compatible(self, dist_op):
+         op_desc = dist_op.serial_op.desc
+         op_dist_attr = dist_op.dist_attr
+         x_name = op_desc.input('X')[0]
+         y_name = op_desc.input('Y')[0]
+         out_name = op_desc.output('Out')[0]
+         out_dims_mapping = op_dist_attr.get_output_dims_mapping(out_name)
+         x_dims_mapping = op_dist_attr.get_input_dims_mapping(x_name)
+         y_dims_mapping = op_dist_attr.get_input_dims_mapping(y_name)
+         assert len(x_dims_mapping) >= len(
+             y_dims_mapping
+         ), "now just support x dims > y dims,but x:{0} and y:{1}".format(
+             x_dims_mapping, y_dims_mapping)
+         if len(x_dims_mapping) == len(y_dims_mapping) and len(
+                 x_dims_mapping) == 4:
+             if x_dims_mapping[:2] != y_dims_mapping[:2]:
+                 return False
+             if x_dims_mapping[:2] != out_dims_mapping[:2]:
+                 return False
+             x_dims_mapping = x_dims_mapping[-2:]
+             y_dims_mapping = y_dims_mapping[-2:]
+             out_dims_mapping = out_dims_mapping[-2:]
+         elif len(x_dims_mapping) != len(y_dims_mapping) and len(
+                 x_dims_mapping) == 3:
+             if x_dims_mapping[0] != out_dims_mapping[0]:
+                 return False
+             x_dims_mapping = x_dims_mapping[-2:]
+             y_dims_mapping = y_dims_mapping[-2:]
+             out_dims_mapping = out_dims_mapping[-2:]
+         if is_dim_shard(out_dims_mapping[-1]):
+             return False
+         if is_valid_list_index(out_dims_mapping,
+                                -2) and is_dim_shard(out_dims_mapping[-2]):
+             return False
+         if is_dim_shard(x_dims_mapping[-1]):
+             return False
+         if is_valid_list_index(x_dims_mapping,
+                                -2) and is_dim_shard(x_dims_mapping[-2]):
+             return False
+         if is_dim_shard(y_dims_mapping[-1]):
+             return False
+         if is_valid_list_index(y_dims_mapping,
+                                -2) and is_dim_shard(y_dims_mapping[-2]):
+             return False
+         return True
+
      def update_dims_mapping(self, dist_op):
          changed = False
          dim_changed = _update_dims_mapping_for_matmul(dist_op)
...
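
A standalone illustration of my own (not Paddle code) of the dims-mapping convention the `is_auto_compatible` checks above rely on: every tensor axis is mapped to a process-mesh dimension index, or to -1 when that axis is replicated. The helper names mirror the ones the diff assumes.

  def is_dim_shard(mapping):
      return mapping != -1

  def is_dim_replicate(mapping):
      return mapping == -1

  # Column-parallel matmul (the Impl0 pattern): x is replicated, y is sharded
  # along its column axis, so out inherits the column shard.
  x_dims_mapping = [-1, -1]    # x: [m, k], fully replicated
  y_dims_mapping = [-1, 0]     # y: [k, n], columns sharded on mesh dim 0
  out_dims_mapping = [-1, 0]   # out: [m, n], columns sharded on mesh dim 0

  assert not is_dim_shard(x_dims_mapping[-1])        # reduction axis of x not sharded
  assert is_dim_shard(y_dims_mapping[1])             # y columns sharded
  assert not is_dim_replicate(out_dims_mapping[-1])  # out's last axis sharded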
python/paddle/distributed/fleet/meta_optimizers/dygraph_optimizer/sharding_optimizer_stage2.py
浏览文件 @
883ee1a3
...
@@ -27,11 +27,13 @@ from collections import OrderedDict
...
@@ -27,11 +27,13 @@ from collections import OrderedDict
import
paddle
import
paddle
import
paddle.fluid
as
fluid
import
paddle.fluid
as
fluid
from
paddle
import
framework
from
paddle
import
framework
from
paddle.fluid
import
core
import
paddle.distributed
as
dist
import
paddle.distributed
as
dist
from
paddle.optimizer
import
Optimizer
from
paddle.optimizer
import
Optimizer
from
paddle.fluid.clip
import
ClipGradByGlobalNorm
from
...utils.internal_storage
import
ParamStorage
from
...utils.internal_storage
import
ParamStorage
from
...meta_parallel.sharding.sharding_utils
import
Type
from
...meta_parallel.sharding.sharding_utils
import
Type
,
device_guard
,
ShardingClipGrad
# CUDA alignment 256 bytes
# CUDA alignment 256 bytes
alignment
=
{
"gpu"
:
256
,
}
alignment
=
{
"gpu"
:
256
,
}
...
@@ -99,16 +101,41 @@ class ShardingOptimizerStage2(Optimizer):
...
@@ -99,16 +101,41 @@ class ShardingOptimizerStage2(Optimizer):
self
.
broadcast_fp16
=
broadcast_fp16
self
.
broadcast_fp16
=
broadcast_fp16
self
.
param_storages
=
{}
# {dtype: {rank: InternalStorage}}
self
.
param_storages
=
{}
# {dtype: {rank: InternalStorage}}
if
isinstance
(
self
.
_optim
.
_grad_clip
,
ClipGradByGlobalNorm
):
logging
.
warning
(
"While using ClipGradByGlobalNorm in ShardingOptimizer, the grad clip of original optimizer will be changed."
)
self
.
_optim
.
_grad_clip
=
ShardingClipGrad
(
self
.
_optim
.
_grad_clip
,
group
,
paddle
.
get_device
())
if
offload
:
assert
self
.
_pfp16
,
"Only support offload strategy while using
\'
Adam
\'
,
\'
AdamW
\'
and
\'
Momentum
\'
optimizer with AMP/Pure FP16"
self
.
offload
=
offload
# Using for offload
self
.
offload
=
offload
# Using for offload
self
.
offload_device
=
"cpu"
self
.
_master_params
=
{}
# Update optimizer parameters and adjust parameter storage and use according to rank.
# Update optimizer parameters and adjust parameter storage and use according to rank.
self
.
update_opt_status
()
self
.
update_opt_status
()
def
_generate_master_params
(
self
,
trainable_params
):
def
_generate_master_params
(
self
,
trainable_params
):
for
param
in
trainable_params
:
if
self
.
offload
:
if
-                if param.dtype == Type.fp16.value:
-                    self._optim._master_weights[param.name] = paddle.cast(
-                        param, Type.fp32.value)
+        if self.offload:
+            for param in trainable_params:
+                if param.name not in self._master_params.keys():
+                    self._master_params[param.name] = core.VarBase(
+                        name=param.name,
+                        value=param.cast(dtype=Type.fp32.value).numpy(),
+                        place=core.CPUPlace(),
+                        stop_gradient=param.stop_gradient)
+            self._optim._master_weights = self._master_params
+        else:
+            for param in trainable_params:
+                if param.dtype == Type.fp16.value:
+                    self._optim._master_weights[param.name] = paddle.cast(
+                        param, Type.fp32.value)

     def update_opt_status(self):
         """Update optimizer status and parameter storage information, and special functions to be developed.
         """
...
@@ -243,22 +270,43 @@ class ShardingOptimizerStage2(Optimizer):
         A wrapper for Optimizer's step function to finish the update operation of the optimizer.
         """
-        # Synchronize optimizer parameters for the current rank
-        if len(self.dtype_rank_params.keys(
-        )) == 1 and Type.fp32.value in self.dtype_rank_params.keys():
-            self._optim._parameter_list = self.dtype_rank_params[
-                Type.fp32.value][self.rank]
-        elif len(self.dtype_rank_params.keys(
-        )) == 1 and Type.fp16.value in self.dtype_rank_params.keys():
-            self._optim._parameter_list = self.dtype_rank_params[
-                Type.fp16.value][self.rank]
-        else:
-            self._optim._parameter_list = self.dtype_rank_params[
-                Type.fp16.value][self.rank] + self.dtype_rank_params[
-                    Type.fp32.value][self.rank]
+        if self.offload:
+            self._optim._parameter_list = [
+                param for name, param in self._master_params.items()
+            ]
+        else:
+            # Synchronize optimizer parameters for the current rank
+            if len(self.dtype_rank_params.keys(
+            )) == 1 and Type.fp32.value in self.dtype_rank_params.keys():
+                self._optim._parameter_list = self.dtype_rank_params[
+                    Type.fp32.value][self.rank]
+            elif len(self.dtype_rank_params.keys(
+            )) == 1 and Type.fp16.value in self.dtype_rank_params.keys():
+                self._optim._parameter_list = self.dtype_rank_params[
+                    Type.fp16.value][self.rank]
+            else:
+                self._optim._parameter_list = self.dtype_rank_params[
+                    Type.fp16.value][self.rank] + self.dtype_rank_params[
+                        Type.fp32.value][self.rank]

         # Run the optimizer of the current rank step
-        self._optim.step()
+        if self.offload:
+            with device_guard(self.rank, self.offload_device):
+                self._optim.step()
+
+            for param in self._optim._parameter_list:
+                self._master_params[param.name].set_value(param)
+
+            dev_id = 0 if paddle.get_device() == "cpu" else int(
+                paddle.get_device().split(":")[1])
+
+            for param in self._local_params:
+                if param.name in self._master_params.keys():
+                    param.set_value(self._master_params[param.name]
+                                    .cuda(dev_id).cast(dtype=param.dtype))
+                    self._master_params[param.name].clear_gradient(False)
+        else:
+            self._optim.step()

         # Synchronize all the updated shards in between the ranks
         self._broadcast_params()
...
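The step() above round-trips through CPU-resident fp32 master weights when offload is enabled. A minimal, framework-free sketch of that round trip (all names below are illustrative, not part of the Paddle API):

import numpy as np

class OffloadStep:
    def __init__(self, params):  # params: dict name -> fp16 ndarray ("GPU" side)
        self.params = params
        # fp32 master copies live on the "CPU" side
        self.master = {n: p.astype(np.float32) for n, p in params.items()}

    def step(self, grads, lr=0.1):
        for name, g in grads.items():
            # 1. update the fp32 master weight (done under device_guard(cpu) above)
            self.master[name] -= lr * g.astype(np.float32)
            # 2. copy back: cast the fp32 master into the fp16 device parameter
            self.params[name] = self.master[name].astype(np.float16)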
python/paddle/distributed/fleet/meta_parallel/sharding/sharding_stage2.py

...
@@ -112,6 +112,18 @@ class ShardingStage2(nn.Layer):
         self._has_grad_storage = []
         self._grad_storage_list = []

+        # offload
+        # TODO(haohongxiang): Now it's not supported for multi-optimizers using Offload strategy
+        self._offload_optims = list(
+            filter(lambda optim: optim.offload, self._sharding_optimizers))
+        if len(self._offload_optims) > 0:
+            assert len(
+                self._sharding_optimizers
+            ) == 1, "Only support offload strategy for single optimizer"
+
+        self._offload = self._sharding_optimizers[0].offload
+        self._offload_device = "cpu"
+
         # Set backward pass hooks
         self._bw_hooks = []
...
@@ -156,7 +168,8 @@ class ShardingStage2(nn.Layer):
         # Release grad storages
         for dtype in self._grad_storages.keys():
             if self._rank in self._grad_storages[dtype].keys():
-                self._grad_storages[dtype][self._rank].buffer.zero_()
+                if not self._offload:
+                    self._grad_storages[dtype][self._rank].buffer.zero_()

         # Release params
         for param in self._trainable_params:
...
@@ -167,17 +180,24 @@ class ShardingStage2(nn.Layer):
         """
         Before the gradient accumulation, scale the gradient.
         """
-        # Scale grad storages
-        for dtype in self._grad_storages.keys():
-            if self._rank in self._grad_storages[dtype].keys():
-                self._grad_storages[dtype][self._rank].buffer.scale_(
-                    scale=self._world_size_scaling)
-
-        # Scale params
-        for param in self._trainable_params:
-            if param.name in self._param_grads and param.grad is not None:
-                param.grad.scale_(scale=self._world_size_scaling)
-                param._reset_grad_inplace_version(True)
+        if self._offload:
+            for param in self._trainable_params:
+                if param.name in self._sharding_optimizers[
+                        0]._master_params.keys():
+                    self._sharding_optimizers[0]._master_params[
+                        param.name].grad.scale_(
+                            scale=self._world_size_scaling)
+        else:
+            # Scale grad storages
+            for dtype in self._grad_storages.keys():
+                if self._rank in self._grad_storages[dtype].keys():
+                    self._grad_storages[dtype][self._rank].buffer.scale_(
+                        scale=self._world_size_scaling)
+
+            # Scale params
+            for param in self._trainable_params:
+                if param.name in self._param_grads and param.grad is not None:
+                    param.grad.scale_(scale=self._world_size_scaling)
+                    param._reset_grad_inplace_version(True)

     def _init_internal_storage(self, needs_fresh):
         """
...
@@ -195,8 +215,14 @@ class ShardingStage2(nn.Layer):
         """
         Synchronously or asynchronously convert the data type of the layer, the device is not supported now.
         """
+        assert isinstance(device, str), "Device must be type str"
         assert device == self._default_device, "New devices are not supported, because of the optimizer state is not sync"

+        self._layer.to(device=device, dtype=dtype, blocking=blocking)
+
+        # Re-build the buckets, hooks, etc..
+        self._fresh_trainable()
+
     def _fresh_trainable(self):
         """ Whether to update training parameters. """
...
@@ -283,12 +309,17 @@ class ShardingStage2(nn.Layer):
         self._grad_reduced[index] = False
         if not self._accumulate_grads:
             param.grad.scale_(scale=self._world_size_scaling)
             param._reset_grad_inplace_version(True)

         # Clear the gradient that does not belong to the current rank through the callback function
         def cleanup():
             if dst_rank != self._rank:
                 param.clear_gradient(False)
+            elif self._offload:
+                self._sharding_optimizers[0]._master_params[
+                    param.name]._copy_gradient_from(
+                        param.grad.cpu().cast(dtype=Type.fp32.value))
+                param.clear_gradient(False)

         # Synchronize the reduce parameter gradient
         self._tasks_flow.append(
...
@@ -339,6 +370,15 @@ class ShardingStage2(nn.Layer):
                 grad_storage.buffer.value().get_tensor()._clear()
+            elif self._offload:
+                grad_storage.to(device=self._offload_device)
+                for param in grad_storage._params:
+                    self._sharding_optimizers[0]._master_params[
+                        param.name]._copy_gradient_from(
+                            param.grad.cast(dtype=Type.fp32.value))
+                grad_storage.buffer.value().get_tensor()._clear()

             # Reduce the bucket
             grad_storage.sent = True
...
@@ -478,7 +518,7 @@ class ShardingStage2(nn.Layer):
         # Rebuild fp16/fp32 grad storages
         for dtype in self._grad_storages.keys():
             for dst_rank, grad_storage in self._grad_storages[dtype].items():
-                if dst_rank != self._rank:
+                if self._offload or dst_rank != self._rank:
                     grad_storage.manumal_relase()
                     grad_storage.rebuild()
...
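A hedged sketch of the reduce-then-cleanup pattern the backward hooks above implement: each rank reduces a parameter's gradient onto the rank that owns that shard, and non-owners immediately free their copy. The function name and arguments here are illustrative; `group`, `rank`, and `dst_rank` are assumed to come from the surrounding sharding setup.

import paddle.distributed as dist

def reduce_and_cleanup(param, dst_rank, rank, group):
    # sum the gradient onto the rank that owns this parameter shard
    dist.reduce(param.grad, dst=dst_rank, group=group)
    if dst_rank != rank:
        # non-owners drop the gradient buffer, as cleanup() does above
        param.clear_gradient(False)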
python/paddle/distributed/fleet/meta_parallel/sharding/sharding_utils.py

...
@@ -17,10 +17,17 @@ import contextlib
 from collections import abc
 from enum import Enum
 from math import inf
+import numpy as np
+from types import MethodType

 import paddle
 import paddle.distributed as dist
+from paddle import _C_ops
 from paddle.fluid import core
+from paddle.fluid import layers
+from paddle.fluid.dygraph import to_variable
+from paddle.fluid.framework import dygraph_only
+from paddle.fluid.dygraph import base as imperative_base


 class Taskflow:
...
@@ -41,6 +48,88 @@ class Type(Enum):
     fp32 = paddle.float32


+class ShardingClipGrad:
+    def __init__(self, clip, group, device):
+        self._clip = clip
+        self._group = group
+        self._device = device
+
+    @imperative_base.no_grad
+    def _dygraph_clip(self, params_grads):
+        params_and_grads = []
+        sum_square_fp16 = []
+        sum_square_fp32 = []
+
+        for p, g in params_grads:
+            if g is None or getattr(p, 'need_clip', True) is False:
+                continue
+
+            merge_grad = g
+            if g.type == core.VarDesc.VarType.SELECTED_ROWS:
+                merge_grad = layers.get_tensor_from_selected_rows(
+                    layers.merge_selected_rows(g))
+            square = layers.square(merge_grad)
+            sum_square = layers.reduce_sum(square)
+
+            if p.dtype == paddle.float16:
+                sum_square_fp16.append(sum_square)
+            elif p.dtype == paddle.float32:
+                sum_square_fp32.append(sum_square)
+
+        # global norm of non-distributed FP16 params_and_grads
+        if len(sum_square_fp16) == 0:
+            global_norm_fp16 = paddle.to_tensor([0.], dtype=paddle.float32)
+        else:
+            global_norm_fp16 = layers.concat(sum_square_fp16)
+            global_norm_fp16 = layers.reduce_sum(global_norm_fp16)
+            global_norm_fp16 = paddle.cast(
+                global_norm_fp16, dtype=paddle.float32)
+
+        # global norm of non-distributed FP32 params_and_grads
+        global_norm_fp32 = layers.concat(sum_square_fp32) if len(
+            sum_square_fp32) != 0 else paddle.to_tensor(
+                [0.], dtype=paddle.float32)
+        global_norm_fp32 = layers.reduce_sum(global_norm_fp32)
+
+        global_norm_var = global_norm_fp16 + global_norm_fp32
+
+        # add all reduce to get global norm of distributed params_and_grads
+        dev_id = int(self._device.split(":")[1])
+        with device_guard(dev_id, "gpu"):
+            paddle.distributed.all_reduce(global_norm_var, group=self._group)
+
+        global_norm_var = layers.sqrt(global_norm_var)
+        max_global_norm = layers.fill_constant(
+            shape=[1], dtype=global_norm_var.dtype, value=self.clip_norm)
+        clip_var = layers.elementwise_div(
+            x=max_global_norm,
+            y=layers.elementwise_max(
+                x=global_norm_var, y=max_global_norm))
+        clip_var_fp16 = paddle.cast(clip_var, paddle.float16)
+
+        for p, g in params_grads:
+            if g is None:
+                continue
+            if getattr(p, 'need_clip', True) is False:
+                params_and_grads.append((p, g))
+                continue
+            if p.dtype == paddle.float16:
+                new_grad = layers.elementwise_mul(x=g, y=clip_var_fp16)
+            else:
+                new_grad = layers.elementwise_mul(x=g, y=clip_var)
+            params_and_grads.append((p, new_grad))
+
+        return params_and_grads
+
+    def __getattr__(self, item):
+        return getattr(self._clip, item)
+
+    def __call__(self, params_grads):
+        return self._dygraph_clip(params_grads)
+
+
 @contextlib.contextmanager
 def device_guard(dev_id, device="cpu"):
     origin_device = paddle.device.get_device()
...
@@ -52,3 +141,65 @@ def device_guard(dev_id, device="cpu"):
         yield
     finally:
         paddle.set_device(origin_device)
+
+
+@dygraph_only
+def ShardingScaler(scaler, sharding_group):
+    def unscale_method(self, optimizer):
+        if not self._enable:
+            return
+        param_grads = []
+        param_grads_fp16 = []
+        param_grads_fp32 = []
+        if getattr(optimizer, '_param_groups', None) and isinstance(
+                optimizer._param_groups[0], dict):
+            for group in optimizer._param_groups:
+                for param in group['params']:
+                    if param._grad_ivar() is not None:
+                        param_grads.append(param._grad_ivar())
+                        if param._grad_ivar(
+                        ).dtype == core.VarDesc.VarType.FP16:
+                            param_grads_fp16.append(param._grad_ivar())
+                        else:
+                            param_grads_fp32.append(param._grad_ivar())
+        else:
+            param_grads = [
+                param._grad_ivar() for param in optimizer._parameter_list
+                if param._grad_ivar() is not None
+            ]
+            param_grads_fp16 = [
+                param._grad_ivar() for param in optimizer._parameter_list
+                if (param._grad_ivar() is not None) and (
+                    param._grad_ivar().dtype == core.VarDesc.VarType.FP16)
+            ]
+            param_grads_fp32 = [
+                param._grad_ivar() for param in optimizer._parameter_list
+                if (param._grad_ivar() is not None) and (
+                    param._grad_ivar().dtype == core.VarDesc.VarType.FP32)
+            ]
+        temp_found_inf_fp16 = to_variable(np.array([0]).astype(np.bool))
+        temp_found_inf_fp32 = to_variable(np.array([0]).astype(np.bool))
+        if len(param_grads_fp16):
+            _C_ops.check_finite_and_unscale(param_grads_fp16, self._scale,
+                                            param_grads_fp16,
+                                            temp_found_inf_fp16)
+        if len(param_grads_fp32):
+            _C_ops.check_finite_and_unscale(param_grads_fp32, self._scale,
+                                            param_grads_fp32,
+                                            temp_found_inf_fp32)
+        self._found_inf = 1 if temp_found_inf_fp16 or temp_found_inf_fp32 else 0
+
+        is_found_inf = paddle.to_tensor([self._found_inf], dtype="int32")
+        paddle.distributed.all_reduce(
+            is_found_inf,
+            op=paddle.distributed.ReduceOp.MAX,
+            group=sharding_group)
+        self._found_inf = is_found_inf.numpy()[0]

+    scaler._unscale = MethodType(unscale_method, scaler)
+    return scaler
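In plain terms, ShardingClipGrad above is standard global-norm clipping with the norm accumulated over both precisions and all ranks: every gradient is rescaled by clip = clip_norm / max(g_norm, clip_norm), where g_norm = sqrt(sum over all fp16 and fp32 grads of sum(g**2)), and the per-rank squared sums are all-reduced over self._group before the square root, so each rank applies an identical clip factor (cast to float16 for fp16 gradients). ShardingScaler follows the same idea for AMP: each rank unscales and checks its own gradients, then the found-inf flag is all-reduced with ReduceOp.MAX so that every rank agrees on whether to skip the step.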
python/paddle/distributed/fleet/utils/internal_storage.py

...
@@ -50,6 +50,29 @@ class InternalStorage:
         else:
             self.buffer = paddle.zeros(size, dtype=dtype)

+    def to(self, device, dtype=None, keep_alignment=True):
+        """
+        Move the underlying buffer
+        """
+        assert self.buffer is not None, "Cannot move a collapsed bucket, please rebuild it"
+        assert (dtype == Type.fp32.value or
+                dtype == Type.fp16.value), "Conversion type is not supported now"
+
+        dev_id = 0 if paddle.get_device() == "cpu" else int(paddle.get_device()
+                                                            .split(":")[1])
+
+        if self._device != device:
+            tmp_buffer = self.buffer.cuda(
+                dev_id) if device == "gpu" else self.buffer.cpu()
+            for param in self._params:
+                param.clear_gradient(False)
+                param._gradient_set_empty(False)
+            self.buffer.value().get_tensor()._clear()
+            self.buffer = tmp_buffer
+
+        if dtype is not None:
+            self.buffer = self.buffer.cast(dtype=dtype)
+

 class ParamStorage(InternalStorage):
     """
...
@@ -60,6 +83,16 @@ class ParamStorage(InternalStorage):
         super().__init__(size, dtype, device, convert_cpu=True)
         self.param2align = None

+    def to(self, device, dtype=None, keep_alignment=True):
+        """
+        Move the underlying buffer
+        """
+        super().to(device, dtype)
+
+        if keep_alignment:
+            self._array_params()
+
     @fluid.dygraph.no_grad
     def add_rank_params(self, trainable_params, param2align):
         """
...
@@ -78,7 +111,7 @@ class ParamStorage(InternalStorage):
             p_shape = self._add_param_as_view(param, param2align[param.name])
             cpu_param_shape.append(p_shape)

-        # buffer covert from cpu to cuda
+        # buffer convert from cpu to cuda
         dev_id = int(paddle.get_device().split(":")[1])
         self.buffer = self.buffer.cuda(dev_id)
         self._fill = 0
...
@@ -109,7 +142,8 @@ class ParamStorage(InternalStorage):
         param.stop_gradient = origin_state

         # Copy the current param value
-        dev_id = int(paddle.get_device().split(":")[1])
+        dev_id = 0 if paddle.get_device() == "cpu" else int(paddle.get_device()
+                                                            .split(":")[1])
         with device_guard(dev_id, "cpu"):
             tmp_var = core.VarBase(tensor=self.buffer._slice(self._fill,
                                                              var_end))
...
@@ -134,6 +168,18 @@ class ParamStorage(InternalStorage):
         self._fill = offset

+    @fluid.dygraph.no_grad
+    def _array_params(self):
+        """
+        Given the parameters which have been registered previously, rebuild the whole InternalStorage.
+        """
+        assert len(self._params) > 0
+        assert self.param2align is not None
+
+        self._fill = 0
+        for p in self._params:
+            self._convert_buffer(p, p.shape, self.param2align[p.name])  # modify
+

 class GradStorage(InternalStorage):
     """
...
@@ -171,6 +217,18 @@ class GradStorage(InternalStorage):
             param.shape) + align <= self._max_size and id(
                 param) not in self._param_ids

+    def to(self, device, dtype=None, keep_alignment=True):
+        """
+        Move the underlying buffer
+        """
+        if self._release:
+            self.rebuild()
+
+        super().to(device, dtype)
+
+        if keep_alignment:
+            self._array_grads()
+
     @fluid.dygraph.no_grad
     def add_grad(self, param, align):
         """
...
@@ -206,17 +264,25 @@ class GradStorage(InternalStorage):
         """
         Given the parameter gradients which have been registered previously, rebuild the whole InternalStorage.
         """
+        assert len(self._params) > 0
         if self._release:
-            self.buffer = paddle.zeros(
-                [self._max_size], dtype=self._dtype)
+            self.buffer = paddle.zeros(
+                [self._max_size], dtype=self._params[0].dtype)

             for p in self._params:
                 self._add_grad_as_view(p, self._parm2align[p.name])

             self._release = False

+    @fluid.dygraph.no_grad
+    def _array_grads(self):
+        """
+        Given the parameters gradients which have been registered previously, rebuild the whole InternalStorage.
+        """
+        if len(self._params) > 0:
+            self._fill = 0
+            for p in self._params:
+                self._add_grad_as_view(p, self._parm2align[p.name])
+
     @fluid.dygraph.no_grad
     def _add_grad_as_view(self, param, align):
         assert np.prod(
...
@@ -229,8 +295,17 @@ class GradStorage(InternalStorage):
         assert offset <= np.prod(self.buffer.shape)

         # Copy the current grad value to InternalStorage
-        assert self._device == "gpu"
-        tmp_var = core.VarBase(self.buffer._slice(self._fill, grad_end))
-        param._copy_gradient_from(tmp_var)
-        tmp_var.value().get_tensor()._clear()
+        dev_id = 0 if paddle.get_device() == "cpu" else int(paddle.get_device()
+                                                            .split(":")[1])
+        if self._device == "cpu":
+            with device_guard(dev_id, self._device):
+                tmp_var = core.VarBase(self.buffer._slice(self._fill,
+                                                          grad_end))
+                param._copy_gradient_from(tmp_var)
+                tmp_var.value().get_tensor()._clear()
+
+        elif self._device == "gpu":
+            tmp_var = core.VarBase(self.buffer._slice(self._fill, grad_end))
+            param._copy_gradient_from(tmp_var)
+            tmp_var.value().get_tensor()._clear()
+
         self._fill = offset
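ParamStorage and GradStorage both rely on packing many tensors into one flat buffer and handing out per-tensor views, so a single copy or collective moves all of them at once. A simplified numpy sketch of the idea (alignment handling elided; names illustrative):

import numpy as np

def pack(params):
    total = sum(p.size for p in params)
    buffer = np.zeros(total, dtype=np.float32)
    views, fill = [], 0
    for p in params:
        end = fill + p.size
        buffer[fill:end] = p.ravel()                      # copy the value in
        views.append(buffer[fill:end].reshape(p.shape))   # keep a view, not a copy
        fill = end
    return buffer, views                                   # mutating a view mutates the buffer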
python/paddle/distribution.py

...
@@ -305,7 +305,8 @@ class Uniform(Distribution):
         else:
             output_shape = shape + batch_shape
-            output = nn.uniform_random(
-                output_shape, seed=seed, dtype=self.dtype) * (tensor.zeros(
-                    output_shape, dtype=self.dtype) + (self.high - self.low))
+            output = nn.uniform_random(
+                output_shape, dtype=self.dtype, min=0., max=1.,
+                seed=seed) * (tensor.zeros(
+                    output_shape, dtype=self.dtype) + (self.high - self.low))
             output = elementwise_add(output, self.low, name=name)
             if self.all_arg_is_float:
...
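For context on the hunk above: the sample is the usual affine construction X = low + U * (high - low) with U drawn from [0, 1). Passing min=0. and max=1. explicitly matters because the uniform_random op's defaults are min=-1.0 and max=1.0, so relying on the defaults would let samples fall outside [low, high).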
python/paddle/fluid/__init__.py

...
@@ -71,7 +71,7 @@ from . import distribute_lookup_table
 from .param_attr import ParamAttr, WeightNormParamAttr
 from .data_feeder import DataFeeder
 from .core import LoDTensor, LoDTensorArray, Scope, _Scope
-from .core import CPUPlace, XPUPlace, CUDAPlace, CUDAPinnedPlace, NPUPlace
+from .core import CPUPlace, XPUPlace, CUDAPlace, CUDAPinnedPlace, NPUPlace, IPUPlace
 from .incubate import fleet
 from .transpiler import DistributeTranspiler, \
     memory_optimize, release_memory, DistributeTranspilerConfig
...
@@ -132,6 +132,7 @@ __all__ = framework.__all__ + executor.__all__ + \
     'CUDAPlace',
     'CUDAPinnedPlace',
     'NPUPlace',
+    'IPUPlace',
     'Tensor',
     'ParamAttr',
     'WeightNormParamAttr',
...
@@ -197,6 +198,11 @@ def __bootstrap__():
     if os.name == 'nt':
         remove_flag_if_exists('cpu_deterministic')

+    if core.is_compiled_with_ipu():
+        # Currently we request all ipu available for training and testing
+        #   finer control of pod of IPUs will be added later
+        read_env_flags += []
+
     core.init_gflags(["--tryfromenv=" + ",".join(read_env_flags)])
     # Note(zhouwei25): sys may not have argv in some cases,
     # Such as: use Python/C API to call Python from C++
...
python/paddle/fluid/contrib/slim/quantization/imperative/qat.py

...
@@ -484,7 +484,7 @@ class ImperativeQuantizeOutputs(object):
                 model_filename=model_filename,
                 params_filename=params_filename))

-        self._gather_scales(infer_program, scope)
+        self._gather_scales(infer_program, scope, fetch_targets)

         self._set_skip_quant_attr(infer_program)
...
@@ -520,10 +520,10 @@ class ImperativeQuantizeOutputs(object):
         return flag

-    def _gather_scales(self, program, scope):
+    def _gather_scales(self, program, scope, fetch_targets):
         """
         Get all scales from fake ops, save them into the corresponding ops
         and delete all moving_average_abs_max_scale ops.
         """

         def _gather_input_scale():
...
@@ -580,6 +580,11 @@ class ImperativeQuantizeOutputs(object):
                 for next_op in next_ops:
                     next_op._rename_input(out_var_name, in_var_name)

+                # If next_op is `fetch` and out_var_name in fetch_targets,
+                # fetch_targets must update to in_var_name when rename input.
+                for i in range(len(fetch_targets)):
+                    if fetch_targets[i].name == out_var_name:
+                        fetch_targets[i] = block.var(in_var_name)
+
         _gather_input_scale()
         _gather_output_scale()
...
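A hedged usage sketch of why the remap matters (the model directory and batch variable here are illustrative): the fetch_targets returned by load_inference_model are passed straight to Executor.run, so after the moving_average_abs_max_scale ops are deleted they must reference variables that still exist in the rewritten graph.

[infer_program, feed_names, fetch_targets] = fluid.io.load_inference_model(
    model_dir, executor=exe)
# fetch_targets entries were remapped above, so this lookup still succeeds
results = exe.run(infer_program,
                  feed={feed_names[0]: batch},
                  fetch_list=fetch_targets)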
python/paddle/fluid/contrib/slim/quantization/post_training_quantization.py

...
@@ -410,6 +410,23 @@ class PostTrainingQuantization(object):
                 for op_type in self._dynamic_quantize_op_type):
             self._collect_dynamic_quantize_op_threshold(
                 self._dynamic_quantize_op_type)

+        # Move sub blocks persistable var to global block
+        global_block = self._program.global_block()
+        for _op in global_block.ops:
+            if _op.type == "while":
+                _block_id = _op.attr("sub_block").id
+                _block = self._program.block(_block_id)
+                persistables = []
+                for _name, _var in _block.vars.items():
+                    if _var.persistable:
+                        global_block._clone_variable(_var)
+                        persistables.append(_name)
+                for _name in persistables:
+                    _block._remove_var(_name)
+                persistables.extend(_op.input('X'))
+                _op.desc.set_input("X", persistables)
+
         return self._program

     def save_quantized_model(self,
...
@@ -451,10 +468,6 @@ class PostTrainingQuantization(object):
             model_filename=self._model_filename,
             params_filename=self._params_filename)

-        if self._program.num_blocks > 1:
-            _logger.error("The post training quantization requires that the "
-                          "program only has one block.")
-
         if self._optimize_model:
             self._optimize_fp32_model()
...
@@ -505,23 +518,26 @@ class PostTrainingQuantization(object):
             self._quantized_act_var_name.add(var_name)

         persistable_var_names = _all_persistable_var_names(self._program)
-        for op in self._program.global_block().ops:
-            op_type = op.type
-            if self._is_full_quantize and \
-                op_type not in self._quantizable_op_type:
-                _logger.warning(op_type + " is not supported for quantization.")
-            # For quantized ops, sample inputs and outputs
-            if op_type in self._quantizable_op_type:
-                collect_var_name(
-                    _get_op_input_var_names(op), persistable_var_names, op_type)
-                collect_var_name(
-                    _get_op_output_var_names(op), persistable_var_names,
-                    op_type)
-            # For other op, only sample output scale
-            elif op_type in self._out_scale_op_list:
-                collect_var_name(
-                    _get_op_output_var_names(op), persistable_var_names,
-                    op_type)
+        for block_id in range(len(self._program.blocks)):
+            for op in self._program.blocks[block_id].ops:
+                op_type = op.type
+                if self._is_full_quantize and \
+                    op_type not in self._quantizable_op_type:
+                    _logger.warning(op_type +
+                                    " is not supported for quantization.")
+                # For quantized ops, sample inputs and outputs
+                if op_type in self._quantizable_op_type:
+                    collect_var_name(
+                        _get_op_input_var_names(op), persistable_var_names,
+                        op_type)
+                    collect_var_name(
+                        _get_op_output_var_names(op), persistable_var_names,
+                        op_type)
+                # For other op, only sample output scale
+                elif op_type in self._out_scale_op_list:
+                    collect_var_name(
+                        _get_op_output_var_names(op), persistable_var_names,
+                        op_type)

     def _set_activation_persistable(self):
         '''
...
@@ -696,16 +712,17 @@ class PostTrainingQuantization(object):
         '''
         assert self._algo == "min_max", \
             "The algo should be min_max to save input threshold."
-        for op in self._program.global_block().ops:
-            if op.type in self._quantizable_op_type:
-                for var_name in _get_op_input_var_names(op):
-                    assert var_name in self._quantized_var_min
-                    assert var_name in self._quantized_var_max
-                    op._set_attr(var_name + ".min",
-                                 self._quantized_var_min[var_name])
-                    op._set_attr(var_name + ".max",
-                                 self._quantized_var_max[var_name])
-                    op._set_attr("with_quant_attr", True)
+        for block_id in range(len(self._program.blocks)):
+            for op in self._program.blocks[block_id].ops:
+                if op.type in self._quantizable_op_type:
+                    for var_name in _get_op_input_var_names(op):
+                        assert var_name in self._quantized_var_min
+                        assert var_name in self._quantized_var_max
+                        op._set_attr(var_name + ".min",
+                                     self._quantized_var_min[var_name])
+                        op._set_attr(var_name + ".max",
+                                     self._quantized_var_max[var_name])
+                        op._set_attr("with_quant_attr", True)

     def _collect_activation_abs_min_max(self):
         '''
...
@@ -795,7 +812,12 @@ class PostTrainingQuantization(object):
             activation_quantize_type=self._activation_quantize_type,
             weight_quantize_type=self._weight_quantize_type,
             quantizable_op_type=major_quantizable_op_types)
-        transform_pass.apply(graph)
+
+        for sub_graph in graph.all_sub_graphs():
+            # Insert fake_quant/fake_dequantize op must in test graph, so
+            # set per graph's _for_test is True.
+            sub_graph._for_test = True
+            transform_pass.apply(sub_graph)

         # use AddQuantDequantPass to insert fake_quant_dequant op
         minor_quantizable_op_types = []
...
@@ -806,7 +828,10 @@ class PostTrainingQuantization(object):
             scope=self._scope,
             place=self._place,
             quantizable_op_type=minor_quantizable_op_types)
-        add_quant_dequant_pass.apply(graph)
+
+        for sub_graph in graph.all_sub_graphs():
+            sub_graph._for_test = True
+            add_quant_dequant_pass.apply(sub_graph)

         # save threshold to scale var node
         if self._algo in ["KL", "hist"]:
...
@@ -836,7 +861,11 @@ class PostTrainingQuantization(object):
             activation_bits=self._activation_bits,
             weight_quantize_type=self._weight_quantize_type,
             quantizable_op_type=major_quantizable_op_types)
-        freeze_pass.apply(graph)
+
+        for sub_graph in graph.all_sub_graphs():
+            sub_graph._for_test = True
+            freeze_pass.apply(sub_graph)
+
         self._program = graph.to_program()

     def _save_output_threshold(self):
...
@@ -888,13 +917,15 @@ class PostTrainingQuantization(object):
             save_info(op_node, out_var_name, self._quantized_var_max,
                       "out_max", "post_min_max")

-        for op in self._program.global_block().ops:
-            if op.type in (self._quantizable_op_type + self._out_scale_op_list):
-                out_var_names = _get_op_output_var_names(op)
-                assert len(out_var_names) == 1, "Post training " + \
-                    "quantization only support one output for " + op.type
-                for var_name in out_var_names:
-                    analysis_and_save_info(op, var_name)
+        for block_id in range(len(self._program.blocks)):
+            for op in self._program.blocks[block_id].ops:
+                if op.type in (
+                        self._quantizable_op_type + self._out_scale_op_list):
+                    out_var_names = _get_op_output_var_names(op)
+                    assert len(out_var_names) == 1, "Post training " + \
+                        "quantization only support one output for " + op.type
+                    for var_name in out_var_names:
+                        analysis_and_save_info(op, var_name)

     def _collect_dynamic_quantize_op_threshold(self, target_ops_type):
         """
...
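The recurring pattern in these hunks is walking every block of a Program instead of only block 0, so that ops inside a `while` sub-block are also sampled and quantized. A minimal sketch on a hypothetical static program `prog`:

import paddle
paddle.enable_static()

prog = paddle.static.default_main_program()   # stand-in for self._program
quantizable_op_type = ["conv2d", "depthwise_conv2d", "mul"]

quant_ops = []
for block_id in range(len(prog.blocks)):       # includes `while` sub-blocks
    for op in prog.blocks[block_id].ops:
        if op.type in quantizable_op_type:
            quant_ops.append(op)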
python/paddle/fluid/contrib/slim/tests/CMakeLists.txt

...
@@ -139,6 +139,7 @@ endfunction()
 if(WIN32)
     list(REMOVE_ITEM TEST_OPS test_light_nas)
     list(REMOVE_ITEM TEST_OPS test_post_training_quantization_mnist)
+    list(REMOVE_ITEM TEST_OPS test_post_training_quantization_while)
     list(REMOVE_ITEM TEST_OPS test_post_training_quantization_mobilenetv1)
     list(REMOVE_ITEM TEST_OPS test_post_training_quantization_resnet50)
     list(REMOVE_ITEM TEST_OPS test_post_training_quantization_lstm_model)
...
@@ -336,6 +337,7 @@ if(NOT WIN32)
     set_tests_properties(test_post_training_quantization_mobilenetv1 PROPERTIES TIMEOUT 600 LABELS "RUN_TYPE=NIGHTLY")
     set_tests_properties(test_post_training_quantization_resnet50 PROPERTIES TIMEOUT 600 LABELS "RUN_TYPE=NIGHTLY")
     set_tests_properties(test_post_training_quantization_mnist PROPERTIES TIMEOUT 120)
+    set_tests_properties(test_post_training_quantization_while PROPERTIES TIMEOUT 120)
     set_tests_properties(test_imperative_ptq PROPERTIES TIMEOUT 120)
     set_tests_properties(test_weight_quantization_mobilenetv1 PROPERTIES TIMEOUT 120)
 endif()
...
python/paddle/fluid/contrib/slim/tests/test_post_training_quantization_while.py
0 → 100644

# copyright (c) 2021 paddlepaddle authors. all rights reserved.
#
# licensed under the apache license, version 2.0 (the "license");
# you may not use this file except in compliance with the license.
# you may obtain a copy of the license at
#
#     http://www.apache.org/licenses/license-2.0
#
# unless required by applicable law or agreed to in writing, software
# distributed under the license is distributed on an "as is" basis,
# without warranties or conditions of any kind, either express or implied.
# see the license for the specific language governing permissions and
# limitations under the license.
import unittest
import os
import time
import sys
import random
import math
import functools
import contextlib
import numpy as np
import paddle
import paddle.fluid as fluid
from paddle.dataset.common import download
from paddle.fluid.contrib.slim.quantization import PostTrainingQuantization

paddle.enable_static()

random.seed(0)
np.random.seed(0)


class TestPostTrainingQuantization(unittest.TestCase):
    def setUp(self):
        self.download_path = 'int8/download'
        self.cache_folder = os.path.expanduser('~/.cache/paddle/dataset/' +
                                               self.download_path)
        self.timestamp = time.strftime('%Y-%m-%d-%H-%M-%S', time.localtime())
        self.int8_model_path = os.path.join(os.getcwd(),
                                            "post_training_" + self.timestamp)
        try:
            os.system("mkdir -p " + self.int8_model_path)
        except Exception as e:
            print("Failed to create {} due to {}".format(self.int8_model_path,
                                                         str(e)))
            sys.exit(-1)

    def tearDown(self):
        try:
            os.system("rm -rf {}".format(self.int8_model_path))
        except Exception as e:
            print("Failed to delete {} due to {}".format(self.int8_model_path,
                                                         str(e)))

    def cache_unzipping(self, target_folder, zip_path):
        cmd = 'tar xf {0} -C {1}'.format(zip_path, target_folder)
        os.system(cmd)

    def download_model(self, data_url, data_md5, folder_name):
        download(data_url, self.download_path, data_md5)
        file_name = data_url.split('/')[-1]
        zip_path = os.path.join(self.cache_folder, file_name)
        print('Data is downloaded at {0}'.format(zip_path))

        data_cache_folder = os.path.join(self.cache_folder, folder_name)
        self.cache_unzipping(self.cache_folder, zip_path)
        return data_cache_folder

    def run_program(self, model_path, batch_size, infer_iterations):
        print("test model path:" + model_path)
        place = fluid.CPUPlace()
        exe = fluid.Executor(place)
        [infer_program, feed_dict, fetch_targets] = \
            fluid.io.load_inference_model(
                model_path, model_filename='model.pdmodel',
                params_filename='model.pdiparams', executor=exe)
        val_reader = paddle.batch(paddle.dataset.mnist.test(), batch_size)

        img_shape = [1, 28, 28]
        test_info = []
        cnt = 0
        periods = []
        for batch_id, data in enumerate(val_reader()):
            image = np.array(
                [x[0].reshape(img_shape) for x in data]).astype("float32")
            input_label = np.array([x[1] for x in data]).astype("int64")

            t1 = time.time()
            out = exe.run(infer_program,
                          feed={feed_dict[0]: image},
                          fetch_list=fetch_targets)
            t2 = time.time()
            period = t2 - t1
            periods.append(period)

            out_label = np.argmax(np.array(out[0]), axis=1)
            top1_num = sum(input_label == out_label)
            test_info.append(top1_num)
            cnt += len(data)

            if (batch_id + 1) == infer_iterations:
                break

        throughput = cnt / np.sum(periods)
        latency = np.average(periods)
        acc1 = np.sum(test_info) / cnt
        return (throughput, latency, acc1)

    def generate_quantized_model(self,
                                 model_path,
                                 algo="KL",
                                 quantizable_op_type=["conv2d"],
                                 is_full_quantize=False,
                                 is_use_cache_file=False,
                                 is_optimize_model=False,
                                 batch_size=10,
                                 batch_nums=10):
        place = fluid.CPUPlace()
        exe = fluid.Executor(place)
        scope = fluid.global_scope()
        val_reader = paddle.dataset.mnist.train()

        ptq = PostTrainingQuantization(
            executor=exe,
            model_dir=model_path,
            model_filename='model.pdmodel',
            params_filename='model.pdiparams',
            sample_generator=val_reader,
            batch_size=batch_size,
            batch_nums=batch_nums,
            algo=algo,
            quantizable_op_type=quantizable_op_type,
            is_full_quantize=is_full_quantize,
            optimize_model=is_optimize_model,
            is_use_cache_file=is_use_cache_file)
        ptq.quantize()
        ptq.save_quantized_model(
            self.int8_model_path,
            model_filename='model.pdmodel',
            params_filename='model.pdiparams')

    def run_test(self,
                 model_name,
                 data_url,
                 data_md5,
                 algo,
                 quantizable_op_type,
                 is_full_quantize,
                 is_use_cache_file,
                 is_optimize_model,
                 diff_threshold,
                 batch_size=10,
                 infer_iterations=10,
                 quant_iterations=5):
        origin_model_path = self.download_model(data_url, data_md5, model_name)
        #origin_model_path = os.path.join(origin_model_path, model_name)

        print("Start FP32 inference for {0} on {1} images ...".format(
            model_name, infer_iterations * batch_size))
        (fp32_throughput, fp32_latency, fp32_acc1) = self.run_program(
            origin_model_path, batch_size, infer_iterations)

        print("Start INT8 post training quantization for {0} on {1} images ...".
              format(model_name, quant_iterations * batch_size))
        self.generate_quantized_model(
            origin_model_path, algo, quantizable_op_type, is_full_quantize,
            is_use_cache_file, is_optimize_model, batch_size, quant_iterations)

        print("Start INT8 inference for {0} on {1} images ...".format(
            model_name, infer_iterations * batch_size))
        (int8_throughput, int8_latency, int8_acc1) = self.run_program(
            self.int8_model_path, batch_size, infer_iterations)

        print("---Post training quantization of {} method---".format(algo))
        print(
            "FP32 {0}: batch_size {1}, throughput {2} img/s, latency {3} s, acc1 {4}."
            .format(model_name, batch_size, fp32_throughput, fp32_latency,
                    fp32_acc1))
        print(
            "INT8 {0}: batch_size {1}, throughput {2} img/s, latency {3} s, acc1 {4}.\n"
            .format(model_name, batch_size, int8_throughput, int8_latency,
                    int8_acc1))
        sys.stdout.flush()

        delta_value = fp32_acc1 - int8_acc1
        self.assertLess(delta_value, diff_threshold)


class TestPostTrainingKLForWhile(TestPostTrainingQuantization):
    def test_post_training_kl(self):
        model_name = "mnist_while"
        data_url = "http://paddle-inference-dist.bj.bcebos.com/int8/mnist_while.tar.gz"
        data_md5 = "2387390beeb37b51dec041c27b8a681f"
        algo = "KL"
        quantizable_op_type = ["conv2d", "depthwise_conv2d", "mul"]
        is_full_quantize = False
        is_use_cache_file = False
        is_optimize_model = True
        diff_threshold = 0.01
        batch_size = 10
        infer_iterations = 50
        quant_iterations = 5
        self.run_test(model_name, data_url, data_md5, algo,
                      quantizable_op_type, is_full_quantize, is_use_cache_file,
                      is_optimize_model, diff_threshold, batch_size,
                      infer_iterations, quant_iterations)


class TestPostTraininghistForWhile(TestPostTrainingQuantization):
    def test_post_training_hist(self):
        model_name = "mnist_while"
        data_url = "http://paddle-inference-dist.bj.bcebos.com/int8/mnist_while.tar.gz"
        data_md5 = "2387390beeb37b51dec041c27b8a681f"
        algo = "hist"
        quantizable_op_type = ["conv2d", "depthwise_conv2d", "mul"]
        is_full_quantize = False
        is_use_cache_file = False
        is_optimize_model = True
        diff_threshold = 0.01
        batch_size = 10
        infer_iterations = 50
        quant_iterations = 5
        self.run_test(model_name, data_url, data_md5, algo,
                      quantizable_op_type, is_full_quantize, is_use_cache_file,
                      is_optimize_model, diff_threshold, batch_size,
                      infer_iterations, quant_iterations)


class TestPostTrainingmseForWhile(TestPostTrainingQuantization):
    def test_post_training_mse(self):
        model_name = "mnist_while"
        data_url = "http://paddle-inference-dist.bj.bcebos.com/int8/mnist_while.tar.gz"
        data_md5 = "2387390beeb37b51dec041c27b8a681f"
        algo = "mse"
        quantizable_op_type = ["conv2d", "depthwise_conv2d", "mul"]
        is_full_quantize = False
        is_use_cache_file = False
        is_optimize_model = True
        diff_threshold = 0.01
        batch_size = 10
        infer_iterations = 50
        quant_iterations = 5
        self.run_test(model_name, data_url, data_md5, algo,
                      quantizable_op_type, is_full_quantize, is_use_cache_file,
                      is_optimize_model, diff_threshold, batch_size,
                      infer_iterations, quant_iterations)


class TestPostTrainingavgForWhile(TestPostTrainingQuantization):
    def test_post_training_avg(self):
        model_name = "mnist_while"
        data_url = "http://paddle-inference-dist.bj.bcebos.com/int8/mnist_while.tar.gz"
        data_md5 = "2387390beeb37b51dec041c27b8a681f"
        algo = "avg"
        quantizable_op_type = ["conv2d", "depthwise_conv2d", "mul"]
        is_full_quantize = False
        is_use_cache_file = False
        is_optimize_model = True
        diff_threshold = 0.01
        batch_size = 10
        infer_iterations = 50
        quant_iterations = 5
        self.run_test(model_name, data_url, data_md5, algo,
                      quantizable_op_type, is_full_quantize, is_use_cache_file,
                      is_optimize_model, diff_threshold, batch_size,
                      infer_iterations, quant_iterations)


class TestPostTrainingMinMaxForWhile(TestPostTrainingQuantization):
    def test_post_training_min_max(self):
        model_name = "mnist_while"
        data_url = "http://paddle-inference-dist.bj.bcebos.com/int8/mnist_while.tar.gz"
        data_md5 = "2387390beeb37b51dec041c27b8a681f"
        algo = "min_max"
        quantizable_op_type = ["conv2d", "depthwise_conv2d", "mul"]
        is_full_quantize = False
        is_use_cache_file = False
        is_optimize_model = True
        diff_threshold = 0.01
        batch_size = 10
        infer_iterations = 50
        quant_iterations = 5
        self.run_test(model_name, data_url, data_md5, algo,
                      quantizable_op_type, is_full_quantize, is_use_cache_file,
                      is_optimize_model, diff_threshold, batch_size,
                      infer_iterations, quant_iterations)


class TestPostTrainingAbsMaxForWhile(TestPostTrainingQuantization):
    def test_post_training_abs_max(self):
        model_name = "mnist_while"
        data_url = "http://paddle-inference-dist.bj.bcebos.com/int8/mnist_while.tar.gz"
        data_md5 = "2387390beeb37b51dec041c27b8a681f"
        algo = "abs_max"
        quantizable_op_type = ["conv2d", "depthwise_conv2d", "mul"]
        is_full_quantize = False
        is_use_cache_file = False
        is_optimize_model = True
        diff_threshold = 0.01
        batch_size = 10
        infer_iterations = 50
        quant_iterations = 5
        self.run_test(model_name, data_url, data_md5, algo,
                      quantizable_op_type, is_full_quantize, is_use_cache_file,
                      is_optimize_model, diff_threshold, batch_size,
                      infer_iterations, quant_iterations)


if __name__ == '__main__':
    unittest.main()
python/paddle/fluid/dataloader/dataloader_iter.py

...
@@ -273,6 +273,8 @@ class _DataLoaderIterSingleProcess(_DataLoaderIterBase):
             else:
                 if self._return_list:
                     data = self._reader.read_next_list()
+                    for i in range(len(data)):
+                        data[i] = data[i]._move_to_list()
                     data = [
                         _restore_batch(d, s)
                         for d, s in zip(data, self._structure_infos[:len(
...
@@ -718,6 +720,8 @@ class _DataLoaderIterMultiProcess(_DataLoaderIterBase):
             else:
                 if self._return_list:
                     data = self._reader.read_next_list()
+                    for i in range(len(data)):
+                        data[i] = data[i]._move_to_list()
                     data = [
                         _restore_batch(d, s)
                         for d, s in zip(data, self._structure_infos[:len(
...
python/paddle/fluid/dygraph/dygraph_to_static/utils.py

...
@@ -547,7 +547,11 @@ def func_to_source_code(function, dedent=True):
         raise TypeError(
             "The type of 'function' should be a function or method, but received {}.".
             format(type(function).__name__))
-    source_code = inspect.getsource(function)
+
+    source_code_list, _ = inspect.getsourcelines(function)
+    source_code_list = [
+        line for line in source_code_list
+        if not line.lstrip().startswith('#')
+    ]
+    source_code = ''.join(source_code_list)
+
     if dedent:
         source_code = textwrap.dedent(source_code)
...
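Worth noting about the filter above: it drops only whole-line comments (lines whose stripped form starts with '#'); a trailing comment such as `x = 1  # note` survives, because that line does not start with '#'. A quick self-contained check of the same logic, on a plain function defined in a script:

import inspect
import textwrap

def f():
    # dropped: whole-line comment
    x = 1  # kept: the line starts with code, not '#'
    return x

lines, _ = inspect.getsourcelines(f)
lines = [l for l in lines if not l.lstrip().startswith('#')]
print(textwrap.dedent(''.join(lines)))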
python/paddle/fluid/dygraph/varbase_patch_methods.py

...
@@ -238,7 +238,7 @@ def monkey_patch_varbase():
                 "Tensor shape not match, Tensor of grad_tensor [ {} ] with shape {} mismatch Tensor [ {} ] with shape {}".format(
                     grad_tensor.name, grad_tensor.shape, self.name, self.shape)

-        if paddle.is_compiled_with_xpu():
+        if paddle.is_compiled_with_xpu() or paddle.is_compiled_with_npu():
             # TODO(liuyuhui): Currently only for xpu. Will be removed in the future.
             scaled_loss = scale_loss(self)
             core.dygraph_run_backward([scaled_loss], [grad_tensor],
...
python/paddle/fluid/executor.py

...
@@ -1999,6 +1999,14 @@ class Executor(object):
                 fetch_list=fetch_list,
                 feed_var_name=feed_var_name,
                 fetch_var_name=fetch_var_name)
+            main_block = cached_program.block(0)
+            for op in main_block.ops:
+                # set the op_role of fetch op to Optimize to avoid
+                # erase the fetched vars by gc for pipeline
+                if op.type == 'fetch':
+                    op._set_attr(
+                        'op_role',
+                        core.op_proto_and_checker_maker.OpRole.Optimize)
             self._add_program_cache(cache_key, cached_program)
         if cached_ctx is None:
             fleet_opt = program._pipeline_opt["fleet_opt"]
...
@@ -2007,6 +2015,18 @@ class Executor(object):
             self._add_ctx_cache(cache_key, cached_ctx)
         if feed:
             self._feed_data(cached_program, feed, feed_var_name, cached_scope)
+
+        from paddle.optimizer.lr import LRScheduler
+        if hasattr(program, 'lr_sheduler'):
+            lr_sheduler = program.lr_sheduler
+            assert isinstance(lr_sheduler, LRScheduler), "must be LRScheduler"
+            lr_value = lr_sheduler()
+            lr_var = program.global_block().vars[lr_sheduler._var_name]
+            data = np.array([lr_value]).astype(convert_dtype(lr_var.dtype))
+            tensor = core.get_variable_tensor(cached_scope,
+                                              lr_sheduler._var_name)
+            tensor.set(data, self.place)
+
         cached_ctx.run()
         if fetch_list:
             arr = cached_scope.find_var(fetch_var_name).get_fetch_list()
...
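A hedged sketch of where `program.lr_sheduler` (the attribute name, typo included, is the one the hunk checks for) comes from on the caller's side; `loss` is assumed to be a variable from the user's static graph. The scheduler object is then re-evaluated on the Python side and written into its learning-rate variable before every run, exactly as the hunk above does:

import paddle
paddle.enable_static()

scheduler = paddle.optimizer.lr.ExponentialDecay(learning_rate=0.1, gamma=0.9)
sgd = paddle.optimizer.SGD(learning_rate=scheduler)
sgd.minimize(loss)  # `loss` built earlier in the user's static graph (assumed)
# the main Program now carries `lr_sheduler` and its variable name, which the
# executor reads and refreshes on each step, as shown above.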
python/paddle/fluid/reader.py

...
@@ -1254,7 +1254,10 @@ class GeneratorLoader(DataLoaderBase):
     def __next__(self):
         try:
             if self._return_list:
-                return self._reader.read_next_list()
+                data = self._reader.read_next_list()
+                for i in range(len(data)):
+                    data[i] = data[i]._move_to_list()
+                return data
             else:
                 return self._reader.read_next()
         except StopIteration:
...
python/paddle/fluid/tests/unittests/dygraph_sharding_stage2.py

...
@@ -30,6 +30,7 @@ from paddle.distributed.fleet.meta_parallel.sharding.sharding_stage2 import ShardingStage2
 seed = 2021
 epoch = 2
 batch_size = 32
+linear_size = 10000

 strategy = fleet.DistributedStrategy()
 strategy.hybrid_configs = {
...
@@ -45,12 +46,12 @@ paddle.seed(seed)
 class MLP(fluid.Layer):
-    def __init__(self, param_attr=None, bias_attr=None):
+    def __init__(self, linear_size=10000, param_attr=None, bias_attr=None):
         super(MLP, self).__init__()

-        self._linear1 = Linear(10000, 10000)
-        self._linear2 = Linear(10000, 10000)
-        self._linear3 = Linear(10000, 10)
+        self._linear1 = Linear(linear_size, linear_size)
+        self._linear2 = Linear(linear_size, linear_size)
+        self._linear3 = Linear(linear_size, 10)

     def forward(self, inputs):
         y = self._linear1(inputs)
...
@@ -59,10 +60,10 @@ class MLP(fluid.Layer):
         return y


-def reader_decorator():
+def reader_decorator(linear_size=10000):
     def __reader__():
         for _ in range(100):
-            img = np.random.rand(10000).astype('float32')
+            img = np.random.rand(linear_size).astype('float32')
             label = np.ones(1).astype('int64')
             yield img, label
...
@@ -120,6 +121,9 @@ def train_mlp(model,
         use_multiprocess=True)
     train_loader.set_sample_list_generator(train_reader)

+    if sharding_stage == 2:
+        model.to(device="gpu")
+
     for eop in range(epoch):
         model.train()
...
@@ -153,9 +157,6 @@ def train_mlp(model,
         if all_test and batch_id == 2:
             return model.parameters()

-    if sharding_stage == 2:
-        model.to(device="gpu")
-
     return model.parameters()
...
python/paddle/fluid/tests/unittests/dygraph_sharding_stage2_offload.py
0 → 100644

# -*- coding: UTF-8 -*-
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import numpy as np
import argparse
import ast
import time
import paddle
import paddle.fluid as fluid
from paddle.fluid.dygraph.nn import Linear
from paddle.distributed import fleet
from paddle.fluid.dygraph import nn

from paddle.distributed.fleet.meta_optimizers.dygraph_optimizer.sharding_optimizer_stage2 import ShardingOptimizerStage2
from paddle.distributed.fleet.meta_parallel.sharding.sharding_stage2 import ShardingStage2
from paddle.distributed.fleet.meta_parallel.sharding.sharding_utils import ShardingScaler

from dygraph_sharding_stage2 import MLP, reader_decorator, optimizer_setting

seed = 2021
epoch = 2
batch_size = 32
linear_size = 8000

np.random.seed(seed)
paddle.seed(seed)


def train_mlp(model, offload=False):
    group = paddle.distributed.new_group([0, 1])
    optimizer = optimizer_setting(model=model, use_pure_fp16=True)

    model = paddle.amp.decorate(models=model, level='O2', save_dtype='float32')
    scaler = paddle.amp.GradScaler(init_loss_scaling=32768)
    scaler = ShardingScaler(scaler, group)

    optimizer = ShardingOptimizerStage2(
        params=model.parameters(),
        optim=optimizer,
        group=group,
        offload=offload)
    model = ShardingStage2(
        model, optimizer, group=group, accumulate_grads=True)

    train_reader = paddle.batch(
        reader_decorator(linear_size), batch_size=batch_size, drop_last=True)

    train_loader = paddle.io.DataLoader.from_generator(
        capacity=32,
        use_double_buffer=True,
        iterable=True,
        return_list=True,
        use_multiprocess=True)
    train_loader.set_sample_list_generator(train_reader)

    for eop in range(epoch):
        model.train()
        for batch_id, data in enumerate(train_loader()):
            img, label = data
            label.stop_gradient = True
            img.stop_gradient = True

            with paddle.amp.auto_cast(True, level='O2'):
                out = model(img)
                loss = paddle.nn.functional.cross_entropy(
                    input=out, label=label)

            avg_loss = paddle.mean(x=loss.cast(dtype=paddle.float32))
            scaler.scale(avg_loss).backward()

            model.grad_scale()
            scaler.step(optimizer)
            scaler.update()
            model.clear_gradients()

    for dtype in optimizer.param_storages:
        for dst_rank, param_storage in optimizer.param_storages[dtype].items():
            param_storage.to(device="gpu", dtype=dtype)

    return model.parameters()


def test_sharding_stage2_offload():
    mlp = MLP(linear_size)
    mlp_offload = MLP(linear_size)
    mlp_offload.set_state_dict(mlp.state_dict())

    mlp_params = train_mlp(mlp, offload=False)
    mlp_offload_params = train_mlp(mlp_offload, offload=True)

    for i in range(len(mlp_params)):
        for j in range(len(mlp_offload_params)):
            if mlp_params[i].name == mlp_offload_params[j].name:
                np.testing.assert_allclose(
                    mlp_params[i].numpy(),
                    mlp_offload_params[j].numpy(),
                    rtol=1e-6)

    return


if __name__ == '__main__':
    test_sharding_stage2_offload()
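(The script above assumes two trainers, matching new_group([0, 1]); in a CI setting it would typically be launched along the lines of `python -m paddle.distributed.launch --gpus=0,1 dygraph_sharding_stage2_offload.py`, each rank then checking the offload run against the plain run at rtol=1e-6.)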
python/paddle/fluid/tests/unittests/dygraph_to_static/test_program_translator.py

...
@@ -26,6 +26,7 @@ import paddle.fluid as fluid
 from paddle.fluid.dygraph.dygraph_to_static import ProgramTranslator
 from paddle.fluid.dygraph.jit import declarative
 from paddle.fluid.dygraph.nn import Linear
+from paddle.fluid.dygraph.dygraph_to_static.utils import func_to_source_code

 from ifelse_simple_func import dyfunc_with_if_else
...
@@ -344,5 +345,18 @@ class TestFunctionTrainEvalMode(unittest.TestCase):
         net.foo.train()


+class TestRemoveCommentInDy2St(unittest.TestCase):
+    def func_with_comment(self):
+        # Comment1
+        x = paddle.to_tensor([1, 2, 3])
+        # Comment2
+        # Comment3
+        y = paddle.to_tensor([4, 5, 6])
+
+    def test_remove_comment(self):
+        code_string = func_to_source_code(self.func_with_comment)
+        self.assertEqual('#' not in code_string, True)
+
+
 if __name__ == '__main__':
     unittest.main()
python/paddle/fluid/tests/unittests/ir/inference/auto_scan_test.py
@@ -322,14 +322,14 @@ class PassAutoScanTest(AutoScanTest):
             "Expected operator list after fusion is {}, but now it's {}".format(
                 op_list_after_fusion, after_op_list), )

     def run_and_statis(self,
                        quant=False,
                        max_examples=100,
                        reproduce=None,
                        min_success_num=25,
                        max_duration=180,
-                       passes=None, ):
+                       passes=None,
+                       use_gpu_run_baseline=False):
         if os.getenv('HYPOTHESIS_TEST_PROFILE', 'ci') == "dev":
             max_examples *= 10
             min_success_num *= 10
@@ -354,7 +354,10 @@ class PassAutoScanTest(AutoScanTest):
             return self.sample_program_config(draw)

         def run_test(prog_config):
-            return self.run_test(quant=quant, prog_configs=[prog_config])
+            return self.run_test(
+                quant=quant,
+                prog_configs=[prog_config],
+                use_gpu_run_baseline=use_gpu_run_baseline)

         generator = st.composite(program_generator)
         loop_func = given(generator())(run_test)
@@ -371,8 +374,8 @@ class PassAutoScanTest(AutoScanTest):
         logging.info("Number of Ran Programs: {}".format(self.num_ran_programs))
         logging.info("Number of Ignore Tests: {}".format(self.num_ignore_tests))
         successful_ran_programs = int(self.num_ran_programs -
                                       self.num_ignore_tests /
-                                      self.num_predictor_kinds)
+                                      max(self.num_predictor_kinds, 1))
         logging.info(
             "Number of successfully ran programs approximately equal to {}".
             format(successful_ran_programs))
@@ -391,7 +394,10 @@ class PassAutoScanTest(AutoScanTest):
                 format(max_duration))
             assert False

-    def run_test(self, quant=False, prog_configs=None):
+    def run_test(self,
+                 quant=False,
+                 prog_configs=None,
+                 use_gpu_run_baseline=False):
         status = True
         for prog_config in prog_configs:
@@ -413,7 +419,9 @@ class PassAutoScanTest(AutoScanTest):
             results: List[Dict[str, np.ndarray]] = []

             # baseline: cpu no ir_optim run
-            base_config = self.create_inference_config(ir_optim=False)
+            base_config = self.create_inference_config(
+                ir_optim=False, use_gpu=use_gpu_run_baseline)
             logging.info('RUN program_config: ' + str(prog_config))
             results.append(
                 self.run_test_config(model, params, prog_config, base_config,
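The upshot of this diff is an opt-in GPU baseline: run_and_statis threads the new use_gpu_run_baseline flag through run_test into create_inference_config, so the un-fused reference run can execute on GPU instead of CPU. A minimal sketch of a subclass opting in (the class and pass names are illustrative placeholders, not from this commit):

class TestSomeGpuOnlyFusePass(PassAutoScanTest):  # hypothetical example
    def test(self):
        # Compare the fused GPU run against an un-fused GPU baseline
        # instead of the default CPU baseline.
        self.run_and_statis(
            quant=False,
            passes=["some_gpu_only_fuse_pass"],  # placeholder pass name
            use_gpu_run_baseline=True)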
python/paddle/fluid/tests/unittests/ir/inference/test_adaptive_pool2d_convert_global_pass_autoscan.py
@@ -109,7 +109,7 @@ class TestAdaptivePool2dConvertGlobalPass(PassAutoScanTest):
     def test(self):
         self.run_and_statis(
             quant=False,
-            max_examples=100,
+            max_examples=300,
             passes=["adaptive_pool2d_convert_global_pass"],
             min_success_num=40)
python/paddle/fluid/tests/unittests/ir/inference/test_fc_elementwise_layernorm_fuse_pass.py
0 → 100644
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from auto_scan_test import PassAutoScanTest, IgnoreReasons
from program_config import TensorConfig, ProgramConfig, OpConfig
import numpy as np
import paddle.inference as paddle_infer
from functools import partial
from typing import Optional, List, Callable, Dict, Any, Set
import unittest

import hypothesis
from hypothesis import given, settings, seed, example, assume, reproduce_failure
import hypothesis.strategies as st


class TestFCElementwiseLayerNormFusePass(PassAutoScanTest):
    """
    x_var   w(persistable)   bias_var(persistable)
      \           |          /
                 fc
                  |
            fc_out_var   bias_var(persistable)
                  \        /
            elementwise_add   bias_var(persistable)   scale_var(persistable)
                   \             |              /
                        layer_norm
                       /     |      \
                      Y   mean_var   variance_var
    """

    def sample_predictor_configs(self, program_config):
        # for gpu
        config = self.create_inference_config(use_gpu=True)
        yield config, ["fused_fc_elementwise_layernorm"], (1e-5, 1e-5)

    def sample_program_config(self, draw):
        # 1. Generate shape of input:X of fc
        x_shape = draw(
            st.lists(
                st.integers(
                    min_value=1, max_value=8), min_size=2, max_size=5))
        x_shape = [2, 1]
        x_rank = len(x_shape)
        # 2. Generate attr:in_num_col_dims of fc
        in_num_col_dims = draw(st.integers(min_value=1, max_value=x_rank - 1))
        # 3. Generate legal shape of input:W/bias of fc
        w_shape = draw(
            st.lists(
                st.integers(
                    min_value=1, max_value=8), min_size=2, max_size=2))
        w_shape[0] = int(np.prod(x_shape[in_num_col_dims:]))
        w_shape = [1, 2]
        fc_bias_shape = [w_shape[1], ]
        if draw(st.booleans()):
            fc_bias_shape.insert(0, 1)
        fc_bias_shape = [2, ]
        fc_out_shape = x_shape[:in_num_col_dims] + w_shape[1:]
        # 4. Generate legal attr:axis/shape of elementwise_add
        add_bias_shape = fc_out_shape[:]
        axis = draw(st.integers(min_value=-1, max_value=0))
        # 5. Generate legal shape of layer_norm
        begin_norm_axis = draw(
            st.integers(
                min_value=1, max_value=len(fc_out_shape) - 1))
        layer_norm_shape = [int(np.prod(fc_out_shape[begin_norm_axis:]))]
        epsilon = 1e-5

        fc_op = OpConfig(
            "fc",
            inputs={"Input": ["fc_x"],
                    "W": ["fc_w"],
                    "Bias": ["fc_bias"]},
            outputs={"Out": ["fc_out"]},
            in_num_col_dims=in_num_col_dims,
            padding_weights=False,
            activation_type="",
            use_quantizer=False,
            use_mkldnn=False, )
        add_op = OpConfig(
            "elementwise_add",
            inputs={"X": ["fc_out"],
                    "Y": ["add_bias"]},
            outputs={"Out": ["add_out"]},
            axis=axis, )
        layer_norm_op = OpConfig(
            "layer_norm",
            inputs={
                "X": ["add_out"],
                "Scale": ["scale"],
                "Bias": ["layer_norm_bias"]
            },
            outputs={
                "Y": ["layer_norm_out"],
                "Mean": ["layer_norm_mean"],
                "Variance": ["layer_norm_var"]
            },
            begin_norm_axis=begin_norm_axis,
            epsilon=epsilon)

        ops = [fc_op, add_op, layer_norm_op]
        program_config = ProgramConfig(
            ops=ops,
            weights={
                "fc_w": TensorConfig(shape=w_shape),
                "fc_bias": TensorConfig(shape=fc_bias_shape),
                "add_bias": TensorConfig(shape=add_bias_shape),
                "scale": TensorConfig(shape=layer_norm_shape),
                "layer_norm_bias": TensorConfig(shape=layer_norm_shape),
            },
            inputs={"fc_x": TensorConfig(shape=x_shape), },
            outputs=ops[-1].outputs["Y"], )
        return program_config

    def test(self):
        self.run_and_statis(
            quant=False,
            max_examples=300,
            passes=["fc_elementwise_layernorm_fuse_pass"],
            use_gpu_run_baseline=True)


if __name__ == "__main__":
    unittest.main()
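For orientation, the unfused pattern this test feeds the pass computes fc, then a broadcast add, then layer normalization. A NumPy sketch of that chain under the shapes the test pins down (x_shape=[2, 1], w_shape=[1, 2], begin_norm_axis=1, epsilon=1e-5); this is an illustration of the math, not code from the commit:

import numpy as np

x = np.random.rand(2, 1).astype("float32")          # "fc_x"
w = np.random.rand(1, 2).astype("float32")          # "fc_w"
fc_bias = np.random.rand(2).astype("float32")       # "fc_bias"
add_bias = np.random.rand(2, 2).astype("float32")   # "add_bias"
scale = np.random.rand(2).astype("float32")         # "scale"
ln_bias = np.random.rand(2).astype("float32")       # "layer_norm_bias"

fc_out = x @ w + fc_bias                  # fc
add_out = fc_out + add_bias               # elementwise_add
mean = add_out.mean(axis=-1, keepdims=True)
var = add_out.var(axis=-1, keepdims=True)
y = (add_out - mean) / np.sqrt(var + 1e-5) * scale + ln_bias  # layer_norm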
python/paddle/fluid/tests/unittests/ir/inference/test_transpose_flatten_concat_fuse_pass.py
-# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -12,72 +12,147 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.

+from auto_scan_test import PassAutoScanTest, IgnoreReasons
+from program_config import TensorConfig, ProgramConfig, OpConfig
+from functools import partial
+from typing import Optional, List, Callable, Dict, Any, Set
 import unittest
+import numpy as np

-import numpy as np
-from inference_pass_test import InferencePassTest
-import paddle.fluid as fluid
-import paddle.fluid.core as core
-from paddle.fluid.core import PassVersionChecker
+import hypothesis
+from hypothesis import given, settings, seed, example, assume, reproduce_failure
+import hypothesis.strategies as st


-class TransposeFlattenConcatFusePassTest(InferencePassTest):
-    def setUp(self):
-        with fluid.program_guard(self.main_program, self.startup_program):
-            data1 = fluid.data(name="data1", shape=[5, 5, 5], dtype="float32")
-            data2 = fluid.data(name="data2", shape=[5, 5, 5], dtype="float32")
-            trans1 = fluid.layers.transpose(data1, perm=[2, 1, 0])
-            trans2 = fluid.layers.transpose(data2, perm=[2, 1, 0])
-            flatt1 = fluid.layers.flatten(trans1)
-            flatt2 = fluid.layers.flatten(trans2)
-            concat_out = fluid.layers.concat([flatt1, flatt2])
-            # There is no parameters for above structure.
-            # Hence, append a batch_norm to avoid failure caused by load_combined.
-            out = fluid.layers.batch_norm(concat_out, is_test=True)
-        self.feeds = {
-            "data1": np.random.random([5, 5, 5]).astype("float32"),
-            "data2": np.random.random([5, 5, 5]).astype("float32")
-        }
-        self.fetch_list = [out]
-
-    def test_check_output(self):
-        # There is no cpu pass for transpose_flatten_concat_fuse
-        if core.is_compiled_with_cuda():
-            use_gpu = True
-            self.check_output_with_option(use_gpu)
-            PassVersionChecker.IsCompatible(
-                'transpose_flatten_concat_fuse_pass')
-
-
-class TransposeFlattenConcatFusePassWithAxisTest(InferencePassTest):
-    def setUp(self):
-        with fluid.program_guard(self.main_program, self.startup_program):
-            data1 = fluid.data(name="data1", shape=[5, 5, 5], dtype="float32")
-            data2 = fluid.data(name="data2", shape=[5, 5, 5], dtype="float32")
-            trans1 = fluid.layers.transpose(data1, perm=[2, 1, 0])
-            trans2 = fluid.layers.transpose(data2, perm=[2, 1, 0])
-            flatt1 = fluid.layers.flatten(trans1, axis=2)
-            flatt2 = fluid.layers.flatten(trans2, axis=2)
-            concat_out = fluid.layers.concat([flatt1, flatt2], axis=1)
-            # There is no parameters for above structure.
-            # Hence, append a batch_norm to avoid failure caused by load_combined.
-            out = fluid.layers.batch_norm(concat_out, is_test=True)
-        self.feeds = {
-            "data1": np.random.random([5, 5, 5]).astype("float32"),
-            "data2": np.random.random([5, 5, 5]).astype("float32")
-        }
-        self.fetch_list = [out]
-
-    def test_check_output(self):
-        # There is no cpu pass for transpose_flatten_concat_fuse
-        if core.is_compiled_with_cuda():
-            use_gpu = True
-            self.check_output_with_option(use_gpu)
-
-        self.assertTrue(
-            PassVersionChecker.IsCompatible(
-                'transpose_flatten_concat_fuse_pass'))
+class TestTransposeFlattenConcatFusePass(PassAutoScanTest):
+    """
+    x_1_var             x_2_var
+       |                   |
+    transpose2         transpose2
+       |                   |
+    flatten2            flatten2
+        \                  /
+    flatten2_out_var  flatten2_out_var
+              \          /
+                concat
+    """
+
+    def sample_predictor_configs(self, program_config):
+        # There is no cpu pass for transpose_flatten_concat_fuse
+        # TRT
+        # after tensorrt_subgraph_pass, the pass needs to be deleted on TRT
+
+        # for gpu
+        config = self.create_inference_config(use_gpu=True)
+        yield config, ["fusion_transpose_flatten_concat", ], (1e-5, 1e-5)
+
+    def is_program_valid(self, prog_config):
+        concat_axis = prog_config.ops[-1].attrs["axis"]
+        ops_num = len(prog_config.ops) - 1
+        if ops_num % 2 != 0:
+            return False
+        input_num = ops_num // 2
+        flatten_shape = 0
+        x_trans_axis = prog_config.ops[0].attrs["axis"]
+        x_flatten_axis = prog_config.ops[1].attrs["axis"]
+        for i in range(input_num):
+            input_name = "transpose2_x" + str(i)
+            input_shape = prog_config.inputs[input_name].shape
+            trans_axis = prog_config.ops[i * 2].attrs["axis"]
+            if x_trans_axis != trans_axis:
+                return False
+            # calculate shape after transpose
+            input_shape = [input_shape[j] for j in trans_axis]
+            # calculate shape after flatten
+            flatten_axis = prog_config.ops[i * 2 + 1].attrs["axis"]
+            if x_flatten_axis != flatten_axis:
+                return False
+            flatten_shape1 = flatten_shape2 = 1
+            for j in range(len(input_shape)):
+                if j < flatten_axis:
+                    flatten_shape1 *= input_shape[j]
+                else:
+                    flatten_shape2 *= input_shape[j]
+            if concat_axis == 0:
+                if i == 0:
+                    flatten_shape = flatten_shape2
+                elif flatten_shape != flatten_shape2:
+                    return False
+            else:
+                if i == 0:
+                    flatten_shape = flatten_shape1
+                elif flatten_shape != flatten_shape1:
+                    return False
+        return True
+
+    def sample_program_config(self, draw):
+        times = draw(st.integers(min_value=1, max_value=6))
+        concat_axis = draw(st.integers(min_value=0, max_value=1))
+        ops = []
+        concat_input = []
+        inputs = {}
+        x_shape_rank = draw(st.integers(min_value=2, max_value=5))
+        # Generate axis of transpose
+        trans_axis = [j for j in range(x_shape_rank)]
+        for j in range(x_shape_rank - 1):
+            if draw(st.booleans()):
+                trans_axis[j], trans_axis[-1] = trans_axis[-1], trans_axis[j]
+        # Generate axis of flatten
+        flatten_axis = draw(
+            st.integers(
+                min_value=0, max_value=x_shape_rank - 1))
+        for i in range(times):
+            # Generate x_shape of transpose
+            x_shape = draw(
+                st.lists(
+                    st.integers(
+                        min_value=1, max_value=10),
+                    min_size=x_shape_rank,
+                    max_size=x_shape_rank))
+            str_i = str(i)
+            transpose_op = OpConfig(
+                "transpose2",
+                inputs={"X": ["transpose2_x" + str_i], },
+                axis=trans_axis,
+                outputs={
+                    "Out": ["trans_out" + str_i],
+                    "XShape": ["trans_shape" + str_i]
+                }, )
+            ops.append(transpose_op)
+            flatten_op = OpConfig(
+                "flatten2",
+                inputs={"X": ["trans_out" + str_i], },
+                axis=flatten_axis,
+                outputs={
+                    "Out": ["flatten2_out" + str_i],
+                    "XShape": ["xshape" + str_i]
+                }, )
+            concat_input.append("flatten2_out" + str_i)
+            ops.append(flatten_op)
+            inputs["transpose2_x" + str_i] = TensorConfig(shape=x_shape)
+
+        concat_op = OpConfig(
+            "concat",
+            inputs={
+                "X": concat_input,
+                "AxisTensor": [],
+            },
+            outputs={"Out": ["concat_out"]},
+            axis=concat_axis, )
+        ops.append(concat_op)
+
+        program_config = ProgramConfig(
+            ops=ops,
+            weights={},
+            inputs=inputs,
+            outputs=ops[-1].outputs["Out"], )
+        return program_config
+
+    def test(self):
+        self.run_and_statis(
+            quant=False,
+            max_examples=300,
+            passes=["transpose_flatten_concat_fuse_pass"])


 if __name__ == "__main__":
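A note on the validity check above: flatten2 collapses everything before its axis into one dimension and everything from the axis onward into another, and concat then needs the non-concatenated dimension to agree across inputs. A small standalone sketch of that shape rule, mirroring the loop in is_program_valid (illustrative, not from the commit):

def flatten2_shape(shape, axis):
    # flatten2 maps a rank-n shape to [prod(shape[:axis]), prod(shape[axis:])].
    d0 = d1 = 1
    for j, s in enumerate(shape):
        if j < axis:
            d0 *= s
        else:
            d1 *= s
    return [d0, d1]

# A transposed input of shape [5, 3, 4] with flatten axis 1 becomes [5, 12];
# concatenating along axis 1 then requires every input to agree on the 5.
print(flatten2_shape([5, 3, 4], 1))  # [5, 12]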
python/paddle/fluid/tests/unittests/test_auto_search_dist_matmul_op.py
0 → 100644
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import print_function

import unittest
import copy
import numpy as np

import paddle
import paddle.nn as nn
import paddle.static as static
import paddle.nn.functional as F
import paddle.utils as utils
import paddle.fluid.core as core
from paddle.fluid import layers
from paddle.distributed.auto_parallel.operators.common import DistributedOperatorImplContainer
from paddle.distributed.auto_parallel.operators.common import DistributedOperatorImpl
from paddle.distributed.auto_parallel.operators.common import get_distributed_operator_impl_container
from paddle.distributed.auto_parallel.dist_context import DistributedContext, DistributedOperatorContext
from paddle.distributed.auto_parallel.dist_attribute import OperatorDistributedAttribute, TensorDistributedAttribute
from paddle.distributed.auto_parallel.dist_op import DistributedOperator

paddle.enable_static()
device = "gpu" if core.is_compiled_with_cuda() else "cpu"


class MLPLayer(nn.Layer):
    def __init__(self,
                 hidden_size=1024,
                 intermediate_size=4 * 1024,
                 initializer_range=0.02):
        super(MLPLayer, self).__init__()
        d_model = hidden_size
        dim_feedforward = intermediate_size
        weight_attr = paddle.ParamAttr(initializer=nn.initializer.Normal(
            mean=0.0, std=initializer_range))
        bias_attr = None

        self.linear0 = nn.Linear(
            d_model, dim_feedforward, weight_attr, bias_attr=bias_attr)
        self.linear1 = nn.Linear(
            dim_feedforward, d_model, weight_attr, bias_attr=bias_attr)
        self.norm = nn.LayerNorm(d_model, epsilon=1e-5)

    def forward(self, input):
        out = self.norm(input)
        out = self.linear0(out)
        out = F.gelu(out, approximate=True)
        out = self.linear1(out)
        return out


def mlp_forward(train_program, start_program):
    with static.program_guard(train_program,
                              start_program), utils.unique_name.guard():
        batch_size = 4
        hidden_size = 1024
        sqrt_hidden_size = 32
        double_hidden_size = 64

        input = static.data(name="input", shape=[8, 8, 16], dtype='int32')
        input = paddle.reshape(input, [hidden_size])
        input = paddle.reshape(input, [sqrt_hidden_size, sqrt_hidden_size])
        embedding = paddle.nn.Embedding(2, batch_size, sparse=True)
        input = embedding(input)
        input = paddle.reshape(input, [hidden_size, batch_size])
        input = paddle.transpose(input, perm=[1, 0])
        matmulinput = static.data(
            name="matmulinput",
            shape=[hidden_size, hidden_size],
            dtype='float32')
        input = layers.matmul(x=input, y=matmulinput)
        label = static.data(
            name="label", shape=[batch_size, 1], dtype='float32')
        mlp = MLPLayer(
            hidden_size=hidden_size,
            intermediate_size=4 * hidden_size,
            initializer_range=0.02)

        predict = mlp(input)
        error_cost = paddle.nn.functional.square_error_cost(predict, label)
        loss = paddle.mean(error_cost)
        m = paddle.nn.Softmax()
        loss = m(loss)
    return loss, train_program, start_program


class Testcompatible(unittest.TestCase):
    def test_matmulv2_matmul_2_compatible(self):
        valid_op_dist_attr_list = []
        program = paddle.static.Program()
        startup_program = paddle.static.Program()
        loss, program, start_program = mlp_forward(program, startup_program)
        with static.program_guard(program,
                                  start_program), utils.unique_name.guard():
            matmulx3 = static.data(
                name="matmulx3", shape=[6, 2, 6], dtype='float32')
            matmuly3 = static.data(
                name="matmuly3", shape=[6, 6], dtype='float32')
            output1 = paddle.matmul(x=matmulx3, y=matmuly3)
            output_1 = layers.matmul(x=matmulx3, y=matmuly3)
            matmulx4 = static.data(
                name="matmulx4", shape=[6, 6, 2, 6], dtype='float32')
            matmuly4 = static.data(
                name="matmuly4", shape=[6, 6, 6, 6], dtype='float32')
            output2 = paddle.matmul(x=matmulx4, y=matmuly4)
            output_2 = layers.matmul(x=matmulx4, y=matmuly4)
        ops = program.global_block().ops
        vars = program.global_block().vars
        for idx, op in enumerate(ops):
            if op.type == 'matmul_v2' or op.type == 'matmul':
                dist_op_impl_container = get_distributed_operator_impl_container(
                    op.type)
                impls = dist_op_impl_container.get_impls()
                op_dist_attr = OperatorDistributedAttribute()
                X = op.input_arg_names[0]
                Y = op.input_arg_names[1]
                out = op.output_arg_names[0]
                if len(vars[X].shape) == 2 and len(vars[Y].shape) == 2:
                    op_dist_attr.set_input_dims_mapping(X, [-1, -1])
                    op_dist_attr.set_input_dims_mapping(Y, [-1, -1])
                    op_dist_attr.set_output_dims_mapping(out, [-1, -1])
                    self.assertTrue(impls[2].is_auto_compatible(
                        DistributedOperator(op, op_dist_attr)))
                    op_dist_attr.set_input_dims_mapping(X, [1, -1])
                    self.assertFalse(impls[2].is_auto_compatible(
                        DistributedOperator(op, op_dist_attr)))
                    op_dist_attr.set_input_dims_mapping(X, [-1, 1])
                    self.assertFalse(impls[2].is_auto_compatible(
                        DistributedOperator(op, op_dist_attr)))
                    op_dist_attr.set_input_dims_mapping(Y, [1, -1])
                    self.assertFalse(impls[2].is_auto_compatible(
                        DistributedOperator(op, op_dist_attr)))
                    op_dist_attr.set_input_dims_mapping(Y, [-1, 1])
                    self.assertFalse(impls[2].is_auto_compatible(
                        DistributedOperator(op, op_dist_attr)))
                    op_dist_attr.set_output_dims_mapping(out, [-1, 1])
                    self.assertFalse(impls[2].is_auto_compatible(
                        DistributedOperator(op, op_dist_attr)))
                    op_dist_attr.set_output_dims_mapping(out, [1, -1])
                    self.assertFalse(impls[2].is_auto_compatible(
                        DistributedOperator(op, op_dist_attr)))
                if len(vars[X].shape) == 3 and len(vars[Y].shape) == 2:
                    op_dist_attr.set_input_dims_mapping(X, [-1, -1, -1])
                    op_dist_attr.set_input_dims_mapping(Y, [-1, -1])
                    op_dist_attr.set_output_dims_mapping(out, [-1, -1, -1])
                    self.assertTrue(impls[2].is_auto_compatible(
                        DistributedOperator(op, op_dist_attr)))
                    op_dist_attr.set_output_dims_mapping(out, [1, -1, -1])
                    op_dist_attr.set_input_dims_mapping(X, [-1, -1, 1])
                    self.assertFalse(impls[2].is_auto_compatible(
                        DistributedOperator(op, op_dist_attr)))
                    op_dist_attr.set_input_dims_mapping(Y, [1, -1])
                    self.assertFalse(impls[2].is_auto_compatible(
                        DistributedOperator(op, op_dist_attr)))
                    self.assertFalse(impls[2].is_auto_compatible(
                        DistributedOperator(op, op_dist_attr)))
                    op_dist_attr.set_output_dims_mapping(out, [-1, 1, -1])
                    self.assertFalse(impls[2].is_auto_compatible(
                        DistributedOperator(op, op_dist_attr)))
                if len(vars[X].shape) == 4 and len(vars[Y].shape) == 4:
                    op_dist_attr.set_input_dims_mapping(X, [-1, -1, -1, -1])
                    op_dist_attr.set_input_dims_mapping(Y, [-1, -1, -1, -1])
                    op_dist_attr.set_output_dims_mapping(out, [-1, -1, -1, -1])
                    self.assertTrue(impls[2].is_auto_compatible(
                        DistributedOperator(op, op_dist_attr)))
                    op_dist_attr.set_input_dims_mapping(Y, [0, -1, -1, -1])
                    self.assertFalse(impls[2].is_auto_compatible(
                        DistributedOperator(op, op_dist_attr)))
                    op_dist_attr.set_output_dims_mapping(out, [0, -1, -1, -1])
                    self.assertFalse(impls[2].is_auto_compatible(
                        DistributedOperator(op, op_dist_attr)))
                    op_dist_attr.set_input_dims_mapping(Y, [-1, -1, 0, -1])
                    self.assertFalse(impls[2].is_auto_compatible(
                        DistributedOperator(op, op_dist_attr)))
                    op_dist_attr.set_output_dims_mapping(out, [-1, -1, 0, -1])
                    self.assertFalse(impls[2].is_auto_compatible(
                        DistributedOperator(op, op_dist_attr)))
                    op_dist_attr.set_input_dims_mapping(Y, [-1, -1, -1, 1])
                    self.assertFalse(impls[2].is_auto_compatible(
                        DistributedOperator(op, op_dist_attr)))
                    op_dist_attr.set_output_dims_mapping(out, [-1, -1, 0, -1])
                    self.assertFalse(impls[2].is_auto_compatible(
                        DistributedOperator(op, op_dist_attr)))

    def test_matmulv2_matmul_1_compatible(self):
        valid_op_dist_attr_list = []
        program = paddle.static.Program()
        startup_program = paddle.static.Program()
        loss, program, start_program = mlp_forward(program, startup_program)
        with static.program_guard(program,
                                  start_program), utils.unique_name.guard():
            matmulx3 = static.data(
                name="matmulx3", shape=[6, 2, 6], dtype='float32')
            matmuly3 = static.data(
                name="matmuly3", shape=[6, 6], dtype='float32')
            output1 = paddle.matmul(x=matmulx3, y=matmuly3)
            output_1 = layers.matmul(x=matmulx3, y=matmuly3)
            matmulx4 = static.data(
                name="matmulx4", shape=[6, 6, 6, 6], dtype='float32')
            matmuly4 = static.data(
                name="matmuly4", shape=[6, 6, 6, 6], dtype='float32')
            output2 = paddle.matmul(x=matmulx4, y=matmuly4)
            output_2 = layers.matmul(x=matmulx4, y=matmuly4)
        ops = program.global_block().ops
        vars = program.global_block().vars
        for idx, op in enumerate(ops):
            if op.type == 'matmul_v2' or op.type == 'matmul':
                dist_op_impl_container = get_distributed_operator_impl_container(
                    op.type)
                impls = dist_op_impl_container.get_impls()
                op_dist_attr = OperatorDistributedAttribute()
                X = op.input_arg_names[0]
                Y = op.input_arg_names[1]
                out = op.output_arg_names[0]
                if len(vars[X].shape) == 2 and len(vars[Y].shape) == 2:
                    op_dist_attr.set_input_dims_mapping(X, [-1, 1])
                    op_dist_attr.set_input_dims_mapping(Y, [1, -1])
                    op_dist_attr.set_output_dims_mapping(out, [-1, -1])
                    dist_op = DistributedOperator(op, op_dist_attr)
                    op_dist_attr.set_output_dims_mapping(out, [1, -1])
                    self.assertFalse(impls[1].is_auto_compatible(
                        DistributedOperator(op, op_dist_attr)))
                    op_dist_attr.set_input_dims_mapping(X, [-1, -1])
                    self.assertFalse(impls[1].is_auto_compatible(
                        DistributedOperator(op, op_dist_attr)))
                    op_dist_attr.set_input_dims_mapping(Y, [-1, -1])
                    self.assertFalse(impls[1].is_auto_compatible(
                        DistributedOperator(op, op_dist_attr)))
                if len(vars[X].shape) == 3 and len(vars[Y].shape) == 2:
                    op_dist_attr.set_input_dims_mapping(X, [-1, -1, 1])
                    op_dist_attr.set_input_dims_mapping(Y, [1, -1])
                    op_dist_attr.set_output_dims_mapping(out, [-1, -1, -1])
                    self.assertTrue(impls[1].is_auto_compatible(
                        DistributedOperator(op, op_dist_attr)))
                    op_dist_attr.set_output_dims_mapping(out, [1, -1, 1])
                    self.assertFalse(impls[1].is_auto_compatible(
                        DistributedOperator(op, op_dist_attr)))
                    op_dist_attr.set_input_dims_mapping(out, [-1, -1, -1])
                    self.assertFalse(impls[1].is_auto_compatible(
                        DistributedOperator(op, op_dist_attr)))
                    op_dist_attr.set_output_dims_mapping(out, [-1, 0, -1])
                    self.assertFalse(impls[1].is_auto_compatible(
                        DistributedOperator(op, op_dist_attr)))
                    op_dist_attr.set_input_dims_mapping(X, [-1, -1, -1])
                    self.assertFalse(impls[1].is_auto_compatible(
                        DistributedOperator(op, op_dist_attr)))
                if len(vars[X].shape) == 4 and len(vars[Y].shape) == 4:
                    op_dist_attr.set_input_dims_mapping(X, [-1, -1, -1, 1])
                    op_dist_attr.set_input_dims_mapping(Y, [-1, -1, 1, -1])
                    op_dist_attr.set_output_dims_mapping(out, [-1, -1, -1, -1])
                    self.assertTrue(impls[1].is_auto_compatible(
                        DistributedOperator(op, op_dist_attr)))
                    op_dist_attr.set_input_dims_mapping(Y, [0, -1, -1, -1])
                    self.assertFalse(impls[1].is_auto_compatible(
                        DistributedOperator(op, op_dist_attr)))
                    op_dist_attr.set_output_dims_mapping(out, [0, -1, -1, -1])
                    self.assertFalse(impls[1].is_auto_compatible(
                        DistributedOperator(op, op_dist_attr)))
                    op_dist_attr.set_input_dims_mapping(Y, [-1, -1, 0, -1])
                    self.assertFalse(impls[1].is_auto_compatible(
                        DistributedOperator(op, op_dist_attr)))
                    op_dist_attr.set_output_dims_mapping(out, [-1, -1, 0, -1])
                    self.assertFalse(impls[1].is_auto_compatible(
                        DistributedOperator(op, op_dist_attr)))
                    op_dist_attr.set_input_dims_mapping(Y, [-1, -1, -1, 1])
                    self.assertFalse(impls[1].is_auto_compatible(
                        DistributedOperator(op, op_dist_attr)))
                    op_dist_attr.set_output_dims_mapping(out, [-1, -1, 0, -1])
                    self.assertFalse(impls[1].is_auto_compatible(
                        DistributedOperator(op, op_dist_attr)))

    def test_matmulv2_matmul_0_compatible(self):
        valid_op_dist_attr_list = []
        program = paddle.static.Program()
        startup_program = paddle.static.Program()
        loss, program, start_program = mlp_forward(program, startup_program)
        with static.program_guard(program,
                                  start_program), utils.unique_name.guard():
            matmulx3 = static.data(
                name="matmulx3", shape=[6, 2, 6], dtype='float32')
            matmuly3 = static.data(
                name="matmuly3", shape=[6, 6], dtype='float32')
            output1 = paddle.matmul(x=matmulx3, y=matmuly3)
            output_1 = layers.matmul(x=matmulx3, y=matmuly3)
            matmulx4 = static.data(
                name="matmulx4", shape=[6, 6, 2, 6], dtype='float32')
            matmuly4 = static.data(
                name="matmuly4", shape=[6, 6, 6, 6], dtype='float32')
            output2 = paddle.matmul(x=matmulx4, y=matmuly4)
            output_2 = layers.matmul(x=matmulx4, y=matmuly4)
        ops = program.global_block().ops
        vars = program.global_block().vars
        for idx, op in enumerate(ops):
            if op.type == 'matmul_v2' or op.type == 'matmul':
                dist_op_impl_container = get_distributed_operator_impl_container(
                    op.type)
                impls = dist_op_impl_container.get_impls()
                op_dist_attr = OperatorDistributedAttribute()
                X = op.input_arg_names[0]
                Y = op.input_arg_names[1]
                out = op.output_arg_names[0]
                if len(vars[X].shape) == 2 and len(vars[Y].shape) == 2:
                    op_dist_attr.set_input_dims_mapping(X, [-1, -1])
                    op_dist_attr.set_input_dims_mapping(Y, [-1, 1])
                    op_dist_attr.set_output_dims_mapping(out, [-1, 1])
                    self.assertTrue(impls[0].is_auto_compatible(
                        DistributedOperator(op, op_dist_attr)))
                    op_dist_attr.set_input_dims_mapping(X, [-1, 1])
                    self.assertFalse(impls[0].is_auto_compatible(
                        DistributedOperator(op, op_dist_attr)))
                    op_dist_attr.set_input_dims_mapping(Y, [1, 1])
                    self.assertFalse(impls[0].is_auto_compatible(
                        DistributedOperator(op, op_dist_attr)))
                    op_dist_attr.set_output_dims_mapping(out, [0, 0])
                    self.assertFalse(impls[0].is_auto_compatible(
                        DistributedOperator(op, op_dist_attr)))
                    op_dist_attr.set_input_dims_mapping(X, [0, -1])
                    op_dist_attr.set_output_dims_mapping(out, [1, 1])
                    self.assertFalse(impls[0].is_auto_compatible(
                        DistributedOperator(op, op_dist_attr)))
                    op_dist_attr.set_input_dims_mapping(Y, [1, -1])
                    self.assertFalse(impls[0].is_auto_compatible(
                        DistributedOperator(op, op_dist_attr)))
                if len(vars[X].shape) == 3 and len(vars[Y].shape) == 2:
                    op_dist_attr.set_input_dims_mapping(X, [-1, -1, -1])
                    op_dist_attr.set_input_dims_mapping(Y, [-1, 1])
                    op_dist_attr.set_output_dims_mapping(out, [-1, -1, 1])
                    self.assertTrue(impls[0].is_auto_compatible(
                        DistributedOperator(op, op_dist_attr)))
                    op_dist_attr.set_input_dims_mapping(X, [-1, 0, -1])
                    self.assertFalse(impls[0].is_auto_compatible(
                        DistributedOperator(op, op_dist_attr)))
                    op_dist_attr.set_input_dims_mapping(X, [-1, 1, -1])
                    self.assertFalse(impls[0].is_auto_compatible(
                        DistributedOperator(op, op_dist_attr)))
                    op_dist_attr.set_input_dims_mapping(Y, [-1, -1])
                    self.assertFalse(impls[0].is_auto_compatible(
                        DistributedOperator(op, op_dist_attr)))
                    op_dist_attr.set_output_dims_mapping(out, [1, -1, 1])
                    self.assertFalse(impls[0].is_auto_compatible(
                        DistributedOperator(op, op_dist_attr)))
                    op_dist_attr.set_output_dims_mapping(out, [-1, -1, -1])
                    self.assertFalse(impls[0].is_auto_compatible(
                        DistributedOperator(op, op_dist_attr)))
                    op_dist_attr.set_output_dims_mapping(out, [-1, 1, -1])
                    self.assertFalse(impls[0].is_auto_compatible(
                        DistributedOperator(op, op_dist_attr)))
                if len(vars[X].shape) == 4 and len(vars[Y].shape) == 4:
                    op_dist_attr.set_input_dims_mapping(X, [-1, -1, -1, -1])
                    op_dist_attr.set_input_dims_mapping(Y, [-1, -1, -1, 1])
                    op_dist_attr.set_output_dims_mapping(out, [-1, -1, -1, 1])
                    self.assertTrue(impls[0].is_auto_compatible(
                        DistributedOperator(op, op_dist_attr)))
                    op_dist_attr.set_output_dims_mapping(out, [0, -1, -1, 1])
                    self.assertFalse(impls[0].is_auto_compatible(
                        DistributedOperator(op, op_dist_attr)))
                    op_dist_attr.set_input_dims_mapping(X, [-1, 1, 1, -1])
                    self.assertFalse(impls[0].is_auto_compatible(
                        DistributedOperator(op, op_dist_attr)))
                    op_dist_attr.set_input_dims_mapping(X, [-1, 1, -1, -1])
                    self.assertFalse(impls[0].is_auto_compatible(
                        DistributedOperator(op, op_dist_attr)))
                    op_dist_attr.set_input_dims_mapping(X, [-1, -1, 1, -1])
                    self.assertFalse(impls[0].is_auto_compatible(
                        DistributedOperator(op, op_dist_attr)))
                    op_dist_attr.set_input_dims_mapping(Y, [0, -1, -1, 1])
                    self.assertFalse(impls[0].is_auto_compatible(
                        DistributedOperator(op, op_dist_attr)))
                    op_dist_attr.set_output_dims_mapping(out, [-1, 1, 1, 1])
                    self.assertFalse(impls[0].is_auto_compatible(
                        DistributedOperator(op, op_dist_attr)))
                    op_dist_attr.set_input_dims_mapping(Y, [-1, -1, -1, -1])
                    self.assertFalse(impls[0].is_auto_compatible(
                        DistributedOperator(op, op_dist_attr)))
                    op_dist_attr.set_output_dims_mapping(out, [-1, -1, 1, -1])
                    self.assertFalse(impls[0].is_auto_compatible(
                        DistributedOperator(op, op_dist_attr)))
                    op_dist_attr.set_input_dims_mapping(Y, [-1, -1, 1, -1])
                    self.assertFalse(impls[0].is_auto_compatible(
                        DistributedOperator(op, op_dist_attr)))


if __name__ == "__main__":
    unittest.main()
python/paddle/fluid/tests/unittests/test_complex_view_op.py
0 → 100644
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import print_function

import unittest
import numpy as np

from op_test import OpTest
import paddle
from paddle.fluid import dygraph
from paddle import static

paddle.enable_static()


def ref_view_as_complex(x):
    real, imag = np.take(x, 0, axis=-1), np.take(x, 1, axis=-1)
    return real + 1j * imag


def ref_view_as_real(x):
    return np.stack([x.real, x.imag], -1)


class TestViewAsComplexOp(OpTest):
    def setUp(self):
        self.op_type = "as_complex"
        x = np.random.randn(10, 10, 2).astype("float64")
        out_ref = ref_view_as_complex(x)
        self.out_grad = np.ones(
            [10, 10], dtype="float64") + 1j * np.ones(
                [10, 10], dtype="float64")
        self.inputs = {'X': x}
        self.outputs = {'Out': out_ref}

    def test_check_output(self):
        self.check_output()

    def test_check_grad(self):
        self.check_grad(
            ['X'],
            'Out',
            user_defined_grads=[ref_view_as_real(self.out_grad)],
            user_defined_grad_outputs=[self.out_grad])


class TestViewAsRealOp(OpTest):
    def setUp(self):
        self.op_type = "as_real"
        real = np.random.randn(10, 10).astype("float64")
        imag = np.random.randn(10, 10).astype("float64")
        x = real + 1j * imag
        out_ref = ref_view_as_real(x)
        self.inputs = {'X': x}
        self.outputs = {'Out': out_ref}
        self.out_grad = np.ones([10, 10, 2], dtype="float64")

    def test_check_output(self):
        self.check_output()

    def test_check_grad(self):
        self.check_grad(
            ['X'],
            'Out',
            user_defined_grads=[ref_view_as_complex(self.out_grad)],
            user_defined_grad_outputs=[self.out_grad])


class TestViewAsComplexAPI(unittest.TestCase):
    def setUp(self):
        self.x = np.random.randn(10, 10, 2)
        self.out = ref_view_as_complex(self.x)

    def test_dygraph(self):
        with dygraph.guard():
            x = paddle.to_tensor(self.x)
            out_np = paddle.as_complex(x).numpy()
        self.assertTrue(np.allclose(self.out, out_np))

    def test_static(self):
        mp, sp = static.Program(), static.Program()
        with static.program_guard(mp, sp):
            x = static.data("x", shape=[10, 10, 2], dtype="float64")
            out = paddle.as_complex(x)

        exe = static.Executor()
        exe.run(sp)
        [out_np] = exe.run(mp, feed={"x": self.x}, fetch_list=[out])
        self.assertTrue(np.allclose(self.out, out_np))


class TestViewAsRealAPI(unittest.TestCase):
    def setUp(self):
        self.x = np.random.randn(10, 10) + 1j * np.random.randn(10, 10)
        self.out = ref_view_as_real(self.x)

    def test_dygraph(self):
        with dygraph.guard():
            x = paddle.to_tensor(self.x)
            out_np = paddle.as_real(x).numpy()
        self.assertTrue(np.allclose(self.out, out_np))

    def test_static(self):
        mp, sp = static.Program(), static.Program()
        with static.program_guard(mp, sp):
            x = static.data("x", shape=[10, 10], dtype="complex128")
            out = paddle.as_real(x)

        exe = static.Executor()
        exe.run(sp)
        [out_np] = exe.run(mp, feed={"x": self.x}, fetch_list=[out])
        self.assertTrue(np.allclose(self.out, out_np))


if __name__ == "__main__":
    unittest.main()
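The two reference helpers in this file are exact inverses, which is why each op's gradient check can reuse the other helper on the incoming grad. A quick standalone NumPy round-trip sketch with the same definitions as above:

import numpy as np

def ref_view_as_complex(x):
    return np.take(x, 0, axis=-1) + 1j * np.take(x, 1, axis=-1)

def ref_view_as_real(x):
    return np.stack([x.real, x.imag], -1)

x = np.random.randn(4, 3, 2)
assert np.allclose(ref_view_as_real(ref_view_as_complex(x)), x)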
python/paddle/fluid/tests/unittests/test_cuda_graph.py
@@ -34,7 +34,8 @@ class TestCUDAGraph(unittest.TestCase):
         paddle.set_flags({
             'FLAGS_allocator_strategy': 'auto_growth',
             'FLAGS_sync_nccl_allreduce': False,
-            'FLAGS_cudnn_deterministic': True
+            'FLAGS_cudnn_deterministic': True,
+            'FLAGS_use_stream_safe_cuda_allocator': False,
         })

     def random_tensor(self, shape):
@@ -187,6 +188,48 @@ class TestCUDAGraph(unittest.TestCase):
         finally:
             graph.reset()

+    def test_dataloader(self):
+        if not can_use_cuda_graph():
+            return
+
+        class AutoIncDataset(paddle.io.Dataset):
+            def __init__(self, n, dtype):
+                self.n = n
+                self.dtype = dtype
+
+            def __len__(self):
+                return self.n
+
+            def __getitem__(self, idx):
+                return np.array([idx]).astype(self.dtype)
+
+        n = 100
+        dtype = 'int64'
+        dataset = AutoIncDataset(n, dtype)
+        data_loader = paddle.io.DataLoader(
+            dataset, batch_size=1, num_workers=2, use_buffer_reader=True)
+        x = None
+        y = None
+
+        graph = None
+        for i, data in enumerate(data_loader):
+            if graph is None:
+                x = data
+                x = x.cuda()
+                graph = CUDAGraph()
+                graph.capture_begin()
+                y = x * x
+                graph.capture_end()
+            else:
+                x.copy_(data, False)
+                x = x.cuda()
+
+            graph.replay()
+            actual_x = np.array([[i]]).astype(dtype)
+            actual_y = np.array([[i * i]]).astype(dtype)
+            self.assertTrue(np.array_equal(actual_x, x.numpy()))
+            self.assertTrue(np.array_equal(actual_y, y.numpy()))
+
+
 if __name__ == "__main__":
     unittest.main()
python/paddle/fluid/tests/unittests/test_distribution.py
@@ -336,6 +336,29 @@ class UniformTest11(UniformTest):
             name='values', shape=[dims], dtype='float32')


+class UniformTestSample(unittest.TestCase):
+    def setUp(self):
+        self.init_param()
+
+    def init_param(self):
+        self.low = 3.0
+        self.high = 4.0
+
+    def test_uniform_sample(self):
+        paddle.disable_static()
+        uniform = Uniform(low=self.low, high=self.high)
+        s = uniform.sample([100])
+        self.assertTrue((s >= self.low).all())
+        self.assertTrue((s < self.high).all())
+        paddle.enable_static()
+
+
+class UniformTestSample2(UniformTestSample):
+    def init_param(self):
+        self.low = -5.0
+        self.high = 2.0
+
+
 class NormalNumpy(DistributionNumpy):
     def __init__(self, loc, scale):
         self.loc = np.array(loc)
python/paddle/fluid/tests/unittests/test_dygraph_sharding_stage2.py
@@ -26,6 +26,9 @@ class TestDygraphShardingStage2(TestMultipleGpus):
     def test_dygraph_sharding_optimizer_stage2(self):
         self.run_mnist_2gpu('dygraph_sharding_stage2.py')

+    def test_dygraph_sharding_optimizer_stage2_offload(self):
+        self.run_mnist_2gpu('dygraph_sharding_stage2_offload.py')
+
 if __name__ == "__main__":
     unittest.main()
python/paddle/fluid/tests/unittests/test_fleet_executor.py
@@ -47,6 +47,18 @@ class TestFleetExecutor(unittest.TestCase):
             name='y', shape=y_data.shape, dtype=y_data.dtype)
         z = x + y
         a = 2 * x + 3 * y
+        loss = paddle.mean(a)
+        base_lr = 0.1
+        passes = [30, 60, 80, 90]
+        steps_per_pass = 10
+        bd = [steps_per_pass * p for p in passes]
+        lr = [base_lr * (0.1**i) for i in range(len(bd) + 1)]
+        lr_val = paddle.optimizer.lr.PiecewiseDecay(
+            boundaries=bd, values=lr)
+        opt = paddle.optimizer.AdamW(
+            learning_rate=lr_val,
+            grad_clip=fluid.clip.GradientClipByGlobalNorm(clip_norm=1.0))
+        opt.minimize(loss)
         # TODO: section_program will be removed in the future
         empty_program._pipeline_opt = {
             "fleet_opt": self.fake_fleet_opt(),
python/paddle/fluid/tests/unittests/test_gcd.py
0 → 100644
# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import print_function

import unittest
import numpy as np
import paddle
import paddle.fluid as fluid
import paddle.fluid.core as core
from paddle.fluid import Program, program_guard
from op_test import OpTest

paddle.enable_static()


class TestGcdAPI(unittest.TestCase):
    def setUp(self):
        self.x_np = 12
        self.y_np = 20
        self.x_shape = [1]
        self.y_shape = [1]

    def test_static_graph(self):
        startup_program = fluid.Program()
        train_program = fluid.Program()
        with fluid.program_guard(startup_program, train_program):
            x = fluid.data(name='input1', dtype='int32', shape=self.x_shape)
            y = fluid.data(name='input2', dtype='int32', shape=self.y_shape)
            out = paddle.gcd(x, y)

            place = fluid.CUDAPlace(0) if core.is_compiled_with_cuda(
            ) else fluid.CPUPlace()
            exe = fluid.Executor(place)
            res = exe.run(fluid.default_main_program(),
                          feed={'input1': self.x_np,
                                'input2': self.y_np},
                          fetch_list=[out])
            self.assertTrue((np.array(res[0]) == np.gcd(self.x_np, self.y_np)
                             ).all())

    def test_dygraph(self):
        paddle.disable_static()
        x = paddle.to_tensor(self.x_np)
        y = paddle.to_tensor(self.y_np)
        result = paddle.gcd(x, y)
        self.assertEqual(
            np.allclose(np.gcd(self.x_np, self.y_np), result.numpy()), True)
        paddle.enable_static()


class TestGcdAPI2(TestGcdAPI):
    def setUp(self):
        self.x_np = np.arange(6).astype(np.int32)
        self.y_np = np.array([20]).astype(np.int32)
        self.x_shape = [6]
        self.y_shape = [1]


class TestGcdAPI3(TestGcdAPI):
    def setUp(self):
        self.x_np = 0
        self.y_np = 20
        self.x_shape = [1]
        self.y_shape = [1]


class TestGcdAPI4(TestGcdAPI):
    def setUp(self):
        self.x_np = 0
        self.y_np = 0
        self.x_shape = [1]
        self.y_shape = [1]


class TestGcdAPI5(TestGcdAPI):
    def setUp(self):
        self.x_np = 12
        self.y_np = -20
        self.x_shape = [1]
        self.y_shape = [1]
python/paddle/fluid/tests/unittests/test_lcm.py
0 → 100644
# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import print_function

import unittest
import numpy as np
import paddle
import paddle.fluid as fluid
import paddle.fluid.core as core
from paddle.fluid import Program, program_guard
from op_test import OpTest

paddle.enable_static()


class TestLcmAPI(unittest.TestCase):
    def setUp(self):
        self.x_np = 12
        self.y_np = 20
        self.x_shape = [1]
        self.y_shape = [1]

    def test_static_graph(self):
        startup_program = fluid.Program()
        train_program = fluid.Program()
        with fluid.program_guard(startup_program, train_program):
            x1 = fluid.data(name='input1', dtype='int32', shape=self.x_shape)
            x2 = fluid.data(name='input2', dtype='int32', shape=self.y_shape)
            out = paddle.lcm(x1, x2)

            place = fluid.CUDAPlace(0) if core.is_compiled_with_cuda(
            ) else fluid.CPUPlace()
            exe = fluid.Executor(place)
            res = exe.run(fluid.default_main_program(),
                          feed={'input1': self.x_np,
                                'input2': self.y_np},
                          fetch_list=[out])
            self.assertTrue((np.array(res[0]) == np.lcm(self.x_np, self.y_np)
                             ).all())

    def test_dygraph(self):
        paddle.disable_static()
        x1 = paddle.to_tensor(self.x_np)
        x2 = paddle.to_tensor(self.y_np)
        result = paddle.lcm(x1, x2)
        self.assertEqual(
            np.allclose(np.lcm(self.x_np, self.y_np), result.numpy()), True)
        paddle.enable_static()


class TestLcmAPI2(TestLcmAPI):
    def setUp(self):
        self.x_np = np.arange(6).astype(np.int32)
        self.y_np = np.array([20]).astype(np.int32)
        self.x_shape = [6]
        self.y_shape = [1]


class TestLcmAPI3(TestLcmAPI):
    def setUp(self):
        self.x_np = 0
        self.y_np = 20
        self.x_shape = [1]
        self.y_shape = [1]


class TestLcmAPI4(TestLcmAPI):
    def setUp(self):
        self.x_np = 0
        self.y_np = 0
        self.x_shape = [1]
        self.y_shape = [1]


class TestLcmAPI5(TestLcmAPI):
    def setUp(self):
        self.x_np = 12
        self.y_np = -20
        self.x_shape = [1]
        self.y_shape = [1]
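The gcd and lcm fixtures above (12/20, 0/20, 0/0, 12/-20) are related by the usual identity lcm(a, b) * gcd(a, b) = |a * b|, with the 0/0 case defined as 0. A quick NumPy check of that relationship — illustrative only, not part of the commit:

import numpy as np

a, b = 12, -20
assert np.lcm(a, b) == abs(a * b) // np.gcd(a, b)  # 60 == 240 // 4

# the degenerate case both docstrings call out:
assert np.gcd(0, 0) == 0 and np.lcm(0, 0) == 0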
python/paddle/framework/__init__.py
@@ -23,6 +23,7 @@ from .framework import set_grad_enabled  # noqa: F401
 from ..fluid.param_attr import ParamAttr  # noqa: F401
 from ..fluid.layers.tensor import create_parameter  # noqa: F401
 from ..fluid.core import CPUPlace  # noqa: F401
+from ..fluid.core import IPUPlace  # noqa: F401
 from ..fluid.core import CUDAPlace  # noqa: F401
 from ..fluid.core import CUDAPinnedPlace  # noqa: F401
 from ..fluid.core import NPUPlace  # noqa: F401
python/paddle/tensor/__init__.py
@@ -111,6 +111,9 @@ from .manipulation import unbind  # noqa: F401
 from .manipulation import roll  # noqa: F401
 from .manipulation import chunk  # noqa: F401
 from .manipulation import tensordot  # noqa: F401
+from .manipulation import as_complex  # noqa: F401
+from .manipulation import as_real  # noqa: F401
 from .math import abs  # noqa: F401
 from .math import acos  # noqa: F401
 from .math import asin  # noqa: F401
@@ -194,6 +197,8 @@ from .math import lerp  # noqa: F401
 from .math import lerp_  # noqa: F401
 from .math import rad2deg  # noqa: F401
 from .math import deg2rad  # noqa: F401
+from .math import gcd  # noqa: F401
+from .math import lcm  # noqa: F401
 from .math import diff  # noqa: F401
 from .math import angle  # noqa: F401
@@ -409,6 +414,12 @@ tensor_method_func = [ #noqa
     'multi_dot',
     'solve',
     'triangular_solve',
+    'as_complex',
+    'as_real',
+    'rad2deg',
+    'deg2rad',
+    'gcd',
+    'lcm',
     'diff',
     'lerp',
     'lerp_',
python/paddle/tensor/manipulation.py
@@ -34,6 +34,7 @@ from ..fluid import layers
 from ..fluid.dygraph.inplace_utils import inplace_apis_in_dygraph_only
 import paddle
 from paddle import _C_ops
+from paddle.tensor.attribute import _complex_to_real_dtype, _real_to_complex_dtype

 __all__ = []

@@ -2488,3 +2489,94 @@ def tensordot(x, y, axes=2, name=None):
             [contraction_size, not_contraction_size_y])
     out = x.matmul(y).reshape(shape_out)
     return out
+
+
+def as_complex(x, name=None):
+    """Transform a real tensor to a complex tensor.
+
+    The data type of the input tensor is 'float32' or 'float64', and the data
+    type of the returned tensor is 'complex64' or 'complex128', respectively.
+
+    The shape of the input tensor is ``(*, 2)``, (``*`` means arbitrary shape), i.e.
+    the size of the last axis should be 2, which represents the real and imag part
+    of a complex number. The shape of the returned tensor is ``(*,)``.
+
+    Args:
+        x (Tensor): The input tensor. Data type is 'float32' or 'float64'.
+        name (str, optional): Name for the operation (optional, default is None). For more information, please refer to :ref:`api_guide_Name`.
+
+    Returns:
+        Tensor: The output. Data type is 'complex64' or 'complex128', with the same precision as the input.
+
+    Examples:
+        .. code-block:: python
+
+            import paddle
+            x = paddle.arange(12, dtype=paddle.float32).reshape([2, 3, 2])
+            y = paddle.as_complex(x)
+            print(y.numpy())
+
+            # [[ 0. +1.j  2. +3.j  4. +5.j]
+            #  [ 6. +7.j  8. +9.j 10.+11.j]]
+    """
+    if in_dygraph_mode():
+        return paddle._C_ops.as_complex(x)
+
+    check_variable_and_dtype(x, 'x', ['float32', 'float64'], 'as_complex')
+    op_type = "as_complex"
+    helper = LayerHelper(op_type, **locals())
+    inputs = {"X": x}
+    out = helper.create_variable_for_type_inference(
+        dtype=_real_to_complex_dtype(x.dtype))
+    outputs = {"Out": out}
+    attrs = {}
+    helper.append_op(type=op_type, inputs=inputs, attrs=attrs, outputs=outputs)
+    return out
+
+
+def as_real(x, name=None):
+    """Transform a complex tensor to a real tensor.
+
+    The data type of the input tensor is 'complex64' or 'complex128', and the data
+    type of the returned tensor is 'float32' or 'float64', respectively.
+
+    When the shape of the input tensor is ``(*, )``, (``*`` means arbitrary shape),
+    the shape of the output tensor is ``(*, 2)``, i.e. the shape of the output is
+    the shape of the input appended by an extra ``2``.
+
+    Args:
+        x (Tensor): The input tensor. Data type is 'complex64' or 'complex128'.
+        name (str, optional): Name for the operation (optional, default is None). For more information, please refer to :ref:`api_guide_Name`.
+
+    Returns:
+        Tensor: The output. Data type is 'float32' or 'float64', with the same precision as the input.
+
+    Examples:
+        .. code-block:: python
+
+            import paddle
+            x = paddle.arange(12, dtype=paddle.float32).reshape([2, 3, 2])
+            y = paddle.as_complex(x)
+            z = paddle.as_real(y)
+            print(z.numpy())
+
+            # [[[ 0.  1.]
+            #   [ 2.  3.]
+            #   [ 4.  5.]]
+
+            #  [[ 6.  7.]
+            #   [ 8.  9.]
+            #   [10. 11.]]]
+    """
+    if in_dygraph_mode():
+        return paddle._C_ops.as_real(x)
+
+    check_variable_and_dtype(x, 'x', ['complex64', 'complex128'], 'as_real')
+    op_type = "as_real"
+    helper = LayerHelper(op_type, **locals())
+    inputs = {"X": x}
+    out = helper.create_variable_for_type_inference(
+        dtype=_complex_to_real_dtype(x.dtype))
+    outputs = {"Out": out}
+    helper.append_op(type=op_type, inputs=inputs, outputs=outputs)
+    return out
python/paddle/tensor/math.py
...
@@ -2624,9 +2624,9 @@ def lerp(x, y, weight, name=None):
...
@@ -2624,9 +2624,9 @@ def lerp(x, y, weight, name=None):
lerp(x, y, weight) = x + weight * (y - x).
lerp(x, y, weight) = x + weight * (y - x).
Args:
Args:
x (Tensor): An N-D Tensor, the data type is float32, float64.
x (Tensor): An N-D Tensor
with starting points
, the data type is float32, float64.
y (Tensor): An N-D Tensor, the data type is float32, float64.
y (Tensor): An N-D Tensor
with ending points
, the data type is float32, float64.
weight (float|Tensor):
the weight for the interpolation formula
.
weight (float|Tensor):
The weight for the interpolation formula. When weight is Tensor, the data type is float32, float64
.
name (str, optional): Name for the operation (optional, default is None). For more information, please refer to :ref:`api_guide_Name`.
name (str, optional): Name for the operation (optional, default is None). For more information, please refer to :ref:`api_guide_Name`.
Returns:
Returns:
...
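To make the interpolation formula concrete, a minimal usage sketch; the output values follow directly from lerp(x, y, weight) = x + weight * (y - x):

import paddle

x = paddle.to_tensor([1.0, 2.0])
y = paddle.to_tensor([3.0, 6.0])
out = paddle.lerp(x, y, 0.5)
print(out)  # x + 0.5 * (y - x) -> [2.0, 4.0]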
...
@@ -2788,6 +2788,139 @@ def deg2rad(x, name=None):
            type='scale',
            inputs={'X': out_cast},
            outputs={'Out': out},
            attrs={'scale': deg2rad_scale})
    return out
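The scale op appended above multiplies by deg2rad_scale (pi / 180). A minimal usage sketch, assuming the standard degrees-to-radians conversion:

import paddle

x = paddle.to_tensor([180.0, 90.0])
print(paddle.deg2rad(x))  # approximately [3.1415927, 1.5707964]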
def gcd(x, y, name=None):
    """
    Computes the element-wise greatest common divisor (GCD) of the inputs |x| and |y|.
    Both x and y must have integer types.

    Note:
        gcd(0, 0) = 0, gcd(0, y) = |y|

    Args:
        x, y (Tensor): An N-D Tensor, the data type is int8, int16, int32, int64, uint8.
            If x.shape != y.shape, they must be broadcastable to a common shape (which becomes the shape of the output).
        name (str, optional): Name for the operation (optional, default is None). For more information, please refer to :ref:`api_guide_Name`.

    Returns:
        out (Tensor): An N-D Tensor, the data type is the same as the input.

    Examples:
        .. code-block:: python

            import paddle
            import numpy as np

            x1 = paddle.to_tensor(12)
            x2 = paddle.to_tensor(20)
            paddle.gcd(x1, x2)
            # Tensor(shape=[1], dtype=int64, place=CUDAPlace(0), stop_gradient=True,
            #        [4])

            x3 = paddle.to_tensor(np.arange(6))
            paddle.gcd(x3, x2)
            # Tensor(shape=[6], dtype=int64, place=CUDAPlace(0), stop_gradient=True,
            #        [20, 1 , 2 , 1 , 4 , 5])

            x4 = paddle.to_tensor(0)
            paddle.gcd(x4, x2)
            # Tensor(shape=[1], dtype=int64, place=CUDAPlace(0), stop_gradient=True,
            #        [20])

            paddle.gcd(x4, x4)
            # Tensor(shape=[1], dtype=int64, place=CUDAPlace(0), stop_gradient=True,
            #        [0])

            x5 = paddle.to_tensor(-20)
            paddle.gcd(x1, x5)
            # Tensor(shape=[1], dtype=int64, place=CUDAPlace(0), stop_gradient=True,
            #        [4])
    """
    shape = paddle.broadcast_shape(x.shape, y.shape)
    x = paddle.broadcast_to(x, shape)
    y = paddle.broadcast_to(y, shape)
    x = paddle.abs(x)
    y = paddle.abs(y)

    def _gcd_cond_fn(x, y):
        return paddle.any(y != 0)

    def _gcd_body_fn(x, y):
        # paddle.mod will raise an error when any element of y is 0. To avoid
        # that, we change those zeros to ones. Their values don't matter because
        # they won't be used.
        y_not_equal_0 = (y != 0)
        y_safe = paddle.where(y_not_equal_0, y, paddle.ones(y.shape, y.dtype))
        x, y = (paddle.where(y_not_equal_0, y, x),
                paddle.where(y_not_equal_0, paddle.mod(x, y_safe),
                             paddle.zeros(y.shape, y.dtype)))
        return (paddle.where(x < y, y, x), paddle.where(x < y, x, y))

    if in_dygraph_mode():
        while _gcd_cond_fn(x, y):
            x, y = _gcd_body_fn(x, y)
        return x
    else:
        check_variable_and_dtype(
            x, 'x', ['int32', 'int64', 'int8', 'int16', 'uint8'], 'gcd')
        check_variable_and_dtype(
            y, 'y', ['int32', 'int64', 'int8', 'int16', 'uint8'], 'gcd')
        out, _ = paddle.static.nn.while_loop(_gcd_cond_fn, _gcd_body_fn, [x, y])
        return out
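The cond/body pair above is the iterative Euclidean algorithm expressed over whole tensors. For reference, a scalar Python sketch of the same loop (gcd_scalar is a hypothetical helper, not part of the Paddle API):

def gcd_scalar(a, b):
    # Mirrors _gcd_cond_fn/_gcd_body_fn for a single pair of ints:
    # iterate (a, b) -> (b, a % b) until b == 0; gcd(0, 0) == 0 falls out.
    a, b = abs(a), abs(b)
    while b != 0:
        a, b = b, a % b
    return a

assert gcd_scalar(12, 20) == 4
assert gcd_scalar(0, 20) == 20
assert gcd_scalar(0, 0) == 0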
def lcm(x, y, name=None):
    """
    Computes the element-wise least common multiple (LCM) of the inputs |x| and |y|.
    Both x and y must have integer types.

    Note:
        lcm(0, 0) = 0, lcm(0, y) = 0

    Args:
        x, y (Tensor): An N-D Tensor, the data type is int8, int16, int32, int64, uint8.
            If x.shape != y.shape, they must be broadcastable to a common shape (which becomes the shape of the output).
        name (str, optional): Name for the operation (optional, default is None). For more information, please refer to :ref:`api_guide_Name`.

    Returns:
        out (Tensor): An N-D Tensor, the data type is the same as the input.

    Examples:
        .. code-block:: python

            import paddle
            import numpy as np

            x1 = paddle.to_tensor(12)
            x2 = paddle.to_tensor(20)
            paddle.lcm(x1, x2)
            # Tensor(shape=[1], dtype=int64, place=CUDAPlace(0), stop_gradient=True,
            #        [60])

            x3 = paddle.to_tensor(np.arange(6))
            paddle.lcm(x3, x2)
            # Tensor(shape=[6], dtype=int64, place=CUDAPlace(0), stop_gradient=True,
            #        [0, 20, 20, 60, 20, 20])

            x4 = paddle.to_tensor(0)
            paddle.lcm(x4, x2)
            # Tensor(shape=[1], dtype=int64, place=CUDAPlace(0), stop_gradient=True,
            #        [0])

            paddle.lcm(x4, x4)
            # Tensor(shape=[1], dtype=int64, place=CUDAPlace(0), stop_gradient=True,
            #        [0])

            x5 = paddle.to_tensor(-20)
            paddle.lcm(x1, x5)
            # Tensor(shape=[1], dtype=int64, place=CUDAPlace(0), stop_gradient=True,
            #        [60])
    """
    d = paddle.gcd(x, y)
    # paddle.mod will raise an error when any element of y is 0. To avoid
    # that, we change those zeros to ones. Their values don't matter because
    # they won't be used.
    d_equal_0 = paddle.equal(d, 0)
    d_safe = paddle.where(d_equal_0, paddle.ones(d.shape, d.dtype), d)
    out = paddle.where(d_equal_0, paddle.zeros(d.shape, d.dtype),
                       paddle.abs(x * y) // d_safe)
    return out
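The implementation relies on the identity lcm(x, y) = |x * y| // gcd(x, y), with the gcd == 0 case handled separately. A scalar sketch (lcm_scalar is a hypothetical helper, not part of the Paddle API):

import math

def lcm_scalar(a, b):
    # lcm(a, b) = |a * b| // gcd(a, b); define lcm as 0 when gcd is 0,
    # matching the d_equal_0 branch above.
    d = math.gcd(abs(a), abs(b))
    return 0 if d == 0 else abs(a * b) // d

assert lcm_scalar(12, 20) == 60
assert lcm_scalar(0, 20) == 0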
def diff(x, n=1, axis=-1, prepend=None, append=None, name=None):
    r"""
    Computes the n-th forward difference along the given axis.
...
@@ -2949,7 +3082,6 @@ def diff(x, n=1, axis=-1, prepend=None, append=None, name=None):
    return out
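For the default n=1 case, each output element is x[i+1] - x[i] along the chosen axis. A minimal sketch:

import paddle

x = paddle.to_tensor([1, 4, 9, 16])
print(paddle.diff(x))  # [3, 5, 7]: each element is x[i+1] - x[i]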
def angle(x, name=None):
    r"""
    Element-wise angle of complex numbers. For non-negative real numbers, the angle is 0 while
...
@@ -2965,7 +3097,7 @@ def angle(x, name=None):
        name (str, optional): Name for the operation (optional, default is None). For more information, please refer to :ref:`api_guide_Name`.

    Returns:
-        out (Tensor): y (Tensor): An N-D Tensor of real data type with the same precision as that of x's data type.
+        Tensor: An N-D Tensor of real data type with the same precision as that of x's data type.

    Examples:
        .. code-block:: python
...
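A short sketch tying angle to the as_complex helper added in this same commit; since angle(a + bj) = atan2(b, a), the value 1 + 1j gives pi/4:

import paddle

z = paddle.as_complex(paddle.to_tensor([[1.0, 1.0]]))  # the complex number 1 + 1j
print(paddle.angle(z))  # approximately [0.7853982], i.e. pi/4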
python/paddle/utils/code_gen/api.yaml (view file @ 883ee1a3)
...
@@ -76,7 +76,7 @@
   infer_meta :
     func : MatmulInferMeta
   kernel :
-    func : matmul_v2
+    func : matmul

 - api : mean
   args : (const Tensor& x, const std::vector<int64_t>& axis, bool keep_dim)
...
python/paddle/utils/code_gen/api_gen.py (view file @ 883ee1a3)
...
@@ -345,6 +345,7 @@ def source_include(header_file_path):
 #include "glog/logging.h"

 #include "paddle/pten/api/lib/api_registry.h"
+#include "paddle/pten/api/lib/kernel_declare.h"
 #include "paddle/pten/api/lib/kernel_dispatch.h"
 #include "paddle/pten/api/lib/utils/allocator.h"
 #include "paddle/pten/core/kernel_registry.h"
...
@@ -353,22 +354,6 @@ def source_include(header_file_path):
 """


-def module_declare():
-    return """
-PT_DECLARE_MODULE(CreationCPU);
-PT_DECLARE_MODULE(LinalgCPU);
-PT_DECLARE_MODULE(ManipulationCPU);
-PT_DECLARE_MODULE(MathCPU);
-#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
-PT_DECLARE_MODULE(CreationCUDA);
-PT_DECLARE_MODULE(LinalgCUDA);
-PT_DECLARE_MODULE(ManipulationCUDA);
-PT_DECLARE_MODULE(MathCUDA);
-#endif
-"""
-
-
 def api_register():
     return """
 PT_REGISTER_API(Creation);
...
@@ -405,7 +390,6 @@ def generate_api(api_yaml_path, header_file_path, source_file_path):
     include_header_file = "paddle/pten/api/include/api.h"
     source_file.write(source_include(include_header_file))
-    source_file.write(module_declare())
     source_file.write(namespace[0])

     for api in apis:
...
tools/parallel_UT_rule.py (view file @ 883ee1a3)
...
@@ -202,7 +202,7 @@ HIGH_PARALLEL_JOB_NEW = [
     'test_fleet_runtime',
     'test_rnn_cudnn_params_packing',
     'test_mkldnn_placement_pass',
-    'test_fc_elementwise_layernorm_fuse_pass',
+    'test_fc_elementwise_layernorm_fuse_pass_cc',
     'program_desc_test',
     'test_simplify_with_basic_ops_pass',
     'test_dygraph_mode_of_unittest',
...
@@ -1417,7 +1417,7 @@ CPU_PARALLEL_JOB = [
     'test_fc_mkldnn_op',
     'test_fc_lstm_fuse_pass',
     'test_fc_gru_fuse_pass',
-    'test_fc_elementwise_layernorm_fuse_pass',
+    'test_fc_elementwise_layernorm_fuse_pass_cc',
     'test_fc_bf16_mkldnn_op',
     'test_executor_feed_non_tensor',
     'test_executor_check_feed',
...