Commit 98c17a68 (unverified)
Authored Dec 23, 2022 by QingshuChen; committed via GitHub on Dec 23, 2022.

suport recompute for kunlun (#49069)

Parent: 644dfc60
Showing 11 changed files with 277 additions and 38 deletions (+277, -38):
paddle/fluid/framework/details/bind_threaded_ssa_graph_executor.cc  (+7, -3)
paddle/fluid/framework/generator.cc  (+32, -0)
paddle/fluid/framework/generator.h  (+2, -0)
paddle/fluid/platform/device_context.cc  (+7, -0)
paddle/fluid/pybind/generator_py.cc  (+1, -0)
paddle/phi/kernels/xpu/dropout_kernel.cc  (+3, -0)
python/paddle/__init__.py  (+4, -0)
python/paddle/distributed/fleet/recompute/recompute.py  (+11, -26)
python/paddle/fluid/tests/unittests/collective/fleet/test_dygraph_recompute_for_eager.py  (+0, -9)
python/paddle/fluid/tests/unittests/xpu/test_recompute_op_xpu.py  (+116, -0)
python/paddle/framework/random.py  (+94, -0)
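In short, the commit makes activation recompute usable on kunlun (XPU) devices: XPU gets its own per-device default random generators, device-agnostic paddle.get_rng_state / paddle.set_rng_state helpers are added, and the recompute path switches to them so dropout masks can be replayed during the recomputed forward pass. A minimal sketch of the workflow this enables (assuming an XPU build at this revision with at least one kunlun card; the toy layers below are illustrative, not from the commit):

    import paddle
    from paddle.distributed.fleet.utils import recompute

    paddle.set_device("xpu")          # select the kunlun device
    paddle.seed(2022)                 # now also seeds the per-device XPU generators

    block = paddle.nn.Sequential(
        paddle.nn.Linear(10, 10),
        paddle.nn.Dropout(p=0.5),     # dropout draws its seed from the XPU generator
        paddle.nn.ReLU(),
    )
    x = paddle.rand([4, 10])
    # The block's activations are recomputed in backward; preserve_rng_state=True
    # captures and restores the RNG state so the recomputed dropout mask matches.
    y = recompute(block, x, preserve_rng_state=True)
    y.mean().backward()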
paddle/fluid/framework/details/bind_threaded_ssa_graph_executor.cc

@@ -254,10 +254,14 @@ void BindThreadedSSAGraphExecutor::RunMultiDeviceOpAsync(
    auto dev_ctxes = op->DeviceContext();
    auto &inputs = op->Inputs();
    for (auto &input : inputs) {
      if (input && input->GeneratedOp() != nullptr) {
        auto dev_ctxes = input->GeneratedOp()->DeviceContext();
        for (auto &item : dev_ctxes) {
          ((platform::XPUDeviceContext *)(item.second))->Wait();
        }
      } else {
        VLOG(3) << "No generated op:" << op->Name();
      }
    }
    op->Run(strategy_.use_device_);
    auto &outputs = op->Outputs();
...
paddle/fluid/framework/generator.cc

@@ -20,11 +20,43 @@ limitations under the License. */
#include <utility>

#include "paddle/fluid/platform/device/gpu/gpu_info.h"
#include "paddle/fluid/platform/device/xpu/xpu_info.h"
#include "paddle/fluid/platform/enforce.h"

namespace paddle {
namespace framework {

const std::shared_ptr<Generator>& DefaultXPUGenerator(int64_t device_id) {
#if defined(PADDLE_WITH_XPU)
  static int64_t num_xpu_devices = -1;
  static std::once_flag num_devices_init_flag;
  static std::deque<std::once_flag> xpu_device_flags;
  static std::vector<std::shared_ptr<Generator>> default_xpu_generators;

  std::call_once(num_devices_init_flag, []() {
    num_xpu_devices = paddle::platform::GetXPUDeviceCount();
    xpu_device_flags.resize(num_xpu_devices);
    default_xpu_generators.resize(num_xpu_devices);
  });
  if (device_id < 0) {
    PADDLE_THROW(platform::errors::InvalidArgument(
        "xpu device id shoule be greater than 0"));
  }

  std::call_once(xpu_device_flags[device_id], [device_id]() {
    default_xpu_generators[device_id] =
        std::make_shared<Generator>(GetRandomSeed(), device_id);
    VLOG(4) << "initial seed: "
            << default_xpu_generators[device_id]->GetCurrentSeed();
  });
  return default_xpu_generators[device_id];
#else
  PADDLE_THROW(platform::errors::PermissionDenied(
      "getDefaultXPUGenerator only support in XPU place"));
#endif
}

const std::shared_ptr<Generator>& DefaultCUDAGenerator(int64_t device_id) {
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
...
paddle/fluid/framework/generator.h

@@ -107,6 +107,8 @@ const std::shared_ptr<Generator>& DefaultCPUGenerator();
const std::shared_ptr<Generator>& DefaultCUDAGenerator(int64_t device_id = -1);

const std::shared_ptr<Generator>& DefaultXPUGenerator(int64_t device_id = -1);

std::shared_ptr<std::mt19937_64> GetCPURandomEngine(uint64_t);

const std::shared_ptr<Generator>& SetRandomSeedGenerator(
...
paddle/fluid/platform/device_context.cc

@@ -170,6 +170,13 @@ std::unique_ptr<DeviceContext> CreateDeviceContext(
    cuda_ctx->PartialInitWithAllocator();
    dev_ctx->SetGenerator(
        framework::DefaultCUDAGenerator(p.GetDeviceId()).get());
#endif
  } else if (is_xpu_place(p)) {
#if defined(PADDLE_WITH_XPU)
    dev_ctx->SetAllocator(
        memory::allocation::AllocatorFacade::Instance().GetAllocator(p).get());
    dev_ctx->SetGenerator(
        framework::DefaultXPUGenerator(p.GetDeviceId()).get());
#endif
  } else {
    dev_ctx->SetAllocator(
...
paddle/fluid/pybind/generator_py.cc

@@ -90,6 +90,7 @@ void BindGenerator(py::module* m_ptr) {
      .def("random", &framework::Generator::Random64);
  m.def("default_cpu_generator", &framework::DefaultCPUGenerator);
  m.def("default_cuda_generator", &framework::DefaultCUDAGenerator);
  m.def("default_xpu_generator", &framework::DefaultXPUGenerator);
  m.def("set_random_seed_generator", &framework::SetRandomSeedGenerator);
  m.def("get_random_seed_generator", &framework::GetRandomSeedGenerator);
}
...
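The new binding can be exercised directly from Python; a hypothetical check (assumes an XPU build at this revision, and that the binding is reachable under paddle.fluid.core, which is an assumption rather than something the diff shows):

    from paddle.fluid import core  # assumed module path for the pybind registrations

    if core.is_compiled_with_xpu():
        gen = core.default_xpu_generator(0)   # per-device XPU Generator, created lazily
        gen.manual_seed(100)                  # same entry point paddle.seed() drives
        state = gen.get_state()               # opaque generator state
        gen.set_state(state)                  # restorable later, e.g. by set_rng_state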
paddle/phi/kernels/xpu/dropout_kernel.cc

@@ -58,6 +58,9 @@ void DropoutRawKernel(const Context& dev_ctx,
    } else {
      seed_data = fix_seed ? seed : 0;
    }
    if (seed_data == 0) {
      seed_data = dev_ctx.GetGenerator()->Random64();
    }

    auto* mask_data = dev_ctx.template Alloc<T>(mask);
    // Special case when dropout_prob is 1.0
...
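With this change the XPU dropout kernel falls back to the device generator whenever no fixed seed is supplied, so from the Python side seeding the generator should be enough to reproduce a mask. A rough sketch (assumes an XPU build at this revision; behaviour stated here follows from the kernel change, not from a documented guarantee):

    import paddle
    import paddle.nn.functional as F

    paddle.set_device("xpu")
    x = paddle.ones([4, 4])

    paddle.seed(123)                 # seeds the XPU generator the kernel now consults
    a = F.dropout(x, p=0.5)          # seed_data == 0 -> drawn from GetGenerator()->Random64()
    paddle.seed(123)
    b = F.dropout(x, p=0.5)          # same generator state, same mask expected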
python/paddle/__init__.py

@@ -327,6 +327,8 @@ from .tensor.einsum import einsum # noqa: F401
from .framework.random import seed  # noqa: F401
from .framework.random import get_cuda_rng_state  # noqa: F401
from .framework.random import set_cuda_rng_state  # noqa: F401
from .framework.random import get_rng_state  # noqa: F401
from .framework.random import set_rng_state  # noqa: F401
from .framework import ParamAttr  # noqa: F401
from .framework import CPUPlace  # noqa: F401
from .framework import IPUPlace  # noqa: F401
...
@@ -424,6 +426,7 @@ __all__ = [ # noqa
    'save',
    'multinomial',
    'get_cuda_rng_state',
    'get_rng_state',
    'rank',
    'empty_like',
    'eye',
...
@@ -606,6 +609,7 @@ __all__ = [ # noqa
    'unique',
    'unique_consecutive',
    'set_cuda_rng_state',
    'set_rng_state',
    'set_printoptions',
    'std',
    'flatten',
...
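These imports and __all__ entries expose the two helpers added in python/paddle/framework/random.py as public API; a minimal, device-agnostic usage sketch at this revision:

    import paddle

    states = paddle.get_rng_state()   # one GeneratorState per generator of the current device
    # ... run ops that consume random numbers (dropout, paddle.rand, ...) ...
    paddle.set_rng_state(states)      # rewind the generators to the captured point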
python/paddle/distributed/fleet/recompute/recompute.py

@@ -56,16 +56,15 @@ def check_recompute_necessary(inputs):
 @contextlib.contextmanager
 def swith_rng_state_tracker(rng_state, tracker):
-    orig_cuda_rng_state = paddle.get_cuda_rng_state()
-    orig_cuda_rng_tracker = get_rng_state_tracker().get_states_tracker()
-    paddle.set_cuda_rng_state(rng_state)
+    orig_rng_state = paddle.get_rng_state()
+    orig_rng_tracker = get_rng_state_tracker().get_states_tracker()
+    paddle.set_rng_state(rng_state)
     get_rng_state_tracker().set_states_tracker(tracker)
     try:
         yield
     finally:
-        paddle.set_cuda_rng_state(orig_cuda_rng_state)
-        get_rng_state_tracker().set_states_tracker(orig_cuda_rng_tracker)
+        paddle.set_rng_state(orig_rng_state)
+        get_rng_state_tracker().set_states_tracker(orig_rng_tracker)


 class LegacyRecomputeFunction(LegacyPyLayer):
...
@@ -95,15 +94,8 @@ class LegacyRecomputeFunction(LegacyPyLayer):
         # NOTE recompute with restore RNG only support one senario where one process for one cuda gpu.
         # one process with multiple gpu and mix-gpu-cpu senarios are not support
         if ctx.preserve_rng_state:
-            cur_device = paddle.get_device()
-            if 'gpu:' not in cur_device:
-                raise RuntimeError(
-                    "Recompute with RNG perserve is not support current device: {}.".format(
-                        cur_device
-                    )
-                )
-            ctx.fw_cuda_rng_state = paddle.get_cuda_rng_state()
-            ctx.fwd_cuda_rng_state_tracker = (
+            ctx.fw_rng_state = paddle.get_rng_state()
+            ctx.fwd_rng_state_tracker = (
                 get_rng_state_tracker().get_states_tracker()
             )
...
@@ -156,7 +148,7 @@ class LegacyRecomputeFunction(LegacyPyLayer):
             # need restore auto_cast state as well as w/b list
             if ctx.preserve_rng_state:
                 with swith_rng_state_tracker(
-                    ctx.fw_cuda_rng_state, ctx.fwd_cuda_rng_state_tracker
+                    ctx.fw_rng_state, ctx.fwd_rng_state_tracker
                 ):
                     with paddle.amp.auto_cast(
                         enable=ctx.is_fw_autocast,
...
@@ -244,15 +236,8 @@ class RecomputeFunction(PyLayer):
         # NOTE recompute with restore RNG only support one senario where one process for one cuda gpu.
         # one process with multiple gpu and mix-gpu-cpu senarios are not support
         if ctx.preserve_rng_state:
-            cur_device = paddle.get_device()
-            if 'gpu:' not in cur_device:
-                raise RuntimeError(
-                    "Recompute with RNG perserve is not support current device: {}.".format(
-                        cur_device
-                    )
-                )
-            ctx.fw_cuda_rng_state = paddle.get_cuda_rng_state()
-            ctx.fwd_cuda_rng_state_tracker = (
+            ctx.fw_rng_state = paddle.get_rng_state()
+            ctx.fwd_rng_state_tracker = (
                 get_rng_state_tracker().get_states_tracker()
             )
...
@@ -305,7 +290,7 @@ class RecomputeFunction(PyLayer):
             # need restore auto_cast state as well as w/b list
             if ctx.preserve_rng_state:
                 with swith_rng_state_tracker(
-                    ctx.fw_cuda_rng_state, ctx.fwd_cuda_rng_state_tracker
+                    ctx.fw_rng_state, ctx.fwd_rng_state_tracker
                 ):
                     with paddle.amp.auto_cast(
                         enable=ctx.is_fw_autocast,
...
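The reworked swith_rng_state_tracker keeps the same save/swap/restore shape, just built on the device-agnostic helpers. Written out as a stand-alone sketch (the helper name below is illustrative, not part of the commit):

    import contextlib

    import paddle


    @contextlib.contextmanager
    def replay_rng_state(saved_state):
        # Save whatever the current device's generators hold, install the captured
        # state, and put the original state back afterwards -- the same pattern the
        # recompute backward pass uses to reproduce its forward-pass dropout masks.
        original = paddle.get_rng_state()
        paddle.set_rng_state(saved_state)
        try:
            yield
        finally:
            paddle.set_rng_state(original)


    captured = paddle.get_rng_state()
    with replay_rng_state(captured):
        pass  # the recomputed forward pass would run here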
python/paddle/fluid/tests/unittests/collective/fleet/test_dygraph_recompute_for_eager.py

@@ -312,15 +312,6 @@ class TestRecompute(unittest.TestCase):
                     recompute_block=[2], recompute_kwargs=kwargs
                 )

-    def test_recompute_cpu_rng(self):
-        paddle.set_device("cpu")
-        for flag in [True, False]:
-            with self.assertRaises(RuntimeError):
-                loss_ref, param_ref, grad_ref = run_model(
-                    recompute_block=[2],
-                    recompute_kwargs={"use_reentrant": flag},
-                )
-

 if __name__ == '__main__':
     unittest.main()
python/paddle/fluid/tests/unittests/xpu/test_recompute_op_xpu.py  (new file, mode 100644)

# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import random

import numpy as np

import paddle
from paddle.distributed.fleet.utils import recompute


def get_fc_block(block_idx, input_size, is_last=False):
    block_name = "block_" + str(block_idx)
    block = paddle.nn.Sequential(
        (
            block_name + "_fc_0",
            paddle.nn.Linear(input_size, input_size, bias_attr=False),
        ),
        (block_name + "_dropout", paddle.nn.Dropout(p=0.5)),
        (block_name + "_relu_1", paddle.nn.ReLU()),
        (
            block_name + "_fc_1",
            paddle.nn.Linear(input_size, input_size, bias_attr=False),
        ),
        (block_name + "_relu_2", paddle.nn.ReLU()),
    )
    if is_last:
        block.add_sublayer(
            block_name + "_fc_2",
            paddle.nn.Linear(input_size, 1, bias_attr=False),
        )
    else:
        block.add_sublayer(
            block_name + "_fc_2",
            paddle.nn.Linear(input_size, input_size, bias_attr=False),
        )
    return block


class Naive_fc_net(paddle.nn.Layer):
    def __init__(
        self, input_size=10, recompute_blocks=[1, 3], recompute_kwargs={}
    ):
        super(Naive_fc_net, self).__init__()
        self.recompute_blocks = recompute_blocks
        self.recompute_kwargs = recompute_kwargs
        self.runfunc0 = get_fc_block(0, input_size, is_last=False)
        self.runfunc1 = get_fc_block(1, input_size, is_last=False)
        self.runfunc2 = get_fc_block(2, input_size, is_last=False)
        self.runfunc3 = get_fc_block(3, input_size, is_last=False)
        self.runfunc4 = get_fc_block(4, input_size, is_last=True)
        self.total_func = [
            self.runfunc0,
            self.runfunc1,
            self.runfunc2,
            self.runfunc3,
            self.runfunc4,
        ]

    def forward(self, inputs):
        nums = len(self.total_func)
        for i in range(nums):
            if i in self.recompute_blocks:
                inputs = recompute(
                    self.total_func[i], inputs, **{"preserve_rng_state": True}
                )
            else:
                inputs = self.total_func[i](inputs)
        return inputs


def run_model(xpu_state, recompute_block=[], recompute_kwargs={}):
    gen = paddle.seed(10)
    random.seed(10)

    batch_size, input_size = 1, 10
    model = Naive_fc_net(
        input_size,
        recompute_blocks=recompute_block,
        recompute_kwargs=recompute_kwargs,
    )
    optimizer = paddle.optimizer.SGD(
        learning_rate=0.01, parameters=model.parameters()
    )
    loss_ = []
    param_ = []
    grad_ = []
    for _ in range(5):
        x = paddle.rand(shape=[batch_size, input_size], dtype="float32")
        y_pred = model(x)
        loss = y_pred.mean()
        loss_.append(loss.item())
        loss.backward()
        optimizer.step()
        param_.append(model.parameters()[9])
        grad_.append(model.parameters()[3]._grad_ivar())
        optimizer.clear_grad()
    return loss_, param_, grad_


xpu_state = paddle.get_rng_state()

# without recompute
loss_ref, param_ref, grad_ref = run_model(xpu_state, recompute_block=[])

loss, param, grad = run_model(xpu_state, recompute_block=[1, 3])

# The result of the recompute_loss should be the same as the normal_loss.
np.testing.assert_allclose(loss_ref, loss, rtol=1e-05, atol=1e-05)
python/paddle/framework/random.py

@@ -45,10 +45,51 @@ def seed(seed):
    if core.is_compiled_with_cuda():
        for i in range(core.get_cuda_device_count()):
            core.default_cuda_generator(i).manual_seed(seed)
    elif core.is_compiled_with_xpu():
        for i in range(core.get_xpu_device_count()):
            core.default_xpu_generator(i).manual_seed(seed)

    return core.default_cpu_generator().manual_seed(seed)


def get_rng_state(device=None):
    """
    Get all random states of random generators of specified device.

    Args:
        device(str): This parameter determines the specific running device.
            It can be ``cpu``, ``gpu``, ``xpu``, Default is None.
            If None, return the generators of current device (specified by ``set_device``).

    Returns:
        GeneratorState: object.

    Examples:
        .. code-block:: python

            import paddle
            sts = paddle.get_rng_state()
    """
    state_list = []
    if device is None:
        place = fluid.framework._current_expected_place()
    else:
        place = device._convert_to_place(device)

    if isinstance(place, core.CPUPlace):
        state_list.append(core.default_cpu_generator().get_state())
    elif isinstance(place, core.CUDAPlace):
        for i in range(core.get_cuda_device_count()):
            state_list.append(core.default_cuda_generator(i).get_state())
    elif isinstance(place, core.XPUPlace):
        for i in range(core.get_xpu_device_count()):
            state_list.append(core.default_xpu_generator(i).get_state())
    else:
        raise ValueError(
            "get_rng_state is not implemented for current device: {}".format(
                place
            )
        )

    return state_list


def get_cuda_rng_state():
    """
...
@@ -75,6 +116,59 @@ def get_cuda_rng_state():
    return state_list


def set_rng_state(state_list, device=None):
    """
    Sets generator state for all device generators.

    Args:
        state_list(list|tuple): The device states to set back to device generators. state_list is obtained from get_rng_state().
        device(str): This parameter determines the specific running device.
            It can be ``cpu``, ``gpu``, ``xpu``, Default is None.
            If None, return the generators of current device (specified by ``set_device``).

    Returns:
        None.

    Examples:
        .. code-block:: python

            import paddle
            sts = paddle.get_rng_state()
            paddle.set_rng_state(sts)
    """
    if device is None:
        place = fluid.framework._current_expected_place()
    else:
        place = device._convert_to_place(device)

    if isinstance(place, core.CUDAPlace):
        if not len(state_list) == core.get_cuda_device_count():
            raise ValueError(
                "Length of gpu state list shoule be equal to the gpu device count"
            )
        for i in range(core.get_cuda_device_count()):
            core.default_cuda_generator(i).set_state(state_list[i])
    elif isinstance(place, core.XPUPlace):
        if not len(state_list) == core.get_xpu_device_count():
            raise ValueError(
                "Length of xpu state list shoule be equal to the xpu device count"
            )
        for i in range(core.get_xpu_device_count()):
            core.default_xpu_generator(i).set_state(state_list[i])
    elif isinstance(place, core.CPUPlace):
        if not len(state_list) == 1:
            raise ValueError("Length of cpu state list shoule be equal to 1")
        core.default_cpu_generator().set_state(state_list[0])
    else:
        raise ValueError(
            "set_rng_state is not implemented for current device: {}".format(
                place
            )
        )


def set_cuda_rng_state(state_list):
    """
...
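Taken together, seed() now drives the CPU, CUDA and XPU generators, while get_rng_state/set_rng_state capture and restore whichever place is current. A round-trip sketch (assumes this revision; intended to behave the same on cpu, gpu or xpu builds, since the kernels draw from the generators these helpers manage):

    import paddle

    drop = paddle.nn.Dropout(p=0.5)   # layers default to train mode in dygraph
    x = paddle.ones([2, 8])

    states = paddle.get_rng_state()   # capture the current device's generator states
    a = drop(x)                       # consumes random numbers
    paddle.set_rng_state(states)      # rewind the generators
    b = drop(x)                       # expected to reproduce the same mask as `a`
    print(bool((a == b).all()))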