Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
BaiXuePrincess
Paddle
提交
98c17a68
P
Paddle
项目概览
BaiXuePrincess
/
Paddle
与 Fork 源项目一致
Fork自
PaddlePaddle / Paddle
通知
1
Star
1
Fork
0
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
0
列表
看板
标记
里程碑
合并请求
0
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
P
Paddle
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
0
Issue
0
列表
看板
标记
里程碑
合并请求
0
合并请求
0
Pages
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
提交
Issue看板
未验证
提交
98c17a68
编写于
12月 23, 2022
作者:
Q
QingshuChen
提交者:
GitHub
12月 23, 2022
浏览文件
操作
浏览文件
下载
电子邮件补丁
差异文件
suport recompute for kunlun (#49069)
上级
644dfc60
变更
11
隐藏空白更改
内联
并排
Showing
11 changed file
with
277 addition
and
38 deletion
+277
-38
paddle/fluid/framework/details/bind_threaded_ssa_graph_executor.cc
...uid/framework/details/bind_threaded_ssa_graph_executor.cc
+7
-3
paddle/fluid/framework/generator.cc
paddle/fluid/framework/generator.cc
+32
-0
paddle/fluid/framework/generator.h
paddle/fluid/framework/generator.h
+2
-0
paddle/fluid/platform/device_context.cc
paddle/fluid/platform/device_context.cc
+7
-0
paddle/fluid/pybind/generator_py.cc
paddle/fluid/pybind/generator_py.cc
+1
-0
paddle/phi/kernels/xpu/dropout_kernel.cc
paddle/phi/kernels/xpu/dropout_kernel.cc
+3
-0
python/paddle/__init__.py
python/paddle/__init__.py
+4
-0
python/paddle/distributed/fleet/recompute/recompute.py
python/paddle/distributed/fleet/recompute/recompute.py
+11
-26
python/paddle/fluid/tests/unittests/collective/fleet/test_dygraph_recompute_for_eager.py
...ests/collective/fleet/test_dygraph_recompute_for_eager.py
+0
-9
python/paddle/fluid/tests/unittests/xpu/test_recompute_op_xpu.py
...paddle/fluid/tests/unittests/xpu/test_recompute_op_xpu.py
+116
-0
python/paddle/framework/random.py
python/paddle/framework/random.py
+94
-0
未找到文件。
paddle/fluid/framework/details/bind_threaded_ssa_graph_executor.cc
浏览文件 @
98c17a68
...
...
@@ -254,9 +254,13 @@ void BindThreadedSSAGraphExecutor::RunMultiDeviceOpAsync(
auto
dev_ctxes
=
op
->
DeviceContext
();
auto
&
inputs
=
op
->
Inputs
();
for
(
auto
&
input
:
inputs
)
{
auto
dev_ctxes
=
input
->
GeneratedOp
()
->
DeviceContext
();
for
(
auto
&
item
:
dev_ctxes
)
{
((
platform
::
XPUDeviceContext
*
)(
item
.
second
))
->
Wait
();
if
(
input
&&
input
->
GeneratedOp
()
!=
nullptr
)
{
auto
dev_ctxes
=
input
->
GeneratedOp
()
->
DeviceContext
();
for
(
auto
&
item
:
dev_ctxes
)
{
((
platform
::
XPUDeviceContext
*
)(
item
.
second
))
->
Wait
();
}
}
else
{
VLOG
(
3
)
<<
"No generated op:"
<<
op
->
Name
();
}
}
op
->
Run
(
strategy_
.
use_device_
);
...
...
paddle/fluid/framework/generator.cc
浏览文件 @
98c17a68
...
...
@@ -20,11 +20,43 @@ limitations under the License. */
#include <utility>
#include "paddle/fluid/platform/device/gpu/gpu_info.h"
#include "paddle/fluid/platform/device/xpu/xpu_info.h"
#include "paddle/fluid/platform/enforce.h"
namespace
paddle
{
namespace
framework
{
const
std
::
shared_ptr
<
Generator
>&
DefaultXPUGenerator
(
int64_t
device_id
)
{
#if defined(PADDLE_WITH_XPU)
static
int64_t
num_xpu_devices
=
-
1
;
static
std
::
once_flag
num_devices_init_flag
;
static
std
::
deque
<
std
::
once_flag
>
xpu_device_flags
;
static
std
::
vector
<
std
::
shared_ptr
<
Generator
>>
default_xpu_generators
;
std
::
call_once
(
num_devices_init_flag
,
[]()
{
num_xpu_devices
=
paddle
::
platform
::
GetXPUDeviceCount
();
xpu_device_flags
.
resize
(
num_xpu_devices
);
default_xpu_generators
.
resize
(
num_xpu_devices
);
});
if
(
device_id
<
0
)
{
PADDLE_THROW
(
platform
::
errors
::
InvalidArgument
(
"xpu device id shoule be greater than 0"
));
}
std
::
call_once
(
xpu_device_flags
[
device_id
],
[
device_id
]()
{
default_xpu_generators
[
device_id
]
=
std
::
make_shared
<
Generator
>
(
GetRandomSeed
(),
device_id
);
VLOG
(
4
)
<<
"initial seed: "
<<
default_xpu_generators
[
device_id
]
->
GetCurrentSeed
();
});
return
default_xpu_generators
[
device_id
];
#else
PADDLE_THROW
(
platform
::
errors
::
PermissionDenied
(
"getDefaultXPUGenerator only support in XPU place"
));
#endif
}
const
std
::
shared_ptr
<
Generator
>&
DefaultCUDAGenerator
(
int64_t
device_id
)
{
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
...
...
paddle/fluid/framework/generator.h
浏览文件 @
98c17a68
...
...
@@ -107,6 +107,8 @@ const std::shared_ptr<Generator>& DefaultCPUGenerator();
const
std
::
shared_ptr
<
Generator
>&
DefaultCUDAGenerator
(
int64_t
device_id
=
-
1
);
const
std
::
shared_ptr
<
Generator
>&
DefaultXPUGenerator
(
int64_t
device_id
=
-
1
);
std
::
shared_ptr
<
std
::
mt19937_64
>
GetCPURandomEngine
(
uint64_t
);
const
std
::
shared_ptr
<
Generator
>&
SetRandomSeedGenerator
(
...
...
paddle/fluid/platform/device_context.cc
浏览文件 @
98c17a68
...
...
@@ -170,6 +170,13 @@ std::unique_ptr<DeviceContext> CreateDeviceContext(
cuda_ctx
->
PartialInitWithAllocator
();
dev_ctx
->
SetGenerator
(
framework
::
DefaultCUDAGenerator
(
p
.
GetDeviceId
()).
get
());
#endif
}
else
if
(
is_xpu_place
(
p
))
{
#if defined(PADDLE_WITH_XPU)
dev_ctx
->
SetAllocator
(
memory
::
allocation
::
AllocatorFacade
::
Instance
().
GetAllocator
(
p
).
get
());
dev_ctx
->
SetGenerator
(
framework
::
DefaultXPUGenerator
(
p
.
GetDeviceId
()).
get
());
#endif
}
else
{
dev_ctx
->
SetAllocator
(
...
...
paddle/fluid/pybind/generator_py.cc
浏览文件 @
98c17a68
...
...
@@ -90,6 +90,7 @@ void BindGenerator(py::module* m_ptr) {
.
def
(
"random"
,
&
framework
::
Generator
::
Random64
);
m
.
def
(
"default_cpu_generator"
,
&
framework
::
DefaultCPUGenerator
);
m
.
def
(
"default_cuda_generator"
,
&
framework
::
DefaultCUDAGenerator
);
m
.
def
(
"default_xpu_generator"
,
&
framework
::
DefaultXPUGenerator
);
m
.
def
(
"set_random_seed_generator"
,
&
framework
::
SetRandomSeedGenerator
);
m
.
def
(
"get_random_seed_generator"
,
&
framework
::
GetRandomSeedGenerator
);
}
...
...
paddle/phi/kernels/xpu/dropout_kernel.cc
浏览文件 @
98c17a68
...
...
@@ -58,6 +58,9 @@ void DropoutRawKernel(const Context& dev_ctx,
}
else
{
seed_data
=
fix_seed
?
seed
:
0
;
}
if
(
seed_data
==
0
)
{
seed_data
=
dev_ctx
.
GetGenerator
()
->
Random64
();
}
auto
*
mask_data
=
dev_ctx
.
template
Alloc
<
T
>(
mask
);
// Special case when dropout_prob is 1.0
...
...
python/paddle/__init__.py
浏览文件 @
98c17a68
...
...
@@ -327,6 +327,8 @@ from .tensor.einsum import einsum # noqa: F401
from
.framework.random
import
seed
# noqa: F401
from
.framework.random
import
get_cuda_rng_state
# noqa: F401
from
.framework.random
import
set_cuda_rng_state
# noqa: F401
from
.framework.random
import
get_rng_state
# noqa: F401
from
.framework.random
import
set_rng_state
# noqa: F401
from
.framework
import
ParamAttr
# noqa: F401
from
.framework
import
CPUPlace
# noqa: F401
from
.framework
import
IPUPlace
# noqa: F401
...
...
@@ -424,6 +426,7 @@ __all__ = [ # noqa
'save'
,
'multinomial'
,
'get_cuda_rng_state'
,
'get_rng_state'
,
'rank'
,
'empty_like'
,
'eye'
,
...
...
@@ -606,6 +609,7 @@ __all__ = [ # noqa
'unique'
,
'unique_consecutive'
,
'set_cuda_rng_state'
,
'set_rng_state'
,
'set_printoptions'
,
'std'
,
'flatten'
,
...
...
python/paddle/distributed/fleet/recompute/recompute.py
浏览文件 @
98c17a68
...
...
@@ -56,16 +56,15 @@ def check_recompute_necessary(inputs):
@
contextlib
.
contextmanager
def
swith_rng_state_tracker
(
rng_state
,
tracker
):
orig_cuda_rng_state
=
paddle
.
get_cuda_rng_state
()
orig_cuda_rng_tracker
=
get_rng_state_tracker
().
get_states_tracker
()
paddle
.
set_cuda_rng_state
(
rng_state
)
orig_rng_state
=
paddle
.
get_rng_state
()
orig_rng_tracker
=
get_rng_state_tracker
().
get_states_tracker
()
paddle
.
set_rng_state
(
rng_state
)
get_rng_state_tracker
().
set_states_tracker
(
tracker
)
try
:
yield
finally
:
paddle
.
set_
cuda_rng_state
(
orig_cuda
_rng_state
)
get_rng_state_tracker
().
set_states_tracker
(
orig_
cuda_
rng_tracker
)
paddle
.
set_
rng_state
(
orig
_rng_state
)
get_rng_state_tracker
().
set_states_tracker
(
orig_rng_tracker
)
class
LegacyRecomputeFunction
(
LegacyPyLayer
):
...
...
@@ -95,15 +94,8 @@ class LegacyRecomputeFunction(LegacyPyLayer):
# NOTE recompute with restore RNG only support one senario where one process for one cuda gpu.
# one process with multiple gpu and mix-gpu-cpu senarios are not support
if
ctx
.
preserve_rng_state
:
cur_device
=
paddle
.
get_device
()
if
'gpu:'
not
in
cur_device
:
raise
RuntimeError
(
"Recompute with RNG perserve is not support current device: {}."
.
format
(
cur_device
)
)
ctx
.
fw_cuda_rng_state
=
paddle
.
get_cuda_rng_state
()
ctx
.
fwd_cuda_rng_state_tracker
=
(
ctx
.
fw_rng_state
=
paddle
.
get_rng_state
()
ctx
.
fwd_rng_state_tracker
=
(
get_rng_state_tracker
().
get_states_tracker
()
)
...
...
@@ -156,7 +148,7 @@ class LegacyRecomputeFunction(LegacyPyLayer):
# need restore auto_cast state as well as w/b list
if
ctx
.
preserve_rng_state
:
with
swith_rng_state_tracker
(
ctx
.
fw_
cuda_rng_state
,
ctx
.
fwd_cuda
_rng_state_tracker
ctx
.
fw_
rng_state
,
ctx
.
fwd
_rng_state_tracker
):
with
paddle
.
amp
.
auto_cast
(
enable
=
ctx
.
is_fw_autocast
,
...
...
@@ -244,15 +236,8 @@ class RecomputeFunction(PyLayer):
# NOTE recompute with restore RNG only support one senario where one process for one cuda gpu.
# one process with multiple gpu and mix-gpu-cpu senarios are not support
if
ctx
.
preserve_rng_state
:
cur_device
=
paddle
.
get_device
()
if
'gpu:'
not
in
cur_device
:
raise
RuntimeError
(
"Recompute with RNG perserve is not support current device: {}."
.
format
(
cur_device
)
)
ctx
.
fw_cuda_rng_state
=
paddle
.
get_cuda_rng_state
()
ctx
.
fwd_cuda_rng_state_tracker
=
(
ctx
.
fw_rng_state
=
paddle
.
get_rng_state
()
ctx
.
fwd_rng_state_tracker
=
(
get_rng_state_tracker
().
get_states_tracker
()
)
...
...
@@ -305,7 +290,7 @@ class RecomputeFunction(PyLayer):
# need restore auto_cast state as well as w/b list
if
ctx
.
preserve_rng_state
:
with
swith_rng_state_tracker
(
ctx
.
fw_
cuda_rng_state
,
ctx
.
fwd_cuda
_rng_state_tracker
ctx
.
fw_
rng_state
,
ctx
.
fwd
_rng_state_tracker
):
with
paddle
.
amp
.
auto_cast
(
enable
=
ctx
.
is_fw_autocast
,
...
...
python/paddle/fluid/tests/unittests/collective/fleet/test_dygraph_recompute_for_eager.py
浏览文件 @
98c17a68
...
...
@@ -312,15 +312,6 @@ class TestRecompute(unittest.TestCase):
recompute_block
=
[
2
],
recompute_kwargs
=
kwargs
)
def
test_recompute_cpu_rng
(
self
):
paddle
.
set_device
(
"cpu"
)
for
flag
in
[
True
,
False
]:
with
self
.
assertRaises
(
RuntimeError
):
loss_ref
,
param_ref
,
grad_ref
=
run_model
(
recompute_block
=
[
2
],
recompute_kwargs
=
{
"use_reentrant"
:
flag
},
)
if
__name__
==
'__main__'
:
unittest
.
main
()
python/paddle/fluid/tests/unittests/xpu/test_recompute_op_xpu.py
0 → 100644
浏览文件 @
98c17a68
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import
random
import
numpy
as
np
import
paddle
from
paddle.distributed.fleet.utils
import
recompute
def
get_fc_block
(
block_idx
,
input_size
,
is_last
=
False
):
block_name
=
"block_"
+
str
(
block_idx
)
block
=
paddle
.
nn
.
Sequential
(
(
block_name
+
"_fc_0"
,
paddle
.
nn
.
Linear
(
input_size
,
input_size
,
bias_attr
=
False
),
),
(
block_name
+
"_dropout"
,
paddle
.
nn
.
Dropout
(
p
=
0.5
)),
(
block_name
+
"_relu_1"
,
paddle
.
nn
.
ReLU
()),
(
block_name
+
"_fc_1"
,
paddle
.
nn
.
Linear
(
input_size
,
input_size
,
bias_attr
=
False
),
),
(
block_name
+
"_relu_2"
,
paddle
.
nn
.
ReLU
()),
)
if
is_last
:
block
.
add_sublayer
(
block_name
+
"_fc_2"
,
paddle
.
nn
.
Linear
(
input_size
,
1
,
bias_attr
=
False
),
)
else
:
block
.
add_sublayer
(
block_name
+
"_fc_2"
,
paddle
.
nn
.
Linear
(
input_size
,
input_size
,
bias_attr
=
False
),
)
return
block
class
Naive_fc_net
(
paddle
.
nn
.
Layer
):
def
__init__
(
self
,
input_size
=
10
,
recompute_blocks
=
[
1
,
3
],
recompute_kwargs
=
{}
):
super
(
Naive_fc_net
,
self
).
__init__
()
self
.
recompute_blocks
=
recompute_blocks
self
.
recompute_kwargs
=
recompute_kwargs
self
.
runfunc0
=
get_fc_block
(
0
,
input_size
,
is_last
=
False
)
self
.
runfunc1
=
get_fc_block
(
1
,
input_size
,
is_last
=
False
)
self
.
runfunc2
=
get_fc_block
(
2
,
input_size
,
is_last
=
False
)
self
.
runfunc3
=
get_fc_block
(
3
,
input_size
,
is_last
=
False
)
self
.
runfunc4
=
get_fc_block
(
4
,
input_size
,
is_last
=
True
)
self
.
total_func
=
[
self
.
runfunc0
,
self
.
runfunc1
,
self
.
runfunc2
,
self
.
runfunc3
,
self
.
runfunc4
,
]
def
forward
(
self
,
inputs
):
nums
=
len
(
self
.
total_func
)
for
i
in
range
(
nums
):
if
i
in
self
.
recompute_blocks
:
inputs
=
recompute
(
self
.
total_func
[
i
],
inputs
,
**
{
"preserve_rng_state"
:
True
}
)
else
:
inputs
=
self
.
total_func
[
i
](
inputs
)
return
inputs
def
run_model
(
xpu_state
,
recompute_block
=
[],
recompute_kwargs
=
{}):
gen
=
paddle
.
seed
(
10
)
random
.
seed
(
10
)
batch_size
,
input_size
=
1
,
10
model
=
Naive_fc_net
(
input_size
,
recompute_blocks
=
recompute_block
,
recompute_kwargs
=
recompute_kwargs
,
)
optimizer
=
paddle
.
optimizer
.
SGD
(
learning_rate
=
0.01
,
parameters
=
model
.
parameters
()
)
loss_
=
[]
param_
=
[]
grad_
=
[]
for
_
in
range
(
5
):
x
=
paddle
.
rand
(
shape
=
[
batch_size
,
input_size
],
dtype
=
"float32"
)
y_pred
=
model
(
x
)
loss
=
y_pred
.
mean
()
loss_
.
append
(
loss
.
item
())
loss
.
backward
()
optimizer
.
step
()
param_
.
append
(
model
.
parameters
()[
9
])
grad_
.
append
(
model
.
parameters
()[
3
].
_grad_ivar
())
optimizer
.
clear_grad
()
return
loss_
,
param_
,
grad_
xpu_state
=
paddle
.
get_rng_state
()
# without recompute
loss_ref
,
param_ref
,
grad_ref
=
run_model
(
xpu_state
,
recompute_block
=
[])
loss
,
param
,
grad
=
run_model
(
xpu_state
,
recompute_block
=
[
1
,
3
])
# The result of the recompute_loss should be the same as the normal_loss.
np
.
testing
.
assert_allclose
(
loss_ref
,
loss
,
rtol
=
1e-05
,
atol
=
1e-05
)
python/paddle/framework/random.py
浏览文件 @
98c17a68
...
...
@@ -45,10 +45,51 @@ def seed(seed):
if
core
.
is_compiled_with_cuda
():
for
i
in
range
(
core
.
get_cuda_device_count
()):
core
.
default_cuda_generator
(
i
).
manual_seed
(
seed
)
elif
core
.
is_compiled_with_xpu
():
for
i
in
range
(
core
.
get_xpu_device_count
()):
core
.
default_xpu_generator
(
i
).
manual_seed
(
seed
)
return
core
.
default_cpu_generator
().
manual_seed
(
seed
)
def
get_rng_state
(
device
=
None
):
"""
Get all random states of random generators of specified device.
Args:
device(str): This parameter determines the specific running device.
It can be ``cpu``, ``gpu``, ``xpu``, Default is None.
If None, return the generators of current device (specified by ``set_device``).
Returns:
GeneratorState: object.
Examples:
.. code-block:: python
import paddle
sts = paddle.get_rng_state()
"""
state_list
=
[]
if
device
is
None
:
place
=
fluid
.
framework
.
_current_expected_place
()
else
:
place
=
device
.
_convert_to_place
(
device
)
if
isinstance
(
place
,
core
.
CPUPlace
):
state_list
.
append
(
core
.
default_cpu_generator
().
get_state
())
elif
isinstance
(
place
,
core
.
CUDAPlace
):
for
i
in
range
(
core
.
get_cuda_device_count
()):
state_list
.
append
(
core
.
default_cuda_generator
(
i
).
get_state
())
elif
isinstance
(
place
,
core
.
XPUPlace
):
for
i
in
range
(
core
.
get_xpu_device_count
()):
state_list
.
append
(
core
.
default_xpu_generator
(
i
).
get_state
())
else
:
raise
ValueError
(
"get_rng_state is not implemented for current device: {}"
.
format
(
place
)
)
return
state_list
def
get_cuda_rng_state
():
"""
...
...
@@ -75,6 +116,59 @@ def get_cuda_rng_state():
return
state_list
def
set_rng_state
(
state_list
,
device
=
None
):
"""
Sets generator state for all device generators.
Args:
state_list(list|tuple): The device states to set back to device generators. state_list is obtained from get_rng_state().
device(str): This parameter determines the specific running device.
It can be ``cpu``, ``gpu``, ``xpu``, Default is None.
If None, return the generators of current device (specified by ``set_device``).
Returns:
None.
Examples:
.. code-block:: python
import paddle
sts = paddle.get_rng_state()
paddle.set_rng_state(sts)
"""
if
device
is
None
:
place
=
fluid
.
framework
.
_current_expected_place
()
else
:
place
=
device
.
_convert_to_place
(
device
)
if
isinstance
(
place
,
core
.
CUDAPlace
):
if
not
len
(
state_list
)
==
core
.
get_cuda_device_count
():
raise
ValueError
(
"Length of gpu state list shoule be equal to the gpu device count"
)
for
i
in
range
(
core
.
get_cuda_device_count
()):
core
.
default_cuda_generator
(
i
).
set_state
(
state_list
[
i
])
elif
isinstance
(
place
,
core
.
XPUPlace
):
if
not
len
(
state_list
)
==
core
.
get_xpu_device_count
():
raise
ValueError
(
"Length of xpu state list shoule be equal to the xpu device count"
)
for
i
in
range
(
core
.
get_xpu_device_count
()):
core
.
default_xpu_generator
(
i
).
set_state
(
state_list
[
i
])
elif
isinstance
(
place
,
core
.
CPUPlace
):
if
not
len
(
state_list
)
==
1
:
raise
ValueError
(
"Length of cpu state list shoule be equal to 1"
)
core
.
default_cpu_generator
().
set_state
(
state_list
[
0
])
else
:
raise
ValueError
(
"set_rng_state is not implemented for current device: {}"
.
format
(
place
)
)
def
set_cuda_rng_state
(
state_list
):
"""
...
...
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录