Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
PaddlePaddle
Paddle
提交
3fbc3cf4
P
Paddle
项目概览
PaddlePaddle
/
Paddle
1 年多 前同步成功
通知
2302
Star
20931
Fork
5422
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
1423
列表
看板
标记
里程碑
合并请求
543
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
P
Paddle
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
1,423
Issue
1,423
列表
看板
标记
里程碑
合并请求
543
合并请求
543
Pages
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
提交
Issue看板
未验证
提交
3fbc3cf4
编写于
1月 13, 2021
作者:
J
JZ-LIANG
提交者:
GitHub
1月 13, 2021
浏览文件
操作
浏览文件
下载
电子邮件补丁
差异文件
Recompute Offload (#30233) (#30372)
上级
020e2431
变更
15
隐藏空白更改
内联
并排
Showing
15 changed file
with
960 addition
and
33 deletion
+960
-33
paddle/fluid/framework/distributed_strategy.proto
paddle/fluid/framework/distributed_strategy.proto
+5
-1
paddle/fluid/framework/ir/squared_mat_sub_fuse_pass.cc
paddle/fluid/framework/ir/squared_mat_sub_fuse_pass.cc
+1
-1
paddle/fluid/operators/fill_constant_op.cc
paddle/fluid/operators/fill_constant_op.cc
+17
-1
paddle/fluid/operators/fill_constant_op.h
paddle/fluid/operators/fill_constant_op.h
+38
-7
paddle/fluid/operators/memcpy_op.cc
paddle/fluid/operators/memcpy_op.cc
+146
-0
paddle/fluid/operators/memcpy_op.h
paddle/fluid/operators/memcpy_op.h
+75
-0
python/paddle/distributed/fleet/base/distributed_strategy.py
python/paddle/distributed/fleet/base/distributed_strategy.py
+30
-4
python/paddle/distributed/fleet/meta_optimizers/recompute_optimizer.py
.../distributed/fleet/meta_optimizers/recompute_optimizer.py
+5
-1
python/paddle/fluid/backward.py
python/paddle/fluid/backward.py
+59
-5
python/paddle/fluid/optimizer.py
python/paddle/fluid/optimizer.py
+375
-2
python/paddle/fluid/tests/unittests/CMakeLists.txt
python/paddle/fluid/tests/unittests/CMakeLists.txt
+1
-0
python/paddle/fluid/tests/unittests/fleet_meta_optimizer_base.py
...paddle/fluid/tests/unittests/fleet_meta_optimizer_base.py
+7
-0
python/paddle/fluid/tests/unittests/test_fleet_recompute_meta_optimizer.py
...id/tests/unittests/test_fleet_recompute_meta_optimizer.py
+14
-0
python/paddle/fluid/tests/unittests/test_fleet_sharding_meta_optimizer.py
...uid/tests/unittests/test_fleet_sharding_meta_optimizer.py
+11
-11
python/paddle/fluid/tests/unittests/test_memcpy_op.py
python/paddle/fluid/tests/unittests/test_memcpy_op.py
+176
-0
未找到文件。
paddle/fluid/framework/distributed_strategy.proto
浏览文件 @
3fbc3cf4
...
...
@@ -22,7 +22,11 @@ enum Mode {
HETER
=
4
;
// support XPU and GPU computing server
}
message
RecomputeConfig
{
repeated
string
checkpoints
=
1
;
}
message
RecomputeConfig
{
repeated
string
checkpoints
=
1
;
optional
bool
enable_offload
=
2
[
default
=
false
];
repeated
int32
checkpoint_shape
=
3
;
}
message
ShardingConfig
{
optional
float
fuse_broadcast_MB
=
1
[
default
=
32.0
];
...
...
paddle/fluid/framework/ir/squared_mat_sub_fuse_pass.cc
浏览文件 @
3fbc3cf4
...
...
@@ -394,5 +394,5 @@ REGISTER_PASS_CAPABILITY(squared_mat_sub_fuse_pass)
.
EQ
(
"square"
,
0
)
.
LE
(
"elementwise_mul"
,
1
)
.
LE
(
"elementwise_sub"
,
1
)
.
EQ
(
"fill_constant"
,
1
)
.
LE
(
"fill_constant"
,
2
)
.
EQ
(
"fusion_squared_mat_sub"
,
0
));
paddle/fluid/operators/fill_constant_op.cc
浏览文件 @
3fbc3cf4
...
...
@@ -116,6 +116,15 @@ class FillConstantOpMaker : public framework::OpProtoAndCheckerMaker {
"memory. Otherwise, fill output variable to the running "
"device"
)
.
SetDefault
(
false
);
AddAttr
<
int
>
(
"place_type"
,
"(int, default -1) allow mamually setting place where the "
"variable should be hold. "
"-1: not set manually, determine the place by executor. "
"0: CPUPlace. "
"1: CUDAPlace. "
"2: CUDAPinnedPlace. "
"3: XPUPlace. "
)
.
SetDefault
(
-
1
);
AddOutput
(
"Out"
,
"(Tensor) Tensor of specified shape will be filled "
"with the specified value"
);
...
...
@@ -154,4 +163,11 @@ REGISTER_OP_VERSION(fill_constant)
)ROC"
,
paddle
::
framework
::
compatible
::
OpVersionDesc
().
NewInput
(
"ValueTensor"
,
"In order to support new feature tensor support of Value"
));
"In order to support new feature tensor support of Value"
))
.
AddCheckpoint
(
R"ROC(
Upgrade fill_constant to add a new attribute [place_type].
)ROC"
,
paddle
::
framework
::
compatible
::
OpVersionDesc
().
NewAttr
(
"place_type"
,
"In order to support tensor in CUDAPinnedPlace and XPUPlace"
,
-
1
));
paddle/fluid/operators/fill_constant_op.h
浏览文件 @
3fbc3cf4
...
...
@@ -39,6 +39,7 @@ class FillConstantKernel : public framework::OpKernel<T> {
auto
str_value
=
ctx
.
Attr
<
std
::
string
>
(
"str_value"
);
auto
float_value
=
ctx
.
Attr
<
float
>
(
"value"
);
auto
force_cpu
=
ctx
.
Attr
<
bool
>
(
"force_cpu"
);
auto
place_type
=
ctx
.
Attr
<
int
>
(
"place_type"
);
framework
::
Tensor
*
tensor
=
nullptr
;
framework
::
Variable
*
out_var
=
ctx
.
OutputVar
(
"Out"
);
...
...
@@ -101,29 +102,59 @@ class FillConstantKernel : public framework::OpKernel<T> {
platform
::
DeviceContextPool
&
pool
=
platform
::
DeviceContextPool
::
Instance
();
auto
&
dev_ctx
=
*
pool
.
Get
(
ctx
.
GetPlace
());
bool
cpu_place
=
force_cpu
||
ctx
.
GetPlace
()
==
platform
::
CPUPlace
();
if
(
cpu_place
)
{
int
actual_place
=
place_type
;
if
(
actual_place
==
-
1
)
{
bool
cpu_place
=
force_cpu
||
ctx
.
GetPlace
()
==
platform
::
CPUPlace
();
if
(
cpu_place
)
{
actual_place
=
0
;
}
else
if
(
platform
::
is_gpu_place
(
ctx
.
GetPlace
()))
{
actual_place
=
1
;
}
else
if
(
platform
::
is_xpu_place
(
ctx
.
GetPlace
()))
{
actual_place
=
3
;
}
}
if
(
actual_place
==
0
)
{
tensor
->
mutable_data
(
platform
::
CPUPlace
(),
data_type
);
math
::
SetConstant
<
platform
::
CPUDeviceContext
,
T
>
functor
;
functor
(
reinterpret_cast
<
const
platform
::
CPUDeviceContext
&>
(
dev_ctx
),
tensor
,
static_cast
<
T
>
(
value
));
}
}
else
if
(
actual_place
==
1
)
{
#ifdef PADDLE_WITH_CUDA
if
(
!
cpu_place
)
{
tensor
->
mutable_data
(
ctx
.
GetPlace
(),
data_type
);
math
::
SetConstant
<
platform
::
CUDADeviceContext
,
T
>
functor
;
functor
(
reinterpret_cast
<
const
platform
::
CUDADeviceContext
&>
(
dev_ctx
),
tensor
,
static_cast
<
T
>
(
value
));
}
#else
PADDLE_THROW
(
platform
::
errors
::
PreconditionNotMet
(
"PaddlePaddle should compile with GPU."
));
#endif
}
else
if
(
actual_place
==
2
)
{
#ifdef PADDLE_WITH_CUDA
tensor
->
mutable_data
(
platform
::
CUDAPinnedPlace
(),
data_type
);
math
::
SetConstant
<
platform
::
CPUDeviceContext
,
T
>
functor
;
functor
(
reinterpret_cast
<
const
platform
::
CPUDeviceContext
&>
(
dev_ctx
),
tensor
,
static_cast
<
T
>
(
value
));
#else
PADDLE_THROW
(
platform
::
errors
::
PreconditionNotMet
(
"PaddlePaddle should compile with GPU."
));
#endif
}
else
if
(
actual_place
==
3
)
{
#ifdef PADDLE_WITH_XPU
if
(
!
cpu_place
)
{
tensor
->
mutable_data
(
ctx
.
GetPlace
(),
data_type
);
math
::
SetConstant
<
platform
::
XPUDeviceContext
,
T
>
functor
;
functor
(
reinterpret_cast
<
const
platform
::
XPUDeviceContext
&>
(
dev_ctx
),
tensor
,
static_cast
<
T
>
(
value
));
}
#else
PADDLE_THROW
(
platform
::
errors
::
PreconditionNotMet
(
"PaddlePaddle should compile with XPU."
));
#endif
}
else
{
PADDLE_THROW
(
platform
::
errors
::
Unimplemented
(
"Could NOT determine the place of variable, place_type = %d ."
,
actual_place
));
}
}
};
}
// namespace operators
...
...
paddle/fluid/operators/memcpy_op.cc
0 → 100644
浏览文件 @
3fbc3cf4
/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/memcpy_op.h"
#include <string>
namespace
paddle
{
namespace
framework
{
class
OpDesc
;
class
Variable
;
}
// namespace framework
namespace
imperative
{
class
OpBase
;
}
// namespace imperative
namespace
platform
{
struct
CPUPlace
;
struct
CUDAPlace
;
struct
float16
;
}
// namespace platform
}
// namespace paddle
namespace
paddle
{
namespace
operators
{
class
MemcpyOp
:
public
framework
::
OperatorWithKernel
{
public:
MemcpyOp
(
const
std
::
string
&
type
,
const
framework
::
VariableNameMap
&
inputs
,
const
framework
::
VariableNameMap
&
outputs
,
const
framework
::
AttributeMap
&
attrs
)
:
OperatorWithKernel
(
type
,
inputs
,
outputs
,
attrs
)
{}
void
InferShape
(
framework
::
InferShapeContext
*
ctx
)
const
override
{
auto
type
=
ctx
->
GetInputsVarType
(
"X"
)[
0
];
if
(
type
==
framework
::
proto
::
VarType
::
SELECTED_ROWS
||
type
==
framework
::
proto
::
VarType
::
LOD_TENSOR
)
{
ctx
->
SetOutputDim
(
"Out"
,
ctx
->
GetInputDim
(
"X"
));
if
(
type
==
framework
::
proto
::
VarType
::
LOD_TENSOR
)
{
ctx
->
ShareLoD
(
"X"
,
/*->*/
"Out"
);
}
}
}
protected:
framework
::
OpKernelType
GetKernelTypeForVar
(
const
std
::
string
&
var_name
,
const
framework
::
Tensor
&
tensor
,
const
framework
::
OpKernelType
&
expected_kernel_type
)
const
override
{
return
framework
::
OpKernelType
(
expected_kernel_type
.
data_type_
,
expected_kernel_type
.
place_
,
tensor
.
layout
());
}
framework
::
OpKernelType
GetExpectedKernelType
(
const
framework
::
ExecutionContext
&
ctx
)
const
override
{
return
framework
::
OpKernelType
(
OperatorWithKernel
::
IndicateVarDataType
(
ctx
,
"X"
),
ctx
.
device_context
());
}
};
class
MemcpyInferVarType
:
public
framework
::
VarTypeInference
{
public:
void
operator
()(
framework
::
InferVarTypeContext
*
ctx
)
const
override
{
ctx
->
SyncTypeAndDataType
(
"X"
,
"Out"
);
}
};
class
MemcpyKernel
{
public:
void
operator
()(
const
framework
::
ExecutionContext
&
ctx
)
const
{
auto
*
x
=
ctx
.
InputVar
(
"X"
);
if
(
x
==
nullptr
)
{
return
;
}
PADDLE_ENFORCE_EQ
(
ctx
.
HasOutput
(
"Out"
),
true
,
platform
::
errors
::
NotFound
(
"Output(Out) of memcpy_op is not found."
));
auto
*
out
=
ctx
.
OutputVar
(
"Out"
);
platform
::
DeviceContextPool
&
pool
=
platform
::
DeviceContextPool
::
Instance
();
auto
&
dev_ctx
=
*
pool
.
Get
(
ctx
.
GetPlace
());
auto
dst_place_type
=
ctx
.
Attr
<
int
>
(
"dst_place_type"
);
framework
::
VisitVarType
(
*
x
,
MemcpyFunctor
(
out
,
dev_ctx
,
dst_place_type
));
}
};
class
MemcpyOpProtoMaker
:
public
framework
::
OpProtoAndCheckerMaker
{
public:
void
Make
()
override
{
AddInput
(
"X"
,
"(LoDTensor) The input variable "
);
AddOutput
(
"Out"
,
"(LoDTensor) The type of output "
"is the same as input X."
);
AddAttr
<
int
>
(
"dst_place_type"
,
"Determine the dst place of tensor copy. "
"By Now it ONLY support CUDAPlace and CUDAPinnedPlace. Other "
"place type is Unimplemented and will cause ERROR."
"0: dst is on CPUPlace. "
"1: dst is on CUDAPlace. "
"2: dst is on CUDAPinnedPlace. "
"3: dst is on XPUPlace. "
);
AddComment
(
R"DOC(
Memcpy Operator.
By now, it ONLY supports the memcopy between CUDAPinnedPlace and CUDAPlace,
and used as an internal op by Recompute-Offload.
You would have to update it if you want other more capacities.
Out = X, when type in [LoDTensor]
raise error if the type is not listed above.
)DOC"
);
}
};
}
// namespace operators
}
// namespace paddle
namespace
ops
=
paddle
::
operators
;
namespace
plat
=
paddle
::
platform
;
REGISTER_OPERATOR
(
memcpy
,
ops
::
MemcpyOp
,
ops
::
MemcpyOpProtoMaker
,
ops
::
MemcpyInferVarType
,
paddle
::
framework
::
EmptyGradOpMaker
<
paddle
::
framework
::
OpDesc
>
,
paddle
::
framework
::
EmptyGradOpMaker
<
paddle
::
imperative
::
OpBase
>
);
REGISTER_OP_CPU_KERNEL_FUNCTOR
(
memcpy
,
float
,
ops
::
MemcpyKernel
,
double
,
ops
::
MemcpyKernel
,
int
,
ops
::
MemcpyKernel
,
int64_t
,
ops
::
MemcpyKernel
,
bool
,
ops
::
MemcpyKernel
,
plat
::
float16
,
ops
::
MemcpyKernel
);
#ifdef PADDLE_WITH_CUDA
REGISTER_OP_CUDA_KERNEL_FUNCTOR
(
memcpy
,
float
,
ops
::
MemcpyKernel
,
double
,
ops
::
MemcpyKernel
,
int
,
ops
::
MemcpyKernel
,
int64_t
,
ops
::
MemcpyKernel
,
bool
,
ops
::
MemcpyKernel
,
plat
::
float16
,
ops
::
MemcpyKernel
);
#endif
paddle/fluid/operators/memcpy_op.h
0 → 100755
浏览文件 @
3fbc3cf4
/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include "paddle/fluid/framework/data_type.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/var_type.h"
#include "paddle/fluid/platform/device_context.h"
namespace
paddle
{
namespace
framework
{
class
LoDTensor
;
class
Variable
;
}
// namespace framework
}
// namespace paddle
namespace
paddle
{
namespace
operators
{
class
MemcpyFunctor
{
public:
MemcpyFunctor
(
framework
::
Variable
*
out
,
const
platform
::
DeviceContext
&
dev_ctx
,
const
int
dst_place_type
)
:
out_
(
out
),
dev_ctx_
(
dev_ctx
),
dst_place_type_
(
dst_place_type
)
{}
void
operator
()(
const
framework
::
LoDTensor
&
lod_tensor
)
const
{
auto
&
out_tensor
=
*
out_
->
GetMutable
<
framework
::
LoDTensor
>
();
if
(
dst_place_type_
==
3
)
{
framework
::
TensorCopy
(
lod_tensor
,
platform
::
CUDAPinnedPlace
(),
dev_ctx_
,
&
out_tensor
);
}
else
if
(
dst_place_type_
==
2
)
{
framework
::
TensorCopy
(
lod_tensor
,
dev_ctx_
.
GetPlace
(),
dev_ctx_
,
&
out_tensor
);
}
else
{
PADDLE_THROW
(
platform
::
errors
::
Unimplemented
(
"memcpy dst_place_type: %d is not supported yet."
,
dst_place_type_
));
}
out_tensor
.
set_lod
(
lod_tensor
.
lod
());
}
void
operator
()(
const
framework
::
SelectedRows
&
rows
)
const
{
// (JZ-LIANG) to support SelectedRows
PADDLE_THROW
(
platform
::
errors
::
Unimplemented
(
"Memcpy for SelectedRows is NOT support yet."
));
}
template
<
typename
T
>
void
operator
()(
const
T
&
v
)
const
{
PADDLE_ENFORCE_EQ
(
true
,
false
,
platform
::
errors
::
PermissionDenied
(
"Not support type for Memcpy op with type %s"
,
typeid
(
T
).
name
()));
}
private:
framework
::
Variable
*
out_
;
const
platform
::
DeviceContext
&
dev_ctx_
;
const
int
dst_place_type_
;
};
}
// namespace operators
}
// namespace paddle
python/paddle/distributed/fleet/base/distributed_strategy.py
浏览文件 @
3fbc3cf4
...
...
@@ -632,8 +632,20 @@ class DistributedStrategy(object):
@
property
def
recompute_configs
(
self
):
"""
Set recompute configurations. In general, the recompute strategy of current
implementation should have some manually assign checkpoints
Set recompute configurations.
**Note**:
checkpoints(list): list of string name of checkpoints. In general, the recompute
strategy of current implementation should have some manually assign checkpoints.
enable_offload(bool): enable recompute checkpoints offload feature. this feature
will offload the checkpoint to host memory to allow even larger batch size. since
the memcpy from host to device takes time, it is a trade off between larger batch
size and training speed.
checkpoint_shape(list): list of int that specific the shape of checkpoint. so far
recompute-offload requires that all checkpoint to be same shape, and every dimension
specific here should be determined ("-1" is not allowed).
Examples:
...
...
@@ -642,7 +654,10 @@ class DistributedStrategy(object):
import paddle.distributed.fleet as fleet
strategy = fleet.DistributedStrategy()
strategy.recompute = True
strategy.recompute_configs = {"checkpoints": ["x", "y"]}
strategy.recompute_configs = {
"checkpoints": ["x", "y"],
"enable_offload": True,
"checkpoint_shape": [100, 512, 1024] }
"""
return
get_msg_dict
(
self
.
strategy
.
recompute_configs
)
...
...
@@ -692,6 +707,14 @@ class DistributedStrategy(object):
This configuration will affect the communication speed in sharding training,
and should be an empirical value decided by your model size and network topology.
hybrid_dp(bool): enable hybrid data parallelism above the sharding parallelism.
you are supposed to have at least double the number of gpu you have in normal sharding
training to enable this feature.
sharding_group_size(int): attribute of hybrid_dp. specific the the number of gpus within
each sharding group; and therefore, the number of hybrid data parallelism ways will be equal
to (global_size / sharding_group_size).
Examples:
.. code-block:: python
...
...
@@ -699,7 +722,10 @@ class DistributedStrategy(object):
import paddle.distributed.fleet as fleet
strategy = fleet.DistributedStrategy()
strategy.sharding = True
strategy.sharding_configs = {"fuse_broadcast_MB": 32}
strategy.sharding_configs = {
"fuse_broadcast_MB": 32,
"hybrid_dp": True,
"sharding_group_size": 8}
"""
return
get_msg_dict
(
self
.
strategy
.
sharding_configs
)
...
...
python/paddle/distributed/fleet/meta_optimizers/recompute_optimizer.py
浏览文件 @
3fbc3cf4
...
...
@@ -39,9 +39,13 @@ class RecomputeOptimizer(MetaOptimizerBase):
return
configs
=
self
.
user_defined_strategy
.
recompute_configs
self
.
wrapped_opt
=
RO
(
self
.
inner_opt
)
self
.
wrapped_opt
.
_set_checkpoints
(
list
(
configs
[
"checkpoints"
]))
if
configs
[
"enable_offload"
]:
self
.
wrapped_opt
.
_enable_offload
()
# TODO(JZ-LIANG) might found a way to infer the checkpoint shape automatically
checkpoint_shapes
=
list
(
configs
[
"checkpoint_shape"
])
self
.
wrapped_opt
.
checkpoint_shape
=
checkpoint_shapes
def
_can_apply
(
self
):
if
not
self
.
role_maker
.
_is_collective
:
...
...
python/paddle/fluid/backward.py
浏览文件 @
3fbc3cf4
...
...
@@ -99,8 +99,32 @@ class ProgramStats(object):
max_op_idx
=
max
(
max_op_idx
,
idx
)
if
min_op_idx
>=
max_op_idx
:
return
False
,
min_op_idx
,
max_op_idx
return
True
,
min_op_idx
,
max_op_idx
def
_update_segment_start
(
self
,
min_idx
,
pre_segment_end_idx
):
"""
persist vars of amp-related cast should be included in recompute segment
"""
def
is_amp_cast
(
op
):
return
op
.
desc
.
type
()
==
'cast'
and
self
.
block
.
var
(
op
.
desc
.
input_arg_names
()[
0
]).
persistable
idx_
=
min_idx
-
1
updated_min_idx
=
min_idx
while
idx_
>
pre_segment_end_idx
:
if
is_amp_cast
(
self
.
ops
[
idx_
]):
_logger
.
debug
(
"found amp-cast op: {}, : {}"
.
format
(
self
.
ops
[
idx_
].
desc
.
type
(),
self
.
ops
[
idx_
].
desc
.
input_arg_names
()[
0
]))
updated_min_idx
=
idx_
idx_
-=
1
else
:
break
return
updated_min_idx
def
build_stats
(
self
):
for
i
,
op
in
enumerate
(
self
.
ops
):
self
.
op_deps
[
i
]
=
{
"in_ops"
:
[],
"out_ops"
:
[]}
...
...
@@ -751,20 +775,29 @@ def _append_backward_ops_with_checkpoints_(
if
name
not
in
program_stat
.
var_op_deps
:
break
op_idx
=
program_stat
.
var_op_deps
[
name
][
"var_as_output_ops"
]
# only count the last generate op
for
idx
in
op_idx
:
max_op_idx
=
max
(
max_op_idx
,
idx
)
if
max_op_idx
>
0
:
segments
.
append
([
0
,
max_op_idx
+
1
])
else
:
start_idx
=
0
pre_segment_end_idx
=
-
1
while
True
:
_logger
.
debug
(
"FW op range[0] - [{}]"
.
format
(
len
(
ops
)))
if
start_idx
>=
len
(
checkpoints_name
)
-
1
:
break
# min_idx: checkpoint_1' s input op
# max_idx: checkpoint_2' s output op
flag
,
min_idx
,
max_idx
=
program_stat
.
is_subgraph
(
[
checkpoints_name
[
start_idx
]],
[
checkpoints_name
[
start_idx
+
1
]])
if
flag
:
# max_idx + 1 since the exact and used segment end idx is max_idx
min_idx
=
program_stat
.
_update_segment_start
(
min_idx
,
pre_segment_end_idx
)
segments
.
append
([
min_idx
,
max_idx
+
1
])
start_idx
+=
1
if
segments
!=
[]
and
segments
[
0
][
0
]
!=
0
:
...
...
@@ -772,12 +805,31 @@ def _append_backward_ops_with_checkpoints_(
else
:
recompute_segments
=
segments
for
i
,
(
idx1
,
idx2
)
in
enumerate
(
recompute_segments
):
_logger
.
debug
(
"recompute segment[{}]"
.
format
(
i
))
_logger
.
debug
(
"segment start op: [{}]: [{}]"
.
format
(
ops
[
idx1
].
desc
.
type
(
),
ops
[
idx1
].
desc
.
input_arg_names
()))
_logger
.
debug
(
"segment end op: [{}]: [{}]"
.
format
(
ops
[
idx2
-
1
].
desc
.
type
(),
ops
[
idx2
-
1
].
desc
.
input_arg_names
()))
_logger
.
debug
(
"recompute segment[{}]"
.
format
(
i
))
_logger
.
debug
(
"segment start op: [{}]: [{}]"
.
format
(
ops
[
idx1
].
desc
.
type
(
),
ops
[
idx1
].
desc
.
input_arg_names
()))
_logger
.
debug
(
"segment end op: [{}]: [{}]"
.
format
(
ops
[
idx2
-
1
].
desc
.
type
(),
ops
[
idx2
-
1
].
desc
.
input_arg_names
()))
# 2) go through all forward ops and induct all variables that will be hold in memory
vars_should_be_hold
=
[]
# a. variables that are used across segments will be held in memory
for
segment
in
recompute_segments
:
vars_should_be_hold
.
extend
(
program_stat
.
get_out_of_subgraph_vars
(
segment
[
0
],
segment
[
1
]))
cross_vars
=
set
(
vars_should_be_hold
)
-
set
(
checkpoints_name
)
_logger
.
debug
(
"found [{}] vars which cross recompute segment: [{}], better checkpoints might be set to reduce those vars"
.
format
(
\
len
(
cross_vars
),
cross_vars
))
_logger
.
debug
(
"found [{}] vars which cross recompute segment: [{}], better checkpoints might be set to reduce those vars"
.
format
(
\
len
(
cross_vars
),
cross_vars
))
# b. output of seed op should be kept in memory
vars_should_be_hold
.
extend
(
program_stat
.
get_reserved_vars
())
# c. input variables are checkpoints
...
...
@@ -792,8 +844,6 @@ def _append_backward_ops_with_checkpoints_(
max_calculated_op_position
=
len
(
ops
)
if
recompute_segments
==
[]:
# if there is no recompute segment, add backward ops like
# _append_backward_ops_ function
gap_ops
=
ops
[
0
:
max_calculated_op_position
]
for
op
in
reversed
(
gap_ops
):
if
op
.
has_attr
(
"sub_block"
):
...
...
@@ -807,7 +857,6 @@ def _append_backward_ops_with_checkpoints_(
grad_to_var
.
update
(
op_grad_to_var
)
for
i
,
segment
in
enumerate
(
recompute_segments
[::
-
1
]):
# add grad op for ops not in any segments
gap_ops
=
ops
[
segment
[
1
]:
max_calculated_op_position
]
max_calculated_op_position
=
segment
[
0
]
for
op
in
reversed
(
gap_ops
):
...
...
@@ -851,7 +900,7 @@ def _append_backward_ops_with_checkpoints_(
# added_descs should be in grad_op_descs because it is backward op desc
grad_op_descs
.
extend
(
buffer_descs
)
# 3.c. add backward ops
of current recomputation ops
# 3.c. add backward ops
for all ops in current segment
for
op_desc
in
reversed
(
added_descs
):
grad_op_desc
,
op_grad_to_var
=
core
.
get_grad_op_desc
(
op_desc
,
cpt
.
to_text
(
no_grad_dict
[
block
.
idx
]),
[])
...
...
@@ -1480,9 +1529,11 @@ def append_backward(loss,
# TODO: support _append_backward_ops_with_checkpoints_ in
# sub-block (control flow)
is_recompute
=
False
if
checkpoints
!=
None
and
\
isinstance
(
checkpoints
,
list
)
and
\
len
(
checkpoints
)
>
0
:
is_recompute
=
True
program_stat
,
checkpoint_names
,
\
vars_should_be_hold
,
\
recompute_segments
=
\
...
...
@@ -1577,7 +1628,10 @@ def append_backward(loss,
attr_val
.
extend
(
g
.
op
.
attr
(
op_role_var_attr_name
))
g
.
op
.
_set_attr
(
op_role_var_attr_name
,
attr_val
)
return
params_and_grads
if
is_recompute
:
return
params_and_grads
,
checkpoint_names
else
:
return
params_and_grads
def
_as_list
(
x
):
...
...
python/paddle/fluid/optimizer.py
浏览文件 @
3fbc3cf4
...
...
@@ -4593,6 +4593,7 @@ class RecomputeOptimizer(Optimizer):
self
.
_checkpoints
=
None
self
.
_learning_rate
=
self
.
_optimizer
.
_learning_rate
self
.
_learning_rate_map
=
self
.
_optimizer
.
_learning_rate_map
self
.
enable_offload
=
False
def
_set_checkpoints
(
self
,
checkpoints
):
"""
...
...
@@ -4608,6 +4609,10 @@ class RecomputeOptimizer(Optimizer):
),
"_checkpoints should be a list of Variable or a list of String"
self
.
_checkpoints
=
checkpoints
# should enable offload before calling backward
def
_enable_offload
(
self
):
self
.
enable_offload
=
True
@
framework
.
deprecate_stat_dict
def
load
(
self
,
state_dict
):
"""
...
...
@@ -4696,6 +4701,358 @@ class RecomputeOptimizer(Optimizer):
return
self
.
_optimizer
.
apply_gradients
(
params_grads
=
params_grads
)
def
_creat_vars
(
self
,
varname
):
pinned_var_name
=
unique_name
.
generate
(
varname
+
"@Pinned"
)
fetched_var_name
=
unique_name
.
generate
(
varname
+
"@Fetch"
)
pinned_var
=
self
.
_main_program
.
global_block
().
create_var
(
name
=
pinned_var_name
,
shape
=
self
.
checkpoint_shape
,
dtype
=
self
.
_main_program
.
global_block
().
var
(
varname
).
dtype
,
persistable
=
False
,
stop_gradient
=
True
)
fetch_var
=
self
.
_main_program
.
global_block
().
create_var
(
name
=
fetched_var_name
,
shape
=
self
.
checkpoint_shape
,
dtype
=
self
.
_main_program
.
global_block
().
var
(
varname
).
dtype
,
persistable
=
False
,
stop_gradient
=
False
)
return
pinned_var_name
,
fetched_var_name
def
_append_fill_constant_ops
(
self
,
startup_program
):
"""
add fill_constant_ops to the end of the prog
we should fill the pinned vars before runing the main_prog
to instantiate their tensor hold_, which could tell us whether
the host memory could hold all the checkpoints from all the
GPU devices in this node.
"""
op_role
=
0
block
=
startup_program
.
global_block
()
fill_constant_vars
=
self
.
checkpoint_name2pinned_name
.
values
()
OP_ROLE_KEY
=
core
.
op_proto_and_checker_maker
.
kOpRoleAttrName
()
for
varname
in
fill_constant_vars
:
var
=
self
.
_main_program
.
global_block
().
var
(
varname
)
# NOTE (JZ-LIANG) to pre-allocate the CUDAPinned MEM
pinned_var
=
block
.
create_var
(
name
=
varname
,
shape
=
self
.
checkpoint_shape
,
dtype
=
self
.
_main_program
.
global_block
().
var
(
var
.
name
).
dtype
,
persistable
=
False
,
stop_gradient
=
True
)
block
.
append_op
(
type
=
'fill_constant'
,
outputs
=
{
'Out'
:
varname
},
attrs
=
{
"shape"
:
var
.
shape
,
"dtype"
:
var
.
dtype
,
"value"
:
0.0
,
"place_type"
:
2
,
OP_ROLE_KEY
:
op_role
,
})
return
def
_insert_async_memcpy_op
(
self
,
insert_idx
,
src_varname
,
dst_varname
,
op_role
,
kind
):
OP_ROLE_KEY
=
core
.
op_proto_and_checker_maker
.
kOpRoleAttrName
()
self
.
block
.
_insert_op_without_sync
(
insert_idx
,
type
=
'memcpy'
,
inputs
=
{
'X'
:
[
self
.
_main_program
.
global_block
().
var
(
src_varname
)]},
outputs
=
{
'Out'
:
[
self
.
_main_program
.
global_block
().
var
(
dst_varname
)]
},
attrs
=
{
"dst_place_type"
:
int
(
kind
),
OP_ROLE_KEY
:
op_role
})
def
_insert_fetch_op
(
self
,
idx
,
varname
):
assert
varname
in
self
.
checkpoint_name2pinned_name
,
"Try to fetch {} from Pinned Memory, but it is NOT a checkpoint"
.
format
(
varname
)
pinned_varname
=
self
.
checkpoint_name2pinned_name
[
varname
]
fetch_varname
=
self
.
checkpoint_name2fetch_name
[
varname
]
self
.
_insert_async_memcpy_op
(
idx
,
pinned_varname
,
fetch_varname
,
1
,
2
)
def
_insert_offload_op
(
self
,
idx
,
varname
):
assert
varname
in
self
.
checkpoint_name2pinned_name
,
"Try to offload {} to Pinned Memory, but it is NOT a checkpoint"
.
format
(
varname
)
pinned_varname
=
self
.
checkpoint_name2pinned_name
[
varname
]
self
.
_insert_async_memcpy_op
(
idx
,
varname
,
pinned_varname
,
0
,
3
)
def
_insert_sync_op
(
self
,
op_idx
,
checkpoint_name
):
# single stream offload no need sync
pass
def
_record_fetch_op
(
self
,
idx
):
assert
len
(
self
.
un_fetch_checkpoint_names
)
>
0
,
"Could NOT found checkpoint to fetch"
checkpoint_name
=
self
.
un_fetch_checkpoint_names
.
pop
(
-
1
)
logging
.
debug
(
"Record fetch [{}]"
.
format
(
checkpoint_name
))
self
.
idx2insertions
[
idx
]
=
(
"fetch"
,
checkpoint_name
)
return
checkpoint_name
def
_record_offload_op
(
self
,
idx
,
checkpoint_name
):
expected_checkpoint_name
=
self
.
un_offload_checkpoint_names
.
pop
(
0
)
assert
checkpoint_name
==
expected_checkpoint_name
,
"expected to offload [{}] but got [{}]"
.
format
(
expected_checkpoint_name
,
checkpoint_name
)
logging
.
debug
(
"Record offload [{}]"
.
format
(
checkpoint_name
))
self
.
idx2insertions
[
idx
]
=
(
"offload"
,
checkpoint_name
)
def
_record_sync_op
(
self
,
idx
,
checkpoint_name
):
assert
checkpoint_name
not
in
self
.
synced_checkpoints
,
"Try to sync the checkpoint [{}] twice"
.
format
(
checkpoint_name
)
self
.
synced_checkpoints
.
add
(
checkpoint_name
)
logging
.
debug
(
"Record offload sync [{}]"
.
format
(
checkpoint_name
))
self
.
idx2insertions
[
idx
]
=
(
"sync"
,
checkpoint_name
)
def
_parse_backward
(
self
):
self
.
idx2insertions
=
{}
# don't offload the last checkpoints, to favor throughput
self
.
un_fetch_checkpoint_names
=
self
.
sorted_checkpoint_names
[:]
self
.
un_fetch_checkpoint_names
.
pop
(
-
1
)
need_fetch_checkpoint_names
=
self
.
un_fetch_checkpoint_names
[:]
self
.
checkpoint_usage_count
=
{}
for
checkpoint_name
in
self
.
un_fetch_checkpoint_names
:
self
.
checkpoint_usage_count
[
checkpoint_name
]
=
0
self
.
bw_strart_op_idx
=
len
(
self
.
block
.
ops
)
for
idx
,
op
in
enumerate
(
self
.
block
.
ops
):
if
int
(
op
.
desc
.
attr
(
"op_role"
))
==
1
:
self
.
bw_strart_op_idx
=
idx
break
assert
self
.
bw_strart_op_idx
<
len
(
self
.
block
.
ops
),
"Could NOT found backword op in prog"
# fetch second to last checkpoint at the beginning of BW
fetched_checkpoint_varname
=
self
.
_record_fetch_op
(
self
.
bw_strart_op_idx
)
last_last_fetch_checkpoint
=
None
for
i
,
op
in
enumerate
(
self
.
block
.
ops
[
self
.
bw_strart_op_idx
:]):
idx
=
self
.
bw_strart_op_idx
+
i
input_vars
=
op
.
desc
.
input_arg_names
()
for
input_var
in
input_vars
:
if
input_var
in
need_fetch_checkpoint_names
:
if
input_var
not
in
self
.
un_fetch_checkpoint_names
:
# fetch the offloade checkpoint when the first usage of its previous one
if
self
.
checkpoint_usage_count
[
input_var
]
==
0
:
# TODO (JZ-LIANG) sync memcpy_stream if extra stream for memcpy
second_to_last_fetch_checkpoint
=
fetched_checkpoint_varname
# there is NO fetch ahead the first checkpoint
if
input_var
!=
self
.
sorted_checkpoint_names
[
0
]:
fetched_checkpoint_varname
=
self
.
_record_fetch_op
(
idx
)
# should check the current used checkpoint is ths last fetch one
assert
second_to_last_fetch_checkpoint
==
input_var
,
"Current recompute segment should use [{}] BUT got [{}]"
.
format
(
second_to_last_fetch_checkpoint
,
input_var
)
# rename
self
.
block
.
ops
[
idx
].
_rename_input
(
input_var
,
self
.
checkpoint_name2fetch_name
[
input_var
])
self
.
checkpoint_usage_count
[
input_var
]
+=
1
else
:
raise
ValueError
(
"use checkpoint [{}] before fetch in BW"
.
format
(
input_var
))
assert
len
(
self
.
un_fetch_checkpoint_names
)
==
0
,
"{} checkpoints have NOT been Recorded"
.
format
(
self
.
un_fetch_checkpoint_names
)
def
_update_backward
(
self
):
if
len
(
self
.
idx2insertions
)
==
0
:
return
total_op
=
len
(
self
.
block
.
ops
)
for
op_idx
in
reversed
(
range
(
self
.
bw_strart_op_idx
,
total_op
)):
if
op_idx
in
self
.
idx2insertions
:
operation
,
checkpoint_name
=
self
.
idx2insertions
[
op_idx
]
if
operation
==
"fetch"
:
self
.
_insert_fetch_op
(
op_idx
,
checkpoint_name
)
logging
.
debug
(
"Insert [{}] fetch op."
.
format
(
checkpoint_name
))
del
self
.
idx2insertions
[
op_idx
]
elif
operation
==
"sync"
:
self
.
_insert_sync_op
(
op_idx
,
checkpoint_name
)
logging
.
debug
(
"Sync [{}] fetch op."
.
format
(
checkpoint_name
))
self
.
block
.
_sync_with_cpp
()
assert
len
(
self
.
idx2insertions
)
==
0
,
"{} checkpoints left un-Fecthed"
.
format
(
[
ele
[
1
]
for
ele
in
self
.
idx2insertions
.
values
()])
def
_parse_forward
(
self
):
self
.
idx2insertions
=
{}
# don't offload the last checkpoints, faster, less memory saving
self
.
un_offload_checkpoint_names
=
self
.
sorted_checkpoint_names
[:]
last_checkpoint
=
self
.
un_offload_checkpoint_names
.
pop
(
-
1
)
need_offload_checkpoint_names
=
self
.
un_offload_checkpoint_names
[:]
self
.
checkpoint_usage_count_and_idx
=
{}
for
checkpoint_name
in
self
.
un_offload_checkpoint_names
:
self
.
checkpoint_usage_count_and_idx
[
checkpoint_name
]
=
{
'count'
:
0
,
'idx'
:
-
1
}
self
.
synced_checkpoints
=
set
()
self
.
fw_strart_op_idx
=
len
(
self
.
block
.
ops
)
for
idx
,
op
in
enumerate
(
self
.
block
.
ops
):
if
int
(
op
.
desc
.
attr
(
"op_role"
))
==
0
:
self
.
fw_strart_op_idx
=
idx
break
assert
self
.
fw_strart_op_idx
<
len
(
self
.
block
.
ops
),
"Could NOT found Forward op in prog"
last_offload_checkpoint
=
None
for
i
,
op
in
enumerate
(
self
.
block
.
ops
[
self
.
fw_strart_op_idx
:
self
.
bw_strart_op_idx
]):
idx
=
self
.
fw_strart_op_idx
+
i
output_vars
=
op
.
desc
.
output_arg_names
()
input_vars
=
op
.
desc
.
input_arg_names
()
for
output_var
in
output_vars
:
if
output_var
in
need_offload_checkpoint_names
:
assert
len
(
output_vars
)
==
1
,
"chekpoint should be the only Output of a certain op, but [{}] is from [{}]"
.
format
(
output_var
,
op
)
if
output_var
in
self
.
un_offload_checkpoint_names
:
# insert sync op if last checkpoint has not been sync
if
last_offload_checkpoint
!=
None
:
if
self
.
checkpoint_usage_count_and_idx
[
last_offload_checkpoint
][
'count'
]
==
0
:
self
.
_record_sync_op
(
idx
,
last_offload_checkpoint
)
else
:
last_usage_idx
=
self
.
checkpoint_usage_count_and_idx
[
last_offload_checkpoint
][
'idx'
]
assert
last_usage_idx
>
0
,
"last_usage_idx of checkpoint [{}] should large than 0"
.
format
(
last_offload_checkpoint
)
self
.
_record_sync_op
(
last_usage_idx
+
1
,
last_offload_checkpoint
)
# insert offload op after the checkpoint's generation op
self
.
_record_offload_op
(
idx
+
1
,
output_var
)
last_offload_checkpoint
=
output_var
else
:
raise
ValueError
(
"There should be just ONE op that output checkpoint [{}]"
.
format
(
output_var
))
# need to sync the last need to offload checkpoint before the last checkpoint as output op
if
output_var
==
last_checkpoint
:
assert
len
(
output_vars
)
==
1
,
"chekpoint should be the only Output of a certain op, but [{}] is from [{}]"
.
format
(
output_var
,
op
)
assert
last_offload_checkpoint
==
self
.
sorted_checkpoint_names
[
-
2
],
"the last offload chekpoint before [{}] is suppose to be [{}], but got [{}]"
.
format
(
last_checkpoint
,
self
.
sorted_checkpoint_names
[
-
2
],
last_offload_checkpoint
)
# sync if last checkpoint has not been sync
if
self
.
checkpoint_usage_count_and_idx
[
last_offload_checkpoint
][
'idx'
]
==
0
:
self
.
_record_sync_op
(
idx
,
last_offload_checkpoint
)
else
:
last_usage_idx
=
self
.
checkpoint_usage_count_and_idx
[
last_offload_checkpoint
][
'idx'
]
assert
last_usage_idx
>
0
,
"last_usage_idx of checkpoint [{}] should large than 0"
.
format
(
last_offload_checkpoint
)
self
.
_record_sync_op
(
last_usage_idx
+
1
,
last_offload_checkpoint
)
# record checkpoint usage
for
input_var
in
input_vars
:
if
input_var
in
need_offload_checkpoint_names
:
assert
input_var
not
in
self
.
synced_checkpoints
,
"checkpoint [{}] used after sync"
.
format
(
input_var
)
self
.
checkpoint_usage_count_and_idx
[
input_var
][
'count'
]
+=
1
self
.
checkpoint_usage_count_and_idx
[
input_var
][
'idx'
]
=
idx
assert
len
(
self
.
un_offload_checkpoint_names
)
==
0
,
"{} checkpoints have NOT been Recorded"
.
format
(
self
.
un_fetch_checkpoint_names
)
assert
len
(
self
.
synced_checkpoints
)
==
len
(
need_offload_checkpoint_names
),
"{} checkpoints have NOT been Recorded"
.
format
(
set
(
need_offload_checkpoint_names
)
-
set
(
self
.
synced_checkpoints
))
def
_update_forward
(
self
):
if
len
(
self
.
idx2insertions
)
==
0
:
return
for
op_idx
in
reversed
(
range
(
self
.
fw_strart_op_idx
,
self
.
bw_strart_op_idx
)):
if
op_idx
in
self
.
idx2insertions
:
operation
,
checkpoint_name
=
self
.
idx2insertions
[
op_idx
]
if
operation
==
"offload"
:
self
.
_insert_offload_op
(
op_idx
,
checkpoint_name
)
logging
.
debug
(
"Insert [{}] offload op."
.
format
(
checkpoint_name
))
del
self
.
idx2insertions
[
op_idx
]
elif
operation
==
"sync"
:
self
.
_insert_sync_op
(
op_idx
,
checkpoint_name
)
logging
.
debug
(
"Insert [{}] offload_sync op."
.
format
(
checkpoint_name
))
del
self
.
idx2insertions
[
op_idx
]
self
.
block
.
_sync_with_cpp
()
assert
len
(
self
.
idx2insertions
)
==
0
,
"{} checkpoints left un-Offloaded"
.
format
(
[
ele
[
1
]
for
ele
in
self
.
idx2insertions
.
values
()])
def
_check_offload_fetch
(
self
):
# TODO(JZ-LIANG) the single stream offload need no sync
pass
def
_offload
(
self
,
loss
,
startup_program
=
None
):
"""
core steps for recompute offload
1. create pinned vars and temp vars
2. parse & update Forward pass: offload, sync
3. parse & update Backward pass: rename, fetch, sync
4. verify the correctness
"""
self
.
_main_program
=
loss
.
block
.
program
self
.
block
=
loss
.
block
if
startup_program
==
None
:
startup_program
=
fluid
.
default_startup_program
()
with
program_guard
(
self
.
_main_program
,
startup_program
):
assert
len
(
self
.
checkpoint_shape
)
>
0
,
(
"checkpoints shape {} should be an non empty list like: [12, 512, 1024]"
.
format
(
self
.
checkpoint_shape
))
assert
all
([
ele
>
0
for
ele
in
self
.
checkpoint_shape
]),
(
"all ele in checkpoints shape {} should be a determined integer larger than 0"
.
format
(
self
.
checkpoint_shape
))
self
.
checkpoint_name2pinned_name
=
dict
()
self
.
checkpoint_name2fetch_name
=
dict
()
for
checkpoint_varname
in
self
.
sorted_checkpoint_names
:
pinned_var_name
,
fetch_var_name
=
self
.
_creat_vars
(
checkpoint_varname
)
self
.
checkpoint_name2pinned_name
[
checkpoint_varname
]
=
pinned_var_name
self
.
checkpoint_name2fetch_name
[
checkpoint_varname
]
=
fetch_var_name
self
.
_append_fill_constant_ops
(
startup_program
)
# TODO (JZ-LIANG) to provide two offload stragtegy in future
# step 2. parse & update FW: rename, offload, sync
self
.
_parse_backward
()
self
.
_update_backward
()
# step 3. parse & update BW: rename, offload, sync
self
.
_parse_forward
()
self
.
_update_forward
()
# step 4. verify the correctness
self
.
_check_offload_fetch
()
return
def
backward
(
self
,
loss
,
startup_program
=
None
,
...
...
@@ -4760,8 +5117,24 @@ class RecomputeOptimizer(Optimizer):
else
:
checkpoint_vars
.
append
(
loss
.
block
.
var
(
ckpt
))
params_grads
=
append_backward
(
loss
,
parameter_list
,
no_grad_set
,
checkpoints
=
checkpoint_vars
)
# allow return to non-recompute when checkpoints is empty
if
len
(
checkpoint_vars
)
>
0
:
params_grads
,
sorted_checkpoint_names
=
append_backward
(
loss
,
parameter_list
,
no_grad_set
,
checkpoints
=
checkpoint_vars
)
else
:
params_grads
=
append_backward
(
loss
,
parameter_list
,
no_grad_set
,
checkpoints
=
checkpoint_vars
)
if
self
.
enable_offload
:
self
.
sorted_checkpoint_names
=
sorted_checkpoint_names
self
.
_offload
(
loss
,
startup_program
=
startup_program
)
return
params_grads
def
apply_optimize
(
self
,
loss
,
startup_program
,
params_grads
):
...
...
python/paddle/fluid/tests/unittests/CMakeLists.txt
浏览文件 @
3fbc3cf4
...
...
@@ -84,6 +84,7 @@ if(NOT WITH_GPU OR WIN32)
LIST
(
REMOVE_ITEM TEST_OPS test_collective_allreduce_api
)
LIST
(
REMOVE_ITEM TEST_OPS test_collective_broadcast_api
)
LIST
(
REMOVE_ITEM TEST_OPS test_collective_allgather_api
)
LIST
(
REMOVE_ITEM TEST_OPS test_memcpy_op
)
endif
()
if
(
WIN32
)
...
...
python/paddle/fluid/tests/unittests/fleet_meta_optimizer_base.py
浏览文件 @
3fbc3cf4
...
...
@@ -132,5 +132,12 @@ class TestFleetMetaOptimizer(unittest.TestCase):
elif
name
==
"sharding"
:
strategy
.
sharding
=
True
strategy
.
sharding_configs
=
{
"fuse_broadcast_MB"
:
0.2
}
elif
name
==
"recompute-offload"
:
strategy
.
recompute
=
True
strategy
.
recompute_configs
=
{
"checkpoints"
:
[
"fc_0.tmp_2"
,
"fc_1.tmp_2"
],
"enable_offload"
:
True
,
"checkpoint_shape"
:
[
256
]
}
else
:
raise
NotImplementedError
()
python/paddle/fluid/tests/unittests/test_fleet_recompute_meta_optimizer.py
浏览文件 @
3fbc3cf4
...
...
@@ -153,6 +153,20 @@ class TestFleetRecomputeMetaOptimizer(TestFleetMetaOptimizer):
self
.
assertIn
(
'subprog'
,
''
.
join
(
outs
))
self
.
assertIn
(
'lamb'
,
ops
)
def
test_recompute_offload
(
self
):
train_prog
,
startup_prog
=
fluid
.
Program
(),
fluid
.
Program
()
avg_cost
,
strategy
=
self
.
net
(
train_prog
,
startup_prog
)
self
.
set_strategy
(
strategy
,
'recompute-offload'
)
self
.
optimizer
(
avg_cost
,
strategy
,
train_prog
,
startup_prog
)
ops
=
[
op
.
type
for
op
in
avg_cost
.
block
.
ops
]
outs
=
[
op
.
output
(
'Out'
)[
0
]
for
op
in
avg_cost
.
block
.
ops
if
op
.
type
==
'memcpy'
]
self
.
assertIn
(
'memcpy'
,
ops
)
self
.
assertIn
(
'@Pinned'
,
''
.
join
(
outs
))
self
.
assertIn
(
'@Fetch'
,
''
.
join
(
outs
))
if
__name__
==
"__main__"
:
unittest
.
main
()
python/paddle/fluid/tests/unittests/test_fleet_sharding_meta_optimizer.py
浏览文件 @
3fbc3cf4
...
...
@@ -170,19 +170,19 @@ class TestFleetShardingMetaOptimizer(TestFleetMetaOptimizer):
self
.
assertEqual
(
ops
,
[
'cast'
,
'cast'
,
'cast'
,
'fill_constant'
,
'fill_constant'
,
'fill_constant'
,
'fill_constant'
,
'fill_constant'
,
'fill_constant'
,
'fill_constant'
,
'fill_constant'
,
'fill_constant'
,
'c_sync_calc_stream'
,
'c_broadcast'
,
'c_broadcast'
,
'c_broadcast'
,
'c_broadcast'
,
'c_broadcast'
,
'c_broadcast'
,
'c_broadcast'
,
'c_broadcast'
,
'c_broadcast'
,
'c_
sync_comm_stream'
,
'cast'
,
'cast
'
,
'
mul'
,
'cast'
,
'elementwise_add'
,
'cast'
,
'tanh'
,
'cast'
,
'mul
'
,
'
elementwise_add'
,
'cast'
,
'tanh'
,
'cast'
,
'mul'
,
'elementwise_add
'
,
'
softmax'
,
'cast'
,
'cross_entropy2'
,
'mean'
,
'elementwise_mul
'
,
'
fill_constant'
,
'scale'
,
'elementwise_mul_grad'
,
'mean_grad
'
,
'
cross_entropy_grad2'
,
'cast'
,
'softmax_grad
'
,
'
elementwise_add_grad'
,
'mul_grad'
,
'cast'
,
'cast'
,
'mul
'
,
'cast'
,
'
elementwise_add'
,
'cast'
,
'tanh_grad'
,
'cast
'
,
'
elementwise_add_grad'
,
'mul_grad'
,
'cast'
,
'cast'
,
'mul'
,
'cast
'
,
'elementwise_add'
,
'cast'
,
'tanh_grad'
,
'cast'
,
'c_broadcast'
,
'c_broadcast'
,
'c_
broadcast'
,
'c_sync_comm_stream
'
,
'
cast'
,
'cast'
,
'mul'
,
'cast'
,
'elementwise_add'
,
'cast'
,
'tanh
'
,
'
cast'
,
'cast'
,
'mul'
,
'elementwise_add'
,
'cast'
,
'tanh'
,
'cast
'
,
'
mul'
,
'elementwise_add'
,
'softmax'
,
'cast'
,
'cross_entropy2
'
,
'
mean'
,
'elementwise_mul'
,
'fill_constant'
,
'scale
'
,
'
elementwise_mul_grad'
,
'mean_grad'
,
'cross_entropy_grad2'
,
'cast
'
,
'
softmax_grad'
,
'elementwise_add_grad'
,
'mul_grad'
,
'cast
'
,
'cast'
,
'
cast'
,
'mul'
,
'cast'
,
'elementwise_add'
,
'cast'
,
'tanh_grad
'
,
'
cast'
,
'elementwise_add_grad'
,
'mul_grad'
,
'cast'
,
'cast'
,
'mul
'
,
'
cast'
,
'
elementwise_add'
,
'cast'
,
'tanh_grad'
,
'cast'
,
'elementwise_add_grad'
,
'mul_grad'
,
'c_sync_calc_stream'
,
'c_allreduce_sum'
,
'c_allreduce_sum'
,
'c_allreduce_sum'
,
'c_allreduce_sum'
,
'c_allreduce_sum'
,
'c_allreduce_sum'
,
...
...
python/paddle/fluid/tests/unittests/test_memcpy_op.py
0 → 100755
浏览文件 @
3fbc3cf4
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from
__future__
import
print_function
import
op_test
import
numpy
as
np
import
unittest
import
paddle
import
paddle.fluid.core
as
core
from
paddle.fluid.op
import
Operator
import
paddle.fluid
as
fluid
from
paddle.fluid
import
compiler
,
Program
,
program_guard
from
paddle.fluid.backward
import
append_backward
class
TestMemcpy_FillConstant
(
unittest
.
TestCase
):
def
get_prog
(
self
):
paddle
.
enable_static
()
main_program
=
Program
()
with
program_guard
(
main_program
):
pinned_var_name
=
"tensor@Pinned"
gpu_var_name
=
"tensor@GPU"
pinned_var
=
main_program
.
global_block
().
create_var
(
name
=
pinned_var_name
,
shape
=
[
10
,
10
],
dtype
=
'float32'
,
persistable
=
False
,
stop_gradient
=
True
)
gpu_var
=
main_program
.
global_block
().
create_var
(
name
=
gpu_var_name
,
shape
=
[
10
,
10
],
dtype
=
'float32'
,
persistable
=
False
,
stop_gradient
=
True
)
main_program
.
global_block
().
append_op
(
type
=
"fill_constant"
,
outputs
=
{
"Out"
:
gpu_var_name
},
attrs
=
{
"shape"
:
[
10
,
10
],
"dtype"
:
gpu_var
.
dtype
,
"value"
:
1.0
,
"place_type"
:
1
})
main_program
.
global_block
().
append_op
(
type
=
"fill_constant"
,
outputs
=
{
"Out"
:
pinned_var_name
},
attrs
=
{
"shape"
:
[
10
,
10
],
"dtype"
:
gpu_var
.
dtype
,
"value"
:
0.0
,
"place_type"
:
2
})
return
main_program
,
gpu_var
,
pinned_var
def
test_gpu_cpoy_to_pinned
(
self
):
main_program
,
gpu_var
,
pinned_var
=
self
.
get_prog
()
main_program
.
global_block
().
append_op
(
type
=
'memcpy'
,
inputs
=
{
'X'
:
gpu_var
},
outputs
=
{
'Out'
:
pinned_var
},
attrs
=
{
'dst_place_type'
:
3
})
place
=
fluid
.
CUDAPlace
(
0
)
exe
=
fluid
.
Executor
(
place
)
gpu_
,
pinned_
=
exe
.
run
(
main_program
,
feed
=
{},
fetch_list
=
[
gpu_var
.
name
,
pinned_var
.
name
])
self
.
assertTrue
(
np
.
allclose
(
gpu_
,
pinned_
))
self
.
assertTrue
(
np
.
allclose
(
pinned_
,
np
.
ones
((
10
,
10
))))
def
test_pinned_cpoy_gpu
(
self
):
main_program
,
gpu_var
,
pinned_var
=
self
.
get_prog
()
main_program
.
global_block
().
append_op
(
type
=
'memcpy'
,
inputs
=
{
'X'
:
pinned_var
},
outputs
=
{
'Out'
:
gpu_var
},
attrs
=
{
'dst_place_type'
:
2
})
place
=
fluid
.
CUDAPlace
(
0
)
exe
=
fluid
.
Executor
(
place
)
gpu_
,
pinned_
=
exe
.
run
(
main_program
,
feed
=
{},
fetch_list
=
[
gpu_var
.
name
,
pinned_var
.
name
])
self
.
assertTrue
(
np
.
allclose
(
gpu_
,
pinned_
))
self
.
assertTrue
(
np
.
allclose
(
gpu_
,
np
.
zeros
((
10
,
10
))))
class
TestMemcpyOPError
(
unittest
.
TestCase
):
def
get_prog
(
self
):
paddle
.
enable_static
()
main_program
=
Program
()
with
program_guard
(
main_program
):
pinned_var
=
main_program
.
global_block
().
create_var
(
name
=
"tensor@Pinned_0"
,
shape
=
[
10
,
10
],
dtype
=
'float32'
,
persistable
=
False
,
stop_gradient
=
True
)
main_program
.
global_block
().
append_op
(
type
=
"fill_constant"
,
outputs
=
{
"Out"
:
"tensor@Pinned_0"
},
attrs
=
{
"shape"
:
[
10
,
10
],
"dtype"
:
pinned_var
.
dtype
,
"value"
:
0.0
,
"place_type"
:
2
})
return
main_program
,
pinned_var
def
test_SELECTED_ROWS
(
self
):
main_program
,
pinned_var
=
self
.
get_prog
()
selected_row_var
=
main_program
.
global_block
().
create_var
(
\
name
=
"selected_row_0"
,
dtype
=
"float32"
,
persistable
=
False
,
\
type
=
fluid
.
core
.
VarDesc
.
VarType
.
SELECTED_ROWS
,
stop_gradient
=
True
)
main_program
.
global_block
().
append_op
(
type
=
"fill_constant"
,
outputs
=
{
"Out"
:
selected_row_var
},
attrs
=
{
"shape"
:
selected_row_var
.
shape
,
"dtype"
:
selected_row_var
.
dtype
,
"value"
:
1.0
,
"place_type"
:
1
})
main_program
.
global_block
().
append_op
(
type
=
'memcpy'
,
inputs
=
{
'X'
:
selected_row_var
},
outputs
=
{
'Out'
:
pinned_var
},
attrs
=
{
'dst_place_type'
:
3
})
with
self
.
assertRaises
(
NotImplementedError
):
place
=
fluid
.
CUDAPlace
(
0
)
exe
=
fluid
.
Executor
(
place
)
selected_row_var_
,
pinned_
=
exe
.
run
(
main_program
,
feed
=
{},
fetch_list
=
[
selected_row_var
.
name
,
pinned_var
.
name
])
def
test_OTHER_PLACE_NotImplementedError
(
self
):
main_program
,
pinned_var
=
self
.
get_prog
()
lod_tensor_var
=
main_program
.
global_block
().
create_var
(
\
name
=
"lod_tensor_0"
,
dtype
=
"float32"
,
persistable
=
False
,
stop_gradient
=
True
)
main_program
.
global_block
().
append_op
(
type
=
"fill_constant"
,
outputs
=
{
"Out"
:
lod_tensor_var
},
attrs
=
{
"shape"
:
lod_tensor_var
.
shape
,
"dtype"
:
lod_tensor_var
.
dtype
,
"value"
:
1.0
,
"place_type"
:
0
})
main_program
.
global_block
().
append_op
(
type
=
'memcpy'
,
inputs
=
{
'X'
:
pinned_var
},
outputs
=
{
'Out'
:
lod_tensor_var
},
attrs
=
{
'dst_place_type'
:
0
,
})
with
self
.
assertRaises
(
NotImplementedError
):
place
=
fluid
.
CUDAPlace
(
0
)
exe
=
fluid
.
Executor
(
place
)
lod_tensor_var_
,
pinned_
=
exe
.
run
(
main_program
,
feed
=
{},
fetch_list
=
[
lod_tensor_var
.
name
,
pinned_var
.
name
])
if
__name__
==
'__main__'
:
paddle
.
enable_static
()
unittest
.
main
()
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录