Unverified commit c269a160, authored by Leo Chen, committed by GitHub

[NPU] flatten params and grads, fuse grad_clip and optimizer op (#33461)

* enable npu alignment

* support flatten_params/grads

* support clip by global norm

* remove memset in coalesce_tensor_op

* fix npu kernel of sum op when input is one tensor

* add ut for flatten_param_grads+regularizer

* fix ut

* fix typo
Parent: fa821ef9
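A minimal end-to-end sketch of how the new flags could be used, assuming the static-graph API and mirroring the updated test in test_adam_op.py; the toy network, feed data, and CPUPlace choice are illustrative and not part of this commit:

```python
# Sketch: enabling flatten_param_grads / align_size on fluid.optimizer.Adam.
# Flattening only takes effect with no regularization and with grad_clip
# either None or ClipGradByGlobalNorm (see the optimizer changes below).
import numpy as np
import paddle
import paddle.fluid as fluid

paddle.enable_static()
main_prog, startup_prog = paddle.static.Program(), paddle.static.Program()
with paddle.static.program_guard(main_prog, startup_prog):
    x = paddle.static.data(name='x', shape=[2, 2], dtype='float32')
    label = paddle.static.data(name='label', shape=[2, 1], dtype='int64')
    prediction = fluid.layers.fc(input=x, size=2, act='softmax')
    loss = fluid.layers.reduce_mean(
        fluid.layers.cross_entropy(input=prediction, label=label))

    clip = paddle.nn.ClipGradByGlobalNorm(clip_norm=1.0)
    adam = fluid.optimizer.Adam(
        learning_rate=0.01,
        grad_clip=clip,
        flatten_param_grads=True,  # coalesce all params/grads into one tensor each
        align_size=256)            # -1 (default) keeps the place's default alignment
    adam.minimize(loss)

exe = paddle.static.Executor(paddle.CPUPlace())
exe.run(startup_prog)
exe.run(main_prog,
        feed={'x': np.random.rand(2, 2).astype('float32'),
              'label': np.random.randint(2, size=(2, 1)).astype('int64')},
        fetch_list=[loss])
```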
......@@ -60,6 +60,7 @@ void TensorCopy(const Tensor& src, const platform::Place& dst_place,
<< dst_place;
return;
}
VLOG(4) << "src:" << src_ptr << ", dst:" << dst_ptr;
#ifdef PADDLE_WITH_MKLDNN
auto size = src.layout() == DataLayout::kMKLDNN
......
......@@ -30,6 +30,7 @@ void Copy<platform::CPUPlace, platform::CPUPlace>(platform::CPUPlace, void* dst,
platform::CPUPlace,
const void* src, size_t num) {
if (UNLIKELY(num == 0)) return;
VLOG(4) << "src: " << src << ", dst: " << dst << ", num: " << num;
std::memcpy(dst, src, num);
}
......
......@@ -69,6 +69,7 @@ class CoalesceTensorOpKernel : public framework::OpKernel<T> {
auto in_tensors = context.MultiInput<framework::LoDTensor>("Input");
bool use_align = context.Attr<bool>("use_align");
auto align_size = context.Attr<int>("align_size");
if (context.Attr<bool>("check_name")) {
for (size_t i = 0; i < in_var_names.size(); ++i) {
......@@ -95,7 +96,7 @@ class CoalesceTensorOpKernel : public framework::OpKernel<T> {
context.Attr<int>("dtype"));
size_t size_of_dtype = framework::SizeOfType(dtype);
GetMemSizeAndDtype(in_tensors, in_var_names, &numel, size_of_dtype,
context.GetPlace(), use_align);
context.GetPlace(), use_align, align_size);
// Alloc the continuous space
auto fused_tensor = context.Output<framework::LoDTensor>("FusedOutput");
......@@ -113,11 +114,11 @@ class CoalesceTensorOpKernel : public framework::OpKernel<T> {
framework::TensorCopy(*in_tensors[i], context.GetPlace(), dev_ctx,
&sub_tensor);
offset +=
use_align
? platform::Alignment(len * size_of_dtype, context.GetPlace()) /
size_of_dtype
: len;
offset += use_align
? platform::Alignment(len * size_of_dtype,
context.GetPlace(), align_size) /
size_of_dtype
: len;
}
} else if (context.Attr<bool>("set_constant")) {
// TODO(Liu yuang) ADD NPU SET_CONSTANT FUNCTION.
......@@ -134,11 +135,11 @@ class CoalesceTensorOpKernel : public framework::OpKernel<T> {
framework::TensorCopy(*out_tensors[i], context.GetPlace(), dev_ctx,
&sub_tensor);
}
offset +=
use_align
? platform::Alignment(len * size_of_dtype, context.GetPlace()) /
size_of_dtype
: len;
offset += use_align
? platform::Alignment(len * size_of_dtype,
context.GetPlace(), align_size) /
size_of_dtype
: len;
}
}
......@@ -146,28 +147,24 @@ class CoalesceTensorOpKernel : public framework::OpKernel<T> {
offset = 0;
std::stringstream ss;
ss << "alloc_space_for_vars: ";
#if defined(PADDLE_WITH_ASCEND_CL)
auto stream =
context.template device_context<paddle::platform::NPUDeviceContext>()
.stream();
platform::NPUMemsetAsync(
static_cast<void *>(fused_tensor->mutable_data<T>(dev_ctx.GetPlace())),
0.0, fused_tensor->numel() * sizeof(T), stream);
#endif
for (size_t i = 0; i < out_tensors.size(); ++i) {
size_t len = static_cast<size_t>(out_tensors[i]->numel());
auto dim = out_tensors[i]->dims();
VLOG(4) << len << " " << dim << " " << offset;
out_tensors[i]
->ShareDataWith(fused_tensor->Slice(
static_cast<int64_t>(offset), static_cast<int64_t>(offset + len)))
.Resize(dim);
len = use_align
? platform::Alignment(len * size_of_dtype, context.GetPlace()) /
? platform::Alignment(len * size_of_dtype, context.GetPlace(),
align_size) /
size_of_dtype
: len;
offset += len;
ss << "output(" << out_var_names[i] << ") dim:(" << dim << ")"
<< " address: " << out_tensors[i]->data<void>() << ", ";
<< " address: " << out_tensors[i]->data<void>() << " len: " << len
<< ", ";
offset += len;
}
PADDLE_ENFORCE_EQ(
(int64_t)offset, fused_tensor->numel(),
......@@ -183,7 +180,7 @@ class CoalesceTensorOpKernel : public framework::OpKernel<T> {
const std::vector<const framework::LoDTensor *> &lod_tensors,
const std::vector<std::string> var_names, size_t *numel,
const size_t &size_of_dtype, const platform::Place &place,
const bool use_align = true) const {
const bool use_align = true, const int align_size = -1) const {
PADDLE_ENFORCE_EQ(
lod_tensors.size(), var_names.size(),
platform::errors::InvalidArgument(
......@@ -203,15 +200,18 @@ class CoalesceTensorOpKernel : public framework::OpKernel<T> {
size, 0,
platform::errors::InvalidArgument(
"The number of tensor `%s`'s elements is 0.", var_names[i]));
auto len =
use_align
? platform::Alignment(static_cast<size_t>(size) * size_of_dtype,
place, align_size) /
size_of_dtype
: static_cast<size_t>(size);
VLOG(4) << size << " " << len;
ss << "input(" << var_names[i] << ") dim:(" << lod_tensors[i]->dims()
<< ") "
<< " addres:" << lod_tensors[i]->data<void>() << ", ";
*numel += use_align
? platform::Alignment(
static_cast<size_t>(size) * size_of_dtype, place) /
size_of_dtype
: static_cast<size_t>(size);
<< " addres:" << lod_tensors[i]->data<void>() << " len: " << len
<< ", ";
*numel += len;
}
VLOG(10) << ss.str();
}
......@@ -221,7 +221,42 @@ class CoalesceTensorOp : public framework::OperatorWithKernel {
public:
using framework::OperatorWithKernel::OperatorWithKernel;
void InferShape(framework::InferShapeContext *ctx) const override {}
void InferShape(framework::InferShapeContext *ctx) const override {
if (ctx->IsRuntime()) {
return;
}
auto use_align = ctx->Attrs().Get<bool>("use_align");
auto align_size = ctx->Attrs().Get<int>("align_size");
auto dtype = static_cast<framework::proto::VarType::Type>(
ctx->Attrs().Get<int>("dtype"));
size_t size_of_dtype = framework::SizeOfType(dtype);
auto alignment = [](size_t size, size_t align_size) {
size_t remaining = size % align_size;
auto aligned_size =
remaining == 0 ? size : size + (align_size - remaining);
VLOG(4) << remaining << " " << size << " " << align_size << " "
<< aligned_size;
return aligned_size;
};
VLOG(4) << "align_size: " << align_size;
if (use_align && align_size > 0) {
int64_t numel = 0;
auto dims = ctx->GetInputsDim("Input");
for (const auto &dim : dims) {
auto size = framework::product(dim);
auto len = use_align
? alignment(static_cast<size_t>(size) * size_of_dtype,
align_size) /
size_of_dtype
: static_cast<size_t>(size);
numel += len;
}
ctx->SetOutputDim("FusedOutput", framework::make_ddim({numel}));
VLOG(4) << "FusedOutput size:" << framework::make_ddim({numel});
}
}
protected:
framework::OpKernelType GetKernelTypeForVar(
......@@ -271,6 +306,8 @@ class CoalesceTensorOpMaker : public framework::OpProtoAndCheckerMaker {
"Whether to consider memory chunk and take alignment into "
"account for inputs and outputs.")
.SetDefault(true);
AddAttr<int>("align_size", "The alignment size when use_align is True")
.SetDefault(-1);
AddComment(R"DOC(
CoalesceTensor Operator.
......@@ -314,6 +351,16 @@ REGISTER_OP_CUDA_KERNEL(
ops::CoalesceTensorOpKernel<paddle::platform::CUDADeviceContext, double>);
#endif
#if defined(PADDLE_WITH_ASCEND_CL)
REGISTER_OP_CUDA_KERNEL(
coalesce_tensor,
ops::CoalesceTensorOpKernel<paddle::platform::NPUDeviceContext,
plat::float16>,
ops::CoalesceTensorOpKernel<paddle::platform::NPUDeviceContext, int>,
ops::CoalesceTensorOpKernel<paddle::platform::NPUDeviceContext, float>,
ops::CoalesceTensorOpKernel<paddle::platform::NPUDeviceContext, double>);
#endif
#ifdef PADDLE_WITH_XPU
REGISTER_OP_XPU_KERNEL(
coalesce_tensor,
......@@ -343,4 +390,14 @@ REGISTER_OP_VERSION(coalesce_tensor)
"In order to optionally take memory alignment into account when "
"coalescing tensors. The default value is true to be compatible "
"with before.",
true));
true))
.AddCheckpoint(
R"ROC(
Upgrade coalesce_tensor: add a new attribute [align_size].)ROC",
paddle::framework::compatible::OpVersionDesc().NewAttr(
"align_size",
"In order to optionally take memory alignment into account when "
"coalescing tensors. The default value is -1 and use the default "
"align_size "
"of each place to be compatible with before.",
-1));
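For reference, a small Python sketch (an assumption, condensed from the InferShape lambda and the Alignment calls above) of how the FusedOutput length is derived when use_align is true and align_size > 0: each input's byte size is rounded up to a multiple of align_size, converted back to an element count, and summed. The shapes below are illustrative.

```python
import numpy as np

def aligned_len(numel, size_of_dtype, align_size):
    # Round the tensor's byte size up to a multiple of align_size,
    # then convert back to a number of elements.
    size = numel * size_of_dtype
    remaining = size % align_size
    aligned = size if remaining == 0 else size + (align_size - remaining)
    return aligned // size_of_dtype

shapes = [(32, 32), (2,), (7, 5)]   # illustrative input dims
size_of_dtype = 4                   # float32
align_size = 256
fused_numel = sum(aligned_len(int(np.prod(s)), size_of_dtype, align_size)
                  for s in shapes)
print(fused_numel)                  # 1024 + 64 + 64 = 1152
```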
......@@ -35,9 +35,11 @@ class SumNPUKernel : public framework::OpKernel<T> {
auto place = ctx.GetPlace();
int n = static_cast<int>(x.size());
PADDLE_ENFORCE_EQ(n > 1, true,
platform::errors::InvalidArgument(
"The size of Input(x) list must larger or equal 2"));
if (n == 1) {
TensorCopy(*x[0], place, out);
return;
}
auto stream =
ctx.template device_context<paddle::platform::NPUDeviceContext>()
......
......@@ -16,22 +16,26 @@ limitations under the License. */
namespace paddle {
namespace platform {
size_t Alignment(size_t size, const platform::Place &place) {
size_t alignment = 1024;
if (platform::is_cpu_place(place)) {
alignment = CpuMinChunkSize();
size_t Alignment(size_t size, const platform::Place &place, int align_size) {
size_t alignment = 0;
if (align_size > 0) {
alignment = align_size;
} else {
alignment = 1024;
if (platform::is_cpu_place(place)) {
alignment = CpuMinChunkSize();
} else {
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
alignment = GpuMinChunkSize();
alignment = GpuMinChunkSize();
#elif defined(PADDLE_WITH_XPU)
// TODO(wangxi): add XpuMinChunkSize
alignment = alignment;
alignment = alignment;
#elif defined(PADDLE_WITH_ASCEND_CL)
alignment = NPUMinChunkSize();
alignment = NPUMinChunkSize();
#else
PADDLE_THROW(platform::errors::PreconditionNotMet(
"Fluid is not compiled with CUDA or NPU."));
PADDLE_THROW(platform::errors::PreconditionNotMet(
"Fluid is not compiled with CUDA/XPU/NPU."));
#endif
}
}
size_t remaining = size % alignment;
return remaining == 0 ? size : size + (alignment - remaining);
......
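The updated Alignment contract, restated as a hedged Python sketch: the place-specific *MinChunkSize fallbacks are collapsed into a single parameter here, and the 1024-byte value is only the generic default from the code above.

```python
def alignment(size, place_min_chunk_size=1024, align_size=-1):
    # An explicit align_size > 0 overrides the place default; otherwise the
    # place's minimum chunk size is used. The size is rounded up to a multiple.
    align = align_size if align_size > 0 else place_min_chunk_size
    remaining = size % align
    return size if remaining == 0 else size + (align - remaining)

assert alignment(4000, align_size=256) == 4096   # explicit alignment wins
assert alignment(4096, align_size=256) == 4096   # already aligned
assert alignment(4000) == 4096                   # falls back to the place default
```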
......@@ -22,9 +22,13 @@ limitations under the License. */
#elif defined(PADDLE_WITH_ASCEND_CL)
#include "paddle/fluid/platform/npu_info.h"
#endif
#if defined(PADDLE_WITH_ASCEND_CL)
#include "paddle/fluid/platform/npu_info.h"
#endif
namespace paddle {
namespace platform {
size_t Alignment(size_t size, const platform::Place &place);
size_t Alignment(size_t size, const platform::Place &place,
int align_size = -1);
} // namespace platform
} // namespace paddle
......@@ -28,7 +28,7 @@ from . import framework
from . import layers
from . import unique_name
from .backward import append_backward, _some_in_set_, _append_grad_suffix_, _get_no_grad_set_name
from .clip import GradientClipBase, GradientClipByNorm, error_clip_callback, append_gradient_clip_ops
from .clip import GradientClipBase, GradientClipByNorm, error_clip_callback, append_gradient_clip_ops, ClipGradByGlobalNorm
from .framework import program_guard
from .initializer import Constant
from .layer_helper import LayerHelper
......@@ -42,6 +42,7 @@ from functools import reduce
from functools import cmp_to_key
from .wrapped_decorator import signature_safe_contextmanager
from .. import compat as cpt
import warnings
__all__ = [
'SGD', 'Momentum', 'Adagrad', 'Adam', 'Adamax', 'Dpsgd', 'DecayedAdagrad',
......@@ -68,7 +69,15 @@ class Optimizer(object):
parameter_list=None,
regularization=None,
grad_clip=None,
flatten_param_grads=False,
align_size=-1,
name=None):
"""
Args:
flatten_param_grads (bool, optional): Whether to flatten all the parameters and grads.
If true, the parameters and gradients will be coalesced into contiguous memory,
and the grad_clip ops / optimizer ops will be fused into one operator.
"""
# Because of the loop import, so place it in the function body
from paddle.optimizer.lr import LRScheduler
self._parameter_list = list(
......@@ -107,6 +116,8 @@ class Optimizer(object):
self.regularization = regularization
self._grad_clip = grad_clip
self._learning_rate = learning_rate
self._flatten_param_grads = flatten_param_grads
self._align_size = align_size
self._dtype = None
# Infer the dtype form parameter
......@@ -126,7 +137,7 @@ class Optimizer(object):
self._accumulators = defaultdict(lambda: dict())
# global_accumulator dict, {accum_name : acc_variable, ...}
self._global_accumulators = {}
self.helper = None
self.helper = LayerHelper(self.__class__.__name__)
self._opti_name_list = []
self._accumulators_holder = {}
self._param_device_map = dict()
......@@ -739,7 +750,7 @@ class Optimizer(object):
current_block.backward_block_idx]
start = len(target_block.ops)
self.helper = LayerHelper(self.__class__.__name__)
self._update_param_device_map(parameters_and_grads, target_block)
self._create_accumulators(
target_block,
......@@ -958,7 +969,9 @@ class Optimizer(object):
repeate_regularizer = False
with framework.name_scope('regularization'):
for param, grad in parameters_and_grads:
if not repeate_regularizer and param.regularizer is not None and regularization is not None:
if not repeate_regularizer and getattr(
param, 'regularizer',
None) is not None and regularization is not None:
repeate_regularizer = True
logging.info(
"If regularizer of a Parameter has been set by 'fluid.ParamAttr' or 'fluid.WeightNormParamAttr' already. "
......@@ -970,6 +983,83 @@ class Optimizer(object):
params_and_grads.append((param, new_grad))
return params_and_grads
def flatten_param_grads(self, params_grads):
need_flatten_params = []
need_flatten_grads = []
for p, g in params_grads:
if g is None:
continue
g.persistable = True
if getattr(p, 'need_clip', True) is False or getattr(
p, 'regularizer', None) is not None:
warnings.warn(
"flatten_param_grads=True will be discarded since paramter '{}''s need_clip is False or "
"the regularizer is set".format(p.name))
self._flatten_param_grads = False
return params_grads
need_flatten_params.append(p)
need_flatten_grads.append(g)
shape = [np.prod(p.shape) for p in need_flatten_params]
block = need_flatten_params[0].block
flatten_param = self.helper.create_global_variable(
name='flatten_param',
persistable=True,
dtype=need_flatten_params[0].dtype,
shape=[np.sum(shape)],
belong_to_optimizer=True)
flatten_param.trainable = True
flatten_param.optimize_attr = need_flatten_params[0].optimize_attr
flatten_param.regularizer = need_flatten_params[0].regularizer
flatten_grad = self.helper.create_global_variable(
name='flatten_grad',
persistable=True,
dtype=need_flatten_grads[0].dtype,
shape=[np.sum(shape)],
belong_to_optimizer=True)
with program_guard(default_main_program()):
block.append_op(
type="coalesce_tensor",
inputs={"Input": need_flatten_params},
outputs={
"Output": need_flatten_params,
"FusedOutput": flatten_param
},
attrs={
"copy_data": True,
"use_align": True,
"align_size": self._align_size,
"dtype": need_flatten_params[0].dtype
})
block.append_op(
type="coalesce_tensor",
inputs={"Input": need_flatten_grads},
outputs={
"Output": need_flatten_grads,
"FusedOutput": flatten_grad
},
attrs={
"copy_data": True,
"use_align": True,
"align_size": self._align_size,
"dtype": need_flatten_grads[0].dtype
})
#NOTE(zhiqiu): the initializer should be set after coalesce_tensor op,
# so the shape of flatten_param and flatten_grad will be inferred.
self.helper.set_variable_initializer(
flatten_param, initializer=Constant(0.0))
self.helper.set_variable_initializer(
flatten_grad, initializer=Constant(0.0))
return [(flatten_param, flatten_grad)]
def apply_gradients(self, params_grads):
"""
Second part of `minimize`, appending optimization operators for
......@@ -992,9 +1082,14 @@ class Optimizer(object):
# ...
optimizer.apply_gradients(params_grads)
"""
params_grads = sorted(params_grads, key=lambda x: x[0].name)
# NOTE(zhiqiu): currently, only ClipGradByGlobalNorm without regularization is supported.
if self._flatten_param_grads and self.regularization is None:
if self._grad_clip == None or isinstance(self._grad_clip,
ClipGradByGlobalNorm):
params_grads = self.flatten_param_grads(params_grads)
# 'optimizer(grad_clip)' or 'set_gradient_clip'
if self._grad_clip is not None:
params_grads = self._grad_clip(params_grads)
......@@ -2156,6 +2251,9 @@ class AdamOptimizer(Optimizer):
The default value is False.
use_global_beta_pow (bool, optional): Whether to use global beta_pow. If true, Adam will use global beta_pow
for whole model instead of creating beta_pow for each parameter. Default is false.
flatten_param_grads (bool, optional): Whether to flatten all parameters and gradients. Default is false.
align_size (int, optional): The alignment size used when flattening parameters and gradients. Default is -1, which means
the same align_size as the allocator is used.
Examples:
.. code-block:: python
......@@ -2266,7 +2364,9 @@ class AdamOptimizer(Optimizer):
grad_clip=None,
name=None,
lazy_mode=False,
use_global_beta_pow=False):
use_global_beta_pow=False,
flatten_param_grads=False,
align_size=-1):
assert learning_rate is not None
assert beta1 is not None
assert beta2 is not None
......@@ -2276,6 +2376,8 @@ class AdamOptimizer(Optimizer):
parameter_list=parameter_list,
regularization=regularization,
grad_clip=grad_clip,
flatten_param_grads=flatten_param_grads,
align_size=align_size,
name=name)
self.type = "adam"
self._beta1 = beta1
......
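To summarize the Python-side gating added above, here is a condensed sketch (an assumption, not code from this commit) of when apply_gradients takes the flattening path: regularization must be unset, grad_clip must be None or a ClipGradByGlobalNorm, and no parameter may carry need_clip=False or a per-parameter regularizer, otherwise flatten_param_grads is dropped with a warning.

```python
from paddle.fluid.clip import ClipGradByGlobalNorm

def should_flatten(flatten_param_grads, regularization, grad_clip, params):
    if not flatten_param_grads or regularization is not None:
        return False
    if grad_clip is not None and not isinstance(grad_clip, ClipGradByGlobalNorm):
        return False
    # Any need_clip=False or per-parameter regularizer disables flattening.
    return all(getattr(p, 'need_clip', True) and
               getattr(p, 'regularizer', None) is None for p in params)
```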
......@@ -82,5 +82,30 @@ class TestSum2(OpTest):
self.check_output_with_place(self.place, check_dygraph=False)
class TestSum3(OpTest):
def setUp(self):
self.set_npu()
self.init_dtype()
self.op_type = "sum"
self.place = paddle.NPUPlace(0)
x0 = np.random.random((3, 3)).astype(self.dtype)
self.inputs = {'X': [("x0", x0)]}
y = x0
self.outputs = {'Out': y}
self.attrs = {'use_mkldnn': False}
def init_dtype(self):
self.dtype = np.float16
def set_npu(self):
self.__class__.use_npu = True
def test_check_output(self):
self.check_output_with_place(self.place, check_dygraph=False)
if __name__ == '__main__':
unittest.main()
......@@ -636,12 +636,13 @@ class TestAdamOpV2(unittest.TestCase):
paddle.enable_static()
class TestNetWithEpsilonTensor(unittest.TestCase):
class TestAdamOptimizer(unittest.TestCase):
def _test(self,
place,
use_tensor=True,
use_fluid_api=True,
use_global_beta_pow=False):
use_global_beta_pow=False,
flatten_param_grads=False):
paddle.enable_static()
main_prog = paddle.static.Program()
startup_prog = paddle.static.Program()
......@@ -649,94 +650,114 @@ class TestNetWithEpsilonTensor(unittest.TestCase):
paddle.seed(SEED)
np.random.seed(SEED)
a_np = np.random.random(size=(32, 32)).astype('float32')
b_np = np.random.random(size=(32, 32)).astype('float32')
label_np = np.random.randint(2, size=(32, 1)).astype('int64')
a_np = np.random.random(size=(2, 2)).astype('float32')
b_np = np.random.random(size=(2, 2)).astype('float32')
label_np = np.random.randint(2, size=(2, 1)).astype('int64')
weight_attr1 = paddle.ParamAttr(
name="weight1",
initializer=fluid.initializer.Constant(value=1.0),
trainable=True)
weight_attr2 = paddle.ParamAttr(
name="weight2",
initializer=fluid.initializer.Constant(value=2.0),
trainable=True)
clip = paddle.nn.ClipGradByGlobalNorm(clip_norm=1.0)
with paddle.static.program_guard(main_prog, startup_prog):
a = paddle.static.data(name="a", shape=[32, 32], dtype='float32')
b = paddle.static.data(name="b", shape=[32, 32], dtype='float32')
label = paddle.static.data(
name="label", shape=[32, 1], dtype='int64')
sum = paddle.add(a, b)
z = paddle.pow(sum, 2.0)
fc_1 = fluid.layers.fc(input=z, size=128)
prediction = fluid.layers.fc(input=fc_1, size=2, act='softmax')
cost = fluid.layers.cross_entropy(input=prediction, label=label)
loss = fluid.layers.reduce_mean(cost)
beta1_init = 0.9
beta2_init = 0.999
epsilon_init = 1e-8
if use_tensor:
beta1 = fluid.layers.create_global_var(
shape=[1],
value=float(beta1_init),
dtype='float32',
persistable=True,
name="beta1")
beta2 = fluid.layers.create_global_var(
shape=[1],
value=float(beta2_init),
dtype='float32',
persistable=True,
name="beta2")
epsilon = fluid.layers.create_global_var(
shape=[1],
value=float(epsilon_init),
dtype='float32',
persistable=True,
name="epsilon")
if use_fluid_api:
adam = fluid.optimizer.Adam(
learning_rate=0.01,
beta1=beta1,
beta2=beta2,
epsilon=epsilon,
use_global_beta_pow=use_global_beta_pow)
else:
adam = paddle.optimizer.Adam(
learning_rate=0.01,
beta1=beta1,
beta2=beta2,
epsilon=epsilon)
else:
if use_fluid_api:
adam = fluid.optimizer.Adam(
learning_rate=0.01,
beta1=beta1_init,
beta2=beta2_init,
epsilon=epsilon_init,
use_global_beta_pow=use_global_beta_pow,
name='a')
with paddle.utils.unique_name.guard():
a = paddle.static.data(name="a", shape=[2, 2], dtype='float32')
b = paddle.static.data(name="b", shape=[2, 2], dtype='float32')
label = paddle.static.data(
name="label", shape=[2, 1], dtype='int64')
sum = paddle.add(a, b)
z = paddle.pow(sum, 2.0)
fc_1 = fluid.layers.fc(input=z, size=2, param_attr=weight_attr1)
prediction = fluid.layers.fc(input=fc_1,
size=2,
param_attr=weight_attr2,
act='softmax')
cost = fluid.layers.cross_entropy(input=prediction, label=label)
loss = fluid.layers.reduce_mean(cost)
beta1_init = 0.9
beta2_init = 0.999
epsilon_init = 1e-8
if use_tensor:
beta1 = fluid.layers.create_global_var(
shape=[1],
value=float(beta1_init),
dtype='float32',
persistable=True,
name="beta1")
beta2 = fluid.layers.create_global_var(
shape=[1],
value=float(beta2_init),
dtype='float32',
persistable=True,
name="beta2")
epsilon = fluid.layers.create_global_var(
shape=[1],
value=float(epsilon_init),
dtype='float32',
persistable=True,
name="epsilon")
if use_fluid_api:
adam = fluid.optimizer.Adam(
learning_rate=0.01,
beta1=beta1,
beta2=beta2,
epsilon=epsilon,
use_global_beta_pow=use_global_beta_pow,
flatten_param_grads=flatten_param_grads,
align_size=256,
grad_clip=clip)
else:
adam = paddle.optimizer.Adam(
learning_rate=0.01,
beta1=beta1,
beta2=beta2,
epsilon=epsilon,
grad_clip=clip)
else:
adam = fluid.optimizer.Adam(
learning_rate=0.01,
beta1=beta1_init,
beta2=beta2_init,
epsilon=epsilon_init)
adam.minimize(loss)
exe = paddle.static.Executor(place)
exe.run(startup_prog)
print("Start run on {}".format(place))
for epoch in range(10):
pred_res, loss_res = exe.run(
main_prog,
feed={"a": a_np,
"b": b_np,
"label": label_np},
fetch_list=[prediction, loss])
print("Epoch {} | Prediction[0]: {}, Loss: {}".format(epoch, pred_res[
0], loss_res))
paddle.disable_static()
return pred_res, loss_res
if use_fluid_api:
adam = fluid.optimizer.Adam(
learning_rate=0.01,
beta1=beta1_init,
beta2=beta2_init,
epsilon=epsilon_init,
use_global_beta_pow=use_global_beta_pow,
flatten_param_grads=flatten_param_grads,
align_size=256,
grad_clip=clip)
else:
adam = fluid.optimizer.Adam(
learning_rate=0.01,
beta1=beta1_init,
beta2=beta2_init,
epsilon=epsilon_init,
grad_clip=clip)
adam.minimize(loss)
scope = fluid.Scope()
with fluid.scope_guard(scope):
exe = paddle.static.Executor(place)
exe.run(startup_prog)
print("Start run on {}".format(place))
for epoch in range(10):
pred_res, loss_res = exe.run(
main_prog,
feed={"a": a_np,
"b": b_np,
"label": label_np},
fetch_list=[prediction, loss])
print("Epoch {} | Prediction[0]: {}, Loss: {}".format(
epoch, pred_res[0], loss_res))
paddle.disable_static()
return pred_res, loss_res
def _test_with_place(self, place):
preds = []
......@@ -745,10 +766,12 @@ class TestNetWithEpsilonTensor(unittest.TestCase):
for use_tensor in [True, False]:
for use_fluid_api in [True, False]:
for use_global_beta_pow in [True, False]:
pred, loss = self._test(place, use_tensor, use_fluid_api,
use_global_beta_pow)
preds.append(pred)
losses.append(loss)
for flatten_param_grads in [True, False]:
pred, loss = self._test(
place, use_tensor, use_fluid_api,
use_global_beta_pow, flatten_param_grads)
preds.append(pred)
losses.append(loss)
for pred in preds:
self.assertTrue(np.allclose(pred, preds[0]))
for loss in losses:
......@@ -760,6 +783,33 @@ class TestNetWithEpsilonTensor(unittest.TestCase):
if core.is_compiled_with_cuda():
self._test_with_place(paddle.CUDAPlace(0))
def test_adam_flatten_param_grads_with_regularizer(self):
# flatten_param_grads + regularizer is not supported yet.
paddle.enable_static()
main = fluid.Program()
weight_attr = paddle.ParamAttr(
name="weight1",
initializer=fluid.initializer.Constant(value=1.0),
regularizer=fluid.regularizer.L1DecayRegularizer(
regularization_coeff=0.1),
trainable=True)
with fluid.program_guard(main):
x = fluid.data(name='x', shape=[None, 13], dtype='float32')
y = fluid.data(name='y', shape=[None, 1], dtype='float32')
y_predict = fluid.layers.fc(input=x,
size=1,
act=None,
param_attr=weight_attr)
cost = fluid.layers.square_error_cost(input=y_predict, label=y)
avg_cost = fluid.layers.mean(cost)
adam = fluid.optimizer.AdamOptimizer(
0.01, flatten_param_grads=True, align_size=256)
adam.minimize(avg_cost)
paddle.disable_static()
self.assertEqual(adam._flatten_param_grads, False)
def test_adam_exception(self):
paddle.enable_static()
a = paddle.static.data(name="a", shape=[32, 32], dtype='float32')
......