Unverified commit f9ac5fb9, authored by Huihuang Zheng, committed by GitHub

[Dy2stat] Fix Memory Optimization in run_program_op and Add SimNet as Unit Test (#25383)

Add Similarity Net (SimNet) as a unit test. While writing the unit test, we found three problems:

1. The run_program_op has a memory optimization error when a dy2stat net is run multiple times.
2. The support for SelectedRows can cause problems in dy2stat.
3. The return statement syntax has problems in dy2stat.

This PR fixes problem 1; for problems 2 and 3 it only adjusts the code enough to keep this PR small. I will fix those two problems in the following PR(s).
Parent commit: c42d662e
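To make problem 1 concrete: the failing pattern is a @declarative (dy2stat) net whose forward and backward pass run more than once in the same dygraph session. The following is only a minimal sketch under that assumption; the `SimpleNet` name and shapes are illustrative and not part of this PR, and the real unit test added below uses the BOW SimNet model instead.

```python
# Minimal illustrative sketch (hypothetical SimpleNet), not the code added by this PR.
import numpy as np

import paddle.fluid as fluid
from paddle.fluid.dygraph import Linear, declarative, to_variable


class SimpleNet(fluid.dygraph.Layer):
    def __init__(self):
        super(SimpleNet, self).__init__()
        self.fc = Linear(4, 2)

    @declarative  # the converted program executes through run_program_op
    def forward(self, x):
        return fluid.layers.reduce_mean(self.fc(x))


with fluid.dygraph.guard():
    net = SimpleNet()
    x = to_variable(np.ones([3, 4]).astype('float32'))
    # Before this fix, a later iteration could fail because variables produced
    # by forward ops but still needed by grad ops were eagerly deleted.
    for _ in range(2):
        loss = net(x)
        loss.backward()
        net.clear_gradients()
```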
@@ -91,7 +91,7 @@ set(COMMON_OP_DEPS ${COMMON_OP_DEPS} selected_rows_functor selected_rows lod_ten
set(COMMON_OP_DEPS ${COMMON_OP_DEPS} dynload_warpctc)
set(COMMON_OP_DEPS ${COMMON_OP_DEPS} sequence_padding sequence_scale cos_sim_functor memory jit_kernel_helper concat_and_split cross_entropy softmax vol2col im2col sampler sample_prob tree2col)
set(COMMON_OP_DEPS ${COMMON_OP_DEPS} sequence2batch lstm_compute matrix_bit_code gru_compute activation_functions beam_search fc matrix_inverse)
set(COMMON_OP_DEPS ${COMMON_OP_DEPS} box_wrapper)
set(COMMON_OP_DEPS ${COMMON_OP_DEPS} box_wrapper boost)
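# boost is added to the common op deps above, matching run_program_op's new use of BOOST_GET_CONST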
if (WITH_GPU)
set(COMMON_OP_DEPS ${COMMON_OP_DEPS} depthwise_conv prelu bert_encoder_functor)
endif()
@@ -17,10 +17,12 @@ limitations under the License. */
#include <algorithm>
#include <iterator>
#include <string>
#include <unordered_set>
#include <utility>
#include <vector>
#include "paddle/fluid/framework/executor.h"
#include "paddle/fluid/framework/op_desc.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/operator.h"
#include "paddle/fluid/framework/program_desc.h"
@@ -149,14 +151,46 @@ static void ShareVarsFromScope(const std::vector<Variable *> &vars,
}
}
static void AppendSkipDeletionVars(
std::vector<std::string> *all_vars,
const std::vector<std::string> &append_vars) {
static void AppendSkipDeletionVars(const std::vector<std::string> &append_vars,
std::vector<std::string> *all_vars) {
for (auto &var : append_vars) {
all_vars->emplace_back(var);
}
}
static void AppendSafeEagerDeletionSkipVars(
const framework::ProgramDesc &program,
std::vector<std::string> *skip_vars) {
const framework::BlockDesc &block = program.Block(0);
const std::vector<framework::OpDesc *> &all_ops = block.AllOps();
std::unordered_set<std::string> grad_op_output;
std::unordered_set<std::string> grad_op_input;
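// Collect the argument names that backward (grad) ops read and write.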
for (const framework::OpDesc *op : all_ops) {
int op_role = BOOST_GET_CONST(
int, op->GetAttr(framework::OpProtoAndCheckerMaker::OpRoleAttrName()));
if ((op_role & static_cast<int>(framework::OpRole::kBackward)) == 0) {
continue;
}
for (const std::string &in_arg_name : op->InputArgumentNames()) {
grad_op_input.emplace(in_arg_name);
}
for (const std::string &out_arg_name : op->OutputArgumentNames()) {
grad_op_output.emplace(out_arg_name);
}
}
// For each grad op input variable, if it is not an output of any grad op,
// it may be an output of a forward op, so we add it to skip_vars to
// prevent it from being deleted when the grad op is run multiple times.
for (const std::string &var_name : grad_op_input) {
if (grad_op_output.find(var_name) == grad_op_output.end()) {
skip_vars->emplace_back(var_name);
}
}
}
} // namespace details
template <typename DeviceContext, typename T>
@@ -192,7 +226,7 @@ class RunProgramOpKernel : public framework::OpKernel<T> {
// skip delete vars
std::vector<std::string> skip_vars;
details::AppendSkipDeletionVars(&skip_vars, output_var_names);
details::AppendSkipDeletionVars(output_var_names, &skip_vars);
VLOG(2) << "Prepare to skip " << skip_vars.size()
<< " var(s): " << string::join_strings(skip_vars, ' ');
@@ -261,20 +295,21 @@ class RunProgramGradOpKernel : public framework::OpKernel<T> {
out_scope_vec->size(), 1,
platform::errors::InvalidArgument(
"The OutScope of RunProgramGradOp should only hold one scope."));
auto &scope = *(out_scope_vec->front());
// Step 2. prepare executor and scope
framework::Executor exe(ctx.GetPlace());
// skip delete vars
std::vector<std::string> skip_vars;
details::AppendSkipDeletionVars(&skip_vars, input_grad_var_names);
details::AppendSkipDeletionVars(&skip_vars, param_grad_names);
details::AppendSkipDeletionVars(input_grad_var_names, &skip_vars);
details::AppendSkipDeletionVars(param_grad_names, &skip_vars);
details::AppendSafeEagerDeletionSkipVars(*program, &skip_vars);
VLOG(2) << "Prepare to skip " << skip_vars.size()
<< " var(s): " << string::join_strings(skip_vars, ' ');
auto exe_ctx = exe.Prepare(*program, 0, skip_vars);
auto &scope = *(out_scope_vec->front());
details::ShareVarsIntoScope(output_grad_vars, output_grad_var_names,
&scope);
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import paddle.fluid as fluid
import paddle.fluid.param_attr as attr
from functools import reduce
from paddle.fluid.dygraph import declarative, to_variable
from paddle.fluid.dygraph import Embedding, Layer, Linear
class EmbeddingLayer(object):
"""
Embedding Layer class
"""
def __init__(self, dict_size, emb_dim, name="emb", padding_idx=None):
"""
initialize
"""
self.dict_size = dict_size
self.emb_dim = emb_dim
self.name = name
self.padding_idx = padding_idx
def ops(self):
"""
operation
"""
# TODO(huihuangzheng): The original code set is_sparse=True, but it
# causes a crash in dy2stat. Set it back to True after the bug is fixed.
emb = Embedding(
size=[self.dict_size, self.emb_dim],
is_sparse=False,
padding_idx=self.padding_idx,
param_attr=attr.ParamAttr(
name=self.name, initializer=fluid.initializer.Xavier()))
return emb
class FCLayer(object):
"""
Fully Connected Layer class
"""
def __init__(self, fc_dim, act, name="fc"):
"""
initialize
"""
self.fc_dim = fc_dim
self.act = act
self.name = name
def ops(self):
"""
operation
"""
fc = FC(size=self.fc_dim,
param_attr=attr.ParamAttr(name="%s.w" % self.name),
bias_attr=attr.ParamAttr(name="%s.b" % self.name),
act=self.act)
return fc
class ConcatLayer(object):
"""
Concat Layer class
"""
def __init__(self, axis):
"""
initialize
"""
self.axis = axis
def ops(self, inputs):
"""
operation
"""
concat = fluid.layers.concat(inputs, axis=self.axis)
return concat
class ReduceMeanLayer(object):
"""
Reduce Mean Layer class
"""
def __init__(self):
"""
initialize
"""
pass
def ops(self, input):
"""
operation
"""
mean = fluid.layers.reduce_mean(input)
return mean
class CosSimLayer(object):
"""
Cosine Similarity Calculation Layer
"""
def __init__(self):
"""
initialize
"""
pass
def ops(self, x, y):
"""
operation
"""
sim = fluid.layers.cos_sim(x, y)
return sim
class ElementwiseMaxLayer(object):
"""
Elementwise Max Layer class
"""
def __init__(self):
"""
initialize
"""
pass
def ops(self, x, y):
"""
operation
"""
max = fluid.layers.elementwise_max(x, y)
return max
class ElementwiseAddLayer(object):
"""
Elementwise Add Layer class
"""
def __init__(self):
"""
initialize
"""
pass
def ops(self, x, y):
"""
operation
"""
add = fluid.layers.elementwise_add(x, y)
return add
class ElementwiseSubLayer(object):
"""
Elementwise Sub Layer class
"""
def __init__(self):
"""
initialize
"""
pass
def ops(self, x, y):
"""
operation
"""
sub = fluid.layers.elementwise_sub(x, y)
return sub
class ConstantLayer(object):
"""
Constant Layer class
"""
def __init__(self):
"""
initialize
"""
pass
def ops(self, input, shape, dtype, value):
"""
operation
"""
shape = list(shape)
input_shape = fluid.layers.shape(input)
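# use the runtime batch size of `input` as the first dimension of the constant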
shape[0] = input_shape[0]
constant = fluid.layers.fill_constant(shape, dtype, value)
return constant
class SoftsignLayer(object):
"""
Softsign Layer class
"""
def __init__(self):
"""
initialize
"""
pass
def ops(self, input):
"""
operation
"""
softsign = fluid.layers.softsign(input)
return softsign
class FC(Layer):
"""
This interface is used to construct a callable object of the ``FC`` class.
For more details, refer to code examples.
It creates a fully connected layer in the network. It can take
one or multiple ``Tensor`` as its inputs. It creates a Variable called weights for each input tensor,
which represents a fully connected weight matrix from each input unit to
each output unit. The fully connected layer multiplies each input tensor
with its corresponding weight to produce an output Tensor with shape [N, `size`],
where N is the batch size. If multiple input tensors are given, the resulting
output tensors, each with shape [N, `size`], will be summed up. If ``bias_attr``
is not None, a bias variable will be created and added to the output.
Finally, if ``act`` is not None, it will be applied to the output as well.
When the input is single ``Tensor`` :
.. math::
Out = Act({XW + b})
When the inputs are multiple ``Tensor`` :
.. math::
Out = Act({\sum_{i=0}^{N-1}X_iW_i + b})
In the above equation:
* :math:`N`: The number of inputs. N equals len(input) if the input is a list of ``Tensor`` .
* :math:`X_i`: The i-th input ``Tensor`` .
* :math:`W_i`: The i-th weight matrix corresponding to the i-th input tensor.
* :math:`b`: The bias parameter created by this layer (if needed).
* :math:`Act`: The activation function.
* :math:`Out`: The output ``Tensor`` .
See below for an example.
.. code-block:: text
Given:
data_1.data = [[[0.1, 0.2]]]
data_1.shape = (1, 1, 2) # 1 is batch_size
data_2.data = [[[0.1, 0.2, 0.3]]]
data_2.shape = (1, 1, 3) # 1 is batch_size
fc = FC("fc", 2, num_flatten_dims=2)
out = fc(input=[data_1, data_2])
Then:
out.data = [[[0.182996 -0.474117]]]
out.shape = (1, 1, 2)
Parameters:
size(int): The number of output units in this layer.
num_flatten_dims (int, optional): The fc layer can accept an input tensor with more than
two dimensions. If this happens, the multi-dimension tensor will first be flattened
into a 2-dimensional matrix. The parameter `num_flatten_dims` determines how the input
tensor is flattened: the first `num_flatten_dims` (inclusive, index starts from 1)
dimensions will be flattened to form the first dimension of the final matrix (height of
the matrix), and the remaining `rank(X) - num_flatten_dims` dimensions are flattened to
form the second dimension of the final matrix (width of the matrix). For example, suppose
`X` is a 5-dimensional tensor with a shape [2, 3, 4, 5, 6], and `num_flatten_dims` = 3.
Then, the flattened matrix will have a shape [2 x 3 x 4, 5 x 6] = [24, 30]. Default: 1
param_attr (ParamAttr or list of ParamAttr, optional): The parameter attribute for learnable
weights(Parameter) of this layer. Default: None.
bias_attr (ParamAttr or list of ParamAttr, optional): The attribute for the bias
of this layer. If it is set to False, no bias will be added to the output units.
If it is set to None, the bias is initialized to zero. Default: None.
act (str, optional): Activation to be applied to the output of this layer. Default: None.
is_test(bool, optional): A flag indicating whether execution is in test phase. Default: False.
dtype(str, optional): Dtype used for weight, it can be "float32" or "float64". Default: "float32".
Attribute:
**weight** (list of Parameter): the learnable weights of this layer.
**bias** (Parameter or None): the learnable bias of this layer.
Returns:
None
Examples:
.. code-block:: python
from paddle.fluid.dygraph.base import to_variable
import paddle.fluid as fluid
from paddle.fluid.dygraph import FC
import numpy as np
data = np.random.uniform(-1, 1, [30, 10, 32]).astype('float32')
with fluid.dygraph.guard():
fc = FC("fc", 64, num_flatten_dims=2)
data = to_variable(data)
conv = fc(data)
"""
def __init__(self,
size,
num_flatten_dims=1,
param_attr=None,
bias_attr=None,
act=None,
is_test=False,
dtype="float32"):
super(FC, self).__init__(dtype)
self._size = size
self._num_flatten_dims = num_flatten_dims
self._dtype = dtype
self._param_attr = param_attr
self._bias_attr = bias_attr
self._act = act
self.__w = list()
def _build_once(self, input):
i = 0
for inp, param in self._helper.iter_inputs_and_params(input,
self._param_attr):
input_shape = inp.shape
param_shape = [
reduce(lambda a, b: a * b, input_shape[self._num_flatten_dims:],
1)
] + [self._size]
self.__w.append(
self.add_parameter(
'_w%d' % i,
self.create_parameter(
attr=param,
shape=param_shape,
dtype=self._dtype,
is_bias=False)))
i += 1
size = list([self._size])
self._b = self.create_parameter(
attr=self._bias_attr, shape=size, dtype=self._dtype, is_bias=True)
# TODO(songyouwei): We should remove _w property
@property
def _w(self, i=0):
return self.__w[i]
@_w.setter
def _w(self, value, i=0):
assert isinstance(self.__w[i], Variable)
self.__w[i].set_value(value)
@property
def weight(self):
if len(self.__w) > 1:
return self.__w
else:
return self.__w[0]
@weight.setter
def weight(self, value):
if len(self.__w) == 1:
self.__w[0] = value
@property
def bias(self):
return self._b
@bias.setter
def bias(self, value):
self._b = value
def forward(self, input):
mul_results = list()
i = 0
for inp, param in self._helper.iter_inputs_and_params(input,
self._param_attr):
tmp = self._helper.create_variable_for_type_inference(self._dtype)
self._helper.append_op(
type="mul",
inputs={"X": inp,
"Y": self.__w[i]},
outputs={"Out": tmp},
attrs={
"x_num_col_dims": self._num_flatten_dims,
"y_num_col_dims": 1
})
i += 1
mul_results.append(tmp)
if len(mul_results) == 1:
pre_bias = mul_results[0]
else:
pre_bias = self._helper.create_variable_for_type_inference(
self._dtype)
self._helper.append_op(
type="sum",
inputs={"X": mul_results},
outputs={"Out": pre_bias},
attrs={"use_mkldnn": False})
if self._b is not None:
pre_activation = self._helper.create_variable_for_type_inference(
dtype=self._dtype)
self._helper.append_op(
type='elementwise_add',
inputs={'X': [pre_bias],
'Y': [self._b]},
outputs={'Out': [pre_activation]},
attrs={'axis': self._num_flatten_dims})
else:
pre_activation = pre_bias
# Currently, we don't support inplace in dygraph mode
return self._helper.append_activation(pre_activation, act=self._act)
class HingeLoss(object):
"""
Hinge Loss Calculation class
"""
def __init__(self, conf_dict):
"""
initialize
"""
self.margin = conf_dict["loss"]["margin"]
def compute(self, pos, neg):
"""
compute loss
"""
elementwise_max = ElementwiseMaxLayer()
elementwise_add = ElementwiseAddLayer()
elementwise_sub = ElementwiseSubLayer()
constant = ConstantLayer()
reduce_mean = ReduceMeanLayer()
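# pairwise hinge loss: mean(max(0, margin + neg - pos))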
loss = reduce_mean.ops(
elementwise_max.ops(
constant.ops(neg, neg.shape, "float32", 0.0),
elementwise_add.ops(
elementwise_sub.ops(neg, pos),
constant.ops(neg, neg.shape, "float32", self.margin))))
return loss
class BOW(Layer):
"""
BOW
"""
def __init__(self, conf_dict):
"""
initialize
"""
super(BOW, self).__init__()
self.dict_size = conf_dict["dict_size"]
self.task_mode = conf_dict["task_mode"]
self.emb_dim = conf_dict["net"]["emb_dim"]
self.bow_dim = conf_dict["net"]["bow_dim"]
self.seq_len = conf_dict["seq_len"]
self.emb_layer = EmbeddingLayer(self.dict_size, self.emb_dim,
"emb").ops()
self.bow_layer = Linear(self.bow_dim, self.bow_dim)
self.bow_layer_po = FCLayer(self.bow_dim, None, "fc").ops()
self.softmax_layer = FCLayer(2, "softmax", "cos_sim").ops()
@declarative
def forward(self, left, right):
"""
Forward network
"""
# embedding layer
left_emb = self.emb_layer(left)
right_emb = self.emb_layer(right)
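# reshape the flat lookup results back to [batch, seq_len, bow_dim] (bow_dim equals emb_dim in this test)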
left_emb = fluid.layers.reshape(
left_emb, shape=[-1, self.seq_len, self.bow_dim])
right_emb = fluid.layers.reshape(
right_emb, shape=[-1, self.seq_len, self.bow_dim])
bow_left = fluid.layers.reduce_sum(left_emb, dim=1)
bow_right = fluid.layers.reduce_sum(right_emb, dim=1)
softsign_layer = SoftsignLayer()
left_soft = softsign_layer.ops(bow_left)
right_soft = softsign_layer.ops(bow_right)
left_bow = self.bow_layer(left_soft)
right_bow = self.bow_layer(right_soft)
cos_sim_layer = CosSimLayer()
pred = cos_sim_layer.ops(left_bow, right_bow)
return left_bow, pred
# TODO(huihuangzheng): uncomment the following return statements after
# we fix it.
#
# matching layer
#if self.task_mode == "pairwise":
# left_bow = self.bow_layer(left_soft)
# right_bow = self.bow_layer(right_soft)
# cos_sim_layer = CosSimLayer()
# pred = cos_sim_layer.ops(left_bow, right_bow)
# return left_bow, pred
#else:
# concat_layer = ConcatLayer(1)
# concat = concat_layer.ops([left_soft, right_soft])
# concat_fc = self.bow_layer_po(concat)
# pred = self.softmax_layer(concat_fc)
# return left_soft, pred
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import argparse
import numpy as np
import paddle
import paddle.fluid as fluid
import random
import unittest
from paddle.fluid.dygraph import ProgramTranslator
from simnet_dygraph_model import BOW, HingeLoss
SEED = 102
random.seed(SEED)
def create_conf_dict():
conf_dict = {}
conf_dict["task_mode"] = "train"
conf_dict["net"] = {"emb_dim": 128, "bow_dim": 128, "hidden_dim": 128}
conf_dict["loss"] = {"margin": 0.1}
return conf_dict
def parse_args():
parser = argparse.ArgumentParser()
parser.add_argument(
"--batch_size",
type=int,
default=32,
help="Total examples' number in batch for training.")
parser.add_argument(
"--seq_len", type=int, default=32, help="The length of each sentence.")
parser.add_argument(
"--epoch", type=int, default=1, help="The number of training epoch.")
parser.add_argument(
"--fake_sample_size",
type=int,
default=128,
help="The number of samples of fake data.")
args = parser.parse_args([])
return args
args = parse_args()
def fake_vocabulary():
vocab = {}
vocab["<unk>"] = 0
for i in range(26):
c = chr(ord('a') + i)
vocab[c] = i + 1
return vocab
vocab = fake_vocabulary()
class FakeReaderProcessor(object):
def __init__(self, args, vocab):
self.vocab = vocab
self.seq_len = args.seq_len
self.sample_size = args.fake_sample_size
self.data_samples = []
for i in range(self.sample_size):
query = [random.randint(0, 26) for i in range(self.seq_len)]
pos_title = query[:]
neg_title = [26 - q for q in query]
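# the positive title repeats the query; the negative title maps every token id q to its complement 26 - q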
self.data_samples.append(
np.array([query, pos_title, neg_title]).astype(np.int64))
def get_reader(self, mode, epoch=0):
def reader_with_pairwise():
if mode == "train":
for i in range(self.sample_size):
yield self.data_samples[i]
return reader_with_pairwise
simnet_process = FakeReaderProcessor(args, vocab)
def train(conf_dict, to_static):
"""
train process
"""
program_translator = ProgramTranslator()
program_translator.enable(to_static)
# Get device
if fluid.is_compiled_with_cuda():
place = fluid.CUDAPlace(0)
else:
place = fluid.CPUPlace()
with fluid.dygraph.guard(place):
fluid.default_startup_program().random_seed = SEED
fluid.default_main_program().random_seed = SEED
conf_dict['dict_size'] = len(vocab)
conf_dict['seq_len'] = args.seq_len
net = BOW(conf_dict)
loss = HingeLoss(conf_dict)
optimizer = fluid.optimizer.AdamOptimizer(
learning_rate=0.001,
beta1=0.9,
beta2=0.999,
epsilon=1e-08,
parameter_list=net.parameters())
metric = fluid.metrics.Auc(name="auc")
global_step = 0
losses = []
train_loader = fluid.io.DataLoader.from_generator(
capacity=16,
return_list=True,
iterable=True,
use_double_buffer=True)
get_train_examples = simnet_process.get_reader(
"train", epoch=args.epoch)
train_loader.set_sample_list_generator(
paddle.batch(
get_train_examples, batch_size=args.batch_size), place)
for left, pos_right, neg_right in train_loader():
left = fluid.layers.reshape(left, shape=[-1, 1])
pos_right = fluid.layers.reshape(pos_right, shape=[-1, 1])
neg_right = fluid.layers.reshape(neg_right, shape=[-1, 1])
net.train()
global_step += 1
left_feat, pos_score = net(left, pos_right)
pred = pos_score
_, neg_score = net(left, neg_right)
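# the hinge loss is zero once pos_score exceeds neg_score by at least the margin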
avg_cost = loss.compute(pos_score, neg_score)
#avg_cost = loss.compute(pos_score, pos_score)
losses.append(np.mean(avg_cost.numpy()))
avg_cost.backward()
optimizer.minimize(avg_cost)
net.clear_gradients()
return losses
class TestSimnet(unittest.TestCase):
def test_dygraph_static_same_loss(self):
if fluid.is_compiled_with_cuda():
fluid.set_flags({"FLAGS_cudnn_deterministic": True})
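# deterministic cuDNN so the dygraph and dy2stat losses can be compared elementwise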
conf_dict = create_conf_dict()
dygraph_loss = train(conf_dict, to_static=False)
static_loss = train(conf_dict, to_static=True)
self.assertEqual(len(dygraph_loss), len(static_loss))
for i in range(len(dygraph_loss)):
self.assertAlmostEqual(dygraph_loss[i], static_loss[i])
if __name__ == '__main__':
unittest.main()