Commit 5bca34ed authored by liaogang

Merge remote-tracking branch 'upstream/master'

@@ -142,12 +142,15 @@ We also project the encoder vector to :code:`decoder_size` dimensional space, ge
 The decoder uses :code:`recurrent_group` to define the recurrent neural network. The step and output functions are defined in :code:`gru_decoder_with_attention`:
 
 .. code-block:: python
 
+    group_inputs=[StaticInput(input=encoded_vector,is_seq=True),
+                  StaticInput(input=encoded_proj,is_seq=True)]
     trg_embedding = embedding_layer(
         input=data_layer(name='target_language_word',
                          size=target_dict_dim),
         size=word_vector_dim,
         param_attr=ParamAttr(name='_target_language_embedding'))
+    group_inputs.append(trg_embedding)
 
     # For decoder equipped with attention mechanism, in training,
     # target embedding (the groudtruth) is the data input,
     # while encoded source sequence is accessed to as an unbounded memory.
@@ -156,13 +159,7 @@ The decoder uses :code:`recurrent_group` to define the recurrent neural network.
     # All sequence inputs should have the same length.
     decoder = recurrent_group(name=decoder_group_name,
                               step=gru_decoder_with_attention,
-                              input=[
-                                  StaticInput(input=encoded_vector,
-                                              is_seq=True),
-                                  StaticInput(input=encoded_proj,
-                                              is_seq=True),
-                                  trg_embedding
-                              ])
+                              input=group_inputs)
 
 The implementation of the step function is listed as below. First, it defines the **memory** of the decoder network. Then it defines attention, gated recurrent unit step function, and the output function:
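The step function itself is not touched by this change. For orientation, here is a minimal sketch of what a :code:`gru_decoder_with_attention` step function typically looks like in this config API; the names :code:`decoder_size`, :code:`target_dict_dim` and :code:`decoder_boot` are assumed to be defined elsewhere in the demo config, so treat this as illustrative rather than the exact demo code:

```python
def gru_decoder_with_attention(enc_vec, enc_proj, current_word):
    # decoder state carried across time steps (assumed boot layer: decoder_boot)
    decoder_mem = memory(name='gru_decoder',
                         size=decoder_size,
                         boot_layer=decoder_boot)
    # attention context over the static (unbounded-memory) source encoding
    context = simple_attention(encoded_sequence=enc_vec,
                               encoded_proj=enc_proj,
                               decoder_state=decoder_mem)
    # mix the context and the current target word into the GRU input
    with mixed_layer(size=decoder_size * 3) as decoder_inputs:
        decoder_inputs += full_matrix_projection(input=context)
        decoder_inputs += full_matrix_projection(input=current_word)
    gru_step = gru_step_layer(name='gru_decoder',
                              input=decoder_inputs,
                              output_mem=decoder_mem,
                              size=decoder_size)
    # output distribution over the target vocabulary
    with mixed_layer(size=target_dict_dim,
                     bias_attr=True,
                     act=SoftmaxActivation()) as out:
        out += full_matrix_projection(input=gru_step)
    return out
```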
@@ -217,10 +214,8 @@ The code is listed below:
 .. code-block:: python
 
-    gen_inputs = [StaticInput(input=encoded_vector,
-                              is_seq=True),
-                  StaticInput(input=encoded_proj,
-                              is_seq=True), ]
+    group_inputs=[StaticInput(input=encoded_vector,is_seq=True),
+                  StaticInput(input=encoded_proj,is_seq=True)]
 
     # In generation, decoder predicts a next target word based on
     # the encoded source sequence and the last generated target word.
     # The encoded source sequence (encoder's output) must be specified by
@@ -231,10 +226,10 @@ The code is listed below:
                                  size=target_dict_dim,
                                  embedding_name='_target_language_embedding',
                                  embedding_size=word_vector_dim)
-    gen_inputs.append(trg_embedding)
+    group_inputs.append(trg_embedding)
 
     beam_gen = beam_search(name=decoder_group_name,
                            step=gru_decoder_with_attention,
-                           input=gen_inputs,
+                           input=group_inputs,
                            id_input=data_layer(name="sent_id",
                                                size=1),
                            dict_file=trg_dict_path,
...
@@ -169,6 +169,12 @@ dotmul_projection
     :members: dotmul_projection
     :noindex:
 
+dotmul_operator
+---------------
+.. automodule:: paddle.trainer_config_helpers.layers
+    :members: dotmul_operator
+    :noindex:
+
 full_matrix_projection
 ----------------------
 .. automodule:: paddle.trainer_config_helpers.layers
...
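For reference, the newly documented :code:`dotmul_operator` configures an element-wise product of two equally sized inputs, scaled by a constant. A rough NumPy sketch of the semantics only (not the actual layer implementation):

```python
import numpy as np

def dotmul_operator_ref(x, y, scale=1.0):
    """Reference semantics: out.row[i] += scale * (x.row[i] .* y.row[i])."""
    assert x.shape == y.shape
    return scale * x * y

x = np.array([[1.0, 2.0, 3.0]])
y = np.array([[4.0, 5.0, 6.0]])
print(dotmul_operator_ref(x, y, scale=0.5))  # [[2. 5. 9.]]
```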
@@ -85,6 +85,7 @@ bool CudnnConvLayer::init(const LayerMap &layerMap,
     biasOffset_ = numFilters_ / groups_[0];
   }
 
+  batchNum_ = 0;
   isSelectAlgo_ = false;
   return true;
 }
@@ -132,6 +133,11 @@ void CudnnConvLayer::reshape(int batchSize) {
   getOutput().setFrameHeight(outputH_);
   getOutput().setFrameWidth(outputW_);
 
+  // if the batchSize remains the same, set isSelectAlgo_ true.
+  // Otherwise, set isSelectAlgo_ false and select algo again.
+  isSelectAlgo_ = (batchSize == batchNum_);
+  batchNum_ = batchSize;
+
   size_t maxWorkSpace = 0;
   for (size_t i = 0; i < inputLayers_.size(); i++) {
     CHECK_EQ(inputLayers_[i]->getOutput().value->getWidth(),
@@ -160,6 +166,10 @@ void CudnnConvLayer::reshape(int batchSize) {
     maxWorkSpace = std::max(fwdLimitBytes_[i], bwdDataLimitBytes_[i]);
     maxWorkSpace = std::max(maxWorkSpace, bwdFilterLimitBytes_[i]);
 
+    VLOG(3) << getName() << " Fwd / BwdData / BwdFilter algo: " << fwdAlgo_[i]
+            << " / " << bwdDataAlgo_[i]
+            << " / " << bwdFilterAlgo_[i];
+
   }
 }
...
@@ -87,6 +87,10 @@ protected:
   /// Is or not select conv algorihtm.
   bool isSelectAlgo_;
 
+  /// batchNum is used to record batch size. If the batch size is changed,
+  /// the selection algorithm will be called.
+  int batchNum_;
+
 public:
   explicit CudnnConvLayer(const LayerConfig& config) : ConvBaseLayer(config) {}
...
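Taken together, the .cpp and .h changes cache the cuDNN algorithm selection and key it on the batch size: the search runs on the first `reshape` and again only when the batch size changes. A small Python sketch of the same memoization pattern, purely illustrative (the real logic lives in `CudnnConvLayer::reshape`):

```python
class ConvAlgoCache:
    """Re-run the expensive algorithm search only when the batch size changes."""

    def __init__(self, select_fn):
        self._select_fn = select_fn  # expensive algorithm search
        self._batch_num = 0          # last batch size seen; 0 means "never selected"
        self._algo = None

    def reshape(self, batch_size):
        # analogous to: isSelectAlgo_ = (batchSize == batchNum_); batchNum_ = batchSize;
        already_selected = (batch_size == self._batch_num)
        self._batch_num = batch_size
        if not already_selected:
            self._algo = self._select_fn(batch_size)
        return self._algo


# usage sketch
cache = ConvAlgoCache(lambda bs: "algo_for_batch_%d" % bs)
cache.reshape(32)   # selects
cache.reshape(32)   # cached, no re-selection
cache.reshape(64)   # batch size changed, re-selects
```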
@@ -19,7 +19,7 @@ namespace paddle {
 MultinomialSampler::MultinomialSampler(const real* prob, int size)
     : rand_(0.0, size) {
-  intervals_.reserve(size + 1);
+  intervals_.resize(size + 1);
   double sum = 0;
   for (int i = 0; i < size; ++i) {
     sum += prob[i];
@@ -50,12 +50,13 @@ MultinomialSampler::MultinomialSampler(const real* prob, int size)
   int bigPos = nextBigPos(0);
 
   auto fillIntervals = [&]() {
-    while (bigPos < size && smallPos < size) {
+    while (bigPos < size) {
       while (intervals_[bigPos].thresh > 1 && smallPos < size) {
         intervals_[smallPos].otherId = bigPos;
         intervals_[bigPos].thresh -= 1 - intervals_[smallPos].thresh;
         smallPos = nextSmallPos(smallPos + 1);
       }
+      if (smallPos >= size) break;
       bigPos = nextBigPos(bigPos + 1);
       // If intervals_[bigPos].thresh < 1, it becomes a small interval
     }
...
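Context for these fixes: `MultinomialSampler` builds an alias table (Walker's method), where every interval keeps a threshold and the id of a "big" interval that covers the remainder, giving O(1) sampling. `reserve` only allocates capacity without constructing elements, so indexing `intervals_[i]` was undefined behavior; `resize` actually creates the `size + 1` entries. The loop change stops the fill step cleanly once the small intervals are exhausted. A minimal Python sketch of the same construction and sampling, for intuition only:

```python
import random

def build_alias_table(prob):
    """Walker's alias method: O(n) build, O(1) sampling."""
    n = len(prob)
    total = float(sum(prob))
    # scale so that an "even" share of probability mass is exactly 1
    thresh = [p * n / total for p in prob]
    other = list(range(n))
    small = [i for i, t in enumerate(thresh) if t < 1]
    big = [i for i, t in enumerate(thresh) if t >= 1]
    while small and big:
        s = small.pop()
        b = big[-1]
        other[s] = b
        thresh[b] -= 1 - thresh[s]   # the big interval donates the remainder
        if thresh[b] < 1:            # big interval became small
            small.append(big.pop())
    return thresh, other

def sample(thresh, other, rng=random):
    n = len(thresh)
    u = rng.random() * n
    i = int(u)
    frac = u - i
    return i if frac < thresh[i] else other[i]
```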
@@ -41,39 +41,42 @@ public:
 TEST(MultinomialSampler, gen) {
   int numGrids = 1024 * 1024;
   int size = 1024 * 4;
   default_random_engine reng;
 
-  uniform_int_distribution<int> rand(1, numGrids / size * 1.8);
-  vector<real> prob;
-  int sum = 0;
-  for (int i = 0; i < size; ++i) {
-    prob.push_back(rand(reng));
-    sum += prob.back();
-  }
-
-  CHECK_LE(sum, numGrids);
-  prob.back() += numGrids - sum;
-
-  vector<int> counts(size);
-  MultinomialSamplerTester sampler(&prob[0], size);
-  counts.assign(size, 0);
-  {
-    double s = (double)size / (double)numGrids;
-    REGISTER_TIMER("MultinomialSampler");
-    for (double i = 0; i < numGrids; ++i) {
-      int ret = sampler.testGen([i, s]() { return s * i; });
-      if (ret < 0 || ret >= size) {
-        EXPECT_GE(ret, 0);
-        EXPECT_LT(ret, size);
-        break;
-      }
-      ++counts[ret];
-    }
-  }
-  for (int i = 0; i < size; ++i) {
-    if (prob[i] != counts[i]) {
-      EXPECT_EQ(prob[i], counts[i]);
-      LOG(INFO) << "i=" << i;
-      break;
-    }
-  }
+  for (size_t iter=0; iter < 256; ++iter) {
+    uniform_int_distribution<int> rand(1, numGrids / size * 1.8);
+    vector<real> prob;
+    int sum = 0;
+    for (int i = 0; i < size; ++i) {
+      prob.push_back(rand(reng));
+      sum += prob.back();
+    }
+
+    CHECK_LE(sum, numGrids);
+    prob.back() += numGrids - sum;
+
+    vector<int> counts(size);
+    MultinomialSamplerTester sampler(&prob[0], size);
+    counts.assign(size, 0);
+    {
+      double s = (double)size / (double)numGrids;
+      REGISTER_TIMER("MultinomialSampler");
+      for (double i = 0; i < numGrids; ++i) {
+        int ret = sampler.testGen([i, s]() { return s * i; });
+        if (ret < 0 || ret >= size) {
+          EXPECT_GE(ret, 0);
+          EXPECT_LT(ret, size);
+          break;
+        }
+        ++counts[ret];
+      }
+    }
+    for (int i = 0; i < size; ++i) {
+      if (prob[i] != counts[i]) {
+        EXPECT_EQ(prob[i], counts[i]);
+        LOG(INFO) << iter;
+        break;
+      }
+    }
+  }
 }
@@ -135,6 +138,7 @@ void benchmarkRandom() {
   LOG(INFO) << "sum1=" << sum1;
 }
 
+
 int main(int argc, char** argv) {
   initMain(argc, argv);
   testing::InitGoogleTest(&argc, argv);
...
@@ -636,7 +636,6 @@ class Operator(Cfg):
             input_layer_names,
             ):
         self.add_keys(locals())
-
         self.operator_conf = OperatorConfig()
         self.operator_conf.type = self.type
@@ -686,12 +685,15 @@ class ConvOperator(Operator):
         if num_filters is not None:
             self.operator_conf.num_filters = num_filters
 
-        parse_conv(conv_conf, input_layer_names[0], self.operator_conf.conv_conf, True)
+        parse_conv(conv_conf,
+                   MakeLayerNameInSubmodel(input_layer_names[0]),
+                   self.operator_conf.conv_conf)
         self.operator_conf.output_size = (self.operator_conf.conv_conf.output_x ** 2) * num_filters
 
         config_assert(len(input_layer_names) == 2, "Conv is binary operator")
 
+    def calc_output_size(self, input_sizes):
+        return self.operator_conf.output_size
+
 # please refer to the comments in proto/ModelConfig.proto
@@ -2462,11 +2464,11 @@ class MixedLayer(LayerBase):
                 if size != 0:
                     self.set_layer_size(size)
                 else:
-                    size = operator.calc_output_size(operator_conf.input_sizes)
-                    if size != 0:
-                        config_assert(size == self.config.size,
-                                      "different inputs have different size: %s vs. %s" %
-                                      (size, self.config.size))
+                    sz = operator.calc_output_size(operator_conf.input_sizes)
+                    if sz != 0:
+                        config_assert(sz == self.config.size,
+                                      "different inputs have different size: %s vs. %s" %
+                                      (sz, self.config.size))
         for input_index in xrange(len(self.inputs)):
             input_layer = self.get_input_layer(input_index)
             input = self.inputs[input_index]
...
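The new `calc_output_size` lets `MixedLayer` check its configured size against the convolution operator's output, which `ConvOperator` sets to `conv_conf.output_x ** 2 * num_filters`. A small sketch of that arithmetic, assuming the usual `(input + 2 * padding - filter_size) / stride + 1` output-width formula (the actual computation is done by `parse_conv` and may differ in rounding):

```python
def conv_output_x(img_size, filter_size, padding, stride):
    # standard padded-convolution output width, assuming it divides evenly
    return (img_size + 2 * padding - filter_size) // stride + 1

def conv_operator_output_size(img_size, filter_size, padding, stride, num_filters):
    output_x = conv_output_x(img_size, filter_size, padding, stride)
    # mirrors: operator_conf.output_size = (conv_conf.output_x ** 2) * num_filters
    return output_x ** 2 * num_filters

# e.g. a 32x32 input, 3x3 filter, padding 1, stride 1, 64 filters -> 32*32*64
print(conv_operator_output_size(32, 3, 1, 1, 64))  # 65536
```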
@@ -286,7 +286,6 @@ def full_matrix_projection(input, size=0, param_attr=None):
                                 size=size,
                                 **param_attr.attr)
     proj.origin = input
-    proj.origin.projection = "matrix"
     return proj
 
@@ -333,7 +332,6 @@ def table_projection(input, size=0, param_attr=None):
                            size=size,
                            **param_attr.attr)
     proj.origin = input
-    proj.origin.projection = "table"
     return proj
 
@@ -377,17 +375,15 @@ def identity_projection(input, offset=None):
     if offset is None:
         proj = IdentityProjection(input_layer_name=input.name)
         proj.origin = input
-        proj.origin.projection = 'identity'
     else:
         proj = IdentityOffsetProjection(input_layer_name=input.name,
                                         offset=offset)
         proj.origin = input
-        proj.origin.projection = 'identity_offset'
     return proj
 
 
 @wrap_param_attr_default()
-def dotmul_projection(input, param_attr=None, scale=1):
+def dotmul_projection(input, param_attr=None):
     """
     DotMulProjection with a layer as input.
     It performs element-wise multiplication with weight.
@@ -407,30 +403,35 @@ def dotmul_projection(input, param_attr=None, scale=1):
     :type input: LayerOutput
     :param param_attr: Parameter config, None if use default.
     :type param_attr: ParameterAttribute
-    :param scale: config scalar, default value is one.
-    :type scale: float
     :return: A DotMulProjection Object.
     :rtype: DotMulProjection
     """
     proj = DotMulProjection(input_layer_name=input.name,
                             size=input.size,
                             **param_attr.attr)
     proj.origin = input
     return proj
 
 def dotmul_operator(x, y, scale=1):
     """
     DotMulOperator takes two inputs and performs element-wise multiplication:
 
     .. math::
-       out.row[i] += scale * (in1.row[i] .* in2.row[i])
+       out.row[i] += scale * (x.row[i] .* y.row[i])
 
     where :math:`.*` means element-wise multiplication, and
     scale is a config scalar, its default value is one.
 
     The example usage is:
 
     .. code-block:: python
 
-       op = dotmul_operator(x, y,
-                            scale=1)
-    :param input: Input layer
-    :type input: LayerOutput
+       op = dotmul_operator(x=layer1, y=layer2, scale=0.5)
+
+    :param x: Input layer1
+    :type x: LayerOutput
+    :param y: Input layer2
+    :type y: LayerOutput
     :param scale: config scalar, default value is one.
     :type scale: float
     :return: A DotMulOperator Object.
@@ -487,7 +488,6 @@ def context_projection(input, context_len, context_start=None,
                              trainable_padding=trainable,
                              **extra_dict)
     proj.origin = input
-    proj.origin.projection = 'context'
     return proj
 
@@ -2667,8 +2667,8 @@ def classification_cost(input, label, name=None,
     return LayerOutput(name, LayerType.COST, parents=[input, label])
 
-def conv_operator(input, filter_size, num_filters,
-                  num_channel=None, stride=1, padding=0,
+def conv_operator(img, filter, filter_size, num_filters,
+                  num_channel=None, stride=1, padding=0, groups=1,
                   filter_size_y=None, stride_y=None, padding_y=None):
     """
     Different from img_conv_layer, conv_op is an Operator, which can be used
@@ -2680,13 +2680,16 @@ def conv_operator(input, filter_size, num_filters,
     .. code-block:: python
 
-       op = conv_operator(input=[layer1, layer2],
+       op = conv_operator(img=input1,
+                          filter=input2,
                           filter_size=3.0,
                           num_filters=64,
                           num_channels=64)
 
-    :param input: Input layer.
-    :type input: LayerOutput|list|tuple
+    :param img: input image
+    :type img: LayerOutput
+    :param filter: input filter
+    :type filter: LayerOutput
     :param filter_size: The x dimension of a filter kernel.
     :type filter_size: int
     :param filter_size_y: The y dimension of a filter kernel. Since
@@ -2708,14 +2711,13 @@ def conv_operator(input, filter_size, num_filters,
     :return: A ConvOperator Object.
     :rtype: ConvOperator
     """
-    assert isinstance(input, list) or isinstance(input, tuple)
     if filter_size_y is None:
         filter_size_y = filter_size
     if stride_y is None:
         stride_y = stride
     if padding_y is None:
         padding_y = padding
-    op = ConvOperator(input_layer_name=[x.name for x in input],
+    op = ConvOperator(input_layer_names=[img.name, filter.name],
                       num_filters = num_filter,
                       conv_conf=Conv(filter_size=filter_size,
                                      padding=padding,
@@ -2723,9 +2725,9 @@ def conv_operator(input, filter_size, num_filters,
                                      channels=num_channel,
                                      filter_size_y=filter_size_y,
                                      padding_y=padding_y,
-                                     stride_y=stride_y))
-    op.origin = input
-    op.origin.operator = "conv_op"
+                                     stride_y=stride_y,
+                                     groups=groups))
+    op.origin = [img, filter]
     return op
...
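With the signature change above, the two inputs of the convolution operator are passed explicitly instead of as a list, and `groups` becomes configurable. A minimal before/after sketch of a call site (`image_layer` and `filter_layer` are illustrative names, not from the diff):

```python
# old style (removed): both inputs packed into a single list
# op = conv_operator(input=[image_layer, filter_layer],
#                    filter_size=3, num_filters=64, num_channel=64)

# new style: image and filter are named explicitly
op = conv_operator(img=image_layer,
                   filter=filter_layer,
                   filter_size=3,
                   num_filters=64,
                   num_channel=64,
                   groups=1)
```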