diff --git a/doc/algorithm/rnn/rnn.rst b/doc/algorithm/rnn/rnn.rst
index 9653ddbf371764df726b4c2db6724cbb80b64861..4753db450b744d3860d69e3928147b0932a9630f 100644
--- a/doc/algorithm/rnn/rnn.rst
+++ b/doc/algorithm/rnn/rnn.rst
@@ -142,12 +142,15 @@ We also project the encoder vector to :code:`decoder_size` dimensional space, ge
 The decoder uses :code:`recurrent_group` to define the recurrent neural network. The step and output functions are defined in :code:`gru_decoder_with_attention`:
 
 .. code-block:: python
-
+    group_inputs=[StaticInput(input=encoded_vector,is_seq=True),
+                  StaticInput(input=encoded_proj,is_seq=True)]
     trg_embedding = embedding_layer(
         input=data_layer(name='target_language_word',
                          size=target_dict_dim),
         size=word_vector_dim,
         param_attr=ParamAttr(name='_target_language_embedding'))
+    group_inputs.append(trg_embedding)
+
     # For decoder equipped with attention mechanism, in training,
     # target embedding (the groundtruth) is the data input,
     # while the encoded source sequence is accessed as an unbounded memory.
@@ -156,13 +159,7 @@ The decoder uses :code:`recurrent_group` to define the recurrent neural network.
     # All sequence inputs should have the same length.
     decoder = recurrent_group(name=decoder_group_name,
                               step=gru_decoder_with_attention,
-                              input=[
-                                  StaticInput(input=encoded_vector,
-                                              is_seq=True),
-                                  StaticInput(input=encoded_proj,
-                                              is_seq=True),
-                                  trg_embedding
-                              ])
+                              input=group_inputs)
 
 The implementation of the step function is listed below. First, it defines the **memory** of the decoder network. Then it defines the attention mechanism, the gated recurrent unit step function, and the output function:
 
@@ -217,10 +214,8 @@ The code is listed below:
 
 .. code-block:: python
 
-    gen_inputs = [StaticInput(input=encoded_vector,
-                              is_seq=True),
-                  StaticInput(input=encoded_proj,
-                              is_seq=True), ]
+    group_inputs=[StaticInput(input=encoded_vector,is_seq=True),
+                  StaticInput(input=encoded_proj,is_seq=True)]
     # In generation, decoder predicts a next target word based on
     # the encoded source sequence and the last generated target word.
     # The encoded source sequence (encoder's output) must be specified by
@@ -231,10 +226,10 @@ The code is listed below:
                                size=target_dict_dim,
                                embedding_name='_target_language_embedding',
                                embedding_size=word_vector_dim)
-    gen_inputs.append(trg_embedding)
+    group_inputs.append(trg_embedding)
     beam_gen = beam_search(name=decoder_group_name,
                            step=gru_decoder_with_attention,
-                           input=gen_inputs,
+                           input=group_inputs,
                            id_input=data_layer(name="sent_id", size=1),
                            dict_file=trg_dict_path,
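Read together, the training and generation snippets in rnn.rst now differ only in the last entry of group_inputs. A condensed sketch of that shared structure is given below; it is not part of the patch. The is_generating flag is purely illustrative, GeneratedInput is inferred from the keyword arguments shown in the generation hunk rather than quoted from it, and symbols such as encoded_vector, decoder_group_name and trg_dict_path come from the surrounding seqToseq configuration:

    from paddle.trainer_config_helpers import *

    # Both modes reuse the encoder outputs as read-only (static) group inputs.
    group_inputs = [StaticInput(input=encoded_vector, is_seq=True),
                    StaticInput(input=encoded_proj, is_seq=True)]

    if not is_generating:   # illustrative flag, not from the patch
        # Training: the ground-truth target embedding is the third group input.
        trg_embedding = embedding_layer(
            input=data_layer(name='target_language_word', size=target_dict_dim),
            size=word_vector_dim,
            param_attr=ParamAttr(name='_target_language_embedding'))
        group_inputs.append(trg_embedding)
        decoder = recurrent_group(name=decoder_group_name,
                                  step=gru_decoder_with_attention,
                                  input=group_inputs)
    else:
        # Generation: the decoder is fed back its own previous predictions.
        trg_embedding = GeneratedInput(size=target_dict_dim,   # assumed helper, see note above
                                       embedding_name='_target_language_embedding',
                                       embedding_size=word_vector_dim)
        group_inputs.append(trg_embedding)
        beam_gen = beam_search(name=decoder_group_name,
                               step=gru_decoder_with_attention,
                               input=group_inputs,
                               id_input=data_layer(name="sent_id", size=1),
                               dict_file=trg_dict_path)  # remaining beam-search options omitted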
diff --git a/doc/ui/api/trainer_config_helpers/layers.rst b/doc/ui/api/trainer_config_helpers/layers.rst
index f902d1c995bc5045d62d0b2e279ee612f9dc7c93..c1d7a7ce815301be7d4193560fc6c27d90cf6e69 100644
--- a/doc/ui/api/trainer_config_helpers/layers.rst
+++ b/doc/ui/api/trainer_config_helpers/layers.rst
@@ -169,6 +169,12 @@ dotmul_projection
     :members: dotmul_projection
     :noindex:
 
+dotmul_operator
+---------------
+.. automodule:: paddle.trainer_config_helpers.layers
+    :members: dotmul_operator
+    :noindex:
+
 full_matrix_projection
 ----------------------
 .. automodule:: paddle.trainer_config_helpers.layers
diff --git a/paddle/gserver/layers/CudnnConvLayer.cpp b/paddle/gserver/layers/CudnnConvLayer.cpp
index a74e6ba38dfc63aff715e6a83fa6d2e5e7b979dd..0f932f960f6bacb5fc80273e5dfedf86bfb9d152 100644
--- a/paddle/gserver/layers/CudnnConvLayer.cpp
+++ b/paddle/gserver/layers/CudnnConvLayer.cpp
@@ -85,6 +85,7 @@ bool CudnnConvLayer::init(const LayerMap &layerMap,
     biasOffset_ = numFilters_ / groups_[0];
   }
 
+  batchNum_ = 0;
   isSelectAlgo_ = false;
   return true;
 }
@@ -132,6 +133,11 @@ void CudnnConvLayer::reshape(int batchSize) {
   getOutput().setFrameHeight(outputH_);
   getOutput().setFrameWidth(outputW_);
 
+  // If the batchSize remains the same, set isSelectAlgo_ true.
+  // Otherwise, set isSelectAlgo_ false and select the algorithms again.
+  isSelectAlgo_ = (batchSize == batchNum_);
+  batchNum_ = batchSize;
+
   size_t maxWorkSpace = 0;
   for (size_t i = 0; i < inputLayers_.size(); i++) {
     CHECK_EQ(inputLayers_[i]->getOutput().value->getWidth(),
@@ -160,6 +166,10 @@ void CudnnConvLayer::reshape(int batchSize) {
 
       maxWorkSpace = std::max(fwdLimitBytes_[i], bwdDataLimitBytes_[i]);
       maxWorkSpace = std::max(maxWorkSpace, bwdFilterLimitBytes_[i]);
+
+      VLOG(3) << getName() << " Fwd / BwdData / BwdFilter algo: " << fwdAlgo_[i]
+              << " / " << bwdDataAlgo_[i]
+              << " / " << bwdFilterAlgo_[i];
     }
   }
 
diff --git a/paddle/gserver/layers/CudnnConvLayer.h b/paddle/gserver/layers/CudnnConvLayer.h
index 2c72ba885ed1043d15f1e61fb32666ec001e9242..a6dadba10daa49d03e4a52a9c028a87400ca23ea 100644
--- a/paddle/gserver/layers/CudnnConvLayer.h
+++ b/paddle/gserver/layers/CudnnConvLayer.h
@@ -87,6 +87,10 @@ protected:
   /// Whether the conv algorithms have already been selected.
   bool isSelectAlgo_;
 
+  /// batchNum_ records the batch size. If the batch size changes,
+  /// the convolution algorithms will be selected again.
+  int batchNum_;
+
 public:
   explicit CudnnConvLayer(const LayerConfig& config) : ConvBaseLayer(config) {}
diff --git a/paddle/gserver/layers/MultinomialSampler.cpp b/paddle/gserver/layers/MultinomialSampler.cpp
index 710772c0cf476f3b2dee790dc3f8254ee2452b0c..518dc0c60cbdc2a95b7eb9c8ff33dd6a9fb87c98 100644
--- a/paddle/gserver/layers/MultinomialSampler.cpp
+++ b/paddle/gserver/layers/MultinomialSampler.cpp
@@ -19,7 +19,7 @@ namespace paddle {
 
 MultinomialSampler::MultinomialSampler(const real* prob, int size)
     : rand_(0.0, size) {
-  intervals_.reserve(size + 1);
+  intervals_.resize(size + 1);
   double sum = 0;
   for (int i = 0; i < size; ++i) {
     sum += prob[i];
@@ -50,12 +50,13 @@ MultinomialSampler::MultinomialSampler(const real* prob, int size)
   int bigPos = nextBigPos(0);
 
   auto fillIntervals = [&]() {
-    while (bigPos < size && smallPos < size) {
+    while (bigPos < size) {
       while (intervals_[bigPos].thresh > 1 && smallPos < size) {
         intervals_[smallPos].otherId = bigPos;
         intervals_[bigPos].thresh -= 1 - intervals_[smallPos].thresh;
         smallPos = nextSmallPos(smallPos + 1);
       }
+      if (smallPos >= size) break;
       bigPos = nextBigPos(bigPos + 1);
       // If intervals_[bigPos].thresh < 1, it becomes a small interval
     }
diff --git a/paddle/gserver/tests/test_MultinomialSampler.cpp b/paddle/gserver/tests/test_MultinomialSampler.cpp
index 39a90958331f6cc3a19c12342f9c280e467a066e..73b4d0b8b7110d4ab79809875e2481cd2b565a68 100644
--- a/paddle/gserver/tests/test_MultinomialSampler.cpp
+++ b/paddle/gserver/tests/test_MultinomialSampler.cpp
@@ -41,39 +41,42 @@ public:
 TEST(MultinomialSampler, gen) {
   int numGrids = 1024 * 1024;
   int size = 1024 * 4;
-
   default_random_engine reng;
-  uniform_int_distribution<int> rand(1, numGrids / size * 1.8);
-  vector<real> prob;
-  int sum = 0;
-  for (int i = 0; i < size; ++i) {
-    prob.push_back(rand(reng));
-    sum += prob.back();
-  }
-  CHECK_LE(sum, numGrids);
-  prob.back() += numGrids - sum;
-  vector<int> counts(size);
-  MultinomialSamplerTester sampler(&prob[0], size);
-  counts.assign(size, 0);
-  {
-    double s = (double)size / (double)numGrids;
-    REGISTER_TIMER("MultinomialSampler");
-    for (double i = 0; i < numGrids; ++i) {
-      int ret = sampler.testGen([i, s]() { return s * i; });
-      if (ret < 0 || ret >= size) {
-        EXPECT_GE(ret, 0);
-        EXPECT_LT(ret, size);
-        break;
+
+  for (size_t iter = 0; iter < 256; ++iter) {
+    uniform_int_distribution<int> rand(1, numGrids / size * 1.8);
+    vector<real> prob;
+    int sum = 0;
+    for (int i = 0; i < size; ++i) {
+      prob.push_back(rand(reng));
+      sum += prob.back();
+    }
+
+    CHECK_LE(sum, numGrids);
+    prob.back() += numGrids - sum;
+
+    vector<int> counts(size);
+    MultinomialSamplerTester sampler(&prob[0], size);
+    counts.assign(size, 0);
+    {
+      double s = (double)size / (double)numGrids;
+      REGISTER_TIMER("MultinomialSampler");
+      for (double i = 0; i < numGrids; ++i) {
+        int ret = sampler.testGen([i, s]() { return s * i; });
+        if (ret < 0 || ret >= size) {
+          EXPECT_GE(ret, 0);
+          EXPECT_LT(ret, size);
+          break;
+        }
+        ++counts[ret];
       }
-      ++counts[ret];
     }
-  }
-  for (int i = 0; i < size; ++i) {
-    if (prob[i] != counts[i]) {
-      EXPECT_EQ(prob[i], counts[i]);
-      LOG(INFO) << "i=" << i;
-      break;
+
+    for (int i = 0; i < size; ++i) {
+      if (prob[i] != counts[i]) {
+        EXPECT_EQ(prob[i], counts[i]);
+        LOG(INFO) << iter;
+        break;
+      }
     }
   }
 }
@@ -135,6 +138,7 @@ void benchmarkRandom() {
   LOG(INFO) << "sum1=" << sum1;
 }
 
+
 int main(int argc, char** argv) {
   initMain(argc, argv);
   testing::InitGoogleTest(&argc, argv);
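The intervals_ table built by this constructor, one thresh and one otherId per cell, follows the alias-table approach to multinomial sampling, and the added break guards the case where the small cells run out before the big ones. As orientation only, not the PaddlePaddle implementation, a minimal Python sketch of the same idea:

    import random

    def build_alias_table(prob):
        """Build (thresh, other) so that sample() draws index i with probability prob[i]."""
        n = len(prob)
        total = float(sum(prob))
        scaled = [p * n / total for p in prob]      # rescale so the mean cell weight is 1.0
        thresh = [1.0] * n                          # leftover cells keep their whole slot
        other = list(range(n))
        small = [i for i, s in enumerate(scaled) if s < 1.0]
        big = [i for i, s in enumerate(scaled) if s >= 1.0]
        # Pair each "small" cell with a "big" one; the loop must stop as soon as
        # either list is exhausted -- the analogue of the added break above.
        while small and big:
            s, b = small.pop(), big.pop()
            thresh[s] = scaled[s]
            other[s] = b
            scaled[b] -= 1.0 - scaled[s]
            (small if scaled[b] < 1.0 else big).append(b)
        return thresh, other

    def sample(thresh, other):
        i = random.randrange(len(thresh))
        return i if random.random() < thresh[i] else other[i]

    if __name__ == "__main__":
        thresh, other = build_alias_table([1, 2, 3, 4])
        draws = [sample(thresh, other) for _ in range(100000)]
        # Frequencies should be roughly 0.1 / 0.2 / 0.3 / 0.4.
        print([draws.count(i) / 100000.0 for i in range(4)])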
diff --git a/python/paddle/trainer/config_parser.py b/python/paddle/trainer/config_parser.py
index 4ce01e005ae3ca549bb39c149e4ebf3cb04f8c1c..a57e9065c6f980b0338bd4ed0a91160fa7bed94f 100644
--- a/python/paddle/trainer/config_parser.py
+++ b/python/paddle/trainer/config_parser.py
@@ -636,7 +636,6 @@ class Operator(Cfg):
             input_layer_names,
             ):
         self.add_keys(locals())
-
         self.operator_conf = OperatorConfig()
         self.operator_conf.type = self.type
 
@@ -686,12 +685,15 @@ class ConvOperator(Operator):
         if num_filters is not None:
             self.operator_conf.num_filters = num_filters
 
-        parse_conv(conv_conf, input_layer_names[0], self.operator_conf.conv_conf, True)
+        parse_conv(conv_conf,
+                   MakeLayerNameInSubmodel(input_layer_names[0]),
+                   self.operator_conf.conv_conf)
         self.operator_conf.output_size = (self.operator_conf.conv_conf.output_x ** 2) * num_filters
 
         config_assert(len(input_layer_names) == 2, "Conv is binary operator")
-
+    def calc_output_size(self, input_sizes):
+        return self.operator_conf.output_size
 
 # please refer to the comments in proto/ModelConfig.proto
@@ -2462,11 +2464,11 @@ class MixedLayer(LayerBase):
                 if size != 0:
                     self.set_layer_size(size)
             else:
-                size = operator.calc_output_size(operator_conf.input_sizes)
-                if size != 0:
-                    config_assert(size == self.config.size,
+                sz = operator.calc_output_size(operator_conf.input_sizes)
+                if sz != 0:
+                    config_assert(sz == self.config.size,
                                   "different inputs have different size: %s vs. %s" %
-                                  (size, self.config.size))
+                                  (sz, self.config.size))
         for input_index in xrange(len(self.inputs)):
             input_layer = self.get_input_layer(input_index)
             input = self.inputs[input_index]
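The new calc_output_size hook returns (conv_conf.output_x ** 2) * num_filters, and the MixedLayer branch above asserts that this equals the configured layer size. A worked example with illustrative numbers (not taken from the patch), using the usual convolution arithmetic:

    # Illustrative numbers only: 32x32 input, 3x3 filter, stride 1, no padding.
    output_x = (32 - 3) // 1 + 1                   # 30
    num_filters = 64
    output_size = (output_x ** 2) * num_filters    # 57600 -- what calc_output_size() reports
    # A mixed_layer that uses this ConvOperator must therefore be configured with
    # size 57600, or the config_assert above fires.
    print(output_size)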
diff --git a/python/paddle/trainer_config_helpers/layers.py b/python/paddle/trainer_config_helpers/layers.py
index fab7e6e091863fdad6d81f4c63f12132c2be5161..8b7cabf2fad507b15c820ffa44f29f44e44f407e 100644
--- a/python/paddle/trainer_config_helpers/layers.py
+++ b/python/paddle/trainer_config_helpers/layers.py
@@ -286,7 +286,6 @@ def full_matrix_projection(input, size=0, param_attr=None):
                             size=size,
                             **param_attr.attr)
     proj.origin = input
-    proj.origin.projection = "matrix"
     return proj
 
 
@@ -333,7 +332,6 @@ def table_projection(input, size=0, param_attr=None):
                            size=size,
                            **param_attr.attr)
     proj.origin = input
-    proj.origin.projection = "table"
     return proj
 
 
@@ -377,17 +375,15 @@ def identity_projection(input, offset=None):
     if offset is None:
         proj = IdentityProjection(input_layer_name=input.name)
         proj.origin = input
-        proj.origin.projection = 'identity'
     else:
         proj = IdentityOffsetProjection(input_layer_name=input.name,
                                         offset=offset)
         proj.origin = input
-        proj.origin.projection = 'identity_offset'
     return proj
 
 
 @wrap_param_attr_default()
-def dotmul_projection(input, param_attr=None, scale=1):
+def dotmul_projection(input, param_attr=None):
     """
     DotMulProjection with a layer as input.
     It performs element-wise multiplication with weight.
@@ -407,30 +403,35 @@ def dotmul_projection(input, param_attr=None, scale=1):
     :type input: LayerOutput
     :param param_attr: Parameter config, None if use default.
     :type param_attr: ParameterAttribute
-    :param scale: config scalar, default value is one.
-    :type scale: float
     :return: A DotMulProjection Object.
     :rtype: DotMulProjection
     """
     proj = DotMulProjection(input_layer_name=input.name,
-                           size=input.size,
-                           **param_attr.attr)
-    proj.origin = input
+                            size=input.size,
+                            **param_attr.attr)
+    proj.origin = input
     return proj
 
 
 def dotmul_operator(x, y, scale=1):
     """
     DotMulOperator takes two inputs and performs element-wise multiplication:
+
     .. math::
-       out.row[i] += scale * (in1.row[i] .* in2.row[i])
+       out.row[i] += scale * (x.row[i] .* y.row[i])
+
     where :math:`.*` means element-wise multiplication, and
     scale is a config scalar, its default value is one.
+
     The example usage is:
+
     .. code-block:: python
-       op = dotmul_operator(x, y,
-                            scale=1)
-    :param input: Input layer
-    :type input: LayerOutput
+
+       op = dotmul_operator(x=layer1, y=layer2, scale=0.5)
+
+    :param x: The first input layer.
+    :type x: LayerOutput
+    :param y: The second input layer.
+    :type y: LayerOutput
     :param scale: config scalar, default value is one.
     :type scale: float
     :return: A DotMulOperator Object.
@@ -487,7 +488,6 @@ def context_projection(input, context_len, context_start=None,
                              trainable_padding=trainable,
                              **extra_dict)
     proj.origin = input
-    proj.origin.projection = 'context'
     return proj
 
 
@@ -2667,8 +2667,8 @@ def classification_cost(input, label, name=None,
     return LayerOutput(name, LayerType.COST, parents=[input, label])
 
 
-def conv_operator(input, filter_size, num_filters,
-                  num_channel=None, stride=1, padding=0,
+def conv_operator(img, filter, filter_size, num_filters,
+                  num_channel=None, stride=1, padding=0, groups=1,
                   filter_size_y=None, stride_y=None, padding_y=None):
     """
     Different from img_conv_layer, conv_op is an Operator, which can be used
@@ -2680,13 +2680,16 @@ def conv_operator(input, filter_size, num_filters,
 
     .. code-block:: python
 
-       op = conv_operator(input=[layer1, layer2],
+       op = conv_operator(img=input1,
+                          filter=input2,
                           filter_size=3.0,
                           num_filters=64,
                           num_channels=64)
 
-    :param input: Input layer.
-    :type input: LayerOutput|list|tuple
+    :param img: The input image layer.
+    :type img: LayerOutput
+    :param filter: The input filter layer.
+    :type filter: LayerOutput
     :param filter_size: The x dimension of a filter kernel.
     :type filter_size: int
    :param filter_size_y: The y dimension of a filter kernel. Since
@@ -2708,14 +2711,13 @@ def conv_operator(input, filter_size, num_filters,
    :return: A ConvOperator Object.
    :rtype: ConvOperator
    """
-    assert isinstance(input, list) or isinstance(input, tuple)
    if filter_size_y is None:
        filter_size_y = filter_size
    if stride_y is None:
        stride_y = stride
    if padding_y is None:
        padding_y = padding
-    op = ConvOperator(input_layer_name=[x.name for x in input],
+    op = ConvOperator(input_layer_names=[img.name, filter.name],
                      num_filters=num_filters,
                      conv_conf=Conv(filter_size=filter_size,
                                     padding=padding,
@@ -2723,9 +2725,9 @@ def conv_operator(input, filter_size, num_filters,
                                     channels=num_channel,
                                     filter_size_y=filter_size_y,
                                     padding_y=padding_y,
-                                     stride_y=stride_y))
-    op.origin = input
-    op.origin.operator = "conv_op"
+                                     stride_y=stride_y,
+                                     groups=groups))
+    op.origin = [img, filter]
     return op
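Finally, a hedged sketch of how the reworked operators might be wired into a model config, assuming mixed_layer accepts operators in its input list as the MixedLayer changes in config_parser.py suggest. The layer names and sizes are hypothetical, the filter-layer layout is an assumption, and the mixed_layer sizes reuse the 32x32 arithmetic worked out above; only the dotmul_operator and conv_operator keyword signatures come from this patch:

    from paddle.trainer_config_helpers import *

    # Hypothetical inputs.
    vec_a = data_layer(name='vec_a', size=128)
    vec_b = data_layer(name='vec_b', size=128)
    image = data_layer(name='image', size=64 * 32 * 32)                  # 64 channels, 32x32
    conv_filter = data_layer(name='conv_filter', size=64 * 64 * 3 * 3)   # assumed filter layout

    # Element-wise product of two equally sized layers; the result keeps their size.
    prod = mixed_layer(input=[dotmul_operator(x=vec_a, y=vec_b, scale=0.5)],
                       size=128)

    # Convolution expressed as an operator. The mixed_layer size must equal the
    # operator's calc_output_size(), i.e. (output_x ** 2) * num_filters = 30 * 30 * 64.
    conv = mixed_layer(input=[conv_operator(img=image,
                                            filter=conv_filter,
                                            filter_size=3,
                                            num_filters=64,
                                            num_channel=64)],
                       size=(30 ** 2) * 64)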