diff --git a/paddle/gserver/layers/CrossEntropyOverBeam.cpp b/paddle/gserver/layers/CrossEntropyOverBeam.cpp
index 09258fb30599014c6746ad20f0caeed9ff1692f5..f7736f0ce905f88d69598e2cef8e825fbec7de70 100644
--- a/paddle/gserver/layers/CrossEntropyOverBeam.cpp
+++ b/paddle/gserver/layers/CrossEntropyOverBeam.cpp
@@ -161,7 +161,17 @@ real CostForOneSequence::forward() {
 }
 
 void CostForOneSequence::backward() {
+  /*
+   * When the softmax layer is the output layer and is combined with
+   * cross-entropy as the cost, the derivative with regard to softmax's
+   * input is simply:
+   *
+   *   grad_i = softmax_out_i - target_i,
+   *
+   * and here a hard label is used.
+   */
   softmaxOut_->getData()[goldIdsInFinalExpansion_] -= 1.;
+
   MatrixPtr tmp = Matrix::create(
       softmaxOut_->getData(), softmaxOut_->getWidth(), 1, false, false);
 
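Note: the hard-label gradient quoted in the comment above is the standard softmax-with-cross-entropy result. With pre-softmax input :math:`z` and gold index :math:`t`:

.. math::

   \frac{\partial}{\partial z_i}\left(-\log\frac{e^{z_t}}{\sum_j e^{z_j}}\right)
   = \frac{e^{z_i}}{\sum_j e^{z_j}} - \mathbb{1}[i = t]
   = \mathrm{softmax}(z)_i - \mathrm{target}_i

which is why `backward()` only subtracts 1 at `goldIdsInFinalExpansion_` and leaves the remaining softmax outputs as the gradient.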
diff --git a/paddle/gserver/layers/CrossEntropyOverBeam.h b/paddle/gserver/layers/CrossEntropyOverBeam.h
index 96a5df7dfbe46108ef04d23857d155305ad40b56..5d0cffee3c159702a2d6b96de710553b0ede9f6a 100644
--- a/paddle/gserver/layers/CrossEntropyOverBeam.h
+++ b/paddle/gserver/layers/CrossEntropyOverBeam.h
@@ -19,8 +19,8 @@ limitations under the License. */
 
 namespace paddle {
 
+/* This struct stores the beams in all search steps for a single sequence. */
 struct BeamExpansion {
-  // store the entire beam expansion for a single sequence
   std::vector<MatrixPtr> scores;
   std::vector<IVectorPtr> seqInfo;
 
@@ -111,8 +111,11 @@ private:
   size_t batchSize_;
   size_t beamSize_;
 
-  // Currently, this layer only works on CPU, if its inputs is on GPU,
-  // copy them to CPU memory.
+  /*
+   * The process of constructing beams is not GPU-friendly, so currently
+   * this layer only runs on CPU; if any of its inputs is in GPU memory,
+   * it is first copied to CPU memory.
+   */
   std::vector<MatrixPtr> candidateScores_;
   std::vector<MatrixPtr> candidateScoreGrad_;
   std::vector<MatrixPtr> candidateInBeam_;
@@ -120,9 +123,12 @@
   std::vector<IVectorPtr> goldSequence_;
   std::vector<std::vector<int>> beamSplitPos_;
 
-  // split entire bath of beams into beam per sequnence.
+  /*
+   * Split the entire batch of beams into beams per sequence and store the
+   * result in this member.
+   */
   std::vector<BeamExpansion> beamPerSeq_;
-  // beamCosts_ is used to propagate error in one sequence.
+  /* beamCosts_ is used to propagate error in one sequence. */
   std::vector<CostForOneSequence> beamCosts_;
 };
diff --git a/paddle/gserver/tests/test_CrossEntropyOverBeamGrad.cpp b/paddle/gserver/tests/test_CrossEntropyOverBeamGrad.cpp
index 506a4281df4f0a2cead2c83c4754cfb4226b8b80..538d18cdc3d262df0ddb031d9e6b38a3fea57606 100644
--- a/paddle/gserver/tests/test_CrossEntropyOverBeamGrad.cpp
+++ b/paddle/gserver/tests/test_CrossEntropyOverBeamGrad.cpp
@@ -28,16 +28,10 @@ using namespace paddle;  // NOLINT
 DECLARE_int32(gpu_id);
 DECLARE_bool(thread_local_rand_use_global_seed);
 
-// const size_t MAX_SEQ_NUM = 5;
-// const size_t MAX_SEQ_LEN = 10;
-// const size_t MAX_BEAM_SIZE = 3;
-
 const size_t MAX_SEQ_NUM = 23;
 const size_t MAX_SEQ_LEN = 50;
 const size_t MAX_BEAM_SIZE = 27;
 
-// const size_t SEED = 1503391792;
-// const size_t SEED = 1;
 const size_t SEED = (size_t)(time(NULL));
 
 struct SingleBeamExpansion {
@@ -176,10 +170,12 @@ void genGroundTruth(vector<SingleBeamExpansion>& beamExpansions,
   beam.resetGroundTruth(seqNum);
   for (size_t i = 0; i < seqNum; ++i) {
     if (randFloat() > 0.5) {
-      // force the randomly generated label falls in the beam by chance 0.5.
-      // otherwise, when sequence length is relatively long and beam size is
-      // relatively small, the gold sequences falls off the beam at in
-      // the first search.
+      /*
+       * Force the randomly generated label to fall in the beam with a
+       * chance of 0.5. Otherwise, when the sequence length is relatively
+       * long and the beam size is relatively small, the gold sequence
+       * falls off the beam in the first search.
+       */
       real* begPos = beam.selectedIndices.data() + i * beamSize;
       beam.colIdxInBeam[i] =
           rand() % count_if(begPos, begPos + beamSize, [](const real& val) {
@@ -222,9 +218,7 @@
 
     if (randFloat() > 0.5) {
       // force the randomly generated label falls in the beam by chance 0.5.
-      // otherwise, when sequence length is relatively long and beam size is
-      // relatively small, the gold sequences falls off the beam at in
-      // the first search.
+
       real* start =
           curBeam.selectedIndices.data() + curBeam.rowIdxInBeam[j] * beamSize;
       int n = rand() % count_if(start, start + beamSize, [](const real& val) {
@@ -339,7 +333,7 @@ TEST(Layer, CrossEntropyOverBeam) {
   const size_t beamSize = 1 + rand() % MAX_BEAM_SIZE;
   LOG(INFO) << "beamSize = " << beamSize;
 
-  // TODO(caoying): test with more beam expansions.
+  // TODO(caoying): test with random beam expansions.
   const size_t expansionCount = 3;
   vector<SingleBeamExpansion> beams;
   genRandomBeamExpansion(expansionCount, beamSize, beams);
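The 0.5-probability trick in `genGroundTruth` above is what exercises both the gold-in-beam and gold-off-beam code paths. A minimal Python sketch of the same labeling strategy, with a hypothetical helper that is not part of the test:

.. code-block:: python

    import random

    def pick_gold(selected_candidates, num_candidates):
        """With probability 0.5, draw the gold from the beam itself so it
        is guaranteed to stay in the beam; otherwise draw it uniformly, so
        it may fall off the beam when num_candidates is much larger than
        the beam size."""
        if random.random() > 0.5:
            return random.choice(selected_candidates)
        return random.randrange(num_candidates)

    beam = [3, 7, 11]  # candidate indices selected by a beam of size 3
    print(pick_gold(beam, num_candidates=50))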
diff --git a/python/paddle/trainer/config_parser.py b/python/paddle/trainer/config_parser.py
index 7707ece819c9e684e13730e21c8d8c64649e2710..579713546f15e25f5f67979038211aea041b7a92 100644
--- a/python/paddle/trainer/config_parser.py
+++ b/python/paddle/trainer/config_parser.py
@@ -1605,16 +1605,16 @@ class MultiClassCrossEntropySelfNormCostLayer(LayerBase):
 @config_layer('cross_entropy_over_beam')
 class CrossEntropyOverBeamLayer(LayerBase):
     def __init__(self, name, inputs, **xargs):
-        config_assert(len(inputs) % 3 == 0, "Error input numbers.")
+        config_assert(len(inputs) % 3 == 0, "Wrong input number.")
         super(CrossEntropyOverBeamLayer, self).__init__(
             name, 'cross_entropy_over_beam', 0, inputs, **xargs)
         input_num = len(inputs) / 3
         for i in range(input_num):
-            input_layer = self.get_input_layer(i * 2)
-            config_assert(
-                input_layer.size == 1, "Inputs for this layer are made up of "
-                "several pairs and the first one in a pair is scores for "
-                "all the candidates, so its size should be equal to 1.")
+            input_layer = self.get_input_layer(i * 3)
+            config_assert(input_layer.size == 1, (
+                "Inputs for this layer are made up of several triples, in "
+                "which the first one is the scores over all candidate "
+                "paths, whose size should be equal to 1."))
 
 
 @config_layer('fc')
diff --git a/python/paddle/trainer_config_helpers/layers.py b/python/paddle/trainer_config_helpers/layers.py
index b027f84b5d576103b6e03ef6709a6c1f335aabe2..053c92d005f7a7929e7cef35ce0cf7ad62efa760 100755
--- a/python/paddle/trainer_config_helpers/layers.py
+++ b/python/paddle/trainer_config_helpers/layers.py
@@ -103,6 +103,7 @@ __all__ = [
     'nce_layer',
     'cross_entropy_with_selfnorm',
     'cross_entropy',
+    'BeamInput',
     'cross_entropy_over_beam',
     'multi_binary_label_cross_entropy',
     'sum_cost',
@@ -5681,10 +5682,10 @@ def multi_binary_label_cross_entropy(input,
 
     if input.activation is None or \
             not isinstance(input.activation, SigmoidActivation):
-        logger.log(
-            logging.WARN,
-            "%s is not recommend for multi_binary_label_cross_entropy's activation, "
-            "maybe the sigmoid is better" % repr(input.activation))
+        logger.log(logging.WARN,
+                   ("%s is not a recommended activation for "
+                    "multi_binary_label_cross_entropy; sigmoid is better.") %
+                   repr(input.activation))
 
     Layer(
         name=name,
@@ -5699,26 +5700,110 @@ def multi_binary_label_cross_entropy(input,
         size=1)
 
 
+class BeamInput(object):
+    """
+    Define one input for the cross_entropy_over_beam layer.
+
+    A beam is made up of a triple: the first one is the scores over all
+    candidates; the second one is the indices of the top k selected
+    candidates; the third one is the index of the ground truth, which is
+    also called the gold.
+    """
+
+    def __init__(self, candidate_scores, selected_candidates, gold):
+        assert isinstance(candidate_scores, LayerOutput)
+        self.candidate_scores = candidate_scores
+        assert candidate_scores.size == 1
+
+        assert isinstance(selected_candidates, LayerOutput)
+        self.selected_candidates = selected_candidates
+
+        assert isinstance(gold, LayerOutput)
+        self.gold = gold
+
+
 @wrap_name_default()
 @layer_support()
-def cross_entropy_over_beam(input, label, name=None, coeff=1.0, weight=None):
-    """
-    TODO(caoying) add comments.
+def cross_entropy_over_beam(input, name=None):
     """
+    This layer is used in learning-to-search models, which solve complex
+    joint prediction problems based on learning to search through a
+    problem-defined search space.
 
-    assert len(input) / 2 == len(label), "Error input numbers."
-    for i in range(0, len(input), 2):
-        assert (input[i].size == 1), (
-            "Inputs for this layer are made up of "
-            "several pairs and the first one in a pair is scores for "
-            "all the candidates, so its size should be equal to 1.")
+    Specifically, the learning-to-search process for this layer begins with
+    searching a target sequence from a nested sequence. In the first search
+    step, the top beam-size sequences with the highest scores, the indices
+    of these top k sequences in the original nested sequence, and the
+    ground truth (also called gold) altogether form the first beam.
 
-    ipts, parents = __cost_input__(input, label, weight)
-    Layer(
-        name=name,
-        type=LayerType.CROSS_ENTROPY_OVER_BEAM,
-        inputs=ipts,
-        coeff=coeff)
+    Then, several special positions are searched, for example, the start
+    and end positions that define meaningful segments. In these searches,
+    the top k positions with the highest scores are selected, and the
+    sub-sequences running from the selected start positions to the ends of
+    the sequences (or to a fixed position) are taken for the next search.
+
+    We call the possible top k results returned in one search a beam. This
+    search process can be repeated for a pre-defined number of turns, which
+    leads to several beam expansions.
+
+    Finally, the layer cross_entropy_over_beam takes all the beam
+    expansions, which contain the candidate targets found along the
+    multi-step search, and calculates the cross entropy over each expanded
+    beam, with all the candidates in the beam as the normalization factor.
+
+    Note that if the gold falls off the beam at search step t, then the
+    cost is calculated over the beam at step t.
+
+    This cost layer always works together with kmax_sequence_score_layer,
+    sub_nested_seq_layer, and sequence_slice_layer to trim the input to
+    form a sub-search space.
+
+
+    The example usage is:
+
+    .. code-block:: python
+
+       cost = cross_entropy_over_beam(input=[
+           BeamInput(
+               candidate_scores=beam1_candidates,
+               selected_candidates=beam1_topk,
+               gold=gold1),
+           BeamInput(
+               candidate_scores=beam2_candidates,
+               selected_candidates=beam2_topk,
+               gold=gold2),
+       ])
+
+
+    :param input: input beams for this layer.
+    :type input: BeamInput or a list of BeamInput
+    :param name: The name of this layer. It is optional.
+    :type name: basestring
+    :return: LayerOutput object.
+    :rtype: LayerOutput
+    """
+
+    if isinstance(input, BeamInput):
+        input = [input]
+    else:
+        assert isinstance(input, list), (
+            'input for cross_entropy_over_beam should be a python list '
+            'of BeamInput objects.')
+        for ipt in input:
+            assert isinstance(ipt, BeamInput), (
+                'input for cross_entropy_over_beam '
+                'should be a BeamInput object.')
+
+    ipts = []
+    parents = []
+    for beam in input:
+        parents += [beam.candidate_scores, beam.selected_candidates, beam.gold]
+        ipts += [
+            beam.candidate_scores.name, beam.selected_candidates.name,
+            beam.gold.name
+        ]
+
+    Layer(name=name, type=LayerType.CROSS_ENTROPY_OVER_BEAM, inputs=ipts)
     return LayerOutput(name, LayerType.CROSS_ENTROPY, parents=parents, size=1)
 
 
@@ -6247,11 +6332,11 @@ def kmax_sequence_score_layer(input, name=None, beam_size=1):
 @wrap_bias_attr_default()
 def scale_shift_layer(input, name=None, param_attr=None, bias_attr=None):
     """
-    A layer applies a linear transformation to each element in each row of
-    the input matrix. For each element, the layer first re-scale it and then
+    A layer applies a linear transformation to each element in each row of
+    the input matrix. For each element, the layer first re-scales it and then
     adds a bias to it.
 
-    This layer is very like the SlopeInterceptLayer, except the scale and
+    This layer is very similar to SlopeInterceptLayer, except the scale and
     bias are trainable.
 
     .. math::
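To make the cross_entropy_over_beam docstring's "all the candidates in the beam as the normalization factor" concrete, here is a rough NumPy reference of the per-sequence cost. This is a sketch of the semantics described above, not the PaddlePaddle implementation:

.. code-block:: python

    import numpy as np

    def cross_entropy_over_one_beam(candidate_scores, gold_idx):
        """Cross entropy over one beam expansion: every candidate in the
        beam contributes to the softmax normalization."""
        scores = np.asarray(candidate_scores, dtype=np.float64)
        probs = np.exp(scores - scores.max())
        probs /= probs.sum()
        return -np.log(probs[gold_idx])

    # Three search steps for one sequence; the gold stays in the beam here,
    # so the cost is evaluated on the final expansion. If the gold fell off
    # the beam at step t, the same computation would use the beam at step t.
    expansions = [np.random.rand(4) for _ in range(3)]
    print(cross_entropy_over_one_beam(expansions[-1], gold_idx=2))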
diff --git a/python/paddle/trainer_config_helpers/tests/configs/protostr/test_cross_entropy_over_beam.protostr b/python/paddle/trainer_config_helpers/tests/configs/protostr/test_cross_entropy_over_beam.protostr
index e44478ec2ba1fbbcc935f418540441f99fda6d4e..c43fc48e222044b65d83b6162e7dc3954e119887 100644
--- a/python/paddle/trainer_config_helpers/tests/configs/protostr/test_cross_entropy_over_beam.protostr
+++ b/python/paddle/trainer_config_helpers/tests/configs/protostr/test_cross_entropy_over_beam.protostr
@@ -114,27 +114,26 @@ layers {
     input_layer_name: "__kmax_sequence_score_layer_0__"
   }
   inputs {
-    input_layer_name: "__fc_layer_0__"
+    input_layer_name: "sentences_ids"
   }
   inputs {
-    input_layer_name: "__kmax_sequence_score_layer_1__"
+    input_layer_name: "__fc_layer_0__"
   }
   inputs {
-    input_layer_name: "__fc_layer_1__"
+    input_layer_name: "__kmax_sequence_score_layer_1__"
   }
   inputs {
-    input_layer_name: "__kmax_sequence_score_layer_2__"
+    input_layer_name: "start_ids"
   }
   inputs {
-    input_layer_name: "sentences_ids"
+    input_layer_name: "__fc_layer_1__"
   }
   inputs {
-    input_layer_name: "start_ids"
+    input_layer_name: "__kmax_sequence_score_layer_2__"
   }
   inputs {
     input_layer_name: "end_ids"
   }
-  coeff: 1.0
 }
 parameters {
   name: "___fc_layer_0__.w0"
@@ -177,8 +176,8 @@ parameters {
   initial_smart: false
 }
 input_layer_names: "sentence_scores"
-input_layer_names: "sentence_states"
 input_layer_names: "sentences_ids"
+input_layer_names: "sentence_states"
 input_layer_names: "start_ids"
 input_layer_names: "end_ids"
 output_layer_names: "__cross_entropy_over_beam_0__"
@@ -198,8 +197,8 @@ sub_models {
   layer_names: "end_ids"
   layer_names: "__cross_entropy_over_beam_0__"
   input_layer_names: "sentence_scores"
-  input_layer_names: "sentence_states"
   input_layer_names: "sentences_ids"
+  input_layer_names: "sentence_states"
   input_layer_names: "start_ids"
   input_layer_names: "end_ids"
   output_layer_names: "__cross_entropy_over_beam_0__"
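The reordered `inputs` in the protostr above follow the flattened-triple layout that the `i * 2` to `i * 3` fix in config_parser.py also relies on: each beam contributes (candidate scores, selected candidates, gold) back to back. A small self-contained sketch of that layout, with illustrative names only:

.. code-block:: python

    # Packed input layout assumed by CrossEntropyOverBeamLayer:
    #   [scores_0, candidates_0, gold_0, scores_1, candidates_1, gold_1, ...]
    inputs = [
        "scores_0", "candidates_0", "gold_0",
        "scores_1", "candidates_1", "gold_1",
    ]

    assert len(inputs) % 3 == 0
    for i in range(len(inputs) // 3):
        scores = inputs[i * 3]  # the old i * 2 indexing picked wrong layers
        candidates = inputs[i * 3 + 1]
        gold = inputs[i * 3 + 2]
        assert scores == "scores_%d" % i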
diff --git a/python/paddle/trainer_config_helpers/tests/configs/test_cross_entropy_over_beam.py b/python/paddle/trainer_config_helpers/tests/configs/test_cross_entropy_over_beam.py
index edc2d32fca1c911ad72277b5175578565443b783..240e703dc904e718c2c1ddaf2b6d7dccb4dabf41 100644
--- a/python/paddle/trainer_config_helpers/tests/configs/test_cross_entropy_over_beam.py
+++ b/python/paddle/trainer_config_helpers/tests/configs/test_cross_entropy_over_beam.py
@@ -29,11 +29,17 @@ topk_end_pos_ids = kmax_sequence_score_layer(
 sentence_idx = data_layer(name="sentences_ids", size=1)
 start_idx = data_layer(name="start_ids", size=1)
 end_idx = data_layer(name="end_ids", size=1)
-cost = cross_entropy_over_beam(
-    input=[
-        sentence_scores, topk_sentence_ids, start_pos_scores,
-        topk_start_pos_ids, end_pos_scores, topk_end_pos_ids
-    ],
-    label=[sentence_idx, start_idx, end_idx])
+cost = cross_entropy_over_beam(input=[
+    BeamInput(
+        candidate_scores=sentence_scores,
+        selected_candidates=topk_sentence_ids,
+        gold=sentence_idx), BeamInput(
+            candidate_scores=start_pos_scores,
+            selected_candidates=topk_start_pos_ids,
+            gold=start_idx), BeamInput(
+                candidate_scores=end_pos_scores,
+                selected_candidates=topk_end_pos_ids,
+                gold=end_idx)
+])
 
 outputs(cost)
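The config above relies on kmax_sequence_score_layer to produce the `selected_candidates` of each BeamInput. For intuition only, a tiny NumPy sketch of what a k-max selection over candidate scores yields, namely the indices of the top `beam_size` candidates; this is an illustration, not the layer's actual sequence-aware implementation:

.. code-block:: python

    import numpy as np

    def kmax_indices(scores, beam_size):
        """Indices of the beam_size highest-scoring candidates, best first."""
        return np.argsort(-np.asarray(scores))[:beam_size]

    scores = [0.1, 0.9, 0.4, 0.7]
    print(kmax_indices(scores, beam_size=2))  # -> [1 3]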