Commit 5e59ca7c authored by caoying03

fix config helper.

Parent 25083de9
@@ -161,7 +161,17 @@ real CostForOneSequence::forward() {
 }
 
 void CostForOneSequence::backward() {
+  /*
+   * When the softmax layer is the output layer and it is combined with
+   * cross-entropy as the cost, the derivative with regard to softmax's
+   * input is simply:
+   *
+   *   grad_i = softmax_out_i - target_i,
+   *
+   * and here the hard label is used.
+   */
   softmaxOut_->getData()[goldIdsInFinalExpansion_] -= 1.;
+
   MatrixPtr tmp = Matrix::create(
       softmaxOut_->getData(), softmaxOut_->getWidth(), 1, false, false);
......
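The comment added above states the standard identity for a softmax output layer trained with hard-label cross-entropy. A minimal NumPy sketch (illustration only, not part of the commit) of the same in-place subtraction that backward() performs:

    import numpy as np

    def softmax_xent_grad(softmax_out, gold_idx):
        # Gradient of hard-label cross-entropy w.r.t. the softmax input:
        # grad_i = softmax_out_i - target_i, where target is one-hot at
        # gold_idx. backward() does this in place on softmaxOut_.
        grad = softmax_out.copy()
        grad[gold_idx] -= 1.0  # subtract the one-hot target
        return grad

    print(softmax_xent_grad(np.array([0.2, 0.5, 0.3]), gold_idx=1))
    # -> [ 0.2 -0.5  0.3]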
@@ -19,8 +19,8 @@ limitations under the License. */
 
 namespace paddle {
 
+/* This struct stores the beams in all search steps for a single sequence. */
 struct BeamExpansion {
-  // store the entire beam expansion for a single sequence
   std::vector<MatrixPtr> scores;
   std::vector<IVectorPtr> seqInfo;
@@ -111,8 +111,11 @@ private:
   size_t batchSize_;
   size_t beamSize_;
 
-  // Currently, this layer only works on CPU, if its inputs is on GPU,
-  // copy them to CPU memory.
+  /*
+   * The process of constructing beams is not friendly to GPU. Currently,
+   * this layer only runs on CPU; if any of its inputs is in GPU memory,
+   * copy it to CPU memory.
+   */
   std::vector<MatrixPtr> candidateScores_;
   std::vector<MatrixPtr> candidateScoreGrad_;
   std::vector<MatrixPtr> candidateInBeam_;
@@ -120,9 +123,12 @@ private:
   std::vector<IVectorPtr> goldSequence_;
   std::vector<std::vector<int>> beamSplitPos_;
 
-  // split entire bath of beams into beam per sequnence.
+  /*
+   * Split the entire batch of beams into beams per sequence and store the
+   * result into this member.
+   */
   std::vector<BeamExpansion> beamPerSeq_;
-  // beamCosts_ is used to propagate error in one sequence.
+  /* beamCosts_ is used to propagate error in one sequence. */
   std::vector<CostForOneSequence> beamCosts_;
 };
......
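The members above split one flat batch of beams into per-sequence beams. A small Python sketch of what that split accomplishes; the exact offset semantics of beamSplitPos_ are an assumption made here for illustration:

    def split_batch_into_sequences(rows, split_pos):
        # split_pos is assumed to hold the row offset at which each
        # sequence's beam starts inside the whole batch; the last chunk
        # runs to the end of the batch.
        bounds = split_pos + [len(rows)]
        return [rows[b:e] for b, e in zip(bounds, bounds[1:])]

    print(split_batch_into_sequences(list("abcdef"), [0, 2, 5]))
    # -> [['a', 'b'], ['c', 'd', 'e'], ['f']]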
@@ -28,16 +28,10 @@ using namespace paddle;  // NOLINT
 DECLARE_int32(gpu_id);
 DECLARE_bool(thread_local_rand_use_global_seed);
 
-// const size_t MAX_SEQ_NUM = 5;
-// const size_t MAX_SEQ_LEN = 10;
-// const size_t MAX_BEAM_SIZE = 3;
 const size_t MAX_SEQ_NUM = 23;
 const size_t MAX_SEQ_LEN = 50;
 const size_t MAX_BEAM_SIZE = 27;
 
-// const size_t SEED = 1503391792;
-// const size_t SEED = 1;
 const size_t SEED = (size_t)(time(NULL));
 
 struct SingleBeamExpansion {
@@ -176,10 +170,12 @@ void genGroundTruth(vector<SingleBeamExpansion>& beamExpansions,
   beam.resetGroundTruth(seqNum);
   for (size_t i = 0; i < seqNum; ++i) {
     if (randFloat() > 0.5) {
-      // force the randomly generated label falls in the beam by chance 0.5.
-      // otherwise, when sequence length is relatively long and beam size is
-      // relatively small, the gold sequences falls off the beam at in
-      // the first search.
+      /*
+       * Force the randomly generated label to fall in the beam with a
+       * probability of 0.5; otherwise, when the sequence length is
+       * relatively long and the beam size is relatively small, the gold
+       * sequence falls off the beam in the first search.
+       */
       real* begPos = beam.selectedIndices.data() + i * beamSize;
       beam.colIdxInBeam[i] =
           rand() % count_if(begPos, begPos + beamSize, [](const real& val) {
@@ -222,9 +218,7 @@ void genGroundTruth(vector<SingleBeamExpansion>& beamExpansions,
       if (randFloat() > 0.5) {
         // Force the randomly generated label to fall in the beam with
         // probability 0.5.
-        // otherwise, when sequence length is relatively long and beam size is
-        // relatively small, the gold sequences falls off the beam at in
-        // the first search.
 
         real* start =
             curBeam.selectedIndices.data() + curBeam.rowIdxInBeam[j] * beamSize;
         int n = rand() % count_if(start, start + beamSize, [](const real& val) {
@@ -339,7 +333,7 @@ TEST(Layer, CrossEntropyOverBeam) {
   const size_t beamSize = 1 + rand() % MAX_BEAM_SIZE;
   LOG(INFO) << "beamSize = " << beamSize;
 
-  // TODO(caoying): test with more beam expansions.
+  // TODO(caoying): test with random beam expansions.
   const size_t expansionCount = 3;
   vector<SingleBeamExpansion> beams;
   genRandomBeamExpansion(expansionCount, beamSize, beams);
......
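The test forces the gold label into the beam with probability 0.5 by drawing its column index only from valid beam slots, counted via count_if. The predicate passed to count_if is elided in the diff, so the validity check below (val != -1.0 marking an unused slot) is an assumption made for illustration:

    import random

    def pick_col_idx_in_beam(selected_indices):
        # Count the valid beam slots and draw the gold's column index
        # among them, so the gold cannot fall off the beam in the very
        # first search step. The -1.0 sentinel is an assumed convention.
        valid = [v for v in selected_indices if v != -1.0]
        return random.randrange(len(valid))

    beam_row = [3.0, 7.0, 5.0, -1.0, -1.0]  # beamSize = 5, 3 slots filled
    print(pick_col_idx_in_beam(beam_row))   # -> 0, 1, or 2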
@@ -1605,16 +1605,16 @@ class MultiClassCrossEntropySelfNormCostLayer(LayerBase):
 
 @config_layer('cross_entropy_over_beam')
 class CrossEntropyOverBeamLayer(LayerBase):
     def __init__(self, name, inputs, **xargs):
-        config_assert(len(inputs) % 3 == 0, "Error input numbers.")
+        config_assert(len(inputs) % 3 == 0, "Error input number.")
         super(CrossEntropyOverBeamLayer, self).__init__(
             name, 'cross_entropy_over_beam', 0, inputs, **xargs)
         input_num = len(inputs) / 3
         for i in range(input_num):
-            input_layer = self.get_input_layer(i * 2)
-            config_assert(
-                input_layer.size == 1, "Inputs for this layer are made up of "
-                "several pairs and the first one in a pair is scores for "
-                "all the candidates, so its size should be equal to 1.")
+            input_layer = self.get_input_layer(i * 3)
+            config_assert(input_layer.size == 1, (
+                "Inputs for this layer are made up of "
+                "several triples, in which the first one is scores over "
+                "all candidate paths, whose size should be equal to 1."))
 
 
 @config_layer('fc')
......
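This is the config-helper fix named in the commit message: inputs arrive flattened as (candidate_scores, selected_candidates, gold) triples, so the scores layer of expansion i lives at index i * 3, not i * 2. A standalone sketch of the corrected check (the helper name is hypothetical, for illustration only):

    def check_beam_inputs(input_sizes):
        # input_sizes holds the size of each input layer, flattened as
        # triples: (candidate_scores, selected_candidates, gold).
        assert len(input_sizes) % 3 == 0, "Error input number."
        for i in range(len(input_sizes) // 3):
            # The first layer of each triple holds scores over candidate
            # paths and must have size 1; the old code wrongly indexed it
            # with i * 2, checking the wrong layer for every i > 0.
            assert input_sizes[i * 3] == 1, (
                "scores over all candidate paths should have size 1")

    check_beam_inputs([1, 27, 1, 1, 27, 1])  # two expansions: check passes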
@@ -103,6 +103,7 @@ __all__ = [
     'nce_layer',
     'cross_entropy_with_selfnorm',
     'cross_entropy',
+    'BeamInput',
     'cross_entropy_over_beam',
     'multi_binary_label_cross_entropy',
     'sum_cost',
@@ -5681,10 +5682,10 @@ def multi_binary_label_cross_entropy(input,
 
     if input.activation is None or \
             not isinstance(input.activation, SigmoidActivation):
-        logger.log(
-            logging.WARN,
-            "%s is not recommend for multi_binary_label_cross_entropy's activation, "
-            "maybe the sigmoid is better" % repr(input.activation))
+        logger.log(logging.WARN,
+                   ("%s is not a recommended activation for "
+                    "multi_binary_label_cross_entropy, sigmoid is better") %
+                   repr(input.activation))
 
     Layer(
         name=name,
@@ -5699,26 +5700,110 @@ def multi_binary_label_cross_entropy(input,
         size=1)
 
 
+class BeamInput(object):
+    """
+    Define the input for the cross_entropy_over_beam layer.
+
+    A beam is made up of a triple: the first one is scores over all
+    candidates; the second one is indices of the top k selected candidates;
+    the third one is the index of the ground truth, which is also always
+    called the gold.
+    """
+
+    def __init__(self, candidate_scores, selected_candidates, gold):
+        assert isinstance(candidate_scores, LayerOutput)
+        self.candidate_scores = candidate_scores
+        assert candidate_scores.size == 1
+
+        assert isinstance(selected_candidates, LayerOutput)
+        self.selected_candidates = selected_candidates
+
+        assert isinstance(gold, LayerOutput)
+        self.gold = gold
+
+
 @wrap_name_default()
 @layer_support()
-def cross_entropy_over_beam(input, label, name=None, coeff=1.0, weight=None):
-    """
-    TODO(caoying) add comments.
-    """
-
-    assert len(input) / 2 == len(label), "Error input numbers."
-    for i in range(0, len(input), 2):
-        assert (input[i].size == 1), (
-            "Inputs for this layer are made up of "
-            "several pairs and the first one in a pair is scores for "
-            "all the candidates, so its size should be equal to 1.")
-
-    ipts, parents = __cost_input__(input, label, weight)
-    Layer(
-        name=name,
-        type=LayerType.CROSS_ENTROPY_OVER_BEAM,
-        inputs=ipts,
-        coeff=coeff)
+def cross_entropy_over_beam(input, name=None):
+    """
+    This layer is used in learning-to-search models, which solve complex
+    joint prediction problems based on learning to search through a
+    problem-defined search space.
+
+    Specifically, the learning-to-search process for this layer begins with
+    searching a target sequence from a nested sequence. In the first search
+    step, the top beam-size sequences with the highest scores, the indices
+    of these top k sequences in the original nested sequence, and the
+    ground truth (also called the gold) altogether (a triple) make up the
+    first beam.
+
+    Then, several special positions, for example, start and end positions
+    that define meaningful segments, are searched. In these searches, the
+    top k positions with the highest scores are selected, and then the
+    sequences, starting from the selected starts till the ends of the
+    sequences (or a fixed position), are taken to search next.
+
+    We call the possible top k results returned in one search the beam.
+    This search process can be repeated for pre-defined turns and leads to
+    several beam expansions.
+
+    Finally, the cross_entropy_over_beam layer takes all the beam
+    expansions, which contain several candidate targets found along the
+    multi-step search, and calculates cross entropy over the expanded
+    beams, with all the candidates in the beam as the normalization factor.
+
+    Note that if the gold falls off the beam at search step t, then the
+    cost is calculated over the beam at step t.
+
+    This cost layer always works together with kmax_sequence_score_layer,
+    sub_nested_seq_layer, and sequence_slice_layer to trim the input to
+    form a sub-search space.
+
+    The example usage is:
+
+    .. code-block:: python
+
+       cost = cross_entropy_over_beam(input=[
+           BeamInput(
+               candidate_scores=beam1_candidates,
+               selected_candidates=beam1_topk,
+               gold=gold1),
+           BeamInput(
+               candidate_scores=beam2_candidates,
+               selected_candidates=beam2_topk,
+               gold=gold2),
+       ])
+
+    :param input: input beams for this layer.
+    :type input: BeamInput
+    :param name: the name of this layer.
+    :type name: basestring
+    :return: LayerOutput object.
+    :rtype: LayerOutput
+    """
+
+    if isinstance(input, BeamInput):
+        input = [input]
+    else:
+        assert isinstance(input, list), (
+            'input for cross_entropy_over_beam should be a python list '
+            'of BeamInput objects.')
+        for ipt in input:
+            assert isinstance(ipt, BeamInput), (
+                'input for cross_entropy_over_beam '
+                'should be a BeamInput object.')
+
+    ipts = []
+    parents = []
+    for beam in input:
+        parents += [beam.candidate_scores, beam.selected_candidates, beam.gold]
+        ipts += [
+            beam.candidate_scores.name, beam.selected_candidates.name,
+            beam.gold.name
+        ]
+
+    Layer(name=name, type=LayerType.CROSS_ENTROPY_OVER_BEAM, inputs=ipts)
     return LayerOutput(name, LayerType.CROSS_ENTROPY, parents=parents, size=1)
@@ -6247,11 +6332,11 @@ def kmax_sequence_score_layer(input, name=None, beam_size=1):
 @wrap_bias_attr_default()
 def scale_shift_layer(input, name=None, param_attr=None, bias_attr=None):
     """
     A layer applies a linear transformation to each element in each row of
     the input matrix. For each element, the layer first re-scales it and
     then adds a bias to it.
 
     This layer is very like the SlopeInterceptLayer, except the scale and
     bias are trainable.
 
     .. math::
......
@@ -114,27 +114,26 @@ layers {
     input_layer_name: "__kmax_sequence_score_layer_0__"
   }
   inputs {
-    input_layer_name: "__fc_layer_0__"
+    input_layer_name: "sentences_ids"
   }
   inputs {
-    input_layer_name: "__kmax_sequence_score_layer_1__"
+    input_layer_name: "__fc_layer_0__"
   }
   inputs {
-    input_layer_name: "__fc_layer_1__"
+    input_layer_name: "__kmax_sequence_score_layer_1__"
  }
   inputs {
-    input_layer_name: "__kmax_sequence_score_layer_2__"
+    input_layer_name: "start_ids"
   }
   inputs {
-    input_layer_name: "sentences_ids"
+    input_layer_name: "__fc_layer_1__"
   }
   inputs {
-    input_layer_name: "start_ids"
+    input_layer_name: "__kmax_sequence_score_layer_2__"
   }
   inputs {
     input_layer_name: "end_ids"
   }
-  coeff: 1.0
 }
 
 parameters {
   name: "___fc_layer_0__.w0"
@@ -177,8 +176,8 @@ parameters {
   initial_smart: false
 }
 input_layer_names: "sentence_scores"
-input_layer_names: "sentence_states"
 input_layer_names: "sentences_ids"
+input_layer_names: "sentence_states"
 input_layer_names: "start_ids"
 input_layer_names: "end_ids"
 output_layer_names: "__cross_entropy_over_beam_0__"
@@ -198,8 +197,8 @@ sub_models {
   layer_names: "end_ids"
   layer_names: "__cross_entropy_over_beam_0__"
   input_layer_names: "sentence_scores"
-  input_layer_names: "sentence_states"
   input_layer_names: "sentences_ids"
+  input_layer_names: "sentence_states"
   input_layer_names: "start_ids"
   input_layer_names: "end_ids"
   output_layer_names: "__cross_entropy_over_beam_0__"
......
@@ -29,11 +29,17 @@ topk_end_pos_ids = kmax_sequence_score_layer(
 sentence_idx = data_layer(name="sentences_ids", size=1)
 start_idx = data_layer(name="start_ids", size=1)
 end_idx = data_layer(name="end_ids", size=1)
-cost = cross_entropy_over_beam(
-    input=[
-        sentence_scores, topk_sentence_ids, start_pos_scores,
-        topk_start_pos_ids, end_pos_scores, topk_end_pos_ids
-    ],
-    label=[sentence_idx, start_idx, end_idx])
+cost = cross_entropy_over_beam(input=[
+    BeamInput(
+        candidate_scores=sentence_scores,
+        selected_candidates=topk_sentence_ids,
+        gold=sentence_idx),
+    BeamInput(
+        candidate_scores=start_pos_scores,
+        selected_candidates=topk_start_pos_ids,
+        gold=start_idx),
+    BeamInput(
+        candidate_scores=end_pos_scores,
+        selected_candidates=topk_end_pos_ids,
+        gold=end_idx)
+])
 
 outputs(cost)