提交 5e59ca7c 编写于 作者: C caoying03

fix config helper.

上级 25083de9
......@@ -161,7 +161,17 @@ real CostForOneSequence::forward() {
}
void CostForOneSequence::backward() {
/*
* when the softmax layer is the output layer and it is combined with
* cross-entropy as the cost, the derivative with respect to the softmax's
* input is simply:
*
* grad_i = softmax_out_i - target_i,
*
* and a hard label is used here, so only the gold entry is decremented by 1.
*/
softmaxOut_->getData()[goldIdsInFinalExpansion_] -= 1.;
MatrixPtr tmp = Matrix::create(
softmaxOut_->getData(), softmaxOut_->getWidth(), 1, false, false);
......
......@@ -19,8 +19,8 @@ limitations under the License. */
namespace paddle {
/* This struct stores the beams in all search steps for a single sequence. */
struct BeamExpansion {
// store the entire beam expansion for a single sequence
std::vector<MatrixPtr> scores;
std::vector<IVectorPtr> seqInfo;
......@@ -111,8 +111,11 @@ private:
size_t batchSize_;
size_t beamSize_;
// Currently, this layer only works on CPU, if its inputs is on GPU,
// copy them to CPU memory.
/*
* the process of constructing beams is not friendly to GPU, so currently this
* layer only runs on CPU. If any of its inputs is in GPU memory, it is copied
* to CPU memory.
*/
std::vector<MatrixPtr> candidateScores_;
std::vector<MatrixPtr> candidateScoreGrad_;
std::vector<MatrixPtr> candidateInBeam_;
......@@ -120,9 +123,12 @@ private:
std::vector<IVectorPtr> goldSequence_;
std::vector<std::vector<int>> beamSplitPos_;
// split entire bath of beams into beam per sequnence.
/*
* split the entire batch of beams into one beam per sequence and store the
* result into this member.
*/
std::vector<BeamExpansion> beamPerSeq_;
// beamCosts_ is used to propagate error in one sequence.
/* beamCosts_ is used to propagate error in one sequence. */
std::vector<CostForOneSequence> beamCosts_;
};
......
......@@ -28,16 +28,10 @@ using namespace paddle; // NOLINT
DECLARE_int32(gpu_id);
DECLARE_bool(thread_local_rand_use_global_seed);
// const size_t MAX_SEQ_NUM = 5;
// const size_t MAX_SEQ_LEN = 10;
// const size_t MAX_BEAM_SIZE = 3;
const size_t MAX_SEQ_NUM = 23;
const size_t MAX_SEQ_LEN = 50;
const size_t MAX_BEAM_SIZE = 27;
// const size_t SEED = 1503391792;
// const size_t SEED = 1;
const size_t SEED = (size_t)(time(NULL));
struct SingleBeamExpansion {
......@@ -176,10 +170,12 @@ void genGroundTruth(vector<SingleBeamExpansion>& beamExpansions,
beam.resetGroundTruth(seqNum);
for (size_t i = 0; i < seqNum; ++i) {
if (randFloat() > 0.5) {
// force the randomly generated label falls in the beam by chance 0.5.
// otherwise, when sequence length is relatively long and beam size is
// relatively small, the gold sequences falls off the beam at in
// the first search.
/*
* force the randomly generated label to fall in the beam with probability
* 0.5. otherwise, when the sequence length is relatively long and the beam
* size is relatively small, the gold sequence falls off the beam in the
* first search.
*/
real* begPos = beam.selectedIndices.data() + i * beamSize;
beam.colIdxInBeam[i] =
rand() % count_if(begPos, begPos + beamSize, [](const real& val) {
......@@ -222,9 +218,7 @@ void genGroundTruth(vector<SingleBeamExpansion>& beamExpansions,
if (randFloat() > 0.5) {
// force the randomly generated label to fall in the beam with probability
// 0.5. otherwise, when the sequence length is relatively long and the beam
// size is relatively small, the gold sequence falls off the beam in the
// first search.
real* start =
curBeam.selectedIndices.data() + curBeam.rowIdxInBeam[j] * beamSize;
int n = rand() % count_if(start, start + beamSize, [](const real& val) {
......@@ -339,7 +333,7 @@ TEST(Layer, CrossEntropyOverBeam) {
const size_t beamSize = 1 + rand() % MAX_BEAM_SIZE;
LOG(INFO) << "beamSize = " << beamSize;
// TODO(caoying): test with more beam expansions.
// TODO(caoying): test with random beam expansions.
const size_t expansionCount = 3;
vector<SingleBeamExpansion> beams;
genRandomBeamExpansion(expansionCount, beamSize, beams);
......
......@@ -1605,16 +1605,16 @@ class MultiClassCrossEntropySelfNormCostLayer(LayerBase):
@config_layer('cross_entropy_over_beam')
class CrossEntropyOverBeamLayer(LayerBase):
def __init__(self, name, inputs, **xargs):
config_assert(len(inputs) % 3 == 0, "Error input numbers.")
config_assert(len(inputs) % 3 == 0, "Error input number.")
super(CrossEntropyOverBeamLayer, self).__init__(
name, 'cross_entropy_over_beam', 0, inputs, **xargs)
input_num = len(inputs) / 3
for i in range(input_num):
input_layer = self.get_input_layer(i * 2)
config_assert(
input_layer.size == 1, "Inputs for this layer are made up of "
"several pairs and the first one in a pair is scores for "
"all the candidates, so its size should be equal to 1.")
input_layer = self.get_input_layer(i * 3)
config_assert(input_layer.size == 1, (
"Inputs for this layer are made up of "
"several triples, in which the first one is scores over "
"all candidate paths, whose size should be equal to 1."))
@config_layer('fc')
......
......@@ -103,6 +103,7 @@ __all__ = [
'nce_layer',
'cross_entropy_with_selfnorm',
'cross_entropy',
'BeamInput',
'cross_entropy_over_beam',
'multi_binary_label_cross_entropy',
'sum_cost',
......@@ -5681,10 +5682,10 @@ def multi_binary_label_cross_entropy(input,
if input.activation is None or \
not isinstance(input.activation, SigmoidActivation):
logger.log(
logging.WARN,
"%s is not recommend for multi_binary_label_cross_entropy's activation, "
"maybe the sigmoid is better" % repr(input.activation))
logger.log(logging.WARN,
("%s is not a recommended activation for "
"multi_binary_label_cross_entropy, sigmoid is better") %
repr(input.activation))
Layer(
name=name,
......@@ -5699,26 +5700,110 @@ def multi_binary_label_cross_entropy(input,
size=1)
class BeamInput(object):
    """
    Describe one beam fed to the cross_entropy_over_beam layer.

    A beam is a triple of layer outputs:

    - candidate_scores: scores over all the candidates. Its size must be 1.
    - selected_candidates: indices of the top k selected candidates.
    - gold: index of the ground truth, which is usually called gold.
    """

    def __init__(self, candidate_scores, selected_candidates, gold):
        # Validate all three members first, in the same order as they are
        # declared, and only then store them on the instance.
        assert isinstance(candidate_scores, LayerOutput)
        assert candidate_scores.size == 1
        for layer in (selected_candidates, gold):
            assert isinstance(layer, LayerOutput)

        self.candidate_scores = candidate_scores
        self.selected_candidates = selected_candidates
        self.gold = gold
@wrap_name_default()
@layer_support()
def cross_entropy_over_beam(input, label, name=None, coeff=1.0, weight=None):
"""
TODO(caoying) add comments.
def cross_entropy_over_beam(input, name=None):
"""
This layer is used in learning to search models, which is to solve complex
joint prediction problems based on learning to search through a
problem-defined search space.
assert len(input) / 2 == len(label), "Error input numbers."
for i in range(0, len(input), 2):
assert (input[i].size == 1), (
"Inputs for this layer are made up of "
"several pairs and the first one in a pair is scores for "
"all the candidates, so its size should be equal to 1.")
Specifically, the learning to search process for this layer begins with
searching a target sequence from a nested sequence. In the first search
step, the top beam size sequences with the highest scores, the indices of
these top k sequences in the original nested sequence, and the ground truth
(also called gold) together form a triple that makes up the first beam.
ipts, parents = __cost_input__(input, label, weight)
Layer(
name=name,
type=LayerType.CROSS_ENTROPY_OVER_BEAM,
inputs=ipts,
coeff=coeff)
Then, several special positions, for example, start and end positions
that define meaningful segments, are searched. In these searches, the top k
positions with the highest scores are selected, and then the sequences, from
the selected start positions to the ends of the sequences (or to a fixed
position), are taken as the search space for the next step.
We call the possible top k results returned in one search the beam. This
search process can be repeated for pre-defined turns and leads to several
beam expansions.
Finally, the layer cross_entropy_over_beam takes all the beam expansions,
which contain several candidate targets found along the multi-step search.
cross_entropy_over_beam calculates cross entropy over the expanded beams,
using all the candidates in the beam as the normalization factor.
Note that, if gold falls off the beam at search step t, then the cost is
calculated over the beam at step t.
This cost layer always works together with kmax_sequence_score_layer,
sub_nested_seq_layer, and sequence_slice_layer to trim the input to form a
sub-search space.
The example usage is:
.. code-block:: python
cost = cross_entropy_over_beam(input=[
BeamInput(
candidate_scores=beam1_candidates,
selected_candidates=beam1_topk,
gold=gold1),
BeamInput(
candidate_scores=beam2_candidates,
selected_candidates=beam2_topk,
gold=gold2),
])
:param input: input beams for this layer.
:type input: BeamInput | list of BeamInput
:param name: the name of this layer.
:type name: basestring
:return: LayerOutput object.
:rtype: LayerOutput
"""
if isinstance(input, BeamInput):
input = [input]
else:
assert isinstance(input, list), (
'input for cross_entropy_over_beam shold be a python list '
'of BeamInput object.')
for ipt in input:
assert isinstance(ipt, BeamInput), (
'input for cross_entropy_over_beam '
'should be a BeamInput object.')
ipts = []
parents = []
for beam in input:
parents += [beam.candidate_scores, beam.selected_candidates, beam.gold]
ipts += [
beam.candidate_scores.name, beam.selected_candidates.name,
beam.gold.name
]
Layer(name=name, type=LayerType.CROSS_ENTROPY_OVER_BEAM, inputs=ipts)
return LayerOutput(name, LayerType.CROSS_ENTROPY, parents=parents, size=1)
......@@ -6247,11 +6332,11 @@ def kmax_sequence_score_layer(input, name=None, beam_size=1):
@wrap_bias_attr_default()
def scale_shift_layer(input, name=None, param_attr=None, bias_attr=None):
"""
A layer applies a linear transformation to each element in each row of
the input matrix. For each element, the layer first re-scale it and then
A layer that applies a linear transformation to each element in each row of
the input matrix. For each element, the layer first re-scales it and then
adds a bias to it.
This layer is very like the SlopeInterceptLayer, except the scale and
This layer is very similar to the SlopeInterceptLayer, except that the scale
and bias are trainable.
.. math::
......
......@@ -114,27 +114,26 @@ layers {
input_layer_name: "__kmax_sequence_score_layer_0__"
}
inputs {
input_layer_name: "__fc_layer_0__"
input_layer_name: "sentences_ids"
}
inputs {
input_layer_name: "__kmax_sequence_score_layer_1__"
input_layer_name: "__fc_layer_0__"
}
inputs {
input_layer_name: "__fc_layer_1__"
input_layer_name: "__kmax_sequence_score_layer_1__"
}
inputs {
input_layer_name: "__kmax_sequence_score_layer_2__"
input_layer_name: "start_ids"
}
inputs {
input_layer_name: "sentences_ids"
input_layer_name: "__fc_layer_1__"
}
inputs {
input_layer_name: "start_ids"
input_layer_name: "__kmax_sequence_score_layer_2__"
}
inputs {
input_layer_name: "end_ids"
}
coeff: 1.0
}
parameters {
name: "___fc_layer_0__.w0"
......@@ -177,8 +176,8 @@ parameters {
initial_smart: false
}
input_layer_names: "sentence_scores"
input_layer_names: "sentence_states"
input_layer_names: "sentences_ids"
input_layer_names: "sentence_states"
input_layer_names: "start_ids"
input_layer_names: "end_ids"
output_layer_names: "__cross_entropy_over_beam_0__"
......@@ -198,8 +197,8 @@ sub_models {
layer_names: "end_ids"
layer_names: "__cross_entropy_over_beam_0__"
input_layer_names: "sentence_scores"
input_layer_names: "sentence_states"
input_layer_names: "sentences_ids"
input_layer_names: "sentence_states"
input_layer_names: "start_ids"
input_layer_names: "end_ids"
output_layer_names: "__cross_entropy_over_beam_0__"
......
......@@ -29,11 +29,17 @@ topk_end_pos_ids = kmax_sequence_score_layer(
sentence_idx = data_layer(name="sentences_ids", size=1)
start_idx = data_layer(name="start_ids", size=1)
end_idx = data_layer(name="end_ids", size=1)
cost = cross_entropy_over_beam(
input=[
sentence_scores, topk_sentence_ids, start_pos_scores,
topk_start_pos_ids, end_pos_scores, topk_end_pos_ids
],
label=[sentence_idx, start_idx, end_idx])
cost = cross_entropy_over_beam(input=[
BeamInput(
candidate_scores=sentence_scores,
selected_candidates=topk_sentence_ids,
gold=sentence_idx), BeamInput(
candidate_scores=start_pos_scores,
selected_candidates=topk_start_pos_ids,
gold=start_idx), BeamInput(
candidate_scores=end_pos_scores,
selected_candidates=topk_end_pos_ids,
gold=end_idx)
])
outputs(cost)
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册