From 2ac5d7d0189c7095c22db68a220be1459abb5486 Mon Sep 17 00:00:00 2001
From: kavyasrinet
Date: Sat, 4 Nov 2017 19:26:41 -0700
Subject: [PATCH] Fixing documentation for operators (#5373)

* Adding documentation for seq_expand
* Adding documentation for seq_concat_op
* Adding documentation for sequence_conv
* Adding sequence_pool
* Fixing review comment
* Adding sequence_softmax
* Updating doc for sigmoid_cross_entropy_with_logits
---
 paddle/operators/seq_expand_op.cc           |  4 +-
 paddle/operators/sequence_concat_op.cc      |  6 +-
 paddle/operators/sequence_conv_op.cc        | 24 ++++----
 paddle/operators/sequence_pool_op.cc        | 55 ++++++++++---------
 paddle/operators/sequence_softmax_op.cc     | 16 ++++--
 .../sigmoid_cross_entropy_with_logits_op.cc | 20 ++++---
 6 files changed, 70 insertions(+), 55 deletions(-)

diff --git a/paddle/operators/seq_expand_op.cc b/paddle/operators/seq_expand_op.cc
index 08fda9b445..b862056ad4 100644
--- a/paddle/operators/seq_expand_op.cc
+++ b/paddle/operators/seq_expand_op.cc
@@ -53,8 +53,10 @@ class SeqExpandOpMaker : public framework::OpProtoAndCheckerMaker {
         "(LodTensor)The output of seq_expand op."
         "The lod of output will be as same as input(Y)'s lod.");
     AddComment(R"DOC(
-Expand input(X) according to LOD of input(Y).
+Seq Expand Operator.
 
+This operator expands input(X) according to the LoD of input(Y).
+The following cases illustrate how this works:
 Case 1:
 
 Given 2-level a LoDTensor input(X)
diff --git a/paddle/operators/sequence_concat_op.cc b/paddle/operators/sequence_concat_op.cc
index ec4ad50dab..64097ef252 100644
--- a/paddle/operators/sequence_concat_op.cc
+++ b/paddle/operators/sequence_concat_op.cc
@@ -68,11 +68,12 @@ class SequenceConcatOpMaker : public framework::OpProtoAndCheckerMaker {
          "The level should be less than the level number of inputs.")
         .SetDefault(0);
     AddComment(R"DOC(
-Sequence Concat operator
+Sequence Concat Operator.
 
 The sequence_concat operator concatenates multiple LoDTensors.
-It only supports sequence (LoD Tensor with level number is 1)
+It supports a sequence (LoD Tensor with a level number of 1)
 or a nested sequence (LoD tensor with level number is 2) as its input.
+The following examples explain how the operator works:
 - Case1:
     If the axis is other than 0(here, axis is 1 and level is 1),
     each input should have the same LoD information and the LoD
@@ -98,6 +99,7 @@ or a nested sequence (LoD tensor with level number is 2) as its input.
     LoD(Out) = {{0,5,9}, {0,2,5,7,9}}; Dims(Out) = (9,3,4)
 
 NOTE: The levels of all the inputs should be the same.
+
 )DOC");
   }
 };
diff --git a/paddle/operators/sequence_conv_op.cc b/paddle/operators/sequence_conv_op.cc
index a3f2ed1443..41cadce4c6 100644
--- a/paddle/operators/sequence_conv_op.cc
+++ b/paddle/operators/sequence_conv_op.cc
@@ -105,10 +105,10 @@ class SequenceConvOpMaker : public framework::OpProtoAndCheckerMaker {
       : OpProtoAndCheckerMaker(proto, op_checker) {
     AddInput(
         "X",
-        "(LoDTensor) the input(X) is a LodTensor, which support "
+        "(LoDTensor) the input(X) is a LodTensor, which supports "
         "variable-time length input sequence. The underlying tensor in "
-        "this LoDTensor is a matrix with shape (T, N), where, T is the "
-        "total time steps in this mini-batch, N is the input_hidden_size.");
+        "this LoDTensor is a matrix with shape (T, N), where T is the "
+        "total time steps in this mini-batch and N is the input_hidden_size.");
     AddInput("PaddingData",
              "(Tensor, optional) the input(PaddingData) is an optional "
             "parameter, and it is learnable. "
" @@ -157,14 +157,16 @@ class SequenceConvOpMaker : public framework::OpProtoAndCheckerMaker { .GreaterThan(0); AddComment(R"DOC( - SequenceConvOp performs convolution operation on features of - contextLength time-steps of each instance. - The convolution operation calculates the output based on the input, filter - and strides, paddings parameters. The size of each dimension of the - parameters is checked in the infer-shape. In order to ensure the equal - length of sequence before and after convolution, it is necessary to fill - the top and bottom of each sequence according to context_length, - context_stride and context_start. +Sequence Conv Operator. + +SequenceConvOp performs convolution operation on features of contextLength +time-steps of each instance. The convolution operation calculates the output +based on the input, filter, strides and paddings parameters. +The size of each dimension of the parameters is checked during infer-shape. +In order to ensure the equal length of sequence before and after convolution, +it is necessary to fill the top and bottom of each sequence based on +context_length, context_stride and context_start. + )DOC"); } }; diff --git a/paddle/operators/sequence_pool_op.cc b/paddle/operators/sequence_pool_op.cc index dfe8de4985..63050a4ec2 100644 --- a/paddle/operators/sequence_pool_op.cc +++ b/paddle/operators/sequence_pool_op.cc @@ -45,33 +45,36 @@ class SequencePoolOpMaker : public framework::OpProtoAndCheckerMaker { .SetDefault("AVERAGE") .InEnum({"AVERAGE", "SUM", "SQRT", "LAST", "FIRST", "MAX"}); AddComment(R"DOC( - SequencePoolOp pools features of all time-steps of each instance. - - It supports six pooling pooltype: - - AVERAGE: Out[i] = average_{for each instance in i-th sequence}{X[i]} - - SUM: Out[i] = sum_{for each instance in i-th sequence}{X[i]} - - SQRT: Out[i] = sum_{for each instance in i-th sequence}{X[i]} - / sqrt(i-th sequence length) - - LAST: Out[i] = last instance in i-th sequence X[i] - - FIRST: Out[i] = first instance in i-th sequence X[i] - - MAX: Out[i] = max_{for each instance in i-th sequence}{X[i]} - - For a mini-batch of 3 variable-length sentences, containing 2, 3, and 2 time-steps: - - Assume X is a [7,M,N] LoDTensor, and X->lod()[0] = [0, 2, 5, 7], 7=2+3+2. - Besides, for the sake of simplicity, we assume M=1 and N=1, - and the value of X = [[1, 3], [2, 4, 6], [5, 1]]. - - Thus, Out is a [3,1,1] Tensor without LoD infomation. - And for different pooltype, the value of Out is as follows: - - - AVERAGE: [2, 4, 3], where 2=(1+3)/2, 4=(2+4+6)/3, 3=(5+1)/2 - - SUM: [4, 12, 6], where 4=1+3, 12=2+4+6, 6=5+1 - - SQRT: [2.82, 6.93, 4.24], where 2.82=(1+3)/sqrt(2), +Sequence Pool Operator. + +The SequencePoolOp pools features of all time-steps of each instance. +It supports six pooling types: +1. AVERAGE: Out[i] = $$avg(X_i)$$ +2. SUM: Out[i] = $$\sum_jX_{ij}$$ +3. SQRT: Out[i] = $$\frac{\sum_jX_{ij}}{\sqrt{len(X_i)}}$$ +4. LAST: Out[i] = last instance in i-th sequence X[i] +5. FIRST: Out[i] = first instance in i-th sequence X[i] +6. MAX: Out[i] = $$max(X_i)$$ + +The following example explains how this works: +For a mini-batch of 3 variable-length sentences, +containing 2, 3, and 2 time-steps: + +Assume X is a [7,M,N] LoDTensor, and X->lod()[0] = [0, 2, 5, 7], 7=2+3+2. +Besides, for the sake of simplicity, we assume M=1 and N=1, +and the value of X = [[1, 3], [2, 4, 6], [5, 1]]. + +Thus, Out is a [3,1,1] Tensor without LoD infomation. 
+For each pooltype, the value of Out is as follows:
+
+- AVERAGE: [2, 4, 3], where 2=(1+3)/2, 4=(2+4+6)/3, 3=(5+1)/2
+- SUM: [4, 12, 6], where 4=1+3, 12=2+4+6, 6=5+1
+- SQRT: [2.82, 6.93, 4.24], where 2.82=(1+3)/sqrt(2),
       6.93=(2+4+6)/sqrt(3), 4.24=(5+1)/sqrt(2)
-    - MAX: [3, 6, 5], where 3=max(1,3), 6=max(2,4,6), 5=max(5,1)
-    - LAST: [3, 6, 1], where 3=last(1,3), 6=last(2,4,6), 1=last(5,1)
-    - FIRST: [1, 2, 5], where 1=first(1,3), 2=first(2,4,6), 5=first(5,1)
+- MAX: [3, 6, 5], where 3=max(1,3), 6=max(2,4,6), 5=max(5,1)
+- LAST: [3, 6, 1], where 3=last(1,3), 6=last(2,4,6), 1=last(5,1)
+- FIRST: [1, 2, 5], where 1=first(1,3), 2=first(2,4,6), 5=first(5,1)
+
 )DOC");
   }
 };
diff --git a/paddle/operators/sequence_softmax_op.cc b/paddle/operators/sequence_softmax_op.cc
index c891ab1fdc..32c1502566 100644
--- a/paddle/operators/sequence_softmax_op.cc
+++ b/paddle/operators/sequence_softmax_op.cc
@@ -43,20 +43,24 @@ class SequenceSoftmaxOpMaker : public framework::OpProtoAndCheckerMaker {
          "(LoDTensor) 1-D or 2-D output LoDTensor with the 2-nd dimension "
          "of length 1.");
     AddComment(R"DOC(
-SequenceSoftmaxOp computes softmax activation among all time-steps for each
+Sequence Softmax Operator.
+
+SequenceSoftmaxOp computes the softmax activation among all time-steps for each
 sequence. The dimension of each time-step should be 1. Thus, the shape of
-input Tensor can be either [N, 1] or [N], where N is the sum of all sequences'
-lengths.
+input Tensor can be either [N, 1] or [N], where N is the sum of the lengths
+of all sequences.
 
-Equation:
+The algorithm works as follows:
     for i-th sequence in a mini-batch:
-        Out(X[lod[i]:lod[i+1]], :) =
-        exp(X[lod[i]:lod[i+1], :]) / sum(exp(X[lod[i]:lod[i+1], :]))
+        $$Out(X[lod[i]:lod[i+1]], :) =
+        \frac{\exp(X[lod[i]:lod[i+1], :])}
+        {\sum(\exp(X[lod[i]:lod[i+1], :]))}$$
 
 For example, for a mini-batch of 3 sequences with variable-length,
 each containing 2, 3, 2 time-steps, the lod of which is [0, 2, 5, 7],
 then softmax will be computed among X[0:2, :], X[2:5, :], X[5:7, :]
 and N turns out to be 7.
+
 )DOC");
   }
 };
diff --git a/paddle/operators/sigmoid_cross_entropy_with_logits_op.cc b/paddle/operators/sigmoid_cross_entropy_with_logits_op.cc
index e781c8db20..d9e4054652 100644
--- a/paddle/operators/sigmoid_cross_entropy_with_logits_op.cc
+++ b/paddle/operators/sigmoid_cross_entropy_with_logits_op.cc
@@ -107,26 +107,28 @@ class SigmoidCrossEntropyWithLogitsOpMaker
     AddComment(R"DOC(
 SigmoidCrossEntropyWithLogits Operator.
 
-This measures the elementwise probability error in discrete classification tasks
+This measures the element-wise probability error in classification tasks
 in which each class is independent. This can be thought of as predicting labels
-for a data-point that are not mutually exclusive. For example, a news article
-can be about politics, technology or sports at the same time or none of these.
+for a data-point, where labels are not mutually exclusive.
+For example, a news article can be about politics, technology or sports
+at the same time or none of these.
 
 The logistic loss is given as follows:
 
-    loss = -Labels * log(sigmoid(X)) - (1 - Labels) * log(1 - sigmoid(X))
+    $$loss = -Labels * \log(\sigma(X)) - (1 - Labels) * \log(1 - \sigma(X))$$
 
-We know that sigmoid(X) = (1 / (1 + exp(-X))). By substituting this we get
+We know that $$\sigma(X) = 1 / (1 + \exp(-X))$$. By substituting this we get:
 
-    loss = X - X * Labels + log(1 + exp(-X))
+    $$loss = X - X * Labels + \log(1 + \exp(-X))$$
 
-For stability and to prevent overflow of exp(-X) when X < 0,
-we can reformulate the loss as follows:
+For stability and to prevent overflow of $$\exp(-X)$$ when X < 0,
+we reformulate the loss as follows:
 
-    loss = max(X, 0) - X * Labels + log(1 + exp(-abs(X)))
+    $$loss = \max(X, 0) - X * Labels + \log(1 + \exp(-|X|))$$
 
 Both the input `X` and `Labels` can carry the LoD (Level of Details)
 information. However the output only shares the LoD with input `X`.
+
 )DOC");
   }
 };
--
GitLab
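The sketches below are editorial illustrations of the operator semantics documented in this patch. Each is a minimal, self-contained C++ program with hypothetical helper names; none of them is Paddle's actual implementation or API.

First, seq_expand. This is a sketch of the simplest case only, assuming X holds one value per sequence and Y's LoD supplies the target segment lengths (the doc's "expands input(X) according to the LoD of input(Y)"); the nested 2-level case in the doc's Case 1 is not covered here.

#include <cstddef>
#include <iostream>
#include <vector>

// Hypothetical sketch, not Paddle's API: the i-th element of `x` is repeated
// (y_lod[i+1] - y_lod[i]) times, so the output takes on the sequence
// lengths described by Y's LoD.
std::vector<double> SeqExpand(const std::vector<double>& x,
                              const std::vector<size_t>& y_lod) {
  std::vector<double> out;
  for (size_t i = 0; i + 1 < y_lod.size(); ++i) {
    for (size_t j = y_lod[i]; j < y_lod[i + 1]; ++j) out.push_back(x[i]);
  }
  return out;
}

int main() {
  // X = [1, 2] with one element per sequence, Y's LoD = [0, 2, 5]:
  // the result is [1, 1, 2, 2, 2].
  std::vector<double> x = {1.0, 2.0};
  for (double v : SeqExpand(x, {0, 2, 5})) std::cout << v << " ";
  std::cout << "\n";
}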
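Next, sequence_pool. This sketch applies the six pooling types over LoD segments, using the doc's worked example (lod = [0, 2, 5, 7], X = [[1, 3], [2, 4, 6], [5, 1]] with M = N = 1). SequencePool is a hypothetical standalone helper, not Paddle's API.

#include <algorithm>
#include <cmath>
#include <iostream>
#include <string>
#include <vector>

// Hypothetical helper: pool each LoD segment of the flat vector `x`,
// where `lod` holds the segment offsets, e.g. [0, 2, 5, 7].
std::vector<double> SequencePool(const std::vector<double>& x,
                                 const std::vector<size_t>& lod,
                                 const std::string& pooltype) {
  std::vector<double> out;
  for (size_t i = 0; i + 1 < lod.size(); ++i) {
    size_t begin = lod[i], end = lod[i + 1];
    double sum = 0.0, mx = x[begin];
    for (size_t j = begin; j < end; ++j) {
      sum += x[j];
      mx = std::max(mx, x[j]);
    }
    double len = static_cast<double>(end - begin);
    if (pooltype == "AVERAGE")    out.push_back(sum / len);
    else if (pooltype == "SUM")   out.push_back(sum);
    else if (pooltype == "SQRT")  out.push_back(sum / std::sqrt(len));
    else if (pooltype == "MAX")   out.push_back(mx);
    else if (pooltype == "LAST")  out.push_back(x[end - 1]);
    else                          out.push_back(x[begin]);  // FIRST
  }
  return out;
}

int main() {
  // The doc's example, flattened: X = [1, 3 | 2, 4, 6 | 5, 1].
  std::vector<double> x = {1, 3, 2, 4, 6, 5, 1};
  std::vector<size_t> lod = {0, 2, 5, 7};
  std::vector<std::string> types = {"AVERAGE", "SUM",  "SQRT",
                                    "MAX",     "LAST", "FIRST"};
  for (const auto& t : types) {
    std::cout << t << ":";
    for (double v : SequencePool(x, lod, t)) std::cout << " " << v;
    std::cout << "\n";  // e.g. AVERAGE: 2 4 3, SUM: 4 12 6, MAX: 3 6 5
  }
}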
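Then sequence_softmax. This sketch computes softmax independently over each LoD segment, exactly as the doc's equation states; a production kernel would also subtract the per-segment maximum before exponentiating for numerical stability, which this sketch omits for brevity.

#include <cmath>
#include <cstddef>
#include <iostream>
#include <vector>

int main() {
  // Mirrors the doc's example: N = 7 values, lod = [0, 2, 5, 7]; softmax is
  // computed independently over X[0:2], X[2:5] and X[5:7].
  std::vector<double> x = {1.0, 2.0, 0.5, 1.5, 2.5, 3.0, 1.0};
  std::vector<size_t> lod = {0, 2, 5, 7};
  std::vector<double> out(x.size());
  for (size_t i = 0; i + 1 < lod.size(); ++i) {
    double denom = 0.0;
    for (size_t j = lod[i]; j < lod[i + 1]; ++j) denom += std::exp(x[j]);
    for (size_t j = lod[i]; j < lod[i + 1]; ++j) out[j] = std::exp(x[j]) / denom;
  }
  // Each of the three segments of `out` sums to 1.
  for (double v : out) std::cout << v << " ";
  std::cout << "\n";
}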
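Finally, sigmoid_cross_entropy_with_logits. This sketch contrasts the doc's substituted form of the loss with its numerically stable reformulation; NaiveLoss and StableLoss are hypothetical names for the two formulas quoted in the doc.

#include <algorithm>
#include <cmath>
#include <iostream>

// The substituted form: loss = X - X * Labels + log(1 + exp(-X)).
// exp(-X) overflows for large negative X.
double NaiveLoss(double x, double label) {
  return x - x * label + std::log(1.0 + std::exp(-x));
}

// The stable form: loss = max(X, 0) - X * Labels + log(1 + exp(-|X|)).
double StableLoss(double x, double label) {
  return std::max(x, 0.0) - x * label + std::log1p(std::exp(-std::fabs(x)));
}

int main() {
  for (double x : {-1000.0, -5.0, 0.0, 5.0}) {
    std::cout << "x=" << x << "  naive=" << NaiveLoss(x, 1.0)
              << "  stable=" << StableLoss(x, 1.0) << "\n";
  }
  // At x = -1000 the naive form prints inf (exp(1000) overflows inside the
  // log), while the stable form returns 1000 exactly. For moderate x the
  // two forms agree, e.g. both give about 5.0067 at x = -5, label = 1.
}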