未验证 提交 4e3c535a 编写于 作者: G Guo Sheng 提交者: GitHub

Add support for dynamic_decode(while) training. (#22231) (#22574)

* Add support for dynamic_decode(while) training. test=develop

* Fix assign_op and tensor_array_read_write_op after solving conflict. test=develop

* Fix test_rnn_decode_api.py. test=develop

* Refine docs for apis in rnn.py. test=develop

* Adjust outputs of dynamic_decode. test=develop

* Remove the force_cpu update in assign_op. test=develop

* Remove the force_cpu update in assign_op. test=develop

* Make RNNCell.get_initial_states support batch_dim_idx argument. test=develop

* Rename _create_array_outof_while as _create_array_out_of_while in rnn.py.

test=release/1.7
上级 a8f85f2c
...@@ -13,6 +13,8 @@ See the License for the specific language governing permissions and ...@@ -13,6 +13,8 @@ See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#include "paddle/fluid/operators/array_operator.h" #include "paddle/fluid/operators/array_operator.h"
#include "paddle/fluid/operators/detail/safe_ref.h" #include "paddle/fluid/operators/detail/safe_ref.h"
#include "paddle/fluid/operators/math/math_function.h"
namespace paddle { namespace paddle {
namespace operators { namespace operators {
...@@ -152,6 +154,21 @@ class ReadFromArrayOp : public ArrayOp { ...@@ -152,6 +154,21 @@ class ReadFromArrayOp : public ArrayOp {
out_tensor->set_lod(x_array[offset].lod()); out_tensor->set_lod(x_array[offset].lod());
} else { } else {
VLOG(10) << "offset " << offset << " >= " << x_array.size(); VLOG(10) << "offset " << offset << " >= " << x_array.size();
// set grad of the writed tensor to 0 when used as write_to_array_grad
auto *fw_var = scope.FindVar(Input("X_W"));
if (fw_var == nullptr) return;
auto &fw_var_tensor = fw_var->Get<framework::LoDTensor>();
framework::AttributeMap attrs;
attrs["dtype"] = fw_var_tensor.type();
attrs["shape"] = framework::vectorize<int>(fw_var_tensor.dims());
attrs["value"] = 0.0f;
auto zero_op = framework::OpRegistry::CreateOp(
"fill_constant", {}, {{"Out", {Output("Out")}}}, attrs);
zero_op->Run(scope, place);
auto *out_tensor = out->GetMutable<framework::LoDTensor>();
out_tensor->set_lod(fw_var_tensor.lod());
} }
} }
}; };
...@@ -163,6 +180,10 @@ class ReadFromArrayProtoMaker : public framework::OpProtoAndCheckerMaker { ...@@ -163,6 +180,10 @@ class ReadFromArrayProtoMaker : public framework::OpProtoAndCheckerMaker {
AddInput("I", AddInput("I",
"(Tensor) the subscript index in tensor array. The number of " "(Tensor) the subscript index in tensor array. The number of "
"element should be 1"); "element should be 1");
AddInput("X_W",
"(Tensor) the writed tensor when used as the grad op of "
"write_to_array. We use this to fill zero gradient.")
.AsDispensable();
AddOutput("Out", "(LoDTensor) the tensor will be read from."); AddOutput("Out", "(LoDTensor) the tensor will be read from.");
AddComment(R"DOC( AddComment(R"DOC(
ReadFromArray Operator. ReadFromArray Operator.
...@@ -199,6 +220,7 @@ class WriteToArrayGradMaker : public framework::SingleGradOpMaker<T> { ...@@ -199,6 +220,7 @@ class WriteToArrayGradMaker : public framework::SingleGradOpMaker<T> {
grad_op->SetType("read_from_array"); grad_op->SetType("read_from_array");
grad_op->SetInput("I", this->Input("I")); grad_op->SetInput("I", this->Input("I"));
grad_op->SetInput("X", this->OutputGrad("Out")); grad_op->SetInput("X", this->OutputGrad("Out"));
grad_op->SetInput("X_W", this->Input("X"));
grad_op->SetOutput("Out", this->InputGrad("X")); grad_op->SetOutput("Out", this->InputGrad("X"));
grad_op->SetAttrMap(this->Attrs()); grad_op->SetAttrMap(this->Attrs());
return std::unique_ptr<T>(grad_op); return std::unique_ptr<T>(grad_op);
......
...@@ -14,6 +14,7 @@ ...@@ -14,6 +14,7 @@
from __future__ import print_function from __future__ import print_function
import sys
from functools import partial, reduce from functools import partial, reduce
from . import nn from . import nn
...@@ -22,6 +23,8 @@ from . import control_flow ...@@ -22,6 +23,8 @@ from . import control_flow
from . import utils from . import utils
from . import sequence_lod from . import sequence_lod
from .utils import * from .utils import *
from ..framework import default_main_program
from ..data_feeder import convert_dtype
from ..layer_helper import LayerHelper from ..layer_helper import LayerHelper
from ..framework import in_dygraph_mode from ..framework import in_dygraph_mode
from ..param_attr import ParamAttr from ..param_attr import ParamAttr
...@@ -34,6 +37,11 @@ __all__ = [ ...@@ -34,6 +37,11 @@ __all__ = [
'BeamSearchDecoder', 'BeamSearchDecoder',
'rnn', 'rnn',
'dynamic_decode', 'dynamic_decode',
'DecodeHelper',
'TrainingHelper',
'GreedyEmbeddingHelper',
'SampleEmbeddingHelper',
'BasicDecoder',
'dynamic_lstm', 'dynamic_lstm',
'dynamic_lstmp', 'dynamic_lstmp',
'dynamic_gru', 'dynamic_gru',
...@@ -81,7 +89,8 @@ class RNNCell(object): ...@@ -81,7 +89,8 @@ class RNNCell(object):
batch_ref, batch_ref,
shape=None, shape=None,
dtype=None, dtype=None,
init_value=0): init_value=0,
batch_dim_idx=0):
""" """
Generate initialized states according to provided shape, data type and Generate initialized states according to provided shape, data type and
value. value.
...@@ -100,6 +109,8 @@ class RNNCell(object): ...@@ -100,6 +109,8 @@ class RNNCell(object):
property `cell.state_shape` is not available, float32 will be used property `cell.state_shape` is not available, float32 will be used
as the data type. The default value is None. as the data type. The default value is None.
init_value: A float value used to initialize states. init_value: A float value used to initialize states.
batch_dim_idx: An integer indicating which dimension of the tensor in
inputs represents batch size. The default value is 0.
Returns: Returns:
Variable: tensor variable[s] packed in the same structure provided \ Variable: tensor variable[s] packed in the same structure provided \
...@@ -109,10 +120,16 @@ class RNNCell(object): ...@@ -109,10 +120,16 @@ class RNNCell(object):
batch_ref = flatten(batch_ref)[0] batch_ref = flatten(batch_ref)[0]
def _is_shape_sequence(seq): def _is_shape_sequence(seq):
if sys.version_info < (3, ):
integer_types = (
int,
long, )
else:
integer_types = (int, )
"""For shape, list/tuple of integer is the finest-grained objection""" """For shape, list/tuple of integer is the finest-grained objection"""
if (isinstance(seq, list) or isinstance(seq, tuple)): if (isinstance(seq, list) or isinstance(seq, tuple)):
if reduce(lambda flag, x: isinstance(x, int) and flag, seq, if reduce(lambda flag, x: isinstance(x, integer_types) and flag,
True): seq, True):
return False return False
# TODO: Add check for the illegal # TODO: Add check for the illegal
if isinstance(seq, dict): if isinstance(seq, dict):
...@@ -145,12 +162,14 @@ class RNNCell(object): ...@@ -145,12 +162,14 @@ class RNNCell(object):
input=batch_ref, input=batch_ref,
shape=shape.shape, shape=shape.shape,
dtype=dtype, dtype=dtype,
value=init_value), states_shapes, states_dtypes) value=init_value,
input_dim_idx=batch_dim_idx), states_shapes, states_dtypes)
return init_states return init_states
@property @property
def state_shape(self): def state_shape(self):
""" """
Abstract method (property).
Used to initialize states. Used to initialize states.
A (possiblely nested structure of) shape[s], where a shape is represented A (possiblely nested structure of) shape[s], where a shape is represented
as a list/tuple of integers (-1 for batch size would be automatically as a list/tuple of integers (-1 for batch size would be automatically
...@@ -159,11 +178,13 @@ class RNNCell(object): ...@@ -159,11 +178,13 @@ class RNNCell(object):
`get_initial_states` or the `shape` argument is provided when using `get_initial_states` or the `shape` argument is provided when using
`get_initial_states`. `get_initial_states`.
""" """
raise NotImplementedError raise NotImplementedError(
"Please add implementaion for `state_shape` in the used cell.")
@property @property
def state_dtype(self): def state_dtype(self):
""" """
Abstract method (property).
Used to initialize states. Used to initialize states.
A (possiblely nested structure of) data types[s]. The structure must be A (possiblely nested structure of) data types[s]. The structure must be
same as that of `shape`, except when all tensors' in states has the same same as that of `shape`, except when all tensors' in states has the same
...@@ -172,7 +193,8 @@ class RNNCell(object): ...@@ -172,7 +193,8 @@ class RNNCell(object):
by `get_initial_states` or the `dtype` argument is provided when using by `get_initial_states` or the `dtype` argument is provided when using
`get_initial_states`. `get_initial_states`.
""" """
raise NotImplementedError raise NotImplementedError(
"Please add implementaion for `state_dtype` in the used cell.")
class GRUCell(RNNCell): class GRUCell(RNNCell):
...@@ -437,7 +459,8 @@ def rnn(cell, ...@@ -437,7 +459,8 @@ def rnn(cell,
return x return x
if initial_states is None: if initial_states is None:
initial_states = cell.get_initial_states(batch_ref=inputs) initial_states = cell.get_initial_states(
batch_ref=inputs, batch_dim_idx=1 if time_major else 0)
initial_states = map_structure(_switch_grad, initial_states) initial_states = map_structure(_switch_grad, initial_states)
if not time_major: if not time_major:
...@@ -499,7 +522,7 @@ class Decoder(object): ...@@ -499,7 +522,7 @@ class Decoder(object):
1. :code:`(initial_input, initial_state, finished) = initialize(inits)` , 1. :code:`(initial_input, initial_state, finished) = initialize(inits)` ,
which generates the input and state for the first decoding step, and gives the which generates the input and state for the first decoding step, and gives the
inintial status telling whether each sequence in the batch is finished. initial status telling whether each sequence in the batch is finished.
It would be called once before the decoding iterations. It would be called once before the decoding iterations.
2. :code:`(output, next_state, next_input, finished) = step(time, input, state)` , 2. :code:`(output, next_state, next_input, finished) = step(time, input, state)` ,
...@@ -528,14 +551,14 @@ class Decoder(object): ...@@ -528,14 +551,14 @@ class Decoder(object):
inits: Argument provided by the caller. inits: Argument provided by the caller.
Returns: Returns:
tuple: A tuple( :code:(initial_inputs, initial_states, finished)` ). \ tuple: A tuple( :code:`(initial_inputs, initial_states, finished)` ). \
`initial_inputs` and `initial_states` both are a (possibly nested \ `initial_inputs` and `initial_states` both are a (possibly nested \
structure of) tensor variable[s], and `finished` is a tensor with \ structure of) tensor variable[s], and `finished` is a tensor with \
bool data type. bool data type.
""" """
raise NotImplementedError raise NotImplementedError
def step(self, time, inputs, states): def step(self, time, inputs, states, **kwargs):
""" """
Called per step of decoding. Called per step of decoding.
...@@ -544,6 +567,7 @@ class Decoder(object): ...@@ -544,6 +567,7 @@ class Decoder(object):
The data type is int64. The data type is int64.
inputs(Variable): A (possibly nested structure of) tensor variable[s]. inputs(Variable): A (possibly nested structure of) tensor variable[s].
states(Variable): A (possibly nested structure of) tensor variable[s]. states(Variable): A (possibly nested structure of) tensor variable[s].
**kwargs: Additional keyword arguments, provided by the caller.
Returns: Returns:
tuple: A tuple( :code:(outputs, next_states, next_inputs, finished)` ). \ tuple: A tuple( :code:(outputs, next_states, next_inputs, finished)` ). \
...@@ -555,14 +579,6 @@ class Decoder(object): ...@@ -555,14 +579,6 @@ class Decoder(object):
""" """
raise NotImplementedError raise NotImplementedError
@property
def output_dtype(self):
"""
A (possiblely nested structure of) data type[s]. The structure must be
same as `outputs` returned by `decoder.step`.
"""
raise NotImplementedError
def finalize(self, outputs, final_states, sequence_lengths): def finalize(self, outputs, final_states, sequence_lengths):
""" """
Called once after the decoding iterations if implemented. Called once after the decoding iterations if implemented.
...@@ -796,12 +812,14 @@ class BeamSearchDecoder(Decoder): ...@@ -796,12 +812,14 @@ class BeamSearchDecoder(Decoder):
batch_size = tensor.cast( batch_size = tensor.cast(
batch_size, batch_size,
indices.dtype) if batch_size.dtype != indices.dtype else batch_size indices.dtype) if batch_size.dtype != indices.dtype else batch_size
batch_size.stop_gradient = True # TODO: remove this
batch_pos = nn.expand( batch_pos = nn.expand(
nn.unsqueeze( nn.unsqueeze(
tensor.range( tensor.range(
0, batch_size, 1, dtype=indices.dtype), [1]), 0, batch_size, 1, dtype=indices.dtype), [1]),
[1, self.beam_size]) [1, self.beam_size])
topk_coordinates = nn.stack([batch_pos, indices], axis=2) topk_coordinates = nn.stack([batch_pos, indices], axis=2)
topk_coordinates.stop_gradient = True
return nn.gather_nd(x, topk_coordinates) return nn.gather_nd(x, topk_coordinates)
class OutputWrapper( class OutputWrapper(
...@@ -834,7 +852,7 @@ class BeamSearchDecoder(Decoder): ...@@ -834,7 +852,7 @@ class BeamSearchDecoder(Decoder):
Returns: Returns:
tuple: A tuple( :code:`(initial_inputs, initial_states, finished)` ). \ tuple: A tuple( :code:`(initial_inputs, initial_states, finished)` ). \
`initial_inputs` is a tensor t filled by `start_token` with shape \ `initial_inputs` is a tensor t filled by `start_token` with shape \
`[batch_size, beam_size, 1]` when `embedding_fn` is None, or the \ `[batch_size, beam_size]` when `embedding_fn` is None, or the \
returned value of `embedding_fn(t)` when `embedding_fn` is provided. \ returned value of `embedding_fn(t)` when `embedding_fn` is provided. \
`initial_states` is a nested structure(namedtuple including cell_states, \ `initial_states` is a nested structure(namedtuple including cell_states, \
log_probs, finished, lengths as fields) of tensor variables, where \ log_probs, finished, lengths as fields) of tensor variables, where \
...@@ -895,7 +913,7 @@ class BeamSearchDecoder(Decoder): ...@@ -895,7 +913,7 @@ class BeamSearchDecoder(Decoder):
beam_state(Variable): A structure of tensor variables. beam_state(Variable): A structure of tensor variables.
It is same as the `initial_states` returned by `initialize()` for It is same as the `initial_states` returned by `initialize()` for
the first decoding step and `beam_search_state` returned by the first decoding step and `beam_search_state` returned by
`initialize()` for the others. `step()` for the others.
Returns: Returns:
tuple: A tuple( :code:`(beam_search_output, beam_search_state)` ). \ tuple: A tuple( :code:`(beam_search_output, beam_search_state)` ). \
...@@ -921,6 +939,7 @@ class BeamSearchDecoder(Decoder): ...@@ -921,6 +939,7 @@ class BeamSearchDecoder(Decoder):
# TODO: length penalty # TODO: length penalty
scores = log_probs scores = log_probs
scores = nn.reshape(scores, [-1, self.beam_size * self.vocab_size]) scores = nn.reshape(scores, [-1, self.beam_size * self.vocab_size])
# TODO: add grad for topk then this beam search can be used to train
topk_scores, topk_indices = nn.topk(input=scores, k=self.beam_size) topk_scores, topk_indices = nn.topk(input=scores, k=self.beam_size)
beam_indices = nn.elementwise_floordiv(topk_indices, beam_indices = nn.elementwise_floordiv(topk_indices,
self.vocab_size_tensor) self.vocab_size_tensor)
...@@ -993,6 +1012,7 @@ class BeamSearchDecoder(Decoder): ...@@ -993,6 +1012,7 @@ class BeamSearchDecoder(Decoder):
beam_state=states) beam_state=states)
finished = beam_search_state.finished finished = beam_search_state.finished
sample_ids = beam_search_output.predicted_ids sample_ids = beam_search_output.predicted_ids
sample_ids.stop_gradient = True
next_inputs = self.embedding_fn( next_inputs = self.embedding_fn(
sample_ids) if self.embedding_fn else sample_ids sample_ids) if self.embedding_fn else sample_ids
...@@ -1027,20 +1047,14 @@ class BeamSearchDecoder(Decoder): ...@@ -1027,20 +1047,14 @@ class BeamSearchDecoder(Decoder):
# TODO: use FinalBeamSearchDecoderOutput as output # TODO: use FinalBeamSearchDecoderOutput as output
return predicted_ids, final_states return predicted_ids, final_states
@property
def output_dtype(self):
"""
The nested structure of data types for beam search output. It is a namedtuple
including scores, predicted_ids, parent_ids as fields.
"""
return self.OutputWrapper(
scores="float32", predicted_ids="int64", parent_ids="int64")
def dynamic_decode(decoder, def dynamic_decode(decoder,
inits=None, inits=None,
max_step_num=None, max_step_num=None,
output_time_major=False, output_time_major=False,
impute_finished=False,
is_test=False,
return_length=False,
**kwargs): **kwargs):
""" """
Dynamic decoding performs :code:`decoder.step()` repeatedly until the returned Dynamic decoding performs :code:`decoder.step()` repeatedly until the returned
...@@ -1058,24 +1072,40 @@ def dynamic_decode(decoder, ...@@ -1058,24 +1072,40 @@ def dynamic_decode(decoder,
max_step_num(int, optional): The maximum number of steps. If not provided, max_step_num(int, optional): The maximum number of steps. If not provided,
decode until the decoder is fully done, or in other words, the returned decode until the decoder is fully done, or in other words, the returned
Tensor by :code:`decoder.step()` indicating finished status contains Tensor by :code:`decoder.step()` indicating finished status contains
all True). Default `None`. all True. Default `None`.
output_time_major(bool, optional): Indicate the data layout of Tensor included output_time_major(bool, optional): Indicate the data layout of Tensor included
in the final outpus(the first returned value of this method). If in the final outpus(the first returned value of this method). If
attr:`False`, the data layout would be batch major with shape attr:`False`, the data layout would be batch major with shape
`[batch_size, seq_len, ...]`. If attr:`True`, the data layout would `[batch_size, seq_len, ...]`. If attr:`True`, the data layout would
be time major with shape `[seq_len, batch_size, ...]`. Default: `False`. be time major with shape `[seq_len, batch_size, ...]`. Default: `False`.
impute_finished(bool, optional): If `True`, then states get copied through
for batch entries which are marked as finished, which differs with the
unfinished using the new states returned by :code:`decoder.step()` and
ensures that the final states have the correct values. Otherwise, states
wouldn't be copied through when finished. If the returned `final_states`
is needed, it should be set as True, which causes some slowdown.
Default `False`.
is_test(bool, optional): A flag indicating whether to use test mode. In
test mode, it is more memory saving. Default `False`.
return_length(bool, optional): A flag indicating whether to return an
extra Tensor variable in the output tuple, which stores the actual
lengths of all decoded sequences. Default `False`.
**kwargs: Additional keyword arguments. Arguments passed to `decoder.step`. **kwargs: Additional keyword arguments. Arguments passed to `decoder.step`.
Returns: Returns:
tuple: A tuple( :code:`(final_outputs, final_states)` ) including the final \ tuple: A tuple( :code:`(final_outputs, final_states, sequence_lengths)` ) \
outputs and states, both are Tensor or nested structure of Tensor. \ when `return_length` is True, otherwise a tuple( :code:`(final_outputs, final_states)` ). \
`final_outputs` has the same structure and data types as \ The final outputs and states, both are Tensor or nested structure of Tensor. \
:code:`decoder.output_dtype` , and each Tenser in `final_outputs` \ `final_outputs` has the same structure and data types as the :code:`outputs` \
returned by :code:`decoder.step()` , and each Tenser in `final_outputs` \
is the stacked of all decoding steps' outputs, which might be revised \ is the stacked of all decoding steps' outputs, which might be revised \
by :code:`decoder.finalize` . `final_states` is the counterpart \ by :code:`decoder.finalize()` if the decoder has implemented `finalize`. \
at last time step of initial states returned by :code:`decoder.initialize` , \ `final_states` is the counterpart at last time step of initial states \
thus has the same structure with it and has tensors with same shapes \ returned by :code:`decoder.initialize()` , thus has the same structure \
and data types. with it and has tensors with same shapes and data types. `sequence_lengths` \
is an `int64` tensor with the same shape as `finished` returned \
by :code:`decoder.initialize()` , and it stores the actual lengths of \
all decoded sequences.
Examples: Examples:
...@@ -1111,70 +1141,720 @@ def dynamic_decode(decoder, ...@@ -1111,70 +1141,720 @@ def dynamic_decode(decoder,
initial_inputs, initial_states, initial_finished = decoder.initialize(inits) initial_inputs, initial_states, initial_finished = decoder.initialize(inits)
global_inputs, global_states, global_finished = ( global_inputs, global_states, global_finished = (
initial_inputs, initial_states, initial_finished) initial_inputs, initial_states, initial_finished)
global_finished.stop_gradient = True
step_idx = tensor.fill_constant(shape=[1], dtype="int64", value=0) step_idx = tensor.fill_constant(shape=[1], dtype="int64", value=0)
cond = control_flow.logical_not((nn.reduce_all(initial_finished))) cond = control_flow.logical_not((nn.reduce_all(initial_finished)))
if max_step_num is not None: if max_step_num is not None:
max_step_num = tensor.fill_constant( max_step_num = tensor.fill_constant(
shape=[1], dtype="int64", value=max_step_num) shape=[1], dtype="int64", value=max_step_num)
while_op = control_flow.While(cond) while_op = control_flow.While(cond, is_test=is_test)
inputs = map_structure(lambda x: x, initial_inputs)
states = map_structure(lambda x: x, initial_states)
outputs_arrays = map_structure(
lambda dtype: control_flow.create_array(dtype), decoder.output_dtype)
sequence_lengths = tensor.cast(tensor.zeros_like(initial_finished), "int64") sequence_lengths = tensor.cast(tensor.zeros_like(initial_finished), "int64")
sequence_lengths.stop_gradient = True
if is_test:
# for test, reuse inputs and states variables to save memory
inputs = map_structure(lambda x: x, initial_inputs)
states = map_structure(lambda x: x, initial_states)
else:
# inputs and states of all steps must be saved for backward and training
inputs_arrays = map_structure(
lambda x: control_flow.array_write(x, step_idx), initial_inputs)
states_arrays = map_structure(
lambda x: control_flow.array_write(x, step_idx), initial_states)
def _maybe_copy(state, new_state, step_mask): def _maybe_copy(state, new_state, step_mask):
# TODO: use where_op # TODO: use where_op
state_dtype = state.dtype
if convert_dtype(state_dtype) in ["bool"]:
state = tensor.cast(state, dtype="float32")
new_state = tensor.cast(new_state, dtype="float32")
if step_mask.dtype != state.dtype:
step_mask = tensor.cast(step_mask, dtype=state.dtype)
# otherwise, renamed bool gradients of would be summed up leading
# to sum(bool) error.
step_mask.stop_gradient = True
new_state = nn.elementwise_mul( new_state = nn.elementwise_mul(
new_state, step_mask, axis=0) - nn.elementwise_mul( state, step_mask, axis=0) - nn.elementwise_mul(
state, (step_mask - 1), axis=0) new_state, (step_mask - 1), axis=0)
if convert_dtype(state_dtype) in ["bool"]:
new_state = tensor.cast(new_state, dtype=state_dtype)
return new_state return new_state
def _transpose_batch_time(x): def _transpose_batch_time(x):
return nn.transpose(x, [1, 0] + list(range(2, len(x.shape)))) return nn.transpose(x, [1, 0] + list(range(2, len(x.shape))))
def _create_array_out_of_while(dtype):
current_block_idx = default_main_program().current_block_idx
default_main_program().current_block_idx = default_main_program(
).current_block().parent_idx
tensor_array = control_flow.create_array(dtype)
default_main_program().current_block_idx = current_block_idx
return tensor_array
# While # While
with while_op.block(): with while_op.block():
if not is_test:
inputs = map_structure(
lambda array: control_flow.array_read(array, step_idx),
inputs_arrays)
states = map_structure(
lambda array: control_flow.array_read(array, step_idx),
states_arrays)
(outputs, next_states, next_inputs, (outputs, next_states, next_inputs,
next_finished) = decoder.step(step_idx, inputs, states, **kwargs) next_finished) = decoder.step(step_idx, inputs, states, **kwargs)
next_finished = control_flow.logical_or(next_finished, global_finished)
next_sequence_lengths = nn.elementwise_add( next_sequence_lengths = nn.elementwise_add(
sequence_lengths, sequence_lengths,
tensor.cast( tensor.cast(
control_flow.logical_not(global_finished), control_flow.logical_not(global_finished),
sequence_lengths.dtype)) sequence_lengths.dtype))
if impute_finished: # rectify the states for the finished.
next_states = map_structure(
lambda x, y: _maybe_copy(x, y, global_finished),
states,
next_states, )
# create tensor array in global block after dtype[s] of outputs can be got
outputs_arrays = map_structure(
lambda x: _create_array_out_of_while(x.dtype), outputs)
map_structure( map_structure(
lambda x, x_array: control_flow.array_write( lambda x, x_array: control_flow.array_write(
x, i=step_idx, array=x_array), outputs, outputs_arrays) x, i=step_idx, array=x_array), outputs, outputs_arrays)
control_flow.increment(x=step_idx, value=1.0, in_place=True) control_flow.increment(x=step_idx, value=1.0, in_place=True)
map_structure(tensor.assign, next_inputs, global_inputs) if is_test:
map_structure(tensor.assign, next_states, global_states) map_structure(tensor.assign, next_inputs, global_inputs)
map_structure(tensor.assign, next_states, global_states)
else:
map_structure(
lambda x, x_array: control_flow.array_write(
x, i=step_idx, array=x_array), next_inputs, inputs_arrays)
map_structure(
lambda x, x_array: control_flow.array_write(
x, i=step_idx, array=x_array), next_states, states_arrays)
tensor.assign(next_finished, global_finished) tensor.assign(next_finished, global_finished)
tensor.assign(next_sequence_lengths, sequence_lengths) tensor.assign(next_sequence_lengths, sequence_lengths)
if max_step_num is not None: if max_step_num is not None:
control_flow.logical_and( control_flow.logical_and(
control_flow.logical_not(nn.reduce_all(next_finished)), control_flow.logical_not(nn.reduce_all(global_finished)),
control_flow.less_equal(step_idx, max_step_num), cond) control_flow.less_equal(step_idx, max_step_num), cond)
else: else:
control_flow.logical_not(nn.reduce_all(next_finished), cond) control_flow.logical_not(nn.reduce_all(global_finished), cond)
final_outputs = map_structure( final_outputs = map_structure(
lambda array: tensor.tensor_array_to_tensor( lambda array: tensor.tensor_array_to_tensor(
array, axis=0, use_stack=True)[0], outputs_arrays) array, axis=0, use_stack=True)[0], outputs_arrays)
final_states = global_states if is_test:
final_states = global_states
else:
final_states = map_structure(
lambda array: control_flow.array_read(array, step_idx),
states_arrays)
try: try:
final_outputs, final_states = decoder.finalize( final_outputs, final_states = decoder.finalize(
final_outputs, global_states, sequence_lengths) final_outputs, final_states, sequence_lengths)
except NotImplementedError: except NotImplementedError:
pass pass
if not output_time_major: if not output_time_major:
final_outputs = map_structure(_transpose_batch_time, final_outputs) final_outputs = map_structure(_transpose_batch_time, final_outputs)
return final_outputs, final_states return (final_outputs, final_states,
sequence_lengths) if return_length else (final_outputs,
final_states)
class DecodeHelper(object):
"""
DecodeHelper is the base class for any helper instance used in `BasicDecoder`.
It provides interface to implement sampling and produce inputs for the next
time step in dynamic decoding.
"""
def initialize(self):
"""
DecodeHelper initialization to produce inputs for the first decoding step
and give the initial status telling whether each sequence in the batch
is finished. It is the partial of the initialization of `BasicDecoder`.
Returns:
tuple: A tuple( :code:`(initial_inputs, initial_finished)` ). \
`initial_inputs` is a (possibly nested structure of) tensor \
variable[s], and the tensor's shape is `[batch_size, ...]`. \
`initial_finished` is a bool tensor with shape `[batch_size]`.
"""
pass
def sample(self, time, outputs, states):
"""
Perform sampling with some strategies according to `outputs`. It is the
partial of `BasicDecoder.step`.
Parameters:
time(Variable): An `int64` tensor with shape `[1]` provided by the caller,
representing the current time step number of decoding.
outputs(Variable): A tensor variable. Usually it's data type is float32
or float64, and it's shape is `[batch_size, vocabulary_size]`,
representing the predicted logits of current step. It is same as
`outputs` returned by `BasicDecoder.output_fn(BasicDecoder.cell.call())`.
states(Variable): A (possibly nested structure of) tensor variable[s].
It is same as `new_states` returned by `BasicDecoder.cell.call()`.
Returns:
Variable: An `int64` tensor representing the sampled ids.
"""
pass
def next_inputs(self, time, outputs, states, sample_ids):
"""
Produce the inputs and states for next time step and give status telling
whether each minibatch entry is finished. It is called after `sample` in
`BasicDecoder.step`. It is the partial of `BasicDecoder.step`.
Parameters:
time(Variable): An `int64` tensor with shape `[1]` provided by the caller,
representing the current time step number of decoding.
outputs(Variable): A tensor variable. Usually it's data type is float32
or float64, and it's shape is `[batch_size, vocabulary_size]`,
representing the predicted logits of current step. It is same as
`outputs` returned by `BasicDecoder.output_fn(BasicDecoder.cell.call())`.
states(Variable): A (possibly nested structure of) tensor variable[s].
It is same as `new_states` returned by `BasicDecoder.cell.call()`.
sample_ids(Variable): A (possibly nested structure of) tensor variable[s].
It is same as `sample_ids` returned by `sample()`.
Returns:
tuple: A tuple( :code:`(finished, next_inputs, next_states)` ). \
`next_inputs` and `next_states` both are a (possibly nested \
structure of) tensor variable[s], and the structure, shape and \
data type of `next_states` must be same as the input argument \
`states`. `finished` is a bool tensor with shape `[batch_size]`.
"""
pass
class TrainingHelper(DecodeHelper):
"""
TrainingHelper is a subclass of DecodeHelper. It is a decoding helper
slicing from the full sequence inputs as the inputs for corresponding
step. And it uses `argmax` to sample from the outputs of `cell.call()`.
Since the needs of sequence inputs, it is used mostly for teach-forcing MLE
(maximum likelihood) training, and the sampled would not be used.
Examples:
.. code-block:: python
import paddle.fluid as fluid
import paddle.fluid.layers as layers
trg_emb = fluid.data(name="trg_emb",
shape=[None, None, 128],
dtype="float32")
trg_seq_length = fluid.data(name="trg_seq_length",
shape=[None],
dtype="int64")
helper = layers.TrainingHelper(trg_emb, trg_seq_length)
decoder_cell = layers.GRUCell(hidden_size=128)
decoder = layers.BasicDecoder(decoder_cell, helper)
outputs = layers.dynamic_decode(
decoder,
inits=decoder_cell.get_initial_states(trg_emb),
is_test=False)
"""
def __init__(self, inputs, sequence_length, time_major=False):
"""
Constructor of TrainingHelper.
Parameters:
inputs(Variable): A (possibly nested structure of) tensor variable[s].
The shape of tensor should be `[batch_size, sequence_length, ...]`
for `time_major == False` or `[sequence_length, batch_size, ...]`
for `time_major == True`. It represents the inputs to be sliced
from at every decoding step.
sequence_length(Variable): A tensor with shape `[batch_size]`.
It stores real length of each instance in `inputs`, by which we
can label the finished status of each instance at every decoding
step.
time_major(bool, optional): Indicate the data layout of Tensor included
in `inputs`. If `False`, the data layout would be batch major with
shape `[batch_size, sequence_length, ...]`. If `True`, the data
layout would be time major with shape `[sequence_length, batch_size, ...]`.
Default: `False`.
"""
self.inputs = inputs
self.sequence_length = sequence_length
self.time_major = time_major
# extend inputs to avoid to slice out of range in `next_inputs`
# may be easier and have better performance than condition_op
self.inputs_ = map_structure(
lambda x: nn.pad(x,
paddings=([0, 1] + [0, 0] * (len(x.shape) - 1))
if time_major else ([0, 0, 0, 1] + [0, 0] *
(len(x.shape) - 2))),
self.inputs)
def initialize(self):
"""
TrainingHelper initialization produces inputs for the first decoding
step by slicing at the first time step of full sequence inputs, and it
gives initial status telling whether each sequence in the batch is
finished. It is the partial of the initialization of `BasicDecoder`.
Returns:
tuple: A tuple( :code:`(initial_inputs, initial_finished)` ). \
`initial_inputs` is a (possibly nested structure of) tensor \
variable[s], and the tensor's shape is `[batch_size, ...]`. \
`initial_finished` is a bool tensor with shape `[batch_size]`.
"""
init_finished = control_flow.equal(
self.sequence_length,
tensor.fill_constant(
shape=[1], dtype=self.sequence_length.dtype, value=0))
# TODO: support zero length
init_inputs = map_structure(
lambda x: x[0] if self.time_major else x[:, 0], self.inputs)
return init_inputs, init_finished
def sample(self, time, outputs, states):
"""
Perform sampling by using `argmax` according to the `outputs`. Mostly
the sampled ids would not be used since the inputs for next decoding
step would be got by slicing.
Parameters:
time(Variable): An `int64` tensor with shape `[1]` provided by the
caller, representing the current time step number of decoding.
outputs(Variable): A tensor variable. Usually it's data type is float32
or float64, and it's shape is `[batch_size, vocabulary_size]`,
representing the predicted logits of current step. It is same as
`outputs` returned by `BasicDecoder.output_fn(BasicDecoder.cell.call())`.
states(Variable): A (possibly nested structure of) tensor variable[s].
It is same as `new_states` returned by `BasicDecoder.cell.call()`.
Returns:
Variable: An `int64` tensor with shape `[batch_size]`, representing \
the sampled ids.
"""
sample_ids = tensor.argmax(outputs, axis=-1)
return sample_ids
def next_inputs(self, time, outputs, states, sample_ids):
"""
Generate inputs for the next decoding step by slicing at corresponding
step of the full sequence inputs. Simultaneously, produce the states
for next time step by directly using the input `states` and emit status
telling whether each minibatch entry reaches to the corresponding length.
Parameters:
time(Variable): An `int64` tensor with shape `[1]` provided by the
caller, representing the current time step number of decoding.
outputs(Variable): A tensor variable. Usually it's data type is float32
or float64, and it's shape is `[batch_size, vocabulary_size]`,
representing the predicted logits of current step. It is same as
`outputs` returned by `BasicDecoder.output_fn(BasicDecoder.cell.call())`.
states(Variable): A (possibly nested structure of) tensor variable[s].
It is same as `new_states` returned by `BasicDecoder.cell.call()`.
sample_ids(Variable): An `int64` tensor variable shaped `[batch_size]`.
It is same as `sample_ids` returned by `sample()`.
Returns:
tuple: A tuple( :code:`(finished, next_inputs, next_states)` ). \
`next_inputs` and `next_states` both are a (possibly nested \
structure of) tensor variable[s], and the tensor's shape is \
`[batch_size, ...]`. `next_states` is identical to the input \
argument `states`. `finished` is a `bool` Tensor with \
shape `[batch_size]`.
"""
# TODO: compatibility of int32 and int64
time = tensor.cast(
time,
"int32") if convert_dtype(time.dtype) not in ["int32"] else time
if self.sequence_length.dtype != time.dtype:
self.sequence_length = tensor.cast(self.sequence_length, time.dtype)
next_time = time + 1
finished = control_flow.less_equal(self.sequence_length, next_time)
def _slice(x): # TODO: use Variable.__getitem__
axes = [0 if self.time_major else 1]
return nn.squeeze(
nn.slice(
x, axes=axes, starts=[next_time], ends=[next_time + 1]),
axes=axes)
next_inputs = map_structure(_slice, self.inputs_)
return finished, next_inputs, states
class GreedyEmbeddingHelper(DecodeHelper):
"""
GreedyEmbeddingHelper is a subclass of DecodeHelper. It is a decoding helper
uses the argmax of the output (treated as logits) and passes the results
through an embedding layer to get inputs for the next decoding step.
Examples:
.. code-block:: python
import paddle.fluid as fluid
import paddle.fluid.layers as layers
trg_emb = fluid.data(name="trg_emb",
shape=[None, None, 128],
dtype="float32")
trg_embeder = lambda x: fluid.embedding(
x, size=[10000, 128], param_attr=fluid.ParamAttr(name="trg_embedding"))
output_layer = lambda x: layers.fc(x,
size=10000,
num_flatten_dims=len(x.shape) - 1,
param_attr=fluid.ParamAttr(name=
"output_w"),
bias_attr=False)
helper = layers.GreedyEmbeddingHelper(trg_embeder, start_tokens=0, end_token=1)
decoder_cell = layers.GRUCell(hidden_size=128)
decoder = layers.BasicDecoder(decoder_cell, helper, output_fn=output_layer)
outputs = layers.dynamic_decode(
decoder=decoder, inits=decoder_cell.get_initial_states(encoder_output))
"""
def __init__(self, embedding_fn, start_tokens, end_token):
"""
Constructor of GreedyEmbeddingHelper.
Parameters:
embedding_fn(callable): A functor to apply on the argmax results.
Mostly it is an embedding layer to transform ids to embeddings.
**Note that fluid.embedding should be used here rather than
fluid.layers.embedding, since shape of ids is [batch_size].
when using fluid.layers.embedding, must unsqueeze in embedding_fn.**
start_tokens(Variable): A `int64` tensor shaped `[batch_size]`,
representing the start tokens.
end_token(int): The end token id.
Returns:
tuple: A tuple( :code:`(initial_inputs, initial_states, finished)` ). \
`initial_inputs` and `initial_states` both are a (possibly nested \
structure of) tensor variable[s], and `finished` is a tensor with \
bool data type.
"""
self.embedding_fn = embedding_fn
self.start_tokens = start_tokens
self.end_token = tensor.fill_constant(
shape=[1], dtype="int64", value=end_token)
def initialize(self):
"""
GreedyEmbeddingHelper initialization produces inputs for the first decoding
step by using `start_tokens` of the constructor, and gives initial
status telling whether each sequence in the batch is finished.
It is the partial of the initialization of `BasicDecoder`.
Returns:
tuple: A tuple( :code:`(initial_inputs, initial_finished)` ). \
`initial_inputs` is same as `start_tokens` of the constructor. \
`initial_finished` is a `bool` tensor filled by False and has \
the same shape as `start_tokens`.
"""
# TODO: remove the restriction of force_cpu
init_finished = tensor.fill_constant_batch_size_like(
input=self.start_tokens,
shape=[-1],
dtype="bool",
value=False,
force_cpu=True)
init_inputs = self.embedding_fn(self.start_tokens)
return init_inputs, init_finished
def sample(self, time, outputs, states):
"""
Perform sampling by using `argmax` according to the `outputs`.
Parameters:
time(Variable): An `int64` tensor with shape `[1]` provided by the
caller, representing the current time step number of decoding.
outputs(Variable): A tensor variable. Usually it's data type is float32
or float64, and it's shape is `[batch_size, vocabulary_size]`,
representing the predicted logits of current step. It is same as
`outputs` returned by `BasicDecoder.output_fn(BasicDecoder.cell.call())`.
states(Variable): A (possibly nested structure of) tensor variable[s].
It is same as `new_states` returned by `BasicDecoder.cell.call()`.
Returns:
Variable: An `int64` tensor with shape `[batch_size]`, representing \
the sampled ids.
"""
sample_ids = tensor.argmax(outputs, axis=-1)
return sample_ids
def next_inputs(self, time, outputs, states, sample_ids):
"""
Generate inputs for the next decoding step by applying `embedding_fn`
to `sample_ids`. Simultaneously, produce the states for next time step
by directly using the input `states` and emit status telling whether
each minibatch entry gets an `end_token` sample.
Parameters:
time(Variable): An `int64` tensor with shape `[1]` provided by the
caller, representing the current time step number of decoding.
outputs(Variable): A tensor variable. Usually it's data type is float32
or float64, and it's shape is `[batch_size, vocabulary_size]`,
representing the predicted logits of current step. It is same as
`outputs` returned by `BasicDecoder.output_fn(BasicDecoder.cell.call())`.
states(Variable): A (possibly nested structure of) tensor variable[s].
It is same as `new_states` returned by `BasicDecoder.cell.call()`.
sample_ids(Variable): An `int64` tensor variable shaped `[batch_size]`.
It is same as `sample_ids` returned by `sample()`.
Returns:
tuple: A tuple( :code:`(finished, next_inputs, next_states)` ). \
`next_inputs` and `next_states` both are a (possibly nested \
structure of) tensor variable[s], and the tensor's shape is \
`[batch_size, ...]`. `next_states` is identical to the input \
argument `states`. `finished` is a `bool` Tensor with \
shape `[batch_size]`.
"""
finished = control_flow.equal(sample_ids, self.end_token)
next_inputs = self.embedding_fn(sample_ids)
return finished, next_inputs, states
class SampleEmbeddingHelper(GreedyEmbeddingHelper):
"""
SampleEmbeddingHelper is a subclass of GreedyEmbeddingHelper. It is a decoding
helper uses sampling (from a distribution) instead of argmax of the output
(treated as logits) and passes the results through an embedding layer to get
inputs for the next decoding step.
Examples:
.. code-block:: python
import paddle.fluid as fluid
import paddle.fluid.layers as layers
trg_emb = fluid.data(name="trg_emb",
shape=[None, None, 128],
dtype="float32")
trg_embeder = lambda x: fluid.embedding(
x, size=[10000, 128], param_attr=fluid.ParamAttr(name="trg_embedding"))
output_layer = lambda x: layers.fc(x,
size=10000,
num_flatten_dims=len(x.shape) - 1,
param_attr=fluid.ParamAttr(name=
"output_w"),
bias_attr=False)
helper = layers.SampleEmbeddingHelper(trg_embeder, start_tokens=0, end_token=1)
decoder_cell = layers.GRUCell(hidden_size=128)
decoder = layers.BasicDecoder(decoder_cell, helper, output_fn=output_layer)
outputs = layers.dynamic_decode(
decoder=decoder, inits=decoder_cell.get_initial_states(encoder_output))
"""
def __init__(self,
embedding_fn,
start_tokens,
end_token,
softmax_temperature=None,
seed=None):
"""
Constructor of SampleEmbeddingHelper.
Parameters:
embedding_fn(callable): A functor to apply on the argmax results.
Mostly it is an embedding layer to transform ids to embeddings.
**Note that fluid.embedding should be used here rather than
fluid.layers.embedding, since shape of ids is [batch_size].
when using fluid.layers.embedding, must unsqueeze in embedding_fn.**
start_tokens(Variable): A `int64` tensor shaped `[batch_size]`,
representing the start tokens.
end_token(int): The end token id.
softmax_temperature(float, optional): the value to divide the logits
by before computing the softmax. Higher temperatures (above 1.0)
lead to more random, while lower temperatures push the sampling
distribution towards the argmax. It must be strictly greater than
0. Defaults to None, meaning using a temperature valued 1.0.
seed: (int, optional) The sampling seed. Defaults to None, meaning not
to use fixed seed.
Returns:
tuple: A tuple( :code:`(initial_inputs, initial_states, finished)` ). \
`initial_inputs` and `initial_states` both are a (possibly nested \
structure of) tensor variable[s], and `finished` is a tensor with \
bool data type.
"""
super(SampleEmbeddingHelper, self).__init__(embedding_fn, start_tokens,
end_token)
self.softmax_temperature = tensor.fill_constant(
shape=[1], dtype="float32", value=softmax_temperature
) if softmax_temperature is not None else None
self.seed = seed
def sample(self, time, outputs, states):
"""
Perform sampling from a categorical distribution, and the distribution
is computed by `softmax(outputs/softmax_temperature)`.
Parameters:
time(Variable): An `int64` tensor with shape `[1]` provided by the
caller, representing the current time step number of decoding.
outputs(Variable): A tensor variable. Usually it's data type is float32
or float64, and it's shape is `[batch_size, vocabulary_size]`,
representing the predicted logits of current step. It is same as
`outputs` returned by `BasicDecoder.output_fn(BasicDecoder.cell.call())`.
states(Variable): A (possibly nested structure of) tensor variable[s].
It is same as `new_states` returned by `BasicDecoder.cell.call()`.
Returns:
Variable: An `int64` tensor with shape `[batch_size]`, representing \
the sampled ids.
"""
logits = (outputs / self.softmax_temperature
) if self.softmax_temperature is not None else outputs
probs = nn.softmax(logits)
# TODO: remove this stop_gradient. The stop_gradient of sample_ids can
# not pass to probs, since sampling_id op does not have corresponding
# grad op and thus can not pass.
probs.stop_gradient = True
sample_ids = nn.sampling_id(
probs, seed=self.seed, dtype=self.start_tokens.dtype)
return sample_ids
class BasicDecoder(Decoder):
"""
BasicDecoder is a subclass of Decoder and assembles a RNNCell and DecodeHelper
instance as members, where the DecodeHelper helps to implement customed
decoding strategies.. It performs one decoding step as following steps:
1. Perform `cell_outputs, cell_states = cell.call(inputs, states)`
to get outputs and new states from cell.
2. Perform `sample_ids = helper.sample(time, cell_outputs, cell_states)`
to sample ids as decoded results of the current time step.
3. Perform `finished, next_inputs, next_states = helper.next_inputs(time,
cell_outputs, cell_states, sample_ids)` to generate inputs, states and
finished status for the next decoding step.
Examples:
.. code-block:: python
import paddle.fluid as fluid
import paddle.fluid.layers as layers
trg_emb = fluid.data(name="trg_emb",
shape=[None, None, 128],
dtype="float32")
trg_embeder = lambda x: fluid.embedding(
x, size=[10000, 128], param_attr=fluid.ParamAttr(name="trg_embedding"))
output_layer = lambda x: layers.fc(x,
size=10000,
num_flatten_dims=len(x.shape) - 1,
param_attr=fluid.ParamAttr(name=
"output_w"),
bias_attr=False)
helper = layers.SampleEmbeddingHelper(trg_embeder, start_tokens=0, end_token=1)
decoder_cell = layers.GRUCell(hidden_size=128)
decoder = layers.BasicDecoder(decoder_cell, helper, output_fn=output_layer)
outputs = layers.dynamic_decode(
decoder=decoder, inits=decoder_cell.get_initial_states(encoder_output))
"""
def __init__(self, cell, helper, output_fn=None):
"""
Constructor of BasicDecoder.
Parameters:
cell(RNNCell): An instance of `RNNCell` or object with the same interface.
helper(DecodeHelper): An instance of `DecodeHelper`.
output_fn(optional): A callable to apply to the cell's output prior to
sampling. Default None.
"""
self.cell = cell
self.helper = helper
self.output_fn = output_fn
def initialize(self, initial_cell_states):
"""
BasicDecoder initialization includes helper initialization and cell
initialization, and cell initialization uses `initial_cell_states` as
the result directly.
Parameters:
initial_cell_states(Variable): A (possibly nested structure of)
tensor variable[s]. An argument provided by the caller `dynamic_decode`.
Returns:
tuple: A tuple( :code:(initial_inputs, initial_cell_states, finished)` ). \
`initial_inputs` and `initial_states` both are a (possibly nested \
structure of) tensor variable[s], and `finished` is a tensor with \
bool data type. `initial_inputs` and `finished` are the results \
of `helper.initialize()`, and `initial_cell_states` is same as \
the input argument counterpart.
"""
(initial_inputs, initial_finished) = self.helper.initialize()
return initial_inputs, initial_cell_states, initial_finished
class OutputWrapper(
collections.namedtuple("OutputWrapper",
("cell_outputs", "sample_ids"))):
"""
The structure for the returned value `outputs` of `decoder.step`.
A namedtuple includes cell_outputs, sample_ids as fields.
"""
pass
def step(self, time, inputs, states, **kwargs):
"""
Perform one decoding step as following steps:
1. Perform `cell_outputs, cell_states = cell.call(inputs, states)`
to get outputs and new states from cell.
2. Perform `sample_ids = helper.sample(time, cell_outputs, cell_states)`
to sample ids as decoded results of the current time step.
3. Perform `finished, next_inputs, next_states = helper.next_inputs(time,
cell_outputs, cell_states, sample_ids)` to generate inputs, states and
finished status for the next decoding step.
Parameters:
time(Variable): An `int64` tensor with shape `[1]` provided by the caller,
representing the current time step number of decoding.
inputs(Variable): A tensor variable. It is same as `initial_inputs`
returned by `initialize()` for the first decoding step and
`next_inputs` returned by `step()` for the others.
states(Variable): A structure of tensor variables.
It is same as the `initial_cell_states` returned by `initialize()`
for the first decoding step and `next_states` returned by
`step()` for the others.
**kwargs: Additional keyword arguments, provided by the caller
`dynamic_decode`.
Returns:
tuple: A tuple( :code:`(outputs, next_states, next_inputs, finished)` ). \
`outputs` is a namedtuple(including cell_outputs, sample_ids, \
as fields) of tensor variables, where `cell_outputs` is the result \
fof `cell.call()` and `sample_ids` is the result of `helper.sample()`. \
`next_states` and `next_inputs` have the same structure, shape \
and data type as the input arguments `states` and `inputs` separately. \
`finished` is a `bool` tensor with shape `[batch_size]`.
"""
cell_outputs, cell_states = self.cell(inputs, states, **kwargs)
if self.output_fn is not None:
cell_outputs = self.output_fn(cell_outputs)
sample_ids = self.helper.sample(
time=time, outputs=cell_outputs, states=cell_states)
sample_ids.stop_gradient = True
(finished, next_inputs, next_states) = self.helper.next_inputs(
time=time,
outputs=cell_outputs,
states=cell_states,
sample_ids=sample_ids)
outputs = self.OutputWrapper(cell_outputs, sample_ids)
return (outputs, next_states, next_inputs, finished)
def dynamic_lstm(input, def dynamic_lstm(input,
......
...@@ -787,6 +787,7 @@ def argmin(x, axis=0): ...@@ -787,6 +787,7 @@ def argmin(x, axis=0):
inputs={'X': x}, inputs={'X': x},
outputs={'Out': [out]}, outputs={'Out': [out]},
attrs={'axis': axis}) attrs={'axis': axis})
out.stop_gradient = True
return out return out
...@@ -846,6 +847,7 @@ def argmax(x, axis=0): ...@@ -846,6 +847,7 @@ def argmax(x, axis=0):
inputs={'X': x}, inputs={'X': x},
outputs={'Out': [out]}, outputs={'Out': [out]},
attrs={'axis': axis}) attrs={'axis': axis})
out.stop_gradient = True
return out return out
......
...@@ -15,7 +15,7 @@ ...@@ -15,7 +15,7 @@
from __future__ import print_function from __future__ import print_function
import unittest import unittest
import numpy import numpy as np
import paddle.fluid as fluid import paddle.fluid as fluid
import paddle.fluid.layers as layers import paddle.fluid.layers as layers
...@@ -24,22 +24,15 @@ import paddle.fluid.core as core ...@@ -24,22 +24,15 @@ import paddle.fluid.core as core
from paddle.fluid.executor import Executor from paddle.fluid.executor import Executor
from paddle.fluid import framework from paddle.fluid import framework
from paddle.fluid.layers.rnn import LSTMCell, GRUCell, RNNCell, BeamSearchDecoder, dynamic_decode
from paddle.fluid.layers import rnn as dynamic_rnn
from paddle.fluid import contrib
from paddle.fluid.contrib.layers import basic_lstm
import numpy as np
class EncoderCell(RNNCell): class EncoderCell(layers.RNNCell):
def __init__(self, num_layers, hidden_size, dropout_prob=0.): def __init__(self, num_layers, hidden_size, dropout_prob=0.):
self.num_layers = num_layers self.num_layers = num_layers
self.hidden_size = hidden_size self.hidden_size = hidden_size
self.dropout_prob = dropout_prob self.dropout_prob = dropout_prob
self.lstm_cells = [] self.lstm_cells = [
for i in range(num_layers): layers.LSTMCell(hidden_size) for i in range(num_layers)
self.lstm_cells.append(LSTMCell(hidden_size)) ]
def call(self, step_input, states): def call(self, step_input, states):
new_states = [] new_states = []
...@@ -55,14 +48,14 @@ class EncoderCell(RNNCell): ...@@ -55,14 +48,14 @@ class EncoderCell(RNNCell):
return [cell.state_shape for cell in self.lstm_cells] return [cell.state_shape for cell in self.lstm_cells]
class DecoderCell(RNNCell): class DecoderCell(layers.RNNCell):
def __init__(self, num_layers, hidden_size, dropout_prob=0.): def __init__(self, num_layers, hidden_size, dropout_prob=0.):
self.num_layers = num_layers self.num_layers = num_layers
self.hidden_size = hidden_size self.hidden_size = hidden_size
self.dropout_prob = dropout_prob self.dropout_prob = dropout_prob
self.lstm_cells = [] self.lstm_cells = [
for i in range(num_layers): layers.LSTMCell(hidden_size) for i in range(num_layers)
self.lstm_cells.append(LSTMCell(hidden_size)) ]
def attention(self, hidden, encoder_output, encoder_padding_mask): def attention(self, hidden, encoder_output, encoder_padding_mask):
query = layers.fc(hidden, query = layers.fc(hidden,
...@@ -97,117 +90,456 @@ class DecoderCell(RNNCell): ...@@ -97,117 +90,456 @@ class DecoderCell(RNNCell):
return out, [new_lstm_states, out] return out, [new_lstm_states, out]
class TestDynamicDecode(unittest.TestCase): class Encoder(object):
def setUp(self): def __init__(self, num_layers, hidden_size, dropout_prob=0.):
self.batch_size = 4 self.encoder_cell = EncoderCell(num_layers, hidden_size, dropout_prob)
self.input_size = 16
self.hidden_size = 16
self.seq_len = 4
def test_run(self):
start_token = 0
end_token = 1
src_vocab_size = 10
trg_vocab_size = 10
num_layers = 1
hidden_size = self.hidden_size
beam_size = 8
max_length = self.seq_len
src = layers.data(name="src", shape=[-1, 1], dtype='int64')
src_len = layers.data(name="src_len", shape=[-1], dtype='int64')
trg = layers.data(name="trg", shape=[-1, 1], dtype='int64')
trg_len = layers.data(name="trg_len", shape=[-1], dtype='int64')
src_embeder = lambda x: fluid.embedding(
x,
size=[src_vocab_size, hidden_size],
param_attr=fluid.ParamAttr(name="src_embedding"))
trg_embeder = lambda x: fluid.embedding( def __call__(self, src_emb, src_sequence_length):
x, encoder_output, encoder_final_state = layers.rnn(
size=[trg_vocab_size, hidden_size], cell=self.encoder_cell,
param_attr=fluid.ParamAttr(name="trg_embedding")) inputs=src_emb,
sequence_length=src_sequence_length,
# use basic_lstm
encoder_cell = EncoderCell(num_layers, hidden_size)
encoder_output, encoder_final_state = dynamic_rnn(
cell=encoder_cell,
inputs=src_embeder(src),
sequence_length=src_len,
is_reverse=False) is_reverse=False)
return encoder_output, encoder_final_state
class Decoder(object):
def __init__(self,
num_layers,
hidden_size,
dropout_prob,
decoding_strategy="infer_sample",
max_decoding_length=20):
self.decoder_cell = DecoderCell(num_layers, hidden_size, dropout_prob)
self.decoding_strategy = decoding_strategy
self.max_decoding_length = None if (
self.decoding_strategy == "train_greedy") else max_decoding_length
def __call__(self, decoder_initial_states, encoder_output,
encoder_padding_mask, **kwargs):
output_layer = kwargs.pop("output_layer", None)
if self.decoding_strategy == "train_greedy":
# for teach-forcing MLE pre-training
helper = layers.TrainingHelper(**kwargs)
elif self.decoding_strategy == "infer_sample":
helper = layers.SampleEmbeddingHelper(**kwargs)
elif self.decoding_strategy == "infer_greedy":
helper = layers.GreedyEmbeddingHelper(**kwargs)
if self.decoding_strategy == "beam_search":
beam_size = kwargs.get("beam_size", 4)
encoder_output = layers.BeamSearchDecoder.tile_beam_merge_with_batch(
encoder_output, beam_size)
encoder_padding_mask = layers.BeamSearchDecoder.tile_beam_merge_with_batch(
encoder_padding_mask, beam_size)
decoder = layers.BeamSearchDecoder(
cell=self.decoder_cell, output_fn=output_layer, **kwargs)
else:
decoder = layers.BasicDecoder(
self.decoder_cell, helper, output_fn=output_layer)
(decoder_output, decoder_final_state,
dec_seq_lengths) = layers.dynamic_decode(
decoder,
inits=decoder_initial_states,
max_step_num=self.max_decoding_length,
encoder_output=encoder_output,
encoder_padding_mask=encoder_padding_mask,
impute_finished=False # for test coverage
if self.decoding_strategy == "beam_search" else True,
is_test=True if self.decoding_strategy == "beam_search" else False,
return_length=True)
return decoder_output, decoder_final_state, dec_seq_lengths
class Seq2SeqModel(object):
"""Seq2Seq model: RNN encoder-decoder with attention"""
def __init__(self,
num_layers,
hidden_size,
dropout_prob,
src_vocab_size,
trg_vocab_size,
start_token,
end_token,
decoding_strategy="infer_sample",
max_decoding_length=20,
beam_size=4):
self.start_token, self.end_token = start_token, end_token
self.max_decoding_length, self.beam_size = max_decoding_length, beam_size
self.src_embeder = lambda x: fluid.embedding(
input=x,
size=[src_vocab_size, hidden_size],
dtype="float32",
param_attr=fluid.ParamAttr(name="source_embedding"))
self.trg_embeder = lambda x: fluid.embedding(
input=x,
size=[trg_vocab_size, hidden_size],
dtype="float32",
param_attr=fluid.ParamAttr(name="target_embedding"))
self.encoder = Encoder(num_layers, hidden_size, dropout_prob)
self.decoder = Decoder(num_layers, hidden_size, dropout_prob,
decoding_strategy, max_decoding_length)
self.output_layer = lambda x: layers.fc(
x,
size=trg_vocab_size,
num_flatten_dims=len(x.shape) - 1,
param_attr=fluid.ParamAttr(name="output_w"),
bias_attr=False)
src_mask = layers.sequence_mask( def __call__(self, src, src_length, trg=None, trg_length=None):
src_len, maxlen=layers.shape(src)[1], dtype='float32') # encoder
encoder_padding_mask = (src_mask - 1.0) * 1000000000 encoder_output, encoder_final_state = self.encoder(
encoder_padding_mask = layers.unsqueeze(encoder_padding_mask, [1]) self.src_embeder(src), src_length)
decoder_cell = DecoderCell(num_layers, hidden_size)
decoder_initial_states = [ decoder_initial_states = [
encoder_final_state, decoder_cell.get_initial_states( encoder_final_state, self.decoder.decoder_cell.get_initial_states(
batch_ref=encoder_output, shape=[hidden_size]) batch_ref=encoder_output, shape=[encoder_output.shape[-1]])
] ]
src_mask = layers.sequence_mask(
src_length, maxlen=layers.shape(src)[1], dtype="float32")
encoder_padding_mask = (src_mask - 1.0) * 1e9
encoder_padding_mask = layers.unsqueeze(encoder_padding_mask, [1])
decoder_output, _ = dynamic_rnn( # decoder
cell=decoder_cell, decoder_kwargs = {
inputs=trg_embeder(trg), "inputs": self.trg_embeder(trg),
initial_states=decoder_initial_states, "sequence_length": trg_length,
sequence_length=None, } if self.decoder.decoding_strategy == "train_greedy" else ({
encoder_output=encoder_output, "embedding_fn": self.trg_embeder,
encoder_padding_mask=encoder_padding_mask) "beam_size": self.beam_size,
"start_token": self.start_token,
output_layer = lambda x: layers.fc(x, "end_token": self.end_token
size=trg_vocab_size, } if self.decoder.decoding_strategy == "beam_search" else {
num_flatten_dims=len(x.shape) - 1, "embedding_fn": self.trg_embeder,
param_attr=fluid.ParamAttr( "start_tokens": layers.fill_constant_batch_size_like(
name="output_w"), input=encoder_output,
bias_attr=False) shape=[-1],
dtype=src.dtype,
# inference value=self.start_token),
encoder_output = BeamSearchDecoder.tile_beam_merge_with_batch( "end_token": self.end_token
encoder_output, beam_size) })
encoder_padding_mask = BeamSearchDecoder.tile_beam_merge_with_batch( decoder_kwargs["output_layer"] = self.output_layer
encoder_padding_mask, beam_size)
beam_search_decoder = BeamSearchDecoder( (decoder_output, decoder_final_state,
decoder_cell, dec_seq_lengths) = self.decoder(decoder_initial_states, encoder_output,
start_token, encoder_padding_mask, **decoder_kwargs)
end_token, if self.decoder.decoding_strategy == "beam_search": # for inference
beam_size, return decoder_output
embedding_fn=trg_embeder, logits, samples, sample_length = (decoder_output.cell_outputs,
output_fn=output_layer) decoder_output.sample_ids,
outputs, _ = dynamic_decode( dec_seq_lengths)
beam_search_decoder, probs = layers.softmax(logits)
inits=decoder_initial_states, return probs, samples, sample_length
max_step_num=max_length,
encoder_output=encoder_output,
encoder_padding_mask=encoder_padding_mask) class PolicyGradient(object):
"""policy gradient"""
if core.is_compiled_with_cuda():
place = core.CUDAPlace(0) def __init__(self, lr=None):
self.lr = lr
def learn(self, act_prob, action, reward, length=None):
"""
update policy model self.model with policy gradient algorithm
"""
self.reward = fluid.layers.py_func(
func=reward_func, x=[action, length], out=reward)
neg_log_prob = layers.cross_entropy(act_prob, action)
cost = neg_log_prob * reward
cost = (layers.reduce_sum(cost) / layers.reduce_sum(length)
) if length is not None else layers.reduce_mean(cost)
optimizer = fluid.optimizer.Adam(self.lr)
optimizer.minimize(cost)
return cost
def reward_func(samples, sample_length):
"""toy reward"""
def discount_reward(reward, sequence_length, discount=1.):
return discount_reward_1d(reward, sequence_length, discount)
def discount_reward_1d(reward, sequence_length, discount=1., dtype=None):
if sequence_length is None:
raise ValueError(
'sequence_length must not be `None` for 1D reward.')
reward = np.array(reward)
sequence_length = np.array(sequence_length)
batch_size = reward.shape[0]
max_seq_length = np.max(sequence_length)
dtype = dtype or reward.dtype
if discount == 1.:
dmat = np.ones([batch_size, max_seq_length], dtype=dtype)
else: else:
place = core.CPUPlace() steps = np.tile(np.arange(max_seq_length), [batch_size, 1])
exe = Executor(place) mask = np.asarray(
exe.run(framework.default_startup_program()) steps < (sequence_length - 1)[:, None], dtype=dtype)
# Make each row = [discount, ..., discount, 1, ..., 1]
src_np = np.random.randint( dmat = mask * discount + (1 - mask)
0, src_vocab_size, (self.batch_size, max_length)).astype('int64') dmat = np.cumprod(dmat[:, ::-1], axis=1)[:, ::-1]
src_len_np = np.ones(self.batch_size, dtype='int64') * max_length disc_reward = dmat * reward[:, None]
trg_np = np.random.randint( disc_reward = mask_sequences(disc_reward, sequence_length, dtype=dtype)
0, trg_vocab_size, (self.batch_size, max_length)).astype('int64') return disc_reward
trg_len_np = np.ones(self.batch_size, dtype='int64') * max_length
def mask_sequences(sequence, sequence_length, dtype=None, time_major=False):
out = exe.run(feed={ sequence = np.array(sequence)
'src': src_np, sequence_length = np.array(sequence_length)
'src_len': src_len_np, rank = sequence.ndim
'trg': trg_np, if rank < 2:
'trg_len': trg_len_np raise ValueError("`sequence` must be 2D or higher order.")
}, batch_size = sequence.shape[0]
fetch_list=[outputs]) max_time = sequence.shape[1]
dtype = dtype or sequence.dtype
self.assertTrue(out[0].shape[0] == self.batch_size) if time_major:
self.assertTrue(out[0].shape[1] <= max_length + 1) sequence = np.transpose(sequence, axes=[1, 0, 2])
self.assertTrue(out[0].shape[2] == beam_size) steps = np.tile(np.arange(max_time), [batch_size, 1])
mask = np.asarray(steps < sequence_length[:, None], dtype=dtype)
for _ in range(2, rank):
mask = np.expand_dims(mask, -1)
sequence = sequence * mask
if time_major:
sequence = np.transpose(sequence, axes=[1, 0, 2])
return sequence
samples = np.array(samples)
sample_length = np.array(sample_length)
# length reward
reward = (5 - np.abs(sample_length - 5)).astype("float32")
# repeat punishment to trapped into local minima getting all same words
# beam search to get more than one sample may also can avoid this
for i in range(reward.shape[0]):
reward[i] += -10 if sample_length[i] > 1 and np.all(
samples[i][:sample_length[i] - 1] == samples[i][0]) else 0
return discount_reward(reward, sample_length, discount=1.).astype("float32")
class MLE(object):
"""teacher-forcing MLE training"""
def __init__(self, lr=None):
self.lr = lr
def learn(self, probs, label, weight=None, length=None):
loss = layers.cross_entropy(input=probs, label=label, soft_label=False)
max_seq_len = layers.shape(probs)[1]
mask = layers.sequence_mask(length, maxlen=max_seq_len, dtype="float32")
loss = loss * mask
loss = layers.reduce_mean(loss, dim=[0])
loss = layers.reduce_sum(loss)
optimizer = fluid.optimizer.Adam(self.lr)
optimizer.minimize(loss)
return loss
class SeqPGAgent(object):
def __init__(self,
model_cls,
alg_cls=PolicyGradient,
model_hparams={},
alg_hparams={},
executor=None,
main_program=None,
startup_program=None,
seed=None):
self.main_program = fluid.Program(
) if main_program is None else main_program
self.startup_program = fluid.Program(
) if startup_program is None else startup_program
if seed is not None:
self.main_program.random_seed = seed
self.startup_program.random_seed = seed
self.build_program(model_cls, alg_cls, model_hparams, alg_hparams)
self.executor = executor
def build_program(self, model_cls, alg_cls, model_hparams, alg_hparams):
with fluid.program_guard(self.main_program, self.startup_program):
source = fluid.data(name="src", shape=[None, None], dtype="int64")
source_length = fluid.data(
name="src_sequence_length", shape=[None], dtype="int64")
# only for teacher-forcing MLE training
target = fluid.data(name="trg", shape=[None, None], dtype="int64")
target_length = fluid.data(
name="trg_sequence_length", shape=[None], dtype="int64")
label = fluid.data(
name="label", shape=[None, None, 1], dtype="int64")
self.model = model_cls(**model_hparams)
self.alg = alg_cls(**alg_hparams)
self.probs, self.samples, self.sample_length = self.model(
source, source_length, target, target_length)
self.samples.stop_gradient = True
self.reward = fluid.layers.create_global_var(
name="reward",
shape=[-1, -1], # batch_size, seq_len
value="1",
dtype=self.probs.dtype)
self.cost = self.alg.learn(self.probs, self.samples, self.reward,
self.sample_length)
# to define the same parameters between different programs
self.pred_program = self.main_program._prune_with_input(
[source.name, source_length.name],
[self.probs, self.samples, self.sample_length])
def predict(self, feed_dict):
samples, sample_length = self.executor.run(
self.pred_program,
feed=feed_dict,
fetch_list=[self.samples, self.sample_length])
return samples, sample_length
def learn(self, feed_dict, fetch_list):
results = self.executor.run(self.main_program,
feed=feed_dict,
fetch_list=fetch_list)
return results
class TestDynamicDecode(unittest.TestCase):
def setUp(self):
np.random.seed(123)
self.model_hparams = {
"num_layers": 2,
"hidden_size": 32,
"dropout_prob": 0.1,
"src_vocab_size": 100,
"trg_vocab_size": 100,
"start_token": 0,
"end_token": 1,
"decoding_strategy": "infer_greedy",
"max_decoding_length": 10
}
self.iter_num = iter_num = 2
self.batch_size = batch_size = 4
src_seq_len = 10
trg_seq_len = 12
self.data = {
"src": np.random.randint(
2, self.model_hparams["src_vocab_size"],
(iter_num * batch_size, src_seq_len)).astype("int64"),
"src_sequence_length": np.random.randint(
1, src_seq_len, (iter_num * batch_size, )).astype("int64"),
"trg": np.random.randint(
2, self.model_hparams["src_vocab_size"],
(iter_num * batch_size, trg_seq_len)).astype("int64"),
"trg_sequence_length": np.random.randint(
1, trg_seq_len, (iter_num * batch_size, )).astype("int64"),
"label": np.random.randint(
2, self.model_hparams["src_vocab_size"],
(iter_num * batch_size, trg_seq_len, 1)).astype("int64"),
}
place = core.CUDAPlace(0) if core.is_compiled_with_cuda(
) else core.CPUPlace()
self.exe = Executor(place)
def test_mle_train(self):
self.model_hparams["decoding_strategy"] = "train_greedy"
agent = SeqPGAgent(
model_cls=Seq2SeqModel,
alg_cls=MLE,
model_hparams=self.model_hparams,
alg_hparams={"lr": 0.001},
executor=self.exe,
main_program=fluid.Program(),
startup_program=fluid.Program(),
seed=123)
self.exe.run(agent.startup_program)
for iter_idx in range(self.iter_num):
reward, cost = agent.learn(
{
"src": self.data["src"][iter_idx * self.batch_size:(
iter_idx + 1) * self.batch_size, :],
"src_sequence_length": self.data["src_sequence_length"][
iter_idx * self.batch_size:(iter_idx + 1
) * self.batch_size],
"trg": self.data["trg"][iter_idx * self.batch_size:(
iter_idx + 1) * self.batch_size, :],
"trg_sequence_length": self.data["trg_sequence_length"]
[iter_idx * self.batch_size:(iter_idx + 1) *
self.batch_size],
"label": self.data["label"][iter_idx * self.batch_size:(
iter_idx + 1) * self.batch_size]
},
fetch_list=[agent.cost, agent.cost])
print("iter_idx: %d, reward: %f, cost: %f" %
(iter_idx, reward.mean(), cost))
def test_greedy_train(self):
self.model_hparams["decoding_strategy"] = "infer_greedy"
agent = SeqPGAgent(
model_cls=Seq2SeqModel,
alg_cls=PolicyGradient,
model_hparams=self.model_hparams,
alg_hparams={"lr": 0.001},
executor=self.exe,
main_program=fluid.Program(),
startup_program=fluid.Program(),
seed=123)
self.exe.run(agent.startup_program)
for iter_idx in range(self.iter_num):
reward, cost = agent.learn(
{
"src": self.data["src"][iter_idx * self.batch_size:(
iter_idx + 1) * self.batch_size, :],
"src_sequence_length": self.data["src_sequence_length"]
[iter_idx * self.batch_size:(iter_idx + 1) *
self.batch_size]
},
fetch_list=[agent.reward, agent.cost])
print("iter_idx: %d, reward: %f, cost: %f" %
(iter_idx, reward.mean(), cost))
def test_sample_train(self):
self.model_hparams["decoding_strategy"] = "infer_sample"
agent = SeqPGAgent(
model_cls=Seq2SeqModel,
alg_cls=PolicyGradient,
model_hparams=self.model_hparams,
alg_hparams={"lr": 0.001},
executor=self.exe,
main_program=fluid.Program(),
startup_program=fluid.Program(),
seed=123)
self.exe.run(agent.startup_program)
for iter_idx in range(self.iter_num):
reward, cost = agent.learn(
{
"src": self.data["src"][iter_idx * self.batch_size:(
iter_idx + 1) * self.batch_size, :],
"src_sequence_length": self.data["src_sequence_length"]
[iter_idx * self.batch_size:(iter_idx + 1) *
self.batch_size]
},
fetch_list=[agent.reward, agent.cost])
print("iter_idx: %d, reward: %f, cost: %f" %
(iter_idx, reward.mean(), cost))
def test_beam_search_infer(self):
self.model_hparams["decoding_strategy"] = "beam_search"
main_program = fluid.Program()
startup_program = fluid.Program()
with fluid.program_guard(main_program, startup_program):
source = fluid.data(name="src", shape=[None, None], dtype="int64")
source_length = fluid.data(
name="src_sequence_length", shape=[None], dtype="int64")
model = Seq2SeqModel(**self.model_hparams)
output = model(source, source_length)
self.exe.run(startup_program)
for iter_idx in range(self.iter_num):
trans_ids = self.exe.run(
program=main_program,
feed={
"src": self.data["src"][iter_idx * self.batch_size:(
iter_idx + 1) * self.batch_size, :],
"src_sequence_length": self.data["src_sequence_length"]
[iter_idx * self.batch_size:(iter_idx + 1) *
self.batch_size]
},
fetch_list=[output])[0]
if __name__ == '__main__': if __name__ == '__main__':
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册