From 3f08b085ae9750868f94190db9729b3135c892cf Mon Sep 17 00:00:00 2001 From: Xin Pan Date: Sat, 13 Oct 2018 17:47:37 +0800 Subject: [PATCH] Merge pull request #13819 from panyx0718/doc Explain LoD and a few other concepts test=release/1.0.0 --- paddle/fluid/pybind/pybind.cc | 45 +++++++++++++++++++++++++++- python/paddle/fluid/layers/io.py | 6 +++- python/paddle/fluid/layers/tensor.py | 2 +- 3 files changed, 50 insertions(+), 3 deletions(-) diff --git a/paddle/fluid/pybind/pybind.cc b/paddle/fluid/pybind/pybind.cc index 15268aebe4d..29c9e37473b 100644 --- a/paddle/fluid/pybind/pybind.cc +++ b/paddle/fluid/pybind/pybind.cc @@ -156,7 +156,50 @@ PYBIND11_PLUGIN(core) { .def("_get_double_element", TensorGetElement) .def("_dtype", [](Tensor &self) { return ToDataType(self.type()); }); - py::class_(m, "LoDTensor") + py::class_(m, "LoDTensor", R"DOC( + LoDTensor is a Tensor with optional LoD information. + + np.array(lod_tensor) can convert LoDTensor to numpy array. + lod_tensor.lod() can retrieve the LoD information. + + LoD is short for Level of Details and is usually used for varied sequence + length. You can skip the following comment if you don't need optional LoD. + + For example: + A LoDTensor X can look like the example below. It contains 2 sequences. + The first has length 2 and the second has length 3, as described by x.lod. + + The first tensor dimension 6=2+3 is calculated from LoD if it's available. + It means the total number of sequence element. In X, each element has 2 + columns, hence [6, 2]. + + x.lod = [[2, 3]] + x.data = [[1, 2], [3, 4], + [5, 6], [7, 8], [9, 10], [11, 12]] + x.shape = [6, 2] + + LoD can have multiple levels (for example, a paragraph can have multiple + sentences and a sentence can have multiple words). In the following + LodTensor Y, the lod_level is 2. It means there are 2 sequence, the + first sequence length is 2 (has 2 sub-sequences), the second one's + length is 1. The first sequence's 2 sub-sequences have length 2 and 2, + respectively. And the second sequence's 1 sub-sequence has length 3. + + y.lod = [[2 1], [2 2 3]] + y.shape = [2+2+3, ...] + + Note: + In above description, LoD is length-based. In Paddle internal + implementation, lod is offset-based. Hence, internally, + y.lod is represented as [[0, 2, 3], [0, 2, 4, 7]] (length-based + equivlent would be [[2-0, 3-2], [2-0, 4-2, 7-4]]). + + Sometimes LoD is called recursive_sequence_length to be more + self-explanatory. In this case, it must be length-based. Due to history + reasons. when LoD is called lod in public API, it might be offset-based. + Users should be careful about it. + + )DOC") .def_buffer( [](Tensor &self) -> py::buffer_info { return CastToPyBuffer(self); }) .def("__init__", diff --git a/python/paddle/fluid/layers/io.py b/python/paddle/fluid/layers/io.py index 27c9cc3280b..2e73daf89ab 100644 --- a/python/paddle/fluid/layers/io.py +++ b/python/paddle/fluid/layers/io.py @@ -55,7 +55,11 @@ def data(name, Args: name(str): The name/alias of the function shape(list): Tuple declaring the shape. - append_batch_size(bool): Whether or not to append the data as a batch. + append_batch_size(bool): + 1. If true, it prepends -1 to the shape. + For example if shape=[1], the resulting shape is [-1, 1]. + 2. If shape contains -1, such as shape=[1, -1], + append_batch_size will be enforced to be be False (ineffective). dtype(int|float): The type of data : float32, float_16, int etc type(VarType): The output type. By default it is LOD_TENSOR. lod_level(int): The LoD Level. 0 means the input data is not a sequence. diff --git a/python/paddle/fluid/layers/tensor.py b/python/paddle/fluid/layers/tensor.py index 04e71497aa7..d5353cf2dfd 100644 --- a/python/paddle/fluid/layers/tensor.py +++ b/python/paddle/fluid/layers/tensor.py @@ -111,7 +111,7 @@ def create_global_var(shape, force_cpu=False, name=None): """ - Create a new variable in the global block(block 0). + Create a new tensor variable with value in the global block(block 0). Args: shape(list[int]): shape of the variable -- GitLab