diff --git a/develop/doc/api/v1/data_provider/pydataprovider2_en.html b/develop/doc/api/v1/data_provider/pydataprovider2_en.html index d9b11178c0c18ebf1bdedfa205b1b35a6d60f658..37d2038780ebee2ba4ee2c00ff8c9163b7f58e4a 100644 --- a/develop/doc/api/v1/data_provider/pydataprovider2_en.html +++ b/develop/doc/api/v1/data_provider/pydataprovider2_en.html @@ -239,19 +239,20 @@ label of an image. The second part contains 28x28 pixel float values.

The corresponding dataprovider is shown below:

-
#  Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
+
#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
 #
-#Licensed under the Apache License, Version 2.0 (the "License");
-#you may not use this file except in compliance with the License.
-#You may obtain a copy of the License at
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
 #
-#    http://www.apache.org/licenses/LICENSE-2.0
+#     http://www.apache.org/licenses/LICENSE-2.0
 #
-#Unless required by applicable law or agreed to in writing, software
-#distributed under the License is distributed on an "AS IS" BASIS,
-#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-#See the License for the specific language governing permissions and
-#limitations under the License.
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
 from paddle.trainer.PyDataProvider2 import *
 
 
@@ -307,19 +308,20 @@ sample by using keywords y
 generator.

Only a few lines of code need to be added to the training configuration file; you can take this as an example.

-
#  Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
+
#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
 #
-#Licensed under the Apache License, Version 2.0 (the "License");
-#you may not use this file except in compliance with the License.
-#You may obtain a copy of the License at
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
 #
-#    http://www.apache.org/licenses/LICENSE-2.0
+#     http://www.apache.org/licenses/LICENSE-2.0
 #
-#Unless required by applicable law or agreed to in writing, software
-#distributed under the License is distributed on an "AS IS" BASIS,
-#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-#See the License for the specific language governing permissions and
-#limitations under the License.
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
 from paddle.trainer_config_helpers import *
 
 define_py_data_sources2(
@@ -373,19 +375,21 @@ the dataprovider
#  Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
+37
+38
#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
 #
-#Licensed under the Apache License, Version 2.0 (the "License");
-#you may not use this file except in compliance with the License.
-#You may obtain a copy of the License at
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
 #
-#    http://www.apache.org/licenses/LICENSE-2.0
+#     http://www.apache.org/licenses/LICENSE-2.0
 #
-#Unless required by applicable law or agreed to in writing, software
-#distributed under the License is distributed on an "AS IS" BASIS,
-#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-#See the License for the specific language governing permissions and
-#limitations under the License.
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
 from paddle.trainer.PyDataProvider2 import *
 
 
@@ -447,19 +451,20 @@ negative sentiment (marked by 0 and 1 respectively).

The corresponding data provider can be found in the path below:

-
#  Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
+
#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
 #
-#Licensed under the Apache License, Version 2.0 (the "License");
-#you may not use this file except in compliance with the License.
-#You may obtain a copy of the License at
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
 #
-#    http://www.apache.org/licenses/LICENSE-2.0
+#     http://www.apache.org/licenses/LICENSE-2.0
 #
-#Unless required by applicable law or agreed to in writing, software
-#distributed under the License is distributed on an "AS IS" BASIS,
-#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-#See the License for the specific language governing permissions and
-#limitations under the License.
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
 from paddle.trainer.PyDataProvider2 import *
 
 
@@ -520,19 +525,20 @@ configuration file, and it maps word string to word id.
 
 

To pass these parameters into the DataProvider, the following lines should be added to the trainer configuration file.

-
#  Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
+
#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
 #
-#Licensed under the Apache License, Version 2.0 (the "License");
-#you may not use this file except in compliance with the License.
-#You may obtain a copy of the License at
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
 #
-#    http://www.apache.org/licenses/LICENSE-2.0
+#     http://www.apache.org/licenses/LICENSE-2.0
 #
-#Unless required by applicable law or agreed to in writing, software
-#distributed under the License is distributed on an "AS IS" BASIS,
-#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-#See the License for the specific language governing permissions and
-#limitations under the License.
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
 from paddle.trainer_config_helpers import *
 
 dictionary = dict()
diff --git a/develop/doc_cn/api/v1/data_provider/pydataprovider2_cn.html b/develop/doc_cn/api/v1/data_provider/pydataprovider2_cn.html
index 963514864ee3dfb74e49cde6df9bad14f0861a85..3967fca772a765071105641f10bd5e39b4b4eff8 100644
--- a/develop/doc_cn/api/v1/data_provider/pydataprovider2_cn.html
+++ b/develop/doc_cn/api/v1/data_provider/pydataprovider2_cn.html
@@ -282,19 +282,20 @@
 

dataprovider的使用

-
#  Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
+
#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
 #
-#Licensed under the Apache License, Version 2.0 (the "License");
-#you may not use this file except in compliance with the License.
-#You may obtain a copy of the License at
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
 #
-#    http://www.apache.org/licenses/LICENSE-2.0
+#     http://www.apache.org/licenses/LICENSE-2.0
 #
-#Unless required by applicable law or agreed to in writing, software
-#distributed under the License is distributed on an "AS IS" BASIS,
-#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-#See the License for the specific language governing permissions and
-#limitations under the License.
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
 from paddle.trainer.PyDataProvider2 import *
 
 
@@ -327,8 +328,8 @@
 
  • 其次,定义一个Python的 Decorator @provider 。用于将下一行的数据输入函数标记成一个PyDataProvider2,同时设置它的input_types属性。

    • input_types:设置这个PyDataProvider2返回什么样的数据。本例根据网络配置中 data_layer 的名字,显式指定返回的是一个28*28维的稠密浮点数向量和一个[0-9]的10维整数标签。

      -
      #Unless required by applicable law or agreed to in writing, software
      -#distributed under the License is distributed on an "AS IS" BASIS,
      +
      # Unless required by applicable law or agreed to in writing, software
      +# distributed under the License is distributed on an "AS IS" BASIS,
       
    • @@ -356,13 +357,13 @@

      网络配置中的调用

      在网络配置里,只需要一行代码就可以调用这个PyDataProvider2,如,

      -
      #  Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
      +
      #   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
       #
      -#Licensed under the Apache License, Version 2.0 (the "License");
      -#you may not use this file except in compliance with the License.
      -#You may obtain a copy of the License at
      +# Licensed under the Apache License, Version 2.0 (the "License");
      +# you may not use this file except in compliance with the License.
      +# You may obtain a copy of the License at
       #
      -#    http://www.apache.org/licenses/LICENSE-2.0
      +#     http://www.apache.org/licenses/LICENSE-2.0
       

      训练数据是 train.list ,没有测试数据,调用的PyDataProvider2是 mnist_provider 模块中的 process 函数。

      @@ -399,19 +400,20 @@
    • 其中 input_types 和在 @provider 中配置的效果一致。本例中的输入特征是词ID的序列,因此使用 integer_value_sequence 类型来设置。
    • dictionary 存入settings对象,在 process 函数中使用。 dictionary是从网络配置中传入的dict对象,即一个将单词字符串映射到单词ID的字典。
    -
    #  Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
    +
    #   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
     #
    -#Licensed under the Apache License, Version 2.0 (the "License");
    -#you may not use this file except in compliance with the License.
    -#You may obtain a copy of the License at
    +# Licensed under the Apache License, Version 2.0 (the "License");
    +# you may not use this file except in compliance with the License.
    +# You may obtain a copy of the License at
     #
    -#    http://www.apache.org/licenses/LICENSE-2.0
    +#     http://www.apache.org/licenses/LICENSE-2.0
     #
    -#Unless required by applicable law or agreed to in writing, software
    -#distributed under the License is distributed on an "AS IS" BASIS,
    -#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    -#See the License for the specific language governing permissions and
    -#limitations under the License.
    +# Unless required by applicable law or agreed to in writing, software
    +# distributed under the License is distributed on an "AS IS" BASIS,
    +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    +# See the License for the specific language governing permissions and
    +# limitations under the License.
    +
     from paddle.trainer.PyDataProvider2 import *
     
     
    @@ -465,21 +467,22 @@
     
  • 在配置中需要读取外部字典。
  • 在声明DataProvider的时候传入dictionary作为参数。
  • -
    #  Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
    +
    #   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
     #
    -#Licensed under the Apache License, Version 2.0 (the "License");
    -#you may not use this file except in compliance with the License.
    -#You may obtain a copy of the License at
    +# Licensed under the Apache License, Version 2.0 (the "License");
    +# you may not use this file except in compliance with the License.
    +# You may obtain a copy of the License at
     #
    -#    http://www.apache.org/licenses/LICENSE-2.0
    +#     http://www.apache.org/licenses/LICENSE-2.0
     #
    -#Unless required by applicable law or agreed to in writing, software
    -#distributed under the License is distributed on an "AS IS" BASIS,
    -#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    -#See the License for the specific language governing permissions and
    -#limitations under the License.
    -from paddle.trainer_config_helpers import *
    -
    +# Unless required by applicable law or agreed to in writing, software
    +# distributed under the License is distributed on an "AS IS" BASIS,
    +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    +# See the License for the specific language governing permissions and
    +# limitations under the License.
    +
    +from paddle.trainer_config_helpers import *
    +
     dictionary = dict()
     ...  #  read dictionary from outside
     
    diff --git a/develop/doc_cn/faq/local/index_cn.html b/develop/doc_cn/faq/local/index_cn.html
    index 409448eeeb66e3c154137d07235a01b13453afcc..0bf82bbb6f59bc36478494b3bd5d42d6b0f586dd 100644
    --- a/develop/doc_cn/faq/local/index_cn.html
    +++ b/develop/doc_cn/faq/local/index_cn.html
    @@ -279,19 +279,21 @@ PaddlePaddle的内存占用主要分为如下几个方面:

    所以,减小这个内存池即可减小内存占用,同时也可以加速开始训练前数据载入的过程。但是,这 个内存池实际上决定了shuffle的粒度。所以,如果将这个内存池减小,又要保证数据是随机的, 那么最好将数据文件在每次读取之前做一次shuffle。可能的代码为

    -
    #  Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
    +
    #   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
     #
    -#Licensed under the Apache License, Version 2.0 (the "License");
    -#you may not use this file except in compliance with the License.
    -#You may obtain a copy of the License at
    +# Licensed under the Apache License, Version 2.0 (the "License");
    +# you may not use this file except in compliance with the License.
    +# You may obtain a copy of the License at
     #
    -#    http://www.apache.org/licenses/LICENSE-2.0
    +#     http://www.apache.org/licenses/LICENSE-2.0
     #
    -#Unless required by applicable law or agreed to in writing, software
    -#distributed under the License is distributed on an "AS IS" BASIS,
    -#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    -#See the License for the specific language governing permissions and
    -#limitations under the License.
    +# Unless required by applicable law or agreed to in writing, software
    +# distributed under the License is distributed on an "AS IS" BASIS,
    +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    +# See the License for the specific language governing permissions and
    +# limitations under the License.
    +
    +
     @provider(min_pool_size=0, ...)
     def process(settings, filename):
         os.system('shuf %s > %s.shuf' % (filename, filename))  # shuffle before.
    @@ -335,19 +337,21 @@ PaddlePaddle的内存占用主要分为如下几个方面:

    减少数据载入的耗时

    使用pydataprovider时,可以减少缓存池的大小,同时设置内存缓存功能,即可以极大地加速数据载入流程。 DataProvider 缓存池的减小,和之前通过减小缓存池来减小内存占用的原理一致。

    -
    #  Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
    +
    #   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
     #
    -#Licensed under the Apache License, Version 2.0 (the "License");
    -#you may not use this file except in compliance with the License.
    -#You may obtain a copy of the License at
    +# Licensed under the Apache License, Version 2.0 (the "License");
    +# you may not use this file except in compliance with the License.
    +# You may obtain a copy of the License at
     #
    -#    http://www.apache.org/licenses/LICENSE-2.0
    +#     http://www.apache.org/licenses/LICENSE-2.0
     #
    -#Unless required by applicable law or agreed to in writing, software
    -#distributed under the License is distributed on an "AS IS" BASIS,
    -#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    -#See the License for the specific language governing permissions and
    -#limitations under the License.
    +# Unless required by applicable law or agreed to in writing, software
    +# distributed under the License is distributed on an "AS IS" BASIS,
    +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    +# See the License for the specific language governing permissions and
    +# limitations under the License.
    +
    +
     @provider(min_pool_size=0, ...)
     def process(settings, filename):
         os.system('shuf %s > %s.shuf' % (filename, filename))  # shuffle before.
    @@ -363,19 +367,20 @@ PaddlePaddle的内存占用主要分为如下几个方面:

    PaddlePaddle支持Sparse的训练,sparse训练需要训练特征是 sparse_binary_vectorsparse_vector 、或者 integer_value 的任一一种。同时,与这个训练数据交互的Layer,需要将其Parameter设置成 sparse 更新模式,即设置 sparse_update=True

    这里使用简单的 word2vec 训练语言模型距离,具体使用方法为:

    使用一个词前两个词和后两个词,来预测这个中间的词。这个任务的DataProvider为:

    -
    #  Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
    +
    #   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
     #
    -#Licensed under the Apache License, Version 2.0 (the "License");
    -#you may not use this file except in compliance with the License.
    -#You may obtain a copy of the License at
    +# Licensed under the Apache License, Version 2.0 (the "License");
    +# you may not use this file except in compliance with the License.
    +# You may obtain a copy of the License at
     #
    -#    http://www.apache.org/licenses/LICENSE-2.0
    +#     http://www.apache.org/licenses/LICENSE-2.0
     #
    -#Unless required by applicable law or agreed to in writing, software
    -#distributed under the License is distributed on an "AS IS" BASIS,
    -#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    -#See the License for the specific language governing permissions and
    -#limitations under the License.
    +# Unless required by applicable law or agreed to in writing, software
    +# distributed under the License is distributed on an "AS IS" BASIS,
    +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    +# See the License for the specific language governing permissions and
    +# limitations under the License.
    +
     DICT_DIM = 3000
     
     
    @@ -389,19 +394,20 @@ PaddlePaddle的内存占用主要分为如下几个方面:

    这个任务的配置为:

    -
    #  Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
    +
    #   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
     #
    -#Licensed under the Apache License, Version 2.0 (the "License");
    -#you may not use this file except in compliance with the License.
    -#You may obtain a copy of the License at
    +# Licensed under the Apache License, Version 2.0 (the "License");
    +# you may not use this file except in compliance with the License.
    +# You may obtain a copy of the License at
     #
    -#    http://www.apache.org/licenses/LICENSE-2.0
    +#     http://www.apache.org/licenses/LICENSE-2.0
     #
    -#Unless required by applicable law or agreed to in writing, software
    -#distributed under the License is distributed on an "AS IS" BASIS,
    -#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    -#See the License for the specific language governing permissions and
    -#limitations under the License.
    +# Unless required by applicable law or agreed to in writing, software
    +# distributed under the License is distributed on an "AS IS" BASIS,
    +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    +# See the License for the specific language governing permissions and
    +# limitations under the License.
    +
     ...  # the settings and define data provider is omitted.
     DICT_DIM = 3000  # dictionary dimension.
     word_ids = data_layer('word_ids', size=DICT_DIM)
    diff --git a/develop/doc_cn/getstarted/concepts/use_concepts_cn.html b/develop/doc_cn/getstarted/concepts/use_concepts_cn.html
    index db8882924dd6319f1c683ef62da7a30a55103520..9329aa999d94c02ce51898ec68b2d3dcfdb88477 100644
    --- a/develop/doc_cn/getstarted/concepts/use_concepts_cn.html
    +++ b/develop/doc_cn/getstarted/concepts/use_concepts_cn.html
    @@ -442,19 +442,21 @@ trainer.train(
     67
     68
     69
    -70
    #  Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
    +70
    +71
    #   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
     #
    -#Licensed under the Apache License, Version 2.0 (the "License");
    -#you may not use this file except in compliance with the License.
    -#You may obtain a copy of the License at
    +# Licensed under the Apache License, Version 2.0 (the "License");
    +# you may not use this file except in compliance with the License.
    +# You may obtain a copy of the License at
     #
    -#    http://www.apache.org/licenses/LICENSE-2.0
    +#     http://www.apache.org/licenses/LICENSE-2.0
     #
    -#Unless required by applicable law or agreed to in writing, software
    -#distributed under the License is distributed on an "AS IS" BASIS,
    -#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    -#See the License for the specific language governing permissions and
    -#limitations under the License.
    +# Unless required by applicable law or agreed to in writing, software
    +# distributed under the License is distributed on an "AS IS" BASIS,
    +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    +# See the License for the specific language governing permissions and
    +# limitations under the License.
    +
     import paddle.v2 as paddle
     import numpy as np
     
    @@ -545,19 +547,21 @@ trainer.train(
     28
     29
     30
    -31
    #  Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
    +31
    +32
    #   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
     #
    -#Licensed under the Apache License, Version 2.0 (the "License");
    -#you may not use this file except in compliance with the License.
    -#You may obtain a copy of the License at
    +# Licensed under the Apache License, Version 2.0 (the "License");
    +# you may not use this file except in compliance with the License.
    +# You may obtain a copy of the License at
     #
    -#    http://www.apache.org/licenses/LICENSE-2.0
    +#     http://www.apache.org/licenses/LICENSE-2.0
     #
    -#Unless required by applicable law or agreed to in writing, software
    -#distributed under the License is distributed on an "AS IS" BASIS,
    -#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    -#See the License for the specific language governing permissions and
    -#limitations under the License.
    +# Unless required by applicable law or agreed to in writing, software
    +# distributed under the License is distributed on an "AS IS" BASIS,
    +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    +# See the License for the specific language governing permissions and
    +# limitations under the License.
    +
     import paddle.v2 as paddle
     import numpy as np