diff --git a/demo/image_classification/api_v2_train.py b/demo/image_classification/api_v2_train.py index e0fc0e04bbd21f691caa1ce3fb95c8a7065d1b3f..7134fa61e861b9f41f49f44f55f3b19de96f5850 100644 --- a/demo/image_classification/api_v2_train.py +++ b/demo/image_classification/api_v2_train.py @@ -13,8 +13,9 @@ # limitations under the License import sys + import paddle.v2 as paddle -from api_v2_vgg import vgg_bn_drop + from api_v2_resnet import resnet_cifar10 @@ -23,7 +24,7 @@ def main(): classdim = 10 # PaddlePaddle init - paddle.init(use_gpu=True, trainer_count=1) + paddle.init(use_gpu=False, trainer_count=1) image = paddle.layer.data( name="image", type=paddle.data_type.dense_vector(datadim)) @@ -68,8 +69,8 @@ def main(): result = trainer.test( reader=paddle.batch( paddle.dataset.cifar.test10(), batch_size=128), - reader_dict={'image': 0, - 'label': 1}) + feeding={'image': 0, + 'label': 1}) print "\nTest with Pass %d, %s" % (event.pass_id, result.metrics) # Create trainer @@ -83,8 +84,8 @@ def main(): batch_size=128), num_passes=5, event_handler=event_handler, - reader_dict={'image': 0, - 'label': 1}) + feeding={'image': 0, + 'label': 1}) if __name__ == '__main__': diff --git a/demo/introduction/api_train_v2.py b/demo/introduction/api_train_v2.py index 75dd65f9fc8cd8e7fab5bf30a6337574a645e89f..84125c3b4b621a128fd488ff7fa374a75f620bf1 100644 --- a/demo/introduction/api_train_v2.py +++ b/demo/introduction/api_train_v2.py @@ -30,26 +30,26 @@ def main(): def event_handler(event): if isinstance(event, paddle.event.EndIteration): if event.batch_id % 100 == 0: - print "Pass %d, Batch %d, Cost %f, %s" % ( - event.pass_id, event.batch_id, event.cost, event.metrics) + print "Pass %d, Batch %d, Cost %f" % ( + event.pass_id, event.batch_id, event.cost) if isinstance(event, paddle.event.EndPass): - result = trainer.test( - reader=paddle.reader.batched( - uci_housing.test(), batch_size=2), - reader_dict={'x': 0, + if (event.pass_id + 1) % 10 == 0: + result = trainer.test( + 
reader=paddle.batch( + uci_housing.test(), batch_size=2), + feeding={'x': 0, 'y': 1}) - if event.pass_id % 10 == 0: - print "Test %d, %s" % (event.pass_id, result.metrics) + print "Test %d, %.2f" % (event.pass_id, result.cost) # training trainer.train( - reader=paddle.reader.batched( + reader=paddle.batch( paddle.reader.shuffle( uci_housing.train(), buf_size=500), batch_size=2), - reader_dict={'x': 0, - 'y': 1}, + feeding={'x': 0, + 'y': 1}, event_handler=event_handler, num_passes=30) diff --git a/demo/mnist/.gitignore b/demo/mnist/.gitignore index 8bd9837523ccf98e6e72d5b82934b7b104816217..7e61d5e3a0cabd46d4185454d46610ac2ee2e63f 100644 --- a/demo/mnist/.gitignore +++ b/demo/mnist/.gitignore @@ -5,3 +5,6 @@ plot.png train.log *pyc .ipynb_checkpoints +params.pkl +params.tar +params.tar.gz diff --git a/demo/mnist/api_train_v2.py b/demo/mnist/api_train_v2.py index 4fb1808ca11a6e6937c77737dcf21475c36b4650..68761be80f24f074c041109d6769e84fa7204367 100644 --- a/demo/mnist/api_train_v2.py +++ b/demo/mnist/api_train_v2.py @@ -1,4 +1,5 @@ import paddle.v2 as paddle +import gzip def softmax_regression(img): @@ -71,7 +72,11 @@ def main(): cost = paddle.layer.classification_cost(input=predict, label=label) - parameters = paddle.parameters.create(cost) + try: + with gzip.open('params.tar.gz', 'r') as f: + parameters = paddle.parameters.Parameters.from_tar(f) + except IOError: + parameters = paddle.parameters.create(cost) optimizer = paddle.optimizer.Momentum( learning_rate=0.1 / 128.0, @@ -86,11 +91,19 @@ def main(): def event_handler(event): if isinstance(event, paddle.event.EndIteration): - if event.batch_id % 100 == 0: - print "Pass %d, Batch %d, Cost %f, %s" % ( - event.pass_id, event.batch_id, event.cost, event.metrics) - if isinstance(event, paddle.event.EndPass): - result = trainer.test(reader=paddle.reader.batched( + if event.batch_id % 1000 == 0: + result = trainer.test(reader=paddle.batch( + paddle.dataset.mnist.test(), batch_size=256)) + + print "Pass %d, Batch %d, 
Cost %f, %s, Testing metrics %s" % ( + event.pass_id, event.batch_id, event.cost, event.metrics, + result.metrics) + + with gzip.open('params.tar.gz', 'w') as f: + parameters.to_tar(f) + + elif isinstance(event, paddle.event.EndPass): + result = trainer.test(reader=paddle.batch( paddle.dataset.mnist.test(), batch_size=128)) print "Test with Pass %d, Cost %f, %s\n" % ( event.pass_id, result.cost, result.metrics) diff --git a/demo/semantic_role_labeling/api_train_v2.py b/demo/semantic_role_labeling/api_train_v2.py index 15db922b97abc5ae79f095edfd632604eec8ab94..036cad4b0a32357bb42580ef577a1eba558be8fe 100644 --- a/demo/semantic_role_labeling/api_train_v2.py +++ b/demo/semantic_role_labeling/api_train_v2.py @@ -163,11 +163,11 @@ def main(): update_equation=optimizer) parameters.set('emb', load_parameter(conll05.get_embedding(), 44068, 32)) - trn_reader = paddle.reader.batched( + trn_reader = paddle.batch( paddle.reader.shuffle( conll05.test(), buf_size=8192), batch_size=10) - reader_dict = { + feeding = { 'word_data': 0, 'ctx_n2_data': 1, 'ctx_n1_data': 2, @@ -183,7 +183,7 @@ def main(): reader=trn_reader, event_handler=event_handler, num_passes=10000, - reader_dict=reader_dict) + feeding=feeding) if __name__ == '__main__': diff --git a/demo/sentiment/train_v2.py b/demo/sentiment/train_v2.py index 3a266e74ea93068cad2757d0076a4ae664ad4cf8..fd7243cbe69977dcabc9ecf1d060e62f313b8cfd 100644 --- a/demo/sentiment/train_v2.py +++ b/demo/sentiment/train_v2.py @@ -18,11 +18,7 @@ from paddle.trainer_config_helpers.poolings import MaxPooling import paddle.v2 as paddle -def convolution_net(input_dim, - class_dim=2, - emb_dim=128, - hid_dim=128, - is_predict=False): +def convolution_net(input_dim, class_dim=2, emb_dim=128, hid_dim=128): data = paddle.layer.data("word", paddle.data_type.integer_value_sequence(input_dim)) emb = paddle.layer.embedding(input=data, size=emb_dim) @@ -42,8 +38,7 @@ def stacked_lstm_net(input_dim, class_dim=2, emb_dim=128, hid_dim=512, - stacked_num=3, - 
is_predict=False): + stacked_num=3): """ A Wrapper for sentiment classification task. This network uses bi-directional recurrent network, @@ -110,7 +105,7 @@ def stacked_lstm_net(input_dim, if __name__ == '__main__': # init - paddle.init(use_gpu=True, trainer_count=4) + paddle.init(use_gpu=False, trainer_count=4) # network config print 'load dictionary...' @@ -143,11 +138,11 @@ if __name__ == '__main__': sys.stdout.flush() if isinstance(event, paddle.event.EndPass): result = trainer.test( - reader=paddle.reader.batched( + reader=paddle.batch( lambda: paddle.dataset.imdb.test(word_dict), batch_size=128), - reader_dict={'word': 0, - 'label': 1}) + feeding={'word': 0, + 'label': 1}) print "\nTest with Pass %d, %s" % (event.pass_id, result.metrics) # create trainer @@ -156,11 +151,11 @@ if __name__ == '__main__': update_equation=adam_optimizer) trainer.train( - reader=paddle.reader.batched( + reader=paddle.batch( paddle.reader.shuffle( lambda: paddle.dataset.imdb.train(word_dict), buf_size=1000), batch_size=100), event_handler=event_handler, - reader_dict={'word': 0, - 'label': 1}, + feeding={'word': 0, + 'label': 1}, num_passes=10) diff --git a/demo/seqToseq/api_train_v2.py b/demo/seqToseq/api_train_v2.py index 177fd26d681d19b70ea71c12faac471520c6fff1..6efd254e7a48703a69c9f09dd35d41ba7ac5689a 100644 --- a/demo/seqToseq/api_train_v2.py +++ b/demo/seqToseq/api_train_v2.py @@ -110,11 +110,12 @@ def main(): update_equation=optimizer) # define data reader - reader_dict = { + feeding = { 'source_language_word': 0, 'target_language_word': 1, 'target_language_next_word': 2 } + wmt14_reader = paddle.batch( paddle.reader.shuffle( paddle.dataset.wmt14.train(dict_size=dict_size), buf_size=8192), @@ -132,7 +133,7 @@ def main(): reader=wmt14_reader, event_handler=event_handler, num_passes=10000, - reader_dict=reader_dict) + feeding=feeding) if __name__ == '__main__': diff --git a/doc/api/index_cn.rst b/doc/api/index_cn.rst index 
874dd9cb2278ce36a029b8745f2d82a7e642128e..fca981221e490686e468ae8d385d844d49767883 100644 --- a/doc/api/index_cn.rst +++ b/doc/api/index_cn.rst @@ -1,2 +1,26 @@ API -=== \ No newline at end of file +=== + +模型配置 API +------------ + +.. toctree:: + :maxdepth: 1 + + v2/model_configs.rst + +数据 API +-------- + +.. toctree:: + :maxdepth: 1 + + v2/data.rst + +训练 API +-------- + +.. toctree:: + :maxdepth: 1 + + v2/run_logic.rst \ No newline at end of file diff --git a/doc/api/index_en.rst b/doc/api/index_en.rst index b7f470e1f8a9a1c720e7d70832ec069339ddc60f..f0ad0fb2aee7345db1dd5f175a342598366f5e3c 100644 --- a/doc/api/index_en.rst +++ b/doc/api/index_en.rst @@ -7,4 +7,20 @@ Model Config API .. toctree:: :maxdepth: 1 - v2/model_configs.rst \ No newline at end of file + v2/model_configs.rst + +Data API +-------- + +.. toctree:: + :maxdepth: 1 + + v2/data.rst + +Train API +--------- + +.. toctree:: + :maxdepth: 1 + + v2/run_logic.rst \ No newline at end of file diff --git a/doc/api/v2/data.rst b/doc/api/v2/data.rst new file mode 100644 index 0000000000000000000000000000000000000000..1c0a202a8c04322de3e9533c11fb5c74abac6c62 --- /dev/null +++ b/doc/api/v2/data.rst @@ -0,0 +1,93 @@ +================ +Data Related API +================ + + +######### +DataTypes +######### + +.. automodule:: paddle.v2.data_type + :members: + +########## +DataFeeder +########## + +.. automodule:: paddle.v2.data_feeder + :members: + +###### +Reader +###### + +.. automodule:: paddle.v2.reader + :members: + +.. automodule:: paddle.v2.reader.creator + :members: + +######### +minibatch +######### + +.. automodule:: paddle.v2.minibatch + :members: + +####### +Dataset +####### + +.. automodule:: paddle.v2.dataset + :members: + + +mnist ++++++ + +.. automodule:: paddle.v2.dataset.mnist + :members: + + +cifar ++++++ + +.. automodule:: paddle.v2.dataset.cifar + :members: + +conll05 ++++++++ + +.. automodule:: paddle.v2.dataset.conll05 + :members: + +imdb +++++ + +.. 
automodule:: paddle.v2.dataset.imdb + :members: + +imikolov +++++++++ + +.. automodule:: paddle.v2.dataset.imikolov + :members: + +movielens ++++++++++ + +.. automodule:: paddle.v2.dataset.movielens + :members: + +sentiment ++++++++++ + +.. automodule:: paddle.v2.dataset.sentiment + :members: + +uci_housing ++++++++++++ + +.. automodule:: paddle.v2.dataset.uci_housing + :members: + diff --git a/doc/api/v2/model_configs.rst b/doc/api/v2/model_configs.rst index b848bd7045a701a1a0d6e6b53da971ada2c569f5..e9cd3d5bf7b0e9e59c231bcabdb163a740909de1 100644 --- a/doc/api/v2/model_configs.rst +++ b/doc/api/v2/model_configs.rst @@ -1,3 +1,7 @@ +######################### +Configuration Related API +######################### + ====== Layers ====== @@ -33,3 +37,10 @@ Networks .. automodule:: paddle.v2.networks :members: + +========== +Optimizers +========== + +.. automodule:: paddle.v2.optimizer + :members: diff --git a/doc/api/v2/run_logic.rst b/doc/api/v2/run_logic.rst new file mode 100644 index 0000000000000000000000000000000000000000..904d45966dfc16a474016ff48fd5a951988b0ab0 --- /dev/null +++ b/doc/api/v2/run_logic.rst @@ -0,0 +1,26 @@ +########### +Trainer API +########### + +========== +Parameters +========== + +.. automodule:: paddle.v2.parameters + :members: + + +======= +Trainer +======= + +.. automodule:: paddle.v2.trainer + :members: + + +===== +Event +===== + +.. 
automodule:: paddle.v2.event + :members: diff --git a/doc/design/reader/README.md b/doc/design/reader/README.md index 03119fdd74502a4534c2e6a576580ce96a721c7e..f21f7af520df5171798326818ecb97c3bcd14a12 100644 --- a/doc/design/reader/README.md +++ b/doc/design/reader/README.md @@ -23,19 +23,19 @@ An example implementation for single item data reader creator: ```python def reader_creator_random_image(width, height): - def reader(): - while True: - yield numpy.random.uniform(-1, 1, size=width*height) - return reader + def reader(): + while True: + yield numpy.random.uniform(-1, 1, size=width*height) + return reader ``` An example implementation for multiple item data reader creator: ```python -def reader_creator_random_imageand_label(widht, height, label): - def reader(): - while True: - yield numpy.random.uniform(-1, 1, size=width*height), label - return reader +def reader_creator_random_image_and_label(width, height, label): + def reader(): + while True: + yield numpy.random.uniform(-1, 1, size=width*height), label + return reader ``` ## Batch Reader Interface @@ -74,11 +74,11 @@ mnist_train_batch_reader = paddle.batch(mnist_train, 128) Also easy to create custom batch reader: ```python def custom_batch_reader(): - while True: - batch = [] - for i in xrange(128): - batch.append((numpy.random.uniform(-1, 1, 28*28),)) # note that it's a tuple being appended. - yield batch + while True: + batch = [] + for i in xrange(128): + batch.append((numpy.random.uniform(-1, 1, 28*28),)) # note that it's a tuple being appended. 
+ yield batch mnist_random_image_batch_reader = custom_batch_reader ``` @@ -123,16 +123,16 @@ We can do: ```python def reader_creator_random_image(width, height): - def reader(): - while True: - yield numpy.random.uniform(-1, 1, size=width*height) - return reader + def reader(): + while True: + yield numpy.random.uniform(-1, 1, size=width*height) + return reader def reader_creator_bool(t): - def reader: - while True: - yield t - return reader + def reader(): + while True: + yield t + return reader true_reader = reader_creator_bool(True) false_reader = reader_creator_bool(False) @@ -172,18 +172,18 @@ We decided to use dictionary (`{"image":0, "label":1}`) instead of list (`["imag ```python def image_reader_creator(image_path, label_path, n): - def reader(): - f = open(image_path) - l = open(label_path) - images = numpy.fromfile( - f, 'ubyte', count=n * 28 * 28).reshape((n, 28 * 28)).astype('float32') - images = images / 255.0 * 2.0 - 1.0 - labels = numpy.fromfile(l, 'ubyte', count=n).astype("int") - for i in xrange(n): - yield images[i, :], labels[i] # a single entry of data is created each time - f.close() - l.close() - return reader + def reader(): + f = open(image_path) + l = open(label_path) + images = numpy.fromfile( + f, 'ubyte', count=n * 28 * 28).reshape((n, 28 * 28)).astype('float32') + images = images / 255.0 * 2.0 - 1.0 + labels = numpy.fromfile(l, 'ubyte', count=n).astype("int") + for i in xrange(n): + yield images[i, :], labels[i] # a single entry of data is created each time + f.close() + l.close() + return reader # images_reader_creator creates a reader reader = image_reader_creator("/path/to/image_file", "/path/to/label_file", 1024) @@ -196,7 +196,7 @@ An example implementation of paddle.train could be: ```python def train(batch_reader, mapping, batch_size, total_pass): - for pass_idx in range(total_pass): - for mini_batch in batch_reader(): # this loop will never end in online learning. 
- do_forward_backward(mini_batch, mapping) + for pass_idx in range(total_pass): + for mini_batch in batch_reader(): # this loop will never end in online learning. + do_forward_backward(mini_batch, mapping) ``` diff --git a/doc/howto/usage/k8s/k8s_distributed_cn.md b/doc/howto/usage/k8s/k8s_distributed_cn.md index 7213a977b8a2a4241f4eae22b5bdd65f03c574ac..2a7a6c8c17882a6f2c95e933e051c4b8f1a8eeee 100644 --- a/doc/howto/usage/k8s/k8s_distributed_cn.md +++ b/doc/howto/usage/k8s/k8s_distributed_cn.md @@ -43,22 +43,55 @@ docker push [YOUR_REPO]/paddle:mypaddle 注意上述命令中`[YOUR_REPO]`表示读者所使用的Docker镜像仓库地址,读者需要替换成自己使用的仓库地址。下文使用`[YOUR_REPO]/paddle:mypaddle`这个地址来表示此步骤所构建出的镜像。 -### 上传训练文件 +### 准备训练数据 -本文使用PaddlePaddle官方的[recommendation demo](http://www.paddlepaddle.org/doc/demo/index.html#recommendation)作为这次训练的内容,我们将训练文件与数据放在一个job name命名的目录中,上传到volume所在的共享存储(使用不同分布式存储会有不同的挂载方式,需要要先挂载这个目录,然后拷贝数据)。完成后volume中的文件内容大致如下: +这里我们通过在Kubernetes集群上启动一个Job来下载并切割数据,也可以通过修改[k8s_train](./src/k8s_train/README.md)的内容来定制image. -```bash -[root@paddle-kubernetes-node0 mfs]# tree -d +在启动Job之前,需要根据不同的分布式存储来绑定一个[persistentVolumeClaim](https://kubernetes.io/docs/user-guide/persistent-volumes/),生成的数据将会存储在这个volume下. + +```yaml +apiVersion: batch/v1 +kind: Job +metadata: + name: paddle-data +spec: + template: + metadata: + name: pi + spec: + hostNetwork: true + containers: + - name: paddle-data + image: paddledev/paddle-tutorial:k8s_data + imagePullPolicy: Always + volumeMounts: + - mountPath: "/mnt" + name: nfs + env: + - name: OUT_DIR + value: /home/work/mfs/paddle-cluster-job + - name: SPLIT_COUNT + value: "3" + volumes: + - name: nfs + persistentVolumeClaim: + claimName: mfs + restartPolicy: Never +``` + +完成后volume中的文件内容大致如下: +```bash +[root@paddle-kubernetes-node0 nfsdir]$ tree -d . 
-└── paddle-cluster-job - ├── data - │   ├── 0 - │   │ - │   ├── 1 - │   │ - │   └── 2 - ├── output - └── recommendation +`-- paddle-cluster-job + |-- 0 + | `-- data + |-- 1 + | `-- data + |-- 2 + | `-- data + |-- output + |-- quick_start ``` 目录中paddle-cluster-job是本次训练对应的job name,本次训练要求有3个PaddlePaddle节点,在paddle-cluster-job/data目录中存放切分好的数据,文件夹0,1,2分别代表3个节点的trainer_id。recommendation文件夹内存放训练文件,output文件夹存放训练结果与日志。 @@ -118,15 +151,16 @@ spec: `env`字段表示容器的环境变量,我们将`paddle`运行的一些参数通过这种方式传递到容器内。 -`JOB_PATH`表示共享存储挂载的路径,`JOB_NAME`表示job名字,`TRAIN_CONFIG_DIR`表示本次训练文件所在目录,这三个变量组合就可以找到本次训练需要的文件路径。 - -`CONF_PADDLE_NIC`表示`paddle pserver`进程需要的`--nics`参数,即网卡名 - -`CONF_PADDLE_PORT`表示`paddle pserver`的`--port`参数,`CONF_PADDLE_PORTS_NUM`则表示稠密更新的端口数量,也就是`--ports_num`参数。 - -`CONF_PADDLE_PORTS_NUM_SPARSE`表示稀疏更新的端口数量,也就是`--ports_num_for_sparse`参数。 - -`CONF_PADDLE_GRADIENT_NUM`表示训练节点数量,即`--num_gradient_servers`参数 +环境变量 | 说明 +--- | --- +JOB_PATH | 共享存储挂载的路径 +JOB_NAME | Job的名字 +TRAIN_CONFIG_DIR | 本次训练文件所在目录,与JOB_PATH,JOB_NAME组合可以找到本次训练需要的文件路径 +CONF_PADDLE_NIC | `paddle pserver`进程需要的`--nics`参数,即网卡名 +CONF_PADDLE_PORT | `paddle pserver`的`--port`参数 +CONF_PADDLE_PORTS_NUM | 稠密更新的端口数量,即`--ports_num`参数 +CONF_PADDLE_PORTS_NUM_SPARSE | 稀疏更新的端口数量,即`--ports_num_for_sparse`参数 +CONF_PADDLE_GRADIENT_NUM | 训练节点数量,即`--num_gradient_servers`参数 这些参数的具体描述,读者可以查看[这里](http://www.paddlepaddle.org/doc/ui/cmd_argument/detail_introduction.html#parameter-server-and-distributed-communication)。 diff --git a/python/paddle/trainer/PyDataProvider2.py b/python/paddle/trainer/PyDataProvider2.py index 4e3c4db853205bb12272e86295784a6069483ffe..0e752c117c1ecfab72e2da2f830380e9524236e7 100644 --- a/python/paddle/trainer/PyDataProvider2.py +++ b/python/paddle/trainer/PyDataProvider2.py @@ -45,6 +45,23 @@ class CacheType(object): class InputType(object): + """ + InputType is the base class for paddle input types. + + .. note:: + + this is a base class, and should never be used by user. + + :param dim: dimension of input. 
If the input is an integer, it means the + value range. Otherwise, it means the size of layer. + :type dim: int + :param seq_type: sequence type of input. 0 means it is not a sequence. 1 + means it is a variable length sequence. 2 means it is a + nested sequence. + :type seq_type: int + :param type: data type of input. + :type type: int + """ __slots__ = ['dim', 'seq_type', 'type'] def __init__(self, dim, seq_type, tp): @@ -54,20 +71,61 @@ class InputType(object): def dense_slot(dim, seq_type=SequenceType.NO_SEQUENCE): + """ + Dense Vector. It means the input feature is dense float vector. For example, + if the input is an image with 28*28 pixels, the input of Paddle neural + network should be a dense vector with dimension 784. + + :param dim: dimension of this vector. + :type dim: int + :param seq_type: sequence type of input. + :type seq_type: int + :return: An input type object. + :rtype: InputType + """ return InputType(dim, seq_type, DataType.Dense) def sparse_non_value_slot(dim, seq_type=SequenceType.NO_SEQUENCE): + """ + Sparse binary vector. It means the input feature is a sparse vector and the + every element in this vector is either zero or one. + + :param dim: dimension of this vector. + :type dim: int + :param seq_type: sequence type of this input. + :type seq_type: int + :return: An input type object. + :rtype: InputType + """ return InputType(dim, seq_type, DataType.SparseNonValue) def sparse_value_slot(dim, seq_type=SequenceType.NO_SEQUENCE): + """ + Sparse vector. It means the input feature is a sparse vector. Most of the + elements in this vector are zero, others could be any float value. + + :param dim: dimension of this vector. + :type dim: int + :param seq_type: sequence type of this input. + :type seq_type: int + :return: An input type object. + :rtype: InputType + """ return InputType(dim, seq_type, DataType.SparseValue) def index_slot(value_range, seq_type=SequenceType.NO_SEQUENCE): - """Data type of integer. + """ + Data type of integer. 
+ + :param seq_type: sequence type of this input. + :type seq_type: int :param value_range: range of this integer. + :type value_range: int + :return: An input type object + :rtype: InputType """ return InputType(value_range, seq_type, DataType.Index) @@ -76,10 +134,17 @@ dense_vector = dense_slot sparse_binary_vector = sparse_non_value_slot sparse_vector = sparse_value_slot integer_value = index_slot -integer_value.__doc__ = index_slot.__doc__ def dense_vector_sequence(dim): + """ + Data type of a sequence of dense vector. + + :param dim: dimension of dense vector. + :type dim: int + :return: An input type object + :rtype: InputType + """ return dense_vector(dim, seq_type=SequenceType.SEQUENCE) @@ -88,6 +153,15 @@ def dense_vector_sub_sequence(dim): def sparse_binary_vector_sequence(dim): + """ + Data type of a sequence of sparse vector, which every element is either zero + or one. + + :param dim: dimension of sparse vector. + :type dim: int + :return: An input type object + :rtype: InputType + """ return sparse_binary_vector(dim, seq_type=SequenceType.SEQUENCE) @@ -96,6 +170,15 @@ def sparse_binary_vector_sub_sequence(dim): def sparse_vector_sequence(dim): + """ + Data type of a sequence of sparse vector, which most elements are zero, + others could be any float value. + + :param dim: dimension of sparse vector. + :type dim: int + :return: An input type object + :rtype: InputType + """ return sparse_vector(dim, seq_type=SequenceType.SEQUENCE) @@ -104,8 +187,11 @@ def sparse_vector_sub_sequence(dim): def integer_value_sequence(value_range): - """Data type of a sequence of integer. + """ + Data type of a sequence of integer. + :param value_range: range of each element. 
+ :type value_range: int """ return integer_value(value_range, seq_type=SequenceType.SEQUENCE) @@ -115,7 +201,6 @@ def integer_value_sub_sequence(dim): integer_sequence = integer_value_sequence -integer_sequence.__doc__ = integer_value_sequence.__doc__ class SingleSlotWrapper(object): diff --git a/python/paddle/v2/data_feeder.py b/python/paddle/v2/data_feeder.py index 3b106e100cff7539611d95bb4123b4e0dfbfa6cb..ba77fecf21eecf9115cc1b20720383b790294eb0 100644 --- a/python/paddle/v2/data_feeder.py +++ b/python/paddle/v2/data_feeder.py @@ -12,13 +12,20 @@ # See the License for the specific language governing permissions and # limitations under the License. -from py_paddle import swig_paddle from py_paddle import DataProviderConverter -import data_type + +import paddle.trainer.PyDataProvider2 as pydp2 __all__ = ['DataFeeder'] +def default_feeding_map(data_types): + reader_dict = dict() + for i, tp in enumerate(data_types): + reader_dict[tp[0]] = i + return reader_dict + + class DataFeeder(DataProviderConverter): """ DataFeeder converts the data returned by paddle.reader into a data structure @@ -29,7 +36,10 @@ class DataFeeder(DataProviderConverter): to feed it to C++ interface. The example usage: - + + + .. code-block:: python + data_types = [('image', paddle.data_type.dense_vector(784)), ('label', paddle.data_type.integer_value(10))] reader_dict = {'image':0, 'label':1} @@ -43,26 +53,35 @@ class DataFeeder(DataProviderConverter): # [ [1.0,2.0,3.0,4.0], 5, [6,7,8] ] # second sample # ] arg = feeder(minibatch_data) + + .. note:: + + This module is for internal use only. Users should use the `reader` + interface. + + + + :param data_types: A list to specify data name and type. Each item is + a tuple of (data_name, data_type). + + :type data_types: list + :param feeding: A dictionary to specify the position of each data + in the input data. 
+ :type feeding: dict """ - def __init__(self, data_types, reader_dict): - """ - :param data_types: A list to specify data name and type. Each item is - a tuple of (data_name, data_type). For example: - [('image', paddle.data_type.dense_vector(784)), - ('label', paddle.data_type.integer_value(10))] - - :type data_types: A list of tuple - :param reader_dict: A dictionary to specify the position of each data - in the input data. - :type reader_dict: dict() - """ + def __init__(self, data_types, feeding=None): self.input_names = [] input_types = [] - self.reader_dict = reader_dict + if feeding is None: + feeding = default_feeding_map(data_types) + + self.feeding = feeding for each in data_types: self.input_names.append(each[0]) - assert isinstance(each[1], data_type.InputType) + if not isinstance(each[1], pydp2.InputType): + raise TypeError("second item in each data_type should be an " + "InputType") input_types.append(each[1]) DataProviderConverter.__init__(self, input_types) @@ -70,22 +89,12 @@ class DataFeeder(DataProviderConverter): """ :param dat: A list of mini-batch data. Each sample is a list or tuple one feature or multiple features. - for example: - [ - ([0.2, 0.2], ), # first sample - ([0.8, 0.3], ), # second sample - ] - or, - [ - [[0.2, 0.2], ], # first sample - [[0.8, 0.3], ], # second sample - ] - - :type dat: List + + :type dat: list :param argument: An Arguments object contains this mini-batch data with one or multiple features. The Arguments definition is in the API. 
- :type argument: swig_paddle.Arguments + :type argument: py_paddle.swig_paddle.Arguments """ def reorder_data(data): @@ -93,7 +102,7 @@ class DataFeeder(DataProviderConverter): for each in data: reorder = [] for name in self.input_names: - reorder.append(each[self.reader_dict[name]]) + reorder.append(each[self.feeding[name]]) retv.append(reorder) return retv diff --git a/python/paddle/v2/data_type.py b/python/paddle/v2/data_type.py index 522ddfdaacce44be7cf27bdbfc1009d4a0c0bbe6..d582f76ddf01ed3430a1d075624bbb8e0bf3f2a9 100644 --- a/python/paddle/v2/data_type.py +++ b/python/paddle/v2/data_type.py @@ -12,11 +12,15 @@ # See the License for the specific language governing permissions and # limitations under the License. -from paddle.trainer.PyDataProvider2 import \ - InputType, DataType, dense_vector, sparse_binary_vector,\ - sparse_vector, integer_value, integer_value_sequence +import paddle.trainer.PyDataProvider2 as pydp2 -__all__ = [ - 'InputType', 'DataType', 'dense_vector', 'sparse_binary_vector', - 'sparse_vector', 'integer_value', 'integer_value_sequence' +import_list = [ + nm for nm in dir(pydp2) + if '_' in nm and nm[0] != '_' and ('value' in nm or 'vector' in nm) ] +import_list.extend(['InputType']) + +for nm in import_list: + globals()[nm] = getattr(pydp2, nm) + +__all__ = import_list diff --git a/python/paddle/v2/dataset/__init__.py b/python/paddle/v2/dataset/__init__.py index 6c371d3c9bdee94a91b9a48ff7c4a006c8d7eb21..80ff6295c34e853d8f69b9e78719af23a56d1fbb 100644 --- a/python/paddle/v2/dataset/__init__.py +++ b/python/paddle/v2/dataset/__init__.py @@ -11,6 +11,9 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. +""" +Dataset package. 
+""" import mnist import imikolov diff --git a/python/paddle/v2/dataset/cifar.py b/python/paddle/v2/dataset/cifar.py index 5c6f5d85567fa19f2835ee4f3951531b6dfd3209..d9f7a830ee60a331b55a1e218923e690103e1c5b 100644 --- a/python/paddle/v2/dataset/cifar.py +++ b/python/paddle/v2/dataset/cifar.py @@ -13,6 +13,8 @@ # limitations under the License. """ CIFAR dataset: https://www.cs.toronto.edu/~kriz/cifar.html + +TODO(yuyang18): Complete the comments. """ import cPickle diff --git a/python/paddle/v2/dataset/conll05.py b/python/paddle/v2/dataset/conll05.py index e96a701c1a944e2d6d84f897157cb357c5aa0824..9eab49ee39325c1c60fc511e0bd834e83aa987f0 100644 --- a/python/paddle/v2/dataset/conll05.py +++ b/python/paddle/v2/dataset/conll05.py @@ -16,15 +16,17 @@ import tarfile import gzip import itertools from common import download - -__all__ = ['test, get_dict', 'get_embedding'] """ Conll 2005 dataset. Paddle semantic role labeling Book and demo use this dataset as an example. Because Conll 2005 is not free in public, the default downloaded URL is test set of Conll 2005 (which is public). Users can change URL and MD5 to their Conll dataset. + +TODO(yuyang18): Complete comments. """ +__all__ = ['test, get_dict', 'get_embedding'] + DATA_URL = 'http://www.cs.upc.edu/~srlconll/conll05st-tests.tar.gz' DATA_MD5 = '387719152ae52d60422c016e92a742fc' WORDDICT_URL = 'http://paddlepaddle.bj.bcebos.com/demo/srl_dict_and_embedding/wordDict.txt' diff --git a/python/paddle/v2/dataset/imdb.py b/python/paddle/v2/dataset/imdb.py index f27756a38a9cd809fdaaf92e7f8a72b681915fc8..76019d9f54020ff6f02c17eb6047cbd014a8ccf2 100644 --- a/python/paddle/v2/dataset/imdb.py +++ b/python/paddle/v2/dataset/imdb.py @@ -13,6 +13,8 @@ # limitations under the License. """ IMDB dataset: http://ai.stanford.edu/%7Eamaas/data/sentiment/aclImdb_v1.tar.gz + +TODO(yuyang18): Complete comments. 
""" import paddle.v2.dataset.common diff --git a/python/paddle/v2/dataset/imikolov.py b/python/paddle/v2/dataset/imikolov.py index deb556942d9b0490ffab8cef90aae8f365652129..97c160f111d09d61eb860c7f02552e635f2400a7 100644 --- a/python/paddle/v2/dataset/imikolov.py +++ b/python/paddle/v2/dataset/imikolov.py @@ -13,6 +13,8 @@ # limitations under the License. """ imikolov's simple dataset: http://www.fit.vutbr.cz/~imikolov/rnnlm/ + +Complete comments. """ import paddle.v2.dataset.common import tarfile diff --git a/python/paddle/v2/dataset/mnist.py b/python/paddle/v2/dataset/mnist.py index 6a621a2aaad14bf9598b838ce7c2ebf297bb0d30..16f2fcb99de4cb1971a7375a97b5daa209ee95ef 100644 --- a/python/paddle/v2/dataset/mnist.py +++ b/python/paddle/v2/dataset/mnist.py @@ -13,6 +13,9 @@ # limitations under the License. """ MNIST dataset. + +This module will download dataset from http://yann.lecun.com/exdb/mnist/ and +parse train set and test set into paddle reader creators. """ import paddle.v2.dataset.common import subprocess @@ -72,6 +75,15 @@ def reader_creator(image_filename, label_filename, buffer_size): def train(): + """ + MNIST train set creator. + + It returns a reader creator, each sample in the reader is image pixels in + [0, 1] and label in [0, 9]. + + :return: Train reader creator + :rtype: callable + """ return reader_creator( paddle.v2.dataset.common.download(TRAIN_IMAGE_URL, 'mnist', TRAIN_IMAGE_MD5), @@ -80,6 +92,15 @@ def train(): def test(): + """ + MNIST test set cretor. + + It returns a reader creator, each sample in the reader is image pixels in + [0, 1] and label in [0, 9]. + + :return: Test reader creator. 
+ :rtype: callable + """ return reader_creator( paddle.v2.dataset.common.download(TEST_IMAGE_URL, 'mnist', TEST_IMAGE_MD5), diff --git a/python/paddle/v2/dataset/movielens.py b/python/paddle/v2/dataset/movielens.py index c22bcfa38b5f501732768dd4f62d8e088d57a7ff..dc65e8f8b6f04b078a3449c622478095086cecbe 100644 --- a/python/paddle/v2/dataset/movielens.py +++ b/python/paddle/v2/dataset/movielens.py @@ -11,6 +11,11 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. +""" +Movielens 1-M dataset. + +TODO(yuyang18): Complete comments. +""" import zipfile from common import download diff --git a/python/paddle/v2/dataset/sentiment.py b/python/paddle/v2/dataset/sentiment.py index cbd08fa73684be42e8d8d2eb7b684d66894d7761..71689fd61b6b14a7b5072caff4e2fd48a7f74072 100644 --- a/python/paddle/v2/dataset/sentiment.py +++ b/python/paddle/v2/dataset/sentiment.py @@ -15,18 +15,19 @@ # See the License for the specific language governing permissions and # limitations under the License. """ -The script fetch and preprocess movie_reviews data set +This script fetches and preprocesses the movie_reviews data set provided by NLTK -that provided by NLTK +TODO(yuyang18): Complete dataset. """ -import common import collections -import nltk -import numpy as np from itertools import chain + +import nltk from nltk.corpus import movie_reviews +import common + __all__ = ['train', 'test', 'get_word_dict'] NUM_TRAINING_INSTANCES = 1600 NUM_TOTAL_INSTANCES = 2000 diff --git a/python/paddle/v2/dataset/uci_housing.py b/python/paddle/v2/dataset/uci_housing.py index b5a0537af66a3fae4e1b267ae25441a6cb75416b..27f454b137e3a40febd19cf085e2f4034cc16b24 100644 --- a/python/paddle/v2/dataset/uci_housing.py +++ b/python/paddle/v2/dataset/uci_housing.py @@ -11,6 +11,11 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
# See the License for the specific language governing permissions and # limitations under the License. +""" +UCI Housing dataset. + +TODO(yuyang18): Complete comments. +""" import numpy as np import os diff --git a/python/paddle/v2/event.py b/python/paddle/v2/event.py index a429e36b63c9e812332673b66f4d8b99f3303cf8..1ad52b8baa411269d29732685871a875df5185cc 100644 --- a/python/paddle/v2/event.py +++ b/python/paddle/v2/event.py @@ -34,6 +34,10 @@ class WithMetric(object): class TestResult(WithMetric): + """ + Result that trainer.test returns. + """ + def __init__(self, evaluator, cost): super(TestResult, self).__init__(evaluator) self.cost = cost diff --git a/python/paddle/v2/inference.py b/python/paddle/v2/inference.py index 476fd3fa4523a77709f68c73c73e6851e04064aa..7d889bce7fe5ded22755a527575595f375691df4 100644 --- a/python/paddle/v2/inference.py +++ b/python/paddle/v2/inference.py @@ -21,10 +21,8 @@ class Inference(object): self.__gradient_machine__ = gm self.__data_types__ = topo.data_type() - def iter_infer(self, reader, reader_dict=None): - if reader_dict is None: - reader_dict = self.default_reader_dict() - feeder = DataFeeder(self.__data_types__, reader_dict) + def iter_infer(self, reader, feeding=None): + feeder = DataFeeder(self.__data_types__, feeding) self.__gradient_machine__.start() for data_batch in reader(): yield self.__gradient_machine__.forwardTest(feeder(data_batch)) @@ -47,13 +45,7 @@ class Inference(object): else: return retv - def default_reader_dict(self): - reader_dict = dict() - for i, tp in enumerate(self.__data_types__): - reader_dict[tp[0]] = i - return reader_dict - -def infer(output, parameters, reader, reader_dict=None, field='value'): +def infer(output, parameters, reader, feeding=None, field='value'): inferer = Inference(output=output, parameters=parameters) - return inferer.infer(field=field, reader=reader, reader_dict=reader_dict) + return inferer.infer(field=field, reader=reader, feeding=feeding) diff --git
a/python/paddle/v2/minibatch.py b/python/paddle/v2/minibatch.py index f01815a0ce068d503c23c4126e16cfcb28202bb8..317cf037c69f8639e3760fbfce20565127794fcb 100644 --- a/python/paddle/v2/minibatch.py +++ b/python/paddle/v2/minibatch.py @@ -12,24 +12,30 @@ # See the License for the specific language governing permissions and # limitations under the License. +__all__ = ['batch'] + def batch(reader, batch_size): """ - Create a batch reader. + Create a batched reader. + :param reader: the data reader to read from. - :param batch_size: batch_size - :return: the batch reader. + :type reader: callable + :param batch_size: size of each mini-batch + :type batch_size: int + :return: the batched reader. + :rtype: callable """ def batch_reader(): r = reader() - batch = [] + b = [] for instance in r: - batch.append(instance) - if len(batch) == batch_size: - yield batch - batch = [] - if batch: - yield batch + b.append(instance) + if len(b) == batch_size: + yield b + b = [] + if b: + yield b return batch_reader diff --git a/python/paddle/v2/optimizer.py b/python/paddle/v2/optimizer.py index 10e255dc945efb8b20f09dc1806d2ba7ef856c55..1a01d95c205c0626374e1814a170ce2d58f23a60 100644 --- a/python/paddle/v2/optimizer.py +++ b/python/paddle/v2/optimizer.py @@ -1,7 +1,12 @@ import py_paddle.swig_paddle as swig_api -import paddle.trainer_config_helpers.optimizers as v1_optimizers + import paddle.trainer_config_helpers.config_parser_utils as config_parser_utils -import paddle.v2 +import paddle.trainer_config_helpers.optimizers as v1_optimizers +""" +Optimizers(update equation) for SGD method. + +TODO(yuyang18): Complete comments. 
+""" __all__ = [ 'Momentum', 'Adam', 'Adamax', 'AdaGrad', 'DecayedAdaGrad', 'AdaDelta', @@ -44,7 +49,7 @@ class Optimizer(object): class Momentum(Optimizer): def __init__(self, momentum=None, sparse=False, **kwargs): learning_method = v1_optimizers.MomentumOptimizer( - momentum=None, sparse=False) + momentum=momentum, sparse=sparse) super(Momentum, self).__init__( learning_method=learning_method, **kwargs) diff --git a/python/paddle/v2/parameters.py b/python/paddle/v2/parameters.py index 2a6026bcab1c8a373d8dd5eac480dec62a8eb3b9..05dc5c68dd97b00fb15b74564a32313430c45345 100644 --- a/python/paddle/v2/parameters.py +++ b/python/paddle/v2/parameters.py @@ -1,7 +1,9 @@ import numpy as np import py_paddle.swig_paddle as api from paddle.proto.ParameterConfig_pb2 import ParameterConfig - +import struct +import tarfile +import cStringIO from topology import Topology __all__ = ['Parameters', 'create'] @@ -10,6 +12,7 @@ __all__ = ['Parameters', 'create'] def create(layers): """ Create parameter pool by topology. + :param layers: :return: """ @@ -67,6 +70,7 @@ class Parameters(object): def keys(self): """ keys are the names of each parameter. + :return: list of parameter name :rtype: list """ @@ -75,6 +79,7 @@ class Parameters(object): def names(self): """ names of each parameter. + :return: list of parameter name :rtype: list """ @@ -83,6 +88,7 @@ class Parameters(object): def has_key(self, key): """ has_key return true if there are such parameter name == key + :param key: Parameter name :type key: basestring :return: True if contains such key @@ -118,6 +124,12 @@ class Parameters(object): if len(self.__gradient_machines__) == 0: # create new parameter in python numpy. 
+ if len(self.__tmp_params__) != 0: + ret_list = [ + mat for name, mat in self.__tmp_params__ if name == key + ] + if len(ret_list) == 1: + return ret_list[0] return np.ndarray(shape=shape, dtype=np.float32) else: for each_gradient_machine in self.__gradient_machines__: @@ -136,6 +148,7 @@ class Parameters(object): def get_shape(self, key): """ get shape of the parameter. + :param key: parameter name :type key: basestring :return: parameter's shape @@ -190,6 +203,7 @@ class Parameters(object): def set(self, parameter_name, value): """ Set parameter by parameter name & matrix. + :param parameter_name: parameter name :type parameter_name: basestring :param value: parameter matrix @@ -222,6 +236,67 @@ class Parameters(object): self.__gradient_machines__.append(gradient_machine) + def serialize(self, name, f): + """ + + :param name: + :param f: + :type f: file + :return: + """ + param = self.get(name) + size = reduce(lambda a, b: a * b, param.shape) + f.write(struct.pack("IIQ", 0, 4, size)) + param = param.astype(np.float32) + f.write(param.tobytes()) + + def deserialize(self, name, f): + """ + + :param name: + :param f: + :type f: file + :return: + """ + f.read(16) # header + arr = np.frombuffer(f.read(), dtype=np.float32) + self.set(name, arr.reshape(self.get_shape(name))) + + def to_tar(self, f): + tar = tarfile.TarFile(fileobj=f, mode='w') + for nm in self.names(): + buf = cStringIO.StringIO() + self.serialize(nm, buf) + tarinfo = tarfile.TarInfo(name=nm) + buf.seek(0) + tarinfo.size = len(buf.getvalue()) + tar.addfile(tarinfo, buf) + + conf = self.__param_conf__[nm] + confStr = conf.SerializeToString() + tarinfo = tarfile.TarInfo(name="%s.protobuf" % nm) + tarinfo.size = len(confStr) + buf = cStringIO.StringIO(confStr) + buf.seek(0) + tar.addfile(tarinfo, fileobj=buf) + + @staticmethod + def from_tar(f): + params = Parameters() + tar = tarfile.TarFile(fileobj=f, mode='r') + for finfo in tar: + assert isinstance(finfo, tarfile.TarInfo) + if 
finfo.name.endswith('.protobuf'): + f = tar.extractfile(finfo) + conf = ParameterConfig() + conf.ParseFromString(f.read()) + params.__append_config__(conf) + + for param_name in params.names(): + f = tar.extractfile(param_name) + params.deserialize(param_name, f) + return params + def __get_parameter_in_gradient_machine__(gradient_machine, name): """ diff --git a/python/paddle/v2/reader/__init__.py b/python/paddle/v2/reader/__init__.py index 7373dc461b1d3115c03b37c5102a469a52aa7441..3b059735a924d58714cd88a761eb83143f1192d6 100644 --- a/python/paddle/v2/reader/__init__.py +++ b/python/paddle/v2/reader/__init__.py @@ -11,15 +11,64 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. +""" +At training and testing time, PaddlePaddle programs need to read data. To ease +the users' work to write data reading code, we define that -# It would be too lengthy to require our users to prefix decorators with `decorator`. -# For example, we want the following line -# -# r = paddle.reader.decorator.bufferd(paddle.reader.creator.text("hello.txt")) -# -# to be a shorter version: -# -# r = paddle.reader.buffered(paddle.reader.creator.text("hello.txt")) +- A *reader* is a function that reads data (from file, network, random number + generator, etc) and yields data items. +- A *reader creator* is a function that returns a reader function. +- A *reader decorator* is a function, which accepts one or more readers, and + returns a reader. +- A *batch reader* is a function that reads data (from *reader*, file, network, + random number generator, etc) and yields a batch of data items. + +##################### +Data Reader Interface +##################### + +Indeed, *data reader* doesn't have to be a function that reads and yields data +items. 
It can be any function with no parameter that creates an iterable +(anything that can be used in :code:`for x in iterable`)\: + +.. code-block:: python + + iterable = data_reader() + +Element produced from the iterable should be a **single** entry of data, +**not** a mini batch. That entry of data could be a single item, or a tuple of +items. +Item should be of `supported type `_ (e.g., numpy 1d +array of float32, int, list of int) + +An example implementation for single item data reader creator: + +.. code-block:: python + + def reader_creator_random_image(width, height): + def reader(): + while True: + yield numpy.random.uniform(-1, 1, size=width*height) + return reader + +An example implementation for multiple item data reader creator: + +.. code-block:: python + + def reader_creator_random_image_and_label(width, height, label): + def reader(): + while True: + yield numpy.random.uniform(-1, 1, size=width*height), label + return reader + + +TODO(yuyang18): Should we add whole design doc here? +""" + +import decorator from decorator import * import creator + +__all__ = decorator.__all__ + ['creator'] diff --git a/python/paddle/v2/reader/creator.py b/python/paddle/v2/reader/creator.py index 5a91bb0b8ef6d1874737386897f6c555eaec18d4..07142056f872db5113acdd296b17c52b343c1be6 100644 --- a/python/paddle/v2/reader/creator.py +++ b/python/paddle/v2/reader/creator.py @@ -11,6 +11,10 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. +""" +Creator package contains some simple reader creators, which could be used in user +program. +""" __all__ = ['np_array', 'text_file'] @@ -38,7 +42,7 @@ def np_array(x): def text_file(path): """ Creates a data reader that outputs text line by line from given text file. - Trailing new line ('\n') of each line will be removed. + Trailing new line ('\\\\n') of each line will be removed. :path: path of the text file.
:returns: data reader of text file diff --git a/python/paddle/v2/reader/decorator.py b/python/paddle/v2/reader/decorator.py index c4ba110205a3c1cca2f5581d33048d2a205929b4..104ce9a0411413bb8fc65eedf5821f98d6acdba3 100644 --- a/python/paddle/v2/reader/decorator.py +++ b/python/paddle/v2/reader/decorator.py @@ -28,9 +28,11 @@ def map_readers(func, *readers): Creates a data reader that outputs return value of function using output of each data readers as arguments. - :param func: function to use. - :param *readers: readers whose outputs will be used as arguments of func. - :returns: the created data reader. + :param func: function to use. The type of func should be (Sample) => Sample + :type: callable + :param readers: readers whose outputs will be used as arguments of func. + :return: the created data reader. + :rtype: callable """ def reader(): @@ -45,16 +47,19 @@ def map_readers(func, *readers): def shuffle(reader, buf_size): """ - Creates a data reader whose data output is suffled. + Creates a data reader whose data output is shuffled. Output from the iterator that created by original reader will be buffered into shuffle buffer, and then shuffled. The size of shuffle buffer is determined by argument buf_size. :param reader: the original reader whose output will be shuffled. + :type reader: callable :param buf_size: shuffle buffer size. + :type buf_size: int - :returns:the new reader whose output is shuffled. + :return: the new reader whose output is shuffled. + :rtype: callable """ def data_reader(): @@ -88,7 +93,8 @@ def chain(*readers): [0, 0, 0, 1, 1, 1, 2, 2, 2] :param readers: input readers. - :returns: the new data reader. + :return: the new data reader. + :rtype: callable """ def reader(): @@ -115,12 +121,13 @@ def compose(*readers, **kwargs): The composed reader will output: (1, 2, 3, 4, 5) - :*readers: readers that will be composed together. 
- :check_alignment: if True, will check if input readers are aligned + :param readers: readers that will be composed together. + :param check_alignment: if True, will check if input readers are aligned correctly. If False, will not check alignment and trailing outputs will be discarded. Defaults to True. + :type check_alignment: bool - :returns: the new data reader. + :return: the new data reader. :raises ComposeNotAligned: outputs of readers are not aligned. Will not raise when check_alignment is set to False. @@ -161,7 +168,9 @@ def buffered(reader, size): as the buffer is not empty. :param reader: the data reader to read from. + :type reader: callable :param size: max buffer size. + :type size: int :returns: the buffered data reader. """ @@ -196,6 +205,13 @@ def buffered(reader, size): def firstn(reader, n): """ Limit the max number of samples that reader could return. + + :param reader: the data reader to read from. + :type reader: callable + :param n: the max number of samples to return. + :type n: int + :return: the decorated reader.
+ :rtype: callable """ # TODO(yuyang18): Check if just drop the reader, could clean the opened diff --git a/python/paddle/v2/tests/run_tests.sh b/python/paddle/v2/tests/run_tests.sh index b96f54fe9cc78a436bc67e6c542b6e842aba997b..dda1b1bd222a9f226db1a4bd730e9637ab882196 100755 --- a/python/paddle/v2/tests/run_tests.sh +++ b/python/paddle/v2/tests/run_tests.sh @@ -22,7 +22,7 @@ cd $SCRIPTPATH $1 -m pip install ../../../../paddle/dist/*.whl -test_list="test_data_feeder.py" +test_list="test_data_feeder.py test_parameters.py" export PYTHONPATH=$PWD/../../../../python/ diff --git a/python/paddle/v2/tests/test_parameters.py b/python/paddle/v2/tests/test_parameters.py new file mode 100644 index 0000000000000000000000000000000000000000..ebb182caab6430862a8e4da2ae4ea6b1e72f726c --- /dev/null +++ b/python/paddle/v2/tests/test_parameters.py @@ -0,0 +1,60 @@ +import unittest +import sys + +try: + import py_paddle + + del py_paddle +except ImportError: + print >> sys.stderr, "It seems swig of Paddle is not installed, this " \ + "unittest will not be run." 
+ sys.exit(0) + +import paddle.v2.parameters as parameters +from paddle.proto.ParameterConfig_pb2 import ParameterConfig +import random +import cStringIO +import numpy + + +def __rand_param_config__(name): + conf = ParameterConfig() + conf.name = name + size = 1 + for i in xrange(2): + dim = random.randint(1, 1000) + conf.dims.append(dim) + size *= dim + conf.size = size + assert conf.IsInitialized() + return conf + + +class TestParameters(unittest.TestCase): + def test_serialization(self): + params = parameters.Parameters() + params.__append_config__(__rand_param_config__("param_0")) + params.__append_config__(__rand_param_config__("param_1")) + + for name in params.names(): + param = params.get(name) + param[:] = numpy.random.uniform( + -1.0, 1.0, size=params.get_shape(name)) + params.set(name, param) + + tmp_file = cStringIO.StringIO() + params.to_tar(tmp_file) + tmp_file.seek(0) + params_dup = parameters.Parameters.from_tar(tmp_file) + + self.assertEqual(params_dup.names(), params.names()) + + for name in params.names(): + self.assertEqual(params.get_shape(name), params_dup.get_shape(name)) + p0 = params.get(name) + p1 = params_dup.get(name) + self.assertTrue(numpy.isclose(p0, p1).all()) + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/v2/tests/test_topology.py b/python/paddle/v2/tests/test_topology.py index 1bf55a5bc68dfdb837773b3120e5b55d304f644d..5c6dbcdb4f49b960fb8b71aecbad4f013d2cd283 100644 --- a/python/paddle/v2/tests/test_topology.py +++ b/python/paddle/v2/tests/test_topology.py @@ -16,6 +16,7 @@ import paddle.v2.layer as layer import paddle.v2.topology as topology import paddle.v2.data_type as data_type import paddle.trainer_config_helpers as conf_helps +import paddle.trainer.PyDataProvider2 as pydp2 class TestTopology(unittest.TestCase): @@ -35,13 +36,13 @@ class TestTopology(unittest.TestCase): pixel_data_type = filter(lambda type: type[0] == "pixel", data_types) self.assertEqual(len(pixel_data_type), 1) pixel_data_type = 
pixel_data_type[0] - self.assertEqual(pixel_data_type[1].type, data_type.DataType.Dense) + self.assertEqual(pixel_data_type[1].type, pydp2.DataType.Dense) self.assertEqual(pixel_data_type[1].dim, 784) label_data_type = filter(lambda type: type[0] == "label", data_types) self.assertEqual(len(label_data_type), 1) label_data_type = label_data_type[0] - self.assertEqual(label_data_type[1].type, data_type.DataType.Index) + self.assertEqual(label_data_type[1].type, pydp2.DataType.Index) self.assertEqual(label_data_type[1].dim, 10) def test_get_layer(self): diff --git a/python/paddle/v2/trainer.py b/python/paddle/v2/trainer.py index dd64f0565b0e2e22c47803287ff55012e0a7bdcc..7bd3e2c565ee00c91402e7dea36c7393fb1a9bdf 100644 --- a/python/paddle/v2/trainer.py +++ b/python/paddle/v2/trainer.py @@ -9,6 +9,10 @@ from . import optimizer as v2_optimizer from . import parameters as v2_parameters __all__ = ['SGD'] +""" +Trainer package +TODO(yuyang18): Complete comments. +""" def default_event_handler(event): @@ -22,14 +26,20 @@ def default_event_handler(event): pass -class SGD(): - def __init__(self, cost, parameters, update_equation): - """ - Simple SGD Trainer. +class SGD(object): + """ + Simple SGD Trainer. + TODO(yuyang18): Complete comments + + :param update_equation: The optimizer object. + :type update_equation: paddle.v2.optimizer.Optimizer + :param cost: Target cost that neural network should be optimized. + :type cost: paddle.v2.config_base.Layer + :param parameters: The parameters dictionary. + :type parameters: paddle.v2.parameters.Parameters + """ - :param update_equation: The optimizer object. 
- :type update_equation: v2_optimizer.Optimizer - """ + def __init__(self, cost, parameters, update_equation): if not isinstance(parameters, v2_parameters.Parameters): raise TypeError('parameters should be parameters') @@ -47,29 +57,26 @@ class SGD(): self.__topology_in_proto__, api.CREATE_MODE_NORMAL, self.__optimizer__.enable_types()) assert isinstance(gm, api.GradientMachine) - parameters.append_gradient_machine(gm) self.__gradient_machine__ = gm self.__gradient_machine__.randParameters() + parameters.append_gradient_machine(gm) - def train(self, reader, num_passes=1, event_handler=None, reader_dict=None): + def train(self, reader, num_passes=1, event_handler=None, feeding=None): """ Training method. Will train num_passes of input data. :param reader: - :param topology: Network Topology, use one or more Layers to represent it. - :param parameters: The parameter pools. :param num_passes: The total train passes. :param event_handler: Event handler. A method will be invoked when event occurred. :type event_handler: (BaseEvent) => None + :param feeding: Feeding is a map of neural network input name and array + index that reader returns. 
+ :type feeding: dict :return: """ if event_handler is None: event_handler = default_event_handler - - if reader_dict is None: - reader_dict = self.default_reader_dict() - __check_train_args__(**locals()) updater = self.__optimizer__.create_local_updater() @@ -81,9 +88,7 @@ class SGD(): pass_evaluator = self.__gradient_machine__.makeEvaluator() assert isinstance(pass_evaluator, api.Evaluator) out_args = api.Arguments.createArguments(0) - - feeder = DataFeeder(self.__data_types__, reader_dict) - + feeder = DataFeeder(self.__data_types__, feeding) for pass_id in xrange(num_passes): event_handler(v2_event.BeginPass(pass_id)) pass_evaluator.start() @@ -117,17 +122,8 @@ class SGD(): event_handler(v2_event.EndPass(pass_id, evaluator=pass_evaluator)) self.__gradient_machine__.finish() - def default_reader_dict(self): - reader_dict = dict() - for i, tp in enumerate(self.__data_types__): - reader_dict[tp[0]] = i - return reader_dict - - def test(self, reader, reader_dict=None): - if reader_dict is None: - reader_dict = self.default_reader_dict() - - feeder = DataFeeder(self.__data_types__, reader_dict) + def test(self, reader, feeding=None): + feeder = DataFeeder(self.__data_types__, feeding) evaluator = self.__gradient_machine__.makeEvaluator() out_args = api.Arguments.createArguments(0) evaluator.start()