diff --git a/README.md b/README.md index 60ffbe728178705b1734e682868614025214c2a4..45186ec4ef48dc305b2616dbf4966f01c3609962 100644 --- a/README.md +++ b/README.md @@ -19,7 +19,7 @@ Our vision is to enable deep learning for everyone via PaddlePaddle. Please refer to our [release announcement](https://github.com/PaddlePaddle/Paddle/releases) to track the latest feature of PaddlePaddle. -### Latest PaddlePaddle Release: [Fluid 0.14.0](https://github.com/PaddlePaddle/Paddle/tree/v0.14.0) +### Latest PaddlePaddle Release: [Fluid 0.15.0](https://github.com/PaddlePaddle/Paddle/tree/v0.15.0) ### Install Latest Stable Release: ``` # Linux CPU @@ -76,26 +76,26 @@ pip install paddlepaddle-gpu==0.14.0.post85 ## Installation -It is recommended to read [this doc](http://paddlepaddle.org/documentation/docs/zh/0.14.0/new_docs/beginners_guide/install/install_doc.html) on our website. +It is recommended to read [this doc](http://paddlepaddle.org/documentation/docs/zh/0.15.0/new_docs/beginners_guide/install/install_doc.html) on our website. ## Documentation -We provide [English](http://paddlepaddle.org/documentation/docs/en/0.14.0/getstarted/index_en.html) and -[Chinese](http://paddlepaddle.org/documentation/docs/zh/0.14.0/new_docs/beginners_guide/index.html) documentation. +We provide [English](http://paddlepaddle.org/documentation/docs/en/0.15.0/getstarted/index_en.html) and +[Chinese](http://paddlepaddle.org/documentation/docs/zh/0.15.0/new_docs/beginners_guide/index.html) documentation. - [Deep Learning 101](https://github.com/PaddlePaddle/book) You might want to start from this online interactive book that can run in a Jupyter Notebook. -- [Distributed Training](http://paddlepaddle.org/documentation/docs/zh/0.14.0/new_docs/user_guides/howto/training/cluster_howto.html) +- [Distributed Training](http://paddlepaddle.org/documentation/docs/zh/0.15.0/new_docs/user_guides/howto/training/cluster_howto.html) You can run distributed training jobs on MPI clusters. -- [Python API](http://paddlepaddle.org/documentation/api/zh/0.14.0/fluid.html) +- [Python API](http://paddlepaddle.org/documentation/api/zh/0.15.0/fluid.html) Our new API enables much shorter programs. -- [How to Contribute](http://paddlepaddle.org/documentation/docs/zh/0.14.0/new_docs/advanced_usage/development/contribute_to_paddle.html) +- [How to Contribute](http://paddlepaddle.org/documentation/docs/zh/0.15.0/new_docs/advanced_usage/development/contribute_to_paddle.html) We appreciate your contributions! diff --git a/benchmark/fluid/fluid_benchmark.py b/benchmark/fluid/fluid_benchmark.py index 11bd75e1d09a6b51c7c749c512f2b71f3604f3fb..25622ee06c69e13181f34dfffadd5e299d31c8a8 100644 --- a/benchmark/fluid/fluid_benchmark.py +++ b/benchmark/fluid/fluid_benchmark.py @@ -91,7 +91,8 @@ def dist_transpile(trainer_id, args, train_prog, startup_prog): program=train_prog, pservers=pserver_endpoints, trainers=trainers, - sync_mode=not args.async_mode) + sync_mode=not args.async_mode, + startup_program=startup_prog) if training_role == "PSERVER": pserver_program = t.get_pserver_program(current_endpoint) pserver_startup_program = t.get_startup_program( diff --git a/benchmark/fluid/models/resnet.py b/benchmark/fluid/models/resnet.py index ae1baa48e17e40448e457052fd1464b9604a2128..d71b855612ae32083b2b2e3448db3749c340633b 100644 --- a/benchmark/fluid/models/resnet.py +++ b/benchmark/fluid/models/resnet.py @@ -20,6 +20,7 @@ import functools import numpy as np import time import os +import math import cProfile, pstats, StringIO @@ -27,128 +28,120 @@ import paddle import paddle.fluid as fluid import paddle.fluid.core as core import paddle.fluid.profiler as profiler -# from recordio_converter import imagenet_train, imagenet_test from imagenet_reader import train, val +train_parameters = { + "input_size": [3, 224, 224], + "input_mean": [0.485, 0.456, 0.406], + "input_std": [0.229, 0.224, 0.225], + "learning_strategy": { + "name": "piecewise_decay", + "batch_size": 256, + "epochs": [30, 60, 90], + "steps": [0.1, 0.01, 0.001, 0.0001] + } +} + + +class ResNet(): + def __init__(self, layers=50, is_train=True): + self.params = train_parameters + self.layers = layers + self.is_train = is_train + + def net(self, input, class_dim=1000): + layers = self.layers + supported_layers = [50, 101, 152] + assert layers in supported_layers, \ + "supported layers are {} but input layer is {}".format(supported_layers, layers) + + if layers == 50: + depth = [3, 4, 6, 3] + elif layers == 101: + depth = [3, 4, 23, 3] + elif layers == 152: + depth = [3, 8, 36, 3] + num_filters = [64, 128, 256, 512] + + conv = self.conv_bn_layer( + input=input, num_filters=64, filter_size=7, stride=2, act='relu') + conv = fluid.layers.pool2d( + input=conv, + pool_size=3, + pool_stride=2, + pool_padding=1, + pool_type='max') + + for block in range(len(depth)): + for i in range(depth[block]): + conv = self.bottleneck_block( + input=conv, + num_filters=num_filters[block], + stride=2 if i == 0 and block != 0 else 1) + + pool = fluid.layers.pool2d( + input=conv, pool_size=7, pool_type='avg', global_pooling=True) + stdv = 1.0 / math.sqrt(pool.shape[1] * 1.0) + out = fluid.layers.fc(input=pool, + size=class_dim, + act='softmax', + param_attr=fluid.param_attr.ParamAttr( + initializer=fluid.initializer.Uniform(-stdv, + stdv))) + return out + + def conv_bn_layer(self, + input, + num_filters, + filter_size, + stride=1, + groups=1, + act=None): + conv = fluid.layers.conv2d( + input=input, + num_filters=num_filters, + filter_size=filter_size, + stride=stride, + padding=(filter_size - 1) // 2, + groups=groups, + act=None, + bias_attr=False) + return fluid.layers.batch_norm( + input=conv, act=act, is_test=not self.is_train) + + def shortcut(self, input, ch_out, stride): + ch_in = input.shape[1] + if ch_in != ch_out or stride != 1: + return self.conv_bn_layer(input, ch_out, 1, stride) + else: + return input -def conv_bn_layer(input, - ch_out, - filter_size, - stride, - padding, - act='relu', - is_train=True): - conv1 = fluid.layers.conv2d( - input=input, - filter_size=filter_size, - num_filters=ch_out, - stride=stride, - padding=padding, - act=None, - bias_attr=False) - return fluid.layers.batch_norm(input=conv1, act=act, is_test=not is_train) - - -def shortcut(input, ch_out, stride, is_train=True): - ch_in = input.shape[1] # if args.data_format == 'NCHW' else input.shape[-1] - if ch_in != ch_out: - return conv_bn_layer( - input, ch_out, 1, stride, 0, None, is_train=is_train) - else: - return input - - -def basicblock(input, ch_out, stride, is_train=True): - short = shortcut(input, ch_out, stride, is_train=is_train) - conv1 = conv_bn_layer(input, ch_out, 3, stride, 1, is_train=is_train) - conv2 = conv_bn_layer(conv1, ch_out, 3, 1, 1, act=None, is_train=is_train) - return fluid.layers.elementwise_add(x=short, y=conv2, act='relu') - - -def bottleneck(input, ch_out, stride, is_train=True): - short = shortcut(input, ch_out * 4, stride, is_train=is_train) - conv1 = conv_bn_layer(input, ch_out, 1, stride, 0, is_train=is_train) - conv2 = conv_bn_layer(conv1, ch_out, 3, 1, 1, is_train=is_train) - conv3 = conv_bn_layer( - conv2, ch_out * 4, 1, 1, 0, act=None, is_train=is_train) - return fluid.layers.elementwise_add(x=short, y=conv3, act='relu') - - -def layer_warp(block_func, input, ch_out, count, stride): - res_out = block_func(input, ch_out, stride) - for i in range(1, count): - res_out = block_func(res_out, ch_out, 1) - return res_out - + def bottleneck_block(self, input, num_filters, stride): + conv0 = self.conv_bn_layer( + input=input, num_filters=num_filters, filter_size=1, act='relu') + conv1 = self.conv_bn_layer( + input=conv0, + num_filters=num_filters, + filter_size=3, + stride=stride, + act='relu') + conv2 = self.conv_bn_layer( + input=conv1, num_filters=num_filters * 4, filter_size=1, act=None) -def resnet_imagenet(input, - class_dim, - depth=50, - data_format='NCHW', - is_train=True): + short = self.shortcut(input, num_filters * 4, stride) - cfg = { - 18: ([2, 2, 2, 1], basicblock), - 34: ([3, 4, 6, 3], basicblock), - 50: ([3, 4, 6, 3], bottleneck), - 101: ([3, 4, 23, 3], bottleneck), - 152: ([3, 8, 36, 3], bottleneck) - } - stages, block_func = cfg[depth] - conv1 = conv_bn_layer(input, ch_out=64, filter_size=7, stride=2, padding=3) - pool1 = fluid.layers.pool2d( - input=conv1, pool_type='avg', pool_size=3, pool_stride=2) - res1 = layer_warp(block_func, pool1, 64, stages[0], 1) - res2 = layer_warp(block_func, res1, 128, stages[1], 2) - res3 = layer_warp(block_func, res2, 256, stages[2], 2) - res4 = layer_warp(block_func, res3, 512, stages[3], 2) - pool2 = fluid.layers.pool2d( - input=res4, - pool_size=7, - pool_type='avg', - pool_stride=1, - global_pooling=True) - out = fluid.layers.fc(input=pool2, size=class_dim, act='softmax') - return out - - -def resnet_cifar10(input, class_dim, depth=32, data_format='NCHW'): - assert (depth - 2) % 6 == 0 - - n = (depth - 2) // 6 - - conv1 = conv_bn_layer( - input=input, ch_out=16, filter_size=3, stride=1, padding=1) - res1 = layer_warp(basicblock, conv1, 16, n, 1) - res2 = layer_warp(basicblock, res1, 32, n, 2) - res3 = layer_warp(basicblock, res2, 64, n, 2) - pool = fluid.layers.pool2d( - input=res3, pool_size=8, pool_type='avg', pool_stride=1) - out = fluid.layers.fc(input=pool, size=class_dim, act='softmax') - return out + return fluid.layers.elementwise_add(x=short, y=conv2, act='relu') def _model_reader_dshape_classdim(args, is_train): - model = resnet_cifar10 + model = None reader = None - if args.data_set == "cifar10": - class_dim = 10 - if args.data_format == 'NCHW': - dshape = [3, 32, 32] - else: - dshape = [32, 32, 3] - model = resnet_cifar10 - if is_train: - reader = paddle.dataset.cifar.train10() - else: - reader = paddle.dataset.cifar.test10() - elif args.data_set == "flowers": + if args.data_set == "flowers": class_dim = 102 if args.data_format == 'NCHW': dshape = [3, 224, 224] else: dshape = [224, 224, 3] - model = resnet_imagenet if is_train: reader = paddle.dataset.flowers.train() else: @@ -159,7 +152,6 @@ def _model_reader_dshape_classdim(args, is_train): dshape = [3, 224, 224] else: dshape = [224, 224, 3] - model = resnet_imagenet if not args.data_path: raise Exception( "Must specify --data_path when training with imagenet") @@ -173,12 +165,11 @@ def _model_reader_dshape_classdim(args, is_train): reader = train(xmap=False) else: reader = val(xmap=False) - return model, reader, dshape, class_dim + return reader, dshape, class_dim def get_model(args, is_train, main_prog, startup_prog): - model, reader, dshape, class_dim = _model_reader_dshape_classdim(args, - is_train) + reader, dshape, class_dim = _model_reader_dshape_classdim(args, is_train) pyreader = None trainer_count = int(os.getenv("PADDLE_TRAINERS")) @@ -198,7 +189,8 @@ def get_model(args, is_train, main_prog, startup_prog): label = fluid.layers.data( name='label', shape=[1], dtype='int64') - predict = model(input, class_dim, is_train=is_train) + model = ResNet(is_train=is_train) + predict = model.net(input, class_dim=class_dim) cost = fluid.layers.cross_entropy(input=predict, label=label) avg_cost = fluid.layers.mean(x=cost) @@ -216,15 +208,14 @@ def get_model(args, is_train, main_prog, startup_prog): total_images = 1281167 / trainer_count step = int(total_images / args.batch_size + 1) - epochs = [30, 60, 80, 90] + epochs = [30, 60, 90] bd = [step * e for e in epochs] base_lr = args.learning_rate lr = [] lr = [base_lr * (0.1**i) for i in range(len(bd) + 1)] optimizer = fluid.optimizer.Momentum( - learning_rate=base_lr, - #learning_rate=fluid.layers.piecewise_decay( - # boundaries=bd, values=lr), + learning_rate=fluid.layers.piecewise_decay( + boundaries=bd, values=lr), momentum=0.9, regularization=fluid.regularizer.L2Decay(1e-4)) optimizer.minimize(avg_cost) diff --git a/doc/fluid/dev/releasing_process_cn.md b/doc/fluid/dev/releasing_process_cn.md index 4c6728fba7150b0f1e180e57590f18a5b677c70d..acea9a2b5df903a958edf3683900e165670e196f 100644 --- a/doc/fluid/dev/releasing_process_cn.md +++ b/doc/fluid/dev/releasing_process_cn.md @@ -1,24 +1,23 @@ # PaddlePaddle发行规范 -PaddlePaddle使用git-flow branching model做分支管理,使用[Semantic Versioning](http://semver.org/)标准表示PaddlePaddle版本号。 +PaddlePaddle使用Trunk Based Development,使用[Semantic Versioning](http://semver.org/)标准表示PaddlePaddle版本号。 PaddlePaddle每次发新的版本,遵循以下流程: 1. 从`develop`分支派生出新的分支,分支名为`release/版本号`。例如,`release/0.10.0` -1. 将新分支的版本打上tag,tag为`版本号rc.Patch号`。第一个tag为`0.10.0rc1`,第二个为`0.10.0rc2`,依次类推。 -1. 对这个版本的提交,做如下几个操作: - * 使用Regression Test List作为检查列表,测试本次release的正确性。 - * 如果失败,记录下所有失败的例子,在这个`release/版本号`分支中,修复所有bug后,Patch号加一,到第二步 - * 修改`python/setup.py.in`中的版本信息,并将`istaged`字段设为`True`。 - * 将这个版本的python wheel包发布到pypi。 - * 更新Docker镜像(参考后面的操作细节)。 -1. 第三步完成后,将`release/版本号`分支合入master分支,将master分支的合入commit打上tag,tag为`版本号`。同时再将`master`分支合入`develop`分支。 -1. 协同完成Release Note的书写。 +2. 将新分支的版本打上tag,tag为`版本号rc-Patch号`。例如,第一个tag为`0.10.0-rc0`。 +3. 新分支一般不接受新的feature和优化。QA在release分支上进行测试。研发基于最新的develop开发。 +4. QA和研发发现的bug,在develop上修复验证后,cherry-pick修复到release分支。直到release分支相对稳定。 +5. 如果有需要,在release分支最新代码上打上新的tag,比如`0.10.0-rc1`,让更多的用户加入测试。重复3-4步。 +6. release分支稳定后,打上正式的release tag,比如`0.10.0`。 +7. 将这个版本的python wheel包发布到pypi。 +8. 更新Docker镜像(参考后面的操作细节)。 需要注意的是: -* `release/版本号`分支一旦建立,一般不允许再从`develop`分支合入`release/版本号`。这样保证`release/版本号`分支功能的封闭,方便测试人员测试PaddlePaddle的行为。 -* 在`release/版本号`分支存在的时候,如果有bugfix的行为,需要将bugfix的分支同时merge到`master`, `develop`和`release/版本号`这三个分支。 +* bug修复需要先在develop上进行,然后进入release分支。而不是直接在release分支上开发。 + +* release分支原则上只接受修复类的修改,不接受新feature。 ## 发布wheel包到pypi @@ -61,24 +60,21 @@ docker push [镜像]:[version] ## PaddlePaddle 分支规范 -PaddlePaddle开发过程使用[git-flow](http://nvie.com/posts/a-successful-git-branching-model/)分支规范,并适应github的特性做了一些区别。 - -* PaddlePaddle的主版本库遵循[git-flow](http://nvie.com/posts/a-successful-git-branching-model/)分支规范。其中: - * `master`分支为稳定(stable branch)版本分支。每一个`master`分支的版本都是经过单元测试和回归测试的版本。 - * `develop`分支为开发(develop branch)版本分支。每一个`develop`分支的版本都经过单元测试,但并没有经过回归测试。 - * `release/版本号`分支为每一次Release时建立的临时分支。在这个阶段的代码正在经历回归测试。 +PaddlePaddle开发过程使用[Trunk Based Development](https://trunkbaseddevelopment.com/) 开发规范。 -* 其他用户的fork版本库并不需要严格遵守[git-flow](http://nvie.com/posts/a-successful-git-branching-model/)分支规范,但所有fork的版本库的所有分支都相当于特性分支。 - * 建议,开发者fork的版本库使用`develop`分支同步主版本库的`develop`分支 - * 建议,开发者fork的版本库中,再基于`develop`版本fork出自己的功能分支。 - * 当功能分支开发完毕后,向PaddlePaddle的主版本库提交`Pull Reuqest`,进而进行代码评审。 - * 在评审过程中,开发者修改自己的代码,可以继续在自己的功能分支提交代码。 +* `develop`分支为开发(develop branch)版本分支。每一个`develop`分支的版本都经过单元测试。并且会经过模型回归测试。 +* `release/版本号`分支为每一次Release时建立的临时分支。release分支主要用于测试,bug修复和最终发版。 +* `master`分支因为历史原因,已经废弃。 -* BugFix分支也是在开发者自己的fork版本库维护,与功能分支不同的是,BugFix分支需要分别给主版本库的`master`、`develop`与可能有的`release/版本号`分支,同时提起`Pull Request`。 +* 其他开发者fork的feature branch。 + * 建议,开发者的feature branch需要同步主版本库的`develop`分支。 + * 建议,开发者的feature branch需要基于主版本库中的`develop`分支。 + * 当feature branch开发完毕后,向PaddlePaddle的主版本库提交`Pull Reuqest`,进而进行代码评审。 + * 在评审过程中,开发者修改自己的代码,可以继续在自己的feature branch提交代码。 ## PaddlePaddle回归测试列表 -本列表说明PaddlePaddle发版之前需要测试的功能点。 +TODO ### PaddlePaddle Book中所有章节 diff --git a/doc/fluid/dev/releasing_process_en.md b/doc/fluid/dev/releasing_process_en.md index 2c1c30c1eddfde6d9a8e2637be86537c43cc1b00..b810dc941d27fdb5004812ab58e105502e83280f 100644 --- a/doc/fluid/dev/releasing_process_en.md +++ b/doc/fluid/dev/releasing_process_en.md @@ -4,26 +4,21 @@ PaddlePaddle manages its branches using "git-flow branching model", and [Semanti Each time we release a new PaddlePaddle version, we should follow the below steps: -1. Fork a new branch from `develop` named `release/[version]`, e.g. `release/0.10.0`. -1. Push a new tag on the release branch, the tag name should be like `[version]rc.patch`. The - first tag should be `0.10.0rc1`, and the second should be `0.10.0.rc2` and so on. -1. After that, we should do: - * Run all regression test on the Regression Test List (see PaddlePaddle TeamCity CI), to confirm - that this release has no major bugs. - * If regression test fails, we must fix those bugs and create a new `release/[version]` - branch from previous release branch. - * Modify `python/setup.py.in`, change the version number and change `ISTAGED` to `True`. - * Publish PaddlePaddle release wheel packages to pypi (see below instructions for detail). - * Update the Docker images (see below instructions for detail). -1. After above step, merge `release/[version]` branch to master and push a tag on the master commit, - then merge `master` to `develop`. -1. Update the Release Note. - -***NOTE:*** - -* Do ***NOT*** merge commits from develop branch to release branches to keep the release branch contain - features only for current release, so that we can test on that version. -* If we want to fix bugs on release branches, we must merge the fix to master, develop and release branch. +1. Create a new release branch from `develop`,named `release/[version]`. E.g.,`release/0.10.0` +2. Create a new tag for the release branch, tag format: `version-rc.Patch`. E.g. the first tag is `0.10.0-rc0`。 +3. New release branch normally doesn't accept new features or optimizations. QA will test on the release branch. Developer should develop based on `develop` branch. +4. If QA or Developer find bugs. They should first fix and verify on `develop` branch. Then cherry-pick the fix to the release branch. Wait until the release branch is stable. +5. If necessary, create a new tag on the relese branch, e.g. `0.10.0-rc1`. Involve more users to try it and repeat step 3-4. +6. After release branch is stable,Create the official release tag,such as `0.10.0`. +7. Release the python wheel package to pypi. +8. Update the docker image (More details below). + +NOTE: + +* bug fix should happen on `develop` branch, then cherry-pick to relese branch. Avoid developing directly on release branch. + +* release normally only accept bug fixes. Don't add new features. + ## Publish Wheel Packages to pypi @@ -97,26 +92,22 @@ You can then checkout the latest pushed tags at https://hub.docker.com/r/paddlep ## Branching Model -We use [git-flow](http://nvie.com/posts/a-successful-git-branching-model/) as our branching model, -with some modifications: - -* `master` branch is the stable branch. Each version on the master branch is tested and guaranteed. -* `develop` branch is for development. Each commit on develop branch has passed CI unit test, but no - regression tests are run. -* `release/[version]` branch is used to publish each release. Latest release version branches have - bugfix only for that version, but no feature updates. -* Developer forks are not required to follow - [git-flow](http://nvie.com/posts/a-successful-git-branching-model/) - branching model, all forks is like a feature branch. - * Advise: developer fork's develop branch is used to sync up with main repo's develop branch. - * Advise: developer use it's fork's develop branch to for new branch to start developing. - * Use that branch on developer's fork to create pull requests and start reviews. - * developer can push new commits to that branch when the pull request is open. -* Bug fixes are also started from developers forked repo. And, bug fixes branch can merge to - `master`, `develop` and `releases`. +PaddlePaddle uses [Trunk Based Development](https://trunkbaseddevelopment.com/) as our branching model. + +* `develop` branch is used for development. Each comment to `develop` branc goes through unit tests and model regression tests. +* `release/[version]` branch is used for each release. Release branch is used for tests, bug fix and evetual release. +* `master` branch as been deprecated for historical reasons + +* Developer's feature branch。 + * Developer's feature branch should sync with upstream `develop` branch. + * Developer's feature branch should be forked from upstream `develop` branch. + * After feature branch is ready, create a `Pull Request` against the Paddle repo and go through code review. + * In the review process, develop modify codes and push to their own feature branch. ## PaddlePaddle Regression Test List +TODO + ### All Chapters of PaddlePaddle Book We need to guarantee that all the chapters of PaddlePaddle Book can run correctly. Including diff --git a/paddle/fluid/API.spec b/paddle/fluid/API.spec index ae5f30e431aba4cae04b0fb35f00bce84f18de33..842fde1ec5f16aeec28a790f1869eaed64e3516c 100644 --- a/paddle/fluid/API.spec +++ b/paddle/fluid/API.spec @@ -100,7 +100,7 @@ paddle.fluid.layers.gru_unit ArgSpec(args=['input', 'hidden', 'size', 'param_att paddle.fluid.layers.linear_chain_crf ArgSpec(args=['input', 'label', 'param_attr'], varargs=None, keywords=None, defaults=(None,)) paddle.fluid.layers.crf_decoding ArgSpec(args=['input', 'param_attr', 'label'], varargs=None, keywords=None, defaults=(None,)) paddle.fluid.layers.cos_sim ArgSpec(args=['X', 'Y'], varargs=None, keywords=None, defaults=None) -paddle.fluid.layers.cross_entropy ArgSpec(args=['input', 'label', 'soft_label'], varargs=None, keywords=None, defaults=(False,)) +paddle.fluid.layers.cross_entropy ArgSpec(args=['input', 'label', 'soft_label', 'ignore_index'], varargs=None, keywords=None, defaults=(False, -100)) paddle.fluid.layers.square_error_cost ArgSpec(args=['input', 'label'], varargs=None, keywords=None, defaults=None) paddle.fluid.layers.chunk_eval ArgSpec(args=['input', 'label', 'chunk_scheme', 'num_chunk_types', 'excluded_chunk_types'], varargs=None, keywords=None, defaults=(None,)) paddle.fluid.layers.sequence_conv ArgSpec(args=['input', 'num_filters', 'filter_size', 'filter_stride', 'padding', 'bias_attr', 'param_attr', 'act'], varargs=None, keywords=None, defaults=(3, 1, None, None, None, None)) @@ -142,7 +142,7 @@ paddle.fluid.layers.beam_search ArgSpec(args=['pre_ids', 'pre_scores', 'ids', 's paddle.fluid.layers.row_conv ArgSpec(args=['input', 'future_context_size', 'param_attr', 'act'], varargs=None, keywords=None, defaults=(None, None)) paddle.fluid.layers.multiplex ArgSpec(args=['inputs', 'index'], varargs=None, keywords=None, defaults=None) paddle.fluid.layers.layer_norm ArgSpec(args=['input', 'scale', 'shift', 'begin_norm_axis', 'epsilon', 'param_attr', 'bias_attr', 'act', 'name'], varargs=None, keywords=None, defaults=(True, True, 1, 1e-05, None, None, None, None)) -paddle.fluid.layers.softmax_with_cross_entropy ArgSpec(args=['logits', 'label', 'soft_label'], varargs=None, keywords=None, defaults=(False,)) +paddle.fluid.layers.softmax_with_cross_entropy ArgSpec(args=['logits', 'label', 'soft_label', 'ignore_index'], varargs=None, keywords=None, defaults=(False, -100)) paddle.fluid.layers.smooth_l1 ArgSpec(args=['x', 'y', 'inside_weight', 'outside_weight', 'sigma'], varargs=None, keywords=None, defaults=(None, None, None)) paddle.fluid.layers.one_hot ArgSpec(args=['input', 'depth'], varargs=None, keywords=None, defaults=None) paddle.fluid.layers.autoincreased_step_counter ArgSpec(args=['counter_name', 'begin', 'step'], varargs=None, keywords=None, defaults=(None, 1, 1)) diff --git a/paddle/fluid/framework/CMakeLists.txt b/paddle/fluid/framework/CMakeLists.txt index cc7938b2ac07f11ceb7f33a2e37380d1e2ed2072..d998109df21f585bc4905e00e59fe07247fd3f5e 100644 --- a/paddle/fluid/framework/CMakeLists.txt +++ b/paddle/fluid/framework/CMakeLists.txt @@ -56,9 +56,9 @@ else() cc_test(mixed_vector_test SRCS mixed_vector_test.cc DEPS place memory device_context tensor) endif() if (NOT WIN32) -cc_library(lod_tensor SRCS lod_tensor.cc DEPS ddim place tensor framework_proto recordio) +cc_library(lod_tensor SRCS lod_tensor.cc DEPS ddim place tensor framework_proto recordio version) else() -cc_library(lod_tensor SRCS lod_tensor.cc DEPS ddim place tensor framework_proto) +cc_library(lod_tensor SRCS lod_tensor.cc DEPS ddim place tensor framework_proto version) endif (NOT WIN32) cc_test(lod_tensor_test SRCS lod_tensor_test.cc DEPS lod_tensor memory) @@ -116,7 +116,11 @@ cc_library(operator SRCS operator.cc DEPS op_info device_context tensor scope gl endif(NOT WIN32) cc_test(operator_test SRCS operator_test.cc DEPS operator op_registry device_context) -cc_library(proto_desc SRCS var_desc.cc op_desc.cc block_desc.cc program_desc.cc DEPS shape_inference op_info operator glog) + +cc_library(version SRCS version.cc) +cc_test(version_test SRCS version_test.cc DEPS version) + +cc_library(proto_desc SRCS var_desc.cc op_desc.cc block_desc.cc program_desc.cc DEPS shape_inference op_info operator glog version) cc_library(op_registry SRCS op_registry.cc DEPS op_proto_maker op_info operator glog proto_desc) nv_test(op_registry_test SRCS op_registry_test.cc DEPS op_registry) diff --git a/paddle/fluid/framework/details/multi_devices_graph_pass.cc b/paddle/fluid/framework/details/multi_devices_graph_pass.cc index 7a99169849debcbc57d6f197b36c5045b211f3ef..d44ebbae4d4be9c79e629303805c94030b8879db 100644 --- a/paddle/fluid/framework/details/multi_devices_graph_pass.cc +++ b/paddle/fluid/framework/details/multi_devices_graph_pass.cc @@ -442,8 +442,7 @@ std::unique_ptr MultiDevSSAGraphBuilder::ApplyImpl( use_gpu = nccl_ctxs_ != nullptr; #endif - if (use_gpu || - strategy_.reduce_ == BuildStrategy::ReduceStrategy::kAllReduce) { + if (use_gpu && strategy_.reduce_ == BuildStrategy::ReduceStrategy::kReduce) { // Insert BCast Ops for (size_t dev_id = 0; dev_id < bcast_var_name_set.size(); ++dev_id) { auto &to_bcast_set = bcast_var_name_set[dev_id]; diff --git a/paddle/fluid/framework/framework.proto b/paddle/fluid/framework/framework.proto index c6588435819a982166cf2d2368a82b4402fdc2bc..460401df5473f8650f450a2bd247a703d91b6048 100644 --- a/paddle/fluid/framework/framework.proto +++ b/paddle/fluid/framework/framework.proto @@ -16,6 +16,13 @@ syntax = "proto2"; option optimize_for = LITE_RUNTIME; package paddle.framework.proto; +// Any incompatible changes to ProgramDesc and its dependencies should +// raise the version defined version.h. +// +// Serailization and Deserialization codes should be modified in a way +// that supports old versions following the version and compatibility policy. +message Version { optional int64 version = 1 [ default = 0 ]; } + enum AttrType { INT = 0; FLOAT = 1; @@ -180,4 +187,8 @@ message BlockDesc { // for more details. // TODO(panyx0718): A model can have multiple programs. Need a // way to distinguish them. Maybe ID or name? -message ProgramDesc { repeated BlockDesc blocks = 1; } +message ProgramDesc { + repeated BlockDesc blocks = 1; + + optional Version version = 2; +} diff --git a/paddle/fluid/framework/ir/CMakeLists.txt b/paddle/fluid/framework/ir/CMakeLists.txt index 78387c407398b58d3fab6eab12445c4198f809b5..7004f484a9975124750fad4cb8f773342082b514 100644 --- a/paddle/fluid/framework/ir/CMakeLists.txt +++ b/paddle/fluid/framework/ir/CMakeLists.txt @@ -19,7 +19,7 @@ function(pass_library TARGET DEST) endfunction() cc_library(node SRCS node.cc DEPS proto_desc) -cc_library(graph SRCS graph.cc DEPS node) +cc_library(graph SRCS graph.cc DEPS node pretty_log) cc_library(graph_helper SRCS graph_helper.cc DEPS graph) cc_library(pass SRCS pass.cc DEPS graph node graph_helper) cc_library(graph_traits SRCS graph_traits.cc DEPS graph) @@ -28,6 +28,9 @@ cc_library(graph_pattern_detector SRCS graph_pattern_detector.cc DEPS graph grap pass_library(graph_to_program_pass base) pass_library(graph_viz_pass base) pass_library(fc_fuse_pass inference) +if(WITH_MKLDNN) + pass_library(conv_relu_mkldnn_fuse_pass inference) +endif() pass_library(attention_lstm_fuse_pass inference) pass_library(infer_clean_graph_pass inference) pass_library(fc_lstm_fuse_pass inference) @@ -42,3 +45,6 @@ cc_test(graph_helper_test SRCS graph_helper_test.cc DEPS graph graph_helper op_r cc_test(graph_to_program_pass_test SRCS graph_to_program_pass_test.cc DEPS graph_to_program_pass) cc_test(test_graph_pattern_detector SRCS graph_pattern_detector_tester.cc DEPS graph_pattern_detector) cc_test(test_fc_fuse_pass SRCS fc_fuse_pass_tester.cc DEPS fc_fuse_pass framework_proto) +if(WITH_MKLDNN) + cc_test(test_conv_relu_mkldnn_fuse_pass SRCS conv_relu_mkldnn_fuse_pass_tester.cc DEPS conv_relu_mkldnn_fuse_pass) +endif() diff --git a/paddle/fluid/framework/ir/conv_relu_mkldnn_fuse_pass.cc b/paddle/fluid/framework/ir/conv_relu_mkldnn_fuse_pass.cc new file mode 100644 index 0000000000000000000000000000000000000000..4408cb45acb3d46e1addf5c25c238af50e5f5e5f --- /dev/null +++ b/paddle/fluid/framework/ir/conv_relu_mkldnn_fuse_pass.cc @@ -0,0 +1,90 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/framework/ir/conv_relu_mkldnn_fuse_pass.h" +#include +#include +#include "paddle/fluid/platform/enforce.h" + +namespace paddle { +namespace framework { +namespace ir { + +std::unique_ptr ConvReLUFusePass::ApplyImpl( + std::unique_ptr graph) const { + PADDLE_ENFORCE(graph.get()); + FusePassBase::Init("conv_relu_mkldnn_fuse", graph.get()); + + std::unordered_set nodes2delete; + + GraphPatternDetector gpd; + auto* conv_input = gpd.mutable_pattern() + ->NewNode("conv_relu_mkldnn_fuse/conv_input") + ->AsInput() + ->assert_is_op_input("conv2d", "Input"); + patterns::ConvReLU conv_relu_pattern(gpd.mutable_pattern(), + "conv_relu_mkldnn_fuse"); + conv_relu_pattern(conv_input); + + int found_conv_relu_count = 0; + auto handler = [&](const GraphPatternDetector::subgraph_t& subgraph, + Graph* g) { + VLOG(4) << "handle ConvReLU fuse"; + GET_IR_NODE_FROM_SUBGRAPH(conv_weight, conv_weight, + conv_relu_pattern); // Filter + GET_IR_NODE_FROM_SUBGRAPH(conv_bias, conv_bias, conv_relu_pattern); // Bias + GET_IR_NODE_FROM_SUBGRAPH(conv_out, conv_out, conv_relu_pattern); // tmp + GET_IR_NODE_FROM_SUBGRAPH(conv, conv, conv_relu_pattern); // CONV op + GET_IR_NODE_FROM_SUBGRAPH(relu_out, relu_out, conv_relu_pattern); // Out + GET_IR_NODE_FROM_SUBGRAPH(relu, relu, conv_relu_pattern); // ReLU op + + // Create an ConvReLU Node. + OpDesc desc; + std::string conv_relu_i_in = subgraph.at(conv_input)->Name(); + std::string conv_relu_w_in = conv_weight->Name(); + std::string conv_relu_b_in = conv_bias->Name(); + std::string conv_relu_out = relu_out->Name(); + desc.SetInput("Input", std::vector({conv_relu_i_in})); + desc.SetInput("Filter", std::vector({conv_relu_w_in})); + desc.SetInput("Bias", std::vector({conv_relu_b_in})); + desc.SetOutput("Out", std::vector({conv_relu_out})); + desc.SetType("conv2d"); + for (auto& attr : conv->Op()->GetAttrMap()) { + desc.SetAttr(attr.first, attr.second); + } + desc.SetAttr("fuse_relu", true); + auto conv_relu_node = g->CreateOpNode(&desc); // OpDesc will be copied. + GraphSafeRemoveNodes(graph.get(), {conv, relu, conv_out}); + + PADDLE_ENFORCE(subgraph.count(conv_input)); + IR_NODE_LINK_TO(subgraph.at(conv_input), conv_relu_node); + IR_NODE_LINK_TO(conv_weight, conv_relu_node); + IR_NODE_LINK_TO(conv_bias, conv_relu_node); + IR_NODE_LINK_TO(conv_relu_node, relu_out); + + found_conv_relu_count++; + }; + + gpd(graph.get(), handler); + + AddStatis(found_conv_relu_count); + return graph; +} + +} // namespace ir +} // namespace framework +} // namespace paddle + +REGISTER_PASS(conv_relu_mkldnn_fuse_pass, + paddle::framework::ir::ConvReLUFusePass); diff --git a/paddle/fluid/framework/ir/conv_relu_mkldnn_fuse_pass.h b/paddle/fluid/framework/ir/conv_relu_mkldnn_fuse_pass.h new file mode 100644 index 0000000000000000000000000000000000000000..b5de0d548713772e7ad41cfb6d8b3e9460683efb --- /dev/null +++ b/paddle/fluid/framework/ir/conv_relu_mkldnn_fuse_pass.h @@ -0,0 +1,39 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "paddle/fluid/framework/ir/fuse_pass_base.h" +#include "paddle/fluid/framework/ir/graph.h" +#include "paddle/fluid/framework/ir/graph_pattern_detector.h" +#include "paddle/fluid/framework/ir/pass.h" + +namespace paddle { +namespace framework { +namespace ir { + +/* + * Fuse the CONV and ReLU to a ConvReLUOp. + */ +class ConvReLUFusePass : public FusePassBase { + public: + virtual ~ConvReLUFusePass() {} + + protected: + std::unique_ptr ApplyImpl(std::unique_ptr graph) const; +}; + +} // namespace ir +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/ir/conv_relu_mkldnn_fuse_pass_tester.cc b/paddle/fluid/framework/ir/conv_relu_mkldnn_fuse_pass_tester.cc new file mode 100644 index 0000000000000000000000000000000000000000..82b5fa1886098ca3b19c147c307d3f2fc3ba03d6 --- /dev/null +++ b/paddle/fluid/framework/ir/conv_relu_mkldnn_fuse_pass_tester.cc @@ -0,0 +1,108 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/framework/ir/conv_relu_mkldnn_fuse_pass.h" + +#include + +namespace paddle { +namespace framework { +namespace ir { + +void SetOp(ProgramDesc* prog, const std::string& type, + const std::vector& inputs, + const std::vector& outputs) { + auto* op = prog->MutableBlock(0)->AppendOp(); + op->SetType(type); + if (type == "conv2d") { + op->SetAttr("use_mkldnn", true); + op->SetInput("Input", {inputs[0]}); + op->SetInput("Filter", {inputs[1]}); + op->SetInput("Bias", {inputs[2]}); + } else if (type == "relu") { + op->SetInput("X", inputs); + } + op->SetOutput("Out", outputs); +} + +// a->OP0->b +// b->OP1->c +// (c, weights, bias)->conv->f +// (f)->relu->g +ProgramDesc BuildProgramDesc() { + ProgramDesc prog; + for (auto& v : + std::vector({"a", "b", "c", "weights", "bias", "f", "g"})) { + auto* var = prog.MutableBlock(0)->Var(v); + var->SetType(proto::VarType::SELECTED_ROWS); + if (v == "weights" || v == "bias") { + var->SetPersistable(true); + } + } + + SetOp(&prog, "OP0", std::vector({"a"}), + std::vector({"b"})); + SetOp(&prog, "OP1", std::vector({"b"}), + std::vector({"c"})); + SetOp(&prog, "conv2d", std::vector({"c", "weights", "bias"}), + std::vector({"f"})); + SetOp(&prog, "relu", std::vector({"f"}), + std::vector({"g"})); + + return prog; +} + +TEST(ConvReLUFusePass, basic) { + auto prog = BuildProgramDesc(); + + std::unique_ptr graph(new ir::Graph(prog)); + + auto pass = PassRegistry::Instance().Get("conv_relu_mkldnn_fuse_pass"); + + int original_nodes_num = graph->Nodes().size(); + + graph = pass->Apply(std::move(graph)); + + int current_nodes_num = graph->Nodes().size(); + + // Remove 3 Nodes: CONV, RELU, conv_out + // Add 1 Node: ConvReLU + EXPECT_EQ(original_nodes_num - 2, current_nodes_num); + + // Assert conv_relu op in newly generated graph + int conv_relu_count = 0; + + for (auto* node : graph->Nodes()) { + if (node->IsOp() && node->Op()->Type() == "conv2d") { + if (node->Op()->HasAttr("use_mkldnn")) { + bool use_mkldnn = boost::get(node->Op()->GetAttr("use_mkldnn")); + if (use_mkldnn) { + if (node->Op()->HasAttr("fuse_relu")) { + bool fuse_relu = boost::get(node->Op()->GetAttr("fuse_relu")); + if (fuse_relu) { + ++conv_relu_count; + } + } + } + } + } + } + EXPECT_EQ(conv_relu_count, 1); +} + +} // namespace ir +} // namespace framework +} // namespace paddle + +USE_PASS(conv_relu_mkldnn_fuse_pass); diff --git a/paddle/fluid/framework/ir/fc_lstm_fuse_pass.cc b/paddle/fluid/framework/ir/fc_lstm_fuse_pass.cc index f7fda873574a0f8b10251d4fa6b604a9312ad7f9..aa95d3e9f6c8221f6e48d192b73ad5135539dc75 100644 --- a/paddle/fluid/framework/ir/fc_lstm_fuse_pass.cc +++ b/paddle/fluid/framework/ir/fc_lstm_fuse_pass.cc @@ -51,7 +51,7 @@ int BuildFusion(Graph* graph, const std::string& name_scope, Scope* scope, if (with_fc_bias) { // Add FC-bias with LSTM-bias and create a new weight PADDLE_ENFORCE(scope); - const std::string& new_bias_var = name_scope + "_bias.new"; + const std::string& new_bias_var = patterns::UniqueKey("NewBias"); auto* bias_var = scope->Var(new_bias_var); PADDLE_ENFORCE(bias_var); auto* bias_tensor = bias_var->GetMutable(); @@ -120,7 +120,6 @@ int BuildFusion(Graph* graph, const std::string& name_scope, Scope* scope, auto handler = [&](const GraphPatternDetector::subgraph_t& subgraph, Graph* g) { - GET_IR_NODE_FROM_SUBGRAPH(lstm, lstm, lstm_pattern); GET_IR_NODE_FROM_SUBGRAPH(Weight, Weight, lstm_pattern); GET_IR_NODE_FROM_SUBGRAPH(Bias, Bias, lstm_pattern); @@ -136,7 +135,7 @@ int BuildFusion(Graph* graph, const std::string& name_scope, Scope* scope, fc_bias); // Remove unneeded nodes. std::unordered_set marked_nodes( - {mul, lstm, elementwise_add}); + {mul, lstm, elementwise_add, fc_bias}); GraphSafeRemoveNodes(graph, marked_nodes); } else { GET_IR_NODE_FROM_SUBGRAPH(fc_out, mul_out, fc_pattern); diff --git a/paddle/fluid/framework/ir/graph_pattern_detector.cc b/paddle/fluid/framework/ir/graph_pattern_detector.cc index fc7feca567e7a0f623ada77af189ef033b44fc53..11d5998aafe1f325b94ef1a5ea1c13c72c13f5c9 100644 --- a/paddle/fluid/framework/ir/graph_pattern_detector.cc +++ b/paddle/fluid/framework/ir/graph_pattern_detector.cc @@ -21,12 +21,17 @@ #include "paddle/fluid/framework/ir/graph_traits.h" #include "paddle/fluid/framework/ir/graph_viz_pass.h" #include "paddle/fluid/platform/enforce.h" +#include "paddle/fluid/string/pretty_log.h" #include "paddle/fluid/string/printf.h" namespace paddle { namespace framework { namespace ir { +using string::PrettyLogEndl; +using string::PrettyLog; +using string::Style; + size_t PDPattern::id_ = 0UL; PDNode* PDPattern::NewNode(const std::string& name) { @@ -83,7 +88,7 @@ void GraphPatternDetector::operator()(Graph* graph, ValidateByNodeRole(&subgraphs); if (subgraphs.empty()) return; - LOG(INFO) << "detect " << subgraphs.size() << " subgraph matches the pattern"; + PrettyLogEndl(Style::detail(), "--- detect %d subgraphs", subgraphs.size()); int id = 0; for (auto& g : subgraphs) { VLOG(3) << "optimizing #" << id++ << " subgraph"; @@ -517,6 +522,39 @@ bool VarLinksFromOp(Node* node, const std::string& op_type) { return false; } +PDNode* patterns::ConvReLU::operator()( + paddle::framework::ir::PDNode* conv_input) { + // Create Operators + conv_input->assert_is_op_input("conv2d", "Input"); + auto* conv_op = pattern->NewNode(conv_repr())->assert_is_op("conv2d"); + auto* relu_op = pattern->NewNode(relu_repr())->assert_is_op("relu"); + // Create variables + // Filter + auto* conv_weight_var = pattern->NewNode(conv_weight_repr()) + ->AsInput() + ->assert_is_persistable_var() + ->assert_is_op_input("conv2d", "Filter"); + // Bias + auto* conv_bias_var = pattern->NewNode(conv_bias_repr()) + ->AsInput() + ->assert_is_persistable_var() + ->assert_is_op_input("conv2d", "Bias"); + // intermediate variable, will be removed in the IR after fuse. + auto* conv_out_var = pattern->NewNode(conv_out_repr()) + ->AsIntermediate() + ->assert_is_only_output_of_op("conv2d") + ->assert_is_op_input("relu"); + // output + auto* relu_out_var = pattern->NewNode(relu_out_repr()) + ->AsOutput() + ->assert_is_op_output("relu"); + + conv_op->LinksFrom({conv_input, conv_weight_var, conv_bias_var}) + .LinksTo({conv_out_var}); + relu_op->LinksFrom({conv_out_var}).LinksTo({relu_out_var}); + return relu_out_var; +} + PDNode* patterns::FC::operator()(paddle::framework::ir::PDNode* x, bool with_bias) { // Create shared nodes. diff --git a/paddle/fluid/framework/ir/graph_pattern_detector.h b/paddle/fluid/framework/ir/graph_pattern_detector.h index 57482a07b607ba1d9fa06a5f325f60ba58dce307..371384dc56eec91db1f621c0ebb65113e7a5a5cc 100644 --- a/paddle/fluid/framework/ir/graph_pattern_detector.h +++ b/paddle/fluid/framework/ir/graph_pattern_detector.h @@ -360,6 +360,28 @@ struct PatternBase { size_t id_; }; +// CONV with ReLU +// op: conv + relu +// named nodes: +// conv_input, conv_weight, +// conv_bias, conv_out, conv, +// relu_out, relu +struct ConvReLU : public PatternBase { + ConvReLU(PDPattern* pattern, const std::string& name_scope) + : PatternBase(pattern, name_scope, "conv_relu") {} + + PDNode* operator()(PDNode* conv_input); + + // declare operator node's name + PATTERN_DECL_NODE(conv); + PATTERN_DECL_NODE(relu); + // declare variable node's name + PATTERN_DECL_NODE(conv_weight); + PATTERN_DECL_NODE(conv_bias); + PATTERN_DECL_NODE(conv_out); + PATTERN_DECL_NODE(relu_out); +}; + // FC with bias // op: mul + elementwise_add // named nodes: diff --git a/paddle/fluid/framework/lod_tensor.cc b/paddle/fluid/framework/lod_tensor.cc index adeb26e4e78693eb9760ec1e12e4b71ba3115d5b..1e7da9a69c7cbf8c13306656599a759515802b76 100644 --- a/paddle/fluid/framework/lod_tensor.cc +++ b/paddle/fluid/framework/lod_tensor.cc @@ -21,6 +21,7 @@ limitations under the License. */ #include "paddle/fluid/framework/framework.pb.h" #include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/var_type.h" +#include "paddle/fluid/framework/version.h" #include "paddle/fluid/memory/memcpy.h" #include "paddle/fluid/memory/memory.h" @@ -251,8 +252,8 @@ void AppendLoD(LoD *lod, const LoD &lod_length) { void SerializeToStream(std::ostream &os, const LoDTensor &tensor, const platform::DeviceContext &dev_ctx) { { // the 1st field, uint32_t version for LoDTensor - constexpr uint32_t version = 0; - os.write(reinterpret_cast(&version), sizeof(version)); + os.write(reinterpret_cast(&kCurTensorVersion), + sizeof(kCurTensorVersion)); } { // the 2st field, LoD information @@ -281,6 +282,8 @@ void DeserializeFromStream(std::istream &is, LoDTensor *tensor, // the 1st field, unit32_t version for LoDTensor uint32_t version; is.read(reinterpret_cast(&version), sizeof(version)); + PADDLE_ENFORCE(framework::IsTensorVersionSupported(version), + "tensor version %u is not supported.", version); PADDLE_ENFORCE_EQ(version, 0U, "Only version 0 is supported"); } { diff --git a/paddle/fluid/framework/operator.cc b/paddle/fluid/framework/operator.cc index d58d6e4f3e684b97fcc1121e51355bdf3aae3fce..b7fae7171a57666a8fb4613a7cbe3aa15997b638 100644 --- a/paddle/fluid/framework/operator.cc +++ b/paddle/fluid/framework/operator.cc @@ -464,35 +464,35 @@ class RuntimeInferShapeContext : public InferShapeContext { : op_(op), scope_(scope) {} bool HasInput(const std::string& name) const override { - if (!op_.HasInputs(name)) { + // has only one input + const auto& ins = op_.Inputs(); + auto it = ins.find(name); + if (it == ins.end()) { return false; } - auto& ins = Inputs(name); - size_t length = ins.size(); - if (length == 0) { + const auto& in = it->second; + if (in.size() == 0 || in[0] == kEmptyVarName) { return false; } - PADDLE_ENFORCE_EQ(length, 1UL, + PADDLE_ENFORCE_EQ(in.size(), 1UL, "Input %s should not have more than one inputs", name); - auto ipt = ins[0]; - auto* var = ipt == kEmptyVarName ? nullptr : scope_.FindVar(ipt); - return var != nullptr; + return scope_.FindVar(in[0]) != nullptr; } bool HasOutput(const std::string& name) const override { - if (!op_.HasOutputs(name)) { + // has only one output + const auto& outs = op_.Outputs(); + auto it = outs.find(name); + if (it == outs.end()) { return false; } - auto& outs = Outputs(name); - size_t length = outs.size(); - if (length == 0) { + const auto& out = it->second; + if (out.size() == 0 || out[0] == kEmptyVarName) { return false; } - PADDLE_ENFORCE_EQ(length, 1UL, - "Output %s should not have more than one inputs", name); - auto ipt = outs[0]; - auto* var = ipt == kEmptyVarName ? nullptr : scope_.FindVar(ipt); - return var != nullptr; + PADDLE_ENFORCE_EQ(out.size(), 1UL, + "Output %s should not have more than one outputs", name); + return scope_.FindVar(out[0]) != nullptr; } bool HasInputs(const std::string& name) const override { diff --git a/paddle/fluid/framework/parallel_executor.cc b/paddle/fluid/framework/parallel_executor.cc index 81cb24bdda6b87a3d708cf5047dce05d5020a0d5..5b8c75a93de2ddd8f7260d2191c22a5945b3d2d9 100644 --- a/paddle/fluid/framework/parallel_executor.cc +++ b/paddle/fluid/framework/parallel_executor.cc @@ -352,7 +352,10 @@ void ParallelExecutor::FeedAndSplitTensorIntoLocalScopes( ParallelExecutor::~ParallelExecutor() { if (member_->own_local_scope_) { for (size_t i = 1; i < member_->local_scopes_.size(); ++i) { - member_->global_scope_->DeleteScope(member_->local_scopes_[i]); + Scope *local_scope = member_->local_scopes_[i]; + if (member_->global_scope_->HasKid(local_scope)) { + member_->global_scope_->DeleteScope(local_scope); + } } } } diff --git a/paddle/fluid/framework/program_desc.cc b/paddle/fluid/framework/program_desc.cc index a63944eaee6132c1082947fddcad4e0d72e26df1..589905828f7793c614c0fe12259e9ba5ab11ceac 100644 --- a/paddle/fluid/framework/program_desc.cc +++ b/paddle/fluid/framework/program_desc.cc @@ -15,6 +15,7 @@ limitations under the License. */ #include "paddle/fluid/framework/program_desc.h" #include "paddle/fluid/framework/block_desc.h" #include "paddle/fluid/framework/feed_fetch_type.h" +#include "paddle/fluid/framework/version.h" namespace paddle { namespace framework { @@ -38,7 +39,10 @@ proto::ProgramDesc *ProgramDesc::Proto() { return &desc_; } +int64_t ProgramDesc::Version() const { return desc_.version().version(); } + ProgramDesc::ProgramDesc() { + desc_.mutable_version()->set_version(kCurProgramVersion); auto *block = desc_.mutable_blocks()->Add(); block->set_idx(kRootBlockIndex); block->set_parent_idx(kNoneBlockIndex); diff --git a/paddle/fluid/framework/program_desc.h b/paddle/fluid/framework/program_desc.h index a0e81cade18c0ca5eb1b98fee8325ae2d917d1a2..2ec0e9d7a0969d44f88c7407bfb8cd4646530147 100644 --- a/paddle/fluid/framework/program_desc.h +++ b/paddle/fluid/framework/program_desc.h @@ -57,6 +57,8 @@ class ProgramDesc { proto::ProgramDesc *Proto(); + int64_t Version() const; + // The output variable of feed_op is referenced as feed_target. // This function is used to collect the output variable's name of all // feed_ops. diff --git a/paddle/fluid/framework/program_desc_test.cc b/paddle/fluid/framework/program_desc_test.cc index 925ea98dbe62e4da91689f6e56c135e51c24a8a3..7e689a37da8a16bd9b1ac6650b9322d2eb5a2c85 100644 --- a/paddle/fluid/framework/program_desc_test.cc +++ b/paddle/fluid/framework/program_desc_test.cc @@ -87,8 +87,17 @@ TEST(ProgramDesc, copy_ctor) { ASSERT_EQ(op_origin->Inputs(), op_copy->Inputs()); ASSERT_EQ(op_origin->Outputs(), op_copy->Outputs()); - ASSERT_EQ(op_copy->Proto()->SerializeAsString(), - op_origin->Proto()->SerializeAsString()); + ASSERT_EQ(op_origin->Proto()->attrs().size(), + op_copy->Proto()->attrs().size()); + for (auto it = op_origin->Proto()->attrs().begin(); + it != op_origin->Proto()->attrs().end(); ++it) { + for (auto it_2 = op_copy->Proto()->attrs().begin(); + it_2 != op_copy->Proto()->attrs().end(); ++it_2) { + if (it->name() == it_2->name()) { + ASSERT_TRUE(it_2->SerializeAsString() == it->SerializeAsString()); + } + } + } if (op->Type() == "op_with_subblock") { ASSERT_EQ(1, op->GetBlockAttrId("sub_block")); diff --git a/paddle/fluid/framework/rw_lock.h b/paddle/fluid/framework/rw_lock.h index a068d3543d9d2abec203f86362a8be5ba135d04d..da163835e8652ae479121bd67f2eed77332b2740 100644 --- a/paddle/fluid/framework/rw_lock.h +++ b/paddle/fluid/framework/rw_lock.h @@ -56,5 +56,76 @@ struct RWLock { }; #endif +class RWLockGuard { + public: + enum Status { kUnLock, kWRLock, kRDLock }; + + RWLockGuard(RWLock* rw_lock, Status init_status) + : lock_(rw_lock), status_(Status::kUnLock) { + switch (init_status) { + case Status::kRDLock: { + RDLock(); + break; + } + case Status::kWRLock: { + WRLock(); + break; + } + case Status::kUnLock: { + break; + } + } + } + + void WRLock() { + switch (status_) { + case Status::kUnLock: { + lock_->WRLock(); + status_ = Status::kWRLock; + break; + } + case Status::kWRLock: { + break; + } + case Status::kRDLock: { + PADDLE_THROW( + "Please unlock read lock first before invoking write lock."); + break; + } + } + } + + void RDLock() { + switch (status_) { + case Status::kUnLock: { + lock_->RDLock(); + status_ = Status::kRDLock; + break; + } + case Status::kRDLock: { + break; + } + case Status::kWRLock: { + PADDLE_THROW( + "Please unlock write lock first before invoking read lock."); + break; + } + } + } + + void UnLock() { + if (status_ != Status::kUnLock) { + lock_->UNLock(); + status_ = Status::kUnLock; + } + } + + ~RWLockGuard() { UnLock(); } + + private: + RWLock* lock_; + Status status_; +}; + } // namespace framework } // namespace paddle diff --git a/paddle/fluid/framework/scope.cc b/paddle/fluid/framework/scope.cc index 50f374e3703a97f6c1fdb4b14fdeb0b603f9ac86..2be655b89a4caf2bf9874dcab6bc0bdb2856a026 100644 --- a/paddle/fluid/framework/scope.cc +++ b/paddle/fluid/framework/scope.cc @@ -72,6 +72,12 @@ void Scope::DropKids() { kids_.clear(); } +bool Scope::HasKid(const Scope* scope) const { + std::unique_lock lock(mutex_); + auto it = std::find(this->kids_.begin(), this->kids_.end(), scope); + return it != this->kids_.end(); +} + std::vector Scope::LocalVarNames() const { std::unique_lock lock(mutex_); std::vector known_vars; diff --git a/paddle/fluid/framework/scope.h b/paddle/fluid/framework/scope.h index e246241c0abfbc7bdcaf38d073cc58fc36a4f737..b6165a595d537c314a95685e8b1edbc42e387ab7 100644 --- a/paddle/fluid/framework/scope.h +++ b/paddle/fluid/framework/scope.h @@ -71,6 +71,9 @@ class Scope { /// Drop all kids scopes belonged to this scope. void DropKids(); + /// Find if a scope exists in the kid scopes + bool HasKid(const Scope* scope) const; + // enumerate all the variables current contains. std::vector LocalVarNames() const; diff --git a/paddle/fluid/framework/version.cc b/paddle/fluid/framework/version.cc new file mode 100644 index 0000000000000000000000000000000000000000..81c0392bf3cc7378cec06a9de3ae81f2b221ecec --- /dev/null +++ b/paddle/fluid/framework/version.cc @@ -0,0 +1,36 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/framework/version.h" +#include + +namespace paddle { +namespace framework { +bool IsProgramVersionSupported(int64_t version) { + static int num_supported = + sizeof(kSupportedProgramVersion) / sizeof(kSupportedProgramVersion[0]); + return std::find(kSupportedProgramVersion, + kSupportedProgramVersion + num_supported, + version) != kSupportedProgramVersion + num_supported; +} + +bool IsTensorVersionSupported(uint32_t version) { + static int num_supported = + sizeof(kSupportedTensorVersion) / sizeof(kSupportedTensorVersion[0]); + return std::find(kSupportedTensorVersion, + kSupportedTensorVersion + num_supported, + version) != kSupportedTensorVersion + num_supported; +} +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/version.h b/paddle/fluid/framework/version.h new file mode 100644 index 0000000000000000000000000000000000000000..9945bc58c69df8456ff3d1aa0c777970bdbdbf98 --- /dev/null +++ b/paddle/fluid/framework/version.h @@ -0,0 +1,47 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include + +#pragma once + +namespace paddle { +namespace framework { + +// Note: +// Program and Tensor that pass the IsXXXVersionSupported should +// be supported by the current codes. Otherwise, it's a compatibility +// bug. + +// The program version the current codes generate. +constexpr int64_t kCurProgramVersion = 0; + +// The program version that was generated by previous or current codes +// and supported by current codes. +constexpr int64_t kSupportedProgramVersion[] = {0}; + +// Due to historical reasons, tensor version use uint32_t. +// The tensor version the current codes generate. +constexpr uint32_t kCurTensorVersion = 0; + +// The tensor version that was generated by previous or current codes +// and supported by current codes. +constexpr uint32_t kSupportedTensorVersion[] = {0}; + +bool IsProgramVersionSupported(int64_t version); + +bool IsTensorVersionSupported(uint32_t version); + +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/version_test.cc b/paddle/fluid/framework/version_test.cc new file mode 100644 index 0000000000000000000000000000000000000000..e8c5f256000522af976bbf487741a586f1abc439 --- /dev/null +++ b/paddle/fluid/framework/version_test.cc @@ -0,0 +1,30 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/framework/version.h" +#include "gtest/gtest.h" + +namespace paddle { +namespace framework { +TEST(Version, Basic) { + EXPECT_TRUE(IsProgramVersionSupported(0)); + EXPECT_FALSE(IsProgramVersionSupported(1)); + EXPECT_FALSE(IsProgramVersionSupported(-1)); + + EXPECT_TRUE(IsTensorVersionSupported(0)); + EXPECT_FALSE(IsTensorVersionSupported(1)); + EXPECT_FALSE(IsTensorVersionSupported(-1)); +} +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/inference/CMakeLists.txt b/paddle/fluid/inference/CMakeLists.txt index 2006e3b24f71d0ae32b4e2ae34f1a1e4d3a82f91..efb91bcf75a3cb99a67d5a3251b1d42fc4b04170 100644 --- a/paddle/fluid/inference/CMakeLists.txt +++ b/paddle/fluid/inference/CMakeLists.txt @@ -55,6 +55,7 @@ if(NOT APPLE) endif() if(WITH_TESTING) - # both tests/book and analysis depends the models that generated by python/paddle/fluid/tests/book + # tests/book depends the models that generated by python/paddle/fluid/tests/book add_subdirectory(tests/book) + add_subdirectory(tests/api) endif() diff --git a/paddle/fluid/inference/analysis/CMakeLists.txt b/paddle/fluid/inference/analysis/CMakeLists.txt index 11a7509feb02a806e1e173bfb8bd7764f94d3457..c2a1c6634bd8f8de0796456e91cb3c530d4c6823 100644 --- a/paddle/fluid/inference/analysis/CMakeLists.txt +++ b/paddle/fluid/inference/analysis/CMakeLists.txt @@ -1,6 +1,6 @@ cc_library(ir_pass_manager SRCS ir_pass_manager.cc DEPS graph pass) set(analysis_deps - framework_proto proto_desc ir_pass_manager graph pass paddle_fluid_api executor) + framework_proto proto_desc ir_pass_manager graph pass paddle_fluid_api executor pretty_log) cc_library(analysis SRCS pass_manager.cc node.cc data_flow_graph.cc graph_traits.cc subgraph_splitter.cc analyzer.cc @@ -40,27 +40,7 @@ function (inference_analysis_test TARGET) endif(WITH_TESTING) endfunction(inference_analysis_test) -function (inference_download_and_uncompress install_dir url gz_filename) - message(STATUS "Download inference test stuff ${gz_filename} from ${url}") - execute_process(COMMAND bash -c "mkdir -p ${install_dir}") - execute_process(COMMAND bash -c "cd ${install_dir} && wget -q ${url}") - execute_process(COMMAND bash -c "cd ${install_dir} && tar xzf ${gz_filename}") - message(STATUS "finish downloading ${gz_filename}") -endfunction(inference_download_and_uncompress) - -set(RNN1_MODEL_URL "http://paddle-inference-dist.bj.bcebos.com/rnn1%2Fmodel.tar.gz") -set(RNN1_DATA_URL "http://paddle-inference-dist.bj.bcebos.com/rnn1%2Fdata.txt.tar.gz") -set(RNN1_INSTALL_DIR "${THIRD_PARTY_PATH}/inference_demo/rnn1" CACHE PATH "RNN1 model and data root." FORCE) -if (NOT EXISTS ${RNN1_INSTALL_DIR} AND WITH_TESTING) - inference_download_and_uncompress(${RNN1_INSTALL_DIR} ${RNN1_MODEL_URL} "rnn1%2Fmodel.tar.gz") - inference_download_and_uncompress(${RNN1_INSTALL_DIR} ${RNN1_DATA_URL} "rnn1%2Fdata.txt.tar.gz") -endif() - -inference_analysis_test(test_analyzer SRCS analyzer_tester.cc - EXTRA_DEPS paddle_inference_api paddle_fluid_api ir_pass_manager analysis_predictor - ARGS --infer_model=${RNN1_INSTALL_DIR}/model - --infer_data=${RNN1_INSTALL_DIR}/data.txt) - +inference_analysis_test(test_analyzer SRCS analyzer_tester.cc EXTRA_DEPS paddle_inference_api) inference_analysis_test(test_data_flow_graph SRCS data_flow_graph_tester.cc) inference_analysis_test(test_data_flow_graph_to_fluid_pass SRCS data_flow_graph_to_fluid_pass_tester.cc) inference_analysis_test(test_fluid_to_ir_pass SRCS fluid_to_ir_pass_tester.cc) @@ -71,46 +51,3 @@ inference_analysis_test(test_tensorrt_subgraph_pass SRCS tensorrt_subgraph_pass_ inference_analysis_test(test_pass_manager SRCS pass_manager_tester.cc) inference_analysis_test(test_tensorrt_subgraph_node_mark_pass SRCS tensorrt_subgraph_node_mark_pass_tester.cc) inference_analysis_test(test_model_store_pass SRCS model_store_pass_tester.cc) - -set(CHINESE_NER_MODEL_URL "http://paddle-inference-dist.bj.bcebos.com/chinese_ner_model.tar.gz") -set(CHINESE_NER_DATA_URL "http://paddle-inference-dist.bj.bcebos.com/chinese_ner-data.txt.tar.gz") -set(CHINESE_NER_INSTALL_DIR "${THIRD_PARTY_PATH}/inference_demo/chinese_ner" CACHE PATH "Chinese ner model and data root." FORCE) -if (NOT EXISTS ${CHINESE_NER_INSTALL_DIR} AND WITH_TESTING AND WITH_INFERENCE) - inference_download_and_uncompress(${CHINESE_NER_INSTALL_DIR} ${CHINESE_NER_MODEL_URL} "chinese_ner_model.tar.gz") - inference_download_and_uncompress(${CHINESE_NER_INSTALL_DIR} ${CHINESE_NER_DATA_URL} "chinese_ner-data.txt.tar.gz") -endif() - -inference_analysis_test(test_analyzer_ner SRCS analyzer_ner_tester.cc - EXTRA_DEPS paddle_inference_api paddle_fluid_api analysis_predictor - ARGS --infer_model=${CHINESE_NER_INSTALL_DIR}/model - --infer_data=${CHINESE_NER_INSTALL_DIR}/data.txt) - -set(LAC_MODEL_URL "http://paddle-inference-dist.bj.bcebos.com/lac_model.tar.gz") -set(LAC_DATA_URL "http://paddle-inference-dist.bj.bcebos.com/lac_data.txt.tar.gz") -set(LAC_INSTALL_DIR "${THIRD_PARTY_PATH}/inference_demo/lac" CACHE PATH "LAC model and data root." FORCE) -if (NOT EXISTS ${LAC_INSTALL_DIR} AND WITH_TESTING AND WITH_INFERENCE) - inference_download_and_uncompress(${LAC_INSTALL_DIR} ${LAC_MODEL_URL} "lac_model.tar.gz") - inference_download_and_uncompress(${LAC_INSTALL_DIR} ${LAC_DATA_URL} "lac_data.txt.tar.gz") -endif() - -inference_analysis_test(test_analyzer_lac SRCS analyzer_lac_tester.cc - EXTRA_DEPS paddle_inference_api paddle_fluid_api ir_pass_manager analysis_predictor - ARGS --infer_model=${LAC_INSTALL_DIR}/model - --infer_data=${LAC_INSTALL_DIR}/data.txt) - - -set(TEXT_CLASSIFICATION_MODEL_URL "http://paddle-inference-dist.bj.bcebos.com/text-classification-Senta.tar.gz") -set(TEXT_CLASSIFICATION_DATA_URL "http://paddle-inference-dist.bj.bcebos.com/text_classification_data.txt.tar.gz") -set(TEXT_CLASSIFICATION_INSTALL_DIR "${THIRD_PARTY_PATH}/inference_demo/text_classification" CACHE PATH "Text Classification model and data root." FORCE) - -if (NOT EXISTS ${TEXT_CLASSIFICATION_INSTALL_DIR} AND WITH_TESTING AND WITH_INFERENCE) - inference_download_and_uncompress(${TEXT_CLASSIFICATION_INSTALL_DIR} ${TEXT_CLASSIFICATION_MODEL_URL} "text-classification-Senta.tar.gz") - inference_download_and_uncompress(${TEXT_CLASSIFICATION_INSTALL_DIR} ${TEXT_CLASSIFICATION_DATA_URL} "text_classification_data.txt.tar.gz") -endif() - -inference_analysis_test(test_text_classification SRCS analyzer_text_classification_tester.cc - EXTRA_DEPS paddle_inference_api paddle_fluid_api analysis_predictor - ARGS --infer_model=${TEXT_CLASSIFICATION_INSTALL_DIR}/text-classification-Senta - --infer_data=${TEXT_CLASSIFICATION_INSTALL_DIR}/data.txt - --topn=1 # Just run top 1 batch. - ) diff --git a/paddle/fluid/inference/analysis/analyzer_tester.cc b/paddle/fluid/inference/analysis/analyzer_tester.cc index cc4b390495c60cecc8ebebb3f17b38d9b5a15956..3b5be7f3ee33c73a9704bafa9f1b736c8a3cd9ea 100644 --- a/paddle/fluid/inference/analysis/analyzer_tester.cc +++ b/paddle/fluid/inference/analysis/analyzer_tester.cc @@ -16,21 +16,9 @@ #include #include -#include // NOLINT -#include "paddle/fluid/framework/ir/fuse_pass_base.h" -#include "paddle/fluid/framework/ir/pass.h" #include "paddle/fluid/inference/analysis/ut_helper.h" -#include "paddle/fluid/inference/api/analysis_predictor.h" -#include "paddle/fluid/inference/api/helper.h" #include "paddle/fluid/inference/api/paddle_inference_api.h" #include "paddle/fluid/inference/api/paddle_inference_pass.h" -#include "paddle/fluid/inference/utils/singleton.h" - -DEFINE_string(infer_model, "", "model path"); -DEFINE_string(infer_data, "", "data path"); -DEFINE_int32(batch_size, 10, "batch size."); -DEFINE_int32(repeat, 1, "Running the inference program repeat times."); -DEFINE_int32(num_threads, 1, "Running the inference program in multi-threads."); namespace paddle { namespace inference { @@ -91,274 +79,8 @@ void TestWord2vecPrediction(const std::string &model_path) { } } -namespace { - -struct DataRecord { - std::vector>> link_step_data_all; - std::vector> week_data_all, minute_data_all; - std::vector lod1, lod2, lod3; - std::vector> rnn_link_data, rnn_week_datas, - rnn_minute_datas; - size_t batch_iter{0}; - size_t batch_size{1}; - DataRecord() = default; - explicit DataRecord(const std::string &path, int batch_size = 1) - : batch_size(batch_size) { - Load(path); - } - DataRecord NextBatch() { - DataRecord data; - size_t batch_end = batch_iter + batch_size; - // NOTE skip the final batch, if no enough data is provided. - if (batch_end <= link_step_data_all.size()) { - data.link_step_data_all.assign(link_step_data_all.begin() + batch_iter, - link_step_data_all.begin() + batch_end); - data.week_data_all.assign(week_data_all.begin() + batch_iter, - week_data_all.begin() + batch_end); - data.minute_data_all.assign(minute_data_all.begin() + batch_iter, - minute_data_all.begin() + batch_end); - // Prepare LoDs - data.lod1.push_back(0); - data.lod2.push_back(0); - data.lod3.push_back(0); - CHECK(!data.link_step_data_all.empty()) << "empty"; - CHECK(!data.week_data_all.empty()); - CHECK(!data.minute_data_all.empty()); - CHECK_EQ(data.link_step_data_all.size(), data.week_data_all.size()); - CHECK_EQ(data.minute_data_all.size(), data.link_step_data_all.size()); - for (size_t j = 0; j < data.link_step_data_all.size(); j++) { - for (const auto &d : data.link_step_data_all[j]) { - data.rnn_link_data.push_back(d); - } - data.rnn_week_datas.push_back(data.week_data_all[j]); - data.rnn_minute_datas.push_back(data.minute_data_all[j]); - // calculate lod - data.lod1.push_back(data.lod1.back() + - data.link_step_data_all[j].size()); - data.lod3.push_back(data.lod3.back() + 1); - for (size_t i = 1; i < data.link_step_data_all[j].size() + 1; i++) { - data.lod2.push_back(data.lod2.back() + - data.link_step_data_all[j].size()); - } - } - } - batch_iter += batch_size; - return data; - } - void Load(const std::string &path) { - std::ifstream file(path); - std::string line; - int num_lines = 0; - while (std::getline(file, line)) { - num_lines++; - std::vector data; - split(line, ':', &data); - std::vector> link_step_data; - std::vector link_datas; - split(data[0], '|', &link_datas); - for (auto &step_data : link_datas) { - std::vector tmp; - split_to_float(step_data, ',', &tmp); - link_step_data.push_back(tmp); - } - // load week data - std::vector week_data; - split_to_float(data[2], ',', &week_data); - // load minute data - std::vector minute_data; - split_to_float(data[1], ',', &minute_data); - link_step_data_all.push_back(std::move(link_step_data)); - week_data_all.push_back(std::move(week_data)); - minute_data_all.push_back(std::move(minute_data)); - } - } -}; -void PrepareInputs(std::vector *input_slots, DataRecord *data, - int batch_size) { - PaddleTensor lod_attention_tensor, init_zero_tensor, lod_tensor_tensor, - week_tensor, minute_tensor; - lod_attention_tensor.name = "data_lod_attention"; - init_zero_tensor.name = "cell_init"; - lod_tensor_tensor.name = "data"; - week_tensor.name = "week"; - minute_tensor.name = "minute"; - auto one_batch = data->NextBatch(); - std::vector rnn_link_data_shape( - {static_cast(one_batch.rnn_link_data.size()), - static_cast(one_batch.rnn_link_data.front().size())}); - lod_attention_tensor.shape.assign({1, 2}); - lod_attention_tensor.lod.assign({one_batch.lod1, one_batch.lod2}); - init_zero_tensor.shape.assign({batch_size, 15}); - init_zero_tensor.lod.assign({one_batch.lod3}); - lod_tensor_tensor.shape = rnn_link_data_shape; - lod_tensor_tensor.lod.assign({one_batch.lod1}); - // clang-format off - week_tensor.shape.assign( - {static_cast(one_batch.rnn_week_datas.size()), - static_cast(one_batch.rnn_week_datas.front().size())}); - week_tensor.lod.assign({one_batch.lod3}); - minute_tensor.shape.assign( - {static_cast(one_batch.rnn_minute_datas.size()), - static_cast(one_batch.rnn_minute_datas.front().size())}); - minute_tensor.lod.assign({one_batch.lod3}); - // clang-format on - // assign data - TensorAssignData(&lod_attention_tensor, - std::vector>({{0, 0}})); - std::vector tmp_zeros(batch_size * 15, 0.); - TensorAssignData(&init_zero_tensor, {tmp_zeros}); - TensorAssignData(&lod_tensor_tensor, one_batch.rnn_link_data); - TensorAssignData(&week_tensor, one_batch.rnn_week_datas); - TensorAssignData(&minute_tensor, one_batch.rnn_minute_datas); - // Set inputs. - auto init_zero_tensor1 = init_zero_tensor; - init_zero_tensor1.name = "hidden_init"; - input_slots->assign({week_tensor, init_zero_tensor, minute_tensor, - init_zero_tensor1, lod_attention_tensor, - lod_tensor_tensor}); - for (auto &tensor : *input_slots) { - tensor.dtype = PaddleDType::FLOAT32; - } -} - -} // namespace - -void CompareResult(const std::vector &outputs, - const std::vector &base_outputs) { - PADDLE_ENFORCE_GT(outputs.size(), 0); - PADDLE_ENFORCE_EQ(outputs.size(), base_outputs.size()); - for (size_t i = 0; i < outputs.size(); i++) { - auto &out = outputs[i]; - auto &base_out = base_outputs[i]; - size_t size = std::accumulate(out.shape.begin(), out.shape.end(), 1, - [](int a, int b) { return a * b; }); - size_t size1 = std::accumulate(base_out.shape.begin(), base_out.shape.end(), - 1, [](int a, int b) { return a * b; }); - PADDLE_ENFORCE_EQ(size, size1); - PADDLE_ENFORCE_GT(size, 0); - float *data = static_cast(out.data.data()); - float *base_data = static_cast(base_out.data.data()); - for (size_t i = 0; i < size; i++) { - EXPECT_NEAR(data[i], base_data[i], 1e-3); - } - } -} -// Test with a really complicate model. -void TestRNN1Prediction(bool use_analysis, bool activate_ir, int num_threads) { - AnalysisConfig config; - config.prog_file = FLAGS_infer_model + "/__model__"; - config.param_file = FLAGS_infer_model + "/param"; - config.use_gpu = false; - config.device = 0; - config.specify_input_name = true; - config.enable_ir_optim = activate_ir; - PADDLE_ENFORCE(config.ir_mode == - AnalysisConfig::IrPassMode::kExclude); // default - config.ir_passes.clear(); // Do not exclude any pass. - - int batch_size = FLAGS_batch_size; - int num_times = FLAGS_repeat; - - auto base_predictor = - CreatePaddlePredictor(config); - auto predictor = - CreatePaddlePredictor( - config); - std::vector input_slots; - DataRecord data(FLAGS_infer_data, batch_size); - // Prepare inputs. - PrepareInputs(&input_slots, &data, batch_size); - std::vector outputs, base_outputs; - - base_predictor->Run(input_slots, &base_outputs); - - if (num_threads == 1) { - // Prepare inputs. - Timer timer; - timer.tic(); - for (int i = 0; i < num_times; i++) { - predictor->Run(input_slots, &outputs); - } - PrintTime(batch_size, num_times, 1, 0, timer.toc() / num_times); - CompareResult(outputs, base_outputs); - } else { - std::vector threads; - std::vector> predictors; - // TODO(yanchunwei): Bug here, the analyzer phase can't be parallelled - // because AttentionLSTM's hard code nodeid will be damanged. - for (int tid = 0; tid < num_threads; ++tid) { - predictors.emplace_back( - CreatePaddlePredictor( - config)); - } - for (int tid = 0; tid < num_threads; ++tid) { - threads.emplace_back([&, tid]() { - // Each thread should have local input_slots and outputs. - std::vector input_slots; - DataRecord data(FLAGS_infer_data, batch_size); - PrepareInputs(&input_slots, &data, batch_size); - std::vector outputs; - Timer timer; - timer.tic(); - for (int i = 0; i < num_times; i++) { - predictors[tid]->Run(input_slots, &outputs); - } - PrintTime(batch_size, num_times, num_threads, tid, - timer.toc() / num_times); - CompareResult(outputs, base_outputs); - }); - } - for (int i = 0; i < num_threads; ++i) { - threads[i].join(); - } - } - - if (use_analysis && activate_ir) { - AnalysisPredictor *analysis_predictor = - dynamic_cast(predictor.get()); - auto &fuse_statis = analysis_predictor->analysis_argument() - .Get>( - framework::ir::kFuseStatisAttr); - for (auto &item : fuse_statis) { - LOG(INFO) << "fused " << item.first << " " << item.second; - } - - int num_ops = 0; - for (auto &node : - analysis_predictor->analysis_argument().main_dfg->nodes.nodes()) { - if (node->IsFunction()) { - ++num_ops; - } - } - LOG(INFO) << "has num ops: " << num_ops; - - ASSERT_TRUE(fuse_statis.count("fc_fuse")); - EXPECT_EQ(fuse_statis.at("fc_fuse"), 1); - EXPECT_EQ(fuse_statis.at("fc_nobias_lstm_fuse"), 2); // bi-directional LSTM - EXPECT_EQ(fuse_statis.at("seq_concat_fc_fuse"), 1); - EXPECT_EQ(num_ops, - 13); // After graph optimization, only 13 operators exists. - } -} - -// Inference with analysis and IR, easy for profiling independently. -TEST(Analyzer, rnn1) { TestRNN1Prediction(true, true, FLAGS_num_threads); } - -// Other unit-tests of RNN1, test different options of use_analysis, -// activate_ir and multi-threads. -TEST(Analyzer, RNN_tests) { - int num_threads[2] = {1, 4}; - for (auto i : num_threads) { - // Directly infer with the original model. - TestRNN1Prediction(false, false, i); - // Inference with the original model with the analysis turned on, the - // analysis - // module will transform the program to a data flow graph. - TestRNN1Prediction(true, false, i); - // Inference with analysis and IR. The IR module will fuse some large - // kernels. - TestRNN1Prediction(true, true, i); - } +TEST(Analyzer, word2vec_without_analysis) { + TestWord2vecPrediction(FLAGS_inference_model_dir); } } // namespace analysis diff --git a/paddle/fluid/inference/analysis/ir_pass_manager.cc b/paddle/fluid/inference/analysis/ir_pass_manager.cc index ea0f2241d7dbab8f79ec9349effbe96112748e34..e76708baf4b39afb0febbcf3ff71281dfbfc8627 100644 --- a/paddle/fluid/inference/analysis/ir_pass_manager.cc +++ b/paddle/fluid/inference/analysis/ir_pass_manager.cc @@ -14,13 +14,18 @@ #include "paddle/fluid/inference/analysis/ir_pass_manager.h" #include +#include #include "paddle/fluid/framework/ir/fuse_pass_base.h" #include "paddle/fluid/framework/ir/graph.h" #include "paddle/fluid/framework/scope.h" +#include "paddle/fluid/string/pretty_log.h" namespace paddle { namespace inference { namespace analysis { +using string::PrettyLogEndl; +using string::PrettyLog; +using string::Style; IRPassManager::IRPassManager(const ProgramDesc &program, framework::Scope *scope) @@ -33,13 +38,16 @@ IRPassManager::IRPassManager(const ProgramDesc &program, void IRPassManager::Apply(const std::vector &passes) { // Apply all the passes std::string pre_pass; + int pass_num = 0; for (const std::string &pass_name : passes) { - LOG(WARNING) << "Running IR pass [" << pass_name << "]"; + PrettyLogEndl(Style::H2(), "--- Running IR pass [%s]", pass_name); auto pass = framework::ir::PassRegistry::Instance().Get(pass_name); if (pass_name == "graph_viz_pass") { - std::string dot_file_path = - "ir_" + (pre_pass.empty() ? "origin" : pre_pass) + ".dot"; + std::string dot_file_path = std::to_string(pass_num) + "_ir_" + + (pre_pass.empty() ? "origin" : pre_pass) + + ".dot"; pass->Set("graph_viz_path", new std::string(std::move(dot_file_path))); + pass_num++; } graph_ = pass->Apply(std::move(graph_)); pre_pass = pass_name; diff --git a/paddle/fluid/inference/analysis/pass_manager.cc b/paddle/fluid/inference/analysis/pass_manager.cc index 759b2b96a1944c060ac98b6865b58ba2f6369607..a6ac0ee49f8f408faa7a17bf5ef5d2799a9a6238 100644 --- a/paddle/fluid/inference/analysis/pass_manager.cc +++ b/paddle/fluid/inference/analysis/pass_manager.cc @@ -14,6 +14,7 @@ limitations under the License. */ #include "paddle/fluid/inference/analysis/pass_manager.h" #include "paddle/fluid/inference/analysis/fluid_to_data_flow_graph_pass.h" +#include "paddle/fluid/string/pretty_log.h" namespace paddle { namespace inference { @@ -22,7 +23,7 @@ namespace analysis { bool PassManager::Initialize(Argument* argument) { argument_ = argument; for (auto& pass : data_) { - LOG(WARNING) << "Initializing pass [" << pass->repr() << "]"; + VLOG(3) << "Initializing pass [" << pass->repr() << "]"; if (!pass->Initialize(argument)) { LOG(ERROR) << "Failed to initialize pass [" << pass->repr() << "]"; return false; @@ -33,9 +34,10 @@ bool PassManager::Initialize(Argument* argument) { void DfgPassManager::RunAll() { PADDLE_ENFORCE(argument_); - LOG(INFO) << "Total " << data_.size() << " Analysys passes"; + VLOG(3) << "Total " << data_.size() << " Analysys passes"; for (auto& pass : data_) { - LOG(WARNING) << "Running Analysis pass [" << pass->repr() << "]"; + string::PrettyLogEndl(string::Style::H1(), "* Running Analysis pass [%s]", + pass->repr()); pass->Run(argument_->main_dfg.get()); } } diff --git a/paddle/fluid/inference/api/api_impl.cc b/paddle/fluid/inference/api/api_impl.cc index bd9b4b1a814f995e3979105f5b9830b95fd8ea7d..6fe13ed027de403bdc21882c26225bcd4cc7e49a 100644 --- a/paddle/fluid/inference/api/api_impl.cc +++ b/paddle/fluid/inference/api/api_impl.cc @@ -262,7 +262,7 @@ void NativePaddlePredictor::GetFetchOne(const framework::LoDTensor &fetch, if (buffer.empty() || buffer.length() < sizeof(T) * data.size()) { buffer.Resize(sizeof(T) * data.size()); } - std::memcpy(buffer.data(), data.data(), buffer.length()); + std::memcpy(buffer.data(), data.data(), sizeof(T) * data.size()); // copy LoD for (const auto &level : fetch.lod()) { output->lod.emplace_back(level); diff --git a/paddle/fluid/inference/io.cc b/paddle/fluid/inference/io.cc index cef7b2a7e3a29da05628d7540f5545dc9adda27e..e246a06fd079d837ac321197914c9f70b528f2c8 100644 --- a/paddle/fluid/inference/io.cc +++ b/paddle/fluid/inference/io.cc @@ -20,6 +20,7 @@ limitations under the License. */ #include "paddle/fluid/framework/block_desc.h" #include "paddle/fluid/framework/feed_fetch_type.h" #include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/version.h" #include "paddle/fluid/platform/cpu_helper.h" #include "paddle/fluid/pybind/pybind.h" @@ -124,6 +125,9 @@ std::unique_ptr Load(framework::Executor* executor, std::unique_ptr main_program( new framework::ProgramDesc(program_desc_str)); + PADDLE_ENFORCE(framework::IsProgramVersionSupported(main_program->Version()), + "model version %ld is not supported.", + main_program->Version()); LoadPersistables(executor, scope, *main_program, dirname, ""); return main_program; @@ -138,6 +142,9 @@ std::unique_ptr Load( std::unique_ptr main_program( new framework::ProgramDesc(program_desc_str)); + PADDLE_ENFORCE(framework::IsProgramVersionSupported(main_program->Version()), + "model version %ld is not supported.", + main_program->Version()); LoadPersistables(executor, scope, *main_program, "", param_filename); return main_program; diff --git a/paddle/fluid/inference/tests/api/CMakeLists.txt b/paddle/fluid/inference/tests/api/CMakeLists.txt new file mode 100644 index 0000000000000000000000000000000000000000..d44a2cfa7f2d2f7dde5001006e05cdff1612435b --- /dev/null +++ b/paddle/fluid/inference/tests/api/CMakeLists.txt @@ -0,0 +1,57 @@ +function (inference_download_and_uncompress install_dir url) + get_filename_component(filename ${url} NAME) + message(STATUS "Download inference test stuff ${filename} from ${url}") + execute_process(COMMAND bash -c "mkdir -p ${install_dir}") + execute_process(COMMAND bash -c "cd ${install_dir} && wget -q ${url}") + execute_process(COMMAND bash -c "cd ${install_dir} && tar xzf ${filename}") + message(STATUS "finish downloading ${filename}") +endfunction(inference_download_and_uncompress) + +function(download_model_and_data install_dir model_url data_url) + if (NOT EXISTS ${install_dir} AND WITH_INFERENCE) + inference_download_and_uncompress(${install_dir} ${model_url}) + inference_download_and_uncompress(${install_dir} ${data_url}) + endif() +endfunction() + +# RNN1 +set(RNN1_MODEL_URL "http://paddle-inference-dist.bj.bcebos.com/rnn1%2Fmodel.tar.gz") +set(RNN1_DATA_URL "http://paddle-inference-dist.bj.bcebos.com/rnn1%2Fdata.txt.tar.gz") +set(RNN1_INSTALL_DIR "${THIRD_PARTY_PATH}/inference_demo/rnn1") +download_model_and_data(${RNN1_INSTALL_DIR} ${RNN1_MODEL_URL} ${RNN1_DATA_URL}) +inference_analysis_test(test_analyzer_rnn1 SRCS analyzer_rnn1_tester.cc + EXTRA_DEPS paddle_inference_api paddle_fluid_api ir_pass_manager analysis_predictor + ARGS --infer_model=${RNN1_INSTALL_DIR}/model + --infer_data=${RNN1_INSTALL_DIR}/data.txt) + +# chinese_ner +set(CHINESE_NER_MODEL_URL "http://paddle-inference-dist.bj.bcebos.com/chinese_ner_model.tar.gz") +set(CHINESE_NER_DATA_URL "http://paddle-inference-dist.bj.bcebos.com/chinese_ner-data.txt.tar.gz") +set(CHINESE_NER_INSTALL_DIR "${THIRD_PARTY_PATH}/inference_demo/chinese_ner") +download_model_and_data(${CHINESE_NER_INSTALL_DIR} ${CHINESE_NER_MODEL_URL} ${CHINESE_NER_DATA_URL}) +inference_analysis_test(test_analyzer_ner SRCS analyzer_ner_tester.cc + EXTRA_DEPS paddle_inference_api paddle_fluid_api analysis_predictor + ARGS --infer_model=${CHINESE_NER_INSTALL_DIR}/model + --infer_data=${CHINESE_NER_INSTALL_DIR}/data.txt) + +# lac +set(LAC_MODEL_URL "http://paddle-inference-dist.bj.bcebos.com/lac_model.tar.gz") +set(LAC_DATA_URL "http://paddle-inference-dist.bj.bcebos.com/lac_data.txt.tar.gz") +set(LAC_INSTALL_DIR "${THIRD_PARTY_PATH}/inference_demo/lac") +download_model_and_data(${LAC_INSTALL_DIR} ${LAC_MODEL_URL} ${LAC_DATA_URL}) +inference_analysis_test(test_analyzer_lac SRCS analyzer_lac_tester.cc + EXTRA_DEPS paddle_inference_api paddle_fluid_api ir_pass_manager analysis_predictor + ARGS --infer_model=${LAC_INSTALL_DIR}/model + --infer_data=${LAC_INSTALL_DIR}/data.txt) + +# text_classification +set(TEXT_CLASSIFICATION_MODEL_URL "http://paddle-inference-dist.bj.bcebos.com/text-classification-Senta.tar.gz") +set(TEXT_CLASSIFICATION_DATA_URL "http://paddle-inference-dist.bj.bcebos.com/text_classification_data.txt.tar.gz") +set(TEXT_CLASSIFICATION_INSTALL_DIR "${THIRD_PARTY_PATH}/inference_demo/text_classification") +download_model_and_data(${TEXT_CLASSIFICATION_INSTALL_DIR} ${TEXT_CLASSIFICATION_MODEL_URL} ${TEXT_CLASSIFICATION_DATA_URL}) +inference_analysis_test(test_text_classification SRCS analyzer_text_classification_tester.cc + EXTRA_DEPS paddle_inference_api paddle_fluid_api analysis_predictor + ARGS --infer_model=${TEXT_CLASSIFICATION_INSTALL_DIR}/text-classification-Senta + --infer_data=${TEXT_CLASSIFICATION_INSTALL_DIR}/data.txt + --topn=1 # Just run top 1 batch. + ) diff --git a/paddle/fluid/inference/analysis/analyzer_lac_tester.cc b/paddle/fluid/inference/tests/api/analyzer_lac_tester.cc similarity index 90% rename from paddle/fluid/inference/analysis/analyzer_lac_tester.cc rename to paddle/fluid/inference/tests/api/analyzer_lac_tester.cc index 522d870db8583aac4006e8cdb7909625c3feb34b..7e00cb20ad0ce052a84d5491b0cdf167f0768081 100644 --- a/paddle/fluid/inference/analysis/analyzer_lac_tester.cc +++ b/paddle/fluid/inference/tests/api/analyzer_lac_tester.cc @@ -117,34 +117,6 @@ void GetOneBatch(std::vector *input_slots, DataRecord *data, input_slots->assign({input_tensor}); } -void BenchAllData(const std::string &model_path, const std::string &data_file, - const int batch_size, const int repeat) { - NativeConfig config; - config.model_dir = model_path; - config.use_gpu = false; - config.device = 0; - config.specify_input_name = true; - std::vector input_slots, outputs_slots; - DataRecord data(data_file, batch_size); - auto predictor = - CreatePaddlePredictor(config); - GetOneBatch(&input_slots, &data, batch_size); - for (int i = 0; i < FLAGS_burning; i++) { - predictor->Run(input_slots, &outputs_slots); - } - Timer timer; - double sum = 0; - for (int i = 0; i < repeat; i++) { - for (size_t bid = 0; bid < data.batched_datas.size(); ++bid) { - GetOneBatch(&input_slots, &data, batch_size); - timer.tic(); - predictor->Run(input_slots, &outputs_slots); - sum += timer.toc(); - } - } - PrintTime(batch_size, repeat, 1, 0, sum / repeat); -} - const int64_t lac_ref_data[] = {24, 25, 25, 25, 38, 30, 31, 14, 15, 44, 24, 25, 25, 25, 25, 25, 44, 24, 25, 25, 25, 36, 42, 43, 44, 14, 15, 44, 14, 15, 44, 14, 15, 44, 38, 39, diff --git a/paddle/fluid/inference/analysis/analyzer_ner_tester.cc b/paddle/fluid/inference/tests/api/analyzer_ner_tester.cc similarity index 98% rename from paddle/fluid/inference/analysis/analyzer_ner_tester.cc rename to paddle/fluid/inference/tests/api/analyzer_ner_tester.cc index 661b047ed7cb70545267e468d8c2c48596a2994c..6e8e43add7d3383fa79efea91c23750be9c8956f 100644 --- a/paddle/fluid/inference/analysis/analyzer_ner_tester.cc +++ b/paddle/fluid/inference/tests/api/analyzer_ner_tester.cc @@ -144,8 +144,9 @@ void TestChineseNERPrediction(bool use_analysis) { size_t num_samples; for (int i = 0; i < FLAGS_repeat; i++) { DataRecord data(FLAGS_infer_data, FLAGS_batch_size); + // Just one batch, the num_samples remains the same. num_samples = data.num_samples; - for (size_t bid = 0; bid < num_samples; ++bid) { + for (size_t bid = 0; bid < num_samples / FLAGS_batch_size; ++bid) { PrepareInputs(&input_slots, &data, FLAGS_batch_size); timer.tic(); predictor->Run(input_slots, &outputs); diff --git a/paddle/fluid/inference/tests/api/analyzer_rnn1_tester.cc b/paddle/fluid/inference/tests/api/analyzer_rnn1_tester.cc new file mode 100644 index 0000000000000000000000000000000000000000..b8ac468b4e98bcef81cdbbf66e3f1640c03a7ab8 --- /dev/null +++ b/paddle/fluid/inference/tests/api/analyzer_rnn1_tester.cc @@ -0,0 +1,306 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/inference/analysis/analyzer.h" + +#include +#include +#include // NOLINT +#include "paddle/fluid/framework/ir/fuse_pass_base.h" +#include "paddle/fluid/framework/ir/pass.h" +#include "paddle/fluid/inference/analysis/ut_helper.h" +#include "paddle/fluid/inference/api/analysis_predictor.h" +#include "paddle/fluid/inference/api/helper.h" +#include "paddle/fluid/inference/api/paddle_inference_api.h" +#include "paddle/fluid/inference/api/paddle_inference_pass.h" + +DEFINE_string(infer_model, "", "model path"); +DEFINE_string(infer_data, "", "data path"); +DEFINE_int32(batch_size, 10, "batch size."); +DEFINE_int32(repeat, 1, "Running the inference program repeat times."); +DEFINE_int32(num_threads, 1, "Running the inference program in multi-threads."); + +namespace paddle { +namespace inference { + +using namespace framework; // NOLINT + +struct DataRecord { + std::vector>> link_step_data_all; + std::vector> week_data_all, minute_data_all; + std::vector lod1, lod2, lod3; + std::vector> rnn_link_data, rnn_week_datas, + rnn_minute_datas; + size_t batch_iter{0}; + size_t batch_size{1}; + DataRecord() = default; + explicit DataRecord(const std::string &path, int batch_size = 1) + : batch_size(batch_size) { + Load(path); + } + DataRecord NextBatch() { + DataRecord data; + size_t batch_end = batch_iter + batch_size; + // NOTE skip the final batch, if no enough data is provided. + if (batch_end <= link_step_data_all.size()) { + data.link_step_data_all.assign(link_step_data_all.begin() + batch_iter, + link_step_data_all.begin() + batch_end); + data.week_data_all.assign(week_data_all.begin() + batch_iter, + week_data_all.begin() + batch_end); + data.minute_data_all.assign(minute_data_all.begin() + batch_iter, + minute_data_all.begin() + batch_end); + // Prepare LoDs + data.lod1.push_back(0); + data.lod2.push_back(0); + data.lod3.push_back(0); + CHECK(!data.link_step_data_all.empty()) << "empty"; + CHECK(!data.week_data_all.empty()); + CHECK(!data.minute_data_all.empty()); + CHECK_EQ(data.link_step_data_all.size(), data.week_data_all.size()); + CHECK_EQ(data.minute_data_all.size(), data.link_step_data_all.size()); + for (size_t j = 0; j < data.link_step_data_all.size(); j++) { + for (const auto &d : data.link_step_data_all[j]) { + data.rnn_link_data.push_back(d); + } + data.rnn_week_datas.push_back(data.week_data_all[j]); + data.rnn_minute_datas.push_back(data.minute_data_all[j]); + // calculate lod + data.lod1.push_back(data.lod1.back() + + data.link_step_data_all[j].size()); + data.lod3.push_back(data.lod3.back() + 1); + for (size_t i = 1; i < data.link_step_data_all[j].size() + 1; i++) { + data.lod2.push_back(data.lod2.back() + + data.link_step_data_all[j].size()); + } + } + } + batch_iter += batch_size; + return data; + } + void Load(const std::string &path) { + std::ifstream file(path); + std::string line; + int num_lines = 0; + while (std::getline(file, line)) { + num_lines++; + std::vector data; + split(line, ':', &data); + std::vector> link_step_data; + std::vector link_datas; + split(data[0], '|', &link_datas); + for (auto &step_data : link_datas) { + std::vector tmp; + split_to_float(step_data, ',', &tmp); + link_step_data.push_back(tmp); + } + // load week data + std::vector week_data; + split_to_float(data[2], ',', &week_data); + // load minute data + std::vector minute_data; + split_to_float(data[1], ',', &minute_data); + link_step_data_all.push_back(std::move(link_step_data)); + week_data_all.push_back(std::move(week_data)); + minute_data_all.push_back(std::move(minute_data)); + } + } +}; +void PrepareInputs(std::vector *input_slots, DataRecord *data, + int batch_size) { + PaddleTensor lod_attention_tensor, init_zero_tensor, lod_tensor_tensor, + week_tensor, minute_tensor; + lod_attention_tensor.name = "data_lod_attention"; + init_zero_tensor.name = "cell_init"; + lod_tensor_tensor.name = "data"; + week_tensor.name = "week"; + minute_tensor.name = "minute"; + auto one_batch = data->NextBatch(); + std::vector rnn_link_data_shape( + {static_cast(one_batch.rnn_link_data.size()), + static_cast(one_batch.rnn_link_data.front().size())}); + lod_attention_tensor.shape.assign({1, 2}); + lod_attention_tensor.lod.assign({one_batch.lod1, one_batch.lod2}); + init_zero_tensor.shape.assign({batch_size, 15}); + init_zero_tensor.lod.assign({one_batch.lod3}); + lod_tensor_tensor.shape = rnn_link_data_shape; + lod_tensor_tensor.lod.assign({one_batch.lod1}); + // clang-format off + week_tensor.shape.assign( + {static_cast(one_batch.rnn_week_datas.size()), + static_cast(one_batch.rnn_week_datas.front().size())}); + week_tensor.lod.assign({one_batch.lod3}); + minute_tensor.shape.assign( + {static_cast(one_batch.rnn_minute_datas.size()), + static_cast(one_batch.rnn_minute_datas.front().size())}); + minute_tensor.lod.assign({one_batch.lod3}); + // clang-format on + // assign data + TensorAssignData(&lod_attention_tensor, + std::vector>({{0, 0}})); + std::vector tmp_zeros(batch_size * 15, 0.); + TensorAssignData(&init_zero_tensor, {tmp_zeros}); + TensorAssignData(&lod_tensor_tensor, one_batch.rnn_link_data); + TensorAssignData(&week_tensor, one_batch.rnn_week_datas); + TensorAssignData(&minute_tensor, one_batch.rnn_minute_datas); + // Set inputs. + auto init_zero_tensor1 = init_zero_tensor; + init_zero_tensor1.name = "hidden_init"; + input_slots->assign({week_tensor, init_zero_tensor, minute_tensor, + init_zero_tensor1, lod_attention_tensor, + lod_tensor_tensor}); + for (auto &tensor : *input_slots) { + tensor.dtype = PaddleDType::FLOAT32; + } +} + +void CompareResult(const std::vector &outputs, + const std::vector &base_outputs) { + PADDLE_ENFORCE_GT(outputs.size(), 0); + PADDLE_ENFORCE_EQ(outputs.size(), base_outputs.size()); + for (size_t i = 0; i < outputs.size(); i++) { + auto &out = outputs[i]; + auto &base_out = base_outputs[i]; + size_t size = std::accumulate(out.shape.begin(), out.shape.end(), 1, + [](int a, int b) { return a * b; }); + size_t size1 = std::accumulate(base_out.shape.begin(), base_out.shape.end(), + 1, [](int a, int b) { return a * b; }); + PADDLE_ENFORCE_EQ(size, size1); + PADDLE_ENFORCE_GT(size, 0); + float *data = static_cast(out.data.data()); + float *base_data = static_cast(base_out.data.data()); + for (size_t i = 0; i < size; i++) { + EXPECT_NEAR(data[i], base_data[i], 1e-3); + } + } +} +// Test with a really complicate model. +void TestRNN1Prediction(bool use_analysis, bool activate_ir, int num_threads) { + AnalysisConfig config; + config.prog_file = FLAGS_infer_model + "/__model__"; + config.param_file = FLAGS_infer_model + "/param"; + config.use_gpu = false; + config.device = 0; + config.specify_input_name = true; + config.enable_ir_optim = activate_ir; + PADDLE_ENFORCE(config.ir_mode == + AnalysisConfig::IrPassMode::kExclude); // default + config.ir_passes.clear(); // Do not exclude any pass. + + int batch_size = FLAGS_batch_size; + int num_times = FLAGS_repeat; + + auto base_predictor = + CreatePaddlePredictor(config); + auto predictor = + CreatePaddlePredictor( + config); + std::vector input_slots; + DataRecord data(FLAGS_infer_data, batch_size); + // Prepare inputs. + PrepareInputs(&input_slots, &data, batch_size); + std::vector outputs, base_outputs; + + base_predictor->Run(input_slots, &base_outputs); + + if (num_threads == 1) { + // Prepare inputs. + Timer timer; + timer.tic(); + for (int i = 0; i < num_times; i++) { + predictor->Run(input_slots, &outputs); + } + PrintTime(batch_size, num_times, 1, 0, timer.toc() / num_times); + CompareResult(outputs, base_outputs); + } else { + std::vector threads; + std::vector> predictors; + // TODO(yanchunwei): Bug here, the analyzer phase can't be parallelled + // because AttentionLSTM's hard code nodeid will be damanged. + for (int tid = 0; tid < num_threads; ++tid) { + predictors.emplace_back( + CreatePaddlePredictor( + config)); + } + for (int tid = 0; tid < num_threads; ++tid) { + threads.emplace_back([&, tid]() { + // Each thread should have local input_slots and outputs. + std::vector input_slots; + DataRecord data(FLAGS_infer_data, batch_size); + PrepareInputs(&input_slots, &data, batch_size); + std::vector outputs; + Timer timer; + timer.tic(); + for (int i = 0; i < num_times; i++) { + predictors[tid]->Run(input_slots, &outputs); + } + PrintTime(batch_size, num_times, num_threads, tid, + timer.toc() / num_times); + CompareResult(outputs, base_outputs); + }); + } + for (int i = 0; i < num_threads; ++i) { + threads[i].join(); + } + } + + if (use_analysis && activate_ir) { + AnalysisPredictor *analysis_predictor = + dynamic_cast(predictor.get()); + auto &fuse_statis = analysis_predictor->analysis_argument() + .Get>( + framework::ir::kFuseStatisAttr); + for (auto &item : fuse_statis) { + LOG(INFO) << "fused " << item.first << " " << item.second; + } + + int num_ops = 0; + for (auto &node : + analysis_predictor->analysis_argument().main_dfg->nodes.nodes()) { + if (node->IsFunction()) { + ++num_ops; + } + } + LOG(INFO) << "has num ops: " << num_ops; + + ASSERT_TRUE(fuse_statis.count("fc_fuse")); + EXPECT_EQ(fuse_statis.at("fc_fuse"), 1); + EXPECT_EQ(fuse_statis.at("fc_nobias_lstm_fuse"), 2); // bi-directional LSTM + EXPECT_EQ(fuse_statis.at("seq_concat_fc_fuse"), 1); + EXPECT_EQ(num_ops, + 13); // After graph optimization, only 13 operators exists. + } +} + +// Inference with analysis and IR, easy for profiling independently. +TEST(Analyzer, rnn1) { TestRNN1Prediction(true, true, FLAGS_num_threads); } + +// Other unit-tests of RNN1, test different options of use_analysis, +// activate_ir and multi-threads. +TEST(Analyzer, RNN_tests) { + int num_threads[2] = {1, 4}; + for (auto i : num_threads) { + // Directly infer with the original model. + TestRNN1Prediction(false, false, i); + // Inference with the original model with the analysis turned on, the + // analysis + // module will transform the program to a data flow graph. + TestRNN1Prediction(true, false, i); + // Inference with analysis and IR. The IR module will fuse some large + // kernels. + TestRNN1Prediction(true, true, i); + } +} + +} // namespace inference +} // namespace paddle diff --git a/paddle/fluid/inference/analysis/analyzer_text_classification_tester.cc b/paddle/fluid/inference/tests/api/analyzer_text_classification_tester.cc similarity index 100% rename from paddle/fluid/inference/analysis/analyzer_text_classification_tester.cc rename to paddle/fluid/inference/tests/api/analyzer_text_classification_tester.cc diff --git a/paddle/fluid/operators/attention_lstm_op.cc b/paddle/fluid/operators/attention_lstm_op.cc index 39b0c856996c11c6efdb530f1396afd5731c778d..9b943440a869e213db4ed761cfe7c508bc5e94ae 100644 --- a/paddle/fluid/operators/attention_lstm_op.cc +++ b/paddle/fluid/operators/attention_lstm_op.cc @@ -24,28 +24,28 @@ namespace operators { void AttentionLSTMOp::InferShape(framework::InferShapeContext* ctx) const { PADDLE_ENFORCE(ctx->HasInput("X"), - "Input(X) of AttentionLSTM should not be null."); + "Assert only one Input(X) of AttentionLSTM."); PADDLE_ENFORCE(ctx->HasInput("C0"), - "Input(C0) of AttentionLSTM should not be null."); + "Assert only one Input(C0) of AttentionLSTM."); PADDLE_ENFORCE(ctx->HasInput("LSTMWeight"), - "Input(LSTMWeight) of AttentionLSTM should not be null."); + "Assert only one Input(LSTMWeight) of AttentionLSTM."); PADDLE_ENFORCE(ctx->HasInput("LSTMBias"), - "Input(LSTMBias) of AttentionLSTM should not be null."); + "Assert only one Input(LSTMBias) of AttentionLSTM."); PADDLE_ENFORCE(ctx->HasInput("AttentionWeight"), - "Input(AttentionWeight) of AttentionLSTM should not be null."); + "Assert only one Input(AttentionWeight) of AttentionLSTM."); PADDLE_ENFORCE(ctx->HasOutput("Hidden"), - "Output(Hidden) of AttentionLSTM should not be null."); + "Assert only one Output(Hidden) of AttentionLSTM."); PADDLE_ENFORCE(ctx->HasOutput("Cell"), - "Output(Cell) of AttentionLSTM should not be null."); + "Assert only one Output(Cell) of AttentionLSTM."); PADDLE_ENFORCE(ctx->HasOutput("AttentionedX"), - "Output(AttentionedX) of AttentionLSTM should not be null."); + "Assert only one Output(AttentionedX) of AttentionLSTM."); PADDLE_ENFORCE(ctx->HasOutput("AttentionFCOut"), - "Output(AttentionFCOut) of AttentionLSTM should not be null."); + "Assert only one Output(AttentionFCOut) of AttentionLSTM."); PADDLE_ENFORCE(ctx->HasOutput("LSTMX"), - "Output(LSTMX) of AttentionLSTM should not be null."); + "Assert only one Output(LSTMX) of AttentionLSTM."); PADDLE_ENFORCE(ctx->HasOutput("LSTMOUT"), - "Output(LSTMOUT) of AttentionLSTM should not be null."); + "Assert only one Output(LSTMOUT) of AttentionLSTM."); auto x_dims = ctx->GetInputDim("X"); const int M = x_dims[1]; diff --git a/paddle/fluid/operators/conv_cudnn_op.cu.cc b/paddle/fluid/operators/conv_cudnn_op.cu.cc index 22cbf680c0670552fb014043c69fcadc56863529..4a7a6bcf7154d5680de751e3c933be46fb09fd74 100644 --- a/paddle/fluid/operators/conv_cudnn_op.cu.cc +++ b/paddle/fluid/operators/conv_cudnn_op.cu.cc @@ -118,7 +118,6 @@ class CUDNNConvOpKernel : public framework::OpKernel { output_channels / groups * output_height * output_width * output_depth; int group_offset_filter = filter->numel() / groups; // ------------------- cudnn conv workspace --------------------- - void* cudnn_workspace = nullptr; size_t workspace_size_in_bytes; // final workspace to allocate. size_t workspace_size_limit = kCONV_CUDNN_WORKSPACE_LIMIT_BYTES; if (user_workspace_size > 0) { @@ -159,20 +158,18 @@ class CUDNNConvOpKernel : public framework::OpKernel { PADDLE_ENFORCE_LE(workspace_size_in_bytes, workspace_size_limit, "workspace_size to be allocated exceeds the limit"); - // Allocate on GPU memory - platform::CUDAPlace gpu = boost::get(ctx.GetPlace()); - cudnn_workspace = paddle::memory::Alloc(gpu, workspace_size_in_bytes); // ------------------- cudnn conv forward --------------------- ScalingParamType alpha = 1.0f, beta = 0.0f; for (int i = 0; i < groups; i++) { - CUDNN_ENFORCE(platform::dynload::cudnnConvolutionForward( - handle, &alpha, cudnn_input_desc, input_data + i * group_offset_in, - cudnn_filter_desc, filter_data + i * group_offset_filter, - cudnn_conv_desc, algo, cudnn_workspace, workspace_size_in_bytes, - &beta, cudnn_output_desc, output_data + i * group_offset_out)); + auto cudnn_func = [&](void* cudnn_workspace) { + CUDNN_ENFORCE(platform::dynload::cudnnConvolutionForward( + handle, &alpha, cudnn_input_desc, input_data + i * group_offset_in, + cudnn_filter_desc, filter_data + i * group_offset_filter, + cudnn_conv_desc, algo, cudnn_workspace, workspace_size_in_bytes, + &beta, cudnn_output_desc, output_data + i * group_offset_out)); + }; + dev_ctx.RunCudnnFuncWithWorkspace(cudnn_func, workspace_size_in_bytes); } - // Release the cudnn workspace - paddle::memory::Free(gpu, cudnn_workspace); } }; @@ -314,11 +311,7 @@ class CUDNNConvGradOpKernel : public framework::OpKernel { cudnn_filter_desc, filter_algo, &tmp_size)); workspace_size_in_bytes = std::max(workspace_size_in_bytes, tmp_size); } - // ------------------- cudnn conv workspace --------------------- - // Already on GPU - void* cudnn_workspace = nullptr; - platform::CUDAPlace gpu = boost::get(ctx.GetPlace()); - cudnn_workspace = paddle::memory::Alloc(gpu, workspace_size_in_bytes); + // ------------------- cudnn conv backward data --------------------- ScalingParamType alpha = 1.0f, beta = 0.0f; if (input_grad) { @@ -326,12 +319,15 @@ class CUDNNConvGradOpKernel : public framework::OpKernel { // Because beta is zero, it is unnecessary to reset input_grad. for (int i = 0; i < groups; i++) { - CUDNN_ENFORCE(platform::dynload::cudnnConvolutionBackwardData( - handle, &alpha, cudnn_filter_desc, - filter_data + i * group_offset_filter, cudnn_output_grad_desc, - output_grad_data + i * group_offset_out, cudnn_conv_desc, data_algo, - cudnn_workspace, workspace_size_in_bytes, &beta, cudnn_input_desc, - input_grad_data + i * group_offset_in)); + auto cudnn_func = [&](void* cudnn_workspace) { + CUDNN_ENFORCE(platform::dynload::cudnnConvolutionBackwardData( + handle, &alpha, cudnn_filter_desc, + filter_data + i * group_offset_filter, cudnn_output_grad_desc, + output_grad_data + i * group_offset_out, cudnn_conv_desc, + data_algo, cudnn_workspace, workspace_size_in_bytes, &beta, + cudnn_input_desc, input_grad_data + i * group_offset_in)); + }; + dev_ctx.RunCudnnFuncWithWorkspace(cudnn_func, workspace_size_in_bytes); } } // ------------------- cudnn conv backward filter --------------------- @@ -339,16 +335,17 @@ class CUDNNConvGradOpKernel : public framework::OpKernel { T* filter_grad_data = filter_grad->mutable_data(ctx.GetPlace()); // Because beta is zero, it is unnecessary to reset filter_grad. for (int i = 0; i < groups; i++) { - CUDNN_ENFORCE(platform::dynload::cudnnConvolutionBackwardFilter( - handle, &alpha, cudnn_input_desc, input_data + i * group_offset_in, - cudnn_output_grad_desc, output_grad_data + i * group_offset_out, - cudnn_conv_desc, filter_algo, cudnn_workspace, - workspace_size_in_bytes, &beta, cudnn_filter_desc, - filter_grad_data + i * group_offset_filter)); + auto cudnn_func = [&](void* cudnn_workspace) { + CUDNN_ENFORCE(platform::dynload::cudnnConvolutionBackwardFilter( + handle, &alpha, cudnn_input_desc, + input_data + i * group_offset_in, cudnn_output_grad_desc, + output_grad_data + i * group_offset_out, cudnn_conv_desc, + filter_algo, cudnn_workspace, workspace_size_in_bytes, &beta, + cudnn_filter_desc, filter_grad_data + i * group_offset_filter)); + }; + dev_ctx.RunCudnnFuncWithWorkspace(cudnn_func, workspace_size_in_bytes); } } - // Release the cudnn workspace - paddle::memory::Free(gpu, cudnn_workspace); } }; diff --git a/paddle/fluid/operators/conv_mkldnn_op.cc b/paddle/fluid/operators/conv_mkldnn_op.cc index c5cbadc892904dc064b49ebc461944c4671a69da..3eb02c6b61ce61140bd777647a12477dd9c3c803 100644 --- a/paddle/fluid/operators/conv_mkldnn_op.cc +++ b/paddle/fluid/operators/conv_mkldnn_op.cc @@ -130,12 +130,13 @@ class ConvMKLDNNHandler : public platform::MKLDNNHandler { std::shared_ptr AcquireWeightsMemoryFromPrimitive( const std::shared_ptr user_weights_memory_p, - std::vector& pipeline) { // NOLINT + std::vector& pipeline, // NOLINT + bool is_persistent = false) { auto user_weights_pd = user_weights_memory_p->get_primitive_desc(); auto weights_pd = conv_pd_->weights_primitive_desc(); return this->AcquireMemory(weights_pd, user_weights_pd, user_weights_memory_p, "@weights_mem_p", - pipeline); + pipeline, is_persistent); } std::shared_ptr AcquireBiasMemoryFromPrimitive( @@ -266,6 +267,8 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel { PADDLE_ENFORCE(paddle::platform::is_cpu_place(ctx.GetPlace()), "It must use CPUPlace."); + const bool is_test = ctx.Attr("is_test"); + auto& dev_ctx = ctx.template device_context(); const auto& mkldnn_engine = dev_ctx.GetEngine(); @@ -296,6 +299,7 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel { std::vector strides = ctx.Attr>("strides"); std::vector paddings = ctx.Attr>("paddings"); std::vector dilations = ctx.Attr>("dilations"); + bool fuse_relu = ctx.Attr("fuse_relu"); int groups = ctx.Attr("groups"); // TODO(pzelazko-intel) add support for group convolution and dilation @@ -348,11 +352,12 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel { bias_tz = paddle::framework::vectorize2int(bias->dims()); auto bias_md = platform::MKLDNNMemDesc( bias_tz, platform::MKLDNNGetDataType(), memory::format::x); - conv_pd = ConvFwdPrimitiveDesc(src_md, weights_md, bias_md, dst_md, - strides, paddings, mkldnn_engine); + conv_pd = + ConvFwdPrimitiveDesc(src_md, weights_md, bias_md, dst_md, strides, + paddings, mkldnn_engine, fuse_relu); } else { conv_pd = ConvFwdPrimitiveDesc(src_md, weights_md, dst_md, strides, - paddings, mkldnn_engine); + paddings, mkldnn_engine, fuse_relu); } // Save conv_pd/src_memory/weights_memory for backward pass dev_ctx.SetBlob(key_conv_pd, conv_pd); @@ -371,7 +376,7 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel { auto src_memory_p = handler.AcquireSrcMemoryFromPrimitive(user_src_memory_p, pipeline); auto weights_memory_p = handler.AcquireWeightsMemoryFromPrimitive( - user_weights_memory_p, pipeline); + user_weights_memory_p, pipeline, is_test); auto dst_memory_p = handler.AcquireDstMemoryFromPrimitive(to_void_cast(output_data)); @@ -402,11 +407,26 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel { } private: + mkldnn::primitive_attr AddRelu() const { + // Fusion with ReLU layer is executed through the PostOps feature. Create a + // PostOps object and configure it to execute an eltwise relu operation. + mkldnn::primitive_attr conv_attr; + constexpr float scale = 1.0f; + constexpr float negative_slope = 0.0f; + constexpr float placeholder = 0.0f; + mkldnn::post_ops post_operations; + post_operations.append_eltwise(scale, mkldnn::algorithm::eltwise_relu, + negative_slope, placeholder); + conv_attr.set_post_ops(post_operations); + return conv_attr; + } + std::unique_ptr ConvFwdPrimitiveDesc(const memory::desc& src, const memory::desc& weights, const memory::desc& dst, const std::vector& strides, const std::vector& paddings, - const mkldnn::engine& engine) const { + const mkldnn::engine& engine, + const bool fuse_relu) const { memory::dims stride_dims = {strides[0], strides[1]}; memory::dims padding_dims = {paddings[0], paddings[1]}; @@ -415,8 +435,13 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel { dst, stride_dims, padding_dims, padding_dims, mkldnn::padding_kind::zero); - auto p_conv_pd = - new mkldnn::convolution_forward::primitive_desc(conv_desc, engine); + mkldnn::primitive_attr conv_attr; + if (fuse_relu) { + conv_attr = AddRelu(); + } + + auto p_conv_pd = new mkldnn::convolution_forward::primitive_desc( + conv_desc, conv_attr, engine); return std::unique_ptr( p_conv_pd); @@ -427,7 +452,8 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel { const memory::desc& bias, const memory::desc& dst, const std::vector& strides, const std::vector& paddings, - const mkldnn::engine& engine) const { + const mkldnn::engine& engine, + const bool fuse_relu) const { memory::dims stride_dims = {strides[0], strides[1]}; memory::dims padding_dims = {paddings[0], paddings[1]}; @@ -436,8 +462,13 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel { bias, dst, stride_dims, padding_dims, padding_dims, mkldnn::padding_kind::zero); - auto p_conv_pd = - new mkldnn::convolution_forward::primitive_desc(conv_desc, engine); + mkldnn::primitive_attr conv_attr; + if (fuse_relu) { + conv_attr = AddRelu(); + } + + auto p_conv_pd = new mkldnn::convolution_forward::primitive_desc( + conv_desc, conv_attr, engine); return std::unique_ptr( p_conv_pd); diff --git a/paddle/fluid/operators/conv_op.cc b/paddle/fluid/operators/conv_op.cc index 61ca80877a6dfcdf30a0ff346342116e36eec6f2..41d4fcf6de7c8fcb3cfbb2063b0a2ac1a2356168 100644 --- a/paddle/fluid/operators/conv_op.cc +++ b/paddle/fluid/operators/conv_op.cc @@ -109,6 +109,7 @@ framework::OpKernelType ConvOp::GetExpectedKernelType( } void Conv2DOpMaker::Make() { + AddAttr("is_test", "").SetDefault(false); AddInput( "Input", "(Tensor) The input tensor of convolution operator. " @@ -161,6 +162,8 @@ void Conv2DOpMaker::Make() { AddAttr("use_mkldnn", "(bool, default false) Only used in mkldnn kernel") .SetDefault(false); + AddAttr("fuse_relu", "(bool, default false) Only used in mkldnn kernel") + .SetDefault(false); AddAttr( "data_format", "(string, default NCHW) Only used in " diff --git a/paddle/fluid/operators/conv_transpose_cudnn_op.cu.cc b/paddle/fluid/operators/conv_transpose_cudnn_op.cu.cc index 82fff68e7557b3f0b44e6faf2a50e5a0ecbba589..73831611d01b8c5b8d2d9f7f15634a0094e4a608 100644 --- a/paddle/fluid/operators/conv_transpose_cudnn_op.cu.cc +++ b/paddle/fluid/operators/conv_transpose_cudnn_op.cu.cc @@ -76,7 +76,6 @@ class CUDNNConvTransposeOpKernel : public framework::OpKernel { conv_desc.descriptor(paddings, strides, dilations); // ------------------- cudnn conv workspace --------------------- - void* cudnn_workspace = nullptr; size_t workspace_size_in_bytes; // final workspace to allocate. size_t workspace_size_limit = kConvCUDNNWorkspaceLimitBytes; if (user_workspace_size > 0) { @@ -100,25 +99,21 @@ class CUDNNConvTransposeOpKernel : public framework::OpKernel { handle, cudnn_filter_desc, cudnn_input_desc, cudnn_conv_desc, cudnn_output_desc, algo, &workspace_size_in_bytes)); - // Allocate on GPU memory - platform::CUDAPlace gpu = boost::get(ctx.GetPlace()); - cudnn_workspace = paddle::memory::Alloc(gpu, workspace_size_in_bytes); - // ------------------- cudnn conv transpose forward --------------------- int input_offset = input->numel() / input->dims()[0] / groups; int output_offset = output->numel() / output->dims()[0] / groups; int filter_offset = filter->numel() / groups; T alpha = 1.0f, beta = 0.0f; for (int g = 0; g < groups; g++) { - CUDNN_ENFORCE(platform::dynload::cudnnConvolutionBackwardData( - handle, &alpha, cudnn_filter_desc, filter_data + filter_offset * g, - cudnn_input_desc, input_data + input_offset * g, cudnn_conv_desc, - algo, cudnn_workspace, workspace_size_in_bytes, &beta, - cudnn_output_desc, output_data + output_offset * g)); + auto cudnn_func = [&](void* cudnn_workspace) { + CUDNN_ENFORCE(platform::dynload::cudnnConvolutionBackwardData( + handle, &alpha, cudnn_filter_desc, filter_data + filter_offset * g, + cudnn_input_desc, input_data + input_offset * g, cudnn_conv_desc, + algo, cudnn_workspace, workspace_size_in_bytes, &beta, + cudnn_output_desc, output_data + output_offset * g)); + }; + dev_ctx.RunCudnnFuncWithWorkspace(cudnn_func, workspace_size_in_bytes); } - - // Release the cudnn workspace - paddle::memory::Free(gpu, cudnn_workspace); } }; @@ -206,11 +201,6 @@ class CUDNNConvTransposeGradOpKernel : public framework::OpKernel { std::max(workspace_size_in_bytes, bwd_filter_ws_size); } - // ------------------- cudnn conv workspace --------------------- - // Already on GPU - void* cudnn_workspace = nullptr; - platform::CUDAPlace gpu = boost::get(ctx.GetPlace()); - cudnn_workspace = paddle::memory::Alloc(gpu, workspace_size_in_bytes); // ------------------- cudnn conv backward data --------------------- // FIXME(typhoonzero): template type T may not be the same as cudnn call. int input_offset = input->numel() / input->dims()[0] / groups; @@ -222,12 +212,15 @@ class CUDNNConvTransposeGradOpKernel : public framework::OpKernel { T* input_grad_data = input_grad->mutable_data(ctx.GetPlace()); // Because beta is zero, it is unnecessary to reset input_grad. for (int g = 0; g < groups; g++) { - CUDNN_ENFORCE(platform::dynload::cudnnConvolutionForward( - handle, &alpha, cudnn_output_desc, - output_grad_data + output_grad_offset * g, cudnn_filter_desc, - filter_data + filter_offset * g, cudnn_conv_desc, data_algo, - cudnn_workspace, workspace_size_in_bytes, &beta, cudnn_input_desc, - input_grad_data + input_offset * g)); + auto cudnn_func = [&](void* cudnn_workspace) { + CUDNN_ENFORCE(platform::dynload::cudnnConvolutionForward( + handle, &alpha, cudnn_output_desc, + output_grad_data + output_grad_offset * g, cudnn_filter_desc, + filter_data + filter_offset * g, cudnn_conv_desc, data_algo, + cudnn_workspace, workspace_size_in_bytes, &beta, cudnn_input_desc, + input_grad_data + input_offset * g)); + }; + dev_ctx.RunCudnnFuncWithWorkspace(cudnn_func, workspace_size_in_bytes); } } @@ -237,17 +230,17 @@ class CUDNNConvTransposeGradOpKernel : public framework::OpKernel { // Because beta is zero, it is unnecessary to reset filter_grad. // Gradient with respect to the filter for (int g = 0; g < groups; g++) { - CUDNN_ENFORCE(platform::dynload::cudnnConvolutionBackwardFilter( - handle, &alpha, cudnn_output_desc, - output_grad_data + output_grad_offset * g, cudnn_input_desc, - input_data + input_offset * g, cudnn_conv_desc, filter_algo, - cudnn_workspace, workspace_size_in_bytes, &beta, cudnn_filter_desc, - filter_grad_data + filter_offset * g)); + auto cudnn_func = [&](void* cudnn_workspace) { + CUDNN_ENFORCE(platform::dynload::cudnnConvolutionBackwardFilter( + handle, &alpha, cudnn_output_desc, + output_grad_data + output_grad_offset * g, cudnn_input_desc, + input_data + input_offset * g, cudnn_conv_desc, filter_algo, + cudnn_workspace, workspace_size_in_bytes, &beta, + cudnn_filter_desc, filter_grad_data + filter_offset * g)); + }; + dev_ctx.RunCudnnFuncWithWorkspace(cudnn_func, workspace_size_in_bytes); } } - - // Release the cudnn workspace - paddle::memory::Free(gpu, cudnn_workspace); } }; diff --git a/paddle/fluid/operators/cross_entropy_op.cc b/paddle/fluid/operators/cross_entropy_op.cc index 578ab63bc380ee62d76e34b7cf3cbd590bfa2eda..66f19fe7ecfa51b2ce917f0c5fcb6d486f1a7307 100644 --- a/paddle/fluid/operators/cross_entropy_op.cc +++ b/paddle/fluid/operators/cross_entropy_op.cc @@ -138,6 +138,11 @@ class CrossEntropyOpMaker : public framework::OpProtoAndCheckerMaker { "(bool, default false), a flag indicating whether to " "interpretate the given labels as soft labels.") .SetDefault(false); + AddAttr("ignore_index", + "(int, default -100), Specifies a target value that is" + "ignored and does not contribute to the input gradient." + "Only valid if soft_label is set to False") + .SetDefault(-100); AddComment(R"DOC( CrossEntropy Operator. diff --git a/paddle/fluid/operators/cross_entropy_op.h b/paddle/fluid/operators/cross_entropy_op.h index 36b58d80144d242277f6fc970a3a61a6721d4b50..03974a7fc511b1e1cb5b0eca532b260fdf9bf964 100644 --- a/paddle/fluid/operators/cross_entropy_op.h +++ b/paddle/fluid/operators/cross_entropy_op.h @@ -40,7 +40,7 @@ class CrossEntropyOpKernel : public framework::OpKernel { math::CrossEntropyFunctor()( ctx.template device_context(), &y_2d, &x_2d, &labels_2d, - ctx.Attr("soft_label")); + ctx.Attr("soft_label"), ctx.Attr("ignore_index")); } }; @@ -74,16 +74,22 @@ class XeGradFunctor { const T* dy, // NOLINT const T* x, // NOLINT const int64_t* label, // NOLINT - size_t num_classes) - : dx_(dx), dy_(dy), x_(x), label_(label), num_classes_(num_classes) {} + size_t num_classes, size_t ignore_index) + : dx_(dx), + dy_(dy), + x_(x), + label_(label), + num_classes_(num_classes), + ignore_index_(ignore_index) {} HOSTDEVICE void operator()(size_t sample_id) { auto x_is_true_offset = sample_id * num_classes_ + label_[sample_id]; for (size_t x_offset = sample_id * num_classes_; x_offset < (sample_id + 1) * num_classes_; ++x_offset) { - dx_[x_offset] = x_offset != x_is_true_offset - ? static_cast(0) - : -dy_[sample_id] / x_[x_offset]; + dx_[x_offset] = + (x_offset != x_is_true_offset || label_[sample_id] == ignore_index_) + ? static_cast(0) + : -dy_[sample_id] / x_[x_offset]; } } @@ -93,6 +99,7 @@ class XeGradFunctor { const T* x_; const int64_t* label_; size_t num_classes_; + size_t ignore_index_; }; template @@ -109,6 +116,7 @@ class CrossEntropyGradientOpKernel : public framework::OpKernel { // unnecessary to convert tensors to 2-D views. int rank = x->dims().size(); int64_t class_num = x->dims()[rank - 1]; + int64_t ignore_index = ctx.Attr("ignore_index"); if (ctx.Attr("soft_label")) { XeSoftlabelGradFunctor functor(dx_data, dy->data(), x->data(), label->data(), @@ -118,9 +126,9 @@ class CrossEntropyGradientOpKernel : public framework::OpKernel { static_cast(dx->numel())); for_range(functor); } else { - XeGradFunctor functor(dx_data, dy->data(), x->data(), - label->data(), - static_cast(class_num)); + XeGradFunctor functor( + dx_data, dy->data(), x->data(), label->data(), + static_cast(class_num), static_cast(ignore_index)); platform::ForRange for_range( ctx.template device_context(), static_cast(dy->numel())); diff --git a/paddle/fluid/operators/distributed/CMakeLists.txt b/paddle/fluid/operators/distributed/CMakeLists.txt index da5d20505e9b06c0717af8d79d5456a9ade1e89c..56734b81e8716a0c0c37a11e35c9118ee7b55020 100644 --- a/paddle/fluid/operators/distributed/CMakeLists.txt +++ b/paddle/fluid/operators/distributed/CMakeLists.txt @@ -20,6 +20,7 @@ if(WITH_GRPC) DEPS grpc++_unsecure grpc_unsecure gpr cares zlib protobuf sendrecvop_grpc scope profiler math_function SERIAL) cc_test(rpc_server_test SRCS rpc_server_test.cc DEPS sendrecvop_grpc grpc++_unsecure grpc_unsecure gpr cares zlib protobuf executor proto_desc lookup_sparse_table_op SERIAL) + cc_test(varhandle_test SRCS varhandle_test.cc) return() endif() diff --git a/paddle/fluid/operators/distributed/grpc_client.cc b/paddle/fluid/operators/distributed/grpc_client.cc index b4f60c9ff9a41d5cb7dbe4e7a7694a84bab8e940..07ac20797ddab54296a45e99915588a40cc6f3c7 100644 --- a/paddle/fluid/operators/distributed/grpc_client.cc +++ b/paddle/fluid/operators/distributed/grpc_client.cc @@ -59,40 +59,32 @@ GRPCClient::~GRPCClient() { } channels_.clear(); } - client_thread_->join(); } -bool GRPCClient::AsyncSendVar(const std::string& ep, - const platform::DeviceContext& ctx, - const framework::Scope& scope, - const std::string& var_name, int64_t time_out) { +VarHandlePtr GRPCClient::AsyncSendVar(const std::string& ep, + const platform::DeviceContext& ctx, + const framework::Scope& scope, + const std::string& var_name, + int64_t time_out) { const platform::DeviceContext* p_ctx = &ctx; const std::string ep_val = ep; const std::string var_name_val = var_name; const framework::Scope* p_scope = &scope; const auto ch = GetChannel(ep_val); + SendProcessor* s = new SendProcessor(ch); + VarHandlePtr h(new VarHandle(ep, "Send", var_name_val, p_ctx, p_scope)); + s->Prepare(h, time_out); - framework::AsyncIO([var_name_val, p_ctx, ep_val, p_scope, time_out, ch, - this] { + framework::AsyncIO([var_name_val, p_scope, p_ctx, s, this] { auto* var = p_scope->FindVar(var_name_val); ::grpc::ByteBuffer req; SerializeToByteBuffer(var_name_val, var, *p_ctx, &req); - // varhandle - VarHandle var_h; - var_h.ep = ep_val; - var_h.scope = p_scope; - var_h.name = var_name_val; - var_h.ctx = p_ctx; - var_h.method = "Send"; - - VLOG(3) << var_h.String() << " begin"; + VLOG(3) << s->GetVarHandlePtr()->String() << " begin"; // stub context - SendProcessor* s = new SendProcessor(ch); - s->Prepare(var_h, time_out); s->response_call_back_ = nullptr; auto call = s->stub_g_.PrepareUnaryCall( @@ -102,13 +94,13 @@ bool GRPCClient::AsyncSendVar(const std::string& ep, }); req_count_++; - return true; + return h; } void ProcGetResponse(const VarHandle& var_h, const ::grpc::ByteBuffer& ret_msg) { framework::Variable* outvar = nullptr; - DeserializeFromByteBuffer(ret_msg, *var_h.ctx, var_h.scope, &outvar); + DeserializeFromByteBuffer(ret_msg, *var_h.ctx(), var_h.scope(), &outvar); } template @@ -119,37 +111,30 @@ void RequestToByteBuffer(const T& proto, ::grpc::ByteBuffer* result) { result->Swap(&tmp); } -bool GRPCClient::AsyncGetVar(const std::string& ep, - const platform::DeviceContext& ctx, - const framework::Scope& scope, - const std::string& var_name, int64_t time_out) { +VarHandlePtr GRPCClient::AsyncGetVar(const std::string& ep, + const platform::DeviceContext& ctx, + const framework::Scope& scope, + const std::string& var_name, + int64_t time_out) { const platform::DeviceContext* p_ctx = &ctx; const std::string ep_val = ep; const std::string var_name_val = var_name; const framework::Scope* p_scope = &scope; const auto ch = GetChannel(ep_val); + GetProcessor* s = new GetProcessor(ch); + VarHandlePtr h(new VarHandle(ep, "Get", var_name_val, p_ctx, p_scope)); + s->Prepare(h, time_out); - framework::AsyncIO([var_name_val, ep_val, p_scope, p_ctx, time_out, ch, - this] { + framework::AsyncIO([var_name_val, p_scope, p_ctx, s, this] { // prepare input sendrecv::VariableMessage req; req.set_varname(var_name_val); ::grpc::ByteBuffer buf; RequestToByteBuffer(req, &buf); - // var handle - VarHandle var_h; - var_h.ep = ep_val; - var_h.scope = p_scope; - var_h.name = var_name_val; - var_h.ctx = p_ctx; - var_h.method = "Get"; - - VLOG(3) << var_h.String() << " begin"; + VLOG(3) << s->GetVarHandlePtr()->String() << " begin"; // stub context - GetProcessor* s = new GetProcessor(ch); - s->Prepare(var_h, time_out); s->response_call_back_ = ProcGetResponse; auto call = s->stub_g_.PrepareUnaryCall( @@ -160,42 +145,36 @@ bool GRPCClient::AsyncGetVar(const std::string& ep, req_count_++; - return true; + return h; } -bool GRPCClient::AsyncPrefetchVar(const std::string& ep, - const platform::DeviceContext& ctx, - const framework::Scope& scope, - const std::string& in_var_name, - const std::string& out_var_name, - int64_t time_out) { +VarHandlePtr GRPCClient::AsyncPrefetchVar(const std::string& ep, + const platform::DeviceContext& ctx, + const framework::Scope& scope, + const std::string& in_var_name, + const std::string& out_var_name, + int64_t time_out) { const platform::DeviceContext* p_ctx = &ctx; const std::string ep_val = ep; const std::string in_var_name_val = in_var_name; const std::string out_var_name_val = out_var_name; const framework::Scope* p_scope = &scope; const auto ch = GetChannel(ep_val); + GetProcessor* s = new GetProcessor(ch); + VarHandlePtr h( + new VarHandle(ep, "Prefetch", out_var_name_val, p_ctx, p_scope)); + s->Prepare(h, time_out); framework::AsyncIO([in_var_name_val, out_var_name_val, ep_val, p_scope, p_ctx, - time_out, ch, this] { + time_out, s, this] { auto* var = p_scope->FindVar(in_var_name_val); ::grpc::ByteBuffer req; SerializeToByteBuffer(in_var_name_val, var, *p_ctx, &req, out_var_name_val); - // var handle - VarHandle var_h; - var_h.ep = ep_val; - var_h.scope = p_scope; - var_h.name = out_var_name_val; - var_h.ctx = p_ctx; - var_h.method = "Prefetch"; - - VLOG(3) << var_h.String() << " begin"; + VLOG(3) << s->GetVarHandlePtr()->String() << " begin"; // stub context - GetProcessor* s = new GetProcessor(ch); - s->Prepare(var_h, time_out); s->response_call_back_ = ProcGetResponse; auto call = s->stub_g_.PrepareUnaryCall( @@ -206,56 +185,68 @@ bool GRPCClient::AsyncPrefetchVar(const std::string& ep, }); req_count_++; - return true; + return h; } -void GRPCClient::AsyncSendBatchBarrier(const std::string& ep, - int64_t time_out) { +VarHandlePtr GRPCClient::AsyncSendBatchBarrier(const std::string& ep, + int64_t time_out) { const auto ch = GetChannel(ep); BatchBarrierProcessor* s = new BatchBarrierProcessor(ch); - s->Prepare(time_out); + VarHandlePtr h(new VarHandle(ep, "BatchBarrier", BATCH_BARRIER_MESSAGE, + nullptr, nullptr)); + s->Prepare(h, time_out); sendrecv::VariableMessage req; req.set_varname(BATCH_BARRIER_MESSAGE); auto rpc = s->stub_->AsyncSendVariable(s->context_.get(), req, &cq_); rpc->Finish(&s->reply_, &s->status_, reinterpret_cast(s)); req_count_++; + return h; } -void GRPCClient::AsyncSendFetchBarrier(const std::string& ep, - int64_t time_out) { +VarHandlePtr GRPCClient::AsyncSendFetchBarrier(const std::string& ep, + int64_t time_out) { const auto ch = GetChannel(ep); FetchBarrierProcessor* s = new FetchBarrierProcessor(ch); - s->Prepare(time_out); + VarHandlePtr h(new VarHandle(ep, "FetchBarrier", FETCH_BARRIER_MESSAGE, + nullptr, nullptr)); + s->Prepare(h, time_out); sendrecv::VariableMessage req; req.set_varname(FETCH_BARRIER_MESSAGE); auto rpc = s->stub_->AsyncGetVariable(s->context_.get(), req, &cq_); rpc->Finish(&s->reply_, &s->status_, reinterpret_cast(s)); req_count_++; + return h; } -void GRPCClient::AsyncSendComplete(const std::string& ep, int64_t time_out) { +VarHandlePtr GRPCClient::AsyncSendComplete(const std::string& ep, + int64_t time_out) { const auto ch = GetChannel(ep); BatchBarrierProcessor* s = new BatchBarrierProcessor(ch); - s->Prepare(time_out); + VarHandlePtr h( + new VarHandle(ep, "SendComplete", COMPLETE_MESSAGE, nullptr, nullptr)); + s->Prepare(h, time_out); sendrecv::VariableMessage req; req.set_varname(COMPLETE_MESSAGE); auto rpc = s->stub_->AsyncSendVariable(s->context_.get(), req, &cq_); rpc->Finish(&s->reply_, &s->status_, reinterpret_cast(s)); req_count_++; + return h; } -void GRPCClient::AsyncCheckpointNotify(const std::string& ep, - const std::string& dir, - int64_t time_out) { +VarHandlePtr GRPCClient::AsyncCheckpointNotify(const std::string& ep, + const std::string& dir, + int64_t time_out) { const auto ch = GetChannel(ep); CheckpointNotifyProcessor* s = new CheckpointNotifyProcessor(ch); - s->Prepare(time_out); + VarHandlePtr h(new VarHandle(ep, "CheckPointNotify", CHECKPOINT_SAVE_MESSAGE, + nullptr, nullptr)); + s->Prepare(h, time_out); sendrecv::VariableMessage req; req.set_varname(CHECKPOINT_SAVE_MESSAGE); @@ -264,6 +255,7 @@ void GRPCClient::AsyncCheckpointNotify(const std::string& ep, auto rpc = s->stub_->AsyncCheckpointNotify(s->context_.get(), req, &cq_); rpc->Finish(&s->reply_, &s->status_, reinterpret_cast(s)); req_count_++; + return h; } bool GRPCClient::Wait() { @@ -276,25 +268,28 @@ void GRPCClient::Proceed() { void* tag = nullptr; bool ok = false; + VLOG(3) << "GRPCClient Proceed begin"; while (!stopped_ && cq_.Next(&tag, &ok)) { BaseProcessor* c = static_cast(tag); GPR_ASSERT(ok); PADDLE_ENFORCE(c); if (c->status_.ok()) { - VLOG(3) << c->var_h_.String() << " process"; + VLOG(3) << c->GetVarHandlePtr()->String() << " process"; c->Process(); } else if (c->status_.error_code() == grpc::StatusCode::DEADLINE_EXCEEDED) { - LOG(ERROR) << c->var_h_.String() + LOG(ERROR) << c->GetVarHandlePtr()->String() << " meets grpc error:" << c->status_.error_message(); { std::lock_guard lk(sync_mutex_); ok_ = false; } - sync_cond_.notify_all(); + c->Finish(false); } else { - LOG(FATAL) << c->var_h_.String() + LOG(FATAL) << c->GetVarHandlePtr()->String() << " meets grpc error:" << c->status_.error_message(); + c->Finish(false); } + delete c; { std::lock_guard lk(sync_mutex_); @@ -302,6 +297,7 @@ void GRPCClient::Proceed() { } sync_cond_.notify_all(); } + VLOG(3) << "GRPCClient Proceed end"; } std::shared_ptr GRPCClient::GetChannel(const std::string& ep) { diff --git a/paddle/fluid/operators/distributed/grpc_client.h b/paddle/fluid/operators/distributed/grpc_client.h index 0c95ffeb5ce7e1586c5968fb122acd12c0c0196e..75a3662316462a222760bfbb7d7906c70f46d143 100644 --- a/paddle/fluid/operators/distributed/grpc_client.h +++ b/paddle/fluid/operators/distributed/grpc_client.h @@ -53,15 +53,14 @@ void ProcGetResponse(const VarHandle& var_h, const grpc::ByteBuffer& msg); class BaseProcessor { public: - explicit BaseProcessor(std::shared_ptr ch) { - context_ = nullptr; - } + BaseProcessor() { context_ = nullptr; } virtual ~BaseProcessor() {} - virtual void Prepare(const VarHandle& var_info, int64_t time_out) { + virtual void Prepare(VarHandlePtr h, int64_t time_out) { + var_h_ = h; + context_.reset(new grpc::ClientContext()); - var_h_ = var_info; context_->set_wait_for_ready(true); if (time_out) { std::chrono::system_clock::time_point deadline = @@ -71,21 +70,21 @@ class BaseProcessor { } } - virtual void Prepare(int64_t time_out) { - context_.reset(new grpc::ClientContext()); - context_->set_wait_for_ready(true); - - std::chrono::system_clock::time_point deadline = - std::chrono::system_clock::now() + std::chrono::milliseconds(time_out); - - context_->set_deadline(deadline); + void Process() { + ProcessImpl(); + var_h_->Finish(true); } - virtual void Process() = 0; + VarHandlePtr GetVarHandlePtr() { return var_h_; } + bool Wait() { return var_h_->Wait(); } + void Finish(bool ok) { return var_h_->Finish(ok); } + virtual void ProcessImpl() = 0; std::unique_ptr context_; grpc::Status status_; - VarHandle var_h_; + + protected: + VarHandlePtr var_h_; }; typedef std::function @@ -94,13 +93,13 @@ typedef std::function class SendProcessor : public BaseProcessor { public: explicit SendProcessor(std::shared_ptr ch) - : BaseProcessor(ch), stub_g_(ch) {} + : BaseProcessor(), stub_g_(ch) {} virtual ~SendProcessor() {} - virtual void Process() { + void ProcessImpl() override { if (response_call_back_) { - response_call_back_(var_h_, reply_); + response_call_back_(*var_h_.get(), reply_); } } @@ -115,13 +114,13 @@ typedef std::function class GetProcessor : public BaseProcessor { public: explicit GetProcessor(std::shared_ptr ch) - : BaseProcessor(ch), stub_g_(ch) {} + : BaseProcessor(), stub_g_(ch) {} virtual ~GetProcessor() {} - virtual void Process() { + void ProcessImpl() override { if (response_call_back_) { - response_call_back_(var_h_, reply_); + response_call_back_(*var_h_.get(), reply_); } } @@ -133,13 +132,13 @@ class GetProcessor : public BaseProcessor { class BatchBarrierProcessor : public BaseProcessor { public: explicit BatchBarrierProcessor(std::shared_ptr ch) - : BaseProcessor(ch) { + : BaseProcessor() { stub_ = sendrecv::SendRecvService::NewStub(ch); } virtual ~BatchBarrierProcessor() {} - virtual void Process() {} + void ProcessImpl() override {} sendrecv::VoidMessage reply_; std::unique_ptr stub_; }; @@ -147,13 +146,13 @@ class BatchBarrierProcessor : public BaseProcessor { class FetchBarrierProcessor : public BaseProcessor { public: explicit FetchBarrierProcessor(std::shared_ptr ch) - : BaseProcessor(ch) { + : BaseProcessor() { stub_ = sendrecv::SendRecvService::NewStub(ch); } virtual ~FetchBarrierProcessor() {} - virtual void Process() {} + void ProcessImpl() override {} sendrecv::VariableMessage reply_; std::unique_ptr stub_; }; @@ -161,13 +160,13 @@ class FetchBarrierProcessor : public BaseProcessor { class CheckpointNotifyProcessor : public BaseProcessor { public: explicit CheckpointNotifyProcessor(std::shared_ptr ch) - : BaseProcessor(ch) { + : BaseProcessor() { stub_ = sendrecv::SendRecvService::NewStub(ch); } virtual ~CheckpointNotifyProcessor() {} - virtual void Process() {} + void ProcessImpl() override {} sendrecv::VoidMessage reply_; std::unique_ptr stub_; }; @@ -177,32 +176,37 @@ class GRPCClient : public RPCClient { GRPCClient() : ok_(true), completed_(false), stopped_(false) {} virtual ~GRPCClient(); - bool AsyncSendVar(const std::string& ep, const platform::DeviceContext& ctx, - const framework::Scope& scope, const std::string& var_name, - int64_t time_out = FLAGS_rpc_deadline) override; - - bool AsyncGetVar(const std::string& ep, const platform::DeviceContext& ctx, - const framework::Scope& scope, const std::string& var_name, - int64_t time_out = FLAGS_rpc_deadline) override; - - bool AsyncPrefetchVar(const std::string& ep, - const platform::DeviceContext& ctx, - const framework::Scope& scope, - const std::string& in_var_name, - const std::string& out_var_name, - int64_t time_out = FLAGS_rpc_deadline) override; - - void AsyncSendBatchBarrier(const std::string& ep, - int64_t time_out = FLAGS_rpc_deadline) override; - - void AsyncSendFetchBarrier(const std::string& ep, - int64_t time_out = FLAGS_rpc_deadline) override; - - void AsyncCheckpointNotify(const std::string& ep, const std::string& dir, - int64_t time_out = FLAGS_rpc_deadline) override; - - void AsyncSendComplete(const std::string& ep, - int64_t time_out = FLAGS_rpc_deadline) override; + VarHandlePtr AsyncSendVar(const std::string& ep, + const platform::DeviceContext& ctx, + const framework::Scope& scope, + const std::string& var_name, + int64_t time_out = FLAGS_rpc_deadline) override; + + VarHandlePtr AsyncGetVar(const std::string& ep, + const platform::DeviceContext& ctx, + const framework::Scope& scope, + const std::string& var_name, + int64_t time_out = FLAGS_rpc_deadline) override; + + VarHandlePtr AsyncPrefetchVar(const std::string& ep, + const platform::DeviceContext& ctx, + const framework::Scope& scope, + const std::string& in_var_name, + const std::string& out_var_name, + int64_t time_out = FLAGS_rpc_deadline) override; + + VarHandlePtr AsyncSendBatchBarrier( + const std::string& ep, int64_t time_out = FLAGS_rpc_deadline) override; + + VarHandlePtr AsyncSendFetchBarrier( + const std::string& ep, int64_t time_out = FLAGS_rpc_deadline) override; + + VarHandlePtr AsyncCheckpointNotify( + const std::string& ep, const std::string& dir, + int64_t time_out = FLAGS_rpc_deadline) override; + + VarHandlePtr AsyncSendComplete( + const std::string& ep, int64_t time_out = FLAGS_rpc_deadline) override; bool Wait() override; diff --git a/paddle/fluid/operators/distributed/request_handler.h b/paddle/fluid/operators/distributed/request_handler.h index 64ac7281848f91302bc0aa3cb81dd198e56fb653..3c3f9d17c871ac1cb4df83db17cf489d5b9e0563 100644 --- a/paddle/fluid/operators/distributed/request_handler.h +++ b/paddle/fluid/operators/distributed/request_handler.h @@ -28,6 +28,7 @@ #include "paddle/fluid/framework/scope.h" #include "paddle/fluid/framework/selected_rows.h" #include "paddle/fluid/framework/var_type.h" +#include "paddle/fluid/platform/macros.h" namespace paddle { namespace operators { @@ -49,23 +50,77 @@ constexpr char kRequestPassBarrier[] = "RequestPassBarrier"; class RPCServer; -struct VarHandle { - // RPC endpoint. - std::string ep; - const platform::DeviceContext* ctx; - const framework::Scope* scope; - // Variable name. - std::string name; - // RPC method name. - std::string method; +class VarHandle { + public: + VarHandle(const std::string ep, const std::string& method, + const std::string& name, + const platform::DeviceContext* p_ctx = nullptr, + const framework::Scope* p_scope = nullptr) + : ok_(kVarHandleDefaultState) { + ep_ = ep; + ctx_ = p_ctx; + scope_ = p_scope; + name_ = name; + method_ = method; + } + + virtual ~VarHandle() {} + + public: + bool Wait() { + { + std::unique_lock lk(sync_mutex_); + wait_cond_.wait(lk, [this] { return ok_ != kVarHandleDefaultState; }); + } + VLOG(7) << "VarHandle wait:" << ok_; + return ok_ != 0; + } + + void Finish(bool ok) { + { + std::unique_lock lk(sync_mutex_); + ok_ = ok; + } + VLOG(7) << "VarHandle finish:" << ok; + wait_cond_.notify_all(); + } std::string String() const { std::ostringstream s; - s << method << " name:[" << name << "], ep:[" << ep << "]"; + s << method_ << " name:[" << name_ << "], ep:[" << ep_ << "], ok:[" << ok_ + << "]"; return s.str(); } + + std::string ep() const { return ep_; } + const platform::DeviceContext* ctx() const { return ctx_; } + const framework::Scope* scope() const { return scope_; } + std::string name() const { return name_; } + std::string method() const { return method_; } + + protected: + // RPC endpoint. + std::string ep_; + const platform::DeviceContext* ctx_; + const framework::Scope* scope_; + // Variable name. + std::string name_; + // RPC method name. + std::string method_; + + protected: + std::mutex sync_mutex_; + std::condition_variable wait_cond_; + int ok_; + + static const int kVarHandleDefaultState = -1; + + private: + DISABLE_COPY_AND_ASSIGN(VarHandle); }; +typedef std::shared_ptr VarHandlePtr; + class RequestHandler { public: explicit RequestHandler(bool sync_mode) diff --git a/paddle/fluid/operators/distributed/request_handler_impl.cc b/paddle/fluid/operators/distributed/request_handler_impl.cc index 31159a02592a2aff75f7ecf5be924989f0f47071..849e412504eb9180b746db65fd4fa353ed0c05a1 100644 --- a/paddle/fluid/operators/distributed/request_handler_impl.cc +++ b/paddle/fluid/operators/distributed/request_handler_impl.cc @@ -67,24 +67,11 @@ bool RequestSendHandler::Handle(const std::string& varname, LOG(FATAL) << "sync: Can not find server side var: " << varname; return false; } - - if (invar->IsType()) { - std::unique_lock lock(mutex_sparse_vars_); - sparse_vars_.push_back(invar); - } } } return true; } -void RequestSendHandler::ResetSparseVarRecorder() { - std::unique_lock lock(mutex_sparse_vars_); - for (auto* var : sparse_vars_) { - var->GetMutable()->mutable_rows()->clear(); - } - sparse_vars_.clear(); -} - bool RequestGetHandler::Handle(const std::string& varname, framework::Scope* scope, framework::Variable* invar, diff --git a/paddle/fluid/operators/distributed/request_handler_impl.h b/paddle/fluid/operators/distributed/request_handler_impl.h index 87185500f2ffc3a8578eea339cc7a1e2b0e46631..8be5b21bb89a580f4091de19186fd2d7e5802478 100644 --- a/paddle/fluid/operators/distributed/request_handler_impl.h +++ b/paddle/fluid/operators/distributed/request_handler_impl.h @@ -41,11 +41,6 @@ class RequestSendHandler final : public RequestHandler { bool Handle(const std::string& varname, framework::Scope* scope, framework::Variable* var, framework::Variable** outvar, const std::string& out_var_name = "") override; - void ResetSparseVarRecorder(); - - private: - std::mutex mutex_sparse_vars_; - std::vector sparse_vars_; }; class RequestGetHandler final : public RequestHandler { diff --git a/paddle/fluid/operators/distributed/rpc_client.h b/paddle/fluid/operators/distributed/rpc_client.h index 22a022a5d25e5c6628b80294494b87ca105a04c7..3539ee5e459d6dfe0b6510806464bcc6817910bb 100644 --- a/paddle/fluid/operators/distributed/rpc_client.h +++ b/paddle/fluid/operators/distributed/rpc_client.h @@ -14,12 +14,14 @@ #pragma once +#include // NOLINT #include #include "gflags/gflags.h" #include "paddle/fluid/framework/data_type.h" #include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/scope.h" +#include "paddle/fluid/operators/distributed/request_handler.h" DECLARE_int32(rpc_deadline); @@ -31,37 +33,36 @@ class RPCClient { public: RPCClient() {} virtual ~RPCClient() {} - virtual bool AsyncSendVar(const std::string& ep, - const platform::DeviceContext& ctx, - const framework::Scope& scope, - const std::string& var_name, - int64_t time_out = FLAGS_rpc_deadline) = 0; - - virtual bool AsyncGetVar(const std::string& ep, - const platform::DeviceContext& ctx, - const framework::Scope& scope, - const std::string& var_name, - int64_t time_out = FLAGS_rpc_deadline) = 0; - - virtual bool AsyncPrefetchVar(const std::string& ep, - const platform::DeviceContext& ctx, - const framework::Scope& scope, - const std::string& in_var_name, - const std::string& out_var_name, - int64_t time_out = FLAGS_rpc_deadline) = 0; - - virtual void AsyncSendBatchBarrier(const std::string& ep, - int64_t time_out = FLAGS_rpc_deadline) = 0; - - virtual void AsyncSendFetchBarrier(const std::string& ep, - int64_t time_out = FLAGS_rpc_deadline) = 0; - - virtual void AsyncCheckpointNotify(const std::string& ep, - const std::string& dir, - int64_t time_out = FLAGS_rpc_deadline) = 0; - - virtual void AsyncSendComplete(const std::string& ep, - int64_t time_out = FLAGS_rpc_deadline) = 0; + virtual VarHandlePtr AsyncSendVar(const std::string& ep, + const platform::DeviceContext& ctx, + const framework::Scope& scope, + const std::string& var_name, + int64_t time_out = FLAGS_rpc_deadline) = 0; + + virtual VarHandlePtr AsyncGetVar(const std::string& ep, + const platform::DeviceContext& ctx, + const framework::Scope& scope, + const std::string& var_name, + int64_t time_out = FLAGS_rpc_deadline) = 0; + + virtual VarHandlePtr AsyncPrefetchVar( + const std::string& ep, const platform::DeviceContext& ctx, + const framework::Scope& scope, const std::string& in_var_name, + const std::string& out_var_name, + int64_t time_out = FLAGS_rpc_deadline) = 0; + + virtual VarHandlePtr AsyncSendBatchBarrier( + const std::string& ep, int64_t time_out = FLAGS_rpc_deadline) = 0; + + virtual VarHandlePtr AsyncSendFetchBarrier( + const std::string& ep, int64_t time_out = FLAGS_rpc_deadline) = 0; + + virtual VarHandlePtr AsyncCheckpointNotify( + const std::string& ep, const std::string& dir, + int64_t time_out = FLAGS_rpc_deadline) = 0; + + virtual VarHandlePtr AsyncSendComplete( + const std::string& ep, int64_t time_out = FLAGS_rpc_deadline) = 0; // Complete tells all the pserver instances that finishe the training, // the pserver can reduce it's barrier count, and continue to train diff --git a/paddle/fluid/operators/distributed/rpc_server.cc b/paddle/fluid/operators/distributed/rpc_server.cc index 406e7294c190172347d432fb155c2a81c43dda25..084480ae48b8b9267ade1a840f6a70519cb28e48 100644 --- a/paddle/fluid/operators/distributed/rpc_server.cc +++ b/paddle/fluid/operators/distributed/rpc_server.cc @@ -101,6 +101,8 @@ void RPCServer::Complete() { { std::unique_lock lock(mutex_); client_num_--; + need_reset_all_vars_ = true; + VLOG(4) << "decrease client_num to: " << client_num_; if (cur_cond_.load() == rpc_cond_map_[kRequestGet]) { barrier_counter_[kRequestGet]--; @@ -109,6 +111,11 @@ void RPCServer::Complete() { barrier_cond_.notify_all(); } +bool RPCServer::NeedResetAllVars() { + std::unique_lock lock(mutex_); + return need_reset_all_vars_; +} + int RPCServer::GetClientNum() { std::unique_lock lock(mutex_); return client_num_; @@ -120,6 +127,7 @@ void RPCServer::ResetBarrierCounter() { for (auto& t : barrier_counter_) { t.second = 0; } + need_reset_all_vars_ = false; } void RPCServer::RegisterRPC(const std::string& rpc_name, diff --git a/paddle/fluid/operators/distributed/rpc_server.h b/paddle/fluid/operators/distributed/rpc_server.h index d813ba03e2fbec6e808f59f814a9b2f4bfbcd77b..d88e8c640ffb5ea44e88318cc973c9a783862435 100644 --- a/paddle/fluid/operators/distributed/rpc_server.h +++ b/paddle/fluid/operators/distributed/rpc_server.h @@ -49,7 +49,8 @@ class RPCServer { bind_address_(address), exit_flag_(false), selected_port_(0), - client_num_(client_num) {} + client_num_(client_num), + need_reset_all_vars_(false) {} virtual ~RPCServer() {} virtual void StartServer() = 0; @@ -86,6 +87,8 @@ class RPCServer { void ResetBarrierCounter(); RPCServerProfiler& Profiler() { return profiler_; } + bool NeedResetAllVars(); + protected: virtual void ShutDownImpl() = 0; @@ -104,6 +107,7 @@ class RPCServer { std::atomic exit_flag_; int selected_port_; int client_num_; + bool need_reset_all_vars_; std::unordered_map rpc_call_map_; std::unordered_map rpc_thread_num_; diff --git a/paddle/fluid/operators/distributed/varhandle_test.cc b/paddle/fluid/operators/distributed/varhandle_test.cc new file mode 100644 index 0000000000000000000000000000000000000000..a0fcaf886475c5e03d959ffd6af22b2123526b9f --- /dev/null +++ b/paddle/fluid/operators/distributed/varhandle_test.cc @@ -0,0 +1,55 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include +#include +#include // NOLINT + +#include "google/protobuf/text_format.h" +#include "gtest/gtest.h" +#include "paddle/fluid/operators/distributed/request_handler.h" + +using paddle::operators::distributed::VarHandlePtr; +using paddle::operators::distributed::VarHandle; + +void WaitTrue(VarHandlePtr s) { EXPECT_TRUE(s->Wait()); } + +void WaitFalse(VarHandlePtr s) { EXPECT_FALSE(s->Wait()); } + +TEST(VarHandle, Run) { + std::vector a; + for (int i = 0; i < 12; i++) { + VarHandlePtr s(new VarHandle("", "", "", nullptr, nullptr)); + a.push_back(s); + } + + std::vector> t; + for (int i = 0; i < 6; i++) { + t.emplace_back(new std::thread(WaitFalse, a[i])); + } + + for (int i = 0; i < 6; i++) { + a[i]->Finish(false); + t[i]->join(); + } + + for (int i = 6; i < 12; i++) { + t.emplace_back(new std::thread(WaitTrue, a[i])); + } + + for (int i = 6; i < 12; i++) { + a[i]->Finish(true); + t[i]->join(); + } +} diff --git a/paddle/fluid/operators/fusion_gru_op.cc b/paddle/fluid/operators/fusion_gru_op.cc index 916f84cb4a78c3721cb67bd3cf8d3759a8eaf1bf..31e87d9113118ebe7a4b25ffee5ba55e2714fb66 100644 --- a/paddle/fluid/operators/fusion_gru_op.cc +++ b/paddle/fluid/operators/fusion_gru_op.cc @@ -25,14 +25,14 @@ namespace paddle { namespace operators { void FusionGRUOp::InferShape(framework::InferShapeContext* ctx) const { - PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) of GRU should not be null."); + PADDLE_ENFORCE(ctx->HasInput("X"), "Assert only one Input(X) of GRU."); PADDLE_ENFORCE(ctx->HasInput("WeightX"), - "Input(WeightX) of GRU should not be null."); + "Assert only one Input(WeightX) of GRU."); PADDLE_ENFORCE(ctx->HasInput("WeightH"), - "Input(WeightH) of GRU should not be null."); - PADDLE_ENFORCE(ctx->HasOutput("XX"), "Output(XX) of GRU should not be null."); + "Assert only one Input(WeightH) of GRU."); + PADDLE_ENFORCE(ctx->HasOutput("XX"), "Assert only one Output(XX) of GRU."); PADDLE_ENFORCE(ctx->HasOutput("Hidden"), - "Output(Hidden) of GRU should not be null."); + "Assert only one Output(Hidden) of GRU."); auto x_dims = ctx->GetInputDim("X"); PADDLE_ENFORCE_EQ(x_dims.size(), 2, "Input(X)'s rank must be 2."); @@ -80,11 +80,11 @@ void FusionGRUOp::InferShape(framework::InferShapeContext* ctx) const { } else { xx_width = x_dims[1] > wx_dims[1] ? wx_dims[1] : x_dims[1]; PADDLE_ENFORCE(ctx->HasOutput("ReorderedH0"), - "Output(ReorderedH0) of GRU should not be null."); + "Assert only one Output(ReorderedH0) of GRU."); PADDLE_ENFORCE(ctx->HasOutput("BatchedInput"), - "Output(BatchedInput) of GRU should not be null."); + "Assert only one Output(BatchedInput) of GRU."); PADDLE_ENFORCE(ctx->HasOutput("BatchedOut"), - "Output(BatchedOut) of GRU should not be null."); + "Assert only one Output(BatchedOut) of GRU."); ctx->SetOutputDim("BatchedInput", {x_dims[0], wx_dims[1]}); ctx->SetOutputDim("BatchedOut", out_dims); } diff --git a/paddle/fluid/operators/fusion_lstm_op.cc b/paddle/fluid/operators/fusion_lstm_op.cc index ef23ab3f981786d33567619ad0194d21f31bdc8e..55e465e3af08c012b8cff7714452ed32b32a5556 100644 --- a/paddle/fluid/operators/fusion_lstm_op.cc +++ b/paddle/fluid/operators/fusion_lstm_op.cc @@ -24,20 +24,17 @@ namespace paddle { namespace operators { void FusionLSTMOp::InferShape(framework::InferShapeContext* ctx) const { - PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) of LSTM should not be null."); + PADDLE_ENFORCE(ctx->HasInput("X"), "Assert only one Input(X) of LSTM."); PADDLE_ENFORCE(ctx->HasInput("WeightX"), - "Input(WeightX) of LSTM should not be null."); + "Assert only one Input(WeightX) of LSTM."); PADDLE_ENFORCE(ctx->HasInput("WeightH"), - "Input(WeightH) of LSTM should not be null."); - PADDLE_ENFORCE(ctx->HasInput("Bias"), - "Input(Bias) of LSTM should not be null."); - - PADDLE_ENFORCE(ctx->HasOutput("XX"), - "Output(XX) of LSTM should not be null."); + "Assert only one Input(WeightH) of LSTM."); + PADDLE_ENFORCE(ctx->HasInput("Bias"), "Assert only one Input(Bias) of LSTM."); + PADDLE_ENFORCE(ctx->HasOutput("XX"), "Assert only one Output(XX) of LSTM."); PADDLE_ENFORCE(ctx->HasOutput("Hidden"), - "Output(Hidden) of LSTM should not be null."); + "Assert only one Output(Hidden) of LSTM."); PADDLE_ENFORCE(ctx->HasOutput("Cell"), - "Output(Cell) of LSTM should not be null."); + "Assert only one Output(Cell) of LSTM."); auto x_dims = ctx->GetInputDim("X"); PADDLE_ENFORCE_EQ(x_dims.size(), 2, "Input(X)'s rank must be 2."); @@ -96,15 +93,15 @@ void FusionLSTMOp::InferShape(framework::InferShapeContext* ctx) const { } else { xx_width = x_dims[1] > wx_dims[1] ? wx_dims[1] : x_dims[1]; PADDLE_ENFORCE(ctx->HasOutput("BatchedInput"), - "Output(BatchedInput) of LSTM should not be null."); + "Assert only one Output(BatchedInput) of LSTM."); PADDLE_ENFORCE(ctx->HasOutput("BatchedHidden"), - "Output(BatchedHidden) of LSTM should not be null."); + "Assert only one Output(BatchedHidden) of LSTM."); PADDLE_ENFORCE(ctx->HasOutput("BatchedCell"), - "Output(BatchedCell) of LSTM should not be null."); + "Assert only one Output(BatchedCell) of LSTM."); PADDLE_ENFORCE(ctx->HasOutput("ReorderedH0"), - "Output(ReorderedH0) of LSTM should not be null."); + "Assert only one Output(ReorderedH0) of LSTM"); PADDLE_ENFORCE(ctx->HasOutput("ReorderedC0"), - "Output(ReorderedC0) of LSTM should not be null."); + "Assert only one Output(ReorderedC0) of LSTM."); ctx->SetOutputDim("BatchedInput", {x_dims[0], wx_dims[1]}); ctx->SetOutputDim("BatchedHidden", out_dims); ctx->SetOutputDim("BatchedCell", out_dims); diff --git a/paddle/fluid/operators/listen_and_serv_op.cc b/paddle/fluid/operators/listen_and_serv_op.cc index 4cc2159d9f22809a640f82ad19415f3e5a2d9999..966d78b84130c172c41e8049bf6bb1dc659d7d48 100644 --- a/paddle/fluid/operators/listen_and_serv_op.cc +++ b/paddle/fluid/operators/listen_and_serv_op.cc @@ -22,6 +22,7 @@ limitations under the License. */ #include "gflags/gflags.h" #include "paddle/fluid/operators/detail/macros.h" +#include "paddle/fluid/operators/math/math_function.h" #include "paddle/fluid/operators/distributed/request_handler_impl.h" #include "paddle/fluid/operators/listen_and_serv_op.h" @@ -101,7 +102,7 @@ static int64_t GetTimestamp() { void ListenAndServOp::RunSyncLoop( framework::Executor *executor, framework::ProgramDesc *program, - framework::Scope *recv_scope, + framework::Scope *recv_scope, platform::DeviceContext *dev_ctx, const std::vector &prefetch_block_id_list, const int checkpoint_point_block_id) const { VLOG(2) << "RunSyncLoop"; @@ -128,6 +129,7 @@ void ListenAndServOp::RunSyncLoop( rpc_service_->SetCond(distributed::kRequestGet); rpc_service_->WaitBarrier(distributed::kRequestGet); rpc_service_->ResetBarrierCounter(); + while (true) { rpc_service_->Profiler().OneStep(); // Get from multiple trainers, we don't care about the order in which @@ -165,9 +167,7 @@ void ListenAndServOp::RunSyncLoop( recv_scope); VLOG(2) << "run all blocks spent " << GetTimestamp() - ts << "(ms)"; - // reset received sparse vars to avoid reuse it in the next mini-batch - dynamic_cast(request_send_handler_.get()) - ->ResetSparseVarRecorder(); + ResetReceivedVars(recv_scope, dev_ctx, rpc_service_->NeedResetAllVars()); rpc_service_->SetCond(distributed::kRequestGet); rpc_service_->WaitBarrier(distributed::kRequestGet); @@ -175,6 +175,42 @@ void ListenAndServOp::RunSyncLoop( } // while(true) } +void ListenAndServOp::ResetReceivedVars(framework::Scope *recv_scope, + platform::DeviceContext *dev_ctx, + bool reset_all) const { + for (auto &varname : sparse_vars_) { + auto var = recv_scope->FindVar(varname); + if (var == nullptr) { + VLOG(2) << "can not find var " << varname << " in received scope"; + continue; + } + if (var->IsType()) { + VLOG(3) << "reset sparse var: " << varname; + var->GetMutable()->mutable_rows()->clear(); + } else { + PADDLE_THROW("The type of sparse var should be SelectedRows"); + } + } + if (UNLIKELY(reset_all)) { + for (auto &varname : dense_vars_) { + auto var = recv_scope->FindVar(varname); + if (var == nullptr) { + VLOG(2) << "can not find var " << varname << " in received scope"; + continue; + } + if (var->IsType()) { + math::set_constant(*dev_ctx, var->GetMutable(), + static_cast(0)); + } else if (var->IsType()) { + math::set_constant(*dev_ctx, var->GetMutable(), + static_cast(0)); + } else { + PADDLE_THROW("The type of dense var should be in [LoDTensor, Tensor]"); + } + } + } +} + void ListenAndServOp::RunAsyncLoop(framework::Executor *executor, framework::ProgramDesc *program, framework::Scope *recv_scope) const { @@ -248,6 +284,25 @@ static void FillRequestCtx( h->SetCheckpointNotifyPreparedCtx(checkpoint_ctx); } +void ListenAndServOp::CacheVarsType(const std::vector &varnames, + const framework::Scope &scope) const { + for (const auto &varname : varnames) { + auto var = scope.FindVar(varname); + PADDLE_ENFORCE(var != nullptr, + "Received var should be initialized in the received scope."); + if (var->IsType()) { + sparse_vars_.push_back(varname); + } else if (var->IsType() || + var->IsType()) { + dense_vars_.push_back(varname); + } else { + PADDLE_THROW( + "The type of received var should be in [SelectedRows, LoDTensor, " + "Tensor]."); + } + } +} + void ListenAndServOp::RunImpl(const framework::Scope &scope, const platform::Place &dev_place) const { // Mark this as PS that it should decide profiling by listening from trainer. @@ -258,6 +313,7 @@ void ListenAndServOp::RunImpl(const framework::Scope &scope, bool sync_mode = Attr("sync_mode"); auto fan_in = Attr("Fanin"); + auto inputs = Inputs("X"); PADDLE_ENFORCE(!rpc_service_); std::string endpoint = Attr("endpoint"); @@ -348,11 +404,16 @@ void ListenAndServOp::RunImpl(const framework::Scope &scope, signal(SIGINT, SignalHandler::StopAndExit); signal(SIGTERM, SignalHandler::StopAndExit); + // Cache the type of the received vars as `sparse_vars_` and `dense_vars_` + // so that we can reset them at the end of each iteration. + // NOTE: only used in sync update + CacheVarsType(inputs, recv_scope); + // Write to a file of server selected port for python use. SavePort(); if (sync_mode) { - RunSyncLoop(&executor, program, &recv_scope, prefetch_block_id_list, - checkpoint_block_id); + RunSyncLoop(&executor, program, &recv_scope, &dev_ctx, + prefetch_block_id_list, checkpoint_block_id); } else { RunAsyncLoop(&executor, program, &recv_scope); } diff --git a/paddle/fluid/operators/listen_and_serv_op.h b/paddle/fluid/operators/listen_and_serv_op.h index 978969cc515c7954b59f2bf7a4f2c0e1b13f9bc0..5f889793ab16249a4e06801090db087a089dbed1 100644 --- a/paddle/fluid/operators/listen_and_serv_op.h +++ b/paddle/fluid/operators/listen_and_serv_op.h @@ -26,6 +26,7 @@ limitations under the License. */ #include "paddle/fluid/framework/threadpool.h" #include "paddle/fluid/operators/distributed/request_handler.h" #include "paddle/fluid/operators/distributed/rpc_server.h" +#include "paddle/fluid/platform/device_context.h" namespace paddle { namespace operators { @@ -48,6 +49,7 @@ class ListenAndServOp : public framework::OperatorBase { void RunSyncLoop(framework::Executor* executor, framework::ProgramDesc* program, framework::Scope* recv_scope, + platform::DeviceContext* dev_ctx, const std::vector& prefetch_block_id_list, const int checkpoint_point_block_id) const; @@ -64,6 +66,13 @@ class ListenAndServOp : public framework::OperatorBase { void RunImpl(const framework::Scope& scope, const platform::Place& dev_place) const override; + void ResetReceivedVars(framework::Scope* recv_scope, + platform::DeviceContext* dev_ctx, + bool reset_all = false) const; + + void CacheVarsType(const std::vector& varnames, + const framework::Scope& scope) const; + protected: mutable std::shared_ptr rpc_service_; mutable std::shared_ptr request_send_handler_; @@ -74,6 +83,8 @@ class ListenAndServOp : public framework::OperatorBase { request_checkpoint_handler_; mutable std::shared_ptr server_thread_; + mutable std::vector sparse_vars_; + mutable std::vector dense_vars_; }; class SignalHandler { diff --git a/paddle/fluid/operators/math/cross_entropy.cc b/paddle/fluid/operators/math/cross_entropy.cc index caff35e03ae3a144f799d982c859ded62cb3e93d..18bf1a66f6d9903f32048574dc93faf7e98953ac 100644 --- a/paddle/fluid/operators/math/cross_entropy.cc +++ b/paddle/fluid/operators/math/cross_entropy.cc @@ -28,7 +28,8 @@ class CrossEntropyFunctor { public: void operator()(const platform::CPUDeviceContext& ctx, framework::Tensor* out, const framework::Tensor* prob, - const framework::Tensor* labels, const bool softLabel) { + const framework::Tensor* labels, const bool softLabel, + const int ignore_index) { const int batch_size = prob->dims()[0]; if (softLabel) { auto in = EigenMatrix::From(*prob); @@ -49,8 +50,12 @@ class CrossEntropyFunctor { int lbl = label_data[i]; PADDLE_ENFORCE_GE(lbl, 0); PADDLE_ENFORCE_LT(lbl, class_num); + PADDLE_ENFORCE((lbl >= 0 && lbl < class_num) || lbl == ignore_index); int index = i * class_num + lbl; - loss_data[i] = -math::TolerableValue()(std::log(prob_data[index])); + loss_data[i] = + lbl == ignore_index + ? 0 + : -math::TolerableValue()(std::log(prob_data[index])); } } } diff --git a/paddle/fluid/operators/math/cross_entropy.cu b/paddle/fluid/operators/math/cross_entropy.cu index 0de58d5fddd84d33f708c4c73e5a19dc2fe8a86b..c92341ea55ea21773acba33665e267b2f1c25fe3 100644 --- a/paddle/fluid/operators/math/cross_entropy.cu +++ b/paddle/fluid/operators/math/cross_entropy.cu @@ -23,11 +23,14 @@ namespace math { namespace { template __global__ void CrossEntropyKernel(T* Y, const T* X, const int64_t* label, - const int N, const int D) { + const int N, const int D, + const int ignore_index) { for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < N; i += blockDim.x * gridDim.x) { - PADDLE_ASSERT(label[i] >= 0 && label[i] < D); - Y[i] = -math::TolerableValue()(log(X[i * D + label[i]])); + PADDLE_ASSERT(label[i] >= 0 && label[i] < D || label[i] == ignore_index); + Y[i] = ignore_index == label[i] + ? 0 + : -math::TolerableValue()(log(X[i * D + label[i]])); } } @@ -57,7 +60,8 @@ class CrossEntropyFunctor { public: void operator()(const platform::CUDADeviceContext& ctx, framework::Tensor* out, const framework::Tensor* prob, - const framework::Tensor* labels, bool softLabel) { + const framework::Tensor* labels, bool softLabel, + const int ignore_index) { const T* prob_data = prob->data(); T* loss_data = out->mutable_data(ctx.GetPlace()); @@ -77,7 +81,8 @@ class CrossEntropyFunctor { int block = 512; int grid = (batch_size + block - 1) / block; CrossEntropyKernel<<>>( - loss_data, prob_data, label_data, batch_size, class_num); + loss_data, prob_data, label_data, batch_size, class_num, + ignore_index); } } }; diff --git a/paddle/fluid/operators/math/cross_entropy.h b/paddle/fluid/operators/math/cross_entropy.h index adc5b3fe47cd3bf524eb56747b6bd51e345a2eb6..e8aeb5d0575ac0f6b8761e97896df73578e8a103 100644 --- a/paddle/fluid/operators/math/cross_entropy.h +++ b/paddle/fluid/operators/math/cross_entropy.h @@ -38,7 +38,8 @@ class CrossEntropyFunctor { public: void operator()(const DeviceContext& context, framework::Tensor* out, const framework::Tensor* prob, - const framework::Tensor* labels, const bool softLabel); + const framework::Tensor* labels, const bool softLabel, + const int ignore_index); }; } // namespace math } // namespace operators diff --git a/paddle/fluid/operators/norm_op.cu b/paddle/fluid/operators/norm_op.cu index 1d0021d33ff9ee65c3366183466b94266e6c2999..67449aa4c67bee6606928ef3a2d986a1bdec038f 100644 --- a/paddle/fluid/operators/norm_op.cu +++ b/paddle/fluid/operators/norm_op.cu @@ -1,4 +1,4 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -11,14 +11,151 @@ distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#define EIGEN_USE_GPU +#include +#include "cub/cub.cuh" #include "paddle/fluid/operators/norm_op.h" +namespace paddle { +namespace operators { + +__device__ __forceinline__ float square_root(float x) { return sqrtf(x); } + +__device__ __forceinline__ double square_root(double x) { return sqrt(x); } + +template +__global__ void Normalize(const T* x, const int pre, + const int axis_n, // dim in axis + const int post, const T eps, T* y, T* out_norm) { + typedef cub::BlockReduce BlockReduce; + __shared__ typename BlockReduce::TempStorage temp_storage; + int num = pre * post; + for (int i = blockIdx.x; i < num; i += gridDim.x) { + int base = (i / post) * post * axis_n + (i % post); + + T sum = 0.0; + __shared__ T norm; + for (int j = threadIdx.x; j < axis_n; j += blockDim.x) { + const T x_ij = x[base + j * post]; + sum += x_ij * x_ij; + } + T reduce_result = BlockReduce(temp_storage).Sum(sum); + + if (threadIdx.x == 0) { + norm = square_root(reduce_result + eps); + out_norm[i] = norm; + } + __syncthreads(); + for (int j = threadIdx.x; j < axis_n; j += blockDim.x) { + const int index = base + j * post; + y[index] = x[index] / norm; + } + } +} + +template +class NormCUDAKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto* in_x = ctx.Input("X"); + auto* out_y = ctx.Output("Out"); + auto* out_norm = ctx.Output("Norm"); + const T* x = in_x->data(); + T* y = out_y->mutable_data(ctx.GetPlace()); + T* norm = out_norm->mutable_data(ctx.GetPlace()); + + auto xdim = in_x->dims(); + auto ndim = out_norm->dims(); + int axis = ctx.Attr("axis"); + T eps = static_cast(ctx.Attr("epsilon")); + if (axis < 0) axis = xdim.size() + axis; + int pre, n, post; + GetDims(xdim, axis, &pre, &n, &post); + + auto& dev_ctx = ctx.cuda_device_context(); + + const int block = 512; + int max_threads = dev_ctx.GetMaxPhysicalThreadCount(); + const int max_blocks = std::max(max_threads / block, 1); + int grid = std::min(max_blocks, pre * post); + Normalize<<>>(x, pre, n, post, + eps, y, norm); + } +}; + +template +__global__ void NormalizeGradient(const T* x, const T* x_norm, const T* y_grad, + const int pre, const int axis_n, + const int post, T* x_grad) { + typedef cub::BlockReduce BlockReduce; + __shared__ typename BlockReduce::TempStorage temp_storage_sum; + int num = pre * post; + for (int i = blockIdx.x; i < num; i += gridDim.x) { + T sum = 0.0; + __shared__ T row_sum; + __shared__ T row_sqrt_norm; + __shared__ T row_norm; + + auto base = (i / post) * post * axis_n + (i % post); + + for (int j = threadIdx.x; j < axis_n; j += blockDim.x) { + int index = base + j * post; + sum += x[index] * y_grad[index]; + } + T reduce_result = BlockReduce(temp_storage_sum).Sum(sum); + + if (threadIdx.x == 0) { + row_sum = reduce_result; + row_sqrt_norm = x_norm[i]; + row_norm = row_sqrt_norm * row_sqrt_norm; + } + __syncthreads(); + for (int j = threadIdx.x; j < axis_n; j += blockDim.x) { + int index = base + j * post; + const T x_ij = x[index]; + const T dy_ij = y_grad[index]; + x_grad[index] = (dy_ij - x_ij * row_sum / row_norm) / row_sqrt_norm; + } + } +} + +template +class NormGradCUDAKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto* in_x = ctx.Input("X"); + auto* in_norm = ctx.Input("Norm"); + auto* in_dy = ctx.Input(framework::GradVarName("Out")); + auto* out_dx = ctx.Output(framework::GradVarName("X")); + T* dx = out_dx->mutable_data(ctx.GetPlace()); + const T* x = in_x->data(); + const T* x_norm = in_norm->data(); + const T* dy = in_dy->data(); + + auto xdim = in_x->dims(); + int axis = ctx.Attr("axis"); + if (axis < 0) axis = xdim.size() + axis; + int pre, n, post; + GetDims(xdim, axis, &pre, &n, &post); + + auto& dev_ctx = ctx.cuda_device_context(); + + const int block = 512; + int max_threads = dev_ctx.GetMaxPhysicalThreadCount(); + const int max_blocks = std::max(max_threads / block, 1); + int grid = std::min(max_blocks, pre * post); + NormalizeGradient<<>>( + x, x_norm, dy, pre, n, post, dx); + } +}; + +} // namespace operators +} // namespace paddle + namespace ops = paddle::operators; using CUDA = paddle::platform::CUDADeviceContext; -REGISTER_OP_CUDA_KERNEL(norm, ops::NormKernel, - ops::NormKernel); -REGISTER_OP_CUDA_KERNEL(norm_grad, ops::NormGradKernel, - ops::NormGradKernel); +REGISTER_OP_CUDA_KERNEL(norm, ops::NormCUDAKernel, + ops::NormCUDAKernel); +REGISTER_OP_CUDA_KERNEL(norm_grad, ops::NormGradCUDAKernel, + ops::NormGradCUDAKernel); diff --git a/paddle/fluid/operators/norm_op.h b/paddle/fluid/operators/norm_op.h index 3167bdc8ac718b23435690577e4163826d14a332..d0224177ecf7f0c918def08ff4dd6a3c8eb349d8 100644 --- a/paddle/fluid/operators/norm_op.h +++ b/paddle/fluid/operators/norm_op.h @@ -65,14 +65,17 @@ class NormKernel : public framework::OpKernel { Eigen::DSizes rdim(1); // y = x / sqrt((sum(x * x) + epsilon)) // norm = sqrt(sum(x * x) + epsilon) - auto sum = x.pow(2).sum(rdim) + eps; + auto x2 = x * x; + auto sum = x2.sum(rdim) + eps; norm.device(*place) = sum.sqrt(); + // y = x / norm Eigen::DSizes rshape(pre, 1, post); Eigen::DSizes bcast(1, n, 1); y.device(*place) = x / norm.reshape(rshape).broadcast(bcast); } }; + template class NormGradKernel : public framework::OpKernel { public: diff --git a/paddle/fluid/operators/prefetch_op.cc b/paddle/fluid/operators/prefetch_op.cc index 4b804740a06f9e29704f2b3f58a90191e3559347..0519c15e13aac99802ff0f95b975712b36b44246 100644 --- a/paddle/fluid/operators/prefetch_op.cc +++ b/paddle/fluid/operators/prefetch_op.cc @@ -44,16 +44,20 @@ class PrefetchOp : public framework::OperatorBase { distributed::RPCClient* rpc_client = distributed::RPCClient::GetInstance(); + std::vector rets; for (size_t i = 0; i < ins.size(); i++) { if (NeedSend(scope, ins[i])) { VLOG(3) << "sending " << ins[i] << " to " << epmap[i] << " to get " << outs[i] << " back"; - rpc_client->AsyncPrefetchVar(epmap[i], ctx, scope, ins[i], outs[i]); + rets.push_back(rpc_client->AsyncPrefetchVar(epmap[i], ctx, scope, + ins[i], outs[i])); } else { VLOG(3) << "don't send no-initialied variable: " << ins[i]; } } - PADDLE_ENFORCE(rpc_client->Wait(), "internal error in RPCClient"); + for (size_t i = 0; i < rets.size(); i++) { + PADDLE_ENFORCE(rets[i]->Wait(), "internal error in RPCClient"); + } } }; diff --git a/paddle/fluid/operators/recv_op.cc b/paddle/fluid/operators/recv_op.cc index a1f368e8690512cec2db7593aabc0279bbe174eb..4d34b8a1686efb1fc30020f0d27e9a3c3a6c0866 100644 --- a/paddle/fluid/operators/recv_op.cc +++ b/paddle/fluid/operators/recv_op.cc @@ -44,12 +44,15 @@ class RecvOp : public framework::OperatorBase { distributed::RPCClient* rpc_client = distributed::RPCClient::GetInstance(); + std::vector rets; for (size_t i = 0; i < outs.size(); i++) { VLOG(3) << "getting " << outs[i] << " from " << epmap[i]; - rpc_client->AsyncGetVar(epmap[i], ctx, scope, outs[i]); + rets.push_back(rpc_client->AsyncGetVar(epmap[i], ctx, scope, outs[i])); } if (sync_mode) { - PADDLE_ENFORCE(rpc_client->Wait(), "internal error in RPCClient"); + for (size_t i = 0; i < rets.size(); i++) { + PADDLE_ENFORCE(rets[i]->Wait(), "internal error in RPCClient"); + } } } }; diff --git a/paddle/fluid/operators/send_op.cc b/paddle/fluid/operators/send_op.cc index 82a70e4bf13247d784371ffdf419c9f792d7f721..48322ac7fd54a2e4cc3405a2c4dcddfc273f5a66 100644 --- a/paddle/fluid/operators/send_op.cc +++ b/paddle/fluid/operators/send_op.cc @@ -15,6 +15,7 @@ limitations under the License. */ #include // NOLINT #include +#include "paddle/fluid/framework/blocking_queue.h" #include "paddle/fluid/framework/data_type.h" #include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/op_registry.h" @@ -45,18 +46,19 @@ class SendOp : public framework::OperatorBase { distributed::RPCClient* rpc_client = distributed::RPCClient::GetInstance(); + std::vector rets; for (size_t i = 0; i < ins.size(); i++) { if (NeedSend(scope, ins[i])) { VLOG(3) << "sending " << ins[i] << " to " << epmap[i]; - // TODO(Yancey1989): we need to use an IO threadpool which has - // a larger number of threads than the computing threadpool. - rpc_client->AsyncSendVar(epmap[i], ctx, scope, ins[i]); + rets.push_back(rpc_client->AsyncSendVar(epmap[i], ctx, scope, ins[i])); } else { VLOG(3) << "don't send no-initialied variable: " << ins[i]; } } if (sync_send) { - PADDLE_ENFORCE(rpc_client->Wait(), "internal error in RPCClient"); + for (size_t i = 0; i < rets.size(); i++) { + PADDLE_ENFORCE(rets[i]->Wait(), "internal error in RPCClient"); + } } } }; diff --git a/paddle/fluid/operators/softmax_with_cross_entropy_op.cc b/paddle/fluid/operators/softmax_with_cross_entropy_op.cc index 53cb716a979229c99fcbdc12f1aeab4e21b320f3..1a9324ec862fc3dd7ce669c5fed94527cac22b8f 100644 --- a/paddle/fluid/operators/softmax_with_cross_entropy_op.cc +++ b/paddle/fluid/operators/softmax_with_cross_entropy_op.cc @@ -44,6 +44,12 @@ class SoftmaxWithCrossEntropyOpMaker "(bool, default: false), A flag to indicate whether to interpretate " "the given labels as soft labels.") .SetDefault(false); + AddAttr( + "ignore_index", + "(int, default -100), Specifies a target value that is ignored and" + "does not contribute to the input gradient. Only valid if soft_label" + "is set to False") + .SetDefault(-100); AddComment(R"DOC( Softmax With Cross Entropy Operator. diff --git a/paddle/fluid/operators/softmax_with_cross_entropy_op.cu b/paddle/fluid/operators/softmax_with_cross_entropy_op.cu index a559b01ed32a48e3befb37c2ae8935b4f3a4acb0..a07c17348ebb3f768d1c8be65c2d31e3c130bd23 100644 --- a/paddle/fluid/operators/softmax_with_cross_entropy_op.cu +++ b/paddle/fluid/operators/softmax_with_cross_entropy_op.cu @@ -26,11 +26,13 @@ using Tensor = framework::Tensor; namespace { template __global__ void CrossEntropyGrad(T* logit_grad, const int64_t* labels, - const int batch_size, const int class_num) { + const int batch_size, const int class_num, + const int ignore_index) { for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < batch_size; i += blockDim.x * gridDim.x) { int idx = i * class_num + labels[i]; - logit_grad[idx] -= static_cast(1.); + logit_grad[idx] -= + ignore_index == labels[i] ? static_cast(0.) : static_cast(1.); } } @@ -260,6 +262,7 @@ class SoftmaxWithCrossEntropyCUDAKernel : public framework::OpKernel { auto* loss_data = loss->mutable_data(context.GetPlace()); auto soft_label = context.Attr("soft_label"); + auto ignore_index = context.Attr("ignore_index"); if (soft_label) { int batch_size = logits->dims()[0]; int feature_size = logits->dims()[1]; @@ -272,7 +275,8 @@ class SoftmaxWithCrossEntropyCUDAKernel : public framework::OpKernel { math::SoftmaxCUDNNFunctor()(context.cuda_device_context(), logits, softmax); math::CrossEntropyFunctor()( - context.cuda_device_context(), loss, softmax, labels, false); + context.cuda_device_context(), loss, softmax, labels, false, + ignore_index); } } }; @@ -295,7 +299,7 @@ class SoftmaxWithCrossEntropyGradCUDAKernel : public framework::OpKernel { const int class_num = logit_grad->dims()[1]; int block = 512; auto stream = context.cuda_device_context().stream(); - + auto ignore_index = context.Attr("ignore_index"); if (context.Attr("soft_label")) { int grid = (batch_size * class_num + block - 1) / block; const T* label_data = labels->data(); @@ -305,7 +309,7 @@ class SoftmaxWithCrossEntropyGradCUDAKernel : public framework::OpKernel { int grid = (batch_size + block - 1) / block; const int64_t* label_data = labels->data(); CrossEntropyGrad<<>>( - logit_grad_data, label_data, batch_size, class_num); + logit_grad_data, label_data, batch_size, class_num, ignore_index); int num = batch_size * class_num; grid = (num + block - 1) / block; Scale<<>>(logit_grad_data, loss_grad_data, num, diff --git a/paddle/fluid/operators/softmax_with_cross_entropy_op.h b/paddle/fluid/operators/softmax_with_cross_entropy_op.h index dd6f6aca5ada7aa215d3b3444194fc53efeb7020..e9aba3b37b8cc01d4fe5de5200579d4e93f67e56 100644 --- a/paddle/fluid/operators/softmax_with_cross_entropy_op.h +++ b/paddle/fluid/operators/softmax_with_cross_entropy_op.h @@ -45,7 +45,8 @@ class SoftmaxWithCrossEntropyKernel : public framework::OpKernel { math::SoftmaxFunctor()(dev_ctx, logits, softmax); math::CrossEntropyFunctor()( - dev_ctx, loss, softmax, labels, context.Attr("soft_label")); + dev_ctx, loss, softmax, labels, context.Attr("soft_label"), + context.Attr("ignore_index")); } }; diff --git a/paddle/fluid/platform/device_context.cc b/paddle/fluid/platform/device_context.cc index 2cc26da013f59f5b7ee1747d57baca9c1c0efe2c..c6f1d1f3d544117311821d980300dffea03891a5 100644 --- a/paddle/fluid/platform/device_context.cc +++ b/paddle/fluid/platform/device_context.cc @@ -16,6 +16,9 @@ limitations under the License. */ #include #include "paddle/fluid/memory/memory.h" +#ifdef PADDLE_WITH_CUDA +#include "paddle/fluid/framework/rw_lock.h" +#endif namespace paddle { namespace platform { @@ -142,7 +145,58 @@ class EigenCudaStreamDevice : public Eigen::StreamInterface { mutable unsigned int* semaphore_; }; -CUDADeviceContext::CUDADeviceContext(CUDAPlace place) : place_(place) { +class CudnnHolder { + public: + CudnnHolder(const cudaStream_t* stream, const CUDAPlace& place) + : workspace_(nullptr), workspace_len_(0), stream_(stream), place_(place) { + PADDLE_ENFORCE(dynload::cudnnCreate(&cudnn_handle_)); + PADDLE_ENFORCE(dynload::cudnnSetStream(cudnn_handle_, *stream_)); + } + + cudnnHandle_t cudnn_handle() const { return cudnn_handle_; } + + void RunFunc(const std::function& cudnn_func, + size_t required_workspace_len) { + std::lock_guard lock(mtx_); + if (required_workspace_len > workspace_len_) { + ReallocateWorkspace(required_workspace_len); + } + cudnn_func(workspace_); + } + + ~CudnnHolder() { + PADDLE_ENFORCE(dynload::cudnnDestroy(cudnn_handle_)); + if (workspace_ != nullptr) { + paddle::memory::Free(place_, workspace_); + } + } + + private: + void ReallocateWorkspace(size_t required_workspace_len) { + if (required_workspace_len <= workspace_len_) { + return; + } + if (workspace_ != nullptr) { + // Maybe someone is using the current workspace + PADDLE_ENFORCE(cudaStreamSynchronize(*stream_)); + paddle::memory::Free(place_, workspace_); + } + workspace_ = paddle::memory::Alloc(place_, required_workspace_len); + workspace_len_ = required_workspace_len; + } + + cudnnHandle_t cudnn_handle_; + void* workspace_; + size_t workspace_len_; + + const cudaStream_t* stream_; // not owned; + const CUDAPlace place_; + + std::mutex mtx_; +}; + +CUDADeviceContext::CUDADeviceContext(CUDAPlace place) + : place_(place), cudnn_holder_(nullptr) { SetDeviceId(place_.device); compute_capability = GetCUDAComputeCapability(place_.device); multi_process = GetCUDAMultiProcessors(place_.device); @@ -154,10 +208,7 @@ CUDADeviceContext::CUDADeviceContext(CUDAPlace place) : place_(place) { PADDLE_ENFORCE(dynload::cublasCreate(&cublas_handle_)); PADDLE_ENFORCE(dynload::cublasSetStream(cublas_handle_, stream_)); if (dynload::HasCUDNN()) { - PADDLE_ENFORCE(dynload::cudnnCreate(&cudnn_handle_)); - PADDLE_ENFORCE(dynload::cudnnSetStream(cudnn_handle_, stream_)); - } else { - cudnn_handle_ = nullptr; + cudnn_holder_.reset(new CudnnHolder(&stream_, place)); } } @@ -165,9 +216,6 @@ CUDADeviceContext::~CUDADeviceContext() { SetDeviceId(place_.device); Wait(); PADDLE_ENFORCE(dynload::cublasDestroy(cublas_handle_)); - if (cudnn_handle_ != nullptr) { - PADDLE_ENFORCE(dynload::cudnnDestroy(cudnn_handle_)); - } eigen_stream_.reset(); eigen_device_.reset(); PADDLE_ENFORCE(cudaStreamDestroy(stream_)); @@ -196,7 +244,14 @@ cublasHandle_t CUDADeviceContext::cublas_handle() const { return cublas_handle_; } -cudnnHandle_t CUDADeviceContext::cudnn_handle() const { return cudnn_handle_; } +cudnnHandle_t CUDADeviceContext::cudnn_handle() const { + return cudnn_holder_->cudnn_handle(); +} + +void CUDADeviceContext::RunCudnnFuncWithWorkspace( + const std::function& cudnn_func, size_t workspace_len) const { + cudnn_holder_->RunFunc(cudnn_func, workspace_len); +} cudaStream_t CUDADeviceContext::stream() const { return stream_; } diff --git a/paddle/fluid/platform/device_context.h b/paddle/fluid/platform/device_context.h index b97dad20db0b003b4886b7c7cfd1c8de8bf44ab9..3ed49fc4233d4c0cd6cc16319eda08480ab9b434 100644 --- a/paddle/fluid/platform/device_context.h +++ b/paddle/fluid/platform/device_context.h @@ -69,6 +69,7 @@ struct DefaultDeviceContextType { #ifdef PADDLE_WITH_CUDA class EigenCudaStreamDevice; +class CudnnHolder; class CUDADeviceContext : public DeviceContext { public: @@ -96,6 +97,11 @@ class CUDADeviceContext : public DeviceContext { /*! \brief Return cudnn handle in the device context. */ cudnnHandle_t cudnn_handle() const; + /*! \brief Run a cudnn function with the workspace provided by + * CUDADeviceContext */ + void RunCudnnFuncWithWorkspace(const std::function& cudnn_func, + size_t workspace_len) const; + /*! \brief Return cuda stream in the device context. */ cudaStream_t stream() const; @@ -111,8 +117,8 @@ class CUDADeviceContext : public DeviceContext { std::unique_ptr eigen_device_; std::unique_ptr eigen_stream_; + std::unique_ptr cudnn_holder_; cudaStream_t stream_; - cudnnHandle_t cudnn_handle_; cublasHandle_t cublas_handle_; int compute_capability; diff --git a/paddle/fluid/platform/mkldnn_helper.h b/paddle/fluid/platform/mkldnn_helper.h index f6e9a52b275353c03c1f350719766922a97f6cb3..c0a2543ba5d8ff8f34cb6231c51cb5053a6a9481 100644 --- a/paddle/fluid/platform/mkldnn_helper.h +++ b/paddle/fluid/platform/mkldnn_helper.h @@ -192,7 +192,8 @@ class MKLDNNHandler { mkldnn::memory::primitive_desc& user_mpd, // NOLINT const std::shared_ptr user_memory_p, const std::string& suffix, - std::vector& pipeline) { // NOLINT + std::vector& pipeline, // NOLINT + bool is_persistent = false) { // create reorder primitive if the input format is not the preferred one auto local_key = key_ + suffix; auto key_reorder_p = key_ + suffix + "reorder_p"; @@ -213,7 +214,7 @@ class MKLDNNHandler { pipeline.push_back(*reorder_p); } dev_ctx_.SetBlob(local_key, target_memory_p); - } else { + } else if (!is_persistent) { // Make reorder if needed auto reorder_p = std::static_pointer_cast( dev_ctx_.GetBlob(key_reorder_p)); diff --git a/paddle/fluid/pybind/protobuf.cc b/paddle/fluid/pybind/protobuf.cc index f21f8d23f99c27529b2ed1995c92fd4eee4a5807..67501186d150171728194f23bc02d2c014848dd7 100644 --- a/paddle/fluid/pybind/protobuf.cc +++ b/paddle/fluid/pybind/protobuf.cc @@ -137,7 +137,10 @@ void BindProgramDesc(pybind11::module *m) { PADDLE_ENFORCE(desc->ParseFromString(data), "Fail to parse ProgramDesc from string. This could " "be a bug of Paddle."); - }); + }) + .def("_version", [](pd::ProgramDesc &self) -> int64_t { + return self.Proto()->version().version(); + }); } void BindBlockDesc(pybind11::module *m) { diff --git a/paddle/fluid/pybind/pybind.cc b/paddle/fluid/pybind/pybind.cc index 5b20b87174e42f4dfdd22214e8f9dd20c7296374..20fc08e21dadc12be8903476df374abb5caecf61 100644 --- a/paddle/fluid/pybind/pybind.cc +++ b/paddle/fluid/pybind/pybind.cc @@ -33,6 +33,7 @@ limitations under the License. */ #include "paddle/fluid/framework/prune.h" #include "paddle/fluid/framework/reader.h" #include "paddle/fluid/framework/selected_rows.h" +#include "paddle/fluid/framework/version.h" #include "paddle/fluid/operators/activation_op.h" #include "paddle/fluid/operators/reader/lod_tensor_blocking_queue.h" #include "paddle/fluid/platform/enforce.h" @@ -530,6 +531,8 @@ All parameter, weight, gradient are variables in Paddle. m.def("set_feed_variable", framework::SetFeedVariable); m.def("get_fetch_variable", framework::GetFetchVariable); + m.def("_is_program_version_supported", IsProgramVersionSupported); + BindProgramDesc(&m); BindBlockDesc(&m); BindVarDsec(&m); diff --git a/paddle/fluid/string/CMakeLists.txt b/paddle/fluid/string/CMakeLists.txt index 1fe7f42ca1c692e4d7034883022852657be8cc20..719411bf6677c923397748289b95415c47fa299a 100644 --- a/paddle/fluid/string/CMakeLists.txt +++ b/paddle/fluid/string/CMakeLists.txt @@ -1,4 +1,6 @@ cc_library(stringpiece SRCS piece.cc) +cc_library(pretty_log SRCS pretty_log.cc) +cc_test(test_pretty_log SRCS pretty_log.cc) cc_test(stringpiece_test SRCS piece_test.cc DEPS stringpiece glog gflags) cc_test(stringprintf_test SRCS printf_test.cc DEPS glog gflags) cc_test(to_string_test SRCS to_string_test.cc) diff --git a/paddle/fluid/string/pretty_log.cc b/paddle/fluid/string/pretty_log.cc new file mode 100644 index 0000000000000000000000000000000000000000..4534fdc58b81fe03b3a1fc19b55aa62ddbf5eaf1 --- /dev/null +++ b/paddle/fluid/string/pretty_log.cc @@ -0,0 +1,22 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/string/pretty_log.h" +#include + +DEFINE_bool(color, true, "Whether to turn on pretty log"); + +namespace paddle { +namespace string {} // namespace string +} // namespace paddle diff --git a/paddle/fluid/string/pretty_log.h b/paddle/fluid/string/pretty_log.h new file mode 100644 index 0000000000000000000000000000000000000000..a3b4e38f453835828a4a53130e11c854ac3f4a74 --- /dev/null +++ b/paddle/fluid/string/pretty_log.h @@ -0,0 +1,70 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +#pragma once + +#include +#include +#include +#include +#include +#include "paddle/fluid/string/printf.h" + +DECLARE_bool(color); + +namespace paddle { + +namespace string { + +inline std::string black() { return FLAGS_color ? "\e[30m" : ""; } +inline std::string red() { return FLAGS_color ? "\e[31m" : ""; } +inline std::string b_red() { return FLAGS_color ? "\e[41m" : ""; } +inline std::string green() { return FLAGS_color ? "\e[32m" : ""; } +inline std::string yellow() { return FLAGS_color ? "\e[33m" : ""; } +inline std::string blue() { return FLAGS_color ? "\e[34m" : ""; } +inline std::string purple() { return FLAGS_color ? "\e[35m" : ""; } +inline std::string cyan() { return FLAGS_color ? "\e[36m" : ""; } +inline std::string light_gray() { return FLAGS_color ? "\e[37m" : ""; } +inline std::string white() { return FLAGS_color ? "\e[37m" : ""; } +inline std::string light_red() { return FLAGS_color ? "\e[91m" : ""; } +inline std::string dim() { return FLAGS_color ? "\e[2m" : ""; } +inline std::string bold() { return FLAGS_color ? "\e[1m" : ""; } +inline std::string underline() { return FLAGS_color ? "\e[4m" : ""; } +inline std::string blink() { return FLAGS_color ? "\e[5m" : ""; } +inline std::string reset() { return FLAGS_color ? "\e[0m" : ""; } + +using TextBlock = std::pair; + +struct Style { + static std::string info() { return black(); } + static std::string warn() { return b_red(); } + static std::string suc() { return green(); } + static std::string H1() { return bold() + purple(); } + static std::string H2() { return green(); } + static std::string H3() { return green(); } + static std::string detail() { return light_gray(); } +}; + +template +static void PrettyLogEndl(const std::string& style, const char* fmt, + const Args&... args) { + std::cerr << style << Sprintf(fmt, args...) << reset() << std::endl; +} +template +static void PrettyLog(const std::string& style, const char* fmt, + const Args&... args) { + std::cerr << style << Sprintf(fmt, args...) << reset(); +} + +} // namespace string +} // namespace paddle diff --git a/python/paddle/fluid/framework.py b/python/paddle/fluid/framework.py index b0e0d27ff7a0c603523065d34169b1b73eabdac3..8892606486ee97bb085e642e89fce872e5ba1f7e 100644 --- a/python/paddle/fluid/framework.py +++ b/python/paddle/fluid/framework.py @@ -1564,6 +1564,9 @@ class Program(object): """ return self.desc + def _version(self): + return self.desc._version() + def clone(self, for_test=False): """ Create a new, duplicated program. diff --git a/python/paddle/fluid/io.py b/python/paddle/fluid/io.py index 5c4ec99c533829240ac0bcc9647acb870f3412f8..656fafa0cb54d70e0eba8ec2bef21488c50d8d94 100644 --- a/python/paddle/fluid/io.py +++ b/python/paddle/fluid/io.py @@ -750,6 +750,10 @@ def load_inference_model(dirname, program_desc_str = f.read() program = Program.parse_from_string(program_desc_str) + if not core._is_program_version_supported(program._version()): + raise ValueError("Unsupported program version: %d\n" % + program._version()) + # Binary data also need versioning. load_persistables(executor, dirname, program, params_filename) if pserver_endpoints: diff --git a/python/paddle/fluid/layers/nn.py b/python/paddle/fluid/layers/nn.py index 8408e6d2a12edacb310ed5eb543ad51585f3d82a..3ae0fac4bef5c47964f9a9cd8dd45b57e705e1f8 100644 --- a/python/paddle/fluid/layers/nn.py +++ b/python/paddle/fluid/layers/nn.py @@ -968,7 +968,7 @@ def dropout(x, dropout_prob, is_test=False, seed=None, name=None): return out -def cross_entropy(input, label, soft_label=False): +def cross_entropy(input, label, soft_label=False, ignore_index=-100): """ **Cross Entropy Layer** @@ -1012,7 +1012,10 @@ def cross_entropy(input, label, soft_label=False): tensor with shape [N x D]. soft_label (bool): a flag indicating whether to interpretate the given labels as soft - labels, default `False`. + labels. Default: `False`. + ignore_index (int): Specifies a target value that is ignored and does + not contribute to the input gradient. Only valid + if soft_label is set to False. Default: -100 Returns: A 2-D tensor with shape [N x 1], the cross entropy loss. @@ -1037,7 +1040,8 @@ def cross_entropy(input, label, soft_label=False): inputs={'X': [input], 'Label': [label]}, outputs={'Y': [out]}, - attrs={"soft_label": soft_label}) + attrs={"soft_label": soft_label, + "ignore_index": ignore_index}) return out @@ -4242,7 +4246,10 @@ def multiplex(inputs, index): return out -def softmax_with_cross_entropy(logits, label, soft_label=False): +def softmax_with_cross_entropy(logits, + label, + soft_label=False, + ignore_index=-100): """ **Softmax With Cross Entropy Operator.** @@ -4284,6 +4291,10 @@ def softmax_with_cross_entropy(logits, label, soft_label=False): soft_label is set to true, Label is a Tensor with soft_label (bool): A flag to indicate whether to interpretate the given labels as soft labels. By default, `soft_label` is set to False. + ignore_index (int): Specifies a target value that is ignored and does + not contribute to the input gradient. Only valid + if soft_label is set to False. Default: -100 + Returns: Variable: The cross entropy loss is a 2-D tensor with shape [N x 1]. @@ -4305,7 +4316,8 @@ def softmax_with_cross_entropy(logits, label, soft_label=False): 'Label': label}, outputs={'Softmax': softmax, 'Loss': loss}, - attrs={'soft_label': soft_label}) + attrs={'soft_label': soft_label, + 'ignore_index': ignore_index}) return loss diff --git a/python/paddle/fluid/parallel_executor.py b/python/paddle/fluid/parallel_executor.py index 4790e0f6119e96b11b049bfdd3b46d40a382683b..bd9f8b3c356ca1e43923d42beebaa5ab98158084 100644 --- a/python/paddle/fluid/parallel_executor.py +++ b/python/paddle/fluid/parallel_executor.py @@ -128,6 +128,13 @@ class ParallelExecutor(object): os.environ.get('CPU_NUM', multiprocessing.cpu_count())) exec_strategy.num_threads = cpu_num * 2 + # Set 1 thread num under nccl2 distribute + # env to make sure all gpus run ops in same order. + if num_trainers > 1: + assert (use_cuda) + # FIXME(gongwb): avoid this set. + exec_strategy.num_threads = 1 + if build_strategy is None: build_strategy = BuildStrategy() diff --git a/python/paddle/fluid/tests/book/high-level-api/image_classification/test_image_classification_resnet.py b/python/paddle/fluid/tests/book/high-level-api/image_classification/test_image_classification_resnet.py index e5ae95e2d943917b9bc10f0d4c4bdc5f8fb07fdb..de276755bb1eb2746cc780575a40357255223809 100644 --- a/python/paddle/fluid/tests/book/high-level-api/image_classification/test_image_classification_resnet.py +++ b/python/paddle/fluid/tests/book/high-level-api/image_classification/test_image_classification_resnet.py @@ -178,7 +178,4 @@ if __name__ == '__main__': for parallel in (False, True): if use_cuda and not core.is_compiled_with_cuda(): continue - # TODO(minqiyang): remove this line after fixing the deletion - # order problem of Scope in ParallelExecutor in manylinux - if six.PY2: - main(use_cuda=use_cuda, parallel=parallel) + main(use_cuda=use_cuda, parallel=parallel) diff --git a/python/paddle/fluid/tests/book/high-level-api/image_classification/test_image_classification_vgg.py b/python/paddle/fluid/tests/book/high-level-api/image_classification/test_image_classification_vgg.py index ff91be72c918f8dac65b7030e45c4a00deb965ac..dd547f3448ae55c07b6c09f9de4ac08d8ec5ee88 100644 --- a/python/paddle/fluid/tests/book/high-level-api/image_classification/test_image_classification_vgg.py +++ b/python/paddle/fluid/tests/book/high-level-api/image_classification/test_image_classification_vgg.py @@ -152,7 +152,4 @@ if __name__ == '__main__': for parallel in (False, True): if use_cuda and not core.is_compiled_with_cuda(): continue - # TODO(minqiyang): remove this line after fixing the deletion - # order problem of Scope in ParallelExecutor in manylinux - if six.PY2: - main(use_cuda=use_cuda, parallel=parallel) + main(use_cuda=use_cuda, parallel=parallel) diff --git a/python/paddle/fluid/tests/book/high-level-api/recognize_digits/test_recognize_digits_conv.py b/python/paddle/fluid/tests/book/high-level-api/recognize_digits/test_recognize_digits_conv.py index fa72c939e57356f26d60032dd0a91c894b28c505..973308498bec3cddde2ef651751ad5d0c9f84503 100644 --- a/python/paddle/fluid/tests/book/high-level-api/recognize_digits/test_recognize_digits_conv.py +++ b/python/paddle/fluid/tests/book/high-level-api/recognize_digits/test_recognize_digits_conv.py @@ -155,7 +155,4 @@ if __name__ == '__main__': for parallel in (False, True): if use_cuda and not core.is_compiled_with_cuda(): continue - # TODO(minqiyang): remove this line after fixing the deletion - # order problem of Scope in ParallelExecutor in manylinux - if six.PY2: - main(use_cuda=use_cuda, parallel=parallel) + main(use_cuda=use_cuda, parallel=parallel) diff --git a/python/paddle/fluid/tests/book/high-level-api/recognize_digits/test_recognize_digits_mlp.py b/python/paddle/fluid/tests/book/high-level-api/recognize_digits/test_recognize_digits_mlp.py index 440d2a30835cb89089709f024a4dcc6e4113efa8..cb4aeb430e1a9662a183084c0cdacc41c5a8ec11 100644 --- a/python/paddle/fluid/tests/book/high-level-api/recognize_digits/test_recognize_digits_mlp.py +++ b/python/paddle/fluid/tests/book/high-level-api/recognize_digits/test_recognize_digits_mlp.py @@ -137,7 +137,4 @@ if __name__ == '__main__': for parallel in (False, True): if use_cuda and not core.is_compiled_with_cuda(): continue - # TODO(minqiyang): remove this line after fixing the deletion - # order problem of Scope in ParallelExecutor in manylinux - if six.PY2: - main(use_cuda=use_cuda, parallel=parallel) + main(use_cuda=use_cuda, parallel=parallel) diff --git a/python/paddle/fluid/tests/unittests/test_cross_entropy_op.py b/python/paddle/fluid/tests/unittests/test_cross_entropy_op.py index fa367f95fc9c65dd782d53a2799cacadf74dcfd2..f22badbea0c67b210f7ac4e14e5d647f1cffa6cc 100644 --- a/python/paddle/fluid/tests/unittests/test_cross_entropy_op.py +++ b/python/paddle/fluid/tests/unittests/test_cross_entropy_op.py @@ -209,5 +209,34 @@ class TestCrossEntropyOp6(OpTest): ["X"], "Y", max_relative_error=0.05, numeric_grad_delta=0.001) +class TestCrossEntropyOp7(OpTest): + """Test cross-entropy with ignore index. + """ + + def setUp(self): + self.op_type = "cross_entropy" + batch_size = 30 + class_num = 10 + ignore_index = 3 + + X = randomize_probability(batch_size, class_num, dtype='float64') + + label = np.random.randint(0, class_num, (batch_size, 1), dtype="int64") + cross_entropy = np.asmatrix( + [[-np.log(X[i][label[i][0]])] + if label[i][0] != ignore_index else [0] + for i in range(X.shape[0])], + dtype="float64") + self.inputs = {"X": X, "Label": label} + self.outputs = {"Y": cross_entropy} + self.attrs = {"soft_label": False, "ignore_index": ignore_index} + + def test_check_output(self): + self.check_output() + + def test_check_grad(self): + self.check_grad(["X"], "Y", numeric_grad_delta=0.001) + + if __name__ == "__main__": unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_data_balance.py b/python/paddle/fluid/tests/unittests/test_data_balance.py index e39eedd282daf6bbe0603a22c357e06c95c086b6..4bd24510bc8ac7f0fbaad3fd1919ab589cd21c4b 100644 --- a/python/paddle/fluid/tests/unittests/test_data_balance.py +++ b/python/paddle/fluid/tests/unittests/test_data_balance.py @@ -84,7 +84,7 @@ class TestDataBalance(unittest.TestCase): self.data_file_name = './data_balance_test.recordio' self.lod_data_file_name = './data_balance_with_lod_test.recordio' self.total_ins_num = 50 - self.batch_size = 10 + self.batch_size = 12 self.prepare_data() self.prepare_lod_data() diff --git a/python/paddle/fluid/tests/unittests/test_layers.py b/python/paddle/fluid/tests/unittests/test_layers.py index bc4d364c74c6cb6b8f0df59e7ede77e6271f4b96..b04346b052903959f44aa96f6fccb7d20652e854 100644 --- a/python/paddle/fluid/tests/unittests/test_layers.py +++ b/python/paddle/fluid/tests/unittests/test_layers.py @@ -556,6 +556,15 @@ class TestBook(unittest.TestCase): out = layers.sequence_enumerate(input=x, win_size=2, pad_value=0) print(str(program)) + def test_cross_entropy(self): + program = Program() + with program_guard(program): + x = layers.data(name="x", shape=[30, 10], dtype="float32") + label = layers.data(name="label", shape=[30, 1], dtype="int32") + mode = 'channel' + out = layers.cross_entropy(x, label, False, 4) + self.assertIsNotNone(out) + if __name__ == '__main__': unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_norm_op.py b/python/paddle/fluid/tests/unittests/test_norm_op.py index 22bc45ff1ea0efea0ec766a6a9e819cdd81b0866..a424260312eab850e579b4365efd071de599bd4f 100644 --- a/python/paddle/fluid/tests/unittests/test_norm_op.py +++ b/python/paddle/fluid/tests/unittests/test_norm_op.py @@ -63,5 +63,27 @@ class TestNormOp3(TestNormOp): self.epsilon = 1e-8 +class TestNormOp4(TestNormOp): + def init_test_case(self): + self.shape = [128, 1024, 14, 14] + self.axis = 2 + self.epsilon = 1e-8 + + def test_check_grad(self): + # since the gradient check is very slow in large shape, so skip check_grad + pass + + +class TestNormOp5(TestNormOp): + def init_test_case(self): + self.shape = [2048, 2048] + self.axis = 1 + self.epsilon = 1e-8 + + def test_check_grad(self): + # since the gradient check is very slow in large shape, so skip check_grad + pass + + if __name__ == '__main__': unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_parallel_executor_transformer.py b/python/paddle/fluid/tests/unittests/test_parallel_executor_transformer.py index 5ad922725a0b692e28552737a99b745ed09ddbd5..a55b2002ed989d4588716202a37aa6f4139825ea 100644 --- a/python/paddle/fluid/tests/unittests/test_parallel_executor_transformer.py +++ b/python/paddle/fluid/tests/unittests/test_parallel_executor_transformer.py @@ -20,6 +20,7 @@ import numpy as np from parallel_executor_test_base import TestParallelExecutorBase import unittest import paddle +import paddle.fluid.core as core import paddle.dataset.wmt16 as wmt16 import os @@ -170,7 +171,8 @@ class TestTransformer(TestParallelExecutorBase): writer.complete_append_tensor() def test_main(self): - self.check_network_convergence(transformer, use_cuda=True) + if core.is_compiled_with_cuda(): + self.check_network_convergence(transformer, use_cuda=True) self.check_network_convergence(transformer, use_cuda=False, iter=5) diff --git a/python/paddle/fluid/tests/unittests/test_py_reader_using_executor.py b/python/paddle/fluid/tests/unittests/test_py_reader_using_executor.py index 931cac409f26fce4ecca18c4b0cfcca2e675046f..b7fad9b3a60632adb564e1d155a3d935706b467f 100644 --- a/python/paddle/fluid/tests/unittests/test_py_reader_using_executor.py +++ b/python/paddle/fluid/tests/unittests/test_py_reader_using_executor.py @@ -96,7 +96,8 @@ class TestPyReaderUsingExecutor(unittest.TestCase): self.queue_capacity = 50 def test(self): - for use_cuda in [False, True]: + for use_cuda in ([False, True] + if core.is_compiled_with_cuda() else [False]): for use_parallel_executor in [False, True]: for use_double_buffer in [False, True]: print('Test Parameters:'), diff --git a/python/paddle/fluid/tests/unittests/test_softmax_with_cross_entropy_op.py b/python/paddle/fluid/tests/unittests/test_softmax_with_cross_entropy_op.py index b7e5ff6d52ad7dde3dd94b3bd660cfca383e1ada..a18941dd3126ac027f022ddafbbaed8516166233 100644 --- a/python/paddle/fluid/tests/unittests/test_softmax_with_cross_entropy_op.py +++ b/python/paddle/fluid/tests/unittests/test_softmax_with_cross_entropy_op.py @@ -88,5 +88,40 @@ class TestSoftmaxWithCrossEntropyOp2(OpTest): self.check_grad(["Logits"], "Loss") +class TestSoftmaxWithCrossEntropyOp3(OpTest): + """ + Test softmax with cross entropy operator with ignore_index. + """ + + def setUp(self): + self.op_type = "softmax_with_cross_entropy" + batch_size = 41 + class_num = 37 + + logits = np.random.uniform(0.1, 1.0, + [batch_size, class_num]).astype("float64") + softmax = np.apply_along_axis(stable_softmax, 1, logits) + labels = np.random.randint(0, class_num, [batch_size, 1], dtype="int64") + ignore_index = 7 + cross_entropy = np.asmatrix( + [[-np.log(softmax[i][labels[i][0]])] + if labels[i] != ignore_index else [0] + for i in range(softmax.shape[0])], + dtype="float64") + + self.inputs = {"Logits": logits, "Label": labels} + self.outputs = { + "Softmax": softmax.astype("float64"), + "Loss": cross_entropy.astype("float64") + } + self.attrs = {"ignore_index": ignore_index} + + def test_check_output(self): + self.check_output() + + def test_check_grad(self): + self.check_grad(["Logits"], "Loss") + + if __name__ == "__main__": unittest.main() diff --git a/python/paddle/fluid/transpiler/inference_transpiler.py b/python/paddle/fluid/transpiler/inference_transpiler.py index f79fcb24bb5a48de00d03fc468b6526e48656d07..adad2428f7fdc554cf4efd652f52b5c5de0ab527 100644 --- a/python/paddle/fluid/transpiler/inference_transpiler.py +++ b/python/paddle/fluid/transpiler/inference_transpiler.py @@ -60,13 +60,46 @@ class InferenceTranspiler(object): if not isinstance(scope, core.Scope): raise TypeError("scope should be as Scope type or None") use_mkldnn = bool(os.getenv("FLAGS_use_mkldnn", False)) + + self._fuse_batch_norm(program, place, scope) if use_mkldnn: - self._fuse_relu_mkldnn(program) self._fuse_conv_bias_mkldnn(program) - else: - self._fuse_batch_norm(program, place, scope) + self._fuse_conv_relu_mkldnn(program) + self._fuse_bn_relu_mkldnn(program) + + def _fuse_conv_relu_mkldnn(self, program): + ''' + Transpile the program by fused relu activation for MKLDNN program. + Relu activation following convolution OP can be fused by adding + 'fuse_relu' attribute to convolution OP. + The result of fuse is: + - before: + - conv->relu->any_other_op + - after: + - conv->any_other_op + :param program: program to transpile + :type program: Program + ''' + self.block = program.block(0) + + i = 0 + while i < len(self.block.ops): + current_op = self.block.ops[i] + if current_op.type in ['conv2d']: + next_op = self.block.ops[i + 1] + if next_op.type == 'relu': + # modify conv OP to include relu + current_op.set_attr("fuse_relu", True) + # remove conv OP + self.block._remove_op(i + 1) + i = i + 1 + + # TODO(luotao): use clone() method to flush the program.desc in force, + # since some large program.desc will not be flushed immediately. + # And a better solution will be considered later. + program = program.clone() - def _fuse_relu_mkldnn(self, program): + def _fuse_bn_relu_mkldnn(self, program): ''' Transpile the program by fused relu activation for MKLDNN program. @@ -160,7 +193,6 @@ class InferenceTranspiler(object): self._fuse_conv_bias(i, current_op, next_op) self.block._remove_op(i + 1) # Remove old conv self.block._remove_op(i + 1) # Remove elementwise_add - i = i + 1 i = i + 1 self._remove_unused_var() diff --git a/python/paddle/reader/decorator.py b/python/paddle/reader/decorator.py index 6d7ac876fdf65fe0e85cceb94c311a93d9ea39c2..5b9459b670ac8583ee0e65a3c1b51f6248bb6303 100644 --- a/python/paddle/reader/decorator.py +++ b/python/paddle/reader/decorator.py @@ -14,11 +14,14 @@ __all__ = [ 'map_readers', 'buffered', 'compose', 'chain', 'shuffle', - 'ComposeNotAligned', 'firstn', 'xmap_readers', 'PipeReader' + 'ComposeNotAligned', 'firstn', 'xmap_readers', 'PipeReader', + 'multiprocess_reader' ] from threading import Thread import subprocess +import multiprocessing +import sys from six.moves.queue import Queue from six.moves import zip_longest @@ -332,6 +335,100 @@ def xmap_readers(mapper, reader, process_num, buffer_size, order=False): return xreader +def multiprocess_reader(readers, use_pipe=True, queue_size=1000): + """ + multiprocess_reader use python multi process to read data from readers + and then use multiprocess.Queue or multiprocess.Pipe to merge all + data. The process number is equal to the number of input readers, each + process call one reader. + + Multiprocess.Queue require the rw access right to /dev/shm, some + platform does not support. + + you need to create multiple readers first, these readers should be independent + to each other so that each process can work independently. + + An example: + + .. code-block:: python + + reader0 = reader(["file01", "file02"]) + reader1 = reader(["file11", "file12"]) + reader1 = reader(["file21", "file22"]) + reader = multiprocess_reader([reader0, reader1, reader2], + queue_size=100, use_pipe=False) + """ + + try: + import ujson as json + except Exception as e: + sys.stderr.write("import ujson error: " + str(e) + " use json\n") + import json + + assert type(readers) is list and len(readers) > 0 + + def _read_into_queue(reader, queue): + for sample in reader(): + if sample is None: + raise ValueError("sample has None") + queue.put(sample) + queue.put(None) + + def queue_reader(): + queue = multiprocessing.Queue(queue_size) + for reader in readers: + p = multiprocessing.Process( + target=_read_into_queue, args=(reader, queue)) + p.start() + + reader_num = len(readers) + finish_num = 0 + while finish_num < reader_num: + sample = queue.get() + if sample is None: + finish_num += 1 + else: + yield sample + + def _read_into_pipe(reader, conn): + for sample in reader(): + if sample is None: + raise ValueError("sample has None!") + conn.send(json.dumps(sample)) + conn.send(json.dumps(None)) + conn.close() + + def pipe_reader(): + conns = [] + for reader in readers: + parent_conn, child_conn = multiprocessing.Pipe() + conns.append(parent_conn) + p = multiprocessing.Process( + target=_read_into_pipe, args=(reader, child_conn)) + p.start() + + reader_num = len(readers) + finish_num = 0 + conn_to_remove = [] + while finish_num < reader_num: + for conn in conn_to_remove: + conns.remove(conn) + conn_to_remove = [] + for conn in conns: + sample = json.loads(conn.recv()) + if sample is None: + finish_num += 1 + conn.close() + conn_to_remove.append(conn) + else: + yield sample + + if use_pipe: + return pipe_reader + else: + return queue_reader + + def _buf2lines(buf, line_break="\n"): # FIXME: line_break should be automatically configured. lines = buf.split(line_break) diff --git a/python/paddle/reader/tests/decorator_test.py b/python/paddle/reader/tests/decorator_test.py index 537df489b9738864933b3a7922d178701db3d19f..c324092f8850e4bd64955aa9c987746b5cec54b5 100644 --- a/python/paddle/reader/tests/decorator_test.py +++ b/python/paddle/reader/tests/decorator_test.py @@ -14,6 +14,7 @@ import time import unittest +import functools import paddle.reader @@ -174,5 +175,33 @@ class TestPipeReader(unittest.TestCase): temp.close() +class TestMultiProcessReader(unittest.TestCase): + def setup(self): + self.samples = [] + for i in range(1000): + self.samples.append([[i], [i + 1, i + 2], i + 3]) + + def reader(index): + for i in range(len(self.samples)): + if i % 3 == index: + yield self.samples[i] + + self.reader0 = functools.partial(reader, 0) + self.reader1 = functools.partial(reader, 1) + self.reader2 = functools.partial(reader, 2) + + def reader_test(self, use_pipe): + self.setup() + results = [] + for data in paddle.reader.multiprocess_reader( + [self.reader0, self.reader1, self.reader2], 100, use_pipe)(): + results.append(data) + self.assertEqual(sorted(self.samples), sorted(results)) + + def test_multi_process_reader(self): + self.reader_test(use_pipe=False) + self.reader_test(use_pipe=True) + + if __name__ == '__main__': unittest.main()