diff --git a/README.md b/README.md index 60ffbe728178705b1734e682868614025214c2a4..45186ec4ef48dc305b2616dbf4966f01c3609962 100644 --- a/README.md +++ b/README.md @@ -19,7 +19,7 @@ Our vision is to enable deep learning for everyone via PaddlePaddle. Please refer to our [release announcement](https://github.com/PaddlePaddle/Paddle/releases) to track the latest feature of PaddlePaddle. -### Latest PaddlePaddle Release: [Fluid 0.14.0](https://github.com/PaddlePaddle/Paddle/tree/v0.14.0) +### Latest PaddlePaddle Release: [Fluid 0.15.0](https://github.com/PaddlePaddle/Paddle/tree/v0.15.0) ### Install Latest Stable Release: ``` # Linux CPU @@ -76,26 +76,26 @@ pip install paddlepaddle-gpu==0.14.0.post85 ## Installation -It is recommended to read [this doc](http://paddlepaddle.org/documentation/docs/zh/0.14.0/new_docs/beginners_guide/install/install_doc.html) on our website. +It is recommended to read [this doc](http://paddlepaddle.org/documentation/docs/zh/0.15.0/new_docs/beginners_guide/install/install_doc.html) on our website. ## Documentation -We provide [English](http://paddlepaddle.org/documentation/docs/en/0.14.0/getstarted/index_en.html) and -[Chinese](http://paddlepaddle.org/documentation/docs/zh/0.14.0/new_docs/beginners_guide/index.html) documentation. +We provide [English](http://paddlepaddle.org/documentation/docs/en/0.15.0/getstarted/index_en.html) and +[Chinese](http://paddlepaddle.org/documentation/docs/zh/0.15.0/new_docs/beginners_guide/index.html) documentation. - [Deep Learning 101](https://github.com/PaddlePaddle/book) You might want to start from this online interactive book that can run in a Jupyter Notebook. -- [Distributed Training](http://paddlepaddle.org/documentation/docs/zh/0.14.0/new_docs/user_guides/howto/training/cluster_howto.html) +- [Distributed Training](http://paddlepaddle.org/documentation/docs/zh/0.15.0/new_docs/user_guides/howto/training/cluster_howto.html) You can run distributed training jobs on MPI clusters. -- [Python API](http://paddlepaddle.org/documentation/api/zh/0.14.0/fluid.html) +- [Python API](http://paddlepaddle.org/documentation/api/zh/0.15.0/fluid.html) Our new API enables much shorter programs. -- [How to Contribute](http://paddlepaddle.org/documentation/docs/zh/0.14.0/new_docs/advanced_usage/development/contribute_to_paddle.html) +- [How to Contribute](http://paddlepaddle.org/documentation/docs/zh/0.15.0/new_docs/advanced_usage/development/contribute_to_paddle.html) We appreciate your contributions! 
diff --git a/benchmark/fluid/args.py b/benchmark/fluid/args.py index ed696e82f8723eba573e8affd3f25e2aa6426e63..0d5c9652de6b814627e54018366137e214726619 100644 --- a/benchmark/fluid/args.py +++ b/benchmark/fluid/args.py @@ -140,5 +140,11 @@ def parse_args(): '--use_lars', action='store_true', help='If set, use lars for optimizers, ONLY support resnet module.') + parser.add_argument( + '--reduce_strategy', + type=str, + choices=['reduce', 'all_reduce'], + default='all_reduce', + help='Specify the reduce strategy, can be reduce, all_reduce') args = parser.parse_args() return args diff --git a/benchmark/fluid/fluid_benchmark.py b/benchmark/fluid/fluid_benchmark.py index 11bd75e1d09a6b51c7c749c512f2b71f3604f3fb..ddd9fe809853a830ca676cc98f1819f683866def 100644 --- a/benchmark/fluid/fluid_benchmark.py +++ b/benchmark/fluid/fluid_benchmark.py @@ -91,7 +91,8 @@ def dist_transpile(trainer_id, args, train_prog, startup_prog): program=train_prog, pservers=pserver_endpoints, trainers=trainers, - sync_mode=not args.async_mode) + sync_mode=not args.async_mode, + startup_program=startup_prog) if training_role == "PSERVER": pserver_program = t.get_pserver_program(current_endpoint) pserver_startup_program = t.get_startup_program( @@ -169,6 +170,14 @@ def train_parallel(train_args, test_args, args, train_prog, test_prog, strategy = fluid.ExecutionStrategy() strategy.num_threads = args.cpus strategy.allow_op_delay = False + build_strategy = fluid.BuildStrategy() + if args.reduce_strategy == "reduce": + build_strategy.reduce_strategy = fluid.BuildStrategy( + ).ReduceStrategy.Reduce + else: + build_strategy.reduce_strategy = fluid.BuildStrategy( + ).ReduceStrategy.AllReduce + avg_loss = train_args[0] if args.update_method == "pserver": @@ -183,6 +192,7 @@ def train_parallel(train_args, test_args, args, train_prog, test_prog, avg_loss.name, main_program=train_prog, exec_strategy=strategy, + build_strategy=build_strategy, num_trainers=num_trainers, trainer_id=trainer_id) diff --git a/benchmark/fluid/models/mnist.py b/benchmark/fluid/models/mnist.py index cef8657ee629dcbc19221fd3440844a56627e920..f123e07fb711bd8ff67c1ecf5ec9a02c1e79eb1d 100644 --- a/benchmark/fluid/models/mnist.py +++ b/benchmark/fluid/models/mnist.py @@ -67,11 +67,14 @@ def cnn_model(data): def get_model(args, is_train, main_prog, startup_prog): # NOTE: mnist is small, we don't implement data sharding yet. 
- filelist = [ - os.path.join(args.data_path, f) for f in os.listdir(args.data_path) - ] + opt = None + data_file_handle = None with fluid.program_guard(main_prog, startup_prog): if args.use_reader_op: + filelist = [ + os.path.join(args.data_path, f) + for f in os.listdir(args.data_path) + ] data_file_handle = fluid.layers.open_files( filenames=filelist, shapes=[[-1, 1, 28, 28], (-1, 1)], @@ -100,7 +103,7 @@ def get_model(args, is_train, main_prog, startup_prog): if is_train: opt = fluid.optimizer.AdamOptimizer( learning_rate=0.001, beta1=0.9, beta2=0.999) - opt.minimize() + opt.minimize(avg_cost) if args.memory_optimize: fluid.memory_optimize(main_prog) diff --git a/benchmark/fluid/models/resnet.py b/benchmark/fluid/models/resnet.py index ae1baa48e17e40448e457052fd1464b9604a2128..1b3bfe659c7d97b58dc4121387d4db22266381c5 100644 --- a/benchmark/fluid/models/resnet.py +++ b/benchmark/fluid/models/resnet.py @@ -20,6 +20,7 @@ import functools import numpy as np import time import os +import math import cProfile, pstats, StringIO @@ -27,128 +28,120 @@ import paddle import paddle.fluid as fluid import paddle.fluid.core as core import paddle.fluid.profiler as profiler -# from recordio_converter import imagenet_train, imagenet_test from imagenet_reader import train, val +train_parameters = { + "input_size": [3, 224, 224], + "input_mean": [0.485, 0.456, 0.406], + "input_std": [0.229, 0.224, 0.225], + "learning_strategy": { + "name": "piecewise_decay", + "batch_size": 256, + "epochs": [30, 60, 90], + "steps": [0.1, 0.01, 0.001, 0.0001] + } +} + + +class ResNet(): + def __init__(self, layers=50, is_train=True): + self.params = train_parameters + self.layers = layers + self.is_train = is_train + + def net(self, input, class_dim=1000): + layers = self.layers + supported_layers = [50, 101, 152] + assert layers in supported_layers, \ + "supported layers are {} but input layer is {}".format(supported_layers, layers) + + if layers == 50: + depth = [3, 4, 6, 3] + elif layers == 101: + depth = [3, 4, 23, 3] + elif layers == 152: + depth = [3, 8, 36, 3] + num_filters = [64, 128, 256, 512] + + conv = self.conv_bn_layer( + input=input, num_filters=64, filter_size=7, stride=2, act='relu') + conv = fluid.layers.pool2d( + input=conv, + pool_size=3, + pool_stride=2, + pool_padding=1, + pool_type='max') + + for block in range(len(depth)): + for i in range(depth[block]): + conv = self.bottleneck_block( + input=conv, + num_filters=num_filters[block], + stride=2 if i == 0 and block != 0 else 1) + + pool = fluid.layers.pool2d( + input=conv, pool_size=7, pool_type='avg', global_pooling=True) + stdv = 1.0 / math.sqrt(pool.shape[1] * 1.0) + out = fluid.layers.fc(input=pool, + size=class_dim, + act='softmax', + param_attr=fluid.param_attr.ParamAttr( + initializer=fluid.initializer.Uniform(-stdv, + stdv))) + return out + + def conv_bn_layer(self, + input, + num_filters, + filter_size, + stride=1, + groups=1, + act=None): + conv = fluid.layers.conv2d( + input=input, + num_filters=num_filters, + filter_size=filter_size, + stride=stride, + padding=(filter_size - 1) // 2, + groups=groups, + act=None, + bias_attr=False) + return fluid.layers.batch_norm( + input=conv, act=act, is_test=not self.is_train) + + def shortcut(self, input, ch_out, stride): + ch_in = input.shape[1] + if ch_in != ch_out or stride != 1: + return self.conv_bn_layer(input, ch_out, 1, stride) + else: + return input -def conv_bn_layer(input, - ch_out, - filter_size, - stride, - padding, - act='relu', - is_train=True): - conv1 = fluid.layers.conv2d( - 
input=input, - filter_size=filter_size, - num_filters=ch_out, - stride=stride, - padding=padding, - act=None, - bias_attr=False) - return fluid.layers.batch_norm(input=conv1, act=act, is_test=not is_train) - - -def shortcut(input, ch_out, stride, is_train=True): - ch_in = input.shape[1] # if args.data_format == 'NCHW' else input.shape[-1] - if ch_in != ch_out: - return conv_bn_layer( - input, ch_out, 1, stride, 0, None, is_train=is_train) - else: - return input - - -def basicblock(input, ch_out, stride, is_train=True): - short = shortcut(input, ch_out, stride, is_train=is_train) - conv1 = conv_bn_layer(input, ch_out, 3, stride, 1, is_train=is_train) - conv2 = conv_bn_layer(conv1, ch_out, 3, 1, 1, act=None, is_train=is_train) - return fluid.layers.elementwise_add(x=short, y=conv2, act='relu') - - -def bottleneck(input, ch_out, stride, is_train=True): - short = shortcut(input, ch_out * 4, stride, is_train=is_train) - conv1 = conv_bn_layer(input, ch_out, 1, stride, 0, is_train=is_train) - conv2 = conv_bn_layer(conv1, ch_out, 3, 1, 1, is_train=is_train) - conv3 = conv_bn_layer( - conv2, ch_out * 4, 1, 1, 0, act=None, is_train=is_train) - return fluid.layers.elementwise_add(x=short, y=conv3, act='relu') - - -def layer_warp(block_func, input, ch_out, count, stride): - res_out = block_func(input, ch_out, stride) - for i in range(1, count): - res_out = block_func(res_out, ch_out, 1) - return res_out - + def bottleneck_block(self, input, num_filters, stride): + conv0 = self.conv_bn_layer( + input=input, num_filters=num_filters, filter_size=1, act='relu') + conv1 = self.conv_bn_layer( + input=conv0, + num_filters=num_filters, + filter_size=3, + stride=stride, + act='relu') + conv2 = self.conv_bn_layer( + input=conv1, num_filters=num_filters * 4, filter_size=1, act=None) -def resnet_imagenet(input, - class_dim, - depth=50, - data_format='NCHW', - is_train=True): + short = self.shortcut(input, num_filters * 4, stride) - cfg = { - 18: ([2, 2, 2, 1], basicblock), - 34: ([3, 4, 6, 3], basicblock), - 50: ([3, 4, 6, 3], bottleneck), - 101: ([3, 4, 23, 3], bottleneck), - 152: ([3, 8, 36, 3], bottleneck) - } - stages, block_func = cfg[depth] - conv1 = conv_bn_layer(input, ch_out=64, filter_size=7, stride=2, padding=3) - pool1 = fluid.layers.pool2d( - input=conv1, pool_type='avg', pool_size=3, pool_stride=2) - res1 = layer_warp(block_func, pool1, 64, stages[0], 1) - res2 = layer_warp(block_func, res1, 128, stages[1], 2) - res3 = layer_warp(block_func, res2, 256, stages[2], 2) - res4 = layer_warp(block_func, res3, 512, stages[3], 2) - pool2 = fluid.layers.pool2d( - input=res4, - pool_size=7, - pool_type='avg', - pool_stride=1, - global_pooling=True) - out = fluid.layers.fc(input=pool2, size=class_dim, act='softmax') - return out - - -def resnet_cifar10(input, class_dim, depth=32, data_format='NCHW'): - assert (depth - 2) % 6 == 0 - - n = (depth - 2) // 6 - - conv1 = conv_bn_layer( - input=input, ch_out=16, filter_size=3, stride=1, padding=1) - res1 = layer_warp(basicblock, conv1, 16, n, 1) - res2 = layer_warp(basicblock, res1, 32, n, 2) - res3 = layer_warp(basicblock, res2, 64, n, 2) - pool = fluid.layers.pool2d( - input=res3, pool_size=8, pool_type='avg', pool_stride=1) - out = fluid.layers.fc(input=pool, size=class_dim, act='softmax') - return out + return fluid.layers.elementwise_add(x=short, y=conv2, act='relu') def _model_reader_dshape_classdim(args, is_train): - model = resnet_cifar10 + model = None reader = None - if args.data_set == "cifar10": - class_dim = 10 - if args.data_format == 'NCHW': - dshape 
= [3, 32, 32] - else: - dshape = [32, 32, 3] - model = resnet_cifar10 - if is_train: - reader = paddle.dataset.cifar.train10() - else: - reader = paddle.dataset.cifar.test10() - elif args.data_set == "flowers": + if args.data_set == "flowers": class_dim = 102 if args.data_format == 'NCHW': dshape = [3, 224, 224] else: dshape = [224, 224, 3] - model = resnet_imagenet if is_train: reader = paddle.dataset.flowers.train() else: @@ -159,7 +152,6 @@ def _model_reader_dshape_classdim(args, is_train): dshape = [3, 224, 224] else: dshape = [224, 224, 3] - model = resnet_imagenet if not args.data_path: raise Exception( "Must specify --data_path when training with imagenet") @@ -173,12 +165,11 @@ def _model_reader_dshape_classdim(args, is_train): reader = train(xmap=False) else: reader = val(xmap=False) - return model, reader, dshape, class_dim + return reader, dshape, class_dim def get_model(args, is_train, main_prog, startup_prog): - model, reader, dshape, class_dim = _model_reader_dshape_classdim(args, - is_train) + reader, dshape, class_dim = _model_reader_dshape_classdim(args, is_train) pyreader = None trainer_count = int(os.getenv("PADDLE_TRAINERS")) @@ -198,7 +189,8 @@ def get_model(args, is_train, main_prog, startup_prog): label = fluid.layers.data( name='label', shape=[1], dtype='int64') - predict = model(input, class_dim, is_train=is_train) + model = ResNet(is_train=is_train) + predict = model.net(input, class_dim=class_dim) cost = fluid.layers.cross_entropy(input=predict, label=label) avg_cost = fluid.layers.mean(x=cost) @@ -215,16 +207,15 @@ def get_model(args, is_train, main_prog, startup_prog): total_images = 1281167 / trainer_count - step = int(total_images / args.batch_size + 1) - epochs = [30, 60, 80, 90] + step = int(total_images / (args.batch_size * args.gpus) + 1) + epochs = [30, 60, 90] bd = [step * e for e in epochs] base_lr = args.learning_rate lr = [] lr = [base_lr * (0.1**i) for i in range(len(bd) + 1)] optimizer = fluid.optimizer.Momentum( - learning_rate=base_lr, - #learning_rate=fluid.layers.piecewise_decay( - # boundaries=bd, values=lr), + learning_rate=fluid.layers.piecewise_decay( + boundaries=bd, values=lr), momentum=0.9, regularization=fluid.regularizer.L2Decay(1e-4)) optimizer.minimize(avg_cost) diff --git a/doc/fluid/dev/releasing_process_cn.md b/doc/fluid/dev/releasing_process_cn.md index 4c6728fba7150b0f1e180e57590f18a5b677c70d..acea9a2b5df903a958edf3683900e165670e196f 100644 --- a/doc/fluid/dev/releasing_process_cn.md +++ b/doc/fluid/dev/releasing_process_cn.md @@ -1,24 +1,23 @@ # PaddlePaddle发行规范 -PaddlePaddle使用git-flow branching model做分支管理,使用[Semantic Versioning](http://semver.org/)标准表示PaddlePaddle版本号。 +PaddlePaddle使用Trunk Based Development,使用[Semantic Versioning](http://semver.org/)标准表示PaddlePaddle版本号。 PaddlePaddle每次发新的版本,遵循以下流程: 1. 从`develop`分支派生出新的分支,分支名为`release/版本号`。例如,`release/0.10.0` -1. 将新分支的版本打上tag,tag为`版本号rc.Patch号`。第一个tag为`0.10.0rc1`,第二个为`0.10.0rc2`,依次类推。 -1. 对这个版本的提交,做如下几个操作: - * 使用Regression Test List作为检查列表,测试本次release的正确性。 - * 如果失败,记录下所有失败的例子,在这个`release/版本号`分支中,修复所有bug后,Patch号加一,到第二步 - * 修改`python/setup.py.in`中的版本信息,并将`istaged`字段设为`True`。 - * 将这个版本的python wheel包发布到pypi。 - * 更新Docker镜像(参考后面的操作细节)。 -1. 第三步完成后,将`release/版本号`分支合入master分支,将master分支的合入commit打上tag,tag为`版本号`。同时再将`master`分支合入`develop`分支。 -1. 协同完成Release Note的书写。 +2. 将新分支的版本打上tag,tag为`版本号rc-Patch号`。例如,第一个tag为`0.10.0-rc0`。 +3. 新分支一般不接受新的feature和优化。QA在release分支上进行测试。研发基于最新的develop开发。 +4. QA和研发发现的bug,在develop上修复验证后,cherry-pick修复到release分支。直到release分支相对稳定。 +5. 
如果有需要,在release分支最新代码上打上新的tag,比如`0.10.0-rc1`,让更多的用户加入测试。重复3-4步。
+6. release分支稳定后,打上正式的release tag,比如`0.10.0`。
+7. 将这个版本的python wheel包发布到pypi。
+8. 更新Docker镜像(参考后面的操作细节)。
 
 需要注意的是:
 
-* `release/版本号`分支一旦建立,一般不允许再从`develop`分支合入`release/版本号`。这样保证`release/版本号`分支功能的封闭,方便测试人员测试PaddlePaddle的行为。
-* 在`release/版本号`分支存在的时候,如果有bugfix的行为,需要将bugfix的分支同时merge到`master`, `develop`和`release/版本号`这三个分支。
+* bug修复需要先在develop上进行,然后进入release分支。而不是直接在release分支上开发。
+
+* release分支原则上只接受修复类的修改,不接受新feature。
 
 ## 发布wheel包到pypi
 
@@ -61,24 +60,21 @@ docker push [镜像]:[version]
 
 ## PaddlePaddle 分支规范
 
-PaddlePaddle开发过程使用[git-flow](http://nvie.com/posts/a-successful-git-branching-model/)分支规范,并适应github的特性做了一些区别。
-
-* PaddlePaddle的主版本库遵循[git-flow](http://nvie.com/posts/a-successful-git-branching-model/)分支规范。其中:
-  * `master`分支为稳定(stable branch)版本分支。每一个`master`分支的版本都是经过单元测试和回归测试的版本。
-  * `develop`分支为开发(develop branch)版本分支。每一个`develop`分支的版本都经过单元测试,但并没有经过回归测试。
-  * `release/版本号`分支为每一次Release时建立的临时分支。在这个阶段的代码正在经历回归测试。
+PaddlePaddle开发过程使用[Trunk Based Development](https://trunkbaseddevelopment.com/) 开发规范。
 
-* 其他用户的fork版本库并不需要严格遵守[git-flow](http://nvie.com/posts/a-successful-git-branching-model/)分支规范,但所有fork的版本库的所有分支都相当于特性分支。
-  * 建议,开发者fork的版本库使用`develop`分支同步主版本库的`develop`分支
-  * 建议,开发者fork的版本库中,再基于`develop`版本fork出自己的功能分支。
-  * 当功能分支开发完毕后,向PaddlePaddle的主版本库提交`Pull Reuqest`,进而进行代码评审。
-  * 在评审过程中,开发者修改自己的代码,可以继续在自己的功能分支提交代码。
+* `develop`分支为开发(develop branch)版本分支。每一个`develop`分支的版本都经过单元测试。并且会经过模型回归测试。
+* `release/版本号`分支为每一次Release时建立的临时分支。release分支主要用于测试,bug修复和最终发版。
+* `master`分支因为历史原因,已经废弃。
 
-* BugFix分支也是在开发者自己的fork版本库维护,与功能分支不同的是,BugFix分支需要分别给主版本库的`master`、`develop`与可能有的`release/版本号`分支,同时提起`Pull Request`。
+* 其他开发者fork的feature branch。
+  * 建议,开发者的feature branch需要同步主版本库的`develop`分支。
+  * 建议,开发者的feature branch需要基于主版本库中的`develop`分支。
+  * 当feature branch开发完毕后,向PaddlePaddle的主版本库提交`Pull Request`,进而进行代码评审。
+  * 在评审过程中,开发者修改自己的代码,可以继续在自己的feature branch提交代码。
 
 ## PaddlePaddle回归测试列表
 
-本列表说明PaddlePaddle发版之前需要测试的功能点。
+TODO
 
 ### PaddlePaddle Book中所有章节
 
diff --git a/doc/fluid/dev/releasing_process_en.md b/doc/fluid/dev/releasing_process_en.md
index 2c1c30c1eddfde6d9a8e2637be86537c43cc1b00..b810dc941d27fdb5004812ab58e105502e83280f 100644
--- a/doc/fluid/dev/releasing_process_en.md
+++ b/doc/fluid/dev/releasing_process_en.md
@@ -4,26 +4,21 @@ PaddlePaddle manages its branches using "git-flow branching model", and [Semanti
 
 Each time we release a new PaddlePaddle version, we should follow the below steps:
 
-1. Fork a new branch from `develop` named `release/[version]`, e.g. `release/0.10.0`.
-1. Push a new tag on the release branch, the tag name should be like `[version]rc.patch`. The
-   first tag should be `0.10.0rc1`, and the second should be `0.10.0.rc2` and so on.
-1. After that, we should do:
-   * Run all regression test on the Regression Test List (see PaddlePaddle TeamCity CI), to confirm
-     that this release has no major bugs.
-     * If regression test fails, we must fix those bugs and create a new `release/[version]`
-       branch from previous release branch.
-   * Modify `python/setup.py.in`, change the version number and change `ISTAGED` to `True`.
-   * Publish PaddlePaddle release wheel packages to pypi (see below instructions for detail).
-   * Update the Docker images (see below instructions for detail).
-1. After above step, merge `release/[version]` branch to master and push a tag on the master commit,
-   then merge `master` to `develop`.
-1. Update the Release Note.
-
-***NOTE:***
-
-* Do ***NOT*** merge commits from develop branch to release branches to keep the release branch contain
-  features only for current release, so that we can test on that version.
-* If we want to fix bugs on release branches, we must merge the fix to master, develop and release branch.
+1. Create a new release branch from `develop`, named `release/[version]`, e.g. `release/0.10.0`.
+2. Create a new tag on the release branch, in the format `version-rc.Patch`, e.g. the first tag is `0.10.0-rc0`.
+3. A new release branch normally doesn't accept new features or optimizations. QA tests on the release branch, while developers keep working on the latest `develop` branch.
+4. If QA or developers find bugs, they should first fix and verify them on the `develop` branch, then cherry-pick the fixes to the release branch. Repeat until the release branch is stable.
+5. If necessary, create a new tag on the release branch, e.g. `0.10.0-rc1`, to involve more users in testing, and repeat steps 3-4.
+6. After the release branch is stable, create the official release tag, such as `0.10.0`.
+7. Release the Python wheel package to pypi.
+8. Update the Docker image (more details below).
+
+NOTE:
+
+* Bug fixes should happen on the `develop` branch first and then be cherry-picked to the release branch. Avoid developing directly on the release branch.
+
+* Release branches normally accept only bug fixes; don't add new features.
+
 
 ## Publish Wheel Packages to pypi
 
@@ -97,26 +92,22 @@ You can then checkout the latest pushed tags at https://hub.docker.com/r/paddlep
 
 ## Branching Model
 
-We use [git-flow](http://nvie.com/posts/a-successful-git-branching-model/) as our branching model,
-with some modifications:
-
-* `master` branch is the stable branch. Each version on the master branch is tested and guaranteed.
-* `develop` branch is for development. Each commit on develop branch has passed CI unit test, but no
-  regression tests are run.
-* `release/[version]` branch is used to publish each release. Latest release version branches have
-  bugfix only for that version, but no feature updates.
-* Developer forks are not required to follow
-  [git-flow](http://nvie.com/posts/a-successful-git-branching-model/)
-  branching model, all forks is like a feature branch.
-  * Advise: developer fork's develop branch is used to sync up with main repo's develop branch.
-  * Advise: developer use it's fork's develop branch to for new branch to start developing.
-  * Use that branch on developer's fork to create pull requests and start reviews.
-  * developer can push new commits to that branch when the pull request is open.
-* Bug fixes are also started from developers forked repo. And, bug fixes branch can merge to
-  `master`, `develop` and `releases`.
+PaddlePaddle uses [Trunk Based Development](https://trunkbaseddevelopment.com/) as our branching model.
+
+* The `develop` branch is used for development. Each commit to the `develop` branch goes through unit tests and model regression tests.
+* A `release/[version]` branch is created for each release. The release branch is used for testing, bug fixes, and the eventual release.
+* The `master` branch has been deprecated for historical reasons.
+
+* Developers work on their own feature branches.
+  * A feature branch should sync with the upstream `develop` branch.
+  * A feature branch should be forked from the upstream `develop` branch.
+  * After the feature branch is ready, create a `Pull Request` against the Paddle repo and go through code review.
+  * During review, developers modify their code and push new commits to their own feature branch.
## PaddlePaddle Regression Test List +TODO + ### All Chapters of PaddlePaddle Book We need to guarantee that all the chapters of PaddlePaddle Book can run correctly. Including diff --git a/paddle/fluid/API.spec b/paddle/fluid/API.spec index ae5f30e431aba4cae04b0fb35f00bce84f18de33..e362d3486487dd0b55e3e40d1c1358f2e5604ac5 100644 --- a/paddle/fluid/API.spec +++ b/paddle/fluid/API.spec @@ -59,7 +59,7 @@ paddle.fluid.DistributeTranspiler.__init__ ArgSpec(args=['self', 'config'], vara paddle.fluid.DistributeTranspiler.get_pserver_program ArgSpec(args=['self', 'endpoint'], varargs=None, keywords=None, defaults=None) paddle.fluid.DistributeTranspiler.get_pserver_programs ArgSpec(args=['self', 'endpoint'], varargs=None, keywords=None, defaults=None) paddle.fluid.DistributeTranspiler.get_startup_program ArgSpec(args=['self', 'endpoint', 'pserver_program', 'startup_program'], varargs=None, keywords=None, defaults=(None, None)) -paddle.fluid.DistributeTranspiler.get_trainer_program ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None) +paddle.fluid.DistributeTranspiler.get_trainer_program ArgSpec(args=['self', 'wait_port'], varargs=None, keywords=None, defaults=(True,)) paddle.fluid.DistributeTranspiler.transpile ArgSpec(args=['self', 'trainer_id', 'program', 'pservers', 'trainers', 'sync_mode', 'startup_program'], varargs=None, keywords=None, defaults=(None, '127.0.0.1:6174', 1, True, None)) paddle.fluid.InferenceTranspiler.__init__ paddle.fluid.InferenceTranspiler.transpile ArgSpec(args=['self', 'program', 'place', 'scope'], varargs=None, keywords=None, defaults=(None,)) @@ -100,7 +100,7 @@ paddle.fluid.layers.gru_unit ArgSpec(args=['input', 'hidden', 'size', 'param_att paddle.fluid.layers.linear_chain_crf ArgSpec(args=['input', 'label', 'param_attr'], varargs=None, keywords=None, defaults=(None,)) paddle.fluid.layers.crf_decoding ArgSpec(args=['input', 'param_attr', 'label'], varargs=None, keywords=None, defaults=(None,)) paddle.fluid.layers.cos_sim ArgSpec(args=['X', 'Y'], varargs=None, keywords=None, defaults=None) -paddle.fluid.layers.cross_entropy ArgSpec(args=['input', 'label', 'soft_label'], varargs=None, keywords=None, defaults=(False,)) +paddle.fluid.layers.cross_entropy ArgSpec(args=['input', 'label', 'soft_label', 'ignore_index'], varargs=None, keywords=None, defaults=(False, -100)) paddle.fluid.layers.square_error_cost ArgSpec(args=['input', 'label'], varargs=None, keywords=None, defaults=None) paddle.fluid.layers.chunk_eval ArgSpec(args=['input', 'label', 'chunk_scheme', 'num_chunk_types', 'excluded_chunk_types'], varargs=None, keywords=None, defaults=(None,)) paddle.fluid.layers.sequence_conv ArgSpec(args=['input', 'num_filters', 'filter_size', 'filter_stride', 'padding', 'bias_attr', 'param_attr', 'act'], varargs=None, keywords=None, defaults=(3, 1, None, None, None, None)) @@ -142,7 +142,7 @@ paddle.fluid.layers.beam_search ArgSpec(args=['pre_ids', 'pre_scores', 'ids', 's paddle.fluid.layers.row_conv ArgSpec(args=['input', 'future_context_size', 'param_attr', 'act'], varargs=None, keywords=None, defaults=(None, None)) paddle.fluid.layers.multiplex ArgSpec(args=['inputs', 'index'], varargs=None, keywords=None, defaults=None) paddle.fluid.layers.layer_norm ArgSpec(args=['input', 'scale', 'shift', 'begin_norm_axis', 'epsilon', 'param_attr', 'bias_attr', 'act', 'name'], varargs=None, keywords=None, defaults=(True, True, 1, 1e-05, None, None, None, None)) -paddle.fluid.layers.softmax_with_cross_entropy ArgSpec(args=['logits', 'label', 'soft_label'], varargs=None, 
keywords=None, defaults=(False,)) +paddle.fluid.layers.softmax_with_cross_entropy ArgSpec(args=['logits', 'label', 'soft_label', 'ignore_index'], varargs=None, keywords=None, defaults=(False, -100)) paddle.fluid.layers.smooth_l1 ArgSpec(args=['x', 'y', 'inside_weight', 'outside_weight', 'sigma'], varargs=None, keywords=None, defaults=(None, None, None)) paddle.fluid.layers.one_hot ArgSpec(args=['input', 'depth'], varargs=None, keywords=None, defaults=None) paddle.fluid.layers.autoincreased_step_counter ArgSpec(args=['counter_name', 'begin', 'step'], varargs=None, keywords=None, defaults=(None, 1, 1)) @@ -305,9 +305,9 @@ paddle.fluid.layers.target_assign ArgSpec(args=['input', 'matched_indices', 'neg paddle.fluid.layers.detection_output ArgSpec(args=['loc', 'scores', 'prior_box', 'prior_box_var', 'background_label', 'nms_threshold', 'nms_top_k', 'keep_top_k', 'score_threshold', 'nms_eta'], varargs=None, keywords=None, defaults=(0, 0.3, 400, 200, 0.01, 1.0)) paddle.fluid.layers.ssd_loss ArgSpec(args=['location', 'confidence', 'gt_box', 'gt_label', 'prior_box', 'prior_box_var', 'background_label', 'overlap_threshold', 'neg_pos_ratio', 'neg_overlap', 'loc_loss_weight', 'conf_loss_weight', 'match_type', 'mining_type', 'normalize', 'sample_size'], varargs=None, keywords=None, defaults=(None, 0, 0.5, 3.0, 0.5, 1.0, 1.0, 'per_prediction', 'max_negative', True, None)) paddle.fluid.layers.detection_map ArgSpec(args=['detect_res', 'label', 'class_num', 'background_label', 'overlap_threshold', 'evaluate_difficult', 'has_state', 'input_states', 'out_states', 'ap_version'], varargs=None, keywords=None, defaults=(0, 0.3, True, None, None, None, 'integral')) -paddle.fluid.layers.rpn_target_assign ArgSpec(args=['loc', 'scores', 'anchor_box', 'anchor_var', 'gt_box', 'rpn_batch_size_per_im', 'fg_fraction', 'rpn_positive_overlap', 'rpn_negative_overlap'], varargs=None, keywords=None, defaults=(256, 0.25, 0.7, 0.3)) +paddle.fluid.layers.rpn_target_assign ArgSpec(args=['bbox_pred', 'cls_logits', 'anchor_box', 'anchor_var', 'gt_boxes', 'is_crowd', 'im_info', 'rpn_batch_size_per_im', 'rpn_straddle_thresh', 'rpn_fg_fraction', 'rpn_positive_overlap', 'rpn_negative_overlap', 'use_random'], varargs=None, keywords=None, defaults=(256, 0.0, 0.5, 0.7, 0.3, True)) paddle.fluid.layers.anchor_generator ArgSpec(args=['input', 'anchor_sizes', 'aspect_ratios', 'variance', 'stride', 'offset', 'name'], varargs=None, keywords=None, defaults=(None, None, [0.1, 0.1, 0.2, 0.2], None, 0.5, None)) -paddle.fluid.layers.generate_proposal_labels ArgSpec(args=['rpn_rois', 'gt_classes', 'gt_boxes', 'im_scales', 'batch_size_per_im', 'fg_fraction', 'fg_thresh', 'bg_thresh_hi', 'bg_thresh_lo', 'bbox_reg_weights', 'class_nums'], varargs=None, keywords=None, defaults=(256, 0.25, 0.25, 0.5, 0.0, [0.1, 0.1, 0.2, 0.2], None)) +paddle.fluid.layers.generate_proposal_labels ArgSpec(args=['rpn_rois', 'gt_classes', 'is_crowd', 'gt_boxes', 'im_info', 'batch_size_per_im', 'fg_fraction', 'fg_thresh', 'bg_thresh_hi', 'bg_thresh_lo', 'bbox_reg_weights', 'class_nums', 'use_random'], varargs=None, keywords=None, defaults=(256, 0.25, 0.25, 0.5, 0.0, [0.1, 0.1, 0.2, 0.2], None, True)) paddle.fluid.layers.generate_proposals ArgSpec(args=['scores', 'bbox_deltas', 'im_info', 'anchors', 'variances', 'pre_nms_top_n', 'post_nms_top_n', 'nms_thresh', 'min_size', 'eta', 'name'], varargs=None, keywords=None, defaults=(6000, 1000, 0.5, 0.1, 1.0, None)) paddle.fluid.layers.iou_similarity ArgSpec(args=[], varargs='args', keywords='kwargs', defaults=None) 
paddle.fluid.layers.box_coder ArgSpec(args=[], varargs='args', keywords='kwargs', defaults=None) @@ -346,7 +346,7 @@ paddle.fluid.transpiler.DistributeTranspiler.__init__ ArgSpec(args=['self', 'con paddle.fluid.transpiler.DistributeTranspiler.get_pserver_program ArgSpec(args=['self', 'endpoint'], varargs=None, keywords=None, defaults=None) paddle.fluid.transpiler.DistributeTranspiler.get_pserver_programs ArgSpec(args=['self', 'endpoint'], varargs=None, keywords=None, defaults=None) paddle.fluid.transpiler.DistributeTranspiler.get_startup_program ArgSpec(args=['self', 'endpoint', 'pserver_program', 'startup_program'], varargs=None, keywords=None, defaults=(None, None)) -paddle.fluid.transpiler.DistributeTranspiler.get_trainer_program ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None) +paddle.fluid.transpiler.DistributeTranspiler.get_trainer_program ArgSpec(args=['self', 'wait_port'], varargs=None, keywords=None, defaults=(True,)) paddle.fluid.transpiler.DistributeTranspiler.transpile ArgSpec(args=['self', 'trainer_id', 'program', 'pservers', 'trainers', 'sync_mode', 'startup_program'], varargs=None, keywords=None, defaults=(None, '127.0.0.1:6174', 1, True, None)) paddle.fluid.transpiler.InferenceTranspiler.__init__ paddle.fluid.transpiler.InferenceTranspiler.transpile ArgSpec(args=['self', 'program', 'place', 'scope'], varargs=None, keywords=None, defaults=(None,)) diff --git a/paddle/fluid/framework/CMakeLists.txt b/paddle/fluid/framework/CMakeLists.txt index cc7938b2ac07f11ceb7f33a2e37380d1e2ed2072..d998109df21f585bc4905e00e59fe07247fd3f5e 100644 --- a/paddle/fluid/framework/CMakeLists.txt +++ b/paddle/fluid/framework/CMakeLists.txt @@ -56,9 +56,9 @@ else() cc_test(mixed_vector_test SRCS mixed_vector_test.cc DEPS place memory device_context tensor) endif() if (NOT WIN32) -cc_library(lod_tensor SRCS lod_tensor.cc DEPS ddim place tensor framework_proto recordio) +cc_library(lod_tensor SRCS lod_tensor.cc DEPS ddim place tensor framework_proto recordio version) else() -cc_library(lod_tensor SRCS lod_tensor.cc DEPS ddim place tensor framework_proto) +cc_library(lod_tensor SRCS lod_tensor.cc DEPS ddim place tensor framework_proto version) endif (NOT WIN32) cc_test(lod_tensor_test SRCS lod_tensor_test.cc DEPS lod_tensor memory) @@ -116,7 +116,11 @@ cc_library(operator SRCS operator.cc DEPS op_info device_context tensor scope gl endif(NOT WIN32) cc_test(operator_test SRCS operator_test.cc DEPS operator op_registry device_context) -cc_library(proto_desc SRCS var_desc.cc op_desc.cc block_desc.cc program_desc.cc DEPS shape_inference op_info operator glog) + +cc_library(version SRCS version.cc) +cc_test(version_test SRCS version_test.cc DEPS version) + +cc_library(proto_desc SRCS var_desc.cc op_desc.cc block_desc.cc program_desc.cc DEPS shape_inference op_info operator glog version) cc_library(op_registry SRCS op_registry.cc DEPS op_proto_maker op_info operator glog proto_desc) nv_test(op_registry_test SRCS op_registry_test.cc DEPS op_registry) diff --git a/paddle/fluid/framework/details/all_reduce_op_handle.cc b/paddle/fluid/framework/details/all_reduce_op_handle.cc index bf493a3fa44e48deec734250d04b2a413c3ed9da..7c5f5bd80a937bf1a1c891155764833d7b21c5c2 100644 --- a/paddle/fluid/framework/details/all_reduce_op_handle.cc +++ b/paddle/fluid/framework/details/all_reduce_op_handle.cc @@ -46,7 +46,8 @@ AllReduceOpHandle::AllReduceOpHandle(ir::Node *node, #endif void AllReduceOpHandle::RunImpl() { - platform::RecordEvent r("all_reduce", nullptr); + platform::RecordEvent 
record_event(Name(), dev_ctxes_.begin()->second); + if (NoDummyInputSize() == 1) { return; // No need to all reduce when GPU count = 1; } else { diff --git a/paddle/fluid/framework/details/broadcast_op_handle.cc b/paddle/fluid/framework/details/broadcast_op_handle.cc index 1d9f1bd6e417e30f0799f0bbed1739cedb4e8fbf..4fdab5cd94358d08eac7f8b041bf16d09042f0bd 100644 --- a/paddle/fluid/framework/details/broadcast_op_handle.cc +++ b/paddle/fluid/framework/details/broadcast_op_handle.cc @@ -15,12 +15,15 @@ #include "paddle/fluid/framework/details/broadcast_op_handle.h" #include "paddle/fluid/framework/details/container_cast.h" #include "paddle/fluid/framework/details/variable_visitor.h" +#include "paddle/fluid/platform/profiler.h" namespace paddle { namespace framework { namespace details { void BroadcastOpHandle::RunImpl() { + platform::RecordEvent record_event(Name(), dev_ctxes_.begin()->second); + if (places_.size() == 1) return; // The input and output may have dummy vars. diff --git a/paddle/fluid/framework/details/multi_devices_graph_pass.cc b/paddle/fluid/framework/details/multi_devices_graph_pass.cc index 7a99169849debcbc57d6f197b36c5045b211f3ef..250e093a5f789dba6b06df4889c060c294d469fe 100644 --- a/paddle/fluid/framework/details/multi_devices_graph_pass.cc +++ b/paddle/fluid/framework/details/multi_devices_graph_pass.cc @@ -348,14 +348,31 @@ std::unique_ptr MultiDevSSAGraphBuilder::ApplyImpl( size_t cur_device_id = 0; bool is_forwarding = true; + bool is_dist_train = false; for (ir::Node *node : sorted_ops) { if (boost::get( node->Op()->GetAttr(OpProtoAndCheckerMaker::OpRoleAttrName())) == static_cast(OpRole::kRPC)) { - CreateRPCOp(&result, node); + int op_dev_id = CreateRPCOp(&result, node); + PADDLE_ENFORCE(op_dev_id != -1, + "Can not schedule the RPC operator to the right place."); + if (node->Op()->Type() == "recv") { + auto recv_vars_attr = + boost::get>(node->Op()->GetNullableAttr( + OpProtoAndCheckerMaker::OpRoleVarAttrName())); + PADDLE_ENFORCE(recv_vars_attr.size() == 2UL); // [parameter, gradient] + if (recv_vars_attr[0].find(".block") == std::string::npos) { + bcast_var_name_set[op_dev_id].emplace(recv_vars_attr[0]); + } + } + is_dist_train = true; } else if (IsDistTrainOp(node, send_vars, recv_vars)) { - CreateDistTrainOp(&result, node); + int op_dev_id = CreateDistTrainOp(&result, node); + if (node->Op()->Type() == "concat") { + auto origin_param_name = node->Op()->OutputArgumentNames()[0]; + bcast_var_name_set[op_dev_id].emplace(origin_param_name); + } } else if (IsScaleLossOp(node)) { // user can customize loss@grad if not use_default_grad_scale_ if (strategy_.gradient_scale_ != @@ -414,7 +431,9 @@ std::unique_ptr MultiDevSSAGraphBuilder::ApplyImpl( CreateReduceOp(&result, g_name, cur_device_id); graph->Get(kShardedVarDevice) .emplace(g_name, cur_device_id); - bcast_var_name_set[cur_device_id].emplace(p_name); + if (!is_dist_train) { + bcast_var_name_set[cur_device_id].emplace(p_name); + } break; case BuildStrategy::ReduceStrategy::kAllReduce: if (IsSparseGradient(g_name)) { @@ -436,15 +455,19 @@ std::unique_ptr MultiDevSSAGraphBuilder::ApplyImpl( } } } - bool use_gpu = false; #ifdef PADDLE_WITH_CUDA use_gpu = nccl_ctxs_ != nullptr; #endif - if (use_gpu || - strategy_.reduce_ == BuildStrategy::ReduceStrategy::kAllReduce) { - // Insert BCast Ops + // Insert broadcast operators principle: + // 1. Broadcast optimized parameters in Reduce strategy; + // 2. 
No need to broadcast optimized parameters in AllReduce strategy, because
+  //    the optimization sub-graph runs on every GPU;
+  // 3. Always broadcast received parameters in distributed training.
+  if ((use_gpu &&
+       strategy_.reduce_ == BuildStrategy::ReduceStrategy::kReduce) ||
+      is_dist_train) {
     for (size_t dev_id = 0; dev_id < bcast_var_name_set.size(); ++dev_id) {
       auto &to_bcast_set = bcast_var_name_set[dev_id];
       for (auto &bcast_name : to_bcast_set) {
@@ -676,8 +699,8 @@ VarHandle *MultiDevSSAGraphBuilder::CreateReduceOp(ir::Graph *result,
   return var;
 }
 
-void MultiDevSSAGraphBuilder::CreateDistTrainOp(ir::Graph *result,
-                                                ir::Node *node) const {
+int MultiDevSSAGraphBuilder::CreateDistTrainOp(ir::Graph *result,
+                                               ir::Node *node) const {
   int op_dev_id = -1;
   std::vector<std::string> input_var_names;
   std::vector<std::string> output_var_names;
@@ -720,6 +743,7 @@ void MultiDevSSAGraphBuilder::CreateDistTrainOp(ir::Graph *result,
                  node->Op()->Type());
 
   CreateComputationalOp(result, node, op_dev_id);
+  return op_dev_id;
 }
 
 void SetOpInputsAllPlaces(ir::Graph *result, ir::Node *node, int num_places) {
@@ -738,8 +762,8 @@ void SetOpInputsAllPlaces(ir::Graph *result, ir::Node *node, int num_places) {
 }
 
 // Create RPC related op handles that connects its in ops and out ops.
-void MultiDevSSAGraphBuilder::CreateRPCOp(ir::Graph *result,
-                                          ir::Node *node) const {
+int MultiDevSSAGraphBuilder::CreateRPCOp(ir::Graph *result,
+                                         ir::Node *node) const {
   int op_dev_id = -1;
   if (node->Op()->Type() == "send") {
     // TODO(paddle-dev): getting the first var is not safe.
@@ -825,6 +849,7 @@ void MultiDevSSAGraphBuilder::CreateRPCOp(ir::Graph *result,
       CreateOpOutput(result, op_handle, new_node, p, outvar_dev_id);
     }
   }
+  return op_dev_id;
 }
 
 bool MultiDevSSAGraphBuilder::IsScaleLossOp(ir::Node *node) const {
diff --git a/paddle/fluid/framework/details/multi_devices_graph_pass.h b/paddle/fluid/framework/details/multi_devices_graph_pass.h
index ac6d9c5a64cfde60f75c76dae0a30cc7d735e996..1ca8c4b855f9468589e537245380451a91a50b14 100644
--- a/paddle/fluid/framework/details/multi_devices_graph_pass.h
+++ b/paddle/fluid/framework/details/multi_devices_graph_pass.h
@@ -54,8 +54,8 @@ class MultiDevSSAGraphBuilder : public ir::Pass {
 
   bool IsScaleLossOp(ir::Node *node) const;
 
-  void CreateRPCOp(ir::Graph *result, ir::Node *node) const;
-  void CreateDistTrainOp(ir::Graph *result, ir::Node *node) const;
+  int CreateRPCOp(ir::Graph *result, ir::Node *node) const;
+  int CreateDistTrainOp(ir::Graph *result, ir::Node *node) const;
 
   /**
    * Is this operator as the end-point operator before/after send operator.
diff --git a/paddle/fluid/framework/details/reduce_op_handle.cc b/paddle/fluid/framework/details/reduce_op_handle.cc
index 6c7e5c1fb06620b1c071b00fcfcc1b4a29bf8d62..7fc06f234d42a992328c0b6164f17945d8075c28 100644
--- a/paddle/fluid/framework/details/reduce_op_handle.cc
+++ b/paddle/fluid/framework/details/reduce_op_handle.cc
@@ -27,7 +27,8 @@ namespace framework {
 namespace details {
 
 void ReduceOpHandle::RunImpl() {
-  platform::RecordEvent r("reduce", nullptr);
+  platform::RecordEvent record_event(Name(), dev_ctxes_.begin()->second);
+
   if (places_.size() == 1) return;
   // the input and output may have dummy var.
  auto in_var_handles = DynamicCast<VarHandle>(inputs_);
diff --git a/paddle/fluid/framework/details/scale_loss_grad_op_handle.cc b/paddle/fluid/framework/details/scale_loss_grad_op_handle.cc
index 609e18581957f62b040e04e937873b7a8fa5785a..ba243979b34aa1f683de707525403becaf0a1c00 100644
--- a/paddle/fluid/framework/details/scale_loss_grad_op_handle.cc
+++ b/paddle/fluid/framework/details/scale_loss_grad_op_handle.cc
@@ -51,7 +51,7 @@ void ScaleLossGradOpHandle::RunImpl() {
                 ->stream();
         memory::Copy(boost::get<platform::CUDAPlace>(place_), tmp,
                      platform::CPUPlace(), &coeff_, sizeof(float), stream);
-        VLOG(1) << place_ << "RUN Scale loss grad op";
+        VLOG(10) << place_ << "RUN Scale loss grad op";
       });
 #endif
 }
diff --git a/paddle/fluid/framework/framework.proto b/paddle/fluid/framework/framework.proto
index c6588435819a982166cf2d2368a82b4402fdc2bc..460401df5473f8650f450a2bd247a703d91b6048 100644
--- a/paddle/fluid/framework/framework.proto
+++ b/paddle/fluid/framework/framework.proto
@@ -16,6 +16,13 @@ syntax = "proto2";
 option optimize_for = LITE_RUNTIME;
 package paddle.framework.proto;
 
+// Any incompatible changes to ProgramDesc and its dependencies should
+// raise the version defined in version.h.
+//
+// Serialization and deserialization code should be modified in a way
+// that supports old versions following the version and compatibility policy.
+message Version { optional int64 version = 1 [ default = 0 ]; }
+
 enum AttrType {
   INT = 0;
   FLOAT = 1;
@@ -180,4 +187,8 @@ message BlockDesc {
 // for more details.
 // TODO(panyx0718): A model can have multiple programs. Need a
 // way to distinguish them. Maybe ID or name?
-message ProgramDesc { repeated BlockDesc blocks = 1; }
+message ProgramDesc {
+  repeated BlockDesc blocks = 1;
+
+  optional Version version = 2;
+}
diff --git a/paddle/fluid/framework/ir/CMakeLists.txt b/paddle/fluid/framework/ir/CMakeLists.txt
index 78387c407398b58d3fab6eab12445c4198f809b5..7004f484a9975124750fad4cb8f773342082b514 100644
--- a/paddle/fluid/framework/ir/CMakeLists.txt
+++ b/paddle/fluid/framework/ir/CMakeLists.txt
@@ -19,7 +19,7 @@ function(pass_library TARGET DEST)
 endfunction()
 
 cc_library(node SRCS node.cc DEPS proto_desc)
-cc_library(graph SRCS graph.cc DEPS node)
+cc_library(graph SRCS graph.cc DEPS node pretty_log)
 cc_library(graph_helper SRCS graph_helper.cc DEPS graph)
 cc_library(pass SRCS pass.cc DEPS graph node graph_helper)
 cc_library(graph_traits SRCS graph_traits.cc DEPS graph)
@@ -28,6 +28,9 @@ cc_library(graph_pattern_detector SRCS graph_pattern_detector.cc DEPS graph grap
 pass_library(graph_to_program_pass base)
 pass_library(graph_viz_pass base)
 pass_library(fc_fuse_pass inference)
+if(WITH_MKLDNN)
+  pass_library(conv_relu_mkldnn_fuse_pass inference)
+endif()
 pass_library(attention_lstm_fuse_pass inference)
 pass_library(infer_clean_graph_pass inference)
 pass_library(fc_lstm_fuse_pass inference)
@@ -42,3 +45,6 @@ cc_test(graph_helper_test SRCS graph_helper_test.cc DEPS graph graph_helper op_r
 cc_test(graph_to_program_pass_test SRCS graph_to_program_pass_test.cc DEPS graph_to_program_pass)
 cc_test(test_graph_pattern_detector SRCS graph_pattern_detector_tester.cc DEPS graph_pattern_detector)
 cc_test(test_fc_fuse_pass SRCS fc_fuse_pass_tester.cc DEPS fc_fuse_pass framework_proto)
+if(WITH_MKLDNN)
+  cc_test(test_conv_relu_mkldnn_fuse_pass SRCS conv_relu_mkldnn_fuse_pass_tester.cc DEPS conv_relu_mkldnn_fuse_pass)
+endif()
diff --git a/paddle/fluid/framework/ir/conv_relu_mkldnn_fuse_pass.cc b/paddle/fluid/framework/ir/conv_relu_mkldnn_fuse_pass.cc
new file mode 100644
index 0000000000000000000000000000000000000000..09c5ec59d66445bdbd5349447b125be89cb2efdf
--- /dev/null
+++ b/paddle/fluid/framework/ir/conv_relu_mkldnn_fuse_pass.cc
@@ -0,0 +1,90 @@
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/fluid/framework/ir/conv_relu_mkldnn_fuse_pass.h"
+#include <string>
+#include <vector>
+#include "paddle/fluid/platform/enforce.h"
+
+namespace paddle {
+namespace framework {
+namespace ir {
+
+std::unique_ptr<ir::Graph> ConvReLUFusePass::ApplyImpl(
+    std::unique_ptr<ir::Graph> graph) const {
+  PADDLE_ENFORCE(graph.get());
+  FusePassBase::Init("conv_relu_mkldnn_fuse", graph.get());
+
+  std::unordered_set<Node*> nodes2delete;
+
+  GraphPatternDetector gpd;
+  auto* conv_input = gpd.mutable_pattern()
+                         ->NewNode("conv_relu_mkldnn_fuse/conv_input")
+                         ->AsInput()
+                         ->assert_is_op_input("conv2d", "Input");
+  patterns::ConvReLU conv_relu_pattern(gpd.mutable_pattern(),
+                                       "conv_relu_mkldnn_fuse");
+  conv_relu_pattern(conv_input);
+
+  int found_conv_relu_count = 0;
+  auto handler = [&](const GraphPatternDetector::subgraph_t& subgraph,
+                     Graph* g) {
+    VLOG(4) << "handle ConvReLU fuse";
+    GET_IR_NODE_FROM_SUBGRAPH(conv_weight, conv_weight,
+                              conv_relu_pattern);                    // Filter
+    GET_IR_NODE_FROM_SUBGRAPH(conv_bias, conv_bias, conv_relu_pattern);  // Bias
+    GET_IR_NODE_FROM_SUBGRAPH(conv_out, conv_out, conv_relu_pattern);    // tmp
+    GET_IR_NODE_FROM_SUBGRAPH(conv, conv, conv_relu_pattern);      // CONV op
+    GET_IR_NODE_FROM_SUBGRAPH(relu_out, relu_out, conv_relu_pattern);    // Out
+    GET_IR_NODE_FROM_SUBGRAPH(relu, relu, conv_relu_pattern);      // ReLU op
+
+    // Create a ConvReLU Node.
+    OpDesc desc;
+    std::string conv_relu_i_in = subgraph.at(conv_input)->Name();
+    std::string conv_relu_w_in = conv_weight->Name();
+    std::string conv_relu_b_in = conv_bias->Name();
+    std::string conv_relu_out = relu_out->Name();
+    desc.SetInput("Input", std::vector<std::string>({conv_relu_i_in}));
+    desc.SetInput("Filter", std::vector<std::string>({conv_relu_w_in}));
+    desc.SetInput("Bias", std::vector<std::string>({conv_relu_b_in}));
+    desc.SetOutput("Output", std::vector<std::string>({conv_relu_out}));
+    desc.SetType("conv2d");
+    for (auto& attr : conv->Op()->GetAttrMap()) {
+      desc.SetAttr(attr.first, attr.second);
+    }
+    desc.SetAttr("fuse_relu", true);
+    auto conv_relu_node = g->CreateOpNode(&desc);  // OpDesc will be copied.
+    GraphSafeRemoveNodes(graph.get(), {conv, relu, conv_out});
+
+    PADDLE_ENFORCE(subgraph.count(conv_input));
+    IR_NODE_LINK_TO(subgraph.at(conv_input), conv_relu_node);
+    IR_NODE_LINK_TO(conv_weight, conv_relu_node);
+    IR_NODE_LINK_TO(conv_bias, conv_relu_node);
+    IR_NODE_LINK_TO(conv_relu_node, relu_out);
+
+    found_conv_relu_count++;
+  };
+
+  gpd(graph.get(), handler);
+
+  AddStatis(found_conv_relu_count);
+  return graph;
+}
+
+}  // namespace ir
+}  // namespace framework
+}  // namespace paddle
+
+REGISTER_PASS(conv_relu_mkldnn_fuse_pass,
+              paddle::framework::ir::ConvReLUFusePass);
diff --git a/paddle/fluid/framework/ir/conv_relu_mkldnn_fuse_pass.h b/paddle/fluid/framework/ir/conv_relu_mkldnn_fuse_pass.h
new file mode 100644
index 0000000000000000000000000000000000000000..b5de0d548713772e7ad41cfb6d8b3e9460683efb
--- /dev/null
+++ b/paddle/fluid/framework/ir/conv_relu_mkldnn_fuse_pass.h
@@ -0,0 +1,39 @@
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include "paddle/fluid/framework/ir/fuse_pass_base.h"
+#include "paddle/fluid/framework/ir/graph.h"
+#include "paddle/fluid/framework/ir/graph_pattern_detector.h"
+#include "paddle/fluid/framework/ir/pass.h"
+
+namespace paddle {
+namespace framework {
+namespace ir {
+
+/*
+ * Fuse the CONV and ReLU to a ConvReLUOp.
+ */
+class ConvReLUFusePass : public FusePassBase {
+ public:
+  virtual ~ConvReLUFusePass() {}
+
+ protected:
+  std::unique_ptr<ir::Graph> ApplyImpl(std::unique_ptr<ir::Graph> graph) const;
+};
+
+}  // namespace ir
+}  // namespace framework
+}  // namespace paddle
diff --git a/paddle/fluid/framework/ir/conv_relu_mkldnn_fuse_pass_tester.cc b/paddle/fluid/framework/ir/conv_relu_mkldnn_fuse_pass_tester.cc
new file mode 100644
index 0000000000000000000000000000000000000000..82b5fa1886098ca3b19c147c307d3f2fc3ba03d6
--- /dev/null
+++ b/paddle/fluid/framework/ir/conv_relu_mkldnn_fuse_pass_tester.cc
@@ -0,0 +1,108 @@
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/fluid/framework/ir/conv_relu_mkldnn_fuse_pass.h"
+
+#include <gtest/gtest.h>
+
+namespace paddle {
+namespace framework {
+namespace ir {
+
+void SetOp(ProgramDesc* prog, const std::string& type,
+           const std::vector<std::string>& inputs,
+           const std::vector<std::string>& outputs) {
+  auto* op = prog->MutableBlock(0)->AppendOp();
+  op->SetType(type);
+  if (type == "conv2d") {
+    op->SetAttr("use_mkldnn", true);
+    op->SetInput("Input", {inputs[0]});
+    op->SetInput("Filter", {inputs[1]});
+    op->SetInput("Bias", {inputs[2]});
+  } else if (type == "relu") {
+    op->SetInput("X", inputs);
+  }
+  op->SetOutput("Out", outputs);
+}
+
+// a->OP0->b
+// b->OP1->c
+// (c, weights, bias)->conv->f
+// (f)->relu->g
+ProgramDesc BuildProgramDesc() {
+  ProgramDesc prog;
+  for (auto& v :
+       std::vector<std::string>({"a", "b", "c", "weights", "bias", "f", "g"})) {
+    auto* var = prog.MutableBlock(0)->Var(v);
+    var->SetType(proto::VarType::SELECTED_ROWS);
+    if (v == "weights" || v == "bias") {
+      var->SetPersistable(true);
+    }
+  }
+
+  SetOp(&prog, "OP0", std::vector<std::string>({"a"}),
+        std::vector<std::string>({"b"}));
+  SetOp(&prog, "OP1", std::vector<std::string>({"b"}),
+        std::vector<std::string>({"c"}));
+  SetOp(&prog, "conv2d", std::vector<std::string>({"c", "weights", "bias"}),
+        std::vector<std::string>({"f"}));
+  SetOp(&prog, "relu", std::vector<std::string>({"f"}),
+        std::vector<std::string>({"g"}));
+
+  return prog;
+}
+
+TEST(ConvReLUFusePass, basic) {
+  auto prog = BuildProgramDesc();
+
+  std::unique_ptr<ir::Graph> graph(new ir::Graph(prog));
+
+  auto pass = PassRegistry::Instance().Get("conv_relu_mkldnn_fuse_pass");
+
+  int original_nodes_num = graph->Nodes().size();
+
+  graph = pass->Apply(std::move(graph));
+
+  int current_nodes_num = graph->Nodes().size();
+
+  // Remove 3 Nodes: CONV, RELU, conv_out
+  // Add 1 Node: ConvReLU
+  EXPECT_EQ(original_nodes_num - 2, current_nodes_num);
+
+  // Assert conv_relu op in newly generated graph
+  int conv_relu_count = 0;
+
+  for (auto* node : graph->Nodes()) {
+    if (node->IsOp() && node->Op()->Type() == "conv2d") {
+      if (node->Op()->HasAttr("use_mkldnn")) {
+        bool use_mkldnn = boost::get<bool>(node->Op()->GetAttr("use_mkldnn"));
+        if (use_mkldnn) {
+          if (node->Op()->HasAttr("fuse_relu")) {
+            bool fuse_relu = boost::get<bool>(node->Op()->GetAttr("fuse_relu"));
+            if (fuse_relu) {
+              ++conv_relu_count;
+            }
+          }
+        }
+      }
+    }
+  }
+  EXPECT_EQ(conv_relu_count, 1);
+}
+
+}  // namespace ir
+}  // namespace framework
+}  // namespace paddle
+
+USE_PASS(conv_relu_mkldnn_fuse_pass);
diff --git a/paddle/fluid/framework/ir/fc_lstm_fuse_pass.cc b/paddle/fluid/framework/ir/fc_lstm_fuse_pass.cc
index f7fda873574a0f8b10251d4fa6b604a9312ad7f9..aa95d3e9f6c8221f6e48d192b73ad5135539dc75 100644
--- a/paddle/fluid/framework/ir/fc_lstm_fuse_pass.cc
+++ b/paddle/fluid/framework/ir/fc_lstm_fuse_pass.cc
@@ -51,7 +51,7 @@ int BuildFusion(Graph* graph, const std::string& name_scope, Scope* scope,
   if (with_fc_bias) {
     // Add FC-bias with LSTM-bias and create a new weight
     PADDLE_ENFORCE(scope);
-    const std::string& new_bias_var = name_scope + "_bias.new";
+    const std::string& new_bias_var = patterns::UniqueKey("NewBias");
     auto* bias_var = scope->Var(new_bias_var);
     PADDLE_ENFORCE(bias_var);
     auto* bias_tensor = bias_var->GetMutable<framework::LoDTensor>();
@@ -120,7 +120,6 @@ int BuildFusion(Graph* graph, const std::string& name_scope, Scope* scope,
 
   auto handler = [&](const GraphPatternDetector::subgraph_t& subgraph,
                      Graph* g) {
-
     GET_IR_NODE_FROM_SUBGRAPH(lstm, lstm, lstm_pattern);
     GET_IR_NODE_FROM_SUBGRAPH(Weight, Weight, lstm_pattern);
     GET_IR_NODE_FROM_SUBGRAPH(Bias, Bias, lstm_pattern);
@@ -136,7 +135,7 @@ int BuildFusion(Graph* graph, const std::string& name_scope, Scope*
scope, fc_bias); // Remove unneeded nodes. std::unordered_set marked_nodes( - {mul, lstm, elementwise_add}); + {mul, lstm, elementwise_add, fc_bias}); GraphSafeRemoveNodes(graph, marked_nodes); } else { GET_IR_NODE_FROM_SUBGRAPH(fc_out, mul_out, fc_pattern); diff --git a/paddle/fluid/framework/ir/graph_pattern_detector.cc b/paddle/fluid/framework/ir/graph_pattern_detector.cc index fc7feca567e7a0f623ada77af189ef033b44fc53..11d5998aafe1f325b94ef1a5ea1c13c72c13f5c9 100644 --- a/paddle/fluid/framework/ir/graph_pattern_detector.cc +++ b/paddle/fluid/framework/ir/graph_pattern_detector.cc @@ -21,12 +21,17 @@ #include "paddle/fluid/framework/ir/graph_traits.h" #include "paddle/fluid/framework/ir/graph_viz_pass.h" #include "paddle/fluid/platform/enforce.h" +#include "paddle/fluid/string/pretty_log.h" #include "paddle/fluid/string/printf.h" namespace paddle { namespace framework { namespace ir { +using string::PrettyLogEndl; +using string::PrettyLog; +using string::Style; + size_t PDPattern::id_ = 0UL; PDNode* PDPattern::NewNode(const std::string& name) { @@ -83,7 +88,7 @@ void GraphPatternDetector::operator()(Graph* graph, ValidateByNodeRole(&subgraphs); if (subgraphs.empty()) return; - LOG(INFO) << "detect " << subgraphs.size() << " subgraph matches the pattern"; + PrettyLogEndl(Style::detail(), "--- detect %d subgraphs", subgraphs.size()); int id = 0; for (auto& g : subgraphs) { VLOG(3) << "optimizing #" << id++ << " subgraph"; @@ -517,6 +522,39 @@ bool VarLinksFromOp(Node* node, const std::string& op_type) { return false; } +PDNode* patterns::ConvReLU::operator()( + paddle::framework::ir::PDNode* conv_input) { + // Create Operators + conv_input->assert_is_op_input("conv2d", "Input"); + auto* conv_op = pattern->NewNode(conv_repr())->assert_is_op("conv2d"); + auto* relu_op = pattern->NewNode(relu_repr())->assert_is_op("relu"); + // Create variables + // Filter + auto* conv_weight_var = pattern->NewNode(conv_weight_repr()) + ->AsInput() + ->assert_is_persistable_var() + ->assert_is_op_input("conv2d", "Filter"); + // Bias + auto* conv_bias_var = pattern->NewNode(conv_bias_repr()) + ->AsInput() + ->assert_is_persistable_var() + ->assert_is_op_input("conv2d", "Bias"); + // intermediate variable, will be removed in the IR after fuse. + auto* conv_out_var = pattern->NewNode(conv_out_repr()) + ->AsIntermediate() + ->assert_is_only_output_of_op("conv2d") + ->assert_is_op_input("relu"); + // output + auto* relu_out_var = pattern->NewNode(relu_out_repr()) + ->AsOutput() + ->assert_is_op_output("relu"); + + conv_op->LinksFrom({conv_input, conv_weight_var, conv_bias_var}) + .LinksTo({conv_out_var}); + relu_op->LinksFrom({conv_out_var}).LinksTo({relu_out_var}); + return relu_out_var; +} + PDNode* patterns::FC::operator()(paddle::framework::ir::PDNode* x, bool with_bias) { // Create shared nodes. 
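
The pieces above fit together as follows: `patterns::ConvReLU` describes the `conv2d` -> `relu` subgraph, and the registered pass rewrites each match into a single `conv2d` carrying `fuse_relu=true`. A minimal driver sketch, mirroring what the unit test earlier in this diff does (the helper name `FuseConvReLU` is illustrative, not part of the patch):

```cpp
#include <memory>
#include <utility>

#include "paddle/fluid/framework/ir/graph.h"
#include "paddle/fluid/framework/ir/pass.h"
#include "paddle/fluid/framework/program_desc.h"

// Link in the statically registered pass, as the tester does with USE_PASS.
USE_PASS(conv_relu_mkldnn_fuse_pass);

namespace fw = paddle::framework;

// Build an IR graph from a program containing conv2d -> relu chains, apply
// the fuse pass, and return the rewritten graph. Fused conv2d ops carry the
// fuse_relu attribute afterwards.
std::unique_ptr<fw::ir::Graph> FuseConvReLU(const fw::ProgramDesc& prog) {
  std::unique_ptr<fw::ir::Graph> graph(new fw::ir::Graph(prog));
  auto pass =
      fw::ir::PassRegistry::Instance().Get("conv_relu_mkldnn_fuse_pass");
  return pass->Apply(std::move(graph));
}
```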
diff --git a/paddle/fluid/framework/ir/graph_pattern_detector.h b/paddle/fluid/framework/ir/graph_pattern_detector.h index 57482a07b607ba1d9fa06a5f325f60ba58dce307..371384dc56eec91db1f621c0ebb65113e7a5a5cc 100644 --- a/paddle/fluid/framework/ir/graph_pattern_detector.h +++ b/paddle/fluid/framework/ir/graph_pattern_detector.h @@ -360,6 +360,28 @@ struct PatternBase { size_t id_; }; +// CONV with ReLU +// op: conv + relu +// named nodes: +// conv_input, conv_weight, +// conv_bias, conv_out, conv, +// relu_out, relu +struct ConvReLU : public PatternBase { + ConvReLU(PDPattern* pattern, const std::string& name_scope) + : PatternBase(pattern, name_scope, "conv_relu") {} + + PDNode* operator()(PDNode* conv_input); + + // declare operator node's name + PATTERN_DECL_NODE(conv); + PATTERN_DECL_NODE(relu); + // declare variable node's name + PATTERN_DECL_NODE(conv_weight); + PATTERN_DECL_NODE(conv_bias); + PATTERN_DECL_NODE(conv_out); + PATTERN_DECL_NODE(relu_out); +}; + // FC with bias // op: mul + elementwise_add // named nodes: diff --git a/paddle/fluid/framework/lod_tensor.cc b/paddle/fluid/framework/lod_tensor.cc index adeb26e4e78693eb9760ec1e12e4b71ba3115d5b..1e7da9a69c7cbf8c13306656599a759515802b76 100644 --- a/paddle/fluid/framework/lod_tensor.cc +++ b/paddle/fluid/framework/lod_tensor.cc @@ -21,6 +21,7 @@ limitations under the License. */ #include "paddle/fluid/framework/framework.pb.h" #include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/var_type.h" +#include "paddle/fluid/framework/version.h" #include "paddle/fluid/memory/memcpy.h" #include "paddle/fluid/memory/memory.h" @@ -251,8 +252,8 @@ void AppendLoD(LoD *lod, const LoD &lod_length) { void SerializeToStream(std::ostream &os, const LoDTensor &tensor, const platform::DeviceContext &dev_ctx) { { // the 1st field, uint32_t version for LoDTensor - constexpr uint32_t version = 0; - os.write(reinterpret_cast<const char *>(&version), sizeof(version)); + os.write(reinterpret_cast<const char *>(&kCurTensorVersion), + sizeof(kCurTensorVersion)); } { // the 2nd field, LoD information @@ -281,6 +282,8 @@ void DeserializeFromStream(std::istream &is, LoDTensor *tensor, // the 1st field, uint32_t version for LoDTensor uint32_t version; is.read(reinterpret_cast<char *>(&version), sizeof(version)); + PADDLE_ENFORCE(framework::IsTensorVersionSupported(version), + "tensor version %u is not supported.", version); PADDLE_ENFORCE_EQ(version, 0U, "Only version 0 is supported"); } { diff --git a/paddle/fluid/framework/operator.cc b/paddle/fluid/framework/operator.cc index ee119aa362918666fcd1aaaca72f1138112b03c4..5401df20abf401306f7c5a0ffc606e562b70b4df 100644 --- a/paddle/fluid/framework/operator.cc +++ b/paddle/fluid/framework/operator.cc @@ -495,35 +495,35 @@ class RuntimeInferShapeContext : public InferShapeContext { : op_(op), scope_(scope) {} bool HasInput(const std::string& name) const override { - if (!op_.HasInputs(name)) { + // has only one input + const auto& ins = op_.Inputs(); + auto it = ins.find(name); + if (it == ins.end()) { return false; } - auto& ins = Inputs(name); - size_t length = ins.size(); - if (length == 0) { + const auto& in = it->second; + if (in.size() == 0 || in[0] == kEmptyVarName) { return false; } - PADDLE_ENFORCE_EQ(length, 1UL, + PADDLE_ENFORCE_EQ(in.size(), 1UL, "Input %s should not have more than one input", name); - auto ipt = ins[0]; - auto* var = ipt == kEmptyVarName ?
nullptr : scope_.FindVar(ipt); - return var != nullptr; + return scope_.FindVar(in[0]) != nullptr; } bool HasOutput(const std::string& name) const override { - if (!op_.HasOutputs(name)) { + // has only one output + const auto& outs = op_.Outputs(); + auto it = outs.find(name); + if (it == outs.end()) { return false; } - auto& outs = Outputs(name); - size_t length = outs.size(); - if (length == 0) { + const auto& out = it->second; + if (out.size() == 0 || out[0] == kEmptyVarName) { return false; } - PADDLE_ENFORCE_EQ(length, 1UL, - "Output %s should not have more than one inputs", name); - auto ipt = outs[0]; - auto* var = ipt == kEmptyVarName ? nullptr : scope_.FindVar(ipt); - return var != nullptr; + PADDLE_ENFORCE_EQ(out.size(), 1UL, + "Output %s should not have more than one output", name); + return scope_.FindVar(out[0]) != nullptr; } bool HasInputs(const std::string& name) const override { diff --git a/paddle/fluid/framework/parallel_executor.cc b/paddle/fluid/framework/parallel_executor.cc index 81cb24bdda6b87a3d708cf5047dce05d5020a0d5..5b8c75a93de2ddd8f7260d2191c22a5945b3d2d9 100644 --- a/paddle/fluid/framework/parallel_executor.cc +++ b/paddle/fluid/framework/parallel_executor.cc @@ -352,7 +352,10 @@ void ParallelExecutor::FeedAndSplitTensorIntoLocalScopes( ParallelExecutor::~ParallelExecutor() { if (member_->own_local_scope_) { for (size_t i = 1; i < member_->local_scopes_.size(); ++i) { - member_->global_scope_->DeleteScope(member_->local_scopes_[i]); + Scope *local_scope = member_->local_scopes_[i]; + if (member_->global_scope_->HasKid(local_scope)) { + member_->global_scope_->DeleteScope(local_scope); + } } } } diff --git a/paddle/fluid/framework/program_desc.cc b/paddle/fluid/framework/program_desc.cc index a63944eaee6132c1082947fddcad4e0d72e26df1..589905828f7793c614c0fe12259e9ba5ab11ceac 100644 --- a/paddle/fluid/framework/program_desc.cc +++ b/paddle/fluid/framework/program_desc.cc @@ -15,6 +15,7 @@ limitations under the License. */ #include "paddle/fluid/framework/program_desc.h" #include "paddle/fluid/framework/block_desc.h" #include "paddle/fluid/framework/feed_fetch_type.h" +#include "paddle/fluid/framework/version.h" namespace paddle { namespace framework { @@ -38,7 +39,10 @@ proto::ProgramDesc *ProgramDesc::Proto() { return &desc_; } +int64_t ProgramDesc::Version() const { return desc_.version().version(); } + ProgramDesc::ProgramDesc() { + desc_.mutable_version()->set_version(kCurProgramVersion); auto *block = desc_.mutable_blocks()->Add(); block->set_idx(kRootBlockIndex); block->set_parent_idx(kNoneBlockIndex); diff --git a/paddle/fluid/framework/program_desc.h b/paddle/fluid/framework/program_desc.h index a0e81cade18c0ca5eb1b98fee8325ae2d917d1a2..2ec0e9d7a0969d44f88c7407bfb8cd4646530147 100644 --- a/paddle/fluid/framework/program_desc.h +++ b/paddle/fluid/framework/program_desc.h @@ -57,6 +57,8 @@ class ProgramDesc { proto::ProgramDesc *Proto(); + int64_t Version() const; + // The output variable of feed_op is referenced as feed_target. // This function is used to collect the output variable names of all // feed_ops.
diff --git a/paddle/fluid/framework/program_desc_test.cc b/paddle/fluid/framework/program_desc_test.cc index 925ea98dbe62e4da91689f6e56c135e51c24a8a3..7e689a37da8a16bd9b1ac6650b9322d2eb5a2c85 100644 --- a/paddle/fluid/framework/program_desc_test.cc +++ b/paddle/fluid/framework/program_desc_test.cc @@ -87,8 +87,17 @@ TEST(ProgramDesc, copy_ctor) { ASSERT_EQ(op_origin->Inputs(), op_copy->Inputs()); ASSERT_EQ(op_origin->Outputs(), op_copy->Outputs()); - ASSERT_EQ(op_copy->Proto()->SerializeAsString(), - op_origin->Proto()->SerializeAsString()); + ASSERT_EQ(op_origin->Proto()->attrs().size(), + op_copy->Proto()->attrs().size()); + for (auto it = op_origin->Proto()->attrs().begin(); + it != op_origin->Proto()->attrs().end(); ++it) { + for (auto it_2 = op_copy->Proto()->attrs().begin(); + it_2 != op_copy->Proto()->attrs().end(); ++it_2) { + if (it->name() == it_2->name()) { + ASSERT_TRUE(it_2->SerializeAsString() == it->SerializeAsString()); + } + } + } if (op->Type() == "op_with_subblock") { ASSERT_EQ(1, op->GetBlockAttrId("sub_block")); diff --git a/paddle/fluid/framework/rw_lock.h b/paddle/fluid/framework/rw_lock.h index a068d3543d9d2abec203f86362a8be5ba135d04d..da163835e8652ae479121bd67f2eed77332b2740 100644 --- a/paddle/fluid/framework/rw_lock.h +++ b/paddle/fluid/framework/rw_lock.h @@ -56,5 +56,76 @@ struct RWLock { }; #endif +class RWLockGuard { + public: + enum Status { kUnLock, kWRLock, kRDLock }; + + RWLockGuard(RWLock* rw_lock, Status init_status) + : lock_(rw_lock), status_(Status::kUnLock) { + switch (init_status) { + case Status::kRDLock: { + RDLock(); + break; + } + case Status::kWRLock: { + WRLock(); + break; + } + case Status::kUnLock: { + break; + } + } + } + + void WRLock() { + switch (status_) { + case Status::kUnLock: { + lock_->WRLock(); + status_ = Status::kWRLock; + break; + } + case Status::kWRLock: { + break; + } + case Status::kRDLock: { + PADDLE_THROW( + "Please unlock read lock first before invoking write lock."); + break; + } + } + } + + void RDLock() { + switch (status_) { + case Status::kUnLock: { + lock_->RDLock(); + status_ = Status::kRDLock; + break; + } + case Status::kRDLock: { + break; + } + case Status::kWRLock: { + PADDLE_THROW( + "Please unlock write lock first before invoking read lock."); + break; + } + } + } + + void UnLock() { + if (status_ != Status::kUnLock) { + lock_->UNLock(); + status_ = Status::kUnLock; + } + } + + ~RWLockGuard() { UnLock(); } + + private: + RWLock* lock_; + Status status_; +}; + } // namespace framework } // namespace paddle diff --git a/paddle/fluid/framework/scope.cc b/paddle/fluid/framework/scope.cc index 50f374e3703a97f6c1fdb4b14fdeb0b603f9ac86..2be655b89a4caf2bf9874dcab6bc0bdb2856a026 100644 --- a/paddle/fluid/framework/scope.cc +++ b/paddle/fluid/framework/scope.cc @@ -72,6 +72,12 @@ void Scope::DropKids() { kids_.clear(); } +bool Scope::HasKid(const Scope* scope) const { + std::unique_lock<std::mutex> lock(mutex_); + auto it = std::find(this->kids_.begin(), this->kids_.end(), scope); + return it != this->kids_.end(); +} + std::vector<std::string> Scope::LocalVarNames() const { std::unique_lock<std::mutex> lock(mutex_); std::vector<std::string> known_vars; diff --git a/paddle/fluid/framework/scope.h b/paddle/fluid/framework/scope.h index e246241c0abfbc7bdcaf38d073cc58fc36a4f737..b6165a595d537c314a95685e8b1edbc42e387ab7 100644 --- a/paddle/fluid/framework/scope.h +++ b/paddle/fluid/framework/scope.h @@ -71,6 +71,9 @@ class Scope { /// Drop all kid scopes belonging to this scope.
void DropKids(); + /// Find if a scope exists among the kid scopes + bool HasKid(const Scope* scope) const; + // Enumerate all the variables the current scope contains. std::vector<std::string> LocalVarNames() const; diff --git a/paddle/fluid/framework/version.cc b/paddle/fluid/framework/version.cc new file mode 100644 index 0000000000000000000000000000000000000000..81c0392bf3cc7378cec06a9de3ae81f2b221ecec --- /dev/null +++ b/paddle/fluid/framework/version.cc @@ -0,0 +1,36 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/framework/version.h" +#include <algorithm> + +namespace paddle { +namespace framework { +bool IsProgramVersionSupported(int64_t version) { + static int num_supported = + sizeof(kSupportedProgramVersion) / sizeof(kSupportedProgramVersion[0]); + return std::find(kSupportedProgramVersion, + kSupportedProgramVersion + num_supported, + version) != kSupportedProgramVersion + num_supported; +} + +bool IsTensorVersionSupported(uint32_t version) { + static int num_supported = + sizeof(kSupportedTensorVersion) / sizeof(kSupportedTensorVersion[0]); + return std::find(kSupportedTensorVersion, + kSupportedTensorVersion + num_supported, + version) != kSupportedTensorVersion + num_supported; +} +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/version.h b/paddle/fluid/framework/version.h new file mode 100644 index 0000000000000000000000000000000000000000..9945bc58c69df8456ff3d1aa0c777970bdbdbf98 --- /dev/null +++ b/paddle/fluid/framework/version.h @@ -0,0 +1,47 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include <cstdint> + +#pragma once + +namespace paddle { +namespace framework { + +// Note: +// A Program or Tensor that passes IsXXXVersionSupported should +// be supported by the current code. Otherwise, it's a compatibility +// bug. + +// The program version the current code generates. +constexpr int64_t kCurProgramVersion = 0; + +// The program version that was generated by previous or current code +// and is supported by the current code. +constexpr int64_t kSupportedProgramVersion[] = {0}; + +// Due to historical reasons, the tensor version uses uint32_t. +// The tensor version the current code generates. +constexpr uint32_t kCurTensorVersion = 0; + +// The tensor version that was generated by previous or current code +// and is supported by the current code.
+constexpr uint32_t kSupportedTensorVersion[] = {0}; + +bool IsProgramVersionSupported(int64_t version); + +bool IsTensorVersionSupported(uint32_t version); + +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/version_test.cc b/paddle/fluid/framework/version_test.cc new file mode 100644 index 0000000000000000000000000000000000000000..e8c5f256000522af976bbf487741a586f1abc439 --- /dev/null +++ b/paddle/fluid/framework/version_test.cc @@ -0,0 +1,30 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/framework/version.h" +#include "gtest/gtest.h" + +namespace paddle { +namespace framework { +TEST(Version, Basic) { + EXPECT_TRUE(IsProgramVersionSupported(0)); + EXPECT_FALSE(IsProgramVersionSupported(1)); + EXPECT_FALSE(IsProgramVersionSupported(-1)); + + EXPECT_TRUE(IsTensorVersionSupported(0)); + EXPECT_FALSE(IsTensorVersionSupported(1)); + EXPECT_FALSE(IsTensorVersionSupported(-1)); +} +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/inference/CMakeLists.txt b/paddle/fluid/inference/CMakeLists.txt index 2006e3b24f71d0ae32b4e2ae34f1a1e4d3a82f91..efb91bcf75a3cb99a67d5a3251b1d42fc4b04170 100644 --- a/paddle/fluid/inference/CMakeLists.txt +++ b/paddle/fluid/inference/CMakeLists.txt @@ -55,6 +55,7 @@ if(NOT APPLE) endif() if(WITH_TESTING) - # both tests/book and analysis depends the models that generated by python/paddle/fluid/tests/book + # tests/book depends the models that generated by python/paddle/fluid/tests/book add_subdirectory(tests/book) + add_subdirectory(tests/api) endif() diff --git a/paddle/fluid/inference/analysis/CMakeLists.txt b/paddle/fluid/inference/analysis/CMakeLists.txt index 11a7509feb02a806e1e173bfb8bd7764f94d3457..c2a1c6634bd8f8de0796456e91cb3c530d4c6823 100644 --- a/paddle/fluid/inference/analysis/CMakeLists.txt +++ b/paddle/fluid/inference/analysis/CMakeLists.txt @@ -1,6 +1,6 @@ cc_library(ir_pass_manager SRCS ir_pass_manager.cc DEPS graph pass) set(analysis_deps - framework_proto proto_desc ir_pass_manager graph pass paddle_fluid_api executor) + framework_proto proto_desc ir_pass_manager graph pass paddle_fluid_api executor pretty_log) cc_library(analysis SRCS pass_manager.cc node.cc data_flow_graph.cc graph_traits.cc subgraph_splitter.cc analyzer.cc @@ -40,27 +40,7 @@ function (inference_analysis_test TARGET) endif(WITH_TESTING) endfunction(inference_analysis_test) -function (inference_download_and_uncompress install_dir url gz_filename) - message(STATUS "Download inference test stuff ${gz_filename} from ${url}") - execute_process(COMMAND bash -c "mkdir -p ${install_dir}") - execute_process(COMMAND bash -c "cd ${install_dir} && wget -q ${url}") - execute_process(COMMAND bash -c "cd ${install_dir} && tar xzf ${gz_filename}") - message(STATUS "finish downloading ${gz_filename}") -endfunction(inference_download_and_uncompress) - -set(RNN1_MODEL_URL "http://paddle-inference-dist.bj.bcebos.com/rnn1%2Fmodel.tar.gz") 
-set(RNN1_DATA_URL "http://paddle-inference-dist.bj.bcebos.com/rnn1%2Fdata.txt.tar.gz") -set(RNN1_INSTALL_DIR "${THIRD_PARTY_PATH}/inference_demo/rnn1" CACHE PATH "RNN1 model and data root." FORCE) -if (NOT EXISTS ${RNN1_INSTALL_DIR} AND WITH_TESTING) - inference_download_and_uncompress(${RNN1_INSTALL_DIR} ${RNN1_MODEL_URL} "rnn1%2Fmodel.tar.gz") - inference_download_and_uncompress(${RNN1_INSTALL_DIR} ${RNN1_DATA_URL} "rnn1%2Fdata.txt.tar.gz") -endif() - -inference_analysis_test(test_analyzer SRCS analyzer_tester.cc - EXTRA_DEPS paddle_inference_api paddle_fluid_api ir_pass_manager analysis_predictor - ARGS --infer_model=${RNN1_INSTALL_DIR}/model - --infer_data=${RNN1_INSTALL_DIR}/data.txt) - +inference_analysis_test(test_analyzer SRCS analyzer_tester.cc EXTRA_DEPS paddle_inference_api) inference_analysis_test(test_data_flow_graph SRCS data_flow_graph_tester.cc) inference_analysis_test(test_data_flow_graph_to_fluid_pass SRCS data_flow_graph_to_fluid_pass_tester.cc) inference_analysis_test(test_fluid_to_ir_pass SRCS fluid_to_ir_pass_tester.cc) @@ -71,46 +51,3 @@ inference_analysis_test(test_tensorrt_subgraph_pass SRCS tensorrt_subgraph_pass_ inference_analysis_test(test_pass_manager SRCS pass_manager_tester.cc) inference_analysis_test(test_tensorrt_subgraph_node_mark_pass SRCS tensorrt_subgraph_node_mark_pass_tester.cc) inference_analysis_test(test_model_store_pass SRCS model_store_pass_tester.cc) - -set(CHINESE_NER_MODEL_URL "http://paddle-inference-dist.bj.bcebos.com/chinese_ner_model.tar.gz") -set(CHINESE_NER_DATA_URL "http://paddle-inference-dist.bj.bcebos.com/chinese_ner-data.txt.tar.gz") -set(CHINESE_NER_INSTALL_DIR "${THIRD_PARTY_PATH}/inference_demo/chinese_ner" CACHE PATH "Chinese ner model and data root." FORCE) -if (NOT EXISTS ${CHINESE_NER_INSTALL_DIR} AND WITH_TESTING AND WITH_INFERENCE) - inference_download_and_uncompress(${CHINESE_NER_INSTALL_DIR} ${CHINESE_NER_MODEL_URL} "chinese_ner_model.tar.gz") - inference_download_and_uncompress(${CHINESE_NER_INSTALL_DIR} ${CHINESE_NER_DATA_URL} "chinese_ner-data.txt.tar.gz") -endif() - -inference_analysis_test(test_analyzer_ner SRCS analyzer_ner_tester.cc - EXTRA_DEPS paddle_inference_api paddle_fluid_api analysis_predictor - ARGS --infer_model=${CHINESE_NER_INSTALL_DIR}/model - --infer_data=${CHINESE_NER_INSTALL_DIR}/data.txt) - -set(LAC_MODEL_URL "http://paddle-inference-dist.bj.bcebos.com/lac_model.tar.gz") -set(LAC_DATA_URL "http://paddle-inference-dist.bj.bcebos.com/lac_data.txt.tar.gz") -set(LAC_INSTALL_DIR "${THIRD_PARTY_PATH}/inference_demo/lac" CACHE PATH "LAC model and data root." FORCE) -if (NOT EXISTS ${LAC_INSTALL_DIR} AND WITH_TESTING AND WITH_INFERENCE) - inference_download_and_uncompress(${LAC_INSTALL_DIR} ${LAC_MODEL_URL} "lac_model.tar.gz") - inference_download_and_uncompress(${LAC_INSTALL_DIR} ${LAC_DATA_URL} "lac_data.txt.tar.gz") -endif() - -inference_analysis_test(test_analyzer_lac SRCS analyzer_lac_tester.cc - EXTRA_DEPS paddle_inference_api paddle_fluid_api ir_pass_manager analysis_predictor - ARGS --infer_model=${LAC_INSTALL_DIR}/model - --infer_data=${LAC_INSTALL_DIR}/data.txt) - - -set(TEXT_CLASSIFICATION_MODEL_URL "http://paddle-inference-dist.bj.bcebos.com/text-classification-Senta.tar.gz") -set(TEXT_CLASSIFICATION_DATA_URL "http://paddle-inference-dist.bj.bcebos.com/text_classification_data.txt.tar.gz") -set(TEXT_CLASSIFICATION_INSTALL_DIR "${THIRD_PARTY_PATH}/inference_demo/text_classification" CACHE PATH "Text Classification model and data root." 
FORCE) - -if (NOT EXISTS ${TEXT_CLASSIFICATION_INSTALL_DIR} AND WITH_TESTING AND WITH_INFERENCE) - inference_download_and_uncompress(${TEXT_CLASSIFICATION_INSTALL_DIR} ${TEXT_CLASSIFICATION_MODEL_URL} "text-classification-Senta.tar.gz") - inference_download_and_uncompress(${TEXT_CLASSIFICATION_INSTALL_DIR} ${TEXT_CLASSIFICATION_DATA_URL} "text_classification_data.txt.tar.gz") -endif() - -inference_analysis_test(test_text_classification SRCS analyzer_text_classification_tester.cc - EXTRA_DEPS paddle_inference_api paddle_fluid_api analysis_predictor - ARGS --infer_model=${TEXT_CLASSIFICATION_INSTALL_DIR}/text-classification-Senta - --infer_data=${TEXT_CLASSIFICATION_INSTALL_DIR}/data.txt - --topn=1 # Just run top 1 batch. - ) diff --git a/paddle/fluid/inference/analysis/analyzer.h b/paddle/fluid/inference/analysis/analyzer.h index 399afbe64a56393176795ecdd1ac70bfedd5c91a..9bdbefc07cbc4bf7a4714927c84855837610430e 100644 --- a/paddle/fluid/inference/analysis/analyzer.h +++ b/paddle/fluid/inference/analysis/analyzer.h @@ -72,6 +72,9 @@ class Analyzer : public OrderedRegistry { "mul_gru_fuse_pass", // "seq_concat_fc_fuse_pass", // "fc_fuse_pass", // +#ifdef PADDLE_WITH_MKLDNN + "conv_relu_mkldnn_fuse_pass", // +#endif }}; std::unordered_set disabled_ir_passes_; diff --git a/paddle/fluid/inference/analysis/analyzer_tester.cc b/paddle/fluid/inference/analysis/analyzer_tester.cc index cc4b390495c60cecc8ebebb3f17b38d9b5a15956..3b5be7f3ee33c73a9704bafa9f1b736c8a3cd9ea 100644 --- a/paddle/fluid/inference/analysis/analyzer_tester.cc +++ b/paddle/fluid/inference/analysis/analyzer_tester.cc @@ -16,21 +16,9 @@ #include #include -#include // NOLINT -#include "paddle/fluid/framework/ir/fuse_pass_base.h" -#include "paddle/fluid/framework/ir/pass.h" #include "paddle/fluid/inference/analysis/ut_helper.h" -#include "paddle/fluid/inference/api/analysis_predictor.h" -#include "paddle/fluid/inference/api/helper.h" #include "paddle/fluid/inference/api/paddle_inference_api.h" #include "paddle/fluid/inference/api/paddle_inference_pass.h" -#include "paddle/fluid/inference/utils/singleton.h" - -DEFINE_string(infer_model, "", "model path"); -DEFINE_string(infer_data, "", "data path"); -DEFINE_int32(batch_size, 10, "batch size."); -DEFINE_int32(repeat, 1, "Running the inference program repeat times."); -DEFINE_int32(num_threads, 1, "Running the inference program in multi-threads."); namespace paddle { namespace inference { @@ -91,274 +79,8 @@ void TestWord2vecPrediction(const std::string &model_path) { } } -namespace { - -struct DataRecord { - std::vector>> link_step_data_all; - std::vector> week_data_all, minute_data_all; - std::vector lod1, lod2, lod3; - std::vector> rnn_link_data, rnn_week_datas, - rnn_minute_datas; - size_t batch_iter{0}; - size_t batch_size{1}; - DataRecord() = default; - explicit DataRecord(const std::string &path, int batch_size = 1) - : batch_size(batch_size) { - Load(path); - } - DataRecord NextBatch() { - DataRecord data; - size_t batch_end = batch_iter + batch_size; - // NOTE skip the final batch, if no enough data is provided. 
- if (batch_end <= link_step_data_all.size()) { - data.link_step_data_all.assign(link_step_data_all.begin() + batch_iter, - link_step_data_all.begin() + batch_end); - data.week_data_all.assign(week_data_all.begin() + batch_iter, - week_data_all.begin() + batch_end); - data.minute_data_all.assign(minute_data_all.begin() + batch_iter, - minute_data_all.begin() + batch_end); - // Prepare LoDs - data.lod1.push_back(0); - data.lod2.push_back(0); - data.lod3.push_back(0); - CHECK(!data.link_step_data_all.empty()) << "empty"; - CHECK(!data.week_data_all.empty()); - CHECK(!data.minute_data_all.empty()); - CHECK_EQ(data.link_step_data_all.size(), data.week_data_all.size()); - CHECK_EQ(data.minute_data_all.size(), data.link_step_data_all.size()); - for (size_t j = 0; j < data.link_step_data_all.size(); j++) { - for (const auto &d : data.link_step_data_all[j]) { - data.rnn_link_data.push_back(d); - } - data.rnn_week_datas.push_back(data.week_data_all[j]); - data.rnn_minute_datas.push_back(data.minute_data_all[j]); - // calculate lod - data.lod1.push_back(data.lod1.back() + - data.link_step_data_all[j].size()); - data.lod3.push_back(data.lod3.back() + 1); - for (size_t i = 1; i < data.link_step_data_all[j].size() + 1; i++) { - data.lod2.push_back(data.lod2.back() + - data.link_step_data_all[j].size()); - } - } - } - batch_iter += batch_size; - return data; - } - void Load(const std::string &path) { - std::ifstream file(path); - std::string line; - int num_lines = 0; - while (std::getline(file, line)) { - num_lines++; - std::vector data; - split(line, ':', &data); - std::vector> link_step_data; - std::vector link_datas; - split(data[0], '|', &link_datas); - for (auto &step_data : link_datas) { - std::vector tmp; - split_to_float(step_data, ',', &tmp); - link_step_data.push_back(tmp); - } - // load week data - std::vector week_data; - split_to_float(data[2], ',', &week_data); - // load minute data - std::vector minute_data; - split_to_float(data[1], ',', &minute_data); - link_step_data_all.push_back(std::move(link_step_data)); - week_data_all.push_back(std::move(week_data)); - minute_data_all.push_back(std::move(minute_data)); - } - } -}; -void PrepareInputs(std::vector *input_slots, DataRecord *data, - int batch_size) { - PaddleTensor lod_attention_tensor, init_zero_tensor, lod_tensor_tensor, - week_tensor, minute_tensor; - lod_attention_tensor.name = "data_lod_attention"; - init_zero_tensor.name = "cell_init"; - lod_tensor_tensor.name = "data"; - week_tensor.name = "week"; - minute_tensor.name = "minute"; - auto one_batch = data->NextBatch(); - std::vector rnn_link_data_shape( - {static_cast(one_batch.rnn_link_data.size()), - static_cast(one_batch.rnn_link_data.front().size())}); - lod_attention_tensor.shape.assign({1, 2}); - lod_attention_tensor.lod.assign({one_batch.lod1, one_batch.lod2}); - init_zero_tensor.shape.assign({batch_size, 15}); - init_zero_tensor.lod.assign({one_batch.lod3}); - lod_tensor_tensor.shape = rnn_link_data_shape; - lod_tensor_tensor.lod.assign({one_batch.lod1}); - // clang-format off - week_tensor.shape.assign( - {static_cast(one_batch.rnn_week_datas.size()), - static_cast(one_batch.rnn_week_datas.front().size())}); - week_tensor.lod.assign({one_batch.lod3}); - minute_tensor.shape.assign( - {static_cast(one_batch.rnn_minute_datas.size()), - static_cast(one_batch.rnn_minute_datas.front().size())}); - minute_tensor.lod.assign({one_batch.lod3}); - // clang-format on - // assign data - TensorAssignData(&lod_attention_tensor, - std::vector>({{0, 0}})); - std::vector 
tmp_zeros(batch_size * 15, 0.); - TensorAssignData(&init_zero_tensor, {tmp_zeros}); - TensorAssignData(&lod_tensor_tensor, one_batch.rnn_link_data); - TensorAssignData(&week_tensor, one_batch.rnn_week_datas); - TensorAssignData(&minute_tensor, one_batch.rnn_minute_datas); - // Set inputs. - auto init_zero_tensor1 = init_zero_tensor; - init_zero_tensor1.name = "hidden_init"; - input_slots->assign({week_tensor, init_zero_tensor, minute_tensor, - init_zero_tensor1, lod_attention_tensor, - lod_tensor_tensor}); - for (auto &tensor : *input_slots) { - tensor.dtype = PaddleDType::FLOAT32; - } -} - -} // namespace - -void CompareResult(const std::vector &outputs, - const std::vector &base_outputs) { - PADDLE_ENFORCE_GT(outputs.size(), 0); - PADDLE_ENFORCE_EQ(outputs.size(), base_outputs.size()); - for (size_t i = 0; i < outputs.size(); i++) { - auto &out = outputs[i]; - auto &base_out = base_outputs[i]; - size_t size = std::accumulate(out.shape.begin(), out.shape.end(), 1, - [](int a, int b) { return a * b; }); - size_t size1 = std::accumulate(base_out.shape.begin(), base_out.shape.end(), - 1, [](int a, int b) { return a * b; }); - PADDLE_ENFORCE_EQ(size, size1); - PADDLE_ENFORCE_GT(size, 0); - float *data = static_cast(out.data.data()); - float *base_data = static_cast(base_out.data.data()); - for (size_t i = 0; i < size; i++) { - EXPECT_NEAR(data[i], base_data[i], 1e-3); - } - } -} -// Test with a really complicate model. -void TestRNN1Prediction(bool use_analysis, bool activate_ir, int num_threads) { - AnalysisConfig config; - config.prog_file = FLAGS_infer_model + "/__model__"; - config.param_file = FLAGS_infer_model + "/param"; - config.use_gpu = false; - config.device = 0; - config.specify_input_name = true; - config.enable_ir_optim = activate_ir; - PADDLE_ENFORCE(config.ir_mode == - AnalysisConfig::IrPassMode::kExclude); // default - config.ir_passes.clear(); // Do not exclude any pass. - - int batch_size = FLAGS_batch_size; - int num_times = FLAGS_repeat; - - auto base_predictor = - CreatePaddlePredictor(config); - auto predictor = - CreatePaddlePredictor( - config); - std::vector input_slots; - DataRecord data(FLAGS_infer_data, batch_size); - // Prepare inputs. - PrepareInputs(&input_slots, &data, batch_size); - std::vector outputs, base_outputs; - - base_predictor->Run(input_slots, &base_outputs); - - if (num_threads == 1) { - // Prepare inputs. - Timer timer; - timer.tic(); - for (int i = 0; i < num_times; i++) { - predictor->Run(input_slots, &outputs); - } - PrintTime(batch_size, num_times, 1, 0, timer.toc() / num_times); - CompareResult(outputs, base_outputs); - } else { - std::vector threads; - std::vector> predictors; - // TODO(yanchunwei): Bug here, the analyzer phase can't be parallelled - // because AttentionLSTM's hard code nodeid will be damanged. - for (int tid = 0; tid < num_threads; ++tid) { - predictors.emplace_back( - CreatePaddlePredictor( - config)); - } - for (int tid = 0; tid < num_threads; ++tid) { - threads.emplace_back([&, tid]() { - // Each thread should have local input_slots and outputs. 
- std::vector input_slots; - DataRecord data(FLAGS_infer_data, batch_size); - PrepareInputs(&input_slots, &data, batch_size); - std::vector outputs; - Timer timer; - timer.tic(); - for (int i = 0; i < num_times; i++) { - predictors[tid]->Run(input_slots, &outputs); - } - PrintTime(batch_size, num_times, num_threads, tid, - timer.toc() / num_times); - CompareResult(outputs, base_outputs); - }); - } - for (int i = 0; i < num_threads; ++i) { - threads[i].join(); - } - } - - if (use_analysis && activate_ir) { - AnalysisPredictor *analysis_predictor = - dynamic_cast(predictor.get()); - auto &fuse_statis = analysis_predictor->analysis_argument() - .Get>( - framework::ir::kFuseStatisAttr); - for (auto &item : fuse_statis) { - LOG(INFO) << "fused " << item.first << " " << item.second; - } - - int num_ops = 0; - for (auto &node : - analysis_predictor->analysis_argument().main_dfg->nodes.nodes()) { - if (node->IsFunction()) { - ++num_ops; - } - } - LOG(INFO) << "has num ops: " << num_ops; - - ASSERT_TRUE(fuse_statis.count("fc_fuse")); - EXPECT_EQ(fuse_statis.at("fc_fuse"), 1); - EXPECT_EQ(fuse_statis.at("fc_nobias_lstm_fuse"), 2); // bi-directional LSTM - EXPECT_EQ(fuse_statis.at("seq_concat_fc_fuse"), 1); - EXPECT_EQ(num_ops, - 13); // After graph optimization, only 13 operators exists. - } -} - -// Inference with analysis and IR, easy for profiling independently. -TEST(Analyzer, rnn1) { TestRNN1Prediction(true, true, FLAGS_num_threads); } - -// Other unit-tests of RNN1, test different options of use_analysis, -// activate_ir and multi-threads. -TEST(Analyzer, RNN_tests) { - int num_threads[2] = {1, 4}; - for (auto i : num_threads) { - // Directly infer with the original model. - TestRNN1Prediction(false, false, i); - // Inference with the original model with the analysis turned on, the - // analysis - // module will transform the program to a data flow graph. - TestRNN1Prediction(true, false, i); - // Inference with analysis and IR. The IR module will fuse some large - // kernels. - TestRNN1Prediction(true, true, i); - } +TEST(Analyzer, word2vec_without_analysis) { + TestWord2vecPrediction(FLAGS_inference_model_dir); } } // namespace analysis diff --git a/paddle/fluid/inference/analysis/analyzer_text_classification_tester.cc b/paddle/fluid/inference/analysis/analyzer_text_classification_tester.cc deleted file mode 100644 index 65169f8cfcc5bf1e989609666f6e0ba03e42e5ba..0000000000000000000000000000000000000000 --- a/paddle/fluid/inference/analysis/analyzer_text_classification_tester.cc +++ /dev/null @@ -1,118 +0,0 @@ -// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "paddle/fluid/inference/analysis/analyzer.h" -#include -#include // use glog instead of PADDLE_ENFORCE to avoid importing other paddle header files. 
-#include -#include -#include "paddle/fluid/framework/ir/pass.h" -#include "paddle/fluid/inference/analysis/ut_helper.h" -#include "paddle/fluid/inference/api/helper.h" -#include "paddle/fluid/inference/api/paddle_inference_api.h" -#include "paddle/fluid/inference/api/paddle_inference_pass.h" -#include "paddle/fluid/inference/api/timer.h" - -DEFINE_string(infer_model, "", "Directory of the inference model."); -DEFINE_string(infer_data, "", "Path of the dataset."); -DEFINE_int32(batch_size, 1, "batch size."); -DEFINE_int32(repeat, 1, "How many times to repeat run."); -DEFINE_int32(topn, -1, "Run top n batches of data to save time"); - -namespace paddle { -namespace inference { - -struct DataReader { - explicit DataReader(const std::string &path) - : file(new std::ifstream(path)) {} - - bool NextBatch(PaddleTensor *tensor, int batch_size) { - PADDLE_ENFORCE_EQ(batch_size, 1); - std::string line; - tensor->lod.clear(); - tensor->lod.emplace_back(std::vector({0})); - std::vector data; - - for (int i = 0; i < batch_size; i++) { - if (!std::getline(*file, line)) return false; - inference::split_to_int64(line, ' ', &data); - } - tensor->lod.front().push_back(data.size()); - - tensor->data.Resize(data.size() * sizeof(int64_t)); - memcpy(tensor->data.data(), data.data(), data.size() * sizeof(int64_t)); - tensor->shape.clear(); - tensor->shape.push_back(data.size()); - tensor->shape.push_back(1); - return true; - } - - std::unique_ptr file; -}; - -void Main(int batch_size) { - // shape -- - // Create Predictor -- - AnalysisConfig config; - config.model_dir = FLAGS_infer_model; - config.use_gpu = false; - config.enable_ir_optim = true; - auto predictor = - CreatePaddlePredictor( - config); - - std::vector input_slots(1); - // one batch starts - // data -- - auto &input = input_slots[0]; - input.dtype = PaddleDType::INT64; - - inference::Timer timer; - double sum = 0; - std::vector output_slots; - - int num_batches = 0; - for (int t = 0; t < FLAGS_repeat; t++) { - DataReader reader(FLAGS_infer_data); - while (reader.NextBatch(&input, FLAGS_batch_size)) { - if (FLAGS_topn > 0 && num_batches > FLAGS_topn) break; - timer.tic(); - CHECK(predictor->Run(input_slots, &output_slots)); - sum += timer.toc(); - ++num_batches; - } - } - PrintTime(batch_size, FLAGS_repeat, 1, 0, sum / FLAGS_repeat); - - // Get output - LOG(INFO) << "get outputs " << output_slots.size(); - - for (auto &output : output_slots) { - LOG(INFO) << "output.shape: " << to_string(output.shape); - // no lod ? 
- CHECK_EQ(output.lod.size(), 0UL); - LOG(INFO) << "output.dtype: " << output.dtype; - std::stringstream ss; - for (int i = 0; i < 5; i++) { - ss << static_cast(output.data.data())[i] << " "; - } - LOG(INFO) << "output.data summary: " << ss.str(); - // one batch ends - } -} - -TEST(text_classification, basic) { Main(FLAGS_batch_size); } - -} // namespace inference -} // namespace paddle diff --git a/paddle/fluid/inference/analysis/data_flow_graph.cc b/paddle/fluid/inference/analysis/data_flow_graph.cc index 100a7504b8526b3587858dd7783913757ba09895..8c7d58678fd29cb25d13d64a08e6c6f26f242d8b 100644 --- a/paddle/fluid/inference/analysis/data_flow_graph.cc +++ b/paddle/fluid/inference/analysis/data_flow_graph.cc @@ -440,6 +440,7 @@ ExtractInputAndOutputOfSubGraph(std::vector &graph) { // NOLINT } return false; }; + for (auto &node : graph) { for (auto *in : node->inlinks) { // The Value that is written by nodes inside a sub-graph shouldn't be the @@ -459,6 +460,7 @@ ExtractInputAndOutputOfSubGraph(std::vector &graph) { // NOLINT std::vector(outputs.begin(), outputs.end())); } +// Filter the Intermediate results of the subgraph node. void FilterRedundantOutputOfSubGraph(DataFlowGraph *graph) { std::vector op_nodes; for (auto &node : GraphTraits(*graph).nodes_in_TS()) { @@ -480,9 +482,11 @@ void FilterRedundantOutputOfSubGraph(DataFlowGraph *graph) { for (auto *out : op_nodes[i]->outlinks) { if (follow_up_input_names.count(out->name())) { filtered_subgraph_outlinks.push_back(out); + } else { + out->SetDeleted(); } } - PADDLE_ENFORCE_GE(filtered_subgraph_outlinks.size(), 1UL); + // The filtered_subgraph_outlinks may be empty. op_nodes[i]->outlinks = filtered_subgraph_outlinks; } } diff --git a/paddle/fluid/inference/analysis/data_flow_graph_to_fluid_pass.cc b/paddle/fluid/inference/analysis/data_flow_graph_to_fluid_pass.cc index 8579845d51e80d73d220465d25b70944f5ad9bf2..5652940ec6d4cc7ba9a1d3a3e65f7dca1690d8c4 100644 --- a/paddle/fluid/inference/analysis/data_flow_graph_to_fluid_pass.cc +++ b/paddle/fluid/inference/analysis/data_flow_graph_to_fluid_pass.cc @@ -106,20 +106,23 @@ void CreateTrtEngineOp(Node *node, const DataFlowGraph &graph, // collect inputs std::unordered_set input_names; + std::unordered_set input_names_with_id; for (auto *x : func->inlinks) { input_names.insert(x->name()); + input_names_with_id.insert(x->name() + std::to_string(x->id())); } desc.SetInput( "Xs", std::vector(input_names.begin(), input_names.end())); std::unordered_set output_names; + std::unordered_set output_names_with_id; for (auto *x : func->outlinks) { output_names.insert(x->name()); + output_names_with_id.insert(x->name() + std::to_string(x->id())); } - std::vector output_temp(output_names.begin(), - output_names.end()); - desc.SetOutput("Ys", output_temp); + desc.SetOutput( + "Ys", std::vector(output_names.begin(), output_names.end())); desc.SetType("tensorrt_engine"); std::unordered_map output_name_map; @@ -153,11 +156,12 @@ void CreateTrtEngineOp(Node *node, const DataFlowGraph &graph, std::vector replaced_names; for (int k = 0; k < in_var->arguments_size(); k++) { std::string arg_value = in_var->arguments(k); - if (input_names.count(arg_value)) { + std::string arg_value_with_id = + arg_value + std::to_string(var2id[arg_value]); + if (input_names_with_id.count(arg_value_with_id)) { replaced_names.push_back(arg_value); } else { - replaced_names.push_back(arg_value + - std::to_string(var2id[arg_value])); + replaced_names.push_back(arg_value_with_id); } } in_var->clear_arguments(); @@ -176,11 +180,12 @@ void 
CreateTrtEngineOp(Node *node, const DataFlowGraph &graph, std::vector replaced_names; for (int k = 0; k < out_var->arguments_size(); k++) { std::string arg_value = out_var->arguments(k); - if (output_names.count(arg_value)) { - output_name_map[arg_value] = - arg_value + std::to_string(var2id[arg_value]); + std::string arg_value_with_id = + arg_value + std::to_string(var2id[arg_value]); + if (output_names_with_id.count(arg_value_with_id)) { + output_name_map[arg_value] = arg_value_with_id; } - replaced_names.push_back(arg_value + std::to_string(var2id[arg_value])); + replaced_names.push_back(arg_value_with_id); } out_var->clear_arguments(); for (size_t k = 0; k < replaced_names.size(); k++) { diff --git a/paddle/fluid/inference/analysis/ir_pass_manager.cc b/paddle/fluid/inference/analysis/ir_pass_manager.cc index ea0f2241d7dbab8f79ec9349effbe96112748e34..e76708baf4b39afb0febbcf3ff71281dfbfc8627 100644 --- a/paddle/fluid/inference/analysis/ir_pass_manager.cc +++ b/paddle/fluid/inference/analysis/ir_pass_manager.cc @@ -14,13 +14,18 @@ #include "paddle/fluid/inference/analysis/ir_pass_manager.h" #include +#include #include "paddle/fluid/framework/ir/fuse_pass_base.h" #include "paddle/fluid/framework/ir/graph.h" #include "paddle/fluid/framework/scope.h" +#include "paddle/fluid/string/pretty_log.h" namespace paddle { namespace inference { namespace analysis { +using string::PrettyLogEndl; +using string::PrettyLog; +using string::Style; IRPassManager::IRPassManager(const ProgramDesc &program, framework::Scope *scope) @@ -33,13 +38,16 @@ IRPassManager::IRPassManager(const ProgramDesc &program, void IRPassManager::Apply(const std::vector &passes) { // Apply all the passes std::string pre_pass; + int pass_num = 0; for (const std::string &pass_name : passes) { - LOG(WARNING) << "Running IR pass [" << pass_name << "]"; + PrettyLogEndl(Style::H2(), "--- Running IR pass [%s]", pass_name); auto pass = framework::ir::PassRegistry::Instance().Get(pass_name); if (pass_name == "graph_viz_pass") { - std::string dot_file_path = - "ir_" + (pre_pass.empty() ? "origin" : pre_pass) + ".dot"; + std::string dot_file_path = std::to_string(pass_num) + "_ir_" + + (pre_pass.empty() ? "origin" : pre_pass) + + ".dot"; pass->Set("graph_viz_path", new std::string(std::move(dot_file_path))); + pass_num++; } graph_ = pass->Apply(std::move(graph_)); pre_pass = pass_name; diff --git a/paddle/fluid/inference/analysis/pass_manager.cc b/paddle/fluid/inference/analysis/pass_manager.cc index 759b2b96a1944c060ac98b6865b58ba2f6369607..a6ac0ee49f8f408faa7a17bf5ef5d2799a9a6238 100644 --- a/paddle/fluid/inference/analysis/pass_manager.cc +++ b/paddle/fluid/inference/analysis/pass_manager.cc @@ -14,6 +14,7 @@ limitations under the License. 
*/ #include "paddle/fluid/inference/analysis/pass_manager.h" #include "paddle/fluid/inference/analysis/fluid_to_data_flow_graph_pass.h" +#include "paddle/fluid/string/pretty_log.h" namespace paddle { namespace inference { @@ -22,7 +23,7 @@ namespace analysis { bool PassManager::Initialize(Argument* argument) { argument_ = argument; for (auto& pass : data_) { - LOG(WARNING) << "Initializing pass [" << pass->repr() << "]"; + VLOG(3) << "Initializing pass [" << pass->repr() << "]"; if (!pass->Initialize(argument)) { LOG(ERROR) << "Failed to initialize pass [" << pass->repr() << "]"; return false; @@ -33,9 +34,10 @@ bool PassManager::Initialize(Argument* argument) { void DfgPassManager::RunAll() { PADDLE_ENFORCE(argument_); - LOG(INFO) << "Total " << data_.size() << " Analysys passes"; + VLOG(3) << "Total " << data_.size() << " Analysis passes"; for (auto& pass : data_) { - LOG(WARNING) << "Running Analysis pass [" << pass->repr() << "]"; + string::PrettyLogEndl(string::Style::H1(), "* Running Analysis pass [%s]", + pass->repr()); pass->Run(argument_->main_dfg.get()); } } diff --git a/paddle/fluid/inference/analysis/subgraph_splitter.cc b/paddle/fluid/inference/analysis/subgraph_splitter.cc index 670a8de667494c655bed15aa3e4ce8265448635a..b879067d2f2f6294c50e0adb21f9399a7c36698a 100644 --- a/paddle/fluid/inference/analysis/subgraph_splitter.cc +++ b/paddle/fluid/inference/analysis/subgraph_splitter.cc @@ -74,13 +74,141 @@ void UnionFindCombine(const node_map_t &node_map, size_t a, size_t b) { node_map.at(b)->attr(kUnionFindParent).Int32() = a_ancestor; } +// This is a simple representation of a graph. +// A BriefNode holds a pointer to the original Node; +// this avoids changing the original graph +// in the process of the trt graph analysis. +struct BriefNode { + explicit BriefNode(Node *n) { node = n; } + Node *node; + std::vector<BriefNode *> inlinks; + std::vector<BriefNode *> outlinks; +}; + +// Union two adjacent BriefNodes. +// Suppose we have two adjacent nodes src and dst. +// We will perform the following operations: +// 1. add all inputs (except src) of dst to src's inlinks. +// 2. add all outputs of dst to src's outlinks. +// 3. redirect the inlinks and outlinks of all of dst's +// inputs and outputs to the src node. +// 4. delete all of dst's inlinks and outlinks. +void UnionContractedNodes(const std::unordered_map<int, BriefNode *> &node_map, + int src_id, int dst_id) { + // merge the two adjacent nodes into one node. + BriefNode *src_node = node_map.at(src_id); + BriefNode *dst_node = node_map.at(dst_id); + + std::unordered_set<BriefNode *> inputs(src_node->inlinks.begin(), + src_node->inlinks.end()); + std::unordered_set<BriefNode *> outputs; + + for (auto *n : src_node->outlinks) { + if (n != dst_node) outputs.insert(n); + } + + // Add the inlinks and outlinks of dst node to src node. + std::vector<BriefNode *> dst_in_nodes = dst_node->inlinks; + for (BriefNode *node : dst_in_nodes) { + if (node != src_node) { + inputs.insert(node); + } + } + + std::vector<BriefNode *> dst_out_nodes = dst_node->outlinks; + for (BriefNode *node : dst_out_nodes) { + outputs.insert(node); + } + +// update the dst and src node's inlinks and outlinks.
+#ifdef __clang__ + src_node->inlinks = std::vector<BriefNode *>(inputs.begin(), inputs.end()); + src_node->outlinks = std::vector<BriefNode *>(outputs.begin(), outputs.end()); + dst_node->inlinks.clear(); + dst_node->outlinks.clear(); +#else + src_node->inlinks = + std::move(std::vector<BriefNode *>(inputs.begin(), inputs.end())); + src_node->outlinks = + std::move(std::vector<BriefNode *>(outputs.begin(), outputs.end())); + dst_node->inlinks.clear(); + dst_node->outlinks.clear(); +#endif + + auto inlink_or_outlink_cleaner = [&](std::vector<BriefNode *> &nodes) { + for (auto *&n : nodes) { + if (n == src_node || n == dst_node) { + n = src_node; + } + } + }; + // Redirect the inlinks and outlinks of dst's inputs and outputs + // to the src node. + for (auto *node : src_node->inlinks) { + inlink_or_outlink_cleaner(node->outlinks); + } + + for (auto *node : src_node->outlinks) { + inlink_or_outlink_cleaner(node->inlinks); + } +} + +// FlexibleDFS +// If reverse is true, do a reverse DFS. +// If the enter func is not nullptr, calls enter(node) before visiting any +// children of node. +// If the leave func is not nullptr, calls leave(node) after all of node's +// children have been visited. +void FlexibleDFS(const std::vector<BriefNode *> &source, bool reverse, + const std::function<bool(const BriefNode *)> &enter, + const std::function<bool(const BriefNode *)> &leave) { + typedef struct { + const BriefNode *node; + bool leave; + } FNode; + + std::vector<FNode> stack; + for (auto &node : source) { + stack.push_back(FNode{node, false}); + } + std::unordered_set<const BriefNode *> visited; + while (!stack.empty()) { + auto fnode = stack.back(); + stack.pop_back(); + + if (fnode.leave) { + if (leave && !leave(fnode.node)) return; + } + if (visited.count(fnode.node)) continue; + visited.insert(fnode.node); + + if (enter && !enter(fnode.node)) return; + + if (leave) stack.push_back(FNode{fnode.node, true}); + const std::vector<BriefNode *> iter_nodes = + reverse == true ? fnode.node->inlinks : fnode.node->outlinks; + for (const BriefNode *node : iter_nodes) { + if (!visited.count(node)) { + stack.push_back(FNode{node, false}); + } + } + } +} + std::vector<std::vector<Node *>> SubGraphSplitter::ExtractSubGraphs() { + // Run the Extract algorithm to find all subgraphs. std::vector<Node *> marked_nodes; + // We use brief_node_map to represent the original graph in order to avoid + // changing the original graph. + std::unordered_map<int, BriefNode *> brief_node_map; + for (auto &node : GraphTraits<DataFlowGraph>(*graph_).nodes_in_TS()) { + brief_node_map[node.id()] = new BriefNode(&node); if (node.attr(kMarkerAttrName).Bool()) { marked_nodes.push_back(&node); } } + // Extract sub-graphs from the marked node set using the Union-Find algorithm. node_map_t node_map; // id to ptr for (auto *n : marked_nodes) { @@ -88,11 +216,73 @@ std::vector<std::vector<Node *>> SubGraphSplitter::ExtractSubGraphs() { n->attr(kUnionFindParent).Int32() = n->id(); node_map[n->id()] = n; } - std::unordered_set<int> visited; - for (auto *n : marked_nodes) { - for (auto *out : n->outlinks) { - if (node_map.count(out->id())) { - UnionFindCombine(node_map, n->id(), out->id()); + + // create the brief node map + for (auto &itr : brief_node_map) { + for (Node *node : itr.second->node->inlinks) { + itr.second->inlinks.push_back(brief_node_map[node->id()]); + } + + for (Node *node : itr.second->node->outlinks) { + itr.second->outlinks.push_back(brief_node_map[node->id()]); + } + } + + for (auto &itr : brief_node_map) { + BriefNode *brief_node = itr.second; + + if (!brief_node->node->attr(kMarkerAttrName).Bool()) { + VLOG(4) << brief_node->node->id() << " node is not a trt candidate."; + continue; + } + + // Our algorithm must guarantee that: + // 1. The graph is always a directed acyclic graph (DAG). + // 2.
If there is a path in the subgraph from X to Y (X and Y are both + // nodes in the subgraph), then all paths from X to Y are in the + // subgraph. + // + // In order to achieve the above guarantee, + // for adjacent nodes src -> dst: + // 1. Get all dst input nodes except src. + // 2. Reverse DFS from those input nodes. + // 3. If there is a path from the input nodes to src, + // then the src and dst nodes cannot be fused into one node; + // otherwise it can be done. + + while (true) { + std::unordered_set<BriefNode *> contract_nodes; + for (auto *out : brief_node->outlinks) { + // must be a trt candidate + if (!out->node->attr(kMarkerAttrName).Bool()) continue; + // get all dst input nodes except src. + std::vector<BriefNode *> source_nodes; + for (auto *n : out->inlinks) { + if (n != brief_node) { + source_nodes.push_back(n); + } + } + + // Reverse DFS from the source_nodes. + bool have_excess_path = false; + FlexibleDFS(source_nodes, true, nullptr, + [&have_excess_path, brief_node](const BriefNode *n) { + if (n == brief_node) { + have_excess_path = true; + return false; + } + return true; + }); + if (have_excess_path) continue; + contract_nodes.insert(out); + } + if (contract_nodes.empty()) break; + + for (auto dst_node : contract_nodes) { + UnionFindCombine(node_map, brief_node->node->id(), + dst_node->node->id()); + UnionContractedNodes(brief_node_map, brief_node->node->id(), + dst_node->node->id()); } } } @@ -128,6 +318,7 @@ void SubGraphFuse::ReplaceNodesWithSubGraphs() { auto io = ExtractInputAndOutputOfSubGraph(subgraph); block_node->inlinks = std::move(io.first); block_node->outlinks = std::move(io.second); + for (auto *node : subgraph) { // TODO(Superjomn) need a unified mechanism to treat deleted nodes in each // pass. diff --git a/paddle/fluid/inference/analysis/subgraph_splitter_tester.cc b/paddle/fluid/inference/analysis/subgraph_splitter_tester.cc index 39cc433b40fad17f4f12359d4e907a250a88bd63..531a170512f727d891aa6644ee08a60c25f16876 100644 --- a/paddle/fluid/inference/analysis/subgraph_splitter_tester.cc +++ b/paddle/fluid/inference/analysis/subgraph_splitter_tester.cc @@ -82,7 +82,7 @@ TEST(SubGraphSplitter, Fuse) { // At least one node should be deleted. ASSERT_EQ(dfg.nodes.size(), count0 + 1); // added a new FunctionBlock - ASSERT_EQ(6, count1); + ASSERT_EQ(11, count1); } } // namespace analysis diff --git a/paddle/fluid/inference/api/analysis_predictor.cc b/paddle/fluid/inference/api/analysis_predictor.cc index 2a9a7aed480e76edbac4d5ba6d7bc3b8b2dc5006..684e0ce0e292d852d4601ebd1ccd920382e42c8b 100644 --- a/paddle/fluid/inference/api/analysis_predictor.cc +++ b/paddle/fluid/inference/api/analysis_predictor.cc @@ -77,6 +77,9 @@ bool AnalysisPredictor::Init( OptimizeInferenceProgram(); ctx_ = executor_->Prepare(*inference_program_, 0); + if (config_._use_mkldnn) { + executor_->EnableMKLDNN(*inference_program_); + } VLOG(5) << "to create variables"; PADDLE_ENFORCE(scope_.get()); diff --git a/paddle/fluid/inference/api/api_impl.cc b/paddle/fluid/inference/api/api_impl.cc index bd9b4b1a814f995e3979105f5b9830b95fd8ea7d..2e9e10139fa7008a46c3782960dfd44d3228cc26 100644 --- a/paddle/fluid/inference/api/api_impl.cc +++ b/paddle/fluid/inference/api/api_impl.cc @@ -106,6 +106,9 @@ bool NativePaddlePredictor::Init( } ctx_ = executor_->Prepare(*inference_program_, 0); + if (config_._use_mkldnn) { + executor_->EnableMKLDNN(*inference_program_); + } executor_->CreateVariables(*inference_program_, sub_scope_ ?
sub_scope_ : scope_.get(), 0); @@ -262,7 +265,7 @@ void NativePaddlePredictor::GetFetchOne(const framework::LoDTensor &fetch, if (buffer.empty() || buffer.length() < sizeof(T) * data.size()) { buffer.Resize(sizeof(T) * data.size()); } - std::memcpy(buffer.data(), data.data(), buffer.length()); + std::memcpy(buffer.data(), data.data(), sizeof(T) * data.size()); // copy LoD for (const auto &level : fetch.lod()) { output->lod.emplace_back(level); } diff --git a/paddle/fluid/inference/api/helper.h b/paddle/fluid/inference/api/helper.h index f6893be428feacbba85bab380e22972848eaeb93..8e359a67738c0df180933421b45f15b39fd0e78c 100644 --- a/paddle/fluid/inference/api/helper.h +++ b/paddle/fluid/inference/api/helper.h @@ -123,10 +123,16 @@ std::string DescribeTensor(const PaddleTensor &tensor) { } void PrintTime(int batch_size, int repeat, int num_threads, int tid, - double latency) { + double latency, int epoch = 1) { LOG(INFO) << "====== batch_size: " << batch_size << ", repeat: " << repeat << ", threads: " << num_threads << ", thread id: " << tid << ", latency: " << latency << "ms ======"; + if (epoch > 1) { + int samples = batch_size * epoch; + LOG(INFO) << "====== sample number: " << samples + << ", average latency of each sample: " << latency / samples + << "ms ======"; + } } } // namespace inference diff --git a/paddle/fluid/inference/api/paddle_inference_api.h b/paddle/fluid/inference/api/paddle_inference_api.h index 995da11e4a30eca72a91a53d3293aa8b033b012b..55a07ca705f9fafa9ea223a867300bd14e10c364 100644 --- a/paddle/fluid/inference/api/paddle_inference_api.h +++ b/paddle/fluid/inference/api/paddle_inference_api.h @@ -45,7 +45,7 @@ class PaddleBuf { PaddleBuf(void* data, size_t length) : data_(data), length_(length), memory_owned_{false} {} // Own memory. - PaddleBuf(size_t length) + explicit PaddleBuf(size_t length) : data_(new char[length]), length_(length), memory_owned_(true) {} // Resize to `length` bytes. void Resize(size_t length); @@ -121,6 +121,8 @@ struct NativeConfig : public PaddlePredictor::Config { bool use_gpu{false}; int device{0}; float fraction_of_gpu_memory{-1.f}; // Negative to notify initialization. + // NOTE: Do NOT use this; it exists only for internal testing and will be + // removed later. + bool _use_mkldnn{false}; // Specify the name of each input variable. bool specify_input_name{false}; diff --git a/paddle/fluid/inference/io.cc b/paddle/fluid/inference/io.cc index cef7b2a7e3a29da05628d7540f5545dc9adda27e..e246a06fd079d837ac321197914c9f70b528f2c8 100644 --- a/paddle/fluid/inference/io.cc +++ b/paddle/fluid/inference/io.cc @@ -20,6 +20,7 @@ limitations under the License.
*/ #include "paddle/fluid/framework/block_desc.h" #include "paddle/fluid/framework/feed_fetch_type.h" #include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/version.h" #include "paddle/fluid/platform/cpu_helper.h" #include "paddle/fluid/pybind/pybind.h" @@ -124,6 +125,9 @@ std::unique_ptr Load(framework::Executor* executor, std::unique_ptr main_program( new framework::ProgramDesc(program_desc_str)); + PADDLE_ENFORCE(framework::IsProgramVersionSupported(main_program->Version()), + "model version %ld is not supported.", + main_program->Version()); LoadPersistables(executor, scope, *main_program, dirname, ""); return main_program; @@ -138,6 +142,9 @@ std::unique_ptr Load( std::unique_ptr main_program( new framework::ProgramDesc(program_desc_str)); + PADDLE_ENFORCE(framework::IsProgramVersionSupported(main_program->Version()), + "model version %ld is not supported.", + main_program->Version()); LoadPersistables(executor, scope, *main_program, "", param_filename); return main_program; diff --git a/paddle/fluid/inference/tensorrt/convert/activation_op.cc b/paddle/fluid/inference/tensorrt/convert/activation_op.cc index e1cace9cc1b06f036f52e82b7b86c99a02d50f50..8168cdff1b85fc05d22fbec7fac6ab8892f3a907 100644 --- a/paddle/fluid/inference/tensorrt/convert/activation_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/activation_op.cc @@ -35,6 +35,8 @@ class ReluOpConverter : public OpConverter { engine_, Activation, *const_cast(input_tensor), nvinfer1::ActivationType::kRELU); auto output_name = op_desc.Output("Out")[0]; + layer->setName(("relu (Output: " + output_name + ")").c_str()); + layer->getOutput(0)->setName(output_name.c_str()); engine_->SetITensor(output_name, layer->getOutput(0)); if (test_mode) { // the test framework can not determine which is the // output, so place the declaration inside. diff --git a/paddle/fluid/inference/tensorrt/convert/batch_norm_op.cc b/paddle/fluid/inference/tensorrt/convert/batch_norm_op.cc index 94f8b0ae5606d39a722ffe28501645c9b6fc5d2e..3330af2da6c97ad153dcecd86be4b441eac62b5e 100644 --- a/paddle/fluid/inference/tensorrt/convert/batch_norm_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/batch_norm_op.cc @@ -116,6 +116,8 @@ class BatchNormOpConverter : public OpConverter { scale_weights.get(), power_weights.get()); auto output_name = op_desc.Output("Y").front(); + layer->setName(("batch_norm (Output: " + output_name + ")").c_str()); + layer->getOutput(0)->setName(output_name.c_str()); engine_->weight_map[op_desc.Input("Bias").front()] = std::move(combile_bias_tensor); engine_->weight_map[op_desc.Input("Scale").front()] = diff --git a/paddle/fluid/inference/tensorrt/convert/concat_op.cc b/paddle/fluid/inference/tensorrt/convert/concat_op.cc index bb9627bf957b63993b2c8d23e7ec8122eb004eaf..a11dfa1e8f2dacfad067d025678911200db500fb 100644 --- a/paddle/fluid/inference/tensorrt/convert/concat_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/concat_op.cc @@ -42,6 +42,8 @@ class ConcatOpConverter : public OpConverter { axis = axis - 1; // Remove batch dim layer->setAxis(axis); auto output_name = op_desc.Output("Out")[0]; + layer->setName(("concat (Output: " + output_name + ")").c_str()); + layer->getOutput(0)->setName(output_name.c_str()); engine_->SetITensor(output_name, layer->getOutput(0)); if (test_mode) { // the test framework can not determine which is the // output, so place the declaration inside. 
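One practical consequence of the version gate added to the two Load overloads above: bumping kCurProgramVersion in version.h is not enough by itself, because every released version that should stay loadable must also appear in kSupportedProgramVersion. A sketch of what a hypothetical future version-1 rollout could look like (illustrative values, not part of this diff):

```
// version.h, hypothetical future state: newly saved programs are
// written as version 1, while version-0 models remain loadable.
constexpr int64_t kCurProgramVersion = 1;
constexpr int64_t kSupportedProgramVersion[] = {0, 1};
```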
diff --git a/paddle/fluid/inference/tensorrt/convert/conv2d_op.cc b/paddle/fluid/inference/tensorrt/convert/conv2d_op.cc index 841a95db38ce7cf0cb5961ff04cb569ee2633e6f..0a37d3968c39d2c244bbd82161afddf6330e421d 100644 --- a/paddle/fluid/inference/tensorrt/convert/conv2d_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/conv2d_op.cc @@ -78,8 +78,10 @@ class Conv2dOpConverter : public OpConverter { layer->setNbGroups(groups); auto output_name = op_desc.Output("Output").front(); + layer->setName(("conv2d (Output: " + output_name + ")").c_str()); engine_->weight_map[op_desc.Input("Filter").front()] = std::move(weight_tensor); + layer->getOutput(0)->setName(output_name.c_str()); engine_->SetITensor(output_name, layer->getOutput(0)); if (test_mode) { engine_->DeclareOutput(output_name); diff --git a/paddle/fluid/inference/tensorrt/convert/elementwise_op.cc b/paddle/fluid/inference/tensorrt/convert/elementwise_op.cc index 60a72b4eb5c75b5cd12305f13763a9a1a567213f..0a6ce568f194f03c7259e1ebf28dd6ce4df2d594 100644 --- a/paddle/fluid/inference/tensorrt/convert/elementwise_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/elementwise_op.cc @@ -89,6 +89,8 @@ class ElementwiseWeightOpConverter : public OpConverter { shift_weights.get(), scale_weights.get(), power_weights.get()); auto output_name = op_desc.Output("Out")[0]; + layer->setName(("elementwise_add (Output: " + output_name + ")").c_str()); + layer->getOutput(0)->setName(output_name.c_str()); engine_->weight_map[op_desc.Input("Y").front()] = std::move(weight_tensor); engine_->SetITensor(output_name, layer->getOutput(0)); if (test_mode) { // the test framework can not determine which is the @@ -137,6 +139,8 @@ class ElementwiseTensorOpConverter : public OpConverter { *const_cast(Y), op_pair->second); auto output_name = op_desc.Output("Out")[0]; + layer->setName(("elementwise (Output: " + output_name + ")").c_str()); + layer->getOutput(0)->setName(output_name.c_str()); engine_->SetITensor(output_name, layer->getOutput(0)); if (test_mode) { // the test framework can not determine which is the // output, so place the declaration inside. 
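Each TensorRT converter touched above (and the fc and pool2d converters that follow) repeats the same two calls: name the layer after its op type and output variable, and give the layer's first output tensor the fluid variable's name, so TensorRT logs and profiler output can be mapped back to the fluid graph. A tiny helper capturing that shared pattern might look like this (a sketch only; the converters in this diff inline the two calls rather than sharing a helper):

```
// Hypothetical helper, not part of this diff: tag a TensorRT layer and
// its first output tensor with the fluid output variable name.
void NameLayerAndOutput(nvinfer1::ILayer* layer, const std::string& op_type,
                        const std::string& output_name) {
  layer->setName((op_type + " (Output: " + output_name + ")").c_str());
  layer->getOutput(0)->setName(output_name.c_str());
}
```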
diff --git a/paddle/fluid/inference/tensorrt/convert/fc_op.cc b/paddle/fluid/inference/tensorrt/convert/fc_op.cc index ad98d85aae9cf594922aca00c43718ccfbce2278..7c21ecd95da07b498eed2ab1bbdcc0e8cd184787 100644 --- a/paddle/fluid/inference/tensorrt/convert/fc_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/fc_op.cc @@ -107,6 +107,8 @@ class FcOpConverter : public OpConverter { n_output, tmp_weight.get(), bias.get()); auto output_name = op_desc.Output("Out").front(); + layer->setName(("fc (Output: " + output_name + ")").c_str()); + layer->getOutput(0)->setName(output_name.c_str()); engine_->SetITensor(output_name, layer->getOutput(0)); engine_->weight_map[op_desc.Input("Y").front()] = std::move(tmp); if (test_mode) { diff --git a/paddle/fluid/inference/tensorrt/convert/pool2d_op.cc b/paddle/fluid/inference/tensorrt/convert/pool2d_op.cc index 73f1b28ddf73403862e55d102a259d7b6cf67b1f..f9bb66a6e9f81a10368db7710108c319860e940a 100644 --- a/paddle/fluid/inference/tensorrt/convert/pool2d_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/pool2d_op.cc @@ -72,6 +72,8 @@ class Pool2dOpConverter : public OpConverter { layer->setPadding(nv_paddings); auto output_name = op_desc.Output("Out")[0]; + layer->setName(("pool2d (Output: " + output_name + ")").c_str()); + layer->getOutput(0)->setName(output_name.c_str()); engine_->SetITensor(output_name, layer->getOutput(0)); if (test_mode) { engine_->DeclareOutput(output_name); diff --git a/paddle/fluid/inference/tests/api/CMakeLists.txt b/paddle/fluid/inference/tests/api/CMakeLists.txt new file mode 100644 index 0000000000000000000000000000000000000000..e8c34047abaac1552f140262f5fdb3f11c792bfc --- /dev/null +++ b/paddle/fluid/inference/tests/api/CMakeLists.txt @@ -0,0 +1,73 @@ +set(INFERENCE_URL "http://paddle-inference-dist.bj.bcebos.com") +set(INFERENCE_DEMO_INSTALL_DIR "${THIRD_PARTY_PATH}/inference_demo") +set(INFERENCE_EXTRA_DEPS paddle_inference_api paddle_fluid_api ir_pass_manager analysis_predictor) +function (inference_download_and_uncompress install_dir filename) + message(STATUS "Download inference test stuff from ${INFERENCE_URL}/${filename}") + execute_process(COMMAND bash -c "mkdir -p ${install_dir}") + execute_process(COMMAND bash -c "cd ${install_dir} && wget -q ${INFERENCE_URL}/${filename}") + execute_process(COMMAND bash -c "cd ${install_dir} && tar xzf ${filename}") + message(STATUS "finish downloading ${filename}") +endfunction(inference_download_and_uncompress) + +function(download_model_and_data install_dir model_name data_name) + if (NOT EXISTS ${install_dir} AND WITH_INFERENCE) + inference_download_and_uncompress(${install_dir} ${model_name}) + inference_download_and_uncompress(${install_dir} ${data_name}) + endif() +endfunction() + +# RNN1 +set(RNN1_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/rnn1") +download_model_and_data(${RNN1_INSTALL_DIR} "rnn1%2Fmodel.tar.gz" "rnn1%2Fdata.txt.tar.gz") +inference_analysis_test(test_analyzer_rnn1 SRCS analyzer_rnn1_tester.cc + EXTRA_DEPS ${INFERENCE_EXTRA_DEPS} + ARGS --infer_model=${RNN1_INSTALL_DIR}/model + --infer_data=${RNN1_INSTALL_DIR}/data.txt) + +# RNN2 +set(RNN2_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/rnn2") +download_model_and_data(${RNN2_INSTALL_DIR} "rnn2_model.tar.gz" "rnn2_data.txt.tar.gz") +inference_analysis_test(test_analyzer_rnn2 SRCS analyzer_rnn2_tester.cc + EXTRA_DEPS ${INFERENCE_EXTRA_DEPS} + ARGS --infer_model=${RNN2_INSTALL_DIR}/model + --infer_data=${RNN2_INSTALL_DIR}/data.txt) + +# chinese_ner +set(CHINESE_NER_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/chinese_ner") 
+download_model_and_data(${CHINESE_NER_INSTALL_DIR} "chinese_ner_model.tar.gz" "chinese_ner-data.txt.tar.gz") +inference_analysis_test(test_analyzer_ner SRCS analyzer_ner_tester.cc + EXTRA_DEPS ${INFERENCE_EXTRA_DEPS} + ARGS --infer_model=${CHINESE_NER_INSTALL_DIR}/model + --infer_data=${CHINESE_NER_INSTALL_DIR}/data.txt) + +# lac +set(LAC_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/lac") +download_model_and_data(${LAC_INSTALL_DIR} "lac_model.tar.gz" "lac_data.txt.tar.gz") +inference_analysis_test(test_analyzer_lac SRCS analyzer_lac_tester.cc + EXTRA_DEPS ${INFERENCE_EXTRA_DEPS} + ARGS --infer_model=${LAC_INSTALL_DIR}/model + --infer_data=${LAC_INSTALL_DIR}/data.txt) + +# text_classification +set(TEXT_CLASSIFICATION_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/text_classification") +download_model_and_data(${TEXT_CLASSIFICATION_INSTALL_DIR} "text-classification-Senta.tar.gz" "text_classification_data.txt.tar.gz") +inference_analysis_test(test_analyzer_text_classification SRCS analyzer_text_classification_tester.cc + EXTRA_DEPS ${INFERENCE_EXTRA_DEPS} + ARGS --infer_model=${TEXT_CLASSIFICATION_INSTALL_DIR}/text-classification-Senta + --infer_data=${TEXT_CLASSIFICATION_INSTALL_DIR}/data.txt) + +# ocr +set(OCR_MODEL_URL "http://paddlemodels.cdn.bcebos.com/inference-vis-demos%2Focr.tar.gz") +set(OCR_INSTALL_DIR "${THIRD_PARTY_PATH}/inference_demo/ocr") +if (NOT EXISTS ${OCR_INSTALL_DIR} AND WITH_INFERENCE) + get_filename_component(filename ${OCR_MODEL_URL} NAME) + message(STATUS "Download inference test stuff ${filename} from ${OCR_MODEL_URL}") + execute_process(COMMAND bash -c "mkdir -p ${OCR_INSTALL_DIR}") + execute_process(COMMAND bash -c "cd ${OCR_INSTALL_DIR} && wget -q ${OCR_MODEL_URL}") + execute_process(COMMAND bash -c "cd ${OCR_INSTALL_DIR} && tar xzf ${filename}") + message(STATUS "finish downloading ${filename}") +endif() +inference_analysis_test(test_analyzer_ocr SRCS analyzer_vis_tester.cc + EXTRA_DEPS ${INFERENCE_EXTRA_DEPS} + ARGS --infer_model=${OCR_INSTALL_DIR}/model + --infer_data=${OCR_INSTALL_DIR}/data.txt) diff --git a/paddle/fluid/inference/analysis/analyzer_lac_tester.cc b/paddle/fluid/inference/tests/api/analyzer_lac_tester.cc similarity index 67% rename from paddle/fluid/inference/analysis/analyzer_lac_tester.cc rename to paddle/fluid/inference/tests/api/analyzer_lac_tester.cc index 522d870db8583aac4006e8cdb7909625c3feb34b..bf893e32569f4b50a583ab6f43cb214ec3620e09 100644 --- a/paddle/fluid/inference/analysis/analyzer_lac_tester.cc +++ b/paddle/fluid/inference/tests/api/analyzer_lac_tester.cc @@ -12,21 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. 
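The tester rewrites that follow all drop their per-file DEFINE_* flags in favor of the shared set in tester_helper.h (added at the end of this patch), so every test binary accepts one common command line. For example (the invocation and paths below are hypothetical, chosen to match the CMake install dirs above):

// Shared flags from tester_helper.h: --infer_model, --infer_data,
// --batch_size, --burning, --repeat, --test_all_data, --num_threads, e.g.:
//
//   ./test_analyzer_lac \
//       --infer_model=third_party/inference_demo/lac/model \
//       --infer_data=third_party/inference_demo/lac/data.txt \
//       --repeat=10 --num_threads=4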
-#include "paddle/fluid/inference/analysis/analyzer.h" -#include -#include "paddle/fluid/framework/ir/fuse_pass_base.h" -#include "paddle/fluid/inference/analysis/ut_helper.h" -#include "paddle/fluid/inference/api/analysis_predictor.h" -#include "paddle/fluid/inference/api/helper.h" -#include "paddle/fluid/inference/api/paddle_inference_pass.h" -#include "paddle/fluid/platform/profiler.h" - -DEFINE_string(infer_model, "", "model path for LAC"); -DEFINE_string(infer_data, "", "data file for LAC"); -DEFINE_int32(batch_size, 1, "batch size."); -DEFINE_int32(burning, 0, "Burning before repeat."); -DEFINE_int32(repeat, 1, "Running the inference program repeat times."); -DEFINE_bool(test_all_data, false, "Test the all dataset in data file."); +#include "paddle/fluid/inference/tests/api/tester_helper.h" namespace paddle { namespace inference { @@ -117,34 +103,6 @@ void GetOneBatch(std::vector *input_slots, DataRecord *data, input_slots->assign({input_tensor}); } -void BenchAllData(const std::string &model_path, const std::string &data_file, - const int batch_size, const int repeat) { - NativeConfig config; - config.model_dir = model_path; - config.use_gpu = false; - config.device = 0; - config.specify_input_name = true; - std::vector input_slots, outputs_slots; - DataRecord data(data_file, batch_size); - auto predictor = - CreatePaddlePredictor(config); - GetOneBatch(&input_slots, &data, batch_size); - for (int i = 0; i < FLAGS_burning; i++) { - predictor->Run(input_slots, &outputs_slots); - } - Timer timer; - double sum = 0; - for (int i = 0; i < repeat; i++) { - for (size_t bid = 0; bid < data.batched_datas.size(); ++bid) { - GetOneBatch(&input_slots, &data, batch_size); - timer.tic(); - predictor->Run(input_slots, &outputs_slots); - sum += timer.toc(); - } - } - PrintTime(batch_size, repeat, 1, 0, sum / repeat); -} - const int64_t lac_ref_data[] = {24, 25, 25, 25, 38, 30, 31, 14, 15, 44, 24, 25, 25, 25, 25, 25, 44, 24, 25, 25, 25, 36, 42, 43, 44, 14, 15, 44, 14, 15, 44, 14, 15, 44, 38, 39, @@ -152,48 +110,38 @@ const int64_t lac_ref_data[] = {24, 25, 25, 25, 38, 30, 31, 14, 15, 44, 24, 25, void TestLACPrediction(const std::string &model_path, const std::string &data_file, const int batch_size, - const int repeat, bool test_all_data, - bool use_analysis = false) { - NativeConfig config; - config.model_dir = model_path; - config.use_gpu = false; - config.device = 0; - config.specify_input_name = true; + const int repeat, bool use_analysis = false) { + AnalysisConfig cfg; + cfg.model_dir = model_path; + cfg.use_gpu = false; + cfg.device = 0; + cfg.specify_input_name = true; + cfg.enable_ir_optim = true; + std::vector input_slots, outputs_slots; DataRecord data(data_file, batch_size); GetOneBatch(&input_slots, &data, batch_size); std::unique_ptr predictor; if (use_analysis) { - AnalysisConfig cfg; - cfg.model_dir = model_path; - cfg.use_gpu = false; - cfg.device = 0; - cfg.specify_input_name = true; - cfg.enable_ir_optim = true; predictor = CreatePaddlePredictor(cfg); } else { predictor = - CreatePaddlePredictor(config); + CreatePaddlePredictor(cfg); } for (int i = 0; i < FLAGS_burning; i++) { predictor->Run(input_slots, &outputs_slots); } Timer timer; - if (test_all_data) { - double sum = 0; - LOG(INFO) << "Total number of samples: " << data.datasets.size(); - for (int i = 0; i < repeat; i++) { - for (size_t bid = 0; bid < data.batched_datas.size(); ++bid) { - GetOneBatch(&input_slots, &data, batch_size); - timer.tic(); - predictor->Run(input_slots, &outputs_slots); - sum += timer.toc(); - } + if 
(FLAGS_test_all_data) { + LOG(INFO) << "test all data"; + std::vector> input_slots_all; + for (size_t bid = 0; bid < data.batched_datas.size(); ++bid) { + GetOneBatch(&input_slots, &data, batch_size); + input_slots_all.emplace_back(input_slots); } - PrintTime(batch_size, repeat, 1, 0, sum / repeat); - LOG(INFO) << "Average latency of each sample: " - << sum / repeat / data.datasets.size() << " ms"; + LOG(INFO) << "total number of samples: " << data.datasets.size(); + TestPrediction(cfg, input_slots_all, &outputs_slots, FLAGS_num_threads); return; } timer.tic(); @@ -218,19 +166,10 @@ void TestLACPrediction(const std::string &model_path, if (use_analysis) { // run once for comparion as reference auto ref_predictor = - CreatePaddlePredictor(config); + CreatePaddlePredictor(cfg); std::vector ref_outputs_slots; ref_predictor->Run(input_slots, &ref_outputs_slots); - EXPECT_EQ(ref_outputs_slots.size(), outputs_slots.size()); - auto &ref_out = ref_outputs_slots[0]; - size_t ref_size = - std::accumulate(ref_out.shape.begin(), ref_out.shape.end(), 1, - [](int a, int b) { return a * b; }); - EXPECT_EQ(size, ref_size); - int64_t *pdata_ref = static_cast(ref_out.data.data()); - for (size_t i = 0; i < size; ++i) { - EXPECT_EQ(pdata_ref[i], pdata[i]); - } + CompareResult(ref_outputs_slots, outputs_slots); AnalysisPredictor *analysis_predictor = dynamic_cast(predictor.get()); @@ -259,13 +198,13 @@ void TestLACPrediction(const std::string &model_path, TEST(Analyzer_LAC, native) { LOG(INFO) << "LAC with native"; TestLACPrediction(FLAGS_infer_model, FLAGS_infer_data, FLAGS_batch_size, - FLAGS_repeat, FLAGS_test_all_data); + FLAGS_repeat); } TEST(Analyzer_LAC, analysis) { LOG(INFO) << "LAC with analysis"; TestLACPrediction(FLAGS_infer_model, FLAGS_infer_data, FLAGS_batch_size, - FLAGS_repeat, FLAGS_test_all_data, true); + FLAGS_repeat, true); } } // namespace analysis diff --git a/paddle/fluid/inference/analysis/analyzer_ner_tester.cc b/paddle/fluid/inference/tests/api/analyzer_ner_tester.cc similarity index 73% rename from paddle/fluid/inference/analysis/analyzer_ner_tester.cc rename to paddle/fluid/inference/tests/api/analyzer_ner_tester.cc index 661b047ed7cb70545267e468d8c2c48596a2994c..f8c651e32f7e2ce1d8ced0e6774ffd555d351167 100644 --- a/paddle/fluid/inference/analysis/analyzer_ner_tester.cc +++ b/paddle/fluid/inference/tests/api/analyzer_ner_tester.cc @@ -12,20 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. 
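The LAC changes above and the NER changes below follow one recipe for the --test_all_data path: materialize every batch up front, then hand the whole set to TestPrediction, which owns repetition, timing, and threading. A condensed sketch using the helpers this tester defines:

// Collect all batches, then benchmark them in one call.
std::vector<std::vector<PaddleTensor>> input_slots_all;
for (size_t bid = 0; bid < data.batched_datas.size(); ++bid) {
  GetOneBatch(&input_slots, &data, batch_size);  // advance to batch `bid`
  input_slots_all.emplace_back(input_slots);     // keep a copy of the batch
}
// TestPrediction covers both single- and multi-threaded runs.
TestPrediction(cfg, input_slots_all, &outputs_slots, FLAGS_num_threads);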
-#include "paddle/fluid/inference/analysis/analyzer.h" -#include -#include "paddle/fluid/framework/ir/fuse_pass_base.h" -#include "paddle/fluid/inference/analysis/ut_helper.h" -#include "paddle/fluid/inference/api/analysis_predictor.h" -#include "paddle/fluid/inference/api/helper.h" -#include "paddle/fluid/inference/api/paddle_inference_pass.h" -#include "paddle/fluid/platform/profiler.h" - -DEFINE_string(infer_model, "", "model path"); -DEFINE_string(infer_data, "", "data path"); -DEFINE_int32(batch_size, 10, "batch size."); -DEFINE_int32(repeat, 1, "Running the inference program repeat times."); -DEFINE_bool(test_all_data, false, "Test the all dataset in data file."); +#include "paddle/fluid/inference/tests/api/tester_helper.h" namespace paddle { namespace inference { @@ -113,49 +100,35 @@ const int chinese_ner_result_data[] = {30, 45, 41, 48, 17, 26, 48, 39, 38, 16, 25}; void TestChineseNERPrediction(bool use_analysis) { - NativeConfig config; - config.prog_file = FLAGS_infer_model + "/__model__"; - config.param_file = FLAGS_infer_model + "/param"; - config.use_gpu = false; - config.device = 0; - config.specify_input_name = true; + AnalysisConfig cfg; + cfg.prog_file = FLAGS_infer_model + "/__model__"; + cfg.param_file = FLAGS_infer_model + "/param"; + cfg.use_gpu = false; + cfg.device = 0; + cfg.specify_input_name = true; + cfg.enable_ir_optim = true; std::vector input_slots, outputs; std::unique_ptr predictor; Timer timer; if (use_analysis) { - AnalysisConfig cfg; - cfg.prog_file = FLAGS_infer_model + "/__model__"; - cfg.param_file = FLAGS_infer_model + "/param"; - cfg.use_gpu = false; - cfg.device = 0; - cfg.specify_input_name = true; - cfg.enable_ir_optim = true; predictor = CreatePaddlePredictor(cfg); } else { predictor = - CreatePaddlePredictor(config); + CreatePaddlePredictor(cfg); } if (FLAGS_test_all_data) { LOG(INFO) << "test all data"; - double sum = 0; - size_t num_samples; - for (int i = 0; i < FLAGS_repeat; i++) { - DataRecord data(FLAGS_infer_data, FLAGS_batch_size); - num_samples = data.num_samples; - for (size_t bid = 0; bid < num_samples; ++bid) { - PrepareInputs(&input_slots, &data, FLAGS_batch_size); - timer.tic(); - predictor->Run(input_slots, &outputs); - sum += timer.toc(); - } + DataRecord data(FLAGS_infer_data, FLAGS_batch_size); + std::vector> input_slots_all; + for (size_t bid = 0; bid < data.num_samples / FLAGS_batch_size; ++bid) { + PrepareInputs(&input_slots, &data, FLAGS_batch_size); + input_slots_all.emplace_back(input_slots); } - LOG(INFO) << "total number of samples: " << num_samples; - PrintTime(FLAGS_batch_size, FLAGS_repeat, 1, 0, sum / FLAGS_repeat); - LOG(INFO) << "average latency of each sample: " - << sum / FLAGS_repeat / num_samples; + LOG(INFO) << "total number of samples: " << data.num_samples; + TestPrediction(cfg, input_slots_all, &outputs, FLAGS_num_threads); return; } // Prepare inputs. 
@@ -181,19 +154,10 @@ void TestChineseNERPrediction(bool use_analysis) { if (use_analysis) { // run once for comparison as reference auto ref_predictor = - CreatePaddlePredictor<NativeConfig, PaddleEngineKind::kNative>(config); + CreatePaddlePredictor<NativeConfig, PaddleEngineKind::kNative>(cfg); std::vector<PaddleTensor> ref_outputs_slots; ref_predictor->Run(input_slots, &ref_outputs_slots); - EXPECT_EQ(ref_outputs_slots.size(), outputs.size()); - auto &ref_out = ref_outputs_slots[0]; - size_t ref_size = - std::accumulate(ref_out.shape.begin(), ref_out.shape.end(), 1, - [](int a, int b) { return a * b; }); - EXPECT_EQ(size, ref_size); - int64_t *pdata_ref = static_cast<int64_t *>(ref_out.data.data()); - for (size_t i = 0; i < size; ++i) { - EXPECT_EQ(pdata_ref[i], result[i]); - } + CompareResult(ref_outputs_slots, outputs); AnalysisPredictor *analysis_predictor = dynamic_cast<AnalysisPredictor *>(predictor.get()); diff --git a/paddle/fluid/inference/tests/api/analyzer_rnn1_tester.cc b/paddle/fluid/inference/tests/api/analyzer_rnn1_tester.cc new file mode 100644 index 0000000000000000000000000000000000000000..df96be544eaf51c52aa5592966f499fad91aab82 --- /dev/null +++ b/paddle/fluid/inference/tests/api/analyzer_rnn1_tester.cc @@ -0,0 +1,236 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/inference/tests/api/tester_helper.h" + +namespace paddle { +namespace inference { + +using namespace framework; // NOLINT + +struct DataRecord { + std::vector<std::vector<std::vector<float>>> link_step_data_all; + std::vector<std::vector<float>> week_data_all, minute_data_all; + std::vector<size_t> lod1, lod2, lod3; + std::vector<std::vector<float>> rnn_link_data, rnn_week_datas, + rnn_minute_datas; + size_t batch_iter{0}; + size_t batch_size{1}; + DataRecord() = default; + explicit DataRecord(const std::string &path, int batch_size = 1) + : batch_size(batch_size) { + Load(path); + } + DataRecord NextBatch() { + DataRecord data; + size_t batch_end = batch_iter + batch_size; + // NOTE skip the final batch, if not enough data is provided.
+ if (batch_end <= link_step_data_all.size()) { + data.link_step_data_all.assign(link_step_data_all.begin() + batch_iter, + link_step_data_all.begin() + batch_end); + data.week_data_all.assign(week_data_all.begin() + batch_iter, + week_data_all.begin() + batch_end); + data.minute_data_all.assign(minute_data_all.begin() + batch_iter, + minute_data_all.begin() + batch_end); + // Prepare LoDs + data.lod1.push_back(0); + data.lod2.push_back(0); + data.lod3.push_back(0); + CHECK(!data.link_step_data_all.empty()) << "empty"; + CHECK(!data.week_data_all.empty()); + CHECK(!data.minute_data_all.empty()); + CHECK_EQ(data.link_step_data_all.size(), data.week_data_all.size()); + CHECK_EQ(data.minute_data_all.size(), data.link_step_data_all.size()); + for (size_t j = 0; j < data.link_step_data_all.size(); j++) { + for (const auto &d : data.link_step_data_all[j]) { + data.rnn_link_data.push_back(d); + } + data.rnn_week_datas.push_back(data.week_data_all[j]); + data.rnn_minute_datas.push_back(data.minute_data_all[j]); + // calculate lod + data.lod1.push_back(data.lod1.back() + + data.link_step_data_all[j].size()); + data.lod3.push_back(data.lod3.back() + 1); + for (size_t i = 1; i < data.link_step_data_all[j].size() + 1; i++) { + data.lod2.push_back(data.lod2.back() + + data.link_step_data_all[j].size()); + } + } + } + batch_iter += batch_size; + return data; + } + void Load(const std::string &path) { + std::ifstream file(path); + std::string line; + int num_lines = 0; + while (std::getline(file, line)) { + num_lines++; + std::vector data; + split(line, ':', &data); + std::vector> link_step_data; + std::vector link_datas; + split(data[0], '|', &link_datas); + for (auto &step_data : link_datas) { + std::vector tmp; + split_to_float(step_data, ',', &tmp); + link_step_data.push_back(tmp); + } + // load week data + std::vector week_data; + split_to_float(data[2], ',', &week_data); + // load minute data + std::vector minute_data; + split_to_float(data[1], ',', &minute_data); + link_step_data_all.push_back(std::move(link_step_data)); + week_data_all.push_back(std::move(week_data)); + minute_data_all.push_back(std::move(minute_data)); + } + } +}; +void PrepareInputs(std::vector *input_slots, DataRecord *data, + int batch_size) { + PaddleTensor lod_attention_tensor, init_zero_tensor, lod_tensor_tensor, + week_tensor, minute_tensor; + lod_attention_tensor.name = "data_lod_attention"; + init_zero_tensor.name = "cell_init"; + lod_tensor_tensor.name = "data"; + week_tensor.name = "week"; + minute_tensor.name = "minute"; + auto one_batch = data->NextBatch(); + std::vector rnn_link_data_shape( + {static_cast(one_batch.rnn_link_data.size()), + static_cast(one_batch.rnn_link_data.front().size())}); + lod_attention_tensor.shape.assign({1, 2}); + lod_attention_tensor.lod.assign({one_batch.lod1, one_batch.lod2}); + init_zero_tensor.shape.assign({batch_size, 15}); + init_zero_tensor.lod.assign({one_batch.lod3}); + lod_tensor_tensor.shape = rnn_link_data_shape; + lod_tensor_tensor.lod.assign({one_batch.lod1}); + // clang-format off + week_tensor.shape.assign( + {static_cast(one_batch.rnn_week_datas.size()), + static_cast(one_batch.rnn_week_datas.front().size())}); + week_tensor.lod.assign({one_batch.lod3}); + minute_tensor.shape.assign( + {static_cast(one_batch.rnn_minute_datas.size()), + static_cast(one_batch.rnn_minute_datas.front().size())}); + minute_tensor.lod.assign({one_batch.lod3}); + // clang-format on + // assign data + TensorAssignData(&lod_attention_tensor, + std::vector>({{0, 0}})); + std::vector 
tmp_zeros(batch_size * 15, 0.); + TensorAssignData(&init_zero_tensor, {tmp_zeros}); + TensorAssignData(&lod_tensor_tensor, one_batch.rnn_link_data); + TensorAssignData(&week_tensor, one_batch.rnn_week_datas); + TensorAssignData(&minute_tensor, one_batch.rnn_minute_datas); + // Set inputs. + auto init_zero_tensor1 = init_zero_tensor; + init_zero_tensor1.name = "hidden_init"; + input_slots->assign({week_tensor, init_zero_tensor, minute_tensor, + init_zero_tensor1, lod_attention_tensor, + lod_tensor_tensor}); + for (auto &tensor : *input_slots) { + tensor.dtype = PaddleDType::FLOAT32; + } +} + +// Test with a really complicated model. +void TestRNN1Prediction(bool use_analysis, bool activate_ir, int num_threads) { + AnalysisConfig config; + config.prog_file = FLAGS_infer_model + "/__model__"; + config.param_file = FLAGS_infer_model + "/param"; + config.use_gpu = false; + config.device = 0; + config.specify_input_name = true; + config.enable_ir_optim = activate_ir; + PADDLE_ENFORCE(config.ir_mode == + AnalysisConfig::IrPassMode::kExclude); // default + config.ir_passes.clear(); // Do not exclude any pass. + + int batch_size = FLAGS_batch_size; + + auto base_predictor = + CreatePaddlePredictor<NativeConfig, PaddleEngineKind::kNative>(config); + auto predictor = + CreatePaddlePredictor<AnalysisConfig, PaddleEngineKind::kAnalysis>( + config); + std::vector<PaddleTensor> input_slots; + DataRecord data(FLAGS_infer_data, batch_size); + // Prepare inputs. + PrepareInputs(&input_slots, &data, batch_size); + std::vector<PaddleTensor> outputs, base_outputs; + + base_predictor->Run(input_slots, &base_outputs); + + std::vector<std::vector<PaddleTensor>> input_slots_all; + input_slots_all.emplace_back(input_slots); + if (num_threads == 1) { + TestOneThreadPrediction(config, input_slots_all, &outputs); + CompareResult(outputs, base_outputs); + } else { + // only return the output of the first thread + TestMultiThreadPrediction(config, input_slots_all, &outputs, num_threads); + } + + if (use_analysis && activate_ir) { + AnalysisPredictor *analysis_predictor = + dynamic_cast<AnalysisPredictor *>(predictor.get()); + auto &fuse_statis = analysis_predictor->analysis_argument() + .Get<std::unordered_map<std::string, int>>( + framework::ir::kFuseStatisAttr); + for (auto &item : fuse_statis) { + LOG(INFO) << "fused " << item.first << " " << item.second; + } + + int num_ops = 0; + for (auto &node : + analysis_predictor->analysis_argument().main_dfg->nodes.nodes()) { + if (node->IsFunction()) { + ++num_ops; + } + } + LOG(INFO) << "has num ops: " << num_ops; + + ASSERT_TRUE(fuse_statis.count("fc_fuse")); + EXPECT_EQ(fuse_statis.at("fc_fuse"), 1); + EXPECT_EQ(fuse_statis.at("fc_nobias_lstm_fuse"), 2); // bi-directional LSTM + EXPECT_EQ(fuse_statis.at("seq_concat_fc_fuse"), 1); + EXPECT_EQ(num_ops, + 13); // After graph optimization, only 13 operators exist. + } +} + +// Inference with analysis and IR, easy for profiling independently. +TEST(Analyzer, rnn1) { TestRNN1Prediction(true, true, FLAGS_num_threads); } + +// Other unit-tests of RNN1, test different options of use_analysis, +// activate_ir and multiple threads. +TEST(Analyzer, RNN_tests) { + int num_threads[2] = {1, 4}; + for (auto i : num_threads) { + // Directly infer with the original model. + TestRNN1Prediction(false, false, i); + // Inference with the original model with the analysis turned on; the + // analysis module will transform the program into a data flow graph. + TestRNN1Prediction(true, false, i); + // Inference with analysis and IR. The IR module will fuse some large + // kernels.
+ TestRNN1Prediction(true, true, i); + } +} + +} // namespace inference +} // namespace paddle diff --git a/paddle/fluid/inference/tests/api/analyzer_rnn2_tester.cc b/paddle/fluid/inference/tests/api/analyzer_rnn2_tester.cc new file mode 100644 index 0000000000000000000000000000000000000000..c40ea58eea9c10a85acf84108f1d081a779f526d --- /dev/null +++ b/paddle/fluid/inference/tests/api/analyzer_rnn2_tester.cc @@ -0,0 +1,181 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/inference/analysis/analyzer.h" + +#include +#include +#include // NOLINT +#include "paddle/fluid/framework/ir/fuse_pass_base.h" +#include "paddle/fluid/framework/ir/pass.h" +#include "paddle/fluid/inference/analysis/ut_helper.h" +#include "paddle/fluid/inference/api/analysis_predictor.h" +#include "paddle/fluid/inference/api/helper.h" +#include "paddle/fluid/inference/api/paddle_inference_api.h" +#include "paddle/fluid/inference/api/paddle_inference_pass.h" + +DEFINE_string(infer_model, "", "model path"); +DEFINE_string(infer_data, "", "data path"); +DEFINE_int32(batch_size, 1, "batch size."); +DEFINE_int32(repeat, 1, "Running the inference program repeat times."); +DEFINE_int32(num_threads, 1, "Running the inference program in multi-threads."); + +namespace paddle { +namespace inference { + +using namespace framework; // NOLINT + +struct DataRecord { + std::vector>> link_step_data_all; + std::vector lod; + std::vector> rnn_link_data; + std::vector result_data; + size_t batch_iter{0}; + size_t batch_size{1}; + DataRecord() = default; + explicit DataRecord(const std::string &path, int batch_size = 1) + : batch_size(batch_size) { + Load(path); + } + DataRecord NextBatch() { + DataRecord data; + size_t batch_end = batch_iter + batch_size; + // NOTE skip the final batch, if no enough data is provided. 
+ if (batch_end <= link_step_data_all.size()) { + data.link_step_data_all.assign(link_step_data_all.begin() + batch_iter, + link_step_data_all.begin() + batch_end); + // Prepare LoDs + data.lod.push_back(0); + CHECK(!data.link_step_data_all.empty()) << "empty"; + for (size_t j = 0; j < data.link_step_data_all.size(); j++) { + for (const auto &d : data.link_step_data_all[j]) { + data.rnn_link_data.push_back(d); + // calculate lod + data.lod.push_back(data.lod.back() + 11); + } + } + } + batch_iter += batch_size; + return data; + } + void Load(const std::string &path) { + std::ifstream file(path); + std::string line; + int num_lines = 0; + while (std::getline(file, line)) { + num_lines++; + std::vector data; + split(line, ':', &data); + if (num_lines % 2) { // feature + std::vector feature_data; + split(data[1], ' ', &feature_data); + std::vector> link_step_data; + int feature_count = 1; + std::vector feature; + for (auto &step_data : feature_data) { + std::vector tmp; + split_to_float(step_data, ',', &tmp); + feature.insert(feature.end(), tmp.begin(), tmp.end()); + if (feature_count % 11 == 0) { // each sample has 11 features + link_step_data.push_back(feature); + feature.clear(); + } + feature_count++; + } + link_step_data_all.push_back(std::move(link_step_data)); + } else { // result + std::vector tmp; + split_to_float(data[1], ',', &tmp); + result_data.insert(result_data.end(), tmp.begin(), tmp.end()); + } + } + } +}; +void PrepareInputs(std::vector *input_slots, DataRecord *data, + int batch_size) { + PaddleTensor feed_tensor; + feed_tensor.name = "feed"; + auto one_batch = data->NextBatch(); + int token_size = one_batch.rnn_link_data.size(); + // each token has 11 features, each feature's dim is 54. + std::vector rnn_link_data_shape({token_size * 11, 54}); + feed_tensor.shape = rnn_link_data_shape; + feed_tensor.lod.assign({one_batch.lod}); + feed_tensor.dtype = PaddleDType::FLOAT32; + TensorAssignData(&feed_tensor, one_batch.rnn_link_data); + // Set inputs. + input_slots->assign({feed_tensor}); +} + +void CompareResult(const std::vector &outputs, + const std::vector &base_result) { + PADDLE_ENFORCE_GT(outputs.size(), 0); + for (size_t i = 0; i < outputs.size(); i++) { + auto &out = outputs[i]; + size_t size = std::accumulate(out.shape.begin(), out.shape.end(), 1, + [](int a, int b) { return a * b; }); + PADDLE_ENFORCE_GT(size, 0); + float *data = static_cast(out.data.data()); + for (size_t i = 0; i < size; i++) { + EXPECT_NEAR(data[i], base_result[i], 1e-3); + } + } +} +// Test with a really complicate model. 
+void TestRNN2Prediction() { + AnalysisConfig config; + config.prog_file = FLAGS_infer_model + "/__model__"; + config.param_file = FLAGS_infer_model + "/param"; + config.use_gpu = false; + config.device = 0; + config.specify_input_name = true; + config.enable_ir_optim = true; + PADDLE_ENFORCE(config.ir_mode == + AnalysisConfig::IrPassMode::kExclude); // default + + int batch_size = FLAGS_batch_size; + int num_times = FLAGS_repeat; + + auto base_predictor = + CreatePaddlePredictor(config); + auto predictor = + CreatePaddlePredictor( + config); + std::vector input_slots; + DataRecord data(FLAGS_infer_data, batch_size); + PrepareInputs(&input_slots, &data, batch_size); + std::vector outputs, base_outputs; + + Timer timer1; + timer1.tic(); + for (int i = 0; i < num_times; i++) { + base_predictor->Run(input_slots, &base_outputs); + } + PrintTime(batch_size, num_times, 1, 0, timer1.toc() / num_times); + + Timer timer2; + timer2.tic(); + for (int i = 0; i < num_times; i++) { + predictor->Run(input_slots, &outputs); + } + PrintTime(batch_size, num_times, 1, 0, timer2.toc() / num_times); + + CompareResult(base_outputs, data.result_data); + CompareResult(outputs, data.result_data); +} + +TEST(Analyzer, rnn2) { TestRNN2Prediction(); } + +} // namespace inference +} // namespace paddle diff --git a/paddle/fluid/inference/tests/api/analyzer_text_classification_tester.cc b/paddle/fluid/inference/tests/api/analyzer_text_classification_tester.cc new file mode 100644 index 0000000000000000000000000000000000000000..1472c475e4a3061ffcad96925ea215a41a7e63eb --- /dev/null +++ b/paddle/fluid/inference/tests/api/analyzer_text_classification_tester.cc @@ -0,0 +1,99 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
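TestRNN2Prediction above times the native and analysis engines back to back on identical inputs and checks both against the ground truth stored in the data file, so the benchmark doubles as a correctness test. The timing idiom it uses is the Timer/PrintTime pair from the inference helper API:

// Average latency over num_times repeats of one batch.
Timer timer;
timer.tic();
for (int i = 0; i < num_times; i++) {
  predictor->Run(input_slots, &outputs);
}
PrintTime(batch_size, num_times, /*num_threads=*/1, /*thread id=*/0,
          timer.toc() / num_times);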
+ +#include "paddle/fluid/inference/tests/api/tester_helper.h" + +namespace paddle { +namespace inference { + +struct DataReader { + explicit DataReader(const std::string &path) + : file(new std::ifstream(path)) {} + + bool NextBatch(std::vector *input, int batch_size) { + PADDLE_ENFORCE_EQ(batch_size, 1); + std::string line; + PaddleTensor tensor; + tensor.dtype = PaddleDType::INT64; + tensor.lod.emplace_back(std::vector({0})); + std::vector data; + + for (int i = 0; i < batch_size; i++) { + if (!std::getline(*file, line)) return false; + inference::split_to_int64(line, ' ', &data); + } + tensor.lod.front().push_back(data.size()); + + tensor.data.Resize(data.size() * sizeof(int64_t)); + memcpy(tensor.data.data(), data.data(), data.size() * sizeof(int64_t)); + tensor.shape.push_back(data.size()); + tensor.shape.push_back(1); + input->assign({tensor}); + return true; + } + + std::unique_ptr file; +}; + +void Main(int batch_size) { + // shape -- + // Create Predictor -- + AnalysisConfig config; + config.model_dir = FLAGS_infer_model; + config.use_gpu = false; + config.enable_ir_optim = true; + + std::vector input_slots, output_slots; + DataReader reader(FLAGS_infer_data); + std::vector> input_slots_all; + + if (FLAGS_test_all_data) { + LOG(INFO) << "test all data"; + int num_batches = 0; + while (reader.NextBatch(&input_slots, FLAGS_batch_size)) { + input_slots_all.emplace_back(input_slots); + ++num_batches; + } + LOG(INFO) << "total number of samples: " << num_batches * FLAGS_batch_size; + TestPrediction(config, input_slots_all, &output_slots, FLAGS_num_threads); + return; + } + + // one batch starts + // data -- + reader.NextBatch(&input_slots, FLAGS_batch_size); + input_slots_all.emplace_back(input_slots); + TestPrediction(config, input_slots_all, &output_slots, FLAGS_num_threads); + + // Get output + LOG(INFO) << "get outputs " << output_slots.size(); + + for (auto &output : output_slots) { + LOG(INFO) << "output.shape: " << to_string(output.shape); + // no lod ? + CHECK_EQ(output.lod.size(), 0UL); + LOG(INFO) << "output.dtype: " << output.dtype; + std::stringstream ss; + for (int i = 0; i < 5; i++) { + ss << static_cast(output.data.data())[i] << " "; + } + LOG(INFO) << "output.data summary: " << ss.str(); + // one batch ends + } +} + +TEST(text_classification, basic) { Main(FLAGS_batch_size); } + +} // namespace inference +} // namespace paddle diff --git a/paddle/fluid/inference/tests/api/analyzer_vis_tester.cc b/paddle/fluid/inference/tests/api/analyzer_vis_tester.cc new file mode 100644 index 0000000000000000000000000000000000000000..a207c41b7140c806b4c1fdc7f24a317b165c9aef --- /dev/null +++ b/paddle/fluid/inference/tests/api/analyzer_vis_tester.cc @@ -0,0 +1,133 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include +#include +#include "paddle/fluid/inference/tests/api/tester_helper.h" + +namespace paddle { +namespace inference { +namespace analysis { + +struct Record { + std::vector data; + std::vector shape; +}; + +Record ProcessALine(const std::string &line) { + VLOG(3) << "process a line"; + std::vector columns; + split(line, '\t', &columns); + CHECK_EQ(columns.size(), 2UL) + << "data format error, should be \t"; + + Record record; + std::vector data_strs; + split(columns[0], ' ', &data_strs); + for (auto &d : data_strs) { + record.data.push_back(std::stof(d)); + } + + std::vector shape_strs; + split(columns[1], ' ', &shape_strs); + for (auto &s : shape_strs) { + record.shape.push_back(std::stoi(s)); + } + VLOG(3) << "data size " << record.data.size(); + VLOG(3) << "data shape size " << record.shape.size(); + return record; +} + +/* + * Use the native and analysis fluid engine to inference the demo. + * ocr, mobilenet and se_resnext50 + */ +void TestVisualPrediction(bool use_mkldnn) { + std::unique_ptr predictor; + AnalysisConfig cfg; + cfg.param_file = FLAGS_infer_model + "/__params__"; + cfg.prog_file = FLAGS_infer_model + "/__model__"; + cfg.use_gpu = false; + cfg._use_mkldnn = use_mkldnn; + cfg.device = 0; + cfg.enable_ir_optim = true; + // TODO(TJ): fix fusion gru + cfg.ir_passes.push_back("fc_gru_fuse_pass"); +#ifdef PADDLE_WITH_MKLDNN + // disable mkldnn fuse since it should have some bugs + cfg.ir_passes.push_back("conv_relu_mkldnn_fuse_pass"); +#endif + predictor = + CreatePaddlePredictor(cfg); + + // Only have single batch of data. + std::string line; + std::ifstream file(FLAGS_infer_data); + std::getline(file, line); + auto record = ProcessALine(line); + file.close(); + + // Inference. + PaddleTensor input; + input.shape = record.shape; + input.data = + PaddleBuf(record.data.data(), record.data.size() * sizeof(float)); + input.dtype = PaddleDType::FLOAT32; + + std::vector outputs_slots; + Timer timer; + timer.tic(); + for (int i = 0; i < FLAGS_repeat; i++) { + predictor->Run({input}, &outputs_slots); + } + PrintTime(/*batch size*/ 1, FLAGS_repeat, /*num threads*/ 1, /*thread id*/ 0, + timer.toc() / FLAGS_repeat); + + VLOG(3) << "output.size " << outputs_slots.size(); + + // run native as reference + auto ref_predictor = + CreatePaddlePredictor(cfg); + std::vector ref_outputs_slots; + ref_predictor->Run({input}, &ref_outputs_slots); + CompareResult(outputs_slots, ref_outputs_slots); + // print what are fused + AnalysisPredictor *analysis_predictor = + dynamic_cast(predictor.get()); + auto &fuse_statis = analysis_predictor->analysis_argument() + .Get>( + framework::ir::kFuseStatisAttr); + for (auto &item : fuse_statis) { + LOG(INFO) << "fused " << item.first << " " << item.second; + } + int num_ops = 0; + for (auto &node : + analysis_predictor->analysis_argument().main_dfg->nodes.nodes()) { + if (node->IsFunction()) { + ++num_ops; + } + } + LOG(INFO) << "has num ops: " << num_ops; +} + +TEST(Analyzer_vis, analysis) { TestVisualPrediction(/*use_mkldnn*/ false); } +#ifdef PADDLE_WITH_MKLDNN +TEST(Analyzer_vis, analysis_mkldnn) { + TestVisualPrediction(/*use_mkldnn*/ true); +} +#endif + +} // namespace analysis +} // namespace inference +} // namespace paddle diff --git a/paddle/fluid/inference/tests/api/tester_helper.h b/paddle/fluid/inference/tests/api/tester_helper.h new file mode 100644 index 0000000000000000000000000000000000000000..43e97614e3ad9c14c8deee9f340757f373eb593e --- /dev/null +++ b/paddle/fluid/inference/tests/api/tester_helper.h @@ -0,0 +1,141 @@ +// Copyright 
(c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include <gtest/gtest.h> +#include <thread> // NOLINT +#include <vector> +#include "paddle/fluid/framework/ir/fuse_pass_base.h" +#include "paddle/fluid/inference/analysis/analyzer.h" +#include "paddle/fluid/inference/analysis/ut_helper.h" +#include "paddle/fluid/inference/api/analysis_predictor.h" +#include "paddle/fluid/inference/api/helper.h" +#include "paddle/fluid/inference/api/paddle_inference_pass.h" +#include "paddle/fluid/platform/profiler.h" + +DEFINE_string(infer_model, "", "model path"); +DEFINE_string(infer_data, "", "data file"); +DEFINE_int32(batch_size, 1, "batch size."); +DEFINE_int32(burning, 0, "Number of warm-up runs before timing."); +DEFINE_int32(repeat, 1, "Number of times to repeat the inference run."); +DEFINE_bool(test_all_data, false, "Test all of the data in the data file."); +DEFINE_int32(num_threads, 1, "Run the inference program with this many threads."); + +namespace paddle { +namespace inference { + +void CompareResult(const std::vector<PaddleTensor> &outputs, + const std::vector<PaddleTensor> &ref_outputs) { + EXPECT_GT(outputs.size(), 0); + EXPECT_EQ(outputs.size(), ref_outputs.size()); + for (size_t i = 0; i < outputs.size(); i++) { + auto &out = outputs[i]; + auto &ref_out = ref_outputs[i]; + size_t size = std::accumulate(out.shape.begin(), out.shape.end(), 1, + [](int a, int b) { return a * b; }); + size_t ref_size = + std::accumulate(ref_out.shape.begin(), ref_out.shape.end(), 1, + [](int a, int b) { return a * b; }); + EXPECT_GT(size, 0); + EXPECT_EQ(size, ref_size); + EXPECT_EQ(out.dtype, ref_out.dtype); + switch (out.dtype) { + case PaddleDType::INT64: { + int64_t *pdata = static_cast<int64_t *>(out.data.data()); + int64_t *pdata_ref = static_cast<int64_t *>(ref_out.data.data()); + for (size_t j = 0; j < size; ++j) { + EXPECT_EQ(pdata_ref[j], pdata[j]); + } + break; + } + case PaddleDType::FLOAT32: { + float *pdata = static_cast<float *>(out.data.data()); + float *pdata_ref = static_cast<float *>(ref_out.data.data()); + for (size_t j = 0; j < size; ++j) { + EXPECT_NEAR(pdata_ref[j], pdata[j], 1e-3); + } + break; + } + } + } +} + +void TestOneThreadPrediction( + AnalysisConfig config, const std::vector<std::vector<PaddleTensor>> inputs, + std::vector<PaddleTensor> *outputs) { + int batch_size = FLAGS_batch_size; + int num_times = FLAGS_repeat; + auto predictor = + CreatePaddlePredictor<AnalysisConfig, PaddleEngineKind::kAnalysis>( + config); + Timer timer; + timer.tic(); + for (int i = 0; i < num_times; i++) { + for (size_t j = 0; j < inputs.size(); j++) { + predictor->Run(inputs[j], outputs); + } + } + PrintTime(batch_size, num_times, 1, 0, timer.toc() / num_times, + inputs.size()); +} + +void TestMultiThreadPrediction( + AnalysisConfig config, const std::vector<std::vector<PaddleTensor>> inputs, + std::vector<PaddleTensor> *outputs, int num_threads) { + int batch_size = FLAGS_batch_size; + int num_times = FLAGS_repeat; + std::vector<std::thread> threads; + std::vector<std::unique_ptr<PaddlePredictor>> predictors; + // TODO(yanchunwei): Bug here: the analyzer phase can't be parallelized + // because AttentionLSTM's hard-coded node ids would be damaged.
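+ // Creating every predictor here, serially on the main thread, sidesteps + // that bug: the worker threads below only ever call Run().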
+ for (int tid = 0; tid < num_threads; ++tid) { + predictors.emplace_back( + CreatePaddlePredictor( + config)); + } + for (int tid = 0; tid < num_threads; ++tid) { + threads.emplace_back([&, tid]() { + // Each thread should have local inputs and outputs. + // The inputs of each thread are all the same. + std::vector> inputs_tid = inputs; + std::vector outputs_tid; + Timer timer; + timer.tic(); + for (int i = 0; i < num_times; i++) { + for (size_t j = 0; j < inputs_tid.size(); j++) { + predictors[tid]->Run(inputs_tid[j], &outputs_tid); + } + } + PrintTime(batch_size, num_times, num_threads, tid, + timer.toc() / num_times, inputs_tid.size()); + }); + } + for (int i = 0; i < num_threads; ++i) { + threads[i].join(); + } +} + +void TestPrediction(AnalysisConfig config, + const std::vector> inputs, + std::vector *outputs, int num_threads) { + if (num_threads == 1) { + TestOneThreadPrediction(config, inputs, outputs); + } else { + TestMultiThreadPrediction(config, inputs, outputs, num_threads); + } +} + +} // namespace inference +} // namespace paddle diff --git a/paddle/fluid/operators/attention_lstm_op.cc b/paddle/fluid/operators/attention_lstm_op.cc index 39b0c856996c11c6efdb530f1396afd5731c778d..9b943440a869e213db4ed761cfe7c508bc5e94ae 100644 --- a/paddle/fluid/operators/attention_lstm_op.cc +++ b/paddle/fluid/operators/attention_lstm_op.cc @@ -24,28 +24,28 @@ namespace operators { void AttentionLSTMOp::InferShape(framework::InferShapeContext* ctx) const { PADDLE_ENFORCE(ctx->HasInput("X"), - "Input(X) of AttentionLSTM should not be null."); + "Assert only one Input(X) of AttentionLSTM."); PADDLE_ENFORCE(ctx->HasInput("C0"), - "Input(C0) of AttentionLSTM should not be null."); + "Assert only one Input(C0) of AttentionLSTM."); PADDLE_ENFORCE(ctx->HasInput("LSTMWeight"), - "Input(LSTMWeight) of AttentionLSTM should not be null."); + "Assert only one Input(LSTMWeight) of AttentionLSTM."); PADDLE_ENFORCE(ctx->HasInput("LSTMBias"), - "Input(LSTMBias) of AttentionLSTM should not be null."); + "Assert only one Input(LSTMBias) of AttentionLSTM."); PADDLE_ENFORCE(ctx->HasInput("AttentionWeight"), - "Input(AttentionWeight) of AttentionLSTM should not be null."); + "Assert only one Input(AttentionWeight) of AttentionLSTM."); PADDLE_ENFORCE(ctx->HasOutput("Hidden"), - "Output(Hidden) of AttentionLSTM should not be null."); + "Assert only one Output(Hidden) of AttentionLSTM."); PADDLE_ENFORCE(ctx->HasOutput("Cell"), - "Output(Cell) of AttentionLSTM should not be null."); + "Assert only one Output(Cell) of AttentionLSTM."); PADDLE_ENFORCE(ctx->HasOutput("AttentionedX"), - "Output(AttentionedX) of AttentionLSTM should not be null."); + "Assert only one Output(AttentionedX) of AttentionLSTM."); PADDLE_ENFORCE(ctx->HasOutput("AttentionFCOut"), - "Output(AttentionFCOut) of AttentionLSTM should not be null."); + "Assert only one Output(AttentionFCOut) of AttentionLSTM."); PADDLE_ENFORCE(ctx->HasOutput("LSTMX"), - "Output(LSTMX) of AttentionLSTM should not be null."); + "Assert only one Output(LSTMX) of AttentionLSTM."); PADDLE_ENFORCE(ctx->HasOutput("LSTMOUT"), - "Output(LSTMOUT) of AttentionLSTM should not be null."); + "Assert only one Output(LSTMOUT) of AttentionLSTM."); auto x_dims = ctx->GetInputDim("X"); const int M = x_dims[1]; diff --git a/paddle/fluid/operators/conv_cudnn_op.cu.cc b/paddle/fluid/operators/conv_cudnn_op.cu.cc index 22cbf680c0670552fb014043c69fcadc56863529..4a7a6bcf7154d5680de751e3c933be46fb09fd74 100644 --- a/paddle/fluid/operators/conv_cudnn_op.cu.cc +++ 
b/paddle/fluid/operators/conv_cudnn_op.cu.cc @@ -118,7 +118,6 @@ class CUDNNConvOpKernel : public framework::OpKernel { output_channels / groups * output_height * output_width * output_depth; int group_offset_filter = filter->numel() / groups; // ------------------- cudnn conv workspace --------------------- - void* cudnn_workspace = nullptr; size_t workspace_size_in_bytes; // final workspace to allocate. size_t workspace_size_limit = kCONV_CUDNN_WORKSPACE_LIMIT_BYTES; if (user_workspace_size > 0) { @@ -159,20 +158,18 @@ class CUDNNConvOpKernel : public framework::OpKernel { PADDLE_ENFORCE_LE(workspace_size_in_bytes, workspace_size_limit, "workspace_size to be allocated exceeds the limit"); - // Allocate on GPU memory - platform::CUDAPlace gpu = boost::get(ctx.GetPlace()); - cudnn_workspace = paddle::memory::Alloc(gpu, workspace_size_in_bytes); // ------------------- cudnn conv forward --------------------- ScalingParamType alpha = 1.0f, beta = 0.0f; for (int i = 0; i < groups; i++) { - CUDNN_ENFORCE(platform::dynload::cudnnConvolutionForward( - handle, &alpha, cudnn_input_desc, input_data + i * group_offset_in, - cudnn_filter_desc, filter_data + i * group_offset_filter, - cudnn_conv_desc, algo, cudnn_workspace, workspace_size_in_bytes, - &beta, cudnn_output_desc, output_data + i * group_offset_out)); + auto cudnn_func = [&](void* cudnn_workspace) { + CUDNN_ENFORCE(platform::dynload::cudnnConvolutionForward( + handle, &alpha, cudnn_input_desc, input_data + i * group_offset_in, + cudnn_filter_desc, filter_data + i * group_offset_filter, + cudnn_conv_desc, algo, cudnn_workspace, workspace_size_in_bytes, + &beta, cudnn_output_desc, output_data + i * group_offset_out)); + }; + dev_ctx.RunCudnnFuncWithWorkspace(cudnn_func, workspace_size_in_bytes); } - // Release the cudnn workspace - paddle::memory::Free(gpu, cudnn_workspace); } }; @@ -314,11 +311,7 @@ class CUDNNConvGradOpKernel : public framework::OpKernel { cudnn_filter_desc, filter_algo, &tmp_size)); workspace_size_in_bytes = std::max(workspace_size_in_bytes, tmp_size); } - // ------------------- cudnn conv workspace --------------------- - // Already on GPU - void* cudnn_workspace = nullptr; - platform::CUDAPlace gpu = boost::get(ctx.GetPlace()); - cudnn_workspace = paddle::memory::Alloc(gpu, workspace_size_in_bytes); + // ------------------- cudnn conv backward data --------------------- ScalingParamType alpha = 1.0f, beta = 0.0f; if (input_grad) { @@ -326,12 +319,15 @@ class CUDNNConvGradOpKernel : public framework::OpKernel { // Because beta is zero, it is unnecessary to reset input_grad. 
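// The hand-managed workspace is gone: each cuDNN call below is wrapped in a lambda and passed to dev_ctx.RunCudnnFuncWithWorkspace, which lends it workspace_size_in_bytes bytes for exactly the duration of the call.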
for (int i = 0; i < groups; i++) { - CUDNN_ENFORCE(platform::dynload::cudnnConvolutionBackwardData( - handle, &alpha, cudnn_filter_desc, - filter_data + i * group_offset_filter, cudnn_output_grad_desc, - output_grad_data + i * group_offset_out, cudnn_conv_desc, data_algo, - cudnn_workspace, workspace_size_in_bytes, &beta, cudnn_input_desc, - input_grad_data + i * group_offset_in)); + auto cudnn_func = [&](void* cudnn_workspace) { + CUDNN_ENFORCE(platform::dynload::cudnnConvolutionBackwardData( + handle, &alpha, cudnn_filter_desc, + filter_data + i * group_offset_filter, cudnn_output_grad_desc, + output_grad_data + i * group_offset_out, cudnn_conv_desc, + data_algo, cudnn_workspace, workspace_size_in_bytes, &beta, + cudnn_input_desc, input_grad_data + i * group_offset_in)); + }; + dev_ctx.RunCudnnFuncWithWorkspace(cudnn_func, workspace_size_in_bytes); } } // ------------------- cudnn conv backward filter --------------------- @@ -339,16 +335,17 @@ class CUDNNConvGradOpKernel : public framework::OpKernel { T* filter_grad_data = filter_grad->mutable_data(ctx.GetPlace()); // Because beta is zero, it is unnecessary to reset filter_grad. for (int i = 0; i < groups; i++) { - CUDNN_ENFORCE(platform::dynload::cudnnConvolutionBackwardFilter( - handle, &alpha, cudnn_input_desc, input_data + i * group_offset_in, - cudnn_output_grad_desc, output_grad_data + i * group_offset_out, - cudnn_conv_desc, filter_algo, cudnn_workspace, - workspace_size_in_bytes, &beta, cudnn_filter_desc, - filter_grad_data + i * group_offset_filter)); + auto cudnn_func = [&](void* cudnn_workspace) { + CUDNN_ENFORCE(platform::dynload::cudnnConvolutionBackwardFilter( + handle, &alpha, cudnn_input_desc, + input_data + i * group_offset_in, cudnn_output_grad_desc, + output_grad_data + i * group_offset_out, cudnn_conv_desc, + filter_algo, cudnn_workspace, workspace_size_in_bytes, &beta, + cudnn_filter_desc, filter_grad_data + i * group_offset_filter)); + }; + dev_ctx.RunCudnnFuncWithWorkspace(cudnn_func, workspace_size_in_bytes); } } - // Release the cudnn workspace - paddle::memory::Free(gpu, cudnn_workspace); } }; diff --git a/paddle/fluid/operators/conv_mkldnn_op.cc b/paddle/fluid/operators/conv_mkldnn_op.cc index c5cbadc892904dc064b49ebc461944c4671a69da..5385bcdaec3f9e86cf0d7bf33f270e48ae186fba 100644 --- a/paddle/fluid/operators/conv_mkldnn_op.cc +++ b/paddle/fluid/operators/conv_mkldnn_op.cc @@ -130,12 +130,13 @@ class ConvMKLDNNHandler : public platform::MKLDNNHandler { std::shared_ptr AcquireWeightsMemoryFromPrimitive( const std::shared_ptr user_weights_memory_p, - std::vector& pipeline) { // NOLINT + std::vector& pipeline, // NOLINT + bool is_persistent = false) { auto user_weights_pd = user_weights_memory_p->get_primitive_desc(); auto weights_pd = conv_pd_->weights_primitive_desc(); return this->AcquireMemory(weights_pd, user_weights_pd, user_weights_memory_p, "@weights_mem_p", - pipeline); + pipeline, is_persistent); } std::shared_ptr AcquireBiasMemoryFromPrimitive( @@ -266,6 +267,8 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel { PADDLE_ENFORCE(paddle::platform::is_cpu_place(ctx.GetPlace()), "It must use CPUPlace."); + const bool is_test = ctx.Attr("is_test"); + auto& dev_ctx = ctx.template device_context(); const auto& mkldnn_engine = dev_ctx.GetEngine(); @@ -296,10 +299,10 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel { std::vector strides = ctx.Attr>("strides"); std::vector paddings = ctx.Attr>("paddings"); std::vector dilations = ctx.Attr>("dilations"); + bool fuse_relu = 
ctx.Attr("fuse_relu"); int groups = ctx.Attr("groups"); - // TODO(pzelazko-intel) add support for group convolution and dilation - PADDLE_ENFORCE(groups == 1, "group convolution is not implemented yet"); + // TODO: add support for dilation PADDLE_ENFORCE( dilations.size() == 2 && dilations[0] == 1 && dilations[1] == 1, "dilation in convolution is not implemented yet"); @@ -310,6 +313,19 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel { std::vector src_tz = paddle::framework::vectorize2int(input->dims()); std::vector weights_tz = paddle::framework::vectorize2int(filter->dims()); + int g = std::max(groups, 1); + if (g > 1) { + int o = weights_tz[0]; + int i = weights_tz[1]; + int h = weights_tz[2]; + int w = weights_tz[3]; + weights_tz.resize(5); + weights_tz[0] = g; + weights_tz[1] = o / g; + weights_tz[2] = i; + weights_tz[3] = h; + weights_tz[4] = w; + } std::vector dst_tz = paddle::framework::vectorize2int(output->dims()); // Get unique name for storing MKLDNN primitives @@ -323,7 +339,8 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel { auto user_src_md = platform::MKLDNNMemDesc( {src_tz}, platform::MKLDNNGetDataType(), input->format()); auto user_weights_md = platform::MKLDNNMemDesc( - {weights_tz}, platform::MKLDNNGetDataType(), filter->format()); + {weights_tz}, platform::MKLDNNGetDataType(), + (g == 1) ? filter->format() : mkldnn::memory::format::goihw); /* create memory descriptor for convolution without specified format * ('any') which lets a primitive (convolution in this case) choose @@ -336,7 +353,8 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel { auto src_md = platform::MKLDNNMemDesc( src_tz, platform::MKLDNNGetDataType(), chosen_memory_format); auto weights_md = platform::MKLDNNMemDesc( - weights_tz, platform::MKLDNNGetDataType(), chosen_memory_format); + weights_tz, platform::MKLDNNGetDataType(), + (g == 1) ? chosen_memory_format : mkldnn::memory::format::goihw); std::vector bias_tz; // TODO(mgallus): avoid empty vector creation. // Currently used whenever bias is != nullptr. auto dst_md = platform::MKLDNNMemDesc( @@ -348,11 +366,12 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel { bias_tz = paddle::framework::vectorize2int(bias->dims()); auto bias_md = platform::MKLDNNMemDesc( bias_tz, platform::MKLDNNGetDataType(), memory::format::x); - conv_pd = ConvFwdPrimitiveDesc(src_md, weights_md, bias_md, dst_md, - strides, paddings, mkldnn_engine); + conv_pd = + ConvFwdPrimitiveDesc(src_md, weights_md, bias_md, dst_md, strides, + paddings, mkldnn_engine, fuse_relu); } else { conv_pd = ConvFwdPrimitiveDesc(src_md, weights_md, dst_md, strides, - paddings, mkldnn_engine); + paddings, mkldnn_engine, fuse_relu); } // Save conv_pd/src_memory/weights_memory for backward pass dev_ctx.SetBlob(key_conv_pd, conv_pd); @@ -371,7 +390,7 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel { auto src_memory_p = handler.AcquireSrcMemoryFromPrimitive(user_src_memory_p, pipeline); auto weights_memory_p = handler.AcquireWeightsMemoryFromPrimitive( - user_weights_memory_p, pipeline); + user_weights_memory_p, pipeline, is_test); auto dst_memory_p = handler.AcquireDstMemoryFromPrimitive(to_void_cast(output_data)); @@ -402,11 +421,26 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel { } private: + mkldnn::primitive_attr AddRelu() const { + // Fusion with ReLU layer is executed through the PostOps feature. Create a + // PostOps object and configure it to execute an eltwise relu operation. 
+ mkldnn::primitive_attr conv_attr; + constexpr float scale = 1.0f; + constexpr float negative_slope = 0.0f; + constexpr float placeholder = 0.0f; + mkldnn::post_ops post_operations; + post_operations.append_eltwise(scale, mkldnn::algorithm::eltwise_relu, + negative_slope, placeholder); + conv_attr.set_post_ops(post_operations); + return conv_attr; + } + std::unique_ptr ConvFwdPrimitiveDesc(const memory::desc& src, const memory::desc& weights, const memory::desc& dst, const std::vector& strides, const std::vector& paddings, - const mkldnn::engine& engine) const { + const mkldnn::engine& engine, + const bool fuse_relu) const { memory::dims stride_dims = {strides[0], strides[1]}; memory::dims padding_dims = {paddings[0], paddings[1]}; @@ -415,8 +449,13 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel { dst, stride_dims, padding_dims, padding_dims, mkldnn::padding_kind::zero); - auto p_conv_pd = - new mkldnn::convolution_forward::primitive_desc(conv_desc, engine); + mkldnn::primitive_attr conv_attr; + if (fuse_relu) { + conv_attr = AddRelu(); + } + + auto p_conv_pd = new mkldnn::convolution_forward::primitive_desc( + conv_desc, conv_attr, engine); return std::unique_ptr( p_conv_pd); @@ -427,7 +466,8 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel { const memory::desc& bias, const memory::desc& dst, const std::vector& strides, const std::vector& paddings, - const mkldnn::engine& engine) const { + const mkldnn::engine& engine, + const bool fuse_relu) const { memory::dims stride_dims = {strides[0], strides[1]}; memory::dims padding_dims = {paddings[0], paddings[1]}; @@ -436,8 +476,13 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel { bias, dst, stride_dims, padding_dims, padding_dims, mkldnn::padding_kind::zero); - auto p_conv_pd = - new mkldnn::convolution_forward::primitive_desc(conv_desc, engine); + mkldnn::primitive_attr conv_attr; + if (fuse_relu) { + conv_attr = AddRelu(); + } + + auto p_conv_pd = new mkldnn::convolution_forward::primitive_desc( + conv_desc, conv_attr, engine); return std::unique_ptr( p_conv_pd); diff --git a/paddle/fluid/operators/conv_op.cc b/paddle/fluid/operators/conv_op.cc index 61ca80877a6dfcdf30a0ff346342116e36eec6f2..41d4fcf6de7c8fcb3cfbb2063b0a2ac1a2356168 100644 --- a/paddle/fluid/operators/conv_op.cc +++ b/paddle/fluid/operators/conv_op.cc @@ -109,6 +109,7 @@ framework::OpKernelType ConvOp::GetExpectedKernelType( } void Conv2DOpMaker::Make() { + AddAttr("is_test", "").SetDefault(false); AddInput( "Input", "(Tensor) The input tensor of convolution operator. " @@ -161,6 +162,8 @@ void Conv2DOpMaker::Make() { AddAttr("use_mkldnn", "(bool, default false) Only used in mkldnn kernel") .SetDefault(false); + AddAttr("fuse_relu", "(bool, default false) Only used in mkldnn kernel") + .SetDefault(false); AddAttr( "data_format", "(string, default NCHW) Only used in " diff --git a/paddle/fluid/operators/conv_transpose_cudnn_op.cu.cc b/paddle/fluid/operators/conv_transpose_cudnn_op.cu.cc index 82fff68e7557b3f0b44e6faf2a50e5a0ecbba589..73831611d01b8c5b8d2d9f7f15634a0094e4a608 100644 --- a/paddle/fluid/operators/conv_transpose_cudnn_op.cu.cc +++ b/paddle/fluid/operators/conv_transpose_cudnn_op.cu.cc @@ -76,7 +76,6 @@ class CUDNNConvTransposeOpKernel : public framework::OpKernel { conv_desc.descriptor(paddings, strides, dilations); // ------------------- cudnn conv workspace --------------------- - void* cudnn_workspace = nullptr; size_t workspace_size_in_bytes; // final workspace to allocate. 
size_t workspace_size_limit = kConvCUDNNWorkspaceLimitBytes; if (user_workspace_size > 0) { @@ -100,25 +99,21 @@ class CUDNNConvTransposeOpKernel : public framework::OpKernel { handle, cudnn_filter_desc, cudnn_input_desc, cudnn_conv_desc, cudnn_output_desc, algo, &workspace_size_in_bytes)); - // Allocate on GPU memory - platform::CUDAPlace gpu = boost::get(ctx.GetPlace()); - cudnn_workspace = paddle::memory::Alloc(gpu, workspace_size_in_bytes); - // ------------------- cudnn conv transpose forward --------------------- int input_offset = input->numel() / input->dims()[0] / groups; int output_offset = output->numel() / output->dims()[0] / groups; int filter_offset = filter->numel() / groups; T alpha = 1.0f, beta = 0.0f; for (int g = 0; g < groups; g++) { - CUDNN_ENFORCE(platform::dynload::cudnnConvolutionBackwardData( - handle, &alpha, cudnn_filter_desc, filter_data + filter_offset * g, - cudnn_input_desc, input_data + input_offset * g, cudnn_conv_desc, - algo, cudnn_workspace, workspace_size_in_bytes, &beta, - cudnn_output_desc, output_data + output_offset * g)); + auto cudnn_func = [&](void* cudnn_workspace) { + CUDNN_ENFORCE(platform::dynload::cudnnConvolutionBackwardData( + handle, &alpha, cudnn_filter_desc, filter_data + filter_offset * g, + cudnn_input_desc, input_data + input_offset * g, cudnn_conv_desc, + algo, cudnn_workspace, workspace_size_in_bytes, &beta, + cudnn_output_desc, output_data + output_offset * g)); + }; + dev_ctx.RunCudnnFuncWithWorkspace(cudnn_func, workspace_size_in_bytes); } - - // Release the cudnn workspace - paddle::memory::Free(gpu, cudnn_workspace); } }; @@ -206,11 +201,6 @@ class CUDNNConvTransposeGradOpKernel : public framework::OpKernel { std::max(workspace_size_in_bytes, bwd_filter_ws_size); } - // ------------------- cudnn conv workspace --------------------- - // Already on GPU - void* cudnn_workspace = nullptr; - platform::CUDAPlace gpu = boost::get(ctx.GetPlace()); - cudnn_workspace = paddle::memory::Alloc(gpu, workspace_size_in_bytes); // ------------------- cudnn conv backward data --------------------- // FIXME(typhoonzero): template type T may not be the same as cudnn call. int input_offset = input->numel() / input->dims()[0] / groups; @@ -222,12 +212,15 @@ class CUDNNConvTransposeGradOpKernel : public framework::OpKernel { T* input_grad_data = input_grad->mutable_data(ctx.GetPlace()); // Because beta is zero, it is unnecessary to reset input_grad. for (int g = 0; g < groups; g++) { - CUDNN_ENFORCE(platform::dynload::cudnnConvolutionForward( - handle, &alpha, cudnn_output_desc, - output_grad_data + output_grad_offset * g, cudnn_filter_desc, - filter_data + filter_offset * g, cudnn_conv_desc, data_algo, - cudnn_workspace, workspace_size_in_bytes, &beta, cudnn_input_desc, - input_grad_data + input_offset * g)); + auto cudnn_func = [&](void* cudnn_workspace) { + CUDNN_ENFORCE(platform::dynload::cudnnConvolutionForward( + handle, &alpha, cudnn_output_desc, + output_grad_data + output_grad_offset * g, cudnn_filter_desc, + filter_data + filter_offset * g, cudnn_conv_desc, data_algo, + cudnn_workspace, workspace_size_in_bytes, &beta, cudnn_input_desc, + input_grad_data + input_offset * g)); + }; + dev_ctx.RunCudnnFuncWithWorkspace(cudnn_func, workspace_size_in_bytes); } } @@ -237,17 +230,17 @@ class CUDNNConvTransposeGradOpKernel : public framework::OpKernel { // Because beta is zero, it is unnecessary to reset filter_grad. 
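The same refactor repeats at every cuDNN call site in this file: the kernel no longer owns the workspace, it only describes what to run inside one. A toy model of that inversion of control (RunWithWorkspace is a stand-in for Paddle's RunCudnnFuncWithWorkspace; the real method can cache and reuse one allocation across calls):

```cpp
#include <cstdlib>
#include <functional>

// Toy stand-in for dev_ctx.RunCudnnFuncWithWorkspace: the context owns
// the scratch buffer's lifetime and lends it to the caller's closure.
void RunWithWorkspace(const std::function<void(void*)>& func,
                      size_t size_in_bytes) {
  void* workspace = std::malloc(size_in_bytes);  // real code: GPU memory
  func(workspace);  // e.g. one of the cudnnConvolution* lambdas above
  std::free(workspace);  // always released here, never by the operator
}
```

This removes the paired Alloc/Free boilerplate from every kernel, along with the leak that an early return between them could cause.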
// Gradient with respect to the filter
    for (int g = 0; g < groups; g++) {
-      CUDNN_ENFORCE(platform::dynload::cudnnConvolutionBackwardFilter(
-          handle, &alpha, cudnn_output_desc,
-          output_grad_data + output_grad_offset * g, cudnn_input_desc,
-          input_data + input_offset * g, cudnn_conv_desc, filter_algo,
-          cudnn_workspace, workspace_size_in_bytes, &beta, cudnn_filter_desc,
-          filter_grad_data + filter_offset * g));
+      auto cudnn_func = [&](void* cudnn_workspace) {
+        CUDNN_ENFORCE(platform::dynload::cudnnConvolutionBackwardFilter(
+            handle, &alpha, cudnn_output_desc,
+            output_grad_data + output_grad_offset * g, cudnn_input_desc,
+            input_data + input_offset * g, cudnn_conv_desc, filter_algo,
+            cudnn_workspace, workspace_size_in_bytes, &beta,
+            cudnn_filter_desc, filter_grad_data + filter_offset * g));
+      };
+      dev_ctx.RunCudnnFuncWithWorkspace(cudnn_func, workspace_size_in_bytes);
    }
  }
-
-  // Release the cudnn workspace
-  paddle::memory::Free(gpu, cudnn_workspace);
  }
};
diff --git a/paddle/fluid/operators/cross_entropy_op.cc b/paddle/fluid/operators/cross_entropy_op.cc
index 578ab63bc380ee62d76e34b7cf3cbd590bfa2eda..66f19fe7ecfa51b2ce917f0c5fcb6d486f1a7307 100644
--- a/paddle/fluid/operators/cross_entropy_op.cc
+++ b/paddle/fluid/operators/cross_entropy_op.cc
@@ -138,6 +138,11 @@ class CrossEntropyOpMaker : public framework::OpProtoAndCheckerMaker {
  "(bool, default false), a flag indicating whether to "
  "interpret the given labels as soft labels.")
  .SetDefault(false);
+    AddAttr<int>("ignore_index",
+                 "(int, default -100), Specifies a target value that is "
+                 "ignored and does not contribute to the input gradient. "
+                 "Only valid if soft_label is set to False.")
+        .SetDefault(-100);
  AddComment(R"DOC(
CrossEntropy Operator.
diff --git a/paddle/fluid/operators/cross_entropy_op.h b/paddle/fluid/operators/cross_entropy_op.h
index 36b58d80144d242277f6fc970a3a61a6721d4b50..03974a7fc511b1e1cb5b0eca532b260fdf9bf964 100644
--- a/paddle/fluid/operators/cross_entropy_op.h
+++ b/paddle/fluid/operators/cross_entropy_op.h
@@ -40,7 +40,7 @@ class CrossEntropyOpKernel : public framework::OpKernel<T> {
  math::CrossEntropyFunctor<DeviceContext, T>()(
  ctx.template device_context<DeviceContext>(), &y_2d, &x_2d, &labels_2d,
-        ctx.Attr<bool>("soft_label"));
+        ctx.Attr<bool>("soft_label"), ctx.Attr<int>("ignore_index"));
  }
};
@@ -74,16 +74,22 @@ class XeGradFunctor {
  const T* dy,           // NOLINT
  const T* x,            // NOLINT
  const int64_t* label,  // NOLINT
-               size_t num_classes)
-      : dx_(dx), dy_(dy), x_(x), label_(label), num_classes_(num_classes) {}
+               size_t num_classes, size_t ignore_index)
+      : dx_(dx),
+        dy_(dy),
+        x_(x),
+        label_(label),
+        num_classes_(num_classes),
+        ignore_index_(ignore_index) {}
  HOSTDEVICE void operator()(size_t sample_id) {
  auto x_is_true_offset = sample_id * num_classes_ + label_[sample_id];
  for (size_t x_offset = sample_id * num_classes_;
  x_offset < (sample_id + 1) * num_classes_; ++x_offset) {
-      dx_[x_offset] = x_offset != x_is_true_offset
-                          ? static_cast<T>(0)
-                          : -dy_[sample_id] / x_[x_offset];
+      dx_[x_offset] =
+          (x_offset != x_is_true_offset || label_[sample_id] == ignore_index_)
+              ? static_cast<T>(0)
+              : -dy_[sample_id] / x_[x_offset];
  }
  }
@@ -93,6 +99,7 @@ class XeGradFunctor {
  const T* x_;
  const int64_t* label_;
  size_t num_classes_;
+  size_t ignore_index_;
};
template <typename DeviceContext, typename T>
@@ -109,6 +116,7 @@ class CrossEntropyGradientOpKernel : public framework::OpKernel<T> {
  // unnecessary to convert tensors to 2-D views.
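The gradient rule encoded by the new XeGradFunctor is compact enough to restate as a plain loop. The sketch below (a hypothetical free function) shows what ignore_index means for hard labels: an ignored sample's whole gradient row is zeroed, so it contributes nothing to training.

```cpp
#include <cstddef>
#include <cstdint>

// Hard-label cross-entropy gradient with ignore_index, mirroring
// XeGradFunctor: dL/dx is -dy/x at the true-class slot, 0 elsewhere,
// and 0 everywhere for samples whose label equals ignore_index.
void XeGrad(float* dx, const float* dy, const float* x,
            const int64_t* label, size_t n, size_t num_classes,
            int64_t ignore_index) {
  for (size_t i = 0; i < n; ++i) {
    for (size_t c = 0; c < num_classes; ++c) {
      size_t off = i * num_classes + c;
      bool true_class = static_cast<int64_t>(c) == label[i];
      dx[off] = (true_class && label[i] != ignore_index)
                    ? -dy[i] / x[off]
                    : 0.0f;
    }
  }
}
```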
int rank = x->dims().size(); int64_t class_num = x->dims()[rank - 1]; + int64_t ignore_index = ctx.Attr("ignore_index"); if (ctx.Attr("soft_label")) { XeSoftlabelGradFunctor functor(dx_data, dy->data(), x->data(), label->data(), @@ -118,9 +126,9 @@ class CrossEntropyGradientOpKernel : public framework::OpKernel { static_cast(dx->numel())); for_range(functor); } else { - XeGradFunctor functor(dx_data, dy->data(), x->data(), - label->data(), - static_cast(class_num)); + XeGradFunctor functor( + dx_data, dy->data(), x->data(), label->data(), + static_cast(class_num), static_cast(ignore_index)); platform::ForRange for_range( ctx.template device_context(), static_cast(dy->numel())); diff --git a/paddle/fluid/operators/detection/bbox_util.h b/paddle/fluid/operators/detection/bbox_util.h index 0dee1781623d5a62830545c0952e5aadbe37accb..6abeca1da443248d6ad3c1bcc64dd775d77f4ed8 100644 --- a/paddle/fluid/operators/detection/bbox_util.h +++ b/paddle/fluid/operators/detection/bbox_util.h @@ -9,6 +9,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ #pragma once +#include #include "paddle/fluid/framework/eigen.h" #include "paddle/fluid/framework/tensor.h" @@ -21,7 +22,7 @@ namespace operators { */ template inline void BoxToDelta(const int box_num, const framework::Tensor& ex_boxes, - const framework::Tensor& gt_boxes, const T* weights, + const framework::Tensor& gt_boxes, const float* weights, const bool normalized, framework::Tensor* box_delta) { auto ex_boxes_et = framework::EigenTensor::From(ex_boxes); auto gt_boxes_et = framework::EigenTensor::From(gt_boxes); @@ -62,5 +63,35 @@ void Gather(const T* in, const int in_stride, const int* index, const int num, } } +template +void BboxOverlaps(const framework::Tensor& r_boxes, + const framework::Tensor& c_boxes, + framework::Tensor* overlaps) { + auto r_boxes_et = framework::EigenTensor::From(r_boxes); + auto c_boxes_et = framework::EigenTensor::From(c_boxes); + auto overlaps_et = framework::EigenTensor::From(*overlaps); + int r_num = r_boxes.dims()[0]; + int c_num = c_boxes.dims()[0]; + auto zero = static_cast(0.0); + T r_box_area, c_box_area, x_min, y_min, x_max, y_max, inter_w, inter_h, + inter_area; + for (int i = 0; i < r_num; ++i) { + r_box_area = (r_boxes_et(i, 2) - r_boxes_et(i, 0) + 1) * + (r_boxes_et(i, 3) - r_boxes_et(i, 1) + 1); + for (int j = 0; j < c_num; ++j) { + c_box_area = (c_boxes_et(j, 2) - c_boxes_et(j, 0) + 1) * + (c_boxes_et(j, 3) - c_boxes_et(j, 1) + 1); + x_min = std::max(r_boxes_et(i, 0), c_boxes_et(j, 0)); + y_min = std::max(r_boxes_et(i, 1), c_boxes_et(j, 1)); + x_max = std::min(r_boxes_et(i, 2), c_boxes_et(j, 2)); + y_max = std::min(r_boxes_et(i, 3), c_boxes_et(j, 3)); + inter_w = std::max(x_max - x_min + 1, zero); + inter_h = std::max(y_max - y_min + 1, zero); + inter_area = inter_w * inter_h; + overlaps_et(i, j) = inter_area / (r_box_area + c_box_area - inter_area); + } + } +} + } // namespace operators } // namespace paddle diff --git a/paddle/fluid/operators/detection/generate_proposal_labels_op.cc b/paddle/fluid/operators/detection/generate_proposal_labels_op.cc index be06dc19743cfa6f093bcb3f4e9f91af315d4211..d7a53f1bef98ecda3ba7b36323678a11a632a15c 100644 --- a/paddle/fluid/operators/detection/generate_proposal_labels_op.cc +++ b/paddle/fluid/operators/detection/generate_proposal_labels_op.cc @@ -42,10 +42,11 @@ class GenerateProposalLabelsOp : public framework::OperatorWithKernel { 
"Input(RpnRois) shouldn't be null."); PADDLE_ENFORCE(ctx->HasInput("GtClasses"), "Input(GtClasses) shouldn't be null."); + PADDLE_ENFORCE(ctx->HasInput("IsCrowd"), + "Input(IsCrowd) shouldn't be null."); PADDLE_ENFORCE(ctx->HasInput("GtBoxes"), "Input(GtBoxes) shouldn't be null."); - PADDLE_ENFORCE(ctx->HasInput("ImScales"), - "Input(ImScales) shouldn't be null."); + PADDLE_ENFORCE(ctx->HasInput("ImInfo"), "Input(ImInfo) shouldn't be null."); PADDLE_ENFORCE(ctx->HasOutput("Rois"), "Output(Rois) of RpnTargetAssignOp should not be null"); @@ -64,22 +65,21 @@ class GenerateProposalLabelsOp : public framework::OperatorWithKernel { auto rpn_rois_dims = ctx->GetInputDim("RpnRois"); auto gt_classes_dims = ctx->GetInputDim("GtClasses"); + auto is_crowd_dims = ctx->GetInputDim("IsCrowd"); auto gt_boxes_dims = ctx->GetInputDim("GtBoxes"); - auto im_scales_dims = ctx->GetInputDim("ImScales"); + auto im_info_dims = ctx->GetInputDim("ImInfo"); PADDLE_ENFORCE_EQ(rpn_rois_dims.size(), 2, "The rank of Input(RpnRois) must be 2."); - PADDLE_ENFORCE_EQ(gt_classes_dims.size(), 1, - "The rank of Input(GtClasses) must be 1."); PADDLE_ENFORCE_EQ(gt_boxes_dims.size(), 2, "The rank of Input(GtBoxes) must be 2."); - PADDLE_ENFORCE_EQ(im_scales_dims.size(), 1, - "The rank of Input(ImScales) must be 1."); + PADDLE_ENFORCE_EQ(im_info_dims.size(), 2, + "The rank of Input(ImInfo) must be 2."); int class_nums = ctx->Attrs().Get("class_nums"); ctx->SetOutputDim("Rois", {-1, 4}); - ctx->SetOutputDim("LabelsInt32", {-1}); + ctx->SetOutputDim("LabelsInt32", {-1, 1}); ctx->SetOutputDim("BboxTargets", {-1, 4 * class_nums}); ctx->SetOutputDim("BboxInsideWeights", {-1, 4 * class_nums}); ctx->SetOutputDim("BboxOutsideWeights", {-1, 4 * class_nums}); @@ -105,45 +105,18 @@ void Concat(const platform::CPUDeviceContext& context, concat_functor(context, inputs, axis, out_tensor); } -template -void BboxOverlaps(const Tensor& r_boxes, const Tensor& c_boxes, - Tensor* overlaps) { - auto r_boxes_et = framework::EigenTensor::From(r_boxes); - auto c_boxes_et = framework::EigenTensor::From(c_boxes); - auto overlaps_et = framework::EigenTensor::From(*overlaps); - int r_num = r_boxes.dims()[0]; - int c_num = c_boxes.dims()[0]; - auto zero = static_cast(0.0); - T r_box_area, c_box_area, x_min, y_min, x_max, y_max, inter_w, inter_h, - inter_area; - for (int i = 0; i < r_num; ++i) { - r_box_area = (r_boxes_et(i, 2) - r_boxes_et(i, 0) + 1) * - (r_boxes_et(i, 3) - r_boxes_et(i, 1) + 1); - for (int j = 0; j < c_num; ++j) { - c_box_area = (c_boxes_et(j, 2) - c_boxes_et(j, 0) + 1) * - (c_boxes_et(j, 3) - c_boxes_et(j, 1) + 1); - x_min = std::max(r_boxes_et(i, 0), c_boxes_et(j, 0)); - y_min = std::max(r_boxes_et(i, 1), c_boxes_et(j, 1)); - x_max = std::min(r_boxes_et(i, 2), c_boxes_et(j, 2)); - y_max = std::min(r_boxes_et(i, 3), c_boxes_et(j, 3)); - inter_w = std::max(x_max - x_min + 1, zero); - inter_h = std::max(y_max - y_min + 1, zero); - inter_area = inter_w * inter_h; - overlaps_et(i, j) = inter_area / (r_box_area + c_box_area - inter_area); - } - } -} - template std::vector> SampleFgBgGt( const platform::CPUDeviceContext& context, Tensor* iou, - const int batch_size_per_im, const float fg_fraction, const float fg_thresh, - const float bg_thresh_hi, const float bg_thresh_lo, - std::minstd_rand engine) { + const Tensor& is_crowd, const int batch_size_per_im, + const float fg_fraction, const float fg_thresh, const float bg_thresh_hi, + const float bg_thresh_lo, std::minstd_rand engine, const bool use_random) { std::vector fg_inds; std::vector 
bg_inds; std::vector gt_inds; - T* proposal_to_gt_overlaps = iou->mutable_data(context.GetPlace()); + int64_t gt_num = is_crowd.numel(); + const int* crowd_data = is_crowd.data(); + T* proposal_to_gt_overlaps = iou->data(); int64_t row = iou->dims()[0]; int64_t col = iou->dims()[1]; float epsilon = 0.00001; @@ -152,6 +125,9 @@ std::vector> SampleFgBgGt( for (int64_t i = 0; i < row; ++i) { const T* v = proposal_to_gt_overlaps + i * col; T max_overlap = *std::max_element(v, v + col); + if ((i < gt_num) && (crowd_data[i])) { + max_overlap = -1.0; + } if (max_overlap > fg_thresh) { for (int64_t j = 0; j < col; ++j) { T val = proposal_to_gt_overlaps[i * col + j]; @@ -170,17 +146,19 @@ std::vector> SampleFgBgGt( } // Reservoir Sampling + std::uniform_real_distribution uniform(0, 1); int fg_rois_per_im = std::floor(batch_size_per_im * fg_fraction); int fg_rois_this_image = fg_inds.size(); int fg_rois_per_this_image = std::min(fg_rois_per_im, fg_rois_this_image); - std::uniform_real_distribution uniform(0, 1); - const int64_t fg_size = static_cast(fg_inds.size()); - if (fg_size > fg_rois_per_this_image) { - for (int64_t i = fg_rois_per_this_image; i < fg_size; ++i) { - int rng_ind = std::floor(uniform(engine) * i); - if (rng_ind < fg_rois_per_this_image) { - std::iter_swap(fg_inds.begin() + rng_ind, fg_inds.begin() + i); - std::iter_swap(gt_inds.begin() + rng_ind, gt_inds.begin() + i); + if (use_random) { + const int64_t fg_size = static_cast(fg_inds.size()); + if (fg_size > fg_rois_per_this_image) { + for (int64_t i = fg_rois_per_this_image; i < fg_size; ++i) { + int rng_ind = std::floor(uniform(engine) * i); + if (rng_ind < fg_rois_per_this_image) { + std::iter_swap(fg_inds.begin() + rng_ind, fg_inds.begin() + i); + std::iter_swap(gt_inds.begin() + rng_ind, gt_inds.begin() + i); + } } } } @@ -192,12 +170,14 @@ std::vector> SampleFgBgGt( int bg_rois_per_image = batch_size_per_im - fg_rois_per_this_image; int bg_rois_this_image = bg_inds.size(); int bg_rois_per_this_image = std::min(bg_rois_per_image, bg_rois_this_image); - const int64_t bg_size = static_cast(bg_inds.size()); - if (bg_size > bg_rois_per_this_image) { - for (int64_t i = bg_rois_per_this_image; i < bg_size; ++i) { - int rng_ind = std::floor(uniform(engine) * i); - if (rng_ind < fg_rois_per_this_image) - std::iter_swap(bg_inds.begin() + rng_ind, bg_inds.begin() + i); + if (use_random) { + const int64_t bg_size = static_cast(bg_inds.size()); + if (bg_size > bg_rois_per_this_image) { + for (int64_t i = bg_rois_per_this_image; i < bg_size; ++i) { + int rng_ind = std::floor(uniform(engine) * i); + if (rng_ind < fg_rois_per_this_image) + std::iter_swap(bg_inds.begin() + rng_ind, bg_inds.begin() + i); + } } } std::vector new_bg_inds(bg_inds.begin(), @@ -248,14 +228,14 @@ void GatherBoxesLabels(const platform::CPUDeviceContext& context, template std::vector SampleRoisForOneImage( const platform::CPUDeviceContext& context, Tensor* rpn_rois, - Tensor* gt_classes, Tensor* gt_boxes, Tensor* im_scale, + Tensor* gt_classes, Tensor* is_crowd, Tensor* gt_boxes, Tensor* im_info, const int batch_size_per_im, const float fg_fraction, const float fg_thresh, const float bg_thresh_hi, const float bg_thresh_lo, const std::vector& bbox_reg_weights, const int class_nums, - std::minstd_rand engine) { + std::minstd_rand engine, bool use_random) { auto rpn_rois_et = framework::EigenTensor::From(*rpn_rois); - auto im_scale_data = im_scale->data()[0]; - rpn_rois_et = rpn_rois_et / im_scale_data; + auto im_scale = im_info->data()[2]; + rpn_rois_et = rpn_rois_et / 
im_scale; Tensor boxes; int proposals_num = gt_boxes->dims()[0] + rpn_rois->dims()[0]; @@ -270,8 +250,8 @@ std::vector SampleRoisForOneImage( // Generate proposal index std::vector> fg_bg_gt = SampleFgBgGt( - context, &proposal_to_gt_overlaps, batch_size_per_im, fg_fraction, - fg_thresh, bg_thresh_hi, bg_thresh_lo, engine); + context, &proposal_to_gt_overlaps, *is_crowd, batch_size_per_im, + fg_fraction, fg_thresh, bg_thresh_hi, bg_thresh_lo, engine, use_random); std::vector fg_inds = fg_bg_gt[0]; std::vector bg_inds = fg_bg_gt[1]; std::vector gt_inds = fg_bg_gt[2]; @@ -291,15 +271,15 @@ std::vector SampleRoisForOneImage( // Compute targets Tensor bbox_targets_single; bbox_targets_single.mutable_data(bbox_dim, context.GetPlace()); - BoxToDelta(fg_num, sampled_boxes, sampled_gts, nullptr, false, - &bbox_targets_single); + BoxToDelta(fg_num, sampled_boxes, sampled_gts, bbox_reg_weights.data(), + false, &bbox_targets_single); // Scale rois Tensor sampled_rois; sampled_rois.mutable_data(sampled_boxes.dims(), context.GetPlace()); auto sampled_rois_et = framework::EigenTensor::From(sampled_rois); auto sampled_boxes_et = framework::EigenTensor::From(sampled_boxes); - sampled_rois_et = sampled_boxes_et * im_scale_data; + sampled_rois_et = sampled_boxes_et * im_scale; // Expand box targets Tensor bbox_targets, bbox_inside_weights, bbox_outside_weights; @@ -351,8 +331,9 @@ class GenerateProposalLabelsKernel : public framework::OpKernel { void Compute(const framework::ExecutionContext& context) const override { auto* rpn_rois = context.Input("RpnRois"); auto* gt_classes = context.Input("GtClasses"); + auto* is_crowd = context.Input("IsCrowd"); auto* gt_boxes = context.Input("GtBoxes"); - auto* im_scales = context.Input("ImScales"); + auto* im_info = context.Input("ImInfo"); auto* rois = context.Output("Rois"); auto* labels_int32 = context.Output("LabelsInt32"); @@ -369,18 +350,21 @@ class GenerateProposalLabelsKernel : public framework::OpKernel { std::vector bbox_reg_weights = context.Attr>("bbox_reg_weights"); int class_nums = context.Attr("class_nums"); + bool use_random = context.Attr("use_random"); PADDLE_ENFORCE_EQ(rpn_rois->lod().size(), 1UL, "GenerateProposalLabelsOp rpn_rois needs 1 level of LoD"); PADDLE_ENFORCE_EQ( gt_classes->lod().size(), 1UL, "GenerateProposalLabelsOp gt_classes needs 1 level of LoD"); + PADDLE_ENFORCE_EQ(is_crowd->lod().size(), 1UL, + "GenerateProposalLabelsOp is_crowd needs 1 level of LoD"); PADDLE_ENFORCE_EQ(gt_boxes->lod().size(), 1UL, "GenerateProposalLabelsOp gt_boxes needs 1 level of LoD"); int64_t n = static_cast(rpn_rois->lod().back().size() - 1); rois->mutable_data({n * batch_size_per_im, kBoxDim}, context.GetPlace()); - labels_int32->mutable_data({n * batch_size_per_im}, + labels_int32->mutable_data({n * batch_size_per_im, 1}, context.GetPlace()); bbox_targets->mutable_data({n * batch_size_per_im, kBoxDim * class_nums}, context.GetPlace()); @@ -391,8 +375,7 @@ class GenerateProposalLabelsKernel : public framework::OpKernel { std::random_device rnd; std::minstd_rand engine; - int seed = - context.Attr("fix_seed") ? 
context.Attr("seed") : rnd(); + int seed = rnd(); engine.seed(seed); framework::LoD lod; @@ -403,19 +386,23 @@ class GenerateProposalLabelsKernel : public framework::OpKernel { auto rpn_rois_lod = rpn_rois->lod().back(); auto gt_classes_lod = gt_classes->lod().back(); + auto is_crowd_lod = is_crowd->lod().back(); auto gt_boxes_lod = gt_boxes->lod().back(); for (int i = 0; i < n; ++i) { Tensor rpn_rois_slice = rpn_rois->Slice(rpn_rois_lod[i], rpn_rois_lod[i + 1]); Tensor gt_classes_slice = gt_classes->Slice(gt_classes_lod[i], gt_classes_lod[i + 1]); + Tensor is_crowd_slice = + is_crowd->Slice(is_crowd_lod[i], is_crowd_lod[i + 1]); Tensor gt_boxes_slice = gt_boxes->Slice(gt_boxes_lod[i], gt_boxes_lod[i + 1]); - Tensor im_scales_slice = im_scales->Slice(i, i + 1); + Tensor im_info_slice = im_info->Slice(i, i + 1); std::vector tensor_output = SampleRoisForOneImage( - dev_ctx, &rpn_rois_slice, >_classes_slice, >_boxes_slice, - &im_scales_slice, batch_size_per_im, fg_fraction, fg_thresh, - bg_thresh_hi, bg_thresh_lo, bbox_reg_weights, class_nums, engine); + dev_ctx, &rpn_rois_slice, >_classes_slice, &is_crowd_slice, + >_boxes_slice, &im_info_slice, batch_size_per_im, fg_fraction, + fg_thresh, bg_thresh_hi, bg_thresh_lo, bbox_reg_weights, class_nums, + engine, use_random); Tensor sampled_rois = tensor_output[0]; Tensor sampled_labels_int32 = tensor_output[1]; Tensor sampled_bbox_targets = tensor_output[2]; @@ -442,7 +429,7 @@ class GenerateProposalLabelsKernel : public framework::OpKernel { bbox_inside_weights->set_lod(lod); bbox_outside_weights->set_lod(lod); rois->Resize({num_rois, kBoxDim}); - labels_int32->Resize({num_rois}); + labels_int32->Resize({num_rois, 1}); bbox_targets->Resize({num_rois, kBoxDim * class_nums}); bbox_inside_weights->Resize({num_rois, kBoxDim * class_nums}); bbox_outside_weights->Resize({num_rois, kBoxDim * class_nums}); @@ -455,8 +442,9 @@ class GenerateProposalLabelsOpMaker : public framework::OpProtoAndCheckerMaker { // TODO(buxingyuan): Add Document AddInput("RpnRois", "RpnRois."); AddInput("GtClasses", "GtClasses."); + AddInput("IsCrowd", "IsCrowd."); AddInput("GtBoxes", "GtBoxes."); - AddInput("ImScales", "ImScales."); + AddInput("ImInfo", "ImInfo."); AddOutput("Rois", "Rois."); AddOutput("LabelsInt32", "LabelsInt32."); @@ -471,8 +459,7 @@ class GenerateProposalLabelsOpMaker : public framework::OpProtoAndCheckerMaker { AddAttr("bg_thresh_lo", "bg_thresh_lo"); AddAttr>("bbox_reg_weights", "bbox_reg_weights"); AddAttr("class_nums", "class_nums"); - AddAttr("fix_seed", "fix_seed").SetDefault(false); - AddAttr("seed", "seed").SetDefault(0); + AddAttr("use_random", "use_random").SetDefault(true); AddComment(R"DOC( Generate Proposals Labels Operator. 
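The BboxOverlaps helper promoted into bbox_util.h above, and the JaccardOverlap change in the next file, share Detectron's inclusive-pixel convention: a box spanning x1..x2 covers x2 - x1 + 1 pixels. A scalar sketch of that IoU:

```cpp
#include <algorithm>

// IoU under the inclusive-pixel convention used by BboxOverlaps:
// boxes are (x1, y1, x2, y2) and widths/heights carry a "+ 1".
float PixelIoU(const float* a, const float* b) {
  float area_a = (a[2] - a[0] + 1) * (a[3] - a[1] + 1);
  float area_b = (b[2] - b[0] + 1) * (b[3] - b[1] + 1);
  float iw = std::min(a[2], b[2]) - std::max(a[0], b[0]) + 1;
  float ih = std::min(a[3], b[3]) - std::max(a[1], b[1]) + 1;
  iw = std::max(iw, 0.0f);  // disjoint boxes: zero intersection
  ih = std::max(ih, 0.0f);
  float inter = iw * ih;
  return inter / (area_a + area_b - inter);
}
```

The BoxCoder edits below follow from the same convention: anchor widths gain + 1.0 and decoded right/bottom edges lose 1, and the exp() arguments are clamped at log(1000 / 16) so a wild regression delta cannot blow a proposal up toward infinity.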
diff --git a/paddle/fluid/operators/detection/generate_proposals_op.cc b/paddle/fluid/operators/detection/generate_proposals_op.cc index ebe6830eccd87a156768eb0d4b96220bcc9f4edc..c33aa255362bc5234f2813fb93e70c943b03c33f 100644 --- a/paddle/fluid/operators/detection/generate_proposals_op.cc +++ b/paddle/fluid/operators/detection/generate_proposals_op.cc @@ -89,12 +89,11 @@ void BoxCoder(const platform::DeviceContext &ctx, Tensor *all_anchors, } for (int64_t i = 0; i < row; ++i) { - T anchor_width = anchor_data[i * len + 2] - anchor_data[i * len]; - T anchor_height = anchor_data[i * len + 3] - anchor_data[i * len + 1]; + T anchor_width = anchor_data[i * len + 2] - anchor_data[i * len] + 1.0; + T anchor_height = anchor_data[i * len + 3] - anchor_data[i * len + 1] + 1.0; - T anchor_center_x = (anchor_data[i * len + 2] + anchor_data[i * len]) / 2; - T anchor_center_y = - (anchor_data[i * len + 3] + anchor_data[i * len + 1]) / 2; + T anchor_center_x = anchor_data[i * len] + 0.5 * anchor_width; + T anchor_center_y = anchor_data[i * len + 1] + 0.5 * anchor_height; T bbox_center_x = 0, bbox_center_y = 0; T bbox_width = 0, bbox_height = 0; @@ -106,25 +105,31 @@ void BoxCoder(const platform::DeviceContext &ctx, Tensor *all_anchors, bbox_center_y = variances_data[i * len + 1] * bbox_deltas_data[i * len + 1] * anchor_height + anchor_center_y; - bbox_width = std::exp(variances_data[i * len + 2] * - bbox_deltas_data[i * len + 2]) * + bbox_width = std::exp(std::min(variances_data[i * len + 2] * + bbox_deltas_data[i * len + 2], + std::log(1000.0 / 16.0))) * anchor_width; - bbox_height = std::exp(variances_data[i * len + 3] * - bbox_deltas_data[i * len + 3]) * + bbox_height = std::exp(std::min(variances_data[i * len + 3] * + bbox_deltas_data[i * len + 3], + std::log(1000.0 / 16.0))) * anchor_height; } else { bbox_center_x = bbox_deltas_data[i * len] * anchor_width + anchor_center_x; bbox_center_y = bbox_deltas_data[i * len + 1] * anchor_height + anchor_center_y; - bbox_width = std::exp(bbox_deltas_data[i * len + 2]) * anchor_width; - bbox_height = std::exp(bbox_deltas_data[i * len + 3]) * anchor_height; + bbox_width = std::exp(std::min(bbox_deltas_data[i * len + 2], + std::log(1000.0 / 16.0))) * + anchor_width; + bbox_height = std::exp(std::min(bbox_deltas_data[i * len + 3], + std::log(1000.0 / 16.0))) * + anchor_height; } proposals_data[i * len] = bbox_center_x - bbox_width / 2; proposals_data[i * len + 1] = bbox_center_y - bbox_height / 2; - proposals_data[i * len + 2] = bbox_center_x + bbox_width / 2; - proposals_data[i * len + 3] = bbox_center_y + bbox_height / 2; + proposals_data[i * len + 2] = bbox_center_x + bbox_width / 2 - 1; + proposals_data[i * len + 3] = bbox_center_y + bbox_height / 2 - 1; } // return proposals; } @@ -156,18 +161,23 @@ void FilterBoxes(const platform::DeviceContext &ctx, Tensor *boxes, float min_size, const Tensor &im_info, Tensor *keep) { const T *im_info_data = im_info.data(); T *boxes_data = boxes->mutable_data(ctx.GetPlace()); - min_size *= im_info_data[2]; + T im_scale = im_info_data[2]; keep->Resize({boxes->dims()[0], 1}); + min_size = std::max(min_size, 1.0f); int *keep_data = keep->mutable_data(ctx.GetPlace()); int keep_len = 0; for (int i = 0; i < boxes->dims()[0]; ++i) { T ws = boxes_data[4 * i + 2] - boxes_data[4 * i] + 1; T hs = boxes_data[4 * i + 3] - boxes_data[4 * i + 1] + 1; + T ws_origin_scale = + (boxes_data[4 * i + 2] - boxes_data[4 * i]) / im_scale + 1; + T hs_origin_scale = + (boxes_data[4 * i + 3] - boxes_data[4 * i + 1]) / im_scale + 1; T x_ctr = 
boxes_data[4 * i] + ws / 2;
  T y_ctr = boxes_data[4 * i + 1] + hs / 2;
-    if (ws >= min_size && hs >= min_size && x_ctr <= im_info_data[1] &&
-        y_ctr <= im_info_data[0]) {
+    if (ws_origin_scale >= min_size && hs_origin_scale >= min_size &&
+        x_ctr <= im_info_data[1] && y_ctr <= im_info_data[0]) {
  keep_data[keep_len++] = i;
  }
  }
@@ -218,8 +228,8 @@ T JaccardOverlap(const T *box1, const T *box2, const bool normalized) {
  const T inter_ymin = std::max(box1[1], box2[1]);
  const T inter_xmax = std::min(box1[2], box2[2]);
  const T inter_ymax = std::min(box1[3], box2[3]);
-  const T inter_w = inter_xmax - inter_xmin;
-  const T inter_h = inter_ymax - inter_ymin;
+  const T inter_w = std::max(0.0f, inter_xmax - inter_xmin + 1);
+  const T inter_h = std::max(0.0f, inter_ymax - inter_ymin + 1);
  const T inter_area = inter_w * inter_h;
  const T bbox1_area = BBoxArea(box1, normalized);
  const T bbox2_area = BBoxArea(box2, normalized);
diff --git a/paddle/fluid/operators/detection/rpn_target_assign_op.cc b/paddle/fluid/operators/detection/rpn_target_assign_op.cc
index 88757f25cd9a5789758640de2d9cae0b12350b25..dda423efd35b96f5e1d7c55389818f46ef3d8694 100644
--- a/paddle/fluid/operators/detection/rpn_target_assign_op.cc
+++ b/paddle/fluid/operators/detection/rpn_target_assign_op.cc
@@ -31,8 +31,14 @@ class RpnTargetAssignOp : public framework::OperatorWithKernel {
  using framework::OperatorWithKernel::OperatorWithKernel;
  void InferShape(framework::InferShapeContext* ctx) const override {
-    PADDLE_ENFORCE(ctx->HasInput("DistMat"),
-                   "Input(DistMat) of RpnTargetAssignOp should not be null");
+    PADDLE_ENFORCE(ctx->HasInput("Anchor"),
+                   "Input(Anchor) of RpnTargetAssignOp should not be null");
+    PADDLE_ENFORCE(ctx->HasInput("GtBoxes"),
+                   "Input(GtBoxes) of RpnTargetAssignOp should not be null");
+    PADDLE_ENFORCE(ctx->HasInput("IsCrowd"),
+                   "Input(IsCrowd) of RpnTargetAssignOp should not be null");
+    PADDLE_ENFORCE(ctx->HasInput("ImInfo"),
+                   "Input(ImInfo) of RpnTargetAssignOp should not be null");
  PADDLE_ENFORCE(
  ctx->HasOutput("LocationIndex"),
@@ -43,10 +49,20 @@ class RpnTargetAssignOp : public framework::OperatorWithKernel {
  PADDLE_ENFORCE(
  ctx->HasOutput("TargetLabel"),
  "Output(TargetLabel) of RpnTargetAssignOp should not be null");
-
-    auto in_dims = ctx->GetInputDim("DistMat");
-    PADDLE_ENFORCE_EQ(in_dims.size(), 2,
-                      "The rank of Input(DistMat) must be 2.");
+    PADDLE_ENFORCE(
+        ctx->HasOutput("TargetBBox"),
+        "Output(TargetBBox) of RpnTargetAssignOp should not be null");
+
+    auto anchor_dims = ctx->GetInputDim("Anchor");
+    auto gt_boxes_dims = ctx->GetInputDim("GtBoxes");
+    auto is_crowd_dims = ctx->GetInputDim("IsCrowd");
+    auto im_info_dims = ctx->GetInputDim("ImInfo");
+    PADDLE_ENFORCE_EQ(anchor_dims.size(), 2,
+                      "The rank of Input(Anchor) must be 2.");
+    PADDLE_ENFORCE_EQ(gt_boxes_dims.size(), 2,
+                      "The rank of Input(GtBoxes) must be 2.");
+    PADDLE_ENFORCE_EQ(im_info_dims.size(), 2,
+                      "The rank of Input(ImInfo) must be 2.");
  ctx->SetOutputDim("LocationIndex", {-1});
  ctx->SetOutputDim("ScoreIndex", {-1});
@@ -59,198 +75,383 @@ class RpnTargetAssignOp : public framework::OperatorWithKernel {
  const framework::ExecutionContext& ctx) const override {
  return framework::OpKernelType(
  framework::ToDataType(
-            ctx.Input<framework::LoDTensor>("DistMat")->type()),
+            ctx.Input<framework::LoDTensor>("Anchor")->type()),
  platform::CPUPlace());
  }
};
template <typename T>
-class RpnTargetAssignKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& context) const override {
-    auto* anchor_t = context.Input<Tensor>("Anchor");  // (H*W*A) * 4
-    auto*
gt_bbox_t = context.Input("GtBox"); - auto* dist_t = context.Input("DistMat"); +void AppendRpns(LoDTensor* out, int64_t offset, Tensor* to_add) { + auto* out_data = out->data(); + auto* to_add_data = to_add->data(); + memcpy(out_data + offset, to_add_data, to_add->numel() * sizeof(T)); +} + +template +std::vector FilterStraddleAnchor( + const platform::CPUDeviceContext& context, const Tensor* anchor, + const float rpn_straddle_thresh, T im_height, T im_width) { + std::vector inds_inside; + int anchor_num = anchor->dims()[0]; + auto* anchor_data = anchor->data(); + if (rpn_straddle_thresh >= 0) { + int index; + for (int i = 0; i < anchor_num; ++i) { + index = i * 4; + if ((anchor_data[index + 0] >= -rpn_straddle_thresh) && + (anchor_data[index + 1] >= -rpn_straddle_thresh) && + (anchor_data[index + 2] < im_width + rpn_straddle_thresh) && + (anchor_data[index + 3] < im_height + rpn_straddle_thresh)) { + inds_inside.emplace_back(i); + } + } + } else { + for (int i = 0; i < anchor_num; ++i) { + inds_inside.emplace_back(i); + } + } + int inside_num = inds_inside.size(); + Tensor inds_inside_t; + int* inds_inside_data = + inds_inside_t.mutable_data({inside_num}, context.GetPlace()); + std::copy(inds_inside.begin(), inds_inside.end(), inds_inside_data); + Tensor inside_anchor_t; + T* inside_anchor_data = + inside_anchor_t.mutable_data({inside_num, 4}, context.GetPlace()); + Gather(anchor->data(), 4, inds_inside_data, inside_num, + inside_anchor_data); + std::vector res; + res.emplace_back(inds_inside_t); + res.emplace_back(inside_anchor_t); + return res; +} + +template +Tensor FilterCrowdGt(const platform::CPUDeviceContext& context, + Tensor* gt_boxes, Tensor* is_crowd) { + int gt_num = gt_boxes->dims()[0]; + std::vector not_crowd_inds; + auto* is_crowd_data = is_crowd->data(); + for (int i = 0; i < gt_num; ++i) { + if (is_crowd_data[i] == 0) { + not_crowd_inds.emplace_back(i); + } + } + int ncrowd_num = not_crowd_inds.size(); + Tensor ncrowd_gt_boxes; + T* ncrowd_gt_boxes_data = + ncrowd_gt_boxes.mutable_data({ncrowd_num, 4}, context.GetPlace()); + Gather(gt_boxes->data(), 4, not_crowd_inds.data(), ncrowd_num, + ncrowd_gt_boxes_data); + return ncrowd_gt_boxes; +} + +void ReservoirSampling(const int num, std::vector* inds, + std::minstd_rand engine, bool use_random) { + std::uniform_real_distribution uniform(0, 1); + size_t len = inds->size(); + if (len > static_cast(num)) { + if (use_random) { + for (size_t i = num; i < len; ++i) { + int rng_ind = std::floor(uniform(engine) * i); + if (rng_ind < num) + std::iter_swap(inds->begin() + rng_ind, inds->begin() + i); + } + } + inds->resize(num); + } +} + +template +void ScoreAssign(const T* anchor_by_gt_overlap_data, + const Tensor& anchor_to_gt_max, const Tensor& gt_to_anchor_max, + const int rpn_batch_size_per_im, const float rpn_fg_fraction, + const float rpn_positive_overlap, + const float rpn_negative_overlap, std::vector* fg_inds, + std::vector* bg_inds, std::vector* tgt_lbl, + std::minstd_rand engine, bool use_random) { + float epsilon = 0.00001; + int anchor_num = anchor_to_gt_max.dims()[0]; + int gt_num = gt_to_anchor_max.dims()[0]; + std::vector target_label(anchor_num, -1); + std::vector fg_inds_fake; + std::vector bg_inds_fake; + const T* anchor_to_gt_max_data = anchor_to_gt_max.data(); + const T* gt_to_anchor_max_data = gt_to_anchor_max.data(); + // TODO(buxingyuan): Match with Detectron now + // but it seems here is a bug in two directions assignment + // in which the later one may overwrites the former one. 
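ReservoirSampling above is now the only source of randomness in the op, gated by use_random (the fix_seed/seed attributes are gone). A self-contained sketch of the same scheme with a usage example; each element survives with equal probability num/len:

```cpp
#include <algorithm>
#include <cmath>
#include <cstdio>
#include <random>
#include <vector>

// Same scheme as ReservoirSampling above: keep the first `num` items,
// then let each later item i displace a kept one with probability num/i,
// which leaves every element an equal chance of being selected.
void ReservoirSample(int num, std::vector<int>* inds,
                     std::minstd_rand* engine) {
  std::uniform_real_distribution<float> uniform(0, 1);
  if (inds->size() <= static_cast<size_t>(num)) return;
  for (size_t i = num; i < inds->size(); ++i) {
    int r = static_cast<int>(std::floor(uniform(*engine) * i));
    if (r < num) std::iter_swap(inds->begin() + r, inds->begin() + i);
  }
  inds->resize(num);
}

int main() {
  std::minstd_rand engine(0);
  std::vector<int> fg_inds{0, 1, 2, 3, 4, 5, 6, 7};
  ReservoirSample(3, &fg_inds, &engine);        // e.g. cap fg anchors at 3
  for (int v : fg_inds) std::printf("%d ", v);  // a uniform 3-subset
}
```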
+ for (int64_t i = 0; i < anchor_num; ++i) { + bool is_anchors_with_max_overlap = false; + for (int64_t j = 0; j < gt_num; ++j) { + T value = anchor_by_gt_overlap_data[i * gt_num + j]; + T diff = std::abs(value - gt_to_anchor_max_data[j]); + if (diff < epsilon) { + is_anchors_with_max_overlap = true; + break; + } + } + bool is_anchor_great_than_thresh = + (anchor_to_gt_max_data[i] >= rpn_positive_overlap); + if (is_anchors_with_max_overlap || is_anchor_great_than_thresh) { + fg_inds_fake.push_back(i); + } + } - auto* loc_index_t = context.Output("LocationIndex"); - auto* score_index_t = context.Output("ScoreIndex"); - auto* tgt_bbox_t = context.Output("TargetBBox"); - auto* tgt_lbl_t = context.Output("TargetLabel"); + // Reservoir Sampling + int fg_num = static_cast(rpn_fg_fraction * rpn_batch_size_per_im); + ReservoirSampling(fg_num, &fg_inds_fake, engine, use_random); + fg_num = static_cast(fg_inds_fake.size()); + for (int64_t i = 0; i < fg_num; ++i) { + target_label[fg_inds_fake[i]] = 1; + } - auto lod = dist_t->lod().back(); - int64_t batch_num = static_cast(lod.size() - 1); - int64_t anchor_num = dist_t->dims()[1]; - PADDLE_ENFORCE_EQ(anchor_num, anchor_t->dims()[0]); + int bg_num = rpn_batch_size_per_im - fg_num; + for (int64_t i = 0; i < anchor_num; ++i) { + if (anchor_to_gt_max_data[i] < rpn_negative_overlap) { + bg_inds_fake.push_back(i); + } + } + ReservoirSampling(bg_num, &bg_inds_fake, engine, use_random); + bg_num = static_cast(bg_inds_fake.size()); + for (int64_t i = 0; i < bg_num; ++i) { + target_label[bg_inds_fake[i]] = 0; + } - int rpn_batch_size = context.Attr("rpn_batch_size_per_im"); - float pos_threshold = context.Attr("rpn_positive_overlap"); - float neg_threshold = context.Attr("rpn_negative_overlap"); - float fg_fraction = context.Attr("fg_fraction"); + for (int64_t i = 0; i < anchor_num; ++i) { + if (target_label[i] == 1) fg_inds->emplace_back(i); + if (target_label[i] == 0) bg_inds->emplace_back(i); + } + fg_num = fg_inds->size(); + bg_num = bg_inds->size(); + + tgt_lbl->resize(fg_num + bg_num, 0); + std::vector fg_lbl(fg_num, 1); + std::vector bg_lbl(bg_num, 0); + std::copy(fg_lbl.begin(), fg_lbl.end(), tgt_lbl->data()); + std::copy(bg_lbl.begin(), bg_lbl.end(), tgt_lbl->data() + fg_num); +} + +template +std::vector SampleRpnFgBgGt(const platform::CPUDeviceContext& ctx, + const Tensor& anchor_by_gt_overlap, + const int rpn_batch_size_per_im, + const float rpn_positive_overlap, + const float rpn_negative_overlap, + const float rpn_fg_fraction, + std::minstd_rand engine, bool use_random) { + auto* anchor_by_gt_overlap_data = anchor_by_gt_overlap.data(); + int anchor_num = anchor_by_gt_overlap.dims()[0]; + int gt_num = anchor_by_gt_overlap.dims()[1]; + + std::vector fg_inds; + std::vector bg_inds; + std::vector gt_inds; + std::vector tgt_lbl; + + // Calculate the max IoU between anchors and gt boxes + // Map from anchor to gt box that has highest overlap + auto place = ctx.GetPlace(); + Tensor anchor_to_gt_max, anchor_to_gt_argmax, gt_to_anchor_max; + anchor_to_gt_max.mutable_data({anchor_num}, place); + int* argmax = anchor_to_gt_argmax.mutable_data({anchor_num}, place); + gt_to_anchor_max.mutable_data({gt_num}, place); + + auto anchor_by_gt_overlap_et = + framework::EigenMatrix::From(anchor_by_gt_overlap); + auto anchor_to_gt_max_et = + framework::EigenVector::Flatten(anchor_to_gt_max); + auto gt_to_anchor_max_et = + framework::EigenVector::Flatten(gt_to_anchor_max); + auto anchor_to_gt_argmax_et = + framework::EigenVector::Flatten(anchor_to_gt_argmax); + 
anchor_to_gt_max_et = + anchor_by_gt_overlap_et.maximum(Eigen::DSizes(1)); + anchor_to_gt_argmax_et = + anchor_by_gt_overlap_et.argmax(1).template cast(); + gt_to_anchor_max_et = + anchor_by_gt_overlap_et.maximum(Eigen::DSizes(0)); + + // Follow the Faster RCNN's implementation + ScoreAssign(anchor_by_gt_overlap_data, anchor_to_gt_max, gt_to_anchor_max, + rpn_batch_size_per_im, rpn_fg_fraction, rpn_positive_overlap, + rpn_negative_overlap, &fg_inds, &bg_inds, &tgt_lbl, engine, + use_random); + + int fg_num = fg_inds.size(); + int bg_num = bg_inds.size(); + gt_inds.reserve(fg_num); + for (int i = 0; i < fg_num; ++i) { + gt_inds.emplace_back(argmax[fg_inds[i]]); + } - int fg_num_per_batch = static_cast(rpn_batch_size * fg_fraction); + Tensor loc_index_t, score_index_t, tgt_lbl_t, gt_inds_t; + int* loc_index_data = loc_index_t.mutable_data({fg_num}, place); + int* score_index_data = + score_index_t.mutable_data({fg_num + bg_num}, place); + int* tgt_lbl_data = tgt_lbl_t.mutable_data({fg_num + bg_num}, place); + int* gt_inds_data = gt_inds_t.mutable_data({fg_num}, place); + std::copy(fg_inds.begin(), fg_inds.end(), loc_index_data); + std::copy(fg_inds.begin(), fg_inds.end(), score_index_data); + std::copy(bg_inds.begin(), bg_inds.end(), score_index_data + fg_num); + std::copy(tgt_lbl.begin(), tgt_lbl.end(), tgt_lbl_data); + std::copy(gt_inds.begin(), gt_inds.end(), gt_inds_data); + std::vector loc_score_tgtlbl_gt; + loc_score_tgtlbl_gt.emplace_back(loc_index_t); + loc_score_tgtlbl_gt.emplace_back(score_index_t); + loc_score_tgtlbl_gt.emplace_back(tgt_lbl_t); + loc_score_tgtlbl_gt.emplace_back(gt_inds_t); + + return loc_score_tgtlbl_gt; +} - int64_t max_num = batch_num * anchor_num; +template +class RpnTargetAssignKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { + auto* anchor = context.Input("Anchor"); // (H*W*A) * 4 + auto* gt_boxes = context.Input("GtBoxes"); + auto* is_crowd = context.Input("IsCrowd"); + auto* im_info = context.Input("ImInfo"); + + auto* loc_index = context.Output("LocationIndex"); + auto* score_index = context.Output("ScoreIndex"); + auto* tgt_bbox = context.Output("TargetBBox"); + auto* tgt_lbl = context.Output("TargetLabel"); + + PADDLE_ENFORCE_EQ(gt_boxes->lod().size(), 1UL, + "RpnTargetAssignOp gt_boxes needs 1 level of LoD"); + PADDLE_ENFORCE_EQ(is_crowd->lod().size(), 1UL, + "RpnTargetAssignOp is_crowd needs 1 level of LoD"); + int64_t anchor_num = static_cast(anchor->dims()[0]); + int64_t batch_num = static_cast(gt_boxes->lod().back().size() - 1); + + int rpn_batch_size_per_im = context.Attr("rpn_batch_size_per_im"); + float rpn_straddle_thresh = context.Attr("rpn_straddle_thresh"); + float rpn_positive_overlap = context.Attr("rpn_positive_overlap"); + float rpn_negative_overlap = context.Attr("rpn_negative_overlap"); + float rpn_fg_fraction = context.Attr("rpn_fg_fraction"); + bool use_random = context.Attr("use_random"); + + int64_t max_num = batch_num * rpn_batch_size_per_im; auto place = context.GetPlace(); - tgt_bbox_t->mutable_data({max_num, 4}, place); - auto* loc_index = loc_index_t->mutable_data({max_num}, place); - auto* score_index = score_index_t->mutable_data({max_num}, place); + loc_index->mutable_data({max_num}, place); + score_index->mutable_data({max_num}, place); + tgt_bbox->mutable_data({max_num, 4}, place); + tgt_lbl->mutable_data({max_num, 1}, place); - Tensor tmp_tgt_lbl; - auto* tmp_lbl_data = tmp_tgt_lbl.mutable_data({max_num}, place); auto& dev_ctx = 
context.device_context(); - math::SetConstant iset; - iset(dev_ctx, &tmp_tgt_lbl, static_cast(-1)); std::random_device rnd; std::minstd_rand engine; - int seed = - context.Attr("fix_seed") ? context.Attr("seed") : rnd(); + int seed = rnd(); engine.seed(seed); - int fg_num = 0; - int bg_num = 0; + framework::LoD lod_loc, loc_score; + std::vector lod0_loc(1, 0); + std::vector lod0_score(1, 0); + + int total_loc_num = 0; + int total_score_num = 0; + auto gt_boxes_lod = gt_boxes->lod().back(); + auto is_crowd_lod = is_crowd->lod().back(); for (int i = 0; i < batch_num; ++i) { - Tensor dist = dist_t->Slice(lod[i], lod[i + 1]); - Tensor gt_bbox = gt_bbox_t->Slice(lod[i], lod[i + 1]); - auto fg_bg_gt = SampleFgBgGt(dev_ctx, dist, pos_threshold, neg_threshold, - rpn_batch_size, fg_num_per_batch, engine, - tmp_lbl_data + i * anchor_num); - - int cur_fg_num = fg_bg_gt[0].size(); - int cur_bg_num = fg_bg_gt[1].size(); - std::transform(fg_bg_gt[0].begin(), fg_bg_gt[0].end(), loc_index, - [i, anchor_num](int d) { return d + i * anchor_num; }); - memcpy(score_index, loc_index, cur_fg_num * sizeof(int)); - std::transform(fg_bg_gt[1].begin(), fg_bg_gt[1].end(), - score_index + cur_fg_num, - [i, anchor_num](int d) { return d + i * anchor_num; }); + Tensor gt_boxes_slice = + gt_boxes->Slice(gt_boxes_lod[i], gt_boxes_lod[i + 1]); + Tensor is_crowd_slice = + is_crowd->Slice(is_crowd_lod[i], is_crowd_lod[i + 1]); + Tensor im_info_slice = im_info->Slice(i, i + 1); + auto* im_info_data = im_info_slice.data(); + auto im_height = im_info_data[0]; + auto im_width = im_info_data[1]; + auto im_scale = im_info_data[2]; + + // Filter straddle anchor + std::vector filter_output = FilterStraddleAnchor( + dev_ctx, anchor, rpn_straddle_thresh, im_height, im_width); + Tensor inds_inside = filter_output[0]; + Tensor inside_anchor = filter_output[1]; + + // Filter crowd gt + Tensor ncrowd_gt_boxes = + FilterCrowdGt(dev_ctx, >_boxes_slice, &is_crowd_slice); + auto ncrowd_gt_boxes_et = + framework::EigenTensor::From(ncrowd_gt_boxes); + ncrowd_gt_boxes_et = ncrowd_gt_boxes_et * im_scale; + + Tensor anchor_by_gt_overlap; + anchor_by_gt_overlap.mutable_data( + {inside_anchor.dims()[0], ncrowd_gt_boxes.dims()[0]}, place); + BboxOverlaps(inside_anchor, ncrowd_gt_boxes, &anchor_by_gt_overlap); + + auto loc_score_tgtlbl_gt = SampleRpnFgBgGt( + dev_ctx, anchor_by_gt_overlap, rpn_batch_size_per_im, + rpn_positive_overlap, rpn_negative_overlap, rpn_fg_fraction, engine, + use_random); + + Tensor sampled_loc_index = loc_score_tgtlbl_gt[0]; + Tensor sampled_score_index = loc_score_tgtlbl_gt[1]; + Tensor sampled_tgtlbl = loc_score_tgtlbl_gt[2]; + Tensor sampled_gt_index = loc_score_tgtlbl_gt[3]; + + int loc_num = sampled_loc_index.dims()[0]; + int score_num = sampled_score_index.dims()[0]; + // unmap to all anchor + Tensor sampled_loc_index_unmap, sampled_score_index_unmap; + sampled_loc_index_unmap.mutable_data({loc_num}, place); + sampled_score_index_unmap.mutable_data({score_num}, place); + Gather(inds_inside.data(), 1, sampled_loc_index.data(), + loc_num, sampled_loc_index_unmap.data()); + Gather(inds_inside.data(), 1, sampled_score_index.data(), + score_num, sampled_score_index_unmap.data()); // get target bbox deltas - if (cur_fg_num) { - Tensor fg_gt; - T* gt_data = fg_gt.mutable_data({cur_fg_num, 4}, place); - Tensor tgt_bbox = tgt_bbox_t->Slice(fg_num, fg_num + cur_fg_num); - T* tgt_data = tgt_bbox.data(); - Gather(anchor_t->data(), 4, - reinterpret_cast(&fg_bg_gt[0][0]), cur_fg_num, - tgt_data); - Gather(gt_bbox.data(), 4, 
reinterpret_cast(&fg_bg_gt[2][0]), - cur_fg_num, gt_data); - BoxToDelta(cur_fg_num, tgt_bbox, fg_gt, nullptr, false, &tgt_bbox); - } - - loc_index += cur_fg_num; - score_index += cur_fg_num + cur_bg_num; - fg_num += cur_fg_num; - bg_num += cur_bg_num; - } - - int lbl_num = fg_num + bg_num; - PADDLE_ENFORCE_LE(fg_num, max_num); - PADDLE_ENFORCE_LE(lbl_num, max_num); - - tgt_bbox_t->Resize({fg_num, 4}); - loc_index_t->Resize({fg_num}); - score_index_t->Resize({lbl_num}); - auto* lbl_data = tgt_lbl_t->mutable_data({lbl_num, 1}, place); - Gather(tmp_lbl_data, 1, score_index_t->data(), lbl_num, - lbl_data); - } - - private: - void ScoreAssign(const T* dist_data, const Tensor& anchor_to_gt_max, - const int row, const int col, const float pos_threshold, - const float neg_threshold, int64_t* target_label, - std::vector* fg_inds, std::vector* bg_inds) const { - float epsilon = 0.0001; - for (int64_t i = 0; i < row; ++i) { - const T* v = dist_data + i * col; - T max = *std::max_element(v, v + col); - for (int64_t j = 0; j < col; ++j) { - if (std::abs(max - v[j]) < epsilon) { - target_label[j] = 1; - } - } - } - - // Pick the fg/bg - const T* anchor_to_gt_max_data = anchor_to_gt_max.data(); - for (int64_t j = 0; j < col; ++j) { - if (anchor_to_gt_max_data[j] >= pos_threshold) { - target_label[j] = 1; - } else if (anchor_to_gt_max_data[j] < neg_threshold) { - target_label[j] = 0; - } - if (target_label[j] == 1) { - fg_inds->push_back(j); - } else if (target_label[j] == 0) { - bg_inds->push_back(j); - } + Tensor sampled_anchor, sampled_gt, sampled_tgt_bbox; + auto* sampled_anchor_data = + sampled_anchor.mutable_data({loc_num, 4}, place); + auto* sampled_gt_data = sampled_gt.mutable_data({loc_num, 4}, place); + Gather(anchor->data(), 4, sampled_loc_index_unmap.data(), + loc_num, sampled_anchor_data); + Gather(ncrowd_gt_boxes.data(), 4, sampled_gt_index.data(), + loc_num, sampled_gt_data); + sampled_tgt_bbox.mutable_data({loc_num, 4}, place); + BoxToDelta(loc_num, sampled_anchor, sampled_gt, nullptr, false, + &sampled_tgt_bbox); + + // Add anchor offset + int anchor_offset = i * anchor_num; + auto sampled_loc_index_unmap_et = + framework::EigenTensor::From(sampled_loc_index_unmap); + sampled_loc_index_unmap_et = sampled_loc_index_unmap_et + anchor_offset; + auto sampled_score_index_unmap_et = + framework::EigenTensor::From(sampled_score_index_unmap); + sampled_score_index_unmap_et = + sampled_score_index_unmap_et + anchor_offset; + AppendRpns(loc_index, total_loc_num, &sampled_loc_index_unmap); + AppendRpns(score_index, total_score_num, &sampled_score_index_unmap); + AppendRpns(tgt_bbox, total_loc_num * 4, &sampled_tgt_bbox); + AppendRpns(tgt_lbl, total_score_num, &sampled_tgtlbl); + total_loc_num += loc_num; + + total_score_num += score_num; + lod0_loc.emplace_back(total_loc_num); + lod0_score.emplace_back(total_score_num); } - } - - void ReservoirSampling(const int num, std::minstd_rand engine, - std::vector* inds) const { - std::uniform_real_distribution uniform(0, 1); - size_t len = inds->size(); - if (len > static_cast(num)) { - for (size_t i = num; i < len; ++i) { - int rng_ind = std::floor(uniform(engine) * i); - if (rng_ind < num) - std::iter_swap(inds->begin() + rng_ind, inds->begin() + i); - } - inds->resize(num); - } - } - // std::vector> RpnTargetAssign( - std::vector> SampleFgBgGt( - const platform::CPUDeviceContext& ctx, const Tensor& dist, - const float pos_threshold, const float neg_threshold, - const int rpn_batch_size, const int fg_num, std::minstd_rand engine, - int64_t* target_label) 
const { - auto* dist_data = dist.data(); - int row = dist.dims()[0]; - int col = dist.dims()[1]; - - std::vector fg_inds; - std::vector bg_inds; - std::vector gt_inds; - - // Calculate the max IoU between anchors and gt boxes - // Map from anchor to gt box that has highest overlap - auto place = ctx.GetPlace(); - Tensor anchor_to_gt_max, anchor_to_gt_argmax; - anchor_to_gt_max.mutable_data({col}, place); - int* argmax = anchor_to_gt_argmax.mutable_data({col}, place); - - auto x = framework::EigenMatrix::From(dist); - auto x_col_max = framework::EigenVector::Flatten(anchor_to_gt_max); - auto x_col_argmax = - framework::EigenVector::Flatten(anchor_to_gt_argmax); - x_col_max = x.maximum(Eigen::DSizes(0)); - x_col_argmax = x.argmax(0).template cast(); - - // Follow the Faster RCNN's implementation - ScoreAssign(dist_data, anchor_to_gt_max, row, col, pos_threshold, - neg_threshold, target_label, &fg_inds, &bg_inds); - // Reservoir Sampling - ReservoirSampling(fg_num, engine, &fg_inds); - int fg_num2 = static_cast(fg_inds.size()); - int bg_num = rpn_batch_size - fg_num2; - ReservoirSampling(bg_num, engine, &bg_inds); - - gt_inds.reserve(fg_num2); - for (int i = 0; i < fg_num2; ++i) { - gt_inds.emplace_back(argmax[fg_inds[i]]); - } - std::vector> fg_bg_gt; - fg_bg_gt.emplace_back(fg_inds); - fg_bg_gt.emplace_back(bg_inds); - fg_bg_gt.emplace_back(gt_inds); - - return fg_bg_gt; + PADDLE_ENFORCE_LE(total_loc_num, max_num); + PADDLE_ENFORCE_LE(total_score_num, max_num); + + lod_loc.emplace_back(lod0_loc); + loc_score.emplace_back(lod0_score); + loc_index->set_lod(lod_loc); + score_index->set_lod(loc_score); + tgt_bbox->set_lod(lod_loc); + tgt_lbl->set_lod(loc_score); + loc_index->Resize({total_loc_num}); + score_index->Resize({total_score_num}); + tgt_bbox->Resize({total_loc_num, 4}); + tgt_lbl->Resize({total_score_num, 1}); } }; @@ -259,18 +460,22 @@ class RpnTargetAssignOpMaker : public framework::OpProtoAndCheckerMaker { void Make() override { AddInput("Anchor", "(Tensor) input anchor is a 2-D Tensor with shape [H*W*A, 4]."); - AddInput("GtBox", "(LoDTensor) input groud-truth bbox with shape [K, 4]."); - AddInput( - "DistMat", - "(LoDTensor or Tensor) this input is a 2-D LoDTensor with shape " - "[K, M]. It is pair-wise distance matrix between the entities " - "represented by each row and each column. For example, assumed one " - "entity is A with shape [K], another entity is B with shape [M]. The " - "DistMat[i][j] is the distance between A[i] and B[j]. The bigger " - "the distance is, the better macthing the pairs are. Please note, " - "This tensor can contain LoD information to represent a batch of " - "inputs. One instance of this batch can contain different numbers of " - "entities."); + AddInput("GtBoxes", + "(LoDTensor) input groud-truth bbox with shape [K, 4]."); + AddInput("IsCrowd", + "(LoDTensor) input which indicates groud-truth is crowd."); + AddInput("ImInfo", + "(LoDTensor) input image information with shape [N, 3]. " + "N is the batch size, each image information includes height, " + "width and scale."); + AddAttr("rpn_batch_size_per_im", + "Total number of RPN examples per image.") + .SetDefault(256); + AddAttr( + "rpn_straddle_thresh", + "Remove RPN anchors that go outside the image by straddle_thresh " + "pixels, " + "Set to -1 or a large value, e.g. 
100000, to disable pruning anchors.");
  AddAttr<float>(
  "rpn_positive_overlap",
  "Minimum overlap required between an anchor and ground-truth "
@@ -282,20 +487,15 @@ class RpnTargetAssignOpMaker : public framework::OpProtoAndCheckerMaker {
  "box for the (anchor, gt box) pair to be a negative example.")
  .SetDefault(0.3);
  AddAttr<float>(
-      "fg_fraction",
+      "rpn_fg_fraction",
  "Target fraction of RoI minibatch that "
  "is labeled foreground (i.e. class > 0), 0-th class is background.")
  .SetDefault(0.25);
-  AddAttr<int>("rpn_batch_size_per_im",
-               "Total number of RPN examples per image.")
-      .SetDefault(256);
-  AddAttr<bool>("fix_seed",
-                "A flag indicating whether to use a fixed seed to generate "
-                "random mask. NOTE: DO NOT set this flag to true in "
-                "training. Setting this flag to true is only useful in "
-                "unittest.")
-      .SetDefault(false);
-  AddAttr<int>("seed", "RpnTargetAssign random seed.").SetDefault(0);
+  AddAttr<bool>("use_random",
+                "A flag indicating whether to use reservoir sampling. "
+                "NOTE: DO NOT set this flag to false in training. "
+                "Setting this flag to false is only useful in unittest.")
+      .SetDefault(true);
  AddOutput(
  "LocationIndex",
  "(Tensor), The indexes of foreground anchors in all RPN anchors, the "
@@ -308,16 +508,16 @@ class RpnTargetAssignOpMaker : public framework::OpProtoAndCheckerMaker {
  "ScoreIndex is [F + B], F and B are sampled foreground and background "
  "number.");
  AddOutput("TargetBBox",
-            "(Tensor), The target bbox deltas with shape "
+            "(Tensor), The target bbox deltas with shape "
  "[F, 4], F is the sampled foreground number.");
  AddOutput(
  "TargetLabel",
-      "(Tensor), The target labels of each anchor with shape "
+      "(Tensor), The target labels of each anchor with shape "
  "[F + B, 1], F and B are sampled foreground and background number.");
  AddComment(R"DOC(
-This operator can be, for given the IoU between the ground truth bboxes and the
+This operator can be, for a given set of ground truth bboxes and the
anchors, to assign classification and regression targets to each prediction.
-The Score index and LocationIndex will be generated according to the DistMat.
+The ScoreIndex and LocationIndex will be generated according to the
+anchor-to-ground-truth IoU.
The remaining anchors do not contribute to the RPN training loss.

ScoreIndex is composed of foreground anchor indexes (positive labels) and
diff --git a/paddle/fluid/operators/distributed/CMakeLists.txt b/paddle/fluid/operators/distributed/CMakeLists.txt
index da5d20505e9b06c0717af8d79d5456a9ade1e89c..56734b81e8716a0c0c37a11e35c9118ee7b55020 100644
--- a/paddle/fluid/operators/distributed/CMakeLists.txt
+++ b/paddle/fluid/operators/distributed/CMakeLists.txt
@@ -20,6 +20,7 @@ if(WITH_GRPC)
  DEPS grpc++_unsecure grpc_unsecure gpr cares zlib protobuf sendrecvop_grpc scope profiler math_function SERIAL)
  cc_test(rpc_server_test SRCS rpc_server_test.cc
  DEPS sendrecvop_grpc grpc++_unsecure grpc_unsecure gpr cares zlib protobuf executor proto_desc lookup_sparse_table_op SERIAL)
+  cc_test(varhandle_test SRCS varhandle_test.cc)
  return()
endif()
diff --git a/paddle/fluid/operators/distributed/grpc_client.cc b/paddle/fluid/operators/distributed/grpc_client.cc
index b4f60c9ff9a41d5cb7dbe4e7a7694a84bab8e940..e22bc552f85b85c75f06b4158f2abac2d3843256 100644
--- a/paddle/fluid/operators/distributed/grpc_client.cc
+++ b/paddle/fluid/operators/distributed/grpc_client.cc
@@ -59,40 +59,32 @@ GRPCClient::~GRPCClient() {
  }
  channels_.clear();
  }
-
  client_thread_->join();
}

-bool GRPCClient::AsyncSendVar(const std::string& ep,
-                              const platform::DeviceContext& ctx,
-                              const framework::Scope& scope,
-                              const std::string& var_name, int64_t time_out) {
+VarHandlePtr GRPCClient::AsyncSendVar(const std::string& ep,
+                                      const platform::DeviceContext& ctx,
+                                      const framework::Scope& scope,
+                                      const std::string& var_name,
+                                      int64_t time_out) {
  const platform::DeviceContext* p_ctx = &ctx;
  const std::string ep_val = ep;
  const std::string var_name_val = var_name;
  const framework::Scope* p_scope = &scope;
  const auto ch = GetChannel(ep_val);
+  SendProcessor* s = new SendProcessor(ch);
+  VarHandlePtr h(new VarHandle(ep, "Send", var_name_val, p_ctx, p_scope));
+  s->Prepare(h, time_out);

-  framework::AsyncIO([var_name_val, p_ctx, ep_val, p_scope, time_out, ch,
-                      this] {
+  framework::AsyncIO([var_name_val, p_scope, p_ctx, s, this] {
  auto* var = p_scope->FindVar(var_name_val);
  ::grpc::ByteBuffer req;
  SerializeToByteBuffer(var_name_val, var, *p_ctx, &req);

-    // varhandle
-    VarHandle var_h;
-    var_h.ep = ep_val;
-    var_h.scope = p_scope;
-    var_h.name = var_name_val;
-    var_h.ctx = p_ctx;
-    var_h.method = "Send";
-
-    VLOG(3) << var_h.String() << " begin";
+    VLOG(3) << s->GetVarHandlePtr()->String() << " begin";

  // stub context
-    SendProcessor* s = new SendProcessor(ch);
-    s->Prepare(var_h, time_out);
  s->response_call_back_ = nullptr;

  auto call = s->stub_g_.PrepareUnaryCall(
@@ -102,13 +94,13 @@ bool GRPCClient::AsyncSendVar(const std::string& ep,
  });
  req_count_++;

-  return true;
+  return h;
}

void ProcGetResponse(const VarHandle& var_h,
  const ::grpc::ByteBuffer& ret_msg) {
  framework::Variable* outvar = nullptr;
-  DeserializeFromByteBuffer(ret_msg, *var_h.ctx, var_h.scope, &outvar);
+  DeserializeFromByteBuffer(ret_msg, *var_h.ctx(), var_h.scope(), &outvar);
}

template <typename T>
@@ -119,37 +111,30 @@ void RequestToByteBuffer(const T& proto, ::grpc::ByteBuffer* result) {
  result->Swap(&tmp);
}

-bool GRPCClient::AsyncGetVar(const std::string& ep,
-                             const platform::DeviceContext& ctx,
-                             const framework::Scope& scope,
-                             const std::string& var_name, int64_t time_out) {
+VarHandlePtr GRPCClient::AsyncGetVar(const std::string& ep,
+                                     const platform::DeviceContext& ctx,
+                                     const framework::Scope& scope,
+                                     const std::string& var_name,
+                                     int64_t time_out) {
  const
platform::DeviceContext* p_ctx = &ctx; const std::string ep_val = ep; const std::string var_name_val = var_name; const framework::Scope* p_scope = &scope; const auto ch = GetChannel(ep_val); + GetProcessor* s = new GetProcessor(ch); + VarHandlePtr h(new VarHandle(ep, "Get", var_name_val, p_ctx, p_scope)); + s->Prepare(h, time_out); - framework::AsyncIO([var_name_val, ep_val, p_scope, p_ctx, time_out, ch, - this] { + framework::AsyncIO([var_name_val, p_scope, p_ctx, s, this] { // prepare input sendrecv::VariableMessage req; req.set_varname(var_name_val); ::grpc::ByteBuffer buf; RequestToByteBuffer(req, &buf); - // var handle - VarHandle var_h; - var_h.ep = ep_val; - var_h.scope = p_scope; - var_h.name = var_name_val; - var_h.ctx = p_ctx; - var_h.method = "Get"; - - VLOG(3) << var_h.String() << " begin"; + VLOG(3) << s->GetVarHandlePtr()->String() << " begin"; // stub context - GetProcessor* s = new GetProcessor(ch); - s->Prepare(var_h, time_out); s->response_call_back_ = ProcGetResponse; auto call = s->stub_g_.PrepareUnaryCall( @@ -160,42 +145,36 @@ bool GRPCClient::AsyncGetVar(const std::string& ep, req_count_++; - return true; + return h; } -bool GRPCClient::AsyncPrefetchVar(const std::string& ep, - const platform::DeviceContext& ctx, - const framework::Scope& scope, - const std::string& in_var_name, - const std::string& out_var_name, - int64_t time_out) { +VarHandlePtr GRPCClient::AsyncPrefetchVar(const std::string& ep, + const platform::DeviceContext& ctx, + const framework::Scope& scope, + const std::string& in_var_name, + const std::string& out_var_name, + int64_t time_out) { const platform::DeviceContext* p_ctx = &ctx; const std::string ep_val = ep; const std::string in_var_name_val = in_var_name; const std::string out_var_name_val = out_var_name; const framework::Scope* p_scope = &scope; const auto ch = GetChannel(ep_val); + GetProcessor* s = new GetProcessor(ch); + VarHandlePtr h( + new VarHandle(ep, "Prefetch", out_var_name_val, p_ctx, p_scope)); + s->Prepare(h, time_out); framework::AsyncIO([in_var_name_val, out_var_name_val, ep_val, p_scope, p_ctx, - time_out, ch, this] { + time_out, s, this] { auto* var = p_scope->FindVar(in_var_name_val); ::grpc::ByteBuffer req; SerializeToByteBuffer(in_var_name_val, var, *p_ctx, &req, out_var_name_val); - // var handle - VarHandle var_h; - var_h.ep = ep_val; - var_h.scope = p_scope; - var_h.name = out_var_name_val; - var_h.ctx = p_ctx; - var_h.method = "Prefetch"; - - VLOG(3) << var_h.String() << " begin"; + VLOG(3) << s->GetVarHandlePtr()->String() << " begin"; // stub context - GetProcessor* s = new GetProcessor(ch); - s->Prepare(var_h, time_out); s->response_call_back_ = ProcGetResponse; auto call = s->stub_g_.PrepareUnaryCall( @@ -206,56 +185,68 @@ bool GRPCClient::AsyncPrefetchVar(const std::string& ep, }); req_count_++; - return true; + return h; } -void GRPCClient::AsyncSendBatchBarrier(const std::string& ep, - int64_t time_out) { +VarHandlePtr GRPCClient::AsyncSendBatchBarrier(const std::string& ep, + int64_t time_out) { const auto ch = GetChannel(ep); BatchBarrierProcessor* s = new BatchBarrierProcessor(ch); - s->Prepare(time_out); + VarHandlePtr h(new VarHandle(ep, "BatchBarrier", BATCH_BARRIER_MESSAGE, + nullptr, nullptr)); + s->Prepare(h, time_out); sendrecv::VariableMessage req; req.set_varname(BATCH_BARRIER_MESSAGE); auto rpc = s->stub_->AsyncSendVariable(s->context_.get(), req, &cq_); rpc->Finish(&s->reply_, &s->status_, reinterpret_cast(s)); req_count_++; + return h; } -void GRPCClient::AsyncSendFetchBarrier(const std::string& 
ep, - int64_t time_out) { +VarHandlePtr GRPCClient::AsyncSendFetchBarrier(const std::string& ep, + int64_t time_out) { const auto ch = GetChannel(ep); FetchBarrierProcessor* s = new FetchBarrierProcessor(ch); - s->Prepare(time_out); + VarHandlePtr h(new VarHandle(ep, "FetchBarrier", FETCH_BARRIER_MESSAGE, + nullptr, nullptr)); + s->Prepare(h, time_out); sendrecv::VariableMessage req; req.set_varname(FETCH_BARRIER_MESSAGE); auto rpc = s->stub_->AsyncGetVariable(s->context_.get(), req, &cq_); rpc->Finish(&s->reply_, &s->status_, reinterpret_cast(s)); req_count_++; + return h; } -void GRPCClient::AsyncSendComplete(const std::string& ep, int64_t time_out) { +VarHandlePtr GRPCClient::AsyncSendComplete(const std::string& ep, + int64_t time_out) { const auto ch = GetChannel(ep); BatchBarrierProcessor* s = new BatchBarrierProcessor(ch); - s->Prepare(time_out); + VarHandlePtr h( + new VarHandle(ep, "SendComplete", COMPLETE_MESSAGE, nullptr, nullptr)); + s->Prepare(h, time_out); sendrecv::VariableMessage req; req.set_varname(COMPLETE_MESSAGE); auto rpc = s->stub_->AsyncSendVariable(s->context_.get(), req, &cq_); rpc->Finish(&s->reply_, &s->status_, reinterpret_cast(s)); req_count_++; + return h; } -void GRPCClient::AsyncCheckpointNotify(const std::string& ep, - const std::string& dir, - int64_t time_out) { +VarHandlePtr GRPCClient::AsyncCheckpointNotify(const std::string& ep, + const std::string& dir, + int64_t time_out) { const auto ch = GetChannel(ep); CheckpointNotifyProcessor* s = new CheckpointNotifyProcessor(ch); - s->Prepare(time_out); + VarHandlePtr h(new VarHandle(ep, "CheckPointNotify", CHECKPOINT_SAVE_MESSAGE, + nullptr, nullptr)); + s->Prepare(h, time_out); sendrecv::VariableMessage req; req.set_varname(CHECKPOINT_SAVE_MESSAGE); @@ -264,6 +255,7 @@ void GRPCClient::AsyncCheckpointNotify(const std::string& ep, auto rpc = s->stub_->AsyncCheckpointNotify(s->context_.get(), req, &cq_); rpc->Finish(&s->reply_, &s->status_, reinterpret_cast(s)); req_count_++; + return h; } bool GRPCClient::Wait() { @@ -276,32 +268,42 @@ void GRPCClient::Proceed() { void* tag = nullptr; bool ok = false; + VLOG(3) << "GRPCClient Proceed begin"; while (!stopped_ && cq_.Next(&tag, &ok)) { BaseProcessor* c = static_cast(tag); GPR_ASSERT(ok); PADDLE_ENFORCE(c); if (c->status_.ok()) { - VLOG(3) << c->var_h_.String() << " process"; + VLOG(3) << c->GetVarHandlePtr()->String() << " process"; c->Process(); } else if (c->status_.error_code() == grpc::StatusCode::DEADLINE_EXCEEDED) { - LOG(ERROR) << c->var_h_.String() + LOG(ERROR) << c->GetVarHandlePtr()->String() << " meets grpc error:" << c->status_.error_message(); { std::lock_guard lk(sync_mutex_); ok_ = false; } - sync_cond_.notify_all(); + c->Finish(false); } else { - LOG(FATAL) << c->var_h_.String() + LOG(FATAL) << c->GetVarHandlePtr()->String() << " meets grpc error:" << c->status_.error_message(); + c->Finish(false); } - delete c; + + bool notify = false; { std::lock_guard lk(sync_mutex_); req_count_--; + notify = (req_count_ <= 0 || !c->status_.ok()); + } + + delete c; + + if (notify) { + sync_cond_.notify_all(); } - sync_cond_.notify_all(); } + VLOG(3) << "GRPCClient Proceed end"; } std::shared_ptr GRPCClient::GetChannel(const std::string& ep) { diff --git a/paddle/fluid/operators/distributed/grpc_client.h b/paddle/fluid/operators/distributed/grpc_client.h index 0c95ffeb5ce7e1586c5968fb122acd12c0c0196e..75a3662316462a222760bfbb7d7906c70f46d143 100644 --- a/paddle/fluid/operators/distributed/grpc_client.h +++ b/paddle/fluid/operators/distributed/grpc_client.h @@ 
-53,15 +53,14 @@ void ProcGetResponse(const VarHandle& var_h, const grpc::ByteBuffer& msg); class BaseProcessor { public: - explicit BaseProcessor(std::shared_ptr ch) { - context_ = nullptr; - } + BaseProcessor() { context_ = nullptr; } virtual ~BaseProcessor() {} - virtual void Prepare(const VarHandle& var_info, int64_t time_out) { + virtual void Prepare(VarHandlePtr h, int64_t time_out) { + var_h_ = h; + context_.reset(new grpc::ClientContext()); - var_h_ = var_info; context_->set_wait_for_ready(true); if (time_out) { std::chrono::system_clock::time_point deadline = @@ -71,21 +70,21 @@ class BaseProcessor { } } - virtual void Prepare(int64_t time_out) { - context_.reset(new grpc::ClientContext()); - context_->set_wait_for_ready(true); - - std::chrono::system_clock::time_point deadline = - std::chrono::system_clock::now() + std::chrono::milliseconds(time_out); - - context_->set_deadline(deadline); + void Process() { + ProcessImpl(); + var_h_->Finish(true); } - virtual void Process() = 0; + VarHandlePtr GetVarHandlePtr() { return var_h_; } + bool Wait() { return var_h_->Wait(); } + void Finish(bool ok) { return var_h_->Finish(ok); } + virtual void ProcessImpl() = 0; std::unique_ptr context_; grpc::Status status_; - VarHandle var_h_; + + protected: + VarHandlePtr var_h_; }; typedef std::function @@ -94,13 +93,13 @@ typedef std::function class SendProcessor : public BaseProcessor { public: explicit SendProcessor(std::shared_ptr ch) - : BaseProcessor(ch), stub_g_(ch) {} + : BaseProcessor(), stub_g_(ch) {} virtual ~SendProcessor() {} - virtual void Process() { + void ProcessImpl() override { if (response_call_back_) { - response_call_back_(var_h_, reply_); + response_call_back_(*var_h_.get(), reply_); } } @@ -115,13 +114,13 @@ typedef std::function class GetProcessor : public BaseProcessor { public: explicit GetProcessor(std::shared_ptr ch) - : BaseProcessor(ch), stub_g_(ch) {} + : BaseProcessor(), stub_g_(ch) {} virtual ~GetProcessor() {} - virtual void Process() { + void ProcessImpl() override { if (response_call_back_) { - response_call_back_(var_h_, reply_); + response_call_back_(*var_h_.get(), reply_); } } @@ -133,13 +132,13 @@ class GetProcessor : public BaseProcessor { class BatchBarrierProcessor : public BaseProcessor { public: explicit BatchBarrierProcessor(std::shared_ptr ch) - : BaseProcessor(ch) { + : BaseProcessor() { stub_ = sendrecv::SendRecvService::NewStub(ch); } virtual ~BatchBarrierProcessor() {} - virtual void Process() {} + void ProcessImpl() override {} sendrecv::VoidMessage reply_; std::unique_ptr stub_; }; @@ -147,13 +146,13 @@ class BatchBarrierProcessor : public BaseProcessor { class FetchBarrierProcessor : public BaseProcessor { public: explicit FetchBarrierProcessor(std::shared_ptr ch) - : BaseProcessor(ch) { + : BaseProcessor() { stub_ = sendrecv::SendRecvService::NewStub(ch); } virtual ~FetchBarrierProcessor() {} - virtual void Process() {} + void ProcessImpl() override {} sendrecv::VariableMessage reply_; std::unique_ptr stub_; }; @@ -161,13 +160,13 @@ class FetchBarrierProcessor : public BaseProcessor { class CheckpointNotifyProcessor : public BaseProcessor { public: explicit CheckpointNotifyProcessor(std::shared_ptr ch) - : BaseProcessor(ch) { + : BaseProcessor() { stub_ = sendrecv::SendRecvService::NewStub(ch); } virtual ~CheckpointNotifyProcessor() {} - virtual void Process() {} + void ProcessImpl() override {} sendrecv::VoidMessage reply_; std::unique_ptr stub_; }; @@ -177,32 +176,37 @@ class GRPCClient : public RPCClient { GRPCClient() : ok_(true), 
completed_(false), stopped_(false) {} virtual ~GRPCClient(); - bool AsyncSendVar(const std::string& ep, const platform::DeviceContext& ctx, - const framework::Scope& scope, const std::string& var_name, - int64_t time_out = FLAGS_rpc_deadline) override; - - bool AsyncGetVar(const std::string& ep, const platform::DeviceContext& ctx, - const framework::Scope& scope, const std::string& var_name, - int64_t time_out = FLAGS_rpc_deadline) override; - - bool AsyncPrefetchVar(const std::string& ep, - const platform::DeviceContext& ctx, - const framework::Scope& scope, - const std::string& in_var_name, - const std::string& out_var_name, - int64_t time_out = FLAGS_rpc_deadline) override; - - void AsyncSendBatchBarrier(const std::string& ep, - int64_t time_out = FLAGS_rpc_deadline) override; - - void AsyncSendFetchBarrier(const std::string& ep, - int64_t time_out = FLAGS_rpc_deadline) override; - - void AsyncCheckpointNotify(const std::string& ep, const std::string& dir, - int64_t time_out = FLAGS_rpc_deadline) override; - - void AsyncSendComplete(const std::string& ep, - int64_t time_out = FLAGS_rpc_deadline) override; + VarHandlePtr AsyncSendVar(const std::string& ep, + const platform::DeviceContext& ctx, + const framework::Scope& scope, + const std::string& var_name, + int64_t time_out = FLAGS_rpc_deadline) override; + + VarHandlePtr AsyncGetVar(const std::string& ep, + const platform::DeviceContext& ctx, + const framework::Scope& scope, + const std::string& var_name, + int64_t time_out = FLAGS_rpc_deadline) override; + + VarHandlePtr AsyncPrefetchVar(const std::string& ep, + const platform::DeviceContext& ctx, + const framework::Scope& scope, + const std::string& in_var_name, + const std::string& out_var_name, + int64_t time_out = FLAGS_rpc_deadline) override; + + VarHandlePtr AsyncSendBatchBarrier( + const std::string& ep, int64_t time_out = FLAGS_rpc_deadline) override; + + VarHandlePtr AsyncSendFetchBarrier( + const std::string& ep, int64_t time_out = FLAGS_rpc_deadline) override; + + VarHandlePtr AsyncCheckpointNotify( + const std::string& ep, const std::string& dir, + int64_t time_out = FLAGS_rpc_deadline) override; + + VarHandlePtr AsyncSendComplete( + const std::string& ep, int64_t time_out = FLAGS_rpc_deadline) override; bool Wait() override; diff --git a/paddle/fluid/operators/distributed/request_handler.h b/paddle/fluid/operators/distributed/request_handler.h index 64ac7281848f91302bc0aa3cb81dd198e56fb653..3c3f9d17c871ac1cb4df83db17cf489d5b9e0563 100644 --- a/paddle/fluid/operators/distributed/request_handler.h +++ b/paddle/fluid/operators/distributed/request_handler.h @@ -28,6 +28,7 @@ #include "paddle/fluid/framework/scope.h" #include "paddle/fluid/framework/selected_rows.h" #include "paddle/fluid/framework/var_type.h" +#include "paddle/fluid/platform/macros.h" namespace paddle { namespace operators { @@ -49,23 +50,77 @@ constexpr char kRequestPassBarrier[] = "RequestPassBarrier"; class RPCServer; -struct VarHandle { - // RPC endpoint. - std::string ep; - const platform::DeviceContext* ctx; - const framework::Scope* scope; - // Variable name. - std::string name; - // RPC method name. 
- std::string method; +class VarHandle { + public: + VarHandle(const std::string ep, const std::string& method, + const std::string& name, + const platform::DeviceContext* p_ctx = nullptr, + const framework::Scope* p_scope = nullptr) + : ok_(kVarHandleDefaultState) { + ep_ = ep; + ctx_ = p_ctx; + scope_ = p_scope; + name_ = name; + method_ = method; + } + + virtual ~VarHandle() {} + + public: + bool Wait() { + { + std::unique_lock lk(sync_mutex_); + wait_cond_.wait(lk, [this] { return ok_ != kVarHandleDefaultState; }); + } + VLOG(7) << "VarHandle wait:" << ok_; + return ok_ != 0; + } + + void Finish(bool ok) { + { + std::unique_lock lk(sync_mutex_); + ok_ = ok; + } + VLOG(7) << "VarHandle finish:" << ok; + wait_cond_.notify_all(); + } std::string String() const { std::ostringstream s; - s << method << " name:[" << name << "], ep:[" << ep << "]"; + s << method_ << " name:[" << name_ << "], ep:[" << ep_ << "], ok:[" << ok_ + << "]"; return s.str(); } + + std::string ep() const { return ep_; } + const platform::DeviceContext* ctx() const { return ctx_; } + const framework::Scope* scope() const { return scope_; } + std::string name() const { return name_; } + std::string method() const { return method_; } + + protected: + // RPC endpoint. + std::string ep_; + const platform::DeviceContext* ctx_; + const framework::Scope* scope_; + // Variable name. + std::string name_; + // RPC method name. + std::string method_; + + protected: + std::mutex sync_mutex_; + std::condition_variable wait_cond_; + int ok_; + + static const int kVarHandleDefaultState = -1; + + private: + DISABLE_COPY_AND_ASSIGN(VarHandle); }; +typedef std::shared_ptr VarHandlePtr; + class RequestHandler { public: explicit RequestHandler(bool sync_mode) diff --git a/paddle/fluid/operators/distributed/request_handler_impl.cc b/paddle/fluid/operators/distributed/request_handler_impl.cc index 31159a02592a2aff75f7ecf5be924989f0f47071..849e412504eb9180b746db65fd4fa353ed0c05a1 100644 --- a/paddle/fluid/operators/distributed/request_handler_impl.cc +++ b/paddle/fluid/operators/distributed/request_handler_impl.cc @@ -67,24 +67,11 @@ bool RequestSendHandler::Handle(const std::string& varname, LOG(FATAL) << "sync: Can not find server side var: " << varname; return false; } - - if (invar->IsType()) { - std::unique_lock lock(mutex_sparse_vars_); - sparse_vars_.push_back(invar); - } } } return true; } -void RequestSendHandler::ResetSparseVarRecorder() { - std::unique_lock lock(mutex_sparse_vars_); - for (auto* var : sparse_vars_) { - var->GetMutable()->mutable_rows()->clear(); - } - sparse_vars_.clear(); -} - bool RequestGetHandler::Handle(const std::string& varname, framework::Scope* scope, framework::Variable* invar, diff --git a/paddle/fluid/operators/distributed/request_handler_impl.h b/paddle/fluid/operators/distributed/request_handler_impl.h index 87185500f2ffc3a8578eea339cc7a1e2b0e46631..8be5b21bb89a580f4091de19186fd2d7e5802478 100644 --- a/paddle/fluid/operators/distributed/request_handler_impl.h +++ b/paddle/fluid/operators/distributed/request_handler_impl.h @@ -41,11 +41,6 @@ class RequestSendHandler final : public RequestHandler { bool Handle(const std::string& varname, framework::Scope* scope, framework::Variable* var, framework::Variable** outvar, const std::string& out_var_name = "") override; - void ResetSparseVarRecorder(); - - private: - std::mutex mutex_sparse_vars_; - std::vector sparse_vars_; }; class RequestGetHandler final : public RequestHandler { diff --git a/paddle/fluid/operators/distributed/rpc_client.h 
b/paddle/fluid/operators/distributed/rpc_client.h index 22a022a5d25e5c6628b80294494b87ca105a04c7..3539ee5e459d6dfe0b6510806464bcc6817910bb 100644 --- a/paddle/fluid/operators/distributed/rpc_client.h +++ b/paddle/fluid/operators/distributed/rpc_client.h @@ -14,12 +14,14 @@ #pragma once +#include &lt;condition_variable&gt;  // NOLINT #include &lt;string&gt; #include "gflags/gflags.h" #include "paddle/fluid/framework/data_type.h" #include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/scope.h" +#include "paddle/fluid/operators/distributed/request_handler.h" DECLARE_int32(rpc_deadline); @@ -31,37 +33,36 @@ class RPCClient { public: RPCClient() {} virtual ~RPCClient() {} - virtual bool AsyncSendVar(const std::string& ep, - const platform::DeviceContext& ctx, - const framework::Scope& scope, - const std::string& var_name, - int64_t time_out = FLAGS_rpc_deadline) = 0; - - virtual bool AsyncGetVar(const std::string& ep, - const platform::DeviceContext& ctx, - const framework::Scope& scope, - const std::string& var_name, - int64_t time_out = FLAGS_rpc_deadline) = 0; - - virtual bool AsyncPrefetchVar(const std::string& ep, - const platform::DeviceContext& ctx, - const framework::Scope& scope, - const std::string& in_var_name, - const std::string& out_var_name, - int64_t time_out = FLAGS_rpc_deadline) = 0; - - virtual void AsyncSendBatchBarrier(const std::string& ep, - int64_t time_out = FLAGS_rpc_deadline) = 0; - - virtual void AsyncSendFetchBarrier(const std::string& ep, - int64_t time_out = FLAGS_rpc_deadline) = 0; - - virtual void AsyncCheckpointNotify(const std::string& ep, - const std::string& dir, - int64_t time_out = FLAGS_rpc_deadline) = 0; - - virtual void AsyncSendComplete(const std::string& ep, - int64_t time_out = FLAGS_rpc_deadline) = 0; + virtual VarHandlePtr AsyncSendVar(const std::string& ep, + const platform::DeviceContext& ctx, + const framework::Scope& scope, + const std::string& var_name, + int64_t time_out = FLAGS_rpc_deadline) = 0; + + virtual VarHandlePtr AsyncGetVar(const std::string& ep, + const platform::DeviceContext& ctx, + const framework::Scope& scope, + const std::string& var_name, + int64_t time_out = FLAGS_rpc_deadline) = 0; + + virtual VarHandlePtr AsyncPrefetchVar( + const std::string& ep, const platform::DeviceContext& ctx, + const framework::Scope& scope, const std::string& in_var_name, + const std::string& out_var_name, + int64_t time_out = FLAGS_rpc_deadline) = 0; + + virtual VarHandlePtr AsyncSendBatchBarrier( + const std::string& ep, int64_t time_out = FLAGS_rpc_deadline) = 0; + + virtual VarHandlePtr AsyncSendFetchBarrier( + const std::string& ep, int64_t time_out = FLAGS_rpc_deadline) = 0; + + virtual VarHandlePtr AsyncCheckpointNotify( + const std::string& ep, const std::string& dir, + int64_t time_out = FLAGS_rpc_deadline) = 0; + + virtual VarHandlePtr AsyncSendComplete( + const std::string& ep, int64_t time_out = FLAGS_rpc_deadline) = 0; // Complete tells all the pserver instances that the trainer has finished training, // so the pserver can reduce its barrier count and continue to train diff --git a/paddle/fluid/operators/distributed/rpc_server.cc b/paddle/fluid/operators/distributed/rpc_server.cc index 406e7294c190172347d432fb155c2a81c43dda25..084480ae48b8b9267ade1a840f6a70519cb28e48 100644 --- a/paddle/fluid/operators/distributed/rpc_server.cc +++ b/paddle/fluid/operators/distributed/rpc_server.cc @@ -101,6 +101,8 @@ void RPCServer::Complete() { { std::unique_lock&lt;std::mutex&gt; lock(mutex_); client_num_--; + need_reset_all_vars_ = true; + VLOG(4) << "decrease client_num to: " << client_num_; if
(cur_cond_.load() == rpc_cond_map_[kRequestGet]) { barrier_counter_[kRequestGet]--; @@ -109,6 +111,11 @@ void RPCServer::Complete() { barrier_cond_.notify_all(); } +bool RPCServer::NeedResetAllVars() { + std::unique_lock lock(mutex_); + return need_reset_all_vars_; +} + int RPCServer::GetClientNum() { std::unique_lock lock(mutex_); return client_num_; @@ -120,6 +127,7 @@ void RPCServer::ResetBarrierCounter() { for (auto& t : barrier_counter_) { t.second = 0; } + need_reset_all_vars_ = false; } void RPCServer::RegisterRPC(const std::string& rpc_name, diff --git a/paddle/fluid/operators/distributed/rpc_server.h b/paddle/fluid/operators/distributed/rpc_server.h index d813ba03e2fbec6e808f59f814a9b2f4bfbcd77b..d88e8c640ffb5ea44e88318cc973c9a783862435 100644 --- a/paddle/fluid/operators/distributed/rpc_server.h +++ b/paddle/fluid/operators/distributed/rpc_server.h @@ -49,7 +49,8 @@ class RPCServer { bind_address_(address), exit_flag_(false), selected_port_(0), - client_num_(client_num) {} + client_num_(client_num), + need_reset_all_vars_(false) {} virtual ~RPCServer() {} virtual void StartServer() = 0; @@ -86,6 +87,8 @@ class RPCServer { void ResetBarrierCounter(); RPCServerProfiler& Profiler() { return profiler_; } + bool NeedResetAllVars(); + protected: virtual void ShutDownImpl() = 0; @@ -104,6 +107,7 @@ class RPCServer { std::atomic exit_flag_; int selected_port_; int client_num_; + bool need_reset_all_vars_; std::unordered_map rpc_call_map_; std::unordered_map rpc_thread_num_; diff --git a/paddle/fluid/operators/distributed/varhandle_test.cc b/paddle/fluid/operators/distributed/varhandle_test.cc new file mode 100644 index 0000000000000000000000000000000000000000..a0fcaf886475c5e03d959ffd6af22b2123526b9f --- /dev/null +++ b/paddle/fluid/operators/distributed/varhandle_test.cc @@ -0,0 +1,55 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include +#include +#include // NOLINT + +#include "google/protobuf/text_format.h" +#include "gtest/gtest.h" +#include "paddle/fluid/operators/distributed/request_handler.h" + +using paddle::operators::distributed::VarHandlePtr; +using paddle::operators::distributed::VarHandle; + +void WaitTrue(VarHandlePtr s) { EXPECT_TRUE(s->Wait()); } + +void WaitFalse(VarHandlePtr s) { EXPECT_FALSE(s->Wait()); } + +TEST(VarHandle, Run) { + std::vector a; + for (int i = 0; i < 12; i++) { + VarHandlePtr s(new VarHandle("", "", "", nullptr, nullptr)); + a.push_back(s); + } + + std::vector> t; + for (int i = 0; i < 6; i++) { + t.emplace_back(new std::thread(WaitFalse, a[i])); + } + + for (int i = 0; i < 6; i++) { + a[i]->Finish(false); + t[i]->join(); + } + + for (int i = 6; i < 12; i++) { + t.emplace_back(new std::thread(WaitTrue, a[i])); + } + + for (int i = 6; i < 12; i++) { + a[i]->Finish(true); + t[i]->join(); + } +} diff --git a/paddle/fluid/operators/fusion_gru_op.cc b/paddle/fluid/operators/fusion_gru_op.cc index 916f84cb4a78c3721cb67bd3cf8d3759a8eaf1bf..31e87d9113118ebe7a4b25ffee5ba55e2714fb66 100644 --- a/paddle/fluid/operators/fusion_gru_op.cc +++ b/paddle/fluid/operators/fusion_gru_op.cc @@ -25,14 +25,14 @@ namespace paddle { namespace operators { void FusionGRUOp::InferShape(framework::InferShapeContext* ctx) const { - PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) of GRU should not be null."); + PADDLE_ENFORCE(ctx->HasInput("X"), "Assert only one Input(X) of GRU."); PADDLE_ENFORCE(ctx->HasInput("WeightX"), - "Input(WeightX) of GRU should not be null."); + "Assert only one Input(WeightX) of GRU."); PADDLE_ENFORCE(ctx->HasInput("WeightH"), - "Input(WeightH) of GRU should not be null."); - PADDLE_ENFORCE(ctx->HasOutput("XX"), "Output(XX) of GRU should not be null."); + "Assert only one Input(WeightH) of GRU."); + PADDLE_ENFORCE(ctx->HasOutput("XX"), "Assert only one Output(XX) of GRU."); PADDLE_ENFORCE(ctx->HasOutput("Hidden"), - "Output(Hidden) of GRU should not be null."); + "Assert only one Output(Hidden) of GRU."); auto x_dims = ctx->GetInputDim("X"); PADDLE_ENFORCE_EQ(x_dims.size(), 2, "Input(X)'s rank must be 2."); @@ -80,11 +80,11 @@ void FusionGRUOp::InferShape(framework::InferShapeContext* ctx) const { } else { xx_width = x_dims[1] > wx_dims[1] ? 
wx_dims[1] : x_dims[1]; PADDLE_ENFORCE(ctx->HasOutput("ReorderedH0"), - "Output(ReorderedH0) of GRU should not be null."); + "Assert only one Output(ReorderedH0) of GRU."); PADDLE_ENFORCE(ctx->HasOutput("BatchedInput"), - "Output(BatchedInput) of GRU should not be null."); + "Assert only one Output(BatchedInput) of GRU."); PADDLE_ENFORCE(ctx->HasOutput("BatchedOut"), - "Output(BatchedOut) of GRU should not be null."); + "Assert only one Output(BatchedOut) of GRU."); ctx->SetOutputDim("BatchedInput", {x_dims[0], wx_dims[1]}); ctx->SetOutputDim("BatchedOut", out_dims); } diff --git a/paddle/fluid/operators/fusion_lstm_op.cc b/paddle/fluid/operators/fusion_lstm_op.cc index ef23ab3f981786d33567619ad0194d21f31bdc8e..55e465e3af08c012b8cff7714452ed32b32a5556 100644 --- a/paddle/fluid/operators/fusion_lstm_op.cc +++ b/paddle/fluid/operators/fusion_lstm_op.cc @@ -24,20 +24,17 @@ namespace paddle { namespace operators { void FusionLSTMOp::InferShape(framework::InferShapeContext* ctx) const { - PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) of LSTM should not be null."); + PADDLE_ENFORCE(ctx->HasInput("X"), "Assert only one Input(X) of LSTM."); PADDLE_ENFORCE(ctx->HasInput("WeightX"), - "Input(WeightX) of LSTM should not be null."); + "Assert only one Input(WeightX) of LSTM."); PADDLE_ENFORCE(ctx->HasInput("WeightH"), - "Input(WeightH) of LSTM should not be null."); - PADDLE_ENFORCE(ctx->HasInput("Bias"), - "Input(Bias) of LSTM should not be null."); - - PADDLE_ENFORCE(ctx->HasOutput("XX"), - "Output(XX) of LSTM should not be null."); + "Assert only one Input(WeightH) of LSTM."); + PADDLE_ENFORCE(ctx->HasInput("Bias"), "Assert only one Input(Bias) of LSTM."); + PADDLE_ENFORCE(ctx->HasOutput("XX"), "Assert only one Output(XX) of LSTM."); PADDLE_ENFORCE(ctx->HasOutput("Hidden"), - "Output(Hidden) of LSTM should not be null."); + "Assert only one Output(Hidden) of LSTM."); PADDLE_ENFORCE(ctx->HasOutput("Cell"), - "Output(Cell) of LSTM should not be null."); + "Assert only one Output(Cell) of LSTM."); auto x_dims = ctx->GetInputDim("X"); PADDLE_ENFORCE_EQ(x_dims.size(), 2, "Input(X)'s rank must be 2."); @@ -96,15 +93,15 @@ void FusionLSTMOp::InferShape(framework::InferShapeContext* ctx) const { } else { xx_width = x_dims[1] > wx_dims[1] ? wx_dims[1] : x_dims[1]; PADDLE_ENFORCE(ctx->HasOutput("BatchedInput"), - "Output(BatchedInput) of LSTM should not be null."); + "Assert only one Output(BatchedInput) of LSTM."); PADDLE_ENFORCE(ctx->HasOutput("BatchedHidden"), - "Output(BatchedHidden) of LSTM should not be null."); + "Assert only one Output(BatchedHidden) of LSTM."); PADDLE_ENFORCE(ctx->HasOutput("BatchedCell"), - "Output(BatchedCell) of LSTM should not be null."); + "Assert only one Output(BatchedCell) of LSTM."); PADDLE_ENFORCE(ctx->HasOutput("ReorderedH0"), - "Output(ReorderedH0) of LSTM should not be null."); + "Assert only one Output(ReorderedH0) of LSTM"); PADDLE_ENFORCE(ctx->HasOutput("ReorderedC0"), - "Output(ReorderedC0) of LSTM should not be null."); + "Assert only one Output(ReorderedC0) of LSTM."); ctx->SetOutputDim("BatchedInput", {x_dims[0], wx_dims[1]}); ctx->SetOutputDim("BatchedHidden", out_dims); ctx->SetOutputDim("BatchedCell", out_dims); diff --git a/paddle/fluid/operators/listen_and_serv_op.cc b/paddle/fluid/operators/listen_and_serv_op.cc index 4cc2159d9f22809a640f82ad19415f3e5a2d9999..966d78b84130c172c41e8049bf6bb1dc659d7d48 100644 --- a/paddle/fluid/operators/listen_and_serv_op.cc +++ b/paddle/fluid/operators/listen_and_serv_op.cc @@ -22,6 +22,7 @@ limitations under the License. 
*/ #include "gflags/gflags.h" #include "paddle/fluid/operators/detail/macros.h" +#include "paddle/fluid/operators/math/math_function.h" #include "paddle/fluid/operators/distributed/request_handler_impl.h" #include "paddle/fluid/operators/listen_and_serv_op.h" @@ -101,7 +102,7 @@ static int64_t GetTimestamp() { void ListenAndServOp::RunSyncLoop( framework::Executor *executor, framework::ProgramDesc *program, - framework::Scope *recv_scope, + framework::Scope *recv_scope, platform::DeviceContext *dev_ctx, const std::vector &prefetch_block_id_list, const int checkpoint_point_block_id) const { VLOG(2) << "RunSyncLoop"; @@ -128,6 +129,7 @@ void ListenAndServOp::RunSyncLoop( rpc_service_->SetCond(distributed::kRequestGet); rpc_service_->WaitBarrier(distributed::kRequestGet); rpc_service_->ResetBarrierCounter(); + while (true) { rpc_service_->Profiler().OneStep(); // Get from multiple trainers, we don't care about the order in which @@ -165,9 +167,7 @@ void ListenAndServOp::RunSyncLoop( recv_scope); VLOG(2) << "run all blocks spent " << GetTimestamp() - ts << "(ms)"; - // reset received sparse vars to avoid reuse it in the next mini-batch - dynamic_cast(request_send_handler_.get()) - ->ResetSparseVarRecorder(); + ResetReceivedVars(recv_scope, dev_ctx, rpc_service_->NeedResetAllVars()); rpc_service_->SetCond(distributed::kRequestGet); rpc_service_->WaitBarrier(distributed::kRequestGet); @@ -175,6 +175,42 @@ void ListenAndServOp::RunSyncLoop( } // while(true) } +void ListenAndServOp::ResetReceivedVars(framework::Scope *recv_scope, + platform::DeviceContext *dev_ctx, + bool reset_all) const { + for (auto &varname : sparse_vars_) { + auto var = recv_scope->FindVar(varname); + if (var == nullptr) { + VLOG(2) << "can not find var " << varname << " in received scope"; + continue; + } + if (var->IsType()) { + VLOG(3) << "reset sparse var: " << varname; + var->GetMutable()->mutable_rows()->clear(); + } else { + PADDLE_THROW("The type of sparse var should be SelectedRows"); + } + } + if (UNLIKELY(reset_all)) { + for (auto &varname : dense_vars_) { + auto var = recv_scope->FindVar(varname); + if (var == nullptr) { + VLOG(2) << "can not find var " << varname << " in received scope"; + continue; + } + if (var->IsType()) { + math::set_constant(*dev_ctx, var->GetMutable(), + static_cast(0)); + } else if (var->IsType()) { + math::set_constant(*dev_ctx, var->GetMutable(), + static_cast(0)); + } else { + PADDLE_THROW("The type of dense var should be in [LoDTensor, Tensor]"); + } + } + } +} + void ListenAndServOp::RunAsyncLoop(framework::Executor *executor, framework::ProgramDesc *program, framework::Scope *recv_scope) const { @@ -248,6 +284,25 @@ static void FillRequestCtx( h->SetCheckpointNotifyPreparedCtx(checkpoint_ctx); } +void ListenAndServOp::CacheVarsType(const std::vector &varnames, + const framework::Scope &scope) const { + for (const auto &varname : varnames) { + auto var = scope.FindVar(varname); + PADDLE_ENFORCE(var != nullptr, + "Received var should be initialized in the received scope."); + if (var->IsType()) { + sparse_vars_.push_back(varname); + } else if (var->IsType() || + var->IsType()) { + dense_vars_.push_back(varname); + } else { + PADDLE_THROW( + "The type of received var should be in [SelectedRows, LoDTensor, " + "Tensor]."); + } + } +} + void ListenAndServOp::RunImpl(const framework::Scope &scope, const platform::Place &dev_place) const { // Mark this as PS that it should decide profiling by listening from trainer. 
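As an aside, the reset policy introduced above boils down to two rules: sparse (SelectedRows) gradients drop their accumulated rows after every mini-batch, while dense tensors are zeroed only after some trainer has called Complete(). The sketch below is a stand-alone illustration of that policy only; SparseVar and DenseVar are simplified stand-ins for the framework's Variable types, not Paddle classes.

```cpp
#include <algorithm>
#include <iostream>
#include <string>
#include <vector>

// Illustrative stand-ins for the framework's Variable types.
struct SparseVar { std::string name; std::vector<long> rows; };
struct DenseVar  { std::string name; std::vector<float> data; };

// Sparse vars drop their accumulated rows after every mini-batch so that
// row ids received in step N cannot leak into step N+1; dense vars are
// zeroed only when a trainer has completed (reset_all == true), mirroring
// RPCServer::NeedResetAllVars() in the diff above.
void ResetReceivedVars(std::vector<SparseVar>* sparse,
                       std::vector<DenseVar>* dense, bool reset_all) {
  for (auto& v : *sparse) v.rows.clear();
  if (reset_all) {
    for (auto& v : *dense) std::fill(v.data.begin(), v.data.end(), 0.0f);
  }
}

int main() {
  std::vector<SparseVar> sparse{{"emb@GRAD", {1, 5, 9}}};
  std::vector<DenseVar> dense{{"fc_w@GRAD", {0.5f, -0.5f}}};
  ResetReceivedVars(&sparse, &dense, /*reset_all=*/true);
  std::cout << sparse[0].rows.size() << " " << dense[0].data[0] << "\n";  // "0 0"
}
```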
@@ -258,6 +313,7 @@ void ListenAndServOp::RunImpl(const framework::Scope &scope, bool sync_mode = Attr("sync_mode"); auto fan_in = Attr("Fanin"); + auto inputs = Inputs("X"); PADDLE_ENFORCE(!rpc_service_); std::string endpoint = Attr("endpoint"); @@ -348,11 +404,16 @@ void ListenAndServOp::RunImpl(const framework::Scope &scope, signal(SIGINT, SignalHandler::StopAndExit); signal(SIGTERM, SignalHandler::StopAndExit); + // Cache the type of the received vars as `sparse_vars_` and `dense_vars_` + // so that we can reset them at the end of each iteration. + // NOTE: only used in sync update + CacheVarsType(inputs, recv_scope); + // Write to a file of server selected port for python use. SavePort(); if (sync_mode) { - RunSyncLoop(&executor, program, &recv_scope, prefetch_block_id_list, - checkpoint_block_id); + RunSyncLoop(&executor, program, &recv_scope, &dev_ctx, + prefetch_block_id_list, checkpoint_block_id); } else { RunAsyncLoop(&executor, program, &recv_scope); } diff --git a/paddle/fluid/operators/listen_and_serv_op.h b/paddle/fluid/operators/listen_and_serv_op.h index 978969cc515c7954b59f2bf7a4f2c0e1b13f9bc0..5f889793ab16249a4e06801090db087a089dbed1 100644 --- a/paddle/fluid/operators/listen_and_serv_op.h +++ b/paddle/fluid/operators/listen_and_serv_op.h @@ -26,6 +26,7 @@ limitations under the License. */ #include "paddle/fluid/framework/threadpool.h" #include "paddle/fluid/operators/distributed/request_handler.h" #include "paddle/fluid/operators/distributed/rpc_server.h" +#include "paddle/fluid/platform/device_context.h" namespace paddle { namespace operators { @@ -48,6 +49,7 @@ class ListenAndServOp : public framework::OperatorBase { void RunSyncLoop(framework::Executor* executor, framework::ProgramDesc* program, framework::Scope* recv_scope, + platform::DeviceContext* dev_ctx, const std::vector& prefetch_block_id_list, const int checkpoint_point_block_id) const; @@ -64,6 +66,13 @@ class ListenAndServOp : public framework::OperatorBase { void RunImpl(const framework::Scope& scope, const platform::Place& dev_place) const override; + void ResetReceivedVars(framework::Scope* recv_scope, + platform::DeviceContext* dev_ctx, + bool reset_all = false) const; + + void CacheVarsType(const std::vector& varnames, + const framework::Scope& scope) const; + protected: mutable std::shared_ptr rpc_service_; mutable std::shared_ptr request_send_handler_; @@ -74,6 +83,8 @@ class ListenAndServOp : public framework::OperatorBase { request_checkpoint_handler_; mutable std::shared_ptr server_thread_; + mutable std::vector sparse_vars_; + mutable std::vector dense_vars_; }; class SignalHandler { diff --git a/paddle/fluid/operators/math/cross_entropy.cc b/paddle/fluid/operators/math/cross_entropy.cc index caff35e03ae3a144f799d982c859ded62cb3e93d..18bf1a66f6d9903f32048574dc93faf7e98953ac 100644 --- a/paddle/fluid/operators/math/cross_entropy.cc +++ b/paddle/fluid/operators/math/cross_entropy.cc @@ -28,7 +28,8 @@ class CrossEntropyFunctor { public: void operator()(const platform::CPUDeviceContext& ctx, framework::Tensor* out, const framework::Tensor* prob, - const framework::Tensor* labels, const bool softLabel) { + const framework::Tensor* labels, const bool softLabel, + const int ignore_index) { const int batch_size = prob->dims()[0]; if (softLabel) { auto in = EigenMatrix::From(*prob); @@ -49,8 +50,12 @@ class CrossEntropyFunctor { int lbl = label_data[i]; PADDLE_ENFORCE_GE(lbl, 0); PADDLE_ENFORCE_LT(lbl, class_num); + PADDLE_ENFORCE((lbl >= 0 && lbl < class_num) || lbl == ignore_index); int index = i * 
class_num + lbl; - loss_data[i] = -math::TolerableValue&lt;T&gt;()(std::log(prob_data[index])); + loss_data[i] = + lbl == ignore_index + ? 0 + : -math::TolerableValue&lt;T&gt;()(std::log(prob_data[index])); } } } diff --git a/paddle/fluid/operators/math/cross_entropy.cu b/paddle/fluid/operators/math/cross_entropy.cu index 0de58d5fddd84d33f708c4c73e5a19dc2fe8a86b..c92341ea55ea21773acba33665e267b2f1c25fe3 100644 --- a/paddle/fluid/operators/math/cross_entropy.cu +++ b/paddle/fluid/operators/math/cross_entropy.cu @@ -23,11 +23,14 @@ namespace math { namespace { template &lt;typename T&gt; __global__ void CrossEntropyKernel(T* Y, const T* X, const int64_t* label, - const int N, const int D) { + const int N, const int D, + const int ignore_index) { for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < N; i += blockDim.x * gridDim.x) { - PADDLE_ASSERT(label[i] >= 0 && label[i] < D); - Y[i] = -math::TolerableValue&lt;T&gt;()(log(X[i * D + label[i]])); + PADDLE_ASSERT((label[i] >= 0 && label[i] < D) || label[i] == ignore_index); + Y[i] = ignore_index == label[i] + ? 0 + : -math::TolerableValue&lt;T&gt;()(log(X[i * D + label[i]])); } } @@ -57,7 +60,8 @@ class CrossEntropyFunctor&lt;platform::CUDADeviceContext, T&gt; { public: void operator()(const platform::CUDADeviceContext& ctx, framework::Tensor* out, const framework::Tensor* prob, - const framework::Tensor* labels, bool softLabel) { + const framework::Tensor* labels, bool softLabel, + const int ignore_index) { const T* prob_data = prob->data&lt;T&gt;(); T* loss_data = out->mutable_data&lt;T&gt;(ctx.GetPlace()); @@ -77,7 +81,8 @@ class CrossEntropyFunctor&lt;platform::CUDADeviceContext, T&gt; { int block = 512; int grid = (batch_size + block - 1) / block; CrossEntropyKernel&lt;T&gt;&lt;&lt;&lt;grid, block, 0, ctx.stream()&gt;&gt;&gt;( - loss_data, prob_data, label_data, batch_size, class_num); + loss_data, prob_data, label_data, batch_size, class_num, + ignore_index); } } }; diff --git a/paddle/fluid/operators/math/cross_entropy.h b/paddle/fluid/operators/math/cross_entropy.h index adc5b3fe47cd3bf524eb56747b6bd51e345a2eb6..e8aeb5d0575ac0f6b8761e97896df73578e8a103 100644 --- a/paddle/fluid/operators/math/cross_entropy.h +++ b/paddle/fluid/operators/math/cross_entropy.h @@ -38,7 +38,8 @@ class CrossEntropyFunctor { public: void operator()(const DeviceContext& context, framework::Tensor* out, const framework::Tensor* prob, - const framework::Tensor* labels, const bool softLabel); + const framework::Tensor* labels, const bool softLabel, + const int ignore_index); }; } // namespace math } // namespace operators diff --git a/paddle/fluid/operators/prefetch_op.cc b/paddle/fluid/operators/prefetch_op.cc index 4b804740a06f9e29704f2b3f58a90191e3559347..0519c15e13aac99802ff0f95b975712b36b44246 100644 --- a/paddle/fluid/operators/prefetch_op.cc +++ b/paddle/fluid/operators/prefetch_op.cc @@ -44,16 +44,20 @@ class PrefetchOp : public framework::OperatorBase { distributed::RPCClient* rpc_client = distributed::RPCClient::GetInstance&lt;RPCCLIENT_T&gt;(); + std::vector&lt;distributed::VarHandlePtr&gt; rets; for (size_t i = 0; i < ins.size(); i++) { if (NeedSend(scope, ins[i])) { VLOG(3) << "sending " << ins[i] << " to " << epmap[i] << " to get " << outs[i] << " back"; - rpc_client->AsyncPrefetchVar(epmap[i], ctx, scope, ins[i], outs[i]); + rets.push_back(rpc_client->AsyncPrefetchVar(epmap[i], ctx, scope, + ins[i], outs[i])); } else { VLOG(3) << "don't send uninitialized variable: " << ins[i]; } } - PADDLE_ENFORCE(rpc_client->Wait(), "internal error in RPCClient"); + for (size_t i = 0; i < rets.size(); i++) { + PADDLE_ENFORCE(rets[i]->Wait(), "internal error in RPCClient"); + } } }; diff --git a/paddle/fluid/operators/recv_op.cc b/paddle/fluid/operators/recv_op.cc index
a1f368e8690512cec2db7593aabc0279bbe174eb..4d34b8a1686efb1fc30020f0d27e9a3c3a6c0866 100644 --- a/paddle/fluid/operators/recv_op.cc +++ b/paddle/fluid/operators/recv_op.cc @@ -44,12 +44,15 @@ class RecvOp : public framework::OperatorBase { distributed::RPCClient* rpc_client = distributed::RPCClient::GetInstance&lt;RPCCLIENT_T&gt;(); + std::vector&lt;distributed::VarHandlePtr&gt; rets; for (size_t i = 0; i < outs.size(); i++) { VLOG(3) << "getting " << outs[i] << " from " << epmap[i]; - rpc_client->AsyncGetVar(epmap[i], ctx, scope, outs[i]); + rets.push_back(rpc_client->AsyncGetVar(epmap[i], ctx, scope, outs[i])); } if (sync_mode) { - PADDLE_ENFORCE(rpc_client->Wait(), "internal error in RPCClient"); + for (size_t i = 0; i < rets.size(); i++) { + PADDLE_ENFORCE(rets[i]->Wait(), "internal error in RPCClient"); + } } } }; diff --git a/paddle/fluid/operators/send_op.cc b/paddle/fluid/operators/send_op.cc index 82a70e4bf13247d784371ffdf419c9f792d7f721..48322ac7fd54a2e4cc3405a2c4dcddfc273f5a66 100644 --- a/paddle/fluid/operators/send_op.cc +++ b/paddle/fluid/operators/send_op.cc @@ -15,6 +15,7 @@ limitations under the License. */ #include &lt;future&gt;  // NOLINT #include &lt;ostream&gt; +#include "paddle/fluid/framework/blocking_queue.h" #include "paddle/fluid/framework/data_type.h" #include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/op_registry.h" @@ -45,18 +46,19 @@ class SendOp : public framework::OperatorBase { distributed::RPCClient* rpc_client = distributed::RPCClient::GetInstance&lt;RPCCLIENT_T&gt;(); + std::vector&lt;distributed::VarHandlePtr&gt; rets; for (size_t i = 0; i < ins.size(); i++) { if (NeedSend(scope, ins[i])) { VLOG(3) << "sending " << ins[i] << " to " << epmap[i]; - // TODO(Yancey1989): we need to use an IO threadpool which has - // a larger number of threads than the computing threadpool. - rpc_client->AsyncSendVar(epmap[i], ctx, scope, ins[i]); + rets.push_back(rpc_client->AsyncSendVar(epmap[i], ctx, scope, ins[i])); } else { VLOG(3) << "don't send uninitialized variable: " << ins[i]; } } if (sync_send) { - PADDLE_ENFORCE(rpc_client->Wait(), "internal error in RPCClient"); + for (size_t i = 0; i < rets.size(); i++) { + PADDLE_ENFORCE(rets[i]->Wait(), "internal error in RPCClient"); + } } } }; diff --git a/paddle/fluid/operators/softmax_with_cross_entropy_op.cc b/paddle/fluid/operators/softmax_with_cross_entropy_op.cc index 53cb716a979229c99fcbdc12f1aeab4e21b320f3..1a9324ec862fc3dd7ce669c5fed94527cac22b8f 100644 --- a/paddle/fluid/operators/softmax_with_cross_entropy_op.cc +++ b/paddle/fluid/operators/softmax_with_cross_entropy_op.cc @@ -44,6 +44,12 @@ class SoftmaxWithCrossEntropyOpMaker "(bool, default: false), A flag to indicate whether to interpret " "the given labels as soft labels.") .SetDefault(false); + AddAttr&lt;int&gt;( + "ignore_index", + "(int, default -100), Specifies a target value that is ignored and " + "does not contribute to the input gradient. Only valid if soft_label " + "is set to False.") + .SetDefault(-100); AddComment(R"DOC( Softmax With Cross Entropy Operator.
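The new ignore_index attribute makes the hard-label loss skip padded targets: any row whose label equals ignore_index contributes zero loss and, in the backward kernels below, zero gradient. A minimal CPU sketch of the masked loss follows, assuming row-major [batch, class] probabilities; it is illustrative only, not the actual Paddle kernel.

```cpp
#include <cmath>
#include <iostream>
#include <vector>

// Hard-label cross entropy with ignore_index: rows whose label equals
// ignore_index contribute zero loss (and zero gradient in the backward
// pass). prob is row-major with shape [batch, class_num].
std::vector<float> CrossEntropy(const std::vector<float>& prob,
                                const std::vector<long>& label,
                                int class_num, long ignore_index) {
  std::vector<float> loss(label.size());
  for (size_t i = 0; i < label.size(); ++i) {
    loss[i] = (label[i] == ignore_index)
                  ? 0.0f
                  : -std::log(prob[i * class_num + label[i]]);
  }
  return loss;
}

int main() {
  // Two samples, three classes; the second sample is padding (-100).
  std::vector<float> prob = {0.7f, 0.2f, 0.1f, 0.3f, 0.3f, 0.4f};
  std::vector<long> label = {0, -100};
  for (float l : CrossEntropy(prob, label, 3, -100)) std::cout << l << "\n";
}
```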
diff --git a/paddle/fluid/operators/softmax_with_cross_entropy_op.cu b/paddle/fluid/operators/softmax_with_cross_entropy_op.cu index a559b01ed32a48e3befb37c2ae8935b4f3a4acb0..a07c17348ebb3f768d1c8be65c2d31e3c130bd23 100644 --- a/paddle/fluid/operators/softmax_with_cross_entropy_op.cu +++ b/paddle/fluid/operators/softmax_with_cross_entropy_op.cu @@ -26,11 +26,13 @@ using Tensor = framework::Tensor; namespace { template __global__ void CrossEntropyGrad(T* logit_grad, const int64_t* labels, - const int batch_size, const int class_num) { + const int batch_size, const int class_num, + const int ignore_index) { for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < batch_size; i += blockDim.x * gridDim.x) { int idx = i * class_num + labels[i]; - logit_grad[idx] -= static_cast(1.); + logit_grad[idx] -= + ignore_index == labels[i] ? static_cast(0.) : static_cast(1.); } } @@ -260,6 +262,7 @@ class SoftmaxWithCrossEntropyCUDAKernel : public framework::OpKernel { auto* loss_data = loss->mutable_data(context.GetPlace()); auto soft_label = context.Attr("soft_label"); + auto ignore_index = context.Attr("ignore_index"); if (soft_label) { int batch_size = logits->dims()[0]; int feature_size = logits->dims()[1]; @@ -272,7 +275,8 @@ class SoftmaxWithCrossEntropyCUDAKernel : public framework::OpKernel { math::SoftmaxCUDNNFunctor()(context.cuda_device_context(), logits, softmax); math::CrossEntropyFunctor()( - context.cuda_device_context(), loss, softmax, labels, false); + context.cuda_device_context(), loss, softmax, labels, false, + ignore_index); } } }; @@ -295,7 +299,7 @@ class SoftmaxWithCrossEntropyGradCUDAKernel : public framework::OpKernel { const int class_num = logit_grad->dims()[1]; int block = 512; auto stream = context.cuda_device_context().stream(); - + auto ignore_index = context.Attr("ignore_index"); if (context.Attr("soft_label")) { int grid = (batch_size * class_num + block - 1) / block; const T* label_data = labels->data(); @@ -305,7 +309,7 @@ class SoftmaxWithCrossEntropyGradCUDAKernel : public framework::OpKernel { int grid = (batch_size + block - 1) / block; const int64_t* label_data = labels->data(); CrossEntropyGrad<<>>( - logit_grad_data, label_data, batch_size, class_num); + logit_grad_data, label_data, batch_size, class_num, ignore_index); int num = batch_size * class_num; grid = (num + block - 1) / block; Scale<<>>(logit_grad_data, loss_grad_data, num, diff --git a/paddle/fluid/operators/softmax_with_cross_entropy_op.h b/paddle/fluid/operators/softmax_with_cross_entropy_op.h index dd6f6aca5ada7aa215d3b3444194fc53efeb7020..e9aba3b37b8cc01d4fe5de5200579d4e93f67e56 100644 --- a/paddle/fluid/operators/softmax_with_cross_entropy_op.h +++ b/paddle/fluid/operators/softmax_with_cross_entropy_op.h @@ -45,7 +45,8 @@ class SoftmaxWithCrossEntropyKernel : public framework::OpKernel { math::SoftmaxFunctor()(dev_ctx, logits, softmax); math::CrossEntropyFunctor()( - dev_ctx, loss, softmax, labels, context.Attr("soft_label")); + dev_ctx, loss, softmax, labels, context.Attr("soft_label"), + context.Attr("ignore_index")); } }; diff --git a/paddle/fluid/operators/tensorrt_engine_op.h b/paddle/fluid/operators/tensorrt_engine_op.h index bc556ab3643cefa3e45d2a8a3835937753af723f..79e75ea9a035b654f0bb7026d3a491bebe0b23c4 100644 --- a/paddle/fluid/operators/tensorrt_engine_op.h +++ b/paddle/fluid/operators/tensorrt_engine_op.h @@ -160,11 +160,7 @@ class TensorRTEngineKernel : public framework::OpKernel { fluid_t->mutable_data(platform::CUDAPlace( boost::get(context.GetPlace()).device)), size * 
sizeof(float)); - //} else { - // engine->GetOutputInGPU( - // y, fluid_t->mutable_data(platform::CUDAPlace()), - // size * sizeof(float)); - //} + output_index += 1; } diff --git a/paddle/fluid/platform/device_context.cc b/paddle/fluid/platform/device_context.cc index 2cc26da013f59f5b7ee1747d57baca9c1c0efe2c..c6f1d1f3d544117311821d980300dffea03891a5 100644 --- a/paddle/fluid/platform/device_context.cc +++ b/paddle/fluid/platform/device_context.cc @@ -16,6 +16,9 @@ limitations under the License. */ #include #include "paddle/fluid/memory/memory.h" +#ifdef PADDLE_WITH_CUDA +#include "paddle/fluid/framework/rw_lock.h" +#endif namespace paddle { namespace platform { @@ -142,7 +145,58 @@ class EigenCudaStreamDevice : public Eigen::StreamInterface { mutable unsigned int* semaphore_; }; -CUDADeviceContext::CUDADeviceContext(CUDAPlace place) : place_(place) { +class CudnnHolder { + public: + CudnnHolder(const cudaStream_t* stream, const CUDAPlace& place) + : workspace_(nullptr), workspace_len_(0), stream_(stream), place_(place) { + PADDLE_ENFORCE(dynload::cudnnCreate(&cudnn_handle_)); + PADDLE_ENFORCE(dynload::cudnnSetStream(cudnn_handle_, *stream_)); + } + + cudnnHandle_t cudnn_handle() const { return cudnn_handle_; } + + void RunFunc(const std::function& cudnn_func, + size_t required_workspace_len) { + std::lock_guard lock(mtx_); + if (required_workspace_len > workspace_len_) { + ReallocateWorkspace(required_workspace_len); + } + cudnn_func(workspace_); + } + + ~CudnnHolder() { + PADDLE_ENFORCE(dynload::cudnnDestroy(cudnn_handle_)); + if (workspace_ != nullptr) { + paddle::memory::Free(place_, workspace_); + } + } + + private: + void ReallocateWorkspace(size_t required_workspace_len) { + if (required_workspace_len <= workspace_len_) { + return; + } + if (workspace_ != nullptr) { + // Maybe someone is using the current workspace + PADDLE_ENFORCE(cudaStreamSynchronize(*stream_)); + paddle::memory::Free(place_, workspace_); + } + workspace_ = paddle::memory::Alloc(place_, required_workspace_len); + workspace_len_ = required_workspace_len; + } + + cudnnHandle_t cudnn_handle_; + void* workspace_; + size_t workspace_len_; + + const cudaStream_t* stream_; // not owned; + const CUDAPlace place_; + + std::mutex mtx_; +}; + +CUDADeviceContext::CUDADeviceContext(CUDAPlace place) + : place_(place), cudnn_holder_(nullptr) { SetDeviceId(place_.device); compute_capability = GetCUDAComputeCapability(place_.device); multi_process = GetCUDAMultiProcessors(place_.device); @@ -154,10 +208,7 @@ CUDADeviceContext::CUDADeviceContext(CUDAPlace place) : place_(place) { PADDLE_ENFORCE(dynload::cublasCreate(&cublas_handle_)); PADDLE_ENFORCE(dynload::cublasSetStream(cublas_handle_, stream_)); if (dynload::HasCUDNN()) { - PADDLE_ENFORCE(dynload::cudnnCreate(&cudnn_handle_)); - PADDLE_ENFORCE(dynload::cudnnSetStream(cudnn_handle_, stream_)); - } else { - cudnn_handle_ = nullptr; + cudnn_holder_.reset(new CudnnHolder(&stream_, place)); } } @@ -165,9 +216,6 @@ CUDADeviceContext::~CUDADeviceContext() { SetDeviceId(place_.device); Wait(); PADDLE_ENFORCE(dynload::cublasDestroy(cublas_handle_)); - if (cudnn_handle_ != nullptr) { - PADDLE_ENFORCE(dynload::cudnnDestroy(cudnn_handle_)); - } eigen_stream_.reset(); eigen_device_.reset(); PADDLE_ENFORCE(cudaStreamDestroy(stream_)); @@ -196,7 +244,14 @@ cublasHandle_t CUDADeviceContext::cublas_handle() const { return cublas_handle_; } -cudnnHandle_t CUDADeviceContext::cudnn_handle() const { return cudnn_handle_; } +cudnnHandle_t CUDADeviceContext::cudnn_handle() const { + return 
cudnn_holder_->cudnn_handle(); +} + +void CUDADeviceContext::RunCudnnFuncWithWorkspace( + const std::function& cudnn_func, size_t workspace_len) const { + cudnn_holder_->RunFunc(cudnn_func, workspace_len); +} cudaStream_t CUDADeviceContext::stream() const { return stream_; } diff --git a/paddle/fluid/platform/device_context.h b/paddle/fluid/platform/device_context.h index b97dad20db0b003b4886b7c7cfd1c8de8bf44ab9..3ed49fc4233d4c0cd6cc16319eda08480ab9b434 100644 --- a/paddle/fluid/platform/device_context.h +++ b/paddle/fluid/platform/device_context.h @@ -69,6 +69,7 @@ struct DefaultDeviceContextType { #ifdef PADDLE_WITH_CUDA class EigenCudaStreamDevice; +class CudnnHolder; class CUDADeviceContext : public DeviceContext { public: @@ -96,6 +97,11 @@ class CUDADeviceContext : public DeviceContext { /*! \brief Return cudnn handle in the device context. */ cudnnHandle_t cudnn_handle() const; + /*! \brief Run a cudnn function with the workspace provided by + * CUDADeviceContext */ + void RunCudnnFuncWithWorkspace(const std::function& cudnn_func, + size_t workspace_len) const; + /*! \brief Return cuda stream in the device context. */ cudaStream_t stream() const; @@ -111,8 +117,8 @@ class CUDADeviceContext : public DeviceContext { std::unique_ptr eigen_device_; std::unique_ptr eigen_stream_; + std::unique_ptr cudnn_holder_; cudaStream_t stream_; - cudnnHandle_t cudnn_handle_; cublasHandle_t cublas_handle_; int compute_capability; diff --git a/paddle/fluid/platform/mkldnn_helper.h b/paddle/fluid/platform/mkldnn_helper.h index f6e9a52b275353c03c1f350719766922a97f6cb3..c0a2543ba5d8ff8f34cb6231c51cb5053a6a9481 100644 --- a/paddle/fluid/platform/mkldnn_helper.h +++ b/paddle/fluid/platform/mkldnn_helper.h @@ -192,7 +192,8 @@ class MKLDNNHandler { mkldnn::memory::primitive_desc& user_mpd, // NOLINT const std::shared_ptr user_memory_p, const std::string& suffix, - std::vector& pipeline) { // NOLINT + std::vector& pipeline, // NOLINT + bool is_persistent = false) { // create reorder primitive if the input format is not the preferred one auto local_key = key_ + suffix; auto key_reorder_p = key_ + suffix + "reorder_p"; @@ -213,7 +214,7 @@ class MKLDNNHandler { pipeline.push_back(*reorder_p); } dev_ctx_.SetBlob(local_key, target_memory_p); - } else { + } else if (!is_persistent) { // Make reorder if needed auto reorder_p = std::static_pointer_cast( dev_ctx_.GetBlob(key_reorder_p)); diff --git a/paddle/fluid/pybind/protobuf.cc b/paddle/fluid/pybind/protobuf.cc index f21f8d23f99c27529b2ed1995c92fd4eee4a5807..67501186d150171728194f23bc02d2c014848dd7 100644 --- a/paddle/fluid/pybind/protobuf.cc +++ b/paddle/fluid/pybind/protobuf.cc @@ -137,7 +137,10 @@ void BindProgramDesc(pybind11::module *m) { PADDLE_ENFORCE(desc->ParseFromString(data), "Fail to parse ProgramDesc from string. This could " "be a bug of Paddle."); - }); + }) + .def("_version", [](pd::ProgramDesc &self) -> int64_t { + return self.Proto()->version().version(); + }); } void BindBlockDesc(pybind11::module *m) { diff --git a/paddle/fluid/pybind/pybind.cc b/paddle/fluid/pybind/pybind.cc index 5b20b87174e42f4dfdd22214e8f9dd20c7296374..8bc30fc123163983f4bddc19af489920db93e0c0 100644 --- a/paddle/fluid/pybind/pybind.cc +++ b/paddle/fluid/pybind/pybind.cc @@ -33,6 +33,7 @@ limitations under the License. 
*/ #include "paddle/fluid/framework/prune.h" #include "paddle/fluid/framework/reader.h" #include "paddle/fluid/framework/selected_rows.h" +#include "paddle/fluid/framework/version.h" #include "paddle/fluid/operators/activation_op.h" #include "paddle/fluid/operators/reader/lod_tensor_blocking_queue.h" #include "paddle/fluid/platform/enforce.h" @@ -530,6 +531,8 @@ All parameter, weight, gradient are variables in Paddle. m.def("set_feed_variable", framework::SetFeedVariable); m.def("get_fetch_variable", framework::GetFetchVariable); + m.def("_is_program_version_supported", IsProgramVersionSupported); + BindProgramDesc(&m); BindBlockDesc(&m); BindVarDsec(&m); @@ -680,7 +683,6 @@ All parameter, weight, gradient are variables in Paddle. const std::string &, Scope *, std::vector &, const ExecutionStrategy &, const BuildStrategy &, size_t, size_t>()) - .def("_bcast_params", &ParallelExecutor::BCastParamsToDevices) // NOTE: even we return a vec* to Python use reference policy. // We still cannot get local_scope from this vector, since the element // of vec will be freed by Python GC. We can only return Scope* diff --git a/paddle/fluid/string/CMakeLists.txt b/paddle/fluid/string/CMakeLists.txt index 1fe7f42ca1c692e4d7034883022852657be8cc20..8572dc1e8e543b552e3ed5a180ec942faf90a624 100644 --- a/paddle/fluid/string/CMakeLists.txt +++ b/paddle/fluid/string/CMakeLists.txt @@ -1,4 +1,5 @@ cc_library(stringpiece SRCS piece.cc) +cc_library(pretty_log SRCS pretty_log.cc) cc_test(stringpiece_test SRCS piece_test.cc DEPS stringpiece glog gflags) cc_test(stringprintf_test SRCS printf_test.cc DEPS glog gflags) cc_test(to_string_test SRCS to_string_test.cc) diff --git a/paddle/fluid/string/pretty_log.cc b/paddle/fluid/string/pretty_log.cc new file mode 100644 index 0000000000000000000000000000000000000000..4534fdc58b81fe03b3a1fc19b55aa62ddbf5eaf1 --- /dev/null +++ b/paddle/fluid/string/pretty_log.cc @@ -0,0 +1,22 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/string/pretty_log.h" +#include + +DEFINE_bool(color, true, "Whether to turn on pretty log"); + +namespace paddle { +namespace string {} // namespace string +} // namespace paddle diff --git a/paddle/fluid/string/pretty_log.h b/paddle/fluid/string/pretty_log.h new file mode 100644 index 0000000000000000000000000000000000000000..a3b4e38f453835828a4a53130e11c854ac3f4a74 --- /dev/null +++ b/paddle/fluid/string/pretty_log.h @@ -0,0 +1,70 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and +// limitations under the License. +#pragma once + +#include +#include +#include +#include +#include +#include "paddle/fluid/string/printf.h" + +DECLARE_bool(color); + +namespace paddle { + +namespace string { + +inline std::string black() { return FLAGS_color ? "\e[30m" : ""; } +inline std::string red() { return FLAGS_color ? "\e[31m" : ""; } +inline std::string b_red() { return FLAGS_color ? "\e[41m" : ""; } +inline std::string green() { return FLAGS_color ? "\e[32m" : ""; } +inline std::string yellow() { return FLAGS_color ? "\e[33m" : ""; } +inline std::string blue() { return FLAGS_color ? "\e[34m" : ""; } +inline std::string purple() { return FLAGS_color ? "\e[35m" : ""; } +inline std::string cyan() { return FLAGS_color ? "\e[36m" : ""; } +inline std::string light_gray() { return FLAGS_color ? "\e[37m" : ""; } +inline std::string white() { return FLAGS_color ? "\e[37m" : ""; } +inline std::string light_red() { return FLAGS_color ? "\e[91m" : ""; } +inline std::string dim() { return FLAGS_color ? "\e[2m" : ""; } +inline std::string bold() { return FLAGS_color ? "\e[1m" : ""; } +inline std::string underline() { return FLAGS_color ? "\e[4m" : ""; } +inline std::string blink() { return FLAGS_color ? "\e[5m" : ""; } +inline std::string reset() { return FLAGS_color ? "\e[0m" : ""; } + +using TextBlock = std::pair; + +struct Style { + static std::string info() { return black(); } + static std::string warn() { return b_red(); } + static std::string suc() { return green(); } + static std::string H1() { return bold() + purple(); } + static std::string H2() { return green(); } + static std::string H3() { return green(); } + static std::string detail() { return light_gray(); } +}; + +template +static void PrettyLogEndl(const std::string& style, const char* fmt, + const Args&... args) { + std::cerr << style << Sprintf(fmt, args...) << reset() << std::endl; +} +template +static void PrettyLog(const std::string& style, const char* fmt, + const Args&... args) { + std::cerr << style << Sprintf(fmt, args...) << reset(); +} + +} // namespace string +} // namespace paddle diff --git a/python/paddle/fluid/framework.py b/python/paddle/fluid/framework.py index 633d2334cf74eef1a8b86b68494595cca6c92ccd..e809efaf9f5452493a5a5a2f6d1e0d2a07be1305 100644 --- a/python/paddle/fluid/framework.py +++ b/python/paddle/fluid/framework.py @@ -1569,6 +1569,9 @@ class Program(object): """ return self.desc + def _version(self): + return self.desc._version() + def clone(self, for_test=False): """ Create a new, duplicated program. diff --git a/python/paddle/fluid/io.py b/python/paddle/fluid/io.py index 5c4ec99c533829240ac0bcc9647acb870f3412f8..656fafa0cb54d70e0eba8ec2bef21488c50d8d94 100644 --- a/python/paddle/fluid/io.py +++ b/python/paddle/fluid/io.py @@ -750,6 +750,10 @@ def load_inference_model(dirname, program_desc_str = f.read() program = Program.parse_from_string(program_desc_str) + if not core._is_program_version_supported(program._version()): + raise ValueError("Unsupported program version: %d\n" % + program._version()) + # Binary data also need versioning. 
load_persistables(executor, dirname, program, params_filename) if pserver_endpoints: diff --git a/python/paddle/fluid/layers/detection.py b/python/paddle/fluid/layers/detection.py index 1bc1dbbecaccd328d84cd3364a50c8f828d823c0..1c73c837e2aa422b67704e171f66f5cd48e171ce 100644 --- a/python/paddle/fluid/layers/detection.py +++ b/python/paddle/fluid/layers/detection.py @@ -55,15 +55,19 @@ for _OP in set(__auto__): globals()[_OP] = generate_layer_fn(_OP) -def rpn_target_assign(loc, - scores, +def rpn_target_assign(bbox_pred, + cls_logits, anchor_box, anchor_var, - gt_box, + gt_boxes, + is_crowd, + im_info, rpn_batch_size_per_im=256, - fg_fraction=0.25, + rpn_straddle_thresh=0.0, + rpn_fg_fraction=0.5, rpn_positive_overlap=0.7, - rpn_negative_overlap=0.3): + rpn_negative_overlap=0.3, + use_random=True): """ ** Target Assign Layer for region proposal network (RPN) in Faster-RCNN detection. ** @@ -83,14 +87,13 @@ def rpn_target_assign(loc, the positive anchors. Args: - loc(Variable): A 3-D Tensor with shape [N, M, 4] represents the + bbox_pred(Variable): A 3-D Tensor with shape [N, M, 4] represents the predicted locations of M bounding bboxes. N is the batch size, and each bounding box has four coordinate values and the layout is [xmin, ymin, xmax, ymax]. - scores(Variable): A 3-D Tensor with shape [N, M, C] represents the - predicted confidence predictions. N is the batch size, C is the - class number, M is number of bounding boxes. For each category - there are total M scores which corresponding M bounding boxes. + cls_logits(Variable): A 3-D Tensor with shape [N, M, 1] represents the + predicted confidence scores. N is the batch size, 1 is the + foreground/background sigmoid score, M is the number of bounding boxes. anchor_box(Variable): A 2-D Tensor with shape [M, 4] holds M boxes, each box is represented as [xmin, ymin, xmax, ymax], [xmin, ymin] is the left top coordinate of the anchor box, @@ -99,11 +102,16 @@ def rpn_target_assign(loc, coordinate of the anchor box. anchor_var(Variable): A 2-D Tensor with shape [M,4] holds expanded variances of anchors. - gt_box (Variable): The ground-truth boudding boxes (bboxes) are a 2D + gt_boxes (Variable): The ground-truth bounding boxes (bboxes) are a 2D LoDTensor with shape [Ng, 4], Ng is the total number of ground-truth bboxes of mini-batch input. + is_crowd (Variable): A 1-D LoDTensor which indicates whether the ground-truth is crowd. + im_info (Variable): A 2-D LoDTensor with shape [N, 3]. N is the batch size, + 3 is the height, width and scale. rpn_batch_size_per_im(int): Total number of RPN examples per image. - fg_fraction(float): Target fraction of RoI minibatch that is labeled + rpn_straddle_thresh(float): Remove RPN anchors that go outside the image + by straddle_thresh pixels. + rpn_fg_fraction(float): Target fraction of RoI minibatch that is labeled foreground (i.e. class > 0), 0-th class is background. rpn_positive_overlap(float): Minimum overlap required between an anchor and ground-truth box for the (anchor, gt box) pair to be a positive @@ -129,45 +137,48 @@ def rpn_target_assign(loc, Examples: ..
code-block:: python - loc = layers.data(name='location', shape=[2, 80], + bbox_pred = layers.data(name='bbox_pred', shape=[100, 4], append_batch_size=False, dtype='float32') - scores = layers.data(name='scores', shape=[2, 40], + cls_logits = layers.data(name='cls_logits', shape=[100, 1], append_batch_size=False, dtype='float32') anchor_box = layers.data(name='anchor_box', shape=[20, 4], append_batch_size=False, dtype='float32') - gt_box = layers.data(name='gt_box', shape=[10, 4], + gt_boxes = layers.data(name='gt_boxes', shape=[10, 4], append_batch_size=False, dtype='float32') loc_pred, score_pred, loc_target, score_target = - fluid.layers.detection_output(loc=location, - scores=scores, + fluid.layers.rpn_target_assign(bbox_pred=bbox_pred, + cls_logits=cls_logits, anchor_box=anchor_box, - gt_box=gt_box) + gt_boxes=gt_boxes) """ helper = LayerHelper('rpn_target_assign', **locals()) - # Compute overlaps between the prior boxes and the gt boxes overlaps - iou = iou_similarity(x=gt_box, y=anchor_box) # Assign target label to anchors loc_index = helper.create_tmp_variable(dtype='int32') score_index = helper.create_tmp_variable(dtype='int32') - target_label = helper.create_tmp_variable(dtype='int64') + target_label = helper.create_tmp_variable(dtype='int32') target_bbox = helper.create_tmp_variable(dtype=anchor_box.dtype) helper.append_op( type="rpn_target_assign", - inputs={'Anchor': anchor_box, - 'GtBox': gt_box, - 'DistMat': iou}, + inputs={ + 'Anchor': anchor_box, + 'GtBoxes': gt_boxes, + 'IsCrowd': is_crowd, + 'ImInfo': im_info + }, outputs={ 'LocationIndex': loc_index, 'ScoreIndex': score_index, 'TargetLabel': target_label, - 'TargetBBox': target_bbox, + 'TargetBBox': target_bbox }, attrs={ 'rpn_batch_size_per_im': rpn_batch_size_per_im, + 'rpn_straddle_thresh': rpn_straddle_thresh, 'rpn_positive_overlap': rpn_positive_overlap, 'rpn_negative_overlap': rpn_negative_overlap, - 'fg_fraction': fg_fraction + 'rpn_fg_fraction': rpn_fg_fraction, + 'use_random': use_random }) loc_index.stop_gradient = True @@ -175,12 +186,12 @@ def rpn_target_assign(loc, target_label.stop_gradient = True target_bbox.stop_gradient = True - scores = nn.reshape(x=scores, shape=(-1, 1)) - loc = nn.reshape(x=loc, shape=(-1, 4)) - predicted_scores = nn.gather(scores, score_index) - predicted_location = nn.gather(loc, loc_index) + cls_logits = nn.reshape(x=cls_logits, shape=(-1, 1)) + bbox_pred = nn.reshape(x=bbox_pred, shape=(-1, 4)) + predicted_cls_logits = nn.gather(cls_logits, score_index) + predicted_bbox_pred = nn.gather(bbox_pred, loc_index) - return predicted_scores, predicted_location, target_label, target_bbox + return predicted_cls_logits, predicted_bbox_pred, target_label, target_bbox def detection_output(loc, @@ -1258,15 +1269,17 @@ def anchor_generator(input, def generate_proposal_labels(rpn_rois, gt_classes, + is_crowd, gt_boxes, - im_scales, + im_info, batch_size_per_im=256, fg_fraction=0.25, fg_thresh=0.25, bg_thresh_hi=0.5, bg_thresh_lo=0.0, bbox_reg_weights=[0.1, 0.1, 0.2, 0.2], - class_nums=None): + class_nums=None, + use_random=True): """ ** Generate proposal labels Faster-RCNN ** TODO(buxingyuan): Add Document @@ -1285,8 +1298,9 @@ def generate_proposal_labels(rpn_rois, inputs={ 'RpnRois': rpn_rois, 'GtClasses': gt_classes, + 'IsCrowd': is_crowd, 'GtBoxes': gt_boxes, - 'ImScales': im_scales + 'ImInfo': im_info }, outputs={ 'Rois': rois, @@ -1302,7 +1316,8 @@ def generate_proposal_labels(rpn_rois, 'bg_thresh_hi': bg_thresh_hi, 'bg_thresh_lo': bg_thresh_lo, 'bbox_reg_weights': bbox_reg_weights, - 
'class_nums': class_nums + 'class_nums': class_nums, + 'use_random': use_random }) rois.stop_gradient = True diff --git a/python/paddle/fluid/layers/nn.py b/python/paddle/fluid/layers/nn.py index 8af1f7b1bd48c56c584e415e44d08c13c2223879..81257f50565e574a67b7a86c269477af292a4b17 100644 --- a/python/paddle/fluid/layers/nn.py +++ b/python/paddle/fluid/layers/nn.py @@ -968,7 +968,7 @@ def dropout(x, dropout_prob, is_test=False, seed=None, name=None): return out -def cross_entropy(input, label, soft_label=False): +def cross_entropy(input, label, soft_label=False, ignore_index=-100): """ **Cross Entropy Layer** @@ -1012,7 +1012,10 @@ tensor with shape [N x D]. soft_label (bool): a flag indicating whether to interpretate the given labels as soft - labels, default `False`. + labels. Default: `False`. + ignore_index (int): Specifies a target value that is ignored and does + not contribute to the input gradient. Only valid + if soft_label is set to False. Default: -100 Returns: A 2-D tensor with shape [N x 1], the cross entropy loss. @@ -1037,7 +1040,8 @@ inputs={'X': [input], 'Label': [label]}, outputs={'Y': [out]}, - attrs={"soft_label": soft_label}) + attrs={"soft_label": soft_label, + "ignore_index": ignore_index}) return out @@ -4249,7 +4253,10 @@ def multiplex(inputs, index): return out -def softmax_with_cross_entropy(logits, label, soft_label=False): +def softmax_with_cross_entropy(logits, + label, + soft_label=False, + ignore_index=-100): """ **Softmax With Cross Entropy Operator.** @@ -4291,6 +4298,10 @@ soft_label is set to true, Label is a Tensor with soft_label (bool): A flag to indicate whether to interpretate the given labels as soft labels. By default, `soft_label` is set to False. + ignore_index (int): Specifies a target value that is ignored and does + not contribute to the input gradient. Only valid + if soft_label is set to False. Default: -100 + Returns: Variable: The cross entropy loss is a 2-D tensor with shape [N x 1]. @@ -4312,7 +4323,8 @@ 'Label': label}, outputs={'Softmax': softmax, 'Loss': loss}, - attrs={'soft_label': soft_label}) + attrs={'soft_label': soft_label, + 'ignore_index': ignore_index}) return loss diff --git a/python/paddle/fluid/parallel_executor.py b/python/paddle/fluid/parallel_executor.py index 4790e0f6119e96b11b049bfdd3b46d40a382683b..44af29d3390e35129d0ee65b31eacad6b28a9d60 100644 --- a/python/paddle/fluid/parallel_executor.py +++ b/python/paddle/fluid/parallel_executor.py @@ -128,6 +128,13 @@ class ParallelExecutor(object): os.environ.get('CPU_NUM', multiprocessing.cpu_count())) exec_strategy.num_threads = cpu_num * 2 + # Set the thread num to 1 under the nccl2 distributed + # environment to make sure all GPUs run ops in the same order. + if num_trainers > 1: + assert (use_cuda) + # FIXME(gongwb): avoid this set. + exec_strategy.num_threads = 1 + if build_strategy is None: build_strategy = BuildStrategy() @@ -135,11 +142,6 @@ main = main if main else framework.default_main_program() if scope == None: scope = executor.global_scope() - # FIXME(Yancey1989): it's a temporary approach to determinate the distribute - # train program, call self.bcast_param() at the end of each mini-batch.
- self.is_dist = True if "recv" in [ - op.type for op in main.global_block().ops - ] else False if share_vars_from and not isinstance(share_vars_from, ParallelExecutor): @@ -279,21 +281,11 @@ class ParallelExecutor(object): self.executor.run(fetch_list, fetch_var_name) arr = self.scope.find_var(fetch_var_name).get_lod_tensor_array() - if self.is_dist: - self._bcast_params() - if return_numpy: return executor.as_numpy(arr) return [arr[i] for i in range(len(arr))] - def _bcast_params(self): - """ - Broadcast the parameters to other devices. It is used during - distributed training. - """ - self.executor._bcast_params(set(self.persistable_vars)) - @property def device_count(self): return len(self._act_places) diff --git a/python/paddle/fluid/tests/book/high-level-api/image_classification/test_image_classification_resnet.py b/python/paddle/fluid/tests/book/high-level-api/image_classification/test_image_classification_resnet.py index e5ae95e2d943917b9bc10f0d4c4bdc5f8fb07fdb..de276755bb1eb2746cc780575a40357255223809 100644 --- a/python/paddle/fluid/tests/book/high-level-api/image_classification/test_image_classification_resnet.py +++ b/python/paddle/fluid/tests/book/high-level-api/image_classification/test_image_classification_resnet.py @@ -178,7 +178,4 @@ if __name__ == '__main__': for parallel in (False, True): if use_cuda and not core.is_compiled_with_cuda(): continue - # TODO(minqiyang): remove this line after fixing the deletion - # order problem of Scope in ParallelExecutor in manylinux - if six.PY2: - main(use_cuda=use_cuda, parallel=parallel) + main(use_cuda=use_cuda, parallel=parallel) diff --git a/python/paddle/fluid/tests/book/high-level-api/image_classification/test_image_classification_vgg.py b/python/paddle/fluid/tests/book/high-level-api/image_classification/test_image_classification_vgg.py index ff91be72c918f8dac65b7030e45c4a00deb965ac..dd547f3448ae55c07b6c09f9de4ac08d8ec5ee88 100644 --- a/python/paddle/fluid/tests/book/high-level-api/image_classification/test_image_classification_vgg.py +++ b/python/paddle/fluid/tests/book/high-level-api/image_classification/test_image_classification_vgg.py @@ -152,7 +152,4 @@ if __name__ == '__main__': for parallel in (False, True): if use_cuda and not core.is_compiled_with_cuda(): continue - # TODO(minqiyang): remove this line after fixing the deletion - # order problem of Scope in ParallelExecutor in manylinux - if six.PY2: - main(use_cuda=use_cuda, parallel=parallel) + main(use_cuda=use_cuda, parallel=parallel) diff --git a/python/paddle/fluid/tests/book/high-level-api/recognize_digits/test_recognize_digits_conv.py b/python/paddle/fluid/tests/book/high-level-api/recognize_digits/test_recognize_digits_conv.py index fa72c939e57356f26d60032dd0a91c894b28c505..973308498bec3cddde2ef651751ad5d0c9f84503 100644 --- a/python/paddle/fluid/tests/book/high-level-api/recognize_digits/test_recognize_digits_conv.py +++ b/python/paddle/fluid/tests/book/high-level-api/recognize_digits/test_recognize_digits_conv.py @@ -155,7 +155,4 @@ if __name__ == '__main__': for parallel in (False, True): if use_cuda and not core.is_compiled_with_cuda(): continue - # TODO(minqiyang): remove this line after fixing the deletion - # order problem of Scope in ParallelExecutor in manylinux - if six.PY2: - main(use_cuda=use_cuda, parallel=parallel) + main(use_cuda=use_cuda, parallel=parallel) diff --git a/python/paddle/fluid/tests/book/high-level-api/recognize_digits/test_recognize_digits_mlp.py 
b/python/paddle/fluid/tests/book/high-level-api/recognize_digits/test_recognize_digits_mlp.py index 440d2a30835cb89089709f024a4dcc6e4113efa8..cb4aeb430e1a9662a183084c0cdacc41c5a8ec11 100644 --- a/python/paddle/fluid/tests/book/high-level-api/recognize_digits/test_recognize_digits_mlp.py +++ b/python/paddle/fluid/tests/book/high-level-api/recognize_digits/test_recognize_digits_mlp.py @@ -137,7 +137,4 @@ if __name__ == '__main__': for parallel in (False, True): if use_cuda and not core.is_compiled_with_cuda(): continue - # TODO(minqiyang): remove this line after fixing the deletion - # order problem of Scope in ParallelExecutor in manylinux - if six.PY2: - main(use_cuda=use_cuda, parallel=parallel) + main(use_cuda=use_cuda, parallel=parallel) diff --git a/python/paddle/fluid/tests/test_detection.py b/python/paddle/fluid/tests/test_detection.py index e2564763d19d180f7c6933429dddf58c77be7bb8..56129641ce5900d82aedf243d2fa1eadfd6b8d86 100644 --- a/python/paddle/fluid/tests/test_detection.py +++ b/python/paddle/fluid/tests/test_detection.py @@ -148,51 +148,60 @@ class TestAnchorGenerator(unittest.TestCase): class TestGenerateProposalLabels(unittest.TestCase): def test_generate_proposal_labels(self): - rpn_rois = layers.data( - name='rpn_rois', - shape=[4, 4], - dtype='float32', - lod_level=1, - append_batch_size=False) - gt_classes = layers.data( - name='gt_classes', - shape=[6], - dtype='int32', - lod_level=1, - append_batch_size=False) - gt_boxes = layers.data( - name='gt_boxes', - shape=[6, 4], - dtype='float32', - lod_level=1, - append_batch_size=False) - im_scales = layers.data( - name='im_scales', - shape=[1], - dtype='float32', - lod_level=1, - append_batch_size=False) - class_nums = 5 - rois, labels_int32, bbox_targets, bbox_inside_weights, bbox_outside_weights = fluid.layers.generate_proposal_labels( - rpn_rois=rpn_rois, - gt_classes=gt_classes, - gt_boxes=gt_boxes, - im_scales=im_scales, - batch_size_per_im=2, - fg_fraction=0.5, - fg_thresh=0.5, - bg_thresh_hi=0.5, - bg_thresh_lo=0.0, - bbox_reg_weights=[0.1, 0.1, 0.2, 0.2], - class_nums=class_nums) - assert rois.shape[1] == 4 - assert rois.shape[0] == labels_int32.shape[0] - assert rois.shape[0] == bbox_targets.shape[0] - assert rois.shape[0] == bbox_inside_weights.shape[0] - assert rois.shape[0] == bbox_outside_weights.shape[0] - assert bbox_targets.shape[1] == 4 * class_nums - assert bbox_inside_weights.shape[1] == 4 * class_nums - assert bbox_outside_weights.shape[1] == 4 * class_nums + program = Program() + with program_guard(program): + rpn_rois = layers.data( + name='rpn_rois', + shape=[4, 4], + dtype='float32', + lod_level=1, + append_batch_size=False) + gt_classes = layers.data( + name='gt_classes', + shape=[6], + dtype='int32', + lod_level=1, + append_batch_size=False) + is_crowd = layers.data( + name='is_crowd', + shape=[6], + dtype='int32', + lod_level=1, + append_batch_size=False) + gt_boxes = layers.data( + name='gt_boxes', + shape=[6, 4], + dtype='float32', + lod_level=1, + append_batch_size=False) + im_info = layers.data( + name='im_info', + shape=[1, 3], + dtype='float32', + lod_level=1, + append_batch_size=False) + class_nums = 5 + rois, labels_int32, bbox_targets, bbox_inside_weights, bbox_outside_weights = fluid.layers.generate_proposal_labels( + rpn_rois=rpn_rois, + gt_classes=gt_classes, + is_crowd=is_crowd, + gt_boxes=gt_boxes, + im_info=im_info, + batch_size_per_im=2, + fg_fraction=0.5, + fg_thresh=0.5, + bg_thresh_hi=0.5, + bg_thresh_lo=0.0, + bbox_reg_weights=[0.1, 0.1, 0.2, 0.2], + class_nums=class_nums) + 
assert rois.shape[1] == 4 + assert rois.shape[0] == labels_int32.shape[0] + assert rois.shape[0] == bbox_targets.shape[0] + assert rois.shape[0] == bbox_inside_weights.shape[0] + assert rois.shape[0] == bbox_outside_weights.shape[0] + assert bbox_targets.shape[1] == 4 * class_nums + assert bbox_inside_weights.shape[1] == 4 * class_nums + assert bbox_outside_weights.shape[1] == 4 * class_nums class TestMultiBoxHead(unittest.TestCase): @@ -254,18 +263,18 @@ class TestRpnTargetAssign(unittest.TestCase): def test_rpn_target_assign(self): program = Program() with program_guard(program): - loc_shape = [10, 50, 4] - score_shape = [10, 50, 2] + bbox_pred_shape = [10, 50, 4] + cls_logits_shape = [10, 50, 2] anchor_shape = [50, 4] - loc = layers.data( - name='loc', - shape=loc_shape, + bbox_pred = layers.data( + name='bbox_pred', + shape=bbox_pred_shape, append_batch_size=False, dtype='float32') - scores = layers.data( - name='scores', - shape=score_shape, + cls_logits = layers.data( + name='cls_logits', + shape=cls_logits_shape, append_batch_size=False, dtype='float32') anchor_box = layers.data( @@ -278,17 +287,31 @@ class TestRpnTargetAssign(unittest.TestCase): shape=anchor_shape, append_batch_size=False, dtype='float32') - gt_box = layers.data( - name='gt_box', shape=[4], lod_level=1, dtype='float32') - + gt_boxes = layers.data( + name='gt_boxes', shape=[4], lod_level=1, dtype='float32') + is_crowd = layers.data( + name='is_crowd', + shape=[10], + dtype='int32', + lod_level=1, + append_batch_size=False) + im_info = layers.data( + name='im_info', + shape=[1, 3], + dtype='float32', + lod_level=1, + append_batch_size=False) pred_scores, pred_loc, tgt_lbl, tgt_bbox = layers.rpn_target_assign( - loc=loc, - scores=scores, + bbox_pred=bbox_pred, + cls_logits=cls_logits, anchor_box=anchor_box, anchor_var=anchor_var, - gt_box=gt_box, + gt_boxes=gt_boxes, + is_crowd=is_crowd, + im_info=im_info, rpn_batch_size_per_im=256, - fg_fraction=0.25, + rpn_straddle_thresh=0.0, + rpn_fg_fraction=0.5, rpn_positive_overlap=0.7, rpn_negative_overlap=0.3) diff --git a/python/paddle/fluid/tests/unittests/CMakeLists.txt b/python/paddle/fluid/tests/unittests/CMakeLists.txt index 8ac1cb164e158cf38d1c0570f5bf37ee6a6badae..9892d3f8075d21b9aa01cfda0bb73e4d12008852 100644 --- a/python/paddle/fluid/tests/unittests/CMakeLists.txt +++ b/python/paddle/fluid/tests/unittests/CMakeLists.txt @@ -28,6 +28,10 @@ list(REMOVE_ITEM TEST_OPS test_cond_op) # FIXME(qijun): https://github.com/Paddl list(REMOVE_ITEM TEST_OPS op_test) # op_test is a helper python file, not a test list(REMOVE_ITEM TEST_OPS decorators) # decorators is a helper python file, not a test +if(APPLE) + # this op is not supported on mac + list(REMOVE_ITEM TEST_OPS test_fusion_seqexpand_concat_fc_op) +endif() function(py_test_modules TARGET_NAME) if(WITH_TESTING) @@ -46,6 +50,7 @@ function(py_test_modules TARGET_NAME) endfunction() list(REMOVE_ITEM TEST_OPS test_warpctc_op) list(REMOVE_ITEM TEST_OPS test_dist_train) +list(REMOVE_ITEM TEST_OPS test_dist_transpiler) list(REMOVE_ITEM TEST_OPS test_parallel_executor_crf) list(REMOVE_ITEM TEST_OPS test_parallel_executor_fetch_feed) list(REMOVE_ITEM TEST_OPS test_dist_se_resnext) @@ -61,11 +66,12 @@ if(WITH_DISTRIBUTE) set_tests_properties(test_listen_and_serv_op PROPERTIES TIMEOUT 20) set_tests_properties(test_dist_mnist PROPERTIES TIMEOUT 200) set_tests_properties(test_dist_word2vec PROPERTIES TIMEOUT 200) + py_test_modules(test_dist_transpiler MODULES test_dist_transpiler) + py_test_modules(test_dist_transformer MODULES
test_dist_transformer SERIAL) + py_test_modules(test_dist_se_resnext MODULES test_dist_se_resnext SERIAL) endif() py_test_modules(test_parallel_executor_crf MODULES test_parallel_executor_crf SERIAL) py_test_modules(test_parallel_executor_fetch_feed MODULES test_parallel_executor_fetch_feed SERIAL) set_tests_properties(test_parallel_executor_fetch_feed PROPERTIES TIMEOUT 150) -py_test_modules(test_dist_transformer MODULES test_dist_transformer SERIAL) -py_test_modules(test_dist_se_resnext MODULES test_dist_se_resnext SERIAL) py_test_modules(test_parallel_executor_transformer MODULES test_parallel_executor_transformer SERIAL) py_test_modules(test_image_classification_resnet MODULES test_image_classification_resnet SERIAL) diff --git a/python/paddle/fluid/tests/unittests/test_cross_entropy_op.py b/python/paddle/fluid/tests/unittests/test_cross_entropy_op.py index fa367f95fc9c65dd782d53a2799cacadf74dcfd2..f22badbea0c67b210f7ac4e14e5d647f1cffa6cc 100644 --- a/python/paddle/fluid/tests/unittests/test_cross_entropy_op.py +++ b/python/paddle/fluid/tests/unittests/test_cross_entropy_op.py @@ -209,5 +209,34 @@ class TestCrossEntropyOp6(OpTest): ["X"], "Y", max_relative_error=0.05, numeric_grad_delta=0.001) +class TestCrossEntropyOp7(OpTest): + """Test cross-entropy with ignore index. + """ + + def setUp(self): + self.op_type = "cross_entropy" + batch_size = 30 + class_num = 10 + ignore_index = 3 + + X = randomize_probability(batch_size, class_num, dtype='float64') + + label = np.random.randint(0, class_num, (batch_size, 1), dtype="int64") + cross_entropy = np.asmatrix( + [[-np.log(X[i][label[i][0]])] + if label[i][0] != ignore_index else [0] + for i in range(X.shape[0])], + dtype="float64") + self.inputs = {"X": X, "Label": label} + self.outputs = {"Y": cross_entropy} + self.attrs = {"soft_label": False, "ignore_index": ignore_index} + + def test_check_output(self): + self.check_output() + + def test_check_grad(self): + self.check_grad(["X"], "Y", numeric_grad_delta=0.001) + + if __name__ == "__main__": unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_data_balance.py b/python/paddle/fluid/tests/unittests/test_data_balance.py index e39eedd282daf6bbe0603a22c357e06c95c086b6..4bd24510bc8ac7f0fbaad3fd1919ab589cd21c4b 100644 --- a/python/paddle/fluid/tests/unittests/test_data_balance.py +++ b/python/paddle/fluid/tests/unittests/test_data_balance.py @@ -84,7 +84,7 @@ class TestDataBalance(unittest.TestCase): self.data_file_name = './data_balance_test.recordio' self.lod_data_file_name = './data_balance_with_lod_test.recordio' self.total_ins_num = 50 - self.batch_size = 10 + self.batch_size = 12 self.prepare_data() self.prepare_lod_data() diff --git a/python/paddle/fluid/tests/unittests/test_dist_transpiler.py b/python/paddle/fluid/tests/unittests/test_dist_transpiler.py index b85501ef6b80d1f5004aa0dd08c3123d3bda48a5..a198b25520f97ce23b9c1ebb9cd82fc458222d73 100644 --- a/python/paddle/fluid/tests/unittests/test_dist_transpiler.py +++ b/python/paddle/fluid/tests/unittests/test_dist_transpiler.py @@ -62,7 +62,7 @@ class TranspilerTest(unittest.TestCase): t = self._transpiler_instance(config) - trainer_main = t.get_trainer_program() + trainer_main = t.get_trainer_program(wait_port=False) trainer_startup = fluid.default_startup_program() assert (src.num_blocks == 1) diff --git a/python/paddle/fluid/tests/unittests/test_generate_proposal_labels.py b/python/paddle/fluid/tests/unittests/test_generate_proposal_labels_op.py similarity index 77% rename from 
python/paddle/fluid/tests/unittests/test_generate_proposal_labels.py rename to python/paddle/fluid/tests/unittests/test_generate_proposal_labels_op.py index 6dc101b6dad8813893c6a891da0e16f952bb4c2d..2d5cd3b24bff52d82353ccf3fd2ecb69166c66c6 100644 --- a/python/paddle/fluid/tests/unittests/test_generate_proposal_labels.py +++ b/python/paddle/fluid/tests/unittests/test_generate_proposal_labels_op.py @@ -20,10 +20,10 @@ import paddle.fluid as fluid from op_test import OpTest -def generate_proposal_labels_in_python( - rpn_rois, gt_classes, gt_boxes, im_scales, batch_size_per_im, - fg_fraction, fg_thresh, bg_thresh_hi, bg_thresh_lo, bbox_reg_weights, - class_nums): +def generate_proposal_labels_in_python(rpn_rois, gt_classes, is_crowd, gt_boxes, + im_info, batch_size_per_im, fg_fraction, + fg_thresh, bg_thresh_hi, bg_thresh_lo, + bbox_reg_weights, class_nums): rois = [] labels_int32 = [] bbox_targets = [] @@ -31,13 +31,13 @@ def generate_proposal_labels_in_python( bbox_outside_weights = [] lod = [] assert len(rpn_rois) == len( - im_scales), 'batch size of rpn_rois and ground_truth is not matched' + im_info), 'batch size of rpn_rois and ground_truth is not matched' - for im_i in range(len(im_scales)): + for im_i in range(len(im_info)): frcn_blobs = _sample_rois( - rpn_rois[im_i], gt_classes[im_i], gt_boxes[im_i], im_scales[im_i], - batch_size_per_im, fg_fraction, fg_thresh, bg_thresh_hi, - bg_thresh_lo, bbox_reg_weights, class_nums) + rpn_rois[im_i], gt_classes[im_i], is_crowd[im_i], gt_boxes[im_i], + im_info[im_i], batch_size_per_im, fg_fraction, fg_thresh, + bg_thresh_hi, bg_thresh_lo, bbox_reg_weights, class_nums) lod.append(frcn_blobs['rois'].shape[0]) @@ -50,13 +50,14 @@ def generate_proposal_labels_in_python( return rois, labels_int32, bbox_targets, bbox_inside_weights, bbox_outside_weights, lod -def _sample_rois(rpn_rois, gt_classes, gt_boxes, im_scale, batch_size_per_im, - fg_fraction, fg_thresh, bg_thresh_hi, bg_thresh_lo, - bbox_reg_weights, class_nums): +def _sample_rois(rpn_rois, gt_classes, is_crowd, gt_boxes, im_info, + batch_size_per_im, fg_fraction, fg_thresh, bg_thresh_hi, + bg_thresh_lo, bbox_reg_weights, class_nums): rois_per_image = int(batch_size_per_im) fg_rois_per_im = int(np.round(fg_fraction * rois_per_image)) # Roidb + im_scale = im_info[2] inv_im_scale = 1. 
/ im_scale rpn_rois = rpn_rois * inv_im_scale @@ -78,6 +79,9 @@ def _sample_rois(rpn_rois, gt_classes, gt_boxes, im_scale, batch_size_per_im, box_to_gt_ind_map[overlapped_boxes_ind] = overlaps_argmax[ overlapped_boxes_ind] + crowd_ind = np.where(is_crowd)[0] + gt_overlaps[crowd_ind] = -1 + max_overlaps = gt_overlaps.max(axis=1) max_classes = gt_overlaps.argmax(axis=1) @@ -85,9 +89,10 @@ def _sample_rois(rpn_rois, gt_classes, gt_boxes, im_scale, batch_size_per_im, fg_inds = np.where(max_overlaps >= fg_thresh)[0] fg_rois_per_this_image = np.minimum(fg_rois_per_im, fg_inds.shape[0]) # Sample foreground if there are too many - if fg_inds.shape[0] > fg_rois_per_this_image: - fg_inds = np.random.choice( - fg_inds, size=fg_rois_per_this_image, replace=False) + # if fg_inds.shape[0] > fg_rois_per_this_image: + # fg_inds = np.random.choice( + # fg_inds, size=fg_rois_per_this_image, replace=False) + fg_inds = fg_inds[:fg_rois_per_this_image] # Background bg_inds = np.where((max_overlaps < bg_thresh_hi) & (max_overlaps >= @@ -96,9 +101,10 @@ def _sample_rois(rpn_rois, gt_classes, gt_boxes, im_scale, batch_size_per_im, bg_rois_per_this_image = np.minimum(bg_rois_per_this_image, bg_inds.shape[0]) # Sample background if there are too many - if bg_inds.shape[0] > bg_rois_per_this_image: - bg_inds = np.random.choice( - bg_inds, size=bg_rois_per_this_image, replace=False) + # if bg_inds.shape[0] > bg_rois_per_this_image: + # bg_inds = np.random.choice( + # bg_inds, size=bg_rois_per_this_image, replace=False) + bg_inds = bg_inds[:bg_rois_per_this_image] keep_inds = np.append(fg_inds, bg_inds) sampled_labels = max_classes[keep_inds] @@ -208,8 +214,9 @@ class TestGenerateProposalLabelsOp(OpTest): self.inputs = { 'RpnRois': (self.rpn_rois[0], self.rpn_rois_lod), 'GtClasses': (self.gt_classes[0], self.gts_lod), + 'IsCrowd': (self.is_crowd[0], self.gts_lod), 'GtBoxes': (self.gt_boxes[0], self.gts_lod), - 'ImScales': self.im_scales[0] + 'ImInfo': self.im_info } self.attrs = { 'batch_size_per_im': self.batch_size_per_im, @@ -218,14 +225,15 @@ class TestGenerateProposalLabelsOp(OpTest): 'bg_thresh_hi': self.bg_thresh_hi, 'bg_thresh_lo': self.bg_thresh_lo, 'bbox_reg_weights': self.bbox_reg_weights, - 'class_nums': self.class_nums + 'class_nums': self.class_nums, + 'use_random': False } self.outputs = { - 'Rois': (self.rois[0], [self.lod]), - 'LabelsInt32': (self.labels_int32[0], [self.lod]), - 'BboxTargets': (self.bbox_targets[0], [self.lod]), - 'BboxInsideWeights': (self.bbox_inside_weights[0], [self.lod]), - 'BboxOutsideWeights': (self.bbox_outside_weights[0], [self.lod]), + 'Rois': (self.rois, [self.lod]), + 'LabelsInt32': (self.labels_int32, [self.lod]), + 'BboxTargets': (self.bbox_targets, [self.lod]), + 'BboxInsideWeights': (self.bbox_inside_weights, [self.lod]), + 'BboxOutsideWeights': (self.bbox_outside_weights, [self.lod]), } def test_check_output(self): @@ -236,8 +244,8 @@ class TestGenerateProposalLabelsOp(OpTest): self.set_data() def init_test_params(self): - self.batch_size_per_im = 10 - self.fg_fraction = 1.0 + self.batch_size_per_im = 512 + self.fg_fraction = 0.25 self.fg_thresh = 0.5 self.bg_thresh_hi = 0.5 self.bg_thresh_lo = 0.0 @@ -246,14 +254,14 @@ class TestGenerateProposalLabelsOp(OpTest): def init_test_input(self): np.random.seed(0) - image_nums = 1 gt_nums = 6 # Keep same with batch_size_per_im for unittest - proposal_nums = self.batch_size_per_im - gt_nums - images_shape = [] - self.im_scales = [] - for i in range(image_nums): - images_shape.append(np.random.randint(200, size=2)) - 
self.im_scales.append(np.ones((1)).astype(np.float32)) + proposal_nums = 2000 #self.batch_size_per_im - gt_nums + images_shape = [[64, 64]] + self.im_info = np.ones((len(images_shape), 3)).astype(np.float32) + for i in range(len(images_shape)): + self.im_info[i, 0] = images_shape[i][0] + self.im_info[i, 1] = images_shape[i][1] + self.im_info[i, 2] = 0.8 #scale self.rpn_rois, self.rpn_rois_lod = _generate_proposals(images_shape, proposal_nums) @@ -261,16 +269,23 @@ class TestGenerateProposalLabelsOp(OpTest): images_shape, self.class_nums, gt_nums) self.gt_classes = [gt['gt_classes'] for gt in ground_truth] self.gt_boxes = [gt['boxes'] for gt in ground_truth] + self.is_crowd = [gt['is_crowd'] for gt in ground_truth] def init_test_output(self): self.rois, self.labels_int32, self.bbox_targets, \ self.bbox_inside_weights, self.bbox_outside_weights, \ self.lod = generate_proposal_labels_in_python( - self.rpn_rois, self.gt_classes, self.gt_boxes, self.im_scales, + self.rpn_rois, self.gt_classes, self.is_crowd, self.gt_boxes, self.im_info, self.batch_size_per_im, self.fg_fraction, self.fg_thresh, self.bg_thresh_hi, self.bg_thresh_lo, self.bbox_reg_weights, self.class_nums ) + self.rois = np.vstack(self.rois) + self.labels_int32 = np.hstack(self.labels_int32) + self.labels_int32 = self.labels_int32[:, np.newaxis] + self.bbox_targets = np.vstack(self.bbox_targets) + self.bbox_inside_weights = np.vstack(self.bbox_inside_weights) + self.bbox_outside_weights = np.vstack(self.bbox_outside_weights) def _generate_proposals(images_shape, proposal_nums): @@ -280,7 +295,7 @@ def _generate_proposals(images_shape, proposal_nums): for i, image_shape in enumerate(images_shape): proposals = _generate_boxes(image_shape, proposal_nums) rpn_rois.append(proposals) - num_proposals += len(proposals) + num_proposals = len(proposals) rpn_rois_lod.append(num_proposals) return rpn_rois, [rpn_rois_lod] @@ -294,7 +309,11 @@ def _generate_groundtruth(images_shape, class_nums, gt_nums): gt_classes = np.random.randint( low=1, high=class_nums, size=gt_nums).astype(np.int32) gt_boxes = _generate_boxes(image_shape, gt_nums) - ground_truth.append(dict(gt_classes=gt_classes, boxes=gt_boxes)) + is_crowd = np.zeros((gt_nums), dtype=np.int32) + is_crowd[0] = 1 + ground_truth.append( + dict( + gt_classes=gt_classes, boxes=gt_boxes, is_crowd=is_crowd)) num_gts += len(gt_classes) gts_lod.append(num_gts) return ground_truth, [gts_lod] diff --git a/python/paddle/fluid/tests/unittests/test_generate_proposals.py b/python/paddle/fluid/tests/unittests/test_generate_proposals_op.py similarity index 88% rename from python/paddle/fluid/tests/unittests/test_generate_proposals.py rename to python/paddle/fluid/tests/unittests/test_generate_proposals_op.py index 3fbd2ce95a4f22b91cd4955f914e12f422b0ee83..86e27fe29ed945ec77fbbcdbd1c7cc6ecfba0fd5 100644 --- a/python/paddle/fluid/tests/unittests/test_generate_proposals.py +++ b/python/paddle/fluid/tests/unittests/test_generate_proposals_op.py @@ -114,10 +114,10 @@ def box_coder(all_anchors, bbox_deltas, variances): #anchor_loc: width, height, center_x, center_y anchor_loc = np.zeros_like(bbox_deltas, dtype=np.float32) - anchor_loc[:, 0] = all_anchors[:, 2] - all_anchors[:, 0] - anchor_loc[:, 1] = all_anchors[:, 3] - all_anchors[:, 1] - anchor_loc[:, 2] = (all_anchors[:, 2] + all_anchors[:, 0]) / 2 - anchor_loc[:, 3] = (all_anchors[:, 3] + all_anchors[:, 1]) / 2 + anchor_loc[:, 0] = all_anchors[:, 2] - all_anchors[:, 0] + 1 + anchor_loc[:, 1] = all_anchors[:, 3] - all_anchors[:, 1] + 1 + anchor_loc[:, 2] = 
all_anchors[:, 0] + 0.5 * anchor_loc[:, 0] + anchor_loc[:, 3] = all_anchors[:, 1] + 0.5 * anchor_loc[:, 1] #predicted bbox: bbox_center_x, bbox_center_y, bbox_width, bbox_height pred_bbox = np.zeros_like(bbox_deltas, dtype=np.float32) @@ -127,23 +127,29 @@ def box_coder(all_anchors, bbox_deltas, variances): i, 0] + anchor_loc[i, 2] pred_bbox[i, 1] = variances[i, 1] * bbox_deltas[i, 1] * anchor_loc[ i, 1] + anchor_loc[i, 3] - pred_bbox[i, 2] = math.exp(variances[i, 2] * - bbox_deltas[i, 2]) * anchor_loc[i, 0] - pred_bbox[i, 3] = math.exp(variances[i, 3] * - bbox_deltas[i, 3]) * anchor_loc[i, 1] + pred_bbox[i, 2] = math.exp( + min(variances[i, 2] * bbox_deltas[i, 2], math.log( + 1000 / 16.0))) * anchor_loc[i, 0] + pred_bbox[i, 3] = math.exp( + min(variances[i, 3] * bbox_deltas[i, 3], math.log( + 1000 / 16.0))) * anchor_loc[i, 1] else: for i in range(bbox_deltas.shape[0]): pred_bbox[i, 0] = bbox_deltas[i, 0] * anchor_loc[i, 0] + anchor_loc[ i, 2] pred_bbox[i, 1] = bbox_deltas[i, 1] * anchor_loc[i, 1] + anchor_loc[ i, 3] - pred_bbox[i, 2] = math.exp(bbox_deltas[i, 2]) * anchor_loc[i, 0] - pred_bbox[i, 3] = math.exp(bbox_deltas[i, 3]) * anchor_loc[i, 1] + pred_bbox[i, 2] = math.exp( + min(bbox_deltas[i, 2], math.log(1000 / 16.0))) * anchor_loc[i, + 0] + pred_bbox[i, 3] = math.exp( + min(bbox_deltas[i, 3], math.log(1000 / 16.0))) * anchor_loc[i, + 1] proposals[:, 0] = pred_bbox[:, 0] - pred_bbox[:, 2] / 2 proposals[:, 1] = pred_bbox[:, 1] - pred_bbox[:, 3] / 2 - proposals[:, 2] = pred_bbox[:, 0] + pred_bbox[:, 2] / 2 - proposals[:, 3] = pred_bbox[:, 1] + pred_bbox[:, 3] / 2 + proposals[:, 2] = pred_bbox[:, 0] + pred_bbox[:, 2] / 2 - 1 + proposals[:, 3] = pred_bbox[:, 1] + pred_bbox[:, 3] / 2 - 1 return proposals @@ -170,13 +176,16 @@ def filter_boxes(boxes, min_size, im_info): """Only keep boxes with both sides >= min_size and center within the image. """ # Scale min_size to match image scale - min_size *= im_info[2] + im_scale = im_info[2] + min_size = max(min_size, 1.0) ws = boxes[:, 2] - boxes[:, 0] + 1 hs = boxes[:, 3] - boxes[:, 1] + 1 + ws_orig_scale = (boxes[:, 2] - boxes[:, 0]) / im_scale + 1 + hs_orig_scale = (boxes[:, 3] - boxes[:, 1]) / im_scale + 1 x_ctr = boxes[:, 0] + ws / 2. y_ctr = boxes[:, 1] + hs / 2. 
- keep = np.where((ws >= min_size) & (hs >= min_size) & (x_ctr < im_info[1]) & - (y_ctr < im_info[0]))[0] + keep = np.where((ws_orig_scale >= min_size) & (hs_orig_scale >= min_size) & + (x_ctr < im_info[1]) & (y_ctr < im_info[0]))[0] return keep @@ -204,7 +213,7 @@ def iou(box_a, box_b): xb = min(xmax_a, xmax_b) yb = min(ymax_a, ymax_b) - inter_area = max(xb - xa, 0.0) * max(yb - ya, 0.0) + inter_area = max(xb - xa + 1, 0.0) * max(yb - ya + 1, 0.0) iou_ratio = inter_area / (area_a + area_b - inter_area) diff --git a/python/paddle/fluid/tests/unittests/test_layers.py b/python/paddle/fluid/tests/unittests/test_layers.py index bc4d364c74c6cb6b8f0df59e7ede77e6271f4b96..b04346b052903959f44aa96f6fccb7d20652e854 100644 --- a/python/paddle/fluid/tests/unittests/test_layers.py +++ b/python/paddle/fluid/tests/unittests/test_layers.py @@ -556,6 +556,15 @@ class TestBook(unittest.TestCase): out = layers.sequence_enumerate(input=x, win_size=2, pad_value=0) print(str(program)) + def test_cross_entropy(self): + program = Program() + with program_guard(program): + x = layers.data(name="x", shape=[30, 10], dtype="float32") + label = layers.data(name="label", shape=[30, 1], dtype="int32") + mode = 'channel' + out = layers.cross_entropy(x, label, False, 4) + self.assertIsNotNone(out) + if __name__ == '__main__': unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_parallel_executor_transformer.py b/python/paddle/fluid/tests/unittests/test_parallel_executor_transformer.py index 5ad922725a0b692e28552737a99b745ed09ddbd5..a55b2002ed989d4588716202a37aa6f4139825ea 100644 --- a/python/paddle/fluid/tests/unittests/test_parallel_executor_transformer.py +++ b/python/paddle/fluid/tests/unittests/test_parallel_executor_transformer.py @@ -20,6 +20,7 @@ import numpy as np from parallel_executor_test_base import TestParallelExecutorBase import unittest import paddle +import paddle.fluid.core as core import paddle.dataset.wmt16 as wmt16 import os @@ -170,7 +171,8 @@ class TestTransformer(TestParallelExecutorBase): writer.complete_append_tensor() def test_main(self): - self.check_network_convergence(transformer, use_cuda=True) + if core.is_compiled_with_cuda(): + self.check_network_convergence(transformer, use_cuda=True) self.check_network_convergence(transformer, use_cuda=False, iter=5) diff --git a/python/paddle/fluid/tests/unittests/test_py_reader_using_executor.py b/python/paddle/fluid/tests/unittests/test_py_reader_using_executor.py index 931cac409f26fce4ecca18c4b0cfcca2e675046f..b7fad9b3a60632adb564e1d155a3d935706b467f 100644 --- a/python/paddle/fluid/tests/unittests/test_py_reader_using_executor.py +++ b/python/paddle/fluid/tests/unittests/test_py_reader_using_executor.py @@ -96,7 +96,8 @@ class TestPyReaderUsingExecutor(unittest.TestCase): self.queue_capacity = 50 def test(self): - for use_cuda in [False, True]: + for use_cuda in ([False, True] + if core.is_compiled_with_cuda() else [False]): for use_parallel_executor in [False, True]: for use_double_buffer in [False, True]: print('Test Parameters:'), diff --git a/python/paddle/fluid/tests/unittests/test_reader_reset.py b/python/paddle/fluid/tests/unittests/test_reader_reset.py index 8ad11d76f683d556f05cafc3251acc942efef72f..e97a05b6f929821f82d96b462598a5ff03cf0a48 100644 --- a/python/paddle/fluid/tests/unittests/test_reader_reset.py +++ b/python/paddle/fluid/tests/unittests/test_reader_reset.py @@ -13,7 +13,7 @@ # limitations under the License. 
from __future__ import print_function - +import os import paddle.fluid as fluid import paddle import numpy as np @@ -41,6 +41,8 @@ class TestReaderReset(unittest.TestCase): self.data_file_name, reader, feeder) def setUp(self): + # set parallel threads to fit 20 batches in line 49 + os.environ['CPU_NUM'] = str(20) self.use_cuda = fluid.core.is_compiled_with_cuda() self.data_file_name = './reader_reset_test.recordio' self.ins_shape = [3] diff --git a/python/paddle/fluid/tests/unittests/test_rpn_target_assign_op.py b/python/paddle/fluid/tests/unittests/test_rpn_target_assign_op.py index bd548009b3ada9512e4b5f7d7b61b67b0717a39b..f63dbcd3d7f6bfce3ccc1c42ae41afe42bfad003 100644 --- a/python/paddle/fluid/tests/unittests/test_rpn_target_assign_op.py +++ b/python/paddle/fluid/tests/unittests/test_rpn_target_assign_op.py @@ -19,48 +19,58 @@ import numpy as np import paddle.fluid.core as core from op_test import OpTest from test_anchor_generator_op import anchor_generator_in_python -from test_generate_proposal_labels import _generate_groundtruth -from test_generate_proposal_labels import _bbox_overlaps, _box_to_delta - - -def rpn_target_assign(gt_anchor_iou, rpn_batch_size_per_im, - rpn_positive_overlap, rpn_negative_overlap, fg_fraction): - iou = np.transpose(gt_anchor_iou) - anchor_to_gt_max = iou.max(axis=1) - anchor_to_gt_argmax = iou.argmax(axis=1) - - gt_to_anchor_argmax = iou.argmax(axis=0) - gt_to_anchor_max = iou[gt_to_anchor_argmax, np.arange(iou.shape[1])] - anchors_with_max_overlap = np.where(iou == gt_to_anchor_max)[0] - - tgt_lbl = np.ones((iou.shape[0], ), dtype=np.int32) * -1 - tgt_lbl[anchors_with_max_overlap] = 1 - tgt_lbl[anchor_to_gt_max >= rpn_positive_overlap] = 1 - - num_fg = int(fg_fraction * rpn_batch_size_per_im) - fg_inds = np.where(tgt_lbl == 1)[0] - if len(fg_inds) > num_fg: +from test_generate_proposal_labels_op import _generate_groundtruth +from test_generate_proposal_labels_op import _bbox_overlaps, _box_to_delta + + +def rpn_target_assign(anchor_by_gt_overlap, + rpn_batch_size_per_im, + rpn_positive_overlap, + rpn_negative_overlap, + rpn_fg_fraction, + use_random=True): + anchor_to_gt_argmax = anchor_by_gt_overlap.argmax(axis=1) + anchor_to_gt_max = anchor_by_gt_overlap[np.arange( + anchor_by_gt_overlap.shape[0]), anchor_to_gt_argmax] + + gt_to_anchor_argmax = anchor_by_gt_overlap.argmax(axis=0) + gt_to_anchor_max = anchor_by_gt_overlap[gt_to_anchor_argmax, np.arange( + anchor_by_gt_overlap.shape[1])] + anchors_with_max_overlap = np.where( + anchor_by_gt_overlap == gt_to_anchor_max)[0] + + labels = np.ones((anchor_by_gt_overlap.shape[0], ), dtype=np.int32) * -1 + labels[anchors_with_max_overlap] = 1 + labels[anchor_to_gt_max >= rpn_positive_overlap] = 1 + + num_fg = int(rpn_fg_fraction * rpn_batch_size_per_im) + fg_inds = np.where(labels == 1)[0] + if len(fg_inds) > num_fg and use_random: disable_inds = np.random.choice( fg_inds, size=(len(fg_inds) - num_fg), replace=False) - tgt_lbl[disable_inds] = -1 - fg_inds = np.where(tgt_lbl == 1)[0] + else: + disable_inds = fg_inds[num_fg:] + labels[disable_inds] = -1 + fg_inds = np.where(labels == 1)[0] - num_bg = rpn_batch_size_per_im - np.sum(tgt_lbl == 1) + num_bg = rpn_batch_size_per_im - np.sum(labels == 1) bg_inds = np.where(anchor_to_gt_max < rpn_negative_overlap)[0] - tgt_lbl[bg_inds] = 0 - if len(bg_inds) > num_bg: + if len(bg_inds) > num_bg and use_random: enable_inds = bg_inds[np.random.randint(len(bg_inds), size=num_bg)] - tgt_lbl[enable_inds] = 0 - bg_inds = np.where(tgt_lbl == 0)[0] - tgt_lbl[bg_inds] = 0 + else: 
+ enable_inds = bg_inds[:num_bg] + labels[enable_inds] = 0 + fg_inds = np.where(labels == 1)[0] + bg_inds = np.where(labels == 0)[0] loc_index = fg_inds score_index = np.hstack((fg_inds, bg_inds)) - tgt_lbl = np.expand_dims(tgt_lbl, axis=1) + labels = labels[score_index] + assert not np.any(labels == -1), "Wrong labels with -1" gt_inds = anchor_to_gt_argmax[fg_inds] - return loc_index, score_index, tgt_lbl, gt_inds + return loc_index, score_index, labels, gt_inds def get_anchor(n, c, h, w): @@ -75,85 +85,129 @@ def get_anchor(n, c, h, w): return anchors -def rpn_blob(anchor, gt_boxes, iou, lod, rpn_batch_size_per_im, - rpn_positive_overlap, rpn_negative_overlap, fg_fraction): - - loc_indexes = [] - score_indexes = [] - tmp_tgt_labels = [] - tgt_bboxes = [] - anchor_num = anchor.shape[0] - +def rpn_target_assign_in_python(all_anchors, + gt_boxes, + is_crowd, + im_info, + lod, + rpn_straddle_thresh, + rpn_batch_size_per_im, + rpn_positive_overlap, + rpn_negative_overlap, + rpn_fg_fraction, + use_random=True): + anchor_num = all_anchors.shape[0] batch_size = len(lod) - 1 for i in range(batch_size): + im_height = im_info[i][0] + im_width = im_info[i][1] + im_scale = im_info[i][2] + if rpn_straddle_thresh >= 0: + # Only keep anchors inside the image by a margin of straddle_thresh + inds_inside = np.where( + (all_anchors[:, 0] >= -rpn_straddle_thresh) & + (all_anchors[:, 1] >= -rpn_straddle_thresh) & ( + all_anchors[:, 2] < im_width + rpn_straddle_thresh) & ( + all_anchors[:, 3] < im_height + rpn_straddle_thresh))[0] + # keep only inside anchors + inside_anchors = all_anchors[inds_inside, :] + else: + inds_inside = np.arange(all_anchors.shape[0]) + inside_anchors = all_anchors + b, e = lod[i], lod[i + 1] - iou_slice = iou[b:e, :] - bboxes_slice = gt_boxes[b:e, :] + gt_boxes_slice = gt_boxes[b:e, :] * im_scale + is_crowd_slice = is_crowd[b:e] - loc_idx, score_idx, tgt_lbl, gt_inds = rpn_target_assign( - iou_slice, rpn_batch_size_per_im, rpn_positive_overlap, - rpn_negative_overlap, fg_fraction) + not_crowd_inds = np.where(is_crowd_slice == 0)[0] + gt_boxes_slice = gt_boxes_slice[not_crowd_inds] + iou = _bbox_overlaps(inside_anchors, gt_boxes_slice) - fg_bboxes = bboxes_slice[gt_inds] - fg_anchors = anchor[loc_idx] - box_deltas = _box_to_delta(fg_anchors, fg_bboxes, [1., 1., 1., 1.]) + loc_inds, score_inds, labels, gt_inds = rpn_target_assign( + iou, rpn_batch_size_per_im, rpn_positive_overlap, + rpn_negative_overlap, rpn_fg_fraction, use_random) + # unmap to all anchor + loc_inds = inds_inside[loc_inds] + score_inds = inds_inside[score_inds] + + sampled_gt = gt_boxes_slice[gt_inds] + sampled_anchor = all_anchors[loc_inds] + box_deltas = _box_to_delta(sampled_anchor, sampled_gt, [1., 1., 1., 1.]) if i == 0: - loc_indexes = loc_idx - score_indexes = score_idx - tmp_tgt_labels = tgt_lbl + loc_indexes = loc_inds + score_indexes = score_inds + tgt_labels = labels tgt_bboxes = box_deltas else: loc_indexes = np.concatenate( - [loc_indexes, loc_idx + i * anchor_num]) + [loc_indexes, loc_inds + i * anchor_num]) score_indexes = np.concatenate( - [score_indexes, score_idx + i * anchor_num]) - tmp_tgt_labels = np.concatenate([tmp_tgt_labels, tgt_lbl]) + [score_indexes, score_inds + i * anchor_num]) + tgt_labels = np.concatenate([tgt_labels, labels]) tgt_bboxes = np.vstack([tgt_bboxes, box_deltas]) - tgt_labels = tmp_tgt_labels[score_indexes] return loc_indexes, score_indexes, tgt_bboxes, tgt_labels class TestRpnTargetAssignOp(OpTest): def setUp(self): n, c, h, w = 2, 4, 14, 14 - anchor = get_anchor(n, c, h, 
w) + all_anchors = get_anchor(n, c, h, w) gt_num = 10 - anchor = anchor.reshape(-1, 4) - anchor_num = anchor.shape[0] - - im_shapes = [[64, 64], [64, 64]] - gt_box, lod = _generate_groundtruth(im_shapes, 3, 4) - bbox = np.vstack([v['boxes'] for v in gt_box]) - - iou = _bbox_overlaps(bbox, anchor) - - anchor = anchor.astype('float32') - bbox = bbox.astype('float32') - iou = iou.astype('float32') - - loc_index, score_index, tgt_bbox, tgt_lbl = rpn_blob( - anchor, bbox, iou, [0, 4, 8], 25600, 0.95, 0.03, 0.25) + all_anchors = all_anchors.reshape(-1, 4) + anchor_num = all_anchors.shape[0] + + images_shape = [[64, 64], [64, 64]] + #images_shape = [[64, 64]] + groundtruth, lod = _generate_groundtruth(images_shape, 3, 4) + lod = [0, 4, 8] + #lod = [0, 4] + + im_info = np.ones((len(images_shape), 3)).astype(np.float32) + for i in range(len(images_shape)): + im_info[i, 0] = images_shape[i][0] + im_info[i, 1] = images_shape[i][1] + im_info[i, 2] = 0.8 #scale + gt_boxes = np.vstack([v['boxes'] for v in groundtruth]) + is_crowd = np.hstack([v['is_crowd'] for v in groundtruth]) + + all_anchors = all_anchors.astype('float32') + gt_boxes = gt_boxes.astype('float32') + + rpn_straddle_thresh = 0.0 + rpn_batch_size_per_im = 256 + rpn_positive_overlap = 0.7 + rpn_negative_overlap = 0.3 + rpn_fg_fraction = 0.5 + use_random = False + + loc_index, score_index, tgt_bbox, labels = rpn_target_assign_in_python( + all_anchors, gt_boxes, is_crowd, im_info, lod, rpn_straddle_thresh, + rpn_batch_size_per_im, rpn_positive_overlap, rpn_negative_overlap, + rpn_fg_fraction, use_random) + labels = labels[:, np.newaxis] self.op_type = "rpn_target_assign" self.inputs = { - 'Anchor': anchor, - 'GtBox': (bbox, [[4, 4]]), - 'DistMat': (iou, [[4, 4]]), + 'Anchor': all_anchors, + 'GtBoxes': (gt_boxes, [[4, 4]]), + 'IsCrowd': (is_crowd, [[4, 4]]), + 'ImInfo': (im_info, [[1, 1]]) } self.attrs = { - 'rpn_batch_size_per_im': 25600, - 'rpn_positive_overlap': 0.95, - 'rpn_negative_overlap': 0.03, - 'fg_fraction': 0.25, - 'fix_seed': True + 'rpn_batch_size_per_im': rpn_batch_size_per_im, + 'rpn_straddle_thresh': rpn_straddle_thresh, + 'rpn_positive_overlap': rpn_positive_overlap, + 'rpn_negative_overlap': rpn_negative_overlap, + 'rpn_fg_fraction': rpn_fg_fraction, + 'use_random': use_random } self.outputs = { 'LocationIndex': loc_index.astype('int32'), 'ScoreIndex': score_index.astype('int32'), 'TargetBBox': tgt_bbox.astype('float32'), - 'TargetLabel': tgt_lbl.astype('int64'), + 'TargetLabel': labels.astype('int32') } def test_check_output(self): diff --git a/python/paddle/fluid/tests/unittests/test_softmax_with_cross_entropy_op.py b/python/paddle/fluid/tests/unittests/test_softmax_with_cross_entropy_op.py index b7e5ff6d52ad7dde3dd94b3bd660cfca383e1ada..a18941dd3126ac027f022ddafbbaed8516166233 100644 --- a/python/paddle/fluid/tests/unittests/test_softmax_with_cross_entropy_op.py +++ b/python/paddle/fluid/tests/unittests/test_softmax_with_cross_entropy_op.py @@ -88,5 +88,40 @@ class TestSoftmaxWithCrossEntropyOp2(OpTest): self.check_grad(["Logits"], "Loss") +class TestSoftmaxWithCrossEntropyOp3(OpTest): + """ + Test softmax with cross entropy operator with ignore_index. 
+ """ + + def setUp(self): + self.op_type = "softmax_with_cross_entropy" + batch_size = 41 + class_num = 37 + + logits = np.random.uniform(0.1, 1.0, + [batch_size, class_num]).astype("float64") + softmax = np.apply_along_axis(stable_softmax, 1, logits) + labels = np.random.randint(0, class_num, [batch_size, 1], dtype="int64") + ignore_index = 7 + cross_entropy = np.asmatrix( + [[-np.log(softmax[i][labels[i][0]])] + if labels[i] != ignore_index else [0] + for i in range(softmax.shape[0])], + dtype="float64") + + self.inputs = {"Logits": logits, "Label": labels} + self.outputs = { + "Softmax": softmax.astype("float64"), + "Loss": cross_entropy.astype("float64") + } + self.attrs = {"ignore_index": ignore_index} + + def test_check_output(self): + self.check_output() + + def test_check_grad(self): + self.check_grad(["Logits"], "Loss") + + if __name__ == "__main__": unittest.main() diff --git a/python/paddle/fluid/transpiler/details/__init__.py b/python/paddle/fluid/transpiler/details/__init__.py index 5e98266a761c7e01bd6668e85e6adeb54103ca80..f33c05ed2f48c2498b98fc486d6ff7471088d77e 100644 --- a/python/paddle/fluid/transpiler/details/__init__.py +++ b/python/paddle/fluid/transpiler/details/__init__.py @@ -16,3 +16,4 @@ from __future__ import print_function from .program_utils import * from .ufind import * +from .checkport import * diff --git a/python/paddle/fluid/transpiler/details/checkport.py b/python/paddle/fluid/transpiler/details/checkport.py new file mode 100644 index 0000000000000000000000000000000000000000..7bad4b427a2d53bd14c7a1f870ce74a883158d04 --- /dev/null +++ b/python/paddle/fluid/transpiler/details/checkport.py @@ -0,0 +1,50 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import sys +import time +import socket +from contextlib import closing + + +def wait_server_ready(endpoints): + """ + Wait until parameter servers are ready, use connext_ex to detect + port readiness. + + Args: + endpoints (list): endpoints string list, like: + ["127.0.0.1:8080", "127.0.0.1:8081"] + + Examples: + .. 
code-block:: python + + wait_server_ready(["127.0.0.1:8080", "127.0.0.1:8081"]) + """ + while True: + all_ok = True + for ep in endpoints: + ip_port = ep.split(":") + with closing(socket.socket(socket.AF_INET, + socket.SOCK_STREAM)) as sock: + sock.settimeout(2) + result = sock.connect_ex((ip_port[0], int(ip_port[1]))) + if result != 0: + all_ok = False + if not all_ok: + sys.stderr.write("pserver not ready, wait 3 sec to retry...\n") + sys.stderr.flush() + time.sleep(3) + else: + break diff --git a/python/paddle/fluid/transpiler/distribute_transpiler.py b/python/paddle/fluid/transpiler/distribute_transpiler.py index d4d218d547a394a56c040ade2a9ba703b691b86b..53c9cbe23dd82af866658fe46d1d631b0a3b26f3 100644 --- a/python/paddle/fluid/transpiler/distribute_transpiler.py +++ b/python/paddle/fluid/transpiler/distribute_transpiler.py @@ -381,7 +381,7 @@ class DistributeTranspiler(object): pserver_endpoints) self._split_table_grad_and_add_send_vars(program, pserver_endpoints) - def get_trainer_program(self): + def get_trainer_program(self, wait_port=True): """ Get transpiled trainer side program. @@ -393,6 +393,9 @@ class DistributeTranspiler(object): delete_ops(self.origin_program.global_block(), self.optimize_ops) self.origin_program.__str__() + if wait_port: + wait_server_ready(self.pserver_endpoints) + return self.origin_program def _get_trainer_startup_program(self, recv_vars, eplist): diff --git a/python/paddle/fluid/transpiler/inference_transpiler.py b/python/paddle/fluid/transpiler/inference_transpiler.py index f79fcb24bb5a48de00d03fc468b6526e48656d07..adad2428f7fdc554cf4efd652f52b5c5de0ab527 100644 --- a/python/paddle/fluid/transpiler/inference_transpiler.py +++ b/python/paddle/fluid/transpiler/inference_transpiler.py @@ -60,13 +60,46 @@ class InferenceTranspiler(object): if not isinstance(scope, core.Scope): raise TypeError("scope should be as Scope type or None") use_mkldnn = bool(os.getenv("FLAGS_use_mkldnn", False)) + + self._fuse_batch_norm(program, place, scope) if use_mkldnn: - self._fuse_relu_mkldnn(program) self._fuse_conv_bias_mkldnn(program) - else: - self._fuse_batch_norm(program, place, scope) + self._fuse_conv_relu_mkldnn(program) + self._fuse_bn_relu_mkldnn(program) + + def _fuse_conv_relu_mkldnn(self, program): + ''' + Transpile the program by fusing the relu activation into convolution + OPs for an MKLDNN program. A relu activation following a convolution + OP can be fused by adding a 'fuse_relu' attribute to the convolution OP. + The result of fuse is: + - before: + - conv->relu->any_other_op + - after: + - conv->any_other_op + :param program: program to transpile + :type program: Program + ''' + self.block = program.block(0) + + i = 0 + while i < len(self.block.ops): + current_op = self.block.ops[i] + if current_op.type in ['conv2d']: + next_op = self.block.ops[i + 1] + if next_op.type == 'relu': + # modify conv OP to include relu + current_op.set_attr("fuse_relu", True) + # remove relu OP + self.block._remove_op(i + 1) + i = i + 1 + + # TODO(luotao): use clone() method to flush the program.desc in force, + # since some large program.desc will not be flushed immediately. + # And a better solution will be considered later. + program = program.clone() - def _fuse_relu_mkldnn(self, program): + def _fuse_bn_relu_mkldnn(self, program): ''' Transpile the program by fused relu activation for MKLDNN program.
@@ -160,7 +193,6 @@ class InferenceTranspiler(object): self._fuse_conv_bias(i, current_op, next_op) self.block._remove_op(i + 1) # Remove old conv self.block._remove_op(i + 1) # Remove elementwise_add - i = i + 1 i = i + 1 self._remove_unused_var() diff --git a/python/paddle/reader/decorator.py b/python/paddle/reader/decorator.py index 6d7ac876fdf65fe0e85cceb94c311a93d9ea39c2..5b9459b670ac8583ee0e65a3c1b51f6248bb6303 100644 --- a/python/paddle/reader/decorator.py +++ b/python/paddle/reader/decorator.py @@ -14,11 +14,14 @@ __all__ = [ 'map_readers', 'buffered', 'compose', 'chain', 'shuffle', - 'ComposeNotAligned', 'firstn', 'xmap_readers', 'PipeReader' + 'ComposeNotAligned', 'firstn', 'xmap_readers', 'PipeReader', + 'multiprocess_reader' ] from threading import Thread import subprocess +import multiprocessing +import sys from six.moves.queue import Queue from six.moves import zip_longest @@ -332,6 +335,100 @@ def xmap_readers(mapper, reader, process_num, buffer_size, order=False): return xreader +def multiprocess_reader(readers, use_pipe=True, queue_size=1000): + """ + multiprocess_reader uses Python multiprocessing to read data from the + given readers and then uses multiprocessing.Queue or multiprocessing.Pipe + to merge all data. The number of processes is equal to the number of + input readers; each process calls one reader. + + multiprocessing.Queue requires read/write access to /dev/shm, which some + platforms do not support. + + You need to create multiple readers first; these readers should be independent + of each other so that each process can work independently. + + An example: + + .. code-block:: python + + reader0 = reader(["file01", "file02"]) + reader1 = reader(["file11", "file12"]) + reader2 = reader(["file21", "file22"]) + reader = multiprocess_reader([reader0, reader1, reader2], + queue_size=100, use_pipe=False) + """ + + try: + import ujson as json + except Exception as e: + sys.stderr.write("import ujson error: " + str(e) + " use json\n") + import json + + assert type(readers) is list and len(readers) > 0 + + def _read_into_queue(reader, queue): + for sample in reader(): + if sample is None: + raise ValueError("sample has None") + queue.put(sample) + queue.put(None) + + def queue_reader(): + queue = multiprocessing.Queue(queue_size) + for reader in readers: + p = multiprocessing.Process( + target=_read_into_queue, args=(reader, queue)) + p.start() + + reader_num = len(readers) + finish_num = 0 + while finish_num < reader_num: + sample = queue.get() + if sample is None: + finish_num += 1 + else: + yield sample + + def _read_into_pipe(reader, conn): + for sample in reader(): + if sample is None: + raise ValueError("sample has None!") + conn.send(json.dumps(sample)) + conn.send(json.dumps(None)) + conn.close() + + def pipe_reader(): + conns = [] + for reader in readers: + parent_conn, child_conn = multiprocessing.Pipe() + conns.append(parent_conn) + p = multiprocessing.Process( + target=_read_into_pipe, args=(reader, child_conn)) + p.start() + + reader_num = len(readers) + finish_num = 0 + conn_to_remove = [] + while finish_num < reader_num: + for conn in conn_to_remove: + conns.remove(conn) + conn_to_remove = [] + for conn in conns: + sample = json.loads(conn.recv()) + if sample is None: + finish_num += 1 + conn.close() + conn_to_remove.append(conn) + else: + yield sample + + if use_pipe: + return pipe_reader + else: + return queue_reader + + def _buf2lines(buf, line_break="\n"): # FIXME: line_break should be automatically configured.
lines = buf.split(line_break) diff --git a/python/paddle/reader/tests/decorator_test.py b/python/paddle/reader/tests/decorator_test.py index 537df489b9738864933b3a7922d178701db3d19f..c324092f8850e4bd64955aa9c987746b5cec54b5 100644 --- a/python/paddle/reader/tests/decorator_test.py +++ b/python/paddle/reader/tests/decorator_test.py @@ -14,6 +14,7 @@ import time import unittest +import functools import paddle.reader @@ -174,5 +175,33 @@ class TestPipeReader(unittest.TestCase): temp.close() +class TestMultiProcessReader(unittest.TestCase): + def setup(self): + self.samples = [] + for i in range(1000): + self.samples.append([[i], [i + 1, i + 2], i + 3]) + + def reader(index): + for i in range(len(self.samples)): + if i % 3 == index: + yield self.samples[i] + + self.reader0 = functools.partial(reader, 0) + self.reader1 = functools.partial(reader, 1) + self.reader2 = functools.partial(reader, 2) + + def reader_test(self, use_pipe): + self.setup() + results = [] + for data in paddle.reader.multiprocess_reader( + [self.reader0, self.reader1, self.reader2], 100, use_pipe)(): + results.append(data) + self.assertEqual(sorted(self.samples), sorted(results)) + + def test_multi_process_reader(self): + self.reader_test(use_pipe=False) + self.reader_test(use_pipe=True) + + if __name__ == '__main__': unittest.main()
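
A minimal usage sketch for the multiprocess_reader decorator added in python/paddle/reader/decorator.py above. The toy in-memory readers and sample data here are illustrative assumptions, not part of the patch:

    import paddle.reader

    def make_reader(samples):
        # Each input reader must be an independent no-argument callable
        # returning a generator, so the child processes share no state.
        def reader():
            for s in samples:
                yield s
        return reader

    reader0 = make_reader([[0], [1], [2]])
    reader1 = make_reader([[3], [4], [5]])

    # One child process is started per input reader; their samples are
    # merged through a multiprocessing.Queue here (use_pipe=False).
    merged = paddle.reader.multiprocess_reader(
        [reader0, reader1], use_pipe=False, queue_size=100)

    for sample in merged():
        print(sample)

Note that the pipe variant (use_pipe=True) serializes each sample with json (or ujson when available), so samples must be JSON-serializable, while the queue variant instead relies on multiprocessing.Queue and therefore on /dev/shm being writable.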
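Likewise, a minimal sketch (not part of the patch) of the ignore_index argument added to fluid.layers.cross_entropy and fluid.layers.softmax_with_cross_entropy above, assuming a classification setup where the label value -1 marks entries that should not contribute to the loss or to the input gradient:

    import paddle.fluid as fluid

    # Logits over 10 classes; layers.data prepends the batch dimension.
    logits = fluid.layers.data(name='logits', shape=[10], dtype='float32')
    # int64 class ids; entries equal to ignore_index yield zero loss and
    # no input gradient (only valid with soft_label=False).
    label = fluid.layers.data(name='label', shape=[1], dtype='int64')

    loss = fluid.layers.softmax_with_cross_entropy(
        logits=logits, label=label, soft_label=False, ignore_index=-1)
    avg_loss = fluid.layers.mean(loss)

Since ignored entries still produce a zero-valued row in the [N x 1] loss tensor, they are counted by the plain mean above; rescaling by the number of non-ignored entries is left to the caller.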