提交 9e3245e3 编写于 作者: Y Yibing Liu

Merge branch 'develop' of upstream into fix_seq_pad

......@@ -19,7 +19,7 @@ Our vision is to enable deep learning for everyone via PaddlePaddle.
Please refer to our [release announcement](https://github.com/PaddlePaddle/Paddle/releases) to track the latest feature of PaddlePaddle.
### Latest PaddlePaddle Release: [Fluid 0.14.0](https://github.com/PaddlePaddle/Paddle/tree/v0.14.0)
### Latest PaddlePaddle Release: [Fluid 0.15.0](https://github.com/PaddlePaddle/Paddle/tree/v0.15.0)
### Install Latest Stable Release:
```
# Linux CPU
......@@ -76,26 +76,26 @@ pip install paddlepaddle-gpu==0.14.0.post85
## Installation
It is recommended to read [this doc](http://paddlepaddle.org/documentation/docs/zh/0.14.0/new_docs/beginners_guide/install/install_doc.html) on our website.
It is recommended to read [this doc](http://paddlepaddle.org/documentation/docs/zh/0.15.0/new_docs/beginners_guide/install/install_doc.html) on our website.
## Documentation
We provide [English](http://paddlepaddle.org/documentation/docs/en/0.14.0/getstarted/index_en.html) and
[Chinese](http://paddlepaddle.org/documentation/docs/zh/0.14.0/new_docs/beginners_guide/index.html) documentation.
We provide [English](http://paddlepaddle.org/documentation/docs/en/0.15.0/getstarted/index_en.html) and
[Chinese](http://paddlepaddle.org/documentation/docs/zh/0.15.0/new_docs/beginners_guide/index.html) documentation.
- [Deep Learning 101](https://github.com/PaddlePaddle/book)
You might want to start from this online interactive book that can run in a Jupyter Notebook.
- [Distributed Training](http://paddlepaddle.org/documentation/docs/zh/0.14.0/new_docs/user_guides/howto/training/cluster_howto.html)
- [Distributed Training](http://paddlepaddle.org/documentation/docs/zh/0.15.0/new_docs/user_guides/howto/training/cluster_howto.html)
You can run distributed training jobs on MPI clusters.
- [Python API](http://paddlepaddle.org/documentation/api/zh/0.14.0/fluid.html)
- [Python API](http://paddlepaddle.org/documentation/api/zh/0.15.0/fluid.html)
Our new API enables much shorter programs.
- [How to Contribute](http://paddlepaddle.org/documentation/docs/zh/0.14.0/new_docs/advanced_usage/development/contribute_to_paddle.html)
- [How to Contribute](http://paddlepaddle.org/documentation/docs/zh/0.15.0/new_docs/advanced_usage/development/contribute_to_paddle.html)
We appreciate your contributions!
......
......@@ -140,5 +140,11 @@ def parse_args():
'--use_lars',
action='store_true',
help='If set, use lars for optimizers, ONLY support resnet module.')
parser.add_argument(
'--reduce_strategy',
type=str,
choices=['reduce', 'all_reduce'],
default='all_reduce',
help='Specify the reduce strategy, can be reduce, all_reduce')
args = parser.parse_args()
return args
......@@ -91,7 +91,8 @@ def dist_transpile(trainer_id, args, train_prog, startup_prog):
program=train_prog,
pservers=pserver_endpoints,
trainers=trainers,
sync_mode=not args.async_mode)
sync_mode=not args.async_mode,
startup_program=startup_prog)
if training_role == "PSERVER":
pserver_program = t.get_pserver_program(current_endpoint)
pserver_startup_program = t.get_startup_program(
......@@ -169,6 +170,14 @@ def train_parallel(train_args, test_args, args, train_prog, test_prog,
strategy = fluid.ExecutionStrategy()
strategy.num_threads = args.cpus
strategy.allow_op_delay = False
build_strategy = fluid.BuildStrategy()
if args.reduce_strategy == "reduce":
build_strategy.reduce_strategy = fluid.BuildStrategy(
).ReduceStrategy.Reduce
else:
build_strategy.reduce_strategy = fluid.BuildStrategy(
).ReduceStrategy.AllReduce
avg_loss = train_args[0]
if args.update_method == "pserver":
......@@ -183,6 +192,7 @@ def train_parallel(train_args, test_args, args, train_prog, test_prog,
avg_loss.name,
main_program=train_prog,
exec_strategy=strategy,
build_strategy=build_strategy,
num_trainers=num_trainers,
trainer_id=trainer_id)
......
......@@ -67,11 +67,14 @@ def cnn_model(data):
def get_model(args, is_train, main_prog, startup_prog):
# NOTE: mnist is small, we don't implement data sharding yet.
filelist = [
os.path.join(args.data_path, f) for f in os.listdir(args.data_path)
]
opt = None
data_file_handle = None
with fluid.program_guard(main_prog, startup_prog):
if args.use_reader_op:
filelist = [
os.path.join(args.data_path, f)
for f in os.listdir(args.data_path)
]
data_file_handle = fluid.layers.open_files(
filenames=filelist,
shapes=[[-1, 1, 28, 28], (-1, 1)],
......@@ -100,7 +103,7 @@ def get_model(args, is_train, main_prog, startup_prog):
if is_train:
opt = fluid.optimizer.AdamOptimizer(
learning_rate=0.001, beta1=0.9, beta2=0.999)
opt.minimize()
opt.minimize(avg_cost)
if args.memory_optimize:
fluid.memory_optimize(main_prog)
......
......@@ -20,6 +20,7 @@ import functools
import numpy as np
import time
import os
import math
import cProfile, pstats, StringIO
......@@ -27,128 +28,120 @@ import paddle
import paddle.fluid as fluid
import paddle.fluid.core as core
import paddle.fluid.profiler as profiler
# from recordio_converter import imagenet_train, imagenet_test
from imagenet_reader import train, val
train_parameters = {
"input_size": [3, 224, 224],
"input_mean": [0.485, 0.456, 0.406],
"input_std": [0.229, 0.224, 0.225],
"learning_strategy": {
"name": "piecewise_decay",
"batch_size": 256,
"epochs": [30, 60, 90],
"steps": [0.1, 0.01, 0.001, 0.0001]
}
}
class ResNet():
def __init__(self, layers=50, is_train=True):
self.params = train_parameters
self.layers = layers
self.is_train = is_train
def net(self, input, class_dim=1000):
layers = self.layers
supported_layers = [50, 101, 152]
assert layers in supported_layers, \
"supported layers are {} but input layer is {}".format(supported_layers, layers)
if layers == 50:
depth = [3, 4, 6, 3]
elif layers == 101:
depth = [3, 4, 23, 3]
elif layers == 152:
depth = [3, 8, 36, 3]
num_filters = [64, 128, 256, 512]
conv = self.conv_bn_layer(
input=input, num_filters=64, filter_size=7, stride=2, act='relu')
conv = fluid.layers.pool2d(
input=conv,
pool_size=3,
pool_stride=2,
pool_padding=1,
pool_type='max')
for block in range(len(depth)):
for i in range(depth[block]):
conv = self.bottleneck_block(
input=conv,
num_filters=num_filters[block],
stride=2 if i == 0 and block != 0 else 1)
pool = fluid.layers.pool2d(
input=conv, pool_size=7, pool_type='avg', global_pooling=True)
stdv = 1.0 / math.sqrt(pool.shape[1] * 1.0)
out = fluid.layers.fc(input=pool,
size=class_dim,
act='softmax',
param_attr=fluid.param_attr.ParamAttr(
initializer=fluid.initializer.Uniform(-stdv,
stdv)))
return out
def conv_bn_layer(self,
input,
num_filters,
filter_size,
stride=1,
groups=1,
act=None):
conv = fluid.layers.conv2d(
input=input,
num_filters=num_filters,
filter_size=filter_size,
stride=stride,
padding=(filter_size - 1) // 2,
groups=groups,
act=None,
bias_attr=False)
return fluid.layers.batch_norm(
input=conv, act=act, is_test=not self.is_train)
def shortcut(self, input, ch_out, stride):
ch_in = input.shape[1]
if ch_in != ch_out or stride != 1:
return self.conv_bn_layer(input, ch_out, 1, stride)
else:
return input
def conv_bn_layer(input,
ch_out,
filter_size,
stride,
padding,
act='relu',
is_train=True):
conv1 = fluid.layers.conv2d(
input=input,
filter_size=filter_size,
num_filters=ch_out,
stride=stride,
padding=padding,
act=None,
bias_attr=False)
return fluid.layers.batch_norm(input=conv1, act=act, is_test=not is_train)
def shortcut(input, ch_out, stride, is_train=True):
ch_in = input.shape[1] # if args.data_format == 'NCHW' else input.shape[-1]
if ch_in != ch_out:
return conv_bn_layer(
input, ch_out, 1, stride, 0, None, is_train=is_train)
else:
return input
def basicblock(input, ch_out, stride, is_train=True):
short = shortcut(input, ch_out, stride, is_train=is_train)
conv1 = conv_bn_layer(input, ch_out, 3, stride, 1, is_train=is_train)
conv2 = conv_bn_layer(conv1, ch_out, 3, 1, 1, act=None, is_train=is_train)
return fluid.layers.elementwise_add(x=short, y=conv2, act='relu')
def bottleneck(input, ch_out, stride, is_train=True):
short = shortcut(input, ch_out * 4, stride, is_train=is_train)
conv1 = conv_bn_layer(input, ch_out, 1, stride, 0, is_train=is_train)
conv2 = conv_bn_layer(conv1, ch_out, 3, 1, 1, is_train=is_train)
conv3 = conv_bn_layer(
conv2, ch_out * 4, 1, 1, 0, act=None, is_train=is_train)
return fluid.layers.elementwise_add(x=short, y=conv3, act='relu')
def layer_warp(block_func, input, ch_out, count, stride):
res_out = block_func(input, ch_out, stride)
for i in range(1, count):
res_out = block_func(res_out, ch_out, 1)
return res_out
def bottleneck_block(self, input, num_filters, stride):
conv0 = self.conv_bn_layer(
input=input, num_filters=num_filters, filter_size=1, act='relu')
conv1 = self.conv_bn_layer(
input=conv0,
num_filters=num_filters,
filter_size=3,
stride=stride,
act='relu')
conv2 = self.conv_bn_layer(
input=conv1, num_filters=num_filters * 4, filter_size=1, act=None)
def resnet_imagenet(input,
class_dim,
depth=50,
data_format='NCHW',
is_train=True):
short = self.shortcut(input, num_filters * 4, stride)
cfg = {
18: ([2, 2, 2, 1], basicblock),
34: ([3, 4, 6, 3], basicblock),
50: ([3, 4, 6, 3], bottleneck),
101: ([3, 4, 23, 3], bottleneck),
152: ([3, 8, 36, 3], bottleneck)
}
stages, block_func = cfg[depth]
conv1 = conv_bn_layer(input, ch_out=64, filter_size=7, stride=2, padding=3)
pool1 = fluid.layers.pool2d(
input=conv1, pool_type='avg', pool_size=3, pool_stride=2)
res1 = layer_warp(block_func, pool1, 64, stages[0], 1)
res2 = layer_warp(block_func, res1, 128, stages[1], 2)
res3 = layer_warp(block_func, res2, 256, stages[2], 2)
res4 = layer_warp(block_func, res3, 512, stages[3], 2)
pool2 = fluid.layers.pool2d(
input=res4,
pool_size=7,
pool_type='avg',
pool_stride=1,
global_pooling=True)
out = fluid.layers.fc(input=pool2, size=class_dim, act='softmax')
return out
def resnet_cifar10(input, class_dim, depth=32, data_format='NCHW'):
assert (depth - 2) % 6 == 0
n = (depth - 2) // 6
conv1 = conv_bn_layer(
input=input, ch_out=16, filter_size=3, stride=1, padding=1)
res1 = layer_warp(basicblock, conv1, 16, n, 1)
res2 = layer_warp(basicblock, res1, 32, n, 2)
res3 = layer_warp(basicblock, res2, 64, n, 2)
pool = fluid.layers.pool2d(
input=res3, pool_size=8, pool_type='avg', pool_stride=1)
out = fluid.layers.fc(input=pool, size=class_dim, act='softmax')
return out
return fluid.layers.elementwise_add(x=short, y=conv2, act='relu')
def _model_reader_dshape_classdim(args, is_train):
model = resnet_cifar10
model = None
reader = None
if args.data_set == "cifar10":
class_dim = 10
if args.data_format == 'NCHW':
dshape = [3, 32, 32]
else:
dshape = [32, 32, 3]
model = resnet_cifar10
if is_train:
reader = paddle.dataset.cifar.train10()
else:
reader = paddle.dataset.cifar.test10()
elif args.data_set == "flowers":
if args.data_set == "flowers":
class_dim = 102
if args.data_format == 'NCHW':
dshape = [3, 224, 224]
else:
dshape = [224, 224, 3]
model = resnet_imagenet
if is_train:
reader = paddle.dataset.flowers.train()
else:
......@@ -159,7 +152,6 @@ def _model_reader_dshape_classdim(args, is_train):
dshape = [3, 224, 224]
else:
dshape = [224, 224, 3]
model = resnet_imagenet
if not args.data_path:
raise Exception(
"Must specify --data_path when training with imagenet")
......@@ -173,12 +165,11 @@ def _model_reader_dshape_classdim(args, is_train):
reader = train(xmap=False)
else:
reader = val(xmap=False)
return model, reader, dshape, class_dim
return reader, dshape, class_dim
def get_model(args, is_train, main_prog, startup_prog):
model, reader, dshape, class_dim = _model_reader_dshape_classdim(args,
is_train)
reader, dshape, class_dim = _model_reader_dshape_classdim(args, is_train)
pyreader = None
trainer_count = int(os.getenv("PADDLE_TRAINERS"))
......@@ -198,7 +189,8 @@ def get_model(args, is_train, main_prog, startup_prog):
label = fluid.layers.data(
name='label', shape=[1], dtype='int64')
predict = model(input, class_dim, is_train=is_train)
model = ResNet(is_train=is_train)
predict = model.net(input, class_dim=class_dim)
cost = fluid.layers.cross_entropy(input=predict, label=label)
avg_cost = fluid.layers.mean(x=cost)
......@@ -215,16 +207,15 @@ def get_model(args, is_train, main_prog, startup_prog):
total_images = 1281167 / trainer_count
step = int(total_images / args.batch_size + 1)
epochs = [30, 60, 80, 90]
step = int(total_images / (args.batch_size * args.gpus) + 1)
epochs = [30, 60, 90]
bd = [step * e for e in epochs]
base_lr = args.learning_rate
lr = []
lr = [base_lr * (0.1**i) for i in range(len(bd) + 1)]
optimizer = fluid.optimizer.Momentum(
learning_rate=base_lr,
#learning_rate=fluid.layers.piecewise_decay(
# boundaries=bd, values=lr),
learning_rate=fluid.layers.piecewise_decay(
boundaries=bd, values=lr),
momentum=0.9,
regularization=fluid.regularizer.L2Decay(1e-4))
optimizer.minimize(avg_cost)
......
# PaddlePaddle发行规范
PaddlePaddle使用git-flow branching model做分支管理,使用[Semantic Versioning](http://semver.org/)标准表示PaddlePaddle版本号。
PaddlePaddle使用Trunk Based Development,使用[Semantic Versioning](http://semver.org/)标准表示PaddlePaddle版本号。
PaddlePaddle每次发新的版本,遵循以下流程:
1.`develop`分支派生出新的分支,分支名为`release/版本号`。例如,`release/0.10.0`
1. 将新分支的版本打上tag,tag为`版本号rc.Patch号`。第一个tag为`0.10.0rc1`,第二个为`0.10.0rc2`,依次类推。
1. 对这个版本的提交,做如下几个操作:
* 使用Regression Test List作为检查列表,测试本次release的正确性。
* 如果失败,记录下所有失败的例子,在这个`release/版本号`分支中,修复所有bug后,Patch号加一,到第二步
* 修改`python/setup.py.in`中的版本信息,并将`istaged`字段设为`True`
* 将这个版本的python wheel包发布到pypi。
* 更新Docker镜像(参考后面的操作细节)。
1. 第三步完成后,将`release/版本号`分支合入master分支,将master分支的合入commit打上tag,tag为`版本号`。同时再将`master`分支合入`develop`分支。
1. 协同完成Release Note的书写。
2. 将新分支的版本打上tag,tag为`版本号rc-Patch号`。例如,第一个tag为`0.10.0-rc0`
3. 新分支一般不接受新的feature和优化。QA在release分支上进行测试。研发基于最新的develop开发。
4. QA和研发发现的bug,在develop上修复验证后,cherry-pick修复到release分支。直到release分支相对稳定。
5. 如果有需要,在release分支最新代码上打上新的tag,比如`0.10.0-rc1`,让更多的用户加入测试。重复3-4步。
6. release分支稳定后,打上正式的release tag,比如`0.10.0`
7. 将这个版本的python wheel包发布到pypi。
8. 更新Docker镜像(参考后面的操作细节)。
需要注意的是:
* `release/版本号`分支一旦建立,一般不允许再从`develop`分支合入`release/版本号`。这样保证`release/版本号`分支功能的封闭,方便测试人员测试PaddlePaddle的行为。
*`release/版本号`分支存在的时候,如果有bugfix的行为,需要将bugfix的分支同时merge到`master`, `develop``release/版本号`这三个分支。
* bug修复需要先在develop上进行,然后进入release分支。而不是直接在release分支上开发。
* release分支原则上只接受修复类的修改,不接受新feature。
## 发布wheel包到pypi
......@@ -61,24 +60,21 @@ docker push [镜像]:[version]
## PaddlePaddle 分支规范
PaddlePaddle开发过程使用[git-flow](http://nvie.com/posts/a-successful-git-branching-model/)分支规范,并适应github的特性做了一些区别。
* PaddlePaddle的主版本库遵循[git-flow](http://nvie.com/posts/a-successful-git-branching-model/)分支规范。其中:
* `master`分支为稳定(stable branch)版本分支。每一个`master`分支的版本都是经过单元测试和回归测试的版本。
* `develop`分支为开发(develop branch)版本分支。每一个`develop`分支的版本都经过单元测试,但并没有经过回归测试。
* `release/版本号`分支为每一次Release时建立的临时分支。在这个阶段的代码正在经历回归测试。
PaddlePaddle开发过程使用[Trunk Based Development](https://trunkbaseddevelopment.com/) 开发规范。
* 其他用户的fork版本库并不需要严格遵守[git-flow](http://nvie.com/posts/a-successful-git-branching-model/)分支规范,但所有fork的版本库的所有分支都相当于特性分支。
* 建议,开发者fork的版本库使用`develop`分支同步主版本库的`develop`分支
* 建议,开发者fork的版本库中,再基于`develop`版本fork出自己的功能分支。
* 当功能分支开发完毕后,向PaddlePaddle的主版本库提交`Pull Reuqest`,进而进行代码评审。
* 在评审过程中,开发者修改自己的代码,可以继续在自己的功能分支提交代码。
* `develop`分支为开发(develop branch)版本分支。每一个`develop`分支的版本都经过单元测试。并且会经过模型回归测试。
* `release/版本号`分支为每一次Release时建立的临时分支。release分支主要用于测试,bug修复和最终发版。
* `master`分支因为历史原因,已经废弃。
* BugFix分支也是在开发者自己的fork版本库维护,与功能分支不同的是,BugFix分支需要分别给主版本库的`master``develop`与可能有的`release/版本号`分支,同时提起`Pull Request`
* 其他开发者fork的feature branch。
* 建议,开发者的feature branch需要同步主版本库的`develop`分支。
* 建议,开发者的feature branch需要基于主版本库中的`develop`分支。
* 当feature branch开发完毕后,向PaddlePaddle的主版本库提交`Pull Reuqest`,进而进行代码评审。
* 在评审过程中,开发者修改自己的代码,可以继续在自己的feature branch提交代码。
## PaddlePaddle回归测试列表
本列表说明PaddlePaddle发版之前需要测试的功能点。
TODO
### PaddlePaddle Book中所有章节
......
......@@ -4,26 +4,21 @@ PaddlePaddle manages its branches using "git-flow branching model", and [Semanti
Each time we release a new PaddlePaddle version, we should follow the below steps:
1. Fork a new branch from `develop` named `release/[version]`, e.g. `release/0.10.0`.
1. Push a new tag on the release branch, the tag name should be like `[version]rc.patch`. The
first tag should be `0.10.0rc1`, and the second should be `0.10.0.rc2` and so on.
1. After that, we should do:
* Run all regression test on the Regression Test List (see PaddlePaddle TeamCity CI), to confirm
that this release has no major bugs.
* If regression test fails, we must fix those bugs and create a new `release/[version]`
branch from previous release branch.
* Modify `python/setup.py.in`, change the version number and change `ISTAGED` to `True`.
* Publish PaddlePaddle release wheel packages to pypi (see below instructions for detail).
* Update the Docker images (see below instructions for detail).
1. After above step, merge `release/[version]` branch to master and push a tag on the master commit,
then merge `master` to `develop`.
1. Update the Release Note.
***NOTE:***
* Do ***NOT*** merge commits from develop branch to release branches to keep the release branch contain
features only for current release, so that we can test on that version.
* If we want to fix bugs on release branches, we must merge the fix to master, develop and release branch.
1. Create a new release branch from `develop`,named `release/[version]`. E.g.,`release/0.10.0`
2. Create a new tag for the release branch, tag format: `version-rc.Patch`. E.g. the first tag is `0.10.0-rc0`
3. New release branch normally doesn't accept new features or optimizations. QA will test on the release branch. Developer should develop based on `develop` branch.
4. If QA or Developer find bugs. They should first fix and verify on `develop` branch. Then cherry-pick the fix to the release branch. Wait until the release branch is stable.
5. If necessary, create a new tag on the relese branch, e.g. `0.10.0-rc1`. Involve more users to try it and repeat step 3-4.
6. After release branch is stable,Create the official release tag,such as `0.10.0`.
7. Release the python wheel package to pypi.
8. Update the docker image (More details below).
NOTE:
* bug fix should happen on `develop` branch, then cherry-pick to relese branch. Avoid developing directly on release branch.
* release normally only accept bug fixes. Don't add new features.
## Publish Wheel Packages to pypi
......@@ -97,26 +92,22 @@ You can then checkout the latest pushed tags at https://hub.docker.com/r/paddlep
## Branching Model
We use [git-flow](http://nvie.com/posts/a-successful-git-branching-model/) as our branching model,
with some modifications:
* `master` branch is the stable branch. Each version on the master branch is tested and guaranteed.
* `develop` branch is for development. Each commit on develop branch has passed CI unit test, but no
regression tests are run.
* `release/[version]` branch is used to publish each release. Latest release version branches have
bugfix only for that version, but no feature updates.
* Developer forks are not required to follow
[git-flow](http://nvie.com/posts/a-successful-git-branching-model/)
branching model, all forks is like a feature branch.
* Advise: developer fork's develop branch is used to sync up with main repo's develop branch.
* Advise: developer use it's fork's develop branch to for new branch to start developing.
* Use that branch on developer's fork to create pull requests and start reviews.
* developer can push new commits to that branch when the pull request is open.
* Bug fixes are also started from developers forked repo. And, bug fixes branch can merge to
`master`, `develop` and `releases`.
PaddlePaddle uses [Trunk Based Development](https://trunkbaseddevelopment.com/) as our branching model.
* `develop` branch is used for development. Each comment to `develop` branc goes through unit tests and model regression tests.
* `release/[version]` branch is used for each release. Release branch is used for tests, bug fix and evetual release.
* `master` branch as been deprecated for historical reasons
* Developer's feature branch。
* Developer's feature branch should sync with upstream `develop` branch.
* Developer's feature branch should be forked from upstream `develop` branch.
* After feature branch is ready, create a `Pull Request` against the Paddle repo and go through code review.
* In the review process, develop modify codes and push to their own feature branch.
## PaddlePaddle Regression Test List
TODO
### All Chapters of PaddlePaddle Book
We need to guarantee that all the chapters of PaddlePaddle Book can run correctly. Including
......
......@@ -59,7 +59,7 @@ paddle.fluid.DistributeTranspiler.__init__ ArgSpec(args=['self', 'config'], vara
paddle.fluid.DistributeTranspiler.get_pserver_program ArgSpec(args=['self', 'endpoint'], varargs=None, keywords=None, defaults=None)
paddle.fluid.DistributeTranspiler.get_pserver_programs ArgSpec(args=['self', 'endpoint'], varargs=None, keywords=None, defaults=None)
paddle.fluid.DistributeTranspiler.get_startup_program ArgSpec(args=['self', 'endpoint', 'pserver_program', 'startup_program'], varargs=None, keywords=None, defaults=(None, None))
paddle.fluid.DistributeTranspiler.get_trainer_program ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None)
paddle.fluid.DistributeTranspiler.get_trainer_program ArgSpec(args=['self', 'wait_port'], varargs=None, keywords=None, defaults=(True,))
paddle.fluid.DistributeTranspiler.transpile ArgSpec(args=['self', 'trainer_id', 'program', 'pservers', 'trainers', 'sync_mode', 'startup_program'], varargs=None, keywords=None, defaults=(None, '127.0.0.1:6174', 1, True, None))
paddle.fluid.InferenceTranspiler.__init__
paddle.fluid.InferenceTranspiler.transpile ArgSpec(args=['self', 'program', 'place', 'scope'], varargs=None, keywords=None, defaults=(None,))
......@@ -100,7 +100,7 @@ paddle.fluid.layers.gru_unit ArgSpec(args=['input', 'hidden', 'size', 'param_att
paddle.fluid.layers.linear_chain_crf ArgSpec(args=['input', 'label', 'param_attr'], varargs=None, keywords=None, defaults=(None,))
paddle.fluid.layers.crf_decoding ArgSpec(args=['input', 'param_attr', 'label'], varargs=None, keywords=None, defaults=(None,))
paddle.fluid.layers.cos_sim ArgSpec(args=['X', 'Y'], varargs=None, keywords=None, defaults=None)
paddle.fluid.layers.cross_entropy ArgSpec(args=['input', 'label', 'soft_label'], varargs=None, keywords=None, defaults=(False,))
paddle.fluid.layers.cross_entropy ArgSpec(args=['input', 'label', 'soft_label', 'ignore_index'], varargs=None, keywords=None, defaults=(False, -100))
paddle.fluid.layers.square_error_cost ArgSpec(args=['input', 'label'], varargs=None, keywords=None, defaults=None)
paddle.fluid.layers.chunk_eval ArgSpec(args=['input', 'label', 'chunk_scheme', 'num_chunk_types', 'excluded_chunk_types'], varargs=None, keywords=None, defaults=(None,))
paddle.fluid.layers.sequence_conv ArgSpec(args=['input', 'num_filters', 'filter_size', 'filter_stride', 'padding', 'bias_attr', 'param_attr', 'act'], varargs=None, keywords=None, defaults=(3, 1, None, None, None, None))
......@@ -142,7 +142,7 @@ paddle.fluid.layers.beam_search ArgSpec(args=['pre_ids', 'pre_scores', 'ids', 's
paddle.fluid.layers.row_conv ArgSpec(args=['input', 'future_context_size', 'param_attr', 'act'], varargs=None, keywords=None, defaults=(None, None))
paddle.fluid.layers.multiplex ArgSpec(args=['inputs', 'index'], varargs=None, keywords=None, defaults=None)
paddle.fluid.layers.layer_norm ArgSpec(args=['input', 'scale', 'shift', 'begin_norm_axis', 'epsilon', 'param_attr', 'bias_attr', 'act', 'name'], varargs=None, keywords=None, defaults=(True, True, 1, 1e-05, None, None, None, None))
paddle.fluid.layers.softmax_with_cross_entropy ArgSpec(args=['logits', 'label', 'soft_label'], varargs=None, keywords=None, defaults=(False,))
paddle.fluid.layers.softmax_with_cross_entropy ArgSpec(args=['logits', 'label', 'soft_label', 'ignore_index'], varargs=None, keywords=None, defaults=(False, -100))
paddle.fluid.layers.smooth_l1 ArgSpec(args=['x', 'y', 'inside_weight', 'outside_weight', 'sigma'], varargs=None, keywords=None, defaults=(None, None, None))
paddle.fluid.layers.one_hot ArgSpec(args=['input', 'depth'], varargs=None, keywords=None, defaults=None)
paddle.fluid.layers.autoincreased_step_counter ArgSpec(args=['counter_name', 'begin', 'step'], varargs=None, keywords=None, defaults=(None, 1, 1))
......@@ -305,9 +305,9 @@ paddle.fluid.layers.target_assign ArgSpec(args=['input', 'matched_indices', 'neg
paddle.fluid.layers.detection_output ArgSpec(args=['loc', 'scores', 'prior_box', 'prior_box_var', 'background_label', 'nms_threshold', 'nms_top_k', 'keep_top_k', 'score_threshold', 'nms_eta'], varargs=None, keywords=None, defaults=(0, 0.3, 400, 200, 0.01, 1.0))
paddle.fluid.layers.ssd_loss ArgSpec(args=['location', 'confidence', 'gt_box', 'gt_label', 'prior_box', 'prior_box_var', 'background_label', 'overlap_threshold', 'neg_pos_ratio', 'neg_overlap', 'loc_loss_weight', 'conf_loss_weight', 'match_type', 'mining_type', 'normalize', 'sample_size'], varargs=None, keywords=None, defaults=(None, 0, 0.5, 3.0, 0.5, 1.0, 1.0, 'per_prediction', 'max_negative', True, None))
paddle.fluid.layers.detection_map ArgSpec(args=['detect_res', 'label', 'class_num', 'background_label', 'overlap_threshold', 'evaluate_difficult', 'has_state', 'input_states', 'out_states', 'ap_version'], varargs=None, keywords=None, defaults=(0, 0.3, True, None, None, None, 'integral'))
paddle.fluid.layers.rpn_target_assign ArgSpec(args=['loc', 'scores', 'anchor_box', 'anchor_var', 'gt_box', 'rpn_batch_size_per_im', 'fg_fraction', 'rpn_positive_overlap', 'rpn_negative_overlap'], varargs=None, keywords=None, defaults=(256, 0.25, 0.7, 0.3))
paddle.fluid.layers.rpn_target_assign ArgSpec(args=['bbox_pred', 'cls_logits', 'anchor_box', 'anchor_var', 'gt_boxes', 'is_crowd', 'im_info', 'rpn_batch_size_per_im', 'rpn_straddle_thresh', 'rpn_fg_fraction', 'rpn_positive_overlap', 'rpn_negative_overlap', 'use_random'], varargs=None, keywords=None, defaults=(256, 0.0, 0.5, 0.7, 0.3, True))
paddle.fluid.layers.anchor_generator ArgSpec(args=['input', 'anchor_sizes', 'aspect_ratios', 'variance', 'stride', 'offset', 'name'], varargs=None, keywords=None, defaults=(None, None, [0.1, 0.1, 0.2, 0.2], None, 0.5, None))
paddle.fluid.layers.generate_proposal_labels ArgSpec(args=['rpn_rois', 'gt_classes', 'gt_boxes', 'im_scales', 'batch_size_per_im', 'fg_fraction', 'fg_thresh', 'bg_thresh_hi', 'bg_thresh_lo', 'bbox_reg_weights', 'class_nums'], varargs=None, keywords=None, defaults=(256, 0.25, 0.25, 0.5, 0.0, [0.1, 0.1, 0.2, 0.2], None))
paddle.fluid.layers.generate_proposal_labels ArgSpec(args=['rpn_rois', 'gt_classes', 'is_crowd', 'gt_boxes', 'im_info', 'batch_size_per_im', 'fg_fraction', 'fg_thresh', 'bg_thresh_hi', 'bg_thresh_lo', 'bbox_reg_weights', 'class_nums', 'use_random'], varargs=None, keywords=None, defaults=(256, 0.25, 0.25, 0.5, 0.0, [0.1, 0.1, 0.2, 0.2], None, True))
paddle.fluid.layers.generate_proposals ArgSpec(args=['scores', 'bbox_deltas', 'im_info', 'anchors', 'variances', 'pre_nms_top_n', 'post_nms_top_n', 'nms_thresh', 'min_size', 'eta', 'name'], varargs=None, keywords=None, defaults=(6000, 1000, 0.5, 0.1, 1.0, None))
paddle.fluid.layers.iou_similarity ArgSpec(args=[], varargs='args', keywords='kwargs', defaults=None)
paddle.fluid.layers.box_coder ArgSpec(args=[], varargs='args', keywords='kwargs', defaults=None)
......@@ -346,7 +346,7 @@ paddle.fluid.transpiler.DistributeTranspiler.__init__ ArgSpec(args=['self', 'con
paddle.fluid.transpiler.DistributeTranspiler.get_pserver_program ArgSpec(args=['self', 'endpoint'], varargs=None, keywords=None, defaults=None)
paddle.fluid.transpiler.DistributeTranspiler.get_pserver_programs ArgSpec(args=['self', 'endpoint'], varargs=None, keywords=None, defaults=None)
paddle.fluid.transpiler.DistributeTranspiler.get_startup_program ArgSpec(args=['self', 'endpoint', 'pserver_program', 'startup_program'], varargs=None, keywords=None, defaults=(None, None))
paddle.fluid.transpiler.DistributeTranspiler.get_trainer_program ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None)
paddle.fluid.transpiler.DistributeTranspiler.get_trainer_program ArgSpec(args=['self', 'wait_port'], varargs=None, keywords=None, defaults=(True,))
paddle.fluid.transpiler.DistributeTranspiler.transpile ArgSpec(args=['self', 'trainer_id', 'program', 'pservers', 'trainers', 'sync_mode', 'startup_program'], varargs=None, keywords=None, defaults=(None, '127.0.0.1:6174', 1, True, None))
paddle.fluid.transpiler.InferenceTranspiler.__init__
paddle.fluid.transpiler.InferenceTranspiler.transpile ArgSpec(args=['self', 'program', 'place', 'scope'], varargs=None, keywords=None, defaults=(None,))
......
......@@ -56,9 +56,9 @@ else()
cc_test(mixed_vector_test SRCS mixed_vector_test.cc DEPS place memory device_context tensor)
endif()
if (NOT WIN32)
cc_library(lod_tensor SRCS lod_tensor.cc DEPS ddim place tensor framework_proto recordio)
cc_library(lod_tensor SRCS lod_tensor.cc DEPS ddim place tensor framework_proto recordio version)
else()
cc_library(lod_tensor SRCS lod_tensor.cc DEPS ddim place tensor framework_proto)
cc_library(lod_tensor SRCS lod_tensor.cc DEPS ddim place tensor framework_proto version)
endif (NOT WIN32)
cc_test(lod_tensor_test SRCS lod_tensor_test.cc DEPS lod_tensor memory)
......@@ -116,7 +116,11 @@ cc_library(operator SRCS operator.cc DEPS op_info device_context tensor scope gl
endif(NOT WIN32)
cc_test(operator_test SRCS operator_test.cc DEPS operator op_registry device_context)
cc_library(proto_desc SRCS var_desc.cc op_desc.cc block_desc.cc program_desc.cc DEPS shape_inference op_info operator glog)
cc_library(version SRCS version.cc)
cc_test(version_test SRCS version_test.cc DEPS version)
cc_library(proto_desc SRCS var_desc.cc op_desc.cc block_desc.cc program_desc.cc DEPS shape_inference op_info operator glog version)
cc_library(op_registry SRCS op_registry.cc DEPS op_proto_maker op_info operator glog proto_desc)
nv_test(op_registry_test SRCS op_registry_test.cc DEPS op_registry)
......
......@@ -46,7 +46,8 @@ AllReduceOpHandle::AllReduceOpHandle(ir::Node *node,
#endif
void AllReduceOpHandle::RunImpl() {
platform::RecordEvent r("all_reduce", nullptr);
platform::RecordEvent record_event(Name(), dev_ctxes_.begin()->second);
if (NoDummyInputSize() == 1) {
return; // No need to all reduce when GPU count = 1;
} else {
......
......@@ -15,12 +15,15 @@
#include "paddle/fluid/framework/details/broadcast_op_handle.h"
#include "paddle/fluid/framework/details/container_cast.h"
#include "paddle/fluid/framework/details/variable_visitor.h"
#include "paddle/fluid/platform/profiler.h"
namespace paddle {
namespace framework {
namespace details {
void BroadcastOpHandle::RunImpl() {
platform::RecordEvent record_event(Name(), dev_ctxes_.begin()->second);
if (places_.size() == 1) return;
// The input and output may have dummy vars.
......
......@@ -348,14 +348,31 @@ std::unique_ptr<ir::Graph> MultiDevSSAGraphBuilder::ApplyImpl(
size_t cur_device_id = 0;
bool is_forwarding = true;
bool is_dist_train = false;
for (ir::Node *node : sorted_ops) {
if (boost::get<int>(
node->Op()->GetAttr(OpProtoAndCheckerMaker::OpRoleAttrName())) ==
static_cast<int>(OpRole::kRPC)) {
CreateRPCOp(&result, node);
int op_dev_id = CreateRPCOp(&result, node);
PADDLE_ENFORCE(op_dev_id != -1,
"Can not schedule the RPC operator to the right place.");
if (node->Op()->Type() == "recv") {
auto recv_vars_attr =
boost::get<std::vector<std::string>>(node->Op()->GetNullableAttr(
OpProtoAndCheckerMaker::OpRoleVarAttrName()));
PADDLE_ENFORCE(recv_vars_attr.size() == 2UL); // [parameter, gradient]
if (recv_vars_attr[0].find(".block") == std::string::npos) {
bcast_var_name_set[op_dev_id].emplace(recv_vars_attr[0]);
}
}
is_dist_train = true;
} else if (IsDistTrainOp(node, send_vars, recv_vars)) {
CreateDistTrainOp(&result, node);
int op_dev_id = CreateDistTrainOp(&result, node);
if (node->Op()->Type() == "concat") {
auto origin_param_name = node->Op()->OutputArgumentNames()[0];
bcast_var_name_set[op_dev_id].emplace(origin_param_name);
}
} else if (IsScaleLossOp(node)) {
// user can customize loss@grad if not use_default_grad_scale_
if (strategy_.gradient_scale_ !=
......@@ -414,7 +431,9 @@ std::unique_ptr<ir::Graph> MultiDevSSAGraphBuilder::ApplyImpl(
CreateReduceOp(&result, g_name, cur_device_id);
graph->Get<ShardedVarDevice>(kShardedVarDevice)
.emplace(g_name, cur_device_id);
bcast_var_name_set[cur_device_id].emplace(p_name);
if (!is_dist_train) {
bcast_var_name_set[cur_device_id].emplace(p_name);
}
break;
case BuildStrategy::ReduceStrategy::kAllReduce:
if (IsSparseGradient(g_name)) {
......@@ -436,15 +455,19 @@ std::unique_ptr<ir::Graph> MultiDevSSAGraphBuilder::ApplyImpl(
}
}
}
bool use_gpu = false;
#ifdef PADDLE_WITH_CUDA
use_gpu = nccl_ctxs_ != nullptr;
#endif
if (use_gpu ||
strategy_.reduce_ == BuildStrategy::ReduceStrategy::kAllReduce) {
// Insert BCast Ops
// Insert broadcast operators principle:
// 1. Broadcast optimized parameters in Reduce strategy;
// 2. No need broadcast optimized parameters in AllReduce strategy because of
// the optimization sub-graph would be run on every GPU;
// 3. Allways broadcast received parameters in Distribute Training.
if ((use_gpu &&
strategy_.reduce_ == BuildStrategy::ReduceStrategy::kReduce) ||
is_dist_train) {
for (size_t dev_id = 0; dev_id < bcast_var_name_set.size(); ++dev_id) {
auto &to_bcast_set = bcast_var_name_set[dev_id];
for (auto &bcast_name : to_bcast_set) {
......@@ -676,8 +699,8 @@ VarHandle *MultiDevSSAGraphBuilder::CreateReduceOp(ir::Graph *result,
return var;
}
void MultiDevSSAGraphBuilder::CreateDistTrainOp(ir::Graph *result,
ir::Node *node) const {
int MultiDevSSAGraphBuilder::CreateDistTrainOp(ir::Graph *result,
ir::Node *node) const {
int op_dev_id = -1;
std::vector<std::string> input_var_names;
std::vector<std::string> output_var_names;
......@@ -720,6 +743,7 @@ void MultiDevSSAGraphBuilder::CreateDistTrainOp(ir::Graph *result,
node->Op()->Type());
CreateComputationalOp(result, node, op_dev_id);
return op_dev_id;
}
void SetOpInputsAllPlaces(ir::Graph *result, ir::Node *node, int num_places) {
......@@ -738,8 +762,8 @@ void SetOpInputsAllPlaces(ir::Graph *result, ir::Node *node, int num_places) {
}
// Create RPC related op handles that connects its in ops and out ops.
void MultiDevSSAGraphBuilder::CreateRPCOp(ir::Graph *result,
ir::Node *node) const {
int MultiDevSSAGraphBuilder::CreateRPCOp(ir::Graph *result,
ir::Node *node) const {
int op_dev_id = -1;
if (node->Op()->Type() == "send") {
// TODO(paddle-dev): getting the first var is not safe.
......@@ -825,6 +849,7 @@ void MultiDevSSAGraphBuilder::CreateRPCOp(ir::Graph *result,
CreateOpOutput(result, op_handle, new_node, p, outvar_dev_id);
}
}
return op_dev_id;
}
bool MultiDevSSAGraphBuilder::IsScaleLossOp(ir::Node *node) const {
......
......@@ -54,8 +54,8 @@ class MultiDevSSAGraphBuilder : public ir::Pass {
bool IsScaleLossOp(ir::Node *node) const;
void CreateRPCOp(ir::Graph *result, ir::Node *node) const;
void CreateDistTrainOp(ir::Graph *result, ir::Node *node) const;
int CreateRPCOp(ir::Graph *result, ir::Node *node) const;
int CreateDistTrainOp(ir::Graph *result, ir::Node *node) const;
/**
* Is this operator as the end-point operator before/after send operator.
......
......@@ -27,7 +27,8 @@ namespace framework {
namespace details {
void ReduceOpHandle::RunImpl() {
platform::RecordEvent r("reduce", nullptr);
platform::RecordEvent record_event(Name(), dev_ctxes_.begin()->second);
if (places_.size() == 1) return;
// the input and output may have dummy var.
auto in_var_handles = DynamicCast<VarHandle>(inputs_);
......
......@@ -51,7 +51,7 @@ void ScaleLossGradOpHandle::RunImpl() {
->stream();
memory::Copy(boost::get<platform::CUDAPlace>(place_), tmp,
platform::CPUPlace(), &coeff_, sizeof(float), stream);
VLOG(1) << place_ << "RUN Scale loss grad op";
VLOG(10) << place_ << "RUN Scale loss grad op";
});
#endif
}
......
......@@ -16,6 +16,13 @@ syntax = "proto2";
option optimize_for = LITE_RUNTIME;
package paddle.framework.proto;
// Any incompatible changes to ProgramDesc and its dependencies should
// raise the version defined version.h.
//
// Serailization and Deserialization codes should be modified in a way
// that supports old versions following the version and compatibility policy.
message Version { optional int64 version = 1 [ default = 0 ]; }
enum AttrType {
INT = 0;
FLOAT = 1;
......@@ -180,4 +187,8 @@ message BlockDesc {
// for more details.
// TODO(panyx0718): A model can have multiple programs. Need a
// way to distinguish them. Maybe ID or name?
message ProgramDesc { repeated BlockDesc blocks = 1; }
message ProgramDesc {
repeated BlockDesc blocks = 1;
optional Version version = 2;
}
......@@ -19,7 +19,7 @@ function(pass_library TARGET DEST)
endfunction()
cc_library(node SRCS node.cc DEPS proto_desc)
cc_library(graph SRCS graph.cc DEPS node)
cc_library(graph SRCS graph.cc DEPS node pretty_log)
cc_library(graph_helper SRCS graph_helper.cc DEPS graph)
cc_library(pass SRCS pass.cc DEPS graph node graph_helper)
cc_library(graph_traits SRCS graph_traits.cc DEPS graph)
......@@ -28,6 +28,9 @@ cc_library(graph_pattern_detector SRCS graph_pattern_detector.cc DEPS graph grap
pass_library(graph_to_program_pass base)
pass_library(graph_viz_pass base)
pass_library(fc_fuse_pass inference)
if(WITH_MKLDNN)
pass_library(conv_relu_mkldnn_fuse_pass inference)
endif()
pass_library(attention_lstm_fuse_pass inference)
pass_library(infer_clean_graph_pass inference)
pass_library(fc_lstm_fuse_pass inference)
......@@ -42,3 +45,6 @@ cc_test(graph_helper_test SRCS graph_helper_test.cc DEPS graph graph_helper op_r
cc_test(graph_to_program_pass_test SRCS graph_to_program_pass_test.cc DEPS graph_to_program_pass)
cc_test(test_graph_pattern_detector SRCS graph_pattern_detector_tester.cc DEPS graph_pattern_detector)
cc_test(test_fc_fuse_pass SRCS fc_fuse_pass_tester.cc DEPS fc_fuse_pass framework_proto)
if(WITH_MKLDNN)
cc_test(test_conv_relu_mkldnn_fuse_pass SRCS conv_relu_mkldnn_fuse_pass_tester.cc DEPS conv_relu_mkldnn_fuse_pass)
endif()
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/framework/ir/conv_relu_mkldnn_fuse_pass.h"
#include <string>
#include <vector>
#include "paddle/fluid/platform/enforce.h"
namespace paddle {
namespace framework {
namespace ir {
std::unique_ptr<ir::Graph> ConvReLUFusePass::ApplyImpl(
std::unique_ptr<ir::Graph> graph) const {
PADDLE_ENFORCE(graph.get());
FusePassBase::Init("conv_relu_mkldnn_fuse", graph.get());
std::unordered_set<Node*> nodes2delete;
GraphPatternDetector gpd;
auto* conv_input = gpd.mutable_pattern()
->NewNode("conv_relu_mkldnn_fuse/conv_input")
->AsInput()
->assert_is_op_input("conv2d", "Input");
patterns::ConvReLU conv_relu_pattern(gpd.mutable_pattern(),
"conv_relu_mkldnn_fuse");
conv_relu_pattern(conv_input);
int found_conv_relu_count = 0;
auto handler = [&](const GraphPatternDetector::subgraph_t& subgraph,
Graph* g) {
VLOG(4) << "handle ConvReLU fuse";
GET_IR_NODE_FROM_SUBGRAPH(conv_weight, conv_weight,
conv_relu_pattern); // Filter
GET_IR_NODE_FROM_SUBGRAPH(conv_bias, conv_bias, conv_relu_pattern); // Bias
GET_IR_NODE_FROM_SUBGRAPH(conv_out, conv_out, conv_relu_pattern); // tmp
GET_IR_NODE_FROM_SUBGRAPH(conv, conv, conv_relu_pattern); // CONV op
GET_IR_NODE_FROM_SUBGRAPH(relu_out, relu_out, conv_relu_pattern); // Out
GET_IR_NODE_FROM_SUBGRAPH(relu, relu, conv_relu_pattern); // ReLU op
// Create an ConvReLU Node.
OpDesc desc;
std::string conv_relu_i_in = subgraph.at(conv_input)->Name();
std::string conv_relu_w_in = conv_weight->Name();
std::string conv_relu_b_in = conv_bias->Name();
std::string conv_relu_out = relu_out->Name();
desc.SetInput("Input", std::vector<std::string>({conv_relu_i_in}));
desc.SetInput("Filter", std::vector<std::string>({conv_relu_w_in}));
desc.SetInput("Bias", std::vector<std::string>({conv_relu_b_in}));
desc.SetOutput("Output", std::vector<std::string>({conv_relu_out}));
desc.SetType("conv2d");
for (auto& attr : conv->Op()->GetAttrMap()) {
desc.SetAttr(attr.first, attr.second);
}
desc.SetAttr("fuse_relu", true);
auto conv_relu_node = g->CreateOpNode(&desc); // OpDesc will be copied.
GraphSafeRemoveNodes(graph.get(), {conv, relu, conv_out});
PADDLE_ENFORCE(subgraph.count(conv_input));
IR_NODE_LINK_TO(subgraph.at(conv_input), conv_relu_node);
IR_NODE_LINK_TO(conv_weight, conv_relu_node);
IR_NODE_LINK_TO(conv_bias, conv_relu_node);
IR_NODE_LINK_TO(conv_relu_node, relu_out);
found_conv_relu_count++;
};
gpd(graph.get(), handler);
AddStatis(found_conv_relu_count);
return graph;
}
} // namespace ir
} // namespace framework
} // namespace paddle
REGISTER_PASS(conv_relu_mkldnn_fuse_pass,
paddle::framework::ir::ConvReLUFusePass);
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include "paddle/fluid/framework/ir/fuse_pass_base.h"
#include "paddle/fluid/framework/ir/graph.h"
#include "paddle/fluid/framework/ir/graph_pattern_detector.h"
#include "paddle/fluid/framework/ir/pass.h"
namespace paddle {
namespace framework {
namespace ir {
/*
* Fuse the CONV and ReLU to a ConvReLUOp.
*/
class ConvReLUFusePass : public FusePassBase {
public:
virtual ~ConvReLUFusePass() {}
protected:
std::unique_ptr<ir::Graph> ApplyImpl(std::unique_ptr<ir::Graph> graph) const;
};
} // namespace ir
} // namespace framework
} // namespace paddle
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/framework/ir/conv_relu_mkldnn_fuse_pass.h"
#include <gtest/gtest.h>
namespace paddle {
namespace framework {
namespace ir {
void SetOp(ProgramDesc* prog, const std::string& type,
const std::vector<std::string>& inputs,
const std::vector<std::string>& outputs) {
auto* op = prog->MutableBlock(0)->AppendOp();
op->SetType(type);
if (type == "conv2d") {
op->SetAttr("use_mkldnn", true);
op->SetInput("Input", {inputs[0]});
op->SetInput("Filter", {inputs[1]});
op->SetInput("Bias", {inputs[2]});
} else if (type == "relu") {
op->SetInput("X", inputs);
}
op->SetOutput("Out", outputs);
}
// a->OP0->b
// b->OP1->c
// (c, weights, bias)->conv->f
// (f)->relu->g
ProgramDesc BuildProgramDesc() {
ProgramDesc prog;
for (auto& v :
std::vector<std::string>({"a", "b", "c", "weights", "bias", "f", "g"})) {
auto* var = prog.MutableBlock(0)->Var(v);
var->SetType(proto::VarType::SELECTED_ROWS);
if (v == "weights" || v == "bias") {
var->SetPersistable(true);
}
}
SetOp(&prog, "OP0", std::vector<std::string>({"a"}),
std::vector<std::string>({"b"}));
SetOp(&prog, "OP1", std::vector<std::string>({"b"}),
std::vector<std::string>({"c"}));
SetOp(&prog, "conv2d", std::vector<std::string>({"c", "weights", "bias"}),
std::vector<std::string>({"f"}));
SetOp(&prog, "relu", std::vector<std::string>({"f"}),
std::vector<std::string>({"g"}));
return prog;
}
TEST(ConvReLUFusePass, basic) {
auto prog = BuildProgramDesc();
std::unique_ptr<ir::Graph> graph(new ir::Graph(prog));
auto pass = PassRegistry::Instance().Get("conv_relu_mkldnn_fuse_pass");
int original_nodes_num = graph->Nodes().size();
graph = pass->Apply(std::move(graph));
int current_nodes_num = graph->Nodes().size();
// Remove 3 Nodes: CONV, RELU, conv_out
// Add 1 Node: ConvReLU
EXPECT_EQ(original_nodes_num - 2, current_nodes_num);
// Assert conv_relu op in newly generated graph
int conv_relu_count = 0;
for (auto* node : graph->Nodes()) {
if (node->IsOp() && node->Op()->Type() == "conv2d") {
if (node->Op()->HasAttr("use_mkldnn")) {
bool use_mkldnn = boost::get<bool>(node->Op()->GetAttr("use_mkldnn"));
if (use_mkldnn) {
if (node->Op()->HasAttr("fuse_relu")) {
bool fuse_relu = boost::get<bool>(node->Op()->GetAttr("fuse_relu"));
if (fuse_relu) {
++conv_relu_count;
}
}
}
}
}
}
EXPECT_EQ(conv_relu_count, 1);
}
} // namespace ir
} // namespace framework
} // namespace paddle
USE_PASS(conv_relu_mkldnn_fuse_pass);
......@@ -51,7 +51,7 @@ int BuildFusion(Graph* graph, const std::string& name_scope, Scope* scope,
if (with_fc_bias) {
// Add FC-bias with LSTM-bias and create a new weight
PADDLE_ENFORCE(scope);
const std::string& new_bias_var = name_scope + "_bias.new";
const std::string& new_bias_var = patterns::UniqueKey("NewBias");
auto* bias_var = scope->Var(new_bias_var);
PADDLE_ENFORCE(bias_var);
auto* bias_tensor = bias_var->GetMutable<framework::LoDTensor>();
......@@ -120,7 +120,6 @@ int BuildFusion(Graph* graph, const std::string& name_scope, Scope* scope,
auto handler = [&](const GraphPatternDetector::subgraph_t& subgraph,
Graph* g) {
GET_IR_NODE_FROM_SUBGRAPH(lstm, lstm, lstm_pattern);
GET_IR_NODE_FROM_SUBGRAPH(Weight, Weight, lstm_pattern);
GET_IR_NODE_FROM_SUBGRAPH(Bias, Bias, lstm_pattern);
......@@ -136,7 +135,7 @@ int BuildFusion(Graph* graph, const std::string& name_scope, Scope* scope,
fc_bias);
// Remove unneeded nodes.
std::unordered_set<const Node*> marked_nodes(
{mul, lstm, elementwise_add});
{mul, lstm, elementwise_add, fc_bias});
GraphSafeRemoveNodes(graph, marked_nodes);
} else {
GET_IR_NODE_FROM_SUBGRAPH(fc_out, mul_out, fc_pattern);
......
......@@ -21,12 +21,17 @@
#include "paddle/fluid/framework/ir/graph_traits.h"
#include "paddle/fluid/framework/ir/graph_viz_pass.h"
#include "paddle/fluid/platform/enforce.h"
#include "paddle/fluid/string/pretty_log.h"
#include "paddle/fluid/string/printf.h"
namespace paddle {
namespace framework {
namespace ir {
using string::PrettyLogEndl;
using string::PrettyLog;
using string::Style;
size_t PDPattern::id_ = 0UL;
PDNode* PDPattern::NewNode(const std::string& name) {
......@@ -83,7 +88,7 @@ void GraphPatternDetector::operator()(Graph* graph,
ValidateByNodeRole(&subgraphs);
if (subgraphs.empty()) return;
LOG(INFO) << "detect " << subgraphs.size() << " subgraph matches the pattern";
PrettyLogEndl(Style::detail(), "--- detect %d subgraphs", subgraphs.size());
int id = 0;
for (auto& g : subgraphs) {
VLOG(3) << "optimizing #" << id++ << " subgraph";
......@@ -517,6 +522,39 @@ bool VarLinksFromOp(Node* node, const std::string& op_type) {
return false;
}
PDNode* patterns::ConvReLU::operator()(
paddle::framework::ir::PDNode* conv_input) {
// Create Operators
conv_input->assert_is_op_input("conv2d", "Input");
auto* conv_op = pattern->NewNode(conv_repr())->assert_is_op("conv2d");
auto* relu_op = pattern->NewNode(relu_repr())->assert_is_op("relu");
// Create variables
// Filter
auto* conv_weight_var = pattern->NewNode(conv_weight_repr())
->AsInput()
->assert_is_persistable_var()
->assert_is_op_input("conv2d", "Filter");
// Bias
auto* conv_bias_var = pattern->NewNode(conv_bias_repr())
->AsInput()
->assert_is_persistable_var()
->assert_is_op_input("conv2d", "Bias");
// intermediate variable, will be removed in the IR after fuse.
auto* conv_out_var = pattern->NewNode(conv_out_repr())
->AsIntermediate()
->assert_is_only_output_of_op("conv2d")
->assert_is_op_input("relu");
// output
auto* relu_out_var = pattern->NewNode(relu_out_repr())
->AsOutput()
->assert_is_op_output("relu");
conv_op->LinksFrom({conv_input, conv_weight_var, conv_bias_var})
.LinksTo({conv_out_var});
relu_op->LinksFrom({conv_out_var}).LinksTo({relu_out_var});
return relu_out_var;
}
PDNode* patterns::FC::operator()(paddle::framework::ir::PDNode* x,
bool with_bias) {
// Create shared nodes.
......
......@@ -360,6 +360,28 @@ struct PatternBase {
size_t id_;
};
// CONV with ReLU
// op: conv + relu
// named nodes:
// conv_input, conv_weight,
// conv_bias, conv_out, conv,
// relu_out, relu
struct ConvReLU : public PatternBase {
ConvReLU(PDPattern* pattern, const std::string& name_scope)
: PatternBase(pattern, name_scope, "conv_relu") {}
PDNode* operator()(PDNode* conv_input);
// declare operator node's name
PATTERN_DECL_NODE(conv);
PATTERN_DECL_NODE(relu);
// declare variable node's name
PATTERN_DECL_NODE(conv_weight);
PATTERN_DECL_NODE(conv_bias);
PATTERN_DECL_NODE(conv_out);
PATTERN_DECL_NODE(relu_out);
};
// FC with bias
// op: mul + elementwise_add
// named nodes:
......
......@@ -21,6 +21,7 @@ limitations under the License. */
#include "paddle/fluid/framework/framework.pb.h"
#include "paddle/fluid/framework/lod_tensor.h"
#include "paddle/fluid/framework/var_type.h"
#include "paddle/fluid/framework/version.h"
#include "paddle/fluid/memory/memcpy.h"
#include "paddle/fluid/memory/memory.h"
......@@ -251,8 +252,8 @@ void AppendLoD(LoD *lod, const LoD &lod_length) {
void SerializeToStream(std::ostream &os, const LoDTensor &tensor,
const platform::DeviceContext &dev_ctx) {
{ // the 1st field, uint32_t version for LoDTensor
constexpr uint32_t version = 0;
os.write(reinterpret_cast<const char *>(&version), sizeof(version));
os.write(reinterpret_cast<const char *>(&kCurTensorVersion),
sizeof(kCurTensorVersion));
}
{
// the 2st field, LoD information
......@@ -281,6 +282,8 @@ void DeserializeFromStream(std::istream &is, LoDTensor *tensor,
// the 1st field, unit32_t version for LoDTensor
uint32_t version;
is.read(reinterpret_cast<char *>(&version), sizeof(version));
PADDLE_ENFORCE(framework::IsTensorVersionSupported(version),
"tensor version %u is not supported.", version);
PADDLE_ENFORCE_EQ(version, 0U, "Only version 0 is supported");
}
{
......
......@@ -495,35 +495,35 @@ class RuntimeInferShapeContext : public InferShapeContext {
: op_(op), scope_(scope) {}
bool HasInput(const std::string& name) const override {
if (!op_.HasInputs(name)) {
// has only one input
const auto& ins = op_.Inputs();
auto it = ins.find(name);
if (it == ins.end()) {
return false;
}
auto& ins = Inputs(name);
size_t length = ins.size();
if (length == 0) {
const auto& in = it->second;
if (in.size() == 0 || in[0] == kEmptyVarName) {
return false;
}
PADDLE_ENFORCE_EQ(length, 1UL,
PADDLE_ENFORCE_EQ(in.size(), 1UL,
"Input %s should not have more than one inputs", name);
auto ipt = ins[0];
auto* var = ipt == kEmptyVarName ? nullptr : scope_.FindVar(ipt);
return var != nullptr;
return scope_.FindVar(in[0]) != nullptr;
}
bool HasOutput(const std::string& name) const override {
if (!op_.HasOutputs(name)) {
// has only one output
const auto& outs = op_.Outputs();
auto it = outs.find(name);
if (it == outs.end()) {
return false;
}
auto& outs = Outputs(name);
size_t length = outs.size();
if (length == 0) {
const auto& out = it->second;
if (out.size() == 0 || out[0] == kEmptyVarName) {
return false;
}
PADDLE_ENFORCE_EQ(length, 1UL,
"Output %s should not have more than one inputs", name);
auto ipt = outs[0];
auto* var = ipt == kEmptyVarName ? nullptr : scope_.FindVar(ipt);
return var != nullptr;
PADDLE_ENFORCE_EQ(out.size(), 1UL,
"Output %s should not have more than one outputs", name);
return scope_.FindVar(out[0]) != nullptr;
}
bool HasInputs(const std::string& name) const override {
......
......@@ -352,7 +352,10 @@ void ParallelExecutor::FeedAndSplitTensorIntoLocalScopes(
ParallelExecutor::~ParallelExecutor() {
if (member_->own_local_scope_) {
for (size_t i = 1; i < member_->local_scopes_.size(); ++i) {
member_->global_scope_->DeleteScope(member_->local_scopes_[i]);
Scope *local_scope = member_->local_scopes_[i];
if (member_->global_scope_->HasKid(local_scope)) {
member_->global_scope_->DeleteScope(local_scope);
}
}
}
}
......
......@@ -15,6 +15,7 @@ limitations under the License. */
#include "paddle/fluid/framework/program_desc.h"
#include "paddle/fluid/framework/block_desc.h"
#include "paddle/fluid/framework/feed_fetch_type.h"
#include "paddle/fluid/framework/version.h"
namespace paddle {
namespace framework {
......@@ -38,7 +39,10 @@ proto::ProgramDesc *ProgramDesc::Proto() {
return &desc_;
}
int64_t ProgramDesc::Version() const { return desc_.version().version(); }
ProgramDesc::ProgramDesc() {
desc_.mutable_version()->set_version(kCurProgramVersion);
auto *block = desc_.mutable_blocks()->Add();
block->set_idx(kRootBlockIndex);
block->set_parent_idx(kNoneBlockIndex);
......
......@@ -57,6 +57,8 @@ class ProgramDesc {
proto::ProgramDesc *Proto();
int64_t Version() const;
// The output variable of feed_op is referenced as feed_target.
// This function is used to collect the output variable's name of all
// feed_ops.
......
......@@ -87,8 +87,17 @@ TEST(ProgramDesc, copy_ctor) {
ASSERT_EQ(op_origin->Inputs(), op_copy->Inputs());
ASSERT_EQ(op_origin->Outputs(), op_copy->Outputs());
ASSERT_EQ(op_copy->Proto()->SerializeAsString(),
op_origin->Proto()->SerializeAsString());
ASSERT_EQ(op_origin->Proto()->attrs().size(),
op_copy->Proto()->attrs().size());
for (auto it = op_origin->Proto()->attrs().begin();
it != op_origin->Proto()->attrs().end(); ++it) {
for (auto it_2 = op_copy->Proto()->attrs().begin();
it_2 != op_copy->Proto()->attrs().end(); ++it_2) {
if (it->name() == it_2->name()) {
ASSERT_TRUE(it_2->SerializeAsString() == it->SerializeAsString());
}
}
}
if (op->Type() == "op_with_subblock") {
ASSERT_EQ(1, op->GetBlockAttrId("sub_block"));
......
......@@ -56,5 +56,76 @@ struct RWLock {
};
#endif
class RWLockGuard {
public:
enum Status { kUnLock, kWRLock, kRDLock };
RWLockGuard(RWLock* rw_lock, Status init_status)
: lock_(rw_lock), status_(Status::kUnLock) {
switch (init_status) {
case Status::kRDLock: {
RDLock();
break;
}
case Status::kWRLock: {
WRLock();
break;
}
case Status::kUnLock: {
break;
}
}
}
void WRLock() {
switch (status_) {
case Status::kUnLock: {
lock_->WRLock();
status_ = Status::kWRLock;
break;
}
case Status::kWRLock: {
break;
}
case Status::kRDLock: {
PADDLE_THROW(
"Please unlock read lock first before invoking write lock.");
break;
}
}
}
void RDLock() {
switch (status_) {
case Status::kUnLock: {
lock_->RDLock();
status_ = Status::kRDLock;
break;
}
case Status::kRDLock: {
break;
}
case Status::kWRLock: {
PADDLE_THROW(
"Please unlock write lock first before invoking read lock.");
break;
}
}
}
void UnLock() {
if (status_ != Status::kUnLock) {
lock_->UNLock();
status_ = Status::kUnLock;
}
}
~RWLockGuard() { UnLock(); }
private:
RWLock* lock_;
Status status_;
};
} // namespace framework
} // namespace paddle
......@@ -72,6 +72,12 @@ void Scope::DropKids() {
kids_.clear();
}
bool Scope::HasKid(const Scope* scope) const {
std::unique_lock<std::mutex> lock(mutex_);
auto it = std::find(this->kids_.begin(), this->kids_.end(), scope);
return it != this->kids_.end();
}
std::vector<std::string> Scope::LocalVarNames() const {
std::unique_lock<std::mutex> lock(mutex_);
std::vector<std::string> known_vars;
......
......@@ -71,6 +71,9 @@ class Scope {
/// Drop all kids scopes belonged to this scope.
void DropKids();
/// Find if a scope exists in the kid scopes
bool HasKid(const Scope* scope) const;
// enumerate all the variables current contains.
std::vector<std::string> LocalVarNames() const;
......
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/framework/version.h"
#include <algorithm>
namespace paddle {
namespace framework {
bool IsProgramVersionSupported(int64_t version) {
static int num_supported =
sizeof(kSupportedProgramVersion) / sizeof(kSupportedProgramVersion[0]);
return std::find(kSupportedProgramVersion,
kSupportedProgramVersion + num_supported,
version) != kSupportedProgramVersion + num_supported;
}
bool IsTensorVersionSupported(uint32_t version) {
static int num_supported =
sizeof(kSupportedTensorVersion) / sizeof(kSupportedTensorVersion[0]);
return std::find(kSupportedTensorVersion,
kSupportedTensorVersion + num_supported,
version) != kSupportedTensorVersion + num_supported;
}
} // namespace framework
} // namespace paddle
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <cstdint>
#pragma once
namespace paddle {
namespace framework {
// Note:
// Program and Tensor that pass the IsXXXVersionSupported should
// be supported by the current codes. Otherwise, it's a compatibility
// bug.
// The program version the current codes generate.
constexpr int64_t kCurProgramVersion = 0;
// The program version that was generated by previous or current codes
// and supported by current codes.
constexpr int64_t kSupportedProgramVersion[] = {0};
// Due to historical reasons, tensor version use uint32_t.
// The tensor version the current codes generate.
constexpr uint32_t kCurTensorVersion = 0;
// The tensor version that was generated by previous or current codes
// and supported by current codes.
constexpr uint32_t kSupportedTensorVersion[] = {0};
bool IsProgramVersionSupported(int64_t version);
bool IsTensorVersionSupported(uint32_t version);
} // namespace framework
} // namespace paddle
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/framework/version.h"
#include "gtest/gtest.h"
namespace paddle {
namespace framework {
TEST(Version, Basic) {
EXPECT_TRUE(IsProgramVersionSupported(0));
EXPECT_FALSE(IsProgramVersionSupported(1));
EXPECT_FALSE(IsProgramVersionSupported(-1));
EXPECT_TRUE(IsTensorVersionSupported(0));
EXPECT_FALSE(IsTensorVersionSupported(1));
EXPECT_FALSE(IsTensorVersionSupported(-1));
}
} // namespace framework
} // namespace paddle
......@@ -55,6 +55,7 @@ if(NOT APPLE)
endif()
if(WITH_TESTING)
# both tests/book and analysis depends the models that generated by python/paddle/fluid/tests/book
# tests/book depends the models that generated by python/paddle/fluid/tests/book
add_subdirectory(tests/book)
add_subdirectory(tests/api)
endif()
cc_library(ir_pass_manager SRCS ir_pass_manager.cc DEPS graph pass)
set(analysis_deps
framework_proto proto_desc ir_pass_manager graph pass paddle_fluid_api executor)
framework_proto proto_desc ir_pass_manager graph pass paddle_fluid_api executor pretty_log)
cc_library(analysis SRCS pass_manager.cc node.cc data_flow_graph.cc graph_traits.cc subgraph_splitter.cc
analyzer.cc
......@@ -40,27 +40,7 @@ function (inference_analysis_test TARGET)
endif(WITH_TESTING)
endfunction(inference_analysis_test)
function (inference_download_and_uncompress install_dir url gz_filename)
message(STATUS "Download inference test stuff ${gz_filename} from ${url}")
execute_process(COMMAND bash -c "mkdir -p ${install_dir}")
execute_process(COMMAND bash -c "cd ${install_dir} && wget -q ${url}")
execute_process(COMMAND bash -c "cd ${install_dir} && tar xzf ${gz_filename}")
message(STATUS "finish downloading ${gz_filename}")
endfunction(inference_download_and_uncompress)
set(RNN1_MODEL_URL "http://paddle-inference-dist.bj.bcebos.com/rnn1%2Fmodel.tar.gz")
set(RNN1_DATA_URL "http://paddle-inference-dist.bj.bcebos.com/rnn1%2Fdata.txt.tar.gz")
set(RNN1_INSTALL_DIR "${THIRD_PARTY_PATH}/inference_demo/rnn1" CACHE PATH "RNN1 model and data root." FORCE)
if (NOT EXISTS ${RNN1_INSTALL_DIR} AND WITH_TESTING)
inference_download_and_uncompress(${RNN1_INSTALL_DIR} ${RNN1_MODEL_URL} "rnn1%2Fmodel.tar.gz")
inference_download_and_uncompress(${RNN1_INSTALL_DIR} ${RNN1_DATA_URL} "rnn1%2Fdata.txt.tar.gz")
endif()
inference_analysis_test(test_analyzer SRCS analyzer_tester.cc
EXTRA_DEPS paddle_inference_api paddle_fluid_api ir_pass_manager analysis_predictor
ARGS --infer_model=${RNN1_INSTALL_DIR}/model
--infer_data=${RNN1_INSTALL_DIR}/data.txt)
inference_analysis_test(test_analyzer SRCS analyzer_tester.cc EXTRA_DEPS paddle_inference_api)
inference_analysis_test(test_data_flow_graph SRCS data_flow_graph_tester.cc)
inference_analysis_test(test_data_flow_graph_to_fluid_pass SRCS data_flow_graph_to_fluid_pass_tester.cc)
inference_analysis_test(test_fluid_to_ir_pass SRCS fluid_to_ir_pass_tester.cc)
......@@ -71,46 +51,3 @@ inference_analysis_test(test_tensorrt_subgraph_pass SRCS tensorrt_subgraph_pass_
inference_analysis_test(test_pass_manager SRCS pass_manager_tester.cc)
inference_analysis_test(test_tensorrt_subgraph_node_mark_pass SRCS tensorrt_subgraph_node_mark_pass_tester.cc)
inference_analysis_test(test_model_store_pass SRCS model_store_pass_tester.cc)
set(CHINESE_NER_MODEL_URL "http://paddle-inference-dist.bj.bcebos.com/chinese_ner_model.tar.gz")
set(CHINESE_NER_DATA_URL "http://paddle-inference-dist.bj.bcebos.com/chinese_ner-data.txt.tar.gz")
set(CHINESE_NER_INSTALL_DIR "${THIRD_PARTY_PATH}/inference_demo/chinese_ner" CACHE PATH "Chinese ner model and data root." FORCE)
if (NOT EXISTS ${CHINESE_NER_INSTALL_DIR} AND WITH_TESTING AND WITH_INFERENCE)
inference_download_and_uncompress(${CHINESE_NER_INSTALL_DIR} ${CHINESE_NER_MODEL_URL} "chinese_ner_model.tar.gz")
inference_download_and_uncompress(${CHINESE_NER_INSTALL_DIR} ${CHINESE_NER_DATA_URL} "chinese_ner-data.txt.tar.gz")
endif()
inference_analysis_test(test_analyzer_ner SRCS analyzer_ner_tester.cc
EXTRA_DEPS paddle_inference_api paddle_fluid_api analysis_predictor
ARGS --infer_model=${CHINESE_NER_INSTALL_DIR}/model
--infer_data=${CHINESE_NER_INSTALL_DIR}/data.txt)
set(LAC_MODEL_URL "http://paddle-inference-dist.bj.bcebos.com/lac_model.tar.gz")
set(LAC_DATA_URL "http://paddle-inference-dist.bj.bcebos.com/lac_data.txt.tar.gz")
set(LAC_INSTALL_DIR "${THIRD_PARTY_PATH}/inference_demo/lac" CACHE PATH "LAC model and data root." FORCE)
if (NOT EXISTS ${LAC_INSTALL_DIR} AND WITH_TESTING AND WITH_INFERENCE)
inference_download_and_uncompress(${LAC_INSTALL_DIR} ${LAC_MODEL_URL} "lac_model.tar.gz")
inference_download_and_uncompress(${LAC_INSTALL_DIR} ${LAC_DATA_URL} "lac_data.txt.tar.gz")
endif()
inference_analysis_test(test_analyzer_lac SRCS analyzer_lac_tester.cc
EXTRA_DEPS paddle_inference_api paddle_fluid_api ir_pass_manager analysis_predictor
ARGS --infer_model=${LAC_INSTALL_DIR}/model
--infer_data=${LAC_INSTALL_DIR}/data.txt)
set(TEXT_CLASSIFICATION_MODEL_URL "http://paddle-inference-dist.bj.bcebos.com/text-classification-Senta.tar.gz")
set(TEXT_CLASSIFICATION_DATA_URL "http://paddle-inference-dist.bj.bcebos.com/text_classification_data.txt.tar.gz")
set(TEXT_CLASSIFICATION_INSTALL_DIR "${THIRD_PARTY_PATH}/inference_demo/text_classification" CACHE PATH "Text Classification model and data root." FORCE)
if (NOT EXISTS ${TEXT_CLASSIFICATION_INSTALL_DIR} AND WITH_TESTING AND WITH_INFERENCE)
inference_download_and_uncompress(${TEXT_CLASSIFICATION_INSTALL_DIR} ${TEXT_CLASSIFICATION_MODEL_URL} "text-classification-Senta.tar.gz")
inference_download_and_uncompress(${TEXT_CLASSIFICATION_INSTALL_DIR} ${TEXT_CLASSIFICATION_DATA_URL} "text_classification_data.txt.tar.gz")
endif()
inference_analysis_test(test_text_classification SRCS analyzer_text_classification_tester.cc
EXTRA_DEPS paddle_inference_api paddle_fluid_api analysis_predictor
ARGS --infer_model=${TEXT_CLASSIFICATION_INSTALL_DIR}/text-classification-Senta
--infer_data=${TEXT_CLASSIFICATION_INSTALL_DIR}/data.txt
--topn=1 # Just run top 1 batch.
)
......@@ -72,6 +72,9 @@ class Analyzer : public OrderedRegistry<PassManager> {
"mul_gru_fuse_pass", //
"seq_concat_fc_fuse_pass", //
"fc_fuse_pass", //
#ifdef PADDLE_WITH_MKLDNN
"conv_relu_mkldnn_fuse_pass", //
#endif
}};
std::unordered_set<std::string> disabled_ir_passes_;
......
......@@ -16,21 +16,9 @@
#include <google/protobuf/text_format.h>
#include <gtest/gtest.h>
#include <thread> // NOLINT
#include "paddle/fluid/framework/ir/fuse_pass_base.h"
#include "paddle/fluid/framework/ir/pass.h"
#include "paddle/fluid/inference/analysis/ut_helper.h"
#include "paddle/fluid/inference/api/analysis_predictor.h"
#include "paddle/fluid/inference/api/helper.h"
#include "paddle/fluid/inference/api/paddle_inference_api.h"
#include "paddle/fluid/inference/api/paddle_inference_pass.h"
#include "paddle/fluid/inference/utils/singleton.h"
DEFINE_string(infer_model, "", "model path");
DEFINE_string(infer_data, "", "data path");
DEFINE_int32(batch_size, 10, "batch size.");
DEFINE_int32(repeat, 1, "Running the inference program repeat times.");
DEFINE_int32(num_threads, 1, "Running the inference program in multi-threads.");
namespace paddle {
namespace inference {
......@@ -91,274 +79,8 @@ void TestWord2vecPrediction(const std::string &model_path) {
}
}
namespace {
struct DataRecord {
std::vector<std::vector<std::vector<float>>> link_step_data_all;
std::vector<std::vector<float>> week_data_all, minute_data_all;
std::vector<size_t> lod1, lod2, lod3;
std::vector<std::vector<float>> rnn_link_data, rnn_week_datas,
rnn_minute_datas;
size_t batch_iter{0};
size_t batch_size{1};
DataRecord() = default;
explicit DataRecord(const std::string &path, int batch_size = 1)
: batch_size(batch_size) {
Load(path);
}
DataRecord NextBatch() {
DataRecord data;
size_t batch_end = batch_iter + batch_size;
// NOTE skip the final batch, if no enough data is provided.
if (batch_end <= link_step_data_all.size()) {
data.link_step_data_all.assign(link_step_data_all.begin() + batch_iter,
link_step_data_all.begin() + batch_end);
data.week_data_all.assign(week_data_all.begin() + batch_iter,
week_data_all.begin() + batch_end);
data.minute_data_all.assign(minute_data_all.begin() + batch_iter,
minute_data_all.begin() + batch_end);
// Prepare LoDs
data.lod1.push_back(0);
data.lod2.push_back(0);
data.lod3.push_back(0);
CHECK(!data.link_step_data_all.empty()) << "empty";
CHECK(!data.week_data_all.empty());
CHECK(!data.minute_data_all.empty());
CHECK_EQ(data.link_step_data_all.size(), data.week_data_all.size());
CHECK_EQ(data.minute_data_all.size(), data.link_step_data_all.size());
for (size_t j = 0; j < data.link_step_data_all.size(); j++) {
for (const auto &d : data.link_step_data_all[j]) {
data.rnn_link_data.push_back(d);
}
data.rnn_week_datas.push_back(data.week_data_all[j]);
data.rnn_minute_datas.push_back(data.minute_data_all[j]);
// calculate lod
data.lod1.push_back(data.lod1.back() +
data.link_step_data_all[j].size());
data.lod3.push_back(data.lod3.back() + 1);
for (size_t i = 1; i < data.link_step_data_all[j].size() + 1; i++) {
data.lod2.push_back(data.lod2.back() +
data.link_step_data_all[j].size());
}
}
}
batch_iter += batch_size;
return data;
}
void Load(const std::string &path) {
std::ifstream file(path);
std::string line;
int num_lines = 0;
while (std::getline(file, line)) {
num_lines++;
std::vector<std::string> data;
split(line, ':', &data);
std::vector<std::vector<float>> link_step_data;
std::vector<std::string> link_datas;
split(data[0], '|', &link_datas);
for (auto &step_data : link_datas) {
std::vector<float> tmp;
split_to_float(step_data, ',', &tmp);
link_step_data.push_back(tmp);
}
// load week data
std::vector<float> week_data;
split_to_float(data[2], ',', &week_data);
// load minute data
std::vector<float> minute_data;
split_to_float(data[1], ',', &minute_data);
link_step_data_all.push_back(std::move(link_step_data));
week_data_all.push_back(std::move(week_data));
minute_data_all.push_back(std::move(minute_data));
}
}
};
void PrepareInputs(std::vector<PaddleTensor> *input_slots, DataRecord *data,
int batch_size) {
PaddleTensor lod_attention_tensor, init_zero_tensor, lod_tensor_tensor,
week_tensor, minute_tensor;
lod_attention_tensor.name = "data_lod_attention";
init_zero_tensor.name = "cell_init";
lod_tensor_tensor.name = "data";
week_tensor.name = "week";
minute_tensor.name = "minute";
auto one_batch = data->NextBatch();
std::vector<int> rnn_link_data_shape(
{static_cast<int>(one_batch.rnn_link_data.size()),
static_cast<int>(one_batch.rnn_link_data.front().size())});
lod_attention_tensor.shape.assign({1, 2});
lod_attention_tensor.lod.assign({one_batch.lod1, one_batch.lod2});
init_zero_tensor.shape.assign({batch_size, 15});
init_zero_tensor.lod.assign({one_batch.lod3});
lod_tensor_tensor.shape = rnn_link_data_shape;
lod_tensor_tensor.lod.assign({one_batch.lod1});
// clang-format off
week_tensor.shape.assign(
{static_cast<int>(one_batch.rnn_week_datas.size()),
static_cast<int>(one_batch.rnn_week_datas.front().size())});
week_tensor.lod.assign({one_batch.lod3});
minute_tensor.shape.assign(
{static_cast<int>(one_batch.rnn_minute_datas.size()),
static_cast<int>(one_batch.rnn_minute_datas.front().size())});
minute_tensor.lod.assign({one_batch.lod3});
// clang-format on
// assign data
TensorAssignData<float>(&lod_attention_tensor,
std::vector<std::vector<float>>({{0, 0}}));
std::vector<float> tmp_zeros(batch_size * 15, 0.);
TensorAssignData<float>(&init_zero_tensor, {tmp_zeros});
TensorAssignData<float>(&lod_tensor_tensor, one_batch.rnn_link_data);
TensorAssignData<float>(&week_tensor, one_batch.rnn_week_datas);
TensorAssignData<float>(&minute_tensor, one_batch.rnn_minute_datas);
// Set inputs.
auto init_zero_tensor1 = init_zero_tensor;
init_zero_tensor1.name = "hidden_init";
input_slots->assign({week_tensor, init_zero_tensor, minute_tensor,
init_zero_tensor1, lod_attention_tensor,
lod_tensor_tensor});
for (auto &tensor : *input_slots) {
tensor.dtype = PaddleDType::FLOAT32;
}
}
} // namespace
void CompareResult(const std::vector<PaddleTensor> &outputs,
const std::vector<PaddleTensor> &base_outputs) {
PADDLE_ENFORCE_GT(outputs.size(), 0);
PADDLE_ENFORCE_EQ(outputs.size(), base_outputs.size());
for (size_t i = 0; i < outputs.size(); i++) {
auto &out = outputs[i];
auto &base_out = base_outputs[i];
size_t size = std::accumulate(out.shape.begin(), out.shape.end(), 1,
[](int a, int b) { return a * b; });
size_t size1 = std::accumulate(base_out.shape.begin(), base_out.shape.end(),
1, [](int a, int b) { return a * b; });
PADDLE_ENFORCE_EQ(size, size1);
PADDLE_ENFORCE_GT(size, 0);
float *data = static_cast<float *>(out.data.data());
float *base_data = static_cast<float *>(base_out.data.data());
for (size_t i = 0; i < size; i++) {
EXPECT_NEAR(data[i], base_data[i], 1e-3);
}
}
}
// Test with a really complicate model.
void TestRNN1Prediction(bool use_analysis, bool activate_ir, int num_threads) {
AnalysisConfig config;
config.prog_file = FLAGS_infer_model + "/__model__";
config.param_file = FLAGS_infer_model + "/param";
config.use_gpu = false;
config.device = 0;
config.specify_input_name = true;
config.enable_ir_optim = activate_ir;
PADDLE_ENFORCE(config.ir_mode ==
AnalysisConfig::IrPassMode::kExclude); // default
config.ir_passes.clear(); // Do not exclude any pass.
int batch_size = FLAGS_batch_size;
int num_times = FLAGS_repeat;
auto base_predictor =
CreatePaddlePredictor<NativeConfig, PaddleEngineKind::kNative>(config);
auto predictor =
CreatePaddlePredictor<AnalysisConfig, PaddleEngineKind::kAnalysis>(
config);
std::vector<PaddleTensor> input_slots;
DataRecord data(FLAGS_infer_data, batch_size);
// Prepare inputs.
PrepareInputs(&input_slots, &data, batch_size);
std::vector<PaddleTensor> outputs, base_outputs;
base_predictor->Run(input_slots, &base_outputs);
if (num_threads == 1) {
// Prepare inputs.
Timer timer;
timer.tic();
for (int i = 0; i < num_times; i++) {
predictor->Run(input_slots, &outputs);
}
PrintTime(batch_size, num_times, 1, 0, timer.toc() / num_times);
CompareResult(outputs, base_outputs);
} else {
std::vector<std::thread> threads;
std::vector<std::unique_ptr<PaddlePredictor>> predictors;
// TODO(yanchunwei): Bug here, the analyzer phase can't be parallelled
// because AttentionLSTM's hard code nodeid will be damanged.
for (int tid = 0; tid < num_threads; ++tid) {
predictors.emplace_back(
CreatePaddlePredictor<AnalysisConfig, PaddleEngineKind::kAnalysis>(
config));
}
for (int tid = 0; tid < num_threads; ++tid) {
threads.emplace_back([&, tid]() {
// Each thread should have local input_slots and outputs.
std::vector<PaddleTensor> input_slots;
DataRecord data(FLAGS_infer_data, batch_size);
PrepareInputs(&input_slots, &data, batch_size);
std::vector<PaddleTensor> outputs;
Timer timer;
timer.tic();
for (int i = 0; i < num_times; i++) {
predictors[tid]->Run(input_slots, &outputs);
}
PrintTime(batch_size, num_times, num_threads, tid,
timer.toc() / num_times);
CompareResult(outputs, base_outputs);
});
}
for (int i = 0; i < num_threads; ++i) {
threads[i].join();
}
}
if (use_analysis && activate_ir) {
AnalysisPredictor *analysis_predictor =
dynamic_cast<AnalysisPredictor *>(predictor.get());
auto &fuse_statis = analysis_predictor->analysis_argument()
.Get<std::unordered_map<std::string, int>>(
framework::ir::kFuseStatisAttr);
for (auto &item : fuse_statis) {
LOG(INFO) << "fused " << item.first << " " << item.second;
}
int num_ops = 0;
for (auto &node :
analysis_predictor->analysis_argument().main_dfg->nodes.nodes()) {
if (node->IsFunction()) {
++num_ops;
}
}
LOG(INFO) << "has num ops: " << num_ops;
ASSERT_TRUE(fuse_statis.count("fc_fuse"));
EXPECT_EQ(fuse_statis.at("fc_fuse"), 1);
EXPECT_EQ(fuse_statis.at("fc_nobias_lstm_fuse"), 2); // bi-directional LSTM
EXPECT_EQ(fuse_statis.at("seq_concat_fc_fuse"), 1);
EXPECT_EQ(num_ops,
13); // After graph optimization, only 13 operators exists.
}
}
// Inference with analysis and IR, easy for profiling independently.
TEST(Analyzer, rnn1) { TestRNN1Prediction(true, true, FLAGS_num_threads); }
// Other unit-tests of RNN1, test different options of use_analysis,
// activate_ir and multi-threads.
TEST(Analyzer, RNN_tests) {
int num_threads[2] = {1, 4};
for (auto i : num_threads) {
// Directly infer with the original model.
TestRNN1Prediction(false, false, i);
// Inference with the original model with the analysis turned on, the
// analysis
// module will transform the program to a data flow graph.
TestRNN1Prediction(true, false, i);
// Inference with analysis and IR. The IR module will fuse some large
// kernels.
TestRNN1Prediction(true, true, i);
}
TEST(Analyzer, word2vec_without_analysis) {
TestWord2vecPrediction(FLAGS_inference_model_dir);
}
} // namespace analysis
......
......@@ -440,6 +440,7 @@ ExtractInputAndOutputOfSubGraph(std::vector<Node *> &graph) { // NOLINT
}
return false;
};
for (auto &node : graph) {
for (auto *in : node->inlinks) {
// The Value that is written by nodes inside a sub-graph shouldn't be the
......@@ -459,6 +460,7 @@ ExtractInputAndOutputOfSubGraph(std::vector<Node *> &graph) { // NOLINT
std::vector<Node *>(outputs.begin(), outputs.end()));
}
// Filter the Intermediate results of the subgraph node.
void FilterRedundantOutputOfSubGraph(DataFlowGraph *graph) {
std::vector<Node *> op_nodes;
for (auto &node : GraphTraits<DataFlowGraph>(*graph).nodes_in_TS()) {
......@@ -480,9 +482,11 @@ void FilterRedundantOutputOfSubGraph(DataFlowGraph *graph) {
for (auto *out : op_nodes[i]->outlinks) {
if (follow_up_input_names.count(out->name())) {
filtered_subgraph_outlinks.push_back(out);
} else {
out->SetDeleted();
}
}
PADDLE_ENFORCE_GE(filtered_subgraph_outlinks.size(), 1UL);
// The filtered_subgraph_outlinks may be empty.
op_nodes[i]->outlinks = filtered_subgraph_outlinks;
}
}
......
......@@ -106,20 +106,23 @@ void CreateTrtEngineOp(Node *node, const DataFlowGraph &graph,
// collect inputs
std::unordered_set<std::string> input_names;
std::unordered_set<std::string> input_names_with_id;
for (auto *x : func->inlinks) {
input_names.insert(x->name());
input_names_with_id.insert(x->name() + std::to_string(x->id()));
}
desc.SetInput(
"Xs", std::vector<std::string>(input_names.begin(), input_names.end()));
std::unordered_set<std::string> output_names;
std::unordered_set<std::string> output_names_with_id;
for (auto *x : func->outlinks) {
output_names.insert(x->name());
output_names_with_id.insert(x->name() + std::to_string(x->id()));
}
std::vector<std::string> output_temp(output_names.begin(),
output_names.end());
desc.SetOutput("Ys", output_temp);
desc.SetOutput(
"Ys", std::vector<std::string>(output_names.begin(), output_names.end()));
desc.SetType("tensorrt_engine");
std::unordered_map<std::string, std::string> output_name_map;
......@@ -153,11 +156,12 @@ void CreateTrtEngineOp(Node *node, const DataFlowGraph &graph,
std::vector<std::string> replaced_names;
for (int k = 0; k < in_var->arguments_size(); k++) {
std::string arg_value = in_var->arguments(k);
if (input_names.count(arg_value)) {
std::string arg_value_with_id =
arg_value + std::to_string(var2id[arg_value]);
if (input_names_with_id.count(arg_value_with_id)) {
replaced_names.push_back(arg_value);
} else {
replaced_names.push_back(arg_value +
std::to_string(var2id[arg_value]));
replaced_names.push_back(arg_value_with_id);
}
}
in_var->clear_arguments();
......@@ -176,11 +180,12 @@ void CreateTrtEngineOp(Node *node, const DataFlowGraph &graph,
std::vector<std::string> replaced_names;
for (int k = 0; k < out_var->arguments_size(); k++) {
std::string arg_value = out_var->arguments(k);
if (output_names.count(arg_value)) {
output_name_map[arg_value] =
arg_value + std::to_string(var2id[arg_value]);
std::string arg_value_with_id =
arg_value + std::to_string(var2id[arg_value]);
if (output_names_with_id.count(arg_value_with_id)) {
output_name_map[arg_value] = arg_value_with_id;
}
replaced_names.push_back(arg_value + std::to_string(var2id[arg_value]));
replaced_names.push_back(arg_value_with_id);
}
out_var->clear_arguments();
for (size_t k = 0; k < replaced_names.size(); k++) {
......
......@@ -14,13 +14,18 @@
#include "paddle/fluid/inference/analysis/ir_pass_manager.h"
#include <string>
#include <vector>
#include "paddle/fluid/framework/ir/fuse_pass_base.h"
#include "paddle/fluid/framework/ir/graph.h"
#include "paddle/fluid/framework/scope.h"
#include "paddle/fluid/string/pretty_log.h"
namespace paddle {
namespace inference {
namespace analysis {
using string::PrettyLogEndl;
using string::PrettyLog;
using string::Style;
IRPassManager::IRPassManager(const ProgramDesc &program,
framework::Scope *scope)
......@@ -33,13 +38,16 @@ IRPassManager::IRPassManager(const ProgramDesc &program,
void IRPassManager::Apply(const std::vector<std::string> &passes) {
// Apply all the passes
std::string pre_pass;
int pass_num = 0;
for (const std::string &pass_name : passes) {
LOG(WARNING) << "Running IR pass [" << pass_name << "]";
PrettyLogEndl(Style::H2(), "--- Running IR pass [%s]", pass_name);
auto pass = framework::ir::PassRegistry::Instance().Get(pass_name);
if (pass_name == "graph_viz_pass") {
std::string dot_file_path =
"ir_" + (pre_pass.empty() ? "origin" : pre_pass) + ".dot";
std::string dot_file_path = std::to_string(pass_num) + "_ir_" +
(pre_pass.empty() ? "origin" : pre_pass) +
".dot";
pass->Set("graph_viz_path", new std::string(std::move(dot_file_path)));
pass_num++;
}
graph_ = pass->Apply(std::move(graph_));
pre_pass = pass_name;
......
......@@ -14,6 +14,7 @@ limitations under the License. */
#include "paddle/fluid/inference/analysis/pass_manager.h"
#include "paddle/fluid/inference/analysis/fluid_to_data_flow_graph_pass.h"
#include "paddle/fluid/string/pretty_log.h"
namespace paddle {
namespace inference {
......@@ -22,7 +23,7 @@ namespace analysis {
bool PassManager::Initialize(Argument* argument) {
argument_ = argument;
for (auto& pass : data_) {
LOG(WARNING) << "Initializing pass [" << pass->repr() << "]";
VLOG(3) << "Initializing pass [" << pass->repr() << "]";
if (!pass->Initialize(argument)) {
LOG(ERROR) << "Failed to initialize pass [" << pass->repr() << "]";
return false;
......@@ -33,9 +34,10 @@ bool PassManager::Initialize(Argument* argument) {
void DfgPassManager::RunAll() {
PADDLE_ENFORCE(argument_);
LOG(INFO) << "Total " << data_.size() << " Analysys passes";
VLOG(3) << "Total " << data_.size() << " Analysys passes";
for (auto& pass : data_) {
LOG(WARNING) << "Running Analysis pass [" << pass->repr() << "]";
string::PrettyLogEndl(string::Style::H1(), "* Running Analysis pass [%s]",
pass->repr());
pass->Run(argument_->main_dfg.get());
}
}
......
......@@ -74,13 +74,141 @@ void UnionFindCombine(const node_map_t &node_map, size_t a, size_t b) {
node_map.at(b)->attr(kUnionFindParent).Int32() = a_ancestor;
}
// This is a simple representation of a graph.
// The BriefNode hold the pointer of the Node.
// This is to avoid changing the original graph
// in the process of trt graph analysis.
struct BriefNode {
explicit BriefNode(Node *n) { node = n; }
Node *node;
std::vector<BriefNode *> inlinks;
std::vector<BriefNode *> outlinks;
};
// Union two adjacent BriefNode.
// Suppose we have two adjacent nodes src and dst.
// We will perform the following operations:
// 1. add all inputs(except src) of dst to src inlinks.
// 2. add all outputs of dst to src outlinks.
// 3. change all the dst's inputs and outputs
// corresponding inlinks and outlinks to src node.
// 4. delete all dst's inlinks and outlinks.
void UnionContractedNodes(const std::unordered_map<int, BriefNode *> &node_map,
int src_id, int dst_id) {
// merge the two adjacent nodes into one node.
BriefNode *src_node = node_map.at(src_id);
BriefNode *dst_node = node_map.at(dst_id);
std::unordered_set<BriefNode *> inputs(src_node->inlinks.begin(),
src_node->inlinks.end());
std::unordered_set<BriefNode *> outputs;
for (auto *n : src_node->outlinks) {
if (n != dst_node) outputs.insert(n);
}
// Add the inlinks and outlinks of dst node to src node.
std::vector<BriefNode *> dst_in_nodes = dst_node->inlinks;
for (BriefNode *node : dst_in_nodes) {
if (node != src_node) {
inputs.insert(node);
}
}
std::vector<BriefNode *> dst_out_nodes = dst_node->outlinks;
for (BriefNode *node : dst_out_nodes) {
outputs.insert(node);
}
// update the dst and src node's inlinks and outlinks.
#ifdef __clang__
src_node->inlinks = std::vector<BriefNode *>(inputs.begin(), inputs.end());
src_node->outlinks = std::vector<BriefNode *>(outputs.begin(), outputs.end());
dst_node->inlinks.clear();
dst_node->outlinks.clear();
#else
src_node->inlinks =
std::move(std::vector<BriefNode *>(inputs.begin(), inputs.end()));
src_node->outlinks =
std::move(std::vector<BriefNode *>(outputs.begin(), outputs.end()));
dst_node->inlinks.clear();
dst_node->outlinks.clear();
#endif
auto inlink_or_outlink_cleaner = [&](std::vector<BriefNode *> &nodes) {
for (auto *&n : nodes) {
if (n == src_node || n == dst_node) {
n = src_node;
}
}
};
// Change all the dst inputs and outputs corresponding inlink and
// outlink to the src node.
for (auto *node : src_node->inlinks) {
inlink_or_outlink_cleaner(node->outlinks);
}
for (auto *node : src_node->outlinks) {
inlink_or_outlink_cleaner(node->inlinks);
}
}
// FlexibleDFS
// If reverse is true, do reverse dfs.
// If enter func is not nullptr, calls enter(node) before visiting any children
// of node.
// If leave func not nullptr, calls leave(node) after visiting all parents of
// node.
void FlexibleDFS(const std::vector<BriefNode *> &source, bool reverse,
const std::function<bool(const BriefNode *)> &enter,
const std::function<bool(const BriefNode *)> &leave) {
typedef struct {
const BriefNode *node;
bool leave;
} FNode;
std::vector<FNode> stack;
for (auto &node : source) {
stack.push_back(FNode{node, false});
}
std::unordered_set<const BriefNode *> visited;
while (!stack.empty()) {
auto fnode = stack.back();
stack.pop_back();
if (fnode.leave) {
if (leave && !leave(fnode.node)) return;
}
if (visited.count(fnode.node)) continue;
visited.insert(fnode.node);
if (enter && !enter(fnode.node)) return;
if (leave) stack.push_back(FNode{fnode.node, true});
const std::vector<BriefNode *> iter_nodes =
reverse == true ? fnode.node->inlinks : fnode.node->outlinks;
for (const BriefNode *node : iter_nodes) {
if (!visited.count(node)) {
stack.push_back(FNode{node, false});
}
}
}
}
std::vector<std::vector<Node *>> SubGraphSplitter::ExtractSubGraphs() {
// Run the Extract algorithm to find all subgraphs.
std::vector<Node *> marked_nodes;
// We use brief_node_map to represent the original graph in order to avoid
// changing the original graph.
std::unordered_map<int, BriefNode *> brief_node_map;
for (auto &node : GraphTraits<DataFlowGraph>(*graph_).nodes_in_TS()) {
brief_node_map[node.id()] = new BriefNode(&node);
if (node.attr(kMarkerAttrName).Bool()) {
marked_nodes.push_back(&node);
}
}
// extract sub-graphs in the marked node set, use Union Find algorithm.
node_map_t node_map; // id to ptr
for (auto *n : marked_nodes) {
......@@ -88,11 +216,73 @@ std::vector<std::vector<Node *>> SubGraphSplitter::ExtractSubGraphs() {
n->attr(kUnionFindParent).Int32() = n->id();
node_map[n->id()] = n;
}
std::unordered_set<Node *> visited;
for (auto *n : marked_nodes) {
for (auto *out : n->outlinks) {
if (node_map.count(out->id())) {
UnionFindCombine(node_map, n->id(), out->id());
// create breif node map
for (auto &itr : brief_node_map) {
for (Node *node : itr.second->node->inlinks) {
itr.second->inlinks.push_back(brief_node_map[node->id()]);
}
for (Node *node : itr.second->node->outlinks) {
itr.second->outlinks.push_back(brief_node_map[node->id()]);
}
}
for (auto &itr : brief_node_map) {
BriefNode *brief_node = itr.second;
if (!brief_node->node->attr(kMarkerAttrName).Bool()) {
VLOG(4) << brief_node->node->id() << " node not a trt candicate.";
continue;
}
// Our algorithm must guarantee that:
// 1. The graph is always directed acyclic graph(DAG).
// 2. If there is a path in the subgraph from X to Y (X and Y are both
// nodes in the subgraph), then all paths from X to Y are in the
// subgraph.
//
// In order to achieve the above guarantee.
// For adjacent nodes src -> dst.
// 1. Get all dst input nodes except src.
// 2. Reverse DFS from those input nodes
// 3. If there is a path from input nodes to src,
// then the src and dst nodes can not be fused into one node,
// otherwise it can be done.
while (true) {
std::unordered_set<BriefNode *> contract_nodes;
for (auto *out : brief_node->outlinks) {
// must be an trt candidate
if (!out->node->attr(kMarkerAttrName).Bool()) continue;
// get all dst input nodes except src.
std::vector<BriefNode *> source_nodes;
for (auto *n : out->inlinks) {
if (n != brief_node) {
source_nodes.push_back(n);
}
}
// Reverse DFS from the source_nodes.
bool have_excess_path = false;
FlexibleDFS(source_nodes, true, nullptr,
[&have_excess_path, brief_node](const BriefNode *n) {
if (n == brief_node) {
have_excess_path = true;
return false;
}
return true;
});
if (have_excess_path) continue;
contract_nodes.insert(out);
}
if (contract_nodes.empty()) break;
for (auto dst_node : contract_nodes) {
UnionFindCombine(node_map, brief_node->node->id(),
dst_node->node->id());
UnionContractedNodes(brief_node_map, brief_node->node->id(),
dst_node->node->id());
}
}
}
......@@ -128,6 +318,7 @@ void SubGraphFuse::ReplaceNodesWithSubGraphs() {
auto io = ExtractInputAndOutputOfSubGraph(subgraph);
block_node->inlinks = std::move(io.first);
block_node->outlinks = std::move(io.second);
for (auto *node : subgraph) {
// TODO(Superjomn) need a unified mechanism to treat deleted node in each
// pass.
......
......@@ -82,7 +82,7 @@ TEST(SubGraphSplitter, Fuse) {
// At least one nodes should be deleted.
ASSERT_EQ(dfg.nodes.size(), count0 + 1); // added a new FunctionBlock
ASSERT_EQ(6, count1);
ASSERT_EQ(11, count1);
}
} // namespace analysis
......
......@@ -77,6 +77,9 @@ bool AnalysisPredictor::Init(
OptimizeInferenceProgram();
ctx_ = executor_->Prepare(*inference_program_, 0);
if (config_._use_mkldnn) {
executor_->EnableMKLDNN(*inference_program_);
}
VLOG(5) << "to create variables";
PADDLE_ENFORCE(scope_.get());
......
......@@ -106,6 +106,9 @@ bool NativePaddlePredictor::Init(
}
ctx_ = executor_->Prepare(*inference_program_, 0);
if (config_._use_mkldnn) {
executor_->EnableMKLDNN(*inference_program_);
}
executor_->CreateVariables(*inference_program_,
sub_scope_ ? sub_scope_ : scope_.get(), 0);
......@@ -262,7 +265,7 @@ void NativePaddlePredictor::GetFetchOne(const framework::LoDTensor &fetch,
if (buffer.empty() || buffer.length() < sizeof(T) * data.size()) {
buffer.Resize(sizeof(T) * data.size());
}
std::memcpy(buffer.data(), data.data(), buffer.length());
std::memcpy(buffer.data(), data.data(), sizeof(T) * data.size());
// copy LoD
for (const auto &level : fetch.lod()) {
output->lod.emplace_back(level);
......
......@@ -123,10 +123,16 @@ std::string DescribeTensor(const PaddleTensor &tensor) {
}
void PrintTime(int batch_size, int repeat, int num_threads, int tid,
double latency) {
double latency, int epoch = 1) {
LOG(INFO) << "====== batch_size: " << batch_size << ", repeat: " << repeat
<< ", threads: " << num_threads << ", thread id: " << tid
<< ", latency: " << latency << "ms ======";
if (epoch > 1) {
int samples = batch_size * epoch;
LOG(INFO) << "====== sample number: " << samples
<< ", average latency of each sample: " << latency / samples
<< "ms ======";
}
}
} // namespace inference
......
......@@ -45,7 +45,7 @@ class PaddleBuf {
PaddleBuf(void* data, size_t length)
: data_(data), length_(length), memory_owned_{false} {}
// Own memory.
PaddleBuf(size_t length)
explicit PaddleBuf(size_t length)
: data_(new char[length]), length_(length), memory_owned_(true) {}
// Resize to `length` bytes.
void Resize(size_t length);
......@@ -121,6 +121,8 @@ struct NativeConfig : public PaddlePredictor::Config {
bool use_gpu{false};
int device{0};
float fraction_of_gpu_memory{-1.f}; // Negative to notify initialization.
// NOTE: NOT use it, just for the internal test, will discard later
bool _use_mkldnn{false};
// Specify the variable's name of each input.
bool specify_input_name{false};
......
......@@ -20,6 +20,7 @@ limitations under the License. */
#include "paddle/fluid/framework/block_desc.h"
#include "paddle/fluid/framework/feed_fetch_type.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/version.h"
#include "paddle/fluid/platform/cpu_helper.h"
#include "paddle/fluid/pybind/pybind.h"
......@@ -124,6 +125,9 @@ std::unique_ptr<framework::ProgramDesc> Load(framework::Executor* executor,
std::unique_ptr<framework::ProgramDesc> main_program(
new framework::ProgramDesc(program_desc_str));
PADDLE_ENFORCE(framework::IsProgramVersionSupported(main_program->Version()),
"model version %ld is not supported.",
main_program->Version());
LoadPersistables(executor, scope, *main_program, dirname, "");
return main_program;
......@@ -138,6 +142,9 @@ std::unique_ptr<framework::ProgramDesc> Load(
std::unique_ptr<framework::ProgramDesc> main_program(
new framework::ProgramDesc(program_desc_str));
PADDLE_ENFORCE(framework::IsProgramVersionSupported(main_program->Version()),
"model version %ld is not supported.",
main_program->Version());
LoadPersistables(executor, scope, *main_program, "", param_filename);
return main_program;
......
......@@ -35,6 +35,8 @@ class ReluOpConverter : public OpConverter {
engine_, Activation, *const_cast<nvinfer1::ITensor*>(input_tensor),
nvinfer1::ActivationType::kRELU);
auto output_name = op_desc.Output("Out")[0];
layer->setName(("relu (Output: " + output_name + ")").c_str());
layer->getOutput(0)->setName(output_name.c_str());
engine_->SetITensor(output_name, layer->getOutput(0));
if (test_mode) { // the test framework can not determine which is the
// output, so place the declaration inside.
......
......@@ -116,6 +116,8 @@ class BatchNormOpConverter : public OpConverter {
scale_weights.get(), power_weights.get());
auto output_name = op_desc.Output("Y").front();
layer->setName(("batch_norm (Output: " + output_name + ")").c_str());
layer->getOutput(0)->setName(output_name.c_str());
engine_->weight_map[op_desc.Input("Bias").front()] =
std::move(combile_bias_tensor);
engine_->weight_map[op_desc.Input("Scale").front()] =
......
......@@ -42,6 +42,8 @@ class ConcatOpConverter : public OpConverter {
axis = axis - 1; // Remove batch dim
layer->setAxis(axis);
auto output_name = op_desc.Output("Out")[0];
layer->setName(("concat (Output: " + output_name + ")").c_str());
layer->getOutput(0)->setName(output_name.c_str());
engine_->SetITensor(output_name, layer->getOutput(0));
if (test_mode) { // the test framework can not determine which is the
// output, so place the declaration inside.
......
......@@ -78,8 +78,10 @@ class Conv2dOpConverter : public OpConverter {
layer->setNbGroups(groups);
auto output_name = op_desc.Output("Output").front();
layer->setName(("conv2d (Output: " + output_name + ")").c_str());
engine_->weight_map[op_desc.Input("Filter").front()] =
std::move(weight_tensor);
layer->getOutput(0)->setName(output_name.c_str());
engine_->SetITensor(output_name, layer->getOutput(0));
if (test_mode) {
engine_->DeclareOutput(output_name);
......
......@@ -89,6 +89,8 @@ class ElementwiseWeightOpConverter : public OpConverter {
shift_weights.get(), scale_weights.get(), power_weights.get());
auto output_name = op_desc.Output("Out")[0];
layer->setName(("elementwise_add (Output: " + output_name + ")").c_str());
layer->getOutput(0)->setName(output_name.c_str());
engine_->weight_map[op_desc.Input("Y").front()] = std::move(weight_tensor);
engine_->SetITensor(output_name, layer->getOutput(0));
if (test_mode) { // the test framework can not determine which is the
......@@ -137,6 +139,8 @@ class ElementwiseTensorOpConverter : public OpConverter {
*const_cast<nvinfer1::ITensor*>(Y), op_pair->second);
auto output_name = op_desc.Output("Out")[0];
layer->setName(("elementwise (Output: " + output_name + ")").c_str());
layer->getOutput(0)->setName(output_name.c_str());
engine_->SetITensor(output_name, layer->getOutput(0));
if (test_mode) { // the test framework can not determine which is the
// output, so place the declaration inside.
......
......@@ -107,6 +107,8 @@ class FcOpConverter : public OpConverter {
n_output, tmp_weight.get(), bias.get());
auto output_name = op_desc.Output("Out").front();
layer->setName(("fc (Output: " + output_name + ")").c_str());
layer->getOutput(0)->setName(output_name.c_str());
engine_->SetITensor(output_name, layer->getOutput(0));
engine_->weight_map[op_desc.Input("Y").front()] = std::move(tmp);
if (test_mode) {
......
......@@ -72,6 +72,8 @@ class Pool2dOpConverter : public OpConverter {
layer->setPadding(nv_paddings);
auto output_name = op_desc.Output("Out")[0];
layer->setName(("pool2d (Output: " + output_name + ")").c_str());
layer->getOutput(0)->setName(output_name.c_str());
engine_->SetITensor(output_name, layer->getOutput(0));
if (test_mode) {
engine_->DeclareOutput(output_name);
......
set(INFERENCE_URL "http://paddle-inference-dist.bj.bcebos.com")
set(INFERENCE_DEMO_INSTALL_DIR "${THIRD_PARTY_PATH}/inference_demo")
set(INFERENCE_EXTRA_DEPS paddle_inference_api paddle_fluid_api ir_pass_manager analysis_predictor)
function (inference_download_and_uncompress install_dir filename)
message(STATUS "Download inference test stuff from ${INFERENCE_URL}/${filename}")
execute_process(COMMAND bash -c "mkdir -p ${install_dir}")
execute_process(COMMAND bash -c "cd ${install_dir} && wget -q ${INFERENCE_URL}/${filename}")
execute_process(COMMAND bash -c "cd ${install_dir} && tar xzf ${filename}")
message(STATUS "finish downloading ${filename}")
endfunction(inference_download_and_uncompress)
function(download_model_and_data install_dir model_name data_name)
if (NOT EXISTS ${install_dir} AND WITH_INFERENCE)
inference_download_and_uncompress(${install_dir} ${model_name})
inference_download_and_uncompress(${install_dir} ${data_name})
endif()
endfunction()
# RNN1
set(RNN1_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/rnn1")
download_model_and_data(${RNN1_INSTALL_DIR} "rnn1%2Fmodel.tar.gz" "rnn1%2Fdata.txt.tar.gz")
inference_analysis_test(test_analyzer_rnn1 SRCS analyzer_rnn1_tester.cc
EXTRA_DEPS ${INFERENCE_EXTRA_DEPS}
ARGS --infer_model=${RNN1_INSTALL_DIR}/model
--infer_data=${RNN1_INSTALL_DIR}/data.txt)
# RNN2
set(RNN2_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/rnn2")
download_model_and_data(${RNN2_INSTALL_DIR} "rnn2_model.tar.gz" "rnn2_data.txt.tar.gz")
inference_analysis_test(test_analyzer_rnn2 SRCS analyzer_rnn2_tester.cc
EXTRA_DEPS ${INFERENCE_EXTRA_DEPS}
ARGS --infer_model=${RNN2_INSTALL_DIR}/model
--infer_data=${RNN2_INSTALL_DIR}/data.txt)
# chinese_ner
set(CHINESE_NER_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/chinese_ner")
download_model_and_data(${CHINESE_NER_INSTALL_DIR} "chinese_ner_model.tar.gz" "chinese_ner-data.txt.tar.gz")
inference_analysis_test(test_analyzer_ner SRCS analyzer_ner_tester.cc
EXTRA_DEPS ${INFERENCE_EXTRA_DEPS}
ARGS --infer_model=${CHINESE_NER_INSTALL_DIR}/model
--infer_data=${CHINESE_NER_INSTALL_DIR}/data.txt)
# lac
set(LAC_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/lac")
download_model_and_data(${LAC_INSTALL_DIR} "lac_model.tar.gz" "lac_data.txt.tar.gz")
inference_analysis_test(test_analyzer_lac SRCS analyzer_lac_tester.cc
EXTRA_DEPS ${INFERENCE_EXTRA_DEPS}
ARGS --infer_model=${LAC_INSTALL_DIR}/model
--infer_data=${LAC_INSTALL_DIR}/data.txt)
# text_classification
set(TEXT_CLASSIFICATION_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/text_classification")
download_model_and_data(${TEXT_CLASSIFICATION_INSTALL_DIR} "text-classification-Senta.tar.gz" "text_classification_data.txt.tar.gz")
inference_analysis_test(test_analyzer_text_classification SRCS analyzer_text_classification_tester.cc
EXTRA_DEPS ${INFERENCE_EXTRA_DEPS}
ARGS --infer_model=${TEXT_CLASSIFICATION_INSTALL_DIR}/text-classification-Senta
--infer_data=${TEXT_CLASSIFICATION_INSTALL_DIR}/data.txt)
# ocr
set(OCR_MODEL_URL "http://paddlemodels.cdn.bcebos.com/inference-vis-demos%2Focr.tar.gz")
set(OCR_INSTALL_DIR "${THIRD_PARTY_PATH}/inference_demo/ocr")
if (NOT EXISTS ${OCR_INSTALL_DIR} AND WITH_INFERENCE)
get_filename_component(filename ${OCR_MODEL_URL} NAME)
message(STATUS "Download inference test stuff ${filename} from ${OCR_MODEL_URL}")
execute_process(COMMAND bash -c "mkdir -p ${OCR_INSTALL_DIR}")
execute_process(COMMAND bash -c "cd ${OCR_INSTALL_DIR} && wget -q ${OCR_MODEL_URL}")
execute_process(COMMAND bash -c "cd ${OCR_INSTALL_DIR} && tar xzf ${filename}")
message(STATUS "finish downloading ${filename}")
endif()
inference_analysis_test(test_analyzer_ocr SRCS analyzer_vis_tester.cc
EXTRA_DEPS ${INFERENCE_EXTRA_DEPS}
ARGS --infer_model=${OCR_INSTALL_DIR}/model
--infer_data=${OCR_INSTALL_DIR}/data.txt)
......@@ -12,21 +12,7 @@
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/inference/analysis/analyzer.h"
#include <gtest/gtest.h>
#include "paddle/fluid/framework/ir/fuse_pass_base.h"
#include "paddle/fluid/inference/analysis/ut_helper.h"
#include "paddle/fluid/inference/api/analysis_predictor.h"
#include "paddle/fluid/inference/api/helper.h"
#include "paddle/fluid/inference/api/paddle_inference_pass.h"
#include "paddle/fluid/platform/profiler.h"
DEFINE_string(infer_model, "", "model path for LAC");
DEFINE_string(infer_data, "", "data file for LAC");
DEFINE_int32(batch_size, 1, "batch size.");
DEFINE_int32(burning, 0, "Burning before repeat.");
DEFINE_int32(repeat, 1, "Running the inference program repeat times.");
DEFINE_bool(test_all_data, false, "Test the all dataset in data file.");
#include "paddle/fluid/inference/tests/api/tester_helper.h"
namespace paddle {
namespace inference {
......@@ -117,34 +103,6 @@ void GetOneBatch(std::vector<PaddleTensor> *input_slots, DataRecord *data,
input_slots->assign({input_tensor});
}
void BenchAllData(const std::string &model_path, const std::string &data_file,
const int batch_size, const int repeat) {
NativeConfig config;
config.model_dir = model_path;
config.use_gpu = false;
config.device = 0;
config.specify_input_name = true;
std::vector<PaddleTensor> input_slots, outputs_slots;
DataRecord data(data_file, batch_size);
auto predictor =
CreatePaddlePredictor<NativeConfig, PaddleEngineKind::kNative>(config);
GetOneBatch(&input_slots, &data, batch_size);
for (int i = 0; i < FLAGS_burning; i++) {
predictor->Run(input_slots, &outputs_slots);
}
Timer timer;
double sum = 0;
for (int i = 0; i < repeat; i++) {
for (size_t bid = 0; bid < data.batched_datas.size(); ++bid) {
GetOneBatch(&input_slots, &data, batch_size);
timer.tic();
predictor->Run(input_slots, &outputs_slots);
sum += timer.toc();
}
}
PrintTime(batch_size, repeat, 1, 0, sum / repeat);
}
const int64_t lac_ref_data[] = {24, 25, 25, 25, 38, 30, 31, 14, 15, 44, 24, 25,
25, 25, 25, 25, 44, 24, 25, 25, 25, 36, 42, 43,
44, 14, 15, 44, 14, 15, 44, 14, 15, 44, 38, 39,
......@@ -152,48 +110,38 @@ const int64_t lac_ref_data[] = {24, 25, 25, 25, 38, 30, 31, 14, 15, 44, 24, 25,
void TestLACPrediction(const std::string &model_path,
const std::string &data_file, const int batch_size,
const int repeat, bool test_all_data,
bool use_analysis = false) {
NativeConfig config;
config.model_dir = model_path;
config.use_gpu = false;
config.device = 0;
config.specify_input_name = true;
const int repeat, bool use_analysis = false) {
AnalysisConfig cfg;
cfg.model_dir = model_path;
cfg.use_gpu = false;
cfg.device = 0;
cfg.specify_input_name = true;
cfg.enable_ir_optim = true;
std::vector<PaddleTensor> input_slots, outputs_slots;
DataRecord data(data_file, batch_size);
GetOneBatch(&input_slots, &data, batch_size);
std::unique_ptr<PaddlePredictor> predictor;
if (use_analysis) {
AnalysisConfig cfg;
cfg.model_dir = model_path;
cfg.use_gpu = false;
cfg.device = 0;
cfg.specify_input_name = true;
cfg.enable_ir_optim = true;
predictor =
CreatePaddlePredictor<AnalysisConfig, PaddleEngineKind::kAnalysis>(cfg);
} else {
predictor =
CreatePaddlePredictor<NativeConfig, PaddleEngineKind::kNative>(config);
CreatePaddlePredictor<NativeConfig, PaddleEngineKind::kNative>(cfg);
}
for (int i = 0; i < FLAGS_burning; i++) {
predictor->Run(input_slots, &outputs_slots);
}
Timer timer;
if (test_all_data) {
double sum = 0;
LOG(INFO) << "Total number of samples: " << data.datasets.size();
for (int i = 0; i < repeat; i++) {
for (size_t bid = 0; bid < data.batched_datas.size(); ++bid) {
GetOneBatch(&input_slots, &data, batch_size);
timer.tic();
predictor->Run(input_slots, &outputs_slots);
sum += timer.toc();
}
if (FLAGS_test_all_data) {
LOG(INFO) << "test all data";
std::vector<std::vector<PaddleTensor>> input_slots_all;
for (size_t bid = 0; bid < data.batched_datas.size(); ++bid) {
GetOneBatch(&input_slots, &data, batch_size);
input_slots_all.emplace_back(input_slots);
}
PrintTime(batch_size, repeat, 1, 0, sum / repeat);
LOG(INFO) << "Average latency of each sample: "
<< sum / repeat / data.datasets.size() << " ms";
LOG(INFO) << "total number of samples: " << data.datasets.size();
TestPrediction(cfg, input_slots_all, &outputs_slots, FLAGS_num_threads);
return;
}
timer.tic();
......@@ -218,19 +166,10 @@ void TestLACPrediction(const std::string &model_path,
if (use_analysis) {
// run once for comparion as reference
auto ref_predictor =
CreatePaddlePredictor<NativeConfig, PaddleEngineKind::kNative>(config);
CreatePaddlePredictor<NativeConfig, PaddleEngineKind::kNative>(cfg);
std::vector<PaddleTensor> ref_outputs_slots;
ref_predictor->Run(input_slots, &ref_outputs_slots);
EXPECT_EQ(ref_outputs_slots.size(), outputs_slots.size());
auto &ref_out = ref_outputs_slots[0];
size_t ref_size =
std::accumulate(ref_out.shape.begin(), ref_out.shape.end(), 1,
[](int a, int b) { return a * b; });
EXPECT_EQ(size, ref_size);
int64_t *pdata_ref = static_cast<int64_t *>(ref_out.data.data());
for (size_t i = 0; i < size; ++i) {
EXPECT_EQ(pdata_ref[i], pdata[i]);
}
CompareResult(ref_outputs_slots, outputs_slots);
AnalysisPredictor *analysis_predictor =
dynamic_cast<AnalysisPredictor *>(predictor.get());
......@@ -259,13 +198,13 @@ void TestLACPrediction(const std::string &model_path,
TEST(Analyzer_LAC, native) {
LOG(INFO) << "LAC with native";
TestLACPrediction(FLAGS_infer_model, FLAGS_infer_data, FLAGS_batch_size,
FLAGS_repeat, FLAGS_test_all_data);
FLAGS_repeat);
}
TEST(Analyzer_LAC, analysis) {
LOG(INFO) << "LAC with analysis";
TestLACPrediction(FLAGS_infer_model, FLAGS_infer_data, FLAGS_batch_size,
FLAGS_repeat, FLAGS_test_all_data, true);
FLAGS_repeat, true);
}
} // namespace analysis
......
......@@ -12,20 +12,7 @@
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/inference/analysis/analyzer.h"
#include <gtest/gtest.h>
#include "paddle/fluid/framework/ir/fuse_pass_base.h"
#include "paddle/fluid/inference/analysis/ut_helper.h"
#include "paddle/fluid/inference/api/analysis_predictor.h"
#include "paddle/fluid/inference/api/helper.h"
#include "paddle/fluid/inference/api/paddle_inference_pass.h"
#include "paddle/fluid/platform/profiler.h"
DEFINE_string(infer_model, "", "model path");
DEFINE_string(infer_data, "", "data path");
DEFINE_int32(batch_size, 10, "batch size.");
DEFINE_int32(repeat, 1, "Running the inference program repeat times.");
DEFINE_bool(test_all_data, false, "Test the all dataset in data file.");
#include "paddle/fluid/inference/tests/api/tester_helper.h"
namespace paddle {
namespace inference {
......@@ -113,49 +100,35 @@ const int chinese_ner_result_data[] = {30, 45, 41, 48, 17, 26,
48, 39, 38, 16, 25};
void TestChineseNERPrediction(bool use_analysis) {
NativeConfig config;
config.prog_file = FLAGS_infer_model + "/__model__";
config.param_file = FLAGS_infer_model + "/param";
config.use_gpu = false;
config.device = 0;
config.specify_input_name = true;
AnalysisConfig cfg;
cfg.prog_file = FLAGS_infer_model + "/__model__";
cfg.param_file = FLAGS_infer_model + "/param";
cfg.use_gpu = false;
cfg.device = 0;
cfg.specify_input_name = true;
cfg.enable_ir_optim = true;
std::vector<PaddleTensor> input_slots, outputs;
std::unique_ptr<PaddlePredictor> predictor;
Timer timer;
if (use_analysis) {
AnalysisConfig cfg;
cfg.prog_file = FLAGS_infer_model + "/__model__";
cfg.param_file = FLAGS_infer_model + "/param";
cfg.use_gpu = false;
cfg.device = 0;
cfg.specify_input_name = true;
cfg.enable_ir_optim = true;
predictor =
CreatePaddlePredictor<AnalysisConfig, PaddleEngineKind::kAnalysis>(cfg);
} else {
predictor =
CreatePaddlePredictor<NativeConfig, PaddleEngineKind::kNative>(config);
CreatePaddlePredictor<NativeConfig, PaddleEngineKind::kNative>(cfg);
}
if (FLAGS_test_all_data) {
LOG(INFO) << "test all data";
double sum = 0;
size_t num_samples;
for (int i = 0; i < FLAGS_repeat; i++) {
DataRecord data(FLAGS_infer_data, FLAGS_batch_size);
num_samples = data.num_samples;
for (size_t bid = 0; bid < num_samples; ++bid) {
PrepareInputs(&input_slots, &data, FLAGS_batch_size);
timer.tic();
predictor->Run(input_slots, &outputs);
sum += timer.toc();
}
DataRecord data(FLAGS_infer_data, FLAGS_batch_size);
std::vector<std::vector<PaddleTensor>> input_slots_all;
for (size_t bid = 0; bid < data.num_samples / FLAGS_batch_size; ++bid) {
PrepareInputs(&input_slots, &data, FLAGS_batch_size);
input_slots_all.emplace_back(input_slots);
}
LOG(INFO) << "total number of samples: " << num_samples;
PrintTime(FLAGS_batch_size, FLAGS_repeat, 1, 0, sum / FLAGS_repeat);
LOG(INFO) << "average latency of each sample: "
<< sum / FLAGS_repeat / num_samples;
LOG(INFO) << "total number of samples: " << data.num_samples;
TestPrediction(cfg, input_slots_all, &outputs, FLAGS_num_threads);
return;
}
// Prepare inputs.
......@@ -181,19 +154,10 @@ void TestChineseNERPrediction(bool use_analysis) {
if (use_analysis) {
// run once for comparion as reference
auto ref_predictor =
CreatePaddlePredictor<NativeConfig, PaddleEngineKind::kNative>(config);
CreatePaddlePredictor<NativeConfig, PaddleEngineKind::kNative>(cfg);
std::vector<PaddleTensor> ref_outputs_slots;
ref_predictor->Run(input_slots, &ref_outputs_slots);
EXPECT_EQ(ref_outputs_slots.size(), outputs.size());
auto &ref_out = ref_outputs_slots[0];
size_t ref_size =
std::accumulate(ref_out.shape.begin(), ref_out.shape.end(), 1,
[](int a, int b) { return a * b; });
EXPECT_EQ(size, ref_size);
int64_t *pdata_ref = static_cast<int64_t *>(ref_out.data.data());
for (size_t i = 0; i < size; ++i) {
EXPECT_EQ(pdata_ref[i], result[i]);
}
CompareResult(ref_outputs_slots, outputs);
AnalysisPredictor *analysis_predictor =
dynamic_cast<AnalysisPredictor *>(predictor.get());
......
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/inference/tests/api/tester_helper.h"
namespace paddle {
namespace inference {
using namespace framework; // NOLINT
struct DataRecord {
std::vector<std::vector<std::vector<float>>> link_step_data_all;
std::vector<std::vector<float>> week_data_all, minute_data_all;
std::vector<size_t> lod1, lod2, lod3;
std::vector<std::vector<float>> rnn_link_data, rnn_week_datas,
rnn_minute_datas;
size_t batch_iter{0};
size_t batch_size{1};
DataRecord() = default;
explicit DataRecord(const std::string &path, int batch_size = 1)
: batch_size(batch_size) {
Load(path);
}
DataRecord NextBatch() {
DataRecord data;
size_t batch_end = batch_iter + batch_size;
// NOTE skip the final batch, if no enough data is provided.
if (batch_end <= link_step_data_all.size()) {
data.link_step_data_all.assign(link_step_data_all.begin() + batch_iter,
link_step_data_all.begin() + batch_end);
data.week_data_all.assign(week_data_all.begin() + batch_iter,
week_data_all.begin() + batch_end);
data.minute_data_all.assign(minute_data_all.begin() + batch_iter,
minute_data_all.begin() + batch_end);
// Prepare LoDs
data.lod1.push_back(0);
data.lod2.push_back(0);
data.lod3.push_back(0);
CHECK(!data.link_step_data_all.empty()) << "empty";
CHECK(!data.week_data_all.empty());
CHECK(!data.minute_data_all.empty());
CHECK_EQ(data.link_step_data_all.size(), data.week_data_all.size());
CHECK_EQ(data.minute_data_all.size(), data.link_step_data_all.size());
for (size_t j = 0; j < data.link_step_data_all.size(); j++) {
for (const auto &d : data.link_step_data_all[j]) {
data.rnn_link_data.push_back(d);
}
data.rnn_week_datas.push_back(data.week_data_all[j]);
data.rnn_minute_datas.push_back(data.minute_data_all[j]);
// calculate lod
data.lod1.push_back(data.lod1.back() +
data.link_step_data_all[j].size());
data.lod3.push_back(data.lod3.back() + 1);
for (size_t i = 1; i < data.link_step_data_all[j].size() + 1; i++) {
data.lod2.push_back(data.lod2.back() +
data.link_step_data_all[j].size());
}
}
}
batch_iter += batch_size;
return data;
}
void Load(const std::string &path) {
std::ifstream file(path);
std::string line;
int num_lines = 0;
while (std::getline(file, line)) {
num_lines++;
std::vector<std::string> data;
split(line, ':', &data);
std::vector<std::vector<float>> link_step_data;
std::vector<std::string> link_datas;
split(data[0], '|', &link_datas);
for (auto &step_data : link_datas) {
std::vector<float> tmp;
split_to_float(step_data, ',', &tmp);
link_step_data.push_back(tmp);
}
// load week data
std::vector<float> week_data;
split_to_float(data[2], ',', &week_data);
// load minute data
std::vector<float> minute_data;
split_to_float(data[1], ',', &minute_data);
link_step_data_all.push_back(std::move(link_step_data));
week_data_all.push_back(std::move(week_data));
minute_data_all.push_back(std::move(minute_data));
}
}
};
void PrepareInputs(std::vector<PaddleTensor> *input_slots, DataRecord *data,
int batch_size) {
PaddleTensor lod_attention_tensor, init_zero_tensor, lod_tensor_tensor,
week_tensor, minute_tensor;
lod_attention_tensor.name = "data_lod_attention";
init_zero_tensor.name = "cell_init";
lod_tensor_tensor.name = "data";
week_tensor.name = "week";
minute_tensor.name = "minute";
auto one_batch = data->NextBatch();
std::vector<int> rnn_link_data_shape(
{static_cast<int>(one_batch.rnn_link_data.size()),
static_cast<int>(one_batch.rnn_link_data.front().size())});
lod_attention_tensor.shape.assign({1, 2});
lod_attention_tensor.lod.assign({one_batch.lod1, one_batch.lod2});
init_zero_tensor.shape.assign({batch_size, 15});
init_zero_tensor.lod.assign({one_batch.lod3});
lod_tensor_tensor.shape = rnn_link_data_shape;
lod_tensor_tensor.lod.assign({one_batch.lod1});
// clang-format off
week_tensor.shape.assign(
{static_cast<int>(one_batch.rnn_week_datas.size()),
static_cast<int>(one_batch.rnn_week_datas.front().size())});
week_tensor.lod.assign({one_batch.lod3});
minute_tensor.shape.assign(
{static_cast<int>(one_batch.rnn_minute_datas.size()),
static_cast<int>(one_batch.rnn_minute_datas.front().size())});
minute_tensor.lod.assign({one_batch.lod3});
// clang-format on
// assign data
TensorAssignData<float>(&lod_attention_tensor,
std::vector<std::vector<float>>({{0, 0}}));
std::vector<float> tmp_zeros(batch_size * 15, 0.);
TensorAssignData<float>(&init_zero_tensor, {tmp_zeros});
TensorAssignData<float>(&lod_tensor_tensor, one_batch.rnn_link_data);
TensorAssignData<float>(&week_tensor, one_batch.rnn_week_datas);
TensorAssignData<float>(&minute_tensor, one_batch.rnn_minute_datas);
// Set inputs.
auto init_zero_tensor1 = init_zero_tensor;
init_zero_tensor1.name = "hidden_init";
input_slots->assign({week_tensor, init_zero_tensor, minute_tensor,
init_zero_tensor1, lod_attention_tensor,
lod_tensor_tensor});
for (auto &tensor : *input_slots) {
tensor.dtype = PaddleDType::FLOAT32;
}
}
// Test with a really complicate model.
void TestRNN1Prediction(bool use_analysis, bool activate_ir, int num_threads) {
AnalysisConfig config;
config.prog_file = FLAGS_infer_model + "/__model__";
config.param_file = FLAGS_infer_model + "/param";
config.use_gpu = false;
config.device = 0;
config.specify_input_name = true;
config.enable_ir_optim = activate_ir;
PADDLE_ENFORCE(config.ir_mode ==
AnalysisConfig::IrPassMode::kExclude); // default
config.ir_passes.clear(); // Do not exclude any pass.
int batch_size = FLAGS_batch_size;
auto base_predictor =
CreatePaddlePredictor<NativeConfig, PaddleEngineKind::kNative>(config);
auto predictor =
CreatePaddlePredictor<AnalysisConfig, PaddleEngineKind::kAnalysis>(
config);
std::vector<PaddleTensor> input_slots;
DataRecord data(FLAGS_infer_data, batch_size);
// Prepare inputs.
PrepareInputs(&input_slots, &data, batch_size);
std::vector<PaddleTensor> outputs, base_outputs;
base_predictor->Run(input_slots, &base_outputs);
std::vector<std::vector<PaddleTensor>> input_slots_all;
input_slots_all.emplace_back(input_slots);
if (num_threads == 1) {
TestOneThreadPrediction(config, input_slots_all, &outputs);
CompareResult(outputs, base_outputs);
} else {
// only return the output of first thread
TestMultiThreadPrediction(config, input_slots_all, &outputs, num_threads);
}
if (use_analysis && activate_ir) {
AnalysisPredictor *analysis_predictor =
dynamic_cast<AnalysisPredictor *>(predictor.get());
auto &fuse_statis = analysis_predictor->analysis_argument()
.Get<std::unordered_map<std::string, int>>(
framework::ir::kFuseStatisAttr);
for (auto &item : fuse_statis) {
LOG(INFO) << "fused " << item.first << " " << item.second;
}
int num_ops = 0;
for (auto &node :
analysis_predictor->analysis_argument().main_dfg->nodes.nodes()) {
if (node->IsFunction()) {
++num_ops;
}
}
LOG(INFO) << "has num ops: " << num_ops;
ASSERT_TRUE(fuse_statis.count("fc_fuse"));
EXPECT_EQ(fuse_statis.at("fc_fuse"), 1);
EXPECT_EQ(fuse_statis.at("fc_nobias_lstm_fuse"), 2); // bi-directional LSTM
EXPECT_EQ(fuse_statis.at("seq_concat_fc_fuse"), 1);
EXPECT_EQ(num_ops,
13); // After graph optimization, only 13 operators exists.
}
}
// Inference with analysis and IR, easy for profiling independently.
TEST(Analyzer, rnn1) { TestRNN1Prediction(true, true, FLAGS_num_threads); }
// Other unit-tests of RNN1, test different options of use_analysis,
// activate_ir and multi-threads.
TEST(Analyzer, RNN_tests) {
int num_threads[2] = {1, 4};
for (auto i : num_threads) {
// Directly infer with the original model.
TestRNN1Prediction(false, false, i);
// Inference with the original model with the analysis turned on, the
// analysis module will transform the program to a data flow graph.
TestRNN1Prediction(true, false, i);
// Inference with analysis and IR. The IR module will fuse some large
// kernels.
TestRNN1Prediction(true, true, i);
}
}
} // namespace inference
} // namespace paddle
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/inference/analysis/analyzer.h"
#include <google/protobuf/text_format.h>
#include <gtest/gtest.h>
#include <thread> // NOLINT
#include "paddle/fluid/framework/ir/fuse_pass_base.h"
#include "paddle/fluid/framework/ir/pass.h"
#include "paddle/fluid/inference/analysis/ut_helper.h"
#include "paddle/fluid/inference/api/analysis_predictor.h"
#include "paddle/fluid/inference/api/helper.h"
#include "paddle/fluid/inference/api/paddle_inference_api.h"
#include "paddle/fluid/inference/api/paddle_inference_pass.h"
DEFINE_string(infer_model, "", "model path");
DEFINE_string(infer_data, "", "data path");
DEFINE_int32(batch_size, 1, "batch size.");
DEFINE_int32(repeat, 1, "Running the inference program repeat times.");
DEFINE_int32(num_threads, 1, "Running the inference program in multi-threads.");
namespace paddle {
namespace inference {
using namespace framework; // NOLINT
struct DataRecord {
std::vector<std::vector<std::vector<float>>> link_step_data_all;
std::vector<size_t> lod;
std::vector<std::vector<float>> rnn_link_data;
std::vector<float> result_data;
size_t batch_iter{0};
size_t batch_size{1};
DataRecord() = default;
explicit DataRecord(const std::string &path, int batch_size = 1)
: batch_size(batch_size) {
Load(path);
}
DataRecord NextBatch() {
DataRecord data;
size_t batch_end = batch_iter + batch_size;
// NOTE skip the final batch, if no enough data is provided.
if (batch_end <= link_step_data_all.size()) {
data.link_step_data_all.assign(link_step_data_all.begin() + batch_iter,
link_step_data_all.begin() + batch_end);
// Prepare LoDs
data.lod.push_back(0);
CHECK(!data.link_step_data_all.empty()) << "empty";
for (size_t j = 0; j < data.link_step_data_all.size(); j++) {
for (const auto &d : data.link_step_data_all[j]) {
data.rnn_link_data.push_back(d);
// calculate lod
data.lod.push_back(data.lod.back() + 11);
}
}
}
batch_iter += batch_size;
return data;
}
void Load(const std::string &path) {
std::ifstream file(path);
std::string line;
int num_lines = 0;
while (std::getline(file, line)) {
num_lines++;
std::vector<std::string> data;
split(line, ':', &data);
if (num_lines % 2) { // feature
std::vector<std::string> feature_data;
split(data[1], ' ', &feature_data);
std::vector<std::vector<float>> link_step_data;
int feature_count = 1;
std::vector<float> feature;
for (auto &step_data : feature_data) {
std::vector<float> tmp;
split_to_float(step_data, ',', &tmp);
feature.insert(feature.end(), tmp.begin(), tmp.end());
if (feature_count % 11 == 0) { // each sample has 11 features
link_step_data.push_back(feature);
feature.clear();
}
feature_count++;
}
link_step_data_all.push_back(std::move(link_step_data));
} else { // result
std::vector<float> tmp;
split_to_float(data[1], ',', &tmp);
result_data.insert(result_data.end(), tmp.begin(), tmp.end());
}
}
}
};
void PrepareInputs(std::vector<PaddleTensor> *input_slots, DataRecord *data,
int batch_size) {
PaddleTensor feed_tensor;
feed_tensor.name = "feed";
auto one_batch = data->NextBatch();
int token_size = one_batch.rnn_link_data.size();
// each token has 11 features, each feature's dim is 54.
std::vector<int> rnn_link_data_shape({token_size * 11, 54});
feed_tensor.shape = rnn_link_data_shape;
feed_tensor.lod.assign({one_batch.lod});
feed_tensor.dtype = PaddleDType::FLOAT32;
TensorAssignData<float>(&feed_tensor, one_batch.rnn_link_data);
// Set inputs.
input_slots->assign({feed_tensor});
}
void CompareResult(const std::vector<PaddleTensor> &outputs,
const std::vector<float> &base_result) {
PADDLE_ENFORCE_GT(outputs.size(), 0);
for (size_t i = 0; i < outputs.size(); i++) {
auto &out = outputs[i];
size_t size = std::accumulate(out.shape.begin(), out.shape.end(), 1,
[](int a, int b) { return a * b; });
PADDLE_ENFORCE_GT(size, 0);
float *data = static_cast<float *>(out.data.data());
for (size_t i = 0; i < size; i++) {
EXPECT_NEAR(data[i], base_result[i], 1e-3);
}
}
}
// Test with a really complicate model.
void TestRNN2Prediction() {
AnalysisConfig config;
config.prog_file = FLAGS_infer_model + "/__model__";
config.param_file = FLAGS_infer_model + "/param";
config.use_gpu = false;
config.device = 0;
config.specify_input_name = true;
config.enable_ir_optim = true;
PADDLE_ENFORCE(config.ir_mode ==
AnalysisConfig::IrPassMode::kExclude); // default
int batch_size = FLAGS_batch_size;
int num_times = FLAGS_repeat;
auto base_predictor =
CreatePaddlePredictor<NativeConfig, PaddleEngineKind::kNative>(config);
auto predictor =
CreatePaddlePredictor<AnalysisConfig, PaddleEngineKind::kAnalysis>(
config);
std::vector<PaddleTensor> input_slots;
DataRecord data(FLAGS_infer_data, batch_size);
PrepareInputs(&input_slots, &data, batch_size);
std::vector<PaddleTensor> outputs, base_outputs;
Timer timer1;
timer1.tic();
for (int i = 0; i < num_times; i++) {
base_predictor->Run(input_slots, &base_outputs);
}
PrintTime(batch_size, num_times, 1, 0, timer1.toc() / num_times);
Timer timer2;
timer2.tic();
for (int i = 0; i < num_times; i++) {
predictor->Run(input_slots, &outputs);
}
PrintTime(batch_size, num_times, 1, 0, timer2.toc() / num_times);
CompareResult(base_outputs, data.result_data);
CompareResult(outputs, data.result_data);
}
TEST(Analyzer, rnn2) { TestRNN2Prediction(); }
} // namespace inference
} // namespace paddle
......@@ -12,23 +12,7 @@
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/inference/analysis/analyzer.h"
#include <gflags/gflags.h>
#include <glog/logging.h> // use glog instead of PADDLE_ENFORCE to avoid importing other paddle header files.
#include <gtest/gtest.h>
#include <fstream>
#include "paddle/fluid/framework/ir/pass.h"
#include "paddle/fluid/inference/analysis/ut_helper.h"
#include "paddle/fluid/inference/api/helper.h"
#include "paddle/fluid/inference/api/paddle_inference_api.h"
#include "paddle/fluid/inference/api/paddle_inference_pass.h"
#include "paddle/fluid/inference/api/timer.h"
DEFINE_string(infer_model, "", "Directory of the inference model.");
DEFINE_string(infer_data, "", "Path of the dataset.");
DEFINE_int32(batch_size, 1, "batch size.");
DEFINE_int32(repeat, 1, "How many times to repeat run.");
DEFINE_int32(topn, -1, "Run top n batches of data to save time");
#include "paddle/fluid/inference/tests/api/tester_helper.h"
namespace paddle {
namespace inference {
......@@ -37,24 +21,25 @@ struct DataReader {
explicit DataReader(const std::string &path)
: file(new std::ifstream(path)) {}
bool NextBatch(PaddleTensor *tensor, int batch_size) {
bool NextBatch(std::vector<PaddleTensor> *input, int batch_size) {
PADDLE_ENFORCE_EQ(batch_size, 1);
std::string line;
tensor->lod.clear();
tensor->lod.emplace_back(std::vector<size_t>({0}));
PaddleTensor tensor;
tensor.dtype = PaddleDType::INT64;
tensor.lod.emplace_back(std::vector<size_t>({0}));
std::vector<int64_t> data;
for (int i = 0; i < batch_size; i++) {
if (!std::getline(*file, line)) return false;
inference::split_to_int64(line, ' ', &data);
}
tensor->lod.front().push_back(data.size());
tensor.lod.front().push_back(data.size());
tensor->data.Resize(data.size() * sizeof(int64_t));
memcpy(tensor->data.data(), data.data(), data.size() * sizeof(int64_t));
tensor->shape.clear();
tensor->shape.push_back(data.size());
tensor->shape.push_back(1);
tensor.data.Resize(data.size() * sizeof(int64_t));
memcpy(tensor.data.data(), data.data(), data.size() * sizeof(int64_t));
tensor.shape.push_back(data.size());
tensor.shape.push_back(1);
input->assign({tensor});
return true;
}
......@@ -68,32 +53,28 @@ void Main(int batch_size) {
config.model_dir = FLAGS_infer_model;
config.use_gpu = false;
config.enable_ir_optim = true;
auto predictor =
CreatePaddlePredictor<AnalysisConfig, PaddleEngineKind::kAnalysis>(
config);
std::vector<PaddleTensor> input_slots(1);
// one batch starts
// data --
auto &input = input_slots[0];
input.dtype = PaddleDType::INT64;
inference::Timer timer;
double sum = 0;
std::vector<PaddleTensor> output_slots;
std::vector<PaddleTensor> input_slots, output_slots;
DataReader reader(FLAGS_infer_data);
std::vector<std::vector<PaddleTensor>> input_slots_all;
int num_batches = 0;
for (int t = 0; t < FLAGS_repeat; t++) {
DataReader reader(FLAGS_infer_data);
while (reader.NextBatch(&input, FLAGS_batch_size)) {
if (FLAGS_topn > 0 && num_batches > FLAGS_topn) break;
timer.tic();
CHECK(predictor->Run(input_slots, &output_slots));
sum += timer.toc();
if (FLAGS_test_all_data) {
LOG(INFO) << "test all data";
int num_batches = 0;
while (reader.NextBatch(&input_slots, FLAGS_batch_size)) {
input_slots_all.emplace_back(input_slots);
++num_batches;
}
LOG(INFO) << "total number of samples: " << num_batches * FLAGS_batch_size;
TestPrediction(config, input_slots_all, &output_slots, FLAGS_num_threads);
return;
}
PrintTime(batch_size, FLAGS_repeat, 1, 0, sum / FLAGS_repeat);
// one batch starts
// data --
reader.NextBatch(&input_slots, FLAGS_batch_size);
input_slots_all.emplace_back(input_slots);
TestPrediction(config, input_slots_all, &output_slots, FLAGS_num_threads);
// Get output
LOG(INFO) << "get outputs " << output_slots.size();
......
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <fstream>
#include <iostream>
#include "paddle/fluid/inference/tests/api/tester_helper.h"
namespace paddle {
namespace inference {
namespace analysis {
struct Record {
std::vector<float> data;
std::vector<int32_t> shape;
};
Record ProcessALine(const std::string &line) {
VLOG(3) << "process a line";
std::vector<std::string> columns;
split(line, '\t', &columns);
CHECK_EQ(columns.size(), 2UL)
<< "data format error, should be <data>\t<shape>";
Record record;
std::vector<std::string> data_strs;
split(columns[0], ' ', &data_strs);
for (auto &d : data_strs) {
record.data.push_back(std::stof(d));
}
std::vector<std::string> shape_strs;
split(columns[1], ' ', &shape_strs);
for (auto &s : shape_strs) {
record.shape.push_back(std::stoi(s));
}
VLOG(3) << "data size " << record.data.size();
VLOG(3) << "data shape size " << record.shape.size();
return record;
}
/*
* Use the native and analysis fluid engine to inference the demo.
* ocr, mobilenet and se_resnext50
*/
void TestVisualPrediction(bool use_mkldnn) {
std::unique_ptr<PaddlePredictor> predictor;
AnalysisConfig cfg;
cfg.param_file = FLAGS_infer_model + "/__params__";
cfg.prog_file = FLAGS_infer_model + "/__model__";
cfg.use_gpu = false;
cfg._use_mkldnn = use_mkldnn;
cfg.device = 0;
cfg.enable_ir_optim = true;
// TODO(TJ): fix fusion gru
cfg.ir_passes.push_back("fc_gru_fuse_pass");
#ifdef PADDLE_WITH_MKLDNN
// disable mkldnn fuse since it should have some bugs
cfg.ir_passes.push_back("conv_relu_mkldnn_fuse_pass");
#endif
predictor =
CreatePaddlePredictor<AnalysisConfig, PaddleEngineKind::kAnalysis>(cfg);
// Only have single batch of data.
std::string line;
std::ifstream file(FLAGS_infer_data);
std::getline(file, line);
auto record = ProcessALine(line);
file.close();
// Inference.
PaddleTensor input;
input.shape = record.shape;
input.data =
PaddleBuf(record.data.data(), record.data.size() * sizeof(float));
input.dtype = PaddleDType::FLOAT32;
std::vector<PaddleTensor> outputs_slots;
Timer timer;
timer.tic();
for (int i = 0; i < FLAGS_repeat; i++) {
predictor->Run({input}, &outputs_slots);
}
PrintTime(/*batch size*/ 1, FLAGS_repeat, /*num threads*/ 1, /*thread id*/ 0,
timer.toc() / FLAGS_repeat);
VLOG(3) << "output.size " << outputs_slots.size();
// run native as reference
auto ref_predictor =
CreatePaddlePredictor<NativeConfig, PaddleEngineKind::kNative>(cfg);
std::vector<PaddleTensor> ref_outputs_slots;
ref_predictor->Run({input}, &ref_outputs_slots);
CompareResult(outputs_slots, ref_outputs_slots);
// print what are fused
AnalysisPredictor *analysis_predictor =
dynamic_cast<AnalysisPredictor *>(predictor.get());
auto &fuse_statis = analysis_predictor->analysis_argument()
.Get<std::unordered_map<std::string, int>>(
framework::ir::kFuseStatisAttr);
for (auto &item : fuse_statis) {
LOG(INFO) << "fused " << item.first << " " << item.second;
}
int num_ops = 0;
for (auto &node :
analysis_predictor->analysis_argument().main_dfg->nodes.nodes()) {
if (node->IsFunction()) {
++num_ops;
}
}
LOG(INFO) << "has num ops: " << num_ops;
}
TEST(Analyzer_vis, analysis) { TestVisualPrediction(/*use_mkldnn*/ false); }
#ifdef PADDLE_WITH_MKLDNN
TEST(Analyzer_vis, analysis_mkldnn) {
TestVisualPrediction(/*use_mkldnn*/ true);
}
#endif
} // namespace analysis
} // namespace inference
} // namespace paddle
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <gtest/gtest.h>
#include <thread> // NOLINT
#include <vector>
#include "paddle/fluid/framework/ir/fuse_pass_base.h"
#include "paddle/fluid/inference/analysis/analyzer.h"
#include "paddle/fluid/inference/analysis/ut_helper.h"
#include "paddle/fluid/inference/api/analysis_predictor.h"
#include "paddle/fluid/inference/api/helper.h"
#include "paddle/fluid/inference/api/paddle_inference_pass.h"
#include "paddle/fluid/platform/profiler.h"
DEFINE_string(infer_model, "", "model path");
DEFINE_string(infer_data, "", "data file");
DEFINE_int32(batch_size, 1, "batch size.");
DEFINE_int32(burning, 0, "Burning before repeat.");
DEFINE_int32(repeat, 1, "Running the inference program repeat times.");
DEFINE_bool(test_all_data, false, "Test the all dataset in data file.");
DEFINE_int32(num_threads, 1, "Running the inference program in multi-threads.");
namespace paddle {
namespace inference {
void CompareResult(const std::vector<PaddleTensor> &outputs,
const std::vector<PaddleTensor> &ref_outputs) {
EXPECT_GT(outputs.size(), 0);
EXPECT_EQ(outputs.size(), ref_outputs.size());
for (size_t i = 0; i < outputs.size(); i++) {
auto &out = outputs[i];
auto &ref_out = ref_outputs[i];
size_t size = std::accumulate(out.shape.begin(), out.shape.end(), 1,
[](int a, int b) { return a * b; });
size_t ref_size =
std::accumulate(ref_out.shape.begin(), ref_out.shape.end(), 1,
[](int a, int b) { return a * b; });
EXPECT_GT(size, 0);
EXPECT_EQ(size, ref_size);
EXPECT_EQ(out.dtype, ref_out.dtype);
switch (out.dtype) {
case PaddleDType::INT64: {
int64_t *pdata = static_cast<int64_t *>(out.data.data());
int64_t *pdata_ref = static_cast<int64_t *>(ref_out.data.data());
for (size_t j = 0; j < size; ++j) {
EXPECT_EQ(pdata_ref[j], pdata[j]);
}
break;
}
case PaddleDType::FLOAT32: {
float *pdata = static_cast<float *>(out.data.data());
float *pdata_ref = static_cast<float *>(ref_out.data.data());
for (size_t j = 0; j < size; ++j) {
EXPECT_NEAR(pdata_ref[j], pdata[j], 1e-3);
}
break;
}
}
}
}
void TestOneThreadPrediction(
AnalysisConfig config, const std::vector<std::vector<PaddleTensor>> inputs,
std::vector<PaddleTensor> *outputs) {
int batch_size = FLAGS_batch_size;
int num_times = FLAGS_repeat;
auto predictor =
CreatePaddlePredictor<AnalysisConfig, PaddleEngineKind::kAnalysis>(
config);
Timer timer;
timer.tic();
for (int i = 0; i < num_times; i++) {
for (size_t j = 0; j < inputs.size(); j++) {
predictor->Run(inputs[j], outputs);
}
}
PrintTime(batch_size, num_times, 1, 0, timer.toc() / num_times,
inputs.size());
}
void TestMultiThreadPrediction(
AnalysisConfig config, const std::vector<std::vector<PaddleTensor>> inputs,
std::vector<PaddleTensor> *outputs, int num_threads) {
int batch_size = FLAGS_batch_size;
int num_times = FLAGS_repeat;
std::vector<std::thread> threads;
std::vector<std::unique_ptr<PaddlePredictor>> predictors;
// TODO(yanchunwei): Bug here, the analyzer phase can't be parallelled
// because AttentionLSTM's hard code nodeid will be damanged.
for (int tid = 0; tid < num_threads; ++tid) {
predictors.emplace_back(
CreatePaddlePredictor<AnalysisConfig, PaddleEngineKind::kAnalysis>(
config));
}
for (int tid = 0; tid < num_threads; ++tid) {
threads.emplace_back([&, tid]() {
// Each thread should have local inputs and outputs.
// The inputs of each thread are all the same.
std::vector<std::vector<PaddleTensor>> inputs_tid = inputs;
std::vector<PaddleTensor> outputs_tid;
Timer timer;
timer.tic();
for (int i = 0; i < num_times; i++) {
for (size_t j = 0; j < inputs_tid.size(); j++) {
predictors[tid]->Run(inputs_tid[j], &outputs_tid);
}
}
PrintTime(batch_size, num_times, num_threads, tid,
timer.toc() / num_times, inputs_tid.size());
});
}
for (int i = 0; i < num_threads; ++i) {
threads[i].join();
}
}
void TestPrediction(AnalysisConfig config,
const std::vector<std::vector<PaddleTensor>> inputs,
std::vector<PaddleTensor> *outputs, int num_threads) {
if (num_threads == 1) {
TestOneThreadPrediction(config, inputs, outputs);
} else {
TestMultiThreadPrediction(config, inputs, outputs, num_threads);
}
}
} // namespace inference
} // namespace paddle
......@@ -24,28 +24,28 @@ namespace operators {
void AttentionLSTMOp::InferShape(framework::InferShapeContext* ctx) const {
PADDLE_ENFORCE(ctx->HasInput("X"),
"Input(X) of AttentionLSTM should not be null.");
"Assert only one Input(X) of AttentionLSTM.");
PADDLE_ENFORCE(ctx->HasInput("C0"),
"Input(C0) of AttentionLSTM should not be null.");
"Assert only one Input(C0) of AttentionLSTM.");
PADDLE_ENFORCE(ctx->HasInput("LSTMWeight"),
"Input(LSTMWeight) of AttentionLSTM should not be null.");
"Assert only one Input(LSTMWeight) of AttentionLSTM.");
PADDLE_ENFORCE(ctx->HasInput("LSTMBias"),
"Input(LSTMBias) of AttentionLSTM should not be null.");
"Assert only one Input(LSTMBias) of AttentionLSTM.");
PADDLE_ENFORCE(ctx->HasInput("AttentionWeight"),
"Input(AttentionWeight) of AttentionLSTM should not be null.");
"Assert only one Input(AttentionWeight) of AttentionLSTM.");
PADDLE_ENFORCE(ctx->HasOutput("Hidden"),
"Output(Hidden) of AttentionLSTM should not be null.");
"Assert only one Output(Hidden) of AttentionLSTM.");
PADDLE_ENFORCE(ctx->HasOutput("Cell"),
"Output(Cell) of AttentionLSTM should not be null.");
"Assert only one Output(Cell) of AttentionLSTM.");
PADDLE_ENFORCE(ctx->HasOutput("AttentionedX"),
"Output(AttentionedX) of AttentionLSTM should not be null.");
"Assert only one Output(AttentionedX) of AttentionLSTM.");
PADDLE_ENFORCE(ctx->HasOutput("AttentionFCOut"),
"Output(AttentionFCOut) of AttentionLSTM should not be null.");
"Assert only one Output(AttentionFCOut) of AttentionLSTM.");
PADDLE_ENFORCE(ctx->HasOutput("LSTMX"),
"Output(LSTMX) of AttentionLSTM should not be null.");
"Assert only one Output(LSTMX) of AttentionLSTM.");
PADDLE_ENFORCE(ctx->HasOutput("LSTMOUT"),
"Output(LSTMOUT) of AttentionLSTM should not be null.");
"Assert only one Output(LSTMOUT) of AttentionLSTM.");
auto x_dims = ctx->GetInputDim("X");
const int M = x_dims[1];
......
......@@ -118,7 +118,6 @@ class CUDNNConvOpKernel : public framework::OpKernel<T> {
output_channels / groups * output_height * output_width * output_depth;
int group_offset_filter = filter->numel() / groups;
// ------------------- cudnn conv workspace ---------------------
void* cudnn_workspace = nullptr;
size_t workspace_size_in_bytes; // final workspace to allocate.
size_t workspace_size_limit = kCONV_CUDNN_WORKSPACE_LIMIT_BYTES;
if (user_workspace_size > 0) {
......@@ -159,20 +158,18 @@ class CUDNNConvOpKernel : public framework::OpKernel<T> {
PADDLE_ENFORCE_LE(workspace_size_in_bytes, workspace_size_limit,
"workspace_size to be allocated exceeds the limit");
// Allocate on GPU memory
platform::CUDAPlace gpu = boost::get<platform::CUDAPlace>(ctx.GetPlace());
cudnn_workspace = paddle::memory::Alloc(gpu, workspace_size_in_bytes);
// ------------------- cudnn conv forward ---------------------
ScalingParamType<T> alpha = 1.0f, beta = 0.0f;
for (int i = 0; i < groups; i++) {
CUDNN_ENFORCE(platform::dynload::cudnnConvolutionForward(
handle, &alpha, cudnn_input_desc, input_data + i * group_offset_in,
cudnn_filter_desc, filter_data + i * group_offset_filter,
cudnn_conv_desc, algo, cudnn_workspace, workspace_size_in_bytes,
&beta, cudnn_output_desc, output_data + i * group_offset_out));
auto cudnn_func = [&](void* cudnn_workspace) {
CUDNN_ENFORCE(platform::dynload::cudnnConvolutionForward(
handle, &alpha, cudnn_input_desc, input_data + i * group_offset_in,
cudnn_filter_desc, filter_data + i * group_offset_filter,
cudnn_conv_desc, algo, cudnn_workspace, workspace_size_in_bytes,
&beta, cudnn_output_desc, output_data + i * group_offset_out));
};
dev_ctx.RunCudnnFuncWithWorkspace(cudnn_func, workspace_size_in_bytes);
}
// Release the cudnn workspace
paddle::memory::Free(gpu, cudnn_workspace);
}
};
......@@ -314,11 +311,7 @@ class CUDNNConvGradOpKernel : public framework::OpKernel<T> {
cudnn_filter_desc, filter_algo, &tmp_size));
workspace_size_in_bytes = std::max(workspace_size_in_bytes, tmp_size);
}
// ------------------- cudnn conv workspace ---------------------
// Already on GPU
void* cudnn_workspace = nullptr;
platform::CUDAPlace gpu = boost::get<platform::CUDAPlace>(ctx.GetPlace());
cudnn_workspace = paddle::memory::Alloc(gpu, workspace_size_in_bytes);
// ------------------- cudnn conv backward data ---------------------
ScalingParamType<T> alpha = 1.0f, beta = 0.0f;
if (input_grad) {
......@@ -326,12 +319,15 @@ class CUDNNConvGradOpKernel : public framework::OpKernel<T> {
// Because beta is zero, it is unnecessary to reset input_grad.
for (int i = 0; i < groups; i++) {
CUDNN_ENFORCE(platform::dynload::cudnnConvolutionBackwardData(
handle, &alpha, cudnn_filter_desc,
filter_data + i * group_offset_filter, cudnn_output_grad_desc,
output_grad_data + i * group_offset_out, cudnn_conv_desc, data_algo,
cudnn_workspace, workspace_size_in_bytes, &beta, cudnn_input_desc,
input_grad_data + i * group_offset_in));
auto cudnn_func = [&](void* cudnn_workspace) {
CUDNN_ENFORCE(platform::dynload::cudnnConvolutionBackwardData(
handle, &alpha, cudnn_filter_desc,
filter_data + i * group_offset_filter, cudnn_output_grad_desc,
output_grad_data + i * group_offset_out, cudnn_conv_desc,
data_algo, cudnn_workspace, workspace_size_in_bytes, &beta,
cudnn_input_desc, input_grad_data + i * group_offset_in));
};
dev_ctx.RunCudnnFuncWithWorkspace(cudnn_func, workspace_size_in_bytes);
}
}
// ------------------- cudnn conv backward filter ---------------------
......@@ -339,16 +335,17 @@ class CUDNNConvGradOpKernel : public framework::OpKernel<T> {
T* filter_grad_data = filter_grad->mutable_data<T>(ctx.GetPlace());
// Because beta is zero, it is unnecessary to reset filter_grad.
for (int i = 0; i < groups; i++) {
CUDNN_ENFORCE(platform::dynload::cudnnConvolutionBackwardFilter(
handle, &alpha, cudnn_input_desc, input_data + i * group_offset_in,
cudnn_output_grad_desc, output_grad_data + i * group_offset_out,
cudnn_conv_desc, filter_algo, cudnn_workspace,
workspace_size_in_bytes, &beta, cudnn_filter_desc,
filter_grad_data + i * group_offset_filter));
auto cudnn_func = [&](void* cudnn_workspace) {
CUDNN_ENFORCE(platform::dynload::cudnnConvolutionBackwardFilter(
handle, &alpha, cudnn_input_desc,
input_data + i * group_offset_in, cudnn_output_grad_desc,
output_grad_data + i * group_offset_out, cudnn_conv_desc,
filter_algo, cudnn_workspace, workspace_size_in_bytes, &beta,
cudnn_filter_desc, filter_grad_data + i * group_offset_filter));
};
dev_ctx.RunCudnnFuncWithWorkspace(cudnn_func, workspace_size_in_bytes);
}
}
// Release the cudnn workspace
paddle::memory::Free(gpu, cudnn_workspace);
}
};
......
......@@ -130,12 +130,13 @@ class ConvMKLDNNHandler : public platform::MKLDNNHandler {
std::shared_ptr<mkldnn::memory> AcquireWeightsMemoryFromPrimitive(
const std::shared_ptr<mkldnn::memory> user_weights_memory_p,
std::vector<mkldnn::primitive>& pipeline) { // NOLINT
std::vector<mkldnn::primitive>& pipeline, // NOLINT
bool is_persistent = false) {
auto user_weights_pd = user_weights_memory_p->get_primitive_desc();
auto weights_pd = conv_pd_->weights_primitive_desc();
return this->AcquireMemory(weights_pd, user_weights_pd,
user_weights_memory_p, "@weights_mem_p",
pipeline);
pipeline, is_persistent);
}
std::shared_ptr<mkldnn::memory> AcquireBiasMemoryFromPrimitive(
......@@ -266,6 +267,8 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
PADDLE_ENFORCE(paddle::platform::is_cpu_place(ctx.GetPlace()),
"It must use CPUPlace.");
const bool is_test = ctx.Attr<bool>("is_test");
auto& dev_ctx =
ctx.template device_context<paddle::platform::MKLDNNDeviceContext>();
const auto& mkldnn_engine = dev_ctx.GetEngine();
......@@ -296,10 +299,10 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
std::vector<int> strides = ctx.Attr<std::vector<int>>("strides");
std::vector<int> paddings = ctx.Attr<std::vector<int>>("paddings");
std::vector<int> dilations = ctx.Attr<std::vector<int>>("dilations");
bool fuse_relu = ctx.Attr<bool>("fuse_relu");
int groups = ctx.Attr<int>("groups");
// TODO(pzelazko-intel) add support for group convolution and dilation
PADDLE_ENFORCE(groups == 1, "group convolution is not implemented yet");
// TODO: add support for dilation
PADDLE_ENFORCE(
dilations.size() == 2 && dilations[0] == 1 && dilations[1] == 1,
"dilation in convolution is not implemented yet");
......@@ -310,6 +313,19 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
std::vector<int> src_tz = paddle::framework::vectorize2int(input->dims());
std::vector<int> weights_tz =
paddle::framework::vectorize2int(filter->dims());
int g = std::max(groups, 1);
if (g > 1) {
int o = weights_tz[0];
int i = weights_tz[1];
int h = weights_tz[2];
int w = weights_tz[3];
weights_tz.resize(5);
weights_tz[0] = g;
weights_tz[1] = o / g;
weights_tz[2] = i;
weights_tz[3] = h;
weights_tz[4] = w;
}
std::vector<int> dst_tz = paddle::framework::vectorize2int(output->dims());
// Get unique name for storing MKLDNN primitives
......@@ -323,7 +339,8 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
auto user_src_md = platform::MKLDNNMemDesc(
{src_tz}, platform::MKLDNNGetDataType<T>(), input->format());
auto user_weights_md = platform::MKLDNNMemDesc(
{weights_tz}, platform::MKLDNNGetDataType<T>(), filter->format());
{weights_tz}, platform::MKLDNNGetDataType<T>(),
(g == 1) ? filter->format() : mkldnn::memory::format::goihw);
/* create memory descriptor for convolution without specified format
* ('any') which lets a primitive (convolution in this case) choose
......@@ -336,7 +353,8 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
auto src_md = platform::MKLDNNMemDesc(
src_tz, platform::MKLDNNGetDataType<T>(), chosen_memory_format);
auto weights_md = platform::MKLDNNMemDesc(
weights_tz, platform::MKLDNNGetDataType<T>(), chosen_memory_format);
weights_tz, platform::MKLDNNGetDataType<T>(),
(g == 1) ? chosen_memory_format : mkldnn::memory::format::goihw);
std::vector<int> bias_tz; // TODO(mgallus): avoid empty vector creation.
// Currently used whenever bias is != nullptr.
auto dst_md = platform::MKLDNNMemDesc(
......@@ -348,11 +366,12 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
bias_tz = paddle::framework::vectorize2int(bias->dims());
auto bias_md = platform::MKLDNNMemDesc(
bias_tz, platform::MKLDNNGetDataType<T>(), memory::format::x);
conv_pd = ConvFwdPrimitiveDesc(src_md, weights_md, bias_md, dst_md,
strides, paddings, mkldnn_engine);
conv_pd =
ConvFwdPrimitiveDesc(src_md, weights_md, bias_md, dst_md, strides,
paddings, mkldnn_engine, fuse_relu);
} else {
conv_pd = ConvFwdPrimitiveDesc(src_md, weights_md, dst_md, strides,
paddings, mkldnn_engine);
paddings, mkldnn_engine, fuse_relu);
}
// Save conv_pd/src_memory/weights_memory for backward pass
dev_ctx.SetBlob(key_conv_pd, conv_pd);
......@@ -371,7 +390,7 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
auto src_memory_p =
handler.AcquireSrcMemoryFromPrimitive(user_src_memory_p, pipeline);
auto weights_memory_p = handler.AcquireWeightsMemoryFromPrimitive(
user_weights_memory_p, pipeline);
user_weights_memory_p, pipeline, is_test);
auto dst_memory_p =
handler.AcquireDstMemoryFromPrimitive(to_void_cast<T>(output_data));
......@@ -402,11 +421,26 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
}
private:
mkldnn::primitive_attr AddRelu() const {
// Fusion with ReLU layer is executed through the PostOps feature. Create a
// PostOps object and configure it to execute an eltwise relu operation.
mkldnn::primitive_attr conv_attr;
constexpr float scale = 1.0f;
constexpr float negative_slope = 0.0f;
constexpr float placeholder = 0.0f;
mkldnn::post_ops post_operations;
post_operations.append_eltwise(scale, mkldnn::algorithm::eltwise_relu,
negative_slope, placeholder);
conv_attr.set_post_ops(post_operations);
return conv_attr;
}
std::unique_ptr<mkldnn::convolution_forward::primitive_desc>
ConvFwdPrimitiveDesc(const memory::desc& src, const memory::desc& weights,
const memory::desc& dst, const std::vector<int>& strides,
const std::vector<int>& paddings,
const mkldnn::engine& engine) const {
const mkldnn::engine& engine,
const bool fuse_relu) const {
memory::dims stride_dims = {strides[0], strides[1]};
memory::dims padding_dims = {paddings[0], paddings[1]};
......@@ -415,8 +449,13 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
dst, stride_dims, padding_dims, padding_dims,
mkldnn::padding_kind::zero);
auto p_conv_pd =
new mkldnn::convolution_forward::primitive_desc(conv_desc, engine);
mkldnn::primitive_attr conv_attr;
if (fuse_relu) {
conv_attr = AddRelu();
}
auto p_conv_pd = new mkldnn::convolution_forward::primitive_desc(
conv_desc, conv_attr, engine);
return std::unique_ptr<mkldnn::convolution_forward::primitive_desc>(
p_conv_pd);
......@@ -427,7 +466,8 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
const memory::desc& bias, const memory::desc& dst,
const std::vector<int>& strides,
const std::vector<int>& paddings,
const mkldnn::engine& engine) const {
const mkldnn::engine& engine,
const bool fuse_relu) const {
memory::dims stride_dims = {strides[0], strides[1]};
memory::dims padding_dims = {paddings[0], paddings[1]};
......@@ -436,8 +476,13 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
bias, dst, stride_dims, padding_dims, padding_dims,
mkldnn::padding_kind::zero);
auto p_conv_pd =
new mkldnn::convolution_forward::primitive_desc(conv_desc, engine);
mkldnn::primitive_attr conv_attr;
if (fuse_relu) {
conv_attr = AddRelu();
}
auto p_conv_pd = new mkldnn::convolution_forward::primitive_desc(
conv_desc, conv_attr, engine);
return std::unique_ptr<mkldnn::convolution_forward::primitive_desc>(
p_conv_pd);
......
......@@ -109,6 +109,7 @@ framework::OpKernelType ConvOp::GetExpectedKernelType(
}
void Conv2DOpMaker::Make() {
AddAttr<bool>("is_test", "").SetDefault(false);
AddInput(
"Input",
"(Tensor) The input tensor of convolution operator. "
......@@ -161,6 +162,8 @@ void Conv2DOpMaker::Make() {
AddAttr<bool>("use_mkldnn",
"(bool, default false) Only used in mkldnn kernel")
.SetDefault(false);
AddAttr<bool>("fuse_relu", "(bool, default false) Only used in mkldnn kernel")
.SetDefault(false);
AddAttr<std::string>(
"data_format",
"(string, default NCHW) Only used in "
......
......@@ -76,7 +76,6 @@ class CUDNNConvTransposeOpKernel : public framework::OpKernel<T> {
conv_desc.descriptor<T>(paddings, strides, dilations);
// ------------------- cudnn conv workspace ---------------------
void* cudnn_workspace = nullptr;
size_t workspace_size_in_bytes; // final workspace to allocate.
size_t workspace_size_limit = kConvCUDNNWorkspaceLimitBytes;
if (user_workspace_size > 0) {
......@@ -100,25 +99,21 @@ class CUDNNConvTransposeOpKernel : public framework::OpKernel<T> {
handle, cudnn_filter_desc, cudnn_input_desc, cudnn_conv_desc,
cudnn_output_desc, algo, &workspace_size_in_bytes));
// Allocate on GPU memory
platform::CUDAPlace gpu = boost::get<platform::CUDAPlace>(ctx.GetPlace());
cudnn_workspace = paddle::memory::Alloc(gpu, workspace_size_in_bytes);
// ------------------- cudnn conv transpose forward ---------------------
int input_offset = input->numel() / input->dims()[0] / groups;
int output_offset = output->numel() / output->dims()[0] / groups;
int filter_offset = filter->numel() / groups;
T alpha = 1.0f, beta = 0.0f;
for (int g = 0; g < groups; g++) {
CUDNN_ENFORCE(platform::dynload::cudnnConvolutionBackwardData(
handle, &alpha, cudnn_filter_desc, filter_data + filter_offset * g,
cudnn_input_desc, input_data + input_offset * g, cudnn_conv_desc,
algo, cudnn_workspace, workspace_size_in_bytes, &beta,
cudnn_output_desc, output_data + output_offset * g));
auto cudnn_func = [&](void* cudnn_workspace) {
CUDNN_ENFORCE(platform::dynload::cudnnConvolutionBackwardData(
handle, &alpha, cudnn_filter_desc, filter_data + filter_offset * g,
cudnn_input_desc, input_data + input_offset * g, cudnn_conv_desc,
algo, cudnn_workspace, workspace_size_in_bytes, &beta,
cudnn_output_desc, output_data + output_offset * g));
};
dev_ctx.RunCudnnFuncWithWorkspace(cudnn_func, workspace_size_in_bytes);
}
// Release the cudnn workspace
paddle::memory::Free(gpu, cudnn_workspace);
}
};
......@@ -206,11 +201,6 @@ class CUDNNConvTransposeGradOpKernel : public framework::OpKernel<T> {
std::max(workspace_size_in_bytes, bwd_filter_ws_size);
}
// ------------------- cudnn conv workspace ---------------------
// Already on GPU
void* cudnn_workspace = nullptr;
platform::CUDAPlace gpu = boost::get<platform::CUDAPlace>(ctx.GetPlace());
cudnn_workspace = paddle::memory::Alloc(gpu, workspace_size_in_bytes);
// ------------------- cudnn conv backward data ---------------------
// FIXME(typhoonzero): template type T may not be the same as cudnn call.
int input_offset = input->numel() / input->dims()[0] / groups;
......@@ -222,12 +212,15 @@ class CUDNNConvTransposeGradOpKernel : public framework::OpKernel<T> {
T* input_grad_data = input_grad->mutable_data<T>(ctx.GetPlace());
// Because beta is zero, it is unnecessary to reset input_grad.
for (int g = 0; g < groups; g++) {
CUDNN_ENFORCE(platform::dynload::cudnnConvolutionForward(
handle, &alpha, cudnn_output_desc,
output_grad_data + output_grad_offset * g, cudnn_filter_desc,
filter_data + filter_offset * g, cudnn_conv_desc, data_algo,
cudnn_workspace, workspace_size_in_bytes, &beta, cudnn_input_desc,
input_grad_data + input_offset * g));
auto cudnn_func = [&](void* cudnn_workspace) {
CUDNN_ENFORCE(platform::dynload::cudnnConvolutionForward(
handle, &alpha, cudnn_output_desc,
output_grad_data + output_grad_offset * g, cudnn_filter_desc,
filter_data + filter_offset * g, cudnn_conv_desc, data_algo,
cudnn_workspace, workspace_size_in_bytes, &beta, cudnn_input_desc,
input_grad_data + input_offset * g));
};
dev_ctx.RunCudnnFuncWithWorkspace(cudnn_func, workspace_size_in_bytes);
}
}
......@@ -237,17 +230,17 @@ class CUDNNConvTransposeGradOpKernel : public framework::OpKernel<T> {
// Because beta is zero, it is unnecessary to reset filter_grad.
// Gradient with respect to the filter
for (int g = 0; g < groups; g++) {
CUDNN_ENFORCE(platform::dynload::cudnnConvolutionBackwardFilter(
handle, &alpha, cudnn_output_desc,
output_grad_data + output_grad_offset * g, cudnn_input_desc,
input_data + input_offset * g, cudnn_conv_desc, filter_algo,
cudnn_workspace, workspace_size_in_bytes, &beta, cudnn_filter_desc,
filter_grad_data + filter_offset * g));
auto cudnn_func = [&](void* cudnn_workspace) {
CUDNN_ENFORCE(platform::dynload::cudnnConvolutionBackwardFilter(
handle, &alpha, cudnn_output_desc,
output_grad_data + output_grad_offset * g, cudnn_input_desc,
input_data + input_offset * g, cudnn_conv_desc, filter_algo,
cudnn_workspace, workspace_size_in_bytes, &beta,
cudnn_filter_desc, filter_grad_data + filter_offset * g));
};
dev_ctx.RunCudnnFuncWithWorkspace(cudnn_func, workspace_size_in_bytes);
}
}
// Release the cudnn workspace
paddle::memory::Free(gpu, cudnn_workspace);
}
};
......
......@@ -138,6 +138,11 @@ class CrossEntropyOpMaker : public framework::OpProtoAndCheckerMaker {
"(bool, default false), a flag indicating whether to "
"interpretate the given labels as soft labels.")
.SetDefault(false);
AddAttr<int>("ignore_index",
"(int, default -100), Specifies a target value that is"
"ignored and does not contribute to the input gradient."
"Only valid if soft_label is set to False")
.SetDefault(-100);
AddComment(R"DOC(
CrossEntropy Operator.
......
......@@ -40,7 +40,7 @@ class CrossEntropyOpKernel : public framework::OpKernel<T> {
math::CrossEntropyFunctor<DeviceContext, T>()(
ctx.template device_context<DeviceContext>(), &y_2d, &x_2d, &labels_2d,
ctx.Attr<bool>("soft_label"));
ctx.Attr<bool>("soft_label"), ctx.Attr<int>("ignore_index"));
}
};
......@@ -74,16 +74,22 @@ class XeGradFunctor {
const T* dy, // NOLINT
const T* x, // NOLINT
const int64_t* label, // NOLINT
size_t num_classes)
: dx_(dx), dy_(dy), x_(x), label_(label), num_classes_(num_classes) {}
size_t num_classes, size_t ignore_index)
: dx_(dx),
dy_(dy),
x_(x),
label_(label),
num_classes_(num_classes),
ignore_index_(ignore_index) {}
HOSTDEVICE void operator()(size_t sample_id) {
auto x_is_true_offset = sample_id * num_classes_ + label_[sample_id];
for (size_t x_offset = sample_id * num_classes_;
x_offset < (sample_id + 1) * num_classes_; ++x_offset) {
dx_[x_offset] = x_offset != x_is_true_offset
? static_cast<T>(0)
: -dy_[sample_id] / x_[x_offset];
dx_[x_offset] =
(x_offset != x_is_true_offset || label_[sample_id] == ignore_index_)
? static_cast<T>(0)
: -dy_[sample_id] / x_[x_offset];
}
}
......@@ -93,6 +99,7 @@ class XeGradFunctor {
const T* x_;
const int64_t* label_;
size_t num_classes_;
size_t ignore_index_;
};
template <typename DeviceContext, typename T>
......@@ -109,6 +116,7 @@ class CrossEntropyGradientOpKernel : public framework::OpKernel<T> {
// unnecessary to convert tensors to 2-D views.
int rank = x->dims().size();
int64_t class_num = x->dims()[rank - 1];
int64_t ignore_index = ctx.Attr<int>("ignore_index");
if (ctx.Attr<bool>("soft_label")) {
XeSoftlabelGradFunctor<T> functor(dx_data, dy->data<T>(), x->data<T>(),
label->data<T>(),
......@@ -118,9 +126,9 @@ class CrossEntropyGradientOpKernel : public framework::OpKernel<T> {
static_cast<size_t>(dx->numel()));
for_range(functor);
} else {
XeGradFunctor<T> functor(dx_data, dy->data<T>(), x->data<T>(),
label->data<int64_t>(),
static_cast<size_t>(class_num));
XeGradFunctor<T> functor(
dx_data, dy->data<T>(), x->data<T>(), label->data<int64_t>(),
static_cast<size_t>(class_num), static_cast<size_t>(ignore_index));
platform::ForRange<DeviceContext> for_range(
ctx.template device_context<DeviceContext>(),
static_cast<size_t>(dy->numel()));
......
......@@ -9,6 +9,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <algorithm>
#include "paddle/fluid/framework/eigen.h"
#include "paddle/fluid/framework/tensor.h"
......@@ -21,7 +22,7 @@ namespace operators {
*/
template <typename T>
inline void BoxToDelta(const int box_num, const framework::Tensor& ex_boxes,
const framework::Tensor& gt_boxes, const T* weights,
const framework::Tensor& gt_boxes, const float* weights,
const bool normalized, framework::Tensor* box_delta) {
auto ex_boxes_et = framework::EigenTensor<T, 2>::From(ex_boxes);
auto gt_boxes_et = framework::EigenTensor<T, 2>::From(gt_boxes);
......@@ -62,5 +63,35 @@ void Gather(const T* in, const int in_stride, const int* index, const int num,
}
}
template <typename T>
void BboxOverlaps(const framework::Tensor& r_boxes,
const framework::Tensor& c_boxes,
framework::Tensor* overlaps) {
auto r_boxes_et = framework::EigenTensor<T, 2>::From(r_boxes);
auto c_boxes_et = framework::EigenTensor<T, 2>::From(c_boxes);
auto overlaps_et = framework::EigenTensor<T, 2>::From(*overlaps);
int r_num = r_boxes.dims()[0];
int c_num = c_boxes.dims()[0];
auto zero = static_cast<T>(0.0);
T r_box_area, c_box_area, x_min, y_min, x_max, y_max, inter_w, inter_h,
inter_area;
for (int i = 0; i < r_num; ++i) {
r_box_area = (r_boxes_et(i, 2) - r_boxes_et(i, 0) + 1) *
(r_boxes_et(i, 3) - r_boxes_et(i, 1) + 1);
for (int j = 0; j < c_num; ++j) {
c_box_area = (c_boxes_et(j, 2) - c_boxes_et(j, 0) + 1) *
(c_boxes_et(j, 3) - c_boxes_et(j, 1) + 1);
x_min = std::max(r_boxes_et(i, 0), c_boxes_et(j, 0));
y_min = std::max(r_boxes_et(i, 1), c_boxes_et(j, 1));
x_max = std::min(r_boxes_et(i, 2), c_boxes_et(j, 2));
y_max = std::min(r_boxes_et(i, 3), c_boxes_et(j, 3));
inter_w = std::max(x_max - x_min + 1, zero);
inter_h = std::max(y_max - y_min + 1, zero);
inter_area = inter_w * inter_h;
overlaps_et(i, j) = inter_area / (r_box_area + c_box_area - inter_area);
}
}
}
} // namespace operators
} // namespace paddle
......@@ -42,10 +42,11 @@ class GenerateProposalLabelsOp : public framework::OperatorWithKernel {
"Input(RpnRois) shouldn't be null.");
PADDLE_ENFORCE(ctx->HasInput("GtClasses"),
"Input(GtClasses) shouldn't be null.");
PADDLE_ENFORCE(ctx->HasInput("IsCrowd"),
"Input(IsCrowd) shouldn't be null.");
PADDLE_ENFORCE(ctx->HasInput("GtBoxes"),
"Input(GtBoxes) shouldn't be null.");
PADDLE_ENFORCE(ctx->HasInput("ImScales"),
"Input(ImScales) shouldn't be null.");
PADDLE_ENFORCE(ctx->HasInput("ImInfo"), "Input(ImInfo) shouldn't be null.");
PADDLE_ENFORCE(ctx->HasOutput("Rois"),
"Output(Rois) of RpnTargetAssignOp should not be null");
......@@ -64,22 +65,21 @@ class GenerateProposalLabelsOp : public framework::OperatorWithKernel {
auto rpn_rois_dims = ctx->GetInputDim("RpnRois");
auto gt_classes_dims = ctx->GetInputDim("GtClasses");
auto is_crowd_dims = ctx->GetInputDim("IsCrowd");
auto gt_boxes_dims = ctx->GetInputDim("GtBoxes");
auto im_scales_dims = ctx->GetInputDim("ImScales");
auto im_info_dims = ctx->GetInputDim("ImInfo");
PADDLE_ENFORCE_EQ(rpn_rois_dims.size(), 2,
"The rank of Input(RpnRois) must be 2.");
PADDLE_ENFORCE_EQ(gt_classes_dims.size(), 1,
"The rank of Input(GtClasses) must be 1.");
PADDLE_ENFORCE_EQ(gt_boxes_dims.size(), 2,
"The rank of Input(GtBoxes) must be 2.");
PADDLE_ENFORCE_EQ(im_scales_dims.size(), 1,
"The rank of Input(ImScales) must be 1.");
PADDLE_ENFORCE_EQ(im_info_dims.size(), 2,
"The rank of Input(ImInfo) must be 2.");
int class_nums = ctx->Attrs().Get<int>("class_nums");
ctx->SetOutputDim("Rois", {-1, 4});
ctx->SetOutputDim("LabelsInt32", {-1});
ctx->SetOutputDim("LabelsInt32", {-1, 1});
ctx->SetOutputDim("BboxTargets", {-1, 4 * class_nums});
ctx->SetOutputDim("BboxInsideWeights", {-1, 4 * class_nums});
ctx->SetOutputDim("BboxOutsideWeights", {-1, 4 * class_nums});
......@@ -105,45 +105,18 @@ void Concat(const platform::CPUDeviceContext& context,
concat_functor(context, inputs, axis, out_tensor);
}
template <typename T>
void BboxOverlaps(const Tensor& r_boxes, const Tensor& c_boxes,
Tensor* overlaps) {
auto r_boxes_et = framework::EigenTensor<T, 2>::From(r_boxes);
auto c_boxes_et = framework::EigenTensor<T, 2>::From(c_boxes);
auto overlaps_et = framework::EigenTensor<T, 2>::From(*overlaps);
int r_num = r_boxes.dims()[0];
int c_num = c_boxes.dims()[0];
auto zero = static_cast<T>(0.0);
T r_box_area, c_box_area, x_min, y_min, x_max, y_max, inter_w, inter_h,
inter_area;
for (int i = 0; i < r_num; ++i) {
r_box_area = (r_boxes_et(i, 2) - r_boxes_et(i, 0) + 1) *
(r_boxes_et(i, 3) - r_boxes_et(i, 1) + 1);
for (int j = 0; j < c_num; ++j) {
c_box_area = (c_boxes_et(j, 2) - c_boxes_et(j, 0) + 1) *
(c_boxes_et(j, 3) - c_boxes_et(j, 1) + 1);
x_min = std::max(r_boxes_et(i, 0), c_boxes_et(j, 0));
y_min = std::max(r_boxes_et(i, 1), c_boxes_et(j, 1));
x_max = std::min(r_boxes_et(i, 2), c_boxes_et(j, 2));
y_max = std::min(r_boxes_et(i, 3), c_boxes_et(j, 3));
inter_w = std::max(x_max - x_min + 1, zero);
inter_h = std::max(y_max - y_min + 1, zero);
inter_area = inter_w * inter_h;
overlaps_et(i, j) = inter_area / (r_box_area + c_box_area - inter_area);
}
}
}
template <typename T>
std::vector<std::vector<int>> SampleFgBgGt(
const platform::CPUDeviceContext& context, Tensor* iou,
const int batch_size_per_im, const float fg_fraction, const float fg_thresh,
const float bg_thresh_hi, const float bg_thresh_lo,
std::minstd_rand engine) {
const Tensor& is_crowd, const int batch_size_per_im,
const float fg_fraction, const float fg_thresh, const float bg_thresh_hi,
const float bg_thresh_lo, std::minstd_rand engine, const bool use_random) {
std::vector<int> fg_inds;
std::vector<int> bg_inds;
std::vector<int> gt_inds;
T* proposal_to_gt_overlaps = iou->mutable_data<T>(context.GetPlace());
int64_t gt_num = is_crowd.numel();
const int* crowd_data = is_crowd.data<int>();
T* proposal_to_gt_overlaps = iou->data<T>();
int64_t row = iou->dims()[0];
int64_t col = iou->dims()[1];
float epsilon = 0.00001;
......@@ -152,6 +125,9 @@ std::vector<std::vector<int>> SampleFgBgGt(
for (int64_t i = 0; i < row; ++i) {
const T* v = proposal_to_gt_overlaps + i * col;
T max_overlap = *std::max_element(v, v + col);
if ((i < gt_num) && (crowd_data[i])) {
max_overlap = -1.0;
}
if (max_overlap > fg_thresh) {
for (int64_t j = 0; j < col; ++j) {
T val = proposal_to_gt_overlaps[i * col + j];
......@@ -170,17 +146,19 @@ std::vector<std::vector<int>> SampleFgBgGt(
}
// Reservoir Sampling
std::uniform_real_distribution<float> uniform(0, 1);
int fg_rois_per_im = std::floor(batch_size_per_im * fg_fraction);
int fg_rois_this_image = fg_inds.size();
int fg_rois_per_this_image = std::min(fg_rois_per_im, fg_rois_this_image);
std::uniform_real_distribution<float> uniform(0, 1);
const int64_t fg_size = static_cast<int64_t>(fg_inds.size());
if (fg_size > fg_rois_per_this_image) {
for (int64_t i = fg_rois_per_this_image; i < fg_size; ++i) {
int rng_ind = std::floor(uniform(engine) * i);
if (rng_ind < fg_rois_per_this_image) {
std::iter_swap(fg_inds.begin() + rng_ind, fg_inds.begin() + i);
std::iter_swap(gt_inds.begin() + rng_ind, gt_inds.begin() + i);
if (use_random) {
const int64_t fg_size = static_cast<int64_t>(fg_inds.size());
if (fg_size > fg_rois_per_this_image) {
for (int64_t i = fg_rois_per_this_image; i < fg_size; ++i) {
int rng_ind = std::floor(uniform(engine) * i);
if (rng_ind < fg_rois_per_this_image) {
std::iter_swap(fg_inds.begin() + rng_ind, fg_inds.begin() + i);
std::iter_swap(gt_inds.begin() + rng_ind, gt_inds.begin() + i);
}
}
}
}
......@@ -192,12 +170,14 @@ std::vector<std::vector<int>> SampleFgBgGt(
int bg_rois_per_image = batch_size_per_im - fg_rois_per_this_image;
int bg_rois_this_image = bg_inds.size();
int bg_rois_per_this_image = std::min(bg_rois_per_image, bg_rois_this_image);
const int64_t bg_size = static_cast<int64_t>(bg_inds.size());
if (bg_size > bg_rois_per_this_image) {
for (int64_t i = bg_rois_per_this_image; i < bg_size; ++i) {
int rng_ind = std::floor(uniform(engine) * i);
if (rng_ind < fg_rois_per_this_image)
std::iter_swap(bg_inds.begin() + rng_ind, bg_inds.begin() + i);
if (use_random) {
const int64_t bg_size = static_cast<int64_t>(bg_inds.size());
if (bg_size > bg_rois_per_this_image) {
for (int64_t i = bg_rois_per_this_image; i < bg_size; ++i) {
int rng_ind = std::floor(uniform(engine) * i);
if (rng_ind < fg_rois_per_this_image)
std::iter_swap(bg_inds.begin() + rng_ind, bg_inds.begin() + i);
}
}
}
std::vector<int> new_bg_inds(bg_inds.begin(),
......@@ -248,14 +228,14 @@ void GatherBoxesLabels(const platform::CPUDeviceContext& context,
template <typename T>
std::vector<Tensor> SampleRoisForOneImage(
const platform::CPUDeviceContext& context, Tensor* rpn_rois,
Tensor* gt_classes, Tensor* gt_boxes, Tensor* im_scale,
Tensor* gt_classes, Tensor* is_crowd, Tensor* gt_boxes, Tensor* im_info,
const int batch_size_per_im, const float fg_fraction, const float fg_thresh,
const float bg_thresh_hi, const float bg_thresh_lo,
const std::vector<float>& bbox_reg_weights, const int class_nums,
std::minstd_rand engine) {
std::minstd_rand engine, bool use_random) {
auto rpn_rois_et = framework::EigenTensor<T, 2>::From(*rpn_rois);
auto im_scale_data = im_scale->data<T>()[0];
rpn_rois_et = rpn_rois_et / im_scale_data;
auto im_scale = im_info->data<T>()[2];
rpn_rois_et = rpn_rois_et / im_scale;
Tensor boxes;
int proposals_num = gt_boxes->dims()[0] + rpn_rois->dims()[0];
......@@ -270,8 +250,8 @@ std::vector<Tensor> SampleRoisForOneImage(
// Generate proposal index
std::vector<std::vector<int>> fg_bg_gt = SampleFgBgGt<T>(
context, &proposal_to_gt_overlaps, batch_size_per_im, fg_fraction,
fg_thresh, bg_thresh_hi, bg_thresh_lo, engine);
context, &proposal_to_gt_overlaps, *is_crowd, batch_size_per_im,
fg_fraction, fg_thresh, bg_thresh_hi, bg_thresh_lo, engine, use_random);
std::vector<int> fg_inds = fg_bg_gt[0];
std::vector<int> bg_inds = fg_bg_gt[1];
std::vector<int> gt_inds = fg_bg_gt[2];
......@@ -291,15 +271,15 @@ std::vector<Tensor> SampleRoisForOneImage(
// Compute targets
Tensor bbox_targets_single;
bbox_targets_single.mutable_data<T>(bbox_dim, context.GetPlace());
BoxToDelta<T>(fg_num, sampled_boxes, sampled_gts, nullptr, false,
&bbox_targets_single);
BoxToDelta<T>(fg_num, sampled_boxes, sampled_gts, bbox_reg_weights.data(),
false, &bbox_targets_single);
// Scale rois
Tensor sampled_rois;
sampled_rois.mutable_data<T>(sampled_boxes.dims(), context.GetPlace());
auto sampled_rois_et = framework::EigenTensor<T, 2>::From(sampled_rois);
auto sampled_boxes_et = framework::EigenTensor<T, 2>::From(sampled_boxes);
sampled_rois_et = sampled_boxes_et * im_scale_data;
sampled_rois_et = sampled_boxes_et * im_scale;
// Expand box targets
Tensor bbox_targets, bbox_inside_weights, bbox_outside_weights;
......@@ -351,8 +331,9 @@ class GenerateProposalLabelsKernel : public framework::OpKernel<T> {
void Compute(const framework::ExecutionContext& context) const override {
auto* rpn_rois = context.Input<LoDTensor>("RpnRois");
auto* gt_classes = context.Input<LoDTensor>("GtClasses");
auto* is_crowd = context.Input<LoDTensor>("IsCrowd");
auto* gt_boxes = context.Input<LoDTensor>("GtBoxes");
auto* im_scales = context.Input<LoDTensor>("ImScales");
auto* im_info = context.Input<LoDTensor>("ImInfo");
auto* rois = context.Output<LoDTensor>("Rois");
auto* labels_int32 = context.Output<LoDTensor>("LabelsInt32");
......@@ -369,18 +350,21 @@ class GenerateProposalLabelsKernel : public framework::OpKernel<T> {
std::vector<float> bbox_reg_weights =
context.Attr<std::vector<float>>("bbox_reg_weights");
int class_nums = context.Attr<int>("class_nums");
bool use_random = context.Attr<bool>("use_random");
PADDLE_ENFORCE_EQ(rpn_rois->lod().size(), 1UL,
"GenerateProposalLabelsOp rpn_rois needs 1 level of LoD");
PADDLE_ENFORCE_EQ(
gt_classes->lod().size(), 1UL,
"GenerateProposalLabelsOp gt_classes needs 1 level of LoD");
PADDLE_ENFORCE_EQ(is_crowd->lod().size(), 1UL,
"GenerateProposalLabelsOp is_crowd needs 1 level of LoD");
PADDLE_ENFORCE_EQ(gt_boxes->lod().size(), 1UL,
"GenerateProposalLabelsOp gt_boxes needs 1 level of LoD");
int64_t n = static_cast<int64_t>(rpn_rois->lod().back().size() - 1);
rois->mutable_data<T>({n * batch_size_per_im, kBoxDim}, context.GetPlace());
labels_int32->mutable_data<int>({n * batch_size_per_im},
labels_int32->mutable_data<int>({n * batch_size_per_im, 1},
context.GetPlace());
bbox_targets->mutable_data<T>({n * batch_size_per_im, kBoxDim * class_nums},
context.GetPlace());
......@@ -391,8 +375,7 @@ class GenerateProposalLabelsKernel : public framework::OpKernel<T> {
std::random_device rnd;
std::minstd_rand engine;
int seed =
context.Attr<bool>("fix_seed") ? context.Attr<int>("seed") : rnd();
int seed = rnd();
engine.seed(seed);
framework::LoD lod;
......@@ -403,19 +386,23 @@ class GenerateProposalLabelsKernel : public framework::OpKernel<T> {
auto rpn_rois_lod = rpn_rois->lod().back();
auto gt_classes_lod = gt_classes->lod().back();
auto is_crowd_lod = is_crowd->lod().back();
auto gt_boxes_lod = gt_boxes->lod().back();
for (int i = 0; i < n; ++i) {
Tensor rpn_rois_slice =
rpn_rois->Slice(rpn_rois_lod[i], rpn_rois_lod[i + 1]);
Tensor gt_classes_slice =
gt_classes->Slice(gt_classes_lod[i], gt_classes_lod[i + 1]);
Tensor is_crowd_slice =
is_crowd->Slice(is_crowd_lod[i], is_crowd_lod[i + 1]);
Tensor gt_boxes_slice =
gt_boxes->Slice(gt_boxes_lod[i], gt_boxes_lod[i + 1]);
Tensor im_scales_slice = im_scales->Slice(i, i + 1);
Tensor im_info_slice = im_info->Slice(i, i + 1);
std::vector<Tensor> tensor_output = SampleRoisForOneImage<T>(
dev_ctx, &rpn_rois_slice, &gt_classes_slice, &gt_boxes_slice,
&im_scales_slice, batch_size_per_im, fg_fraction, fg_thresh,
bg_thresh_hi, bg_thresh_lo, bbox_reg_weights, class_nums, engine);
dev_ctx, &rpn_rois_slice, &gt_classes_slice, &is_crowd_slice,
&gt_boxes_slice, &im_info_slice, batch_size_per_im, fg_fraction,
fg_thresh, bg_thresh_hi, bg_thresh_lo, bbox_reg_weights, class_nums,
engine, use_random);
Tensor sampled_rois = tensor_output[0];
Tensor sampled_labels_int32 = tensor_output[1];
Tensor sampled_bbox_targets = tensor_output[2];
......@@ -442,7 +429,7 @@ class GenerateProposalLabelsKernel : public framework::OpKernel<T> {
bbox_inside_weights->set_lod(lod);
bbox_outside_weights->set_lod(lod);
rois->Resize({num_rois, kBoxDim});
labels_int32->Resize({num_rois});
labels_int32->Resize({num_rois, 1});
bbox_targets->Resize({num_rois, kBoxDim * class_nums});
bbox_inside_weights->Resize({num_rois, kBoxDim * class_nums});
bbox_outside_weights->Resize({num_rois, kBoxDim * class_nums});
......@@ -455,8 +442,9 @@ class GenerateProposalLabelsOpMaker : public framework::OpProtoAndCheckerMaker {
// TODO(buxingyuan): Add Document
AddInput("RpnRois", "RpnRois.");
AddInput("GtClasses", "GtClasses.");
AddInput("IsCrowd", "IsCrowd.");
AddInput("GtBoxes", "GtBoxes.");
AddInput("ImScales", "ImScales.");
AddInput("ImInfo", "ImInfo.");
AddOutput("Rois", "Rois.");
AddOutput("LabelsInt32", "LabelsInt32.");
......@@ -471,8 +459,7 @@ class GenerateProposalLabelsOpMaker : public framework::OpProtoAndCheckerMaker {
AddAttr<float>("bg_thresh_lo", "bg_thresh_lo");
AddAttr<std::vector<float>>("bbox_reg_weights", "bbox_reg_weights");
AddAttr<int>("class_nums", "class_nums");
AddAttr<bool>("fix_seed", "fix_seed").SetDefault(false);
AddAttr<int>("seed", "seed").SetDefault(0);
AddAttr<bool>("use_random", "use_random").SetDefault(true);
AddComment(R"DOC(
Generate Proposals Labels Operator.
......
......@@ -89,12 +89,11 @@ void BoxCoder(const platform::DeviceContext &ctx, Tensor *all_anchors,
}
for (int64_t i = 0; i < row; ++i) {
T anchor_width = anchor_data[i * len + 2] - anchor_data[i * len];
T anchor_height = anchor_data[i * len + 3] - anchor_data[i * len + 1];
T anchor_width = anchor_data[i * len + 2] - anchor_data[i * len] + 1.0;
T anchor_height = anchor_data[i * len + 3] - anchor_data[i * len + 1] + 1.0;
T anchor_center_x = (anchor_data[i * len + 2] + anchor_data[i * len]) / 2;
T anchor_center_y =
(anchor_data[i * len + 3] + anchor_data[i * len + 1]) / 2;
T anchor_center_x = anchor_data[i * len] + 0.5 * anchor_width;
T anchor_center_y = anchor_data[i * len + 1] + 0.5 * anchor_height;
T bbox_center_x = 0, bbox_center_y = 0;
T bbox_width = 0, bbox_height = 0;
......@@ -106,25 +105,31 @@ void BoxCoder(const platform::DeviceContext &ctx, Tensor *all_anchors,
bbox_center_y = variances_data[i * len + 1] *
bbox_deltas_data[i * len + 1] * anchor_height +
anchor_center_y;
bbox_width = std::exp(variances_data[i * len + 2] *
bbox_deltas_data[i * len + 2]) *
bbox_width = std::exp(std::min<T>(variances_data[i * len + 2] *
bbox_deltas_data[i * len + 2],
std::log(1000.0 / 16.0))) *
anchor_width;
bbox_height = std::exp(variances_data[i * len + 3] *
bbox_deltas_data[i * len + 3]) *
bbox_height = std::exp(std::min<T>(variances_data[i * len + 3] *
bbox_deltas_data[i * len + 3],
std::log(1000.0 / 16.0))) *
anchor_height;
} else {
bbox_center_x =
bbox_deltas_data[i * len] * anchor_width + anchor_center_x;
bbox_center_y =
bbox_deltas_data[i * len + 1] * anchor_height + anchor_center_y;
bbox_width = std::exp(bbox_deltas_data[i * len + 2]) * anchor_width;
bbox_height = std::exp(bbox_deltas_data[i * len + 3]) * anchor_height;
bbox_width = std::exp(std::min<T>(bbox_deltas_data[i * len + 2],
std::log(1000.0 / 16.0))) *
anchor_width;
bbox_height = std::exp(std::min<T>(bbox_deltas_data[i * len + 3],
std::log(1000.0 / 16.0))) *
anchor_height;
}
proposals_data[i * len] = bbox_center_x - bbox_width / 2;
proposals_data[i * len + 1] = bbox_center_y - bbox_height / 2;
proposals_data[i * len + 2] = bbox_center_x + bbox_width / 2;
proposals_data[i * len + 3] = bbox_center_y + bbox_height / 2;
proposals_data[i * len + 2] = bbox_center_x + bbox_width / 2 - 1;
proposals_data[i * len + 3] = bbox_center_y + bbox_height / 2 - 1;
}
// return proposals;
}
......@@ -156,18 +161,23 @@ void FilterBoxes(const platform::DeviceContext &ctx, Tensor *boxes,
float min_size, const Tensor &im_info, Tensor *keep) {
const T *im_info_data = im_info.data<T>();
T *boxes_data = boxes->mutable_data<T>(ctx.GetPlace());
min_size *= im_info_data[2];
T im_scale = im_info_data[2];
keep->Resize({boxes->dims()[0], 1});
min_size = std::max(min_size, 1.0f);
int *keep_data = keep->mutable_data<int>(ctx.GetPlace());
int keep_len = 0;
for (int i = 0; i < boxes->dims()[0]; ++i) {
T ws = boxes_data[4 * i + 2] - boxes_data[4 * i] + 1;
T hs = boxes_data[4 * i + 3] - boxes_data[4 * i + 1] + 1;
T ws_origin_scale =
(boxes_data[4 * i + 2] - boxes_data[4 * i]) / im_scale + 1;
T hs_origin_scale =
(boxes_data[4 * i + 3] - boxes_data[4 * i + 1]) / im_scale + 1;
T x_ctr = boxes_data[4 * i] + ws / 2;
T y_ctr = boxes_data[4 * i + 1] + hs / 2;
if (ws >= min_size && hs >= min_size && x_ctr <= im_info_data[1] &&
y_ctr <= im_info_data[0]) {
if (ws_origin_scale >= min_size && hs_origin_scale >= min_size &&
x_ctr <= im_info_data[1] && y_ctr <= im_info_data[0]) {
keep_data[keep_len++] = i;
}
}
......@@ -218,8 +228,8 @@ T JaccardOverlap(const T *box1, const T *box2, const bool normalized) {
const T inter_ymin = std::max(box1[1], box2[1]);
const T inter_xmax = std::min(box1[2], box2[2]);
const T inter_ymax = std::min(box1[3], box2[3]);
const T inter_w = inter_xmax - inter_xmin;
const T inter_h = inter_ymax - inter_ymin;
const T inter_w = std::max(0.0f, inter_xmax - inter_xmin + 1);
const T inter_h = std::max(0.0f, inter_ymax - inter_ymin + 1);
const T inter_area = inter_w * inter_h;
const T bbox1_area = BBoxArea<T>(box1, normalized);
const T bbox2_area = BBoxArea<T>(box2, normalized);
......
......@@ -20,6 +20,7 @@ if(WITH_GRPC)
DEPS grpc++_unsecure grpc_unsecure gpr cares zlib protobuf sendrecvop_grpc scope profiler math_function SERIAL)
cc_test(rpc_server_test SRCS rpc_server_test.cc
DEPS sendrecvop_grpc grpc++_unsecure grpc_unsecure gpr cares zlib protobuf executor proto_desc lookup_sparse_table_op SERIAL)
cc_test(varhandle_test SRCS varhandle_test.cc)
return()
endif()
......
......@@ -59,40 +59,32 @@ GRPCClient::~GRPCClient() {
}
channels_.clear();
}
client_thread_->join();
}
bool GRPCClient::AsyncSendVar(const std::string& ep,
const platform::DeviceContext& ctx,
const framework::Scope& scope,
const std::string& var_name, int64_t time_out) {
VarHandlePtr GRPCClient::AsyncSendVar(const std::string& ep,
const platform::DeviceContext& ctx,
const framework::Scope& scope,
const std::string& var_name,
int64_t time_out) {
const platform::DeviceContext* p_ctx = &ctx;
const std::string ep_val = ep;
const std::string var_name_val = var_name;
const framework::Scope* p_scope = &scope;
const auto ch = GetChannel(ep_val);
SendProcessor* s = new SendProcessor(ch);
VarHandlePtr h(new VarHandle(ep, "Send", var_name_val, p_ctx, p_scope));
s->Prepare(h, time_out);
framework::AsyncIO([var_name_val, p_ctx, ep_val, p_scope, time_out, ch,
this] {
framework::AsyncIO([var_name_val, p_scope, p_ctx, s, this] {
auto* var = p_scope->FindVar(var_name_val);
::grpc::ByteBuffer req;
SerializeToByteBuffer(var_name_val, var, *p_ctx, &req);
// varhandle
VarHandle var_h;
var_h.ep = ep_val;
var_h.scope = p_scope;
var_h.name = var_name_val;
var_h.ctx = p_ctx;
var_h.method = "Send";
VLOG(3) << var_h.String() << " begin";
VLOG(3) << s->GetVarHandlePtr()->String() << " begin";
// stub context
SendProcessor* s = new SendProcessor(ch);
s->Prepare(var_h, time_out);
s->response_call_back_ = nullptr;
auto call = s->stub_g_.PrepareUnaryCall(
......@@ -102,13 +94,13 @@ bool GRPCClient::AsyncSendVar(const std::string& ep,
});
req_count_++;
return true;
return h;
}
void ProcGetResponse(const VarHandle& var_h,
const ::grpc::ByteBuffer& ret_msg) {
framework::Variable* outvar = nullptr;
DeserializeFromByteBuffer(ret_msg, *var_h.ctx, var_h.scope, &outvar);
DeserializeFromByteBuffer(ret_msg, *var_h.ctx(), var_h.scope(), &outvar);
}
template <typename T>
......@@ -119,37 +111,30 @@ void RequestToByteBuffer(const T& proto, ::grpc::ByteBuffer* result) {
result->Swap(&tmp);
}
bool GRPCClient::AsyncGetVar(const std::string& ep,
const platform::DeviceContext& ctx,
const framework::Scope& scope,
const std::string& var_name, int64_t time_out) {
VarHandlePtr GRPCClient::AsyncGetVar(const std::string& ep,
const platform::DeviceContext& ctx,
const framework::Scope& scope,
const std::string& var_name,
int64_t time_out) {
const platform::DeviceContext* p_ctx = &ctx;
const std::string ep_val = ep;
const std::string var_name_val = var_name;
const framework::Scope* p_scope = &scope;
const auto ch = GetChannel(ep_val);
GetProcessor* s = new GetProcessor(ch);
VarHandlePtr h(new VarHandle(ep, "Get", var_name_val, p_ctx, p_scope));
s->Prepare(h, time_out);
framework::AsyncIO([var_name_val, ep_val, p_scope, p_ctx, time_out, ch,
this] {
framework::AsyncIO([var_name_val, p_scope, p_ctx, s, this] {
// prepare input
sendrecv::VariableMessage req;
req.set_varname(var_name_val);
::grpc::ByteBuffer buf;
RequestToByteBuffer<sendrecv::VariableMessage>(req, &buf);
// var handle
VarHandle var_h;
var_h.ep = ep_val;
var_h.scope = p_scope;
var_h.name = var_name_val;
var_h.ctx = p_ctx;
var_h.method = "Get";
VLOG(3) << var_h.String() << " begin";
VLOG(3) << s->GetVarHandlePtr()->String() << " begin";
// stub context
GetProcessor* s = new GetProcessor(ch);
s->Prepare(var_h, time_out);
s->response_call_back_ = ProcGetResponse;
auto call = s->stub_g_.PrepareUnaryCall(
......@@ -160,42 +145,36 @@ bool GRPCClient::AsyncGetVar(const std::string& ep,
req_count_++;
return true;
return h;
}
bool GRPCClient::AsyncPrefetchVar(const std::string& ep,
const platform::DeviceContext& ctx,
const framework::Scope& scope,
const std::string& in_var_name,
const std::string& out_var_name,
int64_t time_out) {
VarHandlePtr GRPCClient::AsyncPrefetchVar(const std::string& ep,
const platform::DeviceContext& ctx,
const framework::Scope& scope,
const std::string& in_var_name,
const std::string& out_var_name,
int64_t time_out) {
const platform::DeviceContext* p_ctx = &ctx;
const std::string ep_val = ep;
const std::string in_var_name_val = in_var_name;
const std::string out_var_name_val = out_var_name;
const framework::Scope* p_scope = &scope;
const auto ch = GetChannel(ep_val);
GetProcessor* s = new GetProcessor(ch);
VarHandlePtr h(
new VarHandle(ep, "Prefetch", out_var_name_val, p_ctx, p_scope));
s->Prepare(h, time_out);
framework::AsyncIO([in_var_name_val, out_var_name_val, ep_val, p_scope, p_ctx,
time_out, ch, this] {
time_out, s, this] {
auto* var = p_scope->FindVar(in_var_name_val);
::grpc::ByteBuffer req;
SerializeToByteBuffer(in_var_name_val, var, *p_ctx, &req, out_var_name_val);
// var handle
VarHandle var_h;
var_h.ep = ep_val;
var_h.scope = p_scope;
var_h.name = out_var_name_val;
var_h.ctx = p_ctx;
var_h.method = "Prefetch";
VLOG(3) << var_h.String() << " begin";
VLOG(3) << s->GetVarHandlePtr()->String() << " begin";
// stub context
GetProcessor* s = new GetProcessor(ch);
s->Prepare(var_h, time_out);
s->response_call_back_ = ProcGetResponse;
auto call = s->stub_g_.PrepareUnaryCall(
......@@ -206,56 +185,68 @@ bool GRPCClient::AsyncPrefetchVar(const std::string& ep,
});
req_count_++;
return true;
return h;
}
void GRPCClient::AsyncSendBatchBarrier(const std::string& ep,
int64_t time_out) {
VarHandlePtr GRPCClient::AsyncSendBatchBarrier(const std::string& ep,
int64_t time_out) {
const auto ch = GetChannel(ep);
BatchBarrierProcessor* s = new BatchBarrierProcessor(ch);
s->Prepare(time_out);
VarHandlePtr h(new VarHandle(ep, "BatchBarrier", BATCH_BARRIER_MESSAGE,
nullptr, nullptr));
s->Prepare(h, time_out);
sendrecv::VariableMessage req;
req.set_varname(BATCH_BARRIER_MESSAGE);
auto rpc = s->stub_->AsyncSendVariable(s->context_.get(), req, &cq_);
rpc->Finish(&s->reply_, &s->status_, reinterpret_cast<void*>(s));
req_count_++;
return h;
}
void GRPCClient::AsyncSendFetchBarrier(const std::string& ep,
int64_t time_out) {
VarHandlePtr GRPCClient::AsyncSendFetchBarrier(const std::string& ep,
int64_t time_out) {
const auto ch = GetChannel(ep);
FetchBarrierProcessor* s = new FetchBarrierProcessor(ch);
s->Prepare(time_out);
VarHandlePtr h(new VarHandle(ep, "FetchBarrier", FETCH_BARRIER_MESSAGE,
nullptr, nullptr));
s->Prepare(h, time_out);
sendrecv::VariableMessage req;
req.set_varname(FETCH_BARRIER_MESSAGE);
auto rpc = s->stub_->AsyncGetVariable(s->context_.get(), req, &cq_);
rpc->Finish(&s->reply_, &s->status_, reinterpret_cast<void*>(s));
req_count_++;
return h;
}
void GRPCClient::AsyncSendComplete(const std::string& ep, int64_t time_out) {
VarHandlePtr GRPCClient::AsyncSendComplete(const std::string& ep,
int64_t time_out) {
const auto ch = GetChannel(ep);
BatchBarrierProcessor* s = new BatchBarrierProcessor(ch);
s->Prepare(time_out);
VarHandlePtr h(
new VarHandle(ep, "SendComplete", COMPLETE_MESSAGE, nullptr, nullptr));
s->Prepare(h, time_out);
sendrecv::VariableMessage req;
req.set_varname(COMPLETE_MESSAGE);
auto rpc = s->stub_->AsyncSendVariable(s->context_.get(), req, &cq_);
rpc->Finish(&s->reply_, &s->status_, reinterpret_cast<void*>(s));
req_count_++;
return h;
}
void GRPCClient::AsyncCheckpointNotify(const std::string& ep,
const std::string& dir,
int64_t time_out) {
VarHandlePtr GRPCClient::AsyncCheckpointNotify(const std::string& ep,
const std::string& dir,
int64_t time_out) {
const auto ch = GetChannel(ep);
CheckpointNotifyProcessor* s = new CheckpointNotifyProcessor(ch);
s->Prepare(time_out);
VarHandlePtr h(new VarHandle(ep, "CheckPointNotify", CHECKPOINT_SAVE_MESSAGE,
nullptr, nullptr));
s->Prepare(h, time_out);
sendrecv::VariableMessage req;
req.set_varname(CHECKPOINT_SAVE_MESSAGE);
......@@ -264,6 +255,7 @@ void GRPCClient::AsyncCheckpointNotify(const std::string& ep,
auto rpc = s->stub_->AsyncCheckpointNotify(s->context_.get(), req, &cq_);
rpc->Finish(&s->reply_, &s->status_, reinterpret_cast<void*>(s));
req_count_++;
return h;
}
bool GRPCClient::Wait() {
......@@ -276,32 +268,42 @@ void GRPCClient::Proceed() {
void* tag = nullptr;
bool ok = false;
VLOG(3) << "GRPCClient Proceed begin";
while (!stopped_ && cq_.Next(&tag, &ok)) {
BaseProcessor* c = static_cast<BaseProcessor*>(tag);
GPR_ASSERT(ok);
PADDLE_ENFORCE(c);
if (c->status_.ok()) {
VLOG(3) << c->var_h_.String() << " process";
VLOG(3) << c->GetVarHandlePtr()->String() << " process";
c->Process();
} else if (c->status_.error_code() == grpc::StatusCode::DEADLINE_EXCEEDED) {
LOG(ERROR) << c->var_h_.String()
LOG(ERROR) << c->GetVarHandlePtr()->String()
<< " meets grpc error:" << c->status_.error_message();
{
std::lock_guard<std::mutex> lk(sync_mutex_);
ok_ = false;
}
sync_cond_.notify_all();
c->Finish(false);
} else {
LOG(FATAL) << c->var_h_.String()
LOG(FATAL) << c->GetVarHandlePtr()->String()
<< " meets grpc error:" << c->status_.error_message();
c->Finish(false);
}
delete c;
bool notify = false;
{
std::lock_guard<std::mutex> lk(sync_mutex_);
req_count_--;
notify = (req_count_ <= 0 || !c->status_.ok());
}
delete c;
if (notify) {
sync_cond_.notify_all();
}
sync_cond_.notify_all();
}
VLOG(3) << "GRPCClient Proceed end";
}
std::shared_ptr<grpc::Channel> GRPCClient::GetChannel(const std::string& ep) {
......
......@@ -53,15 +53,14 @@ void ProcGetResponse(const VarHandle& var_h, const grpc::ByteBuffer& msg);
class BaseProcessor {
public:
explicit BaseProcessor(std::shared_ptr<grpc::Channel> ch) {
context_ = nullptr;
}
BaseProcessor() { context_ = nullptr; }
virtual ~BaseProcessor() {}
virtual void Prepare(const VarHandle& var_info, int64_t time_out) {
virtual void Prepare(VarHandlePtr h, int64_t time_out) {
var_h_ = h;
context_.reset(new grpc::ClientContext());
var_h_ = var_info;
context_->set_wait_for_ready(true);
if (time_out) {
std::chrono::system_clock::time_point deadline =
......@@ -71,21 +70,21 @@ class BaseProcessor {
}
}
virtual void Prepare(int64_t time_out) {
context_.reset(new grpc::ClientContext());
context_->set_wait_for_ready(true);
std::chrono::system_clock::time_point deadline =
std::chrono::system_clock::now() + std::chrono::milliseconds(time_out);
context_->set_deadline(deadline);
void Process() {
ProcessImpl();
var_h_->Finish(true);
}
virtual void Process() = 0;
VarHandlePtr GetVarHandlePtr() { return var_h_; }
bool Wait() { return var_h_->Wait(); }
void Finish(bool ok) { return var_h_->Finish(ok); }
virtual void ProcessImpl() = 0;
std::unique_ptr<grpc::ClientContext> context_;
grpc::Status status_;
VarHandle var_h_;
protected:
VarHandlePtr var_h_;
};
typedef std::function<void(const VarHandle&, const ::grpc::ByteBuffer&)>
......@@ -94,13 +93,13 @@ typedef std::function<void(const VarHandle&, const ::grpc::ByteBuffer&)>
class SendProcessor : public BaseProcessor {
public:
explicit SendProcessor(std::shared_ptr<grpc::Channel> ch)
: BaseProcessor(ch), stub_g_(ch) {}
: BaseProcessor(), stub_g_(ch) {}
virtual ~SendProcessor() {}
virtual void Process() {
void ProcessImpl() override {
if (response_call_back_) {
response_call_back_(var_h_, reply_);
response_call_back_(*var_h_.get(), reply_);
}
}
......@@ -115,13 +114,13 @@ typedef std::function<void(const VarHandle&, const ::grpc::ByteBuffer&)>
class GetProcessor : public BaseProcessor {
public:
explicit GetProcessor(std::shared_ptr<grpc::Channel> ch)
: BaseProcessor(ch), stub_g_(ch) {}
: BaseProcessor(), stub_g_(ch) {}
virtual ~GetProcessor() {}
virtual void Process() {
void ProcessImpl() override {
if (response_call_back_) {
response_call_back_(var_h_, reply_);
response_call_back_(*var_h_.get(), reply_);
}
}
......@@ -133,13 +132,13 @@ class GetProcessor : public BaseProcessor {
class BatchBarrierProcessor : public BaseProcessor {
public:
explicit BatchBarrierProcessor(std::shared_ptr<grpc::Channel> ch)
: BaseProcessor(ch) {
: BaseProcessor() {
stub_ = sendrecv::SendRecvService::NewStub(ch);
}
virtual ~BatchBarrierProcessor() {}
virtual void Process() {}
void ProcessImpl() override {}
sendrecv::VoidMessage reply_;
std::unique_ptr<sendrecv::SendRecvService::Stub> stub_;
};
......@@ -147,13 +146,13 @@ class BatchBarrierProcessor : public BaseProcessor {
class FetchBarrierProcessor : public BaseProcessor {
public:
explicit FetchBarrierProcessor(std::shared_ptr<grpc::Channel> ch)
: BaseProcessor(ch) {
: BaseProcessor() {
stub_ = sendrecv::SendRecvService::NewStub(ch);
}
virtual ~FetchBarrierProcessor() {}
virtual void Process() {}
void ProcessImpl() override {}
sendrecv::VariableMessage reply_;
std::unique_ptr<sendrecv::SendRecvService::Stub> stub_;
};
......@@ -161,13 +160,13 @@ class FetchBarrierProcessor : public BaseProcessor {
class CheckpointNotifyProcessor : public BaseProcessor {
public:
explicit CheckpointNotifyProcessor(std::shared_ptr<grpc::Channel> ch)
: BaseProcessor(ch) {
: BaseProcessor() {
stub_ = sendrecv::SendRecvService::NewStub(ch);
}
virtual ~CheckpointNotifyProcessor() {}
virtual void Process() {}
void ProcessImpl() override {}
sendrecv::VoidMessage reply_;
std::unique_ptr<sendrecv::SendRecvService::Stub> stub_;
};
......@@ -177,32 +176,37 @@ class GRPCClient : public RPCClient {
GRPCClient() : ok_(true), completed_(false), stopped_(false) {}
virtual ~GRPCClient();
bool AsyncSendVar(const std::string& ep, const platform::DeviceContext& ctx,
const framework::Scope& scope, const std::string& var_name,
int64_t time_out = FLAGS_rpc_deadline) override;
bool AsyncGetVar(const std::string& ep, const platform::DeviceContext& ctx,
const framework::Scope& scope, const std::string& var_name,
int64_t time_out = FLAGS_rpc_deadline) override;
bool AsyncPrefetchVar(const std::string& ep,
const platform::DeviceContext& ctx,
const framework::Scope& scope,
const std::string& in_var_name,
const std::string& out_var_name,
int64_t time_out = FLAGS_rpc_deadline) override;
void AsyncSendBatchBarrier(const std::string& ep,
int64_t time_out = FLAGS_rpc_deadline) override;
void AsyncSendFetchBarrier(const std::string& ep,
int64_t time_out = FLAGS_rpc_deadline) override;
void AsyncCheckpointNotify(const std::string& ep, const std::string& dir,
int64_t time_out = FLAGS_rpc_deadline) override;
void AsyncSendComplete(const std::string& ep,
int64_t time_out = FLAGS_rpc_deadline) override;
VarHandlePtr AsyncSendVar(const std::string& ep,
const platform::DeviceContext& ctx,
const framework::Scope& scope,
const std::string& var_name,
int64_t time_out = FLAGS_rpc_deadline) override;
VarHandlePtr AsyncGetVar(const std::string& ep,
const platform::DeviceContext& ctx,
const framework::Scope& scope,
const std::string& var_name,
int64_t time_out = FLAGS_rpc_deadline) override;
VarHandlePtr AsyncPrefetchVar(const std::string& ep,
const platform::DeviceContext& ctx,
const framework::Scope& scope,
const std::string& in_var_name,
const std::string& out_var_name,
int64_t time_out = FLAGS_rpc_deadline) override;
VarHandlePtr AsyncSendBatchBarrier(
const std::string& ep, int64_t time_out = FLAGS_rpc_deadline) override;
VarHandlePtr AsyncSendFetchBarrier(
const std::string& ep, int64_t time_out = FLAGS_rpc_deadline) override;
VarHandlePtr AsyncCheckpointNotify(
const std::string& ep, const std::string& dir,
int64_t time_out = FLAGS_rpc_deadline) override;
VarHandlePtr AsyncSendComplete(
const std::string& ep, int64_t time_out = FLAGS_rpc_deadline) override;
bool Wait() override;
......
......@@ -67,24 +67,11 @@ bool RequestSendHandler::Handle(const std::string& varname,
LOG(FATAL) << "sync: Can not find server side var: " << varname;
return false;
}
if (invar->IsType<framework::SelectedRows>()) {
std::unique_lock<std::mutex> lock(mutex_sparse_vars_);
sparse_vars_.push_back(invar);
}
}
}
return true;
}
void RequestSendHandler::ResetSparseVarRecorder() {
std::unique_lock<std::mutex> lock(mutex_sparse_vars_);
for (auto* var : sparse_vars_) {
var->GetMutable<framework::SelectedRows>()->mutable_rows()->clear();
}
sparse_vars_.clear();
}
bool RequestGetHandler::Handle(const std::string& varname,
framework::Scope* scope,
framework::Variable* invar,
......
......@@ -41,11 +41,6 @@ class RequestSendHandler final : public RequestHandler {
bool Handle(const std::string& varname, framework::Scope* scope,
framework::Variable* var, framework::Variable** outvar,
const std::string& out_var_name = "") override;
void ResetSparseVarRecorder();
private:
std::mutex mutex_sparse_vars_;
std::vector<framework::Variable*> sparse_vars_;
};
class RequestGetHandler final : public RequestHandler {
......
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册