“72061b0ac0a135e40eb811278e9ad9b8cac48168”上不存在“python/paddle/fluid/tests/unittests/test_fleet_runtime.py”
提交 fd9dc75f 编写于 作者: D dzhwinter

Merge remote-tracking branch 'origin/develop' into memory/stable

...@@ -19,7 +19,7 @@ Our vision is to enable deep learning for everyone via PaddlePaddle. ...@@ -19,7 +19,7 @@ Our vision is to enable deep learning for everyone via PaddlePaddle.
Please refer to our [release announcement](https://github.com/PaddlePaddle/Paddle/releases) to track the latest feature of PaddlePaddle. Please refer to our [release announcement](https://github.com/PaddlePaddle/Paddle/releases) to track the latest feature of PaddlePaddle.
### Latest PaddlePaddle Release: [Fluid 0.14.0](https://github.com/PaddlePaddle/Paddle/tree/v0.14.0) ### Latest PaddlePaddle Release: [Fluid 0.15.0](https://github.com/PaddlePaddle/Paddle/tree/v0.15.0)
### Install Latest Stable Release: ### Install Latest Stable Release:
``` ```
# Linux CPU # Linux CPU
...@@ -76,26 +76,26 @@ pip install paddlepaddle-gpu==0.14.0.post85 ...@@ -76,26 +76,26 @@ pip install paddlepaddle-gpu==0.14.0.post85
## Installation ## Installation
It is recommended to read [this doc](http://paddlepaddle.org/documentation/docs/zh/0.14.0/new_docs/beginners_guide/install/install_doc.html) on our website. It is recommended to read [this doc](http://paddlepaddle.org/documentation/docs/zh/0.15.0/new_docs/beginners_guide/install/install_doc.html) on our website.
## Documentation ## Documentation
We provide [English](http://paddlepaddle.org/documentation/docs/en/0.14.0/getstarted/index_en.html) and We provide [English](http://paddlepaddle.org/documentation/docs/en/0.15.0/getstarted/index_en.html) and
[Chinese](http://paddlepaddle.org/documentation/docs/zh/0.14.0/new_docs/beginners_guide/index.html) documentation. [Chinese](http://paddlepaddle.org/documentation/docs/zh/0.15.0/new_docs/beginners_guide/index.html) documentation.
- [Deep Learning 101](https://github.com/PaddlePaddle/book) - [Deep Learning 101](https://github.com/PaddlePaddle/book)
You might want to start from this online interactive book that can run in a Jupyter Notebook. You might want to start from this online interactive book that can run in a Jupyter Notebook.
- [Distributed Training](http://paddlepaddle.org/documentation/docs/zh/0.14.0/new_docs/user_guides/howto/training/cluster_howto.html) - [Distributed Training](http://paddlepaddle.org/documentation/docs/zh/0.15.0/new_docs/user_guides/howto/training/cluster_howto.html)
You can run distributed training jobs on MPI clusters. You can run distributed training jobs on MPI clusters.
- [Python API](http://paddlepaddle.org/documentation/api/zh/0.14.0/fluid.html) - [Python API](http://paddlepaddle.org/documentation/api/zh/0.15.0/fluid.html)
Our new API enables much shorter programs. Our new API enables much shorter programs.
- [How to Contribute](http://paddlepaddle.org/documentation/docs/zh/0.14.0/new_docs/advanced_usage/development/contribute_to_paddle.html) - [How to Contribute](http://paddlepaddle.org/documentation/docs/zh/0.15.0/new_docs/advanced_usage/development/contribute_to_paddle.html)
We appreciate your contributions! We appreciate your contributions!
......
...@@ -140,5 +140,11 @@ def parse_args(): ...@@ -140,5 +140,11 @@ def parse_args():
'--use_lars', '--use_lars',
action='store_true', action='store_true',
help='If set, use lars for optimizers, ONLY support resnet module.') help='If set, use lars for optimizers, ONLY support resnet module.')
parser.add_argument(
'--reduce_strategy',
type=str,
choices=['reduce', 'all_reduce'],
default='all_reduce',
help='Specify the reduce strategy, can be reduce, all_reduce')
args = parser.parse_args() args = parser.parse_args()
return args return args
...@@ -91,7 +91,8 @@ def dist_transpile(trainer_id, args, train_prog, startup_prog): ...@@ -91,7 +91,8 @@ def dist_transpile(trainer_id, args, train_prog, startup_prog):
program=train_prog, program=train_prog,
pservers=pserver_endpoints, pservers=pserver_endpoints,
trainers=trainers, trainers=trainers,
sync_mode=not args.async_mode) sync_mode=not args.async_mode,
startup_program=startup_prog)
if training_role == "PSERVER": if training_role == "PSERVER":
pserver_program = t.get_pserver_program(current_endpoint) pserver_program = t.get_pserver_program(current_endpoint)
pserver_startup_program = t.get_startup_program( pserver_startup_program = t.get_startup_program(
...@@ -169,6 +170,14 @@ def train_parallel(train_args, test_args, args, train_prog, test_prog, ...@@ -169,6 +170,14 @@ def train_parallel(train_args, test_args, args, train_prog, test_prog,
strategy = fluid.ExecutionStrategy() strategy = fluid.ExecutionStrategy()
strategy.num_threads = args.cpus strategy.num_threads = args.cpus
strategy.allow_op_delay = False strategy.allow_op_delay = False
build_strategy = fluid.BuildStrategy()
if args.reduce_strategy == "reduce":
build_strategy.reduce_strategy = fluid.BuildStrategy(
).ReduceStrategy.Reduce
else:
build_strategy.reduce_strategy = fluid.BuildStrategy(
).ReduceStrategy.AllReduce
avg_loss = train_args[0] avg_loss = train_args[0]
if args.update_method == "pserver": if args.update_method == "pserver":
...@@ -183,6 +192,7 @@ def train_parallel(train_args, test_args, args, train_prog, test_prog, ...@@ -183,6 +192,7 @@ def train_parallel(train_args, test_args, args, train_prog, test_prog,
avg_loss.name, avg_loss.name,
main_program=train_prog, main_program=train_prog,
exec_strategy=strategy, exec_strategy=strategy,
build_strategy=build_strategy,
num_trainers=num_trainers, num_trainers=num_trainers,
trainer_id=trainer_id) trainer_id=trainer_id)
......
...@@ -67,11 +67,14 @@ def cnn_model(data): ...@@ -67,11 +67,14 @@ def cnn_model(data):
def get_model(args, is_train, main_prog, startup_prog): def get_model(args, is_train, main_prog, startup_prog):
# NOTE: mnist is small, we don't implement data sharding yet. # NOTE: mnist is small, we don't implement data sharding yet.
filelist = [ opt = None
os.path.join(args.data_path, f) for f in os.listdir(args.data_path) data_file_handle = None
]
with fluid.program_guard(main_prog, startup_prog): with fluid.program_guard(main_prog, startup_prog):
if args.use_reader_op: if args.use_reader_op:
filelist = [
os.path.join(args.data_path, f)
for f in os.listdir(args.data_path)
]
data_file_handle = fluid.layers.open_files( data_file_handle = fluid.layers.open_files(
filenames=filelist, filenames=filelist,
shapes=[[-1, 1, 28, 28], (-1, 1)], shapes=[[-1, 1, 28, 28], (-1, 1)],
...@@ -100,7 +103,7 @@ def get_model(args, is_train, main_prog, startup_prog): ...@@ -100,7 +103,7 @@ def get_model(args, is_train, main_prog, startup_prog):
if is_train: if is_train:
opt = fluid.optimizer.AdamOptimizer( opt = fluid.optimizer.AdamOptimizer(
learning_rate=0.001, beta1=0.9, beta2=0.999) learning_rate=0.001, beta1=0.9, beta2=0.999)
opt.minimize() opt.minimize(avg_cost)
if args.memory_optimize: if args.memory_optimize:
fluid.memory_optimize(main_prog) fluid.memory_optimize(main_prog)
......
...@@ -20,6 +20,7 @@ import functools ...@@ -20,6 +20,7 @@ import functools
import numpy as np import numpy as np
import time import time
import os import os
import math
import cProfile, pstats, StringIO import cProfile, pstats, StringIO
...@@ -27,128 +28,120 @@ import paddle ...@@ -27,128 +28,120 @@ import paddle
import paddle.fluid as fluid import paddle.fluid as fluid
import paddle.fluid.core as core import paddle.fluid.core as core
import paddle.fluid.profiler as profiler import paddle.fluid.profiler as profiler
# from recordio_converter import imagenet_train, imagenet_test
from imagenet_reader import train, val from imagenet_reader import train, val
# Module-level configuration for the ResNet benchmark model; ResNet.__init__
# keeps a reference to this dict as ``self.params``.
train_parameters = {
    # NOTE(review): presumably the per-image input shape in CHW order -- confirm.
    "input_size": [3, 224, 224],
    # NOTE(review): these look like per-channel normalization mean/std values;
    # confirm against the data reader before relying on them.
    "input_mean": [0.485, 0.456, 0.406],
    "input_std": [0.229, 0.224, 0.225],
    "learning_strategy": {
        "name": "piecewise_decay",
        "batch_size": 256,
        "epochs": [30, 60, 90],
        # NOTE(review): four values for three epoch boundaries; presumably one
        # multiplier per interval of the piecewise schedule -- confirm.
        "steps": [0.1, 0.01, 0.001, 0.0001],
    },
}
class ResNet():
def __init__(self, layers=50, is_train=True):
    """Record the requested depth, the train/test flag and the shared config.

    Args:
        layers: Network depth selector; ``net`` accepts 50, 101 or 152.
        is_train: Stored as ``self.is_train``; ``conv_bn_layer`` passes its
            negation to ``batch_norm`` as ``is_test``.
    """
    # The module-level dict is shared by reference, not copied.
    self.params = train_parameters
    self.is_train = is_train
    self.layers = layers
def net(self, input, class_dim=1000):
    """Build the ResNet graph on top of ``input`` and return the classifier output.

    The graph is: a 7x7 stride-2 conv+BN stem, a 3x3 stride-2 max pool, four
    stages of ``bottleneck_block`` calls, a global average pool and a final
    fully connected layer with softmax activation.

    Args:
        input: Variable fed to the first convolution.
            NOTE(review): ``shortcut`` reads channels from ``shape[1]``, so an
            NCHW layout is presumably expected -- confirm.
        class_dim: Output size of the final fully connected layer.

    Returns:
        The result of the final ``fluid.layers.fc`` call.

    Raises:
        AssertionError: If ``self.layers`` is not one of 50, 101 or 152.
    """
    layers = self.layers
    supported_layers = [50, 101, 152]
    assert layers in supported_layers, \
        "supported layers are {} but input layer is {}".format(supported_layers, layers)

    # Number of bottleneck blocks in each of the four stages.
    if layers == 50:
        depth = [3, 4, 6, 3]
    elif layers == 101:
        depth = [3, 4, 23, 3]
    elif layers == 152:
        depth = [3, 8, 36, 3]
    num_filters = [64, 128, 256, 512]

    # Stem: conv + batch norm, then max pooling.
    conv = self.conv_bn_layer(
        input=input, num_filters=64, filter_size=7, stride=2, act='relu')
    conv = fluid.layers.pool2d(
        input=conv,
        pool_size=3,
        pool_stride=2,
        pool_padding=1,
        pool_type='max')

    for block, (repeats, filters) in enumerate(zip(depth, num_filters)):
        for i in range(repeats):
            # Only the first block of every stage after the first uses stride 2.
            block_stride = 1 if (i != 0 or block == 0) else 2
            conv = self.bottleneck_block(
                input=conv, num_filters=filters, stride=block_stride)

    pool = fluid.layers.pool2d(
        input=conv, pool_size=7, pool_type='avg', global_pooling=True)

    # Uniform init bound derived from the pooled feature's second dimension.
    stdv = 1.0 / math.sqrt(pool.shape[1] * 1.0)
    fc_initializer = fluid.initializer.Uniform(-stdv, stdv)
    fc_param_attr = fluid.param_attr.ParamAttr(initializer=fc_initializer)
    return fluid.layers.fc(input=pool,
                           size=class_dim,
                           act='softmax',
                           param_attr=fc_param_attr)
def conv_bn_layer(self,
                  input,
                  num_filters,
                  filter_size,
                  stride=1,
                  groups=1,
                  act=None):
    """Apply a bias-free convolution followed by batch normalization.

    Args:
        input: Variable passed to ``fluid.layers.conv2d``.
        num_filters: Number of convolution filters.
        filter_size: Convolution kernel size; padding is derived from it.
        stride: Convolution stride.
        groups: Convolution group count.
        act: Activation handed to ``batch_norm``; the convolution itself is
            always created with ``act=None``.

    Returns:
        The output of ``fluid.layers.batch_norm``.
    """
    padding = (filter_size - 1) // 2
    conv_out = fluid.layers.conv2d(
        input=input,
        num_filters=num_filters,
        filter_size=filter_size,
        stride=stride,
        padding=padding,
        groups=groups,
        act=None,
        bias_attr=False)
    # Batch norm runs in test mode whenever the model was not built for training.
    bn_out = fluid.layers.batch_norm(
        input=conv_out, act=act, is_test=not self.is_train)
    return bn_out
def shortcut(self, input, ch_out, stride):
    """Return the residual branch for a block.

    Args:
        input: Variable whose ``shape[1]`` is compared with ``ch_out``.
        ch_out: Channel count the residual branch must have.
        stride: Stride of the block the branch belongs to.

    Returns:
        ``input`` unchanged when its ``shape[1]`` equals ``ch_out`` and
        ``stride`` is 1; otherwise the result of a 1x1 ``conv_bn_layer``
        called with ``ch_out`` and ``stride``.
    """
    ch_in = input.shape[1]
    if ch_in == ch_out and stride == 1:
        # Shapes already agree: pass the input straight through.
        return input
    return self.conv_bn_layer(input, ch_out, 1, stride)
def conv_bn_layer(input, def bottleneck_block(self, input, num_filters, stride):
ch_out, conv0 = self.conv_bn_layer(
filter_size, input=input, num_filters=num_filters, filter_size=1, act='relu')
stride, conv1 = self.conv_bn_layer(
padding, input=conv0,
act='relu', num_filters=num_filters,
is_train=True): filter_size=3,
conv1 = fluid.layers.conv2d( stride=stride,
input=input, act='relu')
filter_size=filter_size, conv2 = self.conv_bn_layer(
num_filters=ch_out, input=conv1, num_filters=num_filters * 4, filter_size=1, act=None)
stride=stride,
padding=padding,
act=None,
bias_attr=False)
return fluid.layers.batch_norm(input=conv1, act=act, is_test=not is_train)
def shortcut(input, ch_out, stride, is_train=True):
ch_in = input.shape[1] # if args.data_format == 'NCHW' else input.shape[-1]
if ch_in != ch_out:
return conv_bn_layer(
input, ch_out, 1, stride, 0, None, is_train=is_train)
else:
return input
def basicblock(input, ch_out, stride, is_train=True):
short = shortcut(input, ch_out, stride, is_train=is_train)
conv1 = conv_bn_layer(input, ch_out, 3, stride, 1, is_train=is_train)
conv2 = conv_bn_layer(conv1, ch_out, 3, 1, 1, act=None, is_train=is_train)
return fluid.layers.elementwise_add(x=short, y=conv2, act='relu')
def bottleneck(input, ch_out, stride, is_train=True):
short = shortcut(input, ch_out * 4, stride, is_train=is_train)
conv1 = conv_bn_layer(input, ch_out, 1, stride, 0, is_train=is_train)
conv2 = conv_bn_layer(conv1, ch_out, 3, 1, 1, is_train=is_train)
conv3 = conv_bn_layer(
conv2, ch_out * 4, 1, 1, 0, act=None, is_train=is_train)
return fluid.layers.elementwise_add(x=short, y=conv3, act='relu')
def layer_warp(block_func, input, ch_out, count, stride):
res_out = block_func(input, ch_out, stride)
for i in range(1, count):
res_out = block_func(res_out, ch_out, 1)
return res_out
def resnet_imagenet(input, short = self.shortcut(input, num_filters * 4, stride)
class_dim,
depth=50,
data_format='NCHW',
is_train=True):
cfg = { return fluid.layers.elementwise_add(x=short, y=conv2, act='relu')
18: ([2, 2, 2, 1], basicblock),
34: ([3, 4, 6, 3], basicblock),
50: ([3, 4, 6, 3], bottleneck),
101: ([3, 4, 23, 3], bottleneck),
152: ([3, 8, 36, 3], bottleneck)
}
stages, block_func = cfg[depth]
conv1 = conv_bn_layer(input, ch_out=64, filter_size=7, stride=2, padding=3)
pool1 = fluid.layers.pool2d(
input=conv1, pool_type='avg', pool_size=3, pool_stride=2)
res1 = layer_warp(block_func, pool1, 64, stages[0], 1)
res2 = layer_warp(block_func, res1, 128, stages[1], 2)
res3 = layer_warp(block_func, res2, 256, stages[2], 2)
res4 = layer_warp(block_func, res3, 512, stages[3], 2)
pool2 = fluid.layers.pool2d(
input=res4,
pool_size=7,
pool_type='avg',
pool_stride=1,
global_pooling=True)
out = fluid.layers.fc(input=pool2, size=class_dim, act='softmax')
return out
def resnet_cifar10(input, class_dim, depth=32, data_format='NCHW'):
assert (depth - 2) % 6 == 0
n = (depth - 2) // 6
conv1 = conv_bn_layer(
input=input, ch_out=16, filter_size=3, stride=1, padding=1)
res1 = layer_warp(basicblock, conv1, 16, n, 1)
res2 = layer_warp(basicblock, res1, 32, n, 2)
res3 = layer_warp(basicblock, res2, 64, n, 2)
pool = fluid.layers.pool2d(
input=res3, pool_size=8, pool_type='avg', pool_stride=1)
out = fluid.layers.fc(input=pool, size=class_dim, act='softmax')
return out
def _model_reader_dshape_classdim(args, is_train): def _model_reader_dshape_classdim(args, is_train):
model = resnet_cifar10 model = None
reader = None reader = None
if args.data_set == "cifar10": if args.data_set == "flowers":
class_dim = 10
if args.data_format == 'NCHW':
dshape = [3, 32, 32]
else:
dshape = [32, 32, 3]
model = resnet_cifar10
if is_train:
reader = paddle.dataset.cifar.train10()
else:
reader = paddle.dataset.cifar.test10()
elif args.data_set == "flowers":
class_dim = 102 class_dim = 102
if args.data_format == 'NCHW': if args.data_format == 'NCHW':
dshape = [3, 224, 224] dshape = [3, 224, 224]
else: else:
dshape = [224, 224, 3] dshape = [224, 224, 3]
model = resnet_imagenet
if is_train: if is_train:
reader = paddle.dataset.flowers.train() reader = paddle.dataset.flowers.train()
else: else:
...@@ -159,7 +152,6 @@ def _model_reader_dshape_classdim(args, is_train): ...@@ -159,7 +152,6 @@ def _model_reader_dshape_classdim(args, is_train):
dshape = [3, 224, 224] dshape = [3, 224, 224]
else: else:
dshape = [224, 224, 3] dshape = [224, 224, 3]
model = resnet_imagenet
if not args.data_path: if not args.data_path:
raise Exception( raise Exception(
"Must specify --data_path when training with imagenet") "Must specify --data_path when training with imagenet")
...@@ -173,12 +165,11 @@ def _model_reader_dshape_classdim(args, is_train): ...@@ -173,12 +165,11 @@ def _model_reader_dshape_classdim(args, is_train):
reader = train(xmap=False) reader = train(xmap=False)
else: else:
reader = val(xmap=False) reader = val(xmap=False)
return model, reader, dshape, class_dim return reader, dshape, class_dim
def get_model(args, is_train, main_prog, startup_prog): def get_model(args, is_train, main_prog, startup_prog):
model, reader, dshape, class_dim = _model_reader_dshape_classdim(args, reader, dshape, class_dim = _model_reader_dshape_classdim(args, is_train)
is_train)
pyreader = None pyreader = None
trainer_count = int(os.getenv("PADDLE_TRAINERS")) trainer_count = int(os.getenv("PADDLE_TRAINERS"))
...@@ -198,7 +189,8 @@ def get_model(args, is_train, main_prog, startup_prog): ...@@ -198,7 +189,8 @@ def get_model(args, is_train, main_prog, startup_prog):
label = fluid.layers.data( label = fluid.layers.data(
name='label', shape=[1], dtype='int64') name='label', shape=[1], dtype='int64')
predict = model(input, class_dim, is_train=is_train) model = ResNet(is_train=is_train)
predict = model.net(input, class_dim=class_dim)
cost = fluid.layers.cross_entropy(input=predict, label=label) cost = fluid.layers.cross_entropy(input=predict, label=label)
avg_cost = fluid.layers.mean(x=cost) avg_cost = fluid.layers.mean(x=cost)
...@@ -215,16 +207,15 @@ def get_model(args, is_train, main_prog, startup_prog): ...@@ -215,16 +207,15 @@ def get_model(args, is_train, main_prog, startup_prog):
total_images = 1281167 / trainer_count total_images = 1281167 / trainer_count
step = int(total_images / args.batch_size + 1) step = int(total_images / (args.batch_size * args.gpus) + 1)
epochs = [30, 60, 80, 90] epochs = [30, 60, 90]
bd = [step * e for e in epochs] bd = [step * e for e in epochs]
base_lr = args.learning_rate base_lr = args.learning_rate
lr = [] lr = []
lr = [base_lr * (0.1**i) for i in range(len(bd) + 1)] lr = [base_lr * (0.1**i) for i in range(len(bd) + 1)]
optimizer = fluid.optimizer.Momentum( optimizer = fluid.optimizer.Momentum(
learning_rate=base_lr, learning_rate=fluid.layers.piecewise_decay(
#learning_rate=fluid.layers.piecewise_decay( boundaries=bd, values=lr),
# boundaries=bd, values=lr),
momentum=0.9, momentum=0.9,
regularization=fluid.regularizer.L2Decay(1e-4)) regularization=fluid.regularizer.L2Decay(1e-4))
optimizer.minimize(avg_cost) optimizer.minimize(avg_cost)
......
# PaddlePaddle发行规范 # PaddlePaddle发行规范
PaddlePaddle使用git-flow branching model做分支管理,使用[Semantic Versioning](http://semver.org/)标准表示PaddlePaddle版本号。 PaddlePaddle使用Trunk Based Development,使用[Semantic Versioning](http://semver.org/)标准表示PaddlePaddle版本号。
PaddlePaddle每次发新的版本,遵循以下流程: PaddlePaddle每次发新的版本,遵循以下流程:
1. 从`develop`分支派生出新的分支,分支名为`release/版本号`。例如,`release/0.10.0` 1. 从`develop`分支派生出新的分支,分支名为`release/版本号`。例如,`release/0.10.0`
1. 将新分支的版本打上tag,tag为`版本号rc.Patch号`。第一个tag为`0.10.0rc1`,第二个为`0.10.0rc2`,依次类推。 2. 将新分支的版本打上tag,tag为`版本号rc-Patch号`。例如,第一个tag为`0.10.0-rc0`
1. 对这个版本的提交,做如下几个操作: 3. 新分支一般不接受新的feature和优化。QA在release分支上进行测试。研发基于最新的develop开发。
* 使用Regression Test List作为检查列表,测试本次release的正确性。 4. QA和研发发现的bug,在develop上修复验证后,cherry-pick修复到release分支。直到release分支相对稳定。
* 如果失败,记录下所有失败的例子,在这个`release/版本号`分支中,修复所有bug后,Patch号加一,到第二步 5. 如果有需要,在release分支最新代码上打上新的tag,比如`0.10.0-rc1`,让更多的用户加入测试。重复3-4步。
* 修改`python/setup.py.in`中的版本信息,并将`istaged`字段设为`True` 6. release分支稳定后,打上正式的release tag,比如`0.10.0`
* 将这个版本的python wheel包发布到pypi。 7. 将这个版本的python wheel包发布到pypi。
* 更新Docker镜像(参考后面的操作细节)。 8. 更新Docker镜像(参考后面的操作细节)。
1. 第三步完成后,将`release/版本号`分支合入master分支,将master分支的合入commit打上tag,tag为`版本号`。同时再将`master`分支合入`develop`分支。
1. 协同完成Release Note的书写。
需要注意的是: 需要注意的是:
* `release/版本号`分支一旦建立,一般不允许再从`develop`分支合入`release/版本号`。这样保证`release/版本号`分支功能的封闭,方便测试人员测试PaddlePaddle的行为。 * bug修复需要先在develop上进行,然后进入release分支。而不是直接在release分支上开发。
*`release/版本号`分支存在的时候,如果有bugfix的行为,需要将bugfix的分支同时merge到`master`, `develop``release/版本号`这三个分支。
* release分支原则上只接受修复类的修改,不接受新feature。
## 发布wheel包到pypi ## 发布wheel包到pypi
...@@ -61,24 +60,21 @@ docker push [镜像]:[version] ...@@ -61,24 +60,21 @@ docker push [镜像]:[version]
## PaddlePaddle 分支规范 ## PaddlePaddle 分支规范
PaddlePaddle开发过程使用[git-flow](http://nvie.com/posts/a-successful-git-branching-model/)分支规范,并适应github的特性做了一些区别。 PaddlePaddle开发过程使用[Trunk Based Development](https://trunkbaseddevelopment.com/) 开发规范。
* PaddlePaddle的主版本库遵循[git-flow](http://nvie.com/posts/a-successful-git-branching-model/)分支规范。其中:
* `master`分支为稳定(stable branch)版本分支。每一个`master`分支的版本都是经过单元测试和回归测试的版本。
* `develop`分支为开发(develop branch)版本分支。每一个`develop`分支的版本都经过单元测试,但并没有经过回归测试。
* `release/版本号`分支为每一次Release时建立的临时分支。在这个阶段的代码正在经历回归测试。
* 其他用户的fork版本库并不需要严格遵守[git-flow](http://nvie.com/posts/a-successful-git-branching-model/)分支规范,但所有fork的版本库的所有分支都相当于特性分支。 * `develop`分支为开发(develop branch)版本分支。每一个`develop`分支的版本都经过单元测试。并且会经过模型回归测试。
* 建议,开发者fork的版本库使用`develop`分支同步主版本库的`develop`分支 * `release/版本号`分支为每一次Release时建立的临时分支。release分支主要用于测试,bug修复和最终发版。
* 建议,开发者fork的版本库中,再基于`develop`版本fork出自己的功能分支。 * `master`分支因为历史原因,已经废弃。
* 当功能分支开发完毕后,向PaddlePaddle的主版本库提交`Pull Request`,进而进行代码评审。
* 在评审过程中,开发者修改自己的代码,可以继续在自己的功能分支提交代码。
* BugFix分支也是在开发者自己的fork版本库维护,与功能分支不同的是,BugFix分支需要分别给主版本库的`master``develop`与可能有的`release/版本号`分支,同时提起`Pull Request` * 其他开发者fork的feature branch。
* 建议,开发者的feature branch需要同步主版本库的`develop`分支。
* 建议,开发者的feature branch需要基于主版本库中的`develop`分支。
* 当feature branch开发完毕后,向PaddlePaddle的主版本库提交`Pull Request`,进而进行代码评审。
* 在评审过程中,开发者修改自己的代码,可以继续在自己的feature branch提交代码。
## PaddlePaddle回归测试列表 ## PaddlePaddle回归测试列表
本列表说明PaddlePaddle发版之前需要测试的功能点。 TODO
### PaddlePaddle Book中所有章节 ### PaddlePaddle Book中所有章节
......
...@@ -4,26 +4,21 @@ PaddlePaddle manages its branches using "git-flow branching model", and [Semanti ...@@ -4,26 +4,21 @@ PaddlePaddle manages its branches using "git-flow branching model", and [Semanti
Each time we release a new PaddlePaddle version, we should follow the below steps: Each time we release a new PaddlePaddle version, we should follow the below steps:
1. Fork a new branch from `develop` named `release/[version]`, e.g. `release/0.10.0`. 1. Create a new release branch from `develop`, named `release/[version]`, e.g. `release/0.10.0`.
1. Push a new tag on the release branch, the tag name should be like `[version]rc.patch`. The 2. Create a new tag for the release branch, tag format: `version-rc.Patch`. E.g. the first tag is `0.10.0-rc0`
first tag should be `0.10.0rc1`, and the second should be `0.10.0.rc2` and so on. 3. New release branch normally doesn't accept new features or optimizations. QA will test on the release branch. Developer should develop based on `develop` branch.
1. After that, we should do: 4. If QA or developers find bugs, they should first fix and verify them on the `develop` branch, then cherry-pick the fix to the release branch. Wait until the release branch is stable.
* Run all regression test on the Regression Test List (see PaddlePaddle TeamCity CI), to confirm 5. If necessary, create a new tag on the release branch, e.g. `0.10.0-rc1`. Involve more users to try it and repeat steps 3-4.
that this release has no major bugs. 6. After the release branch is stable, create the official release tag, such as `0.10.0`.
* If regression test fails, we must fix those bugs and create a new `release/[version]` 7. Release the python wheel package to pypi.
branch from previous release branch. 8. Update the docker image (More details below).
* Modify `python/setup.py.in`, change the version number and change `ISTAGED` to `True`.
* Publish PaddlePaddle release wheel packages to pypi (see below instructions for detail). NOTE:
* Update the Docker images (see below instructions for detail).
1. After above step, merge `release/[version]` branch to master and push a tag on the master commit, * bug fix should happen on `develop` branch, then cherry-pick to release branch. Avoid developing directly on release branch.
then merge `master` to `develop`.
1. Update the Release Note. * release normally only accept bug fixes. Don't add new features.
***NOTE:***
* Do ***NOT*** merge commits from develop branch to release branches to keep the release branch contain
features only for current release, so that we can test on that version.
* If we want to fix bugs on release branches, we must merge the fix to master, develop and release branch.
## Publish Wheel Packages to pypi ## Publish Wheel Packages to pypi
...@@ -97,26 +92,22 @@ You can then checkout the latest pushed tags at https://hub.docker.com/r/paddlep ...@@ -97,26 +92,22 @@ You can then checkout the latest pushed tags at https://hub.docker.com/r/paddlep
## Branching Model ## Branching Model
We use [git-flow](http://nvie.com/posts/a-successful-git-branching-model/) as our branching model, PaddlePaddle uses [Trunk Based Development](https://trunkbaseddevelopment.com/) as our branching model.
with some modifications:
* `develop` branch is used for development. Each commit to the `develop` branch goes through unit tests and model regression tests.
* `master` branch is the stable branch. Each version on the master branch is tested and guaranteed. * `release/[version]` branch is used for each release. Release branch is used for tests, bug fixes and the eventual release.
* `develop` branch is for development. Each commit on develop branch has passed CI unit test, but no * `master` branch has been deprecated for historical reasons.
regression tests are run.
* `release/[version]` branch is used to publish each release. Latest release version branches have * Developer's feature branch。
bugfix only for that version, but no feature updates. * Developer's feature branch should sync with upstream `develop` branch.
* Developer forks are not required to follow * Developer's feature branch should be forked from upstream `develop` branch.
[git-flow](http://nvie.com/posts/a-successful-git-branching-model/) * After feature branch is ready, create a `Pull Request` against the Paddle repo and go through code review.
branching model, all forks is like a feature branch. * In the review process, developers modify their code and push to their own feature branch.
* Advise: developer fork's develop branch is used to sync up with main repo's develop branch.
* Advise: developer use it's fork's develop branch to for new branch to start developing.
* Use that branch on developer's fork to create pull requests and start reviews.
* developer can push new commits to that branch when the pull request is open.
* Bug fixes are also started from developers forked repo. And, bug fixes branch can merge to
`master`, `develop` and `releases`.
## PaddlePaddle Regression Test List ## PaddlePaddle Regression Test List
TODO
### All Chapters of PaddlePaddle Book ### All Chapters of PaddlePaddle Book
We need to guarantee that all the chapters of PaddlePaddle Book can run correctly. Including We need to guarantee that all the chapters of PaddlePaddle Book can run correctly. Including
......
...@@ -2,28 +2,31 @@ ...@@ -2,28 +2,31 @@
## Automatic Differentiation ## Automatic Differentiation
A key challenge in the field of deep learning is to automatically derive the backward pass from the forward pass described algorithmically by researchers. Such a derivation, or a transformation of the forward pass program, has been long studied before the recent prosperity of deep learning in the field known as [automatic differentiation](https://arxiv.org/pdf/1502.05767.pdf). A key challenge in deep learning is to automatically derive the backward pass given the forward pass as a program, which has been long studied in the field of [automatic differentiation](https://arxiv.org/pdf/1502.05767.pdf), or autodiff, before the prosperity of deep learning.
## The Tape ## Program Transformation vs. Backtracking
Given the forward pass program (usually in Python in practices), there are two strategies to derive the backward pass: Given the forward pass program, there are two strategies to derive the backward pass:
1. from the forward pass program itself, or 1. by transforming the forward pass program without executing it, or
1. from the execution trace of the forward pass program, which is often known as the *tape*. 1. by backtracking the execution process of the forward pass program.
This article surveys systems that follow the latter strategy. This article is about the latter strategy.
## Dynamic Network ## The Tape and Dynamic Networks
When we train a deep learning model, the tape changes every iteration as the input data change, so we have to re-derive the backward pass every iteration. This is known as *dynamic network*. We refer to the trace of the execution of the forward pass program as a *tape* [[1]](http://www.bcl.hamilton.ie/~barak/papers/toplas-reverse.pdf). When we train a deep learning model, the tape changes every iteration as the input data change, so we'd have to re-derive the backward pass, which is time-consuming, but also eases the case that the forward program includes control flows like if-else and for/while. With these control flows, the execution trace might change with iterations. Such changes are known as *dynamic networks* in the field of deep learning.
Deep learning systems that utilize the idea of dynamic network have gained popularity in recent years. This article surveys two representative systems: [PyTorch](https://pytorch.org/) and [DyNet](https://dynet.readthedocs.io/en/latest/). ## Typical Systems
## An Overview Deep learning systems that utilize the idea of dynamic networks have gained popularity in recent years. This article surveys the following typical systems:
Both frameworks record a ‘tape’ of the computation and interpreting (or run-time compiling) a transformation of the tape played back in reverse. This tape is a different kind of entity than the original program.[[link]](http://www.bcl.hamilton.ie/~barak/papers/toplas-reverse.pdf) - [DyNet](https://dynet.readthedocs.io/en/latest/)
- [PyTorch](https://pytorch.org/)
- Chainer
- Autograd from HIPS
Consider the following code feedforward model. Before diving into these systems, let us pose an example forward pass program:
```python ```python
x = Variable(randn(20, 1))) x = Variable(randn(20, 1)))
...@@ -35,9 +38,11 @@ loss = softmax(pred, label) ...@@ -35,9 +38,11 @@ loss = softmax(pred, label)
loss.backward() loss.backward()
``` ```
### 1) Dynet uses List to encode the Tape ## The Representation of Tapes
During the forward execution, a list of operators, in this case `matmul`, `matmul` and `softmax`, are recorded in the tape, along with the necessary information needed to do the backward such as pointers to the inputs and outputs. Then the tape is played in reverse order at `loss.backward()`. ### DyNet: the Tape as a List
DyNet uses a linear data structure, a list, to represent the tape. During the execution of the above example, it is a list of operators: `matmul`, `matmul`, and `softmax`. The list also includes information needed to do the backward pass, such as pointers to the inputs and outputs. Then the tape is played in reverse order at `loss.backward()`.
<details> <details>
<summary></summary> <summary></summary>
...@@ -69,9 +74,9 @@ digraph g { ...@@ -69,9 +74,9 @@ digraph g {
![Alt text](https://g.gravizo.com/svg?digraph%20g%20{%20graph%20[%20rankdir%20=%20%22LR%22%20];%20node%20[%20fontsize%20=%20%2216%22%20shape%20=%20%22ellipse%22%20];%20edge%20[];%20%22node0%22%20[%20label%20=%20%22%3Cf0%3E%20type:%20matmul%20|%20%3Cf1%3E%20input:%20W_1,%20x%20|%20%3Cf2%3E%20output:%20h%22%20shape%20=%20%22record%22%20];%20%22node1%22%20[%20label%20=%20%22%3Cf0%3E%20type:%20matmul%20|%20%3Cf1%3E%20input:%20W_2,%20h%20|%20%3Cf2%3E%20output:%20pred%22%20shape%20=%20%22record%22%20];%20%22node2%22%20[%20label%20=%20%22%3Cf0%3E%20type:%20softmax%20|%20%3Cf1%3E%20input:%20pred,%20label%20|%20%3Cf2%3E%20output:%20loss%22%20shape%20=%20%22record%22%20];%20%22node0%22:f0%20-%3E%20%22node1%22:f0%20[%20id%20=%200%20];%20%22node1%22:f0%20-%3E%20%22node2%22:f0%20[%20id%20=%201%20];%20}) ![Alt text](https://g.gravizo.com/svg?digraph%20g%20{%20graph%20[%20rankdir%20=%20%22LR%22%20];%20node%20[%20fontsize%20=%20%2216%22%20shape%20=%20%22ellipse%22%20];%20edge%20[];%20%22node0%22%20[%20label%20=%20%22%3Cf0%3E%20type:%20matmul%20|%20%3Cf1%3E%20input:%20W_1,%20x%20|%20%3Cf2%3E%20output:%20h%22%20shape%20=%20%22record%22%20];%20%22node1%22%20[%20label%20=%20%22%3Cf0%3E%20type:%20matmul%20|%20%3Cf1%3E%20input:%20W_2,%20h%20|%20%3Cf2%3E%20output:%20pred%22%20shape%20=%20%22record%22%20];%20%22node2%22%20[%20label%20=%20%22%3Cf0%3E%20type:%20softmax%20|%20%3Cf1%3E%20input:%20pred,%20label%20|%20%3Cf2%3E%20output:%20loss%22%20shape%20=%20%22record%22%20];%20%22node0%22:f0%20-%3E%20%22node1%22:f0%20[%20id%20=%200%20];%20%22node1%22:f0%20-%3E%20%22node2%22:f0%20[%20id%20=%201%20];%20})
### 2) Pytorch uses Node Graph to encode the Tape ### PyTorch: the Tape as a Graph
The graph is composed of `Variable`s and `Function`s. During the forward execution, a `Variable` records its creator function, e.g. `h.creator = matmul`. And a Function records its inputs' previous/dependent functions `prev_func` through `creator`, e.g. `matmul.prev_func = matmul1`. At `loss.backward()`, a topological sort is performed on all `prev_func`s. Then the grad op is performed by the sorted order. The graph is composed of `Variable`s and `Function`s. During the forward execution, a `Variable` records its creator function, e.g. `h.creator = matmul`. And a Function records its inputs' previous/dependent functions `prev_func` through `creator`, e.g. `matmul.prev_func = matmul1`. At `loss.backward()`, a topological sort is performed on all `prev_func`s. Then the grad op is performed by the sorted order. Please be aware that a `Function` might have more than one `prev_func`s.
<details> <details>
<summary></summary> <summary></summary>
...@@ -132,27 +137,22 @@ digraph g { ...@@ -132,27 +137,22 @@ digraph g {
![Alt text](https://g.gravizo.com/svg?digraph%20g%20{%20graph%20[%20rankdir%20=%20%22LR%22%20];%20subgraph%20function%20{%20node%20[%20fontsize%20=%20%2216%22%20style%20=%20filled%20shape%20=%20%22record%22%20];%20%22matmul0%22%20[%20label%20=%20%22%3Cf0%3E%20type:%20matmul%20|%20prev_func:%20None%22%20];%20%22matmul1%22%20[%20label%20=%20%22%3Cf0%3E%20type:%20matmul%20|%20prev_func:%20matmul%22%20];%20%22softmax%22%20[%20label%20=%20%22%3Cf0%3E%20type:%20softmax%20|%20prev_func:%20matmul%22%20];%20}%20subgraph%20variable%20{%20node%20[%20fontsize%20=%20%2216%22%20shape%20=%20%22Mrecord%22%20style%20=%20filled%20fillcolor%20=%20white%20];%20%22x%22%20[%20label%20=%20%22%3Cf0%3E%20x%20|%20%3Cf1%3E%20creator:%20None%22%20];%20%22label%22%20[%20label%20=%20%22%3Cf0%3E%20label%20|%20%3Cf1%3E%20creator:%20None%22%20];%20%22W_1%22%20[%20label%20=%20%22%3Cf0%3E%20W_1%20|%20%3Cf1%3E%20creator:%20None%22%20];%20%22W_2%22%20[%20label%20=%20%22%3Cf0%3E%20W_2%20|%20%3Cf1%3E%20creator:%20None%22%20];%20%22h%22%20[%20label%20=%20%22%3Cf0%3E%20h%20|%20%3Cf1%3E%20creator:%20None%22%20];%20%22pred%22%20[%20label%20=%20%22%3Cf0%3E%20pred%20|%20%3Cf1%3E%20creator:%20matmul%22%20];%20%22loss%22%20[%20label%20=%20%22%3Cf0%3E%20loss%20|%20%3Cf1%3E%20creator:%20softmax%22%20];%20}%20subgraph%20data_flow%20{%20%22x%22:f0%20-%3E%20%22matmul0%22:f0;%20%22W_1%22:f0%20-%3E%20%22matmul0%22:f0;%20%22matmul0%22:f0%20-%3E%20%22h%22:f0;%20%22h%22:f0%20-%3E%20%22matmul1%22:f0;%20%22W_2%22:f0%20-%3E%20%22matmul1%22:f0;%20%22matmul1%22:f0%20-%3E%20%22pred%22:f0;%20%22pred%22:f0%20-%3E%20%22softmax%22:f0;%20%22label%22:f0%20-%3E%20%22softmax%22:f0;%20%22softmax%22:f0%20-%3E%20%22loss%22:f0;%20}%20subgraph%20prev_func%20{%20edge%20[color=%22red%22,%20arrowsize=%220.6%22,%20penwidth=%221%22,%20constraint=false];%20%22matmul1%22:f1%20-%3E%20%22matmul0%22:f0;%20%22softmax%22:f1%20-%3E%20%22matmul1%22:f0;%20label%20=%20%22prev_func%22;%20}%20}) ![Alt 
text](https://g.gravizo.com/svg?digraph%20g%20{%20graph%20[%20rankdir%20=%20%22LR%22%20];%20subgraph%20function%20{%20node%20[%20fontsize%20=%20%2216%22%20style%20=%20filled%20shape%20=%20%22record%22%20];%20%22matmul0%22%20[%20label%20=%20%22%3Cf0%3E%20type:%20matmul%20|%20prev_func:%20None%22%20];%20%22matmul1%22%20[%20label%20=%20%22%3Cf0%3E%20type:%20matmul%20|%20prev_func:%20matmul%22%20];%20%22softmax%22%20[%20label%20=%20%22%3Cf0%3E%20type:%20softmax%20|%20prev_func:%20matmul%22%20];%20}%20subgraph%20variable%20{%20node%20[%20fontsize%20=%20%2216%22%20shape%20=%20%22Mrecord%22%20style%20=%20filled%20fillcolor%20=%20white%20];%20%22x%22%20[%20label%20=%20%22%3Cf0%3E%20x%20|%20%3Cf1%3E%20creator:%20None%22%20];%20%22label%22%20[%20label%20=%20%22%3Cf0%3E%20label%20|%20%3Cf1%3E%20creator:%20None%22%20];%20%22W_1%22%20[%20label%20=%20%22%3Cf0%3E%20W_1%20|%20%3Cf1%3E%20creator:%20None%22%20];%20%22W_2%22%20[%20label%20=%20%22%3Cf0%3E%20W_2%20|%20%3Cf1%3E%20creator:%20None%22%20];%20%22h%22%20[%20label%20=%20%22%3Cf0%3E%20h%20|%20%3Cf1%3E%20creator:%20None%22%20];%20%22pred%22%20[%20label%20=%20%22%3Cf0%3E%20pred%20|%20%3Cf1%3E%20creator:%20matmul%22%20];%20%22loss%22%20[%20label%20=%20%22%3Cf0%3E%20loss%20|%20%3Cf1%3E%20creator:%20softmax%22%20];%20}%20subgraph%20data_flow%20{%20%22x%22:f0%20-%3E%20%22matmul0%22:f0;%20%22W_1%22:f0%20-%3E%20%22matmul0%22:f0;%20%22matmul0%22:f0%20-%3E%20%22h%22:f0;%20%22h%22:f0%20-%3E%20%22matmul1%22:f0;%20%22W_2%22:f0%20-%3E%20%22matmul1%22:f0;%20%22matmul1%22:f0%20-%3E%20%22pred%22:f0;%20%22pred%22:f0%20-%3E%20%22softmax%22:f0;%20%22label%22:f0%20-%3E%20%22softmax%22:f0;%20%22softmax%22:f0%20-%3E%20%22loss%22:f0;%20}%20subgraph%20prev_func%20{%20edge%20[color=%22red%22,%20arrowsize=%220.6%22,%20penwidth=%221%22,%20constraint=false];%20%22matmul1%22:f1%20-%3E%20%22matmul0%22:f0;%20%22softmax%22:f1%20-%3E%20%22matmul1%22:f0;%20label%20=%20%22prev_func%22;%20}%20})
Chainer and Autograd uses the similar techniques to record the forward pass. For details please refer to the appendix. Chainer and Autograd use the similar techniques to record the forward pass. For details, please refer to the appendix.
## Design choices
### 1) Dynet's List vs Pytorch's Node Graph ## Comparison: List v.s. Graph
What's good about List: The list of DyNet could be considered the result of the topological sort of the graph of PyTorch. Or, the graph is the raw representation of the tape, which gives us the chance to *prune* part of the graph that is irrelevant with the backward pass before the topological sort [[2]](https://openreview.net/pdf?id=BJJsrmfCZ). Consider the following example, PyTorch only does backward on `SmallNet` while DyNet does both `SmallNet` and `BigNet`:
1. It avoids a topological sort. One only needs to traverse the list of operators in reverse and calling the corresponding backward operator.
1. It promises efficient data parallelism implementations. One could count the time of usage of a certain variable during the construction list. Then in the play back, one knows the calculation of a variable has completed. This enables communication and computation overlapping.
What's good about Node Graph:
1. More flexibility. PyTorch users can mix and match independent graphs however they like, in whatever threads they like (without explicit synchronization). An added benefit of structuring graphs this way is that when a portion of the graph becomes dead, it is automatically freed. [[2]](https://openreview.net/pdf?id=BJJsrmfCZ) Consider the following example, Pytorch only does backward on SmallNet while Dynet does both BigNet and SmallNet.
```python ```python
result = BigNet(data) result = BigNet(data)
loss = SmallNet(data) loss = SmallNet(data)
loss.backward() loss.backward()
``` ```
### 2) Dynet's Lazy evaluation vs Pytorch's Immediate evaluation ## Lazy v.s. Immediate Evaluation
Another difference between DyNet and PyTorch is that DyNet lazily evaluates the forward pass, whereas PyTorch executes it immediately. Consider the following example:
Dynet builds the list in a symbolic manner. Consider the following example
```python ```python
for epoch in range(num_epochs): for epoch in range(num_epochs):
for in_words, out_label in training_data: for in_words, out_label in training_data:
...@@ -164,16 +164,17 @@ for epoch in range(num_epochs): ...@@ -164,16 +164,17 @@ for epoch in range(num_epochs):
loss_val = loss_sym.value() loss_val = loss_sym.value()
loss_sym.backward() loss_sym.backward()
``` ```
The computation of `lookup`, `concat`, `matmul` and `softmax` didn't happen until the call of `loss_sym.value()`. This deferred execution is useful because it makes some graph-like optimizations possible, e.g. kernel fusion. The computation of `lookup`, `concat`, `matmul` and `softmax` didn't happen until the call of `loss_sym.value()`. This deferred execution is useful because it makes some graph-like optimizations possible, e.g. kernel fusion.
Pytorch chooses immediate evaluation. It avoids ever materializing a "forward graph"/"tape" (no need to explicitly call `dy.renew_cg()` to reset the list), recording only what is necessary to differentiate the computation, i.e. `creator` and `prev_func`. PyTorch chooses immediate evaluation. It avoids ever materializing a "forward graph"/"tape" (no need to explicitly call `dy.renew_cg()` to reset the list), recording only what is necessary to differentiate the computation, i.e. `creator` and `prev_func`.
## What can fluid learn from them? ## Fluid: Learning the Lessons
Please refer to `paddle/contrib/dynamic/`. Please refer to `paddle/contrib/dynamic/`.
# Appendix ## Appendix
### Overview ### Overview
......
...@@ -59,7 +59,7 @@ paddle.fluid.DistributeTranspiler.__init__ ArgSpec(args=['self', 'config'], vara ...@@ -59,7 +59,7 @@ paddle.fluid.DistributeTranspiler.__init__ ArgSpec(args=['self', 'config'], vara
paddle.fluid.DistributeTranspiler.get_pserver_program ArgSpec(args=['self', 'endpoint'], varargs=None, keywords=None, defaults=None) paddle.fluid.DistributeTranspiler.get_pserver_program ArgSpec(args=['self', 'endpoint'], varargs=None, keywords=None, defaults=None)
paddle.fluid.DistributeTranspiler.get_pserver_programs ArgSpec(args=['self', 'endpoint'], varargs=None, keywords=None, defaults=None) paddle.fluid.DistributeTranspiler.get_pserver_programs ArgSpec(args=['self', 'endpoint'], varargs=None, keywords=None, defaults=None)
paddle.fluid.DistributeTranspiler.get_startup_program ArgSpec(args=['self', 'endpoint', 'pserver_program', 'startup_program'], varargs=None, keywords=None, defaults=(None, None)) paddle.fluid.DistributeTranspiler.get_startup_program ArgSpec(args=['self', 'endpoint', 'pserver_program', 'startup_program'], varargs=None, keywords=None, defaults=(None, None))
paddle.fluid.DistributeTranspiler.get_trainer_program ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None) paddle.fluid.DistributeTranspiler.get_trainer_program ArgSpec(args=['self', 'wait_port'], varargs=None, keywords=None, defaults=(True,))
paddle.fluid.DistributeTranspiler.transpile ArgSpec(args=['self', 'trainer_id', 'program', 'pservers', 'trainers', 'sync_mode', 'startup_program'], varargs=None, keywords=None, defaults=(None, '127.0.0.1:6174', 1, True, None)) paddle.fluid.DistributeTranspiler.transpile ArgSpec(args=['self', 'trainer_id', 'program', 'pservers', 'trainers', 'sync_mode', 'startup_program'], varargs=None, keywords=None, defaults=(None, '127.0.0.1:6174', 1, True, None))
paddle.fluid.InferenceTranspiler.__init__ paddle.fluid.InferenceTranspiler.__init__
paddle.fluid.InferenceTranspiler.transpile ArgSpec(args=['self', 'program', 'place', 'scope'], varargs=None, keywords=None, defaults=(None,)) paddle.fluid.InferenceTranspiler.transpile ArgSpec(args=['self', 'program', 'place', 'scope'], varargs=None, keywords=None, defaults=(None,))
...@@ -100,7 +100,7 @@ paddle.fluid.layers.gru_unit ArgSpec(args=['input', 'hidden', 'size', 'param_att ...@@ -100,7 +100,7 @@ paddle.fluid.layers.gru_unit ArgSpec(args=['input', 'hidden', 'size', 'param_att
paddle.fluid.layers.linear_chain_crf ArgSpec(args=['input', 'label', 'param_attr'], varargs=None, keywords=None, defaults=(None,)) paddle.fluid.layers.linear_chain_crf ArgSpec(args=['input', 'label', 'param_attr'], varargs=None, keywords=None, defaults=(None,))
paddle.fluid.layers.crf_decoding ArgSpec(args=['input', 'param_attr', 'label'], varargs=None, keywords=None, defaults=(None,)) paddle.fluid.layers.crf_decoding ArgSpec(args=['input', 'param_attr', 'label'], varargs=None, keywords=None, defaults=(None,))
paddle.fluid.layers.cos_sim ArgSpec(args=['X', 'Y'], varargs=None, keywords=None, defaults=None) paddle.fluid.layers.cos_sim ArgSpec(args=['X', 'Y'], varargs=None, keywords=None, defaults=None)
paddle.fluid.layers.cross_entropy ArgSpec(args=['input', 'label', 'soft_label'], varargs=None, keywords=None, defaults=(False,)) paddle.fluid.layers.cross_entropy ArgSpec(args=['input', 'label', 'soft_label', 'ignore_index'], varargs=None, keywords=None, defaults=(False, -100))
paddle.fluid.layers.square_error_cost ArgSpec(args=['input', 'label'], varargs=None, keywords=None, defaults=None) paddle.fluid.layers.square_error_cost ArgSpec(args=['input', 'label'], varargs=None, keywords=None, defaults=None)
paddle.fluid.layers.chunk_eval ArgSpec(args=['input', 'label', 'chunk_scheme', 'num_chunk_types', 'excluded_chunk_types'], varargs=None, keywords=None, defaults=(None,)) paddle.fluid.layers.chunk_eval ArgSpec(args=['input', 'label', 'chunk_scheme', 'num_chunk_types', 'excluded_chunk_types'], varargs=None, keywords=None, defaults=(None,))
paddle.fluid.layers.sequence_conv ArgSpec(args=['input', 'num_filters', 'filter_size', 'filter_stride', 'padding', 'bias_attr', 'param_attr', 'act'], varargs=None, keywords=None, defaults=(3, 1, None, None, None, None)) paddle.fluid.layers.sequence_conv ArgSpec(args=['input', 'num_filters', 'filter_size', 'filter_stride', 'padding', 'bias_attr', 'param_attr', 'act'], varargs=None, keywords=None, defaults=(3, 1, None, None, None, None))
...@@ -142,7 +142,7 @@ paddle.fluid.layers.beam_search ArgSpec(args=['pre_ids', 'pre_scores', 'ids', 's ...@@ -142,7 +142,7 @@ paddle.fluid.layers.beam_search ArgSpec(args=['pre_ids', 'pre_scores', 'ids', 's
paddle.fluid.layers.row_conv ArgSpec(args=['input', 'future_context_size', 'param_attr', 'act'], varargs=None, keywords=None, defaults=(None, None)) paddle.fluid.layers.row_conv ArgSpec(args=['input', 'future_context_size', 'param_attr', 'act'], varargs=None, keywords=None, defaults=(None, None))
paddle.fluid.layers.multiplex ArgSpec(args=['inputs', 'index'], varargs=None, keywords=None, defaults=None) paddle.fluid.layers.multiplex ArgSpec(args=['inputs', 'index'], varargs=None, keywords=None, defaults=None)
paddle.fluid.layers.layer_norm ArgSpec(args=['input', 'scale', 'shift', 'begin_norm_axis', 'epsilon', 'param_attr', 'bias_attr', 'act', 'name'], varargs=None, keywords=None, defaults=(True, True, 1, 1e-05, None, None, None, None)) paddle.fluid.layers.layer_norm ArgSpec(args=['input', 'scale', 'shift', 'begin_norm_axis', 'epsilon', 'param_attr', 'bias_attr', 'act', 'name'], varargs=None, keywords=None, defaults=(True, True, 1, 1e-05, None, None, None, None))
paddle.fluid.layers.softmax_with_cross_entropy ArgSpec(args=['logits', 'label', 'soft_label'], varargs=None, keywords=None, defaults=(False,)) paddle.fluid.layers.softmax_with_cross_entropy ArgSpec(args=['logits', 'label', 'soft_label', 'ignore_index'], varargs=None, keywords=None, defaults=(False, -100))
paddle.fluid.layers.smooth_l1 ArgSpec(args=['x', 'y', 'inside_weight', 'outside_weight', 'sigma'], varargs=None, keywords=None, defaults=(None, None, None)) paddle.fluid.layers.smooth_l1 ArgSpec(args=['x', 'y', 'inside_weight', 'outside_weight', 'sigma'], varargs=None, keywords=None, defaults=(None, None, None))
paddle.fluid.layers.one_hot ArgSpec(args=['input', 'depth'], varargs=None, keywords=None, defaults=None) paddle.fluid.layers.one_hot ArgSpec(args=['input', 'depth'], varargs=None, keywords=None, defaults=None)
paddle.fluid.layers.autoincreased_step_counter ArgSpec(args=['counter_name', 'begin', 'step'], varargs=None, keywords=None, defaults=(None, 1, 1)) paddle.fluid.layers.autoincreased_step_counter ArgSpec(args=['counter_name', 'begin', 'step'], varargs=None, keywords=None, defaults=(None, 1, 1))
...@@ -305,9 +305,9 @@ paddle.fluid.layers.target_assign ArgSpec(args=['input', 'matched_indices', 'neg ...@@ -305,9 +305,9 @@ paddle.fluid.layers.target_assign ArgSpec(args=['input', 'matched_indices', 'neg
paddle.fluid.layers.detection_output ArgSpec(args=['loc', 'scores', 'prior_box', 'prior_box_var', 'background_label', 'nms_threshold', 'nms_top_k', 'keep_top_k', 'score_threshold', 'nms_eta'], varargs=None, keywords=None, defaults=(0, 0.3, 400, 200, 0.01, 1.0)) paddle.fluid.layers.detection_output ArgSpec(args=['loc', 'scores', 'prior_box', 'prior_box_var', 'background_label', 'nms_threshold', 'nms_top_k', 'keep_top_k', 'score_threshold', 'nms_eta'], varargs=None, keywords=None, defaults=(0, 0.3, 400, 200, 0.01, 1.0))
paddle.fluid.layers.ssd_loss ArgSpec(args=['location', 'confidence', 'gt_box', 'gt_label', 'prior_box', 'prior_box_var', 'background_label', 'overlap_threshold', 'neg_pos_ratio', 'neg_overlap', 'loc_loss_weight', 'conf_loss_weight', 'match_type', 'mining_type', 'normalize', 'sample_size'], varargs=None, keywords=None, defaults=(None, 0, 0.5, 3.0, 0.5, 1.0, 1.0, 'per_prediction', 'max_negative', True, None)) paddle.fluid.layers.ssd_loss ArgSpec(args=['location', 'confidence', 'gt_box', 'gt_label', 'prior_box', 'prior_box_var', 'background_label', 'overlap_threshold', 'neg_pos_ratio', 'neg_overlap', 'loc_loss_weight', 'conf_loss_weight', 'match_type', 'mining_type', 'normalize', 'sample_size'], varargs=None, keywords=None, defaults=(None, 0, 0.5, 3.0, 0.5, 1.0, 1.0, 'per_prediction', 'max_negative', True, None))
paddle.fluid.layers.detection_map ArgSpec(args=['detect_res', 'label', 'class_num', 'background_label', 'overlap_threshold', 'evaluate_difficult', 'has_state', 'input_states', 'out_states', 'ap_version'], varargs=None, keywords=None, defaults=(0, 0.3, True, None, None, None, 'integral')) paddle.fluid.layers.detection_map ArgSpec(args=['detect_res', 'label', 'class_num', 'background_label', 'overlap_threshold', 'evaluate_difficult', 'has_state', 'input_states', 'out_states', 'ap_version'], varargs=None, keywords=None, defaults=(0, 0.3, True, None, None, None, 'integral'))
paddle.fluid.layers.rpn_target_assign ArgSpec(args=['loc', 'scores', 'anchor_box', 'anchor_var', 'gt_box', 'rpn_batch_size_per_im', 'fg_fraction', 'rpn_positive_overlap', 'rpn_negative_overlap'], varargs=None, keywords=None, defaults=(256, 0.25, 0.7, 0.3)) paddle.fluid.layers.rpn_target_assign ArgSpec(args=['bbox_pred', 'cls_logits', 'anchor_box', 'anchor_var', 'gt_boxes', 'is_crowd', 'im_info', 'rpn_batch_size_per_im', 'rpn_straddle_thresh', 'rpn_fg_fraction', 'rpn_positive_overlap', 'rpn_negative_overlap', 'use_random'], varargs=None, keywords=None, defaults=(256, 0.0, 0.5, 0.7, 0.3, True))
paddle.fluid.layers.anchor_generator ArgSpec(args=['input', 'anchor_sizes', 'aspect_ratios', 'variance', 'stride', 'offset', 'name'], varargs=None, keywords=None, defaults=(None, None, [0.1, 0.1, 0.2, 0.2], None, 0.5, None)) paddle.fluid.layers.anchor_generator ArgSpec(args=['input', 'anchor_sizes', 'aspect_ratios', 'variance', 'stride', 'offset', 'name'], varargs=None, keywords=None, defaults=(None, None, [0.1, 0.1, 0.2, 0.2], None, 0.5, None))
paddle.fluid.layers.generate_proposal_labels ArgSpec(args=['rpn_rois', 'gt_classes', 'gt_boxes', 'im_scales', 'batch_size_per_im', 'fg_fraction', 'fg_thresh', 'bg_thresh_hi', 'bg_thresh_lo', 'bbox_reg_weights', 'class_nums'], varargs=None, keywords=None, defaults=(256, 0.25, 0.25, 0.5, 0.0, [0.1, 0.1, 0.2, 0.2], None)) paddle.fluid.layers.generate_proposal_labels ArgSpec(args=['rpn_rois', 'gt_classes', 'is_crowd', 'gt_boxes', 'im_info', 'batch_size_per_im', 'fg_fraction', 'fg_thresh', 'bg_thresh_hi', 'bg_thresh_lo', 'bbox_reg_weights', 'class_nums', 'use_random'], varargs=None, keywords=None, defaults=(256, 0.25, 0.25, 0.5, 0.0, [0.1, 0.1, 0.2, 0.2], None, True))
paddle.fluid.layers.generate_proposals ArgSpec(args=['scores', 'bbox_deltas', 'im_info', 'anchors', 'variances', 'pre_nms_top_n', 'post_nms_top_n', 'nms_thresh', 'min_size', 'eta', 'name'], varargs=None, keywords=None, defaults=(6000, 1000, 0.5, 0.1, 1.0, None)) paddle.fluid.layers.generate_proposals ArgSpec(args=['scores', 'bbox_deltas', 'im_info', 'anchors', 'variances', 'pre_nms_top_n', 'post_nms_top_n', 'nms_thresh', 'min_size', 'eta', 'name'], varargs=None, keywords=None, defaults=(6000, 1000, 0.5, 0.1, 1.0, None))
paddle.fluid.layers.iou_similarity ArgSpec(args=[], varargs='args', keywords='kwargs', defaults=None) paddle.fluid.layers.iou_similarity ArgSpec(args=[], varargs='args', keywords='kwargs', defaults=None)
paddle.fluid.layers.box_coder ArgSpec(args=[], varargs='args', keywords='kwargs', defaults=None) paddle.fluid.layers.box_coder ArgSpec(args=[], varargs='args', keywords='kwargs', defaults=None)
...@@ -346,7 +346,7 @@ paddle.fluid.transpiler.DistributeTranspiler.__init__ ArgSpec(args=['self', 'con ...@@ -346,7 +346,7 @@ paddle.fluid.transpiler.DistributeTranspiler.__init__ ArgSpec(args=['self', 'con
paddle.fluid.transpiler.DistributeTranspiler.get_pserver_program ArgSpec(args=['self', 'endpoint'], varargs=None, keywords=None, defaults=None) paddle.fluid.transpiler.DistributeTranspiler.get_pserver_program ArgSpec(args=['self', 'endpoint'], varargs=None, keywords=None, defaults=None)
paddle.fluid.transpiler.DistributeTranspiler.get_pserver_programs ArgSpec(args=['self', 'endpoint'], varargs=None, keywords=None, defaults=None) paddle.fluid.transpiler.DistributeTranspiler.get_pserver_programs ArgSpec(args=['self', 'endpoint'], varargs=None, keywords=None, defaults=None)
paddle.fluid.transpiler.DistributeTranspiler.get_startup_program ArgSpec(args=['self', 'endpoint', 'pserver_program', 'startup_program'], varargs=None, keywords=None, defaults=(None, None)) paddle.fluid.transpiler.DistributeTranspiler.get_startup_program ArgSpec(args=['self', 'endpoint', 'pserver_program', 'startup_program'], varargs=None, keywords=None, defaults=(None, None))
paddle.fluid.transpiler.DistributeTranspiler.get_trainer_program ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None) paddle.fluid.transpiler.DistributeTranspiler.get_trainer_program ArgSpec(args=['self', 'wait_port'], varargs=None, keywords=None, defaults=(True,))
paddle.fluid.transpiler.DistributeTranspiler.transpile ArgSpec(args=['self', 'trainer_id', 'program', 'pservers', 'trainers', 'sync_mode', 'startup_program'], varargs=None, keywords=None, defaults=(None, '127.0.0.1:6174', 1, True, None)) paddle.fluid.transpiler.DistributeTranspiler.transpile ArgSpec(args=['self', 'trainer_id', 'program', 'pservers', 'trainers', 'sync_mode', 'startup_program'], varargs=None, keywords=None, defaults=(None, '127.0.0.1:6174', 1, True, None))
paddle.fluid.transpiler.InferenceTranspiler.__init__ paddle.fluid.transpiler.InferenceTranspiler.__init__
paddle.fluid.transpiler.InferenceTranspiler.transpile ArgSpec(args=['self', 'program', 'place', 'scope'], varargs=None, keywords=None, defaults=(None,)) paddle.fluid.transpiler.InferenceTranspiler.transpile ArgSpec(args=['self', 'program', 'place', 'scope'], varargs=None, keywords=None, defaults=(None,))
......
...@@ -56,9 +56,9 @@ else() ...@@ -56,9 +56,9 @@ else()
cc_test(mixed_vector_test SRCS mixed_vector_test.cc DEPS place memory device_context tensor) cc_test(mixed_vector_test SRCS mixed_vector_test.cc DEPS place memory device_context tensor)
endif() endif()
if (NOT WIN32) if (NOT WIN32)
cc_library(lod_tensor SRCS lod_tensor.cc DEPS ddim place tensor framework_proto recordio) cc_library(lod_tensor SRCS lod_tensor.cc DEPS ddim place tensor framework_proto recordio version)
else() else()
cc_library(lod_tensor SRCS lod_tensor.cc DEPS ddim place tensor framework_proto) cc_library(lod_tensor SRCS lod_tensor.cc DEPS ddim place tensor framework_proto version)
endif (NOT WIN32) endif (NOT WIN32)
cc_test(lod_tensor_test SRCS lod_tensor_test.cc DEPS lod_tensor memory) cc_test(lod_tensor_test SRCS lod_tensor_test.cc DEPS lod_tensor memory)
...@@ -116,7 +116,11 @@ cc_library(operator SRCS operator.cc DEPS op_info device_context tensor scope gl ...@@ -116,7 +116,11 @@ cc_library(operator SRCS operator.cc DEPS op_info device_context tensor scope gl
endif(NOT WIN32) endif(NOT WIN32)
cc_test(operator_test SRCS operator_test.cc DEPS operator op_registry device_context) cc_test(operator_test SRCS operator_test.cc DEPS operator op_registry device_context)
cc_library(proto_desc SRCS var_desc.cc op_desc.cc block_desc.cc program_desc.cc DEPS shape_inference op_info operator glog)
cc_library(version SRCS version.cc)
cc_test(version_test SRCS version_test.cc DEPS version)
cc_library(proto_desc SRCS var_desc.cc op_desc.cc block_desc.cc program_desc.cc DEPS shape_inference op_info operator glog version)
cc_library(op_registry SRCS op_registry.cc DEPS op_proto_maker op_info operator glog proto_desc) cc_library(op_registry SRCS op_registry.cc DEPS op_proto_maker op_info operator glog proto_desc)
nv_test(op_registry_test SRCS op_registry_test.cc DEPS op_registry) nv_test(op_registry_test SRCS op_registry_test.cc DEPS op_registry)
......
...@@ -46,7 +46,8 @@ AllReduceOpHandle::AllReduceOpHandle(ir::Node *node, ...@@ -46,7 +46,8 @@ AllReduceOpHandle::AllReduceOpHandle(ir::Node *node,
#endif #endif
void AllReduceOpHandle::RunImpl() { void AllReduceOpHandle::RunImpl() {
platform::RecordEvent r("all_reduce", nullptr); platform::RecordEvent record_event(Name(), dev_ctxes_.begin()->second);
if (NoDummyInputSize() == 1) { if (NoDummyInputSize() == 1) {
return; // No need to all reduce when GPU count = 1; return; // No need to all reduce when GPU count = 1;
} else { } else {
......
...@@ -15,12 +15,15 @@ ...@@ -15,12 +15,15 @@
#include "paddle/fluid/framework/details/broadcast_op_handle.h" #include "paddle/fluid/framework/details/broadcast_op_handle.h"
#include "paddle/fluid/framework/details/container_cast.h" #include "paddle/fluid/framework/details/container_cast.h"
#include "paddle/fluid/framework/details/variable_visitor.h" #include "paddle/fluid/framework/details/variable_visitor.h"
#include "paddle/fluid/platform/profiler.h"
namespace paddle { namespace paddle {
namespace framework { namespace framework {
namespace details { namespace details {
void BroadcastOpHandle::RunImpl() { void BroadcastOpHandle::RunImpl() {
platform::RecordEvent record_event(Name(), dev_ctxes_.begin()->second);
if (places_.size() == 1) return; if (places_.size() == 1) return;
// The input and output may have dummy vars. // The input and output may have dummy vars.
......
...@@ -348,14 +348,31 @@ std::unique_ptr<ir::Graph> MultiDevSSAGraphBuilder::ApplyImpl( ...@@ -348,14 +348,31 @@ std::unique_ptr<ir::Graph> MultiDevSSAGraphBuilder::ApplyImpl(
size_t cur_device_id = 0; size_t cur_device_id = 0;
bool is_forwarding = true; bool is_forwarding = true;
bool is_dist_train = false;
for (ir::Node *node : sorted_ops) { for (ir::Node *node : sorted_ops) {
if (boost::get<int>( if (boost::get<int>(
node->Op()->GetAttr(OpProtoAndCheckerMaker::OpRoleAttrName())) == node->Op()->GetAttr(OpProtoAndCheckerMaker::OpRoleAttrName())) ==
static_cast<int>(OpRole::kRPC)) { static_cast<int>(OpRole::kRPC)) {
CreateRPCOp(&result, node); int op_dev_id = CreateRPCOp(&result, node);
PADDLE_ENFORCE(op_dev_id != -1,
"Can not schedule the RPC operator to the right place.");
if (node->Op()->Type() == "recv") {
auto recv_vars_attr =
boost::get<std::vector<std::string>>(node->Op()->GetNullableAttr(
OpProtoAndCheckerMaker::OpRoleVarAttrName()));
PADDLE_ENFORCE(recv_vars_attr.size() == 2UL); // [parameter, gradient]
if (recv_vars_attr[0].find(".block") == std::string::npos) {
bcast_var_name_set[op_dev_id].emplace(recv_vars_attr[0]);
}
}
is_dist_train = true;
} else if (IsDistTrainOp(node, send_vars, recv_vars)) { } else if (IsDistTrainOp(node, send_vars, recv_vars)) {
CreateDistTrainOp(&result, node); int op_dev_id = CreateDistTrainOp(&result, node);
if (node->Op()->Type() == "concat") {
auto origin_param_name = node->Op()->OutputArgumentNames()[0];
bcast_var_name_set[op_dev_id].emplace(origin_param_name);
}
} else if (IsScaleLossOp(node)) { } else if (IsScaleLossOp(node)) {
// user can customize loss@grad if not use_default_grad_scale_ // user can customize loss@grad if not use_default_grad_scale_
if (strategy_.gradient_scale_ != if (strategy_.gradient_scale_ !=
...@@ -414,7 +431,9 @@ std::unique_ptr<ir::Graph> MultiDevSSAGraphBuilder::ApplyImpl( ...@@ -414,7 +431,9 @@ std::unique_ptr<ir::Graph> MultiDevSSAGraphBuilder::ApplyImpl(
CreateReduceOp(&result, g_name, cur_device_id); CreateReduceOp(&result, g_name, cur_device_id);
graph->Get<ShardedVarDevice>(kShardedVarDevice) graph->Get<ShardedVarDevice>(kShardedVarDevice)
.emplace(g_name, cur_device_id); .emplace(g_name, cur_device_id);
bcast_var_name_set[cur_device_id].emplace(p_name); if (!is_dist_train) {
bcast_var_name_set[cur_device_id].emplace(p_name);
}
break; break;
case BuildStrategy::ReduceStrategy::kAllReduce: case BuildStrategy::ReduceStrategy::kAllReduce:
if (IsSparseGradient(g_name)) { if (IsSparseGradient(g_name)) {
...@@ -436,15 +455,19 @@ std::unique_ptr<ir::Graph> MultiDevSSAGraphBuilder::ApplyImpl( ...@@ -436,15 +455,19 @@ std::unique_ptr<ir::Graph> MultiDevSSAGraphBuilder::ApplyImpl(
} }
} }
} }
bool use_gpu = false; bool use_gpu = false;
#ifdef PADDLE_WITH_CUDA #ifdef PADDLE_WITH_CUDA
use_gpu = nccl_ctxs_ != nullptr; use_gpu = nccl_ctxs_ != nullptr;
#endif #endif
if (use_gpu || // Insert broadcast operators principle:
strategy_.reduce_ == BuildStrategy::ReduceStrategy::kAllReduce) { // 1. Broadcast optimized parameters in Reduce strategy;
// Insert BCast Ops // 2. No need to broadcast optimized parameters in AllReduce strategy because
// the optimization sub-graph would be run on every GPU;
// 3. Always broadcast received parameters in Distribute Training.
if ((use_gpu &&
strategy_.reduce_ == BuildStrategy::ReduceStrategy::kReduce) ||
is_dist_train) {
for (size_t dev_id = 0; dev_id < bcast_var_name_set.size(); ++dev_id) { for (size_t dev_id = 0; dev_id < bcast_var_name_set.size(); ++dev_id) {
auto &to_bcast_set = bcast_var_name_set[dev_id]; auto &to_bcast_set = bcast_var_name_set[dev_id];
for (auto &bcast_name : to_bcast_set) { for (auto &bcast_name : to_bcast_set) {
...@@ -676,8 +699,8 @@ VarHandle *MultiDevSSAGraphBuilder::CreateReduceOp(ir::Graph *result, ...@@ -676,8 +699,8 @@ VarHandle *MultiDevSSAGraphBuilder::CreateReduceOp(ir::Graph *result,
return var; return var;
} }
void MultiDevSSAGraphBuilder::CreateDistTrainOp(ir::Graph *result, int MultiDevSSAGraphBuilder::CreateDistTrainOp(ir::Graph *result,
ir::Node *node) const { ir::Node *node) const {
int op_dev_id = -1; int op_dev_id = -1;
std::vector<std::string> input_var_names; std::vector<std::string> input_var_names;
std::vector<std::string> output_var_names; std::vector<std::string> output_var_names;
...@@ -720,6 +743,7 @@ void MultiDevSSAGraphBuilder::CreateDistTrainOp(ir::Graph *result, ...@@ -720,6 +743,7 @@ void MultiDevSSAGraphBuilder::CreateDistTrainOp(ir::Graph *result,
node->Op()->Type()); node->Op()->Type());
CreateComputationalOp(result, node, op_dev_id); CreateComputationalOp(result, node, op_dev_id);
return op_dev_id;
} }
void SetOpInputsAllPlaces(ir::Graph *result, ir::Node *node, int num_places) { void SetOpInputsAllPlaces(ir::Graph *result, ir::Node *node, int num_places) {
...@@ -738,8 +762,8 @@ void SetOpInputsAllPlaces(ir::Graph *result, ir::Node *node, int num_places) { ...@@ -738,8 +762,8 @@ void SetOpInputsAllPlaces(ir::Graph *result, ir::Node *node, int num_places) {
} }
// Create RPC related op handles that connects its in ops and out ops. // Create RPC related op handles that connects its in ops and out ops.
void MultiDevSSAGraphBuilder::CreateRPCOp(ir::Graph *result, int MultiDevSSAGraphBuilder::CreateRPCOp(ir::Graph *result,
ir::Node *node) const { ir::Node *node) const {
int op_dev_id = -1; int op_dev_id = -1;
if (node->Op()->Type() == "send") { if (node->Op()->Type() == "send") {
// TODO(paddle-dev): getting the first var is not safe. // TODO(paddle-dev): getting the first var is not safe.
...@@ -825,6 +849,7 @@ void MultiDevSSAGraphBuilder::CreateRPCOp(ir::Graph *result, ...@@ -825,6 +849,7 @@ void MultiDevSSAGraphBuilder::CreateRPCOp(ir::Graph *result,
CreateOpOutput(result, op_handle, new_node, p, outvar_dev_id); CreateOpOutput(result, op_handle, new_node, p, outvar_dev_id);
} }
} }
return op_dev_id;
} }
bool MultiDevSSAGraphBuilder::IsScaleLossOp(ir::Node *node) const { bool MultiDevSSAGraphBuilder::IsScaleLossOp(ir::Node *node) const {
......
...@@ -54,8 +54,8 @@ class MultiDevSSAGraphBuilder : public ir::Pass { ...@@ -54,8 +54,8 @@ class MultiDevSSAGraphBuilder : public ir::Pass {
bool IsScaleLossOp(ir::Node *node) const; bool IsScaleLossOp(ir::Node *node) const;
void CreateRPCOp(ir::Graph *result, ir::Node *node) const; int CreateRPCOp(ir::Graph *result, ir::Node *node) const;
void CreateDistTrainOp(ir::Graph *result, ir::Node *node) const; int CreateDistTrainOp(ir::Graph *result, ir::Node *node) const;
/** /**
* Is this operator as the end-point operator before/after send operator. * Is this operator as the end-point operator before/after send operator.
......
...@@ -27,7 +27,8 @@ namespace framework { ...@@ -27,7 +27,8 @@ namespace framework {
namespace details { namespace details {
void ReduceOpHandle::RunImpl() { void ReduceOpHandle::RunImpl() {
platform::RecordEvent r("reduce", nullptr); platform::RecordEvent record_event(Name(), dev_ctxes_.begin()->second);
if (places_.size() == 1) return; if (places_.size() == 1) return;
// the input and output may have dummy var. // the input and output may have dummy var.
auto in_var_handles = DynamicCast<VarHandle>(inputs_); auto in_var_handles = DynamicCast<VarHandle>(inputs_);
......
...@@ -51,7 +51,7 @@ void ScaleLossGradOpHandle::RunImpl() { ...@@ -51,7 +51,7 @@ void ScaleLossGradOpHandle::RunImpl() {
->stream(); ->stream();
memory::Copy(boost::get<platform::CUDAPlace>(place_), tmp, memory::Copy(boost::get<platform::CUDAPlace>(place_), tmp,
platform::CPUPlace(), &coeff_, sizeof(float), stream); platform::CPUPlace(), &coeff_, sizeof(float), stream);
VLOG(1) << place_ << "RUN Scale loss grad op"; VLOG(10) << place_ << "RUN Scale loss grad op";
}); });
#endif #endif
} }
......
...@@ -16,6 +16,13 @@ syntax = "proto2"; ...@@ -16,6 +16,13 @@ syntax = "proto2";
option optimize_for = LITE_RUNTIME; option optimize_for = LITE_RUNTIME;
package paddle.framework.proto; package paddle.framework.proto;
// Any incompatible changes to ProgramDesc and its dependencies should
// raise the version defined in version.h.
//
// Serialization and deserialization code should be modified in a way
// that supports old versions following the version and compatibility policy.
message Version { optional int64 version = 1 [ default = 0 ]; }
enum AttrType { enum AttrType {
INT = 0; INT = 0;
FLOAT = 1; FLOAT = 1;
...@@ -180,4 +187,8 @@ message BlockDesc { ...@@ -180,4 +187,8 @@ message BlockDesc {
// for more details. // for more details.
// TODO(panyx0718): A model can have multiple programs. Need a // TODO(panyx0718): A model can have multiple programs. Need a
// way to distinguish them. Maybe ID or name? // way to distinguish them. Maybe ID or name?
message ProgramDesc { repeated BlockDesc blocks = 1; } message ProgramDesc {
repeated BlockDesc blocks = 1;
optional Version version = 2;
}
...@@ -19,7 +19,7 @@ function(pass_library TARGET DEST) ...@@ -19,7 +19,7 @@ function(pass_library TARGET DEST)
endfunction() endfunction()
cc_library(node SRCS node.cc DEPS proto_desc) cc_library(node SRCS node.cc DEPS proto_desc)
cc_library(graph SRCS graph.cc DEPS node) cc_library(graph SRCS graph.cc DEPS node pretty_log)
cc_library(graph_helper SRCS graph_helper.cc DEPS graph) cc_library(graph_helper SRCS graph_helper.cc DEPS graph)
cc_library(pass SRCS pass.cc DEPS graph node graph_helper) cc_library(pass SRCS pass.cc DEPS graph node graph_helper)
cc_library(graph_traits SRCS graph_traits.cc DEPS graph) cc_library(graph_traits SRCS graph_traits.cc DEPS graph)
...@@ -28,6 +28,9 @@ cc_library(graph_pattern_detector SRCS graph_pattern_detector.cc DEPS graph grap ...@@ -28,6 +28,9 @@ cc_library(graph_pattern_detector SRCS graph_pattern_detector.cc DEPS graph grap
pass_library(graph_to_program_pass base) pass_library(graph_to_program_pass base)
pass_library(graph_viz_pass base) pass_library(graph_viz_pass base)
pass_library(fc_fuse_pass inference) pass_library(fc_fuse_pass inference)
if(WITH_MKLDNN)
pass_library(conv_relu_mkldnn_fuse_pass inference)
endif()
pass_library(attention_lstm_fuse_pass inference) pass_library(attention_lstm_fuse_pass inference)
pass_library(infer_clean_graph_pass inference) pass_library(infer_clean_graph_pass inference)
pass_library(fc_lstm_fuse_pass inference) pass_library(fc_lstm_fuse_pass inference)
...@@ -42,3 +45,6 @@ cc_test(graph_helper_test SRCS graph_helper_test.cc DEPS graph graph_helper op_r ...@@ -42,3 +45,6 @@ cc_test(graph_helper_test SRCS graph_helper_test.cc DEPS graph graph_helper op_r
cc_test(graph_to_program_pass_test SRCS graph_to_program_pass_test.cc DEPS graph_to_program_pass) cc_test(graph_to_program_pass_test SRCS graph_to_program_pass_test.cc DEPS graph_to_program_pass)
cc_test(test_graph_pattern_detector SRCS graph_pattern_detector_tester.cc DEPS graph_pattern_detector) cc_test(test_graph_pattern_detector SRCS graph_pattern_detector_tester.cc DEPS graph_pattern_detector)
cc_test(test_fc_fuse_pass SRCS fc_fuse_pass_tester.cc DEPS fc_fuse_pass framework_proto) cc_test(test_fc_fuse_pass SRCS fc_fuse_pass_tester.cc DEPS fc_fuse_pass framework_proto)
if(WITH_MKLDNN)
cc_test(test_conv_relu_mkldnn_fuse_pass SRCS conv_relu_mkldnn_fuse_pass_tester.cc DEPS conv_relu_mkldnn_fuse_pass)
endif()
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/framework/ir/conv_relu_mkldnn_fuse_pass.h"
#include <string>
#include <vector>
#include "paddle/fluid/platform/enforce.h"
namespace paddle {
namespace framework {
namespace ir {
// Fuses every matched conv2d -> relu pair in `graph` into a single conv2d op
// that carries the attribute fuse_relu=true.
//
// For each match the handler builds a new conv2d OpDesc that reuses the
// original conv's Input/Filter/Bias variables, writes directly to the relu's
// output variable and copies every attribute of the original conv op. The
// original conv op, the relu op and the intermediate conv output variable are
// then removed from the graph and the new op is linked in their place.
//
// Takes ownership of `graph` and returns it after the rewrite. The number of
// fused pairs is reported through AddStatis().
std::unique_ptr<ir::Graph> ConvReLUFusePass::ApplyImpl(
    std::unique_ptr<ir::Graph> graph) const {
  PADDLE_ENFORCE(graph.get());
  FusePassBase::Init("conv_relu_mkldnn_fuse", graph.get());

  GraphPatternDetector gpd;
  auto* conv_input = gpd.mutable_pattern()
                         ->NewNode("conv_relu_mkldnn_fuse/conv_input")
                         ->AsInput()
                         ->assert_is_op_input("conv2d", "Input");
  patterns::ConvReLU conv_relu_pattern(gpd.mutable_pattern(),
                                       "conv_relu_mkldnn_fuse");
  conv_relu_pattern(conv_input);

  int found_conv_relu_count = 0;
  auto handler = [&](const GraphPatternDetector::subgraph_t& subgraph,
                     Graph* g) {
    VLOG(4) << "handle ConvReLU fuse";

    // Verify that the match really contains the input node *before* it is
    // looked up, so a malformed match is reported by the enforce instead of
    // by an exception thrown out of subgraph.at().
    PADDLE_ENFORCE(subgraph.count(conv_input));
    auto* conv_input_node = subgraph.at(conv_input);

    GET_IR_NODE_FROM_SUBGRAPH(conv_weight, conv_weight,
                              conv_relu_pattern);                      // Filter
    GET_IR_NODE_FROM_SUBGRAPH(conv_bias, conv_bias, conv_relu_pattern);  // Bias
    GET_IR_NODE_FROM_SUBGRAPH(conv_out, conv_out, conv_relu_pattern);    // tmp
    GET_IR_NODE_FROM_SUBGRAPH(conv, conv, conv_relu_pattern);  // CONV op
    GET_IR_NODE_FROM_SUBGRAPH(relu_out, relu_out, conv_relu_pattern);  // Out
    GET_IR_NODE_FROM_SUBGRAPH(relu, relu, conv_relu_pattern);  // ReLU op

    // Create an ConvReLU Node.
    OpDesc desc;
    std::string conv_relu_i_in = conv_input_node->Name();
    std::string conv_relu_w_in = conv_weight->Name();
    std::string conv_relu_b_in = conv_bias->Name();
    std::string conv_relu_out = relu_out->Name();
    desc.SetInput("Input", std::vector<std::string>({conv_relu_i_in}));
    desc.SetInput("Filter", std::vector<std::string>({conv_relu_w_in}));
    desc.SetInput("Bias", std::vector<std::string>({conv_relu_b_in}));
    desc.SetOutput("Output", std::vector<std::string>({conv_relu_out}));
    desc.SetType("conv2d");
    // Carry over every attribute of the original conv, then turn on the fused
    // activation.
    for (auto& attr : conv->Op()->GetAttrMap()) {
      desc.SetAttr(attr.first, attr.second);
    }
    desc.SetAttr("fuse_relu", true);

    auto conv_relu_node = g->CreateOpNode(&desc);  // OpDesc will be copied.
    GraphSafeRemoveNodes(graph.get(), {conv, relu, conv_out});

    // Wire the fused op between the surviving input vars and the relu output.
    IR_NODE_LINK_TO(conv_input_node, conv_relu_node);
    IR_NODE_LINK_TO(conv_weight, conv_relu_node);
    IR_NODE_LINK_TO(conv_bias, conv_relu_node);
    IR_NODE_LINK_TO(conv_relu_node, relu_out);

    found_conv_relu_count++;
  };

  gpd(graph.get(), handler);

  AddStatis(found_conv_relu_count);
  return graph;
}
} // namespace ir
} // namespace framework
} // namespace paddle
// Register ConvReLUFusePass under the pass name "conv_relu_mkldnn_fuse_pass".
REGISTER_PASS(conv_relu_mkldnn_fuse_pass,
paddle::framework::ir::ConvReLUFusePass);
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include "paddle/fluid/framework/ir/fuse_pass_base.h"
#include "paddle/fluid/framework/ir/graph.h"
#include "paddle/fluid/framework/ir/graph_pattern_detector.h"
#include "paddle/fluid/framework/ir/pass.h"
namespace paddle {
namespace framework {
namespace ir {
/*
 * Graph pass that fuses a conv2d op followed by a relu op into a single
 * conv2d op with the fuse_relu attribute enabled.
 */
class ConvReLUFusePass : public FusePassBase {
 public:
  virtual ~ConvReLUFusePass() = default;

 protected:
  // Applies the fusion to `graph` and returns the rewritten graph.
  std::unique_ptr<ir::Graph> ApplyImpl(std::unique_ptr<ir::Graph> graph) const;
};
} // namespace ir
} // namespace framework
} // namespace paddle
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/framework/ir/conv_relu_mkldnn_fuse_pass.h"
#include <gtest/gtest.h>
namespace paddle {
namespace framework {
namespace ir {
// Appends an op of the given `type` to block 0 of `prog`.
//
// A "conv2d" op gets use_mkldnn=true and reads `inputs` positionally as
// {Input, Filter, Bias}; a "relu" op takes all of `inputs` as X. Any other
// op type is created without inputs. Every op writes `outputs` to its "Out"
// slot.
void SetOp(ProgramDesc* prog, const std::string& type,
           const std::vector<std::string>& inputs,
           const std::vector<std::string>& outputs) {
  auto* new_op = prog->MutableBlock(0)->AppendOp();
  new_op->SetType(type);

  if (type == "relu") {
    new_op->SetInput("X", inputs);
  } else if (type == "conv2d") {
    new_op->SetAttr("use_mkldnn", true);
    new_op->SetInput("Input", std::vector<std::string>{inputs[0]});
    new_op->SetInput("Filter", std::vector<std::string>{inputs[1]});
    new_op->SetInput("Bias", std::vector<std::string>{inputs[2]});
  }

  new_op->SetOutput("Out", outputs);
}
// Builds a single-block program with the following op chain:
//   a -> OP0 -> b
//   b -> OP1 -> c
//   (c, weights, bias) -> conv2d -> f
//   f -> relu -> g
// Every variable is declared as SELECTED_ROWS; "weights" and "bias" are
// additionally marked persistable.
ProgramDesc BuildProgramDesc() {
  using Names = std::vector<std::string>;

  ProgramDesc prog;
  const Names var_names{"a", "b", "c", "weights", "bias", "f", "g"};
  for (const auto& name : var_names) {
    auto* var_desc = prog.MutableBlock(0)->Var(name);
    var_desc->SetType(proto::VarType::SELECTED_ROWS);
    const bool is_parameter = (name == "weights" || name == "bias");
    if (is_parameter) {
      var_desc->SetPersistable(true);
    }
  }

  SetOp(&prog, "OP0", Names{"a"}, Names{"b"});
  SetOp(&prog, "OP1", Names{"b"}, Names{"c"});
  SetOp(&prog, "conv2d", Names{"c", "weights", "bias"}, Names{"f"});
  SetOp(&prog, "relu", Names{"f"}, Names{"g"});

  return prog;
}
// Runs the pass over the small conv2d -> relu program and checks both the
// change in node count and the attributes of the fused op.
TEST(ConvReLUFusePass, basic) {
  auto prog = BuildProgramDesc();
  std::unique_ptr<ir::Graph> graph(new ir::Graph(prog));
  auto pass = PassRegistry::Instance().Get("conv_relu_mkldnn_fuse_pass");

  const int original_nodes_num = graph->Nodes().size();
  graph = pass->Apply(std::move(graph));
  const int current_nodes_num = graph->Nodes().size();

  // The pass removes three nodes (CONV, RELU, conv_out) and adds one
  // (ConvReLU), so the graph shrinks by two.
  EXPECT_EQ(original_nodes_num - 2, current_nodes_num);

  // Count the conv2d ops that have both use_mkldnn and fuse_relu set to true.
  int conv_relu_count = 0;
  for (auto* node : graph->Nodes()) {
    if (!node->IsOp() || node->Op()->Type() != "conv2d") continue;

    auto* op_desc = node->Op();
    if (!op_desc->HasAttr("use_mkldnn")) continue;
    if (!boost::get<bool>(op_desc->GetAttr("use_mkldnn"))) continue;
    if (!op_desc->HasAttr("fuse_relu")) continue;
    if (!boost::get<bool>(op_desc->GetAttr("fuse_relu"))) continue;

    ++conv_relu_count;
  }
  EXPECT_EQ(conv_relu_count, 1);
}
} // namespace ir
} // namespace framework
} // namespace paddle
// Reference the pass by its registered name so the test above can fetch it
// from PassRegistry. NOTE(review): presumably this forces the registration
// object to be linked into the test binary; confirm against the USE_PASS
// macro definition.
USE_PASS(conv_relu_mkldnn_fuse_pass);
...@@ -29,39 +29,27 @@ std::unique_ptr<ir::Graph> FCFusePass::ApplyImpl( ...@@ -29,39 +29,27 @@ std::unique_ptr<ir::Graph> FCFusePass::ApplyImpl(
std::unordered_set<Node*> nodes2delete; std::unordered_set<Node*> nodes2delete;
GraphPatternDetector gpd; GraphPatternDetector gpd;
// BuildFCPattern(gpd.mutable_pattern());
auto* x = gpd.mutable_pattern() auto* x = gpd.mutable_pattern()
->NewNode("fc_fuse/x") ->NewNode("fc_fuse/x")
->AsInput() ->AsInput()
->assert_is_op_input("mul", "X"); ->assert_is_op_input("mul", "X");
patterns::FC(gpd.mutable_pattern(), "fc_fuse", x, true /*with bias*/); patterns::FC fc_pattern(gpd.mutable_pattern(), "fc_fuse");
fc_pattern(x, true /*with bias*/);
#define GET_NODE(id) \
PADDLE_ENFORCE(subgraph.count(gpd.pattern().RetrieveNode("fc_fuse/" #id)), \
"pattern has no Node called %s", #id); \
auto* id = subgraph.at(gpd.pattern().RetrieveNode("fc_fuse/" #id)); \
PADDLE_ENFORCE_NOT_NULL(id, "subgraph has no node %s", "fc_fuse/" #id);
int found_fc_count = 0; int found_fc_count = 0;
auto handler = [&](const GraphPatternDetector::subgraph_t& subgraph, auto handler = [&](const GraphPatternDetector::subgraph_t& subgraph,
Graph* g) { Graph* g) {
VLOG(4) << "handle FC fuse"; VLOG(4) << "handle FC fuse";
// Currently, there is no FC op available, so I will just simulate the GET_IR_NODE_FROM_SUBGRAPH(w, w, fc_pattern);
// scenerio. GET_IR_NODE_FROM_SUBGRAPH(fc_bias, bias, fc_pattern);
// FC's fusion is simple, just op fuse, no need to process the GET_IR_NODE_FROM_SUBGRAPH(fc_out, Out, fc_pattern);
// parameters. GET_IR_NODE_FROM_SUBGRAPH(mul, mul, fc_pattern);
GET_NODE(x); // x GET_IR_NODE_FROM_SUBGRAPH(elementwise_add, elementwise_add, fc_pattern);
GET_NODE(w); // Y GET_IR_NODE_FROM_SUBGRAPH(mul_out, mul_out, fc_pattern);
GET_NODE(fc_bias); // bias
GET_NODE(fc_out); // Out
GET_NODE(mul); // MUL op
GET_NODE(elementwise_add); // ELEMENT_ADD op
GET_NODE(mul_out); // tmp
#undef GET_NODE
// Create an FC Node. // Create an FC Node.
OpDesc desc; OpDesc desc;
std::string fc_x_in = x->Name(); std::string fc_x_in = subgraph.at(x)->Name();
std::string fc_Y_in = w->Name(); std::string fc_Y_in = w->Name();
std::string fc_bias_in = fc_bias->Name(); std::string fc_bias_in = fc_bias->Name();
std::string fc_out_out = fc_out->Name(); std::string fc_out_out = fc_out->Name();
...@@ -73,7 +61,8 @@ std::unique_ptr<ir::Graph> FCFusePass::ApplyImpl( ...@@ -73,7 +61,8 @@ std::unique_ptr<ir::Graph> FCFusePass::ApplyImpl(
auto fc_node = g->CreateOpNode(&desc); // OpDesc will be copied. auto fc_node = g->CreateOpNode(&desc); // OpDesc will be copied.
GraphSafeRemoveNodes(graph.get(), {mul, elementwise_add, mul_out}); GraphSafeRemoveNodes(graph.get(), {mul, elementwise_add, mul_out});
IR_NODE_LINK_TO(x, fc_node); PADDLE_ENFORCE(subgraph.count(x));
IR_NODE_LINK_TO(subgraph.at(x), fc_node);
IR_NODE_LINK_TO(w, fc_node); IR_NODE_LINK_TO(w, fc_node);
IR_NODE_LINK_TO(fc_bias, fc_node); IR_NODE_LINK_TO(fc_bias, fc_node);
IR_NODE_LINK_TO(fc_node, fc_out); IR_NODE_LINK_TO(fc_node, fc_out);
......
...@@ -20,52 +20,43 @@ namespace paddle { ...@@ -20,52 +20,43 @@ namespace paddle {
namespace framework { namespace framework {
namespace ir { namespace ir {
static void BuildPattern(PDPattern* pattern, const std::string& name_scope,
bool with_fc_bias) {
PDNode* x = pattern->NewNode(name_scope, "x")
->assert_is_op_input("mul")
->assert_var_not_persistable();
auto* fc_out = patterns::FC(pattern, name_scope, x, with_fc_bias);
fc_out->AsIntermediate(); // fc_out is a tmp var, will be removed after fuse.
patterns::GRU(pattern, name_scope, fc_out);
VLOG(3) << "fc_gru pattern \n" << pattern->DotString();
}
static int BuildFusion(Graph* graph, const std::string& name_scope, static int BuildFusion(Graph* graph, const std::string& name_scope,
Scope* scope, bool with_fc_bias) { Scope* scope, bool with_fc_bias) {
GraphPatternDetector gpd; GraphPatternDetector gpd;
auto* pattern = gpd.mutable_pattern(); auto* pattern = gpd.mutable_pattern();
BuildPattern(pattern, name_scope, with_fc_bias); // Create pattern.
patterns::FC fc_pattern(pattern, name_scope);
patterns::GRU gru_pattern(pattern, name_scope);
PDNode* x =
pattern->NewNode(patterns::UniqueKey("x"))->assert_var_not_persistable();
auto* fc_out = fc_pattern(x, with_fc_bias);
fc_out->AsIntermediate(); // fc_out is a tmp var, will be removed after fuse.
gru_pattern(fc_out);
// Create New OpDesc // Create New OpDesc
auto gru_creater = [&](int gru, int x, int weight_x, int weight_h, int bias, auto gru_creater = [&](Node* gru, Node* x, Node* weight_x, Node* weight_h,
int hidden, int fc_bias) { Node* bias, Node* hidden, Node* fc_bias) {
#define GET_NODE(x) auto* x##_n = graph->RetriveNode(x);
GET_NODE(x);
GET_NODE(weight_x);
GET_NODE(weight_h);
GET_NODE(bias);
GET_NODE(hidden);
GET_NODE(gru);
OpDesc op_desc; OpDesc op_desc;
op_desc.SetType("fusion_gru"); op_desc.SetType("fusion_gru");
#define NEW_NAME(x) name_scope + "/at." #x ".new" #define NEW_NAME(x) name_scope + "/at." #x ".new"
#define SET_IN(Key, node__) op_desc.SetInput(#Key, {node__##_n->Name()}); #define SET_IN(Key, node__) op_desc.SetInput(#Key, {node__->Name()});
SET_IN(X, x); SET_IN(X, x);
SET_IN(WeightX, weight_x); SET_IN(WeightX, weight_x);
SET_IN(WeightH, weight_h); SET_IN(WeightH, weight_h);
if (with_fc_bias) { if (with_fc_bias) {
op_desc.SetInput("Bias", {NEW_NAME(bias) + bias_n->Name()}); op_desc.SetInput("Bias", {NEW_NAME(bias) + bias->Name()});
} else { } else {
SET_IN(Bias, bias); SET_IN(Bias, bias);
} }
#undef SET_IN #undef SET_IN
op_desc.SetInput("H0", {}); op_desc.SetInput("H0", {});
op_desc.SetOutput("Hidden", {hidden_n->Name()}); op_desc.SetOutput("Hidden", {hidden->Name()});
op_desc.SetAttr("is_reverse", gru_n->Op()->GetAttr("is_reverse")); op_desc.SetAttr("is_reverse", gru->Op()->GetAttr("is_reverse"));
// TODO(TJ): This should be a option for infer // TODO(TJ): This should be a option for infer
op_desc.SetAttr("use_seq", true); op_desc.SetAttr("use_seq", true);
...@@ -82,14 +73,12 @@ static int BuildFusion(Graph* graph, const std::string& name_scope, ...@@ -82,14 +73,12 @@ static int BuildFusion(Graph* graph, const std::string& name_scope,
PADDLE_ENFORCE(scope); PADDLE_ENFORCE(scope);
if (with_fc_bias) { if (with_fc_bias) {
// Fusion GRU bias = fcbias + grubias // Fusion GRU bias = fcbias + grubias
auto* fusion_bias_var = scope->Var(NEW_NAME(bias) + bias_n->Name()); auto* fusion_bias_var = scope->Var(NEW_NAME(bias) + bias->Name());
auto* out_bias_tensor = auto* out_bias_tensor =
fusion_bias_var->GetMutable<framework::LoDTensor>(); fusion_bias_var->GetMutable<framework::LoDTensor>();
PADDLE_ENFORCE(fusion_bias_var); PADDLE_ENFORCE(fusion_bias_var);
GET_NODE(fc_bias); auto* gru_bias_var = scope->FindVar(bias->Name());
PADDLE_ENFORCE(fc_bias_n); auto* fc_bias_var = scope->FindVar(fc_bias->Name());
auto* gru_bias_var = scope->FindVar(bias_n->Name());
auto* fc_bias_var = scope->FindVar(fc_bias_n->Name());
PADDLE_ENFORCE(gru_bias_var); PADDLE_ENFORCE(gru_bias_var);
PADDLE_ENFORCE(fc_bias_var); PADDLE_ENFORCE(fc_bias_var);
const auto& gru_bias_tenosr = gru_bias_var->Get<framework::LoDTensor>(); const auto& gru_bias_tenosr = gru_bias_var->Get<framework::LoDTensor>();
...@@ -113,11 +102,11 @@ static int BuildFusion(Graph* graph, const std::string& name_scope, ...@@ -113,11 +102,11 @@ static int BuildFusion(Graph* graph, const std::string& name_scope,
#undef NEW_NAME #undef NEW_NAME
#undef NEW_IMTERMEDIATE_OUT #undef NEW_IMTERMEDIATE_OUT
IR_NODE_LINK_TO(x_n, op); IR_NODE_LINK_TO(x, op);
IR_NODE_LINK_TO(weight_x_n, op); IR_NODE_LINK_TO(weight_x, op);
IR_NODE_LINK_TO(weight_h_n, op); IR_NODE_LINK_TO(weight_h, op);
IR_NODE_LINK_TO(bias_n, op); // actually should link to new bias if have IR_NODE_LINK_TO(bias, op); // actually should link to new bias if have
IR_NODE_LINK_TO(op, hidden_n); IR_NODE_LINK_TO(op, hidden);
// h0? // h0?
return op; return op;
}; };
...@@ -125,42 +114,35 @@ static int BuildFusion(Graph* graph, const std::string& name_scope, ...@@ -125,42 +114,35 @@ static int BuildFusion(Graph* graph, const std::string& name_scope,
int fusion_count{0}; int fusion_count{0};
auto handler = [&](const GraphPatternDetector::subgraph_t& subgraph, auto handler = [&](const GraphPatternDetector::subgraph_t& subgraph,
Graph* g) { Graph* g) {
#define GET_NODE(name__) \ auto* x_n = subgraph.at(x);
std::string name__##key = name_scope + "/" + #name__; \ GET_IR_NODE_FROM_SUBGRAPH(w, w, fc_pattern);
auto* name__##n = pattern->RetrieveNode(name__##key); \ GET_IR_NODE_FROM_SUBGRAPH(mul, mul, fc_pattern);
PADDLE_ENFORCE(name__##n); \ GET_IR_NODE_FROM_SUBGRAPH(fc_out, Out, fc_pattern);
PADDLE_ENFORCE(subgraph.count(name__##n)); \ GET_IR_NODE_FROM_SUBGRAPH(Weight, Weight, gru_pattern);
Node* name__##_n = subgraph.at(name__##n); \ GET_IR_NODE_FROM_SUBGRAPH(gru, gru, gru_pattern);
int name__ __attribute__((unused)) = name__##_n->id(); GET_IR_NODE_FROM_SUBGRAPH(Bias, Bias, gru_pattern);
GET_IR_NODE_FROM_SUBGRAPH(Hidden, Hidden, gru_pattern);
GET_NODE(x);
GET_NODE(w); // fc weight
GET_NODE(mul);
GET_NODE(fc_out);
GET_NODE(Weight);
GET_NODE(gru);
GET_NODE(Bias);
GET_NODE(Hidden);
// nodes need be removed // nodes need be removed
GET_NODE(BatchGate); GET_IR_NODE_FROM_SUBGRAPH(BatchGate, BatchGate, gru_pattern);
GET_NODE(BatchResetHiddenPrev); GET_IR_NODE_FROM_SUBGRAPH(BatchResetHiddenPrev, BatchGate, gru_pattern);
GET_NODE(BatchHidden); GET_IR_NODE_FROM_SUBGRAPH(BatchHidden, BatchGate, gru_pattern);
if (with_fc_bias) { if (with_fc_bias) {
GET_NODE(mul_out); GET_IR_NODE_FROM_SUBGRAPH(mul_out, mul_out, fc_pattern);
GET_NODE(fc_bias); GET_IR_NODE_FROM_SUBGRAPH(fc_bias, bias, fc_pattern);
GET_NODE(elementwise_add); GET_IR_NODE_FROM_SUBGRAPH(elementwise_add, elementwise_add, fc_pattern);
gru_creater(gru, x, w, Weight, Bias, Hidden, fc_bias);
gru_creater(gru, x_n, w, Weight, Bias, Hidden, fc_bias);
// Remove unneeded nodes. // Remove unneeded nodes.
std::unordered_set<const Node*> marked_nodes( std::unordered_set<const Node*> marked_nodes(
{mul_n, gru_n, elementwise_add_n, fc_bias_n, fc_out_n, mul_out_n, {mul, gru, elementwise_add, fc_bias, fc_out, mul_out, BatchGate,
BatchGate_n, BatchResetHiddenPrev_n, BatchHidden_n}); BatchResetHiddenPrev, BatchHidden});
GraphSafeRemoveNodes(graph, marked_nodes); GraphSafeRemoveNodes(graph, marked_nodes);
} else { } else {
gru_creater(gru, x, w, Weight, Bias, Hidden, -1); gru_creater(gru, x_n, w, Weight, Bias, Hidden, nullptr);
// Remove unneeded nodes. // Remove unneeded nodes.
std::unordered_set<const Node*> marked_nodes( std::unordered_set<const Node*> marked_nodes(
{mul_n, gru_n, BatchGate_n, BatchResetHiddenPrev_n, BatchHidden_n}); {mul, gru, BatchGate, BatchResetHiddenPrev, BatchHidden});
GraphSafeRemoveNodes(graph, marked_nodes); GraphSafeRemoveNodes(graph, marked_nodes);
} }
#undef GET_NODE #undef GET_NODE
......
...@@ -20,45 +20,29 @@ namespace paddle { ...@@ -20,45 +20,29 @@ namespace paddle {
namespace framework { namespace framework {
namespace ir { namespace ir {
static std::string GenNodeName(const std::string& prefix, int BuildFusion(Graph* graph, const std::string& name_scope, Scope* scope,
const std::string& name) { bool with_fc_bias) {
return prefix + "/" + name; GraphPatternDetector gpd;
} auto* pattern = gpd.mutable_pattern();
static void BuildPattern(PDPattern* pattern, const std::string& name_scope, // Build pattern
bool with_fc_bias) { PDNode* x = pattern->NewNode(patterns::PDNodeName(name_scope, "x"))
PDNode* x = pattern->NewNode(name_scope, "x")
->assert_is_op_input("mul") ->assert_is_op_input("mul")
->assert_var_not_persistable(); ->assert_var_not_persistable();
auto* fc_out = patterns::FC(pattern, name_scope, x, with_fc_bias); patterns::FC fc_pattern(pattern, name_scope);
fc_out->AsIntermediate(); // fc_out is a tmp var, will be removed after fuse.
patterns::LSTM(pattern, name_scope, fc_out);
// LOG(INFO) << "\n" << pattern->DotString();
}
static int BuildFusion(Graph* graph, const std::string& name_scope, // fc_out is a tmp var, will be removed after fuse, so marked as intermediate.
Scope* scope, bool with_fc_bias) { auto* fc_out = fc_pattern(x, with_fc_bias)->AsIntermediate();
GraphPatternDetector gpd; patterns::LSTM lstm_pattern(pattern, name_scope);
auto* pattern = gpd.mutable_pattern(); lstm_pattern(fc_out);
BuildPattern(pattern, name_scope, with_fc_bias);
// Create New OpDesc // Create New OpDesc
auto lstm_creator = [&](int lstm, int input, int weight_x, int weight_h, auto lstm_creator = [&](Node* lstm, Node* input, Node* weight_x,
int bias, int hidden, int cell, int xx, int fc_bias) { Node* weight_h, Node* bias, Node* hidden, Node* cell,
#define GET_NODE(x) auto* x##_n = graph->RetriveNode(x); Node* xx, Node* fc_bias) {
GET_NODE(input);
GET_NODE(weight_x);
GET_NODE(weight_h);
GET_NODE(bias);
GET_NODE(hidden);
GET_NODE(cell);
GET_NODE(xx);
GET_NODE(lstm);
OpDesc op_desc; OpDesc op_desc;
op_desc.SetType("fusion_lstm"); op_desc.SetType("fusion_lstm");
#define SET_IN(Key, node__) op_desc.SetInput(#Key, {node__##_n->Name()}); #define SET_IN(Key, node__) op_desc.SetInput(#Key, {node__->Name()});
SET_IN(X, input); SET_IN(X, input);
SET_IN(WeightX, weight_x); SET_IN(WeightX, weight_x);
SET_IN(WeightH, weight_h); SET_IN(WeightH, weight_h);
...@@ -67,17 +51,16 @@ static int BuildFusion(Graph* graph, const std::string& name_scope, ...@@ -67,17 +51,16 @@ static int BuildFusion(Graph* graph, const std::string& name_scope,
if (with_fc_bias) { if (with_fc_bias) {
// Add FC-bias with LSTM-bias and create a new weight // Add FC-bias with LSTM-bias and create a new weight
PADDLE_ENFORCE(scope); PADDLE_ENFORCE(scope);
const std::string& new_bias_var = name_scope + "_bias.new"; const std::string& new_bias_var = patterns::UniqueKey("NewBias");
auto* bias_var = scope->Var(new_bias_var); auto* bias_var = scope->Var(new_bias_var);
PADDLE_ENFORCE(bias_var); PADDLE_ENFORCE(bias_var);
auto* bias_tensor = bias_var->GetMutable<framework::LoDTensor>(); auto* bias_tensor = bias_var->GetMutable<framework::LoDTensor>();
auto* lstm_bias_var = scope->FindVar(bias_n->Name()); auto* lstm_bias_var = scope->FindVar(bias->Name());
PADDLE_ENFORCE(lstm_bias_var); PADDLE_ENFORCE(lstm_bias_var);
const auto& lstm_bias_tensor = lstm_bias_var->Get<framework::LoDTensor>(); const auto& lstm_bias_tensor = lstm_bias_var->Get<framework::LoDTensor>();
bias_tensor->Resize(lstm_bias_tensor.dims()); bias_tensor->Resize(lstm_bias_tensor.dims());
GET_NODE(fc_bias); auto* fc_bias_var = scope->FindVar(fc_bias->Name());
auto* fc_bias_var = scope->FindVar(fc_bias_n->Name());
const auto& fc_bias_tensor = fc_bias_var->Get<framework::LoDTensor>(); const auto& fc_bias_tensor = fc_bias_var->Get<framework::LoDTensor>();
auto* data = bias_tensor->mutable_data<float>(platform::CPUPlace()); auto* data = bias_tensor->mutable_data<float>(platform::CPUPlace());
...@@ -88,31 +71,36 @@ static int BuildFusion(Graph* graph, const std::string& name_scope, ...@@ -88,31 +71,36 @@ static int BuildFusion(Graph* graph, const std::string& name_scope,
} }
op_desc.SetInput("Bias", {new_bias_var}); op_desc.SetInput("Bias", {new_bias_var});
} }
#undef GET_NODE
// Create temp variables. // Create temp variables.
scope->Var(name_scope + "/BatchedInput.new") const std::string BatchedInput = patterns::UniqueKey("BatchedInput");
->GetMutable<framework::LoDTensor>(); const std::string BatchedCellPreAct =
scope->Var(name_scope + "/BatchCellPreAct.new") patterns::UniqueKey("BatchedCellPreAct");
->GetMutable<framework::LoDTensor>(); const std::string BatchedGate = patterns::UniqueKey("BatchedGate");
scope->Var(name_scope + "/BatchedGate.new")
->GetMutable<framework::LoDTensor>(); scope->Var(BatchedInput)->GetMutable<framework::LoDTensor>();
scope->Var(BatchedCellPreAct)->GetMutable<framework::LoDTensor>();
scope->Var(BatchedGate)->GetMutable<framework::LoDTensor>();
op_desc.SetInput("H0", {}); op_desc.SetInput("H0", {});
op_desc.SetInput("C0", {}); op_desc.SetInput("C0", {});
op_desc.SetOutput("Hidden", {hidden_n->Name()}); op_desc.SetOutput("Hidden", {hidden->Name()});
op_desc.SetOutput("Cell", {cell_n->Name()}); op_desc.SetOutput("Cell", {cell->Name()});
op_desc.SetOutput("XX", {xx_n->Name()}); op_desc.SetOutput("XX", {xx->Name()});
op_desc.SetOutput("BatchedGate", {name_scope + "/BatchedGate.new"}); op_desc.SetOutput("BatchedGate", {BatchedGate});
op_desc.SetOutput("BatchCellPreAct", {name_scope + "/BatchCellPreAct.new"}); op_desc.SetOutput("BatchCellPreAct", {BatchedCellPreAct});
op_desc.SetOutput("BatchedInput", {name_scope + "/BatchedInput.new"}); op_desc.SetOutput("BatchedInput", {BatchedInput});
op_desc.SetAttr("is_reverse", lstm_n->Op()->GetAttr("is_reverse")); op_desc.SetAttr("is_reverse", lstm->Op()->GetAttr("is_reverse"));
op_desc.SetAttr("use_peepholes", lstm_n->Op()->GetAttr("use_peepholes")); op_desc.SetAttr("use_peepholes", lstm->Op()->GetAttr("use_peepholes"));
// TODO(TJ): get from attr // TODO(TJ): get from attr
op_desc.SetAttr("use_seq", true); op_desc.SetAttr("use_seq", true);
#define TMP_NAME(x) "at.new.tmp." #x PADDLE_ENFORCE(graph->Has(kParamScopeAttr));
#define OP_SET_OUT(x) op_desc.SetOutput(#x, {TMP_NAME(x)}) auto* scope = graph->Get<Scope*>(kParamScopeAttr);
#define OP_SET_OUT(x) \
const std::string x = patterns::UniqueKey(#x); \
op_desc.SetOutput(#x, {x}); \
scope->Var(x)->GetMutable<LoDTensor>()
OP_SET_OUT(BatchedCell); OP_SET_OUT(BatchedCell);
OP_SET_OUT(BatchedHidden); OP_SET_OUT(BatchedHidden);
OP_SET_OUT(ReorderedH0); OP_SET_OUT(ReorderedH0);
...@@ -120,22 +108,11 @@ static int BuildFusion(Graph* graph, const std::string& name_scope, ...@@ -120,22 +108,11 @@ static int BuildFusion(Graph* graph, const std::string& name_scope,
#undef OP_SET_OUT #undef OP_SET_OUT
auto* op = graph->CreateOpNode(&op_desc); auto* op = graph->CreateOpNode(&op_desc);
PADDLE_ENFORCE(graph->Has(kParamScopeAttr)); IR_NODE_LINK_TO(input, op);
auto* scope = graph->Get<Scope*>(kParamScopeAttr); IR_NODE_LINK_TO(weight_x, op);
IR_NODE_LINK_TO(weight_h, op);
#define TMP_NEW(x) scope->Var(TMP_NAME(x))->GetMutable<LoDTensor>() IR_NODE_LINK_TO(bias, op);
TMP_NEW(BatchedCell); IR_NODE_LINK_TO(op, hidden);
TMP_NEW(BatchedHidden);
TMP_NEW(ReorderedH0);
TMP_NEW(ReorderedC0);
#undef TMP_NEW
#undef TMP_NAME
IR_NODE_LINK_TO(input_n, op);
IR_NODE_LINK_TO(weight_x_n, op);
IR_NODE_LINK_TO(weight_h_n, op);
IR_NODE_LINK_TO(bias_n, op);
IR_NODE_LINK_TO(op, hidden_n);
return op; return op;
}; };
...@@ -143,39 +120,31 @@ static int BuildFusion(Graph* graph, const std::string& name_scope, ...@@ -143,39 +120,31 @@ static int BuildFusion(Graph* graph, const std::string& name_scope,
auto handler = [&](const GraphPatternDetector::subgraph_t& subgraph, auto handler = [&](const GraphPatternDetector::subgraph_t& subgraph,
Graph* g) { Graph* g) {
#define GET_NODE(name__) \ GET_IR_NODE_FROM_SUBGRAPH(lstm, lstm, lstm_pattern);
std::string name__##key = name_scope + "/" + #name__; \ GET_IR_NODE_FROM_SUBGRAPH(Weight, Weight, lstm_pattern);
auto* name__##n = pattern->RetrieveNode(name__##key); \ GET_IR_NODE_FROM_SUBGRAPH(Bias, Bias, lstm_pattern);
PADDLE_ENFORCE(name__##n); \ GET_IR_NODE_FROM_SUBGRAPH(Cell, Cell, lstm_pattern);
PADDLE_ENFORCE(subgraph.count(name__##n)); \ GET_IR_NODE_FROM_SUBGRAPH(Hidden, Hidden, lstm_pattern);
Node* name__##_n = subgraph.at(name__##n); \ GET_IR_NODE_FROM_SUBGRAPH(w, w, fc_pattern);
int name__ __attribute__((unused)) = name__##_n->id(); GET_IR_NODE_FROM_SUBGRAPH(mul, mul, fc_pattern);
GET_NODE(x);
GET_NODE(w);
GET_NODE(mul);
GET_NODE(fc_out);
GET_NODE(Weight);
GET_NODE(lstm);
GET_NODE(Bias);
GET_NODE(Hidden);
GET_NODE(Cell);
if (with_fc_bias) { if (with_fc_bias) {
GET_NODE(fc_bias); GET_IR_NODE_FROM_SUBGRAPH(fc_out, Out, fc_pattern);
GET_NODE(elementwise_add); GET_IR_NODE_FROM_SUBGRAPH(fc_bias, bias, fc_pattern);
lstm_creator(lstm, x, w, Weight, Bias, Hidden, Cell, fc_out, fc_bias); GET_IR_NODE_FROM_SUBGRAPH(elementwise_add, elementwise_add, fc_pattern);
lstm_creator(lstm, subgraph.at(x), w, Weight, Bias, Hidden, Cell, fc_out,
fc_bias);
// Remove unneeded nodes. // Remove unneeded nodes.
std::unordered_set<const Node*> marked_nodes( std::unordered_set<const Node*> marked_nodes(
{mul_n, lstm_n, elementwise_add_n}); {mul, lstm, elementwise_add, fc_bias});
GraphSafeRemoveNodes(graph, marked_nodes); GraphSafeRemoveNodes(graph, marked_nodes);
} else { } else {
lstm_creator(lstm, x, w, Weight, Bias, Hidden, Cell, fc_out, -1); GET_IR_NODE_FROM_SUBGRAPH(fc_out, mul_out, fc_pattern);
lstm_creator(lstm, subgraph.at(x), w, Weight, Bias, Hidden, Cell, fc_out,
nullptr);
// Remove unneeded nodes. // Remove unneeded nodes.
std::unordered_set<const Node*> marked_nodes({mul_n, lstm_n}); std::unordered_set<const Node*> marked_nodes({mul, lstm});
GraphSafeRemoveNodes(graph, marked_nodes); GraphSafeRemoveNodes(graph, marked_nodes);
} }
#undef GET_NODE
++fusion_count; ++fusion_count;
}; };
......
...@@ -21,11 +21,17 @@ ...@@ -21,11 +21,17 @@
#include "paddle/fluid/framework/ir/graph_traits.h" #include "paddle/fluid/framework/ir/graph_traits.h"
#include "paddle/fluid/framework/ir/graph_viz_pass.h" #include "paddle/fluid/framework/ir/graph_viz_pass.h"
#include "paddle/fluid/platform/enforce.h" #include "paddle/fluid/platform/enforce.h"
#include "paddle/fluid/string/pretty_log.h"
#include "paddle/fluid/string/printf.h"
namespace paddle { namespace paddle {
namespace framework { namespace framework {
namespace ir { namespace ir {
using string::PrettyLogEndl;
using string::PrettyLog;
using string::Style;
size_t PDPattern::id_ = 0UL; size_t PDPattern::id_ = 0UL;
PDNode* PDPattern::NewNode(const std::string& name) { PDNode* PDPattern::NewNode(const std::string& name) {
...@@ -82,7 +88,7 @@ void GraphPatternDetector::operator()(Graph* graph, ...@@ -82,7 +88,7 @@ void GraphPatternDetector::operator()(Graph* graph,
ValidateByNodeRole(&subgraphs); ValidateByNodeRole(&subgraphs);
if (subgraphs.empty()) return; if (subgraphs.empty()) return;
LOG(INFO) << "detect " << subgraphs.size() << " subgraph matches the pattern"; PrettyLogEndl(Style::detail(), "--- detect %d subgraphs", subgraphs.size());
int id = 0; int id = 0;
for (auto& g : subgraphs) { for (auto& g : subgraphs) {
VLOG(3) << "optimizing #" << id++ << " subgraph"; VLOG(3) << "optimizing #" << id++ << " subgraph";
...@@ -106,8 +112,7 @@ bool GraphPatternDetector::MarkPDNodesInGraph(const ir::Graph& graph) { ...@@ -106,8 +112,7 @@ bool GraphPatternDetector::MarkPDNodesInGraph(const ir::Graph& graph) {
for (auto& pdnode : pattern_.nodes()) { for (auto& pdnode : pattern_.nodes()) {
if (!pdnodes2nodes_.count(pdnode.get())) { if (!pdnodes2nodes_.count(pdnode.get())) {
VLOG(4) << pdnode->name() << " can't find matched Node, early stop"; VLOG(4) << pdnode->name() << " can't find matched Node, early stop";
// return false;
return false;
} }
} }
for (auto& item : pdnodes2nodes_) { for (auto& item : pdnodes2nodes_) {
...@@ -517,87 +522,122 @@ bool VarLinksFromOp(Node* node, const std::string& op_type) { ...@@ -517,87 +522,122 @@ bool VarLinksFromOp(Node* node, const std::string& op_type) {
return false; return false;
} }
PDNode* patterns::FC(PDPattern* pattern, const std::string& name_scope, PDNode* patterns::ConvReLU::operator()(
PDNode* x, bool with_bias) { paddle::framework::ir::PDNode* conv_input) {
// mul op // Create Operators
auto* mul_op = pattern->NewNode(name_scope, "mul")->assert_is_op("mul"); conv_input->assert_is_op_input("conv2d", "Input");
auto* mul_weight_var = pattern->NewNode(name_scope, "w") auto* conv_op = pattern->NewNode(conv_repr())->assert_is_op("conv2d");
->AsInput() auto* relu_op = pattern->NewNode(relu_repr())->assert_is_op("relu");
->assert_is_persistable_var() // Create variables
->assert_is_op_input("mul", "Y"); // Filter
auto* conv_weight_var = pattern->NewNode(conv_weight_repr())
PDNode* fc_out{nullptr}; ->AsInput()
if (with_bias) { ->assert_is_persistable_var()
PDNode* elementwise_add_op{nullptr}; ->assert_is_op_input("conv2d", "Filter");
PDNode *mul_out_var{nullptr}, *bias{nullptr}; // Bias
elementwise_add_op = pattern->NewNode(name_scope, "elementwise_add") auto* conv_bias_var = pattern->NewNode(conv_bias_repr())
->assert_is_op("elementwise_add"); ->AsInput()
// intermediate variable, will be removed in the IR after fuse. ->assert_is_persistable_var()
mul_out_var = pattern->NewNode(name_scope, "mul_out") ->assert_is_op_input("conv2d", "Bias");
->AsIntermediate() // intermediate variable, will be removed in the IR after fuse.
->assert_is_only_output_of_op("mul") auto* conv_out_var = pattern->NewNode(conv_out_repr())
->assert_is_op_input("elementwise_add"); ->AsIntermediate()
// bias ->assert_is_only_output_of_op("conv2d")
bias = pattern->NewNode(name_scope, "fc_bias") ->assert_is_op_input("relu");
->AsInput() // output
->assert_is_op_input("elementwise_add"); auto* relu_out_var = pattern->NewNode(relu_out_repr())
// output ->AsOutput()
fc_out = pattern->NewNode(name_scope, "fc_out") ->assert_is_op_output("relu");
->AsOutput()
->assert_is_op_output("elementwise_add"); conv_op->LinksFrom({conv_input, conv_weight_var, conv_bias_var})
mul_op->LinksFrom({x, mul_weight_var}).LinksTo({mul_out_var}); .LinksTo({conv_out_var});
elementwise_add_op->LinksFrom({mul_out_var, bias}).LinksTo({fc_out}); relu_op->LinksFrom({conv_out_var}).LinksTo({relu_out_var});
} else { return relu_out_var;
fc_out = pattern->NewNode(name_scope, "fc_out") }
->AsOutput()
->assert_is_op_output("mul"); PDNode* patterns::FC::operator()(paddle::framework::ir::PDNode* x,
mul_op->LinksFrom({mul_weight_var, x}).LinksTo({fc_out}); bool with_bias) {
// Create shared nodes.
x->assert_is_op_input("mul", "X");
auto* mul = pattern->NewNode(mul_repr())->assert_is_op("mul");
auto* mul_w_var = pattern->NewNode(w_repr())
->AsInput()
->assert_is_persistable_var()
->assert_is_op_input("mul", "Y");
auto* mul_out_var =
pattern->NewNode(mul_out_repr())->assert_is_op_output("mul");
if (!with_bias) { // not with bias
// Add links.
mul->LinksFrom({x, mul_w_var}).LinksTo({mul_out_var});
return mul_out_var;
} else { // with bias
mul_out_var->AsIntermediate()->assert_is_op_input("elementwise_add");
// Create operators.
auto* elementwise_add = pattern->NewNode(elementwise_add_repr())
->assert_is_op("elementwise_add");
// Create variables.
auto* bias = pattern->NewNode(bias_repr())
->assert_is_op_input("elementwise_add")
->AsInput();
auto* fc_out = pattern->NewNode(Out_repr())
->AsOutput()
->assert_is_op_output("elementwise_add");
mul->LinksFrom({mul_w_var, x}).LinksTo({mul_out_var});
elementwise_add->LinksFrom({mul_out_var, bias}).LinksTo({fc_out});
return fc_out;
} }
return fc_out;
} }
#define NEW_NODE(op__, arg__, io__) \ PDNode* patterns::LSTM::operator()(PDNode* x) {
auto* arg__ = pattern->NewNode(name_scope, #arg__) \
->assert_is_op_##io__(#op__, #arg__);
PDNode* patterns::LSTM(PDPattern* pattern, const std::string& name_scope,
PDNode* x) {
x->assert_is_op_input("lstm", "Input"); x->assert_is_op_input("lstm", "Input");
auto* lstm_op = pattern->NewNode(name_scope, "lstm")->assert_is_op("lstm"); auto* lstm_op = pattern->NewNode(lstm_repr())->assert_is_op("lstm");
#define NEW_NODE(arg__, io__) \
auto* arg__ = \
pattern->NewNode(arg__##_repr())->assert_is_op_##io__("lstm", #arg__);
// Currently, the H0 and C0 are optional // Currently, the H0 and C0 are optional
// TODO(Superjomn) upgrade the fuse framework to support optional. // TODO(Superjomn) upgrade the fuse framework to support optional.
// NEW_NODE(H0, input); // NEW_NODE(H0, input);
// NEW_NODE(C0, input); // NEW_NODE(C0, input);
NEW_NODE(lstm, Weight, input); NEW_NODE(Weight, input);
NEW_NODE(lstm, Bias, input); NEW_NODE(Bias, input);
NEW_NODE(lstm, Hidden, output); NEW_NODE(Hidden, output);
NEW_NODE(lstm, Cell, output); NEW_NODE(Cell, output);
NEW_NODE(lstm, BatchGate, output); NEW_NODE(BatchGate, output);
NEW_NODE(lstm, BatchCellPreAct, output); NEW_NODE(BatchCellPreAct, output);
#undef NEW_NODE
lstm_op->LinksFrom({x, Weight, Bias}); lstm_op->LinksFrom({x, Weight, Bias});
lstm_op->LinksTo({Hidden, Cell, BatchGate, BatchCellPreAct}); lstm_op->LinksTo({Hidden, Cell, BatchGate, BatchCellPreAct});
return Hidden; return Hidden;
} }
PDNode* patterns::GRU(PDPattern* pattern, const std::string& name_scope, PDNode* patterns::GRU::operator()(PDNode* x) {
PDNode* x) {
x->assert_is_op_input("gru", "Input"); x->assert_is_op_input("gru", "Input");
auto* gru_op = pattern->NewNode(name_scope, "gru")->assert_is_op("gru"); auto* gru_op = pattern->NewNode(gru_repr())->assert_is_op("gru");
#define NEW_NODE(arg__, io__) \
auto* arg__ = \
pattern->NewNode(arg__##_repr())->assert_is_op_##io__("gru", #arg__);
NEW_NODE(gru, Weight, input); NEW_NODE(Weight, input);
// TODO(Superjomn): upgrade the fuse framework to support optional. // TODO(Superjomn): upgrade the fuse framework to support optional.
// H0 and bias are optional // H0 and bias are optional
NEW_NODE(gru, Bias, input); // also optional NEW_NODE(Bias, input); // also optional
// NEW_NODE(H0, input); // NEW_NODE(H0, input);
NEW_NODE(gru, Hidden, output); NEW_NODE(Hidden, output);
// below are intermediate // below are intermediate
NEW_NODE(gru, BatchGate, output); NEW_NODE(BatchGate, output);
NEW_NODE(gru, BatchResetHiddenPrev, output); NEW_NODE(BatchResetHiddenPrev, output);
NEW_NODE(gru, BatchHidden, output); NEW_NODE(BatchHidden, output);
#undef NEW_NODE
BatchGate->AsIntermediate(); BatchGate->AsIntermediate();
BatchResetHiddenPrev->AsIntermediate(); BatchResetHiddenPrev->AsIntermediate();
...@@ -607,7 +647,6 @@ PDNode* patterns::GRU(PDPattern* pattern, const std::string& name_scope, ...@@ -607,7 +647,6 @@ PDNode* patterns::GRU(PDPattern* pattern, const std::string& name_scope,
gru_op->LinksTo({Hidden, BatchGate, BatchResetHiddenPrev, BatchHidden}); gru_op->LinksTo({Hidden, BatchGate, BatchResetHiddenPrev, BatchHidden});
return Hidden; return Hidden;
} }
#undef NEW_NODE
} // namespace ir } // namespace ir
} // namespace framework } // namespace framework
......
...@@ -286,22 +286,170 @@ void GraphSafeRemoveNodes(Graph* graph, ...@@ -286,22 +286,170 @@ void GraphSafeRemoveNodes(Graph* graph,
const std::unordered_set<const Node*>& nodes); const std::unordered_set<const Node*>& nodes);
// Some pre-defined patterns those can be reused in multiple passes. // Some pre-defined patterns those can be reused in multiple passes.
// Each related Fluid layer or op should be defined as one pattern here so
// that it can be reused across different fusions.
namespace patterns { namespace patterns {
// Singleton table of per-key counters, used to hand out increasing ids.
// No locking is performed inside this struct.
struct KeyCounter {
  // Returns the single shared KeyCounter object.
  static KeyCounter& Instance() {
    static KeyCounter instance;
    return instance;
  }

  // Returns the current count stored for `key` (0 the first time a key is
  // seen) and then advances that count by one.
  int IncCounter(const std::string& key) {
    size_t& slot = dic_[key];  // operator[] value-initializes new keys to 0.
    const size_t previous = slot;
    ++slot;
    return previous;
  }

 private:
  // key -> next value to hand out for that key.
  std::unordered_map<std::string, size_t> dic_;
};
// Generate a PDNode's name from name_scope, repr, id and name.
// The format is {name_scope}/{repr}/{id}/{name}
//
// Unlike the two-argument overload, this overload does not touch KeyCounter:
// the id is passed in by the caller, so the result is only as unique as the
// (name_scope, repr, id, name) combination that the caller supplies.
static std::string PDNodeName(const std::string& name_scope,
const std::string& repr, size_t id,
const std::string& name) {
return string::Sprintf("%s/%s/%d/%s", name_scope, repr, id, name);
}
// Generate a unique PDNode's name.
// The format is {name_scope}/{repr}/{id}
//
// The id is drawn from KeyCounter using `repr` as the key, so every call
// advances the counter for `repr` and two calls with the same arguments
// return different names.
static std::string PDNodeName(const std::string& name_scope,
const std::string& repr) {
return string::Sprintf("%s/%s/%d", name_scope, repr,
KeyCounter::Instance().IncCounter(repr));
}
// Generate a unique key. It can be used for a universally unique temporary
// name.
// The format is {repr}/{id}
//
// The id comes from KeyCounter keyed by `repr`; each call advances that
// counter, so repeated calls with the same `repr` return different keys.
static std::string UniqueKey(const std::string& repr) {
return string::Sprintf("%s/%d", repr,
KeyCounter::Instance().IncCounter(repr));
}
// Declare a PDNode in a pattern, will create two methods:
// std::string xxx_repr(); return this PDNode's string id.
// PDNode* xxx_n(); return the corresponding PDNode.
//
// The macro must be expanded inside a class that provides the members
// `name_scope_`, `repr_`, `id_` and `pattern` (PatternBase declares all four).
// xxx_repr() builds the name with the four-argument PDNodeName, using the
// macro argument spelled as a string for the last component; xxx_n() looks
// that name up through pattern->RetrieveNode().
// NOTE(review): what RetrieveNode returns for a name that was never
// registered is not visible here -- confirm before relying on it.
#define PATTERN_DECL_NODE(name__) \
std::string name__##_repr() const { \
return PDNodeName(name_scope_, repr_, id_, #name__); \
} \
PDNode* name__##_n() const { return pattern->RetrieveNode(name__##_repr()); }
// Get an ir::Node* from the matched subgraph.
// var: variable.
// arg: the argument declared by PATTERN_DECL_NODE in a pattern definition.
// pat: the pattern object.
//
// Usage notes:
// - The expansion refers to an identifier named `subgraph`, so the macro can
//   only be used where such a variable is in scope (for example inside a
//   pattern-detector handler).
// - It expands to several statements and declares `Node* var` in the current
//   scope; it is not wrapped in a block, so it cannot be used as the sole
//   body of an unbraced `if`/`for`.
// - The last statement has no trailing semicolon; the caller supplies it.
// - Both the presence of the PDNode in `subgraph` and the non-nullness of the
//   mapped node are checked with PADDLE_ENFORCE.
#define GET_IR_NODE_FROM_SUBGRAPH(var, arg, pat) \
PADDLE_ENFORCE(subgraph.count(pat.arg##_n()), \
"Node not found for PDNode %s", pat.arg##_repr()); \
Node* var = subgraph.at(pat.arg##_n()); \
PADDLE_ENFORCE(var, "node %s not exists in the sub-graph", #arg)
// The base class of all the patterns.
//
// It stores the pieces that PATTERN_DECL_NODE needs in order to build node
// names: the owning PDPattern, a name scope, a short pattern kind string
// (`repr`) and an instance id.
struct PatternBase {
// pattern:    the PDPattern whose nodes are looked up by the generated
//             xxx_n() accessors.
// name_scope: first component of every generated node name.
// repr:       second component of every generated node name; also the
//             KeyCounter key from which this instance's id is drawn.
PatternBase(PDPattern* pattern, const std::string& name_scope,
const std::string& repr)
: pattern(pattern),
name_scope_(name_scope),
repr_(repr),
// Each construction advances the KeyCounter entry for `repr`, so two
// pattern objects created with the same repr receive different ids.
id_(KeyCounter::Instance().IncCounter(repr)) {}
// Stored as a raw pointer; this struct never deletes it.
PDPattern* pattern;
protected:
std::string name_scope_;
std::string repr_;
size_t id_;
};
// CONV with ReLU
// op: conv + relu
// named nodes:
// conv_input, conv_weight,
// conv_bias, conv_out, conv,
// relu_out, relu
//
// NOTE: conv_input is passed to operator() by the caller; unlike the other
// names listed above, no conv_input_repr()/conv_input_n() accessor is
// declared in this struct.
struct ConvReLU : public PatternBase {
// Registers this pattern under the repr "conv_relu".
ConvReLU(PDPattern* pattern, const std::string& name_scope)
: PatternBase(pattern, name_scope, "conv_relu") {}
// Defined out of line; takes the PDNode to use as the conv input.
PDNode* operator()(PDNode* conv_input);
// declare operator node's name
PATTERN_DECL_NODE(conv);
PATTERN_DECL_NODE(relu);
// declare variable node's name
PATTERN_DECL_NODE(conv_weight);
PATTERN_DECL_NODE(conv_bias);
PATTERN_DECL_NODE(conv_out);
PATTERN_DECL_NODE(relu_out);
};
// FC with bias // FC with bias
// op: mul + elementwise_add // op: mul + elementwise_add
// named nodes: // named nodes:
// mul, elementwise_add // mul, elementwise_add
// w, mul_out, bias, fc_out // w, mul_out, bias, fc_out
PDNode* FC(PDPattern* pattern, const std::string& name_scope, PDNode* x, struct FC : public PatternBase {
bool with_bias); FC(PDPattern* pattern, const std::string& name_scope)
: PatternBase(pattern, name_scope, "fc") {}
PDNode* operator()(PDNode* x, bool with_bias);
// declare operator node's name
PATTERN_DECL_NODE(fc);
PATTERN_DECL_NODE(mul);
PATTERN_DECL_NODE(elementwise_add);
// declare variable node's name
PATTERN_DECL_NODE(w);
PATTERN_DECL_NODE(mul_out); // (x,w) -> mul_out
PATTERN_DECL_NODE(bias);
PATTERN_DECL_NODE(Out);
};
struct LSTM : public PatternBase {
LSTM(PDPattern* pattern, const std::string& name_scope)
: PatternBase(pattern, name_scope, "lstm") {}
PDNode* operator()(PDNode* x);
PDNode* LSTM(PDPattern* pattern, const std::string& name_scope, PDNode* x); // Operators
PATTERN_DECL_NODE(lstm);
// Inputs
PATTERN_DECL_NODE(Input);
PATTERN_DECL_NODE(H0);
PATTERN_DECL_NODE(C0);
PATTERN_DECL_NODE(Weight);
PATTERN_DECL_NODE(Bias);
// Outputs
PATTERN_DECL_NODE(Hidden);
PATTERN_DECL_NODE(Cell);
PATTERN_DECL_NODE(BatchGate);
PATTERN_DECL_NODE(BatchCellPreAct);
};
PDNode* GRU(PDPattern* pattern, const std::string& name_scope, PDNode* x); struct GRU : public PatternBase {
// Registers this pattern under the repr "gru". The repr is the second
// component of every node name generated by PATTERN_DECL_NODE and is also the
// KeyCounter key for the instance id, so it should identify the pattern kind
// (it was previously "lstm", copied from the LSTM pattern).
GRU(PDPattern* pattern, const std::string& name_scope)
    : PatternBase(pattern, name_scope, "gru") {}
PDNode* operator()(PDNode* x);
// Operators
PATTERN_DECL_NODE(gru);
// Inputs
PATTERN_DECL_NODE(Bias);
PATTERN_DECL_NODE(Weight);
// Outputs
PATTERN_DECL_NODE(BatchGate);
PATTERN_DECL_NODE(BatchResetHiddenPrev);
PATTERN_DECL_NODE(BatchHidden);
PATTERN_DECL_NODE(Hidden);
};
} // namespace patterns } // namespace patterns
// Link two ir::Nodes from each other.
#define IR_NODE_LINK_TO(a, b) \ #define IR_NODE_LINK_TO(a, b) \
a->outputs.push_back(b); \ a->outputs.push_back(b); \
b->inputs.push_back(a); b->inputs.push_back(a);
......
...@@ -192,6 +192,8 @@ std::unique_ptr<ir::Graph> SeqConcatFcFusePass::ApplyImpl( ...@@ -192,6 +192,8 @@ std::unique_ptr<ir::Graph> SeqConcatFcFusePass::ApplyImpl(
auto* id = subgraph.at(pattern.RetrieveNode(#id)); \ auto* id = subgraph.at(pattern.RetrieveNode(#id)); \
PADDLE_ENFORCE_NOT_NULL(id, "subgraph has no node %s", #id); PADDLE_ENFORCE_NOT_NULL(id, "subgraph has no node %s", #id);
int fuse_count{0};
detector(graph.get(), [&](const GraphPatternDetector::subgraph_t& subgraph, detector(graph.get(), [&](const GraphPatternDetector::subgraph_t& subgraph,
Graph* graph) { Graph* graph) {
VLOG(4) << "get one concat pattern"; VLOG(4) << "get one concat pattern";
...@@ -239,8 +241,12 @@ std::unique_ptr<ir::Graph> SeqConcatFcFusePass::ApplyImpl( ...@@ -239,8 +241,12 @@ std::unique_ptr<ir::Graph> SeqConcatFcFusePass::ApplyImpl(
marked_nodes.erase(sequence_expand1_in); marked_nodes.erase(sequence_expand1_in);
marked_nodes.erase(fc_out); marked_nodes.erase(fc_out);
GraphSafeRemoveNodes(graph, marked_nodes); GraphSafeRemoveNodes(graph, marked_nodes);
++fuse_count;
}); });
AddStatis(fuse_count);
return graph; return graph;
} }
......
...@@ -21,6 +21,7 @@ limitations under the License. */ ...@@ -21,6 +21,7 @@ limitations under the License. */
#include "paddle/fluid/framework/framework.pb.h" #include "paddle/fluid/framework/framework.pb.h"
#include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/lod_tensor.h"
#include "paddle/fluid/framework/var_type.h" #include "paddle/fluid/framework/var_type.h"
#include "paddle/fluid/framework/version.h"
#include "paddle/fluid/memory/memcpy.h" #include "paddle/fluid/memory/memcpy.h"
#include "paddle/fluid/memory/memory.h" #include "paddle/fluid/memory/memory.h"
...@@ -251,8 +252,8 @@ void AppendLoD(LoD *lod, const LoD &lod_length) { ...@@ -251,8 +252,8 @@ void AppendLoD(LoD *lod, const LoD &lod_length) {
void SerializeToStream(std::ostream &os, const LoDTensor &tensor, void SerializeToStream(std::ostream &os, const LoDTensor &tensor,
const platform::DeviceContext &dev_ctx) { const platform::DeviceContext &dev_ctx) {
{ // the 1st field, uint32_t version for LoDTensor { // the 1st field, uint32_t version for LoDTensor
constexpr uint32_t version = 0; os.write(reinterpret_cast<const char *>(&kCurTensorVersion),
os.write(reinterpret_cast<const char *>(&version), sizeof(version)); sizeof(kCurTensorVersion));
} }
{ {
// the 2st field, LoD information // the 2st field, LoD information
...@@ -281,6 +282,8 @@ void DeserializeFromStream(std::istream &is, LoDTensor *tensor, ...@@ -281,6 +282,8 @@ void DeserializeFromStream(std::istream &is, LoDTensor *tensor,
// the 1st field, unit32_t version for LoDTensor // the 1st field, unit32_t version for LoDTensor
uint32_t version; uint32_t version;
is.read(reinterpret_cast<char *>(&version), sizeof(version)); is.read(reinterpret_cast<char *>(&version), sizeof(version));
PADDLE_ENFORCE(framework::IsTensorVersionSupported(version),
"tensor version %u is not supported.", version);
PADDLE_ENFORCE_EQ(version, 0U, "Only version 0 is supported"); PADDLE_ENFORCE_EQ(version, 0U, "Only version 0 is supported");
} }
{ {
......
...@@ -464,35 +464,35 @@ class RuntimeInferShapeContext : public InferShapeContext { ...@@ -464,35 +464,35 @@ class RuntimeInferShapeContext : public InferShapeContext {
: op_(op), scope_(scope) {} : op_(op), scope_(scope) {}
bool HasInput(const std::string& name) const override { bool HasInput(const std::string& name) const override {
if (!op_.HasInputs(name)) { // has only one input
const auto& ins = op_.Inputs();
auto it = ins.find(name);
if (it == ins.end()) {
return false; return false;
} }
auto& ins = Inputs(name); const auto& in = it->second;
size_t length = ins.size(); if (in.size() == 0 || in[0] == kEmptyVarName) {
if (length == 0) {
return false; return false;
} }
PADDLE_ENFORCE_EQ(length, 1UL, PADDLE_ENFORCE_EQ(in.size(), 1UL,
"Input %s should not have more than one inputs", name); "Input %s should not have more than one inputs", name);
auto ipt = ins[0]; return scope_.FindVar(in[0]) != nullptr;
auto* var = ipt == kEmptyVarName ? nullptr : scope_.FindVar(ipt);
return var != nullptr;
} }
bool HasOutput(const std::string& name) const override { bool HasOutput(const std::string& name) const override {
if (!op_.HasOutputs(name)) { // has only one output
const auto& outs = op_.Outputs();
auto it = outs.find(name);
if (it == outs.end()) {
return false; return false;
} }
auto& outs = Outputs(name); const auto& out = it->second;
size_t length = outs.size(); if (out.size() == 0 || out[0] == kEmptyVarName) {
if (length == 0) {
return false; return false;
} }
PADDLE_ENFORCE_EQ(length, 1UL, PADDLE_ENFORCE_EQ(out.size(), 1UL,
"Output %s should not have more than one inputs", name); "Output %s should not have more than one outputs", name);
auto ipt = outs[0]; return scope_.FindVar(out[0]) != nullptr;
auto* var = ipt == kEmptyVarName ? nullptr : scope_.FindVar(ipt);
return var != nullptr;
} }
bool HasInputs(const std::string& name) const override { bool HasInputs(const std::string& name) const override {
......
...@@ -352,7 +352,10 @@ void ParallelExecutor::FeedAndSplitTensorIntoLocalScopes( ...@@ -352,7 +352,10 @@ void ParallelExecutor::FeedAndSplitTensorIntoLocalScopes(
ParallelExecutor::~ParallelExecutor() { ParallelExecutor::~ParallelExecutor() {
if (member_->own_local_scope_) { if (member_->own_local_scope_) {
for (size_t i = 1; i < member_->local_scopes_.size(); ++i) { for (size_t i = 1; i < member_->local_scopes_.size(); ++i) {
member_->global_scope_->DeleteScope(member_->local_scopes_[i]); Scope *local_scope = member_->local_scopes_[i];
if (member_->global_scope_->HasKid(local_scope)) {
member_->global_scope_->DeleteScope(local_scope);
}
} }
} }
} }
......
...@@ -15,6 +15,7 @@ limitations under the License. */ ...@@ -15,6 +15,7 @@ limitations under the License. */
#include "paddle/fluid/framework/program_desc.h" #include "paddle/fluid/framework/program_desc.h"
#include "paddle/fluid/framework/block_desc.h" #include "paddle/fluid/framework/block_desc.h"
#include "paddle/fluid/framework/feed_fetch_type.h" #include "paddle/fluid/framework/feed_fetch_type.h"
#include "paddle/fluid/framework/version.h"
namespace paddle { namespace paddle {
namespace framework { namespace framework {
...@@ -38,7 +39,10 @@ proto::ProgramDesc *ProgramDesc::Proto() { ...@@ -38,7 +39,10 @@ proto::ProgramDesc *ProgramDesc::Proto() {
return &desc_; return &desc_;
} }
int64_t ProgramDesc::Version() const { return desc_.version().version(); }
ProgramDesc::ProgramDesc() { ProgramDesc::ProgramDesc() {
desc_.mutable_version()->set_version(kCurProgramVersion);
auto *block = desc_.mutable_blocks()->Add(); auto *block = desc_.mutable_blocks()->Add();
block->set_idx(kRootBlockIndex); block->set_idx(kRootBlockIndex);
block->set_parent_idx(kNoneBlockIndex); block->set_parent_idx(kNoneBlockIndex);
......
...@@ -57,6 +57,8 @@ class ProgramDesc { ...@@ -57,6 +57,8 @@ class ProgramDesc {
proto::ProgramDesc *Proto(); proto::ProgramDesc *Proto();
int64_t Version() const;
// The output variable of feed_op is referenced as feed_target. // The output variable of feed_op is referenced as feed_target.
// This function is used to collect the output variable's name of all // This function is used to collect the output variable's name of all
// feed_ops. // feed_ops.
......
...@@ -87,8 +87,17 @@ TEST(ProgramDesc, copy_ctor) { ...@@ -87,8 +87,17 @@ TEST(ProgramDesc, copy_ctor) {
ASSERT_EQ(op_origin->Inputs(), op_copy->Inputs()); ASSERT_EQ(op_origin->Inputs(), op_copy->Inputs());
ASSERT_EQ(op_origin->Outputs(), op_copy->Outputs()); ASSERT_EQ(op_origin->Outputs(), op_copy->Outputs());
ASSERT_EQ(op_copy->Proto()->SerializeAsString(), ASSERT_EQ(op_origin->Proto()->attrs().size(),
op_origin->Proto()->SerializeAsString()); op_copy->Proto()->attrs().size());
for (auto it = op_origin->Proto()->attrs().begin();
it != op_origin->Proto()->attrs().end(); ++it) {
for (auto it_2 = op_copy->Proto()->attrs().begin();
it_2 != op_copy->Proto()->attrs().end(); ++it_2) {
if (it->name() == it_2->name()) {
ASSERT_TRUE(it_2->SerializeAsString() == it->SerializeAsString());
}
}
}
if (op->Type() == "op_with_subblock") { if (op->Type() == "op_with_subblock") {
ASSERT_EQ(1, op->GetBlockAttrId("sub_block")); ASSERT_EQ(1, op->GetBlockAttrId("sub_block"));
......
...@@ -56,5 +56,76 @@ struct RWLock { ...@@ -56,5 +56,76 @@ struct RWLock {
}; };
#endif #endif
// Scoped helper around an RWLock. It remembers which mode (if any) it
// currently holds and releases the lock in its destructor.
//
// The guard does not own the RWLock; it only stores the pointer it was given.
// Switching directly between read and write mode is rejected with
// PADDLE_THROW: call UnLock() first.
class RWLockGuard {
 public:
  // Mode currently held by this guard.
  enum Status { kUnLock, kWRLock, kRDLock };

  // Wraps `rw_lock` and acquires it in `init_status` mode. Passing kUnLock
  // creates the guard without locking anything.
  RWLockGuard(RWLock* rw_lock, Status init_status)
      : lock_(rw_lock), status_(Status::kUnLock) {
    switch (init_status) {
      case Status::kRDLock: {
        RDLock();
        break;
      }
      case Status::kWRLock: {
        WRLock();
        break;
      }
      case Status::kUnLock: {
        break;
      }
    }
  }

  // Copying is disabled: a copy would carry the same lock pointer and status,
  // and both objects would then call UNLock() on the same lock.
  RWLockGuard(const RWLockGuard&) = delete;
  RWLockGuard& operator=(const RWLockGuard&) = delete;

  // Acquires the write lock if nothing is held. Does nothing if the write
  // lock is already held by this guard. Throws (PADDLE_THROW) if the guard
  // currently holds the read lock.
  void WRLock() {
    switch (status_) {
      case Status::kUnLock: {
        lock_->WRLock();
        status_ = Status::kWRLock;
        break;
      }
      case Status::kWRLock: {
        break;
      }
      case Status::kRDLock: {
        PADDLE_THROW(
            "Please unlock read lock first before invoking write lock.");
        break;
      }
    }
  }

  // Acquires the read lock if nothing is held. Does nothing if the read lock
  // is already held by this guard. Throws (PADDLE_THROW) if the guard
  // currently holds the write lock.
  void RDLock() {
    switch (status_) {
      case Status::kUnLock: {
        lock_->RDLock();
        status_ = Status::kRDLock;
        break;
      }
      case Status::kRDLock: {
        break;
      }
      case Status::kWRLock: {
        PADDLE_THROW(
            "Please unlock write lock first before invoking read lock.");
        break;
      }
    }
  }

  // Releases whichever mode is held; a no-op when nothing is held, so it is
  // safe to call more than once.
  void UnLock() {
    if (status_ != Status::kUnLock) {
      lock_->UNLock();
      status_ = Status::kUnLock;
    }
  }

  // Releases the lock if it is still held.
  ~RWLockGuard() { UnLock(); }

 private:
  RWLock* lock_;   // Not owned.
  Status status_;  // Mode currently held through this guard.
};
} // namespace framework } // namespace framework
} // namespace paddle } // namespace paddle
...@@ -72,6 +72,12 @@ void Scope::DropKids() { ...@@ -72,6 +72,12 @@ void Scope::DropKids() {
kids_.clear(); kids_.clear();
} }
// Returns true when `scope` is one of the entries of this scope's kids_
// container. Only kids_ itself is searched, and mutex_ is held for the whole
// search.
bool Scope::HasKid(const Scope* scope) const {
  std::unique_lock<std::mutex> lock(mutex_);
  for (const auto& kid : this->kids_) {
    if (kid == scope) {
      return true;
    }
  }
  return false;
}
std::vector<std::string> Scope::LocalVarNames() const { std::vector<std::string> Scope::LocalVarNames() const {
std::unique_lock<std::mutex> lock(mutex_); std::unique_lock<std::mutex> lock(mutex_);
std::vector<std::string> known_vars; std::vector<std::string> known_vars;
......
...@@ -71,6 +71,9 @@ class Scope { ...@@ -71,6 +71,9 @@ class Scope {
/// Drop all kids scopes belonged to this scope. /// Drop all kids scopes belonged to this scope.
void DropKids(); void DropKids();
/// Find if a scope exists in the kid scopes
bool HasKid(const Scope* scope) const;
// enumerate all the variables current contains. // enumerate all the variables current contains.
std::vector<std::string> LocalVarNames() const; std::vector<std::string> LocalVarNames() const;
......
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/framework/version.h"
#include <algorithm>
namespace paddle {
namespace framework {
// Returns true when `version` is listed in kSupportedProgramVersion.
bool IsProgramVersionSupported(int64_t version) {
  for (const int64_t supported : kSupportedProgramVersion) {
    if (supported == version) {
      return true;
    }
  }
  return false;
}
// Returns true when `version` is listed in kSupportedTensorVersion.
bool IsTensorVersionSupported(uint32_t version) {
  for (const uint32_t supported : kSupportedTensorVersion) {
    if (supported == version) {
      return true;
    }
  }
  return false;
}
} // namespace framework
} // namespace paddle
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <cstdint>
#pragma once
namespace paddle {
namespace framework {
// Note:
// Any Program or Tensor whose version passes the corresponding
// IsXXXVersionSupported check must be supported by the current code.
// Otherwise, it is a compatibility bug.
// The program version the current codes generate.
constexpr int64_t kCurProgramVersion = 0;
// The program version that was generated by previous or current codes
// and supported by current codes.
constexpr int64_t kSupportedProgramVersion[] = {0};
// Due to historical reasons, tensor version use uint32_t.
// The tensor version the current codes generate.
constexpr uint32_t kCurTensorVersion = 0;
// The tensor version that was generated by previous or current codes
// and supported by current codes.
constexpr uint32_t kSupportedTensorVersion[] = {0};
bool IsProgramVersionSupported(int64_t version);
bool IsTensorVersionSupported(uint32_t version);
} // namespace framework
} // namespace paddle
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/framework/version.h"
#include "gtest/gtest.h"
namespace paddle {
namespace framework {
// Checks the version predicates against one accepted value (0) and two
// rejected values (1 and -1) each.
TEST(Version, Basic) {
// Program versions are int64_t, so -1 is passed through as a negative value.
EXPECT_TRUE(IsProgramVersionSupported(0));
EXPECT_FALSE(IsProgramVersionSupported(1));
EXPECT_FALSE(IsProgramVersionSupported(-1));
// IsTensorVersionSupported takes uint32_t, so the -1 below is implicitly
// converted to an unsigned value before the lookup.
EXPECT_TRUE(IsTensorVersionSupported(0));
EXPECT_FALSE(IsTensorVersionSupported(1));
EXPECT_FALSE(IsTensorVersionSupported(-1));
}
} // namespace framework
} // namespace paddle
...@@ -55,6 +55,7 @@ if(NOT APPLE) ...@@ -55,6 +55,7 @@ if(NOT APPLE)
endif() endif()
if(WITH_TESTING) if(WITH_TESTING)
# both tests/book and analysis depends the models that generated by python/paddle/fluid/tests/book # tests/book depends the models that generated by python/paddle/fluid/tests/book
add_subdirectory(tests/book) add_subdirectory(tests/book)
add_subdirectory(tests/api)
endif() endif()
cc_library(ir_pass_manager SRCS ir_pass_manager.cc DEPS graph pass) cc_library(ir_pass_manager SRCS ir_pass_manager.cc DEPS graph pass)
set(analysis_deps set(analysis_deps
framework_proto proto_desc ir_pass_manager graph pass paddle_fluid_api executor) framework_proto proto_desc ir_pass_manager graph pass paddle_fluid_api executor pretty_log)
cc_library(analysis SRCS pass_manager.cc node.cc data_flow_graph.cc graph_traits.cc subgraph_splitter.cc cc_library(analysis SRCS pass_manager.cc node.cc data_flow_graph.cc graph_traits.cc subgraph_splitter.cc
analyzer.cc analyzer.cc
...@@ -40,27 +40,7 @@ function (inference_analysis_test TARGET) ...@@ -40,27 +40,7 @@ function (inference_analysis_test TARGET)
endif(WITH_TESTING) endif(WITH_TESTING)
endfunction(inference_analysis_test) endfunction(inference_analysis_test)
function (inference_download_and_uncompress install_dir url gz_filename) inference_analysis_test(test_analyzer SRCS analyzer_tester.cc EXTRA_DEPS paddle_inference_api)
message(STATUS "Download inference test stuff ${gz_filename} from ${url}")
execute_process(COMMAND bash -c "mkdir -p ${install_dir}")
execute_process(COMMAND bash -c "cd ${install_dir} && wget -q ${url}")
execute_process(COMMAND bash -c "cd ${install_dir} && tar xzf ${gz_filename}")
message(STATUS "finish downloading ${gz_filename}")
endfunction(inference_download_and_uncompress)
# Ditu RNN model and data archives consumed by test_analyzer.
set(DITU_RNN_MODEL_URL "http://paddle-inference-dist.bj.bcebos.com/ditu_rnn_fluid%2Fmodel.tar.gz")
set(DITU_RNN_DATA_URL "http://paddle-inference-dist.bj.bcebos.com/ditu_rnn_fluid%2Fdata.txt.tar.gz")
# FORCE overwrites any previously cached value of DITU_INSTALL_DIR.
set(DITU_INSTALL_DIR "${THIRD_PARTY_PATH}/inference_demo/ditu_rnn" CACHE PATH "Ditu RNN model and data root." FORCE)
# Download only when testing is enabled and the install dir does not exist yet.
# NOTE(review): an existing but incomplete directory is never re-downloaded.
if (NOT EXISTS ${DITU_INSTALL_DIR} AND WITH_TESTING)
inference_download_and_uncompress(${DITU_INSTALL_DIR} ${DITU_RNN_MODEL_URL} "ditu_rnn_fluid%2Fmodel.tar.gz")
inference_download_and_uncompress(${DITU_INSTALL_DIR} ${DITU_RNN_DATA_URL} "ditu_rnn_fluid%2Fdata.txt.tar.gz")
endif()
# The model directory and data file are handed to the test binary as flags.
inference_analysis_test(test_analyzer SRCS analyzer_tester.cc
EXTRA_DEPS paddle_inference_api paddle_fluid_api ir_pass_manager analysis_predictor
ARGS --infer_ditu_rnn_model=${DITU_INSTALL_DIR}/model
--infer_ditu_rnn_data=${DITU_INSTALL_DIR}/data.txt)
inference_analysis_test(test_data_flow_graph SRCS data_flow_graph_tester.cc) inference_analysis_test(test_data_flow_graph SRCS data_flow_graph_tester.cc)
inference_analysis_test(test_data_flow_graph_to_fluid_pass SRCS data_flow_graph_to_fluid_pass_tester.cc) inference_analysis_test(test_data_flow_graph_to_fluid_pass SRCS data_flow_graph_to_fluid_pass_tester.cc)
inference_analysis_test(test_fluid_to_ir_pass SRCS fluid_to_ir_pass_tester.cc) inference_analysis_test(test_fluid_to_ir_pass SRCS fluid_to_ir_pass_tester.cc)
...@@ -71,46 +51,3 @@ inference_analysis_test(test_tensorrt_subgraph_pass SRCS tensorrt_subgraph_pass_ ...@@ -71,46 +51,3 @@ inference_analysis_test(test_tensorrt_subgraph_pass SRCS tensorrt_subgraph_pass_
inference_analysis_test(test_pass_manager SRCS pass_manager_tester.cc) inference_analysis_test(test_pass_manager SRCS pass_manager_tester.cc)
inference_analysis_test(test_tensorrt_subgraph_node_mark_pass SRCS tensorrt_subgraph_node_mark_pass_tester.cc) inference_analysis_test(test_tensorrt_subgraph_node_mark_pass SRCS tensorrt_subgraph_node_mark_pass_tester.cc)
inference_analysis_test(test_model_store_pass SRCS model_store_pass_tester.cc) inference_analysis_test(test_model_store_pass SRCS model_store_pass_tester.cc)
# --- Chinese NER -------------------------------------------------------------
# Model and data archives consumed by test_analyzer_ner.
set(CHINESE_NER_MODEL_URL "http://paddle-inference-dist.bj.bcebos.com/chinese_ner_model.tar.gz")
set(CHINESE_NER_DATA_URL "http://paddle-inference-dist.bj.bcebos.com/chinese_ner-data.txt.tar.gz")
set(CHINESE_NER_INSTALL_DIR "${THIRD_PARTY_PATH}/inference_demo/chinese_ner" CACHE PATH "Chinese ner model and data root." FORCE)
# Download only when the install dir is missing and both WITH_TESTING and
# WITH_INFERENCE are set.
if (NOT EXISTS ${CHINESE_NER_INSTALL_DIR} AND WITH_TESTING AND WITH_INFERENCE)
inference_download_and_uncompress(${CHINESE_NER_INSTALL_DIR} ${CHINESE_NER_MODEL_URL} "chinese_ner_model.tar.gz")
inference_download_and_uncompress(${CHINESE_NER_INSTALL_DIR} ${CHINESE_NER_DATA_URL} "chinese_ner-data.txt.tar.gz")
endif()
inference_analysis_test(test_analyzer_ner SRCS analyzer_ner_tester.cc
EXTRA_DEPS paddle_inference_api paddle_fluid_api analysis_predictor
ARGS --infer_model=${CHINESE_NER_INSTALL_DIR}/model
--infer_data=${CHINESE_NER_INSTALL_DIR}/data.txt)
# --- LAC ---------------------------------------------------------------------
# Model and data archives consumed by test_analyzer_lac.
set(LAC_MODEL_URL "http://paddle-inference-dist.bj.bcebos.com/lac_model.tar.gz")
set(LAC_DATA_URL "http://paddle-inference-dist.bj.bcebos.com/lac_data.txt.tar.gz")
set(LAC_INSTALL_DIR "${THIRD_PARTY_PATH}/inference_demo/lac" CACHE PATH "LAC model and data root." FORCE)
# Same download guard as above.
if (NOT EXISTS ${LAC_INSTALL_DIR} AND WITH_TESTING AND WITH_INFERENCE)
inference_download_and_uncompress(${LAC_INSTALL_DIR} ${LAC_MODEL_URL} "lac_model.tar.gz")
inference_download_and_uncompress(${LAC_INSTALL_DIR} ${LAC_DATA_URL} "lac_data.txt.tar.gz")
endif()
inference_analysis_test(test_analyzer_lac SRCS analyzer_lac_tester.cc
EXTRA_DEPS paddle_inference_api paddle_fluid_api ir_pass_manager analysis_predictor
ARGS --infer_model=${LAC_INSTALL_DIR}/model
--infer_data=${LAC_INSTALL_DIR}/data.txt)
# --- Text classification -----------------------------------------------------
# Model and data archives consumed by test_text_classification.
set(TEXT_CLASSIFICATION_MODEL_URL "http://paddle-inference-dist.bj.bcebos.com/text-classification-Senta.tar.gz")
set(TEXT_CLASSIFICATION_DATA_URL "http://paddle-inference-dist.bj.bcebos.com/text_classification_data.txt.tar.gz")
set(TEXT_CLASSIFICATION_INSTALL_DIR "${THIRD_PARTY_PATH}/inference_demo/text_classification" CACHE PATH "Text Classification model and data root." FORCE)
# Same download guard as above.
if (NOT EXISTS ${TEXT_CLASSIFICATION_INSTALL_DIR} AND WITH_TESTING AND WITH_INFERENCE)
inference_download_and_uncompress(${TEXT_CLASSIFICATION_INSTALL_DIR} ${TEXT_CLASSIFICATION_MODEL_URL} "text-classification-Senta.tar.gz")
inference_download_and_uncompress(${TEXT_CLASSIFICATION_INSTALL_DIR} ${TEXT_CLASSIFICATION_DATA_URL} "text_classification_data.txt.tar.gz")
endif()
# Unlike the two tests above, the model path here is the
# text-classification-Senta sub-directory rather than "model".
inference_analysis_test(test_text_classification SRCS analyzer_text_classification_tester.cc
EXTRA_DEPS paddle_inference_api paddle_fluid_api analysis_predictor
ARGS --infer_model=${TEXT_CLASSIFICATION_INSTALL_DIR}/text-classification-Senta
--infer_data=${TEXT_CLASSIFICATION_INSTALL_DIR}/data.txt
--topn=1 # Just run top 1 batch.
)
...@@ -72,6 +72,9 @@ class Analyzer : public OrderedRegistry<PassManager> { ...@@ -72,6 +72,9 @@ class Analyzer : public OrderedRegistry<PassManager> {
"mul_gru_fuse_pass", // "mul_gru_fuse_pass", //
"seq_concat_fc_fuse_pass", // "seq_concat_fc_fuse_pass", //
"fc_fuse_pass", // "fc_fuse_pass", //
#ifdef PADDLE_WITH_MKLDNN
"conv_relu_mkldnn_fuse_pass", //
#endif
}}; }};
std::unordered_set<std::string> disabled_ir_passes_; std::unordered_set<std::string> disabled_ir_passes_;
......
...@@ -16,21 +16,9 @@ ...@@ -16,21 +16,9 @@
#include <google/protobuf/text_format.h> #include <google/protobuf/text_format.h>
#include <gtest/gtest.h> #include <gtest/gtest.h>
#include <thread> // NOLINT
#include "paddle/fluid/framework/ir/fuse_pass_base.h"
#include "paddle/fluid/framework/ir/pass.h"
#include "paddle/fluid/inference/analysis/ut_helper.h" #include "paddle/fluid/inference/analysis/ut_helper.h"
#include "paddle/fluid/inference/api/analysis_predictor.h"
#include "paddle/fluid/inference/api/helper.h"
#include "paddle/fluid/inference/api/paddle_inference_api.h" #include "paddle/fluid/inference/api/paddle_inference_api.h"
#include "paddle/fluid/inference/api/paddle_inference_pass.h" #include "paddle/fluid/inference/api/paddle_inference_pass.h"
#include "paddle/fluid/inference/utils/singleton.h"
DEFINE_string(infer_ditu_rnn_model, "", "model path for ditu RNN");
DEFINE_string(infer_ditu_rnn_data, "", "data path for ditu RNN");
DEFINE_int32(batch_size, 10, "batch size.");
DEFINE_int32(repeat, 1, "Running the inference program repeat times.");
DEFINE_int32(num_threads, 1, "Running the inference program in multi-threads.");
namespace paddle { namespace paddle {
namespace inference { namespace inference {
...@@ -91,286 +79,8 @@ void TestWord2vecPrediction(const std::string &model_path) { ...@@ -91,286 +79,8 @@ void TestWord2vecPrediction(const std::string &model_path) {
} }
} }
namespace { TEST(Analyzer, word2vec_without_analysis) {
TestWord2vecPrediction(FLAGS_inference_model_dir);
struct DataRecord {
std::vector<std::vector<std::vector<float>>> link_step_data_all;
std::vector<std::vector<float>> week_data_all, minute_data_all;
std::vector<size_t> lod1, lod2, lod3;
std::vector<std::vector<float>> rnn_link_data, rnn_week_datas,
rnn_minute_datas;
size_t batch_iter{0};
size_t batch_size{1};
DataRecord() = default;
explicit DataRecord(const std::string &path, int batch_size = 1)
: batch_size(batch_size) {
Load(path);
}
DataRecord NextBatch() {
DataRecord data;
size_t batch_end = batch_iter + batch_size;
// NOTE skip the final batch, if no enough data is provided.
if (batch_end <= link_step_data_all.size()) {
data.link_step_data_all.assign(link_step_data_all.begin() + batch_iter,
link_step_data_all.begin() + batch_end);
data.week_data_all.assign(week_data_all.begin() + batch_iter,
week_data_all.begin() + batch_end);
data.minute_data_all.assign(minute_data_all.begin() + batch_iter,
minute_data_all.begin() + batch_end);
// Prepare LoDs
data.lod1.push_back(0);
data.lod2.push_back(0);
data.lod3.push_back(0);
CHECK(!data.link_step_data_all.empty()) << "empty";
CHECK(!data.week_data_all.empty());
CHECK(!data.minute_data_all.empty());
CHECK_EQ(data.link_step_data_all.size(), data.week_data_all.size());
CHECK_EQ(data.minute_data_all.size(), data.link_step_data_all.size());
for (size_t j = 0; j < data.link_step_data_all.size(); j++) {
for (const auto &d : data.link_step_data_all[j]) {
data.rnn_link_data.push_back(d);
}
data.rnn_week_datas.push_back(data.week_data_all[j]);
data.rnn_minute_datas.push_back(data.minute_data_all[j]);
// calculate lod
data.lod1.push_back(data.lod1.back() +
data.link_step_data_all[j].size());
data.lod3.push_back(data.lod3.back() + 1);
for (size_t i = 1; i < data.link_step_data_all[j].size() + 1; i++) {
data.lod2.push_back(data.lod2.back() +
data.link_step_data_all[j].size());
}
}
}
batch_iter += batch_size;
return data;
}
void Load(const std::string &path) {
std::ifstream file(path);
std::string line;
int num_lines = 0;
while (std::getline(file, line)) {
num_lines++;
std::vector<std::string> data;
split(line, ':', &data);
std::vector<std::vector<float>> link_step_data;
std::vector<std::string> link_datas;
split(data[0], '|', &link_datas);
for (auto &step_data : link_datas) {
std::vector<float> tmp;
split_to_float(step_data, ',', &tmp);
link_step_data.push_back(tmp);
}
// load week data
std::vector<float> week_data;
split_to_float(data[2], ',', &week_data);
// load minute data
std::vector<float> minute_data;
split_to_float(data[1], ',', &minute_data);
link_step_data_all.push_back(std::move(link_step_data));
week_data_all.push_back(std::move(week_data));
minute_data_all.push_back(std::move(minute_data));
}
}
};
void PrepareInputs(std::vector<PaddleTensor> *input_slots, DataRecord *data,
int batch_size) {
PaddleTensor lod_attention_tensor, init_zero_tensor, lod_tensor_tensor,
week_tensor, minute_tensor;
lod_attention_tensor.name = "data_lod_attention";
init_zero_tensor.name = "cell_init";
lod_tensor_tensor.name = "data";
week_tensor.name = "week";
minute_tensor.name = "minute";
auto one_batch = data->NextBatch();
std::vector<int> rnn_link_data_shape(
{static_cast<int>(one_batch.rnn_link_data.size()),
static_cast<int>(one_batch.rnn_link_data.front().size())});
lod_attention_tensor.shape.assign({1, 2});
lod_attention_tensor.lod.assign({one_batch.lod1, one_batch.lod2});
init_zero_tensor.shape.assign({batch_size, 15});
init_zero_tensor.lod.assign({one_batch.lod3});
lod_tensor_tensor.shape = rnn_link_data_shape;
lod_tensor_tensor.lod.assign({one_batch.lod1});
// clang-format off
week_tensor.shape.assign(
{static_cast<int>(one_batch.rnn_week_datas.size()),
static_cast<int>(one_batch.rnn_week_datas.front().size())});
week_tensor.lod.assign({one_batch.lod3});
minute_tensor.shape.assign(
{static_cast<int>(one_batch.rnn_minute_datas.size()),
static_cast<int>(one_batch.rnn_minute_datas.front().size())});
minute_tensor.lod.assign({one_batch.lod3});
// clang-format on
// assign data
TensorAssignData<float>(&lod_attention_tensor,
std::vector<std::vector<float>>({{0, 0}}));
std::vector<float> tmp_zeros(batch_size * 15, 0.);
TensorAssignData<float>(&init_zero_tensor, {tmp_zeros});
TensorAssignData<float>(&lod_tensor_tensor, one_batch.rnn_link_data);
TensorAssignData<float>(&week_tensor, one_batch.rnn_week_datas);
TensorAssignData<float>(&minute_tensor, one_batch.rnn_minute_datas);
// Set inputs.
auto init_zero_tensor1 = init_zero_tensor;
init_zero_tensor1.name = "hidden_init";
input_slots->assign({week_tensor, init_zero_tensor, minute_tensor,
init_zero_tensor1, lod_attention_tensor,
lod_tensor_tensor});
for (auto &tensor : *input_slots) {
tensor.dtype = PaddleDType::FLOAT32;
}
}
} // namespace
// Table of 80 float values, laid out as 10 rows of 8.
// NOTE(review): presumably the expected ditu RNN outputs for the first batch;
// nothing in the surrounding code reads this array - confirm before relying
// on it.
const float ditu_rnn_target_data[] = {
104.711, 11.2431, 1.35422, 0, 0, 0, 0, 0,
27.7039, 1.41486, 7.09526, 0, 0, 0, 0, 0,
7.6481, 6.5324, 56.383, 2.88018, 8.92918, 132.007, 4.27429, 2.02934,
14.1727, 10.7461, 25.0616, 16.0197, 14.4163, 16.9199, 6.75517, 0,
80.0249, 4.77739, 0, 0, 0, 0, 0, 0,
47.5643, 2.67029, 8.76252, 0, 0, 0, 0, 0,
51.8822, 4.4411, 0, 0, 0, 0, 0, 0,
10.7286, 12.0595, 10.6672, 0, 0, 0, 0, 0,
93.5771, 3.84641, 0, 0, 0, 0, 0, 0,
169.426, 0, 0, 0, 0, 0, 0, 0};
void CompareResult(const std::vector<PaddleTensor> &outputs,
const std::vector<PaddleTensor> &base_outputs) {
PADDLE_ENFORCE_GT(outputs.size(), 0);
PADDLE_ENFORCE_EQ(outputs.size(), base_outputs.size());
for (size_t i = 0; i < outputs.size(); i++) {
auto &out = outputs[i];
auto &base_out = base_outputs[i];
size_t size = std::accumulate(out.shape.begin(), out.shape.end(), 1,
[](int a, int b) { return a * b; });
size_t size1 = std::accumulate(base_out.shape.begin(), base_out.shape.end(),
1, [](int a, int b) { return a * b; });
PADDLE_ENFORCE_EQ(size, size1);
PADDLE_ENFORCE_GT(size, 0);
float *data = static_cast<float *>(out.data.data());
float *base_data = static_cast<float *>(base_out.data.data());
for (size_t i = 0; i < size; i++) {
EXPECT_NEAR(data[i], base_data[i], 1e-3);
}
}
}
// Test with a really complicated model: runs the ditu RNN model through an
// analysis predictor and compares its outputs with those of a native
// predictor built from the same config.
//
// use_analysis: only consulted, together with activate_ir, to decide whether
//               the fuse statistics are verified at the end; an analysis
//               predictor is created regardless of its value.
// activate_ir:  copied into AnalysisConfig::enable_ir_optim.
// num_threads:  1 runs the predictor on the calling thread; any other value
//               creates one analysis predictor per thread and runs them
//               concurrently.
void TestDituRNNPrediction(bool use_analysis, bool activate_ir,
                           int num_threads) {
  AnalysisConfig config;
  config.prog_file = FLAGS_infer_ditu_rnn_model + "/__model__";
  config.param_file = FLAGS_infer_ditu_rnn_model + "/param";
  config.use_gpu = false;
  config.device = 0;
  config.specify_input_name = true;
  config.enable_ir_optim = activate_ir;
  PADDLE_ENFORCE(config.ir_mode ==
                 AnalysisConfig::IrPassMode::kExclude);  // default
  config.ir_passes.clear();  // Do not exclude any pass.

  int batch_size = FLAGS_batch_size;
  int num_times = FLAGS_repeat;

  auto base_predictor =
      CreatePaddlePredictor<NativeConfig, PaddleEngineKind::kNative>(config);
  auto predictor =
      CreatePaddlePredictor<AnalysisConfig, PaddleEngineKind::kAnalysis>(
          config);
  std::vector<PaddleTensor> input_slots;
  DataRecord data(FLAGS_infer_ditu_rnn_data, batch_size);
  // Prepare inputs.
  PrepareInputs(&input_slots, &data, batch_size);
  std::vector<PaddleTensor> outputs, base_outputs;

  // Reference result from the native predictor.
  base_predictor->Run(input_slots, &base_outputs);

  if (num_threads == 1) {
    // Time the analysis predictor on the already prepared inputs.
    Timer timer;
    timer.tic();
    for (int i = 0; i < num_times; i++) {
      predictor->Run(input_slots, &outputs);
    }
    PrintTime(batch_size, num_times, 1, 0, timer.toc() / num_times);
    CompareResult(outputs, base_outputs);
  } else {
    std::vector<std::thread> threads;
    std::vector<std::unique_ptr<PaddlePredictor>> predictors;
    // TODO(yanchunwei): Bug here, the analyzer phase can't be parallelled
    // because AttentionLSTM's hard code nodeid will be damaged.
    // Hence all predictors are created up front, before any thread starts.
    for (int tid = 0; tid < num_threads; ++tid) {
      predictors.emplace_back(
          CreatePaddlePredictor<AnalysisConfig, PaddleEngineKind::kAnalysis>(
              config));
    }
    for (int tid = 0; tid < num_threads; ++tid) {
      threads.emplace_back([&, tid]() {
        // Each thread should have local input_slots and outputs.
        std::vector<PaddleTensor> input_slots;
        DataRecord data(FLAGS_infer_ditu_rnn_data, batch_size);
        PrepareInputs(&input_slots, &data, batch_size);
        std::vector<PaddleTensor> outputs;
        Timer timer;
        timer.tic();
        for (int i = 0; i < num_times; i++) {
          predictors[tid]->Run(input_slots, &outputs);
        }
        PrintTime(batch_size, num_times, num_threads, tid,
                  timer.toc() / num_times);
        CompareResult(outputs, base_outputs);
      });
    }
    for (int i = 0; i < num_threads; ++i) {
      threads[i].join();
    }
  }

  if (use_analysis && activate_ir) {
    AnalysisPredictor *analysis_predictor =
        dynamic_cast<AnalysisPredictor *>(predictor.get());
    // dynamic_cast yields nullptr on a type mismatch; stop here instead of
    // dereferencing it.
    ASSERT_TRUE(analysis_predictor != nullptr);
    auto &fuse_statis = analysis_predictor->analysis_argument()
                            .Get<std::unordered_map<std::string, int>>(
                                framework::ir::kFuseStatisAttr);
    for (auto &item : fuse_statis) {
      LOG(INFO) << "fused " << item.first << " " << item.second;
    }

    // Count the function nodes of the main data flow graph.
    int num_ops = 0;
    for (auto &node :
         analysis_predictor->analysis_argument().main_dfg->nodes.nodes()) {
      if (node->IsFunction()) {
        ++num_ops;
      }
    }
    LOG(INFO) << "has num ops: " << num_ops;

    ASSERT_TRUE(fuse_statis.count("fc_fuse"));
    EXPECT_EQ(fuse_statis.at("fc_fuse"), 1);
    EXPECT_EQ(fuse_statis.at("fc_nobias_lstm_fuse"), 2);  // bi-directional LSTM
    EXPECT_EQ(num_ops,
              13);  // After graph optimization, only 13 operators exists.
  }
}
// Inference with analysis and IR, easy for profiling independently.
// The thread count is taken from the --num_threads flag.
TEST(Analyzer, DituRNN) {
TestDituRNNPrediction(true, true, FLAGS_num_threads);
}
// Other unit-tests of DituRNN, test different options of use_analysis,
// activate_ir and multi-threads.
TEST(Analyzer, DituRNN_tests) {
int num_threads[2] = {1, 4};
for (auto i : num_threads) {
// Directly infer with the original model.
TestDituRNNPrediction(false, false, i);
// Inference with the original model with the analysis turned on, the
// analysis
// module will transform the program to a data flow graph.
TestDituRNNPrediction(true, false, i);
// Inference with analysis and IR. The IR module will fuse some large
// kernels.
TestDituRNNPrediction(true, true, i);
}
} }
} // namespace analysis } // namespace analysis
......
...@@ -440,6 +440,7 @@ ExtractInputAndOutputOfSubGraph(std::vector<Node *> &graph) { // NOLINT ...@@ -440,6 +440,7 @@ ExtractInputAndOutputOfSubGraph(std::vector<Node *> &graph) { // NOLINT
} }
return false; return false;
}; };
for (auto &node : graph) { for (auto &node : graph) {
for (auto *in : node->inlinks) { for (auto *in : node->inlinks) {
// The Value that is written by nodes inside a sub-graph shouldn't be the // The Value that is written by nodes inside a sub-graph shouldn't be the
...@@ -459,6 +460,7 @@ ExtractInputAndOutputOfSubGraph(std::vector<Node *> &graph) { // NOLINT ...@@ -459,6 +460,7 @@ ExtractInputAndOutputOfSubGraph(std::vector<Node *> &graph) { // NOLINT
std::vector<Node *>(outputs.begin(), outputs.end())); std::vector<Node *>(outputs.begin(), outputs.end()));
} }
// Filter the Intermediate results of the subgraph node.
void FilterRedundantOutputOfSubGraph(DataFlowGraph *graph) { void FilterRedundantOutputOfSubGraph(DataFlowGraph *graph) {
std::vector<Node *> op_nodes; std::vector<Node *> op_nodes;
for (auto &node : GraphTraits<DataFlowGraph>(*graph).nodes_in_TS()) { for (auto &node : GraphTraits<DataFlowGraph>(*graph).nodes_in_TS()) {
...@@ -480,9 +482,11 @@ void FilterRedundantOutputOfSubGraph(DataFlowGraph *graph) { ...@@ -480,9 +482,11 @@ void FilterRedundantOutputOfSubGraph(DataFlowGraph *graph) {
for (auto *out : op_nodes[i]->outlinks) { for (auto *out : op_nodes[i]->outlinks) {
if (follow_up_input_names.count(out->name())) { if (follow_up_input_names.count(out->name())) {
filtered_subgraph_outlinks.push_back(out); filtered_subgraph_outlinks.push_back(out);
} else {
out->SetDeleted();
} }
} }
PADDLE_ENFORCE_GE(filtered_subgraph_outlinks.size(), 1UL); // The filtered_subgraph_outlinks may be empty.
op_nodes[i]->outlinks = filtered_subgraph_outlinks; op_nodes[i]->outlinks = filtered_subgraph_outlinks;
} }
} }
......
...@@ -106,20 +106,23 @@ void CreateTrtEngineOp(Node *node, const DataFlowGraph &graph, ...@@ -106,20 +106,23 @@ void CreateTrtEngineOp(Node *node, const DataFlowGraph &graph,
// collect inputs // collect inputs
std::unordered_set<std::string> input_names; std::unordered_set<std::string> input_names;
std::unordered_set<std::string> input_names_with_id;
for (auto *x : func->inlinks) { for (auto *x : func->inlinks) {
input_names.insert(x->name()); input_names.insert(x->name());
input_names_with_id.insert(x->name() + std::to_string(x->id()));
} }
desc.SetInput( desc.SetInput(
"Xs", std::vector<std::string>(input_names.begin(), input_names.end())); "Xs", std::vector<std::string>(input_names.begin(), input_names.end()));
std::unordered_set<std::string> output_names; std::unordered_set<std::string> output_names;
std::unordered_set<std::string> output_names_with_id;
for (auto *x : func->outlinks) { for (auto *x : func->outlinks) {
output_names.insert(x->name()); output_names.insert(x->name());
output_names_with_id.insert(x->name() + std::to_string(x->id()));
} }
std::vector<std::string> output_temp(output_names.begin(), desc.SetOutput(
output_names.end()); "Ys", std::vector<std::string>(output_names.begin(), output_names.end()));
desc.SetOutput("Ys", output_temp);
desc.SetType("tensorrt_engine"); desc.SetType("tensorrt_engine");
std::unordered_map<std::string, std::string> output_name_map; std::unordered_map<std::string, std::string> output_name_map;
...@@ -153,11 +156,12 @@ void CreateTrtEngineOp(Node *node, const DataFlowGraph &graph, ...@@ -153,11 +156,12 @@ void CreateTrtEngineOp(Node *node, const DataFlowGraph &graph,
std::vector<std::string> replaced_names; std::vector<std::string> replaced_names;
for (int k = 0; k < in_var->arguments_size(); k++) { for (int k = 0; k < in_var->arguments_size(); k++) {
std::string arg_value = in_var->arguments(k); std::string arg_value = in_var->arguments(k);
if (input_names.count(arg_value)) { std::string arg_value_with_id =
arg_value + std::to_string(var2id[arg_value]);
if (input_names_with_id.count(arg_value_with_id)) {
replaced_names.push_back(arg_value); replaced_names.push_back(arg_value);
} else { } else {
replaced_names.push_back(arg_value + replaced_names.push_back(arg_value_with_id);
std::to_string(var2id[arg_value]));
} }
} }
in_var->clear_arguments(); in_var->clear_arguments();
...@@ -176,11 +180,12 @@ void CreateTrtEngineOp(Node *node, const DataFlowGraph &graph, ...@@ -176,11 +180,12 @@ void CreateTrtEngineOp(Node *node, const DataFlowGraph &graph,
std::vector<std::string> replaced_names; std::vector<std::string> replaced_names;
for (int k = 0; k < out_var->arguments_size(); k++) { for (int k = 0; k < out_var->arguments_size(); k++) {
std::string arg_value = out_var->arguments(k); std::string arg_value = out_var->arguments(k);
if (output_names.count(arg_value)) { std::string arg_value_with_id =
output_name_map[arg_value] = arg_value + std::to_string(var2id[arg_value]);
arg_value + std::to_string(var2id[arg_value]); if (output_names_with_id.count(arg_value_with_id)) {
output_name_map[arg_value] = arg_value_with_id;
} }
replaced_names.push_back(arg_value + std::to_string(var2id[arg_value])); replaced_names.push_back(arg_value_with_id);
} }
out_var->clear_arguments(); out_var->clear_arguments();
for (size_t k = 0; k < replaced_names.size(); k++) { for (size_t k = 0; k < replaced_names.size(); k++) {
......
...@@ -14,13 +14,18 @@ ...@@ -14,13 +14,18 @@
#include "paddle/fluid/inference/analysis/ir_pass_manager.h" #include "paddle/fluid/inference/analysis/ir_pass_manager.h"
#include <string> #include <string>
#include <vector>
#include "paddle/fluid/framework/ir/fuse_pass_base.h" #include "paddle/fluid/framework/ir/fuse_pass_base.h"
#include "paddle/fluid/framework/ir/graph.h" #include "paddle/fluid/framework/ir/graph.h"
#include "paddle/fluid/framework/scope.h" #include "paddle/fluid/framework/scope.h"
#include "paddle/fluid/string/pretty_log.h"
namespace paddle { namespace paddle {
namespace inference { namespace inference {
namespace analysis { namespace analysis {
using string::PrettyLogEndl;
using string::PrettyLog;
using string::Style;
IRPassManager::IRPassManager(const ProgramDesc &program, IRPassManager::IRPassManager(const ProgramDesc &program,
framework::Scope *scope) framework::Scope *scope)
...@@ -33,13 +38,16 @@ IRPassManager::IRPassManager(const ProgramDesc &program, ...@@ -33,13 +38,16 @@ IRPassManager::IRPassManager(const ProgramDesc &program,
void IRPassManager::Apply(const std::vector<std::string> &passes) { void IRPassManager::Apply(const std::vector<std::string> &passes) {
// Apply all the passes // Apply all the passes
std::string pre_pass; std::string pre_pass;
int pass_num = 0;
for (const std::string &pass_name : passes) { for (const std::string &pass_name : passes) {
LOG(WARNING) << "Running IR pass [" << pass_name << "]"; PrettyLogEndl(Style::H2(), "--- Running IR pass [%s]", pass_name);
auto pass = framework::ir::PassRegistry::Instance().Get(pass_name); auto pass = framework::ir::PassRegistry::Instance().Get(pass_name);
if (pass_name == "graph_viz_pass") { if (pass_name == "graph_viz_pass") {
std::string dot_file_path = std::string dot_file_path = std::to_string(pass_num) + "_ir_" +
"ir_" + (pre_pass.empty() ? "origin" : pre_pass) + ".dot"; (pre_pass.empty() ? "origin" : pre_pass) +
".dot";
pass->Set("graph_viz_path", new std::string(std::move(dot_file_path))); pass->Set("graph_viz_path", new std::string(std::move(dot_file_path)));
pass_num++;
} }
graph_ = pass->Apply(std::move(graph_)); graph_ = pass->Apply(std::move(graph_));
pre_pass = pass_name; pre_pass = pass_name;
......
...@@ -14,6 +14,7 @@ limitations under the License. */ ...@@ -14,6 +14,7 @@ limitations under the License. */
#include "paddle/fluid/inference/analysis/pass_manager.h" #include "paddle/fluid/inference/analysis/pass_manager.h"
#include "paddle/fluid/inference/analysis/fluid_to_data_flow_graph_pass.h" #include "paddle/fluid/inference/analysis/fluid_to_data_flow_graph_pass.h"
#include "paddle/fluid/string/pretty_log.h"
namespace paddle { namespace paddle {
namespace inference { namespace inference {
...@@ -22,7 +23,7 @@ namespace analysis { ...@@ -22,7 +23,7 @@ namespace analysis {
bool PassManager::Initialize(Argument* argument) { bool PassManager::Initialize(Argument* argument) {
argument_ = argument; argument_ = argument;
for (auto& pass : data_) { for (auto& pass : data_) {
LOG(WARNING) << "Initializing pass [" << pass->repr() << "]"; VLOG(3) << "Initializing pass [" << pass->repr() << "]";
if (!pass->Initialize(argument)) { if (!pass->Initialize(argument)) {
LOG(ERROR) << "Failed to initialize pass [" << pass->repr() << "]"; LOG(ERROR) << "Failed to initialize pass [" << pass->repr() << "]";
return false; return false;
...@@ -33,9 +34,10 @@ bool PassManager::Initialize(Argument* argument) { ...@@ -33,9 +34,10 @@ bool PassManager::Initialize(Argument* argument) {
void DfgPassManager::RunAll() { void DfgPassManager::RunAll() {
PADDLE_ENFORCE(argument_); PADDLE_ENFORCE(argument_);
LOG(INFO) << "Total " << data_.size() << " Analysys passes"; VLOG(3) << "Total " << data_.size() << " Analysys passes";
for (auto& pass : data_) { for (auto& pass : data_) {
LOG(WARNING) << "Running Analysis pass [" << pass->repr() << "]"; string::PrettyLogEndl(string::Style::H1(), "* Running Analysis pass [%s]",
pass->repr());
pass->Run(argument_->main_dfg.get()); pass->Run(argument_->main_dfg.get());
} }
} }
......
...@@ -74,13 +74,141 @@ void UnionFindCombine(const node_map_t &node_map, size_t a, size_t b) { ...@@ -74,13 +74,141 @@ void UnionFindCombine(const node_map_t &node_map, size_t a, size_t b) {
node_map.at(b)->attr(kUnionFindParent).Int32() = a_ancestor; node_map.at(b)->attr(kUnionFindParent).Int32() = a_ancestor;
} }
// Lightweight stand-in for a graph node.
// A BriefNode keeps a pointer to the Node it mirrors plus its own link lists,
// so the trt graph analysis can rewire links without touching the original
// graph.
struct BriefNode {
  explicit BriefNode(Node *n) : node(n) {}

  // The mirrored node of the original graph.
  Node *node;
  // Links between BriefNodes.
  std::vector<BriefNode *> inlinks;
  std::vector<BriefNode *> outlinks;
};
// Union two adjacent BriefNode.
// Suppose we have two adjacent nodes src and dst (looked up in node_map by
// src_id and dst_id). The following operations are performed:
// 1. add all inputs (except src) of dst to src inlinks.
// 2. add all outputs of dst to src outlinks, and drop dst from them.
// 3. delete all dst's inlinks and outlinks.
// 4. in the neighbours of the merged node, replace every link to dst with a
//    link to src.
// NOTE: step 4 replaces pointers in place without de-duplicating, so a
// neighbour that was linked to both src and dst lists src twice afterwards.
// node_map.at() throws std::out_of_range for an unknown id.
void UnionContractedNodes(const std::unordered_map<int, BriefNode *> &node_map,
                          int src_id, int dst_id) {
  // merge the two adjacent nodes into one node.
  BriefNode *src_node = node_map.at(src_id);
  BriefNode *dst_node = node_map.at(dst_id);

  // Merged inputs: all inputs of src, then all inputs of dst except src.
  std::unordered_set<BriefNode *> inputs(src_node->inlinks.begin(),
                                         src_node->inlinks.end());
  for (BriefNode *node : dst_node->inlinks) {
    if (node != src_node) {
      inputs.insert(node);
    }
  }

  // Merged outputs: all outputs of src except dst, then all outputs of dst.
  std::unordered_set<BriefNode *> outputs;
  for (BriefNode *node : src_node->outlinks) {
    if (node != dst_node) {
      outputs.insert(node);
    }
  }
  for (BriefNode *node : dst_node->outlinks) {
    outputs.insert(node);
  }

  // update the dst and src node's inlinks and outlinks.
  // A plain assign() works on every compiler; no compiler-specific branch or
  // std::move of a temporary is needed.
  src_node->inlinks.assign(inputs.begin(), inputs.end());
  src_node->outlinks.assign(outputs.begin(), outputs.end());
  dst_node->inlinks.clear();
  dst_node->outlinks.clear();

  auto inlink_or_outlink_cleaner = [&](std::vector<BriefNode *> &nodes) {
    for (auto *&n : nodes) {
      if (n == src_node || n == dst_node) {
        n = src_node;
      }
    }
  };
  // Change all the dst inputs and outputs corresponding inlink and
  // outlink to the src node.
  for (auto *node : src_node->inlinks) {
    inlink_or_outlink_cleaner(node->outlinks);
  }
  for (auto *node : src_node->outlinks) {
    inlink_or_outlink_cleaner(node->inlinks);
  }
}
// FlexibleDFS
// Iterative depth-first traversal starting from the nodes in `source`.
// If reverse is true, the traversal follows inlinks, otherwise outlinks.
// If enter is not empty, enter(node) is called the first time a node is
// visited, before any of its neighbours are pushed.
// If leave is not empty, leave(node) is called when the node's leave marker is
// popped again, i.e. after everything pushed on top of it has been processed.
// A callback returning false stops the whole traversal immediately.
void FlexibleDFS(const std::vector<BriefNode *> &source, bool reverse,
                 const std::function<bool(const BriefNode *)> &enter,
                 const std::function<bool(const BriefNode *)> &leave) {
  // Stack entry: either a node still to be visited, or a leave marker.
  struct FNode {
    const BriefNode *node;
    bool leave;
  };

  std::vector<FNode> stack;
  for (auto &node : source) {
    stack.push_back(FNode{node, false});
  }

  std::unordered_set<const BriefNode *> visited;
  while (!stack.empty()) {
    auto fnode = stack.back();
    stack.pop_back();

    if (fnode.leave) {
      if (leave && !leave(fnode.node)) return;
      // A leave marker is only pushed for a node that is already visited, so
      // there is nothing more to do for it.
      continue;
    }
    // insert() reports whether the node was new; skip repeated visits.
    if (!visited.insert(fnode.node).second) continue;

    if (enter && !enter(fnode.node)) return;

    if (leave) stack.push_back(FNode{fnode.node, true});
    // Bind by reference: the link list is only read, no copy is needed.
    const std::vector<BriefNode *> &iter_nodes =
        reverse ? fnode.node->inlinks : fnode.node->outlinks;
    for (const BriefNode *node : iter_nodes) {
      if (!visited.count(node)) {
        stack.push_back(FNode{node, false});
      }
    }
  }
}
std::vector<std::vector<Node *>> SubGraphSplitter::ExtractSubGraphs() { std::vector<std::vector<Node *>> SubGraphSplitter::ExtractSubGraphs() {
// Run the Extract algorithm to find all subgraphs.
std::vector<Node *> marked_nodes; std::vector<Node *> marked_nodes;
// We use brief_node_map to represent the original graph in order to avoid
// changing the original graph.
std::unordered_map<int, BriefNode *> brief_node_map;
for (auto &node : GraphTraits<DataFlowGraph>(*graph_).nodes_in_TS()) { for (auto &node : GraphTraits<DataFlowGraph>(*graph_).nodes_in_TS()) {
brief_node_map[node.id()] = new BriefNode(&node);
if (node.attr(kMarkerAttrName).Bool()) { if (node.attr(kMarkerAttrName).Bool()) {
marked_nodes.push_back(&node); marked_nodes.push_back(&node);
} }
} }
// extract sub-graphs in the marked node set, use Union Find algorithm. // extract sub-graphs in the marked node set, use Union Find algorithm.
node_map_t node_map; // id to ptr node_map_t node_map; // id to ptr
for (auto *n : marked_nodes) { for (auto *n : marked_nodes) {
...@@ -88,11 +216,73 @@ std::vector<std::vector<Node *>> SubGraphSplitter::ExtractSubGraphs() { ...@@ -88,11 +216,73 @@ std::vector<std::vector<Node *>> SubGraphSplitter::ExtractSubGraphs() {
n->attr(kUnionFindParent).Int32() = n->id(); n->attr(kUnionFindParent).Int32() = n->id();
node_map[n->id()] = n; node_map[n->id()] = n;
} }
std::unordered_set<Node *> visited;
for (auto *n : marked_nodes) { // create breif node map
for (auto *out : n->outlinks) { for (auto &itr : brief_node_map) {
if (node_map.count(out->id())) { for (Node *node : itr.second->node->inlinks) {
UnionFindCombine(node_map, n->id(), out->id()); itr.second->inlinks.push_back(brief_node_map[node->id()]);
}
for (Node *node : itr.second->node->outlinks) {
itr.second->outlinks.push_back(brief_node_map[node->id()]);
}
}
for (auto &itr : brief_node_map) {
BriefNode *brief_node = itr.second;
if (!brief_node->node->attr(kMarkerAttrName).Bool()) {
VLOG(4) << brief_node->node->id() << " node not a trt candicate.";
continue;
}
// Our algorithm must guarantee that:
// 1. The graph always remains a directed acyclic graph (DAG).
// 2. If there is a path in the subgraph from X to Y (X and Y are both
// nodes in the subgraph), then all paths from X to Y are in the
// subgraph.
//
// To achieve this guarantee, for each pair of adjacent nodes src -> dst:
// 1. Collect all input nodes of dst except src.
// 2. Run a reverse DFS from those input nodes.
// 3. If the reverse DFS reaches src, there is another path from src to
// dst, so src and dst cannot be fused into one node; otherwise they
// can be fused.
while (true) {
std::unordered_set<BriefNode *> contract_nodes;
for (auto *out : brief_node->outlinks) {
// must be a TRT candidate
if (!out->node->attr(kMarkerAttrName).Bool()) continue;
// get all dst input nodes except src.
std::vector<BriefNode *> source_nodes;
for (auto *n : out->inlinks) {
if (n != brief_node) {
source_nodes.push_back(n);
}
}
// Reverse DFS from the source_nodes.
bool have_excess_path = false;
FlexibleDFS(source_nodes, true, nullptr,
[&have_excess_path, brief_node](const BriefNode *n) {
if (n == brief_node) {
have_excess_path = true;
return false;
}
return true;
});
if (have_excess_path) continue;
contract_nodes.insert(out);
}
if (contract_nodes.empty()) break;
for (auto dst_node : contract_nodes) {
UnionFindCombine(node_map, brief_node->node->id(),
dst_node->node->id());
UnionContractedNodes(brief_node_map, brief_node->node->id(),
dst_node->node->id());
} }
} }
} }
...@@ -128,6 +318,7 @@ void SubGraphFuse::ReplaceNodesWithSubGraphs() { ...@@ -128,6 +318,7 @@ void SubGraphFuse::ReplaceNodesWithSubGraphs() {
auto io = ExtractInputAndOutputOfSubGraph(subgraph); auto io = ExtractInputAndOutputOfSubGraph(subgraph);
block_node->inlinks = std::move(io.first); block_node->inlinks = std::move(io.first);
block_node->outlinks = std::move(io.second); block_node->outlinks = std::move(io.second);
for (auto *node : subgraph) { for (auto *node : subgraph) {
// TODO(Superjomn) need a unified mechanism to treat deleted node in each // TODO(Superjomn) need a unified mechanism to treat deleted node in each
// pass. // pass.
......
...@@ -82,7 +82,7 @@ TEST(SubGraphSplitter, Fuse) { ...@@ -82,7 +82,7 @@ TEST(SubGraphSplitter, Fuse) {
// At least one nodes should be deleted. // At least one nodes should be deleted.
ASSERT_EQ(dfg.nodes.size(), count0 + 1); // added a new FunctionBlock ASSERT_EQ(dfg.nodes.size(), count0 + 1); // added a new FunctionBlock
ASSERT_EQ(6, count1); ASSERT_EQ(11, count1);
} }
} // namespace analysis } // namespace analysis
......
...@@ -77,6 +77,9 @@ bool AnalysisPredictor::Init( ...@@ -77,6 +77,9 @@ bool AnalysisPredictor::Init(
OptimizeInferenceProgram(); OptimizeInferenceProgram();
ctx_ = executor_->Prepare(*inference_program_, 0); ctx_ = executor_->Prepare(*inference_program_, 0);
if (config_._use_mkldnn) {
executor_->EnableMKLDNN(*inference_program_);
}
VLOG(5) << "to create variables"; VLOG(5) << "to create variables";
PADDLE_ENFORCE(scope_.get()); PADDLE_ENFORCE(scope_.get());
......
...@@ -9,8 +9,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ...@@ -9,8 +9,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#include <glog/logging.h>
#include "paddle/fluid/inference/api/paddle_inference_api.h" #include "paddle/fluid/inference/api/paddle_inference_api.h"
#include "paddle/fluid/platform/enforce.h"
namespace paddle { namespace paddle {
...@@ -64,13 +64,15 @@ PaddleBuf& PaddleBuf::operator=(PaddleBuf&& other) { ...@@ -64,13 +64,15 @@ PaddleBuf& PaddleBuf::operator=(PaddleBuf&& other) {
void PaddleBuf::Resize(size_t length) { void PaddleBuf::Resize(size_t length) {
// Only the owned memory can be reset, the external memory can't be changed. // Only the owned memory can be reset, the external memory can't be changed.
if (length_ == length) return; if (length_ >= length) return;
if (memory_owned_) { if (memory_owned_) {
Free(); Free();
data_ = malloc(length);
length_ = length;
memory_owned_ = true;
} else {
PADDLE_THROW("The memory is allocated externally, can not Resized");
} }
data_ = new char[length];
length_ = length;
memory_owned_ = true;
} }
void PaddleBuf::Reset(void* data, size_t length) { void PaddleBuf::Reset(void* data, size_t length) {
...@@ -82,8 +84,8 @@ void PaddleBuf::Reset(void* data, size_t length) { ...@@ -82,8 +84,8 @@ void PaddleBuf::Reset(void* data, size_t length) {
void PaddleBuf::Free() { void PaddleBuf::Free() {
if (memory_owned_ && data_) { if (memory_owned_ && data_) {
assert(length_ > 0); PADDLE_ENFORCE_GT(length_, 0);
delete[] static_cast<char*>(data_); free(static_cast<char*>(data_));
data_ = nullptr; data_ = nullptr;
length_ = 0; length_ = 0;
} }
......
...@@ -106,6 +106,9 @@ bool NativePaddlePredictor::Init( ...@@ -106,6 +106,9 @@ bool NativePaddlePredictor::Init(
} }
ctx_ = executor_->Prepare(*inference_program_, 0); ctx_ = executor_->Prepare(*inference_program_, 0);
if (config_._use_mkldnn) {
executor_->EnableMKLDNN(*inference_program_);
}
executor_->CreateVariables(*inference_program_, executor_->CreateVariables(*inference_program_,
sub_scope_ ? sub_scope_ : scope_.get(), 0); sub_scope_ ? sub_scope_ : scope_.get(), 0);
...@@ -262,7 +265,7 @@ void NativePaddlePredictor::GetFetchOne(const framework::LoDTensor &fetch, ...@@ -262,7 +265,7 @@ void NativePaddlePredictor::GetFetchOne(const framework::LoDTensor &fetch,
if (buffer.empty() || buffer.length() < sizeof(T) * data.size()) { if (buffer.empty() || buffer.length() < sizeof(T) * data.size()) {
buffer.Resize(sizeof(T) * data.size()); buffer.Resize(sizeof(T) * data.size());
} }
std::memcpy(buffer.data(), data.data(), buffer.length()); std::memcpy(buffer.data(), data.data(), sizeof(T) * data.size());
// copy LoD // copy LoD
for (const auto &level : fetch.lod()) { for (const auto &level : fetch.lod()) {
output->lod.emplace_back(level); output->lod.emplace_back(level);
......
...@@ -123,10 +123,16 @@ std::string DescribeTensor(const PaddleTensor &tensor) { ...@@ -123,10 +123,16 @@ std::string DescribeTensor(const PaddleTensor &tensor) {
} }
void PrintTime(int batch_size, int repeat, int num_threads, int tid, void PrintTime(int batch_size, int repeat, int num_threads, int tid,
double latency) { double latency, int epoch = 1) {
LOG(INFO) << "====== batch_size: " << batch_size << ", repeat: " << repeat LOG(INFO) << "====== batch_size: " << batch_size << ", repeat: " << repeat
<< ", threads: " << num_threads << ", thread id: " << tid << ", threads: " << num_threads << ", thread id: " << tid
<< ", latency: " << latency << "ms ======"; << ", latency: " << latency << "ms ======";
if (epoch > 1) {
int samples = batch_size * epoch;
LOG(INFO) << "====== sample number: " << samples
<< ", average latency of each sample: " << latency / samples
<< "ms ======";
}
} }
} // namespace inference } // namespace inference
......
...@@ -45,7 +45,7 @@ class PaddleBuf { ...@@ -45,7 +45,7 @@ class PaddleBuf {
PaddleBuf(void* data, size_t length) PaddleBuf(void* data, size_t length)
: data_(data), length_(length), memory_owned_{false} {} : data_(data), length_(length), memory_owned_{false} {}
// Own memory. // Own memory.
PaddleBuf(size_t length) explicit PaddleBuf(size_t length)
: data_(new char[length]), length_(length), memory_owned_(true) {} : data_(new char[length]), length_(length), memory_owned_(true) {}
// Resize to `length` bytes. // Resize to `length` bytes.
void Resize(size_t length); void Resize(size_t length);
...@@ -121,6 +121,8 @@ struct NativeConfig : public PaddlePredictor::Config { ...@@ -121,6 +121,8 @@ struct NativeConfig : public PaddlePredictor::Config {
bool use_gpu{false}; bool use_gpu{false};
int device{0}; int device{0};
float fraction_of_gpu_memory{-1.f}; // Negative to notify initialization. float fraction_of_gpu_memory{-1.f}; // Negative to notify initialization.
// NOTE: Do NOT use this; it exists only for internal testing and will be removed later.
bool _use_mkldnn{false};
// Specify the variable's name of each input. // Specify the variable's name of each input.
bool specify_input_name{false}; bool specify_input_name{false};
......
...@@ -20,6 +20,7 @@ limitations under the License. */ ...@@ -20,6 +20,7 @@ limitations under the License. */
#include "paddle/fluid/framework/block_desc.h" #include "paddle/fluid/framework/block_desc.h"
#include "paddle/fluid/framework/feed_fetch_type.h" #include "paddle/fluid/framework/feed_fetch_type.h"
#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/version.h"
#include "paddle/fluid/platform/cpu_helper.h" #include "paddle/fluid/platform/cpu_helper.h"
#include "paddle/fluid/pybind/pybind.h" #include "paddle/fluid/pybind/pybind.h"
...@@ -124,6 +125,9 @@ std::unique_ptr<framework::ProgramDesc> Load(framework::Executor* executor, ...@@ -124,6 +125,9 @@ std::unique_ptr<framework::ProgramDesc> Load(framework::Executor* executor,
std::unique_ptr<framework::ProgramDesc> main_program( std::unique_ptr<framework::ProgramDesc> main_program(
new framework::ProgramDesc(program_desc_str)); new framework::ProgramDesc(program_desc_str));
PADDLE_ENFORCE(framework::IsProgramVersionSupported(main_program->Version()),
"model version %ld is not supported.",
main_program->Version());
LoadPersistables(executor, scope, *main_program, dirname, ""); LoadPersistables(executor, scope, *main_program, dirname, "");
return main_program; return main_program;
...@@ -138,6 +142,9 @@ std::unique_ptr<framework::ProgramDesc> Load( ...@@ -138,6 +142,9 @@ std::unique_ptr<framework::ProgramDesc> Load(
std::unique_ptr<framework::ProgramDesc> main_program( std::unique_ptr<framework::ProgramDesc> main_program(
new framework::ProgramDesc(program_desc_str)); new framework::ProgramDesc(program_desc_str));
PADDLE_ENFORCE(framework::IsProgramVersionSupported(main_program->Version()),
"model version %ld is not supported.",
main_program->Version());
LoadPersistables(executor, scope, *main_program, "", param_filename); LoadPersistables(executor, scope, *main_program, "", param_filename);
return main_program; return main_program;
......
...@@ -35,6 +35,8 @@ class ReluOpConverter : public OpConverter { ...@@ -35,6 +35,8 @@ class ReluOpConverter : public OpConverter {
engine_, Activation, *const_cast<nvinfer1::ITensor*>(input_tensor), engine_, Activation, *const_cast<nvinfer1::ITensor*>(input_tensor),
nvinfer1::ActivationType::kRELU); nvinfer1::ActivationType::kRELU);
auto output_name = op_desc.Output("Out")[0]; auto output_name = op_desc.Output("Out")[0];
layer->setName(("relu (Output: " + output_name + ")").c_str());
layer->getOutput(0)->setName(output_name.c_str());
engine_->SetITensor(output_name, layer->getOutput(0)); engine_->SetITensor(output_name, layer->getOutput(0));
if (test_mode) { // the test framework can not determine which is the if (test_mode) { // the test framework can not determine which is the
// output, so place the declaration inside. // output, so place the declaration inside.
......
...@@ -116,6 +116,8 @@ class BatchNormOpConverter : public OpConverter { ...@@ -116,6 +116,8 @@ class BatchNormOpConverter : public OpConverter {
scale_weights.get(), power_weights.get()); scale_weights.get(), power_weights.get());
auto output_name = op_desc.Output("Y").front(); auto output_name = op_desc.Output("Y").front();
layer->setName(("batch_norm (Output: " + output_name + ")").c_str());
layer->getOutput(0)->setName(output_name.c_str());
engine_->weight_map[op_desc.Input("Bias").front()] = engine_->weight_map[op_desc.Input("Bias").front()] =
std::move(combile_bias_tensor); std::move(combile_bias_tensor);
engine_->weight_map[op_desc.Input("Scale").front()] = engine_->weight_map[op_desc.Input("Scale").front()] =
......
...@@ -42,6 +42,8 @@ class ConcatOpConverter : public OpConverter { ...@@ -42,6 +42,8 @@ class ConcatOpConverter : public OpConverter {
axis = axis - 1; // Remove batch dim axis = axis - 1; // Remove batch dim
layer->setAxis(axis); layer->setAxis(axis);
auto output_name = op_desc.Output("Out")[0]; auto output_name = op_desc.Output("Out")[0];
layer->setName(("concat (Output: " + output_name + ")").c_str());
layer->getOutput(0)->setName(output_name.c_str());
engine_->SetITensor(output_name, layer->getOutput(0)); engine_->SetITensor(output_name, layer->getOutput(0));
if (test_mode) { // the test framework can not determine which is the if (test_mode) { // the test framework can not determine which is the
// output, so place the declaration inside. // output, so place the declaration inside.
......
...@@ -78,8 +78,10 @@ class Conv2dOpConverter : public OpConverter { ...@@ -78,8 +78,10 @@ class Conv2dOpConverter : public OpConverter {
layer->setNbGroups(groups); layer->setNbGroups(groups);
auto output_name = op_desc.Output("Output").front(); auto output_name = op_desc.Output("Output").front();
layer->setName(("conv2d (Output: " + output_name + ")").c_str());
engine_->weight_map[op_desc.Input("Filter").front()] = engine_->weight_map[op_desc.Input("Filter").front()] =
std::move(weight_tensor); std::move(weight_tensor);
layer->getOutput(0)->setName(output_name.c_str());
engine_->SetITensor(output_name, layer->getOutput(0)); engine_->SetITensor(output_name, layer->getOutput(0));
if (test_mode) { if (test_mode) {
engine_->DeclareOutput(output_name); engine_->DeclareOutput(output_name);
......
...@@ -89,6 +89,8 @@ class ElementwiseWeightOpConverter : public OpConverter { ...@@ -89,6 +89,8 @@ class ElementwiseWeightOpConverter : public OpConverter {
shift_weights.get(), scale_weights.get(), power_weights.get()); shift_weights.get(), scale_weights.get(), power_weights.get());
auto output_name = op_desc.Output("Out")[0]; auto output_name = op_desc.Output("Out")[0];
layer->setName(("elementwise_add (Output: " + output_name + ")").c_str());
layer->getOutput(0)->setName(output_name.c_str());
engine_->weight_map[op_desc.Input("Y").front()] = std::move(weight_tensor); engine_->weight_map[op_desc.Input("Y").front()] = std::move(weight_tensor);
engine_->SetITensor(output_name, layer->getOutput(0)); engine_->SetITensor(output_name, layer->getOutput(0));
if (test_mode) { // the test framework can not determine which is the if (test_mode) { // the test framework can not determine which is the
...@@ -137,6 +139,8 @@ class ElementwiseTensorOpConverter : public OpConverter { ...@@ -137,6 +139,8 @@ class ElementwiseTensorOpConverter : public OpConverter {
*const_cast<nvinfer1::ITensor*>(Y), op_pair->second); *const_cast<nvinfer1::ITensor*>(Y), op_pair->second);
auto output_name = op_desc.Output("Out")[0]; auto output_name = op_desc.Output("Out")[0];
layer->setName(("elementwise (Output: " + output_name + ")").c_str());
layer->getOutput(0)->setName(output_name.c_str());
engine_->SetITensor(output_name, layer->getOutput(0)); engine_->SetITensor(output_name, layer->getOutput(0));
if (test_mode) { // the test framework can not determine which is the if (test_mode) { // the test framework can not determine which is the
// output, so place the declaration inside. // output, so place the declaration inside.
......
...@@ -107,6 +107,8 @@ class FcOpConverter : public OpConverter { ...@@ -107,6 +107,8 @@ class FcOpConverter : public OpConverter {
n_output, tmp_weight.get(), bias.get()); n_output, tmp_weight.get(), bias.get());
auto output_name = op_desc.Output("Out").front(); auto output_name = op_desc.Output("Out").front();
layer->setName(("fc (Output: " + output_name + ")").c_str());
layer->getOutput(0)->setName(output_name.c_str());
engine_->SetITensor(output_name, layer->getOutput(0)); engine_->SetITensor(output_name, layer->getOutput(0));
engine_->weight_map[op_desc.Input("Y").front()] = std::move(tmp); engine_->weight_map[op_desc.Input("Y").front()] = std::move(tmp);
if (test_mode) { if (test_mode) {
......
...@@ -72,6 +72,8 @@ class Pool2dOpConverter : public OpConverter { ...@@ -72,6 +72,8 @@ class Pool2dOpConverter : public OpConverter {
layer->setPadding(nv_paddings); layer->setPadding(nv_paddings);
auto output_name = op_desc.Output("Out")[0]; auto output_name = op_desc.Output("Out")[0];
layer->setName(("pool2d (Output: " + output_name + ")").c_str());
layer->getOutput(0)->setName(output_name.c_str());
engine_->SetITensor(output_name, layer->getOutput(0)); engine_->SetITensor(output_name, layer->getOutput(0));
if (test_mode) { if (test_mode) {
engine_->DeclareOutput(output_name); engine_->DeclareOutput(output_name);
......
# Base URL that the inference test archives are downloaded from.
set(INFERENCE_URL "http://paddle-inference-dist.bj.bcebos.com")
# Parent directory of the per-test install directories defined below.
set(INFERENCE_DEMO_INSTALL_DIR "${THIRD_PARTY_PATH}/inference_demo")
# Dependencies passed as EXTRA_DEPS to every analyzer test in this file.
set(INFERENCE_EXTRA_DEPS paddle_inference_api paddle_fluid_api ir_pass_manager analysis_predictor)
# Download ${INFERENCE_URL}/${filename} into ${install_dir} (created if it
# does not exist) and unpack the archive there.
#   install_dir: directory that receives the archive and its contents.
#   filename:    archive name relative to ${INFERENCE_URL}.
# A failing download or extraction is reported with a warning instead of being
# silently ignored; configuration itself still continues.
function (inference_download_and_uncompress install_dir filename)
  message(STATUS "Download inference test stuff from ${INFERENCE_URL}/${filename}")
  execute_process(COMMAND bash -c "mkdir -p ${install_dir}")
  execute_process(COMMAND bash -c "cd ${install_dir} && wget -q ${INFERENCE_URL}/${filename}"
                  RESULT_VARIABLE download_result)
  if (NOT download_result EQUAL 0)
    message(WARNING "Failed to download ${INFERENCE_URL}/${filename} (exit status: ${download_result})")
  endif()
  execute_process(COMMAND bash -c "cd ${install_dir} && tar xzf ${filename}"
                  RESULT_VARIABLE extract_result)
  if (NOT extract_result EQUAL 0)
    message(WARNING "Failed to extract ${filename} in ${install_dir} (exit status: ${extract_result})")
  endif()
  message(STATUS "finish downloading ${filename}")
endfunction(inference_download_and_uncompress)
# Fetch and unpack both the model archive and the data archive of one test
# into ${install_dir}. Nothing is done when ${install_dir} already exists or
# when WITH_INFERENCE is off.
function(download_model_and_data install_dir model_name data_name)
  if (EXISTS ${install_dir} OR NOT WITH_INFERENCE)
    return()
  endif()
  inference_download_and_uncompress(${install_dir} ${model_name})
  inference_download_and_uncompress(${install_dir} ${data_name})
endfunction()
# Each section below follows the same pattern: pick an install directory under
# INFERENCE_DEMO_INSTALL_DIR, download the model and data archives into it, and
# declare the analyzer test with --infer_model / --infer_data pointing there.

# RNN1 (archive names contain a URL-encoded '/', i.e. %2F)
set(RNN1_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/rnn1")
download_model_and_data(${RNN1_INSTALL_DIR} "rnn1%2Fmodel.tar.gz" "rnn1%2Fdata.txt.tar.gz")
inference_analysis_test(test_analyzer_rnn1 SRCS analyzer_rnn1_tester.cc
EXTRA_DEPS ${INFERENCE_EXTRA_DEPS}
ARGS --infer_model=${RNN1_INSTALL_DIR}/model
--infer_data=${RNN1_INSTALL_DIR}/data.txt)
# RNN2
set(RNN2_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/rnn2")
download_model_and_data(${RNN2_INSTALL_DIR} "rnn2_model.tar.gz" "rnn2_data.txt.tar.gz")
inference_analysis_test(test_analyzer_rnn2 SRCS analyzer_rnn2_tester.cc
EXTRA_DEPS ${INFERENCE_EXTRA_DEPS}
ARGS --infer_model=${RNN2_INSTALL_DIR}/model
--infer_data=${RNN2_INSTALL_DIR}/data.txt)
# chinese_ner (Chinese named-entity recognition)
set(CHINESE_NER_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/chinese_ner")
download_model_and_data(${CHINESE_NER_INSTALL_DIR} "chinese_ner_model.tar.gz" "chinese_ner-data.txt.tar.gz")
inference_analysis_test(test_analyzer_ner SRCS analyzer_ner_tester.cc
EXTRA_DEPS ${INFERENCE_EXTRA_DEPS}
ARGS --infer_model=${CHINESE_NER_INSTALL_DIR}/model
--infer_data=${CHINESE_NER_INSTALL_DIR}/data.txt)
# lac
set(LAC_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/lac")
download_model_and_data(${LAC_INSTALL_DIR} "lac_model.tar.gz" "lac_data.txt.tar.gz")
inference_analysis_test(test_analyzer_lac SRCS analyzer_lac_tester.cc
EXTRA_DEPS ${INFERENCE_EXTRA_DEPS}
ARGS --infer_model=${LAC_INSTALL_DIR}/model
--infer_data=${LAC_INSTALL_DIR}/data.txt)
# text_classification
set(TEXT_CLASSIFICATION_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/text_classification")
download_model_and_data(${TEXT_CLASSIFICATION_INSTALL_DIR} "text-classification-Senta.tar.gz" "text_classification_data.txt.tar.gz")
inference_analysis_test(test_analyzer_text_classification SRCS analyzer_text_classification_tester.cc
EXTRA_DEPS ${INFERENCE_EXTRA_DEPS}
ARGS --infer_model=${TEXT_CLASSIFICATION_INSTALL_DIR}/model
--infer_data=${TEXT_CLASSIFICATION_INSTALL_DIR}/data.txt)
# ocr
# The OCR archive is hosted under a different base URL than INFERENCE_URL, so
# it is downloaded inline here instead of through download_model_and_data.
set(OCR_MODEL_URL "http://paddlemodels.cdn.bcebos.com/inference-vis-demos%2Focr.tar.gz")
# Reuse the shared demo directory rather than spelling the path out again.
set(OCR_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/ocr")
if (NOT EXISTS ${OCR_INSTALL_DIR} AND WITH_INFERENCE)
  get_filename_component(filename ${OCR_MODEL_URL} NAME)
  message(STATUS "Download inference test stuff ${filename} from ${OCR_MODEL_URL}")
  execute_process(COMMAND bash -c "mkdir -p ${OCR_INSTALL_DIR}")
  execute_process(COMMAND bash -c "cd ${OCR_INSTALL_DIR} && wget -q ${OCR_MODEL_URL}"
                  RESULT_VARIABLE ocr_download_result)
  # Report failures instead of silently ignoring them; configuration continues.
  if (NOT ocr_download_result EQUAL 0)
    message(WARNING "Failed to download ${OCR_MODEL_URL} (exit status: ${ocr_download_result})")
  endif()
  execute_process(COMMAND bash -c "cd ${OCR_INSTALL_DIR} && tar xzf ${filename}"
                  RESULT_VARIABLE ocr_extract_result)
  if (NOT ocr_extract_result EQUAL 0)
    message(WARNING "Failed to extract ${filename} in ${OCR_INSTALL_DIR} (exit status: ${ocr_extract_result})")
  endif()
  message(STATUS "finish downloading ${filename}")
endif()
inference_analysis_test(test_analyzer_ocr SRCS analyzer_vis_tester.cc
  EXTRA_DEPS ${INFERENCE_EXTRA_DEPS}
  ARGS --infer_model=${OCR_INSTALL_DIR}/model
       --infer_data=${OCR_INSTALL_DIR}/data.txt)
...@@ -12,21 +12,7 @@ ...@@ -12,21 +12,7 @@
// See the License for the specific language governing permissions and // See the License for the specific language governing permissions and
// limitations under the License. // limitations under the License.
#include "paddle/fluid/inference/analysis/analyzer.h" #include "paddle/fluid/inference/tests/api/tester_helper.h"
#include <gtest/gtest.h>
#include "paddle/fluid/framework/ir/fuse_pass_base.h"
#include "paddle/fluid/inference/analysis/ut_helper.h"
#include "paddle/fluid/inference/api/analysis_predictor.h"
#include "paddle/fluid/inference/api/helper.h"
#include "paddle/fluid/inference/api/paddle_inference_pass.h"
#include "paddle/fluid/platform/profiler.h"
DEFINE_string(infer_model, "", "model path for LAC");
DEFINE_string(infer_data, "", "data file for LAC");
DEFINE_int32(batch_size, 1, "batch size.");
DEFINE_int32(burning, 0, "Burning before repeat.");
DEFINE_int32(repeat, 1, "Running the inference program repeat times.");
DEFINE_bool(test_all_data, false, "Test the all dataset in data file.");
namespace paddle { namespace paddle {
namespace inference { namespace inference {
...@@ -117,34 +103,6 @@ void GetOneBatch(std::vector<PaddleTensor> *input_slots, DataRecord *data, ...@@ -117,34 +103,6 @@ void GetOneBatch(std::vector<PaddleTensor> *input_slots, DataRecord *data,
input_slots->assign({input_tensor}); input_slots->assign({input_tensor});
} }
void BenchAllData(const std::string &model_path, const std::string &data_file,
const int batch_size, const int repeat) {
NativeConfig config;
config.model_dir = model_path;
config.use_gpu = false;
config.device = 0;
config.specify_input_name = true;
std::vector<PaddleTensor> input_slots, outputs_slots;
DataRecord data(data_file, batch_size);
auto predictor =
CreatePaddlePredictor<NativeConfig, PaddleEngineKind::kNative>(config);
GetOneBatch(&input_slots, &data, batch_size);
for (int i = 0; i < FLAGS_burning; i++) {
predictor->Run(input_slots, &outputs_slots);
}
Timer timer;
double sum = 0;
for (int i = 0; i < repeat; i++) {
for (size_t bid = 0; bid < data.batched_datas.size(); ++bid) {
GetOneBatch(&input_slots, &data, batch_size);
timer.tic();
predictor->Run(input_slots, &outputs_slots);
sum += timer.toc();
}
}
PrintTime(batch_size, repeat, 1, 0, sum / repeat);
}
const int64_t lac_ref_data[] = {24, 25, 25, 25, 38, 30, 31, 14, 15, 44, 24, 25, const int64_t lac_ref_data[] = {24, 25, 25, 25, 38, 30, 31, 14, 15, 44, 24, 25,
25, 25, 25, 25, 44, 24, 25, 25, 25, 36, 42, 43, 25, 25, 25, 25, 44, 24, 25, 25, 25, 36, 42, 43,
44, 14, 15, 44, 14, 15, 44, 14, 15, 44, 38, 39, 44, 14, 15, 44, 14, 15, 44, 14, 15, 44, 38, 39,
...@@ -152,48 +110,38 @@ const int64_t lac_ref_data[] = {24, 25, 25, 25, 38, 30, 31, 14, 15, 44, 24, 25, ...@@ -152,48 +110,38 @@ const int64_t lac_ref_data[] = {24, 25, 25, 25, 38, 30, 31, 14, 15, 44, 24, 25,
void TestLACPrediction(const std::string &model_path, void TestLACPrediction(const std::string &model_path,
const std::string &data_file, const int batch_size, const std::string &data_file, const int batch_size,
const int repeat, bool test_all_data, const int repeat, bool use_analysis = false) {
bool use_analysis = false) { AnalysisConfig cfg;
NativeConfig config; cfg.model_dir = model_path;
config.model_dir = model_path; cfg.use_gpu = false;
config.use_gpu = false; cfg.device = 0;
config.device = 0; cfg.specify_input_name = true;
config.specify_input_name = true; cfg.enable_ir_optim = true;
std::vector<PaddleTensor> input_slots, outputs_slots; std::vector<PaddleTensor> input_slots, outputs_slots;
DataRecord data(data_file, batch_size); DataRecord data(data_file, batch_size);
GetOneBatch(&input_slots, &data, batch_size); GetOneBatch(&input_slots, &data, batch_size);
std::unique_ptr<PaddlePredictor> predictor; std::unique_ptr<PaddlePredictor> predictor;
if (use_analysis) { if (use_analysis) {
AnalysisConfig cfg;
cfg.model_dir = model_path;
cfg.use_gpu = false;
cfg.device = 0;
cfg.specify_input_name = true;
cfg.enable_ir_optim = true;
predictor = predictor =
CreatePaddlePredictor<AnalysisConfig, PaddleEngineKind::kAnalysis>(cfg); CreatePaddlePredictor<AnalysisConfig, PaddleEngineKind::kAnalysis>(cfg);
} else { } else {
predictor = predictor =
CreatePaddlePredictor<NativeConfig, PaddleEngineKind::kNative>(config); CreatePaddlePredictor<NativeConfig, PaddleEngineKind::kNative>(cfg);
} }
for (int i = 0; i < FLAGS_burning; i++) { for (int i = 0; i < FLAGS_burning; i++) {
predictor->Run(input_slots, &outputs_slots); predictor->Run(input_slots, &outputs_slots);
} }
Timer timer; Timer timer;
if (test_all_data) { if (FLAGS_test_all_data) {
double sum = 0; LOG(INFO) << "test all data";
LOG(INFO) << "Total number of samples: " << data.datasets.size(); std::vector<std::vector<PaddleTensor>> input_slots_all;
for (int i = 0; i < repeat; i++) { for (size_t bid = 0; bid < data.batched_datas.size(); ++bid) {
for (size_t bid = 0; bid < data.batched_datas.size(); ++bid) { GetOneBatch(&input_slots, &data, batch_size);
GetOneBatch(&input_slots, &data, batch_size); input_slots_all.emplace_back(input_slots);
timer.tic();
predictor->Run(input_slots, &outputs_slots);
sum += timer.toc();
}
} }
PrintTime(batch_size, repeat, 1, 0, sum / repeat); LOG(INFO) << "total number of samples: " << data.datasets.size();
LOG(INFO) << "Average latency of each sample: " TestPrediction(cfg, input_slots_all, &outputs_slots, FLAGS_num_threads);
<< sum / repeat / data.datasets.size() << " ms";
return; return;
} }
timer.tic(); timer.tic();
...@@ -218,19 +166,10 @@ void TestLACPrediction(const std::string &model_path, ...@@ -218,19 +166,10 @@ void TestLACPrediction(const std::string &model_path,
if (use_analysis) { if (use_analysis) {
// run once for comparion as reference // run once for comparion as reference
auto ref_predictor = auto ref_predictor =
CreatePaddlePredictor<NativeConfig, PaddleEngineKind::kNative>(config); CreatePaddlePredictor<NativeConfig, PaddleEngineKind::kNative>(cfg);
std::vector<PaddleTensor> ref_outputs_slots; std::vector<PaddleTensor> ref_outputs_slots;
ref_predictor->Run(input_slots, &ref_outputs_slots); ref_predictor->Run(input_slots, &ref_outputs_slots);
EXPECT_EQ(ref_outputs_slots.size(), outputs_slots.size()); CompareResult(ref_outputs_slots, outputs_slots);
auto &ref_out = ref_outputs_slots[0];
size_t ref_size =
std::accumulate(ref_out.shape.begin(), ref_out.shape.end(), 1,
[](int a, int b) { return a * b; });
EXPECT_EQ(size, ref_size);
int64_t *pdata_ref = static_cast<int64_t *>(ref_out.data.data());
for (size_t i = 0; i < size; ++i) {
EXPECT_EQ(pdata_ref[i], pdata[i]);
}
AnalysisPredictor *analysis_predictor = AnalysisPredictor *analysis_predictor =
dynamic_cast<AnalysisPredictor *>(predictor.get()); dynamic_cast<AnalysisPredictor *>(predictor.get());
...@@ -259,13 +198,13 @@ void TestLACPrediction(const std::string &model_path, ...@@ -259,13 +198,13 @@ void TestLACPrediction(const std::string &model_path,
TEST(Analyzer_LAC, native) { TEST(Analyzer_LAC, native) {
LOG(INFO) << "LAC with native"; LOG(INFO) << "LAC with native";
TestLACPrediction(FLAGS_infer_model, FLAGS_infer_data, FLAGS_batch_size, TestLACPrediction(FLAGS_infer_model, FLAGS_infer_data, FLAGS_batch_size,
FLAGS_repeat, FLAGS_test_all_data); FLAGS_repeat);
} }
TEST(Analyzer_LAC, analysis) { TEST(Analyzer_LAC, analysis) {
LOG(INFO) << "LAC with analysis"; LOG(INFO) << "LAC with analysis";
TestLACPrediction(FLAGS_infer_model, FLAGS_infer_data, FLAGS_batch_size, TestLACPrediction(FLAGS_infer_model, FLAGS_infer_data, FLAGS_batch_size,
FLAGS_repeat, FLAGS_test_all_data, true); FLAGS_repeat, true);
} }
} // namespace analysis } // namespace analysis
......
...@@ -12,20 +12,7 @@ ...@@ -12,20 +12,7 @@
// See the License for the specific language governing permissions and // See the License for the specific language governing permissions and
// limitations under the License. // limitations under the License.
#include "paddle/fluid/inference/analysis/analyzer.h" #include "paddle/fluid/inference/tests/api/tester_helper.h"
#include <gtest/gtest.h>
#include "paddle/fluid/framework/ir/fuse_pass_base.h"
#include "paddle/fluid/inference/analysis/ut_helper.h"
#include "paddle/fluid/inference/api/analysis_predictor.h"
#include "paddle/fluid/inference/api/helper.h"
#include "paddle/fluid/inference/api/paddle_inference_pass.h"
#include "paddle/fluid/platform/profiler.h"
DEFINE_string(infer_model, "", "model path");
DEFINE_string(infer_data, "", "data path");
DEFINE_int32(batch_size, 10, "batch size.");
DEFINE_int32(repeat, 1, "Running the inference program repeat times.");
DEFINE_bool(test_all_data, false, "Test the all dataset in data file.");
namespace paddle { namespace paddle {
namespace inference { namespace inference {
...@@ -113,49 +100,35 @@ const int chinese_ner_result_data[] = {30, 45, 41, 48, 17, 26, ...@@ -113,49 +100,35 @@ const int chinese_ner_result_data[] = {30, 45, 41, 48, 17, 26,
48, 39, 38, 16, 25}; 48, 39, 38, 16, 25};
void TestChineseNERPrediction(bool use_analysis) { void TestChineseNERPrediction(bool use_analysis) {
NativeConfig config; AnalysisConfig cfg;
config.prog_file = FLAGS_infer_model + "/__model__"; cfg.prog_file = FLAGS_infer_model + "/__model__";
config.param_file = FLAGS_infer_model + "/param"; cfg.param_file = FLAGS_infer_model + "/param";
config.use_gpu = false; cfg.use_gpu = false;
config.device = 0; cfg.device = 0;
config.specify_input_name = true; cfg.specify_input_name = true;
cfg.enable_ir_optim = true;
std::vector<PaddleTensor> input_slots, outputs; std::vector<PaddleTensor> input_slots, outputs;
std::unique_ptr<PaddlePredictor> predictor; std::unique_ptr<PaddlePredictor> predictor;
Timer timer; Timer timer;
if (use_analysis) { if (use_analysis) {
AnalysisConfig cfg;
cfg.prog_file = FLAGS_infer_model + "/__model__";
cfg.param_file = FLAGS_infer_model + "/param";
cfg.use_gpu = false;
cfg.device = 0;
cfg.specify_input_name = true;
cfg.enable_ir_optim = true;
predictor = predictor =
CreatePaddlePredictor<AnalysisConfig, PaddleEngineKind::kAnalysis>(cfg); CreatePaddlePredictor<AnalysisConfig, PaddleEngineKind::kAnalysis>(cfg);
} else { } else {
predictor = predictor =
CreatePaddlePredictor<NativeConfig, PaddleEngineKind::kNative>(config); CreatePaddlePredictor<NativeConfig, PaddleEngineKind::kNative>(cfg);
} }
if (FLAGS_test_all_data) { if (FLAGS_test_all_data) {
LOG(INFO) << "test all data"; LOG(INFO) << "test all data";
double sum = 0; DataRecord data(FLAGS_infer_data, FLAGS_batch_size);
size_t num_samples; std::vector<std::vector<PaddleTensor>> input_slots_all;
for (int i = 0; i < FLAGS_repeat; i++) { for (size_t bid = 0; bid < data.num_samples / FLAGS_batch_size; ++bid) {
DataRecord data(FLAGS_infer_data, FLAGS_batch_size); PrepareInputs(&input_slots, &data, FLAGS_batch_size);
num_samples = data.num_samples; input_slots_all.emplace_back(input_slots);
for (size_t bid = 0; bid < num_samples; ++bid) {
PrepareInputs(&input_slots, &data, FLAGS_batch_size);
timer.tic();
predictor->Run(input_slots, &outputs);
sum += timer.toc();
}
} }
LOG(INFO) << "total number of samples: " << num_samples; LOG(INFO) << "total number of samples: " << data.num_samples;
PrintTime(FLAGS_batch_size, FLAGS_repeat, 1, 0, sum / FLAGS_repeat); TestPrediction(cfg, input_slots_all, &outputs, FLAGS_num_threads);
LOG(INFO) << "average latency of each sample: "
<< sum / FLAGS_repeat / num_samples;
return; return;
} }
// Prepare inputs. // Prepare inputs.
...@@ -181,19 +154,10 @@ void TestChineseNERPrediction(bool use_analysis) { ...@@ -181,19 +154,10 @@ void TestChineseNERPrediction(bool use_analysis) {
if (use_analysis) { if (use_analysis) {
// run once for comparion as reference // run once for comparion as reference
auto ref_predictor = auto ref_predictor =
CreatePaddlePredictor<NativeConfig, PaddleEngineKind::kNative>(config); CreatePaddlePredictor<NativeConfig, PaddleEngineKind::kNative>(cfg);
std::vector<PaddleTensor> ref_outputs_slots; std::vector<PaddleTensor> ref_outputs_slots;
ref_predictor->Run(input_slots, &ref_outputs_slots); ref_predictor->Run(input_slots, &ref_outputs_slots);
EXPECT_EQ(ref_outputs_slots.size(), outputs.size()); CompareResult(ref_outputs_slots, outputs);
auto &ref_out = ref_outputs_slots[0];
size_t ref_size =
std::accumulate(ref_out.shape.begin(), ref_out.shape.end(), 1,
[](int a, int b) { return a * b; });
EXPECT_EQ(size, ref_size);
int64_t *pdata_ref = static_cast<int64_t *>(ref_out.data.data());
for (size_t i = 0; i < size; ++i) {
EXPECT_EQ(pdata_ref[i], result[i]);
}
AnalysisPredictor *analysis_predictor = AnalysisPredictor *analysis_predictor =
dynamic_cast<AnalysisPredictor *>(predictor.get()); dynamic_cast<AnalysisPredictor *>(predictor.get());
......
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/inference/tests/api/tester_helper.h"
namespace paddle {
namespace inference {
using namespace framework; // NOLINT
struct DataRecord {
std::vector<std::vector<std::vector<float>>> link_step_data_all;
std::vector<std::vector<float>> week_data_all, minute_data_all;
std::vector<size_t> lod1, lod2, lod3;
std::vector<std::vector<float>> rnn_link_data, rnn_week_datas,
rnn_minute_datas;
size_t batch_iter{0};
size_t batch_size{1};
DataRecord() = default;
explicit DataRecord(const std::string &path, int batch_size = 1)
: batch_size(batch_size) {
Load(path);
}
DataRecord NextBatch() {
DataRecord data;
size_t batch_end = batch_iter + batch_size;
// NOTE skip the final batch, if no enough data is provided.
if (batch_end <= link_step_data_all.size()) {
data.link_step_data_all.assign(link_step_data_all.begin() + batch_iter,
link_step_data_all.begin() + batch_end);
data.week_data_all.assign(week_data_all.begin() + batch_iter,
week_data_all.begin() + batch_end);
data.minute_data_all.assign(minute_data_all.begin() + batch_iter,
minute_data_all.begin() + batch_end);
// Prepare LoDs
data.lod1.push_back(0);
data.lod2.push_back(0);
data.lod3.push_back(0);
CHECK(!data.link_step_data_all.empty()) << "empty";
CHECK(!data.week_data_all.empty());
CHECK(!data.minute_data_all.empty());
CHECK_EQ(data.link_step_data_all.size(), data.week_data_all.size());
CHECK_EQ(data.minute_data_all.size(), data.link_step_data_all.size());
for (size_t j = 0; j < data.link_step_data_all.size(); j++) {
for (const auto &d : data.link_step_data_all[j]) {
data.rnn_link_data.push_back(d);
}
data.rnn_week_datas.push_back(data.week_data_all[j]);
data.rnn_minute_datas.push_back(data.minute_data_all[j]);
// calculate lod
data.lod1.push_back(data.lod1.back() +
data.link_step_data_all[j].size());
data.lod3.push_back(data.lod3.back() + 1);
for (size_t i = 1; i < data.link_step_data_all[j].size() + 1; i++) {
data.lod2.push_back(data.lod2.back() +
data.link_step_data_all[j].size());
}
}
}
batch_iter += batch_size;
return data;
}
void Load(const std::string &path) {
std::ifstream file(path);
std::string line;
int num_lines = 0;
while (std::getline(file, line)) {
num_lines++;
std::vector<std::string> data;
split(line, ':', &data);
std::vector<std::vector<float>> link_step_data;
std::vector<std::string> link_datas;
split(data[0], '|', &link_datas);
for (auto &step_data : link_datas) {
std::vector<float> tmp;
split_to_float(step_data, ',', &tmp);
link_step_data.push_back(tmp);
}
// load week data
std::vector<float> week_data;
split_to_float(data[2], ',', &week_data);
// load minute data
std::vector<float> minute_data;
split_to_float(data[1], ',', &minute_data);
link_step_data_all.push_back(std::move(link_step_data));
week_data_all.push_back(std::move(week_data));
minute_data_all.push_back(std::move(minute_data));
}
}
};
void PrepareInputs(std::vector<PaddleTensor> *input_slots, DataRecord *data,
int batch_size) {
PaddleTensor lod_attention_tensor, init_zero_tensor, lod_tensor_tensor,
week_tensor, minute_tensor;
lod_attention_tensor.name = "data_lod_attention";
init_zero_tensor.name = "cell_init";
lod_tensor_tensor.name = "data";
week_tensor.name = "week";
minute_tensor.name = "minute";
auto one_batch = data->NextBatch();
std::vector<int> rnn_link_data_shape(
{static_cast<int>(one_batch.rnn_link_data.size()),
static_cast<int>(one_batch.rnn_link_data.front().size())});
lod_attention_tensor.shape.assign({1, 2});
lod_attention_tensor.lod.assign({one_batch.lod1, one_batch.lod2});
init_zero_tensor.shape.assign({batch_size, 15});
init_zero_tensor.lod.assign({one_batch.lod3});
lod_tensor_tensor.shape = rnn_link_data_shape;
lod_tensor_tensor.lod.assign({one_batch.lod1});
// clang-format off
week_tensor.shape.assign(
{static_cast<int>(one_batch.rnn_week_datas.size()),
static_cast<int>(one_batch.rnn_week_datas.front().size())});
week_tensor.lod.assign({one_batch.lod3});
minute_tensor.shape.assign(
{static_cast<int>(one_batch.rnn_minute_datas.size()),
static_cast<int>(one_batch.rnn_minute_datas.front().size())});
minute_tensor.lod.assign({one_batch.lod3});
// clang-format on
// assign data
TensorAssignData<float>(&lod_attention_tensor,
std::vector<std::vector<float>>({{0, 0}}));
std::vector<float> tmp_zeros(batch_size * 15, 0.);
TensorAssignData<float>(&init_zero_tensor, {tmp_zeros});
TensorAssignData<float>(&lod_tensor_tensor, one_batch.rnn_link_data);
TensorAssignData<float>(&week_tensor, one_batch.rnn_week_datas);
TensorAssignData<float>(&minute_tensor, one_batch.rnn_minute_datas);
// Set inputs.
auto init_zero_tensor1 = init_zero_tensor;
init_zero_tensor1.name = "hidden_init";
input_slots->assign({week_tensor, init_zero_tensor, minute_tensor,
init_zero_tensor1, lod_attention_tensor,
lod_tensor_tensor});
for (auto &tensor : *input_slots) {
tensor.dtype = PaddleDType::FLOAT32;
}
}
// Test with a really complicated model.
//
// Runs the first batch of FLAGS_infer_data through a native predictor (the
// reference) and through TestOneThreadPrediction / TestMultiThreadPrediction.
//   use_analysis: only gates the fuse-statistics checks at the end.
//   activate_ir:  written to config.enable_ir_optim; also gates those checks.
//   num_threads:  1 selects the single-thread path, whose outputs are compared
//                 with the native reference; any other value selects the
//                 multi-thread path, whose outputs are not compared here.
void TestRNN1Prediction(bool use_analysis, bool activate_ir, int num_threads) {
AnalysisConfig config;
config.prog_file = FLAGS_infer_model + "/__model__";
config.param_file = FLAGS_infer_model + "/param";
config.use_gpu = false;
config.device = 0;
config.specify_input_name = true;
config.enable_ir_optim = activate_ir;
PADDLE_ENFORCE(config.ir_mode ==
AnalysisConfig::IrPassMode::kExclude); // default
config.ir_passes.clear(); // Do not exclude any pass.
int batch_size = FLAGS_batch_size;
// Native predictor built from the same config; its output is the reference.
auto base_predictor =
CreatePaddlePredictor<NativeConfig, PaddleEngineKind::kNative>(config);
// In this function `predictor` is never Run; it is only inspected below for
// its analysis results.
auto predictor =
CreatePaddlePredictor<AnalysisConfig, PaddleEngineKind::kAnalysis>(
config);
std::vector<PaddleTensor> input_slots;
DataRecord data(FLAGS_infer_data, batch_size);
// Prepare inputs.
PrepareInputs(&input_slots, &data, batch_size);
std::vector<PaddleTensor> outputs, base_outputs;
base_predictor->Run(input_slots, &base_outputs);
// The helpers take a list of batches; only one batch is used here.
std::vector<std::vector<PaddleTensor>> input_slots_all;
input_slots_all.emplace_back(input_slots);
if (num_threads == 1) {
TestOneThreadPrediction(config, input_slots_all, &outputs);
CompareResult(outputs, base_outputs);
} else {
// only return the output of first thread
TestMultiThreadPrediction(config, input_slots_all, &outputs, num_threads);
}
if (use_analysis && activate_ir) {
// NOTE(review): the dynamic_cast result is dereferenced without a null check.
AnalysisPredictor *analysis_predictor =
dynamic_cast<AnalysisPredictor *>(predictor.get());
auto &fuse_statis = analysis_predictor->analysis_argument()
.Get<std::unordered_map<std::string, int>>(
framework::ir::kFuseStatisAttr);
for (auto &item : fuse_statis) {
LOG(INFO) << "fused " << item.first << " " << item.second;
}
// Count the nodes of the analysed graph for which IsFunction() is true.
int num_ops = 0;
for (auto &node :
analysis_predictor->analysis_argument().main_dfg->nodes.nodes()) {
if (node->IsFunction()) {
++num_ops;
}
}
LOG(INFO) << "has num ops: " << num_ops;
// Fatal check first: the .at() lookups below throw when a key is missing.
ASSERT_TRUE(fuse_statis.count("fc_fuse"));
EXPECT_EQ(fuse_statis.at("fc_fuse"), 1);
EXPECT_EQ(fuse_statis.at("fc_nobias_lstm_fuse"), 2); // bi-directional LSTM
EXPECT_EQ(fuse_statis.at("seq_concat_fc_fuse"), 1);
EXPECT_EQ(num_ops,
13); // After graph optimization, only 13 operators exist.
}
}
// Inference with analysis and IR, easy for profiling independently.
// Arguments: use_analysis = true, activate_ir = true; the thread count comes
// from the --num_threads flag.
TEST(Analyzer, rnn1) { TestRNN1Prediction(true, true, FLAGS_num_threads); }
// Remaining RNN1 unit tests: every combination of use_analysis / activate_ir
// exercised below is run once single-threaded and once with four threads.
TEST(Analyzer, RNN_tests) {
  const int kThreadCounts[2] = {1, 4};
  for (const int threads : kThreadCounts) {
    // Plain inference on the original model.
    TestRNN1Prediction(/*use_analysis=*/false, /*activate_ir=*/false, threads);
    // Analysis switched on: the analysis module transforms the program into a
    // data flow graph.
    TestRNN1Prediction(/*use_analysis=*/true, /*activate_ir=*/false, threads);
    // Analysis plus IR: the IR module fuses some large kernels.
    TestRNN1Prediction(/*use_analysis=*/true, /*activate_ir=*/true, threads);
  }
}
} // namespace inference
} // namespace paddle
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/inference/analysis/analyzer.h"
#include <google/protobuf/text_format.h>
#include <gtest/gtest.h>
#include <thread> // NOLINT
#include "paddle/fluid/framework/ir/fuse_pass_base.h"
#include "paddle/fluid/framework/ir/pass.h"
#include "paddle/fluid/inference/analysis/ut_helper.h"
#include "paddle/fluid/inference/api/analysis_predictor.h"
#include "paddle/fluid/inference/api/helper.h"
#include "paddle/fluid/inference/api/paddle_inference_api.h"
#include "paddle/fluid/inference/api/paddle_inference_pass.h"
// Command-line flags (gflags) of this tester, read through FLAGS_<name>.
// infer_model: directory expected to contain "__model__" and "param".
DEFINE_string(infer_model, "", "model path");
// infer_data: path of the data file handed to DataRecord.
DEFINE_string(infer_data, "", "data path");
DEFINE_int32(batch_size, 1, "batch size.");
DEFINE_int32(repeat, 1, "Running the inference program repeat times.");
// NOTE(review): num_threads is not referenced by the code visible in this file.
DEFINE_int32(num_threads, 1, "Running the inference program in multi-threads.");
namespace paddle {
namespace inference {
using namespace framework; // NOLINT
// Dataset holder for the RNN2 tester.
//
// The data file alternates between a feature line and a result line. Load()
// keeps the features per sample in link_step_data_all and appends every
// result value to result_data. NextBatch() slices out one batch and flattens
// it into rnn_link_data with matching LoD offsets in lod.
struct DataRecord {
  std::vector<std::vector<std::vector<float>>> link_step_data_all;
  std::vector<size_t> lod;
  std::vector<std::vector<float>> rnn_link_data;
  std::vector<float> result_data;
  size_t batch_iter{0};
  size_t batch_size{1};

  DataRecord() = default;

  explicit DataRecord(const std::string &path, int batch_size = 1)
      : batch_size(batch_size) {
    Load(path);
  }

  // Hands out the next batch_size samples. The cursor always moves forward;
  // when not enough samples are left, the returned record is empty.
  DataRecord NextBatch() {
    DataRecord batch;
    const size_t first = batch_iter;
    const size_t last = first + batch_size;
    batch_iter = last;
    // NOTE skip the final batch, if no enough data is provided.
    if (last > link_step_data_all.size()) {
      return batch;
    }
    batch.link_step_data_all.assign(link_step_data_all.begin() + first,
                                    link_step_data_all.begin() + last);
    // Prepare LoDs
    batch.lod.push_back(0);
    CHECK(!batch.link_step_data_all.empty()) << "empty";
    for (const auto &sample : batch.link_step_data_all) {
      for (const auto &token : sample) {
        batch.rnn_link_data.push_back(token);
        // Every token advances the LoD offset by 11.
        batch.lod.push_back(batch.lod.back() + 11);
      }
    }
    return batch;
  }

  // Reads `path`: odd-numbered lines (1st, 3rd, ...) carry features, the
  // even-numbered ones carry results. The payload is the second ':'-separated
  // column of each line.
  void Load(const std::string &path) {
    std::ifstream file(path);
    std::string line;
    bool is_feature_line = true;
    for (; std::getline(file, line); is_feature_line = !is_feature_line) {
      std::vector<std::string> columns;
      split(line, ':', &columns);

      if (!is_feature_line) {  // result
        std::vector<float> values;
        split_to_float(columns[1], ',', &values);
        result_data.insert(result_data.end(), values.begin(), values.end());
        continue;
      }

      // feature
      std::vector<std::string> pieces;
      split(columns[1], ' ', &pieces);
      std::vector<std::vector<float>> tokens;
      std::vector<float> pending;
      for (size_t k = 0; k < pieces.size(); ++k) {
        std::vector<float> values;
        split_to_float(pieces[k], ',', &values);
        pending.insert(pending.end(), values.begin(), values.end());
        // each sample has 11 features
        if ((k + 1) % 11 == 0) {
          tokens.push_back(pending);
          pending.clear();
        }
      }
      link_step_data_all.push_back(std::move(tokens));
    }
  }
};
void PrepareInputs(std::vector<PaddleTensor> *input_slots, DataRecord *data,
int batch_size) {
PaddleTensor feed_tensor;
feed_tensor.name = "feed";
auto one_batch = data->NextBatch();
int token_size = one_batch.rnn_link_data.size();
// each token has 11 features, each feature's dim is 54.
std::vector<int> rnn_link_data_shape({token_size * 11, 54});
feed_tensor.shape = rnn_link_data_shape;
feed_tensor.lod.assign({one_batch.lod});
feed_tensor.dtype = PaddleDType::FLOAT32;
TensorAssignData<float>(&feed_tensor, one_batch.rnn_link_data);
// Set inputs.
input_slots->assign({feed_tensor});
}
void CompareResult(const std::vector<PaddleTensor> &outputs,
const std::vector<float> &base_result) {
PADDLE_ENFORCE_GT(outputs.size(), 0);
for (size_t i = 0; i < outputs.size(); i++) {
auto &out = outputs[i];
size_t size = std::accumulate(out.shape.begin(), out.shape.end(), 1,
[](int a, int b) { return a * b; });
PADDLE_ENFORCE_GT(size, 0);
float *data = static_cast<float *>(out.data.data());
for (size_t i = 0; i < size; i++) {
EXPECT_NEAR(data[i], base_result[i], 1e-3);
}
}
}
// Test with a really complicate model.
void TestRNN2Prediction() {
AnalysisConfig config;
config.prog_file = FLAGS_infer_model + "/__model__";
config.param_file = FLAGS_infer_model + "/param";
config.use_gpu = false;
config.device = 0;
config.specify_input_name = true;
config.enable_ir_optim = true;
PADDLE_ENFORCE(config.ir_mode ==
AnalysisConfig::IrPassMode::kExclude); // default
int batch_size = FLAGS_batch_size;
int num_times = FLAGS_repeat;
auto base_predictor =
CreatePaddlePredictor<NativeConfig, PaddleEngineKind::kNative>(config);
auto predictor =
CreatePaddlePredictor<AnalysisConfig, PaddleEngineKind::kAnalysis>(
config);
std::vector<PaddleTensor> input_slots;
DataRecord data(FLAGS_infer_data, batch_size);
PrepareInputs(&input_slots, &data, batch_size);
std::vector<PaddleTensor> outputs, base_outputs;
Timer timer1;
timer1.tic();
for (int i = 0; i < num_times; i++) {
base_predictor->Run(input_slots, &base_outputs);
}
PrintTime(batch_size, num_times, 1, 0, timer1.toc() / num_times);
Timer timer2;
timer2.tic();
for (int i = 0; i < num_times; i++) {
predictor->Run(input_slots, &outputs);
}
PrintTime(batch_size, num_times, 1, 0, timer2.toc() / num_times);
CompareResult(base_outputs, data.result_data);
CompareResult(outputs, data.result_data);
}
// gtest entry point: runs the RNN2 timing and comparison routine once.
TEST(Analyzer, rnn2) { TestRNN2Prediction(); }
} // namespace inference
} // namespace paddle
...@@ -12,23 +12,7 @@ ...@@ -12,23 +12,7 @@
// See the License for the specific language governing permissions and // See the License for the specific language governing permissions and
// limitations under the License. // limitations under the License.
#include "paddle/fluid/inference/analysis/analyzer.h" #include "paddle/fluid/inference/tests/api/tester_helper.h"
#include <gflags/gflags.h>
#include <glog/logging.h> // use glog instead of PADDLE_ENFORCE to avoid importing other paddle header files.
#include <gtest/gtest.h>
#include <fstream>
#include "paddle/fluid/framework/ir/pass.h"
#include "paddle/fluid/inference/analysis/ut_helper.h"
#include "paddle/fluid/inference/api/helper.h"
#include "paddle/fluid/inference/api/paddle_inference_api.h"
#include "paddle/fluid/inference/api/paddle_inference_pass.h"
#include "paddle/fluid/inference/api/timer.h"
DEFINE_string(infer_model, "", "Directory of the inference model.");
DEFINE_string(infer_data, "", "Path of the dataset.");
DEFINE_int32(batch_size, 1, "batch size.");
DEFINE_int32(repeat, 1, "How many times to repeat run.");
DEFINE_int32(topn, -1, "Run top n batches of data to save time");
namespace paddle { namespace paddle {
namespace inference { namespace inference {
...@@ -37,24 +21,25 @@ struct DataReader { ...@@ -37,24 +21,25 @@ struct DataReader {
explicit DataReader(const std::string &path) explicit DataReader(const std::string &path)
: file(new std::ifstream(path)) {} : file(new std::ifstream(path)) {}
bool NextBatch(PaddleTensor *tensor, int batch_size) { bool NextBatch(std::vector<PaddleTensor> *input, int batch_size) {
PADDLE_ENFORCE_EQ(batch_size, 1); PADDLE_ENFORCE_EQ(batch_size, 1);
std::string line; std::string line;
tensor->lod.clear(); PaddleTensor tensor;
tensor->lod.emplace_back(std::vector<size_t>({0})); tensor.dtype = PaddleDType::INT64;
tensor.lod.emplace_back(std::vector<size_t>({0}));
std::vector<int64_t> data; std::vector<int64_t> data;
for (int i = 0; i < batch_size; i++) { for (int i = 0; i < batch_size; i++) {
if (!std::getline(*file, line)) return false; if (!std::getline(*file, line)) return false;
inference::split_to_int64(line, ' ', &data); inference::split_to_int64(line, ' ', &data);
} }
tensor->lod.front().push_back(data.size()); tensor.lod.front().push_back(data.size());
tensor->data.Resize(data.size() * sizeof(int64_t)); tensor.data.Resize(data.size() * sizeof(int64_t));
memcpy(tensor->data.data(), data.data(), data.size() * sizeof(int64_t)); memcpy(tensor.data.data(), data.data(), data.size() * sizeof(int64_t));
tensor->shape.clear(); tensor.shape.push_back(data.size());
tensor->shape.push_back(data.size()); tensor.shape.push_back(1);
tensor->shape.push_back(1); input->assign({tensor});
return true; return true;
} }
...@@ -68,32 +53,28 @@ void Main(int batch_size) { ...@@ -68,32 +53,28 @@ void Main(int batch_size) {
config.model_dir = FLAGS_infer_model; config.model_dir = FLAGS_infer_model;
config.use_gpu = false; config.use_gpu = false;
config.enable_ir_optim = true; config.enable_ir_optim = true;
auto predictor =
CreatePaddlePredictor<AnalysisConfig, PaddleEngineKind::kAnalysis>(
config);
std::vector<PaddleTensor> input_slots(1);
// one batch starts
// data --
auto &input = input_slots[0];
input.dtype = PaddleDType::INT64;
inference::Timer timer; std::vector<PaddleTensor> input_slots, output_slots;
double sum = 0; DataReader reader(FLAGS_infer_data);
std::vector<PaddleTensor> output_slots; std::vector<std::vector<PaddleTensor>> input_slots_all;
int num_batches = 0; if (FLAGS_test_all_data) {
for (int t = 0; t < FLAGS_repeat; t++) { LOG(INFO) << "test all data";
DataReader reader(FLAGS_infer_data); int num_batches = 0;
while (reader.NextBatch(&input, FLAGS_batch_size)) { while (reader.NextBatch(&input_slots, FLAGS_batch_size)) {
if (FLAGS_topn > 0 && num_batches > FLAGS_topn) break; input_slots_all.emplace_back(input_slots);
timer.tic();
CHECK(predictor->Run(input_slots, &output_slots));
sum += timer.toc();
++num_batches; ++num_batches;
} }
LOG(INFO) << "total number of samples: " << num_batches * FLAGS_batch_size;
TestPrediction(config, input_slots_all, &output_slots, FLAGS_num_threads);
return;
} }
PrintTime(batch_size, FLAGS_repeat, 1, 0, sum / FLAGS_repeat);
// one batch starts
// data --
reader.NextBatch(&input_slots, FLAGS_batch_size);
input_slots_all.emplace_back(input_slots);
TestPrediction(config, input_slots_all, &output_slots, FLAGS_num_threads);
// Get output // Get output
LOG(INFO) << "get outputs " << output_slots.size(); LOG(INFO) << "get outputs " << output_slots.size();
......
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <fstream>
#include <iostream>
#include "paddle/fluid/inference/tests/api/tester_helper.h"
namespace paddle {
namespace inference {
namespace analysis {
// One parsed line of the input data file, as produced by ProcessALine.
struct Record {
std::vector<float> data;  // values parsed from the first tab-separated column
std::vector<int32_t> shape;  // values parsed from the second column
};
// Parses one line of the form "<data>\t<shape>", where both columns are
// space-separated numbers, into a Record. Aborts via CHECK when the line does
// not consist of exactly two tab-separated columns.
Record ProcessALine(const std::string &line) {
  VLOG(3) << "process a line";

  std::vector<std::string> columns;
  split(line, '\t', &columns);
  CHECK_EQ(columns.size(), 2UL)
      << "data format error, should be <data>\t<shape>";

  Record parsed;

  // First column: the flat float payload.
  std::vector<std::string> value_tokens;
  split(columns[0], ' ', &value_tokens);
  parsed.data.reserve(value_tokens.size());
  for (size_t k = 0; k < value_tokens.size(); ++k) {
    parsed.data.push_back(std::stof(value_tokens[k]));
  }

  // Second column: the integer dimensions.
  std::vector<std::string> dim_tokens;
  split(columns[1], ' ', &dim_tokens);
  parsed.shape.reserve(dim_tokens.size());
  for (size_t k = 0; k < dim_tokens.size(); ++k) {
    parsed.shape.push_back(std::stoi(dim_tokens[k]));
  }

  VLOG(3) << "data size " << parsed.data.size();
  VLOG(3) << "data shape size " << parsed.shape.size();
  return parsed;
}
/*
* Use the native and analysis fluid engines to run inference on the demo
* models: ocr, mobilenet and se_resnext50.
*
* Reads the first line of FLAGS_infer_data as the single input, runs the
* analysis predictor FLAGS_repeat times, compares its output with one run of
* a native predictor built from the same config, then logs the fuse
* statistics and the number of function nodes in the analysed graph.
*
* use_mkldnn is copied into cfg._use_mkldnn.
*/
void TestVisualPrediction(bool use_mkldnn) {
std::unique_ptr<PaddlePredictor> predictor;
AnalysisConfig cfg;
cfg.param_file = FLAGS_infer_model + "/__params__";
cfg.prog_file = FLAGS_infer_model + "/__model__";
cfg.use_gpu = false;
cfg._use_mkldnn = use_mkldnn;
cfg.device = 0;
cfg.enable_ir_optim = true;
// TODO(TJ): fix fusion gru
// NOTE(review): presumably ir_passes lists passes to exclude under the default
// ir_mode, so this line turns the GRU fuse pass off -- confirm.
cfg.ir_passes.push_back("fc_gru_fuse_pass");
#ifdef PADDLE_WITH_MKLDNN
// disable mkldnn fuse since it should have some bugs
cfg.ir_passes.push_back("conv_relu_mkldnn_fuse_pass");
#endif
predictor =
CreatePaddlePredictor<AnalysisConfig, PaddleEngineKind::kAnalysis>(cfg);
// Only have single batch of data.
// NOTE(review): the result of std::getline is not checked; a missing or empty
// data file reaches ProcessALine as an empty line.
std::string line;
std::ifstream file(FLAGS_infer_data);
std::getline(file, line);
auto record = ProcessALine(line);
file.close();
// Inference.
PaddleTensor input;
input.shape = record.shape;
// NOTE(review): presumably PaddleBuf wraps record.data's buffer without
// copying, in which case `record` must outlive `input` -- confirm.
input.data =
PaddleBuf(record.data.data(), record.data.size() * sizeof(float));
input.dtype = PaddleDType::FLOAT32;
std::vector<PaddleTensor> outputs_slots;
// Time FLAGS_repeat runs and report the mean per run.
Timer timer;
timer.tic();
for (int i = 0; i < FLAGS_repeat; i++) {
predictor->Run({input}, &outputs_slots);
}
PrintTime(/*batch size*/ 1, FLAGS_repeat, /*num threads*/ 1, /*thread id*/ 0,
timer.toc() / FLAGS_repeat);
VLOG(3) << "output.size " << outputs_slots.size();
// run native as reference
auto ref_predictor =
CreatePaddlePredictor<NativeConfig, PaddleEngineKind::kNative>(cfg);
std::vector<PaddleTensor> ref_outputs_slots;
ref_predictor->Run({input}, &ref_outputs_slots);
CompareResult(outputs_slots, ref_outputs_slots);
// print what are fused
// NOTE(review): the dynamic_cast result is dereferenced without a null check.
AnalysisPredictor *analysis_predictor =
dynamic_cast<AnalysisPredictor *>(predictor.get());
auto &fuse_statis = analysis_predictor->analysis_argument()
.Get<std::unordered_map<std::string, int>>(
framework::ir::kFuseStatisAttr);
for (auto &item : fuse_statis) {
LOG(INFO) << "fused " << item.first << " " << item.second;
}
// Count the nodes of the analysed graph for which IsFunction() is true.
int num_ops = 0;
for (auto &node :
analysis_predictor->analysis_argument().main_dfg->nodes.nodes()) {
if (node->IsFunction()) {
++num_ops;
}
}
LOG(INFO) << "has num ops: " << num_ops;
}
// Runs the visual-model comparison with use_mkldnn set to false.
TEST(Analyzer_vis, analysis) { TestVisualPrediction(/*use_mkldnn*/ false); }
#ifdef PADDLE_WITH_MKLDNN
// Same comparison with use_mkldnn set to true; only compiled when
// PADDLE_WITH_MKLDNN is defined.
TEST(Analyzer_vis, analysis_mkldnn) {
TestVisualPrediction(/*use_mkldnn*/ true);
}
#endif
} // namespace analysis
} // namespace inference
} // namespace paddle
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <gtest/gtest.h>
#include <thread> // NOLINT
#include <vector>
#include "paddle/fluid/framework/ir/fuse_pass_base.h"
#include "paddle/fluid/inference/analysis/analyzer.h"
#include "paddle/fluid/inference/analysis/ut_helper.h"
#include "paddle/fluid/inference/api/analysis_predictor.h"
#include "paddle/fluid/inference/api/helper.h"
#include "paddle/fluid/inference/api/paddle_inference_pass.h"
#include "paddle/fluid/platform/profiler.h"
// Command-line flags (gflags) shared by the testers that include this header;
// each is read through FLAGS_<name>.
// NOTE(review): DEFINE_* creates definitions, so this header can presumably be
// included from only one translation unit per test binary -- confirm.
DEFINE_string(infer_model, "", "model path");
DEFINE_string(infer_data, "", "data file");
DEFINE_int32(batch_size, 1, "batch size.");
// NOTE(review): `burning` is not referenced by the helpers in this header.
DEFINE_int32(burning, 0, "Burning before repeat.");
DEFINE_int32(repeat, 1, "Running the inference program repeat times.");
DEFINE_bool(test_all_data, false, "Test the all dataset in data file.");
DEFINE_int32(num_threads, 1, "Running the inference program in multi-threads.");
namespace paddle {
namespace inference {
void CompareResult(const std::vector<PaddleTensor> &outputs,
const std::vector<PaddleTensor> &ref_outputs) {
EXPECT_GT(outputs.size(), 0);
EXPECT_EQ(outputs.size(), ref_outputs.size());
for (size_t i = 0; i < outputs.size(); i++) {
auto &out = outputs[i];
auto &ref_out = ref_outputs[i];
size_t size = std::accumulate(out.shape.begin(), out.shape.end(), 1,
[](int a, int b) { return a * b; });
size_t ref_size =
std::accumulate(ref_out.shape.begin(), ref_out.shape.end(), 1,
[](int a, int b) { return a * b; });
EXPECT_GT(size, 0);
EXPECT_EQ(size, ref_size);
EXPECT_EQ(out.dtype, ref_out.dtype);
switch (out.dtype) {
case PaddleDType::INT64: {
int64_t *pdata = static_cast<int64_t *>(out.data.data());
int64_t *pdata_ref = static_cast<int64_t *>(ref_out.data.data());
for (size_t j = 0; j < size; ++j) {
EXPECT_EQ(pdata_ref[j], pdata[j]);
}
break;
}
case PaddleDType::FLOAT32: {
float *pdata = static_cast<float *>(out.data.data());
float *pdata_ref = static_cast<float *>(ref_out.data.data());
for (size_t j = 0; j < size; ++j) {
EXPECT_NEAR(pdata_ref[j], pdata[j], 1e-3);
}
break;
}
}
}
}
// Creates one analysis predictor from `config` and runs every batch in
// `inputs` through it, FLAGS_repeat times over. `outputs` receives whatever
// the last Run call wrote. The mean time per repetition is printed via
// PrintTime.
void TestOneThreadPrediction(
    AnalysisConfig config, const std::vector<std::vector<PaddleTensor>> inputs,
    std::vector<PaddleTensor> *outputs) {
  const int batch_size = FLAGS_batch_size;
  const int repeat = FLAGS_repeat;
  auto predictor =
      CreatePaddlePredictor<AnalysisConfig, PaddleEngineKind::kAnalysis>(
          config);

  Timer stopwatch;
  stopwatch.tic();
  for (int round = 0; round < repeat; ++round) {
    for (const auto &batch : inputs) {
      predictor->Run(batch, outputs);
    }
  }
  PrintTime(batch_size, repeat, 1, 0, stopwatch.toc() / repeat,
            inputs.size());
}
// Runs the same batches concurrently on `num_threads` threads, each with its
// own analysis predictor, and prints the mean time per repetition per thread.
//
//   config:      predictor configuration, used for every thread.
//   inputs:      list of batches; each thread works on a private copy.
//   outputs:     receives the outputs of the last Run of thread 0 (may be
//                null, in which case nothing is returned).
//   num_threads: number of worker threads to start.
//
// Previously `outputs` was never written, so callers always saw it unchanged
// after a multi-thread run.
void TestMultiThreadPrediction(
    AnalysisConfig config, const std::vector<std::vector<PaddleTensor>> inputs,
    std::vector<PaddleTensor> *outputs, int num_threads) {
  int batch_size = FLAGS_batch_size;
  int num_times = FLAGS_repeat;
  std::vector<std::thread> threads;
  std::vector<std::unique_ptr<PaddlePredictor>> predictors;
  // TODO(yanchunwei): Bug here, the analyzer phase can't be parallelled
  // because AttentionLSTM's hard code nodeid will be damaged.
  // All predictors are therefore created up front, on the calling thread.
  for (int tid = 0; tid < num_threads; ++tid) {
    predictors.emplace_back(
        CreatePaddlePredictor<AnalysisConfig, PaddleEngineKind::kAnalysis>(
            config));
  }
  for (int tid = 0; tid < num_threads; ++tid) {
    threads.emplace_back([&, tid]() {
      // Each thread should have local inputs and outputs.
      // The inputs of each thread are all the same.
      std::vector<std::vector<PaddleTensor>> inputs_tid = inputs;
      std::vector<PaddleTensor> outputs_tid;
      Timer timer;
      timer.tic();
      for (int i = 0; i < num_times; i++) {
        for (size_t j = 0; j < inputs_tid.size(); j++) {
          predictors[tid]->Run(inputs_tid[j], &outputs_tid);
        }
      }
      PrintTime(batch_size, num_times, num_threads, tid,
                timer.toc() / num_times, inputs_tid.size());
      // Only thread 0 publishes its result, so no other thread touches
      // `outputs`; the caller reads it after join(). Swapping hands the
      // buffer over without copying any tensor.
      if (tid == 0 && outputs != nullptr) {
        outputs->swap(outputs_tid);
      }
    });
  }
  for (int i = 0; i < num_threads; ++i) {
    threads[i].join();
  }
}
// Dispatches to the single-thread helper when `num_threads` is exactly 1 and
// to the multi-thread helper for every other value. All other arguments are
// forwarded unchanged.
void TestPrediction(AnalysisConfig config,
                    const std::vector<std::vector<PaddleTensor>> inputs,
                    std::vector<PaddleTensor> *outputs, int num_threads) {
  if (num_threads != 1) {
    TestMultiThreadPrediction(config, inputs, outputs, num_threads);
    return;
  }
  TestOneThreadPrediction(config, inputs, outputs);
}
} // namespace inference
} // namespace paddle
...@@ -24,28 +24,28 @@ namespace operators { ...@@ -24,28 +24,28 @@ namespace operators {
void AttentionLSTMOp::InferShape(framework::InferShapeContext* ctx) const { void AttentionLSTMOp::InferShape(framework::InferShapeContext* ctx) const {
PADDLE_ENFORCE(ctx->HasInput("X"), PADDLE_ENFORCE(ctx->HasInput("X"),
"Input(X) of AttentionLSTM should not be null."); "Assert only one Input(X) of AttentionLSTM.");
PADDLE_ENFORCE(ctx->HasInput("C0"), PADDLE_ENFORCE(ctx->HasInput("C0"),
"Input(C0) of AttentionLSTM should not be null."); "Assert only one Input(C0) of AttentionLSTM.");
PADDLE_ENFORCE(ctx->HasInput("LSTMWeight"), PADDLE_ENFORCE(ctx->HasInput("LSTMWeight"),
"Input(LSTMWeight) of AttentionLSTM should not be null."); "Assert only one Input(LSTMWeight) of AttentionLSTM.");
PADDLE_ENFORCE(ctx->HasInput("LSTMBias"), PADDLE_ENFORCE(ctx->HasInput("LSTMBias"),
"Input(LSTMBias) of AttentionLSTM should not be null."); "Assert only one Input(LSTMBias) of AttentionLSTM.");
PADDLE_ENFORCE(ctx->HasInput("AttentionWeight"), PADDLE_ENFORCE(ctx->HasInput("AttentionWeight"),
"Input(AttentionWeight) of AttentionLSTM should not be null."); "Assert only one Input(AttentionWeight) of AttentionLSTM.");
PADDLE_ENFORCE(ctx->HasOutput("Hidden"), PADDLE_ENFORCE(ctx->HasOutput("Hidden"),
"Output(Hidden) of AttentionLSTM should not be null."); "Assert only one Output(Hidden) of AttentionLSTM.");
PADDLE_ENFORCE(ctx->HasOutput("Cell"), PADDLE_ENFORCE(ctx->HasOutput("Cell"),
"Output(Cell) of AttentionLSTM should not be null."); "Assert only one Output(Cell) of AttentionLSTM.");
PADDLE_ENFORCE(ctx->HasOutput("AttentionedX"), PADDLE_ENFORCE(ctx->HasOutput("AttentionedX"),
"Output(AttentionedX) of AttentionLSTM should not be null."); "Assert only one Output(AttentionedX) of AttentionLSTM.");
PADDLE_ENFORCE(ctx->HasOutput("AttentionFCOut"), PADDLE_ENFORCE(ctx->HasOutput("AttentionFCOut"),
"Output(AttentionFCOut) of AttentionLSTM should not be null."); "Assert only one Output(AttentionFCOut) of AttentionLSTM.");
PADDLE_ENFORCE(ctx->HasOutput("LSTMX"), PADDLE_ENFORCE(ctx->HasOutput("LSTMX"),
"Output(LSTMX) of AttentionLSTM should not be null."); "Assert only one Output(LSTMX) of AttentionLSTM.");
PADDLE_ENFORCE(ctx->HasOutput("LSTMOUT"), PADDLE_ENFORCE(ctx->HasOutput("LSTMOUT"),
"Output(LSTMOUT) of AttentionLSTM should not be null."); "Assert only one Output(LSTMOUT) of AttentionLSTM.");
auto x_dims = ctx->GetInputDim("X"); auto x_dims = ctx->GetInputDim("X");
const int M = x_dims[1]; const int M = x_dims[1];
......
...@@ -118,7 +118,6 @@ class CUDNNConvOpKernel : public framework::OpKernel<T> { ...@@ -118,7 +118,6 @@ class CUDNNConvOpKernel : public framework::OpKernel<T> {
output_channels / groups * output_height * output_width * output_depth; output_channels / groups * output_height * output_width * output_depth;
int group_offset_filter = filter->numel() / groups; int group_offset_filter = filter->numel() / groups;
// ------------------- cudnn conv workspace --------------------- // ------------------- cudnn conv workspace ---------------------
void* cudnn_workspace = nullptr;
size_t workspace_size_in_bytes; // final workspace to allocate. size_t workspace_size_in_bytes; // final workspace to allocate.
size_t workspace_size_limit = kCONV_CUDNN_WORKSPACE_LIMIT_BYTES; size_t workspace_size_limit = kCONV_CUDNN_WORKSPACE_LIMIT_BYTES;
if (user_workspace_size > 0) { if (user_workspace_size > 0) {
...@@ -159,20 +158,18 @@ class CUDNNConvOpKernel : public framework::OpKernel<T> { ...@@ -159,20 +158,18 @@ class CUDNNConvOpKernel : public framework::OpKernel<T> {
PADDLE_ENFORCE_LE(workspace_size_in_bytes, workspace_size_limit, PADDLE_ENFORCE_LE(workspace_size_in_bytes, workspace_size_limit,
"workspace_size to be allocated exceeds the limit"); "workspace_size to be allocated exceeds the limit");
// Allocate on GPU memory
platform::CUDAPlace gpu = boost::get<platform::CUDAPlace>(ctx.GetPlace());
cudnn_workspace = paddle::memory::Alloc(gpu, workspace_size_in_bytes);
// ------------------- cudnn conv forward --------------------- // ------------------- cudnn conv forward ---------------------
ScalingParamType<T> alpha = 1.0f, beta = 0.0f; ScalingParamType<T> alpha = 1.0f, beta = 0.0f;
for (int i = 0; i < groups; i++) { for (int i = 0; i < groups; i++) {
CUDNN_ENFORCE(platform::dynload::cudnnConvolutionForward( auto cudnn_func = [&](void* cudnn_workspace) {
handle, &alpha, cudnn_input_desc, input_data + i * group_offset_in, CUDNN_ENFORCE(platform::dynload::cudnnConvolutionForward(
cudnn_filter_desc, filter_data + i * group_offset_filter, handle, &alpha, cudnn_input_desc, input_data + i * group_offset_in,
cudnn_conv_desc, algo, cudnn_workspace, workspace_size_in_bytes, cudnn_filter_desc, filter_data + i * group_offset_filter,
&beta, cudnn_output_desc, output_data + i * group_offset_out)); cudnn_conv_desc, algo, cudnn_workspace, workspace_size_in_bytes,
&beta, cudnn_output_desc, output_data + i * group_offset_out));
};
dev_ctx.RunCudnnFuncWithWorkspace(cudnn_func, workspace_size_in_bytes);
} }
// Release the cudnn workspace
paddle::memory::Free(gpu, cudnn_workspace);
} }
}; };
...@@ -314,11 +311,7 @@ class CUDNNConvGradOpKernel : public framework::OpKernel<T> { ...@@ -314,11 +311,7 @@ class CUDNNConvGradOpKernel : public framework::OpKernel<T> {
cudnn_filter_desc, filter_algo, &tmp_size)); cudnn_filter_desc, filter_algo, &tmp_size));
workspace_size_in_bytes = std::max(workspace_size_in_bytes, tmp_size); workspace_size_in_bytes = std::max(workspace_size_in_bytes, tmp_size);
} }
// ------------------- cudnn conv workspace ---------------------
// Already on GPU
void* cudnn_workspace = nullptr;
platform::CUDAPlace gpu = boost::get<platform::CUDAPlace>(ctx.GetPlace());
cudnn_workspace = paddle::memory::Alloc(gpu, workspace_size_in_bytes);
// ------------------- cudnn conv backward data --------------------- // ------------------- cudnn conv backward data ---------------------
ScalingParamType<T> alpha = 1.0f, beta = 0.0f; ScalingParamType<T> alpha = 1.0f, beta = 0.0f;
if (input_grad) { if (input_grad) {
...@@ -326,12 +319,15 @@ class CUDNNConvGradOpKernel : public framework::OpKernel<T> { ...@@ -326,12 +319,15 @@ class CUDNNConvGradOpKernel : public framework::OpKernel<T> {
// Because beta is zero, it is unnecessary to reset input_grad. // Because beta is zero, it is unnecessary to reset input_grad.
for (int i = 0; i < groups; i++) { for (int i = 0; i < groups; i++) {
CUDNN_ENFORCE(platform::dynload::cudnnConvolutionBackwardData( auto cudnn_func = [&](void* cudnn_workspace) {
handle, &alpha, cudnn_filter_desc, CUDNN_ENFORCE(platform::dynload::cudnnConvolutionBackwardData(
filter_data + i * group_offset_filter, cudnn_output_grad_desc, handle, &alpha, cudnn_filter_desc,
output_grad_data + i * group_offset_out, cudnn_conv_desc, data_algo, filter_data + i * group_offset_filter, cudnn_output_grad_desc,
cudnn_workspace, workspace_size_in_bytes, &beta, cudnn_input_desc, output_grad_data + i * group_offset_out, cudnn_conv_desc,
input_grad_data + i * group_offset_in)); data_algo, cudnn_workspace, workspace_size_in_bytes, &beta,
cudnn_input_desc, input_grad_data + i * group_offset_in));
};
dev_ctx.RunCudnnFuncWithWorkspace(cudnn_func, workspace_size_in_bytes);
} }
} }
// ------------------- cudnn conv backward filter --------------------- // ------------------- cudnn conv backward filter ---------------------
...@@ -339,16 +335,17 @@ class CUDNNConvGradOpKernel : public framework::OpKernel<T> { ...@@ -339,16 +335,17 @@ class CUDNNConvGradOpKernel : public framework::OpKernel<T> {
T* filter_grad_data = filter_grad->mutable_data<T>(ctx.GetPlace()); T* filter_grad_data = filter_grad->mutable_data<T>(ctx.GetPlace());
// Because beta is zero, it is unnecessary to reset filter_grad. // Because beta is zero, it is unnecessary to reset filter_grad.
for (int i = 0; i < groups; i++) { for (int i = 0; i < groups; i++) {
CUDNN_ENFORCE(platform::dynload::cudnnConvolutionBackwardFilter( auto cudnn_func = [&](void* cudnn_workspace) {
handle, &alpha, cudnn_input_desc, input_data + i * group_offset_in, CUDNN_ENFORCE(platform::dynload::cudnnConvolutionBackwardFilter(
cudnn_output_grad_desc, output_grad_data + i * group_offset_out, handle, &alpha, cudnn_input_desc,
cudnn_conv_desc, filter_algo, cudnn_workspace, input_data + i * group_offset_in, cudnn_output_grad_desc,
workspace_size_in_bytes, &beta, cudnn_filter_desc, output_grad_data + i * group_offset_out, cudnn_conv_desc,
filter_grad_data + i * group_offset_filter)); filter_algo, cudnn_workspace, workspace_size_in_bytes, &beta,
cudnn_filter_desc, filter_grad_data + i * group_offset_filter));
};
dev_ctx.RunCudnnFuncWithWorkspace(cudnn_func, workspace_size_in_bytes);
} }
} }
// Release the cudnn workspace
paddle::memory::Free(gpu, cudnn_workspace);
} }
}; };
......
...@@ -130,12 +130,13 @@ class ConvMKLDNNHandler : public platform::MKLDNNHandler { ...@@ -130,12 +130,13 @@ class ConvMKLDNNHandler : public platform::MKLDNNHandler {
std::shared_ptr<mkldnn::memory> AcquireWeightsMemoryFromPrimitive( std::shared_ptr<mkldnn::memory> AcquireWeightsMemoryFromPrimitive(
const std::shared_ptr<mkldnn::memory> user_weights_memory_p, const std::shared_ptr<mkldnn::memory> user_weights_memory_p,
std::vector<mkldnn::primitive>& pipeline) { // NOLINT std::vector<mkldnn::primitive>& pipeline, // NOLINT
bool is_persistent = false) {
auto user_weights_pd = user_weights_memory_p->get_primitive_desc(); auto user_weights_pd = user_weights_memory_p->get_primitive_desc();
auto weights_pd = conv_pd_->weights_primitive_desc(); auto weights_pd = conv_pd_->weights_primitive_desc();
return this->AcquireMemory(weights_pd, user_weights_pd, return this->AcquireMemory(weights_pd, user_weights_pd,
user_weights_memory_p, "@weights_mem_p", user_weights_memory_p, "@weights_mem_p",
pipeline); pipeline, is_persistent);
} }
std::shared_ptr<mkldnn::memory> AcquireBiasMemoryFromPrimitive( std::shared_ptr<mkldnn::memory> AcquireBiasMemoryFromPrimitive(
...@@ -266,6 +267,8 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel<T> { ...@@ -266,6 +267,8 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
PADDLE_ENFORCE(paddle::platform::is_cpu_place(ctx.GetPlace()), PADDLE_ENFORCE(paddle::platform::is_cpu_place(ctx.GetPlace()),
"It must use CPUPlace."); "It must use CPUPlace.");
const bool is_test = ctx.Attr<bool>("is_test");
auto& dev_ctx = auto& dev_ctx =
ctx.template device_context<paddle::platform::MKLDNNDeviceContext>(); ctx.template device_context<paddle::platform::MKLDNNDeviceContext>();
const auto& mkldnn_engine = dev_ctx.GetEngine(); const auto& mkldnn_engine = dev_ctx.GetEngine();
...@@ -296,10 +299,11 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel<T> { ...@@ -296,10 +299,11 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
std::vector<int> strides = ctx.Attr<std::vector<int>>("strides"); std::vector<int> strides = ctx.Attr<std::vector<int>>("strides");
std::vector<int> paddings = ctx.Attr<std::vector<int>>("paddings"); std::vector<int> paddings = ctx.Attr<std::vector<int>>("paddings");
std::vector<int> dilations = ctx.Attr<std::vector<int>>("dilations"); std::vector<int> dilations = ctx.Attr<std::vector<int>>("dilations");
bool fuse_relu = ctx.Attr<bool>("fuse_relu");
bool fuse_eltwise = ctx.Attr<bool>("fuse_eltwise");
int groups = ctx.Attr<int>("groups"); int groups = ctx.Attr<int>("groups");
// TODO(pzelazko-intel) add support for group convolution and dilation // TODO: add support for dilation
PADDLE_ENFORCE(groups == 1, "group convolution is not implemented yet");
PADDLE_ENFORCE( PADDLE_ENFORCE(
dilations.size() == 2 && dilations[0] == 1 && dilations[1] == 1, dilations.size() == 2 && dilations[0] == 1 && dilations[1] == 1,
"dilation in convolution is not implemented yet"); "dilation in convolution is not implemented yet");
...@@ -310,6 +314,19 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel<T> { ...@@ -310,6 +314,19 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
std::vector<int> src_tz = paddle::framework::vectorize2int(input->dims()); std::vector<int> src_tz = paddle::framework::vectorize2int(input->dims());
std::vector<int> weights_tz = std::vector<int> weights_tz =
paddle::framework::vectorize2int(filter->dims()); paddle::framework::vectorize2int(filter->dims());
int g = std::max(groups, 1);
if (g > 1) {
int o = weights_tz[0];
int i = weights_tz[1];
int h = weights_tz[2];
int w = weights_tz[3];
weights_tz.resize(5);
weights_tz[0] = g;
weights_tz[1] = o / g;
weights_tz[2] = i;
weights_tz[3] = h;
weights_tz[4] = w;
}
std::vector<int> dst_tz = paddle::framework::vectorize2int(output->dims()); std::vector<int> dst_tz = paddle::framework::vectorize2int(output->dims());
// Get unique name for storing MKLDNN primitives // Get unique name for storing MKLDNN primitives
...@@ -323,7 +340,8 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel<T> { ...@@ -323,7 +340,8 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
auto user_src_md = platform::MKLDNNMemDesc( auto user_src_md = platform::MKLDNNMemDesc(
{src_tz}, platform::MKLDNNGetDataType<T>(), input->format()); {src_tz}, platform::MKLDNNGetDataType<T>(), input->format());
auto user_weights_md = platform::MKLDNNMemDesc( auto user_weights_md = platform::MKLDNNMemDesc(
{weights_tz}, platform::MKLDNNGetDataType<T>(), filter->format()); {weights_tz}, platform::MKLDNNGetDataType<T>(),
(g == 1) ? filter->format() : mkldnn::memory::format::goihw);
/* create memory descriptor for convolution without specified format /* create memory descriptor for convolution without specified format
* ('any') which lets a primitive (convolution in this case) choose * ('any') which lets a primitive (convolution in this case) choose
...@@ -336,7 +354,8 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel<T> { ...@@ -336,7 +354,8 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
auto src_md = platform::MKLDNNMemDesc( auto src_md = platform::MKLDNNMemDesc(
src_tz, platform::MKLDNNGetDataType<T>(), chosen_memory_format); src_tz, platform::MKLDNNGetDataType<T>(), chosen_memory_format);
auto weights_md = platform::MKLDNNMemDesc( auto weights_md = platform::MKLDNNMemDesc(
weights_tz, platform::MKLDNNGetDataType<T>(), chosen_memory_format); weights_tz, platform::MKLDNNGetDataType<T>(),
(g == 1) ? chosen_memory_format : mkldnn::memory::format::goihw);
std::vector<int> bias_tz; // TODO(mgallus): avoid empty vector creation. std::vector<int> bias_tz; // TODO(mgallus): avoid empty vector creation.
// Currently used whenever bias is != nullptr. // Currently used whenever bias is != nullptr.
auto dst_md = platform::MKLDNNMemDesc( auto dst_md = platform::MKLDNNMemDesc(
...@@ -349,10 +368,12 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel<T> { ...@@ -349,10 +368,12 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
auto bias_md = platform::MKLDNNMemDesc( auto bias_md = platform::MKLDNNMemDesc(
bias_tz, platform::MKLDNNGetDataType<T>(), memory::format::x); bias_tz, platform::MKLDNNGetDataType<T>(), memory::format::x);
conv_pd = ConvFwdPrimitiveDesc(src_md, weights_md, bias_md, dst_md, conv_pd = ConvFwdPrimitiveDesc(src_md, weights_md, bias_md, dst_md,
strides, paddings, mkldnn_engine); strides, paddings, mkldnn_engine,
fuse_relu, fuse_eltwise);
} else { } else {
conv_pd = ConvFwdPrimitiveDesc(src_md, weights_md, dst_md, strides, conv_pd =
paddings, mkldnn_engine); ConvFwdPrimitiveDesc(src_md, weights_md, dst_md, strides, paddings,
mkldnn_engine, fuse_relu, fuse_eltwise);
} }
// Save conv_pd/src_memory/weights_memory for backward pass // Save conv_pd/src_memory/weights_memory for backward pass
dev_ctx.SetBlob(key_conv_pd, conv_pd); dev_ctx.SetBlob(key_conv_pd, conv_pd);
...@@ -371,7 +392,7 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel<T> { ...@@ -371,7 +392,7 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
auto src_memory_p = auto src_memory_p =
handler.AcquireSrcMemoryFromPrimitive(user_src_memory_p, pipeline); handler.AcquireSrcMemoryFromPrimitive(user_src_memory_p, pipeline);
auto weights_memory_p = handler.AcquireWeightsMemoryFromPrimitive( auto weights_memory_p = handler.AcquireWeightsMemoryFromPrimitive(
user_weights_memory_p, pipeline); user_weights_memory_p, pipeline, is_test);
auto dst_memory_p = auto dst_memory_p =
handler.AcquireDstMemoryFromPrimitive(to_void_cast<T>(output_data)); handler.AcquireDstMemoryFromPrimitive(to_void_cast<T>(output_data));
...@@ -402,11 +423,36 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel<T> { ...@@ -402,11 +423,36 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
} }
private: private:
// Assembles the primitive attributes that describe which operations are fused
// onto the convolution as MKL-DNN post-ops.
//
// fuse_relu:    append an eltwise ReLU after the convolution.
// fuse_eltwise: append a sum post-op so the convolution result is accumulated
//               onto the existing contents of the output buffer.
//
// When both flags are set the sum is registered before the ReLU, so the
// activation is applied to the accumulated value.
mkldnn::primitive_attr CreatePostOps(bool fuse_relu,
                                     bool fuse_eltwise) const {
  mkldnn::post_ops fused_ops;

  if (fuse_eltwise) {
    // Elementwise fusion is expressed as a scaled sum post-op:
    //   Output = scale * Output + Conv_Out.
    // The caller is expected to have placed the residual-connection data in
    // the Output tensor beforehand.
    constexpr float kSumScale = 1.0f;
    fused_ops.append_sum(kSumScale);
  }

  if (fuse_relu) {
    // ReLU fusion goes through the eltwise post-op; a zero negative slope
    // selects the plain (non-leaky) form and the last argument is a
    // placeholder for this algorithm.
    fused_ops.append_eltwise(/*scale=*/1.0f, mkldnn::algorithm::eltwise_relu,
                             /*negative_slope=*/0.0f, /*placeholder=*/0.0f);
  }

  mkldnn::primitive_attr attrs;
  attrs.set_post_ops(fused_ops);
  return attrs;
}
std::unique_ptr<mkldnn::convolution_forward::primitive_desc> std::unique_ptr<mkldnn::convolution_forward::primitive_desc>
ConvFwdPrimitiveDesc(const memory::desc& src, const memory::desc& weights, ConvFwdPrimitiveDesc(const memory::desc& src, const memory::desc& weights,
const memory::desc& dst, const std::vector<int>& strides, const memory::desc& dst, const std::vector<int>& strides,
const std::vector<int>& paddings, const std::vector<int>& paddings,
const mkldnn::engine& engine) const { const mkldnn::engine& engine, const bool fuse_relu,
const bool fuse_eltwise) const {
memory::dims stride_dims = {strides[0], strides[1]}; memory::dims stride_dims = {strides[0], strides[1]};
memory::dims padding_dims = {paddings[0], paddings[1]}; memory::dims padding_dims = {paddings[0], paddings[1]};
...@@ -415,8 +461,10 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel<T> { ...@@ -415,8 +461,10 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
dst, stride_dims, padding_dims, padding_dims, dst, stride_dims, padding_dims, padding_dims,
mkldnn::padding_kind::zero); mkldnn::padding_kind::zero);
auto p_conv_pd = mkldnn::primitive_attr conv_attr = CreatePostOps(fuse_relu, fuse_eltwise);
new mkldnn::convolution_forward::primitive_desc(conv_desc, engine);
auto p_conv_pd = new mkldnn::convolution_forward::primitive_desc(
conv_desc, conv_attr, engine);
return std::unique_ptr<mkldnn::convolution_forward::primitive_desc>( return std::unique_ptr<mkldnn::convolution_forward::primitive_desc>(
p_conv_pd); p_conv_pd);
...@@ -427,7 +475,8 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel<T> { ...@@ -427,7 +475,8 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
const memory::desc& bias, const memory::desc& dst, const memory::desc& bias, const memory::desc& dst,
const std::vector<int>& strides, const std::vector<int>& strides,
const std::vector<int>& paddings, const std::vector<int>& paddings,
const mkldnn::engine& engine) const { const mkldnn::engine& engine, const bool fuse_relu,
const bool fuse_eltwise) const {
memory::dims stride_dims = {strides[0], strides[1]}; memory::dims stride_dims = {strides[0], strides[1]};
memory::dims padding_dims = {paddings[0], paddings[1]}; memory::dims padding_dims = {paddings[0], paddings[1]};
...@@ -436,8 +485,10 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel<T> { ...@@ -436,8 +485,10 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
bias, dst, stride_dims, padding_dims, padding_dims, bias, dst, stride_dims, padding_dims, padding_dims,
mkldnn::padding_kind::zero); mkldnn::padding_kind::zero);
auto p_conv_pd = mkldnn::primitive_attr conv_attr = CreatePostOps(fuse_relu, fuse_eltwise);
new mkldnn::convolution_forward::primitive_desc(conv_desc, engine);
auto p_conv_pd = new mkldnn::convolution_forward::primitive_desc(
conv_desc, conv_attr, engine);
return std::unique_ptr<mkldnn::convolution_forward::primitive_desc>( return std::unique_ptr<mkldnn::convolution_forward::primitive_desc>(
p_conv_pd); p_conv_pd);
......
...@@ -109,6 +109,7 @@ framework::OpKernelType ConvOp::GetExpectedKernelType( ...@@ -109,6 +109,7 @@ framework::OpKernelType ConvOp::GetExpectedKernelType(
} }
void Conv2DOpMaker::Make() { void Conv2DOpMaker::Make() {
AddAttr<bool>("is_test", "").SetDefault(false);
AddInput( AddInput(
"Input", "Input",
"(Tensor) The input tensor of convolution operator. " "(Tensor) The input tensor of convolution operator. "
...@@ -161,6 +162,13 @@ void Conv2DOpMaker::Make() { ...@@ -161,6 +162,13 @@ void Conv2DOpMaker::Make() {
AddAttr<bool>("use_mkldnn", AddAttr<bool>("use_mkldnn",
"(bool, default false) Only used in mkldnn kernel") "(bool, default false) Only used in mkldnn kernel")
.SetDefault(false); .SetDefault(false);
AddAttr<bool>("fuse_relu", "(bool, default false) Only used in mkldnn kernel")
.SetDefault(false);
AddAttr<bool>("fuse_eltwise",
"(bool, default false) Only used in mkldnn kernel. Used "
"whenever convolution output is connected via skip connection "
"to a previous layer.")
.SetDefault(false);
AddAttr<std::string>( AddAttr<std::string>(
"data_format", "data_format",
"(string, default NCHW) Only used in " "(string, default NCHW) Only used in "
......
...@@ -76,7 +76,6 @@ class CUDNNConvTransposeOpKernel : public framework::OpKernel<T> { ...@@ -76,7 +76,6 @@ class CUDNNConvTransposeOpKernel : public framework::OpKernel<T> {
conv_desc.descriptor<T>(paddings, strides, dilations); conv_desc.descriptor<T>(paddings, strides, dilations);
// ------------------- cudnn conv workspace --------------------- // ------------------- cudnn conv workspace ---------------------
void* cudnn_workspace = nullptr;
size_t workspace_size_in_bytes; // final workspace to allocate. size_t workspace_size_in_bytes; // final workspace to allocate.
size_t workspace_size_limit = kConvCUDNNWorkspaceLimitBytes; size_t workspace_size_limit = kConvCUDNNWorkspaceLimitBytes;
if (user_workspace_size > 0) { if (user_workspace_size > 0) {
...@@ -100,25 +99,21 @@ class CUDNNConvTransposeOpKernel : public framework::OpKernel<T> { ...@@ -100,25 +99,21 @@ class CUDNNConvTransposeOpKernel : public framework::OpKernel<T> {
handle, cudnn_filter_desc, cudnn_input_desc, cudnn_conv_desc, handle, cudnn_filter_desc, cudnn_input_desc, cudnn_conv_desc,
cudnn_output_desc, algo, &workspace_size_in_bytes)); cudnn_output_desc, algo, &workspace_size_in_bytes));
// Allocate on GPU memory
platform::CUDAPlace gpu = boost::get<platform::CUDAPlace>(ctx.GetPlace());
cudnn_workspace = paddle::memory::Alloc(gpu, workspace_size_in_bytes);
// ------------------- cudnn conv transpose forward --------------------- // ------------------- cudnn conv transpose forward ---------------------
int input_offset = input->numel() / input->dims()[0] / groups; int input_offset = input->numel() / input->dims()[0] / groups;
int output_offset = output->numel() / output->dims()[0] / groups; int output_offset = output->numel() / output->dims()[0] / groups;
int filter_offset = filter->numel() / groups; int filter_offset = filter->numel() / groups;
T alpha = 1.0f, beta = 0.0f; T alpha = 1.0f, beta = 0.0f;
for (int g = 0; g < groups; g++) { for (int g = 0; g < groups; g++) {
CUDNN_ENFORCE(platform::dynload::cudnnConvolutionBackwardData( auto cudnn_func = [&](void* cudnn_workspace) {
handle, &alpha, cudnn_filter_desc, filter_data + filter_offset * g, CUDNN_ENFORCE(platform::dynload::cudnnConvolutionBackwardData(
cudnn_input_desc, input_data + input_offset * g, cudnn_conv_desc, handle, &alpha, cudnn_filter_desc, filter_data + filter_offset * g,
algo, cudnn_workspace, workspace_size_in_bytes, &beta, cudnn_input_desc, input_data + input_offset * g, cudnn_conv_desc,
cudnn_output_desc, output_data + output_offset * g)); algo, cudnn_workspace, workspace_size_in_bytes, &beta,
cudnn_output_desc, output_data + output_offset * g));
};
dev_ctx.RunCudnnFuncWithWorkspace(cudnn_func, workspace_size_in_bytes);
} }
// Release the cudnn workspace
paddle::memory::Free(gpu, cudnn_workspace);
} }
}; };
...@@ -206,11 +201,6 @@ class CUDNNConvTransposeGradOpKernel : public framework::OpKernel<T> { ...@@ -206,11 +201,6 @@ class CUDNNConvTransposeGradOpKernel : public framework::OpKernel<T> {
std::max(workspace_size_in_bytes, bwd_filter_ws_size); std::max(workspace_size_in_bytes, bwd_filter_ws_size);
} }
// ------------------- cudnn conv workspace ---------------------
// Already on GPU
void* cudnn_workspace = nullptr;
platform::CUDAPlace gpu = boost::get<platform::CUDAPlace>(ctx.GetPlace());
cudnn_workspace = paddle::memory::Alloc(gpu, workspace_size_in_bytes);
// ------------------- cudnn conv backward data --------------------- // ------------------- cudnn conv backward data ---------------------
// FIXME(typhoonzero): template type T may not be the same as cudnn call. // FIXME(typhoonzero): template type T may not be the same as cudnn call.
int input_offset = input->numel() / input->dims()[0] / groups; int input_offset = input->numel() / input->dims()[0] / groups;
...@@ -222,12 +212,15 @@ class CUDNNConvTransposeGradOpKernel : public framework::OpKernel<T> { ...@@ -222,12 +212,15 @@ class CUDNNConvTransposeGradOpKernel : public framework::OpKernel<T> {
T* input_grad_data = input_grad->mutable_data<T>(ctx.GetPlace()); T* input_grad_data = input_grad->mutable_data<T>(ctx.GetPlace());
// Because beta is zero, it is unnecessary to reset input_grad. // Because beta is zero, it is unnecessary to reset input_grad.
for (int g = 0; g < groups; g++) { for (int g = 0; g < groups; g++) {
CUDNN_ENFORCE(platform::dynload::cudnnConvolutionForward( auto cudnn_func = [&](void* cudnn_workspace) {
handle, &alpha, cudnn_output_desc, CUDNN_ENFORCE(platform::dynload::cudnnConvolutionForward(
output_grad_data + output_grad_offset * g, cudnn_filter_desc, handle, &alpha, cudnn_output_desc,
filter_data + filter_offset * g, cudnn_conv_desc, data_algo, output_grad_data + output_grad_offset * g, cudnn_filter_desc,
cudnn_workspace, workspace_size_in_bytes, &beta, cudnn_input_desc, filter_data + filter_offset * g, cudnn_conv_desc, data_algo,
input_grad_data + input_offset * g)); cudnn_workspace, workspace_size_in_bytes, &beta, cudnn_input_desc,
input_grad_data + input_offset * g));
};
dev_ctx.RunCudnnFuncWithWorkspace(cudnn_func, workspace_size_in_bytes);
} }
} }
...@@ -237,17 +230,17 @@ class CUDNNConvTransposeGradOpKernel : public framework::OpKernel<T> { ...@@ -237,17 +230,17 @@ class CUDNNConvTransposeGradOpKernel : public framework::OpKernel<T> {
// Because beta is zero, it is unnecessary to reset filter_grad. // Because beta is zero, it is unnecessary to reset filter_grad.
// Gradient with respect to the filter // Gradient with respect to the filter
for (int g = 0; g < groups; g++) { for (int g = 0; g < groups; g++) {
CUDNN_ENFORCE(platform::dynload::cudnnConvolutionBackwardFilter( auto cudnn_func = [&](void* cudnn_workspace) {
handle, &alpha, cudnn_output_desc, CUDNN_ENFORCE(platform::dynload::cudnnConvolutionBackwardFilter(
output_grad_data + output_grad_offset * g, cudnn_input_desc, handle, &alpha, cudnn_output_desc,
input_data + input_offset * g, cudnn_conv_desc, filter_algo, output_grad_data + output_grad_offset * g, cudnn_input_desc,
cudnn_workspace, workspace_size_in_bytes, &beta, cudnn_filter_desc, input_data + input_offset * g, cudnn_conv_desc, filter_algo,
filter_grad_data + filter_offset * g)); cudnn_workspace, workspace_size_in_bytes, &beta,
cudnn_filter_desc, filter_grad_data + filter_offset * g));
};
dev_ctx.RunCudnnFuncWithWorkspace(cudnn_func, workspace_size_in_bytes);
} }
} }
// Release the cudnn workspace
paddle::memory::Free(gpu, cudnn_workspace);
} }
}; };
......
...@@ -138,6 +138,11 @@ class CrossEntropyOpMaker : public framework::OpProtoAndCheckerMaker { ...@@ -138,6 +138,11 @@ class CrossEntropyOpMaker : public framework::OpProtoAndCheckerMaker {
"(bool, default false), a flag indicating whether to " "(bool, default false), a flag indicating whether to "
"interpretate the given labels as soft labels.") "interpretate the given labels as soft labels.")
.SetDefault(false); .SetDefault(false);
// Registers the label value that is skipped by the hard-label cross entropy.
// Adjacent string literals are concatenated with nothing in between, so each
// fragment ends with an explicit space to keep the help text readable.
AddAttr<int>("ignore_index",
             "(int, default -100), Specifies a target value that is "
             "ignored and does not contribute to the input gradient. "
             "Only valid if soft_label is set to False")
    .SetDefault(-100);
AddComment(R"DOC( AddComment(R"DOC(
CrossEntropy Operator. CrossEntropy Operator.
......
...@@ -40,7 +40,7 @@ class CrossEntropyOpKernel : public framework::OpKernel<T> { ...@@ -40,7 +40,7 @@ class CrossEntropyOpKernel : public framework::OpKernel<T> {
math::CrossEntropyFunctor<DeviceContext, T>()( math::CrossEntropyFunctor<DeviceContext, T>()(
ctx.template device_context<DeviceContext>(), &y_2d, &x_2d, &labels_2d, ctx.template device_context<DeviceContext>(), &y_2d, &x_2d, &labels_2d,
ctx.Attr<bool>("soft_label")); ctx.Attr<bool>("soft_label"), ctx.Attr<int>("ignore_index"));
} }
}; };
...@@ -74,16 +74,22 @@ class XeGradFunctor { ...@@ -74,16 +74,22 @@ class XeGradFunctor {
const T* dy, // NOLINT const T* dy, // NOLINT
const T* x, // NOLINT const T* x, // NOLINT
const int64_t* label, // NOLINT const int64_t* label, // NOLINT
size_t num_classes) size_t num_classes, size_t ignore_index)
: dx_(dx), dy_(dy), x_(x), label_(label), num_classes_(num_classes) {} : dx_(dx),
dy_(dy),
x_(x),
label_(label),
num_classes_(num_classes),
ignore_index_(ignore_index) {}
HOSTDEVICE void operator()(size_t sample_id) { HOSTDEVICE void operator()(size_t sample_id) {
auto x_is_true_offset = sample_id * num_classes_ + label_[sample_id]; auto x_is_true_offset = sample_id * num_classes_ + label_[sample_id];
for (size_t x_offset = sample_id * num_classes_; for (size_t x_offset = sample_id * num_classes_;
x_offset < (sample_id + 1) * num_classes_; ++x_offset) { x_offset < (sample_id + 1) * num_classes_; ++x_offset) {
dx_[x_offset] = x_offset != x_is_true_offset dx_[x_offset] =
? static_cast<T>(0) (x_offset != x_is_true_offset || label_[sample_id] == ignore_index_)
: -dy_[sample_id] / x_[x_offset]; ? static_cast<T>(0)
: -dy_[sample_id] / x_[x_offset];
} }
} }
...@@ -93,6 +99,7 @@ class XeGradFunctor { ...@@ -93,6 +99,7 @@ class XeGradFunctor {
const T* x_; const T* x_;
const int64_t* label_; const int64_t* label_;
size_t num_classes_; size_t num_classes_;
size_t ignore_index_;
}; };
template <typename DeviceContext, typename T> template <typename DeviceContext, typename T>
...@@ -109,6 +116,7 @@ class CrossEntropyGradientOpKernel : public framework::OpKernel<T> { ...@@ -109,6 +116,7 @@ class CrossEntropyGradientOpKernel : public framework::OpKernel<T> {
// unnecessary to convert tensors to 2-D views. // unnecessary to convert tensors to 2-D views.
int rank = x->dims().size(); int rank = x->dims().size();
int64_t class_num = x->dims()[rank - 1]; int64_t class_num = x->dims()[rank - 1];
int64_t ignore_index = ctx.Attr<int>("ignore_index");
if (ctx.Attr<bool>("soft_label")) { if (ctx.Attr<bool>("soft_label")) {
XeSoftlabelGradFunctor<T> functor(dx_data, dy->data<T>(), x->data<T>(), XeSoftlabelGradFunctor<T> functor(dx_data, dy->data<T>(), x->data<T>(),
label->data<T>(), label->data<T>(),
...@@ -118,9 +126,9 @@ class CrossEntropyGradientOpKernel : public framework::OpKernel<T> { ...@@ -118,9 +126,9 @@ class CrossEntropyGradientOpKernel : public framework::OpKernel<T> {
static_cast<size_t>(dx->numel())); static_cast<size_t>(dx->numel()));
for_range(functor); for_range(functor);
} else { } else {
XeGradFunctor<T> functor(dx_data, dy->data<T>(), x->data<T>(), XeGradFunctor<T> functor(
label->data<int64_t>(), dx_data, dy->data<T>(), x->data<T>(), label->data<int64_t>(),
static_cast<size_t>(class_num)); static_cast<size_t>(class_num), static_cast<size_t>(ignore_index));
platform::ForRange<DeviceContext> for_range( platform::ForRange<DeviceContext> for_range(
ctx.template device_context<DeviceContext>(), ctx.template device_context<DeviceContext>(),
static_cast<size_t>(dy->numel())); static_cast<size_t>(dy->numel()));
......
...@@ -9,6 +9,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ...@@ -9,6 +9,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#pragma once #pragma once
#include <algorithm>
#include "paddle/fluid/framework/eigen.h" #include "paddle/fluid/framework/eigen.h"
#include "paddle/fluid/framework/tensor.h" #include "paddle/fluid/framework/tensor.h"
...@@ -21,7 +22,7 @@ namespace operators { ...@@ -21,7 +22,7 @@ namespace operators {
*/ */
template <typename T> template <typename T>
inline void BoxToDelta(const int box_num, const framework::Tensor& ex_boxes, inline void BoxToDelta(const int box_num, const framework::Tensor& ex_boxes,
const framework::Tensor& gt_boxes, const T* weights, const framework::Tensor& gt_boxes, const float* weights,
const bool normalized, framework::Tensor* box_delta) { const bool normalized, framework::Tensor* box_delta) {
auto ex_boxes_et = framework::EigenTensor<T, 2>::From(ex_boxes); auto ex_boxes_et = framework::EigenTensor<T, 2>::From(ex_boxes);
auto gt_boxes_et = framework::EigenTensor<T, 2>::From(gt_boxes); auto gt_boxes_et = framework::EigenTensor<T, 2>::From(gt_boxes);
...@@ -62,5 +63,35 @@ void Gather(const T* in, const int in_stride, const int* index, const int num, ...@@ -62,5 +63,35 @@ void Gather(const T* in, const int in_stride, const int* index, const int num,
} }
} }
// Fills `overlaps` with the pairwise intersection-over-union of two box sets.
//
// r_boxes:  2-D tensor, one box per row, read as [x_min, y_min, x_max, y_max].
// c_boxes:  2-D tensor with the same per-row layout.
// overlaps: 2-D output tensor; entry (i, j) receives the IoU of r_boxes[i] and
//           c_boxes[j]. It is only written, never resized, so it presumably
//           must already be allocated as [r_boxes.dims()[0],
//           c_boxes.dims()[0]] -- TODO confirm against callers.
//
// Widths and heights are computed as (max - min + 1).
template <typename T>
void BboxOverlaps(const framework::Tensor& r_boxes,
                  const framework::Tensor& c_boxes,
                  framework::Tensor* overlaps) {
  auto rows = framework::EigenTensor<T, 2>::From(r_boxes);
  auto cols = framework::EigenTensor<T, 2>::From(c_boxes);
  auto iou = framework::EigenTensor<T, 2>::From(*overlaps);

  const int num_rows = r_boxes.dims()[0];
  const int num_cols = c_boxes.dims()[0];
  const T kZero = static_cast<T>(0.0);

  for (int r = 0; r < num_rows; ++r) {
    // The row box's area does not depend on the column box; compute it once.
    const T row_area =
        (rows(r, 2) - rows(r, 0) + 1) * (rows(r, 3) - rows(r, 1) + 1);

    for (int c = 0; c < num_cols; ++c) {
      const T col_area =
          (cols(c, 2) - cols(c, 0) + 1) * (cols(c, 3) - cols(c, 1) + 1);

      // Corners of the intersection rectangle.
      const T left = std::max(rows(r, 0), cols(c, 0));
      const T top = std::max(rows(r, 1), cols(c, 1));
      const T right = std::min(rows(r, 2), cols(c, 2));
      const T bottom = std::min(rows(r, 3), cols(c, 3));

      // Disjoint boxes give a non-positive extent; clamp it to zero.
      const T width = std::max(right - left + 1, kZero);
      const T height = std::max(bottom - top + 1, kZero);
      const T shared = width * height;

      // IoU = intersection / union, with union = A + B - intersection.
      iou(r, c) = shared / (row_area + col_area - shared);
    }
  }
}
} // namespace operators } // namespace operators
} // namespace paddle } // namespace paddle
...@@ -42,10 +42,11 @@ class GenerateProposalLabelsOp : public framework::OperatorWithKernel { ...@@ -42,10 +42,11 @@ class GenerateProposalLabelsOp : public framework::OperatorWithKernel {
"Input(RpnRois) shouldn't be null."); "Input(RpnRois) shouldn't be null.");
PADDLE_ENFORCE(ctx->HasInput("GtClasses"), PADDLE_ENFORCE(ctx->HasInput("GtClasses"),
"Input(GtClasses) shouldn't be null."); "Input(GtClasses) shouldn't be null.");
PADDLE_ENFORCE(ctx->HasInput("IsCrowd"),
"Input(IsCrowd) shouldn't be null.");
PADDLE_ENFORCE(ctx->HasInput("GtBoxes"), PADDLE_ENFORCE(ctx->HasInput("GtBoxes"),
"Input(GtBoxes) shouldn't be null."); "Input(GtBoxes) shouldn't be null.");
PADDLE_ENFORCE(ctx->HasInput("ImScales"), PADDLE_ENFORCE(ctx->HasInput("ImInfo"), "Input(ImInfo) shouldn't be null.");
"Input(ImScales) shouldn't be null.");
PADDLE_ENFORCE(ctx->HasOutput("Rois"), PADDLE_ENFORCE(ctx->HasOutput("Rois"),
"Output(Rois) of RpnTargetAssignOp should not be null"); "Output(Rois) of RpnTargetAssignOp should not be null");
...@@ -64,22 +65,21 @@ class GenerateProposalLabelsOp : public framework::OperatorWithKernel { ...@@ -64,22 +65,21 @@ class GenerateProposalLabelsOp : public framework::OperatorWithKernel {
auto rpn_rois_dims = ctx->GetInputDim("RpnRois"); auto rpn_rois_dims = ctx->GetInputDim("RpnRois");
auto gt_classes_dims = ctx->GetInputDim("GtClasses"); auto gt_classes_dims = ctx->GetInputDim("GtClasses");
auto is_crowd_dims = ctx->GetInputDim("IsCrowd");
auto gt_boxes_dims = ctx->GetInputDim("GtBoxes"); auto gt_boxes_dims = ctx->GetInputDim("GtBoxes");
auto im_scales_dims = ctx->GetInputDim("ImScales"); auto im_info_dims = ctx->GetInputDim("ImInfo");
PADDLE_ENFORCE_EQ(rpn_rois_dims.size(), 2, PADDLE_ENFORCE_EQ(rpn_rois_dims.size(), 2,
"The rank of Input(RpnRois) must be 2."); "The rank of Input(RpnRois) must be 2.");
PADDLE_ENFORCE_EQ(gt_classes_dims.size(), 1,
"The rank of Input(GtClasses) must be 1.");
PADDLE_ENFORCE_EQ(gt_boxes_dims.size(), 2, PADDLE_ENFORCE_EQ(gt_boxes_dims.size(), 2,
"The rank of Input(GtBoxes) must be 2."); "The rank of Input(GtBoxes) must be 2.");
PADDLE_ENFORCE_EQ(im_scales_dims.size(), 1, PADDLE_ENFORCE_EQ(im_info_dims.size(), 2,
"The rank of Input(ImScales) must be 1."); "The rank of Input(ImInfo) must be 2.");
int class_nums = ctx->Attrs().Get<int>("class_nums"); int class_nums = ctx->Attrs().Get<int>("class_nums");
ctx->SetOutputDim("Rois", {-1, 4}); ctx->SetOutputDim("Rois", {-1, 4});
ctx->SetOutputDim("LabelsInt32", {-1}); ctx->SetOutputDim("LabelsInt32", {-1, 1});
ctx->SetOutputDim("BboxTargets", {-1, 4 * class_nums}); ctx->SetOutputDim("BboxTargets", {-1, 4 * class_nums});
ctx->SetOutputDim("BboxInsideWeights", {-1, 4 * class_nums}); ctx->SetOutputDim("BboxInsideWeights", {-1, 4 * class_nums});
ctx->SetOutputDim("BboxOutsideWeights", {-1, 4 * class_nums}); ctx->SetOutputDim("BboxOutsideWeights", {-1, 4 * class_nums});
...@@ -105,45 +105,18 @@ void Concat(const platform::CPUDeviceContext& context, ...@@ -105,45 +105,18 @@ void Concat(const platform::CPUDeviceContext& context,
concat_functor(context, inputs, axis, out_tensor); concat_functor(context, inputs, axis, out_tensor);
} }
// Computes the pairwise intersection-over-union between every row of
// `r_boxes` and every row of `c_boxes`, storing the result at
// overlaps(i, j). Box columns 0..3 are used as x_min, y_min, x_max, y_max
// with inclusive extents (max - min + 1).
// NOTE(review): `overlaps` is only written, never resized, so it is
// presumably pre-allocated as [r_num, c_num] by the caller -- confirm.
template <typename T>
void BboxOverlaps(const Tensor& r_boxes, const Tensor& c_boxes,
                  Tensor* overlaps) {
  auto r_et = framework::EigenTensor<T, 2>::From(r_boxes);
  auto c_et = framework::EigenTensor<T, 2>::From(c_boxes);
  auto out_et = framework::EigenTensor<T, 2>::From(*overlaps);

  const int num_r = r_boxes.dims()[0];
  const int num_c = c_boxes.dims()[0];
  const T lower_bound = static_cast<T>(0.0);

  for (int ri = 0; ri < num_r; ++ri) {
    const T area_r =
        (r_et(ri, 2) - r_et(ri, 0) + 1) * (r_et(ri, 3) - r_et(ri, 1) + 1);
    for (int ci = 0; ci < num_c; ++ci) {
      const T area_c =
          (c_et(ci, 2) - c_et(ci, 0) + 1) * (c_et(ci, 3) - c_et(ci, 1) + 1);

      const T ix_min = std::max(r_et(ri, 0), c_et(ci, 0));
      const T iy_min = std::max(r_et(ri, 1), c_et(ci, 1));
      const T ix_max = std::min(r_et(ri, 2), c_et(ci, 2));
      const T iy_max = std::min(r_et(ri, 3), c_et(ci, 3));

      // Clamp at zero so non-overlapping boxes contribute no intersection.
      const T iw = std::max(ix_max - ix_min + 1, lower_bound);
      const T ih = std::max(iy_max - iy_min + 1, lower_bound);
      const T area_i = iw * ih;

      out_et(ri, ci) = area_i / (area_r + area_c - area_i);
    }
  }
}
template <typename T> template <typename T>
std::vector<std::vector<int>> SampleFgBgGt( std::vector<std::vector<int>> SampleFgBgGt(
const platform::CPUDeviceContext& context, Tensor* iou, const platform::CPUDeviceContext& context, Tensor* iou,
const int batch_size_per_im, const float fg_fraction, const float fg_thresh, const Tensor& is_crowd, const int batch_size_per_im,
const float bg_thresh_hi, const float bg_thresh_lo, const float fg_fraction, const float fg_thresh, const float bg_thresh_hi,
std::minstd_rand engine) { const float bg_thresh_lo, std::minstd_rand engine, const bool use_random) {
std::vector<int> fg_inds; std::vector<int> fg_inds;
std::vector<int> bg_inds; std::vector<int> bg_inds;
std::vector<int> gt_inds; std::vector<int> gt_inds;
T* proposal_to_gt_overlaps = iou->mutable_data<T>(context.GetPlace()); int64_t gt_num = is_crowd.numel();
const int* crowd_data = is_crowd.data<int>();
T* proposal_to_gt_overlaps = iou->data<T>();
int64_t row = iou->dims()[0]; int64_t row = iou->dims()[0];
int64_t col = iou->dims()[1]; int64_t col = iou->dims()[1];
float epsilon = 0.00001; float epsilon = 0.00001;
...@@ -152,6 +125,9 @@ std::vector<std::vector<int>> SampleFgBgGt( ...@@ -152,6 +125,9 @@ std::vector<std::vector<int>> SampleFgBgGt(
for (int64_t i = 0; i < row; ++i) { for (int64_t i = 0; i < row; ++i) {
const T* v = proposal_to_gt_overlaps + i * col; const T* v = proposal_to_gt_overlaps + i * col;
T max_overlap = *std::max_element(v, v + col); T max_overlap = *std::max_element(v, v + col);
if ((i < gt_num) && (crowd_data[i])) {
max_overlap = -1.0;
}
if (max_overlap > fg_thresh) { if (max_overlap > fg_thresh) {
for (int64_t j = 0; j < col; ++j) { for (int64_t j = 0; j < col; ++j) {
T val = proposal_to_gt_overlaps[i * col + j]; T val = proposal_to_gt_overlaps[i * col + j];
...@@ -170,17 +146,19 @@ std::vector<std::vector<int>> SampleFgBgGt( ...@@ -170,17 +146,19 @@ std::vector<std::vector<int>> SampleFgBgGt(
} }
// Reservoir Sampling // Reservoir Sampling
std::uniform_real_distribution<float> uniform(0, 1);
int fg_rois_per_im = std::floor(batch_size_per_im * fg_fraction); int fg_rois_per_im = std::floor(batch_size_per_im * fg_fraction);
int fg_rois_this_image = fg_inds.size(); int fg_rois_this_image = fg_inds.size();
int fg_rois_per_this_image = std::min(fg_rois_per_im, fg_rois_this_image); int fg_rois_per_this_image = std::min(fg_rois_per_im, fg_rois_this_image);
std::uniform_real_distribution<float> uniform(0, 1); if (use_random) {
const int64_t fg_size = static_cast<int64_t>(fg_inds.size()); const int64_t fg_size = static_cast<int64_t>(fg_inds.size());
if (fg_size > fg_rois_per_this_image) { if (fg_size > fg_rois_per_this_image) {
for (int64_t i = fg_rois_per_this_image; i < fg_size; ++i) { for (int64_t i = fg_rois_per_this_image; i < fg_size; ++i) {
int rng_ind = std::floor(uniform(engine) * i); int rng_ind = std::floor(uniform(engine) * i);
if (rng_ind < fg_rois_per_this_image) { if (rng_ind < fg_rois_per_this_image) {
std::iter_swap(fg_inds.begin() + rng_ind, fg_inds.begin() + i); std::iter_swap(fg_inds.begin() + rng_ind, fg_inds.begin() + i);
std::iter_swap(gt_inds.begin() + rng_ind, gt_inds.begin() + i); std::iter_swap(gt_inds.begin() + rng_ind, gt_inds.begin() + i);
}
} }
} }
} }
...@@ -192,12 +170,14 @@ std::vector<std::vector<int>> SampleFgBgGt( ...@@ -192,12 +170,14 @@ std::vector<std::vector<int>> SampleFgBgGt(
int bg_rois_per_image = batch_size_per_im - fg_rois_per_this_image; int bg_rois_per_image = batch_size_per_im - fg_rois_per_this_image;
int bg_rois_this_image = bg_inds.size(); int bg_rois_this_image = bg_inds.size();
int bg_rois_per_this_image = std::min(bg_rois_per_image, bg_rois_this_image); int bg_rois_per_this_image = std::min(bg_rois_per_image, bg_rois_this_image);
const int64_t bg_size = static_cast<int64_t>(bg_inds.size()); if (use_random) {
if (bg_size > bg_rois_per_this_image) { const int64_t bg_size = static_cast<int64_t>(bg_inds.size());
for (int64_t i = bg_rois_per_this_image; i < bg_size; ++i) { if (bg_size > bg_rois_per_this_image) {
int rng_ind = std::floor(uniform(engine) * i); for (int64_t i = bg_rois_per_this_image; i < bg_size; ++i) {
if (rng_ind < fg_rois_per_this_image) int rng_ind = std::floor(uniform(engine) * i);
std::iter_swap(bg_inds.begin() + rng_ind, bg_inds.begin() + i); if (rng_ind < fg_rois_per_this_image)
std::iter_swap(bg_inds.begin() + rng_ind, bg_inds.begin() + i);
}
} }
} }
std::vector<int> new_bg_inds(bg_inds.begin(), std::vector<int> new_bg_inds(bg_inds.begin(),
...@@ -248,14 +228,14 @@ void GatherBoxesLabels(const platform::CPUDeviceContext& context, ...@@ -248,14 +228,14 @@ void GatherBoxesLabels(const platform::CPUDeviceContext& context,
template <typename T> template <typename T>
std::vector<Tensor> SampleRoisForOneImage( std::vector<Tensor> SampleRoisForOneImage(
const platform::CPUDeviceContext& context, Tensor* rpn_rois, const platform::CPUDeviceContext& context, Tensor* rpn_rois,
Tensor* gt_classes, Tensor* gt_boxes, Tensor* im_scale, Tensor* gt_classes, Tensor* is_crowd, Tensor* gt_boxes, Tensor* im_info,
const int batch_size_per_im, const float fg_fraction, const float fg_thresh, const int batch_size_per_im, const float fg_fraction, const float fg_thresh,
const float bg_thresh_hi, const float bg_thresh_lo, const float bg_thresh_hi, const float bg_thresh_lo,
const std::vector<float>& bbox_reg_weights, const int class_nums, const std::vector<float>& bbox_reg_weights, const int class_nums,
std::minstd_rand engine) { std::minstd_rand engine, bool use_random) {
auto rpn_rois_et = framework::EigenTensor<T, 2>::From(*rpn_rois); auto rpn_rois_et = framework::EigenTensor<T, 2>::From(*rpn_rois);
auto im_scale_data = im_scale->data<T>()[0]; auto im_scale = im_info->data<T>()[2];
rpn_rois_et = rpn_rois_et / im_scale_data; rpn_rois_et = rpn_rois_et / im_scale;
Tensor boxes; Tensor boxes;
int proposals_num = gt_boxes->dims()[0] + rpn_rois->dims()[0]; int proposals_num = gt_boxes->dims()[0] + rpn_rois->dims()[0];
...@@ -270,8 +250,8 @@ std::vector<Tensor> SampleRoisForOneImage( ...@@ -270,8 +250,8 @@ std::vector<Tensor> SampleRoisForOneImage(
// Generate proposal index // Generate proposal index
std::vector<std::vector<int>> fg_bg_gt = SampleFgBgGt<T>( std::vector<std::vector<int>> fg_bg_gt = SampleFgBgGt<T>(
context, &proposal_to_gt_overlaps, batch_size_per_im, fg_fraction, context, &proposal_to_gt_overlaps, *is_crowd, batch_size_per_im,
fg_thresh, bg_thresh_hi, bg_thresh_lo, engine); fg_fraction, fg_thresh, bg_thresh_hi, bg_thresh_lo, engine, use_random);
std::vector<int> fg_inds = fg_bg_gt[0]; std::vector<int> fg_inds = fg_bg_gt[0];
std::vector<int> bg_inds = fg_bg_gt[1]; std::vector<int> bg_inds = fg_bg_gt[1];
std::vector<int> gt_inds = fg_bg_gt[2]; std::vector<int> gt_inds = fg_bg_gt[2];
...@@ -291,15 +271,15 @@ std::vector<Tensor> SampleRoisForOneImage( ...@@ -291,15 +271,15 @@ std::vector<Tensor> SampleRoisForOneImage(
// Compute targets // Compute targets
Tensor bbox_targets_single; Tensor bbox_targets_single;
bbox_targets_single.mutable_data<T>(bbox_dim, context.GetPlace()); bbox_targets_single.mutable_data<T>(bbox_dim, context.GetPlace());
BoxToDelta<T>(fg_num, sampled_boxes, sampled_gts, nullptr, false, BoxToDelta<T>(fg_num, sampled_boxes, sampled_gts, bbox_reg_weights.data(),
&bbox_targets_single); false, &bbox_targets_single);
// Scale rois // Scale rois
Tensor sampled_rois; Tensor sampled_rois;
sampled_rois.mutable_data<T>(sampled_boxes.dims(), context.GetPlace()); sampled_rois.mutable_data<T>(sampled_boxes.dims(), context.GetPlace());
auto sampled_rois_et = framework::EigenTensor<T, 2>::From(sampled_rois); auto sampled_rois_et = framework::EigenTensor<T, 2>::From(sampled_rois);
auto sampled_boxes_et = framework::EigenTensor<T, 2>::From(sampled_boxes); auto sampled_boxes_et = framework::EigenTensor<T, 2>::From(sampled_boxes);
sampled_rois_et = sampled_boxes_et * im_scale_data; sampled_rois_et = sampled_boxes_et * im_scale;
// Expand box targets // Expand box targets
Tensor bbox_targets, bbox_inside_weights, bbox_outside_weights; Tensor bbox_targets, bbox_inside_weights, bbox_outside_weights;
...@@ -351,8 +331,9 @@ class GenerateProposalLabelsKernel : public framework::OpKernel<T> { ...@@ -351,8 +331,9 @@ class GenerateProposalLabelsKernel : public framework::OpKernel<T> {
void Compute(const framework::ExecutionContext& context) const override { void Compute(const framework::ExecutionContext& context) const override {
auto* rpn_rois = context.Input<LoDTensor>("RpnRois"); auto* rpn_rois = context.Input<LoDTensor>("RpnRois");
auto* gt_classes = context.Input<LoDTensor>("GtClasses"); auto* gt_classes = context.Input<LoDTensor>("GtClasses");
auto* is_crowd = context.Input<LoDTensor>("IsCrowd");
auto* gt_boxes = context.Input<LoDTensor>("GtBoxes"); auto* gt_boxes = context.Input<LoDTensor>("GtBoxes");
auto* im_scales = context.Input<LoDTensor>("ImScales"); auto* im_info = context.Input<LoDTensor>("ImInfo");
auto* rois = context.Output<LoDTensor>("Rois"); auto* rois = context.Output<LoDTensor>("Rois");
auto* labels_int32 = context.Output<LoDTensor>("LabelsInt32"); auto* labels_int32 = context.Output<LoDTensor>("LabelsInt32");
...@@ -369,18 +350,21 @@ class GenerateProposalLabelsKernel : public framework::OpKernel<T> { ...@@ -369,18 +350,21 @@ class GenerateProposalLabelsKernel : public framework::OpKernel<T> {
std::vector<float> bbox_reg_weights = std::vector<float> bbox_reg_weights =
context.Attr<std::vector<float>>("bbox_reg_weights"); context.Attr<std::vector<float>>("bbox_reg_weights");
int class_nums = context.Attr<int>("class_nums"); int class_nums = context.Attr<int>("class_nums");
bool use_random = context.Attr<bool>("use_random");
PADDLE_ENFORCE_EQ(rpn_rois->lod().size(), 1UL, PADDLE_ENFORCE_EQ(rpn_rois->lod().size(), 1UL,
"GenerateProposalLabelsOp rpn_rois needs 1 level of LoD"); "GenerateProposalLabelsOp rpn_rois needs 1 level of LoD");
PADDLE_ENFORCE_EQ( PADDLE_ENFORCE_EQ(
gt_classes->lod().size(), 1UL, gt_classes->lod().size(), 1UL,
"GenerateProposalLabelsOp gt_classes needs 1 level of LoD"); "GenerateProposalLabelsOp gt_classes needs 1 level of LoD");
PADDLE_ENFORCE_EQ(is_crowd->lod().size(), 1UL,
"GenerateProposalLabelsOp is_crowd needs 1 level of LoD");
PADDLE_ENFORCE_EQ(gt_boxes->lod().size(), 1UL, PADDLE_ENFORCE_EQ(gt_boxes->lod().size(), 1UL,
"GenerateProposalLabelsOp gt_boxes needs 1 level of LoD"); "GenerateProposalLabelsOp gt_boxes needs 1 level of LoD");
int64_t n = static_cast<int64_t>(rpn_rois->lod().back().size() - 1); int64_t n = static_cast<int64_t>(rpn_rois->lod().back().size() - 1);
rois->mutable_data<T>({n * batch_size_per_im, kBoxDim}, context.GetPlace()); rois->mutable_data<T>({n * batch_size_per_im, kBoxDim}, context.GetPlace());
labels_int32->mutable_data<int>({n * batch_size_per_im}, labels_int32->mutable_data<int>({n * batch_size_per_im, 1},
context.GetPlace()); context.GetPlace());
bbox_targets->mutable_data<T>({n * batch_size_per_im, kBoxDim * class_nums}, bbox_targets->mutable_data<T>({n * batch_size_per_im, kBoxDim * class_nums},
context.GetPlace()); context.GetPlace());
...@@ -391,8 +375,7 @@ class GenerateProposalLabelsKernel : public framework::OpKernel<T> { ...@@ -391,8 +375,7 @@ class GenerateProposalLabelsKernel : public framework::OpKernel<T> {
std::random_device rnd; std::random_device rnd;
std::minstd_rand engine; std::minstd_rand engine;
int seed = int seed = rnd();
context.Attr<bool>("fix_seed") ? context.Attr<int>("seed") : rnd();
engine.seed(seed); engine.seed(seed);
framework::LoD lod; framework::LoD lod;
...@@ -403,19 +386,23 @@ class GenerateProposalLabelsKernel : public framework::OpKernel<T> { ...@@ -403,19 +386,23 @@ class GenerateProposalLabelsKernel : public framework::OpKernel<T> {
auto rpn_rois_lod = rpn_rois->lod().back(); auto rpn_rois_lod = rpn_rois->lod().back();
auto gt_classes_lod = gt_classes->lod().back(); auto gt_classes_lod = gt_classes->lod().back();
auto is_crowd_lod = is_crowd->lod().back();
auto gt_boxes_lod = gt_boxes->lod().back(); auto gt_boxes_lod = gt_boxes->lod().back();
for (int i = 0; i < n; ++i) { for (int i = 0; i < n; ++i) {
Tensor rpn_rois_slice = Tensor rpn_rois_slice =
rpn_rois->Slice(rpn_rois_lod[i], rpn_rois_lod[i + 1]); rpn_rois->Slice(rpn_rois_lod[i], rpn_rois_lod[i + 1]);
Tensor gt_classes_slice = Tensor gt_classes_slice =
gt_classes->Slice(gt_classes_lod[i], gt_classes_lod[i + 1]); gt_classes->Slice(gt_classes_lod[i], gt_classes_lod[i + 1]);
Tensor is_crowd_slice =
is_crowd->Slice(is_crowd_lod[i], is_crowd_lod[i + 1]);
Tensor gt_boxes_slice = Tensor gt_boxes_slice =
gt_boxes->Slice(gt_boxes_lod[i], gt_boxes_lod[i + 1]); gt_boxes->Slice(gt_boxes_lod[i], gt_boxes_lod[i + 1]);
Tensor im_scales_slice = im_scales->Slice(i, i + 1); Tensor im_info_slice = im_info->Slice(i, i + 1);
std::vector<Tensor> tensor_output = SampleRoisForOneImage<T>( std::vector<Tensor> tensor_output = SampleRoisForOneImage<T>(
dev_ctx, &rpn_rois_slice, &gt_classes_slice, &gt_boxes_slice, dev_ctx, &rpn_rois_slice, &gt_classes_slice, &is_crowd_slice,
&im_scales_slice, batch_size_per_im, fg_fraction, fg_thresh, &gt_boxes_slice, &im_info_slice, batch_size_per_im, fg_fraction,
bg_thresh_hi, bg_thresh_lo, bbox_reg_weights, class_nums, engine); fg_thresh, bg_thresh_hi, bg_thresh_lo, bbox_reg_weights, class_nums,
engine, use_random);
Tensor sampled_rois = tensor_output[0]; Tensor sampled_rois = tensor_output[0];
Tensor sampled_labels_int32 = tensor_output[1]; Tensor sampled_labels_int32 = tensor_output[1];
Tensor sampled_bbox_targets = tensor_output[2]; Tensor sampled_bbox_targets = tensor_output[2];
...@@ -442,7 +429,7 @@ class GenerateProposalLabelsKernel : public framework::OpKernel<T> { ...@@ -442,7 +429,7 @@ class GenerateProposalLabelsKernel : public framework::OpKernel<T> {
bbox_inside_weights->set_lod(lod); bbox_inside_weights->set_lod(lod);
bbox_outside_weights->set_lod(lod); bbox_outside_weights->set_lod(lod);
rois->Resize({num_rois, kBoxDim}); rois->Resize({num_rois, kBoxDim});
labels_int32->Resize({num_rois}); labels_int32->Resize({num_rois, 1});
bbox_targets->Resize({num_rois, kBoxDim * class_nums}); bbox_targets->Resize({num_rois, kBoxDim * class_nums});
bbox_inside_weights->Resize({num_rois, kBoxDim * class_nums}); bbox_inside_weights->Resize({num_rois, kBoxDim * class_nums});
bbox_outside_weights->Resize({num_rois, kBoxDim * class_nums}); bbox_outside_weights->Resize({num_rois, kBoxDim * class_nums});
...@@ -455,8 +442,9 @@ class GenerateProposalLabelsOpMaker : public framework::OpProtoAndCheckerMaker { ...@@ -455,8 +442,9 @@ class GenerateProposalLabelsOpMaker : public framework::OpProtoAndCheckerMaker {
// TODO(buxingyuan): Add Document // TODO(buxingyuan): Add Document
AddInput("RpnRois", "RpnRois."); AddInput("RpnRois", "RpnRois.");
AddInput("GtClasses", "GtClasses."); AddInput("GtClasses", "GtClasses.");
AddInput("IsCrowd", "IsCrowd.");
AddInput("GtBoxes", "GtBoxes."); AddInput("GtBoxes", "GtBoxes.");
AddInput("ImScales", "ImScales."); AddInput("ImInfo", "ImInfo.");
AddOutput("Rois", "Rois."); AddOutput("Rois", "Rois.");
AddOutput("LabelsInt32", "LabelsInt32."); AddOutput("LabelsInt32", "LabelsInt32.");
...@@ -471,8 +459,7 @@ class GenerateProposalLabelsOpMaker : public framework::OpProtoAndCheckerMaker { ...@@ -471,8 +459,7 @@ class GenerateProposalLabelsOpMaker : public framework::OpProtoAndCheckerMaker {
AddAttr<float>("bg_thresh_lo", "bg_thresh_lo"); AddAttr<float>("bg_thresh_lo", "bg_thresh_lo");
AddAttr<std::vector<float>>("bbox_reg_weights", "bbox_reg_weights"); AddAttr<std::vector<float>>("bbox_reg_weights", "bbox_reg_weights");
AddAttr<int>("class_nums", "class_nums"); AddAttr<int>("class_nums", "class_nums");
AddAttr<bool>("fix_seed", "fix_seed").SetDefault(false); AddAttr<bool>("use_random", "use_random").SetDefault(true);
AddAttr<int>("seed", "seed").SetDefault(0);
AddComment(R"DOC( AddComment(R"DOC(
Generate Proposals Labels Operator. Generate Proposals Labels Operator.
......
...@@ -89,12 +89,11 @@ void BoxCoder(const platform::DeviceContext &ctx, Tensor *all_anchors, ...@@ -89,12 +89,11 @@ void BoxCoder(const platform::DeviceContext &ctx, Tensor *all_anchors,
} }
for (int64_t i = 0; i < row; ++i) { for (int64_t i = 0; i < row; ++i) {
T anchor_width = anchor_data[i * len + 2] - anchor_data[i * len]; T anchor_width = anchor_data[i * len + 2] - anchor_data[i * len] + 1.0;
T anchor_height = anchor_data[i * len + 3] - anchor_data[i * len + 1]; T anchor_height = anchor_data[i * len + 3] - anchor_data[i * len + 1] + 1.0;
T anchor_center_x = (anchor_data[i * len + 2] + anchor_data[i * len]) / 2; T anchor_center_x = anchor_data[i * len] + 0.5 * anchor_width;
T anchor_center_y = T anchor_center_y = anchor_data[i * len + 1] + 0.5 * anchor_height;
(anchor_data[i * len + 3] + anchor_data[i * len + 1]) / 2;
T bbox_center_x = 0, bbox_center_y = 0; T bbox_center_x = 0, bbox_center_y = 0;
T bbox_width = 0, bbox_height = 0; T bbox_width = 0, bbox_height = 0;
...@@ -106,25 +105,31 @@ void BoxCoder(const platform::DeviceContext &ctx, Tensor *all_anchors, ...@@ -106,25 +105,31 @@ void BoxCoder(const platform::DeviceContext &ctx, Tensor *all_anchors,
bbox_center_y = variances_data[i * len + 1] * bbox_center_y = variances_data[i * len + 1] *
bbox_deltas_data[i * len + 1] * anchor_height + bbox_deltas_data[i * len + 1] * anchor_height +
anchor_center_y; anchor_center_y;
bbox_width = std::exp(variances_data[i * len + 2] * bbox_width = std::exp(std::min<T>(variances_data[i * len + 2] *
bbox_deltas_data[i * len + 2]) * bbox_deltas_data[i * len + 2],
std::log(1000.0 / 16.0))) *
anchor_width; anchor_width;
bbox_height = std::exp(variances_data[i * len + 3] * bbox_height = std::exp(std::min<T>(variances_data[i * len + 3] *
bbox_deltas_data[i * len + 3]) * bbox_deltas_data[i * len + 3],
std::log(1000.0 / 16.0))) *
anchor_height; anchor_height;
} else { } else {
bbox_center_x = bbox_center_x =
bbox_deltas_data[i * len] * anchor_width + anchor_center_x; bbox_deltas_data[i * len] * anchor_width + anchor_center_x;
bbox_center_y = bbox_center_y =
bbox_deltas_data[i * len + 1] * anchor_height + anchor_center_y; bbox_deltas_data[i * len + 1] * anchor_height + anchor_center_y;
bbox_width = std::exp(bbox_deltas_data[i * len + 2]) * anchor_width; bbox_width = std::exp(std::min<T>(bbox_deltas_data[i * len + 2],
bbox_height = std::exp(bbox_deltas_data[i * len + 3]) * anchor_height; std::log(1000.0 / 16.0))) *
anchor_width;
bbox_height = std::exp(std::min<T>(bbox_deltas_data[i * len + 3],
std::log(1000.0 / 16.0))) *
anchor_height;
} }
proposals_data[i * len] = bbox_center_x - bbox_width / 2; proposals_data[i * len] = bbox_center_x - bbox_width / 2;
proposals_data[i * len + 1] = bbox_center_y - bbox_height / 2; proposals_data[i * len + 1] = bbox_center_y - bbox_height / 2;
proposals_data[i * len + 2] = bbox_center_x + bbox_width / 2; proposals_data[i * len + 2] = bbox_center_x + bbox_width / 2 - 1;
proposals_data[i * len + 3] = bbox_center_y + bbox_height / 2; proposals_data[i * len + 3] = bbox_center_y + bbox_height / 2 - 1;
} }
// return proposals; // return proposals;
} }
...@@ -156,18 +161,23 @@ void FilterBoxes(const platform::DeviceContext &ctx, Tensor *boxes, ...@@ -156,18 +161,23 @@ void FilterBoxes(const platform::DeviceContext &ctx, Tensor *boxes,
float min_size, const Tensor &im_info, Tensor *keep) { float min_size, const Tensor &im_info, Tensor *keep) {
const T *im_info_data = im_info.data<T>(); const T *im_info_data = im_info.data<T>();
T *boxes_data = boxes->mutable_data<T>(ctx.GetPlace()); T *boxes_data = boxes->mutable_data<T>(ctx.GetPlace());
min_size *= im_info_data[2]; T im_scale = im_info_data[2];
keep->Resize({boxes->dims()[0], 1}); keep->Resize({boxes->dims()[0], 1});
min_size = std::max(min_size, 1.0f);
int *keep_data = keep->mutable_data<int>(ctx.GetPlace()); int *keep_data = keep->mutable_data<int>(ctx.GetPlace());
int keep_len = 0; int keep_len = 0;
for (int i = 0; i < boxes->dims()[0]; ++i) { for (int i = 0; i < boxes->dims()[0]; ++i) {
T ws = boxes_data[4 * i + 2] - boxes_data[4 * i] + 1; T ws = boxes_data[4 * i + 2] - boxes_data[4 * i] + 1;
T hs = boxes_data[4 * i + 3] - boxes_data[4 * i + 1] + 1; T hs = boxes_data[4 * i + 3] - boxes_data[4 * i + 1] + 1;
T ws_origin_scale =
(boxes_data[4 * i + 2] - boxes_data[4 * i]) / im_scale + 1;
T hs_origin_scale =
(boxes_data[4 * i + 3] - boxes_data[4 * i + 1]) / im_scale + 1;
T x_ctr = boxes_data[4 * i] + ws / 2; T x_ctr = boxes_data[4 * i] + ws / 2;
T y_ctr = boxes_data[4 * i + 1] + hs / 2; T y_ctr = boxes_data[4 * i + 1] + hs / 2;
if (ws >= min_size && hs >= min_size && x_ctr <= im_info_data[1] && if (ws_origin_scale >= min_size && hs_origin_scale >= min_size &&
y_ctr <= im_info_data[0]) { x_ctr <= im_info_data[1] && y_ctr <= im_info_data[0]) {
keep_data[keep_len++] = i; keep_data[keep_len++] = i;
} }
} }
...@@ -218,8 +228,8 @@ T JaccardOverlap(const T *box1, const T *box2, const bool normalized) { ...@@ -218,8 +228,8 @@ T JaccardOverlap(const T *box1, const T *box2, const bool normalized) {
const T inter_ymin = std::max(box1[1], box2[1]); const T inter_ymin = std::max(box1[1], box2[1]);
const T inter_xmax = std::min(box1[2], box2[2]); const T inter_xmax = std::min(box1[2], box2[2]);
const T inter_ymax = std::min(box1[3], box2[3]); const T inter_ymax = std::min(box1[3], box2[3]);
const T inter_w = inter_xmax - inter_xmin; const T inter_w = std::max(0.0f, inter_xmax - inter_xmin + 1);
const T inter_h = inter_ymax - inter_ymin; const T inter_h = std::max(0.0f, inter_ymax - inter_ymin + 1);
const T inter_area = inter_w * inter_h; const T inter_area = inter_w * inter_h;
const T bbox1_area = BBoxArea<T>(box1, normalized); const T bbox1_area = BBoxArea<T>(box1, normalized);
const T bbox2_area = BBoxArea<T>(box2, normalized); const T bbox2_area = BBoxArea<T>(box2, normalized);
......
...@@ -31,8 +31,14 @@ class RpnTargetAssignOp : public framework::OperatorWithKernel { ...@@ -31,8 +31,14 @@ class RpnTargetAssignOp : public framework::OperatorWithKernel {
using framework::OperatorWithKernel::OperatorWithKernel; using framework::OperatorWithKernel::OperatorWithKernel;
void InferShape(framework::InferShapeContext* ctx) const override { void InferShape(framework::InferShapeContext* ctx) const override {
PADDLE_ENFORCE(ctx->HasInput("DistMat"), PADDLE_ENFORCE(ctx->HasInput("Anchor"),
"Input(DistMat) of RpnTargetAssignOp should not be null"); "Input(Anchor) of RpnTargetAssignOp should not be null");
PADDLE_ENFORCE(ctx->HasInput("GtBoxes"),
"Input(GtBoxes) of RpnTargetAssignOp should not be null");
PADDLE_ENFORCE(ctx->HasInput("IsCrowd"),
"Input(Anchor) of RpnTargetAssignOp should not be null");
PADDLE_ENFORCE(ctx->HasInput("ImInfo"),
"Input(ImInfo) of RpnTargetAssignOp should not be null");
PADDLE_ENFORCE( PADDLE_ENFORCE(
ctx->HasOutput("LocationIndex"), ctx->HasOutput("LocationIndex"),
...@@ -43,10 +49,20 @@ class RpnTargetAssignOp : public framework::OperatorWithKernel { ...@@ -43,10 +49,20 @@ class RpnTargetAssignOp : public framework::OperatorWithKernel {
PADDLE_ENFORCE( PADDLE_ENFORCE(
ctx->HasOutput("TargetLabel"), ctx->HasOutput("TargetLabel"),
"Output(TargetLabel) of RpnTargetAssignOp should not be null"); "Output(TargetLabel) of RpnTargetAssignOp should not be null");
PADDLE_ENFORCE(
auto in_dims = ctx->GetInputDim("DistMat"); ctx->HasOutput("TargetBBox"),
PADDLE_ENFORCE_EQ(in_dims.size(), 2, "Output(TargetBBox) of RpnTargetAssignOp should not be null");
"The rank of Input(DistMat) must be 2.");
auto anchor_dims = ctx->GetInputDim("Anchor");
auto gt_boxes_dims = ctx->GetInputDim("GtBoxes");
auto is_crowd_dims = ctx->GetInputDim("IsCrowd");
auto im_info_dims = ctx->GetInputDim("ImInfo");
PADDLE_ENFORCE_EQ(anchor_dims.size(), 2,
"The rank of Input(Anchor) must be 2.");
PADDLE_ENFORCE_EQ(gt_boxes_dims.size(), 2,
"The rank of Input(GtBoxes) must be 2.");
PADDLE_ENFORCE_EQ(im_info_dims.size(), 2,
"The rank of Input(ImInfo) must be 2.");
ctx->SetOutputDim("LocationIndex", {-1}); ctx->SetOutputDim("LocationIndex", {-1});
ctx->SetOutputDim("ScoreIndex", {-1}); ctx->SetOutputDim("ScoreIndex", {-1});
...@@ -59,198 +75,383 @@ class RpnTargetAssignOp : public framework::OperatorWithKernel { ...@@ -59,198 +75,383 @@ class RpnTargetAssignOp : public framework::OperatorWithKernel {
const framework::ExecutionContext& ctx) const override { const framework::ExecutionContext& ctx) const override {
return framework::OpKernelType( return framework::OpKernelType(
framework::ToDataType( framework::ToDataType(
ctx.Input<framework::LoDTensor>("DistMat")->type()), ctx.Input<framework::LoDTensor>("Anchor")->type()),
platform::CPUPlace()); platform::CPUPlace());
} }
}; };
template <typename T> template <typename T>
class RpnTargetAssignKernel : public framework::OpKernel<T> { void AppendRpns(LoDTensor* out, int64_t offset, Tensor* to_add) {
public: auto* out_data = out->data<T>();
void Compute(const framework::ExecutionContext& context) const override { auto* to_add_data = to_add->data<T>();
auto* anchor_t = context.Input<Tensor>("Anchor"); // (H*W*A) * 4 memcpy(out_data + offset, to_add_data, to_add->numel() * sizeof(T));
auto* gt_bbox_t = context.Input<Tensor>("GtBox"); }
auto* dist_t = context.Input<LoDTensor>("DistMat");
// Selects the anchors that lie within the image, allowing a margin of
// `rpn_straddle_thresh` on every side.
//
// An anchor (x0, y0, x1, y1) is kept when
//   x0 >= -thresh, y0 >= -thresh, x1 < im_width + thresh,
//   y1 < im_height + thresh.
// When `rpn_straddle_thresh` is not >= 0 the check is skipped and every
// anchor is kept.
//
// Returns two tensors:
//   [0] int tensor of shape {inside_num} holding the kept anchor indices,
//   [1] T tensor of shape {inside_num, 4} filled by Gather from `anchor`
//       (presumably the kept anchor rows -- Gather is defined elsewhere).
template <typename T>
std::vector<Tensor> FilterStraddleAnchor(
    const platform::CPUDeviceContext& context, const Tensor* anchor,
    const float rpn_straddle_thresh, T im_height, T im_width) {
  const int total_anchors = anchor->dims()[0];
  const T* all_boxes = anchor->data<T>();
  const bool check_bounds = rpn_straddle_thresh >= 0;

  std::vector<int> kept;
  for (int i = 0; i < total_anchors; ++i) {
    const T* box = all_boxes + i * 4;
    const bool is_inside =
        !check_bounds ||
        ((box[0] >= -rpn_straddle_thresh) &&
         (box[1] >= -rpn_straddle_thresh) &&
         (box[2] < im_width + rpn_straddle_thresh) &&
         (box[3] < im_height + rpn_straddle_thresh));
    if (is_inside) {
      kept.emplace_back(i);
    }
  }

  int inside_num = kept.size();

  // Copy the surviving indices into a tensor.
  Tensor index_tensor;
  int* index_data =
      index_tensor.mutable_data<int>({inside_num}, context.GetPlace());
  std::copy(kept.begin(), kept.end(), index_data);

  // Gather the anchor rows (stride 4) addressed by those indices.
  Tensor box_tensor;
  T* box_data =
      box_tensor.mutable_data<T>({inside_num, 4}, context.GetPlace());
  Gather<T>(anchor->data<T>(), 4, index_data, inside_num, box_data);

  std::vector<Tensor> result;
  result.emplace_back(index_tensor);
  result.emplace_back(box_tensor);
  return result;
}
// Builds a new {ncrowd_num, 4} tensor from the ground-truth boxes whose
// `is_crowd` flag equals 0.
//
// gt_boxes: box tensor; its dims()[0] is the number of ground-truth boxes.
// is_crowd: int flags, read at indices [0, gt_num).
// The selected rows are collected with Gather (defined elsewhere), using a
// stride of 4 values per box.
template <typename T>
Tensor FilterCrowdGt(const platform::CPUDeviceContext& context,
                     Tensor* gt_boxes, Tensor* is_crowd) {
  const int total_gt = gt_boxes->dims()[0];
  const int* crowd_flags = is_crowd->data<int>();

  std::vector<int> keep_inds;
  for (int idx = 0; idx < total_gt; ++idx) {
    if (crowd_flags[idx] != 0) {
      continue;  // crowd annotation: drop it
    }
    keep_inds.emplace_back(idx);
  }

  int ncrowd_num = keep_inds.size();
  Tensor filtered_boxes;
  T* filtered_data =
      filtered_boxes.mutable_data<T>({ncrowd_num, 4}, context.GetPlace());
  Gather<T>(gt_boxes->data<T>(), 4, keep_inds.data(), ncrowd_num,
            filtered_data);
  return filtered_boxes;
}
// Shrinks `*inds` in place to at most `num` entries.
//
// If `inds` already holds `num` entries or fewer it is left untouched. A
// negative `num` converts to a very large size_t in the comparison below, so
// it also leaves the list untouched.
//
// When `use_random` is true the survivors are chosen by reservoir sampling
// (Algorithm R): the first `num` entries form the reservoir and every later
// entry i replaces a random reservoir slot with probability num / (i + 1).
// When `use_random` is false the list is simply truncated to its first `num`
// entries, which gives deterministic output.
//
// `engine` is taken by value, so the caller's generator state is not
// advanced by this call.
void ReservoirSampling(const int num, std::vector<int>* inds,
                       std::minstd_rand engine, bool use_random) {
  std::uniform_real_distribution<float> uniform(0, 1);
  size_t len = inds->size();
  if (len > static_cast<size_t>(num)) {
    if (use_random) {
      for (size_t i = num; i < len; ++i) {
        // The slot must be drawn from [0, i] inclusive. Drawing from [0, i)
        // would make the first overflow entry (i == num) always replace a
        // reservoir slot and bias the sample.
        int rng_ind = std::floor(uniform(engine) * (i + 1));
        if (rng_ind < num)
          std::iter_swap(inds->begin() + rng_ind, inds->begin() + i);
      }
    }
    inds->resize(num);
  }
}
template <typename T>
void ScoreAssign(const T* anchor_by_gt_overlap_data,
const Tensor& anchor_to_gt_max, const Tensor& gt_to_anchor_max,
const int rpn_batch_size_per_im, const float rpn_fg_fraction,
const float rpn_positive_overlap,
const float rpn_negative_overlap, std::vector<int>* fg_inds,
std::vector<int>* bg_inds, std::vector<int>* tgt_lbl,
std::minstd_rand engine, bool use_random) {
float epsilon = 0.00001;
int anchor_num = anchor_to_gt_max.dims()[0];
int gt_num = gt_to_anchor_max.dims()[0];
std::vector<int> target_label(anchor_num, -1);
std::vector<int> fg_inds_fake;
std::vector<int> bg_inds_fake;
const T* anchor_to_gt_max_data = anchor_to_gt_max.data<T>();
const T* gt_to_anchor_max_data = gt_to_anchor_max.data<T>();
// TODO(buxingyuan): Match with Detectron now
// but it seems here is a bug in two directions assignment
// in which the later one may overwrites the former one.
for (int64_t i = 0; i < anchor_num; ++i) {
bool is_anchors_with_max_overlap = false;
for (int64_t j = 0; j < gt_num; ++j) {
T value = anchor_by_gt_overlap_data[i * gt_num + j];
T diff = std::abs(value - gt_to_anchor_max_data[j]);
if (diff < epsilon) {
is_anchors_with_max_overlap = true;
break;
}
}
bool is_anchor_great_than_thresh =
(anchor_to_gt_max_data[i] >= rpn_positive_overlap);
if (is_anchors_with_max_overlap || is_anchor_great_than_thresh) {
fg_inds_fake.push_back(i);
}
}
auto* loc_index_t = context.Output<Tensor>("LocationIndex"); // Reservoir Sampling
auto* score_index_t = context.Output<Tensor>("ScoreIndex"); int fg_num = static_cast<int>(rpn_fg_fraction * rpn_batch_size_per_im);
auto* tgt_bbox_t = context.Output<Tensor>("TargetBBox"); ReservoirSampling(fg_num, &fg_inds_fake, engine, use_random);
auto* tgt_lbl_t = context.Output<Tensor>("TargetLabel"); fg_num = static_cast<int>(fg_inds_fake.size());
for (int64_t i = 0; i < fg_num; ++i) {
target_label[fg_inds_fake[i]] = 1;
}
auto lod = dist_t->lod().back(); int bg_num = rpn_batch_size_per_im - fg_num;
int64_t batch_num = static_cast<int64_t>(lod.size() - 1); for (int64_t i = 0; i < anchor_num; ++i) {
int64_t anchor_num = dist_t->dims()[1]; if (anchor_to_gt_max_data[i] < rpn_negative_overlap) {
PADDLE_ENFORCE_EQ(anchor_num, anchor_t->dims()[0]); bg_inds_fake.push_back(i);
}
}
ReservoirSampling(bg_num, &bg_inds_fake, engine, use_random);
bg_num = static_cast<int>(bg_inds_fake.size());
for (int64_t i = 0; i < bg_num; ++i) {
target_label[bg_inds_fake[i]] = 0;
}
int rpn_batch_size = context.Attr<int>("rpn_batch_size_per_im"); for (int64_t i = 0; i < anchor_num; ++i) {
float pos_threshold = context.Attr<float>("rpn_positive_overlap"); if (target_label[i] == 1) fg_inds->emplace_back(i);
float neg_threshold = context.Attr<float>("rpn_negative_overlap"); if (target_label[i] == 0) bg_inds->emplace_back(i);
float fg_fraction = context.Attr<float>("fg_fraction"); }
fg_num = fg_inds->size();
bg_num = bg_inds->size();
tgt_lbl->resize(fg_num + bg_num, 0);
std::vector<int> fg_lbl(fg_num, 1);
std::vector<int> bg_lbl(bg_num, 0);
std::copy(fg_lbl.begin(), fg_lbl.end(), tgt_lbl->data());
std::copy(bg_lbl.begin(), bg_lbl.end(), tgt_lbl->data() + fg_num);
}
template <typename T>
std::vector<Tensor> SampleRpnFgBgGt(const platform::CPUDeviceContext& ctx,
const Tensor& anchor_by_gt_overlap,
const int rpn_batch_size_per_im,
const float rpn_positive_overlap,
const float rpn_negative_overlap,
const float rpn_fg_fraction,
std::minstd_rand engine, bool use_random) {
auto* anchor_by_gt_overlap_data = anchor_by_gt_overlap.data<T>();
int anchor_num = anchor_by_gt_overlap.dims()[0];
int gt_num = anchor_by_gt_overlap.dims()[1];
std::vector<int> fg_inds;
std::vector<int> bg_inds;
std::vector<int> gt_inds;
std::vector<int> tgt_lbl;
// Calculate the max IoU between anchors and gt boxes
// Map from anchor to gt box that has highest overlap
auto place = ctx.GetPlace();
Tensor anchor_to_gt_max, anchor_to_gt_argmax, gt_to_anchor_max;
anchor_to_gt_max.mutable_data<T>({anchor_num}, place);
int* argmax = anchor_to_gt_argmax.mutable_data<int>({anchor_num}, place);
gt_to_anchor_max.mutable_data<T>({gt_num}, place);
auto anchor_by_gt_overlap_et =
framework::EigenMatrix<T>::From(anchor_by_gt_overlap);
auto anchor_to_gt_max_et =
framework::EigenVector<T>::Flatten(anchor_to_gt_max);
auto gt_to_anchor_max_et =
framework::EigenVector<T>::Flatten(gt_to_anchor_max);
auto anchor_to_gt_argmax_et =
framework::EigenVector<int>::Flatten(anchor_to_gt_argmax);
anchor_to_gt_max_et =
anchor_by_gt_overlap_et.maximum(Eigen::DSizes<int, 1>(1));
anchor_to_gt_argmax_et =
anchor_by_gt_overlap_et.argmax(1).template cast<int>();
gt_to_anchor_max_et =
anchor_by_gt_overlap_et.maximum(Eigen::DSizes<int, 1>(0));
// Follow the Faster RCNN's implementation
ScoreAssign(anchor_by_gt_overlap_data, anchor_to_gt_max, gt_to_anchor_max,
rpn_batch_size_per_im, rpn_fg_fraction, rpn_positive_overlap,
rpn_negative_overlap, &fg_inds, &bg_inds, &tgt_lbl, engine,
use_random);
int fg_num = fg_inds.size();
int bg_num = bg_inds.size();
gt_inds.reserve(fg_num);
for (int i = 0; i < fg_num; ++i) {
gt_inds.emplace_back(argmax[fg_inds[i]]);
}
int fg_num_per_batch = static_cast<int>(rpn_batch_size * fg_fraction); Tensor loc_index_t, score_index_t, tgt_lbl_t, gt_inds_t;
int* loc_index_data = loc_index_t.mutable_data<int>({fg_num}, place);
int* score_index_data =
score_index_t.mutable_data<int>({fg_num + bg_num}, place);
int* tgt_lbl_data = tgt_lbl_t.mutable_data<int>({fg_num + bg_num}, place);
int* gt_inds_data = gt_inds_t.mutable_data<int>({fg_num}, place);
std::copy(fg_inds.begin(), fg_inds.end(), loc_index_data);
std::copy(fg_inds.begin(), fg_inds.end(), score_index_data);
std::copy(bg_inds.begin(), bg_inds.end(), score_index_data + fg_num);
std::copy(tgt_lbl.begin(), tgt_lbl.end(), tgt_lbl_data);
std::copy(gt_inds.begin(), gt_inds.end(), gt_inds_data);
std::vector<Tensor> loc_score_tgtlbl_gt;
loc_score_tgtlbl_gt.emplace_back(loc_index_t);
loc_score_tgtlbl_gt.emplace_back(score_index_t);
loc_score_tgtlbl_gt.emplace_back(tgt_lbl_t);
loc_score_tgtlbl_gt.emplace_back(gt_inds_t);
return loc_score_tgtlbl_gt;
}
int64_t max_num = batch_num * anchor_num; template <typename T>
class RpnTargetAssignKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& context) const override {
auto* anchor = context.Input<Tensor>("Anchor"); // (H*W*A) * 4
auto* gt_boxes = context.Input<LoDTensor>("GtBoxes");
auto* is_crowd = context.Input<LoDTensor>("IsCrowd");
auto* im_info = context.Input<LoDTensor>("ImInfo");
auto* loc_index = context.Output<LoDTensor>("LocationIndex");
auto* score_index = context.Output<LoDTensor>("ScoreIndex");
auto* tgt_bbox = context.Output<LoDTensor>("TargetBBox");
auto* tgt_lbl = context.Output<LoDTensor>("TargetLabel");
PADDLE_ENFORCE_EQ(gt_boxes->lod().size(), 1UL,
"RpnTargetAssignOp gt_boxes needs 1 level of LoD");
PADDLE_ENFORCE_EQ(is_crowd->lod().size(), 1UL,
"RpnTargetAssignOp is_crowd needs 1 level of LoD");
int64_t anchor_num = static_cast<int64_t>(anchor->dims()[0]);
int64_t batch_num = static_cast<int64_t>(gt_boxes->lod().back().size() - 1);
int rpn_batch_size_per_im = context.Attr<int>("rpn_batch_size_per_im");
float rpn_straddle_thresh = context.Attr<float>("rpn_straddle_thresh");
float rpn_positive_overlap = context.Attr<float>("rpn_positive_overlap");
float rpn_negative_overlap = context.Attr<float>("rpn_negative_overlap");
float rpn_fg_fraction = context.Attr<float>("rpn_fg_fraction");
bool use_random = context.Attr<bool>("use_random");
int64_t max_num = batch_num * rpn_batch_size_per_im;
auto place = context.GetPlace(); auto place = context.GetPlace();
tgt_bbox_t->mutable_data<T>({max_num, 4}, place); loc_index->mutable_data<int>({max_num}, place);
auto* loc_index = loc_index_t->mutable_data<int>({max_num}, place); score_index->mutable_data<int>({max_num}, place);
auto* score_index = score_index_t->mutable_data<int>({max_num}, place); tgt_bbox->mutable_data<T>({max_num, 4}, place);
tgt_lbl->mutable_data<int>({max_num, 1}, place);
Tensor tmp_tgt_lbl;
auto* tmp_lbl_data = tmp_tgt_lbl.mutable_data<int64_t>({max_num}, place);
auto& dev_ctx = context.device_context<platform::CPUDeviceContext>(); auto& dev_ctx = context.device_context<platform::CPUDeviceContext>();
math::SetConstant<platform::CPUDeviceContext, int64_t> iset;
iset(dev_ctx, &tmp_tgt_lbl, static_cast<int64_t>(-1));
std::random_device rnd; std::random_device rnd;
std::minstd_rand engine; std::minstd_rand engine;
int seed = int seed = rnd();
context.Attr<bool>("fix_seed") ? context.Attr<int>("seed") : rnd();
engine.seed(seed); engine.seed(seed);
int fg_num = 0; framework::LoD lod_loc, loc_score;
int bg_num = 0; std::vector<size_t> lod0_loc(1, 0);
std::vector<size_t> lod0_score(1, 0);
int total_loc_num = 0;
int total_score_num = 0;
auto gt_boxes_lod = gt_boxes->lod().back();
auto is_crowd_lod = is_crowd->lod().back();
for (int i = 0; i < batch_num; ++i) { for (int i = 0; i < batch_num; ++i) {
Tensor dist = dist_t->Slice(lod[i], lod[i + 1]); Tensor gt_boxes_slice =
Tensor gt_bbox = gt_bbox_t->Slice(lod[i], lod[i + 1]); gt_boxes->Slice(gt_boxes_lod[i], gt_boxes_lod[i + 1]);
auto fg_bg_gt = SampleFgBgGt(dev_ctx, dist, pos_threshold, neg_threshold, Tensor is_crowd_slice =
rpn_batch_size, fg_num_per_batch, engine, is_crowd->Slice(is_crowd_lod[i], is_crowd_lod[i + 1]);
tmp_lbl_data + i * anchor_num); Tensor im_info_slice = im_info->Slice(i, i + 1);
auto* im_info_data = im_info_slice.data<T>();
int cur_fg_num = fg_bg_gt[0].size(); auto im_height = im_info_data[0];
int cur_bg_num = fg_bg_gt[1].size(); auto im_width = im_info_data[1];
std::transform(fg_bg_gt[0].begin(), fg_bg_gt[0].end(), loc_index, auto im_scale = im_info_data[2];
[i, anchor_num](int d) { return d + i * anchor_num; });
memcpy(score_index, loc_index, cur_fg_num * sizeof(int)); // Filter straddle anchor
std::transform(fg_bg_gt[1].begin(), fg_bg_gt[1].end(), std::vector<Tensor> filter_output = FilterStraddleAnchor<T>(
score_index + cur_fg_num, dev_ctx, anchor, rpn_straddle_thresh, im_height, im_width);
[i, anchor_num](int d) { return d + i * anchor_num; }); Tensor inds_inside = filter_output[0];
Tensor inside_anchor = filter_output[1];
// Filter crowd gt
Tensor ncrowd_gt_boxes =
FilterCrowdGt<T>(dev_ctx, &gt_boxes_slice, &is_crowd_slice);
auto ncrowd_gt_boxes_et =
framework::EigenTensor<T, 2>::From(ncrowd_gt_boxes);
ncrowd_gt_boxes_et = ncrowd_gt_boxes_et * im_scale;
Tensor anchor_by_gt_overlap;
anchor_by_gt_overlap.mutable_data<T>(
{inside_anchor.dims()[0], ncrowd_gt_boxes.dims()[0]}, place);
BboxOverlaps<T>(inside_anchor, ncrowd_gt_boxes, &anchor_by_gt_overlap);
auto loc_score_tgtlbl_gt = SampleRpnFgBgGt<T>(
dev_ctx, anchor_by_gt_overlap, rpn_batch_size_per_im,
rpn_positive_overlap, rpn_negative_overlap, rpn_fg_fraction, engine,
use_random);
Tensor sampled_loc_index = loc_score_tgtlbl_gt[0];
Tensor sampled_score_index = loc_score_tgtlbl_gt[1];
Tensor sampled_tgtlbl = loc_score_tgtlbl_gt[2];
Tensor sampled_gt_index = loc_score_tgtlbl_gt[3];
int loc_num = sampled_loc_index.dims()[0];
int score_num = sampled_score_index.dims()[0];
// unmap to all anchor
Tensor sampled_loc_index_unmap, sampled_score_index_unmap;
sampled_loc_index_unmap.mutable_data<int>({loc_num}, place);
sampled_score_index_unmap.mutable_data<int>({score_num}, place);
Gather<int>(inds_inside.data<int>(), 1, sampled_loc_index.data<int>(),
loc_num, sampled_loc_index_unmap.data<int>());
Gather<int>(inds_inside.data<int>(), 1, sampled_score_index.data<int>(),
score_num, sampled_score_index_unmap.data<int>());
// get target bbox deltas // get target bbox deltas
if (cur_fg_num) { Tensor sampled_anchor, sampled_gt, sampled_tgt_bbox;
Tensor fg_gt; auto* sampled_anchor_data =
T* gt_data = fg_gt.mutable_data<T>({cur_fg_num, 4}, place); sampled_anchor.mutable_data<T>({loc_num, 4}, place);
Tensor tgt_bbox = tgt_bbox_t->Slice(fg_num, fg_num + cur_fg_num); auto* sampled_gt_data = sampled_gt.mutable_data<T>({loc_num, 4}, place);
T* tgt_data = tgt_bbox.data<T>(); Gather<T>(anchor->data<T>(), 4, sampled_loc_index_unmap.data<int>(),
Gather<T>(anchor_t->data<T>(), 4, loc_num, sampled_anchor_data);
reinterpret_cast<int*>(&fg_bg_gt[0][0]), cur_fg_num, Gather<T>(ncrowd_gt_boxes.data<T>(), 4, sampled_gt_index.data<int>(),
tgt_data); loc_num, sampled_gt_data);
Gather<T>(gt_bbox.data<T>(), 4, reinterpret_cast<int*>(&fg_bg_gt[2][0]), sampled_tgt_bbox.mutable_data<T>({loc_num, 4}, place);
cur_fg_num, gt_data); BoxToDelta<T>(loc_num, sampled_anchor, sampled_gt, nullptr, false,
BoxToDelta<T>(cur_fg_num, tgt_bbox, fg_gt, nullptr, false, &tgt_bbox); &sampled_tgt_bbox);
}
// Add anchor offset
loc_index += cur_fg_num; int anchor_offset = i * anchor_num;
score_index += cur_fg_num + cur_bg_num; auto sampled_loc_index_unmap_et =
fg_num += cur_fg_num; framework::EigenTensor<int, 1>::From(sampled_loc_index_unmap);
bg_num += cur_bg_num; sampled_loc_index_unmap_et = sampled_loc_index_unmap_et + anchor_offset;
} auto sampled_score_index_unmap_et =
framework::EigenTensor<int, 1>::From(sampled_score_index_unmap);
int lbl_num = fg_num + bg_num; sampled_score_index_unmap_et =
PADDLE_ENFORCE_LE(fg_num, max_num); sampled_score_index_unmap_et + anchor_offset;
PADDLE_ENFORCE_LE(lbl_num, max_num); AppendRpns<int>(loc_index, total_loc_num, &sampled_loc_index_unmap);
AppendRpns<int>(score_index, total_score_num, &sampled_score_index_unmap);
tgt_bbox_t->Resize({fg_num, 4}); AppendRpns<T>(tgt_bbox, total_loc_num * 4, &sampled_tgt_bbox);
loc_index_t->Resize({fg_num}); AppendRpns<int>(tgt_lbl, total_score_num, &sampled_tgtlbl);
score_index_t->Resize({lbl_num}); total_loc_num += loc_num;
auto* lbl_data = tgt_lbl_t->mutable_data<int64_t>({lbl_num, 1}, place);
Gather<int64_t>(tmp_lbl_data, 1, score_index_t->data<int>(), lbl_num, total_score_num += score_num;
lbl_data); lod0_loc.emplace_back(total_loc_num);
} lod0_score.emplace_back(total_score_num);
private:
void ScoreAssign(const T* dist_data, const Tensor& anchor_to_gt_max,
const int row, const int col, const float pos_threshold,
const float neg_threshold, int64_t* target_label,
std::vector<int>* fg_inds, std::vector<int>* bg_inds) const {
float epsilon = 0.0001;
for (int64_t i = 0; i < row; ++i) {
const T* v = dist_data + i * col;
T max = *std::max_element(v, v + col);
for (int64_t j = 0; j < col; ++j) {
if (std::abs(max - v[j]) < epsilon) {
target_label[j] = 1;
}
}
}
// Pick the fg/bg
const T* anchor_to_gt_max_data = anchor_to_gt_max.data<T>();
for (int64_t j = 0; j < col; ++j) {
if (anchor_to_gt_max_data[j] >= pos_threshold) {
target_label[j] = 1;
} else if (anchor_to_gt_max_data[j] < neg_threshold) {
target_label[j] = 0;
}
if (target_label[j] == 1) {
fg_inds->push_back(j);
} else if (target_label[j] == 0) {
bg_inds->push_back(j);
}
} }
}
void ReservoirSampling(const int num, std::minstd_rand engine,
std::vector<int>* inds) const {
std::uniform_real_distribution<float> uniform(0, 1);
size_t len = inds->size();
if (len > static_cast<size_t>(num)) {
for (size_t i = num; i < len; ++i) {
int rng_ind = std::floor(uniform(engine) * i);
if (rng_ind < num)
std::iter_swap(inds->begin() + rng_ind, inds->begin() + i);
}
inds->resize(num);
}
}
// std::vector<std::vector<int>> RpnTargetAssign( PADDLE_ENFORCE_LE(total_loc_num, max_num);
std::vector<std::vector<int>> SampleFgBgGt( PADDLE_ENFORCE_LE(total_score_num, max_num);
const platform::CPUDeviceContext& ctx, const Tensor& dist,
const float pos_threshold, const float neg_threshold, lod_loc.emplace_back(lod0_loc);
const int rpn_batch_size, const int fg_num, std::minstd_rand engine, loc_score.emplace_back(lod0_score);
int64_t* target_label) const { loc_index->set_lod(lod_loc);
auto* dist_data = dist.data<T>(); score_index->set_lod(loc_score);
int row = dist.dims()[0]; tgt_bbox->set_lod(lod_loc);
int col = dist.dims()[1]; tgt_lbl->set_lod(loc_score);
loc_index->Resize({total_loc_num});
std::vector<int> fg_inds; score_index->Resize({total_score_num});
std::vector<int> bg_inds; tgt_bbox->Resize({total_loc_num, 4});
std::vector<int> gt_inds; tgt_lbl->Resize({total_score_num, 1});
// Calculate the max IoU between anchors and gt boxes
// Map from anchor to gt box that has highest overlap
auto place = ctx.GetPlace();
Tensor anchor_to_gt_max, anchor_to_gt_argmax;
anchor_to_gt_max.mutable_data<T>({col}, place);
int* argmax = anchor_to_gt_argmax.mutable_data<int>({col}, place);
auto x = framework::EigenMatrix<T>::From(dist);
auto x_col_max = framework::EigenVector<T>::Flatten(anchor_to_gt_max);
auto x_col_argmax =
framework::EigenVector<int>::Flatten(anchor_to_gt_argmax);
x_col_max = x.maximum(Eigen::DSizes<int, 1>(0));
x_col_argmax = x.argmax(0).template cast<int>();
// Follow the Faster RCNN's implementation
ScoreAssign(dist_data, anchor_to_gt_max, row, col, pos_threshold,
neg_threshold, target_label, &fg_inds, &bg_inds);
// Reservoir Sampling
ReservoirSampling(fg_num, engine, &fg_inds);
int fg_num2 = static_cast<int>(fg_inds.size());
int bg_num = rpn_batch_size - fg_num2;
ReservoirSampling(bg_num, engine, &bg_inds);
gt_inds.reserve(fg_num2);
for (int i = 0; i < fg_num2; ++i) {
gt_inds.emplace_back(argmax[fg_inds[i]]);
}
std::vector<std::vector<int>> fg_bg_gt;
fg_bg_gt.emplace_back(fg_inds);
fg_bg_gt.emplace_back(bg_inds);
fg_bg_gt.emplace_back(gt_inds);
return fg_bg_gt;
} }
}; };
...@@ -259,18 +460,22 @@ class RpnTargetAssignOpMaker : public framework::OpProtoAndCheckerMaker { ...@@ -259,18 +460,22 @@ class RpnTargetAssignOpMaker : public framework::OpProtoAndCheckerMaker {
void Make() override { void Make() override {
AddInput("Anchor", AddInput("Anchor",
"(Tensor) input anchor is a 2-D Tensor with shape [H*W*A, 4]."); "(Tensor) input anchor is a 2-D Tensor with shape [H*W*A, 4].");
AddInput("GtBox", "(LoDTensor) input groud-truth bbox with shape [K, 4]."); AddInput("GtBoxes",
AddInput( "(LoDTensor) input groud-truth bbox with shape [K, 4].");
"DistMat", AddInput("IsCrowd",
"(LoDTensor or Tensor) this input is a 2-D LoDTensor with shape " "(LoDTensor) input which indicates groud-truth is crowd.");
"[K, M]. It is pair-wise distance matrix between the entities " AddInput("ImInfo",
"represented by each row and each column. For example, assumed one " "(LoDTensor) input image information with shape [N, 3]. "
"entity is A with shape [K], another entity is B with shape [M]. The " "N is the batch size, each image information includes height, "
"DistMat[i][j] is the distance between A[i] and B[j]. The bigger " "width and scale.");
"the distance is, the better macthing the pairs are. Please note, " AddAttr<int>("rpn_batch_size_per_im",
"This tensor can contain LoD information to represent a batch of " "Total number of RPN examples per image.")
"inputs. One instance of this batch can contain different numbers of " .SetDefault(256);
"entities."); AddAttr<float>(
"rpn_straddle_thresh",
"Remove RPN anchors that go outside the image by straddle_thresh "
"pixels, "
"Set to -1 or a large value, e.g. 100000, to disable pruning anchors.");
AddAttr<float>( AddAttr<float>(
"rpn_positive_overlap", "rpn_positive_overlap",
"Minimum overlap required between an anchor and ground-truth " "Minimum overlap required between an anchor and ground-truth "
...@@ -282,20 +487,15 @@ class RpnTargetAssignOpMaker : public framework::OpProtoAndCheckerMaker { ...@@ -282,20 +487,15 @@ class RpnTargetAssignOpMaker : public framework::OpProtoAndCheckerMaker {
"box for the (anchor, gt box) pair to be a negative examples.") "box for the (anchor, gt box) pair to be a negative examples.")
.SetDefault(0.3); .SetDefault(0.3);
AddAttr<float>( AddAttr<float>(
"fg_fraction", "rpn_fg_fraction",
"Target fraction of RoI minibatch that " "Target fraction of RoI minibatch that "
"is labeled foreground (i.e. class > 0), 0-th class is background.") "is labeled foreground (i.e. class > 0), 0-th class is background.")
.SetDefault(0.25); .SetDefault(0.25);
AddAttr<int>("rpn_batch_size_per_im", AddAttr<bool>("use_random",
"Total number of RPN examples per image.") "A flag indicating whether to use a ReservoirSampling. "
.SetDefault(256); "NOTE: DO NOT set this flag to false in training. "
AddAttr<bool>("fix_seed", "Setting this flag to false is only useful in unittest.")
"A flag indicating whether to use a fixed seed to generate " .SetDefault(true);
"random mask. NOTE: DO NOT set this flag to true in "
"training. Setting this flag to true is only useful in "
"unittest.")
.SetDefault(false);
AddAttr<int>("seed", "RpnTargetAssign random seed.").SetDefault(0);
AddOutput( AddOutput(
"LocationIndex", "LocationIndex",
"(Tensor), The indexes of foreground anchors in all RPN anchors, the " "(Tensor), The indexes of foreground anchors in all RPN anchors, the "
...@@ -308,16 +508,16 @@ class RpnTargetAssignOpMaker : public framework::OpProtoAndCheckerMaker { ...@@ -308,16 +508,16 @@ class RpnTargetAssignOpMaker : public framework::OpProtoAndCheckerMaker {
"ScoreIndex is [F + B], F and B are sampled foreground and backgroud " "ScoreIndex is [F + B], F and B are sampled foreground and backgroud "
" number."); " number.");
AddOutput("TargetBBox", AddOutput("TargetBBox",
"(Tensor<int64_t>), The target bbox deltas with shape " "(Tensor), The target bbox deltas with shape "
"[F, 4], F is the sampled foreground number."); "[F, 4], F is the sampled foreground number.");
AddOutput( AddOutput(
"TargetLabel", "TargetLabel",
"(Tensor<int64_t>), The target labels of each anchor with shape " "(Tensor<int>), The target labels of each anchor with shape "
"[F + B, 1], F and B are sampled foreground and backgroud number."); "[F + B, 1], F and B are sampled foreground and backgroud number.");
AddComment(R"DOC( AddComment(R"DOC(
This operator can be, for given the IoU between the ground truth bboxes and the This operator can be, for a given set of ground truth bboxes and the
anchors, to assign classification and regression targets to each prediction. anchors, to assign classification and regression targets to each prediction.
The Score index and LocationIndex will be generated according to the DistMat. The ScoreIndex and LocationIndex will be generated according to the anchor-groundtruth IOU.
The rest anchors would not contibute to the RPN training loss The rest anchors would not contibute to the RPN training loss
ScoreIndex is composed of foreground anchor indexes(positive labels) and ScoreIndex is composed of foreground anchor indexes(positive labels) and
......
...@@ -20,6 +20,7 @@ if(WITH_GRPC) ...@@ -20,6 +20,7 @@ if(WITH_GRPC)
DEPS grpc++_unsecure grpc_unsecure gpr cares zlib protobuf sendrecvop_grpc scope profiler math_function SERIAL) DEPS grpc++_unsecure grpc_unsecure gpr cares zlib protobuf sendrecvop_grpc scope profiler math_function SERIAL)
cc_test(rpc_server_test SRCS rpc_server_test.cc cc_test(rpc_server_test SRCS rpc_server_test.cc
DEPS sendrecvop_grpc grpc++_unsecure grpc_unsecure gpr cares zlib protobuf executor proto_desc lookup_sparse_table_op SERIAL) DEPS sendrecvop_grpc grpc++_unsecure grpc_unsecure gpr cares zlib protobuf executor proto_desc lookup_sparse_table_op SERIAL)
cc_test(varhandle_test SRCS varhandle_test.cc)
return() return()
endif() endif()
......
...@@ -59,40 +59,32 @@ GRPCClient::~GRPCClient() { ...@@ -59,40 +59,32 @@ GRPCClient::~GRPCClient() {
} }
channels_.clear(); channels_.clear();
} }
client_thread_->join(); client_thread_->join();
} }
bool GRPCClient::AsyncSendVar(const std::string& ep, VarHandlePtr GRPCClient::AsyncSendVar(const std::string& ep,
const platform::DeviceContext& ctx, const platform::DeviceContext& ctx,
const framework::Scope& scope, const framework::Scope& scope,
const std::string& var_name, int64_t time_out) { const std::string& var_name,
int64_t time_out) {
const platform::DeviceContext* p_ctx = &ctx; const platform::DeviceContext* p_ctx = &ctx;
const std::string ep_val = ep; const std::string ep_val = ep;
const std::string var_name_val = var_name; const std::string var_name_val = var_name;
const framework::Scope* p_scope = &scope; const framework::Scope* p_scope = &scope;
const auto ch = GetChannel(ep_val); const auto ch = GetChannel(ep_val);
SendProcessor* s = new SendProcessor(ch);
VarHandlePtr h(new VarHandle(ep, "Send", var_name_val, p_ctx, p_scope));
s->Prepare(h, time_out);
framework::AsyncIO([var_name_val, p_ctx, ep_val, p_scope, time_out, ch, framework::AsyncIO([var_name_val, p_scope, p_ctx, s, this] {
this] {
auto* var = p_scope->FindVar(var_name_val); auto* var = p_scope->FindVar(var_name_val);
::grpc::ByteBuffer req; ::grpc::ByteBuffer req;
SerializeToByteBuffer(var_name_val, var, *p_ctx, &req); SerializeToByteBuffer(var_name_val, var, *p_ctx, &req);
// varhandle VLOG(3) << s->GetVarHandlePtr()->String() << " begin";
VarHandle var_h;
var_h.ep = ep_val;
var_h.scope = p_scope;
var_h.name = var_name_val;
var_h.ctx = p_ctx;
var_h.method = "Send";
VLOG(3) << var_h.String() << " begin";
// stub context // stub context
SendProcessor* s = new SendProcessor(ch);
s->Prepare(var_h, time_out);
s->response_call_back_ = nullptr; s->response_call_back_ = nullptr;
auto call = s->stub_g_.PrepareUnaryCall( auto call = s->stub_g_.PrepareUnaryCall(
...@@ -102,13 +94,13 @@ bool GRPCClient::AsyncSendVar(const std::string& ep, ...@@ -102,13 +94,13 @@ bool GRPCClient::AsyncSendVar(const std::string& ep,
}); });
req_count_++; req_count_++;
return true; return h;
} }
void ProcGetResponse(const VarHandle& var_h, void ProcGetResponse(const VarHandle& var_h,
const ::grpc::ByteBuffer& ret_msg) { const ::grpc::ByteBuffer& ret_msg) {
framework::Variable* outvar = nullptr; framework::Variable* outvar = nullptr;
DeserializeFromByteBuffer(ret_msg, *var_h.ctx, var_h.scope, &outvar); DeserializeFromByteBuffer(ret_msg, *var_h.ctx(), var_h.scope(), &outvar);
} }
template <typename T> template <typename T>
...@@ -119,37 +111,30 @@ void RequestToByteBuffer(const T& proto, ::grpc::ByteBuffer* result) { ...@@ -119,37 +111,30 @@ void RequestToByteBuffer(const T& proto, ::grpc::ByteBuffer* result) {
result->Swap(&tmp); result->Swap(&tmp);
} }
bool GRPCClient::AsyncGetVar(const std::string& ep, VarHandlePtr GRPCClient::AsyncGetVar(const std::string& ep,
const platform::DeviceContext& ctx, const platform::DeviceContext& ctx,
const framework::Scope& scope, const framework::Scope& scope,
const std::string& var_name, int64_t time_out) { const std::string& var_name,
int64_t time_out) {
const platform::DeviceContext* p_ctx = &ctx; const platform::DeviceContext* p_ctx = &ctx;
const std::string ep_val = ep; const std::string ep_val = ep;
const std::string var_name_val = var_name; const std::string var_name_val = var_name;
const framework::Scope* p_scope = &scope; const framework::Scope* p_scope = &scope;
const auto ch = GetChannel(ep_val); const auto ch = GetChannel(ep_val);
GetProcessor* s = new GetProcessor(ch);
VarHandlePtr h(new VarHandle(ep, "Get", var_name_val, p_ctx, p_scope));
s->Prepare(h, time_out);
framework::AsyncIO([var_name_val, ep_val, p_scope, p_ctx, time_out, ch, framework::AsyncIO([var_name_val, p_scope, p_ctx, s, this] {
this] {
// prepare input // prepare input
sendrecv::VariableMessage req; sendrecv::VariableMessage req;
req.set_varname(var_name_val); req.set_varname(var_name_val);
::grpc::ByteBuffer buf; ::grpc::ByteBuffer buf;
RequestToByteBuffer<sendrecv::VariableMessage>(req, &buf); RequestToByteBuffer<sendrecv::VariableMessage>(req, &buf);
// var handle VLOG(3) << s->GetVarHandlePtr()->String() << " begin";
VarHandle var_h;
var_h.ep = ep_val;
var_h.scope = p_scope;
var_h.name = var_name_val;
var_h.ctx = p_ctx;
var_h.method = "Get";
VLOG(3) << var_h.String() << " begin";
// stub context // stub context
GetProcessor* s = new GetProcessor(ch);
s->Prepare(var_h, time_out);
s->response_call_back_ = ProcGetResponse; s->response_call_back_ = ProcGetResponse;
auto call = s->stub_g_.PrepareUnaryCall( auto call = s->stub_g_.PrepareUnaryCall(
...@@ -160,42 +145,36 @@ bool GRPCClient::AsyncGetVar(const std::string& ep, ...@@ -160,42 +145,36 @@ bool GRPCClient::AsyncGetVar(const std::string& ep,
req_count_++; req_count_++;
return true; return h;
} }
bool GRPCClient::AsyncPrefetchVar(const std::string& ep, VarHandlePtr GRPCClient::AsyncPrefetchVar(const std::string& ep,
const platform::DeviceContext& ctx, const platform::DeviceContext& ctx,
const framework::Scope& scope, const framework::Scope& scope,
const std::string& in_var_name, const std::string& in_var_name,
const std::string& out_var_name, const std::string& out_var_name,
int64_t time_out) { int64_t time_out) {
const platform::DeviceContext* p_ctx = &ctx; const platform::DeviceContext* p_ctx = &ctx;
const std::string ep_val = ep; const std::string ep_val = ep;
const std::string in_var_name_val = in_var_name; const std::string in_var_name_val = in_var_name;
const std::string out_var_name_val = out_var_name; const std::string out_var_name_val = out_var_name;
const framework::Scope* p_scope = &scope; const framework::Scope* p_scope = &scope;
const auto ch = GetChannel(ep_val); const auto ch = GetChannel(ep_val);
GetProcessor* s = new GetProcessor(ch);
VarHandlePtr h(
new VarHandle(ep, "Prefetch", out_var_name_val, p_ctx, p_scope));
s->Prepare(h, time_out);
framework::AsyncIO([in_var_name_val, out_var_name_val, ep_val, p_scope, p_ctx, framework::AsyncIO([in_var_name_val, out_var_name_val, ep_val, p_scope, p_ctx,
time_out, ch, this] { time_out, s, this] {
auto* var = p_scope->FindVar(in_var_name_val); auto* var = p_scope->FindVar(in_var_name_val);
::grpc::ByteBuffer req; ::grpc::ByteBuffer req;
SerializeToByteBuffer(in_var_name_val, var, *p_ctx, &req, out_var_name_val); SerializeToByteBuffer(in_var_name_val, var, *p_ctx, &req, out_var_name_val);
// var handle VLOG(3) << s->GetVarHandlePtr()->String() << " begin";
VarHandle var_h;
var_h.ep = ep_val;
var_h.scope = p_scope;
var_h.name = out_var_name_val;
var_h.ctx = p_ctx;
var_h.method = "Prefetch";
VLOG(3) << var_h.String() << " begin";
// stub context // stub context
GetProcessor* s = new GetProcessor(ch);
s->Prepare(var_h, time_out);
s->response_call_back_ = ProcGetResponse; s->response_call_back_ = ProcGetResponse;
auto call = s->stub_g_.PrepareUnaryCall( auto call = s->stub_g_.PrepareUnaryCall(
...@@ -206,56 +185,68 @@ bool GRPCClient::AsyncPrefetchVar(const std::string& ep, ...@@ -206,56 +185,68 @@ bool GRPCClient::AsyncPrefetchVar(const std::string& ep,
}); });
req_count_++; req_count_++;
return true; return h;
} }
void GRPCClient::AsyncSendBatchBarrier(const std::string& ep, VarHandlePtr GRPCClient::AsyncSendBatchBarrier(const std::string& ep,
int64_t time_out) { int64_t time_out) {
const auto ch = GetChannel(ep); const auto ch = GetChannel(ep);
BatchBarrierProcessor* s = new BatchBarrierProcessor(ch); BatchBarrierProcessor* s = new BatchBarrierProcessor(ch);
s->Prepare(time_out); VarHandlePtr h(new VarHandle(ep, "BatchBarrier", BATCH_BARRIER_MESSAGE,
nullptr, nullptr));
s->Prepare(h, time_out);
sendrecv::VariableMessage req; sendrecv::VariableMessage req;
req.set_varname(BATCH_BARRIER_MESSAGE); req.set_varname(BATCH_BARRIER_MESSAGE);
auto rpc = s->stub_->AsyncSendVariable(s->context_.get(), req, &cq_); auto rpc = s->stub_->AsyncSendVariable(s->context_.get(), req, &cq_);
rpc->Finish(&s->reply_, &s->status_, reinterpret_cast<void*>(s)); rpc->Finish(&s->reply_, &s->status_, reinterpret_cast<void*>(s));
req_count_++; req_count_++;
return h;
} }
void GRPCClient::AsyncSendFetchBarrier(const std::string& ep, VarHandlePtr GRPCClient::AsyncSendFetchBarrier(const std::string& ep,
int64_t time_out) { int64_t time_out) {
const auto ch = GetChannel(ep); const auto ch = GetChannel(ep);
FetchBarrierProcessor* s = new FetchBarrierProcessor(ch); FetchBarrierProcessor* s = new FetchBarrierProcessor(ch);
s->Prepare(time_out); VarHandlePtr h(new VarHandle(ep, "FetchBarrier", FETCH_BARRIER_MESSAGE,
nullptr, nullptr));
s->Prepare(h, time_out);
sendrecv::VariableMessage req; sendrecv::VariableMessage req;
req.set_varname(FETCH_BARRIER_MESSAGE); req.set_varname(FETCH_BARRIER_MESSAGE);
auto rpc = s->stub_->AsyncGetVariable(s->context_.get(), req, &cq_); auto rpc = s->stub_->AsyncGetVariable(s->context_.get(), req, &cq_);
rpc->Finish(&s->reply_, &s->status_, reinterpret_cast<void*>(s)); rpc->Finish(&s->reply_, &s->status_, reinterpret_cast<void*>(s));
req_count_++; req_count_++;
return h;
} }
void GRPCClient::AsyncSendComplete(const std::string& ep, int64_t time_out) { VarHandlePtr GRPCClient::AsyncSendComplete(const std::string& ep,
int64_t time_out) {
const auto ch = GetChannel(ep); const auto ch = GetChannel(ep);
BatchBarrierProcessor* s = new BatchBarrierProcessor(ch); BatchBarrierProcessor* s = new BatchBarrierProcessor(ch);
s->Prepare(time_out); VarHandlePtr h(
new VarHandle(ep, "SendComplete", COMPLETE_MESSAGE, nullptr, nullptr));
s->Prepare(h, time_out);
sendrecv::VariableMessage req; sendrecv::VariableMessage req;
req.set_varname(COMPLETE_MESSAGE); req.set_varname(COMPLETE_MESSAGE);
auto rpc = s->stub_->AsyncSendVariable(s->context_.get(), req, &cq_); auto rpc = s->stub_->AsyncSendVariable(s->context_.get(), req, &cq_);
rpc->Finish(&s->reply_, &s->status_, reinterpret_cast<void*>(s)); rpc->Finish(&s->reply_, &s->status_, reinterpret_cast<void*>(s));
req_count_++; req_count_++;
return h;
} }
void GRPCClient::AsyncCheckpointNotify(const std::string& ep, VarHandlePtr GRPCClient::AsyncCheckpointNotify(const std::string& ep,
const std::string& dir, const std::string& dir,
int64_t time_out) { int64_t time_out) {
const auto ch = GetChannel(ep); const auto ch = GetChannel(ep);
CheckpointNotifyProcessor* s = new CheckpointNotifyProcessor(ch); CheckpointNotifyProcessor* s = new CheckpointNotifyProcessor(ch);
s->Prepare(time_out); VarHandlePtr h(new VarHandle(ep, "CheckPointNotify", CHECKPOINT_SAVE_MESSAGE,
nullptr, nullptr));
s->Prepare(h, time_out);
sendrecv::VariableMessage req; sendrecv::VariableMessage req;
req.set_varname(CHECKPOINT_SAVE_MESSAGE); req.set_varname(CHECKPOINT_SAVE_MESSAGE);
...@@ -264,6 +255,7 @@ void GRPCClient::AsyncCheckpointNotify(const std::string& ep, ...@@ -264,6 +255,7 @@ void GRPCClient::AsyncCheckpointNotify(const std::string& ep,
auto rpc = s->stub_->AsyncCheckpointNotify(s->context_.get(), req, &cq_); auto rpc = s->stub_->AsyncCheckpointNotify(s->context_.get(), req, &cq_);
rpc->Finish(&s->reply_, &s->status_, reinterpret_cast<void*>(s)); rpc->Finish(&s->reply_, &s->status_, reinterpret_cast<void*>(s));
req_count_++; req_count_++;
return h;
} }
bool GRPCClient::Wait() { bool GRPCClient::Wait() {
...@@ -276,32 +268,42 @@ void GRPCClient::Proceed() { ...@@ -276,32 +268,42 @@ void GRPCClient::Proceed() {
void* tag = nullptr; void* tag = nullptr;
bool ok = false; bool ok = false;
VLOG(3) << "GRPCClient Proceed begin";
while (!stopped_ && cq_.Next(&tag, &ok)) { while (!stopped_ && cq_.Next(&tag, &ok)) {
BaseProcessor* c = static_cast<BaseProcessor*>(tag); BaseProcessor* c = static_cast<BaseProcessor*>(tag);
GPR_ASSERT(ok); GPR_ASSERT(ok);
PADDLE_ENFORCE(c); PADDLE_ENFORCE(c);
if (c->status_.ok()) { if (c->status_.ok()) {
VLOG(3) << c->var_h_.String() << " process"; VLOG(3) << c->GetVarHandlePtr()->String() << " process";
c->Process(); c->Process();
} else if (c->status_.error_code() == grpc::StatusCode::DEADLINE_EXCEEDED) { } else if (c->status_.error_code() == grpc::StatusCode::DEADLINE_EXCEEDED) {
LOG(ERROR) << c->var_h_.String() LOG(ERROR) << c->GetVarHandlePtr()->String()
<< " meets grpc error:" << c->status_.error_message(); << " meets grpc error:" << c->status_.error_message();
{ {
std::lock_guard<std::mutex> lk(sync_mutex_); std::lock_guard<std::mutex> lk(sync_mutex_);
ok_ = false; ok_ = false;
} }
sync_cond_.notify_all(); c->Finish(false);
} else { } else {
LOG(FATAL) << c->var_h_.String() LOG(FATAL) << c->GetVarHandlePtr()->String()
<< " meets grpc error:" << c->status_.error_message(); << " meets grpc error:" << c->status_.error_message();
c->Finish(false);
} }
delete c;
bool notify = false;
{ {
std::lock_guard<std::mutex> lk(sync_mutex_); std::lock_guard<std::mutex> lk(sync_mutex_);
req_count_--; req_count_--;
notify = (req_count_ <= 0 || !c->status_.ok());
}
delete c;
if (notify) {
sync_cond_.notify_all();
} }
sync_cond_.notify_all();
} }
VLOG(3) << "GRPCClient Proceed end";
} }
std::shared_ptr<grpc::Channel> GRPCClient::GetChannel(const std::string& ep) { std::shared_ptr<grpc::Channel> GRPCClient::GetChannel(const std::string& ep) {
......
...@@ -53,15 +53,14 @@ void ProcGetResponse(const VarHandle& var_h, const grpc::ByteBuffer& msg); ...@@ -53,15 +53,14 @@ void ProcGetResponse(const VarHandle& var_h, const grpc::ByteBuffer& msg);
class BaseProcessor { class BaseProcessor {
public: public:
explicit BaseProcessor(std::shared_ptr<grpc::Channel> ch) { BaseProcessor() { context_ = nullptr; }
context_ = nullptr;
}
virtual ~BaseProcessor() {} virtual ~BaseProcessor() {}
virtual void Prepare(const VarHandle& var_info, int64_t time_out) { virtual void Prepare(VarHandlePtr h, int64_t time_out) {
var_h_ = h;
context_.reset(new grpc::ClientContext()); context_.reset(new grpc::ClientContext());
var_h_ = var_info;
context_->set_wait_for_ready(true); context_->set_wait_for_ready(true);
if (time_out) { if (time_out) {
std::chrono::system_clock::time_point deadline = std::chrono::system_clock::time_point deadline =
...@@ -71,21 +70,21 @@ class BaseProcessor { ...@@ -71,21 +70,21 @@ class BaseProcessor {
} }
} }
virtual void Prepare(int64_t time_out) { void Process() {
context_.reset(new grpc::ClientContext()); ProcessImpl();
context_->set_wait_for_ready(true); var_h_->Finish(true);
std::chrono::system_clock::time_point deadline =
std::chrono::system_clock::now() + std::chrono::milliseconds(time_out);
context_->set_deadline(deadline);
} }
virtual void Process() = 0; VarHandlePtr GetVarHandlePtr() { return var_h_; }
bool Wait() { return var_h_->Wait(); }
void Finish(bool ok) { return var_h_->Finish(ok); }
virtual void ProcessImpl() = 0;
std::unique_ptr<grpc::ClientContext> context_; std::unique_ptr<grpc::ClientContext> context_;
grpc::Status status_; grpc::Status status_;
VarHandle var_h_;
protected:
VarHandlePtr var_h_;
}; };
typedef std::function<void(const VarHandle&, const ::grpc::ByteBuffer&)> typedef std::function<void(const VarHandle&, const ::grpc::ByteBuffer&)>
...@@ -94,13 +93,13 @@ typedef std::function<void(const VarHandle&, const ::grpc::ByteBuffer&)> ...@@ -94,13 +93,13 @@ typedef std::function<void(const VarHandle&, const ::grpc::ByteBuffer&)>
class SendProcessor : public BaseProcessor { class SendProcessor : public BaseProcessor {
public: public:
explicit SendProcessor(std::shared_ptr<grpc::Channel> ch) explicit SendProcessor(std::shared_ptr<grpc::Channel> ch)
: BaseProcessor(ch), stub_g_(ch) {} : BaseProcessor(), stub_g_(ch) {}
virtual ~SendProcessor() {} virtual ~SendProcessor() {}
virtual void Process() { void ProcessImpl() override {
if (response_call_back_) { if (response_call_back_) {
response_call_back_(var_h_, reply_); response_call_back_(*var_h_.get(), reply_);
} }
} }
...@@ -115,13 +114,13 @@ typedef std::function<void(const VarHandle&, const ::grpc::ByteBuffer&)> ...@@ -115,13 +114,13 @@ typedef std::function<void(const VarHandle&, const ::grpc::ByteBuffer&)>
class GetProcessor : public BaseProcessor { class GetProcessor : public BaseProcessor {
public: public:
explicit GetProcessor(std::shared_ptr<grpc::Channel> ch) explicit GetProcessor(std::shared_ptr<grpc::Channel> ch)
: BaseProcessor(ch), stub_g_(ch) {} : BaseProcessor(), stub_g_(ch) {}
virtual ~GetProcessor() {} virtual ~GetProcessor() {}
virtual void Process() { void ProcessImpl() override {
if (response_call_back_) { if (response_call_back_) {
response_call_back_(var_h_, reply_); response_call_back_(*var_h_.get(), reply_);
} }
} }
...@@ -133,13 +132,13 @@ class GetProcessor : public BaseProcessor { ...@@ -133,13 +132,13 @@ class GetProcessor : public BaseProcessor {
class BatchBarrierProcessor : public BaseProcessor { class BatchBarrierProcessor : public BaseProcessor {
public: public:
explicit BatchBarrierProcessor(std::shared_ptr<grpc::Channel> ch) explicit BatchBarrierProcessor(std::shared_ptr<grpc::Channel> ch)
: BaseProcessor(ch) { : BaseProcessor() {
stub_ = sendrecv::SendRecvService::NewStub(ch); stub_ = sendrecv::SendRecvService::NewStub(ch);
} }
virtual ~BatchBarrierProcessor() {} virtual ~BatchBarrierProcessor() {}
virtual void Process() {} void ProcessImpl() override {}
sendrecv::VoidMessage reply_; sendrecv::VoidMessage reply_;
std::unique_ptr<sendrecv::SendRecvService::Stub> stub_; std::unique_ptr<sendrecv::SendRecvService::Stub> stub_;
}; };
...@@ -147,13 +146,13 @@ class BatchBarrierProcessor : public BaseProcessor { ...@@ -147,13 +146,13 @@ class BatchBarrierProcessor : public BaseProcessor {
class FetchBarrierProcessor : public BaseProcessor { class FetchBarrierProcessor : public BaseProcessor {
public: public:
explicit FetchBarrierProcessor(std::shared_ptr<grpc::Channel> ch) explicit FetchBarrierProcessor(std::shared_ptr<grpc::Channel> ch)
: BaseProcessor(ch) { : BaseProcessor() {
stub_ = sendrecv::SendRecvService::NewStub(ch); stub_ = sendrecv::SendRecvService::NewStub(ch);
} }
virtual ~FetchBarrierProcessor() {} virtual ~FetchBarrierProcessor() {}
virtual void Process() {} void ProcessImpl() override {}
sendrecv::VariableMessage reply_; sendrecv::VariableMessage reply_;
std::unique_ptr<sendrecv::SendRecvService::Stub> stub_; std::unique_ptr<sendrecv::SendRecvService::Stub> stub_;
}; };
...@@ -161,13 +160,13 @@ class FetchBarrierProcessor : public BaseProcessor { ...@@ -161,13 +160,13 @@ class FetchBarrierProcessor : public BaseProcessor {
class CheckpointNotifyProcessor : public BaseProcessor { class CheckpointNotifyProcessor : public BaseProcessor {
public: public:
explicit CheckpointNotifyProcessor(std::shared_ptr<grpc::Channel> ch) explicit CheckpointNotifyProcessor(std::shared_ptr<grpc::Channel> ch)
: BaseProcessor(ch) { : BaseProcessor() {
stub_ = sendrecv::SendRecvService::NewStub(ch); stub_ = sendrecv::SendRecvService::NewStub(ch);
} }
virtual ~CheckpointNotifyProcessor() {} virtual ~CheckpointNotifyProcessor() {}
virtual void Process() {} void ProcessImpl() override {}
sendrecv::VoidMessage reply_; sendrecv::VoidMessage reply_;
std::unique_ptr<sendrecv::SendRecvService::Stub> stub_; std::unique_ptr<sendrecv::SendRecvService::Stub> stub_;
}; };
...@@ -177,32 +176,37 @@ class GRPCClient : public RPCClient { ...@@ -177,32 +176,37 @@ class GRPCClient : public RPCClient {
GRPCClient() : ok_(true), completed_(false), stopped_(false) {} GRPCClient() : ok_(true), completed_(false), stopped_(false) {}
virtual ~GRPCClient(); virtual ~GRPCClient();
bool AsyncSendVar(const std::string& ep, const platform::DeviceContext& ctx, VarHandlePtr AsyncSendVar(const std::string& ep,
const framework::Scope& scope, const std::string& var_name, const platform::DeviceContext& ctx,
int64_t time_out = FLAGS_rpc_deadline) override; const framework::Scope& scope,
const std::string& var_name,
bool AsyncGetVar(const std::string& ep, const platform::DeviceContext& ctx, int64_t time_out = FLAGS_rpc_deadline) override;
const framework::Scope& scope, const std::string& var_name,
int64_t time_out = FLAGS_rpc_deadline) override; VarHandlePtr AsyncGetVar(const std::string& ep,
const platform::DeviceContext& ctx,
bool AsyncPrefetchVar(const std::string& ep, const framework::Scope& scope,
const platform::DeviceContext& ctx, const std::string& var_name,
const framework::Scope& scope, int64_t time_out = FLAGS_rpc_deadline) override;
const std::string& in_var_name,
const std::string& out_var_name, VarHandlePtr AsyncPrefetchVar(const std::string& ep,
int64_t time_out = FLAGS_rpc_deadline) override; const platform::DeviceContext& ctx,
const framework::Scope& scope,
void AsyncSendBatchBarrier(const std::string& ep, const std::string& in_var_name,
int64_t time_out = FLAGS_rpc_deadline) override; const std::string& out_var_name,
int64_t time_out = FLAGS_rpc_deadline) override;
void AsyncSendFetchBarrier(const std::string& ep,
int64_t time_out = FLAGS_rpc_deadline) override; VarHandlePtr AsyncSendBatchBarrier(
const std::string& ep, int64_t time_out = FLAGS_rpc_deadline) override;
void AsyncCheckpointNotify(const std::string& ep, const std::string& dir,
int64_t time_out = FLAGS_rpc_deadline) override; VarHandlePtr AsyncSendFetchBarrier(
const std::string& ep, int64_t time_out = FLAGS_rpc_deadline) override;
void AsyncSendComplete(const std::string& ep,
int64_t time_out = FLAGS_rpc_deadline) override; VarHandlePtr AsyncCheckpointNotify(
const std::string& ep, const std::string& dir,
int64_t time_out = FLAGS_rpc_deadline) override;
VarHandlePtr AsyncSendComplete(
const std::string& ep, int64_t time_out = FLAGS_rpc_deadline) override;
bool Wait() override; bool Wait() override;
......
...@@ -82,8 +82,10 @@ class ProtoEncodeHelper { ...@@ -82,8 +82,10 @@ class ProtoEncodeHelper {
: base_(buf), p_(buf), limit_(base_ + max_size) {} : base_(buf), p_(buf), limit_(base_ + max_size) {}
~ProtoEncodeHelper() { ~ProtoEncodeHelper() {
#define REPLACE_ENFORCE_GLOG 1
// Make sure callers didn't do operations that went over max_size promised // Make sure callers didn't do operations that went over max_size promised
PADDLE_ENFORCE_LE(p_, limit_); paddle::platform::throw_on_error(p_ <= limit_);
#undef REPLACE_ENFORCE_GLOG
} }
const char* data() const { return base_; } const char* data() const { return base_; }
......
...@@ -28,6 +28,7 @@ ...@@ -28,6 +28,7 @@
#include "paddle/fluid/framework/scope.h" #include "paddle/fluid/framework/scope.h"
#include "paddle/fluid/framework/selected_rows.h" #include "paddle/fluid/framework/selected_rows.h"
#include "paddle/fluid/framework/var_type.h" #include "paddle/fluid/framework/var_type.h"
#include "paddle/fluid/platform/macros.h"
namespace paddle { namespace paddle {
namespace operators { namespace operators {
...@@ -49,23 +50,77 @@ constexpr char kRequestPassBarrier[] = "RequestPassBarrier"; ...@@ -49,23 +50,77 @@ constexpr char kRequestPassBarrier[] = "RequestPassBarrier";
class RPCServer; class RPCServer;
struct VarHandle { class VarHandle {
// RPC endpoint. public:
std::string ep; VarHandle(const std::string ep, const std::string& method,
const platform::DeviceContext* ctx; const std::string& name,
const framework::Scope* scope; const platform::DeviceContext* p_ctx = nullptr,
// Variable name. const framework::Scope* p_scope = nullptr)
std::string name; : ok_(kVarHandleDefaultState) {
// RPC method name. ep_ = ep;
std::string method; ctx_ = p_ctx;
scope_ = p_scope;
name_ = name;
method_ = method;
}
virtual ~VarHandle() {}
public:
bool Wait() {
{
std::unique_lock<std::mutex> lk(sync_mutex_);
wait_cond_.wait(lk, [this] { return ok_ != kVarHandleDefaultState; });
}
VLOG(7) << "VarHandle wait:" << ok_;
return ok_ != 0;
}
void Finish(bool ok) {
{
std::unique_lock<std::mutex> lk(sync_mutex_);
ok_ = ok;
}
VLOG(7) << "VarHandle finish:" << ok;
wait_cond_.notify_all();
}
std::string String() const { std::string String() const {
std::ostringstream s; std::ostringstream s;
s << method << " name:[" << name << "], ep:[" << ep << "]"; s << method_ << " name:[" << name_ << "], ep:[" << ep_ << "], ok:[" << ok_
<< "]";
return s.str(); return s.str();
} }
std::string ep() const { return ep_; }
const platform::DeviceContext* ctx() const { return ctx_; }
const framework::Scope* scope() const { return scope_; }
std::string name() const { return name_; }
std::string method() const { return method_; }
protected:
// RPC endpoint.
std::string ep_;
const platform::DeviceContext* ctx_;
const framework::Scope* scope_;
// Variable name.
std::string name_;
// RPC method name.
std::string method_;
protected:
std::mutex sync_mutex_;
std::condition_variable wait_cond_;
int ok_;
static const int kVarHandleDefaultState = -1;
private:
DISABLE_COPY_AND_ASSIGN(VarHandle);
}; };
typedef std::shared_ptr<VarHandle> VarHandlePtr;
class RequestHandler { class RequestHandler {
public: public:
explicit RequestHandler(bool sync_mode) explicit RequestHandler(bool sync_mode)
......
...@@ -67,24 +67,11 @@ bool RequestSendHandler::Handle(const std::string& varname, ...@@ -67,24 +67,11 @@ bool RequestSendHandler::Handle(const std::string& varname,
LOG(FATAL) << "sync: Can not find server side var: " << varname; LOG(FATAL) << "sync: Can not find server side var: " << varname;
return false; return false;
} }
if (invar->IsType<framework::SelectedRows>()) {
std::unique_lock<std::mutex> lock(mutex_sparse_vars_);
sparse_vars_.push_back(invar);
}
} }
} }
return true; return true;
} }
void RequestSendHandler::ResetSparseVarRecorder() {
std::unique_lock<std::mutex> lock(mutex_sparse_vars_);
for (auto* var : sparse_vars_) {
var->GetMutable<framework::SelectedRows>()->mutable_rows()->clear();
}
sparse_vars_.clear();
}
bool RequestGetHandler::Handle(const std::string& varname, bool RequestGetHandler::Handle(const std::string& varname,
framework::Scope* scope, framework::Scope* scope,
framework::Variable* invar, framework::Variable* invar,
......
...@@ -41,11 +41,6 @@ class RequestSendHandler final : public RequestHandler { ...@@ -41,11 +41,6 @@ class RequestSendHandler final : public RequestHandler {
bool Handle(const std::string& varname, framework::Scope* scope, bool Handle(const std::string& varname, framework::Scope* scope,
framework::Variable* var, framework::Variable** outvar, framework::Variable* var, framework::Variable** outvar,
const std::string& out_var_name = "") override; const std::string& out_var_name = "") override;
void ResetSparseVarRecorder();
private:
std::mutex mutex_sparse_vars_;
std::vector<framework::Variable*> sparse_vars_;
}; };
class RequestGetHandler final : public RequestHandler { class RequestGetHandler final : public RequestHandler {
......
...@@ -14,12 +14,14 @@ ...@@ -14,12 +14,14 @@
#pragma once #pragma once
#include <condition_variable> // NOLINT
#include <string> #include <string>
#include "gflags/gflags.h" #include "gflags/gflags.h"
#include "paddle/fluid/framework/data_type.h" #include "paddle/fluid/framework/data_type.h"
#include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/lod_tensor.h"
#include "paddle/fluid/framework/scope.h" #include "paddle/fluid/framework/scope.h"
#include "paddle/fluid/operators/distributed/request_handler.h"
DECLARE_int32(rpc_deadline); DECLARE_int32(rpc_deadline);
...@@ -31,37 +33,36 @@ class RPCClient { ...@@ -31,37 +33,36 @@ class RPCClient {
public: public:
RPCClient() {} RPCClient() {}
virtual ~RPCClient() {} virtual ~RPCClient() {}
virtual bool AsyncSendVar(const std::string& ep, virtual VarHandlePtr AsyncSendVar(const std::string& ep,
const platform::DeviceContext& ctx, const platform::DeviceContext& ctx,
const framework::Scope& scope, const framework::Scope& scope,
const std::string& var_name, const std::string& var_name,
int64_t time_out = FLAGS_rpc_deadline) = 0; int64_t time_out = FLAGS_rpc_deadline) = 0;
virtual bool AsyncGetVar(const std::string& ep, virtual VarHandlePtr AsyncGetVar(const std::string& ep,
const platform::DeviceContext& ctx, const platform::DeviceContext& ctx,
const framework::Scope& scope, const framework::Scope& scope,
const std::string& var_name, const std::string& var_name,
int64_t time_out = FLAGS_rpc_deadline) = 0; int64_t time_out = FLAGS_rpc_deadline) = 0;
virtual bool AsyncPrefetchVar(const std::string& ep, virtual VarHandlePtr AsyncPrefetchVar(
const platform::DeviceContext& ctx, const std::string& ep, const platform::DeviceContext& ctx,
const framework::Scope& scope, const framework::Scope& scope, const std::string& in_var_name,
const std::string& in_var_name, const std::string& out_var_name,
const std::string& out_var_name, int64_t time_out = FLAGS_rpc_deadline) = 0;
int64_t time_out = FLAGS_rpc_deadline) = 0;
virtual VarHandlePtr AsyncSendBatchBarrier(
virtual void AsyncSendBatchBarrier(const std::string& ep, const std::string& ep, int64_t time_out = FLAGS_rpc_deadline) = 0;
int64_t time_out = FLAGS_rpc_deadline) = 0;
virtual VarHandlePtr AsyncSendFetchBarrier(
virtual void AsyncSendFetchBarrier(const std::string& ep, const std::string& ep, int64_t time_out = FLAGS_rpc_deadline) = 0;
int64_t time_out = FLAGS_rpc_deadline) = 0;
virtual VarHandlePtr AsyncCheckpointNotify(
virtual void AsyncCheckpointNotify(const std::string& ep, const std::string& ep, const std::string& dir,
const std::string& dir, int64_t time_out = FLAGS_rpc_deadline) = 0;
int64_t time_out = FLAGS_rpc_deadline) = 0;
virtual VarHandlePtr AsyncSendComplete(
virtual void AsyncSendComplete(const std::string& ep, const std::string& ep, int64_t time_out = FLAGS_rpc_deadline) = 0;
int64_t time_out = FLAGS_rpc_deadline) = 0;
// Complete tells all the pserver instances that finishe the training, // Complete tells all the pserver instances that finishe the training,
// the pserver can reduce it's barrier count, and continue to train // the pserver can reduce it's barrier count, and continue to train
......
...@@ -101,6 +101,8 @@ void RPCServer::Complete() { ...@@ -101,6 +101,8 @@ void RPCServer::Complete() {
{ {
std::unique_lock<std::mutex> lock(mutex_); std::unique_lock<std::mutex> lock(mutex_);
client_num_--; client_num_--;
need_reset_all_vars_ = true;
VLOG(4) << "decrease client_num to: " << client_num_; VLOG(4) << "decrease client_num to: " << client_num_;
if (cur_cond_.load() == rpc_cond_map_[kRequestGet]) { if (cur_cond_.load() == rpc_cond_map_[kRequestGet]) {
barrier_counter_[kRequestGet]--; barrier_counter_[kRequestGet]--;
...@@ -109,6 +111,11 @@ void RPCServer::Complete() { ...@@ -109,6 +111,11 @@ void RPCServer::Complete() {
barrier_cond_.notify_all(); barrier_cond_.notify_all();
} }
bool RPCServer::NeedResetAllVars() {
std::unique_lock<std::mutex> lock(mutex_);
return need_reset_all_vars_;
}
int RPCServer::GetClientNum() { int RPCServer::GetClientNum() {
std::unique_lock<std::mutex> lock(mutex_); std::unique_lock<std::mutex> lock(mutex_);
return client_num_; return client_num_;
...@@ -120,6 +127,7 @@ void RPCServer::ResetBarrierCounter() { ...@@ -120,6 +127,7 @@ void RPCServer::ResetBarrierCounter() {
for (auto& t : barrier_counter_) { for (auto& t : barrier_counter_) {
t.second = 0; t.second = 0;
} }
need_reset_all_vars_ = false;
} }
void RPCServer::RegisterRPC(const std::string& rpc_name, void RPCServer::RegisterRPC(const std::string& rpc_name,
......
...@@ -49,7 +49,8 @@ class RPCServer { ...@@ -49,7 +49,8 @@ class RPCServer {
bind_address_(address), bind_address_(address),
exit_flag_(false), exit_flag_(false),
selected_port_(0), selected_port_(0),
client_num_(client_num) {} client_num_(client_num),
need_reset_all_vars_(false) {}
virtual ~RPCServer() {} virtual ~RPCServer() {}
virtual void StartServer() = 0; virtual void StartServer() = 0;
...@@ -86,6 +87,8 @@ class RPCServer { ...@@ -86,6 +87,8 @@ class RPCServer {
void ResetBarrierCounter(); void ResetBarrierCounter();
RPCServerProfiler& Profiler() { return profiler_; } RPCServerProfiler& Profiler() { return profiler_; }
bool NeedResetAllVars();
protected: protected:
virtual void ShutDownImpl() = 0; virtual void ShutDownImpl() = 0;
...@@ -104,6 +107,7 @@ class RPCServer { ...@@ -104,6 +107,7 @@ class RPCServer {
std::atomic<int> exit_flag_; std::atomic<int> exit_flag_;
int selected_port_; int selected_port_;
int client_num_; int client_num_;
bool need_reset_all_vars_;
std::unordered_map<std::string, RequestHandler*> rpc_call_map_; std::unordered_map<std::string, RequestHandler*> rpc_call_map_;
std::unordered_map<std::string, int> rpc_thread_num_; std::unordered_map<std::string, int> rpc_thread_num_;
......
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <unistd.h>
#include <string>
#include <thread> // NOLINT
#include "google/protobuf/text_format.h"
#include "gtest/gtest.h"
#include "paddle/fluid/operators/distributed/request_handler.h"
using paddle::operators::distributed::VarHandlePtr;
using paddle::operators::distributed::VarHandle;
// Thread body: blocks in VarHandle::Wait() until the handle is finished and
// expects the reported status to be success.
void WaitTrue(VarHandlePtr s) {
  EXPECT_TRUE(s->Wait());
}
// Thread body: blocks in VarHandle::Wait() until the handle is finished and
// expects the reported status to be failure.
void WaitFalse(VarHandlePtr s) {
  EXPECT_FALSE(s->Wait());
}
// Verifies the Wait()/Finish() handshake of VarHandle: a thread blocked in
// Wait() is released by Finish(ok) and observes exactly the status that was
// passed to Finish(). The first half of the handles is finished with `false`,
// the second half with `true`.
TEST(VarHandle, Run) {
  const int kHandleNum = 12;
  const int kFailedNum = 6;  // handles [0, kFailedNum) finish with `false`

  std::vector<VarHandlePtr> handles;
  for (int i = 0; i < kHandleNum; i++) {
    handles.push_back(
        VarHandlePtr(new VarHandle("", "", "", nullptr, nullptr)));
  }

  // Threads are stored by value and constructed in place. Passing a raw
  // `new std::thread(...)` to emplace_back on a vector of unique_ptr would
  // leak the (still joinable) thread if the vector failed to grow.
  // waiters[i] always waits on handles[i].
  std::vector<std::thread> waiters;

  // Failure path: start all waiters first, then finish and join one by one.
  for (int i = 0; i < kFailedNum; i++) {
    waiters.emplace_back(WaitFalse, handles[i]);
  }
  for (int i = 0; i < kFailedNum; i++) {
    handles[i]->Finish(false);
    waiters[i].join();
  }

  // Success path: same sequence for the remaining handles.
  for (int i = kFailedNum; i < kHandleNum; i++) {
    waiters.emplace_back(WaitTrue, handles[i]);
  }
  for (int i = kFailedNum; i < kHandleNum; i++) {
    handles[i]->Finish(true);
    waiters[i].join();
  }
}
...@@ -25,14 +25,14 @@ namespace paddle { ...@@ -25,14 +25,14 @@ namespace paddle {
namespace operators { namespace operators {
void FusionGRUOp::InferShape(framework::InferShapeContext* ctx) const { void FusionGRUOp::InferShape(framework::InferShapeContext* ctx) const {
PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) of GRU should not be null."); PADDLE_ENFORCE(ctx->HasInput("X"), "Assert only one Input(X) of GRU.");
PADDLE_ENFORCE(ctx->HasInput("WeightX"), PADDLE_ENFORCE(ctx->HasInput("WeightX"),
"Input(WeightX) of GRU should not be null."); "Assert only one Input(WeightX) of GRU.");
PADDLE_ENFORCE(ctx->HasInput("WeightH"), PADDLE_ENFORCE(ctx->HasInput("WeightH"),
"Input(WeightH) of GRU should not be null."); "Assert only one Input(WeightH) of GRU.");
PADDLE_ENFORCE(ctx->HasOutput("XX"), "Output(XX) of GRU should not be null."); PADDLE_ENFORCE(ctx->HasOutput("XX"), "Assert only one Output(XX) of GRU.");
PADDLE_ENFORCE(ctx->HasOutput("Hidden"), PADDLE_ENFORCE(ctx->HasOutput("Hidden"),
"Output(Hidden) of GRU should not be null."); "Assert only one Output(Hidden) of GRU.");
auto x_dims = ctx->GetInputDim("X"); auto x_dims = ctx->GetInputDim("X");
PADDLE_ENFORCE_EQ(x_dims.size(), 2, "Input(X)'s rank must be 2."); PADDLE_ENFORCE_EQ(x_dims.size(), 2, "Input(X)'s rank must be 2.");
...@@ -80,11 +80,11 @@ void FusionGRUOp::InferShape(framework::InferShapeContext* ctx) const { ...@@ -80,11 +80,11 @@ void FusionGRUOp::InferShape(framework::InferShapeContext* ctx) const {
} else { } else {
xx_width = x_dims[1] > wx_dims[1] ? wx_dims[1] : x_dims[1]; xx_width = x_dims[1] > wx_dims[1] ? wx_dims[1] : x_dims[1];
PADDLE_ENFORCE(ctx->HasOutput("ReorderedH0"), PADDLE_ENFORCE(ctx->HasOutput("ReorderedH0"),
"Output(ReorderedH0) of GRU should not be null."); "Assert only one Output(ReorderedH0) of GRU.");
PADDLE_ENFORCE(ctx->HasOutput("BatchedInput"), PADDLE_ENFORCE(ctx->HasOutput("BatchedInput"),
"Output(BatchedInput) of GRU should not be null."); "Assert only one Output(BatchedInput) of GRU.");
PADDLE_ENFORCE(ctx->HasOutput("BatchedOut"), PADDLE_ENFORCE(ctx->HasOutput("BatchedOut"),
"Output(BatchedOut) of GRU should not be null."); "Assert only one Output(BatchedOut) of GRU.");
ctx->SetOutputDim("BatchedInput", {x_dims[0], wx_dims[1]}); ctx->SetOutputDim("BatchedInput", {x_dims[0], wx_dims[1]});
ctx->SetOutputDim("BatchedOut", out_dims); ctx->SetOutputDim("BatchedOut", out_dims);
} }
......
...@@ -24,20 +24,17 @@ namespace paddle { ...@@ -24,20 +24,17 @@ namespace paddle {
namespace operators { namespace operators {
void FusionLSTMOp::InferShape(framework::InferShapeContext* ctx) const { void FusionLSTMOp::InferShape(framework::InferShapeContext* ctx) const {
PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) of LSTM should not be null."); PADDLE_ENFORCE(ctx->HasInput("X"), "Assert only one Input(X) of LSTM.");
PADDLE_ENFORCE(ctx->HasInput("WeightX"), PADDLE_ENFORCE(ctx->HasInput("WeightX"),
"Input(WeightX) of LSTM should not be null."); "Assert only one Input(WeightX) of LSTM.");
PADDLE_ENFORCE(ctx->HasInput("WeightH"), PADDLE_ENFORCE(ctx->HasInput("WeightH"),
"Input(WeightH) of LSTM should not be null."); "Assert only one Input(WeightH) of LSTM.");
PADDLE_ENFORCE(ctx->HasInput("Bias"), PADDLE_ENFORCE(ctx->HasInput("Bias"), "Assert only one Input(Bias) of LSTM.");
"Input(Bias) of LSTM should not be null."); PADDLE_ENFORCE(ctx->HasOutput("XX"), "Assert only one Output(XX) of LSTM.");
PADDLE_ENFORCE(ctx->HasOutput("XX"),
"Output(XX) of LSTM should not be null.");
PADDLE_ENFORCE(ctx->HasOutput("Hidden"), PADDLE_ENFORCE(ctx->HasOutput("Hidden"),
"Output(Hidden) of LSTM should not be null."); "Assert only one Output(Hidden) of LSTM.");
PADDLE_ENFORCE(ctx->HasOutput("Cell"), PADDLE_ENFORCE(ctx->HasOutput("Cell"),
"Output(Cell) of LSTM should not be null."); "Assert only one Output(Cell) of LSTM.");
auto x_dims = ctx->GetInputDim("X"); auto x_dims = ctx->GetInputDim("X");
PADDLE_ENFORCE_EQ(x_dims.size(), 2, "Input(X)'s rank must be 2."); PADDLE_ENFORCE_EQ(x_dims.size(), 2, "Input(X)'s rank must be 2.");
...@@ -96,15 +93,15 @@ void FusionLSTMOp::InferShape(framework::InferShapeContext* ctx) const { ...@@ -96,15 +93,15 @@ void FusionLSTMOp::InferShape(framework::InferShapeContext* ctx) const {
} else { } else {
xx_width = x_dims[1] > wx_dims[1] ? wx_dims[1] : x_dims[1]; xx_width = x_dims[1] > wx_dims[1] ? wx_dims[1] : x_dims[1];
PADDLE_ENFORCE(ctx->HasOutput("BatchedInput"), PADDLE_ENFORCE(ctx->HasOutput("BatchedInput"),
"Output(BatchedInput) of LSTM should not be null."); "Assert only one Output(BatchedInput) of LSTM.");
PADDLE_ENFORCE(ctx->HasOutput("BatchedHidden"), PADDLE_ENFORCE(ctx->HasOutput("BatchedHidden"),
"Output(BatchedHidden) of LSTM should not be null."); "Assert only one Output(BatchedHidden) of LSTM.");
PADDLE_ENFORCE(ctx->HasOutput("BatchedCell"), PADDLE_ENFORCE(ctx->HasOutput("BatchedCell"),
"Output(BatchedCell) of LSTM should not be null."); "Assert only one Output(BatchedCell) of LSTM.");
PADDLE_ENFORCE(ctx->HasOutput("ReorderedH0"), PADDLE_ENFORCE(ctx->HasOutput("ReorderedH0"),
"Output(ReorderedH0) of LSTM should not be null."); "Assert only one Output(ReorderedH0) of LSTM");
PADDLE_ENFORCE(ctx->HasOutput("ReorderedC0"), PADDLE_ENFORCE(ctx->HasOutput("ReorderedC0"),
"Output(ReorderedC0) of LSTM should not be null."); "Assert only one Output(ReorderedC0) of LSTM.");
ctx->SetOutputDim("BatchedInput", {x_dims[0], wx_dims[1]}); ctx->SetOutputDim("BatchedInput", {x_dims[0], wx_dims[1]});
ctx->SetOutputDim("BatchedHidden", out_dims); ctx->SetOutputDim("BatchedHidden", out_dims);
ctx->SetOutputDim("BatchedCell", out_dims); ctx->SetOutputDim("BatchedCell", out_dims);
......
...@@ -22,6 +22,7 @@ limitations under the License. */ ...@@ -22,6 +22,7 @@ limitations under the License. */
#include "gflags/gflags.h" #include "gflags/gflags.h"
#include "paddle/fluid/operators/detail/macros.h" #include "paddle/fluid/operators/detail/macros.h"
#include "paddle/fluid/operators/math/math_function.h"
#include "paddle/fluid/operators/distributed/request_handler_impl.h" #include "paddle/fluid/operators/distributed/request_handler_impl.h"
#include "paddle/fluid/operators/listen_and_serv_op.h" #include "paddle/fluid/operators/listen_and_serv_op.h"
...@@ -58,17 +59,16 @@ static void ParallelExecuteBlocks( ...@@ -58,17 +59,16 @@ static void ParallelExecuteBlocks(
framework::ProgramDesc *program, framework::Scope *scope) { framework::ProgramDesc *program, framework::Scope *scope) {
std::vector<std::future<void>> fs; std::vector<std::future<void>> fs;
for (size_t idx : parallel_blkids) { for (size_t idx : parallel_blkids) {
fs.push_back( fs.push_back(framework::Async([&executor, &prepared, &scope, idx]() {
framework::Async([&executor, &prepared, &program, &scope, idx]() { int run_block = idx; // thread local
int run_block = idx; // thread local try {
try { VLOG(3) << "running server block: " << run_block
VLOG(3) << "running server block: " << run_block << "pointer: " << prepared[run_block].get();
<< "pointer: " << prepared[run_block].get(); executor->RunPreparedContext(prepared[run_block].get(), scope);
executor->RunPreparedContext(prepared[run_block].get(), scope); } catch (const std::exception &e) {
} catch (const std::exception &e) { LOG(ERROR) << "run sub program error " << e.what();
LOG(ERROR) << "run sub program error " << e.what(); }
} }));
}));
} }
for (size_t i = 0; i < fs.size(); ++i) fs[i].wait(); for (size_t i = 0; i < fs.size(); ++i) fs[i].wait();
} }
...@@ -101,7 +101,7 @@ static int64_t GetTimestamp() { ...@@ -101,7 +101,7 @@ static int64_t GetTimestamp() {
void ListenAndServOp::RunSyncLoop( void ListenAndServOp::RunSyncLoop(
framework::Executor *executor, framework::ProgramDesc *program, framework::Executor *executor, framework::ProgramDesc *program,
framework::Scope *recv_scope, framework::Scope *recv_scope, platform::DeviceContext *dev_ctx,
const std::vector<int> &prefetch_block_id_list, const std::vector<int> &prefetch_block_id_list,
const int checkpoint_point_block_id) const { const int checkpoint_point_block_id) const {
VLOG(2) << "RunSyncLoop"; VLOG(2) << "RunSyncLoop";
...@@ -128,6 +128,7 @@ void ListenAndServOp::RunSyncLoop( ...@@ -128,6 +128,7 @@ void ListenAndServOp::RunSyncLoop(
rpc_service_->SetCond(distributed::kRequestGet); rpc_service_->SetCond(distributed::kRequestGet);
rpc_service_->WaitBarrier(distributed::kRequestGet); rpc_service_->WaitBarrier(distributed::kRequestGet);
rpc_service_->ResetBarrierCounter(); rpc_service_->ResetBarrierCounter();
while (true) { while (true) {
rpc_service_->Profiler().OneStep(); rpc_service_->Profiler().OneStep();
// Get from multiple trainers, we don't care about the order in which // Get from multiple trainers, we don't care about the order in which
...@@ -165,9 +166,7 @@ void ListenAndServOp::RunSyncLoop( ...@@ -165,9 +166,7 @@ void ListenAndServOp::RunSyncLoop(
recv_scope); recv_scope);
VLOG(2) << "run all blocks spent " << GetTimestamp() - ts << "(ms)"; VLOG(2) << "run all blocks spent " << GetTimestamp() - ts << "(ms)";
// reset received sparse vars to avoid reuse it in the next mini-batch ResetReceivedVars(recv_scope, dev_ctx, rpc_service_->NeedResetAllVars());
dynamic_cast<distributed::RequestSendHandler *>(request_send_handler_.get())
->ResetSparseVarRecorder();
rpc_service_->SetCond(distributed::kRequestGet); rpc_service_->SetCond(distributed::kRequestGet);
rpc_service_->WaitBarrier(distributed::kRequestGet); rpc_service_->WaitBarrier(distributed::kRequestGet);
...@@ -175,6 +174,42 @@ void ListenAndServOp::RunSyncLoop( ...@@ -175,6 +174,42 @@ void ListenAndServOp::RunSyncLoop(
} // while(true) } // while(true)
} }
// Resets the variables cached in `sparse_vars_` / `dense_vars_` inside
// `recv_scope`.
//
//   recv_scope - scope in which the cached variable names are looked up
//   dev_ctx    - device context handed to math::set_constant for dense vars
//   reset_all  - when true, dense vars are filled with 0 in addition to the
//                sparse vars being cleared
//
// Sparse (SelectedRows) vars always get their row list cleared. A name that
// cannot be found in the scope is logged and skipped; a var of an unexpected
// type raises through PADDLE_THROW.
void ListenAndServOp::ResetReceivedVars(framework::Scope *recv_scope,
                                        platform::DeviceContext *dev_ctx,
                                        bool reset_all) const {
  for (auto &sparse_name : sparse_vars_) {
    auto sparse_var = recv_scope->FindVar(sparse_name);
    if (sparse_var == nullptr) {
      VLOG(2) << "can not find var " << sparse_name << " in received scope";
      continue;
    }
    if (!sparse_var->IsType<framework::SelectedRows>()) {
      PADDLE_THROW("The type of sparse var should be SelectedRows");
    }
    VLOG(3) << "reset sparse var: " << sparse_name;
    auto *selected_rows = sparse_var->GetMutable<framework::SelectedRows>();
    selected_rows->mutable_rows()->clear();
  }

  // Dense vars are only touched when the caller explicitly asks for it.
  if (UNLIKELY(reset_all)) {
    const float zero = static_cast<float>(0);
    for (auto &dense_name : dense_vars_) {
      auto dense_var = recv_scope->FindVar(dense_name);
      if (dense_var == nullptr) {
        VLOG(2) << "can not find var " << dense_name << " in received scope";
        continue;
      }
      if (dense_var->IsType<framework::LoDTensor>()) {
        math::set_constant(
            *dev_ctx, dense_var->GetMutable<framework::LoDTensor>(), zero);
      } else if (dense_var->IsType<framework::Tensor>()) {
        math::set_constant(
            *dev_ctx, dense_var->GetMutable<framework::Tensor>(), zero);
      } else {
        PADDLE_THROW("The type of dense var should be in [LoDTensor, Tensor]");
      }
    }
  }
}
void ListenAndServOp::RunAsyncLoop(framework::Executor *executor, void ListenAndServOp::RunAsyncLoop(framework::Executor *executor,
framework::ProgramDesc *program, framework::ProgramDesc *program,
framework::Scope *recv_scope) const { framework::Scope *recv_scope) const {
...@@ -248,6 +283,25 @@ static void FillRequestCtx( ...@@ -248,6 +283,25 @@ static void FillRequestCtx(
h->SetCheckpointNotifyPreparedCtx(checkpoint_ctx); h->SetCheckpointNotifyPreparedCtx(checkpoint_ctx);
} }
// Sorts `varnames` into `sparse_vars_` (SelectedRows) and `dense_vars_`
// (LoDTensor / Tensor) according to the type each variable currently holds
// in `scope`.
//
// Every name must already exist in `scope`; a missing variable fails the
// PADDLE_ENFORCE, and a variable of any other type raises via PADDLE_THROW.
// Names are appended to the member vectors; nothing is cleared first.
void ListenAndServOp::CacheVarsType(const std::vector<std::string> &varnames,
                                    const framework::Scope &scope) const {
  for (const auto &name : varnames) {
    auto received = scope.FindVar(name);
    PADDLE_ENFORCE(received != nullptr,
                   "Received var should be initialized in the received scope.");

    if (received->IsType<framework::SelectedRows>()) {
      sparse_vars_.push_back(name);
      continue;
    }
    if (received->IsType<framework::LoDTensor>() ||
        received->IsType<framework::Tensor>()) {
      dense_vars_.push_back(name);
      continue;
    }
    PADDLE_THROW(
        "The type of received var should be in [SelectedRows, LoDTensor, "
        "Tensor].");
  }
}
void ListenAndServOp::RunImpl(const framework::Scope &scope, void ListenAndServOp::RunImpl(const framework::Scope &scope,
const platform::Place &dev_place) const { const platform::Place &dev_place) const {
// Mark this as PS that it should decide profiling by listening from trainer. // Mark this as PS that it should decide profiling by listening from trainer.
...@@ -258,6 +312,7 @@ void ListenAndServOp::RunImpl(const framework::Scope &scope, ...@@ -258,6 +312,7 @@ void ListenAndServOp::RunImpl(const framework::Scope &scope,
bool sync_mode = Attr<bool>("sync_mode"); bool sync_mode = Attr<bool>("sync_mode");
auto fan_in = Attr<int>("Fanin"); auto fan_in = Attr<int>("Fanin");
auto inputs = Inputs("X");
PADDLE_ENFORCE(!rpc_service_); PADDLE_ENFORCE(!rpc_service_);
std::string endpoint = Attr<std::string>("endpoint"); std::string endpoint = Attr<std::string>("endpoint");
...@@ -348,11 +403,16 @@ void ListenAndServOp::RunImpl(const framework::Scope &scope, ...@@ -348,11 +403,16 @@ void ListenAndServOp::RunImpl(const framework::Scope &scope,
signal(SIGINT, SignalHandler::StopAndExit); signal(SIGINT, SignalHandler::StopAndExit);
signal(SIGTERM, SignalHandler::StopAndExit); signal(SIGTERM, SignalHandler::StopAndExit);
// Cache the type of the received vars as `sparse_vars_` and `dense_vars_`
// so that we can reset them at the end of each iteration.
// NOTE: only used in sync update
CacheVarsType(inputs, recv_scope);
// Write to a file of server selected port for python use. // Write to a file of server selected port for python use.
SavePort(); SavePort();
if (sync_mode) { if (sync_mode) {
RunSyncLoop(&executor, program, &recv_scope, prefetch_block_id_list, RunSyncLoop(&executor, program, &recv_scope, &dev_ctx,
checkpoint_block_id); prefetch_block_id_list, checkpoint_block_id);
} else { } else {
RunAsyncLoop(&executor, program, &recv_scope); RunAsyncLoop(&executor, program, &recv_scope);
} }
......
...@@ -26,6 +26,7 @@ limitations under the License. */ ...@@ -26,6 +26,7 @@ limitations under the License. */
#include "paddle/fluid/framework/threadpool.h" #include "paddle/fluid/framework/threadpool.h"
#include "paddle/fluid/operators/distributed/request_handler.h" #include "paddle/fluid/operators/distributed/request_handler.h"
#include "paddle/fluid/operators/distributed/rpc_server.h" #include "paddle/fluid/operators/distributed/rpc_server.h"
#include "paddle/fluid/platform/device_context.h"
namespace paddle { namespace paddle {
namespace operators { namespace operators {
...@@ -48,6 +49,7 @@ class ListenAndServOp : public framework::OperatorBase { ...@@ -48,6 +49,7 @@ class ListenAndServOp : public framework::OperatorBase {
void RunSyncLoop(framework::Executor* executor, void RunSyncLoop(framework::Executor* executor,
framework::ProgramDesc* program, framework::ProgramDesc* program,
framework::Scope* recv_scope, framework::Scope* recv_scope,
platform::DeviceContext* dev_ctx,
const std::vector<int>& prefetch_block_id_list, const std::vector<int>& prefetch_block_id_list,
const int checkpoint_point_block_id) const; const int checkpoint_point_block_id) const;
...@@ -64,6 +66,13 @@ class ListenAndServOp : public framework::OperatorBase { ...@@ -64,6 +66,13 @@ class ListenAndServOp : public framework::OperatorBase {
void RunImpl(const framework::Scope& scope, void RunImpl(const framework::Scope& scope,
const platform::Place& dev_place) const override; const platform::Place& dev_place) const override;
void ResetReceivedVars(framework::Scope* recv_scope,
platform::DeviceContext* dev_ctx,
bool reset_all = false) const;
void CacheVarsType(const std::vector<std::string>& varnames,
const framework::Scope& scope) const;
protected: protected:
mutable std::shared_ptr<distributed::RPCServer> rpc_service_; mutable std::shared_ptr<distributed::RPCServer> rpc_service_;
mutable std::shared_ptr<distributed::RequestHandler> request_send_handler_; mutable std::shared_ptr<distributed::RequestHandler> request_send_handler_;
...@@ -74,6 +83,8 @@ class ListenAndServOp : public framework::OperatorBase { ...@@ -74,6 +83,8 @@ class ListenAndServOp : public framework::OperatorBase {
request_checkpoint_handler_; request_checkpoint_handler_;
mutable std::shared_ptr<std::thread> server_thread_; mutable std::shared_ptr<std::thread> server_thread_;
mutable std::vector<std::string> sparse_vars_;
mutable std::vector<std::string> dense_vars_;
}; };
class SignalHandler { class SignalHandler {
......
...@@ -28,7 +28,8 @@ class CrossEntropyFunctor<platform::CPUDeviceContext, T> { ...@@ -28,7 +28,8 @@ class CrossEntropyFunctor<platform::CPUDeviceContext, T> {
public: public:
void operator()(const platform::CPUDeviceContext& ctx, framework::Tensor* out, void operator()(const platform::CPUDeviceContext& ctx, framework::Tensor* out,
const framework::Tensor* prob, const framework::Tensor* prob,
const framework::Tensor* labels, const bool softLabel) { const framework::Tensor* labels, const bool softLabel,
const int ignore_index) {
const int batch_size = prob->dims()[0]; const int batch_size = prob->dims()[0];
if (softLabel) { if (softLabel) {
auto in = EigenMatrix<T>::From(*prob); auto in = EigenMatrix<T>::From(*prob);
...@@ -49,8 +50,12 @@ class CrossEntropyFunctor<platform::CPUDeviceContext, T> { ...@@ -49,8 +50,12 @@ class CrossEntropyFunctor<platform::CPUDeviceContext, T> {
int lbl = label_data[i]; int lbl = label_data[i];
PADDLE_ENFORCE_GE(lbl, 0); PADDLE_ENFORCE_GE(lbl, 0);
PADDLE_ENFORCE_LT(lbl, class_num); PADDLE_ENFORCE_LT(lbl, class_num);
PADDLE_ENFORCE((lbl >= 0 && lbl < class_num) || lbl == ignore_index);
int index = i * class_num + lbl; int index = i * class_num + lbl;
loss_data[i] = -math::TolerableValue<T>()(std::log(prob_data[index])); loss_data[i] =
lbl == ignore_index
? 0
: -math::TolerableValue<T>()(std::log(prob_data[index]));
} }
} }
} }
......
...@@ -23,11 +23,14 @@ namespace math { ...@@ -23,11 +23,14 @@ namespace math {
namespace { namespace {
template <typename T> template <typename T>
__global__ void CrossEntropyKernel(T* Y, const T* X, const int64_t* label, __global__ void CrossEntropyKernel(T* Y, const T* X, const int64_t* label,
const int N, const int D) { const int N, const int D,
const int ignore_index) {
for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < N; for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < N;
i += blockDim.x * gridDim.x) { i += blockDim.x * gridDim.x) {
PADDLE_ASSERT(label[i] >= 0 && label[i] < D); PADDLE_ASSERT(label[i] >= 0 && label[i] < D || label[i] == ignore_index);
Y[i] = -math::TolerableValue<T>()(log(X[i * D + label[i]])); Y[i] = ignore_index == label[i]
? 0
: -math::TolerableValue<T>()(log(X[i * D + label[i]]));
} }
} }
...@@ -57,7 +60,8 @@ class CrossEntropyFunctor<platform::CUDADeviceContext, T> { ...@@ -57,7 +60,8 @@ class CrossEntropyFunctor<platform::CUDADeviceContext, T> {
public: public:
void operator()(const platform::CUDADeviceContext& ctx, void operator()(const platform::CUDADeviceContext& ctx,
framework::Tensor* out, const framework::Tensor* prob, framework::Tensor* out, const framework::Tensor* prob,
const framework::Tensor* labels, bool softLabel) { const framework::Tensor* labels, bool softLabel,
const int ignore_index) {
const T* prob_data = prob->data<T>(); const T* prob_data = prob->data<T>();
T* loss_data = out->mutable_data<T>(ctx.GetPlace()); T* loss_data = out->mutable_data<T>(ctx.GetPlace());
...@@ -77,7 +81,8 @@ class CrossEntropyFunctor<platform::CUDADeviceContext, T> { ...@@ -77,7 +81,8 @@ class CrossEntropyFunctor<platform::CUDADeviceContext, T> {
int block = 512; int block = 512;
int grid = (batch_size + block - 1) / block; int grid = (batch_size + block - 1) / block;
CrossEntropyKernel<T><<<grid, block, 0, ctx.stream()>>>( CrossEntropyKernel<T><<<grid, block, 0, ctx.stream()>>>(
loss_data, prob_data, label_data, batch_size, class_num); loss_data, prob_data, label_data, batch_size, class_num,
ignore_index);
} }
} }
}; };
......
...@@ -38,7 +38,8 @@ class CrossEntropyFunctor { ...@@ -38,7 +38,8 @@ class CrossEntropyFunctor {
public: public:
void operator()(const DeviceContext& context, framework::Tensor* out, void operator()(const DeviceContext& context, framework::Tensor* out,
const framework::Tensor* prob, const framework::Tensor* prob,
const framework::Tensor* labels, const bool softLabel); const framework::Tensor* labels, const bool softLabel,
const int ignore_index);
}; };
} // namespace math } // namespace math
} // namespace operators } // namespace operators
......
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. /* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License"); Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License. you may not use this file except in compliance with the License.
...@@ -11,14 +11,151 @@ distributed under the License is distributed on an "AS IS" BASIS, ...@@ -11,14 +11,151 @@ distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#define EIGEN_USE_GPU
#include <algorithm>
#include "cub/cub.cuh"
#include "paddle/fluid/operators/norm_op.h" #include "paddle/fluid/operators/norm_op.h"
namespace paddle {
namespace operators {
// Device-side square root, overloaded on precision so that code templated on
// T can call square_root(x) for both float and double.
// Single precision: forwards to sqrtf.
__device__ __forceinline__ float square_root(float x) { return sqrtf(x); }
// Double precision: forwards to sqrt.
__device__ __forceinline__ double square_root(double x) { return sqrt(x); }
// Normalizes `x` along one axis: y = x / sqrt(sum(x * x) + eps).
//
// The data is addressed as a (pre, axis_n, post) layout: element j of slice i
// lives at (i / post) * post * axis_n + (i % post) + j * post. There are
// pre * post slices; each slice is processed by one thread block, blocks step
// over slices by gridDim.x and threads step over the axis by blockDim.x.
//
//   x        - input values
//   pre/post - extents around the normalized axis (pre * post slices)
//   axis_n   - number of elements along the normalized axis
//   eps      - added to the sum of squares before the square root is taken
//   y        - output, indexed exactly like x
//   out_norm - one value per slice, sqrt(sum of squares + eps)
//
// NOTE(review): BlockDim sizes the cub::BlockReduce and is presumably expected
// to equal the launch-time blockDim.x -- confirm at the call sites.
template <typename T, int BlockDim>
__global__ void Normalize(const T* x, const int pre,
const int axis_n, // dim in axis
const int post, const T eps, T* y, T* out_norm) {
typedef cub::BlockReduce<T, BlockDim> BlockReduce;
__shared__ typename BlockReduce::TempStorage temp_storage;
int num = pre * post;
for (int i = blockIdx.x; i < num; i += gridDim.x) {
// Offset of the first element (j == 0) of slice i.
int base = (i / post) * post * axis_n + (i % post);
T sum = 0.0;
// Shared so the value computed by thread 0 can be read by every thread.
__shared__ T norm;
// Per-thread partial sum of squares over the axis.
for (int j = threadIdx.x; j < axis_n; j += blockDim.x) {
const T x_ij = x[base + j * post];
sum += x_ij * x_ij;
}
// Block-wide sum of the partials; only thread 0 consumes the result.
T reduce_result = BlockReduce(temp_storage).Sum(sum);
if (threadIdx.x == 0) {
norm = square_root(reduce_result + eps);
out_norm[i] = norm;
}
// Barrier: `norm` must be written before any thread divides by it below.
__syncthreads();
for (int j = threadIdx.x; j < axis_n; j += blockDim.x) {
const int index = base + j * post;
y[index] = x[index] / norm;
}
}
}
// CUDA forward kernel of the `norm` op.
//
// Reads input "X" and the attributes "axis" / "epsilon", then launches
// Normalize on the op's CUDA stream to fill the outputs "Out" and "Norm".
template <typename DeviceContext, typename T>
class NormCUDAKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& ctx) const override {
    auto* in_x = ctx.Input<framework::Tensor>("X");
    auto* out_y = ctx.Output<framework::Tensor>("Out");
    auto* out_norm = ctx.Output<framework::Tensor>("Norm");
    const T* x = in_x->data<T>();
    T* y = out_y->mutable_data<T>(ctx.GetPlace());
    T* norm = out_norm->mutable_data<T>(ctx.GetPlace());

    auto xdim = in_x->dims();
    int axis = ctx.Attr<int>("axis");
    T eps = static_cast<T>(ctx.Attr<float>("epsilon"));
    // A negative axis counts from the last dimension.
    if (axis < 0) axis = xdim.size() + axis;
    int pre, n, post;
    GetDims(xdim, axis, &pre, &n, &post);

    // One block per (pre, post) slice, capped by what the device can keep
    // resident; the kernel loops when there are more slices than blocks.
    auto& dev_ctx = ctx.cuda_device_context();
    const int block = 512;
    int max_threads = dev_ctx.GetMaxPhysicalThreadCount();
    const int max_blocks = std::max(max_threads / block, 1);
    int grid = std::min(max_blocks, pre * post);
    Normalize<T, block><<<grid, block, 0, dev_ctx.stream()>>>(x, pre, n, post,
                                                              eps, y, norm);
  }
};
// Backward pass of the axis normalization. For every slice i (same
// (pre, axis_n, post) addressing as the forward kernel) it computes
//
//   x_grad = (y_grad - x * sum(x * y_grad) / norm^2) / norm,
//
// where norm = x_norm[i] and the sum runs over the normalized axis.
//
//   x       - forward input
//   x_norm  - per-slice norm, one value per slice (read as x_norm[i])
//   y_grad  - gradient w.r.t. the forward output, indexed like x
//   x_grad  - output gradient w.r.t. x, indexed like x
//
// Each slice is handled by one thread block; blocks step over slices by
// gridDim.x and threads step over the axis by blockDim.x.
// NOTE(review): BlockDim sizes the cub::BlockReduce and is presumably expected
// to equal the launch-time blockDim.x -- confirm at the call sites.
template <typename T, int BlockDim>
__global__ void NormalizeGradient(const T* x, const T* x_norm, const T* y_grad,
const int pre, const int axis_n,
const int post, T* x_grad) {
typedef cub::BlockReduce<T, BlockDim> BlockReduce;
__shared__ typename BlockReduce::TempStorage temp_storage_sum;
int num = pre * post;
for (int i = blockIdx.x; i < num; i += gridDim.x) {
T sum = 0.0;
// Per-slice scalars, written by thread 0 and read by the whole block.
__shared__ T row_sum;
__shared__ T row_sqrt_norm;
__shared__ T row_norm;
// Offset of the first element (j == 0) of slice i.
auto base = (i / post) * post * axis_n + (i % post);
// Per-thread partial of sum(x * y_grad) over the axis.
for (int j = threadIdx.x; j < axis_n; j += blockDim.x) {
int index = base + j * post;
sum += x[index] * y_grad[index];
}
// Block-wide sum of the partials; only thread 0 consumes the result.
T reduce_result = BlockReduce(temp_storage_sum).Sum(sum);
if (threadIdx.x == 0) {
row_sum = reduce_result;
// row_sqrt_norm is the stored norm; row_norm is its square.
row_sqrt_norm = x_norm[i];
row_norm = row_sqrt_norm * row_sqrt_norm;
}
// Barrier: the shared scalars must be written before they are read below.
__syncthreads();
for (int j = threadIdx.x; j < axis_n; j += blockDim.x) {
int index = base + j * post;
const T x_ij = x[index];
const T dy_ij = y_grad[index];
x_grad[index] = (dy_ij - x_ij * row_sum / row_norm) / row_sqrt_norm;
}
}
}
// CUDA backward kernel of the `norm` op.
//
// Takes the forward input "X", the saved "Norm" and the gradient of "Out",
// and launches NormalizeGradient on the op's CUDA stream to produce the
// gradient of "X".
template <typename DeviceContext, typename T, typename AttrType = T>
class NormGradCUDAKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& ctx) const override {
    auto* x_tensor = ctx.Input<framework::Tensor>("X");
    auto* norm_tensor = ctx.Input<framework::Tensor>("Norm");
    auto* dy_tensor =
        ctx.Input<framework::Tensor>(framework::GradVarName("Out"));
    auto* dx_tensor =
        ctx.Output<framework::Tensor>(framework::GradVarName("X"));

    T* dx_data = dx_tensor->mutable_data<T>(ctx.GetPlace());
    const T* x_data = x_tensor->data<T>();
    const T* norm_data = norm_tensor->data<T>();
    const T* dy_data = dy_tensor->data<T>();

    // Split the shape around the (possibly negative) axis.
    auto x_dims = x_tensor->dims();
    int axis = ctx.Attr<int>("axis");
    if (axis < 0) {
      axis = x_dims.size() + axis;
    }
    int pre, n, post;
    GetDims(x_dims, axis, &pre, &n, &post);

    // Launch configuration: one block per slice, capped by device capacity.
    auto& dev_ctx = ctx.cuda_device_context();
    const int block = 512;
    int max_threads = dev_ctx.GetMaxPhysicalThreadCount();
    const int max_blocks = std::max(max_threads / block, 1);
    int grid = std::min(max_blocks, pre * post);

    NormalizeGradient<T, block><<<grid, block, 0, dev_ctx.stream()>>>(
        x_data, norm_data, dy_data, pre, n, post, dx_data);
  }
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators; namespace ops = paddle::operators;
using CUDA = paddle::platform::CUDADeviceContext; using CUDA = paddle::platform::CUDADeviceContext;
REGISTER_OP_CUDA_KERNEL(norm, ops::NormKernel<CUDA, float>, REGISTER_OP_CUDA_KERNEL(norm, ops::NormCUDAKernel<CUDA, float>,
ops::NormKernel<CUDA, double>); ops::NormCUDAKernel<CUDA, double>);
REGISTER_OP_CUDA_KERNEL(norm_grad, ops::NormGradKernel<CUDA, float>, REGISTER_OP_CUDA_KERNEL(norm_grad, ops::NormGradCUDAKernel<CUDA, float>,
ops::NormGradKernel<CUDA, double>); ops::NormGradCUDAKernel<CUDA, double>);
...@@ -65,14 +65,17 @@ class NormKernel : public framework::OpKernel<T> { ...@@ -65,14 +65,17 @@ class NormKernel : public framework::OpKernel<T> {
Eigen::DSizes<int, 1> rdim(1); Eigen::DSizes<int, 1> rdim(1);
// y = x / sqrt((sum(x * x) + epsilon)) // y = x / sqrt((sum(x * x) + epsilon))
// norm = sqrt(sum(x * x) + epsilon) // norm = sqrt(sum(x * x) + epsilon)
auto sum = x.pow(2).sum(rdim) + eps; auto x2 = x * x;
auto sum = x2.sum(rdim) + eps;
norm.device(*place) = sum.sqrt(); norm.device(*place) = sum.sqrt();
// y = x / norm // y = x / norm
Eigen::DSizes<int, 3> rshape(pre, 1, post); Eigen::DSizes<int, 3> rshape(pre, 1, post);
Eigen::DSizes<int, 3> bcast(1, n, 1); Eigen::DSizes<int, 3> bcast(1, n, 1);
y.device(*place) = x / norm.reshape(rshape).broadcast(bcast); y.device(*place) = x / norm.reshape(rshape).broadcast(bcast);
} }
}; };
template <typename DeviceContext, typename T, typename AttrType = T> template <typename DeviceContext, typename T, typename AttrType = T>
class NormGradKernel : public framework::OpKernel<T> { class NormGradKernel : public framework::OpKernel<T> {
public: public:
......
...@@ -44,16 +44,20 @@ class PrefetchOp : public framework::OperatorBase { ...@@ -44,16 +44,20 @@ class PrefetchOp : public framework::OperatorBase {
distributed::RPCClient* rpc_client = distributed::RPCClient* rpc_client =
distributed::RPCClient::GetInstance<RPCCLIENT_T>(); distributed::RPCClient::GetInstance<RPCCLIENT_T>();
std::vector<distributed::VarHandlePtr> rets;
for (size_t i = 0; i < ins.size(); i++) { for (size_t i = 0; i < ins.size(); i++) {
if (NeedSend(scope, ins[i])) { if (NeedSend(scope, ins[i])) {
VLOG(3) << "sending " << ins[i] << " to " << epmap[i] << " to get " VLOG(3) << "sending " << ins[i] << " to " << epmap[i] << " to get "
<< outs[i] << " back"; << outs[i] << " back";
rpc_client->AsyncPrefetchVar(epmap[i], ctx, scope, ins[i], outs[i]); rets.push_back(rpc_client->AsyncPrefetchVar(epmap[i], ctx, scope,
ins[i], outs[i]));
} else { } else {
VLOG(3) << "don't send no-initialied variable: " << ins[i]; VLOG(3) << "don't send no-initialied variable: " << ins[i];
} }
} }
PADDLE_ENFORCE(rpc_client->Wait(), "internal error in RPCClient"); for (size_t i = 0; i < rets.size(); i++) {
PADDLE_ENFORCE(rets[i]->Wait(), "internal error in RPCClient");
}
} }
}; };
......
...@@ -26,10 +26,13 @@ class PReluOp : public framework::OperatorWithKernel { ...@@ -26,10 +26,13 @@ class PReluOp : public framework::OperatorWithKernel {
std::string mode = ctx->Attrs().Get<std::string>("mode"); std::string mode = ctx->Attrs().Get<std::string>("mode");
auto x_dim = ctx->GetInputDim("X"); auto x_dim = ctx->GetInputDim("X");
PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) should not be null"); PADDLE_ENFORCE(ctx->HasInput("X"),
PADDLE_ENFORCE(ctx->HasInput("Alpha"), "Input(Alpha) should not be null"); "Input(X) of PreluOp should not be null");
PADDLE_ENFORCE(ctx->HasInput("Alpha"),
"Input(Alpha) of PreluOp should not be null");
PADDLE_ENFORCE(ctx->HasOutput("Out"), "Output(Out) should not be null"); PADDLE_ENFORCE(ctx->HasOutput("Out"),
"Output(Out) of PreluOp should not be null");
if (mode == "all") { if (mode == "all") {
PADDLE_ENFORCE(product(ctx->GetInputDim("Alpha")) == 1, PADDLE_ENFORCE(product(ctx->GetInputDim("Alpha")) == 1,
"For mode 'all', size of weight Alpha must be one."); "For mode 'all', size of weight Alpha must be one.");
......
...@@ -44,12 +44,15 @@ class RecvOp : public framework::OperatorBase { ...@@ -44,12 +44,15 @@ class RecvOp : public framework::OperatorBase {
distributed::RPCClient* rpc_client = distributed::RPCClient* rpc_client =
distributed::RPCClient::GetInstance<RPCCLIENT_T>(); distributed::RPCClient::GetInstance<RPCCLIENT_T>();
std::vector<distributed::VarHandlePtr> rets;
for (size_t i = 0; i < outs.size(); i++) { for (size_t i = 0; i < outs.size(); i++) {
VLOG(3) << "getting " << outs[i] << " from " << epmap[i]; VLOG(3) << "getting " << outs[i] << " from " << epmap[i];
rpc_client->AsyncGetVar(epmap[i], ctx, scope, outs[i]); rets.push_back(rpc_client->AsyncGetVar(epmap[i], ctx, scope, outs[i]));
} }
if (sync_mode) { if (sync_mode) {
PADDLE_ENFORCE(rpc_client->Wait(), "internal error in RPCClient"); for (size_t i = 0; i < rets.size(); i++) {
PADDLE_ENFORCE(rets[i]->Wait(), "internal error in RPCClient");
}
} }
} }
}; };
......
...@@ -15,6 +15,7 @@ limitations under the License. */ ...@@ -15,6 +15,7 @@ limitations under the License. */
#include <future> // NOLINT #include <future> // NOLINT
#include <ostream> #include <ostream>
#include "paddle/fluid/framework/blocking_queue.h"
#include "paddle/fluid/framework/data_type.h" #include "paddle/fluid/framework/data_type.h"
#include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/lod_tensor.h"
#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/op_registry.h"
...@@ -45,18 +46,19 @@ class SendOp : public framework::OperatorBase { ...@@ -45,18 +46,19 @@ class SendOp : public framework::OperatorBase {
distributed::RPCClient* rpc_client = distributed::RPCClient* rpc_client =
distributed::RPCClient::GetInstance<RPCCLIENT_T>(); distributed::RPCClient::GetInstance<RPCCLIENT_T>();
std::vector<distributed::VarHandlePtr> rets;
for (size_t i = 0; i < ins.size(); i++) { for (size_t i = 0; i < ins.size(); i++) {
if (NeedSend(scope, ins[i])) { if (NeedSend(scope, ins[i])) {
VLOG(3) << "sending " << ins[i] << " to " << epmap[i]; VLOG(3) << "sending " << ins[i] << " to " << epmap[i];
// TODO(Yancey1989): we need to use an IO threadpool which has rets.push_back(rpc_client->AsyncSendVar(epmap[i], ctx, scope, ins[i]));
// a larger number of threads than the computing threadpool.
rpc_client->AsyncSendVar(epmap[i], ctx, scope, ins[i]);
} else { } else {
VLOG(3) << "don't send no-initialied variable: " << ins[i]; VLOG(3) << "don't send no-initialied variable: " << ins[i];
} }
} }
if (sync_send) { if (sync_send) {
PADDLE_ENFORCE(rpc_client->Wait(), "internal error in RPCClient"); for (size_t i = 0; i < rets.size(); i++) {
PADDLE_ENFORCE(rets[i]->Wait(), "internal error in RPCClient");
}
} }
} }
}; };
......
...@@ -44,6 +44,12 @@ class SoftmaxWithCrossEntropyOpMaker ...@@ -44,6 +44,12 @@ class SoftmaxWithCrossEntropyOpMaker
"(bool, default: false), A flag to indicate whether to interpretate " "(bool, default: false), A flag to indicate whether to interpretate "
"the given labels as soft labels.") "the given labels as soft labels.")
.SetDefault(false); .SetDefault(false);
AddAttr<int>(
"ignore_index",
"(int, default -100), Specifies a target value that is ignored and"
"does not contribute to the input gradient. Only valid if soft_label"
"is set to False")
.SetDefault(-100);
AddComment(R"DOC( AddComment(R"DOC(
Softmax With Cross Entropy Operator. Softmax With Cross Entropy Operator.
......
...@@ -26,11 +26,13 @@ using Tensor = framework::Tensor; ...@@ -26,11 +26,13 @@ using Tensor = framework::Tensor;
namespace { namespace {
template <typename T> template <typename T>
__global__ void CrossEntropyGrad(T* logit_grad, const int64_t* labels, __global__ void CrossEntropyGrad(T* logit_grad, const int64_t* labels,
const int batch_size, const int class_num) { const int batch_size, const int class_num,
const int ignore_index) {
for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < batch_size; for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < batch_size;
i += blockDim.x * gridDim.x) { i += blockDim.x * gridDim.x) {
int idx = i * class_num + labels[i]; int idx = i * class_num + labels[i];
logit_grad[idx] -= static_cast<T>(1.); logit_grad[idx] -=
ignore_index == labels[i] ? static_cast<T>(0.) : static_cast<T>(1.);
} }
} }
...@@ -260,6 +262,7 @@ class SoftmaxWithCrossEntropyCUDAKernel : public framework::OpKernel<T> { ...@@ -260,6 +262,7 @@ class SoftmaxWithCrossEntropyCUDAKernel : public framework::OpKernel<T> {
auto* loss_data = loss->mutable_data<T>(context.GetPlace()); auto* loss_data = loss->mutable_data<T>(context.GetPlace());
auto soft_label = context.Attr<bool>("soft_label"); auto soft_label = context.Attr<bool>("soft_label");
auto ignore_index = context.Attr<int>("ignore_index");
if (soft_label) { if (soft_label) {
int batch_size = logits->dims()[0]; int batch_size = logits->dims()[0];
int feature_size = logits->dims()[1]; int feature_size = logits->dims()[1];
...@@ -272,7 +275,8 @@ class SoftmaxWithCrossEntropyCUDAKernel : public framework::OpKernel<T> { ...@@ -272,7 +275,8 @@ class SoftmaxWithCrossEntropyCUDAKernel : public framework::OpKernel<T> {
math::SoftmaxCUDNNFunctor<T>()(context.cuda_device_context(), logits, math::SoftmaxCUDNNFunctor<T>()(context.cuda_device_context(), logits,
softmax); softmax);
math::CrossEntropyFunctor<platform::CUDADeviceContext, T>()( math::CrossEntropyFunctor<platform::CUDADeviceContext, T>()(
context.cuda_device_context(), loss, softmax, labels, false); context.cuda_device_context(), loss, softmax, labels, false,
ignore_index);
} }
} }
}; };
...@@ -295,7 +299,7 @@ class SoftmaxWithCrossEntropyGradCUDAKernel : public framework::OpKernel<T> { ...@@ -295,7 +299,7 @@ class SoftmaxWithCrossEntropyGradCUDAKernel : public framework::OpKernel<T> {
const int class_num = logit_grad->dims()[1]; const int class_num = logit_grad->dims()[1];
int block = 512; int block = 512;
auto stream = context.cuda_device_context().stream(); auto stream = context.cuda_device_context().stream();
auto ignore_index = context.Attr<int>("ignore_index");
if (context.Attr<bool>("soft_label")) { if (context.Attr<bool>("soft_label")) {
int grid = (batch_size * class_num + block - 1) / block; int grid = (batch_size * class_num + block - 1) / block;
const T* label_data = labels->data<T>(); const T* label_data = labels->data<T>();
...@@ -305,7 +309,7 @@ class SoftmaxWithCrossEntropyGradCUDAKernel : public framework::OpKernel<T> { ...@@ -305,7 +309,7 @@ class SoftmaxWithCrossEntropyGradCUDAKernel : public framework::OpKernel<T> {
int grid = (batch_size + block - 1) / block; int grid = (batch_size + block - 1) / block;
const int64_t* label_data = labels->data<int64_t>(); const int64_t* label_data = labels->data<int64_t>();
CrossEntropyGrad<T><<<grid, block, 0, stream>>>( CrossEntropyGrad<T><<<grid, block, 0, stream>>>(
logit_grad_data, label_data, batch_size, class_num); logit_grad_data, label_data, batch_size, class_num, ignore_index);
int num = batch_size * class_num; int num = batch_size * class_num;
grid = (num + block - 1) / block; grid = (num + block - 1) / block;
Scale<T><<<grid, block, 0, stream>>>(logit_grad_data, loss_grad_data, num, Scale<T><<<grid, block, 0, stream>>>(logit_grad_data, loss_grad_data, num,
......
...@@ -45,7 +45,8 @@ class SoftmaxWithCrossEntropyKernel : public framework::OpKernel<T> { ...@@ -45,7 +45,8 @@ class SoftmaxWithCrossEntropyKernel : public framework::OpKernel<T> {
math::SoftmaxFunctor<platform::CPUDeviceContext, T>()(dev_ctx, logits, math::SoftmaxFunctor<platform::CPUDeviceContext, T>()(dev_ctx, logits,
softmax); softmax);
math::CrossEntropyFunctor<platform::CPUDeviceContext, T>()( math::CrossEntropyFunctor<platform::CPUDeviceContext, T>()(
dev_ctx, loss, softmax, labels, context.Attr<bool>("soft_label")); dev_ctx, loss, softmax, labels, context.Attr<bool>("soft_label"),
context.Attr<int>("ignore_index"));
} }
}; };
......
...@@ -160,11 +160,7 @@ class TensorRTEngineKernel : public framework::OpKernel<T> { ...@@ -160,11 +160,7 @@ class TensorRTEngineKernel : public framework::OpKernel<T> {
fluid_t->mutable_data<float>(platform::CUDAPlace( fluid_t->mutable_data<float>(platform::CUDAPlace(
boost::get<platform::CUDAPlace>(context.GetPlace()).device)), boost::get<platform::CUDAPlace>(context.GetPlace()).device)),
size * sizeof(float)); size * sizeof(float));
//} else {
// engine->GetOutputInGPU(
// y, fluid_t->mutable_data<float>(platform::CUDAPlace()),
// size * sizeof(float));
//}
output_index += 1; output_index += 1;
} }
......
...@@ -16,6 +16,9 @@ limitations under the License. */ ...@@ -16,6 +16,9 @@ limitations under the License. */
#include <vector> #include <vector>
#include "paddle/fluid/memory/memory.h" #include "paddle/fluid/memory/memory.h"
#ifdef PADDLE_WITH_CUDA
#include "paddle/fluid/framework/rw_lock.h"
#endif
namespace paddle { namespace paddle {
namespace platform { namespace platform {
...@@ -142,7 +145,58 @@ class EigenCudaStreamDevice : public Eigen::StreamInterface { ...@@ -142,7 +145,58 @@ class EigenCudaStreamDevice : public Eigen::StreamInterface {
mutable unsigned int* semaphore_; mutable unsigned int* semaphore_;
}; };
CUDADeviceContext::CUDADeviceContext(CUDAPlace place) : place_(place) { class CudnnHolder {
public:
// Creates a cuDNN handle and binds it to the CUDA stream pointed to by
// `stream`. The holder starts without a workspace (null pointer, length 0);
// one is allocated on demand by RunFunc.
//
// `stream` is stored as a pointer and is not owned by the holder.
// NOTE(review): the pointed-to stream presumably has to outlive this
// object, since it is dereferenced again when the workspace grows - confirm
// against the owner's destruction order.
CudnnHolder(const cudaStream_t* stream, const CUDAPlace& place)
    : workspace_(nullptr), workspace_len_(0), stream_(stream), place_(place) {
  PADDLE_ENFORCE(dynload::cudnnCreate(&cudnn_handle_));
  PADDLE_ENFORCE(dynload::cudnnSetStream(cudnn_handle_, *stream_));
}
// Returns the cuDNN handle created in the constructor.
cudnnHandle_t cudnn_handle() const { return cudnn_handle_; }
// Invokes `cudnn_func` with a pointer to the holder's workspace, growing the
// workspace first when it is smaller than `required_workspace_len` bytes.
//
// mtx_ is held for the whole call, including the execution of `cudnn_func`,
// so calls on the same holder are serialized.
void RunFunc(const std::function<void(void*)>& cudnn_func,
             size_t required_workspace_len) {
  std::lock_guard<std::mutex> lock(mtx_);
  if (required_workspace_len > workspace_len_) {
    ReallocateWorkspace(required_workspace_len);
  }
  cudnn_func(workspace_);
}
// Destroys the cuDNN handle and releases the workspace, if one was
// allocated.
// NOTE(review): PADDLE_ENFORCE is used inside a destructor here; if it
// reports failure by throwing, that would happen during destruction -
// confirm this is acceptable.
~CudnnHolder() {
  PADDLE_ENFORCE(dynload::cudnnDestroy(cudnn_handle_));
  if (workspace_ != nullptr) {
    paddle::memory::Free(place_, workspace_);
  }
}
private:
// Grows the cached workspace to `required_workspace_len` bytes. Does nothing
// when the current workspace is already at least that large. RunFunc calls
// this while holding mtx_.
//
// The old buffer is released before the new one is requested, so the
// bookkeeping is reset immediately after the release: if the allocation
// below fails, the holder must not keep describing (and later free again) a
// buffer that is already gone.
void ReallocateWorkspace(size_t required_workspace_len) {
  if (required_workspace_len <= workspace_len_) {
    return;
  }
  if (workspace_ != nullptr) {
    // Maybe someone is using the current workspace
    PADDLE_ENFORCE(cudaStreamSynchronize(*stream_));
    paddle::memory::Free(place_, workspace_);
    // Forget the released buffer right away; otherwise a failing Alloc would
    // leave a dangling pointer behind for the destructor to free a second
    // time.
    workspace_ = nullptr;
    workspace_len_ = 0;
  }
  workspace_ = paddle::memory::Alloc(place_, required_workspace_len);
  // Record the new capacity only when a buffer was actually obtained, so a
  // failed allocation is retried by the next request instead of handing out
  // a null workspace that claims to be large enough.
  // NOTE(review): whether Alloc reports failure by throwing or by returning
  // nullptr is not visible here; both cases leave the holder consistent.
  workspace_len_ = (workspace_ != nullptr) ? required_workspace_len : 0;
}
cudnnHandle_t cudnn_handle_;
void* workspace_;
size_t workspace_len_;
const cudaStream_t* stream_; // not owned;
const CUDAPlace place_;
std::mutex mtx_;
};
CUDADeviceContext::CUDADeviceContext(CUDAPlace place)
: place_(place), cudnn_holder_(nullptr) {
SetDeviceId(place_.device); SetDeviceId(place_.device);
compute_capability = GetCUDAComputeCapability(place_.device); compute_capability = GetCUDAComputeCapability(place_.device);
multi_process = GetCUDAMultiProcessors(place_.device); multi_process = GetCUDAMultiProcessors(place_.device);
...@@ -154,10 +208,7 @@ CUDADeviceContext::CUDADeviceContext(CUDAPlace place) : place_(place) { ...@@ -154,10 +208,7 @@ CUDADeviceContext::CUDADeviceContext(CUDAPlace place) : place_(place) {
PADDLE_ENFORCE(dynload::cublasCreate(&cublas_handle_)); PADDLE_ENFORCE(dynload::cublasCreate(&cublas_handle_));
PADDLE_ENFORCE(dynload::cublasSetStream(cublas_handle_, stream_)); PADDLE_ENFORCE(dynload::cublasSetStream(cublas_handle_, stream_));
if (dynload::HasCUDNN()) { if (dynload::HasCUDNN()) {
PADDLE_ENFORCE(dynload::cudnnCreate(&cudnn_handle_)); cudnn_holder_.reset(new CudnnHolder(&stream_, place));
PADDLE_ENFORCE(dynload::cudnnSetStream(cudnn_handle_, stream_));
} else {
cudnn_handle_ = nullptr;
} }
} }
...@@ -165,9 +216,6 @@ CUDADeviceContext::~CUDADeviceContext() { ...@@ -165,9 +216,6 @@ CUDADeviceContext::~CUDADeviceContext() {
SetDeviceId(place_.device); SetDeviceId(place_.device);
Wait(); Wait();
PADDLE_ENFORCE(dynload::cublasDestroy(cublas_handle_)); PADDLE_ENFORCE(dynload::cublasDestroy(cublas_handle_));
if (cudnn_handle_ != nullptr) {
PADDLE_ENFORCE(dynload::cudnnDestroy(cudnn_handle_));
}
eigen_stream_.reset(); eigen_stream_.reset();
eigen_device_.reset(); eigen_device_.reset();
PADDLE_ENFORCE(cudaStreamDestroy(stream_)); PADDLE_ENFORCE(cudaStreamDestroy(stream_));
...@@ -196,7 +244,14 @@ cublasHandle_t CUDADeviceContext::cublas_handle() const { ...@@ -196,7 +244,14 @@ cublasHandle_t CUDADeviceContext::cublas_handle() const {
return cublas_handle_; return cublas_handle_;
} }
cudnnHandle_t CUDADeviceContext::cudnn_handle() const { return cudnn_handle_; } cudnnHandle_t CUDADeviceContext::cudnn_handle() const {
return cudnn_holder_->cudnn_handle();
}
void CUDADeviceContext::RunCudnnFuncWithWorkspace(
const std::function<void(void*)>& cudnn_func, size_t workspace_len) const {
cudnn_holder_->RunFunc(cudnn_func, workspace_len);
}
cudaStream_t CUDADeviceContext::stream() const { return stream_; } cudaStream_t CUDADeviceContext::stream() const { return stream_; }
......
...@@ -69,6 +69,7 @@ struct DefaultDeviceContextType<platform::CPUPlace> { ...@@ -69,6 +69,7 @@ struct DefaultDeviceContextType<platform::CPUPlace> {
#ifdef PADDLE_WITH_CUDA #ifdef PADDLE_WITH_CUDA
class EigenCudaStreamDevice; class EigenCudaStreamDevice;
class CudnnHolder;
class CUDADeviceContext : public DeviceContext { class CUDADeviceContext : public DeviceContext {
public: public:
...@@ -96,6 +97,11 @@ class CUDADeviceContext : public DeviceContext { ...@@ -96,6 +97,11 @@ class CUDADeviceContext : public DeviceContext {
/*! \brief Return cudnn handle in the device context. */ /*! \brief Return cudnn handle in the device context. */
cudnnHandle_t cudnn_handle() const; cudnnHandle_t cudnn_handle() const;
/*! \brief Run a cudnn function with the workspace provided by
* CUDADeviceContext */
void RunCudnnFuncWithWorkspace(const std::function<void(void*)>& cudnn_func,
size_t workspace_len) const;
/*! \brief Return cuda stream in the device context. */ /*! \brief Return cuda stream in the device context. */
cudaStream_t stream() const; cudaStream_t stream() const;
...@@ -111,8 +117,8 @@ class CUDADeviceContext : public DeviceContext { ...@@ -111,8 +117,8 @@ class CUDADeviceContext : public DeviceContext {
std::unique_ptr<Eigen::GpuDevice> eigen_device_; std::unique_ptr<Eigen::GpuDevice> eigen_device_;
std::unique_ptr<EigenCudaStreamDevice> eigen_stream_; std::unique_ptr<EigenCudaStreamDevice> eigen_stream_;
std::unique_ptr<CudnnHolder> cudnn_holder_;
cudaStream_t stream_; cudaStream_t stream_;
cudnnHandle_t cudnn_handle_;
cublasHandle_t cublas_handle_; cublasHandle_t cublas_handle_;
int compute_capability; int compute_capability;
......
...@@ -192,7 +192,8 @@ class MKLDNNHandler { ...@@ -192,7 +192,8 @@ class MKLDNNHandler {
mkldnn::memory::primitive_desc& user_mpd, // NOLINT mkldnn::memory::primitive_desc& user_mpd, // NOLINT
const std::shared_ptr<mkldnn::memory> user_memory_p, const std::shared_ptr<mkldnn::memory> user_memory_p,
const std::string& suffix, const std::string& suffix,
std::vector<mkldnn::primitive>& pipeline) { // NOLINT std::vector<mkldnn::primitive>& pipeline, // NOLINT
bool is_persistent = false) {
// create reorder primitive if the input format is not the preferred one // create reorder primitive if the input format is not the preferred one
auto local_key = key_ + suffix; auto local_key = key_ + suffix;
auto key_reorder_p = key_ + suffix + "reorder_p"; auto key_reorder_p = key_ + suffix + "reorder_p";
...@@ -213,7 +214,7 @@ class MKLDNNHandler { ...@@ -213,7 +214,7 @@ class MKLDNNHandler {
pipeline.push_back(*reorder_p); pipeline.push_back(*reorder_p);
} }
dev_ctx_.SetBlob(local_key, target_memory_p); dev_ctx_.SetBlob(local_key, target_memory_p);
} else { } else if (!is_persistent) {
// Make reorder if needed // Make reorder if needed
auto reorder_p = std::static_pointer_cast<mkldnn::reorder>( auto reorder_p = std::static_pointer_cast<mkldnn::reorder>(
dev_ctx_.GetBlob(key_reorder_p)); dev_ctx_.GetBlob(key_reorder_p));
......
...@@ -137,7 +137,10 @@ void BindProgramDesc(pybind11::module *m) { ...@@ -137,7 +137,10 @@ void BindProgramDesc(pybind11::module *m) {
PADDLE_ENFORCE(desc->ParseFromString(data), PADDLE_ENFORCE(desc->ParseFromString(data),
"Fail to parse ProgramDesc from string. This could " "Fail to parse ProgramDesc from string. This could "
"be a bug of Paddle."); "be a bug of Paddle.");
}); })
.def("_version", [](pd::ProgramDesc &self) -> int64_t {
return self.Proto()->version().version();
});
} }
void BindBlockDesc(pybind11::module *m) { void BindBlockDesc(pybind11::module *m) {
......
...@@ -33,6 +33,7 @@ limitations under the License. */ ...@@ -33,6 +33,7 @@ limitations under the License. */
#include "paddle/fluid/framework/prune.h" #include "paddle/fluid/framework/prune.h"
#include "paddle/fluid/framework/reader.h" #include "paddle/fluid/framework/reader.h"
#include "paddle/fluid/framework/selected_rows.h" #include "paddle/fluid/framework/selected_rows.h"
#include "paddle/fluid/framework/version.h"
#include "paddle/fluid/operators/activation_op.h" #include "paddle/fluid/operators/activation_op.h"
#include "paddle/fluid/operators/reader/lod_tensor_blocking_queue.h" #include "paddle/fluid/operators/reader/lod_tensor_blocking_queue.h"
#include "paddle/fluid/platform/enforce.h" #include "paddle/fluid/platform/enforce.h"
...@@ -530,6 +531,8 @@ All parameter, weight, gradient are variables in Paddle. ...@@ -530,6 +531,8 @@ All parameter, weight, gradient are variables in Paddle.
m.def("set_feed_variable", framework::SetFeedVariable); m.def("set_feed_variable", framework::SetFeedVariable);
m.def("get_fetch_variable", framework::GetFetchVariable); m.def("get_fetch_variable", framework::GetFetchVariable);
m.def("_is_program_version_supported", IsProgramVersionSupported);
BindProgramDesc(&m); BindProgramDesc(&m);
BindBlockDesc(&m); BindBlockDesc(&m);
BindVarDsec(&m); BindVarDsec(&m);
...@@ -680,7 +683,6 @@ All parameter, weight, gradient are variables in Paddle. ...@@ -680,7 +683,6 @@ All parameter, weight, gradient are variables in Paddle.
const std::string &, Scope *, std::vector<Scope *> &, const std::string &, Scope *, std::vector<Scope *> &,
const ExecutionStrategy &, const BuildStrategy &, size_t, const ExecutionStrategy &, const BuildStrategy &, size_t,
size_t>()) size_t>())
.def("_bcast_params", &ParallelExecutor::BCastParamsToDevices)
// NOTE: even we return a vec<Scope*>* to Python use reference policy. // NOTE: even we return a vec<Scope*>* to Python use reference policy.
// We still cannot get local_scope from this vector, since the element // We still cannot get local_scope from this vector, since the element
// of vec<Scope*> will be freed by Python GC. We can only return Scope* // of vec<Scope*> will be freed by Python GC. We can only return Scope*
......
cc_library(stringpiece SRCS piece.cc) cc_library(stringpiece SRCS piece.cc)
cc_library(pretty_log SRCS pretty_log.cc)
cc_test(stringpiece_test SRCS piece_test.cc DEPS stringpiece glog gflags) cc_test(stringpiece_test SRCS piece_test.cc DEPS stringpiece glog gflags)
cc_test(stringprintf_test SRCS printf_test.cc DEPS glog gflags) cc_test(stringprintf_test SRCS printf_test.cc DEPS glog gflags)
cc_test(to_string_test SRCS to_string_test.cc) cc_test(to_string_test SRCS to_string_test.cc)
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/string/pretty_log.h"
#include <gflags/gflags.h>
// Defines the gflags boolean FLAGS_color (default: true). The style helpers
// in pretty_log.h return ANSI escape codes only while this flag is true and
// empty strings otherwise.
DEFINE_bool(color, true, "Whether to turn on pretty log");
namespace paddle {
namespace string {} // namespace string
} // namespace paddle
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <gflags/gflags.h>
#include <iostream>
#include <sstream>
#include <string>
#include <utility>
#include "paddle/fluid/string/printf.h"
DECLARE_bool(color);
namespace paddle {
namespace string {
// ANSI SGR escape sequences used to style terminal output. Every helper
// returns the sequence when FLAGS_color is true and an empty string
// otherwise, so callers can concatenate them unconditionally.
//
// The ESC byte is spelled "\033" (octal): "\e" is a compiler extension and
// not a standard C++ escape sequence.

// Foreground colors.
inline std::string black() { return FLAGS_color ? "\033[30m" : ""; }
inline std::string red() { return FLAGS_color ? "\033[31m" : ""; }
// Red background (not a foreground color).
inline std::string b_red() { return FLAGS_color ? "\033[41m" : ""; }
inline std::string green() { return FLAGS_color ? "\033[32m" : ""; }
inline std::string yellow() { return FLAGS_color ? "\033[33m" : ""; }
inline std::string blue() { return FLAGS_color ? "\033[34m" : ""; }
inline std::string purple() { return FLAGS_color ? "\033[35m" : ""; }
inline std::string cyan() { return FLAGS_color ? "\033[36m" : ""; }
inline std::string light_gray() { return FLAGS_color ? "\033[37m" : ""; }
// Emits the same code as light_gray().
inline std::string white() { return FLAGS_color ? "\033[37m" : ""; }
inline std::string light_red() { return FLAGS_color ? "\033[91m" : ""; }
// Text attributes.
inline std::string dim() { return FLAGS_color ? "\033[2m" : ""; }
inline std::string bold() { return FLAGS_color ? "\033[1m" : ""; }
inline std::string underline() { return FLAGS_color ? "\033[4m" : ""; }
inline std::string blink() { return FLAGS_color ? "\033[5m" : ""; }
// Clears every color and attribute set by the helpers above.
inline std::string reset() { return FLAGS_color ? "\033[0m" : ""; }
using TextBlock = std::pair<std::string, std::string>;
// Named presets that map a kind of log message to the escape sequence used
// to render it. Each preset is built from the color/attribute helpers and is
// therefore empty when those helpers return empty strings.
struct Style {
  // Plain informational text.
  static std::string info() { return black(); }
  // Warnings: red background.
  static std::string warn() { return b_red(); }
  // Success messages.
  static std::string suc() { return green(); }
  // Top-level heading: bold followed by purple.
  static std::string H1() {
    std::string heading = bold();
    heading += purple();
    return heading;
  }
  // Second- and third-level headings share one color.
  static std::string H2() { return green(); }
  static std::string H3() { return green(); }
  // Secondary detail text.
  static std::string detail() { return light_gray(); }
};
// Formats `fmt` with `args` via Sprintf and writes the result to std::cerr,
// preceded by `style` and followed by reset() and std::endl.
template <typename... Args>
static void PrettyLogEndl(const std::string& style, const char* fmt,
                          const Args&... args) {
  const auto message = Sprintf(fmt, args...);
  std::cerr << style << message << reset() << std::endl;
}
// Same as PrettyLogEndl but without the trailing std::endl: writes `style`,
// the Sprintf-formatted message, and reset() to std::cerr.
template <typename... Args>
static void PrettyLog(const std::string& style, const char* fmt,
                      const Args&... args) {
  const auto message = Sprintf(fmt, args...);
  std::cerr << style << message << reset();
}
} // namespace string
} // namespace paddle
...@@ -1564,6 +1564,9 @@ class Program(object): ...@@ -1564,6 +1564,9 @@ class Program(object):
""" """
return self.desc return self.desc
def _version(self):
    """Return the version reported by the underlying program desc.

    This simply forwards to ``self.desc._version()``.
    """
    return self.desc._version()
def clone(self, for_test=False): def clone(self, for_test=False):
""" """
Create a new, duplicated program. Create a new, duplicated program.
......
...@@ -750,6 +750,10 @@ def load_inference_model(dirname, ...@@ -750,6 +750,10 @@ def load_inference_model(dirname,
program_desc_str = f.read() program_desc_str = f.read()
program = Program.parse_from_string(program_desc_str) program = Program.parse_from_string(program_desc_str)
if not core._is_program_version_supported(program._version()):
raise ValueError("Unsupported program version: %d\n" %
program._version())
# Binary data also need versioning.
load_persistables(executor, dirname, program, params_filename) load_persistables(executor, dirname, program, params_filename)
if pserver_endpoints: if pserver_endpoints:
......
...@@ -55,15 +55,19 @@ for _OP in set(__auto__): ...@@ -55,15 +55,19 @@ for _OP in set(__auto__):
globals()[_OP] = generate_layer_fn(_OP) globals()[_OP] = generate_layer_fn(_OP)
def rpn_target_assign(loc, def rpn_target_assign(bbox_pred,
scores, cls_logits,
anchor_box, anchor_box,
anchor_var, anchor_var,
gt_box, gt_boxes,
is_crowd,
im_info,
rpn_batch_size_per_im=256, rpn_batch_size_per_im=256,
fg_fraction=0.25, rpn_straddle_thresh=0.0,
rpn_fg_fraction=0.5,
rpn_positive_overlap=0.7, rpn_positive_overlap=0.7,
rpn_negative_overlap=0.3): rpn_negative_overlap=0.3,
use_random=True):
""" """
** Target Assign Layer for region proposal network (RPN) in Faster-RCNN detection. ** ** Target Assign Layer for region proposal network (RPN) in Faster-RCNN detection. **
...@@ -83,14 +87,13 @@ def rpn_target_assign(loc, ...@@ -83,14 +87,13 @@ def rpn_target_assign(loc,
the positive anchors. the positive anchors.
Args: Args:
loc(Variable): A 3-D Tensor with shape [N, M, 4] represents the bbox_pred(Variable): A 3-D Tensor with shape [N, M, 4] represents the
predicted locations of M bounding bboxes. N is the batch size, predicted locations of M bounding bboxes. N is the batch size,
and each bounding box has four coordinate values and the layout and each bounding box has four coordinate values and the layout
is [xmin, ymin, xmax, ymax]. is [xmin, ymin, xmax, ymax].
scores(Variable): A 3-D Tensor with shape [N, M, C] represents the cls_logits(Variable): A 3-D Tensor with shape [N, M, 1] represents the
predicted confidence predictions. N is the batch size, C is the predicted confidence predictions. N is the batch size, 1 is the
class number, M is number of bounding boxes. For each category frontground and background sigmoid, M is number of bounding boxes.
there are total M scores which corresponding M bounding boxes.
anchor_box(Variable): A 2-D Tensor with shape [M, 4] holds M boxes, anchor_box(Variable): A 2-D Tensor with shape [M, 4] holds M boxes,
each box is represented as [xmin, ymin, xmax, ymax], each box is represented as [xmin, ymin, xmax, ymax],
[xmin, ymin] is the left top coordinate of the anchor box, [xmin, ymin] is the left top coordinate of the anchor box,
...@@ -99,11 +102,16 @@ def rpn_target_assign(loc, ...@@ -99,11 +102,16 @@ def rpn_target_assign(loc,
coordinate of the anchor box. coordinate of the anchor box.
anchor_var(Variable): A 2-D Tensor with shape [M,4] holds expanded anchor_var(Variable): A 2-D Tensor with shape [M,4] holds expanded
variances of anchors. variances of anchors.
gt_box (Variable): The ground-truth boudding boxes (bboxes) are a 2D gt_boxes (Variable): The ground-truth boudding boxes (bboxes) are a 2D
LoDTensor with shape [Ng, 4], Ng is the total number of ground-truth LoDTensor with shape [Ng, 4], Ng is the total number of ground-truth
bboxes of mini-batch input. bboxes of mini-batch input.
is_crowd (Variable): A 1-D LoDTensor which indicates whether the ground-truth is crowd.
im_info (Variable): A 2-D LoDTensor with shape [N, 3]. N is the batch size,
3 is the height, width and scale.
rpn_batch_size_per_im(int): Total number of RPN examples per image. rpn_batch_size_per_im(int): Total number of RPN examples per image.
fg_fraction(float): Target fraction of RoI minibatch that is labeled rpn_straddle_thresh(float): Remove RPN anchors that go outside the image
by straddle_thresh pixels.
rpn_fg_fraction(float): Target fraction of RoI minibatch that is labeled
foreground (i.e. class > 0), 0-th class is background. foreground (i.e. class > 0), 0-th class is background.
rpn_positive_overlap(float): Minimum overlap required between an anchor rpn_positive_overlap(float): Minimum overlap required between an anchor
and ground-truth box for the (anchor, gt box) pair to be a positive and ground-truth box for the (anchor, gt box) pair to be a positive
...@@ -129,45 +137,48 @@ def rpn_target_assign(loc, ...@@ -129,45 +137,48 @@ def rpn_target_assign(loc,
Examples: Examples:
.. code-block:: python .. code-block:: python
loc = layers.data(name='location', shape=[2, 80], bbox_pred = layers.data(name='bbox_pred', shape=[100, 4],
append_batch_size=False, dtype='float32') append_batch_size=False, dtype='float32')
scores = layers.data(name='scores', shape=[2, 40], cls_logits = layers.data(name='cls_logits', shape=[100, 1],
append_batch_size=False, dtype='float32') append_batch_size=False, dtype='float32')
anchor_box = layers.data(name='anchor_box', shape=[20, 4], anchor_box = layers.data(name='anchor_box', shape=[20, 4],
append_batch_size=False, dtype='float32') append_batch_size=False, dtype='float32')
gt_box = layers.data(name='gt_box', shape=[10, 4], gt_boxes = layers.data(name='gt_boxes', shape=[10, 4],
append_batch_size=False, dtype='float32') append_batch_size=False, dtype='float32')
loc_pred, score_pred, loc_target, score_target = loc_pred, score_pred, loc_target, score_target =
fluid.layers.detection_output(loc=location, fluid.layers.rpn_target_assign(bbox_pred=bbox_pred,
scores=scores, cls_logits=cls_logits,
anchor_box=anchor_box, anchor_box=anchor_box,
gt_box=gt_box) gt_boxes=gt_boxes)
""" """
helper = LayerHelper('rpn_target_assign', **locals()) helper = LayerHelper('rpn_target_assign', **locals())
# Compute overlaps between the prior boxes and the gt boxes overlaps
iou = iou_similarity(x=gt_box, y=anchor_box)
# Assign target label to anchors # Assign target label to anchors
loc_index = helper.create_tmp_variable(dtype='int32') loc_index = helper.create_tmp_variable(dtype='int32')
score_index = helper.create_tmp_variable(dtype='int32') score_index = helper.create_tmp_variable(dtype='int32')
target_label = helper.create_tmp_variable(dtype='int64') target_label = helper.create_tmp_variable(dtype='int32')
target_bbox = helper.create_tmp_variable(dtype=anchor_box.dtype) target_bbox = helper.create_tmp_variable(dtype=anchor_box.dtype)
helper.append_op( helper.append_op(
type="rpn_target_assign", type="rpn_target_assign",
inputs={'Anchor': anchor_box, inputs={
'GtBox': gt_box, 'Anchor': anchor_box,
'DistMat': iou}, 'GtBoxes': gt_boxes,
'IsCrowd': is_crowd,
'ImInfo': im_info
},
outputs={ outputs={
'LocationIndex': loc_index, 'LocationIndex': loc_index,
'ScoreIndex': score_index, 'ScoreIndex': score_index,
'TargetLabel': target_label, 'TargetLabel': target_label,
'TargetBBox': target_bbox, 'TargetBBox': target_bbox
}, },
attrs={ attrs={
'rpn_batch_size_per_im': rpn_batch_size_per_im, 'rpn_batch_size_per_im': rpn_batch_size_per_im,
'rpn_straddle_thresh': rpn_straddle_thresh,
'rpn_positive_overlap': rpn_positive_overlap, 'rpn_positive_overlap': rpn_positive_overlap,
'rpn_negative_overlap': rpn_negative_overlap, 'rpn_negative_overlap': rpn_negative_overlap,
'fg_fraction': fg_fraction 'rpn_fg_fraction': rpn_fg_fraction,
'use_random': use_random
}) })
loc_index.stop_gradient = True loc_index.stop_gradient = True
...@@ -175,12 +186,12 @@ def rpn_target_assign(loc, ...@@ -175,12 +186,12 @@ def rpn_target_assign(loc,
target_label.stop_gradient = True target_label.stop_gradient = True
target_bbox.stop_gradient = True target_bbox.stop_gradient = True
scores = nn.reshape(x=scores, shape=(-1, 1)) cls_logits = nn.reshape(x=cls_logits, shape=(-1, 1))
loc = nn.reshape(x=loc, shape=(-1, 4)) bbox_pred = nn.reshape(x=bbox_pred, shape=(-1, 4))
predicted_scores = nn.gather(scores, score_index) predicted_cls_logits = nn.gather(cls_logits, score_index)
predicted_location = nn.gather(loc, loc_index) predicted_bbox_pred = nn.gather(bbox_pred, loc_index)
return predicted_scores, predicted_location, target_label, target_bbox return predicted_cls_logits, predicted_bbox_pred, target_label, target_bbox
def detection_output(loc, def detection_output(loc,
...@@ -1258,15 +1269,17 @@ def anchor_generator(input, ...@@ -1258,15 +1269,17 @@ def anchor_generator(input,
def generate_proposal_labels(rpn_rois, def generate_proposal_labels(rpn_rois,
gt_classes, gt_classes,
is_crowd,
gt_boxes, gt_boxes,
im_scales, im_info,
batch_size_per_im=256, batch_size_per_im=256,
fg_fraction=0.25, fg_fraction=0.25,
fg_thresh=0.25, fg_thresh=0.25,
bg_thresh_hi=0.5, bg_thresh_hi=0.5,
bg_thresh_lo=0.0, bg_thresh_lo=0.0,
bbox_reg_weights=[0.1, 0.1, 0.2, 0.2], bbox_reg_weights=[0.1, 0.1, 0.2, 0.2],
class_nums=None): class_nums=None,
use_random=True):
""" """
** Generate proposal labels Faster-RCNN ** ** Generate proposal labels Faster-RCNN **
TODO(buxingyuan): Add Document TODO(buxingyuan): Add Document
...@@ -1285,8 +1298,9 @@ def generate_proposal_labels(rpn_rois, ...@@ -1285,8 +1298,9 @@ def generate_proposal_labels(rpn_rois,
inputs={ inputs={
'RpnRois': rpn_rois, 'RpnRois': rpn_rois,
'GtClasses': gt_classes, 'GtClasses': gt_classes,
'IsCrowd': is_crowd,
'GtBoxes': gt_boxes, 'GtBoxes': gt_boxes,
'ImScales': im_scales 'ImInfo': im_info
}, },
outputs={ outputs={
'Rois': rois, 'Rois': rois,
...@@ -1302,7 +1316,8 @@ def generate_proposal_labels(rpn_rois, ...@@ -1302,7 +1316,8 @@ def generate_proposal_labels(rpn_rois,
'bg_thresh_hi': bg_thresh_hi, 'bg_thresh_hi': bg_thresh_hi,
'bg_thresh_lo': bg_thresh_lo, 'bg_thresh_lo': bg_thresh_lo,
'bbox_reg_weights': bbox_reg_weights, 'bbox_reg_weights': bbox_reg_weights,
'class_nums': class_nums 'class_nums': class_nums,
'use_random': use_random
}) })
rois.stop_gradient = True rois.stop_gradient = True
......
...@@ -968,7 +968,7 @@ def dropout(x, dropout_prob, is_test=False, seed=None, name=None): ...@@ -968,7 +968,7 @@ def dropout(x, dropout_prob, is_test=False, seed=None, name=None):
return out return out
def cross_entropy(input, label, soft_label=False): def cross_entropy(input, label, soft_label=False, ignore_index=-100):
""" """
**Cross Entropy Layer** **Cross Entropy Layer**
...@@ -1012,7 +1012,10 @@ def cross_entropy(input, label, soft_label=False): ...@@ -1012,7 +1012,10 @@ def cross_entropy(input, label, soft_label=False):
tensor<float/double> with shape [N x D]. tensor<float/double> with shape [N x D].
soft_label (bool): a flag indicating whether to soft_label (bool): a flag indicating whether to
interpretate the given labels as soft interpretate the given labels as soft
labels, default `False`. labels. Default: `False`.
ignore_index (int): Specifies a target value that is ignored and does
not contribute to the input gradient. Only valid
if soft_label is set to False. Default: -100
Returns: Returns:
A 2-D tensor with shape [N x 1], the cross entropy loss. A 2-D tensor with shape [N x 1], the cross entropy loss.
...@@ -1037,7 +1040,8 @@ def cross_entropy(input, label, soft_label=False): ...@@ -1037,7 +1040,8 @@ def cross_entropy(input, label, soft_label=False):
inputs={'X': [input], inputs={'X': [input],
'Label': [label]}, 'Label': [label]},
outputs={'Y': [out]}, outputs={'Y': [out]},
attrs={"soft_label": soft_label}) attrs={"soft_label": soft_label,
"ignore_index": ignore_index})
return out return out
...@@ -4242,7 +4246,10 @@ def multiplex(inputs, index): ...@@ -4242,7 +4246,10 @@ def multiplex(inputs, index):
return out return out
def softmax_with_cross_entropy(logits, label, soft_label=False): def softmax_with_cross_entropy(logits,
label,
soft_label=False,
ignore_index=-100):
""" """
**Softmax With Cross Entropy Operator.** **Softmax With Cross Entropy Operator.**
...@@ -4284,6 +4291,10 @@ def softmax_with_cross_entropy(logits, label, soft_label=False): ...@@ -4284,6 +4291,10 @@ def softmax_with_cross_entropy(logits, label, soft_label=False):
soft_label is set to true, Label is a Tensor<float/double> with soft_label is set to true, Label is a Tensor<float/double> with
soft_label (bool): A flag to indicate whether to interpretate the given soft_label (bool): A flag to indicate whether to interpretate the given
labels as soft labels. By default, `soft_label` is set to False. labels as soft labels. By default, `soft_label` is set to False.
ignore_index (int): Specifies a target value that is ignored and does
not contribute to the input gradient. Only valid
if soft_label is set to False. Default: -100
Returns: Returns:
Variable: The cross entropy loss is a 2-D tensor with shape [N x 1]. Variable: The cross entropy loss is a 2-D tensor with shape [N x 1].
...@@ -4305,7 +4316,8 @@ def softmax_with_cross_entropy(logits, label, soft_label=False): ...@@ -4305,7 +4316,8 @@ def softmax_with_cross_entropy(logits, label, soft_label=False):
'Label': label}, 'Label': label},
outputs={'Softmax': softmax, outputs={'Softmax': softmax,
'Loss': loss}, 'Loss': loss},
attrs={'soft_label': soft_label}) attrs={'soft_label': soft_label,
'ignore_index': ignore_index})
return loss return loss
......
...@@ -128,6 +128,13 @@ class ParallelExecutor(object): ...@@ -128,6 +128,13 @@ class ParallelExecutor(object):
os.environ.get('CPU_NUM', multiprocessing.cpu_count())) os.environ.get('CPU_NUM', multiprocessing.cpu_count()))
exec_strategy.num_threads = cpu_num * 2 exec_strategy.num_threads = cpu_num * 2
# Set 1 thread num under nccl2 distribute
# env to make sure all gpus run ops in same order.
if num_trainers > 1:
assert (use_cuda)
# FIXME(gongwb): avoid this set.
exec_strategy.num_threads = 1
if build_strategy is None: if build_strategy is None:
build_strategy = BuildStrategy() build_strategy = BuildStrategy()
...@@ -135,11 +142,6 @@ class ParallelExecutor(object): ...@@ -135,11 +142,6 @@ class ParallelExecutor(object):
main = main if main else framework.default_main_program() main = main if main else framework.default_main_program()
if scope == None: if scope == None:
scope = executor.global_scope() scope = executor.global_scope()
# FIXME(Yancey1989): it's a temporary approach to determinate the distribute
# train program, call self.bcast_param() at the end of each mini-batch.
self.is_dist = True if "recv" in [
op.type for op in main.global_block().ops
] else False
if share_vars_from and not isinstance(share_vars_from, if share_vars_from and not isinstance(share_vars_from,
ParallelExecutor): ParallelExecutor):
...@@ -279,21 +281,11 @@ class ParallelExecutor(object): ...@@ -279,21 +281,11 @@ class ParallelExecutor(object):
self.executor.run(fetch_list, fetch_var_name) self.executor.run(fetch_list, fetch_var_name)
arr = self.scope.find_var(fetch_var_name).get_lod_tensor_array() arr = self.scope.find_var(fetch_var_name).get_lod_tensor_array()
if self.is_dist:
self._bcast_params()
if return_numpy: if return_numpy:
return executor.as_numpy(arr) return executor.as_numpy(arr)
return [arr[i] for i in range(len(arr))] return [arr[i] for i in range(len(arr))]
def _bcast_params(self):
"""
Broadcast the parameters to other devices. It is used during
distributed training.
"""
self.executor._bcast_params(set(self.persistable_vars))
@property @property
def device_count(self): def device_count(self):
return len(self._act_places) return len(self._act_places)
...@@ -178,7 +178,4 @@ if __name__ == '__main__': ...@@ -178,7 +178,4 @@ if __name__ == '__main__':
for parallel in (False, True): for parallel in (False, True):
if use_cuda and not core.is_compiled_with_cuda(): if use_cuda and not core.is_compiled_with_cuda():
continue continue
# TODO(minqiyang): remove this line after fixing the deletion main(use_cuda=use_cuda, parallel=parallel)
# order problem of Scope in ParallelExecutor in manylinux
if six.PY2:
main(use_cuda=use_cuda, parallel=parallel)
...@@ -152,7 +152,4 @@ if __name__ == '__main__': ...@@ -152,7 +152,4 @@ if __name__ == '__main__':
for parallel in (False, True): for parallel in (False, True):
if use_cuda and not core.is_compiled_with_cuda(): if use_cuda and not core.is_compiled_with_cuda():
continue continue
# TODO(minqiyang): remove this line after fixing the deletion main(use_cuda=use_cuda, parallel=parallel)
# order problem of Scope in ParallelExecutor in manylinux
if six.PY2:
main(use_cuda=use_cuda, parallel=parallel)
...@@ -155,7 +155,4 @@ if __name__ == '__main__': ...@@ -155,7 +155,4 @@ if __name__ == '__main__':
for parallel in (False, True): for parallel in (False, True):
if use_cuda and not core.is_compiled_with_cuda(): if use_cuda and not core.is_compiled_with_cuda():
continue continue
# TODO(minqiyang): remove this line after fixing the deletion main(use_cuda=use_cuda, parallel=parallel)
# order problem of Scope in ParallelExecutor in manylinux
if six.PY2:
main(use_cuda=use_cuda, parallel=parallel)
...@@ -137,7 +137,4 @@ if __name__ == '__main__': ...@@ -137,7 +137,4 @@ if __name__ == '__main__':
for parallel in (False, True): for parallel in (False, True):
if use_cuda and not core.is_compiled_with_cuda(): if use_cuda and not core.is_compiled_with_cuda():
continue continue
# TODO(minqiyang): remove this line after fixing the deletion main(use_cuda=use_cuda, parallel=parallel)
# order problem of Scope in ParallelExecutor in manylinux
if six.PY2:
main(use_cuda=use_cuda, parallel=parallel)
...@@ -148,51 +148,60 @@ class TestAnchorGenerator(unittest.TestCase): ...@@ -148,51 +148,60 @@ class TestAnchorGenerator(unittest.TestCase):
class TestGenerateProposalLabels(unittest.TestCase): class TestGenerateProposalLabels(unittest.TestCase):
def test_generate_proposal_labels(self): def test_generate_proposal_labels(self):
rpn_rois = layers.data( program = Program()
name='rpn_rois', with program_guard(program):
shape=[4, 4], rpn_rois = layers.data(
dtype='float32', name='rpn_rois',
lod_level=1, shape=[4, 4],
append_batch_size=False) dtype='float32',
gt_classes = layers.data( lod_level=1,
name='gt_classes', append_batch_size=False)
shape=[6], gt_classes = layers.data(
dtype='int32', name='gt_classes',
lod_level=1, shape=[6],
append_batch_size=False) dtype='int32',
gt_boxes = layers.data( lod_level=1,
name='gt_boxes', append_batch_size=False)
shape=[6, 4], is_crowd = layers.data(
dtype='float32', name='is_crowd',
lod_level=1, shape=[6],
append_batch_size=False) dtype='int32',
im_scales = layers.data( lod_level=1,
name='im_scales', append_batch_size=False)
shape=[1], gt_boxes = layers.data(
dtype='float32', name='gt_boxes',
lod_level=1, shape=[6, 4],
append_batch_size=False) dtype='float32',
class_nums = 5 lod_level=1,
rois, labels_int32, bbox_targets, bbox_inside_weights, bbox_outside_weights = fluid.layers.generate_proposal_labels( append_batch_size=False)
rpn_rois=rpn_rois, im_info = layers.data(
gt_classes=gt_classes, name='im_info',
gt_boxes=gt_boxes, shape=[1, 3],
im_scales=im_scales, dtype='float32',
batch_size_per_im=2, lod_level=1,
fg_fraction=0.5, append_batch_size=False)
fg_thresh=0.5, class_nums = 5
bg_thresh_hi=0.5, rois, labels_int32, bbox_targets, bbox_inside_weights, bbox_outside_weights = fluid.layers.generate_proposal_labels(
bg_thresh_lo=0.0, rpn_rois=rpn_rois,
bbox_reg_weights=[0.1, 0.1, 0.2, 0.2], gt_classes=gt_classes,
class_nums=class_nums) is_crowd=is_crowd,
assert rois.shape[1] == 4 gt_boxes=gt_boxes,
assert rois.shape[0] == labels_int32.shape[0] im_info=im_info,
assert rois.shape[0] == bbox_targets.shape[0] batch_size_per_im=2,
assert rois.shape[0] == bbox_inside_weights.shape[0] fg_fraction=0.5,
assert rois.shape[0] == bbox_outside_weights.shape[0] fg_thresh=0.5,
assert bbox_targets.shape[1] == 4 * class_nums bg_thresh_hi=0.5,
assert bbox_inside_weights.shape[1] == 4 * class_nums bg_thresh_lo=0.0,
assert bbox_outside_weights.shape[1] == 4 * class_nums bbox_reg_weights=[0.1, 0.1, 0.2, 0.2],
class_nums=class_nums)
assert rois.shape[1] == 4
assert rois.shape[0] == labels_int32.shape[0]
assert rois.shape[0] == bbox_targets.shape[0]
assert rois.shape[0] == bbox_inside_weights.shape[0]
assert rois.shape[0] == bbox_outside_weights.shape[0]
assert bbox_targets.shape[1] == 4 * class_nums
assert bbox_inside_weights.shape[1] == 4 * class_nums
assert bbox_outside_weights.shape[1] == 4 * class_nums
class TestMultiBoxHead(unittest.TestCase): class TestMultiBoxHead(unittest.TestCase):
...@@ -254,18 +263,18 @@ class TestRpnTargetAssign(unittest.TestCase): ...@@ -254,18 +263,18 @@ class TestRpnTargetAssign(unittest.TestCase):
def test_rpn_target_assign(self): def test_rpn_target_assign(self):
program = Program() program = Program()
with program_guard(program): with program_guard(program):
loc_shape = [10, 50, 4] bbox_pred_shape = [10, 50, 4]
score_shape = [10, 50, 2] cls_logits_shape = [10, 50, 2]
anchor_shape = [50, 4] anchor_shape = [50, 4]
loc = layers.data( bbox_pred = layers.data(
name='loc', name='bbox_pred',
shape=loc_shape, shape=bbox_pred_shape,
append_batch_size=False, append_batch_size=False,
dtype='float32') dtype='float32')
scores = layers.data( cls_logits = layers.data(
name='scores', name='cls_logits',
shape=score_shape, shape=cls_logits_shape,
append_batch_size=False, append_batch_size=False,
dtype='float32') dtype='float32')
anchor_box = layers.data( anchor_box = layers.data(
...@@ -278,17 +287,31 @@ class TestRpnTargetAssign(unittest.TestCase): ...@@ -278,17 +287,31 @@ class TestRpnTargetAssign(unittest.TestCase):
shape=anchor_shape, shape=anchor_shape,
append_batch_size=False, append_batch_size=False,
dtype='float32') dtype='float32')
gt_box = layers.data( gt_boxes = layers.data(
name='gt_box', shape=[4], lod_level=1, dtype='float32') name='gt_boxes', shape=[4], lod_level=1, dtype='float32')
is_crowd = layers.data(
name='is_crowd',
shape=[10],
dtype='int32',
lod_level=1,
append_batch_size=False)
im_info = layers.data(
name='im_info',
shape=[1, 3],
dtype='float32',
lod_level=1,
append_batch_size=False)
pred_scores, pred_loc, tgt_lbl, tgt_bbox = layers.rpn_target_assign( pred_scores, pred_loc, tgt_lbl, tgt_bbox = layers.rpn_target_assign(
loc=loc, bbox_pred=bbox_pred,
scores=scores, cls_logits=cls_logits,
anchor_box=anchor_box, anchor_box=anchor_box,
anchor_var=anchor_var, anchor_var=anchor_var,
gt_box=gt_box, gt_boxes=gt_boxes,
is_crowd=is_crowd,
im_info=im_info,
rpn_batch_size_per_im=256, rpn_batch_size_per_im=256,
fg_fraction=0.25, rpn_straddle_thresh=0.0,
rpn_fg_fraction=0.5,
rpn_positive_overlap=0.7, rpn_positive_overlap=0.7,
rpn_negative_overlap=0.3) rpn_negative_overlap=0.3)
......
...@@ -28,6 +28,10 @@ list(REMOVE_ITEM TEST_OPS test_cond_op) # FIXME(qijun): https://github.com/Paddl ...@@ -28,6 +28,10 @@ list(REMOVE_ITEM TEST_OPS test_cond_op) # FIXME(qijun): https://github.com/Paddl
list(REMOVE_ITEM TEST_OPS op_test) # op_test is a helper python file, not a test list(REMOVE_ITEM TEST_OPS op_test) # op_test is a helper python file, not a test
list(REMOVE_ITEM TEST_OPS decorators) # decorators is a helper python file, not a test list(REMOVE_ITEM TEST_OPS decorators) # decorators is a helper python file, not a test
if(APPLE)
# this op is not support on mac
list(REMOVE_ITEM TEST_OPS test_fusion_seqexpand_concat_fc_op)
endif()
function(py_test_modules TARGET_NAME) function(py_test_modules TARGET_NAME)
if(WITH_TESTING) if(WITH_TESTING)
...@@ -46,6 +50,7 @@ function(py_test_modules TARGET_NAME) ...@@ -46,6 +50,7 @@ function(py_test_modules TARGET_NAME)
endfunction() endfunction()
list(REMOVE_ITEM TEST_OPS test_warpctc_op) list(REMOVE_ITEM TEST_OPS test_warpctc_op)
list(REMOVE_ITEM TEST_OPS test_dist_train) list(REMOVE_ITEM TEST_OPS test_dist_train)
list(REMOVE_ITEM TEST_OPS test_dist_transpiler)
list(REMOVE_ITEM TEST_OPS test_parallel_executor_crf) list(REMOVE_ITEM TEST_OPS test_parallel_executor_crf)
list(REMOVE_ITEM TEST_OPS test_parallel_executor_fetch_feed) list(REMOVE_ITEM TEST_OPS test_parallel_executor_fetch_feed)
list(REMOVE_ITEM TEST_OPS test_dist_se_resnext) list(REMOVE_ITEM TEST_OPS test_dist_se_resnext)
...@@ -61,11 +66,12 @@ if(WITH_DISTRIBUTE) ...@@ -61,11 +66,12 @@ if(WITH_DISTRIBUTE)
set_tests_properties(test_listen_and_serv_op PROPERTIES TIMEOUT 20) set_tests_properties(test_listen_and_serv_op PROPERTIES TIMEOUT 20)
set_tests_properties(test_dist_mnist PROPERTIES TIMEOUT 200) set_tests_properties(test_dist_mnist PROPERTIES TIMEOUT 200)
set_tests_properties(test_dist_word2vec PROPERTIES TIMEOUT 200) set_tests_properties(test_dist_word2vec PROPERTIES TIMEOUT 200)
py_test_modules(test_dist_transpiler MODULES test_dist_transpiler)
py_test_modules(test_dist_transformer MODULES test_dist_transformer SERIAL)
py_test_modules(test_dist_se_resnext MODULES test_dist_se_resnext SERIAL)
endif() endif()
py_test_modules(test_parallel_executor_crf MODULES test_parallel_executor_crf SERIAL) py_test_modules(test_parallel_executor_crf MODULES test_parallel_executor_crf SERIAL)
py_test_modules(test_parallel_executor_fetch_feed MODULES test_parallel_executor_fetch_feed SERIAL) py_test_modules(test_parallel_executor_fetch_feed MODULES test_parallel_executor_fetch_feed SERIAL)
set_tests_properties(test_parallel_executor_fetch_feed PROPERTIES TIMEOUT 150) set_tests_properties(test_parallel_executor_fetch_feed PROPERTIES TIMEOUT 150)
py_test_modules(test_dist_transformer MODULES test_dist_transformer SERIAL)
py_test_modules(test_dist_se_resnext MODULES test_dist_se_resnext SERIAL)
py_test_modules(test_parallel_executor_transformer MODULES test_parallel_executor_transformer SERIAL) py_test_modules(test_parallel_executor_transformer MODULES test_parallel_executor_transformer SERIAL)
py_test_modules(test_image_classification_resnet MODULES test_image_classification_resnet SERIAL) py_test_modules(test_image_classification_resnet MODULES test_image_classification_resnet SERIAL)
...@@ -209,5 +209,34 @@ class TestCrossEntropyOp6(OpTest): ...@@ -209,5 +209,34 @@ class TestCrossEntropyOp6(OpTest):
["X"], "Y", max_relative_error=0.05, numeric_grad_delta=0.001) ["X"], "Y", max_relative_error=0.05, numeric_grad_delta=0.001)
class TestCrossEntropyOp7(OpTest):
"""Test cross-entropy with ignore index.
"""
def setUp(self):
self.op_type = "cross_entropy"
batch_size = 30
class_num = 10
ignore_index = 3
X = randomize_probability(batch_size, class_num, dtype='float64')
label = np.random.randint(0, class_num, (batch_size, 1), dtype="int64")
cross_entropy = np.asmatrix(
[[-np.log(X[i][label[i][0]])]
if label[i][0] != ignore_index else [0]
for i in range(X.shape[0])],
dtype="float64")
self.inputs = {"X": X, "Label": label}
self.outputs = {"Y": cross_entropy}
self.attrs = {"soft_label": False, "ignore_index": ignore_index}
def test_check_output(self):
self.check_output()
def test_check_grad(self):
self.check_grad(["X"], "Y", numeric_grad_delta=0.001)
if __name__ == "__main__": if __name__ == "__main__":
unittest.main() unittest.main()
...@@ -84,7 +84,7 @@ class TestDataBalance(unittest.TestCase): ...@@ -84,7 +84,7 @@ class TestDataBalance(unittest.TestCase):
self.data_file_name = './data_balance_test.recordio' self.data_file_name = './data_balance_test.recordio'
self.lod_data_file_name = './data_balance_with_lod_test.recordio' self.lod_data_file_name = './data_balance_with_lod_test.recordio'
self.total_ins_num = 50 self.total_ins_num = 50
self.batch_size = 10 self.batch_size = 12
self.prepare_data() self.prepare_data()
self.prepare_lod_data() self.prepare_lod_data()
......
...@@ -62,7 +62,7 @@ class TranspilerTest(unittest.TestCase): ...@@ -62,7 +62,7 @@ class TranspilerTest(unittest.TestCase):
t = self._transpiler_instance(config) t = self._transpiler_instance(config)
trainer_main = t.get_trainer_program() trainer_main = t.get_trainer_program(wait_port=False)
trainer_startup = fluid.default_startup_program() trainer_startup = fluid.default_startup_program()
assert (src.num_blocks == 1) assert (src.num_blocks == 1)
......
...@@ -20,10 +20,10 @@ import paddle.fluid as fluid ...@@ -20,10 +20,10 @@ import paddle.fluid as fluid
from op_test import OpTest from op_test import OpTest
def generate_proposal_labels_in_python( def generate_proposal_labels_in_python(rpn_rois, gt_classes, is_crowd, gt_boxes,
rpn_rois, gt_classes, gt_boxes, im_scales, batch_size_per_im, im_info, batch_size_per_im, fg_fraction,
fg_fraction, fg_thresh, bg_thresh_hi, bg_thresh_lo, bbox_reg_weights, fg_thresh, bg_thresh_hi, bg_thresh_lo,
class_nums): bbox_reg_weights, class_nums):
rois = [] rois = []
labels_int32 = [] labels_int32 = []
bbox_targets = [] bbox_targets = []
...@@ -31,13 +31,13 @@ def generate_proposal_labels_in_python( ...@@ -31,13 +31,13 @@ def generate_proposal_labels_in_python(
bbox_outside_weights = [] bbox_outside_weights = []
lod = [] lod = []
assert len(rpn_rois) == len( assert len(rpn_rois) == len(
im_scales), 'batch size of rpn_rois and ground_truth is not matched' im_info), 'batch size of rpn_rois and ground_truth is not matched'
for im_i in range(len(im_scales)): for im_i in range(len(im_info)):
frcn_blobs = _sample_rois( frcn_blobs = _sample_rois(
rpn_rois[im_i], gt_classes[im_i], gt_boxes[im_i], im_scales[im_i], rpn_rois[im_i], gt_classes[im_i], is_crowd[im_i], gt_boxes[im_i],
batch_size_per_im, fg_fraction, fg_thresh, bg_thresh_hi, im_info[im_i], batch_size_per_im, fg_fraction, fg_thresh,
bg_thresh_lo, bbox_reg_weights, class_nums) bg_thresh_hi, bg_thresh_lo, bbox_reg_weights, class_nums)
lod.append(frcn_blobs['rois'].shape[0]) lod.append(frcn_blobs['rois'].shape[0])
...@@ -50,13 +50,14 @@ def generate_proposal_labels_in_python( ...@@ -50,13 +50,14 @@ def generate_proposal_labels_in_python(
return rois, labels_int32, bbox_targets, bbox_inside_weights, bbox_outside_weights, lod return rois, labels_int32, bbox_targets, bbox_inside_weights, bbox_outside_weights, lod
def _sample_rois(rpn_rois, gt_classes, gt_boxes, im_scale, batch_size_per_im, def _sample_rois(rpn_rois, gt_classes, is_crowd, gt_boxes, im_info,
fg_fraction, fg_thresh, bg_thresh_hi, bg_thresh_lo, batch_size_per_im, fg_fraction, fg_thresh, bg_thresh_hi,
bbox_reg_weights, class_nums): bg_thresh_lo, bbox_reg_weights, class_nums):
rois_per_image = int(batch_size_per_im) rois_per_image = int(batch_size_per_im)
fg_rois_per_im = int(np.round(fg_fraction * rois_per_image)) fg_rois_per_im = int(np.round(fg_fraction * rois_per_image))
# Roidb # Roidb
im_scale = im_info[2]
inv_im_scale = 1. / im_scale inv_im_scale = 1. / im_scale
rpn_rois = rpn_rois * inv_im_scale rpn_rois = rpn_rois * inv_im_scale
...@@ -78,6 +79,9 @@ def _sample_rois(rpn_rois, gt_classes, gt_boxes, im_scale, batch_size_per_im, ...@@ -78,6 +79,9 @@ def _sample_rois(rpn_rois, gt_classes, gt_boxes, im_scale, batch_size_per_im,
box_to_gt_ind_map[overlapped_boxes_ind] = overlaps_argmax[ box_to_gt_ind_map[overlapped_boxes_ind] = overlaps_argmax[
overlapped_boxes_ind] overlapped_boxes_ind]
crowd_ind = np.where(is_crowd)[0]
gt_overlaps[crowd_ind] = -1
max_overlaps = gt_overlaps.max(axis=1) max_overlaps = gt_overlaps.max(axis=1)
max_classes = gt_overlaps.argmax(axis=1) max_classes = gt_overlaps.argmax(axis=1)
...@@ -85,9 +89,10 @@ def _sample_rois(rpn_rois, gt_classes, gt_boxes, im_scale, batch_size_per_im, ...@@ -85,9 +89,10 @@ def _sample_rois(rpn_rois, gt_classes, gt_boxes, im_scale, batch_size_per_im,
fg_inds = np.where(max_overlaps >= fg_thresh)[0] fg_inds = np.where(max_overlaps >= fg_thresh)[0]
fg_rois_per_this_image = np.minimum(fg_rois_per_im, fg_inds.shape[0]) fg_rois_per_this_image = np.minimum(fg_rois_per_im, fg_inds.shape[0])
# Sample foreground if there are too many # Sample foreground if there are too many
if fg_inds.shape[0] > fg_rois_per_this_image: # if fg_inds.shape[0] > fg_rois_per_this_image:
fg_inds = np.random.choice( # fg_inds = np.random.choice(
fg_inds, size=fg_rois_per_this_image, replace=False) # fg_inds, size=fg_rois_per_this_image, replace=False)
fg_inds = fg_inds[:fg_rois_per_this_image]
# Background # Background
bg_inds = np.where((max_overlaps < bg_thresh_hi) & (max_overlaps >= bg_inds = np.where((max_overlaps < bg_thresh_hi) & (max_overlaps >=
...@@ -96,9 +101,10 @@ def _sample_rois(rpn_rois, gt_classes, gt_boxes, im_scale, batch_size_per_im, ...@@ -96,9 +101,10 @@ def _sample_rois(rpn_rois, gt_classes, gt_boxes, im_scale, batch_size_per_im,
bg_rois_per_this_image = np.minimum(bg_rois_per_this_image, bg_rois_per_this_image = np.minimum(bg_rois_per_this_image,
bg_inds.shape[0]) bg_inds.shape[0])
# Sample background if there are too many # Sample background if there are too many
if bg_inds.shape[0] > bg_rois_per_this_image: # if bg_inds.shape[0] > bg_rois_per_this_image:
bg_inds = np.random.choice( # bg_inds = np.random.choice(
bg_inds, size=bg_rois_per_this_image, replace=False) # bg_inds, size=bg_rois_per_this_image, replace=False)
bg_inds = bg_inds[:bg_rois_per_this_image]
keep_inds = np.append(fg_inds, bg_inds) keep_inds = np.append(fg_inds, bg_inds)
sampled_labels = max_classes[keep_inds] sampled_labels = max_classes[keep_inds]
...@@ -208,8 +214,9 @@ class TestGenerateProposalLabelsOp(OpTest): ...@@ -208,8 +214,9 @@ class TestGenerateProposalLabelsOp(OpTest):
self.inputs = { self.inputs = {
'RpnRois': (self.rpn_rois[0], self.rpn_rois_lod), 'RpnRois': (self.rpn_rois[0], self.rpn_rois_lod),
'GtClasses': (self.gt_classes[0], self.gts_lod), 'GtClasses': (self.gt_classes[0], self.gts_lod),
'IsCrowd': (self.is_crowd[0], self.gts_lod),
'GtBoxes': (self.gt_boxes[0], self.gts_lod), 'GtBoxes': (self.gt_boxes[0], self.gts_lod),
'ImScales': self.im_scales[0] 'ImInfo': self.im_info
} }
self.attrs = { self.attrs = {
'batch_size_per_im': self.batch_size_per_im, 'batch_size_per_im': self.batch_size_per_im,
...@@ -218,14 +225,15 @@ class TestGenerateProposalLabelsOp(OpTest): ...@@ -218,14 +225,15 @@ class TestGenerateProposalLabelsOp(OpTest):
'bg_thresh_hi': self.bg_thresh_hi, 'bg_thresh_hi': self.bg_thresh_hi,
'bg_thresh_lo': self.bg_thresh_lo, 'bg_thresh_lo': self.bg_thresh_lo,
'bbox_reg_weights': self.bbox_reg_weights, 'bbox_reg_weights': self.bbox_reg_weights,
'class_nums': self.class_nums 'class_nums': self.class_nums,
'use_random': False
} }
self.outputs = { self.outputs = {
'Rois': (self.rois[0], [self.lod]), 'Rois': (self.rois, [self.lod]),
'LabelsInt32': (self.labels_int32[0], [self.lod]), 'LabelsInt32': (self.labels_int32, [self.lod]),
'BboxTargets': (self.bbox_targets[0], [self.lod]), 'BboxTargets': (self.bbox_targets, [self.lod]),
'BboxInsideWeights': (self.bbox_inside_weights[0], [self.lod]), 'BboxInsideWeights': (self.bbox_inside_weights, [self.lod]),
'BboxOutsideWeights': (self.bbox_outside_weights[0], [self.lod]), 'BboxOutsideWeights': (self.bbox_outside_weights, [self.lod]),
} }
def test_check_output(self): def test_check_output(self):
...@@ -236,8 +244,8 @@ class TestGenerateProposalLabelsOp(OpTest): ...@@ -236,8 +244,8 @@ class TestGenerateProposalLabelsOp(OpTest):
self.set_data() self.set_data()
def init_test_params(self): def init_test_params(self):
self.batch_size_per_im = 10 self.batch_size_per_im = 512
self.fg_fraction = 1.0 self.fg_fraction = 0.25
self.fg_thresh = 0.5 self.fg_thresh = 0.5
self.bg_thresh_hi = 0.5 self.bg_thresh_hi = 0.5
self.bg_thresh_lo = 0.0 self.bg_thresh_lo = 0.0
...@@ -246,14 +254,14 @@ class TestGenerateProposalLabelsOp(OpTest): ...@@ -246,14 +254,14 @@ class TestGenerateProposalLabelsOp(OpTest):
def init_test_input(self): def init_test_input(self):
np.random.seed(0) np.random.seed(0)
image_nums = 1
gt_nums = 6 # Keep same with batch_size_per_im for unittest gt_nums = 6 # Keep same with batch_size_per_im for unittest
proposal_nums = self.batch_size_per_im - gt_nums proposal_nums = 2000 #self.batch_size_per_im - gt_nums
images_shape = [] images_shape = [[64, 64]]
self.im_scales = [] self.im_info = np.ones((len(images_shape), 3)).astype(np.float32)
for i in range(image_nums): for i in range(len(images_shape)):
images_shape.append(np.random.randint(200, size=2)) self.im_info[i, 0] = images_shape[i][0]
self.im_scales.append(np.ones((1)).astype(np.float32)) self.im_info[i, 1] = images_shape[i][1]
self.im_info[i, 2] = 0.8 #scale
self.rpn_rois, self.rpn_rois_lod = _generate_proposals(images_shape, self.rpn_rois, self.rpn_rois_lod = _generate_proposals(images_shape,
proposal_nums) proposal_nums)
...@@ -261,16 +269,23 @@ class TestGenerateProposalLabelsOp(OpTest): ...@@ -261,16 +269,23 @@ class TestGenerateProposalLabelsOp(OpTest):
images_shape, self.class_nums, gt_nums) images_shape, self.class_nums, gt_nums)
self.gt_classes = [gt['gt_classes'] for gt in ground_truth] self.gt_classes = [gt['gt_classes'] for gt in ground_truth]
self.gt_boxes = [gt['boxes'] for gt in ground_truth] self.gt_boxes = [gt['boxes'] for gt in ground_truth]
self.is_crowd = [gt['is_crowd'] for gt in ground_truth]
def init_test_output(self): def init_test_output(self):
self.rois, self.labels_int32, self.bbox_targets, \ self.rois, self.labels_int32, self.bbox_targets, \
self.bbox_inside_weights, self.bbox_outside_weights, \ self.bbox_inside_weights, self.bbox_outside_weights, \
self.lod = generate_proposal_labels_in_python( self.lod = generate_proposal_labels_in_python(
self.rpn_rois, self.gt_classes, self.gt_boxes, self.im_scales, self.rpn_rois, self.gt_classes, self.is_crowd, self.gt_boxes, self.im_info,
self.batch_size_per_im, self.fg_fraction, self.batch_size_per_im, self.fg_fraction,
self.fg_thresh, self.bg_thresh_hi, self.bg_thresh_lo, self.fg_thresh, self.bg_thresh_hi, self.bg_thresh_lo,
self.bbox_reg_weights, self.class_nums self.bbox_reg_weights, self.class_nums
) )
self.rois = np.vstack(self.rois)
self.labels_int32 = np.hstack(self.labels_int32)
self.labels_int32 = self.labels_int32[:, np.newaxis]
self.bbox_targets = np.vstack(self.bbox_targets)
self.bbox_inside_weights = np.vstack(self.bbox_inside_weights)
self.bbox_outside_weights = np.vstack(self.bbox_outside_weights)
def _generate_proposals(images_shape, proposal_nums): def _generate_proposals(images_shape, proposal_nums):
...@@ -280,7 +295,7 @@ def _generate_proposals(images_shape, proposal_nums): ...@@ -280,7 +295,7 @@ def _generate_proposals(images_shape, proposal_nums):
for i, image_shape in enumerate(images_shape): for i, image_shape in enumerate(images_shape):
proposals = _generate_boxes(image_shape, proposal_nums) proposals = _generate_boxes(image_shape, proposal_nums)
rpn_rois.append(proposals) rpn_rois.append(proposals)
num_proposals += len(proposals) num_proposals = len(proposals)
rpn_rois_lod.append(num_proposals) rpn_rois_lod.append(num_proposals)
return rpn_rois, [rpn_rois_lod] return rpn_rois, [rpn_rois_lod]
...@@ -294,7 +309,11 @@ def _generate_groundtruth(images_shape, class_nums, gt_nums): ...@@ -294,7 +309,11 @@ def _generate_groundtruth(images_shape, class_nums, gt_nums):
gt_classes = np.random.randint( gt_classes = np.random.randint(
low=1, high=class_nums, size=gt_nums).astype(np.int32) low=1, high=class_nums, size=gt_nums).astype(np.int32)
gt_boxes = _generate_boxes(image_shape, gt_nums) gt_boxes = _generate_boxes(image_shape, gt_nums)
ground_truth.append(dict(gt_classes=gt_classes, boxes=gt_boxes)) is_crowd = np.zeros((gt_nums), dtype=np.int32)
is_crowd[0] = 1
ground_truth.append(
dict(
gt_classes=gt_classes, boxes=gt_boxes, is_crowd=is_crowd))
num_gts += len(gt_classes) num_gts += len(gt_classes)
gts_lod.append(num_gts) gts_lod.append(num_gts)
return ground_truth, [gts_lod] return ground_truth, [gts_lod]
......
...@@ -114,10 +114,10 @@ def box_coder(all_anchors, bbox_deltas, variances): ...@@ -114,10 +114,10 @@ def box_coder(all_anchors, bbox_deltas, variances):
#anchor_loc: width, height, center_x, center_y #anchor_loc: width, height, center_x, center_y
anchor_loc = np.zeros_like(bbox_deltas, dtype=np.float32) anchor_loc = np.zeros_like(bbox_deltas, dtype=np.float32)
anchor_loc[:, 0] = all_anchors[:, 2] - all_anchors[:, 0] anchor_loc[:, 0] = all_anchors[:, 2] - all_anchors[:, 0] + 1
anchor_loc[:, 1] = all_anchors[:, 3] - all_anchors[:, 1] anchor_loc[:, 1] = all_anchors[:, 3] - all_anchors[:, 1] + 1
anchor_loc[:, 2] = (all_anchors[:, 2] + all_anchors[:, 0]) / 2 anchor_loc[:, 2] = all_anchors[:, 0] + 0.5 * anchor_loc[:, 0]
anchor_loc[:, 3] = (all_anchors[:, 3] + all_anchors[:, 1]) / 2 anchor_loc[:, 3] = all_anchors[:, 1] + 0.5 * anchor_loc[:, 1]
#predicted bbox: bbox_center_x, bbox_center_y, bbox_width, bbox_height #predicted bbox: bbox_center_x, bbox_center_y, bbox_width, bbox_height
pred_bbox = np.zeros_like(bbox_deltas, dtype=np.float32) pred_bbox = np.zeros_like(bbox_deltas, dtype=np.float32)
...@@ -127,23 +127,29 @@ def box_coder(all_anchors, bbox_deltas, variances): ...@@ -127,23 +127,29 @@ def box_coder(all_anchors, bbox_deltas, variances):
i, 0] + anchor_loc[i, 2] i, 0] + anchor_loc[i, 2]
pred_bbox[i, 1] = variances[i, 1] * bbox_deltas[i, 1] * anchor_loc[ pred_bbox[i, 1] = variances[i, 1] * bbox_deltas[i, 1] * anchor_loc[
i, 1] + anchor_loc[i, 3] i, 1] + anchor_loc[i, 3]
pred_bbox[i, 2] = math.exp(variances[i, 2] * pred_bbox[i, 2] = math.exp(
bbox_deltas[i, 2]) * anchor_loc[i, 0] min(variances[i, 2] * bbox_deltas[i, 2], math.log(
pred_bbox[i, 3] = math.exp(variances[i, 3] * 1000 / 16.0))) * anchor_loc[i, 0]
bbox_deltas[i, 3]) * anchor_loc[i, 1] pred_bbox[i, 3] = math.exp(
min(variances[i, 3] * bbox_deltas[i, 3], math.log(
1000 / 16.0))) * anchor_loc[i, 1]
else: else:
for i in range(bbox_deltas.shape[0]): for i in range(bbox_deltas.shape[0]):
pred_bbox[i, 0] = bbox_deltas[i, 0] * anchor_loc[i, 0] + anchor_loc[ pred_bbox[i, 0] = bbox_deltas[i, 0] * anchor_loc[i, 0] + anchor_loc[
i, 2] i, 2]
pred_bbox[i, 1] = bbox_deltas[i, 1] * anchor_loc[i, 1] + anchor_loc[ pred_bbox[i, 1] = bbox_deltas[i, 1] * anchor_loc[i, 1] + anchor_loc[
i, 3] i, 3]
pred_bbox[i, 2] = math.exp(bbox_deltas[i, 2]) * anchor_loc[i, 0] pred_bbox[i, 2] = math.exp(
pred_bbox[i, 3] = math.exp(bbox_deltas[i, 3]) * anchor_loc[i, 1] min(bbox_deltas[i, 2], math.log(1000 / 16.0))) * anchor_loc[i,
0]
pred_bbox[i, 3] = math.exp(
min(bbox_deltas[i, 3], math.log(1000 / 16.0))) * anchor_loc[i,
1]
proposals[:, 0] = pred_bbox[:, 0] - pred_bbox[:, 2] / 2 proposals[:, 0] = pred_bbox[:, 0] - pred_bbox[:, 2] / 2
proposals[:, 1] = pred_bbox[:, 1] - pred_bbox[:, 3] / 2 proposals[:, 1] = pred_bbox[:, 1] - pred_bbox[:, 3] / 2
proposals[:, 2] = pred_bbox[:, 0] + pred_bbox[:, 2] / 2 proposals[:, 2] = pred_bbox[:, 0] + pred_bbox[:, 2] / 2 - 1
proposals[:, 3] = pred_bbox[:, 1] + pred_bbox[:, 3] / 2 proposals[:, 3] = pred_bbox[:, 1] + pred_bbox[:, 3] / 2 - 1
return proposals return proposals
...@@ -170,13 +176,16 @@ def filter_boxes(boxes, min_size, im_info): ...@@ -170,13 +176,16 @@ def filter_boxes(boxes, min_size, im_info):
"""Only keep boxes with both sides >= min_size and center within the image. """Only keep boxes with both sides >= min_size and center within the image.
""" """
# Scale min_size to match image scale # Scale min_size to match image scale
min_size *= im_info[2] im_scale = im_info[2]
min_size = max(min_size, 1.0)
ws = boxes[:, 2] - boxes[:, 0] + 1 ws = boxes[:, 2] - boxes[:, 0] + 1
hs = boxes[:, 3] - boxes[:, 1] + 1 hs = boxes[:, 3] - boxes[:, 1] + 1
ws_orig_scale = (boxes[:, 2] - boxes[:, 0]) / im_scale + 1
hs_orig_scale = (boxes[:, 3] - boxes[:, 1]) / im_scale + 1
x_ctr = boxes[:, 0] + ws / 2. x_ctr = boxes[:, 0] + ws / 2.
y_ctr = boxes[:, 1] + hs / 2. y_ctr = boxes[:, 1] + hs / 2.
keep = np.where((ws >= min_size) & (hs >= min_size) & (x_ctr < im_info[1]) & keep = np.where((ws_orig_scale >= min_size) & (hs_orig_scale >= min_size) &
(y_ctr < im_info[0]))[0] (x_ctr < im_info[1]) & (y_ctr < im_info[0]))[0]
return keep return keep
...@@ -204,7 +213,7 @@ def iou(box_a, box_b): ...@@ -204,7 +213,7 @@ def iou(box_a, box_b):
xb = min(xmax_a, xmax_b) xb = min(xmax_a, xmax_b)
yb = min(ymax_a, ymax_b) yb = min(ymax_a, ymax_b)
inter_area = max(xb - xa, 0.0) * max(yb - ya, 0.0) inter_area = max(xb - xa + 1, 0.0) * max(yb - ya + 1, 0.0)
iou_ratio = inter_area / (area_a + area_b - inter_area) iou_ratio = inter_area / (area_a + area_b - inter_area)
......
...@@ -556,6 +556,15 @@ class TestBook(unittest.TestCase): ...@@ -556,6 +556,15 @@ class TestBook(unittest.TestCase):
out = layers.sequence_enumerate(input=x, win_size=2, pad_value=0) out = layers.sequence_enumerate(input=x, win_size=2, pad_value=0)
print(str(program)) print(str(program))
def test_cross_entropy(self):
program = Program()
with program_guard(program):
x = layers.data(name="x", shape=[30, 10], dtype="float32")
label = layers.data(name="label", shape=[30, 1], dtype="int32")
mode = 'channel'
out = layers.cross_entropy(x, label, False, 4)
self.assertIsNotNone(out)
if __name__ == '__main__': if __name__ == '__main__':
unittest.main() unittest.main()
...@@ -63,5 +63,27 @@ class TestNormOp3(TestNormOp): ...@@ -63,5 +63,27 @@ class TestNormOp3(TestNormOp):
self.epsilon = 1e-8 self.epsilon = 1e-8
class TestNormOp4(TestNormOp):
def init_test_case(self):
self.shape = [128, 1024, 14, 14]
self.axis = 2
self.epsilon = 1e-8
def test_check_grad(self):
# since the gradient check is very slow in large shape, so skip check_grad
pass
class TestNormOp5(TestNormOp):
def init_test_case(self):
self.shape = [2048, 2048]
self.axis = 1
self.epsilon = 1e-8
def test_check_grad(self):
# since the gradient check is very slow in large shape, so skip check_grad
pass
if __name__ == '__main__': if __name__ == '__main__':
unittest.main() unittest.main()
...@@ -20,6 +20,7 @@ import numpy as np ...@@ -20,6 +20,7 @@ import numpy as np
from parallel_executor_test_base import TestParallelExecutorBase from parallel_executor_test_base import TestParallelExecutorBase
import unittest import unittest
import paddle import paddle
import paddle.fluid.core as core
import paddle.dataset.wmt16 as wmt16 import paddle.dataset.wmt16 as wmt16
import os import os
...@@ -170,7 +171,8 @@ class TestTransformer(TestParallelExecutorBase): ...@@ -170,7 +171,8 @@ class TestTransformer(TestParallelExecutorBase):
writer.complete_append_tensor() writer.complete_append_tensor()
def test_main(self): def test_main(self):
self.check_network_convergence(transformer, use_cuda=True) if core.is_compiled_with_cuda():
self.check_network_convergence(transformer, use_cuda=True)
self.check_network_convergence(transformer, use_cuda=False, iter=5) self.check_network_convergence(transformer, use_cuda=False, iter=5)
......
...@@ -96,7 +96,8 @@ class TestPyReaderUsingExecutor(unittest.TestCase): ...@@ -96,7 +96,8 @@ class TestPyReaderUsingExecutor(unittest.TestCase):
self.queue_capacity = 50 self.queue_capacity = 50
def test(self): def test(self):
for use_cuda in [False, True]: for use_cuda in ([False, True]
if core.is_compiled_with_cuda() else [False]):
for use_parallel_executor in [False, True]: for use_parallel_executor in [False, True]:
for use_double_buffer in [False, True]: for use_double_buffer in [False, True]:
print('Test Parameters:'), print('Test Parameters:'),
......
...@@ -13,7 +13,7 @@ ...@@ -13,7 +13,7 @@
# limitations under the License. # limitations under the License.
from __future__ import print_function from __future__ import print_function
import os
import paddle.fluid as fluid import paddle.fluid as fluid
import paddle import paddle
import numpy as np import numpy as np
...@@ -41,6 +41,8 @@ class TestReaderReset(unittest.TestCase): ...@@ -41,6 +41,8 @@ class TestReaderReset(unittest.TestCase):
self.data_file_name, reader, feeder) self.data_file_name, reader, feeder)
def setUp(self): def setUp(self):
# set parallel threads to fit 20 batches in line 49
os.environ['CPU_NUM'] = str(20)
self.use_cuda = fluid.core.is_compiled_with_cuda() self.use_cuda = fluid.core.is_compiled_with_cuda()
self.data_file_name = './reader_reset_test.recordio' self.data_file_name = './reader_reset_test.recordio'
self.ins_shape = [3] self.ins_shape = [3]
......
...@@ -19,48 +19,58 @@ import numpy as np ...@@ -19,48 +19,58 @@ import numpy as np
import paddle.fluid.core as core import paddle.fluid.core as core
from op_test import OpTest from op_test import OpTest
from test_anchor_generator_op import anchor_generator_in_python from test_anchor_generator_op import anchor_generator_in_python
from test_generate_proposal_labels import _generate_groundtruth from test_generate_proposal_labels_op import _generate_groundtruth
from test_generate_proposal_labels import _bbox_overlaps, _box_to_delta from test_generate_proposal_labels_op import _bbox_overlaps, _box_to_delta
def rpn_target_assign(gt_anchor_iou, rpn_batch_size_per_im, def rpn_target_assign(anchor_by_gt_overlap,
rpn_positive_overlap, rpn_negative_overlap, fg_fraction): rpn_batch_size_per_im,
iou = np.transpose(gt_anchor_iou) rpn_positive_overlap,
anchor_to_gt_max = iou.max(axis=1) rpn_negative_overlap,
anchor_to_gt_argmax = iou.argmax(axis=1) rpn_fg_fraction,
use_random=True):
gt_to_anchor_argmax = iou.argmax(axis=0) anchor_to_gt_argmax = anchor_by_gt_overlap.argmax(axis=1)
gt_to_anchor_max = iou[gt_to_anchor_argmax, np.arange(iou.shape[1])] anchor_to_gt_max = anchor_by_gt_overlap[np.arange(
anchors_with_max_overlap = np.where(iou == gt_to_anchor_max)[0] anchor_by_gt_overlap.shape[0]), anchor_to_gt_argmax]
tgt_lbl = np.ones((iou.shape[0], ), dtype=np.int32) * -1 gt_to_anchor_argmax = anchor_by_gt_overlap.argmax(axis=0)
tgt_lbl[anchors_with_max_overlap] = 1 gt_to_anchor_max = anchor_by_gt_overlap[gt_to_anchor_argmax, np.arange(
tgt_lbl[anchor_to_gt_max >= rpn_positive_overlap] = 1 anchor_by_gt_overlap.shape[1])]
anchors_with_max_overlap = np.where(
num_fg = int(fg_fraction * rpn_batch_size_per_im) anchor_by_gt_overlap == gt_to_anchor_max)[0]
fg_inds = np.where(tgt_lbl == 1)[0]
if len(fg_inds) > num_fg: labels = np.ones((anchor_by_gt_overlap.shape[0], ), dtype=np.int32) * -1
labels[anchors_with_max_overlap] = 1
labels[anchor_to_gt_max >= rpn_positive_overlap] = 1
num_fg = int(rpn_fg_fraction * rpn_batch_size_per_im)
fg_inds = np.where(labels == 1)[0]
if len(fg_inds) > num_fg and use_random:
disable_inds = np.random.choice( disable_inds = np.random.choice(
fg_inds, size=(len(fg_inds) - num_fg), replace=False) fg_inds, size=(len(fg_inds) - num_fg), replace=False)
tgt_lbl[disable_inds] = -1 else:
fg_inds = np.where(tgt_lbl == 1)[0] disable_inds = fg_inds[num_fg:]
labels[disable_inds] = -1
fg_inds = np.where(labels == 1)[0]
num_bg = rpn_batch_size_per_im - np.sum(tgt_lbl == 1) num_bg = rpn_batch_size_per_im - np.sum(labels == 1)
bg_inds = np.where(anchor_to_gt_max < rpn_negative_overlap)[0] bg_inds = np.where(anchor_to_gt_max < rpn_negative_overlap)[0]
tgt_lbl[bg_inds] = 0 if len(bg_inds) > num_bg and use_random:
if len(bg_inds) > num_bg:
enable_inds = bg_inds[np.random.randint(len(bg_inds), size=num_bg)] enable_inds = bg_inds[np.random.randint(len(bg_inds), size=num_bg)]
tgt_lbl[enable_inds] = 0 else:
bg_inds = np.where(tgt_lbl == 0)[0] enable_inds = bg_inds[:num_bg]
tgt_lbl[bg_inds] = 0 labels[enable_inds] = 0
fg_inds = np.where(labels == 1)[0]
bg_inds = np.where(labels == 0)[0]
loc_index = fg_inds loc_index = fg_inds
score_index = np.hstack((fg_inds, bg_inds)) score_index = np.hstack((fg_inds, bg_inds))
tgt_lbl = np.expand_dims(tgt_lbl, axis=1) labels = labels[score_index]
assert not np.any(labels == -1), "Wrong labels with -1"
gt_inds = anchor_to_gt_argmax[fg_inds] gt_inds = anchor_to_gt_argmax[fg_inds]
return loc_index, score_index, tgt_lbl, gt_inds return loc_index, score_index, labels, gt_inds
def get_anchor(n, c, h, w): def get_anchor(n, c, h, w):
...@@ -75,85 +85,129 @@ def get_anchor(n, c, h, w): ...@@ -75,85 +85,129 @@ def get_anchor(n, c, h, w):
return anchors return anchors
def rpn_blob(anchor, gt_boxes, iou, lod, rpn_batch_size_per_im, def rpn_target_assign_in_python(all_anchors,
rpn_positive_overlap, rpn_negative_overlap, fg_fraction): gt_boxes,
is_crowd,
loc_indexes = [] im_info,
score_indexes = [] lod,
tmp_tgt_labels = [] rpn_straddle_thresh,
tgt_bboxes = [] rpn_batch_size_per_im,
anchor_num = anchor.shape[0] rpn_positive_overlap,
rpn_negative_overlap,
rpn_fg_fraction,
use_random=True):
anchor_num = all_anchors.shape[0]
batch_size = len(lod) - 1 batch_size = len(lod) - 1
for i in range(batch_size): for i in range(batch_size):
im_height = im_info[i][0]
im_width = im_info[i][1]
im_scale = im_info[i][2]
if rpn_straddle_thresh >= 0:
# Only keep anchors inside the image by a margin of straddle_thresh
inds_inside = np.where(
(all_anchors[:, 0] >= -rpn_straddle_thresh) &
(all_anchors[:, 1] >= -rpn_straddle_thresh) & (
all_anchors[:, 2] < im_width + rpn_straddle_thresh) & (
all_anchors[:, 3] < im_height + rpn_straddle_thresh))[0]
# keep only inside anchors
inside_anchors = all_anchors[inds_inside, :]
else:
inds_inside = np.arange(all_anchors.shape[0])
inside_anchors = all_anchors
b, e = lod[i], lod[i + 1] b, e = lod[i], lod[i + 1]
iou_slice = iou[b:e, :] gt_boxes_slice = gt_boxes[b:e, :] * im_scale
bboxes_slice = gt_boxes[b:e, :] is_crowd_slice = is_crowd[b:e]
loc_idx, score_idx, tgt_lbl, gt_inds = rpn_target_assign( not_crowd_inds = np.where(is_crowd_slice == 0)[0]
iou_slice, rpn_batch_size_per_im, rpn_positive_overlap, gt_boxes_slice = gt_boxes_slice[not_crowd_inds]
rpn_negative_overlap, fg_fraction) iou = _bbox_overlaps(inside_anchors, gt_boxes_slice)
fg_bboxes = bboxes_slice[gt_inds] loc_inds, score_inds, labels, gt_inds = rpn_target_assign(
fg_anchors = anchor[loc_idx] iou, rpn_batch_size_per_im, rpn_positive_overlap,
box_deltas = _box_to_delta(fg_anchors, fg_bboxes, [1., 1., 1., 1.]) rpn_negative_overlap, rpn_fg_fraction, use_random)
# unmap to all anchor
loc_inds = inds_inside[loc_inds]
score_inds = inds_inside[score_inds]
sampled_gt = gt_boxes_slice[gt_inds]
sampled_anchor = all_anchors[loc_inds]
box_deltas = _box_to_delta(sampled_anchor, sampled_gt, [1., 1., 1., 1.])
if i == 0: if i == 0:
loc_indexes = loc_idx loc_indexes = loc_inds
score_indexes = score_idx score_indexes = score_inds
tmp_tgt_labels = tgt_lbl tgt_labels = labels
tgt_bboxes = box_deltas tgt_bboxes = box_deltas
else: else:
loc_indexes = np.concatenate( loc_indexes = np.concatenate(
[loc_indexes, loc_idx + i * anchor_num]) [loc_indexes, loc_inds + i * anchor_num])
score_indexes = np.concatenate( score_indexes = np.concatenate(
[score_indexes, score_idx + i * anchor_num]) [score_indexes, score_inds + i * anchor_num])
tmp_tgt_labels = np.concatenate([tmp_tgt_labels, tgt_lbl]) tgt_labels = np.concatenate([tgt_labels, labels])
tgt_bboxes = np.vstack([tgt_bboxes, box_deltas]) tgt_bboxes = np.vstack([tgt_bboxes, box_deltas])
tgt_labels = tmp_tgt_labels[score_indexes]
return loc_indexes, score_indexes, tgt_bboxes, tgt_labels return loc_indexes, score_indexes, tgt_bboxes, tgt_labels
class TestRpnTargetAssignOp(OpTest): class TestRpnTargetAssignOp(OpTest):
def setUp(self): def setUp(self):
n, c, h, w = 2, 4, 14, 14 n, c, h, w = 2, 4, 14, 14
anchor = get_anchor(n, c, h, w) all_anchors = get_anchor(n, c, h, w)
gt_num = 10 gt_num = 10
anchor = anchor.reshape(-1, 4) all_anchors = all_anchors.reshape(-1, 4)
anchor_num = anchor.shape[0] anchor_num = all_anchors.shape[0]
im_shapes = [[64, 64], [64, 64]] images_shape = [[64, 64], [64, 64]]
gt_box, lod = _generate_groundtruth(im_shapes, 3, 4) #images_shape = [[64, 64]]
bbox = np.vstack([v['boxes'] for v in gt_box]) groundtruth, lod = _generate_groundtruth(images_shape, 3, 4)
lod = [0, 4, 8]
iou = _bbox_overlaps(bbox, anchor) #lod = [0, 4]
anchor = anchor.astype('float32') im_info = np.ones((len(images_shape), 3)).astype(np.float32)
bbox = bbox.astype('float32') for i in range(len(images_shape)):
iou = iou.astype('float32') im_info[i, 0] = images_shape[i][0]
im_info[i, 1] = images_shape[i][1]
loc_index, score_index, tgt_bbox, tgt_lbl = rpn_blob( im_info[i, 2] = 0.8 #scale
anchor, bbox, iou, [0, 4, 8], 25600, 0.95, 0.03, 0.25) gt_boxes = np.vstack([v['boxes'] for v in groundtruth])
is_crowd = np.hstack([v['is_crowd'] for v in groundtruth])
all_anchors = all_anchors.astype('float32')
gt_boxes = gt_boxes.astype('float32')
rpn_straddle_thresh = 0.0
rpn_batch_size_per_im = 256
rpn_positive_overlap = 0.7
rpn_negative_overlap = 0.3
rpn_fg_fraction = 0.5
use_random = False
loc_index, score_index, tgt_bbox, labels = rpn_target_assign_in_python(
all_anchors, gt_boxes, is_crowd, im_info, lod, rpn_straddle_thresh,
rpn_batch_size_per_im, rpn_positive_overlap, rpn_negative_overlap,
rpn_fg_fraction, use_random)
labels = labels[:, np.newaxis]
self.op_type = "rpn_target_assign" self.op_type = "rpn_target_assign"
self.inputs = { self.inputs = {
'Anchor': anchor, 'Anchor': all_anchors,
'GtBox': (bbox, [[4, 4]]), 'GtBoxes': (gt_boxes, [[4, 4]]),
'DistMat': (iou, [[4, 4]]), 'IsCrowd': (is_crowd, [[4, 4]]),
'ImInfo': (im_info, [[1, 1]])
} }
self.attrs = { self.attrs = {
'rpn_batch_size_per_im': 25600, 'rpn_batch_size_per_im': rpn_batch_size_per_im,
'rpn_positive_overlap': 0.95, 'rpn_straddle_thresh': rpn_straddle_thresh,
'rpn_negative_overlap': 0.03, 'rpn_positive_overlap': rpn_positive_overlap,
'fg_fraction': 0.25, 'rpn_negative_overlap': rpn_negative_overlap,
'fix_seed': True 'rpn_fg_fraction': rpn_fg_fraction,
'use_random': use_random
} }
self.outputs = { self.outputs = {
'LocationIndex': loc_index.astype('int32'), 'LocationIndex': loc_index.astype('int32'),
'ScoreIndex': score_index.astype('int32'), 'ScoreIndex': score_index.astype('int32'),
'TargetBBox': tgt_bbox.astype('float32'), 'TargetBBox': tgt_bbox.astype('float32'),
'TargetLabel': tgt_lbl.astype('int64'), 'TargetLabel': labels.astype('int32')
} }
def test_check_output(self): def test_check_output(self):
......
...@@ -88,5 +88,40 @@ class TestSoftmaxWithCrossEntropyOp2(OpTest): ...@@ -88,5 +88,40 @@ class TestSoftmaxWithCrossEntropyOp2(OpTest):
self.check_grad(["Logits"], "Loss") self.check_grad(["Logits"], "Loss")
class TestSoftmaxWithCrossEntropyOp3(OpTest):
    """
    Check the softmax_with_cross_entropy operator when an ignore_index
    attribute is set: rows whose label equals ignore_index are expected
    to have a loss of 0.
    """

    def setUp(self):
        self.op_type = "softmax_with_cross_entropy"
        batch_size = 41
        class_num = 37
        ignore_index = 7

        # Random draws stay in the same order as before: logits, then labels.
        logits = np.random.uniform(
            0.1, 1.0, [batch_size, class_num]).astype("float64")
        softmax = np.apply_along_axis(stable_softmax, 1, logits)
        labels = np.random.randint(
            0, class_num, [batch_size, 1], dtype="int64")

        # Reference loss: -log(p[label]) per row, or 0 for ignored labels.
        loss_rows = []
        for row in range(softmax.shape[0]):
            label = labels[row][0]
            if label != ignore_index:
                loss_rows.append([-np.log(softmax[row][label])])
            else:
                loss_rows.append([0])
        cross_entropy = np.asmatrix(loss_rows, dtype="float64")

        self.inputs = {"Logits": logits, "Label": labels}
        self.outputs = {
            "Softmax": softmax.astype("float64"),
            "Loss": cross_entropy.astype("float64")
        }
        self.attrs = {"ignore_index": ignore_index}

    def test_check_output(self):
        self.check_output()

    def test_check_grad(self):
        self.check_grad(["Logits"], "Loss")
if __name__ == "__main__": if __name__ == "__main__":
unittest.main() unittest.main()
...@@ -16,3 +16,4 @@ from __future__ import print_function ...@@ -16,3 +16,4 @@ from __future__ import print_function
from .program_utils import * from .program_utils import *
from .ufind import * from .ufind import *
from .checkport import *
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import sys
import time
import socket
from contextlib import closing
def wait_server_ready(endpoints):
    """
    Wait until parameter servers are ready, use connect_ex to detect
    port readiness.

    Every endpoint is probed with a TCP connect (2 second timeout). If any
    probe fails, a message is written to stderr and the whole list is
    probed again after 3 seconds. The call only returns once every
    endpoint accepted a connection in the same round, so it blocks
    indefinitely while a server stays unreachable.

    Args:
        endpoints (list): endpoints string list, like:
                         ["127.0.0.1:8080", "127.0.0.1:8081"]

    Examples:
        .. code-block:: python

             wait_server_ready(["127.0.0.1:8080", "127.0.0.1:8081"])
    """
    while True:
        all_ok = True
        for ep in endpoints:
            ip_port = ep.split(":")
            with closing(socket.socket(socket.AF_INET,
                                       socket.SOCK_STREAM)) as sock:
                sock.settimeout(2)
                result = sock.connect_ex((ip_port[0], int(ip_port[1])))
                if result != 0:
                    all_ok = False
            if not all_ok:
                # The round has already failed; probing the remaining
                # endpoints would only add up to 2 seconds each before the
                # retry that is going to happen anyway.
                break
        if not all_ok:
            sys.stderr.write("pserver not ready, wait 3 sec to retry...\n")
            sys.stderr.flush()
            time.sleep(3)
        else:
            break
...@@ -381,7 +381,7 @@ class DistributeTranspiler(object): ...@@ -381,7 +381,7 @@ class DistributeTranspiler(object):
pserver_endpoints) pserver_endpoints)
self._split_table_grad_and_add_send_vars(program, pserver_endpoints) self._split_table_grad_and_add_send_vars(program, pserver_endpoints)
def get_trainer_program(self): def get_trainer_program(self, wait_port=True):
""" """
Get transpiled trainer side program. Get transpiled trainer side program.
...@@ -393,6 +393,9 @@ class DistributeTranspiler(object): ...@@ -393,6 +393,9 @@ class DistributeTranspiler(object):
delete_ops(self.origin_program.global_block(), self.optimize_ops) delete_ops(self.origin_program.global_block(), self.optimize_ops)
self.origin_program.__str__() self.origin_program.__str__()
if wait_port:
wait_server_ready(self.pserver_endpoints)
return self.origin_program return self.origin_program
def _get_trainer_startup_program(self, recv_vars, eplist): def _get_trainer_startup_program(self, recv_vars, eplist):
......
...@@ -60,13 +60,81 @@ class InferenceTranspiler(object): ...@@ -60,13 +60,81 @@ class InferenceTranspiler(object):
if not isinstance(scope, core.Scope): if not isinstance(scope, core.Scope):
raise TypeError("scope should be as Scope type or None") raise TypeError("scope should be as Scope type or None")
use_mkldnn = bool(os.getenv("FLAGS_use_mkldnn", False)) use_mkldnn = bool(os.getenv("FLAGS_use_mkldnn", False))
self._fuse_batch_norm(program, place, scope)
if use_mkldnn: if use_mkldnn:
self._fuse_relu_mkldnn(program)
self._fuse_conv_bias_mkldnn(program) self._fuse_conv_bias_mkldnn(program)
else: self._fuse_conv_relu_mkldnn(program)
self._fuse_batch_norm(program, place, scope) self._fuse_conv_eltwise_mkldnn(program)
self._fuse_conv_relu_mkldnn(
program) # ResNet residual block merging
self._fuse_bn_relu_mkldnn(program)
def _fuse_conv_eltwise_mkldnn(self, program):
    '''
    Transpile the program fusing elementwise_add into conv for MKLDNN
    program. Elementwise add following convolution OP can be fused by adding
    'fuse_eltwise' attribute to convolution OP and replacing its output
    Tensor with second parameter of elementwise_add.
    The result of fuse is:
        - before:
            - conv->elementwise_add->any_other_op
        - after:
            - conv->any_other_op
    :param program: program to transpile
    :type program: Program
    '''
    self.block = program.block(0)

    i = 0
    # Stop one op before the end: the last op has no successor to fuse
    # with, and looking at ops[i + 1] for a trailing conv2d would raise
    # IndexError.
    while i < len(self.block.ops) - 1:
        current_op = self.block.ops[i]
        if current_op.type in ['conv2d']:
            next_op = self.block.ops[i + 1]
            if next_op.type == 'elementwise_add':
                self._fuse_conv_eltwise(current_op, next_op)
                self.block._remove_op(i + 1)  # Remove elementwise_add
        i = i + 1
    self._adjust_input()
    self._remove_unused_var()
    # TODO(luotao): use clone() method to flush the program.desc in force,
    # since some large program.desc will not be flushed immediately.
    # And a better solution will be considered later.
    program = program.clone()
def _fuse_conv_relu_mkldnn(self, program):
    '''
    Transpile the program by fused relu activation for MKLDNN program.
    Relu activation following convolution OP can be fused by adding
    'fuse_relu' attribute to convolution OP.
    The result of fuse is:
        - before:
            - conv->relu->any_other_op
        - after:
            - conv->any_other_op
    :param program: program to transpile
    :type program: Program
    '''
    self.block = program.block(0)

    i = 0
    # Stop one op before the end: the last op has no successor to fuse
    # with, and looking at ops[i + 1] for a trailing conv2d would raise
    # IndexError.
    while i < len(self.block.ops) - 1:
        current_op = self.block.ops[i]
        if current_op.type in ['conv2d']:
            next_op = self.block.ops[i + 1]
            if next_op.type == 'relu':
                # modify conv OP to include relu
                current_op.set_attr("fuse_relu", True)
                # remove relu OP
                self.block._remove_op(i + 1)
        i = i + 1

    # TODO(luotao): use clone() method to flush the program.desc in force,
    # since some large program.desc will not be flushed immediately.
    # And a better solution will be considered later.
    program = program.clone()
def _fuse_bn_relu_mkldnn(self, program):
''' '''
Transpile the program by fused relu activation for MKLDNN program. Transpile the program by fused relu activation for MKLDNN program.
...@@ -160,7 +228,6 @@ class InferenceTranspiler(object): ...@@ -160,7 +228,6 @@ class InferenceTranspiler(object):
self._fuse_conv_bias(i, current_op, next_op) self._fuse_conv_bias(i, current_op, next_op)
self.block._remove_op(i + 1) # Remove old conv self.block._remove_op(i + 1) # Remove old conv
self.block._remove_op(i + 1) # Remove elementwise_add self.block._remove_op(i + 1) # Remove elementwise_add
i = i + 1
i = i + 1 i = i + 1
self._remove_unused_var() self._remove_unused_var()
...@@ -377,6 +444,20 @@ class InferenceTranspiler(object): ...@@ -377,6 +444,20 @@ class InferenceTranspiler(object):
outputs={"Output": out_var}, outputs={"Output": out_var},
attrs=attrs) attrs=attrs)
def _fuse_conv_eltwise(self, conv_op, eltwise_op):
    '''
    Mark a conv op as fused with the elementwise_add that follows it.
    Sets the 'fuse_eltwise' attribute on the conv op and records in
    self.input_map that both the conv "Output" name and the
    elementwise_add "Out" name map to the elementwise_add "Y" input name.
    :param conv_op: convolution operator
    :type conv_op: Operator
    :param eltwise_op: operator adding data from skip connection
    :type eltwise_op: Operator
    '''
    conv_op.set_attr("fuse_eltwise", True)

    # Name of the second elementwise_add operand; both outputs map to it.
    residual_name = eltwise_op.input("Y")[0]

    conv_output_name = conv_op.output("Output")[0]
    self.input_map[conv_output_name] = residual_name

    eltwise_output_name = eltwise_op.output("Out")[0]
    self.input_map[eltwise_output_name] = residual_name
def _adjust_input(self): def _adjust_input(self):
for i in range(len(self.block.ops)): for i in range(len(self.block.ops)):
current_op = self.block.ops[i] current_op = self.block.ops[i]
......
...@@ -14,11 +14,14 @@ ...@@ -14,11 +14,14 @@
__all__ = [ __all__ = [
'map_readers', 'buffered', 'compose', 'chain', 'shuffle', 'map_readers', 'buffered', 'compose', 'chain', 'shuffle',
'ComposeNotAligned', 'firstn', 'xmap_readers', 'PipeReader' 'ComposeNotAligned', 'firstn', 'xmap_readers', 'PipeReader',
'multiprocess_reader'
] ]
from threading import Thread from threading import Thread
import subprocess import subprocess
import multiprocessing
import sys
from six.moves.queue import Queue from six.moves.queue import Queue
from six.moves import zip_longest from six.moves import zip_longest
...@@ -332,6 +335,100 @@ def xmap_readers(mapper, reader, process_num, buffer_size, order=False): ...@@ -332,6 +335,100 @@ def xmap_readers(mapper, reader, process_num, buffer_size, order=False):
return xreader return xreader
def multiprocess_reader(readers, use_pipe=True, queue_size=1000):
    """
    multiprocess_reader uses python multi process to read data from readers
    and then uses multiprocessing.Queue or multiprocessing.Pipe to merge all
    data. The process number is equal to the number of input readers, each
    process calls one reader.

    multiprocessing.Queue requires the rw access right to /dev/shm, which
    some platforms do not support.

    You need to create multiple readers first. These readers should be
    independent of each other so that each process can work independently.

    If a worker process fails (its reader raises, or it yields None), the
    returned reader raises ValueError in the consuming process instead of
    waiting forever for an end marker that will never arrive. Worker
    processes are joined when the data is exhausted, and terminated when
    the consumer stops early or an error occurs.

    Args:
        readers (list): non-empty list of reader creators, one per process.
        use_pipe (bool): merge through one Pipe per reader when True,
            through a single shared Queue otherwise.
        queue_size (int): capacity of the Queue; only used when use_pipe
            is False.

    Returns:
        callable: a reader creator yielding the samples of all readers.

    An example:

    .. code-block:: python

        reader0 = reader(["file01", "file02"])
        reader1 = reader(["file11", "file12"])
        reader2 = reader(["file21", "file22"])
        reader = multiprocess_reader([reader0, reader1, reader2],
            queue_size=100, use_pipe=False)
    """

    try:
        import ujson as json
    except Exception as e:
        sys.stderr.write("import ujson error: " + str(e) + " use json\n")
        import json

    assert type(readers) is list and len(readers) > 0

    # Tags for the messages travelling through the Queue. Every message is
    # a (tag, payload) pair, so no sample value can be mistaken for a
    # control message.
    _SAMPLE, _DONE, _ERROR = 0, 1, 2

    def _stop_workers(workers, finished):
        # After a clean run the workers are exiting on their own; otherwise
        # they may be blocked on a full queue/pipe and must be terminated.
        for worker in workers:
            if not finished:
                worker.terminate()
            worker.join()

    def _read_into_queue(reader, queue):
        try:
            for sample in reader():
                if sample is None:
                    raise ValueError("sample has None")
                queue.put((_SAMPLE, sample))
        except Exception:
            # Tell the consumer about the failure before dying, otherwise
            # it would block on queue.get() forever.
            queue.put((_ERROR, None))
            raise
        queue.put((_DONE, None))

    def queue_reader():
        queue = multiprocessing.Queue(queue_size)
        workers = []
        finished = False
        try:
            for reader in readers:
                p = multiprocessing.Process(
                    target=_read_into_queue, args=(reader, queue))
                p.start()
                workers.append(p)

            finish_num = 0
            while finish_num < len(workers):
                tag, sample = queue.get()
                if tag == _SAMPLE:
                    yield sample
                elif tag == _DONE:
                    finish_num += 1
                else:
                    raise ValueError(
                        "multiprocess reader raises an exception")
            finished = True
        finally:
            _stop_workers(workers, finished)

    def _read_into_pipe(reader, conn):
        try:
            for sample in reader():
                if sample is None:
                    raise ValueError("sample has None!")
                conn.send(json.dumps(sample))
            conn.send(json.dumps(None))
        except Exception:
            # A raw None (not a JSON string) marks a failed worker; every
            # regular message is a JSON-encoded string.
            conn.send(None)
            raise
        finally:
            conn.close()

    def pipe_reader():
        conns = []
        workers = []
        finished = False
        try:
            for reader in readers:
                parent_conn, child_conn = multiprocessing.Pipe()
                conns.append(parent_conn)
                p = multiprocessing.Process(
                    target=_read_into_pipe, args=(reader, child_conn))
                p.start()
                workers.append(p)
                # Drop the parent's copy of the child end so that recv()
                # raises EOFError if the worker dies without a message.
                child_conn.close()

            # Round-robin over the connections that are still open.
            while conns:
                for conn in list(conns):
                    message = conn.recv()
                    if message is None:
                        raise ValueError(
                            "multiprocess reader raises an exception")
                    sample = json.loads(message)
                    if sample is None:
                        conn.close()
                        conns.remove(conn)
                    else:
                        yield sample
            finished = True
        finally:
            for conn in conns:
                conn.close()
            _stop_workers(workers, finished)

    if use_pipe:
        return pipe_reader
    else:
        return queue_reader
def _buf2lines(buf, line_break="\n"): def _buf2lines(buf, line_break="\n"):
# FIXME: line_break should be automatically configured. # FIXME: line_break should be automatically configured.
lines = buf.split(line_break) lines = buf.split(line_break)
......
...@@ -14,6 +14,7 @@ ...@@ -14,6 +14,7 @@
import time import time
import unittest import unittest
import functools
import paddle.reader import paddle.reader
...@@ -174,5 +175,33 @@ class TestPipeReader(unittest.TestCase): ...@@ -174,5 +175,33 @@ class TestPipeReader(unittest.TestCase):
temp.close() temp.close()
class TestMultiProcessReader(unittest.TestCase):
    """
    Check that paddle.reader.multiprocess_reader merges the samples of
    three disjoint readers, through both the queue and the pipe transport.
    """

    def setup(self):
        # Not unittest's setUp: reader_test calls this explicitly so each
        # transport starts from freshly built samples and readers.
        self.samples = []
        for i in range(1000):
            self.samples.append([[i], [i + 1, i + 2], i + 3])

        def reader(index):
            # Each reader yields every third sample, offset by its index.
            for i, sample in enumerate(self.samples):
                if i % 3 == index:
                    yield sample

        self.reader0 = functools.partial(reader, 0)
        self.reader1 = functools.partial(reader, 1)
        self.reader2 = functools.partial(reader, 2)

    def reader_test(self, use_pipe):
        self.setup()
        # Keyword arguments are required here: the signature is
        # (readers, use_pipe, queue_size), so passing (readers, 100,
        # use_pipe) positionally would bind 100 to use_pipe and always
        # select the pipe transport.
        merged_reader = paddle.reader.multiprocess_reader(
            [self.reader0, self.reader1, self.reader2],
            use_pipe=use_pipe,
            queue_size=100)
        results = []
        for data in merged_reader():
            results.append(data)
        # Arrival order depends on process scheduling; compare as sorted.
        self.assertEqual(sorted(self.samples), sorted(results))

    def test_multi_process_reader(self):
        self.reader_test(use_pipe=False)
        self.reader_test(use_pipe=True)
if __name__ == '__main__': if __name__ == '__main__':
unittest.main() unittest.main()
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册