diff --git a/AUTHORS.md b/AUTHORS.md
index 4ee05420982d13f686cf13e8957ce41dfcdd2cb8..11f227be7148d8d6e055538347a8c31679406c84 100644
--- a/AUTHORS.md
+++ b/AUTHORS.md
@@ -4,6 +4,7 @@
| backyes | Yan-Fei Wang |
| baiyfbupt | Yi-Fan Bai |
| beckett1124 | Bin Qi |
+| ChengduoZH | Cheng-Duo Zhao |
| chengxiaohua1105 | Xiao-Hua Cheng |
| cxwangyi, yiwangbaidu, wangkuiyi | Yi Wang |
| cxysteven | Xing-Yi Cheng |
diff --git a/Dockerfile b/Dockerfile
index 80a96983ec1ca6b9ec440f7e95de6c328eb1ed40..4d6165b79a1d94b8f27d7f3ee1b6e2cee5992d31 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -29,7 +29,7 @@ RUN apt-get update && \
wget unzip unrar tar xz-utils bzip2 gzip coreutils ntp \
curl sed grep graphviz libjpeg-dev zlib1g-dev \
python-matplotlib gcc-4.8 g++-4.8 \
- automake locales clang-format swig doxygen cmake \
+ automake locales clang-format swig cmake \
liblapack-dev liblapacke-dev \
clang-3.8 llvm-3.8 libclang-3.8-dev \
net-tools libtool ccache && \
diff --git a/benchmark/fluid/Dockerfile b/benchmark/fluid/Dockerfile
new file mode 100644
index 0000000000000000000000000000000000000000..46140a9d1be01a50cd74dab2799e3731e8d3debd
--- /dev/null
+++ b/benchmark/fluid/Dockerfile
@@ -0,0 +1,22 @@
+FROM nvidia/cuda:9.0-cudnn7-devel-ubuntu16.04
+RUN apt-get update && apt-get install -y python python-pip iputils-ping libgtk2.0-dev wget vim net-tools iftop
+RUN ln -s /usr/lib/x86_64-linux-gnu/libcudnn.so.7 /usr/lib/libcudnn.so && ln -s /usr/lib/x86_64-linux-gnu/libnccl.so.2 /usr/lib/libnccl.so
+RUN pip install -U pip
+RUN pip install -U kubernetes opencv-python paddlepaddle
+
+# IMPORTANT:
+# Add "ENV http_proxy=http://ip:port" if your download is slow, and don't forget to unset it at runtime.
+
+RUN sh -c 'echo "import paddle.v2 as paddle\npaddle.dataset.cifar.train10()\npaddle.dataset.flowers.fetch()" | python'
+RUN sh -c 'echo "import paddle.v2 as paddle\npaddle.dataset.mnist.train()\npaddle.dataset.mnist.test()\npaddle.dataset.imdb.fetch()" | python'
+RUN sh -c 'echo "import paddle.v2 as paddle\npaddle.dataset.imikolov.fetch()" | python'
+RUN pip uninstall -y paddlepaddle && mkdir /workspace
+
+ADD https://raw.githubusercontent.com/PaddlePaddle/cloud/develop/docker/paddle_k8s /usr/bin
+ADD https://raw.githubusercontent.com/PaddlePaddle/cloud/develop/docker/k8s_tools.py /root
+
+ADD *.whl /
+RUN pip install /*.whl && rm -f /*.whl && chmod +x /usr/bin/paddle_k8s
+
+ENV LD_LIBRARY_PATH=/usr/local/lib
+ADD fluid_benchmark.py dataset.py models/ /workspace/
diff --git a/benchmark/fluid/README.md b/benchmark/fluid/README.md
index 7071e9fdcd394a5a4db4d0d599610a72d98c0a3c..1b0c7dce8bd6faab0c4c59caa1cbe337483cbd16 100644
--- a/benchmark/fluid/README.md
+++ b/benchmark/fluid/README.md
@@ -44,11 +44,25 @@ Currently supported `--model` argument include:
## Run Distributed Benchmark on Kubernetes Cluster
+You may need to build a Docker image before submitting a cluster job to Kubernetes; otherwise you will
+have to start all those processes manually on each node, which is not recommended.
+
+To build the Docker image, you need to choose a paddle "whl" package to run with. You may either
+download it from
+http://www.paddlepaddle.org/docs/develop/documentation/zh/build_and_install/pip_install_en.html or
+build it yourself. Once you have the "whl" package, put it under the current directory and run:
+
+```bash
+docker build -t [your docker image name]:[your docker image tag] .
+```
+
+Then push the image to a Docker registry that your Kubernetes cluster can reach.
+
We provide a script `kube_gen_job.py` to generate Kubernetes yaml files to submit
distributed benchmark jobs to your cluster. To generate a job yaml, just run:
```bash
-python kube_gen_job.py --jobname myjob --pscpu 4 --cpu 8 --gpu 8 --psmemory 20 --memory 40 --pservers 4 --trainers 4 --entry "python fluid_benchmark.py --model mnist --parallel 1 --device GPU --update_method pserver " --disttype pserver
+python kube_gen_job.py --jobname myjob --pscpu 4 --cpu 8 --gpu 8 --psmemory 20 --memory 40 --pservers 4 --trainers 4 --entry "python fluid_benchmark.py --model mnist --gpus 8 --device GPU --update_method pserver " --disttype pserver
```
Then the yaml files are generated under directory `myjob`, you can run:
diff --git a/benchmark/fluid/kube_gen_job.py b/benchmark/fluid/kube_gen_job.py
index 39ba207fd96f71563504017e77dc0e87c249b3f8..9da8a69af1d7b671b2648b1b3702776c1c0650b0 100644
--- a/benchmark/fluid/kube_gen_job.py
+++ b/benchmark/fluid/kube_gen_job.py
@@ -49,7 +49,7 @@ def parse_args():
parser.add_argument(
'--fluid', default=1, type=int, help='whether is fluid job')
parser.add_argument(
- '--rdma', action='store_ture', help='whether mount rdma libs')
+ '--rdma', action='store_true', help='whether mount rdma libs')
parser.add_argument(
'--disttype',
default="pserver",
diff --git a/benchmark/fluid/run.sh b/benchmark/fluid/run.sh
index f6dfd20bf2ee0b668b6d4238d4511253b2233035..afaab5f4de43fa7e94feeed4a1de991351c04b76 100644
--- a/benchmark/fluid/run.sh
+++ b/benchmark/fluid/run.sh
@@ -37,7 +37,8 @@ nohup stdbuf -oL nvidia-smi \
-l 1 &
# mnist
# mnist gpu mnist 128
-FLAGS_benchmark=true stdbuf -oL python fluid/mnist.py \
+FLAGS_benchmark=true stdbuf -oL python fluid_benchmark.py \
+ --model=mnist \
--device=GPU \
--batch_size=128 \
--skip_batch_num=5 \
@@ -46,7 +47,8 @@ FLAGS_benchmark=true stdbuf -oL python fluid/mnist.py \
# vgg16
# gpu cifar10 128
-FLAGS_benchmark=true stdbuf -oL python fluid/vgg16.py \
+FLAGS_benchmark=true stdbuf -oL python fluid_benchmark.py \
+ --model=vgg16 \
--device=GPU \
--batch_size=128 \
--skip_batch_num=5 \
@@ -54,7 +56,8 @@ FLAGS_benchmark=true stdbuf -oL python fluid/vgg16.py \
2>&1 | tee -a vgg16_gpu_128.log
# flowers gpu 128
-FLAGS_benchmark=true stdbuf -oL python fluid/vgg16.py \
+FLAGS_benchmark=true stdbuf -oL python fluid_benchmark.py \
+ --model=vgg16 \
--device=GPU \
--batch_size=32 \
--data_set=flowers \
@@ -64,40 +67,39 @@ FLAGS_benchmark=true stdbuf -oL python fluid/vgg16.py \
# resnet50
# resnet50 gpu cifar10 128
-FLAGS_benchmark=true stdbuf -oL python fluid/resnet50.py \
+FLAGS_benchmark=true stdbuf -oL python fluid_benchmark.py \
+ --model=resnet50 \
--device=GPU \
--batch_size=128 \
--data_set=cifar10 \
- --model=resnet_cifar10 \
--skip_batch_num=5 \
--iterations=30 \
2>&1 | tee -a resnet50_gpu_128.log
# resnet50 gpu flowers 64
-FLAGS_benchmark=true stdbuf -oL python fluid/resnet50.py \
+FLAGS_benchmark=true stdbuf -oL python fluid_benchmark.py \
+ --model=resnet50 \
--device=GPU \
--batch_size=64 \
--data_set=flowers \
- --model=resnet_imagenet \
--skip_batch_num=5 \
--iterations=30 \
2>&1 | tee -a resnet50_gpu_flowers_64.log
# lstm
# lstm gpu imdb 32 # tensorflow only support batch=32
-FLAGS_benchmark=true stdbuf -oL python fluid/stacked_dynamic_lstm.py \
+FLAGS_benchmark=true stdbuf -oL python fluid_benchmark.py \
+ --model=stacked_dynamic_lstm \
--device=GPU \
--batch_size=32 \
--skip_batch_num=5 \
--iterations=30 \
- --hidden_dim=512 \
- --emb_dim=512 \
- --crop_size=1500 \
2>&1 | tee -a lstm_gpu_32.log
# seq2seq
# seq2seq gpu wmb 128
-FLAGS_benchmark=true stdbuf -oL python fluid/machine_translation.py \
+FLAGS_benchmark=true stdbuf -oL python fluid_benchmark.py \
+ --model=machine_translation \
--device=GPU \
--batch_size=128 \
--skip_batch_num=5 \
diff --git a/doc/fluid/api/layers.rst b/doc/fluid/api/layers.rst
index f53da4d194f8d2428b4121fa1bb31f3fc95a9f64..dbb99d3c03f39f650b2cb0dbe8ee49cd413db6e3 100644
--- a/doc/fluid/api/layers.rst
+++ b/doc/fluid/api/layers.rst
@@ -1009,3 +1009,9 @@ ____
.. autofunction:: paddle.fluid.layers.upsampling_bilinear2d
:noindex:
+gather
+____
+
+.. autofunction:: paddle.fluid.layers.gather
+ :noindex:
+
diff --git a/doc/fluid/getstarted/Developer's_Guide_to_Paddle_Fluid.md b/doc/fluid/getstarted/Developer's_Guide_to_Paddle_Fluid.md
index 0c0156c8e46378e7bbeea8072938b8ccfb9ab6d7..79df6c59578e2acf495a3453ab61f069c3f09a49 100644
--- a/doc/fluid/getstarted/Developer's_Guide_to_Paddle_Fluid.md
+++ b/doc/fluid/getstarted/Developer's_Guide_to_Paddle_Fluid.md
@@ -86,7 +86,7 @@
-
+
---
@@ -123,12 +123,12 @@
- 在科学计算领域,计算图是一种描述计算的经典方式。下图展示了从前向计算图(蓝色)开始,通过添加反向(红色)和优化算法相关(绿色)操作,构建出整个计算图的过程:
--
+-
-
+
- Fluid ==使用`Program`而不是计算图==来描述模型和优化过程。`Program`由`Block`、`Operator`和`Variable`构成,相关概念会在后文详细展开。
- 编译时 Fluid 接受前向计算(这里可以先简单的理解为是一段有序的计算流)`Program`,为这段前向计算按照:前向 -> 反向 -> 梯度 clip -> 正则 -> 优化 的顺序,添加相关 `Operator`和`Variable`到`Program`到完整的计算。
@@ -328,7 +328,7 @@
----
+---
### 编译时概念 :==**[Transpiler](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/fluid/design/motivation/fluid_compiler.md)**==
@@ -402,7 +402,7 @@
- `Scope`
- 计算相关
- - `Block`
+ - `Block`
- `Kernel`、`OpWithKernel`、`OpWithoutKernel`
-- 执行相关 :`Executor`
+- 执行相关 :`Executor`
@@ -798,7 +798,7 @@ class GPUAllocator : public SystemAllocator {
- step 1:添加Place类型,由用户实现添加到框架
- 可以将Place类型理解为一个整数加上一个枚举型,包括:设备号 + 设备类型
-
+
@@ -824,7 +824,7 @@ class GPUAllocator : public SystemAllocator {
1. DataType 执行数据类型 FP32/FP64/INT32/INT64
1. Memory layout: 运行时 Tensor 在内存中的排布格式 NCHW、 NHWC
1. 使用的库
-
+
来区分Kernel,为同一个operator注册多个 Kernel。
```cpp
@@ -876,7 +876,7 @@ step 3: 运行时的 KernelType 推断和Kernel切换,
---
@@ -1107,7 +1107,7 @@ void Run(const framework::Scope &scope,
-
+
|
@@ -1127,13 +1127,13 @@ void Run(const framework::Scope &scope,
- 设计概览
- - 重构概览 [->](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/refactorization.md)
- - fluid [->](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/fluid.md)
+ - 重构概览 [->](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/refactorization.md)
+ - fluid [->](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/fluid.md)
- fluid_compiler [->](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/fluid/design/motivation/fluid_compiler.md)
- 核心概念
- variable 描述 [->](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/var_desc.md)
- Tensor [->](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/tensor.md)
- - LoDTensor [->](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/lod_tensor.md)
+ - LoDTensor [->](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/lod_tensor.md)
- TensorArray [->](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/tensor_array.md)
- Program [->](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/program.md)
- Block [->](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/block.md)
@@ -1152,7 +1152,7 @@ void Run(const framework::Scope &scope,
- 支持新设硬件设备库 [->](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/support_new_device.md)
- 添加新的Operator [->](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/howto/dev/new_op_cn.md)
- 添加新的Kernel [->](
-https://github.com/PaddlePaddle/Paddle/blob/develop/doc/howto/dev/new_op_kernel_en.md)
+https://github.com/PaddlePaddle/Paddle/blob/develop/doc/howto/dev/new_op_kernel_en.md)
@@ -1167,10 +1167,10 @@ https://github.com/PaddlePaddle/Paddle/blob/develop/doc/howto/dev/new_op_kernel_
Docker编译PaddlePaddle源码: [->](http://www.paddlepaddle.org/docs/develop/documentation/fluid/zh/build_and_install/docker_install_cn.html)
-
+
PaddlePaddle 在 Dockerhub 地址:[->](
https://hub.docker.com/r/paddlepaddle/paddle/tags/)
-
+
1. 获取PaddlePaddle的Docker镜像
```bash
docker pull paddlepaddle/paddle:latest-dev
@@ -1183,7 +1183,7 @@ PaddlePaddle 在 Dockerhub 地址:[->](
```
1. 进入docker container后,从源码编译,请参考文档 [->]( http://www.paddlepaddle.org/docs/develop/documentation/fluid/zh/build_and_install/build_from_source_cn.html)
-
+
---
@@ -1196,7 +1196,7 @@ PaddlePaddle 在 Dockerhub 地址:[->](
1. 开发推荐使用tag为`latest-dev`的镜像,其中打包了所有编译依赖。`latest`及`lastest-gpu`是production镜像,主要用于运行PaddlePaddle程序。
2. 在Docker中运行GPU程序,推荐使用nvidia-docker,[否则需要将CUDA库和设备挂载到Docker容器内](http://www.paddlepaddle.org/docs/develop/documentation/fluid/zh/build_and_install/docker_install_cn.html)。
-
+
```bash
nvidia-docker run -it -v $PWD/Paddle:/paddle paddlepaddle/paddle:latest-dev /bin/bash
```
@@ -1353,9 +1353,9 @@ Op注册实现在`.cc`文件;Kernel注册CPU实现在`.cc`文件中,CUDA实
}
};
```
-
+
-
+
---
###### 实现带Kernel的Operator step2: 定义Operator类
@@ -1420,11 +1420,11 @@ class ClipOp : public framework::OperatorWithKernel {
2. override InferShape函数(参考 [clip_op](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/operators/clip_op.cc#L24))
1. 什么是`functor` ?
-
+
- 类或结构体仅重载了`()`,一般是可被多个kernel复用的计算函数。
-
+
```cpp
template
class CrossEntropyFunctor {
@@ -1438,9 +1438,9 @@ class ClipOp : public framework::OperatorWithKernel {
};
```
-
+
- 在 clip_op 内也会看到将一段计算函数抽象为functor的使用法: [->](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/operators/clip_op.h#L27)。
-
+
---
@@ -1504,7 +1504,7 @@ class ClipKernel : public framework::OpKernel {
- 需要注意,Fluid中,不区分Cost Op和中间层Op,所有Op都必须正确处理接收到的梯度
2. 反向Op的输出
- 对可学习参数的求导结果
- - 对所有输入的求导结果
+ - 对所有输入的求导结果
@@ -1520,7 +1520,7 @@ class ClipKernel : public framework::OpKernel {
1. 在`.cc`文件中注册前向、反向Op类,注册CPU Kernel。
-
+
```cpp
namespace ops = paddle::operators;
REGISTER_OP(clip, ops::ClipOp, ops::ClipOpMaker, clip_grad,
@@ -1530,13 +1530,13 @@ class ClipKernel : public framework::OpKernel {
REGISTER_OP_CPU_KERNEL(
clip_grad, ops::ClipGradKernel);
```
-
+
- 在上面的代码片段中:
1. `REGISTER_OP` : 注册`ops::ClipOp`类,类型名为`clip`,该类的`ProtoMaker`为`ops::ClipOpMaker`,注册`ops::ClipOpGrad`,类型名为`clip_grad`
1. `REGISTER_OP_WITHOUT_GRADIENT` : 用于注册没有反向的Op,例如:优化算法相关的Op
1. `REGISTER_OP_CPU_KERNEL` :注册`ops::ClipKernel`类,并特化模板参数为`paddle::platform::CPUPlace`和`float`类型,同理,注册`ops::ClipGradKernel`类
-
+
1. 按照同样方法,在`.cu`文件中注册GPU Kernel
- 如果CUDA Kernel的实现基于Eigen,需在 `.cu`的开始加上宏定义 `#define EIGEN_USE_GPU`
@@ -1593,7 +1593,7 @@ class ClipKernel : public framework::OpKernel {
```bash
make test ARGS="-R test_mul_op -V"
```
-
+
或者:
```
@@ -1613,7 +1613,7 @@ class ClipKernel : public framework::OpKernel {
- 如果多个Op依赖一些共用的函数,可以创建非`*_op.*`格式的文件来存放,如`gather.h`文件。
-
+
---
### ==10.== 使用相关问题
@@ -1735,7 +1735,7 @@ class ClipKernel : public framework::OpKernel {
y_data = np.random.randint(0, 8, [1]).astype("int32")
y_tensor = core.Tensor()
y_tensor.set(y_data, place)
-
+
x_data = np.random.uniform(0.1, 1, [11, 8]).astype("float32")
x_tensor = core.Tensor()
x_tensor.set(x_data, place)
diff --git a/doc/fluid/getstarted/index_cn.rst b/doc/fluid/getstarted/index_cn.rst
index 75af7354be93a6eeabfa9ccf86903505402a7ca6..3daea71d0933a2774227ff2b5e744392ca6b1765 100644
--- a/doc/fluid/getstarted/index_cn.rst
+++ b/doc/fluid/getstarted/index_cn.rst
@@ -17,3 +17,4 @@
:maxdepth: 1
concepts/use_concepts_cn.rst
+ developer's_guide_to_paddle_fluid.md
diff --git a/doc/fluid/getstarted/index_en.rst b/doc/fluid/getstarted/index_en.rst
index 75a43f4af87c34830ec940068196e6ca72640501..fb20bb4f245281c3acf67c417979dc63c144fef3 100644
--- a/doc/fluid/getstarted/index_en.rst
+++ b/doc/fluid/getstarted/index_en.rst
@@ -16,3 +16,4 @@ Here is an example of linear regression. It introduces workflow of PaddlePaddle,
:maxdepth: 1
concepts/index_en.rst
+ developer's_guide_to_paddle_fluid.md
diff --git a/doc/fluid/getstarted/quickstart_cn.rst b/doc/fluid/getstarted/quickstart_cn.rst
index 135beb75d0330f39d062753aa2aa83a077f36bb1..6a964d4f8561f30aa10936d2399698c51583442c 100644
--- a/doc/fluid/getstarted/quickstart_cn.rst
+++ b/doc/fluid/getstarted/quickstart_cn.rst
@@ -11,7 +11,7 @@ PaddlePaddle支持使用pip快速安装,目前支持CentOS 6以上, Ubuntu 14.
pip install paddlepaddle
-如果需要安装支持GPU的版本(cuda7.5_cudnn5_avx_openblas),需要执行:
+如果需要安装支持GPU的版本(cuda8.0_cudnn5_avx_openblas),需要执行:
.. code-block:: bash
@@ -28,18 +28,18 @@ PaddlePaddle支持使用pip快速安装,目前支持CentOS 6以上, Ubuntu 14.
import paddle.dataset.uci_housing as uci_housing
import paddle.fluid as fluid
-
+
with fluid.scope_guard(fluid.core.Scope()):
# initialize executor with cpu
exe = fluid.Executor(place=fluid.CPUPlace())
- # load inference model
+ # load inference model
[inference_program, feed_target_names,fetch_targets] = \
fluid.io.load_inference_model(uci_housing.fluid_model(), exe)
# run inference
- result = exe.run(inference_program,
- feed={feed_target_names[0]: uci_housing.predict_reader()},
+ result = exe.run(inference_program,
+ feed={feed_target_names[0]: uci_housing.predict_reader()},
fetch_list=fetch_targets)
- # print predicted price is $12,273.97
+ # print predicted price is $12,273.97
print 'Predicted price: ${:,.2f}'.format(result[0][0][0] * 1000)
执行 :code:`python housing.py` 瞧! 它应该打印出预测住房数据的清单。
diff --git a/doc/fluid/getstarted/quickstart_en.rst b/doc/fluid/getstarted/quickstart_en.rst
index df6619cfd039fc1fdca8cde57db9cc6aebf8f029..680122f25893a5a48fac103266bda4788f891f6d 100644
--- a/doc/fluid/getstarted/quickstart_en.rst
+++ b/doc/fluid/getstarted/quickstart_en.rst
@@ -12,7 +12,7 @@ Simply run the following command to install, the version is cpu_avx_openblas:
pip install paddlepaddle
-If you need to install GPU version (cuda7.5_cudnn5_avx_openblas), run:
+If you need to install GPU version (cuda8.0_cudnn5_avx_openblas), run:
.. code-block:: bash
@@ -31,18 +31,18 @@ code:
import paddle.dataset.uci_housing as uci_housing
import paddle.fluid as fluid
-
+
with fluid.scope_guard(fluid.core.Scope()):
# initialize executor with cpu
exe = fluid.Executor(place=fluid.CPUPlace())
- # load inference model
+ # load inference model
[inference_program, feed_target_names,fetch_targets] = \
fluid.io.load_inference_model(uci_housing.fluid_model(), exe)
# run inference
- result = exe.run(inference_program,
- feed={feed_target_names[0]: uci_housing.predict_reader()},
+ result = exe.run(inference_program,
+ feed={feed_target_names[0]: uci_housing.predict_reader()},
fetch_list=fetch_targets)
- # print predicted price is $12,273.97
+ # print predicted price is $12,273.97
print 'Predicted price: ${:,.2f}'.format(result[0][0][0] * 1000)
Run :code:`python housing.py` and voila! It should print out a list of predictions
diff --git a/doc/fluid/howto/index_cn.rst b/doc/fluid/howto/index_cn.rst
index b7c620179724ebe97a0a47b75a57b376b21ccf90..b57af64f44da82926c4862578f3072960ca5aa92 100644
--- a/doc/fluid/howto/index_cn.rst
+++ b/doc/fluid/howto/index_cn.rst
@@ -4,5 +4,5 @@
.. toctree::
:maxdepth: 1
+ inference/index_cn.rst
optimization/index_cn.rst
- inference/inference_support_in_fluid.md
diff --git a/doc/fluid/howto/index_en.rst b/doc/fluid/howto/index_en.rst
index f3ca41cdbf1d40ec8afaf045233a38755d8a777a..fd21e167ce3a46da167db1e9d7013804f730e047 100644
--- a/doc/fluid/howto/index_en.rst
+++ b/doc/fluid/howto/index_en.rst
@@ -5,4 +5,3 @@ HOW TO
:maxdepth: 1
optimization/index_en.rst
- inference/inference_support_in_fluid.md
diff --git a/doc/fluid/howto/inference/build_and_install_lib_cn.rst b/doc/fluid/howto/inference/build_and_install_lib_cn.rst
new file mode 100644
index 0000000000000000000000000000000000000000..c8d9992fcc92c25f8c14f71c79bde9f79fd92b1f
--- /dev/null
+++ b/doc/fluid/howto/inference/build_and_install_lib_cn.rst
@@ -0,0 +1,96 @@
+安装与编译C++预测库
+===========================
+
+直接下载安装
+-------------
+
+====================== ========================================
+版本说明 C++预测库
+====================== ========================================
+cpu_avx_mkl `fluid.tgz `_
+cpu_avx_openblas `fluid.tgz `_
+cpu_noavx_openblas `fluid.tgz `_
+cuda7.5_cudnn5_avx_mkl `fluid.tgz `_
+cuda8.0_cudnn5_avx_mkl `fluid.tgz `_
+cuda8.0_cudnn7_avx_mkl `fluid.tgz `_
+====================== ========================================
+
+从源码编译
+----------
+用户也可以从 PaddlePaddle 核心代码编译C++预测库,只需在编译时配制下面这些编译选项:
+
+================= =========
+选项 值
+================= =========
+CMAKE_BUILD_TYPE Release
+FLUID_INSTALL_DIR 安装路径
+WITH_FLUID_ONLY ON(推荐)
+WITH_SWIG_PY OFF(推荐)
+WITH_PYTHON OFF(推荐)
+WITH_GPU ON/OFF
+WITH_MKL ON/OFF
+================= =========
+
+建议按照推荐值设置,以避免链接不必要的库。其它可选编译选项按需进行设定。
+
+下面的代码片段从github拉取最新代码,配制编译选项(需要将PADDLE_ROOT替换为PaddlePaddle预测库的安装路径):
+
+ .. code-block:: bash
+
+ pip install paddlepaddle-gpu
+ PADDLE_ROOT=/path/of/capi
+ git clone https://github.com/PaddlePaddle/Paddle.git
+ cd Paddle
+ mkdir build
+ cd build
+ cmake -DFLUID_INSTALL_DIR=$PADDLE_ROOT \
+ -DCMAKE_BUILD_TYPE=Release \
+ -DWITH_FLUID_ONLY=ON \
+ -DWITH_SWIG_PY=OFF \
+ -DWITH_PYTHON=OFF \
+ -DWITH_MKL=OFF \
+ -DWITH_GPU=OFF \
+ ..
+ make
+ make inference_lib_dist
+
+成功编译后,使用C++预测库所需的依赖(包括:(1)编译出的PaddlePaddle预测库和头文件;(2)第三方链接库和头文件;(3)版本信息与编译选项信息)
+均会存放于PADDLE_ROOT目录中。目录结构如下:
+
+ .. code-block:: text
+
+ PaddleRoot/
+ ├── CMakeCache.txt
+ ├── paddle
+ │ └── fluid
+ │ ├── framework
+ │ ├── inference
+ │ ├── memory
+ │ ├── platform
+ │ ├── pybind
+ │ └── string
+ ├── third_party
+ │ ├── boost
+ │ │ └── boost
+ │ ├── eigen3
+ │ │ ├── Eigen
+ │ │ └── unsupported
+ │ └── install
+ │ ├── gflags
+ │ ├── glog
+ │ ├── mklml
+ │ ├── protobuf
+ │ ├── snappy
+ │ ├── snappystream
+ │ └── zlib
+ └── version.txt
+
+version.txt 中记录了该预测库的版本信息,包括Git Commit ID、使用OpenBlas或MKL数学库、CUDA/CUDNN版本号,如:
+
+ .. code-block:: text
+
+ GIT COMMIT ID: c95cd4742f02bb009e651a00b07b21c979637dc8
+ WITH_MKL: ON
+ WITH_GPU: ON
+ CUDA version: 8.0
+ CUDNN version: v5
diff --git a/doc/fluid/howto/inference/index_cn.rst b/doc/fluid/howto/inference/index_cn.rst
new file mode 100644
index 0000000000000000000000000000000000000000..a903423548decd0992bf19772fb2cb143f6a12b5
--- /dev/null
+++ b/doc/fluid/howto/inference/index_cn.rst
@@ -0,0 +1,8 @@
+预测库
+------------
+
+.. toctree::
+ :maxdepth: 1
+
+ build_and_install_lib_cn.rst
+ inference_support_in_fluid_cn.md
diff --git a/doc/fluid/howto/inference/inference_support_in_fluid.md b/doc/fluid/howto/inference/inference_support_in_fluid_cn.md
similarity index 90%
rename from doc/fluid/howto/inference/inference_support_in_fluid.md
rename to doc/fluid/howto/inference/inference_support_in_fluid_cn.md
index d272cd3e3bdac49b9ed1a21531de1b0be03d881e..309b17fccd5c461c9c22beb64eb4c6792b7e4a7a 100644
--- a/doc/fluid/howto/inference/inference_support_in_fluid.md
+++ b/doc/fluid/howto/inference/inference_support_in_fluid_cn.md
@@ -1,9 +1,8 @@
-# Fluid Inference使用指南
+# 使用指南
## 目录:
- Python Inference API
-- 编译Fluid Inference库
- Inference C++ API
- Inference实例
- Inference计算优化
@@ -55,62 +54,6 @@
return [program, feed_target_names, fetch_targets]
```
-
-## 编译Fluid Inference库
-
- - **不需要额外的CMake选项**
- - 1、 配置CMake命令,更多配置请参考[源码编译PaddlePaddle](http://www.paddlepaddle.org/docs/develop/documentation/zh/build_and_install/build_from_source_cn.html)
- ```bash
- $ git clone https://github.com/PaddlePaddle/Paddle.git
- $ cd Paddle
- $ mkdir build
- $ cd build
- $ cmake -DCMAKE_INSTALL_PREFIX=your/path/to/paddle_inference_lib \
- -DCMAKE_BUILD_TYPE=Release \
- -DWITH_PYTHON=ON \
- -DWITH_MKL=OFF \
- -DWITH_GPU=OFF \
- ..
- ```
-
- - 2、 编译PaddlePaddle
- ```bash
- $ make
- ```
-
- - 3、 部署。执行如下命令将PaddlePaddle Fluid Inference库部署到`your/path/to/paddle_inference_lib`目录。
- ```bash
- $ make inference_lib_dist
- ```
-
-- 目录结构
-
- ```bash
- $ cd your/path/to/paddle_inference_lib
- $ tree
- .
- |-- paddle
- | `-- fluid
- | |-- framework
- | |-- inference
- | | |-- io.h
- | | `-- libpaddle_fluid.so
- | |-- memory
- | |-- platform
- | `-- string
- |-- third_party
- | |-- eigen3
- | `-- install
- | |-- gflags
- | |-- glog
- | `-- protobuf
- `-- ...
- ```
-
- 假设`PADDLE_ROOT=your/path/to/paddle_inference_lib`。
-
-
-
## 链接Fluid Inference库
- 示例项目([链接](https://github.com/luotao1/fluid_inference_example.git))
diff --git a/paddle/contrib/inference/CMakeLists.txt b/paddle/contrib/inference/CMakeLists.txt
index 9c55f189bcc5cbf0ce84f11e9653fa20b84a51f7..6847f7db7fc0f6b41ced1260d409ca6eba9b53eb 100644
--- a/paddle/contrib/inference/CMakeLists.txt
+++ b/paddle/contrib/inference/CMakeLists.txt
@@ -17,46 +17,33 @@ if(APPLE)
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-error=pessimizing-move")
endif(APPLE)
-function(inference_api_test TARGET_NAME TEST_SRC)
+function(inference_api_test TARGET_NAME)
set(options "")
set(oneValueArgs "")
set(multiValueArgs ARGS)
cmake_parse_arguments(inference_test "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
set(PYTHON_TESTS_DIR ${PADDLE_BINARY_DIR}/python/paddle/fluid/tests)
- set(arg_list "")
+ cc_test(test_paddle_inference_${TARGET_NAME}
+ SRCS test_paddle_inference_${TARGET_NAME}.cc
+ DEPS paddle_fluid_api paddle_inference_api
+ ARGS --dirname=${PYTHON_TESTS_DIR}/book/)
if(inference_test_ARGS)
- foreach(arg ${inference_test_ARGS})
- list(APPEND arg_list "_${arg}")
- endforeach()
- else()
- list(APPEND arg_list "_")
+ set_tests_properties(test_paddle_inference_${TARGET_NAME}
+ PROPERTIES DEPENDS "${inference_test_ARGS}")
endif()
- foreach(arg ${arg_list})
- string(REGEX REPLACE "^_$" "" arg "${arg}")
- cc_test(${TARGET_NAME}
- SRCS ${TEST_SRC}
- DEPS paddle_fluid_api paddle_inference_api paddle_inference_api_impl
- ARGS --dirname=${PYTHON_TESTS_DIR}/book/)
- # TODO(panyx0178): Figure out how to add word2vec and image_classification
- # as deps.
- # set_tests_properties(${TARGET_NAME}
- # PROPERTIES DEPENDS ${DEP_TEST})
- endforeach()
endfunction(inference_api_test)
cc_library(paddle_inference_api
- SRCS paddle_inference_api.cc
+ SRCS paddle_inference_api.cc paddle_inference_api_impl.cc
DEPS ${FLUID_CORE_MODULES} ${GLOB_OP_LIB})
-cc_library(paddle_inference_api_impl
- SRCS paddle_inference_api_impl.cc
- DEPS paddle_inference_api paddle_fluid_api)
+if(WITH_TESTING)
+ cc_test(test_paddle_inference_api
+ SRCS test_paddle_inference_api.cc
+ DEPS paddle_inference_api)
-cc_test(test_paddle_inference_api
- SRCS test_paddle_inference_api.cc
- DEPS paddle_inference_api)
-
-inference_api_test(test_paddle_inference_api_impl
- test_paddle_inference_api_impl.cc)
+ inference_api_test(api_impl
+ ARGS test_word2vec test_image_classification)
+endif()
diff --git a/paddle/contrib/inference/paddle_inference_api.h b/paddle/contrib/inference/paddle_inference_api.h
index f804d9b28697a6703d63d9a640c4ec337effaba6..5fe8399762bba69bc99ed9ae694db32f532ed953 100644
--- a/paddle/contrib/inference/paddle_inference_api.h
+++ b/paddle/contrib/inference/paddle_inference_api.h
@@ -40,15 +40,24 @@ struct PaddleBuf {
struct PaddleTensor {
std::string name; // variable name.
std::vector<int> shape;
+ // TODO(Superjomn) for LoD support, add a vector<vector<size_t>> field if needed.
PaddleBuf data; // blob of data.
PaddleDType dtype;
};
+enum class PaddleEngineKind {
+ kNative = 0, // Use the native Fluid facility.
+ // TODO(Superjomn) support the following engines later.
+ // kAnakin, // Use Anakin for inference.
+ // kTensorRT, // Use TensorRT for inference.
+ // kAutoMixedAnakin, // Automatically mix Fluid with Anakin.
+ // kAutoMixedTensorRT, // Automatically mix Fluid with TensorRT.
+};
+
/*
-* A simple Inference API for Paddle. Currently this API might just be used by
-* non-sequence scenerios.
-* TODO(Superjomn) Prepare another API for NLP-related usages.
-*/
+ * A simple Inference API for Paddle. Currently this API can be used in
+ * non-sequence scenarios.
+ */
class PaddlePredictor {
public:
struct Config;
@@ -66,34 +75,35 @@ class PaddlePredictor {
// be thread-safe.
virtual std::unique_ptr<PaddlePredictor> Clone() = 0;
- virtual bool InitShared() { return false; }
// Destroy the Predictor.
virtual ~PaddlePredictor() {}
- friend std::unique_ptr<PaddlePredictor> CreatePaddlePredictor(
- const PaddlePredictor::Config& config);
-
// The common configs for all the predictors.
struct Config {
- enum class EngineKind;
-
std::string model_dir; // path to the model directory.
bool enable_engine{false}; // Enable to execute (part of) the model on
- // third-party engines.
- EngineKind engine_kind{Config::EngineKind::kNone};
-
- enum class EngineKind {
- kNone = -1, // Use the native Fluid facility.
- kAnakin, // Use Anakin for inference.
- kTensorRT, // Use TensorRT for inference.
- kAutoMixedAnakin, // Automatically mix Fluid with Anakin.
- kAutoMixedTensorRT, // Automatically mix Fluid with TensorRT.
- };
};
};
-// A factory to help create difference predictor.
-template
+struct NativeConfig : public PaddlePredictor::Config {
+ // GPU related fields.
+ bool use_gpu{false};
+ int device{0};
+ float fraction_of_gpu_memory{-1.f}; // Negative to notify initialization.
+
+ std::string prog_file;
+ std::string param_file;
+};
+
+// A factory to help create different predictors.
+//
+// FOR EXTENSION DEVELOPER:
+// Different predictors are designated by config type and engine kind. Similar
+// configs can be merged, but there shouldn't be a huge config containing
+// different fields for more than one kind of predictors.
+//
+// Similarly, each engine kind should map to a unique predictor implementation.
+template <typename ConfigT, PaddleEngineKind engine = PaddleEngineKind::kNative>
std::unique_ptr<PaddlePredictor> CreatePaddlePredictor(const ConfigT& config);
} // namespace paddle
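For orientation, here is a minimal usage sketch of the predictor interface declared above. It is not part of this patch: it assumes the `NativeConfig`, `PaddleTensor`, and `CreatePaddlePredictor` declarations from this header, assumes `PaddleBuf` exposes raw `data`/`length` members as in this revision, and uses a hypothetical model directory and feed name.

```cpp
#include <vector>

#include "paddle/contrib/inference/paddle_inference_api.h"

int main() {
  // Configure the native Fluid engine; the model directory is hypothetical.
  paddle::NativeConfig config;
  config.model_dir = "./word2vec.inference.model";
  config.use_gpu = false;

  // The factory picks the predictor implementation from the config type
  // (and the engine kind, which defaults to the native engine).
  auto predictor = paddle::CreatePaddlePredictor<paddle::NativeConfig>(config);

  // Build one FLOAT32 input tensor of shape [1, 4]; "x" is a hypothetical
  // feed variable name taken from the model.
  std::vector<float> buf(4, 1.0f);
  paddle::PaddleTensor input;
  input.name = "x";
  input.shape = {1, 4};
  input.data.data = buf.data();                    // raw pointer, not owned
  input.data.length = buf.size() * sizeof(float);  // size in bytes
  input.dtype = paddle::PaddleDType::FLOAT32;

  // Run inference; the predictor fills `outputs` with the fetch tensors.
  std::vector<paddle::PaddleTensor> outputs;
  predictor->Run({input}, &outputs);
  return 0;
}
```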
diff --git a/paddle/contrib/inference/paddle_inference_api_impl.cc b/paddle/contrib/inference/paddle_inference_api_impl.cc
index ebe4c3291802707009f30616463705d966e244d6..99a64662d4d04e3cf9dfdafe5b5ab9e5dac0af8a 100644
--- a/paddle/contrib/inference/paddle_inference_api_impl.cc
+++ b/paddle/contrib/inference/paddle_inference_api_impl.cc
@@ -54,11 +54,10 @@ std::string num2str(T a) {
}
} // namespace
-bool PaddlePredictorImpl::Init() {
+bool NativePaddlePredictor::Init() {
VLOG(3) << "Predictor::init()";
- // TODO(panyx0718): Should CPU vs GPU device be decided by id?
- if (config_.device >= 0) {
+ if (config_.use_gpu) {
place_ = paddle::platform::CUDAPlace(config_.device);
} else {
place_ = paddle::platform::CPUPlace();
@@ -85,19 +84,21 @@ bool PaddlePredictorImpl::Init() {
}
ctx_ = executor_->Prepare(*inference_program_, 0);
- // Create variables
- // TODO(panyx0718): Why need to test share_variables here?
- if (config_.share_variables) {
- executor_->CreateVariables(*inference_program_, scope_.get(), 0);
- }
+ // Create temporary variables first, so that the first batch does not need
+ // to create variables at runtime. This is the logic of the old inference
+ // API.
+ // TODO(Superjomn) this should be modified when `Clone` is valid for
+ // multi-thread application.
+ executor_->CreateVariables(*inference_program_, scope_.get(), 0);
+
// Get the feed_target_names and fetch_target_names
feed_target_names_ = inference_program_->GetFeedTargetNames();
fetch_target_names_ = inference_program_->GetFetchTargetNames();
return true;
}
-bool PaddlePredictorImpl::Run(const std::vector<PaddleTensor> &inputs,
- std::vector<PaddleTensor> *output_data) {
+bool NativePaddlePredictor::Run(const std::vector<PaddleTensor> &inputs,
+ std::vector<PaddleTensor> *output_data) {
VLOG(3) << "Predictor::predict";
Timer timer;
timer.tic();
@@ -124,7 +125,7 @@ bool PaddlePredictorImpl::Run(const std::vector &inputs,
scope_.get(),
&feed_targets,
&fetch_targets,
- !config_.share_variables);
+ false /* don't create variable each time */);
if (!GetFetch(fetchs, output_data)) {
LOG(ERROR) << "fail to get fetchs";
return false;
@@ -133,59 +134,20 @@ bool PaddlePredictorImpl::Run(const std::vector &inputs,
return true;
}
-std::unique_ptr<PaddlePredictor> PaddlePredictorImpl::Clone() {
+std::unique_ptr<PaddlePredictor> NativePaddlePredictor::Clone() {
VLOG(3) << "Predictor::clone";
- std::unique_ptr cls(new PaddlePredictorImpl(config_));
- if (!cls->InitShared()) {
- LOG(ERROR) << "fail to call InitShared";
+ std::unique_ptr<PaddlePredictor> cls(new NativePaddlePredictor(config_));
+
+ if (!dynamic_cast<NativePaddlePredictor *>(cls.get())->Init()) {
+ LOG(ERROR) << "fail to call Init";
return nullptr;
}
// fix manylinux compile error.
return std::move(cls);
}
-// TODO(panyx0718): Consider merge with Init()?
-bool PaddlePredictorImpl::InitShared() {
- VLOG(3) << "Predictor::init_shared";
- // 1. Define place, executor, scope
- if (this->config_.device >= 0) {
- place_ = platform::CUDAPlace();
- } else {
- place_ = platform::CPUPlace();
- }
- this->executor_.reset(new framework::Executor(this->place_));
- this->scope_.reset(new framework::Scope());
- // Initialize the inference program
- if (!this->config_.model_dir.empty()) {
- // Parameters are saved in separate files sited in
- // the specified `dirname`.
- this->inference_program_ = inference::Load(
- this->executor_.get(), this->scope_.get(), this->config_.model_dir);
- } else if (!this->config_.prog_file.empty() &&
- !this->config_.param_file.empty()) {
- // All parameters are saved in a single file.
- // The file names should be consistent with that used
- // in Python API `fluid.io.save_inference_model`.
- this->inference_program_ = inference::Load(this->executor_.get(),
- this->scope_.get(),
- this->config_.prog_file,
- this->config_.param_file);
- }
- this->ctx_ = this->executor_->Prepare(*this->inference_program_, 0);
- // 3. create variables
- // TODO(panyx0718): why test share_variables.
- if (config_.share_variables) {
- this->executor_->CreateVariables(
- *this->inference_program_, this->scope_.get(), 0);
- }
- // 4. Get the feed_target_names and fetch_target_names
- this->feed_target_names_ = this->inference_program_->GetFeedTargetNames();
- this->fetch_target_names_ = this->inference_program_->GetFetchTargetNames();
- return true;
-}
-
-bool PaddlePredictorImpl::SetFeed(const std::vector<PaddleTensor> &inputs,
- std::vector<framework::LoDTensor> *feeds) {
+bool NativePaddlePredictor::SetFeed(const std::vector<PaddleTensor> &inputs,
+ std::vector<framework::LoDTensor> *feeds) {
VLOG(3) << "Predictor::set_feed";
if (inputs.size() != feed_target_names_.size()) {
LOG(ERROR) << "wrong feed input size.";
@@ -213,7 +175,7 @@ bool PaddlePredictorImpl::SetFeed(const std::vector &inputs,
return true;
}
-bool PaddlePredictorImpl::GetFetch(
+bool NativePaddlePredictor::GetFetch(
const std::vector<framework::LoDTensor> &fetchs,
std::vector<PaddleTensor> *outputs) {
VLOG(3) << "Predictor::get_fetch";
@@ -280,23 +242,29 @@ bool PaddlePredictorImpl::GetFetch(
}
template <>
-std::unique_ptr CreatePaddlePredictor(
- const ConfigImpl &config) {
- VLOG(3) << "create PaddlePredictorImpl";
- // 1. GPU memeroy
- std::vector flags;
- if (config.fraction_of_gpu_memory >= 0.0f ||
- config.fraction_of_gpu_memory <= 0.95f) {
- flags.push_back("dummpy");
- std::string flag = "--fraction_of_gpu_memory_to_use=" +
- num2str(config.fraction_of_gpu_memory);
- flags.push_back(flag);
- VLOG(3) << "set flag: " << flag;
- framework::InitGflags(flags);
+std::unique_ptr<PaddlePredictor>
+CreatePaddlePredictor<NativeConfig, PaddleEngineKind::kNative>(
+ const NativeConfig &config) {
+ VLOG(3) << "create NativePaddlePredictor";
+ if (config.use_gpu) {
+ // 1. GPU memory
+ PADDLE_ENFORCE(
+ config.fraction_of_gpu_memory > 0.f,
+ "fraction_of_gpu_memory in the config should be set to range (0., 1.]");
+ std::vector<std::string> flags;
+ if (config.fraction_of_gpu_memory >= 0.0f ||
+ config.fraction_of_gpu_memory <= 0.95f) {
+ flags.push_back("dummpy");
+ std::string flag = "--fraction_of_gpu_memory_to_use=" +
+ num2str(config.fraction_of_gpu_memory);
+ flags.push_back(flag);
+ VLOG(3) << "set flag: " << flag;
+ framework::InitGflags(flags);
+ }
}
- std::unique_ptr predictor(new PaddlePredictorImpl(config));
- if (!dynamic_cast(predictor.get())->Init()) {
+ std::unique_ptr<PaddlePredictor> predictor(new NativePaddlePredictor(config));
+ if (!dynamic_cast<NativePaddlePredictor *>(predictor.get())->Init()) {
return nullptr;
}
return std::move(predictor);
diff --git a/paddle/contrib/inference/paddle_inference_api_impl.h b/paddle/contrib/inference/paddle_inference_api_impl.h
index c545461680723b429b2253392060ea36b84ce708..84707e223d7aa3d1ebca933923e932b3973613ae 100644
--- a/paddle/contrib/inference/paddle_inference_api_impl.h
+++ b/paddle/contrib/inference/paddle_inference_api_impl.h
@@ -29,17 +29,10 @@
namespace paddle {
-struct ConfigImpl : public PaddlePredictor::Config {
- int device;
- float fraction_of_gpu_memory;
- std::string prog_file;
- std::string param_file;
- bool share_variables;
-};
-
-class PaddlePredictorImpl : public PaddlePredictor {
+class NativePaddlePredictor : public PaddlePredictor {
public:
- explicit PaddlePredictorImpl(const ConfigImpl &config) : config_(config) {}
+ explicit NativePaddlePredictor(const NativeConfig &config)
+ : config_(config) {}
bool Init();
@@ -48,16 +41,15 @@ class PaddlePredictorImpl : public PaddlePredictor {
std::unique_ptr<PaddlePredictor> Clone() override;
- ~PaddlePredictorImpl() override{};
+ ~NativePaddlePredictor() override{};
private:
- bool InitShared() override;
bool SetFeed(const std::vector<PaddleTensor> &input_datas,
std::vector<framework::LoDTensor> *feeds);
bool GetFetch(const std::vector<framework::LoDTensor> &fetchs,
std::vector<PaddleTensor> *output_data);
- ConfigImpl config_;
+ NativeConfig config_;
platform::Place place_;
std::unique_ptr<framework::Executor> executor_;
std::unique_ptr<framework::Scope> scope_;
diff --git a/paddle/contrib/inference/test_paddle_inference_api_impl.cc b/paddle/contrib/inference/test_paddle_inference_api_impl.cc
index 096293a4e25df0c78150d85dc091d7ca6539bf40..07b17acd484b13af2ab4019aafa4a08c6b9f59d4 100644
--- a/paddle/contrib/inference/test_paddle_inference_api_impl.cc
+++ b/paddle/contrib/inference/test_paddle_inference_api_impl.cc
@@ -40,19 +40,19 @@ PaddleTensor LodTensorToPaddleTensor(framework::LoDTensor* t) {
return pt;
}
-ConfigImpl GetConfig() {
- ConfigImpl config;
+NativeConfig GetConfig() {
+ NativeConfig config;
config.model_dir = FLAGS_dirname + "word2vec.inference.model";
LOG(INFO) << "dirname " << config.model_dir;
config.fraction_of_gpu_memory = 0.15;
+ config.use_gpu = true;
config.device = 0;
- config.share_variables = true;
return config;
}
TEST(paddle_inference_api_impl, word2vec) {
- ConfigImpl config = GetConfig();
- std::unique_ptr predictor = CreatePaddlePredictor(config);
+ NativeConfig config = GetConfig();
+ auto predictor = CreatePaddlePredictor<NativeConfig>(config);
framework::LoDTensor first_word, second_word, third_word, fourth_word;
framework::LoD lod{{0, 1}};
@@ -104,7 +104,7 @@ TEST(paddle_inference_api_impl, image_classification) {
int batch_size = 2;
bool use_mkldnn = false;
bool repeat = false;
- ConfigImpl config = GetConfig();
+ NativeConfig config = GetConfig();
config.model_dir =
FLAGS_dirname + "image_classification_resnet.inference.model";
@@ -133,7 +133,7 @@ TEST(paddle_inference_api_impl, image_classification) {
is_combined,
use_mkldnn);
- std::unique_ptr predictor = CreatePaddlePredictor(config);
+ auto predictor = CreatePaddlePredictor<NativeConfig>(config);
std::vector<PaddleTensor> paddle_tensor_feeds;
paddle_tensor_feeds.push_back(LodTensorToPaddleTensor(&input));
@@ -144,8 +144,7 @@ TEST(paddle_inference_api_impl, image_classification) {
float* data = static_cast<float*>(outputs[0].data.data);
float* lod_data = output1.data();
for (size_t j = 0; j < len / sizeof(float); ++j) {
- EXPECT_LT(lod_data[j] - data[j], 1e-10);
- EXPECT_GT(lod_data[j] - data[j], -1e-10);
+ EXPECT_NEAR(lod_data[j], data[j], 1e-3);
}
free(data);
}
diff --git a/paddle/fluid/framework/block_desc.cc b/paddle/fluid/framework/block_desc.cc
index fd409ed4c0f7a504686765909e9c71692aab8824..e7842e9b8130d35e511e02dfb1dc27f307d17f38 100644
--- a/paddle/fluid/framework/block_desc.cc
+++ b/paddle/fluid/framework/block_desc.cc
@@ -200,7 +200,7 @@ BlockDesc::BlockDesc(ProgramDesc *prog, proto::BlockDesc *desc)
vars_[var_desc.name()].reset(new VarDesc(var_desc));
}
for (const proto::OpDesc &op_desc : desc_->ops()) {
- ops_.emplace_back(new OpDesc(op_desc, prog, this));
+ ops_.emplace_back(new OpDesc(op_desc, this));
}
}
@@ -209,7 +209,7 @@ BlockDesc::BlockDesc(const BlockDesc &other, proto::BlockDesc *desc,
: prog_(prog), desc_(desc) {
need_update_ = true;
for (auto &op : other.ops_) {
- ops_.emplace_back(new OpDesc(*op->Proto(), prog, this));
+ ops_.emplace_back(new OpDesc(*op, this));
}
for (auto &it : other.vars_) {
auto *var = new VarDesc(*it.second);
diff --git a/paddle/fluid/framework/block_desc.h b/paddle/fluid/framework/block_desc.h
index 600601669c5d56a3ffc2fb9c804ffad5fde58f0b..189dd6c52f85b5bf623b98c64c07c0c7269505d4 100644
--- a/paddle/fluid/framework/block_desc.h
+++ b/paddle/fluid/framework/block_desc.h
@@ -105,7 +105,7 @@ class BlockDesc {
size_t OpSize() const { return ops_.size(); }
- OpDesc *Op(int idx) { return ops_.at(idx).get(); }
+ OpDesc *Op(int idx) const { return ops_.at(idx).get(); }
void Flush();
diff --git a/paddle/fluid/framework/details/multi_devices_graph_builder.cc b/paddle/fluid/framework/details/multi_devices_graph_builder.cc
index d8e711994c5dba15ce0a1c237558b121888902e3..17baacd13eecac8f410631fe9e94788da4fff848 100644
--- a/paddle/fluid/framework/details/multi_devices_graph_builder.cc
+++ b/paddle/fluid/framework/details/multi_devices_graph_builder.cc
@@ -11,11 +11,15 @@
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
-#include "paddle/fluid/framework/details/multi_devices_graph_builder.h"
+#include
#include
+#include
#include
+#include
+
#include "paddle/fluid/framework/details/broadcast_op_handle.h"
#include "paddle/fluid/framework/details/computation_op_handle.h"
+#include "paddle/fluid/framework/details/multi_devices_graph_builder.h"
#include "paddle/fluid/framework/details/reduce_op_handle.h"
#include "paddle/fluid/framework/details/rpc_op_handle.h"
#include "paddle/fluid/framework/details/scale_loss_grad_op_handle.h"
@@ -26,9 +30,6 @@
#include "paddle/fluid/framework/details/nccl_all_reduce_op_handle.h"
#endif
-#include
-#include
-
DEFINE_string(ssa_graph_path, "/tmp/ssa_graph.dot",
"the ssa graph path only print with GLOG_v=10,"
"default /tmp/graph.dot");
@@ -148,9 +149,9 @@ bool MultiDevSSAGraphBuilder::IsDistTrainOp(
std::unique_ptr<SSAGraph> MultiDevSSAGraphBuilder::Build(
const ProgramDesc &program) const {
- std::unordered_map<std::string, proto::VarType::Type> var_types;
+ std::unordered_map<std::string, VarDesc *> all_vars;
for (auto *var : program.Block(0).AllVars()) {
- var_types[var->Name()] = var->GetType();
+ all_vars[var->Name()] = var;
}
auto graph = new SSAGraph();
@@ -167,12 +168,28 @@ std::unique_ptr MultiDevSSAGraphBuilder::Build(
auto send_vars = FindDistTrainSendVars(program);
auto recv_vars = FindDistTrainRecvVars(program);
- size_t cur_device_id = 0;
std::vector<std::unordered_set<std::string>> var_name_on_devices;
std::vector<std::unordered_set<std::string>> bcast_var_name_set;
var_name_on_devices.resize(places_.size());
bcast_var_name_set.resize(places_.size());
+ size_t cur_device_id = 0;
+ std::vector<int64_t> balance_grads(places_.size(), 0);
+
+ auto get_appropriate_dev = [&](std::string &g_name) -> size_t {
+ auto var_desc = all_vars.at(g_name);
+ PADDLE_ENFORCE_NOT_NULL(var_desc);
+ auto dim = framework::make_ddim(var_desc->GetShape());
+ int64_t numel = framework::product(dim);
+ PADDLE_ENFORCE_GE(numel, 0);
+ auto smallest =
+ std::min_element(std::begin(balance_grads), std::end(balance_grads));
+ size_t dev_id =
+ static_cast(std::distance(std::begin(balance_grads), smallest));
+ balance_grads[dev_id] += numel;
+ return dev_id;
+ };
+
bool is_forwarding = true;
for (auto *op : program.Block(0).AllOps()) {
if (boost::get(
@@ -220,13 +237,13 @@ std::unique_ptr MultiDevSSAGraphBuilder::Build(
switch (strategy_.reduce_) {
case BuildStrategy::ReduceStrategy::kReduce:
+ cur_device_id = get_appropriate_dev(g_name);
CreateReduceOp(&result, g_name, cur_device_id);
var_name_on_devices[cur_device_id].emplace(g_name);
bcast_var_name_set[cur_device_id].emplace(p_name);
- cur_device_id = (cur_device_id + 1) % places_.size();
break;
case BuildStrategy::ReduceStrategy::kAllReduce:
- if (IsSparseGradient(var_types, g_name)) {
+ if (IsSparseGradient(all_vars, g_name)) {
CreateReduceOp(&result, g_name, 0);
CreateBroadcastOp(&result, g_name, 0);
} else {
@@ -269,10 +286,10 @@ std::unique_ptr MultiDevSSAGraphBuilder::Build(
}
bool MultiDevSSAGraphBuilder::IsSparseGradient(
- const std::unordered_map<std::string, proto::VarType::Type> &var_types,
+ const std::unordered_map<std::string, VarDesc *> &all_vars,
const std::string &og) const {
- PADDLE_ENFORCE(var_types.count(og) != 0);
- if (var_types.at(og) == proto::VarType::SELECTED_ROWS) {
+ PADDLE_ENFORCE(all_vars.count(og) != 0);
+ if (all_vars.at(og)->GetType() == proto::VarType::SELECTED_ROWS) {
return true;
}
return false;
diff --git a/paddle/fluid/framework/details/multi_devices_graph_builder.h b/paddle/fluid/framework/details/multi_devices_graph_builder.h
index e07597dbd80889c366babe79455beb12c9eb80d9..544cbe585c7423b5f3eb98ee698ca5668376f1ca 100644
--- a/paddle/fluid/framework/details/multi_devices_graph_builder.h
+++ b/paddle/fluid/framework/details/multi_devices_graph_builder.h
@@ -106,7 +106,7 @@ class MultiDevSSAGraphBuilder : public SSAGraphBuilder {
size_t src_dev_id) const;
bool IsSparseGradient(
- const std::unordered_map<std::string, proto::VarType::Type> &var_types,
+ const std::unordered_map<std::string, VarDesc *> &all_vars,
const std::string &og) const;
private:
diff --git a/paddle/fluid/framework/op_desc.cc b/paddle/fluid/framework/op_desc.cc
index 09b67e5a1741c68c5f5487340e8fc86ff31e00a4..f92769192c218eb7cdc2350ff6e4721b45005806 100644
--- a/paddle/fluid/framework/op_desc.cc
+++ b/paddle/fluid/framework/op_desc.cc
@@ -103,7 +103,7 @@ void OpDesc::CopyFrom(const OpDesc &op_desc) {
need_update_ = true;
}
-OpDesc::OpDesc(const proto::OpDesc &desc, ProgramDesc *prog, BlockDesc *block)
+OpDesc::OpDesc(const proto::OpDesc &desc, BlockDesc *block)
: desc_(desc), need_update_(false) {
// restore inputs_
int input_size = desc_.inputs_size();
diff --git a/paddle/fluid/framework/op_desc.h b/paddle/fluid/framework/op_desc.h
index 1a330db7cc5555a939950043ac90a321573b292d..a02d3e269129596f65a2fb346e76c1af7fbead95 100644
--- a/paddle/fluid/framework/op_desc.h
+++ b/paddle/fluid/framework/op_desc.h
@@ -33,13 +33,14 @@ class OpDesc {
OpDesc(const std::string &type, const VariableNameMap &inputs,
const VariableNameMap &outputs, const AttributeMap &attrs);
- OpDesc(const proto::OpDesc &desc, ProgramDesc *prog, BlockDesc *block);
+ OpDesc(const proto::OpDesc &desc, BlockDesc *block);
explicit OpDesc(BlockDesc *block) : block_(block) {}
OpDesc(const OpDesc &other, BlockDesc *block) {
*this = other;
block_ = block;
+ need_update_ = true;
}
void CopyFrom(const OpDesc &op_desc);
diff --git a/paddle/fluid/framework/program_desc.cc b/paddle/fluid/framework/program_desc.cc
index 64fb028f83a539d17885186d5d8ee6ef26f095e9..1e01a6e900404990e16674755367d2fc6d832725 100644
--- a/paddle/fluid/framework/program_desc.cc
+++ b/paddle/fluid/framework/program_desc.cc
@@ -51,12 +51,15 @@ ProgramDesc::ProgramDesc(const ProgramDesc &o) {
auto *block = desc_.mutable_blocks(i);
blocks_.emplace_back(new BlockDesc(*o.blocks_[i], block, this));
}
- for (auto &block : blocks_) {
- for (auto *op : block->AllOps()) {
- for (const auto &attr : op->Proto()->attrs()) {
- if (attr.type() == proto::AttrType::BLOCK) {
- size_t blk_idx = attr.block_idx();
- op->SetBlockAttr(attr.name(), this->MutableBlock(blk_idx));
+ for (size_t block_id = 0; block_id < blocks_.size(); ++block_id) {
+ auto all_ops = blocks_[block_id]->AllOps();
+ for (size_t op_id = 0; op_id < all_ops.size(); ++op_id) {
+ auto &op = all_ops[op_id];
+ for (const std::string &attr_name : op->AttrNames()) {
+ if (op->GetAttrType(attr_name) == proto::AttrType::BLOCK) {
+ int sub_block_id =
+ o.Block(block_id).Op(op_id)->GetBlockAttr(attr_name);
+ op->SetBlockAttr(attr_name, MutableBlock(sub_block_id));
}
}
}
@@ -86,6 +89,16 @@ ProgramDesc::ProgramDesc(const std::string &binary_str) {
for (auto &block_desc : *desc_.mutable_blocks()) {
blocks_.emplace_back(new BlockDesc(this, &block_desc));
}
+ for (auto &block : blocks_) {
+ for (auto *op : block->AllOps()) {
+ for (const auto &attr : op->Proto()->attrs()) {
+ if (attr.type() == proto::AttrType::BLOCK) {
+ size_t blk_idx = attr.block_idx();
+ op->SetBlockAttr(attr.name(), this->MutableBlock(blk_idx));
+ }
+ }
+ }
+ }
}
const std::vector<std::string> ProgramDesc::GetFeedTargetNames() {
diff --git a/paddle/fluid/framework/reader.cc b/paddle/fluid/framework/reader.cc
index 76126f3dc64d71770d13f9d66bb30f176c112629..0b36f1116d15004b355e854e101abb9ad3297836 100644
--- a/paddle/fluid/framework/reader.cc
+++ b/paddle/fluid/framework/reader.cc
@@ -25,8 +25,10 @@ void FileReader::ReadNext(std::vector *out) {
if (out->empty()) {
return;
}
+
+ PADDLE_ENFORCE_EQ(out->size(), dims_.size());
for (size_t i = 0; i < dims_.size(); ++i) {
- auto &actual = out->at(i).dims();
+ auto &actual = (*out)[i].dims();
auto &expect = dims_[i];
PADDLE_ENFORCE_EQ(actual.size(), expect.size());
diff --git a/paddle/fluid/framework/tensor_impl.h b/paddle/fluid/framework/tensor_impl.h
index 0a1db7758bd9ec0dac133efcbf495de1d690021d..2f19ec0f0a9338e2b96d1f64eac45387bae4d1eb 100644
--- a/paddle/fluid/framework/tensor_impl.h
+++ b/paddle/fluid/framework/tensor_impl.h
@@ -39,7 +39,7 @@ template
inline const T* Tensor::data() const {
check_memory_size();
PADDLE_ENFORCE(std::is_same<T, void>::value ||
- holder_->type().hash_code() == typeid(T).hash_code(),
+ holder_->type() == std::type_index(typeid(T)),
"Tensor holds the wrong type, it holds %s",
this->holder_->type().name());
@@ -53,7 +53,7 @@ template
inline T* Tensor::data() {
check_memory_size();
PADDLE_ENFORCE(std::is_same<T, void>::value ||
- holder_->type().hash_code() == typeid(T).hash_code(),
+ holder_->type() == std::type_index(typeid(T)),
"Tensor holds the wrong type, it holds %s",
this->holder_->type().name());
return reinterpret_cast<T*>(reinterpret_cast<uintptr_t>(holder_->ptr()) +
diff --git a/paddle/fluid/inference/CMakeLists.txt b/paddle/fluid/inference/CMakeLists.txt
index cc4a725dfb3b3e7723a3a3a4008b20acdb53899d..ec16a1c600a3bafc1c4cbbd920360253c106e3a1 100644
--- a/paddle/fluid/inference/CMakeLists.txt
+++ b/paddle/fluid/inference/CMakeLists.txt
@@ -5,14 +5,19 @@ cc_library(paddle_fluid_api
SRCS io.cc
DEPS ${FLUID_CORE_MODULES} ${GLOB_OP_LIB})
-# Create static library
get_property(fluid_modules GLOBAL PROPERTY FLUID_MODULES)
-cc_library(paddle_fluid DEPS ${fluid_modules})
+if(WITH_CONTRIB)
+ set(fluid_modules "${fluid_modules}" paddle_inference_api)
+endif()
+
+# Create static library
+cc_library(paddle_fluid DEPS ${fluid_modules} paddle_fluid_api)
# Create shared library
cc_library(paddle_fluid_shared SHARED
SRCS io.cc
- DEPS ${fluid_modules})
+ DEPS ${fluid_modules} paddle_fluid_api)
+
set_target_properties(paddle_fluid_shared PROPERTIES OUTPUT_NAME paddle_fluid)
if(NOT APPLE)
# TODO(liuyiqun): Temporarily disable the link flag because it is not support on Mac.
diff --git a/paddle/fluid/inference/analysis/data_flow_graph.h b/paddle/fluid/inference/analysis/data_flow_graph.h
index 9f6ce40ede25248a4f779b379c132806a4ec06ba..913e344d371ddf3ea05a53c216e5b3bea8f11c7b 100644
--- a/paddle/fluid/inference/analysis/data_flow_graph.h
+++ b/paddle/fluid/inference/analysis/data_flow_graph.h
@@ -21,7 +21,10 @@ limitations under the License. */
#include
#include
+#include
#include
+#include
+#include
#include "paddle/fluid/inference/analysis/graph_traits.h"
#include "paddle/fluid/inference/analysis/node.h"
diff --git a/paddle/fluid/inference/analysis/data_flow_graph_to_fluid_pass_tester.cc b/paddle/fluid/inference/analysis/data_flow_graph_to_fluid_pass_tester.cc
index 60f159da9140516284449a0274906df004b23ac5..dcee75cee50ede1d2b660e88e06544440bd5ef77 100644
--- a/paddle/fluid/inference/analysis/data_flow_graph_to_fluid_pass_tester.cc
+++ b/paddle/fluid/inference/analysis/data_flow_graph_to_fluid_pass_tester.cc
@@ -44,6 +44,6 @@ TEST_F(DFG_Tester, Test) {
LOG(INFO) << graph.nodes.size();
}
-} // analysis
-} // inference
-} // paddle
+}; // namespace analysis
+}; // namespace inference
+}; // namespace paddle
diff --git a/paddle/fluid/inference/analysis/fluid_to_data_flow_graph_pass.cc b/paddle/fluid/inference/analysis/fluid_to_data_flow_graph_pass.cc
index f848a7d1add79c3032da7defc34a406dccf29d2e..9f67c989cca4a936cd320b73efaae277263fb3e2 100644
--- a/paddle/fluid/inference/analysis/fluid_to_data_flow_graph_pass.cc
+++ b/paddle/fluid/inference/analysis/fluid_to_data_flow_graph_pass.cc
@@ -12,9 +12,11 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
-#include "paddle/fluid/inference/analysis/fluid_to_data_flow_graph_pass.h"
+#include
#include
+#include "paddle/fluid/inference/analysis/fluid_to_data_flow_graph_pass.h"
+
namespace paddle {
namespace inference {
namespace analysis {
diff --git a/paddle/fluid/inference/analysis/fluid_to_data_flow_graph_pass.h b/paddle/fluid/inference/analysis/fluid_to_data_flow_graph_pass.h
index cd0d4fabaafe844bcc5bb8bfc2586971197d9167..33517e57becdffc0416f204247eac5feadb7ed82 100644
--- a/paddle/fluid/inference/analysis/fluid_to_data_flow_graph_pass.h
+++ b/paddle/fluid/inference/analysis/fluid_to_data_flow_graph_pass.h
@@ -19,6 +19,8 @@
#pragma once
+#include
+
#include "paddle/fluid/framework/program_desc.h"
#include "paddle/fluid/inference/analysis/data_flow_graph.h"
#include "paddle/fluid/inference/analysis/pass.h"
diff --git a/paddle/fluid/inference/analysis/fluid_to_data_flow_graph_pass_tester.cc b/paddle/fluid/inference/analysis/fluid_to_data_flow_graph_pass_tester.cc
index 851c98bef305fa9e20dced5f7c26e9d1b6ddf4f2..817d32c92cdbdc234eef9ed5156891c2b11ced4c 100644
--- a/paddle/fluid/inference/analysis/fluid_to_data_flow_graph_pass_tester.cc
+++ b/paddle/fluid/inference/analysis/fluid_to_data_flow_graph_pass_tester.cc
@@ -32,6 +32,6 @@ TEST_F(DFG_Tester, Init) {
LOG(INFO) << '\n' << graph.DotString();
}
-} // analysis
-} // inference
-} // paddle
+} // namespace analysis
+} // namespace inference
+} // namespace paddle
diff --git a/paddle/fluid/inference/analysis/helper.h b/paddle/fluid/inference/analysis/helper.h
index 24ea9a4bae7132eb1692b0ffb02f8ab5e02b21a9..153dca576bd6734d62f00c4a7cb9b503506b33e2 100644
--- a/paddle/fluid/inference/analysis/helper.h
+++ b/paddle/fluid/inference/analysis/helper.h
@@ -50,7 +50,7 @@ struct DataTypeNamer {
return dic_.at(x);
}
- const std::string &repr(size_t &hash) const {
+ const std::string &repr(size_t &hash) const { // NOLINT
PADDLE_ENFORCE(dic_.count(hash), "unknown type for representation");
return dic_.at(hash);
}
@@ -62,7 +62,9 @@ struct DataTypeNamer {
SET_TYPE(float);
}
- std::unordered_map dic_;
+ std::unordered_map
+ dic_;
};
#undef SET_TYPE
diff --git a/paddle/fluid/inference/analysis/pass.h b/paddle/fluid/inference/analysis/pass.h
index 5c89b1304d84abc9a4942f12da46b4bfe76f44f5..aa0e8667b5e4a9e6156c25fcad03bb8eee3287f6 100644
--- a/paddle/fluid/inference/analysis/pass.h
+++ b/paddle/fluid/inference/analysis/pass.h
@@ -16,6 +16,7 @@ limitations under the License. */
#include
#include
+#include
#include "paddle/fluid/framework/framework.pb.h"
#include "paddle/fluid/inference/analysis/data_flow_graph.h"
diff --git a/paddle/fluid/inference/analysis/subgraph_splitter.h b/paddle/fluid/inference/analysis/subgraph_splitter.h
index ed90a0dcf31e154c4d82be08ce35e2f11d11c139..a31afbe6933da8d3c7a88142cc12d63b98b55796 100644
--- a/paddle/fluid/inference/analysis/subgraph_splitter.h
+++ b/paddle/fluid/inference/analysis/subgraph_splitter.h
@@ -18,6 +18,8 @@ limitations under the License. */
#pragma once
+#include
+
#include "paddle/fluid/inference/analysis/data_flow_graph.h"
#include "paddle/fluid/inference/analysis/node.h"
diff --git a/paddle/fluid/inference/analysis/ut_helper.h b/paddle/fluid/inference/analysis/ut_helper.h
index c86083d12153921672e15c172b874f77a8b46cde..722fa99a48a5f2b0e778904de0c35977d0ee3cc0 100644
--- a/paddle/fluid/inference/analysis/ut_helper.h
+++ b/paddle/fluid/inference/analysis/ut_helper.h
@@ -15,6 +15,7 @@ limitations under the License. */
#pragma once
#include
#include
+#include
#include "paddle/fluid/framework/executor.h"
#include "paddle/fluid/inference/analysis/data_flow_graph.h"
#include "paddle/fluid/inference/analysis/fluid_to_data_flow_graph_pass.h"
diff --git a/paddle/fluid/inference/tensorrt/convert/CMakeLists.txt b/paddle/fluid/inference/tensorrt/convert/CMakeLists.txt
index 5ada1d631269209e912e2d4817382ea2c6c67353..23ca8bfac84f35ebdca2e2a1a8538d366358ca8b 100644
--- a/paddle/fluid/inference/tensorrt/convert/CMakeLists.txt
+++ b/paddle/fluid/inference/tensorrt/convert/CMakeLists.txt
@@ -8,3 +8,5 @@ nv_test(test_op_converter SRCS test_op_converter.cc mul_op.cc conv2d_op.cc DEPS
nv_test(test_io_converter SRCS test_io_converter.cc io_converter.cc DEPS dynload_cuda dynamic_loader lod_tensor)
nv_test(test_trt_mul_op SRCS test_mul_op.cc mul_op.cc
DEPS ${FLUID_CORE_MODULES} tensorrt_engine mul_op SERIAL)
+nv_test(test_trt_fc_op SRCS test_fc_op.cc fc_op.cc
+ DEPS ${FLUID_CORE_MODULES} tensorrt_engine mul_op SERIAL)
diff --git a/paddle/fluid/inference/tensorrt/convert/activation_op.cc b/paddle/fluid/inference/tensorrt/convert/activation_op.cc
index 6297051e5a30f1daa512d25d5aa3ab3b2f79f1d1..79d01b640a214ed5eb86173a36d5e85a6626066f 100644
--- a/paddle/fluid/inference/tensorrt/convert/activation_op.cc
+++ b/paddle/fluid/inference/tensorrt/convert/activation_op.cc
@@ -24,7 +24,7 @@ class ReluOpConverter : public OpConverter {
void operator()(const framework::proto::OpDesc& op) override {
// Here the two nullptr looks strange, that's because the
// framework::OpDesc's constructor is strange.
- framework::OpDesc op_desc(op, nullptr, nullptr);
+ framework::OpDesc op_desc(op, nullptr);
LOG(INFO) << "convert a fluid relu op to tensorrt activation layer whose "
"type is Relu";
const nvinfer1::ITensor* input_tensor =
diff --git a/paddle/fluid/inference/tensorrt/convert/conv2d_op.cc b/paddle/fluid/inference/tensorrt/convert/conv2d_op.cc
index 209936c3bafb0d31546856dc36c1b48053a0634b..668d344f1bba1c012dcb42c71b996209b4703d78 100644
--- a/paddle/fluid/inference/tensorrt/convert/conv2d_op.cc
+++ b/paddle/fluid/inference/tensorrt/convert/conv2d_op.cc
@@ -21,7 +21,8 @@ namespace tensorrt {
class Conv2dOpConverter : public OpConverter {
public:
Conv2dOpConverter() {}
- void operator()(const framework::proto::OpDesc& op) override {
+ void operator()(const framework::proto::OpDesc& op,
+ const framework::Scope& scope) override {
LOG(INFO)
<< "convert a fluid conv2d op to tensorrt conv layer without bias";
}
diff --git a/paddle/fluid/inference/tensorrt/convert/fc_op.cc b/paddle/fluid/inference/tensorrt/convert/fc_op.cc
new file mode 100644
index 0000000000000000000000000000000000000000..bd05608d7620ee4d917b30f919fba70f6aeff17f
--- /dev/null
+++ b/paddle/fluid/inference/tensorrt/convert/fc_op.cc
@@ -0,0 +1,119 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/fluid/framework/eigen.h"
+#include "paddle/fluid/framework/lod_tensor.h"
+#include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/inference/tensorrt/convert/op_converter.h"
+#include "paddle/fluid/inference/tensorrt/engine.h"
+#include "paddle/fluid/platform/place.h"
+
+namespace paddle {
+namespace inference {
+namespace tensorrt {
+
+// Reorder the elements from istrides to ostrides, borrowed from TRT convert in
+// tensorflow.
+// https://github.com/tensorflow/tensorflow/blob/master/tensorflow/contrib/tensorrt/convert/convert_nodes.cc#L318
+template <typename T>
+void Reorder2(nvinfer1::DimsHW shape, const T* idata, nvinfer1::DimsHW istrides,
+ T* odata, nvinfer1::DimsHW ostrides) {
+ for (int h = 0; h < shape.h(); ++h) {
+ for (int w = 0; w < shape.w(); ++w) {
+      odata[h * ostrides.h() + w * ostrides.w()] =
+          idata[h * istrides.h() + w * istrides.w()];
+ }
+ }
+}
+
+// Reorder the data layout from CK to KC.
+void ReorderCKtoKC(TensorRTEngine::Weight& iweights,
+ TensorRTEngine::Weight* oweights) {
+ int c = iweights.dims[0];
+ int k = iweights.dims[1];
+ oweights->dims.assign({k, c});
+ nvinfer1::DimsHW istrides = {1, k};
+ nvinfer1::DimsHW ostrides = {c, 1};
+  Reorder2({k, c}, static_cast<float const*>(iweights.get().values), istrides,
+           static_cast<float*>(const_cast<void*>(oweights->get().values)),
+           ostrides);
+}
+
+/*
+ * FC converter convert a MUL op in Fluid to a FC layer in TRT.
+ */
+class FcOpConverter : public OpConverter {
+ public:
+ void operator()(const framework::proto::OpDesc& op,
+ const framework::Scope& scope) override {
+ VLOG(4) << "convert a fluid fc op to tensorrt fc layer without bias";
+
+    framework::OpDesc op_desc(op, nullptr);
+ PADDLE_ENFORCE_EQ(op_desc.Input("X").size(), 1);
+ PADDLE_ENFORCE_EQ(op_desc.Input("Y").size(), 1); // Y is a weight
+ PADDLE_ENFORCE_EQ(op_desc.Output("Out").size(), 1);
+
+ // Declare inputs
+ auto* X = engine_->GetITensor(op_desc.Input("X").front());
+
+ // Declare weights
+ auto* Y_v = scope.FindVar(op_desc.Input("Y").front());
+ PADDLE_ENFORCE_NOT_NULL(Y_v);
+    auto* Y_t = Y_v->GetMutable<framework::LoDTensor>();
+    // This may trigger a GPU->CPU copy, which cannot be avoided because TRT
+    // weights can only be assigned from CPU memory.
+    auto* weight_data = Y_t->mutable_data<float>(platform::CPUPlace());
+ PADDLE_ENFORCE_EQ(Y_t->dims().size(), 2UL); // a matrix
+ size_t n_output = Y_t->dims()[1];
+
+ framework::LoDTensor tmp;
+ tmp.Resize(Y_t->dims());
+    memcpy(tmp.mutable_data<float>(platform::CPUPlace()), Y_t->data<float>(),
+           Y_t->dims()[0] * Y_t->dims()[1] * sizeof(float));
+
+    TensorRTEngine::Weight weight{nvinfer1::DataType::kFLOAT,
+                                  static_cast<void*>(weight_data),
+                                  Y_t->memory_size() / sizeof(float)};
+    TensorRTEngine::Weight tmp_weight(nvinfer1::DataType::kFLOAT,
+                                      static_cast<void*>(tmp.data<float>()),
+                                      Y_t->memory_size() / sizeof(float));
+ weight.dims.assign({Y_t->dims()[0], Y_t->dims()[1]});
+ tmp_weight.dims = weight.dims;
+
+    // The data layout of the TRT FC layer's weight differs from fluid's FC,
+    // so the elements need to be reordered.
+    ReorderCKtoKC(tmp_weight, &weight);
+
+    // Currently, the framework can only map one fluid op to one TRT layer,
+    // but fc fuses `mul` and `bias` (two fluid ops). As a workaround, only
+    // `mul` is handled here; the `add` is left as a separate layer.
+    TensorRTEngine::Weight bias{nvinfer1::DataType::kFLOAT, nullptr, 0};
+
+    auto* layer = TRT_ENGINE_ADD_LAYER(engine_, FullyConnected,
+                                       *const_cast<nvinfer1::ITensor*>(X),
+                                       n_output, weight.get(), bias.get());
+
+ auto output_name = op_desc.Output("Out").front();
+ engine_->DeclareOutput(layer, 0, output_name);
+ }
+};
+
+REGISTER_TRT_OP_CONVERTER(fc, FcOpConverter);
+
+} // namespace tensorrt
+} // namespace inference
+} // namespace paddle
+
+USE_OP(mul);
diff --git a/paddle/fluid/inference/tensorrt/convert/mul_op.cc b/paddle/fluid/inference/tensorrt/convert/mul_op.cc
index ed09f54bde00d12aaec829ba90cc08ebfef57e92..6bb07709c7ee1c6b29c46425849a4f472d3df59d 100644
--- a/paddle/fluid/inference/tensorrt/convert/mul_op.cc
+++ b/paddle/fluid/inference/tensorrt/convert/mul_op.cc
@@ -24,10 +24,11 @@ namespace tensorrt {
class MulOpConverter : public OpConverter {
public:
MulOpConverter() {}
- void operator()(const framework::proto::OpDesc& op) override {
- VLOG(4) << "convert a fluid mul op to tensorrt fc layer without bias";
+ void operator()(const framework::proto::OpDesc& op,
+ const framework::Scope& scope) override {
+ VLOG(4) << "convert a fluid mul op to tensorrt mul layer without bias";
- framework::OpDesc op_desc(op, nullptr, nullptr);
+ framework::OpDesc op_desc(op, nullptr);
// Declare inputs
auto* input1 = engine_->GetITensor(op_desc.Input("X")[0]);
auto* input2 = engine_->GetITensor(op_desc.Input("Y")[0]);
diff --git a/paddle/fluid/inference/tensorrt/convert/op_converter.h b/paddle/fluid/inference/tensorrt/convert/op_converter.h
index 1cd3ed9a00acead2599420f88499bd0d74c2974b..4d21e241c0fe0abd9d454aa4f5f5ffeda747bed9 100644
--- a/paddle/fluid/inference/tensorrt/convert/op_converter.h
+++ b/paddle/fluid/inference/tensorrt/convert/op_converter.h
@@ -31,27 +31,42 @@ namespace tensorrt {
class OpConverter {
public:
OpConverter() {}
- virtual void operator()(const framework::proto::OpDesc& op) {}
- void Run(const framework::proto::OpDesc& op, TensorRTEngine* engine) {
- std::string type = op.type();
- auto* it = Registry::Lookup(type);
- PADDLE_ENFORCE_NOT_NULL(it, "no OpConverter for optype [%s]", type);
- it->SetEngine(engine);
- (*it)(op);
- }
+ // Converter logic for an op.
+ virtual void operator()(const framework::proto::OpDesc& op,
+ const framework::Scope& scope) {}
+
+  // Convert a single fluid operator and add the corresponding layer to TRT.
+  void ConvertOp(const framework::proto::OpDesc& op,
+                 const std::unordered_set<std::string>& parameters,
+                 const framework::Scope& scope, TensorRTEngine* engine) {
+    framework::OpDesc op_desc(op, nullptr);
+
+ OpConverter* it{nullptr};
- // convert fluid op to tensorrt layer
- void ConvertOp(const framework::proto::OpDesc& op, TensorRTEngine* engine) {
- OpConverter::Run(op, engine);
+ if (op_desc.Type() == "mul") {
+ PADDLE_ENFORCE_EQ(op_desc.Input("Y").size(), 1UL);
+ std::string Y = op_desc.Input("Y")[0];
+ if (parameters.count(Y)) {
+ it = Registry::Lookup("fc");
+ }
+ }
+ if (!it) {
+      it = Registry<OpConverter>::Lookup(op_desc.Type());
+ }
+ PADDLE_ENFORCE_NOT_NULL(it, "no OpConverter for optype [%s]",
+ op_desc.Type());
+ it->SetEngine(engine);
+ (*it)(op, scope);
}
// convert fluid block to tensorrt network
void ConvertBlock(const framework::proto::BlockDesc& block,
- TensorRTEngine* engine) {
+                    const std::unordered_set<std::string>& parameters,
+ const framework::Scope& scope, TensorRTEngine* engine) {
for (int i = 0; i < block.ops_size(); i++) {
const auto& op = block.ops(i);
- OpConverter::Run(op, engine);
+ ConvertOp(op, parameters, scope, engine);
}
}
diff --git a/paddle/fluid/inference/tensorrt/convert/test_fc_op.cc b/paddle/fluid/inference/tensorrt/convert/test_fc_op.cc
new file mode 100644
index 0000000000000000000000000000000000000000..a30253072ac581ceca85ca10151a176f87a7cb39
--- /dev/null
+++ b/paddle/fluid/inference/tensorrt/convert/test_fc_op.cc
@@ -0,0 +1,46 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include <gtest/gtest.h>
+#include "paddle/fluid/inference/tensorrt/convert/op_converter.h"
+#include "paddle/fluid/inference/tensorrt/convert/ut_helper.h"
+
+namespace paddle {
+namespace inference {
+namespace tensorrt {
+
+TEST(fc_op, test) {
+ std::unordered_set parameters({"mul-Y"});
+ framework::Scope scope;
+ TRTConvertValidation validator(20, parameters, scope, 1000);
+
+ validator.DeclInputVar("mul-X", nvinfer1::Dims4(8, 3, 1, 1));
+ validator.DeclParamVar("mul-Y", nvinfer1::Dims2(3, 2));
+ validator.DeclOutputVar("mul-Out", nvinfer1::Dims2(8, 2));
+
+ // Prepare Op description
+ framework::OpDesc desc;
+ desc.SetType("mul");
+ desc.SetInput("X", {"mul-X"});
+ desc.SetInput("Y", {"mul-Y"});
+ desc.SetOutput("Out", {"mul-Out"});
+
+ validator.SetOp(*desc.Proto());
+
+ validator.Execute(10);
+}
+
+} // namespace tensorrt
+} // namespace inference
+} // namespace paddle
diff --git a/paddle/fluid/inference/tensorrt/convert/test_mul_op.cc b/paddle/fluid/inference/tensorrt/convert/test_mul_op.cc
index d8b61d5f08ffd071c112b4677fcb6f6f50784bbc..1ce1130e5d660d717a1262a1fbdb4b620462c0b3 100644
--- a/paddle/fluid/inference/tensorrt/convert/test_mul_op.cc
+++ b/paddle/fluid/inference/tensorrt/convert/test_mul_op.cc
@@ -21,7 +21,9 @@ namespace inference {
namespace tensorrt {
TEST(MulOpConverter, main) {
- TRTConvertValidation validator(10, 1000);
+ framework::Scope scope;
+  std::unordered_set<std::string> parameters;
+ TRTConvertValidation validator(10, parameters, scope, 1000);
validator.DeclInputVar("mul-X", nvinfer1::Dims2(10, 6));
validator.DeclInputVar("mul-Y", nvinfer1::Dims2(6, 10));
validator.DeclOutputVar("mul-Out", nvinfer1::Dims2(10, 10));
diff --git a/paddle/fluid/inference/tensorrt/convert/test_op_converter.cc b/paddle/fluid/inference/tensorrt/convert/test_op_converter.cc
index 9ae7de9cbfa656fbcbb48557bd4b548115897c6d..1d3f5eabb2f839b2acfa9da6527589df1ec3767f 100644
--- a/paddle/fluid/inference/tensorrt/convert/test_op_converter.cc
+++ b/paddle/fluid/inference/tensorrt/convert/test_op_converter.cc
@@ -12,9 +12,10 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
+#include "paddle/fluid/inference/tensorrt/convert/op_converter.h"
+
#include
#include "paddle/fluid/framework/program_desc.h"
-#include "paddle/fluid/inference/tensorrt/convert/op_converter.h"
namespace paddle {
namespace inference {
@@ -27,7 +28,9 @@ TEST(OpConverter, ConvertBlock) {
conv2d_op->SetType("conv2d");
OpConverter converter;
- converter.ConvertBlock(*block->Proto(), nullptr /*TensorRTEngine*/);
+ framework::Scope scope;
+ converter.ConvertBlock(*block->Proto(), {}, scope,
+ nullptr /*TensorRTEngine*/);
}
} // namespace tensorrt
diff --git a/paddle/fluid/inference/tensorrt/convert/ut_helper.h b/paddle/fluid/inference/tensorrt/convert/ut_helper.h
index 37fcb5c50309db0ad0924a057a6b481750665531..d7e05dd5b5b235b7b166b22c5b094dc364e28dfc 100644
--- a/paddle/fluid/inference/tensorrt/convert/ut_helper.h
+++ b/paddle/fluid/inference/tensorrt/convert/ut_helper.h
@@ -19,6 +19,9 @@ limitations under the License. */
#pragma once
+#include
+#include
+
#include "paddle/fluid/framework/lod_tensor.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/inference/analysis/helper.h"
@@ -58,7 +61,10 @@ class TRTConvertValidation {
public:
TRTConvertValidation() = delete;
- TRTConvertValidation(int batch_size, int workspace_size = 1 << 10) {
+ TRTConvertValidation(int batch_size,
+                       const std::unordered_set<std::string>& parameters,
+ framework::Scope& scope, int workspace_size = 1 << 10)
+ : parameters_(parameters), scope_(scope) {
// create engine.
engine_.reset(new TensorRTEngine(10, 1 << 10, &stream_));
engine_->InitNetwork();
@@ -73,19 +79,22 @@ class TRTConvertValidation {
engine_->DeclareInput(name, nvinfer1::DataType::kFLOAT, dims);
}
+  // Declare a parameter variable in the scope.
+ void DeclParamVar(const std::string& name, const nvinfer1::Dims& dims) {
+ DeclVar(name, dims);
+ }
+
void DeclOutputVar(const std::string& name, const nvinfer1::Dims& dims) {
DeclVar(name, dims);
}
+ // Declare a variable in a fluid Scope.
void DeclVar(const std::string& name, const nvinfer1::Dims& dims) {
platform::CPUPlace place;
platform::CPUDeviceContext ctx(place);
// Init Fluid tensor.
- std::vector dim_vec(dims.nbDims);
- for (int i = 0; i < dims.nbDims; i++) {
- dim_vec[i] = dims.d[i];
- }
+    std::vector<int64_t> dim_vec(dims.d, dims.d + dims.nbDims);
auto* x = scope_.Var(name);
auto* x_tensor = x->GetMutable();
x_tensor->Resize(framework::make_ddim(dim_vec));
@@ -96,20 +105,22 @@ class TRTConvertValidation {
op_ = framework::OpRegistry::CreateOp(desc);
OpConverter op_converter;
- op_converter.ConvertOp(desc, engine_.get());
+ op_converter.ConvertOp(desc, parameters_, scope_, engine_.get());
engine_->FreezeNetwork();
// Declare outputs.
- op_desc_.reset(new framework::OpDesc(desc, nullptr, nullptr));
+ op_desc_.reset(new framework::OpDesc(desc, nullptr));
// Set Inputs.
for (const auto& input : op_desc_->InputArgumentNames()) {
+ if (parameters_.count(input)) continue;
auto* var = scope_.FindVar(input);
PADDLE_ENFORCE(var);
auto tensor = var->GetMutable();
+
engine_->SetInputFromCPU(
- input, static_cast(tensor->data()),
+ input, static_cast(tensor->data()),
sizeof(float) *
analysis::AccuDims(tensor->dims(), tensor->dims().size()));
}
@@ -117,18 +128,21 @@ class TRTConvertValidation {
void Execute(int batch_size) {
// Execute Fluid Op
- // Execute TRT
platform::CPUPlace place;
platform::CPUDeviceContext ctx(place);
- engine_->Execute(batch_size);
-
op_->Run(scope_, place);
+ // Execute TRT.
+ engine_->Execute(batch_size);
+ cudaStreamSynchronize(*engine_->stream());
ASSERT_FALSE(op_desc_->OutputArgumentNames().empty());
+ const size_t output_space_size = 200;
for (const auto& output : op_desc_->OutputArgumentNames()) {
std::vector fluid_out;
- std::vector trt_out(200);
- engine_->GetOutputInCPU(output, &trt_out[0], 200 * sizeof(float));
+      std::vector<float> trt_out(output_space_size);
+ engine_->GetOutputInCPU(output, &trt_out[0],
+ output_space_size * sizeof(float));
+ cudaStreamSynchronize(*engine_->stream());
auto* var = scope_.FindVar(output);
auto tensor = var->GetMutable();
@@ -136,7 +150,7 @@ class TRTConvertValidation {
// Compare two output
ASSERT_FALSE(fluid_out.empty());
for (size_t i = 0; i < fluid_out.size(); i++) {
- EXPECT_LT(std::abs(fluid_out[i] - trt_out[i]), 0.001);
+ EXPECT_LT(std::abs(fluid_out[i] - trt_out[i]), 1e-6);
}
}
}
@@ -146,9 +160,10 @@ class TRTConvertValidation {
private:
std::unique_ptr engine_;
cudaStream_t stream_;
- framework::Scope scope_;
std::unique_ptr op_;
std::unique_ptr op_desc_;
+  const std::unordered_set<std::string>& parameters_;
+ framework::Scope& scope_;
};
} // namespace tensorrt
diff --git a/paddle/fluid/inference/tensorrt/engine.cc b/paddle/fluid/inference/tensorrt/engine.cc
index a88236ae98e1816fc43796ead596c432b798d7de..3d75fefc1a735168131a6c67ac073e80aba32945 100644
--- a/paddle/fluid/inference/tensorrt/engine.cc
+++ b/paddle/fluid/inference/tensorrt/engine.cc
@@ -106,6 +106,7 @@ void TensorRTEngine::DeclareOutput(const nvinfer1::ILayer* layer, int offset,
name);
auto* output = layer->getOutput(offset);
+ SetITensor(name, output);
PADDLE_ENFORCE(output != nullptr);
output->setName(name.c_str());
infer_network_->markOutput(*output);
diff --git a/paddle/fluid/inference/tensorrt/engine.h b/paddle/fluid/inference/tensorrt/engine.h
index d9d3163b66d4c4c302d12edcc42f00e1cdfa5a30..fabcfd9e80cc0ef2637201a1499ebbe2d6adfd8c 100644
--- a/paddle/fluid/inference/tensorrt/engine.h
+++ b/paddle/fluid/inference/tensorrt/engine.h
@@ -37,13 +37,15 @@ class TensorRTEngine : public EngineBase {
// Weight is model parameter.
class Weight {
public:
- Weight(nvinfer1::DataType dtype, void* value, int num_elem) {
+ Weight(nvinfer1::DataType dtype, void* value, size_t num_elem) {
w_.type = dtype;
w_.values = value;
w_.count = num_elem;
}
const nvinfer1::Weights& get() { return w_; }
+  std::vector<int64_t> dims;
+
private:
nvinfer1::Weights w_;
};
diff --git a/paddle/fluid/operators/bilinear_interp_op.cc b/paddle/fluid/operators/bilinear_interp_op.cc
index d46fda54e7a9d5bc737a7ec2116daca33ffa015f..3321adf2743c28f6eeca8b5cc91ef89beed6b97c 100644
--- a/paddle/fluid/operators/bilinear_interp_op.cc
+++ b/paddle/fluid/operators/bilinear_interp_op.cc
@@ -34,9 +34,22 @@ class BilinearInterpOp : public framework::OperatorWithKernel {
int out_w = ctx->Attrs().Get("out_w");
PADDLE_ENFORCE_EQ(dim_x.size(), 4, "X's dimension must be 4");
+ if (ctx->HasInput("OutSize")) {
+ auto out_size_dim = ctx->GetInputDim("OutSize");
+ PADDLE_ENFORCE_EQ(out_size_dim.size(), 1,
+ "OutSize's dimension size must be 1");
+ PADDLE_ENFORCE_EQ(out_size_dim[0], 2, "OutSize's dim[0] must be 2");
+ }
std::vector dim_out({dim_x[0], dim_x[1], out_h, out_w});
ctx->SetOutputDim("Out", framework::make_ddim(dim_out));
}
+
+ protected:
+ framework::OpKernelType GetExpectedKernelType(
+ const framework::ExecutionContext& ctx) const override {
+    return framework::OpKernelType(
+        framework::ToDataType(ctx.Input<framework::Tensor>("X")->type()),
+        ctx.GetPlace());
+ }
};
class BilinearInterpOpMaker : public framework::OpProtoAndCheckerMaker {
@@ -45,6 +58,10 @@ class BilinearInterpOpMaker : public framework::OpProtoAndCheckerMaker {
AddInput("X",
"(Tensor) The input tensor of bilinear interpolation, "
"This is a 4-D tensor with shape of (N x C x h x w)");
+ AddInput("OutSize",
+ "(Tensor) This is a 1-D tensor with two number. "
+ "The first number is height and the second number is width.")
+ .AsDispensable();
AddOutput("Out",
"(Tensor) The dimension of output is (N x C x out_h x out_w]");
@@ -78,6 +95,12 @@ class BilinearInterpOpGrad : public framework::OperatorWithKernel {
ctx->SetOutputDim(framework::GradVarName("X"), dim_x);
}
}
+
+ framework::OpKernelType GetExpectedKernelType(
+ const framework::ExecutionContext& ctx) const override {
+    return framework::OpKernelType(
+        framework::ToDataType(ctx.Input<framework::Tensor>("X")->type()),
+        ctx.GetPlace());
+ }
};
} // namespace operators
diff --git a/paddle/fluid/operators/bilinear_interp_op.cu b/paddle/fluid/operators/bilinear_interp_op.cu
index 510190f1aaf02960284216a1bedd409011088499..4c1971538495c6f111e9db18f4014786f6f0dd58 100644
--- a/paddle/fluid/operators/bilinear_interp_op.cu
+++ b/paddle/fluid/operators/bilinear_interp_op.cu
@@ -102,10 +102,21 @@ class BilinearInterpOpCUDAKernel : public framework::OpKernel {
auto* input_t = ctx.Input("X"); // float tensor
auto* output_t = ctx.Output("Out"); // float tensor
auto* input = input_t->data();
- auto* output = output_t->mutable_data(ctx.GetPlace());
int out_h = ctx.Attr("out_h");
int out_w = ctx.Attr("out_w");
+ auto out_dims = output_t->dims();
+ auto out_size_t = ctx.Input("OutSize");
+ if (out_size_t != nullptr) {
+ Tensor sizes;
+ framework::TensorCopy(*out_size_t, platform::CPUPlace(), &sizes);
+ auto size_data = sizes.data();
+ out_h = size_data[0];
+ out_w = size_data[1];
+ }
+ auto* output = output_t->mutable_data(
+ {out_dims[0], out_dims[1], out_h, out_w}, ctx.GetPlace());
+
int batch_size = input_t->dims()[0];
int channels = input_t->dims()[1];
int in_h = input_t->dims()[2];
@@ -139,8 +150,8 @@ class BilinearInterpGradOpCUDAKernel : public framework::OpKernel {
void Compute(const framework::ExecutionContext& ctx) const override {
auto* d_input_t = ctx.Output(framework::GradVarName("X"));
auto* d_output_t = ctx.Input(framework::GradVarName("Out"));
- auto* d_input = d_input_t->mutable_data(ctx.GetPlace());
auto* d_output = d_output_t->data();
+    auto* d_input = d_input_t->mutable_data<T>(ctx.GetPlace());
auto& device_ctx =
ctx.template device_context();
@@ -149,6 +160,16 @@ class BilinearInterpGradOpCUDAKernel : public framework::OpKernel {
int out_h = ctx.Attr("out_h");
int out_w = ctx.Attr("out_w");
+
+ auto out_size_t = ctx.Input("OutSize");
+ if (out_size_t != nullptr) {
+ Tensor sizes;
+ framework::TensorCopy(*out_size_t, platform::CPUPlace(), &sizes);
+ auto size_data = sizes.data();
+ out_h = size_data[0];
+ out_w = size_data[1];
+ }
+
int batch_size = d_input_t->dims()[0];
int channels = d_input_t->dims()[1];
int in_h = d_input_t->dims()[2];
diff --git a/paddle/fluid/operators/bilinear_interp_op.h b/paddle/fluid/operators/bilinear_interp_op.h
index f6cd77e4d49b53ecde6a84908cdffc7e1e02ac6a..8b03cd5a0635584a45782fe5a4823c37fe4fa8e8 100644
--- a/paddle/fluid/operators/bilinear_interp_op.h
+++ b/paddle/fluid/operators/bilinear_interp_op.h
@@ -24,11 +24,18 @@ class BilinearInterpKernel : public framework::OpKernel {
void Compute(const framework::ExecutionContext& ctx) const override {
auto* input_t = ctx.Input("X"); // float tensor
auto* output_t = ctx.Output("Out"); // float tensor
+ auto out_dims = output_t->dims();
auto* input = input_t->data();
- auto* output = output_t->mutable_data(ctx.GetPlace());
-
int out_h = ctx.Attr("out_h");
int out_w = ctx.Attr("out_w");
+ auto out_size_t = ctx.Input("OutSize");
+ if (out_size_t != nullptr) {
+ auto out_size_data = out_size_t->data();
+ out_h = out_size_data[0];
+ out_w = out_size_data[1];
+ }
+ auto* output = output_t->mutable_data(
+ {out_dims[0], out_dims[1], out_h, out_w}, ctx.GetPlace());
int batch_size = input_t->dims()[0];
int channels = input_t->dims()[1];
int in_h = input_t->dims()[2];
@@ -83,9 +90,8 @@ class BilinearInterpGradKernel : public framework::OpKernel {
void Compute(const framework::ExecutionContext& ctx) const override {
auto* d_input_t = ctx.Output(framework::GradVarName("X"));
auto* d_output_t = ctx.Input(framework::GradVarName("Out"));
- auto* d_input = d_input_t->mutable_data(ctx.GetPlace());
auto* d_output = d_output_t->data();
-
+    auto* d_input = d_input_t->mutable_data<T>(ctx.GetPlace());
auto& device_ctx =
ctx.template device_context();
math::SetConstant zero;
@@ -93,6 +99,14 @@ class BilinearInterpGradKernel : public framework::OpKernel {
int out_h = ctx.Attr("out_h");
int out_w = ctx.Attr("out_w");
+
+ auto out_size_t = ctx.Input("OutSize");
+ if (out_size_t != nullptr) {
+ auto out_size_data = out_size_t->data();
+ out_h = out_size_data[0];
+ out_w = out_size_data[1];
+ }
+
int batch_size = d_input_t->dims()[0];
int channels = d_input_t->dims()[1];
int in_h = d_input_t->dims()[2];
diff --git a/paddle/fluid/operators/detail/CMakeLists.txt b/paddle/fluid/operators/detail/CMakeLists.txt
index b9a66474c9afc27462f9c47af1a0465e2cec70bc..cf20530513cf6cd420e56b2f6378225f73c2bc8b 100644
--- a/paddle/fluid/operators/detail/CMakeLists.txt
+++ b/paddle/fluid/operators/detail/CMakeLists.txt
@@ -1,6 +1,7 @@
if(WITH_DISTRIBUTE)
grpc_library(sendrecvop_grpc SRCS bytebuffer_stream.cc sendrecvop_utils.cc grpc_client.cc
- grpc_server.cc variable_response.cc PROTO send_recv.proto DEPS lod_tensor selected_rows)
+ request_handler_impl.cc rpc_server.cc grpc_server.cc variable_response.cc PROTO send_recv.proto DEPS lod_tensor
+ selected_rows memory)
set(DISTRIBUTE_COMPILE_FLAGS "-Wno-non-virtual-dtor -Wno-error=non-virtual-dtor -Wno-error=delete-non-virtual-dtor")
set_source_files_properties(serde_test.cc grpc_server_test.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
cc_test(serde_test SRCS serde_test.cc variable_response.cc DEPS grpc++_unsecure grpc_unsecure gpr
diff --git a/paddle/fluid/operators/detail/grpc_client.cc b/paddle/fluid/operators/detail/grpc_client.cc
index f7ce7786874285795878b655365974f082c00b44..da9ca1a0c1d55018141f0e4285fe35d7c437fd55 100644
--- a/paddle/fluid/operators/detail/grpc_client.cc
+++ b/paddle/fluid/operators/detail/grpc_client.cc
@@ -205,6 +205,8 @@ void RPCClient::AsyncSendFetchBarrier(const std::string& ep, int64_t time_out) {
}
bool RPCClient::Wait() {
+ VLOG(3) << "RPCClient begin Wait()"
+ << " req_count_:" << req_count_;
if (req_count_ <= 0) {
return true;
}
diff --git a/paddle/fluid/operators/detail/grpc_server.cc b/paddle/fluid/operators/detail/grpc_server.cc
index 361cc24b5ba11e2654f1282327730befaeca9f55..e73756d89004bc48339c0aa31dd0857c2ca6722d 100644
--- a/paddle/fluid/operators/detail/grpc_server.cc
+++ b/paddle/fluid/operators/detail/grpc_server.cc
@@ -1,4 +1,4 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+/*Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
@@ -12,19 +12,12 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
-#include "paddle/fluid/operators/detail/grpc_server.h"
-
#include
#include
-using ::grpc::ServerAsyncResponseWriter;
+#include "paddle/fluid/operators/detail/grpc_server.h"
-DEFINE_int32(rpc_server_handle_send_threads, 20,
- "Number of threads used to handle send at rpc server.");
-DEFINE_int32(rpc_server_handle_get_threads, 20,
- "Number of threads used to handle get at rpc server.");
-DEFINE_int32(rpc_server_handle_prefetch_threads, 1,
- "Number of threads used to handle prefetch at rpc server.");
+using ::grpc::ServerAsyncResponseWriter;
namespace paddle {
namespace operators {
@@ -36,49 +29,40 @@ enum CallStatus { PROCESS = 0, FINISH };
class RequestBase {
public:
explicit RequestBase(GrpcService::AsyncService* service,
- ::grpc::ServerCompletionQueue* cq, bool sync_mode,
- const platform::DeviceContext* dev_ctx)
+ ::grpc::ServerCompletionQueue* cq,
+ RequestHandler* request_handler, int req_id)
: service_(service),
cq_(cq),
- sync_mode_(sync_mode),
status_(PROCESS),
- dev_ctx_(dev_ctx) {
+ request_handler_(request_handler),
+ req_id_(req_id) {
PADDLE_ENFORCE(cq_);
}
virtual ~RequestBase() {}
- virtual void Process() { assert(false); }
+ virtual void Process() = 0;
CallStatus Status() { return status_; }
void SetStatus(CallStatus status) { status_ = status; }
- virtual std::string GetReqName() {
- assert(false);
- return "";
- }
+ virtual std::string GetReqName() = 0;
protected:
::grpc::ServerContext ctx_;
GrpcService::AsyncService* service_;
::grpc::ServerCompletionQueue* cq_;
- const bool sync_mode_;
CallStatus status_;
- const platform::DeviceContext* dev_ctx_;
+ RequestHandler* request_handler_;
+ int req_id_;
};
class RequestSend final : public RequestBase {
public:
explicit RequestSend(GrpcService::AsyncService* service,
- ::grpc::ServerCompletionQueue* cq, bool sync_mode,
- framework::Scope* scope, ReceivedQueue* queue,
- const platform::DeviceContext* dev_ctx, int req_id)
- : RequestBase(service, cq, sync_mode, dev_ctx),
- queue_(queue),
- responder_(&ctx_),
- req_id_(req_id) {
- if (sync_mode_) {
- request_.reset(new VariableResponse(scope, dev_ctx_, false));
- } else {
- request_.reset(new VariableResponse(scope, dev_ctx_, true));
- }
+ ::grpc::ServerCompletionQueue* cq,
+ RequestHandler* request_handler, int req_id)
+ : RequestBase(service, cq, request_handler, req_id), responder_(&ctx_) {
+ request_.reset(new VariableResponse(request_handler->scope(),
+ request_handler->dev_ctx(),
+ !request_handler->sync_mode()));
int method_id = static_cast(detail::GrpcMethod::kSendVariable);
service_->RequestAsyncUnary(
method_id, &ctx_, request_.get(), &responder_, cq_, cq_,
@@ -87,12 +71,17 @@ class RequestSend final : public RequestBase {
virtual ~RequestSend() {}
- virtual std::string GetReqName() { return request_->Varname(); }
+ std::string GetReqName() override { return request_->Varname(); }
+
+ void Process() override {
+ std::string varname = GetReqName();
+ VLOG(3) << "RequestSend var_name:" << varname;
- virtual void Process() {
- std::string var_name = GetReqName();
- VLOG(3) << "RequestSend " << var_name;
- queue_->Push(std::make_pair(var_name, request_));
+ auto scope = request_->GetMutableLocalScope();
+ auto invar = request_->GetVar();
+ framework::Variable* outvar = nullptr;
+
+ request_handler_->Handle(varname, scope, invar, &outvar);
status_ = FINISH;
responder_.Finish(reply_, ::grpc::Status::OK,
@@ -102,105 +91,85 @@ class RequestSend final : public RequestBase {
protected:
sendrecv::VoidMessage reply_;
std::shared_ptr request_;
- ReceivedQueue* queue_;
ServerAsyncResponseWriter responder_;
- int req_id_;
};
class RequestGet final : public RequestBase {
public:
explicit RequestGet(GrpcService::AsyncService* service,
- ::grpc::ServerCompletionQueue* cq, bool sync_mode,
- framework::Scope* scope,
- const platform::DeviceContext* dev_ctx,
- framework::BlockingQueue* queue,
- int req_id)
- : RequestBase(service, cq, sync_mode, dev_ctx),
- responder_(&ctx_),
- scope_(scope),
- queue_(queue),
- req_id_(req_id) {
+ ::grpc::ServerCompletionQueue* cq,
+ RequestHandler* request_handler, int req_id)
+ : RequestBase(service, cq, request_handler, req_id), responder_(&ctx_) {
auto method_id = static_cast(detail::GrpcMethod::kGetVariable);
service_->RequestAsyncUnary(
method_id, &ctx_, &request_, &responder_, cq_, cq_,
- reinterpret_cast(static_cast(req_id_)));
+        reinterpret_cast<void*>(static_cast<intptr_t>(req_id)));
}
virtual ~RequestGet() {}
- virtual std::string GetReqName() { return request_.varname(); }
+ std::string GetReqName() override { return request_.varname(); }
- virtual void Process() {
+ void Process() override {
// proc request.
- std::string var_name = request_.varname();
- VLOG(3) << "RequestGet " << var_name;
- auto* var = scope_->FindVar(var_name);
+ std::string varname = request_.varname();
+ VLOG(3) << "RequestGet " << varname;
+
+ auto scope = request_handler_->scope();
+ auto invar = scope->FindVar(varname);
+ framework::Variable* outvar = nullptr;
- if (var_name != FETCH_BARRIER_MESSAGE) {
- SerializeToByteBuffer(var_name, var, *dev_ctx_, &reply_);
+ request_handler_->Handle(varname, scope, invar, &outvar);
+
+ if (outvar) {
+ SerializeToByteBuffer(varname, outvar, *request_handler_->dev_ctx(),
+ &reply_);
}
status_ = FINISH;
responder_.Finish(reply_, ::grpc::Status::OK,
reinterpret_cast(static_cast(req_id_)));
-
- if (var_name == FETCH_BARRIER_MESSAGE) {
- sendrecv::VariableMessage msg;
- MessageWithName msg_with_name = std::make_pair(var_name, msg);
- queue_->Push(msg_with_name);
- }
}
protected:
sendrecv::VariableMessage request_;
::grpc::ByteBuffer reply_;
ServerAsyncResponseWriter<::grpc::ByteBuffer> responder_;
- framework::Scope* scope_;
- framework::BlockingQueue* queue_;
- int req_id_;
};
class RequestPrefetch final : public RequestBase {
public:
explicit RequestPrefetch(GrpcService::AsyncService* service,
- ::grpc::ServerCompletionQueue* cq, bool sync_mode,
- framework::Scope* scope,
- const platform::DeviceContext* dev_ctx,
- framework::Executor* executor,
- framework::ProgramDesc* program,
- framework::ExecutorPrepareContext* prefetch_ctx,
- int req_id)
- : RequestBase(service, cq, sync_mode, dev_ctx),
+ ::grpc::ServerCompletionQueue* cq,
+ RequestHandler* request_handler, int req_id)
+ : RequestBase(service, cq, request_handler, req_id),
responder_(&ctx_),
- scope_(scope),
- executor_(executor),
- program_(program),
- prefetch_ctx_(prefetch_ctx),
- req_id_(req_id) {
- // prefetch always create a new sub scope
- request_.reset(new VariableResponse(scope, dev_ctx_, true));
+ local_scope_(nullptr) {
+ request_.reset(new VariableResponse(request_handler->scope(),
+ request_handler->dev_ctx(), true));
int method_id = static_cast(detail::GrpcMethod::kPrefetchVariable);
service_->RequestAsyncUnary(
method_id, &ctx_, request_.get(), &responder_, cq_, cq_,
- reinterpret_cast(static_cast(req_id_)));
+        reinterpret_cast<void*>(static_cast<intptr_t>(req_id)));
}
virtual ~RequestPrefetch() {}
- virtual std::string GetReqName() { return request_->Varname(); }
+ std::string GetReqName() override { return request_->Varname(); }
- virtual void Process() {
+ void Process() override {
// prefetch process...
+ std::string varname = request_->OutVarname();
+ VLOG(3) << "RequestPrefetch " << varname;
+
+ auto scope = request_->GetMutableLocalScope();
+ auto invar = scope->FindVar(varname);
+ framework::Variable* outvar = nullptr;
- std::string var_name = request_->OutVarname();
- VLOG(3) << "RequestPrefetch " << var_name;
- auto var_desc = program_->Block(0).FindVar(var_name);
- framework::Scope* local_scope = request_->GetMutableLocalScope();
- auto* var = local_scope->FindVar(var_name);
- InitializeVariable(var, var_desc->GetType());
- executor_->RunPreparedContext(prefetch_ctx_, local_scope);
+ request_handler_->Handle(varname, scope, invar, &outvar);
- SerializeToByteBuffer(var_name, var, *dev_ctx_, &reply_);
+ SerializeToByteBuffer(varname, outvar, *request_handler_->dev_ctx(),
+ &reply_);
status_ = FINISH;
responder_.Finish(reply_, ::grpc::Status::OK,
@@ -211,202 +180,169 @@ class RequestPrefetch final : public RequestBase {
std::shared_ptr request_;
::grpc::ByteBuffer reply_;
ServerAsyncResponseWriter<::grpc::ByteBuffer> responder_;
- framework::Scope* scope_;
- framework::Executor* executor_;
- framework::ProgramDesc* program_;
- framework::ExecutorPrepareContext* prefetch_ctx_;
- int req_id_;
+ framework::Scope* local_scope_;
};
-void AsyncGRPCServer::WaitClientGet(int count) {
- int fetch_barriers = 0;
- while (fetch_barriers < count) {
- auto msg = var_get_queue_.Pop();
- if (msg.first == FETCH_BARRIER_MESSAGE) {
- fetch_barriers++;
- }
- }
-}
-
void AsyncGRPCServer::WaitServerReady() {
+ VLOG(3) << "AsyncGRPCServer is wait server ready";
std::unique_lock lock(this->mutex_ready_);
condition_ready_.wait(lock, [=] { return this->ready_ == 1; });
+ VLOG(3) << "AsyncGRPCServer WaitSeverReady";
}
-void AsyncGRPCServer::RunSyncUpdate() {
+void AsyncGRPCServer::StartServer() {
::grpc::ServerBuilder builder;
- builder.AddListeningPort(address_, ::grpc::InsecureServerCredentials(),
+ builder.AddListeningPort(bind_address_, ::grpc::InsecureServerCredentials(),
&selected_port_);
+
builder.SetMaxSendMessageSize(std::numeric_limits::max());
builder.SetMaxReceiveMessageSize(std::numeric_limits::max());
builder.RegisterService(&service_);
- cq_send_ = builder.AddCompletionQueue();
- cq_get_ = builder.AddCompletionQueue();
- cq_prefetch_ = builder.AddCompletionQueue();
+ for (auto t : rpc_call_map_) {
+ rpc_cq_[t.first].reset(builder.AddCompletionQueue().release());
+ }
server_ = builder.BuildAndStart();
- LOG(INFO) << "Server listening on " << address_
+ LOG(INFO) << "Server listening on " << bind_address_
<< " selected port: " << selected_port_;
- std::function send_register = std::bind(
- &AsyncGRPCServer::TryToRegisterNewSendOne, this, std::placeholders::_1);
- std::function get_register = std::bind(
- &AsyncGRPCServer::TryToRegisterNewGetOne, this, std::placeholders::_1);
- std::function prefetch_register =
- std::bind(&AsyncGRPCServer::TryToRegisterNewPrefetchOne, this,
- std::placeholders::_1);
+  std::function<void(const std::string&, int)> f =
+ std::bind(&AsyncGRPCServer::TryToRegisterNewOne, this,
+ std::placeholders::_1, std::placeholders::_2);
- for (int i = 0; i < kSendReqsBufSize; ++i) {
- TryToRegisterNewSendOne(i);
- }
- for (int i = 0; i < kGetReqsBufSize; ++i) {
- TryToRegisterNewGetOne(i);
- }
- for (int i = 0; i < kPrefetchReqsBufSize; ++i) {
- TryToRegisterNewPrefetchOne(i);
- }
+ for (auto& t : rpc_call_map_) {
+ auto& rpc_name = t.first;
+ auto& cq = rpc_cq_[rpc_name];
+ auto threadnum = rpc_thread_num_[rpc_name];
+ auto& reqs = rpc_reqs_[rpc_name];
- for (int i = 0; i < FLAGS_rpc_server_handle_send_threads; ++i) {
- t_sends_.emplace_back(
- new std::thread(std::bind(&AsyncGRPCServer::HandleRequest, this,
- cq_send_.get(), "cq_send", send_register)));
- }
- for (int i = 0; i < FLAGS_rpc_server_handle_get_threads; ++i) {
- t_gets_.emplace_back(
- new std::thread(std::bind(&AsyncGRPCServer::HandleRequest, this,
- cq_get_.get(), "cq_get", get_register)));
- }
- for (int i = 0; i < FLAGS_rpc_server_handle_prefetch_threads; ++i) {
- t_prefetchs_.emplace_back(new std::thread(
- std::bind(&AsyncGRPCServer::HandleRequest, this, cq_prefetch_.get(),
- "cq_prefetch", prefetch_register)));
+ reqs.reserve(kRequestBufSize);
+
+ for (int i = 0; i < kRequestBufSize; i++) {
+ TryToRegisterNewOne(rpc_name, i);
+ }
+
+ for (int i = 0; i < threadnum; i++) {
+ rpc_threads_[rpc_name].emplace_back(new std::thread(std::bind(
+ &AsyncGRPCServer::HandleRequest, this, cq.get(), rpc_name, f)));
+ VLOG(3) << t.first << " creates threads!";
+ }
}
+
{
std::lock_guard lock(this->mutex_ready_);
ready_ = 1;
}
condition_ready_.notify_all();
+
// wait server
server_->Wait();
- for (int i = 0; i < FLAGS_rpc_server_handle_send_threads; ++i) {
- t_sends_[i]->join();
- }
- for (int i = 0; i < FLAGS_rpc_server_handle_get_threads; ++i) {
- t_gets_[i]->join();
- }
- for (int i = 0; i < FLAGS_rpc_server_handle_prefetch_threads; ++i) {
- t_prefetchs_[i]->join();
+
+ for (auto& t : rpc_threads_) {
+ auto& threads = t.second;
+ for (size_t i = 0; i < threads.size(); ++i) {
+ threads[i]->join();
+ VLOG(3) << t.first << " threads ends!";
+ }
}
}
void AsyncGRPCServer::ShutdownQueue() {
- std::unique_lock lock(cq_mutex_);
- cq_send_->Shutdown();
- cq_get_->Shutdown();
- cq_prefetch_->Shutdown();
+ for (auto& t : rpc_cq_) {
+ t.second->Shutdown();
+ VLOG(3) << t.first << " shutdown!";
+ }
}
-// This URL explains why shutdown is complicate:
-void AsyncGRPCServer::ShutDown() {
+void AsyncGRPCServer::ShutDownImpl() {
+  std::unique_lock<std::mutex> lock(cq_mutex_);
is_shut_down_ = true;
ShutdownQueue();
+
+ VLOG(3) << "server_ shutdown!";
server_->Shutdown();
}
-void AsyncGRPCServer::TryToRegisterNewSendOne(int i) {
+void AsyncGRPCServer::TryToRegisterNewOne(const std::string& rpc_name,
+ int req_id) {
std::unique_lock lock(cq_mutex_);
if (is_shut_down_) {
VLOG(3) << "shutdown, do not TryToRegisterNewSendOne";
return;
}
- RequestSend* send = new RequestSend(&service_, cq_send_.get(), sync_mode_,
- scope_, &var_recv_queue_, dev_ctx_, i);
- send_reqs_[i] = static_cast(send);
- VLOG(4) << "Create RequestSend status:" << send->Status();
-}
-void AsyncGRPCServer::TryToRegisterNewGetOne(int req_id) {
- std::unique_lock lock(cq_mutex_);
- if (is_shut_down_) {
- VLOG(3) << "shutdown, do not TryToRegisterNewGetOne";
- return;
+ VLOG(4) << "register send rpc_name:" << rpc_name
+ << ", handler:" << rpc_call_map_[kRequestSend];
+
+ auto& reqs = rpc_reqs_[rpc_name];
+ auto& handler = rpc_call_map_[rpc_name];
+ auto& cq = rpc_cq_[rpc_name];
+
+ RequestBase* b = nullptr;
+ if (rpc_name == kRequestSend) {
+ b = new RequestSend(&service_, cq.get(), handler, req_id);
+ } else if (rpc_name == kRequestGet) {
+ b = new RequestGet(&service_, cq.get(), handler, req_id);
+ } else if (rpc_name == kRequestPrefetch) {
+ b = new RequestPrefetch(&service_, cq.get(), handler, req_id);
+ } else {
+ PADDLE_ENFORCE(false, "not surpported rpc");
}
- RequestGet* get = new RequestGet(&service_, cq_get_.get(), sync_mode_, scope_,
- dev_ctx_, &var_get_queue_, req_id);
- get_reqs_[req_id] = static_cast(get);
- VLOG(4) << "Create RequestGet status:" << get->Status();
-}
-void AsyncGRPCServer::TryToRegisterNewPrefetchOne(int req_id) {
- std::unique_lock lock(cq_mutex_);
- if (is_shut_down_) {
- VLOG(3) << "shutdown, do not TryToRegisterNewPrefetchOne";
- return;
- }
- RequestPrefetch* prefetch = new RequestPrefetch(
- &service_, cq_prefetch_.get(), sync_mode_, scope_, dev_ctx_, executor_,
- program_, prefetch_ctx_.get(), req_id);
- prefetch_reqs_[req_id] = static_cast(prefetch);
+ reqs[req_id] = b;
- VLOG(4) << "Create RequestPrefetch status:" << prefetch->Status();
+ VLOG(4) << "Create RequestSend status:" << b->Status();
}
-// FIXME(typhoonzero): change cq_name to enum.
void AsyncGRPCServer::HandleRequest(
- ::grpc::ServerCompletionQueue* cq, const std::string& cq_name,
- std::function TryToRegisterNewOne) {
+ ::grpc::ServerCompletionQueue* cq, const std::string& rpc_name,
+    std::function<void(const std::string&, int)> TryToRegisterNewOne) {
void* tag = NULL;
bool ok = false;
while (true) {
- VLOG(3) << "HandleRequest for " << cq_name << " wait Next";
+ VLOG(3) << "HandleRequest " << rpc_name << " wait next";
if (!cq->Next(&tag, &ok)) {
- LOG(INFO) << cq_name << " CompletionQueue shutdown!";
+ LOG(INFO) << "CompletionQueue " << rpc_name << " shutdown!";
break;
}
- VLOG(3) << "HandleRequest for " << cq_name << " get Next";
- int req_id = static_cast(reinterpret_cast(tag));
- if (sync_mode_) {
- // FIXME(typhoonzero): de-couple the barriers with recv_op
- if (!is_shut_down_ && cq_name == "cq_get") WaitCond(1);
- if (!is_shut_down_ && cq_name == "cq_send") WaitCond(0);
- VLOG(3) << "HandleRequest for " << cq_name << " after WaitCond";
- }
+    int req_id = static_cast<int>(reinterpret_cast<intptr_t>(tag));
+ VLOG(3) << "HandleRequest " << rpc_name << ", req_id:" << req_id
+ << " get next";
+ auto& reqs = rpc_reqs_[rpc_name];
RequestBase* base = nullptr;
{
- std::lock_guard l(cq_mutex_);
- if (cq_name == "cq_get") {
- base = get_reqs_[req_id];
- } else if (cq_name == "cq_send") {
- base = send_reqs_[req_id];
- } else if (cq_name == "cq_prefetch") {
- base = prefetch_reqs_[req_id];
- }
+ PADDLE_ENFORCE(req_id >= 0 && req_id < kRequestBufSize);
+      std::unique_lock<std::mutex> lock(cq_mutex_);
+ base = reqs[req_id];
}
+
// reference:
// https://github.com/tensorflow/tensorflow/issues/5596
// https://groups.google.com/forum/#!topic/grpc-io/xftlRy-IQwM
// https://groups.google.com/forum/#!topic/grpc-io/ywATt88Ef_I
if (!ok) {
- LOG(WARNING) << cq_name << " recv no regular event:argument name["
+ LOG(WARNING) << "completion queue:" << rpc_name
+ << " recv no regular event:argument name["
<< base->GetReqName() << "]";
- TryToRegisterNewOne(req_id);
+ TryToRegisterNewOne(rpc_name, req_id);
delete base;
continue;
}
+ VLOG(3) << "queue id:" << rpc_name << ", req_id:" << req_id
+ << ", status:" << base->Status();
+
switch (base->Status()) {
case PROCESS: {
base->Process();
- VLOG(4) << cq_name << " PROCESS status:" << base->Status();
break;
}
case FINISH: {
- TryToRegisterNewOne(req_id);
- VLOG(4) << cq_name << " FINISH status:" << base->Status();
+ TryToRegisterNewOne(rpc_name, req_id);
delete base;
break;
}
@@ -415,20 +351,6 @@ void AsyncGRPCServer::HandleRequest(
}
}
-void AsyncGRPCServer::WaitCond(int cond) {
- std::unique_lock lock(this->barrier_mutex_);
- barrier_condition_.wait(lock,
- [=] { return this->barrier_cond_step_ == cond; });
-}
-
-void AsyncGRPCServer::SetCond(int cond) {
- {
- std::lock_guard lock(this->barrier_mutex_);
- barrier_cond_step_ = cond;
- }
- barrier_condition_.notify_all();
-}
-
} // namespace detail
} // namespace operators
} // namespace paddle
diff --git a/paddle/fluid/operators/detail/grpc_server.h b/paddle/fluid/operators/detail/grpc_server.h
index bdff9801a928699f8391bfb68c1c7bd2d75aa642..d1fcbc414f123c5c4810d9cecf807a406aa2c405 100644
--- a/paddle/fluid/operators/detail/grpc_server.h
+++ b/paddle/fluid/operators/detail/grpc_server.h
@@ -14,6 +14,8 @@ limitations under the License. */
#pragma once
+#include