diff --git a/Dockerfile b/Dockerfile
index 4d6165b79a1d94b8f27d7f3ee1b6e2cee5992d31..752fea5951bdc8c2cf79a17c960217c88ae62571 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -24,7 +24,7 @@ COPY ./paddle/scripts/docker/root/ /root/
RUN apt-get update && \
apt-get install -y --allow-downgrades \
- git python-pip python-dev openssh-server bison \
+ git python-pip python-dev python-opencv openssh-server bison \
libnccl2=2.1.2-1+cuda8.0 libnccl-dev=2.1.2-1+cuda8.0 \
wget unzip unrar tar xz-utils bzip2 gzip coreutils ntp \
curl sed grep graphviz libjpeg-dev zlib1g-dev \
@@ -76,8 +76,7 @@ RUN easy_install -U pip && \
pip install sphinx-rtd-theme==0.1.9 recommonmark
RUN pip install pre-commit 'ipython==5.3.0' && \
- pip install 'ipykernel==4.6.0' 'jupyter==1.0.0' && \
- pip install opencv-python
+ pip install 'ipykernel==4.6.0' 'jupyter==1.0.0'
#For docstring checker
RUN pip install pylint pytest astroid isort
diff --git a/benchmark/.gitignore b/benchmark/.gitignore
index 7b66e8a5b5020fd847982db401665d24ba3a069c..fb4114356d4f37efc8ad672316fd4f99443d9fcd 100644
--- a/benchmark/.gitignore
+++ b/benchmark/.gitignore
@@ -7,3 +7,6 @@ paddle/rnn/imdb.pkl
caffe/image/logs
tensorflow/image/logs
tensorflow/rnn/logs
+fluid/models/*.pyc
+fluid/logs
+fluid/nohup.out
diff --git a/benchmark/fluid/Dockerfile b/benchmark/fluid/Dockerfile
new file mode 100644
index 0000000000000000000000000000000000000000..8298fcf95a5074bce9533e04d54dab79a1460286
--- /dev/null
+++ b/benchmark/fluid/Dockerfile
@@ -0,0 +1,22 @@
+FROM nvidia/cuda:9.0-cudnn7-devel-ubuntu16.04
+RUN apt-get update && apt-get install -y python python-pip iputils-ping libgtk2.0-dev wget vim net-tools iftop python-opencv
+RUN ln -s /usr/lib/x86_64-linux-gnu/libcudnn.so.7 /usr/lib/libcudnn.so && ln -s /usr/lib/x86_64-linux-gnu/libnccl.so.2 /usr/lib/libnccl.so
+RUN pip install -U pip
+RUN pip install -U kubernetes paddlepaddle
+
+# IMPORTANT:
+# Add "ENV http_proxy=http://ip:port" if your download is slow, and don't forget to unset it at runtime.
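+# For example (hypothetical proxy address, adjust to your own environment):
+#   ENV http_proxy=http://proxy.example.com:3128
+#   ENV https_proxy=http://proxy.example.com:3128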
+
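+# The RUN lines below pre-download the datasets used by the benchmarks so that cluster
+# jobs do not have to fetch them at runtime; the pip-installed paddlepaddle above is only
+# needed for these downloads and is uninstalled again afterwards (a locally built whl is
+# installed later instead).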
+RUN sh -c 'echo "import paddle.v2 as paddle\npaddle.dataset.cifar.train10()\npaddle.dataset.flowers.fetch()" | python'
+RUN sh -c 'echo "import paddle.v2 as paddle\npaddle.dataset.mnist.train()\npaddle.dataset.mnist.test()\npaddle.dataset.imdb.fetch()" | python'
+RUN sh -c 'echo "import paddle.v2 as paddle\npaddle.dataset.imikolov.fetch()" | python'
+RUN pip uninstall -y paddlepaddle && mkdir /workspace
+
+ADD https://raw.githubusercontent.com/PaddlePaddle/cloud/develop/docker/paddle_k8s /usr/bin
+ADD https://raw.githubusercontent.com/PaddlePaddle/cloud/develop/docker/k8s_tools.py /root
+
+ADD *.whl /
+RUN pip install /*.whl && rm -f /*.whl && chmod +x /usr/bin/paddle_k8s
+
+ENV LD_LIBRARY_PATH=/usr/local/lib
+ADD fluid_benchmark.py dataset.py models/ /workspace/
diff --git a/benchmark/fluid/README.md b/benchmark/fluid/README.md
index 7071e9fdcd394a5a4db4d0d599610a72d98c0a3c..1b0c7dce8bd6faab0c4c59caa1cbe337483cbd16 100644
--- a/benchmark/fluid/README.md
+++ b/benchmark/fluid/README.md
@@ -44,11 +44,25 @@ Currently supported `--model` argument include:
## Run Distributed Benchmark on Kubernetes Cluster
+You may need to build a Docker image before submitting a cluster job onto Kubernetes; otherwise you
+will have to start all of those processes manually on each node, which is not recommended.
+
+To build the Docker image, you need to choose a PaddlePaddle "whl" package to run with. You may either
+download one from
+http://www.paddlepaddle.org/docs/develop/documentation/zh/build_and_install/pip_install_en.html or
+build it yourself. Once you have the "whl" package, put it under the current directory and run:
+
+```bash
+docker build -t [your docker image name]:[your docker image tag] .
+```
+
+Then push the image to a Docker registry that your Kubernetes cluster can reach.
+
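+For example, assuming a hypothetical registry address `registry.example.com`, tagging and pushing
+the image might look like:
+
+```bash
+docker tag [your docker image name]:[your docker image tag] registry.example.com/[your docker image name]:[your docker image tag]
+docker push registry.example.com/[your docker image name]:[your docker image tag]
+```
+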
We provide a script `kube_gen_job.py` to generate Kubernetes yaml files to submit
distributed benchmark jobs to your cluster. To generate a job yaml, just run:
```bash
-python kube_gen_job.py --jobname myjob --pscpu 4 --cpu 8 --gpu 8 --psmemory 20 --memory 40 --pservers 4 --trainers 4 --entry "python fluid_benchmark.py --model mnist --parallel 1 --device GPU --update_method pserver " --disttype pserver
+python kube_gen_job.py --jobname myjob --pscpu 4 --cpu 8 --gpu 8 --psmemory 20 --memory 40 --pservers 4 --trainers 4 --entry "python fluid_benchmark.py --model mnist --gpus 8 --device GPU --update_method pserver " --disttype pserver
```
Then the yaml files are generated under directory `myjob`, you can run:
diff --git a/benchmark/fluid/fluid_benchmark.py b/benchmark/fluid/fluid_benchmark.py
index c1d458970a58bfac2a3369e8964eb100568b28f2..49f26255f315c3c368f42b367dfc6487ffa0deb5 100644
--- a/benchmark/fluid/fluid_benchmark.py
+++ b/benchmark/fluid/fluid_benchmark.py
@@ -40,10 +40,7 @@ def parse_args():
parser.add_argument(
'--batch_size', type=int, default=32, help='The minibatch size.')
parser.add_argument(
- '--learning_rate',
- type=float,
- default=0.001,
- help='The minibatch size.')
+ '--learning_rate', type=float, default=0.001, help='The learning rate.')
# TODO(wuyi): add "--use_fake_data" option back.
parser.add_argument(
'--skip_batch_num',
@@ -72,6 +69,11 @@ def parse_args():
type=int,
default=1,
help='If gpus > 1, will use ParallelExecutor to run, else use Executor.')
+ parser.add_argument(
+ '--cpus',
+ type=int,
+ default=1,
+ help='If cpus > 1, will use ParallelDo to run, else use Executor.')
parser.add_argument(
'--data_set',
type=str,
@@ -88,8 +90,8 @@ def parse_args():
help='If set, use nvprof for CUDA.')
parser.add_argument(
'--no_test',
- action='store_false',
- help='If set, test the testset during training.')
+ action='store_true',
+ help='If set, do not test the testset during training.')
parser.add_argument(
'--memory_optimize',
action='store_true',
@@ -231,13 +233,10 @@ def train(avg_loss, infer_prog, optimizer, train_reader, test_reader, batch_acc,
train_losses.append(loss)
print("Pass: %d, Iter: %d, Loss: %f\n" %
(pass_id, iters, np.mean(train_losses)))
- train_elapsed = time.time() - start_time
- examples_per_sec = num_samples / train_elapsed
- print('\nTotal examples: %d, total time: %.5f, %.5f examples/sec\n' %
- (num_samples, train_elapsed, examples_per_sec))
- print("Pass: %d, Loss: %f" % (pass_id, np.mean(train_losses)))
+ print_train_time(start_time, time.time(), num_samples)
+ print("Pass: %d, Loss: %f" % (pass_id, np.mean(train_losses))),
# evaluation
- if not args.no_test and batch_acc != None:
+ if not args.no_test and batch_acc:
pass_test_acc = test(exe, infer_prog, test_reader, feeder,
batch_acc)
print(", Test Accuracy: %f" % pass_test_acc)
@@ -315,11 +314,8 @@ def train_parallel(avg_loss, infer_prog, optimizer, train_reader, test_reader,
if batch_id % 1 == 0:
print("Pass %d, batch %d, loss %s" %
(pass_id, batch_id, np.array(loss)))
- train_elapsed = time.time() - start_time
- examples_per_sec = num_samples / train_elapsed
- print('\nTotal examples: %d, total time: %.5f, %.5f examples/sed\n' %
- (num_samples, train_elapsed, examples_per_sec))
- if not args.no_test and batch_acc != None:
+ print_train_time(start_time, time.time(), num_samples)
+ if not args.no_test and batch_acc:
test_acc = test(startup_exe, infer_prog, test_reader, feeder,
batch_acc)
print("Pass: %d, Test Accuracy: %f\n" % (pass_id, test_acc))
@@ -329,12 +325,19 @@ def train_parallel(avg_loss, infer_prog, optimizer, train_reader, test_reader,
def print_arguments(args):
vars(args)['use_nvprof'] = (vars(args)['use_nvprof'] and
vars(args)['device'] == 'GPU')
- print('----------- resnet Configuration Arguments -----------')
+ print('----------- Configuration Arguments -----------')
for arg, value in sorted(vars(args).iteritems()):
print('%s: %s' % (arg, value))
print('------------------------------------------------')
+def print_train_time(start_time, end_time, num_samples):
+ train_elapsed = end_time - start_time
+ examples_per_sec = num_samples / train_elapsed
+    print('\nTotal examples: %d, total time: %.5f, %.5f examples/sec\n' %
+ (num_samples, train_elapsed, examples_per_sec))
+
+
def main():
args = parse_args()
print_arguments(args)
@@ -342,7 +345,7 @@ def main():
# the unique trainer id, starting from 0, needed by trainer
# only
nccl_id_var, num_trainers, trainer_id = (
- None, 1, int(os.getenv("PADDLE_TRAINER_ID", "-1")))
+ None, 1, int(os.getenv("PADDLE_TRAINER_ID", "0")))
if args.use_cprof:
pr = cProfile.Profile()
diff --git a/benchmark/fluid/models/mnist.py b/benchmark/fluid/models/mnist.py
index d264bfc12bdb159c06dae81db4949b9ee17268e2..28a38a931cf6cfcd5dd858b363b3d29b70368315 100644
--- a/benchmark/fluid/models/mnist.py
+++ b/benchmark/fluid/models/mnist.py
@@ -69,15 +69,30 @@ def get_model(args):
images = fluid.layers.data(name='pixel', shape=[1, 28, 28], dtype=DTYPE)
label = fluid.layers.data(name='label', shape=[1], dtype='int64')
- # Train program
- predict = cnn_model(images)
- cost = fluid.layers.cross_entropy(input=predict, label=label)
- avg_cost = fluid.layers.mean(x=cost)
-
- # Evaluator
- batch_size_tensor = fluid.layers.create_tensor(dtype='int64')
- batch_acc = fluid.layers.accuracy(
- input=predict, label=label, total=batch_size_tensor)
+ if args.device == 'CPU' and args.cpus > 1:
+ places = fluid.layers.get_places(args.cpus)
+ pd = fluid.layers.ParallelDo(places)
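+        # ParallelDo splits each minibatch across the CPU places created above:
+        # read_input() yields this place's slice of a data layer, write_output()
+        # collects the per-place results, and calling pd() below returns them so
+        # they can be merged (here, averaged with fluid.layers.mean).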
+ with pd.do():
+ predict = cnn_model(pd.read_input(images))
+ label = pd.read_input(label)
+ cost = fluid.layers.cross_entropy(input=predict, label=label)
+ avg_cost = fluid.layers.mean(x=cost)
+ batch_acc = fluid.layers.accuracy(input=predict, label=label)
+
+ pd.write_output(avg_cost)
+ pd.write_output(batch_acc)
+
+ avg_cost, batch_acc = pd()
+ avg_cost = fluid.layers.mean(avg_cost)
+ batch_acc = fluid.layers.mean(batch_acc)
+ else:
+ # Train program
+ predict = cnn_model(images)
+ cost = fluid.layers.cross_entropy(input=predict, label=label)
+ avg_cost = fluid.layers.mean(x=cost)
+
+ # Evaluator
+ batch_acc = fluid.layers.accuracy(input=predict, label=label)
# inference program
inference_program = fluid.default_main_program().clone()
diff --git a/benchmark/fluid/models/resnet.py b/benchmark/fluid/models/resnet.py
index 9dec8911ed64e09285fb461c4a12adb601535316..f951f73a35dc4dc6f796178ebbc3e2886b2d7d8c 100644
--- a/benchmark/fluid/models/resnet.py
+++ b/benchmark/fluid/models/resnet.py
@@ -132,18 +132,33 @@ def get_model(args):
input = fluid.layers.data(name='data', shape=dshape, dtype='float32')
label = fluid.layers.data(name='label', shape=[1], dtype='int64')
- predict = model(input, class_dim)
- cost = fluid.layers.cross_entropy(input=predict, label=label)
- avg_cost = fluid.layers.mean(x=cost)
- batch_size_tensor = fluid.layers.create_tensor(dtype='int64')
- batch_acc = fluid.layers.accuracy(
- input=predict, label=label, total=batch_size_tensor)
+ if args.device == 'CPU' and args.cpus > 1:
+ places = fluid.layers.get_places(args.cpus)
+ pd = fluid.layers.ParallelDo(places)
+ with pd.do():
+ predict = model(pd.read_input(input), class_dim)
+ label = pd.read_input(label)
+ cost = fluid.layers.cross_entropy(input=predict, label=label)
+ avg_cost = fluid.layers.mean(x=cost)
+ batch_acc = fluid.layers.accuracy(input=predict, label=label)
+
+ pd.write_output(avg_cost)
+ pd.write_output(batch_acc)
+
+ avg_cost, batch_acc = pd()
+ avg_cost = fluid.layers.mean(avg_cost)
+ batch_acc = fluid.layers.mean(batch_acc)
+ else:
+ predict = model(input, class_dim)
+ cost = fluid.layers.cross_entropy(input=predict, label=label)
+ avg_cost = fluid.layers.mean(x=cost)
+ batch_acc = fluid.layers.accuracy(input=predict, label=label)
inference_program = fluid.default_main_program().clone()
with fluid.program_guard(inference_program):
inference_program = fluid.io.get_inference_program(
- target_vars=[batch_acc, batch_size_tensor])
+ target_vars=[batch_acc])
optimizer = fluid.optimizer.Momentum(learning_rate=0.01, momentum=0.9)
diff --git a/benchmark/fluid/models/stacked_dynamic_lstm.py b/benchmark/fluid/models/stacked_dynamic_lstm.py
index 81a28b5f3aed0c325398b909d700c23df545824a..1b680d76a8ba1ead7c8c50065e1817c45b951b27 100644
--- a/benchmark/fluid/models/stacked_dynamic_lstm.py
+++ b/benchmark/fluid/models/stacked_dynamic_lstm.py
@@ -101,9 +101,8 @@ def get_model(args):
loss = fluid.layers.mean(x=loss)
# add acc
- batch_size_tensor = fluid.layers.create_tensor(dtype='int64')
batch_acc = fluid.layers.accuracy(input=logit, label=fluid.layers.data(name='label', \
- shape=[1], dtype='int64'), total=batch_size_tensor)
+ shape=[1], dtype='int64'))
inference_program = fluid.default_main_program().clone()
with fluid.program_guard(inference_program):
diff --git a/benchmark/fluid/run.sh b/benchmark/fluid/run.sh
index f6dfd20bf2ee0b668b6d4238d4511253b2233035..5d9b2db87135e53470b106dcd11a6bcfdc5dbda9 100644
--- a/benchmark/fluid/run.sh
+++ b/benchmark/fluid/run.sh
@@ -2,6 +2,7 @@
# This script benchmarking the PaddlePaddle Fluid on
# single thread single GPU.
+mkdir -p logs
#export FLAGS_fraction_of_gpu_memory_to_use=0.0
export CUDNN_PATH=/paddle/cudnn_v5
@@ -35,71 +36,74 @@ nohup stdbuf -oL nvidia-smi \
--format=csv \
--filename=mem.log \
-l 1 &
+
# mnist
# mnist gpu mnist 128
-FLAGS_benchmark=true stdbuf -oL python fluid/mnist.py \
+FLAGS_benchmark=true stdbuf -oL python fluid_benchmark.py \
+ --model=mnist \
--device=GPU \
--batch_size=128 \
--skip_batch_num=5 \
--iterations=500 \
- 2>&1 | tee -a mnist_gpu_128.log
+ 2>&1 | tee -a logs/mnist_gpu_128.log
# vgg16
# gpu cifar10 128
-FLAGS_benchmark=true stdbuf -oL python fluid/vgg16.py \
+FLAGS_benchmark=true stdbuf -oL python fluid_benchmark.py \
+ --model=vgg16 \
--device=GPU \
--batch_size=128 \
--skip_batch_num=5 \
--iterations=30 \
- 2>&1 | tee -a vgg16_gpu_128.log
+ 2>&1 | tee -a logs/vgg16_gpu_128.log
# flowers gpu 128
-FLAGS_benchmark=true stdbuf -oL python fluid/vgg16.py \
+FLAGS_benchmark=true stdbuf -oL python fluid_benchmark.py \
+ --model=vgg16 \
--device=GPU \
--batch_size=32 \
--data_set=flowers \
--skip_batch_num=5 \
--iterations=30 \
- 2>&1 | tee -a vgg16_gpu_flowers_32.log
+ 2>&1 | tee -a logs/vgg16_gpu_flowers_32.log
# resnet50
# resnet50 gpu cifar10 128
-FLAGS_benchmark=true stdbuf -oL python fluid/resnet50.py \
+FLAGS_benchmark=true stdbuf -oL python fluid_benchmark.py \
+ --model=resnet \
--device=GPU \
--batch_size=128 \
--data_set=cifar10 \
- --model=resnet_cifar10 \
--skip_batch_num=5 \
--iterations=30 \
- 2>&1 | tee -a resnet50_gpu_128.log
+ 2>&1 | tee -a logs/resnet50_gpu_128.log
# resnet50 gpu flowers 64
-FLAGS_benchmark=true stdbuf -oL python fluid/resnet50.py \
+FLAGS_benchmark=true stdbuf -oL python fluid_benchmark.py \
+ --model=resnet \
--device=GPU \
--batch_size=64 \
--data_set=flowers \
- --model=resnet_imagenet \
--skip_batch_num=5 \
--iterations=30 \
- 2>&1 | tee -a resnet50_gpu_flowers_64.log
+ 2>&1 | tee -a logs/resnet50_gpu_flowers_64.log
# lstm
# lstm gpu imdb 32 # tensorflow only support batch=32
-FLAGS_benchmark=true stdbuf -oL python fluid/stacked_dynamic_lstm.py \
+FLAGS_benchmark=true stdbuf -oL python fluid_benchmark.py \
+ --model=stacked_dynamic_lstm \
--device=GPU \
--batch_size=32 \
--skip_batch_num=5 \
--iterations=30 \
- --hidden_dim=512 \
- --emb_dim=512 \
- --crop_size=1500 \
- 2>&1 | tee -a lstm_gpu_32.log
+ 2>&1 | tee -a logs/lstm_gpu_32.log
# seq2seq
# seq2seq gpu wmb 128
-FLAGS_benchmark=true stdbuf -oL python fluid/machine_translation.py \
+FLAGS_benchmark=true stdbuf -oL python fluid_benchmark.py \
+ --model=machine_translation \
--device=GPU \
--batch_size=128 \
--skip_batch_num=5 \
--iterations=30 \
- 2>&1 | tee -a lstm_gpu_128.log
+ 2>&1 | tee -a logs/lstm_gpu_128.log
diff --git a/cmake/external/grpc.cmake b/cmake/external/grpc.cmake
index 9459f1ddfe85f5607880d3fdd968b494d6af592a..ffdf91a354bd92bdaf3f88344f0a9256638b568c 100644
--- a/cmake/external/grpc.cmake
+++ b/cmake/external/grpc.cmake
@@ -33,10 +33,19 @@ ELSE()
SET(BUILD_CMD make HAS_SYSTEM_PROTOBUF=false -s -j ${NUM_OF_PROCESSOR} static grpc_cpp_plugin)
ENDIF()
+# FIXME(wuyi): do not build zlib cares protobuf twice, find a way to build grpc with them
ExternalProject_Add(
extern_grpc
DEPENDS protobuf zlib
- URL "http://paddlepaddledeps.bj.bcebos.com/grpc.tar.xz"
+ # NOTE(wuyi):
+ # this package is generated by following steps:
+ # 1. git clone -b v1.8.x https://github.com/grpc/grpc.git
+ # 2. submodule update --init
+ # 3. keep only zlib, cares, protobuf, boringssl under "third_party",
+ # checkout and clean other dirs under third_party
+ # 4. remove .git, and package the directory.
+ URL "http://paddlepaddledeps.bj.bcebos.com/grpc-v1.8.x.tar.gz"
+ URL_MD5 "c9c58ee7d0e8929a63155af6a2ecdbd0"
PREFIX ${GRPC_SOURCES_DIR}
UPDATE_COMMAND ""
CONFIGURE_COMMAND ""
@@ -49,7 +58,6 @@ ExternalProject_Add(
INSTALL_COMMAND make prefix=${GRPC_INSTALL_DIR} install
)
-# FIXME(typhoonzero): hack to get static lib path, try a better way like merge them.
ADD_LIBRARY(grpc++_unsecure STATIC IMPORTED GLOBAL)
SET_PROPERTY(TARGET grpc++_unsecure PROPERTY IMPORTED_LOCATION
"${GRPC_INSTALL_DIR}/lib/libgrpc++_unsecure.a")
diff --git a/doc/fluid/api/io.rst b/doc/fluid/api/io.rst
index dd9d88b669957c22cd0a07fa4b7e219e2d6e5d61..3e956f8302d261b52f9f76ff8eb4a01f9c6381f8 100644
--- a/doc/fluid/api/io.rst
+++ b/doc/fluid/api/io.rst
@@ -59,3 +59,21 @@ get_inference_program
.. autofunction:: paddle.fluid.io.get_inference_program
:noindex:
+save_checkpoint
+---------------
+
+.. autofunction:: paddle.fluid.io.save_checkpoint
+ :noindex:
+
+load_checkpoint
+---------------
+
+.. autofunction:: paddle.fluid.io.load_checkpoint
+ :noindex:
+
+clean_checkpoint
+----------------
+
+.. autofunction:: paddle.fluid.io.clean_checkpoint
+ :noindex:
+
diff --git a/doc/fluid/api/layers.rst b/doc/fluid/api/layers.rst
index f53da4d194f8d2428b4121fa1bb31f3fc95a9f64..f78e6db3268e44d5f30d83508f07c4ed68106e48 100644
--- a/doc/fluid/api/layers.rst
+++ b/doc/fluid/api/layers.rst
@@ -181,6 +181,12 @@ Print
.. autofunction:: paddle.fluid.layers.Print
:noindex:
+is_empty
+--------
+
+.. autofunction:: paddle.fluid.layers.is_empty
+ :noindex:
+
device
======
@@ -255,6 +261,19 @@ double_buffer
.. autofunction:: paddle.fluid.layers.double_buffer
:noindex:
+random_data_generator
+---------------------
+
+.. autofunction:: paddle.fluid.layers.random_data_generator
+ :noindex:
+
+Preprocessor
+------------
+
+.. autoclass:: paddle.fluid.layers.Preprocessor
+ :members:
+ :noindex:
+
nn
==
@@ -594,6 +613,29 @@ roi_pool
.. autofunction:: paddle.fluid.layers.roi_pool
:noindex:
+dice_loss
+---------
+
+.. autofunction:: paddle.fluid.layers.dice_loss
+ :noindex:
+
+resize_bilinear
+---------------
+
+.. autofunction:: paddle.fluid.layers.resize_bilinear
+ :noindex:
+
+gather
+------
+
+.. autofunction:: paddle.fluid.layers.gather
+ :noindex:
+
+random_crop
+-----------
+
+.. autofunction:: paddle.fluid.layers.random_crop
+ :noindex:
ops
===
@@ -742,6 +784,12 @@ sum
.. autofunction:: paddle.fluid.layers.sum
:noindex:
+shape
+-----
+
+.. autofunction:: paddle.fluid.layers.shape
+ :noindex:
+
sigmoid
-------
@@ -991,21 +1039,3 @@ zeros
.. autofunction:: paddle.fluid.layers.zeros
:noindex:
-topk
-----
-
-.. autofunction:: paddle.fluid.layers.topk
- :noindex:
-
-dice_loss
-----
-
-.. autofunction:: paddle.fluid.layers.dice_loss
- :noindex:
-
-upsampling_bilinear2d
-____
-
-.. autofunction:: paddle.fluid.layers.upsampling_bilinear2d
- :noindex:
-
diff --git a/doc/fluid/api/optimizer.rst b/doc/fluid/api/optimizer.rst
index df2bd2eace52e78805433bea320f5de95d45bfc7..6ad44bb6905b6e3f2b6e4aeb3701ced5d18e2005 100644
--- a/doc/fluid/api/optimizer.rst
+++ b/doc/fluid/api/optimizer.rst
@@ -47,28 +47,6 @@ DecayedAdagrad
:members:
:noindex:
-Adadelta
------------------
-
-.. autoclass:: paddle.fluid.optimizer.Adadelta
- :members:
- :noindex:
-
-RMSProp
------------------
-
-.. autoclass:: paddle.fluid.optimizer.RMSProp
- :members:
- :noindex:
-
-ModelAverage
------------------
-
-.. autoclass:: paddle.fluid.optimizer.ModelAverage
- :members:
- :noindex:
-
-
SGDOptimizer
------------
@@ -111,25 +89,31 @@ DecayedAdagradOptimizer
:members:
:noindex:
+RMSPropOptimizer
+----------------
-AdadeltaOptimizer
------------------
-
-.. autoclass:: paddle.fluid.optimizer.AdadeltaOptimizer
+.. autoclass:: paddle.fluid.optimizer.RMSPropOptimizer
:members:
:noindex:
+Adadelta
+--------
-RMSPropOptimizer
------------------
+.. autoclass:: paddle.fluid.optimizer.Adadelta
+ :members:
+ :noindex:
-.. autoclass:: paddle.fluid.optimizer.RMSPropOptimizer
+ModelAverage
+------------
+
+.. autoclass:: paddle.fluid.optimizer.ModelAverage
:members:
:noindex:
-
+
Optimizer
---------
.. autoclass:: paddle.fluid.optimizer.Optimizer
:members:
:noindex:
+
diff --git a/doc/fluid/api/profiler.rst b/doc/fluid/api/profiler.rst
index 74d102dcb0db35766c34e3d14939a8aa5861686b..39fda65863471a78895503184848a754828b71a1 100644
--- a/doc/fluid/api/profiler.rst
+++ b/doc/fluid/api/profiler.rst
@@ -23,3 +23,15 @@ profiler
.. autofunction:: paddle.fluid.profiler.profiler
:noindex:
+start_profiler
+--------------
+
+.. autofunction:: paddle.fluid.profiler.start_profiler
+ :noindex:
+
+stop_profiler
+-------------
+
+.. autofunction:: paddle.fluid.profiler.stop_profiler
+ :noindex:
+
diff --git a/doc/fluid/design/concepts/var_desc.md b/doc/fluid/design/concepts/var_desc.md
index 6750323c0167bf1efbde6ef4fd670e88a5aa502a..8db67f6703d142da71cf06bd4f7e2cb13556f9b0 100644
--- a/doc/fluid/design/concepts/var_desc.md
+++ b/doc/fluid/design/concepts/var_desc.md
@@ -35,7 +35,7 @@ The computation `Program` consists of nested `Blocks`. Each `Block` will consist
## Definition of VarType
-A VarDesc should have a name, type and whether or not it is persistable. The are different kinds of variable types supported in PaddlePaddle, apart from the POD_Types like: `LOD_TENSOR`, `SELECTED_ROWS`, `FEED_MINIBATCH`, `FETCH_LIST`, `STEP_SCOPES`, `LOD_RANK_TABLE`, `LOD_TENSOR_ARRAY`, `PLACE_LIST`, `READER` and `CHANNEL`. These are declared inside `VarType`. A `VarDesc` then looks as the following:
+A VarDesc should have a name, type and whether or not it is persistable. There are different kinds of variable types supported in PaddlePaddle, apart from the POD_Types like: `LOD_TENSOR`, `SELECTED_ROWS`, `FEED_MINIBATCH`, `FETCH_LIST`, `STEP_SCOPES`, `LOD_RANK_TABLE`, `LOD_TENSOR_ARRAY`, `PLACE_LIST`, `READER` and `CHANNEL`. These are declared inside `VarType`. A `VarDesc` then looks as the following:
```proto
message VarDesc {
diff --git a/doc/fluid/howto/cluster/fluid_recordio.md b/doc/fluid/howto/cluster/fluid_recordio.md
new file mode 100644
index 0000000000000000000000000000000000000000..55ce63ec193948424cd0b87f13d56b9cf6154dfc
--- /dev/null
+++ b/doc/fluid/howto/cluster/fluid_recordio.md
@@ -0,0 +1,127 @@
+# How to use RecordIO in Fluid
+
+If you want to use RecordIO as your training data format, you need to convert your training data
+into RecordIO files and read them during training. PaddlePaddle Fluid provides several
+interfaces for working with RecordIO files.
+
+## Generate RecordIO File
+
+Before starting training with RecordIO files, you need to convert your training data
+to the RecordIO format with `fluid.recordio_writer.convert_reader_to_recordio_file`, as shown in
+the sample code below:
+
+```python
+ reader = paddle.batch(mnist.train(), batch_size=1)
+ feeder = fluid.DataFeeder(
+ feed_list=[ # order is image and label
+ fluid.layers.data(
+ name='image', shape=[784]),
+ fluid.layers.data(
+ name='label', shape=[1], dtype='int64'),
+ ],
+ place=fluid.CPUPlace())
+ fluid.recordio_writer.convert_reader_to_recordio_file('./mnist.recordio', reader, feeder)
+```
+
+The above code snippet would generate a RecordIO file `./mnist.recordio` on your host.
+
+**NOTE**: we recommend setting `batch_size=1` when generating the RecordIO files so that the
+batch size can be adjusted flexibly at reading time.
+
+## Use the RecordIO file in a Local Training Job
+
+PaddlePaddle Fluid provides the interface `fluid.layers.io.open_recordio_file` to load your RecordIO
+file, which you can then use as a layer in your network configuration, as shown in the sample code below:
+
+```python
+ data_file = fluid.layers.io.open_recordio_file(
+ filename="./mnist.recordio",
+ shapes=[(-1, 784),(-1, 1)],
+ lod_levels=[0, 0],
+ dtypes=["float32", "int32"])
+ data_file = fluid.layers.io.batch(data_file, batch_size=4)
+
+ img, label = fluid.layers.io.read_file(data_file)
+ hidden = fluid.layers.fc(input=img, size=100, act='tanh')
+ prediction = fluid.layers.fc(input=hidden, size=10, act='softmax')
+ loss = fluid.layers.cross_entropy(input=prediction, label=label)
+ avg_loss = fluid.layers.mean(loss)
+
+ fluid.optimizer.Adam(learning_rate=1e-3).minimize(avg_loss)
+
+ place = fluid.CPUPlace()
+
+ exe = fluid.Executor(place)
+ exe.run(fluid.default_startup_program())
+ avg_loss_np = []
+
+ # train a pass
+ batch_id = 0
+ while True:
+ tmp, = exe.run(fetch_list=[avg_loss])
+
+ avg_loss_np.append(tmp)
+ print(batch_id)
+ batch_id += 1
+```
+
+## Use the RecordIO files in Distributed Training
+
+1. Generate multiple RecordIO files
+
+For a distributed training job, you may have multiple trainer nodes, with one or more RecordIO
+files for each trainer node. You can use the interface
+`fluid.recordio_writer.convert_reader_to_recordio_files` to convert your training data
+into multiple RecordIO files, as shown in the sample code below:
+
+```python
+ reader = paddle.batch(mnist.train(), batch_size=1)
+ feeder = fluid.DataFeeder(
+ feed_list=[ # order is image and label
+ fluid.layers.data(
+ name='image', shape=[784]),
+ fluid.layers.data(
+ name='label', shape=[1], dtype='int64'),
+ ],
+ place=fluid.CPUPlace())
+    fluid.recordio_writer.convert_reader_to_recordio_files(
+        './mnist.recordio', batch_per_file=100, reader_creator=reader, feeder=feeder)
+```
+
+The above code would generate multiple RecordIO files on your host, like:
+
+```bash
+.
+ \_mnist-00000.recordio
+ |-mnist-00001.recordio
+ |-mnist-00002.recordio
+ |-mnist-00003.recordio
+ |-mnist-00004.recordio
+```
+
+2. Open multiple RecordIO files with `fluid.layers.io.open_files`
+
+For a distributed training job, the distributed operator system will schedule trainer processes on
+multiple nodes, and each trainer process reads part of the whole training data. We usually take the
+following approach to make the training data allocated to each trainer process as uniform as
+possible:
+
+```python
+def gen_train_list(file_pattern, trainers, trainer_id):
+ file_list = glob.glob(file_pattern)
+ ret_list = []
+ for idx, f in enumerate(file_list):
+ if (idx + trainers) % trainers == trainer_id:
+ ret_list.append(f)
+ return ret_list
+
+trainers = int(os.getenv("TRAINERS"))
+trainer_id = int(os.getenv("PADDLE_INIT_TRAINER_ID"))
+data_file = fluid.layers.io.open_files(
+    filenames=gen_train_list("./mnist-[0-9]*.recordio", trainers, trainer_id),
+ thread_num=1,
+ shapes=[(-1, 784),(-1, 1)],
+ lod_levels=[0, 0],
+ dtypes=["float32", "int32"])
+img, label = fluid.layers.io.read_file(data_file)
+...
+```
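+
+For example, with the five files listed earlier and two trainers, `gen_train_list` assigns
+`mnist-00000`, `mnist-00002` and `mnist-00004` to trainer 0 and the remaining two files to
+trainer 1, since file `idx` goes to the trainer whose id equals `idx % trainers`.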
diff --git a/doc/fluid/howto/optimization/benchmark/README.md b/doc/fluid/howto/optimization/benchmark/README.md
deleted file mode 120000
index db30af7f53231c687f9ad61ad961a685733cbad0..0000000000000000000000000000000000000000
--- a/doc/fluid/howto/optimization/benchmark/README.md
+++ /dev/null
@@ -1 +0,0 @@
-../../../../../benchmark/cluster/README.md
\ No newline at end of file
diff --git a/doc/fluid/howto/optimization/benchmark/vgg16/README.md b/doc/fluid/howto/optimization/benchmark/vgg16/README.md
deleted file mode 120000
index ca963ef5f06aa0c2fe507ba7548dca8017358120..0000000000000000000000000000000000000000
--- a/doc/fluid/howto/optimization/benchmark/vgg16/README.md
+++ /dev/null
@@ -1 +0,0 @@
-../../../../../../benchmark/cluster/vgg16/README.md
\ No newline at end of file
diff --git a/doc/fluid/howto/optimization/host_memory_profiling_cn.md b/doc/fluid/howto/optimization/host_memory_profiling_cn.md
new file mode 100644
index 0000000000000000000000000000000000000000..9b55a66ded8b48f7105c05f1462839a72ab5f904
--- /dev/null
+++ b/doc/fluid/howto/optimization/host_memory_profiling_cn.md
@@ -0,0 +1,89 @@
+## Heap Memory Profiling and Optimization
+
+Any computer program may run the risk of memory leaks. A **memory leak** generally happens when a program allocates memory on the heap without releasing it, so the memory it occupies keeps growing as it runs. This affects the program's stability, possibly making it slower and slower or causing an OOM, and it can even affect the stability of the machine running the program and bring it down.
+
+
+There are many memory-leak analysis tools; classic ones include [valgrind](http://valgrind.org/docs/manual/quick-start.html#quick-start.intro) and [gperftools](https://gperftools.github.io/gperftools/).
+
+Because Fluid runs a C++ core driven by Python, analyzing it directly with valgrind is very difficult: you would need to build a dedicated debug Python with valgrind support yourself, most of the output would be Python's own symbols and call information, which is hard to analyze, and valgrind also makes the program run extremely slowly. It is therefore not recommended.
+
+This tutorial mainly introduces the use of [gperftools](https://gperftools.github.io/gperftools/).
+
+gperftools mainly supports the following four features:
+
+- thread-caching malloc
+- heap-checking using tcmalloc
+- heap-profiling using tcmalloc
+- CPU profiler
+
+Paddle also provides a gperftools-based [CPU profiling tutorial](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/fluid/howto/optimization/cpu_profiling_cn.md).
+
+For heap memory analysis, the main features used are thread-caching malloc and heap-profiling using tcmalloc.
+
+## Usage Workflow
+#### Environment
+This tutorial is based on the Docker development environment paddlepaddle/paddle:latest-dev provided by Paddle, which is built on Ubuntu 16.04.4 LTS.
+
+#### Steps
+
+- Install google-perftools
+
+```
+apt-get install libunwind-dev
+apt-get install google-perftools
+```
+
+- Install pprof
+
+```
+go get -u github.com/google/pprof
+```
+
+- Set up the runtime environment
+
+```
+export PPROF_PATH=/root/gopath/bin/pprof
+export PPROF_BINARY_PATH=/root/gopath/bin/pprof
+export LD_PRELOAD=/usr/lib/libtcmalloc.so.4
+```
+
+- Run the Python program under the heap profiler. Essentially, this periodically takes a snapshot of the heap allocation state.
+
+```
+# HEAPPROFILE sets the directory and filename prefix of the generated heap profile files
+# HEAP_PROFILE_ALLOCATION_INTERVAL sets how many bytes must be allocated between two dumps; the default is 1GB
+env HEAPPROFILE="./perf_log/test.log" HEAP_PROFILE_ALLOCATION_INTERVAL=209715200 python trainer.py
+```
+
+As the program runs, many files are generated under the perf_log folder, like the following:
+
+```
+-rw-r--r-- 1 root root 1.0M Jun 1 15:00 test.log.0001.heap
+-rw-r--r-- 1 root root 1.0M Jun 1 15:00 test.log.0002.heap
+-rw-r--r-- 1 root root 1.0M Jun 1 15:00 test.log.0003.heap
+-rw-r--r-- 1 root root 1.0M Jun 1 15:00 test.log.0004.heap
+-rw-r--r-- 1 root root 1.0M Jun 1 15:00 test.log.0005.heap
+-rw-r--r-- 1 root root 1.0M Jun 1 15:00 test.log.0006.heap
+```
+
+- Use pprof to analyze the heap files. There are two analysis modes:
+    - Full mode. Analyzes the current heap snapshot and shows the call paths of the memory currently allocated.
+
+ ```
+ pprof --pdf python test.log.0012.heap
+ ```
+    The command above generates a file named profile00x.pdf, which can be opened directly, for example: [memory_cpu_allocator](https://github.com/jacquesqiao/Paddle/blob/bd2ea0e1f84bb6522a66d44a072598153634cade/doc/fluid/howto/optimization/memory_cpu_allocator.pdf). As the figure below shows, while the CPU version of Fluid runs, the module that allocates the most memory is CPUAllocator; other modules allocate relatively little memory and are therefore omitted. This is inconvenient for finding memory leaks, because a leak is a slow process and cannot be seen in this kind of figure.
+
+ ![result](https://user-images.githubusercontent.com/3048612/40964027-a54033e4-68dc-11e8-836a-144910c4bb8c.png)
+
+    - Diff mode. Diffs the heaps captured at two moments, dropping the modules whose memory allocation did not change and showing only the increments.
+ ```
+ pprof --pdf --base test.log.0010.heap python test.log.1045.heap
+ ```
+    The generated result: [`memory_leak_protobuf`](https://github.com/jacquesqiao/Paddle/blob/bd2ea0e1f84bb6522a66d44a072598153634cade/doc/fluid/howto/optimization/memory_leak_protobuf.pdf)
+
+    The figure shows that the ProgramDesc structure grew by more than 200MB between the two snapshots, so there is a strong possibility of a memory leak here, and the final result indeed confirmed that this was the source of the leak.
+
+ ![result](https://user-images.githubusercontent.com/3048612/40964057-b434d5e4-68dc-11e8-894b-8ab62bcf26c2.png)
+ ![result](https://user-images.githubusercontent.com/3048612/40964063-b7dbee44-68dc-11e8-9719-da279f86477f.png)
+
diff --git a/doc/mobile/cross_compiling_for_android_cn.md b/doc/mobile/cross_compiling_for_android_cn.md
index cdd6917239371a660d0df05bb623f0b94f8f11a3..0607748b751e9f2d606236d9e98868335379b05c 100644
--- a/doc/mobile/cross_compiling_for_android_cn.md
+++ b/doc/mobile/cross_compiling_for_android_cn.md
@@ -63,16 +63,16 @@ Android的Docker开发镜像向用户提供两个可配置的参数:
- 编译`armeabi-v7a`,`Android API 21`的PaddlePaddle库
```bash
-$ docker run -it --rm -v $PWD:/paddle -e "ANDROID_ABI=armeabi-v7a" -e "ANDROID_API=21" username/paddle-android:dev
+$ docker run -it --rm -v $PWD:/paddle -w /paddle -e "ANDROID_ABI=armeabi-v7a" -e "ANDROID_API=21" username/paddle-android:dev ./paddle/scripts/paddle_build.sh build_android
```
- 编译`arm64-v8a`,`Android API 21`的PaddlePaddle库
```bash
-$ docker run -it --rm -v $PWD:/paddle -e "ANDROID_ABI=arm64-v8a" -e "ANDROID_API=21" username/paddle-android:dev
+$ docker run -it --rm -v $PWD:/paddle -w /paddle -e "ANDROID_ABI=arm64-v8a" -e "ANDROID_API=21" username/paddle-android:dev ./paddle/scripts/paddle_build.sh build_android
```
-执行上述`docker run`命令时,容器默认执行[paddle/scripts/docker/build_android.sh](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/scripts/docker/build_android.sh)脚本。该脚本中记录了交叉编译Android版PaddlePaddle库常用的CMake配置,并且会根据`ANDROID_ABI`和`ANDROID_API`自动构建独立工具链、进行编译和安装。由于arm64架构要求Android API不小于21。因此当`ANDROID_ABI=arm64-v8a`,`ANDROID_API<21`时,Docker容器中将默认使用`Android API 21`的编译工具链。用户可以参考下文[配置交叉编译参数](#配置交叉编译参数)章节,根据个人的需求修改定制Docker容器所执行的脚本。编译安装结束之后,PaddlePaddle的C-API库将被安装到`$PWD/install_android`目录,所依赖的第三方库同时也被安装到`$PWD/install_android/third_party`目录。
+执行上述`docker run`命令时,容器执行[paddle/scripts/paddle_build.sh build_android](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/scripts/paddle_build.sh)脚本。该脚本中记录了交叉编译Android版PaddlePaddle库常用的CMake配置,并且会根据`ANDROID_ABI`和`ANDROID_API`自动构建独立工具链、进行编译和安装。由于arm64架构要求Android API不小于21。因此当`ANDROID_ABI=arm64-v8a`,`ANDROID_API<21`时,Docker容器中将默认使用`Android API 21`的编译工具链。用户可以参考下文[配置交叉编译参数](#配置交叉编译参数)章节,根据个人的需求修改定制Docker容器所执行的脚本。编译安装结束之后,PaddlePaddle的C-API库将被安装到`$PWD/install_android`目录,所依赖的第三方库同时也被安装到`$PWD/install_android/third_party`目录。
## 基于Linux交叉编译环境的编译方式
本文档将以Linux x86-64平台为例,介绍交叉编译Android平台上适用的PaddlePaddle库的方法和步骤。
diff --git a/doc/mobile/cross_compiling_for_android_en.md b/doc/mobile/cross_compiling_for_android_en.md
index 6af16fc114a2310e364023ec43cc3c64149af8f7..572063e8012efee2d2e142eb57e459e0e8c6382c 100644
--- a/doc/mobile/cross_compiling_for_android_en.md
+++ b/doc/mobile/cross_compiling_for_android_en.md
@@ -36,7 +36,7 @@ $ docker pull docker.paddlepaddlehub.com/paddle:latest-dev-android
We can run the Docker image we just created to build the inference library of PaddlePaddle for Android using the command below:
```bash
-$ docker run -it --rm -v $PWD:/paddle -e "ANDROID_ABI=armeabi-v7a" -e "ANDROID_API=21" paddle:dev-android
+$ docker run -it --rm -v $PWD:/paddle -w /paddle -e "ANDROID_ABI=armeabi-v7a" -e "ANDROID_API=21" paddle:dev-android ./paddle/scripts/paddle_build.sh build_android
```
The Docker image accepts two arguments `ANDROID_ABI` and `ANDROID_API`:
@@ -70,7 +70,7 @@ The Docker image accepts two arguments `ANDROID_ABI` and `ANDROID_API`:
The ARM-64 architecture (`arm64-v8a`) requires at least level 21 of Android API.
-The default entry-point of the Docker image, [`paddle/scripts/docker/build_android.sh`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/scripts/docker/build_android.sh) generates the [Android cross-compiling standalone toolchain](https://developer.android.com/ndk/guides/standalone_toolchain.html) based on the argument: `ANDROID_ABI` or `ANDROID_API`. For information about other configuration arguments, please continue reading.
+The build command, [`paddle/scripts/paddle_build.sh build_android`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/scripts/paddle_build.sh) generates the [Android cross-compiling standalone toolchain](https://developer.android.com/ndk/guides/standalone_toolchain.html) based on the argument: `ANDROID_ABI` or `ANDROID_API`. For information about other configuration arguments, please continue reading.
The above command generates and outputs the inference library in `$PWD/install_android` and puts third-party libraries in `$PWD/install_android/third_party`.
diff --git a/doc/v2/build_and_install/build_from_source_cn.rst b/doc/v2/build_and_install/build_from_source_cn.rst
index 741c01ce5428c0046daa5a784da70d4bb492438c..de7e9eb75c3a053179f2d03ac887955bb4e0a6d2 100644
--- a/doc/v2/build_and_install/build_from_source_cn.rst
+++ b/doc/v2/build_and_install/build_from_source_cn.rst
@@ -23,7 +23,7 @@ PaddlePaddle需要使用Docker环境完成编译,这样可以免去单独安
在 `这里 `__ 找到 paddle_manylinux_devel
镜像的编译以及使用方法。或者参考下述可选步骤,从源码中构建用于编译PaddlePaddle的Docker镜像。
-如果您选择不使用Docker镜像,则需要在本机安装下面章节列出的 `编译依赖`_ 之后才能开始编译的步骤。
+如果您选择不使用Docker镜像,则需要在本机安装下面章节列出的 :ref:`编译依赖 <_compile_deps>` 之后才能开始编译的步骤。
编译PaddlePaddle,需要执行:
@@ -106,7 +106,7 @@ PaddlePaddle需要使用Docker环境完成编译,这样可以免去单独安
- 学习 Docker 有多难?
- 理解 Docker 并不难,大概花十分钟看一下[这篇文章](https://zhuanlan.zhihu.com/p/19902938)。这可以帮您省掉花一小时安装和配置各种开发工具,以及切换机器时需要新安装的辛苦。别忘了 PaddlePaddle 更新可能导致需要新的开发工具。更别提简化问题复现带来的好处了。
+  理解 Docker 并不难,大概花十分钟看一下 `这篇文章 <https://zhuanlan.zhihu.com/p/19902938>`_ 。这可以帮您省掉花一小时安装和配置各种开发工具,以及切换机器时需要新安装的辛苦。别忘了 PaddlePaddle 更新可能导致需要新的开发工具。更别提简化问题复现带来的好处了。
- 我可以用 IDE 吗?
@@ -123,7 +123,7 @@ PaddlePaddle需要使用Docker环境完成编译,这样可以免去单独安
- 可以并行编译吗?
- 是的。我们的 Docker image 运行一个 [Bash 脚本](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/scripts/docker/build.sh)。这个脚本调用 `make -j$(nproc)` 来启动和 CPU 核一样多的进程来并行编译。
+  是的。我们的 Docker image 运行一个 `Bash脚本 <https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/scripts/docker/build.sh>`_ 。这个脚本调用 `make -j$(nproc)` 来启动和 CPU 核一样多的进程来并行编译。
- Docker 需要 sudo
@@ -131,11 +131,11 @@ PaddlePaddle需要使用Docker环境完成编译,这样可以免去单独安
- 在 Windows/MacOS 上编译很慢
- Docker 在 Windows 和 MacOS 都可以运行。不过实际上是运行在一个 Linux 虚拟机上。可能需要注意给这个虚拟机多分配一些 CPU 和内存,以保证编译高效。具体做法请参考[这个issue](https://github.com/PaddlePaddle/Paddle/issues/627)。
+  Docker 在 Windows 和 MacOS 都可以运行。不过实际上是运行在一个 Linux 虚拟机上。可能需要注意给这个虚拟机多分配一些 CPU 和内存,以保证编译高效。具体做法请参考 `这个issue <https://github.com/PaddlePaddle/Paddle/issues/627>`_ 。
- 磁盘不够
- 本文中的例子里,`docker run` 命令里都用了 `--rm` 参数,这样保证运行结束之后的 containers 不会保留在磁盘上。可以用 `docker ps -a` 命令看到停止后但是没有删除的 containers。`docker build` 命令有时候会产生一些中间结果,是没有名字的 images,也会占用磁盘。可以参考[这篇文章](https://zaiste.net/posts/removing_docker_containers/)来清理这些内容。
+  本文中的例子里,`docker run` 命令里都用了 `--rm` 参数,这样保证运行结束之后的 containers 不会保留在磁盘上。可以用 `docker ps -a` 命令看到停止后但是没有删除的 containers。`docker build` 命令有时候会产生一些中间结果,是没有名字的 images,也会占用磁盘。可以参考 `这篇文章 <https://zaiste.net/posts/removing_docker_containers/>`_ 来清理这些内容。
.. _compile_deps:
@@ -211,7 +211,7 @@ PaddlePaddle可以使用cuDNN v5.1之后的任何一个版本来编译运行,
编译选项的设置
++++++++++++++
-PaddePaddle通过编译时指定路径来实现引用各种BLAS/CUDA/cuDNN库。cmake编译时,首先在系统路径( :code:`/usr/lib:/usr/local/lib` )中搜索这几个库,同时也会读取相关路径变量来进行搜索。 通过使用 ``-D`` 命令可以设置,例如
+PaddePaddle通过编译时指定路径来实现引用各种BLAS/CUDA/cuDNN库。cmake编译时,首先在系统路径( :code:`/usr/lib:/usr/local/lib` )中搜索这几个库,同时也会读取相关路径变量来进行搜索。 通过使用 ``-D`` 命令可以设置,例如
.. code-block:: bash
diff --git a/doc/v2/build_and_install/build_from_source_en.rst b/doc/v2/build_and_install/build_from_source_en.rst
index b06c43e19dcfc52ad0f074a85517a16744895a3a..b08b45d43ec7f1deb2889832079a731ee724a44c 100644
--- a/doc/v2/build_and_install/build_from_source_en.rst
+++ b/doc/v2/build_and_install/build_from_source_en.rst
@@ -11,7 +11,7 @@ To build PaddlePaddle, you need
1. A computer -- Linux, Windows, MacOS.
2. Docker.
-Nothing else. Not even Python and GCC, because you can install all build tools into a Docker image.
+Nothing else. Not even Python and GCC, because you can install all build tools into a Docker image.
We run all the tools by running this image.
.. _build_step:
@@ -26,6 +26,8 @@ you can also find how to build and use paddle_manylinux_devel Docker image from
`here `__
Or you can build your own image from source as the optional step below:
+If you don't wish to use Docker, you need to install several compile dependencies manually as :ref:`Compile Dependencies <_compile_deps>` shows before starting compilation.
+
.. code-block:: bash
# 1. clone the source code
@@ -108,7 +110,7 @@ Frequently Asked Questions
- How difficult is it to learn Docker?
- It takes you ten minutes to read [an introductory article](https://docs.docker.com/get-started) and saves you more than one hour to install all required build tools, configure them, especially when new versions of PaddlePaddle require some new tools. Not even to mention the time saved when other people trying to reproduce the issue you have.
+  It takes you ten minutes to read `an introductory article <https://docs.docker.com/get-started>`_ and saves you more than one hour of installing and configuring all the required build tools, especially when new versions of PaddlePaddle require new tools. Not to mention the time saved when other people try to reproduce your issue.
- Can I use my favorite IDE?
@@ -125,7 +127,7 @@ Frequently Asked Questions
- Does Docker do parallel building?
- Our building Docker image runs a [Bash script](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/scripts/docker/build.sh), which calls `make -j$(nproc)` to starts as many processes as the number of your CPU cores.
+  Our building Docker image runs a `Bash script <https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/scripts/docker/build.sh>`_ , which calls `make -j$(nproc)` to start as many processes as the number of your CPU cores.
- Docker requires sudo
@@ -133,11 +135,11 @@ Frequently Asked Questions
- Docker on Windows/MacOS builds slowly
- On Windows and MacOS, Docker containers run in a Linux VM. You might want to give this VM some more memory and CPUs so to make the building efficient. Please refer to [this issue](https://github.com/PaddlePaddle/Paddle/issues/627) for details.
+  On Windows and MacOS, Docker containers run in a Linux VM. You might want to give this VM some more memory and CPUs to make the building efficient. Please refer to `this issue <https://github.com/PaddlePaddle/Paddle/issues/627>`_ for details.
- Not enough disk space
- Examples in this article use option `--rm` with the `docker run` command. This option ensures that stopped containers do not exist on hard disks. We can use `docker ps -a` to list all containers, including stopped. Sometimes `docker build` generates some intermediate dangling images, which also take disk space. To clean them, please refer to [this article](https://zaiste.net/posts/removing_docker_containers/).
+  Examples in this article use option `--rm` with the `docker run` command. This option ensures that stopped containers do not exist on hard disks. We can use `docker ps -a` to list all containers, including stopped. Sometimes `docker build` generates some intermediate dangling images, which also take disk space. To clean them, please refer to `this article <https://zaiste.net/posts/removing_docker_containers/>`_ .
.. _compile_deps:
diff --git a/paddle/contrib/inference/CMakeLists.txt b/paddle/contrib/inference/CMakeLists.txt
index 6847f7db7fc0f6b41ced1260d409ca6eba9b53eb..1e3bb7bf16f969255dba6f6ec7a6a70bbb1e07ee 100644
--- a/paddle/contrib/inference/CMakeLists.txt
+++ b/paddle/contrib/inference/CMakeLists.txt
@@ -17,33 +17,77 @@ if(APPLE)
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-error=pessimizing-move")
endif(APPLE)
+set(ANAKIN_INCLUDE "" CACHE STRING "root of Anakin header files")
+set(ANAKIN_LIBRARY "" CACHE STRING "path of Anakin library")
+
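+# A hypothetical configure step that enables the Anakin engine might look like:
+#   cmake .. -DANAKIN_INCLUDE=/path/to/anakin/include -DANAKIN_LIBRARY=/path/to/anakin/lib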
+
+set(inference_deps paddle_inference_api paddle_fluid_api)
+
+# if anakin is set enable anakin api implementation
+if(ANAKIN_INCLUDE AND ANAKIN_LIBRARY)
+ set(ANAKIN_FOUND ON)
+else()
+ set(ANAKIN_FOUND OFF)
+endif()
+
+if (ANAKIN_FOUND)
+ # Anakin's code style doesn't follow google c style.
+ set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-error=comment
+ -Wno-error=reorder
+ -Wno-error=format
+ -Wno-error=switch
+ -Wno-error=return-type
+ -Wno-error=non-virtual-dtor
+ -Wno-error=cpp")
+
+ message(STATUS "Anakin for inference is enabled")
+ message(STATUS "Anakin is set INCLUDE:${ANAKIN_INCLUDE} LIBRARY:${ANAKIN_LIBRARY}")
+ include_directories("${ANAKIN_INCLUDE}")
+  # Anakin's source layout is a mess, so its sub-directories need to be added to the include path individually.
+ include_directories("${ANAKIN_INCLUDE}/saber")
+ link_directories("${ANAKIN_LIBRARY}")
+
+ nv_library(inference_anakin_api SRCS paddle_inference_api_anakin_engine.cc)
+ target_link_libraries(inference_anakin_api anakin)
+ list(APPEND inference_deps inference_anakin_api)
+endif()
+
+
function(inference_api_test TARGET_NAME)
- set(options "")
- set(oneValueArgs "")
- set(multiValueArgs ARGS)
- cmake_parse_arguments(inference_test "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
-
- set(PYTHON_TESTS_DIR ${PADDLE_BINARY_DIR}/python/paddle/fluid/tests)
- cc_test(test_paddle_inference_${TARGET_NAME}
- SRCS test_paddle_inference_${TARGET_NAME}.cc
- DEPS paddle_fluid_api paddle_inference_api
- ARGS --dirname=${PYTHON_TESTS_DIR}/book/)
- if(inference_test_ARGS)
- set_tests_properties(test_paddle_inference_${TARGET_NAME}
- PROPERTIES DEPENDS "${inference_test_ARGS}")
- endif()
-endfunction(inference_api_test)
+ if (WITH_TESTING)
+ set(options "")
+ set(oneValueArgs "")
+ set(multiValueArgs ARGS)
+ cmake_parse_arguments(inference_test "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
+ set(PYTHON_TESTS_DIR ${PADDLE_BINARY_DIR}/python/paddle/fluid/tests)
+ cc_test(${TARGET_NAME}
+ SRCS ${TARGET_NAME}.cc
+ DEPS "${inference_deps}"
+ ARGS --dirname=${PYTHON_TESTS_DIR}/book/)
+ if(inference_test_ARGS)
+ set_tests_properties(${TARGET_NAME}
+ PROPERTIES DEPENDS "${inference_test_ARGS}")
+ endif()
+ endif(WITH_TESTING)
+endfunction(inference_api_test)
cc_library(paddle_inference_api
SRCS paddle_inference_api.cc paddle_inference_api_impl.cc
DEPS ${FLUID_CORE_MODULES} ${GLOB_OP_LIB})
-if(WITH_TESTING)
- cc_test(test_paddle_inference_api
- SRCS test_paddle_inference_api.cc
- DEPS paddle_inference_api)
+cc_test(test_paddle_inference_api
+ SRCS test_paddle_inference_api.cc
+ DEPS paddle_inference_api)
- inference_api_test(api_impl
- ARGS test_word2vec test_image_classification)
+inference_api_test(test_paddle_inference_api_impl
+ ARGS test_word2vec test_image_classification)
+
+if (ANAKIN_FOUND)
+ nv_test(inference_anakin_test SRCS paddle_inference_api_anakin_engine_tester.cc
+ DEPS ${inference_deps} protobuf)
+endif()
+
+if(WITH_TESTING)
+ add_subdirectory(demo)
endif()
diff --git a/paddle/contrib/inference/demo/CMakeLists.txt b/paddle/contrib/inference/demo/CMakeLists.txt
new file mode 100644
index 0000000000000000000000000000000000000000..7b0fa77ad13c19f177e5b2446bcda6551471e45f
--- /dev/null
+++ b/paddle/contrib/inference/demo/CMakeLists.txt
@@ -0,0 +1,16 @@
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+inference_api_test(simple_on_word2vec ARGS test_word2vec)
diff --git a/paddle/contrib/inference/demo/simple_on_word2vec.cc b/paddle/contrib/inference/demo/simple_on_word2vec.cc
new file mode 100644
index 0000000000000000000000000000000000000000..9b4843f714f11484860056711fd223edc8a5d037
--- /dev/null
+++ b/paddle/contrib/inference/demo/simple_on_word2vec.cc
@@ -0,0 +1,74 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+/*
+ * This file contains a simple demo of how to use a model for inference.
+ */
+
+#include <gflags/gflags.h>
+#include <glog/logging.h>
+#include <gtest/gtest.h>
+#include "paddle/contrib/inference/paddle_inference_api.h"
+
+namespace paddle {
+namespace demo {
+
+DEFINE_string(dirname, "", "Directory of the inference model.");
+
+void Main(bool use_gpu) {
+ //# 1. Create PaddlePredictor with a config.
+ NativeConfig config;
+ config.model_dir = FLAGS_dirname + "word2vec.inference.model";
+ config.use_gpu = use_gpu;
+ config.fraction_of_gpu_memory = 0.15;
+ config.device = 0;
+ auto predictor =
+      CreatePaddlePredictor<NativeConfig, PaddleEngineKind::kNative>(config);
+
+ for (int batch_id = 0; batch_id < 3; batch_id++) {
+ //# 2. Prepare input.
+ int64_t data[4] = {1, 2, 3, 4};
+
+ PaddleBuf buf{.data = data, .length = sizeof(data)};
+ PaddleTensor tensor{.name = "",
+                      .shape = std::vector<int>({4, 1}),
+ .data = buf,
+ .dtype = PaddleDType::INT64};
+
+ // For simplicity, we set all the slots with the same data.
+    std::vector<PaddleTensor> slots(4, tensor);
+
+ //# 3. Run
+    std::vector<PaddleTensor> outputs;
+ CHECK(predictor->Run(slots, &outputs));
+
+ //# 4. Get output.
+ ASSERT_EQ(outputs.size(), 1UL);
+ LOG(INFO) << "output buffer size: " << outputs.front().data.length;
+ const size_t num_elements = outputs.front().data.length / sizeof(float);
+ // The outputs' buffers are in CPU memory.
+ for (size_t i = 0; i < std::min(5UL, num_elements); i++) {
+      LOG(INFO) << static_cast<float*>(outputs.front().data.data)[i];
+ }
+ }
+}
+
+TEST(demo, word2vec_cpu) { Main(false /*use_gpu*/); }
+
+#ifdef PADDLE_WITH_CUDA
+TEST(demo, word2vec_gpu) { Main(true /*use_gpu*/); }
+#endif
+
+} // namespace demo
+} // namespace paddle
diff --git a/paddle/contrib/inference/paddle_inference_api.h b/paddle/contrib/inference/paddle_inference_api.h
index b4c7f9bef4d2e83038ff223614a89e1b0493fc6f..c4588cf04030b9627dbe9b40c1bb04d1e782ebba 100644
--- a/paddle/contrib/inference/paddle_inference_api.h
+++ b/paddle/contrib/inference/paddle_inference_api.h
@@ -1,16 +1,16 @@
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
- Licensed under the Apache License, Version 2.0 (the "License");
- you may not use this file except in compliance with the License.
- You may obtain a copy of the License at
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
- http://www.apache.org/licenses/LICENSE-2.0
+http://www.apache.org/licenses/LICENSE-2.0
- Unless required by applicable law or agreed to in writing, software
- distributed under the License is distributed on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- See the License for the specific language governing permissions and
- limitations under the License. */
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
/*
* This file contains the definition of a simple Inference API for Paddle.
@@ -40,20 +40,30 @@ struct PaddleBuf {
struct PaddleTensor {
std::string name; // variable name.
std::vector<int> shape;
+  // TODO(Superjomn) for LoD support, add a vector<vector<size_t>> field if needed.
PaddleBuf data; // blob of data.
PaddleDType dtype;
};
+enum class PaddleEngineKind {
+ kNative = 0, // Use the native Fluid facility.
+ kAnakin, // Use Anakin for inference.
+ // TODO(Superjomn) support following engines latter.
+ // kTensorRT, // Use TensorRT for inference.
+ // kAutoMixedAnakin, // Automatically mix Fluid with Anakin.
+ // kAutoMixedTensorRT, // Automatically mix Fluid with TensorRT.
+};
+
/*
* A simple Inference API for Paddle. Currently this API can be used by
* non-sequence scenerios.
- * TODO(Superjomn) Support another API for NLP-related usages.
*/
class PaddlePredictor {
public:
struct Config;
PaddlePredictor() = default;
PaddlePredictor(const PaddlePredictor&) = delete;
+ PaddlePredictor& operator=(const PaddlePredictor&) = delete;
// Predict an record.
// The caller should be responsible for allocating and releasing the memory of
@@ -67,16 +77,7 @@ class PaddlePredictor {
virtual std::unique_ptr<PaddlePredictor> Clone() = 0;
// Destroy the Predictor.
- virtual ~PaddlePredictor() {}
-
- enum class EngineKind {
- kNative = -1, // Use the native Fluid facility.
- // TODO(Superjomn) support latter.
- // kAnakin, // Use Anakin for inference.
- // kTensorRT, // Use TensorRT for inference.
- // kAutoMixedAnakin, // Automatically mix Fluid with Anakin.
- // kAutoMixedTensorRT, // Automatically mix Fluid with TensorRT.
- };
+ virtual ~PaddlePredictor() = default;
// The common configs for all the predictors.
struct Config {
@@ -86,18 +87,31 @@ class PaddlePredictor {
};
struct NativeConfig : public PaddlePredictor::Config {
+ // GPU related fields.
bool use_gpu{false};
- int device;
- float fraction_of_gpu_memory;
+ int device{0};
+ float fraction_of_gpu_memory{-1.f}; // Negative to notify initialization.
+
std::string prog_file;
std::string param_file;
- bool share_variables;
};
-// A factory to help create difference predictor.
-template <
- typename ConfigT,
- PaddlePredictor::EngineKind engine = PaddlePredictor::EngineKind::kNative>
+// Configurations for Anakin engine.
+struct AnakinConfig : public PaddlePredictor::Config {
+ int device;
+ std::string model_file;
+ int max_batch_size{-1};
+};
+
+// A factory to help create different predictors.
+//
+// FOR EXTENSION DEVELOPER:
+// Different predictors are designated by config type and engine kind. Similar
+// configs can be merged, but there shouldn't be a huge config containing
+// different fields for more than one kind of predictors.
+//
+// Similarly, each engine kind should map to a unique predictor implementation.
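+//
+// For example (see paddle/contrib/inference/demo/simple_on_word2vec.cc in this
+// change for a complete program; the model path below is hypothetical):
+//
+//   NativeConfig config;
+//   config.model_dir = "/path/to/word2vec.inference.model";
+//   auto predictor =
+//       CreatePaddlePredictor<NativeConfig, PaddleEngineKind::kNative>(config);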
+template <typename ConfigT, PaddleEngineKind engine = PaddleEngineKind::kNative>
std::unique_ptr<PaddlePredictor> CreatePaddlePredictor(const ConfigT& config);
} // namespace paddle
diff --git a/paddle/contrib/inference/paddle_inference_api_anakin_engine.cc b/paddle/contrib/inference/paddle_inference_api_anakin_engine.cc
new file mode 100644
index 0000000000000000000000000000000000000000..865d7ac10db55ce9565f4b1a35defa2a3d1d40ef
--- /dev/null
+++ b/paddle/contrib/inference/paddle_inference_api_anakin_engine.cc
@@ -0,0 +1,82 @@
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <cuda.h>
+
+#include "paddle/contrib/inference/paddle_inference_api_anakin_engine.h"
+
+namespace paddle {
+
+PaddleInferenceAnakinPredictor::PaddleInferenceAnakinPredictor(
+ const AnakinConfig &config) {
+ CHECK(Init(config));
+}
+
+bool PaddleInferenceAnakinPredictor::Init(const AnakinConfig &config) {
+ // TODO(Superjomn) Tell anakin to support return code.
+ engine_.Build(config.model_file, config.max_batch_size);
+ return true;
+}
+
+bool PaddleInferenceAnakinPredictor::Run(
+    const std::vector<PaddleTensor> &inputs,
+    std::vector<PaddleTensor> *output_data) {
+ for (const auto &input : inputs) {
+ if (input.dtype != PaddleDType::FLOAT32) {
+ LOG(ERROR) << "Only support float type inputs. " << input.name
+ << "'s type is not float";
+ return false;
+ }
+ engine_.SetInputFromCPU(
+        input.name, static_cast<float *>(input.data.data), input.data.length);
+ }
+
+ // TODO(Superjomn) Tell anakin to support return code.
+ engine_.Execute();
+
+ if (output_data->empty()) {
+ LOG(ERROR) << "At least one output should be set with tensors' names.";
+ return false;
+ }
+ for (auto &output : *output_data) {
+ auto *tensor = engine_.GetOutputInGPU(output.name);
+ output.shape = tensor->shape();
+ // Copy data from GPU -> CPU
+ if (cudaMemcpy(output.data.data,
+ tensor->data(),
+ tensor->size(),
+ cudaMemcpyDeviceToHost) != 0) {
+ LOG(ERROR) << "copy data from GPU to CPU error";
+ return false;
+ }
+ }
+ return true;
+}
+
+// TODO(Superjomn) To implement latter.
+std::unique_ptr<PaddlePredictor> PaddleInferenceAnakinPredictor::Clone() {
+ return nullptr;
+}
+
+// A factory to help create different predictors.
+template <>
+std::unique_ptr<PaddlePredictor>
+CreatePaddlePredictor<AnakinConfig, PaddleEngineKind::kAnakin>(
+    const AnakinConfig &config) {
+ std::unique_ptr x(
+ new PaddleInferenceAnakinPredictor(config));
+ return x;
+};
+
+} // namespace paddle
diff --git a/paddle/contrib/inference/paddle_inference_api_anakin_engine.h b/paddle/contrib/inference/paddle_inference_api_anakin_engine.h
new file mode 100644
index 0000000000000000000000000000000000000000..fe9f562e9d1d40c30585bcb68fa51e445bedb4aa
--- /dev/null
+++ b/paddle/contrib/inference/paddle_inference_api_anakin_engine.h
@@ -0,0 +1,51 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+/*
+ * This file contains the implementation of inference API with Anakin engine
+ * embedded; this API can only support Anakin models.
+ */
+
+#pragma once
+
+// NOTE This header file does not have a namespace.
+// TODO(Superjomn) Tell Anakin to provide better APIs.
+#include
+#include "paddle/contrib/inference/paddle_inference_api.h"
+
+namespace paddle {
+
+class PaddleInferenceAnakinPredictor : public PaddlePredictor {
+ public:
+ PaddleInferenceAnakinPredictor(const AnakinConfig& config);
+
+ // NOTE Unlike the native engine, the buffers of anakin engine's output_data
+ // should be allocated first.
+ // TODO(Superjomn) should unify all the behaviors of output_data across all
+ // the engines.
+ bool Run(const std::vector<PaddleTensor>& inputs,
+ std::vector<PaddleTensor>* output_data) override;
+
+ std::unique_ptr<PaddlePredictor> Clone() override;
+
+ private:
+ bool Init(const AnakinConfig& config);
+
+ anakin::AnakinEngine
+ engine_;
+};
+
+} // namespace paddle
diff --git a/paddle/contrib/inference/paddle_inference_api_anakin_engine_tester.cc b/paddle/contrib/inference/paddle_inference_api_anakin_engine_tester.cc
new file mode 100644
index 0000000000000000000000000000000000000000..43324bc67cba16c36d9dbcb58ccde1c57293085e
--- /dev/null
+++ b/paddle/contrib/inference/paddle_inference_api_anakin_engine_tester.cc
@@ -0,0 +1,27 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/contrib/inference/paddle_inference_api.h"
+#include <gtest/gtest.h>
+
+namespace paddle {
+
+TEST(inference, anakin) {
+ AnakinConfig config;
+
+ auto engine =
+ CreatePaddlePredictor<AnakinConfig, PaddleEngineKind::kAnakin>(config);
+}
+
+} // namespace paddle
diff --git a/paddle/contrib/inference/paddle_inference_api_impl.cc b/paddle/contrib/inference/paddle_inference_api_impl.cc
index 989252f69e42778dfd791cdee02c550f2aa78803..bda2981a14482e2c4a29773d37b074506cc344b1 100644
--- a/paddle/contrib/inference/paddle_inference_api_impl.cc
+++ b/paddle/contrib/inference/paddle_inference_api_impl.cc
@@ -1,16 +1,16 @@
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
- Licensed under the Apache License, Version 2.0 (the "License");
- you may not use this file except in compliance with the License.
- You may obtain a copy of the License at
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
- http://www.apache.org/licenses/LICENSE-2.0
+http://www.apache.org/licenses/LICENSE-2.0
- Unless required by applicable law or agreed to in writing, software
- distributed under the License is distributed on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- See the License for the specific language governing permissions and
- limitations under the License. */
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
#include
#include
@@ -54,18 +54,24 @@ std::string num2str(T a) {
}
} // namespace
-bool NativePaddlePredictor::Init() {
+bool NativePaddlePredictor::Init(
+ std::shared_ptr<framework::Scope> parent_scope) {
VLOG(3) << "Predictor::init()";
- // TODO(panyx0718): Should CPU vs GPU device be decided by id?
- if (config_.device >= 0) {
+ if (config_.use_gpu) {
place_ = paddle::platform::CUDAPlace(config_.device);
} else {
place_ = paddle::platform::CPUPlace();
}
- paddle::framework::InitDevices(false);
+ if (parent_scope) {
+ scope_ = parent_scope;
+ sub_scope_ = &(parent_scope->NewScope());
+ } else {
+ paddle::framework::InitDevices(false);
+ scope_.reset(new paddle::framework::Scope());
+ }
+
executor_.reset(new paddle::framework::Executor(place_));
- scope_.reset(new paddle::framework::Scope());
// Initialize the inference program
if (!config_.model_dir.empty()) {
@@ -84,18 +90,22 @@ bool NativePaddlePredictor::Init() {
return false;
}
ctx_ = executor_->Prepare(*inference_program_, 0);
+ executor_->CreateVariables(
+ *inference_program_, sub_scope_ ? sub_scope_ : scope_.get(), 0);
- // Create variables
- // TODO(panyx0718): Why need to test share_variables here?
- if (config_.share_variables) {
- executor_->CreateVariables(*inference_program_, scope_.get(), 0);
- }
// Get the feed_target_names and fetch_target_names
feed_target_names_ = inference_program_->GetFeedTargetNames();
fetch_target_names_ = inference_program_->GetFetchTargetNames();
return true;
}
+NativePaddlePredictor::~NativePaddlePredictor() {
+ if (sub_scope_) {
+ PADDLE_ENFORCE_NOT_NULL(scope_, "Should have parent scope!");
+ scope_->DeleteScope(sub_scope_);
+ }
+};
+
bool NativePaddlePredictor::Run(const std::vector<PaddleTensor> &inputs,
std::vector<PaddleTensor> *output_data) {
VLOG(3) << "Predictor::predict";
@@ -120,11 +130,12 @@ bool NativePaddlePredictor::Run(const std::vector &inputs,
}
// Run the inference program
// if share variables, we need not create variables
- executor_->RunPreparedContext(ctx_.get(),
- scope_.get(),
- &feed_targets,
- &fetch_targets,
- !config_.share_variables);
+ executor_->RunPreparedContext(
+ ctx_.get(),
+ sub_scope_ != nullptr ? sub_scope_ : scope_.get(),
+ &feed_targets,
+ &fetch_targets,
+ false /* don't create variables each time */);
if (!GetFetch(fetchs, output_data)) {
LOG(ERROR) << "fail to get fetchs";
return false;
@@ -137,7 +148,7 @@ std::unique_ptr NativePaddlePredictor::Clone() {
VLOG(3) << "Predictor::clone";
std::unique_ptr<PaddlePredictor> cls(new NativePaddlePredictor(config_));
- if (!dynamic_cast<NativePaddlePredictor *>(cls.get())->Init()) {
+ if (!dynamic_cast<NativePaddlePredictor *>(cls.get())->Init(scope_)) {
LOG(ERROR) << "fail to call Init";
return nullptr;
}
@@ -242,11 +253,16 @@ bool NativePaddlePredictor::GetFetch(
template <>
std::unique_ptr<PaddlePredictor>
-CreatePaddlePredictor<NativeConfig>(
+CreatePaddlePredictor<NativeConfig, PaddleEngineKind::kNative>(
const NativeConfig &config) {
VLOG(3) << "create NativePaddlePredictor";
if (config.use_gpu) {
// 1. GPU memory
+ PADDLE_ENFORCE_GT(
+ config.fraction_of_gpu_memory,
+ 0.f,
+ "fraction_of_gpu_memory in the config should be set to range (0., 1.]");
+ PADDLE_ENFORCE_GE(config.device, 0, "Invalid device id %d", config.device);
std::vector flags;
if (config.fraction_of_gpu_memory >= 0.0f ||
config.fraction_of_gpu_memory <= 0.95f) {
@@ -260,7 +276,7 @@ CreatePaddlePredictor(
}
std::unique_ptr<PaddlePredictor> predictor(new NativePaddlePredictor(config));
- if (!dynamic_cast<NativePaddlePredictor *>(predictor.get())->Init()) {
+ if (!dynamic_cast<NativePaddlePredictor *>(predictor.get())->Init(nullptr)) {
return nullptr;
}
return std::move(predictor);
diff --git a/paddle/contrib/inference/paddle_inference_api_impl.h b/paddle/contrib/inference/paddle_inference_api_impl.h
index 84707e223d7aa3d1ebca933923e932b3973613ae..86d1db7bcc7567e104cd20c9f767ed4513f611f5 100644
--- a/paddle/contrib/inference/paddle_inference_api_impl.h
+++ b/paddle/contrib/inference/paddle_inference_api_impl.h
@@ -34,14 +34,15 @@ class NativePaddlePredictor : public PaddlePredictor {
explicit NativePaddlePredictor(const NativeConfig &config)
: config_(config) {}
- bool Init();
+ // Only creates a sub-scope when a parent (global) scope is given.
+ bool Init(std::shared_ptr<framework::Scope> parent_scope);
bool Run(const std::vector<PaddleTensor> &inputs,
std::vector<PaddleTensor> *output_data) override;
std::unique_ptr<PaddlePredictor> Clone() override;
- ~NativePaddlePredictor() override{};
+ ~NativePaddlePredictor() override;
private:
bool SetFeed(const std::vector<PaddleTensor> &input_datas,
@@ -52,11 +53,13 @@ class NativePaddlePredictor : public PaddlePredictor {
NativeConfig config_;
platform::Place place_;
std::unique_ptr<framework::Executor> executor_;
- std::unique_ptr<framework::Scope> scope_;
+ std::shared_ptr<framework::Scope> scope_;
std::unique_ptr<framework::ExecutorPrepareContext> ctx_;
std::unique_ptr<framework::ProgramDesc> inference_program_;
std::vector<std::string> feed_target_names_;
std::vector<std::string> fetch_target_names_;
+ // Do not use unique_ptr, use parent scope to delete
+ framework::Scope *sub_scope_{nullptr};
};
} // namespace paddle
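To make the intent of the shared scope explicit: with Init() taking a parent scope, Clone() can hand each worker thread a predictor whose sub-scope hangs off the same parent, so model parameters are loaded only once. A minimal sketch under those assumptions (model path and thread count are placeholders, error handling omitted):

paddle::NativeConfig config;
config.model_dir = "/path/to/model";  // placeholder
config.use_gpu = false;

auto main_predictor =
    paddle::CreatePaddlePredictor<paddle::NativeConfig,
                                  paddle::PaddleEngineKind::kNative>(config);

std::vector<std::thread> workers;
for (int i = 0; i < 4; ++i) {
  workers.emplace_back([&] {
    // Shares the parent scope; only a lightweight sub-scope is created per clone.
    auto predictor = main_predictor->Clone();
    // ... fill std::vector<paddle::PaddleTensor> inputs and call predictor->Run(...)
  });
}
for (auto &w : workers) w.join();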
diff --git a/paddle/contrib/inference/test_paddle_inference_api_impl.cc b/paddle/contrib/inference/test_paddle_inference_api_impl.cc
index 5240fc2f20211ac5d38c57b71db31d04a6dc536a..1f960677163988be6f4c502738861bf86588f406 100644
--- a/paddle/contrib/inference/test_paddle_inference_api_impl.cc
+++ b/paddle/contrib/inference/test_paddle_inference_api_impl.cc
@@ -47,7 +47,6 @@ NativeConfig GetConfig() {
config.fraction_of_gpu_memory = 0.15;
config.use_gpu = true;
config.device = 0;
- config.share_variables = true;
return config;
}
@@ -75,7 +74,7 @@ TEST(paddle_inference_api_impl, word2vec) {
ASSERT_EQ(outputs.size(), 1UL);
size_t len = outputs[0].data.length;
float* data = static_cast<float*>(outputs[0].data.data);
- for (int j = 0; j < len / sizeof(float); ++j) {
+ for (size_t j = 0; j < len / sizeof(float); ++j) {
ASSERT_LT(data[j], 1.0);
ASSERT_GT(data[j], -1.0);
}
@@ -93,7 +92,7 @@ TEST(paddle_inference_api_impl, word2vec) {
TestInference(config.model_dir, cpu_feeds, cpu_fetchs1);
float* lod_data = output1.data<float>();
- for (size_t i = 0; i < output1.numel(); ++i) {
+ for (int i = 0; i < output1.numel(); ++i) {
EXPECT_LT(lod_data[i] - data[i], 1e-3);
EXPECT_GT(lod_data[i] - data[i], -1e-3);
}
diff --git a/paddle/fluid/framework/CMakeLists.txt b/paddle/fluid/framework/CMakeLists.txt
index ed1e70c6460b513c1d2e1add18ac037f71d36944..dbd375aa31bfbdcb109b6302acf23b3bb3b6befe 100644
--- a/paddle/fluid/framework/CMakeLists.txt
+++ b/paddle/fluid/framework/CMakeLists.txt
@@ -87,7 +87,7 @@ cc_library(executor SRCS executor.cc DEPS op_registry device_context scope
framework_proto glog lod_rank_table feed_fetch_method)
-cc_library(parallel_executor SRCS parallel_executor.cc DEPS multi_devices_graph_builder threaded_ssa_graph_executor)
+cc_library(parallel_executor SRCS parallel_executor.cc DEPS multi_devices_graph_builder threaded_ssa_graph_executor scope_buffered_ssa_graph_executor)
cc_library(prune SRCS prune.cc DEPS framework_proto)
cc_test(prune_test SRCS prune_test.cc DEPS op_info prune recurrent_op device_context)
diff --git a/paddle/fluid/framework/block_desc.cc b/paddle/fluid/framework/block_desc.cc
index e7842e9b8130d35e511e02dfb1dc27f307d17f38..f537e4b9e569dd4c513ac0efde7240833bcf04b6 100644
--- a/paddle/fluid/framework/block_desc.cc
+++ b/paddle/fluid/framework/block_desc.cc
@@ -169,17 +169,13 @@ void BlockDesc::Flush() {
}
if (need_update_) {
- auto &op_field = *this->desc_->mutable_ops();
- this->ClearPBOps();
- op_field.Reserve(static_cast<int>(ops_.size()));
+ this->desc_->mutable_ops()->Clear();
for (auto &op_desc : ops_) {
- op_field.AddAllocated(op_desc->Proto());
+ this->desc_->mutable_ops()->Add()->CopyFrom(*op_desc->Proto());
}
- auto &var_field = *this->desc_->mutable_vars();
- this->ClearPBVars();
- var_field.Reserve(static_cast<int>(vars_.size()));
+ this->desc_->mutable_vars()->Clear();
for (auto &var_desc : vars_) {
- var_field.AddAllocated(var_desc.second->Proto());
+ this->desc_->mutable_vars()->Add()->CopyFrom(*var_desc.second->Proto());
}
need_update_ = false;
}
@@ -217,22 +213,6 @@ BlockDesc::BlockDesc(const BlockDesc &other, proto::BlockDesc *desc,
}
}
-void BlockDesc::ClearPBOps() {
- auto ops = this->desc_->mutable_ops();
- while (!ops->empty()) {
- // we do not own the OpDesc, so release the ownership.
- ops->ReleaseLast();
- }
-}
-
-void BlockDesc::ClearPBVars() {
- auto vars = this->desc_->mutable_vars();
- while (!vars->empty()) {
- // we do not own the VarDesc, so release the ownership.
- vars->ReleaseLast();
- }
-}
-
void BlockDesc::SetForwardBlockID(int32_t forward_block_id) {
PADDLE_ENFORCE(!desc_->has_forward_block_idx(),
"Parent block ID has been set to %d. Cannot set to %d",
diff --git a/paddle/fluid/framework/block_desc.h b/paddle/fluid/framework/block_desc.h
index 189dd6c52f85b5bf623b98c64c07c0c7269505d4..ce48548418478cc5c9f9ca1244df9e66dca884e6 100644
--- a/paddle/fluid/framework/block_desc.h
+++ b/paddle/fluid/framework/block_desc.h
@@ -41,11 +41,6 @@ class BlockDesc {
BlockDesc(const BlockDesc &other, proto::BlockDesc *desc, ProgramDesc *prog);
- ~BlockDesc() {
- this->ClearPBVars();
- this->ClearPBOps();
- }
-
int32_t ID() const { return desc_->idx(); }
int32_t Parent() const { return desc_->parent_idx(); }
@@ -113,10 +108,6 @@ class BlockDesc {
ProgramDesc *Program() const { return this->prog_; }
- private:
- void ClearPBOps();
- void ClearPBVars();
-
private:
ProgramDesc *prog_; // not_own
proto::BlockDesc *desc_; // not_own
diff --git a/paddle/fluid/framework/details/CMakeLists.txt b/paddle/fluid/framework/details/CMakeLists.txt
index 1bcd8412eb2d618b923bcd0557d118af62271f4a..c026e6c100a303b43650f08cd12d7260258c8f7e 100644
--- a/paddle/fluid/framework/details/CMakeLists.txt
+++ b/paddle/fluid/framework/details/CMakeLists.txt
@@ -36,5 +36,6 @@ cc_test(broadcast_op_test SRCS broadcast_op_handle_test.cc DEPS var_handle op_ha
device_context broadcast_op_handle)
cc_test(gather_op_test SRCS gather_op_handle_test.cc DEPS var_handle op_handle_base scope ddim memory
device_context gather_op_handle)
+cc_library(scope_buffered_ssa_graph_executor SRCS scope_buffered_ssa_graph_executor.cc DEPS ssa_graph_executor)
#cc_test(reduce_op_handle_test SRCS reduce_op_handle_test.cc DEPS var_handle op_handle_base scope ddim memory
# device_context reduce_op_handle )
diff --git a/paddle/fluid/framework/details/execution_strategy.h b/paddle/fluid/framework/details/execution_strategy.h
index e8d510ec955602b5a3f73ca06caa121886eb150b..e7aa74742f827efabff1189d3213edd748d9082d 100644
--- a/paddle/fluid/framework/details/execution_strategy.h
+++ b/paddle/fluid/framework/details/execution_strategy.h
@@ -22,6 +22,7 @@ struct ExecutionStrategy {
size_t num_threads_{0};
bool use_event_{true};
bool allow_op_delay_{false};
+ size_t num_iteration_per_drop_scope_{100};
};
} // namespace details
diff --git a/paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.cc b/paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.cc
new file mode 100644
index 0000000000000000000000000000000000000000..eb4e7ec52f907f9403e21ec2734d61824f51a58b
--- /dev/null
+++ b/paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.cc
@@ -0,0 +1,76 @@
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.h"
+#include <string>
+#include <vector>
+#include "paddle/fluid/framework/executor.h"
+
+namespace paddle {
+namespace framework {
+namespace details {
+ScopeBufferedSSAGraphExecutor::ScopeBufferedSSAGraphExecutor(
+ ExecutionStrategy strategy, std::vector<Scope *> local_scopes,
+ std::vector<VariableInfo> var_infos, std::vector<platform::Place> places,
+ std::unique_ptr<SSAGraphExecutor> &&underlying_executor)
+ : strategy_(std::move(strategy)),
+ underlying_executor_(std::move(underlying_executor)),
+ local_scopes_(std::move(local_scopes)),
+ var_infos_(std::move(var_infos)),
+ places_(std::move(places)) {}
+
+FeedFetchList ScopeBufferedSSAGraphExecutor::Run(
+ const std::vector &fetch_tensors) {
+ if (drop_scope_counter_ == 0) {
+ // Create local scopes.
+ for (auto it = local_scopes_.rbegin(); it != local_scopes_.rend(); ++it) {
+ auto &scope = *it;
+ Scope &local_scope = scope->NewScope();
+ *scope->Var(details::kLocalExecScopeName)->GetMutable<Scope *>() =
+ &local_scope;
+
+ for (auto &info : var_infos_) {
+ if (scope->FindVar(info.name_) != nullptr) {
+ continue;
+ }
+
+ if (info.persistable_) { // Persistable
+ InitializeVariable(scope->Var(info.name_), info.type_);
+ } else {
+ InitializeVariable(local_scope.Var(info.name_), info.type_);
+ }
+ }
+ }
+ }
+
+ auto fetch_data = underlying_executor_->Run(fetch_tensors);
+ drop_scope_counter_ += 1;
+ if (!fetch_tensors.empty() ||
+ drop_scope_counter_ == strategy_.num_iteration_per_drop_scope_) {
+ drop_scope_counter_ = 0;
+ // Wait All computational streams
+ for (auto p : places_) {
+ platform::DeviceContextPool::Instance().Get(p)->Wait();
+ }
+ for (auto &scope : local_scopes_) {
+ auto &local_scope =
+ *scope->Var(details::kLocalExecScopeName)->GetMutable<Scope *>();
+ scope->DeleteScope(local_scope);
+ }
+ }
+ return fetch_data;
+}
+} // namespace details
+} // namespace framework
+} // namespace paddle
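The composition below mirrors how parallel_executor.cc (later in this patch) stacks the new executor on top of the threaded one; it is shown here only as a sketch to make the buffering behaviour explicit, with local_scopes, places, var_infos, graph and fetch_names standing in for objects the ParallelExecutor already owns.

details::ExecutionStrategy strategy;
strategy.num_iteration_per_drop_scope_ = 100;  // reuse local scopes for 100 runs

std::unique_ptr<details::SSAGraphExecutor> exec(
    new details::ThreadedSSAGraphExecutor(strategy, local_scopes, places,
                                          std::move(graph)));
std::unique_ptr<details::SSAGraphExecutor> buffered(
    new details::ScopeBufferedSSAGraphExecutor(
        strategy, local_scopes, std::move(var_infos), places, std::move(exec)));

// Local scopes are created lazily on the first Run() and dropped either after
// num_iteration_per_drop_scope_ iterations or whenever tensors are fetched.
FeedFetchList result = buffered->Run(fetch_names);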
diff --git a/paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.h b/paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.h
new file mode 100644
index 0000000000000000000000000000000000000000..20df7a4722d589ffd168f842e927cff8411096bb
--- /dev/null
+++ b/paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.h
@@ -0,0 +1,53 @@
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include <memory>
+#include <string>
+#include <vector>
+#include "paddle/fluid/framework/details/execution_strategy.h"
+#include "paddle/fluid/framework/details/ssa_graph_executor.h"
+#include "paddle/fluid/framework/scope.h"
+#include "paddle/fluid/platform/place.h"
+namespace paddle {
+namespace framework {
+namespace details {
+
+struct VariableInfo {
+ std::string name_;
+ proto::VarType::Type type_;
+ bool persistable_;
+};
+
+class ScopeBufferedSSAGraphExecutor : public SSAGraphExecutor {
+ public:
+ ScopeBufferedSSAGraphExecutor(
+ ExecutionStrategy strategy, std::vector<Scope*> local_scopes,
+ std::vector<VariableInfo> var_infos, std::vector<platform::Place> places,
+ std::unique_ptr<SSAGraphExecutor>&& underlying_executor);
+ FeedFetchList Run(const std::vector<std::string>& fetch_tensors) override;
+
+ private:
+ size_t drop_scope_counter_{0};
+
+ ExecutionStrategy strategy_;
+ std::unique_ptr<SSAGraphExecutor> underlying_executor_;
+ std::vector<Scope*> local_scopes_;
+ std::vector<VariableInfo> var_infos_;
+ std::vector<platform::Place> places_;
+};
+} // namespace details
+} // namespace framework
+} // namespace paddle
diff --git a/paddle/fluid/framework/details/ssa_graph_executor.cc b/paddle/fluid/framework/details/ssa_graph_executor.cc
index 8da6ca889b89999e0f6f974503cea476c9de97f3..09b97bd0d98dc4ad1124dcbc495cff921bf03efc 100644
--- a/paddle/fluid/framework/details/ssa_graph_executor.cc
+++ b/paddle/fluid/framework/details/ssa_graph_executor.cc
@@ -17,10 +17,6 @@
namespace paddle {
namespace framework {
namespace details {
-
-SSAGraphExecutor::SSAGraphExecutor(std::unique_ptr<SSAGraph> &&graph)
- : graph_(std::move(graph)) {}
-
SSAGraphExecutor::~SSAGraphExecutor() {}
} // namespace details
diff --git a/paddle/fluid/framework/details/ssa_graph_executor.h b/paddle/fluid/framework/details/ssa_graph_executor.h
index a8833b7388ab907020a260d356f1484ffd227658..958086033607a4ed8fb840f5b14fe5779625bd82 100644
--- a/paddle/fluid/framework/details/ssa_graph_executor.h
+++ b/paddle/fluid/framework/details/ssa_graph_executor.h
@@ -28,15 +28,11 @@ class SSAGraphExecutor {
DISABLE_COPY_AND_ASSIGN(SSAGraphExecutor);
public:
- // Steal graph inside
- explicit SSAGraphExecutor(std::unique_ptr<SSAGraph> &&graph);
+ SSAGraphExecutor() {}
virtual ~SSAGraphExecutor();
virtual FeedFetchList Run(const std::vector<std::string> &fetch_tensors) = 0;
-
- protected:
- std::unique_ptr<SSAGraph> graph_;
};
} // namespace details
} // namespace framework
diff --git a/paddle/fluid/framework/details/threaded_ssa_graph_executor.cc b/paddle/fluid/framework/details/threaded_ssa_graph_executor.cc
index 815f739371e77d953a28be99b38ec1b8ff26506c..496fadd04dac982b87b9d9e14f599ed37d9709d0 100644
--- a/paddle/fluid/framework/details/threaded_ssa_graph_executor.cc
+++ b/paddle/fluid/framework/details/threaded_ssa_graph_executor.cc
@@ -21,7 +21,7 @@ ThreadedSSAGraphExecutor::ThreadedSSAGraphExecutor(
const ExecutionStrategy &strategy, const std::vector<Scope *> &local_scopes,
const std::vector<platform::Place> &places,
std::unique_ptr<SSAGraph> &&graph)
- : SSAGraphExecutor(std::move(graph)),
+ : graph_(std::move(graph)),
pool_(strategy.num_threads_ >= 2 ? new ::ThreadPool(strategy.num_threads_)
: nullptr),
local_scopes_(local_scopes),
@@ -189,7 +189,9 @@ void ThreadedSSAGraphExecutor::RunOp(
BlockingQueue *ready_var_q, details::OpHandleBase *op) {
auto op_run = [ready_var_q, op, this] {
try {
- VLOG(10) << op << " " << op->Name() << " : " << op->DebugString();
+ if (VLOG_IS_ON(10)) {
+ VLOG(10) << op << " " << op->Name() << " : " << op->DebugString();
+ }
op->Run(strategy_.use_event_);
VLOG(10) << op << " " << op->Name() << " Done ";
running_ops_--;
diff --git a/paddle/fluid/framework/details/threaded_ssa_graph_executor.h b/paddle/fluid/framework/details/threaded_ssa_graph_executor.h
index 1f7f88d75218e757e4555ad093f3cd6558f624dd..4a2075f1cccb3211316567197da56c01d26f35ce 100644
--- a/paddle/fluid/framework/details/threaded_ssa_graph_executor.h
+++ b/paddle/fluid/framework/details/threaded_ssa_graph_executor.h
@@ -51,6 +51,7 @@ class ThreadedSSAGraphExecutor : public SSAGraphExecutor {
details::OpHandleBase *op);
private:
+ std::unique_ptr<SSAGraph> graph_;
std::unique_ptr<::ThreadPool> pool_;
std::vector<Scope *> local_scopes_;
std::vector<platform::Place> places_;
diff --git a/paddle/fluid/framework/executor.cc b/paddle/fluid/framework/executor.cc
index 863053c32b190f4e8497b16f3edd76cb2f76168b..3d68c5fb870d5b575f97eeb286528544402b8ed9 100644
--- a/paddle/fluid/framework/executor.cc
+++ b/paddle/fluid/framework/executor.cc
@@ -220,8 +220,10 @@ void Executor::Run(const ProgramDesc& program, Scope* scope,
has_fetch_operators(program.Block(0), *fetch_targets, fetch_holder_name);
ProgramDesc* copy_program = const_cast<ProgramDesc*>(&program);
+ std::unique_ptr<ProgramDesc> unique_ptr_of_copy_program;
if (!has_feed_ops || !has_fetch_ops) {
- copy_program = std::unique_ptr<ProgramDesc>(new ProgramDesc(program)).get();
+ unique_ptr_of_copy_program.reset(new ProgramDesc(program));
+ copy_program = unique_ptr_of_copy_program.get();
}
auto* global_block = copy_program->MutableBlock(0);
diff --git a/paddle/fluid/framework/parallel_executor.cc b/paddle/fluid/framework/parallel_executor.cc
index 50c3468d556bfe05d6c41906cf35cb671f711b1e..003304b85af00165d54efbb199be01b2c5106768 100644
--- a/paddle/fluid/framework/parallel_executor.cc
+++ b/paddle/fluid/framework/parallel_executor.cc
@@ -23,6 +23,7 @@ limitations under the License. */
#endif
#include "paddle/fluid/framework/details/multi_devices_graph_builder.h"
+#include "paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.h"
#include "paddle/fluid/framework/details/threaded_ssa_graph_executor.h"
#include "paddle/fluid/platform/profiler.h"
@@ -42,8 +43,6 @@ class ParallelExecutorPrivate {
#ifdef PADDLE_WITH_CUDA
std::unique_ptr<platform::NCCLContextMap> nccl_ctxs_;
#endif
-
- std::vector<std::tuple<std::string, proto::VarType::Type, bool>> var_types_;
bool own_local_scope;
};
@@ -92,9 +91,18 @@ ParallelExecutor::ParallelExecutor(
local_scopes.empty()) { // Is CUDA
BCastParamsToGPUs(bcast_vars);
}
-// Startup Program has been run. All local scopes has correct parameters.
+ // Startup Program has been run. All local scopes have correct parameters.
+
+ // Step 2. Create vars in each scope;
+ std::vector<details::VariableInfo> var_infos;
+ for (auto *var : main_program.Block(0).AllVars()) {
+ var_infos.emplace_back();
+ var_infos.back().name_ = var->Name();
+ var_infos.back().type_ = var->GetType();
+ var_infos.back().persistable_ = var->Persistable();
+ }
-// Step 2. Convert main_program to SSA form and dependency graph. Also, insert
+// Step 3. Convert main_program to SSA form and dependency graph. Also, insert
// ncclOp
#ifdef PADDLE_WITH_CUDA
details::MultiDevSSAGraphBuilder builder(
@@ -105,16 +113,15 @@ ParallelExecutor::ParallelExecutor(
params, member_->local_scopes_,
build_strategy);
#endif
+
auto graph = builder.Build(main_program);
member_->executor_.reset(new details::ThreadedSSAGraphExecutor(
exec_strategy, member_->local_scopes_, places, std::move(graph)));
- // Step 3. Create vars in each scope;
- for (auto *var : main_program.Block(0).AllVars()) {
- member_->var_types_.emplace_back(var->Name(), var->GetType(),
- var->Persistable());
- }
+ member_->executor_.reset(new details::ScopeBufferedSSAGraphExecutor(
+ exec_strategy, member_->local_scopes_, std::move(var_infos),
+ member_->places_, std::move(member_->executor_)));
}
void ParallelExecutor::BCastParamsToGPUs(
@@ -169,42 +176,9 @@ void ParallelExecutor::BCastParamsToGPUs(
void ParallelExecutor::Run(const std::vector &fetch_tensors,
const std::string &fetched_var_name) {
platform::RecordBlock b(0);
- // Create local scopes.
- for (auto it = member_->local_scopes_.rbegin();
- it != member_->local_scopes_.rend(); ++it) {
- auto &scope = *it;
- Scope &local_scope = scope->NewScope();
- *scope->Var(details::kLocalExecScopeName)->GetMutable<Scope *>() =
- &local_scope;
-
- for (auto &name_type_pair : member_->var_types_) {
- if (scope->FindVar(std::get<0>(name_type_pair)) != nullptr) {
- continue;
- }
-
- if (std::get<2>(name_type_pair)) { // Persistable
- InitializeVariable(scope->Var(std::get<0>(name_type_pair)),
- std::get<1>(name_type_pair));
- } else {
- InitializeVariable(local_scope.Var(std::get<0>(name_type_pair)),
- std::get<1>(name_type_pair));
- }
- }
- }
-
auto fetch_data = member_->executor_->Run(fetch_tensors);
*member_->global_scope_->Var(fetched_var_name)->GetMutable<FeedFetchList>() =
fetch_data;
-
- // Wait All computational streams
- for (auto p : member_->places_) {
- platform::DeviceContextPool::Instance().Get(p)->Wait();
- }
- for (auto &scope : member_->local_scopes_) {
- auto &local_scope =
- *scope->Var(details::kLocalExecScopeName)->GetMutable<Scope *>();
- scope->DeleteScope(local_scope);
- }
}
void ParallelExecutor::FeedTensorsIntoLocalScopes(
diff --git a/paddle/fluid/inference/analysis/CMakeLists.txt b/paddle/fluid/inference/analysis/CMakeLists.txt
index 9faf5bb3036775a2ba0c08d3d6ea17ffa73753c6..50835784440bfa177e38f9760bb4a47ad335a9e1 100644
--- a/paddle/fluid/inference/analysis/CMakeLists.txt
+++ b/paddle/fluid/inference/analysis/CMakeLists.txt
@@ -15,3 +15,9 @@ cc_test(test_subgraph_splitter
DEPS analysis paddle_fluid tensor
ARGS --inference_model_dir=${PYTHON_TESTS_DIR}/book/word2vec.inference.model)
set_tests_properties(test_subgraph_splitter PROPERTIES DEPENDS test_word2vec)
+
+cc_test(test_dfg_graphviz_draw_pass
+ SRCS dfg_graphviz_draw_pass_tester.cc
+ DEPS analysis
+ ARGS --inference_model_dir=${PYTHON_TESTS_DIR}/book/word2vec.inference.model)
+set_tests_properties(test_dfg_graphviz_draw_pass PROPERTIES DEPENDS test_word2vec)
diff --git a/paddle/fluid/inference/analysis/dfg_graphviz_draw_pass.h b/paddle/fluid/inference/analysis/dfg_graphviz_draw_pass.h
new file mode 100644
index 0000000000000000000000000000000000000000..41d4475382befa1bdaf7473520d64005a472a459
--- /dev/null
+++ b/paddle/fluid/inference/analysis/dfg_graphviz_draw_pass.h
@@ -0,0 +1,68 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+/*
+ * This file creates a DFG_GraphvizDrawPass, which helps to draw a data flow
+ * graph's structure using graphviz.
+ */
+
+#pragma once
+
+#include <fstream>
+#include <string>
+#include "paddle/fluid/inference/analysis/pass.h"
+
+namespace paddle {
+namespace inference {
+namespace analysis {
+
+/*
+ * Output a dot file and write to some place.
+ */
+class DFG_GraphvizDrawPass : public DataFlowGraphPass {
+ public:
+ DFG_GraphvizDrawPass(const std::string& dir, const std::string& id)
+ : dir_(dir), id_(id) {}
+
+ bool Initialize() override { return Pass::Initialize(); }
+ void Run(DataFlowGraph* graph) override {
+ auto content = Draw(graph);
+ std::ofstream file(GenDotPath());
+ file.write(content.c_str(), content.size());
+ file.close();
+ LOG(INFO) << "draw dot to " << GenDotPath();
+ }
+
+ bool Finalize() override { return Pass::Finalize(); }
+
+ Pass* CreatePrinterPass(std::ostream& os,
+ const std::string& banner) const override {
+ return nullptr;
+ }
+
+ private:
+ // Path of the dot file to output.
+ std::string GenDotPath() const {
+ return dir_ + "/" + "graph_" + id_ + ".dot";
+ }
+
+ std::string Draw(DataFlowGraph* graph) { return graph->DotString(); }
+
+ std::string dir_;
+ std::string id_;
+};
+
+} // namespace analysis
+} // namespace inference
+} // namespace paddle
diff --git a/paddle/fluid/inference/analysis/dfg_graphviz_draw_pass_tester.cc b/paddle/fluid/inference/analysis/dfg_graphviz_draw_pass_tester.cc
new file mode 100644
index 0000000000000000000000000000000000000000..3fc1cc18b855440c54c1ed6a9ab49a104c8c21f0
--- /dev/null
+++ b/paddle/fluid/inference/analysis/dfg_graphviz_draw_pass_tester.cc
@@ -0,0 +1,46 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/fluid/inference/analysis/dfg_graphviz_draw_pass.h"
+
+#include <gtest/gtest.h>
+#include <fstream>
+#include <string>
+#include "paddle/fluid/inference/analysis/ut_helper.h"
+
+namespace paddle {
+namespace inference {
+namespace analysis {
+
+TEST_F(DFG_Tester, dfg_graphviz_draw_pass_tester) {
+ auto dfg = ProgramDescToDFG(desc);
+ DFG_GraphvizDrawPass pass("./", "test");
+ pass.Initialize();
+ pass.Run(&dfg);
+
+ // test content
+ std::ifstream file("./graph_test.dot");
+ ASSERT_TRUE(file.is_open());
+
+ std::string line;
+ int no{0};
+ while (std::getline(file, line)) {
+ no++;
+ }
+ ASSERT_EQ(no, 82);
+}
+
+} // namespace analysis
+} // namespace inference
+} // namespace paddle
diff --git a/paddle/fluid/inference/tensorrt/convert/CMakeLists.txt b/paddle/fluid/inference/tensorrt/convert/CMakeLists.txt
index 5ada1d631269209e912e2d4817382ea2c6c67353..23ca8bfac84f35ebdca2e2a1a8538d366358ca8b 100644
--- a/paddle/fluid/inference/tensorrt/convert/CMakeLists.txt
+++ b/paddle/fluid/inference/tensorrt/convert/CMakeLists.txt
@@ -8,3 +8,5 @@ nv_test(test_op_converter SRCS test_op_converter.cc mul_op.cc conv2d_op.cc DEPS
nv_test(test_io_converter SRCS test_io_converter.cc io_converter.cc DEPS dynload_cuda dynamic_loader lod_tensor)
nv_test(test_trt_mul_op SRCS test_mul_op.cc mul_op.cc
DEPS ${FLUID_CORE_MODULES} tensorrt_engine mul_op SERIAL)
+nv_test(test_trt_fc_op SRCS test_fc_op.cc fc_op.cc
+ DEPS ${FLUID_CORE_MODULES} tensorrt_engine mul_op SERIAL)
diff --git a/paddle/fluid/inference/tensorrt/convert/conv2d_op.cc b/paddle/fluid/inference/tensorrt/convert/conv2d_op.cc
index 209936c3bafb0d31546856dc36c1b48053a0634b..668d344f1bba1c012dcb42c71b996209b4703d78 100644
--- a/paddle/fluid/inference/tensorrt/convert/conv2d_op.cc
+++ b/paddle/fluid/inference/tensorrt/convert/conv2d_op.cc
@@ -21,7 +21,8 @@ namespace tensorrt {
class Conv2dOpConverter : public OpConverter {
public:
Conv2dOpConverter() {}
- void operator()(const framework::proto::OpDesc& op) override {
+ void operator()(const framework::proto::OpDesc& op,
+ const framework::Scope& scope) override {
LOG(INFO)
<< "convert a fluid conv2d op to tensorrt conv layer without bias";
}
diff --git a/paddle/fluid/inference/tensorrt/convert/fc_op.cc b/paddle/fluid/inference/tensorrt/convert/fc_op.cc
new file mode 100644
index 0000000000000000000000000000000000000000..45b079559754a8f5c3fe39781b5700a75f425e99
--- /dev/null
+++ b/paddle/fluid/inference/tensorrt/convert/fc_op.cc
@@ -0,0 +1,119 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/fluid/framework/eigen.h"
+#include "paddle/fluid/framework/lod_tensor.h"
+#include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/inference/tensorrt/convert/op_converter.h"
+#include "paddle/fluid/inference/tensorrt/engine.h"
+#include "paddle/fluid/platform/place.h"
+
+namespace paddle {
+namespace inference {
+namespace tensorrt {
+
+// Reorder the elements from istrides to ostrides, borrowed from TRT convert in
+// tensorflow.
+// https://github.com/tensorflow/tensorflow/blob/master/tensorflow/contrib/tensorrt/convert/convert_nodes.cc#L318
+template <typename T>
+void Reorder2(nvinfer1::DimsHW shape, const T* idata, nvinfer1::DimsHW istrides,
+ T* odata, nvinfer1::DimsHW ostrides) {
+ for (int h = 0; h < shape.h(); ++h) {
+ for (int w = 0; w < shape.w(); ++w) {
+ odata[h * ostrides.h() + w * ostrides.w()] =
+ idata[h * istrides.h() + w * istrides.w()];
+ }
+ }
+}
+
+// Reorder the data layout from CK to KC.
+void ReorderCKtoKC(TensorRTEngine::Weight& iweights,
+ TensorRTEngine::Weight* oweights) {
+ int c = iweights.dims[0];
+ int k = iweights.dims[1];
+ oweights->dims.assign({k, c});
+ nvinfer1::DimsHW istrides = {1, k};
+ nvinfer1::DimsHW ostrides = {c, 1};
+ Reorder2({k, c}, static_cast<const float*>(iweights.get().values), istrides,
+ static_cast<float*>(const_cast<void*>(oweights->get().values)),
+ ostrides);
+}
+
+/*
+ * The FC converter converts a MUL op in Fluid to an FC layer in TRT.
+ */
+class FcOpConverter : public OpConverter {
+ public:
+ void operator()(const framework::proto::OpDesc& op,
+ const framework::Scope& scope) override {
+ VLOG(4) << "convert a fluid fc op to tensorrt fc layer without bias";
+
+ framework::OpDesc op_desc(op, nullptr);
+ PADDLE_ENFORCE_EQ(op_desc.Input("X").size(), 1);
+ PADDLE_ENFORCE_EQ(op_desc.Input("Y").size(), 1); // Y is a weight
+ PADDLE_ENFORCE_EQ(op_desc.Output("Out").size(), 1);
+
+ // Declare inputs
+ auto* X = engine_->GetITensor(op_desc.Input("X").front());
+
+ // Declare weights
+ auto* Y_v = scope.FindVar(op_desc.Input("Y").front());
+ PADDLE_ENFORCE_NOT_NULL(Y_v);
+ auto* Y_t = Y_v->GetMutable<framework::LoDTensor>();
+ // This may trigger a GPU->CPU copy, because TRT's weight can only be
+ // assigned from CPU memory, that can't be avoided.
+ auto* weight_data = Y_t->mutable_data<float>(platform::CPUPlace());
+ PADDLE_ENFORCE_EQ(Y_t->dims().size(), 2UL); // a matrix
+ size_t n_output = Y_t->dims()[1];
+
+ framework::LoDTensor tmp;
+ tmp.Resize(Y_t->dims());
+ memcpy(tmp.mutable_data<float>(platform::CPUPlace()), Y_t->data<float>(),
+ Y_t->dims()[0] * Y_t->dims()[1] * sizeof(float));
+
+ TensorRTEngine::Weight weight{nvinfer1::DataType::kFLOAT,
+ static_cast<void*>(weight_data),
+ Y_t->memory_size() / sizeof(float)};
+ TensorRTEngine::Weight tmp_weight(nvinfer1::DataType::kFLOAT,
+ static_cast<void*>(tmp.data<float>()),
+ Y_t->memory_size() / sizeof(float));
+ weight.dims.assign({Y_t->dims()[0], Y_t->dims()[1]});
+ tmp_weight.dims = weight.dims;
+
+ // The data layout of TRT FC layer's weight is different from fluid's FC,
+ // need to reorder the elements.
+ ReorderCKtoKC(tmp_weight, &weight);
+
+ // Currently, the framework can only handle one fluid op -> one TRT layer,
+ // but fc fuses `mul` and `bias` (2 fluid ops), so here is a trick, just
+ // handle `mul`, leave `add` as another layer.
+ // DEBUG
+ TensorRTEngine::Weight bias{nvinfer1::DataType::kFLOAT, nullptr, 0};
+
+ auto* layer = TRT_ENGINE_ADD_LAYER(engine_, FullyConnected,
+ *const_cast<nvinfer1::ITensor*>(X),
+ n_output, weight.get(), bias.get());
+
+ auto output_name = op_desc.Output("Out").front();
+ engine_->DeclareOutput(layer, 0, output_name);
+ }
+};
+
+REGISTER_TRT_OP_CONVERTER(fc, FcOpConverter);
+
+} // namespace tensorrt
+} // namespace inference
+} // namespace paddle
+
+USE_OP(mul);
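A small worked example of the CK-to-KC reordering above, assuming (as the strides suggest) that dims = {C, K} describes a row-major C x K weight; the reorder is simply a transpose:

// Illustrative self-check, not part of the patch.
float ck[6] = {0, 1, 2, 3, 4, 5};          // C = 2, K = 3, row-major CK layout
float kc[6] = {0};
Reorder2({3, 2}, ck, {1, 3}, kc, {2, 1});  // shape = {K, C}
// kc == {0, 3, 1, 4, 2, 5}, i.e. the 3 x 2 (KC) transpose of ck.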
diff --git a/paddle/fluid/inference/tensorrt/convert/mul_op.cc b/paddle/fluid/inference/tensorrt/convert/mul_op.cc
index aa8e66490f7e40038b0de4da32655f1b168ca332..6bb07709c7ee1c6b29c46425849a4f472d3df59d 100644
--- a/paddle/fluid/inference/tensorrt/convert/mul_op.cc
+++ b/paddle/fluid/inference/tensorrt/convert/mul_op.cc
@@ -24,8 +24,9 @@ namespace tensorrt {
class MulOpConverter : public OpConverter {
public:
MulOpConverter() {}
- void operator()(const framework::proto::OpDesc& op) override {
- VLOG(4) << "convert a fluid mul op to tensorrt fc layer without bias";
+ void operator()(const framework::proto::OpDesc& op,
+ const framework::Scope& scope) override {
+ VLOG(4) << "convert a fluid mul op to tensorrt mul layer without bias";
framework::OpDesc op_desc(op, nullptr);
// Declare inputs
diff --git a/paddle/fluid/inference/tensorrt/convert/op_converter.h b/paddle/fluid/inference/tensorrt/convert/op_converter.h
index 1cd3ed9a00acead2599420f88499bd0d74c2974b..3beafeefd06f24ec50b0e61c1fabe13d7e53f242 100644
--- a/paddle/fluid/inference/tensorrt/convert/op_converter.h
+++ b/paddle/fluid/inference/tensorrt/convert/op_converter.h
@@ -31,27 +31,42 @@ namespace tensorrt {
class OpConverter {
public:
OpConverter() {}
- virtual void operator()(const framework::proto::OpDesc& op) {}
- void Run(const framework::proto::OpDesc& op, TensorRTEngine* engine) {
- std::string type = op.type();
- auto* it = Registry<OpConverter>::Lookup(type);
- PADDLE_ENFORCE_NOT_NULL(it, "no OpConverter for optype [%s]", type);
- it->SetEngine(engine);
- (*it)(op);
- }
+ // Converter logic for an op.
+ virtual void operator()(const framework::proto::OpDesc& op,
+ const framework::Scope& scope) {}
+
+ // Convert a single fluid operator and add the corresponding layer to TRT.
+ void ConvertOp(const framework::proto::OpDesc& op,
+ const std::unordered_set<std::string>& parameters,
+ const framework::Scope& scope, TensorRTEngine* engine) {
+ framework::OpDesc op_desc(op, nullptr);
+
+ OpConverter* it{nullptr};
- // convert fluid op to tensorrt layer
- void ConvertOp(const framework::proto::OpDesc& op, TensorRTEngine* engine) {
- OpConverter::Run(op, engine);
+ if (op_desc.Type() == "mul") {
+ PADDLE_ENFORCE_EQ(op_desc.Input("Y").size(), 1UL);
+ std::string Y = op_desc.Input("Y")[0];
+ if (parameters.count(Y)) {
+ it = Registry::Lookup("fc");
+ }
+ }
+ if (!it) {
+ it = Registry<OpConverter>::Lookup(op_desc.Type());
+ }
+ PADDLE_ENFORCE_NOT_NULL(it, "no OpConverter for optype [%s]",
+ op_desc.Type());
+ it->SetEngine(engine);
+ (*it)(op, scope);
}
// convert fluid block to tensorrt network
void ConvertBlock(const framework::proto::BlockDesc& block,
- TensorRTEngine* engine) {
+ const std::unordered_set<std::string>& parameters,
+ const framework::Scope& scope, TensorRTEngine* engine) {
for (int i = 0; i < block.ops_size(); i++) {
const auto& op = block.ops(i);
- OpConverter::Run(op, engine);
+ ConvertOp(op, parameters, scope, engine);
}
}
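For context, the dispatch above means a `mul` whose Y input appears in `parameters` is routed to the converter registered under "fc", while every other op falls back to a converter registered under its own type. A hypothetical converter following the same pattern as the FcOpConverter earlier in this patch would look like this (the op name and class are illustrative only):

// Illustrative only; `my_op` and MyOpConverter are not part of this patch.
class MyOpConverter : public OpConverter {
 public:
  void operator()(const framework::proto::OpDesc& op,
                  const framework::Scope& scope) override {
    framework::OpDesc op_desc(op, nullptr);
    // Look up persistable weights through `scope`, then add the matching
    // TensorRT layer through engine_.
  }
};
REGISTER_TRT_OP_CONVERTER(my_op, MyOpConverter);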
diff --git a/paddle/fluid/inference/tensorrt/convert/test_fc_op.cc b/paddle/fluid/inference/tensorrt/convert/test_fc_op.cc
new file mode 100644
index 0000000000000000000000000000000000000000..a30253072ac581ceca85ca10151a176f87a7cb39
--- /dev/null
+++ b/paddle/fluid/inference/tensorrt/convert/test_fc_op.cc
@@ -0,0 +1,46 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include <gtest/gtest.h>
+#include "paddle/fluid/inference/tensorrt/convert/op_converter.h"
+#include "paddle/fluid/inference/tensorrt/convert/ut_helper.h"
+
+namespace paddle {
+namespace inference {
+namespace tensorrt {
+
+TEST(fc_op, test) {
+ std::unordered_set parameters({"mul-Y"});
+ framework::Scope scope;
+ TRTConvertValidation validator(20, parameters, scope, 1000);
+
+ validator.DeclInputVar("mul-X", nvinfer1::Dims4(8, 3, 1, 1));
+ validator.DeclParamVar("mul-Y", nvinfer1::Dims2(3, 2));
+ validator.DeclOutputVar("mul-Out", nvinfer1::Dims2(8, 2));
+
+ // Prepare Op description
+ framework::OpDesc desc;
+ desc.SetType("mul");
+ desc.SetInput("X", {"mul-X"});
+ desc.SetInput("Y", {"mul-Y"});
+ desc.SetOutput("Out", {"mul-Out"});
+
+ validator.SetOp(*desc.Proto());
+
+ validator.Execute(10);
+}
+
+} // namespace tensorrt
+} // namespace inference
+} // namespace paddle
diff --git a/paddle/fluid/inference/tensorrt/convert/test_mul_op.cc b/paddle/fluid/inference/tensorrt/convert/test_mul_op.cc
index d8b61d5f08ffd071c112b4677fcb6f6f50784bbc..1ce1130e5d660d717a1262a1fbdb4b620462c0b3 100644
--- a/paddle/fluid/inference/tensorrt/convert/test_mul_op.cc
+++ b/paddle/fluid/inference/tensorrt/convert/test_mul_op.cc
@@ -21,7 +21,9 @@ namespace inference {
namespace tensorrt {
TEST(MulOpConverter, main) {
- TRTConvertValidation validator(10, 1000);
+ framework::Scope scope;
+ std::unordered_set<std::string> parameters;
+ TRTConvertValidation validator(10, parameters, scope, 1000);
validator.DeclInputVar("mul-X", nvinfer1::Dims2(10, 6));
validator.DeclInputVar("mul-Y", nvinfer1::Dims2(6, 10));
validator.DeclOutputVar("mul-Out", nvinfer1::Dims2(10, 10));
diff --git a/paddle/fluid/inference/tensorrt/convert/test_op_converter.cc b/paddle/fluid/inference/tensorrt/convert/test_op_converter.cc
index 9ae7de9cbfa656fbcbb48557bd4b548115897c6d..1d3f5eabb2f839b2acfa9da6527589df1ec3767f 100644
--- a/paddle/fluid/inference/tensorrt/convert/test_op_converter.cc
+++ b/paddle/fluid/inference/tensorrt/convert/test_op_converter.cc
@@ -12,9 +12,10 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
+#include "paddle/fluid/inference/tensorrt/convert/op_converter.h"
+
#include <gtest/gtest.h>
#include "paddle/fluid/framework/program_desc.h"
-#include "paddle/fluid/inference/tensorrt/convert/op_converter.h"
namespace paddle {
namespace inference {
@@ -27,7 +28,9 @@ TEST(OpConverter, ConvertBlock) {
conv2d_op->SetType("conv2d");
OpConverter converter;
- converter.ConvertBlock(*block->Proto(), nullptr /*TensorRTEngine*/);
+ framework::Scope scope;
+ converter.ConvertBlock(*block->Proto(), {}, scope,
+ nullptr /*TensorRTEngine*/);
}
} // namespace tensorrt
diff --git a/paddle/fluid/inference/tensorrt/convert/ut_helper.h b/paddle/fluid/inference/tensorrt/convert/ut_helper.h
index 684bbc208fc1cb02d2a36b4de720309ea6bed173..d7e05dd5b5b235b7b166b22c5b094dc364e28dfc 100644
--- a/paddle/fluid/inference/tensorrt/convert/ut_helper.h
+++ b/paddle/fluid/inference/tensorrt/convert/ut_helper.h
@@ -61,7 +61,10 @@ class TRTConvertValidation {
public:
TRTConvertValidation() = delete;
- explicit TRTConvertValidation(int batch_size, int workspace_size = 1024) {
+ TRTConvertValidation(int batch_size,
+ const std::unordered_set<std::string>& parameters,
+ framework::Scope& scope, int workspace_size = 1 << 10)
+ : parameters_(parameters), scope_(scope) {
// create engine.
engine_.reset(new TensorRTEngine(10, 1 << 10, &stream_));
engine_->InitNetwork();
@@ -76,19 +79,22 @@ class TRTConvertValidation {
engine_->DeclareInput(name, nvinfer1::DataType::kFLOAT, dims);
}
+ // Declare a parameter variable in the scope.
+ void DeclParamVar(const std::string& name, const nvinfer1::Dims& dims) {
+ DeclVar(name, dims);
+ }
+
void DeclOutputVar(const std::string& name, const nvinfer1::Dims& dims) {
DeclVar(name, dims);
}
+ // Declare a variable in a fluid Scope.
void DeclVar(const std::string& name, const nvinfer1::Dims& dims) {
platform::CPUPlace place;
platform::CPUDeviceContext ctx(place);
// Init Fluid tensor.
- std::vector<int64_t> dim_vec(dims.nbDims);
- for (int i = 0; i < dims.nbDims; i++) {
- dim_vec[i] = dims.d[i];
- }
+ std::vector<int64_t> dim_vec(dims.d, dims.d + dims.nbDims);
auto* x = scope_.Var(name);
auto* x_tensor = x->GetMutable<framework::LoDTensor>();
x_tensor->Resize(framework::make_ddim(dim_vec));
@@ -99,7 +105,7 @@ class TRTConvertValidation {
op_ = framework::OpRegistry::CreateOp(desc);
OpConverter op_converter;
- op_converter.ConvertOp(desc, engine_.get());
+ op_converter.ConvertOp(desc, parameters_, scope_, engine_.get());
engine_->FreezeNetwork();
@@ -108,11 +114,13 @@ class TRTConvertValidation {
// Set Inputs.
for (const auto& input : op_desc_->InputArgumentNames()) {
+ if (parameters_.count(input)) continue;
auto* var = scope_.FindVar(input);
PADDLE_ENFORCE(var);
auto tensor = var->GetMutable<framework::LoDTensor>();
+
engine_->SetInputFromCPU(
- input, static_cast(tensor->data()),
+ input, static_cast(tensor->data()),
sizeof(float) *
analysis::AccuDims(tensor->dims(), tensor->dims().size()));
}
@@ -120,18 +128,21 @@ class TRTConvertValidation {
void Execute(int batch_size) {
// Execute Fluid Op
- // Execute TRT
platform::CPUPlace place;
platform::CPUDeviceContext ctx(place);
- engine_->Execute(batch_size);
-
op_->Run(scope_, place);
+ // Execute TRT.
+ engine_->Execute(batch_size);
+ cudaStreamSynchronize(*engine_->stream());
ASSERT_FALSE(op_desc_->OutputArgumentNames().empty());
+ const size_t output_space_size = 200;
for (const auto& output : op_desc_->OutputArgumentNames()) {
std::vector<float> fluid_out;
- std::vector<float> trt_out(200);
- engine_->GetOutputInCPU(output, &trt_out[0], 200 * sizeof(float));
+ std::vector<float> trt_out(output_space_size);
+ engine_->GetOutputInCPU(output, &trt_out[0],
+ output_space_size * sizeof(float));
+ cudaStreamSynchronize(*engine_->stream());
auto* var = scope_.FindVar(output);
auto tensor = var->GetMutable<framework::LoDTensor>();
@@ -139,7 +150,7 @@ class TRTConvertValidation {
// Compare two output
ASSERT_FALSE(fluid_out.empty());
for (size_t i = 0; i < fluid_out.size(); i++) {
- EXPECT_LT(std::abs(fluid_out[i] - trt_out[i]), 0.001);
+ EXPECT_LT(std::abs(fluid_out[i] - trt_out[i]), 1e-6);
}
}
}
@@ -149,9 +160,10 @@ class TRTConvertValidation {
private:
std::unique_ptr<TensorRTEngine> engine_;
cudaStream_t stream_;
- framework::Scope scope_;
std::unique_ptr<framework::OperatorBase> op_;
std::unique_ptr<framework::OpDesc> op_desc_;
+ const std::unordered_set<std::string>& parameters_;
+ framework::Scope& scope_;
};
} // namespace tensorrt
diff --git a/paddle/fluid/inference/tensorrt/engine.cc b/paddle/fluid/inference/tensorrt/engine.cc
index a88236ae98e1816fc43796ead596c432b798d7de..3d75fefc1a735168131a6c67ac073e80aba32945 100644
--- a/paddle/fluid/inference/tensorrt/engine.cc
+++ b/paddle/fluid/inference/tensorrt/engine.cc
@@ -106,6 +106,7 @@ void TensorRTEngine::DeclareOutput(const nvinfer1::ILayer* layer, int offset,
name);
auto* output = layer->getOutput(offset);
+ SetITensor(name, output);
PADDLE_ENFORCE(output != nullptr);
output->setName(name.c_str());
infer_network_->markOutput(*output);
diff --git a/paddle/fluid/inference/tensorrt/engine.h b/paddle/fluid/inference/tensorrt/engine.h
index d9d3163b66d4c4c302d12edcc42f00e1cdfa5a30..fabcfd9e80cc0ef2637201a1499ebbe2d6adfd8c 100644
--- a/paddle/fluid/inference/tensorrt/engine.h
+++ b/paddle/fluid/inference/tensorrt/engine.h
@@ -37,13 +37,15 @@ class TensorRTEngine : public EngineBase {
// Weight is model parameter.
class Weight {
public:
- Weight(nvinfer1::DataType dtype, void* value, int num_elem) {
+ Weight(nvinfer1::DataType dtype, void* value, size_t num_elem) {
w_.type = dtype;
w_.values = value;
w_.count = num_elem;
}
const nvinfer1::Weights& get() { return w_; }
+ std::vector dims;
+
private:
nvinfer1::Weights w_;
};
diff --git a/paddle/fluid/inference/tests/book/CMakeLists.txt b/paddle/fluid/inference/tests/book/CMakeLists.txt
index dbb81462b8273bd701e9c9f530eaf69817abd6a1..2fa5a9540ba1311c7f87e6675a53044b23dd8276 100644
--- a/paddle/fluid/inference/tests/book/CMakeLists.txt
+++ b/paddle/fluid/inference/tests/book/CMakeLists.txt
@@ -38,3 +38,11 @@ inference_test(recommender_system)
#inference_test(rnn_encoder_decoder)
#inference_test(understand_sentiment ARGS conv)
inference_test(word2vec)
+
+# This is an ugly workaround to make this test run
+# TODO(TJ): clean me up
+cc_test(test_inference_nlp
+ SRCS test_inference_nlp.cc
+ DEPS paddle_fluid
+ ARGS
+ --model_path=${PADDLE_BINARY_DIR}/python/paddle/fluid/tests/book/recognize_digits_mlp.inference.model)
diff --git a/paddle/fluid/inference/tests/book/test_inference_nlp.cc b/paddle/fluid/inference/tests/book/test_inference_nlp.cc
new file mode 100644
index 0000000000000000000000000000000000000000..70aa42ac4111c0524a55e26aaefa864338c1d6c1
--- /dev/null
+++ b/paddle/fluid/inference/tests/book/test_inference_nlp.cc
@@ -0,0 +1,236 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include <sys/time.h>
+#include <fstream>
+#include <sstream>
+#include <thread> // NOLINT
+#include "gflags/gflags.h"
+#include "gtest/gtest.h"
+#include "paddle/fluid/inference/tests/test_helper.h"
+#ifdef PADDLE_WITH_MKLML
+#include <mkl_service.h>
+#include <omp.h>
+#endif
+
+DEFINE_string(model_path, "", "Directory of the inference model.");
+DEFINE_string(data_file, "", "File of input index data.");
+DEFINE_int32(repeat, 100, "Running the inference program repeat times");
+DEFINE_bool(use_mkldnn, false, "Use MKLDNN to run inference");
+DEFINE_bool(prepare_vars, true, "Prepare variables before executor");
+DEFINE_int32(num_threads, 1, "Number of threads should be used");
+
+inline double GetCurrentMs() {
+ struct timeval time;
+ gettimeofday(&time, NULL);
+ return 1e+3 * time.tv_sec + 1e-3 * time.tv_usec;
+}
+
+// This function just gives dummy data for the recognize_digits model.
+size_t DummyData(std::vector<paddle::framework::LoDTensor>* out) {
+ paddle::framework::LoDTensor input;
+ SetupTensor<float>(&input, {1, 1, 28, 28}, -1.f, 1.f);
+ out->emplace_back(input);
+ return 1;
+}
+
+// Load the input word index data from a file and store it in LoDTensors.
+// Return the total number of words.
+size_t LoadData(std::vector<paddle::framework::LoDTensor>* out,
+ const std::string& filename) {
+ if (filename.empty()) {
+ return DummyData(out);
+ }
+
+ size_t sz = 0;
+ std::fstream fin(filename);
+ std::string line;
+ out->clear();
+ while (getline(fin, line)) {
+ std::istringstream iss(line);
+ std::vector<int64_t> ids;
+ std::string field;
+ while (getline(iss, field, ' ')) {
+ ids.push_back(stoi(field));
+ }
+ if (ids.size() >= 1024) {
+ // Synced with NLP guys, they will ignore inputs larger than 1024
+ continue;
+ }
+
+ paddle::framework::LoDTensor words;
+ paddle::framework::LoD lod{{0, ids.size()}};
+ words.set_lod(lod);
+ int64_t* pdata = words.mutable_data<int64_t>(
+ {static_cast(ids.size()), 1}, paddle::platform::CPUPlace());
+ memcpy(pdata, ids.data(), words.numel() * sizeof(int64_t));
+ out->emplace_back(words);
+ sz += ids.size();
+ }
+ return sz;
+}
+
+// Split input data samples into small pieces jobs as balanced as possible,
+// according to the number of threads.
+void SplitData(
+ const std::vector<paddle::framework::LoDTensor>& datasets,
+ std::vector<std::vector<const paddle::framework::LoDTensor*>>* jobs,
+ const int num_threads) {
+ size_t s = 0;
+ jobs->resize(num_threads);
+ while (s < datasets.size()) {
+ for (auto it = jobs->begin(); it != jobs->end(); it++) {
+ it->emplace_back(&datasets[s]);
+ s++;
+ if (s >= datasets.size()) {
+ break;
+ }
+ }
+ }
+}
+
+void ThreadRunInfer(
+ const int tid, paddle::framework::Executor* executor,
+ paddle::framework::Scope* scope,
+ const std::unique_ptr<paddle::framework::ProgramDesc>& inference_program,
+ const std::vector<std::vector<const paddle::framework::LoDTensor*>>& jobs) {
+ auto copy_program = std::unique_ptr<paddle::framework::ProgramDesc>(
+ new paddle::framework::ProgramDesc(*inference_program));
+ auto& sub_scope = scope->NewScope();
+
+ std::string feed_holder_name = "feed_" + paddle::string::to_string(tid);
+ std::string fetch_holder_name = "fetch_" + paddle::string::to_string(tid);
+ copy_program->SetFeedHolderName(feed_holder_name);
+ copy_program->SetFetchHolderName(fetch_holder_name);
+
+ const std::vector<std::string>& feed_target_names =
+ copy_program->GetFeedTargetNames();
+ const std::vector<std::string>& fetch_target_names =
+ copy_program->GetFetchTargetNames();
+
+ PADDLE_ENFORCE_EQ(fetch_target_names.size(), 1UL);
+ std::map<std::string, paddle::framework::LoDTensor*> fetch_targets;
+ paddle::framework::LoDTensor outtensor;
+ fetch_targets[fetch_target_names[0]] = &outtensor;
+
+ std::map<std::string, const paddle::framework::LoDTensor*> feed_targets;
+ PADDLE_ENFORCE_EQ(feed_target_names.size(), 1UL);
+
+ auto& inputs = jobs[tid];
+ auto start_ms = GetCurrentMs();
+ for (size_t i = 0; i < inputs.size(); ++i) {
+ feed_targets[feed_target_names[0]] = inputs[i];
+ executor->Run(*copy_program, &sub_scope, &feed_targets, &fetch_targets,
+ true /*create_local_scope*/, true /*create_vars*/,
+ feed_holder_name, fetch_holder_name);
+ }
+ auto stop_ms = GetCurrentMs();
+ scope->DeleteScope(&sub_scope);
+ LOG(INFO) << "Tid: " << tid << ", process " << inputs.size()
+ << " samples, avg time per sample: "
+ << (stop_ms - start_ms) / inputs.size() << " ms";
+}
+
+TEST(inference, nlp) {
+ if (FLAGS_model_path.empty()) {
+ LOG(FATAL) << "Usage: ./example --model_path=path/to/your/model";
+ }
+ if (FLAGS_data_file.empty()) {
+ LOG(WARNING) << "No data file provided, will use dummy data!"
+ << "Note: if you use nlp model, please provide data file.";
+ }
+ LOG(INFO) << "Model Path: " << FLAGS_model_path;
+ LOG(INFO) << "Data File: " << FLAGS_data_file;
+
+ std::vector<paddle::framework::LoDTensor> datasets;
+ size_t num_total_words = LoadData(&datasets, FLAGS_data_file);
+ LOG(INFO) << "Number of samples (seq_len<1024): " << datasets.size();
+ LOG(INFO) << "Total number of words: " << num_total_words;
+
+ const bool model_combined = false;
+ // 0. Call `paddle::framework::InitDevices()` initialize all the devices
+ // 1. Define place, executor, scope
+ auto place = paddle::platform::CPUPlace();
+ auto executor = paddle::framework::Executor(place);
+ std::unique_ptr<paddle::framework::Scope> scope(
+ new paddle::framework::Scope());
+
+ // 2. Initialize the inference_program and load parameters
+ std::unique_ptr<paddle::framework::ProgramDesc> inference_program;
+ inference_program =
+ InitProgram(&executor, scope.get(), FLAGS_model_path, model_combined);
+ if (FLAGS_use_mkldnn) {
+ EnableMKLDNN(inference_program);
+ }
+
+#ifdef PADDLE_WITH_MKLML
+ // use only 1 OpenMP/MKL thread per std::thread
+ omp_set_dynamic(0);
+ omp_set_num_threads(1);
+ mkl_set_num_threads(1);
+#endif
+
+ double start_ms = 0, stop_ms = 0;
+ if (FLAGS_num_threads > 1) {
+ std::vector<std::vector<const paddle::framework::LoDTensor*>> jobs;
+ SplitData(datasets, &jobs, FLAGS_num_threads);
+ std::vector<std::unique_ptr<std::thread>> threads;
+ start_ms = GetCurrentMs();
+ for (int i = 0; i < FLAGS_num_threads; ++i) {
+ threads.emplace_back(
+ new std::thread(ThreadRunInfer, i, &executor, scope.get(),
+ std::ref(inference_program), std::ref(jobs)));
+ }
+ for (int i = 0; i < FLAGS_num_threads; ++i) {
+ threads[i]->join();
+ }
+ stop_ms = GetCurrentMs();
+ } else {
+ if (FLAGS_prepare_vars) {
+ executor.CreateVariables(*inference_program, scope.get(), 0);
+ }
+ // always prepare context
+ std::unique_ptr<paddle::framework::ExecutorPrepareContext> ctx;
+ ctx = executor.Prepare(*inference_program, 0);
+
+ // prepare fetch
+ const std::vector<std::string>& fetch_target_names =
+ inference_program->GetFetchTargetNames();
+ PADDLE_ENFORCE_EQ(fetch_target_names.size(), 1UL);
+ std::map<std::string, paddle::framework::LoDTensor*> fetch_targets;
+ paddle::framework::LoDTensor outtensor;
+ fetch_targets[fetch_target_names[0]] = &outtensor;
+
+ // prepare feed
+ const std::vector<std::string>& feed_target_names =
+ inference_program->GetFeedTargetNames();
+ PADDLE_ENFORCE_EQ(feed_target_names.size(), 1UL);
+ std::map<std::string, const paddle::framework::LoDTensor*> feed_targets;
+
+ // feed data and run
+ start_ms = GetCurrentMs();
+ for (size_t i = 0; i < datasets.size(); ++i) {
+ feed_targets[feed_target_names[0]] = &(datasets[i]);
+ executor.RunPreparedContext(ctx.get(), scope.get(), &feed_targets,
+ &fetch_targets, !FLAGS_prepare_vars);
+ }
+ stop_ms = GetCurrentMs();
+ LOG(INFO) << "Tid: 0, process " << datasets.size()
+ << " samples, avg time per sample: "
+ << (stop_ms - start_ms) / datasets.size() << " ms";
+ }
+ LOG(INFO) << "Total inference time with " << FLAGS_num_threads
+ << " threads : " << (stop_ms - start_ms) / 1000.0
+ << " sec, QPS: " << datasets.size() / ((stop_ms - start_ms) / 1000);
+}
diff --git a/paddle/fluid/operators/activation_mkldnn_op.cc b/paddle/fluid/operators/activation_mkldnn_op.cc
index b892ac77d9ed60210ddadaecb1a4f214e5a25180..46ed99bcf2234f7621d9f00eb48c846d8a355795 100644
--- a/paddle/fluid/operators/activation_mkldnn_op.cc
+++ b/paddle/fluid/operators/activation_mkldnn_op.cc
@@ -222,35 +222,35 @@ struct MKLDNNActivationGradFunc : public BaseActivationFunctor<T> {
};
template <typename T>
-using ReluMkldnnFunctor =
+using ReluMKLDNNFunctor =
MKLDNNActivationFunc<T, mkldnn::algorithm::eltwise_relu>;
template <typename T>
-using TanhMkldnnFunctor =
+using TanhMKLDNNFunctor =
MKLDNNActivationFunc<T, mkldnn::algorithm::eltwise_tanh>;
template <typename T>
-using SqrtMkldnnFunctor =
+using SqrtMKLDNNFunctor =
MKLDNNActivationFunc<T, mkldnn::algorithm::eltwise_sqrt>;
template <typename T>
-using AbsMkldnnFunctor =
+using AbsMKLDNNFunctor =
MKLDNNActivationFunc<T, mkldnn::algorithm::eltwise_abs>;
template <typename T>
-using ReluMkldnnGradFunctor =
+using ReluMKLDNNGradFunctor =
MKLDNNActivationGradFunc<T, mkldnn::algorithm::eltwise_relu>;
template <typename T>
-using TanhMkldnnGradFunctor =
+using TanhMKLDNNGradFunctor =
MKLDNNActivationGradFunc<T, mkldnn::algorithm::eltwise_tanh>;
template <typename T>
-using SqrtMkldnnGradFunctor =
+using SqrtMKLDNNGradFunctor =
MKLDNNActivationGradFunc<T, mkldnn::algorithm::eltwise_sqrt>;
template <typename T>
-using AbsMkldnnGradFunctor =
+using AbsMKLDNNGradFunctor =
MKLDNNActivationGradFunc<T, mkldnn::algorithm::eltwise_abs>;
} // namespace operators
} // namespace paddle
@@ -265,9 +265,9 @@ namespace ops = paddle::operators;
ops::MKLDNNActivationGradKernel<ops::grad_functor<float>>);
#define FOR_EACH_MKLDNN_KERNEL_FUNCTOR(__macro) \
- __macro(relu, ReluMkldnnFunctor, ReluMkldnnGradFunctor); \
- __macro(tanh, TanhMkldnnFunctor, TanhMkldnnGradFunctor); \
- __macro(sqrt, SqrtMkldnnFunctor, SqrtMkldnnGradFunctor); \
- __macro(abs, AbsMkldnnFunctor, AbsMkldnnGradFunctor);
+ __macro(relu, ReluMKLDNNFunctor, ReluMKLDNNGradFunctor); \
+ __macro(tanh, TanhMKLDNNFunctor, TanhMKLDNNGradFunctor); \
+ __macro(sqrt, SqrtMKLDNNFunctor, SqrtMKLDNNGradFunctor); \
+ __macro(abs, AbsMKLDNNFunctor, AbsMKLDNNGradFunctor);
FOR_EACH_MKLDNN_KERNEL_FUNCTOR(REGISTER_ACTIVATION_MKLDNN_KERNEL);
diff --git a/paddle/fluid/operators/conv_cudnn_op.cu.cc b/paddle/fluid/operators/conv_cudnn_op.cu.cc
index 7a7b8b76e43b1f91a3ba2767c217993cc39f26b6..1828be57b5a54005a0066b18ebebdb740726f67a 100644
--- a/paddle/fluid/operators/conv_cudnn_op.cu.cc
+++ b/paddle/fluid/operators/conv_cudnn_op.cu.cc
@@ -20,7 +20,7 @@ limitations under the License. */
#include "paddle/fluid/platform/cudnn_helper.h"
#include "paddle/fluid/platform/float16.h"
-DEFINE_bool(cudnn_algo_use_autotune, true,
+DEFINE_bool(cudnn_deterministic, true,
"Whether allow using an autotuning algorithm for convolution "
"operator. The autotuning algorithm may be non-deterministic. If "
"false, the algorithm is deterministic.");
@@ -272,7 +272,7 @@ class CUDNNConvGradOpKernel : public framework::OpKernel<T> {
auto& dev_ctx = ctx.template device_context<platform::CUDADeviceContext>();
auto handle = dev_ctx.cudnn_handle();
if (input_grad) {
- if (FLAGS_cudnn_algo_use_autotune) {
+ if (FLAGS_cudnn_deterministic) {
PADDLE_ENFORCE(
platform::dynload::cudnnGetConvolutionBackwardDataAlgorithm(
handle, cudnn_filter_desc,
@@ -297,7 +297,7 @@ class CUDNNConvGradOpKernel : public framework::OpKernel {
}
if (filter_grad) {
- if (FLAGS_cudnn_algo_use_autotune) {
+ if (FLAGS_cudnn_deterministic) {
PADDLE_ENFORCE(
platform::dynload::cudnnGetConvolutionBackwardFilterAlgorithm(
handle, cudnn_input_desc, cudnn_output_grad_desc,
diff --git a/paddle/fluid/operators/detail/grpc_client.cc b/paddle/fluid/operators/detail/grpc_client.cc
index da9ca1a0c1d55018141f0e4285fe35d7c437fd55..f4d83e86ecb01eed863a387d827023a5d808dad0 100644
--- a/paddle/fluid/operators/detail/grpc_client.cc
+++ b/paddle/fluid/operators/detail/grpc_client.cc
@@ -38,6 +38,25 @@ void RPCClient::Init() {
if (rpc_client_.get() == nullptr) {
rpc_client_.reset(new RPCClient());
}
+ rpc_client_->InitEventLoop();
+}
+
+void RPCClient::InitEventLoop() {
+ // start the thread that processes completed RPC events from the completion queue
+ // TODO(wuyi): this could run in a thread pool
+ client_thread_.reset(new std::thread(std::bind(&RPCClient::Proceed, this)));
+}
+
+RPCClient::~RPCClient() {
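+ // Wait for outstanding requests, shut down the completion queue, release all
+ // channels, then join the polling thread.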
+ Wait();
+ cq_.Shutdown();
+ {
+ std::lock_guard<std::mutex> guard(chan_mutex_);
+ for (auto& it : channels_) {
+ it.second.reset();
+ }
+ }
+ client_thread_->join();
}
bool RPCClient::AsyncSendVariable(const std::string& ep,
@@ -204,70 +223,37 @@ void RPCClient::AsyncSendFetchBarrier(const std::string& ep, int64_t time_out) {
req_count_++;
}
-bool RPCClient::Wait() {
- VLOG(3) << "RPCClient begin Wait()"
- << " req_count_:" << req_count_;
- if (req_count_ <= 0) {
- return true;
- }
- const size_t kReqCnt = req_count_;
- bool a[kReqCnt];
- std::vector<std::future<void>> waits(req_count_);
- std::mutex mu;
-
- for (int i = 0; i < req_count_; i++) {
- waits[i] = framework::AsyncIO([i, &a, &mu, this] {
- bool ret = Proceed();
- std::lock_guard<std::mutex> l(mu);
- a[i] = ret;
- });
- }
-
- for (int i = 0; i < req_count_; i++) {
- waits[i].wait();
- }
-
- int last_req_count = req_count_;
- req_count_ = 0;
-
- for (int i = 0; i < last_req_count; i++) {
- if (!a[i]) {
- return false;
- }
- }
-
- return true;
+void RPCClient::Wait() {
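+ // Block until Proceed() has drained every outstanding request (req_count_ == 0).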
+ std::unique_lock<std::mutex> lk(sync_mutex_);
+ sync_cond_.wait(lk, [this] { return req_count_ == 0; });
}
-bool RPCClient::Proceed() {
- void* tag = NULL;
+void RPCClient::Proceed() {
+ void* tag = nullptr;
bool ok = false;
- // request counts.
- if (!cq_.Next(&tag, &ok)) {
- LOG(ERROR) << "Get meets CompletionQueue error";
- return false;
- }
-
- GPR_ASSERT(ok);
- PADDLE_ENFORCE(tag);
-
- // TODO(gongwb): add more retries.
- BaseProcessor* c = static_cast<BaseProcessor*>(tag);
- if (!c->status_.ok()) {
- LOG(ERROR) << "proc param error:" << c->var_h_.String()
- << " grpc error:" << c->status_.error_message();
+ while (cq_.Next(&tag, &ok)) {
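+ // Each completion-queue tag is the BaseProcessor of a finished RPC: handle it,
+ // then decrement req_count_ and wake any waiters.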
+ BaseProcessor* c = static_cast<BaseProcessor*>(tag);
+ GPR_ASSERT(ok);
+ PADDLE_ENFORCE(c);
+ if (c->status_.ok()) {
+ c->Process();
+ } else {
+ LOG(ERROR) << "var: " << c->var_h_.String()
+ << " grpc error:" << c->status_.error_message();
+ }
delete c;
- return false;
+ {
+ std::lock_guard<std::mutex> lk(sync_mutex_);
+ req_count_--;
+ }
+ sync_cond_.notify_all();
}
-
- c->Process();
- delete c;
- return true;
}
+
std::shared_ptr<grpc::Channel> RPCClient::GetChannel(const std::string& ep) {
// TODO(Yancey1989): make grpc client completely thread-safe
- std::unique_lock<std::mutex> lock(mutex_);
+ std::lock_guard<std::mutex> guard(chan_mutex_);
auto it = channels_.find(ep);
if (it != channels_.end()) {
return it->second;
diff --git a/paddle/fluid/operators/detail/grpc_client.h b/paddle/fluid/operators/detail/grpc_client.h
index 449d5105afb8c02294a0ef57610e7de1b1631b35..bb3813efcf4f77a8ec3d2f4b39969faa6216e38f 100644
--- a/paddle/fluid/operators/detail/grpc_client.h
+++ b/paddle/fluid/operators/detail/grpc_client.h
@@ -16,15 +16,18 @@ limitations under the License. */
#include <time.h>
-#include <chrono>  // NOLINT
+#include <chrono>              // NOLINT
+#include <condition_variable>  // NOLINT
#include <ctime>
#include <functional>
#include <iostream>
#include <map>