diff --git a/AUTHORS.md b/AUTHORS.md
index deafa641203ed9d9bd794fe92e4a91e3aaa03f63..da91933f4697f3b7c08feab20d703e22397b3757 100644
--- a/AUTHORS.md
+++ b/AUTHORS.md
@@ -44,6 +44,7 @@
| qingqing01 | Qing-Qing Dang |
| reyoung | Yang Yu |
| Sand3r- | Michal Gallus |
+| sfraczek | Sylwester Fraczek |
| Superjom | Chun-Wei Yan |
| tensor-tang | Jian Tang |
| tianbingsz | Tian-Bing Xu |
@@ -54,6 +55,7 @@
| wangyang59 | Yang Wang |
| wangzhen-nlp | Zhen Wang |
| wen-bo-yang | Wen-Bo Yang |
+| wojtuss | Wojciech Uss |
| wwhu | Wei-Wei Hu |
| xinghai-sun | Xing-Hai Sun |
| Xreki | Yi-Qun Liu |
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 61f5e63098c40f140774ba6bfd9a2de8d2d67bfb..8e7ffe72b5fb846fb55ab8dc4809d87a40cfe06c 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -54,23 +54,12 @@ option(WITH_NGRAPH "Compile PaddlePaddle with nGraph support." OFF)
option(WITH_DSO "Compile PaddlePaddle with dynamic linked CUDA" ON)
option(WITH_TESTING "Compile PaddlePaddle with unit testing" OFF)
option(WITH_PYTHON "Compile PaddlePaddle with python interpreter" ON)
-option(WITH_DOUBLE "Compile PaddlePaddle with double precision" OFF)
-option(WITH_RDMA "Compile PaddlePaddle with RDMA support" OFF)
-option(WITH_TIMER "Compile PaddlePaddle with stats timer" OFF)
option(WITH_PROFILER "Compile PaddlePaddle with GPU profiler and gperftools" OFF)
option(WITH_JEMALLOC "Compile PaddlePaddle with jemalloc" OFF)
-option(WITH_DOC "Compile PaddlePaddle with documentation" OFF)
option(WITH_COVERAGE "Compile PaddlePaddle with code coverage" OFF)
option(COVERALLS_UPLOAD "Package code coverage data to coveralls" OFF)
-option(ON_TRAVIS "Exclude special unit test on Travis CI" OFF)
-option(WITH_FLUID_ONLY "Compile PaddlePaddle fluid only" OFF)
-option(WITH_GOLANG "Compile PaddlePaddle with GOLANG" OFF)
-option(GLIDE_INSTALL "Download and install go dependencies " ON)
option(WITH_DISTRIBUTE "Compile with distributed support" OFF)
option(WITH_PSLIB "Compile with pslib support" OFF)
-option(USE_EIGEN_FOR_BLAS "Use matrix multiplication in Eigen" OFF)
-option(EIGEN_USE_THREADS "Compile with multi-threaded Eigen" OFF)
-option(WITH_ARM_FP16 "Use half precision support on armv8.2-a cpu" OFF)
option(WITH_CONTRIB "Compile the third-party contributation" OFF)
option(REPLACE_ENFORCE_GLOG "Replace PADDLE_ENFORCE with glog/CHECK for better debug." OFF)
option(WITH_ANAKIN "Compile with Anakin library" OFF)
@@ -105,8 +94,6 @@ endif()
if (WIN32)
set(WITH_DISTRIBUTE OFF CACHE STRING
"Disable DISTRIBUTE when compiling for Windows" FORCE)
- set(WITH_FLUID_ONLY ON CACHE STRING
- "Enable FLUID_ONLY when compiling for Windows" FORCE)
endif()
set(THIRD_PARTY_PATH "${CMAKE_BINARY_DIR}/third_party" CACHE STRING
@@ -148,7 +135,6 @@ include(external/openblas) # download, build, install openblas
include(external/mkldnn) # download, build, install mkldnn
include(external/ngraph) # download, build, install nGraph
include(external/boost) # download boost
-include(external/any) # download libn::any
include(external/eigen) # download eigen3
include(external/pybind11) # download pybind11
include(external/cares)
@@ -225,7 +211,6 @@ include(generic) # simplify cmake module
include(package) # set paddle packages
include(ccache) # set ccache for compilation
include(util) # set unittest and link libs
-include(rdma) # set rdma libraries
include(version) # set PADDLE_VERSION
include(coveralls) # set code coverage
include(inference_lib) # add paddle fluid inference libraries
@@ -233,38 +218,11 @@ include(inference_lib) # add paddle fluid inference libraries
include_directories("${PADDLE_SOURCE_DIR}")
-set(EXTERNAL_LIBS
- gflags
- glog
- ${CBLAS_LIBRARIES}
- protobuf
- zlib
- ${PYTHON_LIBRARIES}
-)
-
-if(WITH_PSLIB)
- list(APPEND EXTERNAL_LIBS pslib)
- list(APPEND EXTERNAL_LIBS pslib_brpc)
- list(APPEND EXTERNAL_LIBS libmct)
-endif(WITH_PSLIB)
-
if(WITH_AMD_GPU)
find_package(HIP)
include(hip)
endif(WITH_AMD_GPU)
-if(WITH_MKLML)
- list(APPEND EXTERNAL_LIBS ${MKLML_IOMP_LIB})
-endif()
-
-if(WITH_LIBXSMM)
- list(APPEND EXTERNAL_LIBS ${LIBXSMM_LIBS})
-endif()
-
-if(WITH_MKLDNN)
- list(APPEND EXTERNAL_LIBS ${MKLDNN_LIB})
-endif()
-
set(PADDLE_PYTHON_BUILD_DIR "${CMAKE_CURRENT_BINARY_DIR}/python/build")
set(CMAKE_CXX_FLAGS_RELWITHDEBINFO "-O3 -g -DNDEBUG")
diff --git a/benchmark/IntelOptimizedPaddle.md b/benchmark/IntelOptimizedPaddle.md
deleted file mode 100644
index 8b7dc5b7db800896eb4de2054ab5e584aed93999..0000000000000000000000000000000000000000
--- a/benchmark/IntelOptimizedPaddle.md
+++ /dev/null
@@ -1,112 +0,0 @@
-# Benchmark
-
-Machine:
-
-- Server: Intel(R) Xeon(R) Gold 6148 CPU @ 2.40GHz, 2 Sockets, 20 Cores per socket
-- Laptop: TBD
-
-System: CentOS release 6.3 (Final), Docker 1.12.1.
-
-PaddlePaddle:
-- paddlepaddle/paddle:0.11.0 (for MKLML and MKL-DNN)
- - MKL-DNN tag v0.11
- - MKLML 2018.0.1.20171007
-- paddlepaddle/paddle:0.11.0-openblas (for OpenBLAS)
- - OpenBLAS v0.2.20
-
-On each machine, we will test and compare the performance of training on single node using MKL-DNN / MKLML / OpenBLAS respectively.
-
-## Benchmark Model
-
-### Server
-
-#### Training
-Test on batch size 64, 128, 256 on Intel(R) Xeon(R) Gold 6148 CPU @ 2.40GHz
-Pay attetion that the speed below includes forward, backward and parameter update time. So we can not directly compare the data with the benchmark of caffe `time` [command](https://github.com/PaddlePaddle/Paddle/blob/develop/benchmark/caffe/image/run.sh#L9), which only contain forward and backward. The updating time of parameter would become very heavy when the weight size are large, especially on alexnet.
-
-Input image size - 3 * 224 * 224, Time: images/second
-
-- VGG-19
-
-| BatchSize | 64 | 128 | 256 |
-|--------------|-------| -----| --------|
-| OpenBLAS | 7.80 | 9.00 | 10.80 |
-| MKLML | 12.12 | 13.70 | 16.18 |
-| MKL-DNN | 28.46 | 29.83 | 30.44 |
-
-
-
- - ResNet-50
-
-| BatchSize | 64 | 128 | 256 |
-|--------------|-------| ------| -------|
-| OpenBLAS | 25.22 | 25.68 | 27.12 |
-| MKLML | 32.52 | 31.89 | 33.12 |
-| MKL-DNN | 81.69 | 82.35 | 84.08 |
-
-
-
- - GoogLeNet
-
-| BatchSize | 64 | 128 | 256 |
-|--------------|-------| ------| -------|
-| OpenBLAS | 89.52 | 96.97 | 108.25 |
-| MKLML | 128.46| 137.89| 158.63 |
-| MKL-DNN | 250.46| 264.83| 269.50 |
-
-
-
-- AlexNet
-
-| BatchSize | 64 | 128 | 256 |
-|--------------|--------| ------ | -------|
-| OpenBLAS | 45.62 | 72.79 | 107.22 |
-| MKLML | 66.37 | 105.60 | 144.04 |
-| MKL-DNN | 399.00 | 498.94 | 626.53 |
-
-
-
-#### Inference
-Test on batch size 1, 2, 4, 8, 16 on Intel(R) Xeon(R) Gold 6148 CPU @ 2.40GHz
-- VGG-19
-
-| BatchSize | 1 | 2 | 4 | 8 | 16 |
-|-----------|-------|-------|-------|-------|-------|
-| OpenBLAS | 1.10 | 1.96 | 3.62 | 3.63 | 2.25 |
-| MKLML | 5.58 | 9.80 | 15.15 | 21.21 | 28.67 |
-| MKL-DNN | 75.07 | 88.64 | 82.58 | 92.29 | 96.75 |
-
-
-
-- ResNet-50
-
-| BatchSize | 1 | 2 | 4 | 8 | 16 |
-|-----------|-------|--------|--------|--------|--------|
-| OpenBLAS | 3.31 | 6.72 | 11.59 | 13.17 | 9.27 |
-| MKLML | 6.33 | 12.02 | 22.88 | 40.53 | 63.09 |
-| MKL-DNN | 107.83| 148.84 | 177.78 | 189.35 | 217.69 |
-
-
-
-- GoogLeNet
-
-| BatchSize | 1 | 2 | 4 | 8 | 16 |
-|-----------|--------|--------|--------|--------|--------|
-| OpenBLAS | 12.06 | 23.56 | 34.48 | 36.45 | 23.12 |
-| MKLML | 22.74 | 41.56 | 81.22 | 133.47 | 210.53 |
-| MKL-DNN | 175.10 | 272.92 | 450.70 | 512.00 | 600.94 |
-
-
-
-- AlexNet
-
-| BatchSize | 1 | 2 | 4 | 8 | 16 |
-|-----------|--------|--------|--------|--------|--------|
-| OpenBLAS | 3.53 | 6.23 | 15.04 | 26.06 | 31.62 |
-| MKLML | 21.32 | 36.55 | 73.06 | 131.15 | 192.77 |
-| MKL-DNN | 442.91 | 656.41 | 719.10 | 847.68 | 850.51 |
-
-
-
-### Laptop
-TBD
diff --git a/benchmark/README.md b/benchmark/README.md
deleted file mode 100644
index 367013f0457f9bbb9ae1335ea63dce181316d444..0000000000000000000000000000000000000000
--- a/benchmark/README.md
+++ /dev/null
@@ -1,168 +0,0 @@
-# Benchmark
-
-Machine:
-
-- CPU: 12-core Intel(R) Xeon(R) CPU E5-2620 v2 @2.10GHz
-- GPU: Tesla K40m
-- cuDNN: v5.1
-- system: Docker 1.12.1, all platforms are tested in docker environment.
-
-Platforms:
-
-- PaddlePaddle: paddledev/paddle:gpu-devel-v0.9.0a0
-- Tensorflow: gcr.io/tensorflow/tensorflow:0.11.0rc0-gpu
-- Caffe: kaixhin/cuda-caffe
-
-Several convolutional neural networks and recurrent neural networks are used to test.
-
-## Image
-
-### Benchmark Model
-
-AlexNet, GoogleNet and a small network used in Caffe.
-
-- [AlexNet](https://github.com/BVLC/caffe/tree/master/models/bvlc_alexnet): but the group size is one.
-
-- [GoogleNet](https://github.com/BVLC/caffe/tree/master/models/bvlc_googlenet): but remove loss1 and loss2 when testing benchmark.
-
-- [SmallNet](https://github.com/BVLC/caffe/blob/master/examples/cifar10/cifar10\_quick\_train\_test.prototxt)
-
-
-### Single-GPU
-
-- AlexNet: input - 3 * 227 * 227, Time: ms/batch
-
-| BatchSize | 64 | 128 | 256 | 512 |
-|--------------|-----| -----| ------| -----|
-| PaddlePaddle | 195 | 334 | 602 | 1629 |
-| TensorFlow | 223 | 364 | 645 | 1235 |
-| Caffe | 324 | 627 | 1232 | 2513 |
-
-**Notation**
-
-All platforms use cuDNN-v5.1. We see that caffe is slower in this experiment, because its workspace limit size of cuDNN-conv interface is 8 * 1024 * 1024, which is smaller in PaddlePaddle and TensorFlow. Note that Caffe will be faster if increasing the workspace limit size.
-
-- GoogletNet: input - 3 * 224 * 224, Time: ms/batch
-
-
-| BatchSize | 64 | 128 | 256 |
-|--------------|-------| -------| --------|
-| PaddlePaddle | 613 | 1149 | 2348 |
-| TensorFlow | 644 | 1176 | 2219 |
-| Caffe | 694 | 1364 | out of memory |
-
-- SmallNet: input - 3 * 32 * 32, Time ms/batch
-
-| BatchSize | 64 | 128 | 256 | 512 |
-|--------------|--------| -------- | --------|---------|
-| PaddlePaddle | 10.463 | 18.184 | 33.113 | 63.039 |
-| TensorFlow | 9 | 15 | 28 | 59 |
-| Caffe | 9.373 | 16.6606 | 31.4797 | 59.719 |
-
-**Notation**
-
-All the single-GPU experiments in caffe use `caffe time` to calculate elapsed time, which does not include parameter updating time. However, both PaddlePaddle and TensorFlow experiments contain the parameter updating time. As compared with the total time, this part is relatively little on single machine, we can ignore it.
-
-In Tensorflow, they implement algorithm searching method instead of using the algorithm searching interface in cuDNN.
-
-### Multi-GPU: 4 GPUs
-
-- AlexNet, ms / batch
-
-| total-BatchSize | 128 * 4 | 256 * 4 |
-|------------------|----------| -----------|
-| PaddlePaddle | 347 | 622 |
-| TensorFlow | 377 | 675 |
-| Caffe | 1229 | 2435 |
-
-For example, if `total-BatchSize = 128 * 4`, the speedup ratio is calculated by
-
-```
- time_at_1gpu_batch_128 * 4 / time_at_4gpu_total_batch_512
-= (334 * 4)/347
-= 3.85
-```
-
-
-
-
-- GoogleNet, ms / batch
-
-| total-BatchSize | 128 * 4 | 256 * 4 |
-|-------------------|--------------| ----------- |
-| PaddlePaddle | 1178 | 2367 |
-| TensorFlow | 1210 | 2292 |
-| Caffe | 2007 | out of memory |
-
-
-
-
-## RNN
-We use lstm network for text classfication to test benchmark.
-
-### Dataset
-- [IMDB](http://www.iro.umontreal.ca/~lisa/deep/data/imdb.pkl)
-- Sequence length is 100. In fact, PaddlePaddle supports training with variable-length sequence, but TensorFlow needs to pad. Thus, we also pad sequence length to 100 in PaddlePaddle in order to compare.
-- Dictionary size=30000
-- Peephole connection is used in `lstmemory` by default in PaddlePaddle. It is also configured in TensorFlow.
-
-### Single-GPU
-
-#### LSTM in Text Classification
-
-Testing `2 lstm layer + fc` network with different hidden size and batch size.
-
-- Batch size = 64, ms / batch
-
-| hidden_size | 256 | 512 | 1280 |
-|--------------|-------| -------| --------|
-| PaddlePaddle | 83 | 184 | 641 |
-| TensorFlow | 175 | 280 | 818 |
-
-- Batch size = 128, ms / batch
-
-| hidden_size | 256 | 512 | 1280 |
-|--------------|------- | -------| --------|
-| PaddlePaddle | 110 | 261 | 1007 |
-| TensorFlow | 181 | 361 | 1237 |
-
-
-- Batch size = 256, ms / batch
-
-| hidden_size | 256 | 512 | 1280 |
-|--------------|-------| -------| --------|
-| PaddlePaddle | 170 | 414 | 1655 |
-| TensorFlow | 238 | 536 | 1905 |
-
-
-
-#### Seq2Seq
-
-The benchmark of sequence-to-sequence network will be added later.
-
-
-### Multi GPU: 4 GPUs
-
-#### LSTM in Text Classification
-
-- hidden_size = 256, ms / batch
-
-| batch_size | 256 | 512 |
-|--------------| -------| --------|
-| PaddlePaddle | 90 | 118 |
-| TensorFlow | 226 | 118 |
-
-
-- hidden_size = 512, ms / batch
-
-| batch_size | 256 | 512 |
-|--------------| -------| --------|
-| PaddlePaddle | 189 | 268 |
-| TensorFlow | 297 | 383 |
-
-
-
-
-#### Seq2Seq
-
-The benchmark of sequence-to-sequence network will be added later.
diff --git a/benchmark/fluid/Dockerfile b/benchmark/fluid/Dockerfile
index 2e1e0d376899fd664866621263db62258e7c3869..81ea870050fe5db4a60fee40221991e38de6bd2e 100644
--- a/benchmark/fluid/Dockerfile
+++ b/benchmark/fluid/Dockerfile
@@ -15,9 +15,6 @@ RUN ln -s /usr/lib/x86_64-linux-gnu/libcudnn.so.7 /usr/lib/libcudnn.so && ln -s
RUN pip install -U pip
RUN pip install -U kubernetes paddlepaddle
-RUN sh -c 'echo "import paddle.v2 as paddle\npaddle.dataset.cifar.train10()\npaddle.dataset.flowers.fetch()" | python'
-RUN sh -c 'echo "import paddle.v2 as paddle\npaddle.dataset.mnist.train()\npaddle.dataset.mnist.test()\npaddle.dataset.imdb.fetch()" | python'
-RUN sh -c 'echo "import paddle.v2 as paddle\npaddle.dataset.imikolov.fetch()" | python'
RUN pip uninstall -y paddlepaddle && mkdir /workspace
ADD https://raw.githubusercontent.com/PaddlePaddle/cloud/develop/docker/paddle_k8s /usr/bin
diff --git a/benchmark/paddle/image/check_env.sh b/benchmark/fluid/check_env.sh
similarity index 100%
rename from benchmark/paddle/image/check_env.sh
rename to benchmark/fluid/check_env.sh
diff --git a/benchmark/paddle/image/alexnet.py b/benchmark/paddle/image/alexnet.py
deleted file mode 100644
index 9efc3f0494e4a817a7357f29e684f621bce1921e..0000000000000000000000000000000000000000
--- a/benchmark/paddle/image/alexnet.py
+++ /dev/null
@@ -1,93 +0,0 @@
-# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from paddle.trainer_config_helpers import *
-
-height = 227
-width = 227
-num_class = 1000
-batch_size = get_config_arg('batch_size', int, 128)
-gp = get_config_arg('layer_num', int, 1)
-is_infer = get_config_arg("is_infer", bool, False)
-num_samples = get_config_arg('num_samples', int, 2560)
-
-args = {
- 'height': height,
- 'width': width,
- 'color': True,
- 'num_class': num_class,
- 'is_infer': is_infer,
- 'num_samples': num_samples
-}
-define_py_data_sources2(
- "train.list" if not is_infer else None,
- "test.list" if is_infer else None,
- module="provider",
- obj="process",
- args=args)
-
-settings(
- batch_size=batch_size,
- learning_rate=0.01 / batch_size,
- learning_method=MomentumOptimizer(0.9),
- regularization=L2Regularization(0.0005 * batch_size))
-
-# conv1
-net = data_layer('data', size=height * width * 3)
-net = img_conv_layer(
- input=net,
- filter_size=11,
- num_channels=3,
- num_filters=96,
- stride=4,
- padding=1)
-net = img_cmrnorm_layer(input=net, size=5, scale=0.0001, power=0.75)
-net = img_pool_layer(input=net, pool_size=3, stride=2)
-
-# conv2
-net = img_conv_layer(
- input=net, filter_size=5, num_filters=256, stride=1, padding=2, groups=gp)
-net = img_cmrnorm_layer(input=net, size=5, scale=0.0001, power=0.75)
-net = img_pool_layer(input=net, pool_size=3, stride=2)
-
-# conv3
-net = img_conv_layer(
- input=net, filter_size=3, num_filters=384, stride=1, padding=1)
-# conv4
-net = img_conv_layer(
- input=net, filter_size=3, num_filters=384, stride=1, padding=1, groups=gp)
-
-# conv5
-net = img_conv_layer(
- input=net, filter_size=3, num_filters=256, stride=1, padding=1, groups=gp)
-net = img_pool_layer(input=net, pool_size=3, stride=2)
-
-net = fc_layer(
- input=net,
- size=4096,
- act=ReluActivation(),
- layer_attr=ExtraAttr(drop_rate=0.5))
-net = fc_layer(
- input=net,
- size=4096,
- act=ReluActivation(),
- layer_attr=ExtraAttr(drop_rate=0.5))
-net = fc_layer(input=net, size=1000, act=SoftmaxActivation())
-
-if is_infer:
- outputs(net)
-else:
- lab = data_layer('label', num_class)
- loss = cross_entropy(input=net, label=lab)
- outputs(loss)
diff --git a/benchmark/paddle/image/googlenet.py b/benchmark/paddle/image/googlenet.py
deleted file mode 100644
index 2a850ccb7f2c75b467554181fc5f4aa8f2b97a09..0000000000000000000000000000000000000000
--- a/benchmark/paddle/image/googlenet.py
+++ /dev/null
@@ -1,245 +0,0 @@
-#!/usr/bin/env python
-from paddle.trainer_config_helpers import *
-
-height = 224
-width = 224
-num_class = 1000
-batch_size = get_config_arg('batch_size', int, 128)
-use_gpu = get_config_arg('use_gpu', bool, True)
-is_infer = get_config_arg("is_infer", bool, False)
-num_samples = get_config_arg('num_samples', int, 2560)
-
-args = {
- 'height': height,
- 'width': width,
- 'color': True,
- 'num_class': num_class,
- 'is_infer': is_infer,
- 'num_samples': num_samples
-}
-define_py_data_sources2(
- "train.list" if not is_infer else None,
- "test.list" if is_infer else None,
- module="provider",
- obj="process",
- args=args)
-
-settings(
- batch_size=batch_size,
- learning_rate=0.01 / batch_size,
- learning_method=MomentumOptimizer(0.9),
- regularization=L2Regularization(0.0005 * batch_size))
-
-conv_projection = conv_projection if use_gpu else img_conv_layer
-
-def inception2(name, input, channels, \
- filter1,
- filter3R, filter3,
- filter5R, filter5,
- proj):
-
- conv1 = name + '_1'
- conv3r = name + '_3r'
- conv3 = name + '_3'
- conv5r = name + '_5r'
- conv5 = name + '_5'
- maxpool = name + '_max'
- convproj = name + '_proj'
-
- cov1 = img_conv_layer(
- name=conv1,
- input=input,
- filter_size=1,
- num_channels=channels,
- num_filters=filter1,
- stride=1,
- padding=0)
-
- cov3r = img_conv_layer(
- name=conv3r,
- input=input,
- filter_size=1,
- num_channels=channels,
- num_filters=filter3R,
- stride=1,
- padding=0)
- cov3 = img_conv_layer(
- name=conv3,
- input=cov3r,
- filter_size=3,
- num_filters=filter3,
- stride=1,
- padding=1)
-
- cov5r = img_conv_layer(
- name=conv5r,
- input=input,
- filter_size=1,
- num_channels=channels,
- num_filters=filter5R,
- stride=1,
- padding=0)
- cov5 = img_conv_layer(
- name=conv5,
- input=cov5r,
- filter_size=5,
- num_filters=filter5,
- stride=1,
- padding=2)
-
- pool1 = img_pool_layer(
- name=maxpool,
- input=input,
- pool_size=3,
- num_channels=channels,
- stride=1,
- padding=1)
- covprj = img_conv_layer(
- name=convproj,
- input=pool1,
- filter_size=1,
- num_filters=proj,
- stride=1,
- padding=0)
-
- cat = concat_layer(name=name, input=[cov1, cov3, cov5, covprj])
- return cat
-
-def inception(name, input, channels, \
- filter1,
- filter3R, filter3,
- filter5R, filter5,
- proj):
-
- cov1 = conv_projection(
- input=input,
- filter_size=1,
- num_channels=channels,
- num_filters=filter1,
- stride=1,
- padding=0)
-
- cov3r = img_conv_layer(
- name=name + '_3r',
- input=input,
- filter_size=1,
- num_channels=channels,
- num_filters=filter3R,
- stride=1,
- padding=0)
- cov3 = conv_projection(
- input=cov3r, filter_size=3, num_filters=filter3, stride=1, padding=1)
-
- cov5r = img_conv_layer(
- name=name + '_5r',
- input=input,
- filter_size=1,
- num_channels=channels,
- num_filters=filter5R,
- stride=1,
- padding=0)
- cov5 = conv_projection(
- input=cov5r, filter_size=5, num_filters=filter5, stride=1, padding=2)
-
- pool1 = img_pool_layer(
- name=name + '_max',
- input=input,
- pool_size=3,
- num_channels=channels,
- stride=1,
- padding=1)
- covprj = conv_projection(
- input=pool1, filter_size=1, num_filters=proj, stride=1, padding=0)
-
- cat = concat_layer(
- name=name,
- input=[cov1, cov3, cov5, covprj],
- bias_attr=True if use_gpu else False,
- act=ReluActivation())
- return cat
-
-
-data = data_layer(name="input", size=3 * height * width)
-
-# stage 1
-conv1 = img_conv_layer(
- name="conv1",
- input=data,
- filter_size=7,
- num_channels=3,
- num_filters=64,
- stride=2,
- padding=3)
-pool1 = img_pool_layer(
- name="pool1", input=conv1, pool_size=3, num_channels=64, stride=2)
-
-# stage 2
-conv2_1 = img_conv_layer(
- name="conv2_1",
- input=pool1,
- filter_size=1,
- num_filters=64,
- stride=1,
- padding=0)
-conv2_2 = img_conv_layer(
- name="conv2_2",
- input=conv2_1,
- filter_size=3,
- num_filters=192,
- stride=1,
- padding=1)
-pool2 = img_pool_layer(
- name="pool2", input=conv2_2, pool_size=3, num_channels=192, stride=2)
-
-# stage 3
-ince3a = inception("ince3a", pool2, 192, 64, 96, 128, 16, 32, 32)
-ince3b = inception("ince3b", ince3a, 256, 128, 128, 192, 32, 96, 64)
-pool3 = img_pool_layer(
- name="pool3", input=ince3b, num_channels=480, pool_size=3, stride=2)
-
-# stage 4
-ince4a = inception("ince4a", pool3, 480, 192, 96, 208, 16, 48, 64)
-ince4b = inception("ince4b", ince4a, 512, 160, 112, 224, 24, 64, 64)
-ince4c = inception("ince4c", ince4b, 512, 128, 128, 256, 24, 64, 64)
-ince4d = inception("ince4d", ince4c, 512, 112, 144, 288, 32, 64, 64)
-ince4e = inception("ince4e", ince4d, 528, 256, 160, 320, 32, 128, 128)
-pool4 = img_pool_layer(
- name="pool4", input=ince4e, num_channels=832, pool_size=3, stride=2)
-
-# stage 5
-ince5a = inception("ince5a", pool4, 832, 256, 160, 320, 32, 128, 128)
-ince5b = inception("ince5b", ince5a, 832, 384, 192, 384, 48, 128, 128)
-pool5 = img_pool_layer(
- name="pool5",
- input=ince5b,
- num_channels=1024,
- pool_size=7,
- stride=7,
- pool_type=AvgPooling())
-
-# We remove loss1 and loss2 for all system when testing benchmark
-# output 1
-# pool_o1 = img_pool_layer(name="pool_o1", input=ince4a, num_channels=512, pool_size=5, stride=3, pool_type=AvgPooling())
-# conv_o1 = img_conv_layer(name="conv_o1", input=pool_o1, filter_size=1, num_filters=128, stride=1, padding=0)
-# fc_o1 = fc_layer(name="fc_o1", input=conv_o1, size=1024, layer_attr=ExtraAttr(drop_rate=0.7), act=ReluActivation())
-# out1 = fc_layer(name="output1", input=fc_o1, size=1000, act=SoftmaxActivation())
-# loss1 = cross_entropy(name='loss1', input=out1, label=lab, coeff=0.3)
-
-# output 2
-#pool_o2 = img_pool_layer(name="pool_o2", input=ince4d, num_channels=528, pool_size=5, stride=3, pool_type=AvgPooling())
-#conv_o2 = img_conv_layer(name="conv_o2", input=pool_o2, filter_size=1, num_filters=128, stride=1, padding=0)
-#fc_o2 = fc_layer(name="fc_o2", input=conv_o2, size=1024, layer_attr=ExtraAttr(drop_rate=0.7), act=ReluActivation())
-#out2 = fc_layer(name="output2", input=fc_o2, size=1000, act=SoftmaxActivation())
-#loss2 = cross_entropy(name='loss2', input=out2, label=lab, coeff=0.3)
-
-# output 3
-dropout = dropout_layer(name="dropout", input=pool5, dropout_rate=0.4)
-out3 = fc_layer(
- name="output3", input=dropout, size=1000, act=SoftmaxActivation())
-
-if is_infer:
- outputs(out3)
-else:
- lab = data_layer(name="label", size=num_class)
- loss3 = cross_entropy(name='loss3', input=out3, label=lab)
- outputs(loss3)
diff --git a/benchmark/paddle/image/plotlog.py b/benchmark/paddle/image/plotlog.py
deleted file mode 100644
index 8679d4f272d1b7aaf8d5a397f07698a6b70e4fcd..0000000000000000000000000000000000000000
--- a/benchmark/paddle/image/plotlog.py
+++ /dev/null
@@ -1,114 +0,0 @@
-# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import sys
-import argparse
-import matplotlib.pyplot as plt
-
-
-def parse_args():
- parser = argparse.ArgumentParser('Parse Log')
- parser.add_argument(
- '--file_path', '-f', type=str, help='the path of the log file')
- parser.add_argument(
- '--sample_rate',
- '-s',
- type=float,
- default=1.0,
- help='the rate to take samples from log')
- parser.add_argument(
- '--log_period', '-p', type=int, default=1, help='the period of log')
-
- args = parser.parse_args()
- return args
-
-
-def parse_file(file_name):
- loss = []
- error = []
- with open(file_name) as f:
- for i, line in enumerate(f):
- line = line.strip()
- if not line.startswith('pass'):
- continue
- line_split = line.split(' ')
- if len(line_split) != 5:
- continue
-
- loss_str = line_split[2][:-1]
- cur_loss = float(loss_str.split('=')[-1])
- loss.append(cur_loss)
-
- err_str = line_split[3][:-1]
- cur_err = float(err_str.split('=')[-1])
- error.append(cur_err)
-
- accuracy = [1.0 - err for err in error]
-
- return loss, accuracy
-
-
-def sample(metric, sample_rate):
- interval = int(1.0 / sample_rate)
- if interval > len(metric):
- return metric[:1]
-
- num = len(metric) / interval
- idx = [interval * i for i in range(num)]
- metric_sample = [metric[id] for id in idx]
- return metric_sample
-
-
-def plot_metric(metric,
- batch_id,
- graph_title,
- line_style='b-',
- line_label='y',
- line_num=1):
- plt.figure()
- plt.title(graph_title)
- if line_num == 1:
- plt.plot(batch_id, metric, line_style, label=line_label)
- else:
- for i in range(line_num):
- plt.plot(batch_id, metric[i], line_style[i], label=line_label[i])
- plt.xlabel('batch')
- plt.ylabel(graph_title)
- plt.legend()
- plt.savefig(graph_title + '.jpg')
- plt.close()
-
-
-def main():
- args = parse_args()
- assert args.sample_rate > 0. and args.sample_rate <= 1.0, "The sample rate should in the range (0, 1]."
-
- loss, accuracy = parse_file(args.file_path)
- batch = [args.log_period * i for i in range(len(loss))]
-
- batch_sample = sample(batch, args.sample_rate)
- loss_sample = sample(loss, args.sample_rate)
- accuracy_sample = sample(accuracy, args.sample_rate)
-
- plot_metric(loss_sample, batch_sample, 'loss', line_label='loss')
- plot_metric(
- accuracy_sample,
- batch_sample,
- 'accuracy',
- line_style='g-',
- line_label='accuracy')
-
-
-if __name__ == '__main__':
- main()
diff --git a/benchmark/paddle/image/provider.py b/benchmark/paddle/image/provider.py
deleted file mode 100644
index 6ad817ccefab3e44a8f962e907ba2110a6ed4a45..0000000000000000000000000000000000000000
--- a/benchmark/paddle/image/provider.py
+++ /dev/null
@@ -1,47 +0,0 @@
-# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import io, os
-import random
-import numpy as np
-from paddle.trainer.PyDataProvider2 import *
-
-
-def initHook(settings, height, width, color, num_class, **kwargs):
- settings.height = height
- settings.width = width
- settings.color = color
- settings.num_class = num_class
- if settings.color:
- settings.data_size = settings.height * settings.width * 3
- else:
- settings.data_size = settings.height * settings.width
- settings.is_infer = kwargs.get('is_infer', False)
- settings.num_samples = kwargs.get('num_samples', 2560)
- if settings.is_infer:
- settings.slots = [dense_vector(settings.data_size)]
- else:
- settings.slots = [dense_vector(settings.data_size), integer_value(1)]
-
-
-@provider(
- init_hook=initHook, min_pool_size=-1, cache=CacheType.CACHE_PASS_IN_MEM)
-def process(settings, file_list):
- for i in xrange(settings.num_samples):
- img = np.random.rand(1, settings.data_size).reshape(-1, 1).flatten()
- if settings.is_infer:
- yield img.astype('float32')
- else:
- lab = random.randint(0, settings.num_class - 1)
- yield img.astype('float32'), int(lab)
diff --git a/benchmark/paddle/image/resnet.py b/benchmark/paddle/image/resnet.py
deleted file mode 100644
index 2846e4763f1cda4602f03af5ec649d57ee6cf0d8..0000000000000000000000000000000000000000
--- a/benchmark/paddle/image/resnet.py
+++ /dev/null
@@ -1,230 +0,0 @@
-#!/usr/bin/env python
-from paddle.trainer_config_helpers import *
-
-height = 224
-width = 224
-num_class = 1000
-batch_size = get_config_arg('batch_size', int, 64)
-layer_num = get_config_arg("layer_num", int, 50)
-is_infer = get_config_arg("is_infer", bool, False)
-num_samples = get_config_arg('num_samples', int, 2560)
-
-args = {
- 'height': height,
- 'width': width,
- 'color': True,
- 'num_class': num_class,
- 'is_infer': is_infer,
- 'num_samples': num_samples
-}
-define_py_data_sources2(
- "train.list" if not is_infer else None,
- "test.list" if is_infer else None,
- module="provider",
- obj="process",
- args=args)
-
-settings(
- batch_size=batch_size,
- learning_rate=0.01 / batch_size,
- learning_method=MomentumOptimizer(0.9),
- regularization=L2Regularization(0.0005 * batch_size))
-
-
-#######################Network Configuration #############
-def conv_bn_layer(name,
- input,
- filter_size,
- num_filters,
- stride,
- padding,
- channels=None,
- active_type=ReluActivation()):
- """
- A wrapper for conv layer with batch normalization layers.
- Note:
- conv layer has no activation.
- """
-
- tmp = img_conv_layer(
- name=name + "_conv",
- input=input,
- filter_size=filter_size,
- num_channels=channels,
- num_filters=num_filters,
- stride=stride,
- padding=padding,
- act=LinearActivation(),
- bias_attr=False)
- return batch_norm_layer(
- name=name + "_bn",
- input=tmp,
- act=active_type,
- use_global_stats=is_infer)
-
-
-def bottleneck_block(name, input, num_filters1, num_filters2):
- """
- A wrapper for bottlenect building block in ResNet.
- Last conv_bn_layer has no activation.
- Addto layer has activation of relu.
- """
- last_name = conv_bn_layer(
- name=name + '_branch2a',
- input=input,
- filter_size=1,
- num_filters=num_filters1,
- stride=1,
- padding=0)
- last_name = conv_bn_layer(
- name=name + '_branch2b',
- input=last_name,
- filter_size=3,
- num_filters=num_filters1,
- stride=1,
- padding=1)
- last_name = conv_bn_layer(
- name=name + '_branch2c',
- input=last_name,
- filter_size=1,
- num_filters=num_filters2,
- stride=1,
- padding=0,
- active_type=LinearActivation())
-
- return addto_layer(
- name=name + "_addto", input=[input, last_name], act=ReluActivation())
-
-
-def mid_projection(name, input, num_filters1, num_filters2, stride=2):
- """
- A wrapper for middile projection in ResNet.
- projection shortcuts are used for increasing dimensions,
- and other shortcuts are identity
- branch1: projection shortcuts are used for increasing
- dimensions, has no activation.
- branch2x: bottleneck building block, shortcuts are identity.
- """
- # stride = 2
- branch1 = conv_bn_layer(
- name=name + '_branch1',
- input=input,
- filter_size=1,
- num_filters=num_filters2,
- stride=stride,
- padding=0,
- active_type=LinearActivation())
-
- last_name = conv_bn_layer(
- name=name + '_branch2a',
- input=input,
- filter_size=1,
- num_filters=num_filters1,
- stride=stride,
- padding=0)
- last_name = conv_bn_layer(
- name=name + '_branch2b',
- input=last_name,
- filter_size=3,
- num_filters=num_filters1,
- stride=1,
- padding=1)
-
- last_name = conv_bn_layer(
- name=name + '_branch2c',
- input=last_name,
- filter_size=1,
- num_filters=num_filters2,
- stride=1,
- padding=0,
- active_type=LinearActivation())
-
- return addto_layer(
- name=name + "_addto", input=[branch1, last_name], act=ReluActivation())
-
-
-img = data_layer(name='image', size=height * width * 3)
-
-
-def deep_res_net(res2_num=3, res3_num=4, res4_num=6, res5_num=3):
- """
- A wrapper for 50,101,152 layers of ResNet.
- res2_num: number of blocks stacked in conv2_x
- res3_num: number of blocks stacked in conv3_x
- res4_num: number of blocks stacked in conv4_x
- res5_num: number of blocks stacked in conv5_x
- """
- # For ImageNet
- # conv1: 112x112
- tmp = conv_bn_layer(
- "conv1",
- input=img,
- filter_size=7,
- channels=3,
- num_filters=64,
- stride=2,
- padding=3)
- tmp = img_pool_layer(name="pool1", input=tmp, pool_size=3, stride=2)
-
- # conv2_x: 56x56
- tmp = mid_projection(
- name="res2_1", input=tmp, num_filters1=64, num_filters2=256, stride=1)
- for i in xrange(2, res2_num + 1, 1):
- tmp = bottleneck_block(
- name="res2_" + str(i), input=tmp, num_filters1=64, num_filters2=256)
-
- # conv3_x: 28x28
- tmp = mid_projection(
- name="res3_1", input=tmp, num_filters1=128, num_filters2=512)
- for i in xrange(2, res3_num + 1, 1):
- tmp = bottleneck_block(
- name="res3_" + str(i),
- input=tmp,
- num_filters1=128,
- num_filters2=512)
-
- # conv4_x: 14x14
- tmp = mid_projection(
- name="res4_1", input=tmp, num_filters1=256, num_filters2=1024)
- for i in xrange(2, res4_num + 1, 1):
- tmp = bottleneck_block(
- name="res4_" + str(i),
- input=tmp,
- num_filters1=256,
- num_filters2=1024)
-
- # conv5_x: 7x7
- tmp = mid_projection(
- name="res5_1", input=tmp, num_filters1=512, num_filters2=2048)
- for i in xrange(2, res5_num + 1, 1):
- tmp = bottleneck_block(
- name="res5_" + str(i),
- input=tmp,
- num_filters1=512,
- num_filters2=2048)
-
- tmp = img_pool_layer(
- name='avgpool',
- input=tmp,
- pool_size=7,
- stride=1,
- pool_type=AvgPooling())
-
- return fc_layer(input=tmp, size=num_class, act=SoftmaxActivation())
-
-
-if layer_num == 50:
- resnet = deep_res_net(3, 4, 6, 3)
-elif layer_num == 101:
- resnet = deep_res_net(3, 4, 23, 3)
-elif layer_num == 152:
- resnet = deep_res_net(3, 8, 36, 3)
-else:
- print("Wrong layer number.")
-
-if is_infer:
- outputs(resnet)
-else:
- lbl = data_layer(name="label", size=num_class)
- loss = cross_entropy(name='loss', input=resnet, label=lbl)
- outputs(loss)
diff --git a/benchmark/paddle/image/run.sh b/benchmark/paddle/image/run.sh
deleted file mode 100755
index 5b58a8d773aab795e5439b0f0e5d81bec66b5f56..0000000000000000000000000000000000000000
--- a/benchmark/paddle/image/run.sh
+++ /dev/null
@@ -1,53 +0,0 @@
-#!/bin/bash
-
-set -e
-
-function train() {
- cfg=$1
- thread=$2
- bz=$3
- args="batch_size=$3"
- prefix=$4
- paddle train --job=time \
- --config=$cfg \
- --use_gpu=True \
- --trainer_count=$thread \
- --log_period=10 \
- --test_period=100 \
- --config_args=$args \
- > logs/$prefix-${thread}gpu-$bz.log 2>&1
-}
-
-if [ ! -d "train.list" ]; then
- echo " " > train.list
-fi
-if [ ! -d "logs" ]; then
- mkdir logs
-fi
-
-#========single-gpu=========#
-# alexnet
-train alexnet.py 1 64 alexnet
-train alexnet.py 1 128 alexnet
-train alexnet.py 1 256 alexnet
-train alexnet.py 1 512 alexnet
-
-# googlenet
-train googlenet.py 1 64 googlenet
-train googlenet.py 1 128 googlenet
-train googlenet.py 1 256 googlenet
-
-# smallnet
-train smallnet_mnist_cifar.py 1 64 smallnet
-train smallnet_mnist_cifar.py 1 128 smallnet
-train smallnet_mnist_cifar.py 1 256 smallnet
-train smallnet_mnist_cifar.py 1 512 smallnet
-
-
-############################
-#========multi-gpus=========#
-train alexnet.py 4 512 alexnet
-train alexnet.py 4 1024 alexnet
-
-train googlenet.py 4 512 googlenet
-train googlenet.py 4 1024 googlenet
diff --git a/benchmark/paddle/image/run_mkl_infer.sh b/benchmark/paddle/image/run_mkl_infer.sh
deleted file mode 100755
index 0fad5e04cc992a3ec97591d3833957bb7517a8f3..0000000000000000000000000000000000000000
--- a/benchmark/paddle/image/run_mkl_infer.sh
+++ /dev/null
@@ -1,89 +0,0 @@
-#!/bin/bash
-
-set -e
-
-function clock_to_seconds() {
- hours=`echo $1 | awk -F ':' '{print $1}'`
- mins=`echo $1 | awk -F ':' '{print $2}'`
- secs=`echo $1 | awk -F ':' '{print $3}'`
- echo `awk 'BEGIN{printf "%.2f",('$secs' + '$mins' * 60 + '$hours' * 3600)}'`
-}
-
-function infer() {
- unset OMP_NUM_THREADS MKL_NUM_THREADS OMP_DYNAMIC KMP_AFFINITY
- topology=$1
- layer_num=$2
- bs=$3
- use_mkldnn=$4
- if [ $4 == "True" ]; then
- thread=1
- log="logs/infer-${topology}-${layer_num}-mkldnn-${bs}.log"
- elif [ $4 == "False" ]; then
- thread=`nproc`
- if [ $thread -gt $bs ]; then
- thread=$bs
- fi
- log="logs/infer-${topology}-${layer_num}-${thread}mklml-${bs}.log"
- else
- echo "Wrong input $4, use True or False."
- exit 0
- fi
-
- models_in="models/${topology}-${layer_num}/pass-00000/"
- if [ ! -d $models_in ]; then
- echo "Training model ${topology}_${layer_num}"
- paddle train --job=train \
- --config="${topology}.py" \
- --use_mkldnn=True \
- --use_gpu=False \
- --trainer_count=1 \
- --num_passes=1 \
- --save_dir="models/${topology}-${layer_num}" \
- --config_args="batch_size=128,layer_num=${layer_num},num_samples=256" \
- > /dev/null 2>&1
- echo "Done"
- fi
- log_period=$((256 / bs))
- paddle train --job=test \
- --config="${topology}.py" \
- --use_mkldnn=$use_mkldnn \
- --use_gpu=False \
- --trainer_count=$thread \
- --log_period=$log_period \
- --config_args="batch_size=${bs},layer_num=${layer_num},is_infer=True" \
- --init_model_path=$models_in \
- 2>&1 | tee ${log}
-
- # calculate the last 5 logs period time of 1280 samples,
- # the time before are burning time.
- start=`tail ${log} -n 7 | head -n 1 | awk -F ' ' '{print $2}' | xargs`
- end=`tail ${log} -n 2 | head -n 1 | awk -F ' ' '{print $2}' | xargs`
- start_sec=`clock_to_seconds $start`
- end_sec=`clock_to_seconds $end`
- fps=`awk 'BEGIN{printf "%.2f",(1280 / ('$end_sec' - '$start_sec'))}'`
- echo "Last 1280 samples start: ${start}(${start_sec} sec), end: ${end}(${end_sec} sec;" >> ${log}
- echo "FPS: $fps images/sec" 2>&1 | tee -a ${log}
-}
-
-if [ ! -f "train.list" ]; then
- echo " " > train.list
-fi
-if [ ! -f "test.list" ]; then
- echo " " > test.list
-fi
-if [ ! -d "logs" ]; then
- mkdir logs
-fi
-if [ ! -d "models" ]; then
- mkdir -p models
-fi
-
-# inference benchmark
-for use_mkldnn in True False; do
- for batchsize in 1 2 4 8 16; do
- infer vgg 19 $batchsize $use_mkldnn
- infer resnet 50 $batchsize $use_mkldnn
- infer googlenet v1 $batchsize $use_mkldnn
- infer alexnet 2 $batchsize $use_mkldnn
- done
-done
diff --git a/benchmark/paddle/image/run_mkl_train.sh b/benchmark/paddle/image/run_mkl_train.sh
deleted file mode 100755
index 1583bf134a276a08aa2f8e84dc63adbb205a83d6..0000000000000000000000000000000000000000
--- a/benchmark/paddle/image/run_mkl_train.sh
+++ /dev/null
@@ -1,54 +0,0 @@
-#!/bin/bash
-
-set -e
-
-function train() {
- unset OMP_NUM_THREADS MKL_NUM_THREADS OMP_DYNAMIC KMP_AFFINITY
- topology=$1
- layer_num=$2
- bs=$3
- use_mkldnn=$4
- if [ $4 == "True" ]; then
- thread=1
- log="logs/train-${topology}-${layer_num}-mkldnn-${bs}.log"
- elif [ $4 == "False" ]; then
- thread=`nproc`
- # each trainer_count use only 1 core to avoid conflict
- log="logs/train-${topology}-${layer_num}-${thread}mklml-${bs}.log"
- else
- echo "Wrong input $4, use True or False."
- exit 0
- fi
- args="batch_size=${bs},layer_num=${layer_num}"
- config="${topology}.py"
- paddle train --job=time \
- --config=$config \
- --use_mkldnn=$use_mkldnn \
- --use_gpu=False \
- --trainer_count=$thread \
- --log_period=10 \
- --test_period=100 \
- --config_args=$args \
- 2>&1 | tee ${log}
-
- avg_time=`tail ${log} -n 1 | awk -F ' ' '{print $8}' | sed 's/avg=//'`
- fps=`awk 'BEGIN{printf "%.2f",('$bs' / '$avg_time' * 1000)}'`
- echo "FPS: $fps images/sec" 2>&1 | tee -a ${log}
-}
-
-if [ ! -f "train.list" ]; then
- echo " " > train.list
-fi
-if [ ! -d "logs" ]; then
- mkdir logs
-fi
-
-# training benchmark
-for use_mkldnn in True False; do
- for batchsize in 64 128 256; do
- train vgg 19 $batchsize $use_mkldnn
- train resnet 50 $batchsize $use_mkldnn
- train googlenet v1 $batchsize $use_mkldnn
- train alexnet 2 $batchsize $use_mkldnn
- done
-done
diff --git a/benchmark/paddle/image/run_openblas_infer.sh b/benchmark/paddle/image/run_openblas_infer.sh
deleted file mode 100755
index 987381cabc2e793886099212660723c122b73bb0..0000000000000000000000000000000000000000
--- a/benchmark/paddle/image/run_openblas_infer.sh
+++ /dev/null
@@ -1,71 +0,0 @@
-#!/bin/bash
-
-set -e
-
-function clock_to_seconds() {
- hours=`echo $1 | awk -F ':' '{print $1}'`
- mins=`echo $1 | awk -F ':' '{print $2}'`
- secs=`echo $1 | awk -F ':' '{print $3}'`
- echo `awk 'BEGIN{printf "%.2f",('$secs' + '$mins' * 60 + '$hours' * 3600)}'`
-}
-
-function infer() {
- export OPENBLAS_MAIN_FREE=1
- topology=$1
- layer_num=$2
- bs=$3
- trainers=`nproc`
- if [ $trainers -gt $bs ]; then
- trainers=$bs
- fi
- log="logs/infer-${topology}-${layer_num}-${trainers}openblas-${bs}.log"
- threads=$((`nproc` / trainers))
- if [ $threads -eq 0 ]; then
- threads=1
- fi
- export OPENBLAS_NUM_THREADS=$threads
-
- models_in="models/${topology}-${layer_num}/pass-00000/"
- if [ ! -d $models_in ]; then
- echo "./run_mkl_infer.sh to save the model first"
- exit 0
- fi
- log_period=$((32 / bs))
- paddle train --job=test \
- --config="${topology}.py" \
- --use_mkldnn=False \
- --use_gpu=False \
- --trainer_count=$trainers \
- --log_period=$log_period \
- --config_args="batch_size=${bs},layer_num=${layer_num},is_infer=True,num_samples=256" \
- --init_model_path=$models_in \
- 2>&1 | tee ${log}
-
- # calculate the last 5 logs period time of 160(=32*5) samples,
- # the time before are burning time.
- start=`tail ${log} -n 7 | head -n 1 | awk -F ' ' '{print $2}' | xargs`
- end=`tail ${log} -n 2 | head -n 1 | awk -F ' ' '{print $2}' | xargs`
- start_sec=`clock_to_seconds $start`
- end_sec=`clock_to_seconds $end`
- fps=`awk 'BEGIN{printf "%.2f",(160 / ('$end_sec' - '$start_sec'))}'`
- echo "Last 160 samples start: ${start}(${start_sec} sec), end: ${end}(${end_sec} sec;" >> ${log}
- echo "FPS: $fps images/sec" 2>&1 | tee -a ${log}
-}
-
-if [ ! -f "train.list" ]; then
- echo " " > train.list
-fi
-if [ ! -f "test.list" ]; then
- echo " " > test.list
-fi
-if [ ! -d "logs" ]; then
- mkdir logs
-fi
-
-# inference benchmark
-for batchsize in 1 2 4 8 16; do
- infer vgg 19 $batchsize
- infer resnet 50 $batchsize
- infer googlenet v1 $batchsize
- infer alexnet 2 $batchsize
-done
diff --git a/benchmark/paddle/image/run_openblas_train.sh b/benchmark/paddle/image/run_openblas_train.sh
deleted file mode 100755
index cc64e1d09da02087b1737190a0b75dc7758600a6..0000000000000000000000000000000000000000
--- a/benchmark/paddle/image/run_openblas_train.sh
+++ /dev/null
@@ -1,43 +0,0 @@
-#!/bin/bash
-
-set -e
-
-function train() {
- export OPENBLAS_NUM_THREADS=1
- topology=$1
- layer_num=$2
- bs=$3
- thread=`nproc`
- # each trainer_count use only 1 core to avoid conflict
- log="logs/train-${topology}-${layer_num}-${thread}openblas-${bs}.log"
- args="batch_size=${bs},layer_num=${layer_num}"
- config="${topology}.py"
- paddle train --job=time \
- --config=$config \
- --use_mkldnn=False \
- --use_gpu=False \
- --trainer_count=$thread \
- --log_period=3 \
- --test_period=30 \
- --config_args=$args \
- 2>&1 | tee ${log}
-
- avg_time=`tail ${log} -n 1 | awk -F ' ' '{print $8}' | sed 's/avg=//'`
- fps=`awk 'BEGIN{printf "%.2f",('$bs' / '$avg_time' * 1000)}'`
- echo "FPS: $fps images/sec" 2>&1 | tee -a ${log}
-}
-
-if [ ! -f "train.list" ]; then
- echo " " > train.list
-fi
-if [ ! -d "logs" ]; then
- mkdir logs
-fi
-
-# training benchmark
-for batchsize in 64 128 256; do
- train vgg 19 $batchsize
- train resnet 50 $batchsize
- train googlenet v1 $batchsize
- train alexnet 2 $batchsize
-done
diff --git a/benchmark/paddle/image/smallnet_mnist_cifar.py b/benchmark/paddle/image/smallnet_mnist_cifar.py
deleted file mode 100644
index 58879c454f37991405d83bbb593bb5d1e977ff53..0000000000000000000000000000000000000000
--- a/benchmark/paddle/image/smallnet_mnist_cifar.py
+++ /dev/null
@@ -1,49 +0,0 @@
-#!/usr/bin/env python
-
-from paddle.trainer_config_helpers import *
-
-height = 32
-width = 32
-num_class = 10
-
-batch_size = get_config_arg('batch_size', int, 128)
-
-args = {'height': height, 'width': width, 'color': True, 'num_class': num_class}
-define_py_data_sources2(
- "train.list", None, module="provider", obj="process", args=args)
-
-settings(
- batch_size=batch_size,
- learning_rate=0.01 / batch_size,
- learning_method=MomentumOptimizer(0.9),
- regularization=L2Regularization(0.0005 * batch_size))
-
-# conv1
-net = data_layer('data', size=height * width * 3)
-net = img_conv_layer(
- input=net,
- filter_size=5,
- num_channels=3,
- num_filters=32,
- stride=1,
- padding=2)
-net = img_pool_layer(input=net, pool_size=3, stride=2, padding=1)
-
-# conv2
-net = img_conv_layer(
- input=net, filter_size=5, num_filters=32, stride=1, padding=2)
-net = img_pool_layer(
- input=net, pool_size=3, stride=2, padding=1, pool_type=AvgPooling())
-
-# conv3
-net = img_conv_layer(
- input=net, filter_size=3, num_filters=64, stride=1, padding=1)
-net = img_pool_layer(
- input=net, pool_size=3, stride=2, padding=1, pool_type=AvgPooling())
-
-net = fc_layer(input=net, size=64, act=ReluActivation())
-net = fc_layer(input=net, size=10, act=SoftmaxActivation())
-
-lab = data_layer('label', num_class)
-loss = classification_cost(input=net, label=lab)
-outputs(loss)
diff --git a/benchmark/paddle/image/vgg.py b/benchmark/paddle/image/vgg.py
deleted file mode 100644
index ca0a6798fb8c35b68cf84d263855955eb93ba0b0..0000000000000000000000000000000000000000
--- a/benchmark/paddle/image/vgg.py
+++ /dev/null
@@ -1,119 +0,0 @@
-#!/usr/bin/env python
-from paddle.trainer_config_helpers import *
-
-height = 224
-width = 224
-num_class = 1000
-batch_size = get_config_arg('batch_size', int, 64)
-layer_num = get_config_arg('layer_num', int, 19)
-is_infer = get_config_arg("is_infer", bool, False)
-num_samples = get_config_arg('num_samples', int, 2560)
-
-args = {
- 'height': height,
- 'width': width,
- 'color': True,
- 'num_class': num_class,
- 'is_infer': is_infer,
- 'num_samples': num_samples
-}
-define_py_data_sources2(
- "train.list" if not is_infer else None,
- "test.list" if is_infer else None,
- module="provider",
- obj="process",
- args=args)
-
-settings(
- batch_size=batch_size,
- learning_rate=0.001 / batch_size,
- learning_method=MomentumOptimizer(0.9),
- regularization=L2Regularization(0.0005 * batch_size))
-
-img = data_layer(name='image', size=height * width * 3)
-
-
-def vgg_network(vgg_num=3):
- tmp = img_conv_group(
- input=img,
- num_channels=3,
- conv_padding=1,
- conv_num_filter=[64, 64],
- conv_filter_size=3,
- conv_act=ReluActivation(),
- pool_size=2,
- pool_stride=2,
- pool_type=MaxPooling())
-
- tmp = img_conv_group(
- input=tmp,
- conv_num_filter=[128, 128],
- conv_padding=1,
- conv_filter_size=3,
- conv_act=ReluActivation(),
- pool_stride=2,
- pool_type=MaxPooling(),
- pool_size=2)
-
- channels = []
- for i in range(vgg_num):
- channels.append(256)
- tmp = img_conv_group(
- input=tmp,
- conv_num_filter=channels,
- conv_padding=1,
- conv_filter_size=3,
- conv_act=ReluActivation(),
- pool_stride=2,
- pool_type=MaxPooling(),
- pool_size=2)
- channels = []
- for i in range(vgg_num):
- channels.append(512)
- tmp = img_conv_group(
- input=tmp,
- conv_num_filter=channels,
- conv_padding=1,
- conv_filter_size=3,
- conv_act=ReluActivation(),
- pool_stride=2,
- pool_type=MaxPooling(),
- pool_size=2)
- tmp = img_conv_group(
- input=tmp,
- conv_num_filter=channels,
- conv_padding=1,
- conv_filter_size=3,
- conv_act=ReluActivation(),
- pool_stride=2,
- pool_type=MaxPooling(),
- pool_size=2)
-
- tmp = fc_layer(
- input=tmp,
- size=4096,
- act=ReluActivation(),
- layer_attr=ExtraAttr(drop_rate=0.5))
-
- tmp = fc_layer(
- input=tmp,
- size=4096,
- act=ReluActivation(),
- layer_attr=ExtraAttr(drop_rate=0.5))
-
- return fc_layer(input=tmp, size=num_class, act=SoftmaxActivation())
-
-
-if layer_num == 16:
- vgg = vgg_network(3)
-elif layer_num == 19:
- vgg = vgg_network(4)
-else:
- print("Wrong layer number.")
-
-if is_infer:
- outputs(vgg)
-else:
- lab = data_layer('label', num_class)
- loss = cross_entropy(input=vgg, label=lab)
- outputs(loss)
diff --git a/benchmark/paddle/rnn/imdb.py b/benchmark/paddle/rnn/imdb.py
deleted file mode 100755
index 2a67f9b0cf52484d9d44fe9db0b1e57cdd20fd43..0000000000000000000000000000000000000000
--- a/benchmark/paddle/rnn/imdb.py
+++ /dev/null
@@ -1,60 +0,0 @@
-# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import print_function
-import six.moves.cPickle as pickle
-import gzip
-import os
-import numpy
-
-
-def get_dataset_file(dataset, default_dataset, origin):
- data_dir, data_file = os.path.split(dataset)
- if (not os.path.isfile(dataset)) and data_file == default_dataset:
- from six.moves import urllib
- print('Downloading data from %s' % origin)
- urllib.request.urlretrieve(origin, dataset)
-
- return dataset
-
-
-def create_data(path="imdb.pkl"):
-
- if (not os.path.isfile('imdb.train.pkl')):
- path = get_dataset_file(
- path, "imdb.pkl",
- "http://www.iro.umontreal.ca/~lisa/deep/data/imdb.pkl")
-
- if path.endswith(".gz"):
- f = gzip.open(path, 'rb')
- else:
- f = open(path, 'rb')
-
- train_set = pickle.load(f)
- test_set = pickle.load(f)
- f.close()
-
- pickle.dump(train_set, open('imdb.train.pkl', 'wb'))
- pickle.dump(test_set, open('imdb.test.pkl', 'wb'))
-
- if (not os.path.isfile('train.list')):
- file('train.list', 'w').write('imdb.train.pkl\n')
-
-
-def main():
- create_data('imdb.pkl')
-
-
-if __name__ == "__main__":
- main()
diff --git a/benchmark/paddle/rnn/provider.py b/benchmark/paddle/rnn/provider.py
deleted file mode 100644
index 23cc0c44a98d0ae7f586d1a376a603198f2c6144..0000000000000000000000000000000000000000
--- a/benchmark/paddle/rnn/provider.py
+++ /dev/null
@@ -1,86 +0,0 @@
-# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import io, os
-import random
-import numpy as np
-import six.moves.cPickle as pickle
-from paddle.trainer.PyDataProvider2 import *
-
-
-def remove_unk(x, n_words):
- return [[1 if w >= n_words else w for w in sen] for sen in x]
-
-
-# ==============================================================
-# tensorflow uses fixed length, but PaddlePaddle can process
-# variable-length. Padding is used in benchmark in order to
-# compare with other platform.
-# ==============================================================
-def pad_sequences(sequences,
- maxlen=None,
- dtype='int32',
- padding='post',
- truncating='post',
- value=0.):
- lengths = [len(s) for s in sequences]
-
- nb_samples = len(sequences)
- if maxlen is None:
- maxlen = np.max(lengths)
-
- x = (np.ones((nb_samples, maxlen)) * value).astype(dtype)
- for idx, s in enumerate(sequences):
- if len(s) == 0:
- continue # empty list was found
- if truncating == 'pre':
- trunc = s[-maxlen:]
- elif truncating == 'post':
- trunc = s[:maxlen]
- else:
- raise ValueError("Truncating type '%s' not understood" % padding)
-
- if padding == 'post':
- x[idx, :len(trunc)] = trunc
- elif padding == 'pre':
- x[idx, -len(trunc):] = trunc
- else:
- raise ValueError("Padding type '%s' not understood" % padding)
- return x
-
-
-def initHook(settings, vocab_size, pad_seq, maxlen, **kwargs):
- settings.vocab_size = vocab_size
- settings.pad_seq = pad_seq
- settings.maxlen = maxlen
- settings.input_types = [
- integer_value_sequence(vocab_size), integer_value(2)
- ]
-
-
-@provider(
- init_hook=initHook, min_pool_size=-1, cache=CacheType.CACHE_PASS_IN_MEM)
-def process(settings, file):
- f = open(file, 'rb')
- train_set = pickle.load(f)
- f.close()
- x, y = train_set
-
- # remove unk, namely remove the words out of dictionary
- x = remove_unk(x, settings.vocab_size)
- if settings.pad_seq:
- x = pad_sequences(x, maxlen=settings.maxlen, value=0.)
-
- for i in range(len(y)):
- yield map(int, x[i]), int(y[i])
diff --git a/benchmark/paddle/rnn/rnn.py b/benchmark/paddle/rnn/rnn.py
deleted file mode 100755
index 83eb3e565473f7e7e91cddeaa3cd2aafb7e3df2c..0000000000000000000000000000000000000000
--- a/benchmark/paddle/rnn/rnn.py
+++ /dev/null
@@ -1,38 +0,0 @@
-#!/usr/bin/env python
-
-from paddle.trainer_config_helpers import *
-import imdb
-
-num_class = 2
-vocab_size = 30000
-fixedlen = 100
-batch_size = get_config_arg('batch_size', int, 128)
-lstm_num = get_config_arg('lstm_num', int, 1)
-hidden_size = get_config_arg('hidden_size', int, 128)
-# whether to pad sequence into fixed length
-pad_seq = get_config_arg('pad_seq', bool, True)
-imdb.create_data('imdb.pkl')
-
-args = {'vocab_size': vocab_size, 'pad_seq': pad_seq, 'maxlen': fixedlen}
-define_py_data_sources2(
- "train.list", None, module="provider", obj="process", args=args)
-
-settings(
- batch_size=batch_size,
- learning_rate=2e-3,
- learning_method=AdamOptimizer(),
- regularization=L2Regularization(8e-4),
- gradient_clipping_threshold=25)
-
-net = data_layer('data', size=vocab_size)
-net = embedding_layer(input=net, size=128)
-
-for i in xrange(lstm_num):
- net = simple_lstm(input=net, size=hidden_size)
-
-net = last_seq(input=net)
-net = fc_layer(input=net, size=2, act=SoftmaxActivation())
-
-lab = data_layer('label', num_class)
-loss = classification_cost(input=net, label=lab)
-outputs(loss)
diff --git a/benchmark/paddle/rnn/run.sh b/benchmark/paddle/rnn/run.sh
deleted file mode 100755
index f99a562b3f88a98560f4bf7aee98ceee9daefe67..0000000000000000000000000000000000000000
--- a/benchmark/paddle/rnn/run.sh
+++ /dev/null
@@ -1,52 +0,0 @@
-#!/bin/bash
-
-set -e
-
-function train() {
- cfg=$1
- thread=$2
- args="lstm_num=${3},seq_pad=${4},hidden_size=${5},batch_size=${6}"
- paddle train --job=time \
- --config=$cfg \
- --use_gpu=1 \
- --trainer_count=$thread \
- --log_period=10 \
- --test_period=100 \
- --num_passes=1 \
- --feed_data=1 \
- --config_args=$args \
- >logs/rnn-pad${4}-${thread}gpu-lstm${3}-batch${6}-hid${5}.log 2>&1
-}
-
-if [ ! -d "logs" ]; then
- mkdir logs
-fi
-
-## padding, single gpu
-#-----config--gpu--lstm_num--padding--hidden_size--batch_size
-## lstm_num=2, batch_size=64
-train rnn.py 1 2 1 256 64
-train rnn.py 1 2 1 512 64
-train rnn.py 1 2 1 1280 64
-
-## lstm_num=2, batch_size=128
-train rnn.py 1 2 1 256 128
-train rnn.py 1 2 1 512 128
-train rnn.py 1 2 1 1280 128
-
-## lstm_num=4, batch_size=256
-train rnn.py 1 2 1 256 256
-train rnn.py 1 2 1 512 256
-train rnn.py 1 2 1 1280 256
-
-
-#==================multi gpus=====================#
-# hidden_size=256, lstm_num=2, different batch size
-train rnn.py 4 2 1 256 128
-train rnn.py 4 2 1 256 256
-train rnn.py 4 2 1 256 512
-
-# hidden_size=512, lstm_num=4, different batch size
-train rnn.py 4 2 1 512 128
-train rnn.py 4 2 1 512 256
-train rnn.py 4 2 1 512 512
diff --git a/benchmark/tensorflow/machine_translation.py b/benchmark/tensorflow/machine_translation.py
index 8f77dce98353af53803246be8dc61063836b7867..7837669edc7a206c03e5b9fa2989bf45b35f0605 100644
--- a/benchmark/tensorflow/machine_translation.py
+++ b/benchmark/tensorflow/machine_translation.py
@@ -35,8 +35,6 @@ import os
import argparse
import time
-import paddle.v2 as paddle
-
parser = argparse.ArgumentParser(description=__doc__)
parser.add_argument(
"--embedding_dim",
diff --git a/benchmark/tensorflow/mnist.py b/benchmark/tensorflow/mnist.py
index 7140eed6eaff49b5c65f9ccb2e38f113a4cdbdbf..03d533fecfededddd3956ba83ea600456782cfc9 100644
--- a/benchmark/tensorflow/mnist.py
+++ b/benchmark/tensorflow/mnist.py
@@ -21,7 +21,6 @@ import time
import numpy as np
import tensorflow as tf
-import paddle.v2 as paddle
DTYPE = tf.float32
diff --git a/benchmark/tensorflow/resnet.py b/benchmark/tensorflow/resnet.py
index c432fa8d59571e128b9ff9e3ffa1949b792ef3a4..fdb044195766b847e16a0cc33424a999c1d9166e 100644
--- a/benchmark/tensorflow/resnet.py
+++ b/benchmark/tensorflow/resnet.py
@@ -27,7 +27,6 @@ import argparse
import time
import numpy as np
-import paddle.v2 as paddle
import tensorflow as tf
DTYPE = tf.float32
diff --git a/benchmark/tensorflow/stacked_dynamic_lstm.py b/benchmark/tensorflow/stacked_dynamic_lstm.py
index 5285033005044d907d0b7e91eb66ee7281c4f27a..1f532dc2fa082ea0f6b1da560e1a57b96d2ef1bb 100644
--- a/benchmark/tensorflow/stacked_dynamic_lstm.py
+++ b/benchmark/tensorflow/stacked_dynamic_lstm.py
@@ -21,8 +21,6 @@ import argparse
import time
import tensorflow as tf
-import paddle.v2 as paddle
-
def parse_args():
parser = argparse.ArgumentParser("LSTM model benchmark.")
diff --git a/benchmark/tensorflow/vgg.py b/benchmark/tensorflow/vgg.py
index fba5ec71a46b3ac8b2e1244424c39fd5192e5458..d32c835bd7a7dafaafe0970fb6b422db3c866370 100644
--- a/benchmark/tensorflow/vgg.py
+++ b/benchmark/tensorflow/vgg.py
@@ -13,7 +13,6 @@
# limitations under the License.
"""VGG16 benchmark in TensorFlow"""
import tensorflow as tf
-import paddle.v2 as paddle
import numpy as np
import argparse
import time
diff --git a/cmake/configure.cmake b/cmake/configure.cmake
index b0f54bf49aafb65f1a92fa95877de2cc61fc67d3..93d74bb0a8f726ad31685cbfc7831b5441cd5108 100644
--- a/cmake/configure.cmake
+++ b/cmake/configure.cmake
@@ -20,31 +20,10 @@ if(WITH_DSO)
add_definitions(-DPADDLE_USE_DSO)
endif(WITH_DSO)
-if(WITH_DOUBLE)
- add_definitions(-DPADDLE_TYPE_DOUBLE)
-endif(WITH_DOUBLE)
-
-if(WITH_ARM_FP16)
- add_definitions(-DPADDLE_ARM_FP16)
- add_definitions("-march=armv8.2-a+fp16+simd")
-endif(WITH_ARM_FP16)
-
if(WITH_TESTING)
add_definitions(-DPADDLE_WITH_TESTING)
endif(WITH_TESTING)
-if(NOT WITH_TIMER)
- add_definitions(-DPADDLE_DISABLE_TIMER)
-endif(NOT WITH_TIMER)
-
-if(USE_EIGEN_FOR_BLAS)
- add_definitions(-DPADDLE_USE_EIGEN_FOR_BLAS)
-endif(USE_EIGEN_FOR_BLAS)
-
-if(EIGEN_USE_THREADS)
- add_definitions(-DEIGEN_USE_THREADS)
-endif(EIGEN_USE_THREADS)
-
if(NOT WITH_PROFILER)
add_definitions(-DPADDLE_DISABLE_PROFILER)
endif(NOT WITH_PROFILER)
@@ -78,10 +57,6 @@ if(WIN32)
endif(NOT MSVC)
endif(WIN32)
-if(NOT WITH_GOLANG)
- add_definitions(-DPADDLE_WITHOUT_GOLANG)
-endif(NOT WITH_GOLANG)
-
if(WITH_PSLIB)
add_definitions(-DPADDLE_WITH_PSLIB)
endif()
@@ -171,55 +146,6 @@ if(WITH_DISTRIBUTE)
add_definitions(-DPADDLE_WITH_DISTRIBUTE)
endif()
-if(WITH_GOLANG)
- # we need to symlink Paddle directory into GOPATH. If we
- # don't do it and we have code that depends on Paddle, go
- # get ./... will download a new Paddle repo from Github,
- # without the changes in our current Paddle repo that we
- # want to build.
- set(GOPATH "${CMAKE_CURRENT_BINARY_DIR}/go")
- file(MAKE_DIRECTORY ${GOPATH})
- set(PADDLE_IN_GOPATH "${GOPATH}/src/github.com/PaddlePaddle/Paddle")
- file(MAKE_DIRECTORY "${PADDLE_IN_GOPATH}")
- set(PADDLE_GO_PATH "${CMAKE_SOURCE_DIR}/go")
-
- add_custom_target(go_path)
- add_custom_command(TARGET go_path
- # Symlink Paddle directory into GOPATH
- COMMAND mkdir -p ${PADDLE_IN_GOPATH}
- COMMAND rm -rf ${PADDLE_IN_GOPATH}
- COMMAND ln -sf ${CMAKE_SOURCE_DIR} ${PADDLE_IN_GOPATH}
- # Automatically get all dependencies specified in the source code
- # We can't run `go get -d ./...` for every target, because
- # multiple `go get` can not run concurrently, but make need to be
- # able to run with multiple jobs.
- WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}
- )
-
- if (GLIDE_INSTALL)
- if(EXISTS $ENV{GOPATH}/bin/glide)
- set(GLIDE "$ENV{GOPATH}/bin/glide")
- else()
- message(FATAL_ERROR "no glide executeble found: $ENV{GOPATH}/bin/glide")
- endif()
-
- # this command will only run when the file it depends is missing
- # or has changed, or the output is missing.
- add_custom_command(OUTPUT ${CMAKE_BINARY_DIR}/glide
- COMMAND env GOPATH=${GOPATH} ${GLIDE} install
- COMMAND touch ${CMAKE_BINARY_DIR}/glide
- DEPENDS ${PADDLE_SOURCE_DIR}/go/glide.lock
- WORKING_DIRECTORY "${PADDLE_IN_GOPATH}/go"
- )
-
- # depends on the custom command which outputs
- # ${CMAKE_BINARY_DIR}/glide, the custom command does not need to
- # run every time this target is built.
- add_custom_target(go_vendor DEPENDS ${CMAKE_BINARY_DIR}/glide go_path)
- endif()
-
-endif(WITH_GOLANG)
-
if(WITH_GRPC)
add_definitions(-DPADDLE_WITH_GRPC)
endif(WITH_GRPC)
diff --git a/cmake/cuda.cmake b/cmake/cuda.cmake
index ef4192ecc98ea6de0c81c1f33320528d547b818a..735846db1db04e3884d72ec62d911d9a0efec147 100644
--- a/cmake/cuda.cmake
+++ b/cmake/cuda.cmake
@@ -168,10 +168,7 @@ elseif (${CUDA_VERSION} LESS 11.0) # CUDA 10.x
endif()
include_directories(${CUDA_INCLUDE_DIRS})
-list(APPEND EXTERNAL_LIBS ${CUDA_LIBRARIES} ${CUDA_rt_LIBRARY})
if(NOT WITH_DSO)
- # TODO(panyx0718): CUPTI only allows DSO?
- list(APPEND EXTERNAL_LIBS ${CUDNN_LIBRARY} ${CUPTI_LIBRARY} ${CUDA_CUBLAS_LIBRARIES} ${CUDA_curand_LIBRARY} ${NCCL_LIBRARY})
if(WIN32)
set_property(GLOBAL PROPERTY CUDA_MODULES ${CUDNN_LIBRARY} ${CUDA_CUBLAS_LIBRARIES} ${CUDA_curand_LIBRARY})
endif(WIN32)
diff --git a/cmake/external/anakin.cmake b/cmake/external/anakin.cmake
index 06fc6061bc98eec8c4c71860333f7d3456952aeb..77f4b34537577c7c5782675e7af19c73bc9f8e32 100644
--- a/cmake/external/anakin.cmake
+++ b/cmake/external/anakin.cmake
@@ -74,5 +74,3 @@ add_dependencies(anakin_shared extern_anakin)
add_library(anakin_saber SHARED IMPORTED GLOBAL)
set_property(TARGET anakin_saber PROPERTY IMPORTED_LOCATION ${ANAKIN_SABER_LIB})
add_dependencies(anakin_saber extern_anakin)
-
-list(APPEND external_project_dependencies anakin_shared anakin_saber)
diff --git a/cmake/external/any.cmake b/cmake/external/any.cmake
deleted file mode 100644
index 85cce80b70a1fcf57015ac7a264e4950616b2717..0000000000000000000000000000000000000000
--- a/cmake/external/any.cmake
+++ /dev/null
@@ -1,31 +0,0 @@
-INCLUDE(ExternalProject)
-
-SET(ANY_SOURCE_DIR ${THIRD_PARTY_PATH}/any)
-
-INCLUDE_DIRECTORIES(${ANY_SOURCE_DIR}/src/extern_lib_any)
-
-ExternalProject_Add(
- extern_lib_any
- ${EXTERNAL_PROJECT_LOG_ARGS}
- GIT_REPOSITORY "https://github.com/PaddlePaddle/any.git"
- GIT_TAG "15595d8324be9e8a9a80d9ae442fdd12bd66df5d"
- PREFIX ${ANY_SOURCE_DIR}
- UPDATE_COMMAND ""
- CONFIGURE_COMMAND ""
- BUILD_COMMAND ""
- INSTALL_COMMAND ""
- TEST_COMMAND ""
-)
-
-if (${CMAKE_VERSION} VERSION_LESS "3.3.0")
- set(dummyfile ${CMAKE_CURRENT_BINARY_DIR}/lib_any_dummy.c)
- file(WRITE ${dummyfile} "const char * dummy_any = \"${dummyfile}\";")
- add_library(lib_any STATIC ${dummyfile})
-else()
- add_library(lib_any INTERFACE)
-endif()
-
-add_dependencies(lib_any extern_lib_any)
-
-add_definitions(-DANY_IMPL_ANY_CAST_MOVEABLE)
-LIST(APPEND external_project_dependencies lib_any)
diff --git a/cmake/external/boost.cmake b/cmake/external/boost.cmake
index 12412a51a0fd1aaa9702bd4547fb935d94012ada..fc204dc9193bb28b654936048dd61a9b461abb2f 100644
--- a/cmake/external/boost.cmake
+++ b/cmake/external/boost.cmake
@@ -57,5 +57,4 @@ else()
endif()
add_dependencies(boost ${BOOST_PROJECT})
-list(APPEND external_project_dependencies boost)
set(Boost_INCLUDE_DIR ${BOOST_INCLUDE_DIR})
diff --git a/cmake/external/brpc.cmake b/cmake/external/brpc.cmake
index 6b50cff7a66a33d9413627bfbc663cca06ba86f3..989d1dbd4cf593e779b94f7bb5eda613f000859c 100644
--- a/cmake/external/brpc.cmake
+++ b/cmake/external/brpc.cmake
@@ -69,5 +69,3 @@ SET_PROPERTY(TARGET brpc PROPERTY IMPORTED_LOCATION ${BRPC_LIBRARIES})
ADD_DEPENDENCIES(brpc extern_brpc)
add_definitions(-DBRPC_WITH_GLOG)
-
-LIST(APPEND external_project_dependencies brpc)
diff --git a/cmake/external/cub.cmake b/cmake/external/cub.cmake
index f06728de91e4509be661e56baef641d591928b66..41ad8207743201fbddd1d678fc5122afe68207ae 100644
--- a/cmake/external/cub.cmake
+++ b/cmake/external/cub.cmake
@@ -31,5 +31,3 @@ else()
endif()
add_dependencies(cub extern_cub)
-
-LIST(APPEND external_project_dependencies cub)
diff --git a/cmake/external/dlpack.cmake b/cmake/external/dlpack.cmake
index 4587475d7902a134eecd54bf8241fb96d175d0ba..63dd16b28e40a0c2d5310bec011c721285049952 100644
--- a/cmake/external/dlpack.cmake
+++ b/cmake/external/dlpack.cmake
@@ -27,5 +27,3 @@ else()
endif()
add_dependencies(dlpack extern_dlpack)
-
-LIST(APPEND external_project_dependencies dlpack)
diff --git a/cmake/external/eigen.cmake b/cmake/external/eigen.cmake
index 6aef97f21244efd09e22781f703553a19a9e1860..72441160f89d2c188d35fc6b08b5f0b6d746a1ad 100644
--- a/cmake/external/eigen.cmake
+++ b/cmake/external/eigen.cmake
@@ -52,5 +52,3 @@ else()
endif()
add_dependencies(eigen3 extern_eigen3)
-
-LIST(APPEND external_project_dependencies eigen3)
diff --git a/cmake/external/gflags.cmake b/cmake/external/gflags.cmake
index f3ca74faea3629ddce053c49ef1e629f230fdc49..911920ed6212b87aa25ba9a1faf7696fbcb22587 100644
--- a/cmake/external/gflags.cmake
+++ b/cmake/external/gflags.cmake
@@ -61,8 +61,6 @@ ADD_LIBRARY(gflags STATIC IMPORTED GLOBAL)
SET_PROPERTY(TARGET gflags PROPERTY IMPORTED_LOCATION ${GFLAGS_LIBRARIES})
ADD_DEPENDENCIES(gflags extern_gflags)
-LIST(APPEND external_project_dependencies gflags)
-
# On Windows (including MinGW), the Shlwapi library is used by gflags if available.
if (WIN32)
include(CheckIncludeFileCXX)
diff --git a/cmake/external/glog.cmake b/cmake/external/glog.cmake
index d3a4d69d3a05515fdf72074083470e19b4ec255c..7fa17ce6b7b106c47c486729d0136748c73176a7 100644
--- a/cmake/external/glog.cmake
+++ b/cmake/external/glog.cmake
@@ -72,5 +72,3 @@ ADD_LIBRARY(glog STATIC IMPORTED GLOBAL)
SET_PROPERTY(TARGET glog PROPERTY IMPORTED_LOCATION ${GLOG_LIBRARIES})
ADD_DEPENDENCIES(glog extern_glog gflags)
LINK_LIBRARIES(glog gflags)
-
-LIST(APPEND external_project_dependencies glog)
diff --git a/cmake/external/gtest.cmake b/cmake/external/gtest.cmake
index 9be625b620287cd4c644ae6908000fd5eec5d5c7..e459526583bd5ee3c89807657f3c30376e57d971 100644
--- a/cmake/external/gtest.cmake
+++ b/cmake/external/gtest.cmake
@@ -79,5 +79,4 @@ IF(WITH_TESTING OR (WITH_DISTRIBUTE AND NOT WITH_GRPC))
SET_PROPERTY(TARGET gtest_main PROPERTY IMPORTED_LOCATION ${GTEST_MAIN_LIBRARIES})
ADD_DEPENDENCIES(gtest_main extern_gtest)
- LIST(APPEND external_project_dependencies gtest gtest_main)
ENDIF(WITH_TESTING OR (WITH_DISTRIBUTE AND NOT WITH_GRPC))
diff --git a/cmake/external/leveldb.cmake b/cmake/external/leveldb.cmake
index 0df61b01ab64c8b751bdc3893dd5294ad39ab928..ac0febd076e659927a6a882ff487c61ac130437a 100644
--- a/cmake/external/leveldb.cmake
+++ b/cmake/external/leveldb.cmake
@@ -39,6 +39,3 @@ ADD_DEPENDENCIES(extern_leveldb snappy)
ADD_LIBRARY(leveldb STATIC IMPORTED GLOBAL)
SET_PROPERTY(TARGET leveldb PROPERTY IMPORTED_LOCATION ${LEVELDB_LIBRARIES})
ADD_DEPENDENCIES(leveldb extern_leveldb)
-
-LIST(APPEND external_project_dependencies leveldb)
-
diff --git a/cmake/external/libmct.cmake b/cmake/external/libmct.cmake
index 27cff8cfb6315c9b4fa5677ad9062bee73a0e5d8..b944f2945b7874ca76bf1a19e0a363f564851a62 100644
--- a/cmake/external/libmct.cmake
+++ b/cmake/external/libmct.cmake
@@ -72,7 +72,4 @@ else()
add_library(libmct INTERFACE)
endif()
-#ADD_LIBRARY(libmct SHARED IMPORTED GLOBAL)
ADD_DEPENDENCIES(libmct ${LIBMCT_PROJECT})
-LIST(APPEND external_project_dependencies libmct)
-
diff --git a/cmake/external/libxsmm.cmake b/cmake/external/libxsmm.cmake
index 39f49d210a20d49a06c120361ecf0a5d07d1af28..69cdba7c5921f14a87172d95791332e364045b26 100644
--- a/cmake/external/libxsmm.cmake
+++ b/cmake/external/libxsmm.cmake
@@ -53,5 +53,3 @@ MESSAGE(STATUS "Libxsmm library: ${LIBXSMM_LIBS}")
include_directories(${LIBXSMM_INCLUDE_DIR})
ADD_DEFINITIONS(-DPADDLE_WITH_LIBXSMM)
ADD_DEPENDENCIES(libxsmm extern_libxsmm)
-LIST(APPEND external_project_dependencies libxsmm)
-
diff --git a/cmake/external/mkldnn.cmake b/cmake/external/mkldnn.cmake
index 92fe76d05c7507c295b784bc37870abfc31a0a29..94a266c50114a94d125467d55a6367a6999e3298 100644
--- a/cmake/external/mkldnn.cmake
+++ b/cmake/external/mkldnn.cmake
@@ -89,7 +89,6 @@ SET_PROPERTY(TARGET shared_mkldnn PROPERTY IMPORTED_LOCATION ${MKLDNN_LIB})
ADD_DEPENDENCIES(shared_mkldnn ${MKLDNN_PROJECT})
MESSAGE(STATUS "MKLDNN library: ${MKLDNN_LIB}")
add_definitions(-DPADDLE_WITH_MKLDNN)
-LIST(APPEND external_project_dependencies shared_mkldnn)
# generate a static dummy target to track mkldnn dependencies
# for cc_library(xxx SRCS xxx.c DEPS mkldnn)
diff --git a/cmake/external/mklml.cmake b/cmake/external/mklml.cmake
index 2caff27357687018f29c1efc55b7b82c9dc3ccf6..54826cedb871690a82b535ae3ed102600277c622 100644
--- a/cmake/external/mklml.cmake
+++ b/cmake/external/mklml.cmake
@@ -73,4 +73,3 @@ INCLUDE_DIRECTORIES(${MKLML_INC_DIR})
ADD_LIBRARY(mklml SHARED IMPORTED GLOBAL)
SET_PROPERTY(TARGET mklml PROPERTY IMPORTED_LOCATION ${MKLML_LIB})
ADD_DEPENDENCIES(mklml ${MKLML_PROJECT})
-LIST(APPEND external_project_dependencies mklml)
diff --git a/cmake/external/ngraph.cmake b/cmake/external/ngraph.cmake
index 14af98b2d74d4aa955aac27727e05567788a84c9..5812a61f0ddc3a3233ff212710fc1b16aa140724 100644
--- a/cmake/external/ngraph.cmake
+++ b/cmake/external/ngraph.cmake
@@ -77,4 +77,3 @@ add_dependencies(ngraph ${NGRAPH_PROJECT})
target_compile_definitions(ngraph INTERFACE -DPADDLE_WITH_NGRAPH)
target_include_directories(ngraph INTERFACE ${NGRAPH_INC_DIR})
target_link_libraries(ngraph INTERFACE ${NGRAPH_SHARED_LIB})
-LIST(APPEND external_project_dependencies ngraph)
diff --git a/cmake/external/openblas.cmake b/cmake/external/openblas.cmake
index b347a592929836a473ac764c0af1153b07d54258..d8a4a0be6f5aaa3a1a4977bbc68348743f2fa742 100644
--- a/cmake/external/openblas.cmake
+++ b/cmake/external/openblas.cmake
@@ -11,11 +11,6 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
-
-IF(USE_EIGEN_FOR_BLAS)
- return()
-ENDIF(USE_EIGEN_FOR_BLAS)
-
INCLUDE(cblas)
IF(NOT ${CBLAS_FOUND})
@@ -91,7 +86,6 @@ ENDIF()
IF(NOT ${CBLAS_FOUND})
ADD_DEPENDENCIES(cblas extern_openblas)
- LIST(APPEND external_project_dependencies cblas)
ELSE()
IF("${CBLAS_PROVIDER}" STREQUAL "MKLML")
ADD_DEPENDENCIES(cblas mklml)
diff --git a/cmake/external/protobuf.cmake b/cmake/external/protobuf.cmake
index e05b7694ddf1e1652b00f156cde1a2d433c9fc46..bc7fe5454f5883108e43b4ca47920995dc13a1ff 100644
--- a/cmake/external/protobuf.cmake
+++ b/cmake/external/protobuf.cmake
@@ -129,7 +129,6 @@ macro(PROMPT_PROTOBUF_LIB)
ADD_DEPENDENCIES(protoc ${dep})
ENDFOREACH()
- LIST(APPEND external_project_dependencies protobuf)
RETURN()
endmacro()
macro(SET_PROTOBUF_VERSION)
@@ -231,7 +230,7 @@ FUNCTION(build_protobuf TARGET_NAME BUILD_FOR_HOST)
)
ENDFUNCTION()
-SET(PROTOBUF_VERSION 3.1)
+SET(PROTOBUF_VERSION 3.1.0)
IF(NOT PROTOBUF_FOUND)
build_protobuf(extern_protobuf FALSE)
diff --git a/cmake/external/pslib.cmake b/cmake/external/pslib.cmake
index b4ea268e5a48e29d00b0ec8b957b61a42553ec7e..0287e5cf2a835ed65c5fc26ff69d2653d5db217e 100644
--- a/cmake/external/pslib.cmake
+++ b/cmake/external/pslib.cmake
@@ -70,4 +70,3 @@ ExternalProject_Add(
ADD_LIBRARY(pslib SHARED IMPORTED GLOBAL)
SET_PROPERTY(TARGET pslib PROPERTY IMPORTED_LOCATION ${PSLIB_LIB})
ADD_DEPENDENCIES(pslib ${PSLIB_PROJECT})
-LIST(APPEND external_project_dependencies pslib)
diff --git a/cmake/external/pslib_brpc.cmake b/cmake/external/pslib_brpc.cmake
index 8b43f2ef5c999fc351543ba958c7cc4b0856625d..22c8c1b463764b6e107c5f3da25d51b36c2ce59f 100644
--- a/cmake/external/pslib_brpc.cmake
+++ b/cmake/external/pslib_brpc.cmake
@@ -70,4 +70,3 @@ ExternalProject_Add(
ADD_LIBRARY(pslib_brpc SHARED IMPORTED GLOBAL)
SET_PROPERTY(TARGET pslib_brpc PROPERTY IMPORTED_LOCATION ${PSLIB_BRPC_LIB})
ADD_DEPENDENCIES(pslib_brpc ${PSLIB_BRPC_PROJECT})
-LIST(APPEND external_project_dependencies pslib_brpc)
diff --git a/cmake/external/threadpool.cmake b/cmake/external/threadpool.cmake
index 0159815fed81bdff6de3e561af569e9edc75f947..1f56bc7ab056ef0dd95d603ebe3461ef044b2a79 100644
--- a/cmake/external/threadpool.cmake
+++ b/cmake/external/threadpool.cmake
@@ -26,5 +26,3 @@ else()
endif()
add_dependencies(simple_threadpool extern_threadpool)
-
-LIST(APPEND external_project_dependencies simple_threadpool)
diff --git a/cmake/external/warpctc.cmake b/cmake/external/warpctc.cmake
index 7a25aaf15f2c7f46d99394d82d69bc24e4f5cb2c..6f2af8670f25c00ac0970fe4ae2b0c5b03aa0d9e 100644
--- a/cmake/external/warpctc.cmake
+++ b/cmake/external/warpctc.cmake
@@ -83,5 +83,3 @@ INCLUDE_DIRECTORIES(${THIRD_PARTY_PATH}/install) # For Paddle code to include wa
ADD_LIBRARY(warpctc SHARED IMPORTED GLOBAL)
SET_PROPERTY(TARGET warpctc PROPERTY IMPORTED_LOCATION ${WARPCTC_LIBRARIES})
ADD_DEPENDENCIES(warpctc extern_warpctc)
-
-LIST(APPEND external_project_dependencies warpctc)
diff --git a/cmake/external/xbyak.cmake b/cmake/external/xbyak.cmake
index 384c2f9328296ce6a8a6293be6cc47e5063dd3c4..1d61154c0d45dea795902d6544deb796693db263 100644
--- a/cmake/external/xbyak.cmake
+++ b/cmake/external/xbyak.cmake
@@ -55,4 +55,3 @@ else()
endif()
add_dependencies(xbyak ${XBYAK_PROJECT})
-list(APPEND external_project_dependencies xbyak)
diff --git a/cmake/external/xxhash.cmake b/cmake/external/xxhash.cmake
index a0f300c2e8bab9e7402f869eed1b4c2d1c579aab..23b1e02108642df561948a6faa3152effb7ca932 100644
--- a/cmake/external/xxhash.cmake
+++ b/cmake/external/xxhash.cmake
@@ -71,5 +71,3 @@ add_library(xxhash STATIC IMPORTED GLOBAL)
set_property(TARGET xxhash PROPERTY IMPORTED_LOCATION ${XXHASH_LIBRARIES})
include_directories(${XXHASH_INCLUDE_DIR})
add_dependencies(xxhash extern_xxhash)
-
-LIST(APPEND external_project_dependencies xxhash)
diff --git a/cmake/external/zlib.cmake b/cmake/external/zlib.cmake
index 6c8d79c25e6a2655711fe4450e65600c9a584015..5569fefe992d10ad4820e51e677f40271d0214e7 100644
--- a/cmake/external/zlib.cmake
+++ b/cmake/external/zlib.cmake
@@ -57,5 +57,3 @@ ENDIF(WIN32)
ADD_LIBRARY(zlib STATIC IMPORTED GLOBAL)
SET_PROPERTY(TARGET zlib PROPERTY IMPORTED_LOCATION ${ZLIB_LIBRARIES})
ADD_DEPENDENCIES(zlib extern_zlib)
-
-LIST(APPEND external_project_dependencies zlib)
diff --git a/cmake/hip.cmake b/cmake/hip.cmake
index 4276bc5b08cd88a52bb5782bca87fc37deabd830..c3a748db502037f926dc241e4c3bc26a83ad3468 100644
--- a/cmake/hip.cmake
+++ b/cmake/hip.cmake
@@ -11,8 +11,6 @@ include_directories("/opt/rocm/rocrand/include")
include_directories("/opt/rocm/rccl/include")
include_directories("/opt/rocm/thrust")
-list(APPEND EXTERNAL_LIBS "-L/opt/rocm/lib/ -lhip_hcc")
-
set(HIP_HCC_FLAGS "${HIP_HCC_FLAGS} -fPIC -DPADDLE_WITH_HIP -std=c++11" )
if(WITH_DSO)
@@ -31,22 +29,12 @@ if(WITH_GRPC)
set(HIP_HCC_FLAGS "${HIP_HCC_FLAGS} -DPADDLE_WITH_GRPC")
endif(WITH_GRPC)
-if(NOT WITH_GOLANG)
- set(HIP_HCC_FLAGS "${HIP_HCC_FLAGS} -DPADDLE_WITHOUT_GOLANG")
-endif(NOT WITH_GOLANG)
-
if(WITH_MKLDNN)
set(HIP_HCC_FLAGS "${HIP_HCC_FLAGS} -DPADDLE_WITH_MKLDNN")
endif(WITH_MKLDNN)
set(HIP_HCC_FLAGS "${HIP_HCC_FLAGS} -DANY_IMPL_ANY_CAST_MOVEABLE")
-if(NOT WITH_RDMA)
- set(HIP_HCC_FLAGS "${HIP_HCC_FLAGS} -DPADDLE_DISABLE_RDMA")
-endif(NOT WITH_RDMA)
-
-
-
if(CMAKE_BUILD_TYPE STREQUAL "Debug")
list(APPEND HIP_HCC_FLAGS ${CMAKE_CXX_FLAGS_DEBUG})
elseif(CMAKE_BUILD_TYPE STREQUAL "RelWithDebInfo")
diff --git a/cmake/rdma.cmake b/cmake/rdma.cmake
deleted file mode 100644
index b698f3bdc3ff586a72badee3e0109e29285b457f..0000000000000000000000000000000000000000
--- a/cmake/rdma.cmake
+++ /dev/null
@@ -1,82 +0,0 @@
-# user should download rdma first from subversion repository
-
-# execute following instruction to download svn mannally
-# svn co https://svn.baidu.com/sys/ip/trunk/rdma/sockrdmav1 rdma/
-# svn co https://svn.baidu.com/sys/ip/trunk/rdma/thirdparty rdma/
-# we use static output in svn repositories to avoid implict bugs from not standard runtime env.
-
-if(WITH_RDMA)
- set(RDMA_ROOT $ENV{RDMA_ROOT} CACHE PATH "Folder contains RDMA sock library and thirdparty library")
-
- function(generate_rdma_links)
- #redirect to current DIR to isolate the pollution from system runtime environment
- #it can benifits unified control for different gcc environment.
- #e.g, by default gcc48 did not refer /usr/lib64 which could contain low version
- #runtime libraries that will crash process while loading it. That redirect trick
- #can fix it.
- execute_process(
- COMMAND mkdir -p librdma
- COMMAND ln -s -f /usr/lib64/libibverbs.so.1.0.0 librdma/libibverbs.so.1
- COMMAND ln -s -f /usr/lib64/libibverbs.so.1.0.0 librdma/libibverbs.so
- COMMAND ln -s -f /usr/lib64/librdmacm.so.1.0.0 librdma/librdmacm.so.1
- COMMAND ln -s -f /usr/lib64/librdmacm.so.1.0.0 librdma/librdmacm.so
- COMMAND ln -s -f /lib64/libnl.so.1.1.4 librdma/libnl.so.1
- COMMAND ln -s -f /lib64/libnl.so.1.1.4 librdma/libnl.so
- WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}
- )
- endfunction(generate_rdma_links)
-
- #check and set headers
- find_path(RDMA_INC_SXISOCK sxi_sock.h PATHS ${RDMA_ROOT}/sockrdmav1/output/include)
- find_path(RDMA_INC_XIO libxio.h PATHS ${RDMA_ROOT}/thirdparty/output/accelio)
- find_path(RDMA_INC_EVENT event2 PATHS ${RDMA_ROOT}/thirdparty/output/libevent)
- find_path(RDMA_INC_NUMA numa.h PATHS ${RDMA_ROOT}/thirdparty/output/libnuma)
-
- #check and set libs
- find_library(RDMA_LIB_SXISOCK NAMES sxisock PATHS ${RDMA_ROOT}/sockrdmav1/output)
- find_library(RDMA_LIB_XIO NAMES xio PATHS ${RDMA_ROOT}/thirdparty/output/accelio)
- find_library(RDMA_LIB_EVENT NAMES event PATHS ${RDMA_ROOT}/thirdparty/output/libevent)
- find_library(RDMA_LIB_EVENT_CORE NAMES event_core PATHS ${RDMA_ROOT}/thirdparty/output/libevent)
- find_library(RDMA_LIB_EVENT_EXTRA NAMES event_extra PATHS ${RDMA_ROOT}/thirdparty/output/libevent)
- find_library(RDMA_LIB_EVENT_PTHREADS NAMES event_pthreads PATHS ${RDMA_ROOT}/thirdparty/output/libevent)
- find_library(RDMA_LIB_NUMA NAMES numa PATHS ${RDMA_ROOT}/thirdparty/output/libnuma)
-
- if(
- RDMA_INC_SXISOCK AND
- RDMA_INC_XIO AND
- RDMA_INC_EVENT AND
- RDMA_INC_NUMA AND
- RDMA_LIB_SXISOCK AND
- RDMA_LIB_XIO AND
- RDMA_LIB_EVENT AND
- RDMA_LIB_EVENT_CORE AND
- RDMA_LIB_EVENT_EXTRA AND
- RDMA_LIB_EVENT_PTHREADS AND
- RDMA_LIB_NUMA
- )
-
- set(RDMA_INC_DIR
- ${RDMA_INC_SXISOCK}
- ${RDMA_INC_XIO}
- ${RDMA_INC_EVENT}
- ${RDMA_INC_NUMA})
- set(RDMA_LIBS
- ${RDMA_LIB_SXISOCK}
- ${RDMA_LIB_XIO}
- ${RDMA_LIB_EVENT}
- ${RDMA_LIB_EVENT_CORE}
- ${RDMA_LIB_EVENT_EXTRA}
- ${RDMA_LIB_EVENT_PTHREADS}
- ${RDMA_LIB_NUMA}
- )
- set(RDMA_LD_FLAGS "-L./librdma -libverbs -lrdmacm -Xlinker -rpath ./librdma")
- include_directories("${RDMA_INC_DIR}")
- else()
- #if this module is not called, RDMA_INC_DIR RDMA_LIBS will be null, so top module always refer this variable
- message(FATAL_ERROR, "RDMA libraries are not found, try to set RDMA_ROOT or check all related libraries.")
- endif()
-else(WITH_RDMA)
- set(RDMA_LIBS "")
- set(RDMA_LD_FLAGS "")
- add_definitions(-DPADDLE_DISABLE_RDMA)
-endif(WITH_RDMA)
diff --git a/cmake/tensorrt.cmake b/cmake/tensorrt.cmake
index 3dc7171551bfb7aff8d1e75083c98b00378d247f..891ff222633741f9894c2fdb6c0096a48f8a35e1 100644
--- a/cmake/tensorrt.cmake
+++ b/cmake/tensorrt.cmake
@@ -33,6 +33,5 @@ if(TENSORRT_FOUND)
message(STATUS "Current TensorRT header is ${TENSORRT_INCLUDE_DIR}/NvInfer.h. "
"Current TensorRT version is v${TENSORRT_MAJOR_VERSION}. ")
include_directories(${TENSORRT_INCLUDE_DIR})
- list(APPEND EXTERNAL_LIBS ${TENSORRT_LIBRARY})
add_definitions(-DPADDLE_WITH_TENSORRT)
endif()
diff --git a/paddle/contrib/float16/run_float16_demo.sh b/paddle/contrib/float16/run_float16_demo.sh
index 031225a85dabb26e5d9ea06f58909c049e7f0c08..34cb7a12db171915f2bc7df8787dd62cd381de68 100755
--- a/paddle/contrib/float16/run_float16_demo.sh
+++ b/paddle/contrib/float16/run_float16_demo.sh
@@ -14,9 +14,7 @@ cmake .. -DWITH_AVX=OFF \
-DWITH_MKL=OFF \
-DWITH_GPU=ON \
-DWITH_TESTING=ON \
- -DWITH_TIMER=ON \
-DWITH_PROFILER=ON \
- -DWITH_FLUID_ONLY=ON
make -j `nproc`
pip install -U "$WHEEL_PATH/$(ls $WHEEL_PATH)"
diff --git a/paddle/fluid/API.spec b/paddle/fluid/API.spec
index df961be911537582fb60dd258fcdd4a5dd41a38e..0dde33813fdb9e942406e0b9949f26a36fb1dbf1 100644
--- a/paddle/fluid/API.spec
+++ b/paddle/fluid/API.spec
@@ -71,7 +71,7 @@ paddle.fluid.initializer.NumpyArrayInitializer.__init__ ArgSpec(args=['self', 'v
paddle.fluid.layers.fc ArgSpec(args=['input', 'size', 'num_flatten_dims', 'param_attr', 'bias_attr', 'act', 'is_test', 'name'], varargs=None, keywords=None, defaults=(1, None, None, None, False, None))
paddle.fluid.layers.embedding ArgSpec(args=['input', 'size', 'is_sparse', 'is_distributed', 'padding_idx', 'param_attr', 'dtype'], varargs=None, keywords=None, defaults=(False, False, None, None, 'float32'))
paddle.fluid.layers.dynamic_lstm ArgSpec(args=['input', 'size', 'h_0', 'c_0', 'param_attr', 'bias_attr', 'use_peepholes', 'is_reverse', 'gate_activation', 'cell_activation', 'candidate_activation', 'dtype', 'name'], varargs=None, keywords=None, defaults=(None, None, None, None, True, False, 'sigmoid', 'tanh', 'tanh', 'float32', None))
-paddle.fluid.layers.dynamic_lstmp ArgSpec(args=['input', 'size', 'proj_size', 'param_attr', 'bias_attr', 'use_peepholes', 'is_reverse', 'gate_activation', 'cell_activation', 'candidate_activation', 'proj_activation', 'dtype', 'name'], varargs=None, keywords=None, defaults=(None, None, True, False, 'sigmoid', 'tanh', 'tanh', 'tanh', 'float32', None))
+paddle.fluid.layers.dynamic_lstmp ArgSpec(args=['input', 'size', 'proj_size', 'param_attr', 'bias_attr', 'use_peepholes', 'is_reverse', 'gate_activation', 'cell_activation', 'candidate_activation', 'proj_activation', 'dtype', 'name', 'h_0', 'c_0', 'cell_clip', 'proj_clip'], varargs=None, keywords=None, defaults=(None, None, True, False, 'sigmoid', 'tanh', 'tanh', 'tanh', 'float32', None, None, None, None, None))
paddle.fluid.layers.dynamic_gru ArgSpec(args=['input', 'size', 'param_attr', 'bias_attr', 'is_reverse', 'gate_activation', 'candidate_activation', 'h_0', 'origin_mode'], varargs=None, keywords=None, defaults=(None, None, False, 'sigmoid', 'tanh', None, False))
paddle.fluid.layers.gru_unit ArgSpec(args=['input', 'hidden', 'size', 'param_attr', 'bias_attr', 'activation', 'gate_activation', 'origin_mode'], varargs=None, keywords=None, defaults=(None, None, 'tanh', 'sigmoid', False))
paddle.fluid.layers.linear_chain_crf ArgSpec(args=['input', 'label', 'param_attr'], varargs=None, keywords=None, defaults=(None,))
@@ -261,7 +261,7 @@ paddle.fluid.layers.increment ArgSpec(args=['x', 'value', 'in_place'], varargs=N
paddle.fluid.layers.array_write ArgSpec(args=['x', 'i', 'array'], varargs=None, keywords=None, defaults=(None,))
paddle.fluid.layers.create_array ArgSpec(args=['dtype'], varargs=None, keywords=None, defaults=None)
paddle.fluid.layers.less_than ArgSpec(args=['x', 'y', 'force_cpu', 'cond'], varargs=None, keywords='ignored', defaults=(None, None))
-paddle.fluid.layers.equal ArgSpec(args=['x', 'y', 'cond'], varargs=None, keywords='ignored', defaults=(None,))
+paddle.fluid.layers.equal ArgSpec(args=['x', 'y', 'cond'], varargs=None, keywords=None, defaults=(None,))
paddle.fluid.layers.array_read ArgSpec(args=['array', 'i'], varargs=None, keywords=None, defaults=None)
paddle.fluid.layers.array_length ArgSpec(args=['array'], varargs=None, keywords=None, defaults=None)
paddle.fluid.layers.IfElse.__init__ ArgSpec(args=['self', 'cond', 'name'], varargs=None, keywords=None, defaults=(None,))
@@ -427,7 +427,7 @@ paddle.fluid.optimizer.MomentumOptimizer.__init__ ArgSpec(args=['self', 'learnin
paddle.fluid.optimizer.MomentumOptimizer.apply_gradients ArgSpec(args=['self', 'params_grads'], varargs=None, keywords=None, defaults=None)
paddle.fluid.optimizer.MomentumOptimizer.backward ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set', 'callbacks'], varargs=None, keywords=None, defaults=(None, None, None, None))
paddle.fluid.optimizer.MomentumOptimizer.minimize ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set'], varargs=None, keywords=None, defaults=(None, None, None))
-paddle.fluid.optimizer.AdagradOptimizer.__init__ ArgSpec(args=['self', 'learning_rate', 'epsilon', 'regularization', 'name'], varargs=None, keywords=None, defaults=(1e-06, None, None))
+paddle.fluid.optimizer.AdagradOptimizer.__init__ ArgSpec(args=['self', 'learning_rate', 'epsilon', 'regularization', 'name', 'initial_accumulator_value'], varargs=None, keywords=None, defaults=(1e-06, None, None, 0.0))
paddle.fluid.optimizer.AdagradOptimizer.apply_gradients ArgSpec(args=['self', 'params_grads'], varargs=None, keywords=None, defaults=None)
paddle.fluid.optimizer.AdagradOptimizer.backward ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set', 'callbacks'], varargs=None, keywords=None, defaults=(None, None, None, None))
paddle.fluid.optimizer.AdagradOptimizer.minimize ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set'], varargs=None, keywords=None, defaults=(None, None, None))
@@ -473,11 +473,11 @@ paddle.fluid.LoDTensor.has_valid_recursive_sequence_lengths has_valid_recursive_
paddle.fluid.LoDTensor.lod lod(self: paddle.fluid.core.LoDTensor) -> List[List[int]]
paddle.fluid.LoDTensor.recursive_sequence_lengths recursive_sequence_lengths(self: paddle.fluid.core.LoDTensor) -> List[List[int]]
paddle.fluid.LoDTensor.set 1. set(self: paddle.fluid.core.Tensor, arg0: numpy.ndarray[float32], arg1: paddle::platform::CPUPlace) -> None 2. set(self: paddle.fluid.core.Tensor, arg0: numpy.ndarray[int32], arg1: paddle::platform::CPUPlace) -> None 3. set(self: paddle.fluid.core.Tensor, arg0: numpy.ndarray[float64], arg1: paddle::platform::CPUPlace) -> None 4. set(self: paddle.fluid.core.Tensor, arg0: numpy.ndarray[int64], arg1: paddle::platform::CPUPlace) -> None 5. set(self: paddle.fluid.core.Tensor, arg0: numpy.ndarray[bool], arg1: paddle::platform::CPUPlace) -> None 6. set(self: paddle.fluid.core.Tensor, arg0: numpy.ndarray[uint16], arg1: paddle::platform::CPUPlace) -> None 7. set(self: paddle.fluid.core.Tensor, arg0: numpy.ndarray[uint8], arg1: paddle::platform::CPUPlace) -> None 8. set(self: paddle.fluid.core.Tensor, arg0: numpy.ndarray[int8], arg1: paddle::platform::CPUPlace) -> None 9. set(self: paddle.fluid.core.Tensor, arg0: numpy.ndarray[float32], arg1: paddle::platform::CUDAPlace) -> None 10. set(self: paddle.fluid.core.Tensor, arg0: numpy.ndarray[int32], arg1: paddle::platform::CUDAPlace) -> None 11. set(self: paddle.fluid.core.Tensor, arg0: numpy.ndarray[float64], arg1: paddle::platform::CUDAPlace) -> None 12. set(self: paddle.fluid.core.Tensor, arg0: numpy.ndarray[int64], arg1: paddle::platform::CUDAPlace) -> None 13. set(self: paddle.fluid.core.Tensor, arg0: numpy.ndarray[bool], arg1: paddle::platform::CUDAPlace) -> None 14. set(self: paddle.fluid.core.Tensor, arg0: numpy.ndarray[uint16], arg1: paddle::platform::CUDAPlace) -> None 15. set(self: paddle.fluid.core.Tensor, arg0: numpy.ndarray[uint8], arg1: paddle::platform::CUDAPlace) -> None 16. set(self: paddle.fluid.core.Tensor, arg0: numpy.ndarray[int8], arg1: paddle::platform::CUDAPlace) -> None 17. set(self: paddle.fluid.core.Tensor, arg0: numpy.ndarray[float32], arg1: paddle::platform::CUDAPinnedPlace) -> None 18. set(self: paddle.fluid.core.Tensor, arg0: numpy.ndarray[int32], arg1: paddle::platform::CUDAPinnedPlace) -> None 19. set(self: paddle.fluid.core.Tensor, arg0: numpy.ndarray[float64], arg1: paddle::platform::CUDAPinnedPlace) -> None 20. set(self: paddle.fluid.core.Tensor, arg0: numpy.ndarray[int64], arg1: paddle::platform::CUDAPinnedPlace) -> None 21. set(self: paddle.fluid.core.Tensor, arg0: numpy.ndarray[bool], arg1: paddle::platform::CUDAPinnedPlace) -> None 22. set(self: paddle.fluid.core.Tensor, arg0: numpy.ndarray[uint16], arg1: paddle::platform::CUDAPinnedPlace) -> None 23. set(self: paddle.fluid.core.Tensor, arg0: numpy.ndarray[uint8], arg1: paddle::platform::CUDAPinnedPlace) -> None 24. set(self: paddle.fluid.core.Tensor, arg0: numpy.ndarray[int8], arg1: paddle::platform::CUDAPinnedPlace) -> None
-paddle.fluid.LoDTensor.set_lod set_lod(self: paddle.fluid.core.LoDTensor, arg0: List[List[int]]) -> None
-paddle.fluid.LoDTensor.set_recursive_sequence_lengths set_recursive_sequence_lengths(self: paddle.fluid.core.LoDTensor, arg0: List[List[int]]) -> None
+paddle.fluid.LoDTensor.set_lod set_lod(self: paddle.fluid.core.LoDTensor, lod: List[List[int]]) -> None
+paddle.fluid.LoDTensor.set_recursive_sequence_lengths set_recursive_sequence_lengths(self: paddle.fluid.core.LoDTensor, recursive_sequence_lengths: List[List[int]]) -> None
paddle.fluid.LoDTensor.shape shape(self: paddle.fluid.core.Tensor) -> List[int]
paddle.fluid.LoDTensorArray.__init__ __init__(self: paddle.fluid.core.LoDTensorArray) -> None
-paddle.fluid.LoDTensorArray.append append(self: paddle.fluid.core.LoDTensorArray, arg0: paddle.fluid.core.LoDTensor) -> None
+paddle.fluid.LoDTensorArray.append append(self: paddle.fluid.core.LoDTensorArray, tensor: paddle.fluid.core.LoDTensor) -> None
paddle.fluid.CPUPlace.__init__ __init__(self: paddle.fluid.core.CPUPlace) -> None
paddle.fluid.CUDAPlace.__init__ __init__(self: paddle.fluid.core.CUDAPlace, arg0: int) -> None
paddle.fluid.CUDAPinnedPlace.__init__ __init__(self: paddle.fluid.core.CUDAPinnedPlace) -> None
diff --git a/paddle/fluid/framework/async_executor.cc b/paddle/fluid/framework/async_executor.cc
index 1d9678a1ba1409e5c18d3e25b3aa13dfbbf76908..60708bf609d6f8b327d46fe585cbbcf07a62eece 100644
--- a/paddle/fluid/framework/async_executor.cc
+++ b/paddle/fluid/framework/async_executor.cc
@@ -244,6 +244,7 @@ void AsyncExecutor::RunFromFile(const ProgramDesc& main_program,
auto& block = main_program.Block(0);
for (auto var_name : fetch_var_names) {
auto var_desc = block.FindVar(var_name);
+ PADDLE_ENFORCE_NOT_NULL(var_desc, "%s is not found.", var_name);
auto shapes = var_desc->GetShape();
PADDLE_ENFORCE(shapes[shapes.size() - 1] == 1,
"var %s: Fetched var has wrong shape, "
diff --git a/paddle/fluid/framework/details/CMakeLists.txt b/paddle/fluid/framework/details/CMakeLists.txt
index e88084424baf7eb5cd1d67ea19966866d71ec3eb..dc308fd2592bb158f46f6eac9dd0df25787559fe 100644
--- a/paddle/fluid/framework/details/CMakeLists.txt
+++ b/paddle/fluid/framework/details/CMakeLists.txt
@@ -50,7 +50,12 @@ cc_library(data_balance_op_handle SRCS data_balance_op_handle.cc DEPS op_handle_
cc_library(gather_op_handle SRCS gather_op_handle.cc DEPS op_handle_base scope ddim memory variable_visitor)
cc_library(fuse_vars_op_handle SRCS fuse_vars_op_handle.cc DEPS op_handle_base scope)
-cc_library(memory_optimize_helper SRCS memory_optimize_helper.cc DEPS graph graph_helper)
+if(WITH_GPU)
+cc_library(memory_optimize_helper SRCS memory_optimize_helper.cc DEPS graph graph_helper gpu_info)
+else()
+cc_library(memory_optimize_helper SRCS memory_optimize_helper.cc DEPS graph graph_helper cpu_info)
+endif()
+
cc_library(memory_optimize_pass SRCS memory_optimize_pass.cc DEPS memory_optimize_helper pass)
cc_library(inplace_op_pass SRCS inplace_op_pass.cc DEPS memory_optimize_pass op_info)
cc_library(modify_op_lock_and_record_event_pass SRCS modify_op_lock_and_record_event_pass.cc DEPS computation_op_handle op_graph_view multi_devices_helper)
diff --git a/paddle/fluid/framework/details/all_reduce_deps_pass.cc b/paddle/fluid/framework/details/all_reduce_deps_pass.cc
index b7d6edd389d8e40835dadf56d7c54d53402f6f4d..2e20c436dfdb61fcda78cd044b86848c750cf22c 100644
--- a/paddle/fluid/framework/details/all_reduce_deps_pass.cc
+++ b/paddle/fluid/framework/details/all_reduce_deps_pass.cc
@@ -30,8 +30,6 @@ namespace paddle {
namespace framework {
namespace details {
-static constexpr char kAllOpDescs[] = "all_op_descs";
-
VarHandle* GetValidInput(const OpHandleBase* a) {
for (auto p : a->Inputs()) {
VarHandle* b = dynamic_cast(p);
diff --git a/paddle/fluid/framework/details/all_reduce_op_handle.cc b/paddle/fluid/framework/details/all_reduce_op_handle.cc
index dd77f7099f581a5b825916c4ea010023f3ad5bcd..c1f9c2b60c915370df7793f26fe83812a7ced96d 100644
--- a/paddle/fluid/framework/details/all_reduce_op_handle.cc
+++ b/paddle/fluid/framework/details/all_reduce_op_handle.cc
@@ -53,7 +53,7 @@ AllReduceOpHandle::AllReduceOpHandle(ir::Node *node,
#endif
void AllReduceOpHandle::RunImpl() {
- platform::RecordEvent record_event(Name(), dev_ctxes_.cbegin()->second);
+ platform::RecordEvent record_event(Name());
WaitInputVarGenerated();
auto in_var_handles = DynamicCast(this->Inputs());
diff --git a/paddle/fluid/framework/details/broadcast_op_handle.cc b/paddle/fluid/framework/details/broadcast_op_handle.cc
index 89d626edddfee3d2c43a3cf2232ad4fc1611e655..fdff83b92819b39974f3b2ce0848710f1ee02a41 100644
--- a/paddle/fluid/framework/details/broadcast_op_handle.cc
+++ b/paddle/fluid/framework/details/broadcast_op_handle.cc
@@ -22,7 +22,7 @@ namespace framework {
namespace details {
void BroadcastOpHandle::RunImpl() {
- platform::RecordEvent record_event(Name(), dev_ctxes_.begin()->second);
+ platform::RecordEvent record_event(Name());
if (places_.size() == 1) return;
@@ -30,7 +30,7 @@ void BroadcastOpHandle::RunImpl() {
VarHandle *in_var_handle;
{
auto in_var_handles = DynamicCast(inputs_);
- PADDLE_ENFORCE_EQ(in_var_handles.size(), 1,
+ PADDLE_ENFORCE_EQ(in_var_handles.size(), 1UL,
"The number of input should be one.");
in_var_handle = in_var_handles[0];
}
diff --git a/paddle/fluid/framework/details/build_strategy.cc b/paddle/fluid/framework/details/build_strategy.cc
index f8030c53f72bc8a6f007c1eb6a3072abd8037de2..8c6c9f35e84f4fd7d2b5486ac0eb60beceb512a2 100644
--- a/paddle/fluid/framework/details/build_strategy.cc
+++ b/paddle/fluid/framework/details/build_strategy.cc
@@ -34,9 +34,11 @@ namespace details {
static inline bool SeqOnlyAllReduceOps(const BuildStrategy &strategy) {
// Should fix the allreduce op order if scheduling
// them in multiple threads or processes to avoid hang.
+ // NOTE: ParallelGraph would execute this pass on each graph, so
+ // don't need to append it here.
return (!strategy.enable_sequential_execution_ &&
- strategy.num_trainers_ > 1) ||
- strategy.enable_parallel_graph_;
+ strategy.num_trainers_ > 1) &&
+ !strategy.enable_parallel_graph_;
}
class ParallelExecutorPassBuilder : public ir::PassBuilder {
@@ -240,7 +242,9 @@ std::unique_ptr BuildStrategy::Apply(
continue;
}
}
+ VLOG(3) << "Start Apply Pass " << pass->Type();
graph = pass->Apply(std::move(graph));
+ VLOG(3) << "Finish Apply Pass " << pass->Type();
}
return graph;
}
diff --git a/paddle/fluid/framework/details/data_balance_op_handle.cc b/paddle/fluid/framework/details/data_balance_op_handle.cc
index 48dcc52623369f7b0f51cd8c8aeb198b37467d5f..c9b52b68205ade000e21a3d06b80af86cbe01f34 100644
--- a/paddle/fluid/framework/details/data_balance_op_handle.cc
+++ b/paddle/fluid/framework/details/data_balance_op_handle.cc
@@ -86,7 +86,7 @@ std::vector> DataBalanceOpHandle::GetBalancePlan(
}
void DataBalanceOpHandle::RunImpl() {
- PADDLE_ENFORCE_GT(places_.size(), 1,
+ PADDLE_ENFORCE_GT(places_.size(), 1UL,
"Data balance can only be enabled when the number of "
"places to run larger than 1.");
auto in_var_handles = DynamicCast(this->Inputs());
diff --git a/paddle/fluid/framework/details/fuse_vars_op_handle.cc b/paddle/fluid/framework/details/fuse_vars_op_handle.cc
index d65b0920698748e8a2ded728d78fbcd69b7bae0e..14292c0a5d06aa3ff12b46b5768b136fa925752d 100644
--- a/paddle/fluid/framework/details/fuse_vars_op_handle.cc
+++ b/paddle/fluid/framework/details/fuse_vars_op_handle.cc
@@ -23,7 +23,7 @@ void FuseVarsOpHandle::RunImpl() {
auto in_var_handles = DynamicCast(this->Inputs());
auto out_var_handles = DynamicCast(this->Outputs());
- PADDLE_ENFORCE_EQ(in_var_handles.size(), 0);
+ PADDLE_ENFORCE_EQ(in_var_handles.size(), 0UL);
PADDLE_ENFORCE_EQ(out_var_handles.size() - 1, inputs_numel_.size(), "");
auto scope = local_scope_->FindVar(kLocalExecScopeName)->Get();
diff --git a/paddle/fluid/framework/details/fused_broadcast_op_handle.cc b/paddle/fluid/framework/details/fused_broadcast_op_handle.cc
index 51dfa2d0711f49aaefab0af3549283dbf77eee4a..f48561ea32e6a3bbc7e9f2a8326b080ad21c6d61 100644
--- a/paddle/fluid/framework/details/fused_broadcast_op_handle.cc
+++ b/paddle/fluid/framework/details/fused_broadcast_op_handle.cc
@@ -22,7 +22,7 @@ namespace framework {
namespace details {
void FusedBroadcastOpHandle::RunImpl() {
- platform::RecordEvent record_event(Name(), dev_ctxes_.begin()->second);
+ platform::RecordEvent record_event(Name());
if (places_.size() == 1UL) return;
diff --git a/paddle/fluid/framework/details/fused_broadcast_op_handle_test.cc b/paddle/fluid/framework/details/fused_broadcast_op_handle_test.cc
index be0d941c4f9c2fe8fbb1da8ec2c11868112fcf9b..6d53dac5c0a20b4340e71274a00a7f3c0cd08ff6 100644
--- a/paddle/fluid/framework/details/fused_broadcast_op_handle_test.cc
+++ b/paddle/fluid/framework/details/fused_broadcast_op_handle_test.cc
@@ -34,8 +34,8 @@ struct TestFusedBroadcastOpHandle : TestBroadcastOpHandle {
->Var(details::kLocalExecScopeName)
->GetMutable() = &local_scope;
for (size_t j = 0; j < input_scope_idxes.size(); ++j) {
- local_scope.Var("out_var" + j);
- if (i == j) local_scope.Var("in_var" + j);
+ local_scope.Var("out_var" + std::to_string(j));
+ if (i == j) local_scope.Var("in_var" + std::to_string(j));
}
param_scopes_.emplace_back(&local_scope);
}
@@ -62,20 +62,21 @@ struct TestFusedBroadcastOpHandle : TestBroadcastOpHandle {
for (size_t i = 0; i < input_scope_idxes.size(); ++i) {
// add input var handle
- nodes_.emplace_back(
- ir::CreateNodeForTest("in_node" + i, ir::Node::Type::kVariable));
- VarHandle* in_var_handle =
- new VarHandle(nodes_.back().get(), 1, input_scope_idxes[i],
- "in_var" + i, place_list_[input_scope_idxes[i]]);
+ nodes_.emplace_back(ir::CreateNodeForTest("in_node" + std::to_string(i),
+ ir::Node::Type::kVariable));
+ VarHandle* in_var_handle = new VarHandle(
+ nodes_.back().get(), 1, input_scope_idxes[i],
+ "in_var" + std::to_string(i), place_list_[input_scope_idxes[i]]);
vars_.emplace_back(in_var_handle);
op_handle_->AddInput(in_var_handle);
// add output var handle
for (size_t j = 0; j < place_list_.size(); ++j) {
- nodes_.emplace_back(
- ir::CreateNodeForTest("out_node" + i, ir::Node::Type::kVariable));
- VarHandle* out_var_handle = new VarHandle(
- nodes_.back().get(), 2, j, "out_var" + i, place_list_[j]);
+ nodes_.emplace_back(ir::CreateNodeForTest(
+ "out_node" + std::to_string(i), ir::Node::Type::kVariable));
+ VarHandle* out_var_handle =
+ new VarHandle(nodes_.back().get(), 2, j,
+ "out_var" + std::to_string(i), place_list_[j]);
vars_.emplace_back(out_var_handle);
op_handle_->AddOutput(out_var_handle);
}
@@ -86,7 +87,7 @@ struct TestFusedBroadcastOpHandle : TestBroadcastOpHandle {
std::vector> send_vec;
f::LoD lod{{0, 10, 20}};
for (size_t i = 0; i < input_scope_idxes.size(); ++i) {
- const std::string varname("in_var" + i);
+ const std::string varname("in_var" + std::to_string(i));
float val_scalar = static_cast(i);
send_vec.push_back(
InitLoDTensor(varname, input_scope_idxes[i], lod, val_scalar));
@@ -96,7 +97,7 @@ struct TestFusedBroadcastOpHandle : TestBroadcastOpHandle {
WaitAll();
for (size_t i = 0; i < input_scope_idxes.size(); ++i) {
- const std::string& varname("out_var" + i);
+ const std::string& varname("out_var" + std::to_string(i));
for (size_t j = 0; j < place_list_.size(); ++j) {
LoDTensorEqual(varname, send_vec[i], lod, param_scopes_[j]);
}
@@ -109,7 +110,7 @@ struct TestFusedBroadcastOpHandle : TestBroadcastOpHandle {
2, 4, 6, 3, 1, 1, 1, 1, 3, 7};
int height = static_cast(kDims[0] * 2);
for (size_t i = 0; i < input_scope_idxes.size(); ++i) {
- const std::string varname("in_var" + i);
+ const std::string varname("in_var" + std::to_string(i));
float val_scalar = static_cast(i);
send_vector.push_back(InitSelectedRows(varname, input_scope_idxes[i],
rows, height, val_scalar));
@@ -119,7 +120,7 @@ struct TestFusedBroadcastOpHandle : TestBroadcastOpHandle {
WaitAll();
for (size_t i = 0; i < input_scope_idxes.size(); ++i) {
- const std::string& varname("out_var" + i);
+ const std::string& varname("out_var" + std::to_string(i));
for (size_t j = 0; j < place_list_.size(); ++j) {
SelectedRowsEqual(varname, input_scope_idxes[i], send_vector[i], rows,
height);
diff --git a/paddle/fluid/framework/details/inplace_op_pass.cc b/paddle/fluid/framework/details/inplace_op_pass.cc
index b0c5968499be3a959dd6103424f25056a6dc2282..c91fc81b2defc9fe6b5720ce652a9aa94b27735e 100644
--- a/paddle/fluid/framework/details/inplace_op_pass.cc
+++ b/paddle/fluid/framework/details/inplace_op_pass.cc
@@ -49,7 +49,7 @@ DEFINE_bool(
"If this option turns on, only these op in whitelist can be inplaced."
"If it turns off, all of the running op can be candidate of inplaced op."
"Such as scale, elementwise_add"
- "By default, it's turned on");
+ "By default, it's turned off");
DECLARE_string(memory_optimize_debug);
diff --git a/paddle/fluid/framework/details/memory_optimize_helper.cc b/paddle/fluid/framework/details/memory_optimize_helper.cc
index 6345ba335997ec42ebc63f90e9bf6a3ed2648edc..db4e805bb692ee44ac50337fae54f8dbfe389e6f 100644
--- a/paddle/fluid/framework/details/memory_optimize_helper.cc
+++ b/paddle/fluid/framework/details/memory_optimize_helper.cc
@@ -13,13 +13,19 @@
// limitations under the License.
#include "paddle/fluid/framework/details/memory_optimize_helper.h"
+#include
#include
#include
-#include
+#include
#include
#include
#include
#include "paddle/fluid/framework/var_desc.h"
+#include "paddle/fluid/platform/cpu_info.h"
+
+#ifdef PADDLE_WITH_CUDA
+#include "paddle/fluid/platform/gpu_info.h"
+#endif // PADDLE_WITH_CUDA
namespace paddle {
namespace framework {
@@ -123,7 +129,13 @@ size_t NodeSize(const VarDesc& node) {
}
size_t NodeSize(ir::Node* n) {
- auto* desc = FindVarDescInBlock(n);
+ VarDesc* desc = nullptr;
+ // some op do not have block pointer
+ if (n->inputs[0]->Op() != nullptr) {
+ desc = FindVarDescInBlock(n);
+ } else {
+ desc = n->Var();
+ }
return NodeSize(*desc);
}
@@ -166,6 +178,11 @@ struct NodeComparator {
bool operator()(ir::Node* lhs, ir::Node* rhs) const {
auto* lhs_desc = FindVarDescInBlock(lhs);
auto* rhs_desc = FindVarDescInBlock(rhs);
+ // match data type
+ if (lhs_desc->GetDataType() != rhs_desc->GetDataType()) {
+ return false;
+ }
+ // match shape
auto lhs_shape = lhs_desc->GetShape();
auto rhs_shape = rhs_desc->GetShape();
if ((lhs_shape[0] == -1 && rhs_shape[0] == -1) ||
@@ -230,6 +247,27 @@ ir::Node* OrderedSet::FindBestFitNode(ir::Node* var) const {
return found_node;
}
+ir::Node* OrderedSet::FindNextBestFitNode(ir::Node* var, ir::Node* prev) const {
+ ir::Node* found_node = nullptr;
+ NodeComparator functor;
+ auto it =
+ std::find_if(nodes_.begin(), nodes_.end(), [&](const NodeVector& v) {
+ if (v.front() == prev)
+ return true;
+ else
+ return false;
+ });
+ PADDLE_ENFORCE(it != nodes_.end(), "Not found previous in node list!");
+ for (it = std::next(it); it != nodes_.end(); ++it) {
+ auto& candidate = it->front();
+ if (functor(var, candidate)) {
+ found_node = candidate;
+ break;
+ }
+ }
+ return found_node;
+}
+
bool OrderedSet::Has(ir::Node* var) const {
if (mark_table_.count(var->Name())) {
auto& node_in_samename = mark_table_.at(var->Name());
@@ -241,10 +279,15 @@ bool OrderedSet::Has(ir::Node* var) const {
return false;
}
+void OrderedSet::Erase(const std::string& var) {
+ PADDLE_ENFORCE(mark_table_.count(var));
+ nodes_.erase(mark_table_[var]);
+ mark_table_.erase(var);
+}
+
void OrderedSet::Erase(ir::Node* var) {
- PADDLE_ENFORCE(mark_table_.count(var->Name()));
- nodes_.erase(mark_table_[var->Name()]);
- mark_table_.erase(var->Name());
+ PADDLE_ENFORCE(var != nullptr);
+ Erase(var->Name());
}
std::string OrderedSet::ToString() const {
@@ -274,14 +317,35 @@ bool NodeCanReused(ir::Node* node) {
return flag;
}
+int MinChunkSize() {
+ int size{0};
+#ifdef PADDLE_WITH_CUDA
+ size = platform::GpuMinChunkSize();
+#else
+ size = platform::CpuMinChunkSize();
+#endif // PADDLE_WITH_CUDA
+ return size;
+}
+
bool NodeCanReused(const VarDesc& node) {
auto type = node.GetType();
+ // only these types holds bulk of gpu memory
if (!(type == proto::VarType::LOD_TENSOR ||
type == proto::VarType::SELECTED_ROWS ||
type == proto::VarType::LOD_TENSOR_ARRAY)) {
return false;
}
- if (node.Persistable() || node.GetShape().empty()) {
+ // persistable variable is parameter
+ if (node.Persistable()) {
+ return false;
+ }
+ // shape < min_chunk_size is meaningless.
+ // further more, fetched loss always has size = 1
+ // which should not be reused.
+ auto shape = node.GetShape();
+ int size = std::abs(
+ std::accumulate(shape.begin(), shape.end(), 1, std::multiplies()));
+ if (shape.empty() || size < MinChunkSize()) {
return false;
}
// vars can be @EMPTY@, @LR_DECAY_REUSE_ID@. For example, while_grad
@@ -461,7 +525,9 @@ ir::Node* ControlFlowGraph::GetNodeByName(const std::string& name,
for (auto* node : ops_) {
if (node == op) break;
for (auto& output : node->outputs) {
- if (output->Name() == name) {
+ PADDLE_ENFORCE((output != nullptr && output->IsVar()),
+ "Output is empty!");
+ if (output->Var() && output->Name() == name) {
found_node = output;
}
}
diff --git a/paddle/fluid/framework/details/memory_optimize_helper.h b/paddle/fluid/framework/details/memory_optimize_helper.h
index 0bfaf827fea84030de48a9984197f5b39f5c9261..377367faf3c529496b00004f23159750cc2e4bc4 100644
--- a/paddle/fluid/framework/details/memory_optimize_helper.h
+++ b/paddle/fluid/framework/details/memory_optimize_helper.h
@@ -29,8 +29,6 @@ namespace paddle {
namespace framework {
namespace details {
-constexpr char kAllOpDescs[] = "all_op_descs";
-
std::vector SortOpLikeDescOrder(const ir::Graph& graph);
// NOTE(dzh): A ordered set for node reuse in memory optimize.
@@ -55,6 +53,7 @@ class OrderedSet {
void Insert(ir::Node* var);
void Erase(ir::Node* var);
+ void Erase(const std::string& var);
bool Has(ir::Node* var) const;
void Clear() {
mark_table_.clear();
@@ -62,6 +61,7 @@ class OrderedSet {
}
// find the bestfit shape node block with var.
ir::Node* FindBestFitNode(ir::Node* var) const;
+ ir::Node* FindNextBestFitNode(ir::Node* var, ir::Node* prev) const;
// map store non-const iterator, can not promise const
int GetNodeIndexInPool(ir::Node* var);
// pool all node to string
diff --git a/paddle/fluid/framework/details/memory_optimize_helper_test.cc b/paddle/fluid/framework/details/memory_optimize_helper_test.cc
index 5c13dda9e5491044d2bcbed4b24d438cc8b8a413..3cfe297a73cf4128b7191cbd432cdceadc6240ec 100644
--- a/paddle/fluid/framework/details/memory_optimize_helper_test.cc
+++ b/paddle/fluid/framework/details/memory_optimize_helper_test.cc
@@ -107,6 +107,52 @@ TEST(OrderedSet, Normal) {
ASSERT_EQ(pool.GetNodeIndexInPool(cache), 5); // match 4:[5,2]
}
}
+
+TEST(OrderedSet, FindBestFitNode) {
+ OrderedSet pool;
+ std::vector> nodes;
+ ProgramDesc prog;
+ BlockDesc* block_desc = prog.MutableBlock(0);
+ auto* op_desc = block_desc->AppendOp();
+ op_desc->SetType("dummy");
+ std::unique_ptr op = ir::CreateNodeForTest(op_desc);
+
+ {
+ auto desc = block_desc->Var("a");
+ desc->SetShape({128, 128});
+ std::unique_ptr node = ir::CreateNodeForTest(desc);
+ node->inputs.emplace_back(op.get());
+ nodes.emplace_back(std::move(node));
+ }
+ {
+ auto desc = block_desc->Var("b");
+ desc->SetShape({128, 129});
+ std::unique_ptr node = ir::CreateNodeForTest(desc);
+ node->inputs.emplace_back(op.get());
+ nodes.emplace_back(std::move(node));
+ }
+ {
+ auto desc = block_desc->Var("c");
+ desc->SetShape({128, 128});
+ std::unique_ptr node = ir::CreateNodeForTest(desc);
+ node->inputs.emplace_back(op.get());
+ nodes.emplace_back(std::move(node));
+ }
+
+ for (auto& node : nodes) {
+ pool.Insert(node.get());
+ }
+
+ // FindNextBestFitNode
+ auto* n = nodes[0].get();
+ auto* cache = pool.FindBestFitNode(n);
+ PADDLE_ENFORCE(cache->Name() == "a");
+ cache = pool.FindNextBestFitNode(n, cache);
+ PADDLE_ENFORCE(cache->Name() == "c");
+ cache = pool.FindNextBestFitNode(n, cache);
+ PADDLE_ENFORCE(cache->Name() == "b");
+}
+
} // namespace details
} // namespace framework
} // namespace paddle
diff --git a/paddle/fluid/framework/details/memory_optimize_pass.cc b/paddle/fluid/framework/details/memory_optimize_pass.cc
index 41e4a834df0abab069ab1f6cd21c8479b911250c..fd02bc4697e72cdd1e5af63d71931b8fe8cc29e3 100644
--- a/paddle/fluid/framework/details/memory_optimize_pass.cc
+++ b/paddle/fluid/framework/details/memory_optimize_pass.cc
@@ -69,55 +69,59 @@ std::unique_ptr MemoryOptimizePass::ApplyImpl(
}
for (auto& var : op->outputs) {
- if (!NodeCanReused(var) || cfg_->Use(op).count(var->Name()) == 0 ||
- skip_set_.count(var->Name()))
+ if (var->IsVar() && !var->IsCtrlVar() && skip_set_.count(var->Name())) {
+ VLOG(3) << "Skip set contains variable of " << var->Name()
+ << "disable reuse on it. skipped";
continue;
- ir::Node* cache = pool_.FindBestFitNode(var);
-
- if (var->Name() == FLAGS_memory_optimize_debug) {
- VLOG(3) << "start match var " << DebugString(var) << " of op "
- << op->Name();
- VLOG(3) << pool_.ToString();
- VLOG(3) << "matched in pool : "
- << ((cache == nullptr) ? "False" : "True");
}
+ if (NodeCanReused(var) && cfg_->Use(op).count(var->Name()) == 0) {
+ ir::Node* cache = pool_.FindBestFitNode(var);
+ while (cache != nullptr && var->Name() == cache->Name()) {
+ VLOG(3) << "The same cache variable is cascade reused. "
+ << cache->Name() << " is re-filled to the pool after "
+ << "the reused op is finished. Current op can not "
+ << "replace it again. Skip this candidate.";
+ cache = pool_.FindNextBestFitNode(var, cache);
+ }
+ if (var->Name() == FLAGS_memory_optimize_debug) {
+ VLOG(3) << "start match var " << DebugString(var) << " of op "
+ << op->Name();
+ VLOG(3) << pool_.ToString();
+ VLOG(3) << "matched in pool : "
+ << ((cache == nullptr) ? "False" : "True");
+ }
- if (cache == nullptr) continue;
- if (var->Name() == cache->Name()) {
- VLOG(3) << "The same cache variable is cascade reused." << var->Name()
- << " is re-filled to the pool after"
- << "the reused op is finished. Current op can not "
- << "replace it again. Skip this candidate.";
- continue;
-
- int node_idx_in_pool = pool_.GetNodeIndexInPool(cache);
- VLOG(3) << string::Sprintf(
- "!!! %s, %s => %s, cache idx %d, pool size %d",
- std::to_string(reuse_id++), DebugString(var), DebugString(cache),
- node_idx_in_pool, static_cast(pool_.size()));
-
- // update CFG Graph on the fly.
- // reused var maybe re-fill into the pool
- cfg_->RenameVarInCFGGraph(var->Name(), cache->Name(), idx);
- // NOTE(dzhwinter): we need to both update the ProgramDesc
- // and IR Graph. because op_desc/var_desc is used in CreateOp,
- // CreateVar when running happens. But IR Graph
- // define the dependence relationship between nodes.
- RenameVarInGraphDesc(var->Name(), cache->Name(), idx);
- RenameVarInGraphNode(var->Name(), cache->Name(), idx, graph.get());
-
- pool_.Erase(cache);
- }
+ if (cache != nullptr) {
+ int node_idx_in_pool = pool_.GetNodeIndexInPool(cache);
+ VLOG(3) << string::Sprintf(
+ "!!! %s, %s => %s, cache idx %d, pool size %d",
+ std::to_string(reuse_id++), DebugString(var), DebugString(cache),
+ node_idx_in_pool, static_cast(pool_.size()));
+ // NOTE(dzhwinter): update the ProgramDesc/IR Graph
+ // and the CFG Graph on the fly.
+ //
+ // IR Graph define the dependence relationship between nodes.
+ //
+ // ProgramDesc defines the input/output vars. Its used in
+ // CreateOp, CreateVar when running happens.
+ //
+ // CFG Graph store the liveness information, when reuse happens
+ // we also need to update the variable liveness.
+ const std::string var_name = var->Name();
+ const std::string cache_name = cache->Name();
- // fill the pool
- std::unordered_set unlived_vars;
- for (auto var : cfg_->LiveIn(op)) {
- if (cfg_->LiveOut(op).count(var) == 0) {
- unlived_vars.emplace(var);
+ cfg_->RenameVarInCFGGraph(var_name, cache_name, idx);
+ RenameVarInGraphDesc(var_name, cache_name, idx);
+ RenameVarInGraphNode(var_name, cache_name, idx, graph.get());
+ pool_.Erase(cache_name);
}
}
- for (auto var : unlived_vars) {
+ }
+ // fill the pool
+ for (auto var : cfg_->LiveIn(op)) {
+ if (cfg_->LiveOut(op).count(var) == 0) {
ir::Node* var_node = cfg_->GetNodeByName(var, op);
+ if (var_node == nullptr || var_node->IsCtrlVar()) continue;
if (NodeCanReused(var_node) && !pool_.Has(var_node)) {
pool_.Insert(var_node);
}
@@ -190,7 +194,8 @@ void MemoryOptimizePass::SubGraphOptimize(OpDesc* op_desc) const {
// effect. Because it is a single op in graph. No need to
// update the ir nodes.
sub_op_desc->Rename(var->Name(), cache->Name());
- if (sub_op_desc->Block()->HasVar(var->Name())) {
+ if (sub_op_desc->Block() != nullptr &&
+ sub_op_desc->Block()->HasVar(var->Name())) {
sub_op_desc->Block()->RemoveVar(var->Name());
}
}
@@ -231,7 +236,13 @@ void MemoryOptimizePass::RenameVarInGraphDesc(const std::string& var,
auto* op_desc = op->Op();
op_desc->RenameInput(var, cache_var);
op_desc->RenameOutput(var, cache_var);
- if (op_desc->Block()->HasVar(var)) op_desc->Block()->RemoveVar(var);
+ if (op_desc->Block() != nullptr) {
+ op_desc->Block()->RemoveVar(var);
+ } else {
+ LOG(WARNING) << "op " << op->Name() << " not know its block."
+ << "Is the op_desc created without block pointer? "
+ << "Can not find " << var << " in Block(0)";
+ }
op_desc->Flush();
}
}
@@ -273,8 +284,7 @@ void MemoryOptimizePass::RenameVarInGraphNode(const std::string& var,
// redirect the input to the latest version of cache_var
for (auto* node : op->inputs) {
if (node->Name() == var) {
- ir::Node* cache_node = graph->CreateVarNode(var_desc.get());
- var_nodes_[cache_var].emplace_back(cache_node);
+ ir::Node* cache_node = var_nodes_[cache_var].back();
// swap node to cache_node
cache_node->outputs.insert(cache_node->outputs.end(),
@@ -283,11 +293,15 @@ void MemoryOptimizePass::RenameVarInGraphNode(const std::string& var,
auto* prev_op = node->inputs[0];
std::replace(prev_op->outputs.begin(), prev_op->outputs.end(), node,
cache_node);
- cache_node->inputs.emplace_back(prev_op);
for (auto* next_op : node->outputs) {
std::replace(next_op->inputs.begin(), next_op->inputs.end(), node,
cache_node);
}
+
+ // erase unused node
+ auto& nodes = var_nodes_.at(var);
+ nodes.erase(std::remove(nodes.begin(), nodes.end(), node), nodes.end());
+ graph->RemoveNode(node);
}
}
@@ -307,15 +321,14 @@ void MemoryOptimizePass::RenameVarInGraphNode(const std::string& var,
std::replace(next_op->inputs.begin(), next_op->inputs.end(), node,
cache_node);
}
+
+ // erase unused node
+ auto& nodes = var_nodes_.at(var);
+ nodes.erase(std::remove(nodes.begin(), nodes.end(), node), nodes.end());
+ graph->RemoveNode(node);
}
}
}
-
- // release node of unused var in graph
- for (auto* node : var_nodes_[var]) {
- graph->RemoveNode(node);
- }
- var_nodes_.at(var).clear();
}
} // namespace details
diff --git a/paddle/fluid/framework/details/multi_devices_graph_pass.cc b/paddle/fluid/framework/details/multi_devices_graph_pass.cc
index 75f922d2cca6855a67be7284ae407e549a1a1afb..7d1e63f3682bca8965f6c5e695132dff44fa3715 100644
--- a/paddle/fluid/framework/details/multi_devices_graph_pass.cc
+++ b/paddle/fluid/framework/details/multi_devices_graph_pass.cc
@@ -392,20 +392,32 @@ void MultiDevSSAGraphBuilderBase::CreateComputationalOp(ir::Graph *result,
void MultiDevSSAGraphBuilderBase::CreateAllReduceOp(
ir::Graph *result, const std::string &og) const {
+ OpHandleBase *op_handle = nullptr;
+
+ auto append_allreduce_op = [&](
+ const std::vector &scopes,
+ const std::vector &places) -> OpHandleBase * {
#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
- result->Get(kGraphOps).emplace_back(new AllReduceOpHandle(
- result->CreateEmptyNode("allreduce", ir::Node::Type::kOperation),
- local_scopes_, places_, nccl_ctxs_));
+ result->Get(kGraphOps).emplace_back(new AllReduceOpHandle(
+ result->CreateEmptyNode("allreduce", ir::Node::Type::kOperation),
+ scopes, places, nccl_ctxs_));
#else
- result->Get(kGraphOps).emplace_back(new AllReduceOpHandle(
- result->CreateEmptyNode("allreduce", ir::Node::Type::kOperation),
- local_scopes_, places_));
+ result->Get(kGraphOps).emplace_back(new AllReduceOpHandle(
+ result->CreateEmptyNode("allreduce", ir::Node::Type::kOperation),
+ scopes, places));
#endif
- auto *op_handle = result->Get(kGraphOps).back();
+ return result->Get(kGraphOps).back();
+ };
+
+ if (!strategy_.enable_parallel_graph_)
+ op_handle = append_allreduce_op(local_scopes_, places_);
for (size_t i = 0; i < places_.size(); ++i) {
- auto &p = places_[i];
- SetCommunicationContext(op_handle, p);
+ if (strategy_.enable_parallel_graph_) {
+ op_handle = append_allreduce_op({local_scopes_[i]}, {places_[i]});
+ }
+
+ SetCommunicationContext(op_handle, places_[i]);
auto &vars = result->Get(kGraphVars)[i][og];
PADDLE_ENFORCE(!vars.empty());
auto &prev_grad = vars.back();
@@ -413,7 +425,7 @@ void MultiDevSSAGraphBuilderBase::CreateAllReduceOp(
auto var =
new VarHandle(result->CreateEmptyNode(og, ir::Node::Type::kVariable),
- vars.size(), i, og, p);
+ vars.size(), i, og, places_[i]);
vars.emplace_back(var);
op_handle->AddOutput(var);
}
diff --git a/paddle/fluid/framework/details/multi_devices_helper.h b/paddle/fluid/framework/details/multi_devices_helper.h
index 1a2b75fbc0c28984ce5cf00e0a2ce0f804349bb1..9afbb91005c9c3a9d2e185f4dfa901ebf812ee19 100644
--- a/paddle/fluid/framework/details/multi_devices_helper.h
+++ b/paddle/fluid/framework/details/multi_devices_helper.h
@@ -36,13 +36,14 @@ namespace details {
// map from variable name to variables. The variables, who have the same name,
// will have a differsent version. The offset in the
// `std::vector` is the version of varaibles.
-typedef std::vector>>
+typedef std::vector>>
GraphVars;
const char kGraphVars[] = "vars";
// aux variables to represent dependency. Useful to resolve data hazard.
-typedef std::unordered_set GraphDepVars;
+typedef std::unordered_set GraphDepVars;
const char kGraphDepVars[] = "dep_vars";
+
} // namespace details
} // namespace framework
} // namespace paddle
diff --git a/paddle/fluid/framework/details/op_handle_base.h b/paddle/fluid/framework/details/op_handle_base.h
index b1a82e8771b92f2d0af4a1c7732ff2da54d496a8..e0aa352e95bc3685a1f4879bffa6e86eecd7e7f9 100644
--- a/paddle/fluid/framework/details/op_handle_base.h
+++ b/paddle/fluid/framework/details/op_handle_base.h
@@ -70,6 +70,9 @@ class OpHandleBase {
auto it = dev_ctxes_.find(place);
return it != dev_ctxes_.end() ? it->second : nullptr;
}
+ const std::map &DeviceContext() {
+ return dev_ctxes_;
+ }
void SetDeviceContext(platform::Place place, platform::DeviceContext *ctx_) {
dev_ctxes_[place] = ctx_;
diff --git a/paddle/fluid/framework/details/parallel_ssa_graph_executor.cc b/paddle/fluid/framework/details/parallel_ssa_graph_executor.cc
index e8deb5bfc6c013addce11e2a04286a828acc1930..4c8f69c68ce17d0143c34e8adbab92cdc90058c8 100644
--- a/paddle/fluid/framework/details/parallel_ssa_graph_executor.cc
+++ b/paddle/fluid/framework/details/parallel_ssa_graph_executor.cc
@@ -13,22 +13,92 @@
// limitations under the License.
#include "paddle/fluid/framework/details/parallel_ssa_graph_executor.h"
+#include "paddle/fluid/framework/ir/graph_helper.h"
namespace paddle {
namespace framework {
namespace details {
+std::vector>
+ParallelSSAGraphExecutor::SeparateMultiDevicesGraph(
+ std::unique_ptr &&graph) {
+ std::vector> graphs;
+ graphs.reserve(places_.size());
+ for (size_t i = 0; i < places_.size(); ++i) {
+ ProgramDesc empty;
+ graphs.emplace_back(std::unique_ptr(new ir::Graph(empty)));
+ auto &g = graphs.back();
+ g->Set(kGraphVars, new GraphVars(1UL));
+ g->Set(kGraphDepVars, new GraphDepVars);
+ }
+ auto op_handles = ir::FilterByNodeWrapper(*graph);
+
+ for (auto &op : op_handles) {
+ auto &dev_ctx = op->DeviceContext();
+ auto &p = dev_ctx.begin()->first;
+ int dev_id = boost::get(p).device;
+ auto &dev_dummys = graphs[dev_id]->Get(kGraphDepVars);
+ graphs[dev_id]->AddNode(graph->RemoveNode(op->Node()).release());
+
+ for (auto &var : op->Inputs()) {
+ auto dummy_ptr = dynamic_cast(var);
+ if (dummy_ptr) {
+ dev_dummys.insert(var);
+ if (graph->Nodes().count(var->Node()))
+ graphs[dev_id]->AddNode(graph->RemoveNode(var->Node()).release());
+ }
+ }
+ for (auto &var : op->Outputs()) {
+ auto dummy_ptr = dynamic_cast(var);
+ if (dummy_ptr) {
+ dev_dummys.insert(var);
+ if (graph->Nodes().count(var->Node()))
+ graphs[dev_id]->AddNode(graph->RemoveNode(var->Node()).release());
+ }
+ }
+ }
+
+ for (size_t dev_id = 0; dev_id < places_.size(); ++dev_id) {
+ auto &dev_vars = graphs[dev_id]->Get(kGraphVars)[0];
+ auto &origin_vars = graph->Get(kGraphVars)[dev_id];
+ for (auto &name_pair : origin_vars) {
+ dev_vars.emplace(name_pair.first, name_pair.second);
+ for (auto &version_pair : name_pair.second) {
+ if (graph->Nodes().count(version_pair->Node())) {
+ graphs[dev_id]->AddNode(
+ graph->RemoveNode(version_pair->Node()).release());
+ }
+ }
+ }
+ }
+
+ return graphs;
+}
+
ParallelSSAGraphExecutor::ParallelSSAGraphExecutor(
const ExecutionStrategy &strategy, const std::vector &local_scopes,
const std::vector &places,
- std::vector> &&graphs)
+ const framework::ProgramDesc &main_prog, std::unique_ptr &&graph)
: strategy_(std::move(strategy)),
local_scopes_(std::move(local_scopes)),
pool_(places.size() >= 2 ? new ::ThreadPool(places.size()) : nullptr),
places_(std::move(places)),
- graphs_(std::move(graphs)) {
+ main_prog_(main_prog),
+ // TODO(Yancey1989): Copying graphs is not safely since it deleted the
+ // attrs.
+ graphs_(SeparateMultiDevicesGraph(std::move(graph))) {
PADDLE_ENFORCE_EQ(places_.size(), local_scopes_.size());
+ auto seq_allreduce_pass =
+ ir::PassRegistry::Instance().Get("all_reduce_deps_pass");
+ seq_allreduce_pass->Erase(details::kAllOpDescs);
+ seq_allreduce_pass->Set>(
+ details::kAllOpDescs,
+ new std::vector(main_prog_.Block(0).AllOps()));
+ for (size_t i = 0; i < graphs_.size(); ++i) {
+ graphs_[i] = seq_allreduce_pass->Apply(std::move(graphs_[i]));
+ }
+
// set the correct size of thread pool to each device.
strategy_.num_threads_ = strategy_.num_threads_ < places_.size()
? 1UL
@@ -37,7 +107,7 @@ ParallelSSAGraphExecutor::ParallelSSAGraphExecutor(
<< " to run the operators of the graph on each device.";
for (size_t i = 0; i < places.size(); ++i) {
executors_.emplace_back(new details::ThreadedSSAGraphExecutor(
- strategy_, {local_scopes_[i]}, {places_[i]}, std::move(graphs_[i])));
+ strategy_, local_scopes_, {places_[i]}, std::move(graphs_.at(i))));
}
}
diff --git a/paddle/fluid/framework/details/parallel_ssa_graph_executor.h b/paddle/fluid/framework/details/parallel_ssa_graph_executor.h
index c00c5bc2d1b4b78593f99c819b5a3d642150e773..1c35d45fdd356a867d1ad80b345379395e03172e 100644
--- a/paddle/fluid/framework/details/parallel_ssa_graph_executor.h
+++ b/paddle/fluid/framework/details/parallel_ssa_graph_executor.h
@@ -18,7 +18,9 @@
#include
#include "ThreadPool.h"
+#include "paddle/fluid/framework/details/multi_devices_helper.h"
#include "paddle/fluid/framework/details/threaded_ssa_graph_executor.h"
+#include "paddle/fluid/framework/ir/graph.h"
namespace paddle {
namespace framework {
@@ -29,17 +31,23 @@ class ParallelSSAGraphExecutor : public SSAGraphExecutor {
ParallelSSAGraphExecutor(const ExecutionStrategy &strategy,
const std::vector &local_scopes,
const std::vector &places,
- std::vector> &&graphs);
+ const framework::ProgramDesc &main_prog,
+ std::unique_ptr &&graph);
~ParallelSSAGraphExecutor() final = default;
+
const ir::Graph &Graph() const override { return *graphs_[0]; }
FeedFetchList Run(const std::vector &fetch_tensors) override;
private:
+ std::vector> SeparateMultiDevicesGraph(
+ std::unique_ptr &&graph);
+
ExecutionStrategy strategy_;
std::vector local_scopes_;
std::unique_ptr<::ThreadPool> pool_{nullptr};
std::vector places_;
+ framework::ProgramDesc main_prog_;
std::vector> graphs_;
std::vector> executors_;
diff --git a/paddle/fluid/framework/details/reduce_op_handle.cc b/paddle/fluid/framework/details/reduce_op_handle.cc
index ee4c8a6ecf77e5d0f23f38b763917d926afdb07a..4e2477c205db5966aa0b2d0c7a608be94a69eb82 100644
--- a/paddle/fluid/framework/details/reduce_op_handle.cc
+++ b/paddle/fluid/framework/details/reduce_op_handle.cc
@@ -139,7 +139,7 @@ void ReduceOpHandle::GatherSelectedRows(
#endif
void ReduceOpHandle::RunImpl() {
- platform::RecordEvent record_event(Name(), dev_ctxes_.cbegin()->second);
+ platform::RecordEvent record_event(Name());
if (places_.size() == 1) return;
// the input and output may have dummy var.
@@ -153,7 +153,7 @@ void ReduceOpHandle::RunImpl() {
{
auto out_var_handles = DynamicCast(outputs_);
- PADDLE_ENFORCE_EQ(out_var_handles.size(), 1,
+ PADDLE_ENFORCE_EQ(out_var_handles.size(), 1UL,
"The number of output should be one.");
out_var_handle = out_var_handles.front();
}
diff --git a/paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.cc b/paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.cc
index 91e4f9adb418978c30f512abe6924c0ace182124..7b13112986f9ad85056a3e8a5a6ed99bd0be95d5 100644
--- a/paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.cc
+++ b/paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.cc
@@ -63,7 +63,7 @@ FeedFetchList ScopeBufferedSSAGraphExecutor::Run(
eptr = std::current_exception();
}
- platform::RecordEvent e("ScopeBufferedSSAGraphExecutorAfterRun", nullptr);
+ platform::RecordEvent e("ScopeBufferedSSAGraphExecutorAfterRun");
++drop_scope_counter_;
bool stream_end = false;
diff --git a/paddle/fluid/framework/details/threaded_ssa_graph_executor.cc b/paddle/fluid/framework/details/threaded_ssa_graph_executor.cc
index 677a2937945b03fa577317cb4f26e09354d06957..72acc337b7cc4803fa010373f8817ff5fd25cb2c 100644
--- a/paddle/fluid/framework/details/threaded_ssa_graph_executor.cc
+++ b/paddle/fluid/framework/details/threaded_ssa_graph_executor.cc
@@ -37,7 +37,7 @@ ThreadedSSAGraphExecutor::ThreadedSSAGraphExecutor(
FeedFetchList ThreadedSSAGraphExecutor::Run(
const std::vector &fetch_tensors) {
std::unique_ptr event(
- new platform::RecordEvent("ThreadedSSAGraphExecutorPrepare", nullptr));
+ new platform::RecordEvent("ThreadedSSAGraphExecutorPrepare"));
std::unordered_map pending_ops;
std::unordered_set pending_vars;
auto ready_vars = std::make_shared>();
@@ -219,7 +219,7 @@ void ThreadedSSAGraphExecutor::RunOp(
VLOG(10) << op << " " << op->Name() << " Done ";
running_ops_--;
ready_var_q->Extend(op->Outputs());
- VLOG(10) << op << " " << op->Name() << "Signal posted";
+ VLOG(10) << op << " " << op->Name() << " Signal posted";
} catch (...) {
exception_holder_.Catch(std::current_exception());
}
diff --git a/paddle/fluid/framework/feed_fetch_method.cc b/paddle/fluid/framework/feed_fetch_method.cc
index 6338be75a4b1d3c4caf7a6f7add4d05fec690340..96530b2a3f9cfd9462627a42b2bb0fea98758f92 100644
--- a/paddle/fluid/framework/feed_fetch_method.cc
+++ b/paddle/fluid/framework/feed_fetch_method.cc
@@ -44,6 +44,7 @@ LoDTensor& GetFetchVariable(const Scope& scope, const std::string& var_name,
// Since we want to fetch LodTensor from a variable, the variable must
// be created alreadly.
Variable* g_fetch_value = scope.FindVar(var_name);
+ PADDLE_ENFORCE_NOT_NULL(g_fetch_value, "%s is not found.", var_name);
PADDLE_ENFORCE(g_fetch_value->IsType(),
"Only %s can be invoked by GetFetchVariable",
typeid(FeedFetchList).name());
diff --git a/paddle/fluid/framework/inplace_op_inference_test.cc b/paddle/fluid/framework/inplace_op_inference_test.cc
index 3e4d715c6f089496d1b1f7906e3f10147a073622..bf9d1dcd380cdff886301faf13b0015fd5a2ed5c 100644
--- a/paddle/fluid/framework/inplace_op_inference_test.cc
+++ b/paddle/fluid/framework/inplace_op_inference_test.cc
@@ -179,11 +179,11 @@ TEST(InferInplace, SingleOpInplaceInToOut) {
op->SetOutput("Out", {"test2_out"});
prog.MutableBlock(0)->Var("test2_a")->SetType(proto::VarType::LOD_TENSOR);
- prog.MutableBlock(0)->Var("test2_a")->SetShape({32, 64});
+ prog.MutableBlock(0)->Var("test2_a")->SetShape({32, 64, 128, 128});
prog.MutableBlock(0)->Var("test2_b")->SetType(proto::VarType::LOD_TENSOR);
prog.MutableBlock(0)->Var("test2_c")->SetType(proto::VarType::LOD_TENSOR);
prog.MutableBlock(0)->Var("test2_out");
- prog.MutableBlock(0)->Var("test2_out")->SetShape({32, 16});
+ prog.MutableBlock(0)->Var("test2_out")->SetShape({32, 16, 128, 128});
auto& infer_inplace = OpInfoMap::Instance().Get(op->Type()).infer_inplace_;
auto in_to_outs = infer_inplace(*op, op->Block());
@@ -201,11 +201,11 @@ TEST(InferInplace, SingleGradOpInplaceInToOut) {
op->SetOutput(GradVarName("X"), {"test2_a", "test2_b", "test2_c"});
prog.MutableBlock(0)->Var("test2_a")->SetType(proto::VarType::LOD_TENSOR);
- prog.MutableBlock(0)->Var("test2_a")->SetShape({32, 16});
+ prog.MutableBlock(0)->Var("test2_a")->SetShape({32, 16, 1024, 1024});
prog.MutableBlock(0)->Var("test2_b")->SetType(proto::VarType::LOD_TENSOR);
prog.MutableBlock(0)->Var("test2_c")->SetType(proto::VarType::LOD_TENSOR);
prog.MutableBlock(0)->Var("test2_out");
- prog.MutableBlock(0)->Var("test2_out")->SetShape({32, 16});
+ prog.MutableBlock(0)->Var("test2_out")->SetShape({32, 16, 1024, 1024});
auto& infer_inplace = OpInfoMap::Instance().Get(op->Type()).infer_inplace_;
auto in_to_outs = infer_inplace(*op, op->Block());
@@ -233,12 +233,12 @@ TEST(InferInplace, MultiOutInplaceInToOut) {
prog.MutableBlock(0)->Var("o0");
prog.MutableBlock(0)->Var("y0");
prog.MutableBlock(0)->Var("z0");
- prog.MutableBlock(0)->Var("a0")->SetShape({32, 16});
- prog.MutableBlock(0)->Var("b0")->SetShape({32, 16});
- prog.MutableBlock(0)->Var("c0")->SetShape({32, 16});
- prog.MutableBlock(0)->Var("o0")->SetShape({32, 16});
- prog.MutableBlock(0)->Var("y0")->SetShape({32, 16});
- prog.MutableBlock(0)->Var("z0")->SetShape({32, 16});
+ prog.MutableBlock(0)->Var("a0")->SetShape({32, 16, 1024, 1024});
+ prog.MutableBlock(0)->Var("b0")->SetShape({32, 16, 1024, 1024});
+ prog.MutableBlock(0)->Var("c0")->SetShape({32, 16, 1024, 1024});
+ prog.MutableBlock(0)->Var("o0")->SetShape({32, 16, 1024, 1024});
+ prog.MutableBlock(0)->Var("y0")->SetShape({32, 16, 1024, 1024});
+ prog.MutableBlock(0)->Var("z0")->SetShape({32, 16, 1024, 1024});
auto& infer_inplace = OpInfoMap::Instance().Get(op->Type()).infer_inplace_;
auto in_to_outs = infer_inplace(*op, op->Block());
@@ -267,12 +267,12 @@ TEST(InferInplace, MultiGradInplaceInToOut) {
prog.MutableBlock(0)->Var("o0");
prog.MutableBlock(0)->Var("y0");
prog.MutableBlock(0)->Var("z0");
- prog.MutableBlock(0)->Var("a0")->SetShape({32, 16});
- prog.MutableBlock(0)->Var("b0")->SetShape({32, 16});
- prog.MutableBlock(0)->Var("c0")->SetShape({32, 16});
- prog.MutableBlock(0)->Var("o0")->SetShape({32, 16});
- prog.MutableBlock(0)->Var("y0")->SetShape({32, 16});
- prog.MutableBlock(0)->Var("z0")->SetShape({32, 16});
+ prog.MutableBlock(0)->Var("a0")->SetShape({32, 16, 1024, 1024});
+ prog.MutableBlock(0)->Var("b0")->SetShape({32, 16, 1024, 1024});
+ prog.MutableBlock(0)->Var("c0")->SetShape({32, 16, 1024, 1024});
+ prog.MutableBlock(0)->Var("o0")->SetShape({32, 16, 1024, 1024});
+ prog.MutableBlock(0)->Var("y0")->SetShape({32, 16, 1024, 1024});
+ prog.MutableBlock(0)->Var("z0")->SetShape({32, 16, 1024, 1024});
auto& infer_inplace = OpInfoMap::Instance().Get(op->Type()).infer_inplace_;
auto in_to_outs = infer_inplace(*op, op->Block());
diff --git a/paddle/fluid/framework/ir/CMakeLists.txt b/paddle/fluid/framework/ir/CMakeLists.txt
index 07c2c970d4de3cecf03e4cf80e60e81e7a9595a8..25d9afbcc8b2bc89ec47654f0dba4cb838be55b0 100644
--- a/paddle/fluid/framework/ir/CMakeLists.txt
+++ b/paddle/fluid/framework/ir/CMakeLists.txt
@@ -102,6 +102,7 @@ cc_test(test_seqpool_concat_fuse_pass SRCS seqpool_concat_fuse_pass_tester.cc DE
cc_test(test_is_test_pass SRCS is_test_pass_tester.cc DEPS is_test_pass)
if (WITH_MKLDNN)
cc_test(test_depthwise_conv_mkldnn_pass SRCS mkldnn/depthwise_conv_mkldnn_pass_tester.cc DEPS depthwise_conv_mkldnn_pass)
+ cc_test(test_conv_bias_mkldnn_fuse_pass SRCS mkldnn/conv_bias_mkldnn_fuse_pass_tester.cc DEPS conv_bias_mkldnn_fuse_pass naive_executor)
cc_test(test_conv_relu_mkldnn_fuse_pass SRCS mkldnn/conv_relu_mkldnn_fuse_pass_tester.cc DEPS conv_relu_mkldnn_fuse_pass)
cc_test(test_conv_elementwise_add_mkldnn_fuse_pass SRCS mkldnn/conv_elementwise_add_mkldnn_fuse_pass_tester.cc DEPS conv_elementwise_add_mkldnn_fuse_pass)
endif ()
diff --git a/paddle/fluid/framework/ir/attention_lstm_fuse_pass.h b/paddle/fluid/framework/ir/attention_lstm_fuse_pass.h
index a756dfc1b98e1de55c809c73e2c4df1e628950ae..39b0585d3a6f9b52c9ec4b0a24f8532a3410851a 100644
--- a/paddle/fluid/framework/ir/attention_lstm_fuse_pass.h
+++ b/paddle/fluid/framework/ir/attention_lstm_fuse_pass.h
@@ -22,7 +22,8 @@ namespace ir {
class AttentionLSTMFusePass : public FusePassBase {
protected:
- std::unique_ptr ApplyImpl(std::unique_ptr graph) const;
+ std::unique_ptr ApplyImpl(
+ std::unique_ptr graph) const override;
};
} // namespace ir
diff --git a/paddle/fluid/framework/ir/conv_affine_channel_fuse_pass.h b/paddle/fluid/framework/ir/conv_affine_channel_fuse_pass.h
index ad966e11e6222a4ed4c730089c454b0d1c7bd0b3..8c3c8b56c08cc09e66b20d17bf730edec0499f35 100644
--- a/paddle/fluid/framework/ir/conv_affine_channel_fuse_pass.h
+++ b/paddle/fluid/framework/ir/conv_affine_channel_fuse_pass.h
@@ -31,7 +31,8 @@ class ConvAffineChannelFusePass : public FusePassBase {
virtual ~ConvAffineChannelFusePass() {}
protected:
- std::unique_ptr ApplyImpl(std::unique_ptr graph) const;
+ std::unique_ptr ApplyImpl(
+ std::unique_ptr graph) const override;
const std::string name_scope_{"conv_affine_channel_fuse"};
};
@@ -40,7 +41,8 @@ class ConvEltwiseAddAffineChannelFusePass : public FusePassBase {
virtual ~ConvEltwiseAddAffineChannelFusePass() {}
protected:
- std::unique_ptr ApplyImpl(std::unique_ptr graph) const;
+ std::unique_ptr ApplyImpl(
+ std::unique_ptr graph) const override;
const std::string name_scope_{"conv_eltwiseadd_affine_channel_fuse"};
};
diff --git a/paddle/fluid/framework/ir/conv_bn_fuse_pass.cc b/paddle/fluid/framework/ir/conv_bn_fuse_pass.cc
index 846a14e365e6bd7f056d409130a3b246371931da..04765dd1440331fb37ed2eb05a9ce762eb2b81bc 100644
--- a/paddle/fluid/framework/ir/conv_bn_fuse_pass.cc
+++ b/paddle/fluid/framework/ir/conv_bn_fuse_pass.cc
@@ -169,7 +169,7 @@ std::unique_ptr ConvBNFusePass::ApplyImpl(
if (has_bias && conv->Op()->Input("Bias").size() > 0) {
// reuse existing conv bias node
auto conv_bias_names = conv->Op()->Input("Bias");
- PADDLE_ENFORCE_EQ(conv_bias_names.size(), 1);
+ PADDLE_ENFORCE_EQ(conv_bias_names.size(), 1UL);
auto* conv_bias_var = scope->FindVar(conv_bias_names[0]);
auto* conv_bias_tensor = conv_bias_var->GetMutable();
PADDLE_ENFORCE_EQ(conv_bias_tensor->dims(),
diff --git a/paddle/fluid/framework/ir/conv_bn_fuse_pass.h b/paddle/fluid/framework/ir/conv_bn_fuse_pass.h
index 2c9eb574fe8e054e0ae221f08f664b91f05d95c9..cf425a2730904d4ab21c33e66b72db0692cb087c 100644
--- a/paddle/fluid/framework/ir/conv_bn_fuse_pass.h
+++ b/paddle/fluid/framework/ir/conv_bn_fuse_pass.h
@@ -31,7 +31,8 @@ class ConvBNFusePass : public FusePassBase {
virtual ~ConvBNFusePass() {}
protected:
- std::unique_ptr ApplyImpl(std::unique_ptr graph) const;
+ std::unique_ptr ApplyImpl(
+ std::unique_ptr graph) const override;
const std::string name_scope_{"conv_bn_fuse"};
};
@@ -40,7 +41,8 @@ class ConvEltwiseAddBNFusePass : public FusePassBase {
virtual ~ConvEltwiseAddBNFusePass() {}
protected:
- std::unique_ptr ApplyImpl(std::unique_ptr graph) const;
+ std::unique_ptr ApplyImpl(
+ std::unique_ptr graph) const override;
const std::string name_scope_{"conv_eltwiseadd_bn_fuse"};
};
diff --git a/paddle/fluid/framework/ir/conv_elementwise_add2_act_fuse_pass.h b/paddle/fluid/framework/ir/conv_elementwise_add2_act_fuse_pass.h
index 3b40a5a92665c07bc2b66e6a96721f573d40393f..9259a4ac5c89b1a7d1413fb2eaaa5fc6a70348f2 100644
--- a/paddle/fluid/framework/ir/conv_elementwise_add2_act_fuse_pass.h
+++ b/paddle/fluid/framework/ir/conv_elementwise_add2_act_fuse_pass.h
@@ -25,7 +25,8 @@ class ConvElementwiseAdd2ActFusePass : public FusePassBase {
virtual ~ConvElementwiseAdd2ActFusePass() {}
protected:
- std::unique_ptr ApplyImpl(std::unique_ptr graph) const;
+ std::unique_ptr ApplyImpl(
+ std::unique_ptr graph) const override;
};
} // namespace ir
diff --git a/paddle/fluid/framework/ir/conv_elementwise_add_act_fuse_pass.h b/paddle/fluid/framework/ir/conv_elementwise_add_act_fuse_pass.h
index ac69aa6458fc8c19b670dea2af1251c44dc353a8..9c0b50f155821cf2bd815a6fb087e3f6cc513641 100644
--- a/paddle/fluid/framework/ir/conv_elementwise_add_act_fuse_pass.h
+++ b/paddle/fluid/framework/ir/conv_elementwise_add_act_fuse_pass.h
@@ -25,7 +25,8 @@ class ConvElementwiseAddActFusePass : public FusePassBase {
virtual ~ConvElementwiseAddActFusePass() {}
protected:
- std::unique_ptr ApplyImpl(std::unique_ptr graph) const;
+ std::unique_ptr ApplyImpl(
+ std::unique_ptr graph) const override;
};
} // namespace ir
diff --git a/paddle/fluid/framework/ir/conv_elementwise_add_fuse_pass.h b/paddle/fluid/framework/ir/conv_elementwise_add_fuse_pass.h
index f234603f5856a9238164f7fb0e5cc81ea9b7ed60..bf43bd5ce2602a3e240c56f00f66f13b79151002 100644
--- a/paddle/fluid/framework/ir/conv_elementwise_add_fuse_pass.h
+++ b/paddle/fluid/framework/ir/conv_elementwise_add_fuse_pass.h
@@ -25,7 +25,8 @@ class ConvElementwiseAddFusePass : public FusePassBase {
virtual ~ConvElementwiseAddFusePass() {}
protected:
- std::unique_ptr ApplyImpl(std::unique_ptr graph) const;
+ std::unique_ptr ApplyImpl(
+ std::unique_ptr graph) const override;
};
} // namespace ir
diff --git a/paddle/fluid/framework/ir/embedding_fc_lstm_fuse_pass.h b/paddle/fluid/framework/ir/embedding_fc_lstm_fuse_pass.h
index e5ad3067ec4060e41f1464395f3fc76183de3e66..fde2a0a4eecdec9ad5ac58ad8e63c26cce482682 100644
--- a/paddle/fluid/framework/ir/embedding_fc_lstm_fuse_pass.h
+++ b/paddle/fluid/framework/ir/embedding_fc_lstm_fuse_pass.h
@@ -14,6 +14,8 @@
#pragma once
+#include
+
#include "paddle/fluid/framework/ir/fuse_pass_base.h"
#include "paddle/fluid/framework/ir/graph.h"
#include "paddle/fluid/framework/ir/graph_pattern_detector.h"
@@ -30,7 +32,8 @@ class EmbeddingFCLSTMFusePass : public FusePassBase {
virtual ~EmbeddingFCLSTMFusePass() {}
protected:
- std::unique_ptr ApplyImpl(std::unique_ptr graph) const;
+ std::unique_ptr ApplyImpl(
+ std::unique_ptr graph) const override;
const std::string name_scope_{"embedding_fc_lstm_fuse"};
};
diff --git a/paddle/fluid/framework/ir/fc_fuse_pass.h b/paddle/fluid/framework/ir/fc_fuse_pass.h
index 6c69539d1e48268afc2435f8f73b3818d13107cd..783a052edcf84c8c437a7b2e25f0d67c0366691e 100644
--- a/paddle/fluid/framework/ir/fc_fuse_pass.h
+++ b/paddle/fluid/framework/ir/fc_fuse_pass.h
@@ -12,6 +12,8 @@
// See the License for the specific language governing permissions and
// limitations under the License.
+#pragma once
+
#include "paddle/fluid/framework/ir/fuse_pass_base.h"
#include "paddle/fluid/framework/ir/graph.h"
#include "paddle/fluid/framework/ir/graph_pattern_detector.h"
@@ -29,7 +31,8 @@ class FCFusePass : public FusePassBase {
virtual ~FCFusePass() {}
protected:
- std::unique_ptr ApplyImpl(std::unique_ptr graph) const;
+ std::unique_ptr ApplyImpl(
+ std::unique_ptr graph) const override;
};
} // namespace ir
diff --git a/paddle/fluid/framework/ir/fc_gru_fuse_pass.h b/paddle/fluid/framework/ir/fc_gru_fuse_pass.h
index 63e1c72bfb2e2641ae5d44858b342d5e427e9045..e359a3289440fffbec622488ecf3a7f49e986574 100644
--- a/paddle/fluid/framework/ir/fc_gru_fuse_pass.h
+++ b/paddle/fluid/framework/ir/fc_gru_fuse_pass.h
@@ -30,7 +30,8 @@ class FCGRUFusePass : public FusePassBase {
virtual ~FCGRUFusePass() {}
protected:
- std::unique_ptr ApplyImpl(std::unique_ptr graph) const;
+ std::unique_ptr ApplyImpl(
+ std::unique_ptr graph) const override;
const std::string name_scope_{"fc_gru_fuse"};
};
@@ -41,7 +42,8 @@ class MulGRUFusePass : public FusePassBase {
virtual ~MulGRUFusePass() {}
protected:
- std::unique_ptr ApplyImpl(std::unique_ptr graph) const;
+ std::unique_ptr ApplyImpl(
+ std::unique_ptr graph) const override;
const std::string name_scope_{"fc_nobias_gru_fuse"};
};
diff --git a/paddle/fluid/framework/ir/fc_lstm_fuse_pass.h b/paddle/fluid/framework/ir/fc_lstm_fuse_pass.h
index 3ee32c63a46fcc34bdccd1e14d4bbaf9668c49e9..21482615a6efef930b7328594477a51f4aaf28e7 100644
--- a/paddle/fluid/framework/ir/fc_lstm_fuse_pass.h
+++ b/paddle/fluid/framework/ir/fc_lstm_fuse_pass.h
@@ -14,6 +14,8 @@
#pragma once
+#include
+
#include "paddle/fluid/framework/ir/fuse_pass_base.h"
#include "paddle/fluid/framework/ir/graph.h"
#include "paddle/fluid/framework/ir/graph_pattern_detector.h"
@@ -30,7 +32,8 @@ class FCLstmFusePass : public FusePassBase {
virtual ~FCLstmFusePass() {}
protected:
- std::unique_ptr ApplyImpl(std::unique_ptr graph) const;
+ std::unique_ptr ApplyImpl(
+ std::unique_ptr graph) const override;
const std::string name_scope_{"fc_lstm_fuse"};
};
@@ -40,7 +43,8 @@ class MulLstmFusePass : public FusePassBase {
virtual ~MulLstmFusePass() {}
protected:
- std::unique_ptr ApplyImpl(std::unique_ptr graph) const;
+ std::unique_ptr ApplyImpl(
+ std::unique_ptr graph) const override;
const std::string name_scope_{"fc_nobias_lstm_fuse"};
};
diff --git a/paddle/fluid/framework/ir/fuse_elewise_add_act_pass.h b/paddle/fluid/framework/ir/fuse_elewise_add_act_pass.h
index b2fecc076efca333539fe81e67eee222873aee2a..0fee5274478e8b8db852774077ff5979f0aaba25 100644
--- a/paddle/fluid/framework/ir/fuse_elewise_add_act_pass.h
+++ b/paddle/fluid/framework/ir/fuse_elewise_add_act_pass.h
@@ -32,7 +32,8 @@ class FuseElewiseAddActPass : public FusePassBase {
virtual ~FuseElewiseAddActPass() {}
protected:
- std::unique_ptr ApplyImpl(std::unique_ptr graph) const;
+ std::unique_ptr ApplyImpl(
+ std::unique_ptr graph) const override;
std::unique_ptr FuseElewiseAddAct(
std::unique_ptr graph,
diff --git a/paddle/fluid/framework/ir/fuse_relu_depthwise_conv_pass.cc b/paddle/fluid/framework/ir/fuse_relu_depthwise_conv_pass.cc
index 0d94008ea82d0e09732d4b6448fdded94b60733c..fe844caed2e757fb080dcee398c8903b929b06e5 100644
--- a/paddle/fluid/framework/ir/fuse_relu_depthwise_conv_pass.cc
+++ b/paddle/fluid/framework/ir/fuse_relu_depthwise_conv_pass.cc
@@ -111,7 +111,7 @@ std::unique_ptr FuseReluDepthwiseConvPass::FuseReluDepthwiseConv(
xg_var = subgraph.at(xg)->Var();
}
- PADDLE_ENFORCE_EQ(layer_op->Input("Input").size(), 1);
+ PADDLE_ENFORCE_EQ(layer_op->Input("Input").size(), 1UL);
PADDLE_ENFORCE_EQ(layer_op->Input("Input")[0], y_var->Name());
layer_op->SetInput("Input", {x_var->Name()});
subgraph.at(layer)->inputs.push_back(subgraph.at(x));
@@ -119,13 +119,13 @@ std::unique_ptr FuseReluDepthwiseConvPass::FuseReluDepthwiseConv(
VLOG(4) << "replace " << y_var->Name() << " -> " << x_var->Name();
if (!only_forward) {
- PADDLE_ENFORCE_EQ(layer_g_op->Input("Input").size(), 1);
+ PADDLE_ENFORCE_EQ(layer_g_op->Input("Input").size(), 1UL);
PADDLE_ENFORCE_EQ(layer_g_op->Input("Input")[0], y_var->Name());
layer_g_op->SetInput("Input", {x_var->Name()});
subgraph.at(layer_g)->inputs.push_back(subgraph.at(x));
subgraph.at(x)->outputs.push_back(subgraph.at(layer_g));
- PADDLE_ENFORCE_EQ(layer_g_op->Output(GradVarName("Input")).size(), 1);
+ PADDLE_ENFORCE_EQ(layer_g_op->Output(GradVarName("Input")).size(), 1UL);
PADDLE_ENFORCE_EQ(layer_g_op->Output(GradVarName("Input"))[0],
yg_var->Name());
layer_g_op->SetOutput(GradVarName("Input"), {xg_var->Name()});
diff --git a/paddle/fluid/framework/ir/fuse_relu_depthwise_conv_pass.h b/paddle/fluid/framework/ir/fuse_relu_depthwise_conv_pass.h
index 6bd653775e42c1ee16051e205e9fa9888ea05eaa..efb49b8300e677f17d9e205800d837b88edfd2e9 100644
--- a/paddle/fluid/framework/ir/fuse_relu_depthwise_conv_pass.h
+++ b/paddle/fluid/framework/ir/fuse_relu_depthwise_conv_pass.h
@@ -32,7 +32,8 @@ class FuseReluDepthwiseConvPass : public FusePassBase {
virtual ~FuseReluDepthwiseConvPass() {}
protected:
- std::unique_ptr ApplyImpl(std::unique_ptr graph) const;
+ std::unique_ptr ApplyImpl(
+ std::unique_ptr graph) const override;
std::unique_ptr FuseReluDepthwiseConv(
std::unique_ptr graph, bool only_forward) const;
};
diff --git a/paddle/fluid/framework/ir/graph.h b/paddle/fluid/framework/ir/graph.h
index feb3330176490e71c51cce826fe49d5499469ad7..296f3b83961c1379ee2c1237aa15784791b46878 100644
--- a/paddle/fluid/framework/ir/graph.h
+++ b/paddle/fluid/framework/ir/graph.h
@@ -26,6 +26,14 @@ limitations under the License. */
namespace paddle {
namespace framework {
+
+namespace details {
+
+// This attr is not recommended, because the graph should not dependence
+// the program once it is built.
+constexpr char kAllOpDescs[] = "all_op_descs";
+} // namespace details
+
namespace ir {
/*
@@ -168,10 +176,13 @@ class Graph {
return ret;
}
- void RemoveNode(ir::Node *node) {
+ std::unique_ptr RemoveNode(ir::Node *node) {
PADDLE_ENFORCE(node_set_.find(node) != node_set_.end());
- node_set_.erase(node);
+ std::unique_ptr ret;
+ ret.reset(nodes_.at(node).release());
nodes_.erase(node);
+ node_set_.erase(node);
+ return ret;
}
// NOTE low performance, but simple and secure.
@@ -184,13 +195,6 @@ class Graph {
return nullptr;
}
- void ResolveHazard(
- const std::map> &var_nodes);
-
- private:
- std::map> InitFromProgram(
- const ProgramDesc &program);
-
// This method takes ownership of `node`.
ir::Node *AddNode(ir::Node *node) {
PADDLE_ENFORCE(node_set_.find(node) == node_set_.end());
@@ -199,6 +203,13 @@ class Graph {
return node;
}
+ void ResolveHazard(
+ const std::map> &var_nodes);
+
+ private:
+ std::map> InitFromProgram(
+ const ProgramDesc &program);
+
// NOTE: program_ shouldn't be exposed to user.
const ProgramDesc program_;
std::map attrs_;
diff --git a/paddle/fluid/framework/ir/graph_pattern_detector.cc b/paddle/fluid/framework/ir/graph_pattern_detector.cc
index 9ea0729e1f3339c2f17371ecc8fa51325b9629bb..c0c34d186b00814fe6c6fd42beb78133233a1357 100644
--- a/paddle/fluid/framework/ir/graph_pattern_detector.cc
+++ b/paddle/fluid/framework/ir/graph_pattern_detector.cc
@@ -38,7 +38,7 @@ size_t PDPattern::id_ = 0UL;
PDNode *PDPattern::NewNode(const std::string &name) {
if (!name.empty()) {
- PADDLE_ENFORCE_EQ(node_map_.count(name), 0,
+ PADDLE_ENFORCE_EQ(node_map_.count(name), 0UL,
"PDNode's name should be unique, get duplicate [%s]",
name);
}
@@ -51,7 +51,7 @@ PDNode *PDPattern::NewNode(const std::string &name) {
PDNode *PDPattern::NewNode(PDNode::teller_t &&teller, const std::string &name) {
if (!name.empty()) {
- PADDLE_ENFORCE_EQ(node_map_.count(name), 0,
+ PADDLE_ENFORCE_EQ(node_map_.count(name), 0UL,
"PDNode's name should be unique, get duplicate [%s]",
name);
}
diff --git a/paddle/fluid/framework/ir/identity_scale_op_clean_pass.cc b/paddle/fluid/framework/ir/identity_scale_op_clean_pass.cc
index 3b738aa159ebfd77f00c9e532fbd94542e2097db..5bdc0c5faed7131b873edf9b43c847c010b6e3f3 100644
--- a/paddle/fluid/framework/ir/identity_scale_op_clean_pass.cc
+++ b/paddle/fluid/framework/ir/identity_scale_op_clean_pass.cc
@@ -38,9 +38,13 @@ std::unique_ptr IdentityScaleOpCleanPass::ApplyImpl(
->assert_is_op("scale")
->assert_op_attr("scale", 1.)
->assert_op_attr("bias", 0.);
- auto scale_out = detector.mutable_pattern()
- ->NewNode("scale_out")
- ->assert_is_op_output("scale");
+ auto scale_out =
+ detector.mutable_pattern()
+ ->NewNode("scale_out")
+ ->assert_is_op_output("scale")
+ // scale's output var should has only one consumer, or it can't be
+ // removed.
+ ->assert_more([](Node* x) { return x->outputs.size() == 1UL; });
pre_op->LinksTo({scale_in});
scale_op->LinksFrom({scale_in}).LinksTo({scale_out});
diff --git a/paddle/fluid/framework/ir/identity_scale_op_clean_pass.h b/paddle/fluid/framework/ir/identity_scale_op_clean_pass.h
index 50a654d82f0e4fb7e8e91c665397716407e6d2a5..6da592561da1e4046acbfd86c04862f69b7a97a8 100644
--- a/paddle/fluid/framework/ir/identity_scale_op_clean_pass.h
+++ b/paddle/fluid/framework/ir/identity_scale_op_clean_pass.h
@@ -22,7 +22,8 @@ namespace ir {
class IdentityScaleOpCleanPass : public FusePassBase {
protected:
- std::unique_ptr ApplyImpl(std::unique_ptr graph) const;
+ std::unique_ptr ApplyImpl(
+ std::unique_ptr graph) const override;
private:
virtual ~IdentityScaleOpCleanPass() = default;
diff --git a/paddle/fluid/framework/ir/infer_clean_graph_pass.cc b/paddle/fluid/framework/ir/infer_clean_graph_pass.cc
index 7713ed1eab88ee4fa16d52e7425075ae66f721a3..6607c026a748576f38419b275d71217f3eee0c59 100644
--- a/paddle/fluid/framework/ir/infer_clean_graph_pass.cc
+++ b/paddle/fluid/framework/ir/infer_clean_graph_pass.cc
@@ -37,6 +37,7 @@ class InferCleanGraphPass : public FusePassBase {
std::unordered_set invalid_nodes;
int valid_op = 0;
for (auto* node : graph->Nodes()) {
+ PADDLE_ENFORCE_NOT_NULL(node);
if (is_valid_node(node)) {
invalid_nodes.insert(node);
} else if (node->IsOp()) {
diff --git a/paddle/fluid/framework/ir/lock_free_optimize_pass.h b/paddle/fluid/framework/ir/lock_free_optimize_pass.h
index 7310f596f8a3170e84840be4bab8390b780b6577..f9157b10d9554092a5da6a6f73ecf7ceac1430dd 100644
--- a/paddle/fluid/framework/ir/lock_free_optimize_pass.h
+++ b/paddle/fluid/framework/ir/lock_free_optimize_pass.h
@@ -60,7 +60,8 @@ class LockFreeOptimizePass : public Pass {
virtual ~LockFreeOptimizePass() {}
protected:
- std::unique_ptr ApplyImpl(std::unique_ptr graph) const;
+ std::unique_ptr ApplyImpl(
+ std::unique_ptr graph) const override;
private:
// Create a new sgd node via current optimizer node
diff --git a/paddle/fluid/framework/ir/mkldnn/conv_bias_mkldnn_fuse_pass.h b/paddle/fluid/framework/ir/mkldnn/conv_bias_mkldnn_fuse_pass.h
index f3ad9f1c2bf14db418629e0c607e2510f01908b8..0ef5c177bf98b354bb18fc1d2ec8e5bef4b58951 100644
--- a/paddle/fluid/framework/ir/mkldnn/conv_bias_mkldnn_fuse_pass.h
+++ b/paddle/fluid/framework/ir/mkldnn/conv_bias_mkldnn_fuse_pass.h
@@ -29,7 +29,8 @@ class ConvBiasFusePass : public FusePassBase {
virtual bool is_conv3d() const { return false; }
protected:
- std::unique_ptr ApplyImpl(std::unique_ptr graph) const;
+ std::unique_ptr ApplyImpl(
+ std::unique_ptr graph) const override;
const std::string name_scope_{"conv_bias_mkldnn_fuse"};
};
/*
diff --git a/paddle/fluid/framework/ir/mkldnn/conv_bias_mkldnn_fuse_pass_tester.cc b/paddle/fluid/framework/ir/mkldnn/conv_bias_mkldnn_fuse_pass_tester.cc
new file mode 100644
index 0000000000000000000000000000000000000000..38b7fe52037c1a264e4251b7a54ef7569ee6d765
--- /dev/null
+++ b/paddle/fluid/framework/ir/mkldnn/conv_bias_mkldnn_fuse_pass_tester.cc
@@ -0,0 +1,151 @@
+// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/fluid/framework/ir/mkldnn/conv_bias_mkldnn_fuse_pass.h"
+#include "paddle/fluid/framework/naive_executor.h"
+#include "paddle/fluid/platform/place.h"
+
+#include
+#include "paddle/fluid/framework/op_proto_maker.h"
+
+namespace paddle {
+namespace framework {
+namespace ir {
+
+void SetOp(ProgramDesc* prog, const std::string& type, const std::string& name,
+ const std::vector& inputs,
+ const std::vector& outputs) {
+ auto* op = prog->MutableBlock(0)->AppendOp();
+ op->SetType(type);
+ if (type == "conv2d") {
+ op->SetAttr("use_mkldnn", true);
+ op->SetAttr("name", name);
+ op->SetInput("Input", {inputs[0]});
+ op->SetInput("Filter", {inputs[1]});
+ if (inputs.size() > 2)
+ op->SetInput("Bias", {inputs[2]});
+ else
+ op->SetInput("Bias", {});
+ } else if (type == "elementwise_add") {
+ op->SetAttr("use_mkldnn", true);
+ op->SetInput("X", {inputs[0]});
+ op->SetInput("Y", {inputs[1]});
+ }
+ op->SetOutput("Out", outputs);
+ op->SetAttr(OpProtoAndCheckerMaker::OpRoleAttrName(),
+ static_cast(OpRole::kForward));
+}
+
+// (c, weights)->conv->f
+// (f)->elementwise_add->g
+ProgramDesc BuildProgramDesc(bool convWithExistingBias) {
+ ProgramDesc prog;
+ std::vector nodes{"c", "weights", "f", "eltwise_bias", "g"};
+ if (convWithExistingBias) nodes.push_back("conv_bias");
+ for (auto& v : nodes) {
+ auto* var = prog.MutableBlock(0)->Var(v);
+ var->SetType(proto::VarType::LOD_TENSOR);
+ if (v == "weights" || v == "conv_bias" || v == "eltwise_bias") {
+ var->SetPersistable(true);
+ }
+ }
+
+ // conv+bias, both with MKL-DNN
+ if (convWithExistingBias) {
+ SetOp(&prog, "conv2d", "conv",
+ std::vector({"c", "weights", "conv_bias"}),
+ std::vector({"f"}));
+ } else {
+ SetOp(&prog, "conv2d", "conv", std::vector({"c", "weights"}),
+ std::vector({"f"}));
+ }
+ SetOp(&prog, "elementwise_add", "eltwise",
+ std::vector({"f", "eltwise_bias"}),
+ std::vector({"g"}));
+
+ return prog;
+}
+
+void InitTensorHolder(Scope* scope, const paddle::platform::Place& place,
+ const char* var_name) {
+ auto x = scope->Var(var_name);
+ auto tensor = x->GetMutable();
+ tensor->mutable_data(place, proto::VarType::FP32,
+ ::paddle::memory::Allocator::kDefault, 1);
+}
+
+void MainTest(bool convWithExistingBias) {
+ auto prog = BuildProgramDesc(convWithExistingBias);
+ std::unique_ptr graph(new ir::Graph(prog));
+ auto place = paddle::platform::CPUPlace();
+ NaiveExecutor exe{place};
+ Scope scope;
+ // Init scope, as it is used in pass
+ exe.CreateVariables(prog, 0, true, &scope);
+ if (convWithExistingBias) {
+ InitTensorHolder(&scope, place, "conv_bias");
+ InitTensorHolder(&scope, place, "eltwise_bias");
+ }
+ graph->Set(kParamScopeAttr, new framework::Scope*(&scope));
+
+ auto pass = PassRegistry::Instance().Get("conv_bias_mkldnn_fuse_pass");
+
+ int original_nodes_num = graph->Nodes().size();
+
+ graph = pass->Apply(std::move(graph));
+
+ int current_nodes_num = graph->Nodes().size();
+
+ // Remove 3 Nodes: Conv, Bias, conv_out
+ // Add 1 Node: ConvBias
+ EXPECT_EQ(original_nodes_num - 2, current_nodes_num);
+
+ // Assert conv_bias op in newly generated graph
+ int conv_bias_count = 0;
+
+ for (auto* node : graph->Nodes()) {
+ if (node->IsOp() && node->Op()->Type() == "conv2d") {
+ auto* op = node->Op();
+ ASSERT_TRUE(op->HasAttr("use_mkldnn"));
+ EXPECT_TRUE(boost::get(op->GetAttr("use_mkldnn")));
+ // check if "conv" convolution is fused
+ auto op_name = boost::get(op->GetAttr("name"));
+ if (op_name == "conv") {
+ auto input_names = op->InputNames();
+ ASSERT_TRUE(std::find(input_names.begin(), input_names.end(), "Bias") !=
+ input_names.end());
+ auto bias = boost::get>(op->Input("Bias"));
+ if (bias.size()) {
+ ++conv_bias_count;
+ }
+ }
+ }
+ }
+ EXPECT_EQ(conv_bias_count, 1);
+}
+
+TEST(ConvBiasFusePass, bias_free_conv) { MainTest(false); }
+
+TEST(ConvBiasFusePass, conv_with_existing_bias) { MainTest(true); }
+
+TEST(ConvBiasFusePass, conv3d) {
+ Conv3DBiasFusePass pass;
+ ASSERT_TRUE(pass.is_conv3d());
+}
+
+} // namespace ir
+} // namespace framework
+} // namespace paddle
+
+USE_PASS(conv_bias_mkldnn_fuse_pass);
diff --git a/paddle/fluid/framework/ir/repeated_fc_relu_fuse_pass.h b/paddle/fluid/framework/ir/repeated_fc_relu_fuse_pass.h
index 3f3f0846eba1201e57a653f8e515c28d2bcdd5e3..ede0bea07ff4130a0f6b3d21d6e34222a5013170 100644
--- a/paddle/fluid/framework/ir/repeated_fc_relu_fuse_pass.h
+++ b/paddle/fluid/framework/ir/repeated_fc_relu_fuse_pass.h
@@ -31,7 +31,8 @@ class RepeatedFCReluFusePass : public FusePassBase {
virtual ~RepeatedFCReluFusePass() {}
protected:
- std::unique_ptr ApplyImpl(std::unique_ptr graph) const;
+ std::unique_ptr ApplyImpl(
+ std::unique_ptr graph) const override;
const std::string name_scope_{"repeated_fc_relu_fuse"};
};
diff --git a/paddle/fluid/framework/ir/seq_concat_fc_fuse_pass.h b/paddle/fluid/framework/ir/seq_concat_fc_fuse_pass.h
index 9f5fd1a29adf918806d8f30097d8c7f002f48f3e..06e18f9dc327bf2ffaf8d2ab64edcbddea2eb04c 100644
--- a/paddle/fluid/framework/ir/seq_concat_fc_fuse_pass.h
+++ b/paddle/fluid/framework/ir/seq_concat_fc_fuse_pass.h
@@ -12,6 +12,8 @@
// See the License for the specific language governing permissions and
// limitations under the License.
+#pragma once
+
#include "paddle/fluid/framework/ir/fuse_pass_base.h"
#include "paddle/fluid/framework/ir/graph.h"
#include "paddle/fluid/framework/ir/pass.h"
@@ -25,7 +27,8 @@ class SeqConcatFcFusePass : public FusePassBase {
virtual ~SeqConcatFcFusePass() {}
protected:
- std::unique_ptr ApplyImpl(std::unique_ptr graph) const;
+ std::unique_ptr ApplyImpl(
+ std::unique_ptr graph) const override;
};
} // namespace ir
diff --git a/paddle/fluid/framework/ir/seqconv_eltadd_relu_fuse_pass.h b/paddle/fluid/framework/ir/seqconv_eltadd_relu_fuse_pass.h
index dac9de71930c1768bdf416520caae6468449cd3d..c36c6b76a238dd21eb0c9308e780761aa9e4e27a 100644
--- a/paddle/fluid/framework/ir/seqconv_eltadd_relu_fuse_pass.h
+++ b/paddle/fluid/framework/ir/seqconv_eltadd_relu_fuse_pass.h
@@ -28,7 +28,8 @@ class SeqConvEltAddReluFusePass : public FusePassBase {
virtual ~SeqConvEltAddReluFusePass() {}
protected:
- std::unique_ptr ApplyImpl(std::unique_ptr graph) const;
+ std::unique_ptr ApplyImpl(
+ std::unique_ptr graph) const override;
const std::string name_scope_{"seqconv_eltadd_relu_fuse"};
};
diff --git a/paddle/fluid/framework/ir/seqpool_concat_fuse_pass.h b/paddle/fluid/framework/ir/seqpool_concat_fuse_pass.h
index ba2154045e62c687173565c5ad30ea4d45d3c8f4..a5db3528da36ad08bb7f4d2765ee78222c569a5c 100644
--- a/paddle/fluid/framework/ir/seqpool_concat_fuse_pass.h
+++ b/paddle/fluid/framework/ir/seqpool_concat_fuse_pass.h
@@ -42,7 +42,8 @@ class SeqPoolConcatFusePass : public FusePassBase {
virtual ~SeqPoolConcatFusePass() {}
protected:
- std::unique_ptr ApplyImpl(std::unique_ptr graph) const;
+ std::unique_ptr ApplyImpl(
+ std::unique_ptr graph) const override;
const std::string name_scope_{"seqpool_concat_fuse"};
};
diff --git a/paddle/fluid/framework/ir/seqpool_concat_fuse_pass_tester.cc b/paddle/fluid/framework/ir/seqpool_concat_fuse_pass_tester.cc
index 456a03192cc4e4a9d0dbe2dcb649b6c1b4d9cd5a..35d1d5129bba7043026e5489b806480775473257 100644
--- a/paddle/fluid/framework/ir/seqpool_concat_fuse_pass_tester.cc
+++ b/paddle/fluid/framework/ir/seqpool_concat_fuse_pass_tester.cc
@@ -164,7 +164,7 @@ ProgramDesc BuildProgramDesc(int num_inputs_of_concat) {
};
std::vector concat_inputs;
for (int i = 0; i < num_inputs_of_concat; ++i) {
- std::string prefix = "seqpool_op_" + i;
+ std::string prefix = "seqpool_op_" + std::to_string(i);
new_var(prefix + "in");
new_var(prefix + "out");
new_var(prefix + "out_unused");
diff --git a/paddle/fluid/framework/ir/squared_mat_sub_fuse_pass.h b/paddle/fluid/framework/ir/squared_mat_sub_fuse_pass.h
index fb49adc3768ec99cab4321c6b90c93dfed6d32f2..c21ba65c40a8d54c315ab347e5a8a3266a143779 100644
--- a/paddle/fluid/framework/ir/squared_mat_sub_fuse_pass.h
+++ b/paddle/fluid/framework/ir/squared_mat_sub_fuse_pass.h
@@ -31,7 +31,8 @@ class SquaredMatSubFusePass : public FusePassBase {
virtual ~SquaredMatSubFusePass() {}
protected:
- std::unique_ptr ApplyImpl(std::unique_ptr graph) const;
+ std::unique_ptr ApplyImpl(
+ std::unique_ptr graph) const override;
const std::string name_scope_{"squared_mat_sub_fuse"};
};
diff --git a/paddle/fluid/framework/ir/transpose_flatten_concat_fuse_pass.h b/paddle/fluid/framework/ir/transpose_flatten_concat_fuse_pass.h
index fb0f0ae9efdc5a25a799d6123fa658a99860cd86..a7d18ec86da1c02aef84c25c378691eb8f651015 100644
--- a/paddle/fluid/framework/ir/transpose_flatten_concat_fuse_pass.h
+++ b/paddle/fluid/framework/ir/transpose_flatten_concat_fuse_pass.h
@@ -30,7 +30,8 @@ class TransposeFlattenConcatFusePass : public FusePassBase {
virtual ~TransposeFlattenConcatFusePass() {}
protected:
- std::unique_ptr ApplyImpl(std::unique_ptr graph) const;
+ std::unique_ptr ApplyImpl(
+ std::unique_ptr graph) const override;
};
} // namespace ir
diff --git a/paddle/fluid/framework/op_proto_maker.h b/paddle/fluid/framework/op_proto_maker.h
index 0a0f8f4655bc34cdb25205ff6eaec9f96c801ebd..5f3ce60e1d94ea4078cf0b709df362bad317f621 100644
--- a/paddle/fluid/framework/op_proto_maker.h
+++ b/paddle/fluid/framework/op_proto_maker.h
@@ -27,7 +27,7 @@ enum class OpRole {
kForward = 0x0000,
kBackward = 0x0001,
kOptimize = 0x0002,
- // RPC role is for send/recv releated op
+ // RPC role is for send/recv related op
kRPC = 0x0004,
// Dist role is for split_byref/split_selected_rows/concat
// used for distributed training.
diff --git a/paddle/fluid/framework/operator.cc b/paddle/fluid/framework/operator.cc
index b22523e0f426942333640d9e8b14dcf70ce4a8dc..9a0348871b050278da2ad07ac6992188a702da42 100644
--- a/paddle/fluid/framework/operator.cc
+++ b/paddle/fluid/framework/operator.cc
@@ -177,9 +177,7 @@ void OperatorBase::Run(const Scope& scope, const platform::Place& place) {
// in concurrency scenerio. Here use an `if` to fix this issue.
// Please not remove the `if`, ask @Superjomn if there are any concern.
if (platform::IsProfileEnabled()) {
- platform::DeviceContextPool& pool =
- platform::DeviceContextPool::Instance();
- platform::RecordEvent record_event(Type(), pool.Get(place));
+ platform::RecordEvent record_event(Type());
RunImpl(scope, place);
} else {
RunImpl(scope, place);
@@ -989,11 +987,14 @@ void OperatorWithKernel::TransferInplaceVarsBack(
const Scope& transfer_scope) const {
for (auto& var_name : inplace_vars) {
VLOG(3) << "share inplace var " + var_name + " back to it's original scope";
+ auto* origin_var = scope.FindVar(var_name);
+ PADDLE_ENFORCE_NOT_NULL(origin_var, "The var[%s] should not be nullptr.",
+ var_name);
auto* original_tensor =
- GetMutableLoDTensorOrSelectedRowsValueFromVar(scope.FindVar(var_name));
+ GetMutableLoDTensorOrSelectedRowsValueFromVar(origin_var);
auto* var = transfer_scope.FindVar(var_name);
- PADDLE_ENFORCE(var != nullptr, "The var[%s] should not be nullptr",
- var_name);
+ PADDLE_ENFORCE_NOT_NULL(var, "The var[%s] should not be nullptr.",
+ var_name);
auto* transformed_tensor = GetLoDTensorOrSelectedRowsValueFromVar(*var);
original_tensor->ShareDataWith(*transformed_tensor);
}
diff --git a/paddle/fluid/framework/parallel_executor.cc b/paddle/fluid/framework/parallel_executor.cc
index ff7ef0cce2f12fe89dd087c4a5006b2cfdc5a4a9..56da5660095affa0ba49d8bc533d1da01ffd18be 100644
--- a/paddle/fluid/framework/parallel_executor.cc
+++ b/paddle/fluid/framework/parallel_executor.cc
@@ -21,6 +21,7 @@ limitations under the License. */
#include "paddle/fluid/framework/ir/graph.h"
+#include "paddle/fluid/framework/details/all_reduce_deps_pass.h"
#include "paddle/fluid/framework/details/fast_threaded_ssa_graph_executor.h"
#include "paddle/fluid/framework/details/multi_devices_helper.h"
#include "paddle/fluid/framework/details/parallel_ssa_graph_executor.h"
@@ -193,7 +194,6 @@ ParallelExecutor::ParallelExecutor(
member_->use_all_reduce_ =
build_strategy.reduce_ == BuildStrategy::ReduceStrategy::kAllReduce;
member_->nranks_ = build_strategy.num_trainers_ * places.size();
-
if (!member_->use_all_reduce_) {
PADDLE_ENFORCE(places.size() > 1,
"If you set build_strategy.reduce with 'Reduce',"
@@ -221,9 +221,10 @@ ParallelExecutor::ParallelExecutor(
// choice the execution strategy.
build_strategy.enable_parallel_graph_ =
EnableParallelGraphExecution(main_program, exec_strategy, build_strategy);
-
- VLOG(1) << "Enable ParallelGraph Execution: "
- << build_strategy.enable_parallel_graph_;
+ if (build_strategy.enable_parallel_graph_)
+ VLOG(0) << "The Executor would execute the graph by ParallelGraph "
+ "Execution which can get better performance,"
+ << "you can force it off by env FLAGS_enable_parallel_graph=0";
if (member_->use_cuda_) {
// Bcast Parameters to all GPUs
@@ -257,60 +258,44 @@ ParallelExecutor::ParallelExecutor(
// Step 2. Convert main_program to SSA form and dependency graph. Also, insert
// ncclOp
- std::vector> graphs;
+ std::unique_ptr graph;
#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
- if (build_strategy.enable_parallel_graph_) {
- for (size_t i = 0; i < member_->places_.size(); ++i) {
- std::unique_ptr graph = build_strategy.Apply(
- main_program, {member_->places_[i]}, loss_var_name,
- {member_->local_scopes_[i]}, member_->nranks_, member_->use_cuda_,
- member_->nccl_ctxs_.get());
- graphs.push_back(std::move(graph));
- }
- } else {
- std::unique_ptr graph = build_strategy.Apply(
- main_program, member_->places_, loss_var_name, member_->local_scopes_,
- member_->nranks_, member_->use_cuda_, member_->nccl_ctxs_.get());
- graphs.push_back(std::move(graph));
- }
+ graph = build_strategy.Apply(main_program, member_->places_, loss_var_name,
+ member_->local_scopes_, member_->nranks_,
+ member_->use_cuda_, member_->nccl_ctxs_.get());
#else
- std::unique_ptr graph = build_strategy.Apply(
- main_program, member_->places_, loss_var_name, member_->local_scopes_,
- member_->nranks_, member_->use_cuda_);
- graphs.push_back(std::move(graph));
+ graph = build_strategy.Apply(main_program, member_->places_, loss_var_name,
+ member_->local_scopes_, member_->nranks_,
+ member_->use_cuda_);
#endif
auto max_memory_size = GetEagerDeletionThreshold();
VLOG(10) << "Eager Deletion Threshold "
<< static_cast(max_memory_size) / (1 << 30);
if (max_memory_size >= 0) {
- for (size_t i = 0; i < graphs.size(); ++i) {
- graphs[i] = member_->PrepareGCAndRefCnts(
- std::move(graphs[i]), static_cast(max_memory_size));
- }
+ graph = member_->PrepareGCAndRefCnts(std::move(graph),
+ static_cast(max_memory_size));
}
// Step 3. Create vars in each scope. Passes may also create new vars.
// skip control vars and empty vars
std::vector var_infos;
- for (auto &graph : graphs) {
- for (auto &node : graph->Nodes()) {
- if (node->IsVar() && !node->IsCtrlVar() && node->Var()) {
- var_infos.emplace_back();
- var_infos.back().name_ = node->Var()->Name();
- var_infos.back().type_ = node->Var()->GetType();
- var_infos.back().persistable_ = node->Var()->Persistable();
- }
+ for (auto &node : graph->Nodes()) {
+ if (node->IsVar() && !node->IsCtrlVar() && node->Var()) {
+ var_infos.emplace_back();
+ var_infos.back().name_ = node->Var()->Name();
+ var_infos.back().type_ = node->Var()->GetType();
+ var_infos.back().persistable_ = node->Var()->Persistable();
}
}
// If the loss_var_name is given, the number of graph should be only one.
if (loss_var_name.size()) {
- size_t graph_num = ir::GraphNum(*graphs[0]);
+ size_t graph_num = ir::GraphNum(*graph);
if (graph_num > 1) {
LOG(WARNING)
<< "The number of graph should be only one, "
"but the current graph has "
- << ir::GraphNum(*graphs[0])
+ << ir::GraphNum(*graph)
<< " sub_graphs. If you want to see the nodes of the "
"sub_graphs, you should use 'FLAGS_print_sub_graph_dir' "
"to specify the output dir. NOTES: if you not do training, "
@@ -319,18 +304,25 @@ ParallelExecutor::ParallelExecutor(
}
if (build_strategy.enable_parallel_graph_) {
+#ifdef PADDLE_WITH_CUDA
+ // TODO(Yancey1989): Remove passing in the main_program when
+ // allreduce_seq_pass doesn't need it as the attr.
member_->executor_.reset(new details::ParallelSSAGraphExecutor(
- exec_strategy, member_->local_scopes_, member_->places_,
- std::move(graphs)));
+ exec_strategy, member_->local_scopes_, member_->places_, main_program,
+ std::move(graph)));
+#else
+ PADDLE_THROW(
+ "Paddle should be compiled with CUDA for ParallelGraph Execution.");
+#endif
} else {
if (exec_strategy.type_ == ExecutionStrategy::kDefault) {
member_->executor_.reset(new details::ThreadedSSAGraphExecutor(
exec_strategy, member_->local_scopes_, member_->places_,
- std::move(graphs[0])));
+ std::move(graph)));
} else {
member_->executor_.reset(new details::FastThreadedSSAGraphExecutor(
exec_strategy, member_->local_scopes_, member_->places_,
- std::move(graphs[0])));
+ std::move(graph)));
}
}
@@ -482,11 +474,10 @@ bool ParallelExecutor::EnableParallelGraphExecution(
}
if (!member_->use_all_reduce_ || !member_->use_cuda_)
- enable_parallel_graph = false;
- if (build_strategy.enable_sequential_execution_ ||
- exec_strategy.type_ == ExecutionStrategy::ExecutorType::kExperimental)
- enable_parallel_graph = false;
+ if (build_strategy.enable_sequential_execution_ ||
+ exec_strategy.type_ == ExecutionStrategy::ExecutorType::kExperimental)
+ enable_parallel_graph = false;
return enable_parallel_graph;
}
diff --git a/paddle/fluid/imperative/layer.cc b/paddle/fluid/imperative/layer.cc
index 2cb5dc895d4e888c48323c55e4c4ce3718caac09..fd1b64ee8be704301e883ff18ecec1ac222b6bb9 100644
--- a/paddle/fluid/imperative/layer.cc
+++ b/paddle/fluid/imperative/layer.cc
@@ -234,7 +234,7 @@ void VarBase::RunBackward() {
std::map> OpBase::ApplyGrad() {
if (grad_op_descs_.empty() && backward_id_ <= 0) {
- LOG(WARNING) << "op with no grad: " << op_desc_->Type();
+ VLOG(3) << "op with no grad: " << op_desc_->Type();
return {};
}
diff --git a/paddle/fluid/inference/analysis/ir_passes/subgraph_detector.cc b/paddle/fluid/inference/analysis/ir_passes/subgraph_detector.cc
index a64f85ee9ac1a7bb8f0ed7bb8678166bbbcd5746..96befe7f8a5d16402338ac337daa96d714b4d310 100644
--- a/paddle/fluid/inference/analysis/ir_passes/subgraph_detector.cc
+++ b/paddle/fluid/inference/analysis/ir_passes/subgraph_detector.cc
@@ -460,77 +460,6 @@ inline bool CheckNodeIndegreeEquals(const Node &node, size_t n) {
return node.inputs.size() == n;
}
-NodesTSIterator::NodesTSIterator(const std::vector &source) {
- PADDLE_ENFORCE(!source.empty(),
- "Start points of topological sorting should not be empty!");
- // CHECK all the inputs' in-degree is 0
- for (auto *node : source) {
- PADDLE_ENFORCE(CheckNodeIndegreeEquals(*node, 0));
- }
-
- std::unordered_set visited;
- std::unordered_set to_visit{source.begin(), source.end()};
-
- std::vector inlink_visited;
- while (!to_visit.empty()) {
- std::vector queue(to_visit.begin(), to_visit.end());
- for (auto *p : queue) {
- if (Agent(p).deleted()) {
- visited.insert(p);
- to_visit.erase(p);
- }
-
- inlink_visited.clear();
-
- std::copy_if(p->inputs.begin(), p->inputs.end(),
- std::back_inserter(inlink_visited),
- [&](Node *x) -> bool { return visited.count(x) != 0; });
-
- if (inlink_visited.size() == p->inputs.size()) {
- sorted_.push_back(p);
- for (auto *_ : p->outputs) {
- if (!visited.count(_)) {
- to_visit.insert(_);
- }
- }
-
- to_visit.erase(p);
- visited.insert(p);
- }
- }
- }
-}
-
-NodesTSIterator::NodesTSIterator(const NodesTSIterator &other)
- : sorted_(other.sorted_), cursor_(other.cursor_) {}
-
-Node &NodesTSIterator::operator*() {
- PADDLE_ENFORCE_LT(cursor_, sorted_.size());
- return *sorted_[cursor_];
-}
-
-NodesTSIterator &NodesTSIterator::operator++() {
- if (++cursor_ >= sorted_.size()) {
- sorted_.clear();
- cursor_ = 0;
- }
- return *this;
-}
-NodesTSIterator &NodesTSIterator::operator=(const NodesTSIterator &other) {
- cursor_ = other.cursor_;
- sorted_ = other.sorted_;
- return *this;
-}
-
-bool NodesTSIterator::operator==(const NodesTSIterator &other) {
- return sorted_ == other.sorted_ && cursor_ == other.cursor_;
-}
-
-Node *NodesTSIterator::operator->() {
- PADDLE_ENFORCE_LT(cursor_, sorted_.size());
- return sorted_[cursor_];
-}
-
} // namespace analysis
} // namespace inference
} // namespace paddle
diff --git a/paddle/fluid/inference/analysis/ir_passes/subgraph_detector.h b/paddle/fluid/inference/analysis/ir_passes/subgraph_detector.h
index ea88edd042aa9d46f66af1aa92f2cb273696c118..5d11c217b69f11d45c6fb6d552dc404fa8313daf 100644
--- a/paddle/fluid/inference/analysis/ir_passes/subgraph_detector.h
+++ b/paddle/fluid/inference/analysis/ir_passes/subgraph_detector.h
@@ -30,6 +30,7 @@ namespace inference {
namespace analysis {
using framework::ir::Graph;
+using framework::ir::NodesTSIterator;
const char kIsFunctionNode[] = "__is_function_node__";
const char kFunctionNodeSubGraph[] = "__function_node_sub_graph__";
@@ -132,32 +133,6 @@ struct Agent {
framework::ir::Node *x_;
};
-// Topological sorting iterator on nodes.
-struct NodesTSIterator
- : public std::iterator {
- NodesTSIterator() = default;
- explicit NodesTSIterator(const std::vector &source);
- NodesTSIterator(NodesTSIterator &&other)
- : sorted_(std::move(other.sorted_)), cursor_(other.cursor_) {
- other.cursor_ = 0;
- }
- NodesTSIterator(const NodesTSIterator &other);
-
- framework::ir::Node &operator*();
- NodesTSIterator &operator++();
- // TODO(Superjomn) current implementation just compare the first
- // element, need to compare the graph and all the elements in the queue and
- // set.
- NodesTSIterator &operator=(const NodesTSIterator &other);
- bool operator==(const NodesTSIterator &other);
- bool operator!=(const NodesTSIterator &other) { return !(*this == other); }
- framework::ir::Node *operator->();
-
- private:
- std::vector sorted_;
- size_t cursor_{0};
-};
-
// The nodes those have no input will be treated as start points.
static std::vector ExtractStartPoints(const Graph &g) {
std::vector result;
diff --git a/paddle/fluid/inference/api/analysis_config.cc b/paddle/fluid/inference/api/analysis_config.cc
index e92273b4dd94f11e0e90c91fd82dafe42bf158f3..522ab495227e9b8c52b8d38db696fa9b785ba642 100644
--- a/paddle/fluid/inference/api/analysis_config.cc
+++ b/paddle/fluid/inference/api/analysis_config.cc
@@ -89,7 +89,7 @@ AnalysisConfig::AnalysisConfig(const AnalysisConfig &other) {
CP_MEMBER(params_file_);
CP_MEMBER(model_from_memory_); // the memory model reuses prog_file_ and
// params_file_ fields.
- // Gpu releated.
+ // Gpu related.
CP_MEMBER(use_gpu_);
CP_MEMBER(device_id_);
CP_MEMBER(memory_pool_init_size_mb_);
@@ -97,13 +97,13 @@ AnalysisConfig::AnalysisConfig(const AnalysisConfig &other) {
CP_MEMBER(enable_memory_optim_);
CP_MEMBER(static_memory_optim_);
CP_MEMBER(static_memory_optim_force_update_);
- // TensorRT releated.
+ // TensorRT related.
CP_MEMBER(use_tensorrt_);
CP_MEMBER(tensorrt_workspace_size_);
CP_MEMBER(tensorrt_max_batchsize_);
CP_MEMBER(tensorrt_min_subgraph_size_);
CP_MEMBER(tensorrt_precision_mode_);
- // MKLDNN releated.
+ // MKLDNN related.
CP_MEMBER(use_mkldnn_);
CP_MEMBER(mkldnn_enabled_op_types_);
diff --git a/paddle/fluid/inference/api/analysis_predictor.cc b/paddle/fluid/inference/api/analysis_predictor.cc
index 712e010db4340bde55945f89c488ba8cc38a1926..e8964c4acea0d220deca048a018eb7de42d7e4e5 100644
--- a/paddle/fluid/inference/api/analysis_predictor.cc
+++ b/paddle/fluid/inference/api/analysis_predictor.cc
@@ -392,7 +392,7 @@ std::unique_ptr CreatePaddlePredictor<
AnalysisConfig, PaddleEngineKind::kAnalysis>(const AnalysisConfig &config) {
VLOG(3) << "create AnalysisConfig";
if (config.use_gpu()) {
- // 1. GPU memeroy
+ // 1. GPU memory
PADDLE_ENFORCE_GT(config.memory_pool_init_size_mb(), 0.f);
PADDLE_ENFORCE_GE(config.gpu_device_id(), 0, "Invalid device id %d",
config.gpu_device_id());
@@ -726,7 +726,7 @@ bool AnalysisPredictor::need_collect_var_shapes_for_memory_optim() {
return need;
}
-std::string AnalysisPredictor::GetSeriazlizedProgram() const {
+std::string AnalysisPredictor::GetSerializedProgram() const {
return inference_program_->Proto()->SerializeAsString();
}
diff --git a/paddle/fluid/inference/api/analysis_predictor.h b/paddle/fluid/inference/api/analysis_predictor.h
index 014df4ee8b6d86232212736c43a9aff32ffee011..d5445c58e45ae64a8cfab03cb610e3677729338b 100644
--- a/paddle/fluid/inference/api/analysis_predictor.h
+++ b/paddle/fluid/inference/api/analysis_predictor.h
@@ -74,7 +74,7 @@ class AnalysisPredictor : public PaddlePredictor {
void SetMkldnnThreadID(int tid);
- std::string GetSeriazlizedProgram() const override;
+ std::string GetSerializedProgram() const override;
protected:
// For memory optimization.
diff --git a/paddle/fluid/inference/api/analysis_predictor_tester.cc b/paddle/fluid/inference/api/analysis_predictor_tester.cc
index 002ba90e40e69d565f5a54e374a3f0083b84273f..6696839b53fb21c274843afd86b5d8b5c2042c51 100644
--- a/paddle/fluid/inference/api/analysis_predictor_tester.cc
+++ b/paddle/fluid/inference/api/analysis_predictor_tester.cc
@@ -214,8 +214,8 @@ TEST(AnalysisPredictor, memory_optim) {
{
// The first predictor help to cache the memory optimize strategy.
auto predictor = CreatePaddlePredictor(config);
- LOG(INFO) << "serialized program: " << predictor->GetSeriazlizedProgram();
- ASSERT_FALSE(predictor->GetSeriazlizedProgram().empty());
+ LOG(INFO) << "serialized program: " << predictor->GetSerializedProgram();
+ ASSERT_FALSE(predictor->GetSerializedProgram().empty());
// Run several times to check the parameters are not reused by mistake.
for (int i = 0; i < 5; i++) {
diff --git a/paddle/fluid/inference/api/api.cc b/paddle/fluid/inference/api/api.cc
index 6cd18277d63200f5bccf180a7ae3196b0ce126ff..f83537f064187e67a08c8bbce52707d1c824abeb 100644
--- a/paddle/fluid/inference/api/api.cc
+++ b/paddle/fluid/inference/api/api.cc
@@ -92,7 +92,7 @@ void PaddleBuf::Reset(void *data, size_t length) {
void PaddleBuf::Free() {
if (memory_owned_ && data_) {
- PADDLE_ENFORCE_GT(length_, 0);
+ PADDLE_ENFORCE_GT(length_, 0UL);
free(static_cast(data_));
data_ = nullptr;
length_ = 0;
diff --git a/paddle/fluid/inference/api/api_impl.cc b/paddle/fluid/inference/api/api_impl.cc
index e18bc02d92eb517fa20dc83811694b8ac80ae316..97c164bdef7a4b3e66be78526793f3830ada398b 100644
--- a/paddle/fluid/inference/api/api_impl.cc
+++ b/paddle/fluid/inference/api/api_impl.cc
@@ -290,7 +290,7 @@ std::unique_ptr CreatePaddlePredictor<
NativeConfig, PaddleEngineKind::kNative>(const NativeConfig &config) {
VLOG(3) << "create NativePaddlePredictor";
if (config.use_gpu) {
- // 1. GPU memeroy
+ // 1. GPU memory
PADDLE_ENFORCE_GE(
config.fraction_of_gpu_memory, 0.f,
"fraction_of_gpu_memory in the config should be set to range (0., 1.]");
diff --git a/paddle/fluid/inference/api/paddle_analysis_config.h b/paddle/fluid/inference/api/paddle_analysis_config.h
index 47361b3279e14dd65a0e6e7f864e508ef1183045..c1c6227cdd8b2042f6765c7932327ecae246c260 100644
--- a/paddle/fluid/inference/api/paddle_analysis_config.h
+++ b/paddle/fluid/inference/api/paddle_analysis_config.h
@@ -212,12 +212,12 @@ struct AnalysisConfig {
std::string prog_file_;
std::string params_file_;
- // GPU releated.
+ // GPU related.
bool use_gpu_{false};
int device_id_{0};
uint64_t memory_pool_init_size_mb_{100}; // initial size is 100MB.
- // TensorRT releated.
+ // TensorRT related.
bool use_tensorrt_{false};
// For workspace_size, refer it from here:
// https://docs.nvidia.com/deeplearning/sdk/tensorrt-developer-guide/index.html#troubleshooting
diff --git a/paddle/fluid/inference/api/paddle_api.h b/paddle/fluid/inference/api/paddle_api.h
index 8ac8bc529183edc2f8f888ca7ba14611acaadc10..c9a45b4aa3b4037d3725622fc960848bc1ccfb2c 100644
--- a/paddle/fluid/inference/api/paddle_api.h
+++ b/paddle/fluid/inference/api/paddle_api.h
@@ -16,6 +16,12 @@
/*! \file paddle_api.h
*/
+/*! \mainpage Paddle Inference APIs
+ * \section intro_sec Introduction
+ * The Paddle inference library aims to offer an high performance inference SDK
+ * for Paddle users.
+ */
+
#include
#include
#include
@@ -34,26 +40,49 @@ enum PaddleDType {
};
/**
- *\brief Memory menager for PaddleTensor.
+ * \brief Memory manager for `PaddleTensor`.
*
- *The PaddleBuf holds a buffer for data input or output. The memory can be
- *allocated by user or by PaddleBuf itself, but in any case, the PaddleBuf
- *should be reused for better performance.
+ * The PaddleBuf holds a buffer for data input or output. The memory can be
+ * allocated by user or by PaddleBuf itself, but in any case, the PaddleBuf
+ * should be reused for better performance.
*
- *For user allocated memory, the following API can be used:
- *- PaddleBuf(void* data, size_t length) to set an external memory by
- *specifying
- * the memory address and length.
- *- Reset(void* data, size_t length) to reset the PaddleBuf with an external
+ * For user allocated memory, the following API can be used:
+ * - PaddleBuf(void* data, size_t length) to set an external memory by
+ * specifying the memory address and length.
+ * - Reset(void* data, size_t length) to reset the PaddleBuf with an external
*memory.
- *ATTENTION, for user allocated memory, deallocation should be done by users
+ * ATTENTION, for user allocated memory, deallocation should be done by users
*externally after the program finished. The PaddleBuf won't do any allocation
*or deallocation.
*
- *To have the PaddleBuf allocate and manage the memory:
- *- PaddleBuf(size_t length) will allocate a memory of size `length`.
- *- Resize(size_t length) resize the memory to no less than `length`, ATTENTION
+ * To have the PaddleBuf allocate and manage the memory:
+ * - PaddleBuf(size_t length) will allocate a memory of size `length`.
+ * - Resize(size_t length) resize the memory to no less than `length`, ATTENTION
* if the allocated memory is larger than `length`, nothing will done.
+ *
+ * Usage:
+ *
+ * Let PaddleBuf manage the memory internally.
+ * \code{cpp}
+ * const int num_elements = 128;
+ * PaddleBuf buf(num_elements * sizeof(float));
+ * \endcode
+ *
+ * Or
+ * \code{cpp}
+ * PaddleBuf buf;
+ * buf.Resize(num_elements * sizeof(float));
+ * \endcode
+ * Works the exactly the same.
+ *
+ * One can also make the `PaddleBuf` use the external memory.
+ * \code{cpp}
+ * PaddleBuf buf;
+ * void* external_memory = new float[num_elements];
+ * buf.Reset(external_memory, num_elements*sizeof(float));
+ * ...
+ * delete[] external_memory; // manage the memory lifetime outside.
+ * \endcode
*/
class PaddleBuf {
public:
@@ -78,7 +107,7 @@ class PaddleBuf {
/** Tell whether the buffer is empty.
*/
bool empty() const { return length_ == 0; }
- /** Get the memory address.
+ /** Get the data's memory address.
*/
void* data() const { return data_; }
/** Get the memory length.
@@ -110,7 +139,8 @@ struct PaddleTensor {
};
enum class PaddlePlace { kUNK = -1, kCPU, kGPU };
-/** Tensor without copy, currently only supports AnalysisPredictor.
+
+/** Tensor without copy, currently only supports `AnalysisPredictor`.
*/
class ZeroCopyTensor {
public:
@@ -218,7 +248,7 @@ class PaddlePredictor {
/** \brief Get the serialized model program that executes in inference phase.
* Its data type is ProgramDesc, which is a protobuf message.
*/
- virtual std::string GetSeriazlizedProgram() const {
+ virtual std::string GetSerializedProgram() const {
assert(false); // Force raise error.
return "NotImplemented";
}
@@ -269,9 +299,11 @@ struct NativeConfig : public PaddlePredictor::Config {
*
* Usage:
*
+ * \code{.cpp}
* NativeConfig config;
* ... // change the configs.
* auto native_predictor = CreatePaddlePredictor(config);
+ * \endcode
*
* FOR EXTENSION DEVELOPER:
* Different predictors are designated by config type. Similar configs can be
diff --git a/paddle/fluid/inference/api/paddle_pass_builder.cc b/paddle/fluid/inference/api/paddle_pass_builder.cc
index 039389a4cf99da6c2576c148d8c294e5d79aa7a8..f9c13c2fa84b3b5d629297d3f44a6f5889a734f4 100644
--- a/paddle/fluid/inference/api/paddle_pass_builder.cc
+++ b/paddle/fluid/inference/api/paddle_pass_builder.cc
@@ -66,8 +66,54 @@ void GpuPassStrategy::EnableMKLDNN() {
LOG(ERROR) << "GPU not support MKLDNN yet";
}
+GpuPassStrategy::GpuPassStrategy() : PassStrategy({}) {
+ passes_.assign({
+ "infer_clean_graph_pass", //
+ "identity_scale_op_clean_pass", //
+ "conv_affine_channel_fuse_pass", //
+ "conv_eltwiseadd_affine_channel_fuse_pass", //
+ "conv_bn_fuse_pass", //
+#if CUDNN_VERSION >= 7100 // To run conv_fusion, the version of cudnn must be
+ // guaranteed at least v7
+ "conv_elementwise_add_act_fuse_pass", //
+ "conv_elementwise_add2_act_fuse_pass", //
+ "conv_elementwise_add_fuse_pass", //
+#endif
+ });
+
+ for (int i = 6; i >= 3; i--) {
+ passes_.push_back("transpose_flatten" + std::to_string(i) +
+ "_concat_fuse_pass");
+ }
+ use_gpu_ = true;
+}
+
void PaddlePassBuilder::AppendAnalysisPass(const std::string &pass) {
analysis_passes_.push_back(pass);
}
+CpuPassStrategy::CpuPassStrategy() : PassStrategy({}) {
+ // NOTE the large fusions should be located in the front, so that they will
+ // not be damaged by smaller ones.
+ passes_.assign({
+ "infer_clean_graph_pass", //
+ "attention_lstm_fuse_pass", //
+ "seqpool_concat_fuse_pass", //
+ "seqconv_eltadd_relu_fuse_pass", //
+ // "embedding_fc_lstm_fuse_pass", //
+ "fc_lstm_fuse_pass", //
+ "mul_lstm_fuse_pass", //
+ "fc_gru_fuse_pass", //
+ "mul_gru_fuse_pass", //
+ "seq_concat_fc_fuse_pass", //
+ "fc_fuse_pass", //
+ "repeated_fc_relu_fuse_pass", //
+ "squared_mat_sub_fuse_pass", //
+ "conv_bn_fuse_pass", //
+ "conv_eltwiseadd_bn_fuse_pass", //
+ "is_test_pass", //
+ "identity_scale_op_clean_pass", //
+ });
+ use_gpu_ = false;
+}
} // namespace paddle
diff --git a/paddle/fluid/inference/api/paddle_pass_builder.h b/paddle/fluid/inference/api/paddle_pass_builder.h
index aa353f12ca7333713e2d640cce6b2dfbea3c4e26..2524d89fcd1322e105ad2217347aa2380448f2bc 100644
--- a/paddle/fluid/inference/api/paddle_pass_builder.h
+++ b/paddle/fluid/inference/api/paddle_pass_builder.h
@@ -97,30 +97,7 @@ class PassStrategy : public PaddlePassBuilder {
*/
class CpuPassStrategy : public PassStrategy {
public:
- CpuPassStrategy() : PassStrategy({}) {
- // NOTE the large fusions should be located in the front, so that they will
- // not be damaged by smaller ones.
- passes_.assign({
- "infer_clean_graph_pass", //
- "attention_lstm_fuse_pass", //
- "seqpool_concat_fuse_pass", //
- "seqconv_eltadd_relu_fuse_pass", //
- // "embedding_fc_lstm_fuse_pass", //
- "fc_lstm_fuse_pass", //
- "mul_lstm_fuse_pass", //
- "fc_gru_fuse_pass", //
- "mul_gru_fuse_pass", //
- "seq_concat_fc_fuse_pass", //
- "fc_fuse_pass", //
- "repeated_fc_relu_fuse_pass", //
- "squared_mat_sub_fuse_pass", //
- "conv_bn_fuse_pass", //
- "conv_eltwiseadd_bn_fuse_pass", //
- "is_test_pass", //
- "identity_scale_op_clean_pass", //
- });
- use_gpu_ = false;
- }
+ CpuPassStrategy();
explicit CpuPassStrategy(const CpuPassStrategy &other)
: PassStrategy(other.AllPasses()) {}
@@ -153,27 +130,7 @@ class CpuPassStrategy : public PassStrategy {
*/
class GpuPassStrategy : public PassStrategy {
public:
- GpuPassStrategy() : PassStrategy({}) {
- passes_.assign({
- "infer_clean_graph_pass", //
- "identity_scale_op_clean_pass", //
- "conv_affine_channel_fuse_pass", //
- "conv_eltwiseadd_affine_channel_fuse_pass", //
- "conv_bn_fuse_pass", //
-#if CUDNN_VERSION >= 7100 // To run conv_fusion, the version of cudnn must be
- // guaranteed at least v7
- "conv_elementwise_add_act_fuse_pass", //
- "conv_elementwise_add2_act_fuse_pass", //
- "conv_elementwise_add_fuse_pass", //
-#endif
- });
-
- for (int i = 6; i >= 3; i--) {
- passes_.push_back("transpose_flatten" + std::to_string(i) +
- "_concat_fuse_pass");
- }
- use_gpu_ = true;
- }
+ GpuPassStrategy();
explicit GpuPassStrategy(const GpuPassStrategy &other)
: PassStrategy(other.AllPasses()) {
diff --git a/paddle/fluid/inference/tests/api/CMakeLists.txt b/paddle/fluid/inference/tests/api/CMakeLists.txt
index 7ecd9e35332843e3a391cdad5ce32220d890abd1..55ab04bfe16ec6a3d97c443f59c72e7b85fb1899 100644
--- a/paddle/fluid/inference/tests/api/CMakeLists.txt
+++ b/paddle/fluid/inference/tests/api/CMakeLists.txt
@@ -60,10 +60,13 @@ set(RNN2_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/rnn2")
download_model_and_data(${RNN2_INSTALL_DIR} "rnn2_model.tar.gz" "rnn2_data.txt.tar.gz")
inference_analysis_api_test(test_analyzer_rnn2 ${RNN2_INSTALL_DIR} analyzer_rnn2_tester.cc)
+# TODO(luotao, Superjom) Disable DAM test, temporarily fix
+# https://github.com/PaddlePaddle/Paddle/issues/15032#issuecomment-455990914.
+# After inference framework refactor, will reopen it.
# normal DAM
set(DAM_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/dam")
download_model_and_data(${DAM_INSTALL_DIR} "DAM_model.tar.gz" "DAM_data.txt.tar.gz")
-inference_analysis_api_test(test_analyzer_dam ${DAM_INSTALL_DIR} analyzer_dam_tester.cc EXTRA_DEPS legacy_allocator SERIAL)
+#inference_analysis_api_test(test_analyzer_dam ${DAM_INSTALL_DIR} analyzer_dam_tester.cc EXTRA_DEPS legacy_allocator SERIAL)
# small DAM
set(DAM_SMALL_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/small_dam")
diff --git a/paddle/fluid/inference/tests/api/analyzer_seq_pool1_tester.cc b/paddle/fluid/inference/tests/api/analyzer_seq_pool1_tester.cc
index dd953e0dccbb3749bfcc87966453c6976dfefa10..bd0059e18485c046df27d5ddbb39df9bbb249113 100644
--- a/paddle/fluid/inference/tests/api/analyzer_seq_pool1_tester.cc
+++ b/paddle/fluid/inference/tests/api/analyzer_seq_pool1_tester.cc
@@ -56,14 +56,14 @@ struct DataRecord {
std::vector slot_data;
split_to_float(data[1], ' ', &slot_data);
std::string name = data[0];
- PADDLE_ENFORCE_EQ(slot_data.size() % 11, 0,
+ PADDLE_ENFORCE_EQ(slot_data.size() % 11, 0UL,
"line %d, %s should be divisible", num_lines, name);
datasets[name].emplace_back(std::move(slot_data));
}
num_samples = num_lines / num_slots;
PADDLE_ENFORCE_EQ(num_samples * num_slots, static_cast(num_lines),
"num samples should be divisible");
- PADDLE_ENFORCE_GT(num_samples, 0);
+ PADDLE_ENFORCE_GT(num_samples, 0UL);
}
void Prepare(int bs) {
diff --git a/paddle/fluid/inference/tests/test.cmake b/paddle/fluid/inference/tests/test.cmake
index 29f0f034a2aab50330d4d0127b870a5cb00d56a5..6c5fe043ffa3f3dcafe2dbbebd6244467f859abf 100644
--- a/paddle/fluid/inference/tests/test.cmake
+++ b/paddle/fluid/inference/tests/test.cmake
@@ -1,18 +1,43 @@
+include(ExternalProject)
set(INFERENCE_URL "http://paddle-inference-dist.cdn.bcebos.com" CACHE STRING "inference download url")
set(INFERENCE_DEMO_INSTALL_DIR "${THIRD_PARTY_PATH}/inference_demo" CACHE STRING
"A path setting inference demo download directories.")
-function (inference_download install_dir url filename)
- message(STATUS "Download inference test stuff from ${url}/${filename}")
- file(DOWNLOAD "${url}/${filename}" "${install_dir}/${filename}")
- message(STATUS "finish downloading ${filename}")
+
+function(inference_download INSTALL_DIR URL FILENAME)
+ message(STATUS "Download inference test stuff from ${URL}/${FILENAME}")
+ string(REGEX REPLACE "[-%.]" "_" FILENAME_EX ${FILENAME})
+ ExternalProject_Add(
+ extern_inference_download_${FILENAME_EX}
+ ${EXTERNAL_PROJECT_LOG_ARGS}
+ PREFIX ${INSTALL_DIR}
+ URL ${URL}/${FILENAME}
+ DOWNLOAD_COMMAND wget -q -O ${INSTALL_DIR}/${FILENAME} ${URL}/${FILENAME}
+ DOWNLOAD_DIR ${INSTALL_DIR}
+ DOWNLOAD_NO_PROGRESS 1
+ CONFIGURE_COMMAND ""
+ BUILD_COMMAND ""
+ UPDATE_COMMAND ""
+ INSTALL_COMMAND ""
+ )
endfunction()
-function (inference_download_and_uncompress install_dir url filename)
- inference_download(${install_dir} ${url} ${filename})
- execute_process(
- COMMAND ${CMAKE_COMMAND} -E tar xzf ${install_dir}/${filename}
- WORKING_DIRECTORY ${install_dir}
- )
+function(inference_download_and_uncompress INSTALL_DIR URL FILENAME)
+ message(STATUS "Download inference test stuff from ${URL}/${FILENAME}")
+ string(REGEX REPLACE "[-%.]" "_" FILENAME_EX ${FILENAME})
+ set(EXTERNAL_PROJECT_NAME "extern_inference_download_${FILENAME_EX}")
+ set(UNPACK_DIR "${INSTALL_DIR}/src/${EXTERNAL_PROJECT_NAME}")
+ ExternalProject_Add(
+ ${EXTERNAL_PROJECT_NAME}
+ ${EXTERNAL_PROJECT_LOG_ARGS}
+ PREFIX ${INSTALL_DIR}
+ URL ${URL}/${FILENAME}
+ DOWNLOAD_DIR ${INSTALL_DIR}
+ DOWNLOAD_NO_PROGRESS 1
+ CONFIGURE_COMMAND ""
+ BUILD_COMMAND ""
+ UPDATE_COMMAND ""
+ INSTALL_COMMAND ${CMAKE_COMMAND} -E copy_directory ${UNPACK_DIR} ${INSTALL_DIR}
+ )
endfunction()
set(WORD2VEC_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/word2vec")
diff --git a/paddle/fluid/inference/tests/test_helper.h b/paddle/fluid/inference/tests/test_helper.h
index 75fa611c0d701dd81dfe5b33231655e0959c7dbf..861f69f4d2143b16bdec546d92ce7bd13ca53ed3 100644
--- a/paddle/fluid/inference/tests/test_helper.h
+++ b/paddle/fluid/inference/tests/test_helper.h
@@ -171,9 +171,7 @@ void TestInference(const std::string& dirname,
// Enable the profiler
paddle::platform::EnableProfiler(state);
{
- paddle::platform::RecordEvent record_event(
- "init_program",
- paddle::platform::DeviceContextPool::Instance().Get(place));
+ paddle::platform::RecordEvent record_event("init_program");
inference_program = InitProgram(&executor, scope, dirname, is_combined);
}
@@ -230,9 +228,7 @@ void TestInference(const std::string& dirname,
// Run repeat times to profile the performance
for (int i = 0; i < repeat; ++i) {
- paddle::platform::RecordEvent record_event(
- "run_inference",
- paddle::platform::DeviceContextPool::Instance().Get(place));
+ paddle::platform::RecordEvent record_event("run_inference");
if (PrepareContext) {
// Note: if you change the inference_program, you need to call
diff --git a/paddle/fluid/memory/allocation/best_fit_allocator.cc b/paddle/fluid/memory/allocation/best_fit_allocator.cc
index 6f3e512fb0b68df5e86eba3e50a255c18f75214f..e3d6c2f511ef083ef9ecc1fe8df96051b2b85cc2 100644
--- a/paddle/fluid/memory/allocation/best_fit_allocator.cc
+++ b/paddle/fluid/memory/allocation/best_fit_allocator.cc
@@ -111,6 +111,8 @@ size_t BestFitAllocator::NumFreeChunks() const {
}
void BestFitAllocator::Free(Allocation* allocation) {
auto* bf_allocation = dynamic_cast(allocation);
+ PADDLE_ENFORCE_NOT_NULL(bf_allocation,
+ "The input allocation is not BestFitAllocation.");
auto chunk_it = bf_allocation->ChunkIterator();
PADDLE_ENFORCE(!chunk_it->is_free);
chunk_it->is_free = true;
diff --git a/paddle/fluid/memory/allocation/legacy_allocator.cc b/paddle/fluid/memory/allocation/legacy_allocator.cc
index e983ae327d69389526ae4cae226b0eb324759700..1936f9d4cd83c53cf7b322ab29a3e0d92e042abc 100644
--- a/paddle/fluid/memory/allocation/legacy_allocator.cc
+++ b/paddle/fluid/memory/allocation/legacy_allocator.cc
@@ -356,7 +356,7 @@ void MemInfo::Minus(const size_t &size) {
usage_ -= size;
}
-uint64_t MemInfo::GetPeakUsage() { return peak_usage_; }
+uint64_t MemInfo::GetPeakUsage() const { return peak_usage_; }
LegacyMemMonitor::~LegacyMemMonitor() {
for (auto &item : gpu_mem_info_) delete item.second;
@@ -380,10 +380,10 @@ void LegacyMemMonitor::Minus(const int &device, const size_t &size) {
gpu_mem_info_[device]->Minus(size);
}
-uint64_t LegacyMemMonitor::GetMemUsage(const int &device) {
+uint64_t LegacyMemMonitor::GetMemUsage(const int &device) const {
return gpu_mem_info_.find(device) == gpu_mem_info_.end()
? 0
- : gpu_mem_info_[device]->GetPeakUsage();
+ : gpu_mem_info_.at(device)->GetPeakUsage();
}
void LegacyMemMonitor::PrintMemUsage() {
diff --git a/paddle/fluid/memory/allocation/legacy_allocator.h b/paddle/fluid/memory/allocation/legacy_allocator.h
index ccbc8c70d8e9a16e7edb1be54bf80bec3b368eca..d9bdae153da6439598f76f5cac226897e6e0c596 100644
--- a/paddle/fluid/memory/allocation/legacy_allocator.h
+++ b/paddle/fluid/memory/allocation/legacy_allocator.h
@@ -27,20 +27,20 @@ namespace allocation {
class MemInfo {
public:
MemInfo() : usage_(0), peak_usage_(0) {}
- MemInfo(const MemInfo &) = delete;
- MemInfo &operator=(const MemInfo &) = delete;
// return a flag to indicate current operation will create a peak point or not
bool Add(const size_t &);
void Minus(const size_t &);
- uint64_t GetPeakUsage();
+ uint64_t GetPeakUsage() const;
private:
/* current memory usage*/
uint64_t usage_;
uint64_t peak_usage_;
std::mutex mutex_;
+
+ DISABLE_COPY_AND_ASSIGN(MemInfo);
};
class LegacyMemMonitor {
@@ -56,11 +56,11 @@ class LegacyMemMonitor {
void Add(const int &, const size_t &);
void Minus(const int &, const size_t &);
- uint64_t GetMemUsage(const int &);
+ uint64_t GetMemUsage(const int &) const;
void PrintMemUsage();
- protected:
+ private:
MemUsage gpu_mem_info_;
};
diff --git a/paddle/fluid/operators/CMakeLists.txt b/paddle/fluid/operators/CMakeLists.txt
index e099425b94221bf1229e936fc1781615d13dbc26..2166b8b545c23025e029d283b7ca43719e31a259 100644
--- a/paddle/fluid/operators/CMakeLists.txt
+++ b/paddle/fluid/operators/CMakeLists.txt
@@ -97,3 +97,4 @@ if (WITH_PYTHON)
endif()
set(GLOB_OP_LIB ${OP_LIBRARY} CACHE INTERNAL "Global OP library")
+add_subdirectory(benchmark)
diff --git a/paddle/fluid/operators/activation_op.cc b/paddle/fluid/operators/activation_op.cc
index 189db2317d0544014d9c74e0fd5e9ead54925b9c..65efe2966ce12e86ba7f4944eb57ae72cdf9796f 100644
--- a/paddle/fluid/operators/activation_op.cc
+++ b/paddle/fluid/operators/activation_op.cc
@@ -37,7 +37,7 @@ using paddle::framework::Tensor;
"(bool, default false) Set to true for inference only, false " \
"for training. Some layers may run faster when this is true.") \
.SetDefault(false); \
- AddComment(#OP_COMMENT); \
+ AddComment(OP_COMMENT); \
} \
}
@@ -124,7 +124,7 @@ class ActivationOpGrad : public framework::OperatorWithKernel {
UNUSED constexpr char SigmoidDoc[] = R"DOC(
Sigmoid Activation Operator
-$$out = \frac{1}{1 + e^{-x}}$$
+$$out = \\frac{1}{1 + e^{-x}}$$
)DOC";
@@ -187,14 +187,14 @@ $out = |x|$
UNUSED constexpr char CeilDoc[] = R"DOC(
Ceil Activation Operator.
-$out = ceil(x)$
+$out = \left \lceil x \right \rceil$
)DOC";
UNUSED constexpr char FloorDoc[] = R"DOC(
Floor Activation Operator.
-$out = floor(x)$
+$out = \left \lfloor x \right \rfloor$
)DOC";
@@ -252,7 +252,7 @@ $out = \ln(1 + e^{x})$
UNUSED constexpr char SoftsignDoc[] = R"DOC(
Softsign Activation Operator.
-$$out = \frac{x}{1 + |x|}$$
+$$out = \\frac{x}{1 + \|x\|}$$
)DOC";
diff --git a/paddle/fluid/operators/attention_lstm_op.cc b/paddle/fluid/operators/attention_lstm_op.cc
index b6996be4b0984bcee3b16da268d79708a68b65b3..912ec79910301b67bc520b1aa78d3fa1fd165d1f 100644
--- a/paddle/fluid/operators/attention_lstm_op.cc
+++ b/paddle/fluid/operators/attention_lstm_op.cc
@@ -293,7 +293,7 @@ class AttentionLSTMKernel : public framework::OpKernel {
int len = x_lod[0][i + 1] - x_lod[0][i];
max_seq_len = max_seq_len < len ? len : max_seq_len;
}
- PADDLE_ENFORCE_EQ(x_lod.size(), 1, "Input(X)'s lod size must be 1.");
+ PADDLE_ENFORCE_EQ(x_lod.size(), 1UL, "Input(X)'s lod size must be 1.");
PADDLE_ENFORCE_EQ(c0->dims()[0], N, "C0 dims should be %d x %d.", N, D);
fc_out->Resize({max_seq_len, 1});
diff --git a/paddle/fluid/operators/benchmark/CMakeLists.txt b/paddle/fluid/operators/benchmark/CMakeLists.txt
new file mode 100644
index 0000000000000000000000000000000000000000..54008336a9f67f0123ba1cfa6fcea35b79b7ac4c
--- /dev/null
+++ b/paddle/fluid/operators/benchmark/CMakeLists.txt
@@ -0,0 +1,3 @@
+cc_test(op_tester SRCS op_tester.cc op_tester_config.cc
+ DEPS memory timer framework_proto proto_desc lod_tensor op_registry
+ device_context scope ${GLOB_OP_LIB} ${GLOB_OPERATOR_DEPS})
diff --git a/paddle/fluid/operators/benchmark/op_tester.cc b/paddle/fluid/operators/benchmark/op_tester.cc
new file mode 100644
index 0000000000000000000000000000000000000000..e179de56cddc5fada2e5833086d351659a7cf540
--- /dev/null
+++ b/paddle/fluid/operators/benchmark/op_tester.cc
@@ -0,0 +1,303 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/fluid/operators/benchmark/op_tester.h"
+#include "gflags/gflags.h"
+#include "gtest/gtest.h"
+#include "paddle/fluid/framework/op_info.h"
+#include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/framework/variable_helper.h"
+#include "paddle/fluid/platform/init.h"
+#include "paddle/fluid/platform/profiler.h"
+#include "paddle/fluid/platform/timer.h"
+#include "paddle/fluid/pybind/pybind.h"
+
+namespace paddle {
+namespace operators {
+namespace benchmark {
+
+DEFINE_string(op_config_list, "", "Path of op config file.");
+
+void OpTester::Init(const std::string &filename) {
+ Init(OpTesterConfig(filename));
+}
+
+void OpTester::Init(const OpTesterConfig &config) {
+ config_ = config;
+
+ auto &op_desc_info = framework::OpInfoMap::Instance();
+ // Initialize the OpDesc
+ if (op_desc_info.Has(config_.op_type)) {
+ type_ = config_.op_type;
+ op_desc_.SetType(config_.op_type);
+
+ CreateInputVarDesc();
+ CreateOutputVarDesc();
+ } else {
+ LOG(FATAL) << "Op \"" << config_.op_type << "\" is not registered.";
+ }
+
+ if (config_.device_id >= 0) {
+ place_ = paddle::platform::CUDAPlace(config_.device_id);
+ } else {
+ place_ = paddle::platform::CPUPlace();
+ }
+
+ framework::InitDevices(false);
+ scope_.reset(new paddle::framework::Scope());
+
+ op_ = framework::OpRegistry::CreateOp(op_desc_);
+ CreateVariables(scope_.get());
+}
+
+void OpTester::Run() {
+ if (config_.print_debug_string) {
+ LOG(INFO) << DebugString();
+ }
+
+ // Warm up
+ RunImpl();
+
+ platform::Timer timer;
+ if (config_.profile) {
+ if (platform::is_cpu_place(place_)) {
+ platform::EnableProfiler(platform::ProfilerState::kCPU);
+ } else {
+#ifdef PADDLE_WITH_CUDA
+ platform::EnableProfiler(platform::ProfilerState::kAll);
+ platform::SetDeviceId(config_.device_id);
+#else
+ PADDLE_THROW("'CUDAPlace' is not supported in CPU only device.");
+#endif
+ }
+
+ timer.Start();
+ for (int i = config_.repeat; i > 0; --i) {
+ RunImpl();
+ }
+ timer.Pause();
+ platform::DisableProfiler(platform::EventSortingKey::kDefault,
+ "op_tester_profiler");
+ } else {
+ timer.Start();
+ for (int i = config_.repeat; i > 0; --i) {
+ RunImpl();
+ }
+ timer.Pause();
+ }
+ config_.runtime = timer.ElapsedMS() / config_.repeat;
+ LOG(INFO) << "=== Run " << config_.repeat
+ << " times, latency: " << config_.runtime << " ms ===";
+}
+
+void OpTester::RunImpl() {
+ op_->Run(*scope_, place_);
+ platform::DeviceContextPool::Instance().Get(place_)->Wait();
+ scope_->DropKids();
+}
+
+std::vector OpTester::GetOpProtoInputNames() {
+ std::vector